diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index 5abb8d72df028..c8f331204bd49 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -32,8 +32,6 @@ enable_cir="${6}" lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests --succinct" start-group "CMake" -export PIP_BREAK_SYSTEM_PACKAGES=1 -pip install -q -r "${MONOREPO_ROOT}"/.ci/all_requirements.txt # Set the system llvm-symbolizer as preferred. export LLVM_SYMBOLIZER_PATH=`which llvm-symbolizer` diff --git a/.ci/utils.sh b/.ci/utils.sh index 2a3d2426b630a..87afbbd6cdd31 100644 --- a/.ci/utils.sh +++ b/.ci/utils.sh @@ -52,3 +52,10 @@ function start-group { echo "Starting $groupname" fi } + +export PIP_BREAK_SYSTEM_PACKAGES=1 +pip install -q -r "${MONOREPO_ROOT}"/.ci/all_requirements.txt + +if [[ "$GITHUB_ACTIONS" != "" ]]; then + python .ci/cache_lit_timing_files.py download +fi diff --git a/.github/renovate.json b/.github/renovate.json new file mode 100644 index 0000000000000..6ce98c4e7b105 --- /dev/null +++ b/.github/renovate.json @@ -0,0 +1,12 @@ +{ + "$schema": "https://docs.renovatebot.com/renovate-schema.json", + "extends": [ + "config:recommended" + ], + "includePaths": [".github/**"], + "schedule": "* 0 * * 1", + "minimumReleaseAge": "3 days", + "assignees": ["boomanaiden154"], + "ignorePaths": [".github/workflows/containers/**"], + "groupName": "[Github] Update GHA Dependencies" +} diff --git a/.github/workflows/build-ci-container-windows.yml b/.github/workflows/build-ci-container-windows.yml index 167e7cf06b3b2..14c349b1b2fe5 100644 --- a/.github/workflows/build-ci-container-windows.yml +++ b/.github/workflows/build-ci-container-windows.yml @@ -44,7 +44,7 @@ jobs: run: | docker save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }} - name: Upload container image - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: container path: ${{ steps.vars.outputs.container-filename }} diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml index 67f35fd30701f..01f1b8dc4f990 100644 --- a/.github/workflows/build-ci-container.yml +++ b/.github/workflows/build-ci-container.yml @@ -64,7 +64,7 @@ jobs: podman save ${{ steps.vars.outputs.container-name-agent-tag }} > ${{ steps.vars.outputs.container-agent-filename }} - name: Upload container image - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: container-${{ matrix.arch }} path: "*.tar" diff --git a/.github/workflows/build-metrics-container.yml b/.github/workflows/build-metrics-container.yml index cadcaa9a42e8f..69b571575f40c 100644 --- a/.github/workflows/build-metrics-container.yml +++ b/.github/workflows/build-metrics-container.yml @@ -49,7 +49,7 @@ jobs: run: | podman save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }} - name: Upload Container Image - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: container path: ${{ steps.vars.outputs.container-filename }} diff --git a/.github/workflows/check-ci.yml b/.github/workflows/check-ci.yml index 7e8c15696e344..f18a69c192ee9 100644 --- 
a/.github/workflows/check-ci.yml +++ b/.github/workflows/check-ci.yml @@ -26,7 +26,7 @@ jobs: with: sparse-checkout: .ci - name: Setup Python - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: 3.13 cache: 'pip' diff --git a/.github/workflows/ci-post-commit-analyzer.yml b/.github/workflows/ci-post-commit-analyzer.yml index 7d37b900d7909..49cf4100dd71c 100644 --- a/.github/workflows/ci-post-commit-analyzer.yml +++ b/.github/workflows/ci-post-commit-analyzer.yml @@ -44,7 +44,7 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19 with: # A full build of llvm, clang, lld, and lldb takes about 250MB # of ccache space. There's not much reason to have more than this, @@ -87,7 +87,7 @@ jobs: scan-build --generate-index-only build/analyzer-results - name: Upload Results - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() with: name: analyzer-results diff --git a/.github/workflows/commit-access-review.yml b/.github/workflows/commit-access-review.yml index a7be81b0e2da5..734dc212fa648 100644 --- a/.github/workflows/commit-access-review.yml +++ b/.github/workflows/commit-access-review.yml @@ -28,7 +28,7 @@ jobs: python3 .github/workflows/commit-access-review.py $GITHUB_TOKEN - name: Upload Triage List - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: triagers path: triagers.log diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile index f5422b4527576..dc0c9cabc7f01 100644 --- a/.github/workflows/containers/github-action-ci/Dockerfile +++ b/.github/workflows/containers/github-action-ci/Dockerfile @@ -1,8 +1,8 @@ -FROM docker.io/library/ubuntu:24.04 as base +FROM docker.io/library/ubuntu:24.04 AS base ENV LLVM_SYSROOT=/opt/llvm -FROM base as stage1-toolchain -ENV LLVM_VERSION=21.1.0 +FROM base AS stage1-toolchain +ENV LLVM_VERSION=21.1.1 RUN apt-get update && \ apt-get install -y \ @@ -37,7 +37,7 @@ RUN cmake -B ./build -G Ninja ./llvm \ RUN ninja -C ./build stage2-clang-bolt stage2-install-distribution && ninja -C ./build install-distribution -FROM base as ci-container +FROM base AS ci-container COPY --from=stage1-toolchain $LLVM_SYSROOT $LLVM_SYSROOT @@ -62,6 +62,7 @@ RUN apt-get update && \ # Having a symlink from python to python3 enables code sharing between # the Linux and Windows pipelines. 
python3-pip \ + python3-venv \ file \ tzdata \ python-is-python3 && \ @@ -97,7 +98,7 @@ RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers USER gha WORKDIR /home/gha -FROM ci-container as ci-container-agent +FROM ci-container AS ci-container-agent ENV GITHUB_RUNNER_VERSION=2.328.0 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 8cdd39c164cca..b5f3413fe3b6b 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -95,9 +95,9 @@ jobs: workflow: - '.github/workflows/docs.yml' - name: Setup Python env - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.11' + python-version: '3.13' cache: 'pip' cache-dependency-path: 'llvm/docs/requirements-hashed.txt' - name: Install python dependencies @@ -209,7 +209,7 @@ jobs: mkdir built-docs/flang cp -r flang-build/docs/* built-docs/flang/ - name: Upload docs - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: docs-output path: built-docs/ diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml index 9390fba4d4e3b..981c6fa62cb19 100644 --- a/.github/workflows/email-check.yaml +++ b/.github/workflows/email-check.yaml @@ -39,7 +39,7 @@ jobs: [{"body" : "$COMMENT"}] EOF - - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() with: name: workflow-args diff --git a/.github/workflows/gha-codeql.yml b/.github/workflows/gha-codeql.yml index efb8143877c4e..63388ebc706bd 100644 --- a/.github/workflows/gha-codeql.yml +++ b/.github/workflows/gha-codeql.yml @@ -29,9 +29,9 @@ jobs: sparse-checkout: | .github/ - name: Initialize CodeQL - uses: github/codeql-action/init@192325c86100d080feab897ff886c34abd4c83a3 # v3.30.3 + uses: github/codeql-action/init@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 with: languages: actions queries: security-extended - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@192325c86100d080feab897ff886c34abd4c83a3 # v3.30.3 + uses: github/codeql-action/analyze@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 diff --git a/.github/workflows/hlsl-test-all.yaml b/.github/workflows/hlsl-test-all.yaml index 72cbbe2b7dded..dcb852312d41a 100644 --- a/.github/workflows/hlsl-test-all.yaml +++ b/.github/workflows/hlsl-test-all.yaml @@ -80,7 +80,7 @@ jobs: ninja check-hlsl-unit ninja ${{ inputs.TestTarget }} - name: Publish Test Results - uses: EnricoMi/publish-unit-test-result-action/macos@170bf24d20d201b842d7a52403b73ed297e6645b # v2 + uses: EnricoMi/publish-unit-test-result-action/macos@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2.20.0 if: always() && runner.os == 'macOS' with: comment_mode: off diff --git a/.github/workflows/issue-write.yml b/.github/workflows/issue-write.yml index db9389b6afe53..26cd60c070251 100644 --- a/.github/workflows/issue-write.yml +++ b/.github/workflows/issue-write.yml @@ -40,7 +40,7 @@ jobs: - name: 'Comment on PR' if: steps.download-artifact.outputs.artifact-id != '' - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | diff --git a/.github/workflows/libc-fullbuild-tests.yml 
b/.github/workflows/libc-fullbuild-tests.yml index 8967cd0949c11..3a048aeb9405b 100644 --- a/.github/workflows/libc-fullbuild-tests.yml +++ b/.github/workflows/libc-fullbuild-tests.yml @@ -61,7 +61,7 @@ jobs: # Do not use direct GHAC access even though it is supported by sccache. GHAC rejects # frequent small object writes. - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19 with: max-size: 1G key: libc_fullbuild_${{ matrix.c_compiler }} diff --git a/.github/workflows/libc-overlay-tests.yml b/.github/workflows/libc-overlay-tests.yml index 7154946ac5c3d..df9a20dce8eae 100644 --- a/.github/workflows/libc-overlay-tests.yml +++ b/.github/workflows/libc-overlay-tests.yml @@ -51,7 +51,7 @@ jobs: # Do not use direct GHAC access even though it is supported by sccache. GHAC rejects # frequent small object writes. - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19 with: max-size: 1G key: libc_overlay_build_${{ matrix.os }}_${{ matrix.compiler.c_compiler }} diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml index d53a2f306afa2..5ccf976848197 100644 --- a/.github/workflows/libclang-abi-tests.yml +++ b/.github/workflows/libclang-abi-tests.yml @@ -131,7 +131,7 @@ jobs: sed -i 's/LLVM_[0-9]\+/LLVM_NOVERSION/' $lib-${{ matrix.ref }}.abi done - name: Upload ABI file - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 with: name: ${{ matrix.name }} path: '*${{ matrix.ref }}.abi' @@ -165,7 +165,7 @@ jobs: done - name: Upload ABI Comparison if: always() - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 with: name: compat-report-${{ github.sha }} path: compat_reports/ diff --git a/.github/workflows/libclang-python-tests.yml b/.github/workflows/libclang-python-tests.yml index e168928325561..8fb8cec3b4f00 100644 --- a/.github/workflows/libclang-python-tests.yml +++ b/.github/workflows/libclang-python-tests.yml @@ -34,11 +34,11 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup Python - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: python-version: ${{ matrix.python-version }} - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19 with: max-size: 2G key: spirv-ubuntu-24.04 diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 2e6ff7f91b6fc..5fe2ffbf58b43 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -60,7 +60,7 @@ jobs: env: CC: ${{ matrix.cc }} CXX: ${{ matrix.cxx }} - - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() with: name: ${{ matrix.config }}-${{ matrix.cxx }}-results @@ -105,7 +105,7 @@ jobs: env: CC: ${{ matrix.cc }} 
CXX: ${{ matrix.cxx }} - - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() # Upload artifacts even if the build or test suite fails with: name: ${{ matrix.config }}-${{ matrix.cxx }}-results @@ -169,7 +169,7 @@ jobs: env: CC: clang-22 CXX: clang++-22 - - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() with: name: ${{ matrix.config }}-results @@ -223,7 +223,7 @@ jobs: source .venv/bin/activate python -m pip install psutil bash libcxx/utils/ci/run-buildbot ${{ matrix.config }} - - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() # Upload artifacts even if the build or test suite fails with: name: macos-${{ matrix.config }}-results diff --git a/.github/workflows/libcxx-build-containers.yml b/.github/workflows/libcxx-build-containers.yml index cbaa8e0f65129..312cb47fc3d93 100644 --- a/.github/workflows/libcxx-build-containers.yml +++ b/.github/workflows/libcxx-build-containers.yml @@ -55,7 +55,7 @@ jobs: TAG: ${{ github.sha }} - name: Log in to GitHub Container Registry - uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0 + uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0 with: registry: ghcr.io username: ${{ github.actor }} diff --git a/.github/workflows/libcxx-check-generated-files.yml b/.github/workflows/libcxx-check-generated-files.yml index f338bd6952779..d34b6a79556d1 100644 --- a/.github/workflows/libcxx-check-generated-files.yml +++ b/.github/workflows/libcxx-check-generated-files.yml @@ -15,7 +15,7 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Install dependencies - uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1 + uses: aminya/setup-cpp@a276e6e3d1db9160db5edc458e99a30d3b109949 # v1.7.1 with: clangformat: 17.0.1 ninja: true diff --git a/.github/workflows/libcxx-run-benchmarks.yml b/.github/workflows/libcxx-run-benchmarks.yml index 17a97df029ba5..0379a0a1f857d 100644 --- a/.github/workflows/libcxx-run-benchmarks.yml +++ b/.github/workflows/libcxx-run-benchmarks.yml @@ -35,7 +35,7 @@ jobs: steps: - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.10' + python-version: '3.13' - name: Extract information from the PR id: vars diff --git a/.github/workflows/llvm-bugs.yml b/.github/workflows/llvm-bugs.yml index 5470662c97628..7d42abfadde7b 100644 --- a/.github/workflows/llvm-bugs.yml +++ b/.github/workflows/llvm-bugs.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-24.04 if: github.repository == 'llvm/llvm-project' steps: - - uses: actions/setup-node@1d0ff469b7ec7b3cb9d8673fde0c81c44821de2a # v4.2.0 + - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0 with: node-version: 18 check-latest: true diff --git a/.github/workflows/llvm-tests.yml b/.github/workflows/llvm-tests.yml index ea80e229512d5..c4701c7283da0 100644 --- a/.github/workflows/llvm-tests.yml +++ b/.github/workflows/llvm-tests.yml @@ -128,14 +128,14 @@ jobs: # Remove symbol versioning from dumps, so we can compare across major versions. 
sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' ${{ matrix.ref }}.abi - name: Upload ABI file - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 with: name: ${{ matrix.name }} path: ${{ matrix.ref }}.abi - name: Upload symbol list file if: matrix.name == 'build-baseline' - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 with: name: symbol-list path: llvm.symbols @@ -179,7 +179,7 @@ jobs: abi-compliance-checker $EXTRA_ARGS -l libLLVM.so -old build-baseline/*.abi -new build-latest/*.abi || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c" - name: Upload ABI Comparison if: always() - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 with: name: compat-report-${{ github.sha }} path: compat_reports/ diff --git a/.github/workflows/mlir-spirv-tests.yml b/.github/workflows/mlir-spirv-tests.yml index 78952ccad2642..5bb16c739cdde 100644 --- a/.github/workflows/mlir-spirv-tests.yml +++ b/.github/workflows/mlir-spirv-tests.yml @@ -30,7 +30,7 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19 with: max-size: 2G key: spirv-mlir-ubuntu-24.04 diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml index 61c8680cd72a1..1e0dc7045c1cc 100644 --- a/.github/workflows/pr-code-format.yml +++ b/.github/workflows/pr-code-format.yml @@ -43,14 +43,14 @@ jobs: # of a release cycle (x.1.0) or the last version of a release cycle, or # if there have been relevant clang-format backports. 
- name: Install clang-format - uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1 + uses: aminya/setup-cpp@a276e6e3d1db9160db5edc458e99a30d3b109949 # v1.7.1 with: clangformat: 21.1.0 - name: Setup Python env - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.11' + python-version: '3.13' cache: 'pip' cache-dependency-path: 'llvm/utils/git/requirements_formatting.txt' @@ -72,7 +72,7 @@ jobs: --end-rev HEAD \ --changed-files "$CHANGED_FILES" - - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() with: name: workflow-args diff --git a/.github/workflows/pr-code-lint.yml b/.github/workflows/pr-code-lint.yml index bc70933147bd2..776ec4af9d2dc 100644 --- a/.github/workflows/pr-code-lint.yml +++ b/.github/workflows/pr-code-lint.yml @@ -27,13 +27,13 @@ jobs: cancel-in-progress: true steps: - name: Fetch LLVM sources - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4.3.0 with: fetch-depth: 2 - name: Get changed files id: changed-files - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 + uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 with: separator: "," skip_initial_fetch: true @@ -47,15 +47,18 @@ jobs: echo "Changed files:" echo "$CHANGED_FILES" + # The clang tidy version should always be upgraded to the first version + # of a release cycle (x.1.0) or the last version of a release cycle, or + # if there have been relevant clang-format backports. 
- name: Install clang-tidy - uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1 + uses: aminya/setup-cpp@a276e6e3d1db9160db5edc458e99a30d3b109949 # v1.7.1 with: - clang-tidy: 20.1.8 + clang-tidy: 21.1.0 - name: Setup Python env - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.12' + python-version: '3.13' - name: Install Python dependencies run: python3 -m pip install -r llvm/utils/git/requirements_linting.txt @@ -104,7 +107,7 @@ jobs: --changed-files "$CHANGED_FILES" - name: Upload results - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() with: name: workflow-args diff --git a/.github/workflows/pr-request-release-note.yml b/.github/workflows/pr-request-release-note.yml index f0197d71d6aa9..8162a8984ee5f 100644 --- a/.github/workflows/pr-request-release-note.yml +++ b/.github/workflows/pr-request-release-note.yml @@ -41,7 +41,7 @@ jobs: request-release-note \ --pr-number ${{ github.event.pull_request.number}} - - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 if: always() with: name: workflow-args diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 63ab4a8356971..a9c107e4a5f08 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -76,7 +76,7 @@ jobs: # https://github.com/actions/upload-artifact/issues/569 continue-on-error: true if: '!cancelled()' - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: Premerge Artifacts (Linux) path: artifacts/ @@ -130,7 +130,7 @@ jobs: # https://github.com/actions/upload-artifact/issues/569 continue-on-error: true if: '!cancelled()' - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: Premerge Artifacts (Windows) path: artifacts/ @@ -151,7 +151,7 @@ jobs: with: fetch-depth: 2 - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19 with: max-size: "2000M" - name: Install Ninja diff --git a/.github/workflows/release-asset-audit.yml b/.github/workflows/release-asset-audit.yml index 6546540a1b547..8b24948b568eb 100644 --- a/.github/workflows/release-asset-audit.yml +++ b/.github/workflows/release-asset-audit.yml @@ -38,7 +38,7 @@ jobs: if: >- github.event_name != 'pull_request' && failure() - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1 + uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0 with: github-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }} script: | diff --git a/.github/workflows/release-binaries-save-stage/action.yml b/.github/workflows/release-binaries-save-stage/action.yml index f08088c7bc56f..84ccf98c23a82 100644 --- a/.github/workflows/release-binaries-save-stage/action.yml +++ b/.github/workflows/release-binaries-save-stage/action.yml @@ -30,14 +30,14 @@ runs: tar -C ${{ inputs.build-prefix }} -c build/ | zstd -T0 -c > build.tar.zst - name: Upload Stage 1 Source 
- uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: ${{ runner.os }}-${{ runner.arch }}-${{ github.job }}-source path: llvm-project.tar.zst retention-days: 2 - name: Upload Stage 1 Build Dir - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: ${{ runner.os}}-${{ runner.arch }}-${{ github.job }}-build path: build.tar.zst diff --git a/.github/workflows/release-binaries-setup-stage/action.yml b/.github/workflows/release-binaries-setup-stage/action.yml index 8f45e22886b6e..475a25fa6b772 100644 --- a/.github/workflows/release-binaries-setup-stage/action.yml +++ b/.github/workflows/release-binaries-setup-stage/action.yml @@ -22,7 +22,7 @@ runs: using: "composite" steps: - name: Install Ninja - uses: llvm/actions/install-ninja@22e9f909d35b50bd1181709564bfe816eaeaae81 # main + uses: llvm/actions/install-ninja@a1ea791b03c8e61f53a0e66f2f73db283aa0f01e # main - name: Setup Windows if: startsWith(runner.os, 'Windows') diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 8f422a0147748..cba48e4d0c70a 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -68,9 +68,9 @@ jobs: steps: # It's good practice to use setup-python, but this is also required on macos-14 # due to https://github.com/actions/runner-images/issues/10385 - - uses: actions/setup-python@39cd14951b08e74b54015e9e001cdefcf80e669f + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.12' + python-version: '3.13' - name: Checkout LLVM uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -250,7 +250,7 @@ jobs: release_dir=`find ${{ steps.setup-stage.outputs.build-prefix }}/build -iname 'stage2-bins'` mv $release_dir/${{ needs.prepare.outputs.release-binary-filename }} . 
- - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 #v4.3.0 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: ${{ runner.os }}-${{ runner.arch }}-release-binary # Due to path differences on Windows when running in bash vs running on node, @@ -301,7 +301,7 @@ jobs: - name: Attest Build Provenance id: provenance - uses: actions/attest-build-provenance@897ed5eab6ed058a474202017ada7f40bfa52940 # v1.0.0 + uses: actions/attest-build-provenance@ef244123eb79f2f7a7e75d99086184180e6d0018 # v1.4.4 with: subject-path: ${{ needs.prepare.outputs.release-binary-filename }} @@ -310,7 +310,7 @@ jobs: mv ${{ steps.provenance.outputs.bundle-path }} ${{ needs.prepare.outputs.release-binary-filename }}.jsonl - name: Upload Build Provenance - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 #v4.3.3 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: ${{ needs.prepare.outputs.release-binary-filename }}-attestation path: ${{ needs.prepare.outputs.release-binary-filename }}.jsonl diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml index 712ff1831170e..d3d375d3a6df9 100644 --- a/.github/workflows/release-documentation.yml +++ b/.github/workflows/release-documentation.yml @@ -37,7 +37,7 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup Python env - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: cache: 'pip' cache-dependency-path: './llvm/docs/requirements.txt' @@ -59,7 +59,7 @@ jobs: ./llvm/utils/release/build-docs.sh -release "${{ inputs.release-version }}" -no-doxygen - name: Create Release Notes Artifact - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # 4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2 with: name: release-notes path: docs-build/html-export/ diff --git a/.github/workflows/release-doxygen.yml b/.github/workflows/release-doxygen.yml index 17c677413f744..79e509e5e6a8b 100644 --- a/.github/workflows/release-doxygen.yml +++ b/.github/workflows/release-doxygen.yml @@ -43,7 +43,7 @@ jobs: uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup Python env - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: cache: 'pip' cache-dependency-path: './llvm/docs/requirements.txt' diff --git a/.github/workflows/release-lit.yml b/.github/workflows/release-lit.yml index 60ec64462bc31..8b1ce04e12c4f 100644 --- a/.github/workflows/release-lit.yml +++ b/.github/workflows/release-lit.yml @@ -45,7 +45,7 @@ jobs: ./llvm/utils/release/./github-upload-release.py --token "$GITHUB_TOKEN" --user ${{ github.actor }} --user-token "$USER_TOKEN" check-permissions - name: Setup Cpp - uses: aminya/setup-cpp@17c11551771948abc5752bbf3183482567c7caf0 # v1.1.1 + uses: aminya/setup-cpp@a276e6e3d1db9160db5edc458e99a30d3b109949 # v1.7.1 with: compiler: llvm-16.0.6 cmake: true @@ -66,14 +66,14 @@ jobs: python3 setup.py sdist bdist_wheel - name: Upload lit to test.pypi.org - uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0 with: password: ${{ secrets.LLVM_LIT_TEST_PYPI_API_TOKEN }} 
repository-url: https://test.pypi.org/legacy/ packages-dir: llvm/utils/lit/dist/ - name: Upload lit to pypi.org - uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 + uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0 with: password: ${{ secrets.LLVM_LIT_PYPI_API_TOKEN }} packages-dir: llvm/utils/lit/dist/ diff --git a/.github/workflows/release-sources.yml b/.github/workflows/release-sources.yml index 14cc4c4e9b94f..2278b96dbe242 100644 --- a/.github/workflows/release-sources.yml +++ b/.github/workflows/release-sources.yml @@ -92,14 +92,14 @@ jobs: - name: Attest Build Provenance if: github.event_name != 'pull_request' id: provenance - uses: actions/attest-build-provenance@897ed5eab6ed058a474202017ada7f40bfa52940 # v1.0.0 + uses: actions/attest-build-provenance@ef244123eb79f2f7a7e75d99086184180e6d0018 # v1.4.4 with: subject-path: "*.xz" - if: github.event_name != 'pull_request' run: | mv ${{ steps.provenance.outputs.bundle-path }} . - name: Create Tarball Artifacts - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 #v4.3.3 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: path: | *.xz diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 40db5504294ef..c07df338cf989 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -36,7 +36,7 @@ jobs: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@f49aabe0b5af0936a0987cfb85d86b75731b0186 # v2.4.1 + uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2 with: results_file: results.sarif results_format: sarif @@ -49,7 +49,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: SARIF file path: results.sarif @@ -57,6 +57,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@80f993039571a6de66594ecaa432875a6942e8e0 # v2.20.6 + uses: github/codeql-action/upload-sarif@b8d3b6e8af63cde30bdc382c0bc28114f4346c88 # v2.28.1 with: sarif_file: results.sarif diff --git a/.github/workflows/spirv-tests.yml b/.github/workflows/spirv-tests.yml index 8708fb06d9eb8..69374ae563306 100644 --- a/.github/workflows/spirv-tests.yml +++ b/.github/workflows/spirv-tests.yml @@ -26,7 +26,7 @@ jobs: steps: - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - name: Setup ccache - uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19 with: max-size: 2G key: spirv-ubuntu-24.04 diff --git a/.github/workflows/unprivileged-download-artifact/action.yml b/.github/workflows/unprivileged-download-artifact/action.yml index 9d8fb59a67c0e..5b50d7ce3d3fb 100644 --- a/.github/workflows/unprivileged-download-artifact/action.yml +++ b/.github/workflows/unprivileged-download-artifact/action.yml @@ -27,7 +27,7 @@ outputs: runs: using: "composite" steps: - - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea #v7.0.1 + - uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7.1.0 id: artifact-url with: script: | diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index 72c8817daa714..082f1cec34d52 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -326,6 +326,9 @@ class BinaryContext { /// Returns true if DWARF4 or lower is used. bool isDWARFLegacyUsed() const { return ContainsDwarfLegacy; } + /// Returns true if DWARFUnit is valid. + bool isValidDwarfUnit(DWARFUnit &DU) const; + std::map &getDwarfLineTables() { return DwarfLineTablesCUMap; } diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 51b139a15e1a0..7e0e3bff83259 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -192,9 +192,6 @@ class BinaryFunction { mutable MCSymbol *FunctionConstantIslandLabel{nullptr}; mutable MCSymbol *FunctionColdConstantIslandLabel{nullptr}; - - // Returns constant island alignment - uint16_t getAlignment() const { return sizeof(uint64_t); } }; static constexpr uint64_t COUNT_NO_PROFILE = @@ -2114,9 +2111,7 @@ class BinaryFunction { return *std::prev(CodeIter) <= *DataIter; } - uint16_t getConstantIslandAlignment() const { - return Islands ? Islands->getAlignment() : 1; - } + uint16_t getConstantIslandAlignment() const; /// If there is a constant island in the range [StartOffset, EndOffset), /// return its address. @@ -2168,6 +2163,11 @@ class BinaryFunction { return Islands && !Islands->DataOffsets.empty(); } + /// Return true if the whole function is a constant island. + bool isDataObject() const { + return Islands && Islands->CodeOffsets.size() == 0; + } + bool isStartOfConstantIsland(uint64_t Offset) const { return hasConstantIsland() && Islands->DataOffsets.count(Offset); } diff --git a/bolt/include/bolt/Core/DIEBuilder.h b/bolt/include/bolt/Core/DIEBuilder.h index e4a4fc6b2f258..4c3c277adf422 100644 --- a/bolt/include/bolt/Core/DIEBuilder.h +++ b/bolt/include/bolt/Core/DIEBuilder.h @@ -217,7 +217,8 @@ class DIEBuilder { std::optional Parent, uint32_t NumberParentsInChain); - void registerUnit(DWARFUnit &DU, bool NeedSort); + /// Returns true if DWARFUnit is registered successfully. 
+ bool registerUnit(DWARFUnit &DU, bool NeedSort); /// \return the unique ID of \p U if it exists. std::optional getUnitId(const DWARFUnit &DU); diff --git a/bolt/include/bolt/Core/FunctionLayout.h b/bolt/include/bolt/Core/FunctionLayout.h index ee4dd689b8dd6..240d5138a093a 100644 --- a/bolt/include/bolt/Core/FunctionLayout.h +++ b/bolt/include/bolt/Core/FunctionLayout.h @@ -232,8 +232,24 @@ class FunctionLayout { return Blocks[Index]; } + /// Return the basic block after the given basic block iterator in the layout + /// or nullptr if the last basic block iterator is given. + const BinaryBasicBlock *getBasicBlockAfter(block_const_iterator BlockIt, + bool IgnoreSplits = true) const; + + /// Returns the basic block after the given basic block in the layout or + /// nullptr if the last basic block is given. + /// + /// Note: prefer the version that takes the iterator as this function uses + /// linear basic block lookup. + const BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB, + bool IgnoreSplits = true) const; + /// Returns the basic block after the given basic block in the layout or /// nullptr if the last basic block is given. + /// + /// Note: prefer the version that takes the iterator as this function uses + /// linear basic block lookup. BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *const BB, const bool IgnoreSplits = true) { return const_cast( @@ -241,11 +257,6 @@ class FunctionLayout { BB, IgnoreSplits)); } - /// Returns the basic block after the given basic block in the layout or - /// nullptr if the last basic block is given. - const BinaryBasicBlock *getBasicBlockAfter(const BinaryBasicBlock *BB, - bool IgnoreSplits = true) const; - /// True if the layout contains at least two non-empty fragments. bool isSplit() const; diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 90129d475d870..5b711b0e27bab 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -51,6 +51,7 @@ class raw_ostream; namespace bolt { class BinaryBasicBlock; +class BinaryContext; class BinaryFunction; /// Different types of indirect branches encountered during disassembly. @@ -530,10 +531,15 @@ class MCPlusBuilder { return 0; } + /// Create a helper function to increment counter for Instrumentation + virtual void createInstrCounterIncrFunc(BinaryContext &BC) { + llvm_unreachable("not implemented"); + } + /// Create increment contents of target by 1 for Instrumentation - virtual InstructionListType - createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, - unsigned CodePointerSize) const { + virtual InstructionListType createInstrIncMemory(const MCSymbol *Target, + MCContext *Ctx, bool IsLeaf, + unsigned CodePointerSize) { llvm_unreachable("not implemented"); return InstructionListType(); } diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index 72c72bbaf4a65..98440cde7cebd 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -1624,10 +1624,26 @@ DWARFContext *BinaryContext::getDWOContext() const { return &DWOCUs.begin()->second->getContext(); } +bool BinaryContext::isValidDwarfUnit(DWARFUnit &DU) const { + // Invalid DWARF unit with a DWOId but lacking a dwo_name. 
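+  // Such a skeleton unit cannot be matched with its split (.dwo) unit, so
+  // callers (preprocessDWODebugInfo, DIEBuilder::registerUnit) skip it after
+  // the error reported below.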
+ if (DU.getDWOId() && !DU.isDWOUnit() && + !DU.getUnitDIE().find( + {dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name})) { + this->outs() << "BOLT-ERROR: broken DWARF found in CU at offset 0x" + << Twine::utohexstr(DU.getOffset()) << " (DWOId=0x" + << Twine::utohexstr(*(DU.getDWOId())) + << ", missing DW_AT_dwo_name / DW_AT_GNU_dwo_name)\n"; + return false; + } + return true; +} + /// Handles DWO sections that can either be in .o, .dwo or .dwp files. void BinaryContext::preprocessDWODebugInfo() { for (const std::unique_ptr &CU : DwCtx->compile_units()) { DWARFUnit *const DwarfUnit = CU.get(); + if (!isValidDwarfUnit(*DwarfUnit)) + continue; if (std::optional DWOId = DwarfUnit->getDWOId()) { std::string DWOName = dwarf::toString( DwarfUnit->getUnitDIE().find( diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 578a87dc6c09d..07bc71ee538d6 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -284,6 +284,33 @@ BinaryFunction::getBasicBlockContainingOffset(uint64_t Offset) { return (Offset < BB->getOffset() + BB->getOriginalSize()) ? BB : nullptr; } +uint16_t BinaryFunction::getConstantIslandAlignment() const { + if (Islands == nullptr) + return 1; + + // For constant island inside a function, the default 8-byte alignment is + // probably good enough. + const uint16_t DefaultAlignment = sizeof(uint64_t); + if (!isDataObject()) + return DefaultAlignment; + + // If the constant island itself is a binary function, get its alignment + // based on its size, original address, and its owning section's alignment. + const uint64_t MaxAlignment = + std::min(uint64_t(1) << llvm::countr_zero(getAddress()), + OriginSection->getAlignment()); + const uint64_t MinAlignment = + std::max((uint64_t)DefaultAlignment, + uint64_t(1) << (63 - llvm::countl_zero(getSize()))); + uint64_t Alignment = std::min(MinAlignment, MaxAlignment); + if (Alignment >> 16) { + BC.errs() << "BOLT-ERROR: the constant island's alignment is too big: 0x" + << Twine::utohexstr(Alignment) << "\n"; + exit(1); + } + return (uint16_t)Alignment; +} + void BinaryFunction::markUnreachableBlocks() { std::stack Stack; @@ -3598,7 +3625,9 @@ void BinaryFunction::fixBranches() { auto &MIB = BC.MIB; MCContext *Ctx = BC.Ctx.get(); - for (BinaryBasicBlock *BB : BasicBlocks) { + for (auto BBI = Layout.block_begin(), BBE = Layout.block_end(); BBI != BBE; + ++BBI) { + BinaryBasicBlock *BB = *BBI; const MCSymbol *TBB = nullptr; const MCSymbol *FBB = nullptr; MCInst *CondBranch = nullptr; @@ -3612,7 +3641,7 @@ void BinaryFunction::fixBranches() { // Basic block that follows the current one in the final layout. const BinaryBasicBlock *const NextBB = - Layout.getBasicBlockAfter(BB, /*IgnoreSplits=*/false); + Layout.getBasicBlockAfter(BBI, /*IgnoreSplits*/ false); if (BB->succ_size() == 1) { // __builtin_unreachable() could create a conditional branch that diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp index b041dc5ea1cce..7ce55f9165136 100644 --- a/bolt/lib/Core/DIEBuilder.cpp +++ b/bolt/lib/Core/DIEBuilder.cpp @@ -584,7 +584,8 @@ DWARFDie DIEBuilder::resolveDIEReference( if ((RefCU = getUnitForOffset(*this, *DwarfContext, TmpRefOffset, AttrSpec))) { /// Trying to add to current working set in case it's cross CU reference. 
- registerUnit(*RefCU, true); + if (!registerUnit(*RefCU, true)) + return DWARFDie(); DWARFDataExtractor DebugInfoData = RefCU->getDebugInfoExtractor(); if (DwarfDebugInfoEntry.extractFast(*RefCU, &TmpRefOffset, DebugInfoData, RefCU->getNextUnitOffset(), 0)) { @@ -1008,12 +1009,14 @@ static uint64_t getHash(const DWARFUnit &DU) { return DU.getOffset(); } -void DIEBuilder::registerUnit(DWARFUnit &DU, bool NeedSort) { +bool DIEBuilder::registerUnit(DWARFUnit &DU, bool NeedSort) { + if (!BC.isValidDwarfUnit(DU)) + return false; auto IterGlobal = AllProcessed.insert(getHash(DU)); // If DU is already in a current working set or was already processed we can // skip it. if (!IterGlobal.second) - return; + return true; if (getState().Type == ProcessingType::DWARF4TUs) { getState().DWARF4TUVector.push_back(&DU); } else if (getState().Type == ProcessingType::DWARF5TUs) { @@ -1034,6 +1037,7 @@ void DIEBuilder::registerUnit(DWARFUnit &DU, bool NeedSort) { if (getState().DUList.size() == getState().CloneUnitCtxMap.size()) getState().CloneUnitCtxMap.emplace_back(); getState().DUList.push_back(&DU); + return true; } std::optional DIEBuilder::getUnitId(const DWARFUnit &DU) { diff --git a/bolt/lib/Core/FunctionLayout.cpp b/bolt/lib/Core/FunctionLayout.cpp index 4498fc44da954..98ed6e1320b3e 100644 --- a/bolt/lib/Core/FunctionLayout.cpp +++ b/bolt/lib/Core/FunctionLayout.cpp @@ -224,23 +224,29 @@ void FunctionLayout::clear() { } const BinaryBasicBlock * -FunctionLayout::getBasicBlockAfter(const BinaryBasicBlock *BB, +FunctionLayout::getBasicBlockAfter(block_const_iterator BBIter, bool IgnoreSplits) const { - const block_const_iterator BBPos = find(blocks(), BB); - if (BBPos == block_end()) - return nullptr; - - const block_const_iterator BlockAfter = std::next(BBPos); + const block_const_iterator BlockAfter = std::next(BBIter); if (BlockAfter == block_end()) return nullptr; if (!IgnoreSplits) - if (BlockAfter == getFragment(BB->getFragmentNum()).end()) + if (BlockAfter == getFragment((*BBIter)->getFragmentNum()).end()) return nullptr; return *BlockAfter; } +const BinaryBasicBlock * +FunctionLayout::getBasicBlockAfter(const BinaryBasicBlock *BB, + bool IgnoreSplits) const { + const block_const_iterator BBPos = find(blocks(), BB); + if (BBPos == block_end()) + return nullptr; + + return getBasicBlockAfter(BBPos, IgnoreSplits); +} + bool FunctionLayout::isSplit() const { const unsigned NonEmptyFragCount = llvm::count_if( fragments(), [](const FunctionFragment &FF) { return !FF.empty(); }); diff --git a/bolt/lib/Core/GDBIndex.cpp b/bolt/lib/Core/GDBIndex.cpp index c7fb4889646b4..4c34f5ee7fca7 100644 --- a/bolt/lib/Core/GDBIndex.cpp +++ b/bolt/lib/Core/GDBIndex.cpp @@ -77,7 +77,8 @@ void GDBIndex::updateGdbIndexSection( exit(1); } DenseSet OriginalOffsets; - for (unsigned Index = 0, Units = BC.DwCtx->getNumCompileUnits(); + for (unsigned Index = 0, PresentUnitsIndex = 0, + Units = BC.DwCtx->getNumCompileUnits(); Index < Units; ++Index) { const DWARFUnit *CU = BC.DwCtx->getUnitAtIndex(Index); if (SkipTypeUnits && CU->isTypeUnit()) @@ -90,7 +91,7 @@ void GDBIndex::updateGdbIndexSection( } OriginalOffsets.insert(Offset); - OffsetToIndexMap[Offset] = Index; + OffsetToIndexMap[Offset] = PresentUnitsIndex++; } // Ignore old address table. @@ -99,10 +100,19 @@ void GDBIndex::updateGdbIndexSection( Data += SymbolTableOffset - CUTypesOffset; // Calculate the size of the new address table. 
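+  // Each address-table entry is 20 bytes: 8-byte low PC, 8-byte high PC and a
+  // 4-byte CU index. Entries with an empty range are dropped when the table is
+  // rewritten below, so only valid ranges are counted here.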
+ const auto IsValidAddressRange = [](const DebugAddressRange &Range) { + return Range.HighPC > Range.LowPC; + }; + uint32_t NewAddressTableSize = 0; for (const auto &CURangesPair : ARangesSectionWriter.getCUAddressRanges()) { const SmallVector &Ranges = CURangesPair.second; - NewAddressTableSize += Ranges.size() * 20; + NewAddressTableSize += + llvm::count_if(Ranges, + [&IsValidAddressRange](const DebugAddressRange &Range) { + return IsValidAddressRange(Range); + }) * + 20; } // Difference between old and new table (and section) sizes. @@ -125,16 +135,52 @@ void GDBIndex::updateGdbIndexSection( using MapEntry = std::pair; std::vector CUVector(CUMap.begin(), CUMap.end()); + // Remove the CUs we won't emit anyway. + CUVector.erase(std::remove_if(CUVector.begin(), CUVector.end(), + [&OriginalOffsets](const MapEntry &It) { + // Skipping TU for DWARF5 when they are not + // included in CU list. + return OriginalOffsets.count(It.first) == 0; + }), + CUVector.end()); // Need to sort since we write out all of TUs in .debug_info before CUs. std::sort(CUVector.begin(), CUVector.end(), [](const MapEntry &E1, const MapEntry &E2) -> bool { return E1.second.Offset < E2.second.Offset; }); + // Create the original CU index -> updated CU index mapping, + // as the sort above could've changed the order and we have to update + // indices correspondingly in address map and constant pool. + std::unordered_map OriginalCUIndexToUpdatedCUIndexMap; + OriginalCUIndexToUpdatedCUIndexMap.reserve(CUVector.size()); + for (uint32_t I = 0; I < CUVector.size(); ++I) { + OriginalCUIndexToUpdatedCUIndexMap[OffsetToIndexMap.at(CUVector[I].first)] = + I; + } + const auto RemapCUIndex = [&OriginalCUIndexToUpdatedCUIndexMap, + CUVectorSize = CUVector.size(), + TUVectorSize = getGDBIndexTUEntryVector().size()]( + uint32_t OriginalIndex) { + if (OriginalIndex >= CUVectorSize) { + if (OriginalIndex >= CUVectorSize + TUVectorSize) { + errs() << "BOLT-ERROR: .gdb_index unknown CU index\n"; + exit(1); + } + // The index is into TU CU List, which we don't reorder, so return as is. + return OriginalIndex; + } + + const auto It = OriginalCUIndexToUpdatedCUIndexMap.find(OriginalIndex); + if (It == OriginalCUIndexToUpdatedCUIndexMap.end()) { + errs() << "BOLT-ERROR: .gdb_index unknown CU index\n"; + exit(1); + } + + return It->second; + }; + // Writing out CU List for (auto &CUInfo : CUVector) { - // Skipping TU for DWARF5 when they are not included in CU list. - if (!OriginalOffsets.count(CUInfo.first)) - continue; write64le(Buffer, CUInfo.second.Offset); // Length encoded in CU doesn't contain first 4 bytes that encode length. write64le(Buffer + 8, CUInfo.second.Length + 4); @@ -160,13 +206,19 @@ void GDBIndex::updateGdbIndexSection( // Generate new address table. for (const std::pair &CURangesPair : ARangesSectionWriter.getCUAddressRanges()) { - const uint32_t CUIndex = OffsetToIndexMap[CURangesPair.first]; + const uint32_t OriginalCUIndex = OffsetToIndexMap[CURangesPair.first]; + const uint32_t UpdatedCUIndex = RemapCUIndex(OriginalCUIndex); const DebugAddressRangesVector &Ranges = CURangesPair.second; for (const DebugAddressRange &Range : Ranges) { - write64le(Buffer, Range.LowPC); - write64le(Buffer + 8, Range.HighPC); - write32le(Buffer + 16, CUIndex); - Buffer += 20; + // Don't emit ranges that break gdb, + // https://sourceware.org/bugzilla/show_bug.cgi?id=33247. + // We've seen [0, 0) ranges here, for instance. 
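+      // Skipping the entry here keeps the emitted table in sync with
+      // NewAddressTableSize computed above, which counts only valid ranges.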
+ if (IsValidAddressRange(Range)) { + write64le(Buffer, Range.LowPC); + write64le(Buffer + 8, Range.HighPC); + write32le(Buffer + 16, UpdatedCUIndex); + Buffer += 20; + } } } @@ -178,6 +230,56 @@ void GDBIndex::updateGdbIndexSection( // Copy over the rest of the original data. memcpy(Buffer, Data, TrailingSize); + // Fixup CU-indices in constant pool. + const char *const OriginalConstantPoolData = + GdbIndexContents.data() + ConstantPoolOffset; + uint8_t *const UpdatedConstantPoolData = + NewGdbIndexContents + ConstantPoolOffset + Delta; + + const char *OriginalSymbolTableData = + GdbIndexContents.data() + SymbolTableOffset; + std::set CUVectorOffsets; + // Parse the symbol map and extract constant pool CU offsets from it. + while (OriginalSymbolTableData < OriginalConstantPoolData) { + const uint32_t NameOffset = read32le(OriginalSymbolTableData); + const uint32_t CUVectorOffset = read32le(OriginalSymbolTableData + 4); + OriginalSymbolTableData += 8; + + // Iff both are zero, then the slot is considered empty in the hash-map. + if (NameOffset || CUVectorOffset) { + CUVectorOffsets.insert(CUVectorOffset); + } + } + + // Update the CU-indicies in the constant pool + for (const auto CUVectorOffset : CUVectorOffsets) { + const char *CurrentOriginalConstantPoolData = + OriginalConstantPoolData + CUVectorOffset; + uint8_t *CurrentUpdatedConstantPoolData = + UpdatedConstantPoolData + CUVectorOffset; + + const uint32_t Num = read32le(CurrentOriginalConstantPoolData); + CurrentOriginalConstantPoolData += 4; + CurrentUpdatedConstantPoolData += 4; + + for (uint32_t J = 0; J < Num; ++J) { + const uint32_t OriginalCUIndexAndAttributes = + read32le(CurrentOriginalConstantPoolData); + CurrentOriginalConstantPoolData += 4; + + // We only care for the index, which is the lowest 24 bits, other bits are + // left as is. + const uint32_t OriginalCUIndex = + OriginalCUIndexAndAttributes & ((1 << 24) - 1); + const uint32_t Attributes = OriginalCUIndexAndAttributes >> 24; + const uint32_t UpdatedCUIndexAndAttributes = + RemapCUIndex(OriginalCUIndex) | (Attributes << 24); + + write32le(CurrentUpdatedConstantPoolData, UpdatedCUIndexAndAttributes); + CurrentUpdatedConstantPoolData += 4; + } + } + // Register the new section. 
BC.registerOrUpdateNoteSection(".gdb_index", NewGdbIndexContents, NewGdbIndexSize); diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp index c2f876f0dff9e..150461b020f06 100644 --- a/bolt/lib/Passes/Instrumentation.cpp +++ b/bolt/lib/Passes/Instrumentation.cpp @@ -753,6 +753,8 @@ void Instrumentation::createAuxiliaryFunctions(BinaryContext &BC) { createSimpleFunction("__bolt_fini_trampoline", BC.MIB->createReturnInstructionList(BC.Ctx.get())); } + if (BC.isAArch64()) + BC.MIB->createInstrCounterIncrFunc(BC); } } diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp index 6752489ad562a..5c89a424caa7f 100644 --- a/bolt/lib/Rewrite/DWARFRewriter.cpp +++ b/bolt/lib/Rewrite/DWARFRewriter.cpp @@ -504,9 +504,7 @@ static void emitDWOBuilder(const std::string &DWOName, } emitUnit(DWODIEBuilder, *Streamer, SplitCU); } else { - for (std::unique_ptr &CU : - SplitCU.getContext().dwo_compile_units()) - emitUnit(DWODIEBuilder, *Streamer, *CU); + emitUnit(DWODIEBuilder, *Streamer, SplitCU); // emit debug_types sections for dwarf4 for (DWARFUnit *CU : DWODIEBuilder.getDWARF4TUVector()) diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index a6e4dbc9c192f..8b78c53aa99b3 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -1312,7 +1312,9 @@ void RewriteInstance::discoverFileObjects() { // Annotate functions with code/data markers in AArch64 for (auto &[Address, Type] : MarkerSymbols) { - auto *BF = BC->getBinaryFunctionContainingAddress(Address, true, true); + auto *BF = BC->getBinaryFunctionContainingAddress(Address, + /*CheckPastEnd*/ false, + /*UseMaxSize*/ true); if (!BF) { // Stray marker diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index f972646aa12ea..a6589f8f9ee42 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -26,6 +26,7 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegister.h" #include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/DataExtractor.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -35,6 +36,15 @@ using namespace llvm; using namespace bolt; +namespace opts { +extern cl::OptionCategory BoltInstrCategory; +static cl::opt NoLSEAtomics( + "no-lse-atomics", + cl::desc("generate instrumentation code sequence without using LSE atomic " + "instruction"), + cl::init(false), cl::Optional, cl::cat(BoltInstrCategory)); +} // namespace opts + namespace { static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) { @@ -106,7 +116,7 @@ static void storeReg(MCInst &Inst, MCPhysReg From, MCPhysReg To) { } static void atomicAdd(MCInst &Inst, MCPhysReg RegTo, MCPhysReg RegCnt) { - // NOTE: Supports only ARM with LSE extension + assert(!opts::NoLSEAtomics && "Supports only ARM with LSE extension"); Inst.setOpcode(AArch64::LDADDX); Inst.clear(); Inst.addOperand(MCOperand::createReg(AArch64::XZR)); @@ -135,6 +145,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { public: using MCPlusBuilder::MCPlusBuilder; + BinaryFunction *InstrCounterIncrFunc{nullptr}; + std::unique_ptr createTargetSymbolizer(BinaryFunction &Function, bool CreateNewSymbols) const override { @@ -2513,22 +2525,129 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return Insts; } - InstructionListType - createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, 
bool IsLeaf, - unsigned CodePointerSize) const override { + // Instrumentation code sequence using LSE atomic instruction has a total of + // 6 instructions: + // + // stp x0, x1, [sp, #-0x10]! + // adrp x0, page_address(counter) + // add x0, x0, page_offset(counter) + // mov x1, #0x1 + // stadd x1, [x0] + // ldp x0, x1, [sp], #0x10 + // + // Instrumentation code sequence without using LSE atomic instruction has + // 8 instructions at instrumentation place, with 6 instructions in the helper: + // + // stp x0, x30, [sp, #-0x10]! + // stp x1, x2, [sp, #-0x10]! + // adrp x0, page_address(counter) + // add x0, x0, page_offset(counter) + // adrp x1, page_address(helper) + // add x1, x1, page_offset(helper) + // blr x1 + // ldp x0, x30, [sp], #0x10 + // + // : + // ldaxr x1, [x0] + // add x1, x1, #0x1 + // stlxr w2, x1, [x0] + // cbnz w2, + // ldp x1, x2, [sp], #0x10 + // ret + + void createInstrCounterIncrFunc(BinaryContext &BC) override { + assert(InstrCounterIncrFunc == nullptr && + "helper function of counter increment for instrumentation " + "has already been created"); + + if (!opts::NoLSEAtomics) + return; + + MCContext *Ctx = BC.Ctx.get(); + InstrCounterIncrFunc = BC.createInjectedBinaryFunction( + "__bolt_instr_counter_incr", /*IsSimple*/ false); + std::vector> BBs; + + BBs.emplace_back(InstrCounterIncrFunc->createBasicBlock()); + InstructionListType Instrs(4); + Instrs[0].setOpcode(AArch64::LDAXRX); + Instrs[0].clear(); + Instrs[0].addOperand(MCOperand::createReg(AArch64::X1)); + Instrs[0].addOperand(MCOperand::createReg(AArch64::X0)); + Instrs[1].setOpcode(AArch64::ADDXri); + Instrs[1].clear(); + Instrs[1].addOperand(MCOperand::createReg(AArch64::X1)); + Instrs[1].addOperand(MCOperand::createReg(AArch64::X1)); + Instrs[1].addOperand(MCOperand::createImm(1)); + Instrs[1].addOperand(MCOperand::createImm(0)); + Instrs[2].setOpcode(AArch64::STLXRX); + Instrs[2].clear(); + Instrs[2].addOperand(MCOperand::createReg(AArch64::W2)); + Instrs[2].addOperand(MCOperand::createReg(AArch64::X1)); + Instrs[2].addOperand(MCOperand::createReg(AArch64::X0)); + Instrs[3].setOpcode(AArch64::CBNZW); + Instrs[3].clear(); + Instrs[3].addOperand(MCOperand::createReg(AArch64::W2)); + Instrs[3].addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create(BBs.back()->getLabel(), *Ctx))); + BBs.back()->addInstructions(Instrs.begin(), Instrs.end()); + BBs.back()->setCFIState(0); + + BBs.emplace_back(InstrCounterIncrFunc->createBasicBlock()); + InstructionListType InstrsEpilog(2); + createPopRegisters(InstrsEpilog[0], AArch64::X1, AArch64::X2); + createReturn(InstrsEpilog[1]); + BBs.back()->addInstructions(InstrsEpilog.begin(), InstrsEpilog.end()); + BBs.back()->setCFIState(0); + + BBs[0]->addSuccessor(BBs[0].get()); + BBs[0]->addSuccessor(BBs[1].get()); + + InstrCounterIncrFunc->insertBasicBlocks(nullptr, std::move(BBs), + /*UpdateLayout*/ true, + /*UpdateCFIState*/ false); + InstrCounterIncrFunc->updateState(BinaryFunction::State::CFG_Finalized); + + LLVM_DEBUG({ + dbgs() << "BOLT-DEBUG: instrumentation counter increment helper:\n"; + InstrCounterIncrFunc->dump(); + }); + } + + InstructionListType createInstrIncMemory(const MCSymbol *Target, + MCContext *Ctx, bool IsLeaf, + unsigned CodePointerSize) override { unsigned int I = 0; - InstructionListType Instrs(6); + InstructionListType Instrs(opts::NoLSEAtomics ? 
8 : 6); + + if (opts::NoLSEAtomics) { + createPushRegisters(Instrs[I++], AArch64::X0, AArch64::LR); + createPushRegisters(Instrs[I++], AArch64::X1, AArch64::X2); + } else { + createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1); + } - createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1); InstructionListType Addr = materializeAddress(Target, Ctx, AArch64::X0); assert(Addr.size() == 2 && "Invalid Addr size"); std::copy(Addr.begin(), Addr.end(), Instrs.begin() + I); I += Addr.size(); - InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X1); - assert(Insts.size() == 2 && "Invalid Insts size"); - std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I); - I += Insts.size(); - createPopRegisters(Instrs[I++], AArch64::X0, AArch64::X1); + + if (opts::NoLSEAtomics) { + const MCSymbol *Helper = InstrCounterIncrFunc->getSymbol(); + InstructionListType HelperAddr = + materializeAddress(Helper, Ctx, AArch64::X1); + assert(HelperAddr.size() == 2 && "Invalid HelperAddr size"); + std::copy(HelperAddr.begin(), HelperAddr.end(), Instrs.begin() + I); + I += HelperAddr.size(); + createIndirectCallInst(Instrs[I++], /*IsTailCall*/ false, AArch64::X1); + } else { + InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X1); + assert(Insts.size() == 2 && "Invalid Insts size"); + std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I); + I += Insts.size(); + } + createPopRegisters(Instrs[I++], AArch64::X0, + opts::NoLSEAtomics ? AArch64::LR : AArch64::X1); return Instrs; } diff --git a/bolt/lib/Target/AArch64/CMakeLists.txt b/bolt/lib/Target/AArch64/CMakeLists.txt index cb38117de659e..53554e75de15c 100644 --- a/bolt/lib/Target/AArch64/CMakeLists.txt +++ b/bolt/lib/Target/AArch64/CMakeLists.txt @@ -28,7 +28,7 @@ add_llvm_library(LLVMBOLTTargetAArch64 AArch64CommonTableGen ) -target_link_libraries(LLVMBOLTTargetAArch64 PRIVATE LLVMBOLTCore) +target_link_libraries(LLVMBOLTTargetAArch64 PRIVATE LLVMBOLTCore LLVMBOLTUtils) include_directories( ${LLVM_MAIN_SRC_DIR}/lib/Target/AArch64 diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp index 10b4913b6ab7f..7c4a8781fd57d 100644 --- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp +++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp @@ -626,9 +626,9 @@ class RISCVMCPlusBuilder : public MCPlusBuilder { return Insts; } - InstructionListType - createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, - unsigned CodePointerSize) const override { + InstructionListType createInstrIncMemory(const MCSymbol *Target, + MCContext *Ctx, bool IsLeaf, + unsigned CodePointerSize) override { // We need 2 scratch registers: one for the target address (x10), and one // for the increment value (x11). // addi sp, sp, -16 diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 1842509dcc5e0..9026a9df7b5c2 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -3053,9 +3053,9 @@ class X86MCPlusBuilder : public MCPlusBuilder { Inst.clear(); } - InstructionListType - createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, - unsigned CodePointerSize) const override { + InstructionListType createInstrIncMemory(const MCSymbol *Target, + MCContext *Ctx, bool IsLeaf, + unsigned CodePointerSize) override { InstructionListType Instrs(IsLeaf ? 
13 : 11); unsigned int I = 0; diff --git a/bolt/test/AArch64/constant-island-alignment.s b/bolt/test/AArch64/constant-island-alignment.s index 3ce0df9d4f290..957c4705f5eec 100644 --- a/bolt/test/AArch64/constant-island-alignment.s +++ b/bolt/test/AArch64/constant-island-alignment.s @@ -1,14 +1,36 @@ // This test checks that the constant island is aligned after BOLT tool. -// In case the nop before .Lci will be removed the pointer to exit function -// won't be alinged and the test will fail. + +# RUN: split-file %s %t + +// For the first test case, in case the nop before .Lci will be removed +// the pointer to exit function won't be alinged and the test will fail. # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ -# RUN: %s -o %t.o -# RUN: %clang %cflags -fPIC -pie %t.o -o %t.exe -Wl,-q \ +# RUN: %t/xword_align.s -o %t_xa.o +# RUN: %clang %cflags -fPIC -pie %t_xa.o -o %t_xa.exe -Wl,-q \ # RUN: -nostartfiles -nodefaultlibs -Wl,-z,notext -# RUN: llvm-bolt %t.exe -o %t.bolt --use-old-text=0 --lite=0 --trap-old-code -# RUN: llvm-objdump -d --disassemble-symbols='$d' %t.bolt | FileCheck %s +# RUN: llvm-bolt %t_xa.exe -o %t_xa.bolt --use-old-text=0 --lite=0 \ +# RUN: --trap-old-code +# RUN: llvm-objdump -d --disassemble-symbols='$d' %t_xa.bolt | FileCheck %s + +// For the second and third test cases, we want to set an alignment based +// on various heuristics. + +# RUN: %clang %cflags -pie %t/page_align.s -o %t_pa.exe -Wl,-q \ +# RUN: -Wl,--init=_foo -Wl,--fini=_foo +# RUN: llvm-bolt %t_pa.exe -o %t_pa.bolt +# RUN: llvm-objdump -t %t_pa.exe | grep _const_island +# RUN: llvm-objdump -t %t_pa.bolt | grep _const_island | FileCheck %s \ +# RUN: --check-prefix=PAGE + +# RUN: %clang %cflags -pie %t/64B_align.s -o %t_64B.exe -Wl,-q \ +# RUN: -Wl,--init=_foo -Wl,--fini=_foo +# RUN: llvm-bolt %t_64B.exe -o %t_64B.bolt +# RUN: llvm-objdump -t %t_64B.exe | grep _const_island +# RUN: llvm-objdump -t %t_64B.bolt | grep _const_island | FileCheck %s \ +# RUN: --check-prefix=64BYTE +;--- xword_align.s .text .align 4 .global @@ -36,3 +58,51 @@ _start: .Lci: .xword exitOk .xword 0 + +;--- page_align.s + .text + .global _foo + .type _foo, %function +_foo: + ret + + .text + .global _const_island + .align 12 +# PAGE: {{[0-9a-f]*}}000 g +_const_island: + .rept 0x25100 + .byte 0xbb + .endr + + .global _start + .type _start, %function +_start: + ret + + # Dummy relocation to force relocation mode + .reloc 0, R_AARCH64_NONE + +;--- 64B_align.s + .text + .global _foo + .type _foo, %function +_foo: + ret + + .text + .global _const_island + .align 6 +# 64BYTE: {{[0-9a-f]*}}{{0|4|8|c}}0 g +_const_island: + .rept 0x2048 + .byte 0xbb + .endr + + .global _start + .type _start, %function +_start: + ret + + # Dummy relocation to force relocation mode + .reloc 0, R_AARCH64_NONE diff --git a/bolt/test/AArch64/dwarf4-dwp-aarch64.s b/bolt/test/AArch64/dwarf4-dwp-aarch64.s new file mode 100755 index 0000000000000..37507e100a62d --- /dev/null +++ b/bolt/test/AArch64/dwarf4-dwp-aarch64.s @@ -0,0 +1,407 @@ +## This test checks updating debuginfo via dwarf4 dwp file +# RUN: rm -rf %t && mkdir -p %t && cd %t +# RUN: split-file %s %t +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown --split-dwarf-file=main.exe-main.dwo %t/main.s -o %t/main.o +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown --split-dwarf-file=main.exe-callee.dwo %t/callee.s -o %t/callee.o +# RUN: %clangxx %cxxflags -gdwarf-4 -gsplit-dwarf=split -Wl,-e,main %t/main.o %t/callee.o -o main.exe +# RUN: llvm-dwp -e %t/main.exe -o 
%t/main.exe.dwp +# RUN: llvm-bolt %t/main.exe -o %t/main.exe.bolt -update-debug-sections 2>&1 | FileCheck %s + +# CHECK-NOT: Assertion + +#--- main.s + .file "main.cpp" + .globl main // -- Begin function main + .type main,@function +main: // @main +.Lfunc_begin0: + .file 1 "." "main.cpp" + .loc 1 2 0 // main.cpp:2:0 + .loc 1 2 21 prologue_end // main.cpp:2:21 + .loc 1 2 14 epilogue_begin is_stmt 0 // main.cpp:2:14 + ret +.Lfunc_end0: + .size main, .Lfunc_end0-main + .section .debug_abbrev,"",@progbits + .byte 1 // Abbreviation Code + .byte 17 // DW_TAG_compile_unit + .byte 0 // DW_CHILDREN_no + .byte 16 // DW_AT_stmt_list + .byte 23 // DW_FORM_sec_offset + .byte 27 // DW_AT_comp_dir + .byte 14 // DW_FORM_strp + .ascii "\264B" // DW_AT_GNU_pubnames + .byte 25 // DW_FORM_flag_present + .ascii "\260B" // DW_AT_GNU_dwo_name + .byte 14 // DW_FORM_strp + .ascii "\261B" // DW_AT_GNU_dwo_id + .byte 7 // DW_FORM_data8 + .byte 17 // DW_AT_low_pc + .byte 1 // DW_FORM_addr + .byte 18 // DW_AT_high_pc + .byte 6 // DW_FORM_data4 + .ascii "\263B" // DW_AT_GNU_addr_base + .byte 23 // DW_FORM_sec_offset + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 0 // EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .word .Ldebug_info_end0-.Ldebug_info_start0 // Length of Unit +.Ldebug_info_start0: + .hword 4 // DWARF version number + .word .debug_abbrev // Offset Into Abbrev. Section + .byte 8 // Address Size (in bytes) + .byte 1 // Abbrev [1] 0xb:0x25 DW_TAG_compile_unit + .word .Lline_table_start0 // DW_AT_stmt_list + .word .Lskel_string0 // DW_AT_comp_dir + // DW_AT_GNU_pubnames + .word .Lskel_string1 // DW_AT_GNU_dwo_name + .xword 1465063543908291764 // DW_AT_GNU_dwo_id + .xword .Lfunc_begin0 // DW_AT_low_pc + .word .Lfunc_end0-.Lfunc_begin0 // DW_AT_high_pc + .word .Laddr_table_base0 // DW_AT_GNU_addr_base +.Ldebug_info_end0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." // string offset=0 +.Lskel_string1: + .asciz "main.exe-main.dwo" // string offset=2 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "main" // string offset=0 +.Linfo_string1: + .asciz "int" // string offset=5 +.Linfo_string2: + .byte 0 // string offset=9 +.Linfo_string3: + .asciz "main.cpp" // string offset=10 +.Linfo_string4: + .asciz "main.exe-main.dwo" // string offset=19 + .section .debug_str_offsets.dwo,"e",@progbits + .word 0 + .word 5 + .word 9 + .word 10 + .word 19 + .section .debug_info.dwo,"e",@progbits + .word .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 // Length of Unit +.Ldebug_info_dwo_start0: + .hword 4 // DWARF version number + .word 0 // Offset Into Abbrev. 
Section + .byte 8 // Address Size (in bytes) + .byte 1 // Abbrev [1] 0xb:0x22 DW_TAG_compile_unit + .byte 2 // DW_AT_producer + .hword 33 // DW_AT_language + .byte 3 // DW_AT_name + .byte 4 // DW_AT_GNU_dwo_name + .xword 1465063543908291764 // DW_AT_GNU_dwo_id + .byte 2 // Abbrev [2] 0x19:0xf DW_TAG_subprogram + .byte 0 // DW_AT_low_pc + .word .Lfunc_end0-.Lfunc_begin0 // DW_AT_high_pc + .byte 1 // DW_AT_frame_base + .byte 109 + .byte 0 // DW_AT_name + .byte 1 // DW_AT_decl_file + .byte 2 // DW_AT_decl_line + .word 40 // DW_AT_type + // DW_AT_external + .byte 3 // Abbrev [3] 0x28:0x4 DW_TAG_base_type + .byte 1 // DW_AT_name + .byte 5 // DW_AT_encoding + .byte 4 // DW_AT_byte_size + .byte 0 // End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 // Abbreviation Code + .byte 17 // DW_TAG_compile_unit + .byte 1 // DW_CHILDREN_yes + .byte 37 // DW_AT_producer + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 19 // DW_AT_language + .byte 5 // DW_FORM_data2 + .byte 3 // DW_AT_name + .ascii "\202>" // DW_FORM_GNU_str_index + .ascii "\260B" // DW_AT_GNU_dwo_name + .ascii "\202>" // DW_FORM_GNU_str_index + .ascii "\261B" // DW_AT_GNU_dwo_id + .byte 7 // DW_FORM_data8 + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 2 // Abbreviation Code + .byte 46 // DW_TAG_subprogram + .byte 0 // DW_CHILDREN_no + .byte 17 // DW_AT_low_pc + .ascii "\201>" // DW_FORM_GNU_addr_index + .byte 18 // DW_AT_high_pc + .byte 6 // DW_FORM_data4 + .byte 64 // DW_AT_frame_base + .byte 24 // DW_FORM_exprloc + .byte 3 // DW_AT_name + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 58 // DW_AT_decl_file + .byte 11 // DW_FORM_data1 + .byte 59 // DW_AT_decl_line + .byte 11 // DW_FORM_data1 + .byte 73 // DW_AT_type + .byte 19 // DW_FORM_ref4 + .byte 63 // DW_AT_external + .byte 25 // DW_FORM_flag_present + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 3 // Abbreviation Code + .byte 36 // DW_TAG_base_type + .byte 0 // DW_CHILDREN_no + .byte 3 // DW_AT_name + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 62 // DW_AT_encoding + .byte 11 // DW_FORM_data1 + .byte 11 // DW_AT_byte_size + .byte 11 // DW_FORM_data1 + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 0 // EOM(3) + .section .debug_addr,"",@progbits +.Laddr_table_base0: + .xword .Lfunc_begin0 + .section .debug_gnu_pubnames,"",@progbits + .word .LpubNames_end0-.LpubNames_start0 // Length of Public Names Info +.LpubNames_start0: + .hword 2 // DWARF Version + .word .Lcu_begin0 // Offset of Compilation Unit Info + .word 48 // Compilation Unit Length + .word 25 // DIE offset + .byte 48 // Attributes: FUNCTION, EXTERNAL + .asciz "main" // External Name + .word 0 // End Mark +.LpubNames_end0: + .section .debug_gnu_pubtypes,"",@progbits + .word .LpubTypes_end0-.LpubTypes_start0 // Length of Public Types Info +.LpubTypes_start0: + .hword 2 // DWARF Version + .word .Lcu_begin0 // Offset of Compilation Unit Info + .word 48 // Compilation Unit Length + .word 40 // DIE offset + .byte 144 // Attributes: TYPE, STATIC + .asciz "int" // External Name + .word 0 // End Mark +.LpubTypes_end0: + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym _Z6calleei + .section .debug_line,"",@progbits +.Lline_table_start0: +#--- callee.s + .file "callee.cpp" + .globl _Z6calleei // -- Begin function _Z6calleei + .type _Z6calleei,@function +_Z6calleei: // @_Z6calleei +.Lfunc_begin0: + .file 1 "." 
"callee.cpp" + .loc 1 1 0 // callee.cpp:1:0 + .loc 1 1 28 prologue_end // callee.cpp:1:28 + .loc 1 1 21 epilogue_begin is_stmt 0 // callee.cpp:1:21 + ret +.Lfunc_end0: + .size _Z6calleei, .Lfunc_end0-_Z6calleei + .section .debug_abbrev,"",@progbits + .byte 1 // Abbreviation Code + .byte 17 // DW_TAG_compile_unit + .byte 0 // DW_CHILDREN_no + .byte 16 // DW_AT_stmt_list + .byte 23 // DW_FORM_sec_offset + .byte 27 // DW_AT_comp_dir + .byte 14 // DW_FORM_strp + .ascii "\264B" // DW_AT_GNU_pubnames + .byte 25 // DW_FORM_flag_present + .ascii "\260B" // DW_AT_GNU_dwo_name + .byte 14 // DW_FORM_strp + .ascii "\261B" // DW_AT_GNU_dwo_id + .byte 7 // DW_FORM_data8 + .byte 17 // DW_AT_low_pc + .byte 1 // DW_FORM_addr + .byte 18 // DW_AT_high_pc + .byte 6 // DW_FORM_data4 + .ascii "\263B" // DW_AT_GNU_addr_base + .byte 23 // DW_FORM_sec_offset + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 0 // EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .word .Ldebug_info_end0-.Ldebug_info_start0 // Length of Unit +.Ldebug_info_start0: + .hword 4 // DWARF version number + .word .debug_abbrev // Offset Into Abbrev. Section + .byte 8 // Address Size (in bytes) + .byte 1 // Abbrev [1] 0xb:0x25 DW_TAG_compile_unit + .word .Lline_table_start0 // DW_AT_stmt_list + .word .Lskel_string0 // DW_AT_comp_dir + // DW_AT_GNU_pubnames + .word .Lskel_string1 // DW_AT_GNU_dwo_name + .xword 7650227797527095061 // DW_AT_GNU_dwo_id + .xword .Lfunc_begin0 // DW_AT_low_pc + .word .Lfunc_end0-.Lfunc_begin0 // DW_AT_high_pc + .word .Laddr_table_base0 // DW_AT_GNU_addr_base +.Ldebug_info_end0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." // string offset=0 +.Lskel_string1: + .asciz "main.exe-callee.dwo" // string offset=2 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "_Z6calleei" // string offset=0 +.Linfo_string1: + .asciz "callee" // string offset=11 +.Linfo_string2: + .asciz "int" // string offset=18 +.Linfo_string3: + .asciz "x" // string offset=22 +.Linfo_string4: + .byte 0 // string offset=24 +.Linfo_string5: + .asciz "callee.cpp" // string offset=25 +.Linfo_string6: + .asciz "main.exe-callee.dwo" // string offset=36 + .section .debug_str_offsets.dwo,"e",@progbits + .word 0 + .word 11 + .word 18 + .word 22 + .word 24 + .word 25 + .word 36 + .section .debug_info.dwo,"e",@progbits + .word .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 // Length of Unit +.Ldebug_info_dwo_start0: + .hword 4 // DWARF version number + .word 0 // Offset Into Abbrev. 
Section + .byte 8 // Address Size (in bytes) + .byte 1 // Abbrev [1] 0xb:0x2f DW_TAG_compile_unit + .byte 4 // DW_AT_producer + .hword 33 // DW_AT_language + .byte 5 // DW_AT_name + .byte 6 // DW_AT_GNU_dwo_name + .xword 7650227797527095061 // DW_AT_GNU_dwo_id + .byte 2 // Abbrev [2] 0x19:0x1c DW_TAG_subprogram + .byte 0 // DW_AT_low_pc + .word .Lfunc_end0-.Lfunc_begin0 // DW_AT_high_pc + .byte 1 // DW_AT_frame_base + .byte 111 + .byte 0 // DW_AT_linkage_name + .byte 1 // DW_AT_name + .byte 1 // DW_AT_decl_file + .byte 1 // DW_AT_decl_line + .word 53 // DW_AT_type + // DW_AT_external + .byte 3 // Abbrev [3] 0x29:0xb DW_TAG_formal_parameter + .byte 2 // DW_AT_location + .byte 145 + .byte 12 + .byte 3 // DW_AT_name + .byte 1 // DW_AT_decl_file + .byte 1 // DW_AT_decl_line + .word 53 // DW_AT_type + .byte 0 // End Of Children Mark + .byte 4 // Abbrev [4] 0x35:0x4 DW_TAG_base_type + .byte 2 // DW_AT_name + .byte 5 // DW_AT_encoding + .byte 4 // DW_AT_byte_size + .byte 0 // End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 // Abbreviation Code + .byte 17 // DW_TAG_compile_unit + .byte 1 // DW_CHILDREN_yes + .byte 37 // DW_AT_producer + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 19 // DW_AT_language + .byte 5 // DW_FORM_data2 + .byte 3 // DW_AT_name + .ascii "\202>" // DW_FORM_GNU_str_index + .ascii "\260B" // DW_AT_GNU_dwo_name + .ascii "\202>" // DW_FORM_GNU_str_index + .ascii "\261B" // DW_AT_GNU_dwo_id + .byte 7 // DW_FORM_data8 + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 2 // Abbreviation Code + .byte 46 // DW_TAG_subprogram + .byte 1 // DW_CHILDREN_yes + .byte 17 // DW_AT_low_pc + .ascii "\201>" // DW_FORM_GNU_addr_index + .byte 18 // DW_AT_high_pc + .byte 6 // DW_FORM_data4 + .byte 64 // DW_AT_frame_base + .byte 24 // DW_FORM_exprloc + .byte 110 // DW_AT_linkage_name + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 3 // DW_AT_name + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 58 // DW_AT_decl_file + .byte 11 // DW_FORM_data1 + .byte 59 // DW_AT_decl_line + .byte 11 // DW_FORM_data1 + .byte 73 // DW_AT_type + .byte 19 // DW_FORM_ref4 + .byte 63 // DW_AT_external + .byte 25 // DW_FORM_flag_present + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 3 // Abbreviation Code + .byte 5 // DW_TAG_formal_parameter + .byte 0 // DW_CHILDREN_no + .byte 2 // DW_AT_location + .byte 24 // DW_FORM_exprloc + .byte 3 // DW_AT_name + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 58 // DW_AT_decl_file + .byte 11 // DW_FORM_data1 + .byte 59 // DW_AT_decl_line + .byte 11 // DW_FORM_data1 + .byte 73 // DW_AT_type + .byte 19 // DW_FORM_ref4 + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 4 // Abbreviation Code + .byte 36 // DW_TAG_base_type + .byte 0 // DW_CHILDREN_no + .byte 3 // DW_AT_name + .ascii "\202>" // DW_FORM_GNU_str_index + .byte 62 // DW_AT_encoding + .byte 11 // DW_FORM_data1 + .byte 11 // DW_AT_byte_size + .byte 11 // DW_FORM_data1 + .byte 0 // EOM(1) + .byte 0 // EOM(2) + .byte 0 // EOM(3) + .section .debug_addr,"",@progbits +.Laddr_table_base0: + .xword .Lfunc_begin0 + .section .debug_gnu_pubnames,"",@progbits + .word .LpubNames_end0-.LpubNames_start0 // Length of Public Names Info +.LpubNames_start0: + .hword 2 // DWARF Version + .word .Lcu_begin0 // Offset of Compilation Unit Info + .word 48 // Compilation Unit Length + .word 25 // DIE offset + .byte 48 // Attributes: FUNCTION, EXTERNAL + .asciz "callee" // External Name + .word 0 // End Mark +.LpubNames_end0: + .section .debug_gnu_pubtypes,"",@progbits + .word 
.LpubTypes_end0-.LpubTypes_start0 // Length of Public Types Info +.LpubTypes_start0: + .hword 2 // DWARF Version + .word .Lcu_begin0 // Offset of Compilation Unit Info + .word 48 // Compilation Unit Length + .word 53 // DIE offset + .byte 144 // Attributes: TYPE, STATIC + .asciz "int" // External Name + .word 0 // End Mark +.LpubTypes_end0: + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/AArch64/instrumentation_sequence.s b/bolt/test/AArch64/instrumentation_sequence.s new file mode 100644 index 0000000000000..371851fe9a8e3 --- /dev/null +++ b/bolt/test/AArch64/instrumentation_sequence.s @@ -0,0 +1,50 @@ +# This test is to validate instrumentation code sequence generated with +# and without `--no-lse-atomics`. + +# REQUIRES: system-linux,bolt-runtime,target=aarch64{{.*}} + +# RUN: %clang %cflags -pie %s -o %t.so -Wl,-q -Wl,--init=_foo -Wl,--fini=_foo + + .text + .global _foo + .type _foo, %function +_foo: + ret + + .global _start + .type _start, %function +_start: + ret + + # Dummy relocation to force relocation mode + .reloc 0, R_AARCH64_NONE + +# RUN: llvm-bolt %t.so -o %t.instr.so --instrument +# RUN: llvm-objdump -d %t.instr.so | FileCheck %s --check-prefix=INLINE +# INLINE: {{.*}} <_foo>: +# INLINE-NEXT: {{.*}} stp x0, x1, [sp, #-0x10]! +# INLINE-NEXT: {{.*}} adrp x0, 0x{{[0-9a-f]*}} {{.*}} +# INLINE-NEXT: {{.*}} add x0, x0, #0x{{[0-9a-f]*}} +# INLINE-NEXT: {{.*}} mov x1, #0x1 +# INLINE-NEXT: {{.*}} stadd x1, [x0] +# INLINE-NEXT: {{.*}} ldp x0, x1, [sp], #0x10 + +# RUN: llvm-bolt %t.so -o %t.instr.no_lse.so --instrument \ +# RUN: --no-lse-atomics +# RUN: llvm-objdump -d %t.instr.no_lse.so | FileCheck %s --check-prefix=NOLSE +# NOLSE: {{.*}} <_foo>: +# NOLSE-NEXT: {{.*}} stp x0, x30, [sp, #-0x10]! +# NOLSE-NEXT: {{.*}} stp x1, x2, [sp, #-0x10]! 
+# NOLSE-NEXT: {{.*}} adrp x0, 0x{{[0-9a-f]*}} {{.*}} +# NOLSE-NEXT: {{.*}} add x0, x0, #0x{{[0-9a-f]*}} +# NOLSE-NEXT: {{.*}} adrp x1, 0x[[PAGEBASE:[0-9a-f]*]]000 {{.*}} +# NOLSE-NEXT: {{.*}} add x1, x1, #0x[[PAGEOFF:[0-9a-f]*]] +# NOLSE-NEXT: {{.*}} blr x1 +# NOLSE-NEXT: {{.*}} ldp x0, x30, [sp], #0x10 +# NOLSE: {{[0]*}}[[PAGEBASE]][[PAGEOFF]] <__bolt_instr_counter_incr>: +# NOLSE-NEXT: {{.*}} ldaxr x1, [x0] +# NOLSE-NEXT: {{.*}} add x1, x1, #0x1 +# NOLSE-NEXT: {{.*}} stlxr w2, x1, [x0] +# NOLSE-NEXT: {{.*}} cbnz w2, 0x{{[0-9[a-f]*}} <__bolt_instr_counter_incr> +# NOLSE-NEXT: {{.*}} ldp x1, x2, [sp], #0x10 +# NOLSE-NEXT: {{.*}} ret diff --git a/bolt/test/AArch64/unmarked-data.test b/bolt/test/AArch64/unmarked-data.test index 7a62994bb5c38..af6de11f3df60 100644 --- a/bolt/test/AArch64/unmarked-data.test +++ b/bolt/test/AArch64/unmarked-data.test @@ -2,7 +2,7 @@ // RUN: yaml2obj %S/Inputs/unmarked-data.yaml -o %t.exe // RUN: llvm-bolt %t.exe -o %t.bolt --lite=0 --use-old-text=0 2>&1 | FileCheck %s -// CHECK-NOT: BOLT-WARNING +// CHECK-NOT: BOLT-WARNING: unable to disassemble instruction at offset // RUN: llvm-objdump -j .text -d --disassemble-symbols=first,second %t.bolt | FileCheck %s -check-prefix=CHECK-SYMBOL // CHECK-SYMBOL: : // CHECK-SYMBOL: : diff --git a/bolt/test/X86/dwarf4-dwp-x86.s b/bolt/test/X86/dwarf4-dwp-x86.s new file mode 100755 index 0000000000000..6dde1678f3840 --- /dev/null +++ b/bolt/test/X86/dwarf4-dwp-x86.s @@ -0,0 +1,405 @@ +## This test checks updating debuginfo via dwarf4 dwp file +# RUN: rm -rf %t && mkdir -p %t && cd %t +# RUN: split-file %s %t +# RUN: %clangxx %cxxflags -g -gdwarf-4 -gsplit-dwarf %t/main.s %t/callee.s -o main.exe +# RUN: llvm-dwp -e %t/main.exe -o %t/main.exe.dwp +# RUN: llvm-bolt %t/main.exe -o %t/main.exe.bolt -update-debug-sections 2>&1 | FileCheck %s + +# CHECK-NOT: Assertion + +#--- main.s + .file "main.cpp" + .globl main # -- Begin function main + .type main,@function +main: # @main +.Lfunc_begin0: + .file 1 "." "main.cpp" + .loc 1 2 0 # main.cpp:2:0 + .loc 1 2 21 prologue_end # main.cpp:2:21 + .loc 1 2 14 epilogue_begin is_stmt 0 # main.cpp:2:14 + retq +.Lfunc_end0: + .size main, .Lfunc_end0-main + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .ascii "\264B" # DW_AT_GNU_pubnames + .byte 25 # DW_FORM_flag_present + .ascii "\260B" # DW_AT_GNU_dwo_name + .byte 14 # DW_FORM_strp + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .ascii "\263B" # DW_AT_GNU_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x25 DW_TAG_compile_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lskel_string0 # DW_AT_comp_dir + # DW_AT_GNU_pubnames + .long .Lskel_string1 # DW_AT_GNU_dwo_name + .quad 1465063543908291764 # DW_AT_GNU_dwo_id + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_GNU_addr_base +.Ldebug_info_end0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "main.exe-main.dwo" # string offset=2 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "main" # string offset=0 +.Linfo_string1: + .asciz "int" # string offset=5 +.Linfo_string2: + .byte 0 # string offset=9 +.Linfo_string3: + .asciz "main.cpp" # string offset=10 +.Linfo_string4: + .asciz "main.exe-main.dwo" # string offset=19 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 5 + .long 9 + .long 10 + .long 19 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 4 # DWARF version number + .long 0 # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x22 DW_TAG_compile_unit + .byte 2 # DW_AT_producer + .short 33 # DW_AT_language + .byte 3 # DW_AT_name + .byte 4 # DW_AT_GNU_dwo_name + .quad 1465063543908291764 # DW_AT_GNU_dwo_id + .byte 2 # Abbrev [2] 0x19:0xf DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 2 # DW_AT_decl_line + .long 40 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x28:0x4 DW_TAG_base_type + .byte 1 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\260B" # DW_AT_GNU_dwo_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 17 # DW_AT_low_pc + .ascii "\201>" # DW_FORM_GNU_addr_index + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits +.Laddr_table_base0: + .quad .Lfunc_begin0 + .section .debug_gnu_pubnames,"",@progbits + .long .LpubNames_end0-.LpubNames_start0 # Length of Public Names Info 
+.LpubNames_start0: + .short 2 # DWARF Version + .long .Lcu_begin0 # Offset of Compilation Unit Info + .long 48 # Compilation Unit Length + .long 25 # DIE offset + .byte 48 # Attributes: FUNCTION, EXTERNAL + .asciz "main" # External Name + .long 0 # End Mark +.LpubNames_end0: + .section .debug_gnu_pubtypes,"",@progbits + .long .LpubTypes_end0-.LpubTypes_start0 # Length of Public Types Info +.LpubTypes_start0: + .short 2 # DWARF Version + .long .Lcu_begin0 # Offset of Compilation Unit Info + .long 48 # Compilation Unit Length + .long 40 # DIE offset + .byte 144 # Attributes: TYPE, STATIC + .asciz "int" # External Name + .long 0 # End Mark +.LpubTypes_end0: + .section ".note.GNU-stack","",@progbits + .addrsig + .addrsig_sym _Z6calleei + .section .debug_line,"",@progbits +.Lline_table_start0: +#--- callee.s + .file "callee.cpp" + .globl _Z6calleei # -- Begin function _Z6calleei + .type _Z6calleei,@function +_Z6calleei: # @_Z6calleei +.Lfunc_begin0: + .file 1 "." "callee.cpp" + .loc 1 1 0 # callee.cpp:1:0 + .loc 1 1 28 prologue_end # callee.cpp:1:28 + .loc 1 1 21 epilogue_begin is_stmt 0 # callee.cpp:1:21 + retq +.Lfunc_end0: + .size _Z6calleei, .Lfunc_end0-_Z6calleei + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 14 # DW_FORM_strp + .ascii "\264B" # DW_AT_GNU_pubnames + .byte 25 # DW_FORM_flag_present + .ascii "\260B" # DW_AT_GNU_dwo_name + .byte 14 # DW_FORM_strp + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .ascii "\263B" # DW_AT_GNU_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x25 DW_TAG_compile_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lskel_string0 # DW_AT_comp_dir + # DW_AT_GNU_pubnames + .long .Lskel_string1 # DW_AT_GNU_dwo_name + .quad -8413212350243343807 # DW_AT_GNU_dwo_id + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_GNU_addr_base +.Ldebug_info_end0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "main.exe-callee.dwo" # string offset=2 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "_Z6calleei" # string offset=0 +.Linfo_string1: + .asciz "callee" # string offset=11 +.Linfo_string2: + .asciz "int" # string offset=18 +.Linfo_string3: + .asciz "x" # string offset=22 +.Linfo_string4: + .byte 0 # string offset=24 +.Linfo_string5: + .asciz "callee.cpp" # string offset=25 +.Linfo_string6: + .asciz "main.exe-callee.dwo" # string offset=36 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 11 + .long 18 + .long 22 + .long 24 + .long 25 + .long 36 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 4 # DWARF version number + .long 0 # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x2f DW_TAG_compile_unit + .byte 4 # DW_AT_producer + .short 33 # DW_AT_language + .byte 5 # DW_AT_name + .byte 6 # DW_AT_GNU_dwo_name + .quad -8413212350243343807 # DW_AT_GNU_dwo_id + .byte 2 # Abbrev [2] 0x19:0x1c DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 86 + .byte 0 # DW_AT_linkage_name + .byte 1 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .long 53 # DW_AT_type + # DW_AT_external + .byte 3 # Abbrev [3] 0x29:0xb DW_TAG_formal_parameter + .byte 2 # DW_AT_location + .byte 145 + .byte 124 + .byte 3 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .long 53 # DW_AT_type + .byte 0 # End Of Children Mark + .byte 4 # Abbrev [4] 0x35:0x4 DW_TAG_base_type + .byte 2 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\260B" # DW_AT_GNU_dwo_name + .ascii "\202>" # DW_FORM_GNU_str_index + .ascii "\261B" # DW_AT_GNU_dwo_id + .byte 7 # DW_FORM_data8 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .ascii "\201>" # DW_FORM_GNU_addr_index + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 110 # DW_AT_linkage_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .ascii "\202>" # DW_FORM_GNU_str_index + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_addr,"",@progbits +.Laddr_table_base0: + .quad .Lfunc_begin0 + .section .debug_gnu_pubnames,"",@progbits + .long .LpubNames_end0-.LpubNames_start0 # Length of Public Names Info +.LpubNames_start0: + .short 2 # DWARF Version + .long .Lcu_begin0 # Offset of Compilation Unit Info + .long 48 # Compilation Unit Length + .long 25 # DIE offset + .byte 48 # Attributes: FUNCTION, EXTERNAL + .asciz "callee" # External Name + .long 0 # End Mark +.LpubNames_end0: + .section .debug_gnu_pubtypes,"",@progbits + .long .LpubTypes_end0-.LpubTypes_start0 # Length of Public Types Info +.LpubTypes_start0: + .short 2 # DWARF Version + .long 
.Lcu_begin0 # Offset of Compilation Unit Info + .long 48 # Compilation Unit Length + .long 53 # DIE offset + .byte 144 # Attributes: TYPE, STATIC + .asciz "int" # External Name + .long 0 # End Mark +.LpubTypes_end0: + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb11.test b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb11.test index 465062560d4fc..9b20325bd1fab 100644 --- a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb11.test +++ b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb11.test @@ -18,9 +18,9 @@ # POSTCHECK-NEXT: 1: offset = 0x00000000, type_offset = 0x0000001e, type_signature = 0x00f6cca4e3a15118 # POSTCHECK: Address area offset = 0x68, has 2 entries # POSTCHECK-NEXT: Low/High address = [0x[[#%.4x,ADDR:]], -# POSTCHECK-SAME: 0x[[#ADDR + 0xf]]) (Size: 0xf), CU id = 1 +# POSTCHECK-SAME: 0x[[#ADDR + 0xf]]) (Size: 0xf), CU id = 0 # POSTCHECK-NEXT: Low/High address = [0x[[#%.4x,ADDR1:]], -# POSTCHECK-SAME: 0x[[#ADDR1 + 0xd]]) (Size: 0xd), CU id = 2 +# POSTCHECK-SAME: 0x[[#ADDR1 + 0xd]]) (Size: 0xd), CU id = 1 # POSTCHECK: Symbol table offset = 0x90, size = 1024, filled slots # POSTCHECK-NEXT: 2: Name offset = 0x20, CU vector offset = 0x0 # POSTCHECK-NEXT: String name: S, CU vector index: 0 diff --git a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-lld-generated.test b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-lld-generated.test index 7589bfac57f58..e70bc89c42e22 100644 --- a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-lld-generated.test +++ b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-lld-generated.test @@ -15,9 +15,9 @@ # POSTCHECK: Types CU list offset = 0x38, has 0 entries # POSTCHECK: Address area offset = 0x38, has 2 entries # POSTCHECK-NEXT: Low/High address = [0x[[#%.4x,ADDR:]], -# POSTCHECK-SAME: 0x[[#ADDR + 0xf]]) (Size: 0xf), CU id = 1 +# POSTCHECK-SAME: 0x[[#ADDR + 0xf]]) (Size: 0xf), CU id = 0 # POSTCHECK-NEXT: Low/High address = [0x[[#%.4x,ADDR1:]], -# POSTCHECK-SAME: 0x[[#ADDR1 + 0xd]]) (Size: 0xd), CU id = 2 +# POSTCHECK-SAME: 0x[[#ADDR1 + 0xd]]) (Size: 0xd), CU id = 1 # POSTCHECK: Symbol table offset = 0x60, size = 1024, filled slots # POSTCHECK-NEXT: 2: Name offset = 0x38, CU vector offset = 0x0 # POSTCHECK-NEXT: String name: S, CU vector index: 0 diff --git a/bolt/test/X86/dwarf5-dwoid-no-dwoname.s b/bolt/test/X86/dwarf5-dwoid-no-dwoname.s new file mode 100644 index 0000000000000..bc35973dc6f3f --- /dev/null +++ b/bolt/test/X86/dwarf5-dwoid-no-dwoname.s @@ -0,0 +1,629 @@ +## Check that DWARF CU with a valid DWOId but missing a dwo_name is correctly detected. 
+# RUN: rm -rf %t && mkdir -p %t && cd %t +# RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %s -split-dwarf-file=main.dwo -o main.o +# RUN: %clang %cflags -O3 -g -gdwarf-5 -gsplit-dwarf -Wl,-q %t/main.o -o main.exe +# RUN: llvm-bolt %t/main.exe -o %t/main.exe.bolt -update-debug-sections 2>&1 | FileCheck %s --check-prefix=PRECHECK +# PRECHECK: BOLT-ERROR: broken DWARF found in CU at offset 0x3e (DWOId=0x0, missing DW_AT_dwo_name / DW_AT_GNU_dwo_name) + +## Checks that Broken dwarf CU is removed +# RUN: llvm-dwarfdump --show-form --verbose --debug-info %t/main.exe.bolt | FileCheck %s --check-prefix=POSTCHECK +# POSTCHECK-LABEL: .debug_info contents: +# POSTCHECK: DW_TAG_skeleton_unit +# POSTCHECK-DAG: DW_AT_dwo_name{{.*=.*\.dwo.*}} +# POSTCHECK: NULL +# POSTCHECK-NOT: DW_TAG_skeleton_unit + + .text + .file "main.cpp" + .section .rodata.cst16,"aM",@progbits,16 +.LCPI0_0: +.LCPI0_1: +.LCPI0_2: +.LCPI0_3: +.LCPI0_4: +.LCPI0_5: +.LCPI0_6: +.LCPI0_7: +.LCPI0_8: +.LCPI0_9: +.LCPI0_10: + .text + .globl main + .type main,@function +main: # @main +.Lfunc_begin0: + .file 1 "." "main.cpp" md5 0x8a68374187457ce14ac0c6c2121349a2 + .loc 1 5 0 # main.cpp:5:0 +# %bb.0: # %vector.ph +.Ltmp0: +.Ltmp1: +.LBB0_1: # %vector.body +.Ltmp2: + .file 2 "." "callee.cpp" md5 0x86e19c24983503540b9bb1a6f7bad737 + .loc 2 8 15 prologue_end # callee.cpp:8:15 +.Ltmp3: + .loc 2 3 15 # callee.cpp:3:15 +.Ltmp4: + .loc 2 8 15 # callee.cpp:8:15 +.Ltmp5: + .loc 2 9 19 # callee.cpp:9:19 +.Ltmp6: + .loc 2 9 13 is_stmt 0 # callee.cpp:9:13 +.Ltmp7: + .loc 2 3 15 is_stmt 1 # callee.cpp:3:15 + .loc 2 3 19 is_stmt 0 # callee.cpp:3:19 +.Ltmp8: + .loc 2 4 19 is_stmt 1 # callee.cpp:4:19 +.Ltmp9: + .loc 2 4 13 is_stmt 0 # callee.cpp:4:13 +.Ltmp10: + .loc 2 4 19 # callee.cpp:4:19 +.Ltmp11: + .loc 2 4 13 # callee.cpp:4:13 +.Ltmp12: + .loc 2 2 12 is_stmt 1 # callee.cpp:2:12 + .loc 2 2 17 is_stmt 0 # callee.cpp:2:17 +.Ltmp13: + .loc 2 4 13 is_stmt 1 # callee.cpp:4:13 +.Ltmp14: + .loc 2 0 0 is_stmt 0 # callee.cpp:0:0 +.Ltmp15: + .loc 1 8 13 is_stmt 1 # main.cpp:8:13 +.Ltmp16: + .loc 2 0 0 is_stmt 0 # callee.cpp:0:0 +.Ltmp17: + .loc 1 8 13 # main.cpp:8:13 +.Ltmp18: + .loc 1 7 35 is_stmt 1 # main.cpp:7:35 +.Ltmp19: +# %bb.2: # %middle.block + .loc 1 7 5 is_stmt 0 # main.cpp:7:5 +.Ltmp20: + .loc 1 11 9 is_stmt 1 # main.cpp:11:9 +.Ltmp21: + .loc 1 15 1 # main.cpp:15:1 + retq +.Ltmp22: +.Lfunc_end0: + .size main, .Lfunc_end0-main + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 74 # DW_TAG_skeleton_unit + .byte 1 # DW_CHILDREN_yes + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .ascii "\264B" # DW_AT_GNU_pubnames + .byte 25 # DW_FORM_flag_present + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 116 # DW_AT_rnglists_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 29 # DW_TAG_inlined_subroutine + .byte 0 # DW_CHILDREN_no + .byte 49 # 
DW_AT_abstract_origin + .byte 16 # DW_FORM_ref_addr + .byte 85 # DW_AT_ranges + .byte 35 # DW_FORM_rnglistx + .byte 88 # DW_AT_call_file + .byte 11 # DW_FORM_data1 + .byte 89 # DW_AT_call_line + .byte 11 # DW_FORM_data1 + .byte 87 # DW_AT_call_column + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 74 # DW_TAG_skeleton_unit + .byte 1 # DW_CHILDREN_yes + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 114 # DW_AT_str_offsets_base + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 37 # DW_FORM_strx1 + .ascii "\264B" # DW_AT_GNU_pubnames + .byte 25 # DW_FORM_flag_present + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 115 # DW_AT_addr_base + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 32 # DW_AT_inline + .byte 33 # DW_FORM_implicit_const + .byte 1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 4 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .quad -1861901018463438211 + .byte 1 # Abbrev [1] 0x14:0x2a DW_TAG_skeleton_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .byte 0 # DW_AT_comp_dir + # DW_AT_GNU_pubnames + .byte 3 # DW_AT_dwo_name + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Laddr_table_base0 # DW_AT_addr_base + .long .Lrnglists_table_base0 # DW_AT_rnglists_base + .byte 2 # Abbrev [2] 0x2c:0x11 DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 2 # DW_AT_name + .byte 3 # Abbrev [3] 0x33:0x9 DW_TAG_inlined_subroutine + .long .debug_info+100 # DW_AT_abstract_origin + .byte 0 # DW_AT_ranges + .byte 1 # DW_AT_call_file + .byte 8 # DW_AT_call_line + .byte 16 # DW_AT_call_column + .byte 0 # End Of Children Mark + .byte 0 # End Of Children Mark +.Ldebug_info_end0: +.Lcu_begin1: + .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit +.Ldebug_info_start1: + .short 5 # DWARF version number + .byte 4 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. 
Section + .quad 0 + .byte 4 # Abbrev [4] 0x14:0x15 DW_TAG_skeleton_unit + .long .Lline_table_start0 # DW_AT_stmt_list + .long .Lstr_offsets_base0 # DW_AT_str_offsets_base + .byte 0 # DW_AT_comp_dir + # DW_AT_GNU_pubnames + .byte 4 # DW_AT_producer + .short 33 # DW_AT_language + .byte 5 # DW_AT_name + .long .Laddr_table_base0 # DW_AT_addr_base + .byte 5 # Abbrev [5] 0x26:0x2 DW_TAG_subprogram + .byte 1 # DW_AT_name + # DW_AT_inline + .byte 0 # End Of Children Mark +.Ldebug_info_end1: + .section .debug_rnglists,"",@progbits + .long .Ldebug_list_header_end0-.Ldebug_list_header_start0 # Length +.Ldebug_list_header_start0: + .short 5 # Version + .byte 8 # Address size + .byte 0 # Segment selector size + .long 1 # Offset entry count +.Lrnglists_table_base0: + .long .Ldebug_ranges1-.Lrnglists_table_base0 +.Ldebug_ranges1: + .byte 4 # DW_RLE_offset_pair + .uleb128 .Ltmp2-.Lfunc_begin0 # starting offset + .uleb128 .Ltmp15-.Lfunc_begin0 # ending offset + .byte 4 # DW_RLE_offset_pair + .uleb128 .Ltmp16-.Lfunc_begin0 # starting offset + .uleb128 .Ltmp17-.Lfunc_begin0 # ending offset + .byte 0 # DW_RLE_end_of_list +.Ldebug_list_header_end0: + .section .debug_str_offsets,"",@progbits + .long 28 # Length of String Offsets Set + .short 5 + .short 0 +.Lstr_offsets_base0: + .section .debug_str,"MS",@progbits,1 +.Lskel_string0: + .asciz "." # string offset=0 +.Lskel_string1: + .asciz "hotFunction" # string offset=45 +.Lskel_string2: + .asciz "main" # string offset=57 +.Lskel_string3: + .asciz "main.dwo" # string offset=62 +.Lskel_string4: + .asciz "clang version 16.0.6" # string offset=71 +.Lskel_string5: + .asciz "callee.cpp" # string offset=177 + .section .debug_str_offsets,"",@progbits + .long .Lskel_string0 + .long .Lskel_string1 + .long .Lskel_string2 + .long .Lskel_string3 + .long .Lskel_string4 + .long .Lskel_string5 + .section .debug_str_offsets.dwo,"e",@progbits + .long 56 # Length of String Offsets Set + .short 5 + .short 0 + .section .debug_str.dwo,"eMS",@progbits,1 +.Linfo_string0: + .asciz "_Z11hotFunctioni" # string offset=0 +.Linfo_string1: + .asciz "hotFunction" # string offset=17 +.Linfo_string2: + .asciz "int" # string offset=29 +.Linfo_string3: + .asciz "x" # string offset=33 +.Linfo_string4: + .asciz "main" # string offset=35 +.Linfo_string5: + .asciz "argc" # string offset=40 +.Linfo_string6: + .asciz "argv" # string offset=45 +.Linfo_string7: + .asciz "char" # string offset=50 +.Linfo_string8: + .asciz "sum" # string offset=55 +.Linfo_string9: + .asciz "i" # string offset=59 +.Linfo_string10: + .asciz "clang version 16.0.6" # string offset=61 +.Linfo_string11: + .asciz "main.cpp" # string offset=167 +.Linfo_string12: + .asciz "main.dwo" # string offset=176 + .section .debug_str_offsets.dwo,"e",@progbits + .long 0 + .long 17 + .long 29 + .long 33 + .long 35 + .long 40 + .long 45 + .long 50 + .long 55 + .long 59 + .long 61 + .long 167 + .long 176 + .section .debug_info.dwo,"e",@progbits + .long .Ldebug_info_dwo_end0-.Ldebug_info_dwo_start0 # Length of Unit +.Ldebug_info_dwo_start0: + .short 5 # DWARF version number + .byte 5 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long 0 # Offset Into Abbrev. 
Section + .quad -1861901018463438211 + .byte 1 # Abbrev [1] 0x14:0x71 DW_TAG_compile_unit + .byte 10 # DW_AT_producer + .short 33 # DW_AT_language + .byte 11 # DW_AT_name + .byte 12 # DW_AT_dwo_name + .byte 2 # Abbrev [2] 0x1a:0x12 DW_TAG_subprogram + .byte 0 # DW_AT_linkage_name + .byte 1 # DW_AT_name + .byte 2 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .long 44 # DW_AT_type + # DW_AT_external + # DW_AT_inline + .byte 3 # Abbrev [3] 0x23:0x8 DW_TAG_formal_parameter + .byte 3 # DW_AT_name + .byte 2 # DW_AT_decl_file + .byte 1 # DW_AT_decl_line + .long 44 # DW_AT_type + .byte 0 # End Of Children Mark + .byte 4 # Abbrev [4] 0x2c:0x4 DW_TAG_base_type + .byte 2 # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 5 # Abbrev [5] 0x30:0x46 DW_TAG_subprogram + .byte 0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 1 # DW_AT_frame_base + .byte 87 + # DW_AT_call_all_calls + .byte 4 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .long 44 # DW_AT_type + # DW_AT_external + .byte 6 # Abbrev [6] 0x3f:0xa DW_TAG_formal_parameter + .byte 1 # DW_AT_location + .byte 85 + .byte 5 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .long 44 # DW_AT_type + .byte 6 # Abbrev [6] 0x49:0xa DW_TAG_formal_parameter + .byte 1 # DW_AT_location + .byte 84 + .byte 6 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 5 # DW_AT_decl_line + .long 118 # DW_AT_type + .byte 7 # Abbrev [7] 0x53:0x9 DW_TAG_variable + .byte 0 # DW_AT_const_value + .byte 8 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 6 # DW_AT_decl_line + .long 44 # DW_AT_type + .byte 8 # Abbrev [8] 0x5c:0x19 DW_TAG_lexical_block + .byte 1 # DW_AT_low_pc + .long .Ltmp20-.Ltmp2 # DW_AT_high_pc + .byte 7 # Abbrev [7] 0x62:0x9 DW_TAG_variable + .byte 0 # DW_AT_const_value + .byte 9 # DW_AT_name + .byte 1 # DW_AT_decl_file + .byte 7 # DW_AT_decl_line + .long 44 # DW_AT_type + .byte 9 # Abbrev [9] 0x6b:0x9 DW_TAG_inlined_subroutine + .long 26 # DW_AT_abstract_origin + .byte 0 # DW_AT_ranges + .byte 1 # DW_AT_call_file + .byte 8 # DW_AT_call_line + .byte 16 # DW_AT_call_column + .byte 0 # End Of Children Mark + .byte 0 # End Of Children Mark + .byte 10 # Abbrev [10] 0x76:0x5 DW_TAG_pointer_type + .long 123 # DW_AT_type + .byte 10 # Abbrev [10] 0x7b:0x5 DW_TAG_pointer_type + .long 128 # DW_AT_type + .byte 4 # Abbrev [4] 0x80:0x4 DW_TAG_base_type + .byte 7 # DW_AT_name + .byte 6 # DW_AT_encoding + .byte 1 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Ldebug_info_dwo_end0: + .section .debug_abbrev.dwo,"e",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 37 # DW_FORM_strx1 + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 118 # DW_AT_dwo_name + .byte 37 # DW_FORM_strx1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 110 # DW_AT_linkage_name + .byte 37 # DW_FORM_strx1 + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 32 # DW_AT_inline + .byte 33 # DW_FORM_implicit_const + .byte 1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + 
.byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 5 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 64 # DW_AT_frame_base + .byte 24 # DW_FORM_exprloc + .byte 122 # DW_AT_call_all_calls + .byte 25 # DW_FORM_flag_present + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 63 # DW_AT_external + .byte 25 # DW_FORM_flag_present + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 6 # Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 24 # DW_FORM_exprloc + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 7 # Abbreviation Code + .byte 52 # DW_TAG_variable + .byte 0 # DW_CHILDREN_no + .byte 28 # DW_AT_const_value + .byte 13 # DW_FORM_sdata + .byte 3 # DW_AT_name + .byte 37 # DW_FORM_strx1 + .byte 58 # DW_AT_decl_file + .byte 11 # DW_FORM_data1 + .byte 59 # DW_AT_decl_line + .byte 11 # DW_FORM_data1 + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 8 # Abbreviation Code + .byte 11 # DW_TAG_lexical_block + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 27 # DW_FORM_addrx + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 9 # Abbreviation Code + .byte 29 # DW_TAG_inlined_subroutine + .byte 0 # DW_CHILDREN_no + .byte 49 # DW_AT_abstract_origin + .byte 19 # DW_FORM_ref4 + .byte 85 # DW_AT_ranges + .byte 35 # DW_FORM_rnglistx + .byte 88 # DW_AT_call_file + .byte 11 # DW_FORM_data1 + .byte 89 # DW_AT_call_line + .byte 11 # DW_FORM_data1 + .byte 87 # DW_AT_call_column + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 10 # Abbreviation Code + .byte 15 # DW_TAG_pointer_type + .byte 0 # DW_CHILDREN_no + .byte 73 # DW_AT_type + .byte 19 # DW_FORM_ref4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_rnglists.dwo,"e",@progbits + .long .Ldebug_list_header_end1-.Ldebug_list_header_start1 # Length +.Ldebug_list_header_start1: + .short 5 # Version + .byte 8 # Address size + .byte 0 # Segment selector size + .long 1 # Offset entry count +.Lrnglists_dwo_table_base0: + .long .Ldebug_ranges0-.Lrnglists_dwo_table_base0 +.Ldebug_ranges0: + .byte 4 # DW_RLE_offset_pair + .uleb128 .Ltmp2-.Lfunc_begin0 # starting offset + .uleb128 .Ltmp15-.Lfunc_begin0 # ending offset + .byte 4 # DW_RLE_offset_pair + .uleb128 .Ltmp16-.Lfunc_begin0 # starting offset + .uleb128 .Ltmp17-.Lfunc_begin0 # ending offset + .byte 0 # DW_RLE_end_of_list +.Ldebug_list_header_end1: + .section .debug_addr,"",@progbits + .long 
.Ldebug_addr_end0-.Ldebug_addr_start0 # Length of contribution +.Ldebug_addr_start0: + .short 5 # DWARF version number + .byte 8 # Address size + .byte 0 # Segment selector size +.Laddr_table_base0: + .quad .Lfunc_begin0 + .quad .Ltmp2 +.Ldebug_addr_end0: + .section .debug_gnu_pubnames,"",@progbits + .long .LpubNames_end0-.LpubNames_start0 # Length of Public Names Info +.LpubNames_start0: + .short 2 # DWARF Version + .long .Lcu_begin0 # Offset of Compilation Unit Info + .long 62 # Compilation Unit Length + .long 26 # DIE offset + .byte 48 # Attributes: FUNCTION, EXTERNAL + .asciz "hotFunction" # External Name + .long 48 # DIE offset + .byte 48 # Attributes: FUNCTION, EXTERNAL + .asciz "main" # External Name + .long 0 # End Mark +.LpubNames_end0: + .section .debug_gnu_pubtypes,"",@progbits + .long .LpubTypes_end0-.LpubTypes_start0 # Length of Public Types Info +.LpubTypes_start0: + .short 2 # DWARF Version + .long .Lcu_begin0 # Offset of Compilation Unit Info + .long 62 # Compilation Unit Length + .long 44 # DIE offset + .byte 144 # Attributes: TYPE, STATIC + .asciz "int" # External Name + .long 128 # DIE offset + .byte 144 # Attributes: TYPE, STATIC + .asciz "char" # External Name + .long 0 # End Mark +.LpubTypes_end0: + .section .debug_gnu_pubnames,"",@progbits + .long .LpubNames_end1-.LpubNames_start1 # Length of Public Names Info +.LpubNames_start1: + .short 2 # DWARF Version + .long .Lcu_begin1 # Offset of Compilation Unit Info + .long 41 # Compilation Unit Length + .long 0 # End Mark +.LpubNames_end1: + .section .debug_gnu_pubtypes,"",@progbits + .long .LpubTypes_end1-.LpubTypes_start1 # Length of Public Types Info +.LpubTypes_start1: + .short 2 # DWARF Version + .long .Lcu_begin1 # Offset of Compilation Unit Info + .long 41 # Compilation Unit Length + .long 0 # End Mark +.LpubTypes_end1: + .ident "clang version 16.0.6" + .ident "clang version 16.0.6" + .section .GCC.command.line,"MS",@progbits,1 + .zero 1 + .ascii "" + .zero 1 + .ascii "" + .zero 1 + .section .debug_gnu_pubtypes,"",@progbits + .section ".note.GNU-stack","",@progbits + .addrsig + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb11.test b/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb11.test index 139b24afa1b0d..2426f240ad11c 100644 --- a/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb11.test +++ b/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb11.test @@ -18,9 +18,9 @@ # POSTCHECK-NEXT: 1: offset = 0x00000040, type_offset = 0x00000023, type_signature = 0x00f6cca4e3a15118 # POSTCHECK: Address area offset = 0x68, has 2 entries # POSTCHECK-NEXT: Low/High address = [0x[[#%.4x,ADDR:]], -# POSTCHECK-SAME: 0x[[#ADDR + 0xf]]) (Size: 0xf), CU id = 1 +# POSTCHECK-SAME: 0x[[#ADDR + 0xf]]) (Size: 0xf), CU id = 0 # POSTCHECK-NEXT: Low/High address = [0x[[#%.4x,ADDR1:]], -# POSTCHECK-SAME: 0x[[#ADDR1 + 0xd]]) (Size: 0xd), CU id = 3 +# POSTCHECK-SAME: 0x[[#ADDR1 + 0xd]]) (Size: 0xd), CU id = 1 # POSTCHECK: Symbol table offset = 0x90, size = 1024, filled slots # POSTCHECK-NEXT: 2: Name offset = 0x28, CU vector offset = 0x0 # POSTCHECK-NEXT: String name: S, CU vector index: 0 diff --git a/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb9.test b/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb9.test index 26ee101e9d1d1..b67c5b28e7ce9 100644 --- a/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb9.test +++ b/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb9.test @@ -20,7 +20,7 @@ # POSTCHECK-NEXT: 1: offset = 
0x00000040, type_offset = 0x00000023, type_signature = 0x00f6cca4e3a15118 # POSTCHECK: Address area offset = 0x88, has 2 entries # POSTCHECK-NEXT: Low/High address = [0x[[#%.4x,ADDR:]], -# POSTCHECK-SAME: 0x[[#ADDR + 0xf]]) (Size: 0xf), CU id = 1 +# POSTCHECK-SAME: 0x[[#ADDR + 0xf]]) (Size: 0xf), CU id = 2 # POSTCHECK-NEXT: Low/High address = [0x[[#%.4x,ADDR1:]], # POSTCHECK-SAME: 0x[[#ADDR1 + 0xd]]) (Size: 0xd), CU id = 3 # POSTCHECK: Symbol table offset = 0xb0, size = 1024, filled slots @@ -37,7 +37,7 @@ # POSTCHECK-NEXT: 754: Name offset = 0x43, CU vector offset = 0x0 # POSTCHECK-NEXT: String name: int, CU vector index: 0 # POSTCHECK: Constant pool offset = 0x20b0, has 5 CU vectors -# POSTCHECK-NEXT: 0(0x0): 0x90000001 +# POSTCHECK-NEXT: 0(0x0): 0x90000002 # POSTCHECK-NEXT: 1(0x8): 0x90000003 -# POSTCHECK-NEXT: 2(0x10): 0x30000001 +# POSTCHECK-NEXT: 2(0x10): 0x30000002 # POSTCHECK-NEXT: 3(0x18): 0x30000003 diff --git a/bolt/test/X86/dwarf5-gdb-index-types-lld-generated.test b/bolt/test/X86/dwarf5-gdb-index-types-lld-generated.test index 731c560133399..740f199d14042 100644 --- a/bolt/test/X86/dwarf5-gdb-index-types-lld-generated.test +++ b/bolt/test/X86/dwarf5-gdb-index-types-lld-generated.test @@ -15,9 +15,9 @@ # POSTCHECK: Types CU list offset = 0x38, has 0 entries # POSTCHECK: Address area offset = 0x38, has 2 entries # POSTCHECK-NEXT: Low/High address = [0x[[#%.4x,ADDR:]], -# POSTCHECK-SAME: 0x[[#ADDR + 0xf]]) (Size: 0xf), CU id = 1 +# POSTCHECK-SAME: 0x[[#ADDR + 0xf]]) (Size: 0xf), CU id = 0 # POSTCHECK-NEXT: Low/High address = [0x[[#%.4x,ADDR1:]], -# POSTCHECK-SAME: 0x[[#ADDR1 + 0xd]]) (Size: 0xd), CU id = 3 +# POSTCHECK-SAME: 0x[[#ADDR1 + 0xd]]) (Size: 0xd), CU id = 1 # POSTCHECK: Symbol table offset = 0x60, size = 1024, filled slots # POSTCHECK-NEXT: 2: Name offset = 0x38, CU vector offset = 0x0 # POSTCHECK-NEXT: String name: S, CU vector index: 0 diff --git a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp b/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp index 1ab40aacbfe09..b37dc272ea156 100644 --- a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp +++ b/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp @@ -274,11 +274,12 @@ Error MustacheHTMLGenerator::generateDocForInfo(Info *I, raw_ostream &OS, } Error MustacheHTMLGenerator::createResources(ClangDocContext &CDCtx) { + std::string ResourcePath(CDCtx.OutDirectory + "/html"); for (const auto &FilePath : CDCtx.UserStylesheets) - if (Error Err = copyFile(FilePath, CDCtx.OutDirectory)) + if (Error Err = copyFile(FilePath, ResourcePath)) return Err; for (const auto &FilePath : CDCtx.JsScripts) - if (Error Err = copyFile(FilePath, CDCtx.OutDirectory)) + if (Error Err = copyFile(FilePath, ResourcePath)) return Err; return Error::success(); } diff --git a/clang-tools-extra/clang-doc/Serialize.cpp b/clang-tools-extra/clang-doc/Serialize.cpp index dd7cd0b2ae736..186f634dd892a 100644 --- a/clang-tools-extra/clang-doc/Serialize.cpp +++ b/clang-tools-extra/clang-doc/Serialize.cpp @@ -780,12 +780,10 @@ static void populateSymbolInfo(SymbolInfo &I, const T *D, const FullComment *C, MangledStream << D->getNameAsString(); // A 250 length limit was chosen since 255 is a common limit across // different filesystems, with a 5 character buffer for file extensions. - if (MangledName.size() > 250) - // File creation fails if the mangled name is too long, so default to the - // USR. 
We should look for a better check since filesystems differ in - // maximum filename length - I.MangledName = llvm::toStringRef(llvm::toHex(I.USR)); - else + if (MangledName.size() > 250) { + auto SymbolID = llvm::toStringRef(llvm::toHex(I.USR)).str(); + I.MangledName = MangledName.substr(0, 250 - SymbolID.size()) + SymbolID; + } else I.MangledName = MangledName; delete Mangler; } diff --git a/clang-tools-extra/clang-include-fixer/find-all-symbols/tool/run-find-all-symbols.py b/clang-tools-extra/clang-include-fixer/find-all-symbols/tool/run-find-all-symbols.py index 471dbf8c110b4..49a1b14932644 100755 --- a/clang-tools-extra/clang-include-fixer/find-all-symbols/tool/run-find-all-symbols.py +++ b/clang-tools-extra/clang-include-fixer/find-all-symbols/tool/run-find-all-symbols.py @@ -26,7 +26,7 @@ import json import multiprocessing import os -import Queue +from queue import Queue import shutil import subprocess import sys @@ -105,7 +105,7 @@ def main(): try: # Spin up a bunch of tidy-launching threads. - queue = Queue.Queue(max_task) + queue = Queue(max_task) for _ in range(max_task): t = threading.Thread( target=run_find_all_symbols, args=(args, tmpdir, build_path, queue) diff --git a/clang-tools-extra/clang-tidy/ClangTidy.cpp b/clang-tools-extra/clang-tidy/ClangTidy.cpp index e100f1412b066..7e18f3806a143 100644 --- a/clang-tools-extra/clang-tidy/ClangTidy.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidy.cpp @@ -9,7 +9,7 @@ /// \file This file implements a clang-tidy tool. /// /// This tool uses the Clang Tooling infrastructure, see -/// http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html +/// https://clang.llvm.org/docs/HowToSetupToolingForLLVM.html /// for details on setting it up with LLVM source tree. /// //===----------------------------------------------------------------------===// diff --git a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp index 823c7b5626e97..65fd09f99ef0f 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidyDiagnosticConsumer.cpp @@ -10,7 +10,7 @@ /// and ClangTidyError classes. /// /// This tool uses the Clang Tooling infrastructure, see -/// http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html +/// https://clang.llvm.org/docs/HowToSetupToolingForLLVM.html /// for details on setting it up with LLVM source tree. /// //===----------------------------------------------------------------------===// diff --git a/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.h b/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.h index 414085146bfe4..49d27b3c35144 100644 --- a/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::abseil { /// deduction (CTAD), in C++17 and higher. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/cleanup-ctad.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/cleanup-ctad.html class CleanupCtadCheck : public utils::TransformerClangTidyCheck { public: CleanupCtadCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h index e740326a3d6de..b728118c3da03 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationAdditionCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::abseil { /// ``absl::Time`` domain. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-addition.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-addition.html class DurationAdditionCheck : public ClangTidyCheck { public: DurationAdditionCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.h index d9fc8cb165235..45ec6877fb6cb 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationComparisonCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::abseil { /// domain. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-comparison.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-comparison.html class DurationComparisonCheck : public ClangTidyCheck { public: DurationComparisonCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.h index cd45bc078fde6..c41ba479a12b8 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationConversionCastCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::abseil { /// the right conversion function instead. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-conversion-cast.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-conversion-cast.html class DurationConversionCastCheck : public ClangTidyCheck { public: DurationConversionCastCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/abseil/DurationDivisionCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationDivisionCheck.h index 810f7d269f38f..802b6b91b60d5 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationDivisionCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationDivisionCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::abseil { // Find potential incorrect uses of integer division of absl::Duration objects. 
// // For the user-facing documentation see: -// http://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-division.html +// https://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-division.html class DurationDivisionCheck : public ClangTidyCheck { public: diff --git a/clang-tools-extra/clang-tidy/abseil/DurationFactoryFloatCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationFactoryFloatCheck.h index 1d688da43e268..9133cd50b02cd 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationFactoryFloatCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationFactoryFloatCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::abseil { /// component. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-factory-float.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-factory-float.html class DurationFactoryFloatCheck : public ClangTidyCheck { public: DurationFactoryFloatCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.h index 1d53d13fa9f9a..e02620b6c8201 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationFactoryScaleCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::abseil { /// case of zero and suggests `ZeroDuration()`. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-factory-scale.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-factory-scale.html class DurationFactoryScaleCheck : public ClangTidyCheck { public: DurationFactoryScaleCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.h index b092561df909c..4b3ed9c53be6c 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationSubtractionCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::abseil { /// `absl::Duration` domain. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-subtraction.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-subtraction.html class DurationSubtractionCheck : public ClangTidyCheck { public: DurationSubtractionCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h b/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h index a5bd4dca6ce1f..59af8968e8b38 100644 --- a/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/DurationUnnecessaryConversionCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::abseil { /// to numeric types and back again. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-unnecessary-conversion.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/duration-unnecessary-conversion.html class DurationUnnecessaryConversionCheck : public ClangTidyCheck { public: DurationUnnecessaryConversionCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.h b/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.h index 96e261d86697b..3a3f241088d14 100644 --- a/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/FasterStrsplitDelimiterCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::abseil { /// is a single character string literal and replaces it with a character. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/faster-strsplit-delimiter.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/faster-strsplit-delimiter.html class FasterStrsplitDelimiterCheck : public ClangTidyCheck { public: FasterStrsplitDelimiterCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h b/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h index 7b46ba55e008f..2911a1ad14ae8 100644 --- a/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/NoInternalDependenciesCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::abseil { /// against doing so. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/no-internal-dependencies.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/no-internal-dependencies.html class NoInternalDependenciesCheck : public ClangTidyCheck { public: NoInternalDependenciesCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/abseil/NoNamespaceCheck.h b/clang-tools-extra/clang-tidy/abseil/NoNamespaceCheck.h index d3ab5cc5219ef..c970c9aced55f 100644 --- a/clang-tools-extra/clang-tidy/abseil/NoNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/NoNamespaceCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::abseil { /// Abseil's compatibility guidelines. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/no-namespace.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/no-namespace.html class NoNamespaceCheck : public ClangTidyCheck { public: NoNamespaceCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.h b/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.h index a5300a399c89d..8fc1eaf25d800 100644 --- a/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/RedundantStrcatCallsCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::abseil { /// StrCat(1, StrCat(2, 3)) ==> StrCat(1, 2, 3) /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/redundant-strcat-calls.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/redundant-strcat-calls.html class RedundantStrcatCallsCheck : public ClangTidyCheck { public: RedundantStrcatCallsCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.h b/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.h index 93245c01cebb4..56d63b6565ffe 100644 --- a/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::abseil { /// should be used instead. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/str-cat-append.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/str-cat-append.html class StrCatAppendCheck : public ClangTidyCheck { public: StrCatAppendCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.h b/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.h index f939c0b5791e5..a96f111a33468 100644 --- a/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::abseil { /// types) and suggests replacing with absl::StrContains. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/string-find-str-contains.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/string-find-str-contains.html class StringFindStrContainsCheck : public utils::TransformerClangTidyCheck { public: StringFindStrContainsCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h b/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h index bbf74bebd26ae..703d9514e8c07 100644 --- a/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/TimeComparisonCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::abseil { /// domain. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/time-comparison.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/time-comparison.html class TimeComparisonCheck : public ClangTidyCheck { public: TimeComparisonCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.h b/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.h index f8bb599d36d5d..3f44a3ded91e4 100644 --- a/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/TimeSubtractionCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::abseil { /// in the time domain instead of the numeric domain. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/time-subtraction.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/time-subtraction.html class TimeSubtractionCheck : public ClangTidyCheck { public: TimeSubtractionCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.h b/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.h index e4865b941f2ac..742ff41faed96 100644 --- a/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.h +++ b/clang-tools-extra/clang-tidy/abseil/UpgradeDurationConversionsCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::abseil { /// factories. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/abseil/upgrade-duration-conversions.html +/// https://clang.llvm.org/extra/clang-tidy/checks/abseil/upgrade-duration-conversions.html class UpgradeDurationConversionsCheck : public ClangTidyCheck { public: UpgradeDurationConversionsCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/add_new_check.py b/clang-tools-extra/clang-tidy/add_new_check.py index dfa429e6455b2..80e1afe1a121c 100755 --- a/clang-tools-extra/clang-tidy/add_new_check.py +++ b/clang-tools-extra/clang-tidy/add_new_check.py @@ -109,7 +109,7 @@ def write_header( %(description)s /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/%(module)s/%(check_name)s.html +/// https://clang.llvm.org/extra/clang-tidy/checks/%(module)s/%(check_name)s.html class %(check_name_camel)s : public ClangTidyCheck { public: %(check_name_camel)s(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h b/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h index 0030faa3c5ec5..b777918ab7e7b 100644 --- a/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h +++ b/clang-tools-extra/clang-tidy/altera/IdDependentBackwardBranchCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::altera { /// degradation. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/altera/id-dependent-backward-branch.html +/// https://clang.llvm.org/extra/clang-tidy/checks/altera/id-dependent-backward-branch.html class IdDependentBackwardBranchCheck : public ClangTidyCheck { private: enum LoopType { UnknownLoop = -1, DoLoop = 0, WhileLoop = 1, ForLoop = 2 }; diff --git a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h index bd8aafecf4f76..182d10b5539e5 100644 --- a/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h +++ b/clang-tools-extra/clang-tidy/altera/KernelNameRestrictionCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::altera { /// `Verilog.cl`, or `VHDL.cl`. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/altera/kernel-name-restriction.html +/// https://clang.llvm.org/extra/clang-tidy/checks/altera/kernel-name-restriction.html class KernelNameRestrictionCheck : public ClangTidyCheck { public: KernelNameRestrictionCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h b/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h index 5560f2765f9f9..dab3dbce50371 100644 --- a/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h +++ b/clang-tools-extra/clang-tidy/altera/SingleWorkItemBarrierCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::altera { /// kernels, which may be inefficient or cause an error. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/altera/single-work-item-barrier.html +/// https://clang.llvm.org/extra/clang-tidy/checks/altera/single-work-item-barrier.html class SingleWorkItemBarrierCheck : public ClangTidyCheck { const unsigned AOCVersion; diff --git a/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.h b/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.h index f6f2d1fa529e5..93da4d3d136fe 100644 --- a/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.h +++ b/clang-tools-extra/clang-tidy/altera/StructPackAlignCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::altera { /// packing and/or aligning of said structs as needed. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/altera/struct-pack-align.html +/// https://clang.llvm.org/extra/clang-tidy/checks/altera/struct-pack-align.html class StructPackAlignCheck : public ClangTidyCheck { public: StructPackAlignCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.h b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.h index 453176fa4894a..90c4755667754 100644 --- a/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.h +++ b/clang-tools-extra/clang-tidy/altera/UnrollLoopsCheck.h @@ -21,7 +21,7 @@ namespace clang::tidy::altera { /// they cannot be fully unrolled, and should be partially unrolled. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/altera/unroll-loops.html +/// https://clang.llvm.org/extra/clang-tidy/checks/altera/unroll-loops.html class UnrollLoopsCheck : public ClangTidyCheck { public: UnrollLoopsCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h b/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h index e7286dc519484..02c4e0056ea15 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecAccept4Check.h @@ -16,7 +16,7 @@ namespace clang::tidy::android { /// Finds code that uses accept4() without using the SOCK_CLOEXEC flag. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-accept4.html +/// https://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-accept4.html class CloexecAccept4Check : public CloexecCheck { public: CloexecAccept4Check(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h b/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h index 9b982b2b104ca..4540f938fd478 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecAcceptCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::android { /// accept() is better to be replaced by accept4(). /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-accept.html +/// https://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-accept.html class CloexecAcceptCheck : public CloexecCheck { public: CloexecAcceptCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h b/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h index e0629f2ac4061..ee2f51abf05fc 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecCreatCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::android { /// creat() is better to be replaced by open(). /// Find the usage of creat() and redirect user to use open(). -/// http://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-creat.html +/// https://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-creat.html class CloexecCreatCheck : public CloexecCheck { public: CloexecCreatCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h b/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h index 3016867e56189..f5699685ed086 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecDupCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::android { /// Find the usage of dup() and redirect user to use fcntl(). 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-dup.html +/// https://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-dup.html class CloexecDupCheck : public CloexecCheck { public: CloexecDupCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h b/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h index cb0d40b8b9f36..f467b87a6cf70 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecEpollCreate1Check.h @@ -16,7 +16,7 @@ namespace clang::tidy::android { /// Finds code that uses epoll_create1() without using the EPOLL_CLOEXEC flag. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-epoll-create1.html +/// https://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-epoll-create1.html class CloexecEpollCreate1Check : public CloexecCheck { public: CloexecEpollCreate1Check(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h b/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h index 9010179bd7036..a8d17c82d457d 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecEpollCreateCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::android { /// epoll_create() is better to be replaced by epoll_create1(). /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-epoll-create.html +/// https://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-epoll-create.html class CloexecEpollCreateCheck : public CloexecCheck { public: CloexecEpollCreateCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h b/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h index 1e0e7d76933c7..646b237a663e0 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecFopenCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::android { /// This check only works when corresponding argument is StringLiteral. No /// constant propagation. /// -/// http://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-fopen.html +/// https://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-fopen.html class CloexecFopenCheck : public CloexecCheck { public: CloexecFopenCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h b/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h index 50bc4bbaa7de5..3960d05e2e1f0 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecInotifyInit1Check.h @@ -16,7 +16,7 @@ namespace clang::tidy::android { /// Finds code that uses inotify_init1() without using the IN_CLOEXEC flag. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-inotify-init1.html +/// https://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-inotify-init1.html class CloexecInotifyInit1Check : public CloexecCheck { public: CloexecInotifyInit1Check(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h b/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h index 7db4ab15c2f9e..cb9e6820571bc 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecInotifyInitCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::android { /// inotify_init() is better to be replaced by inotify_init1(). /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-inotify-init.html +/// https://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-inotify-init.html class CloexecInotifyInitCheck : public CloexecCheck { public: CloexecInotifyInitCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h b/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h index 43a27dd5658a5..dd96ee968f3b4 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecMemfdCreateCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::android { /// Finds code that uses memfd_create() without using the MFD_CLOEXEC flag. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-memfd-create.html +/// https://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-memfd-create.html class CloexecMemfdCreateCheck : public CloexecCheck { public: CloexecMemfdCreateCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h b/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h index 17d9b4f326e86..496bd6b6cbbc0 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h +++ b/clang-tools-extra/clang-tidy/android/CloexecPipe2Check.h @@ -16,7 +16,7 @@ namespace clang::tidy::android { /// Finds code that uses pipe2() without using the O_CLOEXEC flag. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-pipe2.html +/// https://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-pipe2.html class CloexecPipe2Check : public CloexecCheck { public: CloexecPipe2Check(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h b/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h index 47a202e8542eb..f0145e14eb49f 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecPipeCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::android { /// Suggests to replace calls to pipe() with calls to pipe2(). 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-pipe.html +/// https://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-pipe.html class CloexecPipeCheck : public CloexecCheck { public: CloexecPipeCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h b/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h index 8ef02c1f197b7..0a29d7224e781 100644 --- a/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h +++ b/clang-tools-extra/clang-tidy/android/CloexecSocketCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::android { /// Finds code that uses socket() without using the SOCK_CLOEXEC flag. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-socket.html +/// https://clang.llvm.org/extra/clang-tidy/checks/android/cloexec-socket.html class CloexecSocketCheck : public CloexecCheck { public: CloexecSocketCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h index 107d801969fc4..cd844656dd7b7 100644 --- a/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h +++ b/clang-tools-extra/clang-tidy/boost/UseRangesCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::boost { /// replaced with a boost ranges version instead /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/boost/use-ranges.html +/// https://clang.llvm.org/extra/clang-tidy/checks/boost/use-ranges.html class UseRangesCheck : public utils::UseRangesCheck { public: UseRangesCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h b/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h index a245d11ee1c8a..af87f15a1dc0b 100644 --- a/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h +++ b/clang-tools-extra/clang-tidy/boost/UseToStringCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::boost { /// ``std::to_string`` and ``std::to_wstring`` calls. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/boost/use-to-string.html +/// https://clang.llvm.org/extra/clang-tidy/checks/boost/use-to-string.html class UseToStringCheck : public ClangTidyCheck { public: UseToStringCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.h b/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.h index 3ae4f36913d5f..613c3a7b30a2e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/AssignmentInIfConditionCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::bugprone { /// Catches assignments within the condition clause of an if statement. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/assignment-in-if-condition.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/assignment-in-if-condition.html class AssignmentInIfConditionCheck : public ClangTidyCheck { public: AssignmentInIfConditionCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.h b/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.h index aa4d83c89a08d..9a6c9db418fd1 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/BadSignalToKillThreadCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// Finds ``pthread_kill`` function calls when thread is terminated by /// ``SIGTERM`` signal. /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/bad-signal-to-kill-thread.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/bad-signal-to-kill-thread.html class BadSignalToKillThreadCheck : public ClangTidyCheck { public: BadSignalToKillThreadCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/BitwisePointerCastCheck.h b/clang-tools-extra/clang-tidy/bugprone/BitwisePointerCastCheck.h index 71dc159573619..f8e44bda3018e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BitwisePointerCastCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/BitwisePointerCastCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// ``std::bit_cast`` or ``memcpy``. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/bitwise-pointer-cast.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/bitwise-pointer-cast.html class BitwisePointerCastCheck : public ClangTidyCheck { public: BitwisePointerCastCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.h b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.h index 22dbb2384900c..3888d5ede7b41 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::bugprone { /// the true and false expressions are Type I clones of each other. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/branch-clone.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/branch-clone.html class BranchCloneCheck : public ClangTidyCheck { public: BranchCloneCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp index 8baa8f6b35d4c..e6115f67656bc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp @@ -19,6 +19,7 @@ #include "CapturingThisInMemberVariableCheck.h" #include "CastingThroughVoidCheck.h" #include "ChainedComparisonCheck.h" +#include "CommandProcessorCheck.h" #include "ComparePointerToMemberVirtualFunctionCheck.h" #include "CopyConstructorInitCheck.h" #include "CrtpConstructorAccessibilityCheck.h" @@ -130,6 +131,8 @@ class BugproneModule : public ClangTidyModule { "bugprone-casting-through-void"); CheckFactories.registerCheck( "bugprone-chained-comparison"); + CheckFactories.registerCheck( + "bugprone-command-processor"); CheckFactories.registerCheck( "bugprone-compare-pointer-to-member-virtual-function"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt index b0dbe84a16cd4..c8943e5b22ef8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt @@ -15,6 +15,7 @@ add_clang_library(clangTidyBugproneModule STATIC CapturingThisInMemberVariableCheck.cpp CastingThroughVoidCheck.cpp ChainedComparisonCheck.cpp + CommandProcessorCheck.cpp ComparePointerToMemberVirtualFunctionCheck.cpp CopyConstructorInitCheck.cpp CrtpConstructorAccessibilityCheck.cpp diff --git a/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.h b/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.h index 6aba9ee84d2bd..9acc3f2f9b63a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/CapturingThisInMemberVariableCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::bugprone { /// assignments. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/capturing-this-in-member-variable.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/capturing-this-in-member-variable.html class CapturingThisInMemberVariableCheck : public ClangTidyCheck { public: CapturingThisInMemberVariableCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.h b/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.h index 313f3f240f5b3..0d2eba1977e97 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/CastingThroughVoidCheck.h @@ -15,7 +15,7 @@ namespace clang::tidy::bugprone { /// Detects unsafe or redundant two-step casting operations involving ``void*``. 
/// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/casting-through-void.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/casting-through-void.html class CastingThroughVoidCheck : public ClangTidyCheck { public: CastingThroughVoidCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.h b/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.h index bf8e3f709d30b..7c1022904a3a6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// behavior or logical errors. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/chained-comparison.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/chained-comparison.html class ChainedComparisonCheck : public ClangTidyCheck { public: ChainedComparisonCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cert/CommandProcessorCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/CommandProcessorCheck.cpp similarity index 95% rename from clang-tools-extra/clang-tidy/cert/CommandProcessorCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/CommandProcessorCheck.cpp index d87396f5189b1..a09c1a931cdb5 100644 --- a/clang-tools-extra/clang-tidy/cert/CommandProcessorCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/CommandProcessorCheck.cpp @@ -11,7 +11,7 @@ using namespace clang::ast_matchers; -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { void CommandProcessorCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher( @@ -35,4 +35,4 @@ void CommandProcessorCheck::check(const MatchFinder::MatchResult &Result) { diag(E->getExprLoc(), "calling %0 uses a command processor") << Fn; } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/cert/CommandProcessorCheck.h b/clang-tools-extra/clang-tidy/bugprone/CommandProcessorCheck.h similarity index 72% rename from clang-tools-extra/clang-tidy/cert/CommandProcessorCheck.h rename to clang-tools-extra/clang-tidy/bugprone/CommandProcessorCheck.h index 94234f284c045..bd4683410ae6f 100644 --- a/clang-tools-extra/clang-tidy/cert/CommandProcessorCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/CommandProcessorCheck.h @@ -6,12 +6,12 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_COMMAND_PROCESSOR_CHECK_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_COMMAND_PROCESSOR_CHECK_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COMMANDPROCESSORCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COMMANDPROCESSORCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Execution of a command processor can lead to security vulnerabilities, /// and is generally not required. Instead, prefer to launch executables @@ -19,7 +19,7 @@ namespace clang::tidy::cert { /// actually launched. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cert/env33-c.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/command-processor.html class CommandProcessorCheck : public ClangTidyCheck { public: CommandProcessorCheck(StringRef Name, ClangTidyContext *Context) @@ -28,6 +28,6 @@ class CommandProcessorCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_COMMAND_PROCESSOR_CHECK_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_COMMANDPROCESSORCHECK_H diff --git a/clang-tools-extra/clang-tidy/bugprone/ComparePointerToMemberVirtualFunctionCheck.h b/clang-tools-extra/clang-tidy/bugprone/ComparePointerToMemberVirtualFunctionCheck.h index 15561e068a670..0f245d18d9f8f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ComparePointerToMemberVirtualFunctionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ComparePointerToMemberVirtualFunctionCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// member virtual function and anything other than null-pointer-constant. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/compare-pointer-to-member-virtual-function.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/compare-pointer-to-member-virtual-function.html class ComparePointerToMemberVirtualFunctionCheck : public ClangTidyCheck { public: ComparePointerToMemberVirtualFunctionCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h index 02755b5894b18..cba1a25d9bc19 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/CopyConstructorInitCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// the base class. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/copy-constructor-init.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/copy-constructor-init.html class CopyConstructorInitCheck : public ClangTidyCheck { public: CopyConstructorInitCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.h b/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.h index c7d7c9f7c0e69..95cf673744b91 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/CrtpConstructorAccessibilityCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// CRTP can be constructed outside itself and the derived class. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/crtp-constructor-accessibility.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/crtp-constructor-accessibility.html class CrtpConstructorAccessibilityCheck : public ClangTidyCheck { public: CrtpConstructorAccessibilityCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h b/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h index 6443b0aa59548..486562c30f79e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::bugprone { /// Detect dangling references in value handlers like std::string_view. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/dangling-handle.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/dangling-handle.html class DanglingHandleCheck : public ClangTidyCheck { public: DanglingHandleCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.h b/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.h index d157e84ebdd98..6ef3cd7c04dc1 100644 --- a/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/DerivedMethodShadowingBaseMethodCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// a base class /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/derived-method-shadowing-base-method.html class DerivedMethodShadowingBaseMethodCheck : public ClangTidyCheck { public: DerivedMethodShadowingBaseMethodCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.h b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.h index 284b4f5b9935e..8e74870e192d9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// swapped (or badly ordered) arguments. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/easily-swappable-parameters.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/easily-swappable-parameters.html class EasilySwappableParametersCheck : public ClangTidyCheck { public: EasilySwappableParametersCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.h b/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.h index acef43934adba..db8ee02dfd404 100644 --- a/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// Detects and suggests addressing issues with empty catch statements. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/empty-catch.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/empty-catch.html class EmptyCatchCheck : public ClangTidyCheck { public: EmptyCatchCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h index 974b07c42407d..ae6e2024e415d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ExceptionEscapeCheck.h @@ -21,7 +21,7 @@ namespace clang::tidy::bugprone { /// given as option to the checker. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/exception-escape.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/exception-escape.html class ExceptionEscapeCheck : public ClangTidyCheck { public: ExceptionEscapeCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h b/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h index 72dab600e3330..119728d972309 100644 --- a/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/FoldInitTypeCheck.h @@ -21,7 +21,7 @@ namespace clang::tidy::bugprone { /// \endcode /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/fold-init-type.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/fold-init-type.html class FoldInitTypeCheck : public ClangTidyCheck { public: FoldInitTypeCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.h b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.h index 700e52f7bb86d..5fa4f0984b107 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ForwardDeclarationNamespaceCheck.h @@ -34,7 +34,7 @@ namespace clang::tidy::bugprone { /// point. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/forward-declaration-namespace.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/forward-declaration-namespace.html class ForwardDeclarationNamespaceCheck : public ClangTidyCheck { public: ForwardDeclarationNamespaceCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.h b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.h index ead0edb6a2b3a..35e09d64481e7 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ForwardingReferenceOverloadCheck.h @@ -23,7 +23,7 @@ namespace clang::tidy::bugprone { /// C++ Design, item 26. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/forwarding-reference-overload.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/forwarding-reference-overload.html class ForwardingReferenceOverloadCheck : public ClangTidyCheck { public: ForwardingReferenceOverloadCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.h b/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.h index 74c64eb43f3c9..3aba92f4a2a8d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ImplicitWideningOfMultiplicationResultCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// Diagnoses instances of an implicit widening of multiplication result. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/implicit-widening-of-multiplication-result.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/implicit-widening-of-multiplication-result.html class ImplicitWideningOfMultiplicationResultCheck : public ClangTidyCheck { const ast_matchers::MatchFinder::MatchResult *Result; bool ShouldUseCXXStaticCast; diff --git a/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.h b/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.h index 2e2dcb1cde7bc..f4be441489854 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// ambiguity in the variable's value. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/inc-dec-in-conditions.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/inc-dec-in-conditions.html class IncDecInConditionsCheck : public ClangTidyCheck { public: IncDecInConditionsCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.h b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.h index ea9cb4ecd0006..d22bab57bcbd3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableIfCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// ``type`` type. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/incorrect-enable-if.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/incorrect-enable-if.html class IncorrectEnableIfCheck : public ClangTidyCheck { public: IncorrectEnableIfCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.h b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.h index 866ae56631e36..554065fb52931 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/IncorrectEnableSharedFromThisCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// otherwise occur when calling ``shared_from_this``. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/incorrect-enable-shared-from-this.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/incorrect-enable-shared-from-this.html class IncorrectEnableSharedFromThisCheck : public ClangTidyCheck { public: IncorrectEnableSharedFromThisCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.h b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.h index 0766e2fa3c35d..e26a55575b822 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/InfiniteLoopCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// not changed at all). /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/infinite-loop.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/infinite-loop.html class InfiniteLoopCheck : public ClangTidyCheck { public: InfiniteLoopCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h b/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h index b191cf693029e..777e31868c961 100644 --- a/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/IntegerDivisionCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// cause unintended loss of precision. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/integer-division.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/integer-division.html class IntegerDivisionCheck : public ClangTidyCheck { public: IntegerDivisionCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h index b9b4f20d111fc..4f1a4a2a21af3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/InvalidEnumDefaultInitializationCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// the enum has no enumerator with value of 0. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/invalid-enum-default-initialization.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/invalid-enum-default-initialization.html class InvalidEnumDefaultInitializationCheck : public ClangTidyCheck { public: InvalidEnumDefaultInitializationCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.h b/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.h index 9e53951c4a7bd..f672a56317395 100644 --- a/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/LambdaFunctionNameCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// (i.e., `operator()`). 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/lambda-function-name.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/lambda-function-name.html class LambdaFunctionNameCheck : public ClangTidyCheck { public: struct SourceRangeLessThan { diff --git a/clang-tools-extra/clang-tidy/bugprone/MisleadingSetterOfReferenceCheck.h b/clang-tools-extra/clang-tidy/bugprone/MisleadingSetterOfReferenceCheck.h index b44f7a4ccb795..e0b6b929b5b95 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisleadingSetterOfReferenceCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MisleadingSetterOfReferenceCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// is used to set value of a field with reference type. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/throw-keyword-missing.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/misleading-setter-of-reference.html class MisleadingSetterOfReferenceCheck : public ClangTidyCheck { public: MisleadingSetterOfReferenceCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h b/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h index 764fd3ff97fed..f650145203ce6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedOperatorInStrlenInAllocCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// argument to a memory allocation function. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/misplaced-operator-in-strlen-in-alloc.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/misplaced-operator-in-strlen-in-alloc.html class MisplacedOperatorInStrlenInAllocCheck : public ClangTidyCheck { public: MisplacedOperatorInStrlenInAllocCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h b/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h index a86d2a33d503f..e78c30cbb644a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedPointerArithmeticInAllocCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// memory allocation function instead of its argument. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/misplaced-pointer-arithmetic-in-alloc.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/misplaced-pointer-arithmetic-in-alloc.html class MisplacedPointerArithmeticInAllocCheck : public ClangTidyCheck { public: MisplacedPointerArithmeticInAllocCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.h b/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.h index 5fde5c7d0e46d..151bbcf028783 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MisplacedWideningCastCheck.h @@ -24,7 +24,7 @@ namespace clang::tidy::bugprone { // be the most common case. Enabled by default.
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/misplaced-widening-cast.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/misplaced-widening-cast.html class MisplacedWideningCastCheck : public ClangTidyCheck { public: MisplacedWideningCastCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.h b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.h index 4fc876a232f37..523f49a2c94a9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.h @@ -30,7 +30,7 @@ namespace clang::tidy::bugprone { /// The check suggests replacing the std::move with a std::forward. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/move-forwarding-reference.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/move-forwarding-reference.html class MoveForwardingReferenceCheck : public ClangTidyCheck { public: MoveForwardingReferenceCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.h b/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.h index d0a9a21523862..6d3d6e5f1c687 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MultiLevelImplicitPointerConversionCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// indirection. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/multi-level-implicit-pointer-conversion.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/multi-level-implicit-pointer-conversion.html class MultiLevelImplicitPointerConversionCheck : public ClangTidyCheck { public: MultiLevelImplicitPointerConversionCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.h b/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.h index 53ad4a514bcc7..a94fa6eef7a54 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.h @@ -14,7 +14,7 @@ namespace clang::tidy::bugprone { /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/multiple-new-in-one-expression.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/multiple-new-in-one-expression.html class MultipleNewInOneExpressionCheck : public ClangTidyCheck { public: MultipleNewInOneExpressionCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h index 73a00fa493797..1a2d4a410b46e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/MultipleStatementMacroCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// other ones will be executed unconditionally. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/multiple-statement-macro.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/multiple-statement-macro.html class MultipleStatementMacroCheck : public ClangTidyCheck { public: MultipleStatementMacroCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h index 1f37086e3af55..9c05827556e67 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/NarrowingConversionsCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// i += 0.1; /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/narrowing-conversions.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/narrowing-conversions.html class NarrowingConversionsCheck : public ClangTidyCheck { public: NarrowingConversionsCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/NoEscapeCheck.h b/clang-tools-extra/clang-tidy/bugprone/NoEscapeCheck.h index 4760b171e75ce..8b01c700bdcc7 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NoEscapeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/NoEscapeCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// attribute, then we warn the user of their error. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/no-escape.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/no-escape.html class NoEscapeCheck : public ClangTidyCheck { public: NoEscapeCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/NonZeroEnumToBoolConversionCheck.h b/clang-tools-extra/clang-tidy/bugprone/NonZeroEnumToBoolConversionCheck.h index 977545fd5b65c..551d46eaaea0b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NonZeroEnumToBoolConversionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/NonZeroEnumToBoolConversionCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// `enum` type doesn't have a zero-value enumerator. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/non-zero-enum-to-bool-conversion.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/non-zero-enum-to-bool-conversion.html class NonZeroEnumToBoolConversionCheck : public ClangTidyCheck { public: NonZeroEnumToBoolConversionCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.h b/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.h index 698872fefca90..054d5804745bc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/NondeterministicPointerIterationOrderCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// pointers. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/nondeterministic-pointer-iteration-order.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/nondeterministic-pointer-iteration-order.html class NondeterministicPointerIterationOrderCheck : public ClangTidyCheck { public: NondeterministicPointerIterationOrderCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h index fa2ca59b65300..ca3fbf0febf7a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/NotNullTerminatedResultCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::bugprone { /// when the string is read. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/not-null-terminated-result.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/not-null-terminated-result.html class NotNullTerminatedResultCheck : public ClangTidyCheck { public: NotNullTerminatedResultCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/OptionalValueConversionCheck.h b/clang-tools-extra/clang-tidy/bugprone/OptionalValueConversionCheck.h index 83e08e7359224..3341865e1ec84 100644 --- a/clang-tools-extra/clang-tidy/bugprone/OptionalValueConversionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/OptionalValueConversionCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::bugprone { /// of the same optional-like type. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/optional-value-conversion.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/optional-value-conversion.html class OptionalValueConversionCheck : public ClangTidyCheck { public: OptionalValueConversionCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/ParentVirtualCallCheck.h b/clang-tools-extra/clang-tidy/bugprone/ParentVirtualCallCheck.h index 2f86d75a6d64d..dfed78e6992fa 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ParentVirtualCallCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ParentVirtualCallCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::bugprone { /// Finds calls to grand..-parent virtual methods instead of parent's. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/parent-virtual-call.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/parent-virtual-call.html class ParentVirtualCallCheck : public ClangTidyCheck { public: ParentVirtualCallCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.h b/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.h index 13f9df656c98c..6255359b558a7 100644 --- a/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/PointerArithmeticOnPolymorphicObjectCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// virtual function. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/pointer-arithmetic-on-polymorphic-object.html class PointerArithmeticOnPolymorphicObjectCheck : public ClangTidyCheck { public: PointerArithmeticOnPolymorphicObjectCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.h b/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.h index 854de520807fc..c4a1c0c94459e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/RedundantBranchConditionCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// in the outer `if` statement and were not changed. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/redundant-branch-condition.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/redundant-branch-condition.html class RedundantBranchConditionCheck : public ClangTidyCheck { public: RedundantBranchConditionCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.h b/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.h index 5b6fbff266217..89e2c86f99a8f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ReservedIdentifierCheck.h @@ -27,7 +27,7 @@ namespace clang::tidy::bugprone { /// double underscore occurring anywhere. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/reserved-identifier.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/reserved-identifier.html class ReservedIdentifierCheck final : public RenamerClangTidyCheck { const bool Invert; const std::vector<StringRef> AllowedIdentifiersRaw; diff --git a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.h b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.h index 8149bd29030b3..9496bc08e995a 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ReturnConstRefFromParameterCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// xvalues as arguments. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/return-const-ref-from-parameter.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/return-const-ref-from-parameter.html class ReturnConstRefFromParameterCheck : public ClangTidyCheck { public: ReturnConstRefFromParameterCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h index 6589b19fbe048..b5317793cbf45 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SignalHandlerCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::bugprone { /// Checker for signal handler functions.
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/signal-handler.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/signal-handler.html class SignalHandlerCheck : public ClangTidyCheck { public: enum class AsyncSafeFunctionSetKind { Minimal, POSIX }; diff --git a/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.h b/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.h index c735ac634c801..56504e5a0fd2f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SignedCharMisuseCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::bugprone { /// implicit conversion happens. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/signed-char-misuse.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/signed-char-misuse.html class SignedCharMisuseCheck : public ClangTidyCheck { public: SignedCharMisuseCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofContainerCheck.h b/clang-tools-extra/clang-tidy/bugprone/SizeofContainerCheck.h index 8fc351b8c6cb2..d87d37856ac09 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SizeofContainerCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SizeofContainerCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// user wanted to use `.size()` instead. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/sizeof-container.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/sizeof-container.html class SizeofContainerCheck : public ClangTidyCheck { public: SizeofContainerCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h index 6d7c33977db93..652a76a0415ae 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::bugprone { /// Find suspicious usages of sizeof expressions. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/sizeof-expression.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/sizeof-expression.html class SizeofExpressionCheck : public ClangTidyCheck { public: SizeofExpressionCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.h b/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.h index 23bf8056c0f61..bf24b7b211aa6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::bugprone { /// condition parameter. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/spuriously-wake-up-functions.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/spuriously-wake-up-functions.html class SpuriouslyWakeUpFunctionsCheck : public ClangTidyCheck { public: SpuriouslyWakeUpFunctionsCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.h b/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.h index 85b25d8e25abc..0ef65a3a392fa 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// as an alternative if it is an existing member function. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/standalone-empty.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/standalone-empty.html class StandaloneEmptyCheck : public ClangTidyCheck { public: StandaloneEmptyCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h index 5ab05e119abe9..0d7a203a52e12 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StringConstructorCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::bugprone { /// Finds suspicious string constructor and check their parameters. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/string-constructor.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/string-constructor.html class StringConstructorCheck : public ClangTidyCheck { public: StringConstructorCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.h b/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.h index 68783b7da53c6..67bd29738cbc3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StringIntegerAssignmentCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::bugprone { /// Finds instances where an integer is assigned to a string. /// /// For more details see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/string-integer-assignment.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/string-integer-assignment.html class StringIntegerAssignmentCheck : public ClangTidyCheck { public: StringIntegerAssignmentCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/StringLiteralWithEmbeddedNulCheck.h b/clang-tools-extra/clang-tidy/bugprone/StringLiteralWithEmbeddedNulCheck.h index 59aece123057a..272b54eb09666 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringLiteralWithEmbeddedNulCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StringLiteralWithEmbeddedNulCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::bugprone { /// Find suspicious string literals with embedded NUL characters. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/string-literal-with-embedded-nul.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/string-literal-with-embedded-nul.html class StringLiteralWithEmbeddedNulCheck : public ClangTidyCheck { public: StringLiteralWithEmbeddedNulCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.h b/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.h index 81a10101049c9..183a4f776a7f6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.h @@ -30,7 +30,7 @@ namespace clang::tidy::bugprone { /// to `nullptr`. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/stringview-nullptr.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/stringview-nullptr.html class StringviewNullptrCheck : public utils::TransformerClangTidyCheck { public: StringviewNullptrCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.h index 542bf7577f927..91095ecd5d00d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousEnumUsageCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::bugprone { /// The checker detects various cases when an enum is probably misused (as a /// bitmask). /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-enum-usage.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-enum-usage.html class SuspiciousEnumUsageCheck : public ClangTidyCheck { public: SuspiciousEnumUsageCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.h index 03f569e5a483e..3aa9491ef0e3b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousIncludeCheck.h @@ -22,7 +22,7 @@ namespace clang::tidy::bugprone { /// #include "baz.h" // no diagnostic /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-include.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-include.html class SuspiciousIncludeCheck : public ClangTidyCheck { public: SuspiciousIncludeCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.h index c36d256242e19..391aa8f4dce3d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemoryComparisonCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// arguments. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-memory-comparison.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-memory-comparison.html class SuspiciousMemoryComparisonCheck : public ClangTidyCheck { public: SuspiciousMemoryComparisonCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h index 41ef525c7f9dd..a1f5f2bfd1a3b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMemsetUsageCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::bugprone { /// Finds memset calls with potential mistakes in their arguments. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-memset-usage.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-memset-usage.html class SuspiciousMemsetUsageCheck : public ClangTidyCheck { public: SuspiciousMemsetUsageCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.h index 3a26b0a4a317e..fddd2491fa77d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousMissingCommaCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// accidentally. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-missing-comma.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-missing-comma.html class SuspiciousMissingCommaCheck : public ClangTidyCheck { public: SuspiciousMissingCommaCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.h index 2517d5f7ae319..2abcae4d94ec3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousReallocUsageCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// variable as passed to the first argument. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-realloc-usage.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-realloc-usage.html class SuspiciousReallocUsageCheck : public ClangTidyCheck { public: SuspiciousReallocUsageCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.h index 73131c7f9f12a..10bdf328df0ff 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// unintendedly. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-semicolon.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-semicolon.html class SuspiciousSemicolonCheck : public ClangTidyCheck { public: SuspiciousSemicolonCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.h index 6f01b1ad087bd..9c29f5907a28b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringCompareCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::bugprone { /// Find suspicious calls to string compare functions. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-string-compare.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-string-compare.html class SuspiciousStringCompareCheck : public ClangTidyCheck { public: SuspiciousStringCompareCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.h b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.h index 57cb164af8565..eae8aad5b7d3e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousStringviewDataUsageCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// termination. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-stringview-data-usage.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/suspicious-stringview-data-usage.html class SuspiciousStringviewDataUsageCheck : public ClangTidyCheck { public: SuspiciousStringviewDataUsageCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/SwitchMissingDefaultCaseCheck.h b/clang-tools-extra/clang-tidy/bugprone/SwitchMissingDefaultCaseCheck.h index f5237775650ea..fccf14b194229 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SwitchMissingDefaultCaseCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/SwitchMissingDefaultCaseCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::bugprone { /// warnings. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/switch-missing-default-case.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/switch-missing-default-case.html class SwitchMissingDefaultCaseCheck : public ClangTidyCheck { public: SwitchMissingDefaultCaseCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.h b/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.h index 0c337df405061..01424ef8e57f9 100644 --- a/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/TaggedUnionMemberCountCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// different from the number of data members inside the union. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/tagged-union-member-count.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/tagged-union-member-count.html class TaggedUnionMemberCountCheck : public ClangTidyCheck { public: TaggedUnionMemberCountCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.h b/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.h index 79a794de3819a..c37fdc63e5d70 100644 --- a/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// a condition which always evaluates to false). /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/terminating-continue.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/terminating-continue.html class TerminatingContinueCheck : public ClangTidyCheck { public: TerminatingContinueCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.cpp index 89eafb15f2652..9781f0a5ac9de 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.cpp @@ -17,8 +17,11 @@ namespace clang::tidy::bugprone { void ThrowKeywordMissingCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher( cxxConstructExpr( - hasType(cxxRecordDecl( - isSameOrDerivedFrom(matchesName("[Ee]xception|EXCEPTION")))), + hasType(cxxRecordDecl(anyOf( + matchesName("[Ee]xception|EXCEPTION"), + hasAnyBase(hasType(hasCanonicalType(recordType(hasDeclaration( + cxxRecordDecl(matchesName("[Ee]xception|EXCEPTION")) + .bind("base"))))))))), unless(anyOf( hasAncestor( stmt(anyOf(cxxThrowExpr(), callExpr(), returnStmt()))), @@ -37,6 +40,11 @@ void ThrowKeywordMissingCheck::check(const MatchFinder::MatchResult &Result) { diag(TemporaryExpr->getBeginLoc(), "suspicious exception object created but " "not thrown; did you mean 'throw %0'?") << TemporaryExpr->getType().getBaseTypeIdentifier()->getName(); + + if (const auto *BaseDecl = Result.Nodes.getNodeAs<CXXRecordDecl>("base")) + diag(BaseDecl->getLocation(), + "object type inherits from base class declared here", + DiagnosticIDs::Note); } } // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.h b/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.h index ee1e7d20d39e0..7a8165503a4c3 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// class that has 'EXCEPTION', 'Exception' or 'exception' in its name.
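A minimal sketch (hypothetical types, not part of this patch) of the pattern bugprone-throw-keyword-missing diagnoses, and of the new note that the bound "base" node above makes possible, pointing at the exception-named base class:

// Sketch only -- assumes the matcher change above; type names are invented.
struct ParseException {};              // name matches "[Ee]xception|EXCEPTION"
struct BadHeader : ParseException {};  // matched through hasAnyBase(...), bound as "base"

void readHeader(bool Ok) {
  if (!Ok)
    BadHeader();  // warning: suspicious exception object created but not thrown;
                  //          did you mean 'throw BadHeader'?
                  // note: object type inherits from base class declared here
}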
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/throw-keyword-missing.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/throw-keyword-missing.html class ThrowKeywordMissingCheck : public ClangTidyCheck { public: ThrowKeywordMissingCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/ThrowingStaticInitializationCheck.h b/clang-tools-extra/clang-tidy/bugprone/ThrowingStaticInitializationCheck.h index 0a6471e359061..4ec554fc78fc6 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ThrowingStaticInitializationCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/ThrowingStaticInitializationCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// throw. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/throwing-static-initialization.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/throwing-static-initialization.html class ThrowingStaticInitializationCheck : public ClangTidyCheck { public: ThrowingStaticInitializationCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.h b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.h index e2c1bb7b002e3..1546533ed88b8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/TooSmallLoopVariableCheck.h @@ -24,7 +24,7 @@ namespace clang::tidy::bugprone { /// \endcode /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/too-small-loop-variable.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/too-small-loop-variable.html class TooSmallLoopVariableCheck : public ClangTidyCheck { public: TooSmallLoopVariableCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h b/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h index 3c0f261126823..11086fb4bfda1 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UncheckedOptionalAccessCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::bugprone { /// value. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/unchecked-optional-access.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/unchecked-optional-access.html class UncheckedOptionalAccessCheck : public ClangTidyCheck { public: UncheckedOptionalAccessCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.h b/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.h index 365b409f8311c..5053a124343b2 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UncheckedStringToNumberConversionCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// reasonable error handling for conversion errors. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/unchecked-string-to-number-conversion.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/unchecked-string-to-number-conversion.html class UncheckedStringToNumberConversionCheck : public ClangTidyCheck { public: UncheckedStringToNumberConversionCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h b/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h index fd067c48a16e0..c9a232a1b177d 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UndefinedMemoryManipulationCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// behavior. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/undefined-memory-manipulation.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/undefined-memory-manipulation.html class UndefinedMemoryManipulationCheck : public ClangTidyCheck { public: UndefinedMemoryManipulationCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.h index 0724b4ac6d3e9..ed51191bf9d58 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// failure. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/unhandled-exception-at-new.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/unhandled-exception-at-new.html class UnhandledExceptionAtNewCheck : public ClangTidyCheck { public: UnhandledExceptionAtNewCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.h index 61d33028aadc8..d8f9057174d85 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnhandledSelfAssignmentCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// using the copy-and-swap or the copy-and-move method. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/unhandled-self-assignment.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/unhandled-self-assignment.html class UnhandledSelfAssignmentCheck : public ClangTidyCheck { public: UnhandledSelfAssignmentCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.h index af53dc6158696..88db2fb0dfcf7 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnintendedCharOstreamOutputCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::bugprone { /// an ostream. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/unintended-char-ostream-output.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/unintended-char-ostream-output.html class UnintendedCharOstreamOutputCheck : public ClangTidyCheck { public: UnintendedCharOstreamOutputCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h index 6495bd34f6c58..99bb91a173187 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::bugprone { /// but is not exclusive to, the functions from the /// Annex K. "Bounds-checking interfaces" of C11. /// -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/unsafe-functions.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/unsafe-functions.html class UnsafeFunctionsCheck : public ClangTidyCheck { public: UnsafeFunctionsCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h index 92eaf290f2073..5de5a76755539 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedLocalNonTrivialVariableCheck.h @@ -24,7 +24,7 @@ namespace clang::tidy::bugprone { /// 'IncludeTypes' matches. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/unused-local-non-trivial-variable.html class UnusedLocalNonTrivialVariableCheck : public ClangTidyCheck { public: UnusedLocalNonTrivialVariableCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.h index 376f664f74548..10012aff487cc 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::bugprone { /// Finds temporaries that look like RAII objects. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/unused-raii.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/unused-raii.html class UnusedRaiiCheck : public ClangTidyCheck { public: UnusedRaiiCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.h b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.h index f81603cadbe80..fd1b3d6aa9974 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UnusedReturnValueCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// Detects function calls where the return value is unused. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/unused-return-value.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/unused-return-value.html class UnusedReturnValueCheck : public ClangTidyCheck { public: UnusedReturnValueCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.h b/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.h index ac85c80ee0b5b..d38b29e09fa8b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::bugprone { /// intervening reinitialization. /// /// For details, see the user-facing documentation: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/use-after-move.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/use-after-move.html class UseAfterMoveCheck : public ClangTidyCheck { public: UseAfterMoveCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h index b852dffa7c6ea..71d302f49ff95 100644 --- a/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/VirtualNearMissCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::bugprone { /// very similar name and an identical signature defined in a base class. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/virtual-near-miss.html +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/virtual-near-miss.html class VirtualNearMissCheck : public ClangTidyCheck { public: VirtualNearMissCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp index 9ba62219afee9..c1ca2cec7a1eb 100644 --- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp @@ -10,6 +10,7 @@ #include "../ClangTidyModule.h" #include "../ClangTidyModuleRegistry.h" #include "../bugprone/BadSignalToKillThreadCheck.h" +#include "../bugprone/CommandProcessorCheck.h" #include "../bugprone/PointerArithmeticOnPolymorphicObjectCheck.h" #include "../bugprone/ReservedIdentifierCheck.h" #include "../bugprone/SignalHandlerCheck.h" @@ -33,7 +34,6 @@ #include "../performance/MoveConstructorInitCheck.h" #include "../readability/EnumInitialValueCheck.h" #include "../readability/UppercaseLiteralSuffixCheck.h" -#include "CommandProcessorCheck.h" #include "DefaultOperatorNewAlignmentCheck.h" #include "DontModifyStdNamespaceCheck.h" #include "FloatLoopCounter.h" @@ -296,7 +296,8 @@ class CERTModule : public ClangTidyModule { CheckFactories.registerCheck<bugprone::ReservedIdentifierCheck>( "cert-dcl37-c"); // ENV - CheckFactories.registerCheck<CommandProcessorCheck>("cert-env33-c"); + CheckFactories.registerCheck<bugprone::CommandProcessorCheck>( + "cert-env33-c"); // ERR CheckFactories.registerCheck<bugprone::UnusedReturnValueCheck>( "cert-err33-c"); diff --git a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt index 4933763f03fb5..453d1d30921e9 100644 --- a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt @@ -5,7 +5,6 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangTidyCERTModule STATIC CERTTidyModule.cpp - CommandProcessorCheck.cpp DefaultOperatorNewAlignmentCheck.cpp DontModifyStdNamespaceCheck.cpp
FloatLoopCounter.cpp diff --git a/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.h b/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.h index f8cb4d6e32d69..8f9d0e470a755 100644 --- a/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.h +++ b/clang-tools-extra/clang-tidy/cert/DefaultOperatorNewAlignmentCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cert { /// the default operator new. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cert/mem57-cpp.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cert/mem57-cpp.html class DefaultOperatorNewAlignmentCheck : public ClangTidyCheck { public: DefaultOperatorNewAlignmentCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h b/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h index e9207385f0d20..d00c036f00f24 100644 --- a/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h +++ b/clang-tools-extra/clang-tidy/cert/FloatLoopCounter.h @@ -18,7 +18,7 @@ namespace clang::tidy::cert { /// https://www.securecoding.cert.org/confluence/display/c/FLP30-C.+Do+not+use+floating-point+variables+as+loop+counters /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cert/flp30-c.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cert/flp30-c.html class FloatLoopCounter : public ClangTidyCheck { public: FloatLoopCounter(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h b/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h index b024b9008d876..a9d607665adb3 100644 --- a/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h +++ b/clang-tools-extra/clang-tidy/cert/LimitedRandomnessCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::cert { /// This check warns for the usage of std::rand() function. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cert/msc50-cpp.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cert/msc50-cpp.html class LimitedRandomnessCheck : public ClangTidyCheck { public: LimitedRandomnessCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.h b/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.h index ecb3d164b5272..c211fa004120c 100644 --- a/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.h +++ b/clang-tools-extra/clang-tidy/cert/MutatingCopyCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cert { /// in copy constructors and copy assignment operators. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cert/oop58-cpp.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cert/oop58-cpp.html class MutatingCopyCheck : public ClangTidyCheck { public: MutatingCopyCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h b/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h index 221bdca0baae7..4589ce444c878 100644 --- a/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h +++ b/clang-tools-extra/clang-tidy/cert/NonTrivialTypesLibcMemoryCallsCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cert { /// 'memcmp' and similar derivatives on non-trivial types. 
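Registered through bugprone::CommandProcessorCheck as shown above, cert-env33-c keeps its existing behaviour; a minimal sketch (hypothetical snippet, not part of this patch) of the call it warns about:

// Sketch only -- cert-env33-c flags calls that start a command processor.
#include <cstdlib>

void cleanupScratch() {
  std::system("rm -rf /tmp/scratch");  // warning: calling 'system' uses a command processor
}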
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cert/oop57-cpp.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cert/oop57-cpp.html class NonTrivialTypesLibcMemoryCallsCheck : public ClangTidyCheck { public: NonTrivialTypesLibcMemoryCallsCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h b/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h index d34b8e702f670..ea30127e25e08 100644 --- a/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h +++ b/clang-tools-extra/clang-tidy/cert/ProperlySeededRandomGeneratorCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::cert { /// constant expression is a security vulnerability. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cert/msc51-cpp.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cert/msc51-cpp.html class ProperlySeededRandomGeneratorCheck : public ClangTidyCheck { public: ProperlySeededRandomGeneratorCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h b/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h index 9b97feb7fe5f5..41a5145209686 100644 --- a/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h +++ b/clang-tools-extra/clang-tidy/cert/ThrownExceptionTypeCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::cert { /// Checks whether a thrown object is nothrow copy constructible. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cert/err60-cpp.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cert/err60-cpp.html class ThrownExceptionTypeCheck : public ClangTidyCheck { public: ThrownExceptionTypeCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/concurrency/MtUnsafeCheck.h b/clang-tools-extra/clang-tidy/concurrency/MtUnsafeCheck.h index c5c707778bc32..12a4d4b6ec824 100644 --- a/clang-tools-extra/clang-tidy/concurrency/MtUnsafeCheck.h +++ b/clang-tools-extra/clang-tidy/concurrency/MtUnsafeCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::concurrency { /// Checks that non-thread-safe functions are not used. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/concurrency/mt-unsafe.html +/// https://clang.llvm.org/extra/clang-tidy/checks/concurrency/mt-unsafe.html class MtUnsafeCheck : public ClangTidyCheck { public: MtUnsafeCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/concurrency/ThreadCanceltypeAsynchronousCheck.h b/clang-tools-extra/clang-tidy/concurrency/ThreadCanceltypeAsynchronousCheck.h index 2d5d82dfd9285..5711266f8d309 100644 --- a/clang-tools-extra/clang-tidy/concurrency/ThreadCanceltypeAsynchronousCheck.h +++ b/clang-tools-extra/clang-tidy/concurrency/ThreadCanceltypeAsynchronousCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::concurrency { /// Finds ``pthread_setcanceltype`` function calls where a thread's /// cancellation type is set to asynchronous. 
/// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/concurrency/thread-canceltype-asynchronous.html +/// https://clang.llvm.org/extra/clang-tidy/checks/concurrency/thread-canceltype-asynchronous.html class ThreadCanceltypeAsynchronousCheck : public ClangTidyCheck { public: ThreadCanceltypeAsynchronousCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidConstOrRefDataMembersCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidConstOrRefDataMembersCheck.h index 9d458fe9a4d00..c228b2ba9e3fe 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidConstOrRefDataMembersCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidConstOrRefDataMembersCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cppcoreguidelines { /// they make the class non-copy-assignable. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/avoid-const-or-ref-data-members.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/avoid-const-or-ref-data-members.html class AvoidConstOrRefDataMembersCheck : public ClangTidyCheck { public: AvoidConstOrRefDataMembersCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidDoWhileCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidDoWhileCheck.h index 0756d0860f961..06ec105ad07b6 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidDoWhileCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidDoWhileCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cppcoreguidelines { /// subtle bugs. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/avoid-do-while.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/avoid-do-while.html class AvoidDoWhileCheck : public ClangTidyCheck { public: AvoidDoWhileCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.h index 2b13df795d87c..929fceeb68a30 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidGotoCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cppcoreguidelines { /// with looping constructs. Only forward jumps in nested loops are accepted. // /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/avoid-goto.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/avoid-goto.html class AvoidGotoCheck : public ClangTidyCheck { public: AvoidGotoCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h index 9c40fa3e9d341..5e7c968b12f97 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidNonConstGlobalVariablesCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cppcoreguidelines { /// subject to unpredictable changes. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/avoid-non-const-global-variables.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/avoid-non-const-global-variables.html class AvoidNonConstGlobalVariablesCheck : public ClangTidyCheck { public: AvoidNonConstGlobalVariablesCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidReferenceCoroutineParametersCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidReferenceCoroutineParametersCheck.h index 3469ea7a8efee..78ed547787b9a 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidReferenceCoroutineParametersCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/AvoidReferenceCoroutineParametersCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::cppcoreguidelines { /// longer be valid. This implements CppCoreGuideline CP.53. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/avoid-reference-coroutine-parameters.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/avoid-reference-coroutine-parameters.html class AvoidReferenceCoroutineParametersCheck : public ClangTidyCheck { public: AvoidReferenceCoroutineParametersCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h index a1476494b4046..08a4a9d7c7851 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::cppcoreguidelines { /// Find uninitialized local variables. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/init-variables.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/init-variables.html class InitVariablesCheck : public ClangTidyCheck { public: InitVariablesCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h index 2141fc2423bdf..780b4b39254a7 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/InterfacesGlobalInitCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::cppcoreguidelines { /// Flags possible initialization order issues of static variables. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/interfaces-global-init.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/interfaces-global-init.html class InterfacesGlobalInitCheck : public ClangTidyCheck { public: InterfacesGlobalInitCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.h index dd553ba613f1e..d310a0c5cb0b9 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MacroUsageCheck.h @@ -21,7 +21,7 @@ namespace tidy::cppcoreguidelines { /// constructs exist for the task. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/macro-usage.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/macro-usage.html class MacroUsageCheck : public ClangTidyCheck { public: MacroUsageCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.h index 87187b3b70bcb..dcb05023768da 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MisleadingCaptureDefaultByValueCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::cppcoreguidelines { /// whether data members are captured by value or reference. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/misleading-capture-default-by-value.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/misleading-capture-default-by-value.html class MisleadingCaptureDefaultByValueCheck : public ClangTidyCheck { public: MisleadingCaptureDefaultByValueCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.h index 247291076d939..2dd13e2124d74 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/MissingStdForwardCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::cppcoreguidelines { /// This check implement CppCoreGuideline F.19. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/missing-std-forward.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/missing-std-forward.html class MissingStdForwardCheck : public ClangTidyCheck { public: MissingStdForwardCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h index 4e664197b5f72..e4dece6a54c90 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoMallocCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::cppcoreguidelines { /// Static Analyzer - unix.Malloc. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/no-malloc.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/no-malloc.html class NoMallocCheck : public ClangTidyCheck { public: /// Construct Checker and read in configuration for function names. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.h index 877a5173e7f10..7670f59e18c9e 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/NoSuspendWithLockCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cppcoreguidelines { /// This check implements CppCoreGuideline CP.52. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/no-suspend-with-lock.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/no-suspend-with-lock.html class NoSuspendWithLockCheck : public ClangTidyCheck { public: NoSuspendWithLockCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h index e191f09943710..462e9864a3f5d 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/OwningMemoryCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cppcoreguidelines { /// nature of it whenever possible. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/owning-memory.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/owning-memory.html class OwningMemoryCheck : public ClangTidyCheck { public: OwningMemoryCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.h index 6275aa61ba03d..d9acda1ca6fc0 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cppcoreguidelines { /// into the initialization list instead. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/prefer-member-initializer.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/prefer-member-initializer.html class PreferMemberInitializerCheck : public ClangTidyCheck { public: PreferMemberInitializerCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h index abd4e5a77009d..cea4bfacd6644 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsArrayToPointerDecayCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::cppcoreguidelines { /// This check flags all array to pointer decays /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-bounds-array-to-pointer-decay.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-bounds-array-to-pointer-decay.html class ProBoundsArrayToPointerDecayCheck : public ClangTidyCheck { public: ProBoundsArrayToPointerDecayCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.h index 2a89be4724037..0755da7ce4409 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsAvoidUncheckedContainerAccess.h @@ -19,7 +19,7 @@ namespace clang::tidy::cppcoreguidelines { /// See /// https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines#slcon3-avoid-bounds-errors /// For the user-facing 
documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-bounds-avoid-unchecked-container-access.html class ProBoundsAvoidUncheckedContainerAccess : public ClangTidyCheck { public: ProBoundsAvoidUncheckedContainerAccess(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h index 19d4ef8e25121..73f185529e1eb 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsConstantArrayIndexCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::cppcoreguidelines { /// have a constant index and are within bounds /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-bounds-constant-array-index.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-bounds-constant-array-index.html class ProBoundsConstantArrayIndexCheck : public ClangTidyCheck { const StringRef GslHeader; utils::IncludeInserter Inserter; diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h index 2bd113b38c4d4..45b798527ed4e 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProBoundsPointerArithmeticCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::cppcoreguidelines { /// arrays) is flagged. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-bounds-pointer-arithmetic.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-bounds-pointer-arithmetic.html class ProBoundsPointerArithmeticCheck : public ClangTidyCheck { public: ProBoundsPointerArithmeticCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h index e05adc966a496..0b8cfc830854f 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::cppcoreguidelines { /// Imposes limitations on the use of const_cast within C++ code. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-type-const-cast.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-type-const-cast.html class ProTypeConstCastCheck : public ClangTidyCheck { public: ProTypeConstCastCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h index e6819c40a2bfc..f8e1d5a893da0 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeCstyleCastCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cppcoreguidelines { /// downcast, const_cast, or reinterpret_cast. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-type-cstyle-cast.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-type-cstyle-cast.html class ProTypeCstyleCastCheck : public ClangTidyCheck { public: ProTypeCstyleCastCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h index 58125303fb59b..8beaab394f04a 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h @@ -25,7 +25,7 @@ namespace clang::tidy::cppcoreguidelines { /// will result in false positives. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-type-member-init.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-type-member-init.html /// TODO: See if 'fixes' for false positives are optimized away by the compiler. /// TODO: For classes with multiple constructors, make sure that we don't offer /// multiple in-class initializer fixits for the same member. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h index 63b04261ea436..4948d0ac2d785 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeReinterpretCastCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::cppcoreguidelines { /// Flags all occurrences of reinterpret_cast /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-type-reinterpret-cast.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-type-reinterpret-cast.html class ProTypeReinterpretCastCheck : public ClangTidyCheck { public: ProTypeReinterpretCastCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h index 266441fd9144f..3d01fb9e52809 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeStaticCastDowncastCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cppcoreguidelines { /// derived class. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-type-static-cast-downcast.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-type-static-cast-downcast.html class ProTypeStaticCastDowncastCheck : public ClangTidyCheck { public: ProTypeStaticCastDowncastCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h index 5127e652b6466..fe82ce9630589 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeUnionAccessCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cppcoreguidelines { /// Access to a union as a whole (e.g. passing to a function) is not flagged. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-type-union-access.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-type-union-access.html class ProTypeUnionAccessCheck : public ClangTidyCheck { public: ProTypeUnionAccessCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h index f3b20e6e793e5..b28d3657703ba 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeVarargCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cppcoreguidelines { /// of va_arg. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-type-vararg.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-type-vararg.html class ProTypeVarargCheck : public ClangTidyCheck { public: ProTypeVarargCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.h index 739e1d706acc3..9fec58fb86036 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/RvalueReferenceParamNotMovedCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cppcoreguidelines { /// the function body. This check implements CppCoreGuideline F.18. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/rvalue-reference-param-not-moved.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/rvalue-reference-param-not-moved.html class RvalueReferenceParamNotMovedCheck : public ClangTidyCheck { public: RvalueReferenceParamNotMovedCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h index 6d89a8a622a61..61990e6b493db 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SlicingCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cppcoreguidelines { /// or vtable. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/slicing.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/slicing.html class SlicingCheck : public ClangTidyCheck { public: SlicingCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h index ffd072a7f6a98..8cdaf315eac52 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/SpecialMemberFunctionsCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::cppcoreguidelines { /// are defined. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/special-member-functions.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/special-member-functions.html class SpecialMemberFunctionsCheck : public ClangTidyCheck { public: SpecialMemberFunctionsCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.h index c699f9116a120..c269affc563b3 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/UseEnumClassCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::cppcoreguidelines { /// instead. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/use-enum-class.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/use-enum-class.html class UseEnumClassCheck : public ClangTidyCheck { public: UseEnumClassCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.h index 2c9d92ddeb4a7..3cfe8b2960488 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::cppcoreguidelines { /// nor protected and non-virtual. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/virtual-class-destructor.html +/// https://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/virtual-class-destructor.html class VirtualClassDestructorCheck : public ClangTidyCheck { public: VirtualClassDestructorCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/darwin/AvoidSpinlockCheck.h b/clang-tools-extra/clang-tidy/darwin/AvoidSpinlockCheck.h index 78cc968ba4efd..422292b9d9eab 100644 --- a/clang-tools-extra/clang-tidy/darwin/AvoidSpinlockCheck.h +++ b/clang-tools-extra/clang-tidy/darwin/AvoidSpinlockCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::darwin { /// problems. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/darwin/avoid-spinlock.html +/// https://clang.llvm.org/extra/clang-tidy/checks/darwin/avoid-spinlock.html class AvoidSpinlockCheck : public ClangTidyCheck { public: AvoidSpinlockCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/darwin/DispatchOnceNonstaticCheck.h b/clang-tools-extra/clang-tidy/darwin/DispatchOnceNonstaticCheck.h index 484b4f93e75f5..bc22ef8125f7c 100644 --- a/clang-tools-extra/clang-tidy/darwin/DispatchOnceNonstaticCheck.h +++ b/clang-tools-extra/clang-tidy/darwin/DispatchOnceNonstaticCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::darwin { /// storage duration, as required by the libdispatch documentation. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/darwin/dispatch-once-nonstatic.html +/// https://clang.llvm.org/extra/clang-tidy/checks/darwin/dispatch-once-nonstatic.html class DispatchOnceNonstaticCheck : public ClangTidyCheck { public: DispatchOnceNonstaticCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h index 9ba311c04e679..51bb15325c955 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsCallsCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::fuchsia { /// Default arguments are not allowed in called functions. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/fuchsia/default-arguments-calls.html +/// https://clang.llvm.org/extra/clang-tidy/checks/fuchsia/default-arguments-calls.html class DefaultArgumentsCallsCheck : public ClangTidyCheck { public: DefaultArgumentsCallsCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h index b5a19c3b7c22e..1b0e3dd0a16f5 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/DefaultArgumentsDeclarationsCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::fuchsia { /// Default parameters are not allowed in declared functions. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/fuchsia/default-arguments-declarations.html +/// https://clang.llvm.org/extra/clang-tidy/checks/fuchsia/default-arguments-declarations.html class DefaultArgumentsDeclarationsCheck : public ClangTidyCheck { public: DefaultArgumentsDeclarationsCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h b/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h index 838987d20014f..66be18267ab8a 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/MultipleInheritanceCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::fuchsia { /// Multiple implementation inheritance is discouraged. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/fuchsia/multiple-inheritance.html +/// https://clang.llvm.org/extra/clang-tidy/checks/fuchsia/multiple-inheritance.html class MultipleInheritanceCheck : public ClangTidyCheck { public: MultipleInheritanceCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h b/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h index b974c6d7a4473..d91ecf8e468d2 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/OverloadedOperatorCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::fuchsia { /// Overloading operators is disallowed by the Fuchsia coding standard. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/fuchsia/overloaded-operator.html +/// https://clang.llvm.org/extra/clang-tidy/checks/fuchsia/overloaded-operator.html class OverloadedOperatorCheck : public ClangTidyCheck { public: OverloadedOperatorCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h b/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h index 6b4ef681ee188..42d643e62f28b 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/StaticallyConstructedObjectsCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::fuchsia { /// constructor or has no explicit constructor. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/fuchsia/statically-constructed-objects.html +/// https://clang.llvm.org/extra/clang-tidy/checks/fuchsia/statically-constructed-objects.html class StaticallyConstructedObjectsCheck : public ClangTidyCheck { public: StaticallyConstructedObjectsCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h b/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h index db6bc33ca0e06..c644e875b3a38 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/TrailingReturnCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::fuchsia { /// return types. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/fuchsia/trailing-return.html +/// https://clang.llvm.org/extra/clang-tidy/checks/fuchsia/trailing-return.html class TrailingReturnCheck : public ClangTidyCheck { public: TrailingReturnCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h b/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h index 8a3182dd57df7..45c6019f3abe4 100644 --- a/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h +++ b/clang-tools-extra/clang-tidy/fuchsia/VirtualInheritanceCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::fuchsia { /// Defining classes with virtual inheritance is disallowed. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/fuchsia/virtual-inheritance.html +/// https://clang.llvm.org/extra/clang-tidy/checks/fuchsia/virtual-inheritance.html class VirtualInheritanceCheck : public ClangTidyCheck { public: VirtualInheritanceCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.h b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.h index dbd2034418762..a305bd524aefb 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.h +++ b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.h @@ -24,7 +24,7 @@ namespace clang::tidy::google::readability { /// ones generated by `-Wold-style-cast`. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/google/readability-casting.html +/// https://clang.llvm.org/extra/clang-tidy/checks/google/readability-casting.html class AvoidCStyleCastsCheck : public ClangTidyCheck { public: AvoidCStyleCastsCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.h b/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.h index fda0d5906a5cd..0b726222506a9 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.h +++ b/clang-tools-extra/clang-tidy/google/AvoidNSObjectNewCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::google::objc { /// style guide. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/google/objc-avoid-nsobject-new.html +/// https://clang.llvm.org/extra/clang-tidy/checks/google/objc-avoid-nsobject-new.html class AvoidNSObjectNewCheck : public ClangTidyCheck { public: AvoidNSObjectNewCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h b/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h index d32c02b9cfb4b..26a0465bc197f 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h +++ b/clang-tools-extra/clang-tidy/google/AvoidThrowingObjCExceptionCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::google::objc { /// the Google Objective-C Style Guide. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/google/objc-avoid-throwing-exception.html +/// https://clang.llvm.org/extra/clang-tidy/checks/google/objc-avoid-throwing-exception.html class AvoidThrowingObjCExceptionCheck : public ClangTidyCheck { public: AvoidThrowingObjCExceptionCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.h b/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.h index c2e39d3a7026d..921bec5438daa 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.h +++ b/clang-tools-extra/clang-tidy/google/AvoidUnderscoreInGoogletestNameCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::google::readability { // https://google.github.io/googletest/faq.html#why-should-test-suite-names-and-test-names-not-contain-underscore /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/google/readability-avoid-underscore-in-googletest-name.html +/// https://clang.llvm.org/extra/clang-tidy/checks/google/readability-avoid-underscore-in-googletest-name.html class AvoidUnderscoreInGoogletestNameCheck : public ClangTidyCheck { public: using ClangTidyCheck::ClangTidyCheck; diff --git a/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h b/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h index c0e539598e00e..1d1e4e31f0c6c 100644 --- a/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h +++ b/clang-tools-extra/clang-tidy/google/DefaultArgumentsCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::google { /// See https://google.github.io/styleguide/cppguide.html#Default_Arguments /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/google/default-arguments.html +/// https://clang.llvm.org/extra/clang-tidy/checks/google/default-arguments.html class DefaultArgumentsCheck : public ClangTidyCheck { public: 
DefaultArgumentsCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.h b/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.h index 4ed3671fd3951..0954a83223b7c 100644 --- a/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.h +++ b/clang-tools-extra/clang-tidy/google/ExplicitConstructorCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::google { /// See https://google.github.io/styleguide/cppguide.html#Explicit_Constructors /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/google/explicit-constructor.html +/// https://clang.llvm.org/extra/clang-tidy/checks/google/explicit-constructor.html class ExplicitConstructorCheck : public ClangTidyCheck { public: ExplicitConstructorCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/google/ExplicitMakePairCheck.h b/clang-tools-extra/clang-tidy/google/ExplicitMakePairCheck.h index 49d5172f932d2..a02614fce9bac 100644 --- a/clang-tools-extra/clang-tidy/google/ExplicitMakePairCheck.h +++ b/clang-tools-extra/clang-tidy/google/ExplicitMakePairCheck.h @@ -21,7 +21,7 @@ namespace clang::tidy::google::build { /// Corresponding cpplint.py check name: 'build/explicit_make_pair'. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/google/build-explicit-make-pair.html +/// https://clang.llvm.org/extra/clang-tidy/checks/google/build-explicit-make-pair.html class ExplicitMakePairCheck : public ClangTidyCheck { public: ExplicitMakePairCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/google/FloatTypesCheck.h b/clang-tools-extra/clang-tidy/google/FloatTypesCheck.h index b5534c046e68f..5efba5be5aa00 100644 --- a/clang-tools-extra/clang-tidy/google/FloatTypesCheck.h +++ b/clang-tools-extra/clang-tidy/google/FloatTypesCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::google::runtime { /// of portability. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/google/runtime-float.html +/// https://clang.llvm.org/extra/clang-tidy/checks/google/runtime-float.html class RuntimeFloatCheck : public ClangTidyCheck { public: RuntimeFloatCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h index 1f4fe92d542a8..e4efadfd217a6 100644 --- a/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h +++ b/clang-tools-extra/clang-tidy/google/FunctionNamingCheck.h @@ -22,7 +22,7 @@ namespace clang::tidy::google::objc { /// method or property declarations. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/google/objc-function-naming.html +/// https://clang.llvm.org/extra/clang-tidy/checks/google/objc-function-naming.html class FunctionNamingCheck : public ClangTidyCheck { public: FunctionNamingCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/google/GlobalNamesInHeadersCheck.h b/clang-tools-extra/clang-tidy/google/GlobalNamesInHeadersCheck.h index 4cc36630d3851..750b9f4f340f0 100644 --- a/clang-tools-extra/clang-tidy/google/GlobalNamesInHeadersCheck.h +++ b/clang-tools-extra/clang-tidy/google/GlobalNamesInHeadersCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::google::readability { /// Right now it only triggers on using declarations and directives. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/google/global-names-in-headers.html +/// https://clang.llvm.org/extra/clang-tidy/checks/google/global-names-in-headers.html class GlobalNamesInHeadersCheck : public ClangTidyCheck { public: GlobalNamesInHeadersCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h index c6c32c3ff0884..9b55855b1fc86 100644 --- a/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/google/GlobalVariableDeclarationCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::google::objc { /// 'g[A-Z].*' (variables). /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/google/objc-global-variable-declaration.html +/// https://clang.llvm.org/extra/clang-tidy/checks/google/objc-global-variable-declaration.html class GlobalVariableDeclarationCheck : public ClangTidyCheck { public: GlobalVariableDeclarationCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp index 047c7f99ae299..52777fa5c4fd6 100644 --- a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp @@ -87,7 +87,7 @@ void IntegerTypesCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { void IntegerTypesCheck::registerMatchers(MatchFinder *Finder) { // Match any integer types, unless they are passed to a printf-based API: // - // http://google.github.io/styleguide/cppguide.html#64-bit_Portability + // https://google.github.io/styleguide/cppguide.html#64-bit_Portability // "Where possible, avoid passing arguments of types specified by // bitwidth typedefs to printf-based APIs." Finder->addMatcher( diff --git a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.h b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.h index be4989851f20a..a87c323211ec4 100644 --- a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.h +++ b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.h @@ -25,7 +25,7 @@ namespace tidy::google::runtime { /// Corresponding cpplint.py check: 'runtime/int'. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/google/runtime-int.html +/// https://clang.llvm.org/extra/clang-tidy/checks/google/runtime-int.html class IntegerTypesCheck : public ClangTidyCheck { public: IntegerTypesCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/google/OverloadedUnaryAndCheck.h b/clang-tools-extra/clang-tidy/google/OverloadedUnaryAndCheck.h index 126f0fbc61b87..fc4d5201245cb 100644 --- a/clang-tools-extra/clang-tidy/google/OverloadedUnaryAndCheck.h +++ b/clang-tools-extra/clang-tidy/google/OverloadedUnaryAndCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::google::runtime { /// Corresponding cpplint.py check name: 'runtime/operator'. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/google/runtime-operator.html +/// https://clang.llvm.org/extra/clang-tidy/checks/google/runtime-operator.html class OverloadedUnaryAndCheck : public ClangTidyCheck { public: OverloadedUnaryAndCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/google/TodoCommentCheck.h b/clang-tools-extra/clang-tidy/google/TodoCommentCheck.h index d56036095bab9..05f9cc6618eb1 100644 --- a/clang-tools-extra/clang-tidy/google/TodoCommentCheck.h +++ b/clang-tools-extra/clang-tidy/google/TodoCommentCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::google::readability { /// Corresponding cpplint.py check: 'readability/todo' /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/google/readability-todo.html +/// https://clang.llvm.org/extra/clang-tidy/checks/google/readability-todo.html class TodoCommentCheck : public ClangTidyCheck { public: TodoCommentCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.h b/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.h index 84f8ae56f2635..78e8127a5a342 100644 --- a/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.h +++ b/clang-tools-extra/clang-tidy/google/UnnamedNamespaceInHeaderCheck.h @@ -21,7 +21,7 @@ namespace clang::tidy::google::build { /// Corresponding cpplint.py check name: 'build/namespaces'. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/google/build-namespaces.html +/// https://clang.llvm.org/extra/clang-tidy/checks/google/build-namespaces.html class UnnamedNamespaceInHeaderCheck : public ClangTidyCheck { public: UnnamedNamespaceInHeaderCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h index bc21249663af8..12fe7f7eb340d 100644 --- a/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/ExceptionBaseclassCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::hicpp { /// std::exception. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/hicpp/exception-baseclass.html +/// https://clang.llvm.org/extra/clang-tidy/checks/hicpp/exception-baseclass.html class ExceptionBaseclassCheck : public ClangTidyCheck { public: ExceptionBaseclassCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/hicpp/IgnoredRemoveResultCheck.h b/clang-tools-extra/clang-tidy/hicpp/IgnoredRemoveResultCheck.h index 8cf58d5a6978a..07e624e446403 100644 --- a/clang-tools-extra/clang-tidy/hicpp/IgnoredRemoveResultCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/IgnoredRemoveResultCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::hicpp { /// are not ignored according to rule 17.5.1. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/hicpp/ignored-remove-result.html +/// https://clang.llvm.org/extra/clang-tidy/checks/hicpp/ignored-remove-result.html class IgnoredRemoveResultCheck : public bugprone::UnusedReturnValueCheck { public: IgnoredRemoveResultCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h index 2507f6cde338e..902be2d9d324d 100644 --- a/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/MultiwayPathsCoveredCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::hicpp { /// without a final 'else'-branch. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/hicpp/multiway-paths-covered.html +/// https://clang.llvm.org/extra/clang-tidy/checks/hicpp/multiway-paths-covered.html class MultiwayPathsCoveredCheck : public ClangTidyCheck { public: MultiwayPathsCoveredCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h index cf397df1578a4..1ff40eae4622b 100644 --- a/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/NoAssemblerCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::hicpp { /// Find assembler statements. No fix is offered. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/hicpp/no-assembler.html +/// https://clang.llvm.org/extra/clang-tidy/checks/hicpp/no-assembler.html class NoAssemblerCheck : public ClangTidyCheck { public: NoAssemblerCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h b/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h index b3538e7e51f58..499a4e7bebc14 100644 --- a/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h +++ b/clang-tools-extra/clang-tidy/hicpp/SignedBitwiseCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::hicpp { /// bitwise operations on signed integer types. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/hicpp/signed-bitwise.html +/// https://clang.llvm.org/extra/clang-tidy/checks/hicpp/signed-bitwise.html class SignedBitwiseCheck : public ClangTidyCheck { public: SignedBitwiseCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.h b/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.h index a450f50c30cb8..e7dc0bee86ac2 100644 --- a/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.h +++ b/clang-tools-extra/clang-tidy/linuxkernel/MustCheckErrsCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::linuxkernel { /// directly return a value from one of these error functions. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/linuxkernel/must-check-errs.html +/// https://clang.llvm.org/extra/clang-tidy/checks/linuxkernel/must-check-errs.html class MustCheckErrsCheck : public ClangTidyCheck { public: MustCheckErrsCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.h b/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.h index 1eb307d8347df..cfc920bb85e23 100644 --- a/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.h +++ b/clang-tools-extra/clang-tidy/llvm/HeaderGuardCheck.h @@ -15,7 +15,7 @@ namespace clang::tidy::llvm_check { /// Finds and fixes header guards that do not adhere to LLVM style. /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/llvm/header-guard.html +/// https://clang.llvm.org/extra/clang-tidy/checks/llvm/header-guard.html class LLVMHeaderGuardCheck : public utils::HeaderGuardCheck { public: LLVMHeaderGuardCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.h b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.h index 27c6798481866..3a424cb9166a8 100644 --- a/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.h +++ b/clang-tools-extra/clang-tidy/llvm/IncludeOrderCheck.h @@ -15,7 +15,7 @@ namespace clang::tidy::llvm_check { /// Checks the correct order of `#includes`. /// -/// See http://llvm.org/docs/CodingStandards.html#include-style +/// See https://llvm.org/docs/CodingStandards.html#include-style class IncludeOrderCheck : public ClangTidyCheck { public: IncludeOrderCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.h b/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.h index cf4b64ad21686..aed7aa84f9d07 100644 --- a/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.h +++ b/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.h @@ -46,7 +46,7 @@ namespace clang::tidy::llvm_check { /// \endcode /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/llvm/prefer-isa-or-dyn-cast-in-conditionals.html +/// https://clang.llvm.org/extra/clang-tidy/checks/llvm/prefer-isa-or-dyn-cast-in-conditionals.html class PreferIsaOrDynCastInConditionalsCheck : public ClangTidyCheck { public: PreferIsaOrDynCastInConditionalsCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/llvm/PreferRegisterOverUnsignedCheck.h b/clang-tools-extra/clang-tidy/llvm/PreferRegisterOverUnsignedCheck.h index 9a7a0c3f35857..09d2ba4bc7340 100644 --- a/clang-tools-extra/clang-tidy/llvm/PreferRegisterOverUnsignedCheck.h +++ b/clang-tools-extra/clang-tidy/llvm/PreferRegisterOverUnsignedCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::llvm_check { /// the code more explicit. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/llvm/prefer-register-over-unsigned.html +/// https://clang.llvm.org/extra/clang-tidy/checks/llvm/prefer-register-over-unsigned.html class PreferRegisterOverUnsignedCheck : public ClangTidyCheck { public: PreferRegisterOverUnsignedCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/llvm/PreferStaticOverAnonymousNamespaceCheck.h b/clang-tools-extra/clang-tidy/llvm/PreferStaticOverAnonymousNamespaceCheck.h index a8738e6fbd70d..aa979b9040775 100644 --- a/clang-tools-extra/clang-tidy/llvm/PreferStaticOverAnonymousNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/llvm/PreferStaticOverAnonymousNamespaceCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::llvm_check { /// suggests replacing them with ``static`` declarations. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/llvm/prefer-static-over-anonymous-namespace.html +/// https://clang.llvm.org/extra/clang-tidy/checks/llvm/prefer-static-over-anonymous-namespace.html class PreferStaticOverAnonymousNamespaceCheck : public ClangTidyCheck { public: PreferStaticOverAnonymousNamespaceCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/llvm/UseRangesCheck.h b/clang-tools-extra/clang-tidy/llvm/UseRangesCheck.h index b985288ea0e4c..c698a4c00d402 100644 --- a/clang-tools-extra/clang-tidy/llvm/UseRangesCheck.h +++ b/clang-tools-extra/clang-tidy/llvm/UseRangesCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::llvm_check { /// range-based algorithms from `llvm/ADT/STLExtras.h`. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/llvm/use-ranges.html +/// https://clang.llvm.org/extra/clang-tidy/checks/llvm/use-ranges.html class UseRangesCheck : public utils::UseRangesCheck { public: UseRangesCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.h b/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.h index 34c628ea5f6e4..8c44ceabcced5 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::llvm_libc { /// Checks all calls resolve to functions within __llvm_libc namespace. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/llvmlibc/callee-namespace.html +/// https://clang.llvm.org/extra/clang-tidy/checks/llvmlibc/callee-namespace.html class CalleeNamespaceCheck : public ClangTidyCheck { public: CalleeNamespaceCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/llvmlibc/ImplementationInNamespaceCheck.h b/clang-tools-extra/clang-tidy/llvmlibc/ImplementationInNamespaceCheck.h index da97443191b9f..62b200560fd8c 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/ImplementationInNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/llvmlibc/ImplementationInNamespaceCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::llvm_libc { /// Checks all llvm-libc implementation is within the correct namespace. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/llvmlibc/implementation-in-namespace.html +/// https://clang.llvm.org/extra/clang-tidy/checks/llvmlibc/implementation-in-namespace.html class ImplementationInNamespaceCheck : public ClangTidyCheck { public: ImplementationInNamespaceCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.h b/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.h index 01a8df46ec666..cf8c11d7619ed 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.h +++ b/clang-tools-extra/clang-tidy/llvmlibc/InlineFunctionDeclCheck.h @@ -21,7 +21,7 @@ namespace clang::tidy::llvm_libc { /// https://libc.llvm.org/dev/code_style.html. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/llvmlibc/inline-function-decl-check.html +/// https://clang.llvm.org/extra/clang-tidy/checks/llvmlibc/inline-function-decl-check.html class InlineFunctionDeclCheck : public ClangTidyCheck { public: InlineFunctionDeclCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.h b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.h index 1c7b31037875d..4a7cfa3151bb0 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.h +++ b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::llvm_libc { /// compiler provided. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/llvmlibc/restrict-system-libc-headers.html +/// https://clang.llvm.org/extra/clang-tidy/checks/llvmlibc/restrict-system-libc-headers.html class RestrictSystemLibcHeadersCheck : public portability::RestrictSystemIncludesCheck { public: diff --git a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp index 2d0323ac04515..b05fd049cef74 100644 --- a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp @@ -30,7 +30,7 @@ ConfusableIdentifierCheck::ConfusableIdentifierCheck(StringRef Name, ConfusableIdentifierCheck::~ConfusableIdentifierCheck() = default; // Build a skeleton out of the Original identifier, inspired by the algorithm -// described in http://www.unicode.org/reports/tr39/#def-skeleton +// described in https://www.unicode.org/reports/tr39/#def-skeleton // // FIXME: TR39 mandates: // diff --git a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h index 650f35b50e189..fafcac407e029 100644 --- a/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h +++ b/clang-tools-extra/clang-tidy/misc/ConstCorrectnessCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::misc { /// This check warns on variables which could be declared const but are not. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/misc/const-correctness.html +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/const-correctness.html class ConstCorrectnessCheck : public ClangTidyCheck { public: ConstCorrectnessCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h index 95c2b04b82ea7..e100509ea261d 100644 --- a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h +++ b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.h @@ -22,7 +22,7 @@ namespace clang::tidy::misc { /// types and types belonging to a configurable denylist. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/misc/coroutine-hostile-raii.html +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/coroutine-hostile-raii.html class CoroutineHostileRAIICheck : public ClangTidyCheck { public: CoroutineHostileRAIICheck(llvm::StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h b/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h index ce1293038078c..0c162cc53ff5f 100644 --- a/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h +++ b/clang-tools-extra/clang-tidy/misc/DefinitionsInHeadersCheck.h @@ -22,7 +22,7 @@ namespace clang::tidy::misc { /// header files. True by default. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/misc/definitions-in-headers.html +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/definitions-in-headers.html class DefinitionsInHeadersCheck : public ClangTidyCheck { public: DefinitionsInHeadersCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.h b/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.h index dcf538c4c2844..df66beb517255 100644 --- a/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.h +++ b/clang-tools-extra/clang-tidy/misc/HeaderIncludeCycleCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::misc { /// Check detects cyclic #include dependencies between user-defined headers. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/misc/header-include-cycle.html +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/header-include-cycle.html class HeaderIncludeCycleCheck : public ClangTidyCheck { public: HeaderIncludeCycleCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h index 941a2aad79856..43e1ed894a16c 100644 --- a/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h +++ b/clang-tools-extra/clang-tidy/misc/IncludeCleanerCheck.h @@ -29,7 +29,7 @@ namespace clang::tidy::misc { /// Findings correspond to https://clangd.llvm.org/design/include-cleaner. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/misc/include-cleaner.html +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/include-cleaner.html class IncludeCleanerCheck : public ClangTidyCheck { public: IncludeCleanerCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h b/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h index 1abacb4c16426..2b8a05d003fad 100644 --- a/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h +++ b/clang-tools-extra/clang-tidy/misc/MisplacedConstCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::misc { /// pointer type rather than to the pointee. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/misc/misplaced-const.html +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/misplaced-const.html class MisplacedConstCheck : public ClangTidyCheck { public: MisplacedConstCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.h b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.h index b678137927351..234883747036d 100644 --- a/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.h +++ b/clang-tools-extra/clang-tidy/misc/NoRecursionCheck.h @@ -22,7 +22,7 @@ namespace tidy::misc { /// and displays one example of possible call graph loop (recursion). /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/misc/no-recursion.html +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/no-recursion.html class NoRecursionCheck : public ClangTidyCheck { public: NoRecursionCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.h b/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.h index 09077226eb5c5..a507eae13e7dc 100644 --- a/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.h +++ b/clang-tools-extra/clang-tidy/misc/NonPrivateMemberVariablesInClassesCheck.h @@ -23,7 +23,7 @@ namespace clang::tidy::misc { /// ignored and optionally all `public` member variables could be ignored. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/misc/non-private-member-variables-in-classes.html +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/non-private-member-variables-in-classes.html class NonPrivateMemberVariablesInClassesCheck : public ClangTidyCheck { public: NonPrivateMemberVariablesInClassesCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.h b/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.h index 6e0909524991d..d8da30bd738d6 100644 --- a/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.h +++ b/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::misc { /// in the base class. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/misc/override-with-different-visibility.html +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/override-with-different-visibility.html class OverrideWithDifferentVisibilityCheck : public ClangTidyCheck { public: enum class ChangeKind { Any, Widening, Narrowing }; diff --git a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h index 784548355c164..57289c39df22d 100644 --- a/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h +++ b/clang-tools-extra/clang-tidy/misc/RedundantExpressionCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::misc { /// ineffective, useless parts. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/misc/redundant-expression.html +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/redundant-expression.html class RedundantExpressionCheck : public ClangTidyCheck { public: RedundantExpressionCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h index 960c85eb89cbc..be9e7b971256c 100644 --- a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.h @@ -23,7 +23,7 @@ namespace clang::tidy::misc { /// * The operator must always return ``*this``. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/misc/unconventional-assign-operator.html +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/unconventional-assign-operator.html class UnconventionalAssignOperatorCheck : public ClangTidyCheck { public: UnconventionalAssignOperatorCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h index ce77acf443e2c..96d8d9da3ceb2 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UnusedUsingDeclsCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::misc { /// Finds unused using declarations. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/misc/unused-using-decls.html +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/unused-using-decls.html class UnusedUsingDeclsCheck : public ClangTidyCheck { public: UnusedUsingDeclsCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.h b/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.h index 10e10b0c32360..8548fbbc6a3cc 100644 --- a/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UseAnonymousNamespaceCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::misc { /// suggests moving them to an anonymous namespace. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/misc/use-anonymous-namespace.html +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/use-anonymous-namespace.html class UseAnonymousNamespaceCheck : public ClangTidyCheck { public: UseAnonymousNamespaceCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.h b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.h index 8c82ac0b6b644..eedb5bd1adcab 100644 --- a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.h +++ b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::misc { /// an anonymous namespace to enforce internal linkage. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/misc/use-internal-linkage.html +/// https://clang.llvm.org/extra/clang-tidy/checks/misc/use-internal-linkage.html class UseInternalLinkageCheck : public ClangTidyCheck { public: UseInternalLinkageCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h index ba9e562324e55..94838cb1b5a78 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::modernize { /// FIXME: Add support for function references and member function references. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/avoid-bind.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/avoid-bind.html class AvoidBindCheck : public ClangTidyCheck { public: AvoidBindCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.h b/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.h index ff0809644050b..98488b8429002 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/AvoidCArraysCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::modernize { /// Find C-style array types and recommend to use std::array<> / std::vector<>. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/avoid-c-arrays.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/avoid-c-arrays.html class AvoidCArraysCheck : public ClangTidyCheck { public: AvoidCArraysCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidSetjmpLongjmpCheck.h b/clang-tools-extra/clang-tidy/modernize/AvoidSetjmpLongjmpCheck.h index 23a1cd169e152..d03617284f6d1 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidSetjmpLongjmpCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/AvoidSetjmpLongjmpCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::modernize { /// Guards against use of setjmp/longjmp in C++ code /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/avoid-setjmp-longjmp.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/avoid-setjmp-longjmp.html class AvoidSetjmpLongjmpCheck : public ClangTidyCheck { public: AvoidSetjmpLongjmpCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidVariadicFunctionsCheck.h b/clang-tools-extra/clang-tidy/modernize/AvoidVariadicFunctionsCheck.h index d93e9d0c5d678..4fdea59983410 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidVariadicFunctionsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/AvoidVariadicFunctionsCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::modernize { /// Find all function definitions of C-style variadic functions. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/avoid-variadic-functions.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/avoid-variadic-functions.html class AvoidVariadicFunctionsCheck : public ClangTidyCheck { public: AvoidVariadicFunctionsCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h index c9409cb641c54..cbd1075497d8d 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedHeadersCheck.h @@ -29,7 +29,7 @@ namespace clang::tidy::modernize { /// Example: `` => `` /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/deprecated-headers.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/deprecated-headers.html class DeprecatedHeadersCheck : public ClangTidyCheck { public: DeprecatedHeadersCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.h b/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.h index 0a0b4deb5abba..496f9cb4f271b 100644 --- a/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/DeprecatedIosBaseAliasesCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::modernize { /// ``std::ios_base`` and replaces those that have a non-deprecated equivalent. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/deprecated-ios-base-aliases.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/deprecated-ios-base-aliases.html class DeprecatedIosBaseAliasesCheck : public ClangTidyCheck { public: DeprecatedIosBaseAliasesCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.h b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.h index 3f339f364d722..55c503a07f64f 100644 --- a/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/MacroToEnumCheck.h @@ -18,7 +18,7 @@ class MacroToEnumCallbacks; /// Replaces groups of related macros with an unscoped anonymous enum. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/macro-to-enum.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/macro-to-enum.html class MacroToEnumCheck : public ClangTidyCheck { public: MacroToEnumCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h b/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h index 025ce757b3d5f..063b35fc46d4f 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/MakeSharedCheck.h @@ -24,7 +24,7 @@ namespace clang::tidy::modernize { /// \endcode /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/make-shared.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/make-shared.html class MakeSharedCheck : public MakeSmartPtrCheck { public: MakeSharedCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h index 5af9f846db29b..8ce6ec0bef636 100644 --- a/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/RawStringLiteralCheck.h @@ -20,7 +20,7 @@ using CharsBitSet = std::bitset<1 << CHAR_BIT>; /// raw string literals. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/raw-string-literal.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/raw-string-literal.html class RawStringLiteralCheck : public ClangTidyCheck { public: RawStringLiteralCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.h b/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.h index 44ca787fa4fcc..a58c22ba04ed3 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceDisallowCopyAndAssignMacroCheck.h @@ -35,7 +35,7 @@ namespace clang::tidy::modernize { /// ~~~ /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/replace-disallow-copy-and-assign-macro.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/replace-disallow-copy-and-assign-macro.html class ReplaceDisallowCopyAndAssignMacroCheck : public ClangTidyCheck { public: ReplaceDisallowCopyAndAssignMacroCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h index 23571dfa92175..5f2be10ca66bb 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReplaceRandomShuffleCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::modernize { /// replace all occurrences of std::random_shuffle with std::shuffle. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/replace-random-shuffle.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/replace-random-shuffle.html class ReplaceRandomShuffleCheck : public ClangTidyCheck { public: ReplaceRandomShuffleCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h b/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h index c023cb5c4c2ca..ef465ea5e189d 100644 --- a/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/ReturnBracedInitListCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::modernize { /// repeating the return type name. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/return-braced-init-list.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/return-braced-init-list.html class ReturnBracedInitListCheck : public ClangTidyCheck { public: ReturnBracedInitListCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.h b/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.h index 1f9ffc9b8b811..afbadba7741b7 100644 --- a/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/TypeTraitsCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::modernize { /// `traits<...>::value` into `traits_t<...>` and `traits_v<...>` respectively. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/type-traits.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/type-traits.html class TypeTraitsCheck : public ClangTidyCheck { public: TypeTraitsCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h b/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h index 94e78f01b06f9..95611c9b13e77 100644 --- a/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UnaryStaticAssertCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::modernize { /// with the unary version. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/unary-static-assert.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/unary-static-assert.html class UnaryStaticAssertCheck : public ClangTidyCheck { public: UnaryStaticAssertCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h index 5b7b1e0cc3b6e..64aff84b1be64 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseBoolLiteralsCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::modernize { /// Finds integer literals which are cast to bool. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-bool-literals.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-bool-literals.html class UseBoolLiteralsCheck : public ClangTidyCheck { public: UseBoolLiteralsCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.h index bf49f329baeab..c4c5c791d4a40 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::modernize { /// Replace enable_if with C++20 requires clauses. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-constraints.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-constraints.html class UseConstraintsCheck : public ClangTidyCheck { public: UseConstraintsCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp index d920af7fc477b..0d2c3a79b9ece 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.cpp @@ -8,17 +8,57 @@ #include "UseDefaultMemberInitCheck.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/Expr.h" #include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/ASTMatchers/ASTMatchers.h" #include "clang/Lex/Lexer.h" +#include "llvm/ADT/TypeSwitch.h" using namespace clang::ast_matchers; namespace clang::tidy::modernize { +static bool isExprAllowedInMemberInit(const Expr *E) { + if (!E) + return false; + return llvm::TypeSwitch<const Expr *, bool>(E) + .Case<IntegerLiteral, FloatingLiteral, CharacterLiteral, StringLiteral, CXXBoolLiteralExpr, CXXNullPtrLiteralExpr>( + [](const auto *) { return true; }) + .Case<ImplicitValueInitExpr>([](const auto *) { return true; }) + .Case([](const ParenExpr *PE) { + return isExprAllowedInMemberInit(PE->getSubExpr()); + }) + .Case([](const UnaryOperator *UO) { + return isExprAllowedInMemberInit(UO->getSubExpr()); + }) + .Case([](const BinaryOperator *BO) { + return isExprAllowedInMemberInit(BO->getLHS()) && + isExprAllowedInMemberInit(BO->getRHS()); + }) + .Case([](const CastExpr *CE) { + return isExprAllowedInMemberInit(CE->getSubExpr()); + }) + .Case([](const DeclRefExpr *DRE) { + if (const ValueDecl *D = DRE->getDecl()) { + if (isa<EnumConstantDecl>(D)) + return true; + if (const auto *VD = dyn_cast<VarDecl>(D)) + return VD->isConstexpr() || VD->getStorageClass() == SC_Static; + } + return false; + }) + .Default(false); +} + namespace { + AST_MATCHER_P(InitListExpr, initCountIs, unsigned, N) { return Node.getNumInits() == N; } + +AST_MATCHER(Expr, allowedInitExpr) { return isExprAllowedInMemberInit(&Node); } + } // namespace static StringRef getValueOfValueInit(const QualType InitType) { @@ -206,30 +246,10 @@ void UseDefaultMemberInitCheck::storeOptions( } void UseDefaultMemberInitCheck::registerMatchers(MatchFinder *Finder) { - auto NumericLiteral = anyOf(integerLiteral(), floatLiteral()); - auto UnaryNumericLiteral = unaryOperator(hasAnyOperatorName("+", "-"), - hasUnaryOperand(NumericLiteral)); - - auto ConstExprRef = varDecl(anyOf(isConstexpr(), isStaticStorageClass())); - auto ImmutableRef = - declRefExpr(to(decl(anyOf(enumConstantDecl(), ConstExprRef)))); - - auto BinaryNumericExpr = binaryOperator( - hasOperands(anyOf(NumericLiteral, ImmutableRef, binaryOperator()), - anyOf(NumericLiteral, ImmutableRef, binaryOperator()))); - - auto InitBase = - anyOf(stringLiteral(), characterLiteral(), NumericLiteral, - UnaryNumericLiteral, cxxBoolLiteral(), cxxNullPtrLiteralExpr(), - implicitValueInitExpr(), ImmutableRef, BinaryNumericExpr); - - auto ExplicitCastExpr = castExpr(hasSourceExpression(InitBase)); - auto InitMatcher = anyOf(InitBase, ExplicitCastExpr); - - auto Init = - anyOf(initListExpr(anyOf(allOf(initCountIs(1), hasInit(0, InitMatcher)), - initCountIs(0), hasType(arrayType()))), - InitBase, ExplicitCastExpr); + auto Init = anyOf( + initListExpr(anyOf(allOf(initCountIs(1), hasInit(0, allowedInitExpr())), + initCountIs(0), hasType(arrayType()))), + allowedInitExpr()); Finder->addMatcher(
cxxConstructorDecl(forEachConstructorInitializer( diff --git a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h index 7ae04b78006a1..be6a18ad66d99 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseDefaultMemberInitCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::modernize { /// member initializer. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-default-member-init.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-default-member-init.html class UseDefaultMemberInitCheck : public ClangTidyCheck { public: UseDefaultMemberInitCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.h b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.h index e010509474287..2ca131963387d 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseDesignatedInitializersCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::modernize { /// written as designated initializers instead. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-designated-initializers.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-designated-initializers.html class UseDesignatedInitializersCheck : public ClangTidyCheck { public: UseDesignatedInitializersCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h index 2e9e142894a47..87ebf6ff98c2b 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseEmplaceCheck.h @@ -21,7 +21,7 @@ namespace clang::tidy::modernize { /// constructor of temporary object. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-emplace.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-emplace.html class UseEmplaceCheck : public ClangTidyCheck { public: UseEmplaceCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h index 51b386c2acaca..519f1899170cc 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.h @@ -31,7 +31,7 @@ namespace clang::tidy::modernize { /// \endcode /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-equals-default.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-equals-default.html class UseEqualsDefaultCheck : public ClangTidyCheck { public: UseEqualsDefaultCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h index 590aa900b8768..31a956bc49c5f 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDeleteCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::modernize { /// deleted member function from the ``private`` to the ``public`` section. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-equals-delete.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-equals-delete.html class UseEqualsDeleteCheck : public ClangTidyCheck { public: UseEqualsDeleteCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.h b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.h index 106796f0c8072..860b9c882f6ea 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseIntegerSignComparisonCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::modernize { /// C++20 ``std::cmp_*`` alternative, if available. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-integer-sign-comparison.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-integer-sign-comparison.html class UseIntegerSignComparisonCheck : public ClangTidyCheck { public: UseIntegerSignComparisonCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.h b/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.h index cc46769900dd3..94d65cfae56ce 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseNodiscardCheck.h @@ -28,7 +28,7 @@ namespace clang::tidy::modernize { /// \endcode /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-nodiscard.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-nodiscard.html class UseNodiscardCheck : public ClangTidyCheck { public: UseNodiscardCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h index 3a915e1fe7238..30b5d4ecd1cf2 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseNoexceptCheck.h @@ -26,7 +26,7 @@ namespace clang::tidy::modernize { /// \endcode /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-noexcept.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-noexcept.html class UseNoexceptCheck : public ClangTidyCheck { public: UseNoexceptCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp index 4dc4baecddd50..4084d713665ea 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseNullptrCheck.cpp @@ -53,7 +53,7 @@ StatementMatcher makeCastSequenceMatcher(llvm::ArrayRef<StringRef> NameList) { unless(hasImplicitDestinationType( qualType(matchers::matchesAnyListedTypeName(NameList))))); - auto IsOrHasDescendant = [](auto InnerMatcher) { + auto IsOrHasDescendant = [](const auto &InnerMatcher) { return anyOf(InnerMatcher, hasDescendant(InnerMatcher)); }; diff --git a/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h b/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h index 80ea6996afe55..d3a76dac912bb 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseRangesCheck.h @@ -17,7 +17,7 @@ namespace
clang::tidy::modernize { /// replaced with a ranges version instead /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-ranges.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-ranges.html class UseRangesCheck : public utils::UseRangesCheck { public: UseRangesCheck(StringRef CheckName, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.h b/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.h index 553031857e086..8f6b5a4b2aaff 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseScopedLockCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::modernize { /// alternative ``std::scoped_lock``. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-scoped-lock.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-scoped-lock.html class UseScopedLockCheck : public ClangTidyCheck { public: UseScopedLockCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.h b/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.h index 70df8b87cb6f4..e71ebe7602bd4 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::modernize { /// ``std::string_view``. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-starts-ends-with.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-starts-ends-with.html class UseStartsEndsWithCheck : public ClangTidyCheck { public: UseStartsEndsWithCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.h b/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.h index e369c17a0f733..7dad065f218d7 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseStdFormatCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::modernize { /// now-unnecessary calls to std::string::c_str() and std::string::data(). /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-std-format.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-std-format.html class UseStdFormatCheck : public ClangTidyCheck { public: UseStdFormatCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.h b/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.h index f1bd3b4eee2ba..e8853609af3c9 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseStdNumbersCheck.h @@ -21,7 +21,7 @@ namespace clang::tidy::modernize { /// offers a replacement at the definition of those variables. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-std-numbers.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-std-numbers.html class UseStdNumbersCheck : public ClangTidyCheck { public: UseStdNumbersCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h index 1f7660991a275..f5b3f719c56ce 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseStdPrintCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::modernize { /// Convert calls to printf-like functions to std::print and std::println /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-std-print.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-std-print.html class UseStdPrintCheck : public ClangTidyCheck { public: UseStdPrintCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.h b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.h index 9050bd5eba5e2..4aa1adaf30db5 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseTrailingReturnTypeCheck.h @@ -23,7 +23,7 @@ struct ClassifiedToken { /// Rewrites function signatures to use a trailing return type. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-trailing-return-type.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-trailing-return-type.html class UseTrailingReturnTypeCheck : public ClangTidyCheck { public: UseTrailingReturnTypeCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h index dc9c76e8875a0..0af729b54cfce 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseTransparentFunctorsCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::modernize { /// Prefer using transparent functors to non-transparent ones. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-transparent-functors.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-transparent-functors.html class UseTransparentFunctorsCheck : public ClangTidyCheck { public: UseTransparentFunctorsCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h index 4c63efe0c6919..772133d492a9f 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseUncaughtExceptionsCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::modernize { /// without fixits. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-uncaught-exceptions.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-uncaught-exceptions.html class UseUncaughtExceptionsCheck : public ClangTidyCheck { public: UseUncaughtExceptionsCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h index 4ab1c4f6b9646..5ecabc7a17a45 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/UseUsingCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::modernize { /// Check finds typedefs and replaces it with usings. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/modernize/use-using.html +/// https://clang.llvm.org/extra/clang-tidy/checks/modernize/use-using.html class UseUsingCheck : public ClangTidyCheck { const bool IgnoreMacros; diff --git a/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h b/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h index 7922750c135ac..07ee68a55b6b4 100644 --- a/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h +++ b/clang-tools-extra/clang-tidy/mpi/BufferDerefCheck.h @@ -23,7 +23,7 @@ namespace clang::tidy::mpi { /// emitted. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/mpi/buffer-deref.html +/// https://clang.llvm.org/extra/clang-tidy/checks/mpi/buffer-deref.html class BufferDerefCheck : public ClangTidyCheck { public: BufferDerefCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h b/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h index 60bcb0f3cf70c..5a7db17819967 100644 --- a/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h +++ b/clang-tools-extra/clang-tidy/mpi/TypeMismatchCheck.h @@ -22,7 +22,7 @@ namespace clang::tidy::mpi { /// null pointer constants are skipped, in the course of verification. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/mpi/type-mismatch.html +/// https://clang.llvm.org/extra/clang-tidy/checks/mpi/type-mismatch.html class TypeMismatchCheck : public ClangTidyCheck { public: TypeMismatchCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/objc/AssertEquals.h b/clang-tools-extra/clang-tidy/objc/AssertEquals.h index 0f4e303feea8b..8c21f9bd3a75e 100644 --- a/clang-tools-extra/clang-tidy/objc/AssertEquals.h +++ b/clang-tools-extra/clang-tidy/objc/AssertEquals.h @@ -18,7 +18,7 @@ namespace clang::tidy::objc { /// operands of type NSString*. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/objc/assert-equals.html +/// https://clang.llvm.org/extra/clang-tidy/checks/objc/assert-equals.html class AssertEquals final : public ClangTidyCheck { public: AssertEquals(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.h b/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.h index 2fd3d11559a39..eba9f719ab26f 100644 --- a/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.h +++ b/clang-tools-extra/clang-tidy/objc/AvoidNSErrorInitCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::objc { /// NSError. errorWithDomain:code:userInfo: should be used instead. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/objc/avoid-nserror-init.html +/// https://clang.llvm.org/extra/clang-tidy/checks/objc/avoid-nserror-init.html class AvoidNSErrorInitCheck : public ClangTidyCheck { public: AvoidNSErrorInitCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/objc/DeallocInCategoryCheck.h b/clang-tools-extra/clang-tidy/objc/DeallocInCategoryCheck.h index f44a123055eee..0b35655256544 100644 --- a/clang-tools-extra/clang-tidy/objc/DeallocInCategoryCheck.h +++ b/clang-tools-extra/clang-tidy/objc/DeallocInCategoryCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::objc { /// potentially causing issues. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/objc/dealloc-in-category.html +/// https://clang.llvm.org/extra/clang-tidy/checks/objc/dealloc-in-category.html class DeallocInCategoryCheck final : public ClangTidyCheck { public: DeallocInCategoryCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h b/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h index 1f345c1da5156..2d238690d627a 100644 --- a/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h +++ b/clang-tools-extra/clang-tidy/objc/ForbiddenSubclassingCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::objc { /// documented to not support subclassing. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/objc/forbidden-subclassing.html +/// https://clang.llvm.org/extra/clang-tidy/checks/objc/forbidden-subclassing.html class ForbiddenSubclassingCheck : public ClangTidyCheck { public: ForbiddenSubclassingCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/objc/MissingHashCheck.h b/clang-tools-extra/clang-tidy/objc/MissingHashCheck.h index cf0261e3cc38a..d94da489b1740 100644 --- a/clang-tools-extra/clang-tidy/objc/MissingHashCheck.h +++ b/clang-tools-extra/clang-tidy/objc/MissingHashCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::objc { /// appropriately implementing -hash. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/objc/missing-hash.html +/// https://clang.llvm.org/extra/clang-tidy/checks/objc/missing-hash.html class MissingHashCheck : public ClangTidyCheck { public: MissingHashCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp index 6a9adfe7d282d..1481b2bb24e95 100644 --- a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp +++ b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp @@ -43,7 +43,8 @@ static bool isValidDatePattern(StringRef Pattern) { // Checks if the string pattern used as a date format specifier contains // any incorrect pattern and reports it as a warning. -// See: http://www.unicode.org/reports/tr35/tr35-dates.html#Date_Format_Patterns +// See: +// https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Format_Patterns void NSDateFormatterCheck::check(const MatchFinder::MatchResult &Result) { // Callback implementation. 
const auto *StrExpr = Result.Nodes.getNodeAs<StringLiteral>("str_lit"); diff --git a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.h b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.h index dc0e89a08b680..706249c2e51f1 100644 --- a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.h +++ b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::objc { /// warnings if it contains any incorrect sub-pattern. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/objc/nsdate-formatter.html +/// https://clang.llvm.org/extra/clang-tidy/checks/objc/nsdate-formatter.html class NSDateFormatterCheck : public ClangTidyCheck { public: NSDateFormatterCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.h b/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.h index d09ea8cc10298..d9d6c282256c5 100644 --- a/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.h +++ b/clang-tools-extra/clang-tidy/objc/NSInvocationArgumentLifetimeCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::objc { /// argument object lifetimes. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/objc/nsinvocation-argument-lifetime.html +/// https://clang.llvm.org/extra/clang-tidy/checks/objc/nsinvocation-argument-lifetime.html class NSInvocationArgumentLifetimeCheck : public ClangTidyCheck { public: NSInvocationArgumentLifetimeCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h index c883e59321124..daaebb11673a8 100644 --- a/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/objc/PropertyDeclarationCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::objc { /// @property(nonatomic) NSString *lowerCamelCase; /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/objc/property-declaration.html +/// https://clang.llvm.org/extra/clang-tidy/checks/objc/property-declaration.html class PropertyDeclarationCheck : public ClangTidyCheck { public: PropertyDeclarationCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.h b/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.h index baeba560a8fef..059ec680327ac 100644 --- a/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.h +++ b/clang-tools-extra/clang-tidy/objc/SuperSelfCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::objc { /// of NSObject and recommends calling a superclass initializer instead. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/objc/super-self.html +/// https://clang.llvm.org/extra/clang-tidy/checks/objc/super-self.html class SuperSelfCheck : public ClangTidyCheck { public: SuperSelfCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h b/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h index 1703f55f902ba..39da124a4b37c 100644 --- a/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h +++ b/clang-tools-extra/clang-tidy/openmp/ExceptionEscapeCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::openmp { /// out of the Structured Block it was thrown in.
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/openmp/exception-escape.html +/// https://clang.llvm.org/extra/clang-tidy/checks/openmp/exception-escape.html class ExceptionEscapeCheck : public ClangTidyCheck { public: ExceptionEscapeCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/openmp/UseDefaultNoneCheck.h b/clang-tools-extra/clang-tidy/openmp/UseDefaultNoneCheck.h index fb6b528df3ffb..659dab2759a52 100644 --- a/clang-tools-extra/clang-tidy/openmp/UseDefaultNoneCheck.h +++ b/clang-tools-extra/clang-tidy/openmp/UseDefaultNoneCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::openmp { /// other than ``none``, and suggests to use the ``default(none)`` clause. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/openmp/use-default-none.html +/// https://clang.llvm.org/extra/clang-tidy/checks/openmp/use-default-none.html class UseDefaultNoneCheck : public ClangTidyCheck { public: UseDefaultNoneCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.h b/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.h index d87e6b8ab9f5e..aa606edbc60dd 100644 --- a/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.h +++ b/clang-tools-extra/clang-tidy/performance/EnumSizeCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::performance { /// base. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/performance/enum-size.html +/// https://clang.llvm.org/extra/clang-tidy/checks/performance/enum-size.html class EnumSizeCheck : public ClangTidyCheck { public: EnumSizeCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h index a7ab79a3809d4..2452d2e66ecd4 100644 --- a/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h +++ b/clang-tools-extra/clang-tidy/performance/FasterStringFindCheck.h @@ -21,7 +21,7 @@ namespace clang::tidy::performance { /// The character literal overload is more efficient. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/performance/faster-string-find.html +/// https://clang.llvm.org/extra/clang-tidy/checks/performance/faster-string-find.html class FasterStringFindCheck : public ClangTidyCheck { public: FasterStringFindCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.h b/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.h index 3ed05fecd015d..afe2f083aa1cf 100644 --- a/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.h +++ b/clang-tools-extra/clang-tidy/performance/ForRangeCopyCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::performance { /// A check that detects copied loop variables and suggests using const /// references. 
/// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/performance/for-range-copy.html +/// https://clang.llvm.org/extra/clang-tidy/checks/performance/for-range-copy.html class ForRangeCopyCheck : public ClangTidyCheck { public: ForRangeCopyCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h b/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h index 810c0109574e9..b82a838e737d8 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h +++ b/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::performance { /// concatenating strings, using the operator+, instead of operator+=. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/performance/inefficient-string-concatenation.html +/// https://clang.llvm.org/extra/clang-tidy/checks/performance/inefficient-string-concatenation.html class InefficientStringConcatenationCheck : public ClangTidyCheck { public: InefficientStringConcatenationCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h index 9737d9d5ecb1a..18f7c1937edf7 100644 --- a/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h +++ b/clang-tools-extra/clang-tidy/performance/InefficientVectorOperationCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::performance { /// field without calling Reserve() first. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/performance/inefficient-vector-operation.html +/// https://clang.llvm.org/extra/clang-tidy/checks/performance/inefficient-vector-operation.html class InefficientVectorOperationCheck : public ClangTidyCheck { public: InefficientVectorOperationCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h index 7c5aec8c59fc8..be0a24e6c1280 100644 --- a/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h +++ b/clang-tools-extra/clang-tidy/performance/MoveConstructorInitCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::performance { /// initializing a member or base class through a copy constructor instead of a /// move constructor. /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/performance/move-constructor-init.html +/// https://clang.llvm.org/extra/clang-tidy/checks/performance/move-constructor-init.html class MoveConstructorInitCheck : public ClangTidyCheck { public: MoveConstructorInitCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/performance/NoAutomaticMoveCheck.h b/clang-tools-extra/clang-tidy/performance/NoAutomaticMoveCheck.h index af80e74f3a5b4..b78e0b98921a4 100644 --- a/clang-tools-extra/clang-tidy/performance/NoAutomaticMoveCheck.h +++ b/clang-tools-extra/clang-tidy/performance/NoAutomaticMoveCheck.h @@ -15,7 +15,7 @@ namespace clang::tidy::performance { /// Finds local variables that cannot be automatically moved due to constness. 
/// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/performance/no-automatic-move.html +/// https://clang.llvm.org/extra/clang-tidy/checks/performance/no-automatic-move.html class NoAutomaticMoveCheck : public ClangTidyCheck { public: NoAutomaticMoveCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/performance/NoIntToPtrCheck.h b/clang-tools-extra/clang-tidy/performance/NoIntToPtrCheck.h index ed6f60f697da4..c3af0776cfd4a 100644 --- a/clang-tools-extra/clang-tidy/performance/NoIntToPtrCheck.h +++ b/clang-tools-extra/clang-tidy/performance/NoIntToPtrCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::performance { /// Diagnoses every integer to pointer cast. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/performance/no-int-to-ptr.html +/// https://clang.llvm.org/extra/clang-tidy/checks/performance/no-int-to-ptr.html class NoIntToPtrCheck : public ClangTidyCheck { public: NoIntToPtrCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.h b/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.h index ae96359a544ce..bf3d4801cc1a8 100644 --- a/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.h +++ b/clang-tools-extra/clang-tidy/performance/TriviallyDestructibleCheck.h @@ -22,7 +22,7 @@ namespace clang::tidy::performance { /// A::~A() = default; /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/performance/trivially-destructible.html +/// https://clang.llvm.org/extra/clang-tidy/checks/performance/trivially-destructible.html class TriviallyDestructibleCheck : public ClangTidyCheck { public: TriviallyDestructibleCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h index 9d9b073c80400..cf74f80006274 100644 --- a/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h +++ b/clang-tools-extra/clang-tidy/performance/TypePromotionInMathFnCheck.h @@ -22,7 +22,7 @@ namespace clang::tidy::performance { /// C). /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/performance/type-promotion-in-math-fn.html +/// https://clang.llvm.org/extra/clang-tidy/checks/performance/type-promotion-in-math-fn.html class TypePromotionInMathFnCheck : public ClangTidyCheck { public: TypePromotionInMathFnCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h index 571857020cef4..d59fb4105381e 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::performance { /// can safely be converted to const references. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/performance/unnecessary-value-param.html +/// https://clang.llvm.org/extra/clang-tidy/checks/performance/unnecessary-value-param.html class UnnecessaryValueParamCheck : public ClangTidyCheck { public: UnnecessaryValueParamCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/portability/AvoidPragmaOnceCheck.h b/clang-tools-extra/clang-tidy/portability/AvoidPragmaOnceCheck.h index 3638a9c46773e..97c7eb6c1eb6f 100644 --- a/clang-tools-extra/clang-tidy/portability/AvoidPragmaOnceCheck.h +++ b/clang-tools-extra/clang-tidy/portability/AvoidPragmaOnceCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::portability { /// portability. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/portability/avoid-pragma-once.html +/// https://clang.llvm.org/extra/clang-tidy/checks/portability/avoid-pragma-once.html class AvoidPragmaOnceCheck : public ClangTidyCheck { public: AvoidPragmaOnceCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h index 5347ae9d68b02..e37f89336bc92 100644 --- a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h +++ b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::portability { /// includes are specified, the check will exit without issuing any warnings. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/portability/restrict-system-includes.html +/// https://clang.llvm.org/extra/clang-tidy/checks/portability/restrict-system-includes.html class RestrictSystemIncludesCheck : public ClangTidyCheck { public: RestrictSystemIncludesCheck(StringRef Name, ClangTidyContext *Context, diff --git a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h index ab0711335c920..db2d2307b1943 100644 --- a/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h +++ b/clang-tools-extra/clang-tidy/portability/SIMDIntrinsicsCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::portability { /// Find SIMD intrinsics calls and suggest std::experimental::simd alternatives. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/portability/simd-intrinsics.html +/// https://clang.llvm.org/extra/clang-tidy/checks/portability/simd-intrinsics.html class SIMDIntrinsicsCheck : public ClangTidyCheck { public: SIMDIntrinsicsCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/portability/StdAllocatorConstCheck.h b/clang-tools-extra/clang-tidy/portability/StdAllocatorConstCheck.h index b2f5feac21918..b62847f3dc277 100644 --- a/clang-tools-extra/clang-tidy/portability/StdAllocatorConstCheck.h +++ b/clang-tools-extra/clang-tidy/portability/StdAllocatorConstCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::portability { /// ``std::allocator``. They do not compile with libstdc++ or MSVC. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/portability/std-allocator-const.html +/// https://clang.llvm.org/extra/clang-tidy/checks/portability/std-allocator-const.html class StdAllocatorConstCheck : public ClangTidyCheck { public: StdAllocatorConstCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/portability/TemplateVirtualMemberFunctionCheck.h b/clang-tools-extra/clang-tidy/portability/TemplateVirtualMemberFunctionCheck.h index 01d5519d7e6fd..485bdef9b58dd 100644 --- a/clang-tools-extra/clang-tidy/portability/TemplateVirtualMemberFunctionCheck.h +++ b/clang-tools-extra/clang-tidy/portability/TemplateVirtualMemberFunctionCheck.h @@ -21,7 +21,7 @@ namespace clang::tidy::portability { /// compiler. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/portability/template-virtual-member-function.html +/// https://clang.llvm.org/extra/clang-tidy/checks/portability/template-virtual-member-function.html class TemplateVirtualMemberFunctionCheck : public ClangTidyCheck { public: TemplateVirtualMemberFunctionCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.h b/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.h index 763cd7f01f9c3..f840cbec53473 100644 --- a/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.h +++ b/clang-tools-extra/clang-tidy/readability/AmbiguousSmartptrResetCallCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// the pointee type also has a 'reset' method /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/ambiguous-smartptr-reset-call.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/ambiguous-smartptr-reset-call.html class AmbiguousSmartptrResetCallCheck : public ClangTidyCheck { public: AmbiguousSmartptrResetCallCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/AvoidDefaultLambdaCaptureCheck.cpp b/clang-tools-extra/clang-tidy/readability/AvoidDefaultLambdaCaptureCheck.cpp new file mode 100644 index 0000000000000..ae333464f5ab4 --- /dev/null +++ b/clang-tools-extra/clang-tidy/readability/AvoidDefaultLambdaCaptureCheck.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AvoidDefaultLambdaCaptureCheck.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/Basic/Lambda.h" +#include "clang/Lex/Lexer.h" + +using namespace clang::ast_matchers; + +namespace clang::tidy::readability { + +static std::string generateCaptureText(const LambdaCapture &Capture) { + if (Capture.capturesThis()) + return Capture.getCaptureKind() == LCK_StarThis ? 
"*this" : "this"; + + std::string Result; + if (Capture.getCaptureKind() == LCK_ByRef) + Result += "&"; + + Result += Capture.getCapturedVar()->getName().str(); + return Result; +} + +void AvoidDefaultLambdaCaptureCheck::registerMatchers(MatchFinder *Finder) { + Finder->addMatcher(lambdaExpr(hasDefaultCapture()).bind("lambda"), this); +} + +void AvoidDefaultLambdaCaptureCheck::check( + const MatchFinder::MatchResult &Result) { + const auto *Lambda = Result.Nodes.getNodeAs("lambda"); + assert(Lambda); + + const SourceLocation DefaultCaptureLoc = Lambda->getCaptureDefaultLoc(); + if (DefaultCaptureLoc.isInvalid()) + return; + + std::vector ImplicitCaptures; + for (const LambdaCapture &Capture : Lambda->implicit_captures()) { + // It is impossible to explicitly capture a VLA in C++, since VLAs don't + // exist in ISO C++ and so the syntax was never created to capture them. + if (Capture.getCaptureKind() == LCK_VLAType) + return; + ImplicitCaptures.push_back(generateCaptureText(Capture)); + } + + auto Diag = diag(DefaultCaptureLoc, + "lambda default captures are discouraged; " + "prefer to capture specific variables explicitly"); + + // For template-dependent lambdas, the list of captures hasn't been created + // yet, so the list of implicit captures is empty. + if (ImplicitCaptures.empty() && Lambda->isGenericLambda()) + return; + + const std::string ReplacementText = llvm::join(ImplicitCaptures, ", "); + + Diag << FixItHint::CreateReplacement(Lambda->getCaptureDefaultLoc(), + ReplacementText); +} + +} // namespace clang::tidy::readability diff --git a/clang-tools-extra/clang-tidy/readability/AvoidDefaultLambdaCaptureCheck.h b/clang-tools-extra/clang-tidy/readability/AvoidDefaultLambdaCaptureCheck.h new file mode 100644 index 0000000000000..f5db72ca3ac1a --- /dev/null +++ b/clang-tools-extra/clang-tidy/readability/AvoidDefaultLambdaCaptureCheck.h @@ -0,0 +1,36 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOIDDEFAULTLAMBDACAPTURECHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOIDDEFAULTLAMBDACAPTURECHECK_H + +#include "../ClangTidyCheck.h" + +namespace clang::tidy::readability { + +/// Flags lambdas that use default capture modes +/// +/// For the user-facing documentation see: +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/avoid-default-lambda-capture.html +class AvoidDefaultLambdaCaptureCheck : public ClangTidyCheck { +public: + AvoidDefaultLambdaCaptureCheck(StringRef Name, ClangTidyContext *Context) + : ClangTidyCheck(Name, Context) {} + void registerMatchers(ast_matchers::MatchFinder *Finder) override; + void check(const ast_matchers::MatchFinder::MatchResult &Result) override; + std::optional getCheckTraversalKind() const override { + return TK_IgnoreUnlessSpelledInSource; + } + bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { + return LangOpts.CPlusPlus11; + } +}; + +} // namespace clang::tidy::readability + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_AVOIDDEFAULTLAMBDACAPTURECHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h b/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h index b14af6a0cf1c7..260c84304e138 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h +++ b/clang-tools-extra/clang-tidy/readability/AvoidNestedConditionalOperatorCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::readability { /// Identifies instances of nested conditional operators in the code. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/avoid-nested-conditional-operator.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/avoid-nested-conditional-operator.html class AvoidNestedConditionalOperatorCheck : public ClangTidyCheck { public: AvoidNestedConditionalOperatorCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.h b/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.h index 93e6268fd5dd5..1533b9acf2c02 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.h +++ b/clang-tools-extra/clang-tidy/readability/AvoidReturnWithVoidValueCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// result types. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/avoid-return-with-void-value.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/avoid-return-with-void-value.html class AvoidReturnWithVoidValueCheck : public ClangTidyCheck { public: AvoidReturnWithVoidValueCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.h b/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.h index 2382a5e928972..6834ab7488afd 100644 --- a/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.h +++ b/clang-tools-extra/clang-tidy/readability/AvoidUnconditionalPreprocessorIfCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// directives by analyzing `#if` conditions, such as `#if 0` and `#if 1`, etc. 
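As a rough illustration of what the new ``readability-avoid-default-lambda-capture`` check defined above reacts to: the diagnostic is attached to the default-capture token, and the fix-it replaces it with the lambda's implicit captures spelled out. The snippet below is a minimal sketch with made-up variable names, not part of the patch.

.. code-block:: c++

    void example() {
      int Counter = 0;
      double Scale = 2.0;

      // Flagged at the '=': "lambda default captures are discouraged;
      // prefer to capture specific variables explicitly".
      auto Before = [=] { return Counter * Scale; };

      // After applying the suggested fix-it, the default capture is replaced
      // by the variables the lambda actually uses:
      auto After = [Counter, Scale] { return Counter * Scale; };

      Before();
      After();
    }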
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/avoid-unconditional-preprocessor-if.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/avoid-unconditional-preprocessor-if.html class AvoidUnconditionalPreprocessorIfCheck : public ClangTidyCheck { public: AvoidUnconditionalPreprocessorIfCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt index 4b4c49d3b17d1..34b5cc912511c 100644 --- a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt @@ -6,6 +6,7 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangTidyReadabilityModule STATIC AmbiguousSmartptrResetCallCheck.cpp AvoidConstParamsInDecls.cpp + AvoidDefaultLambdaCaptureCheck.cpp AvoidNestedConditionalOperatorCheck.cpp AvoidReturnWithVoidValueCheck.cpp AvoidUnconditionalPreprocessorIfCheck.cpp @@ -44,6 +45,7 @@ add_clang_library(clangTidyReadabilityModule STATIC RedundantDeclarationCheck.cpp RedundantFunctionPtrDereferenceCheck.cpp RedundantMemberInitCheck.cpp + RedundantParenthesesCheck.cpp RedundantPreprocessorCheck.cpp RedundantSmartptrGetCheck.cpp RedundantStringCStrCheck.cpp diff --git a/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.h b/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.h index e3d9713d430ce..4b652f1a68226 100644 --- a/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ConstReturnTypeCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// the `const` qualifier from that return type. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/const-return-type.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/const-return-type.html class ConstReturnTypeCheck : public ClangTidyCheck { public: ConstReturnTypeCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.h b/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.h index e419785060df0..8e058f20427fd 100644 --- a/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ContainerContainsCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::readability { /// to the `container.contains()` method. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/container-contains.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/container-contains.html class ContainerContainsCheck : public ClangTidyCheck { public: ContainerContainsCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.h b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.h index ee83d7b4784ff..2aab03f1a896f 100644 --- a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.h +++ b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// because they don't use the 'this' pointer. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/ +/// https://clang.llvm.org/extra/clang-tidy/checks/ /// readability-convert-member-functions-to-static.html class ConvertMemberFunctionsToStatic : public ClangTidyCheck { public: diff --git a/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h b/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h index dc88646f07afa..52b1b2625e403 100644 --- a/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h +++ b/clang-tools-extra/clang-tidy/readability/DeleteNullPointerCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// pointer. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/delete-null-pointer.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/delete-null-pointer.html class DeleteNullPointerCheck : public ClangTidyCheck { public: DeleteNullPointerCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.h b/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.h index ab025032317c7..ebbed28f6068e 100644 --- a/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::readability { /// Flags the usages of `else` after `return`. /// -/// http://llvm.org/docs/CodingStandards.html#don-t-use-else-after-a-return +/// https://llvm.org/docs/CodingStandards.html#don-t-use-else-after-a-return class ElseAfterReturnCheck : public ClangTidyCheck { public: ElseAfterReturnCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.h b/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.h index f070f867b6af8..d03e26b4468f4 100644 --- a/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.h +++ b/clang-tools-extra/clang-tidy/readability/EnumInitialValueCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// styles: none, first only, or all initialized explicitly. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/enum-initial-value.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/enum-initial-value.html class EnumInitialValueCheck : public ClangTidyCheck { public: EnumInitialValueCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp b/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp index f9d81212e2842..4791df037d77d 100644 --- a/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.cpp @@ -39,7 +39,7 @@ struct CognitiveComplexity final { // For details you can look at the Specification at // https://www.sonarsource.com/docs/CognitiveComplexity.pdf // or user-facing docs at - // http://clang.llvm.org/extra/clang-tidy/checks/readability/function-cognitive-complexity.html + // https://clang.llvm.org/extra/clang-tidy/checks/readability/function-cognitive-complexity.html // Here are all the possible reasons: enum Criteria : uint8_t { None = 0U, diff --git a/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.h b/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.h index 455fbfd9fa56a..046c6e162af88 100644 --- a/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.h +++ b/clang-tools-extra/clang-tidy/readability/FunctionCognitiveComplexityCheck.h @@ -28,7 +28,7 @@ namespace clang::tidy::readability { /// macros. Default is `false`. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/function-cognitive-complexity.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/function-cognitive-complexity.html class FunctionCognitiveComplexityCheck : public ClangTidyCheck { public: FunctionCognitiveComplexityCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.h b/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.h index 9626e2251426d..3adaf50bc57a1 100644 --- a/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.h +++ b/clang-tools-extra/clang-tidy/readability/IdentifierLengthCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// Warns about identifiers names whose length is too short. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/identifier-length.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/identifier-length.html class IdentifierLengthCheck : public ClangTidyCheck { public: IdentifierLengthCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h index 8028a31719644..f88ceb1dd5a0c 100644 --- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::readability { /// Checks for use of implicit bool conversions in expressions. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/implicit-bool-conversion.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/implicit-bool-conversion.html class ImplicitBoolConversionCheck : public ClangTidyCheck { public: ImplicitBoolConversionCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h index a7810bd25f69c..289e131d0d97a 100644 --- a/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h +++ b/clang-tools-extra/clang-tidy/readability/InconsistentDeclarationParameterNameCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::readability { /// Checks for declarations of functions which differ in parameter names. /// /// For detailed documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/inconsistent-declaration-parameter-name.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/inconsistent-declaration-parameter-name.html /// class InconsistentDeclarationParameterNameCheck : public ClangTidyCheck { public: diff --git a/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h b/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h index c7e1ea33a0d0d..750b4d887de58 100644 --- a/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/readability/IsolateDeclarationCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// tries to refactor the code to one statement per declaration. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/isolate-declaration.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/isolate-declaration.html class IsolateDeclarationCheck : public ClangTidyCheck { public: IsolateDeclarationCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.h b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.h index b703bd4ba984f..53954535ebe86 100644 --- a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.h +++ b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::readability { /// Detects magic numbers, integer and floating point literals embedded in code. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/magic-numbers.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/magic-numbers.html class MagicNumbersCheck : public ClangTidyCheck { public: MagicNumbersCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.h b/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.h index 6be832260bd18..ac34b2b8258c1 100644 --- a/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.h +++ b/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::readability { /// Finds non-static member functions that can be made 'const'. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/make-member-function-const.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/make-member-function-const.html class MakeMemberFunctionConstCheck : public ClangTidyCheck { public: MakeMemberFunctionConstCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.h b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.h index 3381d6612a709..d970a8a72cc27 100644 --- a/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.h +++ b/clang-tools-extra/clang-tidy/readability/MathMissingParenthesesCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// operators of different priorities. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/math-missing-parentheses.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/math-missing-parentheses.html class MathMissingParenthesesCheck : public ClangTidyCheck { public: MathMissingParenthesesCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h index 39bb4baba5141..8347f1a3611d9 100644 --- a/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h +++ b/clang-tools-extra/clang-tidy/readability/MisleadingIndentationCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::readability { /// or spaces are used consistently and not mixed. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/misleading-indentation.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/misleading-indentation.html class MisleadingIndentationCheck : public ClangTidyCheck { public: MisleadingIndentationCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h b/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h index 1b11b6bea108e..0a6e0c8fb25a0 100644 --- a/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h +++ b/clang-tools-extra/clang-tidy/readability/MisplacedArrayIndexCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// `array[index]`). /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/misplaced-array-index.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/misplaced-array-index.html class MisplacedArrayIndexCheck : public ClangTidyCheck { public: MisplacedArrayIndexCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.h b/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.h index 883a2a44fee8d..fe5736fad0d15 100644 --- a/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.h +++ b/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::readability { /// Checks that long namespaces have a closing comment. 
/// -/// http://llvm.org/docs/CodingStandards.html#namespace-indentation +/// https://llvm.org/docs/CodingStandards.html#namespace-indentation /// /// https://google.github.io/styleguide/cppguide.html#Namespaces class NamespaceCommentCheck : public ClangTidyCheck { diff --git a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h index 61d6ebd4c2f2a..b0156183c0b88 100644 --- a/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h +++ b/clang-tools-extra/clang-tidy/readability/NonConstParameterCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::readability { /// Warn when a pointer function parameter can be const. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/non-const-parameter.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/non-const-parameter.html class NonConstParameterCheck : public ClangTidyCheck { public: NonConstParameterCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.h b/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.h index f1a9793481ada..2ce8ac0161bf9 100644 --- a/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.h +++ b/clang-tools-extra/clang-tidy/readability/OperatorsRepresentationCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::readability { /// and overloaded operators in C++ code. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/operators-representation.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/operators-representation.html class OperatorsRepresentationCheck : public ClangTidyCheck { public: OperatorsRepresentationCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.h b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.h index c63b426bda7c8..186ce0dc431a8 100644 --- a/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.h +++ b/clang-tools-extra/clang-tidy/readability/QualifiedAutoCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::readability { /// 'const auto &'. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/qualified-auto.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/qualified-auto.html class QualifiedAutoCheck : public ClangTidyCheck { public: QualifiedAutoCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp index d01882dfc9daa..2479c112d425a 100644 --- a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp @@ -11,6 +11,7 @@ #include "../ClangTidyModuleRegistry.h" #include "AmbiguousSmartptrResetCallCheck.h" #include "AvoidConstParamsInDecls.h" +#include "AvoidDefaultLambdaCaptureCheck.h" #include "AvoidNestedConditionalOperatorCheck.h" #include "AvoidReturnWithVoidValueCheck.h" #include "AvoidUnconditionalPreprocessorIfCheck.h" @@ -47,6 +48,7 @@ #include "RedundantFunctionPtrDereferenceCheck.h" #include "RedundantInlineSpecifierCheck.h" #include "RedundantMemberInitCheck.h" +#include "RedundantParenthesesCheck.h" #include "RedundantPreprocessorCheck.h" #include "RedundantSmartptrGetCheck.h" #include "RedundantStringCStrCheck.h" @@ -74,6 +76,8 @@ class ReadabilityModule : public ClangTidyModule { "readability-ambiguous-smartptr-reset-call"); CheckFactories.registerCheck( "readability-avoid-const-params-in-decls"); + CheckFactories.registerCheck( + "readability-avoid-default-lambda-capture"); CheckFactories.registerCheck( "readability-avoid-nested-conditional-operator"); CheckFactories.registerCheck( @@ -138,6 +142,8 @@ class ReadabilityModule : public ClangTidyModule { "readability-redundant-function-ptr-dereference"); CheckFactories.registerCheck( "readability-redundant-member-init"); + CheckFactories.registerCheck( + "readability-redundant-parentheses"); CheckFactories.registerCheck( "readability-redundant-preprocessor"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.h index 6359dafc0e4eb..2b87848e77757 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantAccessSpecifiersCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::readability { /// Detects redundant access specifiers inside classes, structs, and unions. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-access-specifiers.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-access-specifiers.html class RedundantAccessSpecifiersCheck : public ClangTidyCheck { public: RedundantAccessSpecifiersCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.h index 97c87fb8b09a1..c09767d0cda2b 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// destination types, and subsequently recommend their removal. 
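For reference, the ``ReadabilityTidyModule.cpp`` hunk above wires the two new checks into the readability module; the usual registration pattern, written out here as a sketch with the check classes as template arguments, is:

.. code-block:: c++

    // Inside ReadabilityModule::addCheckFactories (sketch):
    CheckFactories.registerCheck<AvoidDefaultLambdaCaptureCheck>(
        "readability-avoid-default-lambda-capture");
    CheckFactories.registerCheck<RedundantParenthesesCheck>(
        "readability-redundant-parentheses");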
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-casting.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-casting.html class RedundantCastingCheck : public ClangTidyCheck { public: RedundantCastingCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h index 7698996d107e4..3018b1f8d14e6 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantControlFlowCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::readability { /// Eliminates redundant `continue` statements at the end of a loop body. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-control-flow.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-control-flow.html class RedundantControlFlowCheck : public ClangTidyCheck { public: RedundantControlFlowCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h index fff7827c6378a..b22cef9a2b776 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::readability { /// Find redundant variable declarations. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-declaration.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-declaration.html class RedundantDeclarationCheck : public ClangTidyCheck { public: RedundantDeclarationCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h index f4a3671b0f7d1..5c82a5e02645f 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantFunctionPtrDereferenceCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::readability { /// Eliminate redundant dereferences of a function pointer. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-function-ptr-dereference.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-function-ptr-dereference.html class RedundantFunctionPtrDereferenceCheck : public ClangTidyCheck { public: RedundantFunctionPtrDereferenceCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.h index d1134b307a909..5e819e700fd16 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// declarations. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-inline-specifier.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-inline-specifier.html class RedundantInlineSpecifierCheck : public ClangTidyCheck { public: RedundantInlineSpecifierCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h index 2ce8c3f5f64f5..64d365d1e3f45 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// constructor would be called if they were not present. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-member-init.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-member-init.html class RedundantMemberInitCheck : public ClangTidyCheck { public: RedundantMemberInitCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp new file mode 100644 index 0000000000000..0ab59fff39d88 --- /dev/null +++ b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.cpp @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RedundantParenthesesCheck.h" +#include "clang/AST/Expr.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/ASTMatchers/ASTMatchers.h" +#include "clang/ASTMatchers/ASTMatchersMacros.h" +#include + +using namespace clang::ast_matchers; + +namespace clang::tidy::readability { + +namespace { + +AST_MATCHER_P(ParenExpr, subExpr, ast_matchers::internal::Matcher, + InnerMatcher) { + return InnerMatcher.matches(*Node.getSubExpr(), Finder, Builder); +} + +AST_MATCHER(ParenExpr, isInMacro) { + const Expr *E = Node.getSubExpr(); + return Node.getLParen().isMacroID() || Node.getRParen().isMacroID() || + E->getBeginLoc().isMacroID() || E->getEndLoc().isMacroID(); +} + +} // namespace + +void RedundantParenthesesCheck::registerMatchers(MatchFinder *Finder) { + const auto ConstantExpr = + expr(anyOf(integerLiteral(), floatLiteral(), characterLiteral(), + cxxBoolLiteral(), stringLiteral(), cxxNullPtrLiteralExpr())); + Finder->addMatcher( + parenExpr(subExpr(anyOf(parenExpr(), ConstantExpr, declRefExpr())), + unless(anyOf(isInMacro(), + // sizeof(...) is common used. 
+ hasParent(unaryExprOrTypeTraitExpr())))) + .bind("dup"), + this); +} + +void RedundantParenthesesCheck::check(const MatchFinder::MatchResult &Result) { + const auto *PE = Result.Nodes.getNodeAs("dup"); + diag(PE->getBeginLoc(), "redundant parentheses around expression") + << FixItHint::CreateRemoval(PE->getLParen()) + << FixItHint::CreateRemoval(PE->getRParen()); +} + +} // namespace clang::tidy::readability diff --git a/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h new file mode 100644 index 0000000000000..9a0409b83fff3 --- /dev/null +++ b/clang-tools-extra/clang-tidy/readability/RedundantParenthesesCheck.h @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTPARENTHESESCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTPARENTHESESCHECK_H + +#include "../ClangTidyCheck.h" +#include "clang/Basic/LangOptions.h" + +namespace clang::tidy::readability { + +/// Detect redundant parentheses. +/// +/// For the user-facing documentation see: +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-parentheses.html +class RedundantParenthesesCheck : public ClangTidyCheck { +public: + RedundantParenthesesCheck(StringRef Name, ClangTidyContext *Context) + : ClangTidyCheck(Name, Context) {} + void registerMatchers(ast_matchers::MatchFinder *Finder) override; + void check(const ast_matchers::MatchFinder::MatchResult &Result) override; + bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { + return LangOpts.CPlusPlus | LangOpts.C99; + } +}; + +} // namespace clang::tidy::readability + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTPARENTHESESCHECK_H diff --git a/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.h index ca34f9783c619..75e5d6371703c 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.h +++ b/clang-tools-extra/clang-tidy/readability/RedundantPreprocessorCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// the same condition. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-preprocessor.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-preprocessor.html class RedundantPreprocessorCheck : public ClangTidyCheck { public: RedundantPreprocessorCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.h b/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.h index c95f65a3ec691..4c2b7f7e7f174 100644 --- a/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.h +++ b/clang-tools-extra/clang-tidy/readability/ReferenceToConstructedTemporaryCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// of a temporary object that has just been constructed. 
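A minimal sketch of what the ``RedundantParenthesesCheck`` matchers above catch: parentheses around another parenthesized expression, a literal, or a plain variable reference are diagnosed and removed by the fix-it, while parentheses coming from macros or sitting directly under ``sizeof`` are deliberately skipped. The example code is illustrative, not from the patch.

.. code-block:: c++

    void demo(int a, int b) {
      int w = ((a + b));    // flagged: the outer pair wraps another ParenExpr
      int x = (1) + b;      // flagged: parentheses around an integer literal
      int y = (a);          // flagged: parentheses around a DeclRefExpr
      auto z = sizeof(a);   // not flagged: the parent is a
                            // unaryExprOrTypeTraitExpr
      int v = (a + b) * 2;  // not flagged: only literals, names, and nested
                            // parentheses are matched
      (void)w; (void)x; (void)y; (void)z; (void)v;
    }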
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/reference-to-constructed-temporary.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/reference-to-constructed-temporary.html class ReferenceToConstructedTemporaryCheck : public ClangTidyCheck { public: ReferenceToConstructedTemporaryCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h index 2ab074e5dca69..466bc411bf800 100644 --- a/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h +++ b/clang-tools-extra/clang-tidy/readability/SimplifyBooleanExprCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// them to use the appropriate boolean expression directly. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/simplify-boolean-expr.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/simplify-boolean-expr.html class SimplifyBooleanExprCheck : public ClangTidyCheck { public: SimplifyBooleanExprCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.h b/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.h index 79ced95fd762c..45b4e3db67686 100644 --- a/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.h +++ b/clang-tools-extra/clang-tidy/readability/SimplifySubscriptExprCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::readability { /// Simplifies subscript expressions. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/simplify-subscript-expr.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/simplify-subscript-expr.html class SimplifySubscriptExprCheck : public ClangTidyCheck { public: SimplifySubscriptExprCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h b/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h index 5b47bf7685bbf..c376806d00098 100644 --- a/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h +++ b/clang-tools-extra/clang-tidy/readability/StaticAccessedThroughInstanceCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// instances and replaces them with uses of the appropriate qualified-id. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/static-accessed-through-instance.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/static-accessed-through-instance.html class StaticAccessedThroughInstanceCheck : public ClangTidyCheck { public: StaticAccessedThroughInstanceCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h b/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h index 9207ba0075b5d..55306556fb0a6 100644 --- a/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/readability/StaticDefinitionInAnonymousNamespaceCheck.h @@ -16,7 +16,7 @@ namespace clang::tidy::readability { /// Finds static function and variable definitions in anonymous namespace. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/static-definition-in-anonymous-namespace.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/static-definition-in-anonymous-namespace.html class StaticDefinitionInAnonymousNamespaceCheck : public ClangTidyCheck { public: StaticDefinitionInAnonymousNamespaceCheck(StringRef Name, diff --git a/clang-tools-extra/clang-tidy/readability/StringCompareCheck.h b/clang-tools-extra/clang-tidy/readability/StringCompareCheck.h index 9ff80b075f101..034e272975df0 100644 --- a/clang-tools-extra/clang-tidy/readability/StringCompareCheck.h +++ b/clang-tools-extra/clang-tidy/readability/StringCompareCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::readability { /// equality or inequality. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/string-compare.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/string-compare.html class StringCompareCheck : public ClangTidyCheck { public: StringCompareCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp index ad8b47aa96425..29084f4e875f7 100644 --- a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.cpp @@ -208,7 +208,7 @@ static bool applyLevenshteinHeuristic(StringRef Arg, StringRef Param, return Dist > Threshold; } -// Based on http://en.wikipedia.org/wiki/Jaro–Winkler_distance. +// Based on https://en.wikipedia.org/wiki/Jaro–Winkler_distance. static bool applyJaroWinklerHeuristic(StringRef Arg, StringRef Param, int8_t Threshold) { std::size_t Match = 0, Transpos = 0; @@ -269,7 +269,7 @@ static bool applyJaroWinklerHeuristic(StringRef Arg, StringRef Param, return Dist > Threshold; } -// Based on http://en.wikipedia.org/wiki/Sørensen–Dice_coefficient +// Based on https://en.wikipedia.org/wiki/Sørensen–Dice_coefficient static bool applyDiceHeuristic(StringRef Arg, StringRef Param, int8_t Threshold) { llvm::StringSet<> ArgBigrams; diff --git a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.h b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.h index 43ae0f181302f..a44fdaea4959e 100644 --- a/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.h +++ b/clang-tools-extra/clang-tidy/readability/SuspiciousCallArgumentCheck.h @@ -20,7 +20,7 @@ namespace clang::tidy::readability { /// of the function. 
/// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/suspicious-call-argument.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/suspicious-call-argument.html class SuspiciousCallArgumentCheck : public ClangTidyCheck { enum class Heuristic { Equality, diff --git a/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h b/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h index f7d6fe70058fc..ab6449e3fd416 100644 --- a/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h +++ b/clang-tools-extra/clang-tidy/readability/UniqueptrDeleteReleaseCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// replaces them with: `` = nullptr;`` /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/uniqueptr-delete-release.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/uniqueptr-delete-release.html class UniqueptrDeleteReleaseCheck : public ClangTidyCheck { public: UniqueptrDeleteReleaseCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.h b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.h index 7c71fe064f3c9..e1eef3d5b58ee 100644 --- a/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.h +++ b/clang-tools-extra/clang-tidy/readability/UppercaseLiteralSuffixCheck.h @@ -19,7 +19,7 @@ namespace clang::tidy::readability { /// Alternatively, a list of destination suffixes can be provided. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/uppercase-literal-suffix.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/uppercase-literal-suffix.html class UppercaseLiteralSuffixCheck : public ClangTidyCheck { public: UppercaseLiteralSuffixCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h b/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h index 4b7ffc1f36ace..f431311b4282a 100644 --- a/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h +++ b/clang-tools-extra/clang-tidy/readability/UseAnyOfAllOfCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::readability { /// or std::all_of. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/use-anyofallof.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/use-anyofallof.html class UseAnyOfAllOfCheck : public ClangTidyCheck { public: using ClangTidyCheck::ClangTidyCheck; diff --git a/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.h b/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.h index 573394361cbda..e983412d6b7ba 100644 --- a/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.h +++ b/clang-tools-extra/clang-tidy/readability/UseStdMinMaxCheck.h @@ -17,7 +17,7 @@ namespace clang::tidy::readability { /// Replaces certain conditional statements with equivalent calls to /// ``std::min`` or ``std::max``. 
/// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/readability/UseStdMinMax.html +/// https://clang.llvm.org/extra/clang-tidy/checks/readability/UseStdMinMax.html class UseStdMinMaxCheck : public ClangTidyCheck { public: UseStdMinMaxCheck(StringRef Name, ClangTidyContext *Context); diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp index a2fa0edbbbea3..64157f530b8c0 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp @@ -9,7 +9,7 @@ /// \file This file implements a clang-tidy tool. /// /// This tool uses the Clang Tooling infrastructure, see -/// http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html +/// https://clang.llvm.org/docs/HowToSetupToolingForLLVM.html /// for details on setting it up with LLVM source tree. /// //===----------------------------------------------------------------------===// diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h index 35f75396828dd..f86828e8c46e9 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.h @@ -9,7 +9,7 @@ /// \file This file declares the main function for the clang-tidy tool. /// /// This tool uses the Clang Tooling infrastructure, see -/// http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html +/// https://clang.llvm.org/docs/HowToSetupToolingForLLVM.html /// for details on setting it up with LLVM source tree. /// //===----------------------------------------------------------------------===// diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyToolMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyToolMain.cpp index ea2897dfe1390..a2ba638ea15e8 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyToolMain.cpp +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyToolMain.cpp @@ -9,7 +9,7 @@ /// \file This file contains clang-tidy tool entry point main function. /// /// This tool uses the Clang Tooling infrastructure, see -/// http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html +/// https://clang.llvm.org/docs/HowToSetupToolingForLLVM.html /// for details on setting it up with LLVM source tree. 
/// //===----------------------------------------------------------------------===// diff --git a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py index eadf7194ab94f..f495f449b5b30 100755 --- a/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py +++ b/clang-tools-extra/clang-tidy/tool/run-clang-tidy.py @@ -31,7 +31,7 @@ -header-filter=extra/clang-tidy Compilation database setup: -http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html +https://clang.llvm.org/docs/HowToSetupToolingForLLVM.html """ import argparse diff --git a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp index bdde7249d2796..fd4320eb8144b 100644 --- a/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp +++ b/clang-tools-extra/clang-tidy/utils/ExceptionAnalyzer.cpp @@ -595,6 +595,11 @@ ExceptionAnalyzer::throwsException(const Stmt *St, Results.merge(DestructorExcs); } } + } else if (const auto *Lambda = dyn_cast(St)) { + for (const Stmt *Init : Lambda->capture_inits()) { + ExceptionInfo Excs = throwsException(Init, Caught, CallStack); + Results.merge(Excs); + } } else { for (const Stmt *Child : St->children()) { ExceptionInfo Excs = throwsException(Child, Caught, CallStack); diff --git a/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.h b/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.h index 5ecf9c4172d18..ee96fa74affc6 100644 --- a/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.h +++ b/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.h @@ -18,7 +18,7 @@ namespace clang::tidy::zircon { /// discouraged. /// /// For the user-facing documentation see: -/// http://clang.llvm.org/extra/clang-tidy/checks/zircon/temporary-objects.html +/// https://clang.llvm.org/extra/clang-tidy/checks/zircon/temporary-objects.html class TemporaryObjectsCheck : public ClangTidyCheck { public: TemporaryObjectsCheck(StringRef Name, ClangTidyContext *Context) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index bc916396a14ca..8451efdbd0d32 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -203,6 +203,16 @@ New checks Finds virtual function overrides with different visibility than the function in the base class. +- New :doc:`readability-avoid-default-lambda-capture + ` check. + + Warns on default lambda captures (e.g. ``[&](){ ... }``, ``[=](){ ... }``). + +- New :doc:`readability-redundant-parentheses + ` check. + + Detect redundant parentheses. + New check aliases ^^^^^^^^^^^^^^^^^ @@ -211,6 +221,11 @@ New check aliases ` keeping initial check as an alias to the new one. +- Renamed :doc:`cert-env33-c ` to + :doc:`bugprone-command-processor + ` + keeping initial check as an alias to the new one. + - Renamed :doc:`cert-err34-c ` to :doc:`bugprone-unchecked-string-to-number-conversion ` @@ -234,6 +249,11 @@ Changes in existing checks correcting a spelling mistake on its option ``NamePrefixSuffixSilenceDissimilarityTreshold``. +- Improved :doc:`bugprone-exception-escape + ` check's handling of lambdas: + exceptions from captures are now diagnosed, exceptions in the bodies of + lambdas that aren't actually invoked are not. + - Improved :doc:`bugprone-infinite-loop ` check by adding detection for variables introduced by structured bindings. @@ -260,6 +280,11 @@ Changes in existing checks namespace are treated as the tag or the data part of a user-defined tagged union respectively. 
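The ``ExceptionAnalyzer.cpp`` hunk above walks a lambda's capture initializers, so an exception thrown while constructing a captured value is now propagated into the analysis even if the lambda body is never entered (this is the ``bugprone-exception-escape`` improvement noted in the release notes below). A small, illustrative sketch of the situation it covers; the names are invented:

.. code-block:: c++

    #include <stdexcept>
    #include <string>

    std::string makeLabel(bool Fail) {
      if (Fail)
        throw std::runtime_error("no label");
      return "ok";
    }

    void schedule(bool Fail) noexcept { // bugprone-exception-escape warns here
      // The lambda is never invoked, but evaluating the capture initializer
      // makeLabel(Fail) can still throw inside this noexcept function.
      auto Task = [Label = makeLabel(Fail)] { return Label.size(); };
      (void)Task;
    }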
+- Improved :doc:`bugprone-throw-keyword-missing + ` check by only considering + the canonical types of base classes as written and adding a note on the base + class that triggered the warning. + - Improved :doc:`bugprone-unchecked-optional-access ` check by supporting ``NullableValue::makeValue`` and ``NullableValue::makeValueInplace`` to @@ -301,10 +326,19 @@ Changes in existing checks uses of non-standard ``enable_if`` with a signature different from ``std::enable_if`` (such as ``boost::enable_if``). +- Improved :doc:`modernize-use-default-member-init + ` check to + enhance the robustness of the member initializer detection. + - Improved :doc:`modernize-use-designated-initializers ` check to suggest using designated initializers for aliased aggregate types. +- Improved :doc:`modernize-use-nullptr + ` check by fixing a crash + on Windows when the check was enabled with a 32-bit :program:`clang-tidy` + binary. + - Improved :doc:`modernize-use-std-format ` check to correctly match when the format string is converted to a different type by an implicit diff --git a/clang-tools-extra/docs/clang-tidy/Contributing.rst b/clang-tools-extra/docs/clang-tidy/Contributing.rst index ad7f22381a3ca..4ede4ea36c13d 100644 --- a/clang-tools-extra/docs/clang-tidy/Contributing.rst +++ b/clang-tools-extra/docs/clang-tidy/Contributing.rst @@ -436,7 +436,7 @@ in the release notes, as the first sentence in the doxygen comments in the heade for your check class and as the first sentence of the check documentation. Avoid the phrase "this check" in your check summary and check documentation. -If your check relates to a published coding guideline (C++ Core Guidelines, MISRA, etc.) +If your check relates to a published coding guideline (C++ Core Guidelines, SEI CERT, etc.) or style guide, provide links to the relevant guideline or style guide sections in your check documentation. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/command-processor.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/command-processor.rst new file mode 100644 index 0000000000000..cbffe7dddae04 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/command-processor.rst @@ -0,0 +1,16 @@ +.. title:: clang-tidy - bugprone-command-processor + +bugprone-command-processor +========================== + +Flags calls to ``system()``, ``popen()``, and ``_popen()``, which +execute a command processor. It does not flag calls to ``system()`` with a null +pointer argument, as such a call checks for the presence of a command processor +but does not actually attempt to execute a command. + +References +---------- + +This check corresponds to the CERT C Coding Standard rule +`ENV33-C. Do not call system() +`_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/env33-c.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/env33-c.rst index 9271c9ecccc00..751bccfaee8f2 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/env33-c.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/env33-c.rst @@ -3,10 +3,9 @@ cert-env33-c ============ -This check flags calls to ``system()``, ``popen()``, and ``_popen()``, which -execute a command processor. It does not flag calls to ``system()`` with a null -pointer argument, as such a call checks for the presence of a command processor -but does not actually attempt to execute a command. +The `cert-env33-c` check is an alias, please see +`bugprone-command-processor <../bugprone/command-processor.html>`_ +for more information. 
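As the new ``bugprone-command-processor`` documentation above describes, the check (and therefore the ``cert-env33-c`` alias) flags calls that start a command processor but leaves the null-argument probe alone. A brief illustrative sketch:

.. code-block:: c++

    #include <cstdlib>

    void launch() {
      std::system("make -j8");  // flagged: executes a command processor
      bool HasShell = std::system(nullptr) != 0; // not flagged: a null argument
                                                 // only probes for a processor
      (void)HasShell;
    }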
This check corresponds to the CERT C Coding Standard rule `ENV33-C. Do not call system() diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index 472d509101cdb..b2c19f3d336da 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -87,6 +87,7 @@ Clang-Tidy Checks :doc:`bugprone-capturing-this-in-member-variable `, :doc:`bugprone-casting-through-void `, :doc:`bugprone-chained-comparison `, + :doc:`bugprone-command-processor `, :doc:`bugprone-compare-pointer-to-member-virtual-function `, :doc:`bugprone-copy-constructor-init `, "Yes" :doc:`bugprone-crtp-constructor-accessibility `, "Yes" @@ -173,7 +174,6 @@ Clang-Tidy Checks :doc:`bugprone-use-after-move `, :doc:`bugprone-virtual-near-miss `, "Yes" :doc:`cert-dcl58-cpp `, - :doc:`cert-env33-c `, :doc:`cert-err33-c `, :doc:`cert-err60-cpp `, :doc:`cert-flp30-c `, @@ -368,6 +368,7 @@ Clang-Tidy Checks :doc:`portability-template-virtual-member-function `, :doc:`readability-ambiguous-smartptr-reset-call `, "Yes" :doc:`readability-avoid-const-params-in-decls `, "Yes" + :doc:`readability-avoid-default-lambda-capture `, "Yes" :doc:`readability-avoid-nested-conditional-operator `, :doc:`readability-avoid-return-with-void-value `, "Yes" :doc:`readability-avoid-unconditional-preprocessor-if `, @@ -404,6 +405,7 @@ Clang-Tidy Checks :doc:`readability-redundant-function-ptr-dereference `, "Yes" :doc:`readability-redundant-inline-specifier `, "Yes" :doc:`readability-redundant-member-init `, "Yes" + :doc:`readability-redundant-parentheses `, "Yes" :doc:`readability-redundant-preprocessor `, :doc:`readability-redundant-smartptr-get `, "Yes" :doc:`readability-redundant-string-cstr `, "Yes" @@ -440,6 +442,7 @@ Check aliases :doc:`cert-dcl54-cpp `, :doc:`misc-new-delete-overloads `, :doc:`cert-dcl59-cpp `, :doc:`google-build-namespaces `, :doc:`cert-err09-cpp `, :doc:`misc-throw-by-value-catch-by-reference `, + :doc:`cert-env33-c `, :doc:`bugprone-command-processor `, :doc:`cert-err34-c `, :doc:`bugprone-unchecked-string-to-number-conversion `, :doc:`cert-err52-cpp `, :doc:`modernize-avoid-setjmp-longjmp `, :doc:`cert-err58-cpp `, :doc:`bugprone-throwing-static-initialization `, diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-default-lambda-capture.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-default-lambda-capture.rst new file mode 100644 index 0000000000000..e18cf3ac653f0 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/avoid-default-lambda-capture.rst @@ -0,0 +1,57 @@ +.. title:: clang-tidy - readability-avoid-default-lambda-capture + +readability-avoid-default-lambda-capture +======================================== + +Warns on default lambda captures (e.g. ``[&](){ ... }``, ``[=](){ ... }``). + +Captures can lead to subtle bugs including dangling references and unnecessary +copies. Writing out the name of the variables being captured reminds programmers +and reviewers about what is being captured. + +This check does not warn on variable-length array (VLA) captures. VLAs are not +ISO C++, and it is impossible to explicitly capture them as the syntax for doing +so does not exist. + +Coding guidelines that recommend against defaulted lambda captures include: + +* Item 31 of Effective Modern C++ by Scott Meyers + +Example +------- + +.. 
code-block:: c++ + + #include <functional> + + class Widget { + std::vector<std::function<void()>> callbacks; + int widgetId; + void addCallback(int factoryId) { + callbacks.emplace_back( + [&](){ + std::cout << "Widget " << widgetId << " made in factory " << factoryId; + } + ); + } + }; + +When ``callbacks`` is executed, ``factoryId`` will dangle. Writing the name of +``factoryId`` in the capture list reminds the reader that it is being captured, +which will hopefully lead to the bug being fixed during code review. + +.. code-block:: c++ + + #include <functional> + + class Widget { + std::vector<std::function<void()>> callbacks; + int widgetId; + void addCallback(int factoryId) { + callbacks.emplace_back( + [&factoryId, &widgetId](){ // Why isn't factoryId captured by value?? + std::cout << "Widget " << widgetId << " made in factory " << factoryId; + } + ); + } + }; diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst new file mode 100644 index 0000000000000..23d975e646490 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-parentheses.rst @@ -0,0 +1,29 @@ +.. title:: clang-tidy - readability-redundant-parentheses + +readability-redundant-parentheses +================================= + +Detect redundant parentheses. + +When modifying code, one often forgets to remove the corresponding parentheses. +This results in overly lengthy code. When the expression is complex, finding +the matching parentheses becomes particularly difficult. + +Example +------- + +.. code-block:: c++ + + (1); + ((a + 2)) * 3; + (a); + ("aaa"); + +Currently this check does not take operator precedence into account, so it +does not warn even when the expression within the parentheses binds more +tightly than its surroundings, that is, even when removing the parentheses +would not affect the semantics. + +.. 
code-block:: c++ + + int a = (1 * 2) + 3; // no warning diff --git a/clang-tools-extra/test/clang-doc/long-name.cpp b/clang-tools-extra/test/clang-doc/long-name.cpp index b33337588da19..db96fc4aebe5a 100644 --- a/clang-tools-extra/test/clang-doc/long-name.cpp +++ b/clang-tools-extra/test/clang-doc/long-name.cpp @@ -9,6 +9,6 @@ struct ThisStructHasANameThatResultsInAMangledNameThatIsExactly250CharactersLong struct ThisStructHasANameThatResultsInAMangledNameThatIsExactly251CharactersLongThatIsSupposedToTestTheFilenameLengthLimitsWithinClangDocInOrdertoSeeifclangdocwillcrashornotdependingonthelengthofthestructIfTheLengthIsTooLongThenClangDocWillCrashAnd123 {}; // CHECK-JSON: ThisStructHasANameThatResultsInAMangledNameThatIsExactly250CharactersLongThatIsSupposedToTestTheFilenameLengthLimitsWithinClangDocInOrdertoSeeifclangdocwillcrashornotdependingonthelengthofthestructIfTheLengthIsTooLongThenClangDocWillCrashAnd12.json -// CHECK-JSON: {{[0-9A-F]*}}.json +// CHECK-JSON: _ZTV244ThisStructHasANameThatResultsInAMangledNameThatIsExactly251CharactersLongThatIsSupposedToTestTheFilenameLengthLimitsWithinClangDocInOrdertoSeeifclangdocwillcrashornotdependingonthelengthofthestructIfTheL29DE8558215A13A506661C0E01E50AA3E5C9C7FA.json // CHECK-HTML: ThisStructHasANameThatResultsInAMangledNameThatIsExactly250CharactersLongThatIsSupposedToTestTheFilenameLengthLimitsWithinClangDocInOrdertoSeeifclangdocwillcrashornotdependingonthelengthofthestructIfTheLengthIsTooLongThenClangDocWillCrashAnd12.html -// CHECK-HTML: {{[0-9A-F]*}}.html +// CHECK-HTML: _ZTV244ThisStructHasANameThatResultsInAMangledNameThatIsExactly251CharactersLongThatIsSupposedToTestTheFilenameLengthLimitsWithinClangDocInOrdertoSeeifclangdocwillcrashornotdependingonthelengthofthestructIfTheL29DE8558215A13A506661C0E01E50AA3E5C9C7FA.html diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/env33-c.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/command-processor.c similarity index 83% rename from clang-tools-extra/test/clang-tidy/checkers/cert/env33-c.c rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/command-processor.c index 5846b496242c5..e592b57c9fb29 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/env33-c.c +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/command-processor.c @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s cert-env33-c %t +// RUN: %check_clang_tidy %s bugprone-command-processor %t typedef struct FILE {} FILE; @@ -11,7 +11,7 @@ void f(void) { system(0); system("test"); - // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: calling 'system' uses a command processor [cert-env33-c] + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: calling 'system' uses a command processor [bugprone-command-processor] popen("test", "test"); // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: calling 'popen' uses a command processor diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp index b10bd1d482867..a52bbe2246d1e 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp @@ -894,3 +894,65 @@ void pointer_exception_can_not_escape_with_void_handler() noexcept { } catch (void *) { } } + +void throw_in_uninvoked_lambda() noexcept { + [] { throw 42; }; +} + +void throw_in_lambda() noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'throw_in_lambda' which should not 
throw exceptions + [] { throw 42; }(); + // CHECK-MESSAGES: :[[@LINE-1]]:8: note: frame #0: unhandled exception of type 'int' may be thrown in function 'operator()' here + // CHECK-MESSAGES: :[[@LINE-2]]:19: note: frame #1: function 'throw_in_lambda' calls function 'operator()' here +} + +struct copy_constructor_throws { + copy_constructor_throws(const copy_constructor_throws&) { throw 42; } +}; + +void throw_in_lambda_default_by_value_capture(const copy_constructor_throws& a) noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'throw_in_lambda_default_by_value_capture' which should not throw exceptions + [=] { a; }; + // CHECK-MESSAGES: :[[@LINE-6]]:61: note: frame #0: unhandled exception of type 'int' may be thrown in function 'copy_constructor_throws' here + // CHECK-MESSAGES: :[[@LINE-2]]:4: note: frame #1: function 'throw_in_lambda_default_by_value_capture' calls function 'copy_constructor_throws' here +} + +void throw_in_lambda_explicit_by_value_capture(const copy_constructor_throws& a) noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'throw_in_lambda_explicit_by_value_capture' which should not throw exceptions + [a] {}; + // CHECK-MESSAGES: :[[@LINE-13]]:61: note: frame #0: unhandled exception of type 'int' may be thrown in function 'copy_constructor_throws' here + // CHECK-MESSAGES: :[[@LINE-2]]:4: note: frame #1: function 'throw_in_lambda_explicit_by_value_capture' calls function 'copy_constructor_throws' here +} + +void no_throw_in_lambda_by_reference_capture(const copy_constructor_throws& a) noexcept { + [&] { a; }; + [&a] {}; +} + +void throw_in_lambda_init_capture() noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'throw_in_lambda_init_capture' which should not throw exceptions + [a = [] { throw 42; return 0; }()] {}; + // CHECK-MESSAGES: :[[@LINE-1]]:13: note: frame #0: unhandled exception of type 'int' may be thrown in function 'operator()' here + // CHECK-MESSAGES: :[[@LINE-2]]:34: note: frame #1: function 'throw_in_lambda_init_capture' calls function 'operator()' here +} + +void throw_from_nested_lambda() noexcept { + // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: an exception may be thrown in function 'throw_from_nested_lambda' which should not throw exceptions + [] { [] { throw 42; }(); }(); + // CHECK-MESSAGES: :[[@LINE-1]]:13: note: frame #0: unhandled exception of type 'int' may be thrown in function 'operator()' here + // CHECK-MESSAGES: :[[@LINE-2]]:24: note: frame #1: function 'operator()' calls function 'operator()' here + // CHECK-MESSAGES: :[[@LINE-3]]:29: note: frame #2: function 'throw_from_nested_lambda' calls function 'operator()' here +} + +const auto throw_in_noexcept_lambda = [] () noexcept { throw 42; }; +// CHECK-MESSAGES: :[[@LINE-1]]:39: warning: an exception may be thrown in function 'operator()' which should not throw exceptions +// CHECK-MESSAGES: :[[@LINE-2]]:56: note: frame #0: unhandled exception of type 'int' may be thrown in function 'operator()' here + +void thrower() { + throw 42; +} + +const auto indirect_throw_in_noexcept_lambda = [] () noexcept { thrower(); }; +// CHECK-MESSAGES: :[[@LINE-1]]:48: warning: an exception may be thrown in function 'operator()' which should not throw exceptions +// CHECK-MESSAGES: :[[@LINE-5]]:3: note: frame #0: unhandled exception of type 'int' may be thrown in function 'thrower' here +// CHECK-MESSAGES: :[[@LINE-3]]:65: note: frame #1: function 'operator()' calls function 
'thrower' here diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/throw-keyword-missing.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/throw-keyword-missing.cpp index bafd3d19b5a31..0ae51780ccc00 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/throw-keyword-missing.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/throw-keyword-missing.cpp @@ -20,6 +20,7 @@ typedef basic_string string; typedef basic_string wstring; // std::exception and std::runtime_error declaration. +// CHECK-MESSAGES-DAG: [[#EXCEPTION_LINE:@LINE + 1]]:8 struct exception { exception(); exception(const exception &other); @@ -32,8 +33,9 @@ struct runtime_error : public exception { } // namespace std -// The usage of this class should never emit a warning. +// The usage of these classes should never emit a warning. struct RegularClass {}; +struct RegularDerived : public RegularClass {}; // Class name contains the substring "exception", in certain cases using this class should emit a warning. struct RegularException { @@ -41,18 +43,21 @@ struct RegularException { // Constructors with a single argument are treated differently (cxxFunctionalCastExpr). RegularException(int) {} + + typedef RegularClass RegularAlias; }; // -------------- void stdExceptionNotTrownTest(int i) { if (i < 0) - // CHECK-MESSAGES: :[[@LINE+1]]:5: warning: suspicious exception object created but not thrown; did you mean 'throw {{.*}}'? [bugprone-throw-keyword-missing] + // CHECK-MESSAGES-DAG: :[[@LINE+1]]:5: warning: suspicious exception object created but not thrown; did you mean 'throw {{.*}}'? [bugprone-throw-keyword-missing] std::exception(); if (i > 0) - // CHECK-MESSAGES: :[[@LINE+1]]:5: warning: suspicious exception + // CHECK-MESSAGES-DAG: :[[@LINE+1]]:5: warning: suspicious exception std::runtime_error("Unexpected argument"); + // CHECK-MESSAGES: note: object type inherits from base class declared here } void stdExceptionThrownTest(int i) { @@ -68,6 +73,10 @@ void regularClassNotThrownTest(int i) { RegularClass(); } +void regularClassWithAliasNotThrownTest(int i) { + RegularDerived(); +} + void regularClassThrownTest(int i) { if (i < 0) throw RegularClass(); @@ -174,6 +183,7 @@ class RegularError : public ERROR_BASE {}; void typedefTest() { // CHECK-MESSAGES: :[[@LINE+1]]:3: warning: suspicious exception RegularError(); + // CHECK-MESSAGES: :[[#EXCEPTION_LINE]]:8: note: object type inherits from base class declared here } struct ExceptionRAII { diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/type-traits.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/type-traits.cpp index 97ba1fce2a1ec..e5de9e33bccd9 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/type-traits.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/type-traits.cpp @@ -14,11 +14,25 @@ namespace std { static constexpr bool value = true; }; + template + static constexpr bool is_same_v = is_same::value; // NOLINT + template struct enable_if { using type = T; }; + template + using enable_if_t = typename enable_if::type; // NOLINT + + template + struct remove_reference { + using type = T; + }; + + template + using remove_reference_t = typename remove_reference::type; // NOLINT + template struct common_type { using type = int; @@ -126,3 +140,13 @@ namespace my_std = std; using Alias = my_std::add_const::type; // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: use c++14 style type templates // CHECK-FIXES: using Alias = my_std::add_const_t; + +template +struct 
ImplicitlyInstantiatedConstructor { + template >> + ImplicitlyInstantiatedConstructor(U) {} +}; + +const ImplicitlyInstantiatedConstructor ImplicitInstantiation(std::remove_reference::type(123)); +// CHECK-MESSAGES: :[[@LINE-1]]:68: warning: use c++14 style type templates +// CHECK-FIXES: const ImplicitlyInstantiatedConstructor ImplicitInstantiation(std::remove_reference_t(123)); diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-default-member-init.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-default-member-init.cpp index 015216c4a9d59..52b15dec37cd5 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-default-member-init.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-default-member-init.cpp @@ -596,3 +596,26 @@ class DefaultMemberInitWithArithmetic { }; } //namespace PR122480 + +namespace GH156295 { + +class NotFix { + NotFix(int v) : x(0 + 0 + (0 * 0 * (((((((v)))) - 20))) + 10)) {} + int x; +}; + +class ShouldFix { + ShouldFix(int v) : x(0 + 0 + (0 * 0 * (((((((1)))) - 20))) + 10)) {} + int x; + // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: use default member initializer for 'x' [modernize-use-default-member-init] + // CHECK-FIXES: int x{0 + 0 + (0 * 0 * (((((((1)))) - 20))) + 10)}; +}; + +} // namespace GH156295 + +namespace GH160394 { +struct A { + A(int i) : f((i & 0x1f) == 1) {} + bool f; +}; +} // namespace GH160394 diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/avoid-default-lambda-capture.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/avoid-default-lambda-capture.cpp new file mode 100644 index 0000000000000..89822440849a7 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/avoid-default-lambda-capture.cpp @@ -0,0 +1,179 @@ +// RUN: %check_clang_tidy %s readability-avoid-default-lambda-capture %t -- -- -Wno-vla-extension -std=c++20 + +void test_default_captures() { + int value = 42; + int another = 10; + + auto lambda1 = [=](int x) { return value + x; }; + // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + // CHECK-FIXES: auto lambda1 = [value](int x) { return value + x; }; + + auto lambda2 = [&](int x) { return value + x; }; + // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + // CHECK-FIXES: auto lambda2 = [&value](int x) { return value + x; }; + + auto lambda3 = [=, &another](int x) { return value + another + x; }; + // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + // CHECK-FIXES: auto lambda3 = [value, &another](int x) { return value + another + x; }; + + auto lambda4 = [&, value](int x) { return value + another + x; }; + // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + // CHECK-FIXES: auto lambda4 = [&another, value](int x) { return value + another + x; }; +} + +template +void test_pack_expansion_captures(Args... 
args) { + int local = 5; + + auto lambda1 = [=]() { return (args + ...); }; + // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + + auto lambda2 = [&]() { return (args + ...); }; + // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + + auto lambda3 = [=]() { return (args + ...) + local; }; + // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + + auto lambda4 = [&]() { return (args + ...) + local; }; + // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + + auto lambda5 = [=, ...copied = args]() { return (copied + ...); }; + // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + + auto lambda6 = [&, ...refs = args]() { return (refs + ...); }; + // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] +} + +void instantiate_pack_expansion_tests() { + test_pack_expansion_captures(1, 2, 3); + test_pack_expansion_captures(1.0, 2.0, 3.0); +} + +void test_acceptable_captures() { + int value = 42; + int another = 10; + + auto lambda1 = [value](int x) { return value + x; }; + auto lambda2 = [&value](int x) { return value + x; }; + auto lambda3 = [value, another](int x) { return value + another + x; }; + auto lambda4 = [&value, &another](int x) { return value + another + x; }; + + auto lambda5 = [](int x, int y) { return x + y; }; + + struct S { + int member = 5; + void foo() { + auto lambda = [this]() { return member; }; + } + }; +} + +void test_nested_lambdas() { + int outer_var = 1; + int middle_var = 2; + int inner_var = 3; + + auto outer = [=]() { + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + // CHECK-FIXES: auto outer = [outer_var, middle_var, inner_var]() { + + auto inner = [&](int x) { return outer_var + middle_var + inner_var + x; }; + // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + // CHECK-FIXES: auto inner = [&outer_var, &middle_var, &inner_var](int x) { return outer_var + middle_var + inner_var + x; }; + + return inner(10); + }; +} + +void test_lambda_returns() { + int a = 1, b = 2, c = 3; + + auto create_adder = [=](int x) { + // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + // CHECK-FIXES: auto create_adder = [](int x) { + return [x](int y) { return x + y; }; // Inner lambda is fine - explicit capture + }; + + auto func1 = [&]() { return a; }; + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + // CHECK-FIXES: auto func1 = [&a]() { 
return a; }; + + auto func2 = [=]() { return b; }; + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + // CHECK-FIXES: auto func2 = [b]() { return b; }; +} + +class TestClass { + int member = 42; + +public: + void test_member_function_lambdas() { + int local = 10; + + auto lambda1 = [=]() { return member + local; }; + // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + // CHECK-FIXES: auto lambda1 = [this, local]() { return member + local; }; + + auto lambda2 = [&]() { return member + local; }; + // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + // CHECK-FIXES: auto lambda2 = [this, &local]() { return member + local; }; + + auto lambda3 = [this, local]() { return member + local; }; + auto lambda4 = [this, &local]() { return member + local; }; + } +}; + +// Lambda captures dependent on a template parameter don't have a fix it +template +void test_template_lambdas() { + T value{}; + + auto lambda = [=](T x) { return value + x; }; + // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] +} + +void instantiate_templates() { + test_template_lambdas(); + test_template_lambdas(); +} + +void test_init_captures() { + int x = 3; + int nx = 5; + + int y1 = [&, z = x + 5]() -> int { + // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + // CHECK-FIXES: int y1 = [&nx, z = x + 5]() -> int { + return z * z + nx; + }(); + + int y2 = [=, &ref = x]() { + // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + // CHECK-FIXES: int y2 = [nx, &ref = x]() { + ref += 1; + return nx - ref; + }(); + + int y3 = [=, &ref = x, z = x + 5]() { + // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: lambda default captures are discouraged; prefer to capture specific variables explicitly [readability-avoid-default-lambda-capture] + // CHECK-FIXES: int y3 = [nx, &ref = x, z = x + 5]() { + ref += 2; + return nx + z - ref; + }(); + + (void)y1; + (void)y2; + (void)y3; +} + +void test_vla_no_crash() { + // VLAs create implicit VLA bound captures that cannot be written explicitly. + // No warning should be issued. 
+ int n = 5; + int vla[n]; + for (int i = 0; i < n; ++i) { + vla[i] = i * 10; + } + + auto lambda = [&]() { return vla[0]; }; +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp new file mode 100644 index 0000000000000..926cb118c77cf --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-parentheses.cpp @@ -0,0 +1,64 @@ +// RUN: %check_clang_tidy %s readability-redundant-parentheses %t + +void parenExpr() { + 1 + 1; + (1 + 1); + ((1 + 1)); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: redundant parentheses around expression [readability-redundant-parentheses] + // CHECK-FIXES: (1 + 1); + (((1 + 1))); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: redundant parentheses around expression [readability-redundant-parentheses] + // CHECK-MESSAGES: :[[@LINE-2]]:4: warning: redundant parentheses around expression [readability-redundant-parentheses] + // CHECK-FIXES: (1 + 1); + ((((1 + 1)))); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: redundant parentheses around expression [readability-redundant-parentheses] + // CHECK-MESSAGES: :[[@LINE-2]]:4: warning: redundant parentheses around expression [readability-redundant-parentheses] + // CHECK-MESSAGES: :[[@LINE-3]]:5: warning: redundant parentheses around expression [readability-redundant-parentheses] + // CHECK-FIXES: (1 + 1); +} + +#define EXP (1 + 1) +#define PAREN(e) (e) +void parenExprWithMacro() { + EXP; // 1 + (EXP); // 2 + ((EXP)); // 3 + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: redundant parentheses around expression [readability-redundant-parentheses] + // CHECK-FIXES: (EXP); // 3 + PAREN((1)); +} + +void constant() { + (1); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: redundant parentheses around expression [readability-redundant-parentheses] + // CHECK-FIXES: 1; + (1.0); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: redundant parentheses around expression [readability-redundant-parentheses] + // CHECK-FIXES: 1.0; + (true); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: redundant parentheses around expression [readability-redundant-parentheses] + // CHECK-FIXES: true; + (','); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: redundant parentheses around expression [readability-redundant-parentheses] + // CHECK-FIXES: ','; + ("v4"); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: redundant parentheses around expression [readability-redundant-parentheses] + // CHECK-FIXES: "v4"; + (nullptr); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: redundant parentheses around expression [readability-redundant-parentheses] + // CHECK-FIXES: nullptr; +} + +void declRefExpr(int a) { + (a); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: redundant parentheses around expression [readability-redundant-parentheses] + // CHECK-FIXES: a; +} + +void exceptions() { + sizeof(1); + alignof(2); + alignof((3)); + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: redundant parentheses around expression [readability-redundant-parentheses] + // CHECK-FIXES: alignof(3); +} diff --git a/clang-tools-extra/unittests/clang-doc/HTMLMustacheGeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/HTMLMustacheGeneratorTest.cpp index 602058f5d9eb8..c7ac387ecf7c3 100644 --- a/clang-tools-extra/unittests/clang-doc/HTMLMustacheGeneratorTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/HTMLMustacheGeneratorTest.cpp @@ -12,9 +12,7 @@ #include "config.h" #include "support/Utils.h" #include "clang/Basic/Version.h" -#include 
"llvm/Support/Path.h" #include "llvm/Testing/Support/Error.h" -#include "llvm/Testing/Support/SupportHelpers.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -48,41 +46,10 @@ getClangDocContext(std::vector UserStylesheets = {}, return CDCtx; } -static void verifyFileContents(const Twine &Path, StringRef Contents) { - auto Buffer = MemoryBuffer::getFile(Path); - ASSERT_TRUE((bool)Buffer); - StringRef Data = Buffer.get()->getBuffer(); - ASSERT_EQ(Data, Contents); -} - TEST(HTMLMustacheGeneratorTest, createResources) { auto G = getHTMLMustacheGenerator(); ASSERT_THAT(G, NotNull()) << "Could not find HTMLMustacheGenerator"; ClangDocContext CDCtx = getClangDocContext(); EXPECT_THAT_ERROR(G->createResources(CDCtx), Failed()) << "Empty UserStylesheets or JsScripts should fail!"; - - unittest::TempDir RootTestDirectory("createResourcesTest", /*Unique=*/true); - CDCtx.OutDirectory = RootTestDirectory.path(); - - unittest::TempFile CSS("clang-doc-mustache", "css", "CSS"); - unittest::TempFile JS("mustache", "js", "JavaScript"); - - CDCtx.UserStylesheets[0] = CSS.path(); - CDCtx.JsScripts[0] = JS.path(); - - EXPECT_THAT_ERROR(G->createResources(CDCtx), Succeeded()) - << "Failed to create resources with valid UserStylesheets and JsScripts"; - { - SmallString<256> PathBuf; - llvm::sys::path::append(PathBuf, RootTestDirectory.path(), - "clang-doc-mustache.css"); - verifyFileContents(PathBuf, "CSS"); - } - - { - SmallString<256> PathBuf; - llvm::sys::path::append(PathBuf, RootTestDirectory.path(), "mustache.js"); - verifyFileContents(PathBuf, "JavaScript"); - } } diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index 1bb73599970c1..4eaa712899856 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -80,6 +80,12 @@ if(CLANG_BUILT_STANDALONE) include(GetErrcMessages) include(LLVMDistributionSupport) + if(CMAKE_CROSSCOMPILING) + set(LLVM_USE_HOST_TOOLS ON) + include(CrossCompile) + llvm_create_cross_target(Clang NATIVE "" Release) + endif() + set(PACKAGE_VERSION "${LLVM_PACKAGE_VERSION}") set(BUG_REPORT_URL "${LLVM_PACKAGE_BUGREPORT}" CACHE STRING "Default URL where bug reports are to be submitted.") diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py index 13a91d83ede1c..80140d2787608 100644 --- a/clang/bindings/python/clang/cindex.py +++ b/clang/bindings/python/clang/cindex.py @@ -1435,58 +1435,61 @@ def is_unexposed(self): OMP_SCOPE_DIRECTIVE = 306 # OpenMP reverse directive. - OMPReverseDirective = 307 + OMP_REVERSE_DIRECTIVE = 307 # OpenMP interchange directive. - OMPInterchangeDirective = 308 + OMP_INTERCHANGE_DIRECTIVE = 308 # OpenMP assume directive. - OMPAssumeDirective = 309 + OMP_ASSUME_DIRECTIVE = 309 # OpenMP stripe directive. OMP_STRIPE_DIRECTIVE = 310 + # OpenMP fuse directive. + OMP_FUSE_DIRECTIVE = 311 + # OpenACC Compute Construct. OPEN_ACC_COMPUTE_DIRECTIVE = 320 # OpenACC Loop Construct. - OpenACCLoopConstruct = 321 + OPEN_ACC_LOOP_CONSTRUCT = 321 # OpenACC Combined Constructs. - OpenACCCombinedConstruct = 322 + OPEN_ACC_COMBINED_CONSTRUCT = 322 # OpenACC data Construct. - OpenACCDataConstruct = 323 + OPEN_ACC_DATA_CONSTRUCT = 323 # OpenACC enter data Construct. - OpenACCEnterDataConstruct = 324 + OPEN_ACC_ENTER_DATA_CONSTRUCT = 324 # OpenACC exit data Construct. - OpenACCExitDataConstruct = 325 + OPEN_ACC_EXIT_DATA_CONSTRUCT = 325 # OpenACC host_data Construct. - OpenACCHostDataConstruct = 326 + OPEN_ACC_HOST_DATA_CONSTRUCT = 326 # OpenACC wait Construct. 
- OpenACCWaitConstruct = 327 + OPEN_ACC_WAIT_CONSTRUCT = 327 # OpenACC init Construct. - OpenACCInitConstruct = 328 + OPEN_ACC_INIT_CONSTRUCT = 328 # OpenACC shutdown Construct. - OpenACCShutdownConstruct = 329 + OPEN_ACC_SHUTDOWN_CONSTRUCT = 329 # OpenACC set Construct. - OpenACCSetConstruct = 330 + OPEN_ACC_SET_CONSTRUCT = 330 # OpenACC update Construct. - OpenACCUpdateConstruct = 331 + OPEN_ACC_UPDATE_CONSTRUCT = 331 # OpenACC atomic Construct. - OpenACCAtomicConstruct = 332 + OPEN_ACC_ATOMIC_CONSTRUCT = 332 # OpenACC cache Construct. - OpenACCCacheConstruct = 333 + OPEN_ACC_CACHE_CONSTRUCT = 333 ### # Other Kinds diff --git a/clang/cmake/caches/PGO.cmake b/clang/cmake/caches/PGO.cmake index 15bc755d110d1..d6471160037c1 100644 --- a/clang/cmake/caches/PGO.cmake +++ b/clang/cmake/caches/PGO.cmake @@ -5,7 +5,7 @@ set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "") set(LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "") set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "") -set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED ON CACHE BOOL "") +set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED IR CACHE BOOL "") set(CLANG_BOOTSTRAP_TARGETS generate-profdata stage2 diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 25f4e3b3fbd26..b503283559db4 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -950,7 +950,8 @@ Each builtin accesses memory according to a provided boolean mask. These are provided as ``__builtin_masked_load`` and ``__builtin_masked_store``. The first argument is always boolean mask vector. The ``__builtin_masked_load`` builtin takes an optional third vector argument that will be used for the result of the -masked-off lanes. These builtins assume the memory is always aligned. +masked-off lanes. These builtins assume the memory is unaligned, use +``__builtin_assume_aligned`` if alignment is desired. The ``__builtin_masked_expand_load`` and ``__builtin_masked_compress_store`` builtins have the same interface but store the result in consecutive indices. @@ -969,17 +970,17 @@ Example: using v8b = bool [[clang::ext_vector_type(8)]]; using v8i = int [[clang::ext_vector_type(8)]]; - v8i load(v8b mask, v8i *ptr) { return __builtin_masked_load(mask, ptr); } + v8i load(v8b mask, int *ptr) { return __builtin_masked_load(mask, ptr); } - v8i load_expand(v8b mask, v8i *ptr) { + v8i load_expand(v8b mask, int *ptr) { return __builtin_masked_expand_load(mask, ptr); } - void store(v8b mask, v8i val, v8i *ptr) { + void store(v8b mask, v8i val, int *ptr) { __builtin_masked_store(mask, val, ptr); } - void store_compress(v8b mask, v8i val, v8i *ptr) { + void store_compress(v8b mask, v8i val, int *ptr) { __builtin_masked_compress_store(mask, val, ptr); } diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index b09bd9ce6ae9a..68ca7bedddb06 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -360,6 +360,7 @@ information or if you want to help with the implementation. + +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ |Feature | C/C++ Status | Fortran Status | Reviews | +=============================================================+===========================+===========================+==========================================================================+ @@ -407,8 +408,16 @@ implementation. 
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ + +.. _OpenMP 5.2 Deprecations: + +OpenMP 5.2 Deprecations +======================= + + + +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -|OpenMP 5.2 Deprecations | C/C++ Status | Fortran Status | Reviews | +| | C/C++ Status | Fortran Status | Reviews | +=============================================================+===========================+===========================+==========================================================================+ | Linear clause syntax | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ @@ -473,6 +482,8 @@ implementation. +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | loop transformation apply clause | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ +| loop fuse transformation | :good:`done` | :none:`unclaimed` | | ++-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | workdistribute construct | | :none:`in progress` | @skc7, @mjklemm | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | task_iteration | :none:`unclaimed` | :none:`unclaimed` | | diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 70c82b090107a..270b5d336eba7 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -150,6 +150,10 @@ C++ Language Changes C++2c Feature Support ^^^^^^^^^^^^^^^^^^^^^ +- Started the implementation of `P2686R5 `_ Constexpr structured bindings. + At this timem, references to constexpr and decomposition of *tuple-like* types are not supported + (only arrays and aggregates are). + C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ @@ -268,6 +272,9 @@ Attribute Changes in Clang attribute, allowing the attribute to only be attached to the declaration. Prior, this would be treated as an error where the definition and declaration would have differing types. +- New format attributes ``gnu_printf``, ``gnu_scanf``, ``gnu_strftime`` and ``gnu_strfmon`` are added + as aliases for ``printf``, ``scanf``, ``strftime`` and ``strfmon``. 
(#GH16219) + Improvements to Clang's diagnostics ----------------------------------- - Added a separate diagnostic group ``-Wfunction-effect-redeclarations``, for the more pedantic @@ -292,7 +299,8 @@ Improvements to Clang's diagnostics "format specifies type 'unsigned int' but the argument has type 'int', which differs in signedness [-Wformat-signedness]" "signedness of format specifier 'u' is incompatible with 'c' [-Wformat-signedness]" and the API-visible diagnostic id will be appropriate. - +- Clang now produces better diagnostics for template template parameter matching + involving 'auto' template parameters. - Fixed false positives in ``-Waddress-of-packed-member`` diagnostics when potential misaligned members get processed before they can get discarded. (#GH144729) @@ -353,6 +361,7 @@ Bug Fixes in This Version first parameter. (#GH113323). - Fixed a crash with incompatible pointer to integer conversions in designated initializers involving string literals. (#GH154046) +- Fix crash on CTAD for alias template. (#GH131342) - Clang now emits a frontend error when a function marked with the `flatten` attribute calls another function that requires target features not enabled in the caller. This prevents a fatal error in the backend. @@ -422,6 +431,7 @@ Bug Fixes to C++ Support ``__builtin_addressof``, and related issues with builtin arguments. (#GH154034) - Fix an assertion failure when taking the address on a non-type template parameter argument of object type. (#GH151531) +- Suppress ``-Wdouble-promotion`` when explicitly asked for with C++ list initialization (#GH33409). Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -558,6 +568,7 @@ Crash and bug fixes - Fixed a crash in the static analyzer that when the expression in an ``[[assume(expr)]]`` attribute was enclosed in parentheses. (#GH151529) - Fixed a crash when parsing ``#embed`` parameters with unmatched closing brackets. (#GH152829) +- Fixed a crash when compiling ``__real__`` or ``__imag__`` unary operator on scalar value with type promotion. (#GH160583) Improvements ^^^^^^^^^^^^ @@ -572,7 +583,9 @@ Sanitizers Python Binding Changes ---------------------- -- Exposed `clang_getCursorLanguage` via `Cursor.language`. +- Exposed ``clang_getCursorLanguage`` via ``Cursor.language``. +- Add all missing ``CursorKind``s, ``TypeKind``s and + ``ExceptionSpecificationKind``s from ``Index.h`` OpenMP Support -------------- @@ -586,6 +599,7 @@ OpenMP Support - Added support for ``defaultmap`` directive implicit-behavior ``storage``. - Added support for ``defaultmap`` directive implicit-behavior ``private``. - Added parsing and semantic analysis support for ``groupprivate`` directive. +- Added support for 'omp fuse' directive. Improvements ^^^^^^^^^^^^ diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index be038d9165fc6..f13d9c9307b40 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -2162,6 +2162,10 @@ enum CXCursorKind { */ CXCursor_OMPStripeDirective = 310, + /** OpenMP fuse directive + */ + CXCursor_OMPFuseDirective = 311, + /** OpenACC Compute Construct. 
*/ CXCursor_OpenACCComputeConstruct = 320, diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index a2c55c71e09ae..12351e98e5a2b 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -25,10 +25,12 @@ #include "clang/AST/RawCommentList.h" #include "clang/AST/SYCLKernelInfo.h" #include "clang/AST/TemplateName.h" +#include "clang/AST/TypeOrdering.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/PartialDiagnostic.h" #include "clang/Basic/SourceLocation.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/IntrusiveRefCntPtr.h" @@ -51,6 +53,36 @@ class FixedPointSemantics; struct fltSemantics; template class SmallPtrSet; +struct ScalableVecTyKey { + clang::QualType EltTy; + unsigned NumElts; + unsigned NumFields; + + bool operator==(const ScalableVecTyKey &RHS) const { + return EltTy == RHS.EltTy && NumElts == RHS.NumElts && + NumFields == RHS.NumFields; + } +}; + +// Provide a DenseMapInfo specialization so that ScalableVecTyKey can be used +// as a key in DenseMap. +template <> struct DenseMapInfo { + static inline ScalableVecTyKey getEmptyKey() { + return {DenseMapInfo::getEmptyKey(), ~0U, ~0U}; + } + static inline ScalableVecTyKey getTombstoneKey() { + return {DenseMapInfo::getTombstoneKey(), ~0U, ~0U}; + } + static unsigned getHashValue(const ScalableVecTyKey &Val) { + return hash_combine(DenseMapInfo::getHashValue(Val.EltTy), + Val.NumElts, Val.NumFields); + } + static bool isEqual(const ScalableVecTyKey &LHS, + const ScalableVecTyKey &RHS) { + return LHS == RHS; + } +}; + } // namespace llvm namespace clang { @@ -505,6 +537,9 @@ class ASTContext : public RefCountedBase { SmallVector> ObjCSubClasses; + // A mapping from Scalable Vector Type keys to their corresponding QualType. + mutable llvm::DenseMap ScalableVecTyMap; + ASTContext &this_() { return *this; } public: diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 9fedb230ce397..5f16bac94d5e6 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -4714,7 +4714,7 @@ class SubstNonTypeTemplateParmExpr : public Expr { // sugared: it doesn't need to be resugared later. 
bool getFinal() const { return Final; } - NamedDecl *getParameter() const; + NonTypeTemplateParmDecl *getParameter() const; bool isReferenceParameter() const { return AssociatedDeclAndRef.getInt(); } diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h index 081244fe0efb6..5f06117d65a47 100644 --- a/clang/include/clang/AST/OpenACCClause.h +++ b/clang/include/clang/AST/OpenACCClause.h @@ -842,9 +842,7 @@ struct OpenACCPrivateRecipe { VarDecl *AllocaDecl; Expr *InitExpr; - OpenACCPrivateRecipe(VarDecl *A, Expr *I) : AllocaDecl(A), InitExpr(I) { - assert(!AllocaDecl || AllocaDecl->getInit() == nullptr); - } + OpenACCPrivateRecipe(VarDecl *A, Expr *I) : AllocaDecl(A), InitExpr(I) {} bool isSet() const { return AllocaDecl; } diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index b2a6d4b9182b0..68d220a77b18c 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -1149,6 +1149,80 @@ class OMPFullClause final : public OMPNoChildClause { static OMPFullClause *CreateEmpty(const ASTContext &C); }; +/// This class represents the 'looprange' clause in the +/// '#pragma omp fuse' directive +/// +/// \code {c} +/// #pragma omp fuse looprange(1,2) +/// { +/// for(int i = 0; i < 64; ++i) +/// for(int j = 0; j < 256; j+=2) +/// for(int k = 127; k >= 0; --k) +/// \endcode +class OMPLoopRangeClause final : public OMPClause { + friend class OMPClauseReader; + /// Location of '(' + SourceLocation LParenLoc; + + /// Location of first and count expressions + SourceLocation FirstLoc, CountLoc; + + /// Number of looprange arguments (always 2: first, count) + enum { FirstExpr, CountExpr, NumArgs }; + Stmt *Args[NumArgs] = {nullptr, nullptr}; + + /// Set looprange 'first' expression + void setFirst(Expr *E) { Args[FirstExpr] = E; } + + /// Set looprange 'count' expression + void setCount(Expr *E) { Args[CountExpr] = E; } + + /// Build an empty clause for deserialization. + explicit OMPLoopRangeClause() + : OMPClause(llvm::omp::OMPC_looprange, {}, {}) {} + +public: + /// Build a 'looprange' clause AST node. + static OMPLoopRangeClause * + Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation FirstLoc, SourceLocation CountLoc, + SourceLocation EndLoc, Expr *First, Expr *Count); + + /// Build an empty 'looprange' clause node. 
+ static OMPLoopRangeClause *CreateEmpty(const ASTContext &C); + + // Location getters/setters + SourceLocation getLParenLoc() const { return LParenLoc; } + SourceLocation getFirstLoc() const { return FirstLoc; } + SourceLocation getCountLoc() const { return CountLoc; } + + void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; } + void setFirstLoc(SourceLocation Loc) { FirstLoc = Loc; } + void setCountLoc(SourceLocation Loc) { CountLoc = Loc; } + + /// Get looprange 'first' expression + Expr *getFirst() const { return cast_or_null(Args[FirstExpr]); } + + /// Get looprange 'count' expression + Expr *getCount() const { return cast_or_null(Args[CountExpr]); } + + child_range children() { return child_range(Args, Args + NumArgs); } + const_child_range children() const { + return const_child_range(Args, Args + NumArgs); + } + + child_range used_children() { + return child_range(child_iterator(), child_iterator()); + } + const_child_range used_children() const { + return const_child_range(const_child_iterator(), const_child_iterator()); + } + + static bool classof(const OMPClause *T) { + return T->getClauseKind() == llvm::omp::OMPC_looprange; + } +}; + /// Representation of the 'partial' clause of the '#pragma omp unroll' /// directive. /// @@ -5816,6 +5890,12 @@ class OMPClauseMappableExprCommon { ValueDecl *getAssociatedDeclaration() const { return AssociatedDeclaration; } + + bool operator==(const MappableComponent &Other) const { + return AssociatedExpressionNonContiguousPr == + Other.AssociatedExpressionNonContiguousPr && + AssociatedDeclaration == Other.AssociatedDeclaration; + } }; // List of components of an expression. This first one is the whole @@ -5829,6 +5909,95 @@ class OMPClauseMappableExprCommon { using MappableExprComponentLists = SmallVector; using MappableExprComponentListsRef = ArrayRef; + // Hash function to allow usage as DenseMap keys. + friend llvm::hash_code hash_value(const MappableComponent &MC) { + return llvm::hash_combine(MC.getAssociatedExpression(), + MC.getAssociatedDeclaration(), + MC.isNonContiguous()); + } + +public: + /// Get the type of an element of a ComponentList Expr \p Exp. + /// + /// For something like the following: + /// ```c + /// int *p, **p; + /// ``` + /// The types for the following Exprs would be: + /// Expr | Type + /// ---------|----------- + /// p | int * + /// *p | int + /// p[0] | int + /// p[0:1] | int + /// pp | int ** + /// pp[0] | int * + /// pp[0:1] | int * + /// Note: this assumes that if \p Exp is an array-section, it is contiguous. + static QualType getComponentExprElementType(const Expr *Exp); + + /// Find the attach pointer expression from a list of mappable expression + /// components. + /// + /// This function traverses the component list to find the first + /// expression that has a pointer type, which represents the attach + /// base pointer expr for the current component-list. 
+ /// + /// For example, given the following: + /// + /// ```c + /// struct S { + /// int a; + /// int b[10]; + /// int c[10][10]; + /// int *p; + /// int **pp; + /// } + /// S s, *ps, **pps, *(pas[10]), ***ppps; + /// int i; + /// ``` + /// + /// The base-pointers for the following map operands would be: + /// map list-item | attach base-pointer | attach base-pointer + /// | for directives except | target_update (if + /// | target_update | different) + /// ----------------|-----------------------|--------------------- + /// s | N/A | + /// s.a | N/A | + /// s.p | N/A | + /// ps | N/A | + /// ps->p | ps | + /// ps[1] | ps | + /// *(ps + 1) | ps | + /// (ps + 1)[1] | ps | + /// ps[1:10] | ps | + /// ps->b[10] | ps | + /// ps->p[10] | ps->p | + /// ps->c[1][2] | ps | + /// ps->c[1:2][2] | (error diagnostic) | N/A, TODO: ps + /// ps->c[1:1][2] | ps | N/A, TODO: ps + /// pps[1][2] | pps[1] | + /// pps[1:1][2] | pps[1:1] | N/A, TODO: pps[1:1] + /// pps[1:i][2] | pps[1:i] | N/A, TODO: pps[1:i] + /// pps[1:2][2] | (error diagnostic) | N/A + /// pps[1]->p | pps[1] | + /// pps[1]->p[10] | pps[1] | + /// pas[1] | N/A | + /// pas[1][2] | pas[1] | + /// ppps[1][2] | ppps[1] | + /// ppps[1][2][3] | ppps[1][2] | + /// ppps[1][2:1][3] | ppps[1][2:1] | N/A, TODO: ppps[1][2:1] + /// ppps[1][2:2][3] | (error diagnostic) | N/A + /// Returns a pair of the attach pointer expression and its depth in the + /// component list. + /// TODO: This may need to be updated to handle ref_ptr/ptee cases for byref + /// map operands. + /// TODO: Handle cases for target-update, where the list-item is a + /// non-contiguous array-section that still has a base-pointer. + static std::pair> + findAttachPtrExpr(MappableExprComponentListRef Components, + OpenMPDirectiveKind CurDirKind); + protected: // Return the total number of elements in a list of component lists. static unsigned diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 1d1b7f183f75a..7a2881f6124f3 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -2194,6 +2194,7 @@ bool RecursiveASTVisitor::TraverseTemplateArgumentLocsHelper( is the only callback that's made for this instantiation. \ We use getTemplateArgsAsWritten() to distinguish. */ \ if (const auto *ArgsWritten = D->getTemplateArgsAsWritten()) { \ + assert(D->getTemplateSpecializationKind() != TSK_ImplicitInstantiation); \ /* The args that remains unspecialized. 
*/ \ TRY_TO(TraverseTemplateArgumentLocsHelper( \ ArgsWritten->getTemplateArgs(), ArgsWritten->NumTemplateArgs)); \ @@ -3176,6 +3177,9 @@ DEF_TRAVERSE_STMT(OMPUnrollDirective, DEF_TRAVERSE_STMT(OMPReverseDirective, { TRY_TO(TraverseOMPExecutableDirective(S)); }) +DEF_TRAVERSE_STMT(OMPFuseDirective, + { TRY_TO(TraverseOMPExecutableDirective(S)); }) + DEF_TRAVERSE_STMT(OMPInterchangeDirective, { TRY_TO(TraverseOMPExecutableDirective(S)); }) @@ -3493,6 +3497,14 @@ bool RecursiveASTVisitor::VisitOMPFullClause(OMPFullClause *C) { return true; } +template +bool RecursiveASTVisitor::VisitOMPLoopRangeClause( + OMPLoopRangeClause *C) { + TRY_TO(TraverseStmt(C->getFirst())); + TRY_TO(TraverseStmt(C->getCount())); + return true; +} + template bool RecursiveASTVisitor::VisitOMPPartialClause(OMPPartialClause *C) { TRY_TO(TraverseStmt(C->getFactor())); diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h index d9f87f1e49b40..bc6aeaa8d143c 100644 --- a/clang/include/clang/AST/StmtOpenMP.h +++ b/clang/include/clang/AST/StmtOpenMP.h @@ -21,6 +21,7 @@ #include "clang/AST/StmtCXX.h" #include "clang/Basic/OpenMPKinds.h" #include "clang/Basic/SourceLocation.h" +#include "llvm/Support/Casting.h" namespace clang { @@ -677,6 +678,10 @@ class OMPParallelDirective : public OMPExecutableDirective { } }; +// Forward declaration of a generic loop transformation. Used in the declaration +// of OMPLoopBasedDirective. +class OMPLoopTransformationDirective; + /// The base class for all loop-based directives, including loop transformation /// directives. class OMPLoopBasedDirective : public OMPExecutableDirective { @@ -889,24 +894,23 @@ class OMPLoopBasedDirective : public OMPExecutableDirective { /// Calls the specified callback function for all the loops in \p CurStmt, /// from the outermost to the innermost. - static bool doForAllLoops( - Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops, - llvm::function_ref Callback, - llvm::function_ref - OnTransformationCallback); + static bool + doForAllLoops(Stmt *CurStmt, bool TryImperfectlyNestedLoops, + unsigned NumLoops, + llvm::function_ref Callback, + llvm::function_ref + OnTransformationCallback); static bool doForAllLoops(const Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops, llvm::function_ref Callback, - llvm::function_ref< - void(const OMPCanonicalLoopNestTransformationDirective *)> + llvm::function_ref OnTransformationCallback) { auto &&NewCallback = [Callback](unsigned Cnt, Stmt *CurStmt) { return Callback(Cnt, CurStmt); }; auto &&NewTransformCb = - [OnTransformationCallback]( - OMPCanonicalLoopNestTransformationDirective *A) { + [OnTransformationCallback](OMPLoopTransformationDirective *A) { OnTransformationCallback(A); }; return doForAllLoops(const_cast(CurStmt), TryImperfectlyNestedLoops, @@ -919,7 +923,7 @@ class OMPLoopBasedDirective : public OMPExecutableDirective { doForAllLoops(Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops, llvm::function_ref Callback) { - auto &&TransformCb = [](OMPCanonicalLoopNestTransformationDirective *) {}; + auto &&TransformCb = [](OMPLoopTransformationDirective *) {}; return doForAllLoops(CurStmt, TryImperfectlyNestedLoops, NumLoops, Callback, TransformCb); } @@ -957,9 +961,11 @@ class OMPLoopBasedDirective : public OMPExecutableDirective { }; /// Common class of data shared between -/// OMPCanonicalLoopNestTransformationDirective and transformations over -/// canonical loop sequences. 
+/// OMPCanonicalLoopNestTransformationDirective and +/// OMPCanonicalLoopSequenceTransformationDirective class OMPLoopTransformationDirective { + friend class ASTStmtReader; + /// Number of (top-level) generated loops. /// This value is 1 for most transformations as they only map one loop nest /// into another. @@ -969,15 +975,39 @@ class OMPLoopTransformationDirective { /// generate more than one loop nest, so the value would be >= 1. unsigned NumGeneratedTopLevelLoops = 1; + /// We need this because we cannot easily make OMPLoopTransformationDirective + /// a proper Stmt. + Stmt *S = nullptr; + protected: void setNumGeneratedTopLevelLoops(unsigned N) { NumGeneratedTopLevelLoops = N; } + explicit OMPLoopTransformationDirective(Stmt *S) : S(S) {} + public: unsigned getNumGeneratedTopLevelLoops() const { return NumGeneratedTopLevelLoops; } + + /// Returns the specific directive related to this loop transformation. + Stmt *getDirective() const { return S; } + + /// Get the de-sugared statements after the loop transformation. + /// + /// Might be nullptr if either the directive generates no loops and is handled + /// directly in CodeGen, or resolving a template-dependence context is + /// required. + Stmt *getTransformedStmt() const; + + /// Return preinits statement. + Stmt *getPreInits() const; + + static bool classof(const Stmt *T) { + return isa(T); + } }; /// The base class for all transformation directives of canonical loop nests. @@ -990,7 +1020,8 @@ class OMPCanonicalLoopNestTransformationDirective explicit OMPCanonicalLoopNestTransformationDirective( StmtClass SC, OpenMPDirectiveKind Kind, SourceLocation StartLoc, SourceLocation EndLoc, unsigned NumAssociatedLoops) - : OMPLoopBasedDirective(SC, Kind, StartLoc, EndLoc, NumAssociatedLoops) {} + : OMPLoopBasedDirective(SC, Kind, StartLoc, EndLoc, NumAssociatedLoops), + OMPLoopTransformationDirective(this) {} public: /// Return the number of associated (consumed) loops. @@ -5928,6 +5959,112 @@ class OMPInterchangeDirective final } }; +/// The base class for all transformation directives of canonical loop +/// sequences (currently only 'fuse') +class OMPCanonicalLoopSequenceTransformationDirective + : public OMPExecutableDirective, + public OMPLoopTransformationDirective { + friend class ASTStmtReader; + +protected: + explicit OMPCanonicalLoopSequenceTransformationDirective( + StmtClass SC, OpenMPDirectiveKind Kind, SourceLocation StartLoc, + SourceLocation EndLoc) + : OMPExecutableDirective(SC, Kind, StartLoc, EndLoc), + OMPLoopTransformationDirective(this) {} + +public: + /// Get the de-sugared statements after the loop transformation. + /// + /// Might be nullptr if either the directive generates no loops and is handled + /// directly in CodeGen, or resolving a template-dependence context is + /// required. + Stmt *getTransformedStmt() const; + + /// Return preinits statement. + Stmt *getPreInits() const; + + static bool classof(const Stmt *T) { + Stmt::StmtClass C = T->getStmtClass(); + return C == OMPFuseDirectiveClass; + } +}; + +/// Represents the '#pragma omp fuse' loop transformation directive +/// +/// \code{c} +/// #pragma omp fuse +/// { +/// for(int i = 0; i < m1; ++i) {...} +/// for(int j = 0; j < m2; ++j) {...} +/// ... +/// } +/// \endcode +class OMPFuseDirective final + : public OMPCanonicalLoopSequenceTransformationDirective { + friend class ASTStmtReader; + friend class OMPExecutableDirective; + + // Offsets of child members. 
+ enum { + PreInitsOffset = 0, + TransformedStmtOffset, + }; + + explicit OMPFuseDirective(SourceLocation StartLoc, SourceLocation EndLoc) + : OMPCanonicalLoopSequenceTransformationDirective( + OMPFuseDirectiveClass, llvm::omp::OMPD_fuse, StartLoc, EndLoc) {} + + void setPreInits(Stmt *PreInits) { + Data->getChildren()[PreInitsOffset] = PreInits; + } + + void setTransformedStmt(Stmt *S) { + Data->getChildren()[TransformedStmtOffset] = S; + } + +public: + /// Create a new AST node representation for '#pragma omp fuse' + /// + /// \param C Context of the AST + /// \param StartLoc Location of the introducer (e.g. the 'omp' token) + /// \param EndLoc Location of the directive's end (e.g. the tok::eod) + /// \param Clauses The directive's clauses + /// \param NumLoops Total number of loops in the canonical loop sequence. + /// \param NumGeneratedTopLevelLoops Number of top-level generated loops. + /// Typically 1, but the looprange clause can change this. + /// \param AssociatedStmt The outermost associated loop + /// \param TransformedStmt The loop nest after fusion, or nullptr in + /// dependent contexts + /// \param PreInits Helper preinits statements for the loop nest + static OMPFuseDirective * + Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, + ArrayRef Clauses, unsigned NumGeneratedTopLevelLoops, + Stmt *AssociatedStmt, Stmt *TransformedStmt, Stmt *PreInits); + + /// Build an empty '#pragma omp fuse' AST node for deserialization + /// + /// \param C Context of the AST + /// \param NumClauses Number of clauses to allocate + /// \param NumLoops Number of top level loops to allocate + static OMPFuseDirective *CreateEmpty(const ASTContext &C, + unsigned NumClauses); + + /// Gets the associated loops after the transformation. This is the de-sugared + /// replacement or nullptr in dependent contexts. + Stmt *getTransformedStmt() const { + return Data->getChildren()[TransformedStmtOffset]; + } + + /// Return preinits statement. + Stmt *getPreInits() const { return Data->getChildren()[PreInitsOffset]; } + + static bool classof(const Stmt *T) { + return T->getStmtClass() == OMPFuseDirectiveClass; + } +}; + /// This represents '#pragma omp scan' directive. /// /// \code @@ -6596,4 +6733,37 @@ class OMPAssumeDirective final : public OMPExecutableDirective { } // end namespace clang +namespace llvm { +// Allow a Stmt* to be cast correctly to an OMPLoopTransformationDirective*. +// The default routines would just use a C-style cast which won't work well +// for the multiple inheritance here. We have to use a static cast from the +// corresponding subclass.
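For illustration, here is the kind of source construct the new OMPFuseDirective models; this sketch is not taken from the patch, and the loop bodies and looprange arguments are invented:

// Hypothetical usage: fuse loops 2 and 3 of the sequence; loop 1 is untouched.
#pragma omp fuse looprange(2, 2)
{
  for (int i = 0; i < n; ++i) a[i] = 0;
  for (int j = 0; j < n; ++j) b[j] = 1;
  for (int k = 0; k < n; ++k) c[k] = 2;
}

With the CastInfo specialization that follows, generic code can reach the shared OMPLoopTransformationDirective interface from a plain Stmt*, even though the concrete directives inherit it through different base classes. A minimal sketch, assuming the declarations added in this patch:

static void inspectTransform(clang::Stmt *S) {
  if (auto *TD = llvm::dyn_cast<clang::OMPLoopTransformationDirective>(S)) {
    // Covers tile/unroll/reverse/interchange as well as the new fuse directive.
    unsigned NumLoops = TD->getNumGeneratedTopLevelLoops();
    clang::Stmt *Generated = TD->getTransformedStmt(); // null in dependent contexts
    (void)NumLoops;
    (void)Generated;
  }
}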
+template <> +struct CastInfo + : public NullableValueCastFailed, + public DefaultDoCastIfPossible< + clang::OMPLoopTransformationDirective *, clang::Stmt *, + CastInfo> { + static bool isPossible(const clang::Stmt *T) { + return clang::OMPLoopTransformationDirective::classof(T); + } + + static clang::OMPLoopTransformationDirective *doCast(clang::Stmt *T) { + if (auto *D = + dyn_cast(T)) + return static_cast(D); + if (auto *D = + dyn_cast(T)) + return static_cast(D); + llvm_unreachable("unexpected type"); + } +}; +template <> +struct CastInfo + : public ConstStrippingForwardingCast< + clang::OMPLoopTransformationDirective, const clang::Stmt *, + CastInfo> {}; + +} // namespace llvm + #endif diff --git a/clang/include/clang/AST/TypeBase.h b/clang/include/clang/AST/TypeBase.h index b02d9c7499fe5..e0d00b82f2b76 100644 --- a/clang/include/clang/AST/TypeBase.h +++ b/clang/include/clang/AST/TypeBase.h @@ -3495,7 +3495,9 @@ class AdjustedType : public Type, public llvm::FoldingSetNode { AdjustedType(TypeClass TC, QualType OriginalTy, QualType AdjustedTy, QualType CanonicalPtr) - : Type(TC, CanonicalPtr, OriginalTy->getDependence()), + : Type(TC, CanonicalPtr, + AdjustedTy->getDependence() | + (OriginalTy->getDependence() & ~TypeDependence::Dependent)), OriginalTy(OriginalTy), AdjustedTy(AdjustedTy) {} public: diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h index 492863ddfc4a1..0a0d42ca259b8 100644 --- a/clang/include/clang/ASTMatchers/ASTMatchers.h +++ b/clang/include/clang/ASTMatchers/ASTMatchers.h @@ -5096,6 +5096,20 @@ AST_MATCHER_P(LambdaCapture, capturesVar, internal::Matcher, /// matches `[this]() { return cc; }`. AST_MATCHER(LambdaCapture, capturesThis) { return Node.capturesThis(); } +/// Matches lambda expressions that have default capture modes. +/// +/// Given +/// \code +/// auto l1 = [=]() {}; // matches +/// auto l2 = [&]() {}; // matches +/// auto l3 = []() {}; // does not match +/// \endcode +/// lambdaExpr(hasDefaultCapture()) +/// matches l1 and l2, but not l3. +AST_MATCHER(LambdaExpr, hasDefaultCapture) { + return Node.getCaptureDefault() != LCD_None; +} + /// Matches a constructor call expression which uses list initialization. AST_MATCHER(CXXConstructExpr, isListInitialization) { return Node.isListInitialization(); diff --git a/clang/include/clang/Analysis/Analyses/LifetimeAnnotations.h b/clang/include/clang/Analysis/Analyses/LifetimeAnnotations.h new file mode 100644 index 0000000000000..229d16c20b0f8 --- /dev/null +++ b/clang/include/clang/Analysis/Analyses/LifetimeAnnotations.h @@ -0,0 +1,44 @@ +//===- LifetimeAnnotations.h - -*--------------- C++--------------------*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Helper functions to inspect and infer lifetime annotations. +//===----------------------------------------------------------------------===// +#ifndef LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMEANNOTATIONS_H +#define LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMEANNOTATIONS_H + +#include "clang/AST/DeclCXX.h" + +namespace clang { +namespace lifetimes { + +/// Returns the most recent declaration of the method to ensure all +/// lifetime-bound attributes from redeclarations are considered. 
+const FunctionDecl *getDeclWithMergedLifetimeBoundAttrs(const FunctionDecl *FD); + +/// Returns the most recent declaration of the method to ensure all +/// lifetime-bound attributes from redeclarations are considered. +const CXXMethodDecl * +getDeclWithMergedLifetimeBoundAttrs(const CXXMethodDecl *CMD); + +// Return true if this is an "normal" assignment operator. +// We assume that a normal assignment operator always returns *this, that is, +// an lvalue reference that is the same type as the implicit object parameter +// (or the LHS for a non-member operator==). +bool isNormalAssignmentOperator(const FunctionDecl *FD); + +/// Returns true if this is an assignment operator where the parameter +/// has the lifetimebound attribute. +bool isAssignmentOperatorLifetimeBound(const CXXMethodDecl *CMD); + +/// Returns true if the implicit object parameter (this) should be considered +/// lifetimebound, either due to an explicit lifetimebound attribute on the +/// method or because it's a normal assignment operator. +bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD); +} // namespace lifetimes +} // namespace clang + +#endif // LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMEANNOTATIONS_H diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety.h index 7e1bfc903083e..512cb76cd6349 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety.h @@ -75,13 +75,14 @@ template struct ID { } }; -template -inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, ID ID) { - return OS << ID.Value; -} - using LoanID = ID; using OriginID = ID; +inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, LoanID ID) { + return OS << ID.Value; +} +inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, OriginID ID) { + return OS << ID.Value; +} // Using LLVM's immutable collections is efficient for dataflow analysis // as it avoids deep copies during state transitions. diff --git a/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h b/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h index d20f172f446e6..ffdfde8b7d453 100644 --- a/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h +++ b/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h @@ -543,10 +543,14 @@ class SExprBuilder { til::BasicBlock *CurrentBB = nullptr; BlockInfo *CurrentBlockInfo = nullptr; + // The closure that captures state required for the lookup; this may be + // mutable, so we have to save/restore before/after recursive lookups. + using LookupLocalVarExprClosure = + std::function; // Recursion guard. llvm::DenseSet VarsBeingTranslated; // Context-dependent lookup of currently valid definitions of local variables. 
- std::function LookupLocalVarExpr; + LookupLocalVarExprClosure LookupLocalVarExpr; }; #ifndef NDEBUG diff --git a/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h b/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h index 78b03d325efd9..6496771ad037e 100644 --- a/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h +++ b/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h @@ -90,7 +90,7 @@ template class CachedConstAccessorsLattice : public Base { return Base::operator==(Other); } - LatticeJoinEffect join(const CachedConstAccessorsLattice &Other); + LatticeEffect join(const CachedConstAccessorsLattice &Other); private: // Maps a record storage location and const method to the value to return @@ -121,13 +121,14 @@ joinConstMethodMap( llvm::SmallDenseMap> &Map2, LatticeEffect &Effect) { + // Intersect the two maps, and note if change was made. llvm::SmallDenseMap> Result; for (auto &[Loc, DeclToT] : Map1) { auto It = Map2.find(Loc); if (It == Map2.end()) { - Effect = LatticeJoinEffect::Changed; + Effect = LatticeEffect::Changed; continue; } const auto &OtherDeclToT = It->second; @@ -135,7 +136,7 @@ joinConstMethodMap( for (auto [Func, Var] : DeclToT) { T *OtherVar = OtherDeclToT.lookup(Func); if (OtherVar == nullptr || OtherVar != Var) { - Effect = LatticeJoinEffect::Changed; + Effect = LatticeEffect::Changed; continue; } JoinedDeclToT.insert({Func, Var}); diff --git a/clang/include/clang/Analysis/PathDiagnostic.h b/clang/include/clang/Analysis/PathDiagnostic.h index 5907df022e449..197920d4cd100 100644 --- a/clang/include/clang/Analysis/PathDiagnostic.h +++ b/clang/include/clang/Analysis/PathDiagnostic.h @@ -885,6 +885,10 @@ class PathDiagnostic : public llvm::FoldingSetNode { return UniqueingDecl; } + /// Get a hash that identifies the issue. + SmallString<32> getIssueHash(const SourceManager &SrcMgr, + const LangOptions &LangOpts) const; + void flattenLocations() { Loc.flatten(); for (const auto &I : pathImpl) diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index 48437c9397570..9aad00b55d64a 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -66,7 +66,8 @@ // The third value provided to the macro specifies information about attributes // of the function. These must be kept in sync with the predicates in the -// Builtin::Context class. Currently we have: +// Builtin::Context class. Note: In the descriptions below, {num} is a +// placeholder for an integer. Currently we have: // n -> nothrow // r -> noreturn // U -> pure @@ -82,23 +83,23 @@ // h -> this function requires a specific header or an explicit declaration. // i -> this is a runtime library implemented function without the // '__builtin_' prefix. It will be implemented in compiler-rt or libgcc. -// p:N: -> this is a printf-like function whose Nth argument is the format -// string. -// P:N: -> similar to the p:N: attribute, but the function is like vprintf -// in that it accepts its arguments as a va_list rather than -// through an ellipsis -// s:N: -> this is a scanf-like function whose Nth argument is the format -// string. -// S:N: -> similar to the s:N: attribute, but the function is like vscanf -// in that it accepts its arguments as a va_list rather than -// through an ellipsis +// p:{num}: -> this is a printf-like function whose {num}th argument is the +// format string. 
+// P:{num}: -> similar to the p:{num}: attribute, but the function is like +// vprintf in that it accepts its arguments as a va_list rather than +// through an ellipsis +// s:{num}: -> this is a scanf-like function whose {num}th argument is the +// format string. +// S:{num}: -> similar to the s:{num}: attribute, but the function is like +// vscanf in that it accepts its arguments as a va_list rather than +// through an ellipsis // e -> const, but only when -fno-math-errno and FP exceptions are ignored // g -> const when FP exceptions are ignored // j -> returns_twice (like setjmp) // u -> arguments are not evaluated for their side-effects -// V:N: -> requires vectors of at least N bits to be legal -// C -> callback behavior: argument N is called with argument -// M_0, ..., M_k as payload +// V:{num}: -> requires vectors of at least {num} bits to be legal +// C<{num},M_0,...,M_k> -> callback behavior: argument {num} is called with +// argument M_0, ..., M_k as payload // z -> this is a function in (possibly-versioned) namespace std // E -> this function can be constant evaluated by Clang frontend // G -> this is a C++20 consteval function diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 35d2c3e19fdf9..468121f7d20ab 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4945,6 +4945,12 @@ def HLSLResourceHandleFromImplicitBinding : LangBuiltin<"HLSL_LANG"> { let Prototype = "void(...)"; } +def HLSLResourceNonUniformIndex : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_resource_nonuniformindex"]; + let Attributes = [NoThrow]; + let Prototype = "uint32_t(uint32_t)"; +} + def HLSLAll : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_all"]; let Attributes = [NoThrow, Const]; @@ -5095,6 +5101,12 @@ def HLSLIsinf : LangBuiltin<"HLSL_LANG"> { let Prototype = "void(...)"; } +def HLSLIsnan : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_elementwise_isnan"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void(...)"; +} + def HLSLLerp : LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_lerp"]; let Attributes = [NoThrow, Const, CustomTypeChecking]; diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 32b5aa5ac1377..3e45c04687a64 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -163,7 +163,7 @@ BUILTIN(__builtin_amdgcn_qsad_pk_u16_u8, "WUiWUiUiWUi", "nc") BUILTIN(__builtin_amdgcn_mqsad_pk_u16_u8, "WUiWUiUiWUi", "nc") BUILTIN(__builtin_amdgcn_mqsad_u32_u8, "V4UiWUiUiV4Ui", "nc") -BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sii", "nc") +BUILTIN(__builtin_amdgcn_make_buffer_rsrc, "Qbv*sWii", "nc") BUILTIN(__builtin_amdgcn_raw_buffer_store_b8, "vUcQbiiIi", "n") BUILTIN(__builtin_amdgcn_raw_buffer_store_b16, "vUsQbiiIi", "n") BUILTIN(__builtin_amdgcn_raw_buffer_store_b32, "vUiQbiiIi", "n") diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 044c755d4d7cf..77e599587edc3 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -93,9 +93,6 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in { } let Features = "sse2" in { - def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">; - def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">; - def packuswb128 : 
X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">; def vec_ext_v2di : X86Builtin<"long long int(_Vector<2, long long int>, _Constant int)">; def vec_ext_v4si : X86Builtin<"int(_Vector<4, int>, _Constant int)">; def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">; @@ -108,6 +105,9 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in { def pavgw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">; def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def pmulhuw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">; + def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">; + def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">; + def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">; } let Features = "sse3" in { @@ -312,7 +312,6 @@ let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">; - def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">; def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">; def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">; def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">; @@ -338,6 +337,7 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVector def pblendvb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">; def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">; + def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">; } let Features = "sse4.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { @@ -571,10 +571,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">; - def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">; - def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">; - def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">; - def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">; def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">; def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; @@ -647,6 +643,10 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">; def insert128i256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">; + def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">; + def packsswb256 : X86Builtin<"_Vector<32, 
char>(_Vector<16, short>, _Vector<16, short>)">; + def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">; + def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">; } let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { @@ -1219,15 +1219,15 @@ let Features = "avx512f", Attributes = [NoThrow, RequiredVectorWidth<512>] in { def scatterdiv16si : X86Builtin<"void(void *, unsigned char, _Vector<8, long long int>, _Vector<8, int>, _Constant int)">; } -let Features = "avx512dq", Attributes = [NoThrow, Const] in { +let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr] in { def knotqi : X86Builtin<"unsigned char(unsigned char)">; } -let Features = "avx512f", Attributes = [NoThrow, Const] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr] in { def knothi : X86Builtin<"unsigned short(unsigned short)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const] in { +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in { def knotsi : X86Builtin<"unsigned int(unsigned int)">; def knotdi : X86Builtin<"unsigned long long int(unsigned long long int)">; } @@ -1308,11 +1308,14 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512> let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">; - def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">; + def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">; +} + +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">; - def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">; + def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">; def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">; - def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">; + def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">; } let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { @@ -3076,51 +3079,51 @@ let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<128 def fpclassss_mask : X86Builtin<"unsigned char(_Vector<4, float>, _Constant int, unsigned char)">; } -let Features = "avx512dq", Attributes = [NoThrow, Const] in { +let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr] in { def kaddqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">; def kaddhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const] in { +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in { def kaddsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">; def kadddi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">; } -let Features = "avx512dq", Attributes = [NoThrow, Const] in { +let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr] in { def kandqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">; } -let Features = "avx512f", Attributes = [NoThrow, Const] in { +let Features = "avx512f", 
Attributes = [NoThrow, Const, Constexpr] in { def kandhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const] in { +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in { def kandsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">; def kanddi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">; } -let Features = "avx512dq", Attributes = [NoThrow, Const] in { +let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr] in { def kandnqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">; } -let Features = "avx512f", Attributes = [NoThrow, Const] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr] in { def kandnhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const] in { +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in { def kandnsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">; def kandndi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">; } -let Features = "avx512dq", Attributes = [NoThrow, Const] in { +let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr] in { def korqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">; } -let Features = "avx512f", Attributes = [NoThrow, Const] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr] in { def korhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const] in { +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in { def korsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">; def kordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">; } @@ -3160,28 +3163,28 @@ let Features = "avx512f", Attributes = [NoThrow, Const] in { def kunpckhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">; } -let Features = "avx512dq", Attributes = [NoThrow, Const] in { +let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr] in { def kxnorqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">; } -let Features = "avx512f", Attributes = [NoThrow, Const] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr] in { def kxnorhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const] in { +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in { def kxnorsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">; def kxnordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">; } -let Features = "avx512dq", Attributes = [NoThrow, Const] in { +let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr] in { def kxorqi : X86Builtin<"unsigned char(unsigned char, unsigned char)">; } -let Features = "avx512f", Attributes = [NoThrow, Const] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr] in { def kxorhi : X86Builtin<"unsigned short(unsigned short, unsigned short)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const] in { +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr] in { def kxorsi : X86Builtin<"unsigned int(unsigned int, unsigned int)">; def kxordi : X86Builtin<"unsigned long long int(unsigned long long int, unsigned long long int)">; } diff --git 
a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td index 0bd8a423c393e..6e50e225a8cc1 100644 --- a/clang/include/clang/Basic/DiagnosticCommonKinds.td +++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td @@ -433,6 +433,12 @@ def err_omp_more_one_clause : Error< "directive '#pragma omp %0' cannot contain more than one '%1' clause%select{| with '%3' name modifier| with 'source' dependence}2">; def err_omp_required_clause : Error< "directive '#pragma omp %0' requires the '%1' clause">; +def warn_omp_gpu_unsupported_clause: Warning< + "clause '%0' is currently not supported on a GPU; clause ignored">, + InGroup; +def warn_omp_gpu_unsupported_modifier_for_clause: Warning< + "modifier '%0' is currently not supported on a GPU for the '%1' clause; modifier ignored">, + InGroup; // Static Analyzer Core def err_unknown_analyzer_checker_or_package : Error< diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index bd896524321d1..b157cbb0b8069 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -35,9 +35,8 @@ defm decomp_decl : CXX17Compat<"decomposition declarations are">; defm inline_variable : CXX17Compat<"inline variables are">; // C++20 compatibility with C++17 and earlier. -defm decomp_decl_spec : CXX20Compat< - "decomposition declaration declared " - "%plural{1:'%1'|:with '%1' specifiers}0 is">; +defm decomp_decl_spec + : CXX20Compat<"decomposition declaration declared '%0' is">; defm constexpr_local_var_no_init : CXX20Compat< "uninitialized variable in a constexpr %select{function|constructor}0 is">; defm constexpr_function_try_block : CXX20Compat< @@ -593,9 +592,8 @@ def warn_modifying_shadowing_decl : // C++ decomposition declarations def err_decomp_decl_context : Error< "decomposition declaration not permitted in this context">; -def err_decomp_decl_spec : Error< - "decomposition declaration cannot be declared " - "%plural{1:'%1'|:with '%1' specifiers}0">; +def err_decomp_decl_spec + : Error<"decomposition declaration cannot be declared '%0'">; def err_decomp_decl_type : Error< "decomposition declaration cannot be declared with type %0; " "declared type must be 'auto' or reference to 'auto'">; @@ -1777,7 +1775,8 @@ def note_unsatisfied_trait "%Empty{empty}|" "%StandardLayout{standard-layout}|" "%Aggregate{aggregate}|" - "%Final{final}" + "%Final{final}|" + "%Abstract{abstract}" "}1">; def note_unsatisfied_trait_reason @@ -1827,7 +1826,12 @@ def note_unsatisfied_trait_reason "%PrivateProtectedDirectDataMember{has a %select{private|protected}1 direct data member}|" "%PrivateProtectedDirectBase{has a %select{private|protected}1 direct base}|" "%NotClassOrUnion{is not a class or union type}|" - "%NotMarkedFinal{is not marked 'final'}" + "%NotMarkedFinal{is not marked 'final'}|" + "%PointerType{is a pointer type}|" + "%ArrayType{is an array type}|" + "%UnionType{is a union type}|" + "%NotStructOrClass{is not a struct or class type}|" + "%OverridesAllPureVirtual{overrides all pure virtual functions from base class %1}" "}0">; def warn_consteval_if_always_true : Warning< @@ -3983,6 +3987,14 @@ def warn_sme_locally_streaming_has_vl_args_returns : Warning< "%select{returning|passing}0 a VL-dependent argument %select{from|to}0 a locally streaming function is undefined" " behaviour when the streaming and non-streaming vector lengths are different at runtime">, InGroup, DefaultIgnore; +def 
warn_sme_streaming_compatible_vl_mismatch : Warning< + "%select{returning|passing}0 a VL-dependent argument %select{from|to}0 a %select{non-streaming|streaming}1" + " function is undefined behaviour when the streaming-compatible caller is%select{| not}1 in streaming" + " mode, because the streaming vector length (%2 bit) and non-streaming vector length (%3 bit) differ">, + InGroup, DefaultIgnore; +def err_sme_streaming_transition_vl_mismatch : Error< + "%select{returning|passing}0 a VL-dependent argument %select{from|to}0 a function with a different" + " streaming-mode is undefined behaviour because the streaming vector length (%1 bit) and non-streaming vector length (%2 bit) differ">; def err_conflicting_attributes_arm_agnostic : Error< "__arm_agnostic(\"sme_za_state\") cannot share ZA state with its caller">; def err_conflicting_attributes_arm_state : Error< @@ -5758,8 +5770,10 @@ def err_template_recursion_depth_exceeded : Error< def err_constraint_depends_on_self : Error<"satisfaction of constraint %0 depends on itself">, NoSFINAE; -def note_template_recursion_depth : Note< - "use -ftemplate-depth=N to increase recursive template instantiation depth">; +def note_template_recursion_depth + : Note<"use -ftemplate-depth=N to increase recursive template " + "instantiation depth">, + NoSFINAE; def err_template_instantiate_within_definition : Error< "%select{implicit|explicit}0 instantiation of template %1 within its" @@ -10448,6 +10462,9 @@ def warn_format_conversion_argument_type_mismatch : Warning< "format specifies type %0 but the argument has " "%select{type|underlying type}2 %1">, InGroup; +def err_format_conversion_argument_type_mismatch : Error< + "format specifies type %0 but the argument has " + "%select{type|underlying type}2 %1">; def warn_format_conversion_argument_type_mismatch_pedantic : Extension< warn_format_conversion_argument_type_mismatch.Summary>, InGroup; @@ -10497,6 +10514,8 @@ def warn_printf_asterisk_missing_arg : Warning< def warn_printf_asterisk_wrong_type : Warning< "field %select{width|precision}0 should have type %1, but argument has type %2">, InGroup; +def err_printf_asterisk_wrong_type : Error< + "field %select{width|precision}0 should have type %1, but argument has type %2">; def warn_printf_nonsensical_optional_amount: Warning< "%select{field width|precision}0 used with '%1' conversion specifier, resulting in undefined behavior">, InGroup; @@ -11744,6 +11763,18 @@ def note_omp_implicit_dsa : Note< "implicitly determined as %0">; def err_omp_loop_var_dsa : Error< "loop iteration variable in the associated loop of 'omp %1' directive may not be %0, predetermined as %2">; +def err_omp_not_a_loop_sequence + : Error<"statement after '#pragma omp %0' must be a loop sequence " + "containing canonical loops or loop-generating constructs">; +def err_omp_empty_loop_sequence + : Error<"loop sequence after '#pragma omp %0' must contain at least 1 " + "canonical loop or loop-generating construct">; +def err_omp_invalid_looprange + : Error<"looprange clause selects loops from %1 to %2 but this exceeds the " + "number of loops (%3) in the loop sequence">; +def warn_omp_redundant_fusion : Warning<"looprange clause selects a single " + "loop, resulting in redundant fusion">, + InGroup; def err_omp_not_for : Error< "%select{statement after '#pragma omp %1' must be a for loop|" "expected %2 for loops after '#pragma omp %1'%select{|, but found only %4}3}0">; diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h index 
115af7b19d6e4..ed89a31e2684b 100644 --- a/clang/include/clang/Basic/OpenMPKinds.h +++ b/clang/include/clang/Basic/OpenMPKinds.h @@ -312,6 +312,14 @@ bool isOpenMPTargetExecutionDirective(OpenMPDirectiveKind DKind); /// otherwise - false. bool isOpenMPTargetDataManagementDirective(OpenMPDirectiveKind DKind); +/// Checks if the specified directive is a map-entering target directive. +/// \param DKind Specified directive. +/// \return true - the directive is a map-entering target directive like +/// 'omp target', 'omp target data', 'omp target enter data', +/// 'omp target parallel', etc. (excludes 'omp target exit data', 'omp target +/// update') otherwise - false. +bool isOpenMPTargetMapEnteringDirective(OpenMPDirectiveKind DKind); + /// Checks if the specified composite/combined directive constitutes a teams /// directive in the outermost nest. For example /// 'omp teams distribute' or 'omp teams distribute parallel for'. @@ -383,6 +391,13 @@ bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind); bool isOpenMPCanonicalLoopNestTransformationDirective( OpenMPDirectiveKind DKind); +/// Checks if the specified directive is a loop transformation directive that +/// applies to a canonical loop sequence. +/// \param DKind Specified directive. +/// \return True iff the directive is a loop transformation. +bool isOpenMPCanonicalLoopSequenceTransformationDirective( + OpenMPDirectiveKind DKind); + /// Checks if the specified directive is a loop transformation directive. /// \param DKind Specified directive. /// \return True iff the directive is a loop transformation. diff --git a/clang/include/clang/Basic/Sarif.h b/clang/include/clang/Basic/Sarif.h index e6c46224b316d..a88d1ee2965a9 100644 --- a/clang/include/clang/Basic/Sarif.h +++ b/clang/include/clang/Basic/Sarif.h @@ -322,6 +322,8 @@ class SarifResult { uint32_t RuleIdx; std::string RuleId; std::string DiagnosticMessage; + std::string HostedViewerURI; + llvm::SmallDenseMap PartialFingerprints; llvm::SmallVector Locations; llvm::SmallVector ThreadFlows; std::optional LevelOverride; @@ -347,6 +349,11 @@ class SarifResult { return *this; } + SarifResult setHostedViewerURI(llvm::StringRef URI) { + HostedViewerURI = URI.str(); + return *this; + } + SarifResult setLocations(llvm::ArrayRef DiagLocs) { #ifndef NDEBUG for (const auto &Loc : DiagLocs) { @@ -366,6 +373,12 @@ class SarifResult { LevelOverride = TheLevel; return *this; } + + SarifResult addPartialFingerprint(llvm::StringRef key, + llvm::StringRef value) { + PartialFingerprints[key] = value; + return *this; + } }; /// This class handles creating a valid SARIF document given various input @@ -475,6 +488,8 @@ class SarifDocumentWriter { /// reported diagnostics, resulting in an expensive call. llvm::json::Object createDocument(); + static std::string fileNameToURI(llvm::StringRef Filename); + private: /// Source Manager to use for the current SARIF document. 
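The SarifResult additions above (hosted viewer URI and partial fingerprints) compose with the existing fluent setters. A rough sketch, with the rule index, message, and fingerprint values invented for illustration and assuming the SarifResult API from this hunk:

clang::SarifResult Result =
    clang::SarifResult::create(/*RuleIdx=*/0)
        .setDiagnosticMessage("use of uninitialized value")
        .setHostedViewerURI("https://sarif-viewer.example/issue/42")
        .addPartialFingerprint("issueHash/v1", "0123abcd");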
const SourceManager &SourceMgr; diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td index dd1a24405fae7..bf3686bb372d5 100644 --- a/clang/include/clang/Basic/StmtNodes.td +++ b/clang/include/clang/Basic/StmtNodes.td @@ -238,6 +238,10 @@ def OMPUnrollDirective : StmtNode; def OMPReverseDirective : StmtNode; def OMPInterchangeDirective : StmtNode; +def OMPCanonicalLoopSequenceTransformationDirective + : StmtNode; +def OMPFuseDirective + : StmtNode; def OMPForDirective : StmtNode; def OMPForSimdDirective : StmtNode; def OMPSectionsDirective : StmtNode; diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index e5c5ada3b0858..ceb16174e13e7 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -1259,6 +1259,10 @@ class TargetInfo : public TransferrableTargetInfo, ArrayRef OutputConstraints, unsigned &Index) const; + std::string + simplifyConstraint(StringRef Constraint, + SmallVectorImpl *OutCons = nullptr) const; + // Constraint parm will be left pointing at the last character of // the constraint. In practice, it won't be changed unless the // constraint is longer than one character. diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index f1e24a5215dc8..bb394440bf8d8 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -2326,6 +2326,10 @@ def CIR_FuncOp : CIR_Op<"func", [ The function linkage information is specified by `linkage`, as defined by `GlobalLinkageKind` attribute. + The `lambda` keyword indicates that the function implements a C++ lambda's + `operator()`; this allows callsites to make certain assumptions about the + underlying function when writing analyses. + The `no_proto` keyword is used to identify functions that were declared without a prototype and, consequently, may contain calls with invalid arguments and undefined behavior. @@ -2348,6 +2352,7 @@ def CIR_FuncOp : CIR_Op<"func", [ let arguments = (ins SymbolNameAttr:$sym_name, CIR_VisibilityAttr:$global_visibility, TypeAttrOf:$function_type, + UnitAttr:$lambda, UnitAttr:$no_proto, UnitAttr:$dso_local, DefaultValuedAttr; -def fopenmp_assume_threads_oversubscription : Flag<["-"], "fopenmp-assume-threads-oversubscription">; -def fno_openmp_assume_teams_oversubscription : Flag<["-"], "fno-openmp-assume-teams-oversubscription">; -def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume-threads-oversubscription">; +def fopenmp_assume_teams_oversubscription : Flag<["-"], "fopenmp-assume-teams-oversubscription">, + HelpText<"Allow the optimizer to discretely increase the number of " + "teams. May cause environment variables that set " + "the number of teams to be ignored. The combination of " + "-fopenmp-assume-teams-oversubscription " + "and -fopenmp-assume-threads-oversubscription " + "may allow the conversion of loops into sequential code by " + "ensuring that each team/thread executes at most one iteration.">; +def fopenmp_assume_threads_oversubscription : Flag<["-"], "fopenmp-assume-threads-oversubscription">, + HelpText<"Allow the optimizer to discretely increase the number of " + "threads. May cause environment variables that set " + "the number of threads to be ignored.
The combination of " + "-fopenmp-assume-teams-oversubscription " + "and -fopenmp-assume-threads-oversubscription " + "may allow the conversion of loops into sequential code by " + "ensuring that each team/thread executes at most one iteration.">; +def fno_openmp_assume_teams_oversubscription : Flag<["-"], "fno-openmp-assume-teams-oversubscription">, + HelpText<"Do not assume teams oversubscription.">; +def fno_openmp_assume_threads_oversubscription : Flag<["-"], "fno-openmp-assume-threads-oversubscription">, + HelpText<"Do not assume threads oversubscription.">; def fopenmp_assume_no_thread_state : Flag<["-"], "fopenmp-assume-no-thread-state">, HelpText<"Assert no thread in a parallel region modifies an ICV">, MarshallingInfoFlag>; @@ -4587,7 +4603,7 @@ defm ptrauth_block_descriptor_pointers : OptInCC1FFlag<"ptrauth-block-descriptor def fenable_matrix : Flag<["-"], "fenable-matrix">, Group, Visibility<[ClangOption, CC1Option]>, HelpText<"Enable matrix data type and related builtin functions">, - MarshallingInfoFlag>; + MarshallingInfoFlag, hlsl.KeyPath>; defm raw_string_literals : BoolFOption<"raw-string-literals", LangOpts<"RawStringLiterals">, Default, @@ -4754,13 +4770,13 @@ defm column_info : BoolOption<"g", "column-info", PosFlag, BothFlags<[], [ClangOption, CLOption, DXCOption]>>, Group; def gsplit_dwarf : Flag<["-"], "gsplit-dwarf">, Group, - Visibility<[ClangOption, CLOption, DXCOption]>; + Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>; def gsplit_dwarf_EQ : Joined<["-"], "gsplit-dwarf=">, Group, - Visibility<[ClangOption, CLOption, DXCOption]>, + Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>, HelpText<"Set DWARF fission mode">, Values<"split,single">; def gno_split_dwarf : Flag<["-"], "gno-split-dwarf">, Group, - Visibility<[ClangOption, CLOption, DXCOption]>; + Visibility<[ClangOption, CLOption, DXCOption, FlangOption]>; def gtemplate_alias : Flag<["-"], "gtemplate-alias">, Group, Visibility<[ClangOption, CC1Option]>; def gno_template_alias : Flag<["-"], "gno-template-alias">, Group, Visibility<[ClangOption]>; def gsimple_template_names : Flag<["-"], "gsimple-template-names">, Group; @@ -8405,7 +8421,7 @@ def main_file_name : Separate<["-"], "main-file-name">, MarshallingInfoString>; def split_dwarf_output : Separate<["-"], "split-dwarf-output">, HelpText<"File name to use for split dwarf debug info output">, - Visibility<[CC1Option, CC1AsOption]>, + Visibility<[CC1Option, CC1AsOption, FC1Option]>, MarshallingInfoString>; let Visibility = [CC1Option, FC1Option] in { @@ -8437,6 +8453,10 @@ def dependent_lib : Joined<["--"], "dependent-lib=">, HelpText<"Add dependent library">, MarshallingInfoStringVector>; +def split_dwarf_file : Separate<["-"], "split-dwarf-file">, + HelpText<"Name of the split dwarf debug info file to encode in the object file">, + MarshallingInfoString>; + } // let Visibility = [CC1Option, FC1Option] let Visibility = [CC1Option] in { @@ -8447,9 +8467,6 @@ def fblocks_runtime_optional : Flag<["-"], "fblocks-runtime-optional">, def fexternc_nounwind : Flag<["-"], "fexternc-nounwind">, HelpText<"Assume all functions with C linkage do not unwind">, MarshallingInfoFlag>; -def split_dwarf_file : Separate<["-"], "split-dwarf-file">, - HelpText<"Name of the split dwarf debug info file to encode in the object file">, - MarshallingInfoString>; def fno_wchar : Flag<["-"], "fno-wchar">, HelpText<"Disable C++ builtin type wchar_t">, MarshallingInfoNegativeFlag, cplusplus.KeyPath>, diff --git a/clang/include/clang/Frontend/Utils.h 
b/clang/include/clang/Frontend/Utils.h index f86c2f5074de0..49fd920d1ec43 100644 --- a/clang/include/clang/Frontend/Utils.h +++ b/clang/include/clang/Frontend/Utils.h @@ -143,8 +143,9 @@ class ModuleDependencyCollector : public DependencyCollector { std::error_code copyToRoot(StringRef Src, StringRef Dst = {}); public: - ModuleDependencyCollector(std::string DestDir) - : DestDir(std::move(DestDir)) {} + ModuleDependencyCollector(std::string DestDir, + IntrusiveRefCntPtr VFS) + : DestDir(std::move(DestDir)), Canonicalizer(std::move(VFS)) {} ~ModuleDependencyCollector() override { writeFileMap(); } StringRef getDest() { return DestDir; } diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 30edd303e1824..e301cf1080977 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -6767,6 +6767,9 @@ class Parser : public CodeCompletionHandler { OpenMPClauseKind Kind, bool ParseOnly); + /// Parses the 'looprange' clause of a '#pragma omp fuse' directive. + OMPClause *ParseOpenMPLoopRangeClause(); + /// Parses the 'sizes' clause of a '#pragma omp tile' directive. OMPClause *ParseOpenMPSizesClause(); diff --git a/clang/include/clang/Sema/HLSLExternalSemaSource.h b/clang/include/clang/Sema/HLSLExternalSemaSource.h index d93fb8c8eef6b..049fc7b8fe3f2 100644 --- a/clang/include/clang/Sema/HLSLExternalSemaSource.h +++ b/clang/include/clang/Sema/HLSLExternalSemaSource.h @@ -44,6 +44,7 @@ class HLSLExternalSemaSource : public ExternalSemaSource { private: void defineTrivialHLSLTypes(); void defineHLSLVectorAlias(); + void defineHLSLMatrixAlias(); void defineHLSLTypesWithForwardDeclarations(); void onCompletion(CXXRecordDecl *Record, CompletionFunction Fn); }; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index d017d1f829015..f53aafdeb4f36 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -503,7 +503,6 @@ enum class FormatStringType { FreeBSDKPrintf, OSTrace, OSLog, - Syslog, Unknown }; @@ -11715,6 +11714,23 @@ class Sema final : public SemaBase { const TemplateArgumentListInfo *TemplateArgs, bool IsAddressOfOperand); + UnsignedOrNone getPackIndex(TemplateArgument Pack) const { + return Pack.pack_size() - 1 - *ArgPackSubstIndex; + } + + TemplateArgument + getPackSubstitutedTemplateArgument(TemplateArgument Arg) const { + Arg = Arg.pack_elements()[*ArgPackSubstIndex]; + if (Arg.isPackExpansion()) + Arg = Arg.getPackExpansionPattern(); + return Arg; + } + + ExprResult BuildSubstNonTypeTemplateParmExpr( + Decl *AssociatedDecl, const NonTypeTemplateParmDecl *NTTP, + SourceLocation loc, TemplateArgument Replacement, + UnsignedOrNone PackIndex, bool Final); + /// Form a template name from a name that is syntactically required to name a /// template, either due to use of the 'template' keyword or because a name in /// this syntactic context is assumed to name a template (C++ @@ -13319,8 +13335,6 @@ class Sema final : public SemaBase { Sema &SemaRef; bool Invalid; bool AlreadyInstantiating; - bool CheckInstantiationDepth(SourceLocation PointOfInstantiation, - SourceRange InstantiationRange); InstantiatingTemplate(Sema &SemaRef, CodeSynthesisContext::SynthesisKind Kind, @@ -13513,7 +13527,7 @@ class Sema final : public SemaBase { ~ArgPackSubstIndexRAII() { Self.ArgPackSubstIndex = OldSubstIndex; } }; - void pushCodeSynthesisContext(CodeSynthesisContext Ctx); + bool pushCodeSynthesisContext(CodeSynthesisContext Ctx); void popCodeSynthesisContext(); void 
PrintContextStack(InstantiationContextDiagFuncRef DiagFunc) { diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h index c0fd7a6d63611..daf58b18a03cb 100644 --- a/clang/include/clang/Sema/SemaOpenMP.h +++ b/clang/include/clang/Sema/SemaOpenMP.h @@ -463,6 +463,13 @@ class SemaOpenMP : public SemaBase { Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc); + + /// Called on well-formed '#pragma omp fuse' after parsing of its + /// clauses and the associated statement. + StmtResult ActOnOpenMPFuseDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp for' after parsing /// of the associated statement. StmtResult @@ -921,6 +928,12 @@ class SemaOpenMP : public SemaBase { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc); + + /// Called on well-form 'looprange' clause after parsing its arguments. + OMPClause * + ActOnOpenMPLoopRangeClause(Expr *First, Expr *Count, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation FirstLoc, + SourceLocation CountLoc, SourceLocation EndLoc); /// Called on well-formed 'ordered' clause. OMPClause * ActOnOpenMPOrderedClause(SourceLocation StartLoc, SourceLocation EndLoc, @@ -1485,7 +1498,81 @@ class SemaOpenMP : public SemaBase { bool checkTransformableLoopNest( OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops, SmallVectorImpl &LoopHelpers, - Stmt *&Body, SmallVectorImpl> &OriginalInits); + Stmt *&Body, SmallVectorImpl> &OriginalInits); + + /// Holds the result of the analysis of a (possibly canonical) loop. + struct LoopAnalysis { + /// The analyzed loop or loop transformation. + Stmt *AStmt = nullptr; + /// Loop analyses results. + OMPLoopBasedDirective::HelperExprs HelperExprs; + /// The for-statement of the loop. TheForStmt equals AStmt only when the + /// latter is a canonical loop (i.e. not a loop transformation). + Stmt *TheForStmt = nullptr; + /// Initialization statements before transformations. + SmallVector OriginalInits; + /// Initialization statements required after transformation of this loop. + SmallVector TransformsPreInits; + + explicit LoopAnalysis(Stmt *S) : AStmt(S) {} + + bool isRegularLoop() const { return isRegularLoop(AStmt); } + bool isLoopTransformation() const { return isLoopTransformation(AStmt); } + + // Convenience functions used when building LoopSequenceAnalysis. + static bool isRegularLoop(Stmt *S) { + return isa(S); + } + static bool isLoopTransformation(Stmt *S) { + return isa(S); + } + }; + + /// Holds the result of the analysis of a (possibly canonical) loop sequence. + struct LoopSequenceAnalysis { + /// Number of top level canonical loops. + unsigned LoopSeqSize = 0; + /// For each loop results of the analysis. + SmallVector Loops; + /// Additional code required before entering the transformed loop sequence. + SmallVector LoopSequencePreInits; + + // Convenience function used when building the LoopSequenceAnalysis. + static bool isLoopSequenceDerivation(Stmt *S) { + return LoopAnalysis::isRegularLoop(S) || + LoopAnalysis::isLoopTransformation(S); + } + }; + + /// The main recursive process of `checkTransformableLoopSequence` that + /// performs grammatical parsing of a canonical loop sequence. It extracts + /// key information, such as the number of top-level loops, loop statements, + /// helper expressions, and other relevant loop-related data, all in a single + /// execution to avoid redundant traversals. 
This analysis flattens inner + /// Loop Sequences + /// + /// \param LoopSeqStmt The AST of the original statement. + /// \param SeqAnalysis [out] Result of the analysis of \p LoopSeqStmt + /// \param Context + /// \param Kind The loop transformation directive kind. + /// \return Whether the original statement is both syntactically and + /// semantically correct according to OpenMP 6.0 canonical loop + /// sequence definition. + bool analyzeLoopSequence(Stmt *LoopSeqStmt, LoopSequenceAnalysis &SeqAnalysis, + ASTContext &Context, OpenMPDirectiveKind Kind); + + /// Validates and checks whether a loop sequence can be transformed according + /// to the given directive, providing necessary setup and initialization + /// (Driver function) before recursion using `analyzeLoopSequence`. + /// + /// \param Kind The loop transformation directive kind. + /// \param AStmt The AST of the original statement + /// \param SeqAnalysis [out] Result of the analysis of \p LoopSeqStmt + /// \param Context + /// \return Whether there was an absence of errors or not + bool checkTransformableLoopSequence(OpenMPDirectiveKind Kind, Stmt *AStmt, + LoopSequenceAnalysis &SeqAnalysis, + ASTContext &Context); /// Helper to keep information about the current `omp begin/end declare /// variant` nesting. diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 441047d64f48c..99864c7373908 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1951,6 +1951,7 @@ enum StmtCode { STMT_OMP_UNROLL_DIRECTIVE, STMT_OMP_REVERSE_DIRECTIVE, STMT_OMP_INTERCHANGE_DIRECTIVE, + STMT_OMP_FUSE_DIRECTIVE, STMT_OMP_FOR_DIRECTIVE, STMT_OMP_FOR_SIMD_DIRECTIVE, STMT_OMP_SECTIONS_DIRECTIVE, diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h index a6cb6c0f12a8c..7f25223d232cf 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h @@ -455,6 +455,20 @@ class SMTConv { QualType OperandTy; llvm::SMTExprRef OperandExp = getSymExpr(Solver, Ctx, USE->getOperand(), &OperandTy, hasComparison); + + if (const BinarySymExpr *BSE = + dyn_cast(USE->getOperand())) { + if (USE->getOpcode() == UO_Minus && + BinaryOperator::isComparisonOp(BSE->getOpcode())) + // The comparison operator yields a boolean value in the Z3 + // language and applying the unary minus operator on a boolean + // crashes Z3. However, the unary minus does nothing in this + // context (a number is truthy if and only if its negative is + // truthy), so let's just ignore the unary minus. + // TODO: Replace this with a more general solution. + return OperandExp; + } + llvm::SMTExprRef UnaryExp = OperandTy->isRealFloatingType() ? 
fromFloatUnOp(Solver, USE->getOpcode(), OperandExp) diff --git a/clang/include/clang/Tooling/Refactoring/RefactoringOptionVisitor.h b/clang/include/clang/Tooling/Refactoring/RefactoringOptionVisitor.h index 3234b0976a8e7..ed2aa55c99279 100644 --- a/clang/include/clang/Tooling/Refactoring/RefactoringOptionVisitor.h +++ b/clang/include/clang/Tooling/Refactoring/RefactoringOptionVisitor.h @@ -37,11 +37,11 @@ namespace internal { template struct HasHandle { private: template - static auto check(ClassT *) -> typename std::is_same< - decltype(std::declval().visit( - std::declval(), - *std::declval *>())), - void>::type; + static auto check(ClassT *) + -> std::is_same().visit( + std::declval(), + *std::declval *>())), + void>; template static std::false_type check(...); diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 97c59b2ceec2f..61dd330553860 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -4567,6 +4567,10 @@ QualType ASTContext::getWebAssemblyExternrefType() const { /// type. QualType ASTContext::getScalableVectorType(QualType EltTy, unsigned NumElts, unsigned NumFields) const { + auto K = llvm::ScalableVecTyKey{EltTy, NumElts, NumFields}; + if (auto It = ScalableVecTyMap.find(K); It != ScalableVecTyMap.end()) + return It->second; + if (Target->hasAArch64ACLETypes()) { uint64_t EltTySize = getTypeSize(EltTy); @@ -4575,29 +4579,29 @@ QualType ASTContext::getScalableVectorType(QualType EltTy, unsigned NumElts, if (EltTy->hasIntegerRepresentation() && !EltTy->isBooleanType() && \ EltTy->hasSignedIntegerRepresentation() == IsSigned && \ EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \ - return SingletonId; \ + return ScalableVecTyMap[K] = SingletonId; \ } #define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, \ ElBits, NF) \ if (EltTy->hasFloatingRepresentation() && !EltTy->isBFloat16Type() && \ EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \ - return SingletonId; \ + return ScalableVecTyMap[K] = SingletonId; \ } #define SVE_VECTOR_TYPE_BFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ ElBits, NF) \ if (EltTy->hasFloatingRepresentation() && EltTy->isBFloat16Type() && \ EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \ - return SingletonId; \ + return ScalableVecTyMap[K] = SingletonId; \ } #define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ ElBits, NF) \ if (EltTy->isMFloat8Type() && EltTySize == ElBits && \ NumElts == (NumEls * NF) && NumFields == 1) { \ - return SingletonId; \ + return ScalableVecTyMap[K] = SingletonId; \ } #define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \ if (EltTy->isBooleanType() && NumElts == (NumEls * NF) && NumFields == 1) \ - return SingletonId; + return ScalableVecTyMap[K] = SingletonId; #include "clang/Basic/AArch64ACLETypes.def" } else if (Target->hasRISCVVTypes()) { uint64_t EltTySize = getTypeSize(EltTy); @@ -4611,10 +4615,10 @@ QualType ASTContext::getScalableVectorType(QualType EltTy, unsigned NumElts, (EltTy->hasFloatingRepresentation() && EltTy->isBFloat16Type() && \ IsBF && !IsFP)) && \ EltTySize == ElBits && NumElts == NumEls && NumFields == NF) \ - return SingletonId; + return ScalableVecTyMap[K] = SingletonId; #define RVV_PREDICATE_TYPE(Name, Id, SingletonId, NumEls) \ if (EltTy->isBooleanType() && NumElts == NumEls) \ - return SingletonId; + return ScalableVecTyMap[K] = SingletonId; #include "clang/Basic/RISCVVTypes.def" } return QualType(); @@ -5869,8 
+5873,14 @@ ASTContext::getSubstBuiltinTemplatePack(const TemplateArgument &ArgPack) { QualType Canon; TemplateArgument CanonArgPack = getCanonicalTemplateArgument(ArgPack); - if (!CanonArgPack.structurallyEquals(ArgPack)) + if (!CanonArgPack.structurallyEquals(ArgPack)) { Canon = getSubstBuiltinTemplatePack(CanonArgPack); + // Refresh InsertPos, in case the recursive call above caused rehashing, + // which would invalidate the bucket pointer. + [[maybe_unused]] const auto *Nothing = + SubstBuiltinTemplatePackTypes.FindNodeOrInsertPos(ID, InsertPos); + assert(!Nothing); + } auto *PackType = new (*this, alignof(SubstBuiltinTemplatePackType)) SubstBuiltinTemplatePackType(Canon, ArgPack); diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index fafec47f7de3c..0b7b6cd64dd97 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -1787,7 +1787,12 @@ bool Compiler::VisitArraySubscriptExpr(const ArraySubscriptExpr *E) { return false; if (DiscardResult) return this->emitPopPtr(E); - return true; + + if (E->isGLValue()) + return true; + + OptPrimType T = classifyPrim(E); + return this->emitLoadPop(*T, E); } template @@ -2378,13 +2383,8 @@ bool Compiler::VisitMemberExpr(const MemberExpr *E) { return this->visitDeclRef(Member, E); } - if (Initializing) { - if (!this->delegate(Base)) - return false; - } else { - if (!this->visit(Base)) - return false; - } + if (!this->visit(Base)) + return false; // Base above gives us a pointer on the stack. const auto *FD = cast(Member); @@ -2934,8 +2934,9 @@ bool Compiler::VisitMaterializeTemporaryExpr( // For everyhing else, use local variables. if (SubExprT) { bool IsConst = SubExpr->getType().isConstQualified(); - unsigned LocalIndex = - allocateLocalPrimitive(E, *SubExprT, IsConst, E->getExtendingDecl()); + bool IsVolatile = SubExpr->getType().isVolatileQualified(); + unsigned LocalIndex = allocateLocalPrimitive( + E, *SubExprT, IsConst, IsVolatile, E->getExtendingDecl()); if (!this->visit(SubExpr)) return false; if (!this->emitSetLocal(*SubExprT, LocalIndex, E)) @@ -4452,6 +4453,9 @@ bool Compiler::visitAssignment(const Expr *LHS, const Expr *RHS, if (!this->visit(LHS)) return false; + if (LHS->getType().isVolatileQualified()) + return this->emitInvalidStore(LHS->getType().getTypePtr(), E); + // We don't support assignments in C. if (!Ctx.getLangOpts().CPlusPlus && !this->emitInvalid(E)) return false; @@ -4560,13 +4564,14 @@ bool Compiler::emitConst(const APSInt &Value, const Expr *E) { template unsigned Compiler::allocateLocalPrimitive( - DeclTy &&Src, PrimType Ty, bool IsConst, const ValueDecl *ExtendingDecl, - ScopeKind SC, bool IsConstexprUnknown) { + DeclTy &&Src, PrimType Ty, bool IsConst, bool IsVolatile, + const ValueDecl *ExtendingDecl, ScopeKind SC, bool IsConstexprUnknown) { // FIXME: There are cases where Src.is() is wrong, e.g. // (int){12} in C. Consider using Expr::isTemporaryObject() instead // or isa(). 
Descriptor *D = P.createDescriptor(Src, Ty, nullptr, Descriptor::InlineDescMD, - IsConst, isa(Src)); + IsConst, isa(Src), + /*IsMutable=*/false, IsVolatile); D->IsConstexprUnknown = IsConstexprUnknown; Scope::Local Local = this->createLocal(D); if (auto *VD = dyn_cast_if_present(Src.dyn_cast())) @@ -4874,7 +4879,8 @@ Compiler::visitVarDecl(const VarDecl *VD, const Expr *Init, if (VarT) { unsigned Offset = this->allocateLocalPrimitive( - VD, *VarT, VD->getType().isConstQualified(), nullptr, ScopeKind::Block, + VD, *VarT, VD->getType().isConstQualified(), + VD->getType().isVolatileQualified(), nullptr, ScopeKind::Block, IsConstexprUnknown); if (Init) { // If this is a toplevel declaration, create a scope for the diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index 09599b3547888..5c46f75af4da3 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -327,6 +327,7 @@ class Compiler : public ConstStmtVisitor, bool>, /// Creates a local primitive value. unsigned allocateLocalPrimitive(DeclTy &&Decl, PrimType Ty, bool IsConst, + bool IsVolatile = false, const ValueDecl *ExtendingDecl = nullptr, ScopeKind SC = ScopeKind::Block, bool IsConstexprUnknown = false); diff --git a/clang/lib/AST/ByteCode/Context.cpp b/clang/lib/AST/ByteCode/Context.cpp index cfda6e8ded760..683e916391337 100644 --- a/clang/lib/AST/ByteCode/Context.cpp +++ b/clang/lib/AST/ByteCode/Context.cpp @@ -18,6 +18,7 @@ #include "clang/AST/ASTLambda.h" #include "clang/AST/Expr.h" #include "clang/Basic/TargetInfo.h" +#include "llvm/Support/SystemZ/zOSSupport.h" using namespace clang; using namespace clang::interp; @@ -236,6 +237,52 @@ bool Context::evaluateCharRange(State &Parent, const Expr *SizeExpr, return evaluateStringRepr(Parent, SizeExpr, PtrExpr, Result); } +bool Context::evaluateString(State &Parent, const Expr *E, + std::string &Result) { + assert(Stk.empty()); + Compiler C(*this, *P, Parent, Stk); + + auto PtrRes = C.interpretAsPointer(E, [&](const Pointer &Ptr) { + const Descriptor *FieldDesc = Ptr.getFieldDesc(); + if (!FieldDesc->isPrimitiveArray()) + return false; + + if (!Ptr.isConst()) + return false; + + unsigned N = Ptr.getNumElems(); + + if (Ptr.elemSize() == 1 /* bytes */) { + const char *Chars = reinterpret_cast(Ptr.getRawAddress()); + unsigned Length = strnlen(Chars, N); + // Wasn't null terminated. + if (N == Length) + return false; + Result.assign(Chars, Length); + return true; + } + + PrimType ElemT = FieldDesc->getPrimType(); + for (unsigned I = Ptr.getIndex(); I != N; ++I) { + INT_TYPE_SWITCH(ElemT, { + auto Elem = Ptr.elem(I); + if (Elem.isZero()) + return true; + Result.push_back(static_cast(Elem)); + }); + } + // We didn't find a 0 byte. + return false; + }); + + if (PtrRes.isInvalid()) { + C.cleanup(); + Stk.clear(); + return false; + } + return true; +} + bool Context::evaluateStrlen(State &Parent, const Expr *E, uint64_t &Result) { assert(Stk.empty()); Compiler C(*this, *P, Parent, Stk); @@ -245,6 +292,9 @@ bool Context::evaluateStrlen(State &Parent, const Expr *E, uint64_t &Result) { if (!FieldDesc->isPrimitiveArray()) return false; + if (Ptr.isDummy() || Ptr.isUnknownSizeArray()) + return false; + unsigned N = Ptr.getNumElems(); if (Ptr.elemSize() == 1) { Result = strnlen(reinterpret_cast(Ptr.getRawAddress()), N); @@ -517,9 +567,15 @@ const Function *Context::getOrCreateFunction(const FunctionDecl *FuncDecl) { // Assign descriptors to all parameters. // Composite objects are lowered to pointers. 
for (const ParmVarDecl *PD : FuncDecl->parameters()) { + bool IsConst = PD->getType().isConstQualified(); + bool IsVolatile = PD->getType().isVolatileQualified(); + OptPrimType T = classify(PD->getType()); PrimType PT = T.value_or(PT_Ptr); - Descriptor *Desc = P->createDescriptor(PD, PT); + Descriptor *Desc = P->createDescriptor(PD, PT, nullptr, std::nullopt, + IsConst, /*IsTemporary=*/false, + /*IsMutable=*/false, IsVolatile); + ParamDescriptors.insert({ParamOffset, {PT, Desc}}); ParamOffsets.push_back(ParamOffset); ParamOffset += align(primSize(PT)); @@ -545,9 +601,14 @@ const Function *Context::getOrCreateObjCBlock(const BlockExpr *E) { // Assign descriptors to all parameters. // Composite objects are lowered to pointers. for (const ParmVarDecl *PD : BD->parameters()) { + bool IsConst = PD->getType().isConstQualified(); + bool IsVolatile = PD->getType().isVolatileQualified(); + OptPrimType T = classify(PD->getType()); PrimType PT = T.value_or(PT_Ptr); - Descriptor *Desc = P->createDescriptor(PD, PT); + Descriptor *Desc = P->createDescriptor(PD, PT, nullptr, std::nullopt, + IsConst, /*IsTemporary=*/false, + /*IsMutable=*/false, IsVolatile); ParamDescriptors.insert({ParamOffset, {PT, Desc}}); ParamOffsets.push_back(ParamOffset); ParamOffset += align(primSize(PT)); diff --git a/clang/lib/AST/ByteCode/Context.h b/clang/lib/AST/ByteCode/Context.h index 280a31725555f..f5fa977cbcad8 100644 --- a/clang/lib/AST/ByteCode/Context.h +++ b/clang/lib/AST/ByteCode/Context.h @@ -67,6 +67,10 @@ class Context final { bool evaluateCharRange(State &Parent, const Expr *SizeExpr, const Expr *PtrExpr, std::string &Result); + /// Evaluate \param E and if it can be evaluated to a null-terminated string, + /// copy the result into \param Result. + bool evaluateString(State &Parent, const Expr *E, std::string &Result); + /// Evalute \param E and if it can be evaluated to a string literal, /// run strlen() on it. bool evaluateStrlen(State &Parent, const Expr *E, uint64_t &Result); diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp index ab3b9f7c3b1d7..fd0903f2e652c 100644 --- a/clang/lib/AST/ByteCode/Disasm.cpp +++ b/clang/lib/AST/ByteCode/Disasm.cpp @@ -323,6 +323,8 @@ LLVM_DUMP_METHOD void Program::dump(llvm::raw_ostream &OS) const { : TerminalColor{llvm::raw_ostream::RED, false}); OS << (GP.isInitialized() ? "initialized " : "uninitialized "); } + if (GP.block()->isDummy()) + OS << "dummy "; Desc->dump(OS); if (GP.isInitialized() && Desc->IsTemporary) { diff --git a/clang/lib/AST/ByteCode/EvalEmitter.h b/clang/lib/AST/ByteCode/EvalEmitter.h index e81ea67adf97a..a9f87db5d7f8d 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.h +++ b/clang/lib/AST/ByteCode/EvalEmitter.h @@ -16,6 +16,7 @@ #include "EvaluationResult.h" #include "InterpState.h" #include "PrimType.h" +#include "Record.h" #include "Source.h" namespace clang { diff --git a/clang/lib/AST/ByteCode/EvaluationResult.cpp b/clang/lib/AST/ByteCode/EvaluationResult.cpp index ba818788d7026..7c3c21cf28251 100644 --- a/clang/lib/AST/ByteCode/EvaluationResult.cpp +++ b/clang/lib/AST/ByteCode/EvaluationResult.cpp @@ -133,6 +133,8 @@ bool EvaluationResult::checkFullyInitialized(InterpState &S, if (Ptr.isZero()) return true; + if (!Ptr.isBlockPointer()) + return true; // We can't inspect dead pointers at all. Return true here so we can // diagnose them later. 
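Note: both getOrCreateFunction and getOrCreateObjCBlock now repeat the same qualifier propagation when building parameter descriptors. A minimal sketch of how that pattern reads as a shared helper — the helper name is hypothetical and not part of the patch; the createDescriptor arguments mirror the two call sites above:

// Hypothetical helper (not in the patch): builds a parameter descriptor that
// carries the const/volatile qualifiers of the ParmVarDecl, exactly as the
// two call sites above now do inline.
static Descriptor *createParamDescriptor(Program &P, const ParmVarDecl *PD,
                                         PrimType PT) {
  QualType QT = PD->getType();
  return P.createDescriptor(PD, PT, nullptr, std::nullopt,
                            /*IsConst=*/QT.isConstQualified(),
                            /*IsTemporary=*/false,
                            /*IsMutable=*/false,
                            /*IsVolatile=*/QT.isVolatileQualified());
}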
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 0f322f6ed42ac..21af3d6ac7f90 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -889,6 +889,8 @@ bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { return false; if (!CheckConst(S, OpPC, Ptr)) return false; + if (!CheckVolatile(S, OpPC, Ptr, AK_Assign)) + return false; if (!S.inConstantContext() && isConstexprUnknown(Ptr)) return false; return true; @@ -1027,8 +1029,8 @@ static bool CheckCallDepth(InterpState &S, CodePtr OpPC) { return true; } -bool CheckThis(InterpState &S, CodePtr OpPC, const Pointer &This) { - if (!This.isZero()) +bool CheckThis(InterpState &S, CodePtr OpPC) { + if (S.Current->hasThisPointer()) return true; const Expr *E = S.Current->getExpr(OpPC); @@ -1198,8 +1200,8 @@ static bool runRecordDestructor(InterpState &S, CodePtr OpPC, const Record *R = Desc->ElemRecord; assert(R); - if (Pointer::pointToSameBlock(BasePtr, S.Current->getThis()) && - S.Current->getFunction()->isDestructor()) { + if (S.Current->hasThisPointer() && S.Current->getFunction()->isDestructor() && + Pointer::pointToSameBlock(BasePtr, S.Current->getThis())) { const SourceInfo &Loc = S.Current->getSource(OpPC); S.FFDiag(Loc, diag::note_constexpr_double_destroy); return false; diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index b3b4b998439cc..bb0c4580b14a9 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -104,7 +104,7 @@ bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr); bool CheckInit(InterpState &S, CodePtr OpPC, const Pointer &Ptr); /// Checks the 'this' pointer. -bool CheckThis(InterpState &S, CodePtr OpPC, const Pointer &This); +bool CheckThis(InterpState &S, CodePtr OpPC); /// Checks if dynamic memory allocation is available in the current /// language mode. 
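Note: the CheckStore/CheckVolatile addition above, together with the new InvalidStore opcode emitted by the compiler, rejects assignments through volatile glvalues during constant evaluation. A hedged illustration of the user-visible effect (the example is mine, not taken from the patch):

// Illustrative only: evaluating f() in a constant-expression context is now
// diagnosed at the assignment, since modifying a volatile-qualified lvalue is
// not permitted in a core constant expression. f() remains callable at run time.
constexpr int f() {
  volatile int v = 0;
  v = 1;   // diagnosed when f() is constant-evaluated
  return 0;
}
// constexpr int x = f();  // would be rejected with the new check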
@@ -1440,9 +1440,9 @@ template ::T> bool GetThisField(InterpState &S, CodePtr OpPC, uint32_t I) { if (S.checkingPotentialConstantExpression()) return false; - const Pointer &This = S.Current->getThis(); - if (!CheckThis(S, OpPC, This)) + if (!CheckThis(S, OpPC)) return false; + const Pointer &This = S.Current->getThis(); const Pointer &Field = This.atField(I); if (!CheckLoad(S, OpPC, Field)) return false; @@ -1454,10 +1454,10 @@ template ::T> bool SetThisField(InterpState &S, CodePtr OpPC, uint32_t I) { if (S.checkingPotentialConstantExpression()) return false; + if (!CheckThis(S, OpPC)) + return false; const T &Value = S.Stk.pop(); const Pointer &This = S.Current->getThis(); - if (!CheckThis(S, OpPC, This)) - return false; const Pointer &Field = This.atField(I); if (!CheckStore(S, OpPC, Field)) return false; @@ -1560,9 +1560,9 @@ template ::T> bool InitThisField(InterpState &S, CodePtr OpPC, uint32_t I) { if (S.checkingPotentialConstantExpression() && S.Current->getDepth() == 0) return false; - const Pointer &This = S.Current->getThis(); - if (!CheckThis(S, OpPC, This)) + if (!CheckThis(S, OpPC)) return false; + const Pointer &This = S.Current->getThis(); const Pointer &Field = This.atField(I); assert(Field.canBeInitialized()); Field.deref() = S.Stk.pop(); @@ -1574,9 +1574,9 @@ template ::T> bool InitThisFieldActivate(InterpState &S, CodePtr OpPC, uint32_t I) { if (S.checkingPotentialConstantExpression() && S.Current->getDepth() == 0) return false; - const Pointer &This = S.Current->getThis(); - if (!CheckThis(S, OpPC, This)) + if (!CheckThis(S, OpPC)) return false; + const Pointer &This = S.Current->getThis(); const Pointer &Field = This.atField(I); assert(Field.canBeInitialized()); Field.deref() = S.Stk.pop(); @@ -1593,9 +1593,9 @@ bool InitThisBitField(InterpState &S, CodePtr OpPC, const Record::Field *F, assert(F->isBitField()); if (S.checkingPotentialConstantExpression() && S.Current->getDepth() == 0) return false; - const Pointer &This = S.Current->getThis(); - if (!CheckThis(S, OpPC, This)) + if (!CheckThis(S, OpPC)) return false; + const Pointer &This = S.Current->getThis(); const Pointer &Field = This.atField(FieldOffset); assert(Field.canBeInitialized()); const auto &Value = S.Stk.pop(); @@ -1610,9 +1610,9 @@ bool InitThisBitFieldActivate(InterpState &S, CodePtr OpPC, assert(F->isBitField()); if (S.checkingPotentialConstantExpression() && S.Current->getDepth() == 0) return false; - const Pointer &This = S.Current->getThis(); - if (!CheckThis(S, OpPC, This)) + if (!CheckThis(S, OpPC)) return false; + const Pointer &This = S.Current->getThis(); const Pointer &Field = This.atField(FieldOffset); assert(Field.canBeInitialized()); const auto &Value = S.Stk.pop(); @@ -1730,9 +1730,8 @@ inline bool GetPtrLocal(InterpState &S, CodePtr OpPC, uint32_t I) { } inline bool GetPtrParam(InterpState &S, CodePtr OpPC, uint32_t I) { - if (S.checkingPotentialConstantExpression()) { + if (S.Current->isBottomFrame()) return false; - } S.Stk.push(S.Current->getParamPointer(I)); return true; } @@ -1750,9 +1749,9 @@ bool GetPtrFieldPop(InterpState &S, CodePtr OpPC, uint32_t Off); inline bool GetPtrThisField(InterpState &S, CodePtr OpPC, uint32_t Off) { if (S.checkingPotentialConstantExpression() && S.Current->getDepth() == 0) return false; - const Pointer &This = S.Current->getThis(); - if (!CheckThis(S, OpPC, This)) + if (!CheckThis(S, OpPC)) return false; + const Pointer &This = S.Current->getThis(); S.Stk.push(This.atField(Off)); return true; } @@ -1844,9 +1843,9 @@ inline bool 
GetMemberPtrBasePop(InterpState &S, CodePtr OpPC, int32_t Off) { inline bool GetPtrThisBase(InterpState &S, CodePtr OpPC, uint32_t Off) { if (S.checkingPotentialConstantExpression()) return false; - const Pointer &This = S.Current->getThis(); - if (!CheckThis(S, OpPC, This)) + if (!CheckThis(S, OpPC)) return false; + const Pointer &This = S.Current->getThis(); S.Stk.push(This.atField(Off)); return true; } @@ -1925,10 +1924,10 @@ inline bool GetPtrThisVirtBase(InterpState &S, CodePtr OpPC, assert(D); if (S.checkingPotentialConstantExpression()) return false; - const Pointer &This = S.Current->getThis(); - if (!CheckThis(S, OpPC, This)) + if (!CheckThis(S, OpPC)) return false; - return VirtBaseHelper(S, OpPC, D, S.Current->getThis()); + const Pointer &This = S.Current->getThis(); + return VirtBaseHelper(S, OpPC, D, This); } //===----------------------------------------------------------------------===// @@ -1991,6 +1990,8 @@ static inline bool Activate(InterpState &S, CodePtr OpPC) { static inline bool ActivateThisField(InterpState &S, CodePtr OpPC, uint32_t I) { if (S.checkingPotentialConstantExpression()) return false; + if (!S.Current->hasThisPointer()) + return false; const Pointer &Ptr = S.Current->getThis(); assert(Ptr.atField(I).canBeInitialized()); @@ -2124,10 +2125,10 @@ bool InitElem(InterpState &S, CodePtr OpPC, uint32_t Idx) { const T &Value = S.Stk.pop(); const Pointer &Ptr = S.Stk.peek(); - if (Ptr.isUnknownSizeArray()) + const Descriptor *Desc = Ptr.getFieldDesc(); + if (Desc->isUnknownSizeArray()) return false; - const Descriptor *Desc = Ptr.getFieldDesc(); // In the unlikely event that we're initializing the first item of // a non-array, skip the atIndex(). if (Idx == 0 && !Desc->isArray()) { @@ -2158,10 +2159,10 @@ bool InitElemPop(InterpState &S, CodePtr OpPC, uint32_t Idx) { const T &Value = S.Stk.pop(); const Pointer &Ptr = S.Stk.pop(); - if (Ptr.isUnknownSizeArray()) + const Descriptor *Desc = Ptr.getFieldDesc(); + if (Desc->isUnknownSizeArray()) return false; - const Descriptor *Desc = Ptr.getFieldDesc(); // In the unlikely event that we're initializing the first item of // a non-array, skip the atIndex(). if (Idx == 0 && !Desc->isArray()) { @@ -2813,13 +2814,11 @@ inline bool IsNonNull(InterpState &S, CodePtr OpPC) { inline bool This(InterpState &S, CodePtr OpPC) { // Cannot read 'this' in this mode. - if (S.checkingPotentialConstantExpression()) { + if (S.checkingPotentialConstantExpression()) return false; - } - - const Pointer &This = S.Current->getThis(); - if (!CheckThis(S, OpPC, This)) + if (!CheckThis(S, OpPC)) return false; + const Pointer &This = S.Current->getThis(); // Ensure the This pointer has been cast to the correct base. 
if (!This.isDummy()) { @@ -3344,6 +3343,18 @@ inline bool InvalidCast(InterpState &S, CodePtr OpPC, CastKind Kind, return false; } +inline bool InvalidStore(InterpState &S, CodePtr OpPC, const Type *T) { + if (S.getLangOpts().CPlusPlus) { + QualType VolatileType = QualType(T, 0).withVolatile(); + S.FFDiag(S.Current->getSource(OpPC), + diag::note_constexpr_access_volatile_type) + << AK_Assign << VolatileType; + } else { + S.FFDiag(S.Current->getSource(OpPC)); + } + return false; +} + inline bool InvalidDeclRef(InterpState &S, CodePtr OpPC, const DeclRefExpr *DR, bool InitializerFailed) { assert(DR); @@ -3534,6 +3545,9 @@ inline bool AllocCN(InterpState &S, CodePtr OpPC, const Descriptor *ElementDesc, if (!CheckDynamicMemoryAllocation(S, OpPC)) return false; + if (!ElementDesc) + return false; + SizeT NumElements = S.Stk.pop(); if (!CheckArraySize(S, OpPC, &NumElements, ElementDesc->getSize(), IsNoThrow)) { diff --git a/clang/lib/AST/ByteCode/InterpBlock.h b/clang/lib/AST/ByteCode/InterpBlock.h index ea9f44c38842e..9b3dadca6cc14 100644 --- a/clang/lib/AST/ByteCode/InterpBlock.h +++ b/clang/lib/AST/ByteCode/InterpBlock.h @@ -115,9 +115,10 @@ class Block final { return reinterpret_cast(this) + sizeof(Block); } - template T deref() const { + template const T &deref() const { return *reinterpret_cast(data()); } + template T &deref() { return *reinterpret_cast(data()); } /// Invokes the constructor. void invokeCtor() { diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 77729a5d67c87..891344d4e6ed0 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -563,9 +563,9 @@ static bool interp_floating_comparison(InterpState &S, CodePtr OpPC, case Builtin::BI__builtin_islessequal: return LHS <= RHS; case Builtin::BI__builtin_islessgreater: { - ComparisonCategoryResult cmp = LHS.compare(RHS); - return cmp == ComparisonCategoryResult::Less || - cmp == ComparisonCategoryResult::Greater; + ComparisonCategoryResult Cmp = LHS.compare(RHS); + return Cmp == ComparisonCategoryResult::Less || + Cmp == ComparisonCategoryResult::Greater; } case Builtin::BI__builtin_isunordered: return LHS.compare(RHS) == ComparisonCategoryResult::Unordered; @@ -583,8 +583,7 @@ static bool interp_floating_comparison(InterpState &S, CodePtr OpPC, static bool interp__builtin_isfpclass(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { - PrimType FPClassArgT = *S.getContext().classify(Call->getArg(1)->getType()); - APSInt FPClassArg = popToAPSInt(S.Stk, FPClassArgT); + APSInt FPClassArg = popToAPSInt(S, Call->getArg(1)); const Floating &F = S.Stk.pop(); int32_t Result = static_cast( @@ -655,8 +654,7 @@ static bool interp__builtin_fabs(InterpState &S, CodePtr OpPC, static bool interp__builtin_abs(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { - PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); - APSInt Val = popToAPSInt(S.Stk, ArgT); + APSInt Val = popToAPSInt(S, Call->getArg(0)); if (Val == APSInt(APInt::getSignedMinValue(Val.getBitWidth()), /*IsUnsigned=*/false)) return false; @@ -674,8 +672,7 @@ static bool interp__builtin_popcount(InterpState &S, CodePtr OpPC, const Pointer &Arg = S.Stk.pop(); Val = convertBoolVectorToInt(Arg); } else { - PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); - Val = popToAPSInt(S.Stk, ArgT); + Val = popToAPSInt(S, Call->getArg(0)); } pushInteger(S, Val.popcount(), Call->getType()); return true; @@ -684,8 
+681,7 @@ static bool interp__builtin_popcount(InterpState &S, CodePtr OpPC, static bool interp__builtin_parity(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { - PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); - APSInt Val = popToAPSInt(S.Stk, ArgT); + APSInt Val = popToAPSInt(S, Call->getArg(0)); pushInteger(S, Val.popcount() % 2, Call->getType()); return true; } @@ -693,8 +689,7 @@ static bool interp__builtin_parity(InterpState &S, CodePtr OpPC, static bool interp__builtin_clrsb(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { - PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); - APSInt Val = popToAPSInt(S.Stk, ArgT); + APSInt Val = popToAPSInt(S, Call->getArg(0)); pushInteger(S, Val.getBitWidth() - Val.getSignificantBits(), Call->getType()); return true; } @@ -702,8 +697,7 @@ static bool interp__builtin_clrsb(InterpState &S, CodePtr OpPC, static bool interp__builtin_bitreverse(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { - PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); - APSInt Val = popToAPSInt(S.Stk, ArgT); + APSInt Val = popToAPSInt(S, Call->getArg(0)); pushInteger(S, Val.reverseBits(), Call->getType()); return true; } @@ -746,11 +740,8 @@ static bool interp__builtin_expect(InterpState &S, CodePtr OpPC, static bool interp__builtin_rotate(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call, bool Right) { - PrimType AmountT = *S.getContext().classify(Call->getArg(1)->getType()); - PrimType ValueT = *S.getContext().classify(Call->getArg(0)->getType()); - - APSInt Amount = popToAPSInt(S.Stk, AmountT); - APSInt Value = popToAPSInt(S.Stk, ValueT); + APSInt Amount = popToAPSInt(S, Call->getArg(1)); + APSInt Value = popToAPSInt(S, Call->getArg(0)); APSInt Result; if (Right) @@ -767,8 +758,7 @@ static bool interp__builtin_rotate(InterpState &S, CodePtr OpPC, static bool interp__builtin_ffs(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { - PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); - APSInt Value = popToAPSInt(S.Stk, ArgT); + APSInt Value = popToAPSInt(S, Call->getArg(0)); uint64_t N = Value.countr_zero(); pushInteger(S, N == Value.getBitWidth() ? 
0 : N + 1, Call->getType()); @@ -796,8 +786,7 @@ static bool interp__builtin_move(InterpState &S, CodePtr OpPC, static bool interp__builtin_eh_return_data_regno(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { - PrimType ArgT = *S.getContext().classify(Call->getArg(0)->getType()); - APSInt Arg = popToAPSInt(S.Stk, ArgT); + APSInt Arg = popToAPSInt(S, Call->getArg(0)); int Result = S.getASTContext().getTargetInfo().getEHDataRegisterNumber( Arg.getZExtValue()); @@ -971,17 +960,15 @@ static bool interp__builtin_clz(InterpState &S, CodePtr OpPC, unsigned BuiltinOp) { std::optional Fallback; - if (BuiltinOp == Builtin::BI__builtin_clzg && Call->getNumArgs() == 2) { - PrimType FallbackT = *S.getContext().classify(Call->getArg(1)); - Fallback = popToAPSInt(S.Stk, FallbackT); - } + if (BuiltinOp == Builtin::BI__builtin_clzg && Call->getNumArgs() == 2) + Fallback = popToAPSInt(S, Call->getArg(1)); + APSInt Val; if (Call->getArg(0)->getType()->isExtVectorBoolType()) { const Pointer &Arg = S.Stk.pop(); Val = convertBoolVectorToInt(Arg); } else { - PrimType ValT = *S.getContext().classify(Call->getArg(0)); - Val = popToAPSInt(S.Stk, ValT); + Val = popToAPSInt(S, Call->getArg(0)); } // When the argument is 0, the result of GCC builtins is undefined, whereas @@ -1008,17 +995,15 @@ static bool interp__builtin_ctz(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call, unsigned BuiltinID) { std::optional Fallback; - if (BuiltinID == Builtin::BI__builtin_ctzg && Call->getNumArgs() == 2) { - PrimType FallbackT = *S.getContext().classify(Call->getArg(1)); - Fallback = popToAPSInt(S.Stk, FallbackT); - } + if (BuiltinID == Builtin::BI__builtin_ctzg && Call->getNumArgs() == 2) + Fallback = popToAPSInt(S, Call->getArg(1)); + APSInt Val; if (Call->getArg(0)->getType()->isExtVectorBoolType()) { const Pointer &Arg = S.Stk.pop(); Val = convertBoolVectorToInt(Arg); } else { - PrimType ValT = *S.getContext().classify(Call->getArg(0)); - Val = popToAPSInt(S.Stk, ValT); + Val = popToAPSInt(S, Call->getArg(0)); } if (Val == 0) { @@ -1036,13 +1021,10 @@ static bool interp__builtin_ctz(InterpState &S, CodePtr OpPC, static bool interp__builtin_bswap(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { - PrimType ReturnT = *S.getContext().classify(Call->getType()); - PrimType ValT = *S.getContext().classify(Call->getArg(0)); - const APSInt &Val = popToAPSInt(S.Stk, ValT); + const APSInt &Val = popToAPSInt(S, Call->getArg(0)); assert(Val.getActiveBits() <= 64); - INT_TYPE_SWITCH(ReturnT, - { S.Stk.push(T::from(Val.byteSwap().getZExtValue())); }); + pushInteger(S, Val.byteSwap(), Call->getType()); return true; } @@ -1057,9 +1039,8 @@ static bool interp__builtin_atomic_lock_free(InterpState &S, CodePtr OpPC, return true; }; - PrimType ValT = *S.getContext().classify(Call->getArg(0)); const Pointer &Ptr = S.Stk.pop(); - const APSInt &SizeVal = popToAPSInt(S.Stk, ValT); + const APSInt &SizeVal = popToAPSInt(S, Call->getArg(0)); // For __atomic_is_lock_free(sizeof(_Atomic(T))), if the size is a power // of two less than or equal to the maximum inline atomic width, we know it @@ -1125,21 +1106,17 @@ static bool interp__builtin_c11_atomic_is_lock_free(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { - PrimType ValT = *S.getContext().classify(Call->getArg(0)); - const APSInt &SizeVal = popToAPSInt(S.Stk, ValT); - - auto returnBool = [&S](bool Value) -> bool { - S.Stk.push(Value); - return true; - }; + const APSInt &SizeVal = 
popToAPSInt(S, Call->getArg(0)); CharUnits Size = CharUnits::fromQuantity(SizeVal.getZExtValue()); if (Size.isPowerOfTwo()) { // Check against inlining width. unsigned InlineWidthBits = S.getASTContext().getTargetInfo().getMaxAtomicInlineWidth(); - if (Size <= S.getASTContext().toCharUnitsFromBits(InlineWidthBits)) - return returnBool(true); + if (Size <= S.getASTContext().toCharUnitsFromBits(InlineWidthBits)) { + S.Stk.push(true); + return true; + } } return false; // returnBool(false); @@ -1169,8 +1146,7 @@ static bool interp__builtin_is_aligned_up_down(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call, unsigned BuiltinOp) { - PrimType AlignmentT = *S.Ctx.classify(Call->getArg(1)); - const APSInt &Alignment = popToAPSInt(S.Stk, AlignmentT); + const APSInt &Alignment = popToAPSInt(S, Call->getArg(1)); if (Alignment < 0 || !Alignment.isPowerOf2()) { S.FFDiag(Call, diag::note_constexpr_invalid_alignment) << Alignment; @@ -1184,8 +1160,7 @@ static bool interp__builtin_is_aligned_up_down(InterpState &S, CodePtr OpPC, return false; } - // The first parameter is either an integer or a pointer (but not a function - // pointer). + // The first parameter is either an integer or a pointer. PrimType FirstArgT = *S.Ctx.classify(Call->getArg(0)); if (isIntegralType(FirstArgT)) { @@ -1204,12 +1179,12 @@ static bool interp__builtin_is_aligned_up_down(InterpState &S, CodePtr OpPC, } return true; } - assert(FirstArgT == PT_Ptr); const Pointer &Ptr = S.Stk.pop(); + if (!Ptr.isBlockPointer()) + return false; - unsigned PtrOffset = Ptr.getByteOffset(); - PtrOffset = Ptr.getIndex(); + unsigned PtrOffset = Ptr.getIndex(); CharUnits BaseAlignment = S.getASTContext().getDeclAlign(Ptr.getDeclDesc()->asValueDecl()); CharUnits PtrAlign = @@ -1326,10 +1301,8 @@ static bool interp__builtin_ia32_bextr(InterpState &S, CodePtr OpPC, !Call->getArg(1)->getType()->isIntegerType()) return false; - PrimType ValT = *S.Ctx.classify(Call->getArg(0)); - PrimType IndexT = *S.Ctx.classify(Call->getArg(1)); - APSInt Index = popToAPSInt(S.Stk, IndexT); - APSInt Val = popToAPSInt(S.Stk, ValT); + APSInt Index = popToAPSInt(S, Call->getArg(1)); + APSInt Val = popToAPSInt(S, Call->getArg(0)); unsigned BitWidth = Val.getBitWidth(); uint64_t Shift = Index.extractBitsAsZExtValue(8, 0); @@ -1370,32 +1343,6 @@ static bool interp__builtin_ia32_bzhi(InterpState &S, CodePtr OpPC, return true; } -static bool interp__builtin_ia32_lzcnt(InterpState &S, CodePtr OpPC, - const InterpFrame *Frame, - const CallExpr *Call) { - QualType CallType = Call->getType(); - if (!CallType->isIntegerType() || - !Call->getArg(0)->getType()->isIntegerType()) - return false; - - APSInt Val = popToAPSInt(S, Call->getArg(0)); - pushInteger(S, Val.countLeadingZeros(), CallType); - return true; -} - -static bool interp__builtin_ia32_tzcnt(InterpState &S, CodePtr OpPC, - const InterpFrame *Frame, - const CallExpr *Call) { - QualType CallType = Call->getType(); - if (!CallType->isIntegerType() || - !Call->getArg(0)->getType()->isIntegerType()) - return false; - - APSInt Val = popToAPSInt(S, Call->getArg(0)); - pushInteger(S, Val.countTrailingZeros(), CallType); - return true; -} - static bool interp__builtin_ia32_pdep(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { @@ -2541,6 +2488,24 @@ static bool interp__builtin_is_within_lifetime(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_elementwise_int_unaryop( + InterpState &S, CodePtr OpPC, const CallExpr *Call, + llvm::function_ref Fn) { + 
assert(Call->getNumArgs() == 1); + assert(Call->getType()->isIntegerType()); + + // Single integer case. + if (!Call->getArg(0)->getType()->isVectorType()) { + APSInt Src = popToAPSInt(S, Call->getArg(0)); + APInt Result = Fn(Src); + pushInteger(S, APSInt(std::move(Result), !Src.isSigned()), Call->getType()); + return true; + } + + // TODO: Add vector integer handling. + return false; +} + static bool interp__builtin_elementwise_int_binop( InterpState &S, CodePtr OpPC, const CallExpr *Call, llvm::function_ref Fn) { @@ -2604,6 +2569,52 @@ static bool interp__builtin_elementwise_int_binop( return true; } +static bool +interp__builtin_x86_pack(InterpState &S, CodePtr, const CallExpr *E, + llvm::function_ref PackFn) { + const auto *VT0 = E->getArg(0)->getType()->castAs(); + [[maybe_unused]] const auto *VT1 = + E->getArg(1)->getType()->castAs(); + assert(VT0 && VT1 && "pack builtin VT0 and VT1 must be VectorType"); + assert(VT0->getElementType() == VT1->getElementType() && + VT0->getNumElements() == VT1->getNumElements() && + "pack builtin VT0 and VT1 ElementType must be same"); + + const Pointer &RHS = S.Stk.pop(); + const Pointer &LHS = S.Stk.pop(); + const Pointer &Dst = S.Stk.peek(); + + const ASTContext &ASTCtx = S.getASTContext(); + const unsigned SrcBits = ASTCtx.getIntWidth(VT0->getElementType()); + const unsigned LHSVecLen = VT0->getNumElements(); + const unsigned SrcPerLane = 128 / SrcBits; + const unsigned Lanes = LHSVecLen * SrcBits / 128; + + PrimType SrcT = *S.getContext().classify(VT0->getElementType()); + PrimType DstT = *S.getContext().classify(getElemType(Dst)); + const bool IsUnsigend = getElemType(Dst)->isUnsignedIntegerType(); + + for (unsigned Lane = 0; Lane != Lanes; ++Lane) { + const unsigned BaseSrc = Lane * SrcPerLane; + const unsigned BaseDst = Lane * (2 * SrcPerLane); + + for (unsigned I = 0; I != SrcPerLane; ++I) { + INT_TYPE_SWITCH_NO_BOOL(SrcT, { + APSInt A = LHS.elem(BaseSrc + I).toAPSInt(); + APSInt B = RHS.elem(BaseSrc + I).toAPSInt(); + + assignInteger(S, Dst.atIndex(BaseDst + I), DstT, + APSInt(PackFn(A), IsUnsigend)); + assignInteger(S, Dst.atIndex(BaseDst + SrcPerLane + I), DstT, + APSInt(PackFn(B), IsUnsigend)); + }); + } + } + + Dst.initializeAllElements(); + return true; +} + static bool interp__builtin_elementwise_maxmin(InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned BuiltinID) { @@ -3273,12 +3284,18 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case clang::X86::BI__builtin_ia32_lzcnt_u16: case clang::X86::BI__builtin_ia32_lzcnt_u32: case clang::X86::BI__builtin_ia32_lzcnt_u64: - return interp__builtin_ia32_lzcnt(S, OpPC, Frame, Call); + return interp__builtin_elementwise_int_unaryop( + S, OpPC, Call, [](const APSInt &Src) { + return APInt(Src.getBitWidth(), Src.countLeadingZeros()); + }); case clang::X86::BI__builtin_ia32_tzcnt_u16: case clang::X86::BI__builtin_ia32_tzcnt_u32: case clang::X86::BI__builtin_ia32_tzcnt_u64: - return interp__builtin_ia32_tzcnt(S, OpPC, Frame, Call); + return interp__builtin_elementwise_int_unaryop( + S, OpPC, Call, [](const APSInt &Src) { + return APInt(Src.getBitWidth(), Src.countTrailingZeros()); + }); case clang::X86::BI__builtin_ia32_pdep_si: case clang::X86::BI__builtin_ia32_pdep_di: @@ -3477,6 +3494,29 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, } return LHS.lshr(RHS.getZExtValue()); }); + case clang::X86::BI__builtin_ia32_packsswb128: + case clang::X86::BI__builtin_ia32_packsswb256: + case clang::X86::BI__builtin_ia32_packsswb512: + case 
clang::X86::BI__builtin_ia32_packssdw128: + case clang::X86::BI__builtin_ia32_packssdw256: + case clang::X86::BI__builtin_ia32_packssdw512: + return interp__builtin_x86_pack(S, OpPC, Call, [](const APSInt &Src) { + return APInt(Src).truncSSat(Src.getBitWidth() / 2); + }); + case clang::X86::BI__builtin_ia32_packusdw128: + case clang::X86::BI__builtin_ia32_packusdw256: + case clang::X86::BI__builtin_ia32_packusdw512: + case clang::X86::BI__builtin_ia32_packuswb128: + case clang::X86::BI__builtin_ia32_packuswb256: + case clang::X86::BI__builtin_ia32_packuswb512: + return interp__builtin_x86_pack(S, OpPC, Call, [](const APSInt &Src) { + unsigned DstBits = Src.getBitWidth() / 2; + if (Src.isNegative()) + return APInt::getZero(DstBits); + if (Src.isIntN(DstBits)) + return APInt(Src).trunc(DstBits); + return APInt::getAllOnes(DstBits); + }); case clang::X86::BI__builtin_ia32_vprotbi: case clang::X86::BI__builtin_ia32_vprotdi: @@ -3607,6 +3647,61 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_selectpd_512: return interp__builtin_select(S, OpPC, Call); + case X86::BI__builtin_ia32_kandqi: + case X86::BI__builtin_ia32_kandhi: + case X86::BI__builtin_ia32_kandsi: + case X86::BI__builtin_ia32_kanddi: + return interp__builtin_elementwise_int_binop( + S, OpPC, Call, + [](const APSInt &LHS, const APSInt &RHS) { return LHS & RHS; }); + + case X86::BI__builtin_ia32_kandnqi: + case X86::BI__builtin_ia32_kandnhi: + case X86::BI__builtin_ia32_kandnsi: + case X86::BI__builtin_ia32_kandndi: + return interp__builtin_elementwise_int_binop( + S, OpPC, Call, + [](const APSInt &LHS, const APSInt &RHS) { return ~LHS & RHS; }); + + case X86::BI__builtin_ia32_korqi: + case X86::BI__builtin_ia32_korhi: + case X86::BI__builtin_ia32_korsi: + case X86::BI__builtin_ia32_kordi: + return interp__builtin_elementwise_int_binop( + S, OpPC, Call, + [](const APSInt &LHS, const APSInt &RHS) { return LHS | RHS; }); + + case X86::BI__builtin_ia32_kxnorqi: + case X86::BI__builtin_ia32_kxnorhi: + case X86::BI__builtin_ia32_kxnorsi: + case X86::BI__builtin_ia32_kxnordi: + return interp__builtin_elementwise_int_binop( + S, OpPC, Call, + [](const APSInt &LHS, const APSInt &RHS) { return ~(LHS ^ RHS); }); + + case X86::BI__builtin_ia32_kxorqi: + case X86::BI__builtin_ia32_kxorhi: + case X86::BI__builtin_ia32_kxorsi: + case X86::BI__builtin_ia32_kxordi: + return interp__builtin_elementwise_int_binop( + S, OpPC, Call, + [](const APSInt &LHS, const APSInt &RHS) { return LHS ^ RHS; }); + + case X86::BI__builtin_ia32_knotqi: + case X86::BI__builtin_ia32_knothi: + case X86::BI__builtin_ia32_knotsi: + case X86::BI__builtin_ia32_knotdi: + return interp__builtin_elementwise_int_unaryop( + S, OpPC, Call, [](const APSInt &Src) { return ~Src; }); + + case X86::BI__builtin_ia32_kaddqi: + case X86::BI__builtin_ia32_kaddhi: + case X86::BI__builtin_ia32_kaddsi: + case X86::BI__builtin_ia32_kadddi: + return interp__builtin_elementwise_int_binop( + S, OpPC, Call, + [](const APSInt &LHS, const APSInt &RHS) { return LHS + RHS; }); + case Builtin::BI__builtin_elementwise_fshl: return interp__builtin_elementwise_triop(S, OpPC, Call, llvm::APIntOps::fshl); diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp b/clang/lib/AST/ByteCode/InterpFrame.cpp index c411a371282ef..039acb5d72b2c 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.cpp +++ b/clang/lib/AST/ByteCode/InterpFrame.cpp @@ -58,15 +58,12 @@ InterpFrame::InterpFrame(InterpState &S, const Function *Func, CodePtr RetPC, // If the fuction has a This 
pointer, that one is next. // Then follow the actual arguments (but those are handled // in getParamPointer()). - if (Func->hasRVO()) - RVOPtr = stackRef(0); - - if (Func->hasThisPointer()) { - if (Func->hasRVO()) - This = stackRef(sizeof(Pointer)); - else - This = stackRef(0); + if (Func->hasRVO()) { + // RVO pointer offset is always 0. } + + if (Func->hasThisPointer()) + ThisPointerOffset = Func->hasRVO() ? sizeof(Pointer) : 0; } InterpFrame::~InterpFrame() { @@ -167,7 +164,7 @@ void InterpFrame::describe(llvm::raw_ostream &OS) const { /*Indentation=*/0); OS << "."; } else if (const auto *M = dyn_cast(F)) { - print(OS, This, S.getASTContext(), + print(OS, getThis(), S.getASTContext(), S.getASTContext().getLValueReferenceType( S.getASTContext().getCanonicalTagType(M->getParent()))); OS << "."; @@ -234,6 +231,8 @@ Pointer InterpFrame::getParamPointer(unsigned Off) { if (auto Pt = Params.find(Off); Pt != Params.end()) return Pointer(reinterpret_cast(Pt->second.get())); + assert(!isBottomFrame()); + // Allocate memory to store the parameter and the block metadata. const auto &Desc = Func->getParamDescriptor(Off); size_t BlockSize = sizeof(Block) + Desc.second->getAllocSize(); diff --git a/clang/lib/AST/ByteCode/InterpFrame.h b/clang/lib/AST/ByteCode/InterpFrame.h index 129851155bd86..fa9de2e1e7c6d 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.h +++ b/clang/lib/AST/ByteCode/InterpFrame.h @@ -14,7 +14,8 @@ #define LLVM_CLANG_AST_INTERP_INTERPFRAME_H #include "Frame.h" -#include "Program.h" +#include "InterpBlock.h" +#include "Pointer.h" namespace clang { namespace interp { @@ -93,7 +94,7 @@ class InterpFrame final : public Frame { auto Pt = Params.find(Offset); if (Pt == Params.end()) return stackRef(Offset); - return Pointer(reinterpret_cast(Pt->second.get())).deref(); + return reinterpret_cast(Pt->second.get())->deref(); } /// Mutates a local copy of a parameter. @@ -104,11 +105,19 @@ class InterpFrame final : public Frame { /// Returns a pointer to an argument - lazily creates a block. Pointer getParamPointer(unsigned Offset); + bool hasThisPointer() const { return Func && Func->hasThisPointer(); } /// Returns the 'this' pointer. - const Pointer &getThis() const { return This; } + const Pointer &getThis() const { + assert(hasThisPointer()); + return stackRef(ThisPointerOffset); + } /// Returns the RVO pointer, if the Function has one. - const Pointer &getRVOPtr() const { return RVOPtr; } + const Pointer &getRVOPtr() const { + assert(Func); + assert(Func->hasRVO()); + return stackRef(0); + } /// Checks if the frame is a root frame - return should quit the interpreter. bool isRoot() const { return !Func; } @@ -143,7 +152,7 @@ class InterpFrame final : public Frame { /// Returns an offset to a local. template T &localRef(unsigned Offset) const { - return getLocalPointer(Offset).deref(); + return localBlock(Offset)->deref(); } /// Returns a pointer to a local's block. @@ -163,10 +172,8 @@ class InterpFrame final : public Frame { unsigned Depth; /// Reference to the function being executed. const Function *Func; - /// Current object pointer for methods. - Pointer This; - /// Pointer the non-primitive return value gets constructed in. - Pointer RVOPtr; + /// Offset of the instance pointer. Use with stackRef<>(). + unsigned ThisPointerOffset; /// Return address. CodePtr RetPC; /// The size of all the arguments. 
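Note: the InterpFrame change replaces the cached This/RVOPtr members with offsets into the frame's argument area. A comment-only sketch of the layout assumption the new accessors rely on, spelled out from the patch rather than quoted from it:

// Argument area of a frame that has both an RVO pointer and a 'this' pointer:
//
//   offset 0                 sizeof(Pointer)
//   [ RVO return pointer ]   [ 'this' pointer ]   [ parameters ... ]
//
//   getRVOPtr()        -> stackRef<Pointer>(0)
//   ThisPointerOffset  =  Func->hasRVO() ? sizeof(Pointer) : 0
//   getThis()          -> stackRef<Pointer>(ThisPointerOffset)
//
// Both accessors now assert hasRVO()/hasThisPointer() instead of returning a
// default-constructed Pointer, which is why the Interp.h changes above call
// CheckThis()/hasThisPointer() before fetching the 'this' pointer.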
diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 7af2df5318106..532c4448e6f40 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -797,6 +797,7 @@ def SideEffect : Opcode {} def InvalidCast : Opcode { let Args = [ArgCastKind, ArgBool]; } +def InvalidStore : Opcode { let Args = [ArgTypePtr]; } def CheckPseudoDtor : Opcode {} def InvalidDeclRef : Opcode { diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp index 81d4ce14f9310..663134c8696de 100644 --- a/clang/lib/AST/ByteCode/Pointer.cpp +++ b/clang/lib/AST/ByteCode/Pointer.cpp @@ -110,19 +110,21 @@ Pointer &Pointer::operator=(const Pointer &P) { StorageKind = P.StorageKind; Offset = P.Offset; - if (P.isBlockPointer()) { + switch (StorageKind) { + case Storage::Int: + Int = P.Int; + break; + case Storage::Block: BS = P.BS; if (BS.Pointee) BS.Pointee->addPointer(this); - } else if (P.isIntegralPointer()) { - Int = P.Int; - } else if (P.isFunctionPointer()) { + break; + case Storage::Fn: Fn = P.Fn; - } else if (P.isTypeidPointer()) { + break; + case Storage::Typeid: Typeid = P.Typeid; - } else { - assert(false && "Unhandled storage kind"); } return *this; } @@ -147,19 +149,21 @@ Pointer &Pointer::operator=(Pointer &&P) { StorageKind = P.StorageKind; Offset = P.Offset; - if (P.isBlockPointer()) { + switch (StorageKind) { + case Storage::Int: + Int = P.Int; + break; + case Storage::Block: BS = P.BS; if (BS.Pointee) BS.Pointee->addPointer(this); - } else if (P.isIntegralPointer()) { - Int = P.Int; - } else if (P.isFunctionPointer()) { + break; + case Storage::Fn: Fn = P.Fn; - } else if (P.isTypeidPointer()) { + break; + case Storage::Typeid: Typeid = P.Typeid; - } else { - assert(false && "Unhandled storage kind"); } return *this; } @@ -358,13 +362,17 @@ void Pointer::print(llvm::raw_ostream &OS) const { } size_t Pointer::computeOffsetForComparison() const { - if (isIntegralPointer()) - return asIntPointer().Value + Offset; - if (isTypeidPointer()) + switch (StorageKind) { + case Storage::Int: + return Int.Value + Offset; + case Storage::Block: + // See below. + break; + case Storage::Fn: + return Fn.getIntegerRepresentation() + Offset; + case Storage::Typeid: return reinterpret_cast(asTypeidPointer().TypePtr) + Offset; - - if (!isBlockPointer()) - return Offset; + } size_t Result = 0; Pointer P = *this; diff --git a/clang/lib/AST/ByteCode/Pointer.h b/clang/lib/AST/ByteCode/Pointer.h index bbf20801ce923..cd738ce8b2a3e 100644 --- a/clang/lib/AST/ByteCode/Pointer.h +++ b/clang/lib/AST/ByteCode/Pointer.h @@ -56,7 +56,7 @@ struct TypeidPointer { const Type *TypeInfoType; }; -enum class Storage { Block, Int, Fn, Typeid }; +enum class Storage { Int, Block, Fn, Typeid }; /// A pointer to a memory block, live or dead. /// @@ -252,14 +252,17 @@ class Pointer { /// Checks if the pointer is null. bool isZero() const { - if (isBlockPointer()) + switch (StorageKind) { + case Storage::Int: + return Int.Value == 0 && Offset == 0; + case Storage::Block: return BS.Pointee == nullptr; - if (isFunctionPointer()) + case Storage::Fn: return Fn.isZero(); - if (isTypeidPointer()) + case Storage::Typeid: return false; - assert(isIntegralPointer()); - return Int.Value == 0 && Offset == 0; + } + llvm_unreachable("Unknown clang::interp::Storage enum"); } /// Checks if the pointer is live. 
bool isLive() const { diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp index 97ae4a07f32aa..95de6a82a5270 100644 --- a/clang/lib/AST/ExprCXX.cpp +++ b/clang/lib/AST/ExprCXX.cpp @@ -1725,8 +1725,8 @@ SizeOfPackExpr *SizeOfPackExpr::CreateDeserialized(ASTContext &Context, return new (Storage) SizeOfPackExpr(EmptyShell(), NumPartialArgs); } -NamedDecl *SubstNonTypeTemplateParmExpr::getParameter() const { - return cast( +NonTypeTemplateParmDecl *SubstNonTypeTemplateParmExpr::getParameter() const { + return cast( getReplacedTemplateParameterList(getAssociatedDecl())->asArray()[Index]); } diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 3b9ca82910033..b706b14945b6d 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11575,6 +11575,46 @@ static bool handleVectorElementCast(EvalInfo &Info, const FPOptions FPO, return false; } +static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, + llvm::function_ref PackFn) { + APValue LHS, RHS; + if (!EvaluateAsRValue(Info, E->getArg(0), LHS) || + !EvaluateAsRValue(Info, E->getArg(1), RHS)) + return false; + + unsigned LHSVecLen = LHS.getVectorLength(); + unsigned RHSVecLen = RHS.getVectorLength(); + + assert(LHSVecLen != 0 && LHSVecLen == RHSVecLen && + "pack builtin LHSVecLen must equal to RHSVecLen"); + + const VectorType *VT0 = E->getArg(0)->getType()->castAs(); + const unsigned SrcBits = Info.Ctx.getIntWidth(VT0->getElementType()); + + const VectorType *DstVT = E->getType()->castAs(); + QualType DstElemTy = DstVT->getElementType(); + const bool DstIsUnsigned = DstElemTy->isUnsignedIntegerType(); + + const unsigned SrcPerLane = 128 / SrcBits; + const unsigned Lanes = LHSVecLen * SrcBits / 128; + + SmallVector Out; + Out.reserve(LHSVecLen + RHSVecLen); + + for (unsigned Lane = 0; Lane != Lanes; ++Lane) { + unsigned base = Lane * SrcPerLane; + for (unsigned I = 0; I != SrcPerLane; ++I) + Out.emplace_back(APValue( + APSInt(PackFn(LHS.getVectorElt(base + I).getInt()), DstIsUnsigned))); + for (unsigned I = 0; I != SrcPerLane; ++I) + Out.emplace_back(APValue( + APSInt(PackFn(RHS.getVectorElt(base + I).getInt()), DstIsUnsigned))); + } + + Result = APValue(Out.data(), Out.size()); + return true; +} + bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (!IsConstantEvaluatedBuiltinCall(E)) return ExprEvaluatorBaseTy::VisitCallExpr(E); @@ -11768,7 +11808,29 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { } return LHS.lshr(RHS.getZExtValue()); }); - + case X86::BI__builtin_ia32_packsswb128: + case X86::BI__builtin_ia32_packsswb256: + case X86::BI__builtin_ia32_packsswb512: + case X86::BI__builtin_ia32_packssdw128: + case X86::BI__builtin_ia32_packssdw256: + case X86::BI__builtin_ia32_packssdw512: + return evalPackBuiltin(E, Info, Result, [](const APSInt &Src) { + return APSInt(Src).truncSSat(Src.getBitWidth() / 2); + }); + case X86::BI__builtin_ia32_packusdw128: + case X86::BI__builtin_ia32_packusdw256: + case X86::BI__builtin_ia32_packusdw512: + case X86::BI__builtin_ia32_packuswb128: + case X86::BI__builtin_ia32_packuswb256: + case X86::BI__builtin_ia32_packuswb512: + return evalPackBuiltin(E, Info, Result, [](const APSInt &Src) { + unsigned DstBits = Src.getBitWidth() / 2; + if (Src.isNegative()) + return APInt::getZero(DstBits); + if (Src.isIntN(DstBits)) + return APInt((Src).trunc(DstBits)); + return APInt::getAllOnes(DstBits); + }); case clang::X86::BI__builtin_ia32_pmuldq128: case clang::X86::BI__builtin_ia32_pmuldq256: case 
clang::X86::BI__builtin_ia32_pmuldq512: @@ -13588,6 +13650,20 @@ static bool getBuiltinAlignArguments(const CallExpr *E, EvalInfo &Info, bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, unsigned BuiltinOp) { + + auto HandleMaskBinOp = + [&](llvm::function_ref Fn) + -> bool { + APValue LHS, RHS; + if (!Evaluate(LHS, Info, E->getArg(0)) || + !Evaluate(RHS, Info, E->getArg(1))) + return false; + + APSInt ResultInt = Fn(LHS.getInt(), RHS.getInt()); + + return Success(APValue(ResultInt), E); + }; + switch (BuiltinOp) { default: return false; @@ -14687,6 +14763,65 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, Result.setBitVal(P++, Val[I]); return Success(Result, E); } + + case X86::BI__builtin_ia32_kandqi: + case X86::BI__builtin_ia32_kandhi: + case X86::BI__builtin_ia32_kandsi: + case X86::BI__builtin_ia32_kanddi: { + return HandleMaskBinOp( + [](const APSInt &LHS, const APSInt &RHS) { return LHS & RHS; }); + } + + case X86::BI__builtin_ia32_kandnqi: + case X86::BI__builtin_ia32_kandnhi: + case X86::BI__builtin_ia32_kandnsi: + case X86::BI__builtin_ia32_kandndi: { + return HandleMaskBinOp( + [](const APSInt &LHS, const APSInt &RHS) { return ~LHS & RHS; }); + } + + case X86::BI__builtin_ia32_korqi: + case X86::BI__builtin_ia32_korhi: + case X86::BI__builtin_ia32_korsi: + case X86::BI__builtin_ia32_kordi: { + return HandleMaskBinOp( + [](const APSInt &LHS, const APSInt &RHS) { return LHS | RHS; }); + } + + case X86::BI__builtin_ia32_kxnorqi: + case X86::BI__builtin_ia32_kxnorhi: + case X86::BI__builtin_ia32_kxnorsi: + case X86::BI__builtin_ia32_kxnordi: { + return HandleMaskBinOp( + [](const APSInt &LHS, const APSInt &RHS) { return ~(LHS ^ RHS); }); + } + + case X86::BI__builtin_ia32_kxorqi: + case X86::BI__builtin_ia32_kxorhi: + case X86::BI__builtin_ia32_kxorsi: + case X86::BI__builtin_ia32_kxordi: { + return HandleMaskBinOp( + [](const APSInt &LHS, const APSInt &RHS) { return LHS ^ RHS; }); + } + + case X86::BI__builtin_ia32_knotqi: + case X86::BI__builtin_ia32_knothi: + case X86::BI__builtin_ia32_knotsi: + case X86::BI__builtin_ia32_knotdi: { + APSInt Val; + if (!EvaluateInteger(E->getArg(0), Val, Info)) + return false; + APSInt Result = ~Val; + return Success(APValue(Result), E); + } + + case X86::BI__builtin_ia32_kaddqi: + case X86::BI__builtin_ia32_kaddhi: + case X86::BI__builtin_ia32_kaddsi: + case X86::BI__builtin_ia32_kadddi: { + return HandleMaskBinOp( + [](const APSInt &LHS, const APSInt &RHS) { return LHS + RHS; }); + } } } @@ -18770,9 +18905,15 @@ std::optional Expr::tryEvaluateString(ASTContext &Ctx) const { uint64_t Result; std::string StringResult; + if (Info.EnableNewConstInterp) { + if (!Info.Ctx.getInterpContext().evaluateString(Info, this, StringResult)) + return std::nullopt; + return StringResult; + } + if (EvaluateBuiltinStrLen(this, Result, Info, &StringResult)) return StringResult; - return {}; + return std::nullopt; } template diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index 11a43e8c7a030..2ce4419940e52 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -15,6 +15,7 @@ #include "clang/AST/Attr.h" #include "clang/AST/Decl.h" #include "clang/AST/DeclOpenMP.h" +#include "clang/AST/ExprOpenMP.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/OpenMPKinds.h" #include "clang/Basic/TargetInfo.h" @@ -1023,6 +1024,26 @@ OMPPartialClause *OMPPartialClause::CreateEmpty(const ASTContext &C) { return new (C) OMPPartialClause(); } +OMPLoopRangeClause * 
+OMPLoopRangeClause::Create(const ASTContext &C, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation FirstLoc, + SourceLocation CountLoc, SourceLocation EndLoc, + Expr *First, Expr *Count) { + OMPLoopRangeClause *Clause = CreateEmpty(C); + Clause->setLocStart(StartLoc); + Clause->setLParenLoc(LParenLoc); + Clause->setFirstLoc(FirstLoc); + Clause->setCountLoc(CountLoc); + Clause->setLocEnd(EndLoc); + Clause->setFirst(First); + Clause->setCount(Count); + return Clause; +} + +OMPLoopRangeClause *OMPLoopRangeClause::CreateEmpty(const ASTContext &C) { + return new (C) OMPLoopRangeClause(); +} + OMPAllocateClause *OMPAllocateClause::Create( const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, Expr *Allocator, Expr *Alignment, SourceLocation ColonLoc, @@ -1159,6 +1180,77 @@ unsigned OMPClauseMappableExprCommon::getUniqueDeclarationsTotalNumber( return UniqueDecls.size(); } +QualType +OMPClauseMappableExprCommon::getComponentExprElementType(const Expr *Exp) { + assert(!isa(Exp) && + "Cannot get element-type from array-shaping expr."); + + // Unless we are handling array-section expressions, including + // array-subscripts, derefs, we can rely on getType. + if (!isa(Exp)) + return Exp->getType().getNonReferenceType().getCanonicalType(); + + // For array-sections, we need to find the type of one element of + // the section. + const auto *OASE = cast(Exp); + + QualType BaseType = ArraySectionExpr::getBaseOriginalType(OASE->getBase()); + + QualType ElemTy; + if (const auto *ATy = BaseType->getAsArrayTypeUnsafe()) + ElemTy = ATy->getElementType(); + else + ElemTy = BaseType->getPointeeType(); + + ElemTy = ElemTy.getNonReferenceType().getCanonicalType(); + return ElemTy; +} + +std::pair> +OMPClauseMappableExprCommon::findAttachPtrExpr( + MappableExprComponentListRef Components, OpenMPDirectiveKind CurDirKind) { + + // If we only have a single component, we have a map like "map(p)", which + // cannot have a base-pointer. + if (Components.size() < 2) + return {nullptr, std::nullopt}; + + // Only check for non-contiguous sections on target_update, since we can + // assume array-sections are contiguous on maps on other constructs, even if + // we are not sure of it at compile-time, like for a[1:x][2]. + if (Components.back().isNonContiguous() && CurDirKind == OMPD_target_update) + return {nullptr, std::nullopt}; + + // To find the attach base-pointer, we start with the second component, + // stripping away one component at a time, until we reach a pointer Expr + // (that is not a binary operator). The first such pointer should be the + // attach base-pointer for the component list. + for (auto [I, Component] : llvm::enumerate(Components)) { + // Skip past the first component. + if (I == 0) + continue; + + const Expr *CurExpr = Component.getAssociatedExpression(); + if (!CurExpr) + break; + + // If CurExpr is something like `p + 10`, we need to ignore it, since + // we are looking for `p`. + if (isa(CurExpr)) + continue; + + // Keep going until we reach an Expr of pointer type. + QualType CurType = getComponentExprElementType(CurExpr); + if (!CurType->isPointerType()) + continue; + + // We have found a pointer Expr. This must be the attach pointer. 
+ return {CurExpr, Components.size() - I}; + } + + return {nullptr, std::nullopt}; +} + OMPMapClause *OMPMapClause::Create( const ASTContext &C, const OMPVarListLocTy &Locs, ArrayRef Vars, ArrayRef Declarations, @@ -1892,6 +1984,21 @@ void OMPClausePrinter::VisitOMPPartialClause(OMPPartialClause *Node) { } } +void OMPClausePrinter::VisitOMPLoopRangeClause(OMPLoopRangeClause *Node) { + OS << "looprange"; + + Expr *First = Node->getFirst(); + Expr *Count = Node->getCount(); + + if (First && Count) { + OS << "("; + First->printPretty(OS, nullptr, Policy, 0); + OS << ","; + Count->printPretty(OS, nullptr, Policy, 0); + OS << ")"; + } +} + void OMPClausePrinter::VisitOMPAllocatorClause(OMPAllocatorClause *Node) { OS << "allocator("; Node->getAllocator()->printPretty(OS, nullptr, Policy, 0); diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp index 1f6586f95a9f8..a5b0cd3786a28 100644 --- a/clang/lib/AST/StmtOpenMP.cpp +++ b/clang/lib/AST/StmtOpenMP.cpp @@ -125,13 +125,12 @@ OMPLoopBasedDirective::tryToFindNextInnerLoop(Stmt *CurStmt, bool OMPLoopBasedDirective::doForAllLoops( Stmt *CurStmt, bool TryImperfectlyNestedLoops, unsigned NumLoops, llvm::function_ref Callback, - llvm::function_ref + llvm::function_ref OnTransformationCallback) { CurStmt = CurStmt->IgnoreContainers(); for (unsigned Cnt = 0; Cnt < NumLoops; ++Cnt) { while (true) { - auto *Dir = - dyn_cast(CurStmt); + auto *Dir = dyn_cast(CurStmt); if (!Dir) break; @@ -371,6 +370,22 @@ OMPForDirective *OMPForDirective::Create( return Dir; } +Stmt *OMPLoopTransformationDirective::getTransformedStmt() const { + if (auto *D = dyn_cast(S)) + return D->getTransformedStmt(); + if (auto *D = dyn_cast(S)) + return D->getTransformedStmt(); + llvm_unreachable("unexpected object type"); +} + +Stmt *OMPLoopTransformationDirective::getPreInits() const { + if (auto *D = dyn_cast(S)) + return D->getPreInits(); + if (auto *D = dyn_cast(S)) + return D->getPreInits(); + llvm_unreachable("unexpected object type"); +} + Stmt *OMPCanonicalLoopNestTransformationDirective::getTransformedStmt() const { switch (getStmtClass()) { #define STMT(CLASS, PARENT) @@ -380,7 +395,7 @@ Stmt *OMPCanonicalLoopNestTransformationDirective::getTransformedStmt() const { return static_cast(this)->getTransformedStmt(); #include "clang/AST/StmtNodes.inc" default: - llvm_unreachable("Not a loop transformation"); + llvm_unreachable("Not a loop transformation for canonical loop nests"); } } @@ -393,7 +408,34 @@ Stmt *OMPCanonicalLoopNestTransformationDirective::getPreInits() const { return static_cast(this)->getPreInits(); #include "clang/AST/StmtNodes.inc" default: - llvm_unreachable("Not a loop transformation"); + llvm_unreachable("Not a loop transformation for canonical loop nests"); + } +} + +Stmt * +OMPCanonicalLoopSequenceTransformationDirective::getTransformedStmt() const { + switch (getStmtClass()) { +#define STMT(CLASS, PARENT) +#define ABSTRACT_STMT(CLASS) +#define OMPCANONICALLOOPSEQUENCETRANSFORMATIONDIRECTIVE(CLASS, PARENT) \ + case Stmt::CLASS##Class: \ + return static_cast(this)->getTransformedStmt(); +#include "clang/AST/StmtNodes.inc" + default: + llvm_unreachable("Not a loop transformation for canonical loop sequences"); + } +} + +Stmt *OMPCanonicalLoopSequenceTransformationDirective::getPreInits() const { + switch (getStmtClass()) { +#define STMT(CLASS, PARENT) +#define ABSTRACT_STMT(CLASS) +#define OMPCANONICALLOOPSEQUENCETRANSFORMATIONDIRECTIVE(CLASS, PARENT) \ + case Stmt::CLASS##Class: \ + return static_cast(this)->getPreInits(); +#include 
"clang/AST/StmtNodes.inc" + default: + llvm_unreachable("Not a loop transformation for canonical loop sequences"); } } @@ -510,6 +552,27 @@ OMPInterchangeDirective::CreateEmpty(const ASTContext &C, unsigned NumClauses, SourceLocation(), SourceLocation(), NumLoops); } +OMPFuseDirective *OMPFuseDirective::Create( + const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, + ArrayRef Clauses, unsigned NumGeneratedTopLevelLoops, + Stmt *AssociatedStmt, Stmt *TransformedStmt, Stmt *PreInits) { + + OMPFuseDirective *Dir = createDirective( + C, Clauses, AssociatedStmt, TransformedStmtOffset + 1, StartLoc, EndLoc); + Dir->setTransformedStmt(TransformedStmt); + Dir->setPreInits(PreInits); + Dir->setNumGeneratedTopLevelLoops(NumGeneratedTopLevelLoops); + return Dir; +} + +OMPFuseDirective *OMPFuseDirective::CreateEmpty(const ASTContext &C, + unsigned NumClauses) { + OMPFuseDirective *Dir = createEmptyDirective( + C, NumClauses, /*HasAssociatedStmt=*/true, TransformedStmtOffset + 1, + SourceLocation(), SourceLocation()); + return Dir; +} + OMPForSimdDirective * OMPForSimdDirective::Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, unsigned CollapsedNum, diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index 2c9c3581a2962..586c3000f105c 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -795,6 +795,11 @@ void StmtPrinter::VisitOMPInterchangeDirective(OMPInterchangeDirective *Node) { PrintOMPExecutableDirective(Node); } +void StmtPrinter::VisitOMPFuseDirective(OMPFuseDirective *Node) { + Indent() << "#pragma omp fuse"; + PrintOMPExecutableDirective(Node); +} + void StmtPrinter::VisitOMPForDirective(OMPForDirective *Node) { Indent() << "#pragma omp for"; PrintOMPExecutableDirective(Node); diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 37c4d43ec0b2f..589a156a2b6ea 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -510,6 +510,13 @@ void OMPClauseProfiler::VisitOMPPartialClause(const OMPPartialClause *C) { Profiler->VisitExpr(Factor); } +void OMPClauseProfiler::VisitOMPLoopRangeClause(const OMPLoopRangeClause *C) { + if (const Expr *First = C->getFirst()) + Profiler->VisitExpr(First); + if (const Expr *Count = C->getCount()) + Profiler->VisitExpr(Count); +} + void OMPClauseProfiler::VisitOMPAllocatorClause(const OMPAllocatorClause *C) { if (C->getAllocator()) Profiler->VisitStmt(C->getAllocator()); @@ -1025,6 +1032,15 @@ void StmtProfiler::VisitOMPInterchangeDirective( VisitOMPCanonicalLoopNestTransformationDirective(S); } +void StmtProfiler::VisitOMPCanonicalLoopSequenceTransformationDirective( + const OMPCanonicalLoopSequenceTransformationDirective *S) { + VisitOMPExecutableDirective(S); +} + +void StmtProfiler::VisitOMPFuseDirective(const OMPFuseDirective *S) { + VisitOMPCanonicalLoopSequenceTransformationDirective(S); +} + void StmtProfiler::VisitOMPForDirective(const OMPForDirective *S) { VisitOMPLoopDirective(S); } @@ -1353,7 +1369,8 @@ void StmtProfiler::VisitExpr(const Expr *S) { } void StmtProfiler::VisitConstantExpr(const ConstantExpr *S) { - VisitExpr(S); + // Profile exactly as the sub-expression. 
+ Visit(S->getSubExpr()); } void StmtProfiler::VisitDeclRefExpr(const DeclRefExpr *S) { diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index cd59678d67f2f..f3448af5f8f50 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -846,16 +846,45 @@ void TypePrinter::printExtVectorAfter(const ExtVectorType *T, raw_ostream &OS) { } } -void TypePrinter::printConstantMatrixBefore(const ConstantMatrixType *T, - raw_ostream &OS) { - printBefore(T->getElementType(), OS); - OS << " __attribute__((matrix_type("; +static void printDims(const ConstantMatrixType *T, raw_ostream &OS) { OS << T->getNumRows() << ", " << T->getNumColumns(); +} + +static void printHLSLMatrixBefore(TypePrinter &TP, const ConstantMatrixType *T, + raw_ostream &OS) { + OS << "matrix<"; + TP.printBefore(T->getElementType(), OS); +} + +static void printHLSLMatrixAfter(const ConstantMatrixType *T, raw_ostream &OS) { + OS << ", "; + printDims(T, OS); + OS << ">"; +} + +static void printClangMatrixBefore(TypePrinter &TP, const ConstantMatrixType *T, + raw_ostream &OS) { + TP.printBefore(T->getElementType(), OS); + OS << " __attribute__((matrix_type("; + printDims(T, OS); OS << ")))"; } +void TypePrinter::printConstantMatrixBefore(const ConstantMatrixType *T, + raw_ostream &OS) { + if (Policy.UseHLSLTypes) { + printHLSLMatrixBefore(*this, T, OS); + return; + } + printClangMatrixBefore(*this, T, OS); +} + void TypePrinter::printConstantMatrixAfter(const ConstantMatrixType *T, raw_ostream &OS) { + if (Policy.UseHLSLTypes) { + printHLSLMatrixAfter(T, OS); + return; + } printAfter(T->getElementType(), OS); } diff --git a/clang/lib/Analysis/CMakeLists.txt b/clang/lib/Analysis/CMakeLists.txt index 0523d92480cb3..5a26f3eeea418 100644 --- a/clang/lib/Analysis/CMakeLists.txt +++ b/clang/lib/Analysis/CMakeLists.txt @@ -21,6 +21,7 @@ add_clang_library(clangAnalysis FixitUtil.cpp IntervalPartition.cpp IssueHash.cpp + LifetimeAnnotations.cpp LifetimeSafety.cpp LiveVariables.cpp MacroExpansionContext.cpp diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp index 60371d9498c25..06f12784aa82d 100644 --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -657,7 +657,12 @@ class TransferVisitor : public ConstStmtVisitor { if (LocSrc == nullptr || LocDst == nullptr) return; - copyRecord(*LocSrc, *LocDst, Env); + // If the destination object here is of a derived class, `Arg0` may be a + // cast of that object to a base class, and the source object may be of a + // sibling derived class. To handle these cases, ensure we are copying + // only the fields for `Arg0`'s type, not the type of the underlying + // `RecordStorageLocation`. + copyRecord(*LocSrc, *LocDst, Env, Arg0->getType()); // The assignment operator can have an arbitrary return type. We model the // return value only if the return type is the same as or a base class of diff --git a/clang/lib/Analysis/LifetimeAnnotations.cpp b/clang/lib/Analysis/LifetimeAnnotations.cpp new file mode 100644 index 0000000000000..e79122475625e --- /dev/null +++ b/clang/lib/Analysis/LifetimeAnnotations.cpp @@ -0,0 +1,75 @@ +//===- LifetimeAnnotations.cpp - -*--------------- C++------------------*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "clang/Analysis/Analyses/LifetimeAnnotations.h" +#include "clang/AST/ASTContext.h" +#include "clang/AST/Attr.h" +#include "clang/AST/Decl.h" +#include "clang/AST/DeclCXX.h" +#include "clang/AST/Type.h" +#include "clang/AST/TypeLoc.h" + +namespace clang { +namespace lifetimes { + +const FunctionDecl * +getDeclWithMergedLifetimeBoundAttrs(const FunctionDecl *FD) { + return FD != nullptr ? FD->getMostRecentDecl() : nullptr; +} + +const CXXMethodDecl * +getDeclWithMergedLifetimeBoundAttrs(const CXXMethodDecl *CMD) { + const FunctionDecl *FD = CMD; + return cast_if_present( + getDeclWithMergedLifetimeBoundAttrs(FD)); +} + +bool isNormalAssignmentOperator(const FunctionDecl *FD) { + OverloadedOperatorKind OO = FD->getDeclName().getCXXOverloadedOperator(); + bool IsAssignment = OO == OO_Equal || isCompoundAssignmentOperator(OO); + if (!IsAssignment) + return false; + QualType RetT = FD->getReturnType(); + if (!RetT->isLValueReferenceType()) + return false; + ASTContext &Ctx = FD->getASTContext(); + QualType LHST; + auto *MD = dyn_cast(FD); + if (MD && MD->isCXXInstanceMember()) + LHST = Ctx.getLValueReferenceType(MD->getFunctionObjectParameterType()); + else + LHST = FD->getParamDecl(0)->getType(); + return Ctx.hasSameType(RetT, LHST); +} + +bool isAssignmentOperatorLifetimeBound(const CXXMethodDecl *CMD) { + CMD = getDeclWithMergedLifetimeBoundAttrs(CMD); + return CMD && isNormalAssignmentOperator(CMD) && CMD->param_size() == 1 && + CMD->getParamDecl(0)->hasAttr(); +} + +bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { + FD = getDeclWithMergedLifetimeBoundAttrs(FD); + const TypeSourceInfo *TSI = FD->getTypeSourceInfo(); + if (!TSI) + return false; + // Don't declare this variable in the second operand of the for-statement; + // GCC miscompiles that by ending its lifetime before evaluating the + // third operand. See gcc.gnu.org/PR86769. + AttributedTypeLoc ATL; + for (TypeLoc TL = TSI->getTypeLoc(); + (ATL = TL.getAsAdjusted()); + TL = ATL.getModifiedLoc()) { + if (ATL.getAttrAs()) + return true; + } + + return isNormalAssignmentOperator(FD); +} + +} // namespace lifetimes +} // namespace clang diff --git a/clang/lib/Analysis/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety.cpp index d016c6f12e82e..c18b8fb890a05 100644 --- a/clang/lib/Analysis/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety.cpp @@ -10,6 +10,7 @@ #include "clang/AST/Expr.h" #include "clang/AST/StmtVisitor.h" #include "clang/AST/Type.h" +#include "clang/Analysis/Analyses/LifetimeAnnotations.h" #include "clang/Analysis/Analyses/PostOrderCFGView.h" #include "clang/Analysis/AnalysisDeclContext.h" #include "clang/Analysis/CFG.h" @@ -213,10 +214,13 @@ class Fact { /// out of scope). Expire, /// An origin is propagated from a source to a destination (e.g., p = q). - AssignOrigin, + /// This can also optionally kill the destination origin before flowing into + /// it. Otherwise, the source's loan set is merged into the destination's + /// loan set. + OriginFlow, /// An origin escapes the function by flowing into the return value. ReturnOfOrigin, - /// An origin is used (eg. dereferencing a pointer). + /// An origin is used (eg. appears as l-value expression like DeclRefExpr). Use, /// A marker for a specific point in the code, for testing. 
TestPoint, @@ -285,25 +289,33 @@ class ExpireFact : public Fact { } }; -class AssignOriginFact : public Fact { +class OriginFlowFact : public Fact { OriginID OIDDest; OriginID OIDSrc; + // True if the destination origin should be killed (i.e., its current loans + // cleared) before the source origin's loans are flowed into it. + bool KillDest; public: static bool classof(const Fact *F) { - return F->getKind() == Kind::AssignOrigin; + return F->getKind() == Kind::OriginFlow; } - AssignOriginFact(OriginID OIDDest, OriginID OIDSrc) - : Fact(Kind::AssignOrigin), OIDDest(OIDDest), OIDSrc(OIDSrc) {} + OriginFlowFact(OriginID OIDDest, OriginID OIDSrc, bool KillDest) + : Fact(Kind::OriginFlow), OIDDest(OIDDest), OIDSrc(OIDSrc), + KillDest(KillDest) {} + OriginID getDestOriginID() const { return OIDDest; } OriginID getSrcOriginID() const { return OIDSrc; } + bool getKillDest() const { return KillDest; } + void dump(llvm::raw_ostream &OS, const LoanManager &, const OriginManager &OM) const override { - OS << "AssignOrigin (Dest: "; + OS << "OriginFlow (Dest: "; OM.dump(getDestOriginID(), OS); OS << ", Src: "; OM.dump(getSrcOriginID(), OS); + OS << (getKillDest() ? "" : ", Merge"); OS << ")\n"; } }; @@ -454,7 +466,7 @@ class FactGenerator : public ConstStmtVisitor { if (const auto *VD = dyn_cast(D)) if (hasOrigin(VD)) if (const Expr *InitExpr = VD->getInit()) - addAssignOriginFact(*VD, *InitExpr); + killAndFlowOrigin(*VD, *InitExpr); } void VisitDeclRefExpr(const DeclRefExpr *DRE) { @@ -492,9 +504,23 @@ class FactGenerator : public ConstStmtVisitor { isa(MCE->getCalleeDecl())) { // The argument is the implicit object itself. handleFunctionCall(MCE, MCE->getMethodDecl(), - {MCE->getImplicitObjectArgument()}); + {MCE->getImplicitObjectArgument()}, + /*IsGslConstruction=*/true); + } + if (const CXXMethodDecl *Method = MCE->getMethodDecl()) { + // Construct the argument list, with the implicit 'this' object as the + // first argument. + llvm::SmallVector Args; + Args.push_back(MCE->getImplicitObjectArgument()); + Args.append(MCE->getArgs(), MCE->getArgs() + MCE->getNumArgs()); + + handleFunctionCall(MCE, Method, Args, /*IsGslConstruction=*/false); } - // FIXME: A more general VisitCallExpr could also be used here. + } + + void VisitCallExpr(const CallExpr *CE) { + handleFunctionCall(CE, CE->getDirectCallee(), + {CE->getArgs(), CE->getNumArgs()}); } void VisitCXXNullPtrLiteralExpr(const CXXNullPtrLiteralExpr *N) { @@ -508,7 +534,7 @@ class FactGenerator : public ConstStmtVisitor { return; // An ImplicitCastExpr node itself gets an origin, which flows from the // origin of its sub-expression (after stripping its own parens/casts). - addAssignOriginFact(*ICE, *ICE->getSubExpr()); + killAndFlowOrigin(*ICE, *ICE->getSubExpr()); } void VisitUnaryOperator(const UnaryOperator *UO) { @@ -522,7 +548,7 @@ class FactGenerator : public ConstStmtVisitor { // its sub-expression (x). This fact will cause the dataflow analysis // to propagate any loans held by the sub-expression's origin to the // origin of this UnaryOperator expression. - addAssignOriginFact(*UO, *SubExpr); + killAndFlowOrigin(*UO, *SubExpr); } } @@ -542,8 +568,15 @@ class FactGenerator : public ConstStmtVisitor { } void VisitCXXOperatorCallExpr(const CXXOperatorCallExpr *OCE) { - if (OCE->isAssignmentOp() && OCE->getNumArgs() == 2) + // Assignment operators have special "kill-then-propagate" semantics + // and are handled separately. 
+ if (OCE->isAssignmentOp() && OCE->getNumArgs() == 2) { handleAssignment(OCE->getArg(0), OCE->getArg(1)); + return; + } + handleFunctionCall(OCE, OCE->getDirectCallee(), + {OCE->getArgs(), OCE->getNumArgs()}, + /*IsGslConstruction=*/false); } void VisitCXXFunctionalCastExpr(const CXXFunctionalCastExpr *FCE) { @@ -552,7 +585,7 @@ class FactGenerator : public ConstStmtVisitor { if (handleTestPoint(FCE)) return; if (isGslPointerType(FCE->getType())) - addAssignOriginFact(*FCE, *FCE->getSubExpr()); + killAndFlowOrigin(*FCE, *FCE->getSubExpr()); } void VisitInitListExpr(const InitListExpr *ILE) { @@ -561,7 +594,7 @@ class FactGenerator : public ConstStmtVisitor { // For list initialization with a single element, like `View{...}`, the // origin of the list itself is the origin of its single element. if (ILE->getNumInits() == 1) - addAssignOriginFact(*ILE, *ILE->getInit(0)); + killAndFlowOrigin(*ILE, *ILE->getInit(0)); } void VisitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *MTE) { @@ -569,7 +602,7 @@ class FactGenerator : public ConstStmtVisitor { return; // A temporary object's origin is the same as the origin of the // expression that initializes it. - addAssignOriginFact(*MTE, *MTE->getSubExpr()); + killAndFlowOrigin(*MTE, *MTE->getSubExpr()); } void handleDestructor(const CFGAutomaticObjDtor &DtorOpt) { @@ -624,34 +657,51 @@ class FactGenerator : public ConstStmtVisitor { if (CCE->getNumArgs() != 1) return; if (hasOrigin(CCE->getArg(0))) - addAssignOriginFact(*CCE, *CCE->getArg(0)); + killAndFlowOrigin(*CCE, *CCE->getArg(0)); else // This could be a new borrow. handleFunctionCall(CCE, CCE->getConstructor(), - {CCE->getArgs(), CCE->getNumArgs()}); + {CCE->getArgs(), CCE->getNumArgs()}, + /*IsGslConstruction=*/true); } /// Checks if a call-like expression creates a borrow by passing a value to a /// reference parameter, creating an IssueFact if it does. + /// \param IsGslConstruction True if this is a GSL construction where all + /// argument origins should flow to the returned origin. void handleFunctionCall(const Expr *Call, const FunctionDecl *FD, - ArrayRef Args) { - if (!FD) + ArrayRef Args, + bool IsGslConstruction = false) { + // Ignore functions returning values with no origin. + if (!FD || !hasOrigin(Call)) return; - // TODO: Handle more than one arguments. - for (unsigned I = 0; I <= 0 /*Args.size()*/; ++I) { - const Expr *ArgExpr = Args[I]; - - // Propagate origins for CXX this. - if (FD->isCXXClassMember() && I == 0) { - addAssignOriginFact(*Call, *ArgExpr); - continue; + auto IsArgLifetimeBound = [FD](unsigned I) -> bool { + const ParmVarDecl *PVD = nullptr; + if (const auto *Method = dyn_cast(FD); + Method && Method->isInstance()) { + if (I == 0) + // For the 'this' argument, the attribute is on the method itself. + return implicitObjectParamIsLifetimeBound(Method); + if ((I - 1) < Method->getNumParams()) + // For explicit arguments, find the corresponding parameter + // declaration. + PVD = Method->getParamDecl(I - 1); + } else if (I < FD->getNumParams()) + // For free functions or static methods. + PVD = FD->getParamDecl(I); + return PVD ? PVD->hasAttr() : false; + }; + if (Args.empty()) + return; + bool killedSrc = false; + for (unsigned I = 0; I < Args.size(); ++I) + if (IsGslConstruction || IsArgLifetimeBound(I)) { + if (!killedSrc) { + killedSrc = true; + killAndFlowOrigin(*Call, *Args[I]); + } else + flowOrigin(*Call, *Args[I]); } - // The parameter is a pointer, reference, or gsl::Pointer. - // This is a borrow. 
We propagate the origin from the argument expression - // at the call site to the parameter declaration in the callee. - if (hasOrigin(ArgExpr)) - addAssignOriginFact(*Call, *ArgExpr); - } } /// Creates a loan for the storage path of a given declaration reference. @@ -668,11 +718,19 @@ class FactGenerator : public ConstStmtVisitor { } template - void addAssignOriginFact(const Destination &D, const Source &S) { + void flowOrigin(const Destination &D, const Source &S) { + OriginID DestOID = FactMgr.getOriginMgr().getOrCreate(D); + OriginID SrcOID = FactMgr.getOriginMgr().get(S); + CurrentBlockFacts.push_back(FactMgr.createFact( + DestOID, SrcOID, /*KillDest=*/false)); + } + + template + void killAndFlowOrigin(const Destination &D, const Source &S) { OriginID DestOID = FactMgr.getOriginMgr().getOrCreate(D); OriginID SrcOID = FactMgr.getOriginMgr().get(S); CurrentBlockFacts.push_back( - FactMgr.createFact(DestOID, SrcOID)); + FactMgr.createFact(DestOID, SrcOID, /*KillDest=*/true)); } /// Checks if the expression is a `void("__lifetime_test_point_...")` cast. @@ -703,12 +761,11 @@ class FactGenerator : public ConstStmtVisitor { if (const auto *DRE_LHS = dyn_cast(LHSExpr->IgnoreParenImpCasts())) { markUseAsWrite(DRE_LHS); - if (const auto *VD_LHS = dyn_cast(DRE_LHS->getDecl())) - // We are interested in assignments like `ptr1 = ptr2` or `ptr = &var`. - // LHS must be a pointer/reference type that can be an origin. RHS must - // also represent an origin (either another pointer/ref or an - // address-of). - addAssignOriginFact(*VD_LHS, *RHSExpr); + if (const auto *VD_LHS = dyn_cast(DRE_LHS->getDecl())) { + // Kill the old loans of the destination origin and flow the new loans + // from the source origin. + killAndFlowOrigin(*VD_LHS, *RHSExpr); + } } } @@ -882,8 +939,8 @@ class DataflowAnalysis { return D->transfer(In, *F->getAs()); case Fact::Kind::Expire: return D->transfer(In, *F->getAs()); - case Fact::Kind::AssignOrigin: - return D->transfer(In, *F->getAs()); + case Fact::Kind::OriginFlow: + return D->transfer(In, *F->getAs()); case Fact::Kind::ReturnOfOrigin: return D->transfer(In, *F->getAs()); case Fact::Kind::Use: @@ -897,7 +954,7 @@ class DataflowAnalysis { public: Lattice transfer(Lattice In, const IssueFact &) { return In; } Lattice transfer(Lattice In, const ExpireFact &) { return In; } - Lattice transfer(Lattice In, const AssignOriginFact &) { return In; } + Lattice transfer(Lattice In, const OriginFlowFact &) { return In; } Lattice transfer(Lattice In, const ReturnOfOriginFact &) { return In; } Lattice transfer(Lattice In, const UseFact &) { return In; } Lattice transfer(Lattice In, const TestPointFact &) { return In; } @@ -910,13 +967,10 @@ template static llvm::ImmutableSet join(llvm::ImmutableSet A, llvm::ImmutableSet B, typename llvm::ImmutableSet::Factory &F) { - if (A == B) - return A; if (A.getHeight() < B.getHeight()) std::swap(A, B); for (const T &E : B) - if (!A.contains(E)) - A = F.add(A, E); + A = F.add(A, E); return A; } @@ -950,11 +1004,10 @@ join(llvm::ImmutableMap A, llvm::ImmutableMap B, for (const auto &Entry : B) { const K &Key = Entry.first; const V &ValB = Entry.second; - const V *ValA = A.lookup(Key); - if (!ValA) - A = F.add(A, Key, ValB); - else if (*ValA != ValB) + if (const V *ValA = A.lookup(Key)) A = F.add(A, Key, JoinValues(*ValA, ValB)); + else + A = F.add(A, Key, ValB); } return A; } @@ -970,9 +1023,11 @@ using ExpiredLoanMap = llvm::ImmutableMap; /// An object to hold the factories for immutable collections, ensuring /// that all created states share 
the same underlying memory management. struct LifetimeFactory { - OriginLoanMap::Factory OriginMapFactory; - LoanSet::Factory LoanSetFactory; - ExpiredLoanMap::Factory ExpiredLoanMapFactory; + llvm::BumpPtrAllocator Allocator; + OriginLoanMap::Factory OriginMapFactory{Allocator, /*canonicalize=*/false}; + LoanSet::Factory LoanSetFactory{Allocator, /*canonicalize=*/false}; + ExpiredLoanMap::Factory ExpiredLoanMapFactory{Allocator, + /*canonicalize=*/false}; }; /// Represents the dataflow lattice for loan propagation. @@ -1049,14 +1104,20 @@ class LoanPropagationAnalysis LoanSetFactory.add(LoanSetFactory.getEmptySet(), LID))); } - /// The destination origin's loan set is replaced by the source's. - /// This implicitly "resets" the old loans of the destination. - Lattice transfer(Lattice In, const AssignOriginFact &F) { + /// A flow from source to destination. If `KillDest` is true, this replaces + /// the destination's loans with the source's. Otherwise, the source's loans + /// are merged into the destination's. + Lattice transfer(Lattice In, const OriginFlowFact &F) { OriginID DestOID = F.getDestOriginID(); OriginID SrcOID = F.getSrcOriginID(); + + LoanSet DestLoans = + F.getKillDest() ? LoanSetFactory.getEmptySet() : getLoans(In, DestOID); LoanSet SrcLoans = getLoans(In, SrcOID); + LoanSet MergedLoans = utils::join(DestLoans, SrcLoans, LoanSetFactory); + return LoanPropagationLattice( - OriginLoanMapFactory.add(In.Origins, DestOID, SrcLoans)); + OriginLoanMapFactory.add(In.Origins, DestOID, MergedLoans)); } LoanSet getLoans(OriginID OID, ProgramPoint P) { diff --git a/clang/lib/Analysis/PathDiagnostic.cpp b/clang/lib/Analysis/PathDiagnostic.cpp index ef24efd3c4bd0..e42731b93bfb2 100644 --- a/clang/lib/Analysis/PathDiagnostic.cpp +++ b/clang/lib/Analysis/PathDiagnostic.cpp @@ -24,6 +24,7 @@ #include "clang/AST/Type.h" #include "clang/Analysis/AnalysisDeclContext.h" #include "clang/Analysis/CFG.h" +#include "clang/Analysis/IssueHash.h" #include "clang/Analysis/ProgramPoint.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" @@ -1075,6 +1076,19 @@ unsigned PathDiagnostic::full_size() { return size; } +SmallString<32> +PathDiagnostic::getIssueHash(const SourceManager &SrcMgr, + const LangOptions &LangOpts) const { + PathDiagnosticLocation UPDLoc = getUniqueingLoc(); + FullSourceLoc FullLoc( + SrcMgr.getExpansionLoc(UPDLoc.isValid() ? UPDLoc.asLocation() + : getLocation().asLocation()), + SrcMgr); + + return clang::getIssueHash(FullLoc, getCheckerName(), getBugType(), + getDeclWithIssue(), LangOpts); +} + //===----------------------------------------------------------------------===// // FoldingSet profiling methods. //===----------------------------------------------------------------------===// diff --git a/clang/lib/Analysis/ThreadSafety.cpp b/clang/lib/Analysis/ThreadSafety.cpp index cee98d58a6112..d19f86a2223d8 100644 --- a/clang/lib/Analysis/ThreadSafety.cpp +++ b/clang/lib/Analysis/ThreadSafety.cpp @@ -1668,13 +1668,13 @@ void ThreadSafetyAnalyzer::getEdgeLockset(FactSet& Result, const CFGBlockInfo *PredBlockInfo = &BlockInfo[PredBlock->getBlockID()]; const LocalVarContext &LVarCtx = PredBlockInfo->ExitContext; - // Temporarily set the lookup context for SExprBuilder. - SxBuilder.setLookupLocalVarExpr([&](const NamedDecl *D) -> const Expr * { - if (!Handler.issueBetaWarnings()) - return nullptr; - auto Ctx = LVarCtx; - return LocalVarMap.lookupExpr(D, Ctx); - }); + if (Handler.issueBetaWarnings()) { + // Temporarily set the lookup context for SExprBuilder. 
+ SxBuilder.setLookupLocalVarExpr( + [this, Ctx = LVarCtx](const NamedDecl *D) mutable -> const Expr * { + return LocalVarMap.lookupExpr(D, Ctx); + }); + } auto Cleanup = llvm::make_scope_exit( [this] { SxBuilder.setLookupLocalVarExpr(nullptr); }); @@ -1722,6 +1722,19 @@ class BuildLockset : public ConstStmtVisitor { LocalVariableMap::Context LVarCtx; unsigned CtxIndex; + // To update and adjust the context. + void updateLocalVarMapCtx(const Stmt *S) { + if (S) + LVarCtx = Analyzer->LocalVarMap.getNextContext(CtxIndex, S, LVarCtx); + if (!Analyzer->Handler.issueBetaWarnings()) + return; + // The lookup closure needs to be reconstructed with the refreshed LVarCtx. + Analyzer->SxBuilder.setLookupLocalVarExpr( + [this, Ctx = LVarCtx](const NamedDecl *D) mutable -> const Expr * { + return Analyzer->LocalVarMap.lookupExpr(D, Ctx); + }); + } + // helper functions void checkAccess(const Expr *Exp, AccessKind AK, @@ -1747,13 +1760,7 @@ class BuildLockset : public ConstStmtVisitor { : ConstStmtVisitor(), Analyzer(Anlzr), FSet(Info.EntrySet), FunctionExitFSet(FunctionExitFSet), LVarCtx(Info.EntryContext), CtxIndex(Info.EntryIndex) { - Analyzer->SxBuilder.setLookupLocalVarExpr( - [this](const NamedDecl *D) -> const Expr * { - if (!Analyzer->Handler.issueBetaWarnings()) - return nullptr; - auto Ctx = LVarCtx; - return Analyzer->LocalVarMap.lookupExpr(D, Ctx); - }); + updateLocalVarMapCtx(nullptr); } ~BuildLockset() { Analyzer->SxBuilder.setLookupLocalVarExpr(nullptr); } @@ -2259,9 +2266,7 @@ void BuildLockset::VisitBinaryOperator(const BinaryOperator *BO) { if (!BO->isAssignmentOp()) return; - // adjust the context - LVarCtx = Analyzer->LocalVarMap.getNextContext(CtxIndex, BO, LVarCtx); - + updateLocalVarMapCtx(BO); checkAccess(BO->getLHS(), AK_Written); } @@ -2307,8 +2312,7 @@ void BuildLockset::examineArguments(const FunctionDecl *FD, } void BuildLockset::VisitCallExpr(const CallExpr *Exp) { - // adjust the context - LVarCtx = Analyzer->LocalVarMap.getNextContext(CtxIndex, Exp, LVarCtx); + updateLocalVarMapCtx(Exp); if (const auto *CE = dyn_cast(Exp)) { const auto *ME = dyn_cast(CE->getCallee()); @@ -2404,8 +2408,7 @@ static const Expr *UnpackConstruction(const Expr *E) { } void BuildLockset::VisitDeclStmt(const DeclStmt *S) { - // adjust the context - LVarCtx = Analyzer->LocalVarMap.getNextContext(CtxIndex, S, LVarCtx); + updateLocalVarMapCtx(S); for (auto *D : S->getDeclGroup()) { if (auto *VD = dyn_cast_or_null(D)) { diff --git a/clang/lib/Analysis/ThreadSafetyCommon.cpp b/clang/lib/Analysis/ThreadSafetyCommon.cpp index 25ad673b58db6..ef48ae439c5f3 100644 --- a/clang/lib/Analysis/ThreadSafetyCommon.cpp +++ b/clang/lib/Analysis/ThreadSafetyCommon.cpp @@ -248,9 +248,17 @@ til::SExpr *SExprBuilder::translateVariable(const VarDecl *VD, // defining VD, use its pre-assignment value to break the cycle. if (VarsBeingTranslated.contains(VD->getCanonicalDecl())) return new (Arena) til::LiteralPtr(VD); - VarsBeingTranslated.insert(VD->getCanonicalDecl()); + + // The closure captures state that is updated to correctly translate chains of + // aliases. Restore it when we are done with recursive translation. auto Cleanup = llvm::make_scope_exit( - [&] { VarsBeingTranslated.erase(VD->getCanonicalDecl()); }); + [&, RestoreClosure = + VarsBeingTranslated.empty() ? 
LookupLocalVarExpr : nullptr] { + VarsBeingTranslated.erase(VD->getCanonicalDecl()); + if (VarsBeingTranslated.empty()) + LookupLocalVarExpr = RestoreClosure; + }); + VarsBeingTranslated.insert(VD->getCanonicalDecl()); QualType Ty = VD->getType(); if (!VD->isStaticLocal() && Ty->isPointerType()) { diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index 508685883364c..64b2bff063340 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -282,6 +282,7 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, StringRef Str, case OMPC_affinity: case OMPC_when: case OMPC_append_args: + case OMPC_looprange: break; default: break; @@ -627,6 +628,7 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind, case OMPC_affinity: case OMPC_when: case OMPC_append_args: + case OMPC_looprange: break; default: break; @@ -677,6 +679,11 @@ bool clang::isOpenMPTargetDataManagementDirective(OpenMPDirectiveKind DKind) { DKind == OMPD_target_exit_data || DKind == OMPD_target_update; } +bool clang::isOpenMPTargetMapEnteringDirective(OpenMPDirectiveKind DKind) { + return DKind == OMPD_target_data || DKind == OMPD_target_enter_data || + isOpenMPTargetExecutionDirective(DKind); +} + bool clang::isOpenMPNestingTeamsDirective(OpenMPDirectiveKind DKind) { if (DKind == OMPD_teams) return true; @@ -750,9 +757,14 @@ bool clang::isOpenMPCanonicalLoopNestTransformationDirective( DKind == OMPD_interchange || DKind == OMPD_stripe; } +bool clang::isOpenMPCanonicalLoopSequenceTransformationDirective( + OpenMPDirectiveKind DKind) { + return DKind == OMPD_fuse; +} + bool clang::isOpenMPLoopTransformationDirective(OpenMPDirectiveKind DKind) { - // FIXME: There will be more cases when we implement 'fuse'. - return isOpenMPCanonicalLoopNestTransformationDirective(DKind); + return isOpenMPCanonicalLoopNestTransformationDirective(DKind) || + isOpenMPCanonicalLoopSequenceTransformationDirective(DKind); } bool clang::isOpenMPCombinedParallelADirective(OpenMPDirectiveKind DKind) { diff --git a/clang/lib/Basic/Sarif.cpp b/clang/lib/Basic/Sarif.cpp index 69862b73febd7..b3fb9a21249e9 100644 --- a/clang/lib/Basic/Sarif.cpp +++ b/clang/lib/Basic/Sarif.cpp @@ -67,7 +67,7 @@ static std::string percentEncodeURICharacter(char C) { /// \param Filename The filename to be represented as URI. /// /// \return RFC3986 URI representing the input file name. -static std::string fileNameToURI(StringRef Filename) { +std::string SarifDocumentWriter::fileNameToURI(StringRef Filename) { SmallString<32> Ret = StringRef("file://"); // Get the root name to see if it has a URI authority. 
@@ -391,6 +391,11 @@ void SarifDocumentWriter::appendResult(const SarifResult &Result) { json::Object Ret{{"message", createMessage(Result.DiagnosticMessage)}, {"ruleIndex", static_cast(RuleIdx)}, {"ruleId", Rule.Id}}; + + if (!Result.HostedViewerURI.empty()) { + Ret["hostedViewerUri"] = Result.HostedViewerURI; + } + if (!Result.Locations.empty()) { json::Array Locs; for (auto &Range : Result.Locations) { @@ -398,6 +403,15 @@ void SarifDocumentWriter::appendResult(const SarifResult &Result) { } Ret["locations"] = std::move(Locs); } + + if (!Result.PartialFingerprints.empty()) { + json::Object fingerprints = {}; + for (auto &pair : Result.PartialFingerprints) { + fingerprints[pair.first] = pair.second; + } + Ret["partialFingerprints"] = std::move(fingerprints); + } + if (!Result.ThreadFlows.empty()) Ret["codeFlows"] = json::Array{createCodeFlow(Result.ThreadFlows)}; diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index 72ee09d209e02..f4d7c1288cc04 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -18,6 +18,7 @@ #include "clang/Basic/LangOptions.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/TargetParser.h" #include @@ -1042,3 +1043,51 @@ void TargetInfo::copyAuxTarget(const TargetInfo *Aux) { auto *Src = static_cast(Aux); *Target = *Src; } + +std::string +TargetInfo::simplifyConstraint(StringRef Constraint, + SmallVectorImpl *OutCons) const { + std::string Result; + + for (const char *I = Constraint.begin(), *E = Constraint.end(); I < E; I++) { + switch (*I) { + default: + Result += convertConstraint(I); + break; + // Ignore these + case '*': + case '?': + case '!': + case '=': // Will see this and the following in mult-alt constraints. + case '+': + break; + case '#': // Ignore the rest of the constraint alternative. + while (I + 1 != E && I[1] != ',') + I++; + break; + case '&': + case '%': + Result += *I; + while (I + 1 != E && I[1] == *I) + I++; + break; + case ',': + Result += "|"; + break; + case 'g': + Result += "imr"; + break; + case '[': { + assert(OutCons && + "Must pass output names to constraints with a symbolic name"); + unsigned Index; + bool ResolveResult = resolveSymbolicName(I, *OutCons, Index); + assert(ResolveResult && "Could not resolve symbolic name"); + (void)ResolveResult; + Result += llvm::utostr(Index); + break; + } + } + } + return Result; +} diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.h b/clang/lib/CIR/CodeGen/CIRGenCall.h index 81cbb854f3b7d..52d541f2b09b5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCall.h +++ b/clang/lib/CIR/CodeGen/CIRGenCall.h @@ -256,6 +256,7 @@ class ReturnValueSlot { ReturnValueSlot() = default; ReturnValueSlot(Address addr) : addr(addr) {} + bool isNull() const { return !addr.isValid(); } Address getValue() const { return addr; } }; diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp index 1a557beb610ea..cb8fe6c8862dc 100644 --- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp @@ -778,6 +778,86 @@ void CIRGenFunction::emitImplicitAssignmentOperatorBody(FunctionArgList &args) { s->getStmtClassName()); } +void CIRGenFunction::emitForwardingCallToLambda( + const CXXMethodDecl *callOperator, CallArgList &callArgs) { + // Get the address of the call operator. 
+ const CIRGenFunctionInfo &calleeFnInfo = + cgm.getTypes().arrangeCXXMethodDeclaration(callOperator); + cir::FuncOp calleePtr = cgm.getAddrOfFunction( + GlobalDecl(callOperator), cgm.getTypes().getFunctionType(calleeFnInfo)); + + // Prepare the return slot. + const FunctionProtoType *fpt = + callOperator->getType()->castAs(); + QualType resultType = fpt->getReturnType(); + ReturnValueSlot returnSlot; + + // We don't need to separately arrange the call arguments because + // the call can't be variadic anyway --- it's impossible to forward + // variadic arguments. + + // Now emit our call. + CIRGenCallee callee = + CIRGenCallee::forDirect(calleePtr, GlobalDecl(callOperator)); + RValue rv = emitCall(calleeFnInfo, callee, returnSlot, callArgs); + + // If necessary, copy the returned value into the slot. + if (!resultType->isVoidType() && returnSlot.isNull()) { + if (getLangOpts().ObjCAutoRefCount && resultType->isObjCRetainableType()) + cgm.errorNYI(callOperator->getSourceRange(), + "emitForwardingCallToLambda: ObjCAutoRefCount"); + emitReturnOfRValue(*currSrcLoc, rv, resultType); + } else { + cgm.errorNYI(callOperator->getSourceRange(), + "emitForwardingCallToLambda: return slot is not null"); + } +} + +void CIRGenFunction::emitLambdaDelegatingInvokeBody(const CXXMethodDecl *md) { + const CXXRecordDecl *lambda = md->getParent(); + + // Start building arguments for forwarding call + CallArgList callArgs; + + QualType lambdaType = getContext().getCanonicalTagType(lambda); + QualType thisType = getContext().getPointerType(lambdaType); + Address thisPtr = + createMemTemp(lambdaType, getLoc(md->getSourceRange()), "unused.capture"); + callArgs.add(RValue::get(thisPtr.getPointer()), thisType); + + // Add the rest of the parameters. + for (auto *param : md->parameters()) + emitDelegateCallArg(callArgs, param, param->getBeginLoc()); + + const CXXMethodDecl *callOp = lambda->getLambdaCallOperator(); + // For a generic lambda, find the corresponding call operator specialization + // to which the call to the static-invoker shall be forwarded. + if (lambda->isGenericLambda()) { + assert(md->isFunctionTemplateSpecialization()); + const TemplateArgumentList *tal = md->getTemplateSpecializationArgs(); + FunctionTemplateDecl *callOpTemplate = + callOp->getDescribedFunctionTemplate(); + void *InsertPos = nullptr; + FunctionDecl *correspondingCallOpSpecialization = + callOpTemplate->findSpecialization(tal->asArray(), InsertPos); + assert(correspondingCallOpSpecialization); + callOp = cast(correspondingCallOpSpecialization); + } + emitForwardingCallToLambda(callOp, callArgs); +} + +void CIRGenFunction::emitLambdaStaticInvokeBody(const CXXMethodDecl *md) { + if (md->isVariadic()) { + // Codegen for LLVM doesn't emit code for this either; it says: + // FIXME: Making this work correctly is nasty because it requires either + // cloning the body of the call operator or making the call operator + // forward.
+ cgm.errorNYI(md->getSourceRange(), "emitLambdaStaticInvokeBody: variadic"); + } + + emitLambdaDelegatingInvokeBody(md); +} + void CIRGenFunction::destroyCXXObject(CIRGenFunction &cgf, Address addr, QualType type) { const auto *record = type->castAsCXXRecordDecl(); @@ -826,7 +906,7 @@ mlir::Value CIRGenFunction::getVTTParameter(GlobalDecl gd, bool forVirtualBase, if (!cgm.getCXXABI().needsVTTParameter(gd)) return nullptr; - const CXXRecordDecl *rd = cast(curFuncDecl)->getParent(); + const CXXRecordDecl *rd = cast(curCodeDecl)->getParent(); const CXXRecordDecl *base = cast(gd.getDecl())->getParent(); uint64_t subVTTIndex; diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index e9f5752e4b696..fa68ad931ba74 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -461,7 +461,8 @@ LValue CIRGenFunction::emitLValueForField(LValue base, const FieldDecl *field) { llvm::StringRef fieldName = field->getName(); unsigned fieldIndex; - assert(!cir::MissingFeatures::lambdaFieldToName()); + if (cgm.lambdaFieldToName.count(field)) + fieldName = cgm.lambdaFieldToName[field]; if (rec->isUnion()) fieldIndex = field->getFieldIndex(); @@ -476,8 +477,16 @@ LValue CIRGenFunction::emitLValueForField(LValue base, const FieldDecl *field) { // If this is a reference field, load the reference right now. if (fieldType->isReferenceType()) { - cgm.errorNYI(field->getSourceRange(), "emitLValueForField: reference type"); - return LValue(); + assert(!cir::MissingFeatures::opTBAA()); + LValue refLVal = makeAddrLValue(addr, fieldType, fieldBaseInfo); + if (recordCVR & Qualifiers::Volatile) + refLVal.getQuals().addVolatile(); + addr = emitLoadOfReference(refLVal, getLoc(field->getSourceRange()), + &fieldBaseInfo); + + // Qualifiers on the struct don't apply to the referencee. + recordCVR = 0; + fieldType = fieldType->getPointeeType(); } if (field->hasAttr()) { @@ -619,6 +628,38 @@ static cir::FuncOp emitFunctionDeclPointer(CIRGenModule &cgm, GlobalDecl gd) { return cgm.getAddrOfFunction(gd); } +static LValue emitCapturedFieldLValue(CIRGenFunction &cgf, const FieldDecl *fd, + mlir::Value thisValue) { + return cgf.emitLValueForLambdaField(fd, thisValue); +} + +/// Given that we are currently emitting a lambda, emit an l-value for +/// one of its members. 
+/// +LValue CIRGenFunction::emitLValueForLambdaField(const FieldDecl *field, + mlir::Value thisValue) { + bool hasExplicitObjectParameter = false; + const auto *methD = dyn_cast_if_present(curCodeDecl); + LValue lambdaLV; + if (methD) { + hasExplicitObjectParameter = methD->isExplicitObjectMemberFunction(); + assert(methD->getParent()->isLambda()); + assert(methD->getParent() == field->getParent()); + } + if (hasExplicitObjectParameter) { + cgm.errorNYI(field->getSourceRange(), "ExplicitObjectMemberFunction"); + } else { + QualType lambdaTagType = + getContext().getCanonicalTagType(field->getParent()); + lambdaLV = makeNaturalAlignAddrLValue(thisValue, lambdaTagType); + } + return emitLValueForField(lambdaLV, field); +} + +LValue CIRGenFunction::emitLValueForLambdaField(const FieldDecl *field) { + return emitLValueForLambdaField(field, cxxabiThisValue); +} + static LValue emitFunctionDeclLValue(CIRGenFunction &cgf, const Expr *e, GlobalDecl gd) { const FunctionDecl *fd = cast(gd.getDecl()); @@ -645,6 +686,57 @@ static LValue emitFunctionDeclLValue(CIRGenFunction &cgf, const Expr *e, AlignmentSource::Decl); } +/// Determine whether we can emit a reference to \p vd from the current +/// context, despite not necessarily having seen an odr-use of the variable in +/// this context. +/// TODO(cir): This could be shared with classic codegen. +static bool canEmitSpuriousReferenceToVariable(CIRGenFunction &cgf, + const DeclRefExpr *e, + const VarDecl *vd) { + // For a variable declared in an enclosing scope, do not emit a spurious + // reference even if we have a capture, as that will emit an unwarranted + // reference to our capture state, and will likely generate worse code than + // emitting a local copy. + if (e->refersToEnclosingVariableOrCapture()) + return false; + + // For a local declaration declared in this function, we can always reference + // it even if we don't have an odr-use. + if (vd->hasLocalStorage()) { + return vd->getDeclContext() == + dyn_cast_or_null(cgf.curCodeDecl); + } + + // For a global declaration, we can emit a reference to it if we know + // for sure that we are able to emit a definition of it. + vd = vd->getDefinition(cgf.getContext()); + if (!vd) + return false; + + // Don't emit a spurious reference if it might be to a variable that only + // exists on a different device / target. + // FIXME: This is unnecessarily broad. Check whether this would actually be a + // cross-target reference. + if (cgf.getLangOpts().OpenMP || cgf.getLangOpts().CUDA || + cgf.getLangOpts().OpenCL) { + return false; + } + + // We can emit a spurious reference only if the linkage implies that we'll + // be emitting a non-interposable symbol that will be retained until link + // time. 
+ switch (cgf.cgm.getCIRLinkageVarDefinition(vd, /*IsConstant=*/false)) { + case cir::GlobalLinkageKind::ExternalLinkage: + case cir::GlobalLinkageKind::LinkOnceODRLinkage: + case cir::GlobalLinkageKind::WeakODRLinkage: + case cir::GlobalLinkageKind::InternalLinkage: + case cir::GlobalLinkageKind::PrivateLinkage: + return true; + default: + return false; + } +} + LValue CIRGenFunction::emitDeclRefLValue(const DeclRefExpr *e) { const NamedDecl *nd = e->getDecl(); QualType ty = e->getType(); @@ -652,6 +744,32 @@ LValue CIRGenFunction::emitDeclRefLValue(const DeclRefExpr *e) { assert(e->isNonOdrUse() != NOUR_Unevaluated && "should not emit an unevaluated operand"); + if (const auto *vd = dyn_cast(nd)) { + // Global Named registers access via intrinsics only + if (vd->getStorageClass() == SC_Register && vd->hasAttr() && + !vd->isLocalVarDecl()) { + cgm.errorNYI(e->getSourceRange(), + "emitDeclRefLValue: Global Named registers access"); + return LValue(); + } + + if (e->isNonOdrUse() == NOUR_Constant && + (vd->getType()->isReferenceType() || + !canEmitSpuriousReferenceToVariable(*this, e, vd))) { + cgm.errorNYI(e->getSourceRange(), "emitDeclRefLValue: NonOdrUse"); + return LValue(); + } + + // Check for captured variables. + if (e->refersToEnclosingVariableOrCapture()) { + vd = vd->getCanonicalDecl(); + if (FieldDecl *fd = lambdaCaptureFields.lookup(vd)) + return emitCapturedFieldLValue(*this, fd, cxxabiThisValue); + assert(!cir::MissingFeatures::cgCapturedStmtInfo()); + assert(!cir::MissingFeatures::openMP()); + } + } + if (const auto *vd = dyn_cast(nd)) { // Checks for omitted feature handling assert(!cir::MissingFeatures::opAllocaStaticLocal()); @@ -795,8 +913,7 @@ LValue CIRGenFunction::emitUnaryOpLValue(const UnaryOperator *e) { assert(e->isPrefix() && "Prefix operator in unexpected state!"); if (e->getType()->isAnyComplexType()) { - cgm.errorNYI(e->getSourceRange(), "UnaryOp complex inc/dec"); - lv = LValue(); + emitComplexPrePostIncDec(e, lv, kind, /*isPre=*/true); } else { emitScalarPrePostIncDec(e, lv, kind, /*isPre=*/true); } @@ -1798,8 +1915,7 @@ RValue CIRGenFunction::convertTempToRValue(Address addr, clang::QualType type, LValue lvalue = makeAddrLValue(addr, type, AlignmentSource::Decl); switch (getEvaluationKind(type)) { case cir::TEK_Complex: - cgm.errorNYI(loc, "convertTempToRValue: complex type"); - return RValue::get(nullptr); + return RValue::getComplex(emitLoadOfComplex(lvalue, loc)); case cir::TEK_Aggregate: cgm.errorNYI(loc, "convertTempToRValue: aggregate type"); return RValue::get(nullptr); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index dc34d2b3baa8d..4a8aac900ee07 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -99,6 +99,7 @@ class AggExprEmitter : public StmtVisitor { assert(!cir::MissingFeatures::aggValueSlotDestructedFlag()); Visit(e->getSubExpr()); } + void VisitLambdaExpr(LambdaExpr *e); // Stubs -- These should be moved up when they are implemented. 
void VisitCastExpr(CastExpr *e) { @@ -239,9 +240,6 @@ class AggExprEmitter : public StmtVisitor { cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitCXXInheritedCtorInitExpr"); } - void VisitLambdaExpr(LambdaExpr *e) { - cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitLambdaExpr"); - } void VisitCXXStdInitializerListExpr(CXXStdInitializerListExpr *e) { cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitCXXStdInitializerListExpr"); @@ -495,12 +493,14 @@ void AggExprEmitter::emitInitializationToLValue(Expr *e, LValue lv) { if (isa(e)) return; - if (type->isReferenceType()) - cgf.cgm.errorNYI("emitInitializationToLValue ReferenceType"); + if (type->isReferenceType()) { + RValue rv = cgf.emitReferenceBindingToExpr(e); + return cgf.emitStoreThroughLValue(rv, lv); + } switch (cgf.getEvaluationKind(type)) { case cir::TEK_Complex: - cgf.cgm.errorNYI("emitInitializationToLValue TEK_Complex"); + cgf.emitComplexExprIntoLValue(e, lv, /*isInit*/ true); break; case cir::TEK_Aggregate: cgf.emitAggExpr(e, AggValueSlot::forLValue(lv, AggValueSlot::IsDestructed, @@ -550,6 +550,47 @@ void AggExprEmitter::emitNullInitializationToLValue(mlir::Location loc, cgf.emitNullInitialization(loc, lv.getAddress(), lv.getType()); } +void AggExprEmitter::VisitLambdaExpr(LambdaExpr *e) { + CIRGenFunction::SourceLocRAIIObject loc{cgf, cgf.getLoc(e->getSourceRange())}; + AggValueSlot slot = ensureSlot(cgf.getLoc(e->getSourceRange()), e->getType()); + [[maybe_unused]] LValue slotLV = + cgf.makeAddrLValue(slot.getAddress(), e->getType()); + + // We'll need to enter cleanup scopes in case any of the element + // initializers throws an exception or contains a branch out of the expression. + assert(!cir::MissingFeatures::opScopeCleanupRegion()); + + for (auto [curField, capture, captureInit] : llvm::zip( + e->getLambdaClass()->fields(), e->captures(), e->capture_inits())) { + // Pick a name for the field. + llvm::StringRef fieldName = curField->getName(); + if (capture.capturesVariable()) { + assert(!curField->isBitField() && "lambdas don't have bitfield members!"); + ValueDecl *v = capture.getCapturedVar(); + fieldName = v->getName(); + cgf.cgm.lambdaFieldToName[curField] = fieldName; + } else if (capture.capturesThis()) { + cgf.cgm.lambdaFieldToName[curField] = "this"; + } else { + cgf.cgm.errorNYI(e->getSourceRange(), "Unhandled capture kind"); + cgf.cgm.lambdaFieldToName[curField] = "unhandled-capture-kind"; + } + + // Emit the initialization. + LValue lv = + cgf.emitLValueForFieldInitialization(slotLV, curField, fieldName); + if (curField->hasCapturedVLAType()) + cgf.cgm.errorNYI(e->getSourceRange(), "lambda captured VLA type"); + + emitInitializationToLValue(captureInit, lv); + + // Push a destructor if necessary.
+ if ([[maybe_unused]] QualType::DestructionKind DtorKind = + curField->getType().isDestructedType()) + cgf.cgm.errorNYI(e->getSourceRange(), "lambda with destructed field"); + } +} + void AggExprEmitter::VisitCallExpr(const CallExpr *e) { if (e->getCallReturnType(cgf.getContext())->isReferenceType()) { cgf.cgm.errorNYI(e->getSourceRange(), "reference return type"); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp index 3db34ccb1748d..1f7e3dd1fa7d2 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp @@ -238,8 +238,8 @@ static void storeAnyExprIntoOneUnit(CIRGenFunction &cgf, const Expr *init, cgf.makeAddrLValue(newPtr, allocType), false); return; case cir::TEK_Complex: - cgf.cgm.errorNYI(init->getSourceRange(), - "storeAnyExprIntoOneUnit: complex"); + cgf.emitComplexExprIntoLValue(init, cgf.makeAddrLValue(newPtr, allocType), + /*isInit*/ true); return; case cir::TEK_Aggregate: { assert(!cir::MissingFeatures::aggValueSlotGC()); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index b7ae55e72bdfc..fcde4875393cd 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -193,8 +193,7 @@ class ComplexExprEmitter : public StmtVisitor { mlir::Value VisitUnaryNot(const UnaryOperator *e); // LNot,Real,Imag never return complex. mlir::Value VisitUnaryExtension(const UnaryOperator *e) { - cgf.cgm.errorNYI(e->getExprLoc(), "ComplexExprEmitter VisitUnaryExtension"); - return {}; + return Visit(e->getSubExpr()); } mlir::Value VisitCXXDefaultArgExpr(CXXDefaultArgExpr *dae) { cgf.cgm.errorNYI(dae->getExprLoc(), @@ -317,8 +316,7 @@ class ComplexExprEmitter : public StmtVisitor { mlir::Value VisitVAArgExpr(VAArgExpr *e); mlir::Value VisitAtomicExpr(AtomicExpr *e) { - cgf.cgm.errorNYI(e->getExprLoc(), "ComplexExprEmitter VisitAtomicExpr"); - return {}; + return cgf.emitAtomicExpr(e).getComplexValue(); } mlir::Value VisitPackIndexingExpr(PackIndexingExpr *e) { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp index f660544d13cfa..178b276f19d41 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp @@ -1464,25 +1464,24 @@ mlir::Attribute ConstantEmitter::tryEmitPrivate(const APValue &value, case APValue::ComplexInt: case APValue::ComplexFloat: { mlir::Type desiredType = cgm.convertType(destType); - cir::ComplexType complexType = - mlir::dyn_cast(desiredType); + auto complexType = mlir::dyn_cast(desiredType); mlir::Type complexElemTy = complexType.getElementType(); if (isa(complexElemTy)) { - llvm::APSInt real = value.getComplexIntReal(); - llvm::APSInt imag = value.getComplexIntImag(); - return builder.getAttr( - complexType, cir::IntAttr::get(complexElemTy, real), - cir::IntAttr::get(complexElemTy, imag)); + const llvm::APSInt &real = value.getComplexIntReal(); + const llvm::APSInt &imag = value.getComplexIntImag(); + return cir::ConstComplexAttr::get(builder.getContext(), complexType, + cir::IntAttr::get(complexElemTy, real), + cir::IntAttr::get(complexElemTy, imag)); } assert(isa(complexElemTy) && "expected floating-point type"); - llvm::APFloat real = value.getComplexFloatReal(); - llvm::APFloat imag = value.getComplexFloatImag(); - return builder.getAttr( - complexType, cir::FPAttr::get(complexElemTy, real), - cir::FPAttr::get(complexElemTy, imag)); + const llvm::APFloat &real = 
value.getComplexFloatReal(); + const llvm::APFloat &imag = value.getComplexFloatImag(); + return cir::ConstComplexAttr::get(builder.getContext(), complexType, + cir::FPAttr::get(complexElemTy, real), + cir::FPAttr::get(complexElemTy, imag)); } case APValue::FixedPoint: case APValue::AddrLabelDiff: diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 276adcfc5c6be..bd09d78cd0eb6 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -1278,9 +1278,7 @@ mlir::Value ScalarExprEmitter::emitPromoted(const Expr *e, "ScalarExprEmitter::emitPromoted unary imag"); return {}; case UO_Real: - cgf.cgm.errorNYI(e->getSourceRange(), - "ScalarExprEmitter::emitPromoted unary real"); - return {}; + return VisitRealImag(uo, promotionType); case UO_Minus: return emitUnaryPlusOrMinus(uo, cir::UnaryOpKind::Minus, promotionType); case UO_Plus: @@ -2087,9 +2085,13 @@ mlir::Value ScalarExprEmitter::VisitUnaryLNot(const UnaryOperator *e) { if (e->getType()->isVectorType() && e->getType()->castAs()->getVectorKind() == VectorKind::Generic) { - assert(!cir::MissingFeatures::vectorType()); - cgf.cgm.errorNYI(e->getSourceRange(), "vector logical not"); - return {}; + mlir::Value oper = Visit(e->getSubExpr()); + mlir::Location loc = cgf.getLoc(e->getExprLoc()); + auto operVecTy = mlir::cast(oper.getType()); + auto exprVecTy = mlir::cast(cgf.convertType(e->getType())); + mlir::Value zeroVec = builder.getNullValue(operVecTy, loc); + return cir::VecCmpOp::create(builder, loc, exprVecTy, cir::CmpOpKind::eq, + oper, zeroVec); } // Compare operand to zero. @@ -2125,33 +2127,41 @@ mlir::Value ScalarExprEmitter::VisitRealImag(const UnaryOperator *e, "Invalid UnaryOp kind for ComplexType Real or Imag"); Expr *op = e->getSubExpr(); + mlir::Location loc = cgf.getLoc(e->getExprLoc()); if (op->getType()->isAnyComplexType()) { // If it's an l-value, load through the appropriate subobject l-value. // Note that we have to ask `e` because `op` might be an l-value that - // this won't work for, e.g. an Obj-C property. - if (e->isGLValue()) { - mlir::Location loc = cgf.getLoc(e->getExprLoc()); - mlir::Value complex = cgf.emitComplexExpr(op); - if (!promotionTy.isNull()) { - complex = cgf.emitPromotedValue(complex, promotionTy); - } - - return e->getOpcode() == clang::UO_Real - ? builder.createComplexReal(loc, complex) - : builder.createComplexImag(loc, complex); + // this won't work for, e.g. an Obj-C property + mlir::Value complex = cgf.emitComplexExpr(op); + if (e->isGLValue() && !promotionTy.isNull()) { + promotionTy = promotionTy->isAnyComplexType() + ? promotionTy + : cgf.getContext().getComplexType(promotionTy); + complex = cgf.emitPromotedValue(complex, promotionTy); } - // Otherwise, calculate and project. - cgf.cgm.errorNYI(e->getSourceRange(), - "VisitRealImag calculate and project"); - return {}; + return e->getOpcode() == clang::UO_Real + ? builder.createComplexReal(loc, complex) + : builder.createComplexImag(loc, complex); + } + + if (e->getOpcode() == UO_Real) { + return promotionTy.isNull() ? Visit(op) + : cgf.emitPromotedScalarExpr(op, promotionTy); } - // __real or __imag on a scalar returns zero. Emit the subexpr to ensure side + // __imag on a scalar returns zero. Emit the subexpr to ensure side // effects are evaluated, but not the actual value. 
- cgf.cgm.errorNYI(e->getSourceRange(), - "VisitRealImag __real or __imag on a scalar"); - return {}; + if (op->isGLValue()) + cgf.emitLValue(op); + else if (!promotionTy.isNull()) + cgf.emitPromotedScalarExpr(op, promotionTy); + else + cgf.emitScalarExpr(op); + + mlir::Type valueTy = + cgf.convertType(promotionTy.isNull() ? e->getType() : promotionTy); + return builder.getNullValue(valueTy, loc); } /// Return the size or alignment of the type of argument of the sizeof @@ -2355,4 +2365,4 @@ mlir::Value CIRGenFunction::emitScalarPrePostIncDec(const UnaryOperator *e, bool isPre) { return ScalarExprEmitter(*this, builder) .emitScalarPrePostIncDec(e, lv, kind, isPre); -} \ No newline at end of file +} diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index e2181b8222aa2..0abb21a670719 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -405,6 +405,7 @@ void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType, curFn = fn; const Decl *d = gd.getDecl(); + curCodeDecl = d; const auto *fd = dyn_cast_or_null(d); curFuncDecl = d->getNonClosureContext(); @@ -457,7 +458,36 @@ void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType, const auto *md = cast(d); if (md->getParent()->isLambda() && md->getOverloadedOperator() == OO_Call) { - cgm.errorNYI(loc, "lambda call operator"); + // We're in a lambda. + curFn.setLambda(true); + + // Figure out the captures. + md->getParent()->getCaptureFields(lambdaCaptureFields, + lambdaThisCaptureField); + if (lambdaThisCaptureField) { + // If the lambda captures the object referred to by '*this' - either by + // value or by reference, make sure CXXThisValue points to the correct + // object. + + // Get the lvalue for the field (which is a copy of the enclosing object + // or contains the address of the enclosing object). + LValue thisFieldLValue = + emitLValueForLambdaField(lambdaThisCaptureField); + if (!lambdaThisCaptureField->getType()->isPointerType()) { + // If the enclosing object was captured by value, just use its + // address. Sign this pointer. + cxxThisValue = thisFieldLValue.getPointer(); + } else { + // Load the lvalue pointed to by the field, since '*this' was captured + // by reference. + cxxThisValue = + emitLoadOfLValue(thisFieldLValue, SourceLocation()).getValue(); + } + } + for (auto *fd : md->getParent()->fields()) { + if (fd->hasCapturedVLAType()) + cgm.errorNYI(loc, "lambda captured VLA type"); + } } else { // Not in a lambda; just use 'this' from the method. // FIXME: Should we generate a new load for each use of 'this'? The fast @@ -547,7 +577,10 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn, getCIRGenModule().errorNYI(bodyRange, "CUDA kernel"); } else if (isa(funcDecl) && cast(funcDecl)->isLambdaStaticInvoker()) { - getCIRGenModule().errorNYI(bodyRange, "Lambda static invoker"); + // The lambda static invoker function is special, because it forwards or + // clones the body of the function call operator (but is actually + // static). 
+ emitLambdaStaticInvokeBody(cast(funcDecl)); } else if (funcDecl->isDefaulted() && isa(funcDecl) && (cast(funcDecl)->isCopyAssignmentOperator() || cast(funcDecl)->isMoveAssignmentOperator())) { diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index d107d481e3ce2..166435f9e7e9e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -73,6 +73,10 @@ class CIRGenFunction : public CIRGenTypeCache { /// Tracks function scope overall cleanup handling. EHScopeStack ehStack; + llvm::DenseMap + lambdaCaptureFields; + clang::FieldDecl *lambdaThisCaptureField = nullptr; + /// CXXThisDecl - When generating code for a C++ member function, /// this will hold the implicit 'this' declaration. ImplicitParamDecl *cxxabiThisDecl = nullptr; @@ -91,6 +95,8 @@ class CIRGenFunction : public CIRGenTypeCache { // Holds the Decl for the current outermost non-closure context const clang::Decl *curFuncDecl = nullptr; + /// This is the inner-most code context, which includes blocks. + const clang::Decl *curCodeDecl = nullptr; /// The function for which code is currently being generated. cir::FuncOp curFn; @@ -1268,6 +1274,8 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::Value emitPromotedValue(mlir::Value result, QualType promotionType); + void emitReturnOfRValue(mlir::Location loc, RValue rv, QualType ty); + /// Emit the computation of the specified expression of scalar type. mlir::Value emitScalarExpr(const clang::Expr *e); @@ -1287,6 +1295,9 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::LogicalResult emitForStmt(const clang::ForStmt &s); + void emitForwardingCallToLambda(const CXXMethodDecl *lambdaCallOperator, + CallArgList &callArgs); + /// Emit the computation of the specified expression of complex type, /// returning the result. mlir::Value emitComplexExpr(const Expr *e); @@ -1349,6 +1360,9 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::LogicalResult emitLabel(const clang::LabelDecl &d); mlir::LogicalResult emitLabelStmt(const clang::LabelStmt &s); + void emitLambdaDelegatingInvokeBody(const CXXMethodDecl *md); + void emitLambdaStaticInvokeBody(const CXXMethodDecl *md); + mlir::LogicalResult emitIfStmt(const clang::IfStmt &s); /// Emit code to compute the specified expression, @@ -1385,6 +1399,10 @@ class CIRGenFunction : public CIRGenTypeCache { LValue emitLValueForBitField(LValue base, const FieldDecl *field); LValue emitLValueForField(LValue base, const clang::FieldDecl *field); + LValue emitLValueForLambdaField(const FieldDecl *field); + LValue emitLValueForLambdaField(const FieldDecl *field, + mlir::Value thisValue); + /// Like emitLValueForField, excpet that if the Field is a reference, this /// will return the address of the reference and not the address of the value /// stored in the reference. @@ -1698,6 +1716,10 @@ class CIRGenFunction : public CIRGenTypeCache { ~ActiveOpenACCLoopRAII() { cgf.activeLoopOp = oldLoopOp; } }; + // Keep track of the last place we inserted a 'recipe' so that we can insert + // the next one in lexical order. + mlir::OpBuilder::InsertPoint lastRecipeLocation; + public: // Helper type used to store the list of important information for a 'data' // clause variable, or a 'cache' variable reference. @@ -1705,9 +1727,17 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::Location beginLoc; mlir::Value varValue; std::string name; + // The type of the original variable reference: that is, after 'bounds' have + // removed pointers/array types/etc. 
So in the case of int arr[5], and a + // private(arr[1]), 'origType' is 'int', but 'baseType' is 'int[5]'. + QualType origType; QualType baseType; llvm::SmallVector bounds; + // The list of types that we found when going through the bounds, which we + // can use to properly set the alloca section. + llvm::SmallVector boundTypes; }; + // Gets the collection of info required to lower and OpenACC clause or cache // construct variable reference. OpenACCDataOperandInfo getOpenACCDataOperandInfo(const Expr *e); diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 95a7ac0648bb7..073e8d96b773b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -121,6 +121,12 @@ class CIRGenModule : public CIRGenTypeCache { mlir::Operation *lastGlobalOp = nullptr; + /// Keep a map between lambda fields and names, this needs to be per module + /// since lambdas might get generated later as part of defered work, and since + /// the pointers are supposed to be uniqued, should be fine. Revisit this if + /// it ends up taking too much memory. + llvm::DenseMap lambdaFieldToName; + /// Tell the consumer that this variable has been instantiated. void handleCXXStaticMemberVarInstantiation(VarDecl *vd); diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp index 907cb5fa11401..7f9350a9e4173 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp @@ -68,14 +68,33 @@ mlir::Value CIRGenFunction::createOpenACCConstantInt(mlir::Location loc, CIRGenFunction::OpenACCDataOperandInfo CIRGenFunction::getOpenACCDataOperandInfo(const Expr *e) { const Expr *curVarExpr = e->IgnoreParenImpCasts(); + QualType origType = + curVarExpr->getType().getNonReferenceType().getUnqualifiedType(); + // Array sections are special, and we have to treat them that way. + if (const auto *section = + dyn_cast(curVarExpr->IgnoreParenImpCasts())) + origType = ArraySectionExpr::getBaseOriginalType(section); mlir::Location exprLoc = cgm.getLoc(curVarExpr->getBeginLoc()); llvm::SmallVector bounds; + llvm::SmallVector boundTypes; std::string exprString; llvm::raw_string_ostream os(exprString); e->printPretty(os, nullptr, getContext().getPrintingPolicy()); + auto addBoundType = [&](const Expr *e) { + if (const auto *section = dyn_cast(curVarExpr)) { + QualType baseTy = ArraySectionExpr::getBaseOriginalType( + section->getBase()->IgnoreParenImpCasts()); + boundTypes.push_back(QualType(baseTy->getPointeeOrArrayElementType(), 0)); + } else { + boundTypes.push_back(curVarExpr->getType()); + } + }; + + addBoundType(curVarExpr); + while (isa(curVarExpr)) { mlir::Location boundLoc = cgm.getLoc(curVarExpr->getBeginLoc()); mlir::Value lowerBound; @@ -115,19 +134,28 @@ CIRGenFunction::getOpenACCDataOperandInfo(const Expr *e) { bounds.push_back(createBound(*this, this->builder, boundLoc, lowerBound, upperBound, extent)); + addBoundType(curVarExpr); } if (const auto *memExpr = dyn_cast(curVarExpr)) - return {exprLoc, emitMemberExpr(memExpr).getPointer(), exprString, + return {exprLoc, + emitMemberExpr(memExpr).getPointer(), + exprString, + origType, curVarExpr->getType().getNonReferenceType().getUnqualifiedType(), - std::move(bounds)}; + std::move(bounds), + std::move(boundTypes)}; // Sema has made sure that only 4 types of things can get here, array // subscript, array section, member expr, or DRE to a var decl (or the // former 3 wrapping a var-decl), so we should be able to assume this is // right. 
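// To ground the types collected above, the comment's own example written out as
// source (hypothetical snippet, not taken from a test): for this operand 'baseType'
// is int[5] (the declared array), 'origType' is int (what remains once the bounds
// strip the array level), and a single acc.bounds value carries the [1] subscript.
void exampleOperand() {
  int arr[5] = {};
#pragma acc parallel private(arr[1])
  { arr[1] = 0; }
}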
const auto *dre = cast(curVarExpr); - return {exprLoc, emitDeclRefLValue(dre).getPointer(), exprString, + return {exprLoc, + emitDeclRefLValue(dre).getPointer(), + exprString, + origType, curVarExpr->getType().getNonReferenceType().getUnqualifiedType(), - std::move(bounds)}; + std::move(bounds), + std::move(boundTypes)}; } diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp index 9959cf6c15792..3cf053449458f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp @@ -53,6 +53,7 @@ class OpenACCClauseCIREmitter final template friend class OpenACCClauseCIREmitter; OpTy &operation; + mlir::OpBuilder::InsertPoint &recipeInsertLocation; CIRGen::CIRGenFunction &cgf; CIRGen::CIRGenBuilderTy &builder; @@ -148,7 +149,7 @@ class OpenACCClauseCIREmitter final mlir::OpBuilder::InsertionGuard guardCase(builder); builder.setInsertionPoint(operation.loopOp); OpenACCClauseCIREmitter loopEmitter{ - operation.loopOp, cgf, builder, dirKind, dirLoc}; + operation.loopOp, recipeInsertLocation, cgf, builder, dirKind, dirLoc}; loopEmitter.lastDeviceTypeValues = lastDeviceTypeValues; loopEmitter.Visit(&c); } @@ -159,7 +160,12 @@ class OpenACCClauseCIREmitter final mlir::OpBuilder::InsertionGuard guardCase(builder); builder.setInsertionPoint(operation.computeOp); OpenACCClauseCIREmitter computeEmitter{ - operation.computeOp, cgf, builder, dirKind, dirLoc}; + operation.computeOp, + recipeInsertLocation, + cgf, + builder, + dirKind, + dirLoc}; computeEmitter.lastDeviceTypeValues = lastDeviceTypeValues; @@ -358,11 +364,13 @@ class OpenACCClauseCIREmitter final } public: - OpenACCClauseCIREmitter(OpTy &operation, CIRGen::CIRGenFunction &cgf, + OpenACCClauseCIREmitter(OpTy &operation, + mlir::OpBuilder::InsertPoint &recipeInsertLocation, + CIRGen::CIRGenFunction &cgf, CIRGen::CIRGenBuilderTy &builder, OpenACCDirectiveKind dirKind, SourceLocation dirLoc) - : operation(operation), cgf(cgf), builder(builder), dirKind(dirKind), - dirLoc(dirLoc) {} + : operation(operation), recipeInsertLocation(recipeInsertLocation), + cgf(cgf), builder(builder), dirKind(dirKind), dirLoc(dirLoc) {} void VisitClause(const OpenACCClause &clause) { clauseNotImplemented(clause); @@ -988,20 +996,16 @@ class OpenACCClauseCIREmitter final { mlir::OpBuilder::InsertionGuard guardCase(builder); - // TODO: OpenACC: At the moment this is a bit of a hacky way of doing - // this, and won't work when we get to bounds/etc. Do this for now to - // limit the scope of this refactor. - VarDecl *allocaDecl = varRecipe.AllocaDecl; - allocaDecl->setInit(varRecipe.InitExpr); - allocaDecl->setInitStyle(VarDecl::CallInit); auto recipe = OpenACCRecipeBuilder(cgf, builder) - .getOrCreateRecipe(cgf.getContext(), varExpr, allocaDecl, - /*temporary=*/nullptr, - OpenACCReductionOperator::Invalid, - Decl::castToDeclContext(cgf.curFuncDecl), - opInfo.baseType, privateOp.getResult()); + .getOrCreateRecipe( + cgf.getContext(), recipeInsertLocation, varExpr, + varRecipe.AllocaDecl, varRecipe.InitExpr, + /*temporary=*/nullptr, OpenACCReductionOperator::Invalid, + Decl::castToDeclContext(cgf.curFuncDecl), opInfo.origType, + opInfo.bounds.size(), opInfo.boundTypes, opInfo.baseType, + privateOp.getResult()); // TODO: OpenACC: The dialect is going to change in the near future to // have these be on a different operation, so when that changes, we // probably need to change these here. 
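// The recipeInsertLocation threading shown here follows a common MLIR builder
// pattern: remember where the previous recipe was created so the next one is placed
// right after it at module scope, keeping recipes in lexical order. A minimal
// standalone sketch of that pattern (names are hypothetical, not from the patch):
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"

static mlir::OpBuilder::InsertPoint lastRecipe; // persists across clause emissions

static void placeNextRecipe(mlir::ModuleOp mod) {
  mlir::OpBuilder modBuilder(mod.getBodyRegion()); // default: top of the module body
  if (lastRecipe.isSet())
    modBuilder.restoreInsertionPoint(lastRecipe);  // resume after the previous recipe
  // ... create the recipe operation with modBuilder here ...
  lastRecipe = modBuilder.saveInsertionPoint();    // the next recipe goes after this one
}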
@@ -1042,12 +1046,14 @@ class OpenACCClauseCIREmitter final auto recipe = OpenACCRecipeBuilder(cgf, builder) - .getOrCreateRecipe(cgf.getContext(), varExpr, allocaDecl, - varRecipe.InitFromTemporary, - OpenACCReductionOperator::Invalid, - Decl::castToDeclContext(cgf.curFuncDecl), - opInfo.baseType, - firstPrivateOp.getResult()); + .getOrCreateRecipe( + cgf.getContext(), recipeInsertLocation, varExpr, + varRecipe.AllocaDecl, varRecipe.InitExpr, + varRecipe.InitFromTemporary, + OpenACCReductionOperator::Invalid, + Decl::castToDeclContext(cgf.curFuncDecl), opInfo.origType, + opInfo.bounds.size(), opInfo.boundTypes, opInfo.baseType, + firstPrivateOp.getResult()); // TODO: OpenACC: The dialect is going to change in the near future to // have these be on a different operation, so when that changes, we @@ -1089,11 +1095,13 @@ class OpenACCClauseCIREmitter final auto recipe = OpenACCRecipeBuilder(cgf, builder) - .getOrCreateRecipe(cgf.getContext(), varExpr, allocaDecl, - /*temporary=*/nullptr, - clause.getReductionOp(), - Decl::castToDeclContext(cgf.curFuncDecl), - opInfo.baseType, reductionOp.getResult()); + .getOrCreateRecipe( + cgf.getContext(), recipeInsertLocation, varExpr, + varRecipe.AllocaDecl, varRecipe.InitExpr, + /*temporary=*/nullptr, clause.getReductionOp(), + Decl::castToDeclContext(cgf.curFuncDecl), opInfo.origType, + opInfo.bounds.size(), opInfo.boundTypes, opInfo.baseType, + reductionOp.getResult()); operation.addReduction(builder.getContext(), reductionOp, recipe); } @@ -1109,10 +1117,13 @@ class OpenACCClauseCIREmitter final }; template -auto makeClauseEmitter(OpTy &op, CIRGen::CIRGenFunction &cgf, +auto makeClauseEmitter(OpTy &op, + mlir::OpBuilder::InsertPoint &recipeInsertLocation, + CIRGen::CIRGenFunction &cgf, CIRGen::CIRGenBuilderTy &builder, OpenACCDirectiveKind dirKind, SourceLocation dirLoc) { - return OpenACCClauseCIREmitter(op, cgf, builder, dirKind, dirLoc); + return OpenACCClauseCIREmitter(op, recipeInsertLocation, cgf, builder, + dirKind, dirLoc); } } // namespace @@ -1125,7 +1136,8 @@ void CIRGenFunction::emitOpenACCClauses( // Sets insertion point before the 'op', since every new expression needs to // be before the operation. builder.setInsertionPoint(op); - makeClauseEmitter(op, *this, builder, dirKind, dirLoc).emitClauses(clauses); + makeClauseEmitter(op, lastRecipeLocation, *this, builder, dirKind, dirLoc) + .emitClauses(clauses); } #define EXPL_SPEC(N) \ @@ -1157,7 +1169,8 @@ void CIRGenFunction::emitOpenACCClauses( // We cannot set the insertion point here and do so in the emitter, but make // sure we reset it with the 'guard' anyway. mlir::OpBuilder::InsertionGuard guardCase(builder); - makeClauseEmitter(inf, *this, builder, dirKind, dirLoc).emitClauses(clauses); + makeClauseEmitter(inf, lastRecipeLocation, *this, builder, dirKind, dirLoc) + .emitClauses(clauses); } #define EXPL_SPEC(N) \ diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp new file mode 100644 index 0000000000000..a4c2641fe631c --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp @@ -0,0 +1,316 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Helperes to emit OpenACC clause recipes as CIR code. +// +//===----------------------------------------------------------------------===// + +#include "CIRGenOpenACCRecipe.h" + +namespace clang::CIRGen { +mlir::Block *OpenACCRecipeBuilderBase::createRecipeBlock(mlir::Region ®ion, + mlir::Type opTy, + mlir::Location loc, + size_t numBounds, + bool isInit) { + llvm::SmallVector types; + types.reserve(numBounds + 2); + types.push_back(opTy); + // The init section is the only one that doesn't have TWO copies of the + // operation-type. Copy has a to/from, and destroy has a + // 'reference'/'privatized' copy version. + if (!isInit) + types.push_back(opTy); + + auto boundsTy = mlir::acc::DataBoundsType::get(&cgf.getMLIRContext()); + for (size_t i = 0; i < numBounds; ++i) + types.push_back(boundsTy); + + llvm::SmallVector locs{types.size(), loc}; + return builder.createBlock(®ion, region.end(), types, locs); +} + +mlir::Value +OpenACCRecipeBuilderBase::createBoundsLoop(mlir::Value subscriptedValue, + mlir::Value bound, + mlir::Location loc, bool inverse) { + mlir::Operation *bodyInsertLoc; + + mlir::Type itrTy = cgf.cgm.convertType(cgf.getContext().UnsignedLongLongTy); + auto itrPtrTy = cir::PointerType::get(itrTy); + mlir::IntegerAttr itrAlign = + cgf.cgm.getSize(cgf.getContext().getTypeAlignInChars( + cgf.getContext().UnsignedLongLongTy)); + auto idxType = mlir::IndexType::get(&cgf.getMLIRContext()); + + auto doSubscriptOp = [&](mlir::Value subVal, + cir::LoadOp idxLoad) -> mlir::Value { + auto eltTy = cast(subVal.getType()).getPointee(); + + if (auto arrayTy = dyn_cast(eltTy)) + return builder.getArrayElement(loc, loc, subVal, arrayTy.getElementType(), + idxLoad.getResult(), + /*shouldDecay=*/true); + + assert(isa(eltTy)); + + auto eltLoad = cir::LoadOp::create(builder, loc, {subVal}); + + return cir::PtrStrideOp::create(builder, loc, eltLoad.getType(), eltLoad, + idxLoad.getResult()) + .getResult(); + }; + + auto forStmtBuilder = [&]() { + // get the lower and upper bound for iterating over. + auto lowerBoundVal = + mlir::acc::GetLowerboundOp::create(builder, loc, idxType, bound); + auto lbConversion = mlir::UnrealizedConversionCastOp::create( + builder, loc, itrTy, lowerBoundVal.getResult()); + auto upperBoundVal = + mlir::acc::GetUpperboundOp::create(builder, loc, idxType, bound); + auto ubConversion = mlir::UnrealizedConversionCastOp::create( + builder, loc, itrTy, upperBoundVal.getResult()); + + // Create a memory location for the iterator. + auto itr = + cir::AllocaOp::create(builder, loc, itrPtrTy, itrTy, "iter", itrAlign); + // Store to the iterator: either lower bound, or if inverse loop, upper + // bound. + if (inverse) { + cir::ConstantOp constOne = builder.getConstInt(loc, itrTy, 1); + + auto sub = + cir::BinOp::create(builder, loc, itrTy, cir::BinOpKind::Sub, + ubConversion.getResult(0), constOne.getResult()); + + // Upperbound is exclusive, so subtract 1. + builder.CIRBaseBuilderTy::createStore(loc, sub.getResult(), itr); + } else { + // Lowerbound is inclusive, so we can include it. + builder.CIRBaseBuilderTy::createStore(loc, lbConversion.getResult(0), + itr); + } + // Save the 'end' iterator based on whether we are inverted or not. This + // end iterator never changes, so we can just get it and convert it, so no + // need to store/load/etc. + auto endItr = inverse ? 
lbConversion : ubConversion; + + builder.createFor( + loc, + /*condBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + auto loadCur = cir::LoadOp::create(builder, loc, {itr}); + // Use 'not equal' since we are just doing an increment/decrement. + auto cmp = builder.createCompare( + loc, inverse ? cir::CmpOpKind::ge : cir::CmpOpKind::lt, + loadCur.getResult(), endItr.getResult(0)); + builder.createCondition(cmp); + }, + /*bodyBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + auto load = cir::LoadOp::create(builder, loc, {itr}); + + if (subscriptedValue) + subscriptedValue = doSubscriptOp(subscriptedValue, load); + bodyInsertLoc = builder.createYield(loc); + }, + /*stepBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + auto load = cir::LoadOp::create(builder, loc, {itr}); + auto unary = cir::UnaryOp::create(builder, loc, load.getType(), + inverse ? cir::UnaryOpKind::Dec + : cir::UnaryOpKind::Inc, + load.getResult()); + builder.CIRBaseBuilderTy::createStore(loc, unary.getResult(), itr); + builder.createYield(loc); + }); + }; + + cir::ScopeOp::create(builder, loc, + [&](mlir::OpBuilder &b, mlir::Location loc) { + forStmtBuilder(); + builder.createYield(loc); + }); + + // Leave the insertion point to be inside the body, so we can loop over + // these things. + builder.setInsertionPoint(bodyInsertLoc); + return subscriptedValue; +} + +mlir::acc::ReductionOperator +OpenACCRecipeBuilderBase::convertReductionOp(OpenACCReductionOperator op) { + switch (op) { + case OpenACCReductionOperator::Addition: + return mlir::acc::ReductionOperator::AccAdd; + case OpenACCReductionOperator::Multiplication: + return mlir::acc::ReductionOperator::AccMul; + case OpenACCReductionOperator::Max: + return mlir::acc::ReductionOperator::AccMax; + case OpenACCReductionOperator::Min: + return mlir::acc::ReductionOperator::AccMin; + case OpenACCReductionOperator::BitwiseAnd: + return mlir::acc::ReductionOperator::AccIand; + case OpenACCReductionOperator::BitwiseOr: + return mlir::acc::ReductionOperator::AccIor; + case OpenACCReductionOperator::BitwiseXOr: + return mlir::acc::ReductionOperator::AccXor; + case OpenACCReductionOperator::And: + return mlir::acc::ReductionOperator::AccLand; + case OpenACCReductionOperator::Or: + return mlir::acc::ReductionOperator::AccLor; + case OpenACCReductionOperator::Invalid: + llvm_unreachable("invalid reduction operator"); + } + + llvm_unreachable("invalid reduction operator"); +} + +// This function generates the 'destroy' section for a recipe. Note +// that this function is not 'insertion point' clean, in that it alters the +// insertion point to be inside of the 'destroy' section of the recipe, but +// doesn't restore it aftewards. 
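// As a mental model, each bounds dimension handled by createBoundsLoop above
// corresponds to one of the two plain loops below: a forward walk for init/copy and
// an inverse walk for destruction (standalone sketch only; the generated IR keeps
// the iterator in an alloca and compares against the converted acc.bounds values):
#include <cstddef>

static void forwardWalk(int *base, std::size_t lb, std::size_t ub) {
  for (std::size_t i = lb; i != ub; ++i)  // upper bound is exclusive
    base[i] = 0;                          // stand-in for the per-element work
}

static void inverseWalk(int *base, std::size_t lb, std::size_t ub) {
  for (std::size_t i = ub; i != lb; --i)  // visit ub-1 down to lb, reverse of construction
    base[i - 1] = 0;
}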
+void OpenACCRecipeBuilderBase::createRecipeDestroySection( + mlir::Location loc, mlir::Location locEnd, mlir::Value mainOp, + CharUnits alignment, QualType origType, size_t numBounds, QualType baseType, + mlir::Region &destroyRegion) { + mlir::Block *block = createRecipeBlock(destroyRegion, mainOp.getType(), loc, + numBounds, /*isInit=*/false); + builder.setInsertionPointToEnd(&destroyRegion.back()); + CIRGenFunction::LexicalScope ls(cgf, loc, block); + + mlir::Type elementTy = + mlir::cast(mainOp.getType()).getPointee(); + auto emitDestroy = [&](mlir::Value var, mlir::Type ty) { + Address addr{var, ty, alignment}; + cgf.emitDestroy(addr, origType, + cgf.getDestroyer(QualType::DK_cxx_destructor)); + }; + + if (numBounds) { + mlir::OpBuilder::InsertionGuard guardCase(builder); + // Get the range of bounds arguments, which are all but the 1st 2. 1st is + // a 'reference', 2nd is the 'private' variant we need to destroy from. + llvm::MutableArrayRef boundsRange = + block->getArguments().drop_front(2); + + mlir::Value subscriptedValue = block->getArgument(1); + for (mlir::BlockArgument boundArg : llvm::reverse(boundsRange)) + subscriptedValue = createBoundsLoop(subscriptedValue, boundArg, loc, + /*inverse=*/true); + + emitDestroy(subscriptedValue, cgf.cgm.convertType(origType)); + } else { + // If we don't have any bounds, we can just destroy the variable directly. + // The destroy region has a signature of "original item, privatized item". + // So the 2nd item is the one that needs destroying, the former is just + // for reference and we don't really have a need for it at the moment. + emitDestroy(block->getArgument(1), elementTy); + } + + mlir::acc::YieldOp::create(builder, locEnd); +} + +// TODO: OpenACC: When we get this implemented for the reduction/firstprivate, +// this might end up re-merging with createRecipeInitCopy. For now, keep it +// separate until we're sure what everything looks like to keep this as clean +// as possible. +void OpenACCRecipeBuilderBase::createPrivateInitRecipe( + mlir::Location loc, mlir::Location locEnd, SourceRange exprRange, + mlir::Value mainOp, mlir::acc::PrivateRecipeOp recipe, size_t numBounds, + llvm::ArrayRef boundTypes, const VarDecl *allocaDecl, + QualType origType, const Expr *initExpr) { + assert(allocaDecl && "Required recipe variable not set?"); + CIRGenFunction::DeclMapRevertingRAII declMapRAII{cgf, allocaDecl}; + + mlir::Block *block = + createRecipeBlock(recipe.getInitRegion(), mainOp.getType(), loc, + numBounds, /*isInit=*/true); + builder.setInsertionPointToEnd(&recipe.getInitRegion().back()); + CIRGenFunction::LexicalScope ls(cgf, loc, block); + + const Type *allocaPointeeType = + allocaDecl->getType()->getPointeeOrArrayElementType(); + // We are OK with no init for builtins, arrays of builtins, or pointers, + // else we should NYI so we know to go look for these. + if (cgf.getContext().getLangOpts().CPlusPlus && !allocaDecl->getInit() && + !allocaDecl->getType()->isPointerType() && + !allocaPointeeType->isBuiltinType() && + !allocaPointeeType->isPointerType()) { + // If we don't have any initialization recipe, we failed during Sema to + // initialize this correctly. If we disable the + // Sema::TentativeAnalysisScopes in SemaOpenACC::CreateInitRecipe, it'll + // emit an error to tell us. However, emitting those errors during + // production is a violation of the standard, so we cannot do them. 
+ cgf.cgm.errorNYI(exprRange, "private default-init recipe"); + } + + if (!numBounds) { + // This is an 'easy' case, we just have to use the builtin init stuff to + // initialize this variable correctly. + CIRGenFunction::AutoVarEmission tempDeclEmission = + cgf.emitAutoVarAlloca(*allocaDecl, builder.saveInsertionPoint()); + cgf.emitAutoVarInit(tempDeclEmission); + } else { + cgf.cgm.errorNYI(exprRange, "private-init with bounds"); + } + + mlir::acc::YieldOp::create(builder, locEnd); +} + +void OpenACCRecipeBuilderBase::createFirstprivateRecipeCopy( + mlir::Location loc, mlir::Location locEnd, mlir::Value mainOp, + CIRGenFunction::AutoVarEmission tempDeclEmission, + mlir::acc::FirstprivateRecipeOp recipe, const VarDecl *varRecipe, + const VarDecl *temporary) { + mlir::Block *block = + createRecipeBlock(recipe.getCopyRegion(), mainOp.getType(), loc, + /*numBounds=*/0, /*isInit=*/false); + builder.setInsertionPointToEnd(&recipe.getCopyRegion().back()); + CIRGenFunction::LexicalScope ls(cgf, loc, block); + + mlir::BlockArgument fromArg = block->getArgument(0); + mlir::BlockArgument toArg = block->getArgument(1); + + mlir::Type elementTy = + mlir::cast(mainOp.getType()).getPointee(); + + // Set the address of the emission to be the argument, so that we initialize + // that instead of the variable in the other block. + tempDeclEmission.setAllocatedAddress( + Address{toArg, elementTy, cgf.getContext().getDeclAlign(varRecipe)}); + tempDeclEmission.EmittedAsOffload = true; + + CIRGenFunction::DeclMapRevertingRAII declMapRAII{cgf, temporary}; + cgf.setAddrOfLocalVar( + temporary, + Address{fromArg, elementTy, cgf.getContext().getDeclAlign(varRecipe)}); + + cgf.emitAutoVarInit(tempDeclEmission); + mlir::acc::YieldOp::create(builder, locEnd); +} +// This function generates the 'combiner' section for a reduction recipe. Note +// that this function is not 'insertion point' clean, in that it alters the +// insertion point to be inside of the 'combiner' section of the recipe, but +// doesn't restore it aftewards. 
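// Source-level picture of what the copy region built in createFirstprivateRecipeCopy
// above implements (illustrative OpenACC C, not taken from a test): firstprivate
// gives each worker its own copy of the variable, copy-initialized from the original
// value in the enclosing scope.
void firstprivateExample(int x) {
#pragma acc parallel firstprivate(x)
  { x += 1; }  // modifies only the private, copy-initialized 'x'
}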
+void OpenACCRecipeBuilderBase::createReductionRecipeCombiner( + mlir::Location loc, mlir::Location locEnd, mlir::Value mainOp, + mlir::acc::ReductionRecipeOp recipe) { + mlir::Block *block = builder.createBlock( + &recipe.getCombinerRegion(), recipe.getCombinerRegion().end(), + {mainOp.getType(), mainOp.getType()}, {loc, loc}); + builder.setInsertionPointToEnd(&recipe.getCombinerRegion().back()); + CIRGenFunction::LexicalScope ls(cgf, loc, block); + + mlir::BlockArgument lhsArg = block->getArgument(0); + + mlir::acc::YieldOp::create(builder, locEnd, lhsArg); +} + +} // namespace clang::CIRGen diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h index 102fd890e5579..978c671f9a170 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.h @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "CIRGenCXXABI.h" #include "CIRGenFunction.h" #include "clang/AST/ASTContext.h" @@ -22,38 +23,56 @@ #include "mlir/Dialect/OpenACC/OpenACC.h" namespace clang::CIRGen { -template class OpenACCRecipeBuilder { +class OpenACCRecipeBuilderBase { +protected: CIRGen::CIRGenFunction &cgf; CIRGen::CIRGenBuilderTy &builder; - mlir::acc::ReductionOperator convertReductionOp(OpenACCReductionOperator op) { - switch (op) { - case OpenACCReductionOperator::Addition: - return mlir::acc::ReductionOperator::AccAdd; - case OpenACCReductionOperator::Multiplication: - return mlir::acc::ReductionOperator::AccMul; - case OpenACCReductionOperator::Max: - return mlir::acc::ReductionOperator::AccMax; - case OpenACCReductionOperator::Min: - return mlir::acc::ReductionOperator::AccMin; - case OpenACCReductionOperator::BitwiseAnd: - return mlir::acc::ReductionOperator::AccIand; - case OpenACCReductionOperator::BitwiseOr: - return mlir::acc::ReductionOperator::AccIor; - case OpenACCReductionOperator::BitwiseXOr: - return mlir::acc::ReductionOperator::AccXor; - case OpenACCReductionOperator::And: - return mlir::acc::ReductionOperator::AccLand; - case OpenACCReductionOperator::Or: - return mlir::acc::ReductionOperator::AccLor; - case OpenACCReductionOperator::Invalid: - llvm_unreachable("invalid reduction operator"); - } + mlir::Block *createRecipeBlock(mlir::Region ®ion, mlir::Type opTy, + mlir::Location loc, size_t numBounds, + bool isInit); + // Creates a loop through an 'acc.bounds', leaving the 'insertion' point to be + // the inside of the loop body. Traverses LB->UB UNLESS `inverse` is set. + // Returns the 'subscriptedValue' changed with the new bounds subscript. + mlir::Value createBoundsLoop(mlir::Value subscriptedValue, mlir::Value bound, + mlir::Location loc, bool inverse); + mlir::acc::ReductionOperator convertReductionOp(OpenACCReductionOperator op); + void createFirstprivateRecipeCopy( + mlir::Location loc, mlir::Location locEnd, mlir::Value mainOp, + CIRGenFunction::AutoVarEmission tempDeclEmission, + mlir::acc::FirstprivateRecipeOp recipe, const VarDecl *varRecipe, + const VarDecl *temporary); - llvm_unreachable("invalid reduction operator"); - } + // This function generates the 'combiner' section for a reduction recipe. Note + // that this function is not 'insertion point' clean, in that it alters the + // insertion point to be inside of the 'combiner' section of the recipe, but + // doesn't restore it aftewards. 
+ void createReductionRecipeCombiner(mlir::Location loc, mlir::Location locEnd, + mlir::Value mainOp, + mlir::acc::ReductionRecipeOp recipe); + void createPrivateInitRecipe(mlir::Location loc, mlir::Location locEnd, + SourceRange exprRange, mlir::Value mainOp, + mlir::acc::PrivateRecipeOp recipe, + size_t numBounds, + llvm::ArrayRef boundTypes, + const VarDecl *allocaDecl, QualType origType, + const Expr *initExpr); + + void createRecipeDestroySection(mlir::Location loc, mlir::Location locEnd, + mlir::Value mainOp, CharUnits alignment, + QualType origType, size_t numBounds, + QualType baseType, + mlir::Region &destroyRegion); + OpenACCRecipeBuilderBase(CIRGen::CIRGenFunction &cgf, + CIRGen::CIRGenBuilderTy &builder) + : cgf(cgf), builder(builder) {} +}; + +template +class OpenACCRecipeBuilder : OpenACCRecipeBuilderBase { std::string getRecipeName(SourceRange loc, QualType baseType, + unsigned numBounds, OpenACCReductionOperator reductionOp) { std::string recipeName; { @@ -106,44 +125,17 @@ template class OpenACCRecipeBuilder { static_assert(!sizeof(RecipeTy), "Unknown Recipe op kind"); } + // The naming convention from Flang with bounds doesn't map to C++ types + // very well, so we're just going to choose our own here. + if (numBounds) + stream << "_Bcnt" << numBounds << '_'; + MangleContext &mc = cgf.cgm.getCXXABI().getMangleContext(); mc.mangleCanonicalTypeName(baseType, stream); } return recipeName; } - void createFirstprivateRecipeCopy( - mlir::Location loc, mlir::Location locEnd, mlir::Value mainOp, - CIRGenFunction::AutoVarEmission tempDeclEmission, - mlir::acc::FirstprivateRecipeOp recipe, const VarDecl *varRecipe, - const VarDecl *temporary) { - mlir::Block *block = builder.createBlock( - &recipe.getCopyRegion(), recipe.getCopyRegion().end(), - {mainOp.getType(), mainOp.getType()}, {loc, loc}); - builder.setInsertionPointToEnd(&recipe.getCopyRegion().back()); - CIRGenFunction::LexicalScope ls(cgf, loc, block); - - mlir::BlockArgument fromArg = block->getArgument(0); - mlir::BlockArgument toArg = block->getArgument(1); - - mlir::Type elementTy = - mlir::cast(mainOp.getType()).getPointee(); - - // Set the address of the emission to be the argument, so that we initialize - // that instead of the variable in the other block. - tempDeclEmission.setAllocatedAddress( - Address{toArg, elementTy, cgf.getContext().getDeclAlign(varRecipe)}); - tempDeclEmission.EmittedAsOffload = true; - - CIRGenFunction::DeclMapRevertingRAII declMapRAII{cgf, temporary}; - cgf.setAddrOfLocalVar( - temporary, - Address{fromArg, elementTy, cgf.getContext().getDeclAlign(varRecipe)}); - - cgf.emitAutoVarInit(tempDeclEmission); - mlir::acc::YieldOp::create(builder, locEnd); - } - // Create the 'init' section of the recipe, including the 'copy' section for // 'firstprivate'. Note that this function is not 'insertion point' clean, in // that it alters the insertion point to be inside of the 'destroy' section of @@ -152,6 +144,9 @@ template class OpenACCRecipeBuilder { SourceRange exprRange, mlir::Value mainOp, RecipeTy recipe, const VarDecl *varRecipe, const VarDecl *temporary) { + // TODO: OpenACC: when we get the 'pointer' variants for + // firstprivate/reduction, this probably should be removed/split into + // functions for the BuilderBase. 
assert(varRecipe && "Required recipe variable not set?"); CIRGenFunction::AutoVarEmission tempDeclEmission{ @@ -160,9 +155,9 @@ template class OpenACCRecipeBuilder { // Do the 'init' section of the recipe IR, which does an alloca, then the // initialization (except for firstprivate). - mlir::Block *block = builder.createBlock(&recipe.getInitRegion(), - recipe.getInitRegion().end(), - {mainOp.getType()}, {loc}); + mlir::Block *block = + createRecipeBlock(recipe.getInitRegion(), mainOp.getType(), loc, + /*numBounds=*/0, /*isInit=*/true); builder.setInsertionPointToEnd(&recipe.getInitRegion().back()); CIRGenFunction::LexicalScope ls(cgf, loc, block); @@ -217,79 +212,42 @@ template class OpenACCRecipeBuilder { } } - // This function generates the 'combiner' section for a reduction recipe. Note - // that this function is not 'insertion point' clean, in that it alters the - // insertion point to be inside of the 'combiner' section of the recipe, but - // doesn't restore it aftewards. - void createReductionRecipeCombiner(mlir::Location loc, mlir::Location locEnd, - mlir::Value mainOp, - mlir::acc::ReductionRecipeOp recipe) { - mlir::Block *block = builder.createBlock( - &recipe.getCombinerRegion(), recipe.getCombinerRegion().end(), - {mainOp.getType(), mainOp.getType()}, {loc, loc}); - builder.setInsertionPointToEnd(&recipe.getCombinerRegion().back()); - CIRGenFunction::LexicalScope ls(cgf, loc, block); - - mlir::BlockArgument lhsArg = block->getArgument(0); - - mlir::acc::YieldOp::create(builder, locEnd, lhsArg); - } - - // This function generates the 'destroy' section for a recipe. Note - // that this function is not 'insertion point' clean, in that it alters the - // insertion point to be inside of the 'destroy' section of the recipe, but - // doesn't restore it aftewards. - void createRecipeDestroySection(mlir::Location loc, mlir::Location locEnd, - mlir::Value mainOp, CharUnits alignment, - QualType baseType, - mlir::Region &destroyRegion) { - mlir::Block *block = - builder.createBlock(&destroyRegion, destroyRegion.end(), - {mainOp.getType(), mainOp.getType()}, {loc, loc}); - builder.setInsertionPointToEnd(&destroyRegion.back()); - CIRGenFunction::LexicalScope ls(cgf, loc, block); - - mlir::Type elementTy = - mlir::cast(mainOp.getType()).getPointee(); - // The destroy region has a signature of "original item, privatized item". - // So the 2nd item is the one that needs destroying, the former is just for - // reference and we don't really have a need for it at the moment. - Address addr{block->getArgument(1), elementTy, alignment}; - cgf.emitDestroy(addr, baseType, - cgf.getDestroyer(QualType::DK_cxx_destructor)); - - mlir::acc::YieldOp::create(builder, locEnd); - } - public: OpenACCRecipeBuilder(CIRGen::CIRGenFunction &cgf, CIRGen::CIRGenBuilderTy &builder) - : cgf(cgf), builder(builder) {} - RecipeTy getOrCreateRecipe(ASTContext &astCtx, const Expr *varRef, - const VarDecl *varRecipe, const VarDecl *temporary, + : OpenACCRecipeBuilderBase(cgf, builder) {} + RecipeTy getOrCreateRecipe(ASTContext &astCtx, + mlir::OpBuilder::InsertPoint &insertLocation, + const Expr *varRef, const VarDecl *varRecipe, + const Expr *initExpr, const VarDecl *temporary, OpenACCReductionOperator reductionOp, - DeclContext *dc, QualType baseType, - mlir::Value mainOp) { - - if (baseType->isPointerType() || - (baseType->isArrayType() && !baseType->isConstantArrayType())) { - // It is clear that the use of pointers/VLAs in a recipe are not properly - // generated/don't do what they are supposed to do. 
In the case where we - // have 'bounds', we can actually figure out what we want to - // initialize/copy/destroy/compare/etc, but we haven't figured out how - // that looks yet, both between the IR and generation code. For now, we - // will do an NYI error no it. - cgf.cgm.errorNYI( - varRef->getSourceRange(), - "OpenACC recipe generation for pointer/non-constant arrays"); + DeclContext *dc, QualType origType, + size_t numBounds, + llvm::ArrayRef boundTypes, + QualType baseType, mlir::Value mainOp) { + assert(!varRecipe->getType()->isSpecificBuiltinType( + BuiltinType::ArraySection) && + "array section shouldn't make it to recipe creation"); + + // TODO: OpenACC: This is a bit of a hackery to get this to not change for + // the non-private recipes. This will be removed soon, when we get this + // 'right' for firstprivate and reduction. + if constexpr (!std::is_same_v) { + if (numBounds) { + cgf.cgm.errorNYI(varRef->getSourceRange(), + "firstprivate/reduction-init with bounds"); + } + boundTypes = {}; + numBounds = 0; + origType = baseType; } mlir::ModuleOp mod = builder.getBlock() ->getParent() ->template getParentOfType(); - std::string recipeName = - getRecipeName(varRef->getSourceRange(), baseType, reductionOp); + std::string recipeName = getRecipeName(varRef->getSourceRange(), baseType, + numBounds, reductionOp); if (auto recipe = mod.lookupSymbol(recipeName)) return recipe; @@ -297,6 +255,8 @@ template class OpenACCRecipeBuilder { mlir::Location locEnd = cgf.cgm.getLoc(varRef->getEndLoc()); mlir::OpBuilder modBuilder(mod.getBodyRegion()); + if (insertLocation.isSet()) + modBuilder.restoreInsertionPoint(insertLocation); RecipeTy recipe; if constexpr (std::is_same_v) { @@ -305,18 +265,25 @@ template class OpenACCRecipeBuilder { } else { recipe = RecipeTy::create(modBuilder, loc, recipeName, mainOp.getType()); } + insertLocation = modBuilder.saveInsertionPoint(); - createRecipeInitCopy(loc, locEnd, varRef->getSourceRange(), mainOp, recipe, - varRecipe, temporary); + if constexpr (std::is_same_v) { + createPrivateInitRecipe(loc, locEnd, varRef->getSourceRange(), mainOp, + recipe, numBounds, boundTypes, varRecipe, + origType, initExpr); + } else { + createRecipeInitCopy(loc, locEnd, varRef->getSourceRange(), mainOp, + recipe, varRecipe, temporary); + } if constexpr (std::is_same_v) { createReductionRecipeCombiner(loc, locEnd, mainOp, recipe); } - if (varRecipe && varRecipe->needsDestruction(cgf.getContext())) - createRecipeDestroySection(loc, locEnd, mainOp, - cgf.getContext().getDeclAlign(varRecipe), - baseType, recipe.getDestroyRegion()); + if (origType.isDestructedType()) + createRecipeDestroySection( + loc, locEnd, mainOp, cgf.getContext().getDeclAlign(varRecipe), + origType, numBounds, baseType, recipe.getDestroyRegion()); return recipe; } }; diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp index f116efc202061..e842892d085d2 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp @@ -488,8 +488,11 @@ mlir::LogicalResult CIRGenFunction::emitReturnStmt(const ReturnStmt &s) { auto *retBlock = curLexScope->getOrCreateRetBlock(*this, loc); // This should emit a branch through the cleanup block if one exists. builder.create(loc, retBlock); + assert(!cir::MissingFeatures::emitBranchThroughCleanup()); if (ehStack.stable_begin() != currentCleanupStackDepth) cgm.errorNYI(s.getSourceRange(), "return with cleanup stack"); + + // Insert the new block to continue codegen after branch to ret block. 
builder.createBlock(builder.getBlock()->getParent()); return mlir::success(); @@ -1041,3 +1044,21 @@ mlir::LogicalResult CIRGenFunction::emitSwitchStmt(const clang::SwitchStmt &s) { return res; } + +void CIRGenFunction::emitReturnOfRValue(mlir::Location loc, RValue rv, + QualType ty) { + if (rv.isScalar()) { + builder.createStore(loc, rv.getValue(), returnValue); + } else if (rv.isAggregate()) { + LValue dest = makeAddrLValue(returnValue, ty); + LValue src = makeAddrLValue(rv.getAggregateAddress(), ty); + emitAggregateCopy(dest, src, ty, getOverlapForReturnValue()); + } else { + cgm.errorNYI(loc, "emitReturnOfRValue: complex return type"); + } + mlir::Block *retBlock = curLexScope->getOrCreateRetBlock(*this, loc); + assert(!cir::MissingFeatures::emitBranchThroughCleanup()); + builder.create(loc, retBlock); + if (ehStack.stable_begin() != currentCleanupStackDepth) + cgm.errorNYI(loc, "return with cleanup stack"); +} diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt index c7b76e8372efc..c1f27ec8ba858 100644 --- a/clang/lib/CIR/CodeGen/CMakeLists.txt +++ b/clang/lib/CIR/CodeGen/CMakeLists.txt @@ -31,6 +31,7 @@ add_clang_library(clangCIR CIRGenModule.cpp CIRGenOpenACC.cpp CIRGenOpenACCClause.cpp + CIRGenOpenACCRecipe.cpp CIRGenRecordLayoutBuilder.cpp CIRGenStmt.cpp CIRGenStmtOpenACC.cpp diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index 53126348c3bdc..58ef500446aa7 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -1546,11 +1546,14 @@ ParseResult cir::FuncOp::parse(OpAsmParser &parser, OperationState &state) { llvm::SMLoc loc = parser.getCurrentLocation(); mlir::Builder &builder = parser.getBuilder(); + mlir::StringAttr lambdaNameAttr = getLambdaAttrName(state.name); mlir::StringAttr noProtoNameAttr = getNoProtoAttrName(state.name); mlir::StringAttr visNameAttr = getSymVisibilityAttrName(state.name); mlir::StringAttr visibilityNameAttr = getGlobalVisibilityAttrName(state.name); mlir::StringAttr dsoLocalNameAttr = getDsoLocalAttrName(state.name); + if (::mlir::succeeded(parser.parseOptionalKeyword(lambdaNameAttr.strref()))) + state.addAttribute(lambdaNameAttr, parser.getBuilder().getUnitAttr()); if (parser.parseOptionalKeyword(noProtoNameAttr).succeeded()) state.addAttribute(noProtoNameAttr, parser.getBuilder().getUnitAttr()); @@ -1658,6 +1661,9 @@ mlir::Region *cir::FuncOp::getCallableRegion() { } void cir::FuncOp::print(OpAsmPrinter &p) { + if (getLambda()) + p << " lambda"; + if (getNoProto()) p << " no_proto"; diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 1865698838134..876948d53010b 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -1941,8 +1941,14 @@ mlir::LogicalResult CIRToLLVMUnaryOpLowering::matchAndRewrite( // Pointer unary operations: + only. (++ and -- of pointers are implemented // with cir.ptr_stride, not cir.unary.) 
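// For reference (illustration, not part of the patch): the only pointer unary
// operator that reaches cir.unary is '+', which is a value-preserving no-op, so the
// lowering below can simply forward the operand.
void pointerUnaryPlus(int *p) {
  int *q = +p;  // same value as p; no extra code is needed for the '+'
  (void)q;
}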
if (mlir::isa(elementType)) { - return op.emitError() - << "Unary operation on pointer types is not yet implemented"; + switch (op.getKind()) { + case cir::UnaryOpKind::Plus: + rewriter.replaceOp(op, adaptor.getInput()); + return mlir::success(); + default: + op.emitError() << "Unknown pointer unary operation during CIR lowering"; + return mlir::failure(); + } } return op.emitError() << "Unary operation has unsupported type: " @@ -2381,9 +2387,6 @@ static void prepareTypeConverter(mlir::LLVMTypeConverter &converter, } break; } - converter.addConversion([&](cir::VoidType type) -> mlir::Type { - return mlir::LLVM::LLVMVoidType::get(type.getContext()); - }); // Record has a name: lower as an identified record. mlir::LLVM::LLVMStructType llvmStruct; diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 77bf0c8251fc2..57db20f70801b 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -29,6 +29,7 @@ #include "llvm/Frontend/Driver/CodeGenOptions.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/LLVMRemarkStreamer.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/IR/ModuleSummaryIndex.h" @@ -436,7 +437,8 @@ static bool initTargetOptions(const CompilerInstance &CI, if (Options.BBSections == llvm::BasicBlockSection::List) { ErrorOr> MBOrErr = - MemoryBuffer::getFile(CodeGenOpts.BBSections.substr(5)); + CI.getVirtualFileSystem().getBufferForFile( + CodeGenOpts.BBSections.substr(5)); if (!MBOrErr) { Diags.Report(diag::err_fe_unable_to_load_basic_block_sections_file) << MBOrErr.getError().message(); @@ -784,7 +786,8 @@ static void addSanitizers(const Triple &TargetTriple, HWASanPass(SanitizerKind::KernelHWAddress, true); if (LangOpts.Sanitize.has(SanitizerKind::DataFlow)) { - MPM.addPass(DataFlowSanitizerPass(LangOpts.NoSanitizeFiles)); + MPM.addPass(DataFlowSanitizerPass(LangOpts.NoSanitizeFiles, + PB.getVirtualFileSystemPtr())); } }; if (ClSanitizeOnOptimizerEarlyEP) { @@ -836,9 +839,9 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (CodeGenOpts.hasProfileIRInstr()) // -fprofile-generate. 
PGOOpt = PGOOptions(getProfileGenName(CodeGenOpts), "", "", - CodeGenOpts.MemoryProfileUsePath, nullptr, - PGOOptions::IRInstr, PGOOptions::NoCSAction, - ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling, + CodeGenOpts.MemoryProfileUsePath, PGOOptions::IRInstr, + PGOOptions::NoCSAction, ClPGOColdFuncAttr, + CodeGenOpts.DebugInfoForProfiling, /*PseudoProbeForProfiling=*/false, CodeGenOpts.AtomicProfileUpdate); else if (CodeGenOpts.hasProfileIRUse()) { @@ -847,32 +850,30 @@ void EmitAssemblyHelper::RunOptimizationPipeline( : PGOOptions::NoCSAction; PGOOpt = PGOOptions(CodeGenOpts.ProfileInstrumentUsePath, "", CodeGenOpts.ProfileRemappingFile, - CodeGenOpts.MemoryProfileUsePath, VFS, - PGOOptions::IRUse, CSAction, ClPGOColdFuncAttr, + CodeGenOpts.MemoryProfileUsePath, PGOOptions::IRUse, + CSAction, ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling); } else if (!CodeGenOpts.SampleProfileFile.empty()) // -fprofile-sample-use PGOOpt = PGOOptions( CodeGenOpts.SampleProfileFile, "", CodeGenOpts.ProfileRemappingFile, - CodeGenOpts.MemoryProfileUsePath, VFS, PGOOptions::SampleUse, + CodeGenOpts.MemoryProfileUsePath, PGOOptions::SampleUse, PGOOptions::NoCSAction, ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling, CodeGenOpts.PseudoProbeForProfiling); else if (!CodeGenOpts.MemoryProfileUsePath.empty()) // -fmemory-profile-use (without any of the above options) - PGOOpt = PGOOptions("", "", "", CodeGenOpts.MemoryProfileUsePath, VFS, + PGOOpt = PGOOptions("", "", "", CodeGenOpts.MemoryProfileUsePath, PGOOptions::NoAction, PGOOptions::NoCSAction, ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling); else if (CodeGenOpts.PseudoProbeForProfiling) // -fpseudo-probe-for-profiling - PGOOpt = - PGOOptions("", "", "", /*MemoryProfile=*/"", nullptr, - PGOOptions::NoAction, PGOOptions::NoCSAction, - ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling, true); + PGOOpt = PGOOptions("", "", "", /*MemoryProfile=*/"", PGOOptions::NoAction, + PGOOptions::NoCSAction, ClPGOColdFuncAttr, + CodeGenOpts.DebugInfoForProfiling, true); else if (CodeGenOpts.DebugInfoForProfiling) // -fdebug-info-for-profiling - PGOOpt = PGOOptions("", "", "", /*MemoryProfile=*/"", nullptr, - PGOOptions::NoAction, PGOOptions::NoCSAction, - ClPGOColdFuncAttr, true); + PGOOpt = PGOOptions("", "", "", /*MemoryProfile=*/"", PGOOptions::NoAction, + PGOOptions::NoCSAction, ClPGOColdFuncAttr, true); // Check to see if we want to generate a CS profile. if (CodeGenOpts.hasProfileCSIRInstr()) { @@ -888,7 +889,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline( PGOOpt->CSAction = PGOOptions::CSIRInstr; } else PGOOpt = PGOOptions("", getProfileGenName(CodeGenOpts), "", - /*MemoryProfile=*/"", nullptr, PGOOptions::NoAction, + /*MemoryProfile=*/"", PGOOptions::NoAction, PGOOptions::CSIRInstr, ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling); } @@ -925,7 +926,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline( (CodeGenOpts.DebugPassManager || DebugPassStructure), CodeGenOpts.VerifyEach, PrintPassOpts); SI.registerCallbacks(PIC, &MAM); - PassBuilder PB(TM.get(), PTO, PGOOpt, &PIC); + PassBuilder PB(TM.get(), PTO, PGOOpt, &PIC, CI.getVirtualFileSystemPtr()); // Handle the assignment tracking feature options. switch (CodeGenOpts.getAssignmentTrackingMode()) { @@ -1384,6 +1385,10 @@ runThinLTOBackend(CompilerInstance &CI, ModuleSummaryIndex *CombinedIndex, Conf.CGFileType = getCodeGenFileType(Action); break; } + + // FIXME: Both ExecuteAction and thinBackend set up optimization remarks for + // the same context. 
+ finalizeLLVMOptimizationRemarks(M->getContext()); if (Error E = thinBackend(Conf, -1, AddStream, *M, *CombinedIndex, ImportList, ModuleToDefinedGVSummaries[M->getModuleIdentifier()], @@ -1471,13 +1476,13 @@ void clang::EmbedBitcode(llvm::Module *M, const CodeGenOptions &CGOpts, } void clang::EmbedObject(llvm::Module *M, const CodeGenOptions &CGOpts, - DiagnosticsEngine &Diags) { + llvm::vfs::FileSystem &VFS, DiagnosticsEngine &Diags) { if (CGOpts.OffloadObjects.empty()) return; for (StringRef OffloadObject : CGOpts.OffloadObjects) { llvm::ErrorOr> ObjectOrErr = - llvm::MemoryBuffer::getFileOrSTDIN(OffloadObject); + VFS.getBufferForFile(OffloadObject); if (ObjectOrErr.getError()) { auto DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error, "could not open '%0' for embedding"); diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp index 9106c4cd8e139..eeb0fd6412946 100644 --- a/clang/lib/CodeGen/CGAtomic.cpp +++ b/clang/lib/CodeGen/CGAtomic.cpp @@ -734,7 +734,8 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest, CGF.emitAtomicRMWInst(llvm::AtomicRMWInst::Xchg, Ptr, CGF.Builder.getInt8(1), Order, Scope, E); RMWI->setVolatile(E->isVolatile()); - llvm::Value *Result = CGF.Builder.CreateIsNotNull(RMWI, "tobool"); + llvm::Value *Result = CGF.EmitToMemory( + CGF.Builder.CreateIsNotNull(RMWI, "tobool"), E->getType()); auto *I = CGF.Builder.CreateStore(Result, Dest); CGF.addInstToCurrentSourceAtom(I, Result); return; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index f7c3dea257d50..9ee810c9d5775 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4277,18 +4277,19 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, llvm::Value *Ptr = EmitScalarExpr(E->getArg(1)); llvm::Type *RetTy = CGM.getTypes().ConvertType(E->getType()); - CharUnits Align = CGM.getNaturalTypeAlignment(E->getType(), nullptr); - llvm::Value *AlignVal = - llvm::ConstantInt::get(Int32Ty, Align.getQuantity()); - llvm::Value *PassThru = llvm::PoisonValue::get(RetTy); if (E->getNumArgs() > 2) PassThru = EmitScalarExpr(E->getArg(2)); + CharUnits Align = CGM.getNaturalTypeAlignment( + E->getType()->getAs()->getElementType(), nullptr); + llvm::Value *AlignVal = + llvm::ConstantInt::get(Int32Ty, Align.getQuantity()); + llvm::Value *Result; if (BuiltinID == Builtin::BI__builtin_masked_load) { Function *F = - CGM.getIntrinsic(Intrinsic::masked_load, {RetTy, UnqualPtrTy}); + CGM.getIntrinsic(Intrinsic::masked_load, {RetTy, Ptr->getType()}); Result = Builder.CreateCall(F, {Ptr, AlignVal, Mask, PassThru}, "masked_load"); } else { @@ -4333,15 +4334,16 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, QualType ValTy = E->getArg(1)->getType(); llvm::Type *ValLLTy = CGM.getTypes().ConvertType(ValTy); - llvm::Type *PtrTy = Ptr->getType(); - CharUnits Align = CGM.getNaturalTypeAlignment(ValTy, nullptr); + CharUnits Align = CGM.getNaturalTypeAlignment( + E->getArg(1)->getType()->getAs()->getElementType(), + nullptr); llvm::Value *AlignVal = llvm::ConstantInt::get(Int32Ty, Align.getQuantity()); if (BuiltinID == Builtin::BI__builtin_masked_store) { - llvm::Function *F = - CGM.getIntrinsic(llvm::Intrinsic::masked_store, {ValLLTy, PtrTy}); + llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::masked_store, + {ValLLTy, Ptr->getType()}); Builder.CreateCall(F, {Val, Ptr, AlignVal, Mask}); } else { llvm::Function *F = diff --git a/clang/lib/CodeGen/CGCall.cpp 
b/clang/lib/CodeGen/CGCall.cpp index 0b2fce4244fb6..a931ce476b8ae 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -2438,7 +2438,10 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, // Some ABIs may result in additional accesses to arguments that may // otherwise not be present. + std::optional MemAttrForPtrArgs; + bool AddedPotentialArgAccess = false; auto AddPotentialArgAccess = [&]() { + AddedPotentialArgAccess = true; llvm::Attribute A = FuncAttrs.getAttribute(llvm::Attribute::Memory); if (A.isValid()) FuncAttrs.addMemoryAttr(A.getMemoryEffects() | @@ -2499,11 +2502,13 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, // gcc specifies that 'const' functions have greater restrictions than // 'pure' functions, so they also cannot have infinite loops. FuncAttrs.addAttribute(llvm::Attribute::WillReturn); + MemAttrForPtrArgs = llvm::Attribute::ReadNone; } else if (TargetDecl->hasAttr()) { FuncAttrs.addMemoryAttr(llvm::MemoryEffects::readOnly()); FuncAttrs.addAttribute(llvm::Attribute::NoUnwind); // gcc specifies that 'pure' functions cannot have infinite loops. FuncAttrs.addAttribute(llvm::Attribute::WillReturn); + MemAttrForPtrArgs = llvm::Attribute::ReadOnly; } else if (TargetDecl->hasAttr()) { FuncAttrs.addMemoryAttr(llvm::MemoryEffects::inaccessibleOrArgMemOnly()); FuncAttrs.addAttribute(llvm::Attribute::NoUnwind); @@ -3011,6 +3016,27 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, } assert(ArgNo == FI.arg_size()); + ArgNo = 0; + if (AddedPotentialArgAccess && MemAttrForPtrArgs) { + llvm::FunctionType *FunctionType = FunctionType = + getTypes().GetFunctionType(FI); + for (CGFunctionInfo::const_arg_iterator I = FI.arg_begin(), + E = FI.arg_end(); + I != E; ++I, ++ArgNo) { + if (I->info.isDirect() || I->info.isExpand() || + I->info.isCoerceAndExpand()) { + unsigned FirstIRArg, NumIRArgs; + std::tie(FirstIRArg, NumIRArgs) = IRFunctionArgs.getIRArgs(ArgNo); + for (unsigned i = FirstIRArg; i < FirstIRArg + NumIRArgs; ++i) { + if (FunctionType->getParamType(i)->isPointerTy()) { + ArgAttrs[i] = + ArgAttrs[i].addAttribute(getLLVMContext(), *MemAttrForPtrArgs); + } + } + } + } + } + AttrList = llvm::AttributeList::get( getLLVMContext(), llvm::AttributeSet::get(getLLVMContext(), FuncAttrs), llvm::AttributeSet::get(getLLVMContext(), RetAttrs), ArgAttrs); diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp index 827385f9c1a1f..b76450152203d 100644 --- a/clang/lib/CodeGen/CGCoroutine.cpp +++ b/clang/lib/CodeGen/CGCoroutine.cpp @@ -575,17 +575,19 @@ struct CallCoroEnd final : public EHScopeStack::Cleanup { llvm::Function *CoroEndFn = CGM.getIntrinsic(llvm::Intrinsic::coro_end); // See if we have a funclet bundle to associate coro.end with. (WinEH) auto Bundles = getBundlesForCoroEnd(CGF); - auto *CoroEnd = - CGF.Builder.CreateCall(CoroEndFn, - {NullPtr, CGF.Builder.getTrue(), - llvm::ConstantTokenNone::get(CoroEndFn->getContext())}, - Bundles); + CGF.Builder.CreateCall( + CoroEndFn, + {NullPtr, CGF.Builder.getTrue(), + llvm::ConstantTokenNone::get(CoroEndFn->getContext())}, + Bundles); if (Bundles.empty()) { // Otherwise, (landingpad model), create a conditional branch that leads // either to a cleanup block or a block with EH resume instruction. 
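// Illustration of the ConstructAttributeList change above (hypothetical C
// declarations, not from a test): for functions declared const or pure, pointer
// parameters may now also carry the matching parameter-level memory attribute when
// the ABI introduces extra argument accesses.
__attribute__((pure)) int sum(const int *v, int n);      // 'v' may be marked readonly
__attribute__((const)) int classify(const int *table);   // 'table' may be marked readnone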
auto *ResumeBB = CGF.getEHResumeBlock(/*isCleanup=*/true); auto *CleanupContBB = CGF.createBasicBlock("cleanup.cont"); - CGF.Builder.CreateCondBr(CoroEnd, ResumeBB, CleanupContBB); + auto *CoroIsInRampFn = CGM.getIntrinsic(llvm::Intrinsic::coro_is_in_ramp); + auto *CoroIsInRamp = CGF.Builder.CreateCall(CoroIsInRampFn); + CGF.Builder.CreateCondBr(CoroIsInRamp, CleanupContBB, ResumeBB); CGF.EmitBlock(CleanupContBB); } } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 578d09f7971d6..fee6bc0cbb64b 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -26,6 +26,7 @@ #include "clang/AST/DeclObjC.h" #include "clang/AST/DeclTemplate.h" #include "clang/AST/Expr.h" +#include "clang/AST/LambdaCapture.h" #include "clang/AST/RecordLayout.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/VTableBuilder.h" @@ -1903,46 +1904,61 @@ CGDebugInfo::createInlinedSubprogram(StringRef FuncName, return SP; } +llvm::StringRef +CGDebugInfo::GetLambdaCaptureName(const LambdaCapture &Capture) { + if (Capture.capturesThis()) + return CGM.getCodeGenOpts().EmitCodeView ? "__this" : "this"; + + assert(Capture.capturesVariable()); + + const ValueDecl *CaptureDecl = Capture.getCapturedVar(); + assert(CaptureDecl && "Expected valid decl for captured variable."); + + return CaptureDecl->getName(); +} + void CGDebugInfo::CollectRecordLambdaFields( const CXXRecordDecl *CXXDecl, SmallVectorImpl &elements, llvm::DIType *RecordTy) { // For C++11 Lambdas a Field will be the same as a Capture, but the Capture // has the name and the location of the variable so we should iterate over // both concurrently. - const ASTRecordLayout &layout = CGM.getContext().getASTRecordLayout(CXXDecl); RecordDecl::field_iterator Field = CXXDecl->field_begin(); unsigned fieldno = 0; for (CXXRecordDecl::capture_const_iterator I = CXXDecl->captures_begin(), E = CXXDecl->captures_end(); I != E; ++I, ++Field, ++fieldno) { - const LambdaCapture &C = *I; - if (C.capturesVariable()) { - SourceLocation Loc = C.getLocation(); - assert(!Field->isBitField() && "lambdas don't have bitfield members!"); - ValueDecl *V = C.getCapturedVar(); - StringRef VName = V->getName(); - llvm::DIFile *VUnit = getOrCreateFile(Loc); - auto Align = getDeclAlignIfRequired(V, CGM.getContext()); - llvm::DIType *FieldType = createFieldType( - VName, Field->getType(), Loc, Field->getAccess(), - layout.getFieldOffset(fieldno), Align, VUnit, RecordTy, CXXDecl); - elements.push_back(FieldType); - } else if (C.capturesThis()) { + const LambdaCapture &Capture = *I; + const uint64_t FieldOffset = + CGM.getContext().getASTRecordLayout(CXXDecl).getFieldOffset(fieldno); + + assert(!Field->isBitField() && "lambdas don't have bitfield members!"); + + SourceLocation Loc; + uint32_t Align = 0; + + if (Capture.capturesThis()) { // TODO: Need to handle 'this' in some way by probably renaming the // this of the lambda class and having a field member of 'this' or // by using AT_object_pointer for the function and having that be // used as 'this' for semantic references. - FieldDecl *f = *Field; - llvm::DIFile *VUnit = getOrCreateFile(f->getLocation()); - QualType type = f->getType(); - StringRef ThisName = - CGM.getCodeGenOpts().EmitCodeView ? 
"__this" : "this"; - llvm::DIType *fieldType = createFieldType( - ThisName, type, f->getLocation(), f->getAccess(), - layout.getFieldOffset(fieldno), VUnit, RecordTy, CXXDecl); - - elements.push_back(fieldType); + Loc = Field->getLocation(); + } else if (Capture.capturesVariable()) { + Loc = Capture.getLocation(); + + const ValueDecl *CaptureDecl = Capture.getCapturedVar(); + assert(CaptureDecl && "Expected valid decl for captured variable."); + + Align = getDeclAlignIfRequired(CaptureDecl, CGM.getContext()); + } else { + continue; } + + llvm::DIFile *VUnit = getOrCreateFile(Loc); + + elements.push_back(createFieldType( + GetLambdaCaptureName(Capture), Field->getType(), Loc, + Field->getAccess(), FieldOffset, Align, VUnit, RecordTy, CXXDecl)); } } @@ -2657,12 +2673,22 @@ StringRef CGDebugInfo::getVTableName(const CXXRecordDecl *RD) { // existing information in the DWARF. The type is assumed to be 'void *'. void CGDebugInfo::emitVTableSymbol(llvm::GlobalVariable *VTable, const CXXRecordDecl *RD) { - if (!CGM.getTarget().getCXXABI().isItaniumFamily() || - CGM.getTarget().getTriple().isOSBinFormatCOFF()) + if (!CGM.getTarget().getCXXABI().isItaniumFamily()) return; if (DebugKind <= llvm::codegenoptions::DebugLineTablesOnly) return; + // On COFF platform, we shouldn't emit a reference to an external entity (i.e. + // VTable) into debug info, which is constructed within a discardable section. + // If that entity ends up implicitly dllimported from another DLL, the linker + // may produce a runtime pseudo-relocation for it (BFD-ld only. LLD prohibits + // to emit such relocation). If the debug section is stripped, the runtime + // pseudo-relocation points to memory space outside of the module, causing an + // access violation. + if (CGM.getTarget().getTriple().isOSBinFormatCOFF() && + VTable->isDeclarationForLinker()) + return; + ASTContext &Context = CGM.getContext(); StringRef SymbolName = "_vtable$"; SourceLocation Loc; diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h index f86077369a42a..78c3eb9c5792e 100644 --- a/clang/lib/CodeGen/CGDebugInfo.h +++ b/clang/lib/CodeGen/CGDebugInfo.h @@ -397,6 +397,7 @@ class CGDebugInfo { void CollectRecordFields(const RecordDecl *Decl, llvm::DIFile *F, SmallVectorImpl &E, llvm::DICompositeType *RecordTy); + llvm::StringRef GetLambdaCaptureName(const LambdaCapture &Capture); /// If the C++ class has vtable info then insert appropriate debug /// info entry in EltTys vector. diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 4fa25c5d66669..f319b176513f8 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -3672,17 +3672,19 @@ Value *ScalarExprEmitter::VisitReal(const UnaryOperator *E, // If it's an l-value, load through the appropriate subobject l-value. // Note that we have to ask E because Op might be an l-value that // this won't work for, e.g. an Obj-C property. - if (E->isGLValue()) { + if (E->isGLValue()) { if (!PromotionType.isNull()) { CodeGenFunction::ComplexPairTy result = CGF.EmitComplexExpr( Op, /*IgnoreReal*/ IgnoreResultAssign, /*IgnoreImag*/ true); - if (result.first) - result.first = CGF.EmitPromotedValue(result, PromotionType).first; - return result.first; - } else { - return CGF.EmitLoadOfLValue(CGF.EmitLValue(E), E->getExprLoc()) - .getScalarVal(); + PromotionType = PromotionType->isAnyComplexType() + ? PromotionType + : CGF.getContext().getComplexType(PromotionType); + return result.first ? 
CGF.EmitPromotedValue(result, PromotionType).first + : result.first; } + + return CGF.EmitLoadOfLValue(CGF.EmitLValue(E), E->getExprLoc()) + .getScalarVal(); } // Otherwise, calculate and project. return CGF.EmitComplexExpr(Op, false, true).first; @@ -3715,13 +3717,16 @@ Value *ScalarExprEmitter::VisitImag(const UnaryOperator *E, if (!PromotionType.isNull()) { CodeGenFunction::ComplexPairTy result = CGF.EmitComplexExpr( Op, /*IgnoreReal*/ true, /*IgnoreImag*/ IgnoreResultAssign); - if (result.second) - result.second = CGF.EmitPromotedValue(result, PromotionType).second; - return result.second; - } else { - return CGF.EmitLoadOfLValue(CGF.EmitLValue(E), E->getExprLoc()) - .getScalarVal(); + PromotionType = PromotionType->isAnyComplexType() + ? PromotionType + : CGF.getContext().getComplexType(PromotionType); + return result.second + ? CGF.EmitPromotedValue(result, PromotionType).second + : result.second; } + + return CGF.EmitLoadOfLValue(CGF.EmitLValue(E), E->getExprLoc()) + .getScalarVal(); } // Otherwise, calculate and project. return CGF.EmitComplexExpr(Op, true, false).second; diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index 7b5b924b1fe82..6c0fc8d7f07be 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -352,6 +352,13 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, SmallVector<Value *> Args{OrderID, SpaceOp, RangeOp, IndexOp, Name}; return Builder.CreateIntrinsic(HandleTy, IntrinsicID, Args); } + case Builtin::BI__builtin_hlsl_resource_nonuniformindex: { + Value *IndexOp = EmitScalarExpr(E->getArg(0)); + llvm::Type *RetTy = ConvertType(E->getType()); + return Builder.CreateIntrinsic( + RetTy, CGM.getHLSLRuntime().getNonUniformResourceIndexIntrinsic(), + ArrayRef<Value *>{IndexOp}); + } case Builtin::BI__builtin_hlsl_all: { Value *Op0 = EmitScalarExpr(E->getArg(0)); return Builder.CreateIntrinsic( @@ -540,6 +547,21 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, retType, CGM.getHLSLRuntime().getIsInfIntrinsic(), ArrayRef<Value *>{Op0}, nullptr, "hlsl.isinf"); } + case Builtin::BI__builtin_hlsl_elementwise_isnan: { + Value *Op0 = EmitScalarExpr(E->getArg(0)); + llvm::Type *Xty = Op0->getType(); + llvm::Type *retType = llvm::Type::getInt1Ty(this->getLLVMContext()); + if (Xty->isVectorTy()) { + auto *XVecTy = E->getArg(0)->getType()->castAs<VectorType>(); + retType = llvm::VectorType::get( + retType, ElementCount::getFixed(XVecTy->getNumElements())); + } + if (!E->getArg(0)->getType()->hasFloatingRepresentation()) + llvm_unreachable("isnan operand must have a float representation"); + return Builder.CreateIntrinsic( + retType, CGM.getHLSLRuntime().getIsNaNIntrinsic(), + ArrayRef<Value *>{Op0}, nullptr, "hlsl.isnan"); + } case Builtin::BI__builtin_hlsl_mad: { Value *M = EmitScalarExpr(E->getArg(0)); Value *A = EmitScalarExpr(E->getArg(1)); diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index 370f3d5c5d30d..9c0e6056fd4ee 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -95,6 +95,7 @@ class CGHLSLRuntime { GENERATE_HLSL_INTRINSIC_FUNCTION(FlattenedThreadIdInGroup, flattened_thread_id_in_group) GENERATE_HLSL_INTRINSIC_FUNCTION(IsInf, isinf) + GENERATE_HLSL_INTRINSIC_FUNCTION(IsNaN, isnan) GENERATE_HLSL_INTRINSIC_FUNCTION(Lerp, lerp) GENERATE_HLSL_INTRINSIC_FUNCTION(Normalize, normalize) GENERATE_HLSL_INTRINSIC_FUNCTION(Rsqrt, rsqrt) @@ -129,6 +130,8 @@ class CGHLSLRuntime { resource_handlefrombinding)
GENERATE_HLSL_INTRINSIC_FUNCTION(CreateHandleFromImplicitBinding, resource_handlefromimplicitbinding) + GENERATE_HLSL_INTRINSIC_FUNCTION(NonUniformResourceIndex, + resource_nonuniformindex) GENERATE_HLSL_INTRINSIC_FUNCTION(BufferUpdateCounter, resource_updatecounter) GENERATE_HLSL_INTRINSIC_FUNCTION(GroupMemoryBarrierWithGroupSync, group_memory_barrier_with_group_sync) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index a503aaf613e30..75bde3f72c4c2 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1542,15 +1542,14 @@ static llvm::TargetRegionEntryInfo getEntryInfoFromPresumedLoc( SourceManager &SM = CGM.getContext().getSourceManager(); PresumedLoc PLoc = SM.getPresumedLoc(BeginLoc); - llvm::sys::fs::UniqueID ID; - if (llvm::sys::fs::getUniqueID(PLoc.getFilename(), ID)) { + if (CGM.getFileSystem()->exists(PLoc.getFilename())) PLoc = SM.getPresumedLoc(BeginLoc, /*UseLineDirectives=*/false); - } return std::pair(PLoc.getFilename(), PLoc.getLine()); }; - return OMPBuilder.getTargetEntryUniqueInfo(FileInfoCallBack, ParentName); + return OMPBuilder.getTargetEntryUniqueInfo(FileInfoCallBack, + *CGM.getFileSystem(), ParentName); } ConstantAddress CGOpenMPRuntime::getAddrOfDeclareTargetVar(const VarDecl *VD) { @@ -2703,7 +2702,8 @@ llvm::Value *CGOpenMPRuntime::emitForNext(CodeGenFunction &CGF, } llvm::Value *CGOpenMPRuntime::emitMessageClause(CodeGenFunction &CGF, - const Expr *Message) { + const Expr *Message, + SourceLocation Loc) { if (!Message) return llvm::ConstantPointerNull::get(CGF.VoidPtrTy); return CGF.EmitScalarExpr(Message); @@ -2713,11 +2713,13 @@ llvm::Value * CGOpenMPRuntime::emitMessageClause(CodeGenFunction &CGF, const OMPMessageClause *MessageClause) { return emitMessageClause( - CGF, MessageClause ? MessageClause->getMessageString() : nullptr); + CGF, MessageClause ? MessageClause->getMessageString() : nullptr, + MessageClause->getBeginLoc()); } llvm::Value * -CGOpenMPRuntime::emitSeverityClause(OpenMPSeverityClauseKind Severity) { +CGOpenMPRuntime::emitSeverityClause(OpenMPSeverityClauseKind Severity, + SourceLocation Loc) { // OpenMP 6.0, 10.4: "If no severity clause is specified then the effect is // as if sev-level is fatal." return llvm::ConstantInt::get(CGM.Int32Ty, @@ -2727,13 +2729,15 @@ CGOpenMPRuntime::emitSeverityClause(OpenMPSeverityClauseKind Severity) { llvm::Value * CGOpenMPRuntime::emitSeverityClause(const OMPSeverityClause *SeverityClause) { return emitSeverityClause(SeverityClause ? 
SeverityClause->getSeverityKind() - : OMPC_SEVERITY_unknown); + : OMPC_SEVERITY_unknown, + SeverityClause->getBeginLoc()); } void CGOpenMPRuntime::emitNumThreadsClause( CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc, OpenMPNumThreadsClauseModifier Modifier, OpenMPSeverityClauseKind Severity, - const Expr *Message) { + SourceLocation SeverityLoc, const Expr *Message, + SourceLocation MessageLoc) { if (!CGF.HaveInsertPoint()) return; llvm::SmallVector Args( @@ -2745,8 +2749,8 @@ void CGOpenMPRuntime::emitNumThreadsClause( RuntimeFunction FnID = OMPRTL___kmpc_push_num_threads; if (Modifier == OMPC_NUMTHREADS_strict) { FnID = OMPRTL___kmpc_push_num_threads_strict; - Args.push_back(emitSeverityClause(Severity)); - Args.push_back(emitMessageClause(CGF, Message)); + Args.push_back(emitSeverityClause(Severity, SeverityLoc)); + Args.push_back(emitMessageClause(CGF, Message, MessageLoc)); } CGF.EmitRuntimeCall( OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), FnID), Args); @@ -6799,6 +6803,240 @@ LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); // code for that information. class MappableExprsHandler { public: + /// Custom comparator for attach-pointer expressions that compares them by + /// complexity (i.e. their component-depth) first, then by the order in which + /// they were computed by collectAttachPtrExprInfo(), if they are semantically + /// different. + struct AttachPtrExprComparator { + const MappableExprsHandler *Handler = nullptr; + // Cache of previous equality comparison results. + mutable llvm::DenseMap, bool> + CachedEqualityComparisons; + + AttachPtrExprComparator(const MappableExprsHandler *H) : Handler(H) {} + + // Return true iff LHS is "less than" RHS. + bool operator()(const Expr *LHS, const Expr *RHS) const { + if (LHS == RHS) + return false; + + // First, compare by complexity (depth) + const auto ItLHS = Handler->AttachPtrComponentDepthMap.find(LHS); + const auto ItRHS = Handler->AttachPtrComponentDepthMap.find(RHS); + + std::optional DepthLHS = + (ItLHS != Handler->AttachPtrComponentDepthMap.end()) ? ItLHS->second + : std::nullopt; + std::optional DepthRHS = + (ItRHS != Handler->AttachPtrComponentDepthMap.end()) ? ItRHS->second + : std::nullopt; + + // std::nullopt (no attach pointer) has lowest complexity + if (!DepthLHS.has_value() && !DepthRHS.has_value()) { + // Both have same complexity, now check semantic equality + if (areEqual(LHS, RHS)) + return false; + // Different semantically, compare by computation order + return wasComputedBefore(LHS, RHS); + } + if (!DepthLHS.has_value()) + return true; // LHS has lower complexity + if (!DepthRHS.has_value()) + return false; // RHS has lower complexity + + // Both have values, compare by depth (lower depth = lower complexity) + if (DepthLHS.value() != DepthRHS.value()) + return DepthLHS.value() < DepthRHS.value(); + + // Same complexity, now check semantic equality + if (areEqual(LHS, RHS)) + return false; + // Different semantically, compare by computation order + return wasComputedBefore(LHS, RHS); + } + + public: + /// Return true if \p LHS and \p RHS are semantically equal. Uses pre-cached + /// results, if available, otherwise does a recursive semantic comparison. 
+ bool areEqual(const Expr *LHS, const Expr *RHS) const { + // Check cache first for faster lookup + const auto CachedResultIt = CachedEqualityComparisons.find({LHS, RHS}); + if (CachedResultIt != CachedEqualityComparisons.end()) + return CachedResultIt->second; + + bool ComparisonResult = areSemanticallyEqual(LHS, RHS); + + // Cache the result for future lookups (both orders since semantic + // equality is commutative) + CachedEqualityComparisons[{LHS, RHS}] = ComparisonResult; + CachedEqualityComparisons[{RHS, LHS}] = ComparisonResult; + return ComparisonResult; + } + + /// Compare the two attach-ptr expressions by their computation order. + /// Returns true iff LHS was computed before RHS by + /// collectAttachPtrExprInfo(). + bool wasComputedBefore(const Expr *LHS, const Expr *RHS) const { + const size_t &OrderLHS = Handler->AttachPtrComputationOrderMap.at(LHS); + const size_t &OrderRHS = Handler->AttachPtrComputationOrderMap.at(RHS); + + return OrderLHS < OrderRHS; + } + + private: + /// Helper function to compare attach-pointer expressions semantically. + /// This function handles various expression types that can be part of an + /// attach-pointer. + /// TODO: Not urgent, but we should ideally return true when comparing + /// `p[10]`, `*(p + 10)`, `*(p + 5 + 5)`, `p[10:1]` etc. + bool areSemanticallyEqual(const Expr *LHS, const Expr *RHS) const { + if (LHS == RHS) + return true; + + // If only one is null, they aren't equal + if (!LHS || !RHS) + return false; + + ASTContext &Ctx = Handler->CGF.getContext(); + // Strip away parentheses and no-op casts to get to the core expression + LHS = LHS->IgnoreParenNoopCasts(Ctx); + RHS = RHS->IgnoreParenNoopCasts(Ctx); + + // Direct pointer comparison of the underlying expressions + if (LHS == RHS) + return true; + + // Check if the expression classes match + if (LHS->getStmtClass() != RHS->getStmtClass()) + return false; + + // Handle DeclRefExpr (variable references) + if (const auto *LD = dyn_cast<DeclRefExpr>(LHS)) { + const auto *RD = dyn_cast<DeclRefExpr>(RHS); + if (!RD) + return false; + return LD->getDecl()->getCanonicalDecl() == + RD->getDecl()->getCanonicalDecl(); + } + + // Handle ArraySubscriptExpr (array indexing like a[i]) + if (const auto *LA = dyn_cast<ArraySubscriptExpr>(LHS)) { + const auto *RA = dyn_cast<ArraySubscriptExpr>(RHS); + if (!RA) + return false; + return areSemanticallyEqual(LA->getBase(), RA->getBase()) && + areSemanticallyEqual(LA->getIdx(), RA->getIdx()); + } + + // Handle MemberExpr (member access like s.m or p->m) + if (const auto *LM = dyn_cast<MemberExpr>(LHS)) { + const auto *RM = dyn_cast<MemberExpr>(RHS); + if (!RM) + return false; + if (LM->getMemberDecl()->getCanonicalDecl() != + RM->getMemberDecl()->getCanonicalDecl()) + return false; + return areSemanticallyEqual(LM->getBase(), RM->getBase()); + } + + // Handle UnaryOperator (unary operations like *p, &x, etc.) + if (const auto *LU = dyn_cast<UnaryOperator>(LHS)) { + const auto *RU = dyn_cast<UnaryOperator>(RHS); + if (!RU) + return false; + if (LU->getOpcode() != RU->getOpcode()) + return false; + return areSemanticallyEqual(LU->getSubExpr(), RU->getSubExpr()); + } + + // Handle BinaryOperator (binary operations like p + offset) + if (const auto *LB = dyn_cast<BinaryOperator>(LHS)) { + const auto *RB = dyn_cast<BinaryOperator>(RHS); + if (!RB) + return false; + if (LB->getOpcode() != RB->getOpcode()) + return false; + return areSemanticallyEqual(LB->getLHS(), RB->getLHS()) && + areSemanticallyEqual(LB->getRHS(), RB->getRHS()); + } + + // Handle ArraySectionExpr (array sections like a[0:1]) + // Attach pointers should not contain array-sections, but currently we + // don't emit an error.
+ if (const auto *LAS = dyn_cast<ArraySectionExpr>(LHS)) { + const auto *RAS = dyn_cast<ArraySectionExpr>(RHS); + if (!RAS) + return false; + return areSemanticallyEqual(LAS->getBase(), RAS->getBase()) && + areSemanticallyEqual(LAS->getLowerBound(), + RAS->getLowerBound()) && + areSemanticallyEqual(LAS->getLength(), RAS->getLength()); + } + + // Handle CastExpr (explicit casts) + if (const auto *LC = dyn_cast<CastExpr>(LHS)) { + const auto *RC = dyn_cast<CastExpr>(RHS); + if (!RC) + return false; + if (LC->getCastKind() != RC->getCastKind()) + return false; + return areSemanticallyEqual(LC->getSubExpr(), RC->getSubExpr()); + } + + // Handle CXXThisExpr (this pointer) + if (isa<CXXThisExpr>(LHS) && isa<CXXThisExpr>(RHS)) + return true; + + // Handle IntegerLiteral (integer constants) + if (const auto *LI = dyn_cast<IntegerLiteral>(LHS)) { + const auto *RI = dyn_cast<IntegerLiteral>(RHS); + if (!RI) + return false; + return LI->getValue() == RI->getValue(); + } + + // Handle CharacterLiteral (character constants) + if (const auto *LC = dyn_cast<CharacterLiteral>(LHS)) { + const auto *RC = dyn_cast<CharacterLiteral>(RHS); + if (!RC) + return false; + return LC->getValue() == RC->getValue(); + } + + // Handle FloatingLiteral (floating point constants) + if (const auto *LF = dyn_cast<FloatingLiteral>(LHS)) { + const auto *RF = dyn_cast<FloatingLiteral>(RHS); + if (!RF) + return false; + // Use bitwise comparison for floating point literals + return LF->getValue().bitwiseIsEqual(RF->getValue()); + } + + // Handle StringLiteral (string constants) + if (const auto *LS = dyn_cast<StringLiteral>(LHS)) { + const auto *RS = dyn_cast<StringLiteral>(RHS); + if (!RS) + return false; + return LS->getString() == RS->getString(); + } + + // Handle CXXNullPtrLiteralExpr (nullptr) + if (isa<CXXNullPtrLiteralExpr>(LHS) && isa<CXXNullPtrLiteralExpr>(RHS)) + return true; + + // Handle CXXBoolLiteralExpr (true/false) + if (const auto *LB = dyn_cast<CXXBoolLiteralExpr>(LHS)) { + const auto *RB = dyn_cast<CXXBoolLiteralExpr>(RHS); + if (!RB) + return false; + return LB->getValue() == RB->getValue(); + } + + // Fallback for other forms - use the existing comparison method + return Expr::isSameComparisonOperand(LHS, RHS); + } + }; + /// Get the offset of the OMP_MAP_MEMBER_OF field. static unsigned getFlagMemberOffset() { unsigned Offset = 0; @@ -6876,6 +7114,45 @@ class MappableExprsHandler { bool HasCompleteRecord = false; }; + /// A struct to store the attach pointer and pointee information, to be used + /// when emitting an attach entry. + struct AttachInfoTy { + Address AttachPtrAddr = Address::invalid(); + Address AttachPteeAddr = Address::invalid(); + const ValueDecl *AttachPtrDecl = nullptr; + const Expr *AttachMapExpr = nullptr; + + bool isValid() const { + return AttachPtrAddr.isValid() && AttachPteeAddr.isValid(); + } + }; + + /// Check if there's any component list where the attach pointer expression + /// matches the given captured variable. + bool hasAttachEntryForCapturedVar(const ValueDecl *VD) const { + for (const auto &AttachEntry : AttachPtrExprMap) { + if (AttachEntry.second) { + // Check if the attach pointer expression is a DeclRefExpr that + // references the captured variable + if (const auto *DRE = dyn_cast<DeclRefExpr>(AttachEntry.second)) + if (DRE->getDecl() == VD) + return true; + } + } + return false; + } + + /// Get the previously-cached attach pointer for a component list, if-any. + const Expr *getAttachPtrExpr( + OMPClauseMappableExprCommon::MappableExprComponentListRef Components) + const { + const auto It = AttachPtrExprMap.find(Components); + if (It != AttachPtrExprMap.end()) + return It->second; + + return nullptr; + } + private: /// Kind that defines how a device pointer has to be returned.
struct MapInfo { @@ -6948,6 +7225,27 @@ class MappableExprsHandler { /// Map between lambda declarations and their map type. llvm::DenseMap LambdasMap; + /// Map from component lists to their attach pointer expressions. + llvm::DenseMap + AttachPtrExprMap; + + /// Map from attach pointer expressions to their component depth. + /// nullptr key has std::nullopt depth. This can be used to order attach-ptr + /// expressions with increasing/decreasing depth. + /// The component-depth of `nullptr` (i.e. no attach-ptr) is `std::nullopt`. + /// TODO: Not urgent, but we should ideally use the number of pointer + /// dereferences in an expr as an indicator of its complexity, instead of the + /// component-depth. That would be needed for us to treat `p[1]`, `*(p + 10)`, + /// `*(p + 5 + 5)` together. + llvm::DenseMap> + AttachPtrComponentDepthMap = {{nullptr, std::nullopt}}; + + /// Map from attach pointer expressions to the order they were computed in, in + /// collectAttachPtrExprInfo(). + llvm::DenseMap AttachPtrComputationOrderMap = { + {nullptr, 0}}; + llvm::Value *getExprTypeSize(const Expr *E) const { QualType ExprTy = E->getType().getCanonicalType(); @@ -8167,6 +8465,103 @@ class MappableExprsHandler { } } + /// Returns the address corresponding to \p PointerExpr. + static Address getAttachPtrAddr(const Expr *PointerExpr, + CodeGenFunction &CGF) { + assert(PointerExpr && "Cannot get addr from null attach-ptr expr"); + Address AttachPtrAddr = Address::invalid(); + + if (auto *DRE = dyn_cast(PointerExpr)) { + // If the pointer is a variable, we can use its address directly. + AttachPtrAddr = CGF.EmitLValue(DRE).getAddress(); + } else if (auto *OASE = dyn_cast(PointerExpr)) { + AttachPtrAddr = + CGF.EmitArraySectionExpr(OASE, /*IsLowerBound=*/true).getAddress(); + } else if (auto *ASE = dyn_cast(PointerExpr)) { + AttachPtrAddr = CGF.EmitLValue(ASE).getAddress(); + } else if (auto *ME = dyn_cast(PointerExpr)) { + AttachPtrAddr = CGF.EmitMemberExpr(ME).getAddress(); + } else if (auto *UO = dyn_cast(PointerExpr)) { + assert(UO->getOpcode() == UO_Deref && + "Unexpected unary-operator on attach-ptr-expr"); + AttachPtrAddr = CGF.EmitLValue(UO).getAddress(); + } + assert(AttachPtrAddr.isValid() && + "Failed to get address for attach pointer expression"); + return AttachPtrAddr; + } + + /// Get the address of the attach pointer, and a load from it, to get the + /// pointee base address. + /// \return A pair containing AttachPtrAddr and AttachPteeBaseAddr. The pair + /// contains invalid addresses if \p AttachPtrExpr is null. + static std::pair + getAttachPtrAddrAndPteeBaseAddr(const Expr *AttachPtrExpr, + CodeGenFunction &CGF) { + + if (!AttachPtrExpr) + return {Address::invalid(), Address::invalid()}; + + Address AttachPtrAddr = getAttachPtrAddr(AttachPtrExpr, CGF); + assert(AttachPtrAddr.isValid() && "Invalid attach pointer addr"); + + QualType AttachPtrType = + OMPClauseMappableExprCommon::getComponentExprElementType(AttachPtrExpr) + .getCanonicalType(); + + Address AttachPteeBaseAddr = CGF.EmitLoadOfPointer( + AttachPtrAddr, AttachPtrType->castAs()); + assert(AttachPteeBaseAddr.isValid() && "Invalid attach pointee base addr"); + + return {AttachPtrAddr, AttachPteeBaseAddr}; + } + + /// Returns whether an attach entry should be emitted for a map on + /// \p MapBaseDecl on the directive \p CurDir. 
+ static bool + shouldEmitAttachEntry(const Expr *PointerExpr, const ValueDecl *MapBaseDecl, + CodeGenFunction &CGF, + llvm::PointerUnion + CurDir) { + if (!PointerExpr) + return false; + + // Pointer attachment is needed at map-entering time or for declare + // mappers. + return isa(CurDir) || + isOpenMPTargetMapEnteringDirective( + cast(CurDir) + ->getDirectiveKind()); + } + + /// Computes the attach-ptr expr for \p Components, and updates various maps + /// with the information. + /// It internally calls OMPClauseMappableExprCommon::findAttachPtrExpr() + /// with the OpenMPDirectiveKind extracted from \p CurDir. + /// It updates AttachPtrComputationOrderMap, AttachPtrComponentDepthMap, and + /// AttachPtrExprMap. + void collectAttachPtrExprInfo( + OMPClauseMappableExprCommon::MappableExprComponentListRef Components, + llvm::PointerUnion + CurDir) { + + OpenMPDirectiveKind CurDirectiveID = + isa(CurDir) + ? OMPD_declare_mapper + : cast(CurDir)->getDirectiveKind(); + + const auto &[AttachPtrExpr, Depth] = + OMPClauseMappableExprCommon::findAttachPtrExpr(Components, + CurDirectiveID); + + AttachPtrComputationOrderMap.try_emplace( + AttachPtrExpr, AttachPtrComputationOrderMap.size()); + AttachPtrComponentDepthMap.try_emplace(AttachPtrExpr, Depth); + AttachPtrExprMap.try_emplace(Components, AttachPtrExpr); + } + /// Generate all the base pointers, section pointers, sizes, map types, and /// mappers for the extracted mappable expressions (all included in \a /// CombinedInfo). Also, for each item that relates with a device pointer, a @@ -12263,7 +12658,8 @@ llvm::Value *CGOpenMPSIMDRuntime::emitForNext(CodeGenFunction &CGF, void CGOpenMPSIMDRuntime::emitNumThreadsClause( CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc, OpenMPNumThreadsClauseModifier Modifier, OpenMPSeverityClauseKind Severity, - const Expr *Message) { + SourceLocation SeverityLoc, const Expr *Message, + SourceLocation MessageLoc) { llvm_unreachable("Not supported in SIMD-only mode"); } diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index eb04eceee236c..ba76ba6b5f523 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -1049,11 +1049,13 @@ class CGOpenMPRuntime { Address UB, Address ST); virtual llvm::Value *emitMessageClause(CodeGenFunction &CGF, - const Expr *Message); + const Expr *Message, + SourceLocation Loc); virtual llvm::Value *emitMessageClause(CodeGenFunction &CGF, const OMPMessageClause *MessageClause); - virtual llvm::Value *emitSeverityClause(OpenMPSeverityClauseKind Severity); + virtual llvm::Value *emitSeverityClause(OpenMPSeverityClauseKind Severity, + SourceLocation Loc); virtual llvm::Value * emitSeverityClause(const OMPSeverityClause *SeverityClause); @@ -1069,7 +1071,9 @@ class CGOpenMPRuntime { CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc, OpenMPNumThreadsClauseModifier Modifier = OMPC_NUMTHREADS_unknown, OpenMPSeverityClauseKind Severity = OMPC_SEVERITY_fatal, - const Expr *Message = nullptr); + SourceLocation SeverityLoc = SourceLocation(), + const Expr *Message = nullptr, + SourceLocation MessageLoc = SourceLocation()); /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 /// global_tid, int proc_bind) to generate code for 'proc_bind' clause. 
@@ -1956,7 +1960,9 @@ class CGOpenMPSIMDRuntime final : public CGOpenMPRuntime { CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc, OpenMPNumThreadsClauseModifier Modifier = OMPC_NUMTHREADS_unknown, OpenMPSeverityClauseKind Severity = OMPC_SEVERITY_fatal, - const Expr *Message = nullptr) override; + SourceLocation SeverityLoc = SourceLocation(), + const Expr *Message = nullptr, + SourceLocation MessageLoc = SourceLocation()) override; /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 /// global_tid, int proc_bind) to generate code for 'proc_bind' clause. diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 44a091e1b3c75..4272d8b1a1f51 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -899,10 +899,34 @@ void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF, // Nothing to do. } +llvm::Value *CGOpenMPRuntimeGPU::emitMessageClause(CodeGenFunction &CGF, + const Expr *Message, + SourceLocation Loc) { + CGM.getDiags().Report(Loc, diag::warn_omp_gpu_unsupported_clause) + << getOpenMPClauseName(OMPC_message); + return nullptr; +} + +llvm::Value * +CGOpenMPRuntimeGPU::emitSeverityClause(OpenMPSeverityClauseKind Severity, + SourceLocation Loc) { + CGM.getDiags().Report(Loc, diag::warn_omp_gpu_unsupported_clause) + << getOpenMPClauseName(OMPC_severity); + return nullptr; +} + void CGOpenMPRuntimeGPU::emitNumThreadsClause( CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc, OpenMPNumThreadsClauseModifier Modifier, OpenMPSeverityClauseKind Severity, - const Expr *Message) { + SourceLocation SeverityLoc, const Expr *Message, + SourceLocation MessageLoc) { + if (Modifier == OMPC_NUMTHREADS_strict) { + CGM.getDiags().Report(Loc, + diag::warn_omp_gpu_unsupported_modifier_for_clause) + << "strict" << getOpenMPClauseName(OMPC_num_threads); + return; + } + // Nothing to do. } diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h index 665221b7d7890..810d6aa082156 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -162,6 +162,14 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime { llvm::omp::ProcBindKind ProcBind, SourceLocation Loc) override; + // Currently unsupported on the device. + llvm::Value *emitMessageClause(CodeGenFunction &CGF, const Expr *Message, + SourceLocation Loc) override; + + // Currently unsupported on the device. + virtual llvm::Value *emitSeverityClause(OpenMPSeverityClauseKind Severity, + SourceLocation Loc) override; + /// Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 /// global_tid, kmp_int32 num_threads) to generate code for 'num_threads' /// clause. @@ -169,7 +177,9 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime { CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc, OpenMPNumThreadsClauseModifier Modifier = OMPC_NUMTHREADS_unknown, OpenMPSeverityClauseKind Severity = OMPC_SEVERITY_fatal, - const Expr *Message = nullptr) override; + SourceLocation SeverityLoc = SourceLocation(), + const Expr *Message = nullptr, + SourceLocation MessageLoc = SourceLocation()) override; /// This function ought to emit, in the general case, a call to // the openmp runtime kmpc_push_num_teams. 
In NVPTX backend it is not needed diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index aeff73d525c10..92636f27fd4e5 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -234,6 +234,9 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef Attrs) { case Stmt::OMPInterchangeDirectiveClass: EmitOMPInterchangeDirective(cast(*S)); break; + case Stmt::OMPFuseDirectiveClass: + EmitOMPFuseDirective(cast(*S)); + break; case Stmt::OMPForDirectiveClass: EmitOMPForDirective(cast(*S)); break; @@ -1291,7 +1294,9 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S, ArrayRef ForAttrs) { JumpDest LoopExit = getJumpDestInCurrentScope("for.end"); - LexicalScope ForScope(*this, S.getSourceRange()); + std::optional ForScope; + if (getLangOpts().C99 || getLangOpts().CPlusPlus) + ForScope.emplace(*this, S.getSourceRange()); // Evaluate the first part before the loop. if (S.getInit()) @@ -1350,7 +1355,7 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S, llvm::BasicBlock *ExitBlock = LoopExit.getBlock(); // If there are any cleanups between here and the loop-exit scope, // create a block to stage a loop exit along. - if (ForScope.requiresCleanups()) + if (ForScope && ForScope->requiresCleanups()) ExitBlock = createBasicBlock("for.cond.cleanup"); // As long as the condition is true, iterate the loop. @@ -1419,7 +1424,8 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S, EmitStopPoint(&S); EmitBranch(CondBlock); - ForScope.ForceCleanup(); + if (ForScope) + ForScope->ForceCleanup(); LoopStack.pop(); @@ -2468,56 +2474,6 @@ void CodeGenFunction::EmitSwitchStmt(const SwitchStmt &S) { CaseRangeBlock = SavedCRBlock; } -static std::string -SimplifyConstraint(const char *Constraint, const TargetInfo &Target, - SmallVectorImpl *OutCons=nullptr) { - std::string Result; - - while (*Constraint) { - switch (*Constraint) { - default: - Result += Target.convertConstraint(Constraint); - break; - // Ignore these - case '*': - case '?': - case '!': - case '=': // Will see this and the following in mult-alt constraints. - case '+': - break; - case '#': // Ignore the rest of the constraint alternative. - while (Constraint[1] && Constraint[1] != ',') - Constraint++; - break; - case '&': - case '%': - Result += *Constraint; - while (Constraint[1] && Constraint[1] == *Constraint) - Constraint++; - break; - case ',': - Result += "|"; - break; - case 'g': - Result += "imr"; - break; - case '[': { - assert(OutCons && - "Must pass output names to constraints with a symbolic name"); - unsigned Index; - bool result = Target.resolveSymbolicName(Constraint, *OutCons, Index); - assert(result && "Could not resolve symbolic name"); (void)result; - Result += llvm::utostr(Index); - break; - } - } - - Constraint++; - } - - return Result; -} - /// AddVariableConstraints - Look at AsmExpr and if it is a variable declared /// as using a particular register add that as a constraint that will be used /// in this asm stmt. @@ -2896,8 +2852,8 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { // Simplify the output constraint. 
std::string OutputConstraint(S.getOutputConstraint(i)); - OutputConstraint = SimplifyConstraint(OutputConstraint.c_str() + 1, - getTarget(), &OutputConstraintInfos); + OutputConstraint = getTarget().simplifyConstraint( + StringRef(OutputConstraint).substr(1), &OutputConstraintInfos); const Expr *OutExpr = S.getOutputExpr(i); OutExpr = OutExpr->IgnoreParenNoopCasts(getContext()); @@ -3059,8 +3015,8 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { // Simplify the input constraint. std::string InputConstraint(S.getInputConstraint(i)); - InputConstraint = SimplifyConstraint(InputConstraint.c_str(), getTarget(), - &OutputConstraintInfos); + InputConstraint = + getTarget().simplifyConstraint(InputConstraint, &OutputConstraintInfos); InputConstraint = AddVariableConstraints( InputConstraint, *InputExpr->IgnoreParenNoopCasts(getContext()), diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index d72cd8fbfd608..efc06a276267a 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -201,6 +201,24 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope { } else { llvm_unreachable("Unknown loop-based directive kind."); } + doEmitPreinits(PreInits); + PreCondVars.restore(CGF); + } + + void + emitPreInitStmt(CodeGenFunction &CGF, + const OMPCanonicalLoopSequenceTransformationDirective &S) { + const Stmt *PreInits; + if (const auto *Fuse = dyn_cast(&S)) { + PreInits = Fuse->getPreInits(); + } else { + llvm_unreachable( + "Unknown canonical loop sequence transform directive kind."); + } + doEmitPreinits(PreInits); + } + + void doEmitPreinits(const Stmt *PreInits) { if (PreInits) { // CompoundStmts and DeclStmts are used as lists of PreInit statements and // declarations. Since declarations must be visible in the the following @@ -222,7 +240,6 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope { CGF.EmitStmt(S); } } - PreCondVars.restore(CGF); } public: @@ -230,6 +247,11 @@ class OMPLoopScope : public CodeGenFunction::RunCleanupsScope { : CodeGenFunction::RunCleanupsScope(CGF) { emitPreInitStmt(CGF, S); } + OMPLoopScope(CodeGenFunction &CGF, + const OMPCanonicalLoopSequenceTransformationDirective &S) + : CodeGenFunction::RunCleanupsScope(CGF) { + emitPreInitStmt(CGF, S); + } }; class OMPSimdLexicalScope : public CodeGenFunction::LexicalScope { @@ -1622,22 +1644,30 @@ static void emitCommonOMPParallelDirective( // if sev-level is fatal." 
OpenMPSeverityClauseKind Severity = OMPC_SEVERITY_fatal; clang::Expr *Message = nullptr; + SourceLocation SeverityLoc = SourceLocation(); + SourceLocation MessageLoc = SourceLocation(); + llvm::Function *OutlinedFn = CGF.CGM.getOpenMPRuntime().emitParallelOutlinedFunction( CGF, S, *CS->getCapturedDecl()->param_begin(), InnermostKind, CodeGen); + if (const auto *NumThreadsClause = S.getSingleClause()) { CodeGenFunction::RunCleanupsScope NumThreadsScope(CGF); NumThreads = CGF.EmitScalarExpr(NumThreadsClause->getNumThreads(), /*IgnoreResultAssign=*/true); Modifier = NumThreadsClause->getModifier(); - if (const auto *MessageClause = S.getSingleClause()) + if (const auto *MessageClause = S.getSingleClause()) { Message = MessageClause->getMessageString(); - if (const auto *SeverityClause = S.getSingleClause()) + MessageLoc = MessageClause->getBeginLoc(); + } + if (const auto *SeverityClause = S.getSingleClause()) { Severity = SeverityClause->getSeverityKind(); + SeverityLoc = SeverityClause->getBeginLoc(); + } CGF.CGM.getOpenMPRuntime().emitNumThreadsClause( CGF, NumThreads, NumThreadsClause->getBeginLoc(), Modifier, Severity, - Message); + SeverityLoc, Message, MessageLoc); } if (const auto *ProcBindClause = S.getSingleClause()) { CodeGenFunction::RunCleanupsScope ProcBindScope(CGF); @@ -1921,6 +1951,15 @@ class OMPTransformDirectiveScopeRAII { CGSI = new CodeGenFunction::CGCapturedStmtInfo(CR_OpenMP); CapInfoRAII = new CodeGenFunction::CGCapturedStmtRAII(CGF, CGSI); } + if (const auto *Dir = + dyn_cast(S)) { + // For simplicity we reuse the loop scope similarly to what we do with + // OMPCanonicalLoopNestTransformationDirective do by being a subclass + // of OMPLoopBasedDirective. + Scope = new OMPLoopScope(CGF, *Dir); + CGSI = new CodeGenFunction::CGCapturedStmtInfo(CR_OpenMP); + CapInfoRAII = new CodeGenFunction::CGCapturedStmtRAII(CGF, CGSI); + } } ~OMPTransformDirectiveScopeRAII() { if (!Scope) @@ -1948,8 +1987,7 @@ static void emitBody(CodeGenFunction &CGF, const Stmt *S, const Stmt *NextLoop, return; } if (SimplifiedS == NextLoop) { - if (auto *Dir = - dyn_cast(SimplifiedS)) + if (auto *Dir = dyn_cast(SimplifiedS)) SimplifiedS = Dir->getTransformedStmt(); if (const auto *CanonLoop = dyn_cast(SimplifiedS)) SimplifiedS = CanonLoop->getLoopStmt(); @@ -2944,6 +2982,12 @@ void CodeGenFunction::EmitOMPInterchangeDirective( EmitStmt(S.getTransformedStmt()); } +void CodeGenFunction::EmitOMPFuseDirective(const OMPFuseDirective &S) { + // Emit the de-sugared statement + OMPTransformDirectiveScopeRAII FuseScope(*this, &S); + EmitStmt(S.getTransformedStmt()); +} + void CodeGenFunction::EmitOMPUnrollDirective(const OMPUnrollDirective &S) { bool UseOMPIRBuilder = CGM.getLangOpts().OpenMPIRBuilder; diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index dc54c97eeae8e..9286f1f25c6cc 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -29,6 +29,7 @@ #include "clang/Lex/Preprocessor.h" #include "clang/Serialization/ASTWriter.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/Demangle/Demangle.h" @@ -259,19 +260,18 @@ void BackendConsumer::HandleTranslationUnit(ASTContext &C) { Ctx.setDefaultTargetCPU(TargetOpts.CPU); Ctx.setDefaultTargetFeatures(llvm::join(TargetOpts.Features, ",")); - Expected> OptRecordFileOrErr = - setupLLVMOptimizationRemarks( - Ctx, CodeGenOpts.OptRecordFile, 
CodeGenOpts.OptRecordPasses, - CodeGenOpts.OptRecordFormat, CodeGenOpts.DiagnosticsWithHotness, - CodeGenOpts.DiagnosticsHotnessThreshold); + Expected OptRecordFileOrErr = + setupLLVMOptimizationRemarks( + Ctx, CodeGenOpts.OptRecordFile, CodeGenOpts.OptRecordPasses, + CodeGenOpts.OptRecordFormat, CodeGenOpts.DiagnosticsWithHotness, + CodeGenOpts.DiagnosticsHotnessThreshold); if (Error E = OptRecordFileOrErr.takeError()) { reportOptRecordError(std::move(E), Diags, CodeGenOpts); return; } - std::unique_ptr OptRecordFile = - std::move(*OptRecordFileOrErr); + LLVMRemarkFileHandle OptRecordFile = std::move(*OptRecordFileOrErr); if (OptRecordFile && CodeGenOpts.getProfileUse() != llvm::driver::ProfileInstrKind::ProfileNone) @@ -1141,7 +1141,8 @@ void CodeGenAction::ExecuteAction() { TheModule->setTargetTriple(Triple(TargetOpts.Triple)); } - EmbedObject(TheModule.get(), CodeGenOpts, Diagnostics); + EmbedObject(TheModule.get(), CodeGenOpts, CI.getVirtualFileSystem(), + Diagnostics); EmbedBitcode(TheModule.get(), CodeGenOpts, *MainFile); LLVMContext &Ctx = TheModule->getContext(); @@ -1173,7 +1174,7 @@ void CodeGenAction::ExecuteAction() { Ctx.setDefaultTargetCPU(TargetOpts.CPU); Ctx.setDefaultTargetFeatures(llvm::join(TargetOpts.Features, ",")); - Expected> OptRecordFileOrErr = + Expected OptRecordFileOrErr = setupLLVMOptimizationRemarks( Ctx, CodeGenOpts.OptRecordFile, CodeGenOpts.OptRecordPasses, CodeGenOpts.OptRecordFormat, CodeGenOpts.DiagnosticsWithHotness, @@ -1183,8 +1184,7 @@ void CodeGenAction::ExecuteAction() { reportOptRecordError(std::move(E), Diagnostics, CodeGenOpts); return; } - std::unique_ptr OptRecordFile = - std::move(*OptRecordFileOrErr); + LLVMRemarkFileHandle OptRecordFile = std::move(*OptRecordFileOrErr); emitBackendOutput(CI, CI.getCodeGenOpts(), CI.getTarget().getDataLayoutString(), TheModule.get(), BA, diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 727487b46054f..f0565c1de04c4 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3861,6 +3861,7 @@ class CodeGenFunction : public CodeGenTypeCache { void EmitOMPUnrollDirective(const OMPUnrollDirective &S); void EmitOMPReverseDirective(const OMPReverseDirective &S); void EmitOMPInterchangeDirective(const OMPInterchangeDirective &S); + void EmitOMPFuseDirective(const OMPFuseDirective &S); void EmitOMPForDirective(const OMPForDirective &S); void EmitOMPForSimdDirective(const OMPForSimdDirective &S); void EmitOMPScopeDirective(const OMPScopeDirective &S); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index d05092a250040..f6f7f22a09004 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -528,8 +528,7 @@ CodeGenModule::CodeGenModule(ASTContext &C, if (!CGO.MSSecureHotPatchFunctionsFile.empty() || !CGO.MSSecureHotPatchFunctionsList.empty()) { if (!CGO.MSSecureHotPatchFunctionsFile.empty()) { - auto BufOrErr = - llvm::MemoryBuffer::getFile(CGO.MSSecureHotPatchFunctionsFile); + auto BufOrErr = FS->getBufferForFile(CGO.MSSecureHotPatchFunctionsFile); if (BufOrErr) { const llvm::MemoryBuffer &FileBuffer = **BufOrErr; for (llvm::line_iterator I(FileBuffer.getMemBufferRef(), true), E; @@ -1557,7 +1556,7 @@ void CodeGenModule::Release() { EmitBackendOptionsMetadata(getCodeGenOpts()); // If there is device offloading code embed it in the host now. 
- EmbedObject(&getModule(), CodeGenOpts, getDiags()); + EmbedObject(&getModule(), CodeGenOpts, *getFileSystem(), getDiags()); // Set visibility from DLL storage class // We do this at the end of LLVM IR generation; after any operation @@ -8173,12 +8172,17 @@ void CodeGenModule::printPostfixForExternalizedDecl(llvm::raw_ostream &OS, // Get the UniqueID for the file containing the decl. llvm::sys::fs::UniqueID ID; - if (llvm::sys::fs::getUniqueID(PLoc.getFilename(), ID)) { + auto Status = FS->status(PLoc.getFilename()); + if (!Status) { PLoc = SM.getPresumedLoc(D->getLocation(), /*UseLineDirectives=*/false); assert(PLoc.isValid() && "Source location is expected to be valid."); - if (auto EC = llvm::sys::fs::getUniqueID(PLoc.getFilename(), ID)) - SM.getDiagnostics().Report(diag::err_cannot_open_file) - << PLoc.getFilename() << EC.message(); + Status = FS->status(PLoc.getFilename()); + } + if (!Status) { + SM.getDiagnostics().Report(diag::err_cannot_open_file) + << PLoc.getFilename() << Status.getError().message(); + } else { + ID = Status->getUniqueID(); } OS << llvm::format("%x", ID.getFile()) << llvm::format("%x", ID.getDevice()) << "_" << llvm::utohexstr(Result.low(), /*LowerCase=*/true, /*Width=*/8); diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index 98b30e084b18b..8f095649f87ce 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -972,7 +972,7 @@ void PGOHash::combine(HashType Type) { if (Count && Count % NumTypesPerWord == 0) { using namespace llvm::support; uint64_t Swapped = - endian::byte_swap(Working); + endian::byte_swap(Working, llvm::endianness::little); MD5.update(llvm::ArrayRef((uint8_t *)&Swapped, sizeof(Swapped))); Working = 0; } @@ -999,7 +999,7 @@ uint64_t PGOHash::finalize() { } else { using namespace llvm::support; uint64_t Swapped = - endian::byte_swap(Working); + endian::byte_swap(Working, llvm::endianness::little); MD5.update(llvm::ArrayRef((uint8_t *)&Swapped, sizeof(Swapped))); } } diff --git a/clang/lib/CodeGen/Targets/Sparc.cpp b/clang/lib/CodeGen/Targets/Sparc.cpp index 5f3c15d106eb6..38dbebdec2429 100644 --- a/clang/lib/CodeGen/Targets/Sparc.cpp +++ b/clang/lib/CodeGen/Targets/Sparc.cpp @@ -8,6 +8,7 @@ #include "ABIInfoImpl.h" #include "TargetInfo.h" +#include using namespace clang; using namespace clang::CodeGen; @@ -109,7 +110,8 @@ class SparcV9ABIInfo : public ABIInfo { SparcV9ABIInfo(CodeGenTypes &CGT) : ABIInfo(CGT) {} private: - ABIArgInfo classifyType(QualType RetTy, unsigned SizeLimit) const; + ABIArgInfo classifyType(QualType RetTy, unsigned SizeLimit, + unsigned &RegOffset) const; void computeInfo(CGFunctionInfo &FI) const override; RValue EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty, AggValueSlot Slot) const override; @@ -222,128 +224,114 @@ class SparcV9ABIInfo : public ABIInfo { }; } // end anonymous namespace -ABIArgInfo -SparcV9ABIInfo::classifyType(QualType Ty, unsigned SizeLimit) const { +ABIArgInfo SparcV9ABIInfo::classifyType(QualType Ty, unsigned SizeLimit, + unsigned &RegOffset) const { if (Ty->isVoidType()) return ABIArgInfo::getIgnore(); - uint64_t Size = getContext().getTypeSize(Ty); + auto &Context = getContext(); + auto &VMContext = getVMContext(); + + uint64_t Size = Context.getTypeSize(Ty); + unsigned Alignment = Context.getTypeAlign(Ty); + bool NeedPadding = (Alignment > 64) && (RegOffset % 2 != 0); // Anything too big to fit in registers is passed with an explicit indirect // pointer / sret pointer. 
- if (Size > SizeLimit) + if (Size > SizeLimit) { + RegOffset += 1; return getNaturalAlignIndirect( Ty, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(), /*ByVal=*/false); + } // Treat an enum type as its underlying type. if (const auto *ED = Ty->getAsEnumDecl()) Ty = ED->getIntegerType(); // Integer types smaller than a register are extended. - if (Size < 64 && Ty->isIntegerType()) + if (Size < 64 && Ty->isIntegerType()) { + RegOffset += 1; return ABIArgInfo::getExtend(Ty); + } if (const auto *EIT = Ty->getAs()) - if (EIT->getNumBits() < 64) + if (EIT->getNumBits() < 64) { + RegOffset += 1; return ABIArgInfo::getExtend(Ty); + } // Other non-aggregates go in registers. - if (!isAggregateTypeForABI(Ty)) + if (!isAggregateTypeForABI(Ty)) { + RegOffset += Size / 64; return ABIArgInfo::getDirect(); + } // If a C++ object has either a non-trivial copy constructor or a non-trivial // destructor, it is passed with an explicit indirect pointer / sret pointer. - if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) + if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) { + RegOffset += 1; return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(), RAA == CGCXXABI::RAA_DirectInMemory); + } // This is a small aggregate type that should be passed in registers. // Build a coercion type from the LLVM struct type. llvm::StructType *StrTy = dyn_cast(CGT.ConvertType(Ty)); - if (!StrTy) + if (!StrTy) { + RegOffset += Size / 64; return ABIArgInfo::getDirect(); + } - CoerceBuilder CB(getVMContext(), getDataLayout()); + CoerceBuilder CB(VMContext, getDataLayout()); CB.addStruct(0, StrTy); // All structs, even empty ones, should take up a register argument slot, // so pin the minimum struct size to one bit. CB.pad(llvm::alignTo( std::max(CB.DL.getTypeSizeInBits(StrTy).getKnownMinValue(), uint64_t(1)), 64)); + RegOffset += CB.Size / 64; + + // If we're dealing with overaligned structs we may need to add a padding in + // the front, to preserve the correct register-memory mapping. + // + // See SCD 2.4.1, pages 3P-11 and 3P-12. + llvm::Type *Padding = + NeedPadding ? llvm::Type::getInt64Ty(VMContext) : nullptr; + RegOffset += NeedPadding ? 1 : 0; // Try to use the original type for coercion. llvm::Type *CoerceTy = CB.isUsableType(StrTy) ? 
StrTy : CB.getType(); - if (CB.InReg) - return ABIArgInfo::getDirectInReg(CoerceTy); - else - return ABIArgInfo::getDirect(CoerceTy); + ABIArgInfo AAI = ABIArgInfo::getDirect(CoerceTy, 0, Padding); + AAI.setInReg(CB.InReg); + return AAI; } RValue SparcV9ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty, AggValueSlot Slot) const { - ABIArgInfo AI = classifyType(Ty, 16 * 8); - llvm::Type *ArgTy = CGT.ConvertType(Ty); - if (AI.canHaveCoerceToType() && !AI.getCoerceToType()) - AI.setCoerceToType(ArgTy); - CharUnits SlotSize = CharUnits::fromQuantity(8); + auto TInfo = getContext().getTypeInfoInChars(Ty); - CGBuilderTy &Builder = CGF.Builder; - Address Addr = Address(Builder.CreateLoad(VAListAddr, "ap.cur"), - getVAListElementType(CGF), SlotSize); - llvm::Type *ArgPtrTy = CGF.UnqualPtrTy; - - auto TypeInfo = getContext().getTypeInfoInChars(Ty); - - Address ArgAddr = Address::invalid(); - CharUnits Stride; - switch (AI.getKind()) { - case ABIArgInfo::Expand: - case ABIArgInfo::CoerceAndExpand: - case ABIArgInfo::InAlloca: - case ABIArgInfo::TargetSpecific: - llvm_unreachable("Unsupported ABI kind for va_arg"); - - case ABIArgInfo::Extend: { - Stride = SlotSize; - CharUnits Offset = SlotSize - TypeInfo.Width; - ArgAddr = Builder.CreateConstInBoundsByteGEP(Addr, Offset, "extend"); - break; - } - - case ABIArgInfo::Direct: { - auto AllocSize = getDataLayout().getTypeAllocSize(AI.getCoerceToType()); - Stride = CharUnits::fromQuantity(AllocSize).alignTo(SlotSize); - ArgAddr = Addr; - break; - } - - case ABIArgInfo::Indirect: - case ABIArgInfo::IndirectAliased: - Stride = SlotSize; - ArgAddr = Addr.withElementType(ArgPtrTy); - ArgAddr = Address(Builder.CreateLoad(ArgAddr, "indirect.arg"), ArgTy, - TypeInfo.Align); - break; + // Zero-sized types have a width of one byte for parameter passing purposes. + TInfo.Width = std::max(TInfo.Width, CharUnits::fromQuantity(1)); - case ABIArgInfo::Ignore: - return Slot.asRValue(); - } - - // Update VAList. - Address NextPtr = Builder.CreateConstInBoundsByteGEP(Addr, Stride, "ap.next"); - Builder.CreateStore(NextPtr.emitRawPointer(CGF), VAListAddr); - - return CGF.EmitLoadOfAnyValue( - CGF.MakeAddrLValue(ArgAddr.withElementType(ArgTy), Ty), Slot); + // Arguments bigger than 2*SlotSize bytes are passed indirectly. + return emitVoidPtrVAArg(CGF, VAListAddr, Ty, + /*IsIndirect=*/TInfo.Width > 2 * SlotSize, TInfo, + SlotSize, + /*AllowHigherAlign=*/true, Slot); } void SparcV9ABIInfo::computeInfo(CGFunctionInfo &FI) const { - FI.getReturnInfo() = classifyType(FI.getReturnType(), 32 * 8); + unsigned RetOffset = 0; + ABIArgInfo RetType = classifyType(FI.getReturnType(), 32 * 8, RetOffset); + FI.getReturnInfo() = RetType; + + // Indirect returns will have its pointer passed as an argument. + unsigned ArgOffset = RetType.isIndirect() ? 
RetOffset : 0; for (auto &I : FI.arguments()) - I.info = classifyType(I.type, 16 * 8); + I.info = classifyType(I.type, 16 * 8, ArgOffset); } namespace { diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index f110dbab3e5a5..85a1335785542 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -6613,6 +6613,9 @@ std::string Driver::GetStdModuleManifestPath(const Compilation &C, const ToolChain &TC) const { std::string error = ""; + if (C.getArgs().hasArg(options::OPT_nostdlib)) + return error; + switch (TC.GetCXXStdlibType(C.getArgs())) { case ToolChain::CST_Libcxx: { auto evaluate = [&](const char *library) -> std::optional { diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index ae546e9767039..654a382e87e40 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -882,6 +882,16 @@ void AMDGPUToolChain::addClangWarningOptions(ArgStringList &CC1Args) const { CC1Args.push_back("-Werror=atomic-alignment"); } +void AMDGPUToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, + ArgStringList &CC1Args) const { + if (DriverArgs.hasArg(options::OPT_nostdinc) || + DriverArgs.hasArg(options::OPT_nostdlibinc)) + return; + + if (std::optional Path = getStdlibIncludePath()) + addSystemInclude(DriverArgs, CC1Args, *Path); +} + StringRef AMDGPUToolChain::getGPUArch(const llvm::opt::ArgList &DriverArgs) const { return getProcessorFromTargetID( diff --git a/clang/lib/Driver/ToolChains/AMDGPU.h b/clang/lib/Driver/ToolChains/AMDGPU.h index e5d41e2401db6..e90a5736911e4 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.h +++ b/clang/lib/Driver/ToolChains/AMDGPU.h @@ -79,6 +79,9 @@ class LLVM_LIBRARY_VISIBILITY AMDGPUToolChain : public Generic_ELF { void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, Action::OffloadKind DeviceOffloadKind) const override; + void + AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; /// Return whether denormals should be flushed, and treated as 0 by default /// for the subtarget. diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index f67454ee517bd..adaa6b3005577 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -695,16 +695,6 @@ RenderDebugEnablingArgs(const ArgList &Args, ArgStringList &CmdArgs, } } -static bool checkDebugInfoOption(const Arg *A, const ArgList &Args, - const Driver &D, const ToolChain &TC) { - assert(A && "Expected non-nullptr argument."); - if (TC.supportsDebugInfoOption(A)) - return true; - D.Diag(diag::warn_drv_unsupported_debug_info_opt_for_target) - << A->getAsString(Args) << TC.getTripleString(); - return false; -} - static void RenderDebugInfoCompressionArgs(const ArgList &Args, ArgStringList &CmdArgs, const Driver &D, @@ -1109,26 +1099,15 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, if (!Args.hasArg(options::OPT_nostdinc) && Args.hasFlag(options::OPT_offload_inc, options::OPT_no_offload_inc, true) && - !Args.hasArg(options::OPT_nobuiltininc)) { - // Without an offloading language we will include these headers directly. - // Offloading languages will instead only use the declarations stored in - // the resource directory at clang/lib/Headers/llvm_libc_wrappers. 
- if (getToolChain().getTriple().isGPU() && - C.getActiveOffloadKinds() == Action::OFK_None) { - SmallString<128> P(llvm::sys::path::parent_path(D.Dir)); - llvm::sys::path::append(P, "include"); - llvm::sys::path::append(P, getToolChain().getTripleString()); - CmdArgs.push_back("-internal-isystem"); - CmdArgs.push_back(Args.MakeArgString(P)); - } else if (C.getActiveOffloadKinds() == Action::OFK_OpenMP) { - // TODO: CUDA / HIP include their own headers for some common functions - // implemented here. We'll need to clean those up so they do not conflict. - SmallString<128> P(D.ResourceDir); - llvm::sys::path::append(P, "include"); - llvm::sys::path::append(P, "llvm_libc_wrappers"); - CmdArgs.push_back("-internal-isystem"); - CmdArgs.push_back(Args.MakeArgString(P)); - } + !Args.hasArg(options::OPT_nobuiltininc) && + (C.getActiveOffloadKinds() == Action::OFK_OpenMP)) { + // TODO: CUDA / HIP include their own headers for some common functions + // implemented here. We'll need to clean those up so they do not conflict. + SmallString<128> P(D.ResourceDir); + llvm::sys::path::append(P, "include"); + llvm::sys::path::append(P, "llvm_libc_wrappers"); + CmdArgs.push_back("-internal-isystem"); + CmdArgs.push_back(Args.MakeArgString(P)); } // Add system include arguments for all targets but IAMCU. @@ -4336,27 +4315,6 @@ static void RenderDiagnosticsOptions(const Driver &D, const ArgList &Args, Args.addLastArg(CmdArgs, options::OPT_warning_suppression_mappings_EQ); } -DwarfFissionKind tools::getDebugFissionKind(const Driver &D, - const ArgList &Args, Arg *&Arg) { - Arg = Args.getLastArg(options::OPT_gsplit_dwarf, options::OPT_gsplit_dwarf_EQ, - options::OPT_gno_split_dwarf); - if (!Arg || Arg->getOption().matches(options::OPT_gno_split_dwarf)) - return DwarfFissionKind::None; - - if (Arg->getOption().matches(options::OPT_gsplit_dwarf)) - return DwarfFissionKind::Split; - - StringRef Value = Arg->getValue(); - if (Value == "split") - return DwarfFissionKind::Split; - if (Value == "single") - return DwarfFissionKind::Single; - - D.Diag(diag::err_drv_unsupported_option_argument) - << Arg->getSpelling() << Arg->getValue(); - return DwarfFissionKind::None; -} - static void renderDwarfFormat(const Driver &D, const llvm::Triple &T, const ArgList &Args, ArgStringList &CmdArgs, unsigned DwarfVersion) { diff --git a/clang/lib/Driver/ToolChains/Clang.h b/clang/lib/Driver/ToolChains/Clang.h index 18f6c5ed06a59..c22789591e00a 100644 --- a/clang/lib/Driver/ToolChains/Clang.h +++ b/clang/lib/Driver/ToolChains/Clang.h @@ -187,12 +187,6 @@ class LLVM_LIBRARY_VISIBILITY LinkerWrapper final : public Tool { const char *LinkingOutput) const override; }; -enum class DwarfFissionKind { None, Split, Single }; - -DwarfFissionKind getDebugFissionKind(const Driver &D, - const llvm::opt::ArgList &Args, - llvm::opt::Arg *&Arg); - // Calculate the output path of the module file when compiling a module unit // with the `-fmodule-output` option or `-fmodule-output=` option specified. 
// The behavior is: diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 08cd98fd04df0..cce4f6487c0bd 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -2270,6 +2270,37 @@ unsigned tools::getDwarfVersion(const ToolChain &TC, return DwarfVersion; } +DwarfFissionKind tools::getDebugFissionKind(const Driver &D, + const ArgList &Args, Arg *&Arg) { + Arg = Args.getLastArg(options::OPT_gsplit_dwarf, options::OPT_gsplit_dwarf_EQ, + options::OPT_gno_split_dwarf); + if (!Arg || Arg->getOption().matches(options::OPT_gno_split_dwarf)) + return DwarfFissionKind::None; + + if (Arg->getOption().matches(options::OPT_gsplit_dwarf)) + return DwarfFissionKind::Split; + + StringRef Value = Arg->getValue(); + if (Value == "split") + return DwarfFissionKind::Split; + if (Value == "single") + return DwarfFissionKind::Single; + + D.Diag(diag::err_drv_unsupported_option_argument) + << Arg->getSpelling() << Arg->getValue(); + return DwarfFissionKind::None; +} + +bool tools::checkDebugInfoOption(const Arg *A, const ArgList &Args, + const Driver &D, const ToolChain &TC) { + assert(A && "Expected non-nullptr argument."); + if (TC.supportsDebugInfoOption(A)) + return true; + D.Diag(diag::warn_drv_unsupported_debug_info_opt_for_target) + << A->getAsString(Args) << TC.getTripleString(); + return false; +} + void tools::AddAssemblerKPIC(const ToolChain &ToolChain, const ArgList &Args, ArgStringList &CmdArgs) { llvm::Reloc::Model RelocationModel; @@ -3315,20 +3346,16 @@ bool tools::shouldEnableVectorizerAtOLevel(const ArgList &Args, bool isSlpVec) { void tools::handleVectorizeLoopsArgs(const ArgList &Args, ArgStringList &CmdArgs) { bool EnableVec = shouldEnableVectorizerAtOLevel(Args, false); - OptSpecifier vectorizeAliasOption = - EnableVec ? options::OPT_O_Group : options::OPT_fvectorize; - if (Args.hasFlag(options::OPT_fvectorize, vectorizeAliasOption, - options::OPT_fno_vectorize, EnableVec)) + if (Args.hasFlag(options::OPT_fvectorize, options::OPT_fno_vectorize, + EnableVec)) CmdArgs.push_back("-vectorize-loops"); } void tools::handleVectorizeSLPArgs(const ArgList &Args, ArgStringList &CmdArgs) { bool EnableSLPVec = shouldEnableVectorizerAtOLevel(Args, true); - OptSpecifier SLPVectAliasOption = - EnableSLPVec ? 
options::OPT_O_Group : options::OPT_fslp_vectorize; - if (Args.hasFlag(options::OPT_fslp_vectorize, SLPVectAliasOption, - options::OPT_fno_slp_vectorize, EnableSLPVec)) + if (Args.hasFlag(options::OPT_fslp_vectorize, options::OPT_fno_slp_vectorize, + EnableSLPVec)) CmdArgs.push_back("-vectorize-slp"); } diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 327cb5183f837..07201cc4676ac 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -778,6 +778,16 @@ void NVPTXToolChain::addClangTargetOptions( const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, Action::OffloadKind DeviceOffloadingKind) const {} +void NVPTXToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, + ArgStringList &CC1Args) const { + if (DriverArgs.hasArg(options::OPT_nostdinc) || + DriverArgs.hasArg(options::OPT_nostdlibinc)) + return; + + if (std::optional Path = getStdlibIncludePath()) + addSystemInclude(DriverArgs, CC1Args, *Path); +} + bool NVPTXToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const { const Option &O = A->getOption(); return (O.matches(options::OPT_gN_Group) && diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h index 8aeba53dd0030..6193328908828 100644 --- a/clang/lib/Driver/ToolChains/Cuda.h +++ b/clang/lib/Driver/ToolChains/Cuda.h @@ -92,6 +92,9 @@ class LLVM_LIBRARY_VISIBILITY NVPTXToolChain : public ToolChain { addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args, Action::OffloadKind DeviceOffloadKind) const override; + void + AddClangSystemIncludeArgs(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args) const override; // Never try to use the integrated assembler with CUDA; always fork out to // ptxas. 
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 6fc372eb75eb7..a5394813eeb97 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -120,7 +120,11 @@ static bool shouldLoopVersion(const ArgList &Args) { return false; } -void Flang::addOtherOptions(const ArgList &Args, ArgStringList &CmdArgs) const { +void Flang::addDebugOptions(const llvm::opt::ArgList &Args, const JobAction &JA, + const InputInfo &Output, const InputInfo &Input, + llvm::opt::ArgStringList &CmdArgs) const { + const auto &TC = getToolChain(); + const Driver &D = TC.getDriver(); Args.addAllArgs(CmdArgs, {options::OPT_module_dir, options::OPT_fdebug_module_writer, options::OPT_fintrinsic_modules_path, options::OPT_pedantic, @@ -131,20 +135,60 @@ void Flang::addOtherOptions(const ArgList &Args, ArgStringList &CmdArgs) const { options::OPT_finstrument_functions}); llvm::codegenoptions::DebugInfoKind DebugInfoKind; + bool hasDwarfNArg = getDwarfNArg(Args) != nullptr; if (Args.hasArg(options::OPT_gN_Group)) { Arg *gNArg = Args.getLastArg(options::OPT_gN_Group); DebugInfoKind = debugLevelToInfoKind(*gNArg); - } else if (Args.hasArg(options::OPT_g_Group)) { + } else if (Args.hasArg(options::OPT_g_Flag) || hasDwarfNArg) { DebugInfoKind = llvm::codegenoptions::FullDebugInfo; } else { DebugInfoKind = llvm::codegenoptions::NoDebugInfo; } addDebugInfoKind(CmdArgs, DebugInfoKind); - if (getDwarfNArg(Args)) { + if (hasDwarfNArg) { const unsigned DwarfVersion = getDwarfVersion(getToolChain(), Args); CmdArgs.push_back( Args.MakeArgString("-dwarf-version=" + Twine(DwarfVersion))); } + if (Args.hasArg(options::OPT_gsplit_dwarf) || + Args.hasArg(options::OPT_gsplit_dwarf_EQ)) { + // FIXME: -gsplit-dwarf on AIX is currently unimplemented. 
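+  // Editorial illustration, not part of the patch: with the handling below, a
+  // hypothetical invocation like `flang -c -g -gsplit-dwarf foo.f90` is
+  // expected to forward `-split-dwarf-file foo.dwo -split-dwarf-output
+  // foo.dwo` to the frontend, while `-gsplit-dwarf=single` forwards only
+  // `-split-dwarf-file` (the DWO sections then stay in the object file). The
+  // exact output names come from SplitDebugName and are assumptions here.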
+ if (TC.getTriple().isOSAIX()) { + D.Diag(diag::err_drv_unsupported_opt_for_target) + << Args.getLastArg(options::OPT_gsplit_dwarf)->getSpelling() + << TC.getTriple().str(); + return; + } + if (DebugInfoKind == llvm::codegenoptions::NoDebugInfo) + return; + + Arg *SplitDWARFArg; + DwarfFissionKind DwarfFission = getDebugFissionKind(D, Args, SplitDWARFArg); + + if (DwarfFission == DwarfFissionKind::None || + !checkDebugInfoOption(SplitDWARFArg, Args, D, TC)) + return; + + if (!TC.getTriple().isOSBinFormatELF() && + !TC.getTriple().isOSBinFormatWasm() && + !TC.getTriple().isOSBinFormatCOFF()) { + D.Diag(diag::warn_drv_unsupported_debug_info_opt_for_target) + << SplitDWARFArg->getSpelling() << TC.getTriple().str(); + return; + } + + if (!isa(JA) && !isa(JA) && + isa(JA)) + return; + + const char *SplitDWARFOut = SplitDebugName(JA, Args, Input, Output); + CmdArgs.push_back("-split-dwarf-file"); + CmdArgs.push_back(SplitDWARFOut); + if (DwarfFission == DwarfFissionKind::Split) { + CmdArgs.push_back("-split-dwarf-output"); + CmdArgs.push_back(SplitDWARFOut); + } + } } void Flang::addCodegenOptions(const ArgList &Args, @@ -936,8 +980,8 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA, if (willEmitRemarks(Args)) renderRemarksOptions(Args, CmdArgs, Input); - // Add other compile options - addOtherOptions(Args, CmdArgs); + // Add debug compile options + addDebugOptions(Args, JA, Output, Input, CmdArgs); // Disable all warnings // TODO: Handle interactions between -w, -pedantic, -Wall, -WOption diff --git a/clang/lib/Driver/ToolChains/Flang.h b/clang/lib/Driver/ToolChains/Flang.h index 98167e1b75e15..c0837b80c032e 100644 --- a/clang/lib/Driver/ToolChains/Flang.h +++ b/clang/lib/Driver/ToolChains/Flang.h @@ -125,12 +125,16 @@ class LLVM_LIBRARY_VISIBILITY Flang : public Tool { void addCodegenOptions(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const; - /// Extract other compilation options from the driver arguments and add them + /// Extract debug compilation options from the driver arguments and add them /// to the command arguments. /// /// \param [in] Args The list of input driver arguments + /// \param [in] JA The job action + /// \param [in] Output The output information on the current file output + /// \param [in] Input The input information on the current file input /// \param [out] CmdArgs The list of output command arguments - void addOtherOptions(const llvm::opt::ArgList &Args, + void addDebugOptions(const llvm::opt::ArgList &Args, const JobAction &JA, + const InputInfo &Output, const InputInfo &Input, llvm::opt::ArgStringList &CmdArgs) const; public: diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index b38f2810c0a74..835071dbe715d 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -2185,47 +2185,68 @@ std::error_code parseConfiguration(llvm::MemoryBufferRef Config, if (Input.error()) return Input.error(); - for (unsigned i = 0; i < Styles.size(); ++i) { - // Ensures that only the first configuration can skip the Language option. - if (Styles[i].Language == FormatStyle::LK_None && i != 0) + assert(!Styles.empty()); + const auto StyleCount = Styles.size(); + + // Start from the second style as (only) the first one may be the default. + for (unsigned I = 1; I < StyleCount; ++I) { + const auto Lang = Styles[I].Language; + if (Lang == FormatStyle::LK_None) return make_error_code(ParseError::Error); // Ensure that each language is configured at most once. 
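+  // Editorial illustration, not part of the patch: a configuration such as
+  //   ---
+  //   Language: Cpp
+  //   ---
+  //   Language: Cpp
+  // is rejected by the duplicate check below, while any non-leading section
+  // that omits the Language key is rejected by the check just above.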
- for (unsigned j = 0; j < i; ++j) { - if (Styles[i].Language == Styles[j].Language) { + for (unsigned J = 0; J < I; ++J) { + if (Lang == Styles[J].Language) { LLVM_DEBUG(llvm::dbgs() << "Duplicate languages in the config file on positions " - << j << " and " << i << "\n"); + << J << " and " << I << '\n'); return make_error_code(ParseError::Error); } } } - // Look for a suitable configuration starting from the end, so we can - // find the configuration for the specific language first, and the default - // configuration (which can only be at slot 0) after it. - FormatStyle::FormatStyleSet StyleSet; - bool LanguageFound = false; - for (const FormatStyle &Style : llvm::reverse(Styles)) { - const auto Lang = Style.Language; - if (Lang != FormatStyle::LK_None) - StyleSet.Add(Style); - if (Lang == Language || - // For backward compatibility. - (Lang == FormatStyle::LK_Cpp && Language == FormatStyle::LK_C)) { - LanguageFound = true; - } else if (IsDotHFile && Language == FormatStyle::LK_Cpp && - (Lang == FormatStyle::LK_C || Lang == FormatStyle::LK_ObjC)) { - Language = Lang; - LanguageFound = true; + + int LanguagePos = -1; // Position of the style for Language. + int CppPos = -1; // Position of the style for C++. + int CPos = -1; // Position of the style for C. + + // Search Styles for Language and store the positions of C++ and C styles in + // case Language is not found. + for (unsigned I = 0; I < StyleCount; ++I) { + const auto Lang = Styles[I].Language; + if (Lang == Language) { + LanguagePos = I; + break; } - } - if (!LanguageFound) { - if (Styles.empty() || Styles[0].Language != FormatStyle::LK_None) + if (Lang == FormatStyle::LK_Cpp) + CppPos = I; + else if (Lang == FormatStyle::LK_C) + CPos = I; + } + + // If Language is not found, use the default style if there is one. Otherwise, + // use the C style for C++ .h files and for backward compatibility, the C++ + // style for .c files. + if (LanguagePos < 0) { + if (Styles[0].Language == FormatStyle::LK_None) // Default style. + LanguagePos = 0; + else if (IsDotHFile && Language == FormatStyle::LK_Cpp) + LanguagePos = CPos; + else if (!IsDotHFile && Language == FormatStyle::LK_C) + LanguagePos = CppPos; + if (LanguagePos < 0) return make_error_code(ParseError::Unsuitable); - FormatStyle DefaultStyle = Styles[0]; - DefaultStyle.Language = Language; - StyleSet.Add(std::move(DefaultStyle)); } - *Style = *StyleSet.Get(Language); + + for (const auto &S : llvm::reverse(llvm::drop_begin(Styles))) + Style->StyleSet.Add(S); + + *Style = Styles[LanguagePos]; + + if (LanguagePos == 0) { + if (Style->Language == FormatStyle::LK_None) // Default style. + Style->Language = Language; + Style->StyleSet.Add(*Style); + } + if (Style->InsertTrailingCommas != FormatStyle::TCS_None && Style->BinPackArguments) { // See comment on FormatStyle::TSC_Wrapped. @@ -2256,14 +2277,8 @@ FormatStyle::FormatStyleSet::Get(FormatStyle::LanguageKind Language) const { if (!Styles) return std::nullopt; auto It = Styles->find(Language); - if (It == Styles->end()) { - if (Language != FormatStyle::LK_C) - return std::nullopt; - // For backward compatibility. 
- It = Styles->find(FormatStyle::LK_Cpp); - if (It == Styles->end()) - return std::nullopt; - } + if (It == Styles->end()) + return std::nullopt; FormatStyle Style = It->second; Style.StyleSet = *this; return Style; diff --git a/clang/lib/Format/QualifierAlignmentFixer.cpp b/clang/lib/Format/QualifierAlignmentFixer.cpp index 441a37a4902b7..043d957611b19 100644 --- a/clang/lib/Format/QualifierAlignmentFixer.cpp +++ b/clang/lib/Format/QualifierAlignmentFixer.cpp @@ -571,7 +571,7 @@ void LeftRightQualifierAlignmentFixer::fixQualifierAlignment( for (const auto *Tok = First; Tok && Tok != Last && Tok->Next; Tok = Tok->Next) { - if (Tok->MustBreakBefore) + if (Tok->MustBreakBefore && Tok != First) break; if (Tok->is(tok::comment)) continue; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 4bfb803ebedf7..67066a104d738 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -833,11 +833,6 @@ class AnnotatingParser { if (Parent && Parent->is(TT_PointerOrReference)) Parent->overwriteFixedType(TT_BinaryOperator); } - // An arrow after an ObjC method expression is not a lambda arrow. - if (CurrentToken->is(TT_ObjCMethodExpr) && CurrentToken->Next && - CurrentToken->Next->is(TT_LambdaArrow)) { - CurrentToken->Next->overwriteFixedType(TT_Unknown); - } Left->MatchingParen = CurrentToken; CurrentToken->MatchingParen = Left; // FirstObjCSelectorName is set when a colon is found. This does @@ -6496,13 +6491,14 @@ void TokenAnnotator::printDebugInfo(const AnnotatedLine &Line) const { << "):\n"; const FormatToken *Tok = Line.First; while (Tok) { - llvm::errs() << " M=" << Tok->MustBreakBefore + llvm::errs() << " I=" << Tok->IndentLevel << " M=" << Tok->MustBreakBefore << " C=" << Tok->CanBreakBefore << " T=" << getTokenTypeName(Tok->getType()) << " S=" << Tok->SpacesRequiredBefore << " F=" << Tok->Finalized << " B=" << Tok->BlockParameterCount << " BK=" << Tok->getBlockKind() << " P=" << Tok->SplitPenalty - << " Name=" << Tok->Tok.getName() << " L=" << Tok->TotalLength + << " Name=" << Tok->Tok.getName() << " N=" << Tok->NestingLevel + << " L=" << Tok->TotalLength << " PPK=" << Tok->getPackingKind() << " FakeLParens="; for (prec::Level LParen : Tok->FakeLParens) llvm::errs() << LParen << "/"; diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 2c9766c9b7bc0..6948b3de1e408 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -2268,7 +2268,7 @@ bool UnwrappedLineParser::tryToParseLambda() { if (!tryToParseLambdaIntroducer()) return false; - bool SeenArrow = false; + FormatToken *Arrow = nullptr; bool InTemplateParameterList = false; while (FormatTok->isNot(tok::l_brace)) { @@ -2343,17 +2343,13 @@ bool UnwrappedLineParser::tryToParseLambda() { case tok::ellipsis: case tok::kw_true: case tok::kw_false: - if (SeenArrow || InTemplateParameterList) { + if (Arrow || InTemplateParameterList) { nextToken(); break; } return true; case tok::arrow: - // This might or might not actually be a lambda arrow (this could be an - // ObjC method invocation followed by a dereferencing arrow). We might - // reset this back to TT_Unknown in TokenAnnotator. 
- FormatTok->setFinalizedType(TT_LambdaArrow); - SeenArrow = true; + Arrow = FormatTok; nextToken(); break; case tok::kw_requires: { @@ -2375,6 +2371,9 @@ bool UnwrappedLineParser::tryToParseLambda() { FormatTok->setFinalizedType(TT_LambdaLBrace); LSquare.setFinalizedType(TT_LambdaLSquare); + if (Arrow) + Arrow->setFinalizedType(TT_LambdaArrow); + NestedLambdas.push_back(Line->SeenDecltypeAuto); parseChildBlock(); assert(!NestedLambdas.empty()); @@ -2388,11 +2387,6 @@ bool UnwrappedLineParser::tryToParseLambdaIntroducer() { const FormatToken *LeftSquare = FormatTok; nextToken(); if (Previous) { - if (Previous->Tok.getIdentifierInfo() && - !Previous->isOneOf(tok::kw_return, tok::kw_co_await, tok::kw_co_yield, - tok::kw_co_return)) { - return false; - } if (Previous->closesScope()) { // Not a potential C-style cast. if (Previous->isNot(tok::r_paren)) @@ -2402,6 +2396,13 @@ bool UnwrappedLineParser::tryToParseLambdaIntroducer() { // and `int (*)()`. if (!BeforeRParen || !BeforeRParen->isOneOf(tok::greater, tok::r_paren)) return false; + } else if (Previous->is(tok::star)) { + Previous = Previous->getPreviousNonComment(); + } + if (Previous && Previous->Tok.getIdentifierInfo() && + !Previous->isOneOf(tok::kw_return, tok::kw_co_await, tok::kw_co_yield, + tok::kw_co_return)) { + return false; } } if (LeftSquare->isCppStructuredBinding(IsCpp)) diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index cc3cc0f6906cc..30c06bbb4d071 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -279,20 +279,19 @@ void WhitespaceManager::calculateLineBreakInformation() { } // Align a single sequence of tokens, see AlignTokens below. -// Column - The token for which Matches returns true is moved to this column. +// Column - The tokens indexed in Matches are moved to this column. // RightJustify - Whether it is the token's right end or left end that gets // moved to that column. -template static void AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, - unsigned Column, bool RightJustify, F &&Matches, + unsigned Column, bool RightJustify, + ArrayRef Matches, SmallVector &Changes) { - bool FoundMatchOnLine = false; int Shift = 0; // ScopeStack keeps track of the current scope depth. It contains indices of // the first token on each scope. - // We only run the "Matches" function on tokens from the outer-most scope. + // The "Matches" indices should only have tokens from the outer-most scope. 
// However, we do need to pay special attention to one class of tokens // that are not in the outer-most scope, and that is function parameters // which are split across multiple lines, as illustrated by this example: @@ -314,6 +313,9 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, for (unsigned i = Start; i != End; ++i) { auto &CurrentChange = Changes[i]; + if (!Matches.empty() && Matches[0] < i) + Matches.consume_front(); + assert(Matches.empty() || Matches[0] >= i); if (!ScopeStack.empty() && CurrentChange.indentAndNestingLevel() < Changes[ScopeStack.back()].indentAndNestingLevel()) { @@ -338,26 +340,16 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End, Changes[i - 1].Tok->is(tok::string_literal); bool SkipMatchCheck = InsideNestedScope || ContinuedStringLiteral; - if (CurrentChange.NewlinesBefore > 0 && !SkipMatchCheck) { + if (CurrentChange.NewlinesBefore > 0 && !SkipMatchCheck) Shift = 0; - FoundMatchOnLine = false; - } // If this is the first matching token to be aligned, remember by how many // spaces it has to be shifted, so the rest of the changes on the line are // shifted by the same amount - if (!FoundMatchOnLine && !SkipMatchCheck && Matches(CurrentChange)) { - FoundMatchOnLine = true; + if (!Matches.empty() && Matches[0] == i) { Shift = Column - (RightJustify ? CurrentChange.TokenLength : 0) - CurrentChange.StartOfTokenColumn; CurrentChange.Spaces += Shift; - // FIXME: This is a workaround that should be removed when we fix - // http://llvm.org/PR53699. An assertion later below verifies this. - if (CurrentChange.NewlinesBefore == 0) { - CurrentChange.Spaces = - std::max(CurrentChange.Spaces, - static_cast(CurrentChange.Tok->SpacesRequiredBefore)); - } } if (Shift == 0) @@ -532,12 +524,14 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, bool RightJustify = false) { // We arrange each line in 3 parts. The operator to be aligned (the anchor), // and text to its left and right. In the aligned text the width of each part - // will be the maximum of that over the block that has been aligned. Maximum - // widths of each part so far. When RightJustify is true and ACS.PadOperators - // is false, the part from start of line to the right end of the anchor. - // Otherwise, only the part to the left of the anchor. Including the space - // that exists on its left from the start. Not including the padding added on - // the left to right-justify the anchor. + // will be the maximum of that over the block that has been aligned. + + // Maximum widths of each part so far. + // When RightJustify is true and ACS.PadOperators is false, the part from + // start of line to the right end of the anchor. Otherwise, only the part to + // the left of the anchor. Including the space that exists on its left from + // the start. Not including the padding added on the left to right-justify the + // anchor. unsigned WidthLeft = 0; // The operator to be aligned when RightJustify is true and ACS.PadOperators // is false. 0 otherwise. @@ -550,6 +544,9 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, unsigned StartOfSequence = 0; unsigned EndOfSequence = 0; + // The positions of the tokens to be aligned. + SmallVector MatchedIndices; + // Measure the scope level (i.e. depth of (), [], {}) of the first token, and // abort when we hit any token in a higher scope than the starting one. 
auto IndentAndNestingLevel = StartAt < Changes.size() @@ -578,7 +575,7 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, auto AlignCurrentSequence = [&] { if (StartOfSequence > 0 && StartOfSequence < EndOfSequence) { AlignTokenSequence(Style, StartOfSequence, EndOfSequence, - WidthLeft + WidthAnchor, RightJustify, Matches, + WidthLeft + WidthAnchor, RightJustify, MatchedIndices, Changes); } WidthLeft = 0; @@ -586,6 +583,7 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, WidthRight = 0; StartOfSequence = 0; EndOfSequence = 0; + MatchedIndices.clear(); }; unsigned i = StartAt; @@ -637,8 +635,10 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, // If there is more than one matching token per line, or if the number of // preceding commas, do not match anymore, end the sequence. - if (FoundMatchOnLine || CommasBeforeMatch != CommasBeforeLastMatch) + if (FoundMatchOnLine || CommasBeforeMatch != CommasBeforeLastMatch) { + MatchedIndices.push_back(i); AlignCurrentSequence(); + } CommasBeforeLastMatch = CommasBeforeMatch; FoundMatchOnLine = true; @@ -684,6 +684,7 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, WidthAnchor = NewAnchor; WidthRight = NewRight; } + MatchedIndices.push_back(i); } EndOfSequence = i; diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index d6f3aec981336..c989ad2e5155c 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -503,7 +503,7 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) { // then we're the top level compiler instance and need to create one. if (!ModuleDepCollector && !DepOpts.ModuleDependencyOutputDir.empty()) { ModuleDepCollector = std::make_shared( - DepOpts.ModuleDependencyOutputDir); + DepOpts.ModuleDependencyOutputDir, getVirtualFileSystemPtr()); } // If there is a module dep collector, register with other dep collectors diff --git a/clang/lib/Frontend/HeaderIncludeGen.cpp b/clang/lib/Frontend/HeaderIncludeGen.cpp index 8ab335905f9f2..7cd9c8a3a5bd7 100644 --- a/clang/lib/Frontend/HeaderIncludeGen.cpp +++ b/clang/lib/Frontend/HeaderIncludeGen.cpp @@ -112,11 +112,22 @@ class HeaderIncludesJSONCallback : public PPCallbacks { /// an array of separate entries, one for each non-system source file used in /// the compilation showing only the direct includes and imports from that file. 
class HeaderIncludesDirectPerFileCallback : public PPCallbacks { + struct HeaderIncludeInfo { + SourceLocation Location; + FileEntryRef File; + const Module *ImportedModule; + + HeaderIncludeInfo(SourceLocation Location, FileEntryRef File, + const Module *ImportedModule) + : Location(Location), File(File), ImportedModule(ImportedModule) {} + }; + SourceManager &SM; HeaderSearch &HSI; raw_ostream *OutputFile; bool OwnsOutputFile; - using DependencyMap = llvm::DenseMap>; + using DependencyMap = + llvm::DenseMap>; DependencyMap Dependencies; public: @@ -295,8 +306,8 @@ void HeaderIncludesCallback::FileChanged(SourceLocation Loc, } } -void HeaderIncludesCallback::FileSkipped(const FileEntryRef &SkippedFile, const - Token &FilenameTok, +void HeaderIncludesCallback::FileSkipped(const FileEntryRef &SkippedFile, + const Token &FilenameTok, SrcMgr::CharacteristicKind FileType) { if (!DepOpts.ShowSkippedHeaderIncludes) return; @@ -390,18 +401,41 @@ void HeaderIncludesDirectPerFileCallback::EndOfMainFile() { std::string Str; llvm::raw_string_ostream OS(Str); llvm::json::OStream JOS(OS); - JOS.array([&] { - for (auto S = SourceFiles.begin(), SE = SourceFiles.end(); S != SE; ++S) { - JOS.object([&] { - SmallVector &Deps = Dependencies[*S]; - JOS.attribute("source", S->getName().str()); - JOS.attributeArray("includes", [&] { - for (unsigned I = 0, N = Deps.size(); I != N; ++I) - JOS.value(Deps[I].getName().str()); + JOS.object([&] { + JOS.attribute("version", "2.0.0"); + JOS.attributeArray("dependencies", [&] { + for (const auto &S : SourceFiles) { + JOS.object([&] { + SmallVector &Deps = Dependencies[S]; + JOS.attribute("source", S.getName().str()); + JOS.attributeArray("includes", [&] { + for (unsigned I = 0, N = Deps.size(); I != N; ++I) { + if (!Deps[I].ImportedModule) { + JOS.object([&] { + JOS.attribute("location", Deps[I].Location.printToString(SM)); + JOS.attribute("file", Deps[I].File.getName()); + }); + } + } + }); + JOS.attributeArray("imports", [&] { + for (unsigned I = 0, N = Deps.size(); I != N; ++I) { + if (Deps[I].ImportedModule) { + JOS.object([&] { + JOS.attribute("location", Deps[I].Location.printToString(SM)); + JOS.attribute( + "module", + Deps[I].ImportedModule->getTopLevelModuleName()); + JOS.attribute("file", Deps[I].File.getName()); + }); + } + } + }); }); - }); - } + } + }); }); + OS << "\n"; if (OutputFile->get_kind() == raw_ostream::OStreamKind::OK_FDStream) { @@ -427,7 +461,18 @@ void HeaderIncludesDirectPerFileCallback::InclusionDirective( if (!FromFile) return; - Dependencies[*FromFile].push_back(*File); + FileEntryRef HeaderOrModuleMapFile = *File; + if (ModuleImported && SuggestedModule) { + OptionalFileEntryRef ModuleMapFile = + HSI.getModuleMap().getModuleMapFileForUniquing(SuggestedModule); + if (ModuleMapFile) { + HeaderOrModuleMapFile = *ModuleMapFile; + } + } + + HeaderIncludeInfo DependenciesEntry( + Loc, HeaderOrModuleMapFile, (ModuleImported ? 
SuggestedModule : nullptr)); + Dependencies[*FromFile].push_back(DependenciesEntry); } void HeaderIncludesDirectPerFileCallback::moduleImport(SourceLocation ImportLoc, @@ -448,5 +493,6 @@ void HeaderIncludesDirectPerFileCallback::moduleImport(SourceLocation ImportLoc, if (!ModuleMapFile) return; - Dependencies[*FromFile].push_back(*ModuleMapFile); + HeaderIncludeInfo DependenciesEntry(Loc, *ModuleMapFile, Imported); + Dependencies[*FromFile].push_back(DependenciesEntry); } diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index edf0a091e087c..877ab02850667 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -742,7 +742,10 @@ static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts, Builder.defineMacro("__cpp_impl_coroutine", "201902L"); Builder.defineMacro("__cpp_designated_initializers", "201707L"); Builder.defineMacro("__cpp_impl_three_way_comparison", "201907L"); - //Builder.defineMacro("__cpp_modules", "201907L"); + // Intentionally to set __cpp_modules to 1. + // See https://github.com/llvm/llvm-project/issues/71364 for details. + // Builder.defineMacro("__cpp_modules", "201907L"); + Builder.defineMacro("__cpp_modules", "1"); Builder.defineMacro("__cpp_using_enum", "201907L"); } // C++23 features. diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index a3a5b02579081..31759c5386d9f 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -165,9 +165,8 @@ _mm256_abs_epi32(__m256i __a) { /// A 256-bit vector of [16 x i16] used to generate result[127:64] and /// result[255:192]. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_packs_epi16(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_packs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b); } @@ -197,9 +196,8 @@ _mm256_packs_epi16(__m256i __a, __m256i __b) /// A 256-bit vector of [8 x i32] used to generate result[127:64] and /// result[255:192]. /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_packs_epi32(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_packs_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b); } @@ -228,9 +226,8 @@ _mm256_packs_epi32(__m256i __a, __m256i __b) /// A 256-bit vector of [16 x i16] used to generate result[127:64] and /// result[255:192]. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_packus_epi16(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_packus_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b); } @@ -260,9 +257,8 @@ _mm256_packus_epi16(__m256i __a, __m256i __b) /// A 256-bit vector of [8 x i32] used to generate result[127:64] and /// result[255:192]. /// \returns A 256-bit vector of [16 x i16] containing the result. 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_packus_epi32(__m256i __V1, __m256i __V2) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_packus_epi32(__m256i __V1, __m256i __V2) { return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2); } diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index cf3d98d512684..c36bd814725fa 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -32,68 +32,63 @@ typedef unsigned long long __mmask64; #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS #endif -static __inline __mmask32 __DEFAULT_FN_ATTRS -_knot_mask32(__mmask32 __M) -{ +static __inline __mmask32 + __DEFAULT_FN_ATTRS_CONSTEXPR _knot_mask32(__mmask32 __M) { return __builtin_ia32_knotsi(__M); } -static __inline __mmask64 __DEFAULT_FN_ATTRS _knot_mask64(__mmask64 __M) { +static __inline __mmask64 __DEFAULT_FN_ATTRS_CONSTEXPR +_knot_mask64(__mmask64 __M) { return __builtin_ia32_knotdi(__M); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_kand_mask32(__mmask32 __A, __mmask32 __B) -{ +static __inline__ __mmask32 __DEFAULT_FN_ATTRS_CONSTEXPR +_kand_mask32(__mmask32 __A, __mmask32 __B) { return (__mmask32)__builtin_ia32_kandsi((__mmask32)__A, (__mmask32)__B); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kand_mask64(__mmask64 __A, - __mmask64 __B) { +static __inline__ __mmask64 __DEFAULT_FN_ATTRS_CONSTEXPR +_kand_mask64(__mmask64 __A, __mmask64 __B) { return (__mmask64)__builtin_ia32_kanddi((__mmask64)__A, (__mmask64)__B); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_kandn_mask32(__mmask32 __A, __mmask32 __B) -{ +static __inline__ __mmask32 __DEFAULT_FN_ATTRS_CONSTEXPR +_kandn_mask32(__mmask32 __A, __mmask32 __B) { return (__mmask32)__builtin_ia32_kandnsi((__mmask32)__A, (__mmask32)__B); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kandn_mask64(__mmask64 __A, - __mmask64 __B) { +static __inline__ __mmask64 __DEFAULT_FN_ATTRS_CONSTEXPR +_kandn_mask64(__mmask64 __A, __mmask64 __B) { return (__mmask64)__builtin_ia32_kandndi((__mmask64)__A, (__mmask64)__B); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_kor_mask32(__mmask32 __A, __mmask32 __B) -{ +static __inline__ __mmask32 __DEFAULT_FN_ATTRS_CONSTEXPR +_kor_mask32(__mmask32 __A, __mmask32 __B) { return (__mmask32)__builtin_ia32_korsi((__mmask32)__A, (__mmask32)__B); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kor_mask64(__mmask64 __A, - __mmask64 __B) { +static __inline__ __mmask64 __DEFAULT_FN_ATTRS_CONSTEXPR +_kor_mask64(__mmask64 __A, __mmask64 __B) { return (__mmask64)__builtin_ia32_kordi((__mmask64)__A, (__mmask64)__B); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_kxnor_mask32(__mmask32 __A, __mmask32 __B) -{ +static __inline__ __mmask32 __DEFAULT_FN_ATTRS_CONSTEXPR +_kxnor_mask32(__mmask32 __A, __mmask32 __B) { return (__mmask32)__builtin_ia32_kxnorsi((__mmask32)__A, (__mmask32)__B); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kxnor_mask64(__mmask64 __A, - __mmask64 __B) { +static __inline__ __mmask64 __DEFAULT_FN_ATTRS_CONSTEXPR +_kxnor_mask64(__mmask64 __A, __mmask64 __B) { return (__mmask64)__builtin_ia32_kxnordi((__mmask64)__A, (__mmask64)__B); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_kxor_mask32(__mmask32 __A, __mmask32 __B) -{ +static __inline__ __mmask32 __DEFAULT_FN_ATTRS_CONSTEXPR +_kxor_mask32(__mmask32 __A, __mmask32 __B) { return (__mmask32)__builtin_ia32_kxorsi((__mmask32)__A, (__mmask32)__B); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS 
_kxor_mask64(__mmask64 __A, - __mmask64 __B) { +static __inline__ __mmask64 __DEFAULT_FN_ATTRS_CONSTEXPR +_kxor_mask64(__mmask64 __A, __mmask64 __B) { return (__mmask64)__builtin_ia32_kxordi((__mmask64)__A, (__mmask64)__B); } @@ -165,14 +160,13 @@ _ktest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) { return (unsigned char)__builtin_ia32_ktestzdi(__A, __B); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_kadd_mask32(__mmask32 __A, __mmask32 __B) -{ +static __inline__ __mmask32 __DEFAULT_FN_ATTRS_CONSTEXPR +_kadd_mask32(__mmask32 __A, __mmask32 __B) { return (__mmask32)__builtin_ia32_kaddsi((__mmask32)__A, (__mmask32)__B); } -static __inline__ __mmask64 __DEFAULT_FN_ATTRS _kadd_mask64(__mmask64 __A, - __mmask64 __B) { +static __inline__ __mmask64 __DEFAULT_FN_ATTRS_CONSTEXPR +_kadd_mask64(__mmask64 __A, __mmask64 __B) { return (__mmask64)__builtin_ia32_kadddi((__mmask64)__A, (__mmask64)__B); } @@ -516,9 +510,8 @@ _mm512_maskz_abs_epi16(__mmask32 __U, __m512i __A) { (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_packs_epi32(__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_packs_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B); } @@ -538,9 +531,8 @@ _mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_packs_epi16(__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_packs_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B); } @@ -560,9 +552,8 @@ _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_packus_epi32(__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_packus_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B); } @@ -582,9 +573,8 @@ _mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) (__v32hi)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_packus_epi16(__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_packus_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B); } diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h index 95fdc2851cb9b..fb65bf933b8ad 100644 --- a/clang/lib/Headers/avx512dqintrin.h +++ b/clang/lib/Headers/avx512dqintrin.h @@ -29,39 +29,33 @@ #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS #endif -static __inline __mmask8 __DEFAULT_FN_ATTRS -_knot_mask8(__mmask8 __M) -{ +static __inline __mmask8 __DEFAULT_FN_ATTRS_CONSTEXPR +_knot_mask8(__mmask8 __M) { return __builtin_ia32_knotqi(__M); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_kand_mask8(__mmask8 __A, __mmask8 __B) -{ +static __inline__ __mmask8 __DEFAULT_FN_ATTRS_CONSTEXPR +_kand_mask8(__mmask8 __A, __mmask8 __B) { return (__mmask8)__builtin_ia32_kandqi((__mmask8)__A, (__mmask8)__B); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_kandn_mask8(__mmask8 __A, __mmask8 __B) -{ +static __inline__ __mmask8 __DEFAULT_FN_ATTRS_CONSTEXPR +_kandn_mask8(__mmask8 __A, __mmask8 __B) { return (__mmask8)__builtin_ia32_kandnqi((__mmask8)__A, (__mmask8)__B); } -static __inline__ 
__mmask8 __DEFAULT_FN_ATTRS -_kor_mask8(__mmask8 __A, __mmask8 __B) -{ +static __inline__ __mmask8 __DEFAULT_FN_ATTRS_CONSTEXPR +_kor_mask8(__mmask8 __A, __mmask8 __B) { return (__mmask8)__builtin_ia32_korqi((__mmask8)__A, (__mmask8)__B); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_kxnor_mask8(__mmask8 __A, __mmask8 __B) -{ +static __inline__ __mmask8 __DEFAULT_FN_ATTRS_CONSTEXPR +_kxnor_mask8(__mmask8 __A, __mmask8 __B) { return (__mmask8)__builtin_ia32_kxnorqi((__mmask8)__A, (__mmask8)__B); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_kxor_mask8(__mmask8 __A, __mmask8 __B) -{ +static __inline__ __mmask8 __DEFAULT_FN_ATTRS_CONSTEXPR +_kxor_mask8(__mmask8 __A, __mmask8 __B) { return (__mmask8)__builtin_ia32_kxorqi((__mmask8)__A, (__mmask8)__B); } @@ -119,15 +113,13 @@ _ktest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) { return (unsigned char)__builtin_ia32_ktestzhi(__A, __B); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_kadd_mask8(__mmask8 __A, __mmask8 __B) -{ +static __inline__ __mmask8 __DEFAULT_FN_ATTRS_CONSTEXPR +_kadd_mask8(__mmask8 __A, __mmask8 __B) { return (__mmask8)__builtin_ia32_kaddqi((__mmask8)__A, (__mmask8)__B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_kadd_mask16(__mmask16 __A, __mmask16 __B) -{ +static __inline__ __mmask16 __DEFAULT_FN_ATTRS_CONSTEXPR +_kadd_mask16(__mmask16 __A, __mmask16 __B) { return (__mmask16)__builtin_ia32_kaddhi((__mmask16)__A, (__mmask16)__B); } diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 8dc556beccbcf..80e58425cdd71 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -520,15 +520,13 @@ _mm512_castsi512_si256(__m512i __A) { return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_int2mask(int __a) -{ +static __inline__ __mmask16 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_int2mask(int __a) { return (__mmask16)__a; } -static __inline__ int __DEFAULT_FN_ATTRS -_mm512_mask2int(__mmask16 __a) -{ +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask2int(__mmask16 __a) { return (int)__a; } @@ -4394,9 +4392,8 @@ _mm512_store_epi64 (void *__P, __m512i __A) /* Mask ops */ -static __inline __mmask16 __DEFAULT_FN_ATTRS -_mm512_knot(__mmask16 __M) -{ +static __inline __mmask16 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_knot(__mmask16 __M) { return __builtin_ia32_knothi(__M); } @@ -8085,21 +8082,18 @@ _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, #define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32 -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_kand (__mmask16 __A, __mmask16 __B) -{ +static __inline__ __mmask16 + __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_kand(__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_kandn (__mmask16 __A, __mmask16 __B) -{ +static __inline__ __mmask16 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_kandn(__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_kor (__mmask16 __A, __mmask16 __B) -{ +static __inline__ __mmask16 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_kor(__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B); } @@ -8139,15 +8133,13 @@ _mm512_kunpackb (__mmask16 __A, __mmask16 __B) return (__mmask16) __builtin_ia32_kunpckhi 
((__mmask16) __A, (__mmask16) __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_kxnor (__mmask16 __A, __mmask16 __B) -{ +static __inline__ __mmask16 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_kxnor(__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_kxor (__mmask16 __A, __mmask16 __B) -{ +static __inline__ __mmask16 __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_kxor(__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B); } diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h index 16a4ff3034244..c0bcc080dbe93 100644 --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -42,7 +42,8 @@ static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) { return __a[0]; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) { +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_set_sh(_Float16 __h) { return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0}; } @@ -57,23 +58,23 @@ _mm256_set1_ph(_Float16 __h) { __h, __h, __h, __h, __h, __h, __h, __h}; } -static __inline __m128h __DEFAULT_FN_ATTRS128 +static __inline __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) { return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1}; } -static __inline __m256h __DEFAULT_FN_ATTRS256 +static __inline __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_set1_pch(_Float16 _Complex h) { return (__m256h)_mm256_set1_ps(__builtin_bit_cast(float, h)); } -static __inline __m128h __DEFAULT_FN_ATTRS128 +static __inline __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_set1_pch(_Float16 _Complex h) { return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h)); } -static __inline __m256h __DEFAULT_FN_ATTRS256 +static __inline __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, @@ -83,13 +84,13 @@ _mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, __h4, __h3, __h2, __h1}; } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_setr_ph(_Float16 e0, _Float16 e1, _Float16 e2, _Float16 e3, _Float16 e4, _Float16 e5, _Float16 e6, _Float16 e7) { return _mm_set_ph(e7, e6, e5, e4, e3, e2, e1, e0); } -static __inline__ __m256h __DEFAULT_FN_ATTRS256 +static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_setr_ph(_Float16 e0, _Float16 e1, _Float16 e2, _Float16 e3, _Float16 e4, _Float16 e5, _Float16 e6, _Float16 e7, _Float16 e8, _Float16 e9, _Float16 e10, _Float16 e11, _Float16 e12, _Float16 e13, diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index fca6229a065be..6597e7e7d4030 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -4159,8 +4159,8 @@ void _mm_mfence(void); /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are /// written to the higher 64 bits of the result. /// \returns A 128-bit vector of [16 x i8] containing the converted values. 
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_packs_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); } @@ -4182,8 +4182,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, /// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values /// are written to the higher 64 bits of the result. /// \returns A 128-bit vector of [8 x i16] containing the converted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_packs_epi32(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); } @@ -4205,8 +4205,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are /// written to the higher 64 bits of the result. /// \returns A 128-bit vector of [16 x i8] containing the converted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_packus_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); } diff --git a/clang/lib/Headers/f16cintrin.h b/clang/lib/Headers/f16cintrin.h index 83965334e2c9b..b6ca7088d3864 100644 --- a/clang/lib/Headers/f16cintrin.h +++ b/clang/lib/Headers/f16cintrin.h @@ -15,17 +15,20 @@ #define __F16CINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS128 \ - __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 \ - __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(256))) - #if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr -#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("f16c"), \ + __min_vector_width__(128))) constexpr +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("f16c"), \ + __min_vector_width__(256))) constexpr #else -#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 -#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("f16c"), \ + __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("f16c"), \ + __min_vector_width__(256))) #endif /* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h, @@ -43,7 +46,7 @@ /// \param __a /// A 16-bit half-precision float value. /// \returns The converted 32-bit float value. -static __inline float __DEFAULT_FN_ATTRS128_CONSTEXPR +static __inline float __DEFAULT_FN_ATTRS128 _cvtsh_ss(unsigned short __a) { return (float)__builtin_bit_cast(__fp16, __a); @@ -112,7 +115,7 @@ _cvtsh_ss(unsigned short __a) /// A 128-bit vector containing 16-bit half-precision float values. The lower /// 64 bits are used in the conversion. /// \returns A 128-bit vector of [4 x float] containing converted float values. 
-static __inline __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_cvtph_ps(__m128i __a) { typedef __fp16 __v4fp16 __attribute__((__vector_size__(8))); @@ -159,7 +162,7 @@ _mm_cvtph_ps(__m128i __a) /// converted to 32-bit single-precision float values. /// \returns A vector of [8 x float] containing the converted 32-bit /// single-precision float values. -static __inline __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtph_ps(__m128i __a) { typedef __fp16 __v8fp16 __attribute__((__vector_size__(16), __aligned__(16))); @@ -169,7 +172,5 @@ _mm256_cvtph_ps(__m128i __a) #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 -#undef __DEFAULT_FN_ATTRS128_CONSTEXPR -#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif /* __F16CINTRIN_H */ diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h index 21a9c30d9f445..d973371312701 100644 --- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h @@ -1292,6 +1292,39 @@ bool3 isinf(float3); _HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isinf) bool4 isinf(float4); +//===----------------------------------------------------------------------===// +// isnan builtins +//===----------------------------------------------------------------------===// + +/// \fn T isnan(T x) +/// \brief Determines if the specified value \a x is Not a Number. +/// \param x The specified input value. +/// +/// Returns a value of the same size as the input, with a value set +/// to True if the x parameter is NaN or QNaN. Otherwise, False. + +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isnan) +bool isnan(half); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isnan) +bool2 isnan(half2); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isnan) +bool3 isnan(half3); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isnan) +bool4 isnan(half4); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isnan) +bool isnan(float); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isnan) +bool2 isnan(float2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isnan) +bool3 isnan(float3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_isnan) +bool4 isnan(float4); + //===----------------------------------------------------------------------===// // lerp builtins //===----------------------------------------------------------------------===// @@ -2805,7 +2838,7 @@ float4 radians(float4); /// call. 
_HLSL_BUILTIN_ALIAS(__builtin_hlsl_group_memory_barrier_with_group_sync) -void GroupMemoryBarrierWithGroupSync(void); +__attribute__((convergent)) void GroupMemoryBarrierWithGroupSync(void); } // namespace hlsl #endif //_HLSL_HLSL_ALIAS_INTRINSICS_H_ diff --git a/clang/lib/Headers/hlsl/hlsl_basic_types.h b/clang/lib/Headers/hlsl/hlsl_basic_types.h index eff94e0d7f950..fc1e265067714 100644 --- a/clang/lib/Headers/hlsl/hlsl_basic_types.h +++ b/clang/lib/Headers/hlsl/hlsl_basic_types.h @@ -115,6 +115,239 @@ typedef vector float64_t2; typedef vector float64_t3; typedef vector float64_t4; +#ifdef __HLSL_ENABLE_16_BIT +typedef matrix int16_t1x1; +typedef matrix int16_t1x2; +typedef matrix int16_t1x3; +typedef matrix int16_t1x4; +typedef matrix int16_t2x1; +typedef matrix int16_t2x2; +typedef matrix int16_t2x3; +typedef matrix int16_t2x4; +typedef matrix int16_t3x1; +typedef matrix int16_t3x2; +typedef matrix int16_t3x3; +typedef matrix int16_t3x4; +typedef matrix int16_t4x1; +typedef matrix int16_t4x2; +typedef matrix int16_t4x3; +typedef matrix int16_t4x4; +typedef matrix uint16_t1x1; +typedef matrix uint16_t1x2; +typedef matrix uint16_t1x3; +typedef matrix uint16_t1x4; +typedef matrix uint16_t2x1; +typedef matrix uint16_t2x2; +typedef matrix uint16_t2x3; +typedef matrix uint16_t2x4; +typedef matrix uint16_t3x1; +typedef matrix uint16_t3x2; +typedef matrix uint16_t3x3; +typedef matrix uint16_t3x4; +typedef matrix uint16_t4x1; +typedef matrix uint16_t4x2; +typedef matrix uint16_t4x3; +typedef matrix uint16_t4x4; +#endif + +typedef matrix int1x1; +typedef matrix int1x2; +typedef matrix int1x3; +typedef matrix int1x4; +typedef matrix int2x1; +typedef matrix int2x2; +typedef matrix int2x3; +typedef matrix int2x4; +typedef matrix int3x1; +typedef matrix int3x2; +typedef matrix int3x3; +typedef matrix int3x4; +typedef matrix int4x1; +typedef matrix int4x2; +typedef matrix int4x3; +typedef matrix int4x4; +typedef matrix uint1x1; +typedef matrix uint1x2; +typedef matrix uint1x3; +typedef matrix uint1x4; +typedef matrix uint2x1; +typedef matrix uint2x2; +typedef matrix uint2x3; +typedef matrix uint2x4; +typedef matrix uint3x1; +typedef matrix uint3x2; +typedef matrix uint3x3; +typedef matrix uint3x4; +typedef matrix uint4x1; +typedef matrix uint4x2; +typedef matrix uint4x3; +typedef matrix uint4x4; +typedef matrix int32_t1x1; +typedef matrix int32_t1x2; +typedef matrix int32_t1x3; +typedef matrix int32_t1x4; +typedef matrix int32_t2x1; +typedef matrix int32_t2x2; +typedef matrix int32_t2x3; +typedef matrix int32_t2x4; +typedef matrix int32_t3x1; +typedef matrix int32_t3x2; +typedef matrix int32_t3x3; +typedef matrix int32_t3x4; +typedef matrix int32_t4x1; +typedef matrix int32_t4x2; +typedef matrix int32_t4x3; +typedef matrix int32_t4x4; +typedef matrix uint32_t1x1; +typedef matrix uint32_t1x2; +typedef matrix uint32_t1x3; +typedef matrix uint32_t1x4; +typedef matrix uint32_t2x1; +typedef matrix uint32_t2x2; +typedef matrix uint32_t2x3; +typedef matrix uint32_t2x4; +typedef matrix uint32_t3x1; +typedef matrix uint32_t3x2; +typedef matrix uint32_t3x3; +typedef matrix uint32_t3x4; +typedef matrix uint32_t4x1; +typedef matrix uint32_t4x2; +typedef matrix uint32_t4x3; +typedef matrix uint32_t4x4; +typedef matrix int64_t1x1; +typedef matrix int64_t1x2; +typedef matrix int64_t1x3; +typedef matrix int64_t1x4; +typedef matrix int64_t2x1; +typedef matrix int64_t2x2; +typedef matrix int64_t2x3; +typedef matrix int64_t2x4; +typedef matrix int64_t3x1; +typedef matrix int64_t3x2; +typedef matrix int64_t3x3; +typedef 
matrix int64_t3x4; +typedef matrix int64_t4x1; +typedef matrix int64_t4x2; +typedef matrix int64_t4x3; +typedef matrix int64_t4x4; +typedef matrix uint64_t1x1; +typedef matrix uint64_t1x2; +typedef matrix uint64_t1x3; +typedef matrix uint64_t1x4; +typedef matrix uint64_t2x1; +typedef matrix uint64_t2x2; +typedef matrix uint64_t2x3; +typedef matrix uint64_t2x4; +typedef matrix uint64_t3x1; +typedef matrix uint64_t3x2; +typedef matrix uint64_t3x3; +typedef matrix uint64_t3x4; +typedef matrix uint64_t4x1; +typedef matrix uint64_t4x2; +typedef matrix uint64_t4x3; +typedef matrix uint64_t4x4; + +typedef matrix half1x1; +typedef matrix half1x2; +typedef matrix half1x3; +typedef matrix half1x4; +typedef matrix half2x1; +typedef matrix half2x2; +typedef matrix half2x3; +typedef matrix half2x4; +typedef matrix half3x1; +typedef matrix half3x2; +typedef matrix half3x3; +typedef matrix half3x4; +typedef matrix half4x1; +typedef matrix half4x2; +typedef matrix half4x3; +typedef matrix half4x4; +typedef matrix float1x1; +typedef matrix float1x2; +typedef matrix float1x3; +typedef matrix float1x4; +typedef matrix float2x1; +typedef matrix float2x2; +typedef matrix float2x3; +typedef matrix float2x4; +typedef matrix float3x1; +typedef matrix float3x2; +typedef matrix float3x3; +typedef matrix float3x4; +typedef matrix float4x1; +typedef matrix float4x2; +typedef matrix float4x3; +typedef matrix float4x4; +typedef matrix double1x1; +typedef matrix double1x2; +typedef matrix double1x3; +typedef matrix double1x4; +typedef matrix double2x1; +typedef matrix double2x2; +typedef matrix double2x3; +typedef matrix double2x4; +typedef matrix double3x1; +typedef matrix double3x2; +typedef matrix double3x3; +typedef matrix double3x4; +typedef matrix double4x1; +typedef matrix double4x2; +typedef matrix double4x3; +typedef matrix double4x4; + +#ifdef __HLSL_ENABLE_16_BIT +typedef matrix float16_t1x1; +typedef matrix float16_t1x2; +typedef matrix float16_t1x3; +typedef matrix float16_t1x4; +typedef matrix float16_t2x1; +typedef matrix float16_t2x2; +typedef matrix float16_t2x3; +typedef matrix float16_t2x4; +typedef matrix float16_t3x1; +typedef matrix float16_t3x2; +typedef matrix float16_t3x3; +typedef matrix float16_t3x4; +typedef matrix float16_t4x1; +typedef matrix float16_t4x2; +typedef matrix float16_t4x3; +typedef matrix float16_t4x4; +#endif + +typedef matrix float32_t1x1; +typedef matrix float32_t1x2; +typedef matrix float32_t1x3; +typedef matrix float32_t1x4; +typedef matrix float32_t2x1; +typedef matrix float32_t2x2; +typedef matrix float32_t2x3; +typedef matrix float32_t2x4; +typedef matrix float32_t3x1; +typedef matrix float32_t3x2; +typedef matrix float32_t3x3; +typedef matrix float32_t3x4; +typedef matrix float32_t4x1; +typedef matrix float32_t4x2; +typedef matrix float32_t4x3; +typedef matrix float32_t4x4; +typedef matrix float64_t1x1; +typedef matrix float64_t1x2; +typedef matrix float64_t1x3; +typedef matrix float64_t1x4; +typedef matrix float64_t2x1; +typedef matrix float64_t2x2; +typedef matrix float64_t2x3; +typedef matrix float64_t2x4; +typedef matrix float64_t3x1; +typedef matrix float64_t3x2; +typedef matrix float64_t3x3; +typedef matrix float64_t3x4; +typedef matrix float64_t4x1; +typedef matrix float64_t4x2; +typedef matrix float64_t4x3; +typedef matrix float64_t4x4; + } // namespace hlsl #endif //_HLSL_HLSL_BASIC_TYPES_H_ diff --git a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h index 72a7bed21f3c9..fe4277ed4a7d2 100644 --- 
a/clang/lib/Headers/hlsl/hlsl_compat_overloads.h +++ b/clang/lib/Headers/hlsl/hlsl_compat_overloads.h @@ -352,6 +352,15 @@ constexpr bool3 isinf(double3 V) { return isinf((float3)V); } _DXC_DEPRECATED_64BIT_FN(fn) constexpr bool4 isinf(double4 V) { return isinf((float4)V); } +//===----------------------------------------------------------------------===// +// isnan builtins overloads +//===----------------------------------------------------------------------===// + +constexpr bool isnan(double V) { return isnan((float)V); } +constexpr bool2 isnan(double2 V) { return isnan((float2)V); } +constexpr bool3 isnan(double3 V) { return isnan((float3)V); } +constexpr bool4 isnan(double4 V) { return isnan((float4)V); } + //===----------------------------------------------------------------------===// // lerp builtins overloads //===----------------------------------------------------------------------===// diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index d9d87c827e6a4..5ba5bfb9abde0 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -422,6 +422,30 @@ constexpr int4 D3DCOLORtoUBYTE4(float4 V) { return __detail::d3d_color_to_ubyte4_impl(V); } +//===----------------------------------------------------------------------===// +// NonUniformResourceIndex builtin +//===----------------------------------------------------------------------===// + +/// \fn uint NonUniformResourceIndex(uint I) +/// \brief A compiler hint to indicate that a resource index varies across +/// threads within a wave (i.e., it is non-uniform). +/// \param I [in] Resource array index +/// +/// The return value is the \Index parameter. +/// +/// When indexing into an array of shader resources (e.g., textures, buffers), +/// some GPU hardware and drivers require the compiler to know whether the index +/// is uniform (same for all threads) or non-uniform (varies per thread). +/// +/// Using NonUniformResourceIndex explicitly marks an index as non-uniform, +/// disabling certain assumptions or optimizations that could lead to incorrect +/// behavior when dynamically accessing resource arrays with non-uniform +/// indices. + +constexpr uint32_t NonUniformResourceIndex(uint32_t Index) { + return __builtin_hlsl_resource_nonuniformindex(Index); +} + //===----------------------------------------------------------------------===// // reflect builtin //===----------------------------------------------------------------------===// diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h index 18e2c2154362a..5f617530b6f78 100644 --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -156,11 +156,10 @@ _mm_cvtm64_si64(__m64 __m) /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_packs_pi16(__m64 __m1, __m64 __m2) -{ - return __trunc64(__builtin_ia32_packsswb128( - (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_packs_pi16(__m64 __m1, __m64 __m2) { + return __trunc64(__builtin_ia32_packsswb128( + (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); } /// Converts, with saturation, 32-bit signed integers from both 64-bit integer @@ -182,11 +181,10 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2) /// written to the upper 32 bits of the result. 
/// \returns A 64-bit integer vector of [4 x i16] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_packs_pi32(__m64 __m1, __m64 __m2) -{ - return __trunc64(__builtin_ia32_packssdw128( - (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_packs_pi32(__m64 __m1, __m64 __m2) { + return __trunc64(__builtin_ia32_packssdw128( + (__v4si)__builtin_shufflevector(__m1, __m2, 0, 1), (__v4si){})); } /// Converts, with saturation, 16-bit signed integers from both 64-bit integer @@ -208,11 +206,10 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2) /// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_packs_pu16(__m64 __m1, __m64 __m2) -{ - return __trunc64(__builtin_ia32_packuswb128( - (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_packs_pu16(__m64 __m1, __m64 __m2) { + return __trunc64(__builtin_ia32_packuswb128( + (__v8hi)__builtin_shufflevector(__m1, __m2, 0, 1), (__v8hi){})); } /// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8] diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h index 6319fdbbeb8f0..5e63a1ae321bc 100644 --- a/clang/lib/Headers/smmintrin.h +++ b/clang/lib/Headers/smmintrin.h @@ -1466,8 +1466,8 @@ _mm_cvtepu32_epi64(__m128i __V) { /// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are /// written to the higher 64 bits of the result. /// \returns A 128-bit vector of [8 x i16] containing the converted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1, - __m128i __V2) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_packus_epi32(__m128i __V1, __m128i __V2) { return (__m128i)__builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); } @@ -1534,9 +1534,16 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) { so we'll do the same. */ #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + /* These specify the type of data that we're comparing. */ #define _SIDD_UBYTE_OPS 0x00 #define _SIDD_UWORD_OPS 0x01 diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h index 4891e3ce077b5..d876b4735a7d2 100644 --- a/clang/lib/Headers/xmmintrin.h +++ b/clang/lib/Headers/xmmintrin.h @@ -2363,9 +2363,8 @@ _mm_max_pi16(__m64 __a, __m64 __b) { /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_max_pu8(__m64 __a, __m64 __b) -{ +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_max_pu8(__m64 __a, __m64 __b) { return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b); } @@ -2400,9 +2399,8 @@ _mm_min_pi16(__m64 __a, __m64 __b) { /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. 
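As a usage sketch of what the *_CONSTEXPR attribute swaps in these intrinsic headers buy: in C++14-or-later translation units on a default x86-64 target (where SSE/SSE2 are baseline), the rewritten helpers become usable in constant expressions. Illustrative only; not taken from the patch's tests.

#include <xmmintrin.h>

constexpr __m64 max_of_zeros() {
  __m64 a = {};              // zero-initialized 64-bit vector
  __m64 b = {};
  return _mm_max_pu8(a, b);  // constant-evaluated once the helper is constexpr
}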
-static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 -_mm_min_pu8(__m64 __a, __m64 __b) -{ +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR +_mm_min_pu8(__m64 __a, __m64 __b) { return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b); } diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp index f28a74f5d0ae5..ae09f70ee7896 100644 --- a/clang/lib/Lex/HeaderSearch.cpp +++ b/clang/lib/Lex/HeaderSearch.cpp @@ -672,9 +672,8 @@ OptionalFileEntryRef DirectoryLookup::DoFrameworkLookup( if (getDirCharacteristic() == SrcMgr::C_User) { SmallString<1024> SystemFrameworkMarker(FrameworkName); SystemFrameworkMarker += ".system_framework"; - if (llvm::sys::fs::exists(SystemFrameworkMarker)) { + if (FileMgr.getOptionalFileRef(SystemFrameworkMarker)) CacheEntry.IsUserSpecifiedSystemFramework = true; - } } } diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 02f3f109b2562..04f29c83dd457 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -2968,6 +2968,39 @@ OMPClause *Parser::ParseOpenMPSizesClause() { OpenLoc, CloseLoc); } +OMPClause *Parser::ParseOpenMPLoopRangeClause() { + SourceLocation ClauseNameLoc = ConsumeToken(); + SourceLocation FirstLoc, CountLoc; + + BalancedDelimiterTracker T(*this, tok::l_paren, tok::annot_pragma_openmp_end); + if (T.consumeOpen()) { + Diag(Tok, diag::err_expected) << tok::l_paren; + return nullptr; + } + + FirstLoc = Tok.getLocation(); + ExprResult FirstVal = ParseConstantExpression(); + if (!FirstVal.isUsable()) { + T.skipToEnd(); + return nullptr; + } + + ExpectAndConsume(tok::comma); + + CountLoc = Tok.getLocation(); + ExprResult CountVal = ParseConstantExpression(); + if (!CountVal.isUsable()) { + T.skipToEnd(); + return nullptr; + } + + T.consumeClose(); + + return Actions.OpenMP().ActOnOpenMPLoopRangeClause( + FirstVal.get(), CountVal.get(), ClauseNameLoc, T.getOpenLocation(), + FirstLoc, CountLoc, T.getCloseLocation()); +} + OMPClause *Parser::ParseOpenMPPermutationClause() { SourceLocation ClauseNameLoc, OpenLoc, CloseLoc; SmallVector ArgExprs; @@ -3473,6 +3506,9 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, } Clause = ParseOpenMPClause(CKind, WrongDirective); break; + case OMPC_looprange: + Clause = ParseOpenMPLoopRangeClause(); + break; default: break; } diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index e02e00231e58e..e8a7ad3bd355a 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -10,6 +10,7 @@ #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" #include "clang/AST/Type.h" +#include "clang/Analysis/Analyses/LifetimeAnnotations.h" #include "clang/Basic/DiagnosticSema.h" #include "clang/Sema/Initialization.h" #include "clang/Sema/Sema.h" @@ -503,60 +504,6 @@ shouldTrackFirstArgumentForConstructor(const CXXConstructExpr *Ctor) { return true; } -// Return true if this is an "normal" assignment operator. -// We assume that a normal assignment operator always returns *this, that is, -// an lvalue reference that is the same type as the implicit object parameter -// (or the LHS for a non-member operator$=). 
-static bool isNormalAssignmentOperator(const FunctionDecl *FD) { - OverloadedOperatorKind OO = FD->getDeclName().getCXXOverloadedOperator(); - if (OO == OO_Equal || isCompoundAssignmentOperator(OO)) { - QualType RetT = FD->getReturnType(); - if (RetT->isLValueReferenceType()) { - ASTContext &Ctx = FD->getASTContext(); - QualType LHST; - auto *MD = dyn_cast(FD); - if (MD && MD->isCXXInstanceMember()) - LHST = Ctx.getLValueReferenceType(MD->getFunctionObjectParameterType()); - else - LHST = FD->getParamDecl(0)->getType(); - if (Ctx.hasSameType(RetT, LHST)) - return true; - } - } - return false; -} - -static const FunctionDecl * -getDeclWithMergedLifetimeBoundAttrs(const FunctionDecl *FD) { - return FD != nullptr ? FD->getMostRecentDecl() : nullptr; -} - -static const CXXMethodDecl * -getDeclWithMergedLifetimeBoundAttrs(const CXXMethodDecl *CMD) { - const FunctionDecl *FD = CMD; - return cast_if_present( - getDeclWithMergedLifetimeBoundAttrs(FD)); -} - -bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD) { - FD = getDeclWithMergedLifetimeBoundAttrs(FD); - const TypeSourceInfo *TSI = FD->getTypeSourceInfo(); - if (!TSI) - return false; - // Don't declare this variable in the second operand of the for-statement; - // GCC miscompiles that by ending its lifetime before evaluating the - // third operand. See gcc.gnu.org/PR86769. - AttributedTypeLoc ATL; - for (TypeLoc TL = TSI->getTypeLoc(); - (ATL = TL.getAsAdjusted()); - TL = ATL.getModifiedLoc()) { - if (ATL.getAttrAs()) - return true; - } - - return isNormalAssignmentOperator(FD); -} - // Visit lifetimebound or gsl-pointer arguments. static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, LocalVisitor Visit) { @@ -639,7 +586,8 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, // lifetimebound. 
if (Sema::CanBeGetReturnObject(Callee)) CheckCoroObjArg = false; - if (implicitObjectParamIsLifetimeBound(Callee) || CheckCoroObjArg) + if (lifetimes::implicitObjectParamIsLifetimeBound(Callee) || + CheckCoroObjArg) VisitLifetimeBoundArg(Callee, ObjectArg); else if (EnableGSLAnalysis) { if (auto *CME = dyn_cast(Callee); @@ -648,7 +596,8 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, } } - const FunctionDecl *CanonCallee = getDeclWithMergedLifetimeBoundAttrs(Callee); + const FunctionDecl *CanonCallee = + lifetimes::getDeclWithMergedLifetimeBoundAttrs(Callee); unsigned NP = std::min(Callee->getNumParams(), CanonCallee->getNumParams()); for (unsigned I = 0, N = std::min(NP, Args.size()); I != N; ++I) { Expr *Arg = Args[I]; @@ -1276,19 +1225,14 @@ static AnalysisResult analyzePathForGSLPointer(const IndirectLocalPath &Path, return Report; } -static bool isAssignmentOperatorLifetimeBound(const CXXMethodDecl *CMD) { - CMD = getDeclWithMergedLifetimeBoundAttrs(CMD); - return CMD && isNormalAssignmentOperator(CMD) && CMD->param_size() == 1 && - CMD->getParamDecl(0)->hasAttr(); -} - static bool shouldRunGSLAssignmentAnalysis(const Sema &SemaRef, const AssignedEntity &Entity) { bool EnableGSLAssignmentWarnings = !SemaRef.getDiagnostics().isIgnored( diag::warn_dangling_lifetime_pointer_assignment, SourceLocation()); return (EnableGSLAssignmentWarnings && (isRecordWithAttr(Entity.LHS->getType()) || - isAssignmentOperatorLifetimeBound(Entity.AssignmentOperator))); + lifetimes::isAssignmentOperatorLifetimeBound( + Entity.AssignmentOperator))); } static void @@ -1610,11 +1554,11 @@ checkExprLifetimeImpl(Sema &SemaRef, const InitializedEntity *InitEntity, switch (LK) { case LK_Assignment: { if (shouldRunGSLAssignmentAnalysis(SemaRef, *AEntity)) - Path.push_back( - {isAssignmentOperatorLifetimeBound(AEntity->AssignmentOperator) - ? IndirectLocalPathEntry::LifetimeBoundCall - : IndirectLocalPathEntry::GslPointerAssignment, - Init}); + Path.push_back({lifetimes::isAssignmentOperatorLifetimeBound( + AEntity->AssignmentOperator) + ? 
IndirectLocalPathEntry::LifetimeBoundCall + : IndirectLocalPathEntry::GslPointerAssignment, + Init}); break; } case LK_LifetimeCapture: { diff --git a/clang/lib/Sema/CheckExprLifetime.h b/clang/lib/Sema/CheckExprLifetime.h index 6351e52a362f1..16595d0ca1b36 100644 --- a/clang/lib/Sema/CheckExprLifetime.h +++ b/clang/lib/Sema/CheckExprLifetime.h @@ -60,8 +60,6 @@ void checkCaptureByLifetime(Sema &SemaRef, const CapturingEntity &Entity, void checkExprLifetimeMustTailArg(Sema &SemaRef, const InitializedEntity &Entity, Expr *Init); -bool implicitObjectParamIsLifetimeBound(const FunctionDecl *FD); - } // namespace clang::sema #endif // LLVM_CLANG_SEMA_CHECK_EXPR_LIFETIME_H diff --git a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp index 5eafd03d89efe..97a6a7f1439db 100644 --- a/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp +++ b/clang/lib/Sema/HLSLBuiltinTypeDeclBuilder.cpp @@ -748,8 +748,7 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addHandleMember( // Adds default constructor to the resource class: // Resource::Resource() BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addDefaultHandleConstructor() { - if (Record->isCompleteDefinition()) - return *this; + assert(!Record->isCompleteDefinition() && "record is already complete"); using PH = BuiltinTypeMethodBuilder::PlaceHolder; QualType HandleType = getResourceHandleField()->getType(); @@ -773,8 +772,7 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addDefaultHandleConstructor() { // return tmp; // } BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addCreateFromBinding() { - if (Record->isCompleteDefinition()) - return *this; + assert(!Record->isCompleteDefinition() && "record is already complete"); using PH = BuiltinTypeMethodBuilder::PlaceHolder; ASTContext &AST = SemaRef.getASTContext(); @@ -811,8 +809,7 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addCreateFromBinding() { // return tmp; // } BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addCreateFromImplicitBinding() { - if (Record->isCompleteDefinition()) - return *this; + assert(!Record->isCompleteDefinition() && "record is already complete"); using PH = BuiltinTypeMethodBuilder::PlaceHolder; ASTContext &AST = SemaRef.getASTContext(); @@ -838,8 +835,7 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addCreateFromImplicitBinding() { } BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addCopyConstructor() { - if (Record->isCompleteDefinition()) - return *this; + assert(!Record->isCompleteDefinition() && "record is already complete"); ASTContext &AST = SemaRef.getASTContext(); QualType RecordType = AST.getCanonicalTagType(Record); @@ -857,8 +853,7 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addCopyConstructor() { } BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addCopyAssignmentOperator() { - if (Record->isCompleteDefinition()) - return *this; + assert(!Record->isCompleteDefinition() && "record is already complete"); ASTContext &AST = SemaRef.getASTContext(); QualType RecordType = AST.getCanonicalTagType(Record); @@ -889,8 +884,7 @@ BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addArraySubscriptOperators() { } BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::addLoadMethods() { - if (Record->isCompleteDefinition()) - return *this; + assert(!Record->isCompleteDefinition() && "record is already complete"); ASTContext &AST = Record->getASTContext(); IdentifierInfo &II = AST.Idents.get("Load", tok::TokenKind::identifier); @@ -931,12 +925,6 @@ BuiltinTypeDeclBuilder::getResourceAttrs() const { return 
cast(HandleType)->getAttrs(); } -// BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::startDefinition() { -// assert(!Record->isCompleteDefinition() && "record is already complete"); -// Record->startDefinition(); -// return *this; -// } - BuiltinTypeDeclBuilder &BuiltinTypeDeclBuilder::completeDefinition() { assert(!Record->isCompleteDefinition() && "record is already complete"); assert(Record->isBeingDefined() && diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index 781f0445d0b61..464922b6257b6 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -121,8 +121,110 @@ void HLSLExternalSemaSource::defineHLSLVectorAlias() { HLSLNamespace->addDecl(Template); } +void HLSLExternalSemaSource::defineHLSLMatrixAlias() { + ASTContext &AST = SemaPtr->getASTContext(); + llvm::SmallVector TemplateParams; + + auto *TypeParam = TemplateTypeParmDecl::Create( + AST, HLSLNamespace, SourceLocation(), SourceLocation(), 0, 0, + &AST.Idents.get("element", tok::TokenKind::identifier), false, false); + TypeParam->setDefaultArgument( + AST, SemaPtr->getTrivialTemplateArgumentLoc( + TemplateArgument(AST.FloatTy), QualType(), SourceLocation())); + + TemplateParams.emplace_back(TypeParam); + + // these should be 64 bit to be consistent with other clang matrices. + auto *RowsParam = NonTypeTemplateParmDecl::Create( + AST, HLSLNamespace, SourceLocation(), SourceLocation(), 0, 1, + &AST.Idents.get("rows_count", tok::TokenKind::identifier), AST.IntTy, + false, AST.getTrivialTypeSourceInfo(AST.IntTy)); + llvm::APInt RVal(AST.getIntWidth(AST.IntTy), 4); + TemplateArgument RDefault(AST, llvm::APSInt(std::move(RVal)), AST.IntTy, + /*IsDefaulted=*/true); + RowsParam->setDefaultArgument( + AST, SemaPtr->getTrivialTemplateArgumentLoc(RDefault, AST.IntTy, + SourceLocation(), RowsParam)); + TemplateParams.emplace_back(RowsParam); + + auto *ColsParam = NonTypeTemplateParmDecl::Create( + AST, HLSLNamespace, SourceLocation(), SourceLocation(), 0, 2, + &AST.Idents.get("cols_count", tok::TokenKind::identifier), AST.IntTy, + false, AST.getTrivialTypeSourceInfo(AST.IntTy)); + llvm::APInt CVal(AST.getIntWidth(AST.IntTy), 4); + TemplateArgument CDefault(AST, llvm::APSInt(std::move(CVal)), AST.IntTy, + /*IsDefaulted=*/true); + ColsParam->setDefaultArgument( + AST, SemaPtr->getTrivialTemplateArgumentLoc(CDefault, AST.IntTy, + SourceLocation(), ColsParam)); + TemplateParams.emplace_back(ColsParam); + + const unsigned MaxMatDim = 4; + auto *MaxRow = IntegerLiteral::Create( + AST, llvm::APInt(AST.getIntWidth(AST.IntTy), MaxMatDim), AST.IntTy, + SourceLocation()); + auto *MaxCol = IntegerLiteral::Create( + AST, llvm::APInt(AST.getIntWidth(AST.IntTy), MaxMatDim), AST.IntTy, + SourceLocation()); + + auto *RowsRef = DeclRefExpr::Create( + AST, NestedNameSpecifierLoc(), SourceLocation(), RowsParam, + /*RefersToEnclosingVariableOrCapture*/ false, + DeclarationNameInfo(RowsParam->getDeclName(), SourceLocation()), + AST.IntTy, VK_LValue); + auto *ColsRef = DeclRefExpr::Create( + AST, NestedNameSpecifierLoc(), SourceLocation(), ColsParam, + /*RefersToEnclosingVariableOrCapture*/ false, + DeclarationNameInfo(ColsParam->getDeclName(), SourceLocation()), + AST.IntTy, VK_LValue); + + auto *RowsLE = BinaryOperator::Create(AST, RowsRef, MaxRow, BO_LE, AST.BoolTy, + VK_PRValue, OK_Ordinary, + SourceLocation(), FPOptionsOverride()); + auto *ColsLE = BinaryOperator::Create(AST, ColsRef, MaxCol, BO_LE, AST.BoolTy, + VK_PRValue, OK_Ordinary, + SourceLocation(), 
FPOptionsOverride()); + + auto *RequiresExpr = BinaryOperator::Create( + AST, RowsLE, ColsLE, BO_LAnd, AST.BoolTy, VK_PRValue, OK_Ordinary, + SourceLocation(), FPOptionsOverride()); + + auto *ParamList = TemplateParameterList::Create( + AST, SourceLocation(), SourceLocation(), TemplateParams, SourceLocation(), + RequiresExpr); + + IdentifierInfo &II = AST.Idents.get("matrix", tok::TokenKind::identifier); + + QualType AliasType = AST.getDependentSizedMatrixType( + AST.getTemplateTypeParmType(0, 0, false, TypeParam), + DeclRefExpr::Create( + AST, NestedNameSpecifierLoc(), SourceLocation(), RowsParam, false, + DeclarationNameInfo(RowsParam->getDeclName(), SourceLocation()), + AST.IntTy, VK_LValue), + DeclRefExpr::Create( + AST, NestedNameSpecifierLoc(), SourceLocation(), ColsParam, false, + DeclarationNameInfo(ColsParam->getDeclName(), SourceLocation()), + AST.IntTy, VK_LValue), + SourceLocation()); + + auto *Record = TypeAliasDecl::Create(AST, HLSLNamespace, SourceLocation(), + SourceLocation(), &II, + AST.getTrivialTypeSourceInfo(AliasType)); + Record->setImplicit(true); + + auto *Template = + TypeAliasTemplateDecl::Create(AST, HLSLNamespace, SourceLocation(), + Record->getIdentifier(), ParamList, Record); + + Record->setDescribedAliasTemplate(Template); + Template->setImplicit(true); + Template->setLexicalDeclContext(Record->getDeclContext()); + HLSLNamespace->addDecl(Template); +} + void HLSLExternalSemaSource::defineTrivialHLSLTypes() { defineHLSLVectorAlias(); + defineHLSLMatrixAlias(); } /// Set up common members and attributes for buffer types diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index bb98a39948fce..3a0c23187d45d 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -58,9 +58,11 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, [[fallthrough]]; } default: - Diag(ArgExpr->getExprLoc(), diag::err_amdgcn_load_lds_size_invalid_value) + SemaRef.targetDiag(ArgExpr->getExprLoc(), + diag::err_amdgcn_load_lds_size_invalid_value) << ArgExpr->getSourceRange(); - Diag(ArgExpr->getExprLoc(), diag::note_amdgcn_load_lds_size_valid_value) + SemaRef.targetDiag(ArgExpr->getExprLoc(), + diag::note_amdgcn_load_lds_size_valid_value) << HasGFX950Insts << ArgExpr->getSourceRange(); return true; } @@ -122,7 +124,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, << ArgExpr->getType(); auto Ord = ArgResult.Val.getInt().getZExtValue(); - // Check validity of memory ordering as per C11 / C++11's memody model. + // Check validity of memory ordering as per C11 / C++11's memory model. // Only fence needs check. Atomic dec/inc allow all memory orders. 
if (!llvm::isValidAtomicOrderingCABI(Ord)) return Diag(ArgExpr->getBeginLoc(), diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp index 99a29add8211d..35cdfbf8bf390 100644 --- a/clang/lib/Sema/SemaAPINotes.cpp +++ b/clang/lib/Sema/SemaAPINotes.cpp @@ -10,7 +10,6 @@ // //===----------------------------------------------------------------------===// -#include "CheckExprLifetime.h" #include "TypeLocBuilder.h" #include "clang/APINotes/APINotesReader.h" #include "clang/APINotes/Types.h" @@ -18,6 +17,7 @@ #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclObjC.h" #include "clang/AST/TypeLoc.h" +#include "clang/Analysis/Analyses/LifetimeAnnotations.h" #include "clang/Basic/SourceLocation.h" #include "clang/Lex/Lexer.h" #include "clang/Sema/SemaObjC.h" @@ -654,7 +654,7 @@ static void ProcessAPINotes(Sema &S, CXXMethodDecl *Method, const api_notes::CXXMethodInfo &Info, VersionedInfoMetadata Metadata) { if (Info.This && Info.This->isLifetimebound() && - !sema::implicitObjectParamIsLifetimeBound(Method)) { + !lifetimes::implicitObjectParamIsLifetimeBound(Method)) { auto MethodType = Method->getType(); auto *attr = ::new (S.Context) LifetimeBoundAttr(S.Context, getPlaceholderAttrInfo()); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 740b472b0eb16..39c3aa2243338 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2268,7 +2268,8 @@ static bool BuiltinCountZeroBitsGeneric(Sema &S, CallExpr *TheCall) { } static bool CheckMaskedBuiltinArgs(Sema &S, Expr *MaskArg, Expr *PtrArg, - unsigned Pos, bool Vector = true) { + unsigned Pos, bool AllowConst, + bool AllowAS) { QualType MaskTy = MaskArg->getType(); if (!MaskTy->isExtVectorBoolType()) return S.Diag(MaskArg->getBeginLoc(), diag::err_builtin_invalid_arg_type) @@ -2276,11 +2277,38 @@ static bool CheckMaskedBuiltinArgs(Sema &S, Expr *MaskArg, Expr *PtrArg, << MaskTy; QualType PtrTy = PtrArg->getType(); - if (!PtrTy->isPointerType() || - (Vector && !PtrTy->getPointeeType()->isVectorType()) || - (!Vector && PtrTy->getPointeeType()->isVectorType())) + if (!PtrTy->isPointerType() || PtrTy->getPointeeType()->isVectorType()) return S.Diag(PtrArg->getExprLoc(), diag::err_vec_masked_load_store_ptr) - << Pos << (Vector ? 
"pointer to vector" : "scalar pointer"); + << Pos << "scalar pointer"; + + QualType PointeeTy = PtrTy->getPointeeType(); + if (PointeeTy.isVolatileQualified() || PointeeTy->isAtomicType() || + (!AllowConst && PointeeTy.isConstQualified()) || + (!AllowAS && PointeeTy.hasAddressSpace())) { + QualType Target = + S.Context.getPointerType(PointeeTy.getAtomicUnqualifiedType()); + return S.Diag(PtrArg->getExprLoc(), + diag::err_typecheck_convert_incompatible) + << PtrTy << Target << /*different qualifiers=*/5 + << /*qualifier difference=*/0 << /*parameter mismatch=*/3 << 2 + << PtrTy << Target; + } + return false; +} + +static bool ConvertMaskedBuiltinArgs(Sema &S, CallExpr *TheCall) { + bool TypeDependent = false; + for (unsigned Arg = 0, E = TheCall->getNumArgs(); Arg != E; ++Arg) { + ExprResult Converted = + S.DefaultFunctionArrayLvalueConversion(TheCall->getArg(Arg)); + if (Converted.isInvalid()) + return true; + TheCall->setArg(Arg, Converted.get()); + TypeDependent |= Converted.get()->isTypeDependent(); + } + + if (TypeDependent) + TheCall->setType(S.Context.DependentTy); return false; } @@ -2288,33 +2316,35 @@ static ExprResult BuiltinMaskedLoad(Sema &S, CallExpr *TheCall) { if (S.checkArgCountRange(TheCall, 2, 3)) return ExprError(); + if (ConvertMaskedBuiltinArgs(S, TheCall)) + return ExprError(); + Expr *MaskArg = TheCall->getArg(0); Expr *PtrArg = TheCall->getArg(1); - if (CheckMaskedBuiltinArgs(S, MaskArg, PtrArg, 2)) + if (TheCall->isTypeDependent()) + return TheCall; + + if (CheckMaskedBuiltinArgs(S, MaskArg, PtrArg, 2, /*AllowConst=*/true, + TheCall->getBuiltinCallee() == + Builtin::BI__builtin_masked_load)) return ExprError(); QualType MaskTy = MaskArg->getType(); QualType PtrTy = PtrArg->getType(); QualType PointeeTy = PtrTy->getPointeeType(); const VectorType *MaskVecTy = MaskTy->getAs(); - const VectorType *DataVecTy = PointeeTy->getAs(); + QualType RetTy = S.Context.getExtVectorType(PointeeTy.getUnqualifiedType(), + MaskVecTy->getNumElements()); if (TheCall->getNumArgs() == 3) { Expr *PassThruArg = TheCall->getArg(2); QualType PassThruTy = PassThruArg->getType(); - if (!S.Context.hasSameType(PassThruTy, PointeeTy)) + if (!S.Context.hasSameType(PassThruTy, RetTy)) return S.Diag(PtrArg->getExprLoc(), diag::err_vec_masked_load_store_ptr) - << /* third argument */ 3 << PointeeTy; + << /* third argument */ 3 << RetTy; } - if (MaskVecTy->getNumElements() != DataVecTy->getNumElements()) - return ExprError( - S.Diag(TheCall->getBeginLoc(), diag::err_vec_masked_load_store_size) - << S.getASTContext().BuiltinInfo.getQuotedName( - TheCall->getBuiltinCallee()) - << MaskTy << PointeeTy); - - TheCall->setType(PointeeTy); + TheCall->setType(RetTy); return TheCall; } @@ -2322,11 +2352,18 @@ static ExprResult BuiltinMaskedStore(Sema &S, CallExpr *TheCall) { if (S.checkArgCount(TheCall, 3)) return ExprError(); + if (ConvertMaskedBuiltinArgs(S, TheCall)) + return ExprError(); + Expr *MaskArg = TheCall->getArg(0); Expr *ValArg = TheCall->getArg(1); Expr *PtrArg = TheCall->getArg(2); + if (TheCall->isTypeDependent()) + return TheCall; - if (CheckMaskedBuiltinArgs(S, MaskArg, PtrArg, 3)) + if (CheckMaskedBuiltinArgs(S, MaskArg, PtrArg, 3, /*AllowConst=*/false, + TheCall->getBuiltinCallee() == + Builtin::BI__builtin_masked_store)) return ExprError(); QualType MaskTy = MaskArg->getType(); @@ -2339,18 +2376,10 @@ static ExprResult BuiltinMaskedStore(Sema &S, CallExpr *TheCall) { QualType PointeeTy = PtrTy->getPointeeType(); const VectorType *MaskVecTy = MaskTy->getAs(); - const VectorType *ValVecTy = 
ValTy->getAs(); - const VectorType *PtrVecTy = PointeeTy->getAs(); - - if (MaskVecTy->getNumElements() != ValVecTy->getNumElements() || - MaskVecTy->getNumElements() != PtrVecTy->getNumElements()) - return ExprError( - S.Diag(TheCall->getBeginLoc(), diag::err_vec_masked_load_store_size) - << S.getASTContext().BuiltinInfo.getQuotedName( - TheCall->getBuiltinCallee()) - << MaskTy << PointeeTy); - - if (!S.Context.hasSameType(ValTy, PointeeTy)) + QualType MemoryTy = S.Context.getExtVectorType(PointeeTy.getUnqualifiedType(), + MaskVecTy->getNumElements()); + if (!S.Context.hasSameType(ValTy.getUnqualifiedType(), + MemoryTy.getUnqualifiedType())) return ExprError(S.Diag(TheCall->getBeginLoc(), diag::err_vec_builtin_incompatible_vector) << TheCall->getDirectCallee() << /*isMorethantwoArgs*/ 2 @@ -2365,10 +2394,17 @@ static ExprResult BuiltinMaskedGather(Sema &S, CallExpr *TheCall) { if (S.checkArgCountRange(TheCall, 3, 4)) return ExprError(); + if (ConvertMaskedBuiltinArgs(S, TheCall)) + return ExprError(); + Expr *MaskArg = TheCall->getArg(0); Expr *IdxArg = TheCall->getArg(1); Expr *PtrArg = TheCall->getArg(2); - if (CheckMaskedBuiltinArgs(S, MaskArg, PtrArg, 3, /*Vector=*/false)) + if (TheCall->isTypeDependent()) + return TheCall; + + if (CheckMaskedBuiltinArgs(S, MaskArg, PtrArg, 3, /*AllowConst=*/true, + /*AllowAS=*/true)) return ExprError(); QualType IdxTy = IdxArg->getType(); @@ -2389,8 +2425,8 @@ static ExprResult BuiltinMaskedGather(Sema &S, CallExpr *TheCall) { TheCall->getBuiltinCallee()) << MaskTy << IdxTy); - QualType RetTy = - S.Context.getExtVectorType(PointeeTy, MaskVecTy->getNumElements()); + QualType RetTy = S.Context.getExtVectorType(PointeeTy.getUnqualifiedType(), + MaskVecTy->getNumElements()); if (TheCall->getNumArgs() == 4) { Expr *PassThruArg = TheCall->getArg(3); QualType PassThruTy = PassThruArg->getType(); @@ -2408,12 +2444,18 @@ static ExprResult BuiltinMaskedScatter(Sema &S, CallExpr *TheCall) { if (S.checkArgCount(TheCall, 4)) return ExprError(); + if (ConvertMaskedBuiltinArgs(S, TheCall)) + return ExprError(); + Expr *MaskArg = TheCall->getArg(0); Expr *IdxArg = TheCall->getArg(1); Expr *ValArg = TheCall->getArg(2); Expr *PtrArg = TheCall->getArg(3); + if (TheCall->isTypeDependent()) + return TheCall; - if (CheckMaskedBuiltinArgs(S, MaskArg, PtrArg, 3, /*Vector=*/false)) + if (CheckMaskedBuiltinArgs(S, MaskArg, PtrArg, 4, /*AllowConst=*/false, + /*AllowAS=*/true)) return ExprError(); QualType IdxTy = IdxArg->getType(); @@ -2443,9 +2485,9 @@ static ExprResult BuiltinMaskedScatter(Sema &S, CallExpr *TheCall) { TheCall->getBuiltinCallee()) << MaskTy << ValTy); - QualType ArgTy = - S.Context.getExtVectorType(PointeeTy, MaskVecTy->getNumElements()); - if (!S.Context.hasSameType(ValTy, ArgTy)) + QualType ArgTy = S.Context.getExtVectorType(PointeeTy.getUnqualifiedType(), + MaskVecTy->getNumElements()); + if (!S.Context.hasSameType(ValTy.getUnqualifiedType(), ArgTy)) return ExprError(S.Diag(TheCall->getBeginLoc(), diag::err_vec_builtin_incompatible_vector) << TheCall->getDirectCallee() << /*isMoreThanTwoArgs*/ 2 @@ -3855,6 +3897,8 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, // If the call requires a streaming-mode change and has scalable vector // arguments or return values, then warn the user that the streaming and // non-streaming vector lengths may be different. + // When both streaming and non-streaming vector lengths are defined and + // mismatched, produce an error. 
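To make the new SME diagnostic concrete, a hypothetical triggering case (sketch): an AArch64 target with SVE/SME enabled where the non-streaming and streaming vector lengths are both pinned but differ (e.g. vscale settings giving VL=256 and SVL=128). The exact driver spellings for those settings are an assumption here; the point is only the shape of the call.

#include <arm_sve.h>

svint32_t callee(svint32_t v) __arm_streaming;  // streaming callee

svint32_t caller(svint32_t v) {                 // non-streaming caller
  // A scalable argument and return value cross the streaming boundary; with
  // mismatched VL/SVL this now gets err_sme_streaming_transition_vl_mismatch
  // instead of the old blanket warning.
  return callee(v);
}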
const auto *CallerFD = dyn_cast(CurContext); if (CallerFD && (!FD || !FD->getBuiltinID()) && (IsScalableArg || IsScalableRet)) { @@ -3867,12 +3911,30 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, if (!IsCalleeStreamingCompatible && (CallerFnType == SemaARM::ArmStreamingCompatible || ((CallerFnType == SemaARM::ArmStreaming) ^ IsCalleeStreaming))) { + const LangOptions &LO = getLangOpts(); + unsigned VL = LO.VScaleMin * 128; + unsigned SVL = LO.VScaleStreamingMin * 128; + bool IsVLMismatch = VL && SVL && VL != SVL; + + auto EmitDiag = [&](bool IsArg) { + if (IsVLMismatch) { + if (CallerFnType == SemaARM::ArmStreamingCompatible) + // Emit warning for streaming-compatible callers + Diag(Loc, diag::warn_sme_streaming_compatible_vl_mismatch) + << IsArg << IsCalleeStreaming << SVL << VL; + else + // Emit error otherwise + Diag(Loc, diag::err_sme_streaming_transition_vl_mismatch) + << IsArg << SVL << VL; + } else + Diag(Loc, diag::warn_sme_streaming_pass_return_vl_to_non_streaming) + << IsArg; + }; + if (IsScalableArg) - Diag(Loc, diag::warn_sme_streaming_pass_return_vl_to_non_streaming) - << /*IsArg=*/true; + EmitDiag(true); if (IsScalableRet) - Diag(Loc, diag::warn_sme_streaming_pass_return_vl_to_non_streaming) - << /*IsArg=*/false; + EmitDiag(false); } } @@ -6862,11 +6924,12 @@ StringRef Sema::GetFormatStringTypeName(FormatStringType FST) { FormatStringType Sema::GetFormatStringType(StringRef Flavor) { return llvm::StringSwitch(Flavor) - .Case("scanf", FormatStringType::Scanf) - .Cases("printf", "printf0", "syslog", FormatStringType::Printf) + .Cases("gnu_scanf", "scanf", FormatStringType::Scanf) + .Cases("gnu_printf", "printf", "printf0", "syslog", + FormatStringType::Printf) .Cases("NSString", "CFString", FormatStringType::NSString) - .Case("strftime", FormatStringType::Strftime) - .Case("strfmon", FormatStringType::Strfmon) + .Cases("gnu_strftime", "strftime", FormatStringType::Strftime) + .Cases("gnu_strfmon", "strfmon", FormatStringType::Strfmon) .Cases("kprintf", "cmn_err", "vcmn_err", "zcmn_err", FormatStringType::Kprintf) .Case("freebsd_kprintf", FormatStringType::FreeBSDKPrintf) @@ -6986,7 +7049,6 @@ bool Sema::CheckFormatArguments(ArrayRef Args, case FormatStringType::Kprintf: case FormatStringType::FreeBSDKPrintf: case FormatStringType::Printf: - case FormatStringType::Syslog: Diag(FormatLoc, diag::note_format_security_fixit) << FixItHint::CreateInsertion(FormatLoc, "\"%s\", "); break; @@ -7653,6 +7715,14 @@ void CheckPrintfHandler::handleInvalidMaskType(StringRef MaskType) { S.Diag(getLocationOfByte(MaskType.data()), diag::err_invalid_mask_type_size); } +// Error out if struct or complex type argments are passed to os_log. +static bool isInvalidOSLogArgTypeForCodeGen(FormatStringType FSType, + QualType T) { + if (FSType != FormatStringType::OSLog) + return false; + return T->isRecordType() || T->isComplexType(); +} + bool CheckPrintfHandler::HandleAmount( const analyze_format_string::OptionalAmount &Amt, unsigned k, const char *startSpecifier, unsigned specifierLen) { @@ -7685,11 +7755,14 @@ bool CheckPrintfHandler::HandleAmount( assert(AT.isValid()); if (!AT.matchesType(S.Context, T)) { - EmitFormatDiagnostic(S.PDiag(diag::warn_printf_asterisk_wrong_type) - << k << AT.getRepresentativeTypeName(S.Context) - << T << Arg->getSourceRange(), + unsigned DiagID = isInvalidOSLogArgTypeForCodeGen(FSType, T) + ? 
diag::err_printf_asterisk_wrong_type + : diag::warn_printf_asterisk_wrong_type; + EmitFormatDiagnostic(S.PDiag(DiagID) + << k << AT.getRepresentativeTypeName(S.Context) + << T << Arg->getSourceRange(), getLocationOfByte(Amt.getStart()), - /*IsStringLocation*/true, + /*IsStringLocation*/ true, getSpecifierRange(startSpecifier, specifierLen)); // Don't do any more checking. We will just emit // spurious errors. @@ -8744,7 +8817,9 @@ CheckPrintfHandler::checkFormatExpr(const analyze_printf::PrintfSpecifier &FS, Diag = diag::warn_format_conversion_argument_type_mismatch_confusion; break; case ArgType::NoMatch: - Diag = diag::warn_format_conversion_argument_type_mismatch; + Diag = isInvalidOSLogArgTypeForCodeGen(FSType, ExprTy) + ? diag::err_format_conversion_argument_type_mismatch + : diag::warn_format_conversion_argument_type_mismatch; break; } @@ -9103,8 +9178,7 @@ static void CheckFormatString( if (Type == FormatStringType::Printf || Type == FormatStringType::NSString || Type == FormatStringType::Kprintf || Type == FormatStringType::FreeBSDKPrintf || - Type == FormatStringType::OSLog || Type == FormatStringType::OSTrace || - Type == FormatStringType::Syslog) { + Type == FormatStringType::OSLog || Type == FormatStringType::OSTrace) { bool IsObjC = Type == FormatStringType::NSString || Type == FormatStringType::OSTrace; if (ReferenceFormatString == nullptr) { @@ -9140,8 +9214,7 @@ bool Sema::CheckFormatStringsCompatible( if (Type != FormatStringType::Printf && Type != FormatStringType::NSString && Type != FormatStringType::Kprintf && Type != FormatStringType::FreeBSDKPrintf && - Type != FormatStringType::OSLog && Type != FormatStringType::OSTrace && - Type != FormatStringType::Syslog) + Type != FormatStringType::OSLog && Type != FormatStringType::OSTrace) return true; bool IsObjC = @@ -9175,8 +9248,7 @@ bool Sema::ValidateFormatString(FormatStringType Type, if (Type != FormatStringType::Printf && Type != FormatStringType::NSString && Type != FormatStringType::Kprintf && Type != FormatStringType::FreeBSDKPrintf && - Type != FormatStringType::OSLog && Type != FormatStringType::OSTrace && - Type != FormatStringType::Syslog) + Type != FormatStringType::OSLog && Type != FormatStringType::OSTrace) return true; FormatStringLiteral RefLit = Str; @@ -13010,7 +13082,19 @@ static void AnalyzeImplicitConversions( // Skip past explicit casts. if (auto *CE = dyn_cast(E)) { - E = CE->getSubExpr()->IgnoreParenImpCasts(); + E = CE->getSubExpr(); + // In the special case of a C++ function-style cast with braces, + // CXXFunctionalCastExpr has an InitListExpr as direct child with a single + // initializer. This InitListExpr basically belongs to the cast itself, so + // we skip it too. 
Specifically this is needed to silence -Wdouble-promotion + if (isa(CE)) { + if (auto *InitListE = dyn_cast(E)) { + if (InitListE->getNumInits() == 1) { + E = InitListE->getInit(0); + } + } + } + E = E->IgnoreParenImpCasts(); if (!CE->getType()->isVoidType() && E->getType()->isAtomicType()) S.Diag(E->getBeginLoc(), diag::warn_atomic_implicit_seq_cst); WorkList.push_back({E, CC, IsListInit}); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index e10511cc7fc4e..9ef7a2698913d 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -6794,7 +6794,9 @@ bool Sema::tryToFixVariablyModifiedVarType(TypeSourceInfo *&TInfo, if (SizeIsNegative) Diag(Loc, diag::err_typecheck_negative_array_size); else if (Oversized.getBoolValue()) - Diag(Loc, diag::err_array_too_large) << toString(Oversized, 10); + Diag(Loc, diag::err_array_too_large) << toString( + Oversized, 10, Oversized.isSigned(), /*formatAsCLiteral=*/false, + /*UpperCase=*/false, /*InsertSeparators=*/true); else if (FailedFoldDiagID) Diag(Loc, FailedFoldDiagID); return false; diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index b876911384f6f..a8dfa4d7df2d5 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -3629,10 +3629,11 @@ static FormatAttrKind getFormatAttrKind(StringRef Format) { // Check for formats that get handled specially. .Case("NSString", NSStringFormat) .Case("CFString", CFStringFormat) - .Case("strftime", StrftimeFormat) + .Cases("gnu_strftime", "strftime", StrftimeFormat) // Otherwise, check for supported formats. - .Cases("scanf", "printf", "printf0", "strfmon", SupportedFormat) + .Cases("gnu_scanf", "scanf", "gnu_printf", "printf", "printf0", + "gnu_strfmon", "strfmon", SupportedFormat) .Cases("cmn_err", "vcmn_err", "zcmn_err", SupportedFormat) .Cases("kprintf", "syslog", SupportedFormat) // OpenBSD. .Case("freebsd_kprintf", SupportedFormat) // FreeBSD. diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index ea08f41437e70..1131e1f033b72 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -768,58 +768,44 @@ Sema::ActOnDecompositionDeclarator(Scope *S, Declarator &D, // C++23 [dcl.pre]/6: // Each decl-specifier in the decl-specifier-seq shall be static, // thread_local, auto (9.2.9.6 [dcl.spec.auto]), or a cv-qualifier. + // C++23 [dcl.pre]/7: + // Each decl-specifier in the decl-specifier-seq shall be constexpr, + // constinit, static, thread_local, auto, or a cv-qualifier auto &DS = D.getDeclSpec(); - { - // Note: While constrained-auto needs to be checked, we do so separately so - // we can emit a better diagnostic. 
- SmallVector BadSpecifiers; - SmallVector BadSpecifierLocs; - SmallVector CPlusPlus20Specifiers; - SmallVector CPlusPlus20SpecifierLocs; - if (auto SCS = DS.getStorageClassSpec()) { - if (SCS == DeclSpec::SCS_static) { - CPlusPlus20Specifiers.push_back(DeclSpec::getSpecifierName(SCS)); - CPlusPlus20SpecifierLocs.push_back(DS.getStorageClassSpecLoc()); - } else { - BadSpecifiers.push_back(DeclSpec::getSpecifierName(SCS)); - BadSpecifierLocs.push_back(DS.getStorageClassSpecLoc()); - } - } - if (auto TSCS = DS.getThreadStorageClassSpec()) { - CPlusPlus20Specifiers.push_back(DeclSpec::getSpecifierName(TSCS)); - CPlusPlus20SpecifierLocs.push_back(DS.getThreadStorageClassSpecLoc()); - } - if (DS.hasConstexprSpecifier()) { - BadSpecifiers.push_back( - DeclSpec::getSpecifierName(DS.getConstexprSpecifier())); - BadSpecifierLocs.push_back(DS.getConstexprSpecLoc()); - } - if (DS.isInlineSpecified()) { - BadSpecifiers.push_back("inline"); - BadSpecifierLocs.push_back(DS.getInlineSpecLoc()); - } - - if (!BadSpecifiers.empty()) { - auto &&Err = Diag(BadSpecifierLocs.front(), diag::err_decomp_decl_spec); - Err << (int)BadSpecifiers.size() - << llvm::join(BadSpecifiers.begin(), BadSpecifiers.end(), " "); - // Don't add FixItHints to remove the specifiers; we do still respect - // them when building the underlying variable. - for (auto Loc : BadSpecifierLocs) - Err << SourceRange(Loc, Loc); - } else if (!CPlusPlus20Specifiers.empty()) { - auto &&Warn = DiagCompat(CPlusPlus20SpecifierLocs.front(), - diag_compat::decomp_decl_spec); - Warn << (int)CPlusPlus20Specifiers.size() - << llvm::join(CPlusPlus20Specifiers.begin(), - CPlusPlus20Specifiers.end(), " "); - for (auto Loc : CPlusPlus20SpecifierLocs) - Warn << SourceRange(Loc, Loc); - } - // We can't recover from it being declared as a typedef. - if (DS.getStorageClassSpec() == DeclSpec::SCS_typedef) - return nullptr; + auto DiagBadSpecifier = [&](StringRef Name, SourceLocation Loc) { + Diag(Loc, diag::err_decomp_decl_spec) << Name; + }; + + auto DiagCpp20Specifier = [&](StringRef Name, SourceLocation Loc) { + DiagCompat(Loc, diag_compat::decomp_decl_spec) << Name; + }; + + if (auto SCS = DS.getStorageClassSpec()) { + if (SCS == DeclSpec::SCS_static) + DiagCpp20Specifier(DeclSpec::getSpecifierName(SCS), + DS.getStorageClassSpecLoc()); + else + DiagBadSpecifier(DeclSpec::getSpecifierName(SCS), + DS.getStorageClassSpecLoc()); } + if (auto TSCS = DS.getThreadStorageClassSpec()) + DiagCpp20Specifier(DeclSpec::getSpecifierName(TSCS), + DS.getThreadStorageClassSpecLoc()); + + if (DS.isInlineSpecified()) + DiagBadSpecifier("inline", DS.getInlineSpecLoc()); + + if (ConstexprSpecKind ConstexprSpec = DS.getConstexprSpecifier(); + ConstexprSpec != ConstexprSpecKind::Unspecified) { + if (ConstexprSpec == ConstexprSpecKind::Consteval || + !getLangOpts().CPlusPlus26) + DiagBadSpecifier(DeclSpec::getSpecifierName(ConstexprSpec), + DS.getConstexprSpecLoc()); + } + + // We can't recover from it being declared as a typedef. 
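A small sketch of the specifier combinations the rewritten check above accepts and rejects for structured bindings (language-version gating per this hunk; not taken from the patch's tests):

struct Pair { int x, y; };
constexpr Pair P{1, 2};

void use() {
  static auto [a, b] = P;        // OK; compat warning before C++20
  thread_local auto [c, d] = P;  // same C++20 compat handling
  // inline auto [e, f] = P;     // always rejected
}

constexpr auto [g, h] = P;       // accepted only in C++26 mode per this change
// consteval auto [i, j] = P;    // consteval is always rejected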
+ if (DS.getStorageClassSpec() == DeclSpec::SCS_typedef) + return nullptr; // C++2a [dcl.struct.bind]p1: // A cv that includes volatile is deprecated diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp index 552c92996dc2e..a0483c3027199 100644 --- a/clang/lib/Sema/SemaExceptionSpec.cpp +++ b/clang/lib/Sema/SemaExceptionSpec.cpp @@ -1493,6 +1493,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) { case Stmt::OMPUnrollDirectiveClass: case Stmt::OMPReverseDirectiveClass: case Stmt::OMPInterchangeDirectiveClass: + case Stmt::OMPFuseDirectiveClass: case Stmt::OMPSingleDirectiveClass: case Stmt::OMPTargetDataDirectiveClass: case Stmt::OMPTargetDirectiveClass: diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 293097fd708fb..779ccf5f1e888 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -2395,7 +2395,10 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, if (ActiveSizeBits > ConstantArrayType::getMaxSizeBits(Context)) return ExprError( Diag((*ArraySize)->getBeginLoc(), diag::err_array_too_large) - << toString(*Value, 10) << (*ArraySize)->getSourceRange()); + << toString(*Value, 10, Value->isSigned(), + /*formatAsCLiteral=*/false, /*UpperCase=*/false, + /*InsertSeparators=*/true) + << (*ArraySize)->getSourceRange()); } KnownArraySize = Value->getZExtValue(); diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 305691dcf4f6c..940d510b4cc02 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -1369,7 +1369,8 @@ bool SemaHLSL::handleRootSignatureElements( "Number of unbound elements must match the number of clauses"); bool HasAnySampler = false; bool HasAnyNonSampler = false; - uint32_t Offset = 0; + uint64_t Offset = 0; + bool IsPrevUnbound = false; for (const auto &[Clause, ClauseElem] : UnboundClauses) { SourceLocation Loc = ClauseElem->getLocation(); if (Clause->Type == llvm::dxil::ResourceClass::Sampler) @@ -1386,32 +1387,28 @@ bool SemaHLSL::handleRootSignatureElements( if (Clause->NumDescriptors == 0) return true; - if (Clause->Offset != - llvm::hlsl::rootsig::DescriptorTableOffsetAppend) { - // Manually specified the offset + bool IsAppending = + Clause->Offset == llvm::hlsl::rootsig::DescriptorTableOffsetAppend; + if (!IsAppending) Offset = Clause->Offset; - } uint64_t RangeBound = llvm::hlsl::rootsig::computeRangeBound( Offset, Clause->NumDescriptors); - if (!llvm::hlsl::rootsig::verifyBoundOffset(Offset)) { - // Trying to append onto unbound offset + if (IsPrevUnbound && IsAppending) Diag(Loc, diag::err_hlsl_appending_onto_unbound); - } else if (!llvm::hlsl::rootsig::verifyNoOverflowedOffset(RangeBound)) { - // Upper bound overflows maximum offset + else if (!llvm::hlsl::rootsig::verifyNoOverflowedOffset(RangeBound)) Diag(Loc, diag::err_hlsl_offset_overflow) << Offset << RangeBound; - } - Offset = RangeBound == llvm::hlsl::rootsig::NumDescriptorsUnbounded - ? uint32_t(RangeBound) - : uint32_t(RangeBound + 1); + // Update offset to be 1 past this range's bound + Offset = RangeBound + 1; + IsPrevUnbound = Clause->NumDescriptors == + llvm::hlsl::rootsig::NumDescriptorsUnbounded; // Compute the register bounds and track resource binding uint32_t LowerBound(Clause->Reg.Number); - uint32_t UpperBound = Clause->NumDescriptors == ~0u - ? 
~0u - : LowerBound + Clause->NumDescriptors - 1; + uint32_t UpperBound = llvm::hlsl::rootsig::computeRangeBound( + LowerBound, Clause->NumDescriptors); BindingChecker.trackBinding( Table->Visibility, @@ -3093,7 +3090,8 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { return true; break; } - case Builtin::BI__builtin_hlsl_elementwise_isinf: { + case Builtin::BI__builtin_hlsl_elementwise_isinf: + case Builtin::BI__builtin_hlsl_elementwise_isnan: { if (SemaRef.checkArgCount(TheCall, 1)) return true; if (CheckAllArgTypesAreCorrect(&SemaRef, TheCall, @@ -3294,7 +3292,6 @@ static void BuildFlattenedTypeList(QualType BaseTy, while (!WorkList.empty()) { QualType T = WorkList.pop_back_val(); T = T.getCanonicalType().getUnqualifiedType(); - assert(!isa(T) && "Matrix types not yet supported in HLSL"); if (const auto *AT = dyn_cast(T)) { llvm::SmallVector ElementFields; // Generally I've avoided recursion in this algorithm, but arrays of @@ -3326,7 +3323,8 @@ static void BuildFlattenedTypeList(QualType BaseTy, llvm::SmallVector FieldTypes; for (const auto *FD : RD->fields()) - FieldTypes.push_back(FD->getType()); + if (!FD->isUnnamedBitField()) + FieldTypes.push_back(FD->getType()); // Reverse the newly added sub-range. std::reverse(FieldTypes.begin(), FieldTypes.end()); llvm::append_range(WorkList, FieldTypes); @@ -4161,6 +4159,8 @@ class InitListTransformer { while (!RecordDecls.empty()) { CXXRecordDecl *RD = RecordDecls.pop_back_val(); for (auto *FD : RD->fields()) { + if (FD->isUnnamedBitField()) + continue; DeclAccessPair Found = DeclAccessPair::make(FD, FD->getAccess()); DeclarationNameInfo NameInfo(FD->getDeclName(), E->getBeginLoc()); ExprResult Res = S.BuildFieldReferenceExpr( @@ -4210,7 +4210,8 @@ class InitListTransformer { while (!RecordDecls.empty()) { CXXRecordDecl *RD = RecordDecls.pop_back_val(); for (auto *FD : RD->fields()) - Inits.push_back(generateInitListsImpl(FD->getType())); + if (!FD->isUnnamedBitField()) + Inits.push_back(generateInitListsImpl(FD->getType())); } } auto *NewInit = new (Ctx) InitListExpr(Ctx, Inits.front()->getBeginLoc(), @@ -4283,6 +4284,9 @@ bool SemaHLSL::transformInitList(const InitializedEntity &Entity, } size_t ExpectedSize = ILT.DestTypes.size(); size_t ActualSize = ILT.ArgExprs.size(); + if (ExpectedSize == 0 && ActualSize == 0) + return true; + // For incomplete arrays it is completely arbitrary to choose whether we think // the user intended fewer or more elements. This implementation assumes that // the user intended more, and errors that there are too few initializers to diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index fbd8022cd68ba..a64f207ca0231 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -2758,16 +2758,18 @@ ExprResult FinishValueInit(Sema &S, InitializedEntity &Entity, } // namespace OpenACCPrivateRecipe SemaOpenACC::CreatePrivateInitRecipe(const Expr *VarExpr) { - VarExpr = StripOffBounds(VarExpr); - + // We don't strip bounds here, so that we are doing our recipe init at the + // 'lowest' possible level. Codegen is going to have to do its own 'looping'. if (!VarExpr || VarExpr->getType()->isDependentType()) return OpenACCPrivateRecipe::Empty(); QualType VarTy = VarExpr->getType().getNonReferenceType().getUnqualifiedType(); - // TODO: OpenACC: for arrays/bounds versions, we're going to have to do a - // different initializer, but for now we can go ahead with this. + // Array sections are special, and we have to treat them that way. 
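For context on the array-section path added here, a sketch of the kind of clause argument it handles; the recipe's alloca is then built from the base array type rather than the section (illustrative only):

void scale(float *out, int n) {
  float scratch[256];
  // 'scratch[0:n]' is an ArraySectionExpr; getBaseOriginalType() yields
  // float[256], which is what the private recipe allocates.
  #pragma acc parallel private(scratch[0:n])
  {
    for (int i = 0; i < n; ++i)
      out[i] += scratch[i];
  }
}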
+ if (const auto *ASE = + dyn_cast(VarExpr->IgnoreParenImpCasts())) + VarTy = ArraySectionExpr::getBaseOriginalType(ASE); VarDecl *AllocaDecl = CreateAllocaDecl( getASTContext(), SemaRef.getCurContext(), VarExpr->getBeginLoc(), @@ -2780,11 +2782,19 @@ OpenACCPrivateRecipe SemaOpenACC::CreatePrivateInitRecipe(const Expr *VarExpr) { InitializationSequence InitSeq(SemaRef.SemaRef, Entity, Kind, {}); ExprResult Init = InitSeq.Perform(SemaRef.SemaRef, Entity, Kind, {}); + // For 'no bounds' version, we can use this as a shortcut, so set the init + // anyway. + if (Init.isUsable()) { + AllocaDecl->setInit(Init.get()); + AllocaDecl->setInitStyle(VarDecl::CallInit); + } + return OpenACCPrivateRecipe(AllocaDecl, Init.get()); } OpenACCFirstPrivateRecipe SemaOpenACC::CreateFirstPrivateInitRecipe(const Expr *VarExpr) { + // TODO: OpenACC: This shouldn't be necessary, see PrivateInitRecipe VarExpr = StripOffBounds(VarExpr); if (!VarExpr || VarExpr->getType()->isDependentType()) @@ -2869,6 +2879,7 @@ SemaOpenACC::CreateFirstPrivateInitRecipe(const Expr *VarExpr) { } OpenACCReductionRecipe SemaOpenACC::CreateReductionInitRecipe( OpenACCReductionOperator ReductionOperator, const Expr *VarExpr) { + // TODO: OpenACC: This shouldn't be necessary, see PrivateInitRecipe VarExpr = StripOffBounds(VarExpr); if (!VarExpr || VarExpr->getType()->isDependentType()) diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 48e06d1dc7579..f5feed6206494 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -4569,6 +4569,7 @@ void SemaOpenMP::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, case OMPD_unroll: case OMPD_reverse: case OMPD_interchange: + case OMPD_fuse: case OMPD_assume: break; default: @@ -6410,6 +6411,10 @@ StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( Res = ActOnOpenMPInterchangeDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); break; + case OMPD_fuse: + Res = + ActOnOpenMPFuseDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); + break; case OMPD_for: Res = ActOnOpenMPForDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); @@ -9488,7 +9493,9 @@ static bool checkOpenMPIterationSpace( // sharing attributes. VarsWithImplicitDSA.erase(LCDecl); - assert(isOpenMPLoopDirective(DKind) && "DSA for non-loop vars"); + assert((isOpenMPLoopDirective(DKind) || + isOpenMPCanonicalLoopSequenceTransformationDirective(DKind)) && + "DSA for non-loop vars"); // Check test-expr. HasErrors |= ISC.checkAndSetCond(For ? 
For->getCond() : CXXFor->getCond()); @@ -9916,7 +9923,8 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, unsigned NumLoops = std::max(OrderedLoopCount, NestedLoopCount); SmallVector IterSpaces(NumLoops); if (!OMPLoopBasedDirective::doForAllLoops( - AStmt->IgnoreContainers(!isOpenMPLoopTransformationDirective(DKind)), + AStmt->IgnoreContainers( + !isOpenMPCanonicalLoopNestTransformationDirective(DKind)), SupportsNonPerfectlyNested, NumLoops, [DKind, &SemaRef, &DSA, NumLoops, NestedLoopCount, CollapseLoopCountExpr, OrderedLoopCountExpr, &VarsWithImplicitDSA, @@ -9938,8 +9946,7 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, } return false; }, - [&SemaRef, - &Captures](OMPCanonicalLoopNestTransformationDirective *Transform) { + [&SemaRef, &Captures](OMPLoopTransformationDirective *Transform) { Stmt *DependentPreInits = Transform->getPreInits(); if (!DependentPreInits) return; @@ -9954,7 +9961,8 @@ checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, auto *D = cast(C); DeclRefExpr *Ref = buildDeclRefExpr( SemaRef, D, D->getType().getNonReferenceType(), - Transform->getBeginLoc()); + cast(Transform->getDirective()) + ->getBeginLoc()); Captures[Ref] = Ref; } } @@ -14404,10 +14412,34 @@ StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeSimdDirective( getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } +/// Updates OriginalInits by checking Transform against loop transformation +/// directives and appending their pre-inits if a match is found. +static void updatePreInits(OMPLoopTransformationDirective *Transform, + SmallVectorImpl &PreInits) { + Stmt *Dir = Transform->getDirective(); + switch (Dir->getStmtClass()) { +#define STMT(CLASS, PARENT) +#define ABSTRACT_STMT(CLASS) +#define COMMON_OMP_LOOP_TRANSFORMATION(CLASS, PARENT) \ + case Stmt::CLASS##Class: \ + appendFlattenedStmtList(PreInits, \ + static_cast(Dir)->getPreInits()); \ + break; +#define OMPCANONICALLOOPNESTTRANSFORMATIONDIRECTIVE(CLASS, PARENT) \ + COMMON_OMP_LOOP_TRANSFORMATION(CLASS, PARENT) +#define OMPCANONICALLOOPSEQUENCETRANSFORMATIONDIRECTIVE(CLASS, PARENT) \ + COMMON_OMP_LOOP_TRANSFORMATION(CLASS, PARENT) +#include "clang/AST/StmtNodes.inc" +#undef COMMON_OMP_LOOP_TRANSFORMATION + default: + llvm_unreachable("Not a loop transformation"); + } +} + bool SemaOpenMP::checkTransformableLoopNest( OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops, SmallVectorImpl &LoopHelpers, - Stmt *&Body, SmallVectorImpl> &OriginalInits) { + Stmt *&Body, SmallVectorImpl> &OriginalInits) { OriginalInits.emplace_back(); bool Result = OMPLoopBasedDirective::doForAllLoops( AStmt->IgnoreContainers(), /*TryImperfectlyNestedLoops=*/false, NumLoops, @@ -14433,29 +14465,268 @@ bool SemaOpenMP::checkTransformableLoopNest( OriginalInits.emplace_back(); return false; }, - [&OriginalInits](OMPLoopBasedDirective *Transform) { - Stmt *DependentPreInits; - if (auto *Dir = dyn_cast(Transform)) - DependentPreInits = Dir->getPreInits(); - else if (auto *Dir = dyn_cast(Transform)) - DependentPreInits = Dir->getPreInits(); - else if (auto *Dir = dyn_cast(Transform)) - DependentPreInits = Dir->getPreInits(); - else if (auto *Dir = dyn_cast(Transform)) - DependentPreInits = Dir->getPreInits(); - else if (auto *Dir = dyn_cast(Transform)) - DependentPreInits = Dir->getPreInits(); - else - llvm_unreachable("Unhandled loop transformation"); - - appendFlattenedStmtList(OriginalInits.back(), DependentPreInits); + [&OriginalInits](OMPLoopTransformationDirective *Transform) { + 
updatePreInits(Transform, OriginalInits.back()); }); assert(OriginalInits.back().empty() && "No preinit after innermost loop"); OriginalInits.pop_back(); return Result; } -/// Add preinit statements that need to be propageted from the selected loop. +/// Counts the total number of OpenMP canonical nested loops, including the +/// outermost loop (the original loop). PRECONDITION of this visitor is that it +/// must be invoked from the original loop to be analyzed. The traversal stops +/// for Decl's and Expr's given that they may contain inner loops that must not +/// be counted. +/// +/// Example AST structure for the code: +/// +/// int main() { +/// #pragma omp fuse +/// { +/// for (int i = 0; i < 100; i++) { <-- Outer loop +/// []() { +/// for(int j = 0; j < 100; j++) {} <-- NOT A LOOP (1) +/// }; +/// for(int j = 0; j < 5; ++j) {} <-- Inner loop +/// } +/// for (int r = 0; i < 100; i++) { <-- Outer loop +/// struct LocalClass { +/// void bar() { +/// for(int j = 0; j < 100; j++) {} <-- NOT A LOOP (2) +/// } +/// }; +/// for(int k = 0; k < 10; ++k) {} <-- Inner loop +/// {x = 5; for(k = 0; k < 10; ++k) x += k; x}; <-- NOT A LOOP (3) +/// } +/// } +/// } +/// (1) because in a different function (here: a lambda) +/// (2) because in a different function (here: class method) +/// (3) because considered to be intervening-code of non-perfectly nested loop +/// Result: Loop 'i' contains 2 loops, Loop 'r' also contains 2 loops. +class NestedLoopCounterVisitor final : public DynamicRecursiveASTVisitor { +private: + unsigned NestedLoopCount = 0; + +public: + explicit NestedLoopCounterVisitor() = default; + + unsigned getNestedLoopCount() const { return NestedLoopCount; } + + bool VisitForStmt(ForStmt *FS) override { + ++NestedLoopCount; + return true; + } + + bool VisitCXXForRangeStmt(CXXForRangeStmt *FRS) override { + ++NestedLoopCount; + return true; + } + + bool TraverseStmt(Stmt *S) override { + if (!S) + return true; + + // Skip traversal of all expressions, including special cases like + // LambdaExpr, StmtExpr, BlockExpr, and RequiresExpr. These expressions + // may contain inner statements (and even loops), but they are not part + // of the syntactic body of the surrounding loop structure. + // Therefore must not be counted. + if (isa(S)) + return true; + + // Only recurse into CompoundStmt (block {}) and loop bodies. + if (isa(S)) { + return DynamicRecursiveASTVisitor::TraverseStmt(S); + } + + // Stop traversal of the rest of statements, that break perfect + // loop nesting, such as control flow (IfStmt, SwitchStmt...). + return true; + } + + bool TraverseDecl(Decl *D) override { + // Stop in the case of finding a declaration, it is not important + // in order to find nested loops (Possible CXXRecordDecl, RecordDecl, + // FunctionDecl...). + return true; + } +}; + +bool SemaOpenMP::analyzeLoopSequence(Stmt *LoopSeqStmt, + LoopSequenceAnalysis &SeqAnalysis, + ASTContext &Context, + OpenMPDirectiveKind Kind) { + VarsWithInheritedDSAType TmpDSA; + // Helper Lambda to handle storing initialization and body statements for + // both ForStmt and CXXForRangeStmt. 
+ auto StoreLoopStatements = [](LoopAnalysis &Analysis, Stmt *LoopStmt) { + if (auto *For = dyn_cast(LoopStmt)) { + Analysis.OriginalInits.push_back(For->getInit()); + Analysis.TheForStmt = For; + } else { + auto *CXXFor = cast(LoopStmt); + Analysis.OriginalInits.push_back(CXXFor->getBeginStmt()); + Analysis.TheForStmt = CXXFor; + } + }; + + // Helper lambda functions to encapsulate the processing of different + // derivations of the canonical loop sequence grammar. + // Modularized code for handling loop generation and transformations. + auto AnalyzeLoopGeneration = [&](Stmt *Child) { + auto *LoopTransform = cast(Child); + Stmt *TransformedStmt = LoopTransform->getTransformedStmt(); + unsigned NumGeneratedTopLevelLoops = + LoopTransform->getNumGeneratedTopLevelLoops(); + // Handle the case where the transformed statement is not available due to + // dependent contexts. + if (!TransformedStmt) { + if (NumGeneratedTopLevelLoops > 0) { + SeqAnalysis.LoopSeqSize += NumGeneratedTopLevelLoops; + return true; + } + // Unroll full (0 loops produced) + Diag(Child->getBeginLoc(), diag::err_omp_not_for) + << 0 << getOpenMPDirectiveName(Kind); + return false; + } + // Handle loop transformations with multiple loop nests + // Unroll full + if (!NumGeneratedTopLevelLoops) { + Diag(Child->getBeginLoc(), diag::err_omp_not_for) + << 0 << getOpenMPDirectiveName(Kind); + return false; + } + // Loop transformations such as split or loopranged fuse + if (NumGeneratedTopLevelLoops > 1) { + // Get the preinits related to this loop-sequence-generating + // loop transformation (i.e. loopranged fuse, split...). + // These preinits differ slightly from regular inits/pre-inits related + // to single-loop-generating loop transformations (interchange, unroll), + // given that they are not bound to a particular loop nest, + // so they need to be treated independently. + updatePreInits(LoopTransform, SeqAnalysis.LoopSequencePreInits); + return analyzeLoopSequence(TransformedStmt, SeqAnalysis, Context, Kind); + } + // Vast majority: (Tile, Unroll, Stripe, Reverse, Interchange, Fuse all) + // Process the transformed loop statement + LoopAnalysis &NewTransformedSingleLoop = + SeqAnalysis.Loops.emplace_back(Child); + unsigned IsCanonical = checkOpenMPLoop( + Kind, nullptr, nullptr, TransformedStmt, SemaRef, *DSAStack, TmpDSA, + NewTransformedSingleLoop.HelperExprs); + + if (!IsCanonical) + return false; + + StoreLoopStatements(NewTransformedSingleLoop, TransformedStmt); + updatePreInits(LoopTransform, NewTransformedSingleLoop.TransformsPreInits); + + SeqAnalysis.LoopSeqSize++; + return true; + }; + + // Modularized code for handling regular canonical loops. + auto AnalyzeRegularLoop = [&](Stmt *Child) { + LoopAnalysis &NewRegularLoop = SeqAnalysis.Loops.emplace_back(Child); + unsigned IsCanonical = + checkOpenMPLoop(Kind, nullptr, nullptr, Child, SemaRef, *DSAStack, + TmpDSA, NewRegularLoop.HelperExprs); + + if (!IsCanonical) + return false; + + StoreLoopStatements(NewRegularLoop, Child); + NestedLoopCounterVisitor NLCV; + NLCV.TraverseStmt(Child); + return true; + }; + + // High level grammar validation. + for (Stmt *Child : LoopSeqStmt->children()) { + if (!Child) + continue; + // Skip over non-loop-sequence statements. + if (!LoopSequenceAnalysis::isLoopSequenceDerivation(Child)) { + Child = Child->IgnoreContainers(); + // Ignore empty compound statement. + if (!Child) + continue; + // In the case of a nested loop sequence, ignoring containers would not + // be enough; a recursive traversal of the loop sequence is required.
+ if (isa(Child)) { + if (!analyzeLoopSequence(Child, SeqAnalysis, Context, Kind)) + return false; + // Already been handled; skip this child. + continue; + } + } + // Regular loop sequence handling. + if (LoopSequenceAnalysis::isLoopSequenceDerivation(Child)) { + if (LoopAnalysis::isLoopTransformation(Child)) { + if (!AnalyzeLoopGeneration(Child)) + return false; + // AnalyzeLoopGeneration updates SeqAnalysis.LoopSeqSize accordingly. + } else { + if (!AnalyzeRegularLoop(Child)) + return false; + SeqAnalysis.LoopSeqSize++; + } + } else { + // Report error for invalid statement inside canonical loop sequence. + Diag(Child->getBeginLoc(), diag::err_omp_not_for) + << 0 << getOpenMPDirectiveName(Kind); + return false; + } + } + return true; +} + +bool SemaOpenMP::checkTransformableLoopSequence( + OpenMPDirectiveKind Kind, Stmt *AStmt, LoopSequenceAnalysis &SeqAnalysis, + ASTContext &Context) { + // Following the OpenMP 6.0 API Specification, a Canonical Loop Sequence follows + // the grammar: + // + // canonical-loop-sequence: + // { + // loop-sequence+ + // } + // where loop-sequence can be any of the following: + // 1. canonical-loop-sequence + // 2. loop-nest + // 3. loop-sequence-generating-construct (i.e. OMPLoopTransformationDirective) + // + // To recognise and traverse this structure, the helper function + // analyzeLoopSequence serves as the recursive entry point + // and tries to match the input AST to the canonical loop sequence grammar + // structure. This function will perform both a semantic and a syntactic + // analysis of the given statement according to the OpenMP 6.0 definition of + // the aforementioned canonical loop sequence. + + // We expect an outer compound statement. + if (!isa(AStmt)) { + Diag(AStmt->getBeginLoc(), diag::err_omp_not_a_loop_sequence) + << getOpenMPDirectiveName(Kind); + return false; + } + + // Recursive entry point to process the main loop sequence. + if (!analyzeLoopSequence(AStmt, SeqAnalysis, Context, Kind)) + return false; + + // Diagnose an empty loop sequence. + if (!SeqAnalysis.LoopSeqSize) { + Diag(AStmt->getBeginLoc(), diag::err_omp_empty_loop_sequence) + << getOpenMPDirectiveName(Kind); + return false; + } + return true; +} + +/// Add preinit statements that need to be propagated from the selected loop. static void addLoopPreInits(ASTContext &Context, OMPLoopBasedDirective::HelperExprs &LoopHelper, Stmt *LoopStmt, ArrayRef OriginalInit, @@ -14540,7 +14811,7 @@ StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef Clauses, // Verify and diagnose loop nest. SmallVector LoopHelpers(NumLoops); Stmt *Body = nullptr; - SmallVector, 4> OriginalInits; + SmallVector, 4> OriginalInits; if (!checkTransformableLoopNest(OMPD_tile, AStmt, NumLoops, LoopHelpers, Body, OriginalInits)) return StmtError(); @@ -14817,7 +15088,7 @@ StmtResult SemaOpenMP::ActOnOpenMPStripeDirective(ArrayRef Clauses, // Verify and diagnose loop nest.
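[Editorial illustration, not part of the patch] To make the canonical loop sequence grammar spelled out in checkTransformableLoopSequence above concrete, here is a minimal sketch of a statement the new analyzeLoopSequence path is intended to accept; the names a, b, c and the bound n are hypothetical. The associated statement must be a compound statement whose children are canonical loop nests or loop-sequence-generating transformations; any other child is diagnosed with err_omp_not_for, and an empty sequence with err_omp_empty_loop_sequence.

  #pragma omp fuse
  {
    for (int i = 0; i < n; ++i)    // loop-nest
      a[i] = 0;
    #pragma omp unroll partial(4)  // loop-sequence-generating construct
    for (int j = 0; j < n; ++j)    // its generated loop joins the sequence
      b[j] += a[j];
    for (int k = 0; k < n; ++k)    // loop-nest
      c[k] = b[k];
  }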
SmallVector LoopHelpers(NumLoops); Stmt *Body = nullptr; - SmallVector, 4> OriginalInits; + SmallVector, 4> OriginalInits; if (!checkTransformableLoopNest(OMPD_stripe, AStmt, NumLoops, LoopHelpers, Body, OriginalInits)) return StmtError(); @@ -15078,7 +15349,7 @@ StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef Clauses, Stmt *Body = nullptr; SmallVector LoopHelpers( NumLoops); - SmallVector, NumLoops + 1> OriginalInits; + SmallVector, NumLoops + 1> OriginalInits; if (!checkTransformableLoopNest(OMPD_unroll, AStmt, NumLoops, LoopHelpers, Body, OriginalInits)) return StmtError(); @@ -15348,7 +15619,7 @@ StmtResult SemaOpenMP::ActOnOpenMPReverseDirective(Stmt *AStmt, Stmt *Body = nullptr; SmallVector LoopHelpers( NumLoops); - SmallVector, NumLoops + 1> OriginalInits; + SmallVector, NumLoops + 1> OriginalInits; if (!checkTransformableLoopNest(OMPD_reverse, AStmt, NumLoops, LoopHelpers, Body, OriginalInits)) return StmtError(); @@ -15540,7 +15811,7 @@ StmtResult SemaOpenMP::ActOnOpenMPInterchangeDirective( // Verify and diagnose loop nest. SmallVector LoopHelpers(NumLoops); Stmt *Body = nullptr; - SmallVector, 2> OriginalInits; + SmallVector, 2> OriginalInits; if (!checkTransformableLoopNest(OMPD_interchange, AStmt, NumLoops, LoopHelpers, Body, OriginalInits)) return StmtError(); @@ -15716,6 +15987,484 @@ StmtResult SemaOpenMP::ActOnOpenMPInterchangeDirective( buildPreInits(Context, PreInits)); } +StmtResult SemaOpenMP::ActOnOpenMPFuseDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { + + ASTContext &Context = getASTContext(); + DeclContext *CurrContext = SemaRef.CurContext; + Scope *CurScope = SemaRef.getCurScope(); + CaptureVars CopyTransformer(SemaRef); + + // Ensure the structured block is not empty + if (!AStmt) + return StmtError(); + + // Defer transformation in dependent contexts + // The NumLoopNests argument is set to a placeholder 1 (even though + // using looprange fuse could yield up to 3 top level loop nests) + // because a dependent context could prevent determining its true value + if (CurrContext->isDependentContext()) + return OMPFuseDirective::Create(Context, StartLoc, EndLoc, Clauses, + /* NumLoops */ 1, AStmt, nullptr, nullptr); + + // Validate that the potential loop sequence is transformable for fusion + // Also collect the HelperExprs, Loop Stmts, Inits, and Number of loops + LoopSequenceAnalysis SeqAnalysis; + if (!checkTransformableLoopSequence(OMPD_fuse, AStmt, SeqAnalysis, Context)) + return StmtError(); + + // SeqAnalysis.LoopSeqSize exists mostly to handle dependent contexts, + // otherwise it must be the same as SeqAnalysis.Loops.size(). 
+ assert(SeqAnalysis.LoopSeqSize == SeqAnalysis.Loops.size() && + "Inconsistent size of the loop sequence and the number of loops " + "found in the sequence"); + + // Handle clauses, which can be any of the following: [looprange, apply] + const auto *LRC = + OMPExecutableDirective::getSingleClause(Clauses); + + // The clause arguments are invalidated if any error arises + // such as non-constant or non-positive arguments + if (LRC && (!LRC->getFirst() || !LRC->getCount())) + return StmtError(); + + // Delayed semantic check of LoopRange constraint + // Evaluates the loop range arguments and returns the first and count values + auto EvaluateLoopRangeArguments = [&Context](Expr *First, Expr *Count, + uint64_t &FirstVal, + uint64_t &CountVal) { + llvm::APSInt FirstInt = First->EvaluateKnownConstInt(Context); + llvm::APSInt CountInt = Count->EvaluateKnownConstInt(Context); + FirstVal = FirstInt.getZExtValue(); + CountVal = CountInt.getZExtValue(); + }; + + // OpenMP [6.0, Restrictions] + // first + count - 1 must not evaluate to a value greater than the + // loop sequence length of the associated canonical loop sequence. + auto ValidLoopRange = [](uint64_t FirstVal, uint64_t CountVal, + unsigned NumLoops) -> bool { + return FirstVal + CountVal - 1 <= NumLoops; + }; + uint64_t FirstVal = 1, CountVal = 0, LastVal = SeqAnalysis.LoopSeqSize; + + // Validates the loop range after evaluating the semantic information + // and ensures that the range is valid for the given loop sequence size. + // Expressions are evaluated at compile time to obtain constant values. + if (LRC) { + EvaluateLoopRangeArguments(LRC->getFirst(), LRC->getCount(), FirstVal, + CountVal); + if (CountVal == 1) + SemaRef.Diag(LRC->getCountLoc(), diag::warn_omp_redundant_fusion) + << getOpenMPDirectiveName(OMPD_fuse); + + if (!ValidLoopRange(FirstVal, CountVal, SeqAnalysis.LoopSeqSize)) { + SemaRef.Diag(LRC->getFirstLoc(), diag::err_omp_invalid_looprange) + << getOpenMPDirectiveName(OMPD_fuse) << FirstVal + << (FirstVal + CountVal - 1) << SeqAnalysis.LoopSeqSize; + return StmtError(); + } + + LastVal = FirstVal + CountVal - 1; + } + + // Complete fusion generates a single canonical loop nest + // However looprange clause may generate several loop nests + unsigned NumGeneratedTopLevelLoops = + LRC ? SeqAnalysis.LoopSeqSize - CountVal + 1 : 1; + + // Emit a warning for redundant loop fusion when the sequence contains only + // one loop. + if (SeqAnalysis.LoopSeqSize == 1) + SemaRef.Diag(AStmt->getBeginLoc(), diag::warn_omp_redundant_fusion) + << getOpenMPDirectiveName(OMPD_fuse); + + // Select the type with the largest bit width among all induction variables + QualType IVType = + SeqAnalysis.Loops[FirstVal - 1].HelperExprs.IterationVarRef->getType(); + for (unsigned I : llvm::seq(FirstVal, LastVal)) { + QualType CurrentIVType = + SeqAnalysis.Loops[I].HelperExprs.IterationVarRef->getType(); + if (Context.getTypeSize(CurrentIVType) > Context.getTypeSize(IVType)) { + IVType = CurrentIVType; + } + } + uint64_t IVBitWidth = Context.getIntWidth(IVType); + + // Create pre-init declarations for all loops lower bounds, upper bounds, + // strides and num-iterations for every top level loop in the fusion + SmallVector LBVarDecls; + SmallVector STVarDecls; + SmallVector NIVarDecls; + SmallVector UBVarDecls; + SmallVector IVVarDecls; + + // Helper lambda to create variables for bounds, strides, and other + // expressions. Generates both the variable declaration and the corresponding + // initialization statement. 
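Before the helper-variable machinery below, a quick worked instance of the looprange arithmetic enforced above (an editorial sketch, not from the patch; the arrays and n are hypothetical). For a sequence of four top-level loops (LoopSeqSize == 4), looprange(2, 3) selects loops 2 through 2 + 3 - 1 == 4 and passes ValidLoopRange; looprange(4, 2) would require a fifth loop (4 + 2 - 1 == 5 > 4) and is rejected with err_omp_invalid_looprange; looprange(3, 1) is accepted but draws warn_omp_redundant_fusion, since fusing a single loop changes nothing.

  #pragma omp fuse looprange(2, 3)  // ok: 2 + 3 - 1 == 4 == LoopSeqSize
  {
    for (int i = 0; i < n; ++i) a0[i] = 0;  // loop 1, left as written
    for (int i = 0; i < n; ++i) a1[i] = 0;  // loop 2, fused
    for (int i = 0; i < n; ++i) a2[i] = 0;  // loop 3, fused
    for (int i = 0; i < n; ++i) a3[i] = 0;  // loop 4, fused
  }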
+ auto CreateHelperVarAndStmt = + [&, &SemaRef = SemaRef](Expr *ExprToCopy, const std::string &BaseName, + unsigned I, bool NeedsNewVD = false) { + Expr *TransformedExpr = + AssertSuccess(CopyTransformer.TransformExpr(ExprToCopy)); + if (!TransformedExpr) + return std::pair(nullptr, StmtError()); + + auto Name = (Twine(".omp.") + BaseName + std::to_string(I)).str(); + + VarDecl *VD; + if (NeedsNewVD) { + VD = buildVarDecl(SemaRef, SourceLocation(), IVType, Name); + SemaRef.AddInitializerToDecl(VD, TransformedExpr, false); + } else { + // Create a unique variable name + DeclRefExpr *DRE = cast(TransformedExpr); + VD = cast(DRE->getDecl()); + VD->setDeclName(&SemaRef.PP.getIdentifierTable().get(Name)); + } + // Create the corresponding declaration statement + StmtResult DeclStmt = new (Context) class DeclStmt( + DeclGroupRef(VD), SourceLocation(), SourceLocation()); + return std::make_pair(VD, DeclStmt); + }; + + // PreInits hold a sequence of variable declarations that must be executed + // before the fused loop begins. These include bounds, strides, and other + // helper variables required for the transformation. Other loop transforms + // also contain their own preinits. + SmallVector PreInits; + + // Update the general preinits using the preinits generated by loop sequence + // generating loop transformations. These preinits differ slightly from + // single-loop transformation preinits, as they can be detached from a + // specific loop inside multiple generated loop nests. This happens + // because certain helper variables, like '.omp.fuse.max', are introduced to + // handle fused iteration spaces and may not be directly tied to a single + // original loop. The preinit structure must ensure that hidden variables + // like '.omp.fuse.max' are still properly handled. + // Transformations that apply this concept: Loopranged Fuse, Split + llvm::append_range(PreInits, SeqAnalysis.LoopSequencePreInits); + + // Process each single loop to generate and collect declarations + // and statements for all helper expressions related to + // particular single loop nests. + + // Also, in the case of the fused loops, we keep track of their original + // inits by appending them to their preinits statement, and in the case of + // transformations, also append their preinits (which contain the original + // loop initialization statement or other statements). + + // First, we need to set TransformIndex to match the beginning of the + // looprange section. + unsigned int TransformIndex = 0; + for (unsigned I : llvm::seq(FirstVal - 1)) { + if (SeqAnalysis.Loops[I].isLoopTransformation()) + ++TransformIndex; + } + + for (unsigned int I = FirstVal - 1, J = 0; I < LastVal; ++I, ++J) { + if (SeqAnalysis.Loops[I].isRegularLoop()) { + addLoopPreInits(Context, SeqAnalysis.Loops[I].HelperExprs, + SeqAnalysis.Loops[I].TheForStmt, + SeqAnalysis.Loops[I].OriginalInits, PreInits); + } else if (SeqAnalysis.Loops[I].isLoopTransformation()) { + // For transformed loops, insert both pre-inits and original inits. + // Order matters: pre-inits may define variables used in the original + // inits such as upper bounds...
+ SmallVector &TransformPreInit = + SeqAnalysis.Loops[TransformIndex++].TransformsPreInits; + llvm::append_range(PreInits, TransformPreInit); + + addLoopPreInits(Context, SeqAnalysis.Loops[I].HelperExprs, + SeqAnalysis.Loops[I].TheForStmt, + SeqAnalysis.Loops[I].OriginalInits, PreInits); + } + auto [UBVD, UBDStmt] = + CreateHelperVarAndStmt(SeqAnalysis.Loops[I].HelperExprs.UB, "ub", J); + auto [LBVD, LBDStmt] = + CreateHelperVarAndStmt(SeqAnalysis.Loops[I].HelperExprs.LB, "lb", J); + auto [STVD, STDStmt] = + CreateHelperVarAndStmt(SeqAnalysis.Loops[I].HelperExprs.ST, "st", J); + auto [NIVD, NIDStmt] = CreateHelperVarAndStmt( + SeqAnalysis.Loops[I].HelperExprs.NumIterations, "ni", J, true); + auto [IVVD, IVDStmt] = CreateHelperVarAndStmt( + SeqAnalysis.Loops[I].HelperExprs.IterationVarRef, "iv", J); + + assert(LBVD && STVD && NIVD && IVVD && + "OpenMP Fuse Helper variables creation failed"); + + UBVarDecls.push_back(UBVD); + LBVarDecls.push_back(LBVD); + STVarDecls.push_back(STVD); + NIVarDecls.push_back(NIVD); + IVVarDecls.push_back(IVVD); + + PreInits.push_back(LBDStmt.get()); + PreInits.push_back(STDStmt.get()); + PreInits.push_back(NIDStmt.get()); + PreInits.push_back(IVDStmt.get()); + } + + auto MakeVarDeclRef = [&SemaRef = this->SemaRef](VarDecl *VD) { + return buildDeclRefExpr(SemaRef, VD, VD->getType(), VD->getLocation(), + false); + }; + + // Next, the creation of the final fused loop is performed, + // which has the following shape (considering the selected loops): + // + // for (fuse.index = 0; fuse.index < max(ni0, ni1..., nik); ++fuse.index) { + // if (fuse.index < ni0){ + // iv0 = lb0 + st0 * fuse.index; + // original.index0 = iv0 + // body(0); + // } + // if (fuse.index < ni1){ + // iv1 = lb1 + st1 * fuse.index; + // original.index1 = iv1 + // body(1); + // } + // + // ... + // + // if (fuse.index < nik){ + // ivk = lbk + stk * fuse.index; + // original.indexk = ivk + // body(k); + // } + + // 1. Create the initialized fuse index + StringRef IndexName = ".omp.fuse.index"; + Expr *InitVal = IntegerLiteral::Create(Context, llvm::APInt(IVBitWidth, 0), + IVType, SourceLocation()); + VarDecl *IndexDecl = + buildVarDecl(SemaRef, {}, IVType, IndexName, nullptr, nullptr); + SemaRef.AddInitializerToDecl(IndexDecl, InitVal, false); + StmtResult InitStmt = new (Context) + DeclStmt(DeclGroupRef(IndexDecl), SourceLocation(), SourceLocation()); + + if (!InitStmt.isUsable()) + return StmtError(); + + auto MakeIVRef = [&SemaRef = this->SemaRef, IndexDecl, IVType, + Loc = InitVal->getExprLoc()]() { + return buildDeclRefExpr(SemaRef, IndexDecl, IVType, Loc, false); + }; + + // 2. Iteratively compute the max number of logical iterations Max(NI_1, NI_2, + // ..., NI_k) + // + // This loop accumulates the maximum value across multiple expressions, + // ensuring each step constructs a unique AST node for correctness. By using + // intermediate temporary variables and conditional operators, we maintain + // distinct nodes and avoid duplicating subtrees. For instance, for max(a, b, c): + // omp.temp0 = max(a, b) + // omp.temp1 = max(omp.temp0, c) + // omp.fuse.max = omp.temp1 + + ExprResult MaxExpr; + // I is the range of loops in the sequence that we fuse.
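[Editorial reading aid, not part of the patch] For the loop that follows, with three selected loops whose trip counts live in the .omp.ni0, .omp.ni1 and .omp.ni2 pre-init declarations created above, the chain of temporaries is built roughly as

  .omp.temp.1   = .omp.ni0;
  .omp.temp.2   = .omp.temp.1 > .omp.ni1 ? .omp.temp.1 : .omp.ni1;
  .omp.fuse.max = .omp.temp.2 > .omp.ni2 ? .omp.temp.2 : .omp.ni2;

so each step reuses the previous temporary through a fresh DeclRefExpr instead of duplicating the already-built conditional subtree.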
+ for (unsigned I = FirstVal - 1, J = 0; I < LastVal; ++I, ++J) { + DeclRefExpr *NIRef = MakeVarDeclRef(NIVarDecls[J]); + QualType NITy = NIRef->getType(); + + if (MaxExpr.isUnset()) { + // Initialize MaxExpr with the first NI expression + MaxExpr = NIRef; + } else { + // Create a new acummulator variable t_i = MaxExpr + std::string TempName = (Twine(".omp.temp.") + Twine(J)).str(); + VarDecl *TempDecl = + buildVarDecl(SemaRef, {}, NITy, TempName, nullptr, nullptr); + TempDecl->setInit(MaxExpr.get()); + DeclRefExpr *TempRef = + buildDeclRefExpr(SemaRef, TempDecl, NITy, SourceLocation(), false); + DeclRefExpr *TempRef2 = + buildDeclRefExpr(SemaRef, TempDecl, NITy, SourceLocation(), false); + // Add a DeclStmt to PreInits to ensure the variable is declared. + StmtResult TempStmt = new (Context) + DeclStmt(DeclGroupRef(TempDecl), SourceLocation(), SourceLocation()); + + if (!TempStmt.isUsable()) + return StmtError(); + PreInits.push_back(TempStmt.get()); + + // Build MaxExpr <-(MaxExpr > NIRef ? MaxExpr : NIRef) + ExprResult Comparison = + SemaRef.BuildBinOp(nullptr, SourceLocation(), BO_GT, TempRef, NIRef); + // Handle any errors in Comparison creation + if (!Comparison.isUsable()) + return StmtError(); + + DeclRefExpr *NIRef2 = MakeVarDeclRef(NIVarDecls[J]); + // Update MaxExpr using a conditional expression to hold the max value + MaxExpr = new (Context) ConditionalOperator( + Comparison.get(), SourceLocation(), TempRef2, SourceLocation(), + NIRef2->getExprStmt(), NITy, VK_LValue, OK_Ordinary); + + if (!MaxExpr.isUsable()) + return StmtError(); + } + } + if (!MaxExpr.isUsable()) + return StmtError(); + + // 3. Declare the max variable + const std::string MaxName = Twine(".omp.fuse.max").str(); + VarDecl *MaxDecl = + buildVarDecl(SemaRef, {}, IVType, MaxName, nullptr, nullptr); + MaxDecl->setInit(MaxExpr.get()); + DeclRefExpr *MaxRef = buildDeclRefExpr(SemaRef, MaxDecl, IVType, {}, false); + StmtResult MaxStmt = new (Context) + DeclStmt(DeclGroupRef(MaxDecl), SourceLocation(), SourceLocation()); + + if (MaxStmt.isInvalid()) + return StmtError(); + PreInits.push_back(MaxStmt.get()); + + // 4. Create condition Expr: index < n_max + ExprResult CondExpr = SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_LT, + MakeIVRef(), MaxRef); + if (!CondExpr.isUsable()) + return StmtError(); + + // 5. Increment Expr: ++index + ExprResult IncrExpr = + SemaRef.BuildUnaryOp(CurScope, SourceLocation(), UO_PreInc, MakeIVRef()); + if (!IncrExpr.isUsable()) + return StmtError(); + + // 6. Build the Fused Loop Body + // The final fused loop iterates over the maximum logical range. Inside the + // loop, each original loop's index is calculated dynamically, and its body + // is executed conditionally. 
+ // + // Each sub-loop's body is guarded by a conditional statement to ensure + // it executes only within its logical iteration range: + // + // if (fuse.index < ni_k){ + // iv_k = lb_k + st_k * fuse.index; + // original.index = iv_k + // body(k); + // } + + CompoundStmt *FusedBody = nullptr; + SmallVector FusedBodyStmts; + for (unsigned I = FirstVal - 1, J = 0; I < LastVal; ++I, ++J) { + // Assingment of the original sub-loop index to compute the logical index + // IV_k = LB_k + omp.fuse.index * ST_k + ExprResult IdxExpr = + SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_Mul, + MakeVarDeclRef(STVarDecls[J]), MakeIVRef()); + if (!IdxExpr.isUsable()) + return StmtError(); + IdxExpr = SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_Add, + MakeVarDeclRef(LBVarDecls[J]), IdxExpr.get()); + + if (!IdxExpr.isUsable()) + return StmtError(); + IdxExpr = SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_Assign, + MakeVarDeclRef(IVVarDecls[J]), IdxExpr.get()); + if (!IdxExpr.isUsable()) + return StmtError(); + + // Update the original i_k = IV_k + SmallVector BodyStmts; + BodyStmts.push_back(IdxExpr.get()); + llvm::append_range(BodyStmts, SeqAnalysis.Loops[I].HelperExprs.Updates); + + // If the loop is a CXXForRangeStmt then the iterator variable is needed + if (auto *SourceCXXFor = + dyn_cast(SeqAnalysis.Loops[I].TheForStmt)) + BodyStmts.push_back(SourceCXXFor->getLoopVarStmt()); + + Stmt *Body = + (isa(SeqAnalysis.Loops[I].TheForStmt)) + ? cast(SeqAnalysis.Loops[I].TheForStmt)->getBody() + : cast(SeqAnalysis.Loops[I].TheForStmt)->getBody(); + BodyStmts.push_back(Body); + + CompoundStmt *CombinedBody = + CompoundStmt::Create(Context, BodyStmts, FPOptionsOverride(), + SourceLocation(), SourceLocation()); + ExprResult Condition = + SemaRef.BuildBinOp(CurScope, SourceLocation(), BO_LT, MakeIVRef(), + MakeVarDeclRef(NIVarDecls[J])); + + if (!Condition.isUsable()) + return StmtError(); + + IfStmt *IfStatement = IfStmt::Create( + Context, SourceLocation(), IfStatementKind::Ordinary, nullptr, nullptr, + Condition.get(), SourceLocation(), SourceLocation(), CombinedBody, + SourceLocation(), nullptr); + + FusedBodyStmts.push_back(IfStatement); + } + FusedBody = CompoundStmt::Create(Context, FusedBodyStmts, FPOptionsOverride(), + SourceLocation(), SourceLocation()); + + // 7. Construct the final fused loop + ForStmt *FusedForStmt = new (Context) + ForStmt(Context, InitStmt.get(), CondExpr.get(), nullptr, IncrExpr.get(), + FusedBody, InitStmt.get()->getBeginLoc(), SourceLocation(), + IncrExpr.get()->getEndLoc()); + + // In the case of looprange, the result of fuse won't simply + // be a single loop (ForStmt), but rather a loop sequence + // (CompoundStmt) of 3 parts: the pre-fusion loops, the fused loop + // and the post-fusion loops, preserving its original order. + // + // Note: If looprange clause produces a single fused loop nest then + // this compound statement wrapper is unnecessary (Therefore this + // treatment is skipped) + + Stmt *FusionStmt = FusedForStmt; + if (LRC && CountVal != SeqAnalysis.LoopSeqSize) { + SmallVector FinalLoops; + + // Reset the transform index + TransformIndex = 0; + + // Collect all non-fused loops before and after the fused region. + // Pre-fusion and post-fusion loops are inserted in order exploiting their + // symmetry, along with their corresponding transformation pre-inits if + // needed. The fused loop is added between the two regions. 
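[Editorial illustration, not part of the patch] As a concrete picture of the structure described above: for a hypothetical sequence of four loops L1..L4 under #pragma omp fuse looprange(2, 2), the code below builds a compound statement that keeps the original order, so the directive's transformed statement is conceptually

  {
    L1;                 // pre-fusion loop, kept as written
    for (...) { ... }   // the fused loop built from L2 and L3
    L4;                 // post-fusion loop, kept as written
  }

whereas a complete fusion (no looprange clause, or a range covering the whole sequence) skips this wrapper and yields the fused ForStmt directly.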
+ for (unsigned I : llvm::seq(SeqAnalysis.LoopSeqSize)) { + if (I >= FirstVal - 1 && I < FirstVal + CountVal - 1) { + // Update the Transformation counter to skip already treated + // loop transformations + if (!SeqAnalysis.Loops[I].isLoopTransformation()) + ++TransformIndex; + continue; + } + + // No need to handle: + // Regular loops: they are kept intact as-is. + // Loop-sequence-generating transformations: already handled earlier. + // Only TransformSingleLoop requires inserting pre-inits here + if (SeqAnalysis.Loops[I].isRegularLoop()) { + const auto &TransformPreInit = + SeqAnalysis.Loops[TransformIndex++].TransformsPreInits; + if (!TransformPreInit.empty()) + llvm::append_range(PreInits, TransformPreInit); + } + + FinalLoops.push_back(SeqAnalysis.Loops[I].TheForStmt); + } + + FinalLoops.insert(FinalLoops.begin() + (FirstVal - 1), FusedForStmt); + FusionStmt = CompoundStmt::Create(Context, FinalLoops, FPOptionsOverride(), + SourceLocation(), SourceLocation()); + } + return OMPFuseDirective::Create(Context, StartLoc, EndLoc, Clauses, + NumGeneratedTopLevelLoops, AStmt, FusionStmt, + buildPreInits(Context, PreInits)); +} + OMPClause *SemaOpenMP::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, SourceLocation StartLoc, @@ -16887,6 +17636,31 @@ OMPClause *SemaOpenMP::ActOnOpenMPPartialClause(Expr *FactorExpr, FactorExpr); } +OMPClause *SemaOpenMP::ActOnOpenMPLoopRangeClause( + Expr *First, Expr *Count, SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation FirstLoc, SourceLocation CountLoc, SourceLocation EndLoc) { + + // OpenMP [6.0, Restrictions] + // First and Count must be integer expressions with positive value + ExprResult FirstVal = + VerifyPositiveIntegerConstantInClause(First, OMPC_looprange); + if (FirstVal.isInvalid()) + First = nullptr; + + ExprResult CountVal = + VerifyPositiveIntegerConstantInClause(Count, OMPC_looprange); + if (CountVal.isInvalid()) + Count = nullptr; + + // OpenMP [6.0, Restrictions] + // first + count - 1 must not evaluate to a value greater than the + // loop sequence length of the associated canonical loop sequence. + // This check must be performed afterwards due to the delayed + // parsing and computation of the associated loop sequence + return OMPLoopRangeClause::Create(getASTContext(), StartLoc, LParenLoc, + FirstLoc, CountLoc, EndLoc, First, Count); +} + OMPClause *SemaOpenMP::ActOnOpenMPAlignClause(Expr *A, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index f051a246f954f..2bf1511c5cfa0 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -775,6 +775,40 @@ Sema::BuildDependentDeclRefExpr(const CXXScopeSpec &SS, TemplateArgs); } +ExprResult Sema::BuildSubstNonTypeTemplateParmExpr( + Decl *AssociatedDecl, const NonTypeTemplateParmDecl *NTTP, + SourceLocation Loc, TemplateArgument Arg, UnsignedOrNone PackIndex, + bool Final) { + // The template argument itself might be an expression, in which case we just + // return that expression. This happens when substituting into an alias + // template. + Expr *Replacement; + bool refParam = true; + if (Arg.getKind() == TemplateArgument::Expression) { + Replacement = Arg.getAsExpr(); + refParam = Replacement->isLValue(); + if (refParam && Replacement->getType()->isRecordType()) { + QualType ParamType = + NTTP->isExpandedParameterPack() + ? 
NTTP->getExpansionType(*SemaRef.ArgPackSubstIndex) + : NTTP->getType(); + if (const auto *PET = dyn_cast(ParamType)) + ParamType = PET->getPattern(); + refParam = ParamType->isReferenceType(); + } + } else { + ExprResult result = + SemaRef.BuildExpressionFromNonTypeTemplateArgument(Arg, Loc); + if (result.isInvalid()) + return ExprError(); + Replacement = result.get(); + refParam = Arg.getNonTypeTemplateArgumentType()->isReferenceType(); + } + return new (SemaRef.Context) SubstNonTypeTemplateParmExpr( + Replacement->getType(), Replacement->getValueKind(), Loc, Replacement, + AssociatedDecl, NTTP->getIndex(), PackIndex, refParam, Final); +} + bool Sema::DiagnoseUninstantiableTemplate(SourceLocation PointOfInstantiation, NamedDecl *Instantiation, bool InstantiatedFromMember, @@ -7068,22 +7102,8 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, // If the parameter type somehow involves auto, deduce the type now. DeducedType *DeducedT = ParamType->getContainedDeducedType(); - if (getLangOpts().CPlusPlus17 && DeducedT && !DeducedT->isDeduced()) { - // During template argument deduction, we allow 'decltype(auto)' to - // match an arbitrary dependent argument. - // FIXME: The language rules don't say what happens in this case. - // FIXME: We get an opaque dependent type out of decltype(auto) if the - // expression is merely instantiation-dependent; is this enough? - if (DeductionArg->isTypeDependent()) { - auto *AT = dyn_cast(DeducedT); - if (AT && AT->isDecltypeAuto()) { - SugaredConverted = TemplateArgument(Arg, /*IsCanonical=*/false); - CanonicalConverted = TemplateArgument( - Context.getCanonicalTemplateArgument(SugaredConverted)); - return Arg; - } - } - + bool IsDeduced = DeducedT && DeducedT->getDeducedType().isNull(); + if (IsDeduced) { // When checking a deduced template argument, deduce from its type even if // the type is dependent, in order to check the types of non-type template // arguments line up properly in partial ordering. @@ -7112,17 +7132,21 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, // along with the other associated constraints after // checking the template argument list. /*IgnoreConstraints=*/true); - if (Result == TemplateDeductionResult::AlreadyDiagnosed) { - return ExprError(); - } else if (Result != TemplateDeductionResult::Success) { - if (const auto *NTTP = dyn_cast(Param)) { - Diag(Arg->getExprLoc(), - diag::err_non_type_template_parm_type_deduction_failure) - << Param->getDeclName() << NTTP->getType() << Arg->getType() - << Arg->getSourceRange(); + if (Result != TemplateDeductionResult::Success) { + ParamType = TSI->getType(); + if (StrictCheck || !DeductionArg->isTypeDependent()) { + if (Result == TemplateDeductionResult::AlreadyDiagnosed) + return ExprError(); + if (const auto *NTTP = dyn_cast(Param)) + Diag(Arg->getExprLoc(), + diag::err_non_type_template_parm_type_deduction_failure) + << Param->getDeclName() << NTTP->getType() << Arg->getType() + << Arg->getSourceRange(); + NoteTemplateParameterLocation(*Param); + return ExprError(); } - NoteTemplateParameterLocation(*Param); - return ExprError(); + ParamType = SubstAutoTypeDependent(ParamType); + assert(!ParamType.isNull() && "substituting DependentTy can't fail"); } } // CheckNonTypeTemplateParameterType will produce a diagnostic if there's @@ -7144,14 +7168,16 @@ ExprResult Sema::CheckTemplateArgument(NamedDecl *Param, QualType ParamType, // type-dependent, there's nothing we can check now. 
if (ParamType->isDependentType() || DeductionArg->isTypeDependent()) { // Force the argument to the type of the parameter to maintain invariants. - ExprResult E = ImpCastExprToType( - DeductionArg, ParamType.getNonLValueExprType(Context), CK_Dependent, - ParamType->isLValueReferenceType() ? VK_LValue - : ParamType->isRValueReferenceType() ? VK_XValue - : VK_PRValue); - if (E.isInvalid()) - return ExprError(); - setDeductionArg(E.get()); + if (!IsDeduced) { + ExprResult E = ImpCastExprToType( + DeductionArg, ParamType.getNonLValueExprType(Context), CK_Dependent, + ParamType->isLValueReferenceType() ? VK_LValue + : ParamType->isRValueReferenceType() ? VK_XValue + : VK_PRValue); + if (E.isInvalid()) + return ExprError(); + setDeductionArg(E.get()); + } SugaredConverted = TemplateArgument(Arg, /*IsCanonical=*/false); CanonicalConverted = TemplateArgument( Context.getCanonicalTemplateArgument(SugaredConverted)); @@ -8555,6 +8581,7 @@ static SourceRange findTemplateParameter(unsigned Depth, TypeLoc TL) { static bool CheckNonTypeTemplatePartialSpecializationArgs( Sema &S, SourceLocation TemplateNameLoc, NonTypeTemplateParmDecl *Param, const TemplateArgument *Args, unsigned NumArgs, bool IsDefaultArgument) { + bool HasError = false; for (unsigned I = 0; I != NumArgs; ++I) { if (Args[I].getKind() == TemplateArgument::Pack) { if (CheckNonTypeTemplatePartialSpecializationArgs( @@ -8569,6 +8596,10 @@ static bool CheckNonTypeTemplatePartialSpecializationArgs( continue; Expr *ArgExpr = Args[I].getAsExpr(); + if (ArgExpr->containsErrors()) { + HasError = true; + continue; + } // We can have a pack expansion of any of the bullets below. if (PackExpansionExpr *Expansion = dyn_cast(ArgExpr)) @@ -8638,7 +8669,7 @@ static bool CheckNonTypeTemplatePartialSpecializationArgs( } } - return false; + return HasError; } bool Sema::CheckTemplatePartialSpecializationArgs( diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 62e867c44ad14..f6ee7452c2f9a 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -483,7 +483,7 @@ DeduceNonTypeTemplateArgument(Sema &S, TemplateParameterList *TemplateParams, return TemplateDeductionResult::Inconsistent; } Deduced[NTTP.getIndex()] = Result; - if (!S.getLangOpts().CPlusPlus17) + if (!S.getLangOpts().CPlusPlus17 && !PartialOrdering) return TemplateDeductionResult::Success; if (NTTP.isExpandedParameterPack()) @@ -2652,28 +2652,11 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams, getDeducedNTTParameterFromExpr(Info, P.getAsExpr())) { switch (A.getKind()) { case TemplateArgument::Expression: { - const Expr *E = A.getAsExpr(); - // When checking NTTP, if either the parameter or the argument is - // dependent, as there would be otherwise nothing to deduce, we force - // the argument to the parameter type using this dependent implicit - // cast, in order to maintain invariants. Now we can deduce the - // resulting type from the original type, and deduce the original type - // against the parameter we are checking. - if (const auto *ICE = dyn_cast(E); - ICE && ICE->getCastKind() == clang::CK_Dependent) { - E = ICE->getSubExpr(); - if (auto Result = DeduceTemplateArgumentsByTypeMatch( - S, TemplateParams, ICE->getType(), E->getType(), Info, - Deduced, TDF_SkipNonDependent, - PartialOrdering ? 
PartialOrderingKind::NonCall - : PartialOrderingKind::None, - /*DeducedFromArrayBound=*/false, HasDeducedAnyParam); - Result != TemplateDeductionResult::Success) - return Result; - } + // The type of the value is the type of the expression as written. return DeduceNonTypeTemplateArgument( - S, TemplateParams, NTTP, DeducedTemplateArgument(A), E->getType(), - Info, PartialOrdering, Deduced, HasDeducedAnyParam); + S, TemplateParams, NTTP, DeducedTemplateArgument(A), + A.getAsExpr()->IgnoreImplicitAsWritten()->getType(), Info, + PartialOrdering, Deduced, HasDeducedAnyParam); } case TemplateArgument::Integral: case TemplateArgument::StructuralValue: @@ -5279,18 +5262,6 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result, SmallVector Deduced; Deduced.resize(1); - // If deduction failed, don't diagnose if the initializer is dependent; it - // might acquire a matching type in the instantiation. - auto DeductionFailed = [&](TemplateDeductionResult TDK) { - if (Init->isTypeDependent()) { - Result = - SubstituteDeducedTypeTransform(*this, DependentResult).Apply(Type); - assert(!Result.isNull() && "substituting DependentTy can't fail"); - return TemplateDeductionResult::Success; - } - return TDK; - }; - SmallVector OriginalCallArgs; QualType DeducedType; @@ -5340,9 +5311,9 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result, Diag(Info.getLocation(), diag::err_auto_inconsistent_deduction) << Info.FirstArg << Info.SecondArg << DeducedFromInitRange << Init->getSourceRange(); - return DeductionFailed(TemplateDeductionResult::AlreadyDiagnosed); + return TemplateDeductionResult::AlreadyDiagnosed; } - return DeductionFailed(TDK); + return TDK; } if (DeducedFromInitRange.isInvalid() && @@ -5364,12 +5335,12 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result, OriginalCallArgs, /*Decomposed=*/false, /*ArgIdx=*/0, /*TDF=*/0, FailedTSC); TDK != TemplateDeductionResult::Success) - return DeductionFailed(TDK); + return TDK; } // Could be null if somehow 'auto' appears in a non-deduced context. 
if (Deduced[0].getKind() != TemplateArgument::Type) - return DeductionFailed(TemplateDeductionResult::Incomplete); + return TemplateDeductionResult::Incomplete; DeducedType = Deduced[0].getAsType(); if (InitList) { @@ -5383,7 +5354,7 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result, if (!Context.hasSameType(DeducedType, Result)) { Info.FirstArg = Result; Info.SecondArg = DeducedType; - return DeductionFailed(TemplateDeductionResult::Inconsistent); + return TemplateDeductionResult::Inconsistent; } DeducedType = Context.getCommonSugaredType(Result, DeducedType); } @@ -5407,7 +5378,7 @@ Sema::DeduceAutoType(TypeLoc Type, Expr *Init, QualType &Result, CheckOriginalCallArgDeduction(*this, Info, OriginalArg, DeducedA); TDK != TemplateDeductionResult::Success) { Result = QualType(); - return DeductionFailed(TDK); + return TDK; } } @@ -5429,13 +5400,17 @@ TypeSourceInfo *Sema::SubstAutoTypeSourceInfo(TypeSourceInfo *TypeWithAuto, } QualType Sema::SubstAutoTypeDependent(QualType TypeWithAuto) { - return SubstituteDeducedTypeTransform(*this, DependentAuto{false}) + return SubstituteDeducedTypeTransform( + *this, + DependentAuto{/*IsPack=*/isa(TypeWithAuto)}) .TransformType(TypeWithAuto); } TypeSourceInfo * Sema::SubstAutoTypeSourceInfoDependent(TypeSourceInfo *TypeWithAuto) { - return SubstituteDeducedTypeTransform(*this, DependentAuto{false}) + return SubstituteDeducedTypeTransform( + *this, DependentAuto{/*IsPack=*/isa( + TypeWithAuto->getType())}) .TransformType(TypeWithAuto); } diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index a72c95d6d77cf..f1c9c5c868159 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -616,29 +616,30 @@ Sema::InstantiatingTemplate::InstantiatingTemplate( Invalid = true; return; } - Invalid = CheckInstantiationDepth(PointOfInstantiation, InstantiationRange); + + CodeSynthesisContext Inst; + Inst.Kind = Kind; + Inst.PointOfInstantiation = PointOfInstantiation; + Inst.Entity = Entity; + Inst.Template = Template; + Inst.TemplateArgs = TemplateArgs.data(); + Inst.NumTemplateArgs = TemplateArgs.size(); + Inst.DeductionInfo = DeductionInfo; + Inst.InstantiationRange = InstantiationRange; + Inst.InConstraintSubstitution = + Inst.Kind == CodeSynthesisContext::ConstraintSubstitution; + if (!SemaRef.CodeSynthesisContexts.empty()) + Inst.InConstraintSubstitution |= + SemaRef.CodeSynthesisContexts.back().InConstraintSubstitution; + + Invalid = SemaRef.pushCodeSynthesisContext(Inst); if (!Invalid) { - CodeSynthesisContext Inst; - Inst.Kind = Kind; - Inst.PointOfInstantiation = PointOfInstantiation; - Inst.Entity = Entity; - Inst.Template = Template; - Inst.TemplateArgs = TemplateArgs.data(); - Inst.NumTemplateArgs = TemplateArgs.size(); - Inst.DeductionInfo = DeductionInfo; - Inst.InstantiationRange = InstantiationRange; - Inst.InConstraintSubstitution = - Inst.Kind == CodeSynthesisContext::ConstraintSubstitution; - if (!SemaRef.CodeSynthesisContexts.empty()) - Inst.InConstraintSubstitution |= - SemaRef.CodeSynthesisContexts.back().InConstraintSubstitution; - - SemaRef.pushCodeSynthesisContext(Inst); - - AlreadyInstantiating = !Inst.Entity ? false : - !SemaRef.InstantiatingSpecializations - .insert({Inst.Entity->getCanonicalDecl(), Inst.Kind}) - .second; + AlreadyInstantiating = + !Inst.Entity + ? 
false + : !SemaRef.InstantiatingSpecializations + .insert({Inst.Entity->getCanonicalDecl(), Inst.Kind}) + .second; atTemplateBegin(SemaRef.TemplateInstCallbacks, SemaRef, Inst); } } @@ -834,18 +835,34 @@ Sema::InstantiatingTemplate::InstantiatingTemplate( : InstantiatingTemplate(SemaRef, CodeSynthesisContext::PartialOrderingTTP, ArgLoc, InstantiationRange, PArg) {} -void Sema::pushCodeSynthesisContext(CodeSynthesisContext Ctx) { +bool Sema::pushCodeSynthesisContext(CodeSynthesisContext Ctx) { Ctx.SavedInNonInstantiationSFINAEContext = InNonInstantiationSFINAEContext; InNonInstantiationSFINAEContext = false; - CodeSynthesisContexts.push_back(Ctx); - - if (!Ctx.isInstantiationRecord()) + if (!Ctx.isInstantiationRecord()) { ++NonInstantiationEntries; + } else { + assert(SemaRef.NonInstantiationEntries <= + SemaRef.CodeSynthesisContexts.size()); + if ((SemaRef.CodeSynthesisContexts.size() - + SemaRef.NonInstantiationEntries) > + SemaRef.getLangOpts().InstantiationDepth) { + SemaRef.Diag(Ctx.PointOfInstantiation, + diag::err_template_recursion_depth_exceeded) + << SemaRef.getLangOpts().InstantiationDepth << Ctx.InstantiationRange; + SemaRef.Diag(Ctx.PointOfInstantiation, + diag::note_template_recursion_depth) + << SemaRef.getLangOpts().InstantiationDepth; + return true; + } + } + + CodeSynthesisContexts.push_back(Ctx); // Check to see if we're low on stack space. We can't do anything about this // from here, but we can at least warn the user. StackHandler.warnOnStackNearlyExhausted(Ctx.PointOfInstantiation); + return false; } void Sema::popCodeSynthesisContext() { @@ -907,25 +924,6 @@ static std::string convertCallArgsToString(Sema &S, return Result; } -bool Sema::InstantiatingTemplate::CheckInstantiationDepth( - SourceLocation PointOfInstantiation, - SourceRange InstantiationRange) { - assert(SemaRef.NonInstantiationEntries <= - SemaRef.CodeSynthesisContexts.size()); - if ((SemaRef.CodeSynthesisContexts.size() - - SemaRef.NonInstantiationEntries) - <= SemaRef.getLangOpts().InstantiationDepth) - return false; - - SemaRef.Diag(PointOfInstantiation, - diag::err_template_recursion_depth_exceeded) - << SemaRef.getLangOpts().InstantiationDepth - << InstantiationRange; - SemaRef.Diag(PointOfInstantiation, diag::note_template_recursion_depth) - << SemaRef.getLangOpts().InstantiationDepth; - return true; -} - void Sema::PrintInstantiationStack(InstantiationContextDiagFuncRef DiagFunc) { // Determine which template instantiations to skip, if any. 
unsigned SkipStart = CodeSynthesisContexts.size(), SkipEnd = SkipStart; @@ -1373,16 +1371,6 @@ std::optional Sema::isSFINAEContext() const { return std::nullopt; } -static TemplateArgument -getPackSubstitutedTemplateArgument(Sema &S, TemplateArgument Arg) { - assert(S.ArgPackSubstIndex); - assert(*S.ArgPackSubstIndex < Arg.pack_size()); - Arg = Arg.pack_begin()[*S.ArgPackSubstIndex]; - if (Arg.isPackExpansion()) - Arg = Arg.getPackExpansionPattern(); - return Arg; -} - //===----------------------------------------------------------------------===/ // Template Instantiation for Types //===----------------------------------------------------------------------===/ @@ -1449,13 +1437,6 @@ namespace { return TemplateArgs.getNewDepth(Depth); } - UnsignedOrNone getPackIndex(TemplateArgument Pack) { - UnsignedOrNone Index = getSema().ArgPackSubstIndex; - if (!Index) - return std::nullopt; - return Pack.pack_size() - 1 - *Index; - } - bool TryExpandParameterPacks(SourceLocation EllipsisLoc, SourceRange PatternRange, ArrayRef Unexpanded, @@ -1537,7 +1518,7 @@ namespace { if (TA.getKind() != TemplateArgument::Pack) return TA; if (SemaRef.ArgPackSubstIndex) - return getPackSubstitutedTemplateArgument(SemaRef, TA); + return SemaRef.getPackSubstitutedTemplateArgument(TA); assert(TA.pack_size() == 1 && TA.pack_begin()->isPackExpansion() && "unexpected pack arguments in template rewrite"); TemplateArgument Arg = *TA.pack_begin(); @@ -1643,10 +1624,6 @@ namespace { ExprResult TransformTemplateParmRefExpr(DeclRefExpr *E, NonTypeTemplateParmDecl *D); - ExprResult TransformSubstNonTypeTemplateParmPackExpr( - SubstNonTypeTemplateParmPackExpr *E); - ExprResult TransformSubstNonTypeTemplateParmExpr( - SubstNonTypeTemplateParmExpr *E); /// Rebuild a DeclRefExpr for a VarDecl reference. 
ExprResult RebuildVarDeclRefExpr(ValueDecl *PD, SourceLocation Loc); @@ -1933,12 +1910,6 @@ namespace { SmallVectorImpl &PTypes, SmallVectorImpl &TransParams, Sema::ExtParameterInfoBuilder &PInfos); - - private: - ExprResult - transformNonTypeTemplateParmRef(Decl *AssociatedDecl, const NamedDecl *parm, - SourceLocation loc, TemplateArgument arg, - UnsignedOrNone PackIndex, bool Final); }; } @@ -1975,7 +1946,7 @@ Decl *TemplateInstantiator::TransformDecl(SourceLocation Loc, Decl *D) { if (TTP->isParameterPack()) { assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); - Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); + Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg); } TemplateName Template = Arg.getAsTemplate(); @@ -2079,7 +2050,7 @@ TemplateInstantiator::TransformFirstQualifierInScope(NamedDecl *D, if (!getSema().ArgPackSubstIndex) return nullptr; - Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); + Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg); } QualType T = Arg.getAsType(); @@ -2165,8 +2136,8 @@ TemplateName TemplateInstantiator::TransformTemplateName( Arg, AssociatedDecl, TTP->getIndex(), Final); } - PackIndex = getPackIndex(Arg); - Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); + PackIndex = SemaRef.getPackIndex(Arg); + Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg); } TemplateName Template = Arg.getAsTemplate(); @@ -2183,10 +2154,10 @@ TemplateName TemplateInstantiator::TransformTemplateName( TemplateArgument Pack = SubstPack->getArgumentPack(); TemplateName Template = - getPackSubstitutedTemplateArgument(getSema(), Pack).getAsTemplate(); + SemaRef.getPackSubstitutedTemplateArgument(Pack).getAsTemplate(); return getSema().Context.getSubstTemplateTemplateParm( Template, SubstPack->getAssociatedDecl(), SubstPack->getIndex(), - getPackIndex(Pack), SubstPack->getFinal()); + SemaRef.getPackIndex(Pack), SubstPack->getFinal()); } return inherited::TransformTemplateName( @@ -2252,11 +2223,11 @@ TemplateInstantiator::TransformTemplateParmRefExpr(DeclRefExpr *E, ExprType, TargetType->isReferenceType() ? VK_LValue : VK_PRValue, E->getLocation(), Arg, AssociatedDecl, NTTP->getPosition(), Final); } - PackIndex = getPackIndex(Arg); - Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); + PackIndex = SemaRef.getPackIndex(Arg); + Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg); } - return transformNonTypeTemplateParmRef(AssociatedDecl, NTTP, E->getLocation(), - Arg, PackIndex, Final); + return SemaRef.BuildSubstNonTypeTemplateParmExpr( + AssociatedDecl, NTTP, E->getLocation(), Arg, PackIndex, Final); } const AnnotateAttr * @@ -2344,144 +2315,6 @@ TemplateInstantiator::TransformOpenACCRoutineDeclAttr( "applies to a Function Decl (and a few places for VarDecl)"); } -ExprResult TemplateInstantiator::transformNonTypeTemplateParmRef( - Decl *AssociatedDecl, const NamedDecl *parm, SourceLocation loc, - TemplateArgument arg, UnsignedOrNone PackIndex, bool Final) { - ExprResult result; - - // Determine the substituted parameter type. We can usually infer this from - // the template argument, but not always. 
- auto SubstParamType = [&] { - if (const auto *NTTP = dyn_cast(parm)) { - QualType T; - if (NTTP->isExpandedParameterPack()) - T = NTTP->getExpansionType(*SemaRef.ArgPackSubstIndex); - else - T = NTTP->getType(); - if (parm->isParameterPack() && isa(T)) - T = cast(T)->getPattern(); - return SemaRef.SubstType(T, TemplateArgs, loc, parm->getDeclName()); - } - return SemaRef.SubstType(arg.getAsExpr()->getType(), TemplateArgs, loc, - parm->getDeclName()); - }; - - bool refParam = false; - - // The template argument itself might be an expression, in which case we just - // return that expression. This happens when substituting into an alias - // template. - if (arg.getKind() == TemplateArgument::Expression) { - Expr *argExpr = arg.getAsExpr(); - result = argExpr; - if (argExpr->isLValue()) { - if (argExpr->getType()->isRecordType()) { - // Check whether the parameter was actually a reference. - QualType paramType = SubstParamType(); - if (paramType.isNull()) - return ExprError(); - refParam = paramType->isReferenceType(); - } else { - refParam = true; - } - } - } else if (arg.getKind() == TemplateArgument::Declaration || - arg.getKind() == TemplateArgument::NullPtr) { - if (arg.getKind() == TemplateArgument::Declaration) { - ValueDecl *VD = arg.getAsDecl(); - - // Find the instantiation of the template argument. This is - // required for nested templates. - VD = cast_or_null( - getSema().FindInstantiatedDecl(loc, VD, TemplateArgs)); - if (!VD) - return ExprError(); - } - - QualType paramType = arg.getNonTypeTemplateArgumentType(); - assert(!paramType.isNull() && "type substitution failed for param type"); - assert(!paramType->isDependentType() && "param type still dependent"); - result = SemaRef.BuildExpressionFromDeclTemplateArgument(arg, paramType, loc); - refParam = paramType->isReferenceType(); - } else { - QualType paramType = arg.getNonTypeTemplateArgumentType(); - result = SemaRef.BuildExpressionFromNonTypeTemplateArgument(arg, loc); - refParam = paramType->isReferenceType(); - assert(result.isInvalid() || - SemaRef.Context.hasSameType(result.get()->getType(), - paramType.getNonReferenceType())); - } - - if (result.isInvalid()) - return ExprError(); - - Expr *resultExpr = result.get(); - return new (SemaRef.Context) SubstNonTypeTemplateParmExpr( - resultExpr->getType(), resultExpr->getValueKind(), loc, resultExpr, - AssociatedDecl, - clang::getDepthAndIndex(const_cast(parm)).second, PackIndex, - refParam, Final); -} - -ExprResult -TemplateInstantiator::TransformSubstNonTypeTemplateParmPackExpr( - SubstNonTypeTemplateParmPackExpr *E) { - if (!getSema().ArgPackSubstIndex) { - // We aren't expanding the parameter pack, so just return ourselves. 
- return E; - } - - TemplateArgument Pack = E->getArgumentPack(); - TemplateArgument Arg = getPackSubstitutedTemplateArgument(getSema(), Pack); - return transformNonTypeTemplateParmRef( - E->getAssociatedDecl(), E->getParameterPack(), - E->getParameterPackLocation(), Arg, getPackIndex(Pack), E->getFinal()); -} - -ExprResult -TemplateInstantiator::TransformSubstNonTypeTemplateParmExpr( - SubstNonTypeTemplateParmExpr *E) { - ExprResult SubstReplacement = E->getReplacement(); - if (!isa(SubstReplacement.get())) - SubstReplacement = TransformExpr(E->getReplacement()); - if (SubstReplacement.isInvalid()) - return true; - QualType SubstType = TransformType(E->getParameterType(getSema().Context)); - if (SubstType.isNull()) - return true; - // The type may have been previously dependent and not now, which means we - // might have to implicit cast the argument to the new type, for example: - // template - // concept C = sizeof(U) == 4; - // void foo() requires C<2, 'a'> { } - // When normalizing foo(), we first form the normalized constraints of C: - // AtomicExpr(sizeof(U) == 4, - // U=SubstNonTypeTemplateParmExpr(Param=U, - // Expr=DeclRef(U), - // Type=decltype(T))) - // Then we substitute T = 2, U = 'a' into the parameter mapping, and need to - // produce: - // AtomicExpr(sizeof(U) == 4, - // U=SubstNonTypeTemplateParmExpr(Param=U, - // Expr=ImpCast( - // decltype(2), - // SubstNTTPE(Param=U, Expr='a', - // Type=char)), - // Type=decltype(2))) - // The call to CheckTemplateArgument here produces the ImpCast. - TemplateArgument SugaredConverted, CanonicalConverted; - if (SemaRef - .CheckTemplateArgument(E->getParameter(), SubstType, - SubstReplacement.get(), SugaredConverted, - CanonicalConverted, - /*StrictCheck=*/false, Sema::CTAK_Specified) - .isInvalid()) - return true; - return transformNonTypeTemplateParmRef( - E->getAssociatedDecl(), E->getParameter(), E->getExprLoc(), - SugaredConverted, E->getPackIndex(), E->getFinal()); -} - ExprResult TemplateInstantiator::RebuildVarDeclRefExpr(ValueDecl *PD, SourceLocation Loc) { DeclarationNameInfo NameInfo(PD->getDeclName(), Loc); @@ -2701,8 +2534,8 @@ TemplateInstantiator::TransformTemplateTypeParmType(TypeLocBuilder &TLB, } // PackIndex starts from last element. 
- PackIndex = getPackIndex(Arg); - Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); + PackIndex = SemaRef.getPackIndex(Arg); + Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg); } assert(Arg.getKind() == TemplateArgument::Type && @@ -2749,20 +2582,20 @@ QualType TemplateInstantiator::TransformSubstTemplateTypeParmPackType( } TemplateArgument Pack = T->getArgumentPack(); - TemplateArgument Arg = getPackSubstitutedTemplateArgument(getSema(), Pack); + TemplateArgument Arg = SemaRef.getPackSubstitutedTemplateArgument(Pack); return BuildSubstTemplateTypeParmType( TLB, SuppressObjCLifetime, T->getFinal(), NewReplaced, T->getIndex(), - getPackIndex(Pack), Arg, TL.getNameLoc()); + SemaRef.getPackIndex(Pack), Arg, TL.getNameLoc()); } QualType TemplateInstantiator::TransformSubstBuiltinTemplatePackType( TypeLocBuilder &TLB, SubstBuiltinTemplatePackTypeLoc TL) { if (!getSema().ArgPackSubstIndex) return TreeTransform::TransformSubstBuiltinTemplatePackType(TLB, TL); - auto &Sema = getSema(); - TemplateArgument Result = getPackSubstitutedTemplateArgument( - Sema, TL.getTypePtr()->getArgumentPack()); - TLB.pushTrivial(Sema.getASTContext(), Result.getAsType(), TL.getBeginLoc()); + TemplateArgument Result = SemaRef.getPackSubstitutedTemplateArgument( + TL.getTypePtr()->getArgumentPack()); + TLB.pushTrivial(SemaRef.getASTContext(), Result.getAsType(), + TL.getBeginLoc()); return Result.getAsType(); } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index adac3dff5b2b4..e2dc70360506e 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -3742,7 +3742,7 @@ TemplateDeclInstantiator::VisitTemplateTemplateParmDecl( ExpandedParams.reserve(D->getNumExpansionTemplateParameters()); for (unsigned I = 0, N = D->getNumExpansionTemplateParameters(); I != N; ++I) { - LocalInstantiationScope Scope(SemaRef); + LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); TemplateParameterList *Expansion = SubstTemplateParams(D->getExpansionTemplateParameters(I)); if (!Expansion) @@ -3774,7 +3774,7 @@ TemplateDeclInstantiator::VisitTemplateTemplateParmDecl( if (Expand) { for (unsigned I = 0; I != *NumExpansions; ++I) { Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, I); - LocalInstantiationScope Scope(SemaRef); + LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); TemplateParameterList *Expansion = SubstTemplateParams(TempParams); if (!Expansion) return nullptr; @@ -3785,21 +3785,18 @@ TemplateDeclInstantiator::VisitTemplateTemplateParmDecl( // expanded parameter pack is the original expansion type, but callers // will end up using the expanded parameter pack types for type-checking. IsExpandedParameterPack = true; - InstParams = TempParams; - } else { - // We cannot fully expand the pack expansion now, so just substitute - // into the pattern. - Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, std::nullopt); - - LocalInstantiationScope Scope(SemaRef); - InstParams = SubstTemplateParams(TempParams); - if (!InstParams) - return nullptr; } + + Sema::ArgPackSubstIndexRAII SubstIndex(SemaRef, std::nullopt); + + LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); + InstParams = SubstTemplateParams(TempParams); + if (!InstParams) + return nullptr; } else { // Perform the actual substitution of template parameters within a new, // local instantiation scope. 
- LocalInstantiationScope Scope(SemaRef); + LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); InstParams = SubstTemplateParams(TempParams); if (!InstParams) return nullptr; diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index d723fb80f437e..bee613aa5f1c5 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -2270,7 +2270,10 @@ QualType Sema::BuildArrayType(QualType T, ArraySizeModifier ASM, : ConstVal.getActiveBits(); if (ActiveSizeBits > ConstantArrayType::getMaxSizeBits(Context)) { Diag(ArraySize->getBeginLoc(), diag::err_array_too_large) - << toString(ConstVal, 10) << ArraySize->getSourceRange(); + << toString(ConstVal, 10, ConstVal.isSigned(), + /*formatAsCLiteral=*/false, /*UpperCase=*/false, + /*InsertSeparators=*/true) + << ArraySize->getSourceRange(); return QualType(); } diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp index 1ca769ebb50f0..c2427dcf52538 100644 --- a/clang/lib/Sema/SemaTypeTraits.cpp +++ b/clang/lib/Sema/SemaTypeTraits.cpp @@ -2014,6 +2014,7 @@ static std::optional StdNameToTypeTrait(StringRef Name) { .Case("is_aggregate", TypeTrait::UTT_IsAggregate) .Case("is_constructible", TypeTrait::TT_IsConstructible) .Case("is_final", TypeTrait::UTT_IsFinal) + .Case("is_abstract", TypeTrait::UTT_IsAbstract) .Default(std::nullopt); } @@ -2774,6 +2775,75 @@ static void DiagnoseNonAggregateReason(Sema &SemaRef, SourceLocation Loc, DiagnoseNonAggregateReason(SemaRef, Loc, D); } +static void DiagnoseNonAbstractReason(Sema &SemaRef, SourceLocation Loc, + const CXXRecordDecl *D) { + // If this type has any abstract base classes, their respective virtual + // functions must have been overridden. + for (const CXXBaseSpecifier &B : D->bases()) { + if (B.getType()->castAsCXXRecordDecl()->isAbstract()) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::OverridesAllPureVirtual + << B.getType() << B.getSourceRange(); + } + } +} + +static void DiagnoseNonAbstractReason(Sema &SemaRef, SourceLocation Loc, + QualType T) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait) + << T << diag::TraitName::Abstract; + + if (T->isReferenceType()) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::Ref; + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::NotStructOrClass; + return; + } + + if (T->isUnionType()) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::UnionType; + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::NotStructOrClass; + return; + } + + if (SemaRef.Context.getAsArrayType(T)) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::ArrayType; + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::NotStructOrClass; + return; + } + + if (T->isFunctionType()) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::FunctionType; + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::NotStructOrClass; + return; + } + + if (T->isPointerType()) { + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::PointerType; + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::NotStructOrClass; + return; + } + + if (!T->isStructureOrClassType()) { + 
SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::NotStructOrClass; + return; + } + + const CXXRecordDecl *D = T->getAsCXXRecordDecl(); + if (D->hasDefinition()) + DiagnoseNonAbstractReason(SemaRef, Loc, D); +} + void Sema::DiagnoseTypeTraitDetails(const Expr *E) { E = E->IgnoreParenImpCasts(); if (E->containsErrors()) @@ -2818,6 +2888,9 @@ void Sema::DiagnoseTypeTraitDetails(const Expr *E) { DiagnoseIsFinalReason(*this, E->getBeginLoc(), QT); // unsatisfied break; } + case UTT_IsAbstract: + DiagnoseNonAbstractReason(*this, E->getBeginLoc(), Args[0]); + break; default: break; } diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 242ffb09af006..6967301483361 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -1783,6 +1783,14 @@ class TreeTransform { LParenLoc, EndLoc); } + OMPClause * + RebuildOMPLoopRangeClause(Expr *First, Expr *Count, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation FirstLoc, + SourceLocation CountLoc, SourceLocation EndLoc) { + return getSema().OpenMP().ActOnOpenMPLoopRangeClause( + First, Count, StartLoc, LParenLoc, FirstLoc, CountLoc, EndLoc); + } + /// Build a new OpenMP 'allocator' clause. /// /// By default, performs semantic analysis to build the new OpenMP clause. @@ -9607,6 +9615,17 @@ StmtResult TreeTransform::TransformOMPInterchangeDirective( return Res; } +template +StmtResult +TreeTransform::TransformOMPFuseDirective(OMPFuseDirective *D) { + DeclarationNameInfo DirName; + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + D->getDirectiveKind(), DirName, nullptr, D->getBeginLoc()); + StmtResult Res = getDerived().TransformOMPExecutableDirective(D); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); + return Res; +} + template StmtResult TreeTransform::TransformOMPForDirective(OMPForDirective *D) { @@ -10500,6 +10519,31 @@ TreeTransform::TransformOMPPartialClause(OMPPartialClause *C) { C->getEndLoc()); } +template +OMPClause * +TreeTransform::TransformOMPLoopRangeClause(OMPLoopRangeClause *C) { + ExprResult F = getDerived().TransformExpr(C->getFirst()); + if (F.isInvalid()) + return nullptr; + + ExprResult Cn = getDerived().TransformExpr(C->getCount()); + if (Cn.isInvalid()) + return nullptr; + + Expr *First = F.get(); + Expr *Count = Cn.get(); + + bool Changed = (First != C->getFirst()) || (Count != C->getCount()); + + // If no changes and AlwaysRebuild() is false, return the original clause + if (!Changed && !getDerived().AlwaysRebuild()) + return C; + + return RebuildOMPLoopRangeClause(First, Count, C->getBeginLoc(), + C->getLParenLoc(), C->getFirstLoc(), + C->getCountLoc(), C->getEndLoc()); +} + template OMPClause * TreeTransform::TransformOMPCollapseClause(OMPCollapseClause *C) { @@ -16289,20 +16333,68 @@ TreeTransform::TransformPackIndexingExpr(PackIndexingExpr *E) { IndexExpr.get(), ExpandedExprs, FullySubstituted); } -template -ExprResult -TreeTransform::TransformSubstNonTypeTemplateParmPackExpr( - SubstNonTypeTemplateParmPackExpr *E) { - // Default behavior is to do nothing with this transformation. - return E; +template +ExprResult TreeTransform::TransformSubstNonTypeTemplateParmPackExpr( + SubstNonTypeTemplateParmPackExpr *E) { + if (!getSema().ArgPackSubstIndex) + // We aren't expanding the parameter pack, so just return ourselves. 
+ return E; + + TemplateArgument Pack = E->getArgumentPack(); + TemplateArgument Arg = SemaRef.getPackSubstitutedTemplateArgument(Pack); + return SemaRef.BuildSubstNonTypeTemplateParmExpr( + E->getAssociatedDecl(), E->getParameterPack(), + E->getParameterPackLocation(), Arg, SemaRef.getPackIndex(Pack), + E->getFinal()); } -template -ExprResult -TreeTransform::TransformSubstNonTypeTemplateParmExpr( - SubstNonTypeTemplateParmExpr *E) { - // Default behavior is to do nothing with this transformation. - return E; +template +ExprResult TreeTransform::TransformSubstNonTypeTemplateParmExpr( + SubstNonTypeTemplateParmExpr *E) { + Expr *OrigReplacement = E->getReplacement()->IgnoreImplicitAsWritten(); + ExprResult Replacement = getDerived().TransformExpr(OrigReplacement); + if (Replacement.isInvalid()) + return true; + + Decl *AssociatedDecl = + getDerived().TransformDecl(E->getNameLoc(), E->getAssociatedDecl()); + if (!AssociatedDecl) + return true; + + if (Replacement.get() == OrigReplacement && + AssociatedDecl == E->getAssociatedDecl()) + return E; + + // If the replacement expression did not change, and the parameter type + // did not change, we can skip the semantic action because it would + // produce the same result anyway. + auto *Param = cast( + getReplacedTemplateParameterList(AssociatedDecl) + ->asArray()[E->getIndex()]); + if (QualType ParamType = Param->getType(); + !SemaRef.Context.hasSameType(ParamType, E->getParameter()->getType()) || + Replacement.get() != OrigReplacement) { + + // When transforming the replacement expression previously, all Sema + // specific annotations, such as implicit casts, are discarded. Calling the + // corresponding sema action is necessary to recover those. Otherwise, + // equivalency of the result would be lost. + TemplateArgument SugaredConverted, CanonicalConverted; + Replacement = SemaRef.CheckTemplateArgument( + Param, ParamType, Replacement.get(), SugaredConverted, + CanonicalConverted, + /*StrictCheck=*/false, Sema::CTAK_Specified); + if (Replacement.isInvalid()) + return true; + } else { + // Otherwise, the same expression would have been produced. 
+ Replacement = E->getReplacement(); + } + + return new (SemaRef.Context) SubstNonTypeTemplateParmExpr( + Replacement.get()->getType(), Replacement.get()->getValueKind(), + E->getNameLoc(), Replacement.get(), AssociatedDecl, E->getIndex(), + E->getPackIndex(), E->isReferenceParameter(), E->getFinal()); } template diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 9ee8a0fb0f060..c05e428a6fb39 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -11215,6 +11215,9 @@ OMPClause *OMPClauseReader::readClause() { case llvm::omp::OMPC_partial: C = OMPPartialClause::CreateEmpty(Context); break; + case llvm::omp::OMPC_looprange: + C = OMPLoopRangeClause::CreateEmpty(Context); + break; case llvm::omp::OMPC_allocator: C = new (Context) OMPAllocatorClause(); break; @@ -11618,6 +11621,14 @@ void OMPClauseReader::VisitOMPPartialClause(OMPPartialClause *C) { C->setLParenLoc(Record.readSourceLocation()); } +void OMPClauseReader::VisitOMPLoopRangeClause(OMPLoopRangeClause *C) { + C->setFirst(Record.readSubExpr()); + C->setCount(Record.readSubExpr()); + C->setLParenLoc(Record.readSourceLocation()); + C->setFirstLoc(Record.readSourceLocation()); + C->setCountLoc(Record.readSourceLocation()); +} + void OMPClauseReader::VisitOMPAllocatorClause(OMPAllocatorClause *C) { C->setAllocator(Record.readExpr()); C->setLParenLoc(Record.readSourceLocation()); diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index 213c2c2148f64..70b898a53fcbd 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2469,10 +2469,21 @@ void ASTStmtReader::VisitOMPReverseDirective(OMPReverseDirective *D) { VisitOMPCanonicalLoopNestTransformationDirective(D); } +void ASTStmtReader::VisitOMPCanonicalLoopSequenceTransformationDirective( + OMPCanonicalLoopSequenceTransformationDirective *D) { + VisitStmt(D); + VisitOMPExecutableDirective(D); + D->setNumGeneratedTopLevelLoops(Record.readUInt32()); +} + void ASTStmtReader::VisitOMPInterchangeDirective(OMPInterchangeDirective *D) { VisitOMPCanonicalLoopNestTransformationDirective(D); } +void ASTStmtReader::VisitOMPFuseDirective(OMPFuseDirective *D) { + VisitOMPCanonicalLoopSequenceTransformationDirective(D); +} + void ASTStmtReader::VisitOMPForDirective(OMPForDirective *D) { VisitOMPLoopDirective(D); D->setHasCancel(Record.readBool()); @@ -3615,6 +3626,12 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { break; } + case STMT_OMP_FUSE_DIRECTIVE: { + unsigned NumClauses = Record[ASTStmtReader::NumStmtFields]; + S = OMPFuseDirective::CreateEmpty(Context, NumClauses); + break; + } + case STMT_OMP_INTERCHANGE_DIRECTIVE: { unsigned NumLoops = Record[ASTStmtReader::NumStmtFields]; unsigned NumClauses = Record[ASTStmtReader::NumStmtFields + 1]; diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 09859da171fcd..cdf95ba1c4ba5 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -7882,6 +7882,14 @@ void OMPClauseWriter::VisitOMPPartialClause(OMPPartialClause *C) { Record.AddSourceLocation(C->getLParenLoc()); } +void OMPClauseWriter::VisitOMPLoopRangeClause(OMPLoopRangeClause *C) { + Record.AddStmt(C->getFirst()); + Record.AddStmt(C->getCount()); + Record.AddSourceLocation(C->getLParenLoc()); + Record.AddSourceLocation(C->getFirstLoc()); + Record.AddSourceLocation(C->getCountLoc()); +} + void 
OMPClauseWriter::VisitOMPAllocatorClause(OMPAllocatorClause *C) { Record.AddStmt(C->getAllocator()); Record.AddSourceLocation(C->getLParenLoc()); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 21c04ddbc2c7a..ebda91e3819c3 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2487,6 +2487,18 @@ void ASTStmtWriter::VisitOMPInterchangeDirective(OMPInterchangeDirective *D) { Code = serialization::STMT_OMP_INTERCHANGE_DIRECTIVE; } +void ASTStmtWriter::VisitOMPCanonicalLoopSequenceTransformationDirective( + OMPCanonicalLoopSequenceTransformationDirective *D) { + VisitStmt(D); + VisitOMPExecutableDirective(D); + Record.writeUInt32(D->getNumGeneratedTopLevelLoops()); +} + +void ASTStmtWriter::VisitOMPFuseDirective(OMPFuseDirective *D) { + VisitOMPCanonicalLoopSequenceTransformationDirective(D); + Code = serialization::STMT_OMP_FUSE_DIRECTIVE; +} + void ASTStmtWriter::VisitOMPForDirective(OMPForDirective *D) { VisitOMPLoopDirective(D); Record.writeBool(D->hasCancel()); diff --git a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp index 36f316df0c3ff..0ae784c000f60 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/CStringChecker.cpp @@ -672,6 +672,10 @@ ProgramStateRef CStringChecker::CheckOverlap(CheckerContext &C, ProgramStateRef stateTrue, stateFalse; + if (!First.Expression->getType()->isAnyPointerType() || + !Second.Expression->getType()->isAnyPointerType()) + return state; + // Assume different address spaces cannot overlap. if (First.Expression->getType()->getPointeeType().getAddressSpace() != Second.Expression->getType()->getPointeeType().getAddressSpace()) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp index df13de158a646..9585ceb40f95e 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp @@ -224,6 +224,8 @@ class RawPtrRefCallArgsChecker // foo(123) return true; } + if (isa(ArgOrigin)) + return true; if (isa(ArgOrigin)) return true; if (isASafeCallArg(ArgOrigin)) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp index a97a37f85e96c..15a0c5a7fd9dc 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp @@ -130,17 +130,16 @@ class RawPtrRefMemberChecker if (BR->getSourceManager().isInSystemHeader(CD->getLocation())) return; - ObjCContainerDecl::PropertyMap map; - CD->collectPropertiesToImplement(map); - for (auto it : map) - visitObjCPropertyDecl(CD, it.second); - - if (auto *ID = dyn_cast(CD)) { - for (auto *Ivar : ID->ivars()) - visitIvarDecl(CD, Ivar); - return; - } if (auto *ID = dyn_cast(CD)) { + ObjCContainerDecl::PropertyMap map; + CD->collectPropertiesToImplement(map); + for (auto it : map) + visitObjCPropertyDecl(CD, it.second); + + if (auto *Interface = ID->getClassInterface()) { + for (auto *Ivar : Interface->ivars()) + visitIvarDecl(CD, Ivar); + } for (auto *PropImpl : ID->property_impls()) visitPropImpl(CD, PropImpl); for (auto *Ivar : ID->ivars()) diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp 
index 785cdfa15bf04..4e472b7fc38b0 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1814,6 +1814,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::OMPStripeDirectiveClass: case Stmt::OMPTileDirectiveClass: case Stmt::OMPInterchangeDirectiveClass: + case Stmt::OMPFuseDirectiveClass: case Stmt::OMPInteropDirectiveClass: case Stmt::OMPDispatchDirectiveClass: case Stmt::OMPMaskedDirectiveClass: diff --git a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp index 4c9c619f2487a..217b853305ed1 100644 --- a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp +++ b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "HTMLDiagnostics.h" #include "PlistDiagnostics.h" #include "SarifDiagnostics.h" #include "clang/AST/Decl.h" @@ -82,7 +83,7 @@ class HTMLDiagnostics : public PathDiagnosticConsumer { void FlushDiagnosticsImpl(std::vector &Diags, FilesMade *filesMade) override; - StringRef getName() const override { return "HTMLDiagnostics"; } + StringRef getName() const override { return HTML_DIAGNOSTICS_NAME; } bool supportsCrossFileDiagnostics() const override { return SupportsCrossFileDiagnostics; @@ -254,18 +255,6 @@ void HTMLDiagnostics::FlushDiagnosticsImpl( ReportDiag(*Diag, filesMade); } -static llvm::SmallString<32> getIssueHash(const PathDiagnostic &D, - const Preprocessor &PP) { - SourceManager &SMgr = PP.getSourceManager(); - PathDiagnosticLocation UPDLoc = D.getUniqueingLoc(); - FullSourceLoc L(SMgr.getExpansionLoc(UPDLoc.isValid() - ? UPDLoc.asLocation() - : D.getLocation().asLocation()), - SMgr); - return getIssueHash(L, D.getCheckerName(), D.getBugType(), - D.getDeclWithIssue(), PP.getLangOpts()); -} - void HTMLDiagnostics::ReportDiag(const PathDiagnostic& D, FilesMade *filesMade) { // Create the HTML directory if it is missing. @@ -310,7 +299,8 @@ void HTMLDiagnostics::ReportDiag(const PathDiagnostic& D, } } - SmallString<32> IssueHash = getIssueHash(D, PP); + SmallString<32> IssueHash = + D.getIssueHash(PP.getSourceManager(), PP.getLangOpts()); auto [It, IsNew] = EmittedHashes.insert(IssueHash); if (!IsNew) { // We've already emitted a duplicate issue. It'll get overwritten anyway. @@ -369,6 +359,12 @@ void HTMLDiagnostics::ReportDiag(const PathDiagnostic& D, if (EC != llvm::errc::file_exists) { llvm::errs() << "warning: could not create file in '" << Directory << "': " << EC.message() << '\n'; + } else if (filesMade) { + // Record that we created the file so that it gets referenced in the + // plist and SARIF reports for every translation unit that found the + // issue. 
+ filesMade->addDiagnostic(D, getName(), + llvm::sys::path::filename(ResultPath)); } return; } @@ -679,8 +675,8 @@ void HTMLDiagnostics::FinalizeHTML(const PathDiagnostic &D, Rewriter &R, os << "\n\n"; - os << "\n\n"; + os << "\n\n"; os << "\n should use MoveOnlyMulti(MoveOnlyMulti&&) + e1 = {std::move(t)}; + assert(e1.value().used_move1); + } + } + return true; } diff --git a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.u.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.u.pass.cpp index 13c0da27bc533..fe664dfc97cfe 100644 --- a/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.u.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.expected/ctor/ctor.u.pass.cpp @@ -80,6 +80,17 @@ struct CopyOnly { friend constexpr bool operator==(const CopyOnly& mi, int ii) { return mi.i == ii; } }; +struct MoveOnly2 { + int j; + bool used_move1 = false; + bool used_move2 = false; + + constexpr explicit MoveOnly2(int jj) : j(jj) {} + constexpr MoveOnly2(const MoveOnly2&) = delete; + constexpr MoveOnly2(MoveOnly2&& m) : j(m.j), used_move1(true) {} + constexpr MoveOnly2(const MoveOnly2&& m) : j(m.j), used_move2(true) {} +}; + struct BaseError {}; struct DerivedError : BaseError {}; @@ -164,6 +175,22 @@ constexpr bool test() { assert(e2.has_value()); assert(!e2.value()); // yes, e2 holds "false" since LWG3836 } + + // Check move constructor selection + { + MoveOnly2 t{1}; + std::expected e1(std::move(t)); + assert(e1.has_value()); + assert(e1.value().used_move1 == true); + assert(e1.value().j == 1); + } + { + const MoveOnly2 t2{2}; + std::expected e1(std::move(t2)); + assert(e1.has_value()); + assert(e1.value().used_move2 == true); + assert(e1.value().j == 2); + } return true; } diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp index a68f3f40e3647..5264e7700e3d9 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp @@ -17,9 +17,12 @@ #include #include +#include +#include #include #include #include +#include #include "test_macros.h" #include "type_algorithms.h" @@ -216,8 +219,16 @@ constexpr bool test() { // C++ standard library types + // These types are guaranteed to be implicit-lifetime. + test_is_implicit_lifetime>(); + test_is_implicit_lifetime>(); + test_is_implicit_lifetime>(); + +#ifdef _LIBCPP_VERSION + // These types should be implicit-lifetime, but they are not guaranteed to be so. 
test_is_implicit_lifetime>(); test_is_implicit_lifetime>(); +#endif // Standard C23 types diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/U.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/U.pass.cpp index a5ee602ab7bce..a90fecfd075fe 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/U.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.ctor/U.pass.cpp @@ -59,7 +59,8 @@ constexpr bool explicit_conversion(Input&& in, const Expect& v) static_assert(!std::is_constructible::value, ""); static_assert(!std::is_constructible::value, ""); optional opt(std::forward(in)); - return opt && *opt == static_cast(v); + optional opt2{std::forward(in)}; + return opt && *opt == static_cast(v) && (opt2 && *opt2 == static_cast(v)); } void test_implicit() @@ -83,6 +84,11 @@ void test_implicit() using T = TestTypes::TestType; assert(implicit_conversion(3, T(3))); } + { + using T = TestTypes::TestType; + optional opt({3}); + assert(opt && *opt == static_cast(3)); + } { using O = optional; static_assert(!test_convertible(), ""); diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or.pass.cpp index 4f9b6993c6f4f..8c063ae1a799c 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or.pass.cpp @@ -40,6 +40,12 @@ struct X {return x.i_ == y.i_;} }; +struct Z { + int i_, j_; + constexpr Z(int i, int j) : i_(i), j_(j) {} + friend constexpr bool operator==(const Z& z1, const Z& z2) { return z1.i_ == z2.i_ && z1.j_ == z2.j_; } +}; + constexpr int test() { { @@ -64,6 +70,16 @@ constexpr int test() assert(std::move(opt).value_or(Y(3)) == 4); assert(!opt); } + { + optional opt; + assert(std::move(opt).value_or({Y(3)}) == 4); + assert(!opt); + } + { + optional opt; + assert((std::move(opt).value_or({2, 3}) == Z{2, 3})); + assert(!opt); + } return 0; } diff --git a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or_const.pass.cpp b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or_const.pass.cpp index cf782f1137876..ec42890a3b995 100644 --- a/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or_const.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.object/optional.object.observe/value_or_const.pass.cpp @@ -75,6 +75,10 @@ int main(int, char**) const optional opt; assert(opt.value_or(Y(3)) == 4); } + { + const optional opt; + assert(opt.value_or({Y(3)}) == 4); + } return 0; } diff --git a/libcxx/utils/benchmark-historical b/libcxx/utils/benchmark-historical index 4d8d65b83c170..c1f9d11a6e800 100755 --- a/libcxx/utils/benchmark-historical +++ b/libcxx/utils/benchmark-historical @@ -43,9 +43,11 @@ def main(argv): parser.add_argument('--commit-list', type=argparse.FileType('r'), default=sys.stdin, help='Path to a file containing a whitespace separated list of commits to test. ' 'By default, this is read from standard input.') - parser.add_argument('--overwrite', action='store_true', - help='When the data for a commit already exists in the output directory, the tool normally skips it. 
' - 'This option instructs the tool to generate the data and overwrite it in the output directory.') + parser.add_argument('--existing', type=str, choices=['skip', 'overwrite', 'append'], default='skip', + help='This option instructs what to do when data for a commit already exists in the output directory. ' + 'Selecting "skip" instructs the tool to skip generating data for a commit that already has data, ' + '"overwrite" will overwrite the existing data with the newly-generated one, and "append" will ' + 'append the new data to the existing one. By default, the tool uses "skip".') parser.add_argument('lit_options', nargs=argparse.REMAINDER, help='Optional arguments passed to lit when running the tests. Should be provided last and ' 'separated from other arguments with a `--`.') @@ -70,14 +72,11 @@ def main(argv): commit = resolve_commit(args.git_repo, commit) # resolve e.g. HEAD to a real SHA output_file = args.output / (commit + '.lnt') - if output_file.exists(): - if args.overwrite: - logging.info(f'Will overwrite data for commit {commit} in {output_file}') - else: - logging.info(f'Data for commit {commit} already exists in {output_file}, skipping') - continue + if output_file.exists() and args.existing == 'skip': + logging.info(f'Skipping {commit} which already has data in {output_file}') + continue else: - logging.info(f'Benchmarking commit {commit}') + logging.info(f'Benchmarking {commit}') with tempfile.TemporaryDirectory() as build_dir: test_cmd = [PARENT_DIR / 'test-at-commit', '--git-repo', args.git_repo, @@ -92,8 +91,15 @@ def main(argv): subprocess.call(test_cmd) output_file.parent.mkdir(parents=True, exist_ok=True) - consolidate_cmd = [(PARENT_DIR / 'consolidate-benchmarks'), build_dir, '--output', output_file] - subprocess.check_call(consolidate_cmd) + mode = 'a' if args.existing == 'append' else 'w' + if output_file.exists() and args.existing == 'append': + logging.info(f'Appending to existing data for {commit}') + elif output_file.exists() and args.existing == 'overwrite': + logging.info(f'Overwriting existing data for {commit}') + else: + logging.info(f'Writing data for {commit}') + with open(output_file, mode) as out: + subprocess.check_call([(PARENT_DIR / 'consolidate-benchmarks'), build_dir], stdout=out) if __name__ == '__main__': main(sys.argv[1:]) diff --git a/libcxx/utils/compare-benchmarks b/libcxx/utils/compare-benchmarks index c56f5581b0ae7..18a448ab434c7 100755 --- a/libcxx/utils/compare-benchmarks +++ b/libcxx/utils/compare-benchmarks @@ -1,32 +1,40 @@ #!/usr/bin/env python3 import argparse +import functools +import pathlib import re import statistics import sys +import tempfile -import plotly +import numpy +import pandas +import plotly.express import tabulate -def parse_lnt(lines): +def parse_lnt(lines, aggregate=statistics.median): """ - Parse lines in LNT format and return a dictionnary of the form: + Parse lines in LNT format and return a list of dictionnaries of the form: - { - 'benchmark1': { - 'metric1': [float], - 'metric2': [float], + [ + { + 'benchmark': , + : float, + : float, ... }, - 'benchmark2': { - 'metric1': [float], - 'metric2': [float], + { + 'benchmark': , + : float, + : float, ... }, ... - } + ] - Each metric may have multiple values. + If a metric has multiple values associated to it, they are aggregated into a single + value using the provided aggregation function. 
""" results = {} for line in lines: @@ -35,90 +43,123 @@ def parse_lnt(lines): continue (identifier, value) = line.split(' ') - (name, metric) = identifier.split('.') - if name not in results: - results[name] = {} - if metric not in results[name]: - results[name][metric] = [] - results[name][metric].append(float(value)) - return results - -def plain_text_comparison(benchmarks, baseline, candidate): + (benchmark, metric) = identifier.split('.') + if benchmark not in results: + results[benchmark] = {'benchmark': benchmark} + + entry = results[benchmark] + if metric not in entry: + entry[metric] = [] + entry[metric].append(float(value)) + + for (bm, entry) in results.items(): + for metric in entry: + if isinstance(entry[metric], list): + entry[metric] = aggregate(entry[metric]) + + return list(results.values()) + +def plain_text_comparison(data, metric, baseline_name=None, candidate_name=None): """ - Create a tabulated comparison of the baseline and the candidate. + Create a tabulated comparison of the baseline and the candidate for the given metric. """ - headers = ['Benchmark', 'Baseline', 'Candidate', 'Difference', '% Difference'] + # Compute additional info in new columns. In text mode, we can assume that we are + # comparing exactly two data sets (suffixed _0 and _1). + data['difference'] = data[f'{metric}_1'] - data[f'{metric}_0'] + data['percent'] = 100 * (data['difference'] / data[f'{metric}_0']) + + data = data.replace(numpy.nan, None).sort_values(by='benchmark') # avoid NaNs in tabulate output + headers = ['Benchmark', baseline_name, candidate_name, 'Difference', '% Difference'] fmt = (None, '.2f', '.2f', '.2f', '.2f') - table = [] - for (bm, base, cand) in zip(benchmarks, baseline, candidate): - diff = (cand - base) if base and cand else None - percent = 100 * (diff / base) if base and cand else None - row = [bm, base, cand, diff, percent] - table.append(row) + table = data[['benchmark', f'{metric}_0', f'{metric}_1', 'difference', 'percent']].set_index('benchmark') return tabulate.tabulate(table, headers=headers, floatfmt=fmt, numalign='right') -def create_chart(benchmarks, baseline, candidate): +def create_chart(data, metric, subtitle=None, series_names=None): """ - Create a bar chart comparing 'baseline' and 'candidate'. + Create a bar chart comparing the given metric across the provided series. """ - figure = plotly.graph_objects.Figure() - figure.add_trace(plotly.graph_objects.Bar(x=benchmarks, y=baseline, name='Baseline')) - figure.add_trace(plotly.graph_objects.Bar(x=benchmarks, y=candidate, name='Candidate')) + data = data.sort_values(by='benchmark').rename(columns={f'{metric}_{i}': series_names[i] for i in range(len(series_names))}) + title = ' vs '.join(series_names) + figure = plotly.express.bar(data, title=title, subtitle=subtitle, x='benchmark', y=series_names, barmode='group') + figure.update_layout(xaxis_title='', yaxis_title='', legend_title='') return figure -def prepare_series(baseline, candidate, metric, aggregate=statistics.median): - """ - Prepare the data for being formatted or displayed as a chart. - - Metrics that have more than one value are aggregated using the given aggregation function. 
- """ - all_benchmarks = sorted(list(set(baseline.keys()) | set(candidate.keys()))) - baseline_series = [] - candidate_series = [] - for bm in all_benchmarks: - baseline_series.append(aggregate(baseline[bm][metric]) if bm in baseline and metric in baseline[bm] else None) - candidate_series.append(aggregate(candidate[bm][metric]) if bm in candidate and metric in candidate[bm] else None) - return (all_benchmarks, baseline_series, candidate_series) - def main(argv): parser = argparse.ArgumentParser( prog='compare-benchmarks', - description='Compare the results of two sets of benchmarks in LNT format.', - epilog='This script requires the `tabulate` and the `plotly` Python modules.') - parser.add_argument('baseline', type=argparse.FileType('r'), - help='Path to a LNT format file containing the benchmark results for the baseline.') - parser.add_argument('candidate', type=argparse.FileType('r'), - help='Path to a LNT format file containing the benchmark results for the candidate.') - parser.add_argument('--output', '-o', type=argparse.FileType('w'), default=sys.stdout, - help='Path of a file where to output the resulting comparison. Default to stdout.') + description='Compare the results of multiple sets of benchmarks in LNT format.', + epilog='This script depends on the modules listed in `libcxx/utils/requirements.txt`.') + parser.add_argument('files', type=argparse.FileType('r'), nargs='+', + help='Path to LNT format files containing the benchmark results to compare. In the text format, ' + 'exactly two files must be compared.') + parser.add_argument('--output', '-o', type=pathlib.Path, required=False, + help='Path of a file where to output the resulting comparison. If the output format is `text`, ' + 'default to stdout. If the output format is `chart`, default to a temporary file which is ' + 'opened automatically once generated, but not removed after creation.') parser.add_argument('--metric', type=str, default='execution_time', help='The metric to compare. LNT data may contain multiple metrics (e.g. code size, execution time, etc) -- ' - 'this option allows selecting which metric is being analyzed. The default is "execution_time".') + 'this option allows selecting which metric is being analyzed. The default is `execution_time`.') parser.add_argument('--filter', type=str, required=False, help='An optional regular expression used to filter the benchmarks included in the comparison. ' 'Only benchmarks whose names match the regular expression will be included.') parser.add_argument('--format', type=str, choices=['text', 'chart'], default='text', - help='Select the output format. "text" generates a plain-text comparison in tabular form, and "chart" ' - 'generates a self-contained HTML graph that can be opened in a browser. The default is text.') + help='Select the output format. `text` generates a plain-text comparison in tabular form, and `chart` ' + 'generates a self-contained HTML graph that can be opened in a browser. The default is `text`.') + parser.add_argument('--open', action='store_true', + help='Whether to automatically open the generated HTML file when finished. This option only makes sense ' + 'when the output format is `chart`.') + parser.add_argument('--series-names', type=str, required=False, + help='Optional comma-delimited list of names to use for the various series. 
By default, we use ' + 'Baseline and Candidate for two input files, and CandidateN for subsequent inputs.') + parser.add_argument('--subtitle', type=str, required=False, + help='Optional subtitle to use for the chart. This can be used to help identify the contents of the chart. ' + 'This option cannot be used with the plain text output.') args = parser.parse_args(argv) - baseline = parse_lnt(args.baseline.readlines()) - candidate = parse_lnt(args.candidate.readlines()) + if args.format == 'text': + if len(args.files) != 2: + parser.error('--format=text requires exactly two input files to compare') + if args.subtitle is not None: + parser.error('Passing --subtitle makes no sense with --format=text') + if args.open: + parser.error('Passing --open makes no sense with --format=text') + + if args.series_names is None: + args.series_names = ['Baseline'] + if len(args.files) == 2: + args.series_names += ['Candidate'] + elif len(args.files) > 2: + args.series_names.extend(f'Candidate{n}' for n in range(1, len(args.files))) + else: + args.series_names = args.series_names.split(',') + if len(args.series_names) != len(args.files): + parser.error(f'Passed incorrect number of series names: got {len(args.series_names)} series names but {len(args.files)} inputs to compare') + + # Parse the raw LNT data and store each input in a dataframe + lnt_inputs = [parse_lnt(file.readlines()) for file in args.files] + inputs = [pandas.DataFrame(lnt).rename(columns={args.metric: f'{args.metric}_{i}'}) for (i, lnt) in enumerate(lnt_inputs)] - if args.filter is not None: - regex = re.compile(args.filter) - baseline = {k: v for (k, v) in baseline.items() if regex.search(k)} - candidate = {k: v for (k, v) in candidate.items() if regex.search(k)} + # Join the inputs into a single dataframe + data = functools.reduce(lambda a, b: a.merge(b, how='outer', on='benchmark'), inputs) - (benchmarks, baseline_series, candidate_series) = prepare_series(baseline, candidate, args.metric) + if args.filter is not None: + keeplist = [b for b in data['benchmark'] if re.search(args.filter, b) is not None] + data = data[data['benchmark'].isin(keeplist)] if args.format == 'chart': - figure = create_chart(benchmarks, baseline_series, candidate_series) - plotly.io.write_html(figure, file=args.output) + figure = create_chart(data, args.metric, subtitle=args.subtitle, series_names=args.series_names) + do_open = args.output is None or args.open + output = args.output or tempfile.NamedTemporaryFile(suffix='.html').name + plotly.io.write_html(figure, file=output, auto_open=do_open) else: - diff = plain_text_comparison(benchmarks, baseline_series, candidate_series) - args.output.write(diff) - args.output.write('\n') + diff = plain_text_comparison(data, args.metric, baseline_name=args.series_names[0], + candidate_name=args.series_names[1]) + diff += '\n' + if args.output is not None: + with open(args.output, 'w') as out: + out.write(diff) + else: + sys.stdout.write(diff) if __name__ == '__main__': main(sys.argv[1:]) diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index c1e579c775746..5d469d4914b0b 100644 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -1334,6 +1334,11 @@ def add_version_header(tc): "values": {"c++23": 202110}, "headers": ["string"], }, + { + "name": "__cpp_lib_string_subview", + "values": {"c++26": 202506}, + "headers": ["string", "string_view"], + }, { "name": 
"__cpp_lib_string_udls", "values": {"c++14": 201304}, diff --git a/libcxx/utils/libcxx/test/format.py b/libcxx/utils/libcxx/test/format.py index 5765afec399cf..c9dffd1bb7971 100644 --- a/libcxx/utils/libcxx/test/format.py +++ b/libcxx/utils/libcxx/test/format.py @@ -92,6 +92,7 @@ def parseScript(test, preamble): # errors, which doesn't make sense for clang-verify tests because we may want to check # for specific warning diagnostics. _checkBaseSubstitutions(substitutions) + substitutions.append(("%T", tmpDir)) substitutions.append( ("%{build}", "%{cxx} %s %{flags} %{compile_flags} %{link_flags} -o %t.exe") ) diff --git a/libcxx/utils/parse-spec-results b/libcxx/utils/parse-spec-results index 3aff206f8959c..2c3c279622ad5 100755 --- a/libcxx/utils/parse-spec-results +++ b/libcxx/utils/parse-spec-results @@ -58,7 +58,10 @@ def main(argv): 'sure to use appropriate quoting for header names that contain spaces. This option only makes sense ' 'when the output format is CSV.') parser.add_argument('--keep-not-run', action='store_true', - help='Keep entries whose \'Base Status\' is marked as \'NR\', aka \'Not Run\'. By default, such entries are discarded.') + help='Keep entries whose "Base Status" is marked as "NR" (aka "Not Run"). By default, such entries are discarded.') + parser.add_argument('--keep-failed', action='store_true', + help='Keep entries whose "Base Status" is marked as "CE" (aka "Compilation Error") or "RE" (aka "Runtime Error"). ' + 'By default, such entries are discarded.') args = parser.parse_args(argv) if args.table == 'full': @@ -76,10 +79,12 @@ def main(argv): headers = parsed_headers rows.extend(parsed_rows) - # Remove rows that were not run unless we were asked to keep them + # Remove rows that were not run (or failed) unless we were asked to keep them + status = headers.index('Base Status') if not args.keep_not_run: - not_run = headers.index('Base Status') - rows = [row for row in rows if row[not_run] != 'NR'] + rows = [row for row in rows if row[status] != 'NR'] + if not args.keep_failed: + rows = [row for row in rows if row[status] not in ('CE', 'RE')] if args.extract is not None: if args.output_format != 'csv': diff --git a/libcxx/utils/requirements.txt b/libcxx/utils/requirements.txt index 0c76714849281..1ec769c8693dc 100644 --- a/libcxx/utils/requirements.txt +++ b/libcxx/utils/requirements.txt @@ -1,3 +1,7 @@ +GitPython +numpy +pandas plotly +statsmodels tabulate tqdm diff --git a/libcxx/utils/visualize-historical b/libcxx/utils/visualize-historical index 2e9b07137b0b2..ef28e8b17ca4b 100755 --- a/libcxx/utils/visualize-historical +++ b/libcxx/utils/visualize-historical @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import argparse +import datetime import functools import os import pathlib @@ -10,7 +11,10 @@ import subprocess import sys import tempfile +import git +import pandas import plotly +import plotly.express import tqdm @functools.total_ordering @@ -42,6 +46,13 @@ class Commit: raise RuntimeError(f'Error when trying to obtain the commit order for {self._sha} and {other._sha}') return res.returncode == 0 + def __hash__(self): + """ + Return the full revision for this commit. + """ + return hash(self.fullrev) + + @functools.cache def show(self, include_diff=False): """ Return the commit information equivalent to `git show` associated to this commit. 
@@ -65,6 +76,14 @@ class Commit: """ return subprocess.check_output(['git', '-C', self._git_repo, 'rev-parse', self._sha], text=True).strip() + @functools.cached_property + def commit_date(self): + """ + Return the date of the commit as a `datetime.datetime` object. + """ + repo = git.Repo(self._git_repo) + return datetime.datetime.fromtimestamp(repo.commit(self._sha).committed_date) + def prefetch(self): """ Prefetch cached properties associated to this commit object. @@ -72,8 +91,10 @@ class Commit: This makes it possible to control when time is spent recovering that information from Git for e.g. better reporting to the user. """ - self.shortrev + self.commit_date self.fullrev + self.shortrev + self.show() def __str__(self): return self._sha @@ -91,25 +112,21 @@ def truncate_lines(string, n, marker=None): assert len(truncated) <= n, "broken post-condition" return '\n'.join(truncated) -def create_plot(commits, benchmarks, data): +def create_plot(data, metric, subtitle=None): """ - Create a plot object showing the evolution of each benchmark throughout the given commits. + Create a plot object showing the evolution of each benchmark throughout the given commits for + the given metric. """ - figure = plotly.graph_objects.Figure(layout_title_text=f'{commits[0].shortrev} to {commits[-1].shortrev}') - - # Create the X axis and the hover information - x_axis = [commit.shortrev for commit in commits] - hover_info = [truncate_lines(commit.show(), 30, marker='...').replace('\n', '
') for commit in commits] - - # For each benchmark, get the metric for that benchmark for each commit. - # - # Some commits may not have any data associated to a benchmark (e.g. runtime or compilation error). - # Use None, which is handled properly by plotly. - for benchmark in benchmarks: - series = [commit_data.get(benchmark, None) for commit_data in data] - scatter = plotly.graph_objects.Scatter(x=x_axis, y=series, text=hover_info, name=benchmark) - figure.add_trace(scatter) - + data = data.sort_values(by=['revlist_order', 'benchmark']) + revlist = pandas.unique(data['commit']) # list of all commits in chronological order + hover_info = {c: truncate_lines(c.show(), 30, marker='...').replace('\n', '
') for c in revlist} + figure = plotly.express.scatter(data, title=f"{revlist[0].shortrev} to {revlist[-1].shortrev}", + subtitle=subtitle, + x='revlist_order', y=metric, + symbol='benchmark', + color='benchmark', + hover_name=[hover_info[c] for c in data['commit']], + trendline="lowess") return figure def directory_path(string): @@ -118,25 +135,28 @@ def directory_path(string): else: raise NotADirectoryError(string) -def parse_lnt(lines): +def parse_lnt(lines, aggregate=statistics.median): """ - Parse lines in LNT format and return a dictionnary of the form: + Parse lines in LNT format and return a list of dictionnaries of the form: - { - 'benchmark1': { - 'metric1': [float], - 'metric2': [float], + [ + { + 'benchmark': , + : float, + : float, ... }, - 'benchmark2': { - 'metric1': [float], - 'metric2': [float], + { + 'benchmark': , + : float, + : float, ... }, ... - } + ] - Each metric may have multiple values. + If a metric has multiple values associated to it, they are aggregated into a single + value using the provided aggregation function. """ results = {} for line in lines: @@ -145,13 +165,30 @@ def parse_lnt(lines): continue (identifier, value) = line.split(' ') - (name, metric) = identifier.split('.') - if name not in results: - results[name] = {} - if metric not in results[name]: - results[name][metric] = [] - results[name][metric].append(float(value)) - return results + (benchmark, metric) = identifier.split('.') + if benchmark not in results: + results[benchmark] = {'benchmark': benchmark} + + entry = results[benchmark] + if metric not in entry: + entry[metric] = [] + entry[metric].append(float(value)) + + for (bm, entry) in results.items(): + for metric in entry: + if isinstance(entry[metric], list): + entry[metric] = aggregate(entry[metric]) + + return list(results.values()) + +def sorted_revlist(git_repo, commits): + """ + Return the list of commits sorted by their chronological order (from oldest to newest) in the + provided Git repository. Items earlier in the list are older than items later in the list. + """ + revlist_cmd = ['git', '-C', git_repo, 'rev-list', '--no-walk'] + list(commits) + revlist = subprocess.check_output(revlist_cmd, text=True).strip().splitlines() + return list(reversed(revlist)) def main(argv): parser = argparse.ArgumentParser( @@ -159,7 +196,7 @@ def main(argv): description='Visualize historical data in LNT format. This program generates a HTML file that embeds an ' 'interactive plot with the provided data. The HTML file can then be opened in a browser to ' 'visualize the data as a chart.', - epilog='This script depends on the `plotly` and the `tqdm` Python modules.') + epilog='This script depends on the modules listed in `libcxx/utils/requirements.txt`.') parser.add_argument('directory', type=directory_path, help='Path to a valid directory containing benchmark data in LNT format, each file being named .lnt. ' 'This is also the format generated by the `benchmark-historical` utility.') @@ -176,6 +213,15 @@ def main(argv): 'Since the chart is interactive, it generally makes most sense to include all the benchmarks ' 'and to then filter them in the browser, but in some cases producing a chart with a reduced ' 'number of data series is useful.') + parser.add_argument('--find-outliers', metavar='FLOAT', type=float, required=False, + help='Instead of building a chart, detect commits that show a large spike (more than the given relative threshold) ' + 'with the previous result and print those to standard output. 
This can be used to generate a list of ' + 'potential outliers that we might want to re-generate the data for. The threshold is expressed as a ' + 'floating point number, e.g. 0.25 will detect points that differ by more than 25%% from their previous ' + 'result. This option respects --filter, i.e. only benchmarks that match the filter will be analyzed for ' + 'outliers.') + parser.add_argument('--subtitle', type=str, required=False, + help='Optional subtitle for the chart. This can be used to help identify the contents of the chart.') parser.add_argument('--git-repo', type=directory_path, default=pathlib.Path(os.getcwd()), help='Path to the git repository to use for ordering commits in time. ' 'By default, the current working directory is used.') @@ -184,40 +230,46 @@ def main(argv): 'the resulting benchmark is opened automatically by default.') args = parser.parse_args(argv) - # Extract benchmark data from the directory and keep only the metric we're interested in. - # - # Some data points may have multiple values associated to the metric (e.g. if we performed - # multiple runs to reduce noise), in which case we aggregate them using a median. - historical_data = [] + # Extract benchmark data from the directory. + data = {} files = [f for f in args.directory.glob('*.lnt')] for file in tqdm.tqdm(files, desc='Parsing LNT files'): + rows = parse_lnt(file.read_text().splitlines()) (commit, _) = os.path.splitext(os.path.basename(file)) commit = Commit(args.git_repo, commit) - with open(file, 'r') as f: - lnt_data = parse_lnt(f.readlines()) - commit_data = {} - for (bm, metrics) in lnt_data.items(): - commit_data[bm] = statistics.median(metrics[args.metric]) if args.metric in metrics else None - historical_data.append((commit, commit_data)) + data[commit] = rows # Obtain commit information which is then cached throughout the program. Do this # eagerly so we can provide a progress bar. - for (commit, _) in tqdm.tqdm(historical_data, desc='Prefetching Git information'): + for commit in tqdm.tqdm(data.keys(), desc='Prefetching Git information'): commit.prefetch() - # Sort the data based on the ordering of commits inside the provided Git repository - historical_data.sort(key=lambda x: x[0]) + # Create a dataframe from the raw data and add some columns to it: + # - 'commit' represents the Commit object associated to the results in that row + # - `revlist_order` represents the order of the commit within the Git repository. + # - `date` represents the commit date + revlist = sorted_revlist(args.git_repo, [c.fullrev for c in data.keys()]) + data = pandas.DataFrame([row | {'commit': c} for (c, rows) in data.items() for row in rows]) + data = data.join(pandas.DataFrame([{'revlist_order': revlist.index(c.fullrev)} for c in data['commit']])) + data = data.join(pandas.DataFrame([{'date': c.commit_date} for c in data['commit']])) - # Filter the benchmarks if needed - benchmarks = {b for (_, commit_data) in historical_data for b in commit_data.keys()} + # Filter the benchmarks if needed. if args.filter is not None: - regex = re.compile(args.filter) - benchmarks = {b for b in benchmarks if regex.search(b)} + keeplist = [b for b in data['benchmark'] if re.search(args.filter, b) is not None] + data = data[data['benchmark'].isin(keeplist)] + + # If requested, perform a basic pass to detect outliers. + # Note that we consider a commit to be an outlier if any of the benchmarks for that commit is an outlier. 
+ if args.find_outliers is not None: + threshold = args.find_outliers + outliers = set() + for (benchmark, series) in data.sort_values(by='revlist_order').groupby('benchmark'): + outliers |= set(series[series[args.metric].pct_change() > threshold]['commit']) + print(f'Outliers (more than {threshold * 100}%): {" ".join(c.shortrev for c in outliers)}') + return - # Plot the data for all the required benchmarks - figure = create_plot([commit for (commit, _) in historical_data], - sorted(list(benchmarks)), - [data for (_, data) in historical_data]) + # Plot the data for all the required benchmarks. + figure = create_plot(data, args.metric, subtitle=args.subtitle) do_open = args.output is None or args.open output = args.output if args.output is not None else tempfile.NamedTemporaryFile(suffix='.html').name plotly.io.write_html(figure, file=output, auto_open=do_open) diff --git a/libunwind/src/UnwindRegistersRestore.S b/libunwind/src/UnwindRegistersRestore.S index 5e199188945df..1bcd205be260d 100644 --- a/libunwind/src/UnwindRegistersRestore.S +++ b/libunwind/src/UnwindRegistersRestore.S @@ -1044,9 +1044,10 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind18Registers_mips_o326jumptoEv) lw $27, (4 * 27)($4) lw $28, (4 * 28)($4) lw $29, (4 * 29)($4) - lw $30, (4 * 30)($4) // load new pc into ra lw $31, (4 * 32)($4) + // MIPS 1 has load delay slot. Ensure lw $31 and jr are separated by an instruction. + lw $30, (4 * 30)($4) // jump to ra, load a0 in the delay slot jr $31 lw $4, (4 * 4)($4) @@ -1082,11 +1083,13 @@ DEFINE_LIBUNWIND_FUNCTION(_ZN9libunwind21Registers_mips_newabi6jumptoEv) ld $2, (8 * 2)($4) ld $3, (8 * 3)($4) // skip a0 for now - .irp i,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 + .irp i,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29 ld $\i, (8 * \i)($4) .endr // load new pc into ra ld $31, (8 * 32)($4) + // MIPS 1 has load delay slot. Ensure lw $31 and jr are separated by an instruction. 
+ ld $30, (8 * 30)($4) // jump to ra, load a0 in the delay slot jr $31 ld $4, (8 * 4)($4) diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index b4f00996319b1..258a82e371f3a 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -1620,7 +1620,7 @@ void Writer::createSymbolAndStringTable() { dthunk->wrappedSym->writtenToSymtab = true; if (std::optional sym = createSymbol(dthunk->wrappedSym)) { - if (d->getName().size() > COFF::NameSize) + if (dthunk->wrappedSym->getName().size() > COFF::NameSize) longNameSymbols.emplace_back(outputSymtab.size(), dthunk->wrappedSym->getName()); outputSymtab.push_back(*sym); diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp index 27e77e943c197..2a97df4785ecb 100644 --- a/lld/ELF/Arch/AArch64.cpp +++ b/lld/ELF/Arch/AArch64.cpp @@ -82,7 +82,7 @@ class AArch64 : public TargetInfo { void relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const override; RelExpr adjustTlsExpr(RelType type, RelExpr expr) const override; - void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override; + void relocateAlloc(InputSection &sec, uint8_t *buf) const override; void applyBranchToBranchOpt() const override; private: @@ -939,12 +939,8 @@ static bool needsGotForMemtag(const Relocation &rel) { return rel.sym->isTagged() && needsGot(rel.expr); } -void AArch64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { - uint64_t secAddr = sec.getOutputSection()->addr; - if (auto *s = dyn_cast(&sec)) - secAddr += s->outSecOff; - else if (auto *ehIn = dyn_cast(&sec)) - secAddr += ehIn->getParent()->outSecOff; +void AArch64::relocateAlloc(InputSection &sec, uint8_t *buf) const { + uint64_t secAddr = sec.getOutputSection()->addr + sec.outSecOff; AArch64Relaxer relaxer(ctx, sec.relocs()); for (size_t i = 0, size = sec.relocs().size(); i != size; ++i) { const Relocation &rel = sec.relocs()[i]; diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp index db2c71c3b42b9..c6cdf05547d3f 100644 --- a/lld/ELF/Arch/LoongArch.cpp +++ b/lld/ELF/Arch/LoongArch.cpp @@ -41,7 +41,7 @@ class LoongArch final : public TargetInfo { bool relaxOnce(int pass) const override; bool synthesizeAlign(uint64_t &dot, InputSection *sec) override; RelExpr adjustTlsExpr(RelType type, RelExpr expr) const override; - void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override; + void relocateAlloc(InputSection &sec, uint8_t *buf) const override; void finalizeRelax(int passes) const override; private: @@ -1395,13 +1395,9 @@ static bool pairForGotRels(ArrayRef relocs) { return i == size; } -void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { +void LoongArch::relocateAlloc(InputSection &sec, uint8_t *buf) const { const unsigned bits = ctx.arg.is64 ? 
64 : 32; - uint64_t secAddr = sec.getOutputSection()->addr; - if (auto *s = dyn_cast(&sec)) - secAddr += s->outSecOff; - else if (auto *ehIn = dyn_cast(&sec)) - secAddr += ehIn->getParent()->outSecOff; + uint64_t secAddr = sec.getOutputSection()->addr + sec.outSecOff; bool isExtreme = false, isRelax = false; const MutableArrayRef relocs = sec.relocs(); const bool isPairForGotRels = pairForGotRels(relocs); diff --git a/lld/ELF/Arch/PPC.cpp b/lld/ELF/Arch/PPC.cpp index 60a0a38d5f23a..5972698b34a2c 100644 --- a/lld/ELF/Arch/PPC.cpp +++ b/lld/ELF/Arch/PPC.cpp @@ -49,7 +49,7 @@ class PPC final : public TargetInfo { uint64_t val) const override; RelExpr adjustTlsExpr(RelType type, RelExpr expr) const override; int getTlsGdRelaxSkip(RelType type) const override; - void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override; + void relocateAlloc(InputSection &sec, uint8_t *buf) const override; private: void relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const; @@ -496,10 +496,8 @@ void PPC::relaxTlsIeToLe(uint8_t *loc, const Relocation &rel, } } -void PPC::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { - uint64_t secAddr = sec.getOutputSection()->addr; - if (auto *s = dyn_cast(&sec)) - secAddr += s->outSecOff; +void PPC::relocateAlloc(InputSection &sec, uint8_t *buf) const { + uint64_t secAddr = sec.getOutputSection()->addr + sec.outSecOff; for (const Relocation &rel : sec.relocs()) { uint8_t *loc = buf + rel.offset; const uint64_t val = diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index 3cd4a6294e2a8..550c091624bb5 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -190,7 +190,7 @@ class PPC64 final : public TargetInfo { RelExpr adjustGotPcExpr(RelType type, int64_t addend, const uint8_t *loc) const override; void relaxGot(uint8_t *loc, const Relocation &rel, uint64_t val) const; - void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override; + void relocateAlloc(InputSection &sec, uint8_t *buf) const override; bool adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end, uint8_t stOther) const override; @@ -1561,12 +1561,8 @@ void PPC64::relaxTlsGdToIe(uint8_t *loc, const Relocation &rel, } } -void PPC64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { - uint64_t secAddr = sec.getOutputSection()->addr; - if (auto *s = dyn_cast(&sec)) - secAddr += s->outSecOff; - else if (auto *ehIn = dyn_cast(&sec)) - secAddr += ehIn->getParent()->outSecOff; +void PPC64::relocateAlloc(InputSection &sec, uint8_t *buf) const { + uint64_t secAddr = sec.getOutputSection()->addr + sec.outSecOff; uint64_t lastPPCRelaxedRelocOff = -1; for (const Relocation &rel : sec.relocs()) { uint8_t *loc = buf + rel.offset; diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index 7f2bfefa5578a..dc2ab97e9d9be 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -43,7 +43,7 @@ class RISCV final : public TargetInfo { const uint8_t *loc) const override; void relocate(uint8_t *loc, const Relocation &rel, uint64_t val) const override; - void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override; + void relocateAlloc(InputSection &sec, uint8_t *buf) const override; bool relaxOnce(int pass) const override; template bool synthesizeAlignForInput(uint64_t &dot, InputSection *sec, @@ -603,12 +603,8 @@ static void tlsdescToLe(uint8_t *loc, const Relocation &rel, uint64_t val) { } } -void RISCV::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { - uint64_t secAddr = sec.getOutputSection()->addr; - if 
(auto *s = dyn_cast(&sec)) - secAddr += s->outSecOff; - else if (auto *ehIn = dyn_cast(&sec)) - secAddr += ehIn->getParent()->outSecOff; +void RISCV::relocateAlloc(InputSection &sec, uint8_t *buf) const { + uint64_t secAddr = sec.getOutputSection()->addr + sec.outSecOff; uint64_t tlsdescVal = 0; bool tlsdescRelax = false, isToLe = false; const ArrayRef relocs = sec.relocs(); diff --git a/lld/ELF/Arch/X86.cpp b/lld/ELF/Arch/X86.cpp index c1980d6e0538f..904741fd72b0a 100644 --- a/lld/ELF/Arch/X86.cpp +++ b/lld/ELF/Arch/X86.cpp @@ -37,7 +37,7 @@ class X86 : public TargetInfo { uint64_t val) const override; RelExpr adjustTlsExpr(RelType type, RelExpr expr) const override; - void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override; + void relocateAlloc(InputSection &sec, uint8_t *buf) const override; private: void relaxTlsGdToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const; @@ -491,10 +491,8 @@ void X86::relaxTlsLdToLe(uint8_t *loc, const Relocation &rel, memcpy(loc - 2, inst, sizeof(inst)); } -void X86::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { - uint64_t secAddr = sec.getOutputSection()->addr; - if (auto *s = dyn_cast(&sec)) - secAddr += s->outSecOff; +void X86::relocateAlloc(InputSection &sec, uint8_t *buf) const { + uint64_t secAddr = sec.getOutputSection()->addr + sec.outSecOff; for (const Relocation &rel : sec.relocs()) { uint8_t *loc = buf + rel.offset; const uint64_t val = diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp index 488f4803b2cb4..9083b5b9ff250 100644 --- a/lld/ELF/Arch/X86_64.cpp +++ b/lld/ELF/Arch/X86_64.cpp @@ -44,7 +44,7 @@ class X86_64 : public TargetInfo { unsigned size) const override; RelExpr adjustGotPcExpr(RelType type, int64_t addend, const uint8_t *loc) const override; - void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const override; + void relocateAlloc(InputSection &sec, uint8_t *buf) const override; bool adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end, uint8_t stOther) const override; bool deleteFallThruJmpInsn(InputSection &is, InputFile *file, @@ -1146,12 +1146,8 @@ bool X86_64::adjustPrologueForCrossSplitStack(uint8_t *loc, uint8_t *end, return false; } -void X86_64::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { - uint64_t secAddr = sec.getOutputSection()->addr; - if (auto *s = dyn_cast(&sec)) - secAddr += s->outSecOff; - else if (auto *ehIn = dyn_cast(&sec)) - secAddr += ehIn->getParent()->outSecOff; +void X86_64::relocateAlloc(InputSection &sec, uint8_t *buf) const { + uint64_t secAddr = sec.getOutputSection()->addr + sec.outSecOff; for (const Relocation &rel : sec.relocs()) { if (rel.expr == R_NONE) // See deleteFallThruJmpInsn continue; diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index ea6bcc5bb272b..98267d1e081db 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -1171,7 +1171,7 @@ void InputSection::relocateNonAlloc(Ctx &ctx, uint8_t *buf, } template -void InputSectionBase::relocate(Ctx &ctx, uint8_t *buf, uint8_t *bufEnd) { +void InputSection::relocate(Ctx &ctx, uint8_t *buf, uint8_t *bufEnd) { if ((flags & SHF_EXECINSTR) && LLVM_UNLIKELY(getFile()->splitStack)) adjustSplitStackFunctionPrologues(ctx, buf, bufEnd); diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index 98e7d5d4ff0cd..8462f03bdb77e 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -262,10 +262,6 @@ class InputSectionBase : public SectionBase { return {*this, sym, offset}; } - // Each section knows how to relocate itself. 
These functions apply - // relocations, assuming that Buf points to this section's copy in - // the mmap'ed output buffer. - template void relocate(Ctx &, uint8_t *buf, uint8_t *bufEnd); uint64_t getRelocTargetVA(Ctx &, const Relocation &r, uint64_t p) const; // The native ELF reloc data type is not very convenient to handle. @@ -443,8 +439,12 @@ class InputSection : public InputSectionBase { InputSectionBase *getRelocatedSection() const; + // Each section knows how to relocate itself. These functions apply + // relocations, assuming that `buf` points to this section's copy in + // the mmap'ed output buffer. template void relocateNonAlloc(Ctx &, uint8_t *buf, Relocs rels); + template void relocate(Ctx &, uint8_t *buf, uint8_t *bufEnd); // Points to the canonical section. If ICF folds two sections, repl pointer of // one section points to the other. diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 6f55bac2ecf16..bd96c051d160d 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -1549,7 +1549,7 @@ void RelocationScanner::scanOne(typename Relocs::const_iterator &i) { sec->file->ppc64SmallCodeModelTocRelocs = true; // Record the TOC entry (.toc + addend) as not relaxable. See the comment in - // InputSectionBase::relocateAlloc(). + // PPC64::relocateAlloc(). if (type == R_PPC64_TOC16_LO && sym.isSection() && isa(sym) && cast(sym).section->name == ".toc") ctx.ppc64noTocRelax.insert({&sym, addend}); diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 0d87f9a66071a..457a794a8c3a8 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -403,7 +403,7 @@ EhFrameSection::EhFrameSection(Ctx &ctx) // Search for an existing CIE record or create a new one. // CIE records from input object files are uniquified by their contents // and where their relocations point to. -template +template CieRecord *EhFrameSection::addCie(EhSectionPiece &cie, ArrayRef rels) { Symbol *personality = nullptr; unsigned firstRelI = cie.firstRelocation; @@ -424,7 +424,7 @@ CieRecord *EhFrameSection::addCie(EhSectionPiece &cie, ArrayRef rels) { // There is one FDE per function. Returns a non-null pointer to the function // symbol if the given FDE points to a live function. -template +template Defined *EhFrameSection::isFdeLive(EhSectionPiece &fde, ArrayRef rels) { auto *sec = cast(fde.sec); unsigned firstRelI = fde.firstRelocation; @@ -456,14 +456,14 @@ template void EhFrameSection::addRecords(EhInputSection *sec, ArrayRef rels) { offsetToCie.clear(); for (EhSectionPiece &cie : sec->cies) - offsetToCie[cie.inputOff] = addCie(cie, rels); + offsetToCie[cie.inputOff] = addCie(cie, rels); for (EhSectionPiece &fde : sec->fdes) { uint32_t id = endian::read32(fde.data().data() + 4); CieRecord *rec = offsetToCie[fde.inputOff + 4 - id]; if (!rec) Fatal(ctx) << sec << ": invalid CIE reference"; - if (!isFdeLive(fde, rels)) + if (!isFdeLive(fde, rels)) continue; rec->fdes.push_back(&fde); numFdes++; @@ -497,7 +497,7 @@ void EhFrameSection::iterateFDEWithLSDAAux( continue; // The CIE has a LSDA argument. Call fn with d's section. - if (Defined *d = isFdeLive(fde, rels)) + if (Defined *d = isFdeLive(fde, rels)) if (auto *s = dyn_cast_or_null(d->section)) fn(*s); } @@ -662,7 +662,7 @@ void EhFrameSection::writeTo(uint8_t *buf) { // in the output buffer, but relocateAlloc() still works because // getOffset() takes care of discontiguous section pieces. 
for (EhInputSection *s : sections) - ctx.target->relocateAlloc(*s, buf); + ctx.target->relocateEh(*s, buf); if (getPartition(ctx).ehFrameHdr && getPartition(ctx).ehFrameHdr->getParent()) getPartition(ctx).ehFrameHdr->write(); diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index 223dfe3b67b10..55a10716c054b 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -88,10 +88,10 @@ class EhFrameSection final : public SyntheticSection { llvm::DenseSet &ciesWithLSDA, llvm::function_ref fn); - template + template CieRecord *addCie(EhSectionPiece &piece, ArrayRef rels); - template + template Defined *isFdeLive(EhSectionPiece &piece, ArrayRef rels); uint64_t getFdePc(uint8_t *buf, size_t off, uint8_t enc) const; diff --git a/lld/ELF/Target.cpp b/lld/ELF/Target.cpp index 4946484074d05..fb79ee911273a 100644 --- a/lld/ELF/Target.cpp +++ b/lld/ELF/Target.cpp @@ -148,22 +148,30 @@ RelExpr TargetInfo::adjustGotPcExpr(RelType type, int64_t addend, return R_GOT_PC; } -void TargetInfo::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const { +static void relocateImpl(const TargetInfo &target, InputSectionBase &sec, + uint64_t secAddr, uint8_t *buf) { + auto &ctx = target.ctx; const unsigned bits = ctx.arg.is64 ? 64 : 32; - uint64_t secAddr = sec.getOutputSection()->addr; - if (auto *s = dyn_cast(&sec)) - secAddr += s->outSecOff; - else if (auto *ehIn = dyn_cast(&sec)) - secAddr += ehIn->getParent()->outSecOff; for (const Relocation &rel : sec.relocs()) { uint8_t *loc = buf + rel.offset; const uint64_t val = SignExtend64( sec.getRelocTargetVA(ctx, rel, secAddr + rel.offset), bits); if (rel.expr != R_RELAX_HINT) - relocate(loc, rel, val); + target.relocate(loc, rel, val); } } +void TargetInfo::relocateAlloc(InputSection &sec, uint8_t *buf) const { + uint64_t secAddr = sec.getOutputSection()->addr + sec.outSecOff; + relocateImpl(*this, sec, secAddr, buf); +} + +// A variant of relocateAlloc that processes an EhInputSection. +void TargetInfo::relocateEh(EhInputSection &sec, uint8_t *buf) const { + uint64_t secAddr = sec.getOutputSection()->addr + sec.getParent()->outSecOff; + relocateImpl(*this, sec, secAddr, buf); +} + uint64_t TargetInfo::getImageBase() const { // Use --image-base if set. Fall back to the target default if not. if (ctx.arg.imageBase) diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h index bb8c24f052aa2..9f0605138a4fb 100644 --- a/lld/ELF/Target.h +++ b/lld/ELF/Target.h @@ -92,7 +92,8 @@ class TargetInfo { void relocateNoSym(uint8_t *loc, RelType type, uint64_t val) const { relocate(loc, Relocation{R_NONE, type, 0, 0, nullptr}, val); } - virtual void relocateAlloc(InputSectionBase &sec, uint8_t *buf) const; + virtual void relocateAlloc(InputSection &sec, uint8_t *buf) const; + void relocateEh(EhInputSection &sec, uint8_t *buf) const; // Do a linker relaxation pass and return true if we changed something. 
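The Target.cpp hunk above folds the per-target secAddr fixups into one shared relocation loop: relocateAlloc now handles regular input sections and the new relocateEh handles .eh_frame pieces, and the only difference between them is the base address they hand to the common helper. The standalone C++ sketch below illustrates that split with toy types (RegularSection, EhSection, Reloc are illustrative stand-ins, not lld's actual classes):

#include <cstdint>
#include <vector>

// Toy stand-ins for lld's section types; names are illustrative only.
struct OutputSection { uint64_t addr = 0; };
struct Reloc { uint64_t offset = 0; };

struct RegularSection {
  OutputSection *out = nullptr;
  uint64_t outSecOff = 0;            // offset of this section within its output section
  std::vector<Reloc> relocs;
};

struct EhSection {
  OutputSection *out = nullptr;      // same output section as the synthetic .eh_frame
  RegularSection *parent = nullptr;  // synthetic section that owns the pieces
  std::vector<Reloc> relocs;
};

// Shared loop: everything except the base address is identical.
static void relocateImpl(uint64_t secAddr, const std::vector<Reloc> &relocs,
                         uint8_t *buf) {
  for (const Reloc &r : relocs) {
    uint64_t p = secAddr + r.offset;  // address the relocation patches
    (void)p;                          // a real target would compute val and patch buf here
    (void)buf;
  }
}

// Regular sections add their own offset within the output section.
void relocateAlloc(RegularSection &sec, uint8_t *buf) {
  relocateImpl(sec.out->addr + sec.outSecOff, sec.relocs, buf);
}

// .eh_frame pieces borrow the offset of the synthetic section that owns them.
void relocateEh(EhSection &sec, uint8_t *buf) {
  relocateImpl(sec.out->addr + sec.parent->outSecOff, sec.relocs, buf);
}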
virtual bool relaxOnce(int pass) const { return false; } diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h index 19dba790c1c7c..51b1363d87615 100644 --- a/lld/MachO/Config.h +++ b/lld/MachO/Config.h @@ -222,6 +222,7 @@ struct Configuration { bool pgoWarnMismatch; bool warnThinArchiveMissingMembers; bool disableVerify; + bool separateCstringLiteralSections; bool callGraphProfileSort = false; llvm::StringRef printSymbolOrder; diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 5a9b5b66d01b5..7ce987e400a24 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -1522,8 +1522,8 @@ static void foldIdenticalLiterals() { // We always create a cStringSection, regardless of whether dedupLiterals is // true. If it isn't, we simply create a non-deduplicating CStringSection. // Either way, we must unconditionally finalize it here. - in.cStringSection->finalizeContents(); - in.objcMethnameSection->finalizeContents(); + for (auto *sec : in.cStringSections) + sec->finalizeContents(); in.wordLiteralSection->finalizeContents(); } @@ -1711,7 +1711,7 @@ bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, firstTLVDataSection = nullptr; tar = nullptr; - memset(&in, 0, sizeof(in)); + in = InStruct(); resetLoadedDylibs(); resetOutputSegments(); @@ -1983,6 +1983,9 @@ bool link(ArrayRef argsArr, llvm::raw_ostream &stdoutOS, OPT_no_warn_thin_archive_missing_members, true); config->generateUuid = !args.hasArg(OPT_no_uuid); config->disableVerify = args.hasArg(OPT_disable_verify); + config->separateCstringLiteralSections = + args.hasFlag(OPT_separate_cstring_literal_sections, + OPT_no_separate_cstring_literal_sections, false); auto IncompatWithCGSort = [&](StringRef firstArgStr) { // Throw an error only if --call-graph-profile-sort is explicitly specified diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp index 18b3ff961085b..b173e14cc86a8 100644 --- a/lld/MachO/InputSection.cpp +++ b/lld/MachO/InputSection.cpp @@ -63,15 +63,13 @@ void lld::macho::addInputSection(InputSection *inputSection) { isec->parent = osec; inputSections.push_back(isec); } else if (auto *isec = dyn_cast(inputSection)) { - if (isec->getName() == section_names::objcMethname) { - if (in.objcMethnameSection->inputOrder == UnspecifiedInputOrder) - in.objcMethnameSection->inputOrder = inputSectionsOrder++; - in.objcMethnameSection->addInput(isec); - } else { - if (in.cStringSection->inputOrder == UnspecifiedInputOrder) - in.cStringSection->inputOrder = inputSectionsOrder++; - in.cStringSection->addInput(isec); - } + bool useSectionName = config->separateCstringLiteralSections || + isec->getName() == section_names::objcMethname; + auto *osec = in.getOrCreateCStringSection( + useSectionName ? 
isec->getName() : section_names::cString); + if (osec->inputOrder == UnspecifiedInputOrder) + osec->inputOrder = inputSectionsOrder++; + osec->addInput(isec); } else if (auto *isec = dyn_cast(inputSection)) { if (in.wordLiteralSection->inputOrder == UnspecifiedInputOrder) in.wordLiteralSection->inputOrder = inputSectionsOrder++; diff --git a/lld/MachO/MapFile.cpp b/lld/MachO/MapFile.cpp index f3e221a700b14..29ebcdcf9a832 100644 --- a/lld/MachO/MapFile.cpp +++ b/lld/MachO/MapFile.cpp @@ -239,7 +239,7 @@ void macho::writeMapFile() { printIsecArrSyms(textOsec->inputs, textOsec->getThunks()); } else if (auto *concatOsec = dyn_cast(osec)) { printIsecArrSyms(concatOsec->inputs); - } else if (osec == in.cStringSection || osec == in.objcMethnameSection) { + } else if (is_contained(in.cStringSections, osec)) { const auto &liveCStrings = info.liveCStringsForSection.lookup(osec); uint64_t lastAddr = 0; // strings will never start at address 0, so this // is a sentinel value diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td index 8ae50f380741a..4eeb8fbe11121 100644 --- a/lld/MachO/Options.td +++ b/lld/MachO/Options.td @@ -1084,6 +1084,13 @@ def dyld_env : Separate<["-"], "dyld_env">, def ignore_auto_link : Flag<["-"], "ignore_auto_link">, HelpText<"Ignore LC_LINKER_OPTIONs">, Group; +defm separate_cstring_literal_sections + : BB<"separate-cstring-literal-sections", + "Emit all cstring literals into their respective sections defined by " + "their section names.", + "Emit all cstring literals into the __cstring section. As a special " + "case, the __objc_methname section will still be emitted. (default)">, + Group; def grp_deprecated : OptionGroup<"deprecated">, HelpText<"DEPRECATED">; diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 979a4ee6d8133..228b84db21c2a 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -1687,6 +1687,9 @@ void CStringSection::writeTo(uint8_t *buf) const { void CStringSection::finalizeContents() { uint64_t offset = 0; + // TODO: Call buildCStringPriorities() to support cstring ordering when + // deduplication is off, although this may negatively impact build + // performance. for (CStringInputSection *isec : inputs) { for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) { if (!piece.live) diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h index 5796b0790c83a..1abf3c210a64e 100644 --- a/lld/MachO/SyntheticSections.h +++ b/lld/MachO/SyntheticSections.h @@ -843,6 +843,9 @@ void writeChainedFixup(uint8_t *buf, const Symbol *sym, int64_t addend); struct InStruct { const uint8_t *bufferStart = nullptr; MachHeaderSection *header = nullptr; + /// The list of cstring sections. Note that this includes \p cStringSection + /// and \p objcMethnameSection already. 
+ llvm::SmallVector cStringSections; CStringSection *cStringSection = nullptr; DeduplicatedCStringSection *objcMethnameSection = nullptr; WordLiteralSection *wordLiteralSection = nullptr; @@ -863,6 +866,26 @@ struct InStruct { InitOffsetsSection *initOffsets = nullptr; ObjCMethListSection *objcMethList = nullptr; ChainedFixupsSection *chainedFixups = nullptr; + + CStringSection *getOrCreateCStringSection(StringRef name, + bool forceDedupStrings = false) { + auto [it, didEmplace] = + cStringSectionMap.try_emplace(name, cStringSections.size()); + if (!didEmplace) + return cStringSections[it->getValue()]; + + std::string &nameData = *make(name); + CStringSection *sec; + if (config->dedupStrings || forceDedupStrings) + sec = make(nameData.c_str()); + else + sec = make(nameData.c_str()); + cStringSections.push_back(sec); + return sec; + } + +private: + llvm::StringMap cStringSectionMap; }; extern InStruct in; diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp index f288fadc0d14f..995792be41747 100644 --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -1377,13 +1377,11 @@ void macho::resetWriter() { LCDylib::resetInstanceCount(); } void macho::createSyntheticSections() { in.header = make(); - if (config->dedupStrings) - in.cStringSection = - make(section_names::cString); - else - in.cStringSection = make(section_names::cString); - in.objcMethnameSection = - make(section_names::objcMethname); + // Materialize cstring and objcMethname sections + in.cStringSection = in.getOrCreateCStringSection(section_names::cString); + in.objcMethnameSection = cast( + in.getOrCreateCStringSection(section_names::objcMethname, + /*forceDedupStrings=*/true)); in.wordLiteralSection = make(); if (config->emitChainedFixups) { in.chainedFixups = make(); diff --git a/lld/test/COFF/strtab.s b/lld/test/COFF/strtab.s index fbdd8df52d540..9edc13e19e825 100644 --- a/lld/test/COFF/strtab.s +++ b/lld/test/COFF/strtab.s @@ -1,17 +1,32 @@ # REQUIRES: x86 # RUN: llvm-mc -triple=x86_64-windows-msvc %s -filetype=obj -o %t.obj -# RUN: lld-link -out:%t.exe -entry:main %t.obj -debug:dwarf +# RUN: lld-link -machine:x64 -def:%S/Inputs/library.def -implib:%t.lib +# RUN: lld-link -out:%t.exe -entry:main %t.obj %t.lib -debug:dwarf # RUN: llvm-readobj --string-table %t.exe | FileCheck %s +# RUN: llvm-nm %t.exe | FileCheck %s --check-prefix=SYMBOLS + +# Note, for this test to have the intended test coverage, the imported symbol +# "function" needs to be such that the symbol name itself is <= 8 chars, while +# "__imp_"+name is >8 chars. 
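getOrCreateCStringSection above memoizes cstring output sections by name so that --separate-cstring-literal-sections can fan literals out into arbitrarily named sections without creating duplicates. A minimal sketch of the same get-or-create-by-name pattern, using standard containers in place of llvm::StringMap/SmallVector (the types here are illustrative only, not lld's):

#include <map>
#include <memory>
#include <string>
#include <vector>

// Illustrative stand-in for a cstring output section.
struct CStringSec {
  explicit CStringSec(std::string n) : name(std::move(n)) {}
  std::string name;
};

struct InState {
  std::vector<std::unique_ptr<CStringSec>> cStringSections;

  // Return the section for `name`, creating it on first use. The map stores
  // indices into the vector so the creation order of sections stays stable.
  CStringSec *getOrCreate(const std::string &name) {
    auto [it, inserted] = nameToIndex.try_emplace(name, cStringSections.size());
    if (!inserted)
      return cStringSections[it->second].get();
    cStringSections.push_back(std::make_unique<CStringSec>(name));
    return cStringSections.back().get();
  }

private:
  std::map<std::string, size_t> nameToIndex;
};

With such a map in place, addInputSection only has to pick the key: the input section's own name when the new option is enabled (or for __objc_methname), and __cstring otherwise.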
# CHECK: StringTable { -# CHECK-NEXT: Length: 87 +# CHECK-NEXT: Length: 102 # CHECK-NEXT: [ 4] .debug_abbrev # CHECK-NEXT: [ 12] .debug_line # CHECK-NEXT: [ 1e] long_name_symbolz # CHECK-NEXT: [ 30] .debug_abbrez -# CHECK-NEXT: [ 3e] __impl_long_name_symbolA +# CHECK-NEXT: [ 3e] __imp_function +# CHECK-NEXT: [ 4d] __impl_long_name_symbolA # CHECK-NEXT: } +# SYMBOLS: 140001000 N .debug_abbrez +# SYMBOLS-NEXT: 140002070 R __imp_function +# SYMBOLS-NEXT: 140001000 t __impl_long_name_symbolA +# SYMBOLS-NEXT: 140001010 T function +# SYMBOLS-NEXT: 140001000 t long_name_symbolA +# SYMBOLS-NEXT: 140001000 t long_name_symbolz +# SYMBOLS-NEXT: 140001000 T main +# SYMBOLS-NEXT: 140001000 t name_symbolA .global main .text @@ -21,6 +36,7 @@ long_name_symbolA: __impl_long_name_symbolA: name_symbolA: .debug_abbrez: + call function ret .section .debug_abbrev,"dr" diff --git a/lld/test/ELF/eh-frame-relocation.s b/lld/test/ELF/eh-frame-relocation.s new file mode 100644 index 0000000000000..9c1fe40dba7d3 --- /dev/null +++ b/lld/test/ELF/eh-frame-relocation.s @@ -0,0 +1,29 @@ +# REQUIRES: x86 +## Test that marker relocations are ignored and undefined symbols lead to errors. + +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o +# RUN: llvm-mc -filetype=obj -triple=x86_64 abi.s -o abi.o +# RUN: ld.lld a.o abi.o -o a +# RUN: llvm-readelf -s a | FileCheck %s + +# CHECK: 00000000002{{.*}} 0 FUNC GLOBAL DEFAULT [[#]] __gxx_personality_v0 + +# RUN: not ld.lld a.o 2>&1 | FileCheck %s --check-prefix=ERR + +# ERR: error: undefined symbol: __gxx_personality_v0 +# ERR-NEXT: >>> referenced by a.o:(.eh_frame+0x12) + +#--- a.s +.cfi_startproc +.cfi_personality 0, __gxx_personality_v0 + ret +.cfi_endproc + +.section .eh_frame,"a",@unwind +.reloc ., BFD_RELOC_NONE, ignore + +#--- abi.s +.globl __gxx_personality_v0 +.type __gxx_personality_v0, @function +__gxx_personality_v0: diff --git a/lld/test/ELF/gc-sections-print.s b/lld/test/ELF/gc-sections-print.s index f105dc10c2471..a6451912c05ed 100644 --- a/lld/test/ELF/gc-sections-print.s +++ b/lld/test/ELF/gc-sections-print.s @@ -2,7 +2,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t # RUN: ld.lld %t --gc-sections --print-gc-sections -o %t2 2>&1 | FileCheck -check-prefix=PRINT %s # RUN: ld.lld %t --gc-sections --print-gc-sections=- -o %t2 2>&1 | FileCheck -check-prefix=PRINT %s -# RUN: ld.lld %t --gc-sections --print-gc-sections=%t.txt +# RUN: ld.lld %t --gc-sections --print-gc-sections=%t.txt -o %t2 # RUN: FileCheck --check-prefix=PRINT %s --input-file=%t.txt # PRINT: removing unused section {{.*}}:(.text.x) diff --git a/lld/test/MachO/cstring.ll b/lld/test/MachO/cstring.ll new file mode 100644 index 0000000000000..4ba9082427b85 --- /dev/null +++ b/lld/test/MachO/cstring.ll @@ -0,0 +1,32 @@ +; REQUIRES: x86 +; RUN: llvm-as %s -o %t.o + +; RUN: %lld -dylib --separate-cstring-literal-sections %t.o -o - | llvm-objdump --macho --section-headers - | FileCheck %s +; RUN: %lld -dylib --no-separate-cstring-literal-sections %t.o -o - | llvm-objdump --macho --section-headers - | FileCheck %s --check-prefix=CSTR +; RUN: %lld -dylib %t.o -o - | llvm-objdump --macho --section-headers - | FileCheck %s --check-prefix=CSTR + +; CHECK-DAG: __cstring +; CHECK-DAG: __new_sec +; CHECK-DAG: __objc_classname +; CHECK-DAG: __objc_methname +; CHECK-DAG: __objc_methtype + +; CSTR-DAG: __cstring +; CSTR-DAG: __objc_methname + +target triple = "x86_64-apple-darwin" +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32" + 
+@.str = private unnamed_addr constant [10 x i8] c"my string\00", align 1 +@.str1 = private unnamed_addr constant [16 x i8] c"my other string\00", section "__TEXT,__new_sec,cstring_literals", align 1 +@OBJC_CLASS_NAME_ = private unnamed_addr constant [4 x i8] c"foo\00", section "__TEXT,__objc_classname,cstring_literals", align 1 +@OBJC_METH_VAR_NAME_ = private unnamed_addr constant [4 x i8] c"bar\00", section "__TEXT,__objc_methname,cstring_literals", align 1 +@OBJC_METH_VAR_TYPE_ = private unnamed_addr constant [4 x i8] c"goo\00", section "__TEXT,__objc_methtype,cstring_literals", align 1 + +@llvm.compiler.used = appending global [5 x ptr] [ + ptr @.str, + ptr @.str1, + ptr @OBJC_METH_VAR_NAME_, + ptr @OBJC_CLASS_NAME_, + ptr @OBJC_METH_VAR_TYPE_ +] diff --git a/lld/test/MachO/ordre-file-cstring.s b/lld/test/MachO/order-file-cstring.s similarity index 100% rename from lld/test/MachO/ordre-file-cstring.s rename to lld/test/MachO/order-file-cstring.s diff --git a/lld/test/wasm/memory-naming.test b/lld/test/wasm/memory-naming.test index b4aabaeeac357..766d9cd59050b 100644 --- a/lld/test/wasm/memory-naming.test +++ b/lld/test/wasm/memory-naming.test @@ -65,6 +65,21 @@ # CHECK-IMPORT-NEXT: Index: 0 # CHECK-IMPORT-NEXT: - Type: +# RUN:wasm-ld --import-memory=foo -o %t.import.wasm %t.start.o +# RUN: obj2yaml %t.import.wasm | FileCheck -check-prefix=CHECK-IMPORT-DEFAULT %s + +# Verify that memory import module defaults to `env`, which is the default +# module for all imports. + +# CHECK-IMPORT-DEFAULT: - Type: IMPORT +# CHECK-IMPORT-DEFAULT-NEXT: Imports: +# CHECK-IMPORT-DEFAULT-NEXT: - Module: env +# CHECK-IMPORT-DEFAULT-NEXT: Field: foo +# CHECK-IMPORT-DEFAULT-NEXT: Kind: MEMORY +# CHECK-IMPORT-DEFAULT-NEXT: Memory: +# CHECK-IMPORT-DEFAULT-NEXT: Minimum: 0x2 +# CHECK-IMPORT-DEFAULT-NEXT: - Type: + # RUN:wasm-ld --import-memory=foo,bar --export-memory=qux -o %t.both.wasm %t.start.o # RUN: obj2yaml %t.both.wasm | FileCheck -check-prefix=CHECK-BOTH %s diff --git a/lld/test/wasm/mutable-global-exports.s b/lld/test/wasm/mutable-global-exports.s index 59308496ab4cc..1c10e92083b5c 100644 --- a/lld/test/wasm/mutable-global-exports.s +++ b/lld/test/wasm/mutable-global-exports.s @@ -16,6 +16,10 @@ .globl _start .globl foo_global +.globl bar_global + +.globaltype bar_global, i32, immutable +bar_global: .globaltype foo_global, i32 foo_global: @@ -33,6 +37,7 @@ _start: .ascii "atomics" # CHECK-ERR: mutable global exported but 'mutable-globals' feature not present in inputs: `foo_global`. 
Use --no-check-features to suppress +# CHECK-ERR-NOT: bar_global # CHECK: - Type: EXPORT # CHECK-NEXT: Exports: @@ -68,42 +73,48 @@ _start: # CHECK-ALL-NEXT: - Name: __wasm_call_ctors # CHECK-ALL-NEXT: Kind: FUNCTION # CHECK-ALL-NEXT: Index: 0 +# CHECK-ALL-NEXT: - Name: __stack_pointer +# CHECK-ALL-NEXT: Kind: GLOBAL +# CHECK-ALL-NEXT: Index: 0 # CHECK-ALL-NEXT: - Name: _start # CHECK-ALL-NEXT: Kind: FUNCTION # CHECK-ALL-NEXT: Index: 1 # CHECK-ALL-NEXT: - Name: foo_global # CHECK-ALL-NEXT: Kind: GLOBAL # CHECK-ALL-NEXT: Index: 1 -# CHECK-ALL-NEXT: - Name: __dso_handle +# CHECK-ALL-NEXT: - Name: bar_global # CHECK-ALL-NEXT: Kind: GLOBAL # CHECK-ALL-NEXT: Index: 2 -# CHECK-ALL-NEXT: - Name: __data_end +# CHECK-ALL-NEXT: - Name: __dso_handle # CHECK-ALL-NEXT: Kind: GLOBAL # CHECK-ALL-NEXT: Index: 3 -# CHECK-ALL-NEXT: - Name: __stack_low +# CHECK-ALL-NEXT: - Name: __data_end # CHECK-ALL-NEXT: Kind: GLOBAL # CHECK-ALL-NEXT: Index: 4 -# CHECK-ALL-NEXT: - Name: __stack_high +# CHECK-ALL-NEXT: - Name: __stack_low # CHECK-ALL-NEXT: Kind: GLOBAL # CHECK-ALL-NEXT: Index: 5 -# CHECK-ALL-NEXT: - Name: __global_base +# CHECK-ALL-NEXT: - Name: __stack_high # CHECK-ALL-NEXT: Kind: GLOBAL # CHECK-ALL-NEXT: Index: 6 -# CHECK-ALL-NEXT: - Name: __heap_base +# CHECK-ALL-NEXT: - Name: __global_base # CHECK-ALL-NEXT: Kind: GLOBAL # CHECK-ALL-NEXT: Index: 7 -# CHECK-ALL-NEXT: - Name: __heap_end +# CHECK-ALL-NEXT: - Name: __heap_base # CHECK-ALL-NEXT: Kind: GLOBAL # CHECK-ALL-NEXT: Index: 8 -# CHECK-ALL-NEXT: - Name: __memory_base +# CHECK-ALL-NEXT: - Name: __heap_end # CHECK-ALL-NEXT: Kind: GLOBAL # CHECK-ALL-NEXT: Index: 9 -# CHECK-ALL-NEXT: - Name: __table_base +# CHECK-ALL-NEXT: - Name: __memory_base # CHECK-ALL-NEXT: Kind: GLOBAL # CHECK-ALL-NEXT: Index: 10 -# CHECK-ALL-NEXT: - Name: __wasm_first_page_end +# CHECK-ALL-NEXT: - Name: __table_base # CHECK-ALL-NEXT: Kind: GLOBAL # CHECK-ALL-NEXT: Index: 11 +# CHECK-ALL-NEXT: - Name: __wasm_first_page_end +# CHECK-ALL-NEXT: Kind: GLOBAL +# CHECK-ALL-NEXT: Index: 12 # CHECK-ALL-NEXT: - Type: CODE # CHECK-ALL: Name: target_features diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index b57d77457b83a..9b85b6c00b26d 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -542,22 +542,19 @@ static void readConfigs(opt::InputArgList &args) { ctx.arg.noinhibitExec = args.hasArg(OPT_noinhibit_exec); if (args.hasArg(OPT_import_memory_with_name)) { - ctx.arg.memoryImport = - args.getLastArgValue(OPT_import_memory_with_name).split(","); + auto argValue = args.getLastArgValue(OPT_import_memory_with_name); + if (argValue.contains(',')) + ctx.arg.memoryImport = argValue.split(","); + else + ctx.arg.memoryImport = {defaultModule, argValue}; } else if (args.hasArg(OPT_import_memory)) { - ctx.arg.memoryImport = - std::pair(defaultModule, memoryName); - } else { - ctx.arg.memoryImport = - std::optional>(); + ctx.arg.memoryImport = {defaultModule, memoryName}; } if (args.hasArg(OPT_export_memory_with_name)) { ctx.arg.memoryExport = args.getLastArgValue(OPT_export_memory_with_name); } else if (args.hasArg(OPT_export_memory)) { ctx.arg.memoryExport = memoryName; - } else { - ctx.arg.memoryExport = std::optional(); } ctx.arg.sharedMemory = args.hasArg(OPT_shared_memory); @@ -748,8 +745,7 @@ static void setConfigs() { error("--export-memory is incompatible with --shared"); } if (!ctx.arg.memoryImport.has_value()) { - ctx.arg.memoryImport = std::pair( - defaultModule, memoryName); + ctx.arg.memoryImport = {defaultModule, memoryName}; } } diff --git a/lld/wasm/Writer.cpp 
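The wasm driver change above makes a bare --import-memory=foo import the memory from the default module instead of treating the whole value as "module,field". A small standalone sketch of that parse, assuming the default module is "env" as exercised by the memory-naming test:

#include <string>
#include <utility>

// Split an --import-memory-with-name value: "module,field" keeps both parts,
// a bare "field" falls back to the default import module.
std::pair<std::string, std::string>
parseMemoryImport(const std::string &value, const std::string &defaultModule) {
  size_t comma = value.find(',');
  if (comma == std::string::npos)
    return {defaultModule, value};
  return {value.substr(0, comma), value.substr(comma + 1)};
}

// parseMemoryImport("foo", "env")       -> {"env", "foo"}
// parseMemoryImport("mod,field", "env") -> {"mod", "field"}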
b/lld/wasm/Writer.cpp index b704677d36c93..9a5b56fc52e2f 100644 --- a/lld/wasm/Writer.cpp +++ b/lld/wasm/Writer.cpp @@ -576,7 +576,7 @@ void Writer::populateTargetFeatures() { if (ctx.isPic) { // This should not be necessary because all PIC objects should - // contain the mutable-globals feature. + // contain the `mutable-globals` feature. // TODO (https://github.com/llvm/llvm-project/issues/51681) allowed.insert("mutable-globals"); } @@ -703,10 +703,12 @@ void Writer::checkImportExportTargetFeatures() { } } for (const Symbol *sym : out.exportSec->exportedSymbols) { - if (isa(sym)) { - error(Twine("mutable global exported but 'mutable-globals' feature " - "not present in inputs: `") + - toString(*sym) + "`. Use --no-check-features to suppress."); + if (auto *global = dyn_cast(sym)) { + if (global->getGlobalType()->Mutable) { + error(Twine("mutable global exported but 'mutable-globals' feature " + "not present in inputs: `") + + toString(*sym) + "`. Use --no-check-features to suppress."); + } } } } @@ -782,6 +784,9 @@ void Writer::calculateExports() { unsigned globalIndex = out.importSec->getNumImportedGlobals() + out.globalSec->numGlobals(); + bool hasMutableGlobals = + out.targetFeaturesSec->features.count("mutable-globals") > 0; + for (Symbol *sym : symtab->symbols()) { if (!sym->isExported()) continue; @@ -799,7 +804,8 @@ void Writer::calculateExports() { } export_ = {name, WASM_EXTERNAL_FUNCTION, f->getExportedFunctionIndex()}; } else if (auto *g = dyn_cast(sym)) { - if (g->getGlobalType()->Mutable && !g->getFile() && !g->forceExport) { + if (!hasMutableGlobals && g->getGlobalType()->Mutable && !g->getFile() && + !g->isExportedExplicit()) { // Avoid exporting mutable globals are linker synthesized (e.g. // __stack_pointer or __tls_base) unless they are explicitly exported // from the command line. diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h index 250ad64b76d9a..06136ed40471d 100644 --- a/lldb/include/lldb/Core/Debugger.h +++ b/lldb/include/lldb/Core/Debugger.h @@ -181,7 +181,15 @@ class Debugger : public std::enable_shared_from_this, return m_target_list.GetSelectedTarget(); } + /// Get the execution context representing the selected entities in the + /// selected target. ExecutionContext GetSelectedExecutionContext(); + + /// Similar to GetSelectedExecutionContext but returns a + /// ExecutionContextRef, and will hold the dummy target if no target is + /// currently selected. + ExecutionContextRef GetSelectedExecutionContextRef(); + /// Get accessor for the target list. /// /// The target list is part of the global debugger object. This the single @@ -419,7 +427,7 @@ class Debugger : public std::enable_shared_from_this, void CancelInterruptRequest(); /// Redraw the statusline if enabled. - void RedrawStatusline(bool update = true); + void RedrawStatusline(std::optional exe_ctx_ref); /// This is the correct way to query the state of Interruption. 
/// If you are on the RunCommandInterpreter thread, it will check the @@ -701,9 +709,9 @@ class Debugger : public std::enable_shared_from_this, void HandleBreakpointEvent(const lldb::EventSP &event_sp); - void HandleProcessEvent(const lldb::EventSP &event_sp); + lldb::ProcessSP HandleProcessEvent(const lldb::EventSP &event_sp); - void HandleThreadEvent(const lldb::EventSP &event_sp); + lldb::ThreadSP HandleThreadEvent(const lldb::EventSP &event_sp); void HandleProgressEvent(const lldb::EventSP &event_sp); diff --git a/lldb/include/lldb/Core/Opcode.h b/lldb/include/lldb/Core/Opcode.h index 7bbd73d039f99..7e756d3f15d22 100644 --- a/lldb/include/lldb/Core/Opcode.h +++ b/lldb/include/lldb/Core/Opcode.h @@ -223,7 +223,9 @@ class Opcode { int Dump(Stream *s, uint32_t min_byte_width) const; const void *GetOpcodeBytes() const { - return ((m_type == Opcode::eTypeBytes) ? m_data.inst.bytes : nullptr); + return ((m_type == Opcode::eTypeBytes || m_type == Opcode::eType16_32Tuples) + ? m_data.inst.bytes + : nullptr); } uint32_t GetByteSize() const { diff --git a/lldb/include/lldb/Core/Statusline.h b/lldb/include/lldb/Core/Statusline.h index 6bda153f822d2..a5ab1927b57f5 100644 --- a/lldb/include/lldb/Core/Statusline.h +++ b/lldb/include/lldb/Core/Statusline.h @@ -9,6 +9,8 @@ #ifndef LLDB_CORE_STATUSLINE_H #define LLDB_CORE_STATUSLINE_H +#include "lldb/Symbol/SymbolContext.h" +#include "lldb/Target/ExecutionContext.h" #include "lldb/lldb-forward.h" #include #include @@ -19,15 +21,16 @@ class Statusline { Statusline(Debugger &debugger); ~Statusline(); + using Context = std::pair; + /// Reduce the scroll window and draw the statusline. - void Enable(); + void Enable(std::optional exe_ctx_ref); /// Hide the statusline and extend the scroll window. void Disable(); - /// Redraw the statusline. If update is false, this will redraw the last - /// string. - void Redraw(bool update = true); + /// Redraw the statusline. + void Redraw(std::optional exe_ctx_ref); /// Inform the statusline that the terminal dimensions have changed. void TerminalSizeChanged(); @@ -46,7 +49,11 @@ class Statusline { void UpdateScrollWindow(ScrollWindowMode mode); Debugger &m_debugger; - std::string m_last_str; + + /// Cached copy of the execution context that allows us to redraw the + /// statusline. + ExecutionContextRef m_exe_ctx_ref; + uint64_t m_terminal_width = 0; uint64_t m_terminal_height = 0; }; diff --git a/lldb/include/lldb/Target/ExecutionContext.h b/lldb/include/lldb/Target/ExecutionContext.h index f105e38fa69aa..fe8bce7f69713 100644 --- a/lldb/include/lldb/Target/ExecutionContext.h +++ b/lldb/include/lldb/Target/ExecutionContext.h @@ -92,10 +92,21 @@ class ExecutionContextRef { /// Construct using the target and all the selected items inside of it (the /// process and its selected thread, and the thread's selected frame). If - /// there is no selected thread, default to the first thread If there is no + /// there is no selected thread, default to the first thread. If there is no /// selected frame, default to the first frame. ExecutionContextRef(Target *target, bool adopt_selected); + /// Construct using the process and all the selected items inside of it ( + /// the selected thread, and the thread's selected frame). If + /// there is no selected thread, default to the first thread. If there is no + /// selected frame, default to the first frame. + ExecutionContextRef(Process *process, bool adopt_selected); + + /// Construct using the thread and all the selected items inside of it ( the + /// selected frame). 
If there is no selected frame, default to the first + /// frame. + ExecutionContextRef(Thread *thread, bool adopt_selected); + /// Construct using an execution context scope. /// /// If the ExecutionContextScope object is valid and refers to a frame, make @@ -199,9 +210,9 @@ class ExecutionContextRef { void SetTargetPtr(Target *target, bool adopt_selected); - void SetProcessPtr(Process *process); + void SetProcessPtr(Process *process, bool adopt_selected = false); - void SetThreadPtr(Thread *thread); + void SetThreadPtr(Thread *thread, bool adopt_selected = false); void SetFramePtr(StackFrame *frame); diff --git a/lldb/packages/Python/lldbsuite/test/cpu_feature.py b/lldb/packages/Python/lldbsuite/test/cpu_feature.py new file mode 100644 index 0000000000000..b46a5acc596f0 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/cpu_feature.py @@ -0,0 +1,79 @@ +""" +Platform-agnostic helper to query for CPU features. +""" + +import re + + +class CPUFeature: + def __init__(self, linux_cpu_info_flag: str = None, darwin_sysctl_key: str = None): + self.cpu_info_flag = linux_cpu_info_flag + self.sysctl_key = darwin_sysctl_key + + def __str__(self): + for arch_class in ALL_ARCHS: + for feat_var in dir(arch_class): + if self == getattr(arch_class, feat_var): + return f"{arch_class.__name__}.{feat_var}" + raise AssertionError("unreachable") + + def is_supported(self, triple, cmd_runner): + if re.match(".*-.*-linux", triple): + err_msg, res = self._is_supported_linux(cmd_runner) + elif re.match(".*-apple-.*", triple): + err_msg, res = self._is_supported_darwin(cmd_runner) + else: + err_msg, res = None, False + + if err_msg: + print(f"CPU feature check failed: {err_msg}") + + return res + + def _is_supported_linux(self, cmd_runner): + if not self.cpu_info_flag: + return f"Unspecified cpuinfo flag for {self}", False + + cmd = "cat /proc/cpuinfo" + err, retcode, output = cmd_runner(cmd) + if err.Fail() or retcode != 0: + return output, False + + # Assume that every processor presents the same features. + # Look for the first "Features: ...." line. Features are space separated. 
+ if m := re.search(r"Features\s*: (.*)\n", output): + features = m.group(1).split() + return None, (self.cpu_info_flag in features) + + return 'No "Features:" line found in /proc/cpuinfo', False + + def _is_supported_darwin(self, cmd_runner): + if not self.sysctl_key: + return f"Unspecified sysctl key for {self}", False + + cmd = f"sysctl -n {self.sysctl_key}" + err, retcode, output = cmd_runner(cmd) + if err.Fail() or retcode != 0: + return output, False + + return None, (output.strip() == "1") + + +class AArch64: + FPMR = CPUFeature("fpmr") + GCS = CPUFeature("gcs") + MTE = CPUFeature("mte") + MTE_STORE_ONLY = CPUFeature("mtestoreonly") + PTR_AUTH = CPUFeature("paca", "hw.optional.arm.FEAT_PAuth2") + SME = CPUFeature("sme", "hw.optional.arm.FEAT_SME") + SME_FA64 = CPUFeature("smefa64") + SME2 = CPUFeature("sme2", "hw.optional.arm.FEAT_SME2") + SVE = CPUFeature("sve") + + +class Loong: + LASX = CPUFeature("lasx") + LSX = CPUFeature("lsx") + + +ALL_ARCHS = [AArch64, Loong] diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py index 16a58cfc10b9a..454196e1b0264 100644 --- a/lldb/packages/Python/lldbsuite/test/decorators.py +++ b/lldb/packages/Python/lldbsuite/test/decorators.py @@ -27,6 +27,7 @@ from lldbsuite.support import temp_file from lldbsuite.test import lldbplatform from lldbsuite.test import lldbplatformutil +from lldbsuite.test.cpu_feature import CPUFeature class DecorateMode: @@ -1131,24 +1132,13 @@ def skipIfLLVMTargetMissing(target): return unittest.skipIf(not found, "requires " + target) -# Call sysctl on darwin to see if a specified hardware feature is available on this machine. -def skipUnlessFeature(feature): - def is_feature_enabled(): - if platform.system() == "Darwin": - try: - output = subprocess.check_output( - ["/usr/sbin/sysctl", feature], stderr=subprocess.DEVNULL - ).decode("utf-8") - # If 'feature: 1' was output, then this feature is available and - # the test should not be skipped. - if re.match(r"%s: 1\s*" % feature, output): - return None - else: - return "%s is not supported on this system." % feature - except subprocess.CalledProcessError: - return "%s is not supported on this system." % feature +def skipUnlessFeature(cpu_feature: CPUFeature): + def hasFeature(test_case): + if not test_case.isSupported(cpu_feature): + return f"Unsupported CPU feature: {cpu_feature}" + return None - return skipTestIfFn(is_feature_enabled) + return skipTestIfFn(hasFeature) def skipIfBuildType(types: list[str]): diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index b7077f8d8cc5c..8074922723440 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -48,6 +48,7 @@ # LLDB modules import lldb from . import configuration +from . import cpu_feature from . import decorators from . import lldbplatformutil from . 
import lldbtest_config @@ -1315,39 +1316,6 @@ def isPPC64le(self): return True return False - def getCPUInfo(self): - triple = self.dbg.GetSelectedPlatform().GetTriple() - - # TODO other platforms, please implement this function - if not re.match(".*-.*-linux", triple): - return "" - - # Need to do something different for non-Linux/Android targets - cpuinfo_path = self.getBuildArtifact("cpuinfo") - if configuration.lldb_platform_name: - self.runCmd( - 'platform get-file "/proc/cpuinfo" ' + cpuinfo_path, check=False - ) - if not self.res.Succeeded(): - if self.TraceOn(): - print( - 'Failed to get /proc/cpuinfo from remote: "{}"'.format( - self.res.GetOutput().strip() - ) - ) - print("All cpuinfo feature checks will fail.") - return "" - else: - cpuinfo_path = "/proc/cpuinfo" - - try: - with open(cpuinfo_path, "r") as f: - cpuinfo = f.read() - except: - return "" - - return cpuinfo - def isAArch64(self): """Returns true if the architecture is AArch64.""" arch = self.getArchitecture().lower() @@ -1360,39 +1328,47 @@ def isARM(self): self.getArchitecture().lower().startswith("arm") ) + def isSupported(self, cpu_feature: cpu_feature.CPUFeature): + triple = self.dbg.GetSelectedPlatform().GetTriple() + cmd_runner = self.run_platform_command + return cpu_feature.is_supported(triple, cmd_runner) + def isAArch64SVE(self): - return self.isAArch64() and "sve" in self.getCPUInfo() + return self.isAArch64() and self.isSupported(cpu_feature.AArch64.SVE) def isAArch64SME(self): - return self.isAArch64() and "sme" in self.getCPUInfo() + return self.isAArch64() and self.isSupported(cpu_feature.AArch64.SME) def isAArch64SME2(self): # If you have sme2, you also have sme. - return self.isAArch64() and "sme2" in self.getCPUInfo() + return self.isAArch64() and self.isSupported(cpu_feature.AArch64.SME2) def isAArch64SMEFA64(self): # smefa64 allows the use of the full A64 instruction set in streaming # mode. This is required by certain test programs to setup register # state. 
- cpuinfo = self.getCPUInfo() - return self.isAArch64() and "sme" in cpuinfo and "smefa64" in cpuinfo + return ( + self.isAArch64() + and self.isSupported(cpu_feature.AArch64.SME) + and self.isSupported(cpu_feature.AArch64.SME_FA64) + ) def isAArch64MTE(self): - return self.isAArch64() and "mte" in self.getCPUInfo() + return self.isAArch64() and self.isSupported(cpu_feature.AArch64.MTE) def isAArch64MTEStoreOnly(self): - return self.isAArch64() and "mtestoreonly" in self.getCPUInfo() + return self.isAArch64() and self.isSupported(cpu_feature.AArch64.MTE_STORE_ONLY) def isAArch64GCS(self): - return self.isAArch64() and "gcs" in self.getCPUInfo() + return self.isAArch64() and self.isSupported(cpu_feature.AArch64.GCS) def isAArch64PAuth(self): if self.getArchitecture() == "arm64e": return True - return self.isAArch64() and "paca" in self.getCPUInfo() + return self.isAArch64() and self.isSupported(cpu_feature.AArch64.PTR_AUTH) def isAArch64FPMR(self): - return self.isAArch64() and "fpmr" in self.getCPUInfo() + return self.isAArch64() and self.isSupported(cpu_feature.AArch64.FPMR) def isAArch64Windows(self): """Returns true if the architecture is AArch64 and platform windows.""" @@ -1407,10 +1383,10 @@ def isLoongArch(self): return arch in ["loongarch64", "loongarch32"] def isLoongArchLSX(self): - return self.isLoongArch() and "lsx" in self.getCPUInfo() + return self.isLoongArch() and self.isSupported(cpu_feature.Loong.LSX) def isLoongArchLASX(self): - return self.isLoongArch() and "lasx" in self.getCPUInfo() + return self.isLoongArch() and self.isSupported(cpu_feature.Loong.LASX) def isRISCV(self): """Returns true if the architecture is RISCV64 or RISCV32.""" diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt index fdd6b3b077463..ce59ee505cd3d 100644 --- a/lldb/source/API/CMakeLists.txt +++ b/lldb/source/API/CMakeLists.txt @@ -299,6 +299,8 @@ set(generated_public_headers ${LLDB_OBJ_DIR}/include/lldb/API/SBLanguages.h) file(GLOB root_public_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-*.h) file(GLOB root_private_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-private*.h) list(REMOVE_ITEM root_public_headers ${root_private_headers}) +# Skip the initial copy of lldb-defines.h. The fixed version is generated at build time. +list(REMOVE_ITEM root_public_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-defines.h) find_program(unifdef_EXECUTABLE unifdef) diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index ed674ee1275c7..568cd9d3d03b6 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -253,16 +253,18 @@ Status Debugger::SetPropertyValue(const ExecutionContext *exe_ctx, // Statusline setting changed. If we have a statusline instance, update it // now. Otherwise it will get created in the default event handler. std::lock_guard guard(m_statusline_mutex); - if (StatuslineSupported()) + if (StatuslineSupported()) { m_statusline.emplace(*this); - else + m_statusline->Enable(GetSelectedExecutionContextRef()); + } else { m_statusline.reset(); + } } else if (property_path == g_debugger_properties[ePropertyStatuslineFormat].name || property_path == g_debugger_properties[ePropertySeparator].name) { // Statusline format changed. Redraw the statusline. - RedrawStatusline(); + RedrawStatusline(std::nullopt); } else if (property_path == g_debugger_properties[ePropertyUseSourceCache].name) { // use-source-cache changed. 
Wipe out the cache contents if it was @@ -501,7 +503,7 @@ FormatEntity::Entry Debugger::GetStatuslineFormat() const { bool Debugger::SetStatuslineFormat(const FormatEntity::Entry &format) { constexpr uint32_t idx = ePropertyStatuslineFormat; bool ret = SetPropertyAtIndex(idx, format); - RedrawStatusline(); + RedrawStatusline(std::nullopt); return ret; } @@ -526,7 +528,7 @@ llvm::StringRef Debugger::GetDisabledAnsiSuffix() const { bool Debugger::SetSeparator(llvm::StringRef s) { constexpr uint32_t idx = ePropertySeparator; bool ret = SetPropertyAtIndex(idx, s); - RedrawStatusline(); + RedrawStatusline(std::nullopt); return ret; } @@ -1210,14 +1212,18 @@ void Debugger::RestoreInputTerminalState() { { std::lock_guard guard(m_statusline_mutex); if (m_statusline) - m_statusline->Enable(); + m_statusline->Enable(GetSelectedExecutionContext()); } } -void Debugger::RedrawStatusline(bool update) { +void Debugger::RedrawStatusline( + std::optional exe_ctx_ref) { std::lock_guard guard(m_statusline_mutex); - if (m_statusline) - m_statusline->Redraw(update); + + if (!m_statusline) + return; + + m_statusline->Redraw(exe_ctx_ref); } ExecutionContext Debugger::GetSelectedExecutionContext() { @@ -1226,6 +1232,13 @@ ExecutionContext Debugger::GetSelectedExecutionContext() { return ExecutionContext(exe_ctx_ref); } +ExecutionContextRef Debugger::GetSelectedExecutionContextRef() { + if (TargetSP selected_target_sp = GetSelectedTarget()) + return ExecutionContextRef(selected_target_sp.get(), + /*adopt_selected=*/true); + return ExecutionContextRef(m_dummy_target_sp.get(), /*adopt_selected=*/false); +} + void Debugger::DispatchInputInterrupt() { std::lock_guard guard(m_io_handler_stack.GetMutex()); IOHandlerSP reader_sp(m_io_handler_stack.Top()); @@ -1941,8 +1954,7 @@ void Debugger::FlushProcessOutput(Process &process, bool flush_stdout, } // This function handles events that were broadcast by the process. -void Debugger::HandleProcessEvent(const EventSP &event_sp) { - using namespace lldb; +ProcessSP Debugger::HandleProcessEvent(const EventSP &event_sp) { const uint32_t event_type = event_sp->GetType(); ProcessSP process_sp = (event_type == Process::eBroadcastBitStructuredData) @@ -2024,23 +2036,24 @@ void Debugger::HandleProcessEvent(const EventSP &event_sp) { if (pop_process_io_handler) process_sp->PopProcessIOHandler(); } + return process_sp; } -void Debugger::HandleThreadEvent(const EventSP &event_sp) { +ThreadSP Debugger::HandleThreadEvent(const EventSP &event_sp) { // At present the only thread event we handle is the Frame Changed event, and // all we do for that is just reprint the thread status for that thread. 
- using namespace lldb; const uint32_t event_type = event_sp->GetType(); const bool stop_format = true; + ThreadSP thread_sp; if (event_type == Thread::eBroadcastBitStackChanged || event_type == Thread::eBroadcastBitThreadSelected) { - ThreadSP thread_sp( - Thread::ThreadEventData::GetThreadFromEvent(event_sp.get())); + thread_sp = Thread::ThreadEventData::GetThreadFromEvent(event_sp.get()); if (thread_sp) { thread_sp->GetStatus(*GetAsyncOutputStream(), 0, 1, 1, stop_format, /*show_hidden*/ true); } } + return thread_sp; } bool Debugger::IsForwardingEvents() { return (bool)m_forward_listener_sp; } @@ -2068,6 +2081,11 @@ bool Debugger::StatuslineSupported() { return false; } +static bool RequiresFollowChildWorkaround(const Process &process) { + // FIXME: https://github.com/llvm/llvm-project/issues/160216 + return process.GetFollowForkMode() == eFollowChild; +} + lldb::thread_result_t Debugger::DefaultEventHandler() { ListenerSP listener_sp(GetListener()); ConstString broadcaster_class_target(Target::GetStaticBroadcasterClass()); @@ -2109,28 +2127,37 @@ lldb::thread_result_t Debugger::DefaultEventHandler() { if (StatuslineSupported()) { std::lock_guard guard(m_statusline_mutex); - if (!m_statusline) + if (!m_statusline) { m_statusline.emplace(*this); + m_statusline->Enable(GetSelectedExecutionContextRef()); + } } bool done = false; while (!done) { EventSP event_sp; if (listener_sp->GetEvent(event_sp, std::nullopt)) { + std::optional exe_ctx_ref = std::nullopt; if (event_sp) { Broadcaster *broadcaster = event_sp->GetBroadcaster(); if (broadcaster) { uint32_t event_type = event_sp->GetType(); ConstString broadcaster_class(broadcaster->GetBroadcasterClass()); if (broadcaster_class == broadcaster_class_process) { - HandleProcessEvent(event_sp); + if (ProcessSP process_sp = HandleProcessEvent(event_sp)) + if (!RequiresFollowChildWorkaround(*process_sp)) + exe_ctx_ref = ExecutionContextRef(process_sp.get(), + /*adopt_selected=*/true); } else if (broadcaster_class == broadcaster_class_target) { if (Breakpoint::BreakpointEventData::GetEventDataFromEvent( event_sp.get())) { HandleBreakpointEvent(event_sp); } } else if (broadcaster_class == broadcaster_class_thread) { - HandleThreadEvent(event_sp); + if (ThreadSP thread_sp = HandleThreadEvent(event_sp)) + if (!RequiresFollowChildWorkaround(*thread_sp->GetProcess())) + exe_ctx_ref = ExecutionContextRef(thread_sp.get(), + /*adopt_selected=*/true); } else if (broadcaster == m_command_interpreter_up.get()) { if (event_type & CommandInterpreter::eBroadcastBitQuitCommandReceived) { @@ -2168,7 +2195,7 @@ lldb::thread_result_t Debugger::DefaultEventHandler() { if (m_forward_listener_sp) m_forward_listener_sp->AddEvent(event_sp); } - RedrawStatusline(); + RedrawStatusline(exe_ctx_ref); } } diff --git a/lldb/source/Core/IOHandler.cpp b/lldb/source/Core/IOHandler.cpp index f65a1113f3592..57819eeade6e8 100644 --- a/lldb/source/Core/IOHandler.cpp +++ b/lldb/source/Core/IOHandler.cpp @@ -442,7 +442,7 @@ void IOHandlerEditline::AutoCompleteCallback(CompletionRequest &request) { } void IOHandlerEditline::RedrawCallback() { - m_debugger.RedrawStatusline(/*update=*/false); + m_debugger.RedrawStatusline(std::nullopt); } #endif diff --git a/lldb/source/Core/Statusline.cpp b/lldb/source/Core/Statusline.cpp index 393d427241021..bfbd190fba27c 100644 --- a/lldb/source/Core/Statusline.cpp +++ b/lldb/source/Core/Statusline.cpp @@ -35,9 +35,7 @@ using namespace lldb_private; Statusline::Statusline(Debugger &debugger) : m_debugger(debugger), 
m_terminal_width(m_debugger.GetTerminalWidth()), - m_terminal_height(m_debugger.GetTerminalHeight()) { - Enable(); -} + m_terminal_height(m_debugger.GetTerminalHeight()) {} Statusline::~Statusline() { Disable(); } @@ -47,16 +45,16 @@ void Statusline::TerminalSizeChanged() { UpdateScrollWindow(ResizeStatusline); - // Draw the old statusline. - Redraw(/*update=*/false); + // Redraw the old statusline. + Redraw(std::nullopt); } -void Statusline::Enable() { +void Statusline::Enable(std::optional exe_ctx_ref) { // Reduce the scroll window to make space for the status bar below. UpdateScrollWindow(EnableStatusline); // Draw the statusline. - Redraw(/*update=*/true); + Redraw(exe_ctx_ref); } void Statusline::Disable() { @@ -69,8 +67,6 @@ void Statusline::Draw(std::string str) { if (!stream_sp) return; - m_last_str = str; - str = ansi::TrimAndPad(str, m_terminal_width); LockedStreamFile locked_stream = stream_sp->Lock(); @@ -127,33 +123,32 @@ void Statusline::UpdateScrollWindow(ScrollWindowMode mode) { m_debugger.RefreshIOHandler(); } -void Statusline::Redraw(bool update) { - if (!update) { - Draw(m_last_str); - return; - } - - ExecutionContext exe_ctx = m_debugger.GetSelectedExecutionContext(); - - // For colors and progress events, the format entity needs access to the - // debugger, which requires a target in the execution context. - if (!exe_ctx.HasTargetScope()) - exe_ctx.SetTargetPtr(&m_debugger.GetSelectedOrDummyTarget()); - - SymbolContext symbol_ctx; - if (ProcessSP process_sp = exe_ctx.GetProcessSP()) { - // Check if the process is stopped, and if it is, make sure it remains - // stopped until we've computed the symbol context. - Process::StopLocker stop_locker; - if (stop_locker.TryLock(&process_sp->GetRunLock())) { - if (auto frame_sp = exe_ctx.GetFrameSP()) - symbol_ctx = frame_sp->GetSymbolContext(eSymbolContextEverything); - } +void Statusline::Redraw(std::optional exe_ctx_ref) { + // Update the cached execution context. + if (exe_ctx_ref) + m_exe_ctx_ref = *exe_ctx_ref; + + // Lock the execution context. + ExecutionContext exe_ctx = + m_exe_ctx_ref.Lock(/*thread_and_frame_only_if_stopped=*/false); + + // Compute the symbol context if we're stopped. + SymbolContext sym_ctx; + llvm::Expected stopped_exe_ctx = + GetStoppedExecutionContext(&m_exe_ctx_ref); + if (stopped_exe_ctx) { + // The StoppedExecutionContext only ensures that we hold the run lock. + // The process could be in an exited or unloaded state and have no frame. + if (auto frame_sp = stopped_exe_ctx->GetFrameSP()) + sym_ctx = frame_sp->GetSymbolContext(eSymbolContextEverything); + } else { + // We can draw the statusline without being stopped. 
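The statusline now caches an ExecutionContextRef: callers either hand it a fresh one (process and thread events) or pass std::nullopt to redraw with the last known context (terminal resizes, format changes, editline redraws). A stripped-down sketch of that cache-or-reuse pattern, with a toy Context type rather than lldb's:

#include <iostream>
#include <optional>
#include <string>

// Illustrative stand-in for lldb's ExecutionContextRef.
struct ContextRef { std::string description = "<empty>"; };

class StatusBar {
public:
  // Pass a context to refresh the cache, or std::nullopt to redraw the
  // previously cached one.
  void redraw(std::optional<ContextRef> ctx) {
    if (ctx)
      cached = *ctx;
    std::cout << "[status] " << cached.description << '\n';
  }

private:
  ContextRef cached;
};

int main() {
  StatusBar bar;
  bar.redraw(ContextRef{"process stopped at main"}); // an event supplies a context
  bar.redraw(std::nullopt);                          // resize: reuse the cached one
}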
+ llvm::consumeError(stopped_exe_ctx.takeError()); } StreamString stream; FormatEntity::Entry format = m_debugger.GetStatuslineFormat(); - FormatEntity::Format(format, stream, &symbol_ctx, &exe_ctx, nullptr, nullptr, + FormatEntity::Format(format, stream, &sym_ctx, &exe_ctx, nullptr, nullptr, false, false); Draw(stream.GetString().str()); diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp index 6885977baa24e..924953cc43fa2 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp @@ -1307,6 +1307,10 @@ ClangExpressionParser::ParseInternal(DiagnosticManager &diagnostic_manager, m_compiler->setSema(nullptr); adapter->EndSourceFile(); + // Creating persistent variables can trigger diagnostic emission. + // Make sure we reset the manager so we don't get asked to handle + // diagnostics after we finished parsing. + adapter->ResetManager(); unsigned num_errors = adapter->getNumErrors(); @@ -1322,8 +1326,6 @@ ClangExpressionParser::ParseInternal(DiagnosticManager &diagnostic_manager, type_system_helper->CommitPersistentDecls(); } - adapter->ResetManager(); - return num_errors; } diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ModuleDependencyCollector.h b/lldb/source/Plugins/ExpressionParser/Clang/ModuleDependencyCollector.h index 4fe727460fdb9..dcba0d9c34962 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ModuleDependencyCollector.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ModuleDependencyCollector.h @@ -19,8 +19,8 @@ class ModuleDependencyCollectorAdaptor public: ModuleDependencyCollectorAdaptor( std::shared_ptr file_collector) - : clang::ModuleDependencyCollector(""), m_file_collector(file_collector) { - } + : clang::ModuleDependencyCollector("", llvm::vfs::getRealFileSystem()), + m_file_collector(file_collector) {} void addFile(llvm::StringRef Filename, llvm::StringRef FileDst = {}) override { diff --git a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp index 5e429a92613ce..5c1b7d4943b3f 100644 --- a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp +++ b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.cpp @@ -33,6 +33,10 @@ LLDB_PLUGIN_DEFINE_ADV(EmulateInstructionRISCV, InstructionRISCV) namespace lldb_private { +// RISC-V General Purpose Register numbers +static constexpr uint32_t RISCV_GPR_SP = 2; // x2 is the stack pointer +static constexpr uint32_t RISCV_GPR_FP = 8; // x8 is the frame pointer + /// Returns all values wrapped in Optional, or std::nullopt if any of the values /// is std::nullopt. 
template @@ -108,6 +112,16 @@ static uint32_t FPREncodingToLLDB(uint32_t reg_encode) { return LLDB_INVALID_REGNUM; } +// Helper function to get register info from GPR encoding +static std::optional +GPREncodingToRegisterInfo(EmulateInstructionRISCV &emulator, + uint32_t reg_encode) { + uint32_t lldb_reg = GPREncodingToLLDB(reg_encode); + if (lldb_reg == LLDB_INVALID_REGNUM) + return std::nullopt; + return emulator.GetRegisterInfo(eRegisterKindLLDB, lldb_reg); +} + bool Rd::Write(EmulateInstructionRISCV &emulator, uint64_t value) { uint32_t lldb_reg = GPREncodingToLLDB(rd); EmulateInstruction::Context ctx; @@ -230,10 +244,34 @@ Load(EmulateInstructionRISCV &emulator, I inst, uint64_t (*extend)(E)) { auto addr = LoadStoreAddr(emulator, inst); if (!addr) return false; - return transformOptional( - emulator.ReadMem(*addr), - [&](T t) { return inst.rd.Write(emulator, extend(E(t))); }) - .value_or(false); + + // Set up context for the load operation, similar to ARM64. + EmulateInstructionRISCV::Context context; + + // Get register info for base register + std::optional reg_info_rs1 = + GPREncodingToRegisterInfo(emulator, inst.rs1.rs); + + if (!reg_info_rs1) + return false; + + // Set context type based on whether this is a stack-based load. + if (inst.rs1.rs == RISCV_GPR_SP) + context.type = EmulateInstruction::eContextPopRegisterOffStack; + else + context.type = EmulateInstruction::eContextRegisterLoad; + + // Set the context address information + context.SetAddress(*addr); + + // Read from memory with context and write to register. + bool success = false; + uint64_t value = + emulator.ReadMemoryUnsigned(context, *addr, sizeof(T), 0, &success); + if (!success) + return false; + + return inst.rd.Write(emulator, extend(E(T(value)))); } template @@ -242,9 +280,35 @@ Store(EmulateInstructionRISCV &emulator, I inst) { auto addr = LoadStoreAddr(emulator, inst); if (!addr) return false; - return transformOptional( - inst.rs2.Read(emulator), - [&](uint64_t rs2) { return emulator.WriteMem(*addr, rs2); }) + + // Set up context for the store operation, similar to ARM64. + EmulateInstructionRISCV::Context context; + + // Get register info for source and base registers. + std::optional reg_info_rs1 = + GPREncodingToRegisterInfo(emulator, inst.rs1.rs); + std::optional reg_info_rs2 = + GPREncodingToRegisterInfo(emulator, inst.rs2.rs); + + if (!reg_info_rs1 || !reg_info_rs2) + return false; + + // Set context type based on whether this is a stack-based store. + if (inst.rs1.rs == RISCV_GPR_SP) + context.type = EmulateInstruction::eContextPushRegisterOnStack; + else + context.type = EmulateInstruction::eContextRegisterStore; + + // Set the context to show which register is being stored to which base + // register + offset. + context.SetRegisterToRegisterPlusOffset(*reg_info_rs2, *reg_info_rs1, + SignExt(inst.imm)); + + return transformOptional(inst.rs2.Read(emulator), + [&](uint64_t rs2) { + return emulator.WriteMemoryUnsigned( + context, *addr, rs2, sizeof(T)); + }) .value_or(false); } @@ -737,11 +801,44 @@ class Executor { bool operator()(SH inst) { return Store(m_emu, inst); } bool operator()(SW inst) { return Store(m_emu, inst); } bool operator()(ADDI inst) { - return transformOptional(inst.rs1.ReadI64(m_emu), - [&](int64_t rs1) { - return inst.rd.Write( - m_emu, rs1 + int64_t(SignExt(inst.imm))); - }) + return transformOptional( + inst.rs1.ReadI64(m_emu), + [&](int64_t rs1) { + uint64_t result = rs1 + uint64_t(SignExt(inst.imm)); + // Check if this is a stack pointer adjustment. 
+ if (inst.rd.rd == RISCV_GPR_SP && + inst.rs1.rs == RISCV_GPR_SP) { + EmulateInstruction::Context context; + context.type = + EmulateInstruction::eContextAdjustStackPointer; + context.SetImmediateSigned(SignExt(inst.imm)); + uint32_t sp_lldb_reg = GPREncodingToLLDB(RISCV_GPR_SP); + RegisterValue registerValue; + registerValue.SetUInt64(result); + return m_emu.WriteRegister(context, eRegisterKindLLDB, + sp_lldb_reg, registerValue); + } + // Check if this is setting up the frame pointer. + // addi fp, sp, imm -> fp = sp + imm (frame pointer setup). + if (inst.rd.rd == RISCV_GPR_FP && + inst.rs1.rs == RISCV_GPR_SP) { + EmulateInstruction::Context context; + context.type = EmulateInstruction::eContextSetFramePointer; + auto sp_reg_info = m_emu.GetRegisterInfo( + eRegisterKindLLDB, GPREncodingToLLDB(RISCV_GPR_SP)); + if (sp_reg_info) { + context.SetRegisterPlusOffset(*sp_reg_info, + SignExt(inst.imm)); + } + uint32_t fp_lldb_reg = GPREncodingToLLDB(RISCV_GPR_FP); + RegisterValue registerValue; + registerValue.SetUInt64(result); + return m_emu.WriteRegister(context, eRegisterKindLLDB, + fp_lldb_reg, registerValue); + } + // Regular ADDI instruction. + return inst.rd.Write(m_emu, result); + }) .value_or(false); } bool operator()(SLTI inst) { @@ -1745,6 +1842,61 @@ EmulateInstructionRISCV::GetRegisterInfo(RegisterKind reg_kind, return array[reg_index]; } +bool EmulateInstructionRISCV::SetInstruction(const Opcode &opcode, + const Address &inst_addr, + Target *target) { + // Call the base class implementation. + if (!EmulateInstruction::SetInstruction(opcode, inst_addr, target)) + return false; + + // Extract instruction data from the opcode. + uint32_t inst_data = 0; + const void *opcode_data = m_opcode.GetOpcodeBytes(); + if (!opcode_data) + return false; + + if (m_opcode.GetByteSize() == 2) { + // 16-bit compressed instruction. + const uint16_t *data = static_cast(opcode_data); + inst_data = *data; + } else if (m_opcode.GetByteSize() == 4) { + // 32-bit instruction. + const uint32_t *data = static_cast(opcode_data); + inst_data = *data; + } else { + return false; + } + + // Decode the instruction. + auto decoded_inst = Decode(inst_data); + if (!decoded_inst) + return false; + + // Store the decoded result. 
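The ADDI special cases above let the emulator tag stack-pointer adjustments and frame-pointer setup so prologue analysis can build an unwind plan. A toy classifier showing how the register numbers map onto the contexts named in the patch (x2 = sp, x8 = fp/s0; the prologue in the trailing comment is only an example):

#include <cstdint>
#include <string>

// Mirrors the ADDI handling: rd/rs1 are raw GPR numbers, imm is the
// sign-extended immediate.
std::string classifyAddi(uint32_t rd, uint32_t rs1, int64_t imm) {
  constexpr uint32_t SP = 2, FP = 8;
  if (rd == SP && rs1 == SP)
    return "eContextAdjustStackPointer (sp += " + std::to_string(imm) + ")";
  if (rd == FP && rs1 == SP)
    return "eContextSetFramePointer (fp = sp + " + std::to_string(imm) + ")";
  return "plain register write";
}

// A typical prologue:
//   addi sp, sp, -16   -> classifyAddi(2, 2, -16): adjust stack pointer
//   sd   ra, 8(sp)     -> store with rs1 == sp:    push register on stack
//   addi s0, sp, 16    -> classifyAddi(8, 2, 16):  set frame pointer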
+ m_decoded = *decoded_inst; + return true; +} + +bool EmulateInstructionRISCV::CreateFunctionEntryUnwind( + UnwindPlan &unwind_plan) { + unwind_plan.Clear(); + unwind_plan.SetRegisterKind(eRegisterKindLLDB); + + UnwindPlan::Row row; + + row.GetCFAValue().SetIsRegisterPlusOffset(gpr_sp_riscv, 0); + row.SetRegisterLocationToSame(gpr_ra_riscv, /*must_replace=*/false); + row.SetRegisterLocationToSame(gpr_fp_riscv, /*must_replace=*/false); + + unwind_plan.AppendRow(std::move(row)); + unwind_plan.SetSourceName("EmulateInstructionRISCV"); + unwind_plan.SetSourcedFromCompiler(eLazyBoolNo); + unwind_plan.SetUnwindPlanValidAtAllInstructions(eLazyBoolYes); + unwind_plan.SetUnwindPlanForSignalTrap(eLazyBoolNo); + unwind_plan.SetReturnAddressRegister(gpr_ra_riscv); + return true; +} + bool EmulateInstructionRISCV::SetTargetTriple(const ArchSpec &arch) { return SupportsThisArch(arch); } diff --git a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.h b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.h index 3578a4ab03053..c196a9bb9ce82 100644 --- a/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.h +++ b/lldb/source/Plugins/Instruction/RISCV/EmulateInstructionRISCV.h @@ -61,6 +61,7 @@ class EmulateInstructionRISCV : public EmulateInstruction { case eInstructionTypePCModifying: return true; case eInstructionTypePrologueEpilogue: + return true; case eInstructionTypeAll: return false; } @@ -85,6 +86,7 @@ class EmulateInstructionRISCV : public EmulateInstruction { return SupportsThisInstructionType(inst_type); } + bool CreateFunctionEntryUnwind(UnwindPlan &unwind_plan) override; bool SetTargetTriple(const ArchSpec &arch) override; bool ReadInstruction() override; std::optional<uint32_t> GetLastInstrSize() override { return m_last_size; } @@ -94,6 +96,8 @@ class EmulateInstructionRISCV : public EmulateInstruction { std::optional<RegisterInfo> GetRegisterInfo(lldb::RegisterKind reg_kind, uint32_t reg_num) override; + bool SetInstruction(const Opcode &opcode, const Address &inst_addr, + Target *target) override; std::optional<DecodeResult> ReadInstructionAt(lldb::addr_t addr); std::optional<DecodeResult> Decode(uint32_t inst); bool Execute(DecodeResult inst, bool ignore_cond); diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index 931baf5927a04..097c91b623e8f 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -826,6 +826,24 @@ bool ObjectFileELF::ParseHeader() { } UUID ObjectFileELF::GetUUID() { + if (m_uuid) + return m_uuid; + + // Try loading note info from any PT_NOTE program headers. This is more + // friendly to ELF files that have no section headers, like ELF files that + // are loaded from memory. + for (const ELFProgramHeader &H : ProgramHeaders()) { + if (H.p_type == llvm::ELF::PT_NOTE) { + DataExtractor note_data = GetSegmentData(H); + if (note_data.GetByteSize()) { + lldb_private::ArchSpec arch_spec; + RefineModuleDetailsFromNote(note_data, arch_spec, m_uuid); + if (m_uuid) + return m_uuid; + } + } + } + // Need to parse the section list to get the UUIDs, so make sure that's been // done.
if (!ParseSectionHeaders() && GetType() != ObjectFile::eTypeCoreFile) diff --git a/lldb/source/Plugins/Platform/Android/AdbClient.cpp b/lldb/source/Plugins/Platform/Android/AdbClient.cpp index a179260ca15f6..0fbb48a2e16a0 100644 --- a/lldb/source/Plugins/Platform/Android/AdbClient.cpp +++ b/lldb/source/Plugins/Platform/Android/AdbClient.cpp @@ -8,61 +8,48 @@ #include "AdbClient.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/FileUtilities.h" - #include "lldb/Host/ConnectionFileDescriptor.h" #include "lldb/Host/FileSystem.h" -#include "lldb/Host/PosixApi.h" -#include "lldb/Utility/DataBuffer.h" -#include "lldb/Utility/DataBufferHeap.h" +#include "lldb/Utility/Connection.h" #include "lldb/Utility/DataEncoder.h" #include "lldb/Utility/DataExtractor.h" #include "lldb/Utility/FileSpec.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#include "lldb/Utility/Status.h" #include "lldb/Utility/StreamString.h" #include "lldb/Utility/Timeout.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FileUtilities.h" +#include #include - -#include #include #include #include -// On Windows, transitive dependencies pull in , which defines a -// macro that clashes with a method name. -#ifdef SendMessage -#undef SendMessage -#endif - using namespace lldb; using namespace lldb_private; using namespace lldb_private::platform_android; using namespace std::chrono; +using namespace llvm; -static const seconds kReadTimeout(20); +static const char *kSocketNamespaceAbstract = "localabstract"; +static const char *kSocketNamespaceFileSystem = "localfilesystem"; +const seconds kReadTimeout(20); static const char *kOKAY = "OKAY"; static const char *kFAIL = "FAIL"; static const char *kDATA = "DATA"; static const char *kDONE = "DONE"; - static const char *kSEND = "SEND"; static const char *kRECV = "RECV"; static const char *kSTAT = "STAT"; - static const size_t kSyncPacketLen = 8; -// Maximum size of a filesync DATA packet. static const size_t kMaxPushData = 2 * 1024; -// Default mode for pushed files. -static const uint32_t kDefaultMode = 0100770; // S_IFREG | S_IRWXU | S_IRWXG - -static const char *kSocketNamespaceAbstract = "localabstract"; -static const char *kSocketNamespaceFileSystem = "localfilesystem"; +static const uint32_t kDefaultMode = 0100770; static Status ReadAllBytes(Connection &conn, void *buffer, size_t size) { - Status error; ConnectionStatus status; char *read_buffer = static_cast(buffer); @@ -85,86 +72,215 @@ static Status ReadAllBytes(Connection &conn, void *buffer, size_t size) { error = Status::FromErrorStringWithFormat( "Unable to read requested number of bytes. 
Connection status: %d.", status); + return error; } -Status AdbClient::CreateByDeviceID(const std::string &device_id, - AdbClient &adb) { - Status error; - std::string android_serial; - if (!device_id.empty()) - android_serial = device_id; - else if (const char *env_serial = std::getenv("ANDROID_SERIAL")) - android_serial = env_serial; +static Status ReadAdbMessage(Connection &conn, std::vector &message) { + message.clear(); - if (android_serial.empty()) { - DeviceIDList connected_devices; - error = adb.GetDevices(connected_devices); - if (error.Fail()) - return error; + char buffer[5]; + buffer[4] = 0; + + auto error = ReadAllBytes(conn, buffer, 4); + if (error.Fail()) + return error; + + unsigned int packet_len = 0; + sscanf(buffer, "%x", &packet_len); + + message.resize(packet_len, 0); + error = ReadAllBytes(conn, &message[0], packet_len); + if (error.Fail()) + message.clear(); - if (connected_devices.size() != 1) - return Status::FromErrorStringWithFormat( - "Expected a single connected device, got instead %zu - try " - "setting 'ANDROID_SERIAL'", - connected_devices.size()); - adb.SetDeviceID(connected_devices.front()); - } else { - adb.SetDeviceID(android_serial); - } return error; } -AdbClient::AdbClient() = default; - -AdbClient::AdbClient(const std::string &device_id) : m_device_id(device_id) {} +static Status GetResponseError(Connection &conn, const char *response_id) { + if (strcmp(response_id, kFAIL) != 0) + return Status::FromErrorStringWithFormat( + "Got unexpected response id from adb: \"%s\"", response_id); -AdbClient::~AdbClient() = default; + std::vector error_message; + auto error = ReadAdbMessage(conn, error_message); + if (!error.Success()) + return error; -void AdbClient::SetDeviceID(const std::string &device_id) { - m_device_id = device_id; + std::string error_str(&error_message[0], error_message.size()); + Log *log = GetLog(LLDBLog::Platform); + LLDB_LOGF(log, "ADB error: %s", error_str.c_str()); + return Status(error_str); } -const std::string &AdbClient::GetDeviceID() const { return m_device_id; } +static Status ReadResponseStatus(Connection &conn) { + char response_id[5]; -Status AdbClient::Connect() { + const size_t packet_len = 4; + response_id[packet_len] = 0; + + auto error = ReadAllBytes(conn, response_id, packet_len); + if (error.Fail()) + return error; + + if (strncmp(response_id, kOKAY, packet_len) != 0) + return GetResponseError(conn, response_id); + + return error; +} + +static Status SendAdbMessage(Connection &conn, llvm::StringRef packet) { Status error; - m_conn = std::make_unique(); + + char length_buffer[5]; + snprintf(length_buffer, sizeof(length_buffer), "%04x", + static_cast(packet.size())); + + ConnectionStatus status; + + conn.Write(length_buffer, 4, status, &error); + if (error.Fail()) + return error; + + conn.Write(packet.str().c_str(), packet.size(), status, &error); + return error; +} + +static Status ConnectToAdb(Connection &conn) { std::string port = "5037"; - if (const char *env_port = std::getenv("ANDROID_ADB_SERVER_PORT")) { + if (const char *env_port = std::getenv("ANDROID_ADB_SERVER_PORT")) port = env_port; - } std::string uri = "connect://127.0.0.1:" + port; - m_conn->Connect(uri.c_str(), &error); + Log *log = GetLog(LLDBLog::Platform); + LLDB_LOGF(log, "Connecting to ADB server at %s", uri.c_str()); + + Status error; + conn.Connect(uri.c_str(), &error); return error; } -Status AdbClient::GetDevices(DeviceIDList &device_list) { - device_list.clear(); - - auto error = SendMessage("host:devices"); +static Status 
EnterSyncMode(Connection &conn) { + auto error = SendAdbMessage(conn, "sync:"); if (error.Fail()) return error; - error = ReadResponseStatus(); + return ReadResponseStatus(conn); +} + +static Status SelectTargetDevice(Connection &conn, llvm::StringRef device_id) { + Log *log = GetLog(LLDBLog::Platform); + LLDB_LOG(log, "Selecting device: {0}", device_id); + + std::ostringstream msg; + msg << "host:transport:" << device_id.str(); + + auto error = SendAdbMessage(conn, msg.str()); if (error.Fail()) return error; - std::vector in_buffer; - error = ReadMessage(in_buffer); + return ReadResponseStatus(conn); +} - llvm::StringRef response(&in_buffer[0], in_buffer.size()); - llvm::SmallVector devices; - response.split(devices, "\n", -1, false); +Expected AdbClient::ResolveDeviceID(StringRef device_id) { + StringRef preferred_serial; + if (!device_id.empty()) + preferred_serial = device_id; + else if (const char *env_serial = std::getenv("ANDROID_SERIAL")) + preferred_serial = env_serial; - for (const auto &device : devices) - device_list.push_back(std::string(device.split('\t').first)); + if (preferred_serial.empty()) { + DeviceIDList connected_devices; - // Force disconnect since ADB closes connection after host:devices response - // is sent. - m_conn.reset(); - return error; + auto GetDevices = [](DeviceIDList &device_list) -> Status { + device_list.clear(); + + // Create temporary ADB client for this operation only + auto temp_conn = std::make_unique(); + auto error = ConnectToAdb(*temp_conn); + if (error.Fail()) + return error; + + // NOTE: ADB closes the connection after host:devices response. + // The connection is no longer valid + error = SendAdbMessage(*temp_conn, "host:devices"); + if (error.Fail()) + return error; + + error = ReadResponseStatus(*temp_conn); + if (error.Fail()) + return error; + + std::vector in_buffer; + error = ReadAdbMessage(*temp_conn, in_buffer); + + StringRef response(&in_buffer[0], in_buffer.size()); + SmallVector devices; + response.split(devices, "\n", -1, false); + + for (const auto &device : devices) + device_list.push_back(std::string(device.split('\t').first)); + return error; + }; + + Status error = GetDevices(connected_devices); + if (error.Fail()) + return error.ToError(); + + if (connected_devices.size() != 1) + return createStringError( + inconvertibleErrorCode(), + "Expected a single connected device, got instead %zu - try " + "setting 'ANDROID_SERIAL'", + connected_devices.size()); + + std::string resolved_device_id = std::move(connected_devices.front()); + Log *log = GetLog(LLDBLog::Platform); + LLDB_LOGF(log, "AdbClient::ResolveDeviceID Resolved device ID: %s", + resolved_device_id.c_str()); + return resolved_device_id; + } + + std::string resolved_device_id = preferred_serial.str(); + Log *log = GetLog(LLDBLog::Platform); + LLDB_LOGF(log, "AdbClient::ResolveDeviceID Resolved device ID: %s", + resolved_device_id.c_str()); + return resolved_device_id; +} + +AdbClient::AdbClient(llvm::StringRef device_id) : m_device_id(device_id) { + Log *log = GetLog(LLDBLog::Platform); + LLDB_LOGF(log, + "AdbClient::AdbClient(device_id='%s') - Creating AdbClient with " + "device ID", + device_id.str().c_str()); + m_conn = std::make_unique(); + Connect(); +} + +AdbClient::AdbClient() { + Log *log = GetLog(LLDBLog::Platform); + LLDB_LOGF( + log, + "AdbClient::AdbClient() - Creating AdbClient with default constructor"); + m_conn = std::make_unique(); + Connect(); +} + +AdbClient::~AdbClient() { + Log *log = GetLog(LLDBLog::Platform); + LLDB_LOGF(log, + 
"AdbClient::~AdbClient() - Destroying AdbClient for device: %s", + m_device_id.c_str()); +} + +llvm::StringRef AdbClient::GetDeviceID() const { return m_device_id; } + +Status AdbClient::Connect() { + if (m_conn->IsConnected()) + return Status(); + + return ConnectToAdb(*m_conn); } Status AdbClient::SetPortForwarding(const uint16_t local_port, @@ -177,7 +293,7 @@ Status AdbClient::SetPortForwarding(const uint16_t local_port, if (error.Fail()) return error; - return ReadResponseStatus(); + return ReadResponseStatus(*m_conn); } Status @@ -196,7 +312,7 @@ AdbClient::SetPortForwarding(const uint16_t local_port, if (error.Fail()) return error; - return ReadResponseStatus(); + return ReadResponseStatus(*m_conn); } Status AdbClient::DeletePortForwarding(const uint16_t local_port) { @@ -207,56 +323,13 @@ Status AdbClient::DeletePortForwarding(const uint16_t local_port) { if (error.Fail()) return error; - return ReadResponseStatus(); -} - -Status AdbClient::SendMessage(const std::string &packet, const bool reconnect) { - Status error; - if (!m_conn || reconnect) { - error = Connect(); - if (error.Fail()) - return error; - } - - char length_buffer[5]; - snprintf(length_buffer, sizeof(length_buffer), "%04x", - static_cast(packet.size())); - - ConnectionStatus status; - - m_conn->Write(length_buffer, 4, status, &error); - if (error.Fail()) - return error; - - m_conn->Write(packet.c_str(), packet.size(), status, &error); - return error; + return ReadResponseStatus(*m_conn); } -Status AdbClient::SendDeviceMessage(const std::string &packet) { +Status AdbClient::SendDeviceMessage(llvm::StringRef packet) { std::ostringstream msg; - msg << "host-serial:" << m_device_id << ":" << packet; - return SendMessage(msg.str()); -} - -Status AdbClient::ReadMessage(std::vector &message) { - message.clear(); - - char buffer[5]; - buffer[4] = 0; - - auto error = ReadAllBytes(buffer, 4); - if (error.Fail()) - return error; - - unsigned int packet_len = 0; - sscanf(buffer, "%x", &packet_len); - - message.resize(packet_len, 0); - error = ReadAllBytes(&message[0], packet_len); - if (error.Fail()) - message.clear(); - - return error; + msg << "host-serial:" << m_device_id << ":" << packet.str(); + return SendAdbMessage(*m_conn, msg.str()); } Status AdbClient::ReadMessageStream(std::vector &message, @@ -264,6 +337,9 @@ Status AdbClient::ReadMessageStream(std::vector &message, auto start = steady_clock::now(); message.clear(); + if (!m_conn) + return Status::FromErrorString("No connection available"); + Status error; lldb::ConnectionStatus status = lldb::eConnectionStatusSuccess; char buffer[1024]; @@ -282,87 +358,22 @@ Status AdbClient::ReadMessageStream(std::vector &message, return error; } -Status AdbClient::ReadResponseStatus() { - char response_id[5]; - - static const size_t packet_len = 4; - response_id[packet_len] = 0; - - auto error = ReadAllBytes(response_id, packet_len); - if (error.Fail()) - return error; - - if (strncmp(response_id, kOKAY, packet_len) != 0) - return GetResponseError(response_id); - - return error; -} - -Status AdbClient::GetResponseError(const char *response_id) { - if (strcmp(response_id, kFAIL) != 0) - return Status::FromErrorStringWithFormat( - "Got unexpected response id from adb: \"%s\"", response_id); - - std::vector error_message; - auto error = ReadMessage(error_message); - if (!error.Success()) - return error; - return Status(std::string(&error_message[0], error_message.size())); -} - -Status AdbClient::SwitchDeviceTransport() { - std::ostringstream msg; - msg << "host:transport:" << 
m_device_id; - - auto error = SendMessage(msg.str()); - if (error.Fail()) - return error; - - return ReadResponseStatus(); -} - -Status AdbClient::StartSync() { - auto error = SwitchDeviceTransport(); - if (error.Fail()) - return Status::FromErrorStringWithFormat( - "Failed to switch to device transport: %s", error.AsCString()); - - error = Sync(); - if (error.Fail()) - return Status::FromErrorStringWithFormat("Sync failed: %s", - error.AsCString()); - - return error; -} - -Status AdbClient::Sync() { - auto error = SendMessage("sync:", false); - if (error.Fail()) - return error; - - return ReadResponseStatus(); -} - -Status AdbClient::ReadAllBytes(void *buffer, size_t size) { - return ::ReadAllBytes(*m_conn, buffer, size); -} - Status AdbClient::internalShell(const char *command, milliseconds timeout, std::vector &output_buf) { output_buf.clear(); - auto error = SwitchDeviceTransport(); + auto error = SelectTargetDevice(*m_conn, m_device_id); if (error.Fail()) return Status::FromErrorStringWithFormat( - "Failed to switch to device transport: %s", error.AsCString()); + "Failed to select target device: %s", error.AsCString()); StreamString adb_command; adb_command.Printf("shell:%s", command); - error = SendMessage(std::string(adb_command.GetString()), false); + error = SendAdbMessage(*m_conn, std::string(adb_command.GetString())); if (error.Fail()) return error; - error = ReadResponseStatus(); + error = ReadResponseStatus(*m_conn); if (error.Fail()) return error; @@ -417,18 +428,8 @@ Status AdbClient::ShellToFile(const char *command, milliseconds timeout, return Status(); } -std::unique_ptr -AdbClient::GetSyncService(Status &error) { - std::unique_ptr sync_service; - error = StartSync(); - if (error.Success()) - sync_service.reset(new SyncService(std::move(m_conn))); - - return sync_service; -} - -Status AdbClient::SyncService::internalPullFile(const FileSpec &remote_file, - const FileSpec &local_file) { +Status AdbSyncService::PullFileImpl(const FileSpec &remote_file, + const FileSpec &local_file) { const auto local_file_path = local_file.GetPath(); llvm::FileRemover local_file_remover(local_file_path); @@ -462,8 +463,8 @@ Status AdbClient::SyncService::internalPullFile(const FileSpec &remote_file, return error; } -Status AdbClient::SyncService::internalPushFile(const FileSpec &local_file, - const FileSpec &remote_file) { +Status AdbSyncService::PushFileImpl(const FileSpec &local_file, + const FileSpec &remote_file) { const auto local_file_path(local_file.GetPath()); std::ifstream src(local_file_path.c_str(), std::ios::in | std::ios::binary); if (!src.is_open()) @@ -487,7 +488,9 @@ Status AdbClient::SyncService::internalPushFile(const FileSpec &local_file, error.AsCString()); } error = SendSyncRequest( - kDONE, llvm::sys::toTimeT(FileSystem::Instance().GetModificationTime(local_file)), + kDONE, + llvm::sys::toTimeT( + FileSystem::Instance().GetModificationTime(local_file)), nullptr); if (error.Fail()) return error; @@ -500,7 +503,7 @@ Status AdbClient::SyncService::internalPushFile(const FileSpec &local_file, error.AsCString()); if (response_id == kFAIL) { std::string error_message(data_len, 0); - error = ReadAllBytes(&error_message[0], data_len); + error = ReadAllBytes(*m_conn, &error_message[0], data_len); if (error.Fail()) return Status::FromErrorStringWithFormat( "Failed to read DONE error message: %s", error.AsCString()); @@ -518,9 +521,8 @@ Status AdbClient::SyncService::internalPushFile(const FileSpec &local_file, return error; } -Status AdbClient::SyncService::internalStat(const 
FileSpec &remote_file, - uint32_t &mode, uint32_t &size, - uint32_t &mtime) { +Status AdbSyncService::StatImpl(const FileSpec &remote_file, uint32_t &mode, + uint32_t &size, uint32_t &mtime) { const std::string remote_file_path(remote_file.GetPath(false)); auto error = SendSyncRequest(kSTAT, remote_file_path.length(), remote_file_path.c_str()); @@ -532,7 +534,7 @@ Status AdbClient::SyncService::internalStat(const FileSpec &remote_file, static const size_t response_len = stat_len + (sizeof(uint32_t) * 3); std::vector buffer(response_len); - error = ReadAllBytes(&buffer[0], buffer.size()); + error = ReadAllBytes(*m_conn, &buffer[0], buffer.size()); if (error.Fail()) return Status::FromErrorStringWithFormat("Failed to read response: %s", error.AsCString()); @@ -555,51 +557,57 @@ Status AdbClient::SyncService::internalStat(const FileSpec &remote_file, return Status(); } -Status AdbClient::SyncService::PullFile(const FileSpec &remote_file, - const FileSpec &local_file) { - return executeCommand([this, &remote_file, &local_file]() { - return internalPullFile(remote_file, local_file); +Status AdbSyncService::PullFile(const FileSpec &remote_file, + const FileSpec &local_file) { + return ExecuteCommand([this, &remote_file, &local_file]() { + return PullFileImpl(remote_file, local_file); }); } -Status AdbClient::SyncService::PushFile(const FileSpec &local_file, - const FileSpec &remote_file) { - return executeCommand([this, &local_file, &remote_file]() { - return internalPushFile(local_file, remote_file); +Status AdbSyncService::PushFile(const FileSpec &local_file, + const FileSpec &remote_file) { + return ExecuteCommand([this, &local_file, &remote_file]() { + return PushFileImpl(local_file, remote_file); }); } -Status AdbClient::SyncService::Stat(const FileSpec &remote_file, uint32_t &mode, - uint32_t &size, uint32_t &mtime) { - return executeCommand([this, &remote_file, &mode, &size, &mtime]() { - return internalStat(remote_file, mode, size, mtime); +Status AdbSyncService::Stat(const FileSpec &remote_file, uint32_t &mode, + uint32_t &size, uint32_t &mtime) { + return ExecuteCommand([this, &remote_file, &mode, &size, &mtime]() { + return StatImpl(remote_file, mode, size, mtime); }); } -bool AdbClient::SyncService::IsConnected() const { +bool AdbSyncService::IsConnected() const { return m_conn && m_conn->IsConnected(); } -AdbClient::SyncService::SyncService(std::unique_ptr &&conn) - : m_conn(std::move(conn)) {} - -Status -AdbClient::SyncService::executeCommand(const std::function &cmd) { - if (!m_conn) - return Status::FromErrorString("SyncService is disconnected"); +AdbSyncService::AdbSyncService(const std::string device_id) + : m_device_id(device_id) { + m_conn = std::make_unique(); + Log *log = GetLog(LLDBLog::Platform); + LLDB_LOGF(log, + "AdbSyncService::AdbSyncService() - Creating AdbSyncService for " + "device: %s", + m_device_id.c_str()); +} +Status AdbSyncService::ExecuteCommand(const std::function &cmd) { Status error = cmd(); - if (error.Fail()) - m_conn.reset(); - return error; } -AdbClient::SyncService::~SyncService() = default; +AdbSyncService::~AdbSyncService() { + Log *log = GetLog(LLDBLog::Platform); + LLDB_LOGF(log, + "AdbSyncService::~AdbSyncService() - Destroying AdbSyncService for " + "device: %s", + m_device_id.c_str()); +} -Status AdbClient::SyncService::SendSyncRequest(const char *request_id, - const uint32_t data_len, - const void *data) { +Status AdbSyncService::SendSyncRequest(const char *request_id, + const uint32_t data_len, + const void *data) { DataEncoder 
encoder(eByteOrderLittle, sizeof(void *)); encoder.AppendData(llvm::StringRef(request_id)); encoder.AppendU32(data_len); @@ -615,11 +623,11 @@ Status AdbClient::SyncService::SendSyncRequest(const char *request_id, return error; } -Status AdbClient::SyncService::ReadSyncHeader(std::string &response_id, - uint32_t &data_len) { +Status AdbSyncService::ReadSyncHeader(std::string &response_id, + uint32_t &data_len) { char buffer[kSyncPacketLen]; - auto error = ReadAllBytes(buffer, kSyncPacketLen); + auto error = ReadAllBytes(*m_conn, buffer, kSyncPacketLen); if (error.Success()) { response_id.assign(&buffer[0], 4); DataExtractor extractor(&buffer[4], 4, eByteOrderLittle, sizeof(void *)); @@ -630,8 +638,7 @@ Status AdbClient::SyncService::ReadSyncHeader(std::string &response_id, return error; } -Status AdbClient::SyncService::PullFileChunk(std::vector &buffer, - bool &eof) { +Status AdbSyncService::PullFileChunk(std::vector &buffer, bool &eof) { buffer.clear(); std::string response_id; @@ -642,14 +649,14 @@ Status AdbClient::SyncService::PullFileChunk(std::vector &buffer, if (response_id == kDATA) { buffer.resize(data_len, 0); - error = ReadAllBytes(&buffer[0], data_len); + error = ReadAllBytes(*m_conn, &buffer[0], data_len); if (error.Fail()) buffer.clear(); } else if (response_id == kDONE) { eof = true; } else if (response_id == kFAIL) { std::string error_message(data_len, 0); - error = ReadAllBytes(&error_message[0], data_len); + error = ReadAllBytes(*m_conn, &error_message[0], data_len); if (error.Fail()) return Status::FromErrorStringWithFormat( "Failed to read pull error message: %s", error.AsCString()); @@ -662,6 +669,15 @@ Status AdbClient::SyncService::PullFileChunk(std::vector &buffer, return Status(); } -Status AdbClient::SyncService::ReadAllBytes(void *buffer, size_t size) { - return ::ReadAllBytes(*m_conn, buffer, size); +Status AdbSyncService::SetupSyncConnection() { + Status error = ConnectToAdb(*m_conn); + if (error.Fail()) + return error; + + error = SelectTargetDevice(*m_conn, m_device_id); + if (error.Fail()) + return error; + + error = EnterSyncMode(*m_conn); + return error; } diff --git a/lldb/source/Plugins/Platform/Android/AdbClient.h b/lldb/source/Plugins/Platform/Android/AdbClient.h index 851c09957bd4a..341a9fa4b93ad 100644 --- a/lldb/source/Plugins/Platform/Android/AdbClient.h +++ b/lldb/source/Plugins/Platform/Android/AdbClient.h @@ -10,6 +10,7 @@ #define LLDB_SOURCE_PLUGINS_PLATFORM_ANDROID_ADBCLIENT_H #include "lldb/Utility/Status.h" +#include "llvm/Support/Error.h" #include #include #include @@ -32,59 +33,21 @@ class AdbClient { using DeviceIDList = std::list; - class SyncService { - friend class AdbClient; - - public: - virtual ~SyncService(); - - virtual Status PullFile(const FileSpec &remote_file, - const FileSpec &local_file); - - Status PushFile(const FileSpec &local_file, const FileSpec &remote_file); - - virtual Status Stat(const FileSpec &remote_file, uint32_t &mode, - uint32_t &size, uint32_t &mtime); - - bool IsConnected() const; - - protected: - explicit SyncService(std::unique_ptr &&conn); - - private: - Status SendSyncRequest(const char *request_id, const uint32_t data_len, - const void *data); - - Status ReadSyncHeader(std::string &response_id, uint32_t &data_len); - - Status PullFileChunk(std::vector &buffer, bool &eof); - - Status ReadAllBytes(void *buffer, size_t size); - - Status internalPullFile(const FileSpec &remote_file, - const FileSpec &local_file); - - Status internalPushFile(const FileSpec &local_file, - const FileSpec &remote_file); - - 
Status internalStat(const FileSpec &remote_file, uint32_t &mode, - uint32_t &size, uint32_t &mtime); - - Status executeCommand(const std::function &cmd); - - std::unique_ptr m_conn; - }; - - static Status CreateByDeviceID(const std::string &device_id, AdbClient &adb); + /// Resolves a device identifier to its canonical form. + /// + /// \param device_id the device identifier to resolve (may be empty). + /// + /// \returns Expected containing the resolved device ID on + /// success, or an Error if the device ID cannot be resolved or + /// is ambiguous. + static llvm::Expected ResolveDeviceID(llvm::StringRef device_id); AdbClient(); - explicit AdbClient(const std::string &device_id); + explicit AdbClient(llvm::StringRef device_id); virtual ~AdbClient(); - const std::string &GetDeviceID() const; - - Status GetDevices(DeviceIDList &device_list); + llvm::StringRef GetDeviceID() const; Status SetPortForwarding(const uint16_t local_port, const uint16_t remote_port); @@ -102,39 +65,50 @@ class AdbClient { std::chrono::milliseconds timeout, const FileSpec &output_file_spec); - virtual std::unique_ptr GetSyncService(Status &error); - - Status SwitchDeviceTransport(); - -private: Status Connect(); - void SetDeviceID(const std::string &device_id); - - Status SendMessage(const std::string &packet, const bool reconnect = true); - - Status SendDeviceMessage(const std::string &packet); - - Status ReadMessage(std::vector &message); +private: + Status SendDeviceMessage(llvm::StringRef packet); Status ReadMessageStream(std::vector &message, std::chrono::milliseconds timeout); - Status GetResponseError(const char *response_id); + Status internalShell(const char *command, std::chrono::milliseconds timeout, + std::vector &output_buf); - Status ReadResponseStatus(); + std::string m_device_id; + std::unique_ptr m_conn; +}; - Status Sync(); +class AdbSyncService { +public: + explicit AdbSyncService(const std::string device_id); + virtual ~AdbSyncService(); + Status SetupSyncConnection(); - Status StartSync(); + virtual Status PullFile(const FileSpec &remote_file, + const FileSpec &local_file); + virtual Status PushFile(const FileSpec &local_file, + const FileSpec &remote_file); + virtual Status Stat(const FileSpec &remote_file, uint32_t &mode, + uint32_t &size, uint32_t &mtime); + virtual bool IsConnected() const; - Status internalShell(const char *command, std::chrono::milliseconds timeout, - std::vector &output_buf); + llvm::StringRef GetDeviceId() const { return m_device_id; } - Status ReadAllBytes(void *buffer, size_t size); +private: + Status SendSyncRequest(const char *request_id, const uint32_t data_len, + const void *data); + Status ReadSyncHeader(std::string &response_id, uint32_t &data_len); + Status PullFileChunk(std::vector &buffer, bool &eof); + Status PullFileImpl(const FileSpec &remote_file, const FileSpec &local_file); + Status PushFileImpl(const FileSpec &local_file, const FileSpec &remote_file); + Status StatImpl(const FileSpec &remote_file, uint32_t &mode, uint32_t &size, + uint32_t &mtime); + Status ExecuteCommand(const std::function &cmd); - std::string m_device_id; std::unique_ptr m_conn; + std::string m_device_id; }; } // namespace platform_android diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp index 5bc9cc133fbd3..600cc0a04cd22 100644 --- a/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp +++ b/lldb/source/Plugins/Platform/Android/PlatformAndroid.cpp @@ -9,10 +9,8 @@ #include "lldb/Core/Module.h" 
#include "lldb/Core/PluginManager.h" #include "lldb/Core/Section.h" -#include "lldb/Host/HostInfo.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" -#include "lldb/Utility/Scalar.h" #include "lldb/Utility/UriParser.h" #include "lldb/ValueObject/ValueObject.h" @@ -194,12 +192,10 @@ Status PlatformAndroid::ConnectRemote(Args &args) { auto error = PlatformLinux::ConnectRemote(args); if (error.Success()) { - AdbClient adb; - error = AdbClient::CreateByDeviceID(m_device_id, adb); - if (error.Fail()) - return error; - - m_device_id = adb.GetDeviceID(); + auto resolved_device_id_or_error = AdbClient::ResolveDeviceID(m_device_id); + if (!resolved_device_id_or_error) + return Status::FromError(resolved_device_id_or_error.takeError()); + m_device_id = *resolved_device_id_or_error; } return error; } @@ -216,29 +212,33 @@ Status PlatformAndroid::GetFile(const FileSpec &source, Status error; auto sync_service = GetSyncService(error); - if (error.Fail()) - return error; - - uint32_t mode = 0, size = 0, mtime = 0; - error = sync_service->Stat(source_spec, mode, size, mtime); - if (error.Fail()) - return error; - if (mode != 0) - return sync_service->PullFile(source_spec, destination); + // If sync service is available, try to use it + if (error.Success() && sync_service) { + uint32_t mode = 0, size = 0, mtime = 0; + error = sync_service->Stat(source_spec, mode, size, mtime); + if (error.Success()) { + if (mode != 0) + return sync_service->PullFile(source_spec, destination); + + // mode == 0 can signify that adbd cannot access the file due security + // constraints - fall through to try "cat ..." as a fallback. + Log *log = GetLog(LLDBLog::Platform); + LLDB_LOGF(log, "Got mode == 0 on '%s': try to get file via 'shell cat'", + source_spec.GetPath(false).c_str()); + } + } + // Fallback to shell cat command if sync service failed or returned mode == 0 std::string source_file = source_spec.GetPath(false); Log *log = GetLog(LLDBLog::Platform); - LLDB_LOGF(log, "Got mode == 0 on '%s': try to get file via 'shell cat'", - source_file.c_str()); + LLDB_LOGF(log, "Using shell cat fallback for '%s'", source_file.c_str()); if (strchr(source_file.c_str(), '\'') != nullptr) return Status::FromErrorString( "Doesn't support single-quotes in filenames"); - // mode == 0 can signify that adbd cannot access the file due security - // constraints - try "cat ..." as a fallback. AdbClientUP adb(GetAdbClient(error)); if (error.Fail()) return error; @@ -275,12 +275,19 @@ Status PlatformAndroid::DownloadModuleSlice(const FileSpec &src_file_spec, const uint64_t src_offset, const uint64_t src_size, const FileSpec &dst_file_spec) { + std::string source_file = src_file_spec.GetPath(false); + if (source_file.empty()) + return Status::FromErrorString("Source file path cannot be empty"); + + std::string destination_file = dst_file_spec.GetPath(false); + if (destination_file.empty()) + return Status::FromErrorString("Destination file path cannot be empty"); + // In Android API level 23 and above, dynamic loader is able to load .so // file directly from APK. In that case, src_offset will be an non-zero. if (src_offset == 0) // Use GetFile for a normal file. 
return GetFile(src_file_spec, dst_file_spec); - std::string source_file = src_file_spec.GetPath(false); if (source_file.find('\'') != std::string::npos) return Status::FromErrorString( "Doesn't support single-quotes in filenames"); @@ -424,7 +431,7 @@ PlatformAndroid::GetLibdlFunctionDeclarations(lldb_private::Process *process) { std::vector dl_open_names = {"__dl_dlopen", "dlopen"}; const char *dl_open_name = nullptr; Target &target = process->GetTarget(); - for (auto name : dl_open_names) { + for (auto *name : dl_open_names) { target.GetImages().FindFunctionSymbols( ConstString(name), eFunctionNameTypeFull, matching_symbols); if (matching_symbols.GetSize()) { @@ -445,11 +452,8 @@ PlatformAndroid::GetLibdlFunctionDeclarations(lldb_private::Process *process) { } PlatformAndroid::AdbClientUP PlatformAndroid::GetAdbClient(Status &error) { - AdbClientUP adb(std::make_unique(m_device_id)); - if (adb) - error.Clear(); - else - error = Status::FromErrorString("Failed to create AdbClient"); + AdbClientUP adb = std::make_unique(m_device_id); + error = adb->Connect(); return adb; } @@ -473,14 +477,10 @@ std::string PlatformAndroid::GetRunAs() { } return run_as.str(); } - -AdbClient::SyncService *PlatformAndroid::GetSyncService(Status &error) { - if (m_adb_sync_svc && m_adb_sync_svc->IsConnected()) - return m_adb_sync_svc.get(); - - AdbClientUP adb(GetAdbClient(error)); +std::unique_ptr PlatformAndroid::GetSyncService(Status &error) { + auto sync_service = std::make_unique(m_device_id); + error = sync_service->SetupSyncConnection(); if (error.Fail()) return nullptr; - m_adb_sync_svc = adb->GetSyncService(error); - return (error.Success()) ? m_adb_sync_svc.get() : nullptr; + return sync_service; } diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroid.h b/lldb/source/Plugins/Platform/Android/PlatformAndroid.h index 5602edf73c1d3..3384525362ecf 100644 --- a/lldb/source/Plugins/Platform/Android/PlatformAndroid.h +++ b/lldb/source/Plugins/Platform/Android/PlatformAndroid.h @@ -75,14 +75,15 @@ class PlatformAndroid : public platform_linux::PlatformLinux { typedef std::unique_ptr AdbClientUP; virtual AdbClientUP GetAdbClient(Status &error); + std::string GetRunAs(); + +public: virtual llvm::StringRef GetPropertyPackageName(); - std::string GetRunAs(); +protected: + virtual std::unique_ptr GetSyncService(Status &error); private: - AdbClient::SyncService *GetSyncService(Status &error); - - std::unique_ptr m_adb_sync_svc; std::string m_device_id; uint32_t m_sdk_version; }; diff --git a/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp b/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp index 0cf64807ec0d6..461ee8e3b1826 100644 --- a/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp +++ b/lldb/source/Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.cpp @@ -21,6 +21,7 @@ using namespace lldb; using namespace lldb_private; using namespace platform_android; +using namespace llvm; static const lldb::pid_t g_remote_platform_pid = 0; // Alias for the process id of lldb-platform @@ -32,12 +33,12 @@ static Status ForwardPortWithAdb( std::string &device_id) { Log *log = GetLog(LLDBLog::Platform); - AdbClient adb; - auto error = AdbClient::CreateByDeviceID(device_id, adb); - if (error.Fail()) - return error; + auto resolved_device_id_or_error = AdbClient::ResolveDeviceID(device_id); + if (!resolved_device_id_or_error) + return Status::FromError(resolved_device_id_or_error.takeError()); + device_id = *resolved_device_id_or_error; - 
device_id = adb.GetDeviceID(); + AdbClient adb(device_id); LLDB_LOGF(log, "Connected to Android device \"%s\"", device_id.c_str()); if (remote_port != 0) { diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 5ffb4423969ca..f1e73d73a733b 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -3142,7 +3142,11 @@ void DWARFASTParserClang::ParseSingleMember( uint64_t parent_byte_size = parent_die.GetAttributeValueAsUnsigned(DW_AT_byte_size, UINT64_MAX); - if (attrs.member_byte_offset >= parent_byte_size) { + // If the attrs.member_byte_offset is still set to UINT32_MAX this means + // that the DW_TAG_member didn't have a DW_AT_data_member_location, so + // don't emit an error if this is the case. + if (attrs.member_byte_offset != UINT32_MAX && + attrs.member_byte_offset >= parent_byte_size) { if (member_array_size != 1 && (member_array_size != 0 || attrs.member_byte_offset > parent_byte_size)) { diff --git a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp index 0e2ca1784e7e9..3b936c06b1072 100644 --- a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp @@ -14,6 +14,7 @@ #include "clang/Lex/Lexer.h" #include "Plugins/TypeSystem/Clang/TypeSystemClang.h" +#include "lldb/Core/Debugger.h" #include "lldb/Core/Mangled.h" #include "lldb/Core/Module.h" #include "lldb/Core/PluginManager.h" @@ -105,24 +106,21 @@ enum { #include "SymbolFilePDBPropertiesEnum.inc" }; -#if LLVM_ENABLE_DIA_SDK && defined(_WIN32) -bool ShouldUseNativeReaderByDefault() { - static bool g_use_native_by_default = true; - - static llvm::once_flag g_initialize; - llvm::call_once(g_initialize, [] { - llvm::StringRef env_value = ::getenv("LLDB_USE_NATIVE_PDB_READER"); - if (!env_value.equals_insensitive("on") && - !env_value.equals_insensitive("yes") && - !env_value.equals_insensitive("1") && - !env_value.equals_insensitive("true")) - g_use_native_by_default = false; - }); - - return g_use_native_by_default; -} +static const bool g_should_use_native_reader_by_default = [] { + llvm::StringRef env_value = ::getenv("LLDB_USE_NATIVE_PDB_READER"); + +#if !LLVM_ENABLE_DIA_SDK || !defined(_WIN32) + // if the environment value is unset, the native reader is requested + if (env_value.empty()) + return true; #endif + return env_value.equals_insensitive("on") || + env_value.equals_insensitive("yes") || + env_value.equals_insensitive("1") || + env_value.equals_insensitive("true"); +}(); + class PluginProperties : public Properties { public: static llvm::StringRef GetSettingName() { @@ -136,6 +134,21 @@ class PluginProperties : public Properties { bool UseNativeReader() const { #if LLVM_ENABLE_DIA_SDK && defined(_WIN32) + return IsNativeReaderRequested(); +#else + if (!IsNativeReaderRequested()) { + static std::once_flag g_warning_shown; + Debugger::ReportWarning( + "the DIA PDB reader was explicitly requested, but LLDB was built " + "without the DIA SDK. 
The native reader will be used instead", + {}, &g_warning_shown); + } + return true; +#endif + } + +private: + bool IsNativeReaderRequested() const { auto value = GetPropertyAtIndexAs(ePropertyReader, ePDBReaderDefault); switch (value) { @@ -144,12 +157,8 @@ class PluginProperties : public Properties { case ePDBReaderDIA: return false; default: - case ePDBReaderDefault: - return ShouldUseNativeReaderByDefault(); + return g_should_use_native_reader_by_default; } -#else - return true; -#endif } }; diff --git a/lldb/source/Symbol/DWARFCallFrameInfo.cpp b/lldb/source/Symbol/DWARFCallFrameInfo.cpp index 2f8f9e9182fb2..b490045cb3818 100644 --- a/lldb/source/Symbol/DWARFCallFrameInfo.cpp +++ b/lldb/source/Symbol/DWARFCallFrameInfo.cpp @@ -20,6 +20,8 @@ #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/Timer.h" +#include "llvm/BinaryFormat/Dwarf.h" +#include #include #include #include @@ -147,6 +149,23 @@ GetGNUEHPointer(const DataExtractor &DE, lldb::offset_t *offset_ptr, return baseAddress + addressValue; } +// Check if the given cie_id value indicates a CIE (Common Information Entry) +// as opposed to an FDE (Frame Description Entry). +static bool IsCIEMarker(uint64_t cie_id, bool is_64bit, + DWARFCallFrameInfo::Type type) { + // Check eh_frame CIE marker + if (type == DWARFCallFrameInfo::EH) + return cie_id == 0; + + // Check debug_frame CIE marker + // DWARF64 + if (is_64bit) + return cie_id == llvm::dwarf::DW64_CIE_ID; + + // DWARF32 + return cie_id == llvm::dwarf::DW_CIE_ID; +} + DWARFCallFrameInfo::DWARFCallFrameInfo(ObjectFile &objfile, SectionSP §ion_sp, Type type) : m_objfile(objfile), m_section_sp(section_sp), m_type(type) {} @@ -283,7 +302,7 @@ DWARFCallFrameInfo::ParseCIE(const dw_offset_t cie_offset) { GetCFIData(); uint32_t length = m_cfi_data.GetU32(&offset); dw_offset_t cie_id, end_offset; - bool is_64bit = (length == UINT32_MAX); + bool is_64bit = (length == llvm::dwarf::DW_LENGTH_DWARF64); if (is_64bit) { length = m_cfi_data.GetU64(&offset); cie_id = m_cfi_data.GetU64(&offset); @@ -292,8 +311,9 @@ DWARFCallFrameInfo::ParseCIE(const dw_offset_t cie_offset) { cie_id = m_cfi_data.GetU32(&offset); end_offset = cie_offset + length + 4; } - if (length > 0 && ((m_type == DWARF && cie_id == UINT32_MAX) || - (m_type == EH && cie_id == 0ul))) { + + // Check if this is a CIE or FDE based on the CIE ID marker + if (length > 0 && IsCIEMarker(cie_id, is_64bit, m_type)) { size_t i; // cie.offset = cie_offset; // cie.length = length; @@ -470,7 +490,7 @@ void DWARFCallFrameInfo::GetFDEIndex() { const dw_offset_t current_entry = offset; dw_offset_t cie_id, next_entry, cie_offset; uint32_t len = m_cfi_data.GetU32(&offset); - bool is_64bit = (len == UINT32_MAX); + bool is_64bit = (len == llvm::dwarf::DW_LENGTH_DWARF64); if (is_64bit) { len = m_cfi_data.GetU64(&offset); cie_id = m_cfi_data.GetU64(&offset); @@ -493,11 +513,8 @@ void DWARFCallFrameInfo::GetFDEIndex() { return; } - // An FDE entry contains CIE_pointer in debug_frame in same place as cie_id - // in eh_frame. CIE_pointer is an offset into the .debug_frame section. So, - // variable cie_offset should be equal to cie_id for debug_frame. - // FDE entries with cie_id == 0 shouldn't be ignored for it. 
- if ((cie_id == 0 && m_type == EH) || cie_id == UINT32_MAX || len == 0) { + // Check if this is a CIE or FDE based on the CIE ID marker + if (IsCIEMarker(cie_id, is_64bit, m_type) || len == 0) { auto cie_sp = ParseCIE(current_entry); if (!cie_sp) { // Cannot parse, the reason is already logged @@ -568,7 +585,7 @@ DWARFCallFrameInfo::ParseFDE(dw_offset_t dwarf_offset, uint32_t length = m_cfi_data.GetU32(&offset); dw_offset_t cie_offset; - bool is_64bit = (length == UINT32_MAX); + bool is_64bit = (length == llvm::dwarf::DW_LENGTH_DWARF64); if (is_64bit) { length = m_cfi_data.GetU64(&offset); cie_offset = m_cfi_data.GetU64(&offset); @@ -577,7 +594,9 @@ DWARFCallFrameInfo::ParseFDE(dw_offset_t dwarf_offset, } // FDE entries with zeroth cie_offset may occur for debug_frame. - assert(!(m_type == EH && 0 == cie_offset) && cie_offset != UINT32_MAX); + assert(!(m_type == EH && 0 == cie_offset) && + cie_offset != + (is_64bit ? llvm::dwarf::DW64_CIE_ID : llvm::dwarf::DW_CIE_ID)); // Translate the CIE_id from the eh_frame format, which is relative to the // FDE offset, into a __eh_frame section offset diff --git a/lldb/source/Target/ExecutionContext.cpp b/lldb/source/Target/ExecutionContext.cpp index 9d232e420f71c..a795913047639 100644 --- a/lldb/source/Target/ExecutionContext.cpp +++ b/lldb/source/Target/ExecutionContext.cpp @@ -429,6 +429,16 @@ ExecutionContextRef::ExecutionContextRef(Target *target, bool adopt_selected) SetTargetPtr(target, adopt_selected); } +ExecutionContextRef::ExecutionContextRef(Process *process, bool adopt_selected) + : m_target_wp(), m_process_wp(), m_thread_wp(), m_stack_id() { + SetProcessPtr(process, adopt_selected); +} + +ExecutionContextRef::ExecutionContextRef(Thread *thread, bool adopt_selected) + : m_target_wp(), m_process_wp(), m_thread_wp(), m_stack_id() { + SetThreadPtr(thread, adopt_selected); +} + ExecutionContextRef::ExecutionContextRef(const ExecutionContextRef &rhs) = default; @@ -513,55 +523,66 @@ void ExecutionContextRef::SetFrameSP(const lldb::StackFrameSP &frame_sp) { void ExecutionContextRef::SetTargetPtr(Target *target, bool adopt_selected) { Clear(); if (target) { - lldb::TargetSP target_sp(target->shared_from_this()); - if (target_sp) { - m_target_wp = target_sp; - if (adopt_selected) { - lldb::ProcessSP process_sp(target_sp->GetProcessSP()); - if (process_sp) { - m_process_wp = process_sp; - if (process_sp) { - // Only fill in the thread and frame if our process is stopped - // Don't just check the state, since we might be in the middle of - // resuming. 
- Process::StopLocker stop_locker; - - if (stop_locker.TryLock(&process_sp->GetRunLock()) && - StateIsStoppedState(process_sp->GetState(), true)) { - lldb::ThreadSP thread_sp( - process_sp->GetThreadList().GetSelectedThread()); - if (!thread_sp) - thread_sp = process_sp->GetThreadList().GetThreadAtIndex(0); - - if (thread_sp) { - SetThreadSP(thread_sp); - lldb::StackFrameSP frame_sp( - thread_sp->GetSelectedFrame(DoNoSelectMostRelevantFrame)); - if (!frame_sp) - frame_sp = thread_sp->GetStackFrameAtIndex(0); - if (frame_sp) - SetFrameSP(frame_sp); - } - } - } - } - } + lldb::TargetSP target_sp = target->shared_from_this(); + SetTargetSP(target_sp); + if (adopt_selected) { + if (lldb::ProcessSP process_sp = target_sp->GetProcessSP()) + SetProcessPtr(process_sp.get(), adopt_selected); } } } -void ExecutionContextRef::SetProcessPtr(Process *process) { +void ExecutionContextRef::SetProcessPtr(Process *process, bool adopt_selected) { if (process) { - SetProcessSP(process->shared_from_this()); + lldb::ProcessSP process_sp = process->shared_from_this(); + SetProcessSP(process_sp); + if (adopt_selected) { + // Only fill in the thread if our process is stopped. + // Don't just check the state, since we might be in the middle of + // resuming. + Process::StopLocker stop_locker; + if (stop_locker.TryLock(&process_sp->GetRunLock()) && + StateIsStoppedState(process_sp->GetState(), true)) { + lldb::ThreadSP thread_sp( + process_sp->GetThreadList().GetSelectedThread()); + if (!thread_sp) + thread_sp = process_sp->GetThreadList().GetThreadAtIndex(0); + if (thread_sp) { + SetThreadSP(thread_sp); + lldb::StackFrameSP frame_sp = + thread_sp->GetSelectedFrame(DoNoSelectMostRelevantFrame); + if (!frame_sp) + frame_sp = thread_sp->GetStackFrameAtIndex(0); + if (frame_sp) + SetFrameSP(frame_sp); + } + } + } } else { m_process_wp.reset(); m_target_wp.reset(); } } -void ExecutionContextRef::SetThreadPtr(Thread *thread) { +void ExecutionContextRef::SetThreadPtr(Thread *thread, bool adopt_selected) { if (thread) { - SetThreadSP(thread->shared_from_this()); + lldb::ThreadSP thread_sp = thread->shared_from_this(); + SetThreadSP(thread_sp); + if (adopt_selected) { + // Only fill in the frame if our process is stopped. + // Don't just check the state, since we might be in the middle of + // resuming. 
+ Process::StopLocker stop_locker; + if (stop_locker.TryLock(&thread->GetProcess()->GetRunLock()) && + StateIsStoppedState(thread->GetProcess()->GetState(), true)) { + lldb::StackFrameSP frame_sp = + thread_sp->GetSelectedFrame(DoNoSelectMostRelevantFrame); + if (!frame_sp) + frame_sp = thread_sp->GetStackFrameAtIndex(0); + if (frame_sp) + SetFrameSP(frame_sp); + } + } } else { ClearThread(); m_process_wp.reset(); diff --git a/lldb/source/Utility/Scalar.cpp b/lldb/source/Utility/Scalar.cpp index c8766bdf2aee7..f2c18cdd896da 100644 --- a/lldb/source/Utility/Scalar.cpp +++ b/lldb/source/Utility/Scalar.cpp @@ -471,24 +471,10 @@ bool Scalar::ShiftRightLogical(const Scalar &rhs) { } Scalar &Scalar::operator>>=(const Scalar &rhs) { - switch (m_type) { - case e_void: - case e_float: + if (m_type == e_int && rhs.m_type == e_int) + m_integer >>= rhs.m_integer.getZExtValue(); + else m_type = e_void; - break; - - case e_int: - switch (rhs.m_type) { - case e_void: - case e_float: - m_type = e_void; - break; - case e_int: - m_integer = m_integer.ashr(rhs.m_integer); - break; - } - break; - } return *this; } diff --git a/lldb/test/API/commands/register/register/register_command/TestRegisters.py b/lldb/test/API/commands/register/register/register_command/TestRegisters.py index 0134139892794..29d090a279070 100644 --- a/lldb/test/API/commands/register/register/register_command/TestRegisters.py +++ b/lldb/test/API/commands/register/register/register_command/TestRegisters.py @@ -21,24 +21,6 @@ def tearDown(self): self.dbg.GetSelectedTarget().GetProcess().Destroy() TestBase.tearDown(self) - # on macOS, detect if the current machine is arm64 and supports SME - def get_sme_available(self): - if self.getArchitecture() != "arm64": - return None - try: - sysctl_output = subprocess.check_output( - ["sysctl", "hw.optional.arm.FEAT_SME"] - ).decode("utf-8") - except subprocess.CalledProcessError: - return None - m = re.match(r"hw\.optional\.arm\.FEAT_SME: (\w+)", sysctl_output) - if m: - if int(m.group(1)) == 1: - return True - else: - return False - return None - @skipIfiOSSimulator @skipIf(archs=no_match(["amd64", "arm$", "i386", "x86_64"])) @expectedFailureAll(oslist=["freebsd", "netbsd"], bugnumber="llvm.org/pr48371") @@ -51,7 +33,7 @@ def test_register_commands(self): self.log_enable("registers") error_str_matched = False - if self.get_sme_available() and self.platformIsDarwin(): + if self.isAArch64SME() and self.platformIsDarwin(): # On Darwin AArch64 SME machines, we will have unavailable # registers when not in Streaming SVE Mode/SME, so # `register read -a` will report that some registers diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestMemoryRegionDirtyPages.py b/lldb/test/API/functionalities/gdb_remote_client/TestMemoryRegionDirtyPages.py index 9d7e0c0f7af6c..695faf896ef5d 100644 --- a/lldb/test/API/functionalities/gdb_remote_client/TestMemoryRegionDirtyPages.py +++ b/lldb/test/API/functionalities/gdb_remote_client/TestMemoryRegionDirtyPages.py @@ -5,60 +5,102 @@ from lldbsuite.test.lldbgdbclient import GDBRemoteTestBase +class TestRegion(object): + def __init__(self, start_addr, size, dirty_pages): + self.start_addr = start_addr + self.size = size + self.dirty_pages = dirty_pages + + def as_packet(self): + dirty_pages = "" + if self.dirty_pages is not None: + dirty_pages = ( + "dirty-pages:" + + ",".join([format(a, "x") for a in self.dirty_pages]) + + ";" + ) + return f"start:{self.start_addr:x};size:{self.size};permissions:r;{dirty_pages}" + + def expected_command_output(self): + if 
self.dirty_pages is None: + return [ + "Modified memory (dirty) page list provided", + "Dirty pages:", + ], False + + expected = [ + f"Modified memory (dirty) page list provided, {len(self.dirty_pages)} entries." + ] + if self.dirty_pages: + expected.append( + "Dirty pages: " + + ", ".join([format(a, "#x") for a in self.dirty_pages]) + + "." + ) + return expected, True + + class TestMemoryRegionDirtyPages(GDBRemoteTestBase): @skipIfXmlSupportMissing def test(self): + test_regions = [ + # A memory region where we don't know anything about dirty pages + TestRegion(0, 0x100000000, None), + # A memory region with dirty page information -- and zero dirty pages + TestRegion(0x100000000, 4000, []), + # A memory region with one dirty page + TestRegion(0x100004000, 4000, [0x100004000]), + # A memory region with multple dirty pages + TestRegion( + 0x1000A2000, + 5000, + [0x1000A2000, 0x1000A3000, 0x1000A4000, 0x1000A5000, 0x1000A6000], + ), + ] + class MyResponder(MockGDBServerResponder): def qHostInfo(self): return "ptrsize:8;endian:little;vm-page-size:4096;" def qMemoryRegionInfo(self, addr): - if addr == 0: - return "start:0;size:100000000;" - if addr == 0x100000000: - return "start:100000000;size:4000;permissions:rx;dirty-pages:;" - if addr == 0x100004000: - return ( - "start:100004000;size:4000;permissions:r;dirty-pages:100004000;" - ) - if addr == 0x1000A2000: - return "start:1000a2000;size:5000;permissions:r;dirty-pages:1000a2000,1000a3000,1000a4000,1000a5000,1000a6000;" + for region in test_regions: + if region.start_addr == addr: + return region.as_packet() self.server.responder = MyResponder() target = self.dbg.CreateTarget("") if self.TraceOn(): self.runCmd("log enable gdb-remote packets") self.addTearDownHook(lambda: self.runCmd("log disable gdb-remote packets")) + process = self.connect(target) + lldbutil.expect_state_changes( + self, self.dbg.GetListener(), process, [lldb.eStateStopped] + ) - # A memory region where we don't know anything about dirty pages - region = lldb.SBMemoryRegionInfo() - err = process.GetMemoryRegionInfo(0, region) - self.assertSuccess(err) - self.assertFalse(region.HasDirtyMemoryPageList()) - self.assertEqual(region.GetNumDirtyPages(), 0) - region.Clear() + for test_region in test_regions: + region = lldb.SBMemoryRegionInfo() + err = process.GetMemoryRegionInfo(test_region.start_addr, region) + self.assertSuccess(err) + self.assertEqual(region.GetPageSize(), 4096) - # A memory region with dirty page information -- and zero dirty pages - err = process.GetMemoryRegionInfo(0x100000000, region) - self.assertSuccess(err) - self.assertTrue(region.HasDirtyMemoryPageList()) - self.assertEqual(region.GetNumDirtyPages(), 0) - self.assertEqual(region.GetPageSize(), 4096) - region.Clear() + if test_region.dirty_pages is None: + self.assertFalse(region.HasDirtyMemoryPageList()) + self.assertEqual(0, region.GetNumDirtyPages()) + else: + self.assertTrue(region.HasDirtyMemoryPageList()) + self.assertEqual( + len(test_region.dirty_pages), region.GetNumDirtyPages() + ) - # A memory region with one dirty page - err = process.GetMemoryRegionInfo(0x100004000, region) - self.assertSuccess(err) - self.assertTrue(region.HasDirtyMemoryPageList()) - self.assertEqual(region.GetNumDirtyPages(), 1) - self.assertEqual(region.GetDirtyPageAddressAtIndex(0), 0x100004000) - region.Clear() + for i, expected_dirty_page in enumerate(test_region.dirty_pages): + self.assertEqual( + expected_dirty_page, region.GetDirtyPageAddressAtIndex(i) + ) - # A memory region with multple dirty pages - err = 
process.GetMemoryRegionInfo(0x1000A2000, region) - self.assertSuccess(err) - self.assertTrue(region.HasDirtyMemoryPageList()) - self.assertEqual(region.GetNumDirtyPages(), 5) - self.assertEqual(region.GetDirtyPageAddressAtIndex(4), 0x1000A6000) - region.Clear() + substrs, matching = test_region.expected_command_output() + self.expect( + f"memory region 0x{test_region.start_addr:x}", + substrs=substrs, + matching=matching, + ) diff --git a/lldb/test/API/macosx/sme-registers/TestSMERegistersDarwin.py b/lldb/test/API/macosx/sme-registers/TestSMERegistersDarwin.py index 6f9d055cef506..c762c8da78ca8 100644 --- a/lldb/test/API/macosx/sme-registers/TestSMERegistersDarwin.py +++ b/lldb/test/API/macosx/sme-registers/TestSMERegistersDarwin.py @@ -1,6 +1,7 @@ import lldb from lldbsuite.test.lldbtest import * from lldbsuite.test.decorators import * +import lldbsuite.test.cpu_feature as cpu_feature import lldbsuite.test.lldbutil as lldbutil import os @@ -9,10 +10,9 @@ class TestSMERegistersDarwin(TestBase): NO_DEBUG_INFO_TESTCASE = True mydir = TestBase.compute_mydir(__file__) - @skipIfRemote @skipUnlessDarwin - @skipUnlessFeature("hw.optional.arm.FEAT_SME") - @skipUnlessFeature("hw.optional.arm.FEAT_SME2") + @skipUnlessFeature(cpu_feature.AArch64.SME) + @skipUnlessFeature(cpu_feature.AArch64.SME2) # thread_set_state/thread_get_state only avail in macOS 15.4+ @skipIf(macos_version=["<", "15.4"]) def test(self): diff --git a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py index 74743d9182ab4..c5a68372d8221 100644 --- a/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py +++ b/lldb/test/API/tools/lldb-dap/module/TestDAP_module.py @@ -7,7 +7,8 @@ import lldbdap_testcase import re - +# Flakey in Github CI runs, see https://github.com/llvm/llvm-project/issues/137660. 
+@skipIfLinux class TestDAP_module(lldbdap_testcase.DAPTestCaseBase): def run_test(self, symbol_basename, expect_debug_info_size): program_basename = "a.out.stripped" diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt index 8116f4c3c823a..513d1ec493ee1 100644 --- a/lldb/test/CMakeLists.txt +++ b/lldb/test/CMakeLists.txt @@ -164,9 +164,14 @@ if(TARGET clang) if (TARGET libcxx OR ("libcxx" IN_LIST LLVM_ENABLE_RUNTIMES)) set(LLDB_HAS_LIBCXX ON) if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND NOT APPLE) - set(LIBCXX_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}) + set(LIBCXX_TARGET_SUBDIR ${LLVM_DEFAULT_TARGET_TRIPLE}) + if(LIBCXX_LIBDIR_SUBDIR) + string(APPEND LIBCXX_TARGET_SUBDIR /${LIBCXX_LIBDIR_SUBDIR}) + endif() + cmake_path(NORMAL_PATH LIBCXX_TARGET_SUBDIR) + set(LIBCXX_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}/${LIBCXX_TARGET_SUBDIR}) set(LIBCXX_GENERATED_INCLUDE_DIR "${LLVM_BINARY_DIR}/include/c++/v1") - set(LIBCXX_GENERATED_INCLUDE_TARGET_DIR "${LLVM_BINARY_DIR}/include/${LLVM_DEFAULT_TARGET_TRIPLE}/c++/v1") + set(LIBCXX_GENERATED_INCLUDE_TARGET_DIR "${LLVM_BINARY_DIR}/include/${LIBCXX_TARGET_SUBDIR}/c++/v1") else() set(LIBCXX_LIBRARY_DIR ${CMAKE_BINARY_DIR}/lib${LIBCXX_LIBDIR_SUFFIX}) set(LIBCXX_GENERATED_INCLUDE_DIR "${CMAKE_BINARY_DIR}/include/c++/v1") diff --git a/lldb/test/Shell/ObjectFile/ELF/elf-no-shdrs-pt-notes.yaml b/lldb/test/Shell/ObjectFile/ELF/elf-no-shdrs-pt-notes.yaml new file mode 100644 index 0000000000000..1e9c5dfaeab1b --- /dev/null +++ b/lldb/test/Shell/ObjectFile/ELF/elf-no-shdrs-pt-notes.yaml @@ -0,0 +1,706 @@ +## This test verifies that loading an ELF file, that has no section headers but +## has a PT_NOTE program header with a GNU Build ID, can properly extract the +## UUID value. + +# RUN: yaml2obj %s -o %t +# RUN: llvm-strip --strip-sections %t + +# RUN: %lldb -b \ +# RUN: -o "target create -d '%t'" \ +# RUN: -o "image list" \ +# RUN: | FileCheck %s + +# CHECK: Current executable set to '{{.*}}elf-no-shdrs-pt-notes.yaml.tmp' (x86_64). 
+# CHECK: [ 0] 7F1F56D6-7DBB-17BA-C9A3-4417DB52F097-2548414F 0x0000000000000000 {{.*}}elf-no-shdrs-pt-notes.yaml.tmp + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 + Entry: 0x1040 +ProgramHeaders: + - Type: PT_PHDR + Flags: [ PF_R ] + VAddr: 0x40 + Align: 0x8 + Offset: 0x40 + - Type: PT_INTERP + Flags: [ PF_R ] + FirstSec: .interp + LastSec: .interp + VAddr: 0x318 + Offset: 0x318 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .interp + LastSec: .rela.plt + Align: 0x1000 + Offset: 0x0 + - Type: PT_LOAD + Flags: [ PF_X, PF_R ] + FirstSec: .init + LastSec: .fini + VAddr: 0x1000 + Align: 0x1000 + Offset: 0x1000 + - Type: PT_LOAD + Flags: [ PF_R ] + FirstSec: .rodata + LastSec: .eh_frame + VAddr: 0x2000 + Align: 0x1000 + Offset: 0x2000 + - Type: PT_LOAD + Flags: [ PF_W, PF_R ] + FirstSec: .init_array + LastSec: .bss + VAddr: 0x3DB0 + Align: 0x1000 + Offset: 0x2DB0 + - Type: PT_DYNAMIC + Flags: [ PF_W, PF_R ] + FirstSec: .dynamic + LastSec: .dynamic + VAddr: 0x3DC8 + Align: 0x8 + Offset: 0x2DC8 + - Type: PT_NOTE + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .note.gnu.property + VAddr: 0x338 + Align: 0x8 + Offset: 0x338 + - Type: PT_NOTE + Flags: [ PF_R ] + FirstSec: .note.gnu.build-id + LastSec: .note.ABI-tag + VAddr: 0x358 + Align: 0x4 + Offset: 0x358 + - Type: PT_GNU_PROPERTY + Flags: [ PF_R ] + FirstSec: .note.gnu.property + LastSec: .note.gnu.property + VAddr: 0x338 + Align: 0x8 + Offset: 0x338 + - Type: PT_GNU_EH_FRAME + Flags: [ PF_R ] + FirstSec: .eh_frame_hdr + LastSec: .eh_frame_hdr + VAddr: 0x2004 + Align: 0x4 + Offset: 0x2004 + - Type: PT_GNU_STACK + Flags: [ PF_W, PF_R ] + Align: 0x10 + Offset: 0x0 + - Type: PT_GNU_RELRO + Flags: [ PF_R ] + FirstSec: .init_array + LastSec: .got + VAddr: 0x3DB0 + Offset: 0x2DB0 +Sections: + - Name: .interp + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x318 + AddressAlign: 0x1 + Content: 2F6C696236342F6C642D6C696E75782D7838362D36342E736F2E3200 + - Name: .note.gnu.property + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x338 + AddressAlign: 0x8 + Notes: + - Name: GNU + Desc: 028000C0040000000300000000000000 + Type: NT_GNU_PROPERTY_TYPE_0 + - Name: .note.gnu.build-id + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x358 + AddressAlign: 0x4 + Notes: + - Name: GNU + Desc: 7F1F56D67DBB17BAC9A34417DB52F0972548414F + Type: NT_PRPSINFO + - Name: .note.ABI-tag + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Address: 0x37C + AddressAlign: 0x4 + Notes: + - Name: GNU + Desc: '00000000030000000200000000000000' + Type: NT_VERSION + - Name: .gnu.hash + Type: SHT_GNU_HASH + Flags: [ SHF_ALLOC ] + Address: 0x3A0 + Link: .dynsym + AddressAlign: 0x8 + Header: + SymNdx: 0x1 + Shift2: 0x0 + BloomFilter: [ 0x0 ] + HashBuckets: [ 0x0 ] + HashValues: [ ] + - Name: .dynsym + Type: SHT_DYNSYM + Flags: [ SHF_ALLOC ] + Address: 0x3C0 + Link: .dynstr + AddressAlign: 0x8 + - Name: .dynstr + Type: SHT_STRTAB + Flags: [ SHF_ALLOC ] + Address: 0x450 + AddressAlign: 0x1 + - Name: .gnu.version + Type: SHT_GNU_versym + Flags: [ SHF_ALLOC ] + Address: 0x500 + Link: .dynsym + AddressAlign: 0x2 + Entries: [ 0, 2, 3, 0, 0, 0 ] + - Name: .gnu.version_r + Type: SHT_GNU_verneed + Flags: [ SHF_ALLOC ] + Address: 0x510 + Link: .dynstr + AddressAlign: 0x8 + Dependencies: + - Version: 1 + File: libc.so.6 + Entries: + - Name: GLIBC_2.34 + Hash: 110530996 + Flags: 0 + Other: 3 + - Name: GLIBC_2.2.5 + Hash: 157882997 + Flags: 0 + Other: 2 + - Name: .rela.dyn + Type: SHT_RELA + Flags: [ SHF_ALLOC ] + Address: 0x540 + Link: 
.dynsym + AddressAlign: 0x8 + Relocations: + - Offset: 0x3DB0 + Type: R_X86_64_RELATIVE + Addend: 4384 + - Offset: 0x3DB8 + Type: R_X86_64_RELATIVE + Addend: 4320 + - Offset: 0x3DC0 + Type: R_X86_64_RELATIVE + Addend: 15808 + - Offset: 0x3FD8 + Symbol: __cxa_finalize + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FE0 + Symbol: __libc_start_main + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FE8 + Symbol: _ITM_deregisterTMCloneTable + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FF0 + Symbol: __gmon_start__ + Type: R_X86_64_GLOB_DAT + - Offset: 0x3FF8 + Symbol: _ITM_registerTMCloneTable + Type: R_X86_64_GLOB_DAT + - Name: .rela.plt + Type: SHT_RELA + Flags: [ SHF_ALLOC, SHF_INFO_LINK ] + Address: 0x600 + Link: .dynsym + AddressAlign: 0x8 + Info: .got.plt + Relocations: + - Offset: 0x4018 + Symbol: __cxa_finalize + Type: R_X86_64_JUMP_SLOT + - Name: .init + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1000 + AddressAlign: 0x4 + Offset: 0x1000 + Content: F30F1EFA4883EC08488B05E12F00004885C07402FFD04883C408C3 + - Name: .plt + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1020 + AddressAlign: 0x10 + EntSize: 0x10 + Content: FF35E22F0000FF25E42F00000F1F4000FF25E22F00006800000000E9E0FFFFFF + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1040 + AddressAlign: 0x10 + Content: F30F1EFA31ED4989D15E4889E24883E4F050544531C031C9488D3DD1000000FF157B2F0000F4662E0F1F840000000000488D3DB12F0000488D05AA2F00004839F87415488B055E2F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D812F0000488D357A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B052D2F00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D392F000000752B5548833DE22E0000004889E5740C488D3DBE2C0000E829FFFFFFE864FFFFFFC605112F0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFF0F1F8000000000554889E5C745FC0000000031C05DC3 + - Name: .fini + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x1140 + AddressAlign: 0x4 + Content: F30F1EFA4883EC084883C408C3 + - Name: .rodata + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_MERGE ] + Address: 0x2000 + AddressAlign: 0x4 + EntSize: 0x4 + Offset: 0x2000 + Content: '01000200' + - Name: .eh_frame_hdr + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x2004 + AddressAlign: 0x4 + Content: 011B033B20000000030000001CF0FFFF540000003CF0FFFF3C0000002CF1FFFF7C000000 + - Name: .eh_frame + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x2028 + AddressAlign: 0x8 + Content: 1400000000000000017A5200017810011B0C070890010000140000001C000000F8EFFFFF2600000000440710000000002400000034000000C0EFFFFF20000000000E10460E184A0F0B770880003F1A3B2A332422000000001C0000005C000000A8F0FFFF0F00000000410E108602430D064A0C070800000000000000 + - Name: .init_array + Type: SHT_INIT_ARRAY + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3DB0 + AddressAlign: 0x8 + EntSize: 0x8 + Offset: 0x2DB0 + Content: '2011000000000000' + - Name: .fini_array + Type: SHT_FINI_ARRAY + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3DB8 + AddressAlign: 0x8 + EntSize: 0x8 + Content: E010000000000000 + - Name: .data.rel.ro + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3DC0 + AddressAlign: 0x8 + Content: C03D000000000000 + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3DC8 + Link: .dynstr + AddressAlign: 0x8 + Entries: + - Tag: DT_NEEDED + Value: 0x67 + - Tag: DT_NEEDED + Value: 0x76 + - Tag: DT_NEEDED + Value: 0x80 + - Tag: DT_NEEDED + Value: 0x8E + - Tag: DT_INIT + Value: 0x1000 + - Tag: DT_FINI + Value: 
0x1140 + - Tag: DT_INIT_ARRAY + Value: 0x3DB0 + - Tag: DT_INIT_ARRAYSZ + Value: 0x8 + - Tag: DT_FINI_ARRAY + Value: 0x3DB8 + - Tag: DT_FINI_ARRAYSZ + Value: 0x8 + - Tag: DT_GNU_HASH + Value: 0x3A0 + - Tag: DT_STRTAB + Value: 0x450 + - Tag: DT_SYMTAB + Value: 0x3C0 + - Tag: DT_STRSZ + Value: 0xAF + - Tag: DT_SYMENT + Value: 0x18 + - Tag: DT_DEBUG + Value: 0x0 + - Tag: DT_PLTGOT + Value: 0x4000 + - Tag: DT_PLTRELSZ + Value: 0x18 + - Tag: DT_PLTREL + Value: 0x7 + - Tag: DT_JMPREL + Value: 0x600 + - Tag: DT_RELA + Value: 0x540 + - Tag: DT_RELASZ + Value: 0xC0 + - Tag: DT_RELAENT + Value: 0x18 + - Tag: DT_FLAGS_1 + Value: 0x8000000 + - Tag: DT_VERNEED + Value: 0x510 + - Tag: DT_VERNEEDNUM + Value: 0x1 + - Tag: DT_VERSYM + Value: 0x500 + - Tag: DT_RELACOUNT + Value: 0x3 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Tag: DT_NULL + Value: 0x0 + - Name: .got + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x3FD8 + AddressAlign: 0x8 + EntSize: 0x8 + Content: '00000000000000000000000000000000000000000000000000000000000000000000000000000000' + - Name: .got.plt + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4000 + AddressAlign: 0x8 + EntSize: 0x8 + Content: C83D000000000000000000000000000000000000000000003610000000000000 + - Name: .data + Type: SHT_PROGBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4020 + AddressAlign: 0x1 + Content: '00000000' + - Name: .bss + Type: SHT_NOBITS + Flags: [ SHF_WRITE, SHF_ALLOC ] + Address: 0x4024 + AddressAlign: 0x1 + Size: 0x4 + - Name: .comment + Type: SHT_PROGBITS + Flags: [ SHF_MERGE, SHF_STRINGS ] + AddressAlign: 0x1 + EntSize: 0x1 + Content: 4743433A2028474E55292031312E352E302032303234303731392028526564204861742031312E352E302D3929004743433A2028474E55292031312E352E302032303234303731392028526564204861742031312E352E302D3131290046616365626F6F6B20636C616E672076657273696F6E2031352E38302E31202868747470733A2F2F6769742E696E7465726E616C2E7466626E772E6E65742F7265706F732F6769742F726F2F6F736D6574612F65787465726E616C2F6C6C766D2D70726F6A65637420626632333164636436353637396532643466616461623562353363353264623734666237653133362900 + - Name: .annobin.notes + Type: SHT_PROGBITS + Flags: [ SHF_MERGE, SHF_STRINGS ] + AddressAlign: 0x1 + EntSize: 0x1 + Content: 41563A3470313239380052563A72756E6E696E67206763632031312E352E302032303234303731390042563A616E6E6F62696E206763632031312E352E302032303234303731390047573A307833643230353661202E2E2F737973646570732F7838362F6162692D6E6F74652E630053503A330053433A310043463A38202E2E2F737973646570732F7838362F6162692D6E6F74652E6300464C3A2D31202E2E2F737973646570732F7838362F6162692D6E6F74652E630047413A310050493A330053453A300069533A300047573A30783364323035366120696E69742E630043463A3820696E69742E6300464C3A2D3120696E69742E6300 + - Name: .gnu.build.attributes + Type: SHT_NOTE + Address: 0x6028 + AddressAlign: 0x4 + Notes: + - Name: "GA$\x013a1" + Desc: '40100000000000006610000000000000' + Type: NT_GNU_BUILD_ATTRIBUTE_OPEN + - Name: "GA$\x013a1" + Desc: '66100000000000006610000000000000' + Type: NT_GNU_BUILD_ATTRIBUTE_OPEN + - Name: "GA$\x013a1" + Desc: '00100000000000001610000000000000' + Type: NT_GNU_BUILD_ATTRIBUTE_OPEN + - Name: "GA$\x013a1" + Desc: '40110000000000004811000000000000' + Type: NT_GNU_BUILD_ATTRIBUTE_OPEN + - Name: "GA$\x013a1" + Desc: '70100000000000002911000000000000' + Type: NT_GNU_BUILD_ATTRIBUTE_OPEN + - Name: "GA$\x013a1" + Desc: 3F110000000000003F11000000000000 + Type: NT_GNU_BUILD_ATTRIBUTE_OPEN + - Name: "GA$\x013a1" + 
Desc: 3F110000000000003F11000000000000 + Type: NT_GNU_BUILD_ATTRIBUTE_OPEN + - Name: "GA$\x013a1" + Desc: 16100000000000001B10000000000000 + Type: NT_GNU_BUILD_ATTRIBUTE_OPEN + - Name: "GA$\x013a1" + Desc: 48110000000000004D11000000000000 + Type: NT_GNU_BUILD_ATTRIBUTE_OPEN +Symbols: + - Name: .interp + Type: STT_SECTION + Section: .interp + Value: 0x318 + - Name: .note.gnu.property + Type: STT_SECTION + Section: .note.gnu.property + Value: 0x338 + - Name: .note.gnu.build-id + Type: STT_SECTION + Section: .note.gnu.build-id + Value: 0x358 + - Name: .note.ABI-tag + Type: STT_SECTION + Section: .note.ABI-tag + Value: 0x37C + - Name: .gnu.hash + Type: STT_SECTION + Section: .gnu.hash + Value: 0x3A0 + - Name: .dynsym + Type: STT_SECTION + Section: .dynsym + Value: 0x3C0 + - Name: .dynstr + Type: STT_SECTION + Section: .dynstr + Value: 0x450 + - Name: .gnu.version + Type: STT_SECTION + Section: .gnu.version + Value: 0x500 + - Name: .gnu.version_r + Type: STT_SECTION + Section: .gnu.version_r + Value: 0x510 + - Name: .rela.dyn + Type: STT_SECTION + Section: .rela.dyn + Value: 0x540 + - Name: .rela.plt + Type: STT_SECTION + Section: .rela.plt + Value: 0x600 + - Name: .init + Type: STT_SECTION + Section: .init + Value: 0x1000 + - Name: .plt + Type: STT_SECTION + Section: .plt + Value: 0x1020 + - Name: .text + Type: STT_SECTION + Section: .text + Value: 0x1040 + - Name: .fini + Type: STT_SECTION + Section: .fini + Value: 0x1140 + - Name: .rodata + Type: STT_SECTION + Section: .rodata + Value: 0x2000 + - Name: .eh_frame_hdr + Type: STT_SECTION + Section: .eh_frame_hdr + Value: 0x2004 + - Name: .eh_frame + Type: STT_SECTION + Section: .eh_frame + Value: 0x2028 + - Name: .init_array + Type: STT_SECTION + Section: .init_array + Value: 0x3DB0 + - Name: .fini_array + Type: STT_SECTION + Section: .fini_array + Value: 0x3DB8 + - Name: .data.rel.ro + Type: STT_SECTION + Section: .data.rel.ro + Value: 0x3DC0 + - Name: .dynamic + Type: STT_SECTION + Section: .dynamic + Value: 0x3DC8 + - Name: .got + Type: STT_SECTION + Section: .got + Value: 0x3FD8 + - Name: .got.plt + Type: STT_SECTION + Section: .got.plt + Value: 0x4000 + - Name: .data + Type: STT_SECTION + Section: .data + Value: 0x4020 + - Name: .bss + Type: STT_SECTION + Section: .bss + Value: 0x4024 + - Name: .comment + Type: STT_SECTION + Section: .comment + - Name: .annobin.notes + Type: STT_SECTION + Section: .annobin.notes + - Name: .gnu.build.attributes + Type: STT_SECTION + Section: .gnu.build.attributes + Value: 0x6028 + - Name: '/usr/lib/gcc/x86_64-redhat-linux/11/../../../../lib64/Scrt1.o' + Type: STT_FILE + Index: SHN_ABS + - Name: __abi_tag + Type: STT_OBJECT + Section: .note.ABI-tag + Value: 0x37C + Size: 0x20 + - Name: crtstuff.c + Type: STT_FILE + Index: SHN_ABS + - Name: deregister_tm_clones + Type: STT_FUNC + Section: .text + Value: 0x1070 + - Name: register_tm_clones + Type: STT_FUNC + Section: .text + Value: 0x10A0 + - Name: __do_global_dtors_aux + Type: STT_FUNC + Section: .text + Value: 0x10E0 + - Name: completed.0 + Type: STT_OBJECT + Section: .bss + Value: 0x4024 + Size: 0x1 + - Name: __do_global_dtors_aux_fini_array_entry + Type: STT_OBJECT + Section: .fini_array + Value: 0x3DB8 + - Name: frame_dummy + Type: STT_FUNC + Section: .text + Value: 0x1120 + - Name: __frame_dummy_init_array_entry + Type: STT_OBJECT + Section: .init_array + Value: 0x3DB0 + - Name: main.cpp + Type: STT_FILE + Index: SHN_ABS + - Name: 'crtstuff.c (1)' + Type: STT_FILE + Index: SHN_ABS + - Name: __FRAME_END__ + Type: STT_OBJECT + Section: .eh_frame + Value: 
0x20A0 + - Type: STT_FILE + Index: SHN_ABS + - Name: __GNU_EH_FRAME_HDR + Section: .eh_frame_hdr + Value: 0x2004 + - Name: _DYNAMIC + Type: STT_OBJECT + Section: .dynamic + Value: 0x3DC8 + - Name: _GLOBAL_OFFSET_TABLE_ + Type: STT_OBJECT + Section: .got.plt + Value: 0x4000 + - Name: _edata + Section: .data + Binding: STB_GLOBAL + Value: 0x4024 + - Name: data_start + Section: .data + Binding: STB_WEAK + Value: 0x4020 + - Name: _IO_stdin_used + Type: STT_OBJECT + Section: .rodata + Binding: STB_GLOBAL + Value: 0x2000 + Size: 0x4 + - Name: '__cxa_finalize@GLIBC_2.2.5' + Type: STT_FUNC + Binding: STB_WEAK + - Name: main + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1130 + Size: 0xF + - Name: __dso_handle + Type: STT_OBJECT + Section: .data.rel.ro + Binding: STB_GLOBAL + Value: 0x3DC0 + Other: [ STV_HIDDEN ] + - Name: _fini + Type: STT_FUNC + Section: .fini + Binding: STB_GLOBAL + Value: 0x1140 + Other: [ STV_HIDDEN ] + - Name: '__libc_start_main@GLIBC_2.34' + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: _start + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x1040 + Size: 0x26 + - Name: _init + Type: STT_FUNC + Section: .init + Binding: STB_GLOBAL + Value: 0x1000 + Other: [ STV_HIDDEN ] + - Name: __TMC_END__ + Type: STT_OBJECT + Section: .data + Binding: STB_GLOBAL + Value: 0x4028 + Other: [ STV_HIDDEN ] + - Name: __data_start + Section: .data + Binding: STB_GLOBAL + Value: 0x4020 + - Name: _end + Section: .bss + Binding: STB_GLOBAL + Value: 0x4028 + - Name: __bss_start + Section: .bss + Binding: STB_GLOBAL + Value: 0x4024 + - Name: _ITM_deregisterTMCloneTable + Binding: STB_WEAK + - Name: __gmon_start__ + Binding: STB_WEAK + - Name: _ITM_registerTMCloneTable + Binding: STB_WEAK +DynamicSymbols: + - Name: __cxa_finalize + Type: STT_FUNC + Binding: STB_WEAK + - Name: __libc_start_main + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: _ITM_deregisterTMCloneTable + Binding: STB_WEAK + - Name: __gmon_start__ + Binding: STB_WEAK + - Name: _ITM_registerTMCloneTable + Binding: STB_WEAK +... diff --git a/lldb/test/Shell/SymbolFile/DWARF/union-types-no-member-location.yaml b/lldb/test/Shell/SymbolFile/DWARF/union-types-no-member-location.yaml new file mode 100644 index 0000000000000..fbdc626ed113f --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/union-types-no-member-location.yaml @@ -0,0 +1,182 @@ +# This test produces DWARF that contains a union type whose DW_TAG_member does +# not have a DW_AT_data_member_location set to zero. This is how GCC emits +# debug information for unions. There was code in the DWARFASTParserClang that +# was emitting an invalid error in this case. This test verifies that this +# error does not get emitted. 
+# +# 0x0000000b: DW_TAG_compile_unit +# DW_AT_name ("main.cpp") +# DW_AT_language (DW_LANG_C) +# +# 0x00000011: DW_TAG_base_type +# DW_AT_name ("int") +# DW_AT_encoding (DW_ATE_signed_char) +# DW_AT_byte_size (0x04) +# +# 0x00000018: DW_TAG_base_type +# DW_AT_name ("__ARRAY_SIZE_TYPE__") +# DW_AT_encoding (DW_ATE_unsigned) +# DW_AT_byte_size (0x08) +# +# 0x0000001f: DW_TAG_array_type +# DW_AT_type (0x00000011 "int") +# +# 0x00000024: DW_TAG_subrange_type +# DW_AT_type (0x00000018 "__ARRAY_SIZE_TYPE__") +# DW_AT_count (0x20) +# +# 0x0000002a: NULL +# +# 0x0000002b: DW_TAG_union_type +# DW_AT_name ("UnionType") +# DW_AT_byte_size (0x20) +# +# 0x00000031: DW_TAG_member +# DW_AT_name ("array") +# DW_AT_type (0x0000001f "int[32]") +# +# 0x0000003a: NULL +# +# 0x0000003b: DW_TAG_subprogram +# DW_AT_low_pc (0x0000000000001000) +# DW_AT_high_pc (0x0000000000001050) +# DW_AT_name ("foo") +# DW_AT_type (0x00000031 "array") +# +# 0x00000054: NULL + +# RUN: yaml2obj %s > %t +# RUN: lldb-test symbols --name=UnionType --find=type %t > %t.stdout +# RUN: cat %t.stdout | FileCheck --check-prefix=STDOUT %s +# RUN: lldb-test symbols --name=UnionType --find=type %t 2> %t.stderr +# RUN: cat %t.stderr | FileCheck --allow-empty --check-prefix=STDERR %s + +# STDOUT: Found 1 types: +# STDOUT: {{(0x)?[0-9a-fA-F]+}}: Type{0x0000002b} , name = "UnionType", size = 32, compiler_type = 0x{{[0-9a-fA-F]+}} union UnionType { + +# STDERR-NOT: error: union-types-no-member-location.yaml.tmp 0x00000031: DW_TAG_member 'array' refers to type 0x000000000000001f which extends beyond the bounds of 0x0000002b + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +DWARF: + debug_str: + - '' + - main.cpp + - int + - __ARRAY_SIZE_TYPE__ + - UnionType + - array + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_language + Form: DW_FORM_udata + - Code: 0x2 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Code: 0x3 + Tag: DW_TAG_array_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x4 + Tag: DW_TAG_subrange_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_count + Form: DW_FORM_data1 + - Code: 0x5 + Tag: DW_TAG_union_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Code: 0x6 + Tag: DW_TAG_member + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x7 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_addr + - Attribute: DW_AT_name + Form: DW_FORM_string + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + debug_info: + - Length: 0x51 + Version: 4 + AbbrevTableID: 0 + AbbrOffset: 0x0 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + Values: + - Value: 0x1 + - Value: 0x2 + - AbbrCode: 0x2 + Values: + - Value: 0xA + - Value: 0x6 + - Value: 0x4 + - AbbrCode: 0x2 + Values: + - Value: 0xE + - Value: 0x7 + - Value: 0x8 + - AbbrCode: 0x3 + Values: + - Value: 0x11 + - AbbrCode: 0x4 + 
Values: + - Value: 0x18 + - Value: 0x20 + - AbbrCode: 0x0 + - AbbrCode: 0x5 + Values: + - Value: 0x22 + - Value: 0x20 + - AbbrCode: 0x6 + Values: + - Value: 0x2C + - Value: 0x1F + - AbbrCode: 0x0 + - AbbrCode: 0x7 + Values: + - Value: 0x1000 + - Value: 0x1050 + - Value: 0xDEADBEEFDEADBEEF + CStr: foo + - Value: 0x31 + - AbbrCode: 0x0 +... diff --git a/lldb/test/Shell/SymbolFile/NativePDB/native-setting.cpp b/lldb/test/Shell/SymbolFile/NativePDB/native-setting.cpp new file mode 100644 index 0000000000000..dc26ec8d30cb4 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/NativePDB/native-setting.cpp @@ -0,0 +1,71 @@ +// REQUIRES: !diasdk, target-windows + +// Test plugin.symbol-file.pdb.reader setting without the DIA SDK +// RUN: %build -o %t.exe -- %s +// RUN: env -u LLDB_USE_NATIVE_PDB_READER %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=NO-ENV %s +// RUN: env LLDB_USE_NATIVE_PDB_READER= %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=NO-ENV %s + +// RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s + +// RUN: env LLDB_USE_NATIVE_PDB_READER=foo %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=42 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=-1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s + +// RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb \ +// RUN: -o 'settings set plugin.symbol-file.pdb.reader dia' \ +// RUN: -o 'target create %t.exe' \ +// RUN: -o 'target modules dump symfile' \ +// RUN: 2>&1 | FileCheck --check-prefix=ENV0-SET-DIA %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb \ +// RUN: -o 'settings set plugin.symbol-file.pdb.reader dia' \ +// RUN: -o 'target create %t.exe' \ +// RUN: -o 'target modules dump symfile' \ +// RUN: 2>&1 | FileCheck --check-prefix=ENV1-SET-DIA %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb \ +// RUN: -o 'settings set plugin.symbol-file.pdb.reader native' \ +// RUN: -o 'target create %t.exe' \ +// RUN: -o 'target modules dump symfile' \ +// RUN: 2>&1 | FileCheck --check-prefix=ENV0-SET-NATIVE %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb \ +// RUN: -o 'settings set plugin.symbol-file.pdb.reader native' \ +// RUN: -o 'target create %t.exe' \ +// RUN: -o 'target modules dump symfile' \ +// RUN: 2>&1 | FileCheck --check-prefix=ENV1-SET-NATIVE %s + +// NO-ENV-NOT: warning: +// NO-ENV: (lldb) target modules dump symfile +// NO-ENV: Dumping debug symbols for 1 modules. +// NO-ENV: SymbolFile native-pdb + +// ENV0: warning: the DIA PDB reader was explicitly requested, but LLDB was built without the DIA SDK. The native reader will be used instead +// ENV0: (lldb) target modules dump symfile +// ENV0: Dumping debug symbols for 1 modules. +// ENV0: SymbolFile native-pdb + +// ENV1-NOT: warning: +// ENV1: (lldb) target modules dump symfile +// ENV1: Dumping debug symbols for 1 modules. +// ENV1: SymbolFile native-pdb + +// ENV0-SET-DIA: warning: the DIA PDB reader was explicitly requested, but LLDB was built without the DIA SDK. The native reader will be used instead +// ENV0-SET-DIA: (lldb) target modules dump symfile +// ENV0-SET-DIA: Dumping debug symbols for 1 modules. 
+// ENV0-SET-DIA: SymbolFile native-pdb + +// ENV1-SET-DIA: warning: the DIA PDB reader was explicitly requested, but LLDB was built without the DIA SDK. The native reader will be used instead +// ENV1-SET-DIA: (lldb) target modules dump symfile +// ENV1-SET-DIA: Dumping debug symbols for 1 modules. +// ENV1-SET-DIA: SymbolFile native-pdb + +// ENV1-SET-NATIVE-NOT: warning: +// ENV0-SET-NATIVE: (lldb) target modules dump symfile +// ENV0-SET-NATIVE: Dumping debug symbols for 1 modules. +// ENV0-SET-NATIVE: SymbolFile native-pdb + +// ENV1-SET-NATIVE-NOT: warning: +// ENV1-SET-NATIVE: (lldb) target modules dump symfile +// ENV1-SET-NATIVE: Dumping debug symbols for 1 modules. +// ENV1-SET-NATIVE: SymbolFile native-pdb + +int main() {} diff --git a/lldb/test/Shell/SymbolFile/NativePDB/udt-layout.test b/lldb/test/Shell/SymbolFile/NativePDB/udt-layout.test new file mode 100644 index 0000000000000..6e971541de60c --- /dev/null +++ b/lldb/test/Shell/SymbolFile/NativePDB/udt-layout.test @@ -0,0 +1,129 @@ +# REQUIRES: target-windows + +# Test UDT layout reconstruction +# RUN: split-file %s %t +# RUN: %build --compiler=clang-cl -o %t.exe -- %t/main.cpp +# RUN: %lldb -f %t.exe -s %t/commands.input 2>&1 | FileCheck %s + +#--- main.cpp + +// this is from the DIA plugin (UdtLayoutTest.cpp) +struct A { + explicit A(int u) { _u._u3 = u; } + A(const A &) = default; + virtual ~A() = default; + +private: + union U { + char _u1; + short _u2; + int _u3; + }; + + A::U _u; +}; + +#pragma pack(push, 1) +template struct B : public virtual A { + B(char a, unsigned short b, int c) : A(a + b + c), _a(a), _b(b), _c(c) {} + +private: + char _a; + unsigned short : 3; + unsigned short _b : 6; + unsigned short : 4; + int _c; +}; +#pragma pack(pop) + +#pragma pack(push, 16) +class C : private virtual B<0>, public virtual B<1>, private B<2>, public B<3> { +public: + C(char x, char y, char z) + : A(x - y + z), B<0>(x, y, z), B<1>(x * 2, y * 2, z * 2), + B<2>(x * 3, y * 3, z * 3), B<3>(x * 4, y * 4, z * 4), _x(x * 5), + _y(y * 5), _z(z * 5) {} + + static int abc; + +private: + int _x; + short _y; + char _z; +}; +int C::abc = 123; +#pragma pack(pop) + +class List { +public: + List() = default; + List(List *p, List *n, C v) : Prev(p), Next(n), Value(v) {} + +private: + List *Prev = nullptr; + List *Next = nullptr; + C Value{1, 2, 3}; +}; + +int main() { + List ls[16]; + return 0; // break here +} + +#--- commands.input + +settings set target.max-children-depth 10 +br set -p "break here" +run +target variable +frame variable +quit + +# CHECK: (int) ::C::abc = 123 + +# CHECK: (List[16]) ls = { +# CHECK: [15] = { +# CHECK-NEXT: Prev = nullptr +# CHECK-NEXT: Next = nullptr +# CHECK-NEXT: Value = { +# CHECK-NEXT: B<2> = { +# CHECK-NEXT: A = { +# CHECK-NEXT: _u = (_u1 = '\x02', _u2 = 2, _u3 = 2) +# CHECK-NEXT: } +# CHECK-NEXT: _a = '\x03' +# CHECK-NEXT: _b = 6 +# CHECK-NEXT: _c = 9 +# CHECK-NEXT: } +# CHECK-NEXT: B<3> = { +# CHECK-NEXT: A = { +# CHECK-NEXT: _u = (_u1 = '\x02', _u2 = 2, _u3 = 2) +# CHECK-NEXT: } +# CHECK-NEXT: _a = '\x04' +# CHECK-NEXT: _b = 8 +# CHECK-NEXT: _c = 12 +# CHECK-NEXT: } +# CHECK-NEXT: A = { +# CHECK-NEXT: _u = (_u1 = '\x02', _u2 = 2, _u3 = 2) +# CHECK-NEXT: } +# CHECK-NEXT: B<0> = { +# CHECK-NEXT: A = { +# CHECK-NEXT: _u = (_u1 = '\x02', _u2 = 2, _u3 = 2) +# CHECK-NEXT: } +# CHECK-NEXT: _a = '\x01' +# CHECK-NEXT: _b = 2 +# CHECK-NEXT: _c = 3 +# CHECK-NEXT: } +# CHECK-NEXT: B<1> = { +# CHECK-NEXT: A = { +# CHECK-NEXT: _u = (_u1 = '\x02', _u2 = 2, _u3 = 2) +# CHECK-NEXT: } +# CHECK-NEXT: _a = '\x02' +# 
CHECK-NEXT: _b = 4 +# CHECK-NEXT: _c = 6 +# CHECK-NEXT: } +# CHECK-NEXT: _x = 5 +# CHECK-NEXT: _y = 10 +# CHECK-NEXT: _z = '\x0f' +# CHECK-NEXT: } +# CHECK-NEXT: } +# CHECK-NEXT: } diff --git a/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp b/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp index a3077252f08f1..f5e54592b0b31 100644 --- a/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp +++ b/lldb/test/Shell/SymbolFile/PDB/native-setting.cpp @@ -2,49 +2,68 @@ // Test plugin.symbol-file.pdb.reader setting // RUN: %build -o %t.exe -- %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb %t.exe -o 'target modules dump symfile' | FileCheck --check-prefix=ENV0 %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb %t.exe -o 'target modules dump symfile' | FileCheck --check-prefix=ENV1 %s +// RUN: env -u LLDB_USE_NATIVE_PDB_READER %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=NO-ENV %s +// RUN: env LLDB_USE_NATIVE_PDB_READER= %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=NO-ENV %s + +// RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV1 %s + +// RUN: env LLDB_USE_NATIVE_PDB_READER=foo %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=42 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=-1 %lldb %t.exe -o 'target modules dump symfile' 2>&1 | FileCheck --check-prefix=ENV0 %s + // RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb \ // RUN: -o 'settings set plugin.symbol-file.pdb.reader dia' \ // RUN: -o 'target create %t.exe' \ // RUN: -o 'target modules dump symfile' \ -// RUN: | FileCheck --check-prefix=ENV0-SET-DIA %s +// RUN: 2>&1 | FileCheck --check-prefix=ENV0-SET-DIA %s // RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb \ // RUN: -o 'settings set plugin.symbol-file.pdb.reader dia' \ // RUN: -o 'target create %t.exe' \ // RUN: -o 'target modules dump symfile' \ -// RUN: | FileCheck --check-prefix=ENV1-SET-DIA %s +// RUN: 2>&1 | FileCheck --check-prefix=ENV1-SET-DIA %s // RUN: env LLDB_USE_NATIVE_PDB_READER=0 %lldb \ // RUN: -o 'settings set plugin.symbol-file.pdb.reader native' \ // RUN: -o 'target create %t.exe' \ // RUN: -o 'target modules dump symfile' \ -// RUN: | FileCheck --check-prefix=ENV0-SET-NATIVE %s +// RUN: 2>&1 | FileCheck --check-prefix=ENV0-SET-NATIVE %s // RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb \ // RUN: -o 'settings set plugin.symbol-file.pdb.reader native' \ // RUN: -o 'target create %t.exe' \ // RUN: -o 'target modules dump symfile' \ -// RUN: | FileCheck --check-prefix=ENV1-SET-NATIVE %s +// RUN: 2>&1 | FileCheck --check-prefix=ENV1-SET-NATIVE %s + +// NO-ENV-NOT: warning: +// NO-ENV: (lldb) target modules dump symfile +// NO-ENV: Dumping debug symbols for 1 modules. +// NO-ENV: SymbolFile pdb +// ENV0-NOT: warning: // ENV0: (lldb) target modules dump symfile // ENV0: Dumping debug symbols for 1 modules. // ENV0: SymbolFile pdb +// ENV1-NOT: warning: // ENV1: (lldb) target modules dump symfile // ENV1: Dumping debug symbols for 1 modules. // ENV1: SymbolFile native-pdb +// ENV0-SET-DIA-NOT: warning: // ENV0-SET-DIA: (lldb) target modules dump symfile // ENV0-SET-DIA: Dumping debug symbols for 1 modules. 
// ENV0-SET-DIA: SymbolFile pdb +// ENV1-SET-DIA-NOT: warning: // ENV1-SET-DIA: (lldb) target modules dump symfile // ENV1-SET-DIA: Dumping debug symbols for 1 modules. // ENV1-SET-DIA: SymbolFile pdb +// ENV0-SET-NATIVE-NOT: warning: // ENV0-SET-NATIVE: (lldb) target modules dump symfile // ENV0-SET-NATIVE: Dumping debug symbols for 1 modules. // ENV0-SET-NATIVE: SymbolFile native-pdb +// ENV1-SET-NATIVE-NOT: warning: // ENV1-SET-NATIVE: (lldb) target modules dump symfile // ENV1-SET-NATIVE: Dumping debug symbols for 1 modules. // ENV1-SET-NATIVE: SymbolFile native-pdb diff --git a/lldb/test/Shell/SymbolFile/PDB/udt-layout.test b/lldb/test/Shell/SymbolFile/PDB/udt-layout.test index bc68539e25ec1..619646b3f12ba 100644 --- a/lldb/test/Shell/SymbolFile/PDB/udt-layout.test +++ b/lldb/test/Shell/SymbolFile/PDB/udt-layout.test @@ -1,4 +1,4 @@ -REQUIRES: target-windows, lld +REQUIRES: target-windows, lld, diasdk RUN: %build --compiler=clang-cl --output=%t.exe %S/Inputs/UdtLayoutTest.cpp RUN: %lldb -b -s %S/Inputs/UdtLayoutTest.script -- %t.exe | FileCheck %s diff --git a/lldb/tools/lldb-dap/package-lock.json b/lldb/tools/lldb-dap/package-lock.json index f3ae6b76be6d0..826f29f70106c 100644 --- a/lldb/tools/lldb-dap/package-lock.json +++ b/lldb/tools/lldb-dap/package-lock.json @@ -8,6 +8,9 @@ "name": "lldb-dap", "version": "0.2.16", "license": "Apache 2.0 License with LLVM exceptions", + "dependencies": { + "chokidar": "^4.0.3" + }, "devDependencies": { "@types/node": "^18.19.41", "@types/tabulator-tables": "^6.2.10", @@ -1301,6 +1304,21 @@ "url": "https://github.com/sponsors/fb55" } }, + "node_modules/chokidar": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-4.0.3.tgz", + "integrity": "sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==", + "license": "MIT", + "dependencies": { + "readdirp": "^4.0.1" + }, + "engines": { + "node": ">= 14.16.0" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + } + }, "node_modules/chownr": { "version": "1.1.4", "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", @@ -2746,6 +2764,19 @@ "node": ">= 6" } }, + "node_modules/readdirp": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-4.1.2.tgz", + "integrity": "sha512-GDhwkLfywWL2s6vEjyhri+eXmfH6j1L7JE27WhqLeYzoh/A3DBaYGEj2H/HFZCn/kMfim73FXxEJTw06WtxQwg==", + "license": "MIT", + "engines": { + "node": ">= 14.18.0" + }, + "funding": { + "type": "individual", + "url": "https://paulmillr.com/funding/" + } + }, "node_modules/safe-buffer": { "version": "5.2.1", "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", diff --git a/lldb/tools/lldb-dap/package.json b/lldb/tools/lldb-dap/package.json index 6566ba3bdee13..e961c2e48b258 100644 --- a/lldb/tools/lldb-dap/package.json +++ b/lldb/tools/lldb-dap/package.json @@ -1,7 +1,7 @@ { "name": "lldb-dap", "displayName": "LLDB DAP", - "version": "0.2.16", + "version": "0.2.18", "publisher": "llvm-vs-code-extensions", "homepage": "https://lldb.llvm.org", "description": "Debugging with LLDB in Visual Studio Code", @@ -27,6 +27,9 @@ "categories": [ "Debuggers" ], + "dependencies": { + "chokidar": "^4.0.3" + }, "devDependencies": { "@types/node": "^18.19.41", "@types/tabulator-tables": "^6.2.10", @@ -46,13 +49,14 @@ ], "main": "./out/extension", "scripts": { + "bundle-extension": "npx tsc -p ./ --noEmit && npx esbuild src-ts/extension.ts --bundle --outfile=out/extension.js --external:vscode --format=cjs 
--platform=node --target=node22 --minify", "bundle-symbols-table-view": "npx tsc -p src-ts/webview --noEmit && npx esbuild src-ts/webview/symbols-table-view.ts --bundle --format=iife --outdir=./out/webview", "bundle-tabulator": "cp node_modules/tabulator-tables/dist/js/tabulator.min.js ./out/webview/ && cp node_modules/tabulator-tables/dist/css/tabulator_midnight.min.css ./out/webview/ && cp node_modules/tabulator-tables/dist/css/tabulator_simple.min.css ./out/webview/", "bundle-webview": "npm run bundle-symbols-table-view && npm run bundle-tabulator", - "vscode:prepublish": "npm run bundle-webview && tsc -p ./", + "vscode:prepublish": "npm run bundle-webview && npm run bundle-extension", "watch": "npm run bundle-webview && tsc -watch -p ./", "format": "npx prettier './src-ts/' --write", - "package": "rm -rf ./out/lldb-dap.vsix && vsce package --out ./out/lldb-dap.vsix", + "package": "rm -rf ./out && vsce package --out ./out/lldb-dap.vsix", "publish": "vsce publish", "vscode-uninstall": "code --uninstall-extension llvm-vs-code-extensions.lldb-dap", "vscode-install": "code --install-extension ./out/lldb-dap.vsix" @@ -347,6 +351,9 @@ { "language": "objective-c" }, + { + "language": "objective-cpp" + }, { "language": "objectpascal" }, @@ -375,6 +382,7 @@ "fortran-modern", "nim", "objective-c", + "objective-cpp", "objectpascal", "pascal", "rust", diff --git a/lldb/tools/lldb-dap/src-ts/lldb-dap-server.ts b/lldb/tools/lldb-dap/src-ts/lldb-dap-server.ts index 280a11d807f6a..4e348965930d9 100644 --- a/lldb/tools/lldb-dap/src-ts/lldb-dap-server.ts +++ b/lldb/tools/lldb-dap/src-ts/lldb-dap-server.ts @@ -1,4 +1,6 @@ +import { FSWatcher, watch as chokidarWatch } from 'chokidar'; import * as child_process from "node:child_process"; +import * as path from "path"; import { isDeepStrictEqual } from "util"; import * as vscode from "vscode"; @@ -12,6 +14,10 @@ export class LLDBDapServer implements vscode.Disposable { private serverProcess?: child_process.ChildProcessWithoutNullStreams; private serverInfo?: Promise<{ host: string; port: number }>; private serverSpawnInfo?: string[]; + // Detects changes to the lldb-dap executable file since the server's startup. + private serverFileWatcher?: FSWatcher; + // Indicates whether the lldb-dap executable file has changed since the server's startup. 
+ private serverFileChanged?: boolean; constructor() { vscode.commands.registerCommand( @@ -83,6 +89,11 @@ export class LLDBDapServer implements vscode.Disposable { }); this.serverProcess = process; this.serverSpawnInfo = this.getSpawnInfo(dapPath, dapArgs, options?.env); + this.serverFileChanged = false; + this.serverFileWatcher = chokidarWatch(dapPath); + this.serverFileWatcher + .on('change', () => this.serverFileChanged = true) + .on('unlink', () => this.serverFileChanged = true); }); return this.serverInfo; } @@ -100,21 +111,27 @@ export class LLDBDapServer implements vscode.Disposable { args: string[], env: NodeJS.ProcessEnv | { [key: string]: string } | undefined, ): Promise { - if (!this.serverProcess || !this.serverInfo || !this.serverSpawnInfo) { + if ( + !this.serverProcess || + !this.serverInfo || + !this.serverSpawnInfo || + !this.serverFileWatcher || + this.serverFileChanged === undefined + ) { return true; } - const newSpawnInfo = this.getSpawnInfo(dapPath, args, env); - if (isDeepStrictEqual(this.serverSpawnInfo, newSpawnInfo)) { - return true; - } + const changeTLDR = []; + const changeDetails = []; - const userInput = await vscode.window.showInformationMessage( - "The arguments to lldb-dap have changed. Would you like to restart the server?", - { - modal: true, - detail: `An existing lldb-dap server (${this.serverProcess.pid}) is running with different arguments. + if (this.serverFileChanged) { + changeTLDR.push("an old binary"); + } + const newSpawnInfo = this.getSpawnInfo(dapPath, args, env); + if (!isDeepStrictEqual(this.serverSpawnInfo, newSpawnInfo)) { + changeTLDR.push("different arguments"); + changeDetails.push(` The previous lldb-dap server was started with: ${this.serverSpawnInfo.join(" ")} @@ -122,7 +139,22 @@ ${this.serverSpawnInfo.join(" ")} The new lldb-dap server will be started with: ${newSpawnInfo.join(" ")} +` + ); + } + + // If the server hasn't changed, continue startup without killing it. + if (changeTLDR.length === 0) { + return true; + } + // The server has changed. Prompt the user to restart it. + const userInput = await vscode.window.showInformationMessage( + "The lldb-dap server has changed. Would you like to restart the server?", + { + modal: true, + detail: `An existing lldb-dap server (${this.serverProcess.pid}) is running with ${changeTLDR.map(s => `*${s}*`).join(" and ")}. 
+${changeDetails.join("\n")} Restarting the server will interrupt any existing debug sessions and start a new server.`, }, "Restart", @@ -130,9 +162,7 @@ Restarting the server will interrupt any existing debug sessions and start a new ); switch (userInput) { case "Restart": - this.serverProcess.kill(); - this.serverProcess = undefined; - this.serverInfo = undefined; + this.dispose(); return true; case "Use Existing": return true; @@ -156,6 +186,10 @@ Restarting the server will interrupt any existing debug sessions and start a new if (this.serverProcess === process) { this.serverProcess = undefined; this.serverInfo = undefined; + this.serverSpawnInfo = undefined; + this.serverFileWatcher?.close(); + this.serverFileWatcher = undefined; + this.serverFileChanged = undefined; } } diff --git a/lldb/unittests/Host/posix/HostTest.cpp b/lldb/unittests/Host/posix/HostTest.cpp index 082edccf4e774..dc75b288ba76a 100644 --- a/lldb/unittests/Host/posix/HostTest.cpp +++ b/lldb/unittests/Host/posix/HostTest.cpp @@ -15,6 +15,10 @@ #include #include +#ifdef __linux__ +#include <linux/version.h> +#endif // __linux__ + using namespace lldb_private; namespace { @@ -116,7 +120,12 @@ TEST_F(HostTest, GetProcessInfoSetsPriority) { ASSERT_TRUE(Info.IsZombie().has_value()); ASSERT_FALSE(Info.IsZombie().value()); + // CoreDumping was added in kernel version 4.15. +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 15, 0) ASSERT_TRUE(Info.IsCoreDumping().has_value()); ASSERT_FALSE(Info.IsCoreDumping().value()); +#else + ASSERT_FALSE(Info.IsCoreDumping().has_value()); +#endif } #endif diff --git a/lldb/unittests/Instruction/CMakeLists.txt b/lldb/unittests/Instruction/CMakeLists.txt index 10385377923ba..975923fdbb371 100644 --- a/lldb/unittests/Instruction/CMakeLists.txt +++ b/lldb/unittests/Instruction/CMakeLists.txt @@ -2,7 +2,7 @@ add_lldb_unittest(EmulatorTests ARM64/TestAArch64Emulator.cpp LoongArch/TestLoongArchEmulator.cpp RISCV/TestRISCVEmulator.cpp - + LINK_COMPONENTS Support LINK_LIBS @@ -13,3 +13,25 @@ add_lldb_unittest(EmulatorTests lldbPluginInstructionLoongArch lldbPluginInstructionRISCV ) + +# Only add RISCV emulator tests if RISCV is in the list of LLVM targets to build. +# This is necessary because some buildbots (e.g., X86-only machines) +# do not have RISCV support enabled. Without this check, the RISCV tests would be added, +# causing build failures. +if ("RISCV" IN_LIST LLVM_TARGETS_TO_BUILD) + add_lldb_unittest(RISCVEmulatorTests + RISCV/TestRiscvInstEmulation.cpp + + LINK_COMPONENTS + Support + ${LLVM_TARGETS_TO_BUILD} + LINK_LIBS + lldbCore + lldbSymbol + lldbTarget + lldbPluginInstructionRISCV + lldbPluginDisassemblerLLVMC + lldbPluginUnwindAssemblyInstEmulation + lldbPluginProcessUtility + ) +endif() diff --git a/lldb/unittests/Instruction/RISCV/TestRiscvInstEmulation.cpp b/lldb/unittests/Instruction/RISCV/TestRiscvInstEmulation.cpp new file mode 100644 index 0000000000000..009b9cdc79607 --- /dev/null +++ b/lldb/unittests/Instruction/RISCV/TestRiscvInstEmulation.cpp @@ -0,0 +1,188 @@ +//===-- TestRiscvInstEmulation.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "gtest/gtest.h" + +#include "Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.h" + +#include "lldb/Core/AddressRange.h" +#include "lldb/Symbol/UnwindPlan.h" +#include "lldb/Utility/ArchSpec.h" + +#include "Plugins/Disassembler/LLVMC/DisassemblerLLVMC.h" +#include "Plugins/Instruction/RISCV/EmulateInstructionRISCV.h" +#include "Plugins/Process/Utility/lldb-riscv-register-enums.h" +#include "llvm/Support/TargetSelect.h" + +using namespace lldb; +using namespace lldb_private; + +class TestRiscvInstEmulation : public testing::Test { +public: + static void SetUpTestCase(); + static void TearDownTestCase(); + +protected: +}; + +void TestRiscvInstEmulation::SetUpTestCase() { + llvm::InitializeAllTargets(); + llvm::InitializeAllAsmPrinters(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllDisassemblers(); + DisassemblerLLVMC::Initialize(); + EmulateInstructionRISCV::Initialize(); +} + +void TestRiscvInstEmulation::TearDownTestCase() { + DisassemblerLLVMC::Terminate(); + EmulateInstructionRISCV::Terminate(); +} + +TEST_F(TestRiscvInstEmulation, TestSimpleRiscvFunction) { + ArchSpec arch("riscv64-unknown-linux-gnu"); + // Enable compressed instruction support (RVC extension). + arch.SetFlags(ArchSpec::eRISCV_rvc); + std::unique_ptr<UnwindAssemblyInstEmulation> engine( + static_cast<UnwindAssemblyInstEmulation *>( + UnwindAssemblyInstEmulation::CreateInstance(arch))); + ASSERT_NE(nullptr, engine); + + // RISC-V function with compressed and uncompressed instructions + // 0x0000: 1141 addi sp, sp, -0x10 + // 0x0002: e406 sd ra, 0x8(sp) + // 0x0004: e022 sd s0, 0x0(sp) + // 0x0006: 0800 addi s0, sp, 0x10 + // 0x0008: 00000537 lui a0, 0x0 + // 0x000C: 00050513 mv a0, a0 + // 0x0010: 00000097 auipc ra, 0x0 + // 0x0014: 000080e7 jalr ra + // 0x0018: 4501 li a0, 0x0 + // 0x001A: ff040113 addi sp, s0, -0x10 + // 0x001E: 60a2 ld ra, 0x8(sp) + // 0x0020: 6402 ld s0, 0x0(sp) + // 0x0022: 0141 addi sp, sp, 0x10 + // 0x0024: 8082 ret + uint8_t data[] = {// 0x0000: 1141 addi sp, sp, -0x10 + 0x41, 0x11, + // 0x0002: e406 sd ra, 0x8(sp) + 0x06, 0xE4, + // 0x0004: e022 sd s0, 0x0(sp) + 0x22, 0xE0, + // 0x0006: 0800 addi s0, sp, 0x10 + 0x00, 0x08, + // 0x0008: 00000537 lui a0, 0x0 + 0x37, 0x05, 0x00, 0x00, + // 0x000C: 00050513 mv a0, a0 + 0x13, 0x05, 0x05, 0x00, + // 0x0010: 00000097 auipc ra, 0x0 + 0x97, 0x00, 0x00, 0x00, + // 0x0014: 000080e7 jalr ra + 0xE7, 0x80, 0x00, 0x00, + // 0x0018: 4501 li a0, 0x0 + 0x01, 0x45, + // 0x001A: ff040113 addi sp, s0, -0x10 + 0x13, 0x01, 0x04, 0xFF, + // 0x001E: 60a2 ld ra, 0x8(sp) + 0xA2, 0x60, + // 0x0020: 6402 ld s0, 0x0(sp) + 0x02, 0x64, + // 0x0022: 0141 addi sp, sp, 0x10 + 0x41, 0x01, + // 0x0024: 8082 ret + 0x82, 0x80}; + + // Expected UnwindPlan (prologue only - emulation stops after frame setup): + // row[0]: 0: CFA=sp+0 => fp=<same> ra=<same> + // row[1]: 2: CFA=sp+16 => fp=<same> ra=<same> (after stack + // allocation) row[2]: 4: CFA=sp+16 => fp=<same> ra=[CFA-8] + // (after saving ra) row[3]: 6: CFA=sp+16 => fp=[CFA-16] ra=[CFA-8] + // (after saving s0/fp) row[4]: 8: CFA=s0+0 => fp=[CFA-16] ra=[CFA-8] + // (after setting frame pointer: s0=sp+16) + + const UnwindPlan::Row *row; + AddressRange sample_range; + UnwindPlan unwind_plan(eRegisterKindLLDB); + UnwindPlan::Row::AbstractRegisterLocation regloc; + sample_range = AddressRange(0x1000, sizeof(data)); + + EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly( + sample_range, data, sizeof(data), unwind_plan)); + + // CFA=sp+0
=> fp= ra=. + row = unwind_plan.GetRowForFunctionOffset(0); + EXPECT_EQ(0, row->GetOffset()); + EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_riscv); + EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true); + EXPECT_EQ(0, row->GetCFAValue().GetOffset()); + + EXPECT_TRUE(row->GetRegisterInfo(gpr_fp_riscv, regloc)); + EXPECT_TRUE(regloc.IsSame()); + + EXPECT_TRUE(row->GetRegisterInfo(gpr_ra_riscv, regloc)); + EXPECT_TRUE(regloc.IsSame()); + + // CFA=sp+16 => fp= ra=. + row = unwind_plan.GetRowForFunctionOffset(2); + EXPECT_EQ(2, row->GetOffset()); + EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_riscv); + EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true); + EXPECT_EQ(16, row->GetCFAValue().GetOffset()); + + EXPECT_TRUE(row->GetRegisterInfo(gpr_fp_riscv, regloc)); + EXPECT_TRUE(regloc.IsSame()); + + EXPECT_TRUE(row->GetRegisterInfo(gpr_ra_riscv, regloc)); + EXPECT_TRUE(regloc.IsSame()); + + // CFA=sp+16 => fp= ra=[CFA-8]. + row = unwind_plan.GetRowForFunctionOffset(4); + EXPECT_EQ(4, row->GetOffset()); + EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_riscv); + EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true); + EXPECT_EQ(16, row->GetCFAValue().GetOffset()); + + EXPECT_TRUE(row->GetRegisterInfo(gpr_fp_riscv, regloc)); + EXPECT_TRUE(regloc.IsSame()); + + EXPECT_TRUE(row->GetRegisterInfo(gpr_ra_riscv, regloc)); + EXPECT_TRUE(regloc.IsAtCFAPlusOffset()); + EXPECT_EQ(-8, regloc.GetOffset()); + + // CFA=sp+16 => fp=[CFA-16] ra=[CFA-8] + row = unwind_plan.GetRowForFunctionOffset(6); + EXPECT_EQ(6, row->GetOffset()); + EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_sp_riscv); + EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true); + EXPECT_EQ(16, row->GetCFAValue().GetOffset()); + + EXPECT_TRUE(row->GetRegisterInfo(gpr_fp_riscv, regloc)); + EXPECT_TRUE(regloc.IsAtCFAPlusOffset()); + EXPECT_EQ(-16, regloc.GetOffset()); + + EXPECT_TRUE(row->GetRegisterInfo(gpr_ra_riscv, regloc)); + EXPECT_TRUE(regloc.IsAtCFAPlusOffset()); + EXPECT_EQ(-8, regloc.GetOffset()); + + // CFA=s0+0 => fp=[CFA-16] ra=[CFA-8] + // s0 = sp + 16, so switching CFA to s0 does not change the effective + // locations. 
+ row = unwind_plan.GetRowForFunctionOffset(8); + EXPECT_EQ(8, row->GetOffset()); + EXPECT_TRUE(row->GetCFAValue().GetRegisterNumber() == gpr_fp_riscv); + EXPECT_TRUE(row->GetCFAValue().IsRegisterPlusOffset() == true); + EXPECT_EQ(0, row->GetCFAValue().GetOffset()); + + EXPECT_TRUE(row->GetRegisterInfo(gpr_fp_riscv, regloc)); + EXPECT_TRUE(regloc.IsAtCFAPlusOffset()); + EXPECT_EQ(-16, regloc.GetOffset()); + + EXPECT_TRUE(row->GetRegisterInfo(gpr_ra_riscv, regloc)); + EXPECT_TRUE(regloc.IsAtCFAPlusOffset()); + EXPECT_EQ(-8, regloc.GetOffset()); +} diff --git a/lldb/unittests/Platform/Android/AdbClientTest.cpp b/lldb/unittests/Platform/Android/AdbClientTest.cpp index 0808b96f69fc8..9b3a6fa9ceb33 100644 --- a/lldb/unittests/Platform/Android/AdbClientTest.cpp +++ b/lldb/unittests/Platform/Android/AdbClientTest.cpp @@ -6,8 +6,11 @@ // //===----------------------------------------------------------------------===// -#include "gtest/gtest.h" #include "Plugins/Platform/Android/AdbClient.h" +#include "lldb/Host/Socket.h" +#include "lldb/Host/common/TCPSocket.h" +#include "gtest/gtest.h" +#include #include static void set_env(const char *var, const char *value) { @@ -20,32 +23,121 @@ static void set_env(const char *var, const char *value) { using namespace lldb; using namespace lldb_private; - -namespace lldb_private { -namespace platform_android { +using namespace lldb_private::platform_android; class AdbClientTest : public ::testing::Test { public: - void SetUp() override { set_env("ANDROID_SERIAL", ""); } + void SetUp() override { + set_env("ANDROID_SERIAL", ""); + set_env("ANDROID_ADB_SERVER_PORT", ""); + } - void TearDown() override { set_env("ANDROID_SERIAL", ""); } + void TearDown() override { + set_env("ANDROID_SERIAL", ""); + set_env("ANDROID_ADB_SERVER_PORT", ""); + } }; -TEST(AdbClientTest, CreateByDeviceId) { - AdbClient adb; - Status error = AdbClient::CreateByDeviceID("device1", adb); - EXPECT_TRUE(error.Success()); - EXPECT_EQ("device1", adb.GetDeviceID()); +TEST_F(AdbClientTest, ResolveDeviceId_ExplicitDeviceId) { + auto result = AdbClient::ResolveDeviceID("device1"); + EXPECT_TRUE(static_cast(result)); + EXPECT_EQ("device1", *result); } -TEST(AdbClientTest, CreateByDeviceId_ByEnvVar) { +TEST_F(AdbClientTest, ResolveDeviceId_ByEnvVar) { set_env("ANDROID_SERIAL", "device2"); - AdbClient adb; - Status error = AdbClient::CreateByDeviceID("", adb); - EXPECT_TRUE(error.Success()); - EXPECT_EQ("device2", adb.GetDeviceID()); + auto result = AdbClient::ResolveDeviceID(""); + EXPECT_TRUE(static_cast(result)); + EXPECT_EQ("device2", *result); +} + +TEST_F(AdbClientTest, ResolveDeviceId_PrefersExplicitOverEnvVar) { + set_env("ANDROID_SERIAL", "env_device"); + + // Explicit device ID should take precedence over environment variable + auto result = AdbClient::ResolveDeviceID("explicit_device"); + EXPECT_TRUE(static_cast(result)); + EXPECT_EQ("explicit_device", *result); +} + +TEST_F(AdbClientTest, AdbClient_Constructor_StoresDeviceId) { + AdbClient client("test_device_123"); + EXPECT_EQ(client.GetDeviceID(), "test_device_123"); +} + +TEST_F(AdbClientTest, AdbClient_DefaultConstructor) { + AdbClient client; + EXPECT_EQ(client.GetDeviceID(), ""); } -} // end namespace platform_android -} // end namespace lldb_private +TEST_F(AdbClientTest, AdbSyncService_Constructor_StoresDeviceId) { + AdbSyncService sync("device123"); + EXPECT_EQ(sync.GetDeviceId(), "device123"); +} + +TEST_F(AdbClientTest, AdbSyncService_OperationsFailWhenNotConnected) { + AdbSyncService sync_service("test_device"); + + // Verify 
service is not connected initially + EXPECT_FALSE(sync_service.IsConnected()); + + // File operations should fail when not connected + FileSpec remote_file("/data/test.txt"); + FileSpec local_file("/tmp/test.txt"); + uint32_t mode, size, mtime; + + Status stat_result = sync_service.Stat(remote_file, mode, size, mtime); + EXPECT_TRUE(stat_result.Fail()); + + Status pull_result = sync_service.PullFile(remote_file, local_file); + EXPECT_TRUE(pull_result.Fail()); + + Status push_result = sync_service.PushFile(local_file, remote_file); + EXPECT_TRUE(push_result.Fail()); +} + +static uint16_t FindUnusedPort() { + auto temp_socket = std::make_unique<TCPSocket>(true); + Status error = temp_socket->Listen("localhost:0", 1); + if (error.Fail()) { + return 0; // fallback + } + uint16_t port = temp_socket->GetLocalPortNumber(); + temp_socket.reset(); // Close the socket to free the port + return port; +} + +#ifndef _WIN32 +// This test is disabled on Windows due to platform-specific socket behavior +// that causes assertion failures in TCPSocket::Listen() +TEST_F(AdbClientTest, RealTcpConnection) { + uint16_t unused_port = FindUnusedPort(); + ASSERT_NE(unused_port, 0) << "Failed to find an unused port"; + + std::string port_str = std::to_string(unused_port); + set_env("ANDROID_ADB_SERVER_PORT", port_str.c_str()); + + AdbClient client; + const auto status1 = client.Connect(); + EXPECT_FALSE(status1.Success()) + << "Connection should fail when no server is listening on port " + << unused_port; + + // now start a server on the port and try again + auto listen_socket = std::make_unique<TCPSocket>(true); + std::string listen_address = "localhost:" + port_str; + Status error = listen_socket->Listen(listen_address.c_str(), 5); + ASSERT_TRUE(error.Success()) << "Failed to create listening socket on port " + << unused_port << ": " << error.AsCString(); + + // Verify the socket is listening on the expected port + ASSERT_EQ(listen_socket->GetLocalPortNumber(), unused_port) + << "Socket is not listening on the expected port"; + + const auto status2 = client.Connect(); + EXPECT_TRUE(status2.Success()) + << "Connection should succeed when server is listening on port " + << unused_port; +} +#endif // _WIN32 diff --git a/lldb/unittests/Platform/Android/PlatformAndroidTest.cpp b/lldb/unittests/Platform/Android/PlatformAndroidTest.cpp index d021562d94d28..514bce1c71576 100644 --- a/lldb/unittests/Platform/Android/PlatformAndroidTest.cpp +++ b/lldb/unittests/Platform/Android/PlatformAndroidTest.cpp @@ -8,8 +8,6 @@ #include "Plugins/Platform/Android/PlatformAndroid.h" #include "Plugins/Platform/Android/PlatformAndroidRemoteGDBServer.h" -#include "TestingSupport/SubsystemRAII.h" -#include "TestingSupport/TestUtilities.h" #include "lldb/Utility/Connection.h" #include "gmock/gmock.h" @@ -20,212 +18,281 @@ using namespace testing; namespace { -class MockSyncService : public AdbClient::SyncService { -public: - MockSyncService() : SyncService(std::unique_ptr()) {} - - MOCK_METHOD2(PullFile, - Status(const FileSpec &remote_file, const FileSpec &local_file)); - MOCK_METHOD4(Stat, Status(const FileSpec &remote_file, uint32_t &mode, - uint32_t &size, uint32_t &mtime)); -}; - -typedef std::unique_ptr SyncServiceUP; - class MockAdbClient : public AdbClient { public: - explicit MockAdbClient() : AdbClient("mock") {} + explicit MockAdbClient() : AdbClient() {} MOCK_METHOD3(ShellToFile, Status(const char *command, std::chrono::milliseconds timeout, const FileSpec &output_file_spec)); - MOCK_METHOD1(GetSyncService, SyncServiceUP(Status &error)); }; class
PlatformAndroidTest : public PlatformAndroid, public ::testing::Test { public: PlatformAndroidTest() : PlatformAndroid(false) { m_remote_platform_sp = PlatformSP(new PlatformAndroidRemoteGDBServer()); + + // Set up default mock behavior to avoid uninteresting call warnings + ON_CALL(*this, GetSyncService(_)) + .WillByDefault([](Status &error) -> std::unique_ptr { + error = Status::FromErrorString("Sync service unavailable"); + return nullptr; + }); } MOCK_METHOD1(GetAdbClient, AdbClientUP(Status &error)); MOCK_METHOD0(GetPropertyPackageName, llvm::StringRef()); + MOCK_METHOD1(GetSyncService, std::unique_ptr(Status &error)); + + // Make GetSyncService public for testing + using PlatformAndroid::GetSyncService; }; } // namespace -TEST_F(PlatformAndroidTest, DownloadModuleSliceWithAdbClientError) { +TEST_F(PlatformAndroidTest, + DownloadModuleSlice_AdbClientError_FailsGracefully) { EXPECT_CALL(*this, GetAdbClient(_)) - .Times(1) .WillOnce(DoAll(WithArg<0>([](auto &arg) { arg = Status::FromErrorString( "Failed to create AdbClient"); }), Return(ByMove(AdbClientUP())))); - EXPECT_TRUE( - DownloadModuleSlice( - FileSpec("/system/app/Test/Test.apk!/lib/arm64-v8a/libtest.so"), 4096, - 3600, FileSpec()) - .Fail()); -} - -TEST_F(PlatformAndroidTest, DownloadModuleSliceWithNormalFile) { - auto sync_service = new MockSyncService(); - EXPECT_CALL(*sync_service, Stat(FileSpec("/system/lib64/libc.so"), _, _, _)) - .Times(1) - .WillOnce(DoAll(SetArgReferee<1>(1), Return(Status()))); - EXPECT_CALL(*sync_service, PullFile(FileSpec("/system/lib64/libc.so"), _)) - .Times(1) - .WillOnce(Return(Status())); - - auto adb_client = new MockAdbClient(); - EXPECT_CALL(*adb_client, GetSyncService(_)) - .Times(1) - .WillOnce(Return(ByMove(SyncServiceUP(sync_service)))); - - EXPECT_CALL(*this, GetAdbClient(_)) - .Times(1) - .WillOnce(Return(ByMove(AdbClientUP(adb_client)))); + Status result = DownloadModuleSlice( + FileSpec("/system/app/Test/Test.apk!/lib/arm64-v8a/libtest.so"), 4096, + 3600, FileSpec("/tmp/libtest.so")); - EXPECT_TRUE( - DownloadModuleSlice(FileSpec("/system/lib64/libc.so"), 0, 0, FileSpec()) - .Success()); + EXPECT_TRUE(result.Fail()); + EXPECT_THAT(result.AsCString(), HasSubstr("Failed to create AdbClient")); } -TEST_F(PlatformAndroidTest, DownloadModuleSliceWithZipFile) { - auto adb_client = new MockAdbClient(); +TEST_F(PlatformAndroidTest, DownloadModuleSlice_ZipFile_UsesCorrectDdCommand) { + auto *adb_client = new MockAdbClient(); EXPECT_CALL(*adb_client, ShellToFile(StrEq("dd if='/system/app/Test/Test.apk' " "iflag=skip_bytes,count_bytes " "skip=4096 count=3600 status=none"), _, _)) - .Times(1) .WillOnce(Return(Status())); + EXPECT_CALL(*this, GetPropertyPackageName()) + .WillOnce(Return(llvm::StringRef(""))); + EXPECT_CALL(*this, GetAdbClient(_)) - .Times(1) .WillOnce(Return(ByMove(AdbClientUP(adb_client)))); - EXPECT_TRUE( - DownloadModuleSlice( - FileSpec("/system/app/Test/Test.apk!/lib/arm64-v8a/libtest.so"), 4096, - 3600, FileSpec()) - .Success()); + Status result = DownloadModuleSlice( + FileSpec("/system/app/Test/Test.apk!/lib/arm64-v8a/libtest.so"), 4096, + 3600, FileSpec("/tmp/libtest.so")); + + EXPECT_TRUE(result.Success()); } -TEST_F(PlatformAndroidTest, DownloadModuleSliceWithZipFileAndRunAs) { - auto adb_client = new MockAdbClient(); +TEST_F(PlatformAndroidTest, + DownloadModuleSlice_ZipFileWithRunAs_UsesRunAsCommand) { + auto *adb_client = new MockAdbClient(); EXPECT_CALL(*adb_client, ShellToFile(StrEq("run-as 'com.example.test' " "dd if='/system/app/Test/Test.apk' " 
"iflag=skip_bytes,count_bytes " "skip=4096 count=3600 status=none"), _, _)) - .Times(1) .WillOnce(Return(Status())); EXPECT_CALL(*this, GetPropertyPackageName()) - .Times(1) .WillOnce(Return(llvm::StringRef("com.example.test"))); EXPECT_CALL(*this, GetAdbClient(_)) - .Times(1) .WillOnce(Return(ByMove(AdbClientUP(adb_client)))); - EXPECT_TRUE( - DownloadModuleSlice( - FileSpec("/system/app/Test/Test.apk!/lib/arm64-v8a/libtest.so"), 4096, - 3600, FileSpec()) - .Success()); + Status result = DownloadModuleSlice( + FileSpec("/system/app/Test/Test.apk!/lib/arm64-v8a/libtest.so"), 4096, + 3600, FileSpec("/tmp/libtest.so")); + + EXPECT_TRUE(result.Success()); } -TEST_F(PlatformAndroidTest, GetFileWithNormalFile) { - auto sync_service = new MockSyncService(); - EXPECT_CALL(*sync_service, Stat(FileSpec("/data/local/tmp/test"), _, _, _)) - .Times(1) - .WillOnce(DoAll(SetArgReferee<1>(1), Return(Status()))); - EXPECT_CALL(*sync_service, PullFile(FileSpec("/data/local/tmp/test"), _)) - .Times(1) +TEST_F(PlatformAndroidTest, + DownloadModuleSlice_LargeFile_CalculatesParametersCorrectly) { + const uint64_t large_offset = 100 * 1024 * 1024; // 100MB offset + const uint64_t large_size = 50 * 1024 * 1024; // 50MB size + + auto *adb_client = new MockAdbClient(); + EXPECT_CALL(*adb_client, + ShellToFile(StrEq("dd if='/system/app/Large.apk' " + "iflag=skip_bytes,count_bytes " + "skip=104857600 count=52428800 status=none"), + _, _)) .WillOnce(Return(Status())); - auto adb_client = new MockAdbClient(); - EXPECT_CALL(*adb_client, GetSyncService(_)) - .Times(1) - .WillOnce(Return(ByMove(SyncServiceUP(sync_service)))); + EXPECT_CALL(*this, GetPropertyPackageName()) + .WillOnce(Return(llvm::StringRef(""))); EXPECT_CALL(*this, GetAdbClient(_)) - .Times(1) .WillOnce(Return(ByMove(AdbClientUP(adb_client)))); - EXPECT_TRUE(GetFile(FileSpec("/data/local/tmp/test"), FileSpec()).Success()); + Status result = DownloadModuleSlice( + FileSpec("/system/app/Large.apk!/lib/arm64-v8a/large.so"), large_offset, + large_size, FileSpec("/tmp/large.so")); + + EXPECT_TRUE(result.Success()); } -TEST_F(PlatformAndroidTest, GetFileWithCatFallback) { - auto sync_service = new MockSyncService(); - EXPECT_CALL( - *sync_service, - Stat(FileSpec("/data/data/com.example.app/lib-main/libtest.so"), _, _, _)) - .Times(1) - .WillOnce(DoAll(SetArgReferee<1>(0), Return(Status()))); +TEST_F(PlatformAndroidTest, + GetFile_SyncServiceUnavailable_FallsBackToShellCat) { + auto *adb_client = new MockAdbClient(); + EXPECT_CALL(*adb_client, + ShellToFile(StrEq("cat '/data/local/tmp/test'"), _, _)) + .WillOnce(Return(Status())); - auto adb_client0 = new MockAdbClient(); - EXPECT_CALL(*adb_client0, GetSyncService(_)) - .Times(1) - .WillOnce(Return(ByMove(SyncServiceUP(sync_service)))); + EXPECT_CALL(*this, GetPropertyPackageName()) + .WillOnce(Return(llvm::StringRef(""))); + + EXPECT_CALL(*this, GetAdbClient(_)) + .WillOnce(DoAll(WithArg<0>([](auto &arg) { arg.Clear(); }), + Return(ByMove(AdbClientUP(adb_client))))); + + EXPECT_CALL(*this, GetSyncService(_)) + .WillOnce([](Status &error) -> std::unique_ptr { + error = Status::FromErrorString("Sync service unavailable"); + return nullptr; + }); + + Status result = + GetFile(FileSpec("/data/local/tmp/test"), FileSpec("/tmp/test")); + EXPECT_TRUE(result.Success()); +} - auto adb_client1 = new MockAdbClient(); +TEST_F(PlatformAndroidTest, GetFile_WithRunAs_UsesRunAsInShellCommand) { + auto *adb_client = new MockAdbClient(); EXPECT_CALL( - *adb_client1, - ShellToFile(StrEq("cat 
'/data/data/com.example.app/lib-main/libtest.so'"), + *adb_client, + ShellToFile(StrEq("run-as 'com.example.app' " + "cat '/data/data/com.example.app/lib-main/libtest.so'"), _, _)) - .Times(1) .WillOnce(Return(Status())); + EXPECT_CALL(*this, GetPropertyPackageName()) + .WillOnce(Return(llvm::StringRef("com.example.app"))); + EXPECT_CALL(*this, GetAdbClient(_)) - .Times(2) - .WillOnce(Return(ByMove(AdbClientUP(adb_client0)))) - .WillOnce(Return(ByMove(AdbClientUP(adb_client1)))); + .WillOnce(DoAll(WithArg<0>([](auto &arg) { arg.Clear(); }), + Return(ByMove(AdbClientUP(adb_client))))); + + EXPECT_CALL(*this, GetSyncService(_)) + .WillOnce([](Status &error) -> std::unique_ptr { + error = Status::FromErrorString("Sync service unavailable"); + return nullptr; + }); - EXPECT_TRUE( + Status result = GetFile(FileSpec("/data/data/com.example.app/lib-main/libtest.so"), - FileSpec()) - .Success()); + FileSpec("/tmp/libtest.so")); + EXPECT_TRUE(result.Success()); } -TEST_F(PlatformAndroidTest, GetFileWithCatFallbackAndRunAs) { - auto sync_service = new MockSyncService(); - EXPECT_CALL( - *sync_service, - Stat(FileSpec("/data/data/com.example.app/lib-main/libtest.so"), _, _, _)) - .Times(1) - .WillOnce(DoAll(SetArgReferee<1>(0), Return(Status()))); +TEST_F(PlatformAndroidTest, GetFile_FilenameWithSingleQuotes_Rejected) { + EXPECT_CALL(*this, GetSyncService(_)) + .WillOnce([](Status &error) -> std::unique_ptr { + error = Status::FromErrorString("Sync service unavailable"); + return nullptr; + }); - auto adb_client0 = new MockAdbClient(); - EXPECT_CALL(*adb_client0, GetSyncService(_)) - .Times(1) - .WillOnce(Return(ByMove(SyncServiceUP(sync_service)))); + Status result = + GetFile(FileSpec("/test/file'with'quotes"), FileSpec("/tmp/output")); - auto adb_client1 = new MockAdbClient(); - EXPECT_CALL( - *adb_client1, - ShellToFile(StrEq("run-as 'com.example.app' " - "cat '/data/data/com.example.app/lib-main/libtest.so'"), - _, _)) - .Times(1) + EXPECT_TRUE(result.Fail()); + EXPECT_THAT(result.AsCString(), HasSubstr("single-quotes")); +} + +TEST_F(PlatformAndroidTest, + DownloadModuleSlice_FilenameWithSingleQuotes_Rejected) { + Status result = DownloadModuleSlice(FileSpec("/test/file'with'quotes"), 100, + 200, FileSpec("/tmp/output")); + + EXPECT_TRUE(result.Fail()); + EXPECT_THAT(result.AsCString(), HasSubstr("single-quotes")); +} + +TEST_F(PlatformAndroidTest, GetFile_NetworkTimeout_PropagatesErrorCorrectly) { + auto *adb_client = new MockAdbClient(); + EXPECT_CALL(*adb_client, ShellToFile(_, _, _)) + .WillOnce(Return(Status::FromErrorString("Network timeout"))); + + EXPECT_CALL(*this, GetPropertyPackageName()) + .WillOnce(Return(llvm::StringRef(""))); + + EXPECT_CALL(*this, GetAdbClient(_)) + .WillOnce(DoAll(WithArg<0>([](auto &arg) { arg.Clear(); }), + Return(ByMove(AdbClientUP(adb_client))))); + + EXPECT_CALL(*this, GetSyncService(_)) + .WillOnce([](Status &error) -> std::unique_ptr { + error = Status::FromErrorString("Sync service unavailable"); + return nullptr; + }); + + Status result = + GetFile(FileSpec("/data/large/file.so"), FileSpec("/tmp/large.so")); + EXPECT_TRUE(result.Fail()); + EXPECT_THAT(result.AsCString(), HasSubstr("Network timeout")); +} + +TEST_F(PlatformAndroidTest, SyncService_ConnectionFailsGracefully) { + // Constructor should succeed even with a failing connection + AdbSyncService sync_service("test-device"); + + // The service should report as not connected initially + EXPECT_FALSE(sync_service.IsConnected()); + EXPECT_EQ(sync_service.GetDeviceId(), "test-device"); + + // 
Operations should fail gracefully when connection setup fails + FileSpec remote_file("/data/test.txt"); + FileSpec local_file("/tmp/test.txt"); + uint32_t mode, size, mtime; + + Status result = sync_service.Stat(remote_file, mode, size, mtime); + EXPECT_TRUE(result.Fail()); +} + +TEST_F(PlatformAndroidTest, GetRunAs_FormatsPackageNameCorrectly) { + // Empty package name + EXPECT_CALL(*this, GetPropertyPackageName()) + .WillOnce(Return(llvm::StringRef(""))); + EXPECT_EQ(this->GetRunAs(), ""); + + // Valid package name + EXPECT_CALL(*this, GetPropertyPackageName()) + .WillOnce(Return(llvm::StringRef("com.example.test"))); + EXPECT_EQ(this->GetRunAs(), "run-as 'com.example.test' "); +} + +TEST_F(PlatformAndroidTest, + DownloadModuleSlice_ZeroOffset_CallsGetFileInsteadOfDd) { + // When offset=0, DownloadModuleSlice calls GetFile which uses 'cat', not 'dd' + // We need to ensure the sync service fails so GetFile falls back to shell cat + auto *adb_client = new MockAdbClient(); + EXPECT_CALL(*adb_client, + ShellToFile(StrEq("cat '/system/lib64/libc.so'"), _, _)) .WillOnce(Return(Status())); EXPECT_CALL(*this, GetPropertyPackageName()) - .Times(1) - .WillOnce(Return(llvm::StringRef("com.example.app"))); + .WillOnce(Return(llvm::StringRef(""))); EXPECT_CALL(*this, GetAdbClient(_)) - .Times(2) - .WillOnce(Return(ByMove(AdbClientUP(adb_client0)))) - .WillOnce(Return(ByMove(AdbClientUP(adb_client1)))); + .WillOnce(DoAll(WithArg<0>([](auto &arg) { arg.Clear(); }), + Return(ByMove(AdbClientUP(adb_client))))); - EXPECT_TRUE( - GetFile(FileSpec("/data/data/com.example.app/lib-main/libtest.so"), - FileSpec()) - .Success()); + // Mock GetSyncService to fail, forcing GetFile to use shell cat fallback + EXPECT_CALL(*this, GetSyncService(_)) + .WillOnce(DoAll(WithArg<0>([](auto &arg) { + arg = + Status::FromErrorString("Sync service unavailable"); + }), + Return(ByMove(std::unique_ptr())))); + + Status result = DownloadModuleSlice(FileSpec("/system/lib64/libc.so"), 0, 0, + FileSpec("/tmp/libc.so")); + EXPECT_TRUE(result.Success()); } diff --git a/lldb/unittests/Symbol/TestDWARFCallFrameInfo.cpp b/lldb/unittests/Symbol/TestDWARFCallFrameInfo.cpp index e113b8ca99341..c52e9a7387e14 100644 --- a/lldb/unittests/Symbol/TestDWARFCallFrameInfo.cpp +++ b/lldb/unittests/Symbol/TestDWARFCallFrameInfo.cpp @@ -380,3 +380,288 @@ void DWARFCallFrameInfoTest::TestValOffset(DWARFCallFrameInfo::Type type, TEST_F(DWARFCallFrameInfoTest, ValOffset_dwarf3) { TestValOffset(DWARFCallFrameInfo::DWARF, "debug_frame3"); } + +// Test that we correctly handle invalid FDE entries that have CIE ID values +TEST_F(DWARFCallFrameInfoTest, InvalidFDEWithCIEID_dwarf32) { + // Create an FDE with cie_offset of 0xFFFFFFFF (DW_CIE_ID) which is invalid + auto ExpectedFile = TestFile::fromYaml(R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000000260 + AddressAlign: 0x0000000000000010 + Content: 554889E5897DFC8B45FC5DC3 + - Name: .debug_frame + Type: SHT_PROGBITS + AddressAlign: 0x0000000000000008 + # First, a valid CIE + # 00000000 0000000000000014 ffffffff CIE + # Version: 3 + # Augmentation: "" + # Code alignment factor: 1 + # Data alignment factor: -8 + # Return address column: 16 + Content: 14000000FFFFFFFF03000178100C0708900100000000000018000000FFFFFFFF60020000000000000C00000000000000 + # Then an invalid FDE with CIE pointer = 0xFFFFFFFF (which would make it look like a CIE) + # 
00000018 0000000000000018 ffffffff FDE cie=ffffffff pc=0000000000000260..000000000000026c + # The cie offset of 0xFFFFFFFF is invalid for an FDE in debug_frame +Symbols: + - Name: test_invalid + Type: STT_FUNC + Section: .text + Value: 0x0000000000000260 + Size: 0x000000000000000C + Binding: STB_GLOBAL +... +)"); + ASSERT_THAT_EXPECTED(ExpectedFile, llvm::Succeeded()); + + auto module_sp = std::make_shared(ExpectedFile->moduleSpec()); + SectionList *list = module_sp->GetSectionList(); + ASSERT_NE(nullptr, list); + + auto section_sp = list->FindSectionByType(eSectionTypeDWARFDebugFrame, false); + ASSERT_NE(nullptr, section_sp); + + DWARFCallFrameInfo cfi(*module_sp->GetObjectFile(), section_sp, + DWARFCallFrameInfo::DWARF); + + // This should trigger our assertion or return nullptr because the FDE is + // invalid + const Symbol *sym = module_sp->FindFirstSymbolWithNameAndType( + ConstString("test_invalid"), eSymbolTypeAny); + ASSERT_NE(nullptr, sym); + + std::unique_ptr plan_up = cfi.GetUnwindPlan(sym->GetAddress()); + // The plan should be null because we have an invalid FDE + EXPECT_EQ(nullptr, plan_up); +} + +// Test that we correctly handle invalid FDE entries that have CIE ID values +TEST_F(DWARFCallFrameInfoTest, InvalidFDEWithCIEID_dwarf64) { + // Create an FDE with cie_offset of 0xFFFFFFFFFFFFFFFF (DW64_CIE_ID) which is + // invalid + auto ExpectedFile = TestFile::fromYaml(R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000000260 + AddressAlign: 0x0000000000000010 + Content: 554889E5897DFC8B45FC5DC3 + - Name: .debug_frame + Type: SHT_PROGBITS + AddressAlign: 0x0000000000000008 + # DWARF64 format CIE + # Initial length: 0xFFFFFFFF followed by 64-bit length + # 00000000 ffffffff 0000000000000014 ffffffffffffffff CIE + Content: FFFFFFFF1400000000000000FFFFFFFFFFFFFFFF03000178100C0708900100000000FFFFFFFF1800000000000000FFFFFFFFFFFFFFFF60020000000000000C00000000000000 + # DWARF64 FDE with invalid CIE pointer = 0xFFFFFFFFFFFFFFFF + # Initial length: 0xFFFFFFFF, followed by 64-bit length (0x18) + # Then 64-bit CIE pointer: 0xFFFFFFFFFFFFFFFF (which is DW64_CIE_ID, invalid for FDE) +Symbols: + - Name: test_invalid64 + Type: STT_FUNC + Section: .text + Value: 0x0000000000000260 + Size: 0x000000000000000C + Binding: STB_GLOBAL +... 
+)"); + ASSERT_THAT_EXPECTED(ExpectedFile, llvm::Succeeded()); + + auto module_sp = std::make_shared(ExpectedFile->moduleSpec()); + SectionList *list = module_sp->GetSectionList(); + ASSERT_NE(nullptr, list); + + auto section_sp = list->FindSectionByType(eSectionTypeDWARFDebugFrame, false); + ASSERT_NE(nullptr, section_sp); + + DWARFCallFrameInfo cfi(*module_sp->GetObjectFile(), section_sp, + DWARFCallFrameInfo::DWARF); + + const Symbol *sym = module_sp->FindFirstSymbolWithNameAndType( + ConstString("test_invalid64"), eSymbolTypeAny); + ASSERT_NE(nullptr, sym); + + std::unique_ptr plan_up = cfi.GetUnwindPlan(sym->GetAddress()); + // The plan should be null because we have an invalid FDE + EXPECT_EQ(nullptr, plan_up); +} + +// Test valid CIE markers in eh_frame format +TEST_F(DWARFCallFrameInfoTest, ValidCIEMarkers_eh_frame) { + auto ExpectedFile = TestFile::fromYaml(R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 + Entry: 0x0000000000000260 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000000260 + AddressAlign: 0x0000000000000010 + Content: 554889E5897DFC8B45FC5DC3 + - Name: .eh_frame + Type: SHT_X86_64_UNWIND + Flags: [ SHF_ALLOC ] + Address: 0x0000000000000290 + AddressAlign: 0x0000000000000008 + # eh_frame content + # CIE + FDE that works with address 0x260 + Content: 1400000000000000017A5200017810011B0C0708900100001C0000001C000000B0FFFFFF0C00000000410E108602430D0600000000000000 +Symbols: + - Name: simple_function + Type: STT_FUNC + Section: .text + Value: 0x0000000000000260 + Size: 0x000000000000000F + Binding: STB_GLOBAL +... +)"); + ASSERT_THAT_EXPECTED(ExpectedFile, llvm::Succeeded()); + + auto module_sp = std::make_shared(ExpectedFile->moduleSpec()); + SectionList *list = module_sp->GetSectionList(); + ASSERT_NE(nullptr, list); + + auto section_sp = list->FindSectionByType(eSectionTypeEHFrame, false); + ASSERT_NE(nullptr, section_sp); + + DWARFCallFrameInfo cfi(*module_sp->GetObjectFile(), section_sp, + DWARFCallFrameInfo::EH); + + const Symbol *sym = module_sp->FindFirstSymbolWithNameAndType( + ConstString("simple_function"), eSymbolTypeAny); + ASSERT_NE(nullptr, sym); + + std::unique_ptr plan_up = cfi.GetUnwindPlan(sym->GetAddress()); + // Should succeed with valid CIE and FDE + ASSERT_NE(nullptr, plan_up); + EXPECT_GE(plan_up->GetRowCount(), 1); +} + +// Test valid CIE markers in debug_frame DWARF32 format +TEST_F(DWARFCallFrameInfoTest, ValidCIEMarkers_dwarf32) { + auto ExpectedFile = TestFile::fromYaml(R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000001130 + AddressAlign: 0x0000000000000010 + Content: 554889E5897DFC8B45FC83C0015DC3 + - Name: .debug_frame + Type: SHT_PROGBITS + AddressAlign: 0x0000000000000008 + # debug_frame content in DWARF32 format + # CIE (length=0x14, CIE_id=0xFFFFFFFF, version=4) + # FDE (length=0x24, CIE_offset=0) + Content: 14000000FFFFFFFF040008000178100C0708900100000000240000000000000030110000000000000F00000000000000410E108602430D064A0C070800000000 +Symbols: + - Name: simple_function + Type: STT_FUNC + Section: .text + Value: 0x0000000000001130 + Size: 0x000000000000000F + Binding: STB_GLOBAL +... 
+)"); + ASSERT_THAT_EXPECTED(ExpectedFile, llvm::Succeeded()); + + auto module_sp = std::make_shared(ExpectedFile->moduleSpec()); + SectionList *list = module_sp->GetSectionList(); + ASSERT_NE(nullptr, list); + + auto section_sp = list->FindSectionByType(eSectionTypeDWARFDebugFrame, false); + ASSERT_NE(nullptr, section_sp); + + DWARFCallFrameInfo cfi(*module_sp->GetObjectFile(), section_sp, + DWARFCallFrameInfo::DWARF); + + const Symbol *sym = module_sp->FindFirstSymbolWithNameAndType( + ConstString("simple_function"), eSymbolTypeAny); + ASSERT_NE(nullptr, sym); + + std::unique_ptr plan_up = cfi.GetUnwindPlan(sym->GetAddress()); + // Should succeed with valid CIE and FDE + ASSERT_NE(nullptr, plan_up); + EXPECT_GE(plan_up->GetRowCount(), 1); +} + +// Test valid CIE markers in debug_frame DWARF64 format +TEST_F(DWARFCallFrameInfoTest, ValidCIEMarkers_dwarf64) { + auto ExpectedFile = TestFile::fromYaml(R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000001130 + AddressAlign: 0x0000000000000010 + Content: 554889E5897DFC8B45FC83C0015DC3 + - Name: .debug_frame + Type: SHT_PROGBITS + AddressAlign: 0x0000000000000008 + # debug_frame content in DWARF64 format + # CIE: length_marker=0xFFFFFFFF, length=0x14, CIE_id=0xFFFFFFFFFFFFFFFF, version=4 + # FDE: length_marker=0xFFFFFFFF, length=0x24, CIE_offset=0x0 (points to CIE) + Content: FFFFFFFF1400000000000000FFFFFFFFFFFFFFFF040008000178100C07089001FFFFFFFF2400000000000000000000000000000030110000000000000F00000000000000410E108602430D064A0C0708 +Symbols: + - Name: simple_function + Type: STT_FUNC + Section: .text + Value: 0x0000000000001130 + Size: 0x000000000000000F + Binding: STB_GLOBAL +... 
+)"); + ASSERT_THAT_EXPECTED(ExpectedFile, llvm::Succeeded()); + + auto module_sp = std::make_shared(ExpectedFile->moduleSpec()); + SectionList *list = module_sp->GetSectionList(); + ASSERT_NE(nullptr, list); + + auto section_sp = list->FindSectionByType(eSectionTypeDWARFDebugFrame, false); + ASSERT_NE(nullptr, section_sp); + + DWARFCallFrameInfo cfi(*module_sp->GetObjectFile(), section_sp, + DWARFCallFrameInfo::DWARF); + + const Symbol *sym = module_sp->FindFirstSymbolWithNameAndType( + ConstString("simple_function"), eSymbolTypeAny); + ASSERT_NE(nullptr, sym); + + std::unique_ptr plan_up = cfi.GetUnwindPlan(sym->GetAddress()); + // Should succeed with valid CIE and FDE + ASSERT_NE(nullptr, plan_up); + EXPECT_GE(plan_up->GetRowCount(), 1); +} diff --git a/lldb/unittests/Utility/ScalarTest.cpp b/lldb/unittests/Utility/ScalarTest.cpp index 6d5caef42bee4..869a5809e6d14 100644 --- a/lldb/unittests/Utility/ScalarTest.cpp +++ b/lldb/unittests/Utility/ScalarTest.cpp @@ -118,11 +118,17 @@ TEST(ScalarTest, RightShiftOperator) { int a = 0x00001000; int b = 0xFFFFFFFF; int c = 4; + unsigned d = 0xFFFFFFFF; + unsigned short e = 0xFFFF; Scalar a_scalar(a); Scalar b_scalar(b); Scalar c_scalar(c); + Scalar d_scalar(d); + Scalar e_scalar(e); ASSERT_EQ(a >> c, a_scalar >> c_scalar); ASSERT_EQ(b >> c, b_scalar >> c_scalar); + ASSERT_EQ(d >> c, d_scalar >> c_scalar); + ASSERT_EQ(e >> c, e_scalar >> c_scalar); } TEST(ScalarTest, GetBytes) { diff --git a/lldb/utils/lldb-dotest/CMakeLists.txt b/lldb/utils/lldb-dotest/CMakeLists.txt index 3b8c88b6dc78c..f3f75015637f4 100644 --- a/lldb/utils/lldb-dotest/CMakeLists.txt +++ b/lldb/utils/lldb-dotest/CMakeLists.txt @@ -15,9 +15,14 @@ llvm_canonicalize_cmake_booleans( if ("libcxx" IN_LIST LLVM_ENABLE_RUNTIMES) set(LLDB_HAS_LIBCXX ON) if(LLVM_ENABLE_PER_TARGET_RUNTIME_DIR AND NOT APPLE) - set(LIBCXX_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}/${LLVM_DEFAULT_TARGET_TRIPLE}) + set(LIBCXX_TARGET_SUBDIR ${LLVM_DEFAULT_TARGET_TRIPLE}) + if(LIBCXX_LIBDIR_SUBDIR) + string(APPEND LIBCXX_TARGET_SUBDIR /${LIBCXX_LIBDIR_SUBDIR}) + endif() + cmake_path(NORMAL_PATH LIBCXX_TARGET_SUBDIR) + set(LIBCXX_LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}/${LIBCXX_TARGET_SUBDIR}) set(LIBCXX_GENERATED_INCLUDE_DIR "${LLVM_BINARY_DIR}/include/c++/v1") - set(LIBCXX_GENERATED_INCLUDE_TARGET_DIR "${LLVM_BINARY_DIR}/include/${LLVM_DEFAULT_TARGET_TRIPLE}/c++/v1") + set(LIBCXX_GENERATED_INCLUDE_TARGET_DIR "${LLVM_BINARY_DIR}/include/${LIBCXX_TARGET_SUBDIR}/c++/v1") else() set(LIBCXX_LIBRARY_DIR ${CMAKE_BINARY_DIR}/lib${LIBCXX_LIBDIR_SUFFIX}) set(LIBCXX_GENERATED_INCLUDE_DIR "${CMAKE_BINARY_DIR}/include/c++/v1") diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index 0c976048a8d26..5afdd1519c96f 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -87,7 +87,9 @@ quentin.colombet@gmail.com (email), [qcolombet](https://github.com/qcolombet) (G #### LoopVectorize Florian Hahn \ -flo@fhahn.com (email), [fhahn](https://github.com/fhahn) (GitHub) +flo@fhahn.com (email), [fhahn](https://github.com/fhahn) (GitHub) \ +David Sherwood \ +david.sherwood@arm.com (email), [david-arm](https://github.com/david-arm) (GitHub) #### MemorySSA diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt index 57d49ed153749..3cbfb0d44e5a3 100644 --- a/llvm/benchmarks/CMakeLists.txt +++ b/llvm/benchmarks/CMakeLists.txt @@ -10,6 +10,7 @@ add_benchmark(GetIntrinsicForClangBuiltin GetIntrinsicForClangBuiltin.cpp PARTIA add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED) 
add_benchmark(GetIntrinsicInfoTableEntriesBM GetIntrinsicInfoTableEntriesBM.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(SandboxIRBench SandboxIRBench.cpp PARTIAL_SOURCES_INTENDED) +add_benchmark(MustacheBench Mustache.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(RuntimeLibcallsBench RuntimeLibcalls.cpp PARTIAL_SOURCES_INTENDED) diff --git a/llvm/benchmarks/Mustache.cpp b/llvm/benchmarks/Mustache.cpp new file mode 100644 index 0000000000000..6d24f5442e274 --- /dev/null +++ b/llvm/benchmarks/Mustache.cpp @@ -0,0 +1,256 @@ +#include "llvm/Support/Mustache.h" +#include "benchmark/benchmark.h" +#include "llvm/Support/JSON.h" +#include "llvm/Support/raw_ostream.h" +#include + +// A large, raw string with many characters that require HTML escaping. +static const std::string LongHtmlString = [] { + std::string S; + S.reserve(500000); + for (int i = 0; i < 50000; ++i) { + S += ""; + } + return S; +}(); + +// A deep AND wide JSON object for testing traversal. +static const llvm::json::Value DeepJsonData = [] { + llvm::json::Value Root = llvm::json::Object(); + llvm::json::Object *Current = Root.getAsObject(); + for (int i = 0; i < 50; ++i) { // 50 levels deep + for (int j = 0; j < 100; ++j) { + (*Current)["sibling_" + std::to_string(j)] = llvm::json::Value("noise"); + } + std::string Key = "level_" + std::to_string(i); + (*Current)[Key] = llvm::json::Object(); + Current = (*Current)[Key].getAsObject(); + } + (*Current)["final_value"] = llvm::json::Value("Success!"); + + llvm::json::Array Arr; + for (int i = 0; i < 5000; ++i) { // 5,000 iterations + Arr.push_back(llvm::json::Value(i)); + } + + llvm::json::Object NewRoot; + NewRoot["deep_data"] = std::move(Root); + NewRoot["loop_array"] = std::move(Arr); + return llvm::json::Value(std::move(NewRoot)); +}(); + +// A huge array for testing iteration performance. +static const llvm::json::Value HugeArrayData = [] { + llvm::json::Array Arr; + for (int i = 0; i < 100000; ++i) { // 100,000 array items + Arr.push_back(llvm::json::Object( + {{"id", llvm::json::Value(static_cast(i))}, + {"is_even", llvm::json::Value(i % 2 == 0)}, + {"data", llvm::json::Value("Item data for " + std::to_string(i))}})); + } + return llvm::json::Object({{"items", std::move(Arr)}}); +}(); + +// The main template that includes a partial within a loop. +static const std::string ComplexPartialTemplate = + "Header\n" + "{{#items}}{{> item_partial}}{{/items}}\n" + "Footer"; + +// The partial template is now more complex, rendering multiple fields and a +// conditional section. +static const std::string ItemPartialTemplate = + "
\n" + "

{{data}}

\n" + " {{#is_even}}(Even){{/is_even}}\n" + "
\n"; + +// A single large string to stress the output buffer. +static const llvm::json::Value LargeOutputData = llvm::json::Object({ + {"long_string", + llvm::json::Value(std::string(1024 * 1024, 'A'))} // 1MB string +}); + +// --- Static Data (Templates) --- + +static const std::string BulkEscapingTemplate = "{{content}}"; +static const std::string BulkUnescapedTemplate = "{{{content}}}"; +static const std::string BulkUnescapedAmpersandTemplate = "{{& content}}"; + +static const std::string DeepTraversalTemplate = [] { + std::string LongKey = + "deep_data.level_0.level_1.level_2.level_3.level_4.level_5." + "level_6.level_7.level_8.level_9." + "level_10.level_11.level_12.level_13.level_14.level_" + "15.level_16.level_17.level_18.level_19." + "level_20.level_21.level_22.level_23.level_24.level_" + "25.level_26.level_27.level_28.level_29." + "level_30.level_31.level_32.level_33.level_34.level_" + "35.level_36.level_37.level_38.level_39." + "level_40.level_41.level_42.level_43.level_44.level_" + "45.level_46.level_47.level_48.level_49.final_value"; + return "{{#loop_array}}{{" + LongKey + "}}{{/loop_array}}"; +}(); + +static const std::string DeeplyNestedRenderingTemplate = [] { + std::string NestedTemplate = "{{#deep_data}}"; + for (int i = 0; i < 50; ++i) { + NestedTemplate += "{{#level_" + std::to_string(i) + "}}"; + } + NestedTemplate += "{{final_value}}"; + for (int i = 49; i >= 0; --i) { + NestedTemplate += "{{/level_" + std::to_string(i) + "}}"; + } + NestedTemplate += "{{/deep_data}}"; + return NestedTemplate; +}(); + +static const std::string HugeArrayIterationTemplate = + "{{#items}}ID: {{id}}.{{/items}}"; + +static const std::string ComplexTemplateParsingTemplate = [] { + std::string LargeTemplate; + LargeTemplate.reserve(100000); + for (int i = 0; i < 1000; ++i) { + LargeTemplate += "{{var_" + std::to_string(i) + + "}}" + "{{#section_" + + std::to_string(i) + "}}Content{{/section_" + + std::to_string(i) + + "}}" + "{{!comment_" + + std::to_string(i) + + "}}" + "{{=<% %>=}}" + "<%var_tag_changed_to_percent_sign_" + + std::to_string(i) + + "%>" + "<%={{ }}=%>" + "{{^inverted_" + + std::to_string(i) + "}}Not Present{{/inverted_" + + std::to_string(i) + "}}"; + } + return LargeTemplate; +}(); + +static const std::string SmallTemplateParsingTemplate = + "{{level_0.sibling_99}}\n" + "{{level_0.level_1.level_2.level_3.level_4.level_5.sibling_50}}\n" + "{{level_0.level_1.level_2.level_3.level_4.level_5." + "level_6.level_7.level_8.level_9." + "level_10.level_11.level_12.level_13.level_14.level_" + "15.level_16.level_17.level_18.level_19." + "level_20.level_21.level_22.level_23.level_24.level_" + "25.level_26.level_27.level_28.level_29." + "level_30.level_31.level_32.level_33.level_34.level_" + "35.level_36.level_37.level_38.level_39." + "level_40.level_41.level_42.level_43.level_44.level_" + "45.level_46.level_47.level_48.level_49.final_value}}\n"; + +static const std::string LargeOutputStringTemplate = "{{long_string}}"; + +// Tests the performance of rendering a large string with various escaping +// syntaxes. 
+static void BM_Mustache_StringRendering(benchmark::State &state, + const std::string &TplStr) { + llvm::mustache::Template Tpl(TplStr); + llvm::json::Value Data = + llvm::json::Object({{"content", llvm::json::Value(LongHtmlString)}}); + for (auto _ : state) { + std::string Result; + llvm::raw_string_ostream OS(Result); + Tpl.render(Data, OS); + benchmark::DoNotOptimize(Result); + } +} +BENCHMARK_CAPTURE(BM_Mustache_StringRendering, Escaped, BulkEscapingTemplate); +BENCHMARK_CAPTURE(BM_Mustache_StringRendering, Unescaped_Triple, + BulkUnescapedTemplate); +BENCHMARK_CAPTURE(BM_Mustache_StringRendering, Unescaped_Ampersand, + BulkUnescapedAmpersandTemplate); + +// Tests the "hot render" cost of repeatedly traversing a deep and wide +// JSON object. +static void BM_Mustache_DeepTraversal(benchmark::State &state) { + llvm::mustache::Template Tpl(DeepTraversalTemplate); + for (auto _ : state) { + std::string Result; + llvm::raw_string_ostream OS(Result); + Tpl.render(DeepJsonData, OS); + benchmark::DoNotOptimize(Result); + } +} +BENCHMARK(BM_Mustache_DeepTraversal); + +// Tests the "hot render" cost of pushing and popping a deep context stack. +static void BM_Mustache_DeeplyNestedRendering(benchmark::State &state) { + llvm::mustache::Template Tpl(DeeplyNestedRenderingTemplate); + for (auto _ : state) { + std::string Result; + llvm::raw_string_ostream OS(Result); + Tpl.render(DeepJsonData, OS); + benchmark::DoNotOptimize(Result); + } +} +BENCHMARK(BM_Mustache_DeeplyNestedRendering); + +// Tests the performance of the loop logic when iterating over a huge number of +// items. +static void BM_Mustache_HugeArrayIteration(benchmark::State &state) { + llvm::mustache::Template Tpl(HugeArrayIterationTemplate); + for (auto _ : state) { + std::string Result; + llvm::raw_string_ostream OS(Result); + Tpl.render(HugeArrayData, OS); + benchmark::DoNotOptimize(Result); + } +} +BENCHMARK(BM_Mustache_HugeArrayIteration); + +// Tests the performance of the parser on a large, "wide" template. +static void BM_Mustache_ComplexTemplateParsing(benchmark::State &state) { + for (auto _ : state) { + llvm::mustache::Template Tpl(ComplexTemplateParsingTemplate); + benchmark::DoNotOptimize(Tpl); + } +} +BENCHMARK(BM_Mustache_ComplexTemplateParsing); + +// Tests the performance of the parser on a small, "deep" template. +static void BM_Mustache_SmallTemplateParsing(benchmark::State &state) { + for (auto _ : state) { + llvm::mustache::Template Tpl(SmallTemplateParsingTemplate); + benchmark::DoNotOptimize(Tpl); + } +} +BENCHMARK(BM_Mustache_SmallTemplateParsing); + +// Tests the performance of rendering a template that includes a partial. +static void BM_Mustache_PartialsRendering(benchmark::State &state) { + llvm::mustache::Template Tpl(ComplexPartialTemplate); + Tpl.registerPartial("item_partial", ItemPartialTemplate); + llvm::json::Value Data = HugeArrayData; + + for (auto _ : state) { + std::string Result; + llvm::raw_string_ostream OS(Result); + Tpl.render(Data, OS); + benchmark::DoNotOptimize(Result); + } +} +BENCHMARK(BM_Mustache_PartialsRendering); + +// Tests the performance of the underlying buffer management when generating a +// very large output. 
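+// LargeOutputData maps {{long_string}} to a 1 MiB string of 'A' characters, so
+// the measured time is dominated by output-buffer growth rather than escaping.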
+static void BM_Mustache_LargeOutputString(benchmark::State &state) { + llvm::mustache::Template Tpl(LargeOutputStringTemplate); + for (auto _ : state) { + std::string Result; + llvm::raw_string_ostream OS(Result); + Tpl.render(LargeOutputData, OS); + benchmark::DoNotOptimize(Result); + } +} +BENCHMARK(BM_Mustache_LargeOutputString); + +BENCHMARK_MAIN(); diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index edabdc595a1f0..74b7604fda56d 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -979,11 +979,13 @@ supported for the ``amdgcn`` target. access is not supported except by flat and scratch instructions in GFX9-GFX11. - Code that manipulates the stack values in other lanes of a wavefront, - such as by ``addrspacecast``-ing stack pointers to generic ones and taking offsets - that reach other lanes or by explicitly constructing the scratch buffer descriptor, - triggers undefined behavior when it modifies the scratch values of other lanes. - The compiler may assume that such modifications do not occur. + On targets without "Globally Accessible Scratch" (introduced in GFX125x), code that + manipulates the stack values in other lanes of a wavefront, such as by + ``addrspacecast``-ing stack pointers to generic ones and taking offsets that reach other + lanes or by explicitly constructing the scratch buffer descriptor, triggers undefined + behavior when it modifies the scratch values of other lanes. The compiler may assume + that such modifications do not occur for such targets. + When using code object V5 ``LIBOMPTARGET_STACK_SIZE`` may be used to provide the private segment size in bytes, for cases where a dynamic stack is used. @@ -1515,6 +1517,88 @@ The AMDGPU backend implements the following LLVM IR intrinsics. List AMDGPU intrinsics. +'``llvm.amdgcn.cooperative.atomic``' Intrinsics +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``llvm.amdgcn.cooperative.atomic`` :ref:`family of intrinsics` +provide atomic load and store operations to a naturally-aligned contiguous memory regions. +Memory is accessed cooperatively by a collection of convergent threads, with each thread accessing +a fraction of the contiguous memory region. + + .. TODO:: + + The memory model described here is imprecise; see SWDEV-536264. + +This intrinsic has a memory ordering and may be used to synchronize-with another cooperative atomic. +If the memory ordering is relaxed, it may pair with a fence if that same fence is executed by +all participating threads with the same synchronization scope and set of address spaces. + +In both cases, a synchronize-with relation can only be established between cooperative atomics with the +same total access size. + +Each target may have additional restrictions on how the intrinsic may be used; see +:ref:`the table below`. +Targets not covered in the table do not support these intrinsics. + + .. table:: AMDGPU Cooperative Atomic Intrinsics Availability + :name: amdgpu-llvm-ir-cooperative-atomic-intrinsics-availability + + =============== ============================================================= + GFX Version Target Restrictions + =============== ============================================================= + GFX 12.5 :ref:`amdgpu-amdhsa-memory-model-gfx125x-cooperative-atomics` + =============== ============================================================= + +If the intrinsic is used without meeting all of the above conditions, or the target-specific conditions, +then this intrinsic causes undefined behavior. + + .. 
table:: AMDGPU Cooperative Atomic Intrinsics + :name: amdgpu-cooperative-atomic-intrinsics-table + + ======================================================= =========== ============ ========== + LLVM Intrinsic Number of Access Size Total Size + Threads Per Thread + Used + ======================================================= =========== ============ ========== + ``llvm.amdgcn.cooperative.atomic.store.32x4B`` 32 4B 128B + + ``llvm.amdgcn.cooperative.atomic.load.32x4B`` 32 4B 128B + + ``llvm.amdgcn.cooperative.atomic.store.16x8B`` 16 8B 128B + + ``llvm.amdgcn.cooperative.atomic.load.16x8B`` 16 8B 128B + + ``llvm.amdgcn.cooperative.atomic.store.8x16B`` 8 16B 128B + + ``llvm.amdgcn.cooperative.atomic.load.8x16B`` 8 16B 128B + + ======================================================= =========== ============ ========== + +The intrinsics are available for the global (``.p1`` suffix) and generic (``.p0`` suffix) address spaces. + +The atomic ordering operand (3rd operand for ``.store``, 2nd for ``.load``) is an integer that follows the +C ABI encoding of atomic memory orderings. The supported values are in +:ref:`the table below`. + + .. table:: AMDGPU Cooperative Atomic Intrinsics Atomic Memory Orderings + :name: amdgpu-cooperative-atomic-intrinsics-atomic-memory-orderings-table + + ====== ================ ================================= + Value Atomic Memory Notes + Ordering + ====== ================ ================================= + ``0`` ``relaxed`` The default for unsupported values. + + ``2`` ``acquire`` Only for ``.load`` + + ``3`` ``release`` Only for ``.store`` + + ``5`` ``seq_cst`` + ====== ================ ================================= + +The last argument of the intrinsic is the synchronization scope +as a metadata string, which must be one of the supported :ref:`memory scopes`. + .. _amdgpu_metadata: LLVM IR Metadata @@ -1843,6 +1927,7 @@ The AMDGPU backend supports the following LLVM IR attributes. This is only relevant on targets with cluster support. + ================================================ ========================================================== Calling Conventions @@ -5261,6 +5346,9 @@ The fields used by CP for code objects before V3 also match those specified in GFX10-GFX12 (wavefront size 32) - max_vgpr 1..256 - max(0, ceil(vgprs_used / 8) - 1) + GFX125X (wavefront size 32) + - max_vgpr 1..1024 + - max(0, ceil(vgprs_used / 16) - 1) Where vgprs_used is defined as the highest VGPR number @@ -6491,6 +6579,7 @@ following sections: * :ref:`amdgpu-amdhsa-memory-model-gfx942` * :ref:`amdgpu-amdhsa-memory-model-gfx10-gfx11` * :ref:`amdgpu-amdhsa-memory-model-gfx12` +* :ref:`amdgpu-amdhsa-memory-model-gfx125x` .. _amdgpu-fence-as: @@ -16617,6 +16706,2022 @@ the instruction in the code sequence that references the table. - system for OpenCL.* ============ ============ ============== ========== ================================ +.. _amdgpu-amdhsa-memory-model-gfx125x: + +Memory Model GFX125x +++++++++++++++++++++++++ + +For GFX125x: + +**Device Structure:** + +* Each agent has multiple shader engines (SE). +* Each SE has multiple shader arrays (SA). +* Each SA has multiple work-group processors (WGP). +* Each WGP has 4 SIMD32 (2 SIMD32-pairs) that execute wavefronts. +* The wavefronts for a single work-group are executed in the same + WGP. + +**Device Memory:** + +* Each WGP has a single write-through WGP cache (WGP$) shared by the wavefronts of the + work-groups executing on it. The WGP$ is divided between LDS and vector L0 memory. 
+ + * Vector L0 memory holds clean data only. + +* Each WGP$ has two request queues; one per SIMD32-pair. + Each queue can handle both LDS and vector L0 requests. Requests in one queue + are executed serially and in-order, but are not kept in order with the other queue. +* The scalar memory operations access a scalar L0 cache shared by all wavefronts + on a WGP. The scalar and vector L0 caches are not kept coherent by hardware. However, scalar + operations are used in a restricted way so do not impact the memory model. See + :ref:`amdgpu-amdhsa-memory-spaces`. +* The vector and scalar memory L0 caches are both clients of an L1 buffer shared by + all WGPs on the same SE. +* L1 buffers have separate request queues for each WGP$ it serves. Requests in one queue + are executed serially and in-order, but are not kept in order with other queues. +* L1 buffers are clients of the L2 cache. +* There may be multiple L2 caches per agent. Ranges of virtual addresses can be set up as follows: + + * Be non-hardware-coherent; copies of the data are not coherent between multiple L2s. + * Be read-write hardware-coherent with other L2 caches on the same or other agents. + * Bypass L2 entirely to ensure system coherence. + +* L2 caches have multiple memory channels to service disjoint ranges of virtual + addresses. + +**Memory Model:** + +.. note:: + + This section is currently incomplete as work on the compiler is still ongoing. + The following is a non-exhaustive list of unimplemented/undocumented features: + non-volatile bit code sequences, monitor and wait, globally accessing scratch atomics, + multicast loads, barriers (including split barriers) and cooperative atomics. + Scalar operations memory model needs more elaboration as well. + +* Vector memory operations are performed as wavefront wide operations, with the + ``EXEC`` mask predicating which lanes execute. +* Consecutive vector memory operations from the same wavefront are issued in program order. + Vector memory operations are issued (and executed) in no particular order between wavefronts. +* Wave execution of a vector memory operation instruction issues (initiates) the operation, + but completion occurs an unspecified amount of time later. + The ``s_wait_*cnt`` instructions must be used to determine if the operation has completed. +* The types of vector memory operations (and their associated ``s_wait_*cnt`` instructions) are: + + * Load (global, scratch, flat, buffer): ``s_wait_loadcnt`` + * Store (global, scratch, flat, buffer): ``s_wait_storecnt`` + * non-ASYNC LDS: ``s_wait_dscnt`` + * ASYNC LDS: ``s_wait_asynccnt`` + * Tensor: ``s_wait_tensorcnt`` + +* ``s_wait_xcnt`` is a counter that is incremented when a memory operation is issued, and + decremented when memory address translation for that instruction is completed. + Waiting on a memory counter ``s_wait_*cnt N`` also waits on ``s_wait_xcnt N``. + + * ``s_wait_xcnt 0x0`` is required before flat and global atomic stores/read-modify-write + operations to guarantee atomicity during a xnack replay. + +* Within a wavefront, vector memory operation completion (``s_wait_*cnt`` decrement) is + reported in order of issue within a type, but in no particular order between types. +* Within a wavefront, the order in which data is returned to registers by a vector memory + operation can be different from the order in which the vector memory operations were issued. 
+
+  * Thus, a ``s_wait_*cnt`` instruction must be used to prevent multiple vector memory operations
+    that return results to the same register from executing concurrently as they may not return
+    their results in instruction issue order, even though they will be reported as completed in
+    instruction issue order by the decrementing of the counter.
+
+* Within a wavefront, consecutive loads and stores to the same address will be processed in program
+  order by the memory subsystem. Loads and stores to different addresses may be processed
+  out of order with respect to one another.
+* All non-ASYNC LDS vector memory operations of a WGP are performed as wavefront wide
+  operations in a global order and involve no caching. Completion is reported to a wavefront in
+  execution order.
+* ASYNC LDS and tensor vector memory operations are not covered by the memory model implemented
+  by the AMDGPU backend. Neither ``s_wait_asynccnt`` nor ``s_wait_tensorcnt`` are inserted
+  automatically. They must be emitted using compiler built-in calls.
+* Some vector memory operations contain a ``SCOPE`` field with values
+  corresponding to each cache level. The ``SCOPE`` determines whether a cache
+  can complete an operation locally or whether it needs to forward the operation
+  to the next cache level. The ``SCOPE`` values are:
+
+  * ``SCOPE_CU``: WGP
+  * ``SCOPE_SE``: Shader Engine
+  * ``SCOPE_DEV``: Device/Agent
+  * ``SCOPE_SYS``: System
+
+* Each cache is assigned a ``SCOPE`` by the hardware depending on the agent's
+  configuration.
+
+  * This ensures that ``SCOPE_DEV`` can always be used to implement agent coherence,
+    even in the presence of multiple non-coherent L2 caches on the same agent.
+
+* When a vector memory operation with a given ``SCOPE`` reaches a cache with a smaller
+  ``SCOPE`` value, it is forwarded to the next level of cache.
+* When a vector memory operation with a given ``SCOPE`` reaches a cache with a ``SCOPE``
+  value greater than or equal to its own, the operation can proceed:
+
+  * Reads can hit into the cache.
+  * Writes can happen in this cache and completion (``s_wait`` decrement) can be
+    reported.
+  * RMW operations can be done locally.
+
+* Some memory operations contain a ``nv`` bit, for "non-volatile", which indicates
+  memory that is not expected to change during a kernel's execution.
+  This information is propagated to the cache lines for that address
+  (referred to as ``$nv``).
+
+  * When ``nv=0`` reads hit dirty ``$nv=1`` data in cache, the hardware will
+    write back the data to the next level in the hierarchy and then subsequently read
+    it again, updating the cache line with a clean ``$nv=0`` copy of the data.
+
+* ``global_inv``, ``global_wb`` and ``global_wbinv`` are cache control instructions.
+  The affected cache(s) are controlled by the ``SCOPE`` of the instruction.
+  Only caches whose scope is strictly smaller than the instruction's are affected.
+
+  * ``global_inv`` invalidates the data in affected caches so that subsequent reads
+    will re-read from the next level in the cache hierarchy.
+    The invalidation requests cannot be reordered with pending or upcoming
+    memory operations. Instruction completion is reported using ``s_wait_loadcnt``.
+  * ``global_wb`` flushes the dirty data in affected caches to the next level in
+    the cache hierarchy. This instruction additionally ensures previous
+    memory operations done at a lower scope level have reached the desired
+    ``SCOPE:``. 
Instruction completion is reported using ``s_wait_storecnt`` once + all data has been acknowledged by the next level in the cache hierarchy. + * ``global_wbinv`` performs a ``global_inv`` then a ``global_wb``. + Instruction completion is reported using ``s_wait_storecnt``. + * ``global_inv``, ``global_wb`` and ``global_wbinv`` with ``nv=0`` can only + affect ``$nv=0`` cache lines, whereas ``nv=1`` can affect all cache lines. + * ``global_inv``, ``global_wb`` and ``global_wbinv`` behave like memory operations + issued to every address at the same time. They are kept in order with other + memory operations from the same wave. + +Scalar memory operations are only used to access memory that is proven to not +change during the execution of the kernel dispatch. This includes constant +address space and global address space for program scope ``const`` variables. +Therefore, the kernel machine code does not have to maintain the scalar cache to +ensure it is coherent with the vector caches. The scalar and vector caches are +invalidated between kernel dispatches by CP since constant address space data +may change between kernel dispatch executions. See +:ref:`amdgpu-amdhsa-memory-spaces`. + +Atomics in the scratch address space are handled as follows: + +* Data types <= 32 bits: The instruction is converted into an atomic in the + generic (``flat``) address space. All properties of the atomic + (atomic ordering, volatility, alignment, etc.) are preserved. + Refer to the generic address space code sequences for further information. +* Data types >32 bits: unsupported and an error is emitted. + +The code sequences used to implement the memory model for GFX125x are defined in +table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-table`. + +The mapping of LLVM IR syncscope to GFX125x instruction ``scope`` operands is +defined in :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + +The table only applies if and only if it is directly referenced by an entry in +:ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-table`, and it only applies to +the instruction in the code sequence that references the table. + + .. table:: AMDHSA Memory Model Code Sequences GFX125x - Instruction Scopes + :name: amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table + + ================================= ======================= + LLVM syncscope ISA + + + ================================= ======================= + *none*, one-as ``scope:SCOPE_SYS`` + system, system-one-as ``scope:SCOPE_SYS`` + agent, agent-one-as ``scope:SCOPE_DEV`` + cluster, cluster-one-as ``scope:SCOPE_SE`` + workgroup, workgroup-one-as ``scope:SCOPE_CU`` [1]_ + wavefront, wavefront-one-as ``scope:SCOPE_CU`` [1]_ + singlethread, singlethread-one-as ``scope:SCOPE_CU`` [1]_ + ================================= ======================= + + .. [1] ``SCOPE_CU`` is the default ``scope:`` emitted by the compiler. + It will be omitted when instructions are emitted in textual form by the compiler. + + .. 
table:: AMDHSA Memory Model Code Sequences GFX125x + :name: amdgpu-amdhsa-memory-model-code-sequences-gfx125x-table + + ============ ============ ============== ========== ================================ + LLVM Instr LLVM Memory LLVM Memory AMDGPU AMDGPU Machine Code + Ordering Sync Scope Address GFX125x + Space + ============ ============ ============== ========== ================================ + **Non-Atomic** + ------------------------------------------------------------------------------------ + load *none* *none* - global - !volatile & !nontemporal + - generic + - private 1. buffer/global/flat_load + - constant + - !volatile & nontemporal + + 1. buffer/global/flat_load + ``th:TH_LOAD_NT`` + + - volatile + + 1. buffer/global/flat_load + ``scope:SCOPE_SYS`` + + 2. ``s_wait_loadcnt 0x0`` + + - Must happen before + any following volatile + global/generic + load/store. + - Ensures that + volatile + operations to + different + addresses will not + be reordered by + hardware. + + load *none* *none* - local 1. ds_load + store *none* *none* - global - !volatile & !nontemporal + - generic + - private 1. buffer/global/flat_store + - constant + - !volatile & nontemporal + + 1. buffer/global/flat_store + ``th:TH_STORE_NT`` + + - volatile + + 1. buffer/global/flat_store + ``scope:SCOPE_SYS`` + + 2. ``s_wait_storecnt 0x0`` + + - Must happen before + any following volatile + global/generic + load/store. + - Ensures that + volatile + operations to + different + addresses will not + be reordered by + hardware. + + store *none* *none* - local 1. ds_store + **Unordered Atomic** + ------------------------------------------------------------------------------------ + load atomic unordered *any* *any* *Same as non-atomic*. + store atomic unordered *any* *any* *Same as non-atomic*. + atomicrmw unordered *any* *any* *Same as monotonic atomic*. + **Monotonic Atomic** + ------------------------------------------------------------------------------------ + load atomic monotonic - singlethread - global 1. buffer/global/flat_load + - wavefront - generic + - workgroup - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - cluster + - agent + - system + load atomic monotonic - singlethread - local 1. ds_load + - wavefront + - workgroup + store atomic monotonic - singlethread - global 1. ``s_wait_xcnt 0x0`` + - wavefront - generic + - workgroup - Ensure operation remains atomic even during a xnack replay. + - cluster - Only needed for ``flat`` and ``global`` operations. + - agent + - system 2. buffer/global/flat_store + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + store atomic monotonic - singlethread - local 1. ds_store + - wavefront + - workgroup + atomicrmw monotonic - singlethread - global 1. ``s_wait_xcnt 0x0`` + - wavefront - generic + - workgroup - Ensure operation remains atomic even during a xnack replay. + - cluster - Only needed for ``flat`` and ``global`` operations. + - agent + - system 2. buffer/global/flat_atomic + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + atomicrmw monotonic - singlethread - local 1. ds_atomic + - wavefront + - workgroup + **Acquire Atomic** + ------------------------------------------------------------------------------------ + load atomic acquire - singlethread - global 1. buffer/global/ds/flat_load + - wavefront - local + - generic + load atomic acquire - workgroup - global 1. 
buffer/global_load + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + 2. ``s_wait_loadcnt 0x0`` + + - Must happen before any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + + + load atomic acquire - workgroup - local 1. ds_load + 2. ``s_wait_dscnt 0x0`` + + - If OpenCL, omit. + - Must happen before any following + global/generic load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than the local load + atomic value being + acquired. + + + load atomic acquire - workgroup - generic 1. flat_load + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + 2. | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit ``s_wait_dscnt 0x0`` + - Must happen before any + following global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures any + following global + data read is no + older than a local load + atomic value being + acquired. + + load atomic acquire - cluster - global 1. buffer/global_load + - agent + - system - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + 2. ``s_wait_loadcnt 0x0`` + + - Must happen before + following + ``global_inv``. + - Ensures the load + has completed + before invalidating + the caches. + + 3. ``global_inv`` + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following + loads will not see + stale global data. + + load atomic acquire - cluster - generic 1. flat_load + - agent + - system - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + 2. | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit ``s_wait_dscnt 0x0`` + - Must happen before + following + ``global_inv``. + - Ensures the flat_load + has completed + before invalidating + the caches. + + 3. ``global_inv`` + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + atomicrmw acquire - singlethread - global 1. ``s_wait_xcnt 0x0`` + - wavefront - local + - generic - Ensure operation remains atomic even during a xnack replay. + - Only needed for ``flat`` and ``global`` operations. + + 2. buffer/global/ds/flat_atomic + + atomicrmw acquire - workgroup - global 1. ``s_wait_xcnt 0x0`` + + - Ensure operation remains atomic even during a xnack replay. + - Only needed for ``flat`` and ``global`` operations. + + 2. buffer/global_atomic + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - If atomic with return, + use ``th:TH_ATOMIC_RETURN`` + + 3. | **Atomic with return:** + | ``s_wait_loadcnt 0x0`` + | **Atomic without return:** + | ``s_wait_storecnt 0x0`` + + - Must happen before any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + + atomicrmw acquire - workgroup - local 1. ds_atomic + 2. ``s_wait_dscnt 0x0`` + + - If OpenCL, omit. + - Ensures any + following global + data read is no + older than the local + atomicrmw value + being acquired. + + + atomicrmw acquire - workgroup - generic 1. ``s_wait_xcnt 0x0`` + + - Ensure operation remains atomic even during a xnack replay. + + 2. 
flat_atomic + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - If atomic with return, + use ``th:TH_ATOMIC_RETURN`` + + 3. | **Atomic with return:** + | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + | **Atomic without return:** + | ``s_wait_storecnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit ``s_wait_dscnt 0x0`` + - Ensures any + following global + data read is no + older than the local + atomicrmw value + being acquired. + + atomicrmw acquire - cluster - global 1. ``s_wait_xcnt 0x0`` + - agent + - system - Ensure operation remains atomic even during a xnack replay. + - Only needed for ``global`` operations. + + 2. buffer/global_atomic + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - If atomic with return, + use ``th:TH_ATOMIC_RETURN`` + + 3. | **Atomic with return:** + | ``s_wait_loadcnt 0x0`` + | **Atomic without return:** + | ``s_wait_storecnt 0x0`` + + - Must happen before + following ``global_inv``. + - Ensures the + atomicrmw has + completed before + invalidating the + caches. + + 4. ``global_inv`` + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + atomicrmw acquire - cluster - generic 1. ``s_wait_xcnt 0x0`` + - agent + - system - Ensure operation remains atomic even during a xnack replay. + + 2. flat_atomic + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - If atomic with return, + use ``th:TH_ATOMIC_RETURN`` + + 3. | **Atomic with return:** + | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + | **Atomic without return:** + | ``s_wait_storecnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit dscnt + - Must happen before + following + global_inv + - Ensures the + atomicrmw has + completed before + invalidating the + caches. + + 4. ``global_inv`` + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + fence acquire - singlethread *none* *none* + - wavefront + fence acquire - workgroup *none* 1. | ``s_wait_storecnt 0x0`` + | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit ``s_wait_dscnt 0x0`` + - If OpenCL and address space is local, + omit all. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic load + atomic/ + atomicrmw-with-return-value + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - ``s_wait_storecnt 0x0`` + must happen after + any preceding + global/generic + atomicrmw-no-return-value + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - ``s_wait_dscnt 0x0`` + must happen after + any preceding + local/generic load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Ensures that the + fence-paired atomic + has completed + before invalidating + the + cache. 
Therefore + any following + locations read must + be no older than + the value read by + the + fence-paired-atomic. + + + fence acquire - cluster *none* 1. | ``s_wait_storecnt 0x0`` + - agent | ``s_wait_loadcnt 0x0`` + - system | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit ``s_wait_dscnt 0x0``. + - If OpenCL and address space is + local, omit all. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic load + atomic/ + atomicrmw-with-return-value + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - ``s_wait_storecnt 0x0`` + must happen after + any preceding + global/generic + atomicrmw-no-return-value + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - ``s_wait_dscnt 0x0`` + must happen after + any preceding + local/generic load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Must happen before + the following + ``global_inv`` + - Ensures that the + fence-paired atomic + has completed + before invalidating the + caches. Therefore + any following + locations read must + be no older than + the value read by + the + fence-paired-atomic. + + 2. ``global_inv`` + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - Ensures that + following + loads will not see + stale data. + + **Release Atomic** + ------------------------------------------------------------------------------------ + store atomic release - singlethread - global 1. ``s_wait_xcnt 0x0`` + - wavefront - local + - generic - Ensure operation remains atomic even during a xnack replay. + - Only needed for ``flat`` and ``global`` operations. + + 2. buffer/global/ds/flat_store + + store atomic release - workgroup - global 1. | ``s_wait_storecnt 0x0`` + - cluster - generic | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit ``s_wait_dscnt 0x0``. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic load/load + atomic/ + atomicrmw-with-return-value. + - ``s_wait_storecnt 0x0`` + must happen after + any preceding + global/generic + store/store + atomic/ + atomicrmw-no-return-value. + - ``s_wait_dscnt 0x0`` + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before the + following store. + - Ensures that all + memory operations + have + completed before + performing the + store that is being + released. + + 2. ``s_wait_xcnt 0x0`` + + - Ensure operation remains atomic even during a xnack replay. + - Only needed for ``flat`` and ``global`` operations. + + 3. buffer/global/flat_store + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + store atomic release - workgroup - local 1. | ``s_wait_storecnt 0x0`` + | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic load/load + atomic/ + atomicrmw-with-return-value. 
+ - ``s_wait_storecnt 0x0`` + must happen after + any preceding + global/generic + store/store + atomic/ + atomicrmw-no-return-value. + - Must happen before the + following store. + - Ensures that all + global memory + operations have + completed before + performing the + store that is being + released. + + 2. ds_store + store atomic release - agent - global 1. ``global_wb`` + - system - generic + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + 2. | ``s_wait_storecnt 0x0`` + | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit ``s_wait_dscnt 0x0``. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic + load/load + atomic/ + atomicrmw-with-return-value. + - ``s_wait_storecnt 0x0`` + must happen after + ``global_wb`` or + any preceding + global/generic + store/store + atomic/ + atomicrmw-no-return-value. + - ``s_wait_dscnt 0x0`` + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before the + following store. + - Ensures that all + memory operations + have + completed before + performing the + store that is being + released. + + 3. ``s_wait_xcnt 0x0`` + + - Ensure operation remains atomic even during a xnack replay. + - Only needed for ``flat`` and ``global`` operations. + + 4. buffer/global/flat_store + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + atomicrmw release - singlethread - global 1. ``s_wait_xcnt 0x0`` + - wavefront - local + - generic - Ensure operation remains atomic even during a xnack replay. + - Only needed for ``flat`` and ``global`` operations. + + 2. buffer/global/ds/flat_atomic + atomicrmw release - workgroup - global 1. | ``s_wait_storecnt 0x0`` + - cluster - generic | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit ``s_wait_dscnt 0x0``. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic load/load + atomic/ + atomicrmw-with-return-value. + - ``s_wait_storecnt 0x0`` + must happen after + any preceding + global/generic + store/store + atomic/ + atomicrmw-no-return-value. + - ``s_wait_dscnt 0x0`` + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before the + following atomic. + - Ensures that all + memory operations + have + completed before + performing the + atomicrmw that is + being released. + + 2. ``s_wait_xcnt 0x0`` + + - Ensure operation remains atomic even during a xnack replay. + - Only needed for ``flat`` and ``global`` operations. + + 3. buffer/global/flat_atomic + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + atomicrmw release - workgroup - local 1. | ``s_wait_storecnt 0x0`` + | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit all. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic load/load + atomic/ + atomicrmw-with-return-value. + - ``s_wait_storecnt 0x0`` + must happen after + any preceding + global/generic + store/store + atomic/ + atomicrmw-no-return-value. + - Must happen before the + following atomic. + - Ensures that all + global memory + operations have + completed before + performing the + store that is being + released. 
+ + 2. ds_atomic + atomicrmw release - agent - global 1. ``global_wb`` + - system - generic + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + 2. | ``s_wait_storecnt 0x0`` + | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit ``s_wait_dscnt 0x0``. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic + load/load atomic/ + atomicrmw-with-return-value. + - ``s_wait_storecnt 0x0`` + must happen after + ``global_wb`` or + any preceding + global/generic + store/store + atomic/ + atomicrmw-no-return-value. + - ``s_wait_dscnt 0x0`` + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before the + following atomic. + - Ensures that all + memory operations + to global and local + have completed + before performing + the atomicrmw that + is being released. + + 3. ``s_wait_xcnt 0x0`` + + - Ensure operation remains atomic even during a xnack replay. + - Only needed for ``flat`` and ``global`` operations. + + 4. buffer/global/flat_atomic + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + fence release - singlethread *none* *none* + - wavefront + fence release - workgroup *none* 1. | ``s_wait_storecnt 0x0`` + - cluster | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit ``s_wait_dscnt 0x0``. + - If OpenCL and + address space is + local, omit all. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic + load/load + atomic/ + atomicrmw-with-return-value. + - ``s_wait_storecnt 0x0`` + must happen after + any preceding + global/generic + store/store + atomic/ + atomicrmw-no-return-value. + - ``s_wait_dscnt 0x0`` + must happen after + any preceding + local/generic + load/store/load + atomic/store atomic/ + atomicrmw. + - Must happen before + any following store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Ensures that all + memory operations + have + completed before + performing the + following + fence-paired-atomic. + + fence release - agent *none* 1. ``global_wb`` + - system + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + 2. | ``s_wait_storecnt 0x0`` + | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + | **OpenCL:** + | ``s_wait_storecnt 0x0`` + | ``s_wait_loadcnt 0x0`` + + - If OpenCl, omit ``s_wait_dscnt 0x0``. + - If OpenCL and address space is local, + omit all. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic + load/load atomic/ + atomicrmw-with-return-value. + - ``s_wait_storecnt 0x0`` + must happen after + ``global_wb`` or + any preceding + global/generic + store/store + atomic/ + atomicrmw-no-return-value. + - ``s_wait_dscnt 0x0`` + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. 
+ - Must happen before + any following store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + fence-paired-atomic). + - Ensures that all + memory operations + have + completed before + performing the + following + fence-paired-atomic. + + **Acquire-Release Atomic** + ------------------------------------------------------------------------------------ + atomicrmw acq_rel - singlethread - global 1. ``s_wait_xcnt 0x0`` + - wavefront - local + - generic - Ensure operation remains atomic even during a xnack replay. + - Only needed for ``flat`` and ``global`` operations. + + 2. buffer/global/ds/flat_atomic + atomicrmw acq_rel - workgroup - global 1. | ``s_wait_storecnt 0x0`` + - cluster | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit ``s_wait_dscnt 0x0``. + - Must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic load/load + atomic/ + atomicrmw-with-return-value. + - ``s_wait_storecnt 0x0`` + must happen after + any preceding + global/generic + store/store + atomic/ + atomicrmw-no-return-value. + - ``s_wait_dscnt 0x0`` + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + have + completed before + performing the + atomicrmw that is + being released. + + 2. ``s_wait_xcnt 0x0`` + + - Ensure operation remains atomic even during a xnack replay. + - Only needed for ``flat`` and ``global`` operations. + + 3. buffer/global_atomic + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - If atomic with return, use + ``th:TH_ATOMIC_RETURN``. + + 4. | **Atomic with return:** + | ``s_wait_loadcnt 0x0`` + | **Atomic without return:** + | ``s_wait_storecnt 0x0`` + + - Ensures any + following global + data read is no + older than the + atomicrmw value + being acquired. + + atomicrmw acq_rel - workgroup - local 1 | ``s_wait_storecnt 0x0`` + | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic load/load + atomic/ + atomicrmw-with-return-value. + - ``s_wait_storecnt 0x0`` + must happen after + any preceding + global/generic + store/store + atomic/ + atomicrmw-no-return-value. + - Must happen before + the following + store. + - Ensures that all + global memory + operations have + completed before + performing the + store that is being + released. + + 2. ds_atomic + 3. ``s_wait_dscnt 0x0`` + + - If OpenCL, omit. + - Ensures any + following global + data read is no + older than the local load + atomic value being + acquired. + + atomicrmw acq_rel - workgroup - generic 1. | ``s_wait_storecnt 0x0`` + - cluster | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit ``s_wait_loadcnt 0x0``. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic load/load + atomic/ + atomicrmw-with-return-value. + - ``s_wait_storecnt 0x0`` + must happen after + any preceding + global/generic + store/store + atomic/ + atomicrmw-no-return-value. 
+ - ``s_wait_dscnt 0x0`` + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + have + completed before + performing the + atomicrmw that is + being released. + + 2. ``s_wait_xcnt 0x0`` + + - Ensure operation remains atomic even during a xnack replay. + + 3. flat_atomic + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - If atomic with return, + use ``th:TH_ATOMIC_RETURN``. + + 4. | **Atomic without return:** + | ``s_wait_dscnt 0x0`` + | ``s_wait_storecnt 0x0`` + | **Atomic with return:** + | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit ``s_wait_dscnt 0x0`` + - Ensures any + following global + data read is no + older than the load + atomic value being + acquired. + + + atomicrmw acq_rel - agent - global 1. ``global_wb`` + - system + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + 2. | ``s_wait_storecnt 0x0`` + | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit + ``s_wait_dscnt 0x0`` + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic + load/load atomic/ + atomicrmw-with-return-value. + - ``s_wait_storecnt 0x0`` + must happen after + ``global_wb``. + - ``s_wait_dscnt 0x0`` + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + to global have + completed before + performing the + atomicrmw that is + being released. + + 2. ``s_wait_xcnt 0x0`` + + - Ensure operation remains atomic even during a xnack replay. + - Only needed for ``global`` operations. + + 3. buffer/global_atomic + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - If atomic with return, use + ``th:TH_ATOMIC_RETURN``. + + 4. | **Atomic with return:** + | ``s_wait_loadcnt 0x0`` + | **Atomic without return:** + | ``s_wait_storecnt 0x0`` + + - Must happen before + following + ``global_inv``. + - Ensures the + atomicrmw has + completed before + invalidating the + caches. + + 5. ``global_inv`` + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + atomicrmw acq_rel - agent - generic 1. ``global_wb`` + - system + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + 2. | ``s_wait_storecnt 0x0`` + | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit + ``s_wait_dscnt 0x0`` + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic + load/load atomic + atomicrmw-with-return-value. + - ``s_wait_storecnt 0x0`` + must happen after + ``global_wb``. + - ``s_wait_dscnt 0x0`` + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + atomicrmw. + - Ensures that all + memory operations + have + completed before + performing the + atomicrmw that is + being released. + + 3. ``s_wait_xcnt 0x0`` + + - Ensure operation remains atomic even during a xnack replay. + + 4. 
flat_atomic + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - If atomic with return, use + ``th:TH_ATOMIC_RETURN``. + + 5. | **Atomic with return:** + | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + | **Atomic without return:** + | ``s_wait_storecnt 0x0`` + | ``s_wait_dscnt 0x0`` + + + - If OpenCL, omit + ``s_wait_dscnt 0x0``. + - Must happen before + following + ``global_inv``. + - Ensures the + atomicrmw has + completed before + invalidating the + caches. + + 5. ``global_inv`` + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - Must happen before + any following + global/generic + load/load + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. + + fence acq_rel - singlethread *none* *none* + - wavefront + fence acq_rel - workgroup *none* 1. | ``s_wait_storecnt 0x0`` + - cluster | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL and + address space is + not generic, omit + ``s_wait_dscnt 0x0`` + - If OpenCL and + address space is + local, omit + all but ``s_wait_dscnt 0x0``. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic + load/load + atomic/ + atomicrmw-with-return-value. + - ``s_wait_storecnt 0x0`` + must happen after + any preceding + global/generic + store/store atomic/ + atomicrmw-no-return-value. + - ``s_wait_dscnt 0x0`` + must happen after + any preceding + local/generic + load/store/load + atomic/store atomic/ + atomicrmw. + - Must happen before + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures that all + memory operations + have + completed before + performing any + following global + memory operations. + - Ensures that the + preceding + local/generic load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + acquire-fence-paired-atomic) + has completed + before following + global memory + operations. This + satisfies the + requirements of + acquire. + - Ensures that all + previous memory + operations have + completed before a + following + local/generic store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + release-fence-paired-atomic). + This satisfies the + requirements of + release. + - Ensures that the + acquire-fence-paired + atomic has completed + before invalidating + the + cache. Therefore + any following + locations read must + be no older than + the value read by + the + acquire-fence-paired-atomic. + + fence acq_rel - agent *none* 1. ``global_wb`` + - system + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + + 2. | ``s_wait_storecnt 0x0`` + | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL and + address space is + not generic, omit + ``s_wait_dscnt 0x0`` + - If OpenCL and + address space is + local, omit + all but ``s_wait_dscnt 0x0``. + - See :ref:`amdgpu-fence-as` for + more details on fencing specific + address spaces. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + any preceding + global/generic + load/load + atomic/ + atomicrmw-with-return-value. 
+ - ``s_wait_storecnt 0x0`` + must happen after + ``global_wb``. + - ``s_wait_dscnt 0x0`` + must happen after + any preceding + local/generic + load/store/load + atomic/store + atomic/atomicrmw. + - Must happen before + the following + ``global_inv`` + - Ensures that the + preceding + global/local/generic + load + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + acquire-fence-paired-atomic) + has completed + before invalidating + the caches. This + satisfies the + requirements of + acquire. + - Ensures that all + previous memory + operations have + completed before a + following + global/local/generic + store + atomic/atomicrmw + with an equal or + wider sync scope + and memory ordering + stronger than + unordered (this is + termed the + release-fence-paired-atomic). + This satisfies the + requirements of + release. + + 3. ``global_inv`` + + - Apply :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx125x-scopes-table`. + - Must happen before + any following + global/generic + load/load + atomic/store/store + atomic/atomicrmw. + - Ensures that + following loads + will not see stale + global data. This + satisfies the + requirements of + acquire. + + **Sequential Consistent Atomic** + ------------------------------------------------------------------------------------ + load atomic seq_cst - singlethread - global *Same as corresponding + - wavefront - local load atomic acquire, + - generic except must generate + all instructions even + for OpenCL.* + load atomic seq_cst - workgroup - global 1. | ``s_wait_storecnt 0x0`` + - generic | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit + ``s_wait_dscnt 0x0`` + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_dscnt 0x0`` must + happen after + preceding + local/generic load + atomic/store + atomic/atomicrmw + with memory + ordering of seq_cst + and with equal or + wider sync scope. + (Note that seq_cst + fences have their + own ``s_wait_dscnt 0x0`` + and so do not need to be + considered.) + - ``s_wait_loadcnt 0x0`` + must happen after + preceding + global/generic load + atomic/ + atomicrmw-with-return-value + with memory + ordering of seq_cst + and with equal or + wider sync scope. + (Note that seq_cst + fences have their + own waits and so do + not need to be + considered.) + - ``s_wait_storecnt 0x0`` + Must happen after + preceding + global/generic store + atomic/ + atomicrmw-no-return-value + with memory + ordering of seq_cst + and with equal or + wider sync scope. + (Note that seq_cst + fences have their + own ``s_wait_storecnt 0x0`` + and so do not need to be + considered.) + - Ensures any + preceding + sequential + consistent global/local + memory instructions + have completed + before executing + this sequentially + consistent + instruction. This + prevents reordering + a seq_cst store + followed by a + seq_cst load. (Note + that seq_cst is + stronger than + acquire/release as + the reordering of + load acquire + followed by a store + release is + prevented by the + ``s_wait``\s of + the release, but + there is nothing + preventing a store + release followed by + load acquire from + completing out of + order. The ``s_wait``\s + could be placed after + seq_store or before + the seq_load. We + choose the load to + make the ``s_wait``\s be + as late as possible + so that the store + may have already + completed.) + + 2. 
*Following + instructions same as + corresponding load + atomic acquire, + except must generate + all instructions even + for OpenCL.* + load atomic seq_cst - workgroup - local 1. | ``s_wait_storecnt 0x0`` + | ``s_wait_loadcnt 0x0`` + | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit all. + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_loadcnt 0x0`` + must happen after + preceding + global/generic load + atomic/ + atomicrmw-with-return-value + with memory + ordering of seq_cst + and with equal or + wider sync scope. + (Note that seq_cst + fences have their + own ``s_wait``\s and so do + not need to be + considered.) + - ``s_wait_storecnt 0x0`` + Must happen after + preceding + global/generic store + atomic/ + atomicrmw-no-return-value + with memory + ordering of seq_cst + and with equal or + wider sync scope. + (Note that seq_cst + fences have their + own ``s_wait_storecnt 0x0`` + and so do + not need to be + considered.) + - Ensures any + preceding + sequential + consistent global + memory instructions + have completed + before executing + this sequentially + consistent + instruction. This + prevents reordering + a seq_cst store + followed by a + seq_cst load. (Note + that seq_cst is + stronger than + acquire/release as + the reordering of + load acquire + followed by a store + release is + prevented by the + ``s_wait``\s of + the release, but + there is nothing + preventing a store + release followed by + load acquire from + completing out of + order. The s_waitcnt + could be placed after + seq_store or before + the seq_load. We + choose the load to + make the ``s_wait``\s be + as late as possible + so that the store + may have already + completed.) + + 2. *Following + instructions same as + corresponding load + atomic acquire, + except must generate + all instructions even + for OpenCL.* + + load atomic seq_cst - cluster - global 1. | ``s_wait_storecnt 0x0`` + - agent - generic | ``s_wait_loadcnt 0x0`` + - system | ``s_wait_dscnt 0x0`` + + - If OpenCL, omit + ``s_wait_dscnt 0x0`` + - The waits can be + independently moved + according to the + following rules: + - ``s_wait_dscnt 0x0`` + must happen after + preceding + local load + atomic/store + atomic/atomicrmw + with memory + ordering of seq_cst + and with equal or + wider sync scope. + (Note that seq_cst + fences have their + own ``s_wait_dscnt 0x0`` + and so do + not need to be + considered.) + - ``s_wait_loadcnt 0x0`` + must happen after + preceding + global/generic load + atomic/ + atomicrmw-with-return-value + with memory + ordering of seq_cst + and with equal or + wider sync scope. + (Note that seq_cst + fences have their + own ``s_wait``\s and so do + not need to be + considered.) + - ``s_wait_storecnt 0x0`` + Must happen after + preceding + global/generic store + atomic/ + atomicrmw-no-return-value + with memory + ordering of seq_cst + and with equal or + wider sync scope. + (Note that seq_cst + fences have their + own + ``s_wait_storecnt 0x0`` and so do + not need to be + considered.) + - Ensures any + preceding + sequential + consistent global + memory instructions + have completed + before executing + this sequentially + consistent + instruction. This + prevents reordering + a seq_cst store + followed by a + seq_cst load. 
(Note + that seq_cst is + stronger than + acquire/release as + the reordering of + load acquire + followed by a store + release is + prevented by the + ``s_wait``\s of + the release, but + there is nothing + preventing a store + release followed by + load acquire from + completing out of + order. The ``s_wait``\s + could be placed after + seq_store or before + the seq_load. We + choose the load to + make the ``s_wait``\s be + as late as possible + so that the store + may have already + completed.) + + 2. *Following + instructions same as + corresponding load + atomic acquire, + except must generate + all instructions even + for OpenCL.* + store atomic seq_cst - singlethread - global *Same as corresponding + - wavefront - local store atomic release, + - workgroup - generic except must generate + - cluster all instructions even + - agent for OpenCL.* + - system + atomicrmw seq_cst - singlethread - global *Same as corresponding + - wavefront - local atomicrmw acq_rel, + - workgroup - generic except must generate + - cluster all instructions even + - agent for OpenCL.* + - system + fence seq_cst - singlethread *none* *Same as corresponding + - wavefront fence acq_rel, + - workgroup except must generate + - cluster all instructions even + - agent for OpenCL.* + - system + ============ ============ ============== ========== ================================ + +.. _amdgpu-amdhsa-memory-model-gfx125x-cooperative-atomics: + +'``llvm.amdgcn.cooperative.atomic``' Intrinsics +############################################### + +The collection of convergent threads participating in a cooperative atomic must belong +to the same wave32. + +Only naturally-aligned, contiguous groups of lanes may be used; +see :ref:`the table below` for the set of +possible lane groups. +Cooperative atomics may be executed by more than one group per wave. +Using an unsupported lane group, or using more lane groups per wave than the maximum, will +cause undefined behavior. + +Using the intrinsic also causes undefined behavior if it loads or stores to addresses that: + +* Are not in the global address space (e.g.: private and local address spaces). +* Are only reachable through a bus that does not support 128B/256B requests + (e.g.: host memory over PCIe). +* Are otherwise unsupported (TBD, needs refinement). + +.. TODO:: + + Enumerate all cases where UB is invoked when using this intrinsic instead of hand-waving + "specific global memory locations". + +.. table:: GFX125x Cooperative Atomic Intrinsics + :name: gfx125x-cooperative-atomic-intrinsics-table + + ======================================================= ======================================= + LLVM Intrinsic Lane Groups + ======================================================= ======================================= + ``llvm.amdgcn.cooperative.atomic.store.32x4B`` ``0-31`` + + ``llvm.amdgcn.cooperative.atomic.load.32x4B`` ``0-31`` + + ``llvm.amdgcn.cooperative.atomic.store.16x8B`` ``0-15``, ``16-31`` + + ``llvm.amdgcn.cooperative.atomic.load.16x8B`` ``0-15``, ``16-31`` + + ``llvm.amdgcn.cooperative.atomic.store.8x16B`` ``0-7``, ``8-15``, ``16-23``, ``24-31`` + + ``llvm.amdgcn.cooperative.atomic.load.8x16B`` ``0-7``, ``8-15``, ``16-23``, ``24-31`` + + ======================================================= ======================================= + ..
_amdgpu-amdhsa-trap-handler-abi: Trap Handler ABI diff --git a/llvm/docs/BugLifeCycle.rst b/llvm/docs/BugLifeCycle.rst index 9bf13e64ed56e..1215af9e47e08 100644 --- a/llvm/docs/BugLifeCycle.rst +++ b/llvm/docs/BugLifeCycle.rst @@ -16,7 +16,7 @@ consistency helps reporters, developers and others to gain a better understanding of what a particular bug state actually means and what to expect might happen next. -At the same time, we aim to not over-specify the life cycle of bugs in +At the same time, we aim not to over-specify the life cycle of bugs in `the LLVM Bug Tracking System `_, as the overall goal is to make it easier to work with and understand the bug reports. @@ -39,7 +39,7 @@ use, needs to be maintained. See the following for details: Reporting bugs ============== -See :doc:`HowToSubmitABug` on further details on how to submit good bug reports. +See :doc:`HowToSubmitABug` for further details on how to submit good bug reports. You can apply `labels `_ to the bug to provide extra information to make the bug easier to discover, such @@ -50,9 +50,9 @@ as a label for the part of the project the bug pertains to. Triaging bugs ============= -Open bugs that have not been marked with the ``confirmed`` label are bugs that +Open bugs that have not been marked with the ``confirmed`` label still need to be triaged. When triage is complete, the ``confirmed`` label -should be added along with any other labels that help to classify the report, +should be added along with any other labels that help classify the report, unless the issue is being :ref:`closed`. The goal of triaging a bug is to make sure a newly reported bug ends up in a @@ -124,13 +124,13 @@ Examples of reasons for resolving are: ``duplicate`` label with a comment pointing to the issue it duplicates. * If there is a sound reason for not fixing the issue (difficulty, ABI, open - research questions, etc), add the ``wontfix`` label and a comment explaining + research questions, etc.), add the ``wontfix`` label and a comment explaining why no changes are expected. * If there is a specific and plausible reason to think that a given bug is otherwise inapplicable or obsolete. One example is an open bug that doesn't contain enough information to clearly understand the problem being reported - (e.g. not reproducible). It is fine to close such a bug, adding with the + (e.g., not reproducible). It is fine to close such a bug, adding the ``worksforme`` label and leaving a comment to encourage the reporter to reopen the bug with more information if it's still reproducible for them. @@ -140,7 +140,7 @@ Examples of reasons for resolving are: Maintenance of metadata ======================= -Project member with write access to the project can create new labels, but we +Project members with write access to the project can create new labels, but we discourage adding ad hoc labels because we want to control the proliferation of labels and avoid single-use labels. If you would like a new label added, please open an issue asking to create an issue label and add the ``infrastructure`` diff --git a/llvm/docs/BuildingADistribution.rst b/llvm/docs/BuildingADistribution.rst index 10e571cdea3f9..81ed8b8723a26 100644 --- a/llvm/docs/BuildingADistribution.rst +++ b/llvm/docs/BuildingADistribution.rst @@ -13,8 +13,8 @@ combination of its sub-project tools for distribution. This document covers useful features of the LLVM build system as well as best practices and general information about packaging LLVM. 
-If you are new to CMake you may find the :doc:`CMake` or :doc:`CMakePrimer` -documentation useful. Some of the things covered in this document are the inner +If you are new to CMake, you may find the :doc:`CMake` or :doc:`CMakePrimer` +documentation useful. This document covers some of the inner workings of the builds described in the :doc:`AdvancedBuilds` document. General Distribution Guidance @@ -27,7 +27,7 @@ compiler. This is done so that the compiler you distribute benefits from all the bug fixes, performance optimizations and general improvements provided by the new compiler. -In deciding how to build your distribution there are a few trade-offs that you +In deciding how to build your distribution, there are a few trade-offs that you will need to evaluate. The big two are: #. Compile time of the distribution against performance of the built compiler @@ -41,8 +41,8 @@ opportunity for the compiler to optimize. The guidance for minimizing distribution size is to dynamically link LLVM and Clang libraries into the tools to reduce code duplication. This will come at a -substantial performance penalty to the generated binary both because it reduces -optimization opportunity, and because dynamic linking requires resolving symbols +substantial performance penalty to the generated binary, both because it reduces +optimization opportunities and because dynamic linking requires resolving symbols at process launch time, which can be very slow for C++ code. .. _shared_libs: @@ -76,7 +76,7 @@ LLVM testing tools. Alternatively the ``install-distribution`` target, which is recommended for building distributions, only installs specific parts of LLVM as specified at configuration time by *LLVM_DISTRIBUTION_COMPONENTS*. -Additionally by default the ``install`` target will install the LLVM testing +Additionally, by default, the ``install`` target will install the LLVM testing tools as the public tools. This can be changed well by setting *LLVM_INSTALL_TOOLCHAIN_ONLY* to ``On``. 
The LLVM tools are intended for development and testing of LLVM, and should only be included in distributions diff --git a/llvm/docs/CommandGuide/lit.rst b/llvm/docs/CommandGuide/lit.rst index 359e0c3e81d0e..6a721ebf9cad0 100644 --- a/llvm/docs/CommandGuide/lit.rst +++ b/llvm/docs/CommandGuide/lit.rst @@ -630,13 +630,11 @@ TestRunner.py: %{fs-sep} file system path separator %t temporary file name unique to the test %basename_t The last path component of %t but without the ``.tmp`` extension (deprecated, use ``%{t:stem}`` instead) - %T parent directory of %t (not unique, deprecated, do not use) %% % %/s %s but ``\`` is replaced by ``/`` %/S %S but ``\`` is replaced by ``/`` %/p %p but ``\`` is replaced by ``/`` %/t %t but ``\`` is replaced by ``/`` - %/T %T but ``\`` is replaced by ``/`` %{s:basename} The last path component of %s %{t:stem} The last path component of %t but without the ``.tmp`` extension (alias for %basename_t) %{s:real} %s after expanding all symbolic links and substitute drives @@ -648,12 +646,10 @@ TestRunner.py: %{/S:real} %/S after expanding all symbolic links and substitute drives %{/p:real} %/p after expanding all symbolic links and substitute drives %{/t:real} %/t after expanding all symbolic links and substitute drives - %{/T:real} %/T after expanding all symbolic links and substitute drives %{/s:regex_replacement} %/s but escaped for use in the replacement of a ``s@@@`` command in sed %{/S:regex_replacement} %/S but escaped for use in the replacement of a ``s@@@`` command in sed %{/p:regex_replacement} %/p but escaped for use in the replacement of a ``s@@@`` command in sed %{/t:regex_replacement} %/t but escaped for use in the replacement of a ``s@@@`` command in sed - %{/T:regex_replacement} %/T but escaped for use in the replacement of a ``s@@@`` command in sed %:s On Windows, %/s but a ``:`` is removed if its the second character. Otherwise, %s but with a single leading ``/`` removed. %:S On Windows, %/S but a ``:`` is removed if its the second character. @@ -662,8 +658,6 @@ TestRunner.py: Otherwise, %p but with a single leading ``/`` removed. %:t On Windows, %/t but a ``:`` is removed if its the second character. Otherwise, %t but with a single leading ``/`` removed. - %:T On Windows, %/T but a ``:`` is removed if its the second character. - Otherwise, %T but with a single leading ``/`` removed. %{readfile:} Reads the file specified. ======================= ============== diff --git a/llvm/docs/Coroutines.rst b/llvm/docs/Coroutines.rst index dde73c9c3cc23..13d2da42eaca7 100644 --- a/llvm/docs/Coroutines.rst +++ b/llvm/docs/Coroutines.rst @@ -303,7 +303,7 @@ The LLVM IR for this coroutine looks like this: call void @free(ptr %mem) br label %suspend suspend: - %unused = call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret ptr %hdl } @@ -637,7 +637,7 @@ store the current value produced by a coroutine. call void @free(ptr %mem) br label %suspend suspend: - %unused = call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret ptr %hdl } @@ -806,7 +806,7 @@ The LLVM IR for a coroutine using a Coroutine with a custom ABI looks like: call void @free(ptr %mem) br label %suspend suspend: - %unused = call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret ptr %hdl } @@ -1444,7 +1444,7 @@ A frontend should emit function attribute `presplitcoroutine` for the coroutine. 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :: - declare i1 @llvm.coro.end(ptr , i1 , token ) + declare void @llvm.coro.end(ptr , i1 , token ) Overview: """"""""" @@ -1502,8 +1502,9 @@ For landingpad based exception model, it is expected that frontend uses the .. code-block:: llvm ehcleanup: - %InResumePart = call i1 @llvm.coro.end(ptr null, i1 true, token none) - br i1 %InResumePart, label %eh.resume, label %cleanup.cont + call void @llvm.coro.end(ptr null, i1 true, token none) + %InRamp = call i1 @llvm.coro.is_in_ramp() + br i1 %InRamp, label %cleanup.cont, label %eh.resume cleanup.cont: ; rest of the cleanup @@ -1515,10 +1516,10 @@ For landingpad based exception model, it is expected that frontend uses the %lpad.val29 = insertvalue { ptr, i32 } %lpad.val, i32 %sel, 1 resume { ptr, i32 } %lpad.val29 -The `CoroSpit` pass replaces `coro.end` with ``True`` in the resume functions, -thus leading to immediate unwind to the caller, whereas in start function it -is replaced with ``False``, thus allowing to proceed to the rest of the cleanup -code that is only needed during initial invocation of the coroutine. +The `CoroSplit` pass replaces `coro.is_in_ramp` with ``True`` in the ramp functions, +thus allowing execution to proceed to the rest of the cleanup code that is only needed during +initial invocation of the coroutine. Otherwise, it is replaced with ``False``, +thus leading to immediate unwind to the caller. For Windows Exception handling model, a frontend should attach a funclet bundle referring to an enclosing cleanuppad as follows: @@ -1527,7 +1528,7 @@ referring to an enclosing cleanuppad as follows: ehcleanup: %tok = cleanuppad within none [] - %unused = call i1 @llvm.coro.end(ptr null, i1 true, token none) [ "funclet"(token %tok) ] + call void @llvm.coro.end(ptr null, i1 true, token none) [ "funclet"(token %tok) ] cleanupret from %tok unwind label %RestOfTheCleanup The `CoroSplit` pass, if the funclet bundle is present, will insert @@ -1592,7 +1593,7 @@ The number of arguments must match the return type of the continuation function: cleanup: %tok = call token (...) @llvm.coro.end.results(i8 %val) - call i1 @llvm.coro.end(ptr %hdl, i1 0, token %tok) + call void @llvm.coro.end(ptr %hdl, i1 0, token %tok) unreachable ... @@ -1604,7 +1605,7 @@ The number of arguments must match the return type of the continuation function: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :: - declare i1 @llvm.coro.end.async(ptr , i1 , ...) + declare void @llvm.coro.end.async(ptr , i1 , ...) Overview: """"""""" @@ -1635,10 +1636,10 @@ the function call. .. code-block:: llvm - call i1 (ptr, i1, ...) @llvm.coro.end.async( - ptr %hdl, i1 0, - ptr @must_tail_call_return, - ptr %ctxt, ptr %task, ptr %actor) + call void (ptr, i1, ...) @llvm.coro.end.async( + ptr %hdl, i1 0, + ptr @must_tail_call_return, + ptr %ctxt, ptr %task, ptr %actor) unreachable .. _coro.suspend: @@ -2117,6 +2118,30 @@ Example: %hdl.result = ... ; get address of returned coroutine handle ret ptr %hdl.result +'llvm.coro.is_in_ramp' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:: + + declare i1 @llvm.coro.is_in_ramp() + +Overview: +""""""""" + +The '``llvm.coro.is_in_ramp``' intrinsic returns a boolean value that distinguishes the +coroutine ramp function from the resume/destroy functions. + +Arguments: +"""""""""" + +None + +Semantics: +"""""""""" + +The `CoroSplit` pass replaces `coro.is_in_ramp` with ``True`` in ramp functions. +Otherwise, it is replaced with ``False``, allowing the frontend to separate +code that is specific to the ramp function from code in the resume/destroy functions.
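As a rough sketch of how a frontend might use this intrinsic in a shared cleanup path (the block labels and the ``@ramp_only_cleanup`` helper are hypothetical and chosen only for illustration; ``%hdl`` stands for the coroutine handle produced by ``llvm.coro.begin`` as in the earlier examples):

.. code-block:: llvm

  cleanup:
    ; Placeholder that CoroSplit later folds to true in the ramp function
    ; and to false in the resume/destroy clones.
    %in.ramp = call i1 @llvm.coro.is_in_ramp()
    br i1 %in.ramp, label %ramp.cleanup, label %shared.cleanup

  ramp.cleanup:
    ; Work that is only meaningful during the initial invocation of the
    ; coroutine, e.g. releasing resources acquired before the first suspend.
    call void @ramp_only_cleanup(ptr %hdl)
    br label %shared.cleanup

  shared.cleanup:
    ; Cleanup shared by the ramp and the resume/destroy functions.
    br label %suspend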
+ Coroutine Transformation Passes =============================== CoroEarly diff --git a/llvm/docs/Docker.rst b/llvm/docs/Docker.rst index 5d976eddb3130..5f8e619d8b5eb 100644 --- a/llvm/docs/Docker.rst +++ b/llvm/docs/Docker.rst @@ -27,8 +27,8 @@ to get a very basic explanation of it. `Docker `_ is a popular solution for running programs in an isolated and reproducible environment, especially to maintain releases for software deployed to large distributed fleets. -It uses linux kernel namespaces and cgroups to provide a lightweight isolation -inside currently running linux kernel. +It uses Linux kernel namespaces and cgroups to provide a lightweight isolation +inside currently running Linux kernel. A single active instance of dockerized environment is called a *docker container*. A snapshot of a docker container filesystem is called a *docker image*. @@ -127,17 +127,17 @@ Which image should I choose? We currently provide two images: Debian12-based and nvidia-cuda-based. They differ in the base image that they use, i.e. they have a different set of preinstalled binaries. Debian8 is very minimal, nvidia-cuda is larger, but has -preinstalled CUDA libraries and allows to access a GPU, installed on your +preinstalled CUDA libraries and allows access to a GPU, installed on your machine. -If you need a minimal linux distribution with only clang and libstdc++ included, +If you need a minimal Linux distribution with only clang and libstdc++ included, you should try Debian12-based image. If you want to use CUDA libraries and have access to a GPU on your machine, you should choose nvidia-cuda-based image and use `nvidia-docker `_ to run your docker containers. Note that you don't need nvidia-docker to build the images, but you need it in order -to have an access to GPU from a docker container that is running the built +to have access to a GPU from a docker container that is running the built image. If you have a different use-case, you could create your own image based on @@ -176,4 +176,4 @@ The first image is only used during build and does not have a descriptive name, i.e. it is only accessible via the hash value after the build is finished. The second image is our resulting image. It contains only the built binaries and not any build dependencies. It is also accessible via a descriptive name -(specified by -d and -t flags). +(specified by ``-d`` and ``-t`` flags). diff --git a/llvm/docs/ExtendingLLVM.rst b/llvm/docs/ExtendingLLVM.rst index 50f0af3fafc4c..019fdf5fc3278 100644 --- a/llvm/docs/ExtendingLLVM.rst +++ b/llvm/docs/ExtendingLLVM.rst @@ -13,7 +13,7 @@ function, or a whole new instruction. When you come to this realization, stop and think. Do you really need to extend LLVM? Is it a new fundamental capability that LLVM does not support at its -current incarnation or can it be synthesized from already pre-existing LLVM +current incarnation or can it be synthesized from existing LLVM elements? If you are not sure, ask on the `LLVM forums `_. The reason is that extending LLVM will get involved as you need to update all the different passes @@ -27,7 +27,7 @@ method of choice for LLVM extension. Before you invest a significant amount of effort into a non-trivial extension, **ask on the list** if what you are looking to do can be done with -already-existing infrastructure, or if maybe someone else is already working on +existing infrastructure, or if maybe someone else is already working on it. You will save yourself a lot of time and effort by doing so. .. 
_intrinsic function: @@ -57,12 +57,12 @@ function and then be turned into an instruction if warranted. #. ``llvm/lib/Analysis/ConstantFolding.cpp``: - If it is possible to constant fold your intrinsic, add support to it in the + If it is possible to constant fold your intrinsic, add support for it in the ``canConstantFoldCallTo`` and ``ConstantFoldCall`` functions. #. ``llvm/test/*``: - Add test cases for your test cases to the test suite + Add test cases for your intrinsic to the test suite Once the intrinsic has been added to the system, you must add code generator support for it. Generally you must do the following steps: @@ -72,7 +72,7 @@ Add support to the .td file for the target(s) of your choice in This is usually a matter of adding a pattern to the .td file that matches the intrinsic, though it may obviously require adding the instructions you want to - generate as well. There are lots of examples in the PowerPC and X86 backend + generate as well. There are lots of examples in the PowerPC and X86 backends to follow. Adding a new SelectionDAG node @@ -194,7 +194,7 @@ Adding a new instruction #. ``llvm/lib/AsmParser/LLLexer.cpp``: - add a new token to parse your instruction from assembly text file + add a new token to parse your instruction from an assembly text file #. ``llvm/lib/AsmParser/LLParser.cpp``: @@ -207,7 +207,7 @@ Adding a new instruction #. ``llvm/lib/Bitcode/Writer/BitcodeWriter.cpp``: - add a case for your instruction and how it will be parsed from bitcode + add a case for your instruction and how it will be written to bitcode #. ``llvm/lib/IR/Instruction.cpp``: @@ -236,7 +236,7 @@ Adding a new type .. warning:: Adding new types changes the bitcode format, and will break compatibility with - currently-existing LLVM installations. Only add new types if it is absolutely + existing LLVM installations. Only add new types if it is absolutely necessary. Adding a fundamental type @@ -284,17 +284,17 @@ Adding a derived type #. ``llvm/include/llvm/IR/DerivedTypes.h``: - add new class to represent new class in the hierarchy; add forward + add a new class to represent the new class in the hierarchy; add forward declaration to the TypeMap value type #. ``llvm/lib/IR/Type.cpp`` and ``llvm/lib/CodeGen/ValueTypes.cpp``: - add support for derived type, notably `enum TypeID` and `is`, `get` methods. + add support for derived type, notably ``enum TypeID`` and ``is``, ``get`` methods. #. ``llvm/include/llvm-c/Core.h`` and ``llvm/lib/IR/Core.cpp``: add enum ``LLVMTypeKind`` and modify - `LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty)` for the new type + ``LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty)`` for the new type #. ``llvm/lib/AsmParser/LLLexer.cpp``: diff --git a/llvm/docs/FuzzingLLVM.rst b/llvm/docs/FuzzingLLVM.rst index a0355d7014c8d..76eb4288a1f2c 100644 --- a/llvm/docs/FuzzingLLVM.rst +++ b/llvm/docs/FuzzingLLVM.rst @@ -33,7 +33,7 @@ clang-proto-fuzzer A |protobuf fuzzer| that compiles valid C++ programs generated from a protobuf class that describes a subset of the C++ language. -This fuzzer accepts clang command line options after `ignore_remaining_args=1`. +This fuzzer accepts clang command-line options after `ignore_remaining_args=1`. For example, the following command will fuzz clang with a higher optimization level: @@ -106,7 +106,7 @@ llvm-opt-fuzzer A |LLVM IR fuzzer| aimed at finding bugs in optimization passes. -It receives optimization pipeline and runs it for each fuzzer input. +It receives an optimization pipeline and runs it for each fuzzer input. 
Interface of this fuzzer almost directly mirrors ``llvm-isel-fuzzer``. Both ``mtriple`` and ``passes`` arguments are required. Passes are specified in a @@ -117,7 +117,7 @@ this format in the doxygen for ``PassBuilder::parsePassPipeline``. % bin/llvm-opt-fuzzer -ignore_remaining_args=1 -mtriple x86_64 -passes instcombine -Similarly to the ``llvm-isel-fuzzer`` arguments in some predefined configurations +Similarly to the ``llvm-isel-fuzzer``, arguments in some predefined configurations might be embedded directly into the binary file name: .. code-block:: shell @@ -176,7 +176,7 @@ mutations that a fuzzer in LLVM might want. Generic Random Fuzzing ---------------------- -The most basic form of input mutation is to use the built in mutators of +The most basic form of input mutation is to use the built-in mutators of LibFuzzer. These simply treat the input corpus as a bag of bits and make random mutations. This type of fuzzer is good for stressing the surface layers of a program, and is good at testing things like lexers, parsers, or binary @@ -244,7 +244,7 @@ by adding the following two flags to your CMake invocation: to avoid building the sanitizers themselves with sanitizers enabled. .. note:: You may run into issues if you build with BFD ld, which is the - default linker on many unix systems. These issues are being tracked + default linker on many Unix systems. These issues are being tracked in https://llvm.org/PR34636. Continuously Running and Finding Bugs @@ -280,6 +280,6 @@ your fuzzer can be built and tested when not built against libFuzzer. There is also some handling of the CMake config for fuzzers, where you should use the ``add_llvm_fuzzer`` to set up fuzzer targets. This function works -similarly to functions such as ``add_llvm_tool``, but they take care of linking +similarly to functions such as ``add_llvm_tool``, but it takes care of linking to LibFuzzer when appropriate and can be passed the ``DUMMY_MAIN`` argument to enable standalone testing. diff --git a/llvm/docs/GetElementPtr.rst b/llvm/docs/GetElementPtr.rst index 25a24c6403091..6831a8e6e81eb 100644 --- a/llvm/docs/GetElementPtr.rst +++ b/llvm/docs/GetElementPtr.rst @@ -10,8 +10,8 @@ Introduction This document seeks to dispel the mystery and confusion surrounding LLVM's `GetElementPtr `_ (GEP) instruction. -Questions about the wily GEP instruction are probably the most frequently -occurring questions once a developer gets down to coding with LLVM. Here we lay +Questions about the wily GEP instruction are probably the most frequent +questions once a developer gets down to coding with LLVM. Here we lay out the sources of confusion and show that the GEP instruction is really quite simple. @@ -30,8 +30,8 @@ What is the first index of the GEP instruction? Quick answer: The index stepping through the second operand. The confusion with the first index usually arises from thinking about the -GetElementPtr instruction as if it was a C index operator. They aren't the -same. For example, when we write, in "C": +GetElementPtr instruction as if it were a C index operator. They aren't the +same. For example, when we write, in C: .. code-block:: c++ @@ -62,7 +62,7 @@ The answer is simply because memory does not have to be accessed to perform the computation. The second operand to the GEP instruction must be a value of a pointer type. The value of the pointer is provided directly to the GEP instruction as an operand without any need for accessing memory. It must, -therefore be indexed and requires an index operand. 
Consider this example: +therefore, be indexed and requires an index operand. Consider this example: .. code-block:: c++ @@ -285,7 +285,7 @@ I'm writing a backend for a target which needs custom lowering for GEP. How do I ----------------------------------------------------------------------------------------- You don't. The integer computation implied by a GEP is target-independent. -Typically what you'll need to do is make your backend pattern-match expressions +Typically what you'll need to do is make your backend pattern-match expression trees involving ADD, MUL, etc., which are what GEP is lowered into. This has the advantage of letting your code work correctly in more cases. @@ -377,7 +377,7 @@ the underlying object. Furthermore, loads and stores don't have to use the same types as the type of the underlying object. Types in this context serve only to specify memory size -and alignment. Beyond that there are merely a hint to the optimizer indicating +and alignment. Beyond that they are merely a hint to the optimizer indicating how the value will likely be used. Can I cast an object's address to integer and add it to null? @@ -506,7 +506,7 @@ sufficient to preserve the pointer aliasing guarantees that GEP provides. Summary ======= -In summary, here's some things to always remember about the GetElementPtr +In summary, here are some things to always remember about the GetElementPtr instruction: diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index d87a8bd81cc7b..72716fa667487 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -208,7 +208,7 @@ what to add to your calendar invite. - 3rd Tuesday of the month - `ics `__ - `Meeting details/agenda: `__ - * - LLVM Qualification Working Group + * - `LLVM Qualification Working Group `__ - 1st Tuesday/Wednesday of the month - `ics `__ `gcal `__ diff --git a/llvm/docs/GlobalISel/InstructionSelect.rst b/llvm/docs/GlobalISel/InstructionSelect.rst index 9798ae7a596ca..5513824cf190c 100644 --- a/llvm/docs/GlobalISel/InstructionSelect.rst +++ b/llvm/docs/GlobalISel/InstructionSelect.rst @@ -5,8 +5,22 @@ InstructionSelect ----------------- This pass transforms generic machine instructions into equivalent -target-specific instructions. It traverses the ``MachineFunction`` bottom-up, -selecting uses before definitions, enabling trivial dead code elimination. +target-specific instructions. + +The legacy instruction selector, SelectionDAG, iterated over each function's +basic block and constructed a dataflow graph. Every backend defines +tree patterns in the ``XXXInstrInfo.td``. The legacy selector started +at the bottom and replaced the SDNodes greedily. + +The GlobalISel's instruction selector traverses the ``MachineFunction`` +bottom-up, selecting uses before definitions, enabling trivial dead code +elimination. It does that by iterating over the basic blocks in post-order. +Each gMIR instruction is then replaced by a MIR instruction when a matching +pattern is found. So, when there is a 1:1 mapping between gMIR and MIR, where +is the benefit of the global scope? Even in the case of a 1:1 mapping, +GlobalISel includes a combiner that can match and fuse multiple gMIR +instructions. The scope of the combination is not limited to a basic block, +but can extend across the entire function. .. 
_api-instructionselector: diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index e6713c827d6ab..8e863939781a2 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -660,19 +660,60 @@ Non-Integral Pointer Type Note: non-integral pointer types are a work in progress, and they should be considered experimental at this time. -LLVM IR optionally allows the frontend to denote pointers in certain address -spaces as "non-integral" via the :ref:`datalayout string`. -Non-integral pointer types represent pointers that have an *unspecified* bitwise -representation; that is, the integral representation may be target dependent or -unstable (not backed by a fixed integer). +For most targets, the pointer representation is a direct mapping from the +bitwise representation to the address of the underlying memory location. +Such pointers are considered "integral", and any pointers where the +representation is not just an integer address are called "non-integral". + +Non-integral pointers have at least one of the following three properties: + +* the pointer representation contains non-address bits +* the pointer representation is unstable (may change at any time in a + target-specific way) +* the pointer representation has external state + +These properties (or combinations thereof) can be applied to pointers via the +:ref:`datalayout string`. + +The exact implications of these properties are target-specific. The following +subsections describe the IR semantics and restrictions on optimization passes +for each of these properties. + +Pointers with non-address bits +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pointers in this address space have a bitwise representation that not only +has address bits, but also some other target-specific metadata. +In most cases pointers with non-address bits behave exactly the same as +integral pointers; the only difference is that it is not possible to create a +pointer just from an address unless all the non-address bits are also recreated +correctly in a target-specific way. + +An example of pointers with non-address bits are the AMDGPU buffer descriptors, +which are 160 bits: a 128-bit fat pointer and a 32-bit offset. +Similarly, CHERI capabilities contain a 32- or 64-bit address as well as the +same number of metadata bits, but unlike the AMDGPU buffer descriptors they have +external state in addition to non-address bits. + + +Unstable pointer representation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pointers in this address space have an *unspecified* bitwise representation +(i.e. not backed by a fixed integer). The bitwise pattern of such pointers is +allowed to change in a target-specific way. For example, this could be a pointer +type used with copying garbage collection where the garbage collector could +update the pointer at any time in the collection sweep. ``inttoptr`` and ``ptrtoint`` instructions have the same semantics as for integral (i.e., normal) pointers in that they convert integers to and from -corresponding pointer types, but there are additional implications to be -aware of. Because the bit-representation of a non-integral pointer may -not be stable, two identical casts of the same operand may or may not +corresponding pointer types, but there are additional implications to be aware +of. + +For "unstable" pointer representations, the bit-representation of the pointer +may not be stable, so two identical casts of the same operand may or may not return the same value.
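+
+For example, the following is a minimal sketch (illustrative only; it assumes
+that address space 1 has been marked as having an unstable representation via
+the datalayout) of a fold that is not sound for such pointers:
+
+.. code-block:: llvm
+
+  define i64 @unstable_roundtrip(ptr addrspace(1) %p) {
+    ; Two identical casts of the same "unstable" pointer value.
+    %a = ptrtoint ptr addrspace(1) %p to i64
+    %b = ptrtoint ptr addrspace(1) %p to i64
+    ; %a and %b are not guaranteed to be equal, so this subtraction cannot,
+    ; in general, be folded to 0.
+    %diff = sub i64 %a, %b
+    ret i64 %diff
+  }
+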
Said differently, the conversion to or from the -non-integral type depends on environmental state in an implementation +"unstable" pointer type depends on environmental state in an implementation defined manner. If the frontend wishes to observe a *particular* value following a cast, the @@ -681,21 +722,72 @@ defined manner. (In practice, this tends to require ``noinline`` routines for such operations.) From the perspective of the optimizer, ``inttoptr`` and ``ptrtoint`` for -non-integral types are analogous to ones on integral types with one +"unstable" pointer types are analogous to ones on integral types with one key exception: the optimizer may not, in general, insert new dynamic occurrences of such casts. If a new cast is inserted, the optimizer would need to either ensure that a) all possible values are valid, or b) appropriate fencing is inserted. Since the appropriate fencing is implementation defined, the optimizer can't do the latter. The former is challenging as many commonly expected properties, such as -``ptrtoint(v)-ptrtoint(v) == 0``, don't hold for non-integral types. +``ptrtoint(v)-ptrtoint(v) == 0``, don't hold for "unstable" pointer types. Similar restrictions apply to intrinsics that might examine the pointer bits, such as :ref:`llvm.ptrmask`. -The alignment information provided by the frontend for a non-integral pointer +The alignment information provided by the frontend for an "unstable" pointer (typically using attributes or metadata) must be valid for every possible representation of the pointer. +Pointers with external state +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A further special case of non-integral pointers is ones that include external +state (such as bounds information or a type tag) with a target-defined size. +An example of such a type is a CHERI capability, where there is an additional +validity bit that is part of all pointer-typed registers, but is located in +memory at an implementation-defined address separate from the pointer itself. +Another example would be a fat-pointer scheme where pointers remain plain +integers, but the associated bounds are stored in an out-of-band table. + +Unless also marked as "unstable", the bit-wise representation of pointers with +external state is stable and ``ptrtoint(x)`` always yields a deterministic +value. This means transformation passes are still permitted to insert new +``ptrtoint`` instructions. + +The following restrictions apply to IR level optimization passes: + +The ``inttoptr`` instruction does not recreate the external state and therefore +it is target dependent whether it can be used to create a dereferenceable +pointer. In general passes should assume that the result of such an inttoptr +is not dereferenceable. For example, on CHERI targets an ``inttoptr`` will +yield a capability with the external state (the validity tag bit) set to zero, +which will cause any dereference to trap. +The ``ptrtoint`` instruction also only returns the "in-band" state and omits +all external state. + +When a ``store ptr addrspace(N) %p, ptr @dst`` of such a non-integral pointer +is performed, the external metadata is also stored to an implementation-defined +location. Similarly, a ``%val = load ptr addrspace(N), ptr @dst`` will fetch the +external metadata and make it available for all uses of ``%val``. +Similarly, the ``llvm.memcpy`` and ``llvm.memmove`` intrinsics also transfer the +external state. 
This is essential to allow frontends to efficiently emit copies +of structures containing such pointers, since expanding all these copies as +individual loads and stores would affect compilation speed and inhibit +optimizations. + +Notionally, these external bits are part of the pointer, but since +``inttoptr`` / ``ptrtoint``` only operate on the "in-band" bits of the pointer +and the external bits are not explicitly exposed, they are not included in the +size specified in the :ref:`datalayout string`. + +When a pointer type has external state, all roundtrips via memory must +be performed as loads and stores of the correct type since stores of other +types may not propagate the external data. +Therefore it is not legal to convert an existing load/store (or a +``llvm.memcpy`` / ``llvm.memmove`` intrinsic) of pointer types with external +state to a load/store of an integer type with same bitwidth, as that may drop +the external state. + + .. _globalvars: Global Variables @@ -2921,6 +3013,8 @@ assumptions, such as that a :ref:`parameter attribute ` or a location. Operand bundles enable assumptions that are either hard or impossible to represent as a boolean argument of an :ref:`llvm.assume `. +Assumes with operand bundles must have ``i1 true`` as the condition operand. + An assume operand bundle has the form: :: @@ -2953,7 +3047,7 @@ allows the optimizer to assume that at location of call to .. code-block:: llvm - call void @llvm.assume(i1 %cond) ["cold"(), "nonnull"(ptr %val)] + call void @llvm.assume(i1 true) ["cold"(), "nonnull"(ptr %val)] allows the optimizer to assume that the :ref:`llvm.assume ` call location is cold and that ``%val`` may not be null. @@ -3179,8 +3273,8 @@ as follows: ``A
`` Specifies the address space of objects created by '``alloca``'. Defaults to the default address space of 0. -``p[n]::[:[:]]`` - This specifies the properties of a pointer in address space ``n``. +``p[][]::[:[:]]`` + This specifies the properties of a pointer in address space ``as``. The ```` parameter specifies the size of the bitwise representation. For :ref:`non-integral pointers ` the representation size may be larger than the address width of the underlying address space (e.g. to @@ -3193,9 +3287,13 @@ as follows: default index size is equal to the pointer size. The index size also specifies the width of addresses in this address space. All sizes are in bits. - The address space, ``n``, is optional, and if not specified, - denotes the default address space 0. The value of ``n`` must be - in the range [1,2^24). + The address space, ````, is optional, and if not specified, denotes the + default address space 0. The value of ```` must be in the range [1,2^24). + The optional ```` are used to specify properties of pointers in this + address space: the character ``u`` marks pointers as having an unstable + representation, and ``e`` marks pointers having external state. See + :ref:`Non-Integral Pointer Types `. + ``i:[:]`` This specifies the alignment for an integer type of a given bit ````. The value of ```` must be in the range [1,2^24). @@ -3248,9 +3346,11 @@ as follows: this set are considered to support most general arithmetic operations efficiently. ``ni:
:
:
...`` - This specifies pointer types with the specified address spaces - as :ref:`Non-Integral Pointer Type ` s. The ``0`` - address space cannot be specified as non-integral. + This marks pointer types with the specified address spaces + as :ref:`unstable `. + The ``0`` address space cannot be specified as non-integral. + It is only supported for backwards compatibility; the flags of the ``p`` + specifier should be used instead for new code. ```` is a lower bound on what is required for a type to be considered aligned. This is used in various places, such as: @@ -8764,6 +8864,28 @@ For example, the following metadata section contains two library specifiers:: Each library specifier will be handled independently by the consuming linker. The effect of the library specifiers are defined by the consuming linker. +'``llvm.errno.tbaa``' Named Metadata +==================================== + +The module-level ``!llvm.errno.tbaa`` metadata specifies the TBAA nodes used +for accessing ``errno``. These nodes are guaranteed to represent int-compatible +accesses according to C/C++ strict aliasing rules. This should let LLVM alias +analyses reason about aliasing with ``errno`` when calling library functions +that may set ``errno``, allowing optimizations such as store-to-load forwarding +across such routines. + +For example, the following is a valid metadata specifying the TBAA information +for an integer access: + + !llvm.errno.tbaa = !{!0} + !0 = !{!1, !1, i64 0} + !1 = !{!"int", !2, i64 0} + !2 = !{!"omnipotent char", !3, i64 0} + !3 = !{!"Simple C/C++ TBAA"} + +Multiple TBAA operands are allowed to support merging of modules that may use +different TBAA hierarchies (e.g., when mixing C and C++). + .. _summary: ThinLTO Summary @@ -11299,11 +11421,9 @@ responsibility of the code emitter to ensure that the alignment information is correct. Overestimating the alignment results in undefined behavior. Underestimating the alignment may produce less efficient code. An alignment of 1 is always safe. The maximum possible alignment is ``1 << 32``. An alignment -value higher than the size of the loaded type implies memory up to the -alignment value bytes can be safely loaded without trapping in the default -address space. Access of the high bytes can interfere with debugging tools, so -should not be accessed if the function has the ``sanitize_thread`` or -``sanitize_address`` attributes. +value higher than the size of the loaded type does *not* imply (without target +specific knowledge) that memory up to the alignment value bytes can be safely +loaded without trapping. The alignment is only optional when parsing textual IR; for in-memory IR, it is always present. An omitted ``align`` argument means that the operation has the @@ -11439,12 +11559,10 @@ operation (that is, the alignment of the memory address). It is the responsibility of the code emitter to ensure that the alignment information is correct. Overestimating the alignment results in undefined behavior. Underestimating the alignment may produce less efficient code. An alignment of -1 is always safe.
The maximum possible alignment is ``1 << 32``. An alignment +value higher than the size of the stored type does *not* imply (without target +specific knowledge) that memory up to the alignment value bytes can be safely +loaded without trapping. The alignment is only optional when parsing textual IR; for in-memory IR, it is always present. An omitted ``align`` argument means that the operation has the @@ -24468,7 +24586,7 @@ Overview: The '``llvm.vp.load.ff.*``' intrinsic is similar to '``llvm.vp.load.*``', but will not trap if there are not ``evl`` readable -lanes at the pointer. '``ff``' stands for fault-first or fault-only-first. +lanes at the pointer. '``ff``' stands for first-fault or fault-only-first. Arguments: """""""""" @@ -31402,4 +31520,3 @@ Semantics: The '``llvm.preserve.struct.access.index``' intrinsic produces the same result as a getelementptr with base ``base`` and access operands ``{0, gep_index}``. - diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 4c8c605edfdd6..e8dceb836f98a 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -1971,6 +1971,464 @@ The last argument `i1 %unpack` is a compile-time constant which when set, indica For more information, refer to the `PTX ISA `__. +tcgen05.mma Intrinsics +---------------------- + +Overview +^^^^^^^^ + +`tcgen05.mma` operation of shape `M x N x K` perform matrix multiplication and +accumulation of the form: `D = A * B + D` where: + + - the `A` matrix has shape `M x K`, in either `Tensor Memory` or `Shared Memory` + - the `B` matrix has shape `K x N`, in `Shared Memory` of the current CTA and, optionally in peer CTA + - the `D` matrix is of the shape `M x N`, in `Tensor Memory` + +Optionally an input predicate can be used to disable the input (`%enable_inp_d`) +from the accumulator matrix and the following operation can be performed as `D = A * B` + +The matrix multiplication and accumulation operations are categorized into various +kinds based on input types and the throughput of the multiplication operation. +The following table shows the different kinds of MMA operations that are supported: + ++------------+--------------------------------------------+ +| .kind | Supported Input Types | ++============+============================================+ +| f16 | F16 and BF16 | ++------------+--------------------------------------------+ +| tf32 | TF32 | ++------------+--------------------------------------------+ +| f8f6f4 | All combinations of F8, F6, and F4 | ++------------+--------------------------------------------+ +| i8 | Signed and Unsigned 8-bit Integers | ++------------+--------------------------------------------+ +| mxf8f6f4 | MX-floating point formats | ++------------+--------------------------------------------+ +| mxf4 | MX-floating point formats (FP4) | ++------------+--------------------------------------------+ +| mxf4nvf4 | MXF4 + custom NVIDIA 4-bit floating point | +| | (with common scaling factor) | ++------------+--------------------------------------------+ + +`tcgen05.mma.sp` supports sparse variant of `A` with shape `M x K` stored in packed +form as `M X (K / 2)` in memory. The `%spmetadata` specifies the mapping of the +`K / 2` non-zero elements to the `K` elements before performing the MMA operation. + +`tcgen05.mma.block_scale` perform matrix multiplication with block scaling +`D = (A * scale_A) * (B * scale_B) + D` where scaling of input matrices from +memory to form the matrix `A` and matrix `B` before performing the MMA operation. 
+Scale factors for `A` and `B` matrices need to be duplicated to all 32 lane partitions +of tensor memory. The shape of `%scale_a` and `%scale_b` matrices depend on the +`.scale_vectorsize` described in `here `__ + +The sparsity metadata (`%spmetadata`) as well as the block-scale inputs for `A / B` +matrices (`%scale_a` and `%scale_b`) reside in Tensor Memory. + +To facilitate opportunistic re-use of `A / B` matrix data across a sequence of MMA +operations, the `A/B` matrices are loaded into a collector buffer +(`%collector_usage_a_op_flag`, `%collector_usage_b_buffer_flag`, and `%collector_usage_b_op_flag`). +The flag value of the collector_usage flag in the intrinsic specifies the nature of the re-use + +There are three kinds of matrix descriptors used by the tcgen05 family of instructions: + ++----------------------------+-----------------------------------------------------------------------------------------------------------+-------------+ +| Descriptor | Description | Size (bits) | ++============================+===========================================================================================================+=============+ +| Shared Memory Descriptor | Describes properties of multiplicand matrix | | +| | in shared memory, including its location | | +| | within the CTA's shared memory. | 64 | +| | `PTX ISA `__ | | ++----------------------------+-----------------------------------------------+-------------+---------------------------------------------+-------------+ +| Instruction Descriptor | Describes shapes, types, and details of | | +| | all matrices and the MMA operation. | 32 | +| | `PTX ISA `__ | | ++----------------------------+-----------------------------------------------+-------------+---------------------------------------------+-------------+ +| Zero-Column Mask Descriptor| Generates a mask specifying which columns of | | +| | B matrix are zeroed in the MMA operation, | | +| | regardless of values in shared memory. | 64 | +| | Total mask size = N bits | | +| | `PTX ISA `__ | | ++----------------------------+-----------------------------------------------+-------------+---------------------------------------------+-------------+ + +`tcgen05.mma` can be used for general matrix multiplication or for convolution operations. +In case of convolutions, the `activations` can be stored in either matrix `A` or matrix `B` +while the `weights` will be stored in the other matrix + +`tcgen05.mma` has an optional collector qualifier to specify when an `A` or `B` matrix +is new to the sequence and should be loaded, unchanged within the sequence and, +should be reused, or the last use in the sequence and should be discarded. +The collector qualifier is used to give the TensorCore permission to reuse a +previously loaded `A` or `B` matrix; however reuse is opportunistic in that the +TensorCore may reload a matrix even when it has permission to reuse that matrix. +Thus, the source memory of an A or B matrix must not be modified while the MMA +instruction using those matrices has not completed - regardless of collector +qualifier permissions. + +The `cta_group::1` specifies that the operation is performed on the Tensor Memory +of the executing thread’s CTA only. The `cta_group::2` specifies that the MMA +operation is performed on the Tensor Memory of the executing thread’s CTA and its peer CTA. + +The vector operand `%disable_output_lane` specifies the lane(s) in the Tensor Memory +that should be not be updated with the resultant matrix D. 
Elements of the vector operand +disable-output-lane forms a mask where each bit corresponds to a lane of the Tensor Memory, +with least significant bit of the first element of the vector (leftmost in syntax) +corresponding to the lane 0 of the Tensor Memory. If a bit in the mask is 1, then +the corresponding lane in the Tensor Memory for the resultant matrix D will not be +updated + +Intrinsic Design: +^^^^^^^^^^^^^^^^^ + +Given the broad feature set of `tcgen05.mma` instruction modeling these +through intrinsics is highly complex, and the following table outlines the large +number of intrinsics required to fully support the `tcgen05.mma` instruction set. + ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| variant | Configuration | Total Variants | ++====================================+===================================================================================================+================+ +| tcgen05.mma.shared | 2 (space) x 2 (sp) x 4 (kind) x 2 (cta_group) x 4 (collector_usage) | 128 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.tensor.ashift | 2 (sp) x 4 (kind) x 2 (cta_group) x 2 (collector_usage) | 32 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.scale_d | 2 (space) x 2 (sp) x 2 (kind) x 2 (cta_group) x 4 (collector_usage) | 128 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.scale_d.tensor.ashift | 2 (sp) x 2 (kind) x 2 (cta_group) x 2 (collector_usage) | 16 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.disable_output_lane | 2 (space) x 2 (sp) x 4 (kind) x 2 (cta_group) x 4 (collector_usage) | 128 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.disable_output_lane... 
| 2 (sp) x 4 (kind) x 2 (cta_group) x 2 (collector_usage) | 32 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.block_scale | 2 (space) x 1 (mxf4nvf4) x 2 (cta_group) x 2 (scale_vec_size) x 4 (collector_usage) | 32 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.block_scale | 2 (space) x 1 (mxf4) x 2 (cta_group) x 2 (scale_vec_size) x 4 (collector_usage) | 32 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.block_scale | 2 (space) x 1 (mxf8f6f4) x 2 (cta_group) x 2 (scale_vec_size) x 4 (collector_usage) | 32 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| tcgen05.mma.ws | 2 (space) x 2 (sp) x 4 (kind) x 2 (zero_col_mask) x 4 (collector_usage_op) x 4 (collector_buffer) | 256 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ +| Total | | 816 | ++------------------------------------+---------------------------------------------------------------------------------------------------+----------------+ + + +To reduce the number of possible intrinsic variations, we've modeled the `tcgen05.mma` +instructions using flag operands. We've added range checks to these flags to prevent +invalid values. We also expanded some flags back into intrinsic modifiers to avoid +supporting invalid combinations of features. + + +'``llvm.nvvm.tcgen05.mma.*``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. 
code-block:: llvm + + declare void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i32 %kind_flag, i32 %cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i32 %kind_flag, i32 %cta_group_flag, i32 %collector_usage_a_op_flag) + + ; .sp variants + declare void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, i32 %kind_flag, i32 %cta_group_flag, i32 %collector_usage_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, i32 %kind_flag, i32 %cta_group_flag, i32 %collector_usage_a_op_flag) + + ; .scale_d variants + declare void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i64 %scale_d_imm, i32 %cta_group_flag, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.scale_d<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i64 %scale_d_imm, i32 %cta_group_flag, i32 %kind_flag, i32 %collector_usage_a_op_flag) + + ; sp.scale_d variants + declare void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, i64 %scale_d_imm, i32 %cta_group_flag, i32 %collector_usage_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, i64 %scale_d_imm, i32 %cta_group, i32 %collector_usage_a_op_flag) + +Overview: +""""""""" + +`nvvm.tcgen05.mma` is an asynchronous intrinsic which initiates an `M x N x K` matrix +multiply and accumulate operation, `D = A * B + D` where the `A` matrix is `M x K`, +the `B` matrix is `K x N`, and the `D` matrix is `M x N`. The operation of the form +`D = A*B` is issued when the input predicate argument `%enable_inp_d` is false. +The optional immediate argument `%scale_d_imm` can be specified to scale the input +matrix `D` as follows: `D = A * B + D * (2 ^ - %scale_d_imm)`. The valid range of +values for argument `%scale_d_imm` is `[0, 15]`. The 32-bit register operand idesc +is the instruction descriptor as described in `Instruction descriptor `__ + +`nvvm.tcgen05.mma` has single thread semantics, unlike the collective instructions +`nvvm.mma.sync` or the PTX `wgmma.mma_async` instruction. So, a single thread issuing +the `nvvm.tcgen05.mma` will result in the initiation of the whole matrix and accumulate +operation + +When `.sp` is specifed, the dimension of A matrix is `M x (K/2)` and requires +specifiying an additional `%spmetadata` argument + +`.ashift` shifts the rows of the A matrix down by one row, except for the last row +in the Tensor Memory. `.ashift` is only allowed with M = 128 or M = 256. + +The `%collector_usage_a_op_flag` flag specifies the usage of collector buffer for +matrix `A`. 
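+
+As an illustration, a hypothetical call site for the shared-memory variant
+might look as follows (the descriptor operands ``%d``, ``%adesc``, ``%bdesc``
+and ``%idesc`` are assumed to have been materialized elsewhere; the flag
+values follow the encoding tables below):
+
+.. code-block:: llvm
+
+  call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc,
+                                          i1 true, ; accumulate: D = A * B + D
+                                          i32 0,   ; %kind_flag = F16
+                                          i32 1,   ; %cta_group_flag = CG1
+                                          i32 0)   ; %collector_usage_a_op_flag = DISCARD
+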
It is illegal to specify either of `USE` or `FILL` for `%collector_usage_a_op_flag` +along with `.ashift` + +For more information, refer to the +`PTX ISA `__ + +The following tables describes the possible values of the flag arguments + +`%kind_flag` flag: + +============= ========== + `kind_flag` value +============= ========== + F16 0 + TF32 1 + F8F6F4 2 + I8 3 +============= ========== + +`%cta_group_flag` flag: + +================= ========== + `cta_group_flag` value +================= ========== + CG1 1 + CG2 2 +================= ========== + +`%collector_usage_a_op_flag` flag: + +============================= ========== + `collector_usage_a_op_flag` value +============================= ========== + DISCARD 0 + LASTUSE 1 + USE 2 + FILL 3 +============================= ========== + +'``llvm.nvvm.tcgen05.mma.block_scale*``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + ; mxf8f6f4 + declare void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + + ; mxf4 + declare void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr 
addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + + ; mxf4nvf4 + declare void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 
%idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 cta_group_flag, i32 %collector_usage_a_op_flag) + +Overview: +""""""""" +`nvvm.tcgen05.mma.block_scale` is an asynchronous intrinsic which initiates an `M x N x K` matrix multiply and accumulate operation, `D = (A * scale_a) * (B * scale_b) + D` where the `A` matrix is `M x K`, the `B` matrix is `K x N`, and the `D` matrix is `M x N`. The matrices `A` and `B` are scaled with `%scale_A` and `%scale_B` matrices respectively before performing the matrix multiply and accumulate operation. The operation of the form `D = A*B` is issued when the input predicate argument `%enable_inp_d` is false. The 32-bit register operand idesc is the instruction descriptor as described in `Instruction descriptor `__ + +`nvvm.tcgen05.mma.block_scale` has single thread semantics, unlike the collective instructions `nvvm.mma.sync` or the PTX `wgmma.mma_async` instruction. So, a single thread issuing the `nvvm.tcgen05.mma.block_scale` will result in the initiation of the whole matrix multiply and accumulate operation + +When `.sp` is specifed, the dimension of A matrix is `M x (K / 2)` and requires specifiying an additional `%spmetadata` argument + +The `%collector_usage_a_op_flag` flag specifies the usage of collector buffer for matrix `A` + +For more information, refer to the +`PTX ISA `__ + +The following tables describes the possible values of the flag arguments + +`%cta_group`: + +============= ========== + `cta_group` value +============= ========== + CG1 1 + CG2 2 +============= ========== + +`%collector_usage_a_op_flag`: + +============================= ========== + `collector_usage_a_op_flag` value +============================= ========== + DISCARD 0 + LASTUSE 1 + USE 2 + FILL 3 +============================= ========== + +'``llvm.nvvm.tcgen05.mma.disable_output_lane*``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. 
code-block:: llvm + + declare void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_a_op_flag) + + ; .sp variants + declare void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, <8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, <8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_a_op_flag) + + ; .scale_d variants + declare void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i64 %scale_d_imm, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i64 %scale_d_imm, <8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i64 %scale_d_imm, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i64 %scale_d_imm, <8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_a_op_flag) + + ; .sp.scale_d variants + declare void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, i64 %scale_d_imm, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_op_flag) + declare void 
@llvm.nvvm.tcgen05.mma.sp.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, i64 %scale_d_imm, <8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg1<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, i64 %scale_d_imm, <4 x i32> %disable_output_lane_v4, i32 %kind_flag, i32 %collector_usage_a_op_flag) + declare void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.disable_output_lane.cg2<.ashift>(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, ptr addrspace(6) %spmetadata, i1 %enable_inp_d, i64 %scale_d_imm, <8 x i32> %disable_output_lane_v8, i32 %kind_flag, i32 %collector_usage_a_op_flag) + +Overview: +""""""""" + +`nvvm.tcgen05.mma.disable_output_lane` is an asynchronous intrinsic which initiates an `M x N x K` matrix multiply and accumulate operation, `D = A * B + D` where the `A` matrix is `M x K`, the `B` matrix is `K x N`, and the `D` matrix is `M x N`. The operation of the form `D = A*B` is issued when the input predicate argument `%enable_inp_d` is false. The optional immediate argument `%scale_d_imm` can be specified to scale the input matrix `D` as follows: `D = A*B+D * (2 ^ - %scale_d_imm)`. The valid range of values for argument `%scale_d_imm` is `[0, 15]`. The 32-bit register operand idesc is the instruction descriptor as described in `Instruction descriptor `__ + +The vector operand `%disable_output_lane` specifies the lane(s) in the Tensor Memory that should not be updated with the resultant matrix `D`. Elements of the vector operand `%disable_output_lane` form a mask where each bit corresponds to a lane of the Tensor Memory, with the least significant bit of the first element of the vector corresponding to `lane 0` of the Tensor Memory. If a bit in the mask is 1, then the corresponding lane in the Tensor Memory for the resultant matrix `D` will not be updated. + +`nvvm.tcgen05.mma.disable_output_lane` has single thread semantics, unlike the collective instructions `nvvm.mma.sync` or the PTX `wgmma.mma_async` instruction. So, a single thread issuing the `nvvm.tcgen05.mma.disable_output_lane` will result in the initiation of the whole matrix multiply and accumulate operation. + +When `.sp` is specified, the dimension of the A matrix is `M x (K / 2)` and an additional `%spmetadata` argument is required. + +`.ashift` shifts the rows of the A matrix down by one row, except for the last row in the Tensor Memory. `.ashift` is only allowed with M = 128 or M = 256. + +The `%collector_usage_a_op_flag` flag specifies the usage of the collector buffer for matrix `A`.
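+
+For example, a hypothetical call that masks out only lane 0 of the Tensor
+Memory (bit 0 of the first vector element) while accumulating into ``%d``
+could look like this (descriptor operands assumed to be set up elsewhere):
+
+.. code-block:: llvm
+
+  call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(
+      ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc,
+      i1 true,                                ; accumulate: D = A * B + D
+      <4 x i32> <i32 1, i32 0, i32 0, i32 0>, ; %disable_output_lane_v4
+      i32 0,                                  ; %kind_flag = F16
+      i32 0)                                  ; %collector_usage_a_op_flag = DISCARD
+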
It is illegal to specify either of `USE` or `FILL` for `%collector_usage_a_op_flag` along with `.ashift` + +For more information, refer to the `PTX ISA `__ + +The following tables describes the possible values of the flag arguments + +`%kind_flag`: + +============= ========== + `kind_flag` value +============= ========== + F16 0 + TF32 1 + F8F6F4 2 + I8 3 +============= ========== + +`%cta_group_flag`: + +================= ========== + `cta_group_flag` value +================= ========== + CG1 1 + CG2 2 +================= ========== + +`%collector_usage_a_op_flag`: + +============================= ========== + `collector_usage_a_op_flag` value +============================= ========== + DISCARD 0 + LASTUSE 1 + USE 2 + FILL 3 +============================= ========== + + +'``llvm.nvvm.tcgen05.mma.ws*``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + // tcgen05.mma.ws + declare void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + declare void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + declare void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + declare void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + declare void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + + ; .sp variants + declare void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + declare void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + declare void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %d, i64 %adesc, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 %zero_col_mask, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + declare void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %d, ptr addrspace(6) %atensor, i64 %bdesc, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 %zero_col_mask, i32 %kind_flag, i32 %collector_usage_b_buffer_flag, i32 %collector_usage_b_op_flag) + +Overview: +""""""""" + +`nvvm.tcgen05.mma.ws` is an asynchronous intrinsic which initiates an `M x N x K` weight stationary convolution matrix multiply and accumulate operation, `D = A * B + D` where the `A` matrix is `M x K`, the `B` matrix is `K x N`, and the `D` matrix is `M x N`. 
The operation of the form `D = A*B` is issued when the input predicate argument `%enable_inp_d` is false. The optional immediate argument `%scale_d_imm` can be specified to scale the input matrix `D` as follows: `D = A*B+D * (2 ^ - %scale_d_imm)`. The valid range of values for argument `%scale_d_imm` is `[0, 15]`. The 32-bit register operand idesc is the instruction descriptor as described in `Instruction descriptor `__ + +`nvvm.tcgen05.mma` has single thread semantics, unlike the collective instructions `nvvm.mma.sync` or the PTX `wgmma.mma_async` instruction. So, a single thread issuing the `nvvm.tcgen05.mma` will result in the initiation of the whole matrix multiply and accumulate operation + +When `.sp` is specifed, the dimension of A matrix is `M x (K / 2)` and requires specifiying an additional `%spmetadata` argument + +The operand `%zero_col_mask` is a 64-bit register which specifies the `Zero-Column Mask Descriptor `__. The zero-column mask descriptor is used to generate a mask that specifies which columns of `B` matrix will have zero value for the matrix multiply and accumulate operation regardless of the values present in the shared memory. + +The `%collector_usage_b_buffer_flag` and `%collector_usage_b_op_flag` together flag specifies the usage of collector buffer for Matrix `B` + +For more information, refer to the +`PTX ISA `__ + +The following tables describes the possible values of the flag arguments + +`%kind_flag`: + +============= ========== + `kind_flag` value +============= ========== + F16 0 + TF32 1 + F8F6F4 2 + I8 3 +============= ========== + +`%collector_usage_b_buffer_flag`: + +================================ ========== + `collector_usage_b_buffer_flag` value +================================ ========== + B0 0 + B1 1 + B2 2 + B3 3 +================================ ========== + +`%collector_usage_b_op_flag`: + +============================= ========== + `collector_usage_b_op_flag` value +============================= ========== + DISCARD 0 + LASTUSE 1 + USE 2 + FILL 3 +============================= ========== + Store Intrinsics ---------------- diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst index 602922fcb3b9c..f2b31211cf0dc 100644 --- a/llvm/docs/ProgrammersManual.rst +++ b/llvm/docs/ProgrammersManual.rst @@ -3832,7 +3832,7 @@ Important Subclasses of the ``Instruction`` class * ``BinaryOperator`` - This subclasses represents all two operand instructions whose operands must be + This subclass represents all two operand instructions whose operands must be the same type, except for the comparison instructions. .. _CastInst: diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 2ea571e12a277..7b1a6ce834919 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -527,6 +527,9 @@ The current vendor extensions supported are: ``XAndesVBFHCvt`` LLVM implements `version 5.0.0 of the Andes Vector BFLOAT16 Conversion Extension specification `__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification. +``XAndesVSINTH`` + LLVM implements `version 5.0.0 of the Andes Vector Small Int Handling Extension specification `__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification. + ``XAndesVSINTLoad`` LLVM implements `version 5.0.0 of the Andes Vector INT4 Load Extension specification `__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification. 
diff --git a/llvm/docs/Reference.rst b/llvm/docs/Reference.rst index 7d0fdd78dc96d..5d842d339f8c9 100644 --- a/llvm/docs/Reference.rst +++ b/llvm/docs/Reference.rst @@ -46,6 +46,7 @@ LLVM and API reference documentation. ScudoHardenedAllocator MemoryModelRelaxationAnnotations MemTagSanitizer + QualGroup Security SecurityTransparencyReports SegmentedStacks diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index c211844c62491..85c16b9c33f10 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -126,6 +126,7 @@ Changes to the RISC-V Backend * Add support for Zvfbfa (Additional BF16 vector compute support) * Adds experimental support for the 'Zibi` (Branch with Immediate) extension. * Add support for Zvfofp8min (OFP8 conversion extension) +* Adds assembler support for the Andes `XAndesvsinth` (Andes Vector Small Int Handling Extension). Changes to the WebAssembly Backend ---------------------------------- @@ -159,6 +160,7 @@ Changes to the LLVM tools * `llvm-readelf` now dumps all hex format values in lower-case mode. * Some code paths for supporting Python 2.7 in `llvm-lit` have been removed. +* Support for `%T` in lit has been removed. Changes to LLDB --------------------------------- diff --git a/llvm/docs/Remarks.rst b/llvm/docs/Remarks.rst index c89940f9ff4d5..67ed94d9740f4 100644 --- a/llvm/docs/Remarks.rst +++ b/llvm/docs/Remarks.rst @@ -152,26 +152,6 @@ Other tools that support remarks: .. option:: -opt-remarks-format= .. option:: -opt-remarks-with-hotness -Serialization modes -=================== - -There are two modes available for serializing remarks: - -``Separate`` - - In this mode, the remarks and the metadata are serialized separately. The - client is responsible for parsing the metadata first, then use the metadata - to correctly parse the remarks. - -``Standalone`` - - In this mode, the remarks and the metadata are serialized to the same - stream. The metadata will always come before the remarks. - - The compiler does not support emitting standalone remarks. This mode is - more suited for post-processing tools like linkers, that can merge the - remarks for one whole project. - .. _yamlremarks: YAML remarks @@ -374,27 +354,11 @@ This block can contain the following records: The remark container -------------------- -Bitstream remarks are designed to be used in two different modes: - -``The separate mode`` - - The separate mode is the mode that is typically used during compilation. It - provides a way to serialize the remark entries to a stream while some - metadata is kept in memory to be emitted in the product of the compilation - (typically, an object file). - -``The standalone mode`` - - The standalone mode is typically stored and used after the distribution of - a program. It contains all the information that allows the parsing of all - the remarks without having any external dependencies. - -In order to support multiple modes, the format introduces the concept of a -bitstream remark container type. +The bitstream remark container supports multiple types: -.. _bitstreamremarksseparateremarksmeta: +.. _bitstreamremarksfileexternal: -``SeparateRemarksMeta: the metadata emitted separately`` +``RemarksFileExternal: a link to an external remarks file`` This container type expects only a :ref:`META_BLOCK ` containing only: @@ -406,84 +370,33 @@ bitstream remark container type. clients to retrieve remarks and their associated metadata directly from intermediate products. 
-``SeparateRemarksFile: the remark entries emitted separately`` + The container versions of the external separate container should match in order to + have a well-formed file. - This container type expects only a :ref:`META_BLOCK ` containing only: - - * :ref:`RECORD_META_CONTAINER_INFO ` - * :ref:`RECORD_META_REMARK_VERSION ` +.. _bitstreamremarksfile: - This container type expects 0 or more :ref:`REMARK_BLOCK `. +``RemarksFile: a standalone remarks file`` - Typically, this is emitted in a side-file alongside an object file, and is - made to be able to stream to without increasing the memory consumption of - the compiler. This is referenced by the :ref:`RECORD_META_EXTERNAL_FILE - ` entry in the - :ref:`SeparateRemarksMeta ` container. + This container type expects a :ref:`META_BLOCK ` containing only: -When the parser tries to parse a container that contains the metadata for the -separate remarks, it should parse the version and type, then keep the string -table in memory while opening the external file, validating its metadata and -parsing the remark entries. + * :ref:`RECORD_META_CONTAINER_INFO ` + * :ref:`RECORD_META_REMARK_VERSION ` -The container versions from the separate container should match in order to -have a well-formed file. + Then, this container type expects 1 or more :ref:`REMARK_BLOCK `. + If no remarks are emitted, the meta blocks are also not emitted, so the file is empty. -``Standalone: the metadata and the remark entries emitted together`` + After the remark blocks, another :ref:`META_BLOCK ` is emitted, containing: + * :ref:`RECORD_META_STRTAB ` - This container type expects only a :ref:`META_BLOCK ` containing only: + When the parser reads this container type, it jumps to the end of the file + to read the string table before parsing the individual remarks. - * :ref:`RECORD_META_CONTAINER_INFO ` - * :ref:`RECORD_META_REMARK_VERSION ` - * :ref:`RECORD_META_STRTAB ` + Standalone remarks files can be referenced by the + :ref:`RECORD_META_EXTERNAL_FILE ` + entry in the :ref:`RemarksFileExternal + ` container. - This container type expects 0 or more :ref:`REMARK_BLOCK `. - -A complete output of :program:`llvm-bcanalyzer` on the different container types: - -``SeparateRemarksMeta`` - -.. code-block:: none - - - - - blob data = 'pass\\x00key\\x00value\\x00' - blob data = '/path/to/file/name' - - -``SeparateRemarksFile`` - -.. code-block:: none - - - - - - - - - - - - - -``Standalone`` - -.. code-block:: none - - - - - - blob data = 'pass\\x00remark\\x00function\\x00path\\x00key\\x00value\\x00argpath\\x00' - - - - - - - +.. FIXME: Add complete output of :program:`llvm-bcanalyzer` on the different container types (once format changes are completed) opt-viewer ========== diff --git a/llvm/docs/index.rst b/llvm/docs/index.rst index 28ca6bf6316f4..b480729aaa5d9 100644 --- a/llvm/docs/index.rst +++ b/llvm/docs/index.rst @@ -86,7 +86,6 @@ LLVM welcomes contributions of all kinds. To learn more, see the following artic :hidden: GettingInvolved - QualGroup * :doc:`GettingInvolved` * :ref:`development-process` @@ -98,8 +97,6 @@ LLVM welcomes contributions of all kinds. 
To learn more, see the following artic * :ref:`report-security-issue` -* :doc:`QualGroup` - Indices and tables ================== diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h index 72da2343fae13..83350e6e45846 100644 --- a/llvm/include/llvm/ADT/BitVector.h +++ b/llvm/include/llvm/ADT/BitVector.h @@ -40,12 +40,20 @@ template class const_set_bits_iterator_impl { Current = Parent.find_next(Current); } + void retreat() { + if (Current == -1) { + Current = Parent.find_last(); + } else { + Current = Parent.find_prev(Current); + } + } + public: - using iterator_category = std::forward_iterator_tag; - using difference_type = std::ptrdiff_t; - using value_type = int; - using pointer = value_type*; - using reference = value_type&; + using iterator_category = std::bidirectional_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = unsigned; + using pointer = const value_type *; + using reference = value_type; const_set_bits_iterator_impl(const BitVectorT &Parent, int Current) : Parent(Parent), Current(Current) {} @@ -64,6 +72,17 @@ template class const_set_bits_iterator_impl { return *this; } + const_set_bits_iterator_impl operator--(int) { + auto Prev = *this; + retreat(); + return Prev; + } + + const_set_bits_iterator_impl &operator--() { + retreat(); + return *this; + } + unsigned operator*() const { return Current; } bool operator==(const const_set_bits_iterator_impl &Other) const { diff --git a/llvm/include/llvm/ADT/CoalescingBitVector.h b/llvm/include/llvm/ADT/CoalescingBitVector.h index 4940bc1c2c18b..b126fc699ad87 100644 --- a/llvm/include/llvm/ADT/CoalescingBitVector.h +++ b/llvm/include/llvm/ADT/CoalescingBitVector.h @@ -194,10 +194,7 @@ template class CoalescingBitVector { // Delete the overlapping intervals. Split up intervals that only partially // intersect an overlap. - for (IntervalT Overlap : Overlaps) { - IndexT OlapStart, OlapStop; - std::tie(OlapStart, OlapStop) = Overlap; - + for (auto [OlapStart, OlapStop] : Overlaps) { auto It = Intervals.find(OlapStart); IndexT CurrStart = It.start(); IndexT CurrStop = It.stop(); @@ -420,10 +417,7 @@ template class CoalescingBitVector { const SmallVectorImpl &Overlaps, SmallVectorImpl &NonOverlappingParts) { IndexT NextUncoveredBit = Start; - for (IntervalT Overlap : Overlaps) { - IndexT OlapStart, OlapStop; - std::tie(OlapStart, OlapStop) = Overlap; - + for (auto [OlapStart, OlapStop] : Overlaps) { // [Start;Stop] and [OlapStart;OlapStop] overlap iff OlapStart <= Stop // and Start <= OlapStop. bool DoesOverlap = OlapStart <= Stop && Start <= OlapStop; diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index e13a2cb09a412..bcf3e9676a7b5 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -75,37 +75,39 @@ class DenseMapBase : public DebugEpochBase { using const_iterator = DenseMapIterator; - inline iterator begin() { + [[nodiscard]] inline iterator begin() { return iterator::makeBegin(buckets(), empty(), *this); } - inline iterator end() { return iterator::makeEnd(buckets(), *this); } - inline const_iterator begin() const { + [[nodiscard]] inline iterator end() { + return iterator::makeEnd(buckets(), *this); + } + [[nodiscard]] inline const_iterator begin() const { return const_iterator::makeBegin(buckets(), empty(), *this); } - inline const_iterator end() const { + [[nodiscard]] inline const_iterator end() const { return const_iterator::makeEnd(buckets(), *this); } // Return an iterator to iterate over keys in the map. 
- inline auto keys() { + [[nodiscard]] inline auto keys() { return map_range(*this, [](const BucketT &P) { return P.getFirst(); }); } // Return an iterator to iterate over values in the map. - inline auto values() { + [[nodiscard]] inline auto values() { return map_range(*this, [](const BucketT &P) { return P.getSecond(); }); } - inline auto keys() const { + [[nodiscard]] inline auto keys() const { return map_range(*this, [](const BucketT &P) { return P.getFirst(); }); } - inline auto values() const { + [[nodiscard]] inline auto values() const { return map_range(*this, [](const BucketT &P) { return P.getSecond(); }); } [[nodiscard]] bool empty() const { return getNumEntries() == 0; } - unsigned size() const { return getNumEntries(); } + [[nodiscard]] unsigned size() const { return getNumEntries(); } /// Grow the densemap so that it can contain at least \p NumEntries items /// before resizing again. @@ -153,30 +155,35 @@ class DenseMapBase : public DebugEpochBase { } /// Return true if the specified key is in the map, false otherwise. - bool contains(const_arg_type_t Val) const { + [[nodiscard]] bool contains(const_arg_type_t Val) const { return doFind(Val) != nullptr; } /// Return 1 if the specified key is in the map, 0 otherwise. - size_type count(const_arg_type_t Val) const { + [[nodiscard]] size_type count(const_arg_type_t Val) const { return contains(Val) ? 1 : 0; } - iterator find(const_arg_type_t Val) { return find_as(Val); } - const_iterator find(const_arg_type_t Val) const { return find_as(Val); } + [[nodiscard]] iterator find(const_arg_type_t Val) { + return find_as(Val); + } + [[nodiscard]] const_iterator find(const_arg_type_t Val) const { + return find_as(Val); + } /// Alternate version of find() which allows a different, and possibly /// less expensive, key type. /// The DenseMapInfo is responsible for supplying methods /// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key /// type used. - template iterator find_as(const LookupKeyT &Val) { + template + [[nodiscard]] iterator find_as(const LookupKeyT &Val) { if (BucketT *Bucket = doFind(Val)) return makeIterator(Bucket); return end(); } template - const_iterator find_as(const LookupKeyT &Val) const { + [[nodiscard]] const_iterator find_as(const LookupKeyT &Val) const { if (const BucketT *Bucket = doFind(Val)) return makeConstIterator(Bucket); return end(); @@ -184,7 +191,7 @@ class DenseMapBase : public DebugEpochBase { /// lookup - Return the entry for the specified key, or a default /// constructed value if no such entry exists. - ValueT lookup(const_arg_type_t Val) const { + [[nodiscard]] ValueT lookup(const_arg_type_t Val) const { if (const BucketT *Bucket = doFind(Val)) return Bucket->getSecond(); return ValueT(); @@ -194,7 +201,8 @@ class DenseMapBase : public DebugEpochBase { // useful, because `lookup` cannot be used with non-default-constructible // values. template > - ValueT lookup_or(const_arg_type_t Val, U &&Default) const { + [[nodiscard]] ValueT lookup_or(const_arg_type_t Val, + U &&Default) const { if (const BucketT *Bucket = doFind(Val)) return Bucket->getSecond(); return Default; @@ -202,7 +210,7 @@ class DenseMapBase : public DebugEpochBase { /// at - Return the entry for the specified key, or abort if no such /// entry exists. 
- const ValueT &at(const_arg_type_t Val) const { + [[nodiscard]] const ValueT &at(const_arg_type_t Val) const { auto Iter = this->find(std::move(Val)); assert(Iter != this->end() && "DenseMap::at failed due to a missing key"); return Iter->second; @@ -330,14 +338,16 @@ class DenseMapBase : public DebugEpochBase { /// isPointerIntoBucketsArray - Return true if the specified pointer points /// somewhere into the DenseMap's array of buckets (i.e. either to a key or /// value in the DenseMap). - bool isPointerIntoBucketsArray(const void *Ptr) const { + [[nodiscard]] bool isPointerIntoBucketsArray(const void *Ptr) const { return Ptr >= getBuckets() && Ptr < getBucketsEnd(); } /// getPointerIntoBucketsArray() - Return an opaque pointer into the buckets /// array. In conjunction with the previous method, this can be used to /// determine whether an insertion caused the DenseMap to reallocate. - const void *getPointerIntoBucketsArray() const { return getBuckets(); } + [[nodiscard]] const void *getPointerIntoBucketsArray() const { + return getBuckets(); + } protected: DenseMapBase() = default; @@ -448,6 +458,11 @@ class DenseMapBase : public DebugEpochBase { static const KeyT getTombstoneKey() { return KeyInfoT::getTombstoneKey(); } private: + DerivedT &derived() { return *static_cast(this); } + const DerivedT &derived() const { + return *static_cast(this); + } + template std::pair lookupOrInsertIntoBucket(KeyArgT &&Key, Ts &&...Args) { @@ -477,39 +492,27 @@ class DenseMapBase : public DebugEpochBase { return const_iterator::makeIterator(TheBucket, buckets(), *this); } - unsigned getNumEntries() const { - return static_cast(this)->getNumEntries(); - } + unsigned getNumEntries() const { return derived().getNumEntries(); } - void setNumEntries(unsigned Num) { - static_cast(this)->setNumEntries(Num); - } + void setNumEntries(unsigned Num) { derived().setNumEntries(Num); } void incrementNumEntries() { setNumEntries(getNumEntries() + 1); } void decrementNumEntries() { setNumEntries(getNumEntries() - 1); } - unsigned getNumTombstones() const { - return static_cast(this)->getNumTombstones(); - } + unsigned getNumTombstones() const { return derived().getNumTombstones(); } - void setNumTombstones(unsigned Num) { - static_cast(this)->setNumTombstones(Num); - } + void setNumTombstones(unsigned Num) { derived().setNumTombstones(Num); } void incrementNumTombstones() { setNumTombstones(getNumTombstones() + 1); } void decrementNumTombstones() { setNumTombstones(getNumTombstones() - 1); } - const BucketT *getBuckets() const { - return static_cast(this)->getBuckets(); - } + const BucketT *getBuckets() const { return derived().getBuckets(); } - BucketT *getBuckets() { return static_cast(this)->getBuckets(); } + BucketT *getBuckets() { return derived().getBuckets(); } - unsigned getNumBuckets() const { - return static_cast(this)->getNumBuckets(); - } + unsigned getNumBuckets() const { return derived().getNumBuckets(); } BucketT *getBucketsEnd() { return getBuckets() + getNumBuckets(); } @@ -525,9 +528,9 @@ class DenseMapBase : public DebugEpochBase { return llvm::make_range(getBuckets(), getBucketsEnd()); } - void grow(unsigned AtLeast) { static_cast(this)->grow(AtLeast); } + void grow(unsigned AtLeast) { derived().grow(AtLeast); } - void shrink_and_clear() { static_cast(this)->shrink_and_clear(); } + void shrink_and_clear() { derived().shrink_and_clear(); } template BucketT *findBucketForInsertion(const LookupKeyT &Lookup, @@ -656,7 +659,9 @@ class DenseMapBase : public DebugEpochBase { /// This is just the raw 
memory used by DenseMap. /// If entries are pointers to objects, the size of the referenced objects /// are not included. - size_t getMemorySize() const { return getNumBuckets() * sizeof(BucketT); } + [[nodiscard]] size_t getMemorySize() const { + return getNumBuckets() * sizeof(BucketT); + } }; /// Equality comparison for DenseMap. @@ -667,9 +672,9 @@ class DenseMapBase : public DebugEpochBase { /// complexity is linear, worst case is O(N^2) (if every hash collides). template -bool operator==( - const DenseMapBase &LHS, - const DenseMapBase &RHS) { +[[nodiscard]] bool +operator==(const DenseMapBase &LHS, + const DenseMapBase &RHS) { if (LHS.size() != RHS.size()) return false; @@ -687,9 +692,9 @@ bool operator==( /// Equivalent to !(LHS == RHS). See operator== for performance notes. template -bool operator!=( - const DenseMapBase &LHS, - const DenseMapBase &RHS) { +[[nodiscard]] bool +operator!=(const DenseMapBase &LHS, + const DenseMapBase &RHS) { return !(LHS == RHS); } @@ -1009,21 +1014,13 @@ class SmallDenseMap void copyFrom(const SmallDenseMap &other) { this->destroyAll(); deallocateBuckets(); - Small = true; - if (other.getNumBuckets() > InlineBuckets) { - Small = false; - new (getLargeRep()) LargeRep(allocateBuckets(other.getNumBuckets())); - } + allocateBuckets(other.getNumBuckets()); this->BaseT::copyFrom(other); } void init(unsigned InitNumEntries) { auto InitBuckets = BaseT::getMinBucketToReserveForEntries(InitNumEntries); - Small = true; - if (InitBuckets > InlineBuckets) { - Small = false; - new (getLargeRep()) LargeRep(allocateBuckets(InitBuckets)); - } + allocateBuckets(InitBuckets); this->BaseT::initEmpty(); } @@ -1057,21 +1054,14 @@ class SmallDenseMap // AtLeast == InlineBuckets can happen if there are many tombstones, // and grow() is used to remove them. Usually we always switch to the // large rep here. 
- if (AtLeast > InlineBuckets) { - Small = false; - new (getLargeRep()) LargeRep(allocateBuckets(AtLeast)); - } + allocateBuckets(AtLeast); this->moveFromOldBuckets(llvm::make_range(TmpBegin, TmpEnd)); return; } LargeRep OldRep = std::move(*getLargeRep()); getLargeRep()->~LargeRep(); - if (AtLeast <= InlineBuckets) { - Small = true; - } else { - new (getLargeRep()) LargeRep(allocateBuckets(AtLeast)); - } + allocateBuckets(AtLeast); this->moveFromOldBuckets(OldRep.buckets()); @@ -1166,12 +1156,15 @@ class SmallDenseMap getLargeRep()->~LargeRep(); } - LargeRep allocateBuckets(unsigned Num) { - assert(Num > InlineBuckets && "Must allocate more buckets than are inline"); - LargeRep Rep = {static_cast(allocate_buffer( - sizeof(BucketT) * Num, alignof(BucketT))), - Num}; - return Rep; + void allocateBuckets(unsigned Num) { + if (Num <= InlineBuckets) { + Small = true; + } else { + Small = false; + BucketT *NewBuckets = static_cast( + allocate_buffer(sizeof(BucketT) * Num, alignof(BucketT))); + new (getLargeRep()) LargeRep{NewBuckets, Num}; + } } }; @@ -1239,15 +1232,15 @@ class DenseMapIterator : DebugEpochBase::HandleBase { const DenseMapIterator &I) : DebugEpochBase::HandleBase(I), Ptr(I.Ptr), End(I.End) {} - reference operator*() const { + [[nodiscard]] reference operator*() const { assert(isHandleInSync() && "invalid iterator access!"); assert(Ptr != End && "dereferencing end() iterator"); return *Ptr; } - pointer operator->() const { return &operator*(); } + [[nodiscard]] pointer operator->() const { return &operator*(); } - friend bool operator==(const DenseMapIterator &LHS, - const DenseMapIterator &RHS) { + [[nodiscard]] friend bool operator==(const DenseMapIterator &LHS, + const DenseMapIterator &RHS) { assert((!LHS.getEpochAddress() || LHS.isHandleInSync()) && "handle not in sync!"); assert((!RHS.getEpochAddress() || RHS.isHandleInSync()) && @@ -1257,8 +1250,8 @@ class DenseMapIterator : DebugEpochBase::HandleBase { return LHS.Ptr == RHS.Ptr; } - friend bool operator!=(const DenseMapIterator &LHS, - const DenseMapIterator &RHS) { + [[nodiscard]] friend bool operator!=(const DenseMapIterator &LHS, + const DenseMapIterator &RHS) { return !(LHS == RHS); } @@ -1296,7 +1289,8 @@ class DenseMapIterator : DebugEpochBase::HandleBase { }; template -inline size_t capacity_in_bytes(const DenseMap &X) { +[[nodiscard]] inline size_t +capacity_in_bytes(const DenseMap &X) { return X.getMemorySize(); } diff --git a/llvm/include/llvm/ADT/DenseMapInfo.h b/llvm/include/llvm/ADT/DenseMapInfo.h index 57a8674e35015..f24aeb4371e7f 100644 --- a/llvm/include/llvm/ADT/DenseMapInfo.h +++ b/llvm/include/llvm/ADT/DenseMapInfo.h @@ -139,13 +139,11 @@ struct DenseMapInfo> { using SecondInfo = DenseMapInfo; static constexpr Pair getEmptyKey() { - return std::make_pair(FirstInfo::getEmptyKey(), - SecondInfo::getEmptyKey()); + return {FirstInfo::getEmptyKey(), SecondInfo::getEmptyKey()}; } static constexpr Pair getTombstoneKey() { - return std::make_pair(FirstInfo::getTombstoneKey(), - SecondInfo::getTombstoneKey()); + return {FirstInfo::getTombstoneKey(), SecondInfo::getTombstoneKey()}; } static unsigned getHashValue(const Pair& PairVal) { diff --git a/llvm/include/llvm/ADT/DenseSet.h b/llvm/include/llvm/ADT/DenseSet.h index 60ad9b2eb7762..eec800d07b6df 100644 --- a/llvm/include/llvm/ADT/DenseSet.h +++ b/llvm/include/llvm/ADT/DenseSet.h @@ -83,9 +83,9 @@ class DenseSetImpl { DenseSetImpl(llvm::from_range_t, Range &&R) : DenseSetImpl(adl_begin(R), adl_end(R)) {} - bool empty() const { return TheMap.empty(); } - 
size_type size() const { return TheMap.size(); } - size_t getMemorySize() const { return TheMap.getMemorySize(); } + [[nodiscard]] bool empty() const { return TheMap.empty(); } + [[nodiscard]] size_type size() const { return TheMap.size(); } + [[nodiscard]] size_t getMemorySize() const { return TheMap.getMemorySize(); } /// Grow the DenseSet so that it has at least Size buckets. Will not shrink /// the Size of the set. @@ -154,14 +154,20 @@ class DenseSetImpl { using iterator = DenseSetIterator; using const_iterator = DenseSetIterator; - iterator begin() { return iterator(TheMap.begin()); } - iterator end() { return iterator(TheMap.end()); } + [[nodiscard]] iterator begin() { return iterator(TheMap.begin()); } + [[nodiscard]] iterator end() { return iterator(TheMap.end()); } - const_iterator begin() const { return const_iterator(TheMap.begin()); } - const_iterator end() const { return const_iterator(TheMap.end()); } + [[nodiscard]] const_iterator begin() const { + return const_iterator(TheMap.begin()); + } + [[nodiscard]] const_iterator end() const { + return const_iterator(TheMap.end()); + } - iterator find(const_arg_type_t V) { return iterator(TheMap.find(V)); } - const_iterator find(const_arg_type_t V) const { + [[nodiscard]] iterator find(const_arg_type_t V) { + return iterator(TheMap.find(V)); + } + [[nodiscard]] const_iterator find(const_arg_type_t V) const { return const_iterator(TheMap.find(V)); } @@ -180,10 +186,12 @@ class DenseSetImpl { /// The DenseMapInfo is responsible for supplying methods /// getHashValue(LookupKeyT) and isEqual(LookupKeyT, KeyT) for each key type /// used. - template iterator find_as(const LookupKeyT &Val) { + template + [[nodiscard]] iterator find_as(const LookupKeyT &Val) { return iterator(TheMap.find_as(Val)); } template + [[nodiscard]] const_iterator find_as(const LookupKeyT &Val) const { return const_iterator(TheMap.find_as(Val)); } @@ -229,8 +237,9 @@ class DenseSetImpl { /// Equivalent to N calls to RHS.count. Amortized complexity is linear, worst /// case is O(N^2) (if every hash collides). template -bool operator==(const DenseSetImpl &LHS, - const DenseSetImpl &RHS) { +[[nodiscard]] bool +operator==(const DenseSetImpl &LHS, + const DenseSetImpl &RHS) { if (LHS.size() != RHS.size()) return false; @@ -245,8 +254,9 @@ bool operator==(const DenseSetImpl &LHS, /// /// Equivalent to !(LHS == RHS). See operator== for performance notes. 
template -bool operator!=(const DenseSetImpl &LHS, - const DenseSetImpl &RHS) { +[[nodiscard]] bool +operator!=(const DenseSetImpl &LHS, + const DenseSetImpl &RHS) { return !(LHS == RHS); } diff --git a/llvm/include/llvm/ADT/FunctionExtras.h b/llvm/include/llvm/ADT/FunctionExtras.h index 1311452a17bb3..2498cb7796f1f 100644 --- a/llvm/include/llvm/ADT/FunctionExtras.h +++ b/llvm/include/llvm/ADT/FunctionExtras.h @@ -58,10 +58,6 @@ template class unique_function; namespace detail { -template -using EnableIfTrivial = - std::enable_if_t::value && - std::is_trivially_destructible::value>; template using EnableUnlessSameType = std::enable_if_t, ThisT>::value>; @@ -94,13 +90,12 @@ template class UniqueFunctionBase { template struct AdjustedParamTBase { static_assert(!std::is_reference::value, "references should be handled by template specialization"); - template - using IsSizeLessThanThresholdT = - std::bool_constant; + static constexpr bool IsSizeLessThanThreshold = + sizeof(T) <= 2 * sizeof(void *); using type = std::conditional_t::value && std::is_trivially_move_constructible::value && - IsSizeLessThanThresholdT::value, + IsSizeLessThanThreshold, T, T &>; }; @@ -236,17 +231,17 @@ template class UniqueFunctionBase { // type erased behaviors needed. Create a static instance of the struct type // here and each instance will contain a pointer to it. // Wrap in a struct to avoid https://gcc.gnu.org/PR71954 - template - struct CallbacksHolder { - inline static NonTrivialCallbacks Callbacks = { - &CallImpl, &MoveImpl, &DestroyImpl}; - }; - // See if we can create a trivial callback. We need the callable to be - // trivially moved and trivially destroyed so that we don't have to store - // type erased callbacks for those operations. - template - struct CallbacksHolder> { - inline static TrivialCallback Callbacks = {&CallImpl}; + template struct CallbacksHolder { + inline static auto Callbacks = []() constexpr { + // For trivial callables, we don't need to store move and destroy + // callbacks. + if constexpr (std::is_trivially_move_constructible_v && + std::is_trivially_destructible_v) + return TrivialCallback{&CallImpl}; + else + return NonTrivialCallbacks{&CallImpl, &MoveImpl, + &DestroyImpl}; + }(); }; // A simple tag type so the call-as type to be passed to the constructor. diff --git a/llvm/include/llvm/ADT/GenericSSAContext.h b/llvm/include/llvm/ADT/GenericSSAContext.h index 6aa3a8b9b6e0b..e9f99bafe9f1e 100644 --- a/llvm/include/llvm/ADT/GenericSSAContext.h +++ b/llvm/include/llvm/ADT/GenericSSAContext.h @@ -54,7 +54,7 @@ template class GenericSSAContext { // The null value for ValueRefT. For LLVM IR and MIR, this is simply the // default constructed value. - static constexpr ValueRefT *ValueRefNull = {}; + static constexpr ValueRefT ValueRefNull = {}; // An InstructionT usually defines one or more ValueT objects. 
using InstructionT = typename SSATraits::InstructionT; diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h index 3b9b7f2633771..141816c304397 100644 --- a/llvm/include/llvm/ADT/GenericUniformityImpl.h +++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h @@ -310,7 +310,7 @@ template class GenericSyncDependenceAnalysis { const DivergenceDescriptor &getJoinBlocks(const BlockT *DivTermBlock); private: - static DivergenceDescriptor EmptyDivergenceDesc; + static inline DivergenceDescriptor EmptyDivergenceDesc; ModifiedPO CyclePO; @@ -741,10 +741,6 @@ template class DivergencePropagator { } }; -template -typename llvm::GenericSyncDependenceAnalysis::DivergenceDescriptor - llvm::GenericSyncDependenceAnalysis::EmptyDivergenceDesc; - template llvm::GenericSyncDependenceAnalysis::GenericSyncDependenceAnalysis( const ContextT &Context, const DominatorTreeT &DT, const CycleInfoT &CI) diff --git a/llvm/include/llvm/ADT/ImmutableMap.h b/llvm/include/llvm/ADT/ImmutableMap.h index 3d19ca41a5be0..32634a96ee9ea 100644 --- a/llvm/include/llvm/ADT/ImmutableMap.h +++ b/llvm/include/llvm/ADT/ImmutableMap.h @@ -111,25 +111,25 @@ class ImmutableMap { } }; - bool contains(key_type_ref K) const { + [[nodiscard]] bool contains(key_type_ref K) const { return Root ? Root->contains(K) : false; } - bool operator==(const ImmutableMap &RHS) const { + [[nodiscard]] bool operator==(const ImmutableMap &RHS) const { return Root && RHS.Root ? Root->isEqual(*RHS.Root.get()) : Root == RHS.Root; } - bool operator!=(const ImmutableMap &RHS) const { + [[nodiscard]] bool operator!=(const ImmutableMap &RHS) const { return Root && RHS.Root ? Root->isNotEqual(*RHS.Root.get()) : Root != RHS.Root; } - TreeTy *getRoot() const { + [[nodiscard]] TreeTy *getRoot() const { if (Root) { Root->retain(); } return Root.get(); } - TreeTy *getRootWithoutRetain() const { return Root.get(); } + [[nodiscard]] TreeTy *getRootWithoutRetain() const { return Root.get(); } void manualRetain() { if (Root) Root->retain(); @@ -139,7 +139,7 @@ class ImmutableMap { if (Root) Root->release(); } - bool isEmpty() const { return !Root; } + [[nodiscard]] bool isEmpty() const { return !Root; } public: //===--------------------------------------------------===// @@ -163,10 +163,10 @@ class ImmutableMap { data_type_ref getData() const { return (*this)->second; } }; - iterator begin() const { return iterator(Root.get()); } - iterator end() const { return iterator(); } + [[nodiscard]] iterator begin() const { return iterator(Root.get()); } + [[nodiscard]] iterator end() const { return iterator(); } - data_type* lookup(key_type_ref K) const { + [[nodiscard]] data_type *lookup(key_type_ref K) const { if (Root) { TreeTy* T = Root->find(K); if (T) return &T->getValue().second; @@ -178,7 +178,7 @@ class ImmutableMap { /// getMaxElement - Returns the pair in the ImmutableMap for /// which key is the highest in the ordering of keys in the map. This /// method returns NULL if the map is empty. - value_type* getMaxElement() const { + [[nodiscard]] value_type *getMaxElement() const { return Root ? &(Root->getMaxElement()->getValue()) : nullptr; } @@ -186,7 +186,9 @@ class ImmutableMap { // Utility methods. //===--------------------------------------------------===// - unsigned getHeight() const { return Root ? Root->getHeight() : 0; } + [[nodiscard]] unsigned getHeight() const { + return Root ? 
Root->getHeight() : 0; + } static inline void Profile(FoldingSetNodeID& ID, const ImmutableMap& M) { ID.AddPointer(M.Root.get()); @@ -250,7 +252,7 @@ class ImmutableMapRef { return ImmutableMapRef(NewT, Factory); } - bool contains(key_type_ref K) const { + [[nodiscard]] bool contains(key_type_ref K) const { return Root ? Root->contains(K) : false; } @@ -258,16 +260,16 @@ class ImmutableMapRef { return ImmutableMap(Factory->getCanonicalTree(Root.get())); } - bool operator==(const ImmutableMapRef &RHS) const { + [[nodiscard]] bool operator==(const ImmutableMapRef &RHS) const { return Root && RHS.Root ? Root->isEqual(*RHS.Root.get()) : Root == RHS.Root; } - bool operator!=(const ImmutableMapRef &RHS) const { + [[nodiscard]] bool operator!=(const ImmutableMapRef &RHS) const { return Root && RHS.Root ? Root->isNotEqual(*RHS.Root.get()) : Root != RHS.Root; } - bool isEmpty() const { return !Root; } + [[nodiscard]] bool isEmpty() const { return !Root; } //===--------------------------------------------------===// // For testing. @@ -293,10 +295,10 @@ class ImmutableMapRef { data_type_ref getData() const { return (*this)->second; } }; - iterator begin() const { return iterator(Root.get()); } - iterator end() const { return iterator(); } + [[nodiscard]] iterator begin() const { return iterator(Root.get()); } + [[nodiscard]] iterator end() const { return iterator(); } - data_type *lookup(key_type_ref K) const { + [[nodiscard]] data_type *lookup(key_type_ref K) const { if (Root) { TreeTy* T = Root->find(K); if (T) return &T->getValue().second; @@ -308,7 +310,7 @@ class ImmutableMapRef { /// getMaxElement - Returns the pair in the ImmutableMap for /// which key is the highest in the ordering of keys in the map. This /// method returns NULL if the map is empty. - value_type* getMaxElement() const { + [[nodiscard]] value_type *getMaxElement() const { return Root ? &(Root->getMaxElement()->getValue()) : nullptr; } @@ -316,7 +318,9 @@ class ImmutableMapRef { // Utility methods. //===--------------------------------------------------===// - unsigned getHeight() const { return Root ? Root->getHeight() : 0; } + [[nodiscard]] unsigned getHeight() const { + return Root ? Root->getHeight() : 0; + } static inline void Profile(FoldingSetNodeID &ID, const ImmutableMapRef &M) { ID.AddPointer(M.Root.get()); diff --git a/llvm/include/llvm/ADT/ImmutableSet.h b/llvm/include/llvm/ADT/ImmutableSet.h index ac86f43b2048e..017585a47ddd6 100644 --- a/llvm/include/llvm/ADT/ImmutableSet.h +++ b/llvm/include/llvm/ADT/ImmutableSet.h @@ -531,7 +531,7 @@ class ImutAVLFactory { /// add_internal - Creates a new tree that includes the specified /// data and the data from the original tree. If the original tree /// already contained the data item, the original tree is returned. - TreeTy* add_internal(value_type_ref V, TreeTy* T) { + TreeTy *add_internal(value_type_ref V, TreeTy *T) { if (isEmpty(T)) return createNode(T, V, T); assert(!T->isMutable()); @@ -539,19 +539,34 @@ class ImutAVLFactory { key_type_ref K = ImutInfo::KeyOfValue(V); key_type_ref KCurrent = ImutInfo::KeyOfValue(getValue(T)); - if (ImutInfo::isEqual(K,KCurrent)) + if (ImutInfo::isEqual(K, KCurrent)) { + // If both key and value are same, return the original tree. + if (ImutInfo::isDataEqual(ImutInfo::DataOfValue(V), + ImutInfo::DataOfValue(getValue(T)))) + return T; + // Otherwise create a new node with the new value. 
return createNode(getLeft(T), V, getRight(T)); - else if (ImutInfo::isLess(K,KCurrent)) - return balanceTree(add_internal(V, getLeft(T)), getValue(T), getRight(T)); + } + + TreeTy *NewL = getLeft(T); + TreeTy *NewR = getRight(T); + if (ImutInfo::isLess(K, KCurrent)) + NewL = add_internal(V, NewL); else - return balanceTree(getLeft(T), getValue(T), add_internal(V, getRight(T))); + NewR = add_internal(V, NewR); + + // If no changes were made, return the original tree. Otherwise, balance the + // tree and return the new root. + return NewL == getLeft(T) && NewR == getRight(T) + ? T + : balanceTree(NewL, getValue(T), NewR); } /// remove_internal - Creates a new tree that includes all the data /// from the original tree except the specified data. If the /// specified data did not exist in the original tree, the original /// tree is returned. - TreeTy* remove_internal(key_type_ref K, TreeTy* T) { + TreeTy *remove_internal(key_type_ref K, TreeTy *T) { if (isEmpty(T)) return T; @@ -559,15 +574,21 @@ class ImutAVLFactory { key_type_ref KCurrent = ImutInfo::KeyOfValue(getValue(T)); - if (ImutInfo::isEqual(K,KCurrent)) { + if (ImutInfo::isEqual(K, KCurrent)) return combineTrees(getLeft(T), getRight(T)); - } else if (ImutInfo::isLess(K,KCurrent)) { - return balanceTree(remove_internal(K, getLeft(T)), - getValue(T), getRight(T)); - } else { - return balanceTree(getLeft(T), getValue(T), - remove_internal(K, getRight(T))); - } + + TreeTy *NewL = getLeft(T); + TreeTy *NewR = getRight(T); + if (ImutInfo::isLess(K, KCurrent)) + NewL = remove_internal(K, NewL); + else + NewR = remove_internal(K, NewR); + + // If no changes were made, return the original tree. Otherwise, balance the + // tree and return the new root. + return NewL == getLeft(T) && NewR == getRight(T) + ? T + : balanceTree(NewL, getValue(T), NewR); } TreeTy* combineTrees(TreeTy* L, TreeTy* R) { diff --git a/llvm/include/llvm/ADT/MapVector.h b/llvm/include/llvm/ADT/MapVector.h index 4a50126ff5aad..82f2c4977e01d 100644 --- a/llvm/include/llvm/ADT/MapVector.h +++ b/llvm/include/llvm/ADT/MapVector.h @@ -45,15 +45,15 @@ class MapVector { using const_reverse_iterator = typename VectorType::const_reverse_iterator; /// Clear the MapVector and return the underlying vector. - VectorType takeVector() { + [[nodiscard]] VectorType takeVector() { Map.clear(); return std::move(Vector); } /// Returns an array reference of the underlying vector. - ArrayRef getArrayRef() const { return Vector; } + [[nodiscard]] ArrayRef getArrayRef() const { return Vector; } - size_type size() const { return Vector.size(); } + [[nodiscard]] size_type size() const { return Vector.size(); } /// Grow the MapVector so that it can contain at least \p NumEntries items /// before resizing again. 
@@ -62,24 +62,28 @@ class MapVector { Vector.reserve(NumEntries); } - iterator begin() { return Vector.begin(); } - const_iterator begin() const { return Vector.begin(); } - iterator end() { return Vector.end(); } - const_iterator end() const { return Vector.end(); } + [[nodiscard]] iterator begin() { return Vector.begin(); } + [[nodiscard]] const_iterator begin() const { return Vector.begin(); } + [[nodiscard]] iterator end() { return Vector.end(); } + [[nodiscard]] const_iterator end() const { return Vector.end(); } - reverse_iterator rbegin() { return Vector.rbegin(); } - const_reverse_iterator rbegin() const { return Vector.rbegin(); } - reverse_iterator rend() { return Vector.rend(); } - const_reverse_iterator rend() const { return Vector.rend(); } - - bool empty() const { - return Vector.empty(); + [[nodiscard]] reverse_iterator rbegin() { return Vector.rbegin(); } + [[nodiscard]] const_reverse_iterator rbegin() const { + return Vector.rbegin(); } + [[nodiscard]] reverse_iterator rend() { return Vector.rend(); } + [[nodiscard]] const_reverse_iterator rend() const { return Vector.rend(); } + + [[nodiscard]] bool empty() const { return Vector.empty(); } - std::pair &front() { return Vector.front(); } - const std::pair &front() const { return Vector.front(); } - std::pair &back() { return Vector.back(); } - const std::pair &back() const { return Vector.back(); } + [[nodiscard]] std::pair &front() { return Vector.front(); } + [[nodiscard]] const std::pair &front() const { + return Vector.front(); + } + [[nodiscard]] std::pair &back() { return Vector.back(); } + [[nodiscard]] const std::pair &back() const { + return Vector.back(); + } void clear() { Map.clear(); @@ -96,7 +100,7 @@ class MapVector { } // Returns a copy of the value. Only allowed if ValueT is copyable. - ValueT lookup(const KeyT &Key) const { + [[nodiscard]] ValueT lookup(const KeyT &Key) const { static_assert(std::is_copy_constructible_v, "Cannot call lookup() if ValueT is not copyable."); typename MapType::const_iterator Pos = Map.find(Key); @@ -134,17 +138,21 @@ class MapVector { return Ret; } - bool contains(const KeyT &Key) const { return Map.find(Key) != Map.end(); } + [[nodiscard]] bool contains(const KeyT &Key) const { + return Map.find(Key) != Map.end(); + } - size_type count(const KeyT &Key) const { return contains(Key) ? 1 : 0; } + [[nodiscard]] size_type count(const KeyT &Key) const { + return contains(Key) ? 1 : 0; + } - iterator find(const KeyT &Key) { + [[nodiscard]] iterator find(const KeyT &Key) { typename MapType::const_iterator Pos = Map.find(Key); return Pos == Map.end()? Vector.end() : (Vector.begin() + Pos->second); } - const_iterator find(const KeyT &Key) const { + [[nodiscard]] const_iterator find(const KeyT &Key) const { typename MapType::const_iterator Pos = Map.find(Key); return Pos == Map.end()? Vector.end() : (Vector.begin() + Pos->second); diff --git a/llvm/include/llvm/ADT/PackedVector.h b/llvm/include/llvm/ADT/PackedVector.h index 1146cc4bd6d23..77fcbf24b2861 100644 --- a/llvm/include/llvm/ADT/PackedVector.h +++ b/llvm/include/llvm/ADT/PackedVector.h @@ -47,7 +47,7 @@ class PackedVectorBase { protected: static T getValue(const BitVectorTy &Bits, unsigned Idx) { T val = T(); - for (unsigned i = 0; i != BitNum-1; ++i) + for (unsigned i = 0; i != BitNum - 1; ++i) val = T(val | ((Bits[(Idx * BitNum) + i] ? 
1UL : 0UL) << i)); if (Bits[(Idx * BitNum) + BitNum - 1]) val = ~val; @@ -58,9 +58,11 @@ class PackedVectorBase { if (val < 0) { val = ~val; Bits.set((Idx * BitNum) + BitNum - 1); + } else { + Bits.reset((Idx * BitNum) + BitNum - 1); } - assert((val >> (BitNum-1)) == 0 && "value is too big"); - for (unsigned i = 0; i != BitNum-1; ++i) + assert((val >> (BitNum - 1)) == 0 && "value is too big"); + for (unsigned i = 0; i != BitNum - 1; ++i) Bits[(Idx * BitNum) + i] = val & (T(1) << i); } }; @@ -73,8 +75,9 @@ class PackedVectorBase { /// will create a vector accepting values -2, -1, 0, 1. Any other value will hit /// an assertion. template -class PackedVector : public PackedVectorBase::is_signed> { +class PackedVector + : public PackedVectorBase::is_signed> { BitVectorTy Bits; // Keep track of the number of elements on our own. // We always maintain Bits.size() == NumElements * BitNum. @@ -97,9 +100,7 @@ class PackedVector : public PackedVectorBase &base, ptrdiff_t index) { // We encode the internal base as a pair of the derived base and a start // index into the derived base. - return std::make_pair(base.first, base.second + index); + return {base.first, base.second + index}; } /// See `detail::indexed_accessor_range_base` for details. static ReferenceT diff --git a/llvm/include/llvm/ADT/SetVector.h b/llvm/include/llvm/ADT/SetVector.h index 5f6db9a78a003..c129f3a695b9e 100644 --- a/llvm/include/llvm/ADT/SetVector.h +++ b/llvm/include/llvm/ADT/SetVector.h @@ -87,72 +87,54 @@ class SetVector { SetVector(llvm::from_range_t, Range &&R) : SetVector(adl_begin(R), adl_end(R)) {} - ArrayRef getArrayRef() const { return vector_; } + [[nodiscard]] ArrayRef getArrayRef() const { return vector_; } /// Clear the SetVector and return the underlying vector. - Vector takeVector() { + [[nodiscard]] Vector takeVector() { set_.clear(); return std::move(vector_); } /// Determine if the SetVector is empty or not. - bool empty() const { - return vector_.empty(); - } + [[nodiscard]] bool empty() const { return vector_.empty(); } /// Determine the number of elements in the SetVector. - size_type size() const { - return vector_.size(); - } + [[nodiscard]] size_type size() const { return vector_.size(); } /// Get an iterator to the beginning of the SetVector. - iterator begin() { - return vector_.begin(); - } + [[nodiscard]] iterator begin() { return vector_.begin(); } /// Get a const_iterator to the beginning of the SetVector. - const_iterator begin() const { - return vector_.begin(); - } + [[nodiscard]] const_iterator begin() const { return vector_.begin(); } /// Get an iterator to the end of the SetVector. - iterator end() { - return vector_.end(); - } + [[nodiscard]] iterator end() { return vector_.end(); } /// Get a const_iterator to the end of the SetVector. - const_iterator end() const { - return vector_.end(); - } + [[nodiscard]] const_iterator end() const { return vector_.end(); } /// Get an reverse_iterator to the end of the SetVector. - reverse_iterator rbegin() { - return vector_.rbegin(); - } + [[nodiscard]] reverse_iterator rbegin() { return vector_.rbegin(); } /// Get a const_reverse_iterator to the end of the SetVector. - const_reverse_iterator rbegin() const { + [[nodiscard]] const_reverse_iterator rbegin() const { return vector_.rbegin(); } /// Get a reverse_iterator to the beginning of the SetVector. - reverse_iterator rend() { - return vector_.rend(); - } + [[nodiscard]] reverse_iterator rend() { return vector_.rend(); } /// Get a const_reverse_iterator to the beginning of the SetVector. 
- const_reverse_iterator rend() const { - return vector_.rend(); - } + [[nodiscard]] const_reverse_iterator rend() const { return vector_.rend(); } /// Return the first element of the SetVector. - const value_type &front() const { + [[nodiscard]] const value_type &front() const { assert(!empty() && "Cannot call front() on empty SetVector!"); return vector_.front(); } /// Return the last element of the SetVector. - const value_type &back() const { + [[nodiscard]] const value_type &back() const { assert(!empty() && "Cannot call back() on empty SetVector!"); return vector_.back(); } @@ -299,11 +281,11 @@ class SetVector { return Ret; } - bool operator==(const SetVector &that) const { + [[nodiscard]] bool operator==(const SetVector &that) const { return vector_ == that.vector_; } - bool operator!=(const SetVector &that) const { + [[nodiscard]] bool operator!=(const SetVector &that) const { return vector_ != that.vector_; } diff --git a/llvm/include/llvm/ADT/SmallPtrSet.h b/llvm/include/llvm/ADT/SmallPtrSet.h index 16ad3973e054d..e24cd6415b687 100644 --- a/llvm/include/llvm/ADT/SmallPtrSet.h +++ b/llvm/include/llvm/ADT/SmallPtrSet.h @@ -96,8 +96,8 @@ class SmallPtrSetImplBase : public DebugEpochBase { SmallPtrSetImplBase &operator=(const SmallPtrSetImplBase &) = delete; [[nodiscard]] bool empty() const { return size() == 0; } - size_type size() const { return NumEntries; } - size_type capacity() const { return CurArraySize; } + [[nodiscard]] size_type size() const { return NumEntries; } + [[nodiscard]] size_type capacity() const { return CurArraySize; } void clear() { incrementEpoch(); @@ -136,12 +136,12 @@ class SmallPtrSetImplBase : public DebugEpochBase { } protected: - static void *getTombstoneMarker() { return reinterpret_cast(-2); } + static void *getTombstoneMarker() { return reinterpret_cast(-2); } static void *getEmptyMarker() { // Note that -1 is chosen to make clear() efficiently implementable with // memset and because it's not a valid pointer value. - return reinterpret_cast(-1); + return reinterpret_cast(-1); } const void **EndPointer() const { @@ -190,7 +190,7 @@ class SmallPtrSetImplBase : public DebugEpochBase { /// return true, otherwise return false. This is hidden from the client so /// that the derived class can check that the right type of pointer is passed /// in. - bool erase_imp(const void * Ptr) { + bool erase_imp(const void *Ptr) { if (isSmall()) { for (const void *&Bucket : small_buckets()) { if (Bucket == Ptr) { @@ -218,7 +218,7 @@ class SmallPtrSetImplBase : public DebugEpochBase { /// Returns the raw pointer needed to construct an iterator. If element not /// found, this will be EndPointer. Otherwise, it will be a pointer to the /// slot which stores Ptr; - const void *const * find_imp(const void * Ptr) const { + const void *const *find_imp(const void *Ptr) const { if (isSmall()) { // Linear search for the item. for (const void *const &Bucket : small_buckets()) @@ -251,7 +251,7 @@ class SmallPtrSetImplBase : public DebugEpochBase { LLVM_ABI std::pair insert_imp_big(const void *Ptr); LLVM_ABI const void *const *doFind(const void *Ptr) const; - const void * const *FindBucketFor(const void *Ptr) const; + const void *const *FindBucketFor(const void *Ptr) const; LLVM_ABI void shrink_and_clear(); /// Grow - Allocate a larger backing store for the buckets and move it over. @@ -279,18 +279,12 @@ class SmallPtrSetImplBase : public DebugEpochBase { /// SmallPtrSetIteratorImpl - This is the common base class shared between all /// instances of SmallPtrSetIterator. 
-class SmallPtrSetIteratorImpl { -protected: - const void *const *Bucket; - const void *const *End; - +class LLVM_DEBUGEPOCHBASE_HANDLEBASE_EMPTYBASE SmallPtrSetIteratorImpl + : public DebugEpochBase::HandleBase { public: - explicit SmallPtrSetIteratorImpl(const void *const *BP, const void*const *E) - : Bucket(BP), End(E) { - if (shouldReverseIterate()) { - RetreatIfNotValid(); - return; - } + explicit SmallPtrSetIteratorImpl(const void *const *BP, const void *const *E, + const DebugEpochBase &Epoch) + : DebugEpochBase::HandleBase(&Epoch), Bucket(BP), End(E) { AdvanceIfNotValid(); } @@ -302,6 +296,18 @@ class SmallPtrSetIteratorImpl { } protected: + void *dereference() const { + assert(isHandleInSync() && "invalid iterator access!"); + assert(Bucket < End); + return const_cast(*Bucket); + } + void increment() { + assert(isHandleInSync() && "invalid iterator access!"); + ++Bucket; + AdvanceIfNotValid(); + } + +private: /// AdvanceIfNotValid - If the current bucket isn't valid, advance to a bucket /// that is. This is guaranteed to stop because the end() bucket is marked /// valid. @@ -312,21 +318,19 @@ class SmallPtrSetIteratorImpl { *Bucket == SmallPtrSetImplBase::getTombstoneMarker())) ++Bucket; } - void RetreatIfNotValid() { - assert(Bucket >= End); - while (Bucket != End && - (Bucket[-1] == SmallPtrSetImplBase::getEmptyMarker() || - Bucket[-1] == SmallPtrSetImplBase::getTombstoneMarker())) { - --Bucket; - } - } + + using BucketItTy = + std::conditional_t, + const void *const *>; + + BucketItTy Bucket; + BucketItTy End; }; /// SmallPtrSetIterator - This implements a const_iterator for SmallPtrSet. template -class LLVM_DEBUGEPOCHBASE_HANDLEBASE_EMPTYBASE SmallPtrSetIterator - : public SmallPtrSetIteratorImpl, - DebugEpochBase::HandleBase { +class SmallPtrSetIterator : public SmallPtrSetIteratorImpl { using PtrTraits = PointerLikeTypeTraits; public: @@ -336,37 +340,22 @@ class LLVM_DEBUGEPOCHBASE_HANDLEBASE_EMPTYBASE SmallPtrSetIterator using difference_type = std::ptrdiff_t; using iterator_category = std::forward_iterator_tag; - explicit SmallPtrSetIterator(const void *const *BP, const void *const *E, - const DebugEpochBase &Epoch) - : SmallPtrSetIteratorImpl(BP, E), DebugEpochBase::HandleBase(&Epoch) {} + using SmallPtrSetIteratorImpl::SmallPtrSetIteratorImpl; // Most methods are provided by the base class. - const PtrTy operator*() const { - assert(isHandleInSync() && "invalid iterator access!"); - if (shouldReverseIterate()) { - assert(Bucket > End); - return PtrTraits::getFromVoidPointer(const_cast(Bucket[-1])); - } - assert(Bucket < End); - return PtrTraits::getFromVoidPointer(const_cast(*Bucket)); + [[nodiscard]] const PtrTy operator*() const { + return PtrTraits::getFromVoidPointer(dereference()); } - inline SmallPtrSetIterator& operator++() { // Preincrement - assert(isHandleInSync() && "invalid iterator access!"); - if (shouldReverseIterate()) { - --Bucket; - RetreatIfNotValid(); - return *this; - } - ++Bucket; - AdvanceIfNotValid(); + inline SmallPtrSetIterator &operator++() { // Preincrement + increment(); return *this; } - SmallPtrSetIterator operator++(int) { // Postincrement + SmallPtrSetIterator operator++(int) { // Postincrement SmallPtrSetIterator tmp = *this; - ++*this; + increment(); return tmp; } }; @@ -376,8 +365,7 @@ class LLVM_DEBUGEPOCHBASE_HANDLEBASE_EMPTYBASE SmallPtrSetIterator /// /// This is particularly useful for passing around between interface boundaries /// to avoid encoding a particular small size in the interface boundary. 
-template -class SmallPtrSetImpl : public SmallPtrSetImplBase { +template class SmallPtrSetImpl : public SmallPtrSetImplBase { using ConstPtrType = typename add_const_past_pointer::type; using PtrTraits = PointerLikeTypeTraits; using ConstPtrTraits = PointerLikeTypeTraits; @@ -406,9 +394,7 @@ class SmallPtrSetImpl : public SmallPtrSetImplBase { /// Insert the given pointer with an iterator hint that is ignored. This is /// identical to calling insert(Ptr), but allows SmallPtrSet to be used by /// std::insert_iterator and std::inserter(). - iterator insert(iterator, PtrType Ptr) { - return insert(Ptr).first; - } + iterator insert(iterator, PtrType Ptr) { return insert(Ptr).first; } /// Remove pointer from the set. /// @@ -431,8 +417,7 @@ class SmallPtrSetImpl : public SmallPtrSetImplBase { /// Returns whether anything was removed. It is safe to read the set inside /// the predicate function. However, the predicate must not modify the set /// itself, only indicate a removal by returning true. - template - bool remove_if(UnaryPredicate P) { + template bool remove_if(UnaryPredicate P) { bool Removed = false; if (isSmall()) { auto Buckets = small_buckets(); @@ -467,18 +452,17 @@ class SmallPtrSetImpl : public SmallPtrSetImplBase { } /// count - Return 1 if the specified pointer is in the set, 0 otherwise. - size_type count(ConstPtrType Ptr) const { + [[nodiscard]] size_type count(ConstPtrType Ptr) const { return contains_imp(ConstPtrTraits::getAsVoidPointer(Ptr)); } - iterator find(ConstPtrType Ptr) const { + [[nodiscard]] iterator find(ConstPtrType Ptr) const { return makeIterator(find_imp(ConstPtrTraits::getAsVoidPointer(Ptr))); } - bool contains(ConstPtrType Ptr) const { + [[nodiscard]] bool contains(ConstPtrType Ptr) const { return contains_imp(ConstPtrTraits::getAsVoidPointer(Ptr)); } - template - void insert(IterT I, IterT E) { + template void insert(IterT I, IterT E) { for (; I != E; ++I) insert(*I); } @@ -491,12 +475,12 @@ class SmallPtrSetImpl : public SmallPtrSetImplBase { insert(adl_begin(R), adl_end(R)); } - iterator begin() const { + [[nodiscard]] iterator begin() const { if (shouldReverseIterate()) return makeIterator(EndPointer() - 1); return makeIterator(CurArray); } - iterator end() const { return makeIterator(EndPointer()); } + [[nodiscard]] iterator end() const { return makeIterator(EndPointer()); } private: /// Create an iterator that dereferences to same place as the given pointer. @@ -512,8 +496,8 @@ class SmallPtrSetImpl : public SmallPtrSetImplBase { /// Iterates over elements of LHS confirming that each value from LHS is also in /// RHS, and that no additional values are in RHS. template -bool operator==(const SmallPtrSetImpl &LHS, - const SmallPtrSetImpl &RHS) { +[[nodiscard]] bool operator==(const SmallPtrSetImpl &LHS, + const SmallPtrSetImpl &RHS) { if (LHS.size() != RHS.size()) return false; @@ -528,8 +512,8 @@ bool operator==(const SmallPtrSetImpl &LHS, /// /// Equivalent to !(LHS == RHS). template -bool operator!=(const SmallPtrSetImpl &LHS, - const SmallPtrSetImpl &RHS) { +[[nodiscard]] bool operator!=(const SmallPtrSetImpl &LHS, + const SmallPtrSetImpl &RHS) { return !(LHS == RHS); } @@ -537,7 +521,7 @@ bool operator!=(const SmallPtrSetImpl &LHS, /// SmallSize or less elements. This internally rounds up SmallSize to the next /// power of two if it is not already a power of two. See the comments above /// SmallPtrSetImplBase for details of the algorithm. 
-template +template class SmallPtrSet : public SmallPtrSetImpl { // In small mode SmallPtrSet uses linear search for the elements, so it is // not a good idea to choose this value too high. You may consider using a @@ -568,7 +552,7 @@ class SmallPtrSet : public SmallPtrSetImpl { : BaseT(SmallStorage, SmallSizePowTwo, that.SmallStorage, std::move(that)) {} - template + template SmallPtrSet(It I, It E) : BaseT(SmallStorage, SmallSizePowTwo) { this->insert(I, E); } @@ -610,16 +594,16 @@ class SmallPtrSet : public SmallPtrSetImpl { } }; -} // end namespace llvm +} // namespace llvm namespace std { - /// Implement std::swap in terms of SmallPtrSet swap. - template - inline void swap(llvm::SmallPtrSet &LHS, llvm::SmallPtrSet &RHS) { - LHS.swap(RHS); - } +/// Implement std::swap in terms of SmallPtrSet swap. +template +inline void swap(llvm::SmallPtrSet &LHS, llvm::SmallPtrSet &RHS) { + LHS.swap(RHS); +} -} // end namespace std +} // namespace std #endif // LLVM_ADT_SMALLPTRSET_H diff --git a/llvm/include/llvm/ADT/SmallSet.h b/llvm/include/llvm/ADT/SmallSet.h index 0e90293352630..3ca833f15eed3 100644 --- a/llvm/include/llvm/ADT/SmallSet.h +++ b/llvm/include/llvm/ADT/SmallSet.h @@ -167,12 +167,14 @@ class SmallSet { [[nodiscard]] bool empty() const { return Vector.empty() && Set.empty(); } - size_type size() const { + [[nodiscard]] size_type size() const { return isSmall() ? Vector.size() : Set.size(); } /// count - Return 1 if the element is in the set, 0 otherwise. - size_type count(const T &V) const { return contains(V) ? 1 : 0; } + [[nodiscard]] size_type count(const T &V) const { + return contains(V) ? 1 : 0; + } /// insert - Insert an element into the set if it isn't already there. /// Returns a pair. The first value of it is an iterator to the inserted @@ -210,20 +212,20 @@ class SmallSet { Set.clear(); } - const_iterator begin() const { + [[nodiscard]] const_iterator begin() const { if (isSmall()) return {Vector.begin()}; return {Set.begin()}; } - const_iterator end() const { + [[nodiscard]] const_iterator end() const { if (isSmall()) return {Vector.end()}; return {Set.end()}; } /// Check if the SmallSet contains the given element. - bool contains(const T &V) const { + [[nodiscard]] bool contains(const T &V) const { if (isSmall()) return vfind(V) != Vector.end(); return Set.find(V) != Set.end(); @@ -279,7 +281,8 @@ class SmallSet : public SmallPtrSet {}; /// For large-set mode amortized complexity is linear, worst case is O(N^2) (if /// every hash collides). template -bool operator==(const SmallSet &LHS, const SmallSet &RHS) { +[[nodiscard]] bool operator==(const SmallSet &LHS, + const SmallSet &RHS) { if (LHS.size() != RHS.size()) return false; @@ -291,7 +294,8 @@ bool operator==(const SmallSet &LHS, const SmallSet &RHS) { /// /// Equivalent to !(LHS == RHS). See operator== for performance notes. template -bool operator!=(const SmallSet &LHS, const SmallSet &RHS) { +[[nodiscard]] bool operator!=(const SmallSet &LHS, + const SmallSet &RHS) { return !(LHS == RHS); } diff --git a/llvm/include/llvm/ADT/SmallVector.h b/llvm/include/llvm/ADT/SmallVector.h index 5577b09fee89c..77805f5c03c14 100644 --- a/llvm/include/llvm/ADT/SmallVector.h +++ b/llvm/include/llvm/ADT/SmallVector.h @@ -199,17 +199,18 @@ class SmallVectorTemplateCommon } /// Check whether any part of the range will be invalidated by clearing. 
- void assertSafeToReferenceAfterClear(const T *From, const T *To) { - if (From == To) - return; - this->assertSafeToReferenceAfterResize(From, 0); - this->assertSafeToReferenceAfterResize(To - 1, 0); + template + void assertSafeToReferenceAfterClear(ItTy From, ItTy To) { + if constexpr (std::is_pointer_v && + std::is_same_v< + std::remove_const_t>, + std::remove_const_t>) { + if (From == To) + return; + this->assertSafeToReferenceAfterResize(From, 0); + this->assertSafeToReferenceAfterResize(To - 1, 0); + } } - template < - class ItTy, - std::enable_if_t, T *>::value, - bool> = false> - void assertSafeToReferenceAfterClear(ItTy, ItTy) {} /// Check whether any part of the range will be invalidated by growing. template void assertSafeToAddRange(ItTy From, ItTy To) { @@ -221,6 +222,8 @@ class SmallVectorTemplateCommon this->assertSafeToAdd(From, To - From); this->assertSafeToAdd(To - 1, To - From); } + (void)From; + (void)To; } /// Reserve enough space to add one element, and return the updated element diff --git a/llvm/include/llvm/ADT/SparseMultiSet.h b/llvm/include/llvm/ADT/SparseMultiSet.h index cf7603158b28b..0aa7edbcea673 100644 --- a/llvm/include/llvm/ADT/SparseMultiSet.h +++ b/llvm/include/llvm/ADT/SparseMultiSet.h @@ -400,7 +400,7 @@ class SparseMultiSet { RangePair equal_range(const KeyT &K) { iterator B = find(K); iterator E = iterator(this, SMSNode::INVALID, B.SparseIdx); - return std::make_pair(B, E); + return {B, E}; } /// Insert a new element at the tail of the subset list. Returns an iterator diff --git a/llvm/include/llvm/ADT/SparseSet.h b/llvm/include/llvm/ADT/SparseSet.h index 395cfc3ebfd43..9783301be4b64 100644 --- a/llvm/include/llvm/ADT/SparseSet.h +++ b/llvm/include/llvm/ADT/SparseSet.h @@ -171,23 +171,23 @@ class SparseSet { using iterator = typename DenseT::iterator; using const_iterator = typename DenseT::const_iterator; - const_iterator begin() const { return Dense.begin(); } - const_iterator end() const { return Dense.end(); } - iterator begin() { return Dense.begin(); } - iterator end() { return Dense.end(); } + [[nodiscard]] const_iterator begin() const { return Dense.begin(); } + [[nodiscard]] const_iterator end() const { return Dense.end(); } + [[nodiscard]] iterator begin() { return Dense.begin(); } + [[nodiscard]] iterator end() { return Dense.end(); } /// empty - Returns true if the set is empty. /// /// This is not the same as BitVector::empty(). /// - bool empty() const { return Dense.empty(); } + [[nodiscard]] bool empty() const { return Dense.empty(); } /// size - Returns the number of elements in the set. /// /// This is not the same as BitVector::size() which returns the size of the /// universe. /// - size_type size() const { return Dense.size(); } + [[nodiscard]] size_type size() const { return Dense.size(); } /// clear - Clears the set. This is a very fast constant time operation. /// @@ -222,21 +222,27 @@ class SparseSet { /// @param Key A valid key to find. /// @returns An iterator to the element identified by key, or end(). /// - iterator find(const KeyT &Key) { return findIndex(KeyIndexOf(Key)); } + [[nodiscard]] iterator find(const KeyT &Key) { + return findIndex(KeyIndexOf(Key)); + } - const_iterator find(const KeyT &Key) const { + [[nodiscard]] const_iterator find(const KeyT &Key) const { return const_cast(this)->findIndex(KeyIndexOf(Key)); } /// Check if the set contains the given \c Key. /// /// @param Key A valid key to find. 
- bool contains(const KeyT &Key) const { return find(Key) != end(); } + [[nodiscard]] bool contains(const KeyT &Key) const { + return find(Key) != end(); + } /// count - Returns 1 if this set contains an element identified by Key, /// 0 otherwise. /// - size_type count(const KeyT &Key) const { return contains(Key) ? 1 : 0; } + [[nodiscard]] size_type count(const KeyT &Key) const { + return contains(Key) ? 1 : 0; + } /// insert - Attempts to insert a new element. /// diff --git a/llvm/include/llvm/ADT/StringMap.h b/llvm/include/llvm/ADT/StringMap.h index 2c146fbf08df1..01cbf2d3fff71 100644 --- a/llvm/include/llvm/ADT/StringMap.h +++ b/llvm/include/llvm/ADT/StringMap.h @@ -102,18 +102,18 @@ class StringMapImpl { return reinterpret_cast(TombstoneIntVal); } - unsigned getNumBuckets() const { return NumBuckets; } - unsigned getNumItems() const { return NumItems; } + [[nodiscard]] unsigned getNumBuckets() const { return NumBuckets; } + [[nodiscard]] unsigned getNumItems() const { return NumItems; } - bool empty() const { return NumItems == 0; } - unsigned size() const { return NumItems; } + [[nodiscard]] bool empty() const { return NumItems == 0; } + [[nodiscard]] unsigned size() const { return NumItems; } /// Returns the hash value that will be used for the given string. /// This allows precomputing the value and passing it explicitly /// to some of the functions. /// The implementation of this function is not guaranteed to be stable /// and may change. - LLVM_ABI static uint32_t hash(StringRef Key); + [[nodiscard]] LLVM_ABI static uint32_t hash(StringRef Key); void swap(StringMapImpl &Other) { std::swap(TheTable, Other.TheTable); @@ -220,30 +220,35 @@ class LLVM_ALLOCATORHOLDER_EMPTYBASE StringMap using const_iterator = StringMapIterBase; using iterator = StringMapIterBase; - iterator begin() { return iterator(TheTable, NumBuckets != 0); } - iterator end() { return iterator(TheTable + NumBuckets); } - const_iterator begin() const { + [[nodiscard]] iterator begin() { return iterator(TheTable, NumBuckets != 0); } + [[nodiscard]] iterator end() { return iterator(TheTable + NumBuckets); } + [[nodiscard]] const_iterator begin() const { return const_iterator(TheTable, NumBuckets != 0); } - const_iterator end() const { return const_iterator(TheTable + NumBuckets); } + [[nodiscard]] const_iterator end() const { + return const_iterator(TheTable + NumBuckets); + } - iterator_range> keys() const { + [[nodiscard]] iterator_range> keys() const { return make_range(StringMapKeyIterator(begin()), StringMapKeyIterator(end())); } - iterator find(StringRef Key) { return find(Key, hash(Key)); } + [[nodiscard]] iterator find(StringRef Key) { return find(Key, hash(Key)); } - iterator find(StringRef Key, uint32_t FullHashValue) { + [[nodiscard]] iterator find(StringRef Key, uint32_t FullHashValue) { int Bucket = FindKey(Key, FullHashValue); if (Bucket == -1) return end(); return iterator(TheTable + Bucket); } - const_iterator find(StringRef Key) const { return find(Key, hash(Key)); } + [[nodiscard]] const_iterator find(StringRef Key) const { + return find(Key, hash(Key)); + } - const_iterator find(StringRef Key, uint32_t FullHashValue) const { + [[nodiscard]] const_iterator find(StringRef Key, + uint32_t FullHashValue) const { int Bucket = FindKey(Key, FullHashValue); if (Bucket == -1) return end(); @@ -252,7 +257,7 @@ class LLVM_ALLOCATORHOLDER_EMPTYBASE StringMap /// lookup - Return the entry for the specified key, or a default /// constructed value if no such entry exists. 
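An illustrative sketch (editor's addition, not part of the patch) of the precomputed-hash lookup path documented above; the value type and key are arbitrary:

#include "llvm/ADT/StringMap.h"
#include <cstdint>

static int lookupWithPrecomputedHash(const llvm::StringMap<int> &Map,
                                     llvm::StringRef Key) {
  // Compute the hash once; reuse it when the same key is looked up repeatedly.
  uint32_t H = llvm::StringMap<int>::hash(Key);
  auto It = Map.find(Key, H);
  return It == Map.end() ? 0 : It->second;
}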
- ValueTy lookup(StringRef Key) const { + [[nodiscard]] ValueTy lookup(StringRef Key) const { const_iterator Iter = find(Key); if (Iter != end()) return Iter->second; @@ -261,7 +266,7 @@ class LLVM_ALLOCATORHOLDER_EMPTYBASE StringMap /// at - Return the entry for the specified key, or abort if no such /// entry exists. - const ValueTy &at(StringRef Val) const { + [[nodiscard]] const ValueTy &at(StringRef Val) const { auto Iter = this->find(Val); assert(Iter != this->end() && "StringMap::at failed due to a missing key"); return Iter->second; @@ -272,18 +277,22 @@ class LLVM_ALLOCATORHOLDER_EMPTYBASE StringMap ValueTy &operator[](StringRef Key) { return try_emplace(Key).first->second; } /// contains - Return true if the element is in the map, false otherwise. - bool contains(StringRef Key) const { return find(Key) != end(); } + [[nodiscard]] bool contains(StringRef Key) const { + return find(Key) != end(); + } /// count - Return 1 if the element is in the map, 0 otherwise. - size_type count(StringRef Key) const { return contains(Key) ? 1 : 0; } + [[nodiscard]] size_type count(StringRef Key) const { + return contains(Key) ? 1 : 0; + } template - size_type count(const StringMapEntry &MapEntry) const { + [[nodiscard]] size_type count(const StringMapEntry &MapEntry) const { return count(MapEntry.getKey()); } /// equal - check whether both of the containers are equal. - bool operator==(const StringMap &RHS) const { + [[nodiscard]] bool operator==(const StringMap &RHS) const { if (size() != RHS.size()) return false; @@ -302,7 +311,9 @@ class LLVM_ALLOCATORHOLDER_EMPTYBASE StringMap return true; } - bool operator!=(const StringMap &RHS) const { return !(*this == RHS); } + [[nodiscard]] bool operator!=(const StringMap &RHS) const { + return !(*this == RHS); + } /// insert - Insert the specified key/value pair into the map. If the key /// already exists in the map, return false and ignore the request, otherwise @@ -447,8 +458,12 @@ template class StringMapIterBase { AdvancePastEmptyBuckets(); } - reference operator*() const { return *static_cast(*Ptr); } - pointer operator->() const { return static_cast(*Ptr); } + [[nodiscard]] reference operator*() const { + return *static_cast(*Ptr); + } + [[nodiscard]] pointer operator->() const { + return static_cast(*Ptr); + } StringMapIterBase &operator++() { // Preincrement ++Ptr; diff --git a/llvm/include/llvm/ADT/StringRef.h b/llvm/include/llvm/ADT/StringRef.h index 16aca4d45892d..7aee2aa67ddec 100644 --- a/llvm/include/llvm/ADT/StringRef.h +++ b/llvm/include/llvm/ADT/StringRef.h @@ -90,15 +90,7 @@ namespace llvm { /// Construct a string ref from a cstring. /*implicit*/ constexpr StringRef(const char *Str LLVM_LIFETIME_BOUND) - : Data(Str), Length(Str ? - // GCC 7 doesn't have constexpr char_traits. Fall back to __builtin_strlen. -#if defined(_GLIBCXX_RELEASE) && _GLIBCXX_RELEASE < 8 - __builtin_strlen(Str) -#else - std::char_traits::length(Str) -#endif - : 0) { - } + : StringRef(Str ? std::string_view(Str) : std::string_view()) {} /// Construct a string ref from a pointer and length. 
/*implicit*/ constexpr StringRef(const char *data LLVM_LIFETIME_BOUND, @@ -725,8 +717,8 @@ namespace llvm { split(StringRef Separator) const { size_t Idx = find(Separator); if (Idx == npos) - return std::make_pair(*this, StringRef()); - return std::make_pair(slice(0, Idx), substr(Idx + Separator.size())); + return {*this, StringRef()}; + return {slice(0, Idx), substr(Idx + Separator.size())}; } /// Split into two substrings around the last occurrence of a separator @@ -743,8 +735,8 @@ namespace llvm { rsplit(StringRef Separator) const { size_t Idx = rfind(Separator); if (Idx == npos) - return std::make_pair(*this, StringRef()); - return std::make_pair(slice(0, Idx), substr(Idx + Separator.size())); + return {*this, StringRef()}; + return {slice(0, Idx), substr(Idx + Separator.size())}; } /// Split into substrings around the occurrences of a separator string. diff --git a/llvm/include/llvm/ADT/StringSet.h b/llvm/include/llvm/ADT/StringSet.h index b4853423a1ef3..c8be3f2a503e4 100644 --- a/llvm/include/llvm/ADT/StringSet.h +++ b/llvm/include/llvm/ADT/StringSet.h @@ -57,7 +57,9 @@ class StringSet : public StringMap { } /// Check if the set contains the given \c key. - bool contains(StringRef key) const { return Base::contains(key); } + [[nodiscard]] bool contains(StringRef key) const { + return Base::contains(key); + } }; } // end namespace llvm diff --git a/llvm/include/llvm/ADT/StringTable.h b/llvm/include/llvm/ADT/StringTable.h index 575b3c929e40c..9422a6da1ce8e 100644 --- a/llvm/include/llvm/ADT/StringTable.h +++ b/llvm/include/llvm/ADT/StringTable.h @@ -118,12 +118,8 @@ class StringTable { constexpr Iterator(const Iterator &RHS) = default; constexpr Iterator(Iterator &&RHS) = default; - Iterator &operator=(const Iterator &RHS) { - Table = RHS.Table; - O = RHS.O; - S = RHS.S; - return *this; - } + constexpr Iterator &operator=(const Iterator &RHS) = default; + constexpr Iterator &operator=(Iterator &&RHS) = default; bool operator==(const Iterator &RHS) const { assert(Table == RHS.Table && "Compared iterators for unrelated tables!"); diff --git a/llvm/include/llvm/ADT/ilist_node.h b/llvm/include/llvm/ADT/ilist_node.h index 8d78d5dbbda44..2af1c6ebbffce 100644 --- a/llvm/include/llvm/ADT/ilist_node.h +++ b/llvm/include/llvm/ADT/ilist_node.h @@ -51,12 +51,11 @@ class ilist_iterator_w_bits; template class ilist_sentinel; // Selector for which iterator type to pick given the iterator-bits node option. -template -struct ilist_select_iterator_type { - using type = std::conditional_t, - ilist_iterator>; -}; +template +using ilist_select_iterator_type = + std::conditional_t, + ilist_iterator>; /// Implementation for an ilist node. 
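For orientation, an editor's sketch (not part of the patch) of the split/rsplit behaviour touched in the StringRef hunk above; the input string is arbitrary:

#include "llvm/ADT/StringRef.h"

static void splitExample() {
  llvm::StringRef Path = "usr/lib/llvm";
  auto [Head, Tail] = Path.split('/');  // "usr" and "lib/llvm"
  auto [Dir, Base] = Path.rsplit('/');  // "usr/lib" and "llvm"
  (void)Head; (void)Tail; (void)Dir; (void)Base;
}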
/// @@ -91,18 +90,12 @@ class ilist_node_impl friend class ilist_iterator_w_bits; protected: - using self_iterator = - typename ilist_select_iterator_type::type; - using const_self_iterator = - typename ilist_select_iterator_type::type; + using self_iterator = ilist_select_iterator_type; + using const_self_iterator = ilist_select_iterator_type; using reverse_self_iterator = - typename ilist_select_iterator_type::type; + ilist_select_iterator_type; using const_reverse_self_iterator = - typename ilist_select_iterator_type::type; + ilist_select_iterator_type; ilist_node_impl() = default; diff --git a/llvm/include/llvm/ADT/simple_ilist.h b/llvm/include/llvm/ADT/simple_ilist.h index 7236b3fa5a7d2..fcb2e41f62bf0 100644 --- a/llvm/include/llvm/ADT/simple_ilist.h +++ b/llvm/include/llvm/ADT/simple_ilist.h @@ -92,18 +92,11 @@ class simple_ilist using reference = typename OptionsT::reference; using const_pointer = typename OptionsT::const_pointer; using const_reference = typename OptionsT::const_reference; - using iterator = - typename ilist_select_iterator_type::type; - using const_iterator = - typename ilist_select_iterator_type::type; - using reverse_iterator = - typename ilist_select_iterator_type::type; + using iterator = ilist_select_iterator_type; + using const_iterator = ilist_select_iterator_type; + using reverse_iterator = ilist_select_iterator_type; using const_reverse_iterator = - typename ilist_select_iterator_type::type; + ilist_select_iterator_type; using size_type = size_t; using difference_type = ptrdiff_t; diff --git a/llvm/include/llvm/Analysis/AssumptionCache.h b/llvm/include/llvm/Analysis/AssumptionCache.h index 1b026ef76a45e..5656729d20366 100644 --- a/llvm/include/llvm/Analysis/AssumptionCache.h +++ b/llvm/include/llvm/Analysis/AssumptionCache.h @@ -28,6 +28,7 @@ namespace llvm { class AssumeInst; +struct OperandBundleUse; class Function; class raw_ostream; class TargetTransformInfo; @@ -65,7 +66,7 @@ class AssumptionCache { /// Vector of weak value handles to calls of the \@llvm.assume /// intrinsic. - SmallVector AssumeHandles; + SmallVector AssumeHandles; class LLVM_ABI AffectedValueCallbackVH final : public CallbackVH { AssumptionCache *AC; @@ -148,7 +149,7 @@ class AssumptionCache { /// FIXME: We should replace this with pointee_iterator> /// when we can write that to filter out the null values. Then caller code /// will become simpler. - MutableArrayRef assumptions() { + MutableArrayRef assumptions() { if (!Scanned) scanFunction(); return AssumeHandles; @@ -165,6 +166,11 @@ class AssumptionCache { return AVI->second; } + + /// Determine which values are affected by this assume operand bundle. + static void + findValuesAffectedByOperandBundle(OperandBundleUse Bundle, + function_ref InsertAffected); }; /// A function analysis which provides an \c AssumptionCache. diff --git a/llvm/include/llvm/Analysis/InterestingMemoryOperand.h b/llvm/include/llvm/Analysis/InterestingMemoryOperand.h new file mode 100644 index 0000000000000..e8124f72a1a81 --- /dev/null +++ b/llvm/include/llvm/Analysis/InterestingMemoryOperand.h @@ -0,0 +1,60 @@ +//===- InterestingMemoryOperand.h -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the InterestingMemoryOperand class, which is used when +// collecting information about a memory reference instruction. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_INTERESTINGMEMORYOPERAND_H +#define LLVM_ANALYSIS_INTERESTINGMEMORYOPERAND_H + +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instruction.h" +#include "llvm/Support/TypeSize.h" + +namespace llvm { +class InterestingMemoryOperand { +public: + Use *PtrUse; + bool IsWrite; + Type *OpType; + TypeSize TypeStoreSize = TypeSize::getFixed(0); + MaybeAlign Alignment; + // The mask Value, if we're looking at a masked load/store. + Value *MaybeMask; + // The EVL Value, if we're looking at a vp intrinsic. + Value *MaybeEVL; + // The Stride Value, if we're looking at a strided load/store. + Value *MaybeStride; + // The Offset Value, if we're looking at an indexed load/store. The + // offset is a byte offset rather than an array index. + Value *MaybeByteOffset; + + InterestingMemoryOperand(Instruction *I, unsigned OperandNo, bool IsWrite, + class Type *OpType, MaybeAlign Alignment, + Value *MaybeMask = nullptr, + Value *MaybeEVL = nullptr, + Value *MaybeStride = nullptr, + Value *MaybeByteOffset = nullptr) + : IsWrite(IsWrite), OpType(OpType), Alignment(Alignment), + MaybeMask(MaybeMask), MaybeEVL(MaybeEVL), MaybeStride(MaybeStride), + MaybeByteOffset(MaybeByteOffset) { + const DataLayout &DL = I->getDataLayout(); + TypeStoreSize = DL.getTypeStoreSizeInBits(OpType); + PtrUse = &I->getOperandUse(OperandNo); + } + + Instruction *getInsn() { return cast(PtrUse->getUser()); } + + Value *getPtr() { return PtrUse->get(); } +}; + +} // namespace llvm + +#endif // LLVM_ANALYSIS_INTERESTINGMEMORYOPERAND_H diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 49a795b5fd6a7..52ab38583d5de 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -413,30 +413,29 @@ class MemoryDepChecker { uint64_t MaxStride; std::optional CommonStride; - /// TypeByteSize is a pair of alloc sizes of the source and sink. - std::pair TypeByteSize; - - // HasSameSize is a boolean indicating whether the store sizes of the source - // and sink are equal. - // TODO: Remove this. - bool HasSameSize; + /// TypeByteSize is either the common store size of both accesses, or 0 when + /// store sizes mismatch. + uint64_t TypeByteSize; bool AIsWrite; bool BIsWrite; DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t MaxStride, std::optional CommonStride, - std::pair TypeByteSize, - bool HasSameSize, bool AIsWrite, bool BIsWrite) + uint64_t TypeByteSize, bool AIsWrite, + bool BIsWrite) : Dist(Dist), MaxStride(MaxStride), CommonStride(CommonStride), - TypeByteSize(TypeByteSize), HasSameSize(HasSameSize), - AIsWrite(AIsWrite), BIsWrite(BIsWrite) {} + TypeByteSize(TypeByteSize), AIsWrite(AIsWrite), BIsWrite(BIsWrite) {} }; /// Get the dependence distance, strides, type size and whether it is a write - /// for the dependence between A and B. Returns either a DepType, the - /// dependence result, if it could already be determined, or a - /// DepDistanceStrideAndSizeInfo struct. + /// for the dependence between A and B. Returns a DepType if we can prove + /// there's no dependence or the analysis fails.
Outlined to lambda to limit + /// the scope of various temporary variables, like A/BPtr, StrideA/BPtr and + /// others. Returns either the dependence result, if it could already be + /// determined, or a DepDistanceStrideAndSizeInfo struct, noting that + /// TypeByteSize could be 0 when store sizes mismatch, and this should be + /// checked in the caller. std::variant getDependenceDistanceStrideAndSize(const MemAccessInfo &A, Instruction *AInst, const MemAccessInfo &B, diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 41ff54f0781a2..7a4abe9ee5082 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -25,6 +25,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitmaskEnum.h" #include "llvm/Analysis/IVDescriptors.h" +#include "llvm/Analysis/InterestingMemoryOperand.h" #include "llvm/IR/FMF.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/PassManager.h" @@ -88,6 +89,8 @@ struct MemIntrinsicInfo { bool WriteMem = false; bool IsVolatile = false; + SmallVector InterestingOperands; + bool isUnordered() const { return (Ordering == AtomicOrdering::NotAtomic || Ordering == AtomicOrdering::Unordered) && diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index e6a0eae9da30c..d976d40e5e956 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -187,6 +187,9 @@ enum Kind { kw_graalcc, kw_riscv_vector_cc, kw_riscv_vls_cc, + kw_cheriot_compartmentcallcc, + kw_cheriot_compartmentcalleecc, + kw_cheriot_librarycallcc, // Attributes: kw_attributes, diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index c04380667a640..08a7ddb6929f5 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -185,6 +185,15 @@ enum class DescriptorRangeFlags : uint32_t { LLVM_ABI ArrayRef> getDescriptorRangeFlags(); +#define STATIC_SAMPLER_FLAG(Num, Enum, Flag) Enum = Num, +enum class StaticSamplerFlags : uint32_t { +#include "DXContainerConstants.def" + + LLVM_MARK_AS_BITMASK_ENUM(NonNormalizedCoordinates) +}; + +LLVM_ABI ArrayRef> getStaticSamplerFlags(); + #define ROOT_PARAMETER(Val, Enum) Enum = Val, enum class RootParameterType : uint32_t { #include "DXContainerConstants.def" @@ -813,6 +822,22 @@ struct DescriptorRange { } }; } // namespace v2 + +namespace v3 { +struct StaticSampler : public v1::StaticSampler { + uint32_t Flags; + + StaticSampler() = default; + explicit StaticSampler(v1::StaticSampler &Base) + : v1::StaticSampler(Base), Flags(0U) {} + + void swapBytes() { + v1::StaticSampler::swapBytes(); + sys::swapByteOrder(Flags); + } +}; + +} // namespace v3 } // namespace RTS0 // D3D_ROOT_SIGNATURE_VERSION diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 889653611d79a..f576d958037cd 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -104,6 +104,16 @@ DESCRIPTOR_RANGE_FLAG(0x10000, DescriptorsStaticKeepingBufferBoundsChecks, DESCR #undef DESCRIPTOR_RANGE_FLAG #endif // DESCRIPTOR_RANGE_FLAG +// STATIC_SAMPLER_FLAG(flag value, name, flag).
+#ifdef STATIC_SAMPLER_FLAG + +STATIC_SAMPLER_FLAG(0x0, None, SAMPLER_FLAG_NONE) +STATIC_SAMPLER_FLAG(0x1, UintBorderColor, SAMPLER_FLAG_UINT_BORDER_COLOR) +STATIC_SAMPLER_FLAG(0x2, NonNormalizedCoordinates, SAMPLER_FLAG_NON_NORMALIZED_COORDINATES) + +#undef STATIC_SAMPLER_FLAG +#endif // STATIC_SAMPLER_FLAG + #ifdef ROOT_PARAMETER ROOT_PARAMETER(0, DescriptorTable) diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 7e880fdec3478..e619b186dfe3d 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -303,7 +303,7 @@ enum { EM_BA2 = 202, // Beyond BA2 CPU architecture EM_XCORE = 203, // XMOS xCORE processor family EM_MCHP_PIC = 204, // Microchip 8-bit PIC(r) family - EM_INTEL205 = 205, // Reserved by Intel + EM_INTELGT = 205, // Intel Graphics Technology EM_INTEL206 = 206, // Reserved by Intel EM_INTEL207 = 207, // Reserved by Intel EM_INTEL208 = 208, // Reserved by Intel diff --git a/llvm/include/llvm/Bitstream/BitstreamWriter.h b/llvm/include/llvm/Bitstream/BitstreamWriter.h index 5f53681320ce4..a2938642f824a 100644 --- a/llvm/include/llvm/Bitstream/BitstreamWriter.h +++ b/llvm/include/llvm/Bitstream/BitstreamWriter.h @@ -87,7 +87,7 @@ class BitstreamWriter { void WriteWord(unsigned Value) { Value = - support::endian::byte_swap(Value); + support::endian::byte_swap(Value, llvm::endianness::little); Buffer.append(reinterpret_cast(&Value), reinterpret_cast(&Value + 1)); } diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h index 71317619098ad..4c744a2c0a4d2 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -17,6 +17,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/StaticDataProfileInfo.h" @@ -192,28 +193,28 @@ class LLVM_ABI AsmPrinter : public MachineFunctionPass { /// Store symbols and type identifiers used to create callgraph section /// entries related to a function. - struct FunctionInfo { + struct FunctionCallGraphInfo { /// Numeric type identifier used in callgraph section for indirect calls /// and targets. using CGTypeId = uint64_t; - /// Enumeration of function kinds, and their mapping to function kind values - /// stored in callgraph section entries. - /// Must match the enum in llvm/tools/llvm-objdump/llvm-objdump.cpp. - enum class FunctionKind : uint64_t { - /// Function cannot be target to indirect calls. - NOT_INDIRECT_TARGET = 0, - - /// Function may be target to indirect calls but its type id is unknown. - INDIRECT_TARGET_UNKNOWN_TID = 1, - - /// Function may be target to indirect calls and its type id is known. - INDIRECT_TARGET_KNOWN_TID = 2, - }; - /// Map type identifiers to callsite labels. Labels are generated for each /// indirect callsite in the function. SmallVector> CallSiteLabels; + SmallSet DirectCallees; + }; + + /// Enumeration of function kinds, and their mapping to function kind values + /// stored in callgraph section entries. + enum class FunctionKind : uint64_t { + /// Function cannot be target to indirect calls. + NOT_INDIRECT_TARGET = 0, + + /// Function may be target to indirect calls but its type id is unknown. + INDIRECT_TARGET_UNKNOWN_TID = 1, + + /// Function may be target to indirect calls and its type id is known. 
+ INDIRECT_TARGET_KNOWN_TID = 2, }; enum CallGraphSectionFormatVersion : uint64_t { @@ -385,10 +386,11 @@ class LLVM_ABI AsmPrinter : public MachineFunctionPass { /// are available. Returns empty string otherwise. StringRef getConstantSectionSuffix(const Constant *C) const; - /// Generate and emit labels for callees of the indirect callsites which will - /// be used to populate the .callgraph section. - void emitIndirectCalleeLabels( - FunctionInfo &FuncInfo, + /// Iff MI is an indirect call, generate and emit a label after the callsite, + /// which will be used to populate the .callgraph section. For direct + /// callsites, add the callee symbol to the direct callees list of FuncCGInfo. + void handleCallsiteForCallgraph( + FunctionCallGraphInfo &FuncCGInfo, const MachineFunction::CallSiteInfoMap &CallSitesInfoMap, const MachineInstr &MI); @@ -479,7 +481,8 @@ class LLVM_ABI AsmPrinter : public MachineFunctionPass { void emitKCFITrapEntry(const MachineFunction &MF, const MCSymbol *Symbol); virtual void emitKCFITypeId(const MachineFunction &MF); - void emitCallGraphSection(const MachineFunction &MF, FunctionInfo &FuncInfo); + void emitCallGraphSection(const MachineFunction &MF, + FunctionCallGraphInfo &FuncCGInfo); void emitPseudoProbe(const MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h index f0cfa7663c5fa..82dd5feb31dba 100644 --- a/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h +++ b/llvm/include/llvm/CodeGen/BasicBlockSectionsProfileReader.h @@ -50,6 +50,10 @@ struct FunctionPathAndClusterInfo { // the edge a -> b (a is not cloned). The index of the path in this vector // determines the `UniqueBBID::CloneID` of the cloned blocks in that path. SmallVector> ClonePaths; + // Node counts for each basic block. + DenseMap NodeCounts; + // Edge counts for each edge, stored as a nested map. + DenseMap> EdgeCounts; }; class BasicBlockSectionsProfileReader { @@ -77,6 +81,11 @@ class BasicBlockSectionsProfileReader { SmallVector> getClonePathsForFunction(StringRef FuncName) const; + // Returns the profile count for the edge from `SrcBBID` to `SinkBBID` in + // function `FuncName` or zero if it does not exist. + uint64_t getEdgeCount(StringRef FuncName, const UniqueBBID &SrcBBID, + const UniqueBBID &SinkBBID) const; + private: StringRef getAliasName(StringRef FuncName) const { auto R = FuncAliasMap.find(FuncName); @@ -183,6 +192,9 @@ class BasicBlockSectionsProfileReaderWrapperPass : public ImmutablePass { SmallVector> getClonePathsForFunction(StringRef FuncName) const; + uint64_t getEdgeCount(StringRef FuncName, const UniqueBBID &SrcBBID, + const UniqueBBID &DestBBID) const; + // Initializes the FunctionNameToDIFilename map for the current module and // then reads the profile for the matching functions. bool doInitialization(Module &M) override; diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index dce423fc1b18b..42ddb32d24093 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2929,7 +2929,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { CostKind); EVT VT = TLI->getValueType(DL, CmpTy, true); - if (TLI->shouldExpandCmpUsingSelects(VT)) { + if (TLI->preferSelectsOverBooleanArithmetic(VT)) { // x < y ? -1 : (x > y ?
1 : 0) Cost += 2 * thisT()->getCmpSelInstrCost( BinaryOperator::Select, RetTy, CondTy, diff --git a/llvm/include/llvm/CodeGen/CalcSpillWeights.h b/llvm/include/llvm/CodeGen/CalcSpillWeights.h index 5a86dd9650fbd..11d1c16561507 100644 --- a/llvm/include/llvm/CodeGen/CalcSpillWeights.h +++ b/llvm/include/llvm/CodeGen/CalcSpillWeights.h @@ -81,6 +81,14 @@ class VirtRegMap; static bool isRematerializable(const LiveInterval &LI, const LiveIntervals &LIS, const VirtRegMap &VRM, + const MachineRegisterInfo &MRI, + const TargetInstrInfo &TII); + + /// \returns true if all registers used by \p MI are also available with the + /// same value at \p UseIdx. + static bool allUsesAvailableAt(const MachineInstr *MI, SlotIndex UseIdx, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI, const TargetInstrInfo &TII); protected: diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index 99d3cd0aac85c..0b6033b4ba60a 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -2463,6 +2463,11 @@ class LLVM_ABI MachineIRBuilder { return buildInstr(TargetOpcode::G_GET_ROUNDING, {Dst}, {}); } + /// Build and insert G_SET_ROUNDING + MachineInstrBuilder buildSetRounding(const SrcOp &Src) { + return buildInstr(TargetOpcode::G_SET_ROUNDING, {}, {Src}); + } + virtual MachineInstrBuilder buildInstr(unsigned Opc, ArrayRef DstOps, ArrayRef SrcOps, std::optional Flags = std::nullopt); diff --git a/llvm/include/llvm/CodeGen/LiveInterval.h b/llvm/include/llvm/CodeGen/LiveInterval.h index e1c5717f5face..f18c177b1c35b 100644 --- a/llvm/include/llvm/CodeGen/LiveInterval.h +++ b/llvm/include/llvm/CodeGen/LiveInterval.h @@ -83,8 +83,16 @@ namespace llvm { /// Mark this value as unused. void markUnused() { def = SlotIndex(); } + + LLVM_ABI void print(raw_ostream &OS) const; + LLVM_ABI void dump() const; }; + inline raw_ostream &operator<<(raw_ostream &OS, const VNInfo &VNI) { + VNI.print(OS); + return OS; + } + /// Result of a LiveRange query. This class hides the implementation details /// of live ranges, and it should be used as the primary interface for /// examining live ranges around instructions. diff --git a/llvm/include/llvm/CodeGen/LiveRangeEdit.h b/llvm/include/llvm/CodeGen/LiveRangeEdit.h index db1785de255f0..6473138a801f7 100644 --- a/llvm/include/llvm/CodeGen/LiveRangeEdit.h +++ b/llvm/include/llvm/CodeGen/LiveRangeEdit.h @@ -189,11 +189,6 @@ class LiveRangeEdit : private MachineRegisterInfo::Delegate { explicit Remat(const VNInfo *ParentVNI) : ParentVNI(ParentVNI) {} }; - /// allUsesAvailableAt - Return true if all registers used by OrigMI at - /// OrigIdx are also available with the same value at UseIdx. - bool allUsesAvailableAt(const MachineInstr *OrigMI, SlotIndex OrigIdx, - SlotIndex UseIdx) const; - /// canRematerializeAt - Determine if ParentVNI can be rematerialized at /// UseIdx. It is assumed that parent_.getVNINfoAt(UseIdx) == ParentVNI. 
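A small editor's sketch (not part of the patch) of the new MachineIRBuilder helper introduced above; the builder and register are assumed to come from surrounding GlobalISel lowering code:

#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"

static void emitSetRounding(llvm::MachineIRBuilder &MIRBuilder,
                            llvm::Register RoundingModeReg) {
  // Builds and inserts G_SET_ROUNDING with the new mode as its only source.
  MIRBuilder.buildSetRounding(RoundingModeReg);
}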
bool canRematerializeAt(Remat &RM, VNInfo *OrigVNI, SlotIndex UseIdx); diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h index a91c26ee1122a..c7304e386b542 100644 --- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -634,19 +634,36 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::CalledGlobal) namespace llvm { namespace yaml { -// Struct representing one save/restore point in the 'savePoint'/'restorePoint' -// list +// Struct representing one save/restore point in the 'savePoint' / +// 'restorePoint' list. One point consists of a machine basic block name and a +// list of registers saved/restored in this basic block. In MIR it looks like: +// savePoint: +// - point: '%bb.1' +// registers: +// - '$rbx' +// - '$r12' +// ... +// restorePoint: +// - point: '%bb.1' +// registers: +// - '$rbx' +// - '$r12' +// If no register is saved/restored in the selected BB, +// field 'registers' is not specified. struct SaveRestorePointEntry { StringValue Point; + std::vector Registers; bool operator==(const SaveRestorePointEntry &Other) const { - return Point == Other.Point; + return Point == Other.Point && Registers == Other.Registers; } }; template <> struct MappingTraits { static void mapping(IO &YamlIO, SaveRestorePointEntry &Entry) { YamlIO.mapRequired("point", Entry.Point); + YamlIO.mapOptional("registers", Entry.Registers, + std::vector()); } }; diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index 94139b64a3e30..71739278cf513 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -505,6 +505,11 @@ class MachineBasicBlock LLVM_ABI void removeLiveIn(MCRegister Reg, LaneBitmask LaneMask = LaneBitmask::getAll()); + /// Remove the specified register from any overlapping live-in. The method is + /// subreg-aware and removes Reg and its subregs from the live-in set. It also + /// clears the corresponding bitmask from its live-in super registers. + LLVM_ABI void removeLiveInOverlappedWith(MCRegister Reg); + /// Return true if the specified register is in the live in set. LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask = LaneBitmask::getAll()) const; @@ -1035,7 +1040,9 @@ class MachineBasicBlock /// Succ, can be split. If this returns true a subsequent call to /// SplitCriticalEdge is guaranteed to return a valid basic block if /// no changes occurred in the meantime. - LLVM_ABI bool canSplitCriticalEdge(const MachineBasicBlock *Succ) const; + LLVM_ABI bool + canSplitCriticalEdge(const MachineBasicBlock *Succ, + const MachineLoopInfo *MLI = nullptr) const; void pop_front() { Insts.pop_front(); } void pop_back() { Insts.pop_back(); } @@ -1287,6 +1294,15 @@ class MachineBasicBlock // Helper function for MIRPrinter. LLVM_ABI bool canPredictBranchProbabilities() const; + /// Iterate over block PHI instructions and remove all incoming values for + /// PredMBB. + /// + /// The method does not erase PHI instructions even if they are left with a + /// single incoming value or no incoming values at all. It is the caller's + /// responsibility to decide how to process the PHI instructions after their + /// incoming values are removed. + LLVM_ABI void + removePHIsIncomingValuesForPredecessor(const MachineBasicBlock &PredMBB); + private: /// Return probability iterator corresponding to the I successor iterator.
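A possible caller pattern for the new per-block PHI helper above, sketched by the editor (not part of the patch); deciding what to do with PHIs left with a single incoming value remains the caller's job, and the names here are assumptions:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"

static void dropPredecessorFromPHIs(llvm::MachineBasicBlock &MBB,
                                    const llvm::MachineBasicBlock &PredMBB) {
  MBB.removePHIsIncomingValuesForPredecessor(PredMBB);
  for (llvm::MachineInstr &Phi : MBB.phis()) {
    // A PHI left with one (value, block) pair has three operands:
    // the def, the remaining value, and its predecessor block.
    if (Phi.getNumOperands() == 3) {
      // Caller's choice: e.g. replace uses of the def and erase the PHI.
    }
  }
}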
probability_iterator getProbabilityIterator(succ_iterator I); diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h index e666001035deb..00c734330a40b 100644 --- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h +++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h @@ -76,6 +76,9 @@ class CalleeSavedInfo { bool isSpilledToReg() const { return SpilledToReg; } }; +using SaveRestorePoints = + DenseMap>; + /// The MachineFrameInfo class represents an abstract stack frame until /// prolog/epilog code is inserted. This class is key to allowing stack frame /// representation optimizations, such as frame pointer elimination. It also @@ -333,9 +336,9 @@ class MachineFrameInfo { bool HasTailCall = false; /// Not empty, if shrink-wrapping found a better place for the prologue. - SmallVector SavePoints; + SaveRestorePoints SavePoints; /// Not empty, if shrink-wrapping found a better place for the epilogue. - SmallVector RestorePoints; + SaveRestorePoints RestorePoints; /// Size of the UnsafeStack Frame uint64_t UnsafeStackSize = 0; @@ -825,17 +828,21 @@ class MachineFrameInfo { void setCalleeSavedInfoValid(bool v) { CSIValid = v; } - ArrayRef getSavePoints() const { return SavePoints; } - void setSavePoints(ArrayRef NewSavePoints) { - SavePoints.assign(NewSavePoints.begin(), NewSavePoints.end()); - } - ArrayRef getRestorePoints() const { - return RestorePoints; + const SaveRestorePoints &getRestorePoints() const { return RestorePoints; } + + const SaveRestorePoints &getSavePoints() const { return SavePoints; } + + void setSavePoints(SaveRestorePoints NewSavePoints) { + SavePoints = std::move(NewSavePoints); } - void setRestorePoints(ArrayRef NewRestorePoints) { - RestorePoints.assign(NewRestorePoints.begin(), NewRestorePoints.end()); + + void setRestorePoints(SaveRestorePoints NewRestorePoints) { + RestorePoints = std::move(NewRestorePoints); } + void clearSavePoints() { SavePoints.clear(); } + void clearRestorePoints() { RestorePoints.clear(); } + uint64_t getUnsafeStackSize() const { return UnsafeStackSize; } void setUnsafeStackSize(uint64_t Size) { UnsafeStackSize = Size; } diff --git a/llvm/include/llvm/CodeGen/MachineInstr.h b/llvm/include/llvm/CodeGen/MachineInstr.h index 10a9b1ff1411d..4fcb7f36e0238 100644 --- a/llvm/include/llvm/CodeGen/MachineInstr.h +++ b/llvm/include/llvm/CodeGen/MachineInstr.h @@ -1229,7 +1229,7 @@ class MachineInstr /// Returns true if this instruction is a candidate for remat. /// This flag is deprecated, please don't use it anymore. If this - /// flag is set, the isReallyTriviallyReMaterializable() method is called to + /// flag is set, the isReMaterializableImpl() method is called to /// verify the instruction is really rematerializable. bool isRematerializable(QueryType Type = AllInBundle) const { // It's only possible to re-mat a bundle if all bundled instructions are @@ -2000,6 +2000,15 @@ class MachineInstr /// and point them to \p Reg instead. LLVM_ABI void changeDebugValuesDefReg(Register Reg); + /// Remove all incoming values of the PHI instruction for the given block. + /// + /// Returns the number of removed operands. + /// + /// The method does not erase the PHI instruction even if it is left with a + /// single incoming value or no incoming values at all. It is the caller's + /// responsibility to decide how to process the PHI instruction after its + /// incoming values are removed. + LLVM_ABI unsigned removePHIIncomingValueFor(const MachineBasicBlock &MBB); + /// Sets all register debug operands in this debug value instruction to be /// undef.
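An editor's sketch (not part of the patch) of filling the new per-block save-point map from the MachineFrameInfo hunk above; the block and register list are placeholders:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include <utility>

static void recordSavePoint(llvm::MachineFrameInfo &MFI,
                            llvm::MachineBasicBlock *SaveMBB,
                            llvm::ArrayRef<llvm::Register> SavedRegs) {
  llvm::SaveRestorePoints Saves;
  Saves[SaveMBB].assign(SavedRegs.begin(), SavedRegs.end());
  MFI.setSavePoints(std::move(Saves)); // the setter takes the map by value
}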
void setDebugValueUndef() { diff --git a/llvm/include/llvm/CodeGen/RDFGraph.h b/llvm/include/llvm/CodeGen/RDFGraph.h index 8a93afbcb5491..6bb6033a8a2f2 100644 --- a/llvm/include/llvm/CodeGen/RDFGraph.h +++ b/llvm/include/llvm/CodeGen/RDFGraph.h @@ -447,7 +447,7 @@ struct NodeAllocator { AllocatorTy MemPool; }; -using RegisterSet = std::set; +using RegisterSet = std::set; struct TargetOperandInfo { TargetOperandInfo(const TargetInstrInfo &tii) : TII(tii) {} diff --git a/llvm/include/llvm/CodeGen/RDFRegisters.h b/llvm/include/llvm/CodeGen/RDFRegisters.h index 4a9a4063c9e83..82027cad53bdb 100644 --- a/llvm/include/llvm/CodeGen/RDFRegisters.h +++ b/llvm/include/llvm/CodeGen/RDFRegisters.h @@ -199,6 +199,33 @@ struct PhysicalRegisterInfo { std::vector AliasInfos; }; +struct RegisterRefEqualTo { + constexpr RegisterRefEqualTo(const llvm::rdf::PhysicalRegisterInfo &pri) + : PRI(&pri) {} + + bool operator()(llvm::rdf::RegisterRef A, llvm::rdf::RegisterRef B) const { + return PRI->equal_to(A, B); + } + +private: + // Make it a pointer just in case. See comment in `RegisterRefLess` below. + const llvm::rdf::PhysicalRegisterInfo *PRI; +}; + +struct RegisterRefLess { + constexpr RegisterRefLess(const llvm::rdf::PhysicalRegisterInfo &pri) + : PRI(&pri) {} + + bool operator()(llvm::rdf::RegisterRef A, llvm::rdf::RegisterRef B) const { + return PRI->less(A, B); + } + +private: + // Make it a pointer because apparently some versions of MSVC use std::swap + // on the comparator object. + const llvm::rdf::PhysicalRegisterInfo *PRI; +}; + struct RegisterAggr { RegisterAggr(const PhysicalRegisterInfo &pri) : Units(pri.getTRI().getNumRegUnits()), PRI(pri) {} @@ -334,18 +361,6 @@ template <> struct hash { } }; -template <> struct equal_to { - constexpr equal_to(const llvm::rdf::PhysicalRegisterInfo &pri) : PRI(&pri) {} - - bool operator()(llvm::rdf::RegisterRef A, llvm::rdf::RegisterRef B) const { - return PRI->equal_to(A, B); - } - -private: - // Make it a pointer just in case. See comment in `less` below. - const llvm::rdf::PhysicalRegisterInfo *PRI; -}; - template <> struct equal_to { bool operator()(const llvm::rdf::RegisterAggr &A, const llvm::rdf::RegisterAggr &B) const { @@ -353,23 +368,10 @@ template <> struct equal_to { } }; -template <> struct less { - constexpr less(const llvm::rdf::PhysicalRegisterInfo &pri) : PRI(&pri) {} - - bool operator()(llvm::rdf::RegisterRef A, llvm::rdf::RegisterRef B) const { - return PRI->less(A, B); - } - -private: - // Make it a pointer because apparently some versions of MSVC use std::swap - // on the std::less specialization. 
- const llvm::rdf::PhysicalRegisterInfo *PRI; -}; - } // namespace std namespace llvm::rdf { -using RegisterSet = std::set>; +using RegisterSet = std::set; } // namespace llvm::rdf #endif // LLVM_CODEGEN_RDFREGISTERS_H diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 21dec19e3cb9d..201dc68de8b76 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -597,9 +597,9 @@ struct BinaryOpc_match { unsigned Opcode; LHS_P LHS; RHS_P RHS; - std::optional Flags; + SDNodeFlags Flags; BinaryOpc_match(unsigned Opc, const LHS_P &L, const RHS_P &R, - std::optional Flgs = std::nullopt) + SDNodeFlags Flgs = SDNodeFlags()) : Opcode(Opc), LHS(L), RHS(R), Flags(Flgs) {} template @@ -613,10 +613,7 @@ struct BinaryOpc_match { RHS.match(Ctx, N->getOperand(EO.FirstIndex))))) return false; - if (!Flags.has_value()) - return true; - - return (*Flags & N->getFlags()) == *Flags; + return (Flags & N->getFlags()) == Flags; } return false; diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index f2ad5ee249b46..175f205328361 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -168,10 +168,22 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { /// registers so that the instructions result is independent of the place /// in the function. bool isTriviallyReMaterializable(const MachineInstr &MI) const { + if (!isReMaterializable(MI)) + return false; + for (const MachineOperand &MO : MI.all_uses()) { + if (MO.getReg().isVirtual()) + return false; + } + return true; + } + + /// Return true if the instruction would be materializable at a point + /// in the containing function where all virtual register uses were + /// known to be live and available in registers. + bool isReMaterializable(const MachineInstr &MI) const { return (MI.getOpcode() == TargetOpcode::IMPLICIT_DEF && MI.getNumOperands() == 1) || - (MI.getDesc().isRematerializable() && - isReallyTriviallyReMaterializable(MI)); + (MI.getDesc().isRematerializable() && isReMaterializableImpl(MI)); } /// Given \p MO is a PhysReg use return if it can be ignored for the purpose @@ -194,11 +206,10 @@ class LLVM_ABI TargetInstrInfo : public MCInstrInfo { protected: /// For instructions with opcodes for which the M_REMATERIALIZABLE flag is /// set, this hook lets the target specify whether the instruction is actually - /// trivially rematerializable, taking into consideration its operands. This + /// rematerializable, taking into consideration its operands. This /// predicate must return false if the instruction has any side effects other - /// than producing a value, or if it requres any address registers that are - /// not always available. - virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const; + /// than producing a value. + virtual bool isReMaterializableImpl(const MachineInstr &MI) const; /// This method commutes the operands of the given machine instruction MI. /// The operands to be commuted are specified by their indices OpIdx1 and diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 4c2d991308d30..c45e03a7bdad8 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3455,6 +3455,10 @@ class LLVM_ABI TargetLoweringBase { /// matching of other patterns. 
virtual bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const { + // Form it if it is legal. + if (isOperationLegal(Opcode, VT)) + return true; + // TODO: The default logic is inherited from code in CodeGenPrepare. // The opcode should not make a difference by default? if (Opcode != ISD::UADDO) @@ -3505,9 +3509,10 @@ class LLVM_ABI TargetLoweringBase { return isOperationLegalOrCustom(Op, VT); } - /// Should we expand [US]CMP nodes using two selects and two compares, or by - /// doing arithmetic on boolean types - virtual bool shouldExpandCmpUsingSelects(EVT VT) const { return false; } + /// Should we prefer selects to doing arithmetic on boolean types + virtual bool preferSelectsOverBooleanArithmetic(EVT VT) const { + return false; + } /// True if target has some particular form of dealing with pointer arithmetic /// semantics for pointers with the given value type. False if pointer diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index 44edec98d20f3..9ea127dd15943 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -367,6 +367,10 @@ def aarch64mfp8 : ValueType<8, 253>; // 8-bit value in FPR (AArch64) def c64 : VTCheriCapability<64, 254>; // 64-bit CHERI capability value def c128 : VTCheriCapability<128, 255>; // 128-bit CHERI capability value +// Pseudo valuetype mapped to the current CHERI capability pointer size. +// Should only be used in TableGen. +def cPTR : VTAny<503>; + let isNormalValueType = false in { def token : ValueType<0, 504>; // TokenTy def MetadataVT : ValueType<0, 505> { // Metadata diff --git a/llvm/include/llvm/CodeGenTypes/MachineValueType.h b/llvm/include/llvm/CodeGenTypes/MachineValueType.h index e4114ae957c70..69d52e33d900f 100644 --- a/llvm/include/llvm/CodeGenTypes/MachineValueType.h +++ b/llvm/include/llvm/CodeGenTypes/MachineValueType.h @@ -582,6 +582,12 @@ namespace llvm { MVT::LAST_FP_SCALABLE_VECTOR_VALUETYPE, force_iteration_on_noniterable_enum); } + + static auto cheri_capability_valuetypes() { + return enum_seq_inclusive(MVT::FIRST_CHERI_CAPABILITY_VALUETYPE, + MVT::LAST_CHERI_CAPABILITY_VALUETYPE, + force_iteration_on_noniterable_enum); + } /// @} }; diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h index ee9a87e25e15d..4caf1236dc0fb 100644 --- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h +++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVObject.h @@ -82,6 +82,8 @@ using LVScopes = SmallVector; using LVSymbols = SmallVector; using LVTypes = SmallVector; +using LVElementsView = detail::concat_range; using LVOffsets = SmallVector; // The following DWARF documents detail the 'tombstone' concept: diff --git a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h index a453923d032e4..f4f3516769938 100644 --- a/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h +++ b/llvm/include/llvm/DebugInfo/LogicalView/Core/LVScope.h @@ -14,6 +14,7 @@ #ifndef LLVM_DEBUGINFO_LOGICALVIEW_CORE_LVSCOPE_H #define LLVM_DEBUGINFO_LOGICALVIEW_CORE_LVSCOPE_H +#include "llvm/ADT/STLExtras.h" #include "llvm/DebugInfo/LogicalView/Core/LVElement.h" #include "llvm/DebugInfo/LogicalView/Core/LVLocation.h" #include "llvm/DebugInfo/LogicalView/Core/LVSort.h" @@ -94,6 +95,11 @@ class LLVM_ABI LVScope : public LVElement { LVProperties Kinds; LVProperties Properties; static LVScopeDispatch Dispatch; + // Empty containers used in 
`getChildren()` in case there are no Types, + // Symbols, or Scopes. + static const LVTypes EmptyTypes; + static const LVSymbols EmptySymbols; + static const LVScopes EmptyScopes; // Size in bits if this scope represents also a compound type. uint32_t BitSize = 0; @@ -128,14 +134,6 @@ class LLVM_ABI LVScope : public LVElement { std::unique_ptr Lines; std::unique_ptr Ranges; - // Vector of elements (types, scopes and symbols). - // It is the union of (*Types, *Symbols and *Scopes) to be used for - // the following reasons: - // - Preserve the order the logical elements are read in. - // - To have a single container with all the logical elements, when - // the traversal does not require any specific element kind. - std::unique_ptr Children; - // Resolve the template parameters/arguments relationship. void resolveTemplate(); void printEncodedArgs(raw_ostream &OS, bool Full) const; @@ -213,7 +211,23 @@ class LLVM_ABI LVScope : public LVElement { const LVScopes *getScopes() const { return Scopes.get(); } const LVSymbols *getSymbols() const { return Symbols.get(); } const LVTypes *getTypes() const { return Types.get(); } - const LVElements *getChildren() const { return Children.get(); } + // Return a view over the union of child Scopes, Types, and Symbols, in that + // order. + // + // Calling `LVScope::sort()` ensures that each of the groups is sorted + // according to the given criteria (see also `LVOptions::setSortMode()`). + // Because `getChildren()` iterates over the concatenation, the result + // returned by this function is not necessarily sorted. If order is + // important, use `getSortedChildren()`. + LVElementsView getChildren() const { + return llvm::concat(Scopes ? *Scopes : EmptyScopes, + Types ? *Types : EmptyTypes, + Symbols ? *Symbols : EmptySymbols); + } + // Return a vector of child Scopes, Types, and Symbols sorted using + // `SortFunction`. This requires a copy and a sort; if order is not + // important, use `getChildren()` instead. + LVElements getSortedChildren( + LVSortFunction SortFunction = llvm::logicalview::getSortFunction()) const; void addElement(LVElement *Element); void addElement(LVLine *Line); @@ -222,7 +236,6 @@ class LLVM_ABI LVScope : public LVElement { void addElement(LVType *Type); void addObject(LVLocation *Location); void addObject(LVAddress LowerAddress, LVAddress UpperAddress); - void addToChildren(LVElement *Element); // Add the missing elements from the given 'Reference', which is the // scope associated with any DW_AT_specification, DW_AT_abstract_origin.
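To illustrate the new child-access API above, an editor's sketch (not part of the patch); the scope pointer is assumed to come from an already-loaded logical view:

#include "llvm/DebugInfo/LogicalView/Core/LVScope.h"

static void visitChildren(const llvm::logicalview::LVScope *Scope) {
  // Cheap concatenated view in Scopes, Types, Symbols order; not globally sorted.
  for (llvm::logicalview::LVElement *Child : Scope->getChildren())
    (void)Child->getName();

  // Copies and sorts when a deterministic order matters.
  for (llvm::logicalview::LVElement *Child : Scope->getSortedChildren())
    (void)Child->getName();
}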
diff --git a/llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h b/llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h index c6d7c32c4ad95..bfcbf728d415c 100644 --- a/llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h +++ b/llvm/include/llvm/Frontend/HLSL/RootSignatureMetadata.h @@ -48,6 +48,91 @@ class RootSignatureValidationError } }; +class OffsetAppendAfterOverflow : public ErrorInfo { +public: + static char ID; + dxil::ResourceClass Type; + uint32_t Register; + uint32_t Space; + + OffsetAppendAfterOverflow(dxil::ResourceClass Type, uint32_t Register, + uint32_t Space) + : Type(Type), Register(Register), Space(Space) {} + + void log(raw_ostream &OS) const override { + OS << "Range " << getResourceClassName(Type) << "(register=" << Register + << ", space=" << Space << ") " + << "cannot be appended after an unbounded range "; + } + + std::error_code convertToErrorCode() const override { + return llvm::inconvertibleErrorCode(); + } +}; + +class ShaderRegisterOverflowError + : public ErrorInfo { +public: + static char ID; + dxil::ResourceClass Type; + uint32_t Register; + uint32_t Space; + + ShaderRegisterOverflowError(dxil::ResourceClass Type, uint32_t Register, + uint32_t Space) + : Type(Type), Register(Register), Space(Space) {} + + void log(raw_ostream &OS) const override { + OS << "Overflow for shader register range: " << getResourceClassName(Type) + << "(register=" << Register << ", space=" << Space << ")."; + } + + std::error_code convertToErrorCode() const override { + return llvm::inconvertibleErrorCode(); + } +}; + +class OffsetOverflowError : public ErrorInfo { +public: + static char ID; + dxil::ResourceClass Type; + uint32_t Register; + uint32_t Space; + + OffsetOverflowError(dxil::ResourceClass Type, uint32_t Register, + uint32_t Space) + : Type(Type), Register(Register), Space(Space) {} + + void log(raw_ostream &OS) const override { + OS << "Offset overflow for descriptor range: " << getResourceClassName(Type) + << "(register=" << Register << ", space=" << Space << ")."; + } + + std::error_code convertToErrorCode() const override { + return llvm::inconvertibleErrorCode(); + } +}; + +class TableSamplerMixinError : public ErrorInfo { +public: + static char ID; + dxil::ResourceClass Type; + uint32_t Location; + + TableSamplerMixinError(dxil::ResourceClass Type, uint32_t Location) + : Type(Type), Location(Location) {} + + void log(raw_ostream &OS) const override { + OS << "Samplers cannot be mixed with other " + << "resource types in a descriptor table, " << getResourceClassName(Type) + << "(location=" << Location << ")"; + } + + std::error_code convertToErrorCode() const override { + return llvm::inconvertibleErrorCode(); + } +}; + class GenericRSMetadataError : public ErrorInfo { public: LLVM_ABI static char ID; diff --git a/llvm/include/llvm/Frontend/HLSL/RootSignatureValidations.h b/llvm/include/llvm/Frontend/HLSL/RootSignatureValidations.h index ea96094b18300..4dd18111b0c9d 100644 --- a/llvm/include/llvm/Frontend/HLSL/RootSignatureValidations.h +++ b/llvm/include/llvm/Frontend/HLSL/RootSignatureValidations.h @@ -33,14 +33,14 @@ LLVM_ABI bool verifyRangeType(uint32_t Type); LLVM_ABI bool verifyDescriptorRangeFlag(uint32_t Version, dxil::ResourceClass Type, dxbc::DescriptorRangeFlags FlagsVal); +LLVM_ABI bool verifyStaticSamplerFlags(uint32_t Version, uint32_t FlagsNumber); LLVM_ABI bool verifyNumDescriptors(uint32_t NumDescriptors); LLVM_ABI bool verifyMipLODBias(float MipLODBias); LLVM_ABI bool verifyMaxAnisotropy(uint32_t MaxAnisotropy); LLVM_ABI bool 
verifyLOD(float LOD); -LLVM_ABI bool verifyBoundOffset(uint32_t Offset); LLVM_ABI bool verifyNoOverflowedOffset(uint64_t Offset); -LLVM_ABI uint64_t computeRangeBound(uint32_t Offset, uint32_t Size); +LLVM_ABI uint64_t computeRangeBound(uint64_t Offset, uint32_t Size); } // namespace rootsig } // namespace hlsl diff --git a/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h b/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h index 6b9da06707261..24017492e30b2 100644 --- a/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h +++ b/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h @@ -13,6 +13,8 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Compiler.h" +#include + namespace llvm { namespace offloading { using EntryArrayTy = std::pair; @@ -52,6 +54,24 @@ LLVM_ABI llvm::Error wrapHIPBinary(llvm::Module &M, llvm::ArrayRef Images, EntryArrayTy EntryArray, llvm::StringRef Suffix = "", bool EmitSurfacesAndTextures = true); + +struct SYCLJITOptions { + // Target/compiler specific options that are suggested to use to "compile" + // program at runtime. + std::string CompileOptions; + // Target/compiler specific options that are suggested to use to "link" + // program at runtime. + std::string LinkOptions; +}; + +/// Wraps OffloadBinaries in the given \p Buffers into the module \p M +/// as global symbols and registers the images with the SYCL Runtime. +/// \param Options Compiler and linker options to be encoded for the later +/// use by a runtime for JIT compilation. +LLVM_ABI llvm::Error +wrapSYCLBinaries(llvm::Module &M, llvm::ArrayRef Buffer, + SYCLJITOptions Options = SYCLJITOptions()); + } // namespace offloading } // namespace llvm diff --git a/llvm/include/llvm/Frontend/Offloading/Utility.h b/llvm/include/llvm/Frontend/Offloading/Utility.h index f8a2b1237b5e1..23e6702beb476 100644 --- a/llvm/include/llvm/Frontend/Offloading/Utility.h +++ b/llvm/include/llvm/Frontend/Offloading/Utility.h @@ -82,7 +82,8 @@ LLVM_ABI StructType *getEntryTy(Module &M); /// \param Data Extra data storage associated with the entry. /// \param SectionName The section this entry will be placed at. /// \param AuxAddr An extra pointer if needed. -LLVM_ABI void +/// \return The emitted global variable containing the offloading entry. 
+LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr = nullptr, diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h index 1ed23eed1571d..db781b58944bc 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h @@ -663,13 +663,13 @@ struct GrainsizeT { // [6.0:438] `graph_id` clause template // struct GraphIdT { - using EmptyTrait = std::true_type; + using IncompleteTrait = std::true_type; }; // [6.0:438] `graph_reset` clause template // struct GraphResetT { - using EmptyTrait = std::true_type; + using IncompleteTrait = std::true_type; }; // V5.2: [5.4.9] `has_device_addr` clause @@ -1268,6 +1268,15 @@ struct WriteT { using EmptyTrait = std::true_type; }; +// V6: [6.4.7] Looprange clause +template struct LoopRangeT { + using Begin = E; + using End = E; + + using TupleTrait = std::true_type; + std::tuple t; +}; + // --- template @@ -1278,11 +1287,10 @@ using ExtensionClausesT = template using EmptyClausesT = std::variant< AcqRelT, AcquireT, CaptureT, CompareT, - DynamicAllocatorsT, FullT, GraphIdT, - GraphResetT, InbranchT, MergeableT, - NogroupT, NoOpenmpRoutinesT, NoOpenmpT, - NoParallelismT, NotinbranchT, NowaitT, - ReadT, RelaxedT, ReleaseT, + DynamicAllocatorsT, FullT, InbranchT, + MergeableT, NogroupT, NoOpenmpRoutinesT, + NoOpenmpT, NoParallelismT, NotinbranchT, + NowaitT, ReadT, RelaxedT, ReleaseT, ReverseOffloadT, SeqCstT, SimdT, ThreadsT, UnifiedAddressT, UnifiedSharedMemoryT, UnknownT, UntiedT, UseT, WeakT, @@ -1290,9 +1298,9 @@ using EmptyClausesT = std::variant< template using IncompleteClausesT = - std::variant, AppendArgsT, MatchT, - OtherwiseT, ReplayableT, - TransparentT, WhenT>; + std::variant, AppendArgsT, GraphIdT, + GraphResetT, MatchT, OtherwiseT, + ReplayableT, TransparentT, WhenT>; template using TupleClausesT = @@ -1301,8 +1309,8 @@ using TupleClausesT = DoacrossT, DynGroupprivateT, FromT, GrainsizeT, IfT, InitT, InReductionT, LastprivateT, LinearT, - MapT, NumTasksT, OrderT, - ReductionT, ScheduleT, + LoopRangeT, MapT, NumTasksT, + OrderT, ReductionT, ScheduleT, TaskReductionT, ToT>; template diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index 4d9b8f8a6c51e..38f95a11bf85f 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -284,6 +284,10 @@ def OMPC_Linear : Clause<[Spelling<"linear">]> { def OMPC_Link : Clause<[Spelling<"link">]> { let flangClass = "OmpObjectList"; } +def OMPC_LoopRange : Clause<[Spelling<"looprange">]> { + let clangClass = "OMPLoopRangeClause"; + let flangClass = "OmpLoopRangeClause"; +} def OMPC_Map : Clause<[Spelling<"map">]> { let clangClass = "OMPMapClause"; let flangClass = "OmpMapClause"; @@ -902,6 +906,11 @@ def OMP_Groupprivate : Directive<[Spelling<"groupprivate">]> { let category = CA_Declarative; let languages = [L_C, L_Fortran]; } +def OMP_Fuse : Directive<[Spelling<"fuse">]> { + let allowedOnceClauses = [VersionedClause]; + let association = AS_Block; + let category = CA_Executable; +} def OMP_Interchange : Directive<[Spelling<"interchange">]> { let allowedOnceClauses = [ VersionedClause, @@ -2087,9 +2096,11 @@ def OMP_TargetParallel : Directive<[Spelling<"target parallel">]> { let allowedOnceClauses = [ VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, + 
VersionedClause, VersionedClause, ]; let leafConstructs = [OMP_Target, OMP_Parallel]; @@ -2117,12 +2128,14 @@ def OMP_TargetParallelDo : Directive<[Spelling<"target parallel do">]> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, VersionedClause, VersionedClause, VersionedClause, + VersionedClause, ]; let leafConstructs = [OMP_Target, OMP_Parallel, OMP_Do]; let category = CA_Executable; @@ -2146,6 +2159,7 @@ def OMP_TargetParallelDoSimd VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2156,6 +2170,7 @@ def OMP_TargetParallelDoSimd VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2180,6 +2195,7 @@ def OMP_TargetParallelFor : Directive<[Spelling<"target parallel for">]> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2189,6 +2205,7 @@ def OMP_TargetParallelFor : Directive<[Spelling<"target parallel for">]> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, ]; @@ -2218,6 +2235,7 @@ def OMP_TargetParallelForSimd VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2229,6 +2247,7 @@ def OMP_TargetParallelForSimd VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2264,11 +2283,13 @@ def OMP_target_parallel_loop : Directive<[Spelling<"target parallel loop">]> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, ]; let leafConstructs = [OMP_Target, OMP_Parallel, OMP_loop]; @@ -2299,12 +2320,14 @@ def OMP_TargetSimd : Directive<[Spelling<"target simd">]> { VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, ]; @@ -2397,12 +2420,14 @@ def OMP_TargetTeamsDistributeParallelDo VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, ]; let leafConstructs = @@ -2436,6 +2461,7 @@ def OMP_TargetTeamsDistributeParallelDoSimd VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2443,6 +2469,7 @@ def OMP_TargetTeamsDistributeParallelDoSimd VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, ]; @@ -2468,6 +2495,7 @@ def OMP_TargetTeamsDistributeParallelFor VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2477,6 +2505,7 @@ def OMP_TargetTeamsDistributeParallelFor VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2508,6 +2537,7 @@ def OMP_TargetTeamsDistributeParallelForSimd VersionedClause, VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, @@ -2519,6 +2549,7 @@ def OMP_TargetTeamsDistributeParallelForSimd VersionedClause, 
VersionedClause, VersionedClause, + VersionedClause, VersionedClause, VersionedClause, VersionedClause, diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index f43ef932e965a..0a11617ea971c 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1085,11 +1085,13 @@ class OpenMPIRBuilder { /// preheader of the loop. /// \param LoopType Information about type of loop worksharing. /// It corresponds to type of loop workshare OpenMP pragma. + /// \param NoLoop If true, no-loop code is generated. /// /// \returns Point where to insert code after the workshare construct. InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, - omp::WorksharingLoopType LoopType); + omp::WorksharingLoopType LoopType, + bool NoLoop); /// Modifies the canonical loop to be a statically-scheduled workshare loop. /// @@ -1209,6 +1211,7 @@ class OpenMPIRBuilder { /// present. /// \param LoopType Information about type of loop worksharing. /// It corresponds to type of loop workshare OpenMP pragma. + /// \param NoLoop If true, no-loop code is generated. /// /// \returns Point where to insert code after the workshare construct. LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop( @@ -1219,7 +1222,8 @@ class OpenMPIRBuilder { bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false, bool HasOrderedClause = false, omp::WorksharingLoopType LoopType = - omp::WorksharingLoopType::ForStaticLoop); + omp::WorksharingLoopType::ForStaticLoop, + bool NoLoop = false); /// Tile a loop nest. /// @@ -1402,7 +1406,7 @@ class OpenMPIRBuilder { /// any. LLVM_ABI static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, - StringRef ParentName = ""); + vfs::FileSystem &VFS, StringRef ParentName = ""); /// Enum class for the RedctionGen CallBack type to be used. enum class ReductionGenCBKind { Clang, MLIR }; diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h index ef761eb1aed73..bbf9f8486f31c 100644 --- a/llvm/include/llvm/IR/CallingConv.h +++ b/llvm/include/llvm/IR/CallingConv.h @@ -287,6 +287,16 @@ namespace CallingConv { // Calling convention for AMDGPU whole wave functions. AMDGPU_Gfx_WholeWave = 124, + /// Calling convention used for CHERIoT when crossing a protection boundary. + CHERIoT_CompartmentCall = 125, + /// Calling convention used for the callee of CHERIoT_CompartmentCall. + /// Ignores the first two capability arguments and the first integer + /// argument, zeroes all unused return registers on return. + CHERIoT_CompartmentCallee = 126, + /// Calling convention used for CHERIoT for cross-library calls to a + /// stateless compartment. + CHERIoT_LibraryCall = 127, + /// The highest possible ID. Must be some 2^k - 1. MaxID = 1023 }; diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h index 5653ee7b6837d..56fc749838ef9 100644 --- a/llvm/include/llvm/IR/DataLayout.h +++ b/llvm/include/llvm/IR/DataLayout.h @@ -77,12 +77,21 @@ class DataLayout { uint32_t BitWidth; Align ABIAlign; Align PrefAlign; + /// The index bit width also defines the address size in this address space. + /// If the index width is less than the representation bit width, the + /// pointer is non-integral and bits beyond the index width could be used + /// for additional metadata (e.g. 
AMDGPU buffer fat pointers with bounds + /// and other flags or CHERI capabilities that contain bounds+permissions). uint32_t IndexBitWidth; /// Pointers in this address space don't have a well-defined bitwise - /// representation (e.g. may be relocated by a copying garbage collector). - /// Additionally, they may also be non-integral (i.e. containing additional - /// metadata such as bounds information/permissions). - bool IsNonIntegral; + /// representation (e.g. they may be relocated by a copying garbage + /// collector and thus have different addresses at different times). + bool HasUnstableRepresentation; + /// Pointers in this address space have additional state bits that are + /// located at a target-defined location when stored in memory. An example + /// of this would be CHERI capabilities where the validity bit is stored + /// separately from the pointer address+bounds information. + bool HasExternalState; LLVM_ABI bool operator==(const PointerSpec &Other) const; }; @@ -149,7 +158,7 @@ class DataLayout { /// Sets or updates the specification for pointer in the given address space. void setPointerSpec(uint32_t AddrSpace, uint32_t BitWidth, Align ABIAlign, Align PrefAlign, uint32_t IndexBitWidth, - bool IsNonIntegral); + bool HasUnstableRepr, bool HasExternalState); /// Internal helper to get alignment for integer of given bitwidth. LLVM_ABI Align getIntegerAlignment(uint32_t BitWidth, bool abi_or_pref) const; @@ -355,19 +364,91 @@ class DataLayout { /// \sa DataLayout::getAddressSizeInBits unsigned getAddressSize(unsigned AS) const { return getIndexSize(AS); } - /// Return the address spaces containing non-integral pointers. Pointers in - /// this address space don't have a well-defined bitwise representation. - SmallVector getNonIntegralAddressSpaces() const { + /// Return the address spaces with special pointer semantics (such as being + /// unstable or non-integral). + SmallVector getNonStandardAddressSpaces() const { SmallVector AddrSpaces; for (const PointerSpec &PS : PointerSpecs) { - if (PS.IsNonIntegral) + if (PS.HasUnstableRepresentation || PS.HasExternalState || + PS.BitWidth != PS.IndexBitWidth) AddrSpaces.push_back(PS.AddrSpace); } return AddrSpaces; } + /// Returns whether this address space has a non-integral pointer + /// representation, i.e. the pointer is not just an integer address but some + /// other bitwise representation. When true, passes cannot assume that all + /// bits of the representation map directly to the allocation address. + /// NOTE: This also returns true for "unstable" pointers where the + /// representation may be just an address, but this value can change at any + /// given time (e.g. due to copying garbage collection). + /// Examples include AMDGPU buffer descriptors with a 128-bit fat pointer + /// and a 32-bit offset or CHERI capabilities that contain bounds, permissions + /// and an out-of-band validity bit. + /// + /// In general, more specialized functions such as mustNotIntroduceIntToPtr(), + /// mustNotIntroducePtrToInt(), or hasExternalState() should be + /// preferred over this one when reasoning about the behavior of IR + /// analysis/transforms. + /// TODO: should remove/deprecate this once all uses have migrated. 
bool isNonIntegralAddressSpace(unsigned AddrSpace) const { - return getPointerSpec(AddrSpace).IsNonIntegral; + const auto &PS = getPointerSpec(AddrSpace); + return PS.BitWidth != PS.IndexBitWidth || PS.HasUnstableRepresentation || + PS.HasExternalState; + } + + /// Returns whether this address space has an "unstable" pointer + /// representation. The bitwise pattern of such pointers is allowed to change + /// in a target-specific way. For example, this could be used for copying + /// garbage collection where the garbage collector could update the pointer + /// value as part of the collection sweep. + bool hasUnstableRepresentation(unsigned AddrSpace) const { + return getPointerSpec(AddrSpace).HasUnstableRepresentation; + } + bool hasUnstableRepresentation(Type *Ty) const { + auto *PTy = dyn_cast(Ty->getScalarType()); + return PTy && hasUnstableRepresentation(PTy->getPointerAddressSpace()); + } + + /// Returns whether this address space has external state (implies having + /// a non-integral pointer representation). + /// These pointer types must be loaded and stored using appropriate + /// instructions and cannot use integer loads/stores as this would not + /// propagate the out-of-band state. An example of such a pointer type is a + /// CHERI capability that contain bounds, permissions and an out-of-band + /// validity bit that is invalidated whenever an integer/FP store is performed + /// to the associated memory location. + bool hasExternalState(unsigned AddrSpace) const { + return getPointerSpec(AddrSpace).HasExternalState; + } + bool hasExternalState(Type *Ty) const { + auto *PTy = dyn_cast(Ty->getScalarType()); + return PTy && hasExternalState(PTy->getPointerAddressSpace()); + } + + /// Returns whether passes must avoid introducing `inttoptr` instructions + /// for this address space (unless they have target-specific knowledge). + /// + /// This is currently the case for non-integral pointer representations with + /// external state (hasExternalState()) since `inttoptr` cannot recreate the + /// external state bits. + /// New `inttoptr` instructions should also be avoided for "unstable" bitwise + /// representations (hasUnstableRepresentation()) unless the pass knows it is + /// within a critical section that retains the current representation. + bool mustNotIntroduceIntToPtr(unsigned AddrSpace) const { + return hasUnstableRepresentation(AddrSpace) || hasExternalState(AddrSpace); + } + + /// Returns whether passes must avoid introducing `ptrtoint` instructions + /// for this address space (unless they have target-specific knowledge). + /// + /// This is currently the case for pointer address spaces that have an + /// "unstable" representation (hasUnstableRepresentation()) since the + /// bitwise pattern of such pointers could change unless the pass knows it is + /// within a critical section that retains the current representation. 
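The predicates above replace the single isNonIntegralAddressSpace() catch-all with direction-specific questions. A minimal C++ sketch of how a transform might consult them before coercing pointer loads and stores to integers; the helper names are hypothetical and not part of this patch:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"

// Hypothetical guard: may we load this pointer type as an integer and rebuild
// it with inttoptr? Forbidden when the representation is unstable or carries
// external state that inttoptr cannot recreate.
static bool canLoadPointerAsInteger(const llvm::DataLayout &DL,
                                    llvm::Type *PtrTy) {
  return !DL.mustNotIntroduceIntToPtr(PtrTy);
}

// Hypothetical guard: may we store this pointer type through ptrtoint plus an
// integer store? ptrtoint is unsafe for unstable representations, and an
// integer store would drop any out-of-band state (e.g. CHERI validity bits).
static bool canStorePointerAsInteger(const llvm::DataLayout &DL,
                                     llvm::Type *PtrTy) {
  return !DL.mustNotIntroducePtrToInt(PtrTy) && !DL.hasExternalState(PtrTy);
}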
+ bool mustNotIntroducePtrToInt(unsigned AddrSpace) const { + return hasUnstableRepresentation(AddrSpace); } bool isNonIntegralPointerType(PointerType *PT) const { @@ -375,10 +456,20 @@ class DataLayout { } bool isNonIntegralPointerType(Type *Ty) const { - auto *PTy = dyn_cast(Ty); + auto *PTy = dyn_cast(Ty->getScalarType()); return PTy && isNonIntegralPointerType(PTy); } + bool mustNotIntroducePtrToInt(Type *Ty) const { + auto *PTy = dyn_cast(Ty->getScalarType()); + return PTy && mustNotIntroducePtrToInt(PTy->getPointerAddressSpace()); + } + + bool mustNotIntroduceIntToPtr(Type *Ty) const { + auto *PTy = dyn_cast(Ty->getScalarType()); + return PTy && mustNotIntroduceIntToPtr(PTy->getPointerAddressSpace()); + } + /// The size in bits of the pointer representation in a given address space. /// This is not necessarily the same as the integer address of a pointer (e.g. /// for fat pointers). diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 585371a6a4423..96da698538314 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -297,46 +297,39 @@ def IIT_MMX : IIT_VT; def IIT_TOKEN : IIT_VT; def IIT_METADATA : IIT_VT; def IIT_EMPTYSTRUCT : IIT_VT; -def IIT_STRUCT2 : IIT_Base<21>; -def IIT_STRUCT3 : IIT_Base<22>; -def IIT_STRUCT4 : IIT_Base<23>; -def IIT_STRUCT5 : IIT_Base<24>; -def IIT_EXTEND_ARG : IIT_Base<25>; -def IIT_TRUNC_ARG : IIT_Base<26>; -def IIT_ANYPTR : IIT_Base<27>; -def IIT_V1 : IIT_Vec<1, 28>; -def IIT_VARARG : IIT_VT; -def IIT_ONE_NTH_ELTS_VEC_ARG : IIT_Base<30>; -def IIT_SAME_VEC_WIDTH_ARG : IIT_Base<31>; -def IIT_VEC_OF_ANYPTRS_TO_ELT : IIT_Base<34>; -def IIT_I128 : IIT_Int<128, 35>; -def IIT_V512 : IIT_Vec<512, 36>; -def IIT_V1024 : IIT_Vec<1024, 37>; -def IIT_STRUCT6 : IIT_Base<38>; -def IIT_STRUCT7 : IIT_Base<39>; -def IIT_STRUCT8 : IIT_Base<40>; -def IIT_F128 : IIT_VT; -def IIT_VEC_ELEMENT : IIT_Base<42>; -def IIT_SCALABLE_VEC : IIT_Base<43>; -def IIT_SUBDIVIDE2_ARG : IIT_Base<44>; -def IIT_SUBDIVIDE4_ARG : IIT_Base<45>; -def IIT_VEC_OF_BITCASTS_TO_INT : IIT_Base<46>; -def IIT_V128 : IIT_Vec<128, 47>; -def IIT_BF16 : IIT_VT; -def IIT_STRUCT9 : IIT_Base<49>; -def IIT_V256 : IIT_Vec<256, 50>; -def IIT_AMX : IIT_VT; -def IIT_PPCF128 : IIT_VT; -def IIT_V3 : IIT_Vec<3, 53>; -def IIT_EXTERNREF : IIT_VT; -def IIT_FUNCREF : IIT_VT; -def IIT_I2 : IIT_Int<2, 57>; -def IIT_I4 : IIT_Int<4, 58>; -def IIT_AARCH64_SVCOUNT : IIT_VT; -def IIT_V6 : IIT_Vec<6, 60>; -def IIT_V10 : IIT_Vec<10, 61>; -def IIT_V2048 : IIT_Vec<2048, 62>; -def IIT_V4096 : IIT_Vec<4096, 63>; +def IIT_STRUCT : IIT_Base<21>; +def IIT_EXTEND_ARG : IIT_Base<22>; +def IIT_TRUNC_ARG : IIT_Base<23>; +def IIT_ANYPTR : IIT_Base<24>; +def IIT_V1 : IIT_Vec<1, 25>; +def IIT_VARARG : IIT_VT; +def IIT_ONE_NTH_ELTS_VEC_ARG : IIT_Base<27>; +def IIT_SAME_VEC_WIDTH_ARG : IIT_Base<28>; +def IIT_VEC_OF_ANYPTRS_TO_ELT : IIT_Base<29>; +def IIT_I128 : IIT_Int<128, 30>; +def IIT_V512 : IIT_Vec<512, 31>; +def IIT_V1024 : IIT_Vec<1024, 32>; +def IIT_F128 : IIT_VT; +def IIT_VEC_ELEMENT : IIT_Base<34>; +def IIT_SCALABLE_VEC : IIT_Base<35>; +def IIT_SUBDIVIDE2_ARG : IIT_Base<36>; +def IIT_SUBDIVIDE4_ARG : IIT_Base<37>; +def IIT_VEC_OF_BITCASTS_TO_INT : IIT_Base<38>; +def IIT_V128 : IIT_Vec<128, 39>; +def IIT_BF16 : IIT_VT; +def IIT_V256 : IIT_Vec<256, 41>; +def IIT_AMX : IIT_VT; +def IIT_PPCF128 : IIT_VT; +def IIT_V3 : IIT_Vec<3, 44>; +def IIT_EXTERNREF : IIT_VT; +def IIT_FUNCREF : IIT_VT; +def IIT_I2 : IIT_Int<2, 47>; +def IIT_I4 : IIT_Int<4, 48>; +def 
IIT_AARCH64_SVCOUNT : IIT_VT; +def IIT_V6 : IIT_Vec<6, 50>; +def IIT_V10 : IIT_Vec<10, 51>; +def IIT_V2048 : IIT_Vec<2048, 52>; +def IIT_V4096 : IIT_Vec<4096, 53>; } defvar IIT_all_FixedTypes = !filter(iit, IIT_all, @@ -345,19 +338,6 @@ defvar IIT_all_FixedTypes = !filter(iit, IIT_all, defvar IIT_all_VectorTypes = !filter(iit, IIT_all, !isa(iit)); -defvar IIT_RetNumbers = [ - [IIT_Done.Number], - [], - [IIT_STRUCT2.Number], - [IIT_STRUCT3.Number], - [IIT_STRUCT4.Number], - [IIT_STRUCT5.Number], - [IIT_STRUCT6.Number], - [IIT_STRUCT7.Number], - [IIT_STRUCT8.Number], - [IIT_STRUCT9.Number], -]; - //===----------------------------------------------------------------------===// // Types used by intrinsics. //===----------------------------------------------------------------------===// @@ -663,7 +643,10 @@ class TypeInfoGen< !if(!isa(ty), ACTys[MappingRIdxs[ty.Number]], ty)); list TypeSig = !listflatten(!listconcat( - [IIT_RetNumbers[!size(RetTypes)]], + [!cond( + !eq(!size(RetTypes), 0): [IIT_Done.Number], + !eq(!size(RetTypes), 1): [], + true: [IIT_STRUCT.Number, !sub(!size(RetTypes), 2)])], !foreach(i, !range(AllTypes), !foreach(a, AllTypes[i].Sig, ResolveArgCode< @@ -977,8 +960,12 @@ def int_instrprof_mcdc_tvbitmap_update : Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty, llvm_i32_ty, llvm_ptr_ty]>; -def int_call_preallocated_setup : DefaultAttrsIntrinsic<[llvm_token_ty], [llvm_i32_ty]>; -def int_call_preallocated_arg : DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_i32_ty]>; +def int_call_preallocated_setup + : DefaultAttrsIntrinsic<[llvm_token_ty], [llvm_i32_ty], + [ImmArg>]>; +def int_call_preallocated_arg + : DefaultAttrsIntrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_i32_ty], + [ImmArg>]>; def int_call_preallocated_teardown : DefaultAttrsIntrinsic<[], [llvm_token_ty]>; // This intrinsic is intentionally undocumented and users shouldn't call it; @@ -1775,12 +1762,13 @@ def int_coro_free : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty], [IntrReadMem, IntrArgMemOnly, ReadOnly>, NoCapture>]>; -def int_coro_end : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_i1_ty, llvm_token_ty], []>; +def int_coro_end : Intrinsic<[], [llvm_ptr_ty, llvm_i1_ty, llvm_token_ty], []>; def int_coro_end_results : Intrinsic<[llvm_token_ty], [llvm_vararg_ty]>; def int_coro_end_async - : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_i1_ty, llvm_vararg_ty], []>; + : Intrinsic<[], [llvm_ptr_ty, llvm_i1_ty, llvm_vararg_ty], []>; def int_coro_frame : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; +def int_coro_is_in_ramp : Intrinsic<[llvm_i1_ty], [], [IntrNoMem], "llvm.coro.is_in_ramp">; def int_coro_noop : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; def int_coro_size : Intrinsic<[llvm_anyint_ty], [], [IntrNoMem]>; def int_coro_align : Intrinsic<[llvm_anyint_ty], [], [IntrNoMem]>; diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index afce1fe6af854..ded00b1274670 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1431,7 +1431,7 @@ def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic < [llvm_anyptr_ty], [llvm_anyptr_ty, // base llvm_i16_ty, // stride (and swizzle control) - llvm_i32_ty, // NumRecords / extent + llvm_i64_ty, // NumRecords / extent llvm_i32_ty], // flags // Attributes lifted from ptrmask + some extra argument attributes. 
[IntrNoMem, ReadNone>, @@ -3808,6 +3808,7 @@ class AMDGPUCooperativeAtomicLoad : Intrinsic < [SDNPMemOperand, SDNPMayLoad] >; +// TODO: We may want to drop _relaxed and use an atomic ordering operand instead. def int_amdgcn_cooperative_atomic_load_32x4B : AMDGPUCooperativeAtomicLoad; def int_amdgcn_cooperative_atomic_store_32x4B : AMDGPUCooperativeAtomicStore; def int_amdgcn_cooperative_atomic_load_16x8B : AMDGPUCooperativeAtomicLoad; diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 5d76c3f8df89d..570d6bc35cbd0 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -39,6 +39,10 @@ def int_dx_resource_handlefromimplicitbinding def int_dx_resource_getpointer : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [llvm_any_ty, llvm_i32_ty], [IntrNoMem]>; + +def int_dx_resource_nonuniformindex + : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_dx_resource_load_typedbuffer : DefaultAttrsIntrinsic<[llvm_any_ty, llvm_i1_ty], [llvm_any_ty, llvm_i32_ty], [IntrReadMem]>; @@ -130,6 +134,8 @@ def int_dx_degrees : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty def int_dx_isinf : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty], [IntrNoMem]>; +def int_dx_isnan : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [llvm_anyfloat_ty], [IntrNoMem]>; def int_dx_lerp : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], [IntrNoMem]>; @@ -160,5 +166,6 @@ def int_dx_firstbituhigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, def int_dx_firstbitshigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>; def int_dx_firstbitlow : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>; -def int_dx_group_memory_barrier_with_group_sync : DefaultAttrsIntrinsic<[], [], []>; +def int_dx_group_memory_barrier_with_group_sync + : DefaultAttrsIntrinsic<[], [], [IntrConvergent]>; } diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 7b40841e45d0d..9cfab26fffa54 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -947,6 +947,78 @@ class NVVM_TCGEN05_LDST_ACCESS_SIZE { true : llvm_void_ty); } +class NVVM_TCGEN05_MMA_BASE { + LLVMType a_operand_type = !if(!eq(Space, "tensor"), + llvm_tmem_ptr_ty, llvm_i64_ty); + list common_args = !listconcat( + [llvm_tmem_ptr_ty, // d + a_operand_type, // a + llvm_i64_ty, // b + llvm_i32_ty, // idesc + llvm_i1_ty], // enable_input_d + !if(!eq(Sp, 1), [llvm_tmem_ptr_ty], [])); // spmetadata + list common_intr_props = !listconcat( + [IntrArgMemOnly, WriteOnly>], + !if(!eq(Space, "tensor"), [ReadOnly>], []) + ); +} + +class NVVM_TCGEN05_MMA: + NVVM_TCGEN05_MMA_BASE { + string intr = "llvm.nvvm.tcgen05.mma" + # !if(!eq(Sp, 1), ".sp", "") + # "." # Space + # !if(!eq(ScaleInputD, 1), ".scale_d", "") + # !if(!eq(AShift, 1), ".ashift", ""); + string record = !subst(".", "_", !subst("llvm.", "int_", intr)); +} + +class NVVM_TCGEN05_MMA_BLOCKSCALE: + NVVM_TCGEN05_MMA_BASE { + string intr = "llvm.nvvm.tcgen05.mma" + # !if(!eq(Sp, 1), ".sp", "") + # "." # Space + # "." 
# Kind + # ".block_scale" # ScaleVecSize; + string record = !subst(".", "_", !subst("llvm.", "int_", intr)); +} + +class NVVM_TCGEN05_MMA_WS: + NVVM_TCGEN05_MMA_BASE { + string intr = "llvm.nvvm.tcgen05.mma.ws" + # !if(!eq(Sp, 1), ".sp", "") + # "." # Space + # !if(!eq(ZeroColMask, 1), ".zero_col_mask", ""); + string record = !subst(".", "_", !subst("llvm.", "int_", intr)); +} + +class NVVM_TCGEN05_MMA_DISABLE_OUTPUT_LANE: + NVVM_TCGEN05_MMA_BASE { + string intr = "llvm.nvvm.tcgen05.mma" + # !if(!eq(Sp, 1), ".sp", "") + # "." # Space + # !if(!eq(ScaleInputD, 1), ".scale_d", "") + # ".disable_output_lane.cg" # CtaGroup + # !if(!eq(AShift, 1), ".ashift", ""); + string record = !subst(".", "_", !subst("llvm.", "int_", intr)); +} + +class NVVM_TCGEN05_MMA_BLOCKSCALE_SUPPORTED { + bit ret = !cond( + !and(!eq(Kind, "mxf8f6f4"), !eq(ScaleVecSize, "")) : true, + !and(!eq(Kind, "mxf4"), !eq(ScaleVecSize, "")) : true, + !and(!eq(Kind, "mxf4nvf4"), !eq(ScaleVecSize, ".block16")) : true, + !and(!eq(Kind, "mxf4"), !eq(ScaleVecSize, ".block32")) : true, + !and(!eq(Kind, "mxf4nvf4"), !eq(ScaleVecSize, ".block32")) : true, + !and(!eq(Kind, "mxf8f6f4"), !eq(ScaleVecSize, ".block32")) : true, + true: false + ); +} + class TexVector types> { string Name = name; list Types = types; @@ -2268,13 +2340,15 @@ def int_nvvm_exit : NVVMBuiltin, class DefaultAttrsIntrinsicFlags ret_types, list param_types, list flags, - list intr_properties> + list intr_properties, + string name = ""> : DefaultAttrsIntrinsic< ret_types, !listconcat(param_types, flags), !listconcat(intr_properties, !foreach(i, !range(flags), - ImmArg>))>; + ImmArg>)), + name>; // TMA Tensor Copy Intrinsics: S2G -> From Shared to Global memory variants foreach dim = 1...5 in { @@ -2663,4 +2737,136 @@ foreach dim = ["x", "y", "z"] in : PureIntrinsic<[llvm_i32_ty], [llvm_i128_ty], [], "llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid." 
# dim>; -} // let TargetPrefix = "nvvm" +// +// tcgen05.mma intrinsics +// + +foreach sp = [0, 1] in { + foreach space = ["tensor", "shared"] in { + foreach scale_d = [0, 1] in { + foreach ashift = !if(!eq(space, "tensor"), [0, 1], [0]) in { + defvar mma = NVVM_TCGEN05_MMA; + defvar args = !listconcat( + mma.common_args, + !if(!eq(scale_d, 1), [llvm_i64_ty], []) // scale_d_imm + ); + defvar flags = [llvm_i32_ty, // kind + llvm_i32_ty, // cta_group + llvm_i32_ty]; // collector_usage_a + defvar nargs = !size(args); + defvar scale_d_imm = ArgIndex; + defvar scale_d_imm_range = [ImmArg, Range]; + defvar intrinsic_properties = !listconcat( + mma.common_intr_props, + !if(!eq(scale_d, 1), scale_d_imm_range, []), + [Range, 0, !if(!eq(scale_d, 1), 2, 4)>, // kind + Range, 1, 3>, // cta_group + Range, 0, + !if(!eq(ashift, 1), 2, 4)> // collector_usage + ] + ); + + def mma.record: + DefaultAttrsIntrinsicFlags<[], args, flags, intrinsic_properties, + mma.intr>; + } + } + } +} + +// +// tcgen05.mma disable_output_lane intrinsics +// +foreach sp = [0, 1] in { + foreach space = ["tensor", "shared"] in { + foreach cta_group = [1, 2] in { + foreach scale_d = [0, 1] in { + foreach ashift = !if(!eq(space, "tensor"), [0, 1], [0]) in { + defvar mma = NVVM_TCGEN05_MMA_DISABLE_OUTPUT_LANE< + sp, space, cta_group, ashift, scale_d>; + defvar disable_output_lane_type = + !if(!eq(cta_group, 1), llvm_v4i32_ty, llvm_v8i32_ty); + defvar args = !listconcat( + mma.common_args, + !if(!eq(scale_d, 1), [llvm_i64_ty], []), + [disable_output_lane_type] + ); + defvar flags = [llvm_i32_ty, // kind_flag + llvm_i32_ty]; // collector_usage_a_flag + defvar nargs = !size(args); + defvar scale_d_imm = ArgIndex; + defvar scale_d_imm_range = [ImmArg, Range]; + defvar intrinsic_properties = !listconcat( + mma.common_intr_props, + !if(!eq(scale_d, 1), scale_d_imm_range, []), + [Range, 0, !if(!eq(scale_d, 1), 2, 4)>, + Range, 0, !if(!eq(ashift, 1), 2, 4)>] + ); + + def mma.record: DefaultAttrsIntrinsicFlags<[], args, flags, intrinsic_properties, + mma.intr>; + } // ashift + } // scale_d + } // cta_group + } // space +} // sp + +// +// tcgen05.mma block_scale intrinsics +// +foreach sp = [0, 1] in { + foreach space = ["tensor", "shared"] in { + foreach kind = ["mxf8f6f4", "mxf4", "mxf4nvf4"] in { + foreach scale_vec_size = ["", ".block16", ".block32"] in { + defvar mma = NVVM_TCGEN05_MMA_BLOCKSCALE; + defvar args = !listconcat(mma.common_args, + [llvm_tmem_ptr_ty, // scale_a + llvm_tmem_ptr_ty]); // scale_b + defvar flags = [llvm_i32_ty, // cta_group + llvm_i32_ty]; // collector_usage_a + defvar nargs = !size(args); + defvar cta_group = ArgIndex; + defvar collector_usage = ArgIndex; + + if NVVM_TCGEN05_MMA_BLOCKSCALE_SUPPORTED.ret then { + def mma.record: DefaultAttrsIntrinsicFlags<[], args, flags, + !listconcat(mma.common_intr_props, + [Range, + Range]), + mma.intr>; + } + } + } + } +} + +// +// tcgen05.mma ws intrinsics +// +foreach sp = [0, 1] in { + foreach space = ["tensor", "shared"] in { + foreach zero_col_mask = [0, 1] in { + defvar mma = NVVM_TCGEN05_MMA_WS; + defvar args = !listconcat( + mma.common_args, + !if(!eq(zero_col_mask, 1), [llvm_i64_ty], []) + ); + defvar flags = [llvm_i32_ty, // kind + llvm_i32_ty, // collector_buffer_b + llvm_i32_ty]; // collector_usage_b_op + defvar nargs = !size(args); + defvar intrinsic_properties = !listconcat( + mma.common_intr_props, + [Range, 0, 4>, + Range, 0, 4>, + Range, 0, 4>] + ); + + def mma.record: + DefaultAttrsIntrinsicFlags<[], args, flags, intrinsic_properties, + mma.intr>; + } + } 
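For readers following the TableGen, the legality table encoded by NVVM_TCGEN05_MMA_BLOCKSCALE_SUPPORTED can be restated as a plain C++ predicate. This is an illustrative sketch only; the actual gating stays in the !cond above:

#include "llvm/ADT/StringRef.h"

// Sketch: which (Kind, ScaleVecSize) pairs get a tcgen05.mma block_scale
// intrinsic, mirroring NVVM_TCGEN05_MMA_BLOCKSCALE_SUPPORTED.
static bool isBlockScaleSupported(llvm::StringRef Kind,
                                  llvm::StringRef ScaleVecSize) {
  if (Kind == "mxf8f6f4" || Kind == "mxf4")
    return ScaleVecSize.empty() || ScaleVecSize == ".block32";
  if (Kind == "mxf4nvf4")
    return ScaleVecSize == ".block16" || ScaleVecSize == ".block32";
  return false;
}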
+} + +} // let TargetPrefix = "nvvm" \ No newline at end of file diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index bc026fa33c769..823c491e1bfee 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -87,6 +87,8 @@ let TargetPrefix = "spv" in { def int_spv_frac : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_spv_isinf : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty], [IntrNoMem]>; + def int_spv_isnan : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [llvm_anyfloat_ty], [IntrNoMem]>; def int_spv_lerp : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], [IntrNoMem] >; def int_spv_length : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; @@ -127,7 +129,8 @@ def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty] : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrConvergent]>; def int_spv_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>; def int_spv_radians : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; - def int_spv_group_memory_barrier_with_group_sync : DefaultAttrsIntrinsic<[], [], []>; + def int_spv_group_memory_barrier_with_group_sync + : DefaultAttrsIntrinsic<[], [], [IntrConvergent]>; def int_spv_discard : DefaultAttrsIntrinsic<[], [], []>; def int_spv_uclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; def int_spv_sclamp : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; @@ -160,6 +163,9 @@ def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty] : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [llvm_any_ty, llvm_i32_ty], [IntrNoMem]>; +def int_spv_resource_nonuniformindex + : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; + // Read a value from the image buffer. It does not translate directly to a // single OpImageRead because the result type is not necessarily a 4 element // vector. diff --git a/llvm/include/llvm/IR/LLVMContext.h b/llvm/include/llvm/IR/LLVMContext.h index 852a3a4e2f638..5972dcb637dfa 100644 --- a/llvm/include/llvm/IR/LLVMContext.h +++ b/llvm/include/llvm/IR/LLVMContext.h @@ -97,6 +97,8 @@ class LLVMContext { OB_ptrauth = 7, // "ptrauth" OB_kcfi = 8, // "kcfi" OB_convergencectrl = 9, // "convergencectrl" + OB_align = 10, // "align" + OB_LastBundleID = OB_align // Marker for last bundle ID }; /// getMDKindID - Return a unique non-zero ID for the specified metadata kind. diff --git a/llvm/include/llvm/IR/LLVMRemarkStreamer.h b/llvm/include/llvm/IR/LLVMRemarkStreamer.h index 376acdec49fbb..96cccebf0d70e 100644 --- a/llvm/include/llvm/IR/LLVMRemarkStreamer.h +++ b/llvm/include/llvm/IR/LLVMRemarkStreamer.h @@ -17,6 +17,7 @@ #include "llvm/Remarks/Remark.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" +#include "llvm/Support/ToolOutputFile.h" #include #include #include @@ -82,20 +83,81 @@ struct LLVMRemarkSetupFormatError LLVMRemarkSetupFormatError>::LLVMRemarkSetupErrorInfo; }; -/// Setup optimization remarks that output to a file. -LLVM_ABI Expected> setupLLVMOptimizationRemarks( +/// RAII handle that manages the lifetime of the ToolOutputFile used to output +/// remarks. 
On destruction (or when calling releaseFile()), this handle ensures +/// that the optimization remarks are finalized and the RemarkStreamer is +/// correctly deregistered from the LLVMContext. +class LLVMRemarkFileHandle final { + struct Finalizer { + LLVMContext *Context; + + Finalizer(LLVMContext *Ctx) : Context(Ctx) {} + + Finalizer(const Finalizer &) = delete; + Finalizer &operator=(const Finalizer &) = delete; + + Finalizer(Finalizer &&Other) : Context(Other.Context) { + Other.Context = nullptr; + } + + Finalizer &operator=(Finalizer &&Other) { + std::swap(Context, Other.Context); + return *this; + } + + ~Finalizer() { finalize(); } + + LLVM_ABI void finalize(); + }; + + std::unique_ptr OutputFile; + Finalizer Finalize; + +public: + LLVMRemarkFileHandle() : OutputFile(nullptr), Finalize(nullptr) {} + + LLVMRemarkFileHandle(std::unique_ptr OutputFile, + LLVMContext &Ctx) + : OutputFile(std::move(OutputFile)), Finalize(&Ctx) {} + + ToolOutputFile *get() { return OutputFile.get(); } + explicit operator bool() { return bool(OutputFile); } + + /// Finalize remark emission and release the underlying ToolOutputFile. + std::unique_ptr releaseFile() { + finalize(); + return std::move(OutputFile); + } + + void finalize() { Finalize.finalize(); } + + ToolOutputFile &operator*() { return *OutputFile; } + ToolOutputFile *operator->() { return &*OutputFile; } +}; + +/// Set up optimization remarks that output to a file. The LLVMRemarkFileHandle +/// manages the lifetime of the underlying ToolOutputFile to ensure \ref +/// finalizeLLVMOptimizationRemarks() is called before the file is destroyed or +/// released from the handle. The handle must be kept alive until all remarks +/// were emitted through the remark streamer. +LLVM_ABI Expected setupLLVMOptimizationRemarks( LLVMContext &Context, StringRef RemarksFilename, StringRef RemarksPasses, StringRef RemarksFormat, bool RemarksWithHotness, std::optional RemarksHotnessThreshold = 0); -/// Setup optimization remarks that output directly to a raw_ostream. -/// \p OS is managed by the caller and should be open for writing as long as \p -/// Context is streaming remarks to it. +/// Set up optimization remarks that output directly to a raw_ostream. +/// \p OS is managed by the caller and must be open for writing until +/// \ref finalizeLLVMOptimizationRemarks() is called. LLVM_ABI Error setupLLVMOptimizationRemarks( LLVMContext &Context, raw_ostream &OS, StringRef RemarksPasses, StringRef RemarksFormat, bool RemarksWithHotness, std::optional RemarksHotnessThreshold = 0); +/// Finalize optimization remarks and deregister the RemarkStreamer from the \p +/// Context. This must be called before closing the (file) stream that was used +/// to set up the remarks. 
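A minimal usage sketch of the new handle, assuming an existing LLVMContext Ctx and a caller that wants YAML output written to "out.opt.yaml" (both are assumptions for illustration, not part of this patch); error handling is kept to the essentials:

#include "llvm/IR/LLVMRemarkStreamer.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ToolOutputFile.h"

// Sketch: the handle keeps the ToolOutputFile alive while remarks stream to it
// and finalizes emission before the file is released or destroyed.
static llvm::Error emitRemarksToFile(llvm::LLVMContext &Ctx) {
  auto MaybeHandle = llvm::setupLLVMOptimizationRemarks(
      Ctx, /*RemarksFilename=*/"out.opt.yaml", /*RemarksPasses=*/"",
      /*RemarksFormat=*/"yaml", /*RemarksWithHotness=*/false);
  if (!MaybeHandle)
    return MaybeHandle.takeError();
  llvm::LLVMRemarkFileHandle Handle = std::move(*MaybeHandle);
  // ... run passes that emit remarks through Ctx ...
  if (std::unique_ptr<llvm::ToolOutputFile> File = Handle.releaseFile())
    File->keep(); // remarks were finalized before the file was released
  return llvm::Error::success();
}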
+LLVM_ABI void finalizeLLVMOptimizationRemarks(LLVMContext &Context); + } // end namespace llvm #endif // LLVM_IR_LLVMREMARKSTREAMER_H diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h index cc4929a1ff8da..d55100e5e709d 100644 --- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h +++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h @@ -47,6 +47,15 @@ enum class CTAGroupKind : uint8_t { CG_2 = 2, // cta_group::2 modifier }; +enum class Tcgen05MMAKind : uint8_t { F16 = 0, TF32 = 1, F8F6F4 = 2, I8 = 3 }; + +enum class Tcgen05CollectorUsageOp : uint8_t { + DISCARD = 0, + LASTUSE = 1, + FILL = 2, + USE = 3, +}; + inline bool FPToIntegerIntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) { switch (IntrinsicID) { case Intrinsic::nvvm_f2i_rm_ftz: @@ -180,6 +189,70 @@ inline bool FPToIntegerIntrinsicResultIsSigned(Intrinsic::ID IntrinsicID) { "Checking invalid f2i/d2i intrinsic for signed int conversion"); } +inline bool FPToIntegerIntrinsicNaNZero(Intrinsic::ID IntrinsicID) { + switch (IntrinsicID) { + // f2i + case Intrinsic::nvvm_f2i_rm: + case Intrinsic::nvvm_f2i_rn: + case Intrinsic::nvvm_f2i_rp: + case Intrinsic::nvvm_f2i_rz: + case Intrinsic::nvvm_f2i_rm_ftz: + case Intrinsic::nvvm_f2i_rn_ftz: + case Intrinsic::nvvm_f2i_rp_ftz: + case Intrinsic::nvvm_f2i_rz_ftz: + // f2ui + case Intrinsic::nvvm_f2ui_rm: + case Intrinsic::nvvm_f2ui_rn: + case Intrinsic::nvvm_f2ui_rp: + case Intrinsic::nvvm_f2ui_rz: + case Intrinsic::nvvm_f2ui_rm_ftz: + case Intrinsic::nvvm_f2ui_rn_ftz: + case Intrinsic::nvvm_f2ui_rp_ftz: + case Intrinsic::nvvm_f2ui_rz_ftz: + return true; + // d2i + case Intrinsic::nvvm_d2i_rm: + case Intrinsic::nvvm_d2i_rn: + case Intrinsic::nvvm_d2i_rp: + case Intrinsic::nvvm_d2i_rz: + // d2ui + case Intrinsic::nvvm_d2ui_rm: + case Intrinsic::nvvm_d2ui_rn: + case Intrinsic::nvvm_d2ui_rp: + case Intrinsic::nvvm_d2ui_rz: + // f2ll + case Intrinsic::nvvm_f2ll_rm: + case Intrinsic::nvvm_f2ll_rn: + case Intrinsic::nvvm_f2ll_rp: + case Intrinsic::nvvm_f2ll_rz: + case Intrinsic::nvvm_f2ll_rm_ftz: + case Intrinsic::nvvm_f2ll_rn_ftz: + case Intrinsic::nvvm_f2ll_rp_ftz: + case Intrinsic::nvvm_f2ll_rz_ftz: + // f2ull + case Intrinsic::nvvm_f2ull_rm: + case Intrinsic::nvvm_f2ull_rn: + case Intrinsic::nvvm_f2ull_rp: + case Intrinsic::nvvm_f2ull_rz: + case Intrinsic::nvvm_f2ull_rm_ftz: + case Intrinsic::nvvm_f2ull_rn_ftz: + case Intrinsic::nvvm_f2ull_rp_ftz: + case Intrinsic::nvvm_f2ull_rz_ftz: + // d2ll + case Intrinsic::nvvm_d2ll_rm: + case Intrinsic::nvvm_d2ll_rn: + case Intrinsic::nvvm_d2ll_rp: + case Intrinsic::nvvm_d2ll_rz: + // d2ull + case Intrinsic::nvvm_d2ull_rm: + case Intrinsic::nvvm_d2ull_rn: + case Intrinsic::nvvm_d2ull_rp: + case Intrinsic::nvvm_d2ull_rz: + return false; + } + llvm_unreachable("Checking NaN result for invalid f2i/d2i intrinsic"); +} + inline APFloat::roundingMode GetFPToIntegerRoundingMode(Intrinsic::ID IntrinsicID) { switch (IntrinsicID) { diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index a16776c62f32b..6168e24569f99 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -259,86 +259,65 @@ inline match_combine_and m_CombineAnd(const LTy &L, const RTy &R) { return match_combine_and(L, R); } -struct apint_match { - const APInt *&Res; +template struct ap_match { + static_assert(std::is_same_v || std::is_same_v); + using ConstantTy = + std::conditional_t, ConstantInt, ConstantFP>; + + const APTy *&Res; bool AllowPoison; - apint_match(const APInt *&Res, bool 
AllowPoison) + ap_match(const APTy *&Res, bool AllowPoison) : Res(Res), AllowPoison(AllowPoison) {} template bool match(ITy *V) const { - if (auto *CI = dyn_cast(V)) { + if (auto *CI = dyn_cast(V)) { Res = &CI->getValue(); return true; } if (V->getType()->isVectorTy()) if (const auto *C = dyn_cast(V)) if (auto *CI = - dyn_cast_or_null(C->getSplatValue(AllowPoison))) { + dyn_cast_or_null(C->getSplatValue(AllowPoison))) { Res = &CI->getValue(); return true; } return false; } }; -// Either constexpr if or renaming ConstantFP::getValueAPF to -// ConstantFP::getValue is needed to do it via single template -// function for both apint/apfloat. -struct apfloat_match { - const APFloat *&Res; - bool AllowPoison; - - apfloat_match(const APFloat *&Res, bool AllowPoison) - : Res(Res), AllowPoison(AllowPoison) {} - - template bool match(ITy *V) const { - if (auto *CI = dyn_cast(V)) { - Res = &CI->getValueAPF(); - return true; - } - if (V->getType()->isVectorTy()) - if (const auto *C = dyn_cast(V)) - if (auto *CI = - dyn_cast_or_null(C->getSplatValue(AllowPoison))) { - Res = &CI->getValueAPF(); - return true; - } - return false; - } -}; /// Match a ConstantInt or splatted ConstantVector, binding the /// specified pointer to the contained APInt. -inline apint_match m_APInt(const APInt *&Res) { +inline ap_match m_APInt(const APInt *&Res) { // Forbid poison by default to maintain previous behavior. - return apint_match(Res, /* AllowPoison */ false); + return ap_match(Res, /* AllowPoison */ false); } /// Match APInt while allowing poison in splat vector constants. -inline apint_match m_APIntAllowPoison(const APInt *&Res) { - return apint_match(Res, /* AllowPoison */ true); +inline ap_match m_APIntAllowPoison(const APInt *&Res) { + return ap_match(Res, /* AllowPoison */ true); } /// Match APInt while forbidding poison in splat vector constants. -inline apint_match m_APIntForbidPoison(const APInt *&Res) { - return apint_match(Res, /* AllowPoison */ false); +inline ap_match m_APIntForbidPoison(const APInt *&Res) { + return ap_match(Res, /* AllowPoison */ false); } /// Match a ConstantFP or splatted ConstantVector, binding the /// specified pointer to the contained APFloat. -inline apfloat_match m_APFloat(const APFloat *&Res) { +inline ap_match m_APFloat(const APFloat *&Res) { // Forbid undefs by default to maintain previous behavior. - return apfloat_match(Res, /* AllowPoison */ false); + return ap_match(Res, /* AllowPoison */ false); } /// Match APFloat while allowing poison in splat vector constants. -inline apfloat_match m_APFloatAllowPoison(const APFloat *&Res) { - return apfloat_match(Res, /* AllowPoison */ true); +inline ap_match m_APFloatAllowPoison(const APFloat *&Res) { + return ap_match(Res, /* AllowPoison */ true); } /// Match APFloat while forbidding poison in splat vector constants. 
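The unified ap_match<> template keeps the public m_APInt/m_APFloat entry points, so existing matchers compile unchanged. A small sketch of typical call sites; the helper names are invented for illustration:

#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

// Sketch: both matchers now bind through the same ap_match<> machinery.
static bool isSplatPowerOfTwo(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  const llvm::APInt *C;
  // Accept poison lanes in vector splats, as before with m_APIntAllowPoison.
  return match(V, m_APIntAllowPoison(C)) && C->isPowerOf2();
}

static bool isSplatNaN(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  const llvm::APFloat *C;
  return match(V, m_APFloat(C)) && C->isNaN();
}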
-inline apfloat_match m_APFloatForbidPoison(const APFloat *&Res) { - return apfloat_match(Res, /* AllowPoison */ false); +inline ap_match m_APFloatForbidPoison(const APFloat *&Res) { + return ap_match(Res, /* AllowPoison */ false); } template struct constantint_match { @@ -1027,7 +1006,7 @@ struct bind_const_intval_ty { template bool match(ITy *V) const { const APInt *ConstInt; - if (!apint_match(ConstInt, /*AllowPoison=*/false).match(V)) + if (!ap_match(ConstInt, /*AllowPoison=*/false).match(V)) return false; if (ConstInt->getActiveBits() > 64) return false; diff --git a/llvm/include/llvm/IR/Verifier.h b/llvm/include/llvm/IR/Verifier.h index 8dbb9c8a41d7e..b15b4d8ca99fe 100644 --- a/llvm/include/llvm/IR/Verifier.h +++ b/llvm/include/llvm/IR/Verifier.h @@ -60,12 +60,13 @@ class TBAAVerifier { /// \name Helper functions used by \c visitTBAAMetadata. /// @{ - MDNode *getFieldNodeFromTBAABaseNode(Instruction &I, const MDNode *BaseNode, - APInt &Offset, bool IsNewFormat); - TBAAVerifier::TBAABaseNodeSummary verifyTBAABaseNode(Instruction &I, + MDNode *getFieldNodeFromTBAABaseNode(const Instruction *I, + const MDNode *BaseNode, APInt &Offset, + bool IsNewFormat); + TBAAVerifier::TBAABaseNodeSummary verifyTBAABaseNode(const Instruction *I, const MDNode *BaseNode, bool IsNewFormat); - TBAABaseNodeSummary verifyTBAABaseNodeImpl(Instruction &I, + TBAABaseNodeSummary verifyTBAABaseNodeImpl(const Instruction *I, const MDNode *BaseNode, bool IsNewFormat); @@ -75,9 +76,9 @@ class TBAAVerifier { public: TBAAVerifier(VerifierSupport *Diagnostic = nullptr) : Diagnostic(Diagnostic) {} - /// Visit an instruction and return true if it is valid, return false if an - /// invalid TBAA is attached. - LLVM_ABI bool visitTBAAMetadata(Instruction &I, const MDNode *MD); + /// Visit an instruction, or a TBAA node itself as part of a metadata, and + /// return true if it is valid, return false if an invalid TBAA is attached. + LLVM_ABI bool visitTBAAMetadata(const Instruction *I, const MDNode *MD); }; /// Check a function for errors, useful for use when debugging a diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h index 323c478691a92..3a9a7f7c25859 100644 --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -15,6 +15,7 @@ #ifndef LLVM_LTO_LTO_H #define LLVM_LTO_LTO_H +#include "llvm/IR/LLVMRemarkStreamer.h" #include "llvm/Support/Compiler.h" #include @@ -91,7 +92,7 @@ LLVM_ABI std::string getThinLTOOutputFile(StringRef Path, StringRef OldPrefix, StringRef NewPrefix); /// Setup optimization remarks. -LLVM_ABI Expected> setupLLVMOptimizationRemarks( +LLVM_ABI Expected setupLLVMOptimizationRemarks( LLVMContext &Context, StringRef RemarksFilename, StringRef RemarksPasses, StringRef RemarksFormat, bool RemarksWithHotness, std::optional RemarksHotnessThreshold = 0, int Count = -1); @@ -579,7 +580,7 @@ class LTO { DenseSet DynamicExportSymbols; // Diagnostic optimization remarks file - std::unique_ptr DiagnosticOutputFile; + LLVMRemarkFileHandle DiagnosticOutputFile; }; /// The resolution for a symbol. 
The linker must provide a SymbolResolution for diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h index 86b488c764e06..48ad5aa64f61f 100644 --- a/llvm/include/llvm/LTO/LTOBackend.h +++ b/llvm/include/llvm/LTO/LTOBackend.h @@ -65,8 +65,7 @@ thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream, Module &M, AddStreamFn IRAddStream = nullptr, const std::vector &CmdArgs = std::vector()); -LLVM_ABI Error -finalizeOptimizationRemarks(std::unique_ptr DiagOutputFile); +LLVM_ABI Error finalizeOptimizationRemarks(LLVMRemarkFileHandle DiagOutputFile); /// Returns the BitcodeModule that is ThinLTO. LLVM_ABI BitcodeModule *findThinLTOModule(MutableArrayRef BMs); diff --git a/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h b/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h index 806d3c5bdfd77..caff198358caa 100644 --- a/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h +++ b/llvm/include/llvm/LTO/legacy/LTOCodeGenerator.h @@ -244,7 +244,7 @@ struct LTOCodeGenerator { bool ShouldInternalize = EnableLTOInternalization; bool ShouldEmbedUselists = false; bool ShouldRestoreGlobalsLinkage = false; - std::unique_ptr DiagnosticOutputFile; + LLVMRemarkFileHandle DiagnosticOutputFile; std::unique_ptr StatsFile = nullptr; std::string SaveIRBeforeOptPath; diff --git a/llvm/include/llvm/MC/DXContainerRootSignature.h b/llvm/include/llvm/MC/DXContainerRootSignature.h index 54677ef70244f..2b08b2439d2c0 100644 --- a/llvm/include/llvm/MC/DXContainerRootSignature.h +++ b/llvm/include/llvm/MC/DXContainerRootSignature.h @@ -74,6 +74,8 @@ struct StaticSampler { uint32_t ShaderRegister; uint32_t RegisterSpace; dxbc::ShaderVisibility ShaderVisibility; + // Version 3 onwards: + uint32_t Flags = 0; }; struct RootParametersContainer { diff --git a/llvm/include/llvm/MC/MCAsmInfoELF.h b/llvm/include/llvm/MC/MCAsmInfoELF.h index c05e4ad78ecd1..e0678888d1003 100644 --- a/llvm/include/llvm/MC/MCAsmInfoELF.h +++ b/llvm/include/llvm/MC/MCAsmInfoELF.h @@ -15,7 +15,7 @@ namespace llvm { class MCAsmInfoELF : public MCAsmInfo { virtual void anchor(); - MCSection *getNonexecutableStackSection(MCContext &Ctx) const final; + MCSection *getNonexecutableStackSection(MCContext &Ctx) const override; void printSwitchToSection(const MCSection &, uint32_t, const Triple &, raw_ostream &) const final; bool useCodeAlign(const MCSection &Sec) const final; diff --git a/llvm/include/llvm/MC/MCCodeEmitter.h b/llvm/include/llvm/MC/MCCodeEmitter.h index 1c454c3795c2c..5f288e9e45c4b 100644 --- a/llvm/include/llvm/MC/MCCodeEmitter.h +++ b/llvm/include/llvm/MC/MCCodeEmitter.h @@ -16,7 +16,6 @@ namespace llvm { class MCFixup; class MCInst; class MCSubtargetInfo; -class raw_ostream; template class SmallVectorImpl; /// MCCodeEmitter - Generic instruction encoding interface. @@ -36,6 +35,12 @@ class LLVM_ABI MCCodeEmitter { virtual void encodeInstruction(const MCInst &Inst, SmallVectorImpl &CB, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const = 0; + +protected: + // Helper function used by CodeEmitterGen for error reporting. + [[noreturn]] static void reportUnsupportedInst(const MCInst &Inst); + [[noreturn]] static void reportUnsupportedOperand(const MCInst &Inst, + unsigned OpNum); }; } // end namespace llvm diff --git a/llvm/include/llvm/MC/MCInstrDesc.h b/llvm/include/llvm/MC/MCInstrDesc.h index 0a4bd17e20738..c2f15b81da02c 100644 --- a/llvm/include/llvm/MC/MCInstrDesc.h +++ b/llvm/include/llvm/MC/MCInstrDesc.h @@ -532,7 +532,7 @@ class MCInstrDesc { /// Returns true if this instruction is a candidate for remat. 
This /// flag is only used in TargetInstrInfo method isTriviallyRematerializable. /// - /// If this flag is set, the isReallyTriviallyReMaterializable() method is + /// If this flag is set, the isReMaterializableImpl() method is /// called to verify the instruction is really rematerializable. bool isRematerializable() const { return Flags & (1ULL << MCID::Rematerializable); diff --git a/llvm/include/llvm/MCA/CustomBehaviour.h b/llvm/include/llvm/MCA/CustomBehaviour.h index 0ce3993be95ba..8ad674c4ecf13 100644 --- a/llvm/include/llvm/MCA/CustomBehaviour.h +++ b/llvm/include/llvm/MCA/CustomBehaviour.h @@ -49,8 +49,7 @@ class InstrPostProcess { /// object after it has been lowered from the MCInst. /// This is generally a less disruptive alternative to modifying the /// scheduling model. - virtual void postProcessInstruction(std::unique_ptr &Inst, - const MCInst &MCI) {} + virtual void postProcessInstruction(Instruction &Inst, const MCInst &MCI) {} // The resetState() method gets invoked at the beginning of each code region // so that targets that override this function can clear any state that they diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index 9bc1918852335..5a5a4dbaae2ad 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -228,11 +228,11 @@ class RootSignature { uint32_t Flags; ViewArray ParametersHeaders; StringRef PartData; - ViewArray StaticSamplers; + ViewArray StaticSamplers; using param_header_iterator = ViewArray::iterator; - using samplers_iterator = ViewArray::iterator; + using samplers_iterator = ViewArray::iterator; public: RootSignature(StringRef PD) : PartData(PD) {} diff --git a/llvm/include/llvm/Object/OffloadBinary.h b/llvm/include/llvm/Object/OffloadBinary.h index b5c845fa8eb70..ac2dbf60e2aec 100644 --- a/llvm/include/llvm/Object/OffloadBinary.h +++ b/llvm/include/llvm/Object/OffloadBinary.h @@ -48,6 +48,7 @@ enum ImageKind : uint16_t { IMG_Cubin, IMG_Fatbinary, IMG_PTX, + IMG_SPIRV, IMG_LAST, }; @@ -70,9 +71,9 @@ class OffloadBinary : public Binary { /// The offloading metadata that will be serialized to a memory buffer. 
struct OffloadingImage { - ImageKind TheImageKind; - OffloadKind TheOffloadKind; - uint32_t Flags; + ImageKind TheImageKind = ImageKind::IMG_None; + OffloadKind TheOffloadKind = OffloadKind::OFK_None; + uint32_t Flags = 0; MapVector StringData; std::unique_ptr Image; }; diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 62bfee7693db1..b5b110d0f59a1 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -178,6 +178,11 @@ struct StaticSamplerYamlDesc { uint32_t ShaderRegister; uint32_t RegisterSpace; dxbc::ShaderVisibility ShaderVisibility; + + LLVM_ABI uint32_t getEncodedFlags() const; + +#define STATIC_SAMPLER_FLAG(Num, Enum, Flag) bool Enum = false; +#include "llvm/BinaryFormat/DXContainerConstants.def" }; struct RootSignatureYamlDesc { diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h index 2742ec1b71b7e..8538a8b2afe14 100644 --- a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -23,6 +23,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/PGOOptions.h" +#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO/Inliner.h" #include "llvm/Transforms/IPO/ModuleInliner.h" @@ -35,10 +36,6 @@ class StringRef; class AAManager; class TargetMachine; class ModuleSummaryIndex; -template class IntrusiveRefCntPtr; -namespace vfs { -class FileSystem; -} // namespace vfs /// Tunable parameters for passes in the default pipelines. class PipelineTuningOptions { @@ -115,6 +112,7 @@ class PassBuilder { PipelineTuningOptions PTO; std::optional PGOOpt; PassInstrumentationCallbacks *PIC; + IntrusiveRefCntPtr FS; public: /// A struct to capture parsed pass pipeline names. @@ -134,7 +132,8 @@ class PassBuilder { TargetMachine *TM = nullptr, PipelineTuningOptions PTO = PipelineTuningOptions(), std::optional PGOOpt = std::nullopt, - PassInstrumentationCallbacks *PIC = nullptr); + PassInstrumentationCallbacks *PIC = nullptr, + IntrusiveRefCntPtr FS = vfs::getRealFileSystem()); /// Cross register the analysis managers through their proxies. /// @@ -632,8 +631,7 @@ class PassBuilder { bool RunProfileGen, bool IsCS, bool AtomicCounterUpdate, std::string ProfileFile, - std::string ProfileRemappingFile, - IntrusiveRefCntPtr FS); + std::string ProfileRemappingFile); /// Returns PIC. External libraries can use this to register pass /// instrumentation callbacks. @@ -641,6 +639,11 @@ class PassBuilder { return PIC; } + /// Returns the virtual file system. + IntrusiveRefCntPtr getVirtualFileSystemPtr() const { + return FS; + } + // Invoke the callbacks registered for the various extension points. // Custom pipelines should use these to invoke the callbacks registered // by TargetMachines and other clients. 
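Since PassBuilder now owns an IntrusiveRefCntPtr to a vfs::FileSystem, callers hand the file system to the constructor once and the PGO pipeline helpers no longer take one per call. A sketch under the assumption that the caller simply wants the real file system:

#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/VirtualFileSystem.h"
#include <optional>

// Sketch: construct a PassBuilder with an explicit VFS; addPGOInstrPasses and
// friends read profiles through getVirtualFileSystemPtr() instead of a
// per-call FileSystem argument.
static void buildWithExplicitVFS(llvm::TargetMachine *TM) {
  auto FS = llvm::vfs::getRealFileSystem();
  llvm::PassBuilder PB(TM, llvm::PipelineTuningOptions(),
                       /*PGOOpt=*/std::nullopt, /*PIC=*/nullptr, FS);
  (void)PB.getVirtualFileSystemPtr(); // same FS, available to pipeline setup
}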
@@ -772,8 +775,7 @@ class PassBuilder { void addPGOInstrPasses(ModulePassManager &MPM, OptimizationLevel Level, bool RunProfileGen, bool IsCS, bool AtomicCounterUpdate, std::string ProfileFile, - std::string ProfileRemappingFile, - IntrusiveRefCntPtr FS); + std::string ProfileRemappingFile); void addPostPGOLoopRotation(ModulePassManager &MPM, OptimizationLevel Level); bool isInstrumentedPGOUse() const; diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index 7d1a85ba528fc..e09958160b9a0 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -1215,19 +1215,19 @@ namespace accessors { /// Return the structural hash associated with the function. template uint64_t getFuncHash(const FuncRecordTy *Record) { - return support::endian::byte_swap(Record->FuncHash); + return support::endian::byte_swap(Record->FuncHash, Endian); } /// Return the coverage map data size for the function. template uint64_t getDataSize(const FuncRecordTy *Record) { - return support::endian::byte_swap(Record->DataSize); + return support::endian::byte_swap(Record->DataSize, Endian); } /// Return the function lookup key. The value is considered opaque. template uint64_t getFuncNameRef(const FuncRecordTy *Record) { - return support::endian::byte_swap(Record->NameRef); + return support::endian::byte_swap(Record->NameRef, Endian); } /// Return the PGO name of the function. Used for formats in which the name is @@ -1280,14 +1280,14 @@ struct CovMapFunctionRecordV1 { /// Return function lookup key. The value is consider opaque. template IntPtrT getFuncNameRef() const { - return support::endian::byte_swap(NamePtr); + return support::endian::byte_swap(NamePtr, Endian); } /// Return the PGO name of the function. template Error getFuncName(InstrProfSymtab &ProfileNames, StringRef &FuncName) const { IntPtrT NameRef = getFuncNameRef(); - uint32_t NameS = support::endian::byte_swap(NameSize); + uint32_t NameS = support::endian::byte_swap(NameSize, Endian); FuncName = ProfileNames.getFuncName(NameRef, NameS); if (NameS && FuncName.empty()) return make_error(coveragemap_error::malformed, @@ -1385,7 +1385,7 @@ struct CovMapFunctionRecordV3 { /// Get the filename set reference. template uint64_t getFilenamesRef() const { - return support::endian::byte_swap(FilenamesRef); + return support::endian::byte_swap(FilenamesRef, Endian); } /// Read the inline coverage mapping. 
Ignore the buffer parameter, it is for @@ -1416,19 +1416,19 @@ struct CovMapHeader { #define COVMAP_HEADER(Type, LLVMType, Name, Init) Type Name; #include "llvm/ProfileData/InstrProfData.inc" template uint32_t getNRecords() const { - return support::endian::byte_swap(NRecords); + return support::endian::byte_swap(NRecords, Endian); } template uint32_t getFilenamesSize() const { - return support::endian::byte_swap(FilenamesSize); + return support::endian::byte_swap(FilenamesSize, Endian); } template uint32_t getCoverageSize() const { - return support::endian::byte_swap(CoverageSize); + return support::endian::byte_swap(CoverageSize, Endian); } template uint32_t getVersion() const { - return support::endian::byte_swap(Version); + return support::endian::byte_swap(Version, Endian); } }; diff --git a/llvm/include/llvm/Remarks/BitstreamRemarkContainer.h b/llvm/include/llvm/Remarks/BitstreamRemarkContainer.h index 48a148a3adc13..d4b70e54bf6bc 100644 --- a/llvm/include/llvm/Remarks/BitstreamRemarkContainer.h +++ b/llvm/include/llvm/Remarks/BitstreamRemarkContainer.h @@ -23,35 +23,35 @@ namespace remarks { /// The current version of the remark container. /// Note: this is different from the version of the remark entry. -constexpr uint64_t CurrentContainerVersion = 0; +constexpr uint64_t CurrentContainerVersion = 1; /// The magic number used for identifying remark blocks. constexpr StringLiteral ContainerMagic("RMRK"); /// Type of the remark container. -/// The remark container has two modes: -/// * separate: the metadata is separate from the remarks and points to the -/// auxiliary file that contains the remarks. -/// * standalone: the metadata and the remarks are emitted together. enum class BitstreamRemarkContainerType { - /// The metadata emitted separately. - /// This will contain the following: - /// * Container version and type - /// * String table - /// * External file - SeparateRemarksMeta, - /// The remarks emitted separately. - /// This will contain the following: - /// * Container version and type - /// * Remark version - SeparateRemarksFile, - /// Everything is emitted together. - /// This will contain the following: - /// * Container version and type - /// * Remark version - /// * String table - Standalone, - First = SeparateRemarksMeta, - Last = Standalone, + /// Emit a link to an external remarks file + /// (usually as a section of the object file, to enable discovery of all + /// remarks files from the final linked object file) + /// RemarksFileExternal: + /// | Meta: + /// | | Container info + /// | | External file + RemarksFileExternal, + /// Emit metadata and remarks into a file + /// RemarksFile: + /// | Meta: + /// | | Container info + /// | | Remark version + /// | Remarks: + /// | | Remark0 + /// | | Remark1 + /// | | Remark2 + /// | | ... + /// | Late Meta: + /// | | String table + RemarksFile, + First = RemarksFileExternal, + Last = RemarksFile }; /// The possible blocks that will be encountered in a bitstream remark diff --git a/llvm/include/llvm/Remarks/BitstreamRemarkSerializer.h b/llvm/include/llvm/Remarks/BitstreamRemarkSerializer.h index 6236800337508..76e2d5b4fd3bc 100644 --- a/llvm/include/llvm/Remarks/BitstreamRemarkSerializer.h +++ b/llvm/include/llvm/Remarks/BitstreamRemarkSerializer.h @@ -27,31 +27,7 @@ struct Remarks; /// Serialize the remarks to LLVM bitstream. /// This class provides ways to emit remarks in the LLVM bitstream format and /// its associated metadata. 
-/// -/// * The separate model: -/// Separate meta: | Container info -/// | String table -/// | External file -/// -/// Separate remarks: | Container info -/// | Remark version -/// | Remark0 -/// | Remark1 -/// | Remark2 -/// | ... -/// -/// * The standalone model: | Container info -/// | String table -/// | Remark version -/// | Remark0 -/// | Remark1 -/// | Remark2 -/// | ... -/// struct BitstreamRemarkSerializerHelper { - /// Buffer used for encoding the bitstream before writing it to the final - /// stream. - SmallVector Encoded; /// Buffer used to construct records and pass to the bitstream writer. SmallVector R; /// The Bitstream writer. @@ -73,7 +49,8 @@ struct BitstreamRemarkSerializerHelper { uint64_t RecordRemarkArgWithDebugLocAbbrevID = 0; uint64_t RecordRemarkArgWithoutDebugLocAbbrevID = 0; - BitstreamRemarkSerializerHelper(BitstreamRemarkContainerType ContainerType); + BitstreamRemarkSerializerHelper(BitstreamRemarkContainerType ContainerType, + raw_ostream &OS); // Disable copy and move: Bitstream points to Encoded, which needs special // handling during copy/move, but moving the vectors is probably useless @@ -104,20 +81,15 @@ struct BitstreamRemarkSerializerHelper { /// The block info for the remarks block. void setupRemarkBlockInfo(); - /// Emit the metadata for the remarks. - void emitMetaBlock(uint64_t ContainerVersion, - std::optional RemarkVersion, - std::optional StrTab = std::nullopt, - std::optional Filename = std::nullopt); + /// Emit the main metadata at the beginning of the file + void emitMetaBlock(std::optional Filename = std::nullopt); + + /// Emit the remaining metadata at the end of the file. Here we emit metadata + /// that is only known once all remarks were emitted. + void emitLateMetaBlock(const StringTable &StrTab); /// Emit a remark block. The string table is required. - void emitRemarkBlock(const Remark &Remark, StringTable &StrTab); - /// Finalize the writing to \p OS. - void flushToStream(raw_ostream &OS); - /// Finalize the writing to a buffer. - /// The contents of the buffer remain valid for the lifetime of the object. - /// Any call to any other function in this class will invalidate the buffer. - StringRef getBuffer(); + void emitRemark(const Remark &Remark, StringTable &StrTab); }; /// Implementation of the remark serializer using LLVM bitstream. @@ -127,68 +99,57 @@ struct BitstreamRemarkSerializer : public RemarkSerializer { /// 2) The metadata block that contains various information about the remarks /// in the file. /// 3) A number of remark blocks. + /// 4) Another metadata block for metadata that is only finalized once all + /// remarks were emitted (e.g. StrTab) - /// We need to set up 1) and 2) first, so that we can emit 3) after. This flag - /// is used to emit the first two blocks only once. - bool DidSetUp = false; - /// The helper to emit bitstream. - BitstreamRemarkSerializerHelper Helper; + /// The helper to emit bitstream. This is nullopt when the Serializer has not + /// been setup yet. + std::optional Helper; /// Construct a serializer that will create its own string table. - BitstreamRemarkSerializer(raw_ostream &OS, SerializerMode Mode); + BitstreamRemarkSerializer(raw_ostream &OS); /// Construct a serializer with a pre-filled string table. - BitstreamRemarkSerializer(raw_ostream &OS, SerializerMode Mode, - StringTable StrTab); + BitstreamRemarkSerializer(raw_ostream &OS, StringTable StrTab); + + ~BitstreamRemarkSerializer() override; /// Emit a remark to the stream. 
This also emits the metadata associated to - /// the remarks based on the SerializerMode specified at construction. - /// This writes the serialized output to the provided stream. + /// the remarks. This writes the serialized output to the provided stream. void emit(const Remark &Remark) override; + + /// Finalize emission of remarks. This emits the late metadata block and + /// flushes internal buffers. It is safe to call this function multiple times, + /// and it is automatically executed on destruction of the Serializer. + void finalize() override; + /// The metadata serializer associated to this remark serializer. Based on the /// container type of the current serializer, the container type of the /// metadata serializer will change. - std::unique_ptr metaSerializer( - raw_ostream &OS, - std::optional ExternalFilename = std::nullopt) override; + std::unique_ptr + metaSerializer(raw_ostream &OS, StringRef ExternalFilename) override; static bool classof(const RemarkSerializer *S) { return S->SerializerFormat == Format::Bitstream; } + +private: + void setup(); }; /// Serializer of metadata for bitstream remarks. struct BitstreamMetaSerializer : public MetaSerializer { - /// This class can be used with [1] a pre-constructed - /// BitstreamRemarkSerializerHelper, or with [2] one that is owned by the meta - /// serializer. In case of [1], we need to be able to store a reference to the - /// object, while in case of [2] we need to store the whole object. - std::optional TmpHelper; - /// The actual helper, that can point to \p TmpHelper or to an external helper - /// object. - BitstreamRemarkSerializerHelper *Helper = nullptr; - - std::optional StrTab; - std::optional ExternalFilename; + std::optional Helper; + + StringRef ExternalFilename; /// Create a new meta serializer based on \p ContainerType. - BitstreamMetaSerializer( - raw_ostream &OS, BitstreamRemarkContainerType ContainerType, - std::optional StrTab = std::nullopt, - std::optional ExternalFilename = std::nullopt) - : MetaSerializer(OS), TmpHelper(std::nullopt), Helper(nullptr), - StrTab(StrTab), ExternalFilename(ExternalFilename) { - TmpHelper.emplace(ContainerType); - Helper = &*TmpHelper; + BitstreamMetaSerializer(raw_ostream &OS, + BitstreamRemarkContainerType ContainerType, + StringRef ExternalFilename) + : MetaSerializer(OS), ExternalFilename(ExternalFilename) { + Helper.emplace(ContainerType, OS); } - /// Create a new meta serializer based on a previously built \p Helper. - BitstreamMetaSerializer( - raw_ostream &OS, BitstreamRemarkSerializerHelper &Helper, - std::optional StrTab = std::nullopt, - std::optional ExternalFilename = std::nullopt) - : MetaSerializer(OS), TmpHelper(std::nullopt), Helper(&Helper), - StrTab(StrTab), ExternalFilename(ExternalFilename) {} - void emit() override; }; diff --git a/llvm/include/llvm/Remarks/RemarkSerializer.h b/llvm/include/llvm/Remarks/RemarkSerializer.h index 05ef14ae5566b..1785152b87c70 100644 --- a/llvm/include/llvm/Remarks/RemarkSerializer.h +++ b/llvm/include/llvm/Remarks/RemarkSerializer.h @@ -26,16 +26,6 @@ namespace remarks { struct Remark; -enum class SerializerMode { - Separate, // A mode where the metadata is serialized separately from the - // remarks. Typically, this is used when the remarks need to be - // streamed to a side file and the metadata is embedded into the - // final result of the compilation. - Standalone // A mode where everything can be retrieved in the same - // file/buffer. Typically, this is used for storing remarks for - // later use. 
-}; - struct MetaSerializer; /// This is the base class for a remark serializer. @@ -45,24 +35,27 @@ struct RemarkSerializer { Format SerializerFormat; /// The open raw_ostream that the remark diagnostics are emitted to. raw_ostream &OS; - /// The serialization mode. - SerializerMode Mode; /// The string table containing all the unique strings used in the output. /// The table can be serialized to be consumed after the compilation. std::optional StrTab; - RemarkSerializer(Format SerializerFormat, raw_ostream &OS, - SerializerMode Mode) - : SerializerFormat(SerializerFormat), OS(OS), Mode(Mode) {} + RemarkSerializer(Format SerializerFormat, raw_ostream &OS) + : SerializerFormat(SerializerFormat), OS(OS) {} - /// This is just an interface. virtual ~RemarkSerializer() = default; + + /// Finalize remark emission (e.g. finish writing metadata, flush internal + /// buffers). It is safe to call this function multiple times, and it should + /// have the same behavior as destructing the RemarkSerializer. + /// After finalizing, the behavior of emit is unspecified. + virtual void finalize() {} + /// Emit a remark to the stream. virtual void emit(const Remark &Remark) = 0; + /// Return the corresponding metadata serializer. virtual std::unique_ptr - metaSerializer(raw_ostream &OS, - std::optional ExternalFilename = std::nullopt) = 0; + metaSerializer(raw_ostream &OS, StringRef ExternalFilename) = 0; }; /// This is the base class for a remark metadata serializer. @@ -79,13 +72,12 @@ struct MetaSerializer { /// Create a remark serializer. LLVM_ABI Expected> -createRemarkSerializer(Format RemarksFormat, SerializerMode Mode, - raw_ostream &OS); +createRemarkSerializer(Format RemarksFormat, raw_ostream &OS); /// Create a remark serializer that uses a pre-filled string table. LLVM_ABI Expected> -createRemarkSerializer(Format RemarksFormat, SerializerMode Mode, - raw_ostream &OS, remarks::StringTable StrTab); +createRemarkSerializer(Format RemarksFormat, raw_ostream &OS, + remarks::StringTable StrTab); } // end namespace remarks } // end namespace llvm diff --git a/llvm/include/llvm/Remarks/RemarkStreamer.h b/llvm/include/llvm/Remarks/RemarkStreamer.h index 5b1cc81cdbf50..dd5bfcbc7ff99 100644 --- a/llvm/include/llvm/Remarks/RemarkStreamer.h +++ b/llvm/include/llvm/Remarks/RemarkStreamer.h @@ -52,6 +52,7 @@ class RemarkStreamer final { public: RemarkStreamer(std::unique_ptr RemarkSerializer, std::optional Filename = std::nullopt); + ~RemarkStreamer(); /// Return the filename that the remark diagnostics are emitted to. std::optional getFilename() const { @@ -61,6 +62,14 @@ class RemarkStreamer final { raw_ostream &getStream() { return RemarkSerializer->OS; } /// Return the serializer used for this stream. remarks::RemarkSerializer &getSerializer() { return *RemarkSerializer; } + + /// Release the underlying RemarkSerializer. Destructing the RemarkStreamer + /// will assert that the RemarkStreamer has been released, to ensure that the + /// remarks were properly finalized. + std::unique_ptr releaseSerializer() { + return std::move(RemarkSerializer); + } + /// Set a pass filter based on a regex \p Filter. /// Returns an error if the regex is invalid. 
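// Editor's sketch, not part of this patch: with SerializerMode removed, a
// serializer is created from a format and an output stream only, remarks are
// emitted one by one, and emission is completed by an explicit finalize()
// (which also runs on destruction). emitOneRemark is an invented helper name;
// error handling is abbreviated.
#include "llvm/Remarks/Remark.h"
#include "llvm/Remarks/RemarkSerializer.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

static llvm::Error emitOneRemark(llvm::raw_ostream &OS,
                                 const llvm::remarks::Remark &R) {
  auto SerializerOrErr = llvm::remarks::createRemarkSerializer(
      llvm::remarks::Format::Bitstream, OS);
  if (!SerializerOrErr)
    return SerializerOrErr.takeError();
  (*SerializerOrErr)->emit(R);
  // Writes the late metadata block (e.g. the string table) and flushes.
  (*SerializerOrErr)->finalize();
  return llvm::Error::success();
}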
Error setFilter(StringRef Filter); diff --git a/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h b/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h index d80464c0fe74a..69b8f9f000e1d 100644 --- a/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h +++ b/llvm/include/llvm/Remarks/YAMLRemarkSerializer.h @@ -36,28 +36,22 @@ struct LLVM_ABI YAMLRemarkSerializer : public RemarkSerializer { /// The YAML streamer. yaml::Output YAMLOutput; - YAMLRemarkSerializer(raw_ostream &OS, SerializerMode Mode, - std::optional StrTab = std::nullopt); + YAMLRemarkSerializer(raw_ostream &OS); + YAMLRemarkSerializer(raw_ostream &OS, StringTable StrTabIn); void emit(const Remark &Remark) override; - std::unique_ptr metaSerializer( - raw_ostream &OS, - std::optional ExternalFilename = std::nullopt) override; + std::unique_ptr + metaSerializer(raw_ostream &OS, StringRef ExternalFilename) override; static bool classof(const RemarkSerializer *S) { return S->SerializerFormat == Format::YAML; } - -protected: - YAMLRemarkSerializer(Format SerializerFormat, raw_ostream &OS, - SerializerMode Mode, - std::optional StrTab = std::nullopt); }; struct LLVM_ABI YAMLMetaSerializer : public MetaSerializer { - std::optional ExternalFilename; + StringRef ExternalFilename; - YAMLMetaSerializer(raw_ostream &OS, std::optional ExternalFilename) + YAMLMetaSerializer(raw_ostream &OS, StringRef ExternalFilename) : MetaSerializer(OS), ExternalFilename(ExternalFilename) {} void emit() override; diff --git a/llvm/include/llvm/Support/Alignment.h b/llvm/include/llvm/Support/Alignment.h index 84773f1d9c37b..a4ca54e26f18d 100644 --- a/llvm/include/llvm/Support/Alignment.h +++ b/llvm/include/llvm/Support/Alignment.h @@ -52,16 +52,8 @@ struct Align { friend unsigned encode(struct MaybeAlign A); friend struct MaybeAlign decodeMaybeAlign(unsigned Value); - /// A trivial type to allow construction of constexpr Align. - /// This is currently needed to workaround a bug in GCC 5.3 which prevents - /// definition of constexpr assign operators. - /// https://stackoverflow.com/questions/46756288/explicitly-defaulted-function-cannot-be-declared-as-constexpr-because-the-implic - /// FIXME: Remove this, make all assign operators constexpr and introduce user - /// defined literals when we don't have to support GCC 5.3 anymore. - /// https://llvm.org/docs/GettingStarted.html#getting-a-modern-host-c-toolchain - struct LogValue { - uint8_t Log; - }; + struct FromShiftValue {}; + constexpr Align(FromShiftValue, uint8_t Shift) : ShiftValue(Shift) {} public: /// Default is byte-aligned. @@ -70,8 +62,8 @@ struct Align { /// checks have been performed when building `Other`. constexpr Align(const Align &Other) = default; constexpr Align(Align &&Other) = default; - Align &operator=(const Align &Other) = default; - Align &operator=(Align &&Other) = default; + constexpr Align &operator=(const Align &Other) = default; + constexpr Align &operator=(Align &&Other) = default; explicit Align(uint64_t Value) { assert(Value > 0 && "Value must not be 0"); @@ -82,7 +74,7 @@ struct Align { /// This is a hole in the type system and should not be abused. /// Needed to interact with C for instance. - uint64_t value() const { return uint64_t(1) << ShiftValue; } + constexpr uint64_t value() const { return uint64_t(1) << ShiftValue; } // Returns the previous alignment. Align previous() const { @@ -94,7 +86,7 @@ struct Align { /// Allow constructions of constexpr Align. 
template constexpr static Align Constant() { - return LogValue{static_cast(ConstantLog2())}; + return Align(FromShiftValue{}, ConstantLog2()); } /// Allow constructions of constexpr Align from types. @@ -102,9 +94,6 @@ struct Align { template constexpr static Align Of() { return Constant>(); } - - /// Constexpr constructor from LogValue type. - constexpr Align(LogValue CA) : ShiftValue(CA.Log) {} }; /// Treats the value 0 as a 1, so Align is always at least 1. diff --git a/llvm/include/llvm/Support/AllocatorBase.h b/llvm/include/llvm/Support/AllocatorBase.h index 0442432250069..6414c5dc5122c 100644 --- a/llvm/include/llvm/Support/AllocatorBase.h +++ b/llvm/include/llvm/Support/AllocatorBase.h @@ -28,6 +28,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/MemAlloc.h" #include +#include namespace llvm { @@ -111,7 +112,7 @@ template class AllocatorHolder : Alloc { public: AllocatorHolder() = default; AllocatorHolder(const Alloc &A) : Alloc(A) {} - AllocatorHolder(Alloc &&A) : Alloc(static_cast(A)) {} + AllocatorHolder(Alloc &&A) : Alloc(std::move(A)) {} Alloc &getAllocator() { return *this; } const Alloc &getAllocator() const { return *this; } }; diff --git a/llvm/include/llvm/Support/BinaryStreamRef.h b/llvm/include/llvm/Support/BinaryStreamRef.h index 47009ff0b96fc..8ca312daa3bd7 100644 --- a/llvm/include/llvm/Support/BinaryStreamRef.h +++ b/llvm/include/llvm/Support/BinaryStreamRef.h @@ -209,7 +209,7 @@ struct BinarySubstreamRef { BinarySubstreamRef keep_front(uint64_t N) const { return slice(0, N); } std::pair split(uint64_t Off) const { - return std::make_pair(keep_front(Off), drop_front(Off)); + return {keep_front(Off), drop_front(Off)}; } uint64_t size() const { return StreamData.getLength(); } diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h index b81df756247c9..dd05c530cc06e 100644 --- a/llvm/include/llvm/Support/CommandLine.h +++ b/llvm/include/llvm/Support/CommandLine.h @@ -1193,6 +1193,31 @@ class LLVM_ABI parser : public basic_parser { //-------------------------------------------------- +template <> +class LLVM_ABI parser> + : public basic_parser> { +public: + parser(Option &O) : basic_parser(O) {} + + // Return true on error. + bool parse(Option &, StringRef, StringRef Arg, + std::optional &Value) { + Value = Arg.str(); + return false; + } + + // Overload in subclass to provide a better default value. + StringRef getValueName() const override { return "optional string"; } + + void printOptionDiff(const Option &O, std::optional V, + const OptVal &Default, size_t GlobalWidth) const; + + // An out-of-line virtual method to provide a 'home' for this class. + void anchor() override; +}; + +//-------------------------------------------------- + extern template class LLVM_TEMPLATE_ABI basic_parser; template <> class LLVM_ABI parser : public basic_parser { diff --git a/llvm/include/llvm/Support/DebugCounter.h b/llvm/include/llvm/Support/DebugCounter.h index 89349d1ebffee..48fc60035b189 100644 --- a/llvm/include/llvm/Support/DebugCounter.h +++ b/llvm/include/llvm/Support/DebugCounter.h @@ -136,7 +136,7 @@ class DebugCounter { // Return the name and description of the counter with the given ID. 
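// Editor's sketch, not part of this patch: with value() and the assignment
// operators now constexpr, Align can participate in constant expressions.
#include "llvm/Support/Alignment.h"

constexpr llvm::Align PageAlign = llvm::Align::Constant<4096>();
static_assert(PageAlign.value() == 4096, "Align stores the log2 shift");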
std::pair getCounterInfo(unsigned ID) const { - return std::make_pair(RegisteredCounters[ID], Counters.lookup(ID).Desc); + return {RegisteredCounters[ID], Counters.lookup(ID).Desc}; } // Iterate through the registered counters diff --git a/llvm/include/llvm/Support/Endian.h b/llvm/include/llvm/Support/Endian.h index 7eb1d7e8dfe7f..51db225841dbe 100644 --- a/llvm/include/llvm/Support/Endian.h +++ b/llvm/include/llvm/Support/Endian.h @@ -49,7 +49,9 @@ template /// Swap the bytes of value to match the given endianness. template -[[nodiscard]] inline value_type byte_swap(value_type value) { +[[nodiscard]] +LLVM_DEPRECATED("Pass endian as a function argument instead", + "byte_swap") inline value_type byte_swap(value_type value) { return byte_swap(value, endian); } @@ -66,7 +68,9 @@ template } template -[[nodiscard]] inline value_type read(const void *memory) { +[[nodiscard]] LLVM_DEPRECATED("Pass endian as a function argument instead", + "read") inline value_type + read(const void *memory) { return read(memory, endian); } @@ -127,7 +131,7 @@ template uint64_t startBit) { assert(startBit < 8); if (startBit == 0) - return read(memory); + return read(memory, endian); else { // Read two values and compose the result from them. value_type val[2]; @@ -135,8 +139,8 @@ template LLVM_ASSUME_ALIGNED( memory, (detail::PickAlignment::value)), sizeof(value_type) * 2); - val[0] = byte_swap(val[0]); - val[1] = byte_swap(val[1]); + val[0] = byte_swap(val[0], endian); + val[1] = byte_swap(val[1], endian); // Shift bits from the lower value into place. make_unsigned_t lowerVal = val[0] >> startBit; @@ -170,8 +174,8 @@ inline void writeAtBitAlignment(void *memory, value_type value, LLVM_ASSUME_ALIGNED( memory, (detail::PickAlignment::value)), sizeof(value_type) * 2); - val[0] = byte_swap(val[0]); - val[1] = byte_swap(val[1]); + val[0] = byte_swap(val[0], endian); + val[1] = byte_swap(val[1], endian); // Mask off any existing bits in the upper part of the lower value that // we want to replace. @@ -199,8 +203,8 @@ inline void writeAtBitAlignment(void *memory, value_type value, val[1] |= upperVal; // Finally, rewrite values. - val[0] = byte_swap(val[0]); - val[1] = byte_swap(val[1]); + val[0] = byte_swap(val[0], endian); + val[1] = byte_swap(val[1], endian); memcpy(LLVM_ASSUME_ALIGNED( memory, (detail::PickAlignment::value)), &val[0], sizeof(value_type) * 2); @@ -223,8 +227,8 @@ struct packed_endian_specific_integral { explicit packed_endian_specific_integral(value_type val) { *this = val; } value_type value() const { - return endian::read( - (const void*)Value.buffer); + return endian::read((const void *)Value.buffer, + endian); } operator value_type() const { return value(); } @@ -263,7 +267,7 @@ struct packed_endian_specific_integral { explicit ref(void *Ptr) : Ptr(Ptr) {} operator value_type() const { - return endian::read(Ptr); + return endian::read(Ptr, endian); } void operator=(value_type NewValue) { diff --git a/llvm/include/llvm/Support/FileCollector.h b/llvm/include/llvm/Support/FileCollector.h index b00bf3174e654..9cc6776b948ba 100644 --- a/llvm/include/llvm/Support/FileCollector.h +++ b/llvm/include/llvm/Support/FileCollector.h @@ -81,19 +81,25 @@ class LLVM_ABI FileCollector : public FileCollectorBase { /// Canonicalize a pair of virtual and real paths. LLVM_ABI PathStorage canonicalize(StringRef SrcPath); + explicit PathCanonicalizer(IntrusiveRefCntPtr VFS) + : VFS(std::move(VFS)) {} + private: /// Replace with a (mostly) real path, or don't modify. 
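// Editor's sketch, not part of this patch: the template-parameter forms of
// byte_swap/read in Endian.h are deprecated above in favor of passing the
// endianness as a regular argument. readLE32 is an invented name.
#include "llvm/Support/Endian.h"
#include <cstdint>

static uint32_t readLE32(const void *Memory) {
  // Equivalent to the deprecated read<uint32_t, llvm::endianness::little>(Memory).
  return llvm::support::endian::read<uint32_t>(Memory, llvm::endianness::little);
}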
Resolves symlinks /// in the directory, using \a CachedDirs to avoid redundant lookups, but /// leaves the filename as a possible symlink. void updateWithRealPath(SmallVectorImpl &Path); + IntrusiveRefCntPtr VFS; + StringMap CachedDirs; }; /// \p Root is the directory where collected files are will be stored. /// \p OverlayRoot is VFS mapping root. /// \p Root directory gets created in copyFiles unless it already exists. - FileCollector(std::string Root, std::string OverlayRoot); + FileCollector(std::string Root, std::string OverlayRoot, + IntrusiveRefCntPtr VFS); /// Write the yaml mapping (for the VFS) to the given file. std::error_code writeMapping(StringRef MappingFile); diff --git a/llvm/include/llvm/Support/FormatProviders.h b/llvm/include/llvm/Support/FormatProviders.h index 3e0800e1efe6c..9147782055574 100644 --- a/llvm/include/llvm/Support/FormatProviders.h +++ b/llvm/include/llvm/Support/FormatProviders.h @@ -389,7 +389,7 @@ template class format_provider> { StringRef Sep = consumeOneOption(Style, '$', ", "); StringRef Args = consumeOneOption(Style, '@', ""); assert(Style.empty() && "Unexpected text in range option string!"); - return std::make_pair(Sep, Args); + return {Sep, Args}; } public: diff --git a/llvm/include/llvm/Support/MD5.h b/llvm/include/llvm/Support/MD5.h index 66e2119f8a132..ed29826bab0cb 100644 --- a/llvm/include/llvm/Support/MD5.h +++ b/llvm/include/llvm/Support/MD5.h @@ -57,7 +57,7 @@ class MD5 { } std::pair words() const { using namespace support; - return std::make_pair(high(), low()); + return {high(), low()}; } }; diff --git a/llvm/include/llvm/Support/OnDiskHashTable.h b/llvm/include/llvm/Support/OnDiskHashTable.h index f6b4055e74de7..d7d72cfbbc649 100644 --- a/llvm/include/llvm/Support/OnDiskHashTable.h +++ b/llvm/include/llvm/Support/OnDiskHashTable.h @@ -309,7 +309,7 @@ template class OnDiskChainedHashTable { offset_type NumEntries = endian::readNext( Buckets); - return std::make_pair(NumBuckets, NumEntries); + return {NumBuckets, NumEntries}; } offset_type getNumBuckets() const { return NumBuckets; } diff --git a/llvm/include/llvm/Support/PGOOptions.h b/llvm/include/llvm/Support/PGOOptions.h index 6527a18258bf8..fb1dc0cf4aa0a 100644 --- a/llvm/include/llvm/Support/PGOOptions.h +++ b/llvm/include/llvm/Support/PGOOptions.h @@ -14,16 +14,10 @@ #ifndef LLVM_SUPPORT_PGOOPTIONS_H #define LLVM_SUPPORT_PGOOPTIONS_H -#include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" namespace llvm { - -namespace vfs { -class FileSystem; -} // namespace vfs - /// A struct capturing PGO tunables. 
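// Editor's sketch, not part of this patch, referring to the FileCollector
// change above: the collector now receives the file system it canonicalizes
// paths against, typically the real file system in production callers.
// makeCollector is an invented helper name.
#include "llvm/Support/FileCollector.h"
#include "llvm/Support/VirtualFileSystem.h"
#include <memory>
#include <string>

static std::shared_ptr<llvm::FileCollector>
makeCollector(std::string Root, std::string OverlayRoot) {
  return std::make_shared<llvm::FileCollector>(
      std::move(Root), std::move(OverlayRoot), llvm::vfs::getRealFileSystem());
}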
struct PGOOptions { enum PGOAction { NoAction, IRInstr, IRUse, SampleUse }; @@ -31,9 +25,7 @@ struct PGOOptions { enum class ColdFuncOpt { Default, OptSize, MinSize, OptNone }; LLVM_ABI PGOOptions(std::string ProfileFile, std::string CSProfileGenFile, std::string ProfileRemappingFile, - std::string MemoryProfile, - IntrusiveRefCntPtr FS, - PGOAction Action = NoAction, + std::string MemoryProfile, PGOAction Action = NoAction, CSPGOAction CSAction = NoCSAction, ColdFuncOpt ColdType = ColdFuncOpt::Default, bool DebugInfoForProfiling = false, @@ -53,7 +45,6 @@ struct PGOOptions { bool DebugInfoForProfiling; bool PseudoProbeForProfiling; bool AtomicCounterUpdate; - IntrusiveRefCntPtr FS; }; } // namespace llvm diff --git a/llvm/include/llvm/Support/ProgramStack.h b/llvm/include/llvm/Support/ProgramStack.h index 0dd8235b90c06..13729a2990588 100644 --- a/llvm/include/llvm/Support/ProgramStack.h +++ b/llvm/include/llvm/Support/ProgramStack.h @@ -46,17 +46,15 @@ LLVM_ABI unsigned getDefaultStackSize(); LLVM_ABI void runOnNewStack(unsigned StackSize, function_ref Fn); template -std::enable_if_t, R> -runOnNewStack(unsigned StackSize, function_ref Fn, Ts &&...Args) { - std::optional Ret; - runOnNewStack(StackSize, [&]() { Ret = Fn(std::forward(Args)...); }); - return std::move(*Ret); -} - -template -void runOnNewStack(unsigned StackSize, function_ref Fn, +auto runOnNewStack(unsigned StackSize, function_ref Fn, Ts &&...Args) { - runOnNewStack(StackSize, [&]() { Fn(std::forward(Args)...); }); + if constexpr (std::is_same_v) { + runOnNewStack(StackSize, [&]() { Fn(std::forward(Args)...); }); + } else { + std::optional Ret; + runOnNewStack(StackSize, [&]() { Ret = Fn(std::forward(Args)...); }); + return std::move(*Ret); + } } } // namespace llvm diff --git a/llvm/include/llvm/Support/Registry.h b/llvm/include/llvm/Support/Registry.h index ff9226c39359c..c02f15e5e32b8 100644 --- a/llvm/include/llvm/Support/Registry.h +++ b/llvm/include/llvm/Support/Registry.h @@ -58,8 +58,8 @@ namespace llvm { // declaration causing error C2487 "member of dll interface class may not // be declared with dll interface". // https://developercommunity.visualstudio.com/t/c2487-in-dllexport-class-with-static-members/69878 - static node *Head; - static node *Tail; + static inline node *Head = nullptr; + static inline node *Tail = nullptr; public: /// Node in linked list of entries. @@ -143,19 +143,11 @@ namespace llvm { /// Instantiate a registry class. 
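// Editor's sketch, not part of this patch: PGOOptions no longer carries a
// virtual file system handle, so a profile-use configuration is built from the
// profile paths and the action alone, following the constructor above.
// makeProfileUseOptions is an invented name.
#include "llvm/Support/PGOOptions.h"
#include <string>

static llvm::PGOOptions makeProfileUseOptions(std::string ProfilePath) {
  return llvm::PGOOptions(std::move(ProfilePath), /*CSProfileGenFile=*/"",
                          /*ProfileRemappingFile=*/"", /*MemoryProfile=*/"",
                          llvm::PGOOptions::IRUse);
}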
#define LLVM_INSTANTIATE_REGISTRY(REGISTRY_CLASS) \ namespace llvm { \ - template \ - typename Registry::node *Registry::Head = nullptr; \ - template \ - typename Registry::node *Registry::Tail = nullptr; \ template class LLVM_ABI_EXPORT Registry; \ } #else #define LLVM_INSTANTIATE_REGISTRY(REGISTRY_CLASS) \ namespace llvm { \ - template \ - typename Registry::node *Registry::Head = nullptr; \ - template \ - typename Registry::node *Registry::Tail = nullptr; \ template class Registry; \ } #endif diff --git a/llvm/include/llvm/Support/SMLoc.h b/llvm/include/llvm/Support/SMLoc.h index d7dde81ce0be7..c80969b1d83dc 100644 --- a/llvm/include/llvm/Support/SMLoc.h +++ b/llvm/include/llvm/Support/SMLoc.h @@ -28,8 +28,8 @@ class SMLoc { constexpr bool isValid() const { return Ptr != nullptr; } - constexpr bool operator==(const SMLoc &RHS) const { return RHS.Ptr == Ptr; } - constexpr bool operator!=(const SMLoc &RHS) const { return RHS.Ptr != Ptr; } + constexpr bool operator==(SMLoc RHS) const { return RHS.Ptr == Ptr; } + constexpr bool operator!=(SMLoc RHS) const { return RHS.Ptr != Ptr; } constexpr const char *getPointer() const { return Ptr; } diff --git a/llvm/include/llvm/Support/ScaledNumber.h b/llvm/include/llvm/Support/ScaledNumber.h index 87a56809976a3..07baf153e10c6 100644 --- a/llvm/include/llvm/Support/ScaledNumber.h +++ b/llvm/include/llvm/Support/ScaledNumber.h @@ -57,8 +57,8 @@ inline std::pair getRounded(DigitsT Digits, int16_t Scale, if (ShouldRound) if (!++Digits) // Overflow. - return std::make_pair(DigitsT(1) << (getWidth() - 1), Scale + 1); - return std::make_pair(Digits, Scale); + return {DigitsT(1) << (getWidth() - 1), Scale + 1}; + return {Digits, Scale}; } /// Convenience helper for 32-bit rounding. @@ -83,7 +83,7 @@ inline std::pair getAdjusted(uint64_t Digits, const int Width = getWidth(); if (Width == 64 || Digits <= std::numeric_limits::max()) - return std::make_pair(Digits, Scale); + return {Digits, Scale}; // Shift right and round. int Shift = llvm::bit_width(Digits) - Width; @@ -160,9 +160,9 @@ std::pair getQuotient(DigitsT Dividend, DigitsT Divisor) { // Check for zero. if (!Dividend) - return std::make_pair(0, 0); + return {0, 0}; if (!Divisor) - return std::make_pair(std::numeric_limits::max(), MaxScale); + return {std::numeric_limits::max(), MaxScale}; if (getWidth() == 64) return divide64(Dividend, Divisor); @@ -192,7 +192,7 @@ inline std::pair getLgImpl(DigitsT Digits, int16_t Scale) { static_assert(!std::numeric_limits::is_signed, "expected unsigned"); if (!Digits) - return std::make_pair(INT32_MIN, 0); + return {INT32_MIN, 0}; // Get the floor of the lg of Digits. static_assert(sizeof(Digits) <= sizeof(uint64_t)); @@ -201,12 +201,12 @@ inline std::pair getLgImpl(DigitsT Digits, int16_t Scale) { // Get the actual floor. int32_t Floor = Scale + LocalFloor; if (Digits == UINT64_C(1) << LocalFloor) - return std::make_pair(Floor, 0); + return {Floor, 0}; // Round based on the next digit. assert(LocalFloor >= 1); bool Round = Digits & UINT64_C(1) << (LocalFloor - 1); - return std::make_pair(Floor + Round, Round ? 1 : -1); + return {Floor + Round, Round ? 1 : -1}; } /// Get the lg (rounded) of a scaled number. @@ -348,11 +348,11 @@ std::pair getSum(DigitsT LDigits, int16_t LScale, // Compute sum. DigitsT Sum = LDigits + RDigits; if (Sum >= RDigits) - return std::make_pair(Sum, Scale); + return {Sum, Scale}; // Adjust sum after arithmetic overflow. 
DigitsT HighBit = DigitsT(1) << (getWidth() - 1); - return std::make_pair(HighBit | Sum >> 1, Scale + 1); + return {HighBit | Sum >> 1, Scale + 1}; } /// Convenience helper for 32-bit sum. @@ -384,18 +384,18 @@ std::pair getDifference(DigitsT LDigits, int16_t LScale, // Compute difference. if (LDigits <= RDigits) - return std::make_pair(0, 0); + return {0, 0}; if (RDigits || !SavedRDigits) - return std::make_pair(LDigits - RDigits, LScale); + return {LDigits - RDigits, LScale}; // Check if RDigits just barely lost its last bit. E.g., for 32-bit: // // 1*2^32 - 1*2^0 == 0xffffffff != 1*2^32 const auto RLgFloor = getLgFloor(SavedRDigits, SavedRScale); if (!compare(LDigits, LScale, DigitsT(1), RLgFloor + getWidth())) - return std::make_pair(std::numeric_limits::max(), RLgFloor); + return {std::numeric_limits::max(), RLgFloor}; - return std::make_pair(LDigits, LScale); + return {LDigits, LScale}; } /// Convenience helper for 32-bit difference. @@ -435,9 +435,9 @@ class ScaledNumberBase { static std::pair splitSigned(int64_t N) { if (N >= 0) - return std::make_pair(N, false); + return {N, false}; uint64_t Unsigned = N == INT64_MIN ? UINT64_C(1) << 63 : uint64_t(-N); - return std::make_pair(Unsigned, true); + return {Unsigned, true}; } static int64_t joinSigned(uint64_t U, bool IsNeg) { if (U > uint64_t(INT64_MAX)) diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index b905576b61791..7710e2fc2f22b 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -745,6 +745,7 @@ HANDLE_TARGET_OPCODE(G_SET_FPMODE) HANDLE_TARGET_OPCODE(G_RESET_FPMODE) HANDLE_TARGET_OPCODE(G_GET_ROUNDING) +HANDLE_TARGET_OPCODE(G_SET_ROUNDING) /// Generic pointer offset HANDLE_TARGET_OPCODE(G_PTR_ADD) diff --git a/llvm/include/llvm/Support/TrailingObjects.h b/llvm/include/llvm/Support/TrailingObjects.h index d7211a930ae49..3eb7c0bd1f379 100644 --- a/llvm/include/llvm/Support/TrailingObjects.h +++ b/llvm/include/llvm/Support/TrailingObjects.h @@ -57,25 +57,9 @@ namespace llvm { namespace trailing_objects_internal { -/// Helper template to calculate the max alignment requirement for a set of -/// objects. -template class AlignmentCalcHelper { -private: - enum { - FirstAlignment = alignof(First), - RestAlignment = AlignmentCalcHelper::Alignment, - }; -public: - enum { - Alignment = FirstAlignment > RestAlignment ? FirstAlignment : RestAlignment - }; -}; - -template class AlignmentCalcHelper { -public: - enum { Alignment = alignof(First) }; -}; +template +inline constexpr size_t MaxAlignment = std::max({alignof(T)...}); /// The base class for TrailingObjects* classes. class TrailingObjectsBase { @@ -209,11 +193,10 @@ class alignas(Align) TrailingObjectsImpl /// See the file comment for details on the usage of the /// TrailingObjects type. 
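// Editor's sketch, not part of this patch: the recursive AlignmentCalcHelper is
// replaced by the MaxAlignment variable template above, which simply folds
// std::max over alignof of the trailing types.
#include "llvm/Support/TrailingObjects.h"
#include <cstdint>

static_assert(llvm::trailing_objects_internal::MaxAlignment<char, uint64_t> ==
                  alignof(uint64_t),
              "maximum of the individual alignments");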
template -class TrailingObjects : private trailing_objects_internal::TrailingObjectsImpl< - trailing_objects_internal::AlignmentCalcHelper< - TrailingTys...>::Alignment, - BaseTy, TrailingObjects, - BaseTy, TrailingTys...> { +class TrailingObjects + : private trailing_objects_internal::TrailingObjectsImpl< + trailing_objects_internal::MaxAlignment, BaseTy, + TrailingObjects, BaseTy, TrailingTys...> { template friend class trailing_objects_internal::TrailingObjectsImpl; @@ -221,8 +204,8 @@ class TrailingObjects : private trailing_objects_internal::TrailingObjectsImpl< template class Foo {}; typedef trailing_objects_internal::TrailingObjectsImpl< - trailing_objects_internal::AlignmentCalcHelper::Alignment, - BaseTy, TrailingObjects, BaseTy, TrailingTys...> + trailing_objects_internal::MaxAlignment, BaseTy, + TrailingObjects, BaseTy, TrailingTys...> ParentType; using TrailingObjectsBase = trailing_objects_internal::TrailingObjectsBase; diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h index 81e3e2e41e86d..3d36f41ca1a04 100644 --- a/llvm/include/llvm/Support/YAMLTraits.h +++ b/llvm/include/llvm/Support/YAMLTraits.h @@ -705,7 +705,7 @@ class LLVM_ABI IO { virtual bool mapTag(StringRef Tag, bool Default = false) = 0; virtual void beginMapping() = 0; virtual void endMapping() = 0; - virtual bool preflightKey(const char *, bool, bool, bool &, void *&) = 0; + virtual bool preflightKey(StringRef, bool, bool, bool &, void *&) = 0; virtual void postflightKey(void *) = 0; virtual std::vector keys() = 0; @@ -713,12 +713,12 @@ class LLVM_ABI IO { virtual void endFlowMapping() = 0; virtual void beginEnumScalar() = 0; - virtual bool matchEnumScalar(const char *, bool) = 0; + virtual bool matchEnumScalar(StringRef, bool) = 0; virtual bool matchEnumFallback() = 0; virtual void endEnumScalar() = 0; virtual bool beginBitSetScalar(bool &) = 0; - virtual bool bitSetMatch(const char *, bool) = 0; + virtual bool bitSetMatch(StringRef, bool) = 0; virtual void endBitSetScalar() = 0; virtual void scalarString(StringRef &, QuotingType) = 0; @@ -731,8 +731,7 @@ class LLVM_ABI IO { virtual std::error_code error() = 0; virtual void setAllowUnknownKeys(bool Allow); - template - void enumCase(T &Val, const char *Str, const T ConstVal) { + template void enumCase(T &Val, StringRef Str, const T ConstVal) { if (matchEnumScalar(Str, outputting() && Val == ConstVal)) { Val = ConstVal; } @@ -740,7 +739,7 @@ class LLVM_ABI IO { // allow anonymous enum values to be used with LLVM_YAML_STRONG_TYPEDEF template - void enumCase(T &Val, const char *Str, const uint32_t ConstVal) { + void enumCase(T &Val, StringRef Str, const uint32_t ConstVal) { if (matchEnumScalar(Str, outputting() && Val == static_cast(ConstVal))) { Val = ConstVal; } @@ -757,7 +756,7 @@ class LLVM_ABI IO { } template - void bitSetCase(T &Val, const char *Str, const T ConstVal) { + void bitSetCase(T &Val, StringRef Str, const T ConstVal) { if (bitSetMatch(Str, outputting() && (Val & ConstVal) == ConstVal)) { Val = static_cast(Val | ConstVal); } @@ -765,20 +764,20 @@ class LLVM_ABI IO { // allow anonymous enum values to be used with LLVM_YAML_STRONG_TYPEDEF template - void bitSetCase(T &Val, const char *Str, const uint32_t ConstVal) { + void bitSetCase(T &Val, StringRef Str, const uint32_t ConstVal) { if (bitSetMatch(Str, outputting() && (Val & ConstVal) == ConstVal)) { Val = static_cast(Val | ConstVal); } } template - void maskedBitSetCase(T &Val, const char *Str, T ConstVal, T Mask) { + void maskedBitSetCase(T &Val, StringRef 
Str, T ConstVal, T Mask) { if (bitSetMatch(Str, outputting() && (Val & Mask) == ConstVal)) Val = Val | ConstVal; } template - void maskedBitSetCase(T &Val, const char *Str, uint32_t ConstVal, + void maskedBitSetCase(T &Val, StringRef Str, uint32_t ConstVal, uint32_t Mask) { if (bitSetMatch(Str, outputting() && (Val & Mask) == ConstVal)) Val = Val | ConstVal; @@ -787,29 +786,29 @@ class LLVM_ABI IO { void *getContext() const; void setContext(void *); - template void mapRequired(const char *Key, T &Val) { + template void mapRequired(StringRef Key, T &Val) { EmptyContext Ctx; this->processKey(Key, Val, true, Ctx); } template - void mapRequired(const char *Key, T &Val, Context &Ctx) { + void mapRequired(StringRef Key, T &Val, Context &Ctx) { this->processKey(Key, Val, true, Ctx); } - template void mapOptional(const char *Key, T &Val) { + template void mapOptional(StringRef Key, T &Val) { EmptyContext Ctx; mapOptionalWithContext(Key, Val, Ctx); } template - void mapOptional(const char *Key, T &Val, const DefaultT &Default) { + void mapOptional(StringRef Key, T &Val, const DefaultT &Default) { EmptyContext Ctx; mapOptionalWithContext(Key, Val, Default, Ctx); } template - void mapOptionalWithContext(const char *Key, T &Val, Context &Ctx) { + void mapOptionalWithContext(StringRef Key, T &Val, Context &Ctx) { if constexpr (has_SequenceTraits::value) { // omit key/value instead of outputting empty sequence if (this->canElideEmptySequence() && Val.begin() == Val.end()) @@ -819,14 +818,14 @@ class LLVM_ABI IO { } template - void mapOptionalWithContext(const char *Key, std::optional &Val, + void mapOptionalWithContext(StringRef Key, std::optional &Val, Context &Ctx) { this->processKeyWithDefault(Key, Val, std::optional(), /*Required=*/false, Ctx); } template - void mapOptionalWithContext(const char *Key, T &Val, const DefaultT &Default, + void mapOptionalWithContext(StringRef Key, T &Val, const DefaultT &Default, Context &Ctx) { static_assert(std::is_convertible::value, "Default type must be implicitly convertible to value type!"); @@ -836,12 +835,12 @@ class LLVM_ABI IO { private: template - void processKeyWithDefault(const char *Key, std::optional &Val, + void processKeyWithDefault(StringRef Key, std::optional &Val, const std::optional &DefaultValue, bool Required, Context &Ctx); template - void processKeyWithDefault(const char *Key, T &Val, const T &DefaultValue, + void processKeyWithDefault(StringRef Key, T &Val, const T &DefaultValue, bool Required, Context &Ctx) { void *SaveInfo; bool UseDefault; @@ -857,7 +856,7 @@ class LLVM_ABI IO { } template - void processKey(const char *Key, T &Val, bool Required, Context &Ctx) { + void processKey(StringRef Key, T &Val, bool Required, Context &Ctx) { void *SaveInfo; bool UseDefault; if (this->preflightKey(Key, Required, false, UseDefault, SaveInfo)) { @@ -1332,7 +1331,7 @@ class LLVM_ABI Input : public IO { bool mapTag(StringRef, bool) override; void beginMapping() override; void endMapping() override; - bool preflightKey(const char *, bool, bool, bool &, void *&) override; + bool preflightKey(StringRef Key, bool, bool, bool &, void *&) override; void postflightKey(void *) override; std::vector keys() override; void beginFlowMapping() override; @@ -1346,11 +1345,11 @@ class LLVM_ABI Input : public IO { void postflightFlowElement(void *) override; void endFlowSequence() override; void beginEnumScalar() override; - bool matchEnumScalar(const char *, bool) override; + bool matchEnumScalar(StringRef, bool) override; bool matchEnumFallback() override; void 
endEnumScalar() override; bool beginBitSetScalar(bool &) override; - bool bitSetMatch(const char *, bool) override; + bool bitSetMatch(StringRef, bool) override; void endBitSetScalar() override; void scalarString(StringRef &, QuotingType) override; void blockScalarString(StringRef &) override; @@ -1483,7 +1482,7 @@ class LLVM_ABI Output : public IO { bool mapTag(StringRef, bool) override; void beginMapping() override; void endMapping() override; - bool preflightKey(const char *key, bool, bool, bool &, void *&) override; + bool preflightKey(StringRef Key, bool, bool, bool &, void *&) override; void postflightKey(void *) override; std::vector keys() override; void beginFlowMapping() override; @@ -1497,11 +1496,11 @@ class LLVM_ABI Output : public IO { void postflightFlowElement(void *) override; void endFlowSequence() override; void beginEnumScalar() override; - bool matchEnumScalar(const char *, bool) override; + bool matchEnumScalar(StringRef, bool) override; bool matchEnumFallback() override; void endEnumScalar() override; bool beginBitSetScalar(bool &) override; - bool bitSetMatch(const char *, bool) override; + bool bitSetMatch(StringRef, bool) override; void endBitSetScalar() override; void scalarString(StringRef &, QuotingType) override; void blockScalarString(StringRef &) override; @@ -1558,7 +1557,7 @@ class LLVM_ABI Output : public IO { }; template -void IO::processKeyWithDefault(const char *Key, std::optional &Val, +void IO::processKeyWithDefault(StringRef Key, std::optional &Val, const std::optional &DefaultValue, bool Required, Context &Ctx) { assert(!DefaultValue && "std::optional shouldn't have a value!"); diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index d4fa1e5d65749..cb2721aba4f25 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -1577,7 +1577,7 @@ class RecordVal { } /// Get the source location of the point where the field was defined. - const SMLoc &getLoc() const { return Loc; } + SMLoc getLoc() const { return Loc; } /// Is this a field where nonconcrete values are okay? bool isNonconcreteOK() const { diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index ce4750db88c9a..733d10b1c5f3c 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -1273,6 +1273,12 @@ def G_GET_ROUNDING : GenericInstruction { let hasSideEffects = true; } +def G_SET_ROUNDING : GenericInstruction { + let OutOperandList = (outs); + let InOperandList = (ins type0:$src); + let hasSideEffects = true; +} + //------------------------------------------------------------------------------ // Memory ops //------------------------------------------------------------------------------ diff --git a/llvm/include/llvm/Transforms/Coroutines/CoroInstr.h b/llvm/include/llvm/Transforms/Coroutines/CoroInstr.h index 0688068167ae6..38daf25cacd83 100644 --- a/llvm/include/llvm/Transforms/Coroutines/CoroInstr.h +++ b/llvm/include/llvm/Transforms/Coroutines/CoroInstr.h @@ -428,6 +428,18 @@ class CoroFrameInst : public IntrinsicInst { } }; +/// This represents the llvm.coro.is_in_ramp instruction. 
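// Editor's sketch, not part of this patch: user-side YAML trait code is
// unaffected by the const char* -> StringRef migration above, since string
// literals convert implicitly. A typical enumeration trait for reference
// (Fruit is an invented example type):
#include "llvm/Support/YAMLTraits.h"

enum class Fruit { Apple, Pear };

namespace llvm {
namespace yaml {
template <> struct ScalarEnumerationTraits<Fruit> {
  static void enumeration(IO &Io, Fruit &Value) {
    Io.enumCase(Value, "apple", Fruit::Apple);
    Io.enumCase(Value, "pear", Fruit::Pear);
  }
};
} // namespace yaml
} // namespace llvm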
+class CoroIsInRampInst : public IntrinsicInst { +public: + // Methods to support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return I->getIntrinsicID() == Intrinsic::coro_is_in_ramp; + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } +}; + /// This represents the llvm.coro.free instruction. class CoroFreeInst : public IntrinsicInst { enum { IdArg, FrameArg }; diff --git a/llvm/include/llvm/Transforms/Coroutines/CoroShape.h b/llvm/include/llvm/Transforms/Coroutines/CoroShape.h index c54081de2d9da..11b004572957f 100644 --- a/llvm/include/llvm/Transforms/Coroutines/CoroShape.h +++ b/llvm/include/llvm/Transforms/Coroutines/CoroShape.h @@ -53,6 +53,7 @@ enum class ABI { struct Shape { CoroBeginInst *CoroBegin = nullptr; SmallVector CoroEnds; + SmallVector CoroIsInRampInsts; SmallVector CoroSizes; SmallVector CoroAligns; SmallVector CoroSuspends; @@ -65,6 +66,7 @@ struct Shape { void clear() { CoroBegin = nullptr; CoroEnds.clear(); + CoroIsInRampInsts.clear(); CoroSizes.clear(); CoroAligns.clear(); CoroSuspends.clear(); diff --git a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h index 4e0e9010b42f0..6d2def3d2b72d 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h +++ b/llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h @@ -14,6 +14,7 @@ #define LLVM_TRANSFORMS_INSTRUMENTATION_ADDRESSSANITIZERCOMMON_H #include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/InterestingMemoryOperand.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instruction.h" @@ -21,38 +22,6 @@ #include "llvm/IR/Module.h" namespace llvm { - -class InterestingMemoryOperand { -public: - Use *PtrUse; - bool IsWrite; - Type *OpType; - TypeSize TypeStoreSize = TypeSize::getFixed(0); - MaybeAlign Alignment; - // The mask Value, if we're looking at a masked load/store. - Value *MaybeMask; - // The EVL Value, if we're looking at a vp intrinsic. - Value *MaybeEVL; - // The Stride Value, if we're looking at a strided load/store. - Value *MaybeStride; - - InterestingMemoryOperand(Instruction *I, unsigned OperandNo, bool IsWrite, - class Type *OpType, MaybeAlign Alignment, - Value *MaybeMask = nullptr, - Value *MaybeEVL = nullptr, - Value *MaybeStride = nullptr) - : IsWrite(IsWrite), OpType(OpType), Alignment(Alignment), - MaybeMask(MaybeMask), MaybeEVL(MaybeEVL), MaybeStride(MaybeStride) { - const DataLayout &DL = I->getDataLayout(); - TypeStoreSize = DL.getTypeStoreSizeInBits(OpType); - PtrUse = &I->getOperandUse(OperandNo); - } - - Instruction *getInsn() { return cast(PtrUse->getUser()); } - - Value *getPtr() { return PtrUse->get(); } -}; - // Get AddressSanitizer parameters. 
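// Editor's sketch, not part of this patch: the new wrapper class is used like
// the other Coro* instruction classes, via isa/dyn_cast on instructions.
// isCoroRampCheck is an invented name.
#include "llvm/Transforms/Coroutines/CoroInstr.h"

static bool isCoroRampCheck(const llvm::Instruction &I) {
  return llvm::isa<llvm::CoroIsInRampInst>(&I);
}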
void getAddressSanitizerParams(const Triple &TargetTriple, int LongSize, bool IsKasan, uint64_t *ShadowBase, diff --git a/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h index af3662e4a6565..9c9d6afe1872f 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h +++ b/llvm/include/llvm/Transforms/Instrumentation/DataFlowSanitizer.h @@ -10,6 +10,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/VirtualFileSystem.h" #include #include @@ -19,11 +20,13 @@ class Module; class DataFlowSanitizerPass : public PassInfoMixin { private: std::vector ABIListFiles; + IntrusiveRefCntPtr FS; public: DataFlowSanitizerPass( - const std::vector &ABIListFiles = std::vector()) - : ABIListFiles(ABIListFiles) {} + const std::vector &ABIListFiles = std::vector(), + IntrusiveRefCntPtr FS = vfs::getRealFileSystem()) + : ABIListFiles(ABIListFiles), FS(std::move(FS)) {} LLVM_ABI PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); static bool isRequired() { return true; } }; diff --git a/llvm/include/llvm/Transforms/Scalar/SROA.h b/llvm/include/llvm/Transforms/Scalar/SROA.h index c03cdf48fb1c6..8bb65bf7225e0 100644 --- a/llvm/include/llvm/Transforms/Scalar/SROA.h +++ b/llvm/include/llvm/Transforms/Scalar/SROA.h @@ -1,4 +1,4 @@ -//===- SROA.h - Scalar Replacement Of Aggregates ----------------*- C++ -*-===// +//===- SROA.h - Scalar Replacement Of Aggregates ----------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h b/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h deleted file mode 100644 index 3178dc762a195..0000000000000 --- a/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h +++ /dev/null @@ -1,31 +0,0 @@ -//===------ EVLIndVarSimplify.h - Optimize vectorized loops w/ EVL IV------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass optimizes a vectorized loop with canonical IV to using EVL-based -// IV if it was tail-folded by predicated EVL. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H -#define LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H - -#include "llvm/Analysis/LoopAnalysisManager.h" -#include "llvm/IR/PassManager.h" - -namespace llvm { -class Loop; -class LPMUpdater; - -/// Turn vectorized loops with canonical induction variables into loops that -/// only use a single EVL-based induction variable. 
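// Editor's sketch, not part of this patch: DataFlowSanitizerPass now takes the
// file system used to read ABI list files, which makes it possible to back it
// with an in-memory file system in tests. makeDFSanPass is an invented helper
// name, and the caller is expected to populate FS with the listed files.
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h"
#include <string>
#include <vector>

static llvm::DataFlowSanitizerPass
makeDFSanPass(const std::vector<std::string> &ABILists) {
  auto FS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>();
  return llvm::DataFlowSanitizerPass(ABILists, std::move(FS));
}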
-struct EVLIndVarSimplifyPass : public PassInfoMixin { - PreservedAnalyses run(Loop &L, LoopAnalysisManager &LAM, - LoopStandardAnalysisResults &AR, LPMUpdater &U); -}; -} // namespace llvm -#endif diff --git a/llvm/lib/Analysis/AssumptionCache.cpp b/llvm/lib/Analysis/AssumptionCache.cpp index 980a891266e50..61b7b3fa9e2c4 100644 --- a/llvm/lib/Analysis/AssumptionCache.cpp +++ b/llvm/lib/Analysis/AssumptionCache.cpp @@ -53,6 +53,22 @@ AssumptionCache::getOrInsertAffectedValues(Value *V) { return AffectedValues[AffectedValueCallbackVH(V, this)]; } +void AssumptionCache::findValuesAffectedByOperandBundle( + OperandBundleUse Bundle, function_ref InsertAffected) { + auto AddAffectedVal = [&](Value *V) { + if (isa(V)) + InsertAffected(V); + }; + + if (Bundle.getTagName() == "separate_storage") { + assert(Bundle.Inputs.size() == 2 && "separate_storage must have two args"); + AddAffectedVal(getUnderlyingObject(Bundle.Inputs[0])); + AddAffectedVal(getUnderlyingObject(Bundle.Inputs[1])); + } else if (Bundle.Inputs.size() > ABA_WasOn && + Bundle.getTagName() != IgnoreBundleTag) + AddAffectedVal(Bundle.Inputs[ABA_WasOn]); +} + static void findAffectedValues(CallBase *CI, TargetTransformInfo *TTI, SmallVectorImpl &Affected) { @@ -69,17 +85,10 @@ findAffectedValues(CallBase *CI, TargetTransformInfo *TTI, } }; - for (unsigned Idx = 0; Idx != CI->getNumOperandBundles(); Idx++) { - OperandBundleUse Bundle = CI->getOperandBundleAt(Idx); - if (Bundle.getTagName() == "separate_storage") { - assert(Bundle.Inputs.size() == 2 && - "separate_storage must have two args"); - AddAffectedVal(getUnderlyingObject(Bundle.Inputs[0]), Idx); - AddAffectedVal(getUnderlyingObject(Bundle.Inputs[1]), Idx); - } else if (Bundle.Inputs.size() > ABA_WasOn && - Bundle.getTagName() != IgnoreBundleTag) - AddAffectedVal(Bundle.Inputs[ABA_WasOn], Idx); - } + for (unsigned Idx = 0; Idx != CI->getNumOperandBundles(); Idx++) + AssumptionCache::findValuesAffectedByOperandBundle( + CI->getOperandBundleAt(Idx), + [&](Value *V) { Affected.push_back({V, Idx}); }); Value *Cond = CI->getArgOperand(0); findValuesAffectedByCondition(Cond, /*IsAssume=*/true, InsertAffected); @@ -172,7 +181,7 @@ void AssumptionCache::scanFunction() { for (BasicBlock &B : F) for (Instruction &I : B) if (isa(&I)) - AssumeHandles.push_back({&I, ExprResultIdx}); + AssumeHandles.push_back(&I); // Mark the scan as complete. Scanned = true; @@ -188,7 +197,7 @@ void AssumptionCache::registerAssumption(AssumeInst *CI) { if (!Scanned) return; - AssumeHandles.push_back({CI, ExprResultIdx}); + AssumeHandles.push_back(CI); #ifndef NDEBUG assert(CI->getParent() && diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index a3b2e62a1b8ba..d52b073854630 100755 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -951,21 +951,21 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP, // If the base value for this address is a literal integer value, fold the // getelementptr to the resulting integer value casted to the pointer type. 
- APInt BasePtr(DL.getPointerTypeSizeInBits(Ptr->getType()), 0); + APInt BaseIntVal(DL.getPointerTypeSizeInBits(Ptr->getType()), 0); if (auto *CE = dyn_cast(Ptr)) { if (CE->getOpcode() == Instruction::IntToPtr) { if (auto *Base = dyn_cast(CE->getOperand(0))) - BasePtr = Base->getValue().zextOrTrunc(BasePtr.getBitWidth()); + BaseIntVal = Base->getValue().zextOrTrunc(BaseIntVal.getBitWidth()); } } - auto *PTy = cast(Ptr->getType()); - if ((Ptr->isNullValue() || BasePtr != 0) && - !DL.isNonIntegralPointerType(PTy)) { + if ((Ptr->isNullValue() || BaseIntVal != 0) && + !DL.mustNotIntroduceIntToPtr(Ptr->getType())) { + // If the index size is smaller than the pointer size, add to the low // bits only. - BasePtr.insertBits(BasePtr.trunc(BitWidth) + Offset, 0); - Constant *C = ConstantInt::get(Ptr->getContext(), BasePtr); + BaseIntVal.insertBits(BaseIntVal.trunc(BitWidth) + Offset, 0); + Constant *C = ConstantInt::get(Ptr->getContext(), BaseIntVal); return ConstantExpr::getIntToPtr(C, ResTy); } @@ -2625,8 +2625,17 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, case Intrinsic::nvvm_d2ull_rp: case Intrinsic::nvvm_d2ull_rz: { // In float-to-integer conversion, NaN inputs are converted to 0. - if (U.isNaN()) - return ConstantInt::get(Ty, 0); + if (U.isNaN()) { + // In float-to-integer conversion, NaN inputs are converted to 0 + // when the source and destination bitwidths are both less than 64. + if (nvvm::FPToIntegerIntrinsicNaNZero(IntrinsicID)) + return ConstantInt::get(Ty, 0); + + // Otherwise, the most significant bit is set. + unsigned BitWidth = Ty->getIntegerBitWidth(); + uint64_t Val = 1ULL << (BitWidth - 1); + return ConstantInt::get(Ty, APInt(BitWidth, Val, /*IsSigned=*/false)); + } APFloat::roundingMode RMode = nvvm::GetFPToIntegerRoundingMode(IntrinsicID); @@ -2636,13 +2645,11 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, APSInt ResInt(Ty->getIntegerBitWidth(), !IsSigned); auto FloatToRound = IsFTZ ? FTZPreserveSign(U) : U; + // Return max/min value for integers if the result is +/-inf or + // is too large to fit in the result's integer bitwidth. 
bool IsExact = false; - APFloat::opStatus Status = - FloatToRound.convertToInteger(ResInt, RMode, &IsExact); - - if (Status != APFloat::opInvalidOp) - return ConstantInt::get(Ty, ResInt); - return nullptr; + FloatToRound.convertToInteger(ResInt, RMode, &IsExact); + return ConstantInt::get(Ty, ResInt); } } diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index a90b618607ad6..07f4a8e5c889e 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -6514,10 +6514,27 @@ Value *llvm::simplifyBinaryIntrinsic(Intrinsic::ID IID, Type *ReturnType, const CallBase *Call) { unsigned BitWidth = ReturnType->getScalarSizeInBits(); switch (IID) { - case Intrinsic::get_active_lane_mask: + case Intrinsic::get_active_lane_mask: { if (match(Op1, m_Zero())) return ConstantInt::getFalse(ReturnType); + + const Function *F = Call->getFunction(); + auto *ScalableTy = dyn_cast(ReturnType); + Attribute Attr = F->getFnAttribute(Attribute::VScaleRange); + if (ScalableTy && Attr.isValid()) { + std::optional VScaleMax = Attr.getVScaleRangeMax(); + if (!VScaleMax) + break; + uint64_t MaxPossibleMaskElements = + (uint64_t)ScalableTy->getMinNumElements() * (*VScaleMax); + + const APInt *Op1Val; + if (match(Op0, m_Zero()) && match(Op1, m_APInt(Op1Val)) && + Op1Val->uge(MaxPossibleMaskElements)) + return ConstantInt::getAllOnesValue(ReturnType); + } break; + } case Intrinsic::abs: // abs(abs(x)) -> abs(x). We don't need to worry about the nsw arg here. // It is always ok to pick the earlier abs. We'll just lose nsw if its only diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index d6ad855cad9a7..87fae92977cd2 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -2090,12 +2090,14 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( return MemoryDepChecker::Dependence::Unknown; } + TypeSize AStoreSz = DL.getTypeStoreSize(ATy); + TypeSize BStoreSz = DL.getTypeStoreSize(BTy); + + // If store sizes are not the same, set TypeByteSize to zero, so we can check + // it in the caller isDependent. uint64_t ASz = DL.getTypeAllocSize(ATy); uint64_t BSz = DL.getTypeAllocSize(BTy); - - // Both the source and sink sizes are neeeded in dependence checks, depending - // on the use. - std::pair TypeByteSize(ASz, BSz); + uint64_t TypeByteSize = (AStoreSz == BStoreSz) ? BSz : 0; uint64_t StrideAScaled = std::abs(StrideAPtrInt) * ASz; uint64_t StrideBScaled = std::abs(StrideBPtrInt) * BSz; @@ -2117,23 +2119,8 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( return Dependence::Unknown; } - // When the distance is possibly zero, we're reading/writing the same memory - // location: if the store sizes are not equal, fail with an unknown - // dependence. - TypeSize AStoreSz = DL.getTypeStoreSize(ATy); - TypeSize BStoreSz = DL.getTypeStoreSize(BTy); - if (AStoreSz != BStoreSz && !SE.isKnownNonZero(Dist)) { - LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence distance with " - "different type sizes\n"); - return Dependence::Unknown; - } - - // TODO: Remove this. 
- bool HasSameSize = AStoreSz == BStoreSz; - return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride, - TypeByteSize, HasSameSize, AIsWrite, - BIsWrite); + TypeByteSize, AIsWrite, BIsWrite); } MemoryDepChecker::Dependence::DepType @@ -2165,8 +2152,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, return std::get(Res); } - auto &[Dist, MaxStride, CommonStride, TypeByteSize, HasSameSize, AIsWrite, - BIsWrite] = std::get(Res); + auto &[Dist, MaxStride, CommonStride, TypeByteSize, AIsWrite, BIsWrite] = + std::get(Res); + bool HasSameSize = TypeByteSize > 0; ScalarEvolution &SE = *PSE.getSE(); auto &DL = InnermostLoop->getHeader()->getDataLayout(); @@ -2192,8 +2180,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // If the distance between accesses and their strides are known constants, // check whether the accesses interlace each other. if (ConstDist > 0 && CommonStride && CommonStride > 1 && HasSameSize && - areStridedAccessesIndependent(ConstDist, *CommonStride, - TypeByteSize.first)) { + areStridedAccessesIndependent(ConstDist, *CommonStride, TypeByteSize)) { LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n"); return Dependence::NoDep; } @@ -2207,9 +2194,13 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // Negative distances are not plausible dependencies. if (SE.isKnownNonPositive(Dist)) { if (SE.isKnownNonNegative(Dist)) { - // Write to the same location with the same size. - assert(HasSameSize && "Accesses must have the same size"); - return Dependence::Forward; + if (HasSameSize) { + // Write to the same location with the same size. + return Dependence::Forward; + } + LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but " + "different type sizes\n"); + return Dependence::Unknown; } bool IsTrueDataDependence = (AIsWrite && !BIsWrite); @@ -2227,7 +2218,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, : Dependence::Unknown; } if (!HasSameSize || - couldPreventStoreLoadForward(ConstDist, TypeByteSize.first)) { + couldPreventStoreLoadForward(ConstDist, TypeByteSize)) { LLVM_DEBUG( dbgs() << "LAA: Forward but may prevent st->ld forwarding\n"); return Dependence::ForwardButPreventsForwarding; @@ -2293,8 +2284,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // We know that Dist is positive, but it may not be constant. Use the signed // minimum for computations below, as this ensures we compute the closest // possible dependence distance. 
- uint64_t MinDistanceNeeded = - MaxStride * (MinNumIter - 1) + TypeByteSize.first; + uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize; if (MinDistanceNeeded > static_cast(MinDistance)) { if (!ConstDist) { // For non-constant distances, we checked the lower bound of the @@ -2322,15 +2312,14 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, bool IsTrueDataDependence = (!AIsWrite && BIsWrite); if (IsTrueDataDependence && EnableForwardingConflictDetection && ConstDist && - couldPreventStoreLoadForward(MinDistance, TypeByteSize.first, - *CommonStride)) + couldPreventStoreLoadForward(MinDistance, TypeByteSize, *CommonStride)) return Dependence::BackwardVectorizableButPreventsForwarding; uint64_t MaxVF = MinDepDistBytes / MaxStride; LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance << " with max VF = " << MaxVF << '\n'); - uint64_t MaxVFInBits = MaxVF * TypeByteSize.first * 8; + uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8; if (!ConstDist && MaxVFInBits < MaxTargetVectorWidthInBits) { // For non-constant distances, we checked the lower bound of the dependence // distance and the distance may be larger at runtime (and safe for diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp index 67c2cfadb6533..9a022d9ed09ce 100644 --- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -80,6 +80,10 @@ static cl::opt cl::desc("The number of blocks to scan during memory " "dependency analysis (default = 200)")); +static cl::opt CacheGlobalLimit( + "memdep-cache-global-limit", cl::Hidden, cl::init(10000), + cl::desc("The max number of entries allowed in a cache (default = 10000)")); + // Limit on the number of memdep results to process. static const unsigned int NumResultsLimit = 100; @@ -1142,6 +1146,10 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB( return true; } + // If the size of this cache has surpassed the global limit, stop here. + if (Cache->size() > CacheGlobalLimit) + return false; + // Otherwise, either this is a new block, a block with an invalid cache // pointer or one that we're about to invalidate by putting more info into // it than its valid cache info. 
If empty and not explicitly indicated as diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 09b50c5270e57..bf62623099a97 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/TargetLibraryInfo.h" diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 3d5bd6155536e..f6937d38eb38c 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -685,6 +685,9 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(graalcc); KEYWORD(riscv_vector_cc); KEYWORD(riscv_vls_cc); + KEYWORD(cheriot_compartmentcallcc); + KEYWORD(cheriot_compartmentcalleecc); + KEYWORD(cheriot_librarycallcc); KEYWORD(cc); KEYWORD(c); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 8739b24d4b74b..897e679095906 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -2315,6 +2315,15 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) { #undef CC_VLS_CASE } return false; + case lltok::kw_cheriot_compartmentcallcc: + CC = CallingConv::CHERIoT_CompartmentCall; + break; + case lltok::kw_cheriot_compartmentcalleecc: + CC = CallingConv::CHERIoT_CompartmentCallee; + break; + case lltok::kw_cheriot_librarycallcc: + CC = CallingConv::CHERIoT_LibraryCall; + break; case lltok::kw_cc: { Lex.Lex(); return parseUInt32(CC); diff --git a/llvm/lib/BinaryFormat/DXContainer.cpp b/llvm/lib/BinaryFormat/DXContainer.cpp index 36d10d0b63078..c06a3e34653f0 100644 --- a/llvm/lib/BinaryFormat/DXContainer.cpp +++ b/llvm/lib/BinaryFormat/DXContainer.cpp @@ -89,6 +89,15 @@ ArrayRef> dxbc::getDescriptorRangeFlags() { return ArrayRef(DescriptorRangeFlagNames); } +static const EnumEntry StaticSamplerFlagNames[] = { +#define STATIC_SAMPLER_FLAG(Val, Enum, Flag) {#Enum, StaticSamplerFlags::Enum}, +#include "llvm/BinaryFormat/DXContainerConstants.def" +}; + +ArrayRef> dxbc::getStaticSamplerFlags() { + return ArrayRef(StaticSamplerFlagNames); +} + #define SHADER_VISIBILITY(Val, Enum) {#Enum, ShaderVisibility::Enum}, static const EnumEntry ShaderVisibilityValues[] = { diff --git a/llvm/lib/BinaryFormat/ELF.cpp b/llvm/lib/BinaryFormat/ELF.cpp index 0ad1a09429e7c..4b430bc287ff9 100644 --- a/llvm/lib/BinaryFormat/ELF.cpp +++ b/llvm/lib/BinaryFormat/ELF.cpp @@ -175,7 +175,7 @@ uint16_t ELF::convertArchNameToEMachine(StringRef Arch) { .Case("ba2", EM_BA2) .Case("xcore", EM_XCORE) .Case("mchp_pic", EM_MCHP_PIC) - .Case("intel205", EM_INTEL205) + .Case("intelgt", EM_INTELGT) .Case("intel206", EM_INTEL206) .Case("intel207", EM_INTEL207) .Case("intel208", EM_INTEL208) @@ -584,8 +584,8 @@ StringRef ELF::convertEMachineToArchName(uint16_t EMachine) { return "xcore"; case EM_MCHP_PIC: return "mchp_pic"; - case EM_INTEL205: - return "intel205"; + case EM_INTELGT: + return "intelgt"; case EM_INTEL206: return "intel206"; case EM_INTEL207: diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 22a0d0ffdbaab..832aa9ff7ed3d 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -7024,7 +7024,7 @@ Error BitcodeReader::materialize(GlobalValue *GV) { if (!MDLoader->isStrippingTBAA()) { for (auto &I 
: instructions(F)) { MDNode *TBAA = I.getMetadata(LLVMContext::MD_tbaa); - if (!TBAA || TBAAVerifyHelper.visitTBAAMetadata(I, TBAA)) + if (!TBAA || TBAAVerifyHelper.visitTBAAMetadata(&I, TBAA)) continue; MDLoader->setStripTBAA(true); stripTBAA(F->getParent()); diff --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp index fc59be8df525a..3fd8cfe1a8762 100644 --- a/llvm/lib/CGData/CodeGenDataReader.cpp +++ b/llvm/lib/CGData/CodeGenDataReader.cpp @@ -169,8 +169,8 @@ bool IndexedCodeGenDataReader::hasFormat(const MemoryBuffer &DataBuffer) { if (DataBuffer.getBufferSize() < sizeof(IndexedCGData::Magic)) return false; - uint64_t Magic = endian::read( - DataBuffer.getBufferStart()); + uint64_t Magic = endian::read(DataBuffer.getBufferStart(), + llvm::endianness::little); // Verify that it's magical. return Magic == IndexedCGData::Magic; } diff --git a/llvm/lib/CGData/CodeGenDataWriter.cpp b/llvm/lib/CGData/CodeGenDataWriter.cpp index 14a8558ba63b7..a2bbceebd0317 100644 --- a/llvm/lib/CGData/CodeGenDataWriter.cpp +++ b/llvm/lib/CGData/CodeGenDataWriter.cpp @@ -40,7 +40,7 @@ void CGDataOStream::patch(ArrayRef P) { for (const auto &K : P) { for (size_t I = 0; I < K.D.size(); ++I) { uint64_t Bytes = - endian::byte_swap(K.D[I]); + endian::byte_swap(K.D[I], llvm::endianness::little); Data.replace(K.Pos + I * sizeof(uint64_t), sizeof(uint64_t), reinterpret_cast(&Bytes), sizeof(uint64_t)); } @@ -52,7 +52,7 @@ void CGDataOStream::patch(ArrayRef P) { for (const auto &K : P) { for (size_t I = 0; I < K.D.size(); ++I) { uint64_t Bytes = - endian::byte_swap(K.D[I]); + endian::byte_swap(K.D[I], llvm::endianness::little); VOStream.pwrite(reinterpret_cast(&Bytes), sizeof(uint64_t), K.Pos + I * sizeof(uint64_t)); } diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index cd14a4f57f760..701a6a2f0f7a0 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -78,6 +78,8 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMRemarkStreamer.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" @@ -1672,7 +1674,7 @@ static ConstantInt *extractNumericCGTypeId(const Function &F) { /// Emits .callgraph section. void AsmPrinter::emitCallGraphSection(const MachineFunction &MF, - FunctionInfo &FuncInfo) { + FunctionCallGraphInfo &FuncCGInfo) { if (!MF.getTarget().Options.EmitCallGraphSection) return; @@ -1711,27 +1713,34 @@ void AsmPrinter::emitCallGraphSection(const MachineFunction &MF, // Emit function kind, and type id if available. if (!IsIndirectTarget) { OutStreamer->emitInt64( - static_cast(FunctionInfo::FunctionKind::NOT_INDIRECT_TARGET)); + static_cast(FunctionKind::NOT_INDIRECT_TARGET)); } else { if (const auto *TypeId = extractNumericCGTypeId(F)) { - OutStreamer->emitInt64(static_cast( - FunctionInfo::FunctionKind::INDIRECT_TARGET_KNOWN_TID)); + OutStreamer->emitInt64( + static_cast(FunctionKind::INDIRECT_TARGET_KNOWN_TID)); OutStreamer->emitInt64(TypeId->getZExtValue()); } else { - OutStreamer->emitInt64(static_cast( - FunctionInfo::FunctionKind::INDIRECT_TARGET_UNKNOWN_TID)); + OutStreamer->emitInt64( + static_cast(FunctionKind::INDIRECT_TARGET_UNKNOWN_TID)); } } // Emit callsite labels, where each element is a pair of type id and // indirect callsite pc. 
- const auto &CallSiteLabels = FuncInfo.CallSiteLabels; + const auto &CallSiteLabels = FuncCGInfo.CallSiteLabels; OutStreamer->emitInt64(CallSiteLabels.size()); for (const auto &[TypeId, Label] : CallSiteLabels) { OutStreamer->emitInt64(TypeId); OutStreamer->emitSymbolValue(Label, TM.getProgramPointerSize()); } - FuncInfo.CallSiteLabels.clear(); + FuncCGInfo.CallSiteLabels.clear(); + + const auto &DirectCallees = FuncCGInfo.DirectCallees; + OutStreamer->emitInt64(DirectCallees.size()); + for (const auto &CalleeSymbol : DirectCallees) { + OutStreamer->emitSymbolValue(CalleeSymbol, TM.getProgramPointerSize()); + } + FuncCGInfo.DirectCallees.clear(); OutStreamer->popSection(); } @@ -1866,20 +1875,40 @@ static StringRef getMIMnemonic(const MachineInstr &MI, MCStreamer &Streamer) { return Name; } -void AsmPrinter::emitIndirectCalleeLabels( - FunctionInfo &FuncInfo, +void AsmPrinter::handleCallsiteForCallgraph( + FunctionCallGraphInfo &FuncCGInfo, const MachineFunction::CallSiteInfoMap &CallSitesInfoMap, const MachineInstr &MI) { - // Only indirect calls have type identifiers set. + assert(MI.isCall() && + "Callsite labels are meant for call instructions only."); + const MachineOperand &CalleeOperand = MI.getOperand(0); + if (CalleeOperand.isGlobal() || CalleeOperand.isSymbol()) { + // Handle direct calls. + MCSymbol *CalleeSymbol = nullptr; + switch (CalleeOperand.getType()) { + case llvm::MachineOperand::MO_GlobalAddress: + CalleeSymbol = getSymbol(CalleeOperand.getGlobal()); + break; + case llvm::MachineOperand::MO_ExternalSymbol: + CalleeSymbol = GetExternalSymbolSymbol(CalleeOperand.getSymbolName()); + break; + default: + llvm_unreachable( + "Expected to only handle direct call instructions here."); + } + FuncCGInfo.DirectCallees.insert(CalleeSymbol); + return; // Early exit after handling the direct call instruction. + } const auto &CallSiteInfo = CallSitesInfoMap.find(&MI); if (CallSiteInfo == CallSitesInfoMap.end()) return; - + // Handle indirect callsite info. + // Only indirect calls have type identifiers set. for (ConstantInt *CalleeTypeId : CallSiteInfo->second.CalleeTypeIds) { MCSymbol *S = MF->getContext().createTempSymbol(); OutStreamer->emitLabel(S); uint64_t CalleeTypeIdVal = CalleeTypeId->getZExtValue(); - FuncInfo.CallSiteLabels.emplace_back(CalleeTypeIdVal, S); + FuncCGInfo.CallSiteLabels.emplace_back(CalleeTypeIdVal, S); } } @@ -1929,7 +1958,7 @@ void AsmPrinter::emitFunctionBody() { MBBSectionRanges[MF->front().getSectionID()] = MBBSectionRange{CurrentFnBegin, nullptr}; - FunctionInfo FuncInfo; + FunctionCallGraphInfo FuncCGInfo; const auto &CallSitesInfoMap = MF->getCallSitesInfo(); for (auto &MBB : *MF) { // Print a label for the basic block. @@ -2066,7 +2095,7 @@ void AsmPrinter::emitFunctionBody() { OutStreamer->emitLabel(createCallsiteEndSymbol(MBB)); if (TM.Options.EmitCallGraphSection && MI.isCall()) - emitIndirectCalleeLabels(FuncInfo, CallSitesInfoMap, MI); + handleCallsiteForCallgraph(FuncCGInfo, CallSitesInfoMap, MI); // If there is a post-instruction symbol, emit a label for it here. if (MCSymbol *S = MI.getPostInstrSymbol()) @@ -2248,7 +2277,7 @@ void AsmPrinter::emitFunctionBody() { emitStackSizeSection(*MF); // Emit section containing call graph metadata. - emitCallGraphSection(*MF, FuncInfo); + emitCallGraphSection(*MF, FuncCGInfo); // Emit .su file containing function stack size information. 
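[Note] The .callgraph emission above writes, per function, the function kind, the (type id, label) pairs for indirect call sites, and now also the direct callee symbols, with each list preceded by its element count. A simplified, self-contained sketch of that record layout (illustrative only; the struct and emit helper are hypothetical, not the emitter's actual API):

    #include <cstdint>
    #include <string>
    #include <utility>
    #include <vector>

    // Per-function payload mirroring the section layout in the patch:
    // kind | N | N x (type id, label) | M | M x direct callee symbol.
    struct CallGraphRecord {
      uint64_t FunctionKind = 0;
      std::vector<std::pair<uint64_t, std::string>> IndirectCallSites;
      std::vector<std::string> DirectCallees;
    };

    void emit(const CallGraphRecord &R, std::vector<std::string> &Out) {
      Out.push_back("kind " + std::to_string(R.FunctionKind));
      Out.push_back("indirect-count " + std::to_string(R.IndirectCallSites.size()));
      for (const auto &[TypeId, Label] : R.IndirectCallSites)
        Out.push_back(std::to_string(TypeId) + " " + Label);
      Out.push_back("direct-count " + std::to_string(R.DirectCallees.size()));
      for (const std::string &Callee : R.DirectCallees)
        Out.push_back(Callee);
    }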
emitStackUsage(*MF); @@ -2508,6 +2537,8 @@ void AsmPrinter::emitGlobalIFunc(Module &M, const GlobalIFunc &GI) { void AsmPrinter::emitRemarksSection(remarks::RemarkStreamer &RS) { if (!RS.needsSection()) return; + if (!RS.getFilename()) + return; MCSection *RemarksSection = OutContext.getObjectFileInfo()->getRemarksSection(); @@ -2518,20 +2549,16 @@ void AsmPrinter::emitRemarksSection(remarks::RemarkStreamer &RS) { return; } - remarks::RemarkSerializer &RemarkSerializer = RS.getSerializer(); - - std::optional> Filename; - if (std::optional FilenameRef = RS.getFilename()) { - Filename = *FilenameRef; - sys::fs::make_absolute(*Filename); - assert(!Filename->empty() && "The filename can't be empty."); - } + SmallString<128> Filename = *RS.getFilename(); + sys::fs::make_absolute(Filename); + assert(!Filename.empty() && "The filename can't be empty."); std::string Buf; raw_string_ostream OS(Buf); + + remarks::RemarkSerializer &RemarkSerializer = RS.getSerializer(); std::unique_ptr MetaSerializer = - Filename ? RemarkSerializer.metaSerializer(OS, Filename->str()) - : RemarkSerializer.metaSerializer(OS); + RemarkSerializer.metaSerializer(OS, Filename); MetaSerializer->emit(); // Switch to the remarks section. diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 7ce014e9fac9a..518121e200190 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -1836,8 +1836,12 @@ DIE *DwarfCompileUnit::getOrCreateSubprogramDIE(const DISubprogram *SP, if (!F && SP->isDefinition()) { F = DD->getLexicalScopes().getFunction(SP); - if (!F) - return &getCU().getOrCreateAbstractSubprogramDIE(SP); + if (!F) { + // SP may belong to another CU. Determine the CU similarly + // to DwarfDebug::constructAbstractSubprogramScopeDIE. + return &DD->getOrCreateAbstractSubprogramCU(SP, *this) + .getOrCreateAbstractSubprogramDIE(SP); + } } return DwarfUnit::getOrCreateSubprogramDIE(SP, F, Minimal); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 8efc6f124a55d..09d5f9c57a1a7 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -548,6 +548,16 @@ bool DwarfDebug::shareAcrossDWOCUs() const { return SplitDwarfCrossCuReferences; } +DwarfCompileUnit & +DwarfDebug::getOrCreateAbstractSubprogramCU(const DISubprogram *SP, + DwarfCompileUnit &SrcCU) { + auto &CU = getOrCreateDwarfCompileUnit(SP->getUnit()); + if (CU.getSkeleton()) + return shareAcrossDWOCUs() ? CU : SrcCU; + + return CU; +} + void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, LexicalScope *Scope) { assert(Scope && Scope->getScopeNode()); @@ -559,14 +569,11 @@ void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram // was inlined from another compile unit. auto &CU = getOrCreateDwarfCompileUnit(SP->getUnit()); - if (auto *SkelCU = CU.getSkeleton()) { - (shareAcrossDWOCUs() ? 
CU : SrcCU) - .constructAbstractSubprogramScopeDIE(Scope); + auto &TargetCU = getOrCreateAbstractSubprogramCU(SP, SrcCU); + TargetCU.constructAbstractSubprogramScopeDIE(Scope); + if (auto *SkelCU = CU.getSkeleton()) if (CU.getCUNode()->getSplitDebugInlining()) SkelCU->constructAbstractSubprogramScopeDIE(Scope); - } else { - CU.constructAbstractSubprogramScopeDIE(Scope); - } } /// Represents a parameter whose call site value can be described by applying a diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index 89813dcf0fdab..1a1b28a6fc035 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -906,6 +906,10 @@ class DwarfDebug : public DebugHandlerBase { return CUDieMap.lookup(Die); } + /// Find the matching DwarfCompileUnit for the given SP referenced from SrcCU. + DwarfCompileUnit &getOrCreateAbstractSubprogramCU(const DISubprogram *SP, + DwarfCompileUnit &SrcCU); + unsigned getStringTypeLoc(const DIStringType *ST) const { return StringTypeLocMap.lookup(ST); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index 8a30714db2fdf..1703b27d350f3 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -154,7 +154,7 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI, unsigned Size = TRI.getSubRegIdxSize(Idx); unsigned Offset = TRI.getSubRegIdxOffset(Idx); Reg = TRI.getDwarfRegNum(SR, false); - if (Reg < 0) + if (Reg < 0 || Offset + Size > RegSize) continue; // Used to build the intersection between the bits we already diff --git a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp index 7baeb3fd7bcee..fbcd614b85d18 100644 --- a/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp +++ b/llvm/lib/CodeGen/BasicBlockSectionsProfileReader.cpp @@ -76,6 +76,21 @@ BasicBlockSectionsProfileReader::getClonePathsForFunction( return ProgramPathAndClusterInfo.lookup(getAliasName(FuncName)).ClonePaths; } +uint64_t BasicBlockSectionsProfileReader::getEdgeCount( + StringRef FuncName, const UniqueBBID &SrcBBID, + const UniqueBBID &SinkBBID) const { + auto It = ProgramPathAndClusterInfo.find(getAliasName(FuncName)); + if (It == ProgramPathAndClusterInfo.end()) + return 0; + auto NodeIt = It->second.EdgeCounts.find(SrcBBID); + if (NodeIt == It->second.EdgeCounts.end()) + return 0; + auto EdgeIt = NodeIt->second.find(SinkBBID); + if (EdgeIt == NodeIt->second.end()) + return 0; + return EdgeIt->second; +} + // Reads the version 1 basic block sections profile. Profile for each function // is encoded as follows: // m @@ -240,6 +255,38 @@ Error BasicBlockSectionsProfileReader::ReadV1Profile() { } continue; } + case 'g': { // CFG profile specifier. + // Skip the profile when we the profile iterator (FI) refers to the + // past-the-end element. + if (FI == ProgramPathAndClusterInfo.end()) + continue; + // For each node, its CFG profile is encoded as + // :,:,:,... 
+ for (auto BasicBlockEdgeProfile : Values) { + if (BasicBlockEdgeProfile.empty()) + continue; + SmallVector NodeEdgeCounts; + BasicBlockEdgeProfile.split(NodeEdgeCounts, ','); + UniqueBBID SrcBBID; + for (size_t i = 0; i < NodeEdgeCounts.size(); ++i) { + auto [BBIDStr, CountStr] = NodeEdgeCounts[i].split(':'); + auto BBID = parseUniqueBBID(BBIDStr); + if (!BBID) + return BBID.takeError(); + unsigned long long Count = 0; + if (getAsUnsignedInteger(CountStr, 10, Count)) + return createProfileParseError( + Twine("unsigned integer expected: '") + CountStr + "'"); + if (i == 0) { + // The first element represents the source and its total count. + FI->second.NodeCounts[SrcBBID = *BBID] = Count; + continue; + } + FI->second.EdgeCounts[SrcBBID][*BBID] = Count; + } + } + continue; + } default: return createProfileParseError(Twine("invalid specifier: '") + Twine(Specifier) + "'"); @@ -440,6 +487,12 @@ BasicBlockSectionsProfileReaderWrapperPass::getClonePathsForFunction( return BBSPR.getClonePathsForFunction(FuncName); } +uint64_t BasicBlockSectionsProfileReaderWrapperPass::getEdgeCount( + StringRef FuncName, const UniqueBBID &SrcBBID, + const UniqueBBID &SinkBBID) const { + return BBSPR.getEdgeCount(FuncName, SrcBBID, SinkBBID); +} + BasicBlockSectionsProfileReader & BasicBlockSectionsProfileReaderWrapperPass::getBBSPR() { return BBSPR; diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp index b16694eafd90e..a77da01761579 100644 --- a/llvm/lib/CodeGen/CalcSpillWeights.cpp +++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp @@ -81,12 +81,15 @@ Register VirtRegAuxInfo::copyHint(const MachineInstr *MI, Register Reg, bool VirtRegAuxInfo::isRematerializable(const LiveInterval &LI, const LiveIntervals &LIS, const VirtRegMap &VRM, + const MachineRegisterInfo &MRI, const TargetInstrInfo &TII) { Register Reg = LI.reg(); Register Original = VRM.getOriginal(Reg); + SmallDenseMap VNIDefs; for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); I != E; ++I) { const VNInfo *VNI = *I; + const VNInfo *OrigVNI = VNI; if (VNI->isUnused()) continue; if (VNI->isPHIDef()) @@ -122,8 +125,77 @@ bool VirtRegAuxInfo::isRematerializable(const LiveInterval &LI, assert(MI && "Dead valno in interval"); } - if (!TII.isTriviallyReMaterializable(*MI)) + if (!TII.isReMaterializable(*MI)) return false; + + VNIDefs[OrigVNI->id] = MI; + } + + // If MI has register uses, it will only be rematerializable if its uses are + // also live at the indices it will be rematerialized at. + for (MachineOperand &MO : MRI.reg_nodbg_operands(LI.reg())) { + if (!MO.readsReg()) + continue; + SlotIndex UseIdx = LIS.getInstructionIndex(*MO.getParent()); + MachineInstr *Def = VNIDefs[LI.getVNInfoAt(UseIdx)->id]; + assert(Def && "Use with no def"); + if (!allUsesAvailableAt(Def, UseIdx, LIS, MRI, TII)) + return false; + } + + return true; +} + +bool VirtRegAuxInfo::allUsesAvailableAt(const MachineInstr *MI, + SlotIndex UseIdx, + const LiveIntervals &LIS, + const MachineRegisterInfo &MRI, + const TargetInstrInfo &TII) { + SlotIndex OrigIdx = LIS.getInstructionIndex(*MI).getRegSlot(true); + UseIdx = std::max(UseIdx, UseIdx.getRegSlot(true)); + for (const MachineOperand &MO : MI->operands()) { + if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) + continue; + + // We can't remat physreg uses, unless it is a constant or target wants + // to ignore this use. 
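[Note] The new 'g' specifier parsed above encodes, per source block, a comma-separated list of "bbid:count" fields, where the first field carries the source block and its total count and the remaining fields are its outgoing edges. The sketch below parses one such entry over plain std::string under that assumption; it is a hypothetical helper, not the LLVM reader, and real BB ids may additionally carry clone suffixes handled by parseUniqueBBID:

    #include <cstdint>
    #include <map>
    #include <sstream>
    #include <string>

    // Parses one node entry such as "4:100,5:60,6:40": block 4 executes 100 times,
    // and edges 4->5 and 4->6 have counts 60 and 40.
    bool parseNodeEdgeCounts(const std::string &Entry, uint64_t &SrcBB,
                             uint64_t &SrcCount,
                             std::map<uint64_t, uint64_t> &EdgeCounts) {
      std::stringstream SS(Entry);
      std::string Field;
      bool First = true;
      while (std::getline(SS, Field, ',')) {
        size_t Colon = Field.find(':');
        if (Colon == std::string::npos)
          return false;
        uint64_t BB = std::stoull(Field.substr(0, Colon));
        uint64_t Count = std::stoull(Field.substr(Colon + 1));
        if (First) {
          SrcBB = BB;       // First field: source block and its total count.
          SrcCount = Count;
          First = false;
        } else {
          EdgeCounts[BB] = Count; // Remaining fields: per-edge counts.
        }
      }
      return !First;
    }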
+ if (MO.getReg().isPhysical()) { + if (MRI.isConstantPhysReg(MO.getReg()) || TII.isIgnorableUse(MO)) + continue; + return false; + } + + const LiveInterval &li = LIS.getInterval(MO.getReg()); + const VNInfo *OVNI = li.getVNInfoAt(OrigIdx); + if (!OVNI) + continue; + + // Don't allow rematerialization immediately after the original def. + // It would be incorrect if OrigMI redefines the register. + // See PR14098. + if (SlotIndex::isSameInstr(OrigIdx, UseIdx)) + return false; + + if (OVNI != li.getVNInfoAt(UseIdx)) + return false; + + // Check that subrange is live at UseIdx. + if (li.hasSubRanges()) { + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + unsigned SubReg = MO.getSubReg(); + LaneBitmask LM = SubReg ? TRI->getSubRegIndexLaneMask(SubReg) + : MRI.getMaxLaneMaskForVReg(MO.getReg()); + for (const LiveInterval::SubRange &SR : li.subranges()) { + if ((SR.LaneMask & LM).none()) + continue; + if (!SR.liveAt(UseIdx)) + return false; + // Early exit if all used lanes are checked. No need to continue. + LM &= ~SR.LaneMask; + if (LM.none()) + break; + } + } } return true; } @@ -339,7 +411,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start, // it is a preferred candidate for spilling. // FIXME: this gets much more complicated once we support non-trivial // re-materialization. - if (isRematerializable(LI, LIS, VRM, *MF.getSubtarget().getInstrInfo())) + if (isRematerializable(LI, LIS, VRM, MRI, *MF.getSubtarget().getInstrInfo())) TotalWeight *= 0.5F; // Finally, we scale the weight by the scale factor of register class. diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index d290f202f3cca..eb73d01b3558c 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -1749,6 +1749,12 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp, Sub->hasNUsesOrMore(1))) return false; + // We don't want to move around uses of condition values this late, so we + // check if it is legal to create the call to the intrinsic in the basic + // block containing the icmp. 
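[Note] The CodeGenPrepare guard that follows only forms the intrinsic when the subtract can legally live in the compare's block. What the combined usub.with.overflow operation computes is simply an unsigned subtract together with its borrow flag, which is what the separate sub and icmp were computing. A tiny scalar illustration using the GCC/Clang builtin (a sketch, not the transform itself):

    #include <cstdint>
    #include <cstdio>

    // Rough scalar equivalent of usub.with.overflow: one operation yields both
    // the wrapped difference and the "did it borrow" flag.
    int main() {
      uint32_t A = 3, B = 7, Diff = 0;
      bool Overflow = __builtin_sub_overflow(A, B, &Diff); // true, since 3 < 7
      std::printf("diff=%u overflow=%d\n", Diff, Overflow);
      return 0;
    }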
+ if (Sub->getParent() != Cmp->getParent() && !Sub->hasOneUse()) + return false; + if (!replaceMathCmpWithIntrinsic(Sub, Sub->getOperand(0), Sub->getOperand(1), Cmp, Intrinsic::usub_with_overflow)) return false; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 12b735e053bde..56e13f075aaac 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2607,6 +2607,9 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, case Intrinsic::get_rounding: MIRBuilder.buildGetRounding(getOrCreateVReg(CI)); return true; + case Intrinsic::set_rounding: + MIRBuilder.buildSetRounding(getOrCreateVReg(*CI.getOperand(0))); + return true; case Intrinsic::vscale: { MIRBuilder.buildVScale(getOrCreateVReg(CI), 1); return true; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index f3e036ed1b947..03dfa6f3f243f 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -2935,6 +2935,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT); widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT); widenScalarDst(MI, WideTy); + MIRBuilder.setInsertPt(MIRBuilder.getMBB(), --MIRBuilder.getInsertPt()); widenScalarDst(MI, WideTy, 1); Observer.changedInstr(MI); return Legalized; @@ -2972,6 +2973,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT); widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT); widenScalarDst(MI, WideTy); + MIRBuilder.setInsertPt(MIRBuilder.getMBB(), --MIRBuilder.getInsertPt()); widenScalarDst(MI, WideTy, 1); Observer.changedInstr(MI); return Legalized; @@ -8598,7 +8600,8 @@ LegalizerHelper::lowerThreewayCompare(MachineInstr &MI) { auto &Ctx = MIRBuilder.getMF().getFunction().getContext(); auto BC = TLI.getBooleanContents(DstTy.isVector(), /*isFP=*/false); - if (TLI.shouldExpandCmpUsingSelects(getApproximateEVTForLLT(SrcTy, Ctx)) || + if (TLI.preferSelectsOverBooleanArithmetic( + getApproximateEVTForLLT(SrcTy, Ctx)) || BC == TargetLowering::UndefinedBooleanContent) { auto One = MIRBuilder.buildConstant(DstTy, 1); auto SelectZeroOrOne = MIRBuilder.buildSelect(DstTy, IsGT, One, Zero); diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index 98c56f739ad4e..0c2b74c907d2a 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -642,8 +642,12 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) { SmallVector, 8> Ops; VirtRegInfo RI = AnalyzeVirtRegInBundle(MI, VirtReg.reg(), &Ops); - if (!RI.Reads) + // Defs without reads will be deleted if unused after remat is + // completed for other users of the virtual register. + if (!RI.Reads) { + LLVM_DEBUG(dbgs() << "\tskipping remat of def " << MI); return false; + } SlotIndex UseIdx = LIS.getInstructionIndex(MI).getRegSlot(true); VNInfo *ParentVNI = VirtReg.getVNInfoAt(UseIdx.getBaseIndex()); @@ -657,8 +661,13 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) { return true; } - if (SnippetCopies.count(&MI)) + // Snippets copies are ignored for remat, and will be deleted if they + // don't feed a live user after rematerialization completes. 
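[Note] The rematerialization checks being consolidated in this patch (VirtRegAuxInfo::allUsesAvailableAt, used by both the spill-weight and live-range-edit paths) all answer the same question: every register the defining instruction reads must still hold the same value at the point where it would be rematerialized. A stripped-down version of that check over a hypothetical "value at index" oracle; the real code consults LiveIntervals value numbers, physreg constants, and subranges:

    #include <cstdint>
    #include <functional>
    #include <vector>

    using SlotIdx = uint64_t;
    using ValueNum = int64_t; // -1 == no live value at that index

    // All registers read by the def must see the same value numbers at UseIdx as
    // at the original definition point; otherwise rematerializing there would
    // read stale or redefined inputs.
    bool allUsesAvailableAt(const std::vector<unsigned> &UsedRegs, SlotIdx DefIdx,
                            SlotIdx UseIdx,
                            const std::function<ValueNum(unsigned, SlotIdx)> &ValueAt) {
      if (DefIdx == UseIdx)
        return false; // The def itself might clobber an input (cf. PR14098).
      for (unsigned Reg : UsedRegs) {
        ValueNum AtDef = ValueAt(Reg, DefIdx);
        if (AtDef < 0)
          continue; // Not tracked; nothing to compare.
        if (AtDef != ValueAt(Reg, UseIdx))
          return false;
      }
      return true;
    }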
+ if (SnippetCopies.count(&MI)) { + LLVM_DEBUG(dbgs() << "\tskipping remat snippet copy for " << UseIdx << '\t' + << MI); return false; + } LiveInterval &OrigLI = LIS.getInterval(Original); VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx); diff --git a/llvm/lib/CodeGen/LiveInterval.cpp b/llvm/lib/CodeGen/LiveInterval.cpp index b682998c329bc..299db85233c2d 100644 --- a/llvm/lib/CodeGen/LiveInterval.cpp +++ b/llvm/lib/CodeGen/LiveInterval.cpp @@ -996,6 +996,17 @@ LLVM_DUMP_METHOD void LiveRange::Segment::dump() const { } #endif +void VNInfo::print(raw_ostream &OS) const { + OS << id << '@'; + if (isUnused()) { + OS << 'x'; + } else { + OS << def; + if (isPHIDef()) + OS << "-phi"; + } +} + void LiveRange::print(raw_ostream &OS) const { if (empty()) OS << "EMPTY"; @@ -1013,15 +1024,10 @@ void LiveRange::print(raw_ostream &OS) const { for (const_vni_iterator i = vni_begin(), e = vni_end(); i != e; ++i, ++vnum) { const VNInfo *vni = *i; - if (vnum) OS << ' '; - OS << vnum << '@'; - if (vni->isUnused()) { - OS << 'x'; - } else { - OS << vni->def; - if (vni->isPHIDef()) - OS << "-phi"; - } + if (vnum) + OS << ' '; + OS << *vni; + assert(vnum == vni->id && "Bad VNInfo"); } } } @@ -1041,9 +1047,9 @@ void LiveInterval::print(raw_ostream &OS) const { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -LLVM_DUMP_METHOD void LiveRange::dump() const { - dbgs() << *this << '\n'; -} +LLVM_DUMP_METHOD void VNInfo::dump() const { dbgs() << *this << '\n'; } + +LLVM_DUMP_METHOD void LiveRange::dump() const { dbgs() << *this << '\n'; } LLVM_DUMP_METHOD void LiveInterval::SubRange::dump() const { dbgs() << *this << '\n'; diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp index 33e980a5993d3..59bc82dc267b5 100644 --- a/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -80,7 +80,7 @@ void LiveRangeEdit::scanRemattable() { MachineInstr *DefMI = LIS.getInstructionFromIndex(OrigVNI->def); if (!DefMI) continue; - if (TII.isTriviallyReMaterializable(*DefMI)) + if (TII.isReMaterializable(*DefMI)) Remattable.insert(OrigVNI); } ScannedRemattable = true; @@ -92,60 +92,6 @@ bool LiveRangeEdit::anyRematerializable() { return !Remattable.empty(); } -/// allUsesAvailableAt - Return true if all registers used by OrigMI at -/// OrigIdx are also available with the same value at UseIdx. -bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI, - SlotIndex OrigIdx, - SlotIndex UseIdx) const { - OrigIdx = OrigIdx.getRegSlot(true); - UseIdx = std::max(UseIdx, UseIdx.getRegSlot(true)); - for (const MachineOperand &MO : OrigMI->operands()) { - if (!MO.isReg() || !MO.getReg() || !MO.readsReg()) - continue; - - // We can't remat physreg uses, unless it is a constant or target wants - // to ignore this use. - if (MO.getReg().isPhysical()) { - if (MRI.isConstantPhysReg(MO.getReg()) || TII.isIgnorableUse(MO)) - continue; - return false; - } - - LiveInterval &li = LIS.getInterval(MO.getReg()); - const VNInfo *OVNI = li.getVNInfoAt(OrigIdx); - if (!OVNI) - continue; - - // Don't allow rematerialization immediately after the original def. - // It would be incorrect if OrigMI redefines the register. - // See PR14098. - if (SlotIndex::isSameInstr(OrigIdx, UseIdx)) - return false; - - if (OVNI != li.getVNInfoAt(UseIdx)) - return false; - - // Check that subrange is live at UseIdx. - if (li.hasSubRanges()) { - const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); - unsigned SubReg = MO.getSubReg(); - LaneBitmask LM = SubReg ? 
TRI->getSubRegIndexLaneMask(SubReg) - : MRI.getMaxLaneMaskForVReg(MO.getReg()); - for (LiveInterval::SubRange &SR : li.subranges()) { - if ((SR.LaneMask & LM).none()) - continue; - if (!SR.liveAt(UseIdx)) - return false; - // Early exit if all used lanes are checked. No need to continue. - LM &= ~SR.LaneMask; - if (LM.none()) - break; - } - } - } - return true; -} - bool LiveRangeEdit::canRematerializeAt(Remat &RM, VNInfo *OrigVNI, SlotIndex UseIdx) { assert(ScannedRemattable && "Call anyRematerializable first"); @@ -155,12 +101,10 @@ bool LiveRangeEdit::canRematerializeAt(Remat &RM, VNInfo *OrigVNI, return false; // No defining instruction provided. - SlotIndex DefIdx; assert(RM.OrigMI && "No defining instruction for remattable value"); - DefIdx = LIS.getInstructionIndex(*RM.OrigMI); // Verify that all used registers are available with the same values. - if (!allUsesAvailableAt(RM.OrigMI, DefIdx, UseIdx)) + if (!VirtRegAuxInfo::allUsesAvailableAt(RM.OrigMI, UseIdx, LIS, MRI, TII)) return false; return true; @@ -221,8 +165,8 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI, // Since we're moving the DefMI load, make sure we're not extending any live // ranges. - if (!allUsesAvailableAt(DefMI, LIS.getInstructionIndex(*DefMI), - LIS.getInstructionIndex(*UseMI))) + if (!VirtRegAuxInfo::allUsesAvailableAt( + DefMI, LIS.getInstructionIndex(*UseMI), LIS, MRI, TII)) return false; // We also need to make sure it is safe to move the load. @@ -387,7 +331,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) { // register uses. That may provoke RA to split an interval at the KILL // and later result in an invalid live segment end. if (isOrigDef && DeadRemats && !HasLiveVRegUses && - TII.isTriviallyReMaterializable(*MI)) { + TII.isReMaterializable(*MI)) { LiveInterval &NewLI = createEmptyIntervalFrom(Dest, false); VNInfo::Allocator &Alloc = LIS.getVNInfoAllocator(); VNInfo *VNI = NewLI.getNextValue(Idx, Alloc); diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index bb70e7805e818..0f792b0ef206c 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -127,7 +127,7 @@ class MIRParserImpl { bool initializeSaveRestorePoints( PerFunctionMIParsingState &PFS, const std::vector &YamlSRPoints, - SmallVectorImpl &SaveRestorePoints); + llvm::SaveRestorePoints &SaveRestorePoints); bool initializeCallSiteInfo(PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF); @@ -872,11 +872,11 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS, MFI.setHasTailCall(YamlMFI.HasTailCall); MFI.setCalleeSavedInfoValid(YamlMFI.IsCalleeSavedInfoValid); MFI.setLocalFrameSize(YamlMFI.LocalFrameSize); - SmallVector SavePoints; + llvm::SaveRestorePoints SavePoints; if (initializeSaveRestorePoints(PFS, YamlMFI.SavePoints, SavePoints)) return true; MFI.setSavePoints(SavePoints); - SmallVector RestorePoints; + llvm::SaveRestorePoints RestorePoints; if (initializeSaveRestorePoints(PFS, YamlMFI.RestorePoints, RestorePoints)) return true; MFI.setRestorePoints(RestorePoints); @@ -1098,14 +1098,22 @@ bool MIRParserImpl::initializeConstantPool(PerFunctionMIParsingState &PFS, bool MIRParserImpl::initializeSaveRestorePoints( PerFunctionMIParsingState &PFS, const std::vector &YamlSRPoints, - SmallVectorImpl &SaveRestorePoints) { + llvm::SaveRestorePoints &SaveRestorePoints) { + SMDiagnostic Error; MachineBasicBlock *MBB = nullptr; for (const yaml::SaveRestorePointEntry &Entry : 
YamlSRPoints) { if (parseMBBReference(PFS, MBB, Entry.Point.Value)) return true; - SaveRestorePoints.push_back(MBB); - } + std::vector Registers; + for (auto &RegStr : Entry.Registers) { + Register Reg; + if (parseNamedRegisterReference(PFS, Reg, RegStr.Value, Error)) + return error(Error, RegStr.SourceRange); + Registers.push_back(CalleeSavedInfo(Reg)); + } + SaveRestorePoints.try_emplace(MBB, std::move(Registers)); + } return false; } diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 91a21a4adf4eb..bf8a6cdf097a9 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -149,11 +149,13 @@ static void convertMCP(yaml::MachineFunction &MF, static void convertMJTI(ModuleSlotTracker &MST, yaml::MachineJumpTable &YamlJTI, const MachineJumpTableInfo &JTI); static void convertMFI(ModuleSlotTracker &MST, yaml::MachineFrameInfo &YamlMFI, - const MachineFrameInfo &MFI); + const MachineFrameInfo &MFI, + const TargetRegisterInfo *TRI); static void convertSRPoints(ModuleSlotTracker &MST, std::vector &YamlSRPoints, - ArrayRef SaveRestorePoints); + const llvm::SaveRestorePoints &SRPoints, + const TargetRegisterInfo *TRI); static void convertStackObjects(yaml::MachineFunction &YMF, const MachineFunction &MF, ModuleSlotTracker &MST, MFPrintState &State); @@ -204,7 +206,8 @@ static void printMF(raw_ostream &OS, const MachineModuleInfo &MMI, convertMRI(YamlMF, MF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo()); MachineModuleSlotTracker &MST = State.MST; MST.incorporateFunction(MF.getFunction()); - convertMFI(MST, YamlMF.FrameInfo, MF.getFrameInfo()); + convertMFI(MST, YamlMF.FrameInfo, MF.getFrameInfo(), + MF.getSubtarget().getRegisterInfo()); convertStackObjects(YamlMF, MF, MST, State); convertEntryValueObjects(YamlMF, MF, MST); convertCallSiteObjects(YamlMF, MF, MST); @@ -339,7 +342,8 @@ static void convertMRI(yaml::MachineFunction &YamlMF, const MachineFunction &MF, } static void convertMFI(ModuleSlotTracker &MST, yaml::MachineFrameInfo &YamlMFI, - const MachineFrameInfo &MFI) { + const MachineFrameInfo &MFI, + const TargetRegisterInfo *TRI) { YamlMFI.IsFrameAddressTaken = MFI.isFrameAddressTaken(); YamlMFI.IsReturnAddressTaken = MFI.isReturnAddressTaken(); YamlMFI.HasStackMap = MFI.hasStackMap(); @@ -360,9 +364,9 @@ static void convertMFI(ModuleSlotTracker &MST, yaml::MachineFrameInfo &YamlMFI, YamlMFI.IsCalleeSavedInfoValid = MFI.isCalleeSavedInfoValid(); YamlMFI.LocalFrameSize = MFI.getLocalFrameSize(); if (!MFI.getSavePoints().empty()) - convertSRPoints(MST, YamlMFI.SavePoints, MFI.getSavePoints()); + convertSRPoints(MST, YamlMFI.SavePoints, MFI.getSavePoints(), TRI); if (!MFI.getRestorePoints().empty()) - convertSRPoints(MST, YamlMFI.RestorePoints, MFI.getRestorePoints()); + convertSRPoints(MST, YamlMFI.RestorePoints, MFI.getRestorePoints(), TRI); } static void convertEntryValueObjects(yaml::MachineFunction &YMF, @@ -619,16 +623,35 @@ static void convertMCP(yaml::MachineFunction &MF, static void convertSRPoints(ModuleSlotTracker &MST, std::vector &YamlSRPoints, - ArrayRef SRPoints) { - for (const auto &MBB : SRPoints) { + const llvm::SaveRestorePoints &SRPoints, + const TargetRegisterInfo *TRI) { + for (const auto &[MBB, CSInfos] : SRPoints) { SmallString<16> Str; yaml::SaveRestorePointEntry Entry; raw_svector_ostream StrOS(Str); StrOS << printMBBReference(*MBB); Entry.Point = StrOS.str().str(); Str.clear(); + for (const CalleeSavedInfo &Info : CSInfos) { + if (Info.getReg()) { + StrOS << printReg(Info.getReg(), TRI); + 
Entry.Registers.push_back(StrOS.str().str()); + Str.clear(); + } + } + // Sort here needed for stable output for lit tests + std::sort(Entry.Registers.begin(), Entry.Registers.end(), + [](const yaml::StringValue &Lhs, const yaml::StringValue &Rhs) { + return Lhs.Value < Rhs.Value; + }); YamlSRPoints.push_back(std::move(Entry)); } + // Sort here needed for stable output for lit tests + std::sort(YamlSRPoints.begin(), YamlSRPoints.end(), + [](const yaml::SaveRestorePointEntry &Lhs, + const yaml::SaveRestorePointEntry &Rhs) { + return Lhs.Point.Value < Rhs.Point.Value; + }); } static void convertMJTI(ModuleSlotTracker &MST, yaml::MachineJumpTable &YamlJTI, diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp index 1cde094d78e23..b2731b691d54c 100644 --- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp @@ -927,7 +927,7 @@ MLEvictAdvisor::getLIFeatureComponents(const LiveInterval &LI) const { Ret.HintWeights += Freq; } Ret.IsRemat = VirtRegAuxInfo::isRematerializable( - LI, *LIS, *VRM, *MF.getSubtarget().getInstrInfo()); + LI, *LIS, *VRM, *MRI, *MF.getSubtarget().getInstrInfo()); return Ret; } diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index 08a51b9b0242a..1cb57a4fa4258 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -606,6 +606,26 @@ void MachineBasicBlock::removeLiveIn(MCRegister Reg, LaneBitmask LaneMask) { LiveIns.erase(I); } +void MachineBasicBlock::removeLiveInOverlappedWith(MCRegister Reg) { + const MachineFunction *MF = getParent(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + // Remove Reg and its subregs from live in set. + for (MCPhysReg S : TRI->subregs_inclusive(Reg)) + removeLiveIn(S); + + // Remove live-in bitmask in super registers as well. + for (MCPhysReg Super : TRI->superregs(Reg)) { + for (MCSubRegIndexIterator SRI(Super, TRI); SRI.isValid(); ++SRI) { + if (Reg == SRI.getSubReg()) { + unsigned SubRegIndex = SRI.getSubRegIndex(); + LaneBitmask SubRegLaneMask = TRI->getSubRegIndexLaneMask(SubRegIndex); + removeLiveIn(Super, SubRegLaneMask); + break; + } + } + } +} + MachineBasicBlock::livein_iterator MachineBasicBlock::removeLiveIn(MachineBasicBlock::livein_iterator I) { // Get non-const version of iterator. @@ -1160,7 +1180,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( MachineBasicBlock *Succ, const SplitCriticalEdgeAnalyses &Analyses, std::vector> *LiveInSets, MachineDomTreeUpdater *MDTU) { - if (!canSplitCriticalEdge(Succ)) + if (!canSplitCriticalEdge(Succ, Analyses.MLI)) return nullptr; MachineFunction *MF = getParent(); @@ -1388,8 +1408,8 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge( return NMBB; } -bool MachineBasicBlock::canSplitCriticalEdge( - const MachineBasicBlock *Succ) const { +bool MachineBasicBlock::canSplitCriticalEdge(const MachineBasicBlock *Succ, + const MachineLoopInfo *MLI) const { // Splitting the critical edge to a landing pad block is non-trivial. Don't do // it in this generic function. if (Succ->isEHPad()) @@ -1403,8 +1423,17 @@ bool MachineBasicBlock::canSplitCriticalEdge( const MachineFunction *MF = getParent(); // Performance might be harmed on HW that implements branching using exec mask // where both sides of the branches are always executed. 
- if (MF->getTarget().requiresStructuredCFG()) + + if (MF->getTarget().requiresStructuredCFG()) { + // If `Succ` is a loop header, splitting the critical edge will not + // break structured CFG. + if (MLI) { + const MachineLoop *L = MLI->getLoopFor(Succ); + return L && L->getHeader() == Succ; + } + return false; + } // Do we have an Indirect jump with a jumptable that we can rewrite? int JTI = findJumpTableIndex(*this); @@ -1802,6 +1831,12 @@ bool MachineBasicBlock::sizeWithoutDebugLargerThan(unsigned Limit) const { return false; } +void MachineBasicBlock::removePHIsIncomingValuesForPredecessor( + const MachineBasicBlock &PredMBB) { + for (MachineInstr &Phi : phis()) + Phi.removePHIIncomingValueFor(PredMBB); +} + const MBBSectionID MBBSectionID::ColdSectionID(MBBSectionID::SectionType::Cold); const MBBSectionID MBBSectionID::ExceptionSectionID(MBBSectionID::SectionType::Exception); diff --git a/llvm/lib/CodeGen/MachineFrameInfo.cpp b/llvm/lib/CodeGen/MachineFrameInfo.cpp index a8306b2ef2e5b..aed68afb4eb1b 100644 --- a/llvm/lib/CodeGen/MachineFrameInfo.cpp +++ b/llvm/lib/CodeGen/MachineFrameInfo.cpp @@ -250,14 +250,14 @@ void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{ OS << "save points:\n"; for (auto &item : SavePoints) - OS << printMBBReference(*item) << "\n"; + OS << printMBBReference(*item.first) << "\n"; } else OS << "save points are empty\n"; if (!RestorePoints.empty()) { OS << "restore points:\n"; for (auto &item : RestorePoints) - OS << printMBBReference(*item) << "\n"; + OS << printMBBReference(*item.first) << "\n"; } else OS << "restore points are empty\n"; } diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 2c06c5ad4a5e4..8ad9245a47684 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -2747,3 +2747,18 @@ bool MachineInstr::mayFoldInlineAsmRegOp(unsigned OpId) const { return F.getRegMayBeFolded(); return false; } + +unsigned MachineInstr::removePHIIncomingValueFor(const MachineBasicBlock &MBB) { + assert(isPHI()); + + // Phi might have multiple entries for MBB. Need to remove them all. + unsigned RemovedCount = 0; + for (unsigned N = getNumOperands(); N > 2; N -= 2) { + if (getOperand(N - 1).getMBB() == &MBB) { + removeOperand(N - 1); + removeOperand(N - 2); + RemovedCount += 2; + } + } + return RemovedCount; +} diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index 4f164e2d53460..7acddff753693 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -244,8 +244,6 @@ namespace { bool IsGuaranteedToExecute(MachineBasicBlock *BB, MachineLoop *CurLoop); - bool isTriviallyReMaterializable(const MachineInstr &MI) const; - void EnterScope(MachineBasicBlock *MBB); void ExitScope(MachineBasicBlock *MBB); @@ -771,23 +769,6 @@ bool MachineLICMImpl::IsGuaranteedToExecute(MachineBasicBlock *BB, return true; } -/// Check if \p MI is trivially remateralizable and if it does not have any -/// virtual register uses. Even though rematerializable RA might not actually -/// rematerialize it in this scenario. In that case we do not want to hoist such -/// instruction out of the loop in a belief RA will sink it back if needed. 
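[Note] The new MachineInstr::removePHIIncomingValueFor helper added above walks the PHI operand list backwards in (value, predecessor) pairs so that erasing never invalidates the positions still to be visited, and it removes every pair for the predecessor rather than just the first. The same pattern on a plain vector of pairs (illustrative only; the real operand list also carries the PHI result at index 0 and counts removed operands rather than pairs):

    #include <cstddef>
    #include <utility>
    #include <vector>

    // A PHI is modelled here as a list of (value, predecessor-block) pairs.
    unsigned removeIncomingValuesFor(std::vector<std::pair<int, int>> &Incoming,
                                     int PredBlock) {
      unsigned RemovedPairs = 0;
      for (size_t N = Incoming.size(); N > 0; --N) {
        if (Incoming[N - 1].second == PredBlock) {
          Incoming.erase(Incoming.begin() + (N - 1)); // Safe: indices below N-1 unchanged.
          ++RemovedPairs;
        }
      }
      return RemovedPairs;
    }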
-bool MachineLICMImpl::isTriviallyReMaterializable( - const MachineInstr &MI) const { - if (!TII->isTriviallyReMaterializable(MI)) - return false; - - for (const MachineOperand &MO : MI.all_uses()) { - if (MO.getReg().isVirtual()) - return false; - } - - return true; -} - void MachineLICMImpl::EnterScope(MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "Entering " << printMBBReference(*MBB) << '\n'); @@ -1300,9 +1281,9 @@ bool MachineLICMImpl::IsProfitableToHoist(MachineInstr &MI, return false; } - // Rematerializable instructions should always be hoisted providing the - // register allocator can just pull them down again when needed. - if (isTriviallyReMaterializable(MI)) + // Trivially rematerializable instructions should always be hoisted + // providing the register allocator can just pull them down again when needed. + if (TII->isTriviallyReMaterializable(MI)) return true; // FIXME: If there are long latency loop-invariant instructions inside the @@ -1386,7 +1367,7 @@ bool MachineLICMImpl::IsProfitableToHoist(MachineInstr &MI, // High register pressure situation, only hoist if the instruction is going // to be remat'ed. - if (!isTriviallyReMaterializable(MI) && + if (!TII->isTriviallyReMaterializable(MI) && !MI.isDereferenceableInvariantLoad()) { LLVM_DEBUG(dbgs() << "Can't remat / high reg-pressure: " << MI); return false; diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index c6fa8f42757db..299bcc46e4bd2 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -4157,33 +4157,32 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) { return nullptr; } SUnit *SU; - do { - if (RegionPolicy.OnlyTopDown) { - SU = Top.pickOnlyChoice(); - if (!SU) { - CandPolicy NoPolicy; - TopCand.reset(NoPolicy); - pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand); - assert(TopCand.Reason != NoCand && "failed to find a candidate"); - tracePick(TopCand); - SU = TopCand.SU; - } - IsTopNode = true; - } else if (RegionPolicy.OnlyBottomUp) { - SU = Bot.pickOnlyChoice(); - if (!SU) { - CandPolicy NoPolicy; - BotCand.reset(NoPolicy); - pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand); - assert(BotCand.Reason != NoCand && "failed to find a candidate"); - tracePick(BotCand); - SU = BotCand.SU; - } - IsTopNode = false; - } else { - SU = pickNodeBidirectional(IsTopNode); + if (RegionPolicy.OnlyTopDown) { + SU = Top.pickOnlyChoice(); + if (!SU) { + CandPolicy NoPolicy; + TopCand.reset(NoPolicy); + pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand); + assert(TopCand.Reason != NoCand && "failed to find a candidate"); + tracePick(TopCand); + SU = TopCand.SU; } - } while (SU->isScheduled); + IsTopNode = true; + } else if (RegionPolicy.OnlyBottomUp) { + SU = Bot.pickOnlyChoice(); + if (!SU) { + CandPolicy NoPolicy; + BotCand.reset(NoPolicy); + pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand); + assert(BotCand.Reason != NoCand && "failed to find a candidate"); + tracePick(BotCand); + SU = BotCand.SU; + } + IsTopNode = false; + } else { + SU = pickNodeBidirectional(IsTopNode); + } + assert(!SU->isScheduled && "SUnit scheduled twice."); // If IsTopNode, then SU is in Top.Available and must be removed. Otherwise, // if isTopReady(), then SU is in either Top.Available or Top.Pending. 
@@ -4524,43 +4523,42 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) { return nullptr; } SUnit *SU; - do { - if (RegionPolicy.OnlyBottomUp) { - SU = Bot.pickOnlyChoice(); - if (SU) { - tracePick(Only1, /*IsTopNode=*/true, /*IsPostRA=*/true); - } else { - CandPolicy NoPolicy; - BotCand.reset(NoPolicy); - // Set the bottom-up policy based on the state of the current bottom - // zone and the instructions outside the zone, including the top zone. - setPolicy(BotCand.Policy, /*IsPostRA=*/true, Bot, nullptr); - pickNodeFromQueue(Bot, BotCand); - assert(BotCand.Reason != NoCand && "failed to find a candidate"); - tracePick(BotCand, /*IsPostRA=*/true); - SU = BotCand.SU; - } - IsTopNode = false; - } else if (RegionPolicy.OnlyTopDown) { - SU = Top.pickOnlyChoice(); - if (SU) { - tracePick(Only1, /*IsTopNode=*/true, /*IsPostRA=*/true); - } else { - CandPolicy NoPolicy; - TopCand.reset(NoPolicy); - // Set the top-down policy based on the state of the current top zone - // and the instructions outside the zone, including the bottom zone. - setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr); - pickNodeFromQueue(Top, TopCand); - assert(TopCand.Reason != NoCand && "failed to find a candidate"); - tracePick(TopCand, /*IsPostRA=*/true); - SU = TopCand.SU; - } - IsTopNode = true; + if (RegionPolicy.OnlyBottomUp) { + SU = Bot.pickOnlyChoice(); + if (SU) { + tracePick(Only1, /*IsTopNode=*/true, /*IsPostRA=*/true); } else { - SU = pickNodeBidirectional(IsTopNode); + CandPolicy NoPolicy; + BotCand.reset(NoPolicy); + // Set the bottom-up policy based on the state of the current bottom + // zone and the instructions outside the zone, including the top zone. + setPolicy(BotCand.Policy, /*IsPostRA=*/true, Bot, nullptr); + pickNodeFromQueue(Bot, BotCand); + assert(BotCand.Reason != NoCand && "failed to find a candidate"); + tracePick(BotCand, /*IsPostRA=*/true); + SU = BotCand.SU; } - } while (SU->isScheduled); + IsTopNode = false; + } else if (RegionPolicy.OnlyTopDown) { + SU = Top.pickOnlyChoice(); + if (SU) { + tracePick(Only1, /*IsTopNode=*/true, /*IsPostRA=*/true); + } else { + CandPolicy NoPolicy; + TopCand.reset(NoPolicy); + // Set the top-down policy based on the state of the current top zone + // and the instructions outside the zone, including the bottom zone. 
+ setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr); + pickNodeFromQueue(Top, TopCand); + assert(TopCand.Reason != NoCand && "failed to find a candidate"); + tracePick(TopCand, /*IsPostRA=*/true); + SU = TopCand.SU; + } + IsTopNode = true; + } else { + SU = pickNodeBidirectional(IsTopNode); + } + assert(!SU->isScheduled && "SUnit scheduled twice."); if (SU->isTopReady()) Top.removeReady(SU); diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index 9ec5151a039b7..d5153b7fb6207 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -2187,11 +2187,9 @@ static void clearKillFlags(MachineInstr *MI, MachineBasicBlock &CurBB, static void updateLiveIn(MachineInstr *MI, MachineBasicBlock *SuccBB, const SmallVectorImpl &UsedOpsInCopy, const SmallVectorImpl &DefedRegsInCopy) { - MachineFunction &MF = *SuccBB->getParent(); - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); for (Register DefReg : DefedRegsInCopy) - for (MCPhysReg S : TRI->subregs_inclusive(DefReg)) - SuccBB->removeLiveIn(S); + SuccBB->removeLiveInOverlappedWith(DefReg); + for (auto U : UsedOpsInCopy) SuccBB->addLiveIn(MI->getOperand(U).getReg()); SuccBB->sortUniqueLiveIns(); diff --git a/llvm/lib/CodeGen/MachineStripDebug.cpp b/llvm/lib/CodeGen/MachineStripDebug.cpp index ea291f64bff43..d54fe023a4a7e 100644 --- a/llvm/lib/CodeGen/MachineStripDebug.cpp +++ b/llvm/lib/CodeGen/MachineStripDebug.cpp @@ -58,7 +58,7 @@ struct StripDebugMachineModule : public ModulePass { // preservation. Preserve it for now. if (MI.getNumOperands() > 1) { LLVM_DEBUG(dbgs() << "Removing debug instruction " << MI); - MBB.erase(&MI); + MBB.erase_instr(&MI); Changed |= true; continue; } diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index e911ce8a75828..115485509c4a5 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -1549,7 +1549,7 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { report("G_BUILD_VECTOR result element type must match source type", MI); if (DstTy.getNumElements() != MI->getNumOperands() - 1) - report("G_BUILD_VECTOR must have an operand for each elemement", MI); + report("G_BUILD_VECTOR must have an operand for each element", MI); for (const MachineOperand &MO : llvm::drop_begin(MI->operands(), 2)) if (MRI->getType(MI->getOperand(1).getReg()) != MRI->getType(MO.getReg())) @@ -2398,11 +2398,11 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { // The next two checks allow COPY between physical and virtual registers, // when the virtual register has a scalable size and the physical register - // has a fixed size. These checks allow COPY between *potentialy* mismatched - // sizes. However, once RegisterBankSelection occurs, MachineVerifier should - // be able to resolve a fixed size for the scalable vector, and at that - // point this function will know for sure whether the sizes are mismatched - // and correctly report a size mismatch. + // has a fixed size. These checks allow COPY between *potentially* + // mismatched sizes. However, once RegisterBankSelection occurs, + // MachineVerifier should be able to resolve a fixed size for the scalable + // vector, and at that point this function will know for sure whether the + // sizes are mismatched and correctly report a size mismatch. 
if (SrcReg.isPhysical() && DstReg.isVirtual() && DstSize.isScalable() && !SrcSize.isScalable()) break; @@ -3213,13 +3213,13 @@ struct VRegFilter { private: static constexpr unsigned SparseUniverseMax = 10 * 1024 * 8; - // VRegs indexed within SparseUniverseMax are tracked by Sparse, those beyound - // are tracked by Dense. The only purpose of the threashold and the Dense set + // VRegs indexed within SparseUniverseMax are tracked by Sparse, those beyond + // are tracked by Dense. The only purpose of the threshold and the Dense set // is to have a reasonably growing memory usage in pathological cases (large // number of very sparse VRegFilter instances live at the same time). In // practice even in the worst-by-execution time cases having all elements // tracked by Sparse (very large SparseUniverseMax scenario) tends to be more - // space efficient than if tracked by Dense. The threashold is set to keep the + // space efficient than if tracked by Dense. The threshold is set to keep the // worst-case memory usage within 2x of figures determined empirically for // "all Dense" scenario in such worst-by-execution-time cases. BitVector Sparse; @@ -3459,7 +3459,7 @@ void MachineVerifier::visitMachineFunctionAfter() { // Check live-in list of each MBB. If a register is live into MBB, check // that the register is in regsLiveOut of each predecessor block. Since - // this must come from a definition in the predecesssor or its live-in + // this must come from a definition in the predecessor or its live-in // list, this will catch a live-through case where the predecessor does not // have the register in its live-in list. This currently only checks // registers that have no aliases, are not allocatable and are not diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp index 21bf052d1fdaf..d47ed65540cf4 100644 --- a/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -859,20 +860,6 @@ void ModuloScheduleExpander::splitLifetimes(MachineBasicBlock *KernelBB, } } -/// Remove the incoming block from the Phis in a basic block. -static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) { - for (MachineInstr &MI : *BB) { - if (!MI.isPHI()) - break; - for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) - if (MI.getOperand(i + 1).getMBB() == Incoming) { - MI.removeOperand(i + 1); - MI.removeOperand(i); - break; - } - } -} - /// Create branches from each prolog basic block to the appropriate epilog /// block. These edges are needed if the loop ends before reaching the /// kernel. @@ -906,7 +893,7 @@ void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB, Prolog->removeSuccessor(LastPro); LastEpi->removeSuccessor(Epilog); numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc()); - removePhis(Epilog, LastEpi); + Epilog->removePHIsIncomingValuesForPredecessor(*LastEpi); // Remove the blocks that are no longer referenced. 
if (LastPro != LastEpi) { for (auto &MI : *LastEpi) @@ -924,7 +911,7 @@ void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB, LastPro->eraseFromParent(); } else { numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc()); - removePhis(Epilog, Prolog); + Epilog->removePHIsIncomingValuesForPredecessor(*Prolog); } LastPro = Prolog; LastEpi = Epilog; diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp index 7bfc9dae59fcf..fb3e6482bb096 100644 --- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp +++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp @@ -1004,9 +1004,8 @@ bool PeepholeOptimizer::findNextSource(const TargetRegisterClass *DefRC, // Thus, instead of maintaining untested code, we will revisit that if // that changes at some point. Register Reg = RegSubReg.Reg; - SmallVector SrcToLook; RegSubRegPair CurSrcPair = RegSubReg; - SrcToLook.push_back(CurSrcPair); + SmallVector SrcToLook = {CurSrcPair}; unsigned PHICount = 0; do { diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 8fc0748ecc0e3..0be75e073dedd 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -351,8 +351,8 @@ bool PEIImpl::run(MachineFunction &MF) { delete RS; SaveBlocks.clear(); RestoreBlocks.clear(); - MFI.setSavePoints({}); - MFI.setRestorePoints({}); + MFI.clearSavePoints(); + MFI.clearRestorePoints(); return true; } @@ -431,10 +431,12 @@ void PEIImpl::calculateSaveRestoreBlocks(MachineFunction &MF) { if (!MFI.getSavePoints().empty()) { assert(MFI.getSavePoints().size() == 1 && "Multiple save points are not yet supported!"); - SaveBlocks.push_back(MFI.getSavePoints().front()); + const auto &SavePoint = *MFI.getSavePoints().begin(); + SaveBlocks.push_back(SavePoint.first); assert(MFI.getRestorePoints().size() == 1 && "Multiple restore points are not yet supported!"); - MachineBasicBlock *RestoreBlock = MFI.getRestorePoints().front(); + const auto &RestorePoint = *MFI.getRestorePoints().begin(); + MachineBasicBlock *RestoreBlock = RestorePoint.first; // If RestoreBlock does not have any successor and is not a return block // then the end point is unreachable and we do not need to insert any // epilogue. @@ -563,8 +565,9 @@ static void updateLiveness(MachineFunction &MF) { assert(MFI.getSavePoints().size() < 2 && "Multiple save points not yet supported!"); - MachineBasicBlock *Save = - MFI.getSavePoints().empty() ? nullptr : MFI.getSavePoints().front(); + MachineBasicBlock *Save = MFI.getSavePoints().empty() + ? nullptr + : (*MFI.getSavePoints().begin()).first; if (!Save) Save = Entry; @@ -577,8 +580,9 @@ static void updateLiveness(MachineFunction &MF) { assert(MFI.getRestorePoints().size() < 2 && "Multiple restore points not yet supported!"); - MachineBasicBlock *Restore = - MFI.getRestorePoints().empty() ? nullptr : MFI.getRestorePoints().front(); + MachineBasicBlock *Restore = MFI.getRestorePoints().empty() + ? 
nullptr + : (*MFI.getRestorePoints().begin()).first; if (Restore) // By construction Restore cannot be visited, otherwise it // means there exists a path to Restore that does not go @@ -687,6 +691,20 @@ void PEIImpl::spillCalleeSavedRegs(MachineFunction &MF) { MFI.setCalleeSavedInfoValid(true); std::vector &CSI = MFI.getCalleeSavedInfo(); + + // Fill SavePoints and RestorePoints with CalleeSavedRegisters + if (!MFI.getSavePoints().empty()) { + SaveRestorePoints SaveRestorePts; + for (const auto &SavePoint : MFI.getSavePoints()) + SaveRestorePts.insert({SavePoint.first, CSI}); + MFI.setSavePoints(std::move(SaveRestorePts)); + + SaveRestorePts.clear(); + for (const auto &RestorePoint : MFI.getRestorePoints()) + SaveRestorePts.insert({RestorePoint.first, CSI}); + MFI.setRestorePoints(std::move(SaveRestorePts)); + } + if (!CSI.empty()) { if (!MFI.hasCalls()) NumLeafFuncWithSpills++; diff --git a/llvm/lib/CodeGen/RDFLiveness.cpp b/llvm/lib/CodeGen/RDFLiveness.cpp index 318422b46e811..2e1cf499eab41 100644 --- a/llvm/lib/CodeGen/RDFLiveness.cpp +++ b/llvm/lib/CodeGen/RDFLiveness.cpp @@ -652,8 +652,9 @@ void Liveness::computePhiInfo() { // defs, cache the result of subtracting these defs from a given register // ref. using RefHash = std::hash; - using RefEqual = std::equal_to; - using SubMap = std::unordered_map; + using RefEqual = RegisterRefEqualTo; + using SubMap = + std::unordered_map; std::unordered_map Subs; auto ClearIn = [](RegisterRef RR, const RegisterAggr &Mid, SubMap &SM) { if (Mid.empty()) @@ -868,7 +869,7 @@ void Liveness::computeLiveIns() { std::vector LV; for (const MachineBasicBlock::RegisterMaskPair &LI : B.liveins()) LV.push_back(RegisterRef(LI.PhysReg, LI.LaneMask)); - llvm::sort(LV, std::less(PRI)); + llvm::sort(LV, RegisterRefLess(PRI)); dbgs() << printMBBReference(B) << "\t rec = {"; for (auto I : LV) dbgs() << ' ' << Print(I, DFG); @@ -878,7 +879,7 @@ void Liveness::computeLiveIns() { LV.clear(); for (RegisterRef RR : LiveMap[&B].refs()) LV.push_back(RR); - llvm::sort(LV, std::less(PRI)); + llvm::sort(LV, RegisterRefLess(PRI)); dbgs() << "\tcomp = {"; for (auto I : LV) dbgs() << ' ' << Print(I, DFG); diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp index 0d44ddc428570..f2c2f74755ace 100644 --- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp +++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp @@ -224,7 +224,7 @@ bool DefaultEvictionAdvisor::canEvictHintInterference( const LiveInterval &VirtReg, MCRegister PhysReg, const SmallVirtRegSet &FixedRegisters) const { EvictionCost MaxCost; - MaxCost.setBrokenHints(1); + MaxCost.setBrokenHints(MRI->getRegClass(VirtReg.reg())->getCopyCost()); return canEvictInterferenceBasedOnCost(VirtReg, PhysReg, true, MaxCost, FixedRegisters); } @@ -300,12 +300,14 @@ bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost( return false; // We permit breaking cascades for urgent evictions. It should be the // last resort, though, so make it really expensive. - Cost.BrokenHints += 10; + Cost.BrokenHints += 10 * MRI->getRegClass(Intf->reg())->getCopyCost(); } // Would this break a satisfied hint? bool BreaksHint = VRM->hasPreferredPhys(Intf->reg()); // Update eviction cost. - Cost.BrokenHints += BreaksHint; + if (BreaksHint) + Cost.BrokenHints += MRI->getRegClass(Intf->reg())->getCopyCost(); + Cost.MaxWeight = std::max(Cost.MaxWeight, Intf->weight()); // Abort if this would be too expensive. 
if (Cost >= MaxCost) diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index d004815d2c17a..8e6cf3e6b51b3 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -1383,21 +1383,37 @@ bool RAGreedy::trySplitAroundHintReg(MCPhysReg Hint, // Compute the cost of assigning a non Hint physical register to VirtReg. // We define it as the total frequency of broken COPY instructions to/from // Hint register, and after split, they can be deleted. - for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) { - if (!TII->isFullCopyInstr(Instr)) + + // FIXME: This is miscounting the costs with subregisters. In particular, this + // should support recognizing SplitKit formed copy bundles instead of direct + // copy instructions, which will appear in the same block. + for (const MachineOperand &Opnd : MRI->reg_nodbg_operands(Reg)) { + const MachineInstr &Instr = *Opnd.getParent(); + if (!Instr.isCopy() || Opnd.isImplicit()) continue; - Register OtherReg = Instr.getOperand(1).getReg(); - if (OtherReg == Reg) { - OtherReg = Instr.getOperand(0).getReg(); - if (OtherReg == Reg) - continue; - // Check if VirtReg interferes with OtherReg after this COPY instruction. - if (VirtReg.liveAt(LIS->getInstructionIndex(Instr).getRegSlot())) - continue; - } + + // Look for the other end of the copy. + const bool IsDef = Opnd.isDef(); + const MachineOperand &OtherOpnd = Instr.getOperand(IsDef); + Register OtherReg = OtherOpnd.getReg(); + assert(Reg == Opnd.getReg()); + if (OtherReg == Reg) + continue; + + unsigned SubReg = Opnd.getSubReg(); + unsigned OtherSubReg = OtherOpnd.getSubReg(); + if (SubReg && OtherSubReg && SubReg != OtherSubReg) + continue; + + // Check if VirtReg interferes with OtherReg after this COPY instruction. + if (!IsDef && VirtReg.liveAt(LIS->getInstructionIndex(Instr).getRegSlot())) + continue; + MCRegister OtherPhysReg = OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg); - if (OtherPhysReg == Hint) + MCRegister ThisHint = + SubReg ? TRI->getSubReg(Hint, SubReg) : MCRegister(Hint); + if (OtherPhysReg == ThisHint) Cost += MBFI->getBlockFreq(Instr.getParent()); } diff --git a/llvm/lib/CodeGen/RegAllocScore.cpp b/llvm/lib/CodeGen/RegAllocScore.cpp index b86647dbe0a48..9c9cc1f1f0b7b 100644 --- a/llvm/lib/CodeGen/RegAllocScore.cpp +++ b/llvm/lib/CodeGen/RegAllocScore.cpp @@ -79,8 +79,7 @@ llvm::calculateRegAllocScore(const MachineFunction &MF, return MBFI.getBlockFreqRelativeToEntryBlock(&MBB); }, [&](const MachineInstr &MI) { - return MF.getSubtarget().getInstrInfo()->isTriviallyReMaterializable( - MI); + return MF.getSubtarget().getInstrInfo()->isReMaterializable(MI); }); } diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index db00f54daeb62..7ac1aef83777a 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -20,6 +20,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/CalcSpillWeights.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRangeEdit.h" @@ -294,10 +295,10 @@ class RegisterCoalescer : private LiveRangeEdit::Delegate { /// We found a copy which can be moved to its less frequent predecessor. 
bool removePartialRedundancy(const CoalescerPair &CP, MachineInstr &CopyMI); - /// If the source of a copy is defined by a - /// trivial computation, replace the copy by rematerialize the definition. - bool reMaterializeTrivialDef(const CoalescerPair &CP, MachineInstr *CopyMI, - bool &IsDefCopy); + /// If the source of a copy is defined by a CheapAsAMove computation, + /// replace the copy by rematerialize the definition. + bool reMaterializeDef(const CoalescerPair &CP, MachineInstr *CopyMI, + bool &IsDefCopy); /// Return true if a copy involving a physreg should be joined. bool canJoinPhys(const CoalescerPair &CP); @@ -1297,9 +1298,9 @@ static bool definesFullReg(const MachineInstr &MI, Register Reg) { return false; } -bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, - MachineInstr *CopyMI, - bool &IsDefCopy) { +bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, + MachineInstr *CopyMI, + bool &IsDefCopy) { IsDefCopy = false; Register SrcReg = CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg(); unsigned SrcIdx = CP.isFlipped() ? CP.getDstIdx() : CP.getSrcIdx(); @@ -1325,7 +1326,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, if (!TII->isAsCheapAsAMove(*DefMI)) return false; - if (!TII->isTriviallyReMaterializable(*DefMI)) + if (!TII->isReMaterializable(*DefMI)) return false; if (!definesFullReg(*DefMI, SrcReg)) @@ -1393,10 +1394,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, } } - SmallVector NewRegs; - LiveRangeEdit Edit(&SrcInt, NewRegs, *MF, *LIS, nullptr, this); - SlotIndex DefIdx = LIS->getInstructionIndex(*DefMI); - if (!Edit.allUsesAvailableAt(DefMI, DefIdx, CopyIdx)) + if (!VirtRegAuxInfo::allUsesAvailableAt(DefMI, CopyIdx, *LIS, *MRI, *TII)) return false; DebugLoc DL = CopyMI->getDebugLoc(); @@ -1405,6 +1403,8 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, std::next(MachineBasicBlock::iterator(CopyMI)); LiveRangeEdit::Remat RM(ValNo); RM.OrigMI = DefMI; + SmallVector NewRegs; + LiveRangeEdit Edit(&SrcInt, NewRegs, *MF, *LIS, nullptr, this); Edit.rematerializeAt(*MBB, MII, DstReg, RM, *TRI, false, SrcIdx, CopyMI); MachineInstr &NewMI = *std::prev(MII); NewMI.setDebugLoc(DL); @@ -1475,10 +1475,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, // // The implicit-def of the super register may have been reduced to // subregisters depending on the uses. - - bool NewMIDefinesFullReg = false; - - SmallVector NewMIImplDefs; + SmallVector, 4> NewMIImplDefs; for (unsigned i = NewMI.getDesc().getNumOperands(), e = NewMI.getNumOperands(); i != e; ++i) { @@ -1486,9 +1483,6 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, if (MO.isReg() && MO.isDef()) { assert(MO.isImplicit()); if (MO.getReg().isPhysical()) { - if (MO.getReg() == DstReg) - NewMIDefinesFullReg = true; - assert(MO.isImplicit() && MO.getReg().isPhysical() && (MO.isDead() || (DefSubIdx && @@ -1496,7 +1490,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, MCRegister((unsigned)NewMI.getOperand(0).getReg())) || TRI->isSubRegisterEq(NewMI.getOperand(0).getReg(), MO.getReg()))))); - NewMIImplDefs.push_back(MO.getReg().asMCReg()); + NewMIImplDefs.push_back({i, MO.getReg()}); } else { assert(MO.getReg() == NewMI.getOperand(0).getReg()); @@ -1641,12 +1635,30 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, // been asked for. If so it must implicitly define the whole thing. 
assert(DstReg.isPhysical() && "Only expect virtual or physical registers in remat"); + + // When we're rematerializing into a not-quite-right register we already add + // the real definition as an implicit-def, but we should also be marking the + // "official" register as dead, since nothing else is going to use it as a + // result of this remat. Not doing this can affect pressure tracking. NewMI.getOperand(0).setIsDead(true); - if (!NewMIDefinesFullReg) { + bool HasDefMatchingCopy = false; + for (auto [OpIndex, Reg] : NewMIImplDefs) { + if (Reg != DstReg) + continue; + // Also, if CopyDstReg is a sub-register of DstReg (and it is defined), we + // must mark DstReg as dead since it is not going to be used as a result of + // this remat. + if (DstReg != CopyDstReg) + NewMI.getOperand(OpIndex).setIsDead(true); + else + HasDefMatchingCopy = true; + } + + // If NewMI does not already have an implicit-def CopyDstReg add one now. + if (!HasDefMatchingCopy) NewMI.addOperand(MachineOperand::CreateReg( CopyDstReg, true /*IsDef*/, true /*IsImp*/, false /*IsKill*/)); - } // Record small dead def live-ranges for all the subregisters // of the destination register. @@ -1677,8 +1689,8 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, NewMI.addOperand(MO); SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI); - for (MCRegister Reg : NewMIImplDefs) { - for (MCRegUnit Unit : TRI->regunits(Reg)) + for (Register Reg : make_second_range(NewMIImplDefs)) { + for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) if (LiveRange *LR = LIS->getCachedRegUnit(Unit)) LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator()); } @@ -2128,10 +2140,10 @@ bool RegisterCoalescer::joinCopy( << printReg(CP.getSrcReg(), TRI) << " with " << printReg(CP.getDstReg(), TRI, CP.getSrcIdx()) << '\n'); if (!canJoinPhys(CP)) { - // Before giving up coalescing, if definition of source is defined by - // trivial computation, try rematerializing it. + // Before giving up coalescing, try rematerializing the source of + // the copy instead if it is cheap. bool IsDefCopy = false; - if (reMaterializeTrivialDef(CP, CopyMI, IsDefCopy)) + if (reMaterializeDef(CP, CopyMI, IsDefCopy)) return true; if (IsDefCopy) Again = true; // May be possible to coalesce later. @@ -2167,10 +2179,9 @@ bool RegisterCoalescer::joinCopy( if (!joinIntervals(CP)) { // Coalescing failed. - // If definition of source is defined by trivial computation, try - // rematerializing it. + // Try rematerializing the definition of the source if it is cheap.
bool IsDefCopy = false; - if (reMaterializeTrivialDef(CP, CopyMI, IsDefCopy)) + if (reMaterializeDef(CP, CopyMI, IsDefCopy)) return true; // If we can eliminate the copy without merging the live segments, do so diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index a6ba6e518899f..77df4b4598c48 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17770,7 +17770,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { // N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math) ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true); if (N1C && N1C->isZero()) - if (N1C->isNegative() || Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) + if (N1C->isNegative() || Flags.hasNoSignedZeros()) return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) @@ -17823,11 +17823,10 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { return DAG.getConstantFP(0.0, DL, VT); } - // If 'unsafe math' or reassoc and nsz, fold lots of things. + // If reassoc and nsz, fold lots of things. // TODO: break out portions of the transformations below for which Unsafe is // considered and which do not require both nsz and reassoc - if ((Options.NoSignedZerosFPMath || - (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) && + if (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros() && AllowNewConst) { // fadd (fadd x, c1), c2 -> fadd x, c1 + c2 if (N1CFP && N0.getOpcode() == ISD::FADD && @@ -17911,10 +17910,9 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { DAG.getConstantFP(4.0, DL, VT)); } } - } // enable-unsafe-fp-math && AllowNewConst + } // reassoc && nsz && AllowNewConst - if ((Options.NoSignedZerosFPMath || - (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros()))) { + if (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros()) { // Fold fadd(vecreduce(x), vecreduce(y)) -> vecreduce(fadd(x, y)) if (SDValue SD = reassociateReduction(ISD::VECREDUCE_FADD, ISD::FADD, DL, VT, N0, N1, Flags)) @@ -17985,8 +17983,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { // (fsub A, 0) -> A if (N1CFP && N1CFP->isZero()) { - if (!N1CFP->isNegative() || Options.NoSignedZerosFPMath || - Flags.hasNoSignedZeros()) { + if (!N1CFP->isNegative() || Flags.hasNoSignedZeros()) { return N0; } } @@ -17999,8 +17996,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { // (fsub -0.0, N1) -> -N1 if (N0CFP && N0CFP->isZero()) { - if (N0CFP->isNegative() || - (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) { + if (N0CFP->isNegative() || Flags.hasNoSignedZeros()) { // We cannot replace an FSUB(+-0.0,X) with FNEG(X) when denormals are // flushed to zero, unless all users treat denorms as zero (DAZ). 
// FIXME: This transform will change the sign of a NaN and the behavior @@ -18016,8 +18012,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { } } - if ((Options.NoSignedZerosFPMath || - (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) && + if (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros() && N1.getOpcode() == ISD::FADD) { // X - (X + Y) -> -Y if (N0 == N1->getOperand(0)) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 7aa293af963e6..8fc7eabf90ea8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -11161,8 +11161,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, APFloat FrexpMant = frexp(C->getValueAPF(), FrexpExp, APFloat::rmNearestTiesToEven); SDValue Result0 = getConstantFP(FrexpMant, DL, VTList.VTs[0]); - SDValue Result1 = - getConstant(FrexpMant.isFinite() ? FrexpExp : 0, DL, VTList.VTs[1]); + SDValue Result1 = getSignedConstant(FrexpMant.isFinite() ? FrexpExp : 0, + DL, VTList.VTs[1]); return getNode(ISD::MERGE_VALUES, DL, VTList, {Result0, Result1}, Flags); } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 80500e48351e4..cc503d324e74b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -7492,7 +7492,6 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, // Pre-increment recursion depth for use in recursive calls. ++Depth; const SDNodeFlags Flags = Op->getFlags(); - const TargetOptions &Options = DAG.getTarget().Options; EVT VT = Op.getValueType(); unsigned Opcode = Op.getOpcode(); @@ -7572,7 +7571,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, return DAG.getBuildVector(VT, DL, Ops); } case ISD::FADD: { - if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) + if (!Flags.hasNoSignedZeros()) break; // After operation legalization, it might not be legal to create new FSUBs. @@ -7617,7 +7616,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, } case ISD::FSUB: { // We can't turn -(A-B) into B-A when we honor signed zeros. 
- if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) + if (!Flags.hasNoSignedZeros()) break; SDValue X = Op.getOperand(0), Y = Op.getOperand(1); @@ -7678,7 +7677,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, } case ISD::FMA: case ISD::FMAD: { - if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) + if (!Flags.hasNoSignedZeros()) break; SDValue X = Op.getOperand(0), Y = Op.getOperand(1), Z = Op.getOperand(2); @@ -8797,7 +8796,6 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node, EVT VT = Node->getValueType(0); EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); bool IsMax = Opc == ISD::FMAXIMUMNUM; - const TargetOptions &Options = DAG.getTarget().Options; SDNodeFlags Flags = Node->getFlags(); unsigned NewOp = @@ -8839,7 +8837,9 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node, return DAG.getNode(IEEE2008Op, DL, VT, LHS, RHS, Flags); } - if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT)) + if (VT.isVector() && + (isOperationLegalOrCustomOrPromote(Opc, VT.getVectorElementType()) || + !isOperationLegalOrCustom(ISD::VSELECT, VT))) return DAG.UnrollVectorOp(Node); // If only one operand is NaN, override it with another operand. @@ -8856,8 +8856,8 @@ SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node, // TODO: We need quiet sNaN if strictfp. // Fixup signed zero behavior. - if (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros() || - DAG.isKnownNeverZeroFloat(LHS) || DAG.isKnownNeverZeroFloat(RHS)) { + if (Flags.hasNoSignedZeros() || DAG.isKnownNeverZeroFloat(LHS) || + DAG.isKnownNeverZeroFloat(RHS)) { return MinMax; } SDValue TestZero = @@ -9775,11 +9775,12 @@ SDValue TargetLowering::expandABD(SDNode *N, SelectionDAG &DAG) const { return DAG.getNode(ISD::SUB, dl, VT, Cmp, Xor); } - // Similar to the branchless expansion, use the (sign-extended) usubo overflow - // flag if the (scalar) type is illegal as this is more likely to legalize - // cleanly: - // abdu(lhs, rhs) -> sub(xor(sub(lhs, rhs), uof(lhs, rhs)), uof(lhs, rhs)) - if (!IsSigned && VT.isScalarInteger() && !isTypeLegal(VT)) { + // Similar to the branchless expansion, if we don't prefer selects, use the + // (sign-extended) usubo overflow flag if the (scalar) type is illegal as this + // is more likely to legalize cleanly: abdu(lhs, rhs) -> sub(xor(sub(lhs, + // rhs), uof(lhs, rhs)), uof(lhs, rhs)) + if (!IsSigned && VT.isScalarInteger() && !isTypeLegal(VT) && + !preferSelectsOverBooleanArithmetic(VT)) { SDValue USubO = DAG.getNode(ISD::USUBO, dl, DAG.getVTList(VT, MVT::i1), {LHS, RHS}); SDValue Cmp = DAG.getNode(ISD::SIGN_EXTEND, dl, VT, USubO.getValue(1)); @@ -10974,7 +10975,8 @@ SDValue TargetLowering::expandCMP(SDNode *Node, SelectionDAG &DAG) const { // because one of the conditions can be merged with one of the selects. // And finally, if we don't know the contents of high bits of a boolean value // we can't perform any arithmetic either. 
- if (shouldExpandCmpUsingSelects(VT) || BoolVT.getScalarSizeInBits() == 1 || + if (preferSelectsOverBooleanArithmetic(VT) || + BoolVT.getScalarSizeInBits() == 1 || getBooleanContents(BoolVT) == UndefinedBooleanContent) { SDValue SelectZeroOrOne = DAG.getSelect(dl, ResVT, IsGT, DAG.getConstant(1, dl, ResVT), diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp index 938f2d756bc87..826e4126de44c 100644 --- a/llvm/lib/CodeGen/ShrinkWrap.cpp +++ b/llvm/lib/CodeGen/ShrinkWrap.cpp @@ -967,12 +967,12 @@ bool ShrinkWrapImpl::run(MachineFunction &MF) { << "\nRestore: " << printMBBReference(*Restore) << '\n'); MachineFrameInfo &MFI = MF.getFrameInfo(); - SmallVector SavePoints; - SmallVector RestorePoints; - if (Save) { - SavePoints.push_back(Save); - RestorePoints.push_back(Restore); - } + + // List of CalleeSavedInfo for registers will be added during prologepilog + // pass + SaveRestorePoints SavePoints({{Save, {}}}); + SaveRestorePoints RestorePoints({{Restore, {}}}); + MFI.setSavePoints(SavePoints); MFI.setRestorePoints(RestorePoints); ++NumCandidates; diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index 9b1420a94142d..8e48d19537165 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -375,13 +375,7 @@ void TailDuplicator::processPHI( if (!Remove) return; - // MI might have multiple entries for PredBB. Need to remove them all. - for (unsigned N = MI->getNumOperands(); N > 2; N -= 2) { - if (MI->getOperand(N - 1).getMBB() == PredBB) { - MI->removeOperand(N - 1); - MI->removeOperand(N - 2); - } - } + MI->removePHIIncomingValueFor(*PredBB); if (MI->getNumOperands() == 1 && !TailBB->hasAddressTaken()) MI->eraseFromParent(); diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 5be89b49fb6ba..2f3b7a2c8fcdf 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -1590,7 +1590,7 @@ MachineTraceStrategy TargetInstrInfo::getMachineCombinerTraceStrategy() const { return MachineTraceStrategy::TS_MinInstrCount; } -bool TargetInstrInfo::isReallyTriviallyReMaterializable( +bool TargetInstrInfo::isReMaterializableImpl( const MachineInstr &MI) const { const MachineFunction &MF = *MI.getMF(); const MachineRegisterInfo &MRI = MF.getRegInfo(); diff --git a/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/llvm/lib/CodeGen/UnreachableBlockElim.cpp index 512e83db40a5a..cf8c1a7bd08d0 100644 --- a/llvm/lib/CodeGen/UnreachableBlockElim.cpp +++ b/llvm/lib/CodeGen/UnreachableBlockElim.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/UnreachableBlockElim.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -155,18 +156,7 @@ bool UnreachableMachineBlockElim::run(MachineFunction &F) { if (MDT && MDT->getNode(&BB)) MDT->eraseNode(&BB); while (!BB.succ_empty()) { - MachineBasicBlock* succ = *BB.succ_begin(); - - for (MachineInstr &Phi : succ->phis()) { - for (unsigned i = Phi.getNumOperands() - 1; i >= 2; i -= 2) { - if (Phi.getOperand(i).isMBB() && - Phi.getOperand(i).getMBB() == &BB) { - Phi.removeOperand(i); - Phi.removeOperand(i - 1); - } - } - } - + (*BB.succ_begin())->removePHIsIncomingValuesForPredecessor(BB); BB.removeSuccessor(BB.succ_begin()); } } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp 
b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 8ec3f1729b974..5ab80e339a1ad 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -851,6 +851,86 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, } break; } + case DW_AT_LLVM_stmt_sequence: { + // Make sure the offset in the DW_AT_LLVM_stmt_sequence attribute is valid + // and points to a valid sequence offset in the line table. + auto SectionOffset = AttrValue.Value.getAsSectionOffset(); + if (!SectionOffset) { + ReportError("Invalid DW_AT_LLVM_stmt_sequence encoding", + "DIE has invalid DW_AT_LLVM_stmt_sequence encoding"); + break; + } + if (*SectionOffset >= U->getLineSection().Data.size()) { + ReportError( + "DW_AT_LLVM_stmt_sequence offset out of bounds", + "DW_AT_LLVM_stmt_sequence offset is beyond .debug_line bounds: " + + llvm::formatv("{0:x8}", *SectionOffset)); + break; + } + + // Get the line table for this unit to validate bounds + const auto *LineTable = DCtx.getLineTableForUnit(U); + if (!LineTable) { + ReportError("DW_AT_LLVM_stmt_sequence without line table", + "DIE has DW_AT_LLVM_stmt_sequence but compile unit has no " + "line table"); + break; + } + + // Get the DW_AT_stmt_list offset from the compile unit DIE + DWARFDie CUDie = U->getUnitDIE(); + auto StmtListOffset = toSectionOffset(CUDie.find(DW_AT_stmt_list)); + if (!StmtListOffset) { + ReportError("DW_AT_LLVM_stmt_sequence without DW_AT_stmt_list", + "DIE has DW_AT_LLVM_stmt_sequence but compile unit has no " + "DW_AT_stmt_list"); + break; + } + + const int8_t DwarfOffset = + LineTable->Prologue.getFormParams().getDwarfOffsetByteSize(); + // Calculate the bounds of this specific line table + uint64_t LineTableStart = *StmtListOffset; + uint64_t PrologueLength = LineTable->Prologue.PrologueLength; + uint64_t TotalLength = LineTable->Prologue.TotalLength; + uint64_t LineTableEnd = LineTableStart + TotalLength + DwarfOffset; + + // Per the DWARF specification, the following three fields do not + // count toward the prologue length, so calculate SequencesStart + // accordingly: + uint64_t InitialLengthSize = DwarfOffset; + // Version field is always 2 bytes + uint64_t VersionSize = 2; + uint64_t PrologueLengthSize = DwarfOffset; + uint64_t SequencesStart = LineTableStart + InitialLengthSize + VersionSize + + PrologueLengthSize + PrologueLength; + + // Check if the offset is within the bounds of this specific line table + if (*SectionOffset < SequencesStart || *SectionOffset >= LineTableEnd) { + ReportError("DW_AT_LLVM_stmt_sequence offset out of line table bounds", + "DW_AT_LLVM_stmt_sequence offset " + + llvm::formatv("{0:x8}", *SectionOffset) + + " is not within the line table bounds [" + + llvm::formatv("{0:x8}", SequencesStart) + ", " + + llvm::formatv("{0:x8}", LineTableEnd) + ")"); + break; + } + + // Check if the offset matches any of the sequence offsets.
+ auto It = + std::find_if(LineTable->Sequences.begin(), LineTable->Sequences.end(), + [SectionOffset](const auto &Sequence) { + return Sequence.StmtSeqOffset == *SectionOffset; + }); + + if (It == LineTable->Sequences.end()) + ReportError( + "Invalid DW_AT_LLVM_stmt_sequence offset", + "DW_AT_LLVM_stmt_sequence offset " + + llvm::formatv("{0:x8}", *SectionOffset) + + " does not point to a valid sequence offset in the line table"); + break; + } default: break; } diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp index 64f1bfc015380..e03932622b259 100644 --- a/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp @@ -107,10 +107,16 @@ LVScopeDispatch LVScope::Dispatch = { {LVScopeKind::IsTryBlock, &LVScope::getIsTryBlock}, {LVScopeKind::IsUnion, &LVScope::getIsUnion}}; -void LVScope::addToChildren(LVElement *Element) { - if (!Children) - Children = std::make_unique(); - Children->push_back(Element); +const LVTypes LVScope::EmptyTypes{}; +const LVSymbols LVScope::EmptySymbols{}; +const LVScopes LVScope::EmptyScopes{}; + +LVElements LVScope::getSortedChildren(LVSortFunction SortFunction) const { + const auto UnsortedChildren = getChildren(); + LVElements Elements{UnsortedChildren.begin(), UnsortedChildren.end()}; + if (SortFunction) + llvm::stable_sort(Elements, SortFunction); + return Elements; } void LVScope::addElement(LVElement *Element) { @@ -175,7 +181,6 @@ void LVScope::addElement(LVScope *Scope) { // Add it to parent. Scopes->push_back(Scope); - addToChildren(Scope); Scope->setParent(this); // Notify the reader about the new element being added. @@ -202,7 +207,6 @@ void LVScope::addElement(LVSymbol *Symbol) { // Add it to parent. Symbols->push_back(Symbol); - addToChildren(Symbol); Symbol->setParent(this); // Notify the reader about the new element being added. @@ -229,7 +233,6 @@ void LVScope::addElement(LVType *Type) { // Add it to parent. Types->push_back(Type); - addToChildren(Type); Type->setParent(this); // Notify the reader about the new element being added. @@ -277,15 +280,12 @@ bool LVScope::removeElement(LVElement *Element) { if (Element->getIsLine()) return RemoveElement(Lines); - if (RemoveElement(Children)) { - if (Element->getIsSymbol()) - return RemoveElement(Symbols); - if (Element->getIsType()) - return RemoveElement(Types); - if (Element->getIsScope()) - return RemoveElement(Scopes); - llvm_unreachable("Invalid element."); - } + if (Element->getIsSymbol()) + return RemoveElement(Symbols); + if (Element->getIsType()) + return RemoveElement(Types); + if (Element->getIsScope()) + return RemoveElement(Scopes); return false; } @@ -356,9 +356,8 @@ void LVScope::updateLevel(LVScope *Parent, bool Moved) { setLevel(Parent->getLevel() + 1); // Update the children. - if (Children) - for (LVElement *Element : *Children) - Element->updateLevel(this, Moved); + for (LVElement *Element : getChildren()) + Element->updateLevel(this, Moved); // Update any lines. if (Lines) @@ -374,13 +373,12 @@ void LVScope::resolve() { LVElement::resolve(); // Resolve the children. - if (Children) - for (LVElement *Element : *Children) { - if (getIsGlobalReference()) - // If the scope is a global reference, mark all its children as well. - Element->setIsGlobalReference(); - Element->resolve(); - } + for (LVElement *Element : getChildren()) { + if (getIsGlobalReference()) + // If the scope is a global reference, mark all its children as well. 
+ Element->setIsGlobalReference(); + Element->resolve(); + } } void LVScope::resolveName() { @@ -633,14 +631,13 @@ Error LVScope::doPrint(bool Split, bool Match, bool Print, raw_ostream &OS, options().getPrintFormatting() && getLevel() < options().getOutputLevel()) { // Print the children. - if (Children) - for (const LVElement *Element : *Children) { - if (Match && !Element->getHasPattern()) - continue; - if (Error Err = - Element->doPrint(Split, Match, Print, *StreamSplit, Full)) - return Err; - } + for (const LVElement *Element : getSortedChildren()) { + if (Match && !Element->getHasPattern()) + continue; + if (Error Err = + Element->doPrint(Split, Match, Print, *StreamSplit, Full)) + return Err; + } // Print the line records. if (Lines) @@ -692,7 +689,6 @@ void LVScope::sort() { Traverse(Parent->Symbols, SortFunction); Traverse(Parent->Scopes, SortFunction); Traverse(Parent->Ranges, compareRange); - Traverse(Parent->Children, SortFunction); if (Parent->Scopes) for (LVScope *Scope : *Parent->Scopes) @@ -978,9 +974,8 @@ bool LVScope::equals(const LVScopes *References, const LVScopes *Targets) { void LVScope::report(LVComparePass Pass) { getComparator().printItem(this, Pass); getComparator().push(this); - if (Children) - for (LVElement *Element : *Children) - Element->report(Pass); + for (LVElement *Element : getSortedChildren()) + Element->report(Pass); if (Lines) for (LVLine *Line : *Lines) @@ -1656,9 +1651,8 @@ void LVScopeCompileUnit::printMatchedElements(raw_ostream &OS, // Print the view for the matched scopes. for (const LVScope *Scope : MatchedScopes) { Scope->print(OS); - if (const LVElements *Elements = Scope->getChildren()) - for (LVElement *Element : *Elements) - Element->print(OS); + for (LVElement *Element : Scope->getSortedChildren()) + Element->print(OS); } } diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp index f29f2c7602fc6..5785505ce2b0c 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp @@ -27,6 +27,11 @@ namespace rootsig { char GenericRSMetadataError::ID; char InvalidRSMetadataFormat::ID; char InvalidRSMetadataValue::ID; +char TableSamplerMixinError::ID; +char ShaderRegisterOverflowError::ID; +char OffsetOverflowError::ID; +char OffsetAppendAfterOverflow::ID; + template char RootSignatureValidationError::ID; static std::optional extractMdIntValue(MDNode *Node, @@ -55,8 +60,9 @@ static std::optional extractMdStringValue(MDNode *Node, template && std::is_same_v, uint32_t>>> -Expected extractEnumValue(MDNode *Node, unsigned int OpId, StringRef ErrText, - llvm::function_ref VerifyFn) { +static Expected +extractEnumValue(MDNode *Node, unsigned int OpId, StringRef ErrText, + llvm::function_ref VerifyFn) { if (std::optional Val = extractMdIntValue(Node, OpId)) { if (!VerifyFn(*Val)) return make_error>(ErrText, *Val); @@ -538,6 +544,60 @@ Error MetadataParser::parseRootSignatureElement(mcdxbc::RootSignatureDesc &RSD, llvm_unreachable("Unhandled RootSignatureElementKind enum."); } +static Error +validateDescriptorTableSamplerMixin(const mcdxbc::DescriptorTable &Table, + uint32_t Location) { + dxil::ResourceClass CurrRC = dxil::ResourceClass::Sampler; + for (const mcdxbc::DescriptorRange &Range : Table.Ranges) { + if (Range.RangeType == dxil::ResourceClass::Sampler && + CurrRC != dxil::ResourceClass::Sampler) + return make_error(CurrRC, Location); + CurrRC = Range.RangeType; + } + return Error::success(); +} + +static Error 
+validateDescriptorTableRegisterOverflow(const mcdxbc::DescriptorTable &Table, + uint32_t Location) { + uint64_t Offset = 0; + bool IsPrevUnbound = false; + for (const mcdxbc::DescriptorRange &Range : Table.Ranges) { + // Validation of NumDescriptors should have happened by this point. + if (Range.NumDescriptors == 0) + continue; + + const uint64_t RangeBound = llvm::hlsl::rootsig::computeRangeBound( + Range.BaseShaderRegister, Range.NumDescriptors); + + if (!verifyNoOverflowedOffset(RangeBound)) + return make_error( + Range.RangeType, Range.BaseShaderRegister, Range.RegisterSpace); + + bool IsAppending = + Range.OffsetInDescriptorsFromTableStart == DescriptorTableOffsetAppend; + if (!IsAppending) + Offset = Range.OffsetInDescriptorsFromTableStart; + + if (IsPrevUnbound && IsAppending) + return make_error( + Range.RangeType, Range.BaseShaderRegister, Range.RegisterSpace); + + const uint64_t OffsetBound = + llvm::hlsl::rootsig::computeRangeBound(Offset, Range.NumDescriptors); + + if (!verifyNoOverflowedOffset(OffsetBound)) + return make_error( + Range.RangeType, Range.BaseShaderRegister, Range.RegisterSpace); + + Offset = OffsetBound + 1; + IsPrevUnbound = + Range.NumDescriptors == llvm::hlsl::rootsig::NumDescriptorsUnbounded; + } + + return Error::success(); +} + Error MetadataParser::validateRootSignature( const mcdxbc::RootSignatureDesc &RSD) { Error DeferredErrs = Error::success(); @@ -611,6 +671,14 @@ Error MetadataParser::validateRootSignature( joinErrors(std::move(DeferredErrs), make_error>( "DescriptorFlag", Range.Flags)); + + if (Error Err = + validateDescriptorTableSamplerMixin(Table, Info.Location)) + DeferredErrs = joinErrors(std::move(DeferredErrs), std::move(Err)); + + if (Error Err = + validateDescriptorTableRegisterOverflow(Table, Info.Location)) + DeferredErrs = joinErrors(std::move(DeferredErrs), std::move(Err)); } break; } @@ -651,6 +719,12 @@ Error MetadataParser::validateRootSignature( joinErrors(std::move(DeferredErrs), make_error>( "RegisterSpace", Sampler.RegisterSpace)); + + if (!hlsl::rootsig::verifyStaticSamplerFlags(RSD.Version, Sampler.Flags)) + DeferredErrs = + joinErrors(std::move(DeferredErrs), + make_error>( + "Static Sampler Flag", Sampler.Flags)); } return DeferredErrs; diff --git a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp index 0970977b5064f..2c78d622f7f28 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureValidations.cpp @@ -20,7 +20,9 @@ namespace rootsig { bool verifyRootFlag(uint32_t Flags) { return (Flags & ~0xfff) == 0; } -bool verifyVersion(uint32_t Version) { return (Version == 1 || Version == 2); } +bool verifyVersion(uint32_t Version) { + return (Version == 1 || Version == 2 || Version == 3); +} bool verifyRegisterValue(uint32_t RegisterValue) { return RegisterValue != ~0U; @@ -111,6 +113,25 @@ bool verifyDescriptorRangeFlag(uint32_t Version, dxil::ResourceClass Type, return (Flags & ~Mask) == FlagT::None; } +bool verifyStaticSamplerFlags(uint32_t Version, uint32_t FlagsNumber) { + uint32_t LargestValue = llvm::to_underlying( + dxbc::StaticSamplerFlags::LLVM_BITMASK_LARGEST_ENUMERATOR); + if (FlagsNumber >= NextPowerOf2(LargestValue)) + return false; + + dxbc::StaticSamplerFlags Flags = dxbc::StaticSamplerFlags(FlagsNumber); + if (Version <= 2) + return Flags == dxbc::StaticSamplerFlags::None; + + assert(Version == 3 && "Provided invalid root signature version"); + + dxbc::StaticSamplerFlags Mask = + 
dxbc::StaticSamplerFlags::NonNormalizedCoordinates | + dxbc::StaticSamplerFlags::UintBorderColor | + dxbc::StaticSamplerFlags::None; + return (Flags | Mask) == Mask; +} + bool verifyNumDescriptors(uint32_t NumDescriptors) { return NumDescriptors > 0; } @@ -125,22 +146,17 @@ bool verifyMaxAnisotropy(uint32_t MaxAnisotropy) { bool verifyLOD(float LOD) { return !std::isnan(LOD); } -bool verifyBoundOffset(uint32_t Offset) { - return Offset != NumDescriptorsUnbounded; -} - bool verifyNoOverflowedOffset(uint64_t Offset) { return Offset <= std::numeric_limits::max(); } -uint64_t computeRangeBound(uint32_t Offset, uint32_t Size) { +uint64_t computeRangeBound(uint64_t Offset, uint32_t Size) { assert(0 < Size && "Must be a non-empty range"); if (Size == NumDescriptorsUnbounded) return NumDescriptorsUnbounded; - return uint64_t(Offset) + uint64_t(Size) - 1; + return Offset + uint64_t(Size) - 1; } - } // namespace rootsig } // namespace hlsl } // namespace llvm diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp index cfddc06fbc00b..c4aa2c7638450 100644 --- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp +++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp @@ -8,19 +8,32 @@ #include "llvm/Frontend/Offloading/OffloadWrapper.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/Magic.h" #include "llvm/Frontend/Offloading/Utility.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" #include "llvm/Object/OffloadBinary.h" #include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LineIterator.h" +#include "llvm/Support/MemoryBufferRef.h" #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include +#include +#include + using namespace llvm; +using namespace llvm::object; using namespace llvm::offloading; namespace { @@ -620,6 +633,384 @@ void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc, // Add this function to constructors. appendToGlobalCtors(M, CtorFunc, /*Priority=*/101); } + +/// SYCLWrapper helper class that creates all LLVM IRs wrapping given images. +struct SYCLWrapper { + Module &M; + LLVMContext &C; + SYCLJITOptions Options; + + StructType *EntryTy = nullptr; + StructType *SyclDeviceImageTy = nullptr; + StructType *SyclBinDescTy = nullptr; + + SYCLWrapper(Module &M, const SYCLJITOptions &Options) + : M(M), C(M.getContext()), Options(Options) { + EntryTy = offloading::getEntryTy(M); + SyclDeviceImageTy = getSyclDeviceImageTy(); + SyclBinDescTy = getSyclBinDescTy(); + } + + IntegerType *getSizeTTy() { + switch (M.getDataLayout().getPointerSize()) { + case 4: + return Type::getInt32Ty(C); + case 8: + return Type::getInt64Ty(C); + } + llvm_unreachable("unsupported pointer type size"); + } + + SmallVector getSizetConstPair(size_t First, size_t Second) { + IntegerType *SizeTTy = getSizeTTy(); + return SmallVector{ConstantInt::get(SizeTTy, First), + ConstantInt::get(SizeTTy, Second)}; + } + + /// Note: Properties aren't supported and the support is going + /// to be added later. + /// Creates a structure corresponding to: + /// SYCL specific image descriptor type. 
+ /// \code + /// struct __sycl.tgt_device_image { + /// // version of this structure - for backward compatibility; + /// // all modifications which change order/type/offsets of existing fields + /// // should increment the version. + /// uint16_t Version; + /// // the kind of offload model the image employs. + /// uint8_t OffloadKind; + /// // format of the image data - SPIRV, LLVMIR bitcode, etc + /// uint8_t Format; + /// // null-terminated string representation of the device's target + /// // architecture + /// const char *Arch; + /// // a null-terminated string; target- and compiler-specific options + /// // which are suggested to use to "compile" program at runtime + /// const char *CompileOptions; + /// // a null-terminated string; target- and compiler-specific options + /// // which are suggested to use to "link" program at runtime + /// const char *LinkOptions; + /// // Pointer to the device binary image start + /// void *ImageStart; + /// // Pointer to the device binary image end + /// void *ImageEnd; + /// // the entry table + /// __tgt_offload_entry *EntriesBegin; + /// __tgt_offload_entry *EntriesEnd; + /// const char *PropertiesBegin; + /// const char *PropertiesEnd; + /// }; + /// \endcode + StructType *getSyclDeviceImageTy() { + return StructType::create( + { + Type::getInt16Ty(C), // Version + Type::getInt8Ty(C), // OffloadKind + Type::getInt8Ty(C), // Format + PointerType::getUnqual(C), // Arch + PointerType::getUnqual(C), // CompileOptions + PointerType::getUnqual(C), // LinkOptions + PointerType::getUnqual(C), // ImageStart + PointerType::getUnqual(C), // ImageEnd + PointerType::getUnqual(C), // EntriesBegin + PointerType::getUnqual(C), // EntriesEnd + PointerType::getUnqual(C), // PropertiesBegin + PointerType::getUnqual(C) // PropertiesEnd + }, + "__sycl.tgt_device_image"); + } + + /// Creates a structure for SYCL specific binary descriptor type. Corresponds + /// to: + /// + /// \code + /// struct __sycl.tgt_bin_desc { + /// // version of this structure - for backward compatibility; + /// // all modifications which change order/type/offsets of existing fields + /// // should increment the version. + /// uint16_t Version; + /// uint16_t NumDeviceImages; + /// __sycl.tgt_device_image *DeviceImages; + /// // the offload entry table + /// __tgt_offload_entry *HostEntriesBegin; + /// __tgt_offload_entry *HostEntriesEnd; + /// }; + /// \endcode + StructType *getSyclBinDescTy() { + return StructType::create( + {Type::getInt16Ty(C), Type::getInt16Ty(C), PointerType::getUnqual(C), + PointerType::getUnqual(C), PointerType::getUnqual(C)}, + "__sycl.tgt_bin_desc"); + } + + /// Adds a global readonly variable that is initialized by given + /// \p Initializer to the module. + GlobalVariable *addGlobalArrayVariable(const Twine &Name, + ArrayRef Initializer, + const Twine &Section = "") { + auto *Arr = ConstantDataArray::get(M.getContext(), Initializer); + auto *Var = new GlobalVariable(M, Arr->getType(), /*isConstant*/ true, + GlobalVariable::InternalLinkage, Arr, Name); + Var->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + SmallVector NameBuf; + auto SectionName = Section.toStringRef(NameBuf); + if (!SectionName.empty()) + Var->setSection(SectionName); + return Var; + } + + /// Adds given \p Buf as a global variable into the module. + /// \returns Pair of pointers that point at the beginning and the end of the + /// variable. 
+ std::pair + addArrayToModule(ArrayRef Buf, const Twine &Name, + const Twine &Section = "") { + auto *Var = addGlobalArrayVariable(Name, Buf, Section); + auto *ImageB = ConstantExpr::getGetElementPtr(Var->getValueType(), Var, + getSizetConstPair(0, 0)); + auto *ImageE = ConstantExpr::getGetElementPtr( + Var->getValueType(), Var, getSizetConstPair(0, Buf.size())); + return std::make_pair(ImageB, ImageE); + } + + /// Adds given \p Data as constant byte array in the module. + /// \returns Constant pointer to the added data. The pointer type does not + /// carry size information. + Constant *addRawDataToModule(ArrayRef Data, const Twine &Name) { + auto *Var = addGlobalArrayVariable(Name, Data); + auto *DataPtr = ConstantExpr::getGetElementPtr(Var->getValueType(), Var, + getSizetConstPair(0, 0)); + return DataPtr; + } + + /// Creates a global variable of const char* type and creates an + /// initializer that initializes it with \p Str. + /// + /// \returns Link-time constant pointer (constant expr) to that + /// variable. + Constant *addStringToModule(StringRef Str, const Twine &Name) { + auto *Arr = ConstantDataArray::getString(C, Str); + auto *Var = new GlobalVariable(M, Arr->getType(), /*isConstant*/ true, + GlobalVariable::InternalLinkage, Arr, Name); + Var->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + auto *Zero = ConstantInt::get(getSizeTTy(), 0); + Constant *ZeroZero[] = {Zero, Zero}; + return ConstantExpr::getGetElementPtr(Var->getValueType(), Var, ZeroZero); + } + + /// Each image contains its own set of symbols, which may contain different + /// symbols than other images. This function constructs an array of + /// symbol entries for a particular image. + /// + /// \returns Pointers to the beginning and end of the array. + std::pair + initOffloadEntriesPerImage(StringRef Entries, const Twine &OffloadKindTag) { + SmallVector EntriesInits; + std::unique_ptr MB = MemoryBuffer::getMemBuffer( + Entries, /*BufferName*/ "", /*RequiresNullTerminator*/ false); + for (line_iterator LI(*MB); !LI.is_at_eof(); ++LI) { + GlobalVariable *GV = + emitOffloadingEntry(M, /*Kind*/ OffloadKind::OFK_SYCL, + Constant::getNullValue(PointerType::getUnqual(C)), + /*Name*/ *LI, /*Size*/ 0, + /*Flags*/ 0, /*Data*/ 0); + EntriesInits.push_back(GV->getInitializer()); + } + + auto *Arr = ConstantArray::get(ArrayType::get(EntryTy, EntriesInits.size()), + EntriesInits); + auto *EntriesGV = new GlobalVariable(M, Arr->getType(), /*isConstant*/ true, + GlobalVariable::InternalLinkage, Arr, + OffloadKindTag + "entries_arr"); + + auto *EntriesB = ConstantExpr::getGetElementPtr( + EntriesGV->getValueType(), EntriesGV, getSizetConstPair(0, 0)); + auto *EntriesE = ConstantExpr::getGetElementPtr( + EntriesGV->getValueType(), EntriesGV, + getSizetConstPair(0, EntriesInits.size())); + return std::make_pair(EntriesB, EntriesE); + } + + Constant *wrapImage(const OffloadBinary &OB, const Twine &ImageID, + StringRef OffloadKindTag) { + // Note: Intel DPC++ compiler had 2 versions of this structure + // and clang++ has a third different structure. To avoid ABI incompatibility + // between generated device images the Version here starts from 3. 
+ constexpr uint16_t DeviceImageStructVersion = 3; + Constant *Version = + ConstantInt::get(Type::getInt16Ty(C), DeviceImageStructVersion); + Constant *OffloadKindConstant = ConstantInt::get( + Type::getInt8Ty(C), static_cast(OB.getOffloadKind())); + Constant *ImageKindConstant = ConstantInt::get( + Type::getInt8Ty(C), static_cast(OB.getImageKind())); + StringRef Triple = OB.getString("triple"); + Constant *TripleConstant = + addStringToModule(Triple, Twine(OffloadKindTag) + "target." + ImageID); + Constant *CompileOptions = + addStringToModule(Options.CompileOptions, + Twine(OffloadKindTag) + "opts.compile." + ImageID); + Constant *LinkOptions = addStringToModule( + Options.LinkOptions, Twine(OffloadKindTag) + "opts.link." + ImageID); + + // Note: NULL for now. + std::pair PropertiesConstants = { + Constant::getNullValue(PointerType::getUnqual(C)), + Constant::getNullValue(PointerType::getUnqual(C))}; + + StringRef RawImage = OB.getImage(); + std::pair Binary = addArrayToModule( + ArrayRef(RawImage.begin(), RawImage.end()), + Twine(OffloadKindTag) + ImageID + ".data", ".llvm.offloading"); + + // For SYCL images offload entries are defined here per image. + std::pair ImageEntriesPtrs = + initOffloadEntriesPerImage(OB.getString("symbols"), OffloadKindTag); + Constant *WrappedBinary = ConstantStruct::get( + SyclDeviceImageTy, Version, OffloadKindConstant, ImageKindConstant, + TripleConstant, CompileOptions, LinkOptions, Binary.first, + Binary.second, ImageEntriesPtrs.first, ImageEntriesPtrs.second, + PropertiesConstants.first, PropertiesConstants.second); + + return WrappedBinary; + } + + GlobalVariable *combineWrappedImages(ArrayRef WrappedImages, + StringRef OffloadKindTag) { + auto *ImagesData = ConstantArray::get( + ArrayType::get(SyclDeviceImageTy, WrappedImages.size()), WrappedImages); + auto *ImagesGV = + new GlobalVariable(M, ImagesData->getType(), /*isConstant*/ true, + GlobalValue::InternalLinkage, ImagesData, + Twine(OffloadKindTag) + "device_images"); + ImagesGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + auto *Zero = ConstantInt::get(getSizeTTy(), 0); + Constant *ZeroZero[] = {Zero, Zero}; + auto *ImagesB = ConstantExpr::getGetElementPtr(ImagesGV->getValueType(), + ImagesGV, ZeroZero); + + Constant *EntriesB = Constant::getNullValue(PointerType::getUnqual(C)); + Constant *EntriesE = Constant::getNullValue(PointerType::getUnqual(C)); + static constexpr uint16_t BinDescStructVersion = 1; + auto *DescInit = ConstantStruct::get( + SyclBinDescTy, + ConstantInt::get(Type::getInt16Ty(C), BinDescStructVersion), + ConstantInt::get(Type::getInt16Ty(C), WrappedImages.size()), ImagesB, + EntriesB, EntriesE); + + return new GlobalVariable(M, DescInit->getType(), /*isConstant*/ true, + GlobalValue::InternalLinkage, DescInit, + Twine(OffloadKindTag) + "descriptor"); + } + + /// Creates binary descriptor for the given device images. Binary descriptor + /// is an object that is passed to the offloading runtime at program startup + /// and it describes all device images available in the executable or shared + /// library. It is defined as follows: + /// + /// \code + /// __attribute__((visibility("hidden"))) + /// __tgt_offload_entry *__sycl_offload_entries_arr0[]; + /// ... + /// __attribute__((visibility("hidden"))) + /// __tgt_offload_entry *__sycl_offload_entries_arrN[]; + /// + /// __attribute__((visibility("hidden"))) + /// extern const char *CompileOptions = "..."; + /// ... + /// __attribute__((visibility("hidden"))) + /// extern const char *LinkOptions = "..."; + /// ... 
+ /// + /// static const char Image0[] = { ... }; + /// ... + /// static const char ImageN[] = { ... }; + /// + /// static const __sycl.tgt_device_image Images[] = { + /// { + /// Version, // Version + /// OffloadKind, // OffloadKind + /// Format, // Format of the image. + // TripleString, // Arch + /// CompileOptions, // CompileOptions + /// LinkOptions, // LinkOptions + /// Image0, // ImageStart + /// Image0 + IMAGE0_SIZE, // ImageEnd + /// __sycl_offload_entries_arr0, // EntriesBegin + /// __sycl_offload_entries_arr0 + ENTRIES0_SIZE, // EntriesEnd + /// NULL, // PropertiesBegin + /// NULL, // PropertiesEnd + /// }, + /// ... + /// }; + /// + /// static const __sycl.tgt_bin_desc FatbinDesc = { + /// Version, //Version + /// sizeof(Images) / sizeof(Images[0]), //NumDeviceImages + /// Images, //DeviceImages + /// NULL, //HostEntriesBegin + /// NULL //HostEntriesEnd + /// }; + /// \endcode + /// + /// \returns Global variable that represents FatbinDesc. + GlobalVariable *createFatbinDesc(ArrayRef OffloadFiles) { + StringRef OffloadKindTag = ".sycl_offloading."; + SmallVector WrappedImages; + WrappedImages.reserve(OffloadFiles.size()); + for (size_t I = 0, E = OffloadFiles.size(); I != E; ++I) + WrappedImages.push_back( + wrapImage(*OffloadFiles[I].getBinary(), Twine(I), OffloadKindTag)); + + return combineWrappedImages(WrappedImages, OffloadKindTag); + } + + void createRegisterFatbinFunction(GlobalVariable *FatbinDesc) { + auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, + Twine("sycl") + ".descriptor_reg", &M); + Func->setSection(".text.startup"); + + // Get RegFuncName function declaration. + auto *RegFuncTy = + FunctionType::get(Type::getVoidTy(C), PointerType::getUnqual(C), + /*isVarArg=*/false); + FunctionCallee RegFuncC = + M.getOrInsertFunction("__sycl_register_lib", RegFuncTy); + + // Construct function body + IRBuilder Builder(BasicBlock::Create(C, "entry", Func)); + Builder.CreateCall(RegFuncC, FatbinDesc); + Builder.CreateRetVoid(); + + // Add this function to constructors. + appendToGlobalCtors(M, Func, /*Priority*/ 1); + } + + void createUnregisterFunction(GlobalVariable *FatbinDesc) { + auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); + auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage, + "sycl.descriptor_unreg", &M); + Func->setSection(".text.startup"); + + // Get UnregFuncName function declaration. + auto *UnRegFuncTy = + FunctionType::get(Type::getVoidTy(C), PointerType::getUnqual(C), + /*isVarArg=*/false); + FunctionCallee UnRegFuncC = + M.getOrInsertFunction("__sycl_unregister_lib", UnRegFuncTy); + + // Construct function body + IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func)); + Builder.CreateCall(UnRegFuncC, FatbinDesc); + Builder.CreateRetVoid(); + + // Add this function to global destructors. 
+    appendToGlobalDtors(M, Func, /*Priority*/ 1);
+  }
+}; // end of SYCLWrapper
+
 } // namespace
 
 Error offloading::wrapOpenMPBinaries(Module &M, ArrayRef<ArrayRef<char>> Images,
@@ -660,3 +1051,22 @@ Error offloading::wrapHIPBinary(Module &M, ArrayRef<char> Image,
                                 EmitSurfacesAndTextures);
   return Error::success();
 }
+
+Error llvm::offloading::wrapSYCLBinaries(llvm::Module &M, ArrayRef<char> Buffer,
+                                         SYCLJITOptions Options) {
+  SYCLWrapper W(M, Options);
+  MemoryBufferRef MBR(StringRef(Buffer.begin(), Buffer.size()),
+                      /*Identifier*/ "");
+  SmallVector<OffloadFile> OffloadFiles;
+  if (Error E = extractOffloadBinaries(MBR, OffloadFiles))
+    return E;
+
+  GlobalVariable *Desc = W.createFatbinDesc(OffloadFiles);
+  if (!Desc)
+    return createStringError(inconvertibleErrorCode(),
+                             "No binary descriptors created.");
+
+  W.createRegisterFatbinFunction(Desc);
+  W.createUnregisterFunction(Desc);
+  return Error::success();
+}
diff --git a/llvm/lib/Frontend/Offloading/Utility.cpp b/llvm/lib/Frontend/Offloading/Utility.cpp
index 5dcc16d23004c..5000488a52f37 100644
--- a/llvm/lib/Frontend/Offloading/Utility.cpp
+++ b/llvm/lib/Frontend/Offloading/Utility.cpp
@@ -82,11 +82,11 @@ offloading::getOffloadingEntryInitializer(Module &M, object::OffloadKind Kind,
   return {EntryInitializer, Str};
 }
 
-void offloading::emitOffloadingEntry(Module &M, object::OffloadKind Kind,
-                                     Constant *Addr, StringRef Name,
-                                     uint64_t Size, uint32_t Flags,
-                                     uint64_t Data, Constant *AuxAddr,
-                                     StringRef SectionName) {
+GlobalVariable *
+offloading::emitOffloadingEntry(Module &M, object::OffloadKind Kind,
+                                Constant *Addr, StringRef Name, uint64_t Size,
+                                uint32_t Flags, uint64_t Data,
+                                Constant *AuxAddr, StringRef SectionName) {
   const llvm::Triple &Triple = M.getTargetTriple();
 
   auto [EntryInitializer, NameGV] = getOffloadingEntryInitializer(
@@ -106,6 +106,7 @@ void offloading::emitOffloadingEntry(Module &M, object::OffloadKind Kind,
   else
     Entry->setSection(SectionName);
   Entry->setAlignment(Align(object::OffloadBinary::getAlignment()));
+  return Entry;
 }
 
 std::pair<GlobalVariable *, GlobalVariable *>
@@ -423,9 +424,7 @@ Error offloading::intel::containerizeOpenMPSPIRVImage(
   Header.Class = ELF::ELFCLASS64;
   Header.Data = ELF::ELFDATA2LSB;
   Header.Type = ELF::ET_DYN;
-  // Use an existing Intel machine type as there is not one specifically for
-  // Intel GPUs.
-  Header.Machine = ELF::EM_IA_64;
+  Header.Machine = ELF::EM_INTELGT;
 
   // Create a section with notes.
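
Editor's note on the SYCL wrapper above: the emitted IR hands the binary descriptor to the runtime from a global constructor and withdraws it from a global destructor. The following is a minimal C++ sketch of that handshake, not the patch's code; the BinDesc layout and the registerLib/unregisterLib stand-ins are placeholders for the real __sycl.tgt_bin_desc structure and the __sycl_register_lib/__sycl_unregister_lib runtime entry points.

#include <cstdio>

// Placeholder for the descriptor the wrapper emits (__sycl.tgt_bin_desc).
struct BinDesc {
  unsigned short Version;
  unsigned short NumDeviceImages;
};

static BinDesc FatbinDesc{1, 0};

// Placeholders for __sycl_register_lib / __sycl_unregister_lib.
static void registerLib(BinDesc *Desc) {
  std::printf("register descriptor v%u with %u image(s)\n",
              static_cast<unsigned>(Desc->Version),
              static_cast<unsigned>(Desc->NumDeviceImages));
}
static void unregisterLib(BinDesc *) { std::printf("unregister descriptor\n"); }

// A static object whose constructor and destructor run at program startup and
// shutdown, mirroring the ctor/dtor pair the wrapper places in .text.startup.
struct DescriptorRegistrar {
  DescriptorRegistrar() { registerLib(&FatbinDesc); }
  ~DescriptorRegistrar() { unregisterLib(&FatbinDesc); }
};
static DescriptorRegistrar Registrar;

int main() { return 0; }
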
ELFYAML::NoteSection Section{}; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 220eee3cb8b08..5980ee35a5cd2 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -3507,6 +3507,8 @@ Expected OpenMPIRBuilder::createReductionFunction( return AfterIP.takeError(); if (!Builder.GetInsertBlock()) return ReductionFunc; + + Builder.restoreIP(*AfterIP); Builder.CreateStore(Reduced, LHSPtr); } } @@ -3751,6 +3753,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced); if (!AfterIP) return AfterIP.takeError(); + Builder.restoreIP(*AfterIP); Builder.CreateStore(Reduced, LHS, false); } } @@ -4976,7 +4979,7 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, - Function &LoopBodyFn) { + Function &LoopBodyFn, bool NoLoop) { Type *TripCountTy = TripCount->getType(); Module &M = OMPBuilder->M; IRBuilder<> &Builder = OMPBuilder->Builder; @@ -5004,8 +5007,10 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, RealArgs.push_back(ConstantInt::get(TripCountTy, 0)); if (LoopType == WorksharingLoopType::DistributeForStaticLoop) { RealArgs.push_back(ConstantInt::get(TripCountTy, 0)); + RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop)); + } else { + RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0)); } - RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0)); Builder.CreateCall(RTLFn, RealArgs); } @@ -5013,7 +5018,7 @@ static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, static void workshareLoopTargetCallback( OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector &ToBeDeleted, - WorksharingLoopType LoopType) { + WorksharingLoopType LoopType, bool NoLoop) { IRBuilder<> &Builder = OMPIRBuilder->Builder; BasicBlock *Preheader = CLI->getPreheader(); Value *TripCount = CLI->getTripCount(); @@ -5060,17 +5065,16 @@ static void workshareLoopTargetCallback( OutlinedFnCallInstruction->eraseFromParent(); createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident, - LoopBodyArg, TripCount, OutlinedFn); + LoopBodyArg, TripCount, OutlinedFn, NoLoop); for (auto &ToBeDeletedItem : ToBeDeleted) ToBeDeletedItem->eraseFromParent(); CLI->invalidate(); } -OpenMPIRBuilder::InsertPointTy -OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI, - InsertPointTy AllocaIP, - WorksharingLoopType LoopType) { +OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget( + DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, + WorksharingLoopType LoopType, bool NoLoop) { uint32_t SrcLocStrSize; Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize); Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); @@ -5153,7 +5157,7 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI, OI.PostOutlineCB = [=, ToBeDeletedVec = std::move(ToBeDeleted)](Function &OutlinedFn) { workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec, - LoopType); + LoopType, NoLoop); }; addOutlineInfo(std::move(OI)); return CLI->getAfterIP(); @@ -5164,9 +5168,9 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop( bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize, bool HasSimdModifier, 
bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, - WorksharingLoopType LoopType) { + WorksharingLoopType LoopType, bool NoLoop) { if (Config.isTargetDevice()) - return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType); + return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop); OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType( SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier, HasNonmonotonicModifier, HasOrderedClause); @@ -10033,12 +10037,16 @@ OpenMPIRBuilder::createDistribute(const LocationDescription &Loc, if (Error Err = BodyGenCB(AllocaIP, CodeGenIP)) return Err; - OutlineInfo OI; - OI.OuterAllocaBB = OuterAllocaIP.getBlock(); - OI.EntryBB = AllocaBB; - OI.ExitBB = ExitBB; + // When using target we use different runtime functions which require a + // callback. + if (Config.isTargetDevice()) { + OutlineInfo OI; + OI.OuterAllocaBB = OuterAllocaIP.getBlock(); + OI.EntryBB = AllocaBB; + OI.ExitBB = ExitBB; - addOutlineInfo(std::move(OI)); + addOutlineInfo(std::move(OI)); + } Builder.SetInsertPoint(ExitBB, ExitBB->begin()); return Builder.saveIP(); @@ -10305,17 +10313,19 @@ void OffloadEntriesInfoManager::getTargetRegionEntryFnName( TargetRegionEntryInfo OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, + vfs::FileSystem &VFS, StringRef ParentName) { sys::fs::UniqueID ID(0xdeadf17e, 0); auto FileIDInfo = CallBack(); uint64_t FileID = 0; - std::error_code EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID); - // If the inode ID could not be determined, create a hash value - // the current file name and use that as an ID. - if (EC) + if (ErrorOr Status = VFS.status(std::get<0>(FileIDInfo))) { + ID = Status->getUniqueID(); + FileID = Status->getUniqueID().getFile(); + } else { + // If the inode ID could not be determined, create a hash value + // the current file name and use that as an ID. FileID = hash_value(std::get<0>(FileIDInfo)); - else - FileID = ID.getFile(); + } return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID, std::get<1>(FileIDInfo)); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 60e86d4d2b5a5..1a518305cffbe 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -430,6 +430,15 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { CC_VLS_CASE(32768) CC_VLS_CASE(65536) #undef CC_VLS_CASE + case CallingConv::CHERIoT_CompartmentCall: + Out << "cheriot_compartmentcallcc"; + break; + case CallingConv::CHERIoT_CompartmentCallee: + Out << "cheriot_compartmentcalleecc"; + break; + case CallingConv::CHERIoT_LibraryCall: + Out << "cheriot_librarycallcc"; + break; } } diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp index 77f9b997a2ebf..49e1f898ca594 100644 --- a/llvm/lib/IR/DataLayout.cpp +++ b/llvm/lib/IR/DataLayout.cpp @@ -151,7 +151,8 @@ bool DataLayout::PointerSpec::operator==(const PointerSpec &Other) const { return AddrSpace == Other.AddrSpace && BitWidth == Other.BitWidth && ABIAlign == Other.ABIAlign && PrefAlign == Other.PrefAlign && IndexBitWidth == Other.IndexBitWidth && - IsNonIntegral == Other.IsNonIntegral; + HasUnstableRepresentation == Other.HasUnstableRepresentation && + HasExternalState == Other.HasExternalState; } namespace { @@ -194,7 +195,7 @@ constexpr DataLayout::PrimitiveSpec DefaultVectorSpecs[] = { // Default pointer type specifications. 
constexpr DataLayout::PointerSpec DefaultPointerSpecs[] = { // p0:64:64:64:64 - {0, 64, Align::Constant<8>(), Align::Constant<8>(), 64, false}, + {0, 64, Align::Constant<8>(), Align::Constant<8>(), 64, false, false}, }; DataLayout::DataLayout() @@ -405,9 +406,29 @@ Error DataLayout::parsePointerSpec(StringRef Spec) { // Address space. Optional, defaults to 0. unsigned AddrSpace = 0; - if (!Components[0].empty()) - if (Error Err = parseAddrSpace(Components[0], AddrSpace)) - return Err; + bool ExternalState = false; + bool UnstableRepr = false; + StringRef AddrSpaceStr = Components[0]; + while (!AddrSpaceStr.empty()) { + char C = AddrSpaceStr.front(); + if (C == 'e') { + ExternalState = true; + } else if (C == 'u') { + UnstableRepr = true; + } else if (isAlpha(C)) { + return createStringError("'%c' is not a valid pointer specification flag", + C); + } else { + break; // not a valid flag, remaining must be the address space number. + } + AddrSpaceStr = AddrSpaceStr.drop_front(1); + } + if (!AddrSpaceStr.empty()) + if (Error Err = parseAddrSpace(AddrSpaceStr, AddrSpace)) + return Err; // Failed to parse the remaining characters as a number + if (AddrSpace == 0 && (ExternalState || UnstableRepr)) + return createStringError( + "address space 0 cannot be unstable or have external state"); // Size. Required, cannot be zero. unsigned BitWidth; @@ -441,7 +462,7 @@ Error DataLayout::parsePointerSpec(StringRef Spec) { "index size cannot be larger than the pointer size"); setPointerSpec(AddrSpace, BitWidth, ABIAlign, PrefAlign, IndexBitWidth, - false); + UnstableRepr, ExternalState); return Error::success(); } @@ -617,7 +638,7 @@ Error DataLayout::parseLayoutString(StringRef LayoutString) { // the spec for AS0, and we then update that to mark it non-integral. const PointerSpec &PS = getPointerSpec(AS); setPointerSpec(AS, PS.BitWidth, PS.ABIAlign, PS.PrefAlign, PS.IndexBitWidth, - true); + /*HasUnstableRepr=*/true, /*HasExternalState=*/false); } return Error::success(); @@ -665,17 +686,20 @@ DataLayout::getPointerSpec(uint32_t AddrSpace) const { void DataLayout::setPointerSpec(uint32_t AddrSpace, uint32_t BitWidth, Align ABIAlign, Align PrefAlign, - uint32_t IndexBitWidth, bool IsNonIntegral) { + uint32_t IndexBitWidth, bool HasUnstableRepr, + bool HasExternalState) { auto I = lower_bound(PointerSpecs, AddrSpace, LessPointerAddrSpace()); if (I == PointerSpecs.end() || I->AddrSpace != AddrSpace) { PointerSpecs.insert(I, PointerSpec{AddrSpace, BitWidth, ABIAlign, PrefAlign, - IndexBitWidth, IsNonIntegral}); + IndexBitWidth, HasUnstableRepr, + HasExternalState}); } else { I->BitWidth = BitWidth; I->ABIAlign = ABIAlign; I->PrefAlign = PrefAlign; I->IndexBitWidth = IndexBitWidth; - I->IsNonIntegral = IsNonIntegral; + I->HasUnstableRepresentation = HasUnstableRepr; + I->HasExternalState = HasExternalState; } } diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 166521a276643..f9ded507f8328 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -375,6 +375,38 @@ bool DebugInfoFinder::addScope(DIScope *Scope) { return true; } +/// Recursively handle DILocations in followup metadata etc. +/// +/// TODO: If for example a followup loop metadata would refence itself this +/// function would go into infinite recursion. We do not expect such cycles in +/// the loop metadata (except for the self-referencing first element +/// "LoopID"). However, we could at least handle such situations more gracefully +/// somehow (e.g. by keeping track of visited nodes and dropping metadata). 
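
Editor's note on the DataLayout hunks above: the address-space component of a "p" specification may now be prefixed with the flag letters 'e' (external state) and 'u' (unstable representation), and address space 0 may carry neither flag. The following standalone sketch mirrors that parsing logic for illustration only; it is not the parsePointerSpec implementation, and the address-space number used in main is arbitrary.

#include <cctype>
#include <iostream>
#include <optional>
#include <string>

struct PointerFlags {
  bool ExternalState = false;
  bool UnstableRepr = false;
  unsigned AddrSpace = 0;
};

// Parse the leading flag letters, then the optional address-space number.
std::optional<PointerFlags> parseAddrSpaceComponent(const std::string &Spec) {
  PointerFlags P;
  size_t I = 0;
  for (; I < Spec.size(); ++I) {
    char C = Spec[I];
    if (C == 'e')
      P.ExternalState = true;
    else if (C == 'u')
      P.UnstableRepr = true;
    else if (std::isalpha(static_cast<unsigned char>(C)))
      return std::nullopt; // not a valid pointer specification flag
    else
      break; // the rest must be the address-space number
  }
  if (I < Spec.size())
    P.AddrSpace = static_cast<unsigned>(std::stoul(Spec.substr(I)));
  // Mirrors the new diagnostic: AS0 cannot be unstable or have external state.
  if (P.AddrSpace == 0 && (P.ExternalState || P.UnstableRepr))
    return std::nullopt;
  return P;
}

int main() {
  if (auto P = parseAddrSpaceComponent("eu271"))
    std::cout << "AS " << P->AddrSpace << " external=" << P->ExternalState
              << " unstable=" << P->UnstableRepr << "\n";
}
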
+static Metadata *updateLoopMetadataDebugLocationsRecursive( + Metadata *MetadataIn, function_ref Updater) { + const MDTuple *M = dyn_cast_or_null(MetadataIn); + // The loop metadata options should start with a MDString. + if (!M || M->getNumOperands() < 1 || !isa(M->getOperand(0))) + return MetadataIn; + + bool Updated = false; + SmallVector MDs{M->getOperand(0)}; + for (Metadata *MD : llvm::drop_begin(M->operands())) { + if (!MD) { + MDs.push_back(nullptr); + continue; + } + Metadata *NewMD = + Updater(updateLoopMetadataDebugLocationsRecursive(MD, Updater)); + if (NewMD) + MDs.push_back(NewMD); + Updated |= NewMD != MD; + } + + assert(!M->isDistinct() && "M should not be distinct."); + return Updated ? MDNode::get(M->getContext(), MDs) : MetadataIn; +} + static MDNode *updateLoopMetadataDebugLocationsImpl( MDNode *OrigLoopID, function_ref Updater) { assert(OrigLoopID && OrigLoopID->getNumOperands() > 0 && @@ -385,11 +417,11 @@ static MDNode *updateLoopMetadataDebugLocationsImpl( // Save space for the self-referential LoopID. SmallVector MDs = {nullptr}; - for (unsigned i = 1; i < OrigLoopID->getNumOperands(); ++i) { - Metadata *MD = OrigLoopID->getOperand(i); + for (Metadata *MD : llvm::drop_begin(OrigLoopID->operands())) { if (!MD) MDs.push_back(nullptr); - else if (Metadata *NewMD = Updater(MD)) + else if (Metadata *NewMD = Updater( + updateLoopMetadataDebugLocationsRecursive(MD, Updater))) MDs.push_back(NewMD); } diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index c1fafd759b5ab..a8bb34f69c629 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -865,7 +865,7 @@ const char *Instruction::getOpcodeName(unsigned OpCode) { bool Instruction::hasSameSpecialState(const Instruction *I2, bool IgnoreAlignment, bool IntersectAttrs) const { - auto I1 = this; + const auto *I1 = this; assert(I1->getOpcode() == I2->getOpcode() && "Can not compare special state of different instructions"); @@ -918,6 +918,8 @@ bool Instruction::hasSameSpecialState(const Instruction *I2, FI->getSyncScopeID() == cast(I2)->getSyncScopeID(); if (const AtomicCmpXchgInst *CXI = dyn_cast(I1)) return CXI->isVolatile() == cast(I2)->isVolatile() && + (CXI->getAlign() == cast(I2)->getAlign() || + IgnoreAlignment) && CXI->isWeak() == cast(I2)->isWeak() && CXI->getSuccessOrdering() == cast(I2)->getSuccessOrdering() && @@ -928,6 +930,8 @@ bool Instruction::hasSameSpecialState(const Instruction *I2, if (const AtomicRMWInst *RMWI = dyn_cast(I1)) return RMWI->getOperation() == cast(I2)->getOperation() && RMWI->isVolatile() == cast(I2)->isVolatile() && + (RMWI->getAlign() == cast(I2)->getAlign() || + IgnoreAlignment) && RMWI->getOrdering() == cast(I2)->getOrdering() && RMWI->getSyncScopeID() == cast(I2)->getSyncScopeID(); if (const ShuffleVectorInst *SVI = dyn_cast(I1)) diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index daebf447a2107..dd83168ab3c6e 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -2847,6 +2847,7 @@ unsigned CastInst::isEliminableCastPair( // FPTRUNC > FloatPt n/a FloatPt n/a // FPEXT < FloatPt n/a FloatPt n/a // PTRTOINT n/a Pointer n/a Integral Unsigned + // PTRTOADDR n/a Pointer n/a Integral Unsigned // INTTOPTR n/a Integral Unsigned Pointer n/a // BITCAST = FirstClass n/a FirstClass n/a // ADDRSPCST n/a Pointer n/a Pointer n/a @@ -2878,7 +2879,7 @@ unsigned CastInst::isEliminableCastPair( { 99,99,99, 2, 2,99,99, 8, 2,99,99,99, 4, 0}, // FPExt | { 1, 0, 0,99,99, 0, 0,99,99,99,99, 7, 3, 0}, // PtrToInt | { 1, 0, 
0,99,99, 0, 0,99,99,99,99, 0, 3, 0}, // PtrToAddr | - { 99,99,99,99,99,99,99,99,99,11,99,99,15, 0}, // IntToPtr | + { 99,99,99,99,99,99,99,99,99,11,11,99,15, 0}, // IntToPtr | { 5, 5, 5, 0, 0, 5, 5, 0, 0,16,16, 5, 1,14}, // BitCast | { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+ }; @@ -2972,7 +2973,8 @@ unsigned CastInst::isEliminableCastPair( // zext, sext -> zext, because sext can't sign extend after zext return Instruction::ZExt; case 11: { - // inttoptr, ptrtoint -> bitcast if SrcSize<=PtrSize and SrcSize==DstSize + // inttoptr, ptrtoint/ptrtoaddr -> bitcast if SrcSize<=PtrSize and + // SrcSize==DstSize if (!MidIntPtrTy) return 0; unsigned PtrSize = MidIntPtrTy->getScalarSizeInBits(); diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index 4d2e8fadff4f7..6797a100ff732 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -207,7 +207,6 @@ DecodeIITType(unsigned &NextElt, ArrayRef Infos, bool IsScalableVector = (LastInfo == IIT_SCALABLE_VEC); IIT_Info Info = IIT_Info(Infos[NextElt++]); - unsigned StructElts = 2; switch (Info) { case IIT_Done: @@ -390,28 +389,9 @@ DecodeIITType(unsigned &NextElt, ArrayRef Infos, case IIT_EMPTYSTRUCT: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct, 0)); return; - case IIT_STRUCT9: - ++StructElts; - [[fallthrough]]; - case IIT_STRUCT8: - ++StructElts; - [[fallthrough]]; - case IIT_STRUCT7: - ++StructElts; - [[fallthrough]]; - case IIT_STRUCT6: - ++StructElts; - [[fallthrough]]; - case IIT_STRUCT5: - ++StructElts; - [[fallthrough]]; - case IIT_STRUCT4: - ++StructElts; - [[fallthrough]]; - case IIT_STRUCT3: - ++StructElts; - [[fallthrough]]; - case IIT_STRUCT2: { + case IIT_STRUCT: { + unsigned StructElts = Infos[NextElt++] + 2; + OutputTable.push_back( IITDescriptor::get(IITDescriptor::Struct, StructElts)); diff --git a/llvm/lib/IR/LLVMContext.cpp b/llvm/lib/IR/LLVMContext.cpp index 57532cd491dd6..335c210c10e1a 100644 --- a/llvm/lib/IR/LLVMContext.cpp +++ b/llvm/lib/IR/LLVMContext.cpp @@ -53,6 +53,8 @@ static StringRef knownBundleName(unsigned BundleTagID) { return "kcfi"; case LLVMContext::OB_convergencectrl: return "convergencectrl"; + case LLVMContext::OB_align: + return "align"; default: llvm_unreachable("unknown bundle id"); } @@ -76,7 +78,7 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) { } for (unsigned BundleTagID = LLVMContext::OB_deopt; - BundleTagID <= LLVMContext::OB_convergencectrl; ++BundleTagID) { + BundleTagID <= LLVMContext::OB_LastBundleID; ++BundleTagID) { [[maybe_unused]] const auto *Entry = pImpl->getOrInsertBundleTag(knownBundleName(BundleTagID)); assert(Entry->second == BundleTagID && "operand bundle id drifted!"); diff --git a/llvm/lib/IR/LLVMRemarkStreamer.cpp b/llvm/lib/IR/LLVMRemarkStreamer.cpp index 71f8d4a4b1c7c..9e1e45998f2f1 100644 --- a/llvm/lib/IR/LLVMRemarkStreamer.cpp +++ b/llvm/lib/IR/LLVMRemarkStreamer.cpp @@ -92,7 +92,7 @@ char LLVMRemarkSetupFileError::ID = 0; char LLVMRemarkSetupPatternError::ID = 0; char LLVMRemarkSetupFormatError::ID = 0; -Expected> llvm::setupLLVMOptimizationRemarks( +Expected llvm::setupLLVMOptimizationRemarks( LLVMContext &Context, StringRef RemarksFilename, StringRef RemarksPasses, StringRef RemarksFormat, bool RemarksWithHotness, std::optional RemarksHotnessThreshold) { @@ -102,7 +102,7 @@ Expected> llvm::setupLLVMOptimizationRemarks( Context.setDiagnosticsHotnessThreshold(RemarksHotnessThreshold); if (RemarksFilename.empty()) - return nullptr; + return LLVMRemarkFileHandle(); Expected Format = 
remarks::parseFormat(RemarksFormat); if (Error E = Format.takeError()) @@ -119,24 +119,35 @@ Expected> llvm::setupLLVMOptimizationRemarks( return make_error(errorCodeToError(EC)); Expected> RemarkSerializer = - remarks::createRemarkSerializer( - *Format, remarks::SerializerMode::Separate, RemarksFile->os()); + remarks::createRemarkSerializer(*Format, RemarksFile->os()); if (Error E = RemarkSerializer.takeError()) return make_error(std::move(E)); - // Create the main remark streamer. - Context.setMainRemarkStreamer(std::make_unique( - std::move(*RemarkSerializer), RemarksFilename)); + auto RS = std::make_unique( + std::move(*RemarkSerializer), RemarksFilename); + + if (!RemarksPasses.empty()) + if (Error E = RS->setFilter(RemarksPasses)) { + RS->releaseSerializer(); + return make_error(std::move(E)); + } + + // Install the main remark streamer. Only install this after setting the + // filter, because this might fail. + Context.setMainRemarkStreamer(std::move(RS)); // Create LLVM's optimization remarks streamer. Context.setLLVMRemarkStreamer( std::make_unique(*Context.getMainRemarkStreamer())); - if (!RemarksPasses.empty()) - if (Error E = Context.getMainRemarkStreamer()->setFilter(RemarksPasses)) - return make_error(std::move(E)); + return LLVMRemarkFileHandle{std::move(RemarksFile), Context}; +} - return std::move(RemarksFile); +void LLVMRemarkFileHandle::Finalizer::finalize() { + if (!Context) + return; + finalizeLLVMOptimizationRemarks(*Context); + Context = nullptr; } Error llvm::setupLLVMOptimizationRemarks( @@ -153,22 +164,34 @@ Error llvm::setupLLVMOptimizationRemarks( return make_error(std::move(E)); Expected> RemarkSerializer = - remarks::createRemarkSerializer(*Format, - remarks::SerializerMode::Separate, OS); + remarks::createRemarkSerializer(*Format, OS); if (Error E = RemarkSerializer.takeError()) return make_error(std::move(E)); - // Create the main remark streamer. - Context.setMainRemarkStreamer( - std::make_unique(std::move(*RemarkSerializer))); + auto RS = + std::make_unique(std::move(*RemarkSerializer)); + + if (!RemarksPasses.empty()) + if (Error E = RS->setFilter(RemarksPasses)) { + RS->releaseSerializer(); + return make_error(std::move(E)); + } + + // Install the main remark streamer. Only install this after setting the + // filter, because this might fail. + Context.setMainRemarkStreamer(std::move(RS)); // Create LLVM's optimization remarks streamer. 
Context.setLLVMRemarkStreamer( std::make_unique(*Context.getMainRemarkStreamer())); - if (!RemarksPasses.empty()) - if (Error E = Context.getMainRemarkStreamer()->setFilter(RemarksPasses)) - return make_error(std::move(E)); - return Error::success(); } + +void llvm::finalizeLLVMOptimizationRemarks(LLVMContext &Context) { + Context.setLLVMRemarkStreamer(nullptr); + if (auto *RS = Context.getMainRemarkStreamer()) { + RS->releaseSerializer(); + Context.setMainRemarkStreamer(nullptr); + } +} diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 9bde965d660a4..8c03d6f809d50 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -480,6 +480,7 @@ class Verifier : public InstVisitor, VerifierSupport { visitModuleFlags(); visitModuleIdents(); visitModuleCommandLines(); + visitModuleErrnoTBAA(); verifyCompileUnits(); @@ -516,6 +517,7 @@ class Verifier : public InstVisitor, VerifierSupport { void visitComdat(const Comdat &C); void visitModuleIdents(); void visitModuleCommandLines(); + void visitModuleErrnoTBAA(); void visitModuleFlags(); void visitModuleFlag(const MDNode *Op, DenseMap &SeenIDs, @@ -1815,6 +1817,18 @@ void Verifier::visitModuleCommandLines() { } } +void Verifier::visitModuleErrnoTBAA() { + const NamedMDNode *ErrnoTBAA = M.getNamedMetadata("llvm.errno.tbaa"); + if (!ErrnoTBAA) + return; + + Check(ErrnoTBAA->getNumOperands() >= 1, + "llvm.errno.tbaa must have at least one operand", ErrnoTBAA); + + for (const MDNode *N : ErrnoTBAA->operands()) + TBAAVerifyHelper.visitTBAAMetadata(nullptr, N); +} + void Verifier::visitModuleFlags() { const NamedMDNode *Flags = M.getModuleFlagsMetadata(); if (!Flags) return; @@ -5537,7 +5551,7 @@ void Verifier::visitInstruction(Instruction &I) { visitNofreeMetadata(I, MD); if (MDNode *TBAA = I.getMetadata(LLVMContext::MD_tbaa)) - TBAAVerifyHelper.visitTBAAMetadata(I, TBAA); + TBAAVerifyHelper.visitTBAAMetadata(&I, TBAA); if (MDNode *MD = I.getMetadata(LLVMContext::MD_noalias)) visitAliasScopeListMetadata(MD); @@ -5675,6 +5689,11 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { default: break; case Intrinsic::assume: { + if (Call.hasOperandBundles()) { + auto *Cond = dyn_cast(Call.getArgOperand(0)); + Check(Cond && Cond->isOne(), + "assume with operand bundles must have i1 true condition", Call); + } for (auto &Elem : Call.bundle_op_infos()) { unsigned ArgCount = Elem.End - Elem.Begin; // Separate storage assumptions are special insofar as they're the only @@ -5850,9 +5869,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { break; } case Intrinsic::call_preallocated_setup: { - auto *NumArgs = dyn_cast(Call.getArgOperand(0)); - Check(NumArgs != nullptr, - "llvm.call.preallocated.setup argument must be a constant"); + auto *NumArgs = cast(Call.getArgOperand(0)); bool FoundCall = false; for (User *U : Call.users()) { auto *UseCall = dyn_cast(U); @@ -7655,10 +7672,10 @@ template void TBAAVerifier::CheckFailed(Tys &&... Args) { /// TBAA scheme. This means \p BaseNode is either a scalar node, or a /// struct-type node describing an aggregate data structure (like a struct). 
TBAAVerifier::TBAABaseNodeSummary -TBAAVerifier::verifyTBAABaseNode(Instruction &I, const MDNode *BaseNode, +TBAAVerifier::verifyTBAABaseNode(const Instruction *I, const MDNode *BaseNode, bool IsNewFormat) { if (BaseNode->getNumOperands() < 2) { - CheckFailed("Base nodes must have at least two operands", &I, BaseNode); + CheckFailed("Base nodes must have at least two operands", I, BaseNode); return {true, ~0u}; } @@ -7674,8 +7691,8 @@ TBAAVerifier::verifyTBAABaseNode(Instruction &I, const MDNode *BaseNode, } TBAAVerifier::TBAABaseNodeSummary -TBAAVerifier::verifyTBAABaseNodeImpl(Instruction &I, const MDNode *BaseNode, - bool IsNewFormat) { +TBAAVerifier::verifyTBAABaseNodeImpl(const Instruction *I, + const MDNode *BaseNode, bool IsNewFormat) { const TBAAVerifier::TBAABaseNodeSummary InvalidNode = {true, ~0u}; if (BaseNode->getNumOperands() == 2) { @@ -7704,7 +7721,7 @@ TBAAVerifier::verifyTBAABaseNodeImpl(Instruction &I, const MDNode *BaseNode, auto *TypeSizeNode = mdconst::dyn_extract_or_null( BaseNode->getOperand(1)); if (!TypeSizeNode) { - CheckFailed("Type size nodes must be constants!", &I, BaseNode); + CheckFailed("Type size nodes must be constants!", I, BaseNode); return InvalidNode; } } @@ -7730,7 +7747,7 @@ TBAAVerifier::verifyTBAABaseNodeImpl(Instruction &I, const MDNode *BaseNode, const MDOperand &FieldTy = BaseNode->getOperand(Idx); const MDOperand &FieldOffset = BaseNode->getOperand(Idx + 1); if (!isa(FieldTy)) { - CheckFailed("Incorrect field entry in struct type node!", &I, BaseNode); + CheckFailed("Incorrect field entry in struct type node!", I, BaseNode); Failed = true; continue; } @@ -7738,7 +7755,7 @@ TBAAVerifier::verifyTBAABaseNodeImpl(Instruction &I, const MDNode *BaseNode, auto *OffsetEntryCI = mdconst::dyn_extract_or_null(FieldOffset); if (!OffsetEntryCI) { - CheckFailed("Offset entries must be constants!", &I, BaseNode); + CheckFailed("Offset entries must be constants!", I, BaseNode); Failed = true; continue; } @@ -7748,7 +7765,7 @@ TBAAVerifier::verifyTBAABaseNodeImpl(Instruction &I, const MDNode *BaseNode, if (OffsetEntryCI->getBitWidth() != BitWidth) { CheckFailed( - "Bitwidth between the offsets and struct type entries must match", &I, + "Bitwidth between the offsets and struct type entries must match", I, BaseNode); Failed = true; continue; @@ -7763,7 +7780,7 @@ TBAAVerifier::verifyTBAABaseNodeImpl(Instruction &I, const MDNode *BaseNode, !PrevOffset || PrevOffset->ule(OffsetEntryCI->getValue()); if (!IsAscending) { - CheckFailed("Offsets must be increasing!", &I, BaseNode); + CheckFailed("Offsets must be increasing!", I, BaseNode); Failed = true; } @@ -7773,7 +7790,7 @@ TBAAVerifier::verifyTBAABaseNodeImpl(Instruction &I, const MDNode *BaseNode, auto *MemberSizeNode = mdconst::dyn_extract_or_null( BaseNode->getOperand(Idx + 2)); if (!MemberSizeNode) { - CheckFailed("Member size entries must be constants!", &I, BaseNode); + CheckFailed("Member size entries must be constants!", I, BaseNode); Failed = true; continue; } @@ -7825,7 +7842,7 @@ bool TBAAVerifier::isValidScalarTBAANode(const MDNode *MD) { /// Offset in place to be the offset within the field node returned. /// /// We assume we've okayed \p BaseNode via \c verifyTBAABaseNode. 
-MDNode *TBAAVerifier::getFieldNodeFromTBAABaseNode(Instruction &I, +MDNode *TBAAVerifier::getFieldNodeFromTBAABaseNode(const Instruction *I, const MDNode *BaseNode, APInt &Offset, bool IsNewFormat) { @@ -7845,7 +7862,7 @@ MDNode *TBAAVerifier::getFieldNodeFromTBAABaseNode(Instruction &I, mdconst::extract(BaseNode->getOperand(Idx + 1)); if (OffsetEntryCI->getValue().ugt(Offset)) { if (Idx == FirstFieldOpNo) { - CheckFailed("Could not find TBAA parent in struct type node", &I, + CheckFailed("Could not find TBAA parent in struct type node", I, BaseNode, &Offset); return nullptr; } @@ -7874,21 +7891,22 @@ static bool isNewFormatTBAATypeNode(llvm::MDNode *Type) { return isa_and_nonnull(Type->getOperand(0)); } -bool TBAAVerifier::visitTBAAMetadata(Instruction &I, const MDNode *MD) { - CheckTBAA(MD->getNumOperands() > 0, "TBAA metadata cannot have 0 operands", - &I, MD); +bool TBAAVerifier::visitTBAAMetadata(const Instruction *I, const MDNode *MD) { + CheckTBAA(MD->getNumOperands() > 0, "TBAA metadata cannot have 0 operands", I, + MD); - CheckTBAA(isa(I) || isa(I) || isa(I) || - isa(I) || isa(I) || - isa(I), - "This instruction shall not have a TBAA access tag!", &I); + if (I) + CheckTBAA(isa(I) || isa(I) || isa(I) || + isa(I) || isa(I) || + isa(I), + "This instruction shall not have a TBAA access tag!", I); bool IsStructPathTBAA = isa(MD->getOperand(0)) && MD->getNumOperands() >= 3; CheckTBAA(IsStructPathTBAA, "Old-style TBAA is no longer allowed, use struct-path TBAA instead", - &I); + I); MDNode *BaseNode = dyn_cast_or_null(MD->getOperand(0)); MDNode *AccessType = dyn_cast_or_null(MD->getOperand(1)); @@ -7897,17 +7915,17 @@ bool TBAAVerifier::visitTBAAMetadata(Instruction &I, const MDNode *MD) { if (IsNewFormat) { CheckTBAA(MD->getNumOperands() == 4 || MD->getNumOperands() == 5, - "Access tag metadata must have either 4 or 5 operands", &I, MD); + "Access tag metadata must have either 4 or 5 operands", I, MD); } else { CheckTBAA(MD->getNumOperands() < 5, - "Struct tag metadata must have either 3 or 4 operands", &I, MD); + "Struct tag metadata must have either 3 or 4 operands", I, MD); } // Check the access size field. if (IsNewFormat) { auto *AccessSizeNode = mdconst::dyn_extract_or_null( MD->getOperand(3)); - CheckTBAA(AccessSizeNode, "Access size field must be a constant", &I, MD); + CheckTBAA(AccessSizeNode, "Access size field must be a constant", I, MD); } // Check the immutability flag. 
@@ -7916,27 +7934,27 @@ bool TBAAVerifier::visitTBAAMetadata(Instruction &I, const MDNode *MD) { auto *IsImmutableCI = mdconst::dyn_extract_or_null( MD->getOperand(ImmutabilityFlagOpNo)); CheckTBAA(IsImmutableCI, - "Immutability tag on struct tag metadata must be a constant", &I, + "Immutability tag on struct tag metadata must be a constant", I, MD); CheckTBAA( IsImmutableCI->isZero() || IsImmutableCI->isOne(), - "Immutability part of the struct tag metadata must be either 0 or 1", - &I, MD); + "Immutability part of the struct tag metadata must be either 0 or 1", I, + MD); } CheckTBAA(BaseNode && AccessType, "Malformed struct tag metadata: base and access-type " "should be non-null and point to Metadata nodes", - &I, MD, BaseNode, AccessType); + I, MD, BaseNode, AccessType); if (!IsNewFormat) { CheckTBAA(isValidScalarTBAANode(AccessType), - "Access type node must be a valid scalar type", &I, MD, + "Access type node must be a valid scalar type", I, MD, AccessType); } auto *OffsetCI = mdconst::dyn_extract_or_null(MD->getOperand(2)); - CheckTBAA(OffsetCI, "Offset must be constant integer", &I, MD); + CheckTBAA(OffsetCI, "Offset must be constant integer", I, MD); APInt Offset = OffsetCI->getValue(); bool SeenAccessTypeInPath = false; @@ -7944,17 +7962,17 @@ bool TBAAVerifier::visitTBAAMetadata(Instruction &I, const MDNode *MD) { SmallPtrSet StructPath; for (/* empty */; BaseNode && !IsRootTBAANode(BaseNode); - BaseNode = getFieldNodeFromTBAABaseNode(I, BaseNode, Offset, - IsNewFormat)) { + BaseNode = + getFieldNodeFromTBAABaseNode(I, BaseNode, Offset, IsNewFormat)) { if (!StructPath.insert(BaseNode).second) { - CheckFailed("Cycle detected in struct path", &I, MD); + CheckFailed("Cycle detected in struct path", I, MD); return false; } bool Invalid; unsigned BaseNodeBitWidth; - std::tie(Invalid, BaseNodeBitWidth) = verifyTBAABaseNode(I, BaseNode, - IsNewFormat); + std::tie(Invalid, BaseNodeBitWidth) = + verifyTBAABaseNode(I, BaseNode, IsNewFormat); // If the base node is invalid in itself, then we've already printed all the // errors we wanted to print. @@ -7964,20 +7982,20 @@ bool TBAAVerifier::visitTBAAMetadata(Instruction &I, const MDNode *MD) { SeenAccessTypeInPath |= BaseNode == AccessType; if (isValidScalarTBAANode(BaseNode) || BaseNode == AccessType) - CheckTBAA(Offset == 0, "Offset not zero at the point of scalar access", - &I, MD, &Offset); + CheckTBAA(Offset == 0, "Offset not zero at the point of scalar access", I, + MD, &Offset); CheckTBAA(BaseNodeBitWidth == Offset.getBitWidth() || (BaseNodeBitWidth == 0 && Offset == 0) || (IsNewFormat && BaseNodeBitWidth == ~0u), - "Access bit-width not the same as description bit-width", &I, MD, + "Access bit-width not the same as description bit-width", I, MD, BaseNodeBitWidth, Offset.getBitWidth()); if (IsNewFormat && SeenAccessTypeInPath) break; } - CheckTBAA(SeenAccessTypeInPath, "Did not see access type in access path!", &I, + CheckTBAA(SeenAccessTypeInPath, "Did not see access type in access path!", I, MD); return true; } diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index ce9ecc35e1922..7b252627d73f9 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1290,11 +1290,11 @@ void lto::updateMemProfAttributes(Module &Mod, Error LTO::runRegularLTO(AddStreamFn AddStream) { llvm::TimeTraceScope timeScope("Run regular LTO"); + LLVMContext &CombinedCtx = RegularLTO.CombinedModule->getContext(); // Setup optimization remarks. 
auto DiagFileOrErr = lto::setupLLVMOptimizationRemarks( - RegularLTO.CombinedModule->getContext(), Conf.RemarksFilename, - Conf.RemarksPasses, Conf.RemarksFormat, Conf.RemarksWithHotness, - Conf.RemarksHotnessThreshold); + CombinedCtx, Conf.RemarksFilename, Conf.RemarksPasses, Conf.RemarksFormat, + Conf.RemarksWithHotness, Conf.RemarksHotnessThreshold); LLVM_DEBUG(dbgs() << "Running regular LTO\n"); if (!DiagFileOrErr) return DiagFileOrErr.takeError(); @@ -2177,7 +2177,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, return RunBackends(SecondRoundLTO.get()); } -Expected> lto::setupLLVMOptimizationRemarks( +Expected lto::setupLLVMOptimizationRemarks( LLVMContext &Context, StringRef RemarksFilename, StringRef RemarksPasses, StringRef RemarksFormat, bool RemarksWithHotness, std::optional RemarksHotnessThreshold, int Count) { diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp index ce42fc526beac..11a7b3221bec9 100644 --- a/llvm/lib/LTO/LTOBackend.cpp +++ b/llvm/lib/LTO/LTOBackend.cpp @@ -240,27 +240,26 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM, unsigned OptLevel, bool IsThinLTO, ModuleSummaryIndex *ExportSummary, const ModuleSummaryIndex *ImportSummary) { - auto FS = vfs::getRealFileSystem(); std::optional PGOOpt; if (!Conf.SampleProfile.empty()) PGOOpt = PGOOptions(Conf.SampleProfile, "", Conf.ProfileRemapping, - /*MemoryProfile=*/"", FS, PGOOptions::SampleUse, + /*MemoryProfile=*/"", PGOOptions::SampleUse, PGOOptions::NoCSAction, PGOOptions::ColdFuncOpt::Default, true); else if (Conf.RunCSIRInstr) { PGOOpt = PGOOptions("", Conf.CSIRProfile, Conf.ProfileRemapping, - /*MemoryProfile=*/"", FS, PGOOptions::IRUse, + /*MemoryProfile=*/"", PGOOptions::IRUse, PGOOptions::CSIRInstr, PGOOptions::ColdFuncOpt::Default, Conf.AddFSDiscriminator); } else if (!Conf.CSIRProfile.empty()) { - PGOOpt = PGOOptions(Conf.CSIRProfile, "", Conf.ProfileRemapping, - /*MemoryProfile=*/"", FS, PGOOptions::IRUse, - PGOOptions::CSIRUse, PGOOptions::ColdFuncOpt::Default, - Conf.AddFSDiscriminator); + PGOOpt = + PGOOptions(Conf.CSIRProfile, "", Conf.ProfileRemapping, + /*MemoryProfile=*/"", PGOOptions::IRUse, PGOOptions::CSIRUse, + PGOOptions::ColdFuncOpt::Default, Conf.AddFSDiscriminator); NoPGOWarnMismatch = !Conf.PGOWarnMismatch; } else if (Conf.AddFSDiscriminator) { - PGOOpt = PGOOptions("", "", "", /*MemoryProfile=*/"", nullptr, - PGOOptions::NoAction, PGOOptions::NoCSAction, + PGOOpt = PGOOptions("", "", "", /*MemoryProfile=*/"", PGOOptions::NoAction, + PGOOptions::NoCSAction, PGOOptions::ColdFuncOpt::Default, true); } TM->setPGOOption(PGOOpt); @@ -540,12 +539,12 @@ static Expected initAndLookupTarget(const Config &C, return T; } -Error lto::finalizeOptimizationRemarks( - std::unique_ptr DiagOutputFile) { +Error lto::finalizeOptimizationRemarks(LLVMRemarkFileHandle DiagOutputFile) { // Make sure we flush the diagnostic remarks file in case the linker doesn't // call the global destructors before exiting. if (!DiagOutputFile) return Error::success(); + DiagOutputFile.finalize(); DiagOutputFile->keep(); DiagOutputFile->os().flush(); return Error::success(); @@ -640,7 +639,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream, auto OptimizeAndCodegen = [&](Module &Mod, TargetMachine *TM, - std::unique_ptr DiagnosticOutputFile) { + LLVMRemarkFileHandle DiagnosticOutputFile) { // Perform optimization and code generation for ThinLTO. 
if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true, /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex, diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index d8a96f73110fd..8aa404da15286 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -545,6 +545,7 @@ void LTOCodeGenerator::finishOptimizationRemarks() { if (DiagnosticOutputFile) { DiagnosticOutputFile->keep(); // FIXME: LTOCodeGenerator dtor is not invoked on Darwin + DiagnosticOutputFile.finalize(); DiagnosticOutputFile->os().flush(); } } diff --git a/llvm/lib/MC/DXContainerRootSignature.cpp b/llvm/lib/MC/DXContainerRootSignature.cpp index b9ebb7a9e789c..713aa3d8143e8 100644 --- a/llvm/lib/MC/DXContainerRootSignature.cpp +++ b/llvm/lib/MC/DXContainerRootSignature.cpp @@ -23,19 +23,20 @@ static uint32_t writePlaceholder(raw_svector_ostream &Stream) { static uint32_t rewriteOffsetToCurrentByte(raw_svector_ostream &Stream, uint32_t Offset) { uint32_t ByteOffset = Stream.tell(); - uint32_t Value = - support::endian::byte_swap( - ByteOffset); + uint32_t Value = support::endian::byte_swap( + ByteOffset, llvm::endianness::little); Stream.pwrite(reinterpret_cast(&Value), sizeof(Value), Offset); return ByteOffset; } size_t RootSignatureDesc::getSize() const { uint32_t StaticSamplersOffset = computeStaticSamplersOffset(); - size_t StaticSamplersSize = - StaticSamplers.size() * sizeof(dxbc::RTS0::v1::StaticSampler); + size_t StaticSamplersSize = sizeof(dxbc::RTS0::v1::StaticSampler); + if (Version > 2) + StaticSamplersSize = sizeof(dxbc::RTS0::v3::StaticSampler); - return size_t(StaticSamplersOffset) + StaticSamplersSize; + return size_t(StaticSamplersOffset) + + (StaticSamplersSize * StaticSamplers.size()); } uint32_t RootSignatureDesc::computeRootParametersOffset() const { @@ -171,6 +172,9 @@ void RootSignatureDesc::write(raw_ostream &OS) const { support::endian::write(BOS, S.ShaderRegister, llvm::endianness::little); support::endian::write(BOS, S.RegisterSpace, llvm::endianness::little); support::endian::write(BOS, S.ShaderVisibility, llvm::endianness::little); + + if (Version > 2) + support::endian::write(BOS, S.Flags, llvm::endianness::little); } assert(Storage.size() == getSize()); OS.write(Storage.data(), Storage.size()); diff --git a/llvm/lib/MC/MCCodeEmitter.cpp b/llvm/lib/MC/MCCodeEmitter.cpp index 0d114f12d58c5..76a8406cee7bd 100644 --- a/llvm/lib/MC/MCCodeEmitter.cpp +++ b/llvm/lib/MC/MCCodeEmitter.cpp @@ -7,9 +7,28 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include using namespace llvm; MCCodeEmitter::MCCodeEmitter() = default; MCCodeEmitter::~MCCodeEmitter() = default; + +void MCCodeEmitter::reportUnsupportedInst(const MCInst &Inst) { + std::string Msg; + raw_string_ostream OS(Msg); + OS << "Unsupported instruction : " << Inst; + reportFatalInternalError(Msg.c_str()); +} + +void MCCodeEmitter::reportUnsupportedOperand(const MCInst &Inst, + unsigned OpNum) { + std::string Msg; + raw_string_ostream OS(Msg); + OS << "Unsupported instruction operand : \"" << Inst << "\"[" << OpNum << "]"; + reportFatalInternalError(Msg.c_str()); +} diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index a0cd09b11d8de..a755c22ab879a 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -776,10 +776,18 @@ void 
MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) { ".debug_loc.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ); + DwarfLoclistsDWOSection = Ctx->getCOFFSection( + ".debug_loclists.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ); DwarfStrOffDWOSection = Ctx->getCOFFSection( ".debug_str_offsets.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ); + DwarfRnglistsDWOSection = Ctx->getCOFFSection( + ".debug_rnglists.dwo", COFF::IMAGE_SCN_MEM_DISCARDABLE | + COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | + COFF::IMAGE_SCN_MEM_READ); DwarfAddrSection = Ctx->getCOFFSection( ".debug_addr", COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | diff --git a/llvm/lib/MC/MCSFrame.cpp b/llvm/lib/MC/MCSFrame.cpp index 066d1a34e1548..d6fa54c087ca3 100644 --- a/llvm/lib/MC/MCSFrame.cpp +++ b/llvm/lib/MC/MCSFrame.cpp @@ -111,6 +111,8 @@ struct SFrameFDE { MCFragment *Frag; // Unwinding fres SmallVector FREs; + // .cfi_remember_state stack + SmallVector SaveState; SFrameFDE(const MCDwarfFrameInfo &DF, MCSymbol *FRES) : DFrame(DF), FREStart(FRES), Frag(nullptr) {} @@ -198,7 +200,7 @@ class SFrameEmitterImpl { return false; } - bool setCFAOffset(SFrameFRE &FRE, const SMLoc &Loc, size_t Offset) { + bool setCFAOffset(SFrameFRE &FRE, SMLoc Loc, size_t Offset) { if (!FRE.CFARegSet) { Streamer.getContext().reportWarning( Loc, "adjusting CFA offset without a base register. " @@ -237,13 +239,30 @@ class SFrameEmitterImpl { case MCCFIInstruction::OpAdjustCfaOffset: return setCFAOffset(FRE, CFI.getLoc(), FRE.CFAOffset + CFI.getOffset()); case MCCFIInstruction::OpRememberState: - // TODO: Implement. Will use FDE. + if (FDE.FREs.size() == 1) { + // Error for gas compatibility: If the initial FRE isn't complete, + // then any state is incomplete. FIXME: Dwarf doesn't error here. + // Why should sframe? + Streamer.getContext().reportWarning( + CFI.getLoc(), "skipping SFrame FDE; .cfi_remember_state without " + "prior SFrame FRE state"); + return false; + } + FDE.SaveState.push_back(FRE); return true; case MCCFIInstruction::OpRestore: - // TODO: Implement. Will use FDE. + // The first FRE generated has the original state. + if (CFI.getRegister() == FPReg) + FRE.FPOffset = FDE.FREs.front().FPOffset; + else if (CFI.getRegister() == RAReg) + FRE.RAOffset = FDE.FREs.front().RAOffset; return true; case MCCFIInstruction::OpRestoreState: - // TODO: Implement. Will use FDE. + // The cfi parser will have caught unbalanced directives earlier, so a + // mismatch here is an implementation error. + assert(!FDE.SaveState.empty() && + "cfi_restore_state without cfi_save_state"); + FRE = FDE.SaveState.pop_back_val(); return true; case MCCFIInstruction::OpEscape: // TODO: Implement. Will use FDE. @@ -394,8 +413,8 @@ class SFrameEmitterImpl { // shf_fdeoff. With no sfh_auxhdr, these immediately follow this header. Streamer.emitInt32(0); // shf_freoff - Streamer.emitAbsoluteSymbolDiff(FRESubSectionStart, FDESubSectionStart, - sizeof(uint32_t)); + Streamer.emitInt32(FDEs.size() * + sizeof(sframe::FuncDescEntry)); } void emitFDEs() { diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp index 6fc0889afc6a8..a11259748b9cc 100644 --- a/llvm/lib/Object/ArchiveWriter.cpp +++ b/llvm/lib/Object/ArchiveWriter.cpp @@ -1119,10 +1119,26 @@ Error writeArchiveToStream(raw_ostream &Out, // to switch to 64-bit. 
Note that the file can be larger than 4GB as long as // the last member starts before the 4GB offset. if (*HeadersSize + LastMemberHeaderOffset >= Sym64Threshold) { - if (Kind == object::Archive::K_DARWIN) + switch (Kind) { + case object::Archive::K_COFF: + // COFF format has no 64-bit version, so we use GNU64 instead. + if (!SymMap.Map.empty() && !SymMap.ECMap.empty()) + // Only the COFF format supports the ECSYMBOLS section, so don’t use + // GNU64 when two symbol maps are required. + return make_error( + "Archive is too large: ARM64X does not support archives larger " + "than 4GB"); + // Since this changes the headers, we need to recalculate everything. + return writeArchiveToStream(Out, NewMembers, WriteSymtab, + object::Archive::K_GNU64, Deterministic, + Thin, IsEC, Warn); + case object::Archive::K_DARWIN: Kind = object::Archive::K_DARWIN64; - else + break; + default: Kind = object::Archive::K_GNU64; + break; + } HeadersSize.reset(); } } diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 031b9414f4c1a..7b7b8d88c63fc 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -276,10 +276,13 @@ Error DirectX::RootSignature::parse() { RootParametersOffset, NumParameters * sizeof(dxbc::RTS0::v1::RootParameterHeader)); - StaticSamplers.Stride = sizeof(dxbc::RTS0::v1::StaticSampler); - StaticSamplers.Data = PartData.substr( - StaticSamplersOffset, - NumStaticSamplers * sizeof(dxbc::RTS0::v1::StaticSampler)); + StaticSamplers.Stride = (Version <= 2) + ? sizeof(dxbc::RTS0::v1::StaticSampler) + : sizeof(dxbc::RTS0::v3::StaticSampler); + + StaticSamplers.Data = PartData.substr(StaticSamplersOffset, + static_cast(NumStaticSamplers) * + StaticSamplers.Stride); return Error::success(); } diff --git a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp index 910383816f43b..b00e45d912be1 100644 --- a/llvm/lib/ObjectYAML/DXContainerEmitter.cpp +++ b/llvm/lib/ObjectYAML/DXContainerEmitter.cpp @@ -343,6 +343,9 @@ Error DXContainerWriter::writeParts(raw_ostream &OS) { NewSampler.RegisterSpace = Param.RegisterSpace; NewSampler.ShaderVisibility = Param.ShaderVisibility; + if (RS.Version > 2) + NewSampler.Flags = Param.getEncodedFlags(); + RS.StaticSamplers.push_back(NewSampler); } diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 22674b1ceb734..3c09ae4e5f2bc 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -209,6 +209,11 @@ DXContainerYAML::RootSignatureYamlDesc::create( NewS.RegisterSpace = S.RegisterSpace; NewS.ShaderVisibility = dxbc::ShaderVisibility(S.ShaderVisibility); + if (Version > 2) { +#define STATIC_SAMPLER_FLAG(Num, Enum, Flag) \ + NewS.Enum = (S.Flags & llvm::to_underlying(dxbc::StaticSamplerFlags::Enum)); +#include "llvm/BinaryFormat/DXContainerConstants.def" + } RootSigDesc.StaticSamplers.push_back(NewS); } @@ -245,6 +250,15 @@ uint32_t DXContainerYAML::DescriptorRangeYaml::getEncodedFlags() const { return Flags; } +uint32_t DXContainerYAML::StaticSamplerYamlDesc::getEncodedFlags() const { + uint64_t Flags = 0; +#define STATIC_SAMPLER_FLAG(Num, Enum, Flag) \ + if (Enum) \ + Flags |= (uint32_t)dxbc::StaticSamplerFlags::Enum; +#include "llvm/BinaryFormat/DXContainerConstants.def" + return Flags; +} + uint64_t DXContainerYAML::ShaderFeatureFlags::getEncodedFlags() { uint64_t Flag = 0; #define SHADER_FEATURE_FLAG(Num, DxilModuleNum, Val, Str) \ @@ -512,6 +526,9 @@ void MappingTraits::mapping( 
IO.mapRequired("ShaderRegister", S.ShaderRegister); IO.mapRequired("RegisterSpace", S.RegisterSpace); IO.mapRequired("ShaderVisibility", S.ShaderVisibility); +#define STATIC_SAMPLER_FLAG(Num, Enum, Flag) \ + IO.mapOptional(#Flag, S.Enum, false); +#include "llvm/BinaryFormat/DXContainerConstants.def" } void MappingTraits::mapping(IO &IO, diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 0228dedb3fbfa..d9cce1eb7641d 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -359,6 +359,7 @@ void ScalarEnumerationTraits::enumeration( ECase(EM_VE); ECase(EM_CSKY); ECase(EM_LOONGARCH); + ECase(EM_INTELGT); #undef ECase IO.enumFallback(Value); } diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index e4dab4acc0b4a..c234623caecf9 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -375,7 +375,6 @@ #include "llvm/Transforms/Utils/SymbolRewriter.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include "llvm/Transforms/Utils/UnifyLoopExits.h" -#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h" #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" @@ -504,8 +503,9 @@ static Expected parseOptLevelParam(StringRef S) { PassBuilder::PassBuilder(TargetMachine *TM, PipelineTuningOptions PTO, std::optional PGOOpt, - PassInstrumentationCallbacks *PIC) - : TM(TM), PTO(PTO), PGOOpt(PGOOpt), PIC(PIC) { + PassInstrumentationCallbacks *PIC, + IntrusiveRefCntPtr FS) + : TM(TM), PTO(PTO), PGOOpt(PGOOpt), PIC(PIC), FS(std::move(FS)) { if (TM) TM->registerPassBuilderCallbacks(*this); if (PIC) { diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 30c6f06be139d..256cf9d4cd1ce 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -848,8 +848,7 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, OptimizationLevel Level, bool RunProfileGen, bool IsCS, bool AtomicCounterUpdate, std::string ProfileFile, - std::string ProfileRemappingFile, - IntrusiveRefCntPtr FS) { + std::string ProfileRemappingFile) { assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); if (!RunProfileGen) { @@ -884,10 +883,11 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, MPM.addPass(InstrProfilingLoweringPass(Options, IsCS)); } -void PassBuilder::addPGOInstrPassesForO0( - ModulePassManager &MPM, bool RunProfileGen, bool IsCS, - bool AtomicCounterUpdate, std::string ProfileFile, - std::string ProfileRemappingFile, IntrusiveRefCntPtr FS) { +void PassBuilder::addPGOInstrPassesForO0(ModulePassManager &MPM, + bool RunProfileGen, bool IsCS, + bool AtomicCounterUpdate, + std::string ProfileFile, + std::string ProfileRemappingFile) { if (!RunProfileGen) { assert(!ProfileFile.empty() && "Profile use expecting a profile file!"); MPM.addPass( @@ -1133,8 +1133,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, if (LoadSampleProfile) { // Annotate sample profile right after early FPM to ensure freshness of // the debug info. 
- MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, - PGOOpt->ProfileRemappingFile, Phase)); + MPM.addPass(SampleProfileLoaderPass( + PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, Phase, FS)); // Cache ProfileSummaryAnalysis once to avoid the potential need to insert // RequireAnalysisPass for PSI before subsequent non-module passes. MPM.addPass(RequireAnalysisPass()); @@ -1230,8 +1230,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/IsPGOInstrGen, /*IsCS=*/false, PGOOpt->AtomicCounterUpdate, - PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, - PGOOpt->FS); + PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile); } else if (IsCtxProfGen || IsCtxProfUse) { MPM.addPass(PGOInstrumentationGen(PGOInstrumentationType::CTXPROF)); // In pre-link, we just want the instrumented IR. We use the contextual @@ -1254,10 +1253,10 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, addPostPGOLoopRotation(MPM, Level); MPM.addPass(PGOCtxProfLoweringPass()); } else if (IsColdFuncOnlyInstrGen) { - addPGOInstrPasses( - MPM, Level, /* RunProfileGen */ true, /* IsCS */ false, - /* AtomicCounterUpdate */ false, InstrumentColdFuncOnlyPath, - /* ProfileRemappingFile */ "", IntrusiveRefCntPtr()); + addPGOInstrPasses(MPM, Level, /* RunProfileGen */ true, /* IsCS */ false, + /* AtomicCounterUpdate */ false, + InstrumentColdFuncOnlyPath, + /* ProfileRemappingFile */ ""); } if (IsPGOInstrGen || IsPGOInstrUse || IsCtxProfGen) @@ -1268,7 +1267,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, EnableSampledInstr)); if (IsMemprofUse) - MPM.addPass(MemProfUsePass(PGOOpt->MemoryProfile, PGOOpt->FS)); + MPM.addPass(MemProfUsePass(PGOOpt->MemoryProfile, FS)); if (PGOOpt && (PGOOpt->Action == PGOOptions::IRUse || PGOOpt->Action == PGOOptions::SampleUse)) @@ -1477,13 +1476,11 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, if (PGOOpt->CSAction == PGOOptions::CSIRInstr) addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true, /*IsCS=*/true, PGOOpt->AtomicCounterUpdate, - PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile, - PGOOpt->FS); + PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile); else if (PGOOpt->CSAction == PGOOptions::CSIRUse) addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false, /*IsCS=*/true, PGOOpt->AtomicCounterUpdate, - PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, - PGOOpt->FS); + PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile); } // Re-compute GlobalsAA here prior to function passes. This is particularly @@ -1831,6 +1828,7 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline( // in ICP. MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, lowertypetests::DropTestKind::Assume)); + MPM.addPass(buildCoroWrapper(ThinOrFullLTOPhase::ThinLTOPostLink)); // Drop available_externally and unreferenced globals. This is necessary // with ThinLTO in order to avoid leaving undefined references to dead // globals in the object file. 
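
Editor's note on the PassBuilder hunks above: the virtual file system moves out of PGOOptions and into the PassBuilder itself. The sketch below shows construction under the new five-argument constructor as it appears in this patch; it is an assumption-laden illustration (default arguments and headers may differ), not a verified snippet.

#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/VirtualFileSystem.h"
#include <optional>

using namespace llvm;

void buildPipelines(TargetMachine *TM) {
  PipelineTuningOptions PTO;
  // PGOOptions no longer carries the file system; the PassBuilder does.
  PassBuilder PB(TM, PTO, /*PGOOpt=*/std::nullopt,
                 /*PIC=*/nullptr, vfs::getRealFileSystem());
  // ... register analysis managers and build a pipeline as usual ...
}
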
@@ -2070,13 +2068,11 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, if (PGOOpt->CSAction == PGOOptions::CSIRInstr) addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true, /*IsCS=*/true, PGOOpt->AtomicCounterUpdate, - PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile, - PGOOpt->FS); + PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile); else if (PGOOpt->CSAction == PGOOptions::CSIRUse) addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false, /*IsCS=*/true, PGOOpt->AtomicCounterUpdate, - PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, - PGOOpt->FS); + PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile); } // Break up allocas @@ -2236,7 +2232,7 @@ PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, MPM, /*RunProfileGen=*/(PGOOpt->Action == PGOOptions::IRInstr), /*IsCS=*/false, PGOOpt->AtomicCounterUpdate, PGOOpt->ProfileFile, - PGOOpt->ProfileRemappingFile, PGOOpt->FS); + PGOOpt->ProfileRemappingFile); // Instrument function entry and exit before all inlining. MPM.addPass(createModuleToFunctionPassAdaptor( diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 49d5d08474f0f..f0e7d36f78aab 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -755,7 +755,6 @@ LOOP_ANALYSIS("should-run-extra-simple-loop-unswitch", #endif LOOP_PASS("canon-freeze", CanonicalizeFreezeInLoopsPass()) LOOP_PASS("dot-ddg", DDGDotPrinterPass()) -LOOP_PASS("evl-iv-simplify", EVLIndVarSimplifyPass()) LOOP_PASS("guard-widening", GuardWideningPass()) LOOP_PASS("extra-simple-loop-unswitch-passes", ExtraLoopPassManager()) diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp index fc2577e6ada5d..075ad8d7aec8b 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp @@ -949,9 +949,9 @@ loadTestingFormat(StringRef Data, StringRef CompilationDir) { if (Data.size() < sizeof(uint64_t)) return make_error(coveragemap_error::malformed, "the size of data is too small"); - auto TestingVersion = - support::endian::byte_swap( - *reinterpret_cast(Data.data())); + auto TestingVersion = support::endian::byte_swap( + *reinterpret_cast(Data.data()), + llvm::endianness::little); Data = Data.substr(sizeof(uint64_t)); // Read the ProfileNames data. @@ -1274,9 +1274,9 @@ BinaryCoverageReader::create( std::vector> Readers; if (ObjectBuffer.getBuffer().size() > sizeof(TestingFormatMagic)) { - uint64_t Magic = - support::endian::byte_swap( - *reinterpret_cast(ObjectBuffer.getBufferStart())); + uint64_t Magic = support::endian::byte_swap( + *reinterpret_cast(ObjectBuffer.getBufferStart()), + llvm::endianness::little); if (Magic == TestingFormatMagic) { // This is a special format used for testing. auto ReaderOrErr = diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp index 12b1687af69db..3875f01c48528 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp @@ -292,7 +292,7 @@ void CoverageMappingWriter::write(raw_ostream &OS) { void TestingFormatWriter::write(raw_ostream &OS, TestingFormatVersion Version) { auto ByteSwap = [](uint64_t N) { - return support::endian::byte_swap(N); + return support::endian::byte_swap(N, llvm::endianness::little); }; // Output a 64bit magic number. 
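
Editor's note on the byte-swap call sites above and below: the patch replaces the template form byte_swap<T, endianness>(Value) with a value-parameter form, byte_swap(Value, endianness::little). The following self-contained sketch shows what such a helper does (swap only when the requested endianness differs from the host's); it is an illustration, not LLVM's actual implementation.

#include <bit>
#include <cstdint>
#include <iostream>

enum class endianness { big, little };

// Reverse the byte order of a 64-bit value.
static uint64_t byteSwap64(uint64_t V) {
  uint64_t R = 0;
  for (int I = 0; I < 8; ++I)
    R = (R << 8) | ((V >> (8 * I)) & 0xff);
  return R;
}

// Value-parameter form: the desired endianness is a runtime argument.
static uint64_t byte_swap(uint64_t V, endianness E) {
  bool HostIsLittle = std::endian::native == std::endian::little;
  bool WantLittle = (E == endianness::little);
  return HostIsLittle == WantLittle ? V : byteSwap64(V);
}

int main() {
  // On a little-endian host this prints 807060504030201.
  std::cout << std::hex << byte_swap(0x0102030405060708ULL, endianness::big)
            << "\n";
}
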
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index e1c6315853b3b..3c8e44a18f533 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -292,7 +292,7 @@ void ProfOStream::patch(ArrayRef P) { for (const auto &K : P) { for (int I = 0, E = K.D.size(); I != E; I++) { uint64_t Bytes = - endian::byte_swap(K.D[I]); + endian::byte_swap(K.D[I], llvm::endianness::little); Data.replace(K.Pos + I * sizeof(uint64_t), sizeof(uint64_t), (const char *)&Bytes, sizeof(uint64_t)); } diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index 886add7131da2..d2ae4b5226ff6 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -1171,8 +1171,8 @@ bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) { if (DataBuffer.getBufferSize() < 8) return false; - uint64_t Magic = endian::read( - DataBuffer.getBufferStart()); + uint64_t Magic = endian::read(DataBuffer.getBufferStart(), + llvm::endianness::little); // Verify that it's magical. return Magic == IndexedInstrProf::Magic; } @@ -1186,10 +1186,10 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version, if (Version >= IndexedInstrProf::Version4) { const IndexedInstrProf::Summary *SummaryInLE = reinterpret_cast(Cur); - uint64_t NFields = endian::byte_swap( - SummaryInLE->NumSummaryFields); - uint64_t NEntries = endian::byte_swap( - SummaryInLE->NumCutoffEntries); + uint64_t NFields = endian::byte_swap( + SummaryInLE->NumSummaryFields, llvm::endianness::little); + uint64_t NEntries = endian::byte_swap( + SummaryInLE->NumCutoffEntries, llvm::endianness::little); uint32_t SummarySize = IndexedInstrProf::Summary::getSize(NFields, NEntries); std::unique_ptr SummaryData = @@ -1198,7 +1198,7 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version, const uint64_t *Src = reinterpret_cast(SummaryInLE); uint64_t *Dst = reinterpret_cast(SummaryData.get()); for (unsigned I = 0; I < SummarySize / sizeof(uint64_t); I++) - Dst[I] = endian::byte_swap(Src[I]); + Dst[I] = endian::byte_swap(Src[I], llvm::endianness::little); SummaryEntryVector DetailedSummary; for (unsigned I = 0; I < SummaryData->NumCutoffEntries; I++) { @@ -1598,8 +1598,8 @@ Error IndexedInstrProfReader::getFunctionBitmap(StringRef FuncName, std::memset(W, 0, sizeof(W)); std::memcpy(W, &BitmapBytes[I], N); I += N; - return support::endian::read(W); + return support::endian::read( + W, llvm::endianness::little); }, Bitmap, Bitmap); assert(I == E); diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index 81ae792e70b99..766c0814ca067 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -1290,8 +1290,8 @@ SampleProfileReaderExtBinaryBase::readNameTableSec(bool IsMD5, NameTable.reserve(*Size); for (size_t I = 0; I < *Size; ++I) { using namespace support; - uint64_t FID = endian::read( - Data + I * sizeof(uint64_t)); + uint64_t FID = endian::read( + Data + I * sizeof(uint64_t), endianness::little); NameTable.emplace_back(FunctionId(FID)); } if (!ProfileIsCS) diff --git a/llvm/lib/Remarks/BitstreamRemarkParser.cpp b/llvm/lib/Remarks/BitstreamRemarkParser.cpp index d40b40dfb2ba0..33eedd6042c37 100644 --- a/llvm/lib/Remarks/BitstreamRemarkParser.cpp +++ b/llvm/lib/Remarks/BitstreamRemarkParser.cpp @@ -197,14 +197,9 @@ Error BitstreamRemarkParserHelper::parseNext() { Loc.reset(); Args.clear(); - if (Error 
E = expectBlock()) - return E; return parseBlock(); } -BitstreamParserHelper::BitstreamParserHelper(StringRef Buffer) - : Stream(Buffer) {} - Error BitstreamParserHelper::expectMagic() { std::array Result; for (unsigned I = 0; I < 4; ++I) @@ -244,14 +239,57 @@ Error BitstreamParserHelper::parseBlockInfoBlock() { return Error::success(); } -Error BitstreamParserHelper::advanceToMetaBlock() { +Error BitstreamParserHelper::parseMeta() { if (Error E = expectMagic()) return E; if (Error E = parseBlockInfoBlock()) return E; + + // Parse early meta block. + if (Error E = MetaHelper.expectBlock()) + return E; + if (Error E = MetaHelper.parseBlock()) + return E; + + // Skip all Remarks blocks. + while (!Stream.AtEndOfStream()) { + auto MaybeBlockID = expectSubBlock(Stream); + if (!MaybeBlockID) + return MaybeBlockID.takeError(); + if (*MaybeBlockID == META_BLOCK_ID) + break; + if (*MaybeBlockID != REMARK_BLOCK_ID) + return error("Unexpected block between meta blocks."); + // Remember first remark block. + if (!RemarkStartBitPos) + RemarkStartBitPos = Stream.GetCurrentBitNo(); + if (Error E = Stream.SkipBlock()) + return E; + } + + // Late meta block is optional if there are no remarks. + if (Stream.AtEndOfStream()) + return Error::success(); + + // Parse late meta block. + if (Error E = MetaHelper.parseBlock()) + return E; return Error::success(); } +Error BitstreamParserHelper::parseRemark() { + if (RemarkStartBitPos) { + RemarkStartBitPos.reset(); + } else { + auto MaybeBlockID = expectSubBlock(Stream); + if (!MaybeBlockID) + return MaybeBlockID.takeError(); + if (*MaybeBlockID != REMARK_BLOCK_ID) + return make_error(); + } + return RemarksHelper->parseNext(); +} + Expected> remarks::createBitstreamParserFromMeta( StringRef Buf, std::optional ExternalFilePrependPath) { @@ -263,45 +301,52 @@ remarks::createBitstreamParserFromMeta( return std::move(Parser); } +BitstreamRemarkParser::BitstreamRemarkParser(StringRef Buf) + : RemarkParser(Format::Bitstream), ParserHelper(Buf) {} + Expected> BitstreamRemarkParser::next() { - if (ParserHelper.atEndOfStream()) - return make_error(); + if (!IsMetaReady) { + // Container is completely empty. + if (ParserHelper->Stream.AtEndOfStream()) + return make_error(); - if (!ReadyToParseRemarks) { if (Error E = parseMeta()) return std::move(E); - ReadyToParseRemarks = true; + IsMetaReady = true; + + // Container has meta, but no remarks blocks. 
+ if (!ParserHelper->RemarkStartBitPos) + return error( + "Container is non-empty, but does not contain any remarks blocks."); + + if (Error E = + ParserHelper->Stream.JumpToBit(*ParserHelper->RemarkStartBitPos)) + return std::move(E); + ParserHelper->RemarksHelper.emplace(ParserHelper->Stream); } - return parseRemark(); + if (Error E = ParserHelper->parseRemark()) + return std::move(E); + return processRemark(); } Error BitstreamRemarkParser::parseMeta() { - if (Error E = ParserHelper.advanceToMetaBlock()) - return E; - - BitstreamMetaParserHelper MetaHelper(ParserHelper.Stream); - if (Error E = MetaHelper.expectBlock()) + if (Error E = ParserHelper->parseMeta()) return E; - if (Error E = MetaHelper.parseBlock()) - return E; - - if (Error E = processCommonMeta(MetaHelper)) + if (Error E = processCommonMeta()) return E; switch (ContainerType) { - case BitstreamRemarkContainerType::Standalone: - return processStandaloneMeta(MetaHelper); - case BitstreamRemarkContainerType::SeparateRemarksFile: - return processSeparateRemarksFileMeta(MetaHelper); - case BitstreamRemarkContainerType::SeparateRemarksMeta: - return processSeparateRemarksMetaMeta(MetaHelper); + case BitstreamRemarkContainerType::RemarksFileExternal: + return processExternalFilePath(); + case BitstreamRemarkContainerType::RemarksFile: + return processFileContainerMeta(); } llvm_unreachable("Unknown BitstreamRemarkContainerType enum"); } -Error BitstreamRemarkParser::processCommonMeta( - BitstreamMetaParserHelper &Helper) { +Error BitstreamRemarkParser::processCommonMeta() { + auto &Helper = ParserHelper->MetaHelper; if (!Helper.Container) return Helper.error("Missing container info."); auto &Container = *Helper.Container; @@ -313,7 +358,16 @@ Error BitstreamRemarkParser::processCommonMeta( return Error::success(); } -Error BitstreamRemarkParser::processStrTab(BitstreamMetaParserHelper &Helper) { +Error BitstreamRemarkParser::processFileContainerMeta() { + if (Error E = processRemarkVersion()) + return E; + if (Error E = processStrTab()) + return E; + return Error::success(); +} + +Error BitstreamRemarkParser::processStrTab() { + auto &Helper = ParserHelper->MetaHelper; if (!Helper.StrTabBuf) return Helper.error("Missing string table."); // Parse and assign the string table. @@ -321,26 +375,25 @@ Error BitstreamRemarkParser::processStrTab(BitstreamMetaParserHelper &Helper) { return Error::success(); } -Error BitstreamRemarkParser::processRemarkVersion( - BitstreamMetaParserHelper &Helper) { +Error BitstreamRemarkParser::processRemarkVersion() { + auto &Helper = ParserHelper->MetaHelper; if (!Helper.RemarkVersion) return Helper.error("Missing remark version."); RemarkVersion = *Helper.RemarkVersion; return Error::success(); } -Error BitstreamRemarkParser::processExternalFilePath( - BitstreamMetaParserHelper &Helper) { +Error BitstreamRemarkParser::processExternalFilePath() { + auto &Helper = ParserHelper->MetaHelper; if (!Helper.ExternalFilePath) return Helper.error("Missing external file path."); - StringRef ExternalFilePath = *Helper.ExternalFilePath; SmallString<80> FullPath(ExternalFilePrependPath); - sys::path::append(FullPath, ExternalFilePath); + sys::path::append(FullPath, *Helper.ExternalFilePath); // External file: open the external file, parse it, check if its metadata - // matches the one from the separate metadata, then replace the current parser - // with the one parsing the remarks. + // matches the one from the separate metadata, then replace the current + // parser with the one parsing the remarks. 
ErrorOr> BufferOrErr = MemoryBuffer::getFile(FullPath); if (std::error_code EC = BufferOrErr.getError()) @@ -353,58 +406,19 @@ Error BitstreamRemarkParser::processExternalFilePath( return make_error(); // Create a separate parser used for parsing the separate file. - ParserHelper = BitstreamParserHelper(TmpRemarkBuffer->getBuffer()); - // Advance and check until we can parse the meta block. - if (Error E = ParserHelper.advanceToMetaBlock()) - return E; - // Parse the meta from the separate file. - // Note: here we overwrite the BlockInfo with the one from the file. This will - // be used to parse the rest of the file. - BitstreamMetaParserHelper SeparateMetaHelper(ParserHelper.Stream); - if (Error E = SeparateMetaHelper.expectBlock()) - return E; - if (Error E = SeparateMetaHelper.parseBlock()) - return E; - - if (Error E = processCommonMeta(SeparateMetaHelper)) - return E; - - if (ContainerType != BitstreamRemarkContainerType::SeparateRemarksFile) - return SeparateMetaHelper.error("Wrong container type in external file."); - - // Process the meta from the separate file. - return processSeparateRemarksFileMeta(SeparateMetaHelper); -} - -Error BitstreamRemarkParser::processStandaloneMeta( - BitstreamMetaParserHelper &Helper) { - if (Error E = processStrTab(Helper)) + ParserHelper.emplace(TmpRemarkBuffer->getBuffer()); + if (Error E = parseMeta()) return E; - return processRemarkVersion(Helper); -} -Error BitstreamRemarkParser::processSeparateRemarksFileMeta( - BitstreamMetaParserHelper &Helper) { - return processRemarkVersion(Helper); -} + if (ContainerType != BitstreamRemarkContainerType::RemarksFile) + return ParserHelper->MetaHelper.error( + "Wrong container type in external file."); -Error BitstreamRemarkParser::processSeparateRemarksMetaMeta( - BitstreamMetaParserHelper &Helper) { - if (Error E = processStrTab(Helper)) - return E; - return processExternalFilePath(Helper); -} - -Expected> BitstreamRemarkParser::parseRemark() { - BitstreamRemarkParserHelper RemarkHelper(ParserHelper.Stream); - if (Error E = RemarkHelper.parseNext()) - return std::move(E); - - return processRemark(RemarkHelper); + return Error::success(); } -Expected> -BitstreamRemarkParser::processRemark(BitstreamRemarkParserHelper &Helper) { +Expected> BitstreamRemarkParser::processRemark() { + auto &Helper = *ParserHelper->RemarksHelper; std::unique_ptr Result = std::make_unique(); Remark &R = *Result; @@ -491,5 +505,3 @@ BitstreamRemarkParser::processRemark(BitstreamRemarkParserHelper &Helper) { return std::move(Result); } -llvm::remarks::BitstreamRemarkParser::BitstreamRemarkParser(StringRef Buf) - : RemarkParser(Format::Bitstream), ParserHelper(Buf) {} diff --git a/llvm/lib/Remarks/BitstreamRemarkParser.h b/llvm/lib/Remarks/BitstreamRemarkParser.h index d756e3296a871..4f66c47bb4b29 100644 --- a/llvm/lib/Remarks/BitstreamRemarkParser.h +++ b/llvm/lib/Remarks/BitstreamRemarkParser.h @@ -187,35 +187,49 @@ struct BitstreamParserHelper { BitstreamCursor Stream; /// The block info block. BitstreamBlockInfo BlockInfo; + + /// Helper to parse the metadata blocks in this bitstream. + BitstreamMetaParserHelper MetaHelper; + /// Helper to parse the remark blocks in this bitstream. Only needed + /// for ContainerType RemarksFile. + std::optional RemarksHelper; + /// The position of the first remark block we encounter after + /// the initial metadata block. + std::optional RemarkStartBitPos; + /// Start parsing at \p Buffer. 
- BitstreamParserHelper(StringRef Buffer); + BitstreamParserHelper(StringRef Buffer) + : Stream(Buffer), MetaHelper(Stream), RemarksHelper(Stream) {} + /// Parse and validate the magic number. Error expectMagic(); - /// Advance to the meta block - Error advanceToMetaBlock(); /// Parse the block info block containing all the abbrevs. /// This needs to be called before calling any other parsing function. Error parseBlockInfoBlock(); - /// Return true if the parser reached the end of the stream. - bool atEndOfStream() { return Stream.AtEndOfStream(); } + + /// Parse all metadata blocks in the file. This populates the meta helper. + Error parseMeta(); + /// Parse the next remark. This populates the remark helper data. + Error parseRemark(); }; /// Parses and holds the state of the latest parsed remark. struct BitstreamRemarkParser : public RemarkParser { /// The buffer to parse. - BitstreamParserHelper ParserHelper; + std::optional ParserHelper; /// The string table used for parsing strings. std::optional StrTab; /// Temporary remark buffer used when the remarks are stored separately. std::unique_ptr TmpRemarkBuffer; + /// Whether the metadata has already been parsed, so we can continue parsing + /// remarks. + bool IsMetaReady = false; /// The common metadata used to decide how to parse the buffer. /// This is filled when parsing the metadata block. uint64_t ContainerVersion = 0; uint64_t RemarkVersion = 0; BitstreamRemarkContainerType ContainerType = - BitstreamRemarkContainerType::Standalone; - /// Wether the parser is ready to parse remarks. - bool ReadyToParseRemarks = false; + BitstreamRemarkContainerType::RemarksFile; /// Create a parser that expects to find a string table embedded in the /// stream. @@ -230,20 +244,15 @@ struct BitstreamRemarkParser : public RemarkParser { /// Parse and process the metadata of the buffer. Error parseMeta(); - /// Parse a Bitstream remark. 
- Expected> parseRemark(); - private: - Error processCommonMeta(BitstreamMetaParserHelper &Helper); - Error processStandaloneMeta(BitstreamMetaParserHelper &Helper); - Error processSeparateRemarksFileMeta(BitstreamMetaParserHelper &Helper); - Error processSeparateRemarksMetaMeta(BitstreamMetaParserHelper &Helper); - Error processExternalFilePath(BitstreamMetaParserHelper &Helper); - Error processStrTab(BitstreamMetaParserHelper &Helper); - Error processRemarkVersion(BitstreamMetaParserHelper &Helper); - - Expected> - processRemark(BitstreamRemarkParserHelper &Helper); + Error processCommonMeta(); + Error processFileContainerMeta(); + Error processExternalFilePath(); + + Expected> processRemark(); + + Error processStrTab(); + Error processRemarkVersion(); }; Expected> createBitstreamParserFromMeta( diff --git a/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp b/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp index b2627196bce62..abd436e0ee561 100644 --- a/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp +++ b/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp @@ -12,25 +12,23 @@ //===----------------------------------------------------------------------===// #include "llvm/Remarks/BitstreamRemarkSerializer.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/Remarks/Remark.h" +#include #include using namespace llvm; using namespace llvm::remarks; BitstreamRemarkSerializerHelper::BitstreamRemarkSerializerHelper( - BitstreamRemarkContainerType ContainerType) - : Bitstream(Encoded), ContainerType(ContainerType) {} - -static void push(SmallVectorImpl &R, StringRef Str) { - append_range(R, Str); -} + BitstreamRemarkContainerType ContainerType, raw_ostream &OS) + : Bitstream(OS), ContainerType(ContainerType) {} static void setRecordName(unsigned RecordID, BitstreamWriter &Bitstream, SmallVectorImpl &R, StringRef Str) { R.clear(); R.push_back(RecordID); - push(R, Str); + append_range(R, Str); Bitstream.EmitRecord(bitc::BLOCKINFO_CODE_SETRECORDNAME, R); } @@ -41,7 +39,7 @@ static void initBlock(unsigned BlockID, BitstreamWriter &Bitstream, Bitstream.EmitRecord(bitc::BLOCKINFO_CODE_SETBID, R); R.clear(); - push(R, Str); + append_range(R, Str); Bitstream.EmitRecord(bitc::BLOCKINFO_CODE_BLOCKNAME, R); } @@ -200,75 +198,64 @@ void BitstreamRemarkSerializerHelper::setupBlockInfo() { Bitstream.Emit(static_cast(C), 8); Bitstream.EnterBlockInfoBlock(); + auto ExitBlock = make_scope_exit([&] { Bitstream.ExitBlock(); }); // Setup the main metadata. Depending on the container type, we'll setup the // required records next. setupMetaBlockInfo(); switch (ContainerType) { - case BitstreamRemarkContainerType::SeparateRemarksMeta: - // Needs a string table that the separate remark file is using. - setupMetaStrTab(); + case BitstreamRemarkContainerType::RemarksFileExternal: // Needs to know where the external remarks file is. setupMetaExternalFile(); - break; - case BitstreamRemarkContainerType::SeparateRemarksFile: - // Contains remarks: emit the version. - setupMetaRemarkVersion(); - // Contains remarks: emit the remark abbrevs. - setupRemarkBlockInfo(); - break; - case BitstreamRemarkContainerType::Standalone: + return; + case BitstreamRemarkContainerType::RemarksFile: // Contains remarks: emit the version. setupMetaRemarkVersion(); // Needs a string table. setupMetaStrTab(); // Contains remarks: emit the remark abbrevs. 
setupRemarkBlockInfo(); - break; + return; } - - Bitstream.ExitBlock(); + llvm_unreachable("Unexpected BitstreamRemarkContainerType"); } void BitstreamRemarkSerializerHelper::emitMetaBlock( - uint64_t ContainerVersion, std::optional RemarkVersion, - std::optional StrTab, std::optional Filename) { // Emit the meta block Bitstream.EnterSubblock(META_BLOCK_ID, 3); + auto ExitBlock = make_scope_exit([&] { Bitstream.ExitBlock(); }); // The container version and type. R.clear(); R.push_back(RECORD_META_CONTAINER_INFO); - R.push_back(ContainerVersion); + R.push_back(CurrentContainerVersion); R.push_back(static_cast(ContainerType)); Bitstream.EmitRecordWithAbbrev(RecordMetaContainerInfoAbbrevID, R); switch (ContainerType) { - case BitstreamRemarkContainerType::SeparateRemarksMeta: - assert(StrTab != std::nullopt && *StrTab != nullptr); - emitMetaStrTab(**StrTab); + case BitstreamRemarkContainerType::RemarksFileExternal: assert(Filename != std::nullopt); emitMetaExternalFile(*Filename); - break; - case BitstreamRemarkContainerType::SeparateRemarksFile: - assert(RemarkVersion != std::nullopt); - emitMetaRemarkVersion(*RemarkVersion); - break; - case BitstreamRemarkContainerType::Standalone: - assert(RemarkVersion != std::nullopt); - emitMetaRemarkVersion(*RemarkVersion); - assert(StrTab != std::nullopt && *StrTab != nullptr); - emitMetaStrTab(**StrTab); - break; + return; + case BitstreamRemarkContainerType::RemarksFile: + emitMetaRemarkVersion(CurrentRemarkVersion); + return; } + llvm_unreachable("Unexpected BitstreamRemarkContainerType"); +} +void BitstreamRemarkSerializerHelper::emitLateMetaBlock( + const StringTable &StrTab) { + // Emit the late meta block (after all remarks are serialized) + Bitstream.EnterSubblock(META_BLOCK_ID, 3); + emitMetaStrTab(StrTab); Bitstream.ExitBlock(); } -void BitstreamRemarkSerializerHelper::emitRemarkBlock(const Remark &Remark, - StringTable &StrTab) { +void BitstreamRemarkSerializerHelper::emitRemark(const Remark &Remark, + StringTable &StrTab) { Bitstream.EnterSubblock(REMARK_BLOCK_ID, 4); R.clear(); @@ -317,73 +304,49 @@ void BitstreamRemarkSerializerHelper::emitRemarkBlock(const Remark &Remark, Bitstream.ExitBlock(); } -void BitstreamRemarkSerializerHelper::flushToStream(raw_ostream &OS) { - OS.write(Encoded.data(), Encoded.size()); - Encoded.clear(); -} - -StringRef BitstreamRemarkSerializerHelper::getBuffer() { - return StringRef(Encoded.data(), Encoded.size()); -} - -BitstreamRemarkSerializer::BitstreamRemarkSerializer(raw_ostream &OS, - SerializerMode Mode) - : RemarkSerializer(Format::Bitstream, OS, Mode), - Helper(BitstreamRemarkContainerType::SeparateRemarksFile) { - assert(Mode == SerializerMode::Separate && - "For SerializerMode::Standalone, a pre-filled string table needs to " - "be provided."); - // We always use a string table with bitstream. +BitstreamRemarkSerializer::BitstreamRemarkSerializer(raw_ostream &OS) + : RemarkSerializer(Format::Bitstream, OS) { StrTab.emplace(); } BitstreamRemarkSerializer::BitstreamRemarkSerializer(raw_ostream &OS, - SerializerMode Mode, StringTable StrTabIn) - : RemarkSerializer(Format::Bitstream, OS, Mode), - Helper(Mode == SerializerMode::Separate - ? BitstreamRemarkContainerType::SeparateRemarksFile - : BitstreamRemarkContainerType::Standalone) { + : RemarkSerializer(Format::Bitstream, OS) { StrTab = std::move(StrTabIn); } -void BitstreamRemarkSerializer::emit(const Remark &Remark) { - if (!DidSetUp) { - // Emit the metadata that is embedded in the remark file. 
- // If we're in standalone mode, serialize the string table as well. - bool IsStandalone = - Helper.ContainerType == BitstreamRemarkContainerType::Standalone; - BitstreamMetaSerializer MetaSerializer( - OS, Helper, - IsStandalone ? &*StrTab - : std::optional(std::nullopt)); - MetaSerializer.emit(); - DidSetUp = true; - } +BitstreamRemarkSerializer::~BitstreamRemarkSerializer() { finalize(); } + +void BitstreamRemarkSerializer::setup() { + if (Helper) + return; + Helper.emplace(BitstreamRemarkContainerType::RemarksFile, OS); + Helper->setupBlockInfo(); + Helper->emitMetaBlock(); +} - assert(DidSetUp && - "The Block info block and the meta block were not emitted yet."); - Helper.emitRemarkBlock(Remark, *StrTab); +void BitstreamRemarkSerializer::finalize() { + if (!Helper) + return; + Helper->emitLateMetaBlock(*StrTab); + Helper = std::nullopt; +} - Helper.flushToStream(OS); +void BitstreamRemarkSerializer::emit(const Remark &Remark) { + setup(); + Helper->emitRemark(Remark, *StrTab); } -std::unique_ptr BitstreamRemarkSerializer::metaSerializer( - raw_ostream &OS, std::optional ExternalFilename) { - assert(Helper.ContainerType != - BitstreamRemarkContainerType::SeparateRemarksMeta); - bool IsStandalone = - Helper.ContainerType == BitstreamRemarkContainerType::Standalone; +std::unique_ptr +BitstreamRemarkSerializer::metaSerializer(raw_ostream &OS, + StringRef ExternalFilename) { return std::make_unique( - OS, - IsStandalone ? BitstreamRemarkContainerType::Standalone - : BitstreamRemarkContainerType::SeparateRemarksMeta, - &*StrTab, ExternalFilename); + OS, BitstreamRemarkContainerType::RemarksFileExternal, ExternalFilename); } void BitstreamMetaSerializer::emit() { + assert(Helper && "BitstreamMetaSerializer emitted multiple times"); Helper->setupBlockInfo(); - Helper->emitMetaBlock(CurrentContainerVersion, CurrentRemarkVersion, StrTab, - ExternalFilename); - Helper->flushToStream(OS); + Helper->emitMetaBlock(ExternalFilename); + Helper = std::nullopt; } diff --git a/llvm/lib/Remarks/RemarkLinker.cpp b/llvm/lib/Remarks/RemarkLinker.cpp index b00419bd4e51b..f0feeccbfe1b8 100644 --- a/llvm/lib/Remarks/RemarkLinker.cpp +++ b/llvm/lib/Remarks/RemarkLinker.cpp @@ -108,7 +108,7 @@ Error RemarkLinker::link(const object::ObjectFile &Obj, Format RemarkFormat) { Error RemarkLinker::serialize(raw_ostream &OS, Format RemarksFormat) const { Expected> MaybeSerializer = - createRemarkSerializer(RemarksFormat, SerializerMode::Standalone, OS, + createRemarkSerializer(RemarksFormat, OS, std::move(const_cast(StrTab))); if (!MaybeSerializer) return MaybeSerializer.takeError(); diff --git a/llvm/lib/Remarks/RemarkSerializer.cpp b/llvm/lib/Remarks/RemarkSerializer.cpp index df1da53d7c8a6..80388b4c47cb0 100644 --- a/llvm/lib/Remarks/RemarkSerializer.cpp +++ b/llvm/lib/Remarks/RemarkSerializer.cpp @@ -18,34 +18,32 @@ using namespace llvm; using namespace llvm::remarks; Expected> -remarks::createRemarkSerializer(Format RemarksFormat, SerializerMode Mode, - raw_ostream &OS) { +remarks::createRemarkSerializer(Format RemarksFormat, raw_ostream &OS) { switch (RemarksFormat) { case Format::Unknown: case Format::Auto: return createStringError(std::errc::invalid_argument, "Invalid remark serializer format."); case Format::YAML: - return std::make_unique(OS, Mode); + return std::make_unique(OS); case Format::Bitstream: - return std::make_unique(OS, Mode); + return std::make_unique(OS); } llvm_unreachable("Unknown remarks::Format enum"); } Expected> -remarks::createRemarkSerializer(Format RemarksFormat, SerializerMode Mode, - 
raw_ostream &OS, remarks::StringTable StrTab) { +remarks::createRemarkSerializer(Format RemarksFormat, raw_ostream &OS, + remarks::StringTable StrTab) { switch (RemarksFormat) { case Format::Unknown: case Format::Auto: return createStringError(std::errc::invalid_argument, "Invalid remark serializer format."); case Format::YAML: - return std::make_unique(OS, Mode, std::move(StrTab)); + return std::make_unique(OS, std::move(StrTab)); case Format::Bitstream: - return std::make_unique(OS, Mode, - std::move(StrTab)); + return std::make_unique(OS, std::move(StrTab)); } llvm_unreachable("Unknown remarks::Format enum"); } diff --git a/llvm/lib/Remarks/RemarkStreamer.cpp b/llvm/lib/Remarks/RemarkStreamer.cpp index bb62c8b5c2fdc..d9be2f1fcb6a4 100644 --- a/llvm/lib/Remarks/RemarkStreamer.cpp +++ b/llvm/lib/Remarks/RemarkStreamer.cpp @@ -12,6 +12,7 @@ #include "llvm/Remarks/RemarkStreamer.h" #include "llvm/Support/CommandLine.h" +#include #include using namespace llvm; @@ -31,6 +32,14 @@ RemarkStreamer::RemarkStreamer( Filename(FilenameIn ? std::optional(FilenameIn->str()) : std::nullopt) {} +RemarkStreamer::~RemarkStreamer() { + // Ensure that llvm::finalizeOptimizationRemarks was called before the + // RemarkStreamer is destroyed. + assert(!RemarkSerializer && + "RemarkSerializer must be released before RemarkStreamer is " + "destroyed. Ensure llvm::finalizeOptimizationRemarks is called."); +} + Error RemarkStreamer::setFilter(StringRef Filter) { Regex R = Regex(Filter); std::string RegexError; @@ -57,16 +66,7 @@ bool RemarkStreamer::needsSection() const { assert(EnableRemarksSection == cl::BOU_UNSET); - // We only need a section if we're in separate mode. - if (RemarkSerializer->Mode != remarks::SerializerMode::Separate) - return false; - - // Only some formats need a section: - // * bitstream - switch (RemarkSerializer->SerializerFormat) { - case remarks::Format::Bitstream: - return true; - default: - return false; - } + // Enable remark sections by default for bitstream remarks (so dsymutil can + // find all remarks for a linked binary) + return RemarkSerializer->SerializerFormat == Format::Bitstream; } diff --git a/llvm/lib/Remarks/YAMLRemarkParser.cpp b/llvm/lib/Remarks/YAMLRemarkParser.cpp index 5ff42fe6b9a9c..baad378d72bd4 100644 --- a/llvm/lib/Remarks/YAMLRemarkParser.cpp +++ b/llvm/lib/Remarks/YAMLRemarkParser.cpp @@ -385,7 +385,11 @@ Expected YAMLRemarkParser::parseArg(yaml::Node &Node) { if (!ValueStr) return error("argument value is missing.", *ArgMap); - return Argument{*KeyStr, *ValueStr, Loc}; + Argument Arg; + Arg.Key = *KeyStr; + Arg.Val = *ValueStr; + Arg.Loc = Loc; + return Arg; } Expected> YAMLRemarkParser::next() { diff --git a/llvm/lib/Remarks/YAMLRemarkSerializer.cpp b/llvm/lib/Remarks/YAMLRemarkSerializer.cpp index 846a72182d8f0..22e297040575c 100644 --- a/llvm/lib/Remarks/YAMLRemarkSerializer.cpp +++ b/llvm/lib/Remarks/YAMLRemarkSerializer.cpp @@ -19,8 +19,6 @@ using namespace llvm; using namespace llvm::remarks; -// Use the same keys whether we use a string table or not (respectively, T is an -// unsigned or a StringRef). static void mapRemarkHeader(yaml::IO &io, StringRef PassName, StringRef RemarkName, std::optional RL, StringRef FunctionName, @@ -116,11 +114,13 @@ template <> struct MappingTraits { static void mapping(IO &io, Argument &A) { assert(io.outputting() && "input not yet implemented"); + // NB: A.Key.data() is not necessarily null-terminated, as the StringRef may + // be a span into the middle of a string. 
if (StringRef(A.Val).count('\n') > 1) { StringBlockVal S(A.Val); - io.mapRequired(A.Key.data(), S); + io.mapRequired(A.Key, S); } else { - io.mapRequired(A.Key.data(), A.Val); + io.mapRequired(A.Key, A.Val); } io.mapOptional("DebugLoc", A.Loc); } @@ -131,10 +131,13 @@ template <> struct MappingTraits { LLVM_YAML_IS_SEQUENCE_VECTOR(Argument) -YAMLRemarkSerializer::YAMLRemarkSerializer(raw_ostream &OS, SerializerMode Mode, - std::optional StrTabIn) - : RemarkSerializer(Format::YAML, OS, Mode), - YAMLOutput(OS, reinterpret_cast(this)) { +YAMLRemarkSerializer::YAMLRemarkSerializer(raw_ostream &OS) + : RemarkSerializer(Format::YAML, OS), + YAMLOutput(OS, reinterpret_cast(this)) {} + +YAMLRemarkSerializer::YAMLRemarkSerializer(raw_ostream &OS, + StringTable StrTabIn) + : YAMLRemarkSerializer(OS) { StrTab = std::move(StrTabIn); } @@ -145,8 +148,9 @@ void YAMLRemarkSerializer::emit(const Remark &Remark) { YAMLOutput << R; } -std::unique_ptr YAMLRemarkSerializer::metaSerializer( - raw_ostream &OS, std::optional ExternalFilename) { +std::unique_ptr +YAMLRemarkSerializer::metaSerializer(raw_ostream &OS, + StringRef ExternalFilename) { return std::make_unique(OS, ExternalFilename); } @@ -186,6 +190,5 @@ void YAMLMetaSerializer::emit() { support::endian::write64le(StrTabSizeBuf.data(), StrTabSize); OS.write(StrTabSizeBuf.data(), StrTabSizeBuf.size()); - if (ExternalFilename) - emitExternalFile(OS, *ExternalFilename); + emitExternalFile(OS, ExternalFilename); } diff --git a/llvm/lib/Support/ARMWinEH.cpp b/llvm/lib/Support/ARMWinEH.cpp index 29c7a28541f23..fedea774b0da1 100644 --- a/llvm/lib/Support/ARMWinEH.cpp +++ b/llvm/lib/Support/ARMWinEH.cpp @@ -41,7 +41,7 @@ std::pair SavedRegisterMask(const RuntimeFunction &RF, GPRMask |= (((1 << ((RF.StackAdjust() & 0x3) + 1)) - 1) << (~RF.StackAdjust() & 0x3)); - return std::make_pair(GPRMask, VFPMask); + return {GPRMask, VFPMask}; } } // namespace WinEH } // namespace ARM diff --git a/llvm/lib/Support/BinaryStreamReader.cpp b/llvm/lib/Support/BinaryStreamReader.cpp index afc00864a5fb6..26ddf3f9c193d 100644 --- a/llvm/lib/Support/BinaryStreamReader.cpp +++ b/llvm/lib/Support/BinaryStreamReader.cpp @@ -174,5 +174,5 @@ BinaryStreamReader::split(uint64_t Off) const { First = First.keep_front(Off); BinaryStreamReader W1{First}; BinaryStreamReader W2{Second}; - return std::make_pair(W1, W2); + return {W1, W2}; } diff --git a/llvm/lib/Support/BinaryStreamWriter.cpp b/llvm/lib/Support/BinaryStreamWriter.cpp index dff08fee3fefa..0c399d5691f5b 100644 --- a/llvm/lib/Support/BinaryStreamWriter.cpp +++ b/llvm/lib/Support/BinaryStreamWriter.cpp @@ -89,7 +89,7 @@ BinaryStreamWriter::split(uint64_t Off) const { First = First.keep_front(Off); BinaryStreamWriter W1{First}; BinaryStreamWriter W2{Second}; - return std::make_pair(W1, W2); + return {W1, W2}; } Error BinaryStreamWriter::padToAlignment(uint32_t Align) { diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index 12a8d0c3a6bae..9491ec049f79d 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -101,6 +101,7 @@ void parser::anchor() {} void parser::anchor() {} void parser::anchor() {} void parser::anchor() {} +void parser>::anchor() {} void parser::anchor() {} // These anchor functions instantiate opt and reference its virtual @@ -2261,6 +2262,22 @@ void parser::printOptionDiff(const Option &O, StringRef V, outs() << ")\n"; } +void parser>::printOptionDiff( + const Option &O, std::optional V, + const OptionValue> &D, + size_t GlobalWidth) const { + 
printOptionName(O, GlobalWidth); + outs() << "= " << V; + size_t VSize = V.has_value() ? V.value().size() : 0; + size_t NumSpaces = MaxOptWidth > VSize ? MaxOptWidth - VSize : 0; + outs().indent(NumSpaces) << " (default: "; + if (D.hasValue() && D.getValue().has_value()) + outs() << D.getValue(); + else + outs() << "*no value*"; + outs() << ")\n"; +} + // Print a placeholder for options that don't yet support printOptionDiff(). void basic_parser_impl::printOptionNoValue(const Option &O, size_t GlobalWidth) const { diff --git a/llvm/lib/Support/DXILABI.cpp b/llvm/lib/Support/DXILABI.cpp index 082e32061bd45..ba6e16a0181c7 100644 --- a/llvm/lib/Support/DXILABI.cpp +++ b/llvm/lib/Support/DXILABI.cpp @@ -1,33 +1,33 @@ -//===-- DXILABI.cpp - ABI Sensitive Values for DXIL -----------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains definitions of various constants and enums that are -// required to remain stable as per the DXIL format's requirements. -// -// Documentation for DXIL can be found in -// https://github.com/Microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/DXILABI.h" -#include "llvm/Support/ErrorHandling.h" -using namespace llvm; - -StringRef dxil::getResourceClassName(dxil::ResourceClass RC) { - switch (RC) { - case dxil::ResourceClass::SRV: - return "SRV"; - case dxil::ResourceClass::UAV: - return "UAV"; - case dxil::ResourceClass::CBuffer: - return "CBV"; - case dxil::ResourceClass::Sampler: - return "Sampler"; - } - llvm_unreachable("Invalid ResourceClass enum value"); -} +//===-- DXILABI.cpp - ABI Sensitive Values for DXIL -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains definitions of various constants and enums that are +// required to remain stable as per the DXIL format's requirements. +// +// Documentation for DXIL can be found in +// https://github.com/Microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/DXILABI.h" +#include "llvm/Support/ErrorHandling.h" +using namespace llvm; + +StringRef dxil::getResourceClassName(dxil::ResourceClass RC) { + switch (RC) { + case dxil::ResourceClass::SRV: + return "SRV"; + case dxil::ResourceClass::UAV: + return "UAV"; + case dxil::ResourceClass::CBuffer: + return "CBV"; + case dxil::ResourceClass::Sampler: + return "Sampler"; + } + llvm_unreachable("Invalid ResourceClass enum value"); +} diff --git a/llvm/lib/Support/FileCollector.cpp b/llvm/lib/Support/FileCollector.cpp index edb5313d43eec..5dc224a6d427b 100644 --- a/llvm/lib/Support/FileCollector.cpp +++ b/llvm/lib/Support/FileCollector.cpp @@ -49,8 +49,9 @@ static bool isCaseSensitivePath(StringRef Path) { return true; } -FileCollector::FileCollector(std::string Root, std::string OverlayRoot) - : Root(Root), OverlayRoot(OverlayRoot) { +FileCollector::FileCollector(std::string Root, std::string OverlayRoot, + IntrusiveRefCntPtr VFS) + : Root(Root), OverlayRoot(OverlayRoot), Canonicalizer(std::move(VFS)) { assert(sys::path::is_absolute(Root) && "Root not absolute"); assert(sys::path::is_absolute(OverlayRoot) && "OverlayRoot not absolute"); } @@ -88,9 +89,9 @@ void FileCollector::PathCanonicalizer::updateWithRealPath( } /// Make Path absolute. -static void makeAbsolute(SmallVectorImpl &Path) { +static void makeAbsolute(vfs::FileSystem &VFS, SmallVectorImpl &Path) { // We need an absolute src path to append to the root. - sys::fs::make_absolute(Path); + VFS.makeAbsolute(Path); // Canonicalize src to a native path to avoid mixed separator styles. sys::path::native(Path); @@ -105,7 +106,7 @@ FileCollector::PathCanonicalizer::PathStorage FileCollector::PathCanonicalizer::canonicalize(StringRef SrcPath) { PathStorage Paths; Paths.VirtualPath = SrcPath; - makeAbsolute(Paths.VirtualPath); + makeAbsolute(*VFS, Paths.VirtualPath); // If a ".." component is present after a symlink component, remove_dots may // lead to the wrong real destination path. Let the source be canonicalized diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 6c2ed6c84c6cf..686688ad6c25f 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -305,6 +305,8 @@ SmallVector tokenize(StringRef Template) { SmallVector Tokens; StringLiteral Open("{{"); StringLiteral Close("}}"); + StringLiteral TripleOpen("{{{"); + StringLiteral TripleClose("}}}"); size_t Start = 0; size_t DelimiterStart = Template.find(Open); if (DelimiterStart == StringRef::npos) { @@ -314,18 +316,33 @@ SmallVector tokenize(StringRef Template) { while (DelimiterStart != StringRef::npos) { if (DelimiterStart != Start) Tokens.emplace_back(Template.substr(Start, DelimiterStart - Start).str()); - size_t DelimiterEnd = Template.find(Close, DelimiterStart); - if (DelimiterEnd == StringRef::npos) - break; - // Extract the Interpolated variable without delimiters. 
- size_t InterpolatedStart = DelimiterStart + Open.size(); - size_t InterpolatedEnd = DelimiterEnd - DelimiterStart - Close.size(); - std::string Interpolated = - Template.substr(InterpolatedStart, InterpolatedEnd).str(); - std::string RawBody = Open.str() + Interpolated + Close.str(); - Tokens.emplace_back(RawBody, Interpolated, Interpolated[0]); - Start = DelimiterEnd + Close.size(); + if (Template.substr(DelimiterStart).starts_with(TripleOpen)) { + size_t DelimiterEnd = Template.find(TripleClose, DelimiterStart); + if (DelimiterEnd == StringRef::npos) + break; + size_t BodyStart = DelimiterStart + TripleOpen.size(); + std::string Body = + Template.substr(BodyStart, DelimiterEnd - BodyStart).str(); + std::string RawBody = + Template.substr(DelimiterStart, DelimiterEnd - DelimiterStart + 3) + .str(); + Tokens.emplace_back(RawBody, "&" + Body, '&'); + Start = DelimiterEnd + TripleClose.size(); + } else { + size_t DelimiterEnd = Template.find(Close, DelimiterStart); + if (DelimiterEnd == StringRef::npos) + break; + + // Extract the Interpolated variable without delimiters. + size_t InterpolatedStart = DelimiterStart + Open.size(); + size_t InterpolatedEnd = DelimiterEnd - DelimiterStart - Close.size(); + std::string Interpolated = + Template.substr(InterpolatedStart, InterpolatedEnd).str(); + std::string RawBody = Open.str() + Interpolated + Close.str(); + Tokens.emplace_back(RawBody, Interpolated, Interpolated[0]); + Start = DelimiterEnd + Close.size(); + } DelimiterStart = Template.find(Open, Start); } @@ -380,19 +397,32 @@ class EscapeStringStream : public raw_ostream { public: explicit EscapeStringStream(llvm::raw_ostream &WrappedStream, EscapeMap &Escape) - : Escape(Escape), WrappedStream(WrappedStream) { + : Escape(Escape), EscapeChars(Escape.keys().begin(), Escape.keys().end()), + WrappedStream(WrappedStream) { SetUnbuffered(); } protected: void write_impl(const char *Ptr, size_t Size) override { - llvm::StringRef Data(Ptr, Size); - for (char C : Data) { - auto It = Escape.find(C); - if (It != Escape.end()) - WrappedStream << It->getSecond(); - else - WrappedStream << C; + StringRef Data(Ptr, Size); + size_t Start = 0; + while (Start < Size) { + // Find the next character that needs to be escaped. + size_t Next = Data.find_first_of(EscapeChars.str(), Start); + + // If no escapable characters are found, write the rest of the string. + if (Next == StringRef::npos) { + WrappedStream << Data.substr(Start); + return; + } + + // Write the chunk of text before the escapable character. + if (Next > Start) + WrappedStream << Data.substr(Start, Next - Start); + + // Look up and write the escaped version of the character. 
+ WrappedStream << Escape[Data[Next]]; + Start = Next + 1; } } @@ -400,6 +430,7 @@ class EscapeStringStream : public raw_ostream { private: EscapeMap &Escape; + SmallString<8> EscapeChars; llvm::raw_ostream &WrappedStream; }; diff --git a/llvm/lib/Support/OptimizedStructLayout.cpp b/llvm/lib/Support/OptimizedStructLayout.cpp index 7b21f927a3462..a3b5c312beaa9 100644 --- a/llvm/lib/Support/OptimizedStructLayout.cpp +++ b/llvm/lib/Support/OptimizedStructLayout.cpp @@ -82,7 +82,7 @@ llvm::performOptimizedStructLayout(MutableArrayRef Fields) { #ifndef NDEBUG checkValidLayout(Fields, Size, MaxAlign); #endif - return std::make_pair(Size, MaxAlign); + return {Size, MaxAlign}; } // Walk over the flexible-offset fields, tracking MaxAlign and @@ -164,7 +164,7 @@ llvm::performOptimizedStructLayout(MutableArrayRef Fields) { #ifndef NDEBUG checkValidLayout(Fields, LastEnd, MaxAlign); #endif - return std::make_pair(LastEnd, MaxAlign); + return {LastEnd, MaxAlign}; } } @@ -452,5 +452,5 @@ llvm::performOptimizedStructLayout(MutableArrayRef Fields) { checkValidLayout(Fields, LastEnd, MaxAlign); #endif - return std::make_pair(LastEnd, MaxAlign); + return {LastEnd, MaxAlign}; } diff --git a/llvm/lib/Support/PGOOptions.cpp b/llvm/lib/Support/PGOOptions.cpp index 5981dff9e0946..ecfb0ca33f16c 100644 --- a/llvm/lib/Support/PGOOptions.cpp +++ b/llvm/lib/Support/PGOOptions.cpp @@ -13,8 +13,7 @@ using namespace llvm; PGOOptions::PGOOptions(std::string ProfileFile, std::string CSProfileGenFile, std::string ProfileRemappingFile, - std::string MemoryProfile, - IntrusiveRefCntPtr FS, PGOAction Action, + std::string MemoryProfile, PGOAction Action, CSPGOAction CSAction, ColdFuncOpt ColdType, bool DebugInfoForProfiling, bool PseudoProbeForProfiling, bool AtomicCounterUpdate) @@ -24,7 +23,7 @@ PGOOptions::PGOOptions(std::string ProfileFile, std::string CSProfileGenFile, DebugInfoForProfiling(DebugInfoForProfiling || (Action == SampleUse && !PseudoProbeForProfiling)), PseudoProbeForProfiling(PseudoProbeForProfiling), - AtomicCounterUpdate(AtomicCounterUpdate), FS(std::move(FS)) { + AtomicCounterUpdate(AtomicCounterUpdate) { // Note, we do allow ProfileFile.empty() for Action=IRUse LTO can // callback with IRUse action without ProfileFile. @@ -47,10 +46,6 @@ PGOOptions::PGOOptions(std::string ProfileFile, std::string CSProfileGenFile, assert(this->Action != NoAction || this->CSAction != NoCSAction || !this->MemoryProfile.empty() || this->DebugInfoForProfiling || this->PseudoProbeForProfiling); - - // If we need to use the profile, the VFS cannot be nullptr. - assert(this->FS || !(this->Action == IRUse || this->CSAction == CSIRUse || - !this->MemoryProfile.empty())); } PGOOptions::PGOOptions(const PGOOptions &) = default; diff --git a/llvm/lib/Support/ScaledNumber.cpp b/llvm/lib/Support/ScaledNumber.cpp index 4d5923e3634b1..2c99e07660334 100644 --- a/llvm/lib/Support/ScaledNumber.cpp +++ b/llvm/lib/Support/ScaledNumber.cpp @@ -41,7 +41,7 @@ std::pair ScaledNumbers::multiply64(uint64_t LHS, // Check whether the upper digit is empty. if (!Upper) - return std::make_pair(Lower, 0); + return {Lower, 0}; // Shift as little as possible to maximize precision. unsigned LeadingZeros = llvm::countl_zero(Upper); @@ -91,7 +91,7 @@ std::pair ScaledNumbers::divide64(uint64_t Dividend, // Check for powers of two. if (Divisor == 1) - return std::make_pair(Dividend, Shift); + return {Dividend, Shift}; // Maximize size of dividend. 
if (int Zeros = llvm::countl_zero(Dividend)) { diff --git a/llvm/lib/Support/SmallPtrSet.cpp b/llvm/lib/Support/SmallPtrSet.cpp index a602165a0753c..e377dbf4a6999 100644 --- a/llvm/lib/Support/SmallPtrSet.cpp +++ b/llvm/lib/Support/SmallPtrSet.cpp @@ -52,7 +52,7 @@ SmallPtrSetImplBase::insert_imp_big(const void *Ptr) { // Okay, we know we have space. Find a hash bucket. const void **Bucket = const_cast(FindBucketFor(Ptr)); if (*Bucket == Ptr) - return std::make_pair(Bucket, false); // Already inserted, good. + return {Bucket, false}; // Already inserted, good. // Otherwise, insert it! if (*Bucket == getTombstoneMarker()) @@ -60,7 +60,7 @@ SmallPtrSetImplBase::insert_imp_big(const void *Ptr) { ++NumEntries; *Bucket = Ptr; incrementEpoch(); - return std::make_pair(Bucket, true); + return {Bucket, true}; } const void *const *SmallPtrSetImplBase::doFind(const void *Ptr) const { diff --git a/llvm/lib/Support/SourceMgr.cpp b/llvm/lib/Support/SourceMgr.cpp index 3f97213d86c05..a43cf37a79824 100644 --- a/llvm/lib/Support/SourceMgr.cpp +++ b/llvm/lib/Support/SourceMgr.cpp @@ -202,7 +202,7 @@ SourceMgr::getLineAndColumn(SMLoc Loc, unsigned BufferID) const { size_t NewlineOffs = StringRef(BufStart, Ptr - BufStart).find_last_of("\n\r"); if (NewlineOffs == StringRef::npos) NewlineOffs = ~(size_t)0; - return std::make_pair(LineNo, Ptr - BufStart - NewlineOffs); + return {LineNo, Ptr - BufStart - NewlineOffs}; } // FIXME: Note that the formatting of source locations is spread between diff --git a/llvm/lib/Support/StringExtras.cpp b/llvm/lib/Support/StringExtras.cpp index 6ae26267337b1..5058c08aff64a 100644 --- a/llvm/lib/Support/StringExtras.cpp +++ b/llvm/lib/Support/StringExtras.cpp @@ -44,7 +44,7 @@ std::pair llvm::getToken(StringRef Source, // Find the next occurrence of the delimiter. StringRef::size_type End = Source.find_first_of(Delimiters, Start); - return std::make_pair(Source.slice(Start, End), Source.substr(End)); + return {Source.slice(Start, End), Source.substr(End)}; } /// SplitString - Split up the specified string according to the specified diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp index b4ee0f8ee8bfd..804ff07f6e9a8 100644 --- a/llvm/lib/Support/TextEncoding.cpp +++ b/llvm/lib/Support/TextEncoding.cpp @@ -161,7 +161,7 @@ TextEncodingConverterICU::convertString(StringRef Source, EC = U_ZERO_ERROR; const char *Input = In; - Output = InputLength ? static_cast(Result.data()) : nullptr; + Output = static_cast(Result.data()); ucnv_convertEx(&*ToConvDesc, &*FromConvDesc, &Output, Result.end(), &Input, In + InputLength, /*pivotStart=*/NULL, /*pivotSource=*/NULL, /*pivotTarget=*/NULL, @@ -172,8 +172,10 @@ TextEncodingConverterICU::convertString(StringRef Source, if (Capacity < Result.max_size()) { HandleOverflow(Capacity, Output, OutputLength, Result); continue; - } else + } else { + Result.resize(Output - Result.data()); return std::error_code(E2BIG, std::generic_category()); + } } // Some other error occured. Result.resize(Output - Result.data()); @@ -268,10 +270,8 @@ TextEncodingConverterIconv::convertString(StringRef Source, }; do { - // Setup the input. Use nullptr to reset iconv state if input length is - // zero. size_t InputLength = Source.size(); - char *Input = const_cast(InputLength ? 
Source.data() : ""); + char *Input = const_cast(Source.data()); Ret = iconv(ConvDesc, &Input, &InputLength, &Output, &OutputLength); if (Ret != 0) { if (auto EC = HandleError(Ret)) diff --git a/llvm/lib/Support/YAMLParser.cpp b/llvm/lib/Support/YAMLParser.cpp index fa5db46598905..6734877802caf 100644 --- a/llvm/lib/Support/YAMLParser.cpp +++ b/llvm/lib/Support/YAMLParser.cpp @@ -59,7 +59,7 @@ using EncodingInfo = std::pair; /// and how long the byte order mark is if one exists. static EncodingInfo getUnicodeEncoding(StringRef Input) { if (Input.empty()) - return std::make_pair(UEF_Unknown, 0); + return {UEF_Unknown, 0}; switch (uint8_t(Input[0])) { case 0x00: @@ -67,44 +67,44 @@ static EncodingInfo getUnicodeEncoding(StringRef Input) { if ( Input[1] == 0 && uint8_t(Input[2]) == 0xFE && uint8_t(Input[3]) == 0xFF) - return std::make_pair(UEF_UTF32_BE, 4); + return {UEF_UTF32_BE, 4}; if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) - return std::make_pair(UEF_UTF32_BE, 0); + return {UEF_UTF32_BE, 0}; } if (Input.size() >= 2 && Input[1] != 0) - return std::make_pair(UEF_UTF16_BE, 0); - return std::make_pair(UEF_Unknown, 0); + return {UEF_UTF16_BE, 0}; + return {UEF_Unknown, 0}; case 0xFF: if ( Input.size() >= 4 && uint8_t(Input[1]) == 0xFE && Input[2] == 0 && Input[3] == 0) - return std::make_pair(UEF_UTF32_LE, 4); + return {UEF_UTF32_LE, 4}; if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) - return std::make_pair(UEF_UTF16_LE, 2); - return std::make_pair(UEF_Unknown, 0); + return {UEF_UTF16_LE, 2}; + return {UEF_Unknown, 0}; case 0xFE: if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) - return std::make_pair(UEF_UTF16_BE, 2); - return std::make_pair(UEF_Unknown, 0); + return {UEF_UTF16_BE, 2}; + return {UEF_Unknown, 0}; case 0xEF: if ( Input.size() >= 3 && uint8_t(Input[1]) == 0xBB && uint8_t(Input[2]) == 0xBF) - return std::make_pair(UEF_UTF8, 3); - return std::make_pair(UEF_Unknown, 0); + return {UEF_UTF8, 3}; + return {UEF_Unknown, 0}; } // It could still be utf-32 or utf-16. if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) - return std::make_pair(UEF_UTF32_LE, 0); + return {UEF_UTF32_LE, 0}; if (Input.size() >= 2 && Input[1] == 0) - return std::make_pair(UEF_UTF16_LE, 0); + return {UEF_UTF16_LE, 0}; - return std::make_pair(UEF_UTF8, 0); + return {UEF_UTF8, 0}; } /// Pin the vtables to this file. @@ -199,7 +199,7 @@ static UTF8Decoded decodeUTF8(StringRef Range) { // 1 byte: [0x00, 0x7f] // Bit pattern: 0xxxxxxx if (Position < End && (*Position & 0x80) == 0) { - return std::make_pair(*Position, 1); + return {*Position, 1}; } // 2 bytes: [0x80, 0x7ff] // Bit pattern: 110xxxxx 10xxxxxx @@ -208,7 +208,7 @@ static UTF8Decoded decodeUTF8(StringRef Range) { uint32_t codepoint = ((*Position & 0x1F) << 6) | (*(Position + 1) & 0x3F); if (codepoint >= 0x80) - return std::make_pair(codepoint, 2); + return {codepoint, 2}; } // 3 bytes: [0x8000, 0xffff] // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx @@ -222,7 +222,7 @@ static UTF8Decoded decodeUTF8(StringRef Range) { // they are high / low surrogate halves used by UTF-16. 
if (codepoint >= 0x800 && (codepoint < 0xD800 || codepoint > 0xDFFF)) - return std::make_pair(codepoint, 3); + return {codepoint, 3}; } // 4 bytes: [0x10000, 0x10FFFF] // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx @@ -235,9 +235,9 @@ static UTF8Decoded decodeUTF8(StringRef Range) { ((*(Position + 2) & 0x3F) << 6) | (*(Position + 3) & 0x3F); if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) - return std::make_pair(codepoint, 4); + return {codepoint, 4}; } - return std::make_pair(0, 0); + return {0, 0}; } namespace llvm { diff --git a/llvm/lib/Support/YAMLTraits.cpp b/llvm/lib/Support/YAMLTraits.cpp index 035828b594e84..95a41eafdf5e4 100644 --- a/llvm/lib/Support/YAMLTraits.cpp +++ b/llvm/lib/Support/YAMLTraits.cpp @@ -144,7 +144,7 @@ std::vector Input::keys() { return Ret; } -bool Input::preflightKey(const char *Key, bool Required, bool, bool &UseDefault, +bool Input::preflightKey(StringRef Key, bool Required, bool, bool &UseDefault, void *&SaveInfo) { UseDefault = false; if (EC) @@ -168,7 +168,7 @@ bool Input::preflightKey(const char *Key, bool Required, bool, bool &UseDefault, UseDefault = true; return false; } - MN->ValidKeys.push_back(Key); + MN->ValidKeys.push_back(Key.str()); HNode *Value = MN->Mapping[Key].first; if (!Value) { if (Required) @@ -266,7 +266,7 @@ void Input::beginEnumScalar() { ScalarMatchFound = false; } -bool Input::matchEnumScalar(const char *Str, bool) { +bool Input::matchEnumScalar(StringRef Str, bool) { if (ScalarMatchFound) return false; if (ScalarHNode *SN = dyn_cast(CurrentNode)) { @@ -302,7 +302,7 @@ bool Input::beginBitSetScalar(bool &DoClear) { return true; } -bool Input::bitSetMatch(const char *Str, bool) { +bool Input::bitSetMatch(StringRef Str, bool) { if (EC) return false; if (SequenceHNode *SQ = dyn_cast(CurrentNode)) { @@ -541,7 +541,7 @@ std::vector Output::keys() { report_fatal_error("invalid call"); } -bool Output::preflightKey(const char *Key, bool Required, bool SameAsDefault, +bool Output::preflightKey(StringRef Key, bool Required, bool SameAsDefault, bool &UseDefault, void *&SaveInfo) { UseDefault = false; SaveInfo = nullptr; @@ -666,7 +666,7 @@ void Output::beginEnumScalar() { EnumerationMatchFound = false; } -bool Output::matchEnumScalar(const char *Str, bool Match) { +bool Output::matchEnumScalar(StringRef Str, bool Match) { if (Match && !EnumerationMatchFound) { newLineCheck(); outputUpToEndOfLine(Str); @@ -695,7 +695,7 @@ bool Output::beginBitSetScalar(bool &DoClear) { return true; } -bool Output::bitSetMatch(const char *Str, bool Matches) { +bool Output::bitSetMatch(StringRef Str, bool Matches) { if (Matches) { if (NeedBitValueComma) output(", "); diff --git a/llvm/lib/Support/rpmalloc/rpmalloc.h b/llvm/lib/Support/rpmalloc/rpmalloc.h index 3911c53b779b3..5b7fe1ff4286b 100644 --- a/llvm/lib/Support/rpmalloc/rpmalloc.h +++ b/llvm/lib/Support/rpmalloc/rpmalloc.h @@ -1,428 +1,428 @@ -//===---------------------- rpmalloc.h ------------------*- C -*-=============// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This library provides a cross-platform lock free thread caching malloc -// implementation in C11. 
-// -//===----------------------------------------------------------------------===// - -#pragma once - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#if defined(__clang__) || defined(__GNUC__) -#define RPMALLOC_EXPORT __attribute__((visibility("default"))) -#define RPMALLOC_ALLOCATOR -#if (defined(__clang_major__) && (__clang_major__ < 4)) || \ - (defined(__GNUC__) && defined(ENABLE_PRELOAD) && ENABLE_PRELOAD) -#define RPMALLOC_ATTRIB_MALLOC -#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) -#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) -#else -#define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__)) -#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size))) -#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) \ - __attribute__((alloc_size(count, size))) -#endif -#define RPMALLOC_CDECL -#elif defined(_MSC_VER) -#define RPMALLOC_EXPORT -#define RPMALLOC_ALLOCATOR __declspec(allocator) __declspec(restrict) -#define RPMALLOC_ATTRIB_MALLOC -#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) -#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) -#define RPMALLOC_CDECL __cdecl -#else -#define RPMALLOC_EXPORT -#define RPMALLOC_ALLOCATOR -#define RPMALLOC_ATTRIB_MALLOC -#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) -#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) -#define RPMALLOC_CDECL -#endif - -//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes. Will introduce -// a very small overhead due to some size calculations not being compile time -// constants -#ifndef RPMALLOC_CONFIGURABLE -#define RPMALLOC_CONFIGURABLE 0 -#endif - -//! Define RPMALLOC_FIRST_CLASS_HEAPS to enable heap based API (rpmalloc_heap_* -//! functions). -// Will introduce a very small overhead to track fully allocated spans in heaps -#ifndef RPMALLOC_FIRST_CLASS_HEAPS -#define RPMALLOC_FIRST_CLASS_HEAPS 0 -#endif - -//! Flag to rpaligned_realloc to not preserve content in reallocation -#define RPMALLOC_NO_PRESERVE 1 -//! Flag to rpaligned_realloc to fail and return null pointer if grow cannot be -//! done in-place, -// in which case the original pointer is still valid (just like a call to -// realloc which failes to allocate a new block). -#define RPMALLOC_GROW_OR_FAIL 2 - -typedef struct rpmalloc_global_statistics_t { - //! Current amount of virtual memory mapped, all of which might not have been - //! committed (only if ENABLE_STATISTICS=1) - size_t mapped; - //! Peak amount of virtual memory mapped, all of which might not have been - //! committed (only if ENABLE_STATISTICS=1) - size_t mapped_peak; - //! Current amount of memory in global caches for small and medium sizes - //! (<32KiB) - size_t cached; - //! Current amount of memory allocated in huge allocations, i.e larger than - //! LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) - size_t huge_alloc; - //! Peak amount of memory allocated in huge allocations, i.e larger than - //! LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) - size_t huge_alloc_peak; - //! Total amount of memory mapped since initialization (only if - //! ENABLE_STATISTICS=1) - size_t mapped_total; - //! Total amount of memory unmapped since initialization (only if - //! ENABLE_STATISTICS=1) - size_t unmapped_total; -} rpmalloc_global_statistics_t; - -typedef struct rpmalloc_thread_statistics_t { - //! Current number of bytes available in thread size class caches for small - //! and medium sizes (<32KiB) - size_t sizecache; - //! Current number of bytes available in thread span caches for small and - //! 
medium sizes (<32KiB) - size_t spancache; - //! Total number of bytes transitioned from thread cache to global cache (only - //! if ENABLE_STATISTICS=1) - size_t thread_to_global; - //! Total number of bytes transitioned from global cache to thread cache (only - //! if ENABLE_STATISTICS=1) - size_t global_to_thread; - //! Per span count statistics (only if ENABLE_STATISTICS=1) - struct { - //! Currently used number of spans - size_t current; - //! High water mark of spans used - size_t peak; - //! Number of spans transitioned to global cache - size_t to_global; - //! Number of spans transitioned from global cache - size_t from_global; - //! Number of spans transitioned to thread cache - size_t to_cache; - //! Number of spans transitioned from thread cache - size_t from_cache; - //! Number of spans transitioned to reserved state - size_t to_reserved; - //! Number of spans transitioned from reserved state - size_t from_reserved; - //! Number of raw memory map calls (not hitting the reserve spans but - //! resulting in actual OS mmap calls) - size_t map_calls; - } span_use[64]; - //! Per size class statistics (only if ENABLE_STATISTICS=1) - struct { - //! Current number of allocations - size_t alloc_current; - //! Peak number of allocations - size_t alloc_peak; - //! Total number of allocations - size_t alloc_total; - //! Total number of frees - size_t free_total; - //! Number of spans transitioned to cache - size_t spans_to_cache; - //! Number of spans transitioned from cache - size_t spans_from_cache; - //! Number of spans transitioned from reserved state - size_t spans_from_reserved; - //! Number of raw memory map calls (not hitting the reserve spans but - //! resulting in actual OS mmap calls) - size_t map_calls; - } size_use[128]; -} rpmalloc_thread_statistics_t; - -typedef struct rpmalloc_config_t { - //! Map memory pages for the given number of bytes. The returned address MUST - //! be - // aligned to the rpmalloc span size, which will always be a power of two. - // Optionally the function can store an alignment offset in the offset - // variable in case it performs alignment and the returned pointer is offset - // from the actual start of the memory region due to this alignment. The - // alignment offset will be passed to the memory unmap function. The - // alignment offset MUST NOT be larger than 65535 (storable in an uint16_t), - // if it is you must use natural alignment to shift it into 16 bits. If you - // set a memory_map function, you must also set a memory_unmap function or - // else the default implementation will be used for both. This function must - // be thread safe, it can be called by multiple threads simultaneously. - void *(*memory_map)(size_t size, size_t *offset); - //! Unmap the memory pages starting at address and spanning the given number - //! of bytes. - // If release is set to non-zero, the unmap is for an entire span range as - // returned by a previous call to memory_map and that the entire range should - // be released. The release argument holds the size of the entire span range. - // If release is set to 0, the unmap is a partial decommit of a subset of the - // mapped memory range. If you set a memory_unmap function, you must also set - // a memory_map function or else the default implementation will be used for - // both. This function must be thread safe, it can be called by multiple - // threads simultaneously. - void (*memory_unmap)(void *address, size_t size, size_t offset, - size_t release); - //! 
Called when an assert fails, if asserts are enabled. Will use the standard - //! assert() - // if this is not set. - void (*error_callback)(const char *message); - //! Called when a call to map memory pages fails (out of memory). If this - //! callback is - // not set or returns zero the library will return a null pointer in the - // allocation call. If this callback returns non-zero the map call will be - // retried. The argument passed is the number of bytes that was requested in - // the map call. Only used if the default system memory map function is used - // (memory_map callback is not set). - int (*map_fail_callback)(size_t size); - //! Size of memory pages. The page size MUST be a power of two. All memory - //! mapping - // requests to memory_map will be made with size set to a multiple of the - // page size. Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system - // page size is used. - size_t page_size; - //! Size of a span of memory blocks. MUST be a power of two, and in - //! [4096,262144] - // range (unless 0 - set to 0 to use the default span size). Used if - // RPMALLOC_CONFIGURABLE is defined to 1. - size_t span_size; - //! Number of spans to map at each request to map new virtual memory blocks. - //! This can - // be used to minimize the system call overhead at the cost of virtual memory - // address space. The extra mapped pages will not be written until actually - // used, so physical committed memory should not be affected in the default - // implementation. Will be aligned to a multiple of spans that match memory - // page size in case of huge pages. - size_t span_map_count; - //! Enable use of large/huge pages. If this flag is set to non-zero and page - //! size is - // zero, the allocator will try to enable huge pages and auto detect the - // configuration. If this is set to non-zero and page_size is also non-zero, - // the allocator will assume huge pages have been configured and enabled - // prior to initializing the allocator. For Windows, see - // https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support - // For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt - int enable_huge_pages; - //! Respectively allocated pages and huge allocated pages names for systems - // supporting it to be able to distinguish among anonymous regions. - const char *page_name; - const char *huge_page_name; -} rpmalloc_config_t; - -//! Initialize allocator with default configuration -RPMALLOC_EXPORT int rpmalloc_initialize(void); - -//! Initialize allocator with given configuration -RPMALLOC_EXPORT int rpmalloc_initialize_config(const rpmalloc_config_t *config); - -//! Get allocator configuration -RPMALLOC_EXPORT const rpmalloc_config_t *rpmalloc_config(void); - -//! Finalize allocator -RPMALLOC_EXPORT void rpmalloc_finalize(void); - -//! Initialize allocator for calling thread -RPMALLOC_EXPORT void rpmalloc_thread_initialize(void); - -//! Finalize allocator for calling thread -RPMALLOC_EXPORT void rpmalloc_thread_finalize(int release_caches); - -//! Perform deferred deallocations pending for the calling thread heap -RPMALLOC_EXPORT void rpmalloc_thread_collect(void); - -//! Query if allocator is initialized for calling thread -RPMALLOC_EXPORT int rpmalloc_is_thread_initialized(void); - -//! Get per-thread statistics -RPMALLOC_EXPORT void -rpmalloc_thread_statistics(rpmalloc_thread_statistics_t *stats); - -//! Get global statistics -RPMALLOC_EXPORT void -rpmalloc_global_statistics(rpmalloc_global_statistics_t *stats); - -//! 
Dump all statistics in human readable format to file (should be a FILE*) -RPMALLOC_EXPORT void rpmalloc_dump_statistics(void *file); - -//! Allocate a memory block of at least the given size -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc(size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1); - -//! Free the given memory block -RPMALLOC_EXPORT void rpfree(void *ptr); - -//! Allocate a memory block of at least the given size and zero initialize it -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2); - -//! Reallocate the given block to at least the given size -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rprealloc(void *ptr, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! Reallocate the given block to at least the given size and alignment, -// with optional control flags (see RPMALLOC_NO_PRESERVE). -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpaligned_realloc(void *ptr, size_t alignment, size_t size, size_t oldsize, - unsigned int flags) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(3); - -//! Allocate a memory block of at least the given size and alignment. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! Allocate a memory block of at least the given size and alignment, and zero -//! initialize it. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpaligned_calloc(size_t alignment, size_t num, - size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); - -//! Allocate a memory block of at least the given size and alignment. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! Allocate a memory block of at least the given size and alignment. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT int rpposix_memalign(void **memptr, size_t alignment, - size_t size); - -//! Query the usable size of the given memory block (from given pointer to the -//! end of block) -RPMALLOC_EXPORT size_t rpmalloc_usable_size(void *ptr); - -//! Dummy empty function for forcing linker symbol inclusion -RPMALLOC_EXPORT void rpmalloc_linker_reference(void); - -#if RPMALLOC_FIRST_CLASS_HEAPS - -//! 
Heap type -typedef struct heap_t rpmalloc_heap_t; - -//! Acquire a new heap. Will reuse existing released heaps or allocate memory -//! for a new heap -// if none available. Heap API is implemented with the strict assumption that -// only one single thread will call heap functions for a given heap at any -// given time, no functions are thread safe. -RPMALLOC_EXPORT rpmalloc_heap_t *rpmalloc_heap_acquire(void); - -//! Release a heap (does NOT free the memory allocated by the heap, use -//! rpmalloc_heap_free_all before destroying the heap). -// Releasing a heap will enable it to be reused by other threads. Safe to pass -// a null pointer. -RPMALLOC_EXPORT void rpmalloc_heap_release(rpmalloc_heap_t *heap); - -//! Allocate a memory block of at least the given size using the given heap. -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_alloc(rpmalloc_heap_t *heap, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! Allocate a memory block of at least the given size using the given heap. The -//! returned -// block will have the requested alignment. Alignment must be a power of two -// and a multiple of sizeof(void*), and should ideally be less than memory page -// size. A caveat of rpmalloc internals is that this must also be strictly less -// than the span size (default 64KiB). -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_aligned_alloc(rpmalloc_heap_t *heap, size_t alignment, - size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(3); - -//! Allocate a memory block of at least the given size using the given heap and -//! zero initialize it. -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_calloc(rpmalloc_heap_t *heap, size_t num, - size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); - -//! Allocate a memory block of at least the given size using the given heap and -//! zero initialize it. The returned -// block will have the requested alignment. Alignment must either be zero, or a -// power of two and a multiple of sizeof(void*), and should ideally be less -// than memory page size. A caveat of rpmalloc internals is that this must also -// be strictly less than the span size (default 64KiB). -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_aligned_calloc(rpmalloc_heap_t *heap, size_t alignment, - size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); - -//! Reallocate the given block to at least the given size. The memory block MUST -//! be allocated -// by the same heap given to this function. -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_realloc(rpmalloc_heap_t *heap, void *ptr, size_t size, - unsigned int flags) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(3); - -//! Reallocate the given block to at least the given size. The memory block MUST -//! be allocated -// by the same heap given to this function. The returned block will have the -// requested alignment. Alignment must be either zero, or a power of two and a -// multiple of sizeof(void*), and should ideally be less than memory page size. -// A caveat of rpmalloc internals is that this must also be strictly less than -// the span size (default 64KiB). -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void *rpmalloc_heap_aligned_realloc( - rpmalloc_heap_t *heap, void *ptr, size_t alignment, size_t size, - unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(4); - -//! Free the given memory block from the given heap. The memory block MUST be -//! 
allocated -// by the same heap given to this function. -RPMALLOC_EXPORT void rpmalloc_heap_free(rpmalloc_heap_t *heap, void *ptr); - -//! Free all memory allocated by the heap -RPMALLOC_EXPORT void rpmalloc_heap_free_all(rpmalloc_heap_t *heap); - -//! Set the given heap as the current heap for the calling thread. A heap MUST -//! only be current heap -// for a single thread, a heap can never be shared between multiple threads. -// The previous current heap for the calling thread is released to be reused by -// other threads. -RPMALLOC_EXPORT void rpmalloc_heap_thread_set_current(rpmalloc_heap_t *heap); - -//! Returns which heap the given pointer is allocated on -RPMALLOC_EXPORT rpmalloc_heap_t *rpmalloc_get_heap_for_ptr(void *ptr); - -#endif - -#ifdef __cplusplus -} -#endif +//===---------------------- rpmalloc.h ------------------*- C -*-=============// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This library provides a cross-platform lock free thread caching malloc +// implementation in C11. +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__clang__) || defined(__GNUC__) +#define RPMALLOC_EXPORT __attribute__((visibility("default"))) +#define RPMALLOC_ALLOCATOR +#if (defined(__clang_major__) && (__clang_major__ < 4)) || \ + (defined(__GNUC__) && defined(ENABLE_PRELOAD) && ENABLE_PRELOAD) +#define RPMALLOC_ATTRIB_MALLOC +#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) +#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) +#else +#define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__)) +#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size))) +#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) \ + __attribute__((alloc_size(count, size))) +#endif +#define RPMALLOC_CDECL +#elif defined(_MSC_VER) +#define RPMALLOC_EXPORT +#define RPMALLOC_ALLOCATOR __declspec(allocator) __declspec(restrict) +#define RPMALLOC_ATTRIB_MALLOC +#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) +#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) +#define RPMALLOC_CDECL __cdecl +#else +#define RPMALLOC_EXPORT +#define RPMALLOC_ALLOCATOR +#define RPMALLOC_ATTRIB_MALLOC +#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) +#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) +#define RPMALLOC_CDECL +#endif + +//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes. Will introduce +// a very small overhead due to some size calculations not being compile time +// constants +#ifndef RPMALLOC_CONFIGURABLE +#define RPMALLOC_CONFIGURABLE 0 +#endif + +//! Define RPMALLOC_FIRST_CLASS_HEAPS to enable heap based API (rpmalloc_heap_* +//! functions). +// Will introduce a very small overhead to track fully allocated spans in heaps +#ifndef RPMALLOC_FIRST_CLASS_HEAPS +#define RPMALLOC_FIRST_CLASS_HEAPS 0 +#endif + +//! Flag to rpaligned_realloc to not preserve content in reallocation +#define RPMALLOC_NO_PRESERVE 1 +//! Flag to rpaligned_realloc to fail and return null pointer if grow cannot be +//! done in-place, +// in which case the original pointer is still valid (just like a call to +// realloc which failes to allocate a new block). +#define RPMALLOC_GROW_OR_FAIL 2 + +typedef struct rpmalloc_global_statistics_t { + //! 
Current amount of virtual memory mapped, all of which might not have been + //! committed (only if ENABLE_STATISTICS=1) + size_t mapped; + //! Peak amount of virtual memory mapped, all of which might not have been + //! committed (only if ENABLE_STATISTICS=1) + size_t mapped_peak; + //! Current amount of memory in global caches for small and medium sizes + //! (<32KiB) + size_t cached; + //! Current amount of memory allocated in huge allocations, i.e larger than + //! LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) + size_t huge_alloc; + //! Peak amount of memory allocated in huge allocations, i.e larger than + //! LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1) + size_t huge_alloc_peak; + //! Total amount of memory mapped since initialization (only if + //! ENABLE_STATISTICS=1) + size_t mapped_total; + //! Total amount of memory unmapped since initialization (only if + //! ENABLE_STATISTICS=1) + size_t unmapped_total; +} rpmalloc_global_statistics_t; + +typedef struct rpmalloc_thread_statistics_t { + //! Current number of bytes available in thread size class caches for small + //! and medium sizes (<32KiB) + size_t sizecache; + //! Current number of bytes available in thread span caches for small and + //! medium sizes (<32KiB) + size_t spancache; + //! Total number of bytes transitioned from thread cache to global cache (only + //! if ENABLE_STATISTICS=1) + size_t thread_to_global; + //! Total number of bytes transitioned from global cache to thread cache (only + //! if ENABLE_STATISTICS=1) + size_t global_to_thread; + //! Per span count statistics (only if ENABLE_STATISTICS=1) + struct { + //! Currently used number of spans + size_t current; + //! High water mark of spans used + size_t peak; + //! Number of spans transitioned to global cache + size_t to_global; + //! Number of spans transitioned from global cache + size_t from_global; + //! Number of spans transitioned to thread cache + size_t to_cache; + //! Number of spans transitioned from thread cache + size_t from_cache; + //! Number of spans transitioned to reserved state + size_t to_reserved; + //! Number of spans transitioned from reserved state + size_t from_reserved; + //! Number of raw memory map calls (not hitting the reserve spans but + //! resulting in actual OS mmap calls) + size_t map_calls; + } span_use[64]; + //! Per size class statistics (only if ENABLE_STATISTICS=1) + struct { + //! Current number of allocations + size_t alloc_current; + //! Peak number of allocations + size_t alloc_peak; + //! Total number of allocations + size_t alloc_total; + //! Total number of frees + size_t free_total; + //! Number of spans transitioned to cache + size_t spans_to_cache; + //! Number of spans transitioned from cache + size_t spans_from_cache; + //! Number of spans transitioned from reserved state + size_t spans_from_reserved; + //! Number of raw memory map calls (not hitting the reserve spans but + //! resulting in actual OS mmap calls) + size_t map_calls; + } size_use[128]; +} rpmalloc_thread_statistics_t; + +typedef struct rpmalloc_config_t { + //! Map memory pages for the given number of bytes. The returned address MUST + //! be + // aligned to the rpmalloc span size, which will always be a power of two. + // Optionally the function can store an alignment offset in the offset + // variable in case it performs alignment and the returned pointer is offset + // from the actual start of the memory region due to this alignment. 
The + // alignment offset will be passed to the memory unmap function. The + // alignment offset MUST NOT be larger than 65535 (storable in an uint16_t), + // if it is you must use natural alignment to shift it into 16 bits. If you + // set a memory_map function, you must also set a memory_unmap function or + // else the default implementation will be used for both. This function must + // be thread safe, it can be called by multiple threads simultaneously. + void *(*memory_map)(size_t size, size_t *offset); + //! Unmap the memory pages starting at address and spanning the given number + //! of bytes. + // If release is set to non-zero, the unmap is for an entire span range as + // returned by a previous call to memory_map and that the entire range should + // be released. The release argument holds the size of the entire span range. + // If release is set to 0, the unmap is a partial decommit of a subset of the + // mapped memory range. If you set a memory_unmap function, you must also set + // a memory_map function or else the default implementation will be used for + // both. This function must be thread safe, it can be called by multiple + // threads simultaneously. + void (*memory_unmap)(void *address, size_t size, size_t offset, + size_t release); + //! Called when an assert fails, if asserts are enabled. Will use the standard + //! assert() + // if this is not set. + void (*error_callback)(const char *message); + //! Called when a call to map memory pages fails (out of memory). If this + //! callback is + // not set or returns zero the library will return a null pointer in the + // allocation call. If this callback returns non-zero the map call will be + // retried. The argument passed is the number of bytes that was requested in + // the map call. Only used if the default system memory map function is used + // (memory_map callback is not set). + int (*map_fail_callback)(size_t size); + //! Size of memory pages. The page size MUST be a power of two. All memory + //! mapping + // requests to memory_map will be made with size set to a multiple of the + // page size. Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system + // page size is used. + size_t page_size; + //! Size of a span of memory blocks. MUST be a power of two, and in + //! [4096,262144] + // range (unless 0 - set to 0 to use the default span size). Used if + // RPMALLOC_CONFIGURABLE is defined to 1. + size_t span_size; + //! Number of spans to map at each request to map new virtual memory blocks. + //! This can + // be used to minimize the system call overhead at the cost of virtual memory + // address space. The extra mapped pages will not be written until actually + // used, so physical committed memory should not be affected in the default + // implementation. Will be aligned to a multiple of spans that match memory + // page size in case of huge pages. + size_t span_map_count; + //! Enable use of large/huge pages. If this flag is set to non-zero and page + //! size is + // zero, the allocator will try to enable huge pages and auto detect the + // configuration. If this is set to non-zero and page_size is also non-zero, + // the allocator will assume huge pages have been configured and enabled + // prior to initializing the allocator. For Windows, see + // https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support + // For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt + int enable_huge_pages; + //! 
Respectively allocated pages and huge allocated pages names for systems + // supporting it to be able to distinguish among anonymous regions. + const char *page_name; + const char *huge_page_name; +} rpmalloc_config_t; + +//! Initialize allocator with default configuration +RPMALLOC_EXPORT int rpmalloc_initialize(void); + +//! Initialize allocator with given configuration +RPMALLOC_EXPORT int rpmalloc_initialize_config(const rpmalloc_config_t *config); + +//! Get allocator configuration +RPMALLOC_EXPORT const rpmalloc_config_t *rpmalloc_config(void); + +//! Finalize allocator +RPMALLOC_EXPORT void rpmalloc_finalize(void); + +//! Initialize allocator for calling thread +RPMALLOC_EXPORT void rpmalloc_thread_initialize(void); + +//! Finalize allocator for calling thread +RPMALLOC_EXPORT void rpmalloc_thread_finalize(int release_caches); + +//! Perform deferred deallocations pending for the calling thread heap +RPMALLOC_EXPORT void rpmalloc_thread_collect(void); + +//! Query if allocator is initialized for calling thread +RPMALLOC_EXPORT int rpmalloc_is_thread_initialized(void); + +//! Get per-thread statistics +RPMALLOC_EXPORT void +rpmalloc_thread_statistics(rpmalloc_thread_statistics_t *stats); + +//! Get global statistics +RPMALLOC_EXPORT void +rpmalloc_global_statistics(rpmalloc_global_statistics_t *stats); + +//! Dump all statistics in human readable format to file (should be a FILE*) +RPMALLOC_EXPORT void rpmalloc_dump_statistics(void *file); + +//! Allocate a memory block of at least the given size +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc(size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1); + +//! Free the given memory block +RPMALLOC_EXPORT void rpfree(void *ptr); + +//! Allocate a memory block of at least the given size and zero initialize it +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2); + +//! Reallocate the given block to at least the given size +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rprealloc(void *ptr, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Reallocate the given block to at least the given size and alignment, +// with optional control flags (see RPMALLOC_NO_PRESERVE). +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpaligned_realloc(void *ptr, size_t alignment, size_t size, size_t oldsize, + unsigned int flags) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size and alignment, and zero +//! initialize it. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. 
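The core entry points declared above follow the familiar malloc contract but require explicit global and per-thread initialization. A minimal usage sketch (default configuration assumed, error handling omitted):

#include "rpmalloc.h"

int main(void) {
  rpmalloc_initialize();               // once per process, default config
  rpmalloc_thread_initialize();        // once per thread that allocates

  void *p = rpmalloc(256);             // at least 256 bytes
  p = rprealloc(p, 1024);              // grow in place or move
  rpfree(p);

  void *a = rpaligned_alloc(64, 4096); // 64-byte-aligned block
  rpfree(a);

  rpmalloc_thread_finalize(1);         // release thread caches back to the global cache
  rpmalloc_finalize();
  return 0;
}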
A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpaligned_calloc(size_t alignment, size_t num, + size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT int rpposix_memalign(void **memptr, size_t alignment, + size_t size); + +//! Query the usable size of the given memory block (from given pointer to the +//! end of block) +RPMALLOC_EXPORT size_t rpmalloc_usable_size(void *ptr); + +//! Dummy empty function for forcing linker symbol inclusion +RPMALLOC_EXPORT void rpmalloc_linker_reference(void); + +#if RPMALLOC_FIRST_CLASS_HEAPS + +//! Heap type +typedef struct heap_t rpmalloc_heap_t; + +//! Acquire a new heap. Will reuse existing released heaps or allocate memory +//! for a new heap +// if none available. Heap API is implemented with the strict assumption that +// only one single thread will call heap functions for a given heap at any +// given time, no functions are thread safe. +RPMALLOC_EXPORT rpmalloc_heap_t *rpmalloc_heap_acquire(void); + +//! Release a heap (does NOT free the memory allocated by the heap, use +//! rpmalloc_heap_free_all before destroying the heap). +// Releasing a heap will enable it to be reused by other threads. Safe to pass +// a null pointer. +RPMALLOC_EXPORT void rpmalloc_heap_release(rpmalloc_heap_t *heap); + +//! Allocate a memory block of at least the given size using the given heap. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_alloc(rpmalloc_heap_t *heap, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size using the given heap. The +//! returned +// block will have the requested alignment. Alignment must be a power of two +// and a multiple of sizeof(void*), and should ideally be less than memory page +// size. A caveat of rpmalloc internals is that this must also be strictly less +// than the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_aligned_alloc(rpmalloc_heap_t *heap, size_t alignment, + size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Allocate a memory block of at least the given size using the given heap and +//! zero initialize it. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_calloc(rpmalloc_heap_t *heap, size_t num, + size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Allocate a memory block of at least the given size using the given heap and +//! zero initialize it. The returned +// block will have the requested alignment. Alignment must either be zero, or a +// power of two and a multiple of sizeof(void*), and should ideally be less +// than memory page size. 
A caveat of rpmalloc internals is that this must also +// be strictly less than the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_aligned_calloc(rpmalloc_heap_t *heap, size_t alignment, + size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Reallocate the given block to at least the given size. The memory block MUST +//! be allocated +// by the same heap given to this function. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_realloc(rpmalloc_heap_t *heap, void *ptr, size_t size, + unsigned int flags) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Reallocate the given block to at least the given size. The memory block MUST +//! be allocated +// by the same heap given to this function. The returned block will have the +// requested alignment. Alignment must be either zero, or a power of two and a +// multiple of sizeof(void*), and should ideally be less than memory page size. +// A caveat of rpmalloc internals is that this must also be strictly less than +// the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void *rpmalloc_heap_aligned_realloc( + rpmalloc_heap_t *heap, void *ptr, size_t alignment, size_t size, + unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(4); + +//! Free the given memory block from the given heap. The memory block MUST be +//! allocated +// by the same heap given to this function. +RPMALLOC_EXPORT void rpmalloc_heap_free(rpmalloc_heap_t *heap, void *ptr); + +//! Free all memory allocated by the heap +RPMALLOC_EXPORT void rpmalloc_heap_free_all(rpmalloc_heap_t *heap); + +//! Set the given heap as the current heap for the calling thread. A heap MUST +//! only be current heap +// for a single thread, a heap can never be shared between multiple threads. +// The previous current heap for the calling thread is released to be reused by +// other threads. +RPMALLOC_EXPORT void rpmalloc_heap_thread_set_current(rpmalloc_heap_t *heap); + +//! Returns which heap the given pointer is allocated on +RPMALLOC_EXPORT rpmalloc_heap_t *rpmalloc_get_heap_for_ptr(void *ptr); + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/llvm/lib/Support/rpmalloc/rpnew.h b/llvm/lib/Support/rpmalloc/rpnew.h index d8303c6f95652..a18f0799d56d1 100644 --- a/llvm/lib/Support/rpmalloc/rpnew.h +++ b/llvm/lib/Support/rpmalloc/rpnew.h @@ -1,113 +1,113 @@ -//===-------------------------- rpnew.h -----------------*- C -*-=============// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This library provides a cross-platform lock free thread caching malloc -// implementation in C11. 
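Stepping back to the first-class heap API declared at the end of rpmalloc.h above (available when RPMALLOC_FIRST_CLASS_HEAPS is defined to 1): each heap is a single-owner arena with no internal locking. A sketch of the intended call sequence, illustrative only:

#include "rpmalloc.h"

void arena_example(void) {
  rpmalloc_heap_t *heap = rpmalloc_heap_acquire(); // one owning thread at a time

  void *a = rpmalloc_heap_alloc(heap, 128);
  void *b = rpmalloc_heap_aligned_alloc(heap, 64, 512);
  rpmalloc_heap_free(heap, a);  // must be freed on the heap that allocated it
  (void)b;

  rpmalloc_heap_free_all(heap); // bulk-release anything still live (e.g. b)
  rpmalloc_heap_release(heap);  // heap becomes reusable by other threads
}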
-// -//===----------------------------------------------------------------------===// - -#ifdef __cplusplus - -#include -#include - -#ifndef __CRTDECL -#define __CRTDECL -#endif - -extern void __CRTDECL operator delete(void *p) noexcept { rpfree(p); } - -extern void __CRTDECL operator delete[](void *p) noexcept { rpfree(p); } - -extern void *__CRTDECL operator new(std::size_t size) noexcept(false) { - return rpmalloc(size); -} - -extern void *__CRTDECL operator new[](std::size_t size) noexcept(false) { - return rpmalloc(size); -} - -extern void *__CRTDECL operator new(std::size_t size, - const std::nothrow_t &tag) noexcept { - (void)sizeof(tag); - return rpmalloc(size); -} - -extern void *__CRTDECL operator new[](std::size_t size, - const std::nothrow_t &tag) noexcept { - (void)sizeof(tag); - return rpmalloc(size); -} - -#if (__cplusplus >= 201402L || _MSC_VER >= 1916) - -extern void __CRTDECL operator delete(void *p, std::size_t size) noexcept { - (void)sizeof(size); - rpfree(p); -} - -extern void __CRTDECL operator delete[](void *p, std::size_t size) noexcept { - (void)sizeof(size); - rpfree(p); -} - -#endif - -#if (__cplusplus > 201402L || defined(__cpp_aligned_new)) - -extern void __CRTDECL operator delete(void *p, - std::align_val_t align) noexcept { - (void)sizeof(align); - rpfree(p); -} - -extern void __CRTDECL operator delete[](void *p, - std::align_val_t align) noexcept { - (void)sizeof(align); - rpfree(p); -} - -extern void __CRTDECL operator delete(void *p, std::size_t size, - std::align_val_t align) noexcept { - (void)sizeof(size); - (void)sizeof(align); - rpfree(p); -} - -extern void __CRTDECL operator delete[](void *p, std::size_t size, - std::align_val_t align) noexcept { - (void)sizeof(size); - (void)sizeof(align); - rpfree(p); -} - -extern void *__CRTDECL operator new(std::size_t size, - std::align_val_t align) noexcept(false) { - return rpaligned_alloc(static_cast(align), size); -} - -extern void *__CRTDECL operator new[](std::size_t size, - std::align_val_t align) noexcept(false) { - return rpaligned_alloc(static_cast(align), size); -} - -extern void *__CRTDECL operator new(std::size_t size, std::align_val_t align, - const std::nothrow_t &tag) noexcept { - (void)sizeof(tag); - return rpaligned_alloc(static_cast(align), size); -} - -extern void *__CRTDECL operator new[](std::size_t size, std::align_val_t align, - const std::nothrow_t &tag) noexcept { - (void)sizeof(tag); - return rpaligned_alloc(static_cast(align), size); -} - -#endif - -#endif +//===-------------------------- rpnew.h -----------------*- C -*-=============// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This library provides a cross-platform lock free thread caching malloc +// implementation in C11. 
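For completeness: rpnew.h is removed and re-added with identical content (apparently an encoding or line-ending normalization). It reroutes the global operator new/delete family to rpmalloc, and including it from a single C++ translation unit is enough for the whole program. A usage sketch:

#include "rpmalloc.h"
#include "rpnew.h" // defines the replacement operators inline

int main() {
  rpmalloc_initialize();
  int *xs = new int[32]; // serviced by rpmalloc via the replaced operator new[]
  delete[] xs;           // routed to rpfree via the replaced operator delete[]
  rpmalloc_finalize();
  return 0;
}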
+// +//===----------------------------------------------------------------------===// + +#ifdef __cplusplus + +#include +#include + +#ifndef __CRTDECL +#define __CRTDECL +#endif + +extern void __CRTDECL operator delete(void *p) noexcept { rpfree(p); } + +extern void __CRTDECL operator delete[](void *p) noexcept { rpfree(p); } + +extern void *__CRTDECL operator new(std::size_t size) noexcept(false) { + return rpmalloc(size); +} + +extern void *__CRTDECL operator new[](std::size_t size) noexcept(false) { + return rpmalloc(size); +} + +extern void *__CRTDECL operator new(std::size_t size, + const std::nothrow_t &tag) noexcept { + (void)sizeof(tag); + return rpmalloc(size); +} + +extern void *__CRTDECL operator new[](std::size_t size, + const std::nothrow_t &tag) noexcept { + (void)sizeof(tag); + return rpmalloc(size); +} + +#if (__cplusplus >= 201402L || _MSC_VER >= 1916) + +extern void __CRTDECL operator delete(void *p, std::size_t size) noexcept { + (void)sizeof(size); + rpfree(p); +} + +extern void __CRTDECL operator delete[](void *p, std::size_t size) noexcept { + (void)sizeof(size); + rpfree(p); +} + +#endif + +#if (__cplusplus > 201402L || defined(__cpp_aligned_new)) + +extern void __CRTDECL operator delete(void *p, + std::align_val_t align) noexcept { + (void)sizeof(align); + rpfree(p); +} + +extern void __CRTDECL operator delete[](void *p, + std::align_val_t align) noexcept { + (void)sizeof(align); + rpfree(p); +} + +extern void __CRTDECL operator delete(void *p, std::size_t size, + std::align_val_t align) noexcept { + (void)sizeof(size); + (void)sizeof(align); + rpfree(p); +} + +extern void __CRTDECL operator delete[](void *p, std::size_t size, + std::align_val_t align) noexcept { + (void)sizeof(size); + (void)sizeof(align); + rpfree(p); +} + +extern void *__CRTDECL operator new(std::size_t size, + std::align_val_t align) noexcept(false) { + return rpaligned_alloc(static_cast(align), size); +} + +extern void *__CRTDECL operator new[](std::size_t size, + std::align_val_t align) noexcept(false) { + return rpaligned_alloc(static_cast(align), size); +} + +extern void *__CRTDECL operator new(std::size_t size, std::align_val_t align, + const std::nothrow_t &tag) noexcept { + (void)sizeof(tag); + return rpaligned_alloc(static_cast(align), size); +} + +extern void *__CRTDECL operator new[](std::size_t size, std::align_val_t align, + const std::nothrow_t &tag) noexcept { + (void)sizeof(tag); + return rpaligned_alloc(static_cast(align), size); +} + +#endif + +#endif diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 54bdb8750f709..6a1b06eea4309 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -4346,34 +4346,14 @@ bool AArch64DAGToDAGISel::SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm, ->getAPIntValue() .trunc(VT.getFixedSizeInBits()) .getSExtValue(); + int32_t ImmVal, ShiftVal; + if (!AArch64_AM::isSVECpyDupImm(VT.getScalarSizeInBits(), Val, ImmVal, + ShiftVal)) + return false; - switch (VT.SimpleTy) { - case MVT::i8: - // All immediates are supported. - Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); - Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32); - return true; - case MVT::i16: - case MVT::i32: - case MVT::i64: - // Support 8bit signed immediates. 
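The switch being deleted here hard-coded the SVE CPY/DUP immediate rules; the new code delegates to AArch64_AM::isSVECpyDupImm so the same check can be shared (it is reused by trySVESplat64 further down). A standalone sketch mirroring the removed logic, for reference only and not the in-tree implementation:

// Can Val be encoded as an SVE CPY/DUP immediate for an ElemBits-wide element?
// On success, Imm holds the 8-bit payload and Shift is 0 or 8.
static bool isSVECpyDupImmSketch(unsigned ElemBits, int64_t Val, int32_t &Imm,
                                 int32_t &Shift) {
  if (ElemBits == 8) { // i8: all immediates are supported
    Imm = Val & 0xFF;
    Shift = 0;
    return true;
  }
  if (Val >= -128 && Val <= 127) { // 8-bit signed immediate, LSL #0
    Imm = Val & 0xFF;
    Shift = 0;
    return true;
  }
  if (Val >= -32768 && Val <= 32512 && Val % 256 == 0) { // multiple of 256, LSL #8
    Imm = (Val >> 8) & 0xFF;
    Shift = 8;
    return true;
  }
  return false;
}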
- if (Val >= -128 && Val <= 127) { - Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); - Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32); - return true; - } - // Support 16bit signed immediates that are a multiple of 256. - if (Val >= -32768 && Val <= 32512 && Val % 256 == 0) { - Shift = CurDAG->getTargetConstant(8, DL, MVT::i32); - Imm = CurDAG->getTargetConstant((Val >> 8) & 0xFF, DL, MVT::i32); - return true; - } - break; - default: - break; - } - - return false; + Shift = CurDAG->getTargetConstant(ShiftVal, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32); + return true; } bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index cd7f0e719ad0c..899baa9c998ec 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2182,13 +2182,6 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT, return false; } -bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic( - const IntrinsicInst *I) const { - assert(I->getIntrinsicID() == Intrinsic::vector_partial_reduce_add && - "Unexpected intrinsic!"); - return true; -} - bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const { if (!Subtarget->isSVEorStreamingSVEAvailable()) return true; @@ -8093,13 +8086,76 @@ static SDValue getZT0FrameIndex(MachineFrameInfo &MFI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); } +// Emit a call to __arm_sme_save or __arm_sme_restore. +static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, + SelectionDAG &DAG, + AArch64FunctionInfo *Info, SDLoc DL, + SDValue Chain, bool IsSave) { + MachineFunction &MF = DAG.getMachineFunction(); + AArch64FunctionInfo *FuncInfo = MF.getInfo(); + FuncInfo->setSMESaveBufferUsed(); + TargetLowering::ArgListTy Args; + Args.emplace_back( + DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64), + PointerType::getUnqual(*DAG.getContext())); + + RTLIB::Libcall LC = + IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE; + SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), + TLI.getPointerTy(DAG.getDataLayout())); + auto *RetTy = Type::getVoidTy(*DAG.getContext()); + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( + TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); + return TLI.LowerCallTo(CLI).second; +} + +static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL, + const AArch64TargetLowering &TLI, + const AArch64RegisterInfo &TRI, + AArch64FunctionInfo &FuncInfo, + SelectionDAG &DAG) { + // Conditionally restore the lazy save using a pseudo node. + RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE; + TPIDR2Object &TPIDR2 = FuncInfo.getTPIDR2Obj(); + SDValue RegMask = DAG.getRegisterMask(TRI.getCallPreservedMask( + DAG.getMachineFunction(), TLI.getLibcallCallingConv(LC))); + SDValue RestoreRoutine = DAG.getTargetExternalSymbol( + TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); + SDValue TPIDR2_EL0 = DAG.getNode( + ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain, + DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); + // Copy the address of the TPIDR2 block into X0 before 'calling' the + // RESTORE_ZA pseudo. 
+ SDValue Glue; + SDValue TPIDR2Block = DAG.getFrameIndex( + TPIDR2.FrameIndex, + DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, TPIDR2Block, Glue); + Chain = + DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, + {Chain, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), + RestoreRoutine, RegMask, Chain.getValue(1)}); + // Finally reset the TPIDR2_EL0 register to 0. + Chain = DAG.getNode( + ISD::INTRINSIC_VOID, DL, MVT::Other, Chain, + DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i64)); + TPIDR2.Uses++; + return Chain; +} + SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL, SelectionDAG &DAG) const { assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value"); SDValue Glue = Chain.getValue(1); MachineFunction &MF = DAG.getMachineFunction(); - SMEAttrs SMEFnAttrs = MF.getInfo()->getSMEFnAttrs(); + auto &FuncInfo = *MF.getInfo(); + auto &Subtarget = DAG.getSubtarget(); + const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo(); + + SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs(); // The following conditions are true on entry to an exception handler: // - PSTATE.SM is 0. @@ -8114,14 +8170,43 @@ SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL, // These mode changes are usually optimized away in catch blocks as they // occur before the __cxa_begin_catch (which is a non-streaming function), // but are necessary in some cases (such as for cleanups). + // + // Additionally, if the function has ZA or ZT0 state, we must restore it. + // [COND_]SMSTART SM if (SMEFnAttrs.hasStreamingInterfaceOrBody()) - return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, - /*Glue*/ Glue, AArch64SME::Always); + Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, + /*Glue*/ Glue, AArch64SME::Always); + else if (SMEFnAttrs.hasStreamingCompatibleInterface()) + Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue, + AArch64SME::IfCallerIsStreaming); + + if (getTM().useNewSMEABILowering()) + return Chain; + + if (SMEFnAttrs.hasAgnosticZAInterface()) { + // Restore full ZA + Chain = emitSMEStateSaveRestore(*this, DAG, &FuncInfo, DL, Chain, + /*IsSave=*/false); + } else if (SMEFnAttrs.hasZAState() || SMEFnAttrs.hasZT0State()) { + // SMSTART ZA + Chain = DAG.getNode( + AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, + DAG.getTargetConstant(int32_t(AArch64SVCR::SVCRZA), DL, MVT::i32)); - if (SMEFnAttrs.hasStreamingCompatibleInterface()) - return changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue, - AArch64SME::IfCallerIsStreaming); + // Restore ZT0 + if (SMEFnAttrs.hasZT0State()) { + SDValue ZT0FrameIndex = + getZT0FrameIndex(MF.getFrameInfo(), FuncInfo, DAG); + Chain = + DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other), + {Chain, DAG.getConstant(0, DL, MVT::i32), ZT0FrameIndex}); + } + + // Restore ZA + if (SMEFnAttrs.hasZAState()) + Chain = emitRestoreZALazySave(Chain, DL, *this, TRI, FuncInfo, DAG); + } return Chain; } @@ -9239,30 +9324,6 @@ SDValue AArch64TargetLowering::changeStreamingMode( return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1)); } -// Emit a call to __arm_sme_save or __arm_sme_restore. 
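The lowerEHPadEntry changes above mean that, for functions with ZA or ZT0 state, a landing pad now re-enables ZA and restores the lazily saved contents before any handler code runs. An ACLE-level illustration of the kind of function affected (assumes an SME-capable toolchain with arm_sme.h; may_throw is a hypothetical private-ZA callee):

#include <arm_sme.h>

void may_throw(); // no ZA interface, so a lazy save of ZA is armed around the call

void handler_uses_za() __arm_new("za") {
  svzero_za(); // ZA is live in this function
  try {
    may_throw();
  } catch (...) {
    // On entry here PSTATE.SM is 0 and TPIDR2_EL0 is null; with this patch the
    // lowering emits SMSTART ZA plus the conditional TPIDR2 restore, so ZA is
    // valid again before this handler body executes.
  }
}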
-static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, - SelectionDAG &DAG, - AArch64FunctionInfo *Info, SDLoc DL, - SDValue Chain, bool IsSave) { - MachineFunction &MF = DAG.getMachineFunction(); - AArch64FunctionInfo *FuncInfo = MF.getInfo(); - FuncInfo->setSMESaveBufferUsed(); - TargetLowering::ArgListTy Args; - Args.emplace_back( - DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64), - PointerType::getUnqual(*DAG.getContext())); - - RTLIB::Libcall LC = - IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE; - SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), - TLI.getPointerTy(DAG.getDataLayout())); - auto *RetTy = Type::getVoidTy(*DAG.getContext()); - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( - TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); - return TLI.LowerCallTo(CLI).second; -} - static AArch64SME::ToggleCondition getSMToggleCondition(const SMECallAttrs &CallAttrs) { if (!CallAttrs.caller().hasStreamingCompatibleInterface() || @@ -10022,33 +10083,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx}); if (RequiresLazySave) { - // Conditionally restore the lazy save using a pseudo node. - RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE; - TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); - SDValue RegMask = DAG.getRegisterMask( - TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC))); - SDValue RestoreRoutine = DAG.getTargetExternalSymbol( - getLibcallName(LC), getPointerTy(DAG.getDataLayout())); - SDValue TPIDR2_EL0 = DAG.getNode( - ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, - DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); - // Copy the address of the TPIDR2 block into X0 before 'calling' the - // RESTORE_ZA pseudo. - SDValue Glue; - SDValue TPIDR2Block = DAG.getFrameIndex( - TPIDR2.FrameIndex, - DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); - Result = - DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, - {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), - RestoreRoutine, RegMask, Result.getValue(1)}); - // Finally reset the TPIDR2_EL0 register to 0. - Result = DAG.getNode( - ISD::INTRINSIC_VOID, DL, MVT::Other, Result, - DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i64)); - TPIDR2.Uses++; + Result = emitRestoreZALazySave(Result, DL, *this, *TRI, *FuncInfo, DAG); } else if (RequiresSaveAllZA) { Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result, /*IsSave=*/false); @@ -11743,6 +11778,28 @@ SDValue AArch64TargetLowering::LowerSELECT_CC( return DAG.getNode(ISD::AND, DL, VT, LHS, Shift); } + // Check for sign bit test patterns that can use TST optimization. 
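As a source-level illustration of the sign-bit combine introduced by this comment block: a select on the sign of a sign-extended narrow value no longer needs a full compare, only a test of the original sign bit (the noted assembly is the expected output, not guaranteed):

int select_on_sign(signed char x, int a, int b) {
  return x < 0 ? a : b;
  // expected lowering with this combine:
  //   tst  w0, #0x80        // test the sign bit of the original i8 value
  //   csel w0, w1, w2, ne
}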
+ // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval) + // -> TST %operand, sign_bit; CSEL + // (SELECT_CC setlt, sign_extend, 0, tval, fval) + // -> TST %operand, sign_bit; CSEL + if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() && + (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG || + LHS.getOpcode() == ISD::SIGN_EXTEND)) { + + uint64_t SignBitPos; + std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS); + EVT TestVT = LHS.getValueType(); + SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT); + SDValue TST = + DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32), + LHS, SignBitConst); + + SDValue Flags = TST.getValue(1); + return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal, + DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags); + } + // Canonicalise absolute difference patterns: // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc -> // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc @@ -15277,6 +15334,27 @@ static SDValue NormalizeBuildVector(SDValue Op, return DAG.getBuildVector(VT, DL, Ops); } +static SDValue trySVESplat64(SDValue Op, SelectionDAG &DAG, + const AArch64Subtarget *ST, APInt &DefBits) { + EVT VT = Op.getValueType(); + // TODO: We should be able to support 64-bit destinations too + if (!ST->hasSVE() || !VT.is128BitVector() || + DefBits.getHiBits(64) != DefBits.getLoBits(64)) + return SDValue(); + + // See if we can make use of the SVE dup instruction. + APInt Val64 = DefBits.trunc(64); + int32_t ImmVal, ShiftVal; + if (!AArch64_AM::isSVECpyDupImm(64, Val64.getSExtValue(), ImmVal, ShiftVal)) + return SDValue(); + + SDLoc DL(Op); + SDValue SplatVal = DAG.getSplatVector(MVT::nxv2i64, DL, + DAG.getConstant(Val64, DL, MVT::i64)); + SDValue Res = convertFromScalableVector(DAG, MVT::v2i64, SplatVal); + return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Res); +} + static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST) { EVT VT = Op.getValueType(); @@ -15316,6 +15394,10 @@ static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, if (SDValue R = TryMOVIWithBits(UndefBits)) return R; + // Try to materialise the constant using SVE when available. + if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits)) + return R; + // See if a fneg of the constant can be materialized with a MOVI, etc auto TryWithFNeg = [&](APInt DefBits, MVT FVT) { // FNegate each sub-element of the constant @@ -20467,6 +20549,69 @@ performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, } } + // Given an extract(load) or extract(extend(load)), produce a scalar load + // instead to avoid the cross-register-bank copies. + if (DCI.isAfterLegalizeDAG() && Subtarget->isLittleEndian() && + VT.isInteger() && isa(N1)) { + SDValue LoadN0 = N0; + // Look through sext/zext and extract_subvector / insert_subvector if + // required. + if ((N0.getOpcode() == ISD::ZERO_EXTEND || + N0.getOpcode() == ISD::SIGN_EXTEND || + N0.getOpcode() == ISD::ANY_EXTEND) && + N0.getOperand(0).hasOneUse()) + LoadN0 = N0.getOperand(0); + unsigned OffsetElts = 0; + if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + OffsetElts = LoadN0.getConstantOperandVal(1); + LoadN0 = LoadN0.getOperand(0); + } + if (LoadN0.getOpcode() == ISD::INSERT_SUBVECTOR && + LoadN0.getOperand(0).isUndef() && + isNullConstant(LoadN0.getOperand(2)) && + LoadN0.getOperand(1).hasOneUse()) + LoadN0 = LoadN0.getOperand(1); + + // Check all the uses are valid and can be scalarized. 
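The checks described in this comment gate a rewrite of extract(load) and extract(extend(load)) into a plain scalar load at the right offset on little-endian targets, avoiding the SIMD-to-GPR copy when the vector has no other uses that want it in a vector register. A source-level illustration (assumes arm_neon.h; the noted codegen is the expected outcome):

#include <arm_neon.h>

int32_t third_lane(const int32_t *p) {
  int32x4_t v = vld1q_s32(p);  // vector load whose only use is the lane extract
  return vgetq_lane_s32(v, 2); // expected to now select: ldr w0, [x0, #8]
}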
We check that all the + // uses are extracts and those extracts are not re-inserted into an + // operation best treated as a vector register. + auto Load = dyn_cast(LoadN0); + if (Load && Load->isSimple() && ISD::isNormalLoad(Load) && + Load->getMemoryVT().isByteSized() && + all_of(N0->uses(), [&](const SDUse &U) { + return U.getResNo() != N0.getResNo() || + (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT && + !any_of(U.getUser()->uses(), [](const SDUse &U2) { + return U2.getUser()->getOpcode() == + ISD::INSERT_VECTOR_ELT || + U2.getUser()->getOpcode() == ISD::BUILD_VECTOR || + U2.getUser()->getOpcode() == ISD::SCALAR_TO_VECTOR; + })); + })) { + + SDLoc DL(Load); + + // Generate a new scalar load. + unsigned Offset = (OffsetElts + N->getConstantOperandVal(1)) * + Load->getValueType(0).getScalarSizeInBits() / 8; + SDValue BasePtr = DAG.getObjectPtrOffset( + DL, Load->getBasePtr(), DAG.getConstant(Offset, DL, MVT::i64)); + ISD::LoadExtType ExtType = + N0.getOpcode() == ISD::ZERO_EXTEND + ? ISD::ZEXTLOAD + : (N0.getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD + : ISD::EXTLOAD); + SDValue ScalarLoad = + DAG.getExtLoad(ExtType, DL, VT, Load->getChain(), BasePtr, + Load->getPointerInfo().getWithOffset(Offset), + Load->getValueType(0).getScalarType(), + commonAlignment(Load->getAlign(), Offset), + Load->getMemOperand()->getFlags(), Load->getAAInfo()); + DAG.makeEquivalentMemoryOrdering(Load, ScalarLoad); + return ScalarLoad; + } + } + return SDValue(); } @@ -29325,7 +29470,7 @@ bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT, return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT); } -bool AArch64TargetLowering::shouldExpandCmpUsingSelects(EVT VT) const { +bool AArch64TargetLowering::preferSelectsOverBooleanArithmetic(EVT VT) const { // Expand scalar and SVE operations using selects. Neon vectors prefer sub to // avoid vselect becoming bsl / unrolling. 
return !VT.isFixedLengthVector(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index ff073d3eafb1f..d8072d15853ee 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -435,7 +435,7 @@ class AArch64TargetLowering : public TargetLowering { bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override; - bool shouldExpandCmpUsingSelects(EVT VT) const override; + bool preferSelectsOverBooleanArithmetic(EVT VT) const override; bool isComplexDeinterleavingSupported() const override; bool isComplexDeinterleavingOperationSupported( @@ -523,9 +523,6 @@ class AArch64TargetLowering : public TargetLowering { bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override; - bool - shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override; - bool shouldExpandCttzElements(EVT VT) const override; bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 78d683a4b4256..f07d3514d1a99 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -5301,28 +5301,29 @@ multiclass FPToIntegerUnscaled rmode, bits<3> opcode, string asm, } } -multiclass FPToIntegerSIMDScalar rmode, bits<3> opcode, string asm> { +multiclass FPToIntegerSIMDScalar rmode, bits<3> opcode, string asm, + SDPatternOperator OpN = null_frag> { // double-precision to 32-bit SIMD/FPR def SDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, FPR32, asm, - []> { + [(set FPR32:$Rd, (i32 (OpN (f64 FPR64:$Rn))))]> { let Inst{31} = 0; // 32-bit FPR flag } // half-precision to 32-bit SIMD/FPR def SHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, FPR32, asm, - []> { + [(set FPR32:$Rd, (i32 (OpN (f16 FPR16:$Rn))))]> { let Inst{31} = 0; // 32-bit FPR flag } // half-precision to 64-bit SIMD/FPR def DHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, FPR64, asm, - []> { + [(set FPR64:$Rd, (i64 (OpN (f16 FPR16:$Rn))))]> { let Inst{31} = 1; // 64-bit FPR flag } // single-precision to 64-bit SIMD/FPR def DSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, FPR64, asm, - []> { + [(set FPR64:$Rd, (i64 (OpN (f32 FPR32:$Rn))))]> { let Inst{31} = 1; // 64-bit FPR flag } } @@ -7940,14 +7941,18 @@ multiclass SIMDTwoScalarD opc, string asm, } } -let mayRaiseFPException = 1, Uses = [FPCR] in -multiclass SIMDFPTwoScalar opc, string asm> { +let mayRaiseFPException = 1, Uses = [FPCR], FastISelShouldIgnore = 1 in +multiclass SIMDFPTwoScalar opc, string asm, + SDPatternOperator OpN = null_frag> { let Predicates = [HasNEONandIsStreamingSafe] in { - def v1i64 : BaseSIMDTwoScalar; - def v1i32 : BaseSIMDTwoScalar; + def v1i64 : BaseSIMDTwoScalar; + def v1i32 : BaseSIMDTwoScalar; } let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in { - def v1f16 : BaseSIMDTwoScalar; + def v1f16 : BaseSIMDTwoScalar; } } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 980636c1b562b..f788c7510f80c 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -5231,18 +5231,19 @@ defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>; defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>; let Predicates = [HasNEON, HasFPRCVT] in{ - defm FCVTAS : FPToIntegerSIMDScalar<0b11, 0b010, "fcvtas">; 
- defm FCVTAU : FPToIntegerSIMDScalar<0b11, 0b011, "fcvtau">; - defm FCVTMS : FPToIntegerSIMDScalar<0b10, 0b100, "fcvtms">; - defm FCVTMU : FPToIntegerSIMDScalar<0b10, 0b101, "fcvtmu">; - defm FCVTNS : FPToIntegerSIMDScalar<0b01, 0b010, "fcvtns">; - defm FCVTNU : FPToIntegerSIMDScalar<0b01, 0b011, "fcvtnu">; - defm FCVTPS : FPToIntegerSIMDScalar<0b10, 0b010, "fcvtps">; - defm FCVTPU : FPToIntegerSIMDScalar<0b10, 0b011, "fcvtpu">; + defm FCVTAS : FPToIntegerSIMDScalar<0b11, 0b010, "fcvtas", int_aarch64_neon_fcvtas>; + defm FCVTAU : FPToIntegerSIMDScalar<0b11, 0b011, "fcvtau", int_aarch64_neon_fcvtau>; + defm FCVTMS : FPToIntegerSIMDScalar<0b10, 0b100, "fcvtms", int_aarch64_neon_fcvtms>; + defm FCVTMU : FPToIntegerSIMDScalar<0b10, 0b101, "fcvtmu", int_aarch64_neon_fcvtmu>; + defm FCVTNS : FPToIntegerSIMDScalar<0b01, 0b010, "fcvtns", int_aarch64_neon_fcvtns>; + defm FCVTNU : FPToIntegerSIMDScalar<0b01, 0b011, "fcvtnu", int_aarch64_neon_fcvtnu>; + defm FCVTPS : FPToIntegerSIMDScalar<0b10, 0b010, "fcvtps", int_aarch64_neon_fcvtps>; + defm FCVTPU : FPToIntegerSIMDScalar<0b10, 0b011, "fcvtpu", int_aarch64_neon_fcvtpu>; defm FCVTZS : FPToIntegerSIMDScalar<0b10, 0b110, "fcvtzs">; defm FCVTZU : FPToIntegerSIMDScalar<0b10, 0b111, "fcvtzu">; } + // AArch64's FCVT instructions saturate when out of range. multiclass FPToIntegerSatPats { let Predicates = [HasFullFP16] in { @@ -5309,35 +5310,6 @@ multiclass FPToIntegerSatPats; defm : FPToIntegerSatPats; -multiclass FPToIntegerIntPats { - let Predicates = [HasFullFP16] in { - def : Pat<(i32 (round f16:$Rn)), (!cast(INST # UWHr) $Rn)>; - def : Pat<(i64 (round f16:$Rn)), (!cast(INST # UXHr) $Rn)>; - } - def : Pat<(i32 (round f32:$Rn)), (!cast(INST # UWSr) $Rn)>; - def : Pat<(i64 (round f32:$Rn)), (!cast(INST # UXSr) $Rn)>; - def : Pat<(i32 (round f64:$Rn)), (!cast(INST # UWDr) $Rn)>; - def : Pat<(i64 (round f64:$Rn)), (!cast(INST # UXDr) $Rn)>; - - let Predicates = [HasFullFP16] in { - def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))), - (!cast(INST # SWHri) $Rn, $scale)>; - def : Pat<(i64 (round (fmul f16:$Rn, fixedpoint_f16_i64:$scale))), - (!cast(INST # SXHri) $Rn, $scale)>; - } - def : Pat<(i32 (round (fmul f32:$Rn, fixedpoint_f32_i32:$scale))), - (!cast(INST # SWSri) $Rn, $scale)>; - def : Pat<(i64 (round (fmul f32:$Rn, fixedpoint_f32_i64:$scale))), - (!cast(INST # SXSri) $Rn, $scale)>; - def : Pat<(i32 (round (fmul f64:$Rn, fixedpoint_f64_i32:$scale))), - (!cast(INST # SWDri) $Rn, $scale)>; - def : Pat<(i64 (round (fmul f64:$Rn, fixedpoint_f64_i64:$scale))), - (!cast(INST # SXDri) $Rn, $scale)>; -} - -defm : FPToIntegerIntPats; -defm : FPToIntegerIntPats; - multiclass FPToIntegerPats { def : Pat<(i32 (to_int (round f32:$Rn))), (!cast(INST # UWSr) f32:$Rn)>; @@ -6572,14 +6544,14 @@ defm FCMGE : SIMDFPCmpTwoScalar<1, 1, 0b01100, "fcmge", AArch64fcmgez>; defm FCMGT : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; defm FCMLE : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>; defm FCMLT : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; -defm FCVTAS : SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas">; -defm FCVTAU : SIMDFPTwoScalar< 1, 0, 0b11100, "fcvtau">; -defm FCVTMS : SIMDFPTwoScalar< 0, 0, 0b11011, "fcvtms">; -defm FCVTMU : SIMDFPTwoScalar< 1, 0, 0b11011, "fcvtmu">; -defm FCVTNS : SIMDFPTwoScalar< 0, 0, 0b11010, "fcvtns">; -defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu">; -defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps">; -defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">; +defm FCVTAS : 
SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas", int_aarch64_neon_fcvtas>; +defm FCVTAU : SIMDFPTwoScalar< 1, 0, 0b11100, "fcvtau", int_aarch64_neon_fcvtau>; +defm FCVTMS : SIMDFPTwoScalar< 0, 0, 0b11011, "fcvtms", int_aarch64_neon_fcvtms>; +defm FCVTMU : SIMDFPTwoScalar< 1, 0, 0b11011, "fcvtmu", int_aarch64_neon_fcvtmu>; +defm FCVTNS : SIMDFPTwoScalar< 0, 0, 0b11010, "fcvtns", int_aarch64_neon_fcvtns>; +defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu", int_aarch64_neon_fcvtnu>; +defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps", int_aarch64_neon_fcvtps>; +defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu", int_aarch64_neon_fcvtpu>; def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">; defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">; @@ -6600,6 +6572,86 @@ defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", int_aarch64_neon_usqadd>; +// Floating-point conversion patterns. +multiclass FPToIntegerSIMDScalarPatterns { + def : Pat<(f32 (bitconvert (i32 (OpN (f64 FPR64:$Rn))))), + (!cast(INST # SDr) FPR64:$Rn)>; + def : Pat<(f32 (bitconvert (i32 (OpN (f16 FPR16:$Rn))))), + (!cast(INST # SHr) FPR16:$Rn)>; + def : Pat<(f64 (bitconvert (i64 (OpN (f16 FPR16:$Rn))))), + (!cast(INST # DHr) FPR16:$Rn)>; + def : Pat<(f64 (bitconvert (i64 (OpN (f32 FPR32:$Rn))))), + (!cast(INST # DSr) FPR32:$Rn)>; + def : Pat<(f32 (bitconvert (i32 (OpN (f32 FPR32:$Rn))))), + (!cast(INST # v1i32) FPR32:$Rn)>; + def : Pat<(f64 (bitconvert (i64 (OpN (f64 FPR64:$Rn))))), + (!cast(INST # v1i64) FPR64:$Rn)>; + +} +defm: FPToIntegerSIMDScalarPatterns; +defm: FPToIntegerSIMDScalarPatterns; +defm: FPToIntegerSIMDScalarPatterns; +defm: FPToIntegerSIMDScalarPatterns; +defm: FPToIntegerSIMDScalarPatterns; +defm: FPToIntegerSIMDScalarPatterns; +defm: FPToIntegerSIMDScalarPatterns; +defm: FPToIntegerSIMDScalarPatterns; + +multiclass FPToIntegerIntPats { + let Predicates = [HasFullFP16] in { + def : Pat<(i32 (round f16:$Rn)), (!cast(INST # UWHr) $Rn)>; + def : Pat<(i64 (round f16:$Rn)), (!cast(INST # UXHr) $Rn)>; + } + def : Pat<(i32 (round f32:$Rn)), (!cast(INST # UWSr) $Rn)>; + def : Pat<(i64 (round f32:$Rn)), (!cast(INST # UXSr) $Rn)>; + def : Pat<(i32 (round f64:$Rn)), (!cast(INST # UWDr) $Rn)>; + def : Pat<(i64 (round f64:$Rn)), (!cast(INST # UXDr) $Rn)>; + + // For global-isel we can use register classes to determine + // which FCVT instruction to use. 
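Side note (not part of the patch): the llvm.aarch64.neon.fcvt* intrinsics wired into these patterns are what Clang typically emits for the rounding-mode-specific arm_neon.h conversions, so the FPR-typed patterns matter when such a conversion result is consumed by further FP code. A minimal C++ sketch, assuming the standard ACLE intrinsic names:

#include <arm_neon.h>
#include <cstdint>

// Round-to-nearest-ties-away float -> i32; normally lowers to
// llvm.aarch64.neon.fcvtas, which the patterns above can select into the
// FPR-to-FPR FCVTAS form when the result stays on the FP side.
int32_t round_away_s32(float32_t X) { return vcvtas_s32_f32(X); }

// Round-toward-minus-infinity double -> i64 (llvm.aarch64.neon.fcvtms).
int64_t floor_to_s64(float64_t X) { return vcvtmd_s64_f64(X); }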
+ let Predicates = [HasFPRCVT] in { + def : Pat<(i32 (round f16:$Rn)), (!cast(INST # SHr) $Rn)>; + def : Pat<(i64 (round f16:$Rn)), (!cast(INST # DHr) $Rn)>; + def : Pat<(i64 (round f32:$Rn)), (!cast(INST # DSr) $Rn)>; + def : Pat<(i32 (round f64:$Rn)), (!cast(INST # SDr) $Rn)>; + } + def : Pat<(i32 (round f32:$Rn)), (!cast(INST # v1i32) $Rn)>; + def : Pat<(i64 (round f64:$Rn)), (!cast(INST # v1i64) $Rn)>; + + let Predicates = [HasFPRCVT] in { + def : Pat<(f32 (bitconvert (i32 (round f16:$Rn)))), + (!cast(INST # SHr) $Rn)>; + def : Pat<(f64 (bitconvert (i64 (round f16:$Rn)))), + (!cast(INST # DHr) $Rn)>; + def : Pat<(f64 (bitconvert (i64 (round f32:$Rn)))), + (!cast(INST # DSr) $Rn)>; + def : Pat<(f32 (bitconvert (i32 (round f64:$Rn)))), + (!cast(INST # SDr) $Rn)>; + } + def : Pat<(f32 (bitconvert (i32 (round f32:$Rn)))), + (!cast(INST # v1i32) $Rn)>; + def : Pat<(f64 (bitconvert (i64 (round f64:$Rn)))), + (!cast(INST # v1i64) $Rn)>; + + let Predicates = [HasFullFP16] in { + def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))), + (!cast(INST # SWHri) $Rn, $scale)>; + def : Pat<(i64 (round (fmul f16:$Rn, fixedpoint_f16_i64:$scale))), + (!cast(INST # SXHri) $Rn, $scale)>; + } + def : Pat<(i32 (round (fmul f32:$Rn, fixedpoint_f32_i32:$scale))), + (!cast(INST # SWSri) $Rn, $scale)>; + def : Pat<(i64 (round (fmul f32:$Rn, fixedpoint_f32_i64:$scale))), + (!cast(INST # SXSri) $Rn, $scale)>; + def : Pat<(i32 (round (fmul f64:$Rn, fixedpoint_f64_i32:$scale))), + (!cast(INST # SWDri) $Rn, $scale)>; + def : Pat<(i64 (round (fmul f64:$Rn, fixedpoint_f64_i64:$scale))), + (!cast(INST # SXDri) $Rn, $scale)>; +} + +defm : FPToIntegerIntPats; +defm : FPToIntegerIntPats; + // f16 -> s16 conversions let Predicates = [HasFullFP16] in { def : Pat<(i16(fp_to_sint_sat_gi f16:$Rn)), (FCVTZSv1f16 f16:$Rn)>; @@ -9855,8 +9907,14 @@ def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), (v4bf16 (REV64v4i16 FPR64:$src))>; } -def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; -def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), + (v4f16 FPR64:$src)>; +def : Pat<(v4f16 (bitconvert (v4bf16 FPR64:$src))), + (v4f16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), + (v4bf16 FPR64:$src)>; +def : Pat<(v4bf16 (bitconvert (v4f16 FPR64:$src))), + (v4bf16 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; @@ -10184,8 +10242,14 @@ def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))), def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))), (v8bf16 (REV32v8i16 FPR128:$src))>; } -def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; -def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), + (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), + (v8bf16 FPR128:$src)>; +def : Pat<(v8f16 (bitconvert (v8bf16 FPR128:$src))), + (v8f16 FPR128:$src)>; +def : Pat<(v8bf16 (bitconvert (v8f16 FPR128:$src))), + (v8bf16 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 7fe4f7acdbd49..1e30735b7a56a 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -709,13 +709,13 
@@ let Predicates = [HasSVE_or_SME] in { let Predicates = [HasSVE_or_SME] in { def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)), - (UDOT_ZZZ_S $Acc, $MulLHS, $MulRHS)>; + (UDOT_ZZZ_BtoS $Acc, $MulLHS, $MulRHS)>; def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)), - (SDOT_ZZZ_S $Acc, $MulLHS, $MulRHS)>; + (SDOT_ZZZ_BtoS $Acc, $MulLHS, $MulRHS)>; def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)), - (UDOT_ZZZ_D $Acc, $MulLHS, $MulRHS)>; + (UDOT_ZZZ_HtoD $Acc, $MulLHS, $MulRHS)>; def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)), - (SDOT_ZZZ_D $Acc, $MulLHS, $MulRHS)>; + (SDOT_ZZZ_HtoD $Acc, $MulLHS, $MulRHS)>; } // End HasSVE_or_SME defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>; @@ -2541,7 +2541,7 @@ let Predicates = [HasBF16, HasSVE_or_SME] in { } // End HasBF16, HasSVE_or_SME let Predicates = [HasBF16, HasSVE] in { - defm BFMMLA_ZZZ : sve_fp_matrix_mla<0b01, "bfmmla", ZPR32, ZPR16, int_aarch64_sve_bfmmla, nxv4f32, nxv8bf16>; + defm BFMMLA_ZZZ_HtoS : sve_fp_matrix_mla<0b01, "bfmmla", ZPR32, ZPR16, int_aarch64_sve_bfmmla, nxv4f32, nxv8bf16>; } // End HasBF16, HasSVE let Predicates = [HasBF16, HasSVE_or_SME] in { diff --git a/llvm/lib/Target/AArch64/AArch64SchedA320.td b/llvm/lib/Target/AArch64/AArch64SchedA320.td index 5ec95c707c28f..2c193e59cc417 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA320.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA320.td @@ -826,13 +826,13 @@ def : InstRW<[CortexA320MCWrite<15, 12, CortexA320UnitVMC>], (instregex "^[SU]DI def : InstRW<[CortexA320MCWrite<26, 23, CortexA320UnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_D")>; // Dot product, 8 bit -def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_S")>; +def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_BtoS")>; // Dot product, 8 bit, using signed and unsigned integers def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>; // Dot product, 16 bit -def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_D")>; +def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_HtoD")>; // Duplicate, immediate and indexed form def : InstRW<[CortexA320Write<3, CortexA320UnitVALU>], (instregex "^DUP_ZI_[BHSD]", @@ -1182,7 +1182,7 @@ def : InstRW<[CortexA320Write<4, CortexA320UnitVALU>], (instrs BFCVT_ZPmZ, BFCVT def : InstRW<[CortexA320Write_11cyc_1VMAC_1VALU], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; // Matrix multiply accumulate -def : InstRW<[CortexA320Write_16cyc_1VMAC_1VALU], (instrs BFMMLA_ZZZ)>; +def : InstRW<[CortexA320Write_16cyc_1VMAC_1VALU], (instrs BFMMLA_ZZZ_HtoS)>; // Multiply accumulate long def : InstRW<[CortexA320Write<4, CortexA320UnitVMAC>], (instregex "^BFMLAL[BT]_ZZZ(I)?")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedA510.td b/llvm/lib/Target/AArch64/AArch64SchedA510.td index 356e3fa39c53f..66f49f040ad12 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA510.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA510.td @@ -804,13 +804,13 @@ def : InstRW<[CortexA510MCWrite<15, 12, CortexA510UnitVMC>], (instregex "^[SU]DI def : InstRW<[CortexA510MCWrite<26, 23, CortexA510UnitVMC>], (instregex "^[SU]DIVR?_(ZPmZ|ZPZZ)_D")>; // Dot product, 8 bit -def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_S")>; +def : InstRW<[CortexA510Write<4, 
CortexA510UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_BtoS")>; // Dot product, 8 bit, using signed and unsigned integers def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>; // Dot product, 16 bit -def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_D")>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^[SU]DOT_ZZZI?_HtoD")>; // Duplicate, immediate and indexed form def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^DUP_ZI_[BHSD]", @@ -1160,7 +1160,7 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs BFCVT_ZPmZ, BFCVT def : InstRW<[A510Write_10cyc_1VMAC_1VALU], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; // Matrix multiply accumulate -def : InstRW<[A510Write_15cyc_1VMAC_1VALU], (instrs BFMMLA_ZZZ)>; +def : InstRW<[A510Write_15cyc_1VMAC_1VALU], (instrs BFMMLA_ZZZ_HtoS)>; // Multiply accumulate long def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^BFMLAL[BT]_ZZZ(I)?")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td index e7982226ff3d1..50f10114989d0 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td @@ -1764,13 +1764,13 @@ def : InstRW<[N2Write_20c_1V0], (instregex "^[SU]DIVR?_ZPmZ_D", "^[SU]DIV_ZPZZ_D")>; // Dot product, 8 bit -def : InstRW<[N2Write_3c_1V], (instregex "^[SU]DOT_ZZZI?_S$")>; +def : InstRW<[N2Write_3c_1V], (instregex "^[SU]DOT_ZZZI?_BtoS$")>; // Dot product, 8 bit, using signed and unsigned integers def : InstRW<[N2Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>; // Dot product, 16 bit -def : InstRW<[N2Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_D$")>; +def : InstRW<[N2Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_HtoD$")>; // Duplicate, immediate and indexed form def : InstRW<[N2Write_2c_1V], (instregex "^DUP_ZI_[BHSD]$", @@ -2118,7 +2118,7 @@ def : InstRW<[N2Write_3c_1V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; def : InstRW<[N2Write_4c_1V], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; // Matrix multiply accumulate -def : InstRW<[N2Write_5c_1V], (instrs BFMMLA_ZZZ)>; +def : InstRW<[N2Write_5c_1V], (instrs BFMMLA_ZZZ_HtoS)>; // Multiply accumulate long def : InstRW<[N2Write_4c_1V], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td index cd0d8a9186d5b..411b372a3f533 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td @@ -1736,13 +1736,13 @@ def : InstRW<[N3Write_16c_16V0], (instregex "^[SU]DIVR?_ZPmZ_D", "^[SU]DIV_ZPZZ_D")>; // Dot product, 8 bit -def : InstRW<[N3Write_3c_1V], (instregex "^[SU]DOT_ZZZI?_S$")>; +def : InstRW<[N3Write_3c_1V], (instregex "^[SU]DOT_ZZZI?_BtoS$")>; // Dot product, 8 bit, using signed and unsigned integers def : InstRW<[N3Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>; // Dot product, 16 bit -def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_D$")>; +def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_HtoD$")>; // Duplicate, immediate and indexed form def : InstRW<[N3Write_2c_1V], (instregex "^DUP_ZI_[BHSD]$", @@ -2082,7 +2082,7 @@ def : InstRW<[N3Write_4c_2V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; def : InstRW<[N3Write_4c_1V], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; // Matrix multiply accumulate -def : InstRW<[N3Write_5c_1V], (instrs BFMMLA_ZZZ)>; +def : InstRW<[N3Write_5c_1V], (instrs BFMMLA_ZZZ_HtoS)>; // Multiply 
accumulate long def : InstRW<[N3Write_4c_1V], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td index f28df44bfdb38..3cbfc59423c9a 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td @@ -1555,14 +1555,14 @@ def : InstRW<[V1Write_20c7_1V0], (instregex "^[SU]DIVR?_ZPmZ_D", "^[SU]DIV_ZPZZ_D")>; // Dot product, 8 bit -def : InstRW<[V1Wr_ZDOTB, V1Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_S$")>; +def : InstRW<[V1Wr_ZDOTB, V1Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_BtoS$")>; // Dot product, 8 bit, using signed and unsigned integers def : InstRW<[V1Wr_ZUDOTB, V1Rd_ZUDOTB], (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>; // Dot product, 16 bit -def : InstRW<[V1Wr_ZDOTH, V1Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_D$")>; +def : InstRW<[V1Wr_ZDOTH, V1Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_HtoD$")>; // Duplicate, immediate and indexed form def : InstRW<[V1Write_2c_1V01], (instregex "^DUP_ZI_[BHSD]$", @@ -1808,7 +1808,7 @@ def : InstRW<[V1Write_4c_1V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; def : InstRW<[V1Wr_ZBFDOT, V1Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; // Matrix multiply accumulate -def : InstRW<[V1Wr_ZBFMMA, V1Rd_ZBFMMA], (instrs BFMMLA_ZZZ)>; +def : InstRW<[V1Wr_ZBFMMA, V1Rd_ZBFMMA], (instrs BFMMLA_ZZZ_HtoS)>; // Multiply accumulate long def : InstRW<[V1Wr_ZBFMAL, V1Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td index 6261220082029..bdde8e388cccc 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td @@ -2251,13 +2251,13 @@ def : InstRW<[V2Write_20c_1V0], (instregex "^[SU]DIVR?_ZPmZ_D", "^[SU]DIV_ZPZZ_D")>; // Dot product, 8 bit -def : InstRW<[V2Wr_ZDOTB, V2Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_S")>; +def : InstRW<[V2Wr_ZDOTB, V2Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_BtoS")>; // Dot product, 8 bit, using signed and unsigned integers def : InstRW<[V2Wr_ZDOTB, V2Rd_ZDOTB], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>; // Dot product, 16 bit -def : InstRW<[V2Wr_ZDOTH, V2Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_D")>; +def : InstRW<[V2Wr_ZDOTH, V2Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_HtoD")>; // Duplicate, immediate and indexed form def : InstRW<[V2Write_2c_1V], (instregex "^DUP_ZI_[BHSD]", @@ -2614,7 +2614,7 @@ def : InstRW<[V2Write_4c_1V02], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; def : InstRW<[V2Wr_ZBFDOT, V2Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; // Matrix multiply accumulate -def : InstRW<[V2Wr_ZBFMMA, V2Rd_ZBFMMA], (instrs BFMMLA_ZZZ)>; +def : InstRW<[V2Wr_ZBFMMA, V2Rd_ZBFMMA], (instrs BFMMLA_ZZZ_HtoS)>; // Multiply accumulate long def : InstRW<[V2Wr_ZBFMAL, V2Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZI?")>; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 5748556d07285..96cc3f3cac91c 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -2914,10 +2914,10 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { } if (OpFlags & AArch64II::MO_GOT) { - I.setDesc(TII.get(MF.getInfo()->hasELFSignedGOT() - ? AArch64::LOADgotAUTH - : AArch64::LOADgot)); + bool IsGOTSigned = MF.getInfo()->hasELFSignedGOT(); + I.setDesc(TII.get(IsGOTSigned ? 
AArch64::LOADgotAUTH : AArch64::LOADgot)); I.getOperand(1).setTargetFlags(OpFlags); + I.addImplicitDefUseOperands(MF); } else if (TM.getCodeModel() == CodeModel::Large && !TM.isPositionIndependent()) { // Materialize the global using movz/movk instructions. diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index ea2196a584127..7ee54c5932b15 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -444,7 +444,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // Regardless of FP16 support, widen 16-bit elements to 32-bits. .minScalar(0, s32) .libcallFor({s32, s64, s128}); - getActionDefinitionsBuilder(G_FPOWI) + getActionDefinitionsBuilder({G_FPOWI, G_FLDEXP}) .scalarize(0) .minScalar(0, s32) .libcallFor({{s32, s32}, {s64, s32}, {s128, s32}}); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index eeb34e12993b9..f90bcc7a77cdf 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -573,9 +573,7 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI, case Intrinsic::aarch64_neon_fcvtnu: case Intrinsic::aarch64_neon_fcvtps: case Intrinsic::aarch64_neon_fcvtpu: - // Force FPR register bank for half types, as those types otherwise - // don't get legalized correctly resulting in fp16 <-> gpr32 COPY's. - return MRI.getType(MI.getOperand(2).getReg()) == LLT::float16(); + return true; default: break; } @@ -1148,6 +1146,34 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_INTRINSIC: case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: { switch (cast(MI).getIntrinsicID()) { + case Intrinsic::aarch64_neon_fcvtas: + case Intrinsic::aarch64_neon_fcvtau: + case Intrinsic::aarch64_neon_fcvtzs: + case Intrinsic::aarch64_neon_fcvtzu: + case Intrinsic::aarch64_neon_fcvtms: + case Intrinsic::aarch64_neon_fcvtmu: + case Intrinsic::aarch64_neon_fcvtns: + case Intrinsic::aarch64_neon_fcvtnu: + case Intrinsic::aarch64_neon_fcvtps: + case Intrinsic::aarch64_neon_fcvtpu: { + OpRegBankIdx[2] = PMI_FirstFPR; + if (MRI.getType(MI.getOperand(0).getReg()).isVector()) { + OpRegBankIdx[0] = PMI_FirstFPR; + break; + } + TypeSize DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); + TypeSize SrcSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, TRI); + if (((DstSize == SrcSize) || STI.hasFeature(AArch64::FeatureFPRCVT)) && + all_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()), + [&](const MachineInstr &UseMI) { + return onlyUsesFP(UseMI, MRI, TRI) || + prefersFPUse(UseMI, MRI, TRI); + })) + OpRegBankIdx[0] = PMI_FirstFPR; + else + OpRegBankIdx[0] = PMI_FirstGPR; + break; + } case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: case Intrinsic::aarch64_neon_vcvtfp2fxs: diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h index f542592d22c5f..4ae5d040d5e8a 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h @@ -871,6 +871,36 @@ inline static bool isAnyMOVWMovAlias(uint64_t Value, int RegWidth) { return isAnyMOVZMovAlias(Value, RegWidth); } +static inline bool isSVECpyDupImm(int SizeInBits, int64_t Val, int32_t &Imm, + 
int32_t &Shift) { + switch (SizeInBits) { + case 8: + // All immediates are supported. + Shift = 0; + Imm = Val & 0xFF; + return true; + case 16: + case 32: + case 64: + // Support 8bit signed immediates. + if (Val >= -128 && Val <= 127) { + Shift = 0; + Imm = Val & 0xFF; + return true; + } + // Support 16bit signed immediates that are a multiple of 256. + if (Val >= -32768 && Val <= 32512 && Val % 256 == 0) { + Shift = 8; + Imm = (Val >> 8) & 0xFF; + return true; + } + break; + default: + break; + } + return false; +} + } // end namespace AArch64_AM } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 6e5a69030dbc6..21ff55e9d9a7f 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -25,7 +25,6 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Casting.h" #include "llvm/Support/EndianStream.h" -#include "llvm/Support/ErrorHandling.h" #include #include diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp index 85cca1de47b78..2a563663a34d1 100644 --- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp @@ -184,6 +184,17 @@ bool SMEPeepholeOpt::optimizeStartStopPairs( isSVERegOp(TRI, MRI, MI.getOperand(1))) Prev = nullptr; break; + case AArch64::RestoreZAPseudo: + case AArch64::InOutZAUsePseudo: + case AArch64::CommitZASavePseudo: + case AArch64::SMEStateAllocPseudo: + case AArch64::RequiresZASavePseudo: + // These instructions only depend on the ZA state, not the streaming mode, + // so if the pair of smstart/stop is only changing the streaming mode, we + // can permit these instructions. 
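For clarity on the isSVECpyDupImm helper added in the AArch64AddressingModes.h hunk above (a hedged usage sketch, not part of the patch): for 16/32/64-bit elements it accepts 8-bit signed immediates unshifted, plus multiples of 256 in the 16-bit signed range with an 8-bit shift, and rejects everything else; 8-bit elements accept any value truncated to a byte.

#include "MCTargetDesc/AArch64AddressingModes.h" // include path as used inside the AArch64 target
#include <cassert>
#include <cstdint>

static void demoSVECpyDupImm() {
  int32_t Imm, Shift;
  // 32-bit elements, -3: fits in a signed byte, so no shift.
  assert(AArch64_AM::isSVECpyDupImm(32, -3, Imm, Shift) && Shift == 0 && Imm == 0xFD);
  // 32-bit elements, 4608 = 18 * 256: encoded as Imm = 18, Shift = 8.
  assert(AArch64_AM::isSVECpyDupImm(32, 4608, Imm, Shift) && Shift == 8 && Imm == 18);
  // 32-bit elements, 300: neither a signed byte nor a multiple of 256.
  assert(!AArch64_AM::isSVECpyDupImm(32, 300, Imm, Shift));
}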
+ if (Prev->getOperand(0).getImm() != AArch64SVCR::SVCRSM) + Prev = nullptr; + break; case AArch64::ADJCALLSTACKDOWN: case AArch64::ADJCALLSTACKUP: case AArch64::ANDXri: diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 7913e8ca8652e..9a23c35766cac 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -3748,18 +3748,18 @@ multiclass sve2_int_mla_long_by_indexed_elem opc, string asm, // SVE Integer Dot Product Group //===----------------------------------------------------------------------===// -class sve_intx_dot +class sve_intx_dot sz, bits<5> op5, bit U, string asm, + ZPRRegOp zprty1, ZPRRegOp zprty2> : I<(outs zprty1:$Zda), (ins zprty1:$_Zda, zprty2:$Zn, zprty2:$Zm), asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { bits<5> Zda; bits<5> Zn; bits<5> Zm; - let Inst{31-23} = 0b010001001; - let Inst{22} = sz; + let Inst{31-24} = 0b01000100; + let Inst{23-22} = sz; let Inst{21} = 0; let Inst{20-16} = Zm; - let Inst{15-11} = 0; + let Inst{15-11} = op5; let Inst{10} = U; let Inst{9-5} = Zn; let Inst{4-0} = Zda; @@ -3770,11 +3770,17 @@ class sve_intx_dot { - def _S : sve_intx_dot<0b0, opc, asm, ZPR32, ZPR8>; - def _D : sve_intx_dot<0b1, opc, asm, ZPR64, ZPR16>; + def _BtoS : sve_intx_dot<0b10, 0b00000, opc, asm, ZPR32, ZPR8>; + def _HtoD : sve_intx_dot<0b11, 0b00000, opc, asm, ZPR64, ZPR16>; + + def : SVE_3_Op_Pat(NAME # _BtoS)>; + def : SVE_3_Op_Pat(NAME # _HtoD)>; +} - def : SVE_3_Op_Pat(NAME # _S)>; - def : SVE_3_Op_Pat(NAME # _D)>; +multiclass sve2p1_two_way_dot_vv { + def NAME : sve_intx_dot<0b00, 0b11001, u, mnemonic, ZPR32, ZPR16>; + + def : SVE_3_Op_Pat(NAME)>; } //===----------------------------------------------------------------------===// @@ -3804,21 +3810,21 @@ class sve_intx_dot_by_indexed_elem { - def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b_timm> { + def _BtoS : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b_timm> { bits<2> iop; bits<3> Zm; let Inst{20-19} = iop; let Inst{18-16} = Zm; } - def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b_timm> { + def _HtoD : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b_timm> { bits<1> iop; bits<4> Zm; let Inst{20} = iop; let Inst{19-16} = Zm; } - def : SVE_4_Op_Imm_Pat(NAME # _S)>; - def : SVE_4_Op_Imm_Pat(NAME # _D)>; + def : SVE_4_Op_Imm_Pat(NAME # _BtoS)>; + def : SVE_4_Op_Imm_Pat(NAME # _HtoD)>; } //===----------------------------------------------------------------------===// @@ -9893,32 +9899,6 @@ multiclass sve_fp_clamp_bfloat { def : SVE_3_Op_Pat(NAME)>; } -// SVE two-way dot product -class sve2p1_two_way_dot_vv - : I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm), - mnemonic, "\t$Zda, $Zn, $Zm", - "", []>, Sched<[]> { - bits<5> Zda; - bits<5> Zn; - bits<5> Zm; - let Inst{31-21} = 0b01000100000; - let Inst{20-16} = Zm; - let Inst{15-11} = 0b11001; - let Inst{10} = u; - let Inst{9-5} = Zn; - let Inst{4-0} = Zda; - - let Constraints = "$Zda = $_Zda"; - let DestructiveInstType = DestructiveOther; - let hasSideEffects = 0; -} - -multiclass sve2p1_two_way_dot_vv { - def NAME : sve2p1_two_way_dot_vv; - - def : SVE_3_Op_Pat(NAME)>; -} - // SVE two-way dot product (indexed) class sve2p1_two_way_dot_vvi : I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS32b:$i2), diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td 
index 23339b2ad228e..eaa1870f4be28 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1238,6 +1238,19 @@ def FeatureSetPrioIncWgInst : SubtargetFeature<"setprio-inc-wg-inst", // Subtarget Features (options and debugging) //===------------------------------------------------------------===// +// Ugly hack to accomodate assembling modules with mixed +// wavesizes. Ideally we would have a mapping symbol in assembly which +// would keep track of which sections of code should be treated as +// wave32 and wave64. Instead what users do is assemble with both +// wavesizes enabled. We translate this into this special mode so this +// only influences assembler behavior and nothing else. +def FeatureAssemblerPermissiveWavesize : SubtargetFeature< + "assembler-permissive-wavesize", + "AssemblerPermissiveWavesize", + "true", + "allow parsing wave32 and wave64 variants of instructions" +>; + class FeatureMaxPrivateElementSize : SubtargetFeature< "max-private-element-size-"#size, "MaxPrivateElementSize", @@ -1443,6 +1456,12 @@ def FeatureLdsBarrierArriveAtomic : SubtargetFeature< "lds-barrier-arrive-atomic "Has LDS barrier-arrive atomic instructions" >; +def Feature45BitNumRecordsBufferResource : SubtargetFeature< "45-bit-num-records-buffer-resource", + "Has45BitNumRecordsBufferResource", + "true", + "The buffer resource (V#) supports 45-bit num_records" +>; + // Dummy feature used to disable assembler instructions. def FeatureDisable : SubtargetFeature<"", "FeatureDisable","true", @@ -2106,6 +2125,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureMadU32Inst, FeatureLdsBarrierArriveAtomic, FeatureSetPrioIncWgInst, + Feature45BitNumRecordsBufferResource, ]>; def FeatureISAVersion12_51 : FeatureSet< diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index d4210b8bc9a87..2192a72bb27b7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1089,10 +1089,17 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) { for (SDNode::user_iterator UI = N->user_begin(), E = N->user_end(); UI != E; ++UI) if (UI.getUse().getResNo() == 1) { - if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) || - (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) { - IsVALU = true; - break; + if (UI->isMachineOpcode()) { + if (UI->getMachineOpcode() != + (IsAdd ? AMDGPU::S_ADD_CO_PSEUDO : AMDGPU::S_SUB_CO_PSEUDO)) { + IsVALU = true; + break; + } + } else { + if (UI->getOpcode() != (IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY)) { + IsVALU = true; + break; + } } } @@ -4078,18 +4085,26 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, // register. Mods |= SISrcMods::OP_SEL_1; - if (IsExtractHigh || - (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) { - Mods |= SISrcMods::OP_SEL_0; + if (Src.getValueSizeInBits() == 16) { + if (isExtractHiElt(Src, Src)) { + Mods |= SISrcMods::OP_SEL_0; - // TODO: Should we try to look for neg/abs here? - } + // TODO: Should we try to look for neg/abs here? 
+ return true; + } + + if (Src.getOpcode() == ISD::TRUNCATE && + Src.getOperand(0).getValueType() == MVT::i32) { + Src = Src.getOperand(0); + return true; + } + + if (Subtarget->useRealTrue16Insts()) + // In true16 mode, pack src to a 32bit + Src = createVOP3PSrc32FromLo16(Src, In, CurDAG, Subtarget); + } else if (IsExtractHigh) + Mods |= SISrcMods::OP_SEL_0; - // Prevent unnecessary subreg COPY to VGPR_16 - if (Src.getOpcode() == ISD::TRUNCATE && - Src.getOperand(0).getValueType() == MVT::i32) { - Src = Src.getOperand(0); - } return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index f069b591eb315..a44af5f854c18 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5287,30 +5287,6 @@ SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N, return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0)); } -bool AMDGPUTargetLowering::isInt64ImmLegal(SDNode *N, SelectionDAG &DAG) const { - if (!Subtarget->isGCN()) - return false; - - ConstantSDNode *SDConstant = dyn_cast(N); - ConstantFPSDNode *SDFPConstant = dyn_cast(N); - auto &ST = DAG.getSubtarget(); - const auto *TII = ST.getInstrInfo(); - - if (!ST.hasMovB64() || (!SDConstant && !SDFPConstant)) - return false; - - if (ST.has64BitLiterals()) - return true; - - if (SDConstant) { - const APInt &APVal = SDConstant->getAPIntValue(); - return isUInt<32>(APVal.getZExtValue()) || TII->isInlineConstant(APVal); - } - - APInt Val = SDFPConstant->getValueAPF().bitcastToAPInt(); - return isUInt<32>(Val.getZExtValue()) || TII->isInlineConstant(Val); -} - SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -5360,8 +5336,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, SDValue Src = N->getOperand(0); if (ConstantSDNode *C = dyn_cast(Src)) { SDLoc SL(N); - if (isInt64ImmLegal(C, DAG)) - break; uint64_t CVal = C->getZExtValue(); SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, DAG.getConstant(Lo_32(CVal), SL, MVT::i32), @@ -5372,8 +5346,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, if (ConstantFPSDNode *C = dyn_cast(Src)) { const APInt &Val = C->getValueAPF().bitcastToAPInt(); SDLoc SL(N); - if (isInt64ImmLegal(C, DAG)) - break; uint64_t CVal = Val.getZExtValue(); SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, DAG.getConstant(Lo_32(CVal), SL, MVT::i32), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 610f0ebb4caa5..bdaf48652d107 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -103,9 +103,6 @@ class AMDGPUTargetLowering : public TargetLowering { SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; protected: - /// Check whether value Val can be supported by v_mov_b64, for the current - /// target. 
- bool isInt64ImmLegal(SDNode *Val, SelectionDAG &DAG) const; bool shouldCombineMemoryType(EVT VT) const; SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index c690b2b7129b4..ee466ca20bde3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5905,33 +5905,50 @@ bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin( Register Flags = MI.getOperand(5).getReg(); LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); B.setInsertPt(B.getMBB(), ++B.getInsertPt()); - auto Unmerge = B.buildUnmerge(S32, Pointer); - Register LowHalf = Unmerge.getReg(0); - Register HighHalf = Unmerge.getReg(1); - - auto AndMask = B.buildConstant(S32, 0x0000ffff); - auto Masked = B.buildAnd(S32, HighHalf, AndMask); - - MachineInstrBuilder NewHighHalf = Masked; - std::optional StrideConst = - getIConstantVRegValWithLookThrough(Stride, MRI); - if (!StrideConst || !StrideConst->Value.isZero()) { - MachineInstrBuilder ShiftedStride; - if (StrideConst) { - uint32_t StrideVal = StrideConst->Value.getZExtValue(); - uint32_t ShiftedStrideVal = StrideVal << 16; - ShiftedStride = B.buildConstant(S32, ShiftedStrideVal); - } else { - auto ExtStride = B.buildAnyExt(S32, Stride); - auto ShiftConst = B.buildConstant(S32, 16); - ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst); - } - NewHighHalf = B.buildOr(S32, Masked, ShiftedStride); + + auto ExtStride = B.buildAnyExt(S32, Stride); + + if (ST.has45BitNumRecordsBufferResource()) { + Register Zero = B.buildConstant(S32, 0).getReg(0); + // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit + // num_records. + LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits()); + auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer); + auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt); + auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57)); + Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0); + + // Build the higher 64-bit value, which has the higher 38-bit num_records, + // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag. 
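To make the new 45-bit num_records resource layout easier to follow, here is a stand-alone sketch of the packing the surrounding legalizer code assembles with buildShl/buildOr (field positions taken from the comments in this hunk; the helper name is illustrative, not part of the patch):

#include <cstdint>
#include <utility>

// 128-bit V# as two 64-bit words when 45-bit num_records is supported:
//   bits [56:0]    57-bit base address
//   bits [101:57]  45-bit num_records (7 bits in word 0, 38 bits in word 1)
//   bits [107:102] zero
//   bits [123:108] 16-bit stride (and scale)
//   bits [127:124] 4-bit flags
static std::pair<uint64_t, uint64_t>
packBufferRsrc45(uint64_t Base57, uint64_t NumRecords45, uint16_t Stride,
                 uint8_t Flags4) {
  uint64_t Lo = Base57 | (NumRecords45 << 57);
  uint64_t Hi = (NumRecords45 >> 7) | (uint64_t(Stride) << (108 - 64)) |
                (uint64_t(Flags4) << (124 - 64));
  return {Lo, Hi};
}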
+ auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7)); + auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12)); + auto ExtShiftedStride = + B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)}); + auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28)); + auto ExtShiftedFlags = + B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)}); + auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride); + Register HighHalf = + B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0); + B.buildMergeValues(Result, {LowHalf, HighHalf}); + } else { + NumRecords = B.buildTrunc(S32, NumRecords).getReg(0); + auto Unmerge = B.buildUnmerge(S32, Pointer); + auto LowHalf = Unmerge.getReg(0); + auto HighHalf = Unmerge.getReg(1); + + auto AndMask = B.buildConstant(S32, 0x0000ffff); + auto Masked = B.buildAnd(S32, HighHalf, AndMask); + auto ShiftConst = B.buildConstant(S32, 16); + auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst); + auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride); + Register NewHighHalfReg = NewHighHalf.getReg(0); + B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags}); } - Register NewHighHalfReg = NewHighHalf.getReg(0); - B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags}); + MI.eraseFromParent(); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp index 6acbf52b97de5..680e7eb3de6be 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -175,6 +175,40 @@ void AMDGPUMCInstLower::lowerT16D16Helper(const MachineInstr *MI, } } +void AMDGPUMCInstLower::lowerT16FmaMixFP16(const MachineInstr *MI, + MCInst &OutMI) const { + unsigned Opcode = MI->getOpcode(); + const auto *TII = static_cast(ST.getInstrInfo()); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + + int VDstIdx = AMDGPU::getNamedOperandIdx(Opcode, llvm::AMDGPU::OpName::vdst); + const MachineOperand &VDst = MI->getOperand(VDstIdx); + bool IsHi = AMDGPU::isHi16Reg(VDst.getReg(), TRI); + switch (Opcode) { + case AMDGPU::V_FMA_MIX_F16_t16: + Opcode = IsHi ? AMDGPU::V_FMA_MIXHI_F16 : AMDGPU::V_FMA_MIXLO_F16; + break; + case AMDGPU::V_FMA_MIX_BF16_t16: + Opcode = IsHi ? 
AMDGPU::V_FMA_MIXHI_BF16 : AMDGPU::V_FMA_MIXLO_BF16; + break; + } + int MCOpcode = TII->pseudoToMCOpcode(Opcode); + assert(MCOpcode != -1 && + "Pseudo instruction doesn't have a target-specific version"); + OutMI.setOpcode(MCOpcode); + + // lower operands + for (int I = 0, E = MI->getNumExplicitOperands(); I < E; I++) { + const MachineOperand &MO = MI->getOperand(I); + MCOperand MCOp; + if (I == VDstIdx) + MCOp = MCOperand::createReg(TRI.get32BitRegister(VDst.getReg())); + else + lowerOperand(MO, MCOp); + OutMI.addOperand(MCOp); + } +} + void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { unsigned Opcode = MI->getOpcode(); const auto *TII = static_cast(ST.getInstrInfo()); @@ -201,6 +235,10 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { } else if (AMDGPU::getT16D16Helper(Opcode)) { lowerT16D16Helper(MI, OutMI); return; + } else if (Opcode == AMDGPU::V_FMA_MIX_F16_t16 || + Opcode == AMDGPU::V_FMA_MIX_BF16_t16) { + lowerT16FmaMixFP16(MI, OutMI); + return; } int MCOpcode = TII->pseudoToMCOpcode(Opcode); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h index 68b8d4e25a6cc..23ed55d45220f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.h @@ -38,6 +38,7 @@ class AMDGPUMCInstLower { void lower(const MachineInstr *MI, MCInst &OutMI) const; void lowerT16D16Helper(const MachineInstr *MI, MCInst &OutMI) const; + void lowerT16FmaMixFP16(const MachineInstr *MI, MCInst &OutMI) const; }; namespace { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 3e2b2c3510569..03d16fdd54c42 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -607,13 +607,15 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost( case ISD::FSUB: if (ST->hasPackedFP32Ops() && SLT == MVT::f32) NElts = (NElts + 1) / 2; + if (ST->hasBF16PackedInsts() && SLT == MVT::bf16) + NElts = (NElts + 1) / 2; if (SLT == MVT::f64) return LT.first * NElts * get64BitInstrCost(CostKind); if (ST->has16BitInsts() && SLT == MVT::f16) NElts = (NElts + 1) / 2; - if (SLT == MVT::f32 || SLT == MVT::f16) + if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16) return LT.first * NElts * getFullRateInstrCost(); break; case ISD::FDIV: @@ -746,7 +748,9 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy; - if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) || + if ((ST->hasVOP3PInsts() && + (SLT == MVT::f16 || SLT == MVT::i16 || + (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) || (ST->hasPackedFP32Ops() && SLT == MVT::f32)) NElts = (NElts + 1) / 2; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 2ced4d6813766..a67a7bedf19a3 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -80,8 +80,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool Abs = false; bool Neg = false; bool Sext = false; - bool Lit = false; - bool Lit64 = false; + LitModifier Lit = LitModifier::None; bool hasFPModifiers() const { return Abs || Neg; } bool hasIntModifiers() const { return Sext; } @@ -1247,6 +1246,12 @@ raw_ostream &operator <<(raw_ostream &OS, AMDGPUOperand::Modifiers Mods) { // AsmParser 
//===----------------------------------------------------------------------===// +// TODO: define GET_SUBTARGET_FEATURE_NAME +#define GET_REGISTER_MATCHER +#include "AMDGPUGenAsmMatcher.inc" +#undef GET_REGISTER_MATCHER +#undef GET_SUBTARGET_FEATURE_NAME + // Holds info related to the current kernel, e.g. count of SGPRs used. // Kernel scope begins at .amdgpu_hsa_kernel directive, ends at next // .amdgpu_hsa_kernel or at EOF. @@ -1537,6 +1542,10 @@ class AMDGPUAsmParser : public MCTargetAsmParser { return AMDGPU::isGFX10_BEncoding(getSTI()); } + bool isWave32() const { return getAvailableFeatures()[Feature_isWave32Bit]; } + + bool isWave64() const { return getAvailableFeatures()[Feature_isWave64Bit]; } + bool hasInv2PiInlineImm() const { return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; } @@ -1590,16 +1599,22 @@ class AMDGPUAsmParser : public MCTargetAsmParser { return static_cast(TS); } - const MCRegisterInfo *getMRI() const { + MCContext &getContext() const { // We need this const_cast because for some reason getContext() is not const // in MCAsmParser. - return const_cast(this)->getContext().getRegisterInfo(); + return const_cast(this)->MCTargetAsmParser::getContext(); + } + + const MCRegisterInfo *getMRI() const { + return getContext().getRegisterInfo(); } const MCInstrInfo *getMII() const { return &MII; } + // FIXME: This should not be used. Instead, should use queries derived from + // getAvailableFeatures(). const FeatureBitset &getFeatureBits() const { return getSTI().getFeatureBits(); } @@ -1675,10 +1690,10 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool isOpcodeModifierWithVal(const AsmToken &Token, const AsmToken &NextToken) const; bool parseSP3NegModifier(); ParseStatus parseImm(OperandVector &Operands, bool HasSP3AbsModifier = false, - bool HasLit = false, bool HasLit64 = false); + LitModifier Lit = LitModifier::None); ParseStatus parseReg(OperandVector &Operands); ParseStatus parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod = false, - bool HasLit = false, bool HasLit64 = false); + LitModifier Lit = LitModifier::None); ParseStatus parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true); ParseStatus parseRegOrImmWithIntInputMods(OperandVector &Operands, @@ -1792,7 +1807,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { const OperandVector &Operands) const; SMLoc getInstLoc(const OperandVector &Operands) const; - bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands); + bool validateInstruction(const MCInst &Inst, SMLoc IDLoc, + const OperandVector &Operands); bool validateOffset(const MCInst &Inst, const OperandVector &Operands); bool validateFlatOffset(const MCInst &Inst, const OperandVector &Operands); bool validateSMEMOffset(const MCInst &Inst, const OperandVector &Operands); @@ -1809,8 +1825,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool validateMIMGAtomicDMask(const MCInst &Inst); bool validateMIMGGatherDMask(const MCInst &Inst); bool validateMovrels(const MCInst &Inst, const OperandVector &Operands); - bool validateMIMGDataSize(const MCInst &Inst, const SMLoc &IDLoc); - bool validateMIMGAddrSize(const MCInst &Inst, const SMLoc &IDLoc); + bool validateMIMGDataSize(const MCInst &Inst, SMLoc IDLoc); + bool validateMIMGAddrSize(const MCInst &Inst, SMLoc IDLoc); bool validateMIMGD16(const MCInst &Inst); bool validateMIMGDim(const MCInst &Inst, const OperandVector &Operands); bool validateTensorR128(const MCInst &Inst); @@ -1832,7 +1848,7 @@ class AMDGPUAsmParser : public 
MCTargetAsmParser { bool validateDivScale(const MCInst &Inst); bool validateWaitCnt(const MCInst &Inst, const OperandVector &Operands); bool validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands, - const SMLoc &IDLoc); + SMLoc IDLoc); bool validateTHAndScopeBits(const MCInst &Inst, const OperandVector &Operands, const unsigned CPol); bool validateTFE(const MCInst &Inst, const OperandVector &Operands); @@ -1849,7 +1865,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser { bool isSupportedMnemo(StringRef Mnemo, const FeatureBitset &FBS, ArrayRef Variants); - bool checkUnsupportedInstruction(StringRef Name, const SMLoc &IDLoc); + bool checkUnsupportedInstruction(StringRef Name, SMLoc IDLoc); bool isId(const StringRef Id) const; bool isId(const AsmToken &Token, const StringRef Id) const; @@ -2256,9 +2272,8 @@ bool AMDGPUOperand::isSDWAInt32Operand() const { } bool AMDGPUOperand::isBoolReg() const { - auto FB = AsmParser->getFeatureBits(); - return isReg() && ((FB[AMDGPU::FeatureWavefrontSize64] && isSCSrc_b64()) || - (FB[AMDGPU::FeatureWavefrontSize32] && isSCSrc_b32())); + return isReg() && ((AsmParser->isWave64() && isSCSrc_b64()) || + (AsmParser->isWave32() && isSCSrc_b32())); } uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const @@ -2312,6 +2327,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo APInt Literal(64, Val); uint8_t OpTy = InstDesc.operands()[OpNum].OperandType; + bool CanUse64BitLiterals = + AsmParser->has64BitLiterals() && + !(InstDesc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)); + MCContext &Ctx = AsmParser->getContext(); + if (Imm.IsFPImm) { // We got fp literal token switch (OpTy) { case AMDGPU::OPERAND_REG_IMM_INT64: @@ -2341,7 +2361,15 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo Val &= 0xffffffff00000000u; } - Inst.addOperand(MCOperand::createImm(Val)); + if ((OpTy == AMDGPU::OPERAND_REG_IMM_FP64 || + OpTy == AMDGPU::OPERAND_REG_INLINE_C_FP64 || + OpTy == AMDGPU::OPERAND_REG_INLINE_AC_FP64) && + CanUse64BitLiterals && Lo_32(Val) != 0) { + Inst.addOperand(MCOperand::createExpr( + AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); + } else { + Inst.addOperand(MCOperand::createImm(Val)); + } return; } @@ -2351,7 +2379,12 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo llvm_unreachable("fp literal in 64-bit integer instruction."); case AMDGPU::OPERAND_KIMM64: - Inst.addOperand(MCOperand::createImm(Val)); + if (CanUse64BitLiterals && Lo_32(Val) != 0) { + Inst.addOperand(MCOperand::createExpr( + AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); + } else { + Inst.addOperand(MCOperand::createImm(Val)); + } return; case AMDGPU::OPERAND_REG_IMM_BF16: @@ -2437,10 +2470,16 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // truncated to uint32_t), if the target doesn't support 64-bit literals, or // the lit modifier is explicitly used, we need to truncate it to the 32 // LSBs. 
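A small aside on the check recurring in this hunk: a 64-bit literal can stay in the 32-bit literal slot only when it is representable as either a signed or an unsigned 32-bit value; otherwise the lit64 form is needed. A hedged sketch of that test, mirroring the isInt<32>/isUInt<32> calls (illustrative only):

#include <cstdint>

// True when Val can be carried as a 32-bit literal, i.e. when
// isInt<32>(Val) || isUInt<32>(Val) would hold.
static bool fitsIn32BitLiteral(int64_t Val) {
  bool SignedFit = Val >= INT32_MIN && Val <= INT32_MAX;      // isInt<32>
  bool UnsignedFit = Val >= 0 && uint64_t(Val) <= UINT32_MAX; // isUInt<32>
  return SignedFit || UnsignedFit;
}
// Example: 0x00000000ffffffff fits (unsigned); 0xffffffff00000000 needs lit64.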
- if (!AsmParser->has64BitLiterals() || getModifiers().Lit) + if (!AsmParser->has64BitLiterals() || + getModifiers().Lit == LitModifier::Lit) Val = Lo_32(Val); - Inst.addOperand(MCOperand::createImm(Val)); + if (CanUse64BitLiterals && (!isInt<32>(Val) || !isUInt<32>(Val))) { + Inst.addOperand(MCOperand::createExpr( + AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); + } else { + Inst.addOperand(MCOperand::createImm(Val)); + } return; case AMDGPU::OPERAND_REG_IMM_FP64: @@ -2461,12 +2500,18 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo // 1) explicitly forced by using lit modifier; // 2) the value is a valid 32-bit representation (signed or unsigned), // meanwhile not forced by lit64 modifier. - if (getModifiers().Lit || - (!getModifiers().Lit64 && (isInt<32>(Val) || isUInt<32>(Val)))) + if (getModifiers().Lit == LitModifier::Lit || + (getModifiers().Lit != LitModifier::Lit64 && + (isInt<32>(Val) || isUInt<32>(Val)))) Val = static_cast(Val) << 32; } - Inst.addOperand(MCOperand::createImm(Val)); + if (CanUse64BitLiterals && Lo_32(Val) != 0) { + Inst.addOperand(MCOperand::createExpr( + AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); + } else { + Inst.addOperand(MCOperand::createImm(Val)); + } return; case AMDGPU::OPERAND_REG_IMM_INT16: @@ -2484,10 +2529,16 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo return; case AMDGPU::OPERAND_KIMM64: - if ((isInt<32>(Val) || isUInt<32>(Val)) && !getModifiers().Lit64) + if ((isInt<32>(Val) || isUInt<32>(Val)) && + getModifiers().Lit != LitModifier::Lit64) Val <<= 32; - Inst.addOperand(MCOperand::createImm(Val)); + if (CanUse64BitLiterals && Lo_32(Val) != 0) { + Inst.addOperand(MCOperand::createExpr( + AMDGPUMCExpr::createLit(LitModifier::Lit64, Val, Ctx))); + } else { + Inst.addOperand(MCOperand::createImm(Val)); + } return; default: @@ -3167,20 +3218,22 @@ AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) { } ParseStatus AMDGPUAsmParser::parseImm(OperandVector &Operands, - bool HasSP3AbsModifier, bool HasLit, - bool HasLit64) { + bool HasSP3AbsModifier, LitModifier Lit) { // TODO: add syntactic sugar for 1/(2*PI) if (isRegister() || isModifier()) return ParseStatus::NoMatch; - if (!HasLit && !HasLit64) { - HasLit64 = trySkipId("lit64"); - HasLit = !HasLit64 && trySkipId("lit"); - if (HasLit || HasLit64) { + if (Lit == LitModifier::None) { + if (trySkipId("lit")) + Lit = LitModifier::Lit; + else if (trySkipId("lit64")) + Lit = LitModifier::Lit64; + + if (Lit != LitModifier::None) { if (!skipToken(AsmToken::LParen, "expected left paren after lit")) return ParseStatus::Failure; - ParseStatus S = parseImm(Operands, HasSP3AbsModifier, HasLit, HasLit64); + ParseStatus S = parseImm(Operands, HasSP3AbsModifier, Lit); if (S.isSuccess() && !skipToken(AsmToken::RParen, "expected closing parentheses")) return ParseStatus::Failure; @@ -3201,8 +3254,7 @@ ParseStatus AMDGPUAsmParser::parseImm(OperandVector &Operands, } AMDGPUOperand::Modifiers Mods; - Mods.Lit = HasLit; - Mods.Lit64 = HasLit64; + Mods.Lit = Lit; if (IsReal) { // Floating-point expressions are not supported. 
@@ -3253,7 +3305,7 @@ ParseStatus AMDGPUAsmParser::parseImm(OperandVector &Operands, AMDGPUOperand &Op = static_cast(*Operands.back()); Op.setModifiers(Mods); } else { - if (HasLit || HasLit64) + if (Lit != LitModifier::None) return ParseStatus::NoMatch; Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S)); } @@ -3277,14 +3329,13 @@ ParseStatus AMDGPUAsmParser::parseReg(OperandVector &Operands) { } ParseStatus AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, - bool HasSP3AbsMod, bool HasLit, - bool HasLit64) { + bool HasSP3AbsMod, LitModifier Lit) { ParseStatus Res = parseReg(Operands); if (!Res.isNoMatch()) return Res; if (isModifier()) return ParseStatus::NoMatch; - return parseImm(Operands, HasSP3AbsMod, HasLit, HasLit64); + return parseImm(Operands, HasSP3AbsMod, Lit); } bool @@ -3380,7 +3431,6 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm) { bool Neg, SP3Neg; bool Abs, SP3Abs; - bool Lit64, Lit; SMLoc Loc; // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead. @@ -3400,18 +3450,19 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, if (Abs && !skipToken(AsmToken::LParen, "expected left paren after abs")) return ParseStatus::Failure; - Lit64 = trySkipId("lit64"); - if (Lit64) { + LitModifier Lit = LitModifier::None; + if (trySkipId("lit")) { + Lit = LitModifier::Lit; + if (!skipToken(AsmToken::LParen, "expected left paren after lit")) + return ParseStatus::Failure; + } else if (trySkipId("lit64")) { + Lit = LitModifier::Lit64; if (!skipToken(AsmToken::LParen, "expected left paren after lit64")) return ParseStatus::Failure; if (!has64BitLiterals()) return Error(Loc, "lit64 is not supported on this GPU"); } - Lit = !Lit64 && trySkipId("lit"); - if (Lit && !skipToken(AsmToken::LParen, "expected left paren after lit")) - return ParseStatus::Failure; - Loc = getLoc(); SP3Abs = trySkipToken(AsmToken::Pipe); if (Abs && SP3Abs) @@ -3419,16 +3470,16 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, ParseStatus Res; if (AllowImm) { - Res = parseRegOrImm(Operands, SP3Abs, Lit, Lit64); + Res = parseRegOrImm(Operands, SP3Abs, Lit); } else { Res = parseReg(Operands); } if (!Res.isSuccess()) - return (SP3Neg || Neg || SP3Abs || Abs || Lit || Lit64) + return (SP3Neg || Neg || SP3Abs || Abs || Lit != LitModifier::None) ? 
ParseStatus::Failure : Res; - if ((Lit || Lit64) && !Operands.back()->isImm()) + if (Lit != LitModifier::None && !Operands.back()->isImm()) Error(Loc, "expected immediate with lit modifier"); if (SP3Abs && !skipToken(AsmToken::Pipe, "expected vertical bar")) @@ -3437,7 +3488,7 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, return ParseStatus::Failure; if (Neg && !skipToken(AsmToken::RParen, "expected closing parentheses")) return ParseStatus::Failure; - if ((Lit || Lit64) && + if (Lit != LitModifier::None && !skipToken(AsmToken::RParen, "expected closing parentheses")) return ParseStatus::Failure; @@ -3445,9 +3496,8 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands, Mods.Abs = Abs || SP3Abs; Mods.Neg = Neg || SP3Neg; Mods.Lit = Lit; - Mods.Lit64 = Lit64; - if (Mods.hasFPModifiers() || Lit || Lit64) { + if (Mods.hasFPModifiers() || Lit != LitModifier::None) { AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back()); if (Op.isExpr()) return Error(Op.getStartLoc(), "expected an absolute expression"); @@ -3637,7 +3687,7 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst, const MCOperand &MO = Inst.getOperand(OpIdx); - int64_t Val = MO.getImm(); + int64_t Val = MO.isImm() ? MO.getImm() : getLitValue(MO.getExpr()); auto OpSize = AMDGPU::getOperandSize(Desc, OpIdx); switch (OpSize) { // expected operand size @@ -4038,8 +4088,7 @@ bool AMDGPUAsmParser::validateIntClampSupported(const MCInst &Inst) { constexpr uint64_t MIMGFlags = SIInstrFlags::MIMG | SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE; -bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst, - const SMLoc &IDLoc) { +bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst, SMLoc IDLoc) { const unsigned Opc = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opc); @@ -4086,8 +4135,7 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst, return false; } -bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, - const SMLoc &IDLoc) { +bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst, SMLoc IDLoc) { const unsigned Opc = Inst.getOpcode(); const MCInstrDesc &Desc = MII.get(Opc); @@ -4765,16 +4813,26 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst, const MCOperand &MO = Inst.getOperand(OpIdx); // Exclude special imm operands (like that used by s_set_gpr_idx_on) if (AMDGPU::isSISrcOperand(Desc, OpIdx)) { - if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) { + std::optional<int64_t> Imm; + if (MO.isImm()) { + Imm = MO.getImm(); + } else if (MO.isExpr()) { + if (isLitExpr(MO.getExpr())) + Imm = getLitValue(MO.getExpr()); + } else { + continue; + } + + if (!Imm.has_value()) { + ++NumExprs; + } else if (!isInlineConstant(Inst, OpIdx)) { auto OpType = static_cast<AMDGPU::OperandType>( Desc.operands()[OpIdx].OperandType); - int64_t Value = encode32BitLiteral(MO.getImm(), OpType); + int64_t Value = encode32BitLiteral(*Imm, OpType); if (NumLiterals == 0 || LiteralValue != Value) { LiteralValue = Value; ++NumLiterals; } - } else if (MO.isExpr()) { - ++NumExprs; } } } @@ -4977,9 +5035,8 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst, // Check if VCC register matches wavefront size bool AMDGPUAsmParser::validateVccOperand(MCRegister Reg) const { - auto FB = getFeatureBits(); - return (FB[AMDGPU::FeatureWavefrontSize64] && Reg == AMDGPU::VCC) || - (FB[AMDGPU::FeatureWavefrontSize32] && Reg == AMDGPU::VCC_LO); + return (Reg == AMDGPU::VCC && isWave64()) || + (Reg == AMDGPU::VCC_LO && isWave32()); } // One unique literal can be used.
VOP3 literal is only allowed in GFX10+ @@ -5007,9 +5064,18 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst, if (!isSISrcOperand(Desc, OpIdx)) continue; + std::optional<int64_t> Imm; + if (MO.isImm()) + Imm = MO.getImm(); + else if (MO.isExpr() && isLitExpr(MO.getExpr())) + Imm = getLitValue(MO.getExpr()); + bool IsAnotherLiteral = false; - if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) { - uint64_t Value = static_cast<uint64_t>(MO.getImm()); + if (!Imm.has_value()) { + // Literal value not known, so we conservatively assume it's different. + IsAnotherLiteral = true; + } else if (!isInlineConstant(Inst, OpIdx)) { + uint64_t Value = *Imm; bool IsForcedFP64 = Desc.operands()[OpIdx].OperandType == AMDGPU::OPERAND_KIMM64 || (Desc.operands()[OpIdx].OperandType == AMDGPU::OPERAND_REG_IMM_FP64 && @@ -5030,9 +5096,6 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst, IsAnotherLiteral = !LiteralValue || *LiteralValue != Value; LiteralValue = Value; - } else if (MO.isExpr()) { - // Literal value not known, so we conservately assume it's different. - IsAnotherLiteral = true; - } if (IsAnotherLiteral && !HasMandatoryLiteral && @@ -5280,7 +5343,7 @@ bool AMDGPUAsmParser::validateGWS(const MCInst &Inst, bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands, - const SMLoc &IDLoc) { + SMLoc IDLoc) { int CPolPos = AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::cpol); if (CPolPos == -1) @@ -5477,8 +5540,7 @@ bool AMDGPUAsmParser::validateWMMA(const MCInst &Inst, validateFmt(AMDGPU::OpName::matrix_b_fmt, AMDGPU::OpName::src1); } -bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, - const SMLoc &IDLoc, +bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst, SMLoc IDLoc, const OperandVector &Operands) { if (!validateLdsDirect(Inst, Operands)) return false; @@ -5640,7 +5702,7 @@ bool AMDGPUAsmParser::isSupportedMnemo(StringRef Mnemo, } bool AMDGPUAsmParser::checkUnsupportedInstruction(StringRef Mnemo, - const SMLoc &IDLoc) { + SMLoc IDLoc) { FeatureBitset FBS = ComputeAvailableFeatures(getFeatureBits()); // Check if requested instruction variant is supported. @@ -5663,7 +5725,7 @@ bool AMDGPUAsmParser::checkUnsupportedInstruction(StringRef Mnemo, // Check if this instruction may be used with a different wavesize.
if (isGFX10Plus() && getFeatureBits()[AMDGPU::FeatureWavefrontSize64] && !getFeatureBits()[AMDGPU::FeatureWavefrontSize32]) { - + // FIXME: Use getAvailableFeatures, and do not manually recompute FeatureBitset FeaturesWS32 = getFeatureBits(); FeaturesWS32.flip(AMDGPU::FeatureWavefrontSize64) .flip(AMDGPU::FeatureWavefrontSize32); @@ -6418,10 +6480,10 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, if (C.code_properties & AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) { if (!isGFX10Plus()) return TokError("enable_wavefront_size32=1 is only allowed on GFX10+"); - if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32]) + if (!isWave32()) return TokError("enable_wavefront_size32=1 requires +WavefrontSize32"); } else { - if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize64]) + if (!isWave64()) return TokError("enable_wavefront_size32=0 requires +WavefrontSize64"); } } @@ -6430,10 +6492,10 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID, if (C.wavefront_size == 5) { if (!isGFX10Plus()) return TokError("wavefront_size=5 is only allowed on GFX10+"); - if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize32]) + if (!isWave32()) return TokError("wavefront_size=5 requires +WavefrontSize32"); } else if (C.wavefront_size == 6) { - if (!getFeatureBits()[AMDGPU::FeatureWavefrontSize64]) + if (!isWave64()) return TokError("wavefront_size=6 requires +WavefrontSize64"); } } @@ -10336,7 +10398,6 @@ LLVMInitializeAMDGPUAsmParser() { RegisterMCAsmParser B(getTheGCNTarget()); } -#define GET_REGISTER_MATCHER #define GET_MATCHER_IMPLEMENTATION #define GET_MNEMONIC_SPELL_CHECKER #define GET_MNEMONIC_CHECKER diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index d3db1b7394675..2d5ae29c1037c 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -17,6 +17,7 @@ // ToDo: What to do with instruction suffixes (v_mov_b32 vs v_mov_b32_e32)? 
#include "Disassembler/AMDGPUDisassembler.h" +#include "MCTargetDesc/AMDGPUMCExpr.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIDefines.h" #include "SIRegisterInfo.h" @@ -123,14 +124,14 @@ static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, uint64_t Addr, static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr, const MCDisassembler *Decoder) { const auto *DAsm = static_cast(Decoder); - return addOperand(Inst, DAsm->decodeBoolReg(Val)); + return addOperand(Inst, DAsm->decodeBoolReg(Inst, Val)); } static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val, uint64_t Addr, const MCDisassembler *Decoder) { const auto *DAsm = static_cast(Decoder); - return addOperand(Inst, DAsm->decodeSplitBarrier(Val)); + return addOperand(Inst, DAsm->decodeSplitBarrier(Inst, Val)); } static DecodeStatus decodeDpp8FI(MCInst &Inst, unsigned Val, uint64_t Addr, @@ -164,7 +165,7 @@ static DecodeStatus decodeDpp8FI(MCInst &Inst, unsigned Val, uint64_t Addr, const MCDisassembler *Decoder) { \ assert(Imm < (1 << EncSize) && #EncSize "-bit encoding"); \ auto DAsm = static_cast(Decoder); \ - return addOperand(Inst, DAsm->decodeSrcOp(OpWidth, EncImm)); \ + return addOperand(Inst, DAsm->decodeSrcOp(Inst, OpWidth, EncImm)); \ } static DecodeStatus decodeSrcOp(MCInst &Inst, unsigned EncSize, @@ -172,7 +173,7 @@ static DecodeStatus decodeSrcOp(MCInst &Inst, unsigned EncSize, const MCDisassembler *Decoder) { assert(Imm < (1U << EncSize) && "Operand doesn't fit encoding!"); const auto *DAsm = static_cast(Decoder); - return addOperand(Inst, DAsm->decodeSrcOp(OpWidth, EncImm)); + return addOperand(Inst, DAsm->decodeSrcOp(Inst, OpWidth, EncImm)); } // Decoder for registers. Imm(7-bit) is number of register, uses decodeSrcOp to @@ -317,7 +318,7 @@ static DecodeStatus decodeOperand_VSrcT16_Lo128(MCInst &Inst, unsigned Imm, unsigned RegIdx = Imm & 0x7f; return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi)); } - return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(OpWidth, Imm & 0xFF)); + return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(Inst, OpWidth, Imm & 0xFF)); } template @@ -332,7 +333,7 @@ static DecodeStatus decodeOperand_VSrcT16(MCInst &Inst, unsigned Imm, unsigned RegIdx = Imm & 0xff; return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi)); } - return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(OpWidth, Imm & 0xFF)); + return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(Inst, OpWidth, Imm & 0xFF)); } static DecodeStatus decodeOperand_VGPR_16(MCInst &Inst, unsigned Imm, @@ -371,7 +372,7 @@ static DecodeStatus decodeOperandVOPDDstY(MCInst &Inst, unsigned Val, static DecodeStatus decodeAVLdSt(MCInst &Inst, unsigned Imm, unsigned Opw, const MCDisassembler *Decoder) { const auto *DAsm = static_cast(Decoder); - return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256)); + return addOperand(Inst, DAsm->decodeSrcOp(Inst, Opw, Imm | 256)); } template @@ -386,7 +387,7 @@ static DecodeStatus decodeOperand_VSrc_f64(MCInst &Inst, unsigned Imm, const MCDisassembler *Decoder) { assert(Imm < (1 << 9) && "9-bit encoding"); const auto *DAsm = static_cast(Decoder); - return addOperand(Inst, DAsm->decodeSrcOp(64, Imm)); + return addOperand(Inst, DAsm->decodeSrcOp(Inst, 64, Imm)); } #define DECODE_SDWA(DecName) \ @@ -510,8 +511,8 @@ void AMDGPUDisassembler::decodeImmOperands(MCInst &MI, } if (Imm == AMDGPU::EncValues::LITERAL_CONST) { - Op = decodeLiteralConstant(OpDesc.OperandType == - AMDGPU::OPERAND_REG_IMM_FP64); + Op = decodeLiteralConstant( + Desc, OpDesc, OpDesc.OperandType 
== AMDGPU::OPERAND_REG_IMM_FP64); continue; } @@ -1543,10 +1544,16 @@ AMDGPUDisassembler::decodeMandatoryLiteral64Constant(uint64_t Val) const { } HasLiteral = true; Literal = Literal64 = Val; - return MCOperand::createImm(Literal64); + + bool UseLit64 = Lo_32(Literal64) != 0; + return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit( + LitModifier::Lit64, Literal64, getContext())) + : MCOperand::createImm(Literal64); } -MCOperand AMDGPUDisassembler::decodeLiteralConstant(bool ExtendFP64) const { +MCOperand AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc, + const MCOperandInfo &OpDesc, + bool ExtendFP64) const { // For now all literal constants are supposed to be unsigned integer // ToDo: deal with signed/unsigned 64-bit integer constants // ToDo: deal with float/double constants @@ -1560,10 +1567,31 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant(bool ExtendFP64) const { if (ExtendFP64) Literal64 <<= 32; } - return MCOperand::createImm(ExtendFP64 ? Literal64 : Literal); + + int64_t Val = ExtendFP64 ? Literal64 : Literal; + + bool CanUse64BitLiterals = + STI.hasFeature(AMDGPU::Feature64BitLiterals) && + !(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)); + + bool UseLit64 = false; + if (CanUse64BitLiterals) { + if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || + OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) + UseLit64 = !isInt<32>(Val) || !isUInt<32>(Val); + else if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 || + OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 || + OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64) + UseLit64 = Lo_32(Val) != 0; + } + + return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit( + LitModifier::Lit64, Val, getContext())) + : MCOperand::createImm(Val); } -MCOperand AMDGPUDisassembler::decodeLiteral64Constant() const { +MCOperand +AMDGPUDisassembler::decodeLiteral64Constant(const MCInst &Inst) const { assert(STI.hasFeature(AMDGPU::Feature64BitLiterals)); if (!HasLiteral) { @@ -1574,7 +1602,23 @@ MCOperand AMDGPUDisassembler::decodeLiteral64Constant() const { HasLiteral = true; Literal64 = eatBytes(Bytes); } - return MCOperand::createImm(Literal64); + + bool UseLit64 = false; + const MCInstrDesc &Desc = MCII->get(Inst.getOpcode()); + const MCOperandInfo &OpDesc = Desc.operands()[Inst.getNumOperands()]; + if (OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || + OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT64) { + UseLit64 = !isInt<32>(Literal64) || !isUInt<32>(Literal64); + } else { + assert(OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_FP64 || + OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP64 || + OpDesc.OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP64); + UseLit64 = Lo_32(Literal64) != 0; + } + + return UseLit64 ? MCOperand::createExpr(AMDGPUMCExpr::createLit( + LitModifier::Lit64, Literal64, getContext())) + : MCOperand::createImm(Literal64); } MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) { @@ -1822,7 +1866,8 @@ int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const { return (TTmpMin <= Val && Val <= TTmpMax)? Val - TTmpMin : -1; } -MCOperand AMDGPUDisassembler::decodeSrcOp(unsigned Width, unsigned Val) const { +MCOperand AMDGPUDisassembler::decodeSrcOp(const MCInst &Inst, unsigned Width, + unsigned Val) const { using namespace AMDGPU::EncValues; assert(Val < 1024); // enum10 @@ -1834,10 +1879,11 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(unsigned Width, unsigned Val) const { return createRegOperand(IsAGPR ? 
getAgprClassId(Width) : getVgprClassId(Width), Val - VGPR_MIN); } - return decodeNonVGPRSrcOp(Width, Val & 0xFF); + return decodeNonVGPRSrcOp(Inst, Width, Val & 0xFF); } -MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(unsigned Width, +MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const MCInst &Inst, + unsigned Width, unsigned Val) const { // Cases when Val{8} is 1 (vgpr, agpr or true 16 vgpr) should have been // decoded earlier. @@ -1861,7 +1907,7 @@ MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(unsigned Width, return MCOperand::createImm(Val); if (Val == LITERAL64_CONST && STI.hasFeature(AMDGPU::Feature64BitLiterals)) { - return decodeLiteral64Constant(); + return decodeLiteral64Constant(Inst); } switch (Width) { @@ -2053,13 +2099,16 @@ MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const { return createRegOperand(IsWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC); } -MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const { - return STI.hasFeature(AMDGPU::FeatureWavefrontSize32) ? decodeSrcOp(32, Val) - : decodeSrcOp(64, Val); +MCOperand AMDGPUDisassembler::decodeBoolReg(const MCInst &Inst, + unsigned Val) const { + return STI.hasFeature(AMDGPU::FeatureWavefrontSize32) + ? decodeSrcOp(Inst, 32, Val) + : decodeSrcOp(Inst, 64, Val); } -MCOperand AMDGPUDisassembler::decodeSplitBarrier(unsigned Val) const { - return decodeSrcOp(32, Val); +MCOperand AMDGPUDisassembler::decodeSplitBarrier(const MCInst &Inst, + unsigned Val) const { + return decodeSrcOp(Inst, 32, Val); } MCOperand AMDGPUDisassembler::decodeDpp8FI(unsigned Val) const { diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index c1131c2936fc7..935c3836f2ed9 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -142,12 +142,15 @@ class AMDGPUDisassembler : public MCDisassembler { MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const; MCOperand decodeMandatoryLiteral64Constant(uint64_t Imm) const; - MCOperand decodeLiteralConstant(bool ExtendFP64) const; - MCOperand decodeLiteral64Constant() const; + MCOperand decodeLiteralConstant(const MCInstrDesc &Desc, + const MCOperandInfo &OpDesc, + bool ExtendFP64) const; + MCOperand decodeLiteral64Constant(const MCInst &Inst) const; - MCOperand decodeSrcOp(unsigned Width, unsigned Val) const; + MCOperand decodeSrcOp(const MCInst &Inst, unsigned Width, unsigned Val) const; - MCOperand decodeNonVGPRSrcOp(unsigned Width, unsigned Val) const; + MCOperand decodeNonVGPRSrcOp(const MCInst &Inst, unsigned Width, + unsigned Val) const; MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const; MCOperand decodeSpecialReg32(unsigned Val) const; @@ -159,8 +162,8 @@ class AMDGPUDisassembler : public MCDisassembler { MCOperand decodeSDWASrc32(unsigned Val) const; MCOperand decodeSDWAVopcDst(unsigned Val) const; - MCOperand decodeBoolReg(unsigned Val) const; - MCOperand decodeSplitBarrier(unsigned Val) const; + MCOperand decodeBoolReg(const MCInst &Inst, unsigned Val) const; + MCOperand decodeSplitBarrier(const MCInst &Inst, unsigned Val) const; MCOperand decodeDpp8FI(unsigned Val) const; MCOperand decodeVersionImm(unsigned Imm) const; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 7e5ae25ff30e6..9f33bac4c56ea 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -2009,8 +2009,8 @@ let OtherPredicates = 
[D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi defm : FlatLoadPats_D16_t16; defm : FlatStorePats_t16 ; defm : FlatStorePats_t16 ; - def : FlatStorePat ; - def : FlatStorePat ; + defm : FlatStorePats_t16 ; + defm : FlatStorePats_t16 ; } // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts defm : FlatLoadPats ; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index a3b64aee297b2..1d9a427f2829b 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -82,7 +82,7 @@ static bool isDivFMas(unsigned Opcode) { } static bool isSGetReg(unsigned Opcode) { - return Opcode == AMDGPU::S_GETREG_B32; + return Opcode == AMDGPU::S_GETREG_B32 || Opcode == AMDGPU::S_GETREG_B32_const; } static bool isSSetReg(unsigned Opcode) { @@ -443,40 +443,101 @@ using GetNumWaitStatesFn = function_ref; // Search for a hazard in a block and its predecessors. template static bool -hasHazard(StateT State, +hasHazard(StateT InitialState, function_ref IsHazard, function_ref UpdateState, - const MachineBasicBlock *MBB, - MachineBasicBlock::const_reverse_instr_iterator I, - DenseSet &Visited) { - for (auto E = MBB->instr_rend(); I != E; ++I) { - // No need to look at parent BUNDLE instructions. - if (I->isBundle()) - continue; - - switch (IsHazard(State, *I)) { - case HazardFound: - return true; - case HazardExpired: - return false; - default: - // Continue search - break; + const MachineBasicBlock *InitialMBB, + MachineBasicBlock::const_reverse_instr_iterator InitialI) { + struct StateMapKey { + SmallVectorImpl *States; + unsigned Idx; + static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) { + return LHS.States == RHS.States && LHS.Idx == RHS.Idx; + } + }; + struct StateMapKeyTraits : DenseMapInfo { + static inline StateMapKey getEmptyKey() { + return {static_cast *>( + DenseMapInfo::getEmptyKey()), + DenseMapInfo::getEmptyKey()}; + } + static inline StateMapKey getTombstoneKey() { + return {static_cast *>( + DenseMapInfo::getTombstoneKey()), + DenseMapInfo::getTombstoneKey()}; + } + static unsigned getHashValue(const StateMapKey &Key) { + return StateT::getHashValue((*Key.States)[Key.Idx]); } + static unsigned getHashValue(const StateT &State) { + return StateT::getHashValue(State); + } + static bool isEqual(const StateMapKey &LHS, const StateMapKey &RHS) { + const auto EKey = getEmptyKey(); + const auto TKey = getTombstoneKey(); + if (StateMapKey::isEqual(LHS, EKey) || StateMapKey::isEqual(RHS, EKey) || + StateMapKey::isEqual(LHS, TKey) || StateMapKey::isEqual(RHS, TKey)) + return StateMapKey::isEqual(LHS, RHS); + return StateT::isEqual((*LHS.States)[LHS.Idx], (*RHS.States)[RHS.Idx]); + } + static bool isEqual(const StateT &LHS, const StateMapKey &RHS) { + if (StateMapKey::isEqual(RHS, getEmptyKey()) || + StateMapKey::isEqual(RHS, getTombstoneKey())) + return false; + return StateT::isEqual(LHS, (*RHS.States)[RHS.Idx]); + } + }; - if (I->isInlineAsm() || I->isMetaInstruction()) - continue; + SmallDenseMap StateMap; + SmallVector States; - UpdateState(State, *I); - } + MachineBasicBlock::const_reverse_instr_iterator I = InitialI; + const MachineBasicBlock *MBB = InitialMBB; + StateT State = InitialState; - for (MachineBasicBlock *Pred : MBB->predecessors()) { - if (!Visited.insert(Pred).second) - continue; + SmallSetVector, 16> Worklist; + unsigned WorkIdx = 0; + for (;;) { + bool Expired = false; + for (auto E = 
MBB->instr_rend(); I != E; ++I) { + // No need to look at parent BUNDLE instructions. + if (I->isBundle()) + continue; - if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(), - Visited)) - return true; + auto Result = IsHazard(State, *I); + if (Result == HazardFound) + return true; + if (Result == HazardExpired) { + Expired = true; + break; + } + + if (I->isInlineAsm() || I->isMetaInstruction()) + continue; + + UpdateState(State, *I); + } + + if (!Expired) { + unsigned StateIdx = States.size(); + StateMapKey Key = {&States, StateIdx}; + auto Insertion = StateMap.insert_as(std::pair(Key, StateIdx), State); + if (Insertion.second) { + States.emplace_back(State); + } else { + StateIdx = Insertion.first->second; + } + for (MachineBasicBlock *Pred : MBB->predecessors()) + Worklist.insert(std::pair(Pred, StateIdx)); + } + + if (WorkIdx == Worklist.size()) + break; + + unsigned StateIdx; + std::tie(MBB, StateIdx) = Worklist[WorkIdx++]; + State = States[StateIdx]; + I = MBB->instr_rbegin(); } return false; @@ -1641,6 +1702,15 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { SmallDenseMap DefPos; int ExecPos = std::numeric_limits::max(); int VALUs = 0; + + static unsigned getHashValue(const StateType &State) { + return hash_combine(State.ExecPos, State.VALUs, + hash_combine_range(State.DefPos)); + } + static bool isEqual(const StateType &LHS, const StateType &RHS) { + return LHS.DefPos == RHS.DefPos && LHS.ExecPos == RHS.ExecPos && + LHS.VALUs == RHS.VALUs; + } }; StateType State; @@ -1735,9 +1805,8 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { State.VALUs += 1; }; - DenseSet Visited; if (!hasHazard(State, IsHazardFn, UpdateStateFn, MI->getParent(), - std::next(MI->getReverseIterator()), Visited)) + std::next(MI->getReverseIterator()))) return false; BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), @@ -1778,6 +1847,13 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { struct StateType { int VALUs = 0; int TRANS = 0; + + static unsigned getHashValue(const StateType &State) { + return hash_combine(State.VALUs, State.TRANS); + } + static bool isEqual(const StateType &LHS, const StateType &RHS) { + return LHS.VALUs == RHS.VALUs && LHS.TRANS == RHS.TRANS; + } }; StateType State; @@ -1813,9 +1889,8 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { State.TRANS += 1; }; - DenseSet Visited; if (!hasHazard(State, IsHazardFn, UpdateStateFn, MI->getParent(), - std::next(MI->getReverseIterator()), Visited)) + std::next(MI->getReverseIterator()))) return false; // Hazard is observed - insert a wait on va_dst counter to ensure hazard is diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 254b75b784e75..fab78a93aa063 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1777,9 +1777,9 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() { for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) { auto Region = DAG.Regions[I]; for (auto MI = Region.first; MI != Region.second; ++MI) { - // The instruction must be trivially rematerializable. + // The instruction must be rematerializable. MachineInstr &DefMI = *MI; - if (!isTriviallyReMaterializable(DefMI)) + if (!isReMaterializable(DefMI)) continue; // We only support rematerializing virtual registers with one definition. 
@@ -2002,8 +2002,8 @@ void PreRARematStage::rematerialize() { } // Copied from MachineLICM -bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) { - if (!DAG.TII->isTriviallyReMaterializable(MI)) +bool PreRARematStage::isReMaterializable(const MachineInstr &MI) { + if (!DAG.TII->isReMaterializable(MI)) return false; for (const MachineOperand &MO : MI.all_uses()) { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 790370ff8ab4d..06b9b64091f00 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -433,7 +433,7 @@ class ClusteredLowOccStage : public GCNSchedStage { /// Attempts to reduce function spilling or, if there is no spilling, to /// increase function occupancy by one with respect to ArchVGPR usage by sinking -/// trivially rematerializable instructions to their use. When the stage +/// rematerializable instructions to their use. When the stage /// estimates reducing spilling or increasing occupancy is possible, as few /// instructions as possible are rematerialized to reduce potential negative /// effects on function latency. @@ -483,9 +483,8 @@ class PreRARematStage : public GCNSchedStage { /// PreRARematStage::TargetOccupancy. bool canIncreaseOccupancyOrReduceSpill(); - /// Whether the MI is trivially rematerializable and does not have any virtual - /// register use. - bool isTriviallyReMaterializable(const MachineInstr &MI); + /// Whether the MI is rematerializable + bool isReMaterializable(const MachineInstr &MI); /// Rematerializes all instructions in PreRARematStage::Rematerializations /// and stores the achieved occupancy after remat in diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 920a47b5afe07..a54d6651c25c1 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -99,6 +99,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool EnableDS128 = false; bool EnablePRTStrictNull = false; bool DumpCode = false; + bool AssemblerPermissiveWavesize = false; // Subtarget statically properties set by tablegen bool FP64 = false; @@ -285,6 +286,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool UseBlockVGPROpsForCSR = false; bool HasGloballyAddressableScratch = false; + bool Has45BitNumRecordsBufferResource = false; + // Dummy feature to use for assembler in tablegen. bool FeatureDisable = false; @@ -1849,6 +1852,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return 4; return 3; } + + /// \returns true if the sub-target supports buffer resource (V#) with 45-bit + /// num_records. 
+ bool has45BitNumRecordsBufferResource() const { + return Has45BitNumRecordsBufferResource; + } }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp index b8f43c4550b7e..afaa19013bfc2 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp @@ -21,8 +21,8 @@ namespace llvm::mca { -void AMDGPUInstrPostProcess::postProcessInstruction( - std::unique_ptr &Inst, const MCInst &MCI) { +void AMDGPUInstrPostProcess::postProcessInstruction(Instruction &Inst, + const MCInst &MCI) { switch (MCI.getOpcode()) { case AMDGPU::S_WAITCNT: case AMDGPU::S_WAITCNT_soft: @@ -44,7 +44,7 @@ void AMDGPUInstrPostProcess::postProcessInstruction( // s_waitcnt instructions encode important information as immediate operands // which are lost during the MCInst -> mca::Instruction lowering. -void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr &Inst, +void AMDGPUInstrPostProcess::processWaitCnt(Instruction &Inst, const MCInst &MCI) { for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) { MCAOperand Op; @@ -55,7 +55,7 @@ void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr &Inst, Op = MCAOperand::createImm(MCOp.getImm()); } Op.setIndex(Idx); - Inst->addOperand(Op); + Inst.addOperand(Op); } } diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h index 85b9c188b5d1a..cbc7427ce6cdf 100644 --- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h +++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h @@ -26,7 +26,7 @@ namespace llvm { namespace mca { class AMDGPUInstrPostProcess : public InstrPostProcess { - void processWaitCnt(std::unique_ptr &Inst, const MCInst &MCI); + void processWaitCnt(Instruction &Inst, const MCInst &MCI); public: AMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) @@ -34,8 +34,7 @@ class AMDGPUInstrPostProcess : public InstrPostProcess { ~AMDGPUInstrPostProcess() = default; - void postProcessInstruction(std::unique_ptr &Inst, - const MCInst &MCI) override; + void postProcessInstruction(Instruction &Inst, const MCInst &MCI) override; }; struct WaitCntInfo { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index f098e7a3c6c67..d3b5718093997 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -80,9 +80,13 @@ void AMDGPUInstPrinter::printFP64ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { // KIMM64 - const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - uint64_t Imm = MI->getOperand(OpNo).getImm(); - printLiteral64(Desc, Imm, STI, O, /*IsFP=*/true); + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isExpr()) { + MAI.printExpr(O, *Op.getExpr()); + return; + } + + printLiteral64(Op.getImm(), O, /*IsFP=*/true); } void AMDGPUInstPrinter::printNamedBit(const MCInst *MI, unsigned OpNo, @@ -332,8 +336,16 @@ static MCPhysReg getRegForPrinting(MCPhysReg Reg, const MCRegisterInfo &MRI) { if (Idx < 0x100) return Reg; + unsigned RegNo = Idx % 0x100; const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI); - return RC->getRegister(Idx % 0x100); + if (RC->getID() == AMDGPU::VGPR_16RegClassID) { + // This class has 2048 registers with interleaved lo16 and hi16. 
+ RegNo *= 2; + if (Enc & AMDGPU::HWEncoding::IS_HI16) + ++RegNo; + } + + return RC->getRegister(RegNo); } // Restore MSBs of a VGPR above 255 from the MCInstrAnalysis. @@ -652,7 +664,7 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, O << formatHex(static_cast(Imm)); } -void AMDGPUInstPrinter::printImmediate64(const MCInstrDesc &Desc, uint64_t Imm, +void AMDGPUInstPrinter::printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, raw_ostream &O, bool IsFP) { int64_t SImm = static_cast(Imm); @@ -683,27 +695,15 @@ void AMDGPUInstPrinter::printImmediate64(const MCInstrDesc &Desc, uint64_t Imm, STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) O << "0.15915494309189532"; else - printLiteral64(Desc, Imm, STI, O, IsFP); + printLiteral64(Imm, O, IsFP); } -void AMDGPUInstPrinter::printLiteral64(const MCInstrDesc &Desc, uint64_t Imm, - const MCSubtargetInfo &STI, - raw_ostream &O, bool IsFP) { - // This part needs to align with AMDGPUOperand::addLiteralImmOperand. - bool CanUse64BitLiterals = - STI.hasFeature(AMDGPU::Feature64BitLiterals) && - !(Desc.TSFlags & (SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)); - if (IsFP) { - if (CanUse64BitLiterals && Lo_32(Imm)) - O << "lit64(" << formatHex(static_cast(Imm)) << ')'; - else - O << formatHex(static_cast(Hi_32(Imm))); - } else { - if (CanUse64BitLiterals && (!isInt<32>(Imm) || !isUInt<32>(Imm))) - O << "lit64(" << formatHex(static_cast(Imm)) << ')'; - else - O << formatHex(static_cast(Imm)); - } +void AMDGPUInstPrinter::printLiteral64(uint64_t Imm, raw_ostream &O, + bool IsFP) { + if (IsFP && Lo_32(Imm) == 0) + O << formatHex(static_cast(Hi_32(Imm))); + else + O << formatHex(Imm); } void AMDGPUInstPrinter::printBLGP(const MCInst *MI, unsigned OpNo, @@ -814,12 +814,12 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo, break; case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: - printImmediate64(Desc, Op.getImm(), STI, O, false); + printImmediate64(Op.getImm(), STI, O, false); break; case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: - printImmediate64(Desc, Op.getImm(), STI, O, true); + printImmediate64(Op.getImm(), STI, O, true); break; case AMDGPU::OPERAND_REG_INLINE_C_INT16: case AMDGPU::OPERAND_REG_IMM_INT16: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 21cc2f229de91..b27295e73ec99 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -89,10 +89,9 @@ class AMDGPUInstPrinter : public MCInstPrinter { raw_ostream &O); void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); - void printImmediate64(const MCInstrDesc &Desc, uint64_t Imm, - const MCSubtargetInfo &STI, raw_ostream &O, bool IsFP); - void printLiteral64(const MCInstrDesc &Desc, uint64_t Imm, - const MCSubtargetInfo &STI, raw_ostream &O, bool IsFP); + void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O, bool IsFP); + void printLiteral64(uint64_t Imm, raw_ostream &O, bool IsFP); void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printRegularOperand(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index bf212bbca934c..f287911654c24 100644 --- 
a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -345,7 +345,7 @@ std::optional AMDGPUMCCodeEmitter::getLitEncoding( case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: case AMDGPU::OPERAND_KIMM64: - return MO.getImm(); + return Imm; default: llvm_unreachable("invalid operand size"); } @@ -457,6 +457,8 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI, else if (Op.isExpr()) { if (const auto *C = dyn_cast(Op.getExpr())) Imm = C->getValue(); + else if (AMDGPU::isLitExpr(Op.getExpr())) + Imm = AMDGPU::getLitValue(Op.getExpr()); } else // Exprs will be replaced with a fixup value. llvm_unreachable("Must be immediate or expr"); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp index 6638fa2f687d8..c27be0250e386 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp @@ -75,6 +75,12 @@ void AMDGPUMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const { case AGVK_Occupancy: OS << "occupancy("; break; + case AGVK_Lit: + OS << "lit("; + break; + case AGVK_Lit64: + OS << "lit64("; + break; } for (const auto *It = Args.begin(); It != Args.end(); ++It) { MAI->printExpr(OS, **It); @@ -259,6 +265,9 @@ bool AMDGPUMCExpr::evaluateAsRelocatableImpl(MCValue &Res, return evaluateTotalNumVGPR(Res, Asm); case AGVK_Occupancy: return evaluateOccupancy(Res, Asm); + case AGVK_Lit: + case AGVK_Lit64: + return Args[0]->evaluateAsRelocatable(Res, Asm); } for (const MCExpr *Arg : Args) { @@ -332,6 +341,14 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy( Ctx); } +const AMDGPUMCExpr *AMDGPUMCExpr::createLit(LitModifier Lit, int64_t Value, + MCContext &Ctx) { + assert(Lit == LitModifier::Lit || Lit == LitModifier::Lit64); + return create(Lit == LitModifier::Lit ? 
VariantKind::AGVK_Lit + : VariantKind::AGVK_Lit64, + {MCConstantExpr::create(Value, Ctx, /*PrintInHex=*/true)}, Ctx); +} + static KnownBits fromOptionalToKnownBits(std::optional CompareResult) { static constexpr unsigned BitWidth = 64; const APInt True(BitWidth, 1); @@ -513,7 +530,9 @@ static void targetOpKnownBitsMapHelper(const MCExpr *Expr, KnownBitsMap &KBM, case AMDGPUMCExpr::VariantKind::AGVK_ExtraSGPRs: case AMDGPUMCExpr::VariantKind::AGVK_TotalNumVGPRs: case AMDGPUMCExpr::VariantKind::AGVK_AlignTo: - case AMDGPUMCExpr::VariantKind::AGVK_Occupancy: { + case AMDGPUMCExpr::VariantKind::AGVK_Occupancy: + case AMDGPUMCExpr::VariantKind::AGVK_Lit: + case AMDGPUMCExpr::VariantKind::AGVK_Lit64: { int64_t Val; if (AGVK->evaluateAsAbsolute(Val)) { APInt APValue(BitWidth, Val); @@ -709,3 +728,15 @@ void llvm::AMDGPU::printAMDGPUMCExpr(const MCExpr *Expr, raw_ostream &OS, MAI->printExpr(OS, *Expr); } + +bool AMDGPU::isLitExpr(const MCExpr *Expr) { + const auto *E = dyn_cast(Expr); + return E && (E->getKind() == AMDGPUMCExpr::AGVK_Lit || + E->getKind() == AMDGPUMCExpr::AGVK_Lit64); +} + +int64_t AMDGPU::getLitValue(const MCExpr *Expr) { + assert(isLitExpr(Expr)); + return cast(cast(Expr)->getArgs()[0]) + ->getValue(); +} diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h index bc6fdf7f2e4cd..54fcd2af49ecd 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h @@ -17,6 +17,8 @@ namespace llvm { class Function; class GCNSubtarget; +enum class LitModifier { None, Lit, Lit64 }; + /// AMDGPU target specific MCExpr operations. /// /// Takes in a minimum of 1 argument to be used with an operation. The supported @@ -36,7 +38,9 @@ class AMDGPUMCExpr : public MCTargetExpr { AGVK_ExtraSGPRs, AGVK_TotalNumVGPRs, AGVK_AlignTo, - AGVK_Occupancy + AGVK_Occupancy, + AGVK_Lit, + AGVK_Lit64, }; // Relocation specifiers. @@ -99,6 +103,9 @@ class AMDGPUMCExpr : public MCTargetExpr { const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx); + static const AMDGPUMCExpr *createLit(LitModifier Lit, int64_t Value, + MCContext &Ctx); + ArrayRef getArgs() const { return Args; } VariantKind getKind() const { return Kind; } const MCExpr *getSubExpr(size_t Index) const; @@ -129,6 +136,11 @@ const MCExpr *foldAMDGPUMCExpr(const MCExpr *Expr, MCContext &Ctx); static inline AMDGPUMCExpr::Specifier getSpecifier(const MCSymbolRefExpr *SRE) { return AMDGPUMCExpr::Specifier(SRE->getKind()); } + +LLVM_READONLY bool isLitExpr(const MCExpr *Expr); + +LLVM_READONLY int64_t getLitValue(const MCExpr *Expr); + } // end namespace AMDGPU } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index f2e2d0ed3f8a6..013cfeb364048 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -82,20 +82,36 @@ createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { MCSubtargetInfo *STI = createAMDGPUMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS); + bool IsWave64 = STI->hasFeature(AMDGPU::FeatureWavefrontSize64); + bool IsWave32 = STI->hasFeature(AMDGPU::FeatureWavefrontSize32); + // FIXME: We should error for the default target. 
if (STI->getFeatureBits().none()) STI->ToggleFeature(AMDGPU::FeatureSouthernIslands); - if (!STI->hasFeature(AMDGPU::FeatureWavefrontSize64) && - !STI->hasFeature(AMDGPU::FeatureWavefrontSize32)) { + if (!IsWave64 && !IsWave32) { // If there is no default wave size it must be a generation before gfx10, // these have FeatureWavefrontSize64 in their definition already. For gfx10+ // set wave32 as a default. STI->ToggleFeature(AMDGPU::isGFX10Plus(*STI) ? AMDGPU::FeatureWavefrontSize32 : AMDGPU::FeatureWavefrontSize64); + } else if (IsWave64 && IsWave32) { + // The wave size is mutually exclusive. If both somehow end up set, wave32 + // wins if supported. + STI->ToggleFeature(AMDGPU::supportsWave32(*STI) + ? AMDGPU::FeatureWavefrontSize64 + : AMDGPU::FeatureWavefrontSize32); + + // If both wavesizes were manually requested, hack in a feature to permit + // assembling modules with mixed wavesizes. + STI->ToggleFeature(AMDGPU::FeatureAssemblerPermissiveWavesize); } + assert((STI->hasFeature(AMDGPU::FeatureWavefrontSize64) != + STI->hasFeature(AMDGPU::FeatureWavefrontSize32)) && + "wavesize features are mutually exclusive"); + return STI; } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 38331b614bceb..fed37788802b9 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -712,10 +712,15 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const { TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) { const TargetRegisterClass *NewRC = TRI->getRegClassForReg(*MRI, New->getReg()); - const TargetRegisterClass *ConstrainRC = - TRI->findCommonRegClass(OpRC, Old.getSubReg(), NewRC, New->getSubReg()); - if (!ConstrainRC) - return false; + + const TargetRegisterClass *ConstrainRC = OpRC; + if (New->getSubReg()) { + ConstrainRC = + TRI->getMatchingSuperRegClass(NewRC, OpRC, New->getSubReg()); + + if (!ConstrainRC) + return false; + } if (!MRI->constrainRegClass(New->getReg(), ConstrainRC)) { LLVM_DEBUG(dbgs() << "Cannot constrain " << printReg(New->getReg(), TRI) @@ -1308,6 +1313,15 @@ void SIFoldOperandsImpl::foldOperand( if (MovSrcRC) { if (UseSubReg) MovSrcRC = TRI->getMatchingSuperRegClass(SrcRC, MovSrcRC, UseSubReg); + + // FIXME: We should be able to directly check immediate operand legality + // for all cases, but gfx908 hacks break. + if (MovOp == AMDGPU::AV_MOV_B32_IMM_PSEUDO && + (!OpToFold.isImm() || + !TII->isImmOperandLegal(MovDesc, SrcIdx, + *OpToFold.getEffectiveImmVal()))) + break; + if (!MRI->constrainRegClass(SrcReg, MovSrcRC)) break; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 78d608556f056..16530087444d2 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -851,6 +851,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16}, Custom); + if (Subtarget->hasBF16PackedInsts()) { + for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16}) + // Split vector operations. 
+ setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE}, + VT, Custom); + } + if (Subtarget->hasPackedFP32Ops()) { setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG}, MVT::v2f32, Legal); @@ -5902,10 +5909,13 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI, MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { - - const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); MachineFunction *MF = BB->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo(); + const GCNSubtarget &ST = MF->getSubtarget(); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + const DebugLoc &DL = MI.getDebugLoc(); switch (MI.getOpcode()) { case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32: @@ -5946,24 +5956,23 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64); case AMDGPU::S_UADDO_PSEUDO: case AMDGPU::S_USUBO_PSEUDO: { - const DebugLoc &DL = MI.getDebugLoc(); MachineOperand &Dest0 = MI.getOperand(0); MachineOperand &Dest1 = MI.getOperand(1); MachineOperand &Src0 = MI.getOperand(2); MachineOperand &Src1 = MI.getOperand(3); unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO) - ? AMDGPU::S_ADD_I32 - : AMDGPU::S_SUB_I32; + ? AMDGPU::S_ADD_U32 + : AMDGPU::S_SUB_U32; // clang-format off BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()) .add(Src0) .add(Src1); // clang-format on - BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg()) - .addImm(1) - .addImm(0); + unsigned SelOpc = + Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; + BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0); MI.eraseFromParent(); return BB; @@ -5974,11 +5983,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } case AMDGPU::V_ADD_U64_PSEUDO: case AMDGPU::V_SUB_U64_PSEUDO: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const GCNSubtarget &ST = MF->getSubtarget(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - const DebugLoc &DL = MI.getDebugLoc(); - bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO); MachineOperand &Dest = MI.getOperand(0); @@ -6071,11 +6075,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // This pseudo has a chance to be selected // only from uniform add/subcarry node. All the VGPR operands // therefore assumed to be splat vectors. 
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const GCNSubtarget &ST = MF->getSubtarget(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineBasicBlock::iterator MII = MI; - const DebugLoc &DL = MI.getDebugLoc(); MachineOperand &Dest = MI.getOperand(0); MachineOperand &CarryDest = MI.getOperand(1); MachineOperand &Src0 = MI.getOperand(2); @@ -6103,16 +6103,13 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, Src2.setReg(RegOp2); } - const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg()); - unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC); - assert(WaveSize == 64 || WaveSize == 32); - - if (WaveSize == 64) { + if (ST.isWave64()) { if (ST.hasScalarCompareEq64()) { BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64)) .addReg(Src2.getReg()) .addImm(0); } else { + const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg()); const TargetRegisterClass *SubRC = TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0); MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm( @@ -6142,7 +6139,7 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // clang-format on unsigned SelOpc = - (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; + ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg()) .addImm(-1) @@ -6171,7 +6168,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case AMDGPU::GET_GROUPSTATICSIZE: { assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL); - DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) .add(MI.getOperand(0)) .addImm(MFI->getLDSSize()); @@ -6180,8 +6176,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } case AMDGPU::GET_SHADERCYCLESHILO: { assert(MF->getSubtarget().hasShaderCyclesHiLoRegisters()); - MachineRegisterInfo &MRI = MF->getRegInfo(); - const DebugLoc &DL = MI.getDebugLoc(); // The algorithm is: // // hi1 = getreg(SHADER_CYCLES_HI) @@ -6244,14 +6238,9 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case AMDGPU::SI_KILL_I1_PSEUDO: return splitKillBlock(MI, BB); case AMDGPU::V_CNDMASK_B64_PSEUDO: { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); - const GCNSubtarget &ST = MF->getSubtarget(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - Register Dst = MI.getOperand(0).getReg(); const MachineOperand &Src0 = MI.getOperand(1); const MachineOperand &Src1 = MI.getOperand(2); - const DebugLoc &DL = MI.getDebugLoc(); Register SrcCond = MI.getOperand(3).getReg(); Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -6304,8 +6293,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return BB; } case AMDGPU::SI_BR_UNDEF: { - const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - const DebugLoc &DL = MI.getDebugLoc(); MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) .add(MI.getOperand(0)); Br->getOperand(1).setIsUndef(); // read undef SCC @@ -6321,9 +6308,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, return BB; } case AMDGPU::SI_CALL_ISEL: { - const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - const DebugLoc &DL = MI.getDebugLoc(); - unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF); MachineInstrBuilder MIB; @@ -6340,7 +6324,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case AMDGPU::V_SUB_CO_U32_e32: case 
AMDGPU::V_SUBREV_CO_U32_e32: { // TODO: Define distinct V_*_I32_Pseudo instructions instead. - const DebugLoc &DL = MI.getDebugLoc(); unsigned Opc = MI.getOpcode(); bool NeedClampOperand = false; @@ -6351,8 +6334,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg()); if (TII->isVOP3(*I)) { - const GCNSubtarget &ST = MF->getSubtarget(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); I.addReg(TRI->getVCC(), RegState::Define); } I.add(MI.getOperand(1)).add(MI.getOperand(2)); @@ -6423,7 +6404,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } if (SetRoundOp || SetDenormOp) { - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg()); if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) { unsigned ImmVal = Def->getOperand(1).getImm(); @@ -6460,7 +6440,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.setDesc(TII->get(AMDGPU::COPY)); return BB; case AMDGPU::ENDPGM_TRAP: { - const DebugLoc &DL = MI.getDebugLoc(); if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) { MI.setDesc(TII->get(AMDGPU::S_ENDPGM)); MI.addOperand(MachineOperand::CreateImm(0)); @@ -6487,7 +6466,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } case AMDGPU::SIMULATED_TRAP: { assert(Subtarget->hasPrivEnabledTrap2NopBug()); - MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); MachineBasicBlock *SplitBB = TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc()); MI.eraseFromParent(); @@ -6650,10 +6628,12 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || - VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || - VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); + assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 || + VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || + VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 || + VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 || + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 || + VT == MVT::v32bf16); auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -7552,17 +7532,30 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const { SDNode *BR = nullptr; SDNode *SetCC = nullptr; - if (Intr->getOpcode() == ISD::SETCC) { + switch (Intr->getOpcode()) { + case ISD::SETCC: { // As long as we negate the condition everything is fine SetCC = Intr; Intr = SetCC->getOperand(0).getNode(); - - } else { + break; + } + case ISD::XOR: { + // Similar to SETCC, if we have (xor c, -1), we will be fine. 
+ SDValue LHS = Intr->getOperand(0); + SDValue RHS = Intr->getOperand(1); + if (auto *C = dyn_cast(RHS); C && C->getZExtValue()) { + Intr = LHS.getNode(); + break; + } + [[fallthrough]]; + } + default: { // Get the target from BR if we don't negate the condition BR = findUser(BRCOND, ISD::BR); assert(BR && "brcond missing unconditional branch user"); Target = BR->getOperand(1); } + } unsigned CFNode = isCFIntrinsic(Intr); if (CFNode == 0) { @@ -11602,29 +11595,61 @@ SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op, SDValue NumRecords = Op->getOperand(3); SDValue Flags = Op->getOperand(4); - auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32); - SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32); - SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask); - std::optional ConstStride = std::nullopt; - if (auto *ConstNode = dyn_cast(Stride)) - ConstStride = ConstNode->getZExtValue(); - - SDValue NewHighHalf = Masked; - if (!ConstStride || *ConstStride != 0) { - SDValue ShiftedStride; - if (ConstStride) { - ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32); - } else { - SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32); - ShiftedStride = - DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride, - DAG.getShiftAmountConstant(16, MVT::i32, Loc)); - } - NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride); + SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32); + SDValue Rsrc; + + if (Subtarget->has45BitNumRecordsBufferResource()) { + SDValue Zero = DAG.getConstant(0, Loc, MVT::i32); + // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit + // num_records. + SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64); + SDValue NumRecordsLHS = + DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords, + DAG.getShiftAmountConstant(57, MVT::i32, Loc)); + SDValue LowHalf = + DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS); + + // Build the higher 64-bit value, which has the higher 38-bit num_records, + // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag. 
+ SDValue NumRecordsRHS = + DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords, + DAG.getShiftAmountConstant(7, MVT::i32, Loc)); + SDValue ShiftedStride = + DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride, + DAG.getShiftAmountConstant(12, MVT::i32, Loc)); + SDValue ExtShiftedStrideVec = + DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride); + SDValue ExtShiftedStride = + DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec); + SDValue ShiftedFlags = + DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags, + DAG.getShiftAmountConstant(28, MVT::i32, Loc)); + SDValue ExtShiftedFlagsVec = + DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags); + SDValue ExtShiftedFlags = + DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec); + SDValue CombinedFields = + DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride); + SDValue HighHalf = + DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags); + + Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf); + } else { + NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32); + auto [LowHalf, HighHalf] = + DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32); + SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32); + SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask); + SDValue ShiftedStride = + DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride, + DAG.getShiftAmountConstant(16, MVT::i32, Loc)); + SDValue NewHighHalf = + DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride); + + Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf, + NumRecords, Flags); } - SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, - NewHighHalf, NumRecords, Flags); SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc); return RsrcPtr; } @@ -15210,36 +15235,13 @@ SITargetLowering::performExtractVectorEltCombine(SDNode *N, return V; } - // EXTRACT_VECTOR_ELT (v2i32 bitcast (i64/f64:k), Idx) - // => - // i32:Lo(k) if Idx == 0, or - // i32:Hi(k) if Idx == 1 - auto *Idx = dyn_cast(N->getOperand(1)); - if (Vec.getOpcode() == ISD::BITCAST && VecVT == MVT::v2i32 && Idx) { - SDLoc SL(N); - SDValue PeekThrough = Vec.getOperand(0); - auto *KImm = dyn_cast(PeekThrough); - if (KImm && KImm->getValueType(0).getSizeInBits() == 64) { - uint64_t KImmValue = KImm->getZExtValue(); - return DAG.getConstant( - (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32); - } - auto *KFPImm = dyn_cast(PeekThrough); - if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) { - uint64_t KFPImmValue = - KFPImm->getValueAPF().bitcastToAPInt().getZExtValue(); - return DAG.getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) & - 0xffffffff, - SL, MVT::i32); - } - } - if (!DCI.isBeforeLegalize()) return SDValue(); // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit // elements. This exposes more load reduction opportunities by replacing // multiple small extract_vector_elements with a single 32-bit extract. 
+ auto *Idx = dyn_cast(N->getOperand(1)); if (isa(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() && VecSize > 32 && VecSize % 32 == 0 && Idx) { EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7ec98851d0bef..f291191dbfd5c 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2730,7 +2730,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, HasVMemStore = true; } for (const MachineOperand &Op : MI.all_uses()) { - if (!TRI->isVectorRegister(*MRI, Op.getReg())) + if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg())) continue; RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, Op); // Vgpr use diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 13d05ee54d7b3..044ea866342c2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -124,7 +124,7 @@ static bool canRemat(const MachineInstr &MI) { return false; } -bool SIInstrInfo::isReallyTriviallyReMaterializable( +bool SIInstrInfo::isReMaterializableImpl( const MachineInstr &MI) const { if (canRemat(MI)) { @@ -145,7 +145,7 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable( return true; } - return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); + return TargetInstrInfo::isReMaterializableImpl(MI); } // Returns true if the scalar result of a VALU instruction depends on exec. @@ -3773,7 +3773,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); if (Def && Def->isMoveImmediate() && isInlineConstant(Def->getOperand(1)) && - MRI->hasOneUse(Src0->getReg())) { + MRI->hasOneNonDBGUse(Src0->getReg())) { Src0->ChangeToImmediate(Def->getOperand(1).getImm()); Src0Inlined = true; } else if (ST.getConstantBusLimit(Opc) <= 1 && @@ -3788,7 +3788,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); if (Def && Def->isMoveImmediate() && isInlineConstant(Def->getOperand(1)) && - MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI)) + MRI->hasOneNonDBGUse(Src1->getReg()) && commuteInstruction(UseMI)) Src0->ChangeToImmediate(Def->getOperand(1).getImm()); else if (RI.isSGPRReg(*MRI, Src1->getReg())) return false; @@ -9506,6 +9506,13 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { DescSize = Desc.getSize(); } + // If FMA Pseudo inst, get correct MC code size + if (Opc == AMDGPU::V_FMA_MIX_F16_t16 || Opc == AMDGPU::V_FMA_MIX_BF16_t16) { + // All potential lowerings are the same size; arbitrarily pick one. 
+ const MCInstrDesc &Desc = getMCOpcodeFromPseudo(AMDGPU::V_FMA_MIXLO_F16); + DescSize = Desc.getSize(); + } + return DescSize; } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index e249fc6cbb79d..31a2d55e1baad 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -244,7 +244,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return ST; } - bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; + bool isReMaterializableImpl(const MachineInstr &MI) const override; bool isIgnorableUse(const MachineOperand &MO) const override; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index fb2cd04b364d7..18a53931a6390 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -7,9 +7,11 @@ //===----------------------------------------------------------------------===// def isWave32 : Predicate<"Subtarget->isWave32()">, - AssemblerPredicate <(all_of FeatureWavefrontSize32)>; + AssemblerPredicate <(any_of FeatureWavefrontSize32, + FeatureAssemblerPermissiveWavesize)>; def isWave64 : Predicate<"Subtarget->isWave64()">, - AssemblerPredicate <(all_of FeatureWavefrontSize64)>; + AssemblerPredicate <(any_of FeatureWavefrontSize64, + FeatureAssemblerPermissiveWavesize)>; class AMDGPUMnemonicAlias : MnemonicAlias, PredicateControl; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index d4c1bc6d84384..59fd2f10ccacd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2287,8 +2287,9 @@ def : GCNPat < def : GCNPat < (fcopysign fp16vt:$src0, f32:$src1), - (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff0000)), - (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src0, hi16), $src1), hi16) + (EXTRACT_SUBREG (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), + (REG_SEQUENCE VGPR_32, $src0, lo16, (i16 (IMPLICIT_DEF)), hi16), + (V_LSHRREV_B32_e64 (i32 16), $src1)), lo16) >; def : GCNPat < @@ -2660,11 +2661,11 @@ let True16Predicate = NotHasTrue16BitInsts in { let SubtargetPredicate = isNotGFX9Plus in { def : ROTRPattern ; -def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), +def : GCNPat<(i32 (DivergentUnaryFrag (srl i64:$src0, (and i32:$src1, (i32 31))))), (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; -def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), +def : GCNPat<(i32 (DivergentUnaryFrag (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; } // isNotGFX9Plus @@ -2678,8 +2679,8 @@ def : GCNPat < $src1, /* clamp */ 0, /* op_sel */ 0) >; -foreach pat = [(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), - (i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in +foreach pat = [(i32 (DivergentUnaryFrag (srl i64:$src0, (and i32:$src1, (i32 31))))), + (i32 (DivergentUnaryFrag (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in def : GCNPat; -def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), +def : GCNPat<(i32 (DivergentUnaryFrag (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), (V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */ (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), 0, /* src1_modifiers */ @@ -2734,7 +2735,7 @@ def : GCNPat < $src1, /* clamp */ 0, /* op_sel */ 0) >; -def : GCNPat<(i32 (trunc 
(srl i64:$src0, (and i32:$src1, (i32 31))))), +def : GCNPat<(i32 (DivergentUnaryFrag (srl i64:$src0, (and i32:$src1, (i32 31))))), (V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */ (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), 0, /* src1_modifiers */ @@ -2743,7 +2744,7 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), $src1, /* clamp */ 0, /* op_sel */ 0) >; -def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), +def : GCNPat<(i32 (DivergentUnaryFrag (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), (V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */ (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), 0, /* src1_modifiers */ diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 115a020f44098..8586d6c18b361 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -293,7 +293,6 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) { LIS->InsertMachineInstrInMaps(*SetExec); LIS->InsertMachineInstrInMaps(*NewBr); - LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); MI.eraseFromParent(); // FIXME: Is there a better way of adjusting the liveness? It shouldn't be @@ -363,9 +362,6 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) { RecomputeRegs.insert(SrcReg); RecomputeRegs.insert(DstReg); LIS->createAndComputeVirtRegInterval(SaveReg); - - // Let this be recomputed. - LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); } void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { @@ -828,7 +824,10 @@ bool SILowerControlFlow::run(MachineFunction &MF) { optimizeEndCf(); - if (LIS) { + if (LIS && Changed) { + // These will need to be recomputed for insertions and removals. + LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); + LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC); for (Register Reg : RecomputeRegs) { LIS->removeInterval(Reg); LIS->createAndComputeVirtRegInterval(Reg); diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 1a91bbd433553..40eeeb8a8630d 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -213,10 +213,12 @@ void SILowerSGPRSpills::calculateSaveRestoreBlocks(MachineFunction &MF) { if (!MFI.getSavePoints().empty()) { assert(MFI.getSavePoints().size() == 1 && "Multiple save points not yet supported!"); - SaveBlocks.push_back(MFI.getSavePoints().front()); + const auto &SavePoint = *MFI.getSavePoints().begin(); + SaveBlocks.push_back(SavePoint.first); assert(MFI.getRestorePoints().size() == 1 && "Multiple restore points not yet supported!"); - MachineBasicBlock *RestoreBlock = MFI.getRestorePoints().front(); + const auto &RestorePoint = *MFI.getRestorePoints().begin(); + MachineBasicBlock *RestoreBlock = RestorePoint.first; // If RestoreBlock does not have any successor and is not a return block // then the end point is unreachable and we do not need to insert any // epilogue. diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index c501ebba0c7ed..484861dcaac07 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -106,6 +106,7 @@ class SIMemOpInfo final { bool IsLastUse = false; bool IsCooperative = false; + // TODO: Should we assume Cooperative=true if no MMO is present? 
SIMemOpInfo( const GCNSubtarget &ST, AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent, @@ -299,6 +300,10 @@ class SICacheControl { bool enableNamedBit(const MachineBasicBlock::iterator MI, AMDGPU::CPol::CPol Bit) const; + /// Check if any atomic operation on AS can affect memory accessible via the + /// global address space. + bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const; + public: /// Create a cache control for the subtarget \p ST. @@ -334,6 +339,11 @@ class SICacheControl { bool IsNonTemporal, bool IsLastUse = false) const = 0; + /// Add final touches to a `mayStore` instruction \p MI, which may be a + /// Store or RMW instruction. + /// FIXME: This takes a MI because iterators aren't handled properly. When + /// this is called, they often point to entirely different insts. Thus we back + /// up the inst early and pass it here instead. virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const { return false; }; @@ -991,6 +1001,15 @@ bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI, return true; } +bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const { + assert((!ST.hasGloballyAddressableScratch() || + (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE || + (AS & SIAtomicAddrSpace::SCRATCH) == SIAtomicAddrSpace::NONE) && + "scratch instructions should already be replaced by flat " + "instructions if GloballyAddressableScratch is enabled"); + return (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE; +} + /* static */ std::unique_ptr SICacheControl::create(const GCNSubtarget &ST) { GCNSubtarget::Generation Generation = ST.getGeneration(); @@ -1016,7 +1035,7 @@ bool SIGfx6CacheControl::enableLoadCacheBypass( assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -1239,7 +1258,7 @@ bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -1299,7 +1318,7 @@ bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -1336,7 +1355,7 @@ bool SIGfx90ACacheControl::enableLoadCacheBypass( assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -1378,7 +1397,7 @@ bool SIGfx90ACacheControl::enableRMWCacheBypass( assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -1487,7 +1506,7 @@ bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) 
{ case SIAtomicScope::SYSTEM: // Ensures that following loads will not see stale remote VMEM data or @@ -1551,7 +1570,7 @@ bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the @@ -1594,7 +1613,7 @@ bool SIGfx940CacheControl::enableLoadCacheBypass( assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: // Set SC bits to indicate system scope. @@ -1638,7 +1657,7 @@ bool SIGfx940CacheControl::enableStoreCacheBypass( assert(!MI->mayLoad() && MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: // Set SC bits to indicate system scope. @@ -1678,7 +1697,7 @@ bool SIGfx940CacheControl::enableRMWCacheBypass( assert(MI->mayLoad() && MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: // Set SC1 bit to indicate system scope. @@ -1756,7 +1775,7 @@ bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: // Ensures that following loads will not see stale remote VMEM data or @@ -1840,7 +1859,7 @@ bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the @@ -1897,7 +1916,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass( assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -2129,7 +2148,7 @@ bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, if (Pos == Position::AFTER) ++MI; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -2194,7 +2213,7 @@ bool SIGfx11CacheControl::enableLoadCacheBypass( assert(MI->mayLoad() && !MI->mayStore()); bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: case SIAtomicScope::AGENT: @@ -2368,7 +2387,10 @@ bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, // which shares the same L0. // // GFX12.5: - // TODO DOCS + // CU$ has two ports. To ensure operations are visible at the workgroup + // level, we need to ensure all operations in this port have completed + // so the other SIMDs in the WG can see them. 
There is no ordering + // guarantee between the ports. if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts()) { if ((Op & SIMemOp::LOAD) != SIMemOp::NONE) LOADCnt |= true; @@ -2462,7 +2484,7 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, /// memory. /// Other address spaces do not have a cache. - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) + if (!canAffectGlobalAddrSpace(AddrSpace)) return false; AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV; @@ -2483,8 +2505,7 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI, // Otherwise in CU mode all waves of a work-group are on the same CU, and // so the L0 does not need to be invalidated. // - // GFX12.5 - // TODO DOCS + // GFX12.5 has a shared WGP$, so no invalidates are required. if (ST.isCuModeEnabled()) return false; @@ -2514,6 +2535,8 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, Position Pos) const { + bool Changed = false; + MachineBasicBlock &MBB = *MI->getParent(); DebugLoc DL = MI->getDebugLoc(); @@ -2521,53 +2544,52 @@ bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI, // writeback as all memory operations by the same thread are // sequentially consistent, and no other thread can access scratch // memory. + if (canAffectGlobalAddrSpace(AddrSpace)) { + if (Pos == Position::AFTER) + ++MI; - // Other address spaces do not have a cache. - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE) - return false; - - if (Pos == Position::AFTER) - ++MI; - - // global_wb is only necessary at system scope for GFX12.0, - // they're also necessary at device scope for GFX12.5. - // - // Emitting it for lower scopes is a slow no-op, so we omit it - // for performance. - switch (Scope) { - case SIAtomicScope::SYSTEM: - BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) - .addImm(AMDGPU::CPol::SCOPE_SYS); - break; - case SIAtomicScope::AGENT: - // TODO DOCS - if (ST.hasGFX1250Insts()) { + // global_wb is only necessary at system scope for GFX12.0, + // they're also necessary at device scope for GFX12.5 as stores + // cannot report completion earlier than L2. + // + // Emitting it for lower scopes is a slow no-op, so we omit it + // for performance. + switch (Scope) { + case SIAtomicScope::SYSTEM: BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) - .addImm(AMDGPU::CPol::SCOPE_DEV); + .addImm(AMDGPU::CPol::SCOPE_SYS); + Changed = true; + break; + case SIAtomicScope::AGENT: + // GFX12.5 may have >1 L2 per device so we must emit a device scope WB. + if (ST.hasGFX1250Insts()) { + BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)) + .addImm(AMDGPU::CPol::SCOPE_DEV); + Changed = true; + } + break; + case SIAtomicScope::CLUSTER: + case SIAtomicScope::WORKGROUP: + // No WB necessary, but we still have to wait. + case SIAtomicScope::WAVEFRONT: + case SIAtomicScope::SINGLETHREAD: + // No WB or wait necessary here, but insertWait takes care of that. + break; + default: + llvm_unreachable("Unsupported synchronization scope"); } - break; - case SIAtomicScope::CLUSTER: - case SIAtomicScope::WORKGROUP: - // No WB necessary, but we still have to wait. - break; - case SIAtomicScope::WAVEFRONT: - case SIAtomicScope::SINGLETHREAD: - // No WB or wait necessary here. 
- return false; - default: - llvm_unreachable("Unsupported synchronization scope"); - } - if (Pos == Position::AFTER) - --MI; + if (Pos == Position::AFTER) + --MI; + } // We always have to wait for previous memory operations (load/store) to // complete, whether we inserted a WB or not. If we inserted a WB (storecnt), // we of course need to wait for that as well. - insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, - IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); + Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE, + IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release); - return true; + return Changed; } bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( @@ -2655,7 +2677,7 @@ bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace) const { bool Changed = false; - if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) { + if (canAffectGlobalAddrSpace(AddrSpace)) { switch (Scope) { case SIAtomicScope::SYSTEM: Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 7a519117f2482..179ecbad5239f 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -712,10 +712,13 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { bool KilledT = false; for (auto Iter = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end(); - Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) { + Iter != E && Count < SearchLimit && !KilledT; ++Iter) { MachineInstr *MovY = &*Iter; KilledT = MovY->killsRegister(T, TRI); + if (MovY->isDebugInstr()) + continue; + ++Count; if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 && MovY->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 && @@ -733,6 +736,8 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { MachineInstr *MovX = nullptr; for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator()); I != IY; ++I) { + if (I->isDebugInstr()) + continue; if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) || instModifiesReg(&*I, T, Tsub) || (MovX && instModifiesReg(&*I, X, Xsub))) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 14ebbf8e9c929..6489e63d4f6b8 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -14,11 +14,9 @@ namespace llvm::AMDGPU { //===----------------------------------------------------------------------===// // Custom Operands. // -// A table of custom operands shall describe "primary" operand names first -// followed by aliases if any. It is not required but recommended to arrange -// operands so that operand encoding match operand position in the table. This -// will make getNameFromOperandTable() a bit more efficient. Unused slots in the -// table shall have an empty name. +// A table of custom operands must be ordered by Encoding in ascending order +// to enable binary search lookup. Within entries that share the same encoding, +// "primary" operand names should be listed first followed by aliases if any. 
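Aside, not part of the diff: the comment above states the new table convention, and the rewritten getNameFromOperandTable in the next hunk relies on it by looking entries up with std::lower_bound. A minimal standalone sketch of that lookup pattern, with simplified types and without the subtarget predicate check:

    #include <algorithm>
    #include <cstddef>
    #include <string_view>

    struct Entry {
      std::string_view Name;
      unsigned Encoding;
    };

    // Table must be sorted by Encoding; entries sharing an encoding list the
    // primary name first, so the first match is the preferred one.
    std::string_view lookupByEncoding(const Entry *Table, std::size_t N,
                                      unsigned Encoding) {
      const Entry *It = std::lower_bound(
          Table, Table + N, Encoding,
          [](const Entry &E, unsigned Enc) { return E.Encoding < Enc; });
      if (It != Table + N && It->Encoding == Encoding)
        return It->Name;
      return {};
    }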
// //===----------------------------------------------------------------------===// @@ -27,21 +25,18 @@ template static StringRef getNameFromOperandTable(const CustomOperand (&Table)[N], unsigned Encoding, const MCSubtargetInfo &STI) { - auto isValidIndexForEncoding = [&](size_t Idx) { - return Idx < N && Table[Idx].Encoding == Encoding && - !Table[Idx].Name.empty() && - (!Table[Idx].Cond || Table[Idx].Cond(STI)); - }; - - // This is an optimization that should work in most cases. As a side effect, - // it may cause selection of an alias instead of a primary operand name in - // case of sparse tables. - if (isValidIndexForEncoding(Encoding)) - return Table[Encoding].Name; - - for (size_t Idx = 0; Idx != N; ++Idx) - if (isValidIndexForEncoding(Idx)) - return Table[Idx].Name; + // Find the first entry with the target encoding + auto First = + std::lower_bound(Table, Table + N, Encoding, + [](const CustomOperand &Entry, unsigned TargetEncoding) { + return Entry.Encoding < TargetEncoding; + }); + + // Search through entries with the same encoding to find the first valid one + for (auto It = First; It != Table + N && It->Encoding == Encoding; ++It) { + if (It->Encoding == Encoding && (!It->Cond || It->Cond(STI))) + return It->Name; + } return ""; } @@ -92,10 +87,11 @@ namespace SendMsg { // clang-format off static constexpr CustomOperand MsgOperands[] = { - {{""}}, {{"MSG_INTERRUPT"}, ID_INTERRUPT}, {{"MSG_GS"}, ID_GS_PreGFX11, isNotGFX11Plus}, + {{"MSG_HS_TESSFACTOR"}, ID_HS_TESSFACTOR_GFX11Plus, isGFX11Plus}, {{"MSG_GS_DONE"}, ID_GS_DONE_PreGFX11, isNotGFX11Plus}, + {{"MSG_DEALLOC_VGPRS"}, ID_DEALLOC_VGPRS_GFX11Plus, isGFX11Plus}, {{"MSG_SAVEWAVE"}, ID_SAVEWAVE, isGFX8_GFX9_GFX10}, {{"MSG_STALL_WAVE_GEN"}, ID_STALL_WAVE_GEN, isGFX9_GFX10_GFX11}, {{"MSG_HALT_WAVES"}, ID_HALT_WAVES, isGFX9_GFX10_GFX11}, @@ -103,10 +99,8 @@ static constexpr CustomOperand MsgOperands[] = { {{"MSG_EARLY_PRIM_DEALLOC"}, ID_EARLY_PRIM_DEALLOC, isGFX9_GFX10}, {{"MSG_GS_ALLOC_REQ"}, ID_GS_ALLOC_REQ, isGFX9Plus}, {{"MSG_GET_DOORBELL"}, ID_GET_DOORBELL, isGFX9_GFX10}, - {{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10}, - {{"MSG_HS_TESSFACTOR"}, ID_HS_TESSFACTOR_GFX11Plus, isGFX11Plus}, - {{"MSG_DEALLOC_VGPRS"}, ID_DEALLOC_VGPRS_GFX11Plus, isGFX11Plus}, {{"MSG_SAVEWAVE_HAS_TDM"}, ID_SAVEWAVE_HAS_TDM, isGFX1250}, + {{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10}, {{"MSG_SYSMSG"}, ID_SYSMSG}, {{"MSG_RTN_GET_DOORBELL"}, ID_RTN_GET_DOORBELL, isGFX11Plus}, {{"MSG_RTN_GET_DDID"}, ID_RTN_GET_DDID, isGFX11Plus}, @@ -121,7 +115,6 @@ static constexpr CustomOperand MsgOperands[] = { }; static constexpr CustomOperand SysMsgOperands[] = { - {{""}}, {{"SYSMSG_OP_ECC_ERR_INTERRUPT"}, OP_SYS_ECC_ERR_INTERRUPT}, {{"SYSMSG_OP_REG_RD"}, OP_SYS_REG_RD}, {{"SYSMSG_OP_HOST_TRAP_ACK"}, OP_SYS_HOST_TRAP_ACK, isNotGFX9Plus}, @@ -169,68 +162,67 @@ namespace Hwreg { // NOLINTBEGIN // clang-format off static constexpr CustomOperand Operands[] = { - {{""}}, - {{"HW_REG_MODE"}, ID_MODE}, - {{"HW_REG_STATUS"}, ID_STATUS}, - {{"HW_REG_TRAPSTS"}, ID_TRAPSTS, isNotGFX12Plus}, - {{"HW_REG_HW_ID"}, ID_HW_ID, isNotGFX10Plus}, - {{"HW_REG_GPR_ALLOC"}, ID_GPR_ALLOC}, - {{"HW_REG_LDS_ALLOC"}, ID_LDS_ALLOC}, - {{"HW_REG_IB_STS"}, ID_IB_STS}, - {{""}}, - {{""}}, - {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx12, isGFX12Plus}, - {{"HW_REG_PERF_SNAPSHOT_PC_LO"}, ID_PERF_SNAPSHOT_PC_LO_gfx12, isGFX12Plus}, - {{"HW_REG_PERF_SNAPSHOT_PC_HI"}, ID_PERF_SNAPSHOT_PC_HI_gfx12, isGFX12Plus}, - {{""}}, - {{""}}, - {{"HW_REG_SH_MEM_BASES"}, ID_MEM_BASES, isGFX9_GFX10_GFX11}, - 
{{"HW_REG_TBA_LO"}, ID_TBA_LO, isGFX9_GFX10}, - {{"HW_REG_TBA_HI"}, ID_TBA_HI, isGFX9_GFX10}, - {{"HW_REG_TMA_LO"}, ID_TMA_LO, isGFX9_GFX10}, - {{"HW_REG_TMA_HI"}, ID_TMA_HI, isGFX9_GFX10}, - {{"HW_REG_FLAT_SCR_LO"}, ID_FLAT_SCR_LO, isGFX10_GFX11}, - {{"HW_REG_FLAT_SCR_HI"}, ID_FLAT_SCR_HI, isGFX10_GFX11}, - {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK, isGFX10Before1030}, - {{"HW_REG_HW_ID1"}, ID_HW_ID1, isGFX10Plus}, - {{"HW_REG_HW_ID2"}, ID_HW_ID2, isGFX10Plus}, - {{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10}, - {{""}}, - {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11}, - {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250}, - {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11}, - {{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus}, - {{"HW_REG_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus}, - {{"HW_REG_DVGPR_ALLOC_HI"}, ID_DVGPR_ALLOC_HI, isGFX12Plus}, - - // Register numbers reused in GFX11 - {{"HW_REG_PERF_SNAPSHOT_PC_LO"}, ID_PERF_SNAPSHOT_PC_LO_gfx11, isGFX11}, - {{"HW_REG_PERF_SNAPSHOT_PC_HI"}, ID_PERF_SNAPSHOT_PC_HI_gfx11, isGFX11}, - - // Register numbers reused in GFX12+ - {{"HW_REG_STATE_PRIV"}, ID_STATE_PRIV, isGFX12Plus}, - {{"HW_REG_PERF_SNAPSHOT_DATA1"}, ID_PERF_SNAPSHOT_DATA1, isGFX12Plus}, - {{"HW_REG_PERF_SNAPSHOT_DATA2"}, ID_PERF_SNAPSHOT_DATA2, isGFX12Plus}, - {{"HW_REG_EXCP_FLAG_PRIV"}, ID_EXCP_FLAG_PRIV, isGFX12Plus}, - {{"HW_REG_EXCP_FLAG_USER"}, ID_EXCP_FLAG_USER, isGFX12Plus}, - {{"HW_REG_TRAP_CTRL"}, ID_TRAP_CTRL, isGFX12Plus}, - {{"HW_REG_SCRATCH_BASE_LO"}, ID_FLAT_SCR_LO, isGFX12Plus}, - {{"HW_REG_SCRATCH_BASE_HI"}, ID_FLAT_SCR_HI, isGFX12Plus}, - {{"HW_REG_SHADER_CYCLES_LO"}, ID_SHADER_CYCLES, isGFX12Plus}, - - // GFX942 specific registers - {{"HW_REG_XCC_ID"}, ID_XCC_ID, isGFX940}, - {{"HW_REG_SQ_PERF_SNAPSHOT_DATA"}, ID_SQ_PERF_SNAPSHOT_DATA, isGFX940}, - {{"HW_REG_SQ_PERF_SNAPSHOT_DATA1"}, ID_SQ_PERF_SNAPSHOT_DATA1, isGFX940}, - {{"HW_REG_SQ_PERF_SNAPSHOT_PC_LO"}, ID_SQ_PERF_SNAPSHOT_PC_LO, isGFX940}, - {{"HW_REG_SQ_PERF_SNAPSHOT_PC_HI"}, ID_SQ_PERF_SNAPSHOT_PC_HI, isGFX940}, - - // GFX1250 - {{"HW_REG_XNACK_STATE_PRIV"}, ID_XNACK_STATE_PRIV, isGFX1250}, - {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK_gfx1250, isGFX1250}, - - // Aliases - {{"HW_REG_HW_ID"}, ID_HW_ID1, isGFX10}, + {{"HW_REG_WAVE_MODE"}, ID_MODE, isGFX12Plus}, + {{"HW_REG_MODE"}, ID_MODE}, + {{"HW_REG_WAVE_STATUS"}, ID_STATUS, isGFX12Plus}, + {{"HW_REG_STATUS"}, ID_STATUS}, + {{"HW_REG_TRAPSTS"}, ID_TRAPSTS, isNotGFX12Plus}, + {{"HW_REG_HW_ID"}, ID_HW_ID, isNotGFX10Plus}, + {{"HW_REG_WAVE_STATE_PRIV"}, ID_STATE_PRIV, isGFX12Plus}, + {{"HW_REG_STATE_PRIV"}, ID_STATE_PRIV, isGFX12Plus}, + {{"HW_REG_WAVE_GPR_ALLOC"}, ID_GPR_ALLOC, isGFX12Plus}, + {{"HW_REG_GPR_ALLOC"}, ID_GPR_ALLOC}, + {{"HW_REG_WAVE_LDS_ALLOC"}, ID_LDS_ALLOC, isGFX12Plus}, + {{"HW_REG_LDS_ALLOC"}, ID_LDS_ALLOC}, + {{"HW_REG_IB_STS"}, ID_IB_STS}, + {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx12, isGFX12Plus}, + {{"HW_REG_PERF_SNAPSHOT_PC_LO"}, ID_PERF_SNAPSHOT_PC_LO_gfx12, isGFX12Plus}, + {{"HW_REG_PERF_SNAPSHOT_PC_HI"}, ID_PERF_SNAPSHOT_PC_HI_gfx12, isGFX12Plus}, + {{"HW_REG_SH_MEM_BASES"}, ID_MEM_BASES, isGFX9_GFX10_GFX11}, + {{"HW_REG_PERF_SNAPSHOT_DATA1"}, ID_PERF_SNAPSHOT_DATA1, isGFX12Plus}, + {{"HW_REG_TBA_LO"}, ID_TBA_LO, isGFX9_GFX10}, + {{"HW_REG_PERF_SNAPSHOT_DATA2"}, ID_PERF_SNAPSHOT_DATA2, isGFX12Plus}, + {{"HW_REG_TBA_HI"}, ID_TBA_HI, isGFX9_GFX10}, + {{"HW_REG_WAVE_EXCP_FLAG_PRIV"}, ID_EXCP_FLAG_PRIV, isGFX12Plus}, + {{"HW_REG_EXCP_FLAG_PRIV"}, ID_EXCP_FLAG_PRIV, 
isGFX12Plus}, + {{"HW_REG_TMA_LO"}, ID_TMA_LO, isGFX9_GFX10}, + {{"HW_REG_PERF_SNAPSHOT_PC_LO"}, ID_PERF_SNAPSHOT_PC_LO_gfx11, isGFX11}, + {{"HW_REG_WAVE_EXCP_FLAG_USER"}, ID_EXCP_FLAG_USER, isGFX12Plus}, + {{"HW_REG_EXCP_FLAG_USER"}, ID_EXCP_FLAG_USER, isGFX12Plus}, + {{"HW_REG_TMA_HI"}, ID_TMA_HI, isGFX9_GFX10}, + {{"HW_REG_PERF_SNAPSHOT_PC_HI"}, ID_PERF_SNAPSHOT_PC_HI_gfx11, isGFX11}, + {{"HW_REG_WAVE_TRAP_CTRL"}, ID_TRAP_CTRL, isGFX12Plus}, + {{"HW_REG_TRAP_CTRL"}, ID_TRAP_CTRL, isGFX12Plus}, + {{"HW_REG_FLAT_SCR_LO"}, ID_FLAT_SCR_LO, isGFX10_GFX11}, + {{"HW_REG_WAVE_SCRATCH_BASE_LO"}, ID_FLAT_SCR_LO, isGFX12Plus}, + {{"HW_REG_SCRATCH_BASE_LO"}, ID_FLAT_SCR_LO, isGFX12Plus}, + {{"HW_REG_XCC_ID"}, ID_XCC_ID, isGFX940}, + {{"HW_REG_FLAT_SCR_HI"}, ID_FLAT_SCR_HI, isGFX10_GFX11}, + {{"HW_REG_WAVE_SCRATCH_BASE_HI"}, ID_FLAT_SCR_HI, isGFX12Plus}, + {{"HW_REG_SCRATCH_BASE_HI"}, ID_FLAT_SCR_HI, isGFX12Plus}, + {{"HW_REG_SQ_PERF_SNAPSHOT_DATA"}, ID_SQ_PERF_SNAPSHOT_DATA, isGFX940}, + {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK, isGFX10Before1030}, + {{"HW_REG_SQ_PERF_SNAPSHOT_DATA1"}, ID_SQ_PERF_SNAPSHOT_DATA1, isGFX940}, + {{"HW_REG_WAVE_HW_ID1"}, ID_HW_ID1, isGFX12Plus}, + {{"HW_REG_HW_ID1"}, ID_HW_ID1, isGFX10Plus}, + {{"HW_REG_HW_ID"}, ID_HW_ID1, isGFX10}, + {{"HW_REG_SQ_PERF_SNAPSHOT_PC_LO"}, ID_SQ_PERF_SNAPSHOT_PC_LO, isGFX940}, + {{"HW_REG_WAVE_HW_ID2"}, ID_HW_ID2, isGFX12Plus}, + {{"HW_REG_HW_ID2"}, ID_HW_ID2, isGFX10Plus}, + {{"HW_REG_SQ_PERF_SNAPSHOT_PC_HI"}, ID_SQ_PERF_SNAPSHOT_PC_HI, isGFX940}, + {{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10}, + {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11}, + {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250}, + {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11}, + {{"HW_REG_SHADER_CYCLES_LO"}, ID_SHADER_CYCLES, isGFX12Plus}, + {{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus}, + {{"HW_REG_WAVE_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus}, + {{"HW_REG_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus}, + {{"HW_REG_WAVE_DVGPR_ALLOC_HI"}, ID_DVGPR_ALLOC_HI, isGFX12Plus}, + {{"HW_REG_DVGPR_ALLOC_HI"}, ID_DVGPR_ALLOC_HI, isGFX12Plus}, + {{"HW_REG_XNACK_STATE_PRIV"}, ID_XNACK_STATE_PRIV, isGFX1250}, + {{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK_gfx1250, isGFX1250}, + }; // clang-format on // NOLINTEND diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index c80302e03beea..20fa1412a778e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -3410,7 +3410,16 @@ MCPhysReg getVGPRWithMSBs(MCPhysReg Reg, unsigned MSBs, const MCRegisterClass *RC = getVGPRPhysRegClass(Reg, MRI); if (!RC) return AMDGPU::NoRegister; - return RC->getRegister(Idx | (MSBs << 8)); + + Idx |= MSBs << 8; + if (RC->getID() == AMDGPU::VGPR_16RegClassID) { + // This class has 2048 registers with interleaved lo16 and hi16. 
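Aside, not part of the diff: the VGPR_16 register class stores the lo16 and hi16 halves interleaved, which is what the index arithmetic on the following lines implements. As a plain sketch with a hypothetical helper name:

    // Entry 2*N of the interleaved class is the lo16 half of VGPR N and
    // entry 2*N+1 is its hi16 half.
    unsigned interleavedVGPR16Index(unsigned RegIdx, bool IsHi16) {
      return 2 * RegIdx + (IsHi16 ? 1u : 0u);
    }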
+ Idx *= 2; + if (Enc & AMDGPU::HWEncoding::IS_HI16) + ++Idx; + } + + return RC->getRegister(Idx); } std::pair diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 37b0262966160..2b9c063f42a5e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1568,6 +1568,11 @@ bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI); bool hasMAIInsts(const MCSubtargetInfo &STI); bool hasVOPD(const MCSubtargetInfo &STI); bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI); + +inline bool supportsWave32(const MCSubtargetInfo &STI) { + return AMDGPU::isGFX10Plus(STI) && !AMDGPU::isGFX1250(STI); +} + int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR); unsigned hasKernargPreload(const MCSubtargetInfo &STI); bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp index a7a0e33da5e4a..8e601ad8a48fd 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp @@ -681,6 +681,22 @@ static const char *getRegisterName(unsigned RegNum) { {0x2e4d, "COMPUTE_USER_DATA_13"}, {0x2e4e, "COMPUTE_USER_DATA_14"}, {0x2e4f, "COMPUTE_USER_DATA_15"}, + {0x2e50, "COMPUTE_USER_DATA_16"}, + {0x2e51, "COMPUTE_USER_DATA_17"}, + {0x2e52, "COMPUTE_USER_DATA_18"}, + {0x2e53, "COMPUTE_USER_DATA_19"}, + {0x2e54, "COMPUTE_USER_DATA_20"}, + {0x2e55, "COMPUTE_USER_DATA_21"}, + {0x2e56, "COMPUTE_USER_DATA_22"}, + {0x2e57, "COMPUTE_USER_DATA_23"}, + {0x2e58, "COMPUTE_USER_DATA_24"}, + {0x2e59, "COMPUTE_USER_DATA_25"}, + {0x2e5a, "COMPUTE_USER_DATA_26"}, + {0x2e5b, "COMPUTE_USER_DATA_27"}, + {0x2e5c, "COMPUTE_USER_DATA_28"}, + {0x2e5d, "COMPUTE_USER_DATA_29"}, + {0x2e5e, "COMPUTE_USER_DATA_30"}, + {0x2e5f, "COMPUTE_USER_DATA_31"}, {0x2e07, "COMPUTE_NUM_THREAD_X"}, {0x2e08, "COMPUTE_NUM_THREAD_Y"}, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp index b0ed1e5e5c52b..24251e12d57dc 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDKernelCodeTUtils.cpp @@ -40,20 +40,12 @@ using namespace llvm::AMDGPU; // returns. 
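Aside, not part of the diff: the GEN_HAS_MEMBER macro in the next hunk is rewritten in terms of llvm::is_detected. For readers unfamiliar with that utility, a rough standalone equivalent of the detection idiom using std::void_t, with a hypothetical member name:

    #include <type_traits>
    #include <utility>

    // HasMemberFoo<T>::value is true exactly when T has a member named foo.
    template <typename, typename = void>
    struct HasMemberFoo : std::false_type {};

    template <typename T>
    struct HasMemberFoo<T, std::void_t<decltype(std::declval<T>().foo)>>
        : std::true_type {};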
#define GEN_HAS_MEMBER(member) \ class HasMember##member { \ - private: \ - struct KnownWithMember { \ - int member; \ - }; \ - class AmbiguousDerived : public AMDGPUMCKernelCodeT, \ - public KnownWithMember {}; \ template \ - static constexpr std::false_type Test(decltype(U::member) *); \ - template static constexpr std::true_type Test(...); \ + using check_member = decltype(std::declval().member); \ \ public: \ static constexpr bool RESULT = \ - std::is_same_v(nullptr)), \ - std::true_type>; \ + llvm::is_detected::value; \ }; \ class IsMCExpr##member { \ template \ diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index f7279b664ed27..52ee1e874ad86 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -64,6 +64,13 @@ class VOP3P_Mix_Profile + : VOP3P_Mix_Profile { + let IsTrue16 = 1; + let IsRealTrue16 = 1; + let DstRC64 = getVALUDstForVT.ret; +} + multiclass VOP3PInst { def NAME : VOP3P_Pseudo { } // end SubtargetPredicate = isGFX11Plus } +multiclass VOP3_VOP3PInst_t16 { + def NAME : VOP3P_Pseudo; + + if P.HasExtVOP3DPP then + def _dpp : VOP3_DPP_Pseudo { + let VOP3P = 1; + let PseudoInstr = OpName#"_dpp"; + } +} + let isReMaterializable = 1 in { let isCommutable = 1 in { defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile>; @@ -160,12 +177,9 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile { +multiclass MadFmaMixFP32Pats { defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods); defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt); // At least one of the operands needs to be an fpextend of an f16 @@ -189,7 +203,14 @@ multiclass MadFmaMixPats; +} +multiclass MadFmaMixFP16Pats { + defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods); def : GCNPat < (AMDGPUclamp (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)), @@ -243,9 +264,6 @@ multiclass MadFmaMixPats; +} - } // end True16Predicate +multiclass MadFmaMixFP16Pats_t16 { + defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods); + def : GCNPat < + (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))), + (mix_inst_16 $src0_modifiers, $src0, + $src1_modifiers, $src1, + (i32 0), (i32 0), + DSTCLAMP.NONE) + >; - let True16Predicate = UseRealTrue16Insts in { def : GCNPat < - (build_vector (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1), - (vecVT (mixlo_inst $src0_modifiers, $src0, - $src1_modifiers, $src1, - $src2_modifiers, $src2, - DSTCLAMP.NONE, - (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16))) + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), + (mix_inst_16 $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + DSTCLAMP.NONE) >; + def : GCNPat < - (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))), - (vecVT (mixhi_inst $src0_modifiers, $src0, - $src1_modifiers, 
$src1, - $src2_modifiers, $src2, - DSTCLAMP.NONE, - (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16))) + (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))), + (mix_inst_16 $src0_modifiers, $src0, + $src1_modifiers, $src1, + $src2_modifiers, $src2, + DSTCLAMP.ENABLE) >; def : GCNPat < - (build_vector - VT:$elt0, - (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))), - (vecVT (mixhi_inst $src0_modifiers, $src0, - $src1_modifiers, $src1, - $src2_modifiers, $src2, - DSTCLAMP.ENABLE, - (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16))) + (AMDGPUclamp (build_vector + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))), + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))), + (vecVT (REG_SEQUENCE VGPR_32, (mix_inst_16 $lo_src0_modifiers, $lo_src0, + $lo_src1_modifiers, $lo_src1, + $lo_src2_modifiers, $lo_src2, + DSTCLAMP.ENABLE), lo16, + (mix_inst_16 $hi_src0_modifiers, $hi_src0, + $hi_src1_modifiers, $hi_src1, + $hi_src2_modifiers, $hi_src2, + DSTCLAMP.ENABLE), hi16)) >; - } // end True16Predicate } class MinimumMaximumByMinimum3Maximum3VOP3P; +defm : MadFmaMixFP32Pats; +defm : MadFmaMixFP16Pats; } // OtherPredicates = [NoFP32Denormals] } // End SubtargetPredicate = HasMadMixInsts @@ -360,10 +394,19 @@ defm V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3P_Mix_Profile>; } + +// Pseudo true16 inst for v_fma_mixlo/hi_f16 +defm V_FMA_MIX_F16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_f16_t16", VOP3P_Mix_Profile_t16>; } // End FPDPRounding = 1 } -defm : MadFmaMixPats; +defm : MadFmaMixFP32Pats; + +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in +defm : MadFmaMixFP16Pats; +let True16Predicate = UseRealTrue16Insts in +defm : MadFmaMixFP16Pats_t16; } let SubtargetPredicate = HasFmaMixBF16Insts in { @@ -378,10 +421,18 @@ defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile>; } + +// Pseudo true16 inst for v_fma_mixlo/hi_bf16 +defm V_FMA_MIX_BF16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_bf16_t16", VOP3P_Mix_Profile_t16>; } // End FPDPRounding = 1 } // End isCommutable = 1 -defm : MadFmaMixPats; +defm : MadFmaMixFP32Pats; +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in +defm : MadFmaMixFP16Pats; +let True16Predicate = UseRealTrue16Insts in +defm : MadFmaMixFP16Pats_t16; } // End SubtargetPredicate = HasFmaMixBF16Insts def PK_ADD_MINMAX_Profile : VOP3P_Profile { diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 1c42f44765abf..1f773e2a7e0fc 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -610,25 +610,41 @@ void ARMAsmPrinter::emitEndOfAsmFile(Module &M) { // to appear in the .ARM.attributes section in ELF. // Instead of subclassing the MCELFStreamer, we do the work here. 
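Aside, not part of the diff: the ARMAsmPrinter helpers in the next hunk are changed to ignore declarations when checking attribute consistency across a module. A minimal sketch of that pattern, assuming LLVM's Module/Function API and llvm::none_of; this is an illustration, not the patch itself:

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/IR/Module.h"

    // True when every *defined* function in M carries Value for attribute Attr;
    // declarations are skipped, and an empty module trivially passes.
    static bool allDefinitionsAgree(const llvm::Module &M, llvm::StringRef Attr,
                                    llvm::StringRef Value) {
      return llvm::none_of(M, [&](const llvm::Function &F) {
        return !F.isDeclaration() &&
               F.getFnAttribute(Attr).getValueAsString() != Value;
      });
    }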
- // Returns true if all functions have the same function attribute value. - // It also returns true when the module has no functions. +// Returns true if all function definitions have the same function attribute +// value. It also returns true when the module has no functions. static bool checkFunctionsAttributeConsistency(const Module &M, StringRef Attr, StringRef Value) { - return !any_of(M, [&](const Function &F) { - return F.getFnAttribute(Attr).getValueAsString() != Value; - }); + return !any_of(M, [&](const Function &F) { + if (F.isDeclaration()) + return false; + return F.getFnAttribute(Attr).getValueAsString() != Value; + }); } -// Returns true if all functions have the same denormal mode. +// Returns true if all functions definitions have the same denormal mode. // It also returns true when the module has no functions. -static bool checkDenormalAttributeConsistency(const Module &M, - StringRef Attr, +static bool checkDenormalAttributeConsistency(const Module &M, StringRef Attr, DenormalMode Value) { return !any_of(M, [&](const Function &F) { + if (F.isDeclaration()) + return false; StringRef AttrVal = F.getFnAttribute(Attr).getValueAsString(); return parseDenormalFPAttribute(AttrVal) != Value; }); } +// Returns true if all functions have different denormal modes. +static bool checkDenormalAttributeInconsistency(const Module &M) { + auto F = M.functions().begin(); + auto E = M.functions().end(); + if (F == E) + return false; + DenormalMode Value = F->getDenormalModeRaw(); + ++F; + return std::any_of(F, E, [&](const Function &F) { + return !F.isDeclaration() && F.getDenormalModeRaw() != Value; + }); +} + void ARMAsmPrinter::emitAttributes() { MCTargetStreamer &TS = *OutStreamer->getTargetStreamer(); ARMTargetStreamer &ATS = static_cast(TS); @@ -695,7 +711,9 @@ void ARMAsmPrinter::emitAttributes() { DenormalMode::getPositiveZero())) ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::PositiveZero); - else if (!TM.Options.UnsafeFPMath) + else if (checkDenormalAttributeInconsistency(*MMI->getModule()) || + checkDenormalAttributeConsistency( + *MMI->getModule(), "denormal-fp-math", DenormalMode::getIEEE())) ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::IEEEDenormals); else { @@ -730,7 +748,7 @@ void ARMAsmPrinter::emitAttributes() { TM.Options.NoTrappingFPMath) ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions, ARMBuildAttrs::Not_Allowed); - else if (!TM.Options.UnsafeFPMath) { + else { ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions, ARMBuildAttrs::Allowed); // If the user has permitted this code to choose the IEEE 754 diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 5c35b3327c16d..22769dbf38719 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -6510,14 +6510,14 @@ bool ARMBaseInstrInfo::shouldOutlineFromFunctionByDefault( return Subtarget.isMClass() && MF.getFunction().hasMinSize(); } -bool ARMBaseInstrInfo::isReallyTriviallyReMaterializable( +bool ARMBaseInstrInfo::isReMaterializableImpl( const MachineInstr &MI) const { // Try hard to rematerialize any VCTPs because if we spill P0, it will block // the tail predication conversion. This means that the element count // register has to be live for longer, but that has to be better than // spill/restore and VPT predication. 
return (isVCTP(&MI) && !isPredicated(MI)) || - TargetInstrInfo::isReallyTriviallyReMaterializable(MI); + TargetInstrInfo::isReMaterializableImpl(MI); } unsigned llvm::getBLXOpcode(const MachineFunction &MF) { diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h index 71de3c6ad597a..2869e7f708046 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -479,7 +479,7 @@ class ARMBaseInstrInfo : public ARMGenInstrInfo { MachineInstr *canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI, const TargetInstrInfo *TII) const; - bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; + bool isReMaterializableImpl(const MachineInstr &MI) const override; private: /// Modeling special VFP / NEON fp MLA / MLS hazards. diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 57141ab69223f..9945ecc9c96e0 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -27,6 +27,8 @@ #include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/Debug.h" +#include + using namespace llvm; #define DEBUG_TYPE "arm-pseudo" diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 9052cbfa89deb..f4ac6bb76b3fe 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1482,7 +1482,7 @@ bool ARMTargetLowering::useSoftFloat() const { return Subtarget->useSoftFloat(); } -bool ARMTargetLowering::shouldExpandCmpUsingSelects(EVT VT) const { +bool ARMTargetLowering::preferSelectsOverBooleanArithmetic(EVT VT) const { return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32; } @@ -5573,7 +5573,7 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, llvm_unreachable("Unknown VFP cmp argument!"); } -/// OptimizeVFPBrcond - With nnan, it's legal to optimize some +/// OptimizeVFPBrcond - With nnan and without daz, it's legal to optimize some /// f32 and even f64 comparisons to integer ones. SDValue ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { @@ -5729,9 +5729,9 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { } SDNodeFlags Flags = Op->getFlags(); - if ((getTargetMachine().Options.UnsafeFPMath || Flags.hasNoNaNs()) && - (DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() && - DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE()) && + if (Flags.hasNoNaNs() && + DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() && + DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE() && (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE || CC == ISD::SETUNE)) { if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) @@ -20428,9 +20428,9 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, if (CVal >= -255 && CVal <= -1) break; } else { - // This must be a constant between -4095 and 4095. It is not clear - // what this constraint is intended for. Implemented for - // compatibility with GCC. + // This must be a constant between -4095 and 4095. This is suitable + // for use as the immediate offset field in LDR and STR instructions + // such as LDR r0,[r1,#offset]. 
if (CVal >= -4095 && CVal <= 4095) break; } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 8e417ac3e1a7b..fa130a153b0de 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -605,7 +605,7 @@ class VectorType; bool preferZeroCompareBranch() const override { return true; } - bool shouldExpandCmpUsingSelects(EVT VT) const override; + bool preferSelectsOverBooleanArithmetic(EVT VT) const override; bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td index e50740f7d57c5..1ad2485dce17f 100644 --- a/llvm/lib/Target/ARM/ARMInstrFormats.td +++ b/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -1219,6 +1219,7 @@ class Thumb1sI pattern> : InstThumb { + bits<0> s; let OutOperandList = !con(oops, (outs s_cc_out:$s)); let InOperandList = !con(iops, (ins pred:$p)); let AsmString = !strconcat(opc, "${s}${p}", asm); diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 9f600e0c685ab..3329beab63ddf 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -88,18 +88,16 @@ ARMFrameLowering *ARMSubtarget::initializeFrameLowering(StringRef CPU, ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const ARMBaseTargetMachine &TM, bool IsLittle, - bool MinSize) + bool MinSize, DenormalMode DM) : ARMGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), UseMulOps(UseFusedMulOps), CPUString(CPU), OptMinSize(MinSize), - IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM), + IsLittle(IsLittle), DM(DM), TargetTriple(TT), Options(TM.Options), TM(TM), FrameLowering(initializeFrameLowering(CPU, FS)), // At this point initializeSubtargetDependencies has been called so // we can query directly. - InstrInfo(isThumb1Only() - ? (ARMBaseInstrInfo *)new Thumb1InstrInfo(*this) - : !isThumb() - ? (ARMBaseInstrInfo *)new ARMInstrInfo(*this) - : (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)), + InstrInfo(isThumb1Only() ? (ARMBaseInstrInfo *)new Thumb1InstrInfo(*this) + : !isThumb() ? (ARMBaseInstrInfo *)new ARMInstrInfo(*this) + : (ARMBaseInstrInfo *)new Thumb2InstrInfo(*this)), TLInfo(TM, *this) { CallLoweringInfo.reset(new ARMCallLowering(*getTargetLowering())); @@ -224,7 +222,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { // NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default. const FeatureBitset &Bits = getFeatureBits(); if ((Bits[ARM::ProcA5] || Bits[ARM::ProcA8]) && // Where this matters - (Options.UnsafeFPMath || isTargetDarwin())) + (isTargetDarwin() || DM == DenormalMode::getPreserveSign())) HasNEONForFP = true; if (isRWPI()) diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index 637eb4560e0f1..b2d368e0ca175 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -186,6 +186,12 @@ class ARMSubtarget : public ARMGenSubtargetInfo { /// IsLittle - The target is Little Endian bool IsLittle; + /// DM - Denormal mode + /// NEON and VFP RunFast mode are not IEEE 754 compliant, + /// use this field to determine whether to generate NEON/VFP + /// instructions in related function. + DenormalMode DM; + /// TargetTriple - What processor and OS we're targeting. 
Triple TargetTriple; @@ -206,7 +212,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { /// ARMSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const ARMBaseTargetMachine &TM, bool IsLittle, - bool MinSize = false); + bool MinSize = false, DenormalMode DM = DenormalMode::getIEEE()); /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 131b9332e9ade..86740a92b32c5 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -229,6 +229,10 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { if (F.hasMinSize()) Key += "+minsize"; + DenormalMode DM = F.getDenormalModeRaw(); + if (DM != DenormalMode::getIEEE()) + Key += "denormal-fp-math=" + DM.str(); + auto &I = SubtargetMap[Key]; if (!I) { // This needs to be done before we create a new subtarget since any @@ -236,7 +240,7 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const { // function that reside in TargetOptions. resetTargetOptions(F); I = std::make_unique(TargetTriple, CPU, FS, *this, isLittle, - F.hasMinSize()); + F.hasMinSize(), DM); if (!I->isThumb() && !I->hasARMOps()) F.getContext().emitError("Function '" + F.getName() + "' uses ARM " diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 0e974838a7c6b..f60660b12baca 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -135,17 +135,17 @@ class UnwindContext { MCRegister getFPReg() const { return FPReg; } void emitFnStartLocNotes() const { - for (const SMLoc &Loc : FnStartLocs) + for (SMLoc Loc : FnStartLocs) Parser.Note(Loc, ".fnstart was specified here"); } void emitCantUnwindLocNotes() const { - for (const SMLoc &Loc : CantUnwindLocs) + for (SMLoc Loc : CantUnwindLocs) Parser.Note(Loc, ".cantunwind was specified here"); } void emitHandlerDataLocNotes() const { - for (const SMLoc &Loc : HandlerDataLocs) + for (SMLoc Loc : HandlerDataLocs) Parser.Note(Loc, ".handlerdata was specified here"); } diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 56112112a0293..d358913d38af9 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -119,6 +119,8 @@ class VPTStatus { class ARMDisassembler : public MCDisassembler { public: std::unique_ptr MCII; + mutable ITStatus ITBlock; + mutable VPTStatus VPTBlock; ARMDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, const MCInstrInfo *MCII) @@ -146,10 +148,6 @@ class ARMDisassembler : public MCDisassembler { ArrayRef Bytes, uint64_t Address, raw_ostream &CStream) const; - mutable ITStatus ITBlock; - mutable VPTStatus VPTBlock; - - void AddThumb1SBit(MCInst &MI, bool InITBlock) const; bool isVectorPredicable(const MCInst &MI) const; DecodeStatus AddThumbPredicate(MCInst&) const; void UpdateThumbPredicate(DecodeStatus &S, MCInst &MI) const; @@ -159,12 +157,6 @@ class ARMDisassembler : public MCDisassembler { } // end anonymous namespace -// Forward declare these because the autogenerated code will reference them. -// Definitions are further down. 
-static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder); - typedef DecodeStatus OperandDecoder(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder); @@ -636,6 +628,17 @@ static DecodeStatus DecodeCCOutOperand(MCInst &Inst, unsigned Val, return MCDisassembler::Success; } +// This overload is called when decoding `s_cc_out` operand, which is not +// encoded into instruction. It is only used in Thumb1 instructions. +static DecodeStatus DecodeCCOutOperand(MCInst &Inst, + const MCDisassembler *Decoder) { + const auto *D = static_cast(Decoder); + // Thumb1 instructions define CPSR unless they are inside an IT block. + MCRegister CCR = D->ITBlock.instrInITBlock() ? ARM::NoRegister : ARM::CPSR; + Inst.addOperand(MCOperand::createReg(CCR)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeSORegImmOperand(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder) { @@ -3158,6 +3161,65 @@ static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn, return S; } +static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val, uint64_t Address, + const MCDisassembler *Decoder) { + int imm = Val & 0xFF; + if (Val == 0) + imm = INT32_MIN; + else if (!(Val & 0x100)) + imm *= -1; + Inst.addOperand(MCOperand::createImm(imm)); + + return MCDisassembler::Success; +} + +static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val, + uint64_t Address, + const MCDisassembler *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Rn = fieldFromInstruction(Val, 9, 4); + unsigned imm = fieldFromInstruction(Val, 0, 9); + + // Thumb stores cannot use PC as dest register. + switch (Inst.getOpcode()) { + case ARM::t2STRT: + case ARM::t2STRBT: + case ARM::t2STRHT: + case ARM::t2STRi8: + case ARM::t2STRHi8: + case ARM::t2STRBi8: + if (Rn == 15) + return MCDisassembler::Fail; + break; + default: + break; + } + + // Some instructions always use an additive offset. 
+ switch (Inst.getOpcode()) { + case ARM::t2LDRT: + case ARM::t2LDRBT: + case ARM::t2LDRHT: + case ARM::t2LDRSBT: + case ARM::t2LDRSHT: + case ARM::t2STRT: + case ARM::t2STRBT: + case ARM::t2STRHT: + imm |= 0x100; + break; + default: + break; + } + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeT2Imm8(Inst, imm, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn, uint64_t Address, const MCDisassembler *Decoder) { @@ -3467,18 +3529,6 @@ static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst, unsigned Val, return S; } -static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val, uint64_t Address, - const MCDisassembler *Decoder) { - int imm = Val & 0xFF; - if (Val == 0) - imm = INT32_MIN; - else if (!(Val & 0x100)) - imm *= -1; - Inst.addOperand(MCOperand::createImm(imm)); - - return MCDisassembler::Success; -} - template static DecodeStatus DecodeT2Imm7(MCInst &Inst, unsigned Val, uint64_t Address, const MCDisassembler *Decoder) { @@ -3494,53 +3544,6 @@ static DecodeStatus DecodeT2Imm7(MCInst &Inst, unsigned Val, uint64_t Address, return MCDisassembler::Success; } -static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val, - uint64_t Address, - const MCDisassembler *Decoder) { - DecodeStatus S = MCDisassembler::Success; - - unsigned Rn = fieldFromInstruction(Val, 9, 4); - unsigned imm = fieldFromInstruction(Val, 0, 9); - - // Thumb stores cannot use PC as dest register. - switch (Inst.getOpcode()) { - case ARM::t2STRT: - case ARM::t2STRBT: - case ARM::t2STRHT: - case ARM::t2STRi8: - case ARM::t2STRHi8: - case ARM::t2STRBi8: - if (Rn == 15) - return MCDisassembler::Fail; - break; - default: - break; - } - - // Some instructions always use an additive offset. - switch (Inst.getOpcode()) { - case ARM::t2LDRT: - case ARM::t2LDRBT: - case ARM::t2LDRHT: - case ARM::t2LDRSBT: - case ARM::t2LDRSHT: - case ARM::t2STRT: - case ARM::t2STRBT: - case ARM::t2STRHT: - imm |= 0x100; - break; - default: - break; - } - - if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) - return MCDisassembler::Fail; - if (!Check(S, DecodeT2Imm8(Inst, imm, Address, Decoder))) - return MCDisassembler::Fail; - - return S; -} - template static DecodeStatus DecodeTAddrModeImm7(MCInst &Inst, unsigned Val, uint64_t Address, @@ -6130,26 +6133,6 @@ DecodeStatus ARMDisassembler::getARMInstruction(MCInst &MI, uint64_t &Size, return MCDisassembler::Fail; } -// Thumb1 instructions don't have explicit S bits. Rather, they -// implicitly set CPSR. Since it's not represented in the encoding, the -// auto-generated decoder won't inject the CPSR operand. We need to fix -// that as a post-pass. -void ARMDisassembler::AddThumb1SBit(MCInst &MI, bool InITBlock) const { - const MCInstrDesc &MCID = MCII->get(MI.getOpcode()); - MCInst::iterator I = MI.begin(); - for (unsigned i = 0; i < MCID.NumOperands; ++i, ++I) { - if (I == MI.end()) break; - if (MCID.operands()[i].isOptionalDef() && - MCID.operands()[i].RegClass == ARM::CCRRegClassID) { - if (i > 0 && MCID.operands()[i - 1].isPredicate()) - continue; - MI.insert(I, - MCOperand::createReg(InITBlock ? 
ARM::NoRegister : ARM::CPSR)); - return; - } - } -} - bool ARMDisassembler::isVectorPredicable(const MCInst &MI) const { const MCInstrDesc &MCID = MCII->get(MI.getOpcode()); for (unsigned i = 0; i < MCID.NumOperands; ++i) { @@ -6343,9 +6326,7 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, STI); if (Result) { Size = 2; - bool InITBlock = ITBlock.instrInITBlock(); Check(Result, AddThumbPredicate(MI)); - AddThumb1SBit(MI, InITBlock); return Result; } @@ -6411,9 +6392,7 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size, decodeInstruction(DecoderTableThumb32, MI, Insn32, Address, this, STI); if (Result != MCDisassembler::Fail) { Size = 4; - bool InITBlock = ITBlock.instrInITBlock(); Check(Result, AddThumbPredicate(MI)); - AddThumb1SBit(MI, InITBlock); return Result; } diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index c56b589519533..4a87c638f5fc3 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -30,7 +30,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/EndianStream.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/TargetParser/Triple.h" #include diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h index a58244a3d83f3..dfd896fbc735f 100644 --- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h +++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h @@ -48,6 +48,10 @@ class BPFMCAsmInfo : public MCAsmInfoELF { void setDwarfUsesRelocationsAcrossSections(bool enable) { DwarfUsesRelocationsAcrossSections = enable; } + + MCSection *getNonexecutableStackSection(MCContext &Ctx) const override { + return nullptr; + } }; } diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index c65ead45e2c7e..228114c5c24b2 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -424,6 +424,7 @@ def Saturate : DXILOp<7, unary> { def IsNaN : DXILOp<8, isSpecialFloat> { let Doc = "Determines if the specified value is NaN."; + let intrinsics = [IntrinSelect]; let arguments = [OverloadTy]; let result = Int1Ty; let overloads = [Overloads]; diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index e2469d8df957f..ebb7c2607c0c8 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -213,6 +213,7 @@ static bool isIntrinsicExpansion(Function &F) { case Intrinsic::dx_nclamp: case Intrinsic::dx_degrees: case Intrinsic::dx_isinf: + case Intrinsic::dx_isnan: case Intrinsic::dx_lerp: case Intrinsic::dx_normalize: case Intrinsic::dx_fdot: @@ -1024,6 +1025,9 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) { case Intrinsic::dx_isinf: Result = expand16BitIsInf(Orig); break; + case Intrinsic::dx_isnan: + Result = expand16BitIsNaN(Orig); + break; case Intrinsic::dx_lerp: Result = expandLerpIntrinsic(Orig); break; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index 577b4624458b9..610d8b63bba27 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/DXILMetadataAnalysis.h" #include "llvm/Analysis/DXILResource.h" #include 
"llvm/CodeGen/Passes.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" @@ -24,6 +25,7 @@ #include "llvm/IR/IntrinsicsDirectX.h" #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" +#include "llvm/IR/Use.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" @@ -42,6 +44,7 @@ class OpLowerer { DXILResourceTypeMap &DRTM; const ModuleMetadataInfo &MMDI; SmallVector CleanupCasts; + Function *CleanupNURI = nullptr; public: OpLowerer(Module &M, DXILResourceMap &DRM, DXILResourceTypeMap &DRTM, @@ -195,6 +198,21 @@ class OpLowerer { CleanupCasts.clear(); } + void cleanupNonUniformResourceIndexCalls() { + // Replace all NonUniformResourceIndex calls with their argument. + if (!CleanupNURI) + return; + for (User *U : make_early_inc_range(CleanupNURI->users())) { + CallInst *CI = dyn_cast(U); + if (!CI) + continue; + CI->replaceAllUsesWith(CI->getArgOperand(0)); + CI->eraseFromParent(); + } + CleanupNURI->eraseFromParent(); + CleanupNURI = nullptr; + } + // Remove the resource global associated with the handleFromBinding call // instruction and their uses as they aren't needed anymore. // TODO: We should verify that all the globals get removed. @@ -229,6 +247,31 @@ class OpLowerer { NameGlobal->removeFromParent(); } + bool hasNonUniformIndex(Value *IndexOp) { + if (isa(IndexOp)) + return false; + + SmallVector WorkList; + WorkList.push_back(IndexOp); + + while (!WorkList.empty()) { + Value *V = WorkList.pop_back_val(); + if (auto *CI = dyn_cast(V)) { + if (CI->getCalledFunction()->getIntrinsicID() == + Intrinsic::dx_resource_nonuniformindex) + return true; + } + if (auto *U = llvm::dyn_cast(V)) { + for (llvm::Value *Op : U->operands()) { + if (isa(Op)) + continue; + WorkList.push_back(Op); + } + } + } + return false; + } + [[nodiscard]] bool lowerToCreateHandle(Function &F) { IRBuilder<> &IRB = OpBuilder.getIRB(); Type *Int8Ty = IRB.getInt8Ty(); @@ -250,13 +293,12 @@ class OpLowerer { IndexOp = IRB.CreateAdd(IndexOp, ConstantInt::get(Int32Ty, Binding.LowerBound)); - // FIXME: The last argument is a NonUniform flag which needs to be set - // based on resource analysis. - // https://github.com/llvm/llvm-project/issues/155701 + bool HasNonUniformIndex = + (Binding.Size == 1) ? false : hasNonUniformIndex(IndexOp); std::array Args{ ConstantInt::get(Int8Ty, llvm::to_underlying(RC)), ConstantInt::get(Int32Ty, Binding.RecordID), IndexOp, - ConstantInt::get(Int1Ty, false)}; + ConstantInt::get(Int1Ty, HasNonUniformIndex)}; Expected OpCall = OpBuilder.tryCreateOp(OpCode::CreateHandle, Args, CI->getName()); if (Error E = OpCall.takeError()) @@ -300,11 +342,10 @@ class OpLowerer { : Binding.LowerBound + Binding.Size - 1; Constant *ResBind = OpBuilder.getResBind(Binding.LowerBound, UpperBound, Binding.Space, RC); - // FIXME: The last argument is a NonUniform flag which needs to be set - // based on resource analysis. - // https://github.com/llvm/llvm-project/issues/155701 - Constant *NonUniform = ConstantInt::get(Int1Ty, false); - std::array BindArgs{ResBind, IndexOp, NonUniform}; + bool NonUniformIndex = + (Binding.Size == 1) ? 
false : hasNonUniformIndex(IndexOp); + Constant *NonUniformOp = ConstantInt::get(Int1Ty, NonUniformIndex); + std::array BindArgs{ResBind, IndexOp, NonUniformOp}; Expected OpBind = OpBuilder.tryCreateOp( OpCode::CreateHandleFromBinding, BindArgs, CI->getName()); if (Error E = OpBind.takeError()) @@ -868,6 +909,11 @@ class OpLowerer { case Intrinsic::dx_resource_getpointer: HasErrors |= lowerGetPointer(F); break; + case Intrinsic::dx_resource_nonuniformindex: + assert(!CleanupNURI && + "overloaded llvm.dx.resource.nonuniformindex intrinsics?"); + CleanupNURI = &F; + break; case Intrinsic::dx_resource_load_typedbuffer: HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/true); break; @@ -908,8 +954,10 @@ class OpLowerer { } Updated = true; } - if (Updated && !HasErrors) + if (Updated && !HasErrors) { cleanupHandleCasts(); + cleanupNonUniformResourceIndexCalls(); + } return Updated; } diff --git a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp index 7e93474e73118..6e95a4232fabe 100644 --- a/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp +++ b/llvm/lib/Target/DirectX/DXILPostOptimizationValidation.cpp @@ -160,6 +160,41 @@ tripleToVisibility(llvm::Triple::EnvironmentType ET) { } } +static void reportIfDeniedShaderStageAccess(Module &M, + const dxbc::RootFlags &Flags, + const dxbc::RootFlags &Mask) { + if ((Flags & Mask) != Mask) + return; + + SmallString<128> Message; + raw_svector_ostream OS(Message); + OS << "Shader has root bindings but root signature uses a DENY flag to " + "disallow root binding access to the shader stage."; + M.getContext().diagnose(DiagnosticInfoGeneric(Message)); +} + +static std::optional +getEnvironmentDenyFlagMask(Triple::EnvironmentType ShaderProfile) { + switch (ShaderProfile) { + case Triple::Pixel: + return dxbc::RootFlags::DenyPixelShaderRootAccess; + case Triple::Vertex: + return dxbc::RootFlags::DenyVertexShaderRootAccess; + case Triple::Geometry: + return dxbc::RootFlags::DenyGeometryShaderRootAccess; + case Triple::Hull: + return dxbc::RootFlags::DenyHullShaderRootAccess; + case Triple::Domain: + return dxbc::RootFlags::DenyDomainShaderRootAccess; + case Triple::Mesh: + return dxbc::RootFlags::DenyMeshShaderRootAccess; + case Triple::Amplification: + return dxbc::RootFlags::DenyAmplificationShaderRootAccess; + default: + return std::nullopt; + } +} + static void validateRootSignature(Module &M, const mcdxbc::RootSignatureDesc &RSD, dxil::ModuleMetadataInfo &MMI, @@ -225,7 +260,9 @@ static void validateRootSignature(Module &M, Builder.findOverlapping(ReportedBinding); reportOverlappingRegisters(M, ReportedBinding, Overlaping); }); + const hlsl::BoundRegs &BoundRegs = Builder.takeBoundRegs(); + bool HasBindings = false; for (const ResourceInfo &RI : DRM) { const ResourceInfo::ResourceBinding &Binding = RI.getBinding(); const dxil::ResourceTypeInfo &RTI = DRTM[RI.getHandleTy()]; @@ -236,22 +273,33 @@ static void validateRootSignature(Module &M, BoundRegs.findBoundReg(RC, Binding.Space, Binding.LowerBound, Binding.LowerBound + Binding.Size - 1); - if (Reg != nullptr) { - const auto *ParamInfo = - static_cast(Reg->Cookie); - - if (RC != ResourceClass::SRV && RC != ResourceClass::UAV) - continue; + if (!Reg) { + reportRegNotBound(M, RC, Binding); + continue; + } - if (ParamInfo->Type == dxbc::RootParameterType::DescriptorTable) - continue; + const auto *ParamInfo = + static_cast(Reg->Cookie); - if (RK != ResourceKind::RawBuffer && RK != ResourceKind::StructuredBuffer) - 
reportInvalidHandleTyError(M, RC, Binding); - } else { - reportRegNotBound(M, RC, Binding); + bool IsSRVOrUAV = RC == ResourceClass::SRV || RC == ResourceClass::UAV; + bool IsDescriptorTable = + ParamInfo->Type == dxbc::RootParameterType::DescriptorTable; + bool IsRawOrStructuredBuffer = + RK != ResourceKind::RawBuffer && RK != ResourceKind::StructuredBuffer; + if (IsSRVOrUAV && !IsDescriptorTable && IsRawOrStructuredBuffer) { + reportInvalidHandleTyError(M, RC, Binding); + continue; } + + HasBindings = true; } + + if (!HasBindings) + return; + + if (std::optional Mask = + getEnvironmentDenyFlagMask(MMI.ShaderProfile)) + reportIfDeniedShaderStageAccess(M, dxbc::RootFlags(RSD.Flags), *Mask); } static mcdxbc::RootSignatureDesc * diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 5153d24070dc9..68fd3e0bc74c7 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -30,6 +30,7 @@ bool DirectXTTIImpl::isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, switch (ID) { case Intrinsic::dx_asdouble: case Intrinsic::dx_isinf: + case Intrinsic::dx_isnan: case Intrinsic::dx_firstbitlow: case Intrinsic::dx_firstbituhigh: case Intrinsic::dx_firstbitshigh: @@ -48,6 +49,7 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_firstbituhigh: case Intrinsic::dx_frac: case Intrinsic::dx_isinf: + case Intrinsic::dx_isnan: case Intrinsic::dx_rsqrt: case Intrinsic::dx_saturate: case Intrinsic::dx_splitdouble: diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index df2cfd07d8cc0..4d96cfadc79ff 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -2483,8 +2483,15 @@ OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) { } ++I; + // Upper bits of the vdeal/vshuff parameter that do not cover any byte in + // the vector are ignored. Technically, A2_tfrsi takes a signed value, which + // is sign-extended to 32 bit if there is no extender. The practical + // advantages are that signed values are smaller in common use cases and are + // not sensitive to the vector size. + int SS = SignExtend32(S, HwLog); + NodeTemplate Res; - Results.push(Hexagon::A2_tfrsi, MVT::i32, {getConst32(S, dl)}); + Results.push(Hexagon::A2_tfrsi, MVT::i32, {getSignedConst32(SS, dl)}); Res.Opc = IsInc ? 
Hexagon::V6_vshuffvdd : Hexagon::V6_vdealvdd; Res.Ty = PairTy; Res.Ops = {OpRef::hi(Arg), OpRef::lo(Arg), OpRef::res(-1)}; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp index dfe0fa973c9b3..021dceb0e0789 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp @@ -25,7 +25,6 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/EndianStream.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include #include diff --git a/llvm/lib/Target/Hexagon/RDFCopy.cpp b/llvm/lib/Target/Hexagon/RDFCopy.cpp index fafdad08909dd..3b1d3bd89680b 100644 --- a/llvm/lib/Target/Hexagon/RDFCopy.cpp +++ b/llvm/lib/Target/Hexagon/RDFCopy.cpp @@ -108,7 +108,7 @@ bool CopyPropagation::scanBlock(MachineBasicBlock *B) { for (NodeAddr IA : BA.Addr->members(DFG)) { if (DFG.IsCode(IA)) { NodeAddr SA = IA; - EqualityMap EM(std::less(DFG.getPRI())); + EqualityMap EM(RegisterRefLess(DFG.getPRI())); if (interpretAsCopy(SA.Addr->getCode(), EM)) recordCopy(SA, EM); } diff --git a/llvm/lib/Target/Hexagon/RDFCopy.h b/llvm/lib/Target/Hexagon/RDFCopy.h index e4fb89892831d..92b2c65982655 100644 --- a/llvm/lib/Target/Hexagon/RDFCopy.h +++ b/llvm/lib/Target/Hexagon/RDFCopy.h @@ -25,8 +25,8 @@ class MachineInstr; namespace rdf { struct CopyPropagation { - CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg), - RDefMap(std::less(DFG.getPRI())) {} + CopyPropagation(DataFlowGraph &dfg) + : MDT(dfg.getDT()), DFG(dfg), RDefMap(RegisterRefLess(DFG.getPRI())) {} virtual ~CopyPropagation() = default; @@ -35,7 +35,7 @@ namespace rdf { bool trace() const { return Trace; } DataFlowGraph &getDFG() { return DFG; } - using EqualityMap = std::map; + using EqualityMap = std::map; virtual bool interpretAsCopy(const MachineInstr *MI, EqualityMap &EM); private: @@ -45,7 +45,7 @@ namespace rdf { bool Trace = false; // map: register -> (map: stmt -> reaching def) - std::map> RDefMap; + std::map, RegisterRefLess> RDefMap; // map: statement -> (map: dst reg -> src reg) std::map CopyMap; std::vector Copies; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp index 07e722b9a6591..442f0a46a4983 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp @@ -113,10 +113,11 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { APInt SplatValue, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; - unsigned Op; + unsigned Op = 0; EVT ResTy = BVN->getValueType(0); bool Is128Vec = BVN->getValueType(0).is128BitVector(); bool Is256Vec = BVN->getValueType(0).is256BitVector(); + SDNode *Res; if (!Subtarget->hasExtLSX() || (!Is128Vec && !Is256Vec)) break; @@ -124,26 +125,25 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { HasAnyUndefs, 8)) break; - switch (SplatBitSize) { - default: - break; - case 8: - Op = Is256Vec ? LoongArch::PseudoXVREPLI_B : LoongArch::PseudoVREPLI_B; - break; - case 16: - Op = Is256Vec ? LoongArch::PseudoXVREPLI_H : LoongArch::PseudoVREPLI_H; - break; - case 32: - Op = Is256Vec ? LoongArch::PseudoXVREPLI_W : LoongArch::PseudoVREPLI_W; - break; - case 64: - Op = Is256Vec ? LoongArch::PseudoXVREPLI_D : LoongArch::PseudoVREPLI_D; - break; - } - - SDNode *Res; // If we have a signed 10 bit integer, we can splat it directly. 
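// In outline: splats that fit a signed 10-bit immediate are selected just
// below as the element-width specific [X]VREPLI pseudos (for example, a
// v4i32 splat of -3 becomes PseudoVREPLI_W with immediate -3), after the
// value is sign-extended or truncated to the element type. Splats that do
// not fit get a second chance further down through isImmVLDILegalForMode1,
// which matches the [x]vldi encodings with imm[12] set.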
if (SplatValue.isSignedIntN(10)) { + switch (SplatBitSize) { + default: + break; + case 8: + Op = Is256Vec ? LoongArch::PseudoXVREPLI_B : LoongArch::PseudoVREPLI_B; + break; + case 16: + Op = Is256Vec ? LoongArch::PseudoXVREPLI_H : LoongArch::PseudoVREPLI_H; + break; + case 32: + Op = Is256Vec ? LoongArch::PseudoXVREPLI_W : LoongArch::PseudoVREPLI_W; + break; + case 64: + Op = Is256Vec ? LoongArch::PseudoXVREPLI_D : LoongArch::PseudoVREPLI_D; + break; + } + EVT EleType = ResTy.getVectorElementType(); APInt Val = SplatValue.sextOrTrunc(EleType.getSizeInBits()); SDValue Imm = CurDAG->getTargetConstant(Val, DL, EleType); @@ -151,6 +151,21 @@ void LoongArchDAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, Res); return; } + + // Select appropriate [x]vldi instructions for some special constant splats, + // where the immediate value `imm[12] == 1` for used [x]vldi instructions. + const auto &TLI = + *static_cast(getTargetLowering()); + std::pair ConvertVLDI = + TLI.isImmVLDILegalForMode1(SplatValue, SplatBitSize); + if (ConvertVLDI.first) { + Op = Is256Vec ? LoongArch::XVLDI : LoongArch::VLDI; + SDValue Imm = CurDAG->getSignedTargetConstant( + SignExtend32<13>(ConvertVLDI.second), DL, MVT::i32); + Res = CurDAG->getMachineNode(Op, DL, ResTy, Imm); + ReplaceNode(Node, Res); + return; + } break; } } diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 32baa2d111270..098bcfa67d1d3 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -340,6 +340,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::ABDS, VT, Legal); setOperationAction(ISD::ABDU, VT, Legal); setOperationAction(ISD::SADDSAT, VT, Legal); @@ -419,6 +420,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::ABDS, VT, Legal); setOperationAction(ISD::ABDU, VT, Legal); setOperationAction(ISD::SADDSAT, VT, Legal); @@ -666,6 +668,7 @@ SDValue LoongArchTargetLowering::lowerVECREDUCE_ADD(SDValue Op, unsigned NumEles = Val.getSimpleValueType().getVectorNumElements(); unsigned EleBits = Val.getSimpleValueType().getScalarSizeInBits(); + unsigned ResBits = OpVT.getScalarSizeInBits(); unsigned LegalVecSize = 128; bool isLASX256Vector = @@ -691,10 +694,11 @@ SDValue LoongArchTargetLowering::lowerVECREDUCE_ADD(SDValue Op, if (isLASX256Vector) { SDValue Tmp = DAG.getNode(LoongArchISD::XVPERMI, DL, MVT::v4i64, Val, - DAG.getConstant(2, DL, MVT::i64)); + DAG.getConstant(2, DL, Subtarget.getGRLenVT())); Val = DAG.getNode(ISD::ADD, DL, MVT::v4i64, Tmp, Val); } + Val = DAG.getBitcast(MVT::getVectorVT(OpVT, LegalVecSize / ResBits), Val); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Val, DAG.getConstant(0, DL, Subtarget.getGRLenVT())); } @@ -727,15 +731,16 @@ SDValue LoongArchTargetLowering::lowerVECREDUCE(SDValue Op, unsigned Opcode = ISD::getVecReduceBaseOpcode(Op.getOpcode()); MVT VecTy = Val.getSimpleValueType(); + MVT GRLenVT = Subtarget.getGRLenVT(); for (int i = NumEles; i > 1; i /= 2) { - SDValue ShiftAmt = DAG.getConstant(i * EleBits / 16, DL, MVT::i64); + 
SDValue ShiftAmt = DAG.getConstant(i * EleBits / 16, DL, GRLenVT); SDValue Tmp = DAG.getNode(LoongArchISD::VBSRL, DL, VecTy, Val, ShiftAmt); Val = DAG.getNode(Opcode, DL, VecTy, Tmp, Val); } return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Val, - DAG.getConstant(0, DL, Subtarget.getGRLenVT())); + DAG.getConstant(0, DL, GRLenVT)); } SDValue LoongArchTargetLowering::lowerPREFETCH(SDValue Op, @@ -1119,6 +1124,10 @@ SDValue LoongArchTargetLowering::lowerBITREVERSE(SDValue Op, SDValue Src = Op->getOperand(0); SDLoc DL(Op); + // LoongArchISD::BITREV_8B is not supported on LA32. + if (!Subtarget.is64Bit() && (ResTy == MVT::v16i8 || ResTy == MVT::v32i8)) + return SDValue(); + EVT NewVT = ResTy.is128BitVector() ? MVT::v2i64 : MVT::v4i64; unsigned int OrigEltNum = ResTy.getVectorNumElements(); unsigned int NewEltNum = NewVT.getVectorNumElements(); @@ -1128,7 +1137,7 @@ SDValue LoongArchTargetLowering::lowerBITREVERSE(SDValue Op, SmallVector Ops; for (unsigned int i = 0; i < NewEltNum; i++) { SDValue Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, NewSrc, - DAG.getConstant(i, DL, MVT::i64)); + DAG.getConstant(i, DL, Subtarget.getGRLenVT())); unsigned RevOp = (ResTy == MVT::v16i8 || ResTy == MVT::v32i8) ? (unsigned)LoongArchISD::BITREV_8B : (unsigned)ISD::BITREVERSE; @@ -1596,7 +1605,7 @@ static SDValue lowerVECTOR_SHUFFLEAsZeroOrAnyExtend(const SDLoc &DL, /// value is necessary in order to fit the above form. static SDValue lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef Mask, MVT VT, - SDValue V1, SDValue V2, SelectionDAG &DAG, + SDValue V1, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget) { int SplatIndex = -1; for (const auto &M : Mask) { @@ -1611,9 +1620,8 @@ lowerVECTOR_SHUFFLE_VREPLVEI(const SDLoc &DL, ArrayRef Mask, MVT VT, assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index"); if (fitsRegularPattern(Mask.begin(), 1, Mask.end(), SplatIndex, 0)) { - APInt Imm(64, SplatIndex); return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1, - DAG.getConstant(Imm, DL, Subtarget.getGRLenVT())); + DAG.getConstant(SplatIndex, DL, Subtarget.getGRLenVT())); } return SDValue(); @@ -1671,7 +1679,7 @@ lowerVECTOR_SHUFFLE_VSHUF4I(const SDLoc &DL, ArrayRef Mask, MVT VT, } // Calculate the immediate. Replace any remaining undefs with zero - APInt Imm(64, 0); + int Imm = 0; for (int i = SubVecSize - 1; i >= 0; --i) { int M = SubMask[i]; @@ -1946,11 +1954,12 @@ static SDValue lowerVECTOR_SHUFFLE_VPICKOD(const SDLoc &DL, ArrayRef Mask, /// adding it as an operand to the resulting VSHUF. static SDValue lowerVECTOR_SHUFFLE_VSHUF(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue V1, SDValue V2, - SelectionDAG &DAG) { + SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { SmallVector Ops; for (auto M : Mask) - Ops.push_back(DAG.getConstant(M, DL, MVT::i64)); + Ops.push_back(DAG.getSignedConstant(M, DL, Subtarget.getGRLenVT())); EVT MaskVecTy = VT.changeVectorElementTypeToInteger(); SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, Ops); @@ -1989,8 +1998,8 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, SDValue Result; // TODO: Add more comparison patterns. 
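// Rough dispatch order in lower128BitShuffle: single-input masks (V2 undef)
// try the splat (VREPLVEI) and immediate-controlled VSHUF4I forms first;
// two-input masks go through the pack/interleave/pick families and the
// shift/byte-rotate lowerings; the generic VSHUF, which has to materialize
// the mask as a constant vector, stays last, presumably because it is the
// most expensive form.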
if (V2.isUndef()) { - if ((Result = lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, V2, DAG, - Subtarget))) + if ((Result = + lowerVECTOR_SHUFFLE_VREPLVEI(DL, Mask, VT, V1, DAG, Subtarget))) return Result; if ((Result = lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget))) @@ -2030,7 +2039,8 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, return Result; if (SDValue NewShuffle = widenShuffleMask(DL, Mask, VT, V1, V2, DAG)) return NewShuffle; - if ((Result = lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG))) + if ((Result = + lowerVECTOR_SHUFFLE_VSHUF(DL, Mask, VT, V1, V2, DAG, Subtarget))) return Result; return SDValue(); } @@ -2045,7 +2055,7 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, /// value is necessary in order to fit the above form. static SDValue lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, ArrayRef Mask, MVT VT, - SDValue V1, SDValue V2, SelectionDAG &DAG, + SDValue V1, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget) { int SplatIndex = -1; for (const auto &M : Mask) { @@ -2060,7 +2070,10 @@ lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, ArrayRef Mask, MVT VT, const auto &Begin = Mask.begin(); const auto &End = Mask.end(); - unsigned HalfSize = Mask.size() / 2; + int HalfSize = Mask.size() / 2; + + if (SplatIndex >= HalfSize) + return SDValue(); assert(SplatIndex < (int)Mask.size() && "Out of bounds mask index"); if (fitsRegularPattern(Begin, 1, End - HalfSize, SplatIndex, 0) && @@ -2085,10 +2098,30 @@ lowerVECTOR_SHUFFLE_XVSHUF4I(const SDLoc &DL, ArrayRef Mask, MVT VT, return lowerVECTOR_SHUFFLE_VSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget); } +/// Lower VECTOR_SHUFFLE into XVPERMI (if possible). +static SDValue +lowerVECTOR_SHUFFLE_XVPERMI(const SDLoc &DL, ArrayRef Mask, MVT VT, + SDValue V1, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { + // Only consider XVPERMI_D. + if (Mask.size() != 4 || (VT != MVT::v4i64 && VT != MVT::v4f64)) + return SDValue(); + + unsigned MaskImm = 0; + for (unsigned i = 0; i < Mask.size(); ++i) { + if (Mask[i] == -1) + continue; + MaskImm |= Mask[i] << (i * 2); + } + + return DAG.getNode(LoongArchISD::XVPERMI, DL, VT, V1, + DAG.getConstant(MaskImm, DL, Subtarget.getGRLenVT())); +} + /// Lower VECTOR_SHUFFLE into XVPERM (if possible). static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef Mask, - MVT VT, SDValue V1, SDValue V2, - SelectionDAG &DAG) { + MVT VT, SDValue V1, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { // LoongArch LASX only have XVPERM_W. if (Mask.size() != 8 || (VT != MVT::v8i32 && VT != MVT::v8f32)) return SDValue(); @@ -2119,9 +2152,10 @@ static SDValue lowerVECTOR_SHUFFLE_XVPERM(const SDLoc &DL, ArrayRef Mask, return SDValue(); SmallVector Masks; + MVT GRLenVT = Subtarget.getGRLenVT(); for (unsigned i = 0; i < NumElts; ++i) - Masks.push_back(Mask[i] == -1 ? DAG.getUNDEF(MVT::i64) - : DAG.getConstant(Mask[i], DL, MVT::i64)); + Masks.push_back(Mask[i] == -1 ? DAG.getUNDEF(GRLenVT) + : DAG.getConstant(Mask[i], DL, GRLenVT)); SDValue MaskVec = DAG.getBuildVector(MVT::v8i32, DL, Masks); return DAG.getNode(LoongArchISD::XVPERM, DL, VT, V1, MaskVec); @@ -2353,8 +2387,10 @@ static SDValue lowerVECTOR_SHUFFLE_XVSHUF(const SDLoc &DL, ArrayRef Mask, /// The first case is the closest to LoongArch instructions and the other /// cases need to be converted to it for processing. 
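/// (Each half of the mask is classified by which 128-bit lane of the inputs
/// it reads from. The one arrangement that already matches LASX's per-lane
/// behaviour is left untouched; arrangements that still read a single lane
/// per half are rewritten by permuting the source lanes and remapping the
/// mask indices; genuinely cross-lane masks are rejected so the caller falls
/// back to the generic lowerings.)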
/// -/// This function may modify V1, V2 and Mask -static void canonicalizeShuffleVectorByLane( +/// This function will return true for the last three cases above and will +/// modify V1, V2 and Mask. Otherwise, return false for the first case and +/// cross-lane shuffle cases. +static bool canonicalizeShuffleVectorByLane( const SDLoc &DL, MutableArrayRef Mask, MVT VT, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const LoongArchSubtarget &Subtarget) { @@ -2378,15 +2414,15 @@ static void canonicalizeShuffleVectorByLane( preMask = LowLaneTy; if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) { - return M < 0 || (M >= 0 && M < HalfSize) || - (M >= MaskSize && M < MaskSize + HalfSize); + return M < 0 || (M >= HalfSize && M < MaskSize) || + (M >= MaskSize + HalfSize && M < MaskSize * 2); })) - postMask = HighLaneTy; + postMask = LowLaneTy; else if (std::all_of(Mask.begin() + HalfSize, Mask.end(), [&](int M) { - return M < 0 || (M >= HalfSize && M < MaskSize) || - (M >= MaskSize + HalfSize && M < MaskSize * 2); + return M < 0 || (M >= 0 && M < HalfSize) || + (M >= MaskSize && M < MaskSize + HalfSize); })) - postMask = LowLaneTy; + postMask = HighLaneTy; // The pre-half of mask is high lane type, and the post-half of mask // is low lane type, which is closest to the LoongArch instructions. @@ -2395,7 +2431,7 @@ static void canonicalizeShuffleVectorByLane( // to the lower 128-bit of vector register, and the low lane of mask // corresponds the higher 128-bit of vector register. if (preMask == HighLaneTy && postMask == LowLaneTy) { - return; + return false; } if (preMask == LowLaneTy && postMask == HighLaneTy) { V1 = DAG.getBitcast(MVT::v4i64, V1); @@ -2449,8 +2485,10 @@ static void canonicalizeShuffleVectorByLane( *it = *it < 0 ? *it : *it + HalfSize; } } else { // cross-lane - return; + return false; } + + return true; } /// Lower VECTOR_SHUFFLE as lane permute and then shuffle (if possible). @@ -2516,27 +2554,23 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, assert(Mask.size() % 2 == 0 && "Expected even mask size."); assert(Mask.size() >= 4 && "Mask size is less than 4."); - // canonicalize non cross-lane shuffle vector - SmallVector NewMask(Mask); - canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG, Subtarget); - APInt KnownUndef, KnownZero; - computeZeroableShuffleElements(NewMask, V1, V2, KnownUndef, KnownZero); + computeZeroableShuffleElements(Mask, V1, V2, KnownUndef, KnownZero); APInt Zeroable = KnownUndef | KnownZero; SDValue Result; // TODO: Add more comparison patterns. 
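// With this restructuring the 256-bit path first tries the lowerings that
// can cope with the original mask: XVREPLVEI, XVSHUF4I, the new XVPERMI_D
// form, XVPERM, and the pack/interleave/pick/shift/byte-rotate families.
// Only when none of these match is the mask canonicalized by lane; if
// canonicalizeShuffleVectorByLane reports that it rewrote the mask, the
// routine simply recurses on the new mask, otherwise it resorts to the
// lane-permute, widened-mask and XVSHUF fallbacks.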
if (V2.isUndef()) { - if ((Result = lowerVECTOR_SHUFFLE_XVREPLVEI(DL, NewMask, VT, V1, V2, DAG, - Subtarget))) + if ((Result = + lowerVECTOR_SHUFFLE_XVREPLVEI(DL, Mask, VT, V1, DAG, Subtarget))) return Result; - if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, NewMask, VT, V1, V2, DAG, + if ((Result = lowerVECTOR_SHUFFLE_XVSHUF4I(DL, Mask, VT, V1, V2, DAG, Subtarget))) return Result; - if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, NewMask, VT, V1, V2, DAG))) + if ((Result = + lowerVECTOR_SHUFFLE_XVPERMI(DL, Mask, VT, V1, DAG, Subtarget))) return Result; - if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT, - V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLE_XVPERM(DL, Mask, VT, V1, DAG, Subtarget))) return Result; // TODO: This comment may be enabled in the future to better match the @@ -2546,24 +2580,39 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef Mask, MVT VT, // It is recommended not to change the pattern comparison order for better // performance. - if ((Result = lowerVECTOR_SHUFFLE_XVPACKEV(DL, NewMask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLE_XVPACKEV(DL, Mask, VT, V1, V2, DAG))) return Result; - if ((Result = lowerVECTOR_SHUFFLE_XVPACKOD(DL, NewMask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLE_XVPACKOD(DL, Mask, VT, V1, V2, DAG))) return Result; - if ((Result = lowerVECTOR_SHUFFLE_XVILVH(DL, NewMask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLE_XVILVH(DL, Mask, VT, V1, V2, DAG))) return Result; - if ((Result = lowerVECTOR_SHUFFLE_XVILVL(DL, NewMask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLE_XVILVL(DL, Mask, VT, V1, V2, DAG))) return Result; - if ((Result = lowerVECTOR_SHUFFLE_XVPICKEV(DL, NewMask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLE_XVPICKEV(DL, Mask, VT, V1, V2, DAG))) return Result; - if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, NewMask, VT, V1, V2, DAG))) + if ((Result = lowerVECTOR_SHUFFLE_XVPICKOD(DL, Mask, VT, V1, V2, DAG))) return Result; - if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, NewMask, VT, V1, V2, DAG, - Subtarget, Zeroable))) + if ((Result = lowerVECTOR_SHUFFLEAsShift(DL, Mask, VT, V1, V2, DAG, Subtarget, + Zeroable))) return Result; - if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, NewMask, VT, V1, V2, DAG, + if ((Result = lowerVECTOR_SHUFFLEAsByteRotate(DL, Mask, VT, V1, V2, DAG, Subtarget))) return Result; + + // canonicalize non cross-lane shuffle vector + SmallVector NewMask(Mask); + if (canonicalizeShuffleVectorByLane(DL, NewMask, VT, V1, V2, DAG, Subtarget)) + return lower256BitShuffle(DL, NewMask, VT, V1, V2, DAG, Subtarget); + + // FIXME: Handling the remaining cases earlier can degrade performance + // in some situations. Further analysis is required to enable more + // effective optimizations. + if (V2.isUndef()) { + if ((Result = lowerVECTOR_SHUFFLEAsLanePermuteAndShuffle(DL, NewMask, VT, + V1, V2, DAG))) + return Result; + } + if (SDValue NewShuffle = widenShuffleMask(DL, NewMask, VT, V1, V2, DAG)) return NewShuffle; if ((Result = lowerVECTOR_SHUFFLE_XVSHUF(DL, NewMask, VT, V1, V2, DAG))) @@ -2804,9 +2853,10 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op, if (SplatBitSize == 64 && !Subtarget.is64Bit()) { // We can only handle 64-bit elements that are within - // the signed 10-bit range on 32-bit targets. + // the signed 10-bit range or match vldi patterns on 32-bit targets. // See the BUILD_VECTOR case in LoongArchDAGToDAGISel::Select(). 
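// isImmVLDILegalForMode1 (added near the end of this file) recognizes,
// roughly, splats with a single significant byte (optionally padded with
// 0xFF bytes below it), the floating-point-shaped patterns, and 64-bit
// splats whose bytes are each 0x00 or 0xFF. For example, a v2i64 splat of
// 0x000000FF00000000 is representable as [x]vldi with immediate 0x1910.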
- if (!SplatValue.isSignedIntN(10)) + if (!SplatValue.isSignedIntN(10) && + !isImmVLDILegalForMode1(SplatValue, SplatBitSize).first) return SDValue(); if ((Is128Vec && ResTy == MVT::v4i32) || (Is256Vec && ResTy == MVT::v8i32)) @@ -3102,12 +3152,33 @@ LoongArchTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, return SDValue(); SDValue SplatElt = DAG.getSplatBuildVector(VT, DL, Op1); - SDValue SplatIdx = DAG.getSplatBuildVector(IdxVTy, DL, Op2); - SmallVector RawIndices; - for (unsigned i = 0; i < NumElts; ++i) - RawIndices.push_back(DAG.getConstant(i, DL, Subtarget.getGRLenVT())); - SDValue Indices = DAG.getBuildVector(IdxVTy, DL, RawIndices); + SDValue SplatIdx; + SDValue Indices; + + if (!Subtarget.is64Bit() && IdxTy == MVT::i64) { + MVT PairVTy = MVT::getVectorVT(MVT::i32, NumElts * 2); + for (unsigned i = 0; i < NumElts; ++i) { + RawIndices.push_back(Op2); + RawIndices.push_back(DAG.getConstant(0, DL, MVT::i32)); + } + SplatIdx = DAG.getBuildVector(PairVTy, DL, RawIndices); + SplatIdx = DAG.getBitcast(IdxVTy, SplatIdx); + + RawIndices.clear(); + for (unsigned i = 0; i < NumElts; ++i) { + RawIndices.push_back(DAG.getConstant(i, DL, MVT::i32)); + RawIndices.push_back(DAG.getConstant(0, DL, MVT::i32)); + } + Indices = DAG.getBuildVector(PairVTy, DL, RawIndices); + Indices = DAG.getBitcast(IdxVTy, Indices); + } else { + SplatIdx = DAG.getSplatBuildVector(IdxVTy, DL, Op2); + + for (unsigned i = 0; i < NumElts; ++i) + RawIndices.push_back(DAG.getConstant(i, DL, Subtarget.getGRLenVT())); + Indices = DAG.getBuildVector(IdxVTy, DL, RawIndices); + } // insert vec, elt, idx // => @@ -5129,7 +5200,7 @@ performSETCC_BITCASTCombine(SDNode *N, SelectionDAG &DAG, if (Opc == ISD::DELETED_NODE) return SDValue(); - SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src.getOperand(0)); + SDValue V = DAG.getNode(Opc, DL, Subtarget.getGRLenVT(), Src.getOperand(0)); EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements()); V = DAG.getZExtOrTrunc(V, DL, T); return DAG.getBitcast(VT, V); @@ -5142,6 +5213,7 @@ static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); SDValue Src = N->getOperand(0); EVT SrcVT = Src.getValueType(); + MVT GRLenVT = Subtarget.getGRLenVT(); if (!DCI.isBeforeLegalizeOps()) return SDValue(); @@ -5209,11 +5281,11 @@ static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG, if (Src.getSimpleValueType() == MVT::v32i8) { SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVector(Src, DL); - Lo = DAG.getNode(LoongArchISD::VMSKLTZ, DL, MVT::i64, Lo); - Hi = DAG.getNode(LoongArchISD::VMSKLTZ, DL, MVT::i64, Hi); - Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, + Lo = DAG.getNode(LoongArchISD::VMSKLTZ, DL, GRLenVT, Lo); + Hi = DAG.getNode(LoongArchISD::VMSKLTZ, DL, GRLenVT, Hi); + Hi = DAG.getNode(ISD::SHL, DL, GRLenVT, Hi, DAG.getConstant(16, DL, MVT::i8)); - V = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi); + V = DAG.getNode(ISD::OR, DL, GRLenVT, Lo, Hi); } else if (UseLASX) { return SDValue(); } @@ -5221,7 +5293,7 @@ static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG, if (!V) { Opc = UseLASX ? 
LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ; - V = DAG.getNode(Opc, DL, MVT::i64, Src); + V = DAG.getNode(Opc, DL, GRLenVT, Src); } EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements()); @@ -5878,6 +5950,22 @@ static SDValue lowerVectorBitRevImm(SDNode *Node, SelectionDAG &DAG) { return DAG.getNode(ISD::XOR, DL, ResTy, Node->getOperand(1), BitImm); } +template +static SDValue lowerVectorPickVE2GR(SDNode *N, SelectionDAG &DAG, + unsigned ResOp) { + unsigned Imm = N->getConstantOperandVal(2); + if (!isUInt(Imm)) { + const StringRef ErrorMsg = "argument out of range"; + DAG.getContext()->emitError(N->getOperationName(0) + ": " + ErrorMsg + "."); + return DAG.getUNDEF(N->getValueType(0)); + } + SDLoc DL(N); + SDValue Vec = N->getOperand(1); + SDValue Idx = DAG.getConstant(Imm, DL, MVT::i32); + SDValue EltVT = DAG.getValueType(Vec.getValueType().getVectorElementType()); + return DAG.getNode(ResOp, DL, N->getValueType(0), Vec, Idx, EltVT); +} + static SDValue performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -6367,6 +6455,68 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, N->getOperand(1), DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getGRLenVT(), N->getOperand(2))); + case Intrinsic::loongarch_lsx_vpickve2gr_b: + if (!Subtarget.is64Bit()) + return lowerVectorPickVE2GR<4>(N, DAG, LoongArchISD::VPICK_SEXT_ELT); + break; + case Intrinsic::loongarch_lsx_vpickve2gr_h: + case Intrinsic::loongarch_lasx_xvpickve2gr_w: + if (!Subtarget.is64Bit()) + return lowerVectorPickVE2GR<3>(N, DAG, LoongArchISD::VPICK_SEXT_ELT); + break; + case Intrinsic::loongarch_lsx_vpickve2gr_w: + if (!Subtarget.is64Bit()) + return lowerVectorPickVE2GR<2>(N, DAG, LoongArchISD::VPICK_SEXT_ELT); + break; + case Intrinsic::loongarch_lsx_vpickve2gr_bu: + if (!Subtarget.is64Bit()) + return lowerVectorPickVE2GR<4>(N, DAG, LoongArchISD::VPICK_ZEXT_ELT); + break; + case Intrinsic::loongarch_lsx_vpickve2gr_hu: + case Intrinsic::loongarch_lasx_xvpickve2gr_wu: + if (!Subtarget.is64Bit()) + return lowerVectorPickVE2GR<3>(N, DAG, LoongArchISD::VPICK_ZEXT_ELT); + break; + case Intrinsic::loongarch_lsx_vpickve2gr_wu: + if (!Subtarget.is64Bit()) + return lowerVectorPickVE2GR<2>(N, DAG, LoongArchISD::VPICK_ZEXT_ELT); + break; + case Intrinsic::loongarch_lsx_bz_b: + case Intrinsic::loongarch_lsx_bz_h: + case Intrinsic::loongarch_lsx_bz_w: + case Intrinsic::loongarch_lsx_bz_d: + case Intrinsic::loongarch_lasx_xbz_b: + case Intrinsic::loongarch_lasx_xbz_h: + case Intrinsic::loongarch_lasx_xbz_w: + case Intrinsic::loongarch_lasx_xbz_d: + if (!Subtarget.is64Bit()) + return DAG.getNode(LoongArchISD::VALL_ZERO, DL, N->getValueType(0), + N->getOperand(1)); + break; + case Intrinsic::loongarch_lsx_bz_v: + case Intrinsic::loongarch_lasx_xbz_v: + if (!Subtarget.is64Bit()) + return DAG.getNode(LoongArchISD::VANY_ZERO, DL, N->getValueType(0), + N->getOperand(1)); + break; + case Intrinsic::loongarch_lsx_bnz_b: + case Intrinsic::loongarch_lsx_bnz_h: + case Intrinsic::loongarch_lsx_bnz_w: + case Intrinsic::loongarch_lsx_bnz_d: + case Intrinsic::loongarch_lasx_xbnz_b: + case Intrinsic::loongarch_lasx_xbnz_h: + case Intrinsic::loongarch_lasx_xbnz_w: + case Intrinsic::loongarch_lasx_xbnz_d: + if (!Subtarget.is64Bit()) + return DAG.getNode(LoongArchISD::VALL_NONZERO, DL, N->getValueType(0), + N->getOperand(1)); + break; + case Intrinsic::loongarch_lsx_bnz_v: + case Intrinsic::loongarch_lasx_xbnz_v: + if (!Subtarget.is64Bit()) + return 
DAG.getNode(LoongArchISD::VANY_NONZERO, DL, N->getValueType(0), + N->getOperand(1)); + break; } return SDValue(); } @@ -8396,6 +8546,87 @@ SDValue LoongArchTargetLowering::LowerReturn( return DAG.getNode(LoongArchISD::RET, DL, MVT::Other, RetOps); } +// Check if a constant splat can be generated using [x]vldi, where imm[12] == 1. +// Note: The following prefixes are excluded: +// imm[11:8] == 4'b0000, 4'b0100, 4'b1000 +// as they can be represented using [x]vrepli.[whb] +std::pair LoongArchTargetLowering::isImmVLDILegalForMode1( + const APInt &SplatValue, const unsigned SplatBitSize) const { + uint64_t RequiredImm = 0; + uint64_t V = SplatValue.getZExtValue(); + if (SplatBitSize == 16 && !(V & 0x00FF)) { + // 4'b0101 + RequiredImm = (0b10101 << 8) | (V >> 8); + return {true, RequiredImm}; + } else if (SplatBitSize == 32) { + // 4'b0001 + if (!(V & 0xFFFF00FF)) { + RequiredImm = (0b10001 << 8) | (V >> 8); + return {true, RequiredImm}; + } + // 4'b0010 + if (!(V & 0xFF00FFFF)) { + RequiredImm = (0b10010 << 8) | (V >> 16); + return {true, RequiredImm}; + } + // 4'b0011 + if (!(V & 0x00FFFFFF)) { + RequiredImm = (0b10011 << 8) | (V >> 24); + return {true, RequiredImm}; + } + // 4'b0110 + if ((V & 0xFFFF00FF) == 0xFF) { + RequiredImm = (0b10110 << 8) | (V >> 8); + return {true, RequiredImm}; + } + // 4'b0111 + if ((V & 0xFF00FFFF) == 0xFFFF) { + RequiredImm = (0b10111 << 8) | (V >> 16); + return {true, RequiredImm}; + } + // 4'b1010 + if ((V & 0x7E07FFFF) == 0x3E000000 || (V & 0x7E07FFFF) == 0x40000000) { + RequiredImm = + (0b11010 << 8) | (((V >> 24) & 0xC0) ^ 0x40) | ((V >> 19) & 0x3F); + return {true, RequiredImm}; + } + } else if (SplatBitSize == 64) { + // 4'b1011 + if ((V & 0xFFFFFFFF7E07FFFFULL) == 0x3E000000ULL || + (V & 0xFFFFFFFF7E07FFFFULL) == 0x40000000ULL) { + RequiredImm = + (0b11011 << 8) | (((V >> 24) & 0xC0) ^ 0x40) | ((V >> 19) & 0x3F); + return {true, RequiredImm}; + } + // 4'b1100 + if ((V & 0x7FC0FFFFFFFFFFFFULL) == 0x4000000000000000ULL || + (V & 0x7FC0FFFFFFFFFFFFULL) == 0x3FC0000000000000ULL) { + RequiredImm = + (0b11100 << 8) | (((V >> 56) & 0xC0) ^ 0x40) | ((V >> 48) & 0x3F); + return {true, RequiredImm}; + } + // 4'b1001 + auto sameBitsPreByte = [](uint64_t x) -> std::pair { + uint8_t res = 0; + for (int i = 0; i < 8; ++i) { + uint8_t byte = x & 0xFF; + if (byte == 0 || byte == 0xFF) + res |= ((byte & 1) << i); + else + return {false, 0}; + x >>= 8; + } + return {true, res}; + }; + auto [IsSame, Suffix] = sameBitsPreByte(V); + if (IsSame) { + RequiredImm = (0b11001 << 8) | Suffix; + return {true, RequiredImm}; + } + } + return {false, RequiredImm}; +} + bool LoongArchTargetLowering::isFPImmVLDILegal(const APFloat &Imm, EVT VT) const { if (!Subtarget.hasExtLSX()) @@ -8460,8 +8691,12 @@ EVT LoongArchTargetLowering::getSetCCResultType(const DataLayout &DL, } bool LoongArchTargetLowering::hasAndNot(SDValue Y) const { - // TODO: Support vectors. 
- return Y.getValueType().isScalarInteger() && !isa(Y); + EVT VT = Y.getValueType(); + + if (VT.isVector()) + return Subtarget.hasExtLSX() && VT.isInteger(); + + return VT.isScalarInteger() && !isa(Y); } bool LoongArchTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, @@ -9305,3 +9540,39 @@ bool LoongArchTargetLowering::SimplifyDemandedBitsForTargetNode( return TargetLowering::SimplifyDemandedBitsForTargetNode( Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); } + +bool LoongArchTargetLowering::shouldScalarizeBinop(SDValue VecOp) const { + unsigned Opc = VecOp.getOpcode(); + + // Assume target opcodes can't be scalarized. + // TODO - do we have any exceptions? + if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc)) + return false; + + // If the vector op is not supported, try to convert to scalar. + EVT VecVT = VecOp.getValueType(); + if (!isOperationLegalOrCustomOrPromote(Opc, VecVT)) + return true; + + // If the vector op is supported, but the scalar op is not, the transform may + // not be worthwhile. + EVT ScalarVT = VecVT.getScalarType(); + return isOperationLegalOrCustomOrPromote(Opc, ScalarVT); +} + +bool LoongArchTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const { + if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) + return false; + + // Extract a 128-bit subvector from index 0 of a 256-bit vector is free. + return Index == 0; +} + +bool LoongArchTargetLowering::isExtractVecEltCheap(EVT VT, + unsigned Index) const { + EVT EltVT = VT.getScalarType(); + + // Extract a scalar FP value from index 0 of a vector is free. + return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0; +} diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 19c85faa9f9cc..9b60a9fd53726 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -337,6 +337,17 @@ class LoongArchTargetLowering : public TargetLowering { TargetLoweringOpt &TLO, unsigned Depth) const override; + bool shouldScalarizeBinop(SDValue VecOp) const override; + bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const override; + bool isExtractVecEltCheap(EVT VT, unsigned Index) const override; + + /// Check if a constant splat can be generated using [x]vldi, where imm[12] + /// is 1. + std::pair + isImmVLDILegalForMode1(const APInt &SplatValue, + const unsigned SplatBitSize) const; + private: /// Target-specific function used to lower LoongArch calling conventions. typedef bool LoongArchCCAssignFn(const DataLayout &DL, LoongArchABI::ABI ABI, diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 20ccc622f58dc..9565a55e4c6c5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -943,7 +943,7 @@ let Predicates = [IsLA64] in { def ADD_D : ALU_3R<0x00108000>; def SUB_D : ALU_3R<0x00118000>; // ADDI_D isn't always rematerializable, but isReMaterializable will be used as -// a hint which is verified in isReallyTriviallyReMaterializable. +// a hint which is verified in isReMaterializableImpl. // See LoongArchInstrInfo::isAsCheapAsAMove for more details. 
let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def ADDI_D : ALU_2RI12<0x02c00000, simm12_addlike>; diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index adfe990ba1234..bbc0489620193 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -2015,10 +2015,26 @@ def : Pat<(v4i32(fp_to_uint v4f64:$vj)), (XVFTINTRZ_LU_D v4f64:$vj)), sub_128)>; +// abs +def : Pat<(abs v32i8:$xj), (XVMAX_B v32i8:$xj, (XVNEG_B v32i8:$xj))>; +def : Pat<(abs v16i16:$xj), (XVMAX_H v16i16:$xj, (XVNEG_H v16i16:$xj))>; +def : Pat<(abs v8i32:$xj), (XVMAX_W v8i32:$xj, (XVNEG_W v8i32:$xj))>; +def : Pat<(abs v4i64:$xj), (XVMAX_D v4i64:$xj, (XVNEG_D v4i64:$xj))>; + // XVABSD_{B/H/W/D}[U] defm : PatXrXr; defm : PatXrXrU; +// XVADDA_{B/H/W/D} +def : Pat<(add (v32i8 (abs v32i8:$xj)), (v32i8 (abs v32i8:$xk))), + (XVADDA_B v32i8:$xj, v32i8:$xk)>; +def : Pat<(add (v16i16 (abs v16i16:$xj)), (v16i16 (abs v16i16:$xk))), + (XVADDA_H v16i16:$xj, v16i16:$xk)>; +def : Pat<(add (v8i32 (abs v8i32:$xj)), (v8i32 (abs v8i32:$xk))), + (XVADDA_W v8i32:$xj, v8i32:$xk)>; +def : Pat<(add (v4i64 (abs v4i64:$xj)), (v4i64 (abs v4i64:$xk))), + (XVADDA_D v4i64:$xj, v4i64:$xk)>; + // XVSADD_{B/H/W/D}[U], XVSSUB_{B/H/W/D}[U] defm : PatXrXr; defm : PatXrXr; diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index d99a57e562528..8d1dc99e316c9 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -26,7 +26,7 @@ def SDT_LoongArchV1RUimm: SDTypeProfile<1, 2, [SDTCisVec<0>, def SDT_LoongArchV2RUimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, - SDTCisVT<3, i64>]>; + SDTCisVT<3, GRLenVT>]>; def SDT_LoongArchVreplgr2vr : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<0>, SDTCisInt<1>]>; def SDT_LoongArchVFRECIPE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>]>; def SDT_LoongArchVFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>]>; @@ -158,6 +158,7 @@ def vsplatf32_fpimm_eq_1 N = N->getOperand(0).getNode(); return selectVSplat(N, Imm, EltTy.getSizeInBits()) && + Imm.getBitWidth() == 32 && Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == APFloat(+1.0f).bitcastToAPInt(); }]>; @@ -1482,7 +1483,7 @@ multiclass VldreplPat { } multiclass VstelmPat { + Operand ImmOpnd, Operand IdxOpnd, ValueType elt = GRLenVT> { def : Pat<(StoreOp(elt(vector_extract vt:$vd, IdxOpnd:$idx)), BaseAddr:$rj), (Inst vt:$vd, BaseAddr:$rj, 0, IdxOpnd:$idx)>; @@ -2110,8 +2111,8 @@ def : Pat<(GRLenVT (vector_extract v4i32:$vj, GRLenVT:$rk)), (COPY_TO_REGCLASS (f32 (EXTRACT_SUBREG (VREPLVE_W v4i32:$vj, GRLenVT:$rk), sub_32)), GPR)>; -def : Pat<(i64 (vector_extract v2i64:$vj, i64:$rk)), - (COPY_TO_REGCLASS (f64 (EXTRACT_SUBREG (VREPLVE_D v2i64:$vj, i64:$rk), +def : Pat<(GRLenVT (vector_extract v2i64:$vj, GRLenVT:$rk)), + (COPY_TO_REGCLASS (f64 (EXTRACT_SUBREG (VREPLVE_D v2i64:$vj, GRLenVT:$rk), sub_64)), GPR)>; def : Pat<(f32 (vector_extract v4f32:$vj, GRLenVT:$rk)), @@ -2153,10 +2154,26 @@ def : Pat<(f32 f32imm_vldi:$in), def : Pat<(f64 f64imm_vldi:$in), (f64 (EXTRACT_SUBREG (VLDI (to_f64imm_vldi f64imm_vldi:$in)), sub_64))>; +// abs +def : Pat<(abs v16i8:$vj), (VMAX_B v16i8:$vj, (VNEG_B v16i8:$vj))>; +def : Pat<(abs v8i16:$vj), (VMAX_H v8i16:$vj, (VNEG_H v8i16:$vj))>; +def : Pat<(abs v4i32:$vj), (VMAX_W v4i32:$vj, (VNEG_W v4i32:$vj))>; 
+def : Pat<(abs v2i64:$vj), (VMAX_D v2i64:$vj, (VNEG_D v2i64:$vj))>; + // VABSD_{B/H/W/D}[U] defm : PatVrVr; defm : PatVrVrU; +// VADDA_{B/H/W/D} +def : Pat<(add (v16i8 (abs v16i8:$vj)), (v16i8 (abs v16i8:$vk))), + (VADDA_B v16i8:$vj, v16i8:$vk)>; +def : Pat<(add (v8i16 (abs v8i16:$vj)), (v8i16 (abs v8i16:$vk))), + (VADDA_H v8i16:$vj, v8i16:$vk)>; +def : Pat<(add (v4i32 (abs v4i32:$vj)), (v4i32 (abs v4i32:$vk))), + (VADDA_W v4i32:$vj, v4i32:$vk)>; +def : Pat<(add (v2i64 (abs v2i64:$vj)), (v2i64 (abs v2i64:$vk))), + (VADDA_D v2i64:$vj, v2i64:$vk)>; + // VSADD_{B/H/W/D}[U], VSSUB_{B/H/W/D}[U] defm : PatVrVr; defm : PatVrVr; diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index d0a8ababe8e58..c5e26c106b5df 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -57,6 +57,11 @@ static cl::opt cl::desc("Enable the loop data prefetch pass"), cl::init(false)); +static cl::opt + EnableMergeBaseOffset("loongarch-enable-merge-offset", + cl::desc("Enable the merge base offset pass"), + cl::init(true), cl::Hidden); + static Reloc::Model getEffectiveRelocModel(const Triple &TT, std::optional RM) { return RM.value_or(Reloc::Static); @@ -214,7 +219,7 @@ void LoongArchPassConfig::addMachineSSAOptimization() { void LoongArchPassConfig::addPreRegAlloc() { addPass(createLoongArchPreRAExpandPseudoPass()); - if (TM->getOptLevel() != CodeGenOptLevel::None) + if (TM->getOptLevel() != CodeGenOptLevel::None && EnableMergeBaseOffset) addPass(createLoongArchMergeBaseOffsetOptPass()); } diff --git a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp index 3e9666f586e0f..e37f3a66fe11f 100644 --- a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp +++ b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp @@ -39,10 +39,10 @@ class M68kAsmParser : public MCTargetAsmParser { #include "M68kGenAsmMatcher.inc" // Helpers for Match&Emit. 
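// SMLoc is a single-pointer value type, so the helper signatures below now
// take it by value rather than by const reference, in line with the usual
// LLVM convention for cheap-to-copy types.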
- bool invalidOperand(const SMLoc &Loc, const OperandVector &Operands, + bool invalidOperand(SMLoc Loc, const OperandVector &Operands, const uint64_t &ErrorInfo); - bool missingFeature(const SMLoc &Loc, const uint64_t &ErrorInfo); - bool emit(MCInst &Inst, SMLoc const &Loc, MCStreamer &Out) const; + bool missingFeature(SMLoc Loc, const uint64_t &ErrorInfo); + bool emit(MCInst &Inst, SMLoc Loc, MCStreamer &Out) const; bool parseRegisterName(MCRegister &RegNo, SMLoc Loc, StringRef RegisterName); ParseStatus parseRegister(MCRegister &RegNo); @@ -991,8 +991,7 @@ bool M68kAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, return false; } -bool M68kAsmParser::invalidOperand(SMLoc const &Loc, - OperandVector const &Operands, +bool M68kAsmParser::invalidOperand(SMLoc Loc, OperandVector const &Operands, uint64_t const &ErrorInfo) { SMLoc ErrorLoc = Loc; char const *Diag = 0; @@ -1015,13 +1014,11 @@ bool M68kAsmParser::invalidOperand(SMLoc const &Loc, return Error(ErrorLoc, Diag); } -bool M68kAsmParser::missingFeature(llvm::SMLoc const &Loc, - uint64_t const &ErrorInfo) { +bool M68kAsmParser::missingFeature(SMLoc Loc, uint64_t const &ErrorInfo) { return Error(Loc, "instruction requires a CPU feature not currently enabled"); } -bool M68kAsmParser::emit(MCInst &Inst, SMLoc const &Loc, - MCStreamer &Out) const { +bool M68kAsmParser::emit(MCInst &Inst, SMLoc Loc, MCStreamer &Out) const { Inst.setLoc(Loc); Out.emitInstruction(Inst, *STI); diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index a8369f2b28fb7..bbfd0872cc4cd 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -28,7 +28,6 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Casting.h" #include "llvm/Support/EndianStream.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include #include diff --git a/llvm/lib/Target/Mips/MipsCallingConv.td b/llvm/lib/Target/Mips/MipsCallingConv.td index 3501f9fbfd2e7..748162525b091 100644 --- a/llvm/lib/Target/Mips/MipsCallingConv.td +++ b/llvm/lib/Target/Mips/MipsCallingConv.td @@ -186,7 +186,8 @@ def RetCC_MipsN : CallingConv<[ // // f128 should only occur for the N64 ABI where long double is 128-bit. On // N32, long double is equivalent to double. - CCIfType<[i64], CCIfOrigArgWasF128>>, + CCIfSubtargetNot<"isSingleFloat()", + CCIfType<[i64], CCIfOrigArgWasF128>>>, // Aggregate returns are positioned at the lowest address in the slot for // both little and big-endian targets. When passing in registers, this @@ -316,9 +317,10 @@ def CC_Mips_FixedArg : CallingConv<[ // // f128 should only occur for the N64 ABI where long double is 128-bit. On // N32, long double is equivalent to double. - CCIfType<[i64], - CCIfSubtargetNot<"useSoftFloat()", - CCIfOrigArgWasF128>>>, + CCIfType<[i64], + CCIfSubtargetNot<"isSingleFloat()", + CCIfSubtargetNot<"useSoftFloat()", + CCIfOrigArgWasF128>>>>, CCIfCC<"CallingConv::Fast", CCDelegateTo>, @@ -342,8 +344,8 @@ def CC_Mips : CallingConv<[ // Callee-saved register lists. 
//===----------------------------------------------------------------------===// -def CSR_SingleFloatOnly : CalleeSavedRegs<(add (sequence "F%u", 31, 20), RA, FP, - (sequence "S%u", 7, 0))>; +def CSR_O32_SingleFloat : CalleeSavedRegs<(add(sequence "F%u", 31, 20), RA, FP, + (sequence "S%u", 7, 0))>; def CSR_O32_FPXX : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP, (sequence "S%u", 7, 0))> { @@ -357,13 +359,19 @@ def CSR_O32_FP64 : CalleeSavedRegs<(add (decimate (sequence "D%u_64", 30, 20), 2), RA, FP, (sequence "S%u", 7, 0))>; -def CSR_N32 : CalleeSavedRegs<(add D20_64, D22_64, D24_64, D26_64, D28_64, - D30_64, RA_64, FP_64, GP_64, - (sequence "S%u_64", 7, 0))>; +def CSR_N32 : CalleeSavedRegs<(add(decimate(sequence "D%u_64", 30, 20), 2), + RA_64, FP_64, GP_64, (sequence "S%u_64", 7, 0))>; + +def CSR_N32_SingleFloat + : CalleeSavedRegs<(add(decimate(sequence "F%u", 30, 20), 2), RA_64, FP_64, + GP_64, (sequence "S%u_64", 7, 0))>; def CSR_N64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 24), RA_64, FP_64, GP_64, (sequence "S%u_64", 7, 0))>; +def CSR_N64_SingleFloat : CalleeSavedRegs<(add(sequence "F%u", 31, 24), RA_64, + FP_64, GP_64, (sequence "S%u_64", 7, 0))>; + def CSR_Mips16RetHelper : CalleeSavedRegs<(add V0, V1, FP, (sequence "A%u", 3, 0), (sequence "S%u", 7, 0), diff --git a/llvm/lib/Target/Mips/MipsExpandPseudo.cpp b/llvm/lib/Target/Mips/MipsExpandPseudo.cpp index 34ff41f6e02da..78f2e5db40f9d 100644 --- a/llvm/lib/Target/Mips/MipsExpandPseudo.cpp +++ b/llvm/lib/Target/Mips/MipsExpandPseudo.cpp @@ -432,13 +432,24 @@ bool MipsExpandPseudo::expandAtomicBinOpSubword( Register OldVal = I->getOperand(6).getReg(); Register BinOpRes = I->getOperand(7).getReg(); Register StoreVal = I->getOperand(8).getReg(); + bool NoMovnInstr = (IsMin || IsMax) && !STI->hasMips4() && !STI->hasMips32(); const BasicBlock *LLVM_BB = BB.getBasicBlock(); MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *loop1MBB = nullptr; + MachineBasicBlock *loop2MBB = nullptr; + if (NoMovnInstr) { + loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB); + loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); + } MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator It = ++BB.getIterator(); MF->insert(It, loopMBB); + if (NoMovnInstr) { + MF->insert(It, loop1MBB); + MF->insert(It, loop2MBB); + } MF->insert(It, sinkMBB); MF->insert(It, exitMBB); @@ -446,9 +457,19 @@ bool MipsExpandPseudo::expandAtomicBinOpSubword( exitMBB->transferSuccessorsAndUpdatePHIs(&BB); BB.addSuccessor(loopMBB, BranchProbability::getOne()); - loopMBB->addSuccessor(sinkMBB); - loopMBB->addSuccessor(loopMBB); - loopMBB->normalizeSuccProbs(); + if (NoMovnInstr) { + loopMBB->addSuccessor(loop1MBB); + loopMBB->addSuccessor(loop2MBB); + } else { + loopMBB->addSuccessor(sinkMBB); + loopMBB->addSuccessor(loopMBB); + loopMBB->normalizeSuccProbs(); + } + if (NoMovnInstr) { + loop1MBB->addSuccessor(loop2MBB); + loop2MBB->addSuccessor(loopMBB); + loop2MBB->addSuccessor(sinkMBB); + } BuildMI(loopMBB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0); if (IsNand) { @@ -525,7 +546,7 @@ bool MipsExpandPseudo::expandAtomicBinOpSubword( BuildMI(loopMBB, DL, TII->get(OR), BinOpRes) .addReg(BinOpRes) .addReg(Scratch4); - } else { + } else if (STI->hasMips4() || STI->hasMips32()) { // max: move BinOpRes, StoreVal // movn BinOpRes, Incr, Scratch4, BinOpRes // min: move BinOpRes, StoreVal @@ -537,12 +558,59 @@ bool 
MipsExpandPseudo::expandAtomicBinOpSubword( .addReg(Incr) .addReg(Scratch4) .addReg(BinOpRes); + } else { + // if min: + // loopMBB: move BinOpRes, StoreVal + // beq Scratch4, 0, loop1MBB + // j loop2MBB + // loop1MBB: move BinOpRes, Incr + // loop2MBB: and BinOpRes, BinOpRes, Mask + // and StoreVal, OlddVal, Mask2 + // or StoreVal, StoreVal, BinOpRes + // StoreVal = sc StoreVal, 0(Ptr) + // beq StoreVal, zero, loopMBB + // + // if max: + // loopMBB: move BinOpRes, Incr + // beq Scratch4, 0, loop1MBB + // j loop2MBB + // loop1MBB: move BinOpRes, StoreVal + // loop2MBB: and BinOpRes, BinOpRes, Mask + // and StoreVal, OlddVal, Mask2 + // or StoreVal, StoreVal, BinOpRes + // StoreVal = sc StoreVal, 0(Ptr) + // beq StoreVal, zero, loopMBB + if (IsMin) { + BuildMI(loopMBB, DL, TII->get(OR), BinOpRes) + .addReg(StoreVal) + .addReg(Mips::ZERO); + BuildMI(loop1MBB, DL, TII->get(OR), BinOpRes) + .addReg(Incr) + .addReg(Mips::ZERO); + } else { + BuildMI(loopMBB, DL, TII->get(OR), BinOpRes) + .addReg(Incr) + .addReg(Mips::ZERO); + BuildMI(loop1MBB, DL, TII->get(OR), BinOpRes) + .addReg(StoreVal) + .addReg(Mips::ZERO); + } + BuildMI(loopMBB, DL, TII->get(BEQ)) + .addReg(Scratch4) + .addReg(Mips::ZERO) + .addMBB(loop1MBB); + BuildMI(loopMBB, DL, TII->get(Mips::J)).addMBB(loop2MBB); } // and BinOpRes, BinOpRes, Mask - BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes) - .addReg(BinOpRes) - .addReg(Mask); + if (NoMovnInstr) + BuildMI(loop2MBB, DL, TII->get(Mips::AND), BinOpRes) + .addReg(BinOpRes) + .addReg(Mask); + else + BuildMI(loopMBB, DL, TII->get(Mips::AND), BinOpRes) + .addReg(BinOpRes) + .addReg(Mask); } else if (!IsSwap) { // binopres, oldval, incr2 @@ -564,14 +632,37 @@ bool MipsExpandPseudo::expandAtomicBinOpSubword( // or StoreVal, StoreVal, BinOpRes // StoreVal = sc StoreVal, 0(Ptr) // beq StoreVal, zero, loopMBB - BuildMI(loopMBB, DL, TII->get(Mips::AND), StoreVal) - .addReg(OldVal).addReg(Mask2); - BuildMI(loopMBB, DL, TII->get(Mips::OR), StoreVal) - .addReg(StoreVal).addReg(BinOpRes); - BuildMI(loopMBB, DL, TII->get(SC), StoreVal) - .addReg(StoreVal).addReg(Ptr).addImm(0); - BuildMI(loopMBB, DL, TII->get(BEQ)) - .addReg(StoreVal).addReg(Mips::ZERO).addMBB(loopMBB); + if (NoMovnInstr) { + BuildMI(loop2MBB, DL, TII->get(Mips::AND), StoreVal) + .addReg(OldVal) + .addReg(Mask2); + BuildMI(loop2MBB, DL, TII->get(Mips::OR), StoreVal) + .addReg(StoreVal) + .addReg(BinOpRes); + BuildMI(loop2MBB, DL, TII->get(SC), StoreVal) + .addReg(StoreVal) + .addReg(Ptr) + .addImm(0); + BuildMI(loop2MBB, DL, TII->get(BEQ)) + .addReg(StoreVal) + .addReg(Mips::ZERO) + .addMBB(loopMBB); + } else { + BuildMI(loopMBB, DL, TII->get(Mips::AND), StoreVal) + .addReg(OldVal) + .addReg(Mask2); + BuildMI(loopMBB, DL, TII->get(Mips::OR), StoreVal) + .addReg(StoreVal) + .addReg(BinOpRes); + BuildMI(loopMBB, DL, TII->get(SC), StoreVal) + .addReg(StoreVal) + .addReg(Ptr) + .addImm(0); + BuildMI(loopMBB, DL, TII->get(BEQ)) + .addReg(StoreVal) + .addReg(Mips::ZERO) + .addMBB(loopMBB); + } // sinkMBB: // and maskedoldval1,oldval,mask @@ -600,6 +691,11 @@ bool MipsExpandPseudo::expandAtomicBinOpSubword( LivePhysRegs LiveRegs; computeAndAddLiveIns(LiveRegs, *loopMBB); + if (loop1MBB) { + assert(loop2MBB && "should have 2 loop blocks"); + computeAndAddLiveIns(LiveRegs, *loop1MBB); + computeAndAddLiveIns(LiveRegs, *loop2MBB); + } computeAndAddLiveIns(LiveRegs, *sinkMBB); computeAndAddLiveIns(LiveRegs, *exitMBB); @@ -746,20 +842,41 @@ bool MipsExpandPseudo::expandAtomicBinOp(MachineBasicBlock &BB, llvm_unreachable("Unknown pseudo 
atomic!"); } + bool NoMovnInstr = (IsMin || IsMax) && !STI->hasMips4() && !STI->hasMips32(); const BasicBlock *LLVM_BB = BB.getBasicBlock(); MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *loop1MBB = nullptr; + MachineBasicBlock *loop2MBB = nullptr; + if (NoMovnInstr) { + loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB); + loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB); + } MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineFunction::iterator It = ++BB.getIterator(); MF->insert(It, loopMBB); + if (NoMovnInstr) { + MF->insert(It, loop1MBB); + MF->insert(It, loop2MBB); + } MF->insert(It, exitMBB); exitMBB->splice(exitMBB->begin(), &BB, std::next(I), BB.end()); exitMBB->transferSuccessorsAndUpdatePHIs(&BB); BB.addSuccessor(loopMBB, BranchProbability::getOne()); - loopMBB->addSuccessor(exitMBB); - loopMBB->addSuccessor(loopMBB); + if (NoMovnInstr) { + loopMBB->addSuccessor(loop1MBB); + loopMBB->addSuccessor(loop2MBB); + } else { + loopMBB->addSuccessor(exitMBB); + loopMBB->addSuccessor(loopMBB); + } loopMBB->normalizeSuccProbs(); + if (NoMovnInstr) { + loop1MBB->addSuccessor(loop2MBB); + loop2MBB->addSuccessor(loopMBB); + loop2MBB->addSuccessor(exitMBB); + } BuildMI(loopMBB, DL, TII->get(LL), OldVal).addReg(Ptr).addImm(0); assert((OldVal != Ptr) && "Clobbered the wrong ptr reg!"); @@ -802,7 +919,7 @@ bool MipsExpandPseudo::expandAtomicBinOp(MachineBasicBlock &BB, BuildMI(loopMBB, DL, TII->get(OR), Scratch) .addReg(Scratch) .addReg(Scratch2); - } else { + } else if (STI->hasMips4() || STI->hasMips32()) { // max: move Scratch, OldVal // movn Scratch, Incr, Scratch2, Scratch // min: move Scratch, OldVal @@ -814,6 +931,38 @@ bool MipsExpandPseudo::expandAtomicBinOp(MachineBasicBlock &BB, .addReg(Incr) .addReg(Scratch2) .addReg(Scratch); + } else { + // if min: + // loopMBB: move Scratch, OldVal + // beq Scratch2_32, 0, loop1MBB + // j loop2MBB + // loop1MBB: move Scratch, Incr + // loop2MBB: sc $2, 0($4) + // beqz $2, $BB0_1 + // nop + // + // if max: + // loopMBB: move Scratch, Incr + // beq Scratch2_32, 0, loop1MBB + // j loop2MBB + // loop1MBB: move Scratch, OldVal + // loop2MBB: sc $2, 0($4) + // beqz $2, $BB0_1 + // nop + if (IsMin) { + BuildMI(loopMBB, DL, TII->get(OR), Scratch).addReg(OldVal).addReg(ZERO); + BuildMI(loop1MBB, DL, TII->get(OR), Scratch).addReg(Incr).addReg(ZERO); + } else { + BuildMI(loopMBB, DL, TII->get(OR), Scratch).addReg(Incr).addReg(ZERO); + BuildMI(loop1MBB, DL, TII->get(OR), Scratch) + .addReg(OldVal) + .addReg(ZERO); + } + BuildMI(loopMBB, DL, TII->get(BEQ)) + .addReg(Scratch2_32) + .addReg(ZERO) + .addMBB(loop1MBB); + BuildMI(loopMBB, DL, TII->get(Mips::J)).addMBB(loop2MBB); } } else if (Opcode) { @@ -829,20 +978,36 @@ bool MipsExpandPseudo::expandAtomicBinOp(MachineBasicBlock &BB, BuildMI(loopMBB, DL, TII->get(OR), Scratch).addReg(Incr).addReg(ZERO); } - BuildMI(loopMBB, DL, TII->get(SC), Scratch) - .addReg(Scratch) - .addReg(Ptr) - .addImm(0); - BuildMI(loopMBB, DL, TII->get(BEQ)) - .addReg(Scratch) - .addReg(ZERO) - .addMBB(loopMBB); + if (NoMovnInstr) { + BuildMI(loop2MBB, DL, TII->get(SC), Scratch) + .addReg(Scratch) + .addReg(Ptr) + .addImm(0); + BuildMI(loop2MBB, DL, TII->get(BEQ)) + .addReg(Scratch) + .addReg(ZERO) + .addMBB(loopMBB); + } else { + BuildMI(loopMBB, DL, TII->get(SC), Scratch) + .addReg(Scratch) + .addReg(Ptr) + .addImm(0); + BuildMI(loopMBB, DL, TII->get(BEQ)) + .addReg(Scratch) + .addReg(ZERO) + .addMBB(loopMBB); + } NMBBI = BB.end(); I->eraseFromParent(); LivePhysRegs LiveRegs; 
computeAndAddLiveIns(LiveRegs, *loopMBB); + if (loop1MBB) { + assert(loop2MBB && "should have 2 loop blocks"); + computeAndAddLiveIns(LiveRegs, *loop1MBB); + computeAndAddLiveIns(LiveRegs, *loop2MBB); + } computeAndAddLiveIns(LiveRegs, *exitMBB); return true; diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index 1491300e37d3e..b05de49d8332a 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -4265,10 +4265,16 @@ parseRegForInlineAsmConstraint(StringRef C, MVT VT) const { return std::make_pair(0U, nullptr); if (Prefix == "$f") { // Parse $f0-$f31. - // If the size of FP registers is 64-bit or Reg is an even number, select - // the 64-bit register class. Otherwise, select the 32-bit register class. - if (VT == MVT::Other) - VT = (Subtarget.isFP64bit() || !(Reg % 2)) ? MVT::f64 : MVT::f32; + // If the targets is single float only, always select 32-bit registers, + // otherwise if the size of FP registers is 64-bit or Reg is an even number, + // select the 64-bit register class. Otherwise, select the 32-bit register + // class. + if (VT == MVT::Other) { + if (Subtarget.isSingleFloat()) + VT = MVT::f32; + else + VT = (Subtarget.isFP64bit() || !(Reg % 2)) ? MVT::f64 : MVT::f32; + } RC = getRegClassFor(VT); @@ -4308,10 +4314,12 @@ MipsTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &Mips::CPU16RegsRegClass); return std::make_pair(0U, &Mips::GPR32RegClass); } - if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat())) && + if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat()) || + (VT == MVT::f64 && Subtarget.isSingleFloat())) && !Subtarget.isGP64bit()) return std::make_pair(0U, &Mips::GPR32RegClass); - if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat())) && + if ((VT == MVT::i64 || (VT == MVT::f64 && Subtarget.useSoftFloat()) || + (VT == MVT::f64 && Subtarget.isSingleFloat())) && Subtarget.isGP64bit()) return std::make_pair(0U, &Mips::GPR64RegClass); // This will generate an error message diff --git a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp index 6f8d6764e77b8..6ca587b1ba4d5 100644 --- a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp @@ -89,14 +89,25 @@ MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { : CSR_Interrupt_32_SaveList; } - if (Subtarget.isSingleFloat()) - return CSR_SingleFloatOnly_SaveList; + // N64 ABI + if (Subtarget.isABI_N64()) { + if (Subtarget.isSingleFloat()) + return CSR_N64_SingleFloat_SaveList; - if (Subtarget.isABI_N64()) return CSR_N64_SaveList; + } + + // N32 ABI + if (Subtarget.isABI_N32()) { + if (Subtarget.isSingleFloat()) + return CSR_N32_SingleFloat_SaveList; - if (Subtarget.isABI_N32()) return CSR_N32_SaveList; + } + + // O32 ABI + if (Subtarget.isSingleFloat()) + return CSR_O32_SingleFloat_SaveList; if (Subtarget.isFP64bit()) return CSR_O32_FP64_SaveList; @@ -111,14 +122,25 @@ const uint32_t * MipsRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const { const MipsSubtarget &Subtarget = MF.getSubtarget(); - if (Subtarget.isSingleFloat()) - return CSR_SingleFloatOnly_RegMask; + // N64 ABI + if (Subtarget.isABI_N64()) { + if (Subtarget.isSingleFloat()) + return CSR_N64_SingleFloat_RegMask; - if (Subtarget.isABI_N64()) return CSR_N64_RegMask; + } + + // N32 ABI + if (Subtarget.isABI_N32()) { + if (Subtarget.isSingleFloat()) + return 
CSR_N32_SingleFloat_RegMask; - if (Subtarget.isABI_N32()) return CSR_N32_RegMask; + } + + // O32 ABI + if (Subtarget.isSingleFloat()) + return CSR_O32_SingleFloat_RegMask; if (Subtarget.isFP64bit()) return CSR_O32_FP64_RegMask; diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp index 71a70d9c2dd46..19917f3650bb5 100644 --- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp @@ -28,6 +28,7 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/CodeGenTypes/MachineValueType.h" @@ -211,6 +212,16 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM, } } + // Targets with 64bits integer registers, but no 64bit floating point register + // do not support conversion between them + if (Subtarget.isGP64bit() && Subtarget.isSingleFloat() && + !Subtarget.useSoftFloat()) { + setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::i64, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand); + } + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom); setOperationAction(ISD::MULHS, MVT::i32, Custom); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index c70f48af33cf2..bef4868492d4e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -38,6 +38,13 @@ static cl::opt EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden, cl::desc("Enable reciprocal sqrt optimization")); +// FIXME: This is a WAR to recover lost performance from #155024. +// We still need to investigate the regression and find a more permanent +// solution. +static cl::opt EnableMADWide("nvptx-mad-wide-opt", cl::init(false), + cl::Hidden, + cl::desc("Enable MAD wide optimization")); + /// createNVPTXISelDag - This pass converts a legalized DAG into a /// NVPTX-specific DAG, ready for instruction scheduling. FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM, @@ -84,6 +91,8 @@ bool NVPTXDAGToDAGISel::allowFMA() const { bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; } +bool NVPTXDAGToDAGISel::doMADWideOpt() const { return EnableMADWide; } + /// Select - Select instructions not customized! Used for /// expanded, promoted and normal instructions. 
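A note on the EnableMADWide flag added above: it is declared cl::init(false) and cl::Hidden, so the mad.wide patterns it guards (the MAD_WIDE_* definitions added later in NVPTXInstrInfo.td under the doMADWideOpt predicate) stay disabled by default; they are only selected when the flag is passed explicitly, e.g. -nvptx-mad-wide-opt to llc, or -mllvm -nvptx-mad-wide-opt when driving the backend through clang.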
void NVPTXDAGToDAGISel::Select(SDNode *N) { @@ -1018,6 +1027,7 @@ pickOpcodeForVT(MVT::SimpleValueType VT, std::optional Opcode_i16, case MVT::f32: return Opcode_i32; case MVT::v2f32: + case MVT::v2i32: case MVT::i64: case MVT::f64: return Opcode_i64; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 8dcd5362c4512..c912e709d0aa0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -45,6 +45,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { bool useF32FTZ() const; bool allowFMA() const; bool doRsqrtOpt() const; + bool doMADWideOpt() const; NVPTXScopes Scopes{}; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index ca8a3f69f991d..3ac7c2874408b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -226,21 +226,20 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, switch (VectorVT.SimpleTy) { default: return std::nullopt; + case MVT::v4i64: case MVT::v4f64: - case MVT::v8i32: - // This is a "native" vector type iff the address space is global - // and the target supports 256-bit loads/stores + // This is a "native" vector type iff the address space is global and the + // target supports 256-bit loads/stores if (!CanLowerTo256Bit) return std::nullopt; LLVM_FALLTHROUGH; case MVT::v2i8: - case MVT::v2i32: case MVT::v2i64: case MVT::v2f64: - case MVT::v4i32: // This is a "native" vector type return std::pair(NumElts, EltVT); + case MVT::v16f16: // <8 x f16x2> case MVT::v16bf16: // <8 x bf16x2> case MVT::v16i16: // <8 x i16x2> @@ -264,12 +263,18 @@ getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, case MVT::v16i8: // <4 x i8x4> PackRegSize = 32; break; + case MVT::v8f32: // <4 x f32x2> + case MVT::v8i32: // <4 x i32x2> + // This is a "native" vector type iff the address space is global and the + // target supports 256-bit loads/stores if (!CanLowerTo256Bit) return std::nullopt; LLVM_FALLTHROUGH; case MVT::v2f32: // <1 x f32x2> case MVT::v4f32: // <2 x f32x2> + case MVT::v2i32: // <1 x i32x2> + case MVT::v4i32: // <2 x i32x2> if (!STI.hasF32x2Instructions()) return std::pair(NumElts, EltVT); PackRegSize = 64; @@ -590,8 +595,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, addRegisterClass(MVT::bf16, &NVPTX::B16RegClass); addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass); - if (STI.hasF32x2Instructions()) + if (STI.hasF32x2Instructions()) { addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass); + addRegisterClass(MVT::v2i32, &NVPTX::B64RegClass); + } // Conversion to/from FP16/FP16x2 is always legal. setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom); @@ -628,12 +635,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom); - // No support for these operations with v2f32. - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Expand); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Expand); + // No support for these operations with v2f32/v2i32 + setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand); + setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand); // Need custom lowering in case the index is dynamic. 
if (STI.hasF32x2Instructions()) - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, + Custom); // Custom conversions to/from v2i8. setOperationAction(ISD::BITCAST, MVT::v2i8, Custom); @@ -661,14 +669,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // Operations not directly supported by NVPTX. for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32, MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, - MVT::v4i8, MVT::i32, MVT::i64}) { + MVT::v4i8, MVT::i32, MVT::v2i32, MVT::i64}) { setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::BR_CC, VT, Expand); } - // Not directly supported. TLI would attempt to expand operations like - // FMINIMUM(v2f32) using invalid SETCC and VSELECT nodes. - setOperationAction(ISD::VSELECT, MVT::v2f32, Expand); + // We don't want ops like FMINIMUM or UMAX to be lowered to SETCC+VSELECT. + setOperationAction(ISD::VSELECT, {MVT::v2f32, MVT::v2i32}, Expand); // Some SIGN_EXTEND_INREG can be done using cvt instruction. // For others we will expand to a SHL/SRA pair. @@ -815,7 +822,14 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS, ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::SETCC}, - MVT::v2i16, Expand); + {MVT::v2i16, MVT::v2i32}, Expand); + + // v2i32 is not supported for any arithmetic operations + setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, + ISD::CTPOP, ISD::CTLZ, ISD::ADD, ISD::SUB, ISD::MUL, + ISD::SHL, ISD::SRA, ISD::SRL, ISD::OR, ISD::AND, ISD::XOR, + ISD::SREM, ISD::UREM}, + MVT::v2i32, Expand); setOperationAction(ISD::ADDC, MVT::i32, Legal); setOperationAction(ISD::ADDE, MVT::i32, Legal); @@ -829,7 +843,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, } setOperationAction(ISD::CTTZ, MVT::i16, Expand); - setOperationAction(ISD::CTTZ, MVT::v2i16, Expand); + setOperationAction(ISD::CTTZ, {MVT::v2i16, MVT::v2i32}, Expand); setOperationAction(ISD::CTTZ, MVT::i32, Expand); setOperationAction(ISD::CTTZ, MVT::i64, Expand); @@ -1071,7 +1085,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // Custom lowering for tcgen05.st vector operands setOperationAction(ISD::INTRINSIC_VOID, {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, - MVT::v32i32, MVT::v64i32, MVT::v128i32}, + MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::Other}, Custom); // Enable custom lowering for the following: @@ -1134,6 +1148,34 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X) MAKE_CASE(NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y) MAKE_CASE(NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2) + 
MAKE_CASE(NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT) + MAKE_CASE( + NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT) + MAKE_CASE( + NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1) + MAKE_CASE(NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2) + MAKE_CASE( + NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT) + MAKE_CASE( + NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT) } return nullptr; @@ -2576,7 +2618,7 @@ static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG) { return V; } -static SDValue LowerTcgen05St(SDValue Op, SelectionDAG &DAG) { +static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG) { SDNode *N = Op.getNode(); SDLoc DL(N); SmallVector Ops; @@ -2602,7 +2644,141 @@ static SDValue LowerTcgen05St(SDValue Op, SelectionDAG &DAG) { return Tcgen05StNode; } -static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) { +static unsigned getTcgen05MMADisableOutputLane(unsigned IID) { + switch (IID) { + case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1; + case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2: + return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1; + case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2: + return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1; + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2: + return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1; + case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2: + return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift: + return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT; + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift: + return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT; + case Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift: + return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT; + case Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift: + return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT; + case 
Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1; + case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2: + return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1; + case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2: + return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1; + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2: + return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift: + return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT; + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift: + return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT; + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1: + return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1; + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2: + return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2; + case Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: + return NVPTXISD:: + TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT; + case Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: + return NVPTXISD:: + TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT; + }; + llvm_unreachable("unhandled tcgen05.mma.disable_output_lane intrinsic"); +} + +static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG) { + SDNode *N = Op.getNode(); + SDLoc DL(N); + unsigned IID = cast(N->getOperand(1))->getZExtValue(); + + SmallVector Ops; + // split the vector argument + for (size_t I = 0; I < N->getNumOperands(); I++) { + if (I == 1) + continue; // skip IID + SDValue Val = N->getOperand(I); + EVT ValVT = Val.getValueType(); + if (ValVT.isVector()) { + EVT EltVT = ValVT.getVectorElementType(); + for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++) + Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, + DAG.getIntPtrConstant(J, DL))); + } else + Ops.push_back(Val); + } + + MemIntrinsicSDNode *MemSD = cast(N); + SDValue Tcgen05MMANode = DAG.getMemIntrinsicNode( + getTcgen05MMADisableOutputLane(IID), DL, N->getVTList(), Ops, + MemSD->getMemoryVT(), MemSD->getMemOperand()); + + return Tcgen05MMANode; +} + +// Lower vector return type of tcgen05.ld intrinsics +static std::optional> +lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset = false) { + SDLoc DL(N); + EVT ResVT = N->getValueType(0); + if (!ResVT.isVector()) + return {}; // already legalized. 
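  // The rest of this helper re-issues the intrinsic with one i32 result per
  // vector element (plus the chain) and then rebuilds the original vector type
  // with BUILD_VECTOR. Its two callers consume the pair differently: the legal
  // v2i32 cases come in through lowerIntrinsicWChain() and are returned via
  // getMergeValues(), while the wider, illegal result types are pushed into
  // Results from ReplaceINTRINSIC_W_CHAIN().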
+ + const unsigned NumElts = ResVT.getVectorNumElements(); + + // Create the return type of the instructions + SmallVector ListVTs; + for (unsigned i = 0; i < NumElts; ++i) + ListVTs.push_back(MVT::i32); + + ListVTs.push_back(N->getValueType(1)); // Chain + + SDVTList ResVTs = DAG.getVTList(ListVTs); + + SmallVector Ops{N->getOperand(0), N->getOperand(1), + N->getOperand(2)}; + + if (HasOffset) { + Ops.push_back(N->getOperand(3)); // offset + Ops.push_back(N->getOperand(4)); // Pack flag + } else + Ops.push_back(N->getOperand(3)); // Pack flag + + MemIntrinsicSDNode *MemSD = cast(N); + SDValue NewNode = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, ResVTs, Ops, + MemSD->getMemoryVT(), MemSD->getMemOperand()); + + // split the vector result + SmallVector ScalarRes; + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Res = NewNode.getValue(i); + ScalarRes.push_back(Res); + } + + SDValue Chain = NewNode.getValue(NumElts); + SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes); + return {{BuildVector, Chain}}; +} + +static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) { SDNode *N = Op.getNode(); SDValue Intrin = N->getOperand(1); @@ -2648,7 +2824,36 @@ static SDValue LowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) { case Intrinsic::nvvm_tcgen05_st_16x64b_x64: case Intrinsic::nvvm_tcgen05_st_32x32b_x64: case Intrinsic::nvvm_tcgen05_st_32x32b_x128: - return LowerTcgen05St(Op, DAG); + return lowerTcgen05St(Op, DAG); + case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift: + case Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift: + case Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift: + case Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: + case Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: + return LowerTcgen05MMADisableOutputLane(Op, DAG); } return Op; } @@ -2721,6 +2926,28 @@ static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) { SDValue Selector = (Op->op_end() - 1)->get(); return getPRMT(A, B, Selector, DL, DAG, Mode); } + +static 
SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG) { + switch (Op->getConstantOperandVal(1)) { + default: + return Op; + + // These tcgen05 intrinsics return a v2i32, which is legal, so we have to + // lower them through LowerOperation() instead of ReplaceNodeResults(). + case Intrinsic::nvvm_tcgen05_ld_16x64b_x2: + case Intrinsic::nvvm_tcgen05_ld_16x128b_x1: + case Intrinsic::nvvm_tcgen05_ld_32x32b_x2: + if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG)) + return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op)); + return SDValue(); + + case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: + if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG, /*HasOffset=*/true)) + return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op)); + return SDValue(); + } +} + static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) { switch (Op->getConstantOperandVal(0)) { default: @@ -2883,11 +3110,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG); case ISD::INTRINSIC_W_CHAIN: - return Op; + return lowerIntrinsicWChain(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return lowerIntrinsicWOChain(Op, DAG); case ISD::INTRINSIC_VOID: - return LowerIntrinsicVoid(Op, DAG); + return lowerIntrinsicVoid(Op, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::BITCAST: @@ -4725,6 +4952,53 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.align.reset(); return true; } + case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift: + case Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift: + case Intrinsic:: + nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: { + // We are reading and writing back to TMem + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = MVT::v4i32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + Info.align = Align(16); + return true; + } + + case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2: + case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift: + case Intrinsic:: + nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift: + case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift: + case Intrinsic:: + 
nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: { + // We are reading and writing back to TMem + Info.opc = ISD::INTRINSIC_VOID; + Info.memVT = MVT::v8i32; + Info.ptrVal = I.getArgOperand(0); + Info.offset = 0; + Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; + Info.align = Align(16); + return true; + } } return false; } @@ -5727,7 +6001,7 @@ static SDValue PerformEXTRACTCombine(SDNode *N, IsPTXVectorType(VectorVT.getSimpleVT())) return SDValue(); // Native vector loads already combine nicely w/ // extract_vector_elt. - // Don't mess with singletons or packed types (v2f32, v2*16, v4i8 and v8i8), + // Don't mess with singletons or packed types (v2*32, v2*16, v4i8 and v8i8), // we already handle them OK. if (VectorVT.getVectorNumElements() == 1 || NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8) @@ -6107,53 +6381,6 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1})); } -// Lower vector return type of tcgen05.ld intrinsics -static void ReplaceTcgen05Ld(SDNode *N, SelectionDAG &DAG, - SmallVectorImpl &Results, - bool hasOffset = false) { - SDLoc DL(N); - EVT ResVT = N->getValueType(0); - if (!ResVT.isVector()) - return; // already legalized. - - const unsigned NumElts = ResVT.getVectorNumElements(); - - // Create the return type of the instructions - SmallVector ListVTs; - for (unsigned i = 0; i < NumElts; ++i) - ListVTs.push_back(MVT::i32); - - ListVTs.push_back(N->getValueType(1)); // Chain - - SDVTList ResVTs = DAG.getVTList(ListVTs); - - SmallVector Ops{N->getOperand(0), N->getOperand(1), - N->getOperand(2)}; - - if (hasOffset) { - Ops.push_back(N->getOperand(3)); // offset - Ops.push_back(N->getOperand(4)); // Pack flag - } else - Ops.push_back(N->getOperand(3)); // Pack flag - - MemIntrinsicSDNode *MemSD = cast(N); - SDValue NewNode = - DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, ResVTs, Ops, - MemSD->getMemoryVT(), MemSD->getMemOperand()); - - // split the vector result - SmallVector ScalarRes; - for (unsigned i = 0; i < NumElts; ++i) { - SDValue Res = NewNode.getValue(i); - ScalarRes.push_back(Res); - } - - SDValue Chain = NewNode.getValue(NumElts); - SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes); - Results.push_back(BuildVector); // Build Vector - Results.push_back(Chain); // Chain -} - static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl &Results) { SDValue Chain = N->getOperand(0); @@ -6262,21 +6489,18 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, return; } - case Intrinsic::nvvm_tcgen05_ld_16x64b_x2: case Intrinsic::nvvm_tcgen05_ld_16x64b_x4: case Intrinsic::nvvm_tcgen05_ld_16x64b_x8: case Intrinsic::nvvm_tcgen05_ld_16x64b_x16: case Intrinsic::nvvm_tcgen05_ld_16x64b_x32: case Intrinsic::nvvm_tcgen05_ld_16x64b_x64: case Intrinsic::nvvm_tcgen05_ld_16x64b_x128: - case Intrinsic::nvvm_tcgen05_ld_32x32b_x2: case Intrinsic::nvvm_tcgen05_ld_32x32b_x4: case Intrinsic::nvvm_tcgen05_ld_32x32b_x8: case Intrinsic::nvvm_tcgen05_ld_32x32b_x16: case Intrinsic::nvvm_tcgen05_ld_32x32b_x32: case Intrinsic::nvvm_tcgen05_ld_32x32b_x64: case Intrinsic::nvvm_tcgen05_ld_32x32b_x128: - case Intrinsic::nvvm_tcgen05_ld_16x128b_x1: case Intrinsic::nvvm_tcgen05_ld_16x128b_x2: case Intrinsic::nvvm_tcgen05_ld_16x128b_x4: case Intrinsic::nvvm_tcgen05_ld_16x128b_x8: @@ -6289,16 +6513,23 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, case Intrinsic::nvvm_tcgen05_ld_16x256b_x8: 
case Intrinsic::nvvm_tcgen05_ld_16x256b_x16: case Intrinsic::nvvm_tcgen05_ld_16x256b_x32: - return ReplaceTcgen05Ld(N, DAG, Results); + if (auto Res = lowerTcgen05Ld(N, DAG)) { + Results.push_back(Res->first); + Results.push_back(Res->second); + } + return; - case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2: case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4: case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8: case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16: case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32: case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64: case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128: - return ReplaceTcgen05Ld(N, DAG, Results, /* Offset */ true); + if (auto Res = lowerTcgen05Ld(N, DAG, /*HasOffset=*/true)) { + Results.push_back(Res->first); + Results.push_back(Res->second); + } + return; } } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 03b3edc902e54..769d2fe46f2c8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -99,7 +99,32 @@ enum NodeType : unsigned { StoreV2, StoreV4, StoreV8, - LAST_MEMORY_OPCODE = StoreV8, + TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT, + TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT, + TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT, + TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT, + TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT, + TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT, + TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1, + TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2, + TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT, + TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT, + LAST_MEMORY_OPCODE = + TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT, }; } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 4e38e026e6bda..4cacee2290763 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -114,11 +114,13 @@ def hasArchAccelFeatures : Predicate<"Subtarget->hasArchAccelFeatures()">; def doF32FTZ : Predicate<"useF32FTZ()">; def doNoF32FTZ : Predicate<"!useF32FTZ()">; def doRsqrtOpt : Predicate<"doRsqrtOpt()">; +def doMADWideOpt : Predicate<"doMADWideOpt()">; def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; +def hasTcgen05MMAScaleInputDImm : Predicate<"Subtarget->hasTcgen05MMAScaleInputDImm()">; def hasTMACTAGroupSupport : Predicate<"Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()">; def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">; @@ -754,8 +756,10 
@@ def : Pat<(vt (select i1:$p, vt:$a, vt:$b)), (SELP_b32rr $a, $b, $p)>; } -def : Pat<(v2f32 (select i1:$p, v2f32:$a, v2f32:$b)), +foreach vt = [v2f32, v2i32] in { +def : Pat<(vt (select i1:$p, vt:$a, vt:$b)), (SELP_b64rr $a, $b, $p)>; +} //----------------------------------- // Test Instructions @@ -899,8 +903,15 @@ let Predicates = [hasOptEnabled] in { defm MAD_LO_S32 : MADInst<"lo.s32", mul, I32RT, I32RT>; defm MAD_LO_S64 : MADInst<"lo.s64", mul, I64RT, I64RT>; - // Generating mad.wide causes a regression: + // Generating mad.wide causes a regression in some cases: // https://github.com/llvm/llvm-project/pull/150477#issuecomment-3191367837 + // Only do so when the user requests it. + let Predicates = [doMADWideOpt] in { + defm MAD_WIDE_U16 : MADInst<"wide.u16", umul_wide, I32RT, I16RT>; + defm MAD_WIDE_S16 : MADInst<"wide.s16", smul_wide, I32RT, I16RT>; + defm MAD_WIDE_U32 : MADInst<"wide.u32", umul_wide, I64RT, I32RT>; + defm MAD_WIDE_S32 : MADInst<"wide.s32", smul_wide, I64RT, I32RT>; + } } //----------------------------------- @@ -2092,8 +2103,8 @@ foreach vt = [v2f16, v2bf16, v2i16] in { (V2I16toI32 $a, $b)>; } -// Same thing for the 64-bit type v2f32. -foreach vt = [v2f32] in { +// Handle extracting one element from the pair (64-bit types) +foreach vt = [v2f32, v2i32] in { def : Pat<(extractelt vt:$src, 0), (I64toI32L_Sink $src)>, Requires<[hasPTX<71>]>; def : Pat<(extractelt vt:$src, 1), (I64toI32H_Sink $src)>, Requires<[hasPTX<71>]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index c544911bdf1e3..e91171c1ae38f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -5282,3 +5282,420 @@ foreach dim = ["x", "y", "z"] in { def CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_ # dim: CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID; } + +// +// tcgen05.mma Instructions +// + +class Tcgen05MMAInst PTXPredicates> : + NVPTXInst<(outs), (ins), "?", []>, + Requires { + + Intrinsic Intrin = !cast( + NVVM_TCGEN05_MMA.record + ); + + dag ScaleInpIns = !if(!eq(ScaleInputD, 1), (ins i64imm:$scale_input_d), (ins)); + string ScaleInpStr = !if(!eq(ScaleInputD, 1), ", $scale_input_d", ""); + dag ScaleInpInput = !if(!eq(ScaleInputD, 1), (Intrin i64:$scale_input_d), (Intrin)); + + dag SparseMetadataIns = !if(!eq(Sp, 1), (ins B32:$spmetadata), (ins)); + dag SparseMetadataIntr = !if(!eq(Sp, 1), (Intrin B32:$spmetadata), (Intrin)); + string SparseMetadataStr = !if(!eq(Sp, 1), ", [$spmetadata]", ""); + + int KindVal = !cond( + !eq(KindStr, "f16"): 0, + !eq(KindStr, "tf32"): 1, + !eq(KindStr, "f8f6f4"): 2, + !eq(KindStr, "i8"): 3, + ); + + int CollectorUsageVal = !cond( + !eq(CollectorUsage, "discard"): 0, + !eq(CollectorUsage, "lastuse"): 1, + !eq(CollectorUsage, "fill"): 2, + !eq(CollectorUsage, "use"): 3 + ); + + string AOperandStr = !if(!eq(ASpace, "tensor"), "[$a]", "$a"); + + NVPTXRegClass ARegClass = !if(!eq(ASpace, "tensor"), B32, B64); + + dag input = !con((ins B32:$dtmem, + ARegClass:$a, ADDR:$b, + B32:$idesc, + B1:$enable_inp_d), + SparseMetadataIns, + ScaleInpIns); + + let InOperandList = input; + let OutOperandList = (outs); + let AsmString = "tcgen05.mma" + # !if(!eq(Sp, 1), ".sp", "") + # ".cta_group::" # CtaGroup + # ".kind::" # KindStr + # ".collector::a::" # CollectorUsage + # !if(!eq(AShift, 1), ".ashift", "") + # " [$dtmem], " # AOperandStr # ", $b" + # SparseMetadataStr + # ", $idesc, $enable_inp_d" + # ScaleInpStr + # ";"; + + dag IntrinsicPattern = !con((Intrin i32:$dtmem, + 
ARegClass:$a, addr:$b, + i32:$idesc, + i1:$enable_inp_d), + SparseMetadataIntr, + ScaleInpInput); + + dag FlagOperands = (Intrin (i32 KindVal), (i32 CtaGroup), + (i32 CollectorUsageVal)); + + let Pattern = [!con(IntrinsicPattern, FlagOperands)]; +} + +// tcgen05.mma +foreach sp = [0, 1] in { + foreach space = ["tensor", "shared"] in { + foreach kind = ["f16", "tf32", "f8f6f4", "i8"] in { + foreach cta_group = [1, 2] in { + foreach collector_usage = ["discard", "lastuse", "fill", "use"] in { + foreach scale_input_d = !if(!or(!eq(kind, "f16"), + !eq(kind, "tf32")), [0, 1], [0]) in { + foreach ashift = !if(!eq(space, "tensor"), [0, 1], [0]) in { + + def : Tcgen05MMAInst; + } + } + } + } + } + } +} + +class Tcgen05MMADisableOutputLaneTypeProfile: + SDTypeProfile<0, 0, []> { + int DisableOutputLaneVecSize = !mul(4, CtaGroup); + + list VTs = !listconcat( + [i32], // d + !if(!eq(ASpace, "tensor"), [i32], [i64]), // a + [i64, i32, i1], // b, idesc, enable_inp_d + !if(!eq(Sp, 1), [i32], []), // spmetadata + !if(!eq(ScaleInputD, 1), [i64], []), // scale_input_d + !listsplat(i32, DisableOutputLaneVecSize), // disable_output_lane + [i32, i32] // kind, collector_usage + ); + let Constraints = !foreach(x, !range(!size(VTs)), SDTCisVT); + let NumOperands = !size(Constraints); +} + +class Tcgen05MMADisableOutputLaneSDNode: + SDNode<"NVPTXISD::TCGEN05_MMA" + # !if(!eq(Sp, 1), "_SP", "") + # "_" # !toupper(ASpace) + # !if(!eq(ScaleInput, 1), "_SCALE_D", "") + # "_DISABLE_OUTPUT_LANE_CG" # CtaGroup + # !if(!eq(AShift, 1), "_ASHIFT", ""), + Tcgen05MMADisableOutputLaneTypeProfile, + [SDNPHasChain, SDNPSideEffect]>; + +class Tcgen05MMADisableOutputLaneInst PTXPredicates> : + NVPTXInst<(outs), (ins), "?", []>, + Requires { + + SDNode Opcode = Tcgen05MMADisableOutputLaneSDNode; + + + dag ScaleInpIns = !if(!eq(ScaleInputD, 1), (ins i64imm:$scale_input_d), (ins)); + string ScaleInpStr = !if(!eq(ScaleInputD, 1), ", $scale_input_d", ""); + dag ScaleInpInput = !if(!eq(ScaleInputD, 1), (Opcode i64:$scale_input_d), (Opcode)); + + // disable output lane + int DisableOutputLaneVecSize = !mul(4, CtaGroup); + + dag DisableOutputLaneIns = !dag(ins, + !listsplat(B32, DisableOutputLaneVecSize), + !foreach(x, + !range(DisableOutputLaneVecSize), + "disable_output_lane" # x)); + + dag DisableOutputLaneInput = !dag(Opcode, + !listsplat(i32, DisableOutputLaneVecSize), + !foreach(x, + !range(DisableOutputLaneVecSize), + "disable_output_lane" # x)); + + string DisableOutputLaneStr = "{{" # + !interleave( + !foreach(x, + !range(DisableOutputLaneVecSize), + "$disable_output_lane" # x), + ", ") + # "}}"; + + dag SparseMetadataIns = !if(!eq(Sp, 1), (ins B32:$spmetadata), (ins)); + dag SparseMetadataIntr = !if(!eq(Sp, 1), (Opcode i32:$spmetadata), (Opcode)); + string SparseMetadataStr = !if(!eq(Sp, 1), ", [$spmetadata]", ""); + + int KindVal = !cond( + !eq(Kind, "f16"): 0, + !eq(Kind, "tf32"): 1, + !eq(Kind, "f8f6f4"): 2, + !eq(Kind, "i8"): 3, + ); + + int CollectorUsage = !cond( + !eq(CollectorUsageStr, "discard"): 0, + !eq(CollectorUsageStr, "lastuse"): 1, + !eq(CollectorUsageStr, "fill"): 2, + !eq(CollectorUsageStr, "use"): 3, + ); + + string AOperandStr = !if(!eq(ASpace, "tensor"), "[$a]", "$a"); + + NVPTXRegClass ARegClass = !if(!eq(ASpace, "tensor"), B32, B64); + + dag InOperandList = !con((ins B32:$dtmem, + ARegClass:$a, B64:$b, + B32:$idesc, + B1:$enable_inp_d), + SparseMetadataIns, + ScaleInpIns, + DisableOutputLaneIns); + + let OutOperandList = (outs); + let AsmString = "tcgen05.mma" + # !if(!eq(Sp, 1), ".sp", "") + # 
".cta_group::" # CtaGroup + # ".kind::" # Kind + # !if(!eq(AShift, 1), ".ashift", "") + # ".collector::a::" # CollectorUsageStr + # " " # "[$dtmem], " # AOperandStr # ", $b" + # SparseMetadataStr + # ", " # "$idesc" + # ", " # DisableOutputLaneStr + # ", $enable_inp_d" + # ScaleInpStr + # ";"; + + dag IntrinsicPattern = !con((Opcode i32:$dtmem, + ARegClass:$a, i64:$b, + i32:$idesc, + i1:$enable_inp_d), + SparseMetadataIntr, + ScaleInpInput, + DisableOutputLaneInput); + + dag FlagOperands = (Opcode (i32 KindVal), (i32 CollectorUsage)); + + let Pattern = [!con(IntrinsicPattern, FlagOperands)]; +} + +// tcgen05.mma.disable_output_lane +foreach sp = [0, 1] in { + foreach space = ["tensor", "shared"] in { + foreach kind = ["f16", "tf32", "f8f6f4", "i8"] in { + foreach cta_group = [1, 2] in { + foreach collector_usage = ["fill", "use", "lastuse", "discard"] in { + foreach scale_input_d = !if(!or(!eq(kind, "f16"), + !eq(kind, "tf32")), [0, 1], [0]) in { + foreach ashift = !if(!eq(space, "tensor"), [0, 1], [0]) in { + def : + Tcgen05MMADisableOutputLaneInst; + } + } + } + } + } + } +} + +class Tcgen05MMABlockScaleInst: + NVPTXInst<(outs), (ins), "?", []>, + Requires<[hasTcgen05Instructions, PTXPredicate]> { + + Intrinsic Intrin = !cast( + NVVM_TCGEN05_MMA_BLOCKSCALE.record); + + dag SparseMetadataIns = !if(!eq(Sp, 1), (ins B32:$spmetadata), (ins)); + dag SparseMetadataIntr = !if(!eq(Sp, 1), (Intrin i32:$spmetadata), (Intrin)); + string SparseMetadataStr = !if(!eq(Sp, 1), ", [$spmetadata]", ""); + + int KindVal = !cond( + !eq(KindStr, "mxf8f6f4") : 0, + !eq(KindStr, "mxf4") : 1, + !eq(KindStr, "mxf4nvf4") : 2, + ); + + int CollectorUsage = !cond( + !eq(CollectorUsageStr, "discard") : 0, + !eq(CollectorUsageStr, "lastuse") : 1, + !eq(CollectorUsageStr, "fill") : 2, + !eq(CollectorUsageStr, "use") : 3, + ); + + string AOperandStr = !if(!eq(ASpace, "tensor"), "[$a]", "$a"); + NVPTXRegClass ARegClass = !if(!eq(ASpace, "tensor"), B32, B64); + + dag input = !con((ins B32:$dtmem, ARegClass:$a, B64:$b, + B32:$idesc, B1:$enable_inp_d), + SparseMetadataIns, + (ins B32:$scale_a, + B32:$scale_b)); + + let InOperandList = input; + let OutOperandList = (outs); + let AsmString = "tcgen05.mma" + # !if(!eq(Sp, 1), ".sp", "") + # ".cta_group::" # CtaGroup + # ".kind::" # KindStr + # ".block_scale" # ScaleVecSize + # ".collector::a::" # CollectorUsageStr + # " [$dtmem], " # AOperandStr # ", $b" + # SparseMetadataStr + # ", $idesc, [$scale_a], [$scale_b], $enable_inp_d;"; + + dag IntrinsicPattern = !con((Intrin i32:$dtmem, + ARegClass:$a, i64:$b, + i32:$idesc, + i1:$enable_inp_d), + SparseMetadataIntr, + (Intrin i32:$scale_a, + i32:$scale_b)); + + dag FlagOperands = (Intrin (i32 CtaGroup), (i32 CollectorUsage)); + + let Pattern = [!con(IntrinsicPattern, FlagOperands)]; +} + +// tcgen05.mma.block_scale +foreach sp = [0, 1] in { + foreach space = ["tensor", "shared"] in { + foreach kind = ["mxf8f6f4", "mxf4", "mxf4nvf4"] in { + foreach scale_vec_size = ["", ".block16", ".block32"] in { + foreach cta_group = [1, 2] in { + foreach collector_usage = ["fill", "use", "lastuse", "discard"] in { + if NVVM_TCGEN05_MMA_BLOCKSCALE_SUPPORTED.ret then { + def : Tcgen05MMABlockScaleInst, hasPTX<86>)>; + } + } + } + } + } + } +} + +// +// tcgen05.mma.ws Instructions +// + +class Tcgen05MMAWSInst : + NVPTXInst<(outs), (ins), "?", []>, + Requires<[hasTcgen05Instructions]> { + + Intrinsic Intrin = !cast( + NVVM_TCGEN05_MMA_WS.record); + + dag ZeroColMaskIns = !if(!eq(HasZeroColMask, 1), + (ins B64:$zero_col_mask), (ins)); + string 
ZeroColMaskStr = !if(!eq(HasZeroColMask, 1), ", $zero_col_mask", ""); + dag ZeroColMaskIntr = !if(!eq(HasZeroColMask, 1), + (Intrin i64:$zero_col_mask), (Intrin)); + + dag SparseMetadataIns = !if(!eq(Sp, 1), (ins B32:$spmetadata), (ins)); + dag SparseMetadataIntr = !if(!eq(Sp, 1), (Intrin B32:$spmetadata), (Intrin)); + string SparseMetadataStr = !if(!eq(Sp, 1), ", [$spmetadata]", ""); + + int KindVal = !cond( + !eq(KindStr, "f16") : 0, + !eq(KindStr, "tf32") : 1, + !eq(KindStr, "f8f6f4"): 2, + !eq(KindStr, "i8") : 3, + ); + + int CollectorUsageOp = !cond( + !eq(CollectorUsageOpStr, "discard"): 0, + !eq(CollectorUsageOpStr, "lastuse"): 1, + !eq(CollectorUsageOpStr, "fill") : 2, + !eq(CollectorUsageOpStr, "use") : 3, + ); + + string AOperandStr = !if(!eq(ASpace, "tensor"), "[$a]", "$a"); + NVPTXRegClass ARegClass = !if(!eq(ASpace, "tensor"), B32, B64); + + dag input = !con((ins B32:$dtmem, + ARegClass:$a, B64:$b, + B32:$idesc, + B1:$enable_inp_d), + SparseMetadataIns, + ZeroColMaskIns); + + let InOperandList = input; + let OutOperandList = (outs); + let AsmString = "tcgen05.mma.ws" + # !if(!eq(Sp, 1), ".sp", "") + # ".cta_group::1" + # ".kind::" # KindStr + # ".collector::b" # CollectorBufferB + # "::" # CollectorUsageOpStr + # " [$dtmem], " # AOperandStr # ", $b" + # SparseMetadataStr + # ", $idesc, $enable_inp_d" + # ZeroColMaskStr + # ";"; + + dag IntrinsicPattern = !con((Intrin i32:$dtmem, + ARegClass:$a, i64:$b, + i32:$idesc, + i1:$enable_inp_d), + SparseMetadataIntr, + ZeroColMaskIntr); + + dag FlagOperands = (Intrin (i32 KindVal), (i32 CollectorBufferB), + (i32 CollectorUsageOp)); + + let Pattern = [!con(IntrinsicPattern, FlagOperands)]; +} + +// tcgen05.mma.ws +foreach sp = [0, 1] in { + foreach space = ["shared", "tensor"] in { + foreach kind = ["f16", "tf32", "f8f6f4", "i8"] in { + foreach collector_buffer_b = [0, 1, 2, 3] in { + foreach collector_usage_op = ["discard", "fill", "use", "lastuse"] in { + foreach zero_col_mask = [0, 1] in { + def : Tcgen05MMAWSInst; + } + } + } + } + } +} diff --git a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td index 2e81ab122d1df..913487b64617a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -54,7 +54,8 @@ def B16 : NVPTXRegClass<[i16, f16, bf16], 16, (add (sequence "RS%u", 0, 4))>; def B32 : NVPTXRegClass<[i32, v2f16, v2bf16, v2i16, v4i8, f32], 32, (add (sequence "R%u", 0, 4), VRFrame32, VRFrameLocal32)>; -def B64 : NVPTXRegClass<[i64, v2f32, f64], 64, (add (sequence "RL%u", 0, 4), +def B64 : NVPTXRegClass<[i64, v2i32, v2f32, f64], 64, + (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>; // 128-bit regs are not defined as general regs in NVPTX. They are used for inlineASM only. 
def B128 : NVPTXRegClass<[i128], 128, (add (sequence "RQ%u", 0, 4))>; diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 0a77a633cb255..e81c56bb4b562 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -114,6 +114,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { case 1013: // sm_101a HasTcgen05 = true; break; + case 1103: // sm_110a + HasTcgen05 = true; + MinPTXVersion = 90; + break; case 1033: // sm_103a HasTcgen05 = true; MinPTXVersion = 88; @@ -122,6 +126,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { return HasTcgen05 && PTXVersion >= MinPTXVersion; } + + bool hasTcgen05MMAScaleInputDImm() const { + return FullSmVersion == 1003 && PTXVersion >= 86; + } // f32x2 instructions in Blackwell family bool hasF32x2Instructions() const; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index f4f89613b358d..4029e143ae2a4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -281,21 +281,12 @@ static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC, return {Intrinsic::trunc, FTZ_MustBeOn}; // NVVM intrinsics that map to LLVM cast operations. - // - // Note that llvm's target-generic conversion operators correspond to the rz - // (round to zero) versions of the nvvm conversion intrinsics, even though - // most everything else here uses the rn (round to nearest even) nvvm ops. - case Intrinsic::nvvm_d2i_rz: - case Intrinsic::nvvm_f2i_rz: - case Intrinsic::nvvm_d2ll_rz: - case Intrinsic::nvvm_f2ll_rz: - return {Instruction::FPToSI}; - case Intrinsic::nvvm_d2ui_rz: - case Intrinsic::nvvm_f2ui_rz: - case Intrinsic::nvvm_d2ull_rz: - case Intrinsic::nvvm_f2ull_rz: - return {Instruction::FPToUI}; - // Integer to floating-point uses RN rounding, not RZ + // Note - we cannot map intrinsics like nvvm_d2ll_rz to LLVM's + // FPToSI, as NaN to int conversion with FPToSI is considered UB and is + // eliminated. NVVM conversion intrinsics are translated to PTX cvt + // instructions which define the outcome for NaN rather than leaving as UB. + // Therefore, translate NVVM intrinsics to sitofp/uitofp, but not to + // fptosi/fptoui. case Intrinsic::nvvm_i2d_rn: case Intrinsic::nvvm_i2f_rn: case Intrinsic::nvvm_ll2d_rn: @@ -590,8 +581,12 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, } case Intrinsic::nvvm_prefetch_tensormap: { IRBuilder<> Builder(II); - return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap, - NewV); + const unsigned NewAS = NewV->getType()->getPointerAddressSpace(); + if (NewAS == NVPTXAS::ADDRESS_SPACE_CONST || + NewAS == NVPTXAS::ADDRESS_SPACE_PARAM) + return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap, + NewV); + return nullptr; } } return nullptr; diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index a070789f85e0b..4b5cb30fd3036 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -99,8 +99,8 @@ namespace NVPTX { // register. NOTE: This must be kept in sync with the register classes // defined in NVPTXRegisterInfo.td. 
inline auto packed_types() { - static const auto PackedTypes = {MVT::v4i8, MVT::v2f16, MVT::v2bf16, - MVT::v2i16, MVT::v2f32}; + static const auto PackedTypes = {MVT::v4i8, MVT::v2f16, MVT::v2bf16, + MVT::v2i16, MVT::v2f32, MVT::v2i32}; return PackedTypes; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 8ed7c68f54e7f..48c31c91e9338 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -22,7 +22,6 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Casting.h" #include "llvm/Support/EndianStream.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/TargetParser/Triple.h" #include diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index d89a9487c0da2..4ff2f8a54529f 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -360,8 +360,11 @@ def FeatureFastMFLR : SubtargetFeature<"fast-MFLR", "HasFastMFLR", "true", //===----------------------------------------------------------------------===// // PowerPC Instruction Predicate Definitions. -def In32BitMode : Predicate<"!Subtarget->isPPC64()">; -def In64BitMode : Predicate<"Subtarget->isPPC64()">; + +def IsLittleEndian : Predicate<"Subtarget->isLittleEndian()">; +def IsBigEndian : Predicate<"!Subtarget->isLittleEndian()">; +def IsPPC32 : Predicate<"!Subtarget->isPPC64()">; +def IsPPC64 : Predicate<"Subtarget->isPPC64()">; def IsBookE : Predicate<"Subtarget->isBookE()">; def IsNotBookE : Predicate<"!Subtarget->isBookE()">; def HasOnlyMSYNC : Predicate<"Subtarget->hasOnlyMSYNC()">; @@ -379,27 +382,50 @@ def NaNsFPMath : Predicate<"!Subtarget->getTargetMachine().Options.NoNaNsFPMath">; def HasBPERMD : Predicate<"Subtarget->hasBPERMD()">; def HasExtDiv : Predicate<"Subtarget->hasExtDiv()">; +def HasFPU : Predicate<"Subtarget->hasFPU()">; +def HasHTM : Predicate<"Subtarget->hasHTM()">; +def HasDirectMove : Predicate<"Subtarget->hasDirectMove()">; +def HasP8Crypto : Predicate<"Subtarget->hasP8Crypto()">; +def PCRelativeMemops : Predicate<"Subtarget->hasPCRelativeMemops()">; +def PrefixInstrs : Predicate<"Subtarget->hasPrefixInstrs()">; +def PairedVectorMemops : Predicate<"Subtarget->pairedVectorMemops()">; +def MMA : Predicate<"Subtarget->hasMMA()">; + +// Vector support predicates +def HasVSX : Predicate<"Subtarget->hasVSX()">; +def NoP8Vector : Predicate<"!Subtarget->hasP8Vector()">; +def HasP8Vector : Predicate<"Subtarget->hasP8Vector()">; +def HasAltivec : Predicate<"Subtarget->hasAltivec()">; +def HasP8Altivec : Predicate<"Subtarget->hasP8Altivec()">; +def NoP9Vector : Predicate<"!Subtarget->hasP9Vector()">; +def HasP9Vector : Predicate<"Subtarget->hasP9Vector()">; +def NoP9Altivec : Predicate<"!Subtarget->hasP9Altivec()">; +def HasP9Altivec : Predicate<"Subtarget->hasP9Altivec()">; +def HasOnlySwappingMemOps : Predicate<"!Subtarget->hasP9Vector()">; +def NoP10Vector : Predicate<"!Subtarget->hasP10Vector()">; +def HasP10Vector : Predicate<"Subtarget->hasP10Vector()">; + +// Predicates used to differenciate between different ISAs. 
def IsISA2_06 : Predicate<"Subtarget->isISA2_06()">; def IsISA2_07 : Predicate<"Subtarget->isISA2_07()">; def IsISA3_0 : Predicate<"Subtarget->isISA3_0()">; -def HasFPU : Predicate<"Subtarget->hasFPU()">; -def PCRelativeMemops : Predicate<"Subtarget->hasPCRelativeMemops()">; +def IsISA3_1 : Predicate<"Subtarget->isISA3_1()">; def IsNotISA3_1 : Predicate<"!Subtarget->isISA3_1()">; +def IsISAFuture : Predicate<"Subtarget->isISAFuture()">; +def IsNotISAFuture : Predicate<"!Subtarget->isISAFuture()">; // AIX assembler may not be modern enough to support some extended mne. def ModernAs: Predicate<"!Subtarget->isAIXABI() || Subtarget->HasModernAIXAs">, AssemblerPredicate<(any_of (not AIXOS), FeatureModernAIXAs)>; def IsAIX : Predicate<"Subtarget->isAIXABI()">; def NotAIX : Predicate<"!Subtarget->isAIXABI()">; -def IsISAFuture : Predicate<"Subtarget->isISAFuture()">; -def IsNotISAFuture : Predicate<"!Subtarget->isISAFuture()">; //===----------------------------------------------------------------------===// // HwModes //===----------------------------------------------------------------------===// defvar PPC32 = DefaultMode; -def PPC64 : HwMode<[In64BitMode]>; +def PPC64 : HwMode<[IsPPC64]>; // Since new processors generally contain a superset of features of those that // came before them, the idea is to make implementations of new processors diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 2ad3ed21732ed..910bc9d281259 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -2081,9 +2081,8 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, if (!MFI.getSavePoints().empty() && MFI.hasTailCall()) { assert(MFI.getRestorePoints().size() < 2 && "MFI can't contain multiple restore points!"); - MachineBasicBlock *RestoreBlock = MFI.getRestorePoints().front(); for (MachineBasicBlock &MBB : MF) { - if (MBB.isReturnBlock() && (&MBB) != RestoreBlock) + if (MBB.isReturnBlock() && (!MFI.getRestorePoints().contains(&MBB))) createTailCallBranchInstr(MBB); } } diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 2907303874de5..f69218056fc44 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -15411,6 +15411,12 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, } } + // Convert PromOps to handles before doing any RAUW operations, as these + // may CSE with existing nodes, deleting the originals. + std::list PromOpHandles; + for (auto &PromOp : PromOps) + PromOpHandles.emplace_back(PromOp); + // Replace all inputs, either with the truncation operand, or a // truncation or extension to the final output type. for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) { @@ -15434,10 +15440,6 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0))); } - std::list PromOpHandles; - for (auto &PromOp : PromOps) - PromOpHandles.emplace_back(PromOp); - // Replace all operations (these are all the same, but have a different // (promoted) return type). 
DAG.getNode will validate that the types of // a binary operator match, so go through the list in reverse so that diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 269d30318bca8..60efa4c8f0a37 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -76,23 +76,23 @@ let Interpretation64Bit = 1, isCodeGenOnly = 1 in { let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, hasSideEffects = 0 in { let isReturn = 1, isPredicable = 1, Uses = [LR8, RM] in def BLR8 : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB, - [(PPCretglue)]>, Requires<[In64BitMode]>; + [(PPCretglue)]>, Requires<[IsPPC64]>; let isBranch = 1, isIndirectBranch = 1, Uses = [CTR8] in { let isPredicable = 1 in def BCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB, []>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; def BCCCTR8 : XLForm_2_br<19, 528, 0, (outs), (ins (pred $BIBO, $CR):$cond), "b${cond:cc}ctr${cond:pm} ${cond:reg}", IIC_BrB, []>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; def BCCTR8 : XLForm_2_br2<19, 528, 12, 0, (outs), (ins crbitrc:$BI), "bcctr 12, $BI, 0", IIC_BrB, []>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; def BCCTR8n : XLForm_2_br2<19, 528, 4, 0, (outs), (ins crbitrc:$BI), "bcctr 4, $BI, 0", IIC_BrB, []>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; } } @@ -160,20 +160,20 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8], hasSideEffects = 0 in { let isPredicable = 1 in def BCTRL8 : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), "bctrl", IIC_BrB, [(PPCbctrl)]>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; let isCodeGenOnly = 1 in { def BCCCTRL8 : XLForm_2_br<19, 528, 1, (outs), (ins (pred $BIBO, $CR):$cond), "b${cond:cc}ctrl${cond:pm} ${cond:reg}", IIC_BrB, []>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; def BCCTRL8 : XLForm_2_br2<19, 528, 12, 1, (outs), (ins crbitrc:$BI), "bcctrl 12, $BI, 0", IIC_BrB, []>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; def BCCTRL8n : XLForm_2_br2<19, 528, 4, 1, (outs), (ins crbitrc:$BI), "bcctrl 4, $BI, 0", IIC_BrB, []>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; } } } @@ -207,7 +207,7 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8, RM], hasSideEffects = 0, let isPredicable = 1 in def BCTRL8_RM : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), "bctrl", IIC_BrB, [(PPCbctrl_rm)]>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; } } @@ -218,7 +218,7 @@ let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, (ins (memrix $D, $RA):$src), "bctrl\n\tld 2, $src", IIC_BrB, [(PPCbctrl_load_toc iaddrX4:$src)]>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; } let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, @@ -228,7 +228,7 @@ let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, (ins (memrix $D, $RA):$src), "bctrl\n\tld 2, $src", IIC_BrB, [(PPCbctrl_load_toc_rm iaddrX4:$src)]>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; } } // Interpretation64Bit @@ -449,7 +449,7 @@ let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1, isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR8, RM] in def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB, []>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7, isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in @@ -516,7 +516,7 @@ let hasSideEffects = 1 in { def EH_SjLj_SetJmp64 : PPCCustomInserterPseudo<(outs gprc:$dst), (ins memr:$buf), 
"#EH_SJLJ_SETJMP64", [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; } let hasSideEffects = 1, isBarrier = 1 in { @@ -524,7 +524,7 @@ let hasSideEffects = 1, isBarrier = 1 in { def EH_SjLj_LongJmp64 : PPCCustomInserterPseudo<(outs), (ins memr:$buf), "#EH_SJLJ_LONGJMP64", [(PPCeh_sjlj_longjmp addr:$buf)]>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; } def MFSPR8 : XFXForm_1<31, 339, (outs g8rc:$RST), (ins i32imm:$SPR), @@ -1948,7 +1948,7 @@ def : Pat<(atomic_load_nonext_64 XForm:$src), (LDX memrr:$src)>; def : Pat<(atomic_store_64 i64:$val, DSForm:$ptr), (STD g8rc:$val, memrix:$ptr)>; def : Pat<(atomic_store_64 i64:$val, XForm:$ptr), (STDX g8rc:$val, memrr:$ptr)>; -let Predicates = [IsISA3_0, In64BitMode] in { +let Predicates = [IsISA3_0, IsPPC64] in { def : Pat<(i64 (int_ppc_cmpeqb g8rc:$a, g8rc:$b)), (i64 (SETB8 (CMPEQB $a, $b)))>; def : Pat<(i64 (int_ppc_setb g8rc:$a, g8rc:$b)), @@ -1961,7 +1961,7 @@ def : Pat<(i64 (int_ppc_maddld g8rc:$a, g8rc:$b, g8rc:$c)), (i64 (MADDLD8 $a, $b, $c))>; } -let Predicates = [In64BitMode] in { +let Predicates = [IsPPC64] in { def : Pat<(i64 (int_ppc_mulhd g8rc:$a, g8rc:$b)), (i64 (MULHD $a, $b))>; def : Pat<(i64 (int_ppc_mulhdu g8rc:$a, g8rc:$b)), diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td index 97d5e28963234..c616db4a1031c 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td @@ -343,7 +343,6 @@ class VXCR_Int_Ty xo, string opc, Intrinsic IntID, ValueType Ty> //===----------------------------------------------------------------------===// // Instruction Definitions. -def HasAltivec : Predicate<"Subtarget->hasAltivec()">; let Predicates = [HasAltivec] in { def DSS : DSS_Form<0, 822, (outs), (ins u5imm:$STRM), @@ -1193,8 +1192,6 @@ class VX_VT5_VA5_VB5_XO9_o xo, string opc, list pattern> let PS = 0; } -def HasP8Altivec : Predicate<"Subtarget->hasP8Altivec()">; -def HasP8Crypto : Predicate<"Subtarget->hasP8Crypto()">; let Predicates = [HasP8Altivec] in { let isCommutable = 1 in { @@ -1420,7 +1417,6 @@ def VSBOX : VXBX_Int_Ty<1480, "vsbox", int_ppc_altivec_crypto_vsbox, v2i64>; } // HasP8Crypto // The following altivec instructions were introduced in Power ISA 3.0 -def HasP9Altivec : Predicate<"Subtarget->hasP9Altivec()">; let Predicates = [HasP9Altivec] in { // Vector Multiply-Sum diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index 5751d7dc1628b..c3ab9651ff695 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -45,81 +45,148 @@ multiclass XOForm_RTAB5_L1r opcode, bits<9> xo, dag OOL, dag IOL, } } -class VXForm_VRTB5 xo, bits<5> R, dag OOL, dag IOL, string asmstr, - list pattern> : I<4, OOL, IOL, asmstr, NoItinerary> { +class VXForm_VRTB5_Base xo, dag OOL, dag IOL, string asmstr, + list pattern> + : I<4, OOL, IOL, asmstr, NoItinerary> { bits<5> VRT; bits<5> VRB; let Pattern = pattern; let Inst{6...10} = VRT; - let Inst{11...15} = R; let Inst{16...20} = VRB; let Inst{21...31} = xo; } +class VXForm_VRTB5 xo, bits<5> R, dag OOL, dag IOL, string asmstr, + list pattern> + : VXForm_VRTB5_Base { + + let Inst{11...15} = R; +} + class VXForm_VRTB5_UIM2 xo, bits<3> R, dag OOL, dag IOL, string asmstr, list pattern> - : I<4, OOL, IOL, asmstr, NoItinerary> { - bits<5> VRT; - bits<5> VRB; + : VXForm_VRTB5_Base { bits<2> UIM; - let Pattern = pattern; - - let Inst{6...10} = VRT; let Inst{11...13} = R; 
let Inst{14...15} = UIM; - let Inst{16...20} = VRB; - let Inst{21...31} = xo; } class VXForm_VRTB5_UIM1 xo, bits<4> R, dag OOL, dag IOL, string asmstr, list pattern> - : I<4, OOL, IOL, asmstr, NoItinerary> { - bits<5> VRT; - bits<5> VRB; + : VXForm_VRTB5_Base { bits<1> UIM; - let Pattern = pattern; - - let Inst{6...10} = VRT; let Inst{11...14} = R; let Inst{15} = UIM; - let Inst{16...20} = VRB; - let Inst{21...31} = xo; } class VXForm_VRTB5_UIM3 xo, bits<2> R, dag OOL, dag IOL, string asmstr, list pattern> - : I<4, OOL, IOL, asmstr, NoItinerary> { - bits<5> VRT; - bits<5> VRB; + : VXForm_VRTB5_Base { bits<3> UIM; - let Pattern = pattern; - - let Inst{6...10} = VRT; let Inst{11...12} = R; let Inst{13...15} = UIM; - let Inst{16...20} = VRB; - let Inst{21...31} = xo; } class VXForm_VRTAB5 xo, dag OOL, dag IOL, string asmstr, - list pattern> : I<4, OOL, IOL, asmstr, NoItinerary> { - bits<5> VRT; + list pattern> + : VXForm_VRTB5_Base { bits<5> VRA; - bits<5> VRB; + + let Inst{11...15} = VRA; +} + +class XX3Form_XTBp5_M2 xo, dag OOL, dag IOL, string asmstr, + list pattern> + : I<60, OOL, IOL, asmstr, NoItinerary> { + + bits<5> XTp; + bits<5> XBp; + bits<2> M; let Pattern = pattern; - let Inst{6...10} = VRT; - let Inst{11...15} = VRA; - let Inst{16...20} = VRB; - let Inst{21...31} = xo; + let Inst{6...9} = XTp{3...0}; + let Inst {10} = XTp{4}; + let Inst{15} = M{0}; + let Inst{16...19} = XBp{3...0}; + let Inst{20} = M{1}; + let Inst{21...29} = xo; + let Inst{30} = XBp{4}; +} + +class XX3Form_XTABp5_M2 xo, dag OOL, dag IOL, string asmstr, + list pattern> + : I<60, OOL, IOL, asmstr, NoItinerary> { + + bits<5> XTp; + bits<5> XAp; + bits<5> XBp; + bits<2> M; + + let Pattern = pattern; + + let Inst{6...9} = XTp{3...0}; + let Inst{10} = XTp{4}; + let Inst{11...14} = XAp{3...0}; + let Inst{15} = M{0}; + let Inst{16...19} = XBp{3...0}; + let Inst{20} = M{1}; + let Inst{21...28} = xo; + let Inst{29} = XAp{4}; + let Inst{30} = XBp{4}; } +class XX3Form_XTAB6_P1 xo, dag OOL, dag IOL, string asmstr, + list pattern> + : I<60, OOL, IOL, asmstr, NoItinerary> { + + bits<6> XT; + bits<6> XA; + bits<6> XB; + bits<1> P; + + let Pattern = pattern; + + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = XA{4...0}; + let Inst{16...20} = XB{4...0}; + let Inst{21...22} = 3; + let Inst{23} = P; + let Inst{24...28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +class XX3Form_XTAB6 opcode, bits<8> xo, dag OOL, dag IOL, string asmstr, + list pattern> + : I { + + bits<6> XT; + bits<6> XA; + bits<6> XB; + + let Pattern = pattern; + + let Inst{6...10} = XT{4...0}; + let Inst{11...15} = XA{4...0}; + let Inst{16...20} = XB{4...0}; + let Inst{21...28} = xo; + let Inst{29} = XA{5}; + let Inst{30} = XB{5}; + let Inst{31} = XT{5}; +} + +//-------------------------- Instruction definitions -------------------------// +// Predicate combinations available: +// [IsISAFuture] +// [HasVSX, IsISAFuture] + let Predicates = [IsISAFuture] in { defm SUBFUS : XOForm_RTAB5_L1r<31, 72, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB, u1imm:$L), "subfus", @@ -134,10 +201,10 @@ let Predicates = [HasVSX, IsISAFuture] in { def LXVRLL : XX1Form_memOp<31, 557, (outs vsrc:$XT), (ins (memr $RA):$addr, g8rc:$RB), "lxvrll $XT, $addr, $RB", IIC_LdStLoad, []>; - def LXVPRL : XForm_XTp5_XAB5<31, 589, (outs vsrprc:$XTp), + def LXVPRL : XForm_XTp5_RAB5<31, 589, (outs vsrprc:$XTp), (ins (memr $RA):$addr, g8rc:$RB), "lxvprl $XTp, $addr, $RB", IIC_LdStLFD, []>; - def LXVPRLL : XForm_XTp5_XAB5<31, 621, (outs vsrprc:$XTp), + def LXVPRLL 
: XForm_XTp5_RAB5<31, 621, (outs vsrprc:$XTp), (ins (memr $RA):$addr, g8rc:$RB), "lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; } @@ -149,11 +216,11 @@ let Predicates = [HasVSX, IsISAFuture] in { def STXVRLL : XX1Form_memOp<31, 685, (outs), (ins vsrc:$XT, (memr $RA):$addr, g8rc:$RB), "stxvrll $XT, $addr, $RB", IIC_LdStLoad, []>; - def STXVPRL : XForm_XTp5_XAB5<31, 717, (outs), + def STXVPRL : XForm_XTp5_RAB5<31, 717, (outs), (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), "stxvprl $XTp, $addr, $RB", IIC_LdStLFD, []>; def STXVPRLL - : XForm_XTp5_XAB5<31, 749, (outs), + : XForm_XTp5_RAB5<31, 749, (outs), (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), "stxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; } @@ -191,9 +258,46 @@ let Predicates = [HasVSX, IsISAFuture] in { def VUCMPRLH : VXForm_VRTAB5<323, (outs vrrc:$VRT), (ins vrrc:$VRA, vrrc:$VRB), "vucmprlh $VRT, $VRA, $VRB", []>; + + // AES Acceleration Instructions + def XXAESENCP : XX3Form_XTABp5_M2<194, (outs vsrprc:$XTp), + (ins vsrprc:$XAp, vsrprc:$XBp, u2imm:$M), + "xxaesencp $XTp, $XAp, $XBp, $M", []>; + def XXAESDECP : XX3Form_XTABp5_M2<202, (outs vsrprc:$XTp), + (ins vsrprc:$XAp, vsrprc:$XBp, u2imm:$M), + "xxaesdecp $XTp, $XAp, $XBp, $M", []>; + def XXAESGENLKP : XX3Form_XTBp5_M2<420, (outs vsrprc:$XTp), + (ins vsrprc:$XBp, u2imm:$M), + "xxaesgenlkp $XTp, $XBp, $M", []>; + def XXGFMUL128 : XX3Form_XTAB6_P1<26, (outs vsrc:$XT), + (ins vsrc:$XA, vsrc:$XB, u1imm:$P), + "xxgfmul128 $XT, $XA, $XB, $P", []>; + + // VSX Vector Integer Arithmetic Instructions + def XVADDUWM : XX3Form_XTAB6<60, 131, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvadduwm $XT, $XA, $XB", []>; + def XVADDUHM : XX3Form_XTAB6<60, 139, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvadduhm $XT, $XA, $XB", []>; + def XVSUBUWM: XX3Form_XTAB6<60, 147, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvsubuwm $XT, $XA, $XB", []>; + def XVSUBUHM: XX3Form_XTAB6<60, 155, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvsubuhm $XT, $XA, $XB", []>; + def XVMULUWM: XX3Form_XTAB6<60, 163, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmuluwm $XT, $XA, $XB", []>; + def XVMULUHM: XX3Form_XTAB6<60, 171, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmuluhm $XT, $XA, $XB", []>; + def XVMULHSW: XX3Form_XTAB6<60, 179, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmulhsw $XT, $XA, $XB", []>; + def XVMULHSH: XX3Form_XTAB6<60, 187, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmulhsh $XT, $XA, $XB", []>; + def XVMULHUW: XX3Form_XTAB6<60, 114, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmulhuw $XT, $XA, $XB", []>; + def XVMULHUH: XX3Form_XTAB6<60, 122, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB), + "xvmulhuh $XT, $XA, $XB", []>; } //---------------------------- Anonymous Patterns ----------------------------// +// Predicate combinations available: // Load/Store VSX Vector with Right Length (Left-justified). 
def : Pat<(v4i32 (int_ppc_vsx_lxvrl addr:$RA, i64:$RB)), (LXVRL $RA, $RB)>; @@ -210,3 +314,32 @@ def : Pat<(int_ppc_vsx_stxvprl v256i1:$XTp, addr:$RA, i64:$RB), (STXVPRL $XTp, $RA, $RB)>; def : Pat<(int_ppc_vsx_stxvprll v256i1:$XTp, addr:$RA, i64:$RB), (STXVPRLL $XTp, $RA, $RB)>; + +//---------------------------- Instruction aliases ---------------------------// +// Predicate combinations available: +// [HasVSX, IsISAFuture] + +let Predicates = [HasVSX, IsISAFuture] in { + def : InstAlias<"xxaes128encp $XTp, $XAp, $XBp", + (XXAESENCP vsrprc:$XTp, vsrprc:$XAp, vsrprc:$XBp, 0)>; + def : InstAlias<"xxaes192encp $XTp, $XAp, $XBp", + (XXAESENCP vsrprc:$XTp, vsrprc:$XAp, vsrprc:$XBp, 1)>; + def : InstAlias<"xxaes256encp $XTp, $XAp, $XBp", + (XXAESENCP vsrprc:$XTp, vsrprc:$XAp, vsrprc:$XBp, 2)>; + def : InstAlias<"xxaes128decp $XTp, $XAp, $XBp", + (XXAESDECP vsrprc:$XTp, vsrprc:$XAp, vsrprc:$XBp, 0)>; + def : InstAlias<"xxaes192decp $XTp, $XAp, $XBp", + (XXAESDECP vsrprc:$XTp, vsrprc:$XAp, vsrprc:$XBp, 1)>; + def : InstAlias<"xxaes256decp $XTp, $XAp, $XBp", + (XXAESDECP vsrprc:$XTp, vsrprc:$XAp, vsrprc:$XBp, 2)>; + def : InstAlias<"xxaes128genlkp $XTp, $XBp", (XXAESGENLKP vsrprc:$XTp, + vsrprc:$XBp, 0)>; + def : InstAlias<"xxaes192genlkp $XTp, $XBp", (XXAESGENLKP vsrprc:$XTp, + vsrprc:$XBp, 1)>; + def : InstAlias<"xxaes256genlkp $XTp, $XBp", (XXAESGENLKP vsrprc:$XTp, + vsrprc:$XBp, 2)>; + def : InstAlias<"xxgfmul128gcm $XT, $XA, $XB", (XXGFMUL128 vsrc:$XT, vsrc:$XA, + vsrc:$XB, 0)>; + def : InstAlias<"xxgfmul128xts $XT, $XA, $XB", (XXGFMUL128 vsrc:$XT, vsrc:$XA, + vsrc:$XB, 1)>; +} diff --git a/llvm/lib/Target/PowerPC/PPCInstrHTM.td b/llvm/lib/Target/PowerPC/PPCInstrHTM.td index 8d0ac512b290d..6b5da44c91c2b 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrHTM.td +++ b/llvm/lib/Target/PowerPC/PPCInstrHTM.td @@ -11,10 +11,6 @@ // //===----------------------------------------------------------------------===// - - -def HasHTM : Predicate<"Subtarget->hasHTM()">; - def HTM_get_imm : SDNodeXFormgetZExtValue(), SDLoc(N)); }]>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 55e38bcf4afc9..3014aa6bfe31e 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1075,7 +1075,7 @@ Register PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, // For opcodes with the ReMaterializable flag set, this function is called to // verify the instruction is really rematable. 
-bool PPCInstrInfo::isReallyTriviallyReMaterializable( +bool PPCInstrInfo::isReMaterializableImpl( const MachineInstr &MI) const { switch (MI.getOpcode()) { default: @@ -1112,7 +1112,7 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable( case PPC::DMXXSETACCZ: return true; } - return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); + return TargetInstrInfo::isReMaterializableImpl(MI); } Register PPCInstrInfo::isStoreToStackSlot(const MachineInstr &MI, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h index 63ebd65910572..d67fc28935586 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -530,7 +530,7 @@ class PPCInstrInfo : public PPCGenInstrInfo { unsigned &SubIdx) const override; Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; - bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; + bool isReMaterializableImpl(const MachineInstr &MI) const override; Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 1c45050cdf9ca..aca7abd5a45a7 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -1282,7 +1282,7 @@ def RESTORE_CRBIT : PPCEmitTimePseudo<(outs crbitrc:$cond), (ins memri:$F), let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, hasSideEffects = 0 in { let isPredicable = 1, isReturn = 1, Uses = [LR, RM] in def BLR : XLForm_2_ext<19, 16, 20, 0, 0, (outs), (ins), "blr", IIC_BrB, - [(PPCretglue)]>, Requires<[In32BitMode]>; + [(PPCretglue)]>, Requires<[IsPPC32]>; let isBranch = 1, isIndirectBranch = 1, Uses = [CTR] in { let isPredicable = 1 in def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB, @@ -1455,7 +1455,7 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in { let isPredicable = 1 in def BCTRL : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), "bctrl", IIC_BrB, [(PPCbctrl)]>, - Requires<[In32BitMode]>; + Requires<[IsPPC32]>; let isCodeGenOnly = 1 in { def BCCCTRL : XLForm_2_br<19, 528, 1, (outs), (ins (pred $BIBO, $CR):$cond), @@ -1541,7 +1541,7 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR, RM], isCodeGenOnly = 1 in { let isPredicable = 1 in def BCTRL_RM : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins), "bctrl", IIC_BrB, [(PPCbctrl_rm)]>, - Requires<[In32BitMode]>; + Requires<[IsPPC32]>; } } @@ -1567,7 +1567,7 @@ let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, def BCTRL_LWZinto_toc: XLForm_2_ext_and_DForm_1<19, 528, 20, 0, 1, 32, (outs), (ins (memri $D, $RA):$addr), "bctrl\n\tlwz 2, $addr", IIC_BrB, - [(PPCbctrl_load_toc iaddr:$addr)]>, Requires<[In32BitMode]>; + [(PPCbctrl_load_toc iaddr:$addr)]>, Requires<[IsPPC32]>; } @@ -1576,7 +1576,7 @@ let isCall = 1, PPC970_Unit = 7, isCodeGenOnly = 1, def BCTRL_LWZinto_toc_RM: XLForm_2_ext_and_DForm_1<19, 528, 20, 0, 1, 32, (outs), (ins (memri $D, $RA):$addr), "bctrl\n\tlwz 2, $addr", IIC_BrB, - [(PPCbctrl_load_toc_rm iaddr:$addr)]>, Requires<[In32BitMode]>; + [(PPCbctrl_load_toc_rm iaddr:$addr)]>, Requires<[IsPPC32]>; } @@ -1585,7 +1585,7 @@ let isCodeGenOnly = 1, hasSideEffects = 0 in { let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1, isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM] in def TAILBCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", IIC_BrB, - []>, Requires<[In32BitMode]>; + []>, Requires<[IsPPC32]>; let isBranch = 1, 
isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7, isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in @@ -1608,7 +1608,7 @@ let hasSideEffects = 1 in { def EH_SjLj_SetJmp32 : PPCCustomInserterPseudo<(outs gprc:$dst), (ins memr:$buf), "#EH_SJLJ_SETJMP32", [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>, - Requires<[In32BitMode]>; + Requires<[IsPPC32]>; } let hasSideEffects = 1, isBarrier = 1 in { @@ -1616,7 +1616,7 @@ let hasSideEffects = 1, isBarrier = 1 in { def EH_SjLj_LongJmp32 : PPCCustomInserterPseudo<(outs), (ins memr:$buf), "#EH_SJLJ_LONGJMP32", [(PPCeh_sjlj_longjmp addr:$buf)]>, - Requires<[In32BitMode]>; + Requires<[IsPPC32]>; } // This pseudo is never removed from the function, as it serves as @@ -3438,8 +3438,6 @@ def Msk2Imm : ImmLeaf(Imm); }]>; def Msk4Imm : ImmLeaf(Imm); }]>; def Msk8Imm : ImmLeaf(Imm); }]>; -def MMA : Predicate<"Subtarget->hasMMA()">; - // Prefixed instructions may require access to the above defs at a later // time so we include this after the def. include "PPCInstrP10.td" @@ -5144,9 +5142,9 @@ def RotateInsertByte1 { } // Clear the upper half of the register when in 64-bit mode -let Predicates = [In64BitMode] in +let Predicates = [IsPPC64] in def : Pat<(i32 (bitreverse i32:$A)), (RLDICL_32 RotateInsertByte1.Left, 0, 32)>; -let Predicates = [In32BitMode] in +let Predicates = [IsPPC32] in def : Pat<(i32 (bitreverse i32:$A)), RotateInsertByte1.Left>; // Fast 64-bit reverse bits algorithm: diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td index 8ee9cc952dec6..2384959a60a43 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrP10.td +++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td @@ -51,10 +51,6 @@ // Moreover, the order of operands reflects the order of operands // in the encoding. -//-------------------------- Predicate definitions ---------------------------// -def IsPPC32 : Predicate<"!Subtarget->isPPC64()">; - - //===----------------------------------------------------------------------===// // PowerPC ISA 3.1 specific type constraints. 
// @@ -634,9 +630,6 @@ multiclass 8LS_DForm_R_SI34_XT6_RA5_MEM_p opcode, dag OOL, dag IOL, } } -def PrefixInstrs : Predicate<"Subtarget->hasPrefixInstrs()">; -def IsISA3_1 : Predicate<"Subtarget->isISA3_1()">; -def PairedVectorMemops : Predicate<"Subtarget->pairedVectorMemops()">; def RCCp { dag AToVSRC = (COPY_TO_REGCLASS $XA, VSRC); dag BToVSRC = (COPY_TO_REGCLASS $XB, VSRC); @@ -870,7 +863,7 @@ class DQForm_XTp5_RA17_MEM opcode, bits<4> xo, dag OOL, dag IOL, let Inst{28...31} = xo; } -class XForm_XTp5_XAB5 opcode, bits<10> xo, dag OOL, dag IOL, +class XForm_XTp5_RAB5 opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, InstrItinClass itin, list pattern> : I, XFormMemOp { bits<5> XTp; @@ -1159,7 +1152,7 @@ let Predicates = [PairedVectorMemops] in { def LXVP : DQForm_XTp5_RA17_MEM<6, 0, (outs vsrprc:$XTp), (ins (memrix16 $DQ, $RA):$addr), "lxvp $XTp, $addr", IIC_LdStLFD, []>; - def LXVPX : XForm_XTp5_XAB5<31, 333, (outs vsrprc:$XTp), (ins (memrr $RA, $RB):$addr), + def LXVPX : XForm_XTp5_RAB5<31, 333, (outs vsrprc:$XTp), (ins (memrr $RA, $RB):$addr), "lxvpx $XTp, $addr", IIC_LdStLFD, []>; } @@ -1168,7 +1161,7 @@ let Predicates = [PairedVectorMemops] in { def STXVP : DQForm_XTp5_RA17_MEM<6, 1, (outs), (ins vsrprc:$XTp, (memrix16 $DQ, $RA):$addr), "stxvp $XTp, $addr", IIC_LdStLFD, []>; - def STXVPX : XForm_XTp5_XAB5<31, 461, (outs), (ins vsrprc:$XTp, (memrr $RA, $RB):$addr), + def STXVPX : XForm_XTp5_RAB5<31, 461, (outs), (ins vsrprc:$XTp, (memrr $RA, $RB):$addr), "stxvpx $XTp, $addr", IIC_LdStLFD, []>; } diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 4e5165bfcda55..979ba31b0431b 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -116,20 +116,6 @@ def PPCSToV : SDNode<"PPCISD::SCALAR_TO_VECTOR_PERMUTED", SDTypeProfile<1, 1, []>, []>; def PPCxxperm : SDNode<"PPCISD::XXPERM", SDT_PPCxxperm, []>; -//-------------------------- Predicate definitions ---------------------------// -def HasVSX : Predicate<"Subtarget->hasVSX()">; -def IsLittleEndian : Predicate<"Subtarget->isLittleEndian()">; -def IsBigEndian : Predicate<"!Subtarget->isLittleEndian()">; -def IsPPC64 : Predicate<"Subtarget->isPPC64()">; -def HasOnlySwappingMemOps : Predicate<"!Subtarget->hasP9Vector()">; -def NoP8Vector : Predicate<"!Subtarget->hasP8Vector()">; -def HasP8Vector : Predicate<"Subtarget->hasP8Vector()">; -def HasDirectMove : Predicate<"Subtarget->hasDirectMove()">; -def NoP9Vector : Predicate<"!Subtarget->hasP9Vector()">; -def HasP9Vector : Predicate<"Subtarget->hasP9Vector()">; -def NoP9Altivec : Predicate<"!Subtarget->hasP9Altivec()">; -def NoP10Vector: Predicate<"!Subtarget->hasP10Vector()">; -def HasP10Vector: Predicate<"Subtarget->hasP10Vector()">; def PPCldsplatAlign16 : PatFrag<(ops node:$ptr), (PPCldsplat node:$ptr), [{ return cast(N)->getAlign() >= Align(16) && @@ -1293,13 +1279,13 @@ let Predicates = [HasVSX, HasP8Vector] in { def MFVSRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$RA), (ins vsfrc:$XT), "mfvsrd $RA, $XT", IIC_VecGeneral, [(set i64:$RA, (PPCmfvsr f64:$XT))]>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; // FIXME: Setting the hasSideEffects flag here to match current behaviour. 
let isCodeGenOnly = 1, hasSideEffects = 1 in def MFVRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$RA), (ins vsrc:$XT), "mfvsrd $RA, $XT", IIC_VecGeneral, []>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; def MFVSRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$RA), (ins vsfrc:$XT), "mfvsrwz $RA, $XT", IIC_VecGeneral, [(set i32:$RA, (PPCmfvsr f64:$XT))]>, ZExt32To64; @@ -1311,13 +1297,13 @@ let Predicates = [HasVSX, HasP8Vector] in { def MTVSRD : XX1_RS6_RD5_XO<31, 179, (outs vsfrc:$XT), (ins g8rc:$RA), "mtvsrd $XT, $RA", IIC_VecGeneral, [(set f64:$XT, (PPCmtvsra i64:$RA))]>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; // FIXME: Setting the hasSideEffects flag here to match current behaviour. let isCodeGenOnly = 1, hasSideEffects = 1 in def MTVRD : XX1_RS6_RD5_XO<31, 179, (outs vsrc:$XT), (ins g8rc:$RA), "mtvsrd $XT, $RA", IIC_VecGeneral, []>, - Requires<[In64BitMode]>; + Requires<[IsPPC64]>; def MTVSRWA : XX1_RS6_RD5_XO<31, 211, (outs vsfrc:$XT), (ins gprc:$RA), "mtvsrwa $XT, $RA", IIC_VecGeneral, [(set f64:$XT, (PPCmtvsra i32:$RA))]>; @@ -1344,11 +1330,11 @@ def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$RA), def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc_nox0:$RA, g8rc:$RB), "mtvsrdd $XT, $RA, $RB", IIC_VecGeneral, - []>, Requires<[In64BitMode]>; + []>, Requires<[IsPPC64]>; def MFVSRLD: XX1_RS6_RD5_XO<31, 307, (outs g8rc:$RA), (ins vsrc:$XT), "mfvsrld $RA, $XT", IIC_VecGeneral, - []>, Requires<[In64BitMode]>; + []>, Requires<[IsPPC64]>; } // HasVSX, IsISA3_0, HasDirectMove diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 50891da333f01..21dbb7cbc9844 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -811,6 +811,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { bool isSImm6() const { return isSImm<6>(); } bool isSImm10() const { return isSImm<10>(); } bool isSImm11() const { return isSImm<11>(); } + bool isSImm12() const { return isSImm<12>(); } bool isSImm16() const { return isSImm<16>(); } bool isSImm26() const { return isSImm<26>(); } @@ -859,7 +860,7 @@ struct RISCVOperand final : public MCParsedAsmOperand { return SignExtend64<32>(Imm); } - bool isSImm12() const { + bool isSImm12LO() const { if (!isExpr()) return false; @@ -1599,6 +1600,9 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, case Match_InvalidUImm16NonZero: return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 16) - 1); case Match_InvalidSImm12: + return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 11), + (1 << 11) - 1); + case Match_InvalidSImm12LO: return generateImmOutOfRangeError( Operands, ErrorInfo, -(1 << 11), (1 << 11) - 1, "operand must be a symbol with %lo/%pcrel_lo/%tprel_lo specifier or an " diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index ff07122b61378..b8ec0bbfcd3bb 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ -206,6 +206,14 @@ static DecodeStatus DecodeSPRegisterClass(MCInst &Inst, return MCDisassembler::Success; } +static DecodeStatus DecodeSPRegisterClass(MCInst &Inst, uint64_t RegNo, + uint32_t Address, + const MCDisassembler *Decoder) { + assert(RegNo == 2); + Inst.addOperand(MCOperand::createReg(RISCV::X2)); + return MCDisassembler::Success; +} + static DecodeStatus DecodeGPRX5RegisterClass(MCInst 
&Inst, const MCDisassembler *Decoder) { Inst.addOperand(MCOperand::createReg(RISCV::X5)); @@ -668,8 +676,8 @@ static constexpr FeatureBitset XTHeadGroup = { RISCV::FeatureVendorXTHeadVdot}; static constexpr FeatureBitset XAndesGroup = { - RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesBFHCvt, - RISCV::FeatureVendorXAndesVBFHCvt, + RISCV::FeatureVendorXAndesPerf, RISCV::FeatureVendorXAndesBFHCvt, + RISCV::FeatureVendorXAndesVBFHCvt, RISCV::FeatureVendorXAndesVSIntH, RISCV::FeatureVendorXAndesVSIntLoad, RISCV::FeatureVendorXAndesVPackFPH, RISCV::FeatureVendorXAndesVDot}; diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp index ae44306170758..50730c697989d 100644 --- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp +++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp @@ -18,6 +18,7 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/DebugLog.h" #define DEBUG_TYPE "llvm-mca-riscv-custombehaviour" @@ -86,7 +87,8 @@ uint8_t RISCVSEWInstrument::getSEW() const { bool RISCVInstrumentManager::supportsInstrumentType( llvm::StringRef Type) const { return Type == RISCVLMULInstrument::DESC_NAME || - Type == RISCVSEWInstrument::DESC_NAME; + Type == RISCVSEWInstrument::DESC_NAME || + InstrumentManager::supportsInstrumentType(Type); } UniqueInstrument @@ -94,8 +96,8 @@ RISCVInstrumentManager::createInstrument(llvm::StringRef Desc, llvm::StringRef Data) { if (Desc == RISCVLMULInstrument::DESC_NAME) { if (!RISCVLMULInstrument::isDataValid(Data)) { - LLVM_DEBUG(dbgs() << "RVCB: Bad data for instrument kind " << Desc << ": " - << Data << '\n'); + LDBG() << "RVCB: Bad data for instrument kind " << Desc << ": " << Data + << '\n'; return nullptr; } return std::make_unique(Data); @@ -103,23 +105,23 @@ RISCVInstrumentManager::createInstrument(llvm::StringRef Desc, if (Desc == RISCVSEWInstrument::DESC_NAME) { if (!RISCVSEWInstrument::isDataValid(Data)) { - LLVM_DEBUG(dbgs() << "RVCB: Bad data for instrument kind " << Desc << ": " - << Data << '\n'); + LDBG() << "RVCB: Bad data for instrument kind " << Desc << ": " << Data + << '\n'; return nullptr; } return std::make_unique(Data); } - LLVM_DEBUG(dbgs() << "RVCB: Unknown instrumentation Desc: " << Desc << '\n'); - return nullptr; + LDBG() << "RVCB: Creating default instrument for Desc: " << Desc << '\n'; + return InstrumentManager::createInstrument(Desc, Data); } SmallVector RISCVInstrumentManager::createInstruments(const MCInst &Inst) { if (Inst.getOpcode() == RISCV::VSETVLI || Inst.getOpcode() == RISCV::VSETIVLI) { - LLVM_DEBUG(dbgs() << "RVCB: Found VSETVLI and creating instrument for it: " - << Inst << "\n"); + LDBG() << "RVCB: Found VSETVLI and creating instrument for it: " << Inst + << "\n"; unsigned VTypeI = Inst.getOperand(2).getImm(); RISCVVType::VLMUL VLMUL = RISCVVType::getVLMUL(VTypeI); @@ -250,8 +252,7 @@ unsigned RISCVInstrumentManager::getSchedClassID( // Need LMUL or LMUL, SEW in order to override opcode. If no LMUL is provided, // then no option to override. 
if (!LI) { - LLVM_DEBUG( - dbgs() << "RVCB: Did not use instrumentation to override Opcode.\n"); + LDBG() << "RVCB: Did not use instrumentation to override Opcode.\n"; return SchedClassID; } uint8_t LMUL = LI->getLMUL(); @@ -313,22 +314,21 @@ unsigned RISCVInstrumentManager::getSchedClassID( // Not a RVV instr if (!VPOpcode) { - LLVM_DEBUG( - dbgs() << "RVCB: Could not find PseudoInstruction for Opcode " - << MCII.getName(Opcode) - << ", LMUL=" << (LI ? LI->getData() : "Unspecified") - << ", SEW=" << (SI ? SI->getData() : "Unspecified") - << ". Ignoring instrumentation and using original SchedClassID=" - << SchedClassID << '\n'); + LDBG() << "RVCB: Could not find PseudoInstruction for Opcode " + << MCII.getName(Opcode) + << ", LMUL=" << (LI ? LI->getData() : "Unspecified") + << ", SEW=" << (SI ? SI->getData() : "Unspecified") + << ". Ignoring instrumentation and using original SchedClassID=" + << SchedClassID << '\n'; return SchedClassID; } // Override using pseudo - LLVM_DEBUG(dbgs() << "RVCB: Found Pseudo Instruction for Opcode " - << MCII.getName(Opcode) << ", LMUL=" << LI->getData() - << ", SEW=" << (SI ? SI->getData() : "Unspecified") - << ". Overriding original SchedClassID=" << SchedClassID - << " with " << MCII.getName(*VPOpcode) << '\n'); + LDBG() << "RVCB: Found Pseudo Instruction for Opcode " << MCII.getName(Opcode) + << ", LMUL=" << LI->getData() + << ", SEW=" << (SI ? SI->getData() : "Unspecified") + << ". Overriding original SchedClassID=" << SchedClassID << " with " + << MCII.getName(*VPOpcode) << '\n'; return MCII.get(*VPOpcode).getSchedClass(); } diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 677d93521c6f1..a02de31d1cc4d 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1648,6 +1648,14 @@ def HasVendorXAndesVBFHCvt AssemblerPredicate<(all_of FeatureVendorXAndesVBFHCvt), "'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension)">; +def FeatureVendorXAndesVSIntH + : RISCVExtension<5, 0, "Andes Vector Small INT Handling Extension", + [FeatureStdExtZve32x]>; +def HasVendorXAndesVSIntH + : Predicate<"Subtarget->hasVendorXAndesVSIntH()">, + AssemblerPredicate<(all_of FeatureVendorXAndesVSIntH), + "'XAndesVSIntH' (Andes Vector Small INT Handling Extension)">; + def FeatureVendorXAndesVSIntLoad : RISCVExtension<5, 0, "Andes Vector INT4 Load Extension", [FeatureStdExtZve32x]>; diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index a9ecf44e8da1e..2e5f30f8fe35a 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -41,12 +41,12 @@ def GIImmPlus1 : def PtrVT : PtrValueTypeByHwMode; // Define pattern expansions for pointer ult/slt conditional codes -def : Pat<(XLenVT (setult (PtrVT GPR:$rs1), simm12:$imm12)), - (SLTIU GPR:$rs1, simm12:$imm12)>; +def : Pat<(XLenVT (setult (PtrVT GPR:$rs1), simm12_lo:$imm12)), + (SLTIU GPR:$rs1, simm12_lo:$imm12)>; def : Pat<(XLenVT (setult (PtrVT GPR:$rs1), (PtrVT GPR:$rs2))), (SLTU GPR:$rs1, GPR:$rs2)>; -def : Pat<(XLenVT (setlt (PtrVT GPR:$rs1), simm12:$imm12)), - (SLTI GPR:$rs1, simm12:$imm12)>; +def : Pat<(XLenVT (setlt (PtrVT GPR:$rs1), simm12_lo:$imm12)), + (SLTI GPR:$rs1, simm12_lo:$imm12)>; def : Pat<(XLenVT (setlt (PtrVT GPR:$rs1), (PtrVT GPR:$rs2))), (SLT GPR:$rs1, GPR:$rs2)>; @@ -72,12 +72,12 @@ def : Pat<(XLenVT (setgt (Ty GPR:$rs1), (Ty simm12Minus1Nonzero:$imm))), (XORI (SLTI GPR:$rs1, (ImmPlus1 simm12Minus1Nonzero:$imm)), 1)>; def : Pat<(XLenVT (setgt (Ty 
GPR:$rs1), (Ty GPR:$rs2))), (SLT GPR:$rs2, GPR:$rs1)>; -def : Pat<(XLenVT (setuge (XLenVT GPR:$rs1), (Ty simm12:$imm))), - (XORI (SLTIU GPR:$rs1, simm12:$imm), 1)>; +def : Pat<(XLenVT (setuge (XLenVT GPR:$rs1), (Ty simm12_lo:$imm))), + (XORI (SLTIU GPR:$rs1, simm12_lo:$imm), 1)>; def : Pat<(XLenVT (setuge (Ty GPR:$rs1), (Ty GPR:$rs2))), (XORI (SLTU GPR:$rs1, GPR:$rs2), 1)>; -def : Pat<(XLenVT (setge (Ty GPR:$rs1), (Ty simm12:$imm))), - (XORI (SLTI GPR:$rs1, simm12:$imm), 1)>; +def : Pat<(XLenVT (setge (Ty GPR:$rs1), (Ty simm12_lo:$imm))), + (XORI (SLTI GPR:$rs1, simm12_lo:$imm), 1)>; def : Pat<(XLenVT (setge (Ty GPR:$rs1), (Ty GPR:$rs2))), (XORI (SLT GPR:$rs1, GPR:$rs2), 1)>; def : Pat<(XLenVT (setule (Ty GPR:$rs1), (Ty simm12Minus1NonzeroNonNeg1:$imm))), @@ -143,8 +143,8 @@ def : Pat<(anyext (i32 GPR:$src)), (COPY GPR:$src)>; def : Pat<(sext (i32 GPR:$src)), (ADDIW GPR:$src, 0)>; def : Pat<(i32 (trunc GPR:$src)), (COPY GPR:$src)>; -def : Pat<(sext_inreg (i64 (add GPR:$rs1, simm12:$imm)), i32), - (ADDIW GPR:$rs1, simm12:$imm)>; +def : Pat<(sext_inreg (i64 (add GPR:$rs1, simm12_lo:$imm)), i32), + (ADDIW GPR:$rs1, simm12_lo:$imm)>; // Use sext if the sign bit of the input is 0. def : Pat<(zext_is_sext (i32 GPR:$src)), (ADDIW GPR:$src, 0)>; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index dda6023b37f7b..437022f5cde9f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -677,95 +677,6 @@ bool RISCVDAGToDAGISel::trySignedBitfieldExtract(SDNode *Node) { return false; } -bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) { - // Supported only in Xqcibm for now. - if (!Subtarget->hasVendorXqcibm()) - return false; - - using namespace SDPatternMatch; - - SDValue X; - APInt MaskImm; - if (!sd_match(Node, m_Or(m_OneUse(m_Value(X)), m_ConstInt(MaskImm)))) - return false; - - unsigned ShAmt, Width; - if (!MaskImm.isShiftedMask(ShAmt, Width) || MaskImm.isSignedIntN(12)) - return false; - - // If Zbs is enabled and it is a single bit set we can use BSETI which - // can be compressed to C_BSETI when Xqcibm in enabled. - if (Width == 1 && Subtarget->hasStdExtZbs()) - return false; - - // If C1 is a shifted mask (but can't be formed as an ORI), - // use a bitfield insert of -1. - // Transform (or x, C1) - // -> (qc.insbi x, -1, width, shift) - SDLoc DL(Node); - MVT VT = Node->getSimpleValueType(0); - - SDValue Ops[] = {X, CurDAG->getSignedTargetConstant(-1, DL, VT), - CurDAG->getTargetConstant(Width, DL, VT), - CurDAG->getTargetConstant(ShAmt, DL, VT)}; - SDNode *BitIns = CurDAG->getMachineNode(RISCV::QC_INSBI, DL, VT, Ops); - ReplaceNode(Node, BitIns); - return true; -} - -// Generate a QC_INSB/QC_INSBI from 'or (and X, MaskImm), OrImm' iff the value -// being inserted only sets known zero bits. -bool RISCVDAGToDAGISel::tryBitfieldInsertOpFromOrAndImm(SDNode *Node) { - // Supported only in Xqcibm for now. - if (!Subtarget->hasVendorXqcibm()) - return false; - - using namespace SDPatternMatch; - - SDValue And; - APInt MaskImm, OrImm; - if (!sd_match(Node, m_Or(m_OneUse(m_And(m_Value(And), m_ConstInt(MaskImm))), - m_ConstInt(OrImm)))) - return false; - - // Compute the Known Zero for the AND as this allows us to catch more general - // cases than just looking for AND with imm. - KnownBits Known = CurDAG->computeKnownBits(Node->getOperand(0)); - - // The bits being inserted must only set those bits that are known to be zero. 
- if (!OrImm.isSubsetOf(Known.Zero)) { - // FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't - // currently handle this case. - return false; - } - - unsigned ShAmt, Width; - // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00). - if (!Known.Zero.isShiftedMask(ShAmt, Width)) - return false; - - // QC_INSB(I) dst, src, #width, #shamt. - SDLoc DL(Node); - MVT VT = Node->getSimpleValueType(0); - SDValue ImmNode; - auto Opc = RISCV::QC_INSB; - - int32_t LIImm = OrImm.getSExtValue() >> ShAmt; - - if (isInt<5>(LIImm)) { - Opc = RISCV::QC_INSBI; - ImmNode = CurDAG->getSignedTargetConstant(LIImm, DL, MVT::i32); - } else { - ImmNode = selectImm(CurDAG, DL, MVT::i32, LIImm, *Subtarget); - } - - SDValue Ops[] = {And, ImmNode, CurDAG->getTargetConstant(Width, DL, VT), - CurDAG->getTargetConstant(ShAmt, DL, VT)}; - SDNode *BitIns = CurDAG->getMachineNode(Opc, DL, VT, Ops); - ReplaceNode(Node, BitIns); - return true; -} - bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) { // Only supported with XAndesPerf at the moment. if (!Subtarget->hasVendorXAndesPerf()) @@ -1384,12 +1295,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { return; } case ISD::OR: { - if (trySignedBitfieldInsertInMask(Node)) - return; - - if (tryBitfieldInsertOpFromOrAndImm(Node)) - return; - if (tryShrinkShlLogicImm(Node)) return; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index cf2f763abc063..f03b44c875cab 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -76,8 +76,6 @@ class RISCVDAGToDAGISel : public SelectionDAGISel { bool tryShrinkShlLogicImm(SDNode *Node); bool trySignedBitfieldExtract(SDNode *Node); bool trySignedBitfieldInsertInSign(SDNode *Node); - bool trySignedBitfieldInsertInMask(SDNode *Node); - bool tryBitfieldInsertOpFromOrAndImm(SDNode *Node); bool tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT, SDValue X, unsigned Msb, unsigned Lsb); bool tryUnsignedBitfieldInsertInZero(SDNode *Node, const SDLoc &DL, MVT VT, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 8070a512ab078..50649cf3caba4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -79,7 +79,7 @@ static cl::opt FPImmCost(DEBUG_TYPE "-fpimm-cost", cl::Hidden, cl::desc("Give the maximum number of instructions that we will " "use for creating a floating-point immediate value"), - cl::init(2)); + cl::init(3)); static cl::opt ReassocShlAddiAdd("reassoc-shl-addi-add", cl::Hidden, @@ -4564,6 +4564,14 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, break; } + // Do not slideup if the element type of EVec is different. + if (SlideUp) { + MVT EVecEltVT = EVec.getSimpleValueType().getVectorElementType(); + MVT ContainerEltVT = ContainerVT.getVectorElementType(); + if (EVecEltVT != ContainerEltVT) + SlideUp = false; + } + if (SlideUp) { MVT EVecContainerVT = EVec.getSimpleValueType(); // Make sure the original vector has scalable vector type. 
@@ -16203,7 +16211,6 @@ static SDValue combineXorToBitfieldInsert(SDNode *N, SelectionDAG &DAG, return SDValue(); using namespace SDPatternMatch; - SDValue Base, Inserted; APInt CMask; if (!sd_match(N, m_Xor(m_Value(Base), @@ -16214,7 +16221,6 @@ static SDValue combineXorToBitfieldInsert(SDNode *N, SelectionDAG &DAG, if (N->getValueType(0) != MVT::i32) return SDValue(); - unsigned Width, ShAmt; if (!CMask.isShiftedMask(ShAmt, Width)) return SDValue(); @@ -16235,10 +16241,96 @@ static SDValue combineXorToBitfieldInsert(SDNode *N, SelectionDAG &DAG, return DAG.getNode(RISCVISD::QC_INSB, DL, MVT::i32, Ops); } +static SDValue combineOrToBitfieldInsert(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + if (!Subtarget.hasVendorXqcibm()) + return SDValue(); + + using namespace SDPatternMatch; + + SDValue X; + APInt MaskImm; + if (!sd_match(N, m_Or(m_OneUse(m_Value(X)), m_ConstInt(MaskImm)))) + return SDValue(); + + unsigned ShAmt, Width; + if (!MaskImm.isShiftedMask(ShAmt, Width) || MaskImm.isSignedIntN(12)) + return SDValue(); + + if (N->getValueType(0) != MVT::i32) + return SDValue(); + + // If Zbs is enabled and it is a single bit set we can use BSETI which + // can be compressed to C_BSETI when Xqcibm in enabled. + if (Width == 1 && Subtarget.hasStdExtZbs()) + return SDValue(); + + // If C1 is a shifted mask (but can't be formed as an ORI), + // use a bitfield insert of -1. + // Transform (or x, C1) + // -> (qc.insbi x, -1, width, shift) + SDLoc DL(N); + + SDValue Ops[] = {X, DAG.getSignedConstant(-1, DL, MVT::i32), + DAG.getConstant(Width, DL, MVT::i32), + DAG.getConstant(ShAmt, DL, MVT::i32)}; + return DAG.getNode(RISCVISD::QC_INSB, DL, MVT::i32, Ops); +} + +// Generate a QC_INSB/QC_INSBI from 'or (and X, MaskImm), OrImm' iff the value +// being inserted only sets known zero bits. +static SDValue combineOrAndToBitfieldInsert(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + // Supported only in Xqcibm for now. + if (!Subtarget.hasVendorXqcibm()) + return SDValue(); + + using namespace SDPatternMatch; + + SDValue Inserted; + APInt MaskImm, OrImm; + if (!sd_match( + N, m_SpecificVT(MVT::i32, m_Or(m_OneUse(m_And(m_Value(Inserted), + m_ConstInt(MaskImm))), + m_ConstInt(OrImm))))) + return SDValue(); + + // Compute the Known Zero for the AND as this allows us to catch more general + // cases than just looking for AND with imm. + KnownBits Known = DAG.computeKnownBits(N->getOperand(0)); + + // The bits being inserted must only set those bits that are known to be + // zero. + if (!OrImm.isSubsetOf(Known.Zero)) { + // FIXME: It's okay if the OrImm sets NotKnownZero bits to 1, but we don't + // currently handle this case. + return SDValue(); + } + + unsigned ShAmt, Width; + // The KnownZero mask must be a shifted mask (e.g., 1110..011, 11100..00). + if (!Known.Zero.isShiftedMask(ShAmt, Width)) + return SDValue(); + + // QC_INSB(I) dst, src, #width, #shamt. 
+ SDLoc DL(N); + + SDValue ImmNode = + DAG.getSignedConstant(OrImm.getSExtValue() >> ShAmt, DL, MVT::i32); + + SDValue Ops[] = {Inserted, ImmNode, DAG.getConstant(Width, DL, MVT::i32), + DAG.getConstant(ShAmt, DL, MVT::i32)}; + return DAG.getNode(RISCVISD::QC_INSB, DL, MVT::i32, Ops); +} + static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const RISCVSubtarget &Subtarget) { SelectionDAG &DAG = DCI.DAG; + if (SDValue V = combineOrToBitfieldInsert(N, DAG, Subtarget)) + return V; + if (SDValue V = combineOrAndToBitfieldInsert(N, DAG, Subtarget)) + return V; if (SDValue V = combineBinOpToReduce(N, DAG, Subtarget)) return V; if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget)) @@ -16796,7 +16888,7 @@ static SDValue performSETCCCombine(SDNode *N, // addi or xori after shifting. uint64_t N1Int = cast(N1)->getZExtValue(); uint64_t AndRHSInt = AndRHSC.getZExtValue(); - if (OpVT == MVT::i64 && AndRHSInt <= 0xffffffff && + if (OpVT == MVT::i64 && isUInt<32>(AndRHSInt) && isPowerOf2_32(-uint32_t(AndRHSInt)) && (N1Int & AndRHSInt) == N1Int) { unsigned ShiftBits = llvm::countr_zero(AndRHSInt); int64_t NewC = SignExtend64<32>(N1Int) >> ShiftBits; @@ -22190,6 +22282,7 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI, // - They are debug instructions. Otherwise, // - They do not have side-effects, do not access memory and their inputs do // not depend on the results of the select pseudo-instructions. + // - They don't adjust stack. // The TrueV/FalseV operands of the selects cannot depend on the result of // previous selects in the sequence. // These conditions could be further relaxed. See the X86 target for a @@ -22218,6 +22311,8 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI, SelectDests.insert(MI.getOperand(0).getReg()); MachineInstr *LastSelectPseudo = &MI; + const RISCVInstrInfo &TII = *Subtarget.getInstrInfo(); + for (auto E = BB->end(), SequenceMBBI = MachineBasicBlock::iterator(MI); SequenceMBBI != E; ++SequenceMBBI) { if (SequenceMBBI->isDebugInstr()) @@ -22237,7 +22332,9 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI, } if (SequenceMBBI->hasUnmodeledSideEffects() || SequenceMBBI->mayLoadOrStore() || - SequenceMBBI->usesCustomInsertionHook()) + SequenceMBBI->usesCustomInsertionHook() || + TII.isFrameInstr(*SequenceMBBI) || + SequenceMBBI->isStackAligningInlineAsm()) break; if (llvm::any_of(SequenceMBBI->operands(), [&](MachineOperand &MO) { return MO.isReg() && MO.isUse() && SelectDests.count(MO.getReg()); @@ -22245,7 +22342,6 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI, break; } - const RISCVInstrInfo &TII = *Subtarget.getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); DebugLoc DL = MI.getDebugLoc(); MachineFunction::iterator I = ++BB->getIterator(); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 0ed97c61ec78a..56db09a286547 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -232,7 +232,7 @@ Register RISCVInstrInfo::isStoreToStackSlot(const MachineInstr &MI, return 0; } -bool RISCVInstrInfo::isReallyTriviallyReMaterializable( +bool RISCVInstrInfo::isReMaterializableImpl( const MachineInstr &MI) const { switch (RISCV::getRVVMCOpcode(MI.getOpcode())) { case RISCV::VMV_V_X: @@ -243,7 +243,7 @@ bool RISCVInstrInfo::isReallyTriviallyReMaterializable( case RISCV::VID_V: return MI.getOperand(1).isUndef(); default: - return 
TargetInstrInfo::isReallyTriviallyReMaterializable(MI); + return TargetInstrInfo::isReMaterializableImpl(MI); } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 57ec431749ebe..2bc499bf29957 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -75,7 +75,7 @@ class RISCVInstrInfo : public RISCVGenInstrInfo { Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex, TypeSize &MemBytes) const override; - bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; + bool isReMaterializableImpl(const MachineInstr &MI) const override; bool shouldBreakCriticalEdgeToSink(MachineInstr &MI) const override { return MI.getOpcode() == RISCV::ADDI && MI.getOperand(1).isReg() && diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 47900cffa370c..9855c47a63392 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -340,7 +340,9 @@ def uimm16 : RISCVUImmOp<16>; def uimm32 : RISCVUImmOp<32>; def uimm48 : RISCVUImmOp<48>; def uimm64 : RISCVUImmOp<64>; -def simm12 : RISCVSImmLeafOp<12> { + +def simm12_lo : RISCVSImmLeafOp<12> { + let ParserMatchClass = SImmAsmOperand<12, "LO">; let MCOperandPredicate = [{ int64_t Imm; if (MCOp.evaluateAsConstantImm(Imm)) @@ -642,7 +644,7 @@ class BranchCC_rri funct3, string opcodestr> let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { class Load_ri funct3, string opcodestr, DAGOperand rty = GPR> - : RVInstI; class HLoad_r funct7, bits<5> funct5, string opcodestr> @@ -658,7 +660,7 @@ class HLoad_r funct7, bits<5> funct5, string opcodestr> let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { class Store_rri funct3, string opcodestr, DAGOperand rty = GPR> : RVInstS; class HStore_rr funct7, string opcodestr> @@ -671,7 +673,7 @@ class HStore_rr funct7, string opcodestr> let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class ALU_ri funct3, string opcodestr> - : RVInstI, Sched<[WriteIALU, ReadIALU]>; @@ -754,7 +756,7 @@ def JAL : RVInstJ, Sched<[WriteJal]>; def JALR : RVInstI<0b000, OPC_JALR, (outs GPR:$rd), - (ins GPR:$rs1, simm12:$imm12), + (ins GPR:$rs1, simm12_lo:$imm12), "jalr", "$rd, ${imm12}(${rs1})">, Sched<[WriteJalr, ReadJalr]>; } // hasSideEffects = 0, mayLoad = 0, mayStore = 0 @@ -779,7 +781,7 @@ def SH : Store_rri<0b001, "sh">, Sched<[WriteSTH, ReadStoreData, ReadMemBase]>; def SW : Store_rri<0b010, "sw">, Sched<[WriteSTW, ReadStoreData, ReadMemBase]>; // ADDI isn't always rematerializable, but isReMaterializable will be used as -// a hint which is verified in isReallyTriviallyReMaterializable. +// a hint which is verified in isReMaterializableImpl. 
let isReMaterializable = 1, isAsCheapAsAMove = 1 in def ADDI : ALU_ri<0b000, "addi">; @@ -894,7 +896,7 @@ def SD : Store_rri<0b011, "sd">, Sched<[WriteSTD, ReadStoreData, ReadMemBase] let IsSignExtendingOpW = 1 in { let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in def ADDIW : RVInstI<0b000, OPC_OP_IMM_32, (outs GPR:$rd), - (ins GPR:$rs1, simm12:$imm12), + (ins GPR:$rs1, simm12_lo:$imm12), "addiw", "$rd, $rs1, $imm12">, Sched<[WriteIALU32, ReadIALU32]>; @@ -1041,7 +1043,7 @@ def PseudoSD : PseudoStore<"sd">; } // Predicates = [IsRV64] def : InstAlias<"nop", (ADDI X0, X0, 0), 3>; -def : InstAlias<"li $rd, $imm", (ADDI GPR:$rd, X0, simm12:$imm), 2>; +def : InstAlias<"li $rd, $imm", (ADDI GPR:$rd, X0, simm12_lo:$imm), 2>; def : InstAlias<"mv $rd, $rs", (ADDI GPR:$rd, GPR:$rs, 0)>; def : InstAlias<"not $rd, $rs", (XORI GPR:$rd, GPR:$rs, -1)>; @@ -1094,16 +1096,16 @@ def : InstAlias<"jal $offset", (JAL X1, simm21_lsb0_jal:$offset)>; // Non-zero offset aliases of "jalr" are the lowest weight, followed by the // two-register form, then the one-register forms and finally "ret". def : InstAlias<"jr $rs", (JALR X0, GPR:$rs, 0), 3>; -def : InstAlias<"jr ${offset}(${rs})", (JALR X0, GPR:$rs, simm12:$offset)>; +def : InstAlias<"jr ${offset}(${rs})", (JALR X0, GPR:$rs, simm12_lo:$offset)>; def : InstAlias<"jalr $rs", (JALR X1, GPR:$rs, 0), 3>; -def : InstAlias<"jalr ${offset}(${rs})", (JALR X1, GPR:$rs, simm12:$offset)>; +def : InstAlias<"jalr ${offset}(${rs})", (JALR X1, GPR:$rs, simm12_lo:$offset)>; def : InstAlias<"jalr $rd, $rs", (JALR GPR:$rd, GPR:$rs, 0), 2>; def : InstAlias<"ret", (JALR X0, X1, 0), 4>; // Non-canonical forms for jump targets also accepted by the assembler. -def : InstAlias<"jr $rs, $offset", (JALR X0, GPR:$rs, simm12:$offset), 0>; -def : InstAlias<"jalr $rs, $offset", (JALR X1, GPR:$rs, simm12:$offset), 0>; -def : InstAlias<"jalr $rd, $rs, $offset", (JALR GPR:$rd, GPR:$rs, simm12:$offset), 0>; +def : InstAlias<"jr $rs, $offset", (JALR X0, GPR:$rs, simm12_lo:$offset), 0>; +def : InstAlias<"jalr $rs, $offset", (JALR X1, GPR:$rs, simm12_lo:$offset), 0>; +def : InstAlias<"jalr $rd, $rs, $offset", (JALR GPR:$rd, GPR:$rs, simm12_lo:$offset), 0>; def : InstAlias<"jr (${rs})", (JALR X0, GPR:$rs, 0), 0>; def : InstAlias<"jalr (${rs})", (JALR X1, GPR:$rs, 0), 0>; def : InstAlias<"jalr $rd, (${rs})", (JALR GPR:$rd, GPR:$rs, 0), 0>; @@ -1178,13 +1180,13 @@ def : InstAlias<"sw $rs2, (${rs1})", (SW GPR:$rs2, GPR:$rs1, 0)>; def : InstAlias<"add $rd, $rs1, $imm12", - (ADDI GPR:$rd, GPR:$rs1, simm12:$imm12)>; + (ADDI GPR:$rd, GPR:$rs1, simm12_lo:$imm12)>; def : InstAlias<"and $rd, $rs1, $imm12", - (ANDI GPR:$rd, GPR:$rs1, simm12:$imm12)>; + (ANDI GPR:$rd, GPR:$rs1, simm12_lo:$imm12)>; def : InstAlias<"xor $rd, $rs1, $imm12", - (XORI GPR:$rd, GPR:$rs1, simm12:$imm12)>; + (XORI GPR:$rd, GPR:$rs1, simm12_lo:$imm12)>; def : InstAlias<"or $rd, $rs1, $imm12", - (ORI GPR:$rd, GPR:$rs1, simm12:$imm12)>; + (ORI GPR:$rd, GPR:$rs1, simm12_lo:$imm12)>; def : InstAlias<"sll $rd, $rs1, $shamt", (SLLI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt)>; def : InstAlias<"srl $rd, $rs1, $shamt", @@ -1200,7 +1202,7 @@ def : InstAlias<"sd $rs2, (${rs1})", (SD GPR:$rs2, GPR:$rs1, 0)>; def : InstAlias<"addw $rd, $rs1, $imm12", - (ADDIW GPR:$rd, GPR:$rs1, simm12:$imm12)>; + (ADDIW GPR:$rd, GPR:$rs1, simm12_lo:$imm12)>; def : InstAlias<"sllw $rd, $rs1, $shamt", (SLLIW GPR:$rd, GPR:$rs1, uimm5:$shamt)>; def : InstAlias<"srlw $rd, $rs1, $shamt", @@ -1209,9 +1211,9 @@ def : InstAlias<"sraw $rd, $rs1, $shamt", (SRAIW GPR:$rd, 
GPR:$rs1, uimm5:$shamt)>; } // Predicates = [IsRV64] def : InstAlias<"slt $rd, $rs1, $imm12", - (SLTI GPR:$rd, GPR:$rs1, simm12:$imm12)>; + (SLTI GPR:$rd, GPR:$rs1, simm12_lo:$imm12)>; def : InstAlias<"sltu $rd, $rs1, $imm12", - (SLTIU GPR:$rd, GPR:$rs1, simm12:$imm12)>; + (SLTIU GPR:$rd, GPR:$rs1, simm12_lo:$imm12)>; } def : MnemonicAlias<"move", "mv">; @@ -1284,12 +1286,12 @@ def InsnR4 : DirectiveInsnR4<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, AnyReg:$rs3), "$opcode, $funct3, $funct2, $rd, $rs1, $rs2, $rs3">; def InsnI : DirectiveInsnI<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, uimm3:$funct3, - AnyReg:$rs1, simm12:$imm12), + AnyReg:$rs1, simm12_lo:$imm12), "$opcode, $funct3, $rd, $rs1, $imm12">; def InsnI_Mem : DirectiveInsnI<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, - simm12:$imm12), + simm12_lo:$imm12), "$opcode, $funct3, $rd, ${imm12}(${rs1})">; def InsnB : DirectiveInsnB<(outs), (ins uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, AnyReg:$rs2, @@ -1303,7 +1305,7 @@ def InsnJ : DirectiveInsnJ<(outs AnyReg:$rd), (ins uimm7_opcode:$opcode, "$opcode, $rd, $imm20">; def InsnS : DirectiveInsnS<(outs), (ins uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs2, AnyReg:$rs1, - simm12:$imm12), + simm12_lo:$imm12), "$opcode, $funct3, $rs2, ${imm12}(${rs1})">; } // isCodeGenOnly, hasSideEffects, mayLoad, mayStore, hasNoSchedulingInfo @@ -1324,10 +1326,10 @@ def : InstAlias<".insn_r4 $opcode, $funct3, $funct2, $rd, $rs1, $rs2, $rs3", AnyReg:$rs1, AnyReg:$rs2, AnyReg:$rs3)>; def : InstAlias<".insn_i $opcode, $funct3, $rd, $rs1, $imm12", (InsnI AnyReg:$rd, uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, - simm12:$imm12)>; + simm12_lo:$imm12)>; def : InstAlias<".insn_i $opcode, $funct3, $rd, ${imm12}(${rs1})", (InsnI_Mem AnyReg:$rd, uimm7_opcode:$opcode, uimm3:$funct3, - AnyReg:$rs1, simm12:$imm12)>; + AnyReg:$rs1, simm12_lo:$imm12)>; def : InstAlias<".insn_i $opcode, $funct3, $rd, (${rs1})", (InsnI_Mem AnyReg:$rd, uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs1, 0)>; @@ -1347,7 +1349,7 @@ def : InstAlias<".insn_uj $opcode, $rd, $imm20", (InsnJ AnyReg:$rd, uimm7_opcode:$opcode, simm21_lsb0_jal:$imm20)>; def : InstAlias<".insn_s $opcode, $funct3, $rs2, ${imm12}(${rs1})", (InsnS uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs2, - AnyReg:$rs1, simm12:$imm12)>; + AnyReg:$rs1, simm12_lo:$imm12)>; def : InstAlias<".insn_s $opcode, $funct3, $rs2, (${rs1})", (InsnS uimm7_opcode:$opcode, uimm3:$funct3, AnyReg:$rs2, AnyReg:$rs1, 0)>; @@ -1374,7 +1376,7 @@ class PatGprImm; class PatGprSimm12 - : PatGprImm; + : PatGprImm; class PatGprUimmLog2XLen : PatGprImm; @@ -1542,8 +1544,8 @@ def : GICustomOperandRenderer<"renderFrameIndex">, def : Pat<(frameindex:$fi), (ADDI (iPTR (to_tframeindex $fi)), 0)>; -def : Pat<(add_like frameindex:$fi, simm12:$offset), - (ADDI (iPTR (to_tframeindex $fi)), simm12:$offset)>; +def : Pat<(add_like frameindex:$fi, simm12_lo:$offset), + (ADDI (iPTR (to_tframeindex $fi)), simm12_lo:$offset)>; def GIAddrRegImm : GIComplexOperandMatcher, @@ -1576,7 +1578,7 @@ def PROBED_STACKALLOC_DYN : Pseudo<(outs), // It will be expanded after register allocation. // FIXME: The scheduling information does not reflect the multiple instructions. 
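// (For reference: the pseudo below carries a %hi/%lo pair and is later split
// into a LUI + ADDI sequence, which is why its Size is 8 bytes.)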
let Size = 8, isReMaterializable = 1 in -def PseudoMovAddr : Pseudo<(outs GPR:$dst), (ins uimm20_lui:$hi, simm12:$lo), []>, +def PseudoMovAddr : Pseudo<(outs GPR:$dst), (ins uimm20_lui:$hi, simm12_lo:$lo), []>, Sched<[WriteIALU]>; def riscv_hi_oneuse : unop_oneuse; @@ -1673,7 +1675,7 @@ defm Select_GPR : SelectCC_GPR_rrirr; class SelectCompressOpt : Pat<(riscv_selectcc_frag:$select (XLenVT GPR:$lhs), simm12_no6:$Constant, Cond, (XLenVT GPR:$truev), GPR:$falsev), - (Select_GPR_Using_CC_GPR (XLenVT (ADDI GPR:$lhs, (NegImm simm12:$Constant))), (XLenVT X0), + (Select_GPR_Using_CC_GPR (XLenVT (ADDI GPR:$lhs, (NegImm simm12_lo:$Constant))), (XLenVT X0), (IntCCtoRISCVCC $select), GPR:$truev, GPR:$falsev)>; def OptForMinSize : Predicate<"MF ? MF->getFunction().hasMinSize() : false">; @@ -1712,7 +1714,7 @@ multiclass BccPat { class BrccCompressOpt : Pat<(riscv_brcc GPR:$lhs, simm12_no6:$Constant, Cond, bb:$place), - (Inst (XLenVT (ADDI GPR:$lhs, (NegImm simm12:$Constant))), + (Inst (XLenVT (ADDI GPR:$lhs, (NegImm simm12_lo:$Constant))), (XLenVT X0), bb:$place)>; defm : BccPat; @@ -1753,33 +1755,33 @@ def PseudoBR : Pseudo<(outs), (ins simm21_lsb0_jal:$imm20), [(br bb:$imm20)]>, let Predicates = [NoStdExtZicfilp], isBarrier = 1, isBranch = 1, isIndirectBranch = 1, isTerminator = 1 in -def PseudoBRIND : Pseudo<(outs), (ins GPRJALR:$rs1, simm12:$imm12), []>, - PseudoInstExpansion<(JALR X0, GPR:$rs1, simm12:$imm12)>; +def PseudoBRIND : Pseudo<(outs), (ins GPRJALR:$rs1, simm12_lo:$imm12), []>, + PseudoInstExpansion<(JALR X0, GPR:$rs1, simm12_lo:$imm12)>; let Predicates = [HasStdExtZicfilp], isBarrier = 1, isBranch = 1, isIndirectBranch = 1, isTerminator = 1 in { -def PseudoBRINDNonX7 : Pseudo<(outs), (ins GPRJALRNonX7:$rs1, simm12:$imm12), []>, - PseudoInstExpansion<(JALR X0, GPR:$rs1, simm12:$imm12)>; -def PseudoBRINDX7 : Pseudo<(outs), (ins GPRX7:$rs1, simm12:$imm12), []>, - PseudoInstExpansion<(JALR X0, GPR:$rs1, simm12:$imm12)>; +def PseudoBRINDNonX7 : Pseudo<(outs), (ins GPRJALRNonX7:$rs1, simm12_lo:$imm12), []>, + PseudoInstExpansion<(JALR X0, GPR:$rs1, simm12_lo:$imm12)>; +def PseudoBRINDX7 : Pseudo<(outs), (ins GPRX7:$rs1, simm12_lo:$imm12), []>, + PseudoInstExpansion<(JALR X0, GPR:$rs1, simm12_lo:$imm12)>; } // For Zicfilp, need to avoid using X7/T2 for indirect branches which need // landing pad. 
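// (Under Zicfilp, an indirect jump whose source register is x7/t2 is treated as
// a software-guarded jump and its target does not need an lpad, which is why the
// x7 form gets a separate pseudo below.)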
let Predicates = [HasStdExtZicfilp] in { def : Pat<(brind GPRJALRNonX7:$rs1), (PseudoBRINDNonX7 GPRJALRNonX7:$rs1, 0)>; -def : Pat<(brind (add GPRJALRNonX7:$rs1, simm12:$imm12)), - (PseudoBRINDNonX7 GPRJALRNonX7:$rs1, simm12:$imm12)>; +def : Pat<(brind (add GPRJALRNonX7:$rs1, simm12_lo:$imm12)), + (PseudoBRINDNonX7 GPRJALRNonX7:$rs1, simm12_lo:$imm12)>; def : Pat<(riscv_sw_guarded_brind GPRX7:$rs1), (PseudoBRINDX7 GPRX7:$rs1, 0)>; -def : Pat<(riscv_sw_guarded_brind (add GPRX7:$rs1, simm12:$imm12)), - (PseudoBRINDX7 GPRX7:$rs1, simm12:$imm12)>; +def : Pat<(riscv_sw_guarded_brind (add GPRX7:$rs1, simm12_lo:$imm12)), + (PseudoBRINDX7 GPRX7:$rs1, simm12_lo:$imm12)>; } let Predicates = [NoStdExtZicfilp] in { def : Pat<(brind GPRJALR:$rs1), (PseudoBRIND GPRJALR:$rs1, 0)>; -def : Pat<(brind (add GPRJALR:$rs1, simm12:$imm12)), - (PseudoBRIND GPRJALR:$rs1, simm12:$imm12)>; +def : Pat<(brind (add GPRJALR:$rs1, simm12_lo:$imm12)), + (PseudoBRIND GPRJALR:$rs1, simm12_lo:$imm12)>; } // PseudoCALLReg is a generic pseudo instruction for calls which will eventually @@ -1942,7 +1944,7 @@ def tlsdesc_call_symbol : Operand { let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, Size = 8, hasSideEffects = 0, mayStore = 0, mayLoad = 0 in def PseudoTLSDESCCall : Pseudo<(outs GPR:$rd), - (ins GPR:$rs1, simm12:$imm12, tlsdesc_call_symbol:$src), [], + (ins GPR:$rs1, simm12_lo:$imm12, tlsdesc_call_symbol:$src), [], "jalr", "$rd, ${imm12}(${rs1}), $src">, Sched<[WriteJalr, ReadJalr]> { let Defs = [X10]; @@ -1971,8 +1973,8 @@ def PseudoZEXT_W : Pseudo<(outs GPR:$rd), (ins GPR:$rs), [], "zext.w", "$rd, $rs /// Loads class LdPat - : Pat<(vt (LoadOp (AddrRegImm (XLenVT GPRMem:$rs1), simm12:$imm12))), - (Inst GPRMem:$rs1, simm12:$imm12)>; + : Pat<(vt (LoadOp (AddrRegImm (XLenVT GPRMem:$rs1), simm12_lo:$imm12))), + (Inst GPRMem:$rs1, simm12_lo:$imm12)>; def : LdPat; def : LdPat; // Prefer unsigned due to no c.lb in Zcb. @@ -1987,8 +1989,8 @@ def : LdPat; class StPat : Pat<(StoreOp (vt StTy:$rs2), (AddrRegImm (XLenVT GPRMem:$rs1), - simm12:$imm12)), - (Inst StTy:$rs2, GPRMem:$rs1, simm12:$imm12)>; + simm12_lo:$imm12)), + (Inst StTy:$rs2, GPRMem:$rs1, simm12_lo:$imm12)>; def : StPat; def : StPat; @@ -2228,8 +2230,8 @@ def : PatGprImm, XORI, u32simm12>; // Select 'or' as ADDIW if the immediate bits are known to be 0 in $rs1 and // $rs1 is sign extended. This can improve compressibility. Using ADDIW gives // more power to RISCVOptWInstrs. 
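// A hypothetical example of the equivalence: if a0 has its low 12 bits known to
// be zero and at least 33 sign bits, then "or a0, a0, 5" and "addiw a0, a0, 5"
// produce the same value (the set bits cannot overlap, so no carries occur), and
// the ADDIW form can later compress to c.addiw.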
-def : Pat<(riscv_or_disjoint 33signbits_node:$rs1, simm12:$imm), - (ADDIW $rs1, simm12:$imm)>; +def : Pat<(riscv_or_disjoint 33signbits_node:$rs1, simm12_lo:$imm), + (ADDIW $rs1, simm12_lo:$imm)>; /// Loads diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index 9fc73662d9704..24e7a0ee5a79f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -298,7 +298,7 @@ def C_ADDI4SPN : RVInst16CIW<0b000, 0b00, (outs GPRC:$rd), (ins SP:$rs1, uimm10_lsb00nonzero:$imm), "c.addi4spn", "$rd, $rs1, $imm">, Sched<[WriteIALU, ReadIALU]> { - bits<5> rs1; + bits<0> rs1; let Inst{12-11} = imm{5-4}; let Inst{10-7} = imm{9-6}; let Inst{6} = imm{2}; @@ -404,8 +404,8 @@ def C_ADDI16SP : RVInst16CI<0b011, 0b01, (outs SP:$rd_wb), "c.addi16sp", "$rd, $imm">, Sched<[WriteIALU, ReadIALU]> { let Constraints = "$rd = $rd_wb"; + let rd = 2; let Inst{12} = imm{9}; - let Inst{11-7} = 2; let Inst{6} = imm{4}; let Inst{5} = imm{6}; let Inst{4-3} = imm{8-7}; @@ -965,4 +965,3 @@ let Predicates = [HasStdExtCOrZcd, HasStdExtD] in { def : CompressPat<(FSD FPR64:$rs2, SPMem:$rs1, uimm9_lsb000:$imm), (C_FSDSP FPR64:$rs2, SPMem:$rs1, uimm9_lsb000:$imm)>; } // Predicates = [HasStdExtCOrZcd, HasStdExtD] - diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 3d9737e3645d5..b9510efc2fba1 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -529,11 +529,11 @@ def PseudoFROUND_D_IN32X : PseudoFROUND; /// Loads let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Size = 8, isCodeGenOnly = 1 in -def PseudoRV32ZdinxLD : Pseudo<(outs GPRPair:$dst), (ins GPR:$rs1, simm12:$imm12), []>; +def PseudoRV32ZdinxLD : Pseudo<(outs GPRPair:$dst), (ins GPR:$rs1, simm12_lo:$imm12), []>; /// Stores let hasSideEffects = 0, mayLoad = 0, mayStore = 1, Size = 8, isCodeGenOnly = 1 in -def PseudoRV32ZdinxSD : Pseudo<(outs), (ins GPRPair:$rs2, GPRNoX0:$rs1, simm12:$imm12), []>; +def PseudoRV32ZdinxSD : Pseudo<(outs), (ins GPRPair:$rs2, GPRNoX0:$rs1, simm12_lo:$imm12), []>; } // Predicates = [HasStdExtZdinx, IsRV32] let Predicates = [HasStdExtZdinx, HasStdExtZilsd, IsRV32] in { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 2c1cf77acff56..fde030ecc3b89 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -196,7 +196,7 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in class FPLoad_r funct3, string opcodestr, DAGOperand rty, SchedWrite sw> : RVInstI, Sched<[sw, ReadFMemBase]>; @@ -204,7 +204,7 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in class FPStore_r funct3, string opcodestr, DAGOperand rty, SchedWrite sw> : RVInstS, Sched<[sw, ReadFStoreData, ReadFMemBase]>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td index f732ab13e5f88..0114fbdc56302 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td @@ -60,7 +60,7 @@ class SFBALU_rr class SFBALU_ri : Pseudo<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs, cond_code:$cc, GPR:$falsev, GPR:$rs1, - simm12:$imm), []>, + simm12_lo:$imm), []>, Sched<[WriteSFB, ReadSFBJmp, ReadSFBJmp, ReadSFBALU, ReadSFBALU]> { let hasSideEffects = 0; let mayLoad = 0; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index b4be9e0c09b3e..4eb9a3be26fa6 100644 --- 
a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -345,49 +345,60 @@ defset list AllVectors = { } } - defset list AllFloatVectors = { - defset list NoGroupFloatVectors = { - defset list FractionalGroupFloatVectors = { - def VF16MF4: VTypeInfo; - def VF16MF2: VTypeInfo; - def VF32MF2: VTypeInfo; - def VBF16MF4: VTypeInfo; - def VBF16MF2: VTypeInfo; + defset list AllFloatAndBFloatVectors = { + defset list AllFloatVectors = { + defset list NoGroupFloatVectors = { + defset list FractionalGroupFloatVectors = { + def VF16MF4: VTypeInfo; + def VF16MF2: VTypeInfo; + def VF32MF2: VTypeInfo; + } + def VF16M1: VTypeInfo; + def VF32M1: VTypeInfo; + def VF64M1: VTypeInfo; + } + + defset list GroupFloatVectors = { + def VF16M2: GroupVTypeInfo; + def VF16M4: GroupVTypeInfo; + def VF16M8: GroupVTypeInfo; + + def VF32M2: GroupVTypeInfo; + def VF32M4: GroupVTypeInfo; + def VF32M8: GroupVTypeInfo; + + def VF64M2: GroupVTypeInfo; + def VF64M4: GroupVTypeInfo; + def VF64M8: GroupVTypeInfo; } - def VF16M1: VTypeInfo; - def VF32M1: VTypeInfo; - def VF64M1: VTypeInfo; - def VBF16M1: VTypeInfo; } - defset list GroupFloatVectors = { - def VF16M2: GroupVTypeInfo; - def VF16M4: GroupVTypeInfo; - def VF16M8: GroupVTypeInfo; - - def VF32M2: GroupVTypeInfo; - def VF32M4: GroupVTypeInfo; - def VF32M8: GroupVTypeInfo; - - def VF64M2: GroupVTypeInfo; - def VF64M4: GroupVTypeInfo; - def VF64M8: GroupVTypeInfo; - - def VBF16M2: GroupVTypeInfo; - def VBF16M4: GroupVTypeInfo; - def VBF16M8: GroupVTypeInfo; + defset list AllBFloatVectors = { + defset list NoGroupBFloatVectors = { + defset list FractionalGroupBFloatVectors = { + def VBF16MF4: VTypeInfo; + def VBF16MF2: VTypeInfo; + } + def VBF16M1: VTypeInfo; + } + + defset list GroupBFloatVectors = { + def VBF16M2: GroupVTypeInfo; + def VBF16M4: GroupVTypeInfo; + def VBF16M8: GroupVTypeInfo; + } } } } @@ -7143,31 +7154,32 @@ defm : VPatConversionVI_VF<"int_riscv_vfclass", "PseudoVFCLASS">; // We can use vmerge.vvm to support vector-vector vfmerge. // NOTE: Clang previously used int_riscv_vfmerge for vector-vector, but now uses // int_riscv_vmerge. Support both for compatibility. 
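// (The ISA has no vector-vector vfmerge; vfmerge.vfm only accepts a scalar f
// register, so the vector-vector form is expressed with vmerge.vvm.)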
-foreach vti = AllFloatVectors in { +foreach vti = AllFloatAndBFloatVectors in { let Predicates = GetVTypeMinimalPredicates.Predicates in defm : VPatBinaryCarryInTAIL<"int_riscv_vmerge", "PseudoVMERGE", "VVM", vti.Vector, vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW, vti.LMul, vti.RegClass, vti.RegClass, vti.RegClass>; - let Predicates = GetVTypePredicates.Predicates in - defm : VPatBinaryCarryInTAIL<"int_riscv_vfmerge", "PseudoVFMERGE", - "V"#vti.ScalarSuffix#"M", - vti.Vector, - vti.Vector, vti.Scalar, vti.Mask, - vti.Log2SEW, vti.LMul, vti.RegClass, - vti.RegClass, vti.ScalarRegClass>; } foreach fvti = AllFloatVectors in { - defvar instr = !cast("PseudoVMERGE_VIM_"#fvti.LMul.MX); - let Predicates = GetVTypePredicates.Predicates in - def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$passthru), - (fvti.Vector fvti.RegClass:$rs2), - (fvti.Scalar (fpimm0)), - (fvti.Mask VMV0:$vm), VLOpFrag)), - (instr fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0, - (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; + let Predicates = GetVTypePredicates.Predicates in { + defm : VPatBinaryCarryInTAIL<"int_riscv_vfmerge", "PseudoVFMERGE", + "V"#fvti.ScalarSuffix#"M", + fvti.Vector, + fvti.Vector, fvti.Scalar, fvti.Mask, + fvti.Log2SEW, fvti.LMul, fvti.RegClass, + fvti.RegClass, fvti.ScalarRegClass>; + + defvar instr = !cast("PseudoVMERGE_VIM_"#fvti.LMul.MX); + def : Pat<(fvti.Vector (int_riscv_vfmerge (fvti.Vector fvti.RegClass:$passthru), + (fvti.Vector fvti.RegClass:$rs2), + (fvti.Scalar (fpimm0)), + (fvti.Mask VMV0:$vm), VLOpFrag)), + (instr fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0, + (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; + } } //===----------------------------------------------------------------------===// @@ -7328,13 +7340,12 @@ foreach vti = NoGroupIntegerVectors in { //===----------------------------------------------------------------------===// // 16.3. Vector Slide Instructions //===----------------------------------------------------------------------===// -defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllIntegerVectors, uimm5>; -defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllIntegerVectors, uimm5>; +defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllVectors, uimm5>; +defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllVectors, uimm5>; + defm : VPatBinaryV_VX<"int_riscv_vslide1up", "PseudoVSLIDE1UP", AllIntegerVectors>; defm : VPatBinaryV_VX<"int_riscv_vslide1down", "PseudoVSLIDE1DOWN", AllIntegerVectors>; -defm : VPatTernaryV_VX_VI<"int_riscv_vslideup", "PseudoVSLIDEUP", AllFloatVectors, uimm5>; -defm : VPatTernaryV_VX_VI<"int_riscv_vslidedown", "PseudoVSLIDEDOWN", AllFloatVectors, uimm5>; defm : VPatBinaryV_VX<"int_riscv_vfslide1up", "PseudoVFSLIDE1UP", AllFloatVectors>; defm : VPatBinaryV_VX<"int_riscv_vfslide1down", "PseudoVFSLIDE1DOWN", AllFloatVectors>; @@ -7342,19 +7353,14 @@ defm : VPatBinaryV_VX<"int_riscv_vfslide1down", "PseudoVFSLIDE1DOWN", AllFloatVe // 16.4. 
Vector Register Gather Instructions //===----------------------------------------------------------------------===// defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER", - AllIntegerVectors, uimm5>; + AllVectors, uimm5>; defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16", - eew=16, vtilist=AllIntegerVectors>; + eew=16, vtilist=AllVectors>; -defm : VPatBinaryV_VV_VX_VI_INT<"int_riscv_vrgather", "PseudoVRGATHER", - AllFloatVectors, uimm5>; -defm : VPatBinaryV_VV_INT_EEW<"int_riscv_vrgatherei16_vv", "PseudoVRGATHEREI16", - eew=16, vtilist=AllFloatVectors>; //===----------------------------------------------------------------------===// // 16.5. Vector Compress Instruction //===----------------------------------------------------------------------===// -defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllIntegerVectors>; -defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllFloatVectors>; +defm : VPatUnaryV_V_AnyMask<"int_riscv_vcompress", "PseudoVCOMPRESS", AllVectors>; // Include the non-intrinsic ISel patterns include "RISCVInstrInfoVVLPatterns.td" diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index d4c9215e1863a..dc613614aa457 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -1388,7 +1388,7 @@ defm : VPatFPSetCCSDNode_VV_VF_FV; // Floating-point vselects: // 11.15. Vector Integer Merge Instructions // 13.15. Vector Floating-Point Merge Instruction -foreach fvti = AllFloatVectors in { +foreach fvti = AllFloatAndBFloatVectors in { defvar ivti = GetIntVTypeInfo.Vti; let Predicates = GetVTypePredicates.Predicates in { def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), fvti.RegClass:$rs1, @@ -1397,7 +1397,12 @@ foreach fvti = AllFloatVectors in { (fvti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>; + } +} +foreach fvti = AllFloatVectors in { + defvar ivti = GetIntVTypeInfo.Vti; + let Predicates = GetVTypePredicates.Predicates in { def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), (SplatFPOp (SelectScalarFPAsInt (XLenVT GPR:$imm))), fvti.RegClass:$rs2)), @@ -1412,9 +1417,7 @@ foreach fvti = AllFloatVectors in { (fvti.Vector (IMPLICIT_DEF)), fvti.RegClass:$rs2, 0, (fvti.Mask VMV0:$vm), fvti.AVL, fvti.Log2SEW)>; } -} -foreach fvti = AllFloatVectors in { let Predicates = GetVTypePredicates.Predicates in def : Pat<(fvti.Vector (vselect (fvti.Mask VMV0:$vm), (SplatFPOp fvti.ScalarRegClass:$rs1), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index ff35c1bd558a4..1511f1b55b996 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -2423,10 +2423,10 @@ foreach vti = AllFloatVectors in { } } -foreach fvti = AllFloatVectors in { - // Floating-point vselects: - // 11.15. Vector Integer Merge Instructions - // 13.15. Vector Floating-Point Merge Instruction +// Floating-point vselects: +// 11.15. Vector Integer Merge Instructions +// 13.15. 
Vector Floating-Point Merge Instruction +foreach fvti = AllFloatAndBFloatVectors in { defvar ivti = GetIntVTypeInfo.Vti; let Predicates = GetVTypePredicates.Predicates in { def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), @@ -2437,7 +2437,12 @@ foreach fvti = AllFloatVectors in { (!cast("PseudoVMERGE_VVM_"#fvti.LMul.MX) fvti.RegClass:$passthru, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; + } +} +foreach fvti = AllFloatVectors in { + defvar ivti = GetIntVTypeInfo.Vti; + let Predicates = GetVTypePredicates.Predicates in { def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), (SplatFPOp (SelectScalarFPAsInt (XLenVT GPR:$imm))), fvti.RegClass:$rs2, @@ -2457,9 +2462,7 @@ foreach fvti = AllFloatVectors in { fvti.RegClass:$passthru, fvti.RegClass:$rs2, 0, (fvti.Mask VMV0:$vm), GPR:$vl, fvti.Log2SEW)>; } -} -foreach fvti = AllFloatVectors in { let Predicates = GetVTypePredicates.Predicates in { def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask VMV0:$vm), (SplatFPOp fvti.ScalarRegClass:$rs1), @@ -2767,7 +2770,7 @@ foreach vti = NoGroupFloatVectors in { } } -foreach vti = AllFloatVectors in { +foreach vti = AllFloatAndBFloatVectors in { defvar ivti = GetIntVTypeInfo.Vti; let Predicates = GetVTypePredicates.Predicates in { def : Pat<(vti.Vector diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index 1fb30a0b73d92..9835c033aea9c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -362,6 +362,47 @@ class NDSRVInstSDGP funct3, string opcodestr> let mayStore = 1; } +class NDSRVInstVSINTLN funct5, string opcodestr> + : RVInst<(outs VR:$vd), (ins GPRMemZeroOffset:$rs1), + opcodestr, "$vd, ${rs1}", [], InstFormatR>, + VLESchedMC { + bits<5> rs1; + bits<5> vd; + + let Inst{31-26} = 0b000001; + let Inst{25} = 1; + let Inst{24-20} = funct5; + let Inst{19-15} = rs1; + let Inst{14-12} = 0b100; + let Inst{11-7} = vd; + let Inst{6-0} = OPC_CUSTOM_2.Value; + let hasSideEffects = 0; + let mayLoad = 1; + let mayStore = 0; + let Uses = [VTYPE, VL]; +} + +class NDSRVInstVSINTCvt fucnt5, string opcodestr> + : RVInst<(outs VR:$vd), (ins VR:$vs, VMaskOp:$vm), + opcodestr, "$vd, $vs$vm", [], InstFormatR> { + bits<5> vs; + bits<5> vd; + bit vm; + + let Inst{31-26} = 0b000000; + let Inst{25} = vm; + let Inst{24-20} = vs; + let Inst{19-15} = fucnt5; + let Inst{14-12} = 0b100; + let Inst{11-7} = vd; + let Inst{6-0} = OPC_CUSTOM_2.Value; + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; + let Uses = [FRM, VL, VTYPE]; + let RVVConstraint = VMConstraint; +} + class NDSRVInstBFHCvt funct7, bits<5> rs1val, DAGOperand rdty, DAGOperand rs2ty, string opcodestr> : RVInstR; } +//===----------------------------------------------------------------------===// +// XAndesVSIntH +//===----------------------------------------------------------------------===// + +let Predicates = [HasVendorXAndesVSIntH] in { + def NDS_VFWCVT_F_N : NDSRVInstVSINTCvt<0b00100, "nds.vfwcvt.f.n.v">; + def NDS_VFWCVT_F_NU : NDSRVInstVSINTCvt<0b00101, "nds.vfwcvt.f.nu.v">; + def NDS_VFWCVT_F_B : NDSRVInstVSINTCvt<0b00110, "nds.vfwcvt.f.b.v">; + def NDS_VFWCVT_F_BU : NDSRVInstVSINTCvt<0b00111, "nds.vfwcvt.f.bu.v">; + def NDS_VLE4_V : NDSRVInstVSINTLN<0b00000, "nds.vle4.v">; +} + //===----------------------------------------------------------------------===// // XAndesVSIntLoad //===----------------------------------------------------------------------===// diff --git 
a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td index 996e08bd0a27d..d8f5d3e09d374 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXCV.td @@ -271,7 +271,7 @@ class CVInstImmBranch funct3, dag outs, dag ins, let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in { class CVLoad_ri_inc funct3, string opcodestr> : RVInstI { let Constraints = "$rs1_wb = $rs1"; } @@ -292,7 +292,7 @@ class CVLoad_rr funct7, bits<3> funct3, string opcodestr> let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in { class CVStore_ri_inc funct3, string opcodestr> : RVInstS { let Constraints = "$rs1_wb = $rs1"; } @@ -332,7 +332,7 @@ class CVStore_rr funct3, bits<7> funct7, string opcodestr> class CVLoad_ri funct3, string opcodestr> : RVInstI; + (ins GPRMem:$rs1, simm12_lo:$imm12), opcodestr, "$rd, ${imm12}(${rs1})">; //===----------------------------------------------------------------------===// // Instructions @@ -673,8 +673,8 @@ class CVLdrrPat (Inst CVrr:$regreg)>; class CVStriPat - : Pat<(StoreOp (XLenVT GPR:$rs2), GPR:$rs1, simm12:$imm12), - (Inst GPR:$rs2, GPR:$rs1, simm12:$imm12)>; + : Pat<(StoreOp (XLenVT GPR:$rs2), GPR:$rs1, simm12_lo:$imm12), + (Inst GPR:$rs2, GPR:$rs1, simm12_lo:$imm12)>; class CVStrriPat : Pat<(StoreOp (XLenVT GPR:$rs2), GPR:$rs1, GPR:$rs3), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 808d9117a1746..13b02d1b2d6db 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -953,7 +953,7 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { } def QC_MULIADD : RVInstI<0b110, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb), - (ins GPRNoX0:$rd, GPRNoX0:$rs1, simm12:$imm12), + (ins GPRNoX0:$rd, GPRNoX0:$rs1, simm12_lo:$imm12), "qc.muliadd", "$rd, $rs1, $imm12"> { let Constraints = "$rd = $rd_wb"; } @@ -1411,8 +1411,8 @@ class SelectQCbi (IntCCtoRISCVCC $cc), GPRNoX0:$truev, GPRNoX0:$falsev)>; let Predicates = [HasVendorXqciac, IsRV32] in { -def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12:$imm12))), - (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12:$imm12)>; +def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12_lo:$imm12))), + (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12_lo:$imm12)>; def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, uimm5gt3:$imm), GPRNoX0:$rs2)), (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, uimm5gt3:$imm, GPRNoX0:$rs2)), @@ -1667,27 +1667,27 @@ def : CompressPat<(QC_E_LW GPRC:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm), (C_LW GPRC:$rd, GPRCMem:$rs1, uimm7_lsb00:$imm)>; def : CompressPat<(QC_E_LW GPRNoX0:$rd, SPMem:$rs1, uimm8_lsb00:$imm), (C_LWSP GPRNoX0:$rd, SPMem:$rs1, uimm8_lsb00:$imm)>; -def : CompressPat<(QC_E_LB GPR:$rd, GPRMem:$rs1, simm12:$imm12), - (LB GPR:$rd, GPRMem:$rs1, simm12:$imm12)>; -def : CompressPat<(QC_E_LBU GPR:$rd, GPRMem:$rs1, simm12:$imm12), - (LBU GPR:$rd, GPRMem:$rs1, simm12:$imm12)>; -def : CompressPat<(QC_E_LH GPR:$rd, GPRMem:$rs1, simm12:$imm12), - (LH GPR:$rd, GPRMem:$rs1, simm12:$imm12)>; -def : CompressPat<(QC_E_LHU GPR:$rd, GPRMem:$rs1, simm12:$imm12), - (LHU GPR:$rd, GPRMem:$rs1, simm12:$imm12)>; -def : CompressPat<(QC_E_LW GPR:$rd, GPRMem:$rs1, simm12:$imm12), - (LW GPR:$rd, GPRMem:$rs1, simm12:$imm12)>; +def : CompressPat<(QC_E_LB GPR:$rd, GPRMem:$rs1, simm12_lo:$imm12), + (LB GPR:$rd, GPRMem:$rs1, simm12_lo:$imm12)>; +def : CompressPat<(QC_E_LBU GPR:$rd, GPRMem:$rs1, 
simm12_lo:$imm12), + (LBU GPR:$rd, GPRMem:$rs1, simm12_lo:$imm12)>; +def : CompressPat<(QC_E_LH GPR:$rd, GPRMem:$rs1, simm12_lo:$imm12), + (LH GPR:$rd, GPRMem:$rs1, simm12_lo:$imm12)>; +def : CompressPat<(QC_E_LHU GPR:$rd, GPRMem:$rs1, simm12_lo:$imm12), + (LHU GPR:$rd, GPRMem:$rs1, simm12_lo:$imm12)>; +def : CompressPat<(QC_E_LW GPR:$rd, GPRMem:$rs1, simm12_lo:$imm12), + (LW GPR:$rd, GPRMem:$rs1, simm12_lo:$imm12)>; def : CompressPat<(QC_E_SW GPRC:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm), (C_SW GPRC:$rs2, GPRCMem:$rs1, uimm7_lsb00:$imm)>; def : CompressPat<(QC_E_SW GPR:$rs2, SPMem:$rs1, uimm8_lsb00:$imm), (C_SWSP GPR:$rs2, SPMem:$rs1, uimm8_lsb00:$imm)>; -def : CompressPat<(QC_E_SB GPR:$rs2, GPRMem:$rs1, simm12:$imm12), - (SB GPR:$rs2, GPRMem:$rs1, simm12:$imm12)>; -def : CompressPat<(QC_E_SH GPR:$rs2, GPRMem:$rs1, simm12:$imm12), - (SH GPR:$rs2, GPRMem:$rs1, simm12:$imm12)>; -def : CompressPat<(QC_E_SW GPR:$rs2, GPRMem:$rs1, simm12:$imm12), - (SW GPR:$rs2, GPRMem:$rs1, simm12:$imm12)>; +def : CompressPat<(QC_E_SB GPR:$rs2, GPRMem:$rs1, simm12_lo:$imm12), + (SB GPR:$rs2, GPRMem:$rs1, simm12_lo:$imm12)>; +def : CompressPat<(QC_E_SH GPR:$rs2, GPRMem:$rs1, simm12_lo:$imm12), + (SH GPR:$rs2, GPRMem:$rs1, simm12_lo:$imm12)>; +def : CompressPat<(QC_E_SW GPR:$rs2, GPRMem:$rs1, simm12_lo:$imm12), + (SW GPR:$rs2, GPRMem:$rs1, simm12_lo:$imm12)>; } // isCompressOnly = true, Predicates = [HasVendorXqcilo, IsRV32] let Predicates = [HasVendorXqcicm, IsRV32] in { @@ -1752,23 +1752,23 @@ def : CompressPat<(QC_E_ADDAI X2, simm10_lsb0000nonzero:$imm), def : CompressPat<(QC_E_ADDI X2, X2, simm10_lsb0000nonzero:$imm), (C_ADDI16SP X2, simm10_lsb0000nonzero:$imm)>; -def : CompressPat<(QC_E_ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm), - (ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>; -def : CompressPat<(QC_E_ANDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm), - (ANDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>; -def : CompressPat<(QC_E_ORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm), - (ORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>; -def : CompressPat<(QC_E_XORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm), - (XORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12:$imm)>; - -def : CompressPat<(QC_E_ADDAI GPRNoX0:$rd, simm12:$imm), - (ADDI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>; -def : CompressPat<(QC_E_ANDAI GPRNoX0:$rd, simm12:$imm), - (ANDI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>; -def : CompressPat<(QC_E_ORAI GPRNoX0:$rd, simm12:$imm), - (ORI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>; -def : CompressPat<(QC_E_XORAI GPRNoX0:$rd, simm12:$imm), - (XORI GPRNoX0:$rd, GPRNoX0:$rd, simm12:$imm)>; +def : CompressPat<(QC_E_ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12_lo:$imm), + (ADDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12_lo:$imm)>; +def : CompressPat<(QC_E_ANDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12_lo:$imm), + (ANDI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12_lo:$imm)>; +def : CompressPat<(QC_E_ORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12_lo:$imm), + (ORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12_lo:$imm)>; +def : CompressPat<(QC_E_XORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12_lo:$imm), + (XORI GPRNoX0:$rs1, GPRNoX0:$rs2, simm12_lo:$imm)>; + +def : CompressPat<(QC_E_ADDAI GPRNoX0:$rd, simm12_lo:$imm), + (ADDI GPRNoX0:$rd, GPRNoX0:$rd, simm12_lo:$imm)>; +def : CompressPat<(QC_E_ANDAI GPRNoX0:$rd, simm12_lo:$imm), + (ANDI GPRNoX0:$rd, GPRNoX0:$rd, simm12_lo:$imm)>; +def : CompressPat<(QC_E_ORAI GPRNoX0:$rd, simm12_lo:$imm), + (ORI GPRNoX0:$rd, GPRNoX0:$rd, simm12_lo:$imm)>; +def : CompressPat<(QC_E_XORAI GPRNoX0:$rd, simm12_lo:$imm), + (XORI GPRNoX0:$rd, GPRNoX0:$rd, simm12_lo:$imm)>; 
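// (Example of the compression-only direction: an assembly input such as
// qc.e.addi a0, a1, 16 can be emitted as the shorter base addi a0, a1, 16 because
// the immediate fits in simm12_lo; the reverse expansion is never generated.)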
} // let isCompressOnly = true, Predicates = [HasVendorXqcilia, IsRV32] let isCompressOnly = true, Predicates = [HasVendorXqciac, IsRV32] in { diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index d81718c2361de..3f2e7dbd07a67 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -169,6 +169,64 @@ class SiFive7GetOrderedReductionCycles { int c = !mul(6, VLUpperBound); } +class isSingleDLEN { + bit c = !or(!eq(mx, "MF2"), !or(!eq(mx, "MF4"), !eq(mx, "MF8"))); +} + +class SiFive7GetCyclesVRGatherVV { + // if (hasFastGather && isSingleDLEN(mx)) + // c = 1; + // else if (hasFastGather && (log2(SEW/8) + log2(LMUL) <= log2(DLEN / 32)) + // c = LMUL * 2 * ceil(vl * SEW / DLEN); + // else + // c = vl; + + defvar y = !logtwo(!div(sew, 8)); + defvar x = !cond( + !eq(mx, "M1") : y, + !eq(mx, "M2") : !add(y, 1), + !eq(mx, "M4") : !add(y, 2), + !eq(mx, "M8") : !add(y, 3), + // Give isSingleDLEN(mx) cases a garbage value to avoid build failures, + // even though x will go unused. + true : 1 + ); + // LMUL * 2 * ceil(vl * SEW / DLEN) = LMUL * 2 * ceil(2 * LMUL) + defvar z = !cond( + !eq(mx, "M1") : 4, + !eq(mx, "M2") : 16, + !eq(mx, "M4") : 64, + !eq(mx, "M8") : 256, + // Give isSingleDLEN(mx) cases a garbage value to avoid build failures, + // even though z will go unused. + true : 1 + ); + defvar VLUpperBound = SiFive7GetCyclesOnePerElement.c; + bit IsSingleDLEN = isSingleDLEN.c; + + int c = !cond( + !and(hasFastGather, IsSingleDLEN) : 1, + !and(hasFastGather, !le(x, !logtwo(!div(VLEN, 64)))) : z, + true: VLUpperBound + ); +} + +class SiFive7GetCyclesVCompress { + + // if (hasFastGather && isSingleDLEN(mx)) + // c = 1 + // else + // c = vl + defvar VLUpperBound = SiFive7GetCyclesOnePerElement.c; + bit IsSingleDLEN = isSingleDLEN.c; + + int c = !if(!and(hasFastGather, IsSingleDLEN), + 1, + VLUpperBound); +} + class SiFive7GetSiFiveVFNRClipCycles { int latency = !cond( !eq(mx, "MF8"): 7, @@ -259,7 +317,8 @@ multiclass SiFive7WriteResBase { + bit isFP64Throttled = false, + bit hasFastGather = false> { // Branching let Latency = 3 in { @@ -929,16 +988,16 @@ multiclass SiFive7WriteResBase.c; defvar IsWorstCase = SiFive7IsWorstCaseMX.c; let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { defm : LMULWriteResMX<"WriteVMALUV", [VCQ, VA1], mx, IsWorstCase>; - defm : LMULWriteResMX<"WriteVMPopV", [VCQ, VA1], mx, IsWorstCase>; - defm : LMULWriteResMX<"WriteVMFFSV", [VCQ, VA1], mx, IsWorstCase>; defm : LMULWriteResMX<"WriteVMSFSV", [VCQ, VA1], mx, IsWorstCase>; } } + // Simple mask logical used in series foreach mx = SchedMxList in { defvar Cycles = SiFive7GetCyclesDefault.c; defvar IsWorstCase = SiFive7IsWorstCaseMX.c; @@ -947,13 +1006,23 @@ multiclass SiFive7WriteResBase; } } + // Mask reduction + foreach mx = SchedMxList in { + defvar IsWorstCase = SiFive7IsWorstCaseMX.c; + let Latency = 11, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 3)] in { + defm "" : LMULWriteResMX<"WriteVMFFSV", [VCQ, VA1], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMPopV", [VCQ, VA1], mx, IsWorstCase>; + } + } // 16. 
Vector Permutation Instructions + let Latency = 11, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 3)] in { + def : WriteRes; + def : WriteRes; + } let Latency = 4, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, 1)] in { def : WriteRes; - def : WriteRes; def : WriteRes; - def : WriteRes; } foreach mx = SchedMxList in { defvar Cycles = SiFive7GetCyclesDefault.c; @@ -966,13 +1035,33 @@ multiclass SiFive7WriteResBase.val in { - defvar Cycles = SiFive7GetCyclesOnePerElement.c; defvar IsWorstCase = SiFive7IsWorstCaseMXSEW.c; - let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { - defm : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [VCQ, VA1], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [VCQ, VA1], mx, sew, IsWorstCase>; - defm : LMULSEWWriteResMXSEW<"WriteVCompressV", [VCQ, VA1], mx, sew, IsWorstCase>; - } + defvar IsSingleDLEN = isSingleDLEN.c; + + defvar GatherVVCycles = + SiFive7GetCyclesVRGatherVV.c; + // 7 + DLEN/ SEW + defvar SlowGatherLat = !add(7, !div(!div(VLEN, 2), sew)); + defvar GatherVVLat = !if(hasFastGather, + !add(3, GatherVVCycles), SlowGatherLat); + + let Latency = GatherVVLat, AcquireAtCycles = [0, 1], + ReleaseAtCycles = [1, !add(5, GatherVVCycles)] in + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [VCQ, VA1], mx, sew, IsWorstCase>; + + // VRGatherEI16VV is not improved by fastGather. + defvar GatherEI16VVCycles = SiFive7GetCyclesOnePerElement.c; + let Latency = SlowGatherLat, AcquireAtCycles = [0, 1], + ReleaseAtCycles = [1, !add(5, GatherEI16VVCycles)] in + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [VCQ, VA1], mx, sew, IsWorstCase>; + + defvar CompressCycles = SiFive7GetCyclesVCompress.c; + defvar CompressLat = !if(!and(hasFastGather, IsSingleDLEN), + 4, + !add(7, CompressCycles)); // 7 + VL + let Latency = CompressLat, AcquireAtCycles = [0, 1], + ReleaseAtCycles = [1, !add(8, CompressCycles)] in + defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [VCQ, VA1], mx, sew, IsWorstCase>; } } @@ -1398,7 +1487,8 @@ multiclass SiFive7ReadAdvance { /// eventually be supplied by different SchedMachineModels. multiclass SiFive7SchedResources { + bit isFP64Throttled, + bit hasFastGather> { defm SiFive7 : SiFive7ProcResources; // Pull out defs from SiFive7ProcResources so we can refer to them by name. @@ -1425,7 +1515,8 @@ multiclass SiFive7SchedResources; + SiFive7VCQ, fpLatencies, isFP64Throttled, + hasFastGather>; //===----------------------------------------------------------------------===// // Bypass and advance @@ -1458,6 +1549,7 @@ class SiFive7SchedMachineModel : SchedMachineModel { SiFive7FPLatencies FPLatencies; bit IsFP64Throttled = false; + bit HasFastGather = false; string Name = !subst("Model", "", !subst("SiFive7", "", NAME)); } @@ -1484,6 +1576,7 @@ def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> { let HasExtraVALU = true; let FPLatencies = SiFive7LowFPLatencies; let IsFP64Throttled = true; + let HasFastGather = true; } /// Binding models to their scheduling resources. @@ -1491,7 +1584,8 @@ foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in { let SchedModel = model in defm model.Name : SiFive7SchedResources; + model.IsFP64Throttled, + model.HasFastGather>; } // Some model name aliases. 
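The gather cost comments above can be read as a small closed-form heuristic. A minimal C++ sketch of that reading follows; the helper name, the DLEN = VLEN/2 assumption, and the use of VLMAX as the vl upper bound are illustrative choices, not taken from the scheduler itself.

static unsigned vrgatherVVCycles(bool HasFastGather, unsigned SEW,
                                 unsigned LMULNum, unsigned LMULDen,
                                 unsigned VLEN) {
  // DLEN is assumed to be half of VLEN, as on SiFive7-style pipelines.
  unsigned DLEN = VLEN / 2;
  bool IsSingleDLEN = LMULDen > 1;                 // fractional LMUL: MF2/MF4/MF8
  unsigned VLMax = VLEN / SEW * LMULNum / LMULDen; // upper bound on vl
  if (HasFastGather && IsSingleDLEN)
    return 1;
  if (HasFastGather && (SEW / 8) * LMULNum <= DLEN / 32)
    // LMUL * 2 * ceil(vl * SEW / DLEN); with vl at VLMAX this is 4 * LMUL^2.
    return LMULNum * 2 * ((VLMax * SEW + DLEN - 1) / DLEN);
  return VLMax;                                    // one element per cycle
}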
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index a06faa414a2ef..563f3bbee81df 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -15,6 +15,7 @@ #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicsRISCV.h" #include "llvm/IR/PatternMatch.h" #include #include @@ -2701,6 +2702,120 @@ void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE, BaseT::getPeelingPreferences(L, SE, PP); } +bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst, + MemIntrinsicInfo &Info) const { + const DataLayout &DL = getDataLayout(); + Intrinsic::ID IID = Inst->getIntrinsicID(); + LLVMContext &C = Inst->getContext(); + bool HasMask = false; + switch (IID) { + case Intrinsic::riscv_vle_mask: + case Intrinsic::riscv_vse_mask: + HasMask = true; + [[fallthrough]]; + case Intrinsic::riscv_vle: + case Intrinsic::riscv_vse: { + // Intrinsic interface: + // riscv_vle(merge, ptr, vl) + // riscv_vle_mask(merge, ptr, mask, vl, policy) + // riscv_vse(val, ptr, vl) + // riscv_vse_mask(val, ptr, mask, vl, policy) + bool IsWrite = Inst->getType()->isVoidTy(); + Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType(); + const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID); + unsigned VLIndex = RVVIInfo->VLOperand; + unsigned PtrOperandNo = VLIndex - 1 - HasMask; + MaybeAlign Alignment = + Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL); + Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C)); + Value *Mask = ConstantInt::getTrue(MaskType); + if (HasMask) + Mask = Inst->getArgOperand(VLIndex - 1); + Value *EVL = Inst->getArgOperand(VLIndex); + Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty, + Alignment, Mask, EVL); + return true; + } + case Intrinsic::riscv_vlse_mask: + case Intrinsic::riscv_vsse_mask: + HasMask = true; + [[fallthrough]]; + case Intrinsic::riscv_vlse: + case Intrinsic::riscv_vsse: { + // Intrinsic interface: + // riscv_vlse(merge, ptr, stride, vl) + // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy) + // riscv_vsse(val, ptr, stride, vl) + // riscv_vsse_mask(val, ptr, stride, mask, vl, policy) + bool IsWrite = Inst->getType()->isVoidTy(); + Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType(); + const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID); + unsigned VLIndex = RVVIInfo->VLOperand; + unsigned PtrOperandNo = VLIndex - 2 - HasMask; + MaybeAlign Alignment = + Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL); + + Value *Stride = Inst->getArgOperand(PtrOperandNo + 1); + // Use the pointer alignment as the element alignment if the stride is a + // multiple of the pointer alignment. Otherwise, the element alignment + // should be the greatest common divisor of pointer alignment and stride. + // For simplicity, just consider unalignment for elements. 
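// Worked example for the comment above: with a pointer aligned to 8 and a
// constant stride of 12, 12 % 8 != 0; the exact element alignment would be
// gcd(8, 12) = 4, but the code below simply falls back to Align(1) instead of
// computing the gcd.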
+ unsigned PointerAlign = Alignment.valueOrOne().value(); + if (!isa(Stride) || + cast(Stride)->getZExtValue() % PointerAlign != 0) + Alignment = Align(1); + + Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C)); + Value *Mask = ConstantInt::getTrue(MaskType); + if (HasMask) + Mask = Inst->getArgOperand(VLIndex - 1); + Value *EVL = Inst->getArgOperand(VLIndex); + Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty, + Alignment, Mask, EVL, Stride); + return true; + } + case Intrinsic::riscv_vloxei_mask: + case Intrinsic::riscv_vluxei_mask: + case Intrinsic::riscv_vsoxei_mask: + case Intrinsic::riscv_vsuxei_mask: + HasMask = true; + [[fallthrough]]; + case Intrinsic::riscv_vloxei: + case Intrinsic::riscv_vluxei: + case Intrinsic::riscv_vsoxei: + case Intrinsic::riscv_vsuxei: { + // Intrinsic interface (only listed ordered version): + // riscv_vloxei(merge, ptr, index, vl) + // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy) + // riscv_vsoxei(val, ptr, index, vl) + // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy) + bool IsWrite = Inst->getType()->isVoidTy(); + Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType(); + const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID); + unsigned VLIndex = RVVIInfo->VLOperand; + unsigned PtrOperandNo = VLIndex - 2 - HasMask; + Value *Mask; + if (HasMask) { + Mask = Inst->getArgOperand(VLIndex - 1); + } else { + // Mask cannot be nullptr here: vector GEP produces , + // and casting that to scalar i64 triggers a vector/scalar mismatch + // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it + // via extractelement instead. + Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C)); + Mask = ConstantInt::getTrue(MaskType); + } + Value *EVL = Inst->getArgOperand(VLIndex); + Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1); + Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty, + Align(1), Mask, EVL, + /* Stride */ nullptr, OffsetOp); + return true; + } + } + return false; +} + unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) const { if (Ty->isVectorTy()) { // f16 with only zvfhmin and bf16 will be promoted to f32 diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 47e0a250d285a..6886e8964e29e 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -160,6 +160,9 @@ class RISCVTTIImpl final : public BasicTTIImplBase { void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override; + bool getTgtMemIntrinsic(IntrinsicInst *Inst, + MemIntrinsicInfo &Info) const override; + unsigned getMinVectorRegisterBitWidth() const override { return ST->useRVVForFixedLengthVectors() ? 
16 : 0; } diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp index 1e3f7fc0070ff..776208bd3e693 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp @@ -269,6 +269,21 @@ void SPIRVInstPrinter::printInst(const MCInst *MI, uint64_t Address, } break; } + case SPIRV::OpSDot: + case SPIRV::OpUDot: + case SPIRV::OpSUDot: + case SPIRV::OpSDotAccSat: + case SPIRV::OpUDotAccSat: + case SPIRV::OpSUDotAccSat: { + const unsigned NumOps = MI->getNumOperands(); + if (NumOps > NumFixedOps) { + OS << ' '; + printSymbolicOperand( + MI, NumOps - 1, OS); + break; + } + break; + } default: printRemainingVariableOps(MI, NumFixedOps, OS); break; diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 2abd9d36f7606..f704d3afdea78 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -1096,6 +1096,41 @@ static bool build2DBlockIOINTELInst(const SPIRV::IncomingCall *Call, return true; } +static bool buildPipeInst(const SPIRV::IncomingCall *Call, unsigned Opcode, + unsigned Scope, MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + switch (Opcode) { + case SPIRV::OpCommitReadPipe: + case SPIRV::OpCommitWritePipe: + return buildOpFromWrapper(MIRBuilder, Opcode, Call, Register(0)); + case SPIRV::OpGroupCommitReadPipe: + case SPIRV::OpGroupCommitWritePipe: + case SPIRV::OpGroupReserveReadPipePackets: + case SPIRV::OpGroupReserveWritePipePackets: { + Register ScopeConstReg = + MIRBuilder.buildConstant(LLT::scalar(32), Scope).getReg(0); + MachineRegisterInfo *MRI = MIRBuilder.getMRI(); + MRI->setRegClass(ScopeConstReg, &SPIRV::iIDRegClass); + MachineInstrBuilder MIB; + MIB = MIRBuilder.buildInstr(Opcode); + // Add Return register and type. 
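// (Only the reserve forms produce a result id; OpGroupCommitReadPipe and
// OpGroupCommitWritePipe are defined with no outs, so nothing is added for
// them here.)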
+ if (Opcode == SPIRV::OpGroupReserveReadPipePackets || + Opcode == SPIRV::OpGroupReserveWritePipePackets) + MIB.addDef(Call->ReturnRegister) + .addUse(GR->getSPIRVTypeID(Call->ReturnType)); + + MIB.addUse(ScopeConstReg); + for (unsigned int i = 0; i < Call->Arguments.size(); ++i) + MIB.addUse(Call->Arguments[i]); + + return true; + } + default: + return buildOpFromWrapper(MIRBuilder, Opcode, Call, + GR->getSPIRVTypeID(Call->ReturnType)); + } +} + static unsigned getNumComponentsForDim(SPIRV::Dim::Dim dim) { switch (dim) { case SPIRV::Dim::DIM_1D: @@ -1778,7 +1813,7 @@ static bool generateDotOrFMulInst(const StringRef DemangledCall, // Add Packed Vector Format for Integer dot product builtins if arguments are // scalar if (!IsVec && OC != SPIRV::OpFMulS) - MIB.addImm(0); + MIB.addImm(SPIRV::PackedVectorFormat4x8Bit); return true; } @@ -2350,6 +2385,20 @@ static bool generate2DBlockIOINTELInst(const SPIRV::IncomingCall *Call, return build2DBlockIOINTELInst(Call, Opcode, MIRBuilder, GR); } +static bool generatePipeInst(const SPIRV::IncomingCall *Call, + MachineIRBuilder &MIRBuilder, + SPIRVGlobalRegistry *GR) { + const SPIRV::DemangledBuiltin *Builtin = Call->Builtin; + unsigned Opcode = + SPIRV::lookupNativeBuiltin(Builtin->Name, Builtin->Set)->Opcode; + + unsigned Scope = SPIRV::Scope::Workgroup; + if (Builtin->Name.contains("sub_group")) + Scope = SPIRV::Scope::Subgroup; + + return buildPipeInst(Call, Opcode, Scope, MIRBuilder, GR); +} + static bool buildNDRange(const SPIRV::IncomingCall *Call, MachineIRBuilder &MIRBuilder, SPIRVGlobalRegistry *GR) { @@ -2948,6 +2997,8 @@ std::optional lowerBuiltin(const StringRef DemangledCall, return generateTernaryBitwiseFunctionINTELInst(Call.get(), MIRBuilder, GR); case SPIRV::Block2DLoadStore: return generate2DBlockIOINTELInst(Call.get(), MIRBuilder, GR); + case SPIRV::Pipe: + return generatePipeInst(Call.get(), MIRBuilder, GR); } return false; } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index d08560bb6565a..2a8deb6bf498b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -69,6 +69,7 @@ def ExtendedBitOps : BuiltinGroup; def BindlessINTEL : BuiltinGroup; def TernaryBitwiseINTEL : BuiltinGroup; def Block2DLoadStore : BuiltinGroup; +def Pipe : BuiltinGroup; //===----------------------------------------------------------------------===// // Class defining a demangled builtin record. 
The information in the record @@ -633,6 +634,29 @@ defm : DemangledNativeBuiltin<"__spirv_AtomicSMax", OpenCL_std, Atomic, 4, 4, Op defm : DemangledNativeBuiltin<"__spirv_AtomicUMin", OpenCL_std, Atomic, 4, 4, OpAtomicUMin>; defm : DemangledNativeBuiltin<"__spirv_AtomicUMax", OpenCL_std, Atomic, 4, 4, OpAtomicUMax>; +// Pipe Instruction +defm : DemangledNativeBuiltin<"__read_pipe_2", OpenCL_std, Pipe,2, 2, OpReadPipe>; +defm : DemangledNativeBuiltin<"__write_pipe_2", OpenCL_std, Pipe, 2, 2, OpWritePipe>; +defm : DemangledNativeBuiltin<"__read_pipe_4", OpenCL_std, Pipe,4, 4, OpReservedReadPipe>; +defm : DemangledNativeBuiltin<"__write_pipe_4", OpenCL_std, Pipe, 4, 4, OpReservedWritePipe>; +defm : DemangledNativeBuiltin<"__reserve_read_pipe", OpenCL_std, Pipe, 2, 2, OpReserveReadPipePackets>; +defm : DemangledNativeBuiltin<"__reserve_write_pipe", OpenCL_std, Pipe, 2, 2, OpReserveWritePipePackets>; +defm : DemangledNativeBuiltin<"__commit_read_pipe", OpenCL_std, Pipe, 2, 2, OpCommitReadPipe>; +defm : DemangledNativeBuiltin<"__commit_write_pipe", OpenCL_std, Pipe, 2, 2, OpCommitWritePipe>; +defm : DemangledNativeBuiltin<"is_valid_reserve_id", OpenCL_std, Pipe, 1, 1, OpIsValidReserveId>; +defm : DemangledNativeBuiltin<"__get_pipe_num_packets_ro", OpenCL_std, Pipe, 1, 1, OpGetNumPipePackets>; +defm : DemangledNativeBuiltin<"__get_pipe_max_packets_ro", OpenCL_std, Pipe, 1, 1, OpGetMaxPipePackets>; +defm : DemangledNativeBuiltin<"__get_pipe_num_packets_wo", OpenCL_std, Pipe, 1, 1, OpGetNumPipePackets>; +defm : DemangledNativeBuiltin<"__get_pipe_max_packets_wo", OpenCL_std, Pipe, 1, 1, OpGetMaxPipePackets>; +defm : DemangledNativeBuiltin<"__work_group_reserve_read_pipe", OpenCL_std, Pipe, 2, 2, OpGroupReserveReadPipePackets>; +defm : DemangledNativeBuiltin<"__work_group_reserve_write_pipe", OpenCL_std, Pipe, 2, 2, OpGroupReserveWritePipePackets>; +defm : DemangledNativeBuiltin<"__work_group_commit_read_pipe", OpenCL_std, Pipe, 2, 2, OpGroupCommitReadPipe>; +defm : DemangledNativeBuiltin<"__work_group_commit_write_pipe", OpenCL_std, Pipe, 2, 2, OpGroupCommitWritePipe>; +defm : DemangledNativeBuiltin<"__sub_group_reserve_read_pipe", OpenCL_std, Pipe, 2, 2, OpGroupReserveReadPipePackets>; +defm : DemangledNativeBuiltin<"__sub_group_reserve_write_pipe", OpenCL_std, Pipe, 2, 2, OpGroupReserveWritePipePackets>; +defm : DemangledNativeBuiltin<"__sub_group_commit_read_pipe", OpenCL_std, Pipe, 2, 2, OpGroupCommitReadPipe>; +defm : DemangledNativeBuiltin<"__sub_group_commit_write_pipe", OpenCL_std, Pipe, 2, 2, OpGroupCommitWritePipe>; + // Barrier builtin records: defm : DemangledNativeBuiltin<"barrier", OpenCL_std, Barrier, 1, 3, OpControlBarrier>; defm : DemangledNativeBuiltin<"work_group_barrier", OpenCL_std, Barrier, 1, 3, OpControlBarrier>; diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 993de9e9f64ec..85ea9e156cb97 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -148,7 +148,10 @@ static const std::map> SPIRV::Extension::Extension::SPV_KHR_float_controls2}, {"SPV_INTEL_tensor_float32_conversion", SPIRV::Extension::Extension::SPV_INTEL_tensor_float32_conversion}, - {"SPV_KHR_bfloat16", SPIRV::Extension::Extension::SPV_KHR_bfloat16}}; + {"SPV_KHR_bfloat16", SPIRV::Extension::Extension::SPV_KHR_bfloat16}, + {"SPV_EXT_relaxed_printf_string_address_space", + SPIRV::Extension::Extension:: + SPV_EXT_relaxed_printf_string_address_space}}; bool SPIRVExtensionsParser::parse(cl::Option &O, 
StringRef ArgName, StringRef ArgValue, diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index f5a49e2b47363..704edd3139260 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -1909,11 +1909,12 @@ Instruction *SPIRVEmitIntrinsics::visitInsertValueInst(InsertValueInst &I) { B.SetInsertPoint(&I); SmallVector Types = {I.getInsertedValueOperand()->getType()}; SmallVector Args; - for (auto &Op : I.operands()) - if (isa(Op)) - Args.push_back(UndefValue::get(B.getInt32Ty())); - else - Args.push_back(Op); + Value *AggregateOp = I.getAggregateOperand(); + if (isa(AggregateOp)) + Args.push_back(UndefValue::get(B.getInt32Ty())); + else + Args.push_back(AggregateOp); + Args.push_back(I.getInsertedValueOperand()); for (auto &Op : I.indices()) Args.push_back(B.getInt32(Op)); Instruction *NewI = diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index 496dcba17c10d..1723bfb639189 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -763,7 +763,38 @@ def OpGetDefaultQueue: Op<303, (outs ID:$res), (ins TYPE:$type), def OpBuildNDRange: Op<304, (outs ID:$res), (ins TYPE:$type, ID:$GWS, ID:$LWS, ID:$GWO), "$res = OpBuildNDRange $type $GWS $LWS $GWO">; -// TODO: 3.42.23. Pipe Instructions +// 3.42.23. Pipe Instructions + +def OpReadPipe: Op<274, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$Pointer, ID:$PcktSize, ID:$PcktAlign), + "$res = OpReadPipe $type $Pipe $Pointer $PcktSize $PcktAlign">; +def OpWritePipe: Op<275, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$Pointer, ID:$PcktSize, ID:$PcktAlign), + "$res = OpWritePipe $type $Pipe $Pointer $PcktSize $PcktAlign">; +def OpReservedReadPipe : Op<276, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$ReserveId, ID:$Index, ID:$Pointer, ID:$PcktSize, ID:$PcktAlign), + "$res = OpReservedReadPipe $type $Pipe $ReserveId $Index $Pointer $PcktSize $PcktAlign">; +def OpReservedWritePipe : Op<277, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$ReserveId, ID:$Index, ID:$Pointer, ID:$PcktSize, ID:$PcktAlign), + "$res = OpReservedWritePipe $type $Pipe $ReserveId $Index $Pointer $PcktSize $PcktAlign">; +def OpReserveReadPipePackets : Op<278, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$NumPckts, ID:$PcktSize, ID:$PcktAlign), + "$res = OpReserveReadPipePackets $type $Pipe $NumPckts $PcktSize $PcktAlign">; +def OpReserveWritePipePackets : Op<279, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$NumPckts, ID:$PcktSize, ID:$PcktAlign), + "$res = OpReserveWritePipePackets $type $Pipe $NumPckts $PcktSize $PcktAlign">; +def OpCommitReadPipe : Op<280, (outs), (ins ID:$Pipe, ID:$ReserveId, ID:$PcktSize, ID:$PcktAlign), + "OpCommitReadPipe $Pipe $ReserveId $PcktSize $PcktAlign">; +def OpCommitWritePipe : Op<281, (outs), (ins ID:$Pipe, ID:$ReserveId, ID:$PcktSize, ID:$PcktAlign), + "OpCommitWritePipe $Pipe $ReserveId $PcktSize $PcktAlign">; +def OpIsValidReserveId : Op<282, (outs ID:$res), (ins TYPE:$type, ID:$ReserveId), + "$res = OpIsValidReserveId $type $ReserveId">; +def OpGetNumPipePackets : Op<283, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$PacketSize, ID:$PacketAlign), + "$res = OpGetNumPipePackets $type $Pipe $PacketSize $PacketAlign">; +def OpGetMaxPipePackets : Op<284, (outs ID:$res), (ins TYPE:$type, ID:$Pipe, ID:$PacketSize, ID:$PacketAlign), + "$res = OpGetMaxPipePackets $type $Pipe $PacketSize $PacketAlign">; +def OpGroupReserveReadPipePackets : Op<285, (outs 
ID:$res), (ins TYPE:$type, ID:$Scope, ID:$Pipe, ID:$NumPckts, ID:$PacketSize, ID:$PacketAlign), + "$res = OpGroupReserveReadPipePackets $type $Scope $Pipe $NumPckts $PacketSize $PacketAlign">; +def OpGroupReserveWritePipePackets : Op<286, (outs ID:$res), (ins TYPE:$type, ID:$Scope, ID:$Pipe, ID:$NumPckts, ID:$PacketSize, ID:$PacketAlign), + "$res = OpGroupReserveWritePipePackets $type $Scope $Pipe $NumPckts $PacketSize $PacketAlign">; +def OpGroupCommitReadPipe : Op<287, (outs), (ins ID:$Scope, ID:$Pipe, ID:$ReserveId, ID:$PacketSize, ID:$PacketAlign), + "OpGroupCommitReadPipe $Scope $Pipe $ReserveId $PacketSize $PacketAlign">; +def OpGroupCommitWritePipe : Op<288, (outs), (ins ID:$Scope, ID:$Pipe, ID:$ReserveId, ID:$PacketSize, ID:$PacketAlign), + "OpGroupCommitWritePipe $Scope $Pipe $ReserveId $PacketSize $PacketAlign">; // 3.42.24. Non-Uniform Instructions diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 3ad5528fab061..1aadd9df189a8 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -197,6 +197,8 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectOverflowArith(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, unsigned Opcode) const; + bool selectDebugTrap(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; bool selectIntegerDot(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, bool Signed) const; @@ -207,6 +209,9 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectOpIsInf(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; + bool selectOpIsNan(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + template bool selectDot4AddPacked(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; @@ -278,6 +283,12 @@ class SPIRVInstructionSelector : public InstructionSelector { GL::GLSLExtInst GLInst) const; bool selectExtInst(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, const ExtInstList &ExtInsts) const; + bool selectExtInstForLRound(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I, CL::OpenCLExtInst CLInst, + GL::GLSLExtInst GLInst) const; + bool selectExtInstForLRound(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I, + const ExtInstList &ExtInsts) const; bool selectLog10(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; @@ -303,7 +314,8 @@ class SPIRVInstructionSelector : public InstructionSelector { MachineInstr &I) const; bool selectModf(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; - + bool selectFrexp(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; // Utilities std::pair buildI32Constant(uint32_t Val, MachineInstr &I, @@ -708,7 +720,22 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, return selectSUCmp(ResVReg, ResType, I, true); case TargetOpcode::G_UCMP: return selectSUCmp(ResVReg, ResType, I, false); - + case TargetOpcode::G_LROUND: + case TargetOpcode::G_LLROUND: { + Register regForLround = + MRI->createVirtualRegister(MRI->getRegClass(ResVReg), "lround"); + MRI->setRegClass(regForLround, &SPIRV::iIDRegClass); + GR.assignSPIRVTypeToVReg(GR.getSPIRVTypeForVReg(I.getOperand(1).getReg()), + regForLround, *(I.getParent()->getParent())); + selectExtInstForLRound(regForLround, GR.getSPIRVTypeForVReg(regForLround), + I, CL::round, GL::Round); + MachineBasicBlock &BB = *I.getParent(); + auto MIB = 
BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConvertFToS)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(regForLround); + return MIB.constrainAllUses(TII, TRI, RBI); + } case TargetOpcode::G_STRICT_FMA: case TargetOpcode::G_FMA: return selectExtInst(ResVReg, ResType, I, CL::fma, GL::Fma); @@ -809,6 +836,9 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, case TargetOpcode::G_USUBSAT: return selectExtInst(ResVReg, ResType, I, CL::u_sub_sat); + case TargetOpcode::G_FFREXP: + return selectFrexp(ResVReg, ResType, I); + case TargetOpcode::G_UADDO: return selectOverflowArith(ResVReg, ResType, I, ResType->getOpcode() == SPIRV::OpTypeVector @@ -975,16 +1005,26 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, // represent code after lowering or intrinsics which are not implemented but // should not crash when found in a customer's LLVM IR input. case TargetOpcode::G_TRAP: - case TargetOpcode::G_DEBUGTRAP: case TargetOpcode::G_UBSANTRAP: case TargetOpcode::DBG_LABEL: return true; + case TargetOpcode::G_DEBUGTRAP: + return selectDebugTrap(ResVReg, ResType, I); default: return false; } } +bool SPIRVInstructionSelector::selectDebugTrap(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + unsigned Opcode = SPIRV::OpNop; + MachineBasicBlock &BB = *I.getParent(); + return BuildMI(BB, I, I.getDebugLoc(), TII.get(Opcode)) + .constrainAllUses(TII, TRI, RBI); +} + bool SPIRVInstructionSelector::selectExtInst(Register ResVReg, const SPIRVType *ResType, MachineInstr &I, @@ -1047,6 +1087,88 @@ bool SPIRVInstructionSelector::selectExtInst(Register ResVReg, } return false; } +bool SPIRVInstructionSelector::selectExtInstForLRound( + Register ResVReg, const SPIRVType *ResType, MachineInstr &I, + CL::OpenCLExtInst CLInst, GL::GLSLExtInst GLInst) const { + ExtInstList ExtInsts = {{SPIRV::InstructionSet::OpenCL_std, CLInst}, + {SPIRV::InstructionSet::GLSL_std_450, GLInst}}; + return selectExtInstForLRound(ResVReg, ResType, I, ExtInsts); +} + +bool SPIRVInstructionSelector::selectExtInstForLRound( + Register ResVReg, const SPIRVType *ResType, MachineInstr &I, + const ExtInstList &Insts) const { + for (const auto &Ex : Insts) { + SPIRV::InstructionSet::InstructionSet Set = Ex.first; + uint32_t Opcode = Ex.second; + if (STI.canUseExtInstSet(Set)) { + MachineBasicBlock &BB = *I.getParent(); + auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addImm(static_cast(Set)) + .addImm(Opcode); + const unsigned NumOps = I.getNumOperands(); + unsigned Index = 1; + if (Index < NumOps && + I.getOperand(Index).getType() == + MachineOperand::MachineOperandType::MO_IntrinsicID) + Index = 2; + for (; Index < NumOps; ++Index) + MIB.add(I.getOperand(Index)); + MIB.constrainAllUses(TII, TRI, RBI); + return true; + } + } + return false; +} + +bool SPIRVInstructionSelector::selectFrexp(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + ExtInstList ExtInsts = {{SPIRV::InstructionSet::OpenCL_std, CL::frexp}, + {SPIRV::InstructionSet::GLSL_std_450, GL::Frexp}}; + for (const auto &Ex : ExtInsts) { + SPIRV::InstructionSet::InstructionSet Set = Ex.first; + uint32_t Opcode = Ex.second; + if (!STI.canUseExtInstSet(Set)) + continue; + + MachineIRBuilder MIRBuilder(I); + SPIRVType *PointeeTy = GR.getSPIRVTypeForVReg(I.getOperand(1).getReg()); + const SPIRVType *PointerType = GR.getOrCreateSPIRVPointerType( + PointeeTy, MIRBuilder, SPIRV::StorageClass::Function); + Register 
PointerVReg = + createVirtualRegister(PointerType, &GR, MRI, MRI->getMF()); + + auto It = getOpVariableMBBIt(I); + auto MIB = BuildMI(*It->getParent(), It, It->getDebugLoc(), + TII.get(SPIRV::OpVariable)) + .addDef(PointerVReg) + .addUse(GR.getSPIRVTypeID(PointerType)) + .addImm(static_cast(SPIRV::StorageClass::Function)) + .constrainAllUses(TII, TRI, RBI); + + MIB = MIB & + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addImm(static_cast(Ex.first)) + .addImm(Opcode) + .add(I.getOperand(2)) + .addUse(PointerVReg) + .constrainAllUses(TII, TRI, RBI); + + MIB = MIB & + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpLoad)) + .addDef(I.getOperand(1).getReg()) + .addUse(GR.getSPIRVTypeID(PointeeTy)) + .addUse(PointerVReg) + .constrainAllUses(TII, TRI, RBI); + return MIB; + } + return false; +} bool SPIRVInstructionSelector::selectOpWithSrcs(Register ResVReg, const SPIRVType *ResType, @@ -2056,6 +2178,17 @@ bool SPIRVInstructionSelector::selectOpIsInf(Register ResVReg, .constrainAllUses(TII, TRI, RBI); } +bool SPIRVInstructionSelector::selectOpIsNan(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + MachineBasicBlock &BB = *I.getParent(); + return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIsNan)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(I.getOperand(2).getReg()) + .constrainAllUses(TII, TRI, RBI); +} + template bool SPIRVInstructionSelector::selectDot4AddPacked(Register ResVReg, const SPIRVType *ResType, @@ -3199,6 +3332,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectExtInst(ResVReg, ResType, I, CL::fract, GL::Fract); case Intrinsic::spv_isinf: return selectOpIsInf(ResVReg, ResType, I); + case Intrinsic::spv_isnan: + return selectOpIsNan(ResVReg, ResType, I); case Intrinsic::spv_normalize: return selectExtInst(ResVReg, ResType, I, CL::normalize, GL::Normalize); case Intrinsic::spv_refract: diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 170bddd507e3b..b4fc8dabbd4df 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -276,6 +276,10 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { {G_UADDO, G_SADDO, G_USUBO, G_SSUBO, G_UMULO, G_SMULO}) .alwaysLegal(); + getActionDefinitionsBuilder({G_LROUND, G_LLROUND}) + .legalForCartesianProduct(allFloatScalarsAndVectors, + allIntScalarsAndVectors); + // FP conversions. getActionDefinitionsBuilder({G_FPTRUNC, G_FPEXT}) .legalForCartesianProduct(allFloatScalarsAndVectors); @@ -286,6 +290,9 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { // Control-flow. In some cases (e.g. constants) s1 may be promoted to s32. getActionDefinitionsBuilder(G_BRCOND).legalFor({s1, s32}); + getActionDefinitionsBuilder(G_FFREXP).legalForCartesianProduct( + allFloatScalarsAndVectors, {s32, v2s32, v3s32, v4s32, v8s32, v16s32}); + // TODO: Review the target OpenCL and GLSL Extended Instruction Set specs to // tighten these requirements. 
Many of these math functions are only legal on // specific bitwidths, so they are not selectable for diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index a95f393b75605..bc159d5c9a113 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1222,6 +1222,31 @@ static void AddDotProductRequirements(const MachineInstr &MI, } } +void addPrintfRequirements(const MachineInstr &MI, + SPIRV::RequirementHandler &Reqs, + const SPIRVSubtarget &ST) { + SPIRVGlobalRegistry *GR = ST.getSPIRVGlobalRegistry(); + const SPIRVType *PtrType = GR->getSPIRVTypeForVReg(MI.getOperand(4).getReg()); + if (PtrType) { + MachineOperand ASOp = PtrType->getOperand(1); + if (ASOp.isImm()) { + unsigned AddrSpace = ASOp.getImm(); + if (AddrSpace != SPIRV::StorageClass::UniformConstant) { + if (!ST.canUseExtension( + SPIRV::Extension:: + SPV_EXT_relaxed_printf_string_address_space)) { + report_fatal_error("SPV_EXT_relaxed_printf_string_address_space is " + "required because printf uses a format string not " + "in constant address space.", + false); + } + Reqs.addExtension( + SPIRV::Extension::SPV_EXT_relaxed_printf_string_address_space); + } + } + } +} + static bool isBFloat16Type(const SPIRVType *TypeDef) { return TypeDef && TypeDef->getNumOperands() == 3 && TypeDef->getOpcode() == SPIRV::OpTypeFloat && @@ -1230,8 +1255,9 @@ static bool isBFloat16Type(const SPIRVType *TypeDef) { } void addInstrRequirements(const MachineInstr &MI, - SPIRV::RequirementHandler &Reqs, + SPIRV::ModuleAnalysisInfo &MAI, const SPIRVSubtarget &ST) { + SPIRV::RequirementHandler &Reqs = MAI.Reqs; switch (MI.getOpcode()) { case SPIRV::OpMemoryModel: { int64_t Addr = MI.getOperand(0).getImm(); @@ -1321,6 +1347,12 @@ void addInstrRequirements(const MachineInstr &MI, static_cast( SPIRV::InstructionSet::NonSemantic_Shader_DebugInfo_100)) { Reqs.addExtension(SPIRV::Extension::SPV_KHR_non_semantic_info); + break; + } + if (MI.getOperand(3).getImm() == + static_cast(SPIRV::OpenCLExtInst::printf)) { + addPrintfRequirements(MI, Reqs, ST); + break; } break; } @@ -1781,15 +1813,45 @@ void addInstrRequirements(const MachineInstr &MI, break; case SPIRV::OpConvertHandleToImageINTEL: case SPIRV::OpConvertHandleToSamplerINTEL: - case SPIRV::OpConvertHandleToSampledImageINTEL: + case SPIRV::OpConvertHandleToSampledImageINTEL: { if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bindless_images)) report_fatal_error("OpConvertHandleTo[Image/Sampler/SampledImage]INTEL " "instructions require the following SPIR-V extension: " "SPV_INTEL_bindless_images", false); + SPIRVGlobalRegistry *GR = ST.getSPIRVGlobalRegistry(); + SPIRV::AddressingModel::AddressingModel AddrModel = MAI.Addr; + SPIRVType *TyDef = GR->getSPIRVTypeForVReg(MI.getOperand(1).getReg()); + if (MI.getOpcode() == SPIRV::OpConvertHandleToImageINTEL && + TyDef->getOpcode() != SPIRV::OpTypeImage) { + report_fatal_error("Incorrect return type for the instruction " + "OpConvertHandleToImageINTEL", + false); + } else if (MI.getOpcode() == SPIRV::OpConvertHandleToSamplerINTEL && + TyDef->getOpcode() != SPIRV::OpTypeSampler) { + report_fatal_error("Incorrect return type for the instruction " + "OpConvertHandleToSamplerINTEL", + false); + } else if (MI.getOpcode() == SPIRV::OpConvertHandleToSampledImageINTEL && + TyDef->getOpcode() != SPIRV::OpTypeSampledImage) { + report_fatal_error("Incorrect return type for the instruction " + "OpConvertHandleToSampledImageINTEL", + false); + } + SPIRVType *SpvTy = 
GR->getSPIRVTypeForVReg(MI.getOperand(2).getReg()); + unsigned Bitwidth = GR->getScalarOrVectorBitWidth(SpvTy); + if (!(Bitwidth == 32 && AddrModel == SPIRV::AddressingModel::Physical32) && + !(Bitwidth == 64 && AddrModel == SPIRV::AddressingModel::Physical64)) { + report_fatal_error( + "Parameter value must be a 32-bit scalar in case of " + "Physical32 addressing model or a 64-bit scalar in case of " + "Physical64 addressing model", + false); + } Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bindless_images); Reqs.addCapability(SPIRV::Capability::BindlessImagesINTEL); break; + } case SPIRV::OpSubgroup2DBlockLoadINTEL: case SPIRV::OpSubgroup2DBlockLoadTransposeINTEL: case SPIRV::OpSubgroup2DBlockLoadTransformINTEL: @@ -1927,7 +1989,7 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, continue; for (const MachineBasicBlock &MBB : *MF) for (const MachineInstr &MI : MBB) - addInstrRequirements(MI, MAI.Reqs, ST); + addInstrRequirements(MI, MAI, ST); } // Collect requirements for OpExecutionMode instructions. auto Node = M.getNamedMetadata("spirv.ExecutionMode"); diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 2b34f61fa2434..4e4e6fb4ab791 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -335,6 +335,21 @@ static void lowerFunnelShifts(IntrinsicInst *FSHIntrinsic) { FSHIntrinsic->setCalledFunction(FSHFunc); } +static void lowerConstrainedFPCmpIntrinsic( + ConstrainedFPCmpIntrinsic *ConstrainedCmpIntrinsic, + SmallVector &EraseFromParent) { + if (!ConstrainedCmpIntrinsic) + return; + // Extract the floating-point values being compared + Value *LHS = ConstrainedCmpIntrinsic->getArgOperand(0); + Value *RHS = ConstrainedCmpIntrinsic->getArgOperand(1); + FCmpInst::Predicate Pred = ConstrainedCmpIntrinsic->getPredicate(); + IRBuilder<> Builder(ConstrainedCmpIntrinsic); + Value *FCmp = Builder.CreateFCmp(Pred, LHS, RHS); + ConstrainedCmpIntrinsic->replaceAllUsesWith(FCmp); + EraseFromParent.push_back(dyn_cast(ConstrainedCmpIntrinsic)); +} + static void lowerExpectAssume(IntrinsicInst *II) { // If we cannot use the SPV_KHR_expect_assume extension, then we need to // ignore the intrinsic and move on. It should be removed later on by LLVM. 
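Reviewer note: the new OpConvertHandleTo*INTEL validation above boils down to a single predicate relating the handle operand's scalar width to the module's addressing model. A minimal standalone sketch of that rule (the enum below is an illustrative stand-in, not the in-tree SPIRV::AddressingModel type):

    #include <cassert>

    // Illustrative stand-in for the addressing models referenced above.
    enum class AddressingModel { Logical, Physical32, Physical64 };

    // Mirrors the new check: a bindless handle must be a 32-bit scalar under
    // Physical32 or a 64-bit scalar under Physical64.
    bool isValidBindlessHandleWidth(unsigned BitWidth, AddressingModel AM) {
      return (BitWidth == 32 && AM == AddressingModel::Physical32) ||
             (BitWidth == 64 && AM == AddressingModel::Physical64);
    }

    int main() {
      assert(isValidBindlessHandleWidth(64, AddressingModel::Physical64));
      assert(!isValidBindlessHandleWidth(64, AddressingModel::Physical32));
      return 0;
    }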
@@ -376,6 +391,7 @@ static bool toSpvLifetimeIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID) { bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { bool Changed = false; const SPIRVSubtarget &STI = TM.getSubtarget(*F); + SmallVector EraseFromParent; for (BasicBlock &BB : *F) { for (Instruction &I : make_early_inc_range(BB)) { auto Call = dyn_cast(&I); @@ -423,9 +439,17 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) { lowerPtrAnnotation(II); Changed = true; break; + case Intrinsic::experimental_constrained_fcmp: + case Intrinsic::experimental_constrained_fcmps: + lowerConstrainedFPCmpIntrinsic(dyn_cast(II), + EraseFromParent); + Changed = true; + break; } } } + for (auto *I : EraseFromParent) + I->eraseFromParent(); return Changed; } diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 501bcb94af2ea..66ce5a2d67c3e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -211,6 +211,7 @@ def CooperativeMatrixOperandsOperand : OperandCategory; def SpecConstantOpOperandsOperand : OperandCategory; def MatrixMultiplyAccumulateOperandsOperand : OperandCategory; def FPEncodingOperand : OperandCategory; +def PackedVectorFormatsOperand : OperandCategory; //===----------------------------------------------------------------------===// // Definition of the Environments @@ -2026,3 +2027,22 @@ multiclass FPEncodingOperand value, list reqExtensions>{ } defm BFloat16KHR : FPEncodingOperand<0, [SPV_KHR_bfloat16]>; + +def PackedVectorFormats : GenericEnum, Operand { + let FilterClass = "PackedVectorFormats"; + let NameField = "Name"; + let ValueField = "Value"; + let PrintMethod = !strconcat("printSymbolicOperand"); +} + +class PackedVectorFormats value> { + string Name = name; + bits<32> Value = value; +} + +multiclass PackedVectorFormatsOperand value, list reqExtensions> { + def NAME : BuiltIn; + defm : SymbolicOperandWithRequirements; +} + +defm PackedVectorFormat4x8Bit : PackedVectorFormatsOperand<0, [SPV_KHR_integer_dot_product]>; diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp index cfa3511436b97..cb02e33b8e5dd 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp @@ -28,7 +28,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/Casting.h" #include "llvm/Support/EndianStream.h" -#include "llvm/Support/ErrorHandling.h" #include #include diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 2737cca62cd20..a1607097af1ef 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -115,7 +115,8 @@ static bool Analyze_CC_Sparc64_Full(bool IsReturn, unsigned &ValNo, MVT &ValVT, // Stack space is allocated for all arguments starting from [%fp+BIAS+128]. unsigned size = (LocVT == MVT::f128) ? 16 : 8; - Align alignment = (LocVT == MVT::f128) ? Align(16) : Align(8); + Align alignment = + (LocVT == MVT::f128 || ArgFlags.isSplit()) ? 
Align(16) : Align(8); unsigned Offset = State.AllocateStack(size, alignment); unsigned Reg = 0; diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp index 59d1db784c688..383c96e8cca73 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp @@ -21,7 +21,6 @@ #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/ErrorHandling.h" #include #include diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 707887c59bd65..f8706b748b355 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -528,7 +528,9 @@ class SystemZTargetLowering : public TargetLowering { bool shouldConsiderGEPOffsetSplit() const override { return true; } - bool shouldExpandCmpUsingSelects(EVT VT) const override { return true; } + bool preferSelectsOverBooleanArithmetic(EVT VT) const override { + return true; + } const char *getTargetNodeName(unsigned Opcode) const override; std::pair diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp index 711937c488275..ab719390e3245 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp @@ -25,7 +25,6 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/EndianStream.h" -#include "llvm/Support/ErrorHandling.h" #include #include diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp index e6486e247209b..5c3127e2d3dc6 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp @@ -216,6 +216,18 @@ static MachineInstr *findStartOfTree(MachineOperand &MO, return Def; } +// FAKE_USEs are no-ops, so remove them here so that the values used by them +// will be correctly dropped later. +static void removeFakeUses(MachineFunction &MF) { + SmallVector ToDelete; + for (auto &MBB : MF) + for (auto &MI : MBB) + if (MI.isFakeUse()) + ToDelete.push_back(&MI); + for (auto *MI : ToDelete) + MI->eraseFromParent(); +} + bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "********** Make Locals Explicit **********\n" "********** Function: " @@ -226,6 +238,8 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) { WebAssemblyFunctionInfo &MFI = *MF.getInfo(); const auto *TII = MF.getSubtarget().getInstrInfo(); + removeFakeUses(MF); + // Map non-stackified virtual registers to their local ids. 
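Reviewer note: both the EraseFromParent list added to SPIRVPrepareFunctions earlier in this patch and removeFakeUses above use the same two-phase pattern, collect first, erase afterwards, so the instruction list is never mutated while it is being walked. A minimal plain-C++ sketch of the idea (std::list standing in for a basic block, the predicate standing in for isFakeUse()):

    #include <cassert>
    #include <list>
    #include <vector>

    int main() {
      std::list<int> Block{1, 2, 3, 4, 5};            // stand-in for instructions
      std::vector<std::list<int>::iterator> ToDelete;
      for (auto It = Block.begin(); It != Block.end(); ++It)
        if (*It % 2 == 0)                             // stand-in for isFakeUse()
          ToDelete.push_back(It);                     // pass 1: only collect
      for (auto It : ToDelete)
        Block.erase(It);                              // pass 2: erase safely
      assert((Block == std::list<int>{1, 3, 5}));
      return 0;
    }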
DenseMap Reg2Local; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp index feac04a17068a..343d90e88950f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp @@ -39,18 +39,18 @@ WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI) WebAssembly::CATCHRET), RI(STI.getTargetTriple()) {} -bool WebAssemblyInstrInfo::isReallyTriviallyReMaterializable( +bool WebAssemblyInstrInfo::isReMaterializableImpl( const MachineInstr &MI) const { switch (MI.getOpcode()) { case WebAssembly::CONST_I32: case WebAssembly::CONST_I64: case WebAssembly::CONST_F32: case WebAssembly::CONST_F64: - // TargetInstrInfo::isReallyTriviallyReMaterializable misses these + // TargetInstrInfo::isReMaterializableImpl misses these // because of the ARGUMENTS implicit def, so we manualy override it here. return true; default: - return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); + return TargetInstrInfo::isReMaterializableImpl(MI); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h index ba00097034bf5..b92f62d7638c1 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h @@ -37,7 +37,7 @@ class WebAssemblyInstrInfo final : public WebAssemblyGenInstrInfo { const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; } - bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; + bool isReMaterializableImpl(const MachineInstr &MI) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, Register SrcReg, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp index 08ca20b5eef6e..97f2ed0a828ba 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp @@ -867,6 +867,10 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) { if (Insert->isDebugValue()) continue; + // Ignore FAKE_USEs, which are no-ops and will be deleted later. + if (Insert->isFakeUse()) + continue; + // Iterate through the inputs in reverse order, since we'll be pulling // operands off the stack in LIFO order. CommutingState Commuting; diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index ce5e92135f706..a8908d4b710e6 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1247,7 +1247,7 @@ class X86AsmParser : public MCTargetAsmParser { /// return false if no parsing errors occurred, true otherwise. bool HandleAVX512Operand(OperandVector &Operands); - bool ParseZ(std::unique_ptr &Z, const SMLoc &StartLoc); + bool ParseZ(std::unique_ptr &Z, SMLoc StartLoc); bool is64BitMode() const { // FIXME: Can tablegen auto-generate this? @@ -2907,8 +2907,7 @@ X86::CondCode X86AsmParser::ParseConditionCode(StringRef CC) { // true on failure, false otherwise // If no {z} mark was found - Parser doesn't advance -bool X86AsmParser::ParseZ(std::unique_ptr &Z, - const SMLoc &StartLoc) { +bool X86AsmParser::ParseZ(std::unique_ptr &Z, SMLoc StartLoc) { MCAsmParser &Parser = getParser(); // Assuming we are just pass the '{' mark, quering the next token // Searched for {z}, but none was found. 
Return false, as no parsing error was @@ -4018,9 +4017,14 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { return Error(Ops[0]->getStartLoc(), "all tmm registers must be distinct"); } - // Check that we aren't mixing AH/BH/CH/DH with REX prefix. We only need to - // check this with the legacy encoding, VEX/EVEX/XOP don't use REX. - if ((TSFlags & X86II::EncodingMask) == 0) { + // High 8-bit regs (AH/BH/CH/DH) are incompatible with encodings that imply + // extended prefixes: + // * Legacy path that would emit a REX (e.g. uses r8..r15 or sil/dil/bpl/spl) + // * EVEX + // * REX2 + // VEX/XOP don't use REX; they are excluded from the legacy check. + const unsigned Enc = TSFlags & X86II::EncodingMask; + if (Enc != X86II::VEX && Enc != X86II::XOP) { MCRegister HReg; bool UsesRex = TSFlags & X86II::REX_W; unsigned NumOps = Inst.getNumOperands(); @@ -4036,11 +4040,13 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { UsesRex = true; } - if (UsesRex && HReg) { + if (HReg && + (Enc == X86II::EVEX || ForcedOpcodePrefix == OpcodePrefix_REX2 || + ForcedOpcodePrefix == OpcodePrefix_REX || UsesRex)) { StringRef RegName = X86IntelInstPrinter::getRegisterName(HReg); return Error(Ops[0]->getStartLoc(), - "can't encode '" + RegName + "' in an instruction requiring " - "REX prefix"); + "can't encode '" + RegName.str() + + "' in an instruction requiring EVEX/REX2/REX prefix"); } } diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp index 2c752457d165e..143c4c43e611a 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/Type.h" using namespace llvm; @@ -110,7 +111,8 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, .legalFor(HasSSE2 || UseX87, {s64}) .legalFor(UseX87, {s80}); - getActionDefinitionsBuilder(G_GET_ROUNDING).customFor({s32}); + getActionDefinitionsBuilder({G_GET_ROUNDING, G_SET_ROUNDING}) + .customFor({s32}); // merge/unmerge for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { @@ -617,6 +619,8 @@ bool X86LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, return legalizeFPTOSI(MI, MRI, Helper); case TargetOpcode::G_GET_ROUNDING: return legalizeGETROUNDING(MI, MRI, Helper); + case TargetOpcode::G_SET_ROUNDING: + return legalizeSETROUNDING(MI, MRI, Helper); } llvm_unreachable("expected switch to return"); } @@ -859,6 +863,134 @@ bool X86LegalizerInfo::legalizeGETROUNDING(MachineInstr &MI, return true; } +bool X86LegalizerInfo::legalizeSETROUNDING(MachineInstr &MI, + MachineRegisterInfo &MRI, + LegalizerHelper &Helper) const { + MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; + MachineFunction &MF = MIRBuilder.getMF(); + Register Src = MI.getOperand(0).getReg(); + const LLT s8 = LLT::scalar(8); + const LLT s16 = LLT::scalar(16); + const LLT s32 = LLT::scalar(32); + + // Allocate stack slot for control word and MXCSR (4 bytes). 
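Reviewer note: the reworked high-byte-register check above can be read as one predicate: AH/BH/CH/DH are only representable when the encoding neither is EVEX nor needs a REX-family prefix, and VEX/XOP stay exempt because they never emit REX. A simplified, hedged mirror of that logic (the enum and flags are illustrative, not the X86II/parser types):

    #include <cassert>

    // Illustrative encoding kinds; the real check uses X86II::EncodingMask.
    enum class Encoding { Legacy, VEX, XOP, EVEX };

    // Returns true when a high-byte register must be rejected, mirroring the
    // new condition in validateInstruction.
    bool mustRejectHighByteReg(bool UsesHighByteReg, Encoding Enc,
                               bool ForcedRex2, bool ForcedRex, bool NeedsRex) {
      if (!UsesHighByteReg || Enc == Encoding::VEX || Enc == Encoding::XOP)
        return false;                  // VEX/XOP never emit REX, so no conflict.
      return Enc == Encoding::EVEX || ForcedRex2 || ForcedRex || NeedsRex;
    }

    int main() {
      // Legacy encoding that independently requires REX cannot also use AH.
      assert(mustRejectHighByteReg(true, Encoding::Legacy, false, false, true));
      // Plain legacy encoding without REX remains fine.
      assert(!mustRejectHighByteReg(true, Encoding::Legacy, false, false, false));
      // EVEX always rejects high-byte registers.
      assert(mustRejectHighByteReg(true, Encoding::EVEX, false, false, false));
      return 0;
    }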
+ int MemSize = 4; + Align Alignment = Align(4); + MachinePointerInfo PtrInfo; + auto StackTemp = Helper.createStackTemporary(TypeSize::getFixed(MemSize), + Alignment, PtrInfo); + Register StackPtr = StackTemp.getReg(0); + + auto StoreMMO = + MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 2, Align(2)); + MIRBuilder.buildInstr(X86::G_FNSTCW16) + .addUse(StackPtr) + .addMemOperand(StoreMMO); + + auto LoadMMO = + MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 2, Align(2)); + auto CWD16 = MIRBuilder.buildLoad(s16, StackPtr, *LoadMMO); + + // Clear RM field (bits 11:10) + auto ClearedCWD = + MIRBuilder.buildAnd(s16, CWD16, MIRBuilder.buildConstant(s16, 0xf3ff)); + + // Check if Src is a constant + auto *SrcDef = MRI.getVRegDef(Src); + Register RMBits; + Register MXCSRRMBits; + + if (SrcDef && SrcDef->getOpcode() == TargetOpcode::G_CONSTANT) { + uint64_t RM = getIConstantFromReg(Src, MRI).getZExtValue(); + int FieldVal = X86::getRoundingModeX86(RM); + + if (FieldVal == X86::rmInvalid) { + FieldVal = X86::rmToNearest; + LLVMContext &C = MF.getFunction().getContext(); + C.diagnose(DiagnosticInfoUnsupported( + MF.getFunction(), "rounding mode is not supported by X86 hardware", + DiagnosticLocation(MI.getDebugLoc()), DS_Error)); + return false; + } + + FieldVal = FieldVal << 3; + RMBits = MIRBuilder.buildConstant(s16, FieldVal).getReg(0); + MXCSRRMBits = MIRBuilder.buildConstant(s32, FieldVal).getReg(0); + } else { + // Convert Src (rounding mode) to bits for control word + // (0xc9 << (2 * Src + 4)) & 0xc00 + auto Src32 = MIRBuilder.buildZExtOrTrunc(s32, Src); + auto ShiftAmt = MIRBuilder.buildAdd( + s32, MIRBuilder.buildShl(s32, Src32, MIRBuilder.buildConstant(s32, 1)), + MIRBuilder.buildConstant(s32, 4)); + auto ShiftAmt8 = MIRBuilder.buildTrunc(s8, ShiftAmt); + auto Shifted = MIRBuilder.buildShl(s16, MIRBuilder.buildConstant(s16, 0xc9), + ShiftAmt8); + RMBits = + MIRBuilder.buildAnd(s16, Shifted, MIRBuilder.buildConstant(s16, 0xc00)) + .getReg(0); + + // For non-constant case, we still need to compute MXCSR bits dynamically + auto RMBits32 = MIRBuilder.buildZExt(s32, RMBits); + MXCSRRMBits = + MIRBuilder.buildShl(s32, RMBits32, MIRBuilder.buildConstant(s32, 3)) + .getReg(0); + } + // Update rounding mode bits + auto NewCWD = + MIRBuilder.buildOr(s16, ClearedCWD, RMBits, MachineInstr::Disjoint); + + // Store new FP Control Word to stack + auto StoreNewMMO = + MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 2, Align(2)); + MIRBuilder.buildStore(NewCWD, StackPtr, *StoreNewMMO); + + // Load FP control word from the slot using G_FLDCW16 + auto LoadNewMMO = + MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 2, Align(2)); + MIRBuilder.buildInstr(X86::G_FLDCW16) + .addUse(StackPtr) + .addMemOperand(LoadNewMMO); + + if (Subtarget.hasSSE1()) { + // Store MXCSR to stack (use STMXCSR) + auto StoreMXCSRMMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOStore, 4, Align(4)); + MIRBuilder.buildInstr(TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS) + .addIntrinsicID(Intrinsic::x86_sse_stmxcsr) + .addUse(StackPtr) + .addMemOperand(StoreMXCSRMMO); + + // Load MXCSR from stack + auto LoadMXCSRMMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, 4, Align(4)); + auto MXCSR = MIRBuilder.buildLoad(s32, StackPtr, *LoadMXCSRMMO); + + // Clear RM field (bits 14:13) + auto ClearedMXCSR = MIRBuilder.buildAnd( + s32, MXCSR, MIRBuilder.buildConstant(s32, 0xffff9fff)); + + // Update rounding mode bits + auto NewMXCSR = MIRBuilder.buildOr(s32, ClearedMXCSR, 
MXCSRRMBits); + + // Store new MXCSR to stack + auto StoreNewMXCSRMMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOStore, 4, Align(4)); + MIRBuilder.buildStore(NewMXCSR, StackPtr, *StoreNewMXCSRMMO); + + // Load MXCSR from stack (use LDMXCSR) + auto LoadNewMXCSRMMO = MF.getMachineMemOperand( + PtrInfo, MachineMemOperand::MOLoad, 4, Align(4)); + MIRBuilder.buildInstr(TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS) + .addIntrinsicID(Intrinsic::x86_sse_ldmxcsr) + .addUse(StackPtr) + .addMemOperand(LoadNewMXCSRMMO); + } + + MI.eraseFromParent(); + return true; +} + bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { return true; diff --git a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h index 0003552d70ee0..09c727c8e8685 100644 --- a/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h +++ b/llvm/lib/Target/X86/GISel/X86LegalizerInfo.h @@ -57,6 +57,9 @@ class X86LegalizerInfo : public LegalizerInfo { bool legalizeGETROUNDING(MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const; + + bool legalizeSETROUNDING(MachineInstr &MI, MachineRegisterInfo &MRI, + LegalizerHelper &Helper) const; }; } // namespace llvm #endif diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp index e2a1bbf383b3c..a69a781bf070b 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp @@ -20,24 +20,22 @@ namespace llvm { namespace mca { -void X86InstrPostProcess::setMemBarriers(std::unique_ptr &Inst, - const MCInst &MCI) { +void X86InstrPostProcess::setMemBarriers(Instruction &Inst, const MCInst &MCI) { switch (MCI.getOpcode()) { case X86::MFENCE: - Inst->setLoadBarrier(true); - Inst->setStoreBarrier(true); + Inst.setLoadBarrier(true); + Inst.setStoreBarrier(true); break; case X86::LFENCE: - Inst->setLoadBarrier(true); + Inst.setLoadBarrier(true); break; case X86::SFENCE: - Inst->setStoreBarrier(true); + Inst.setStoreBarrier(true); break; } } -void X86InstrPostProcess::useStackEngine(std::unique_ptr &Inst, - const MCInst &MCI) { +void X86InstrPostProcess::useStackEngine(Instruction &Inst, const MCInst &MCI) { // TODO(boomanaiden154): We currently do not handle PUSHF/POPF because we // have not done the necessary benchmarking to see if they are also // optimized by the stack engine. @@ -46,18 +44,18 @@ void X86InstrPostProcess::useStackEngine(std::unique_ptr &Inst, // delay subsequent rsp using non-stack instructions. if (X86::isPOP(MCI.getOpcode()) || X86::isPUSH(MCI.getOpcode())) { auto *StackRegisterDef = - llvm::find_if(Inst->getDefs(), [](const WriteState &State) { + llvm::find_if(Inst.getDefs(), [](const WriteState &State) { return State.getRegisterID() == X86::RSP; }); assert( - StackRegisterDef != Inst->getDefs().end() && + StackRegisterDef != Inst.getDefs().end() && "Expected push instruction to implicitly use stack pointer register."); - Inst->getDefs().erase(StackRegisterDef); + Inst.getDefs().erase(StackRegisterDef); } } -void X86InstrPostProcess::postProcessInstruction( - std::unique_ptr &Inst, const MCInst &MCI) { +void X86InstrPostProcess::postProcessInstruction(Instruction &Inst, + const MCInst &MCI) { // Set IsALoadBarrier and IsAStoreBarrier flags. 
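Reviewer note: the legalizeSETROUNDING hunk above leans on a few x86 facts: the x87 control word keeps its rounding control in bits 11:10, MXCSR keeps the same two bits at 14:13 (the control-word field shifted left by 3), and for a non-constant mode the field can be computed branch-free as (0xc9 << (2*Mode + 4)) & 0xc00, where Mode follows the FLT_ROUNDS numbering used by llvm.set.rounding (0 toward zero, 1 to nearest, 2 upward, 3 downward). A small self-checking sketch of that arithmetic:

    #include <cassert>
    #include <cstdint>

    // FLT_ROUNDS-style mode -> x87 FPCW rounding-control bits (bits 11:10).
    uint32_t fpcwRMBits(uint32_t Mode) {
      return (0xc9u << (2 * Mode + 4)) & 0xc00u;
    }

    int main() {
      assert(fpcwRMBits(1) == 0x000);  // to nearest
      assert(fpcwRMBits(3) == 0x400);  // downward
      assert(fpcwRMBits(2) == 0x800);  // upward
      assert(fpcwRMBits(0) == 0xc00);  // toward zero
      // MXCSR rounding control lives at bits 14:13, i.e. the FPCW field << 3,
      // matching the shift the legalizer applies when deriving MXCSRRMBits.
      assert((fpcwRMBits(0) << 3) == 0x6000);
      return 0;
    }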
setMemBarriers(Inst, MCI); useStackEngine(Inst, MCI); diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h index c5459e42dfc9f..d6197f3344bbb 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h @@ -26,12 +26,12 @@ namespace mca { class X86InstrPostProcess : public InstrPostProcess { /// Called within X86InstrPostProcess to specify certain instructions /// as load and store barriers. - void setMemBarriers(std::unique_ptr &Inst, const MCInst &MCI); + void setMemBarriers(Instruction &Inst, const MCInst &MCI); /// Called within X86InstrPostPorcess to remove some rsp read operands /// on stack instructions to better simulate the stack engine. We currently /// do not model features of the stack engine like sync uops. - void useStackEngine(std::unique_ptr &Inst, const MCInst &MCI); + void useStackEngine(Instruction &Inst, const MCInst &MCI); public: X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) @@ -39,8 +39,7 @@ class X86InstrPostProcess : public InstrPostProcess { ~X86InstrPostProcess() = default; - void postProcessInstruction(std::unique_ptr &Inst, - const MCInst &MCI) override; + void postProcessInstruction(Instruction &Inst, const MCInst &MCI) override; }; } // namespace mca diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp index 33dc0a232815c..a1d4e0bc62310 100644 --- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp +++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp @@ -277,6 +277,22 @@ bool X86FixupInstTuningPass::processInstruction( return true; }; + // Is ADD(X,X) more efficient than SHL(X,1)? + auto ProcessShiftLeftToAdd = [&](unsigned AddOpc) -> bool { + if (MI.getOperand(NumOperands - 1).getImm() != 1) + return false; + if (!NewOpcPreferable(AddOpc, /*ReplaceInTie*/ true)) + return false; + LLVM_DEBUG(dbgs() << "Replacing: " << MI); + { + MI.setDesc(TII->get(AddOpc)); + MI.removeOperand(NumOperands - 1); + MI.addOperand(MI.getOperand(NumOperands - 2)); + } + LLVM_DEBUG(dbgs() << " With: " << MI); + return false; + }; + switch (Opc) { case X86::BLENDPDrri: return ProcessBLENDToMOV(X86::MOVSDrr, 0x3, 0x1); @@ -563,6 +579,44 @@ bool X86FixupInstTuningPass::processInstruction( return ProcessUNPCKPS(X86::VPUNPCKHDQZ256rmkz); case X86::VUNPCKHPSZrmkz: return ProcessUNPCKPS(X86::VPUNPCKHDQZrmkz); + + case X86::PSLLWri: + return ProcessShiftLeftToAdd(X86::PADDWrr); + case X86::VPSLLWri: + return ProcessShiftLeftToAdd(X86::VPADDWrr); + case X86::VPSLLWYri: + return ProcessShiftLeftToAdd(X86::VPADDWYrr); + case X86::VPSLLWZ128ri: + return ProcessShiftLeftToAdd(X86::VPADDWZ128rr); + case X86::VPSLLWZ256ri: + return ProcessShiftLeftToAdd(X86::VPADDWZ256rr); + case X86::VPSLLWZri: + return ProcessShiftLeftToAdd(X86::VPADDWZrr); + case X86::PSLLDri: + return ProcessShiftLeftToAdd(X86::PADDDrr); + case X86::VPSLLDri: + return ProcessShiftLeftToAdd(X86::VPADDDrr); + case X86::VPSLLDYri: + return ProcessShiftLeftToAdd(X86::VPADDDYrr); + case X86::VPSLLDZ128ri: + return ProcessShiftLeftToAdd(X86::VPADDDZ128rr); + case X86::VPSLLDZ256ri: + return ProcessShiftLeftToAdd(X86::VPADDDZ256rr); + case X86::VPSLLDZri: + return ProcessShiftLeftToAdd(X86::VPADDDZrr); + case X86::PSLLQri: + return ProcessShiftLeftToAdd(X86::PADDQrr); + case X86::VPSLLQri: + return ProcessShiftLeftToAdd(X86::VPADDQrr); + case X86::VPSLLQYri: + return ProcessShiftLeftToAdd(X86::VPADDQYrr); + case X86::VPSLLQZ128ri: + return 
ProcessShiftLeftToAdd(X86::VPADDQZ128rr); + case X86::VPSLLQZ256ri: + return ProcessShiftLeftToAdd(X86::VPADDQZ256rr); + case X86::VPSLLQZri: + return ProcessShiftLeftToAdd(X86::VPADDQZrr); + default: return false; } diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 08c9d738baceb..a66a3213403b4 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -53,6 +53,7 @@ X86FrameLowering::X86FrameLowering(const X86Subtarget &STI, STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) { // Cache a bunch of frame-related predicates for this subtarget. SlotSize = TRI->getSlotSize(); + assert(SlotSize == 4 || SlotSize == 8); Is64Bit = STI.is64Bit(); IsLP64 = STI.isTarget64BitLP64(); // standard x86_64 uses 64-bit frame/stack pointers, x32 - 32-bit. @@ -224,7 +225,7 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) { return false; } -constexpr int64_t MaxSPChunk = (1LL << 31) - 1; +constexpr uint64_t MaxSPChunk = (1ULL << 31) - 1; /// emitSPUpdate - Emit a series of instructions to increment / decrement the /// stack pointer by a constant value. @@ -245,8 +246,6 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, return; } - uint64_t Chunk = MaxSPChunk; - MachineFunction &MF = *MBB.getParent(); const X86Subtarget &STI = MF.getSubtarget(); const X86TargetLowering &TLI = *STI.getTargetLowering(); @@ -260,7 +259,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, // loop, by inlineStackProbe(). BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING)).addImm(Offset); return; - } else if (Offset > Chunk) { + } else if (Offset > MaxSPChunk) { // Rather than emit a long series of instructions for large offsets, // load the offset into a register and do one sub/add unsigned Reg = 0; @@ -284,7 +283,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, .addReg(Reg); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. return; - } else if (Offset > 8 * Chunk) { + } else if (Offset > 8 * MaxSPChunk) { // If we would need more than 8 add or sub instructions (a >16GB stack // frame), it's worth spilling RAX to materialize this immediate. // pushq %rax @@ -322,8 +321,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, } while (Offset) { - uint64_t ThisVal = std::min(Offset, Chunk); - if (ThisVal == SlotSize) { + if (Offset == SlotSize) { // Use push / pop for slot sized adjustments as a size optimization. We // need to find a dead register when using pop. unsigned Reg = isSub ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX) @@ -334,11 +332,12 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, BuildMI(MBB, MBBI, DL, TII.get(Opc)) .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)) .setMIFlag(Flag); - Offset -= ThisVal; - continue; + return; } } + uint64_t ThisVal = std::min(Offset, MaxSPChunk); + BuildStackAdjustment(MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue) .setMIFlag(Flag); @@ -445,7 +444,7 @@ int64_t X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB, return CalcNewOffset(0); FoundStackAdjust(PI, Offset); - if (std::abs((int64_t)CalcNewOffset(Offset)) < MaxSPChunk) + if ((uint64_t)std::abs((int64_t)CalcNewOffset(Offset)) < MaxSPChunk) break; if (doMergeWithPrevious ? 
(PI == MBB.begin()) : (PI == MBB.end())) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2feb76e0eb7b4..292eab77e2002 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4452,11 +4452,12 @@ static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, template SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef Ops, - F Builder, bool CheckBWI = true) { + F Builder, bool CheckBWI = true, + bool AllowAVX512 = true) { assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2"); unsigned NumSubs = 1; - if ((CheckBWI && Subtarget.useBWIRegs()) || - (!CheckBWI && Subtarget.useAVX512Regs())) { + if (AllowAVX512 && ((CheckBWI && Subtarget.useBWIRegs()) || + (!CheckBWI && Subtarget.useAVX512Regs()))) { if (VT.getSizeInBits() > 512) { NumSubs = VT.getSizeInBits() / 512; assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"); @@ -5346,6 +5347,19 @@ bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) { return false; } + +int getRoundingModeX86(unsigned RM) { + switch (static_cast<::llvm::RoundingMode>(RM)) { + // clang-format off + case ::llvm::RoundingMode::NearestTiesToEven: return X86::rmToNearest; break; + case ::llvm::RoundingMode::TowardNegative: return X86::rmDownward; break; + case ::llvm::RoundingMode::TowardPositive: return X86::rmUpward; break; + case ::llvm::RoundingMode::TowardZero: return X86::rmTowardZero; break; + default: + return X86::rmInvalid; // Invalid rounding mode + } +} + } // namespace X86 } // namespace llvm @@ -11721,10 +11735,19 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge( // we'll have to do 2x as many shuffles in order to achieve this, a 2-input // pre-shuffle first is a better strategy. if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) { + // If we don't have blends, see if we can create a cheap unpack. + if (!Subtarget.hasSSE41() && VT.is128BitVector() && + (is128BitUnpackShuffleMask(V1Mask, DAG) || + is128BitUnpackShuffleMask(V2Mask, DAG))) + if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack( + DL, VT, V1, V2, Mask, Subtarget, DAG)) + return PermUnpack; + // Only prefer immediate blends to unpack/rotate. - if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, - DAG, true)) + if (SDValue BlendPerm = + lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG, true)) return BlendPerm; + // If either input vector provides only a single element which is repeated // multiple times, unpacking from both input vectors would generate worse // code. e.g. for @@ -11736,13 +11759,16 @@ static SDValue lowerShuffleAsDecomposedShuffleMerge( if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG)) return UnpackPerm; + if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute( DL, VT, V1, V2, Mask, Subtarget, DAG)) return RotatePerm; + // Unpack/rotate failed - try again with variable blends. 
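Reviewer note: on the ProcessShiftLeftToAdd hook added to X86FixupInstTuning above (and the matching removal of the shl-by-1 DAG combine that follows): the rewrite is sound because a left shift by one is a self-addition in modular arithmetic, and the fixup only fires when the scheduling model says the PADD* form is at least as good as the immediate shift (the NewOpcPreferable check). A trivial self-check of the underlying identity:

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    int main() {
      // Per element, PSLL* by 1 and PADD* with a repeated operand agree:
      // (x << 1) == x + x  (mod 2^N), including the overflowing cases.
      for (uint32_t X : {0u, 1u, 0x7fffffffu, 0x80000000u, 0xffffffffu})
        assert(uint32_t(X << 1) == uint32_t(X + X));
      return 0;
    }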
if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) return BlendPerm; + if (VT.getScalarSizeInBits() >= 32) if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack( DL, VT, V1, V2, Mask, Subtarget, DAG)) @@ -28686,16 +28712,14 @@ SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op, SDValue RMBits; if (auto *CVal = dyn_cast(NewRM)) { uint64_t RM = CVal->getZExtValue(); - int FieldVal; - switch (static_cast(RM)) { - // clang-format off - case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break; - case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break; - case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break; - case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break; - default: - llvm_unreachable("rounding mode is not supported by X86 hardware"); - // clang-format on + int FieldVal = X86::getRoundingModeX86(RM); + + if (FieldVal == X86::rmInvalid) { + FieldVal = X86::rmToNearest; + LLVMContext &C = MF.getFunction().getContext(); + C.diagnose(DiagnosticInfoUnsupported( + MF.getFunction(), "rounding mode is not supported by X86 hardware", + DiagnosticLocation(DL.getDebugLoc()), DS_Error)); } RMBits = DAG.getConstant(FieldVal, DL, MVT::i16); } else { @@ -30289,22 +30313,8 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); - if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) { - // Hardware support for vector shifts is sparse which makes us scalarize the - // vector operations in many cases. Also, on sandybridge ADD is faster than - // shl: (shl V, 1) -> (add (freeze V), (freeze V)) - if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) { - // R may be undef at run-time, but (shl R, 1) must be an even number (LSB - // must be 0). (add undef, undef) however can be any value. To make this - // safe, we must freeze R to ensure that register allocation uses the same - // register for an undefined value. This ensures that the result will - // still be even and preserves the original semantics. - R = DAG.getFreeze(R); - return DAG.getNode(ISD::ADD, dl, VT, R, R); - } - + if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); - } // i64 SRA needs to be performed as partial shifts. if (((!Subtarget.hasXOP() && VT == MVT::v2i64) || @@ -31205,16 +31215,16 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, unsigned NumElts = VT.getVectorNumElements(); if (Subtarget.hasVBMI2() && EltSizeInBits > 8) { - if (IsFSHR) - std::swap(Op0, Op1); if (IsCstSplat) { + if (IsFSHR) + std::swap(Op0, Op1); uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits); SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8); return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, {Op0, Op1, Imm}, DAG, Subtarget); } - return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT, + return getAVX512Node(IsFSHR ? 
ISD::FSHR : ISD::FSHL, DL, VT, {Op0, Op1, Amt}, DAG, Subtarget); } assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 || @@ -35129,8 +35139,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VALIGN) NODE_NAME_CASE(VSHLD) NODE_NAME_CASE(VSHRD) - NODE_NAME_CASE(VSHLDV) - NODE_NAME_CASE(VSHRDV) NODE_NAME_CASE(PSHUFD) NODE_NAME_CASE(PSHUFHW) NODE_NAME_CASE(PSHUFLW) @@ -45162,10 +45170,13 @@ bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( case X86ISD::WrapperRIP: return true; case X86ISD::BLENDI: + case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: + case X86ISD::VPERMILPV: case X86ISD::VPERMILPI: + case X86ISD::VPERMV: case X86ISD::VPERMV3: { SmallVector Mask; SmallVector Ops; @@ -45228,10 +45239,13 @@ bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( case X86ISD::BLENDV: return false; // SSE target shuffles. + case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: + case X86ISD::VPERMILPV: case X86ISD::VPERMILPI: + case X86ISD::VPERMV: case X86ISD::VPERMV3: return false; // SSE comparisons handle all icmp/fcmp cases. @@ -46181,7 +46195,7 @@ static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, SDValue Zero = DAG.getConstant(0, DL, DpVT); return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1}, - DpBuilder, false); + DpBuilder, /*CheckBWI=*/false, Subtarget.hasVNNI()); } // Create a PSADBW given two sources representable as zexts of vXi8. @@ -58047,7 +58061,8 @@ static SDValue matchVPMADD52(SDNode *N, SelectionDAG &DAG, const SDLoc &DL, }; return SplitOpsAndApply(DAG, Subtarget, DL, VT, {Acc, X, Y}, VPMADD52Builder, - /*CheckBWI*/ false); + /*CheckBWI*/ false, + /*AllowAVX512*/ Subtarget.hasIFMA()); } static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 0c9ba591b03eb..b55556aadd867 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -471,8 +471,7 @@ namespace llvm { // VBMI2 Concat & Shift. VSHLD, VSHRD, - VSHLDV, - VSHRDV, + // Shuffle Packed Values at 128-bit granularity. SHUF128, MOVDDUP, @@ -1004,13 +1003,14 @@ namespace llvm { /// Current rounding mode is represented in bits 11:10 of FPSR. These /// values are same as corresponding constants for rounding mode used /// in glibc. - enum RoundingMode { - rmToNearest = 0, // FE_TONEAREST - rmDownward = 1 << 10, // FE_DOWNWARD - rmUpward = 2 << 10, // FE_UPWARD - rmTowardZero = 3 << 10, // FE_TOWARDZERO - rmMask = 3 << 10 // Bit mask selecting rounding mode - }; + enum RoundingMode { + rmInvalid = -1, // For handle Invalid rounding mode + rmToNearest = 0, // FE_TONEAREST + rmDownward = 1 << 10, // FE_DOWNWARD + rmUpward = 2 << 10, // FE_UPWARD + rmTowardZero = 3 << 10, // FE_TOWARDZERO + rmMask = 3 << 10 // Bit mask selecting rounding mode + }; } /// Define some predicates that are used for node matching. @@ -1058,6 +1058,10 @@ namespace llvm { /// functions. bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF); + + /// Convert LLVM rounding mode to X86 rounding mode. 
+ int getRoundingModeX86(unsigned RM); + } // end namespace X86 //===--------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index b8f299965faa3..564810cb4b88e 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3238,6 +3238,7 @@ multiclass avx512_load opc, string OpcodeStr, string Name, (_.VT _.RC:$src1), (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K, Sched<[Sched.RR]>; + let mayLoad = 1, canFoldAsLoad = 1 in def rmk : AVX512PI opc, string OpcodeStr, string Name, (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K, Sched<[Sched.RM]>; } + let mayLoad = 1, canFoldAsLoad = 1 in def rmkz : AVX512PI; // VBMI2 //===----------------------------------------------------------------------===// -multiclass VBMI2_shift_var_rm Op, string OpStr, SDNode OpNode, +multiclass VBMI2_shift_var_rm Op, string OpStr, SDNode OpNode, bit SwapLR, X86FoldableSchedWrite sched, X86VectorVTInfo VTI> { let Constraints = "$src1 = $dst", ExeDomain = VTI.ExeDomain in { defm r: AVX512_maskable_3src, + !if(SwapLR, + (VTI.VT (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src3))), + (VTI.VT (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src3))))>, T8, PD, EVEX, VVVV, Sched<[sched]>; defm m: AVX512_maskable_3src, + !if(SwapLR, + (VTI.VT (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.LdFrag addr:$src3)))), + (VTI.VT (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT (VTI.LdFrag addr:$src3)))))>, T8, PD, EVEX, VVVV, Sched<[sched.Folded, sched.ReadAfterFold]>; } } -multiclass VBMI2_shift_var_rmb Op, string OpStr, SDNode OpNode, +multiclass VBMI2_shift_var_rmb Op, string OpStr, SDNode OpNode, bit SwapLR, X86FoldableSchedWrite sched, X86VectorVTInfo VTI> - : VBMI2_shift_var_rm { + : VBMI2_shift_var_rm { let Constraints = "$src1 = $dst", ExeDomain = VTI.ExeDomain in defm mb: AVX512_maskable_3src, + !if(SwapLR, + (OpNode (VTI.VT VTI.RC:$src2), (VTI.VT VTI.RC:$src1), (VTI.VT (VTI.BroadcastLdFrag addr:$src3))), + (OpNode (VTI.VT VTI.RC:$src1), (VTI.VT VTI.RC:$src2), (VTI.VT (VTI.BroadcastLdFrag addr:$src3))))>, T8, PD, EVEX, VVVV, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; } -multiclass VBMI2_shift_var_rm_common Op, string OpStr, SDNode OpNode, +multiclass VBMI2_shift_var_rm_common Op, string OpStr, SDNode OpNode, bit SwapLR, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> { let Predicates = [HasVBMI2] in - defm Z : VBMI2_shift_var_rm, + defm Z : VBMI2_shift_var_rm, EVEX_V512; let Predicates = [HasVBMI2, HasVLX] in { - defm Z256 : VBMI2_shift_var_rm, + defm Z256 : VBMI2_shift_var_rm, EVEX_V256; - defm Z128 : VBMI2_shift_var_rm, + defm Z128 : VBMI2_shift_var_rm, EVEX_V128; } } -multiclass VBMI2_shift_var_rmb_common Op, string OpStr, SDNode OpNode, +multiclass VBMI2_shift_var_rmb_common Op, string OpStr, SDNode OpNode, bit SwapLR, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTI> { let Predicates = [HasVBMI2] in - defm Z : VBMI2_shift_var_rmb, + defm Z : VBMI2_shift_var_rmb, EVEX_V512; let Predicates = [HasVBMI2, HasVLX] in { - defm Z256 : VBMI2_shift_var_rmb, + defm Z256 : VBMI2_shift_var_rmb, EVEX_V256; - defm Z128 : VBMI2_shift_var_rmb, + defm Z128 : VBMI2_shift_var_rmb, EVEX_V128; } } multiclass VBMI2_shift_var wOp, bits<8> dqOp, string Prefix, - SDNode OpNode, X86SchedWriteWidths sched> { - defm W : VBMI2_shift_var_rm_common { + defm W : VBMI2_shift_var_rm_common, REX_W, EVEX_CD8<16, 
CD8VF>; - defm D : VBMI2_shift_var_rmb_common, EVEX_CD8<32, CD8VF>; - defm Q : VBMI2_shift_var_rmb_common, REX_W, EVEX_CD8<64, CD8VF>; } @@ -12379,8 +12385,8 @@ multiclass VBMI2_shift_imm wOp, bits<8> dqOp, string Prefix, } // Concat & Shift -defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", X86VShldv, SchedWriteVecIMul>; -defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", X86VShrdv, SchedWriteVecIMul>; +defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", fshl, 0, SchedWriteVecIMul>; +defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", fshr, 1, SchedWriteVecIMul>; defm VPSHLD : VBMI2_shift_imm<0x70, 0x71, "vpshld", X86VShld, SchedWriteVecIMul>; defm VPSHRD : VBMI2_shift_imm<0x72, 0x73, "vpshrd", X86VShrd, SchedWriteVecIMul>; diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index b476859069a57..031fdc1e7162c 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -25,18 +25,12 @@ let SchedRW = [WriteLEA] in { [(set GR32:$dst, lea32addr:$src)]>, OpSize32, Requires<[Not64BitMode]>; - let Predicates = [HasNDD], isCodeGenOnly = 1 in { - def LEA64_8r : I<0x8D, MRMSrcMem, (outs GR8:$dst), (ins lea64_8mem:$src), - "lea{b}\t{$src|$dst}, {$dst|$src}", - [(set GR8:$dst, lea64_iaddr:$src)]>, - OpSize16, - Requires<[In64BitMode]>; - - def LEA64_16r : I<0x8D, MRMSrcMem, (outs GR16:$dst), (ins lea64_16mem:$src), - "lea{w}\t{$src|$dst}, {$dst|$src}", - [(set GR16:$dst, lea64_iaddr:$src)]>, - OpSize16, - Requires<[In64BitMode]>; + let isCodeGenOnly = 1 in { + def LEA64_8r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_8mem:$src), + "lea{l}\t{$src|$dst}, {$dst|$src}", []>, OpSize32; + + def LEA64_16r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_16mem:$src), + "lea{l}\t{$src|$dst}, {$dst|$src}", []>, OpSize32; } def LEA64_32r : I<0x8D, MRMSrcMem, (outs GR32:$dst), (ins lea64_32mem:$src), @@ -51,6 +45,11 @@ let SchedRW = [WriteLEA] in { [(set GR64:$dst, lea64addr:$src)]>; } // SchedRW +let Predicates = [HasNDD] in { + def : Pat<(i8 lea64_iaddr:$src), (EXTRACT_SUBREG (LEA64_8r lea64_8mem:$src), sub_8bit)>; + def : Pat<(i16 lea64_iaddr:$src), (EXTRACT_SUBREG (LEA64_16r lea64_16mem:$src), sub_16bit)>; +} + // Pseudo instruction for lea that prevent optimizer from eliminating // the instruction. 
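Reviewer note: the new HasNDD patterns above select an i8/i16 address computation as a 32-bit LEA followed by EXTRACT_SUBREG. That is safe because address arithmetic is modular: the low 8 or 16 bits of the 32-bit result match what a narrow computation would produce. A short standalone check of that property (the operand values are arbitrary):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t Base = 0x12345678u, Index = 0xdeadbeefu, Disp = 0x7fu;
      uint32_t Wide = Base + Index * 4 + Disp;   // what the 32-bit LEA computes
      // Truncating the wide result equals doing the arithmetic at 8/16 bits.
      assert(uint8_t(Wide) ==
             uint8_t(uint8_t(Base) + uint8_t(Index) * 4 + uint8_t(Disp)));
      assert(uint16_t(Wide) ==
             uint16_t(uint16_t(Base) + uint16_t(Index) * 4 + uint16_t(Disp)));
      return 0;
    }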
let SchedRW = [WriteLEA], isPseudo = true, hasSideEffects = 1 in { diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 0c20ffed77e77..5321ecf0c1b2c 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -406,16 +406,6 @@ def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; def X86VShld : SDNode<"X86ISD::VSHLD", SDTShuff3OpI>; def X86VShrd : SDNode<"X86ISD::VSHRD", SDTShuff3OpI>; -def X86VShldv : SDNode<"X86ISD::VSHLDV", - SDTypeProfile<1, 3, [SDTCisVec<0>, - SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, - SDTCisSameAs<0,3>]>>; -def X86VShrdv : SDNode<"X86ISD::VSHRDV", - SDTypeProfile<1, 3, [SDTCisVec<0>, - SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>, - SDTCisSameAs<0,3>]>>; def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>; diff --git a/llvm/lib/Target/X86/X86InstrGISel.td b/llvm/lib/Target/X86/X86InstrGISel.td index 39198214037a3..b0c6bb6f61ad8 100644 --- a/llvm/lib/Target/X86/X86InstrGISel.td +++ b/llvm/lib/Target/X86/X86InstrGISel.td @@ -34,6 +34,14 @@ def G_FNSTCW16 : X86GenericInstruction { let mayStore = true; } +def G_FLDCW16 : X86GenericInstruction { + let OutOperandList = (outs); + let InOperandList = (ins ptype0:$src); + let hasSideEffects = true; + let mayLoad = true; +} + def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 58d526269ff3c..1d2cd39951bf4 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -44,6 +44,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" +#include #include using namespace llvm; @@ -755,7 +756,7 @@ static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) { return isPICBase; } -bool X86InstrInfo::isReallyTriviallyReMaterializable( +bool X86InstrInfo::isReMaterializableImpl( const MachineInstr &MI) const { switch (MI.getOpcode()) { default: @@ -951,7 +952,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable( break; } } - return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); + return TargetInstrInfo::isReMaterializableImpl(MI); } void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, @@ -2573,10 +2574,13 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VCMPPSZ256rri: case X86::VCMPPDZrrik: case X86::VCMPPSZrrik: + case X86::VCMPPHZrrik: case X86::VCMPPDZ128rrik: case X86::VCMPPSZ128rrik: + case X86::VCMPPHZ128rrik: case X86::VCMPPDZ256rrik: case X86::VCMPPSZ256rrik: + case X86::VCMPPHZ256rrik: WorkingMI = CloneIfNew(MI); WorkingMI->getOperand(MI.getNumExplicitOperands() - 1) .setImm(X86::getSwappedVCMPImm( @@ -2830,10 +2834,13 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI, case X86::VCMPPSZ256rri: case X86::VCMPPDZrrik: case X86::VCMPPSZrrik: + case X86::VCMPPHZrrik: case X86::VCMPPDZ128rrik: case X86::VCMPPSZ128rrik: + case X86::VCMPPHZ128rrik: case X86::VCMPPDZ256rrik: - case X86::VCMPPSZ256rrik: { + case X86::VCMPPSZ256rrik: + case X86::VCMPPHZ256rrik: { unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 1 : 0; // Float comparison can be safely commuted for @@ -8106,6 +8113,39 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI, LiveIntervals *LIS) const { + // If LoadMI is a masked load, check MI having the same mask. 
+ const MCInstrDesc &MCID = get(LoadMI.getOpcode()); + unsigned NumOps = MCID.getNumOperands(); + if (NumOps >= 3) { + Register MaskReg; + const MachineOperand &Op1 = LoadMI.getOperand(1); + const MachineOperand &Op2 = LoadMI.getOperand(2); + + auto IsVKWMClass = [](const TargetRegisterClass *RC) { + return RC == &X86::VK2WMRegClass || RC == &X86::VK4WMRegClass || + RC == &X86::VK8WMRegClass || RC == &X86::VK16WMRegClass || + RC == &X86::VK32WMRegClass || RC == &X86::VK64WMRegClass; + }; + + if (Op1.isReg() && IsVKWMClass(getRegClass(MCID, 1, &RI))) + MaskReg = Op1.getReg(); + else if (Op2.isReg() && IsVKWMClass(getRegClass(MCID, 2, &RI))) + MaskReg = Op2.getReg(); + + if (MaskReg) { + bool HasSameMask = false; + for (unsigned I = 1, E = MI.getDesc().getNumOperands(); I < E; ++I) { + const MachineOperand &Op = MI.getOperand(I); + if (Op.isReg() && Op.getReg() == MaskReg) { + HasSameMask = true; + break; + } + } + if (!HasSameMask) + return nullptr; + } + } + // TODO: Support the case where LoadMI loads a wide register, but MI // only uses a subreg. for (auto Op : Ops) { @@ -8114,7 +8154,6 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( } // If loading from a FrameIndex, fold directly from the FrameIndex. - unsigned NumOps = LoadMI.getDesc().getNumOperands(); int FrameIndex; if (isLoadFromStackSlot(LoadMI, FrameIndex)) { if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF)) diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 86133b3d969b1..5f75559bd9598 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -340,7 +340,7 @@ class X86InstrInfo final : public X86GenInstrInfo { Register isStoreToStackSlotPostFE(const MachineInstr &MI, int &FrameIndex) const override; - bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override; + bool isReMaterializableImpl(const MachineInstr &MI) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 40de36d81ddd2..ee1fec0da3d73 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -29,6 +29,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" @@ -1283,11 +1284,19 @@ void StrNCmpInliner::inlineCompare(Value *LHS, StringRef RHS, uint64_t N, Value *VR = ConstantInt::get(CI->getType(), static_cast(RHS[i])); Value *Sub = Swapped ? 
B.CreateSub(VR, VL) : B.CreateSub(VL, VR); - if (i < N - 1) - B.CreateCondBr(B.CreateICmpNE(Sub, ConstantInt::get(CI->getType(), 0)), - BBNE, BBSubs[i + 1]); - else + if (i < N - 1) { + BranchInst *CondBrInst = B.CreateCondBr( + B.CreateICmpNE(Sub, ConstantInt::get(CI->getType(), 0)), BBNE, + BBSubs[i + 1]); + + Function *F = CI->getFunction(); + assert(F && "Instruction does not belong to a function!"); + std::optional EC = F->getEntryCount(); + if (EC && EC->getCount() > 0) + setExplicitlyUnknownBranchWeights(*CondBrInst, DEBUG_TYPE); + } else { B.CreateBr(BBNE); + } Phi->addIncoming(Sub, BBSubs[i]); } diff --git a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp index c00e9c7bbee06..81efca9dfd209 100644 --- a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -75,8 +75,8 @@ bool Lowerer::lower(Function &F) { case Intrinsic::coro_subfn_addr: lowerSubFn(Builder, cast(II)); break; - case Intrinsic::coro_end: case Intrinsic::coro_suspend_retcon: + case Intrinsic::coro_is_in_ramp: if (IsPrivateAndUnprocessed) { II->replaceAllUsesWith(PoisonValue::get(II->getType())); } else diff --git a/llvm/lib/Transforms/Coroutines/CoroCloner.h b/llvm/lib/Transforms/Coroutines/CoroCloner.h index d1887980fb3bc..26ec4f3ed6a8c 100644 --- a/llvm/lib/Transforms/Coroutines/CoroCloner.h +++ b/llvm/lib/Transforms/Coroutines/CoroCloner.h @@ -120,6 +120,7 @@ class BaseCloner { void replaceRetconOrAsyncSuspendUses(); void replaceCoroSuspends(); void replaceCoroEnds(); + void replaceCoroIsInRamp(); void replaceSwiftErrorOps(); void salvageDebugInfo(); void handleFinalSuspend(); diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 02c38d02cff64..c2d7bcc346776 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -213,7 +213,7 @@ static bool replaceCoroEndAsync(AnyCoroEndInst *End) { /// Replace a non-unwind call to llvm.coro.end. static void replaceFallthroughCoroEnd(AnyCoroEndInst *End, const coro::Shape &Shape, Value *FramePtr, - bool InResume, CallGraph *CG) { + bool InRamp, CallGraph *CG) { // Start inserting right before the coro.end. IRBuilder<> Builder(End); @@ -225,7 +225,7 @@ static void replaceFallthroughCoroEnd(AnyCoroEndInst *End, "switch coroutine should not return any values"); // coro.end doesn't immediately end the coroutine in the main function // in this lowering, because we need to deallocate the coroutine. - if (!InResume) + if (InRamp) return; Builder.CreateRetVoid(); break; @@ -345,8 +345,7 @@ static void markCoroutineAsDone(IRBuilder<> &Builder, const coro::Shape &Shape, /// Replace an unwind call to llvm.coro.end. static void replaceUnwindCoroEnd(AnyCoroEndInst *End, const coro::Shape &Shape, - Value *FramePtr, bool InResume, - CallGraph *CG) { + Value *FramePtr, bool InRamp, CallGraph *CG) { IRBuilder<> Builder(End); switch (Shape.ABI) { @@ -359,7 +358,7 @@ static void replaceUnwindCoroEnd(AnyCoroEndInst *End, const coro::Shape &Shape, // FIXME: We should refactor this once there is other language // which uses Switch-Resumed style other than C++. 
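For context on the StrNCmpInliner change above: the inliner expands a small strncmp/strcmp call into a chain of byte subtractions with one conditional branch per byte, and the patch only attaches explicitly-unknown branch weights to those branches when the caller has a non-zero entry count. A rough scalar sketch of the expanded shape follows; this is illustrative C++ only, not the IR the pass actually emits.

#include <cassert>

// Scalarized shape of a strncmp(s, "ab", 2) expansion: compare byte by byte,
// branching out as soon as a difference is found. Each "if" below corresponds
// to one of the conditional branches that now receives explicitly-unknown
// branch weights.
static int inlinedStrncmp2(const char *LHS) {
  const char RHS[2] = {'a', 'b'};
  for (int I = 0; I < 2; ++I) {
    int Sub = (unsigned char)LHS[I] - (unsigned char)RHS[I];
    if (Sub != 0)
      return Sub;
  }
  return 0;
}

int main() {
  assert(inlinedStrncmp2("ab") == 0);
  assert(inlinedStrncmp2("ac") > 0);
  assert(inlinedStrncmp2("aa") < 0);
  return 0;
}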
markCoroutineAsDone(Builder, Shape, FramePtr); - if (!InResume) + if (InRamp) return; break; } @@ -383,15 +382,11 @@ static void replaceUnwindCoroEnd(AnyCoroEndInst *End, const coro::Shape &Shape, } static void replaceCoroEnd(AnyCoroEndInst *End, const coro::Shape &Shape, - Value *FramePtr, bool InResume, CallGraph *CG) { + Value *FramePtr, bool InRamp, CallGraph *CG) { if (End->isUnwind()) - replaceUnwindCoroEnd(End, Shape, FramePtr, InResume, CG); + replaceUnwindCoroEnd(End, Shape, FramePtr, InRamp, CG); else - replaceFallthroughCoroEnd(End, Shape, FramePtr, InResume, CG); - - auto &Context = End->getContext(); - End->replaceAllUsesWith(InResume ? ConstantInt::getTrue(Context) - : ConstantInt::getFalse(Context)); + replaceFallthroughCoroEnd(End, Shape, FramePtr, InRamp, CG); End->eraseFromParent(); } @@ -558,7 +553,16 @@ void coro::BaseCloner::replaceCoroEnds() { // We use a null call graph because there's no call graph node for // the cloned function yet. We'll just be rebuilding that later. auto *NewCE = cast(VMap[CE]); - replaceCoroEnd(NewCE, Shape, NewFramePtr, /*in resume*/ true, nullptr); + replaceCoroEnd(NewCE, Shape, NewFramePtr, /*in ramp*/ false, nullptr); + } +} + +void coro::BaseCloner::replaceCoroIsInRamp() { + auto &Ctx = OrigF.getContext(); + for (auto *II : Shape.CoroIsInRampInsts) { + auto *NewII = cast(VMap[II]); + NewII->replaceAllUsesWith(ConstantInt::getFalse(Ctx)); + NewII->eraseFromParent(); } } @@ -1077,6 +1081,8 @@ void coro::BaseCloner::create() { // Remove coro.end intrinsics. replaceCoroEnds(); + replaceCoroIsInRamp(); + // Salvage debug info that points into the coroutine frame. salvageDebugInfo(); } @@ -1956,14 +1962,19 @@ class PrettyStackTraceFunction : public PrettyStackTraceEntry { static void removeCoroEndsFromRampFunction(const coro::Shape &Shape) { if (Shape.ABI != coro::ABI::Switch) { for (auto *End : Shape.CoroEnds) { - replaceCoroEnd(End, Shape, Shape.FramePtr, /*in resume*/ false, nullptr); + replaceCoroEnd(End, Shape, Shape.FramePtr, /*in ramp*/ true, nullptr); } } else { - for (llvm::AnyCoroEndInst *End : Shape.CoroEnds) { - auto &Context = End->getContext(); - End->replaceAllUsesWith(ConstantInt::getFalse(Context)); + for (llvm::AnyCoroEndInst *End : Shape.CoroEnds) End->eraseFromParent(); - } + } +} + +static void removeCoroIsInRampFromRampFunction(const coro::Shape &Shape) { + for (auto *II : Shape.CoroIsInRampInsts) { + auto &Ctx = II->getContext(); + II->replaceAllUsesWith(ConstantInt::getTrue(Ctx)); + II->eraseFromParent(); } } @@ -2028,6 +2039,7 @@ static void doSplitCoroutine(Function &F, SmallVectorImpl &Clones, coro::salvageDebugInfo(ArgToAllocaMap, *DVR, false /*UseEntryValue*/); removeCoroEndsFromRampFunction(Shape); + removeCoroIsInRampFromRampFunction(Shape); if (shouldCreateNoAllocVariant) SwitchCoroutineSplitter::createNoAllocVariant(F, Shape, Clones); diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index 28a89a8f87dbd..47c2d0d462e00 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -93,6 +93,7 @@ static Intrinsic::ID NonOverloadedCoroIntrinsics[] = { Intrinsic::coro_save, Intrinsic::coro_subfn_addr, Intrinsic::coro_suspend, + Intrinsic::coro_is_in_ramp, }; bool coro::isSuspendBlock(BasicBlock *BB) { @@ -275,6 +276,9 @@ void coro::Shape::analyze(Function &F, } } break; + case Intrinsic::coro_is_in_ramp: + CoroIsInRampInsts.push_back(cast(II)); + break; case Intrinsic::coro_promise: assert(CoroPromise == 
nullptr && "CoroEarly must ensure coro.promise unique"); diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 7b67e60f7cc61..15f4d76300bff 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -4526,6 +4526,16 @@ void CallsiteContextGraph:: // If Clone not already assigned to a function clone: // Assign to first function clone without assignment // Assign caller to selected function clone +// For each call with graph Node having clones: +// If number func clones > number call's callsite Node clones: +// Record func CallInfo clones without Node clone in UnassignedCallClones +// For callsite Nodes in DFS order from allocations: +// If IsAllocation: +// Update allocation with alloc type +// Else: +// For Call, all MatchingCalls, and associated UnnassignedCallClones: +// Update call to call recorded callee clone +// template bool CallsiteContextGraph::assignFunctions() { bool Changed = false; @@ -4553,6 +4563,34 @@ bool CallsiteContextGraph::assignFunctions() { DenseMap CallMap; }; + // Map to keep track of information needed to update calls in function clones + // when their corresponding callsite node was not itself cloned for that + // function clone. Because of call context pruning (i.e. we only keep as much + // caller information as needed to distinguish hot vs cold), we may not have + // caller edges coming to each callsite node from all possible function + // callers. A function clone may get created for other callsites in the + // function for which there are caller edges that were not pruned. Any other + // callsites in that function clone, which were not themselved cloned for + // that function clone, should get updated the same way as the corresponding + // callsite in the original function (which may call a clone of its callee). + // + // We build this map after completing function cloning for each function, so + // that we can record the information from its call maps before they are + // destructed. The map will be used as we update calls to update any still + // unassigned call clones. Note that we may create new node clones as we clone + // other functions, so later on we check which node clones were still not + // created. To this end, the inner map is a map from function clone number to + // the list of calls cloned for that function (can be more than one due to the + // Node's MatchingCalls array). + // + // The alternative is creating new callsite clone nodes below as we clone the + // function, but that is tricker to get right and likely more overhead. + // + // Inner map is a std::map so sorted by key (clone number), in order to get + // ordered remarks in the full LTO case. + DenseMap>> + UnassignedCallClones; + // Walk all functions for which we saw calls with memprof metadata, and handle // cloning for each of its calls. for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { @@ -4996,6 +5034,63 @@ bool CallsiteContextGraph::assignFunctions() { } } } + + if (FuncCloneInfos.size() < 2) + continue; + + // In this case there is more than just the original function copy. + // Record call clones of any callsite nodes in the function that did not + // themselves get cloned for all of the function clones. 
+ for (auto &Call : CallsWithMetadata) { + ContextNode *Node = getNodeForInst(Call); + if (!Node || !Node->hasCall() || Node->emptyContextIds()) + continue; + // If Node has enough clones already to cover all function clones, we can + // skip it. Need to add one for the original copy. + // Use >= in case there were clones that were skipped due to having empty + // context ids + if (Node->Clones.size() + 1 >= FuncCloneInfos.size()) + continue; + // First collect all function clones we cloned this callsite node for. + // They may not be sequential due to empty clones e.g. + DenseSet NodeCallClones; + for (auto *C : Node->Clones) + NodeCallClones.insert(C->Call.cloneNo()); + unsigned I = 0; + // Now check all the function clones. + for (auto &FC : FuncCloneInfos) { + // Function clones should be sequential. + assert(FC.FuncClone.cloneNo() == I); + // Skip the first clone which got the original call. + // Also skip any other clones created for this Node. + if (++I == 1 || NodeCallClones.contains(I)) { + continue; + } + // Record the call clones created for this callsite in this function + // clone. + auto &CallVector = UnassignedCallClones[Node][I]; + DenseMap &CallMap = FC.CallMap; + if (auto It = CallMap.find(Call); It != CallMap.end()) { + CallInfo CallClone = It->second; + CallVector.push_back(CallClone); + } else { + // All but the original clone (skipped earlier) should have an entry + // for all calls. + assert(false && "Expected to find call in CallMap"); + } + // Need to do the same for all matching calls. + for (auto &MatchingCall : Node->MatchingCalls) { + if (auto It = CallMap.find(MatchingCall); It != CallMap.end()) { + CallInfo CallClone = It->second; + CallVector.push_back(CallClone); + } else { + // All but the original clone (skipped earlier) should have an entry + // for all calls. + assert(false && "Expected to find call in CallMap"); + } + } + } + } } uint8_t BothTypes = @@ -5057,6 +5152,26 @@ bool CallsiteContextGraph::assignFunctions() { // Update all the matching calls as well. for (auto &Call : Node->MatchingCalls) updateCall(Call, CalleeFunc); + + // Now update all calls recorded earlier that are still in function clones + // which don't have a clone of this callsite node. + if (!UnassignedCallClones.contains(Node)) + return; + DenseSet NodeCallClones; + for (auto *C : Node->Clones) + NodeCallClones.insert(C->Call.cloneNo()); + // Note that we already confirmed Node is in this map a few lines above. + auto &ClonedCalls = UnassignedCallClones[Node]; + for (auto &[CloneNo, CallVector] : ClonedCalls) { + // Should start at 1 as we never create an entry for original node. + assert(CloneNo > 0); + // If we subsequently created a clone, skip this one. + if (NodeCallClones.contains(CloneNo)) + continue; + // Use the original Node's CalleeFunc. 
+ for (auto &Call : CallVector) + updateCall(Call, CalleeFunc); + } }; // Performs DFS traversal starting from allocation nodes to update calls to diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 6ad493772d170..cf6d0ecab4f69 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -64,6 +64,7 @@ #include "llvm/Support/KnownBits.h" #include "llvm/Support/KnownFPClass.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/TypeSize.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/InstCombine/InstCombiner.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" @@ -2405,6 +2406,22 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { matchBSwapOrBitReverse(*II, /*MatchBSwaps*/ true, /*MatchBitReversals*/ true)) return BitOp; + + // R = fshl(X, X, C2) + // fshl(R, R, C1) --> fshl(X, X, (C1 + C2) % bitsize) + Value *InnerOp; + const APInt *ShAmtInnerC, *ShAmtOuterC; + if (match(Op0, m_FShl(m_Value(InnerOp), m_Deferred(InnerOp), + m_APInt(ShAmtInnerC))) && + match(ShAmtC, m_APInt(ShAmtOuterC)) && Op0 == Op1) { + APInt Sum = *ShAmtOuterC + *ShAmtInnerC; + APInt Modulo = Sum.urem(APInt(Sum.getBitWidth(), BitWidth)); + if (Modulo.isZero()) + return replaceInstUsesWith(*II, InnerOp); + Constant *ModuloC = ConstantInt::get(Ty, Modulo); + return CallInst::Create(cast(Op0)->getCalledFunction(), + {InnerOp, InnerOp, ModuloC}); + } } // fshl(X, X, Neg(Y)) --> fshr(X, X, Y) @@ -3412,6 +3429,10 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { !isPowerOf2_64(RK.ArgValue) || !isa(RK.IRArgValue)) continue; + // Remove align 1 bundles; they don't add any useful information. + if (RK.ArgValue == 1) + return CallBase::removeOperandBundle(II, OBU.getTagID()); + // Don't try to remove align assumptions for pointers derived from // arguments. We might lose information if the function gets inline and // the align argument attribute disappears. 
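The new InstCombine fold shown above rewrites a rotate of a rotate into a single rotate by the summed amount modulo the bit width (fshl with both data operands equal is a rotate-left). Below is a quick standalone C++ check of that identity over all 32-bit shift-amount pairs for one sample value; it is independent of LLVM and only verifies the arithmetic.

#include <cassert>
#include <cstdint>

// Identity behind the fold: rotl(rotl(X, C2), C1) == rotl(X, (C1 + C2) % 32).
static uint32_t rotl32(uint32_t X, uint32_t S) {
  S %= 32;
  return S == 0 ? X : (X << S) | (X >> (32 - S));
}

int main() {
  uint32_t X = 0x12345678u;
  for (uint32_t C1 = 0; C1 < 32; ++C1)
    for (uint32_t C2 = 0; C2 < 32; ++C2)
      assert(rotl32(rotl32(X, C2), C1) == rotl32(X, (C1 + C2) % 32));
  return 0;
}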
@@ -3761,6 +3782,17 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { return replaceInstUsesWith(CI, Res); } } + + // vector.reduce.add.vNiM(splat(%x)) -> mul(%x, N) + if (Value *Splat = getSplatValue(Arg)) { + ElementCount VecToReduceCount = + cast(Arg->getType())->getElementCount(); + if (VecToReduceCount.isFixed()) { + unsigned VectorSize = VecToReduceCount.getFixedValue(); + return BinaryOperator::CreateMul( + Splat, ConstantInt::get(Splat->getType(), VectorSize)); + } + } } [[fallthrough]]; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 53e77e6cc5c31..9491610190c10 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -338,8 +338,18 @@ bool PointerReplacer::collectUsers() { if (!TryPushInstOperand(TrueInst) || !TryPushInstOperand(FalseInst)) return false; } else if (auto *GEP = dyn_cast(Inst)) { - UsersToReplace.insert(GEP); - PushUsersToWorklist(GEP); + auto *PtrOp = dyn_cast(GEP->getPointerOperand()); + if (!PtrOp) + return false; + if (isAvailable(PtrOp)) { + UsersToReplace.insert(GEP); + PushUsersToWorklist(GEP); + continue; + } + + Worklist.emplace_back(GEP); + if (!TryPushInstOperand(PtrOp)) + return false; } else if (auto *MI = dyn_cast(Inst)) { if (MI->isVolatile()) return false; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 4ea75409252bd..b6b3a95f35c76 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -4611,5 +4611,15 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { return replaceOperand(SI, 2, ConstantInt::get(FalseVal->getType(), 0)); } + Value *MaskedLoadPtr; + const APInt *MaskedLoadAlignment; + if (match(TrueVal, m_OneUse(m_MaskedLoad(m_Value(MaskedLoadPtr), + m_APInt(MaskedLoadAlignment), + m_Specific(CondVal), m_Value())))) + return replaceInstUsesWith( + SI, Builder.CreateMaskedLoad(TrueVal->getType(), MaskedLoadPtr, + Align(MaskedLoadAlignment->getZExtValue()), + CondVal, FalseVal)); + return nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index b17cf17db1580..6ef30663bf3ce 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -723,6 +723,11 @@ static bool replaceExtractElements(InsertElementInst *InsElt, NumExtElts >= NumInsElts) return false; + Value *ExtVecOp = ExtElt->getVectorOperand(); + // Bail out on constant vectors. + if (isa(ExtVecOp)) + return false; + // Create a shuffle mask to widen the extended-from vector using poison // values. The mask selects all of the values of the original vector followed // by as many poison values as needed to create a vector of the same length @@ -733,7 +738,6 @@ static bool replaceExtractElements(InsertElementInst *InsElt, for (unsigned i = NumExtElts; i < NumInsElts; ++i) ExtendMask.push_back(-1); - Value *ExtVecOp = ExtElt->getVectorOperand(); auto *ExtVecOpInst = dyn_cast(ExtVecOp); BasicBlock *InsertionBlock = (ExtVecOpInst && !isa(ExtVecOpInst)) ? 
ExtVecOpInst->getParent() diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 42c3d4a4f4c46..cdae9a7271915 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -29,6 +29,7 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/StackSafetyAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/Demangle/Demangle.h" @@ -803,7 +804,8 @@ struct AddressSanitizer { bool ignoreAccess(Instruction *Inst, Value *Ptr); void getInterestingMemoryOperands( - Instruction *I, SmallVectorImpl &Interesting); + Instruction *I, SmallVectorImpl &Interesting, + const TargetTransformInfo *TTI); void instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, InterestingMemoryOperand &O, bool UseCalls, @@ -843,7 +845,8 @@ struct AddressSanitizer { void instrumentMemIntrinsic(MemIntrinsic *MI, RuntimeCallInserter &RTCI); Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); bool suppressInstrumentationSiteForDebug(int &Instrumented); - bool instrumentFunction(Function &F, const TargetLibraryInfo *TLI); + bool instrumentFunction(Function &F, const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI); bool maybeInsertAsanInitAtFunctionEntry(Function &F); bool maybeInsertDynamicShadowAtFunctionEntry(Function &F); void markEscapedLocalAllocas(Function &F); @@ -1314,7 +1317,8 @@ PreservedAnalyses AddressSanitizerPass::run(Module &M, Options.MaxInlinePoisoningSize, Options.CompileKernel, Options.Recover, Options.UseAfterScope, Options.UseAfterReturn); const TargetLibraryInfo &TLI = FAM.getResult(F); - Modified |= FunctionSanitizer.instrumentFunction(F, &TLI); + const TargetTransformInfo &TTI = FAM.getResult(F); + Modified |= FunctionSanitizer.instrumentFunction(F, &TLI, &TTI); } Modified |= ModuleSanitizer.instrumentModule(); if (!Modified) @@ -1452,7 +1456,8 @@ bool AddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) { } void AddressSanitizer::getInterestingMemoryOperands( - Instruction *I, SmallVectorImpl &Interesting) { + Instruction *I, SmallVectorImpl &Interesting, + const TargetTransformInfo *TTI) { // Do not instrument the load fetching the dynamic shadow address. if (LocalDynamicShadow == I) return; @@ -1570,6 +1575,12 @@ void AddressSanitizer::getInterestingMemoryOperands( break; } default: + if (auto *II = dyn_cast(I)) { + MemIntrinsicInfo IntrInfo; + if (TTI->getTgtMemIntrinsic(II, IntrInfo)) + Interesting = IntrInfo.InterestingOperands; + return; + } for (unsigned ArgNo = 0; ArgNo < CI->arg_size(); ArgNo++) { if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) || ignoreAccess(I, CI->getArgOperand(ArgNo))) @@ -1775,6 +1786,25 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, else NumInstrumentedReads++; + if (O.MaybeByteOffset) { + Type *Ty = Type::getInt8Ty(*C); + IRBuilder IB(O.getInsn()); + + Value *OffsetOp = O.MaybeByteOffset; + if (TargetTriple.isRISCV()) { + Type *OffsetTy = OffsetOp->getType(); + // RVV indexed loads/stores zero-extend offset operands which are narrower + // than XLEN to XLEN. 
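Regarding the RVV note above: because the narrow offset vector is treated as unsigned by the indexed access, the instrumentation has to widen it with a zero-extend before adding it to the base pointer; a sign-extended reconstruction would compute a different address whenever the top bit of the narrow index is set. A small standalone C++ illustration with made-up values:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Base = 0x1000;
  uint32_t Idx = 0x80000001u; // top bit set in the narrow index type
  uint64_t ZExtAddr = Base + (uint64_t)Idx;                   // unsigned widening
  uint64_t SExtAddr = Base + (uint64_t)(int64_t)(int32_t)Idx; // would be wrong
  assert(ZExtAddr == 0x80001001ull);
  assert(SExtAddr != ZExtAddr);
  return 0;
}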
+ if (OffsetTy->getScalarType()->getIntegerBitWidth() < + static_cast(LongSize)) { + VectorType *OrigType = cast(OffsetTy); + Type *ExtendTy = VectorType::get(IntptrTy, OrigType); + OffsetOp = IB.CreateZExt(OffsetOp, ExtendTy); + } + } + Addr = IB.CreateGEP(Ty, Addr, {OffsetOp}); + } + unsigned Granularity = 1 << Mapping.Scale; if (O.MaybeMask) { instrumentMaskedLoadOrStore(this, DL, IntptrTy, O.MaybeMask, O.MaybeEVL, @@ -2985,7 +3015,8 @@ bool AddressSanitizer::suppressInstrumentationSiteForDebug(int &Instrumented) { } bool AddressSanitizer::instrumentFunction(Function &F, - const TargetLibraryInfo *TLI) { + const TargetLibraryInfo *TLI, + const TargetTransformInfo *TTI) { bool FunctionModified = false; // Do not apply any instrumentation for naked functions. @@ -3038,7 +3069,7 @@ bool AddressSanitizer::instrumentFunction(Function &F, if (Inst.hasMetadata(LLVMContext::MD_nosanitize)) continue; SmallVector InterestingOperands; - getInterestingMemoryOperands(&Inst, InterestingOperands); + getInterestingMemoryOperands(&Inst, InterestingOperands, TTI); if (!InterestingOperands.empty()) { for (auto &Operand : InterestingOperands) { diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index c14bbecf0d4e1..7c78eb35a865a 100644 --- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -1591,7 +1591,16 @@ static void insertTrivialPHIs(CHRScope *Scope, } TrivialPHIs.insert(PN); CHR_DEBUG(dbgs() << "Insert phi " << *PN << "\n"); + bool FoundLifetimeAnnotation = false; for (Instruction *UI : Users) { + // If we found a lifetime annotation, remove it, but set a flag + // to ensure that we remove all other lifetime annotations attached + // to the alloca. + if (UI->isLifetimeStartOrEnd()) { + UI->eraseFromParent(); + FoundLifetimeAnnotation = true; + continue; + } for (unsigned J = 0, NumOps = UI->getNumOperands(); J < NumOps; ++J) { if (UI->getOperand(J) == &I) { UI->setOperand(J, PN); @@ -1599,6 +1608,14 @@ static void insertTrivialPHIs(CHRScope *Scope, } CHR_DEBUG(dbgs() << "Updated user " << *UI << "\n"); } + // Erase any leftover lifetime annotations for a dynamic alloca. + if (FoundLifetimeAnnotation) { + for (User *U : make_early_inc_range(I.users())) { + if (auto *UI = dyn_cast(U)) + if (UI->isLifetimeStartOrEnd()) + UI->eraseFromParent(); + } + } } } } @@ -1693,14 +1710,12 @@ void CHR::transformScopes(CHRScope *Scope, DenseSet &TrivialPHIs) { BasicBlock *ExitBlock = LastRegion->getExit(); std::optional ProfileCount = BFI.getBlockProfileCount(EntryBlock); - if (ExitBlock) { - // Insert a trivial phi at the exit block (where the CHR hot path and the - // cold path merges) for a value that's defined in the scope but used - // outside it (meaning it's alive at the exit block). We will add the - // incoming values for the CHR cold paths to it below. Without this, we'd - // miss updating phi's for such values unless there happens to already be a - // phi for that value there. - insertTrivialPHIs(Scope, EntryBlock, ExitBlock, TrivialPHIs); + SmallVector StaticAllocas; + for (Instruction &I : *EntryBlock) { + if (auto *AI = dyn_cast(&I)) { + if (AI->isStaticAlloca()) + StaticAllocas.push_back(AI); + } } // Split the entry block of the first region. 
The new block becomes the new @@ -1719,6 +1734,20 @@ void CHR::transformScopes(CHRScope *Scope, DenseSet &TrivialPHIs) { FirstRegion->replaceEntryRecursive(NewEntryBlock); BasicBlock *PreEntryBlock = EntryBlock; + // Move static allocas into the pre-entry block so they stay static. + for (AllocaInst *AI : StaticAllocas) + AI->moveBefore(EntryBlock->begin()); + + if (ExitBlock) { + // Insert a trivial phi at the exit block (where the CHR hot path and the + // cold path merges) for a value that's defined in the scope but used + // outside it (meaning it's alive at the exit block). We will add the + // incoming values for the CHR cold paths to it below. Without this, we'd + // miss updating phi's for such values unless there happens to already be a + // phi for that value there. + insertTrivialPHIs(Scope, EntryBlock, ExitBlock, TrivialPHIs); + } + ValueToValueMapTy VMap; // Clone the blocks in the scope (excluding the PreEntryBlock) to split into a // hot path (originals) and a cold path (clones) and update the PHIs at the diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 61fef1387d82a..480ff4a8c3cb9 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -572,7 +572,8 @@ class DataFlowSanitizer { const uint64_t NumOfElementsInArgOrgTLS = ArgTLSSize / OriginWidthBytes; public: - DataFlowSanitizer(const std::vector &ABIListFiles); + DataFlowSanitizer(const std::vector &ABIListFiles, + IntrusiveRefCntPtr FS); bool runImpl(Module &M, llvm::function_ref GetTLI); @@ -867,12 +868,11 @@ bool LibAtomicFunction(const Function &F) { } // end anonymous namespace DataFlowSanitizer::DataFlowSanitizer( - const std::vector &ABIListFiles) { + const std::vector &ABIListFiles, + IntrusiveRefCntPtr FS) { std::vector AllABIListFiles(std::move(ABIListFiles)); llvm::append_range(AllABIListFiles, ClABIListFiles); - // FIXME: should we propagate vfs::FileSystem to this constructor? - ABIList.set( - SpecialCaseList::createOrDie(AllABIListFiles, *vfs::getRealFileSystem())); + ABIList.set(SpecialCaseList::createOrDie(AllABIListFiles, *FS)); CombineTaintLookupTableNames.insert_range(ClCombineTaintLookupTables); } @@ -3471,7 +3471,7 @@ PreservedAnalyses DataFlowSanitizerPass::run(Module &M, AM.getResult(M).getManager(); return FAM.getResult(F); }; - if (!DataFlowSanitizer(ABIListFiles).runImpl(M, GetTLI)) + if (!DataFlowSanitizer(ABIListFiles, FS).runImpl(M, GetTLI)) return PreservedAnalyses::all(); PreservedAnalyses PA = PreservedAnalyses::none(); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 7933604b8ac25..b988957dfbc08 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4926,36 +4926,56 @@ struct MemorySanitizerVisitor : public InstVisitor { // <2 x double> @llvm.x86.avx512.rcp14.pd.128 // (<2 x double>, <2 x double>, i8) // + // <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512 + // (<8 x double>, i32, <8 x double>, i8, i32) + // A Imm WriteThru Mask Rounding + // + // All operands other than A and WriteThru (e.g., Mask, Imm, Rounding) must + // be fully initialized. + // // Dst[i] = Mask[i] ? some_op(A[i]) : WriteThru[i] // Dst_shadow[i] = Mask[i] ? 
all_or_nothing(A_shadow[i]) : WriteThru_shadow[i] - void handleAVX512VectorGenericMaskedFP(IntrinsicInst &I) { + void handleAVX512VectorGenericMaskedFP(IntrinsicInst &I, unsigned AIndex, + unsigned WriteThruIndex, + unsigned MaskIndex) { IRBuilder<> IRB(&I); - assert(I.arg_size() == 3); - Value *A = I.getOperand(0); - Value *WriteThrough = I.getOperand(1); - Value *Mask = I.getOperand(2); + unsigned NumArgs = I.arg_size(); + assert(AIndex < NumArgs); + assert(WriteThruIndex < NumArgs); + assert(MaskIndex < NumArgs); + assert(AIndex != WriteThruIndex); + assert(AIndex != MaskIndex); + assert(WriteThruIndex != MaskIndex); + + Value *A = I.getOperand(AIndex); + Value *WriteThru = I.getOperand(WriteThruIndex); + Value *Mask = I.getOperand(MaskIndex); assert(isFixedFPVector(A)); - assert(isFixedFPVector(WriteThrough)); + assert(isFixedFPVector(WriteThru)); [[maybe_unused]] unsigned ANumElements = cast(A->getType())->getNumElements(); unsigned OutputNumElements = - cast(WriteThrough->getType())->getNumElements(); + cast(WriteThru->getType())->getNumElements(); assert(ANumElements == OutputNumElements); - assert(Mask->getType()->isIntegerTy()); - // Some bits of the mask might be unused, but check them all anyway - // (typically the mask is an integer constant). - insertCheckShadowOf(Mask, &I); + for (unsigned i = 0; i < NumArgs; ++i) { + if (i != AIndex && i != WriteThruIndex) { + // Imm, Mask, Rounding etc. are "control" data, hence we require that + // they be fully initialized. + assert(I.getOperand(i)->getType()->isIntegerTy()); + insertCheckShadowOf(I.getOperand(i), &I); + } + } // The mask has 1 bit per element of A, but a minimum of 8 bits. if (Mask->getType()->getScalarSizeInBits() == 8 && ANumElements < 8) Mask = IRB.CreateTrunc(Mask, Type::getIntNTy(*MS.C, ANumElements)); assert(Mask->getType()->getScalarSizeInBits() == ANumElements); - assert(I.getType() == WriteThrough->getType()); + assert(I.getType() == WriteThru->getType()); Mask = IRB.CreateBitCast( Mask, FixedVectorType::get(IRB.getInt1Ty(), OutputNumElements)); @@ -4966,9 +4986,9 @@ struct MemorySanitizerVisitor : public InstVisitor { AShadow = IRB.CreateSExt(IRB.CreateICmpNE(AShadow, getCleanShadow(AShadow)), AShadow->getType()); - Value *WriteThroughShadow = getShadow(WriteThrough); + Value *WriteThruShadow = getShadow(WriteThru); - Value *Shadow = IRB.CreateSelect(Mask, AShadow, WriteThroughShadow); + Value *Shadow = IRB.CreateSelect(Mask, AShadow, WriteThruShadow); setShadow(&I, Shadow); setOriginForNaryOp(I); @@ -6202,7 +6222,8 @@ struct MemorySanitizerVisitor : public InstVisitor { case Intrinsic::x86_avx512fp16_mask_rsqrt_ph_512: case Intrinsic::x86_avx512fp16_mask_rsqrt_ph_256: case Intrinsic::x86_avx512fp16_mask_rsqrt_ph_128: - handleAVX512VectorGenericMaskedFP(I); + handleAVX512VectorGenericMaskedFP(I, /*AIndex=*/0, /*WriteThruIndex=*/1, + /*MaskIndex=*/2); break; // AVX512/AVX10 Reciprocal Square Root @@ -6253,7 +6274,64 @@ struct MemorySanitizerVisitor : public InstVisitor { case Intrinsic::x86_avx512fp16_mask_rcp_ph_512: case Intrinsic::x86_avx512fp16_mask_rcp_ph_256: case Intrinsic::x86_avx512fp16_mask_rcp_ph_128: - handleAVX512VectorGenericMaskedFP(I); + handleAVX512VectorGenericMaskedFP(I, /*AIndex=*/0, /*WriteThruIndex=*/1, + /*MaskIndex=*/2); + break; + + // <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512 + // (<32 x half>, i32, <32 x half>, i32, i32) + // <16 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.256 + // (<16 x half>, i32, <16 x half>, i32, i16) + // <8 x half> 
@llvm.x86.avx512fp16.mask.rndscale.ph.128 + // (<8 x half>, i32, <8 x half>, i32, i8) + // + // <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512 + // (<16 x float>, i32, <16 x float>, i16, i32) + // <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256 + // (<8 x float>, i32, <8 x float>, i8) + // <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128 + // (<4 x float>, i32, <4 x float>, i8) + // + // <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512 + // (<8 x double>, i32, <8 x double>, i8, i32) + // A Imm WriteThru Mask Rounding + // <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256 + // (<4 x double>, i32, <4 x double>, i8) + // <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128 + // (<2 x double>, i32, <2 x double>, i8) + // A Imm WriteThru Mask + // + // <32 x bfloat> @llvm.x86.avx10.mask.rndscale.bf16.512 + // (<32 x bfloat>, i32, <32 x bfloat>, i32) + // <16 x bfloat> @llvm.x86.avx10.mask.rndscale.bf16.256 + // (<16 x bfloat>, i32, <16 x bfloat>, i16) + // <8 x bfloat> @llvm.x86.avx10.mask.rndscale.bf16.128 + // (<8 x bfloat>, i32, <8 x bfloat>, i8) + // + // Not supported: three vectors + // - <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh + // (<8 x half>, <8 x half>,<8 x half>, i8, i32, i32) + // - <4 x float> @llvm.x86.avx512.mask.rndscale.ss + // (<4 x float>, <4 x float>, <4 x float>, i8, i32, i32) + // - <2 x double> @llvm.x86.avx512.mask.rndscale.sd + // (<2 x double>, <2 x double>, <2 x double>, i8, i32, + // i32) + // A B WriteThru Mask Imm + // Rounding + case Intrinsic::x86_avx512fp16_mask_rndscale_ph_512: + case Intrinsic::x86_avx512fp16_mask_rndscale_ph_256: + case Intrinsic::x86_avx512fp16_mask_rndscale_ph_128: + case Intrinsic::x86_avx512_mask_rndscale_ps_512: + case Intrinsic::x86_avx512_mask_rndscale_ps_256: + case Intrinsic::x86_avx512_mask_rndscale_ps_128: + case Intrinsic::x86_avx512_mask_rndscale_pd_512: + case Intrinsic::x86_avx512_mask_rndscale_pd_256: + case Intrinsic::x86_avx512_mask_rndscale_pd_128: + case Intrinsic::x86_avx10_mask_rndscale_bf16_512: + case Intrinsic::x86_avx10_mask_rndscale_bf16_256: + case Intrinsic::x86_avx10_mask_rndscale_bf16_128: + handleAVX512VectorGenericMaskedFP(I, /*AIndex=*/0, /*WriteThruIndex=*/2, + /*MaskIndex=*/3); break; // AVX512 FP16 Arithmetic diff --git a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 8555ef5c22f82..e54a2e54f9943 100644 --- a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -163,7 +163,7 @@ bool AlignmentFromAssumptionsPass::extractAlignmentInfo(CallInst *I, const SCEV *&OffSCEV) { Type *Int64Ty = Type::getInt64Ty(I->getContext()); OperandBundleUse AlignOB = I->getOperandBundleAt(Idx); - if (AlignOB.getTagName() != "align") + if (AlignOB.getTagID() != LLVMContext::OB_align) return false; assert(AlignOB.Inputs.size() >= 2); AAPtr = AlignOB.Inputs[0].get(); diff --git a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp index c2e58ba393553..89980d54ee897 100644 --- a/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp +++ b/llvm/lib/Transforms/Scalar/DropUnnecessaryAssumes.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/DropUnnecessaryAssumes.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/IntrinsicInst.h" @@ -16,36 +17,119 @@ using 
namespace llvm; using namespace llvm::PatternMatch; +static bool affectedValuesAreEphemeral(ArrayRef Affected) { + // Check whether all the uses are ephemeral, i.e. recursively only used + // by assumes. In that case, the assume does not provide useful information. + // Note that additional users may appear as a result of inlining and CSE, + // so we should only make this assumption late in the optimization pipeline. + SmallSetVector Worklist; + auto AddUsers = [&](Value *V) { + for (User *U : V->users()) { + // Bail out if we need to inspect too many users. + if (Worklist.size() >= 32) + return false; + Worklist.insert(cast(U)); + } + return true; + }; + + for (Value *V : Affected) { + // Do not handle assumes on globals for now. The use list for them may + // contain uses in other functions. + if (!isa(V)) + return false; + + if (!AddUsers(V)) + return false; + } + + for (unsigned Idx = 0; Idx < Worklist.size(); ++Idx) { + Instruction *I = Worklist[Idx]; + + // Use in assume is ephemeral. + if (isa(I)) + continue; + + // Use in side-effecting instruction is non-ephemeral. + if (I->mayHaveSideEffects() || I->isTerminator()) + return false; + + // Otherwise, recursively look at the users. + if (!AddUsers(I)) + return false; + } + + return true; +} + PreservedAnalyses DropUnnecessaryAssumesPass::run(Function &F, FunctionAnalysisManager &FAM) { AssumptionCache &AC = FAM.getResult(F); bool Changed = false; - for (AssumptionCache::ResultElem &Elem : AC.assumptions()) { - auto *Assume = cast_or_null(Elem.Assume); + for (const WeakVH &Elem : AC.assumptions()) { + auto *Assume = cast_or_null(Elem); if (!Assume) continue; - // TODO: Handle assumes with operand bundles. - if (Assume->hasOperandBundles()) + if (Assume->hasOperandBundles()) { + // Handle operand bundle assumptions. + SmallVector DeadBundleArgs; + SmallVector KeptBundles; + unsigned NumBundles = Assume->getNumOperandBundles(); + for (unsigned I = 0; I != NumBundles; ++I) { + auto IsDead = [](OperandBundleUse Bundle) { + // "ignore" operand bundles are always dead. + if (Bundle.getTagName() == "ignore") + return true; + + // Bundles without arguments do not affect any specific values. + // Always keep them for now. + if (Bundle.Inputs.empty()) + return false; + + SmallVector Affected; + AssumptionCache::findValuesAffectedByOperandBundle( + Bundle, [&](Value *A) { Affected.push_back(A); }); + + return affectedValuesAreEphemeral(Affected); + }; + + OperandBundleUse Bundle = Assume->getOperandBundleAt(I); + if (IsDead(Bundle)) + append_range(DeadBundleArgs, Bundle.Inputs); + else + KeptBundles.emplace_back(Bundle); + } + + if (KeptBundles.size() != NumBundles) { + if (KeptBundles.empty()) { + // All operand bundles are dead, remove the whole assume. + Assume->eraseFromParent(); + } else { + // Otherwise only drop the dead operand bundles. + CallBase *NewAssume = + CallBase::Create(Assume, KeptBundles, Assume->getIterator()); + AC.registerAssumption(cast(NewAssume)); + Assume->eraseFromParent(); + } + + RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadBundleArgs); + Changed = true; + } continue; + } Value *Cond = Assume->getArgOperand(0); // Don't drop type tests, which have special semantics. if (match(Cond, m_Intrinsic())) continue; - SmallPtrSet Affected; + SmallVector Affected; findValuesAffectedByCondition(Cond, /*IsAssume=*/true, - [&](Value *A) { Affected.insert(A); }); - - // If all the affected uses have only one use (part of the assume), then - // the assume does not provide useful information. 
Note that additional - // users may appear as a result of inlining and CSE, so we should only - // make this assumption late in the optimization pipeline. - // TODO: Handle dead cyclic usages. - // TODO: Handle multiple dead assumes on the same value. - if (!all_of(Affected, match_fn(m_OneUse(m_Value())))) + [&](Value *A) { Affected.push_back(A); }); + + if (!affectedValuesAreEphemeral(Affected)) continue; Assume->eraseFromParent(); diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp index b60b15b6c3a2b..995b80396b8af 100644 --- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp +++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp @@ -57,7 +57,8 @@ static bool tryToImproveAlign( cast(II->getArgOperand(AlignOpIdx))->getAlignValue(); Align PrefAlign = DL.getPrefTypeAlign(Type); Align NewAlign = Fn(PtrOp, OldAlign, PrefAlign); - if (NewAlign <= OldAlign) + if (NewAlign <= OldAlign || + NewAlign.value() > std::numeric_limits().max()) return false; Value *V = diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index b5eb647a042b9..2073303237f69 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -100,6 +100,7 @@ STATISTIC(OnlySecondCandidateIsGuarded, "The second candidate is guarded while the first one is not"); STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions."); STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions."); +STATISTIC(NumDA, "DA checks passed"); enum FusionDependenceAnalysisChoice { FUSION_DEPENDENCE_ANALYSIS_SCEV, @@ -1371,6 +1372,47 @@ struct LoopFuser { << "\n"); } #endif + unsigned Levels = DepResult->getLevels(); + unsigned SameSDLevels = DepResult->getSameSDLevels(); + unsigned CurLoopLevel = FC0.L->getLoopDepth(); + + // Check if DA is missing info regarding the current loop level + if (CurLoopLevel > Levels + SameSDLevels) + return false; + + // Iterating over the outer levels. + for (unsigned Level = 1; Level <= std::min(CurLoopLevel - 1, Levels); + ++Level) { + unsigned Direction = DepResult->getDirection(Level, false); + + // Check if the direction vector does not include equality. If an outer + // loop has a non-equal direction, outer indices are different and it + // is safe to fuse. + if (!(Direction & Dependence::DVEntry::EQ)) { + LLVM_DEBUG(dbgs() << "Safe to fuse due to non-equal accesses in the " + "outer loops\n"); + NumDA++; + return true; + } + } + + assert(CurLoopLevel > Levels && "Fusion candidates are not separated"); + + unsigned CurDir = DepResult->getDirection(CurLoopLevel, true); + + // Check if the direction vector does not include greater direction. In + // that case, the dependency is not a backward loop-carried dependency and + // it is legal to fuse. For example here we have a forward dependency + // for (int i = 0; i < n; i++) + // A[i] = ...; + // for (int i = 0; i < n; i++) + // ...
= A[i-1]; + if (!(CurDir & Dependence::DVEntry::GT)) { + LLVM_DEBUG(dbgs() << "Safe to fuse with no backward loop-carried " + "dependency\n"); + NumDA++; + return true; + } if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) LLVM_DEBUG( diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 08446ccaa9fca..28ae4f0a0aad9 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -260,6 +260,17 @@ static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Dep.push_back('I'); } + // If all the elements of any direction vector have only '*', legality + // can't be proven. Exit early to save compile time. + if (all_of(Dep, [](char C) { return C == '*'; })) { + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence", + L->getStartLoc(), L->getHeader()) + << "All loops have dependencies in all directions."; + }); + return false; + } + // Test whether the dependency is forward or not. bool IsKnownForward = true; if (Src->getParent() != Dst->getParent()) { diff --git a/llvm/lib/Transforms/Utils/ProfileVerify.cpp b/llvm/lib/Transforms/Utils/ProfileVerify.cpp index faacd422c009c..c578b4b839258 100644 --- a/llvm/lib/Transforms/Utils/ProfileVerify.cpp +++ b/llvm/lib/Transforms/Utils/ProfileVerify.cpp @@ -28,6 +28,10 @@ static cl::opt AnnotateSelect("profcheck-annotate-select", cl::init(true), cl::desc("Also inject (if missing) and verify MD_prof for " "`select` instructions")); +static cl::opt + WeightsForTest("profcheck-weights-for-test", cl::init(false), + cl::desc("Generate weights with small values for tests.")); + static cl::opt SelectTrueWeight( "profcheck-default-select-true-weight", cl::init(2U), cl::desc("When annotating `select` instructions, this value will be used " @@ -91,6 +95,10 @@ bool ProfileInjector::inject() { if (F.getEntryCount(/*AllowSynthetic=*/true)->getCount() == 0) return false; bool Changed = false; + // Cycle through the weights list. If we didn't, tests with more than (say) + // one conditional branch would have the same !prof metadata on all of them, + // and numerically that may make for a poor unit test. 
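The -profcheck-weights-for-test mode described above hands out weights by walking a fixed table of small primes and bumping the starting offset once per annotated terminator, so consecutive branches in a test do not get identical !prof metadata. A minimal standalone C++ model of that scheme follows; the 5-entry table here is shortened for illustration, while the patch uses a longer prime list.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  static const std::array<uint32_t, 5> Primes{3, 5, 7, 11, 13};
  uint32_t Offset = 0;
  // First conditional branch (2 successors):
  uint32_t B1[2] = {Primes[(Offset + 0) % Primes.size()],
                    Primes[(Offset + 1) % Primes.size()]};
  ++Offset;
  // Second conditional branch:
  uint32_t B2[2] = {Primes[(Offset + 0) % Primes.size()],
                    Primes[(Offset + 1) % Primes.size()]};
  assert(B1[0] == 3 && B1[1] == 5);
  assert(B2[0] == 5 && B2[1] == 7);
  assert(!(B1[0] == B2[0] && B1[1] == B2[1])); // weights differ between branches
  return 0;
}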
+ uint32_t WeightsForTestOffset = 0; for (auto &BB : F) { if (AnnotateSelect) { for (auto &I : BB) { @@ -103,38 +111,48 @@ bool ProfileInjector::inject() { if (!Term || Term->getMetadata(LLVMContext::MD_prof)) continue; SmallVector Probs; - Probs.reserve(Term->getNumSuccessors()); - for (auto I = 0U, E = Term->getNumSuccessors(); I < E; ++I) - Probs.emplace_back(BPI.getEdgeProbability(&BB, Term->getSuccessor(I))); - assert(llvm::find_if(Probs, - [](const BranchProbability &P) { - return P.isUnknown(); - }) == Probs.end() && - "All branch probabilities should be valid"); - const auto *FirstZeroDenominator = - find_if(Probs, [](const BranchProbability &P) { - return P.getDenominator() == 0; - }); - (void)FirstZeroDenominator; - assert(FirstZeroDenominator == Probs.end()); - const auto *FirstNonZeroNumerator = - find_if(Probs, [](const BranchProbability &P) { return !P.isZero(); }); - assert(FirstNonZeroNumerator != Probs.end()); - DynamicAPInt LCM(Probs[0].getDenominator()); - DynamicAPInt GCD(FirstNonZeroNumerator->getNumerator()); - for (const auto &Prob : drop_begin(Probs)) { - if (!Prob.getNumerator()) - continue; - LCM = llvm::lcm(LCM, DynamicAPInt(Prob.getDenominator())); - GCD = llvm::gcd(GCD, DynamicAPInt(Prob.getNumerator())); - } SmallVector Weights; Weights.reserve(Term->getNumSuccessors()); - for (const auto &Prob : Probs) { - DynamicAPInt W = - (Prob.getNumerator() * LCM / GCD) / Prob.getDenominator(); - Weights.emplace_back(static_cast((int64_t)W)); + if (WeightsForTest) { + static const std::array Primes{3, 5, 7, 11, 13, 17, 19, 23, 29, 31, + 37, 41, 43, 47, 53, 59, 61, 67, 71}; + for (uint32_t I = 0, E = Term->getNumSuccessors(); I < E; ++I) + Weights.emplace_back( + Primes[(WeightsForTestOffset + I) % Primes.size()]); + ++WeightsForTestOffset; + } else { + Probs.reserve(Term->getNumSuccessors()); + for (auto I = 0U, E = Term->getNumSuccessors(); I < E; ++I) + Probs.emplace_back(BPI.getEdgeProbability(&BB, Term->getSuccessor(I))); + + assert(llvm::find_if(Probs, + [](const BranchProbability &P) { + return P.isUnknown(); + }) == Probs.end() && + "All branch probabilities should be valid"); + const auto *FirstZeroDenominator = + find_if(Probs, [](const BranchProbability &P) { + return P.getDenominator() == 0; + }); + (void)FirstZeroDenominator; + assert(FirstZeroDenominator == Probs.end()); + const auto *FirstNonZeroNumerator = find_if( + Probs, [](const BranchProbability &P) { return !P.isZero(); }); + assert(FirstNonZeroNumerator != Probs.end()); + DynamicAPInt LCM(Probs[0].getDenominator()); + DynamicAPInt GCD(FirstNonZeroNumerator->getNumerator()); + for (const auto &Prob : drop_begin(Probs)) { + if (!Prob.getNumerator()) + continue; + LCM = llvm::lcm(LCM, DynamicAPInt(Prob.getDenominator())); + GCD = llvm::gcd(GCD, DynamicAPInt(Prob.getNumerator())); + } + for (const auto &Prob : Probs) { + DynamicAPInt W = + (Prob.getNumerator() * LCM / GCD) / Prob.getDenominator(); + Weights.emplace_back(static_cast((int64_t)W)); + } } setBranchWeights(*Term, Weights, /*IsExpected=*/false); Changed = true; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index a1f759dd1df83..216bdf4eb9efb 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -525,28 +525,33 @@ static bool dominatesMergePoint( static ConstantInt *getConstantInt(Value *V, const DataLayout &DL) { // Normal constant int. 
ConstantInt *CI = dyn_cast(V); - if (CI || !isa(V) || !V->getType()->isPointerTy() || - DL.isNonIntegralPointerType(V->getType())) + if (CI || !isa(V) || !V->getType()->isPointerTy()) return CI; + // It is not safe to look through inttoptr or ptrtoint when using unstable + // pointer types. + if (DL.hasUnstableRepresentation(V->getType())) + return nullptr; + // This is some kind of pointer constant. Turn it into a pointer-sized // ConstantInt if possible. - IntegerType *PtrTy = cast(DL.getIntPtrType(V->getType())); + IntegerType *IntPtrTy = cast(DL.getIntPtrType(V->getType())); // Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*). if (isa(V)) - return ConstantInt::get(PtrTy, 0); + return ConstantInt::get(IntPtrTy, 0); - // IntToPtr const int. + // IntToPtr const int, we can look through this if the semantics of + // inttoptr for this address space are a simple (truncating) bitcast. if (ConstantExpr *CE = dyn_cast(V)) if (CE->getOpcode() == Instruction::IntToPtr) if (ConstantInt *CI = dyn_cast(CE->getOperand(0))) { // The constant is very likely to have the right type already. - if (CI->getType() == PtrTy) + if (CI->getType() == IntPtrTy) return CI; else return cast( - ConstantFoldIntegerCast(CI, PtrTy, /*isSigned=*/false, DL)); + ConstantFoldIntegerCast(CI, IntPtrTy, /*isSigned=*/false, DL)); } return nullptr; } @@ -866,10 +871,12 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(Instruction *TI) { } } - // Unwrap any lossless ptrtoint cast. + // Unwrap any lossless ptrtoint cast (except for unstable pointers). if (CV) { if (PtrToIntInst *PTII = dyn_cast(CV)) { Value *Ptr = PTII->getPointerOperand(); + if (DL.hasUnstableRepresentation(Ptr->getType())) + return CV; if (PTII->getType() == DL.getIntPtrType(Ptr->getType())) CV = Ptr; } @@ -1427,6 +1434,8 @@ bool SimplifyCFGOpt::performValueComparisonIntoPredecessorFolding( Builder.SetInsertPoint(PTI); // Convert pointer to int before we switch. if (CV->getType()->isPointerTy()) { + assert(!DL.hasUnstableRepresentation(CV->getType()) && + "Should not end up here with unstable pointers"); CV = Builder.CreatePtrToInt(CV, DL.getIntPtrType(CV->getType()), "magicptr"); } @@ -5246,6 +5255,8 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI, Builder.SetInsertPoint(BI); // Convert pointer to int before we switch. if (CompVal->getType()->isPointerTy()) { + assert(!DL.hasUnstableRepresentation(CompVal->getType()) && + "Should not end up here with unstable pointers"); CompVal = Builder.CreatePtrToInt( CompVal, DL.getIntPtrType(CompVal->getType()), "magicptr"); } @@ -6318,9 +6329,12 @@ static bool initializeUniqueCases(SwitchInst *SI, PHINode *&PHI, // Helper function that checks if it is possible to transform a switch with only // two cases (or two cases + default) that produces a result into a select. // TODO: Handle switches with more than 2 cases that map to the same result. +// The branch weights correspond to the provided Condition (i.e. if Condition is +// modified from the original SwitchInst, the caller must adjust the weights) static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector, Constant *DefaultResult, Value *Condition, - IRBuilder<> &Builder, const DataLayout &DL) { + IRBuilder<> &Builder, const DataLayout &DL, + ArrayRef BranchWeights) { // If we are selecting between only two cases transform into a simple // select or a two-way select if default is possible. 
// Example: @@ -6329,6 +6343,10 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector, // case 20: return 2; ----> %2 = icmp eq i32 %a, 20 // default: return 4; %3 = select i1 %2, i32 2, i32 %1 // } + + const bool HasBranchWeights = + !BranchWeights.empty() && !ProfcheckDisableMetadataFixes; + + if (ResultVector.size() == 2 && ResultVector[0].second.size() == 1 && ResultVector[1].second.size() == 1) { ConstantInt *FirstCase = ResultVector[0].second[0]; @@ -6339,11 +6357,35 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector, Builder.CreateICmpEQ(Condition, SecondCase, "switch.selectcmp"); SelectValue = Builder.CreateSelect(ValueCompare, ResultVector[1].first, DefaultResult, "switch.select"); + if (auto *SI = dyn_cast(SelectValue); + SI && HasBranchWeights) { + // We start with 3 probabilities, where the numerator is the + // corresponding BranchWeights[i], and the denominator is the sum over + // BranchWeights. We want the probability and negative probability of + // Condition == SecondCase. + assert(BranchWeights.size() == 3); + setBranchWeights(SI, BranchWeights[2], + BranchWeights[0] + BranchWeights[1], + /*IsExpected=*/false); + } } Value *ValueCompare = Builder.CreateICmpEQ(Condition, FirstCase, "switch.selectcmp"); - return Builder.CreateSelect(ValueCompare, ResultVector[0].first, - SelectValue, "switch.select"); + Value *Ret = Builder.CreateSelect(ValueCompare, ResultVector[0].first, + SelectValue, "switch.select"); + if (auto *SI = dyn_cast(Ret); SI && HasBranchWeights) { + // We may have had a DefaultResult. Base the position of the first and + // second's branch weights accordingly. Also the probability that Condition + // != FirstCase needs to take that into account. + assert(BranchWeights.size() >= 2); + size_t FirstCasePos = (Condition != nullptr); + size_t SecondCasePos = FirstCasePos + 1; + uint32_t DefaultCase = (Condition != nullptr) ? BranchWeights[0] : 0; + setBranchWeights(SI, BranchWeights[FirstCasePos], + DefaultCase + BranchWeights[SecondCasePos], + /*IsExpected=*/false); + } + return Ret; } // Handle the degenerate case where two cases have the same result value. @@ -6379,8 +6421,16 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector, Value *And = Builder.CreateAnd(Condition, AndMask); Value *Cmp = Builder.CreateICmpEQ( And, Constant::getIntegerValue(And->getType(), AndMask)); - return Builder.CreateSelect(Cmp, ResultVector[0].first, - DefaultResult); + Value *Ret = + Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult); + if (auto *SI = dyn_cast(Ret); SI && HasBranchWeights) { + // We know there's a Default case. We base the resulting branch + // weights off its probability.
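A worked example of the weight mapping described in the comments above, with assumed profile weights (the concrete numbers are illustrative only): a switch carrying branch_weights {default: 10, first case: 30, second case: 60} folds into two selects, and the total weight is preserved across both.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // BranchWeights as extracted from the switch: [default, case A, case B].
  std::vector<uint32_t> BW = {10, 30, 60};
  // Inner select tests "Cond == B": weight of B vs. everything else.
  uint32_t InnerTrue = BW[2], InnerFalse = BW[0] + BW[1]; // {60, 40}
  // Outer select tests "Cond == A": weight of A vs. default plus B.
  uint32_t OuterTrue = BW[1], OuterFalse = BW[0] + BW[2]; // {30, 70}
  assert(InnerTrue + InnerFalse == OuterTrue + OuterFalse); // both sum to 100
  return 0;
}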
+ assert(BranchWeights.size() >= 2); + setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0), + BranchWeights[0], /*IsExpected=*/false); + } + return Ret; } } @@ -6397,7 +6447,14 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector, Value *And = Builder.CreateAnd(Condition, ~BitMask, "switch.and"); Value *Cmp = Builder.CreateICmpEQ( And, Constant::getNullValue(And->getType()), "switch.selectcmp"); - return Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult); + Value *Ret = + Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult); + if (auto *SI = dyn_cast(Ret); SI && HasBranchWeights) { + assert(BranchWeights.size() >= 2); + setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0), + BranchWeights[0], /*IsExpected=*/false); + } + return Ret; } } @@ -6408,7 +6465,14 @@ static Value *foldSwitchToSelect(const SwitchCaseResultVectorTy &ResultVector, Value *Cmp2 = Builder.CreateICmpEQ(Condition, CaseValues[1], "switch.selectcmp.case2"); Value *Cmp = Builder.CreateOr(Cmp1, Cmp2, "switch.selectcmp"); - return Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult); + Value *Ret = + Builder.CreateSelect(Cmp, ResultVector[0].first, DefaultResult); + if (auto *SI = dyn_cast(Ret); SI && HasBranchWeights) { + assert(BranchWeights.size() >= 2); + setBranchWeights(SI, accumulate(drop_begin(BranchWeights), 0), + BranchWeights[0], /*IsExpected=*/false); + } + return Ret; } } @@ -6469,8 +6533,18 @@ static bool trySwitchToSelect(SwitchInst *SI, IRBuilder<> &Builder, assert(PHI != nullptr && "PHI for value select not found"); Builder.SetInsertPoint(SI); - Value *SelectValue = - foldSwitchToSelect(UniqueResults, DefaultResult, Cond, Builder, DL); + SmallVector BranchWeights; + if (!ProfcheckDisableMetadataFixes) { + [[maybe_unused]] auto HasWeights = + extractBranchWeights(getBranchWeightMDNode(*SI), BranchWeights); + assert(!HasWeights == (BranchWeights.empty())); + } + assert(BranchWeights.empty() || + (BranchWeights.size() >= + UniqueResults.size() + (DefaultResult != nullptr))); + + Value *SelectValue = foldSwitchToSelect(UniqueResults, DefaultResult, Cond, + Builder, DL, BranchWeights); if (!SelectValue) return false; diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 96670fe3ea195..9f4a242214471 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -1,5 +1,4 @@ add_llvm_component_library(LLVMVectorize - EVLIndVarSimplify.cpp LoadStoreVectorizer.cpp LoopIdiomVectorize.cpp LoopVectorizationLegality.cpp diff --git a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp deleted file mode 100644 index 5dd689799b828..0000000000000 --- a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp +++ /dev/null @@ -1,300 +0,0 @@ -//===---- EVLIndVarSimplify.cpp - Optimize vectorized loops w/ EVL IV------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass optimizes a vectorized loop with canonical IV to using EVL-based -// IV if it was tail-folded by predicated EVL. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/IVDescriptors.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/PatternMatch.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" -#include "llvm/Transforms/Utils/Local.h" - -#define DEBUG_TYPE "evl-iv-simplify" - -using namespace llvm; - -STATISTIC(NumEliminatedCanonicalIV, "Number of canonical IVs we eliminated"); - -static cl::opt EnableEVLIndVarSimplify( - "enable-evl-indvar-simplify", - cl::desc("Enable EVL-based induction variable simplify Pass"), cl::Hidden, - cl::init(true)); - -namespace { -struct EVLIndVarSimplifyImpl { - ScalarEvolution &SE; - OptimizationRemarkEmitter *ORE = nullptr; - - EVLIndVarSimplifyImpl(LoopStandardAnalysisResults &LAR, - OptimizationRemarkEmitter *ORE) - : SE(LAR.SE), ORE(ORE) {} - - /// Returns true if modify the loop. - bool run(Loop &L); -}; -} // anonymous namespace - -/// Returns the constant part of vectorization factor from the induction -/// variable's step value SCEV expression. -static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) { - if (!Step) - return 0U; - - // Looking for loops with IV step value in the form of `( x - // vscale)`. - if (const auto *Mul = dyn_cast(Step)) { - if (Mul->getNumOperands() == 2) { - const SCEV *LHS = Mul->getOperand(0); - const SCEV *RHS = Mul->getOperand(1); - if (const auto *Const = dyn_cast(LHS); - Const && isa(RHS)) { - uint64_t V = Const->getAPInt().getLimitedValue(); - if (llvm::isUInt<32>(V)) - return V; - } - } - } - - // If not, see if the vscale_range of the parent function is a fixed value, - // which makes the step value to be replaced by a constant. - if (F.hasFnAttribute(Attribute::VScaleRange)) - if (const auto *ConstStep = dyn_cast(Step)) { - APInt V = ConstStep->getAPInt().abs(); - ConstantRange CR = llvm::getVScaleRange(&F, 64); - if (const APInt *Fixed = CR.getSingleElement()) { - V = V.zextOrTrunc(Fixed->getBitWidth()); - uint64_t VF = V.udiv(*Fixed).getLimitedValue(); - if (VF && llvm::isUInt<32>(VF) && - // Make sure step is divisible by vscale. - V.urem(*Fixed).isZero()) - return VF; - } - } - - return 0U; -} - -bool EVLIndVarSimplifyImpl::run(Loop &L) { - if (!EnableEVLIndVarSimplify) - return false; - - if (!getBooleanLoopAttribute(&L, "llvm.loop.isvectorized")) - return false; - const MDOperand *EVLMD = - findStringMetadataForLoop(&L, "llvm.loop.isvectorized.tailfoldingstyle") - .value_or(nullptr); - if (!EVLMD || !EVLMD->equalsStr("evl")) - return false; - - BasicBlock *LatchBlock = L.getLoopLatch(); - ICmpInst *OrigLatchCmp = L.getLatchCmpInst(); - if (!LatchBlock || !OrigLatchCmp) - return false; - - InductionDescriptor IVD; - PHINode *IndVar = L.getInductionVariable(SE); - if (!IndVar || !L.getInductionDescriptor(SE, IVD)) { - const char *Reason = (IndVar ? 
"induction descriptor is not available" - : "cannot recognize induction variable"); - LLVM_DEBUG(dbgs() << "Cannot retrieve IV from loop " << L.getName() - << " because" << Reason << "\n"); - if (ORE) { - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedIndVar", - L.getStartLoc(), L.getHeader()) - << "Cannot retrieve IV because " << ore::NV("Reason", Reason); - }); - } - return false; - } - - BasicBlock *InitBlock, *BackEdgeBlock; - if (!L.getIncomingAndBackEdge(InitBlock, BackEdgeBlock)) { - LLVM_DEBUG(dbgs() << "Expect unique incoming and backedge in " - << L.getName() << "\n"); - if (ORE) { - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedLoopStructure", - L.getStartLoc(), L.getHeader()) - << "Does not have a unique incoming and backedge"; - }); - } - return false; - } - - // Retrieve the loop bounds. - std::optional Bounds = L.getBounds(SE); - if (!Bounds) { - LLVM_DEBUG(dbgs() << "Could not obtain the bounds for loop " << L.getName() - << "\n"); - if (ORE) { - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedLoopStructure", - L.getStartLoc(), L.getHeader()) - << "Could not obtain the loop bounds"; - }); - } - return false; - } - Value *CanonicalIVInit = &Bounds->getInitialIVValue(); - Value *CanonicalIVFinal = &Bounds->getFinalIVValue(); - - const SCEV *StepV = IVD.getStep(); - uint32_t VF = getVFFromIndVar(StepV, *L.getHeader()->getParent()); - if (!VF) { - LLVM_DEBUG(dbgs() << "Could not infer VF from IndVar step '" << *StepV - << "'\n"); - if (ORE) { - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedIndVar", - L.getStartLoc(), L.getHeader()) - << "Could not infer VF from IndVar step " - << ore::NV("Step", StepV); - }); - } - return false; - } - LLVM_DEBUG(dbgs() << "Using VF=" << VF << " for loop " << L.getName() - << "\n"); - - // Try to find the EVL-based induction variable. - using namespace PatternMatch; - BasicBlock *BB = IndVar->getParent(); - - Value *EVLIndVar = nullptr; - Value *RemTC = nullptr; - Value *TC = nullptr; - auto IntrinsicMatch = m_Intrinsic( - m_Value(RemTC), m_SpecificInt(VF), - /*Scalable=*/m_SpecificInt(1)); - for (PHINode &PN : BB->phis()) { - if (&PN == IndVar) - continue; - - // Check 1: it has to contain both incoming (init) & backedge blocks - // from IndVar. - if (PN.getBasicBlockIndex(InitBlock) < 0 || - PN.getBasicBlockIndex(BackEdgeBlock) < 0) - continue; - // Check 2: EVL index is always increasing, thus its inital value has to be - // equal to either the initial IV value (when the canonical IV is also - // increasing) or the last IV value (when canonical IV is decreasing). - Value *Init = PN.getIncomingValueForBlock(InitBlock); - using Direction = Loop::LoopBounds::Direction; - switch (Bounds->getDirection()) { - case Direction::Increasing: - if (Init != CanonicalIVInit) - continue; - break; - case Direction::Decreasing: - if (Init != CanonicalIVFinal) - continue; - break; - case Direction::Unknown: - // To be more permissive and see if either the initial or final IV value - // matches PN's init value. - if (Init != CanonicalIVInit && Init != CanonicalIVFinal) - continue; - break; - } - Value *RecValue = PN.getIncomingValueForBlock(BackEdgeBlock); - assert(RecValue && "expect recurrent IndVar value"); - - LLVM_DEBUG(dbgs() << "Found candidate PN of EVL-based IndVar: " << PN - << "\n"); - - // Check 3: Pattern match to find the EVL-based index and total trip count - // (TC). 
- if (match(RecValue, - m_c_Add(m_ZExtOrSelf(IntrinsicMatch), m_Specific(&PN))) && - match(RemTC, m_Sub(m_Value(TC), m_Specific(&PN)))) { - EVLIndVar = RecValue; - break; - } - } - - if (!EVLIndVar || !TC) - return false; - - LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n"); - if (ORE) { - ORE->emit([&]() { - DebugLoc DL; - BasicBlock *Region = nullptr; - if (auto *I = dyn_cast(EVLIndVar)) { - DL = I->getDebugLoc(); - Region = I->getParent(); - } else { - DL = L.getStartLoc(); - Region = L.getHeader(); - } - return OptimizationRemark(DEBUG_TYPE, "UseEVLIndVar", DL, Region) - << "Using " << ore::NV("EVLIndVar", EVLIndVar) - << " for EVL-based IndVar"; - }); - } - - // Create an EVL-based comparison and replace the branch to use it as - // predicate. - - // Loop::getLatchCmpInst check at the beginning of this function has ensured - // that latch block ends in a conditional branch. - auto *LatchBranch = cast(LatchBlock->getTerminator()); - assert(LatchBranch->isConditional() && - "expect the loop latch to be ended with a conditional branch"); - ICmpInst::Predicate Pred; - if (LatchBranch->getSuccessor(0) == L.getHeader()) - Pred = ICmpInst::ICMP_NE; - else - Pred = ICmpInst::ICMP_EQ; - - IRBuilder<> Builder(OrigLatchCmp); - auto *NewLatchCmp = Builder.CreateICmp(Pred, EVLIndVar, TC); - OrigLatchCmp->replaceAllUsesWith(NewLatchCmp); - - // llvm::RecursivelyDeleteDeadPHINode only deletes cycles whose values are - // not used outside the cycles. However, in this case the now-RAUW-ed - // OrigLatchCmp will be considered a use outside the cycle while in reality - // it's practically dead. Thus we need to remove it before calling - // RecursivelyDeleteDeadPHINode. - (void)RecursivelyDeleteTriviallyDeadInstructions(OrigLatchCmp); - if (llvm::RecursivelyDeleteDeadPHINode(IndVar)) - LLVM_DEBUG(dbgs() << "Removed original IndVar\n"); - - ++NumEliminatedCanonicalIV; - - return true; -} - -PreservedAnalyses EVLIndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &LAM, - LoopStandardAnalysisResults &AR, - LPMUpdater &U) { - Function &F = *L.getHeader()->getParent(); - auto &FAMProxy = LAM.getResult(L, AR); - OptimizationRemarkEmitter *ORE = - FAMProxy.getCachedResult(F); - - if (EVLIndVarSimplifyImpl(AR, ORE).run(L)) - return PreservedAnalyses::allInSet(); - return PreservedAnalyses::all(); -} diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ca092dcfcb492..ab5c9c99b9448 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -692,11 +692,6 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer { // vectorization of *epilogue* loops in the process of vectorizing loops and // their epilogues. class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { - /// The additional bypass block which conditionally skips over the epilogue - /// loop after executing the main loop. Needed to resume inductions and - /// reductions during epilogue vectorization. 
- BasicBlock *AdditionalBypassBlock = nullptr; - public: EpilogueVectorizerEpilogueLoop( Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, @@ -706,28 +701,12 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { GeneratedRTChecks &Checks, VPlan &Plan) : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM, BFI, PSI, Checks, Plan, EPI.EpilogueVF, - EPI.EpilogueVF, EPI.EpilogueUF) { - TripCount = EPI.TripCount; - } + EPI.EpilogueVF, EPI.EpilogueUF) {} /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (i.e., the second pass of VPlan execution). BasicBlock *createVectorizedLoopSkeleton() final; - /// Return the additional bypass block which targets the scalar loop by - /// skipping the epilogue loop after completing the main loop. - BasicBlock *getAdditionalBypassBlock() const { - assert(AdditionalBypassBlock && - "Trying to access AdditionalBypassBlock but it has not been set"); - return AdditionalBypassBlock; - } - protected: - /// Emits an iteration count bypass check after the main vector loop has - /// finished to see if there are any iterations left to execute by either - /// the vector epilogue or the scalar epilogue. - BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *VectorPH, - BasicBlock *Bypass, - BasicBlock *Insert); void printDebugTracesAtStart() override; void printDebugTracesAtEnd() override; }; @@ -2459,8 +2438,9 @@ struct CSEDenseMapInfo { } // end anonymous namespace -///Perform cse of induction variable instructions. -static void cse(BasicBlock *BB) { +/// FIXME: This legacy common-subexpression-elimination routine is scheduled for +/// removal, in favor of the VPlan-based one. +static void legacyCSE(BasicBlock *BB) { // Perform simple cse. SmallDenseMap CSEMap; for (Instruction &In : llvm::make_early_inc_range(*BB)) { @@ -2564,7 +2544,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB]; // Remove redundant induction instructions. - cse(HeaderBB); + legacyCSE(HeaderBB); } void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) { @@ -2907,15 +2887,12 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, InstructionCost SafeDivisorCost = 0; auto *VecTy = toVectorTy(I->getType(), VF); - auto *DivisorI = dyn_cast(I->getOperand(1)); - if (DivisorI && !Legal->isInvariant(DivisorI)) { - // The cost of the select guard to ensure all lanes are well defined - // after we speculate above any internal control flow. - SafeDivisorCost += - TTI.getCmpSelInstrCost(Instruction::Select, VecTy, - toVectorTy(Type::getInt1Ty(I->getContext()), VF), - CmpInst::BAD_ICMP_PREDICATE, CostKind); - } + // The cost of the select guard to ensure all lanes are well defined + // after we speculate above any internal control flow. 
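To make the select-guard cost above concrete, here is a scalar sketch of the "safe divisor" pattern whose select is being costed: the division is speculated unconditionally, and a select forces the divisor to a harmless value on lanes where the original control flow would not have executed it (the names and the fallback value 1 are assumptions for illustration):

#include <cstdint>

// Scalar analogue of speculating a guarded division with a safe divisor.
int64_t speculativeDiv(int64_t X, int64_t Divisor, bool LaneActive) {
  // select(cond, divisor, 1) keeps the division well defined on every lane.
  int64_t SafeDivisor = LaneActive ? Divisor : 1;
  int64_t Quotient = X / SafeDivisor;
  // Only results from active lanes are consumed afterwards.
  return LaneActive ? Quotient : 0;
}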
+ SafeDivisorCost += + TTI.getCmpSelInstrCost(Instruction::Select, VecTy, + toVectorTy(Type::getInt1Ty(I->getContext()), VF), + CmpInst::BAD_ICMP_PREDICATE, CostKind); SmallVector Operands(I->operand_values()); SafeDivisorCost += TTI.getArithmeticInstrCost( @@ -3925,7 +3902,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( if (VF.isScalar()) continue; - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(*Plan, VF, CostCtx); auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { @@ -4182,7 +4160,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { // Add on other costs that are modelled in VPlan, but not in the legacy // cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, + *CM.PSE.getSE()); VPRegionBlock *VectorRegion = P->getVectorLoopRegion(); assert(VectorRegion && "Expected to have a vector region!"); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( @@ -6384,19 +6363,8 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { LoopBlocksDFS DFS(TheLoop); DFS.perform(LI); - MapVector> DeadInvariantStoreOps; for (BasicBlock *BB : reverse(make_range(DFS.beginRPO(), DFS.endRPO()))) for (Instruction &I : reverse(*BB)) { - // Find all stores to invariant variables. Since they are going to sink - // outside the loop we do not need calculate cost for them. - StoreInst *SI; - if ((SI = dyn_cast(&I)) && - Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { - ValuesToIgnore.insert(&I); - DeadInvariantStoreOps[SI->getPointerOperand()].push_back( - SI->getValueOperand()); - } - if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I)) continue; @@ -6443,9 +6411,6 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { append_range(DeadInterleavePointerOps, Op->operands()); } - for (const auto &[_, Ops] : DeadInvariantStoreOps) - llvm::append_range(DeadOps, drop_end(Ops)); - // Mark ops that would be trivially dead and are only used by ignored // instructions as free. BasicBlock *Header = TheLoop->getHeader(); @@ -6871,7 +6836,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF) const { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, *PSE.getSE()); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. @@ -6908,6 +6873,28 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, return nullptr; }; + // Check if a select for a safe divisor was hoisted to the pre-header. If so, + // the select doesn't need to be considered for the vector loop cost; go with + // the more accurate VPlan-based cost model. 
+ for (VPRecipeBase &R : *Plan.getVectorPreheader()) { + auto *VPI = dyn_cast(&R); + if (!VPI || VPI->getOpcode() != Instruction::Select || + VPI->getNumUsers() != 1) + continue; + + if (auto *WR = dyn_cast(*VPI->user_begin())) { + switch (WR->getOpcode()) { + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + return true; + default: + break; + } + } + } + DenseSet SeenInstrs; auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { @@ -7082,7 +7069,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { // simplifications not accounted for in the legacy cost model. If that's the // case, don't trigger the assertion, as the extra simplifications may cause a // different VF to be picked by the VPlan-based cost model. - VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, + *CM.PSE.getSE()); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); // Verify that the VPlan-based and legacy cost models agree, except for VPlans // with early exits and plans with additional VPlan simplifications. The @@ -7223,7 +7211,6 @@ DenseMap LoopVectorizationPlanner::executePlan( VPlanTransforms::narrowInterleaveGroups( BestVPlan, BestVF, TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)); - VPlanTransforms::cse(BestVPlan); VPlanTransforms::removeDeadRecipes(BestVPlan); VPlanTransforms::convertToConcreteRecipes(BestVPlan); @@ -7237,6 +7224,7 @@ DenseMap LoopVectorizationPlanner::executePlan( BestVPlan, VectorPH, CM.foldTailByMasking(), CM.requiresScalarEpilogue(BestVF.isVector())); VPlanTransforms::materializeVFAndVFxUF(BestVPlan, VectorPH, BestVF); + VPlanTransforms::cse(BestVPlan); VPlanTransforms::simplifyRecipes(BestVPlan); // 0. Generate SCEV-dependent code in the entry, including TripCount, before @@ -7419,124 +7407,28 @@ BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck( // EpilogueVectorizerEpilogueLoop //===--------------------------------------------------------------------===// -/// This function is partially responsible for generating the control flow -/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. +/// This function creates a new scalar preheader, using the previous one as +/// entry block to the epilogue VPlan. The minimum iteration check is being +/// represented in VPlan. BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() { - BasicBlock *ScalarPH = createScalarPreheader("vec.epilog."); - BasicBlock *VectorPH = ScalarPH->getSinglePredecessor(); - // Now, compare the remaining count and if there aren't enough iterations to - // execute the vectorized epilogue skip to the scalar part. - VectorPH->setName("vec.epilog.ph"); - BasicBlock *VecEpilogueIterationCountCheck = - SplitBlock(VectorPH, VectorPH->begin(), DT, LI, nullptr, - "vec.epilog.iter.check", true); - VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH); - - emitMinimumVectorEpilogueIterCountCheck(VectorPH, ScalarPH, - VecEpilogueIterationCountCheck); - AdditionalBypassBlock = VecEpilogueIterationCountCheck; - - // Adjust the control flow taking the state info from the main loop - // vectorization into account. 
- assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && - "expected this to be saved from the previous pass."); - EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, VectorPH); - - EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, ScalarPH); - - // Adjust the terminators of runtime check blocks and phis using them. - BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second; - BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second; - if (SCEVCheckBlock) - SCEVCheckBlock->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, ScalarPH); - if (MemCheckBlock) - MemCheckBlock->getTerminator()->replaceUsesOfWith( - VecEpilogueIterationCountCheck, ScalarPH); - - DT->changeImmediateDominator(ScalarPH, EPI.EpilogueIterationCountCheck); - - // The vec.epilog.iter.check block may contain Phi nodes from inductions or - // reductions which merge control-flow from the latch block and the middle - // block. Update the incoming values here and move the Phi into the preheader. - SmallVector PhisInBlock( - llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis())); - - for (PHINode *Phi : PhisInBlock) { - Phi->moveBefore(VectorPH->getFirstNonPHIIt()); - Phi->replaceIncomingBlockWith( - VecEpilogueIterationCountCheck->getSinglePredecessor(), - VecEpilogueIterationCountCheck); - - // If the phi doesn't have an incoming value from the - // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming - // value and also those from other check blocks. This is needed for - // reduction phis only. - if (none_of(Phi->blocks(), [&](BasicBlock *IncB) { - return EPI.EpilogueIterationCountCheck == IncB; - })) + BasicBlock *NewScalarPH = createScalarPreheader("vec.epilog."); + BasicBlock *OriginalScalarPH = NewScalarPH->getSinglePredecessor(); + OriginalScalarPH->setName("vec.epilog.iter.check"); + VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(OriginalScalarPH); + VPBasicBlock *OldEntry = Plan.getEntry(); + for (auto &R : make_early_inc_range(*OldEntry)) { + // Skip moving VPIRInstructions (including VPIRPhis), which are unmovable by + // defining. + if (isa(&R)) continue; - Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); - if (SCEVCheckBlock) - Phi->removeIncomingValue(SCEVCheckBlock); - if (MemCheckBlock) - Phi->removeIncomingValue(MemCheckBlock); + R.moveBefore(*NewEntry, NewEntry->end()); } - return VectorPH; -} - -BasicBlock * -EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( - BasicBlock *VectorPH, BasicBlock *Bypass, BasicBlock *Insert) { - - assert(EPI.TripCount && - "Expected trip count to have been saved in the first pass."); - Value *TC = EPI.TripCount; - IRBuilder<> Builder(Insert->getTerminator()); - Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining"); - - // Generate code to check if the loop's trip count is less than VF * UF of the - // vector epilogue loop. - auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()) - ? 
ICmpInst::ICMP_ULE - : ICmpInst::ICMP_ULT; - - Value *CheckMinIters = - Builder.CreateICmp(P, Count, - createStepForVF(Builder, Count->getType(), - EPI.EpilogueVF, EPI.EpilogueUF), - "min.epilog.iters.check"); - - BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters); - auto VScale = Cost->getVScaleForTuning(); - unsigned MainLoopStep = - estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale); - unsigned EpilogueLoopStep = - estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale); - // We assume the remaining `Count` is equally distributed in - // [0, MainLoopStep) - // So the probability for `Count < EpilogueLoopStep` should be - // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep - // TODO: Improve the estimate by taking the estimated trip count into - // consideration. - unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep); - const uint32_t Weights[] = {EstimatedSkipCount, - MainLoopStep - EstimatedSkipCount}; - setBranchWeights(BI, Weights, /*IsExpected=*/false); - ReplaceInstWithInst(Insert->getTerminator(), &BI); - - // A new entry block has been created for the epilogue VPlan. Hook it in, as - // otherwise we would try to modify the entry to the main vector loop. - VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(Insert); - VPBasicBlock *OldEntry = Plan.getEntry(); VPBlockUtils::reassociateBlocks(OldEntry, NewEntry); Plan.setEntry(NewEntry); // OldEntry is now dead and will be cleaned up when the plan gets destroyed. - return Insert; + return OriginalScalarPH; } void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { @@ -8041,11 +7933,11 @@ bool VPRecipeBuilder::getScaledReductions( BinaryOperator *ExtendUser = dyn_cast(Op); std::optional BinOpc; Type *ExtOpTypes[2] = {nullptr}; + TTI::PartialReductionExtendKind ExtKinds[2] = {TTI::PR_None}; - auto CollectExtInfo = [this, &Exts, - &ExtOpTypes](SmallVectorImpl &Ops) -> bool { - unsigned I = 0; - for (Value *OpI : Ops) { + auto CollectExtInfo = [this, &Exts, &ExtOpTypes, + &ExtKinds](SmallVectorImpl &Ops) -> bool { + for (const auto &[I, OpI] : enumerate(Ops)) { Value *ExtOp; if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp)))) return false; @@ -8056,7 +7948,7 @@ bool VPRecipeBuilder::getScaledReductions( return false; ExtOpTypes[I] = ExtOp->getType(); - I++; + ExtKinds[I] = TTI::getPartialReductionExtendKind(Exts[I]); } return true; }; @@ -8085,10 +7977,6 @@ bool VPRecipeBuilder::getScaledReductions( } else return false; - TTI::PartialReductionExtendKind OpAExtend = - TTI::getPartialReductionExtendKind(Exts[0]); - TTI::PartialReductionExtendKind OpBExtend = - Exts[1] ? 
TTI::getPartialReductionExtendKind(Exts[1]) : TTI::PR_None; PartialReductionChain Chain(RdxExitInstr, Exts[0], Exts[1], ExtendUser); TypeSize PHISize = PHI->getType()->getPrimitiveSizeInBits(); @@ -8101,7 +7989,8 @@ bool VPRecipeBuilder::getScaledReductions( [&](ElementCount VF) { InstructionCost Cost = TTI->getPartialReductionCost( Update->getOpcode(), ExtOpTypes[0], ExtOpTypes[1], - PHI->getType(), VF, OpAExtend, OpBExtend, BinOpc, CM.CostKind); + PHI->getType(), VF, ExtKinds[0], ExtKinds[1], BinOpc, + CM.CostKind); return Cost.isValid(); }, Range)) { @@ -8178,8 +8067,11 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, if (isa(Instr) || isa(Instr)) return tryToWidenMemory(Instr, Operands, Range); - if (std::optional ScaleFactor = getScalingForReduction(Instr)) - return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()); + if (std::optional ScaleFactor = getScalingForReduction(Instr)) { + if (auto PartialRed = + tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value())) + return PartialRed; + } if (!shouldWiden(Instr, Range)) return nullptr; @@ -8213,6 +8105,10 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, isa(BinOpRecipe)) std::swap(BinOp, Accumulator); + if (ScaleFactor != + vputils::getVFScaleFactor(Accumulator->getDefiningRecipe())) + return nullptr; + unsigned ReductionOpcode = Reduction->getOpcode(); if (ReductionOpcode == Instruction::Sub) { auto *const Zero = ConstantInt::get(Reduction->getType(), 0); @@ -8704,7 +8600,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind); + VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, + *CM.PSE.getSE()); VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); } @@ -9615,16 +9512,21 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) { } /// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded -/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. -static void -preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, - const SCEV2ValueTy &ExpandedSCEVs, - EpilogueLoopVectorizationInfo &EPI) { +/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. Some +/// reductions require creating new instructions to compute the resume values. +/// They are collected in a vector and returned. They must be moved to the +/// preheader of the vector epilogue loop, after created by the execution of \p +/// Plan. +static SmallVector preparePlanForEpilogueVectorLoop( + VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, + EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel &CM, + ScalarEvolution &SE) { VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion(); VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); DenseMap ToFrozen; + SmallVector InstsToMove; // Ensure that the start values for all header phi recipes are updated before // vectorizing the epilogue loop. 
for (VPRecipeBase &R : Header->phis()) { @@ -9694,6 +9596,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, BasicBlock *PBB = cast(ResumeV)->getParent(); IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt()); ResumeV = Builder.CreateICmpNE(ResumeV, StartV); + if (auto *I = dyn_cast(ResumeV)) + InstsToMove.push_back(I); } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(RK)) { Value *StartV = getStartValueFromReductionResult(RdxResult); ToFrozen[StartV] = cast(ResumeV)->getIncomingValueForBlock( @@ -9708,8 +9612,12 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, BasicBlock *ResumeBB = cast(ResumeV)->getParent(); IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt()); Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]); + if (auto *I = dyn_cast(Cmp)) + InstsToMove.push_back(I); Value *Sentinel = RdxResult->getOperand(2)->getLiveInIRValue(); ResumeV = Builder.CreateSelect(Cmp, Sentinel, ResumeV); + if (auto *I = dyn_cast(ResumeV)) + InstsToMove.push_back(I); } else { VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV); auto *PhiR = dyn_cast(&R); @@ -9761,6 +9669,18 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, Plan.resetTripCount(ExpandedVal); ExpandR->eraseFromParent(); } + + auto VScale = CM.getVScaleForTuning(); + unsigned MainLoopStep = + estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale); + unsigned EpilogueLoopStep = + estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale); + VPlanTransforms::addMinimumVectorEpilogueIterationCheck( + Plan, EPI.TripCount, EPI.VectorTripCount, + CM.requiresScalarEpilogue(EPI.EpilogueVF.isVector()), EPI.EpilogueVF, + EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE); + + return InstsToMove; } // Generate bypass values from the additional bypass block. Note that when the @@ -9827,6 +9747,101 @@ static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L, } } +/// Connect the epilogue vector loop generated for \p EpiPlan to the main vector +// loop, after both plans have executed, updating branches from the iteration +// and runtime checks of the main loop, as well as updating various phis. \p +// InstsToMove contains instructions that need to be moved to the preheader of +// the epilogue vector loop. +static void connectEpilogueVectorLoop( + VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI, + DominatorTree *DT, LoopVectorizationLegality &LVL, + DenseMap &ExpandedSCEVs, GeneratedRTChecks &Checks, + ArrayRef InstsToMove) { + BasicBlock *VecEpilogueIterationCountCheck = + cast(EpiPlan.getEntry())->getIRBasicBlock(); + + BasicBlock *VecEpiloguePreHeader = + cast(VecEpilogueIterationCountCheck->getTerminator()) + ->getSuccessor(1); + // Adjust the control flow taking the state info from the main loop + // vectorization into account. 
+ assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck && + "expected this to be saved from the previous pass."); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); + EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, VecEpiloguePreHeader); + + DTU.applyUpdates({{DominatorTree::Delete, EPI.MainLoopIterationCountCheck, + VecEpilogueIterationCountCheck}, + {DominatorTree::Insert, EPI.MainLoopIterationCountCheck, + VecEpiloguePreHeader}}); + + BasicBlock *ScalarPH = + cast(EpiPlan.getScalarPreheader())->getIRBasicBlock(); + EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, ScalarPH); + DTU.applyUpdates( + {{DominatorTree::Delete, EPI.EpilogueIterationCountCheck, + VecEpilogueIterationCountCheck}, + {DominatorTree::Insert, EPI.EpilogueIterationCountCheck, ScalarPH}}); + + // Adjust the terminators of runtime check blocks and phis using them. + BasicBlock *SCEVCheckBlock = Checks.getSCEVChecks().second; + BasicBlock *MemCheckBlock = Checks.getMemRuntimeChecks().second; + if (SCEVCheckBlock) { + SCEVCheckBlock->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, ScalarPH); + DTU.applyUpdates({{DominatorTree::Delete, SCEVCheckBlock, + VecEpilogueIterationCountCheck}, + {DominatorTree::Insert, SCEVCheckBlock, ScalarPH}}); + } + if (MemCheckBlock) { + MemCheckBlock->getTerminator()->replaceUsesOfWith( + VecEpilogueIterationCountCheck, ScalarPH); + DTU.applyUpdates( + {{DominatorTree::Delete, MemCheckBlock, VecEpilogueIterationCountCheck}, + {DominatorTree::Insert, MemCheckBlock, ScalarPH}}); + } + + // The vec.epilog.iter.check block may contain Phi nodes from inductions + // or reductions which merge control-flow from the latch block and the + // middle block. Update the incoming values here and move the Phi into the + // preheader. + SmallVector PhisInBlock( + llvm::make_pointer_range(VecEpilogueIterationCountCheck->phis())); + + for (PHINode *Phi : PhisInBlock) { + Phi->moveBefore(VecEpiloguePreHeader->getFirstNonPHIIt()); + Phi->replaceIncomingBlockWith( + VecEpilogueIterationCountCheck->getSinglePredecessor(), + VecEpilogueIterationCountCheck); + + // If the phi doesn't have an incoming value from the + // EpilogueIterationCountCheck, we are done. Otherwise remove the + // incoming value and also those from other check blocks. This is needed + // for reduction phis only. + if (none_of(Phi->blocks(), [&](BasicBlock *IncB) { + return EPI.EpilogueIterationCountCheck == IncB; + })) + continue; + Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck); + if (SCEVCheckBlock) + Phi->removeIncomingValue(SCEVCheckBlock); + if (MemCheckBlock) + Phi->removeIncomingValue(MemCheckBlock); + } + + auto IP = VecEpiloguePreHeader->getFirstNonPHIIt(); + for (auto *I : InstsToMove) + I->moveBefore(IP); + + // VecEpilogueIterationCountCheck conditionally skips over the epilogue loop + // after executing the main loop. We need to update the resume values of + // inductions and reductions during epilogue vectorization. + fixScalarResumeValuesFromBypass(VecEpilogueIterationCountCheck, L, EpiPlan, + LVL, ExpandedSCEVs, EPI.VectorTripCount); +} + bool LoopVectorizePass::processLoop(Loop *L) { assert((EnableVPlanNativePath || L->isInnermost()) && "VPlan-native path is not enabled. 
Only process inner loops."); @@ -10043,7 +10058,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF.Width), CM, - CM.CostKind); + CM.CostKind, *CM.PSE.getSE()); if (!ForceVectorization && !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, LVP.getPlanFor(VF.Width), SEL, @@ -10188,6 +10203,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // factor) again shortly afterwards. VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width); BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block"); + BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph"); preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan); EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1, BestEpiPlan); @@ -10201,15 +10217,12 @@ bool LoopVectorizePass::processLoop(Loop *L) { // edges from the first pass. EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM, BFI, PSI, Checks, BestEpiPlan); - EpilogILV.setTripCount(MainILV.getTripCount()); - preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI); - + SmallVector InstsToMove = preparePlanForEpilogueVectorLoop( + BestEpiPlan, L, ExpandedSCEVs, EPI, CM, *PSE.getSE()); LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT, true); - - fixScalarResumeValuesFromBypass(EpilogILV.getAdditionalBypassBlock(), L, - BestEpiPlan, LVL, ExpandedSCEVs, - EPI.VectorTripCount); + connectEpilogueVectorLoop(BestEpiPlan, L, EPI, DT, LVL, ExpandedSCEVs, + Checks, InstsToMove); ++LoopsEpilogueVectorized; } else { InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, BFI, PSI, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 6ac9018df641e..065622efc7ecc 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2237,8 +2237,7 @@ class BoUpSLP { bool isStridedLoad(ArrayRef VL, ArrayRef PointerOps, ArrayRef Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, - const bool IsAnyPointerUsedOutGraph, const int64_t Diff, - StridedPtrInfo &SPtrInfo) const; + const int64_t Diff, StridedPtrInfo &SPtrInfo) const; /// Checks if the given array of loads can be represented as a vectorized, /// scatter or just simple gather. @@ -6822,10 +6821,19 @@ bool BoUpSLP::isStridedLoad(ArrayRef VL, ArrayRef PointerOps, ArrayRef Order, const TargetTransformInfo &TTI, const DataLayout &DL, ScalarEvolution &SE, - const bool IsAnyPointerUsedOutGraph, const int64_t Diff, StridedPtrInfo &SPtrInfo) const { const size_t Sz = VL.size(); + if (Diff % (Sz - 1) != 0) + return false; + + // Try to generate strided load node. + auto IsAnyPointerUsedOutGraph = any_of(PointerOps, [&](Value *V) { + return isa(V) && any_of(V->users(), [&](User *U) { + return !isVectorized(U) && !MustGather.contains(U); + }); + }); + const uint64_t AbsoluteDiff = std::abs(Diff); Type *ScalarTy = VL.front()->getType(); auto *VecTy = getWidenedType(ScalarTy, Sz); @@ -6956,18 +6964,7 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( cast(V), UserIgnoreList); })) return LoadsState::CompressVectorize; - // Simple check if not a strided access - clear order. - bool IsPossibleStrided = *Diff % (Sz - 1) == 0; - // Try to generate strided load node. 
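The divisibility pre-check that the hunk above moves into isStridedLoad can be illustrated with a small standalone sketch (the function name and the offsets are assumed for illustration): for Sz pointers sorted by address with a common stride S, the distance between the first and last pointer is Diff = (Sz - 1) * S, so Diff must divide evenly by Sz - 1. The check is necessary but not sufficient; the remaining logic still validates the individual gaps.

#include <cstdint>
#include <cstdlib>

// Quick reject for loads that cannot possibly be equally strided.
bool mayBeStrided(int64_t Diff, size_t Sz) {
  // e.g. offsets {0, 6, 12, 18}: Diff = 18, Sz = 4, 18 % 3 == 0 -> candidate.
  // offsets {0, 5, 9, 16}: Diff = 16, 16 % 3 != 0 -> rejected immediately.
  return Sz > 1 && Diff % static_cast<int64_t>(Sz - 1) == 0;
}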
- auto IsAnyPointerUsedOutGraph = - IsPossibleStrided && any_of(PointerOps, [&](Value *V) { - return isa(V) && any_of(V->users(), [&](User *U) { - return !isVectorized(U) && !MustGather.contains(U); - }); - }); - if (IsPossibleStrided && - isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE, - IsAnyPointerUsedOutGraph, *Diff, SPtrInfo)) + if (isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE, *Diff, SPtrInfo)) return LoadsState::StridedVectorize; } if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || @@ -17522,7 +17519,9 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { return !isa(V) && isa(V); })) || all_of(E->Scalars, [&](Value *V) { - return isa(V) || E->isCopyableElement(V) || + return isa(V) || + (E->Idx == 0 && isa(V)) || + E->isCopyableElement(V) || (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V)); })) Res = FindLastInst(); @@ -19122,7 +19121,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } case Instruction::InsertElement: { assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique"); - Builder.SetInsertPoint(cast(E->Scalars.back())); + if (const TreeEntry *OpE = getOperandEntry(E, 1); + OpE && !OpE->isGather() && OpE->hasState() && + !OpE->hasCopyableElements()) + Builder.SetInsertPoint(cast(E->Scalars.back())); + else + setInsertPointAfterBundle(E); Value *V = vectorizeOperand(E, 1); ArrayRef Op = E->getOperand(1); Type *ScalarTy = Op.front()->getType(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1e6f1e3aeb0ac..728d29107808d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -845,19 +845,10 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) { if (VF.isScalable()) return InstructionCost::getInvalid(); - // First compute the cost of the conditionally executed recipes, followed by - // account for the branching cost, except if the mask is a header mask or - // uniform condition. - using namespace llvm::VPlanPatternMatch; + // Compute and return the cost of the conditionally executed recipes. + assert(VF.isVector() && "Can only compute vector cost at the moment."); VPBasicBlock *Then = cast(getEntry()->getSuccessors()[0]); - InstructionCost ThenCost = Then->cost(VF, Ctx); - - // For the scalar case, we may not always execute the original predicated - // block, Thus, scale the block's cost by the probability of executing it. - if (VF.isScalar()) - return ThenCost / getPredBlockCostDivisor(Ctx.CostKind); - - return ThenCost; + return Then->cost(VF, Ctx); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1700,26 +1691,6 @@ void LoopVectorizationPlanner::updateLoopMetadataAndProfileInfo( LoopVectorizeHints Hints(VectorLoop, true, *ORE); Hints.setAlreadyVectorized(); } - - // Check if it's EVL-vectorized and mark the corresponding metadata. - bool IsEVLVectorized = - llvm::any_of(*HeaderVPBB, [](const VPRecipeBase &Recipe) { - // Looking for the ExplictVectorLength VPInstruction. 
- if (const auto *VI = dyn_cast(&Recipe)) - return VI->getOpcode() == VPInstruction::ExplicitVectorLength; - return false; - }); - if (IsEVLVectorized) { - LLVMContext &Context = VectorLoop->getHeader()->getContext(); - MDNode *LoopID = VectorLoop->getLoopID(); - auto *IsEVLVectorizedMD = MDNode::get( - Context, - {MDString::get(Context, "llvm.loop.isvectorized.tailfoldingstyle"), - MDString::get(Context, "evl")}); - MDNode *NewLoopID = makePostTransformationMetadata(Context, LoopID, {}, - {IsEVLVectorizedMD}); - VectorLoop->setLoopID(NewLoopID); - } } TargetTransformInfo::UnrollingPreferences UP; TTI.getUnrollingPreferences(VectorLoop, *PSE.getSE(), UP, ORE); @@ -1779,7 +1750,8 @@ VPCostContext::getOperandInfo(VPValue *V) const { } InstructionCost VPCostContext::getScalarizationOverhead( - Type *ResultTy, ArrayRef Operands, ElementCount VF) { + Type *ResultTy, ArrayRef Operands, ElementCount VF, + bool AlwaysIncludeReplicatingR) { if (VF.isScalar()) return 0; @@ -1799,7 +1771,9 @@ InstructionCost VPCostContext::getScalarizationOverhead( SmallPtrSet UniqueOperands; SmallVector Tys; for (auto *Op : Operands) { - if (Op->isLiveIn() || isa(Op) || + if (Op->isLiveIn() || + (!AlwaysIncludeReplicatingR && + isa(Op)) || !UniqueOperands.insert(Op).second) continue; Tys.push_back(toVectorizedTy(Types.inferScalarType(Op), VF)); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index e64cefde81e31..0822511150e9e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -705,6 +705,9 @@ class VPIRFlags { VPIRFlags(WrapFlagsTy WrapFlags) : OpType(OperationType::OverflowingBinOp), WrapFlags(WrapFlags) {} + VPIRFlags(TruncFlagsTy TruncFlags) + : OpType(OperationType::Trunc), TruncFlags(TruncFlags) {} + VPIRFlags(FastMathFlags FMFs) : OpType(OperationType::FPMathOp), FMFs(FMFs) {} VPIRFlags(DisjointFlagsTy DisjointFlags) @@ -1494,9 +1497,10 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, const VPIRFlags &Flags = {}, + const VPIRMetadata &Metadata = {}, DebugLoc DL = DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, Flags, DL), - VPIRMetadata(), Opcode(Opcode), ResultTy(ResultTy) { + VPIRMetadata(Metadata), Opcode(Opcode), ResultTy(ResultTy) { assert(flagsValidForOpcode(Opcode) && "Set flags not supported for the provided opcode"); } @@ -1504,11 +1508,11 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { ~VPWidenCastRecipe() override = default; VPWidenCastRecipe *clone() override { + auto *New = new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy, *this, + *this, getDebugLoc()); if (auto *UV = getUnderlyingValue()) - return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy, - *cast(UV)); - - return new VPWidenCastRecipe(Opcode, getOperand(0), ResultTy); + New->setUnderlyingValue(UV); + return New; } VP_CLASSOF_IMPL(VPDef::VPWidenCastSC) diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 46ab7712e2671..07bfe7a896d86 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -395,20 +395,6 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A, return Base::properlyDominates(ParentA, ParentB); } -/// Get the VF scaling factor applied to the recipe's output, if the recipe has -/// one. 
-static unsigned getVFScaleFactor(VPValue *R) { - if (auto *RR = dyn_cast(R)) - return RR->getVFScaleFactor(); - if (auto *RR = dyn_cast(R)) - return RR->getVFScaleFactor(); - assert( - (!isa(R) || cast(R)->getOpcode() != - VPInstruction::ReductionStartVector) && - "getting scaling factor of reduction-start-vector not implemented yet"); - return 1; -} - bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI, unsigned OverrideMaxNumRegs) const { return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) { @@ -571,7 +557,8 @@ SmallVector llvm::calculateRegisterUsageForPlan( } else { // The output from scaled phis and scaled reductions actually has // fewer lanes than the VF. - unsigned ScaleFactor = getVFScaleFactor(VPV); + unsigned ScaleFactor = + vputils::getVFScaleFactor(VPV->getDefiningRecipe()); ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor); LLVM_DEBUG(if (VF != VFs[J]) { dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index cef91c15dd873..c8212af9f8e00 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -756,6 +756,43 @@ void VPlanTransforms::addMinimumIterationCheck( } } +void VPlanTransforms::addMinimumVectorEpilogueIterationCheck( + VPlan &Plan, Value *TripCount, Value *VectorTripCount, + bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF, + unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE) { + // Add the minimum iteration check for the epilogue vector loop. + VPValue *TC = Plan.getOrAddLiveIn(TripCount); + VPBuilder Builder(cast(Plan.getEntry())); + VPValue *Count = Builder.createNaryOp( + Instruction::Sub, {TC, Plan.getOrAddLiveIn(VectorTripCount)}, + DebugLoc::getUnknown(), "n.vec.remaining"); + + // Generate code to check if the loop's trip count is less than VF * UF of + // the vector epilogue loop. + auto P = RequiresScalarEpilogue ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; + VPValue *VFxUF = Builder.createExpandSCEV(SE.getElementCount( + TripCount->getType(), (EpilogueVF * EpilogueUF), SCEV::FlagNUW)); + + auto *CheckMinIters = Builder.createICmp( + P, Count, VFxUF, DebugLoc::getUnknown(), "min.epilog.iters.check"); + VPInstruction *Branch = + Builder.createNaryOp(VPInstruction::BranchOnCond, CheckMinIters); + + // We assume the remaining `Count` is equally distributed in + // [0, MainLoopStep) + // So the probability for `Count < EpilogueLoopStep` should be + // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep + // TODO: Improve the estimate by taking the estimated trip count into + // consideration. 
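A numeric illustration of the skip-probability estimate described above, with assumed step values (a main-loop step of 16 and an epilogue step of 4): the remainder is taken to be uniform in [0, 16), so the epilogue is skipped with probability min(16, 4) / 16 = 1/4, which yields branch weights {4, 12} in the same order the hunk continues with below.

#include <algorithm>
#include <cstdint>

// Mirror of the weight estimate: {skip the epilogue, enter the epilogue}.
void estimateEpilogueCheckWeights(unsigned MainLoopStep,
                                  unsigned EpilogueLoopStep,
                                  uint32_t Weights[2]) {
  unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
  Weights[0] = EstimatedSkipCount;                // remaining count too small
  Weights[1] = MainLoopStep - EstimatedSkipCount; // epilogue loop is entered
}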
+  unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
+  const uint32_t Weights[] = {EstimatedSkipCount,
+                              MainLoopStep - EstimatedSkipCount};
+  MDBuilder MDB(Plan.getContext());
+  MDNode *BranchWeights =
+      MDB.createBranchWeights(Weights, /*IsExpected=*/false);
+  Branch->addMetadata(LLVMContext::MD_prof, BranchWeights);
+}
+
 bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
   auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
     auto *MinMaxR = dyn_cast(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
index fe59774b7c838..2a8baec74b72b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -349,12 +349,14 @@ struct VPCostContext {
   LoopVectorizationCostModel &CM;
   SmallPtrSet SkipCostComputation;
   TargetTransformInfo::TargetCostKind CostKind;
+  ScalarEvolution &SE;
 
   VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
                 const VPlan &Plan, LoopVectorizationCostModel &CM,
-                TargetTransformInfo::TargetCostKind CostKind)
+                TargetTransformInfo::TargetCostKind CostKind,
+                ScalarEvolution &SE)
       : TTI(TTI), TLI(TLI), Types(Plan), LLVMCtx(Plan.getContext()), CM(CM),
-        CostKind(CostKind) {}
+        CostKind(CostKind), SE(SE) {}
 
   /// Return the cost for \p UI with \p VF using the legacy cost model as
   /// fallback until computing the cost of all recipes migrates to VPlan.
@@ -374,10 +376,12 @@ struct VPCostContext {
 
   /// Estimate the overhead of scalarizing a recipe with result type \p ResultTy
   /// and \p Operands with \p VF. This is a convenience wrapper for the
-  /// type-based getScalarizationOverhead API.
-  InstructionCost getScalarizationOverhead(Type *ResultTy,
-                                           ArrayRef Operands,
-                                           ElementCount VF);
+  /// type-based getScalarizationOverhead API. If \p AlwaysIncludeReplicatingR
+  /// is true, always compute the cost of scalarizing replicating operands.
+  InstructionCost
+  getScalarizationOverhead(Type *ResultTy, ArrayRef Operands,
+                           ElementCount VF,
+                           bool AlwaysIncludeReplicatingR = false);
 };
 
 /// This class can be used to assign names to VPValues. For VPValues without
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index aa3de3613b68e..b5e30cb1fa655 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1214,6 +1214,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   case Instruction::Select:
   case Instruction::PHI:
   case VPInstruction::AnyOf:
+  case VPInstruction::Broadcast:
   case VPInstruction::BuildStructVector:
   case VPInstruction::BuildVector:
   case VPInstruction::CalculateTripCountMinusVF:
@@ -2016,13 +2017,13 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
     return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
            Opcode == Instruction::FSub || Opcode == Instruction::FNeg ||
            Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
+           Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc ||
            Opcode == Instruction::FCmp || Opcode == Instruction::Select ||
            Opcode == VPInstruction::WideIVStep ||
            Opcode == VPInstruction::ReductionStartVector ||
            Opcode == VPInstruction::ComputeReductionResult;
   case OperationType::NonNegOp:
-    return Opcode == Instruction::ZExt;
-    break;
+    return Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP;
   case OperationType::Cmp:
     return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
   case OperationType::Other:
@@ -3051,7 +3052,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   if (State.VF.isVector() && shouldPack()) {
     Value *WideValue =
         State.Lane->isFirstLane()
-            ? PoisonValue::get(VectorType::get(UI->getType(), State.VF))
+            ? PoisonValue::get(toVectorizedTy(UI->getType(), State.VF))
             : State.get(this);
     State.set(this, State.packScalarIntoVectorizedValue(this, WideValue,
                                                         *State.Lane));
@@ -3068,6 +3069,61 @@ bool VPReplicateRecipe::shouldPack() const {
   });
 }
 
+/// Returns true if \p Ptr is a pointer computation for which the legacy cost
+/// model computes a SCEV expression when computing the address cost.
+static bool shouldUseAddressAccessSCEV(const VPValue *Ptr) {
+  auto *PtrR = Ptr->getDefiningRecipe();
+  if (!PtrR || !((isa(PtrR) &&
+                  cast(PtrR)->getOpcode() ==
+                      Instruction::GetElementPtr) ||
+                 isa(PtrR)))
+    return false;
+
+  // We are looking for a GEP where all indices are either loop invariant or
+  // inductions.
+  for (VPValue *Opd : drop_begin(PtrR->operands())) {
+    if (!Opd->isDefinedOutsideLoopRegions() &&
+        !isa(Opd))
+      return false;
+  }
+
+  return true;
+}
+
+/// Returns true if \p V is used as part of the address of another load or
+/// store.
+static bool isUsedByLoadStoreAddress(const VPUser *V) {
+  SmallPtrSet Seen;
+  SmallVector WorkList = {V};
+
+  while (!WorkList.empty()) {
+    auto *Cur = dyn_cast(WorkList.pop_back_val());
+    if (!Cur || !Seen.insert(Cur).second)
+      continue;
+
+    for (VPUser *U : Cur->users()) {
+      if (auto *InterleaveR = dyn_cast(U))
+        if (InterleaveR->getAddr() == Cur)
+          return true;
+      if (auto *RepR = dyn_cast(U)) {
+        if (RepR->getOpcode() == Instruction::Load &&
+            RepR->getOperand(0) == Cur)
+          return true;
+        if (RepR->getOpcode() == Instruction::Store &&
+            RepR->getOperand(1) == Cur)
+          return true;
+      }
+      if (auto *MemR = dyn_cast(U)) {
+        if (MemR->getAddr() == Cur && MemR->isConsecutive())
+          return true;
+      }
+    }
+
+    append_range(WorkList, cast(Cur)->users());
+  }
+  return false;
+}
+
 InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
   Instruction *UI = cast(getUnderlyingValue());
@@ -3175,21 +3231,58 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
   }
   case Instruction::Load:
   case Instruction::Store: {
-    if (isSingleScalar()) {
-      bool IsLoad = UI->getOpcode() == Instruction::Load;
-      Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
-      Type *ScalarPtrTy = Ctx.Types.inferScalarType(getOperand(IsLoad ? 0 : 1));
-      const Align Alignment = getLoadStoreAlignment(UI);
-      unsigned AS = getLoadStoreAddressSpace(UI);
-      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
-      InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
-          UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo, UI);
-      return ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
-                                   ScalarPtrTy, nullptr, nullptr, Ctx.CostKind);
-    }
+    if (VF.isScalable() && !isSingleScalar())
+      return InstructionCost::getInvalid();
+
     // TODO: See getMemInstScalarizationCost for how to handle replicating and
     // predicated cases.
-    break;
+    const VPRegionBlock *ParentRegion = getParent()->getParent();
+    if (ParentRegion && ParentRegion->isReplicator())
+      break;
+
+    bool IsLoad = UI->getOpcode() == Instruction::Load;
+    const VPValue *PtrOp = getOperand(!IsLoad);
+    // TODO: Handle cases where we need to pass a SCEV to
+    // getAddressComputationCost.
+    if (shouldUseAddressAccessSCEV(PtrOp))
+      break;
+
+    Type *ValTy = Ctx.Types.inferScalarType(IsLoad ? this : getOperand(0));
+    Type *ScalarPtrTy = Ctx.Types.inferScalarType(PtrOp);
+    const Align Alignment = getLoadStoreAlignment(UI);
+    unsigned AS = getLoadStoreAddressSpace(UI);
+    TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(UI->getOperand(0));
+    InstructionCost ScalarMemOpCost = Ctx.TTI.getMemoryOpCost(
+        UI->getOpcode(), ValTy, Alignment, AS, Ctx.CostKind, OpInfo);
+
+    Type *PtrTy = isSingleScalar() ? ScalarPtrTy : toVectorTy(ScalarPtrTy, VF);
+
+    InstructionCost ScalarCost =
+        ScalarMemOpCost + Ctx.TTI.getAddressComputationCost(
+                              PtrTy, &Ctx.SE, nullptr, Ctx.CostKind);
+    if (isSingleScalar())
+      return ScalarCost;
+
+    SmallVector OpsToScalarize;
+    Type *ResultTy = Type::getVoidTy(PtrTy->getContext());
+    // Set ResultTy and OpsToScalarize, if scalarization is needed. Currently we
+    // don't assign scalarization overhead in general, if the target prefers
+    // vectorized addressing or the loaded value is used as part of an address
+    // of another load or store.
+    bool PreferVectorizedAddressing = Ctx.TTI.prefersVectorizedAddressing();
+    if (PreferVectorizedAddressing || !isUsedByLoadStoreAddress(this)) {
+      bool EfficientVectorLoadStore =
+          Ctx.TTI.supportsEfficientVectorElementLoadStore();
+      if (!(IsLoad && !PreferVectorizedAddressing) &&
+          !(!IsLoad && EfficientVectorLoadStore))
+        append_range(OpsToScalarize, operands());
+
+      if (!EfficientVectorLoadStore)
+        ResultTy = Ctx.Types.inferScalarType(this);
+    }
+
+    return (ScalarCost * VF.getFixedValue()) +
+           Ctx.getScalarizationOverhead(ResultTy, OpsToScalarize, VF, true);
   }
   }
 
@@ -3267,11 +3360,22 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
   // also do that packing, thereby "hoisting" the insert-element sequence.
   // Otherwise, a phi node for the scalar value is needed.
   if (State.hasVectorValue(getOperand(0))) {
-    Value *VectorValue = State.get(getOperand(0));
-    InsertElementInst *IEI = cast(VectorValue);
-    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
-    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
-    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
+    auto *VecI = cast(State.get(getOperand(0)));
+    assert((isa(VecI)) &&
+           "Packed operands must generate an insertelement or insertvalue");
+
+    // If VectorI is a struct, it will be a sequence like:
+    //   %1 = insertvalue %unmodified, %x, 0
+    //   %2 = insertvalue %1, %y, 1
+    //   %VectorI = insertvalue %2, %z, 2
+    // To get the unmodified vector we need to look through the chain.
+    if (auto *StructTy = dyn_cast(VecI->getType()))
+      for (unsigned I = 0; I < StructTy->getNumContainedTypes() - 1; I++)
+        VecI = cast(VecI->getOperand(0));
+
+    PHINode *VPhi = State.Builder.CreatePHI(VecI->getType(), 2);
+    VPhi->addIncoming(VecI->getOperand(0), PredicatingBB); // Unmodified vector.
+    VPhi->addIncoming(VecI, PredicatedBB); // New vector with inserted element.
     if (State.hasVectorValue(this))
       State.reset(this, VPhi);
     else
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 84f02059743c3..5252e1f928294 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2195,7 +2195,8 @@ void VPlanTransforms::truncateToMinimalBitwidths(
       auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
       VPWidenCastRecipe *NewOp =
           IterIsEmpty
-              ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy)
+              ? new VPWidenCastRecipe(Instruction::Trunc, Op, NewResTy,
+                                      VPIRFlags::TruncFlagsTy(false, false))
              : ProcessedIter->second;
       R.setOperand(Idx, NewOp);
       if (!IterIsEmpty)
@@ -2852,6 +2853,7 @@ void VPlanTransforms::replaceSymbolicStrides(
     return R->getParent()->getParent() ||
           R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
   };
+  ValueToSCEVMapTy RewriteMap;
   for (const SCEV *Stride : StridesMap.values()) {
     using namespace SCEVPatternMatch;
     auto *StrideV = cast(Stride)->getValue();
@@ -2879,6 +2881,22 @@ void VPlanTransforms::replaceSymbolicStrides(
       VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
       StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
     }
+    RewriteMap[StrideV] = PSE.getSCEV(StrideV);
+  }
+
+  for (VPRecipeBase &R : *Plan.getEntry()) {
+    auto *ExpSCEV = dyn_cast(&R);
+    if (!ExpSCEV)
+      continue;
+    const SCEV *ScevExpr = ExpSCEV->getSCEV();
+    auto *NewSCEV =
+        SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
+    if (NewSCEV != ScevExpr) {
+      VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
+      ExpSCEV->replaceAllUsesWith(NewExp);
+      if (Plan.getTripCount() == ExpSCEV)
+        Plan.resetTripCount(NewExp);
+    }
   }
 }
 
@@ -3566,13 +3584,13 @@ tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red,
                     Mul, Ext0, Ext1, Ext)) {
     auto *NewExt0 = new VPWidenCastRecipe(
         Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), *Ext0,
-        Ext0->getDebugLoc());
+        *Ext0, Ext0->getDebugLoc());
     NewExt0->insertBefore(Ext0);
 
     VPWidenCastRecipe *NewExt1 = NewExt0;
     if (Ext0 != Ext1) {
       NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
-                                      Ext->getResultType(), *Ext1,
+                                      Ext->getResultType(), *Ext1, *Ext1,
                                       Ext1->getDebugLoc());
       NewExt1->insertBefore(Ext1);
     }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 69452a7e37572..4c65cb7d7a80d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -117,6 +117,13 @@ struct VPlanTransforms {
                            bool TailFolded, bool CheckNeededWithTailFolding,
                            Loop *OrigLoop, const uint32_t *MinItersBypassWeights,
                            DebugLoc DL, ScalarEvolution &SE);
 
+  /// Add a check to \p Plan to see if the epilogue vector loop should be
+  /// executed.
+  static void addMinimumVectorEpilogueIterationCheck(
+      VPlan &Plan, Value *TripCount, Value *VectorTripCount,
+      bool RequiresScalarEpilogue, ElementCount EpilogueVF, unsigned EpilogueUF,
+      unsigned MainLoopStep, unsigned EpilogueLoopStep, ScalarEvolution &SE);
+
   /// Replace loops in \p Plan's flat CFG with VPRegionBlocks, turning \p Plan's
   /// flat CFG into a hierarchical CFG.
   LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 917aa01f8a926..059993043dcda 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -13,6 +13,7 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 
 using namespace llvm;
+using namespace llvm::VPlanPatternMatch;
 
 bool vputils::onlyFirstLaneUsed(const VPValue *Def) {
   return all_of(Def->users(),
@@ -63,7 +64,6 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) {
   };
 
   VPValue *A, *B;
-  using namespace VPlanPatternMatch;
 
   if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_One())))
     return B == Plan.getTripCount() &&
@@ -90,7 +90,6 @@ const SCEV *vputils::getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE) {
 }
 
 bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
-  using namespace VPlanPatternMatch;
   // Live-ins are uniform.
   if (V->isLiveIn())
     return true;
@@ -141,11 +140,24 @@ VPBasicBlock *vputils::getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT) {
   return I == DepthFirst.end() ? nullptr : cast(*I);
 }
 
+unsigned vputils::getVFScaleFactor(VPRecipeBase *R) {
+  if (!R)
+    return 1;
+  if (auto *RR = dyn_cast(R))
+    return RR->getVFScaleFactor();
+  if (auto *RR = dyn_cast(R))
+    return RR->getVFScaleFactor();
+  assert(
+      (!isa(R) || cast(R)->getOpcode() !=
+                      VPInstruction::ReductionStartVector) &&
+      "getting scaling factor of reduction-start-vector not implemented yet");
+  return 1;
+}
+
 std::optional
 vputils::getRecipesForUncountableExit(VPlan &Plan,
                                       SmallVectorImpl &Recipes,
                                       SmallVectorImpl &GEPs) {
-  using namespace llvm::VPlanPatternMatch;
   // Given a VPlan like the following (just including the recipes contributing
   // to loop control exiting here, not the actual work), we're looking to match
   // the recipes contributing to the uncountable exit condition comparison
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 33dd8efaec2db..0222b0aa81063 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -102,6 +102,10 @@ bool isUniformAcrossVFsAndUFs(VPValue *V);
 /// exist.
 VPBasicBlock *getFirstLoopHeader(VPlan &Plan, VPDominatorTree &VPDT);
 
+/// Get the VF scaling factor applied to the recipe's output, if the recipe has
+/// one.
+unsigned getVFScaleFactor(VPRecipeBase *R);
+
 /// Returns the VPValue representing the uncountable exit comparison used by
 /// AnyOf if the recipes it depends on can be traced back to live-ins and
 /// the addresses (in GEP/PtrAdd form) of any (non-masked) load used in
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 92caa0b4e51d5..013ea2e883534 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -199,7 +199,8 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
         // EVLIVIncrement is only used by EVLIV & BranchOnCount.
         // Having more than two users is unexpected.
         using namespace llvm::VPlanPatternMatch;
-        if ((I->getNumUsers() != 1) &&
+        if (I->getOpcode() != VPInstruction::Broadcast &&
+            I->getNumUsers() != 1 &&
             (I->getNumUsers() != 2 ||
              none_of(I->users(),
                      match_fn(m_BranchOnCount(m_Specific(I), m_VPValue()))))) {
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 0ef933f596604..32704bdb54f4f 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2487,21 +2487,31 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
   if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
     return false;
 
+  // Check whether this is a binary shuffle.
+  bool IsBinaryShuffle = !isa(V1);
+
   auto *C0 = dyn_cast(V0);
   auto *C1 = dyn_cast(V1);
-  if (!C0 || !C1)
+  if (!C0 || (IsBinaryShuffle && !C1))
     return false;
 
   Instruction::CastOps Opcode = C0->getOpcode();
-  if (C0->getSrcTy() != C1->getSrcTy())
+
+  // If this is allowed, foldShuffleOfCastops can get stuck in a loop
+  // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
+  if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
     return false;
 
-  // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
-  if (Opcode != C1->getOpcode()) {
-    if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
-      Opcode = Instruction::SExt;
-    else
+  if (IsBinaryShuffle) {
+    if (C0->getSrcTy() != C1->getSrcTy())
       return false;
+    // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
+    if (Opcode != C1->getOpcode()) {
+      if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
+        Opcode = Instruction::SExt;
+      else
+        return false;
+    }
   }
 
   auto *ShuffleDstTy = dyn_cast(I.getType());
@@ -2544,23 +2554,31 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
   InstructionCost CostC0 =
       TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
                            TTI::CastContextHint::None, CostKind);
-  InstructionCost CostC1 =
-      TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
-                           TTI::CastContextHint::None, CostKind);
-  InstructionCost OldCost = CostC0 + CostC1;
-  OldCost +=
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
-                         CastDstTy, OldMask, CostKind, 0, nullptr, {}, &I);
-  InstructionCost NewCost =
-      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, NewShuffleDstTy,
-                         CastSrcTy, NewMask, CostKind);
+  TargetTransformInfo::ShuffleKind ShuffleKind;
+  if (IsBinaryShuffle)
+    ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
+  else
+    ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
+
+  InstructionCost OldCost = CostC0;
+  OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
+                                CostKind, 0, nullptr, {}, &I);
+
+  InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
+                                               CastSrcTy, NewMask, CostKind);
+
   NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
                                   TTI::CastContextHint::None, CostKind);
   if (!C0->hasOneUse())
     NewCost += CostC0;
-  if (!C1->hasOneUse())
-    NewCost += CostC1;
+  if (IsBinaryShuffle) {
+    InstructionCost CostC1 =
+        TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
+                             TTI::CastContextHint::None, CostKind);
+    OldCost += CostC1;
+    if (!C1->hasOneUse())
+      NewCost += CostC1;
+  }
 
   LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
                     << "\n  OldCost: " << OldCost << " vs NewCost: " << NewCost
@@ -2568,14 +2586,20 @@
   if (NewCost > OldCost)
     return false;
- Value *Shuf = Builder.CreateShuffleVector(C0->getOperand(0), - C1->getOperand(0), NewMask); + Value *Shuf; + if (IsBinaryShuffle) + Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0), + NewMask); + else + Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask); + Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy); // Intersect flags from the old casts. if (auto *NewInst = dyn_cast(Cast)) { NewInst->copyIRFlags(C0); - NewInst->andIRFlags(C1); + if (IsBinaryShuffle) + NewInst->andIRFlags(C1); } Worklist.pushValue(Shuf); @@ -4433,7 +4457,7 @@ bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) { // Create new mask using difference of the two incoming masks. int MaskOffset = NewMask[0u]; - unsigned Index = (InputNumElements - MaskOffset) % InputNumElements; + unsigned Index = (InputNumElements + MaskOffset) % InputNumElements; NewMask.clear(); for (unsigned I = 0u; I < InputNumElements; ++I) { diff --git a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll index 7ac4db3119210..904db9064a369 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll @@ -3,11 +3,13 @@ ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 -passes="print" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=GFX1250 %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s +; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 -passes="print" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=GFX1250-SIZE %s define void @canonicalize_f16() { ; BASE-LABEL: 'canonicalize_f16' @@ -141,6 +143,16 @@ define void @canonicalize_bf16() { ; GFX10-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) ; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; +; GFX1250-LABEL: 'canonicalize_bf16' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) +; GFX1250-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; ; BASE-SIZE-LABEL: 'canonicalize_bf16' ; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) ; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) @@ -181,6 +193,15 @@ define void @canonicalize_bf16() { ; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) ; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; +; GFX1250-SIZE-LABEL: 'canonicalize_bf16' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) #1 %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) #1 %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) #1 @@ -203,6 +224,17 @@ define void @canonicalize_f32() { ; ALL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; +; GFX1250-LABEL: 'canonicalize_f32' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = call <4 x float> 
@llvm.canonicalize.v4f32(<4 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; ; ALL-SIZE-LABEL: 'canonicalize_f32' ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) @@ -214,6 +246,16 @@ define void @canonicalize_f32() { ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; +; GFX1250-SIZE-LABEL: 'canonicalize_f32': +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void %f32 = call float @llvm.canonicalize.f32(float undef) #1 %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) #1 %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) #1 @@ -236,6 +278,16 @@ define void @canonicalize_f64() { ; ALL-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) ; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; +; GFX1250-LABEL: 'canonicalize_f64' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> 
@llvm.canonicalize.v3f64(<3 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; ; ALL-SIZE-LABEL: 'canonicalize_f64' ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) @@ -245,6 +297,16 @@ define void @canonicalize_f64() { ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) ; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; GFX1250-SIZE-LABEL: 'canonicalize_f64' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %f64 = call double @llvm.canonicalize.f64(double undef) #1 %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) #1 @@ -255,9 +317,3 @@ define void @canonicalize_f64() { %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) #1 ret void } - - - - - - diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll index 55994d865fa6c..9b1495b35a89d 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll @@ -2,159 +2,190 @@ ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,GFX90A-FASTF64 %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck 
-check-prefixes=NOPACKEDF32,FASTF16,FASTF64 %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32,SLOWF64 %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16-SIZE,GFX90A-FASTF64-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,FASTF16-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,SLOWF64-SIZE %s +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s ; END. define amdgpu_kernel void @fadd_f32() #0 { ; GFX90A-FASTF64-LABEL: 'fadd_f32' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fadd <3 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fadd <3 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; NOPACKEDF32-LABEL: 'fadd_f32' -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for 
instruction: %v5f32 = fadd <5 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> undef, undef +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> poison, poison ; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-FASTF64-SIZE-LABEL: 'fadd_f32' -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fadd <3 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> undef, undef +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fadd <2 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fadd <3 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fadd <4 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fadd <5 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fadd <8 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fadd <9 x float> poison, poison ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; NOPACKEDF32-SIZE-LABEL: 'fadd_f32' -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found 
an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> undef, undef +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fadd float poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fadd <2 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fadd <3 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fadd <4 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fadd <5 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fadd <8 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fadd <9 x float> poison, poison ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f32 = fadd float undef, undef - %v2f32 = fadd <2 x float> undef, undef - %v3f32 = fadd <3 x float> undef, undef - %v4f32 = fadd <4 x float> undef, undef - %v5f32 = fadd <5 x float> undef, undef - %v8f32 = fadd <8 x float> undef, undef - %v9f32 = fadd <9 x float> undef, undef + %f32 = fadd float poison, poison + %v2f32 = fadd <2 x float> poison, poison + %v3f32 = fadd <3 x float> poison, poison + %v4f32 = fadd <4 x float> poison, poison + %v5f32 = fadd <5 x float> poison, poison + %v8f32 = fadd <8 x float> poison, poison + %v9f32 = fadd <9 x float> poison, poison ret void } define amdgpu_kernel void @fadd_f64() #0 { ; GFX90A-FASTF64-LABEL: 'fadd_f64' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fadd double undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fadd double poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF64-LABEL: 'fadd_f64' -; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fadd double undef, undef -; FASTF64-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> undef, undef +; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fadd double poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> poison, poison ; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fadd_f64' -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fadd double undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fadd <2 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fadd <3 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fadd <4 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fadd <5 x double> undef, undef +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fadd double poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fadd <2 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fadd <3 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fadd <4 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fadd <5 x double> poison, poison ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-FASTF64-SIZE-LABEL: 'fadd_f64' -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fadd double undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> undef, undef +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fadd double poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fadd <2 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fadd <3 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fadd <4 x double> poison, poison +; 
GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fadd <5 x double> poison, poison ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; NOPACKEDF32-SIZE-LABEL: 'fadd_f64' -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fadd double undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> undef, undef +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fadd double poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fadd <2 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fadd <3 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fadd <4 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fadd <5 x double> poison, poison ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f64 = fadd double undef, undef - %v2f64 = fadd <2 x double> undef, undef - %v3f64 = fadd <3 x double> undef, undef - %v4f64 = fadd <4 x double> undef, undef - %v5f64 = fadd <5 x double> undef, undef + %f64 = fadd double poison, poison + %v2f64 = fadd <2 x double> poison, poison + %v3f64 = fadd <3 x double> poison, poison + %v4f64 = fadd <4 x double> poison, poison + %v5f64 = fadd <5 x double> poison, poison ret void } define amdgpu_kernel void @fadd_f16() #0 { ; FASTF16-LABEL: 'fadd_f16' -; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> undef, undef +; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> poison, 
poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> poison, poison ; FASTF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fadd_f16' -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> undef, undef +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> poison, poison ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF16-SIZE-LABEL: 'fadd_f16' -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fadd <17 x half> undef, undef +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fadd <2 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fadd <3 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fadd <4 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fadd <5 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fadd <16 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for 
instruction: %v17f16 = fadd <17 x half> poison, poison ; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOWF64-SIZE-LABEL: 'fadd_f16' -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> undef, undef +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fadd half poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fadd <2 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fadd <3 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fadd <4 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fadd <5 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fadd <16 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fadd <17 x half> poison, poison ; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f16 = fadd half undef, undef - %v2f16 = fadd <2 x half> undef, undef - %v3f16 = fadd <3 x half> undef, undef - %v4f16 = fadd <4 x half> undef, undef - %v5f16 = fadd <5 x half> undef, undef - %v16f16 = fadd <16 x half> undef, undef - %v17f16 = fadd <17 x half> undef, undef + %f16 = fadd half poison, poison + %v2f16 = fadd <2 x half> poison, poison + %v3f16 = fadd <3 x half> poison, poison + %v4f16 = fadd <4 x half> poison, poison + %v5f16 = fadd <5 x half> poison, poison + %v16f16 = fadd <16 x half> poison, poison + %v17f16 = fadd <17 x half> poison, poison + ret void +} + +define amdgpu_kernel void @fadd_bf16() #0 { +; GFX1250-LABEL: 'fadd_bf16' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fadd bfloat poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fadd <2 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fadd <3 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fadd <4 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fadd <5 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fadd <16 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fadd <17 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; GFX1250-SIZE-LABEL: 'fadd_bf16' +; GFX1250-SIZE-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fadd bfloat poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fadd <2 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fadd <3 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fadd <4 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fadd <5 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fadd <16 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fadd <17 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void + %bf16 = fadd bfloat poison, poison + %v2bf16 = fadd <2 x bfloat> poison, poison + %v3bf16 = fadd <3 x bfloat> poison, poison + %v4bf16 = fadd <4 x bfloat> poison, poison + %v5bf16 = fadd <5 x bfloat> poison, poison + %v16bf16 = fadd <16 x bfloat> poison, poison + %v17bf16 = fadd <17 x bfloat> poison, poison ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll index 2ff9d4f7f5e38..f34ee31bcf4ce 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fma.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fma.ll @@ -2,166 +2,186 @@ ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SLOW %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1010 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefix=FAST-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=SLOW-SIZE %s +; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s define void @fma_f16() { ; FAST-LABEL: 'fma_f16' -; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> 
@llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison) ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fma_f16' -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison) +; SLOW-NEXT: Cost Model: Found an 
estimated cost of 32 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FAST-SIZE-LABEL: 'fma_f16' -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fma_f16' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 
x half> undef, <3 x half> undef, <3 x half> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f16 = call half @llvm.fma.f16(half poison, half poison, half poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f16 = call half @llvm.fma.f16(half undef, half undef, half undef) - %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) - %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) - %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> undef, <4 x half> undef, <4 x half> undef) - %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> undef, <5 x half> undef, <5 x half> undef) - %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> undef, <16 x half> undef, <16 x half> undef) - %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> undef, <17 x half> undef, <17 x half> undef) + %f16 = call half @llvm.fma.f16(half poison, half poison, half poison) + %v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> poison, <2 x half> poison, <2 x half> poison) + %v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> poison, <3 x half> poison, <3 x half> poison) + %v4f16 = call <4 x half> @llvm.fma.v4f16(<4 x half> poison, <4 x half> poison, <4 x half> poison) + %v5f16 = call <5 x half> @llvm.fma.v5f16(<5 x half> poison, <5 x half> poison, <5 x half> poison) + %v16f16 = call <16 x half> @llvm.fma.v16f16(<16 x half> poison, <16 x half> poison, <16 x half> poison) + %v17f16 = call <17 x half> @llvm.fma.v17f16(<17 x half> poison, <17 x half> poison, <17 x half> poison) ret void } define void @fma_bf16() { ; FAST-LABEL: 'fma_bf16' -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat 
@llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; FAST-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fma_bf16' -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) -; SLOW-NEXT: Cost 
Model: Found an estimated cost of 136 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FAST-SIZE-LABEL: 'fma_bf16' -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) -; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated 
cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ; FAST-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fma_bf16' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %bf16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) - %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) - %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> undef, <3 x bfloat> undef, <3 x bfloat> 
undef) - %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) - %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> undef, <5 x bfloat> undef, <5 x bfloat> undef) - %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) - %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> undef, <17 x bfloat> undef, <17 x bfloat> undef) +; GFX1250-LABEL: 'fma_bf16' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; GFX1250-SIZE-LABEL: 'fma_bf16' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void + %bf16 = call bfloat @llvm.fma.bf16(bfloat poison, bfloat poison, bfloat poison) + %v2bf16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> poison, <2 x bfloat> poison, <2 x bfloat> poison) + %v3bf16 = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> poison, <3 x bfloat> poison, <3 x bfloat> poison) 
+ %v4bf16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> poison, <4 x bfloat> poison, <4 x bfloat> poison) + %v5bf16 = call <5 x bfloat> @llvm.fma.v5bf16(<5 x bfloat> poison, <5 x bfloat> poison, <5 x bfloat> poison) + %v16bf16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> poison, <16 x bfloat> poison, <16 x bfloat> poison) + %v17bf16 = call <17 x bfloat> @llvm.fma.v17bf16(<17 x bfloat> poison, <17 x bfloat> poison, <17 x bfloat> poison) ret void } define void @fma_f32() { ; SLOW-LABEL: 'fma_f32' -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f32 = call float @llvm.fma.f32(float poison, float poison, float poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> poison, <2 x float> poison, <2 x float> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> poison, <3 x float> poison, <3 x float> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> poison, <5 x float> poison, <5 x float> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> poison, <8 x float> poison, <8 x float> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> poison, <9 x float> poison, <9 x float> poison) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fma_f32' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 
for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f32 = call float @llvm.fma.f32(float poison, float poison, float poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> poison, <2 x float> poison, <2 x float> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> poison, <3 x float> poison, <3 x float> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> poison, <5 x float> poison, <5 x float> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> poison, <8 x float> poison, <8 x float> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> poison, <9 x float> poison, <9 x float> poison) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f32 = call float @llvm.fma.f32(float undef, float undef, float undef) - %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) - %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) - %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) - %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) - %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) - %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> undef, <9 x float> undef, <9 x float> undef) + %f32 = call float @llvm.fma.f32(float poison, float poison, float poison) + %v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> poison, <2 x float> poison, <2 x float> poison) + %v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> poison, <3 x float> poison, <3 x float> poison) + %v4f32 = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison) + %v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> poison, <5 x float> poison, <5 x float> poison) + %v8f32 = call <8 x float> @llvm.fma.v8f32(<8 x float> poison, <8 x float> poison, <8 x float> poison) + %v9f32 = call <9 x float> @llvm.fma.v9f32(<9 x float> poison, <9 x float> poison, <9 x float> poison) ret void } define void @fma_f64() { ; SLOW-LABEL: 'fma_f64' -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) -; 
SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.fma.f64(double poison, double poison, double poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> poison, <2 x double> poison, <2 x double> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> poison, <3 x double> poison, <3 x double> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> poison, <4 x double> poison, <4 x double> poison) +; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> poison, <5 x double> poison, <5 x double> poison) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fma_f64' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double undef, double undef, double undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.fma.f64(double poison, double poison, double poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> poison, <2 x double> poison, <2 x double> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> poison, <3 x double> poison, <3 x double> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> poison, <4 x double> poison, <4 x double> poison) +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> poison, <5 x double> poison, <5 x double> poison) ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f64 = call double 
@llvm.fma.f64(double undef, double undef, double undef) - %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) - %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) - %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef) - %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> undef, <5 x double> undef, <5 x double> undef) + %f64 = call double @llvm.fma.f64(double poison, double poison, double poison) + %v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> poison, <2 x double> poison, <2 x double> poison) + %v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> poison, <3 x double> poison, <3 x double> poison) + %v4f64 = call <4 x double> @llvm.fma.v4f64(<4 x double> poison, <4 x double> poison, <4 x double> poison) + %v5f64 = call <5 x double> @llvm.fma.v5f64(<5 x double> poison, <5 x double> poison, <5 x double> poison) ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll index adc4eea309a58..c0b9cda23ea04 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll @@ -2,210 +2,231 @@ ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX9,GFX90A-FASTF64 %s ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX9,F32,FASTF64 %s ; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=F32,SLOW %s +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX9-SIZE,GFX90A-SIZE %s ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,GFX9-SIZE %s ; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZE,SLOW-SIZE %s +; RUN: opt -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s ; END.
define amdgpu_kernel void @fmul_f32() #0 { ; GFX90A-FASTF64-LABEL: 'fmul_f32' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; F32-LABEL: 'fmul_f32' -; F32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> undef, undef -; F32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> undef, undef +; F32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> poison, poison +; F32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> poison, poison ; F32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-SIZE-LABEL: 'fmul_f32' -; 
GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> undef, undef +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fmul <2 x float> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fmul <3 x float> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fmul <4 x float> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fmul <5 x float> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fmul <8 x float> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fmul <9 x float> poison, poison ; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'fmul_f32' -; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> undef, undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fmul float poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fmul <2 x float> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fmul <3 x float> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fmul <4 x float> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fmul <5 x float> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fmul <8 x float> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fmul <9 x float> poison, poison ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f32 = fmul float undef, undef - %v2f32 = fmul <2 x float> undef, undef - %v3f32 = fmul <3 x float> undef, undef - %v4f32 = fmul <4 x float> undef, undef 
- %v5f32 = fmul <5 x float> undef, undef - %v8f32 = fmul <8 x float> undef, undef - %v9f32 = fmul <9 x float> undef, undef + %f32 = fmul float poison, poison + %v2f32 = fmul <2 x float> poison, poison + %v3f32 = fmul <3 x float> poison, poison + %v4f32 = fmul <4 x float> poison, poison + %v5f32 = fmul <5 x float> poison, poison + %v8f32 = fmul <8 x float> poison, poison + %v9f32 = fmul <9 x float> poison, poison ret void } define amdgpu_kernel void @fmul_f64() #0 { ; GFX90A-FASTF64-LABEL: 'fmul_f64' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fmul <5 x double> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fmul <5 x double> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF64-LABEL: 'fmul_f64' -; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> undef, undef +; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> poison, poison ; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fmul_f64' -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fmul double undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fmul <2 x double> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fmul <3 x double> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fmul <4 x double> undef, undef -; SLOW-NEXT: 
Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fmul <5 x double> undef, undef +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fmul double poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fmul <2 x double> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fmul <3 x double> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fmul <4 x double> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fmul <5 x double> poison, poison ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-SIZE-LABEL: 'fmul_f64' -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> undef, undef -; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fmul <5 x double> undef, undef +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fmul double poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fmul <2 x double> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fmul <3 x double> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fmul <4 x double> poison, poison +; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fmul <5 x double> poison, poison ; GFX90A-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'fmul_f64' -; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> undef, undef -; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> undef, undef +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fmul double poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fmul <2 x double> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fmul <3 x double> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fmul <4 x double> poison, poison +; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fmul <5 x double> poison, poison ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f64 = fmul double undef, undef - %v2f64 = fmul <2 x double> undef, undef - %v3f64 = fmul <3 x double> undef, undef - %v4f64 = fmul <4 x double> undef, undef - %v5f64 = fmul <5 x double> undef, undef + %f64 = fmul double poison, poison + %v2f64 = fmul <2 x double> poison, 
poison + %v3f64 = fmul <3 x double> poison, poison + %v4f64 = fmul <4 x double> poison, poison + %v5f64 = fmul <5 x double> poison, poison ret void } define amdgpu_kernel void @fmul_f16() #0 { ; GFX9-LABEL: 'fmul_f16' -; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> undef, undef +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> poison, poison ; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fmul_f16' -; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> undef, undef +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> poison, poison ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX9-SIZE-LABEL: 'fmul_f16' -; 
GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> undef, undef +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fmul <2 x half> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fmul <3 x half> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fmul <4 x half> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fmul <5 x half> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fmul <16 x half> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fmul <17 x half> poison, poison ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fmul_f16' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> undef, undef +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fmul half poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fmul <2 x half> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fmul <3 x half> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fmul <4 x half> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fmul <5 x half> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fmul <16 x half> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fmul <17 x half> poison, poison ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f16 = fmul half undef, undef - %v2f16 = fmul <2 x half> undef, undef - %v3f16 = fmul <3 x half> undef, undef - 
%v4f16 = fmul <4 x half> undef, undef - %v5f16 = fmul <5 x half> undef, undef - %v16f16 = fmul <16 x half> undef, undef - %v17f16 = fmul <17 x half> undef, undef + %f16 = fmul half poison, poison + %v2f16 = fmul <2 x half> poison, poison + %v3f16 = fmul <3 x half> poison, poison + %v4f16 = fmul <4 x half> poison, poison + %v5f16 = fmul <5 x half> poison, poison + %v16f16 = fmul <16 x half> poison, poison + %v17f16 = fmul <17 x half> poison, poison ret void } define amdgpu_kernel void @fmul_bf16() #0 { ; GFX9-LABEL: 'fmul_bf16' -; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bf16 = fmul bfloat undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; GFX9-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; GFX9-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; GFX9-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison ; GFX9-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOW-LABEL: 'fmul_bf16' -; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%v5bf16 = fmul <5 x bfloat> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; SLOW-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; +; GFX1250-LABEL: 'fmul_bf16' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; ; GFX9-SIZE-LABEL: 'fmul_bf16' -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4bf16 = fmul <4 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison ; GFX9-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOW-SIZE-LABEL: 'fmul_bf16' -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x 
bfloat> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> undef, undef -; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> undef, undef +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison ; SLOW-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %bf16 = fmul bfloat undef, undef - %v2bf16 = fmul <2 x bfloat> undef, undef - %v3bf16 = fmul <3 x bfloat> undef, undef - %v4bf16 = fmul <4 x bfloat> undef, undef - %v5bf16 = fmul <5 x bfloat> undef, undef - %v16bf16 = fmul <16 x bfloat> undef, undef - %v17bf16 = fmul <17 x bfloat> undef, undef +; GFX1250-SIZE-LABEL: 'fmul_bf16' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fmul bfloat poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fmul <2 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fmul <3 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fmul <4 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fmul <5 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fmul <16 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fmul <17 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void + %bf16 = fmul bfloat poison, poison + %v2bf16 = fmul <2 x bfloat> poison, poison + %v3bf16 = fmul <3 x bfloat> poison, poison + %v4bf16 = fmul <4 x bfloat> poison, poison + %v5bf16 = fmul <5 x bfloat> poison, poison + %v16bf16 = fmul <16 x bfloat> poison, poison + %v17bf16 = fmul <17 x bfloat> poison, poison ret void } diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll index 4e71a71326bad..6b71603f70f6b 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll @@ -2,158 +2,191 @@ ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,GFX90A-FASTF64 %s ; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32,FASTF16,FASTF64 %s ; RUN: opt 
-passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32,SLOWF64 %s +; RUN: opt -passes="print" 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16-SIZE,GFX90A-FASTF64-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,FASTF16-SIZE %s ; RUN: opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=NOPACKEDF32-SIZE,SLOWF64-SIZE %s +; RUN opt -passes="print" -cost-kind=code-size 2>&1 -disable-output -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250-SIZE %s ; END. define amdgpu_kernel void @fsub_f32() #0 { ; GFX90A-FASTF64-LABEL: 'fsub_f32' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fsub <3 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fsub <3 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; NOPACKEDF32-LABEL: 'fsub_f32' -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> undef, undef -; 
NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> undef, undef -; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> undef, undef +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> poison, poison +; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> poison, poison ; NOPACKEDF32-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-FASTF64-SIZE-LABEL: 'fsub_f32' -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fsub <3 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> undef, undef +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = fsub <2 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = fsub <3 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fsub <4 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = fsub <5 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = fsub <8 x float> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = fsub <9 x float> poison, poison ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; NOPACKEDF32-SIZE-LABEL: 'fsub_f32' -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub 
<4 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> undef, undef +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = fsub float poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fsub <2 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f32 = fsub <3 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f32 = fsub <4 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v5f32 = fsub <5 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f32 = fsub <8 x float> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v9f32 = fsub <9 x float> poison, poison ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f32 = fsub float undef, undef - %v2f32 = fsub <2 x float> undef, undef - %v3f32 = fsub <3 x float> undef, undef - %v4f32 = fsub <4 x float> undef, undef - %v5f32 = fsub <5 x float> undef, undef - %v8f32 = fsub <8 x float> undef, undef - %v9f32 = fsub <9 x float> undef, undef + %f32 = fsub float poison, poison + %v2f32 = fsub <2 x float> poison, poison + %v3f32 = fsub <3 x float> poison, poison + %v4f32 = fsub <4 x float> poison, poison + %v5f32 = fsub <5 x float> poison, poison + %v8f32 = fsub <8 x float> poison, poison + %v9f32 = fsub <9 x float> poison, poison ret void } define amdgpu_kernel void @fsub_f64() #0 { ; GFX90A-FASTF64-LABEL: 'fsub_f64' -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fsub double undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> undef, undef -; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fsub <5 x double> undef, undef +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fsub double poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> poison, poison +; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fsub <5 x double> poison, poison ; GFX90A-FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF64-LABEL: 'fsub_f64' -; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fsub double undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 
x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> undef, undef -; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> undef, undef +; FASTF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fsub double poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> poison, poison +; FASTF64-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> poison, poison ; FASTF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fsub_f64' -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fsub double undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fsub <2 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fsub <3 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fsub <4 x double> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fsub <5 x double> undef, undef +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = fsub double poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fsub <2 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = fsub <3 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fsub <4 x double> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = fsub <5 x double> poison, poison ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; GFX90A-FASTF64-SIZE-LABEL: 'fsub_f64' -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fsub double undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> undef, undef -; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v5f64 = fsub <5 x double> undef, undef +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f64 = fsub double poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f64 = fsub <2 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3f64 = fsub <3 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f64 = fsub <4 x double> poison, poison +; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for 
instruction: %v5f64 = fsub <5 x double> poison, poison ; GFX90A-FASTF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; NOPACKEDF32-SIZE-LABEL: 'fsub_f64' -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fsub double undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> undef, undef -; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> undef, undef +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = fsub double poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = fsub <2 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = fsub <3 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = fsub <4 x double> poison, poison +; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = fsub <5 x double> poison, poison ; NOPACKEDF32-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f64 = fsub double undef, undef - %v2f64 = fsub <2 x double> undef, undef - %v3f64 = fsub <3 x double> undef, undef - %v4f64 = fsub <4 x double> undef, undef - %v5f64 = fsub <5 x double> undef, undef + %f64 = fsub double poison, poison + %v2f64 = fsub <2 x double> poison, poison + %v3f64 = fsub <3 x double> poison, poison + %v4f64 = fsub <4 x double> poison, poison + %v5f64 = fsub <5 x double> poison, poison ret void } define amdgpu_kernel void @fsub_f16() #0 { ; FASTF16-LABEL: 'fsub_f16' -; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef -; FASTF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> undef, undef +; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> poison, poison +; FASTF16-NEXT: Cost Model: Found an estimated cost of 48 for 
instruction: %v17f16 = fsub <17 x half> poison, poison ; FASTF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; SLOWF64-LABEL: 'fsub_f16' -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> undef, undef -; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> undef, undef +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> poison, poison +; SLOWF64-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> poison, poison ; SLOWF64-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void ; ; FASTF16-SIZE-LABEL: 'fsub_f16' -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> undef, undef -; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> undef, undef +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16 = fsub <2 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f16 = fsub <3 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fsub <4 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5f16 = fsub <5 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16f16 = fsub <16 x half> poison, poison +; FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17f16 = fsub <17 x half> poison, poison ; 
FASTF16-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLOWF64-SIZE-LABEL: 'fsub_f16' -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> undef, undef -; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> undef, undef +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = fsub half poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fsub <2 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v3f16 = fsub <3 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = fsub <4 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v5f16 = fsub <5 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16 = fsub <16 x half> poison, poison +; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v17f16 = fsub <17 x half> poison, poison ; SLOWF64-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; - %f16 = fsub half undef, undef - %v2f16 = fsub <2 x half> undef, undef - %v3f16 = fsub <3 x half> undef, undef - %v4f16 = fsub <4 x half> undef, undef - %v5f16 = fsub <5 x half> undef, undef - %v16f16 = fsub <16 x half> undef, undef - %v17f16 = fsub <17 x half> undef, undef + %f16 = fsub half poison, poison + %v2f16 = fsub <2 x half> poison, poison + %v3f16 = fsub <3 x half> poison, poison + %v4f16 = fsub <4 x half> poison, poison + %v5f16 = fsub <5 x half> poison, poison + %v16f16 = fsub <16 x half> poison, poison + %v17f16 = fsub <17 x half> poison, poison + ret void +} + +define amdgpu_kernel void @fsub_bf16() #0 { +; GFX1250-LABEL: 'fsub_bf16' +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = fsub bfloat poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> poison, poison +; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void +; +; GFX1250-SIZE-LABEL: 'fsub_bf16' +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %bf16 = fsub bfloat poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = fsub <2 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = fsub <3 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = fsub <4 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = fsub <5 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = fsub <16 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = fsub <17 x bfloat> poison, poison +; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %bf16 = fsub bfloat poison, poison + %v2bf16 = fsub <2 x bfloat> poison, poison + %v3bf16 = fsub <3 x bfloat> poison, poison + %v4bf16 = fsub <4 x bfloat> poison, poison + %v5bf16 = fsub <5 x bfloat> poison, poison + %v16bf16 = fsub <16 x bfloat> poison, poison + %v17bf16 = fsub <17 x bfloat> poison, poison ret void } diff --git a/llvm/test/Analysis/GlobalsModRef/nonescaping-noalias.ll b/llvm/test/Analysis/GlobalsModRef/nonescaping-noalias.ll index eed93cf0df8ef..e2eb4f6e7b9e9 100644 --- a/llvm/test/Analysis/GlobalsModRef/nonescaping-noalias.ll +++ b/llvm/test/Analysis/GlobalsModRef/nonescaping-noalias.ll @@ -62,7 +62,7 @@ define ptr @test1_tls_noopt(ptr %coro, ptr %param) presplitcoroutine { ; CHECK-NEXT: store i32 [[V]], ptr [[PARAM]], align 4 ; CHECK-NEXT: ret ptr [[CORO]] ; CHECK: suspend: -; CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.coro.end(ptr [[CORO]], i1 false, token none) +; CHECK-NEXT: call void @llvm.coro.end(ptr [[CORO]], i1 false, token none) ; CHECK-NEXT: ret ptr [[CORO]] ; entry: @@ -79,7 +79,7 @@ resume: ret ptr %coro suspend: - call i1 @llvm.coro.end(ptr %coro, i1 0, token none) + call void @llvm.coro.end(ptr %coro, i1 0, token none) ret ptr %coro } diff --git a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll index bf140c7fa216a..b1fe7b1b2b7ee 100644 --- a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll +++ b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll @@ -574,5 +574,164 @@ exit: ret void } +define void @test_ptr_aligned_by_2_and_4_via_assumption(ptr %start, ptr %end) { +; CHECK-LABEL: 'test_ptr_aligned_by_2_and_4_via_assumption' +; CHECK-NEXT: Classifying expressions for: @test_ptr_aligned_by_2_and_4_via_assumption +; CHECK-NEXT: %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {%start,+,4}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 4 +; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_ptr_aligned_by_2_and_4_via_assumption +; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 4611686018427387903 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0 +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 2) ] + call void @llvm.assume(i1 true) [ "align"(ptr %end, i64 4) ] + br label %loop + +loop: + %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ] + store ptr %iv, ptr %iv + %iv.next = getelementptr i8, ptr %iv, i64 4 + %ec = icmp ne ptr %iv.next, %end + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +define void @test_ptrs_aligned_by_4_via_assumption(ptr %start, ptr %end) { +; CHECK-LABEL: 'test_ptrs_aligned_by_4_via_assumption' +; CHECK-NEXT: Classifying expressions for: @test_ptrs_aligned_by_4_via_assumption +; CHECK-NEXT: %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {%start,+,4}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 4 +; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_ptrs_aligned_by_4_via_assumption +; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 4611686018427387903 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0 +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 4) ] + call void @llvm.assume(i1 true) [ "align"(ptr %end, i64 4) ] + br label %loop + +loop: + %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ] + store ptr %iv, ptr %iv + %iv.next = getelementptr i8, ptr %iv, i64 4 + %ec = icmp ne ptr %iv.next, %end + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +define void @test_ptrs_aligned_by_8_via_assumption(ptr %start, ptr %end) { +; CHECK-LABEL: 'test_ptrs_aligned_by_8_via_assumption' +; CHECK-NEXT: Classifying expressions for: @test_ptrs_aligned_by_8_via_assumption +; CHECK-NEXT: %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {%start,+,4}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 4 +; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_ptrs_aligned_by_8_via_assumption +; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 4611686018427387903 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0 +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 8) ] + call void @llvm.assume(i1 true) [ "align"(ptr %end, i64 8) ] + br label %loop + +loop: + %iv = phi ptr [ %start, %entry ], [ %iv.next, %loop ] + store ptr %iv, ptr %iv + %iv.next = getelementptr i8, ptr %iv, i64 4 + %ec = icmp ne ptr %iv.next, %end + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +declare i1 @cond() + +define void @test_ptr_aligned_by_4_via_assumption_multiple_loop_predecessors(ptr %start, ptr %end) { +; CHECK-LABEL: 'test_ptr_aligned_by_4_via_assumption_multiple_loop_predecessors' +; CHECK-NEXT: Classifying expressions for: @test_ptr_aligned_by_4_via_assumption_multiple_loop_predecessors +; CHECK-NEXT: %c = call i1 @cond() +; CHECK-NEXT: --> %c U: full-set S: full-set +; CHECK-NEXT: %iv = phi ptr [ %start, %then ], [ %start, %else ], [ %iv.next, %loop ] +; CHECK-NEXT: --> {%start,+,4}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv.next = getelementptr i8, ptr %iv, i64 4 +; CHECK-NEXT: --> {(4 + %start),+,4}<%loop> U: full-set S: full-set Exits: <> LoopDispositions: { %loop: Computable } +; CHECK-NEXT: Determining loop execution counts for: @test_ptr_aligned_by_4_via_assumption_multiple_loop_predecessors +; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %loop: Unpredictable symbolic max backedge-taken count. 
+; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 4611686018427387903 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0 +; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((-4 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) /u 4) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: Equal predicate: (zext i2 ((trunc i64 (ptrtoint ptr %end to i64) to i2) + (-1 * (trunc i64 (ptrtoint ptr %start to i64) to i2))) to i64) == 0 +; +entry: + call void @llvm.assume(i1 true) [ "align"(ptr %start, i64 2) ] + call void @llvm.assume(i1 true) [ "align"(ptr %end, i64 4) ] + %c = call i1 @cond() + br i1 %c, label %then, label %else + +then: + br label %loop + +else: + br label %loop + +loop: + %iv = phi ptr [ %start, %then] , [ %start, %else ], [ %iv.next, %loop ] + store ptr %iv, ptr %iv + %iv.next = getelementptr i8, ptr %iv, i64 4 + %ec = icmp ne ptr %iv.next, %end + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + declare void @llvm.assume(i1) declare void @llvm.experimental.guard(i1, ...) diff --git a/llvm/test/Assembler/auto_upgrade_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_intrinsics.ll index 37cb49650f6bd..64d4a3ba7c802 100644 --- a/llvm/test/Assembler/auto_upgrade_intrinsics.ll +++ b/llvm/test/Assembler/auto_upgrade_intrinsics.ll @@ -47,11 +47,11 @@ entry: ret void } -declare i1 @llvm.coro.end(ptr, i1) +declare void @llvm.coro.end(ptr, i1) define void @test.coro.end(ptr %ptr) { ; CHECK-LABEL: @test.coro.end( -; CHECK: call i1 @llvm.coro.end(ptr %ptr, i1 false, token none) - call i1 @llvm.coro.end(ptr %ptr, i1 false) +; CHECK: call void @llvm.coro.end(ptr %ptr, i1 false, token none) + call void @llvm.coro.end(ptr %ptr, i1 false) ret void } diff --git a/llvm/test/Bitcode/operand-bundles-bc-analyzer.ll b/llvm/test/Bitcode/operand-bundles-bc-analyzer.ll index d860104b9cb3d..5628e17b4936e 100644 --- a/llvm/test/Bitcode/operand-bundles-bc-analyzer.ll +++ b/llvm/test/Bitcode/operand-bundles-bc-analyzer.ll @@ -13,6 +13,7 @@ ; CHECK-NEXT: ) +define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %scale.coerce) { +; CHECK-SD-LABEL: cmplx_mul_combined_re_im: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: lsr x9, x0, #16 +; CHECK-SD-NEXT: adrp x8, .LCPI14_0 +; CHECK-SD-NEXT: dup v4.8h, w0 +; CHECK-SD-NEXT: dup v1.8h, w9 +; CHECK-SD-NEXT: fmov s3, w9 +; CHECK-SD-NEXT: sqneg v2.8h, v1.8h +; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] +; CHECK-SD-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v1.16b +; CHECK-SD-NEXT: rev32 v2.8h, v0.8h +; CHECK-SD-NEXT: sqdmull v3.4s, v0.4h, v4.4h +; CHECK-SD-NEXT: sqdmull2 v0.4s, v0.8h, v4.8h +; CHECK-SD-NEXT: sqdmlal v3.4s, v2.4h, v1.4h +; CHECK-SD-NEXT: sqdmlal2 v0.4s, v2.8h, v1.8h +; CHECK-SD-NEXT: uzp2 v0.8h, v3.8h, v0.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: cmplx_mul_combined_re_im: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: lsr w9, w0, #16 +; CHECK-GI-NEXT: adrp x8, .LCPI14_0 +; CHECK-GI-NEXT: rev32 v4.8h, v0.8h +; CHECK-GI-NEXT: dup v1.8h, w9 +; CHECK-GI-NEXT: fmov s3, w9 +; CHECK-GI-NEXT: sqneg v2.8h, v1.8h 
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] +; CHECK-GI-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v1.16b +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: dup v3.8h, w0 +; CHECK-GI-NEXT: sqdmull v2.4s, v2.4h, v3.4h +; CHECK-GI-NEXT: sqdmull v5.4s, v4.4h, v1.4h +; CHECK-GI-NEXT: sqdmlal v5.4s, v0.4h, v3.4h +; CHECK-GI-NEXT: sqdmlal2 v2.4s, v4.8h, v1.8h +; CHECK-GI-NEXT: uzp2 v0.8h, v5.8h, v2.8h +; CHECK-GI-NEXT: ret +entry: + %scale.sroa.0.0.extract.trunc = trunc i64 %scale.coerce to i16 + %scale.sroa.2.0.extract.shift23 = lshr i64 %scale.coerce, 16 + %scale.sroa.2.0.extract.trunc = trunc i64 %scale.sroa.2.0.extract.shift23 to i16 + %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> poison, <8 x i32> + %vecinit.i24 = insertelement <8 x i16> poison, i16 %scale.sroa.0.0.extract.trunc, i64 0 + %vecinit.i = insertelement <8 x i16> poison, i16 %scale.sroa.2.0.extract.trunc, i64 0 + %vecinit7.i = shufflevector <8 x i16> %vecinit.i, <8 x i16> poison, <8 x i32> zeroinitializer + %vqnegq_v1.i = tail call noundef <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %vecinit7.i) + %vbsl5.i = shufflevector <8 x i16> %vqnegq_v1.i, <8 x i16> %vecinit.i, <8 x i32> + %shuffle.i40 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> + %shuffle.i39 = shufflevector <8 x i16> %vecinit.i24, <8 x i16> poison, <4 x i32> zeroinitializer + %vqdmull_v2.i36 = tail call noundef <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i40, <4 x i16> %shuffle.i39) + %shuffle.i44 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> + %vqdmull_v2.i = tail call noundef <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i44, <4 x i16> %shuffle.i39) + %shuffle.i38 = shufflevector <8 x i16> %shuffle.i, <8 x i16> poison, <4 x i32> + %shuffle.i37 = shufflevector <8 x i16> %vbsl5.i, <8 x i16> poison, <4 x i32> + %vqdmlal2.i45 = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i38, <4 x i16> %shuffle.i37) + %vqdmlal_v3.i46 = tail call noundef <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %vqdmull_v2.i36, <4 x i32> %vqdmlal2.i45) + %shuffle.i42 = shufflevector <8 x i16> %shuffle.i, <8 x i16> poison, <4 x i32> + %shuffle.i41 = shufflevector <8 x i16> %vbsl5.i, <8 x i16> poison, <4 x i32> + %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i42, <4 x i16> %shuffle.i41) + %vqdmlal_v3.i = tail call noundef <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %vqdmull_v2.i, <4 x i32> %vqdmlal2.i) + %0 = bitcast <4 x i32> %vqdmlal_v3.i46 to <8 x i16> + %1 = bitcast <4 x i32> %vqdmlal_v3.i to <8 x i16> + %shuffle.i35 = shufflevector <8 x i16> %0, <8 x i16> %1, <8 x i32> + ret <8 x i16> %shuffle.i35 +} + ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/abdu-neg.ll b/llvm/test/CodeGen/AArch64/abdu-neg.ll index 79fc12ea76f63..269cbf03f32a0 100644 --- a/llvm/test/CodeGen/AArch64/abdu-neg.ll +++ b/llvm/test/CodeGen/AArch64/abdu-neg.ll @@ -180,13 +180,11 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_ext_i128: ; CHECK: // %bb.0: ; CHECK-NEXT: subs x8, x0, x2 -; CHECK-NEXT: sbcs x9, x1, x3 -; CHECK-NEXT: cset w10, lo -; CHECK-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NEXT: eor x8, x8, x10 -; CHECK-NEXT: eor x9, x9, x10 -; CHECK-NEXT: subs x8, x8, x10 -; CHECK-NEXT: sbc x9, x9, x10 +; CHECK-NEXT: sbc x9, x1, x3 +; CHECK-NEXT: subs x10, x2, x0 +; CHECK-NEXT: sbcs x11, x3, x1 +; CHECK-NEXT: csel x8, x8, x10, lo +; CHECK-NEXT: csel x9, x9, x11, lo ; CHECK-NEXT: negs x0, x8 ; CHECK-NEXT: ngc x1, x9 ; CHECK-NEXT: ret @@ -203,13 +201,11 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_ext_i128_undef: ; CHECK: // %bb.0: ; CHECK-NEXT: subs x8, x0, x2 -; CHECK-NEXT: sbcs x9, x1, x3 -; CHECK-NEXT: cset w10, lo -; CHECK-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NEXT: eor x8, x8, x10 -; CHECK-NEXT: eor x9, x9, x10 -; CHECK-NEXT: subs x8, x8, x10 -; CHECK-NEXT: sbc x9, x9, x10 +; CHECK-NEXT: sbc x9, x1, x3 +; CHECK-NEXT: subs x10, x2, x0 +; CHECK-NEXT: sbcs x11, x3, x1 +; CHECK-NEXT: csel x8, x8, x10, lo +; CHECK-NEXT: csel x9, x9, x11, lo ; CHECK-NEXT: negs x0, x8 ; CHECK-NEXT: ngc x1, x9 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/abdu.ll b/llvm/test/CodeGen/AArch64/abdu.ll index 6db7693fb3a1c..3cbe648788a84 100644 --- a/llvm/test/CodeGen/AArch64/abdu.ll +++ b/llvm/test/CodeGen/AArch64/abdu.ll @@ -169,13 +169,11 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_ext_i128: ; CHECK: // %bb.0: ; CHECK-NEXT: subs x8, x0, x2 -; CHECK-NEXT: sbcs x9, x1, x3 -; CHECK-NEXT: cset w10, lo -; CHECK-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NEXT: eor x8, x8, x10 -; CHECK-NEXT: eor x9, x9, x10 -; CHECK-NEXT: subs x0, x8, x10 -; CHECK-NEXT: sbc x1, x9, x10 +; CHECK-NEXT: sbc x9, x1, x3 +; CHECK-NEXT: subs x10, x2, x0 +; CHECK-NEXT: sbcs x11, x3, x1 +; CHECK-NEXT: csel x0, x8, x10, lo +; CHECK-NEXT: csel x1, x9, x11, lo ; CHECK-NEXT: ret %aext = zext i128 %a to i256 %bext = zext i128 %b to i256 @@ -189,13 +187,11 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_ext_i128_undef: ; CHECK: // %bb.0: ; CHECK-NEXT: subs x8, x0, x2 -; CHECK-NEXT: sbcs x9, x1, x3 -; CHECK-NEXT: cset w10, lo -; CHECK-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NEXT: eor x8, x8, x10 -; CHECK-NEXT: eor x9, x9, x10 -; CHECK-NEXT: subs x0, x8, x10 -; CHECK-NEXT: sbc x1, x9, x10 +; CHECK-NEXT: sbc x9, x1, x3 +; CHECK-NEXT: subs x10, x2, x0 +; CHECK-NEXT: sbcs x11, x3, x1 +; CHECK-NEXT: csel x0, x8, x10, lo +; CHECK-NEXT: csel x1, x9, x11, lo ; CHECK-NEXT: ret %aext = zext i128 %a to i256 %bext = zext i128 %b to i256 @@ -263,13 +259,11 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_minmax_i128: ; CHECK: // %bb.0: ; CHECK-NEXT: subs x8, x0, x2 -; CHECK-NEXT: sbcs x9, x1, x3 -; CHECK-NEXT: cset w10, lo -; CHECK-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NEXT: eor x8, x8, x10 -; CHECK-NEXT: eor x9, x9, x10 -; CHECK-NEXT: subs x0, x8, x10 -; CHECK-NEXT: sbc x1, x9, x10 +; CHECK-NEXT: sbc x9, x1, x3 +; CHECK-NEXT: subs x10, x2, x0 +; CHECK-NEXT: sbcs x11, x3, x1 +; CHECK-NEXT: csel x0, x8, x10, lo +; CHECK-NEXT: csel x1, x9, x11, lo ; CHECK-NEXT: ret %min = call i128 
@llvm.umin.i128(i128 %a, i128 %b) %max = call i128 @llvm.umax.i128(i128 %a, i128 %b) @@ -339,13 +333,11 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_cmp_i128: ; CHECK: // %bb.0: ; CHECK-NEXT: subs x8, x0, x2 -; CHECK-NEXT: sbcs x9, x1, x3 -; CHECK-NEXT: cset w10, lo -; CHECK-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NEXT: eor x8, x8, x10 -; CHECK-NEXT: eor x9, x9, x10 -; CHECK-NEXT: subs x0, x8, x10 -; CHECK-NEXT: sbc x1, x9, x10 +; CHECK-NEXT: sbc x9, x1, x3 +; CHECK-NEXT: subs x10, x2, x0 +; CHECK-NEXT: sbcs x11, x3, x1 +; CHECK-NEXT: csel x0, x8, x10, lo +; CHECK-NEXT: csel x1, x9, x11, lo ; CHECK-NEXT: ret %cmp = icmp uge i128 %a, %b %ab = sub i128 %a, %b @@ -437,13 +429,11 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; CHECK-LABEL: abd_select_i128: ; CHECK: // %bb.0: ; CHECK-NEXT: subs x8, x0, x2 -; CHECK-NEXT: sbcs x9, x1, x3 -; CHECK-NEXT: cset w10, lo -; CHECK-NEXT: sbfx x10, x10, #0, #1 -; CHECK-NEXT: eor x8, x8, x10 -; CHECK-NEXT: eor x9, x9, x10 -; CHECK-NEXT: subs x0, x8, x10 -; CHECK-NEXT: sbc x1, x9, x10 +; CHECK-NEXT: sbc x9, x1, x3 +; CHECK-NEXT: subs x10, x2, x0 +; CHECK-NEXT: sbcs x11, x3, x1 +; CHECK-NEXT: csel x0, x8, x10, lo +; CHECK-NEXT: csel x1, x9, x11, lo ; CHECK-NEXT: ret %cmp = icmp ult i128 %a, %b %ab = select i1 %cmp, i128 %a, i128 %b diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll index 2b9e334cc7812..2b313fa8ce55f 100644 --- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll +++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll @@ -53,18 +53,15 @@ define <4 x half> @uitofp_v4i64_to_v4f16(ptr %ptr) { define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) { ; CHECK-LABEL: uitofp_v4i64_to_v4bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q2, [x0] -; CHECK-NEXT: mov x8, v0.d[1] -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: ucvtf s1, x9 -; CHECK-NEXT: mov x9, v2.d[1] -; CHECK-NEXT: ucvtf s0, x8 -; CHECK-NEXT: fmov x8, d2 -; CHECK-NEXT: ucvtf s2, x8 +; CHECK-NEXT: ldp x8, x9, [x0] +; CHECK-NEXT: movi v2.4s, #127, msl #8 +; CHECK-NEXT: ucvtf s0, x9 +; CHECK-NEXT: ucvtf s1, x8 +; CHECK-NEXT: ldp x8, x9, [x0, #16] ; CHECK-NEXT: mov v1.s[1], v0.s[0] +; CHECK-NEXT: ucvtf s0, x8 +; CHECK-NEXT: mov v1.s[2], v0.s[0] ; CHECK-NEXT: ucvtf s0, x9 -; CHECK-NEXT: mov v1.s[2], v2.s[0] -; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: mov v1.s[3], v0.s[0] ; CHECK-NEXT: movi v0.4s, #1 ; CHECK-NEXT: ushr v3.4s, v1.4s, #16 diff --git a/llvm/test/CodeGen/AArch64/arm64-cvt-simd-intrinsics.ll b/llvm/test/CodeGen/AArch64/arm64-cvt-simd-intrinsics.ll new file mode 100644 index 0000000000000..b1b9fcf8a8b3c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-cvt-simd-intrinsics.ll @@ -0,0 +1,609 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+fprcvt,+fullfp16 | FileCheck %s --check-prefixes=CHECK +; RUN: llc < %s -mtriple aarch64-unknown-unknown -global-isel -mattr=+fprcvt,+fullfp16 | FileCheck %s --check-prefixes=CHECK + + +; +; Intriniscs +; + +define float @fcvtas_1s1d_simd(double %A) nounwind { +; CHECK-LABEL: fcvtas_1s1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtas s0, d0 +; CHECK-NEXT: ret + %i = call i32 @llvm.aarch64.neon.fcvtas.i32.f64(double %A) + %f = bitcast i32 %i to float + ret float %f +} + +define double @fcvtas_1d1s_simd(float %A) nounwind { +; CHECK-LABEL: fcvtas_1d1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtas d0, s0 +; CHECK-NEXT: ret + %i = 
call i64 @llvm.aarch64.neon.fcvtas.i64.f32(float %A) + %d = bitcast i64 %i to double + ret double %d +} + +define float @fcvtas_1s1h_simd(half %a) { +; CHECK-LABEL: fcvtas_1s1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtas s0, h0 +; CHECK-NEXT: ret + %fcvt = tail call i32 @llvm.aarch64.neon.fcvtas.i32.f16(half %a) + %f = bitcast i32 %fcvt to float + ret float %f +} + +define double @fcvtas_1d1h_simd(half %a) { +; CHECK-LABEL: fcvtas_1d1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtas d0, h0 +; CHECK-NEXT: ret + %vcvtah_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtas.i64.f16(half %a) + %d = bitcast i64 %vcvtah_s64_f16 to double + ret double %d +} + +define double @fcvtas_1d1d_simd(double %a) { +; CHECK-LABEL: fcvtas_1d1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtas d0, d0 +; CHECK-NEXT: ret + %vcvtah_s64_f64 = tail call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %a) + %d = bitcast i64 %vcvtah_s64_f64 to double + ret double %d +} + +define float @fcvtas_1s1s_simd(float %a) { +; CHECK-LABEL: fcvtas_1s1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtas s0, s0 +; CHECK-NEXT: ret + %vcvtah_s32_f32 = tail call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %a) + %d = bitcast i32 %vcvtah_s32_f32 to float + ret float %d +} + + +define float @fcvtau_1s1d_simd(double %A) nounwind { +; CHECK-LABEL: fcvtau_1s1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtau s0, d0 +; CHECK-NEXT: ret + %i = call i32 @llvm.aarch64.neon.fcvtau.i32.f64(double %A) + %f = bitcast i32 %i to float + ret float %f +} + +define double @fcvtau_1d1s_simd(float %A) nounwind { +; CHECK-LABEL: fcvtau_1d1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtau d0, s0 +; CHECK-NEXT: ret + %i = call i64 @llvm.aarch64.neon.fcvtau.i64.f32(float %A) + %d = bitcast i64 %i to double + ret double %d +} + +define float @fcvtau_1s1h_simd(half %a) { +; CHECK-LABEL: fcvtau_1s1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtau s0, h0 +; CHECK-NEXT: ret + %fcvt = tail call i32 @llvm.aarch64.neon.fcvtau.i32.f16(half %a) + %f = bitcast i32 %fcvt to float + ret float %f +} + +define double @fcvtau_1d1h_simd(half %a) { +; CHECK-LABEL: fcvtau_1d1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtau d0, h0 +; CHECK-NEXT: ret + %vcvtah_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtau.i64.f16(half %a) + %d = bitcast i64 %vcvtah_s64_f16 to double + ret double %d +} + +define double @fcvtau_1d1d_simd(double %a) { +; CHECK-LABEL: fcvtau_1d1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtau d0, d0 +; CHECK-NEXT: ret + %vcvtah_s64_f64 = tail call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %a) + %d = bitcast i64 %vcvtah_s64_f64 to double + ret double %d +} + +define float @fcvtau_1s1s_simd(float %a) { +; CHECK-LABEL: fcvtau_1s1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtau s0, s0 +; CHECK-NEXT: ret + %vcvtah_s32_f32 = tail call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %a) + %d = bitcast i32 %vcvtah_s32_f32 to float + ret float %d +} + +define float @fcvtms_1s1d_simd(double %A) nounwind { +; CHECK-LABEL: fcvtms_1s1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtms s0, d0 +; CHECK-NEXT: ret + %i = call i32 @llvm.aarch64.neon.fcvtms.i32.f64(double %A) + %f = bitcast i32 %i to float + ret float %f +} + +define double @fcvtms_1d1s_simd(float %A) nounwind { +; CHECK-LABEL: fcvtms_1d1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtms d0, s0 +; CHECK-NEXT: ret + %i = call i64 @llvm.aarch64.neon.fcvtms.i64.f32(float %A) + %d = bitcast i64 %i to double + ret double %d +} + +define float @fcvtms_1s1h_simd(half %a) { +; CHECK-LABEL: fcvtms_1s1h_simd: +; CHECK: // %bb.0: +; 
CHECK-NEXT: fcvtms s0, h0 +; CHECK-NEXT: ret + %fcvt = tail call i32 @llvm.aarch64.neon.fcvtms.i32.f16(half %a) + %f = bitcast i32 %fcvt to float + ret float %f +} + +define double @fcvtms_1d1h_simd(half %a) { +; CHECK-LABEL: fcvtms_1d1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtms d0, h0 +; CHECK-NEXT: ret + %vcvtah_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtms.i64.f16(half %a) + %d = bitcast i64 %vcvtah_s64_f16 to double + ret double %d +} + +define double @fcvtms_1d1d_simd(double %a) { +; CHECK-LABEL: fcvtms_1d1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtms d0, d0 +; CHECK-NEXT: ret + %vcvtah_s64_f64 = tail call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %a) + %d = bitcast i64 %vcvtah_s64_f64 to double + ret double %d +} + +define float @fcvtms_1s1s_simd(float %a) { +; CHECK-LABEL: fcvtms_1s1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtms s0, s0 +; CHECK-NEXT: ret + %vcvtah_s32_f32 = tail call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %a) + %d = bitcast i32 %vcvtah_s32_f32 to float + ret float %d +} + +define float @fcvtmu_1s1d_simd(double %A) nounwind { +; CHECK-LABEL: fcvtmu_1s1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtmu s0, d0 +; CHECK-NEXT: ret + %i = call i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double %A) + %f = bitcast i32 %i to float + ret float %f +} + +define double @fcvtmu_1d1s_simd(float %A) nounwind { +; CHECK-LABEL: fcvtmu_1d1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtmu d0, s0 +; CHECK-NEXT: ret + %i = call i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float %A) + %d = bitcast i64 %i to double + ret double %d +} + +define float @fcvtmu_1s1h_simd(half %a) { +; CHECK-LABEL: fcvtmu_1s1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtmu s0, h0 +; CHECK-NEXT: ret + %fcvt = tail call i32 @llvm.aarch64.neon.fcvtmu.i32.f16(half %a) + %f = bitcast i32 %fcvt to float + ret float %f +} + +define double @fcvtmu_1d1h_simd(half %a) { +; CHECK-LABEL: fcvtmu_1d1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtmu d0, h0 +; CHECK-NEXT: ret + %vcvtah_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtmu.i64.f16(half %a) + %d = bitcast i64 %vcvtah_s64_f16 to double + ret double %d +} + +define double @fcvtmu_1d1d_simd(double %a) { +; CHECK-LABEL: fcvtmu_1d1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtmu d0, d0 +; CHECK-NEXT: ret + %vcvtah_s64_f64 = tail call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %a) + %d = bitcast i64 %vcvtah_s64_f64 to double + ret double %d +} + +define float @fcvtmu_1s1s_simd(float %a) { +; CHECK-LABEL: fcvtmu_1s1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtmu s0, s0 +; CHECK-NEXT: ret + %vcvtah_s32_f32 = tail call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %a) + %d = bitcast i32 %vcvtah_s32_f32 to float + ret float %d +} + +define float @fcvtns_1s1d_simd(double %A) nounwind { +; CHECK-LABEL: fcvtns_1s1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtns s0, d0 +; CHECK-NEXT: ret + %i = call i32 @llvm.aarch64.neon.fcvtns.i32.f64(double %A) + %f = bitcast i32 %i to float + ret float %f +} + +define double @fcvtns_1d1s_simd(float %A) nounwind { +; CHECK-LABEL: fcvtns_1d1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtns d0, s0 +; CHECK-NEXT: ret + %i = call i64 @llvm.aarch64.neon.fcvtns.i64.f32(float %A) + %d = bitcast i64 %i to double + ret double %d +} + +define float @fcvtns_1s1h_simd(half %a) { +; CHECK-LABEL: fcvtns_1s1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtns s0, h0 +; CHECK-NEXT: ret + %fcvt = tail call i32 @llvm.aarch64.neon.fcvtns.i32.f16(half %a) + %f = bitcast i32 %fcvt to float + ret float %f +} + +define double @fcvtns_1d1h_simd(half %a) 
{ +; CHECK-LABEL: fcvtns_1d1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtns d0, h0 +; CHECK-NEXT: ret + %vcvtah_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtns.i64.f16(half %a) + %d = bitcast i64 %vcvtah_s64_f16 to double + ret double %d +} + +define double @fcvtns_1d1d_simd(double %a) { +; CHECK-LABEL: fcvtns_1d1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtns d0, d0 +; CHECK-NEXT: ret + %vcvtah_s64_f64 = tail call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %a) + %d = bitcast i64 %vcvtah_s64_f64 to double + ret double %d +} + +define float @fcvtns_1s1s_simd(float %a) { +; CHECK-LABEL: fcvtns_1s1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtns s0, s0 +; CHECK-NEXT: ret + %vcvtah_s32_f32 = tail call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %a) + %d = bitcast i32 %vcvtah_s32_f32 to float + ret float %d +} + +define float @fcvtnu_1s1d_simd(double %A) nounwind { +; CHECK-LABEL: fcvtnu_1s1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtnu s0, d0 +; CHECK-NEXT: ret + %i = call i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double %A) + %f = bitcast i32 %i to float + ret float %f +} + +define double @fcvtnu_1d1s_simd(float %A) nounwind { +; CHECK-LABEL: fcvtnu_1d1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtnu d0, s0 +; CHECK-NEXT: ret + %i = call i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float %A) + %d = bitcast i64 %i to double + ret double %d +} + +define float @fcvtnu_1s1h_simd(half %a) { +; CHECK-LABEL: fcvtnu_1s1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtnu s0, h0 +; CHECK-NEXT: ret + %fcvt = tail call i32 @llvm.aarch64.neon.fcvtnu.i32.f16(half %a) + %f = bitcast i32 %fcvt to float + ret float %f +} + +define double @fcvtnu_1d1h_simd(half %a) { +; CHECK-LABEL: fcvtnu_1d1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtnu d0, h0 +; CHECK-NEXT: ret + %vcvtah_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtnu.i64.f16(half %a) + %d = bitcast i64 %vcvtah_s64_f16 to double + ret double %d +} + +define double @fcvtnu_1d1d_simd(double %a) { +; CHECK-LABEL: fcvtnu_1d1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtnu d0, d0 +; CHECK-NEXT: ret + %vcvtah_s64_f64 = tail call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double %a) + %d = bitcast i64 %vcvtah_s64_f64 to double + ret double %d +} + +define float @fcvtnu_1s1s_simd(float %a) { +; CHECK-LABEL: fcvtnu_1s1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtnu s0, s0 +; CHECK-NEXT: ret + %vcvtah_s32_f32 = tail call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %a) + %d = bitcast i32 %vcvtah_s32_f32 to float + ret float %d +} + +define float @fcvtps_1s1d_simd(double %A) nounwind { +; CHECK-LABEL: fcvtps_1s1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtps s0, d0 +; CHECK-NEXT: ret + %i = call i32 @llvm.aarch64.neon.fcvtps.i32.f64(double %A) + %f = bitcast i32 %i to float + ret float %f +} + +define double @fcvtps_1d1s_simd(float %A) nounwind { +; CHECK-LABEL: fcvtps_1d1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtps d0, s0 +; CHECK-NEXT: ret + %i = call i64 @llvm.aarch64.neon.fcvtps.i64.f32(float %A) + %d = bitcast i64 %i to double + ret double %d +} + +define float @fcvtps_1s1h_simd(half %a) { +; CHECK-LABEL: fcvtps_1s1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtps s0, h0 +; CHECK-NEXT: ret + %fcvt = tail call i32 @llvm.aarch64.neon.fcvtps.i32.f16(half %a) + %f = bitcast i32 %fcvt to float + ret float %f +} + +define double @fcvtps_1d1h_simd(half %a) { +; CHECK-LABEL: fcvtps_1d1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtps d0, h0 +; CHECK-NEXT: ret + %vcvtah_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtps.i64.f16(half %a) + %d = bitcast i64 
%vcvtah_s64_f16 to double + ret double %d +} + +define double @fcvtps_1d1d_simd(double %a) { +; CHECK-LABEL: fcvtps_1d1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtps d0, d0 +; CHECK-NEXT: ret + %vcvtah_s64_f64 = tail call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %a) + %d = bitcast i64 %vcvtah_s64_f64 to double + ret double %d +} + +define float @fcvtps_1s1s_simd(float %a) { +; CHECK-LABEL: fcvtps_1s1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtps s0, s0 +; CHECK-NEXT: ret + %vcvtah_s32_f32 = tail call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %a) + %d = bitcast i32 %vcvtah_s32_f32 to float + ret float %d +} + +define float @fcvtpu_1s1d_simd(double %A) nounwind { +; CHECK-LABEL: fcvtpu_1s1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtpu s0, d0 +; CHECK-NEXT: ret + %i = call i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double %A) + %f = bitcast i32 %i to float + ret float %f +} + +define double @fcvtpu_1d1s_simd(float %A) nounwind { +; CHECK-LABEL: fcvtpu_1d1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtpu d0, s0 +; CHECK-NEXT: ret + %i = call i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float %A) + %d = bitcast i64 %i to double + ret double %d +} + +define float @fcvtpu_1s1h_simd(half %a) { +; CHECK-LABEL: fcvtpu_1s1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtpu s0, h0 +; CHECK-NEXT: ret + %fcvt = tail call i32 @llvm.aarch64.neon.fcvtpu.i32.f16(half %a) + %f = bitcast i32 %fcvt to float + ret float %f +} + +define double @fcvtpu_1d1h_simd(half %a) { +; CHECK-LABEL: fcvtpu_1d1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtpu d0, h0 +; CHECK-NEXT: ret + %vcvtah_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtpu.i64.f16(half %a) + %d = bitcast i64 %vcvtah_s64_f16 to double + ret double %d +} + +define double @fcvtpu_1d1d_simd(double %a) { +; CHECK-LABEL: fcvtpu_1d1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtpu d0, d0 +; CHECK-NEXT: ret + %vcvtah_s64_f64 = tail call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %a) + %d = bitcast i64 %vcvtah_s64_f64 to double + ret double %d +} + +define float @fcvtpu_1s1s_simd(float %a) { +; CHECK-LABEL: fcvtpu_1s1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtpu s0, s0 +; CHECK-NEXT: ret + %vcvtah_s32_f32 = tail call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %a) + %d = bitcast i32 %vcvtah_s32_f32 to float + ret float %d +} + +define float @fcvtzs_1s1d_simd(double %A) nounwind { +; CHECK-LABEL: fcvtzs_1s1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs s0, d0 +; CHECK-NEXT: ret + %i = call i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double %A) + %f = bitcast i32 %i to float + ret float %f +} + +define double @fcvtzs_1d1s_simd(float %A) nounwind { +; CHECK-LABEL: fcvtzs_1d1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs d0, s0 +; CHECK-NEXT: ret + %i = call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %A) + %d = bitcast i64 %i to double + ret double %d +} + +define float @fcvtzs_1s1h_simd(half %a) { +; CHECK-LABEL: fcvtzs_1s1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs s0, h0 +; CHECK-NEXT: ret + %fcvt = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f16(half %a) + %f = bitcast i32 %fcvt to float + ret float %f +} + +define double @fcvtzs_1d1h_simd(half %a) { +; CHECK-LABEL: fcvtzs_1d1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs d0, h0 +; CHECK-NEXT: ret + %vcvtah_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtzs.i64.f16(half %a) + %d = bitcast i64 %vcvtah_s64_f16 to double + ret double %d +} + +define double @fcvtzs_1d1d_simd(double %a) { +; CHECK-LABEL: fcvtzs_1d1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs d0, d0 +; CHECK-NEXT: ret + 
%vcvtah_s64_f64 = tail call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double %a) + %d = bitcast i64 %vcvtah_s64_f64 to double + ret double %d +} + +define float @fcvtzs_1s1s_simd(float %a) { +; CHECK-LABEL: fcvtzs_1s1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs s0, s0 +; CHECK-NEXT: ret + %vcvtah_s32_f32 = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a) + %d = bitcast i32 %vcvtah_s32_f32 to float + ret float %d +} + +define float @fcvtzu_1s1d_simd(double %A) nounwind { +; CHECK-LABEL: fcvtzu_1s1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu s0, d0 +; CHECK-NEXT: ret + %i = call i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double %A) + %f = bitcast i32 %i to float + ret float %f +} + +define double @fcvtzu_1d1s_simd(float %A) nounwind { +; CHECK-LABEL: fcvtzu_1d1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu d0, s0 +; CHECK-NEXT: ret + %i = call i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float %A) + %d = bitcast i64 %i to double + ret double %d +} + +define float @fcvtzu_1s1h_simd(half %a) { +; CHECK-LABEL: fcvtzu_1s1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu s0, h0 +; CHECK-NEXT: ret + %fcvt = tail call i32 @llvm.aarch64.neon.fcvtzu.i32.f16(half %a) + %f = bitcast i32 %fcvt to float + ret float %f +} + +define double @fcvtzu_1d1h_simd(half %a) { +; CHECK-LABEL: fcvtzu_1d1h_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu d0, h0 +; CHECK-NEXT: ret + %vcvtah_s64_f16 = tail call i64 @llvm.aarch64.neon.fcvtzu.i64.f16(half %a) + %d = bitcast i64 %vcvtah_s64_f16 to double + ret double %d +} + +define double @fcvtzu_1d1d_simd(double %a) { +; CHECK-LABEL: fcvtzu_1d1d_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu d0, d0 +; CHECK-NEXT: ret + %vcvtah_s64_f64 = tail call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double %a) + %d = bitcast i64 %vcvtah_s64_f64 to double + ret double %d +} + +define float @fcvtzu_1s1s_simd(float %a) { +; CHECK-LABEL: fcvtzu_1s1s_simd: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu s0, s0 +; CHECK-NEXT: ret + %vcvtah_s32_f32 = tail call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float %a) + %d = bitcast i32 %vcvtah_s32_f32 to float + ret float %d +} diff --git a/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll b/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll index 59f887a1143c0..a93203793307a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll +++ b/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll @@ -4,10 +4,8 @@ define i32 @foo(ptr %__a) nounwind { ; CHECK-LABEL: foo: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: umov.h w8, v0[0] -; CHECK-NEXT: umov.h w9, v0[0] -; CHECK-NEXT: add w0, w9, w8, uxth #1 +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: add w0, w8, w8, lsl #1 ; CHECK-NEXT: ret %tmp18 = load <4 x i16>, ptr %__a, align 8 %vget_lane = extractelement <4 x i16> %tmp18, i32 0 diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll index 114203e46f196..13093cb2204ce 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll @@ -105,13 +105,13 @@ define i32 @ldr_int_volatile(ptr %a) nounwind { ; CHECK: Cluster ld/st SU(1) - SU(3) ; CHECK: SU(1): %{{[0-9]+}}:fpr128 = LDRQui ; CHECK: SU(3): %{{[0-9]+}}:fpr128 = LDRQui -define <2 x i64> @ldq_cluster(ptr %p) { - %tmp1 = load <2 x i64>, < 2 x i64>* %p, align 8 +define <4 x i32> @ldq_cluster(ptr %p) { + %tmp1 = load <4 x i32>, ptr %p, align 8 %add.ptr2 = getelementptr inbounds i64, ptr %p, i64 2 - %tmp2 = add nsw <2 x i64> %tmp1, %tmp1 - %tmp3 = load <2 x i64>, ptr %add.ptr2, 
align 8 - %res = mul nsw <2 x i64> %tmp2, %tmp3 - ret <2 x i64> %res + %tmp2 = add nsw <4 x i32> %tmp1, %tmp1 + %tmp3 = load <4 x i32>, ptr %add.ptr2, align 8 + %res = mul nsw <4 x i32> %tmp2, %tmp3 + ret <4 x i32> %res } ; CHECK: ********** MI Scheduling ********** @@ -215,7 +215,7 @@ exit: ; CHECK: ********** MI Scheduling ********** ; CHECK: LDURXi_LDRXui:%bb.0 entry ; CHECK: Cluster ld/st SU(3) - SU(4) -; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDURXi +; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDURXi ; CHECK: SU(4): %{{[0-9]+}}:gpr64 = LDRXui ; define void @LDURXi_LDRXui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) { diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll index 60fcb643fb9f4..627d31f9a64fc 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll @@ -1,15 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for fcvtas_1d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtau_1d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtms_1d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtmu_1d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtps_1d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtpu_1d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtns_1d -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fcvtnu_1d +; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <2 x i32> @fcvtas_2s(<2 x float> %A) nounwind { ; CHECK-LABEL: fcvtas_2s: @@ -405,16 +396,10 @@ define <2 x i64> @fcvtzs_2d_intrinsic(<2 x double> %A) nounwind { } define <1 x i64> @fcvtzs_1d_intrinsic(<1 x double> %A) nounwind { -; CHECK-SD-LABEL: fcvtzs_1d_intrinsic: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fcvtzs d0, d0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fcvtzs_1d_intrinsic: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcvtzs x8, d0 -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fcvtzs_1d_intrinsic: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzs d0, d0 +; CHECK-NEXT: ret %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtzs.v1i64.v1f64(<1 x double> %A) ret <1 x i64> %tmp3 } @@ -490,16 +475,10 @@ define <2 x i64> @fcvtzu_2d_intrinsic(<2 x double> %A) nounwind { } define <1 x i64> @fcvtzu_1d_intrinsic(<1 x double> %A) nounwind { -; CHECK-SD-LABEL: fcvtzu_1d_intrinsic: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: fcvtzu d0, d0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fcvtzu_1d_intrinsic: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcvtzu x8, d0 -; CHECK-GI-NEXT: fmov d0, x8 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fcvtzu_1d_intrinsic: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtzu d0, d0 +; CHECK-NEXT: ret %tmp3 = call <1 x i64> @llvm.aarch64.neon.fcvtzu.v1i64.v1f64(<1 x double> %A) ret <1 x i64> %tmp3 } diff --git a/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll b/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll index 1c216e7357215..e371748a43b29 100644 --- 
a/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bf16-vector-bitcast.ll @@ -11,6 +11,16 @@ entry: ret <4 x i16> %1 } +define <4 x half> @v4bf16_to_v4f16(float, <4 x bfloat> %a) nounwind { +; CHECK-LABEL: v4bf16_to_v4f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: ret +entry: + %1 = bitcast <4 x bfloat> %a to <4 x half> + ret <4 x half> %1 +} + define <2 x i32> @v4bf16_to_v2i32(float, <4 x bfloat> %a) nounwind { ; CHECK-LABEL: v4bf16_to_v2i32: ; CHECK: // %bb.0: // %entry @@ -82,6 +92,16 @@ entry: ret <4 x bfloat> %1 } +define <4 x bfloat> @v4f16_to_v4bf16(float, <4 x half> %a) nounwind { +; CHECK-LABEL: v4f16_to_v4bf16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: ret +entry: + %1 = bitcast <4 x half> %a to <4 x bfloat> + ret <4 x bfloat> %1 +} + define <4 x bfloat> @v2i32_to_v4bf16(float, <2 x i32> %a) nounwind { ; CHECK-LABEL: v2i32_to_v4bf16: ; CHECK: // %bb.0: // %entry @@ -152,6 +172,16 @@ entry: ret <8 x i16> %1 } +define <8 x half> @v8bf16_to_v8f16(float, <8 x bfloat> %a) nounwind { +; CHECK-LABEL: v8bf16_to_v8f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %1 = bitcast <8 x bfloat> %a to <8 x half> + ret <8 x half> %1 +} + define <4 x i32> @v8bf16_to_v4i32(float, <8 x bfloat> %a) nounwind { ; CHECK-LABEL: v8bf16_to_v4i32: ; CHECK: // %bb.0: // %entry @@ -202,6 +232,16 @@ entry: ret <8 x bfloat> %1 } +define <8 x bfloat> @v8f16_to_v8bf16(float, <8 x half> %a) nounwind { +; CHECK-LABEL: v8f16_to_v8bf16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %1 = bitcast <8 x half> %a to <8 x bfloat> + ret <8 x bfloat> %1 +} + define <8 x bfloat> @v4i32_to_v8bf16(float, <4 x i32> %a) nounwind { ; CHECK-LABEL: v4i32_to_v8bf16: ; CHECK: // %bb.0: // %entry diff --git a/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll b/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll index 0960c4c2a3342..a56d5b1b49b38 100644 --- a/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll +++ b/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll @@ -78,9 +78,8 @@ B: define i32 @g_i8_sign_extend_inreg(i8 %in, i32 %a, i32 %b) nounwind { ; CHECK-LABEL: g_i8_sign_extend_inreg: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sxtb w8, w0 -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w8, w1, w2, mi +; CHECK-NEXT: tst w0, #0x80 +; CHECK-NEXT: csel w8, w1, w2, ne ; CHECK-NEXT: add w0, w8, w0, uxtb ; CHECK-NEXT: ret entry: @@ -100,9 +99,8 @@ B: define i32 @g_i16_sign_extend_inreg(i16 %in, i32 %a, i32 %b) nounwind { ; CHECK-LABEL: g_i16_sign_extend_inreg: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sxth w8, w0 -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w8, w1, w2, mi +; CHECK-NEXT: tst w0, #0x8000 +; CHECK-NEXT: csel w8, w1, w2, ne ; CHECK-NEXT: add w0, w8, w0, uxth ; CHECK-NEXT: ret entry: @@ -167,10 +165,8 @@ B: define i64 @g_i32_sign_extend_i64(i32 %in, i64 %a, i64 %b) nounwind { ; CHECK-LABEL: g_i32_sign_extend_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: sxtw x8, w0 -; CHECK-NEXT: cmp x8, #0 -; CHECK-NEXT: csel x8, x1, x2, mi +; CHECK-NEXT: tst w0, #0x80000000 +; CHECK-NEXT: csel x8, x1, x2, ne ; CHECK-NEXT: add x0, x8, w0, uxtw ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll index baca159f9dd55..02dfaa19acc9d 100644 --- 
a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll @@ -4,11 +4,9 @@ define void @autogen_SD19655(ptr %addr, ptr %addrfloat) { ; CHECK-LABEL: autogen_SD19655: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: mov.d x8, v0[1] -; CHECK-NEXT: fmov x9, d0 -; CHECK-NEXT: scvtf s1, x9 -; CHECK-NEXT: scvtf s0, x8 +; CHECK-NEXT: ldp x8, x9, [x0] +; CHECK-NEXT: scvtf s0, x9 +; CHECK-NEXT: scvtf s1, x8 ; CHECK-NEXT: mov.s v1[1], v0[0] ; CHECK-NEXT: str d1, [x1] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/debug-info-sve-pair.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-pair.mir new file mode 100644 index 0000000000000..113f343bac73e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/debug-info-sve-pair.mir @@ -0,0 +1,344 @@ +# RUN: llc -start-before=aarch64-asm-printer -o - %s | FileCheck %s + +# Check that z30_z31 debug info does not crash. + +# CHECK: .Ldebug_loc0: +# CHECK: .byte 4 // DW_LLE_offset_pair +# CHECK: .uleb128 .Ltmp2-.Lfunc_begin0 // starting offset +# CHECK: .uleb128 .Ltmp3-.Lfunc_begin0 // ending offset +# CHECK: .byte 2 // Loc expr size +# CHECK: .byte 144 // DW_OP_regx +# CHECK: .byte 126 // 126 +# CHECK: .byte 4 // DW_LLE_offset_pair +# CHECK: .uleb128 .Ltmp3-.Lfunc_begin0 // starting offset +# CHECK: .uleb128 .Lfunc_end0-.Lfunc_begin0 // ending offset +# CHECK: .byte 6 // Loc expr size +# CHECK: .byte 144 // sub-register DW_OP_regx +# CHECK: .byte 94 // 94 +# CHECK: .byte 147 // DW_OP_piece +# CHECK: .byte 16 // 16 +# CHECK: .byte 147 // DW_OP_piece +# CHECK: .byte 31 // 31 +# CHECK: .byte 0 // DW_LLE_end_of_list + + +--- | + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" + target triple = "aarch64" + + define void @_Z10Sort16RowsILi6EEv12SharedTraitsI10TraitsLaneEP22Trans_NS_hwy_float16_tiS4_(i8 %st.coerce, ptr noundef %keys, i32 noundef %0, ptr noundef %1) #2 !dbg !2 { + unreachable + } + + attributes #2 = { mustprogress uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-n1" "target-features"="+aes,+crc,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+perfmon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve-aes,+sve2,+sve2-aes,+v8.1a,+v8.2a,+v8a,-fmv" "tune-cpu"="generic" } + + !llvm.dbg.cu = !{!3} + !llvm.module.flags = !{!4, !5, !6, !7, !8, !9} + !llvm.ident = !{!10} + + !2 = distinct !DISubprogram(name: "Sort16Rows<6>", linkageName: "_Z10Sort16RowsILi6EEv12SharedTraitsI10TraitsLaneEP22Trans_NS_hwy_float16_tiS4_", scope: !12, file: !12, line: 369, type: !18, scopeLine: 370, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !3, templateParams: !19, retainedNodes: !20, keyInstructions: true) + !3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !14, producer: "clang version 22.0.0git (https://github.com/llvm/llvm-project.git)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) + !4 = !{i32 7, !"Dwarf Version", i32 5} + !5 = !{i32 2, !"Debug Info Version", i32 3} + !6 = !{i32 1, !"wchar_size", i32 4} + !7 = !{i32 7, !"uwtable", i32 2} + !8 = !{i32 7, !"frame-pointer", i32 1} + !9 = !{i32 7, !"debug-info-assignment-tracking", i1 true} + !10 = !{!"clang version 22.0.0git (https://github.com/llvm/llvm-project.git)"} + !12 = !DIFile(filename: "example.cpp", directory: "/app", checksumkind: CSK_MD5, checksum: "5fbaafea0ede06ddd1ffc371aeee276e") + 
!14 = !DIFile(filename: "/app/example.cpp", directory: "/app", checksumkind: CSK_MD5, checksum: "5fbaafea0ede06ddd1ffc371aeee276e") + !17 = !DIBasicType(name: "__fp16", size: 16, encoding: DW_ATE_float) + !18 = !DISubroutineType(types: !21) + !19 = !{!120} + !20 = !{!77, !78, !79, !80, !81, !82, !83, !84, !85, !86, !87, !88, !89, !90, !91, !92, !93, !94, !95, !96, !97, !98, !99, !100, !101, !102, !103, !104, !105} + !21 = !{null, !22, !23, !24, !23} + !22 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "SharedTraits", file: !12, line: 272, size: 8, flags: DIFlagTypePassByValue, elements: !25, templateParams: !26, identifier: "_ZTS12SharedTraitsI10TraitsLaneE") + !23 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !55, size: 64) + !24 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !25 = !{!27} + !26 = !{!76} + !27 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !22, baseType: !28, extraData: i32 0) + !28 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "TraitsLane", file: !12, line: 325, size: 8, flags: DIFlagTypePassByValue, elements: !29, identifier: "_ZTS10TraitsLane") + !29 = !{!30, !31, !32, !33} + !30 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !28, baseType: !34, extraData: i32 0) + !31 = !DISubprogram(name: "Sort2", linkageName: "_ZN10TraitsLane5Sort2E4SimdI22Trans_NS_hwy_float16_tLi1ELi0EERu13__SVFloat16_tS4_", scope: !28, file: !12, line: 326, type: !70, scopeLine: 326, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) + !32 = !DISubprogram(name: "SortPairsDistance1", linkageName: "_ZN10TraitsLane18SortPairsDistance1E4SimdI22Trans_NS_hwy_float16_tLi1ELi0EEu13__SVFloat16_t", scope: !28, file: !12, line: 344, type: !74, scopeLine: 344, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) + !33 = !DISubprogram(name: "SortPairsDistance4", linkageName: "_ZN10TraitsLane18SortPairsDistance4E4SimdI22Trans_NS_hwy_float16_tLi1ELi0EEu13__SVFloat16_t", scope: !28, file: !12, line: 352, type: !74, scopeLine: 352, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) + !34 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "KeyLane", file: !12, line: 307, size: 8, flags: DIFlagTypePassByValue, elements: !35, identifier: "_ZTS7KeyLane") + !35 = !{!36, !37, !38} + !36 = !DISubprogram(name: "SwapAdjacentPairs", linkageName: "_ZN7KeyLane17SwapAdjacentPairsE4SimdI22Trans_NS_hwy_float16_tLi1ELi0EEu13__SVFloat16_t", scope: !34, file: !12, line: 309, type: !39, scopeLine: 309, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) + !37 = !DISubprogram(name: "SwapAdjacentPairs", linkageName: "_ZN7KeyLane17SwapAdjacentPairsEu13__SVFloat32_t", scope: !34, file: !12, line: 314, type: !58, scopeLine: 314, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) + !38 = !DISubprogram(name: "OddEvenPairs", linkageName: "_ZN7KeyLane12OddEvenPairsE4SimdI22Trans_NS_hwy_float16_tLi1ELi0EEu13__SVFloat16_tS3_", scope: !34, file: !12, line: 318, type: !68, scopeLine: 318, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) + !39 = !DISubroutineType(types: !40) + !40 = !{!41, !42, !43, !41} + !41 = !DIDerivedType(tag: DW_TAG_typedef, name: "Vec >", file: !12, line: 270, baseType: !44) + !42 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !34, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) + !43 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Simd", file: !12, line: 83, size: 8, flags: DIFlagTypePassByValue, elements: !50, templateParams: !51, identifier: "_ZTS4SimdI22Trans_NS_hwy_float16_tLi1ELi0EE") + !44 = 
!DIDerivedType(tag: DW_TAG_typedef, name: "VFromD >", file: !12, line: 142, baseType: !45) + !45 = !DIDerivedType(tag: DW_TAG_typedef, name: "svfloat16_t", file: !12, line: 26, baseType: !46) + !46 = !DIDerivedType(tag: DW_TAG_typedef, name: "__SVFloat16_t", file: !12, baseType: !47) + !47 = !DICompositeType(tag: DW_TAG_array_type, baseType: !17, flags: DIFlagVector, elements: !48) + !48 = !{!49} + !49 = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 4, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + !50 = !{} + !51 = !{!52, !53, !54} + !52 = !DITemplateTypeParameter(name: "Lane", type: !55) + !53 = !DITemplateValueParameter(type: !24, value: i32 1) + !54 = !DITemplateValueParameter(name: "kPow2", type: !24, value: i32 0) + !55 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Trans_NS_hwy_float16_t", file: !12, line: 6, size: 16, flags: DIFlagTypePassByValue, elements: !56, identifier: "_ZTS22Trans_NS_hwy_float16_t") + !56 = !{!57} + !57 = !DIDerivedType(tag: DW_TAG_member, name: "native", scope: !55, file: !12, line: 7, baseType: !17, size: 16) + !58 = !DISubroutineType(types: !59) + !59 = !{!60, !42, !60} + !60 = !DIDerivedType(tag: DW_TAG_typedef, name: "Vec >", file: !12, line: 270, baseType: !61) + !61 = !DIDerivedType(tag: DW_TAG_typedef, name: "VFromD >", file: !12, line: 142, baseType: !62) + !62 = !DIDerivedType(tag: DW_TAG_typedef, name: "svfloat32_t", file: !12, line: 27, baseType: !63) + !63 = !DIDerivedType(tag: DW_TAG_typedef, name: "__SVFloat32_t", file: !12, baseType: !64) + !64 = !DICompositeType(tag: DW_TAG_array_type, baseType: !65, flags: DIFlagVector, elements: !66) + !65 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) + !66 = !{!67} + !67 = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 2, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + !68 = !DISubroutineType(types: !69) + !69 = !{!41, !42, !43, !41, !41} + !70 = !DISubroutineType(types: !71) + !71 = !{null, !72, !43, !73, !73} + !72 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !28, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) + !73 = !DIDerivedType(tag: DW_TAG_reference_type, baseType: !41, size: 64) + !74 = !DISubroutineType(types: !75) + !75 = !{!41, !72, !43, !41} + !76 = !DITemplateTypeParameter(name: "Base", type: !28) + !77 = !DILocalVariable(name: "st", arg: 1, scope: !2, file: !12, line: 369, type: !22) + !78 = !DILocalVariable(name: "keys", arg: 2, scope: !2, file: !12, line: 369, type: !23) + !79 = !DILocalVariable(arg: 3, scope: !2, file: !12, line: 369, type: !24) + !80 = !DILocalVariable(arg: 4, scope: !2, file: !12, line: 370, type: !23) + !81 = !DILocalVariable(name: "d", scope: !2, file: !12, line: 371, type: !106) + !82 = !DILocalVariable(name: "v8", scope: !2, file: !12, line: 373, type: !112) + !83 = !DILocalVariable(name: "v9", scope: !2, file: !12, line: 373, type: !112) + !84 = !DILocalVariable(name: "va", scope: !2, file: !12, line: 373, type: !112) + !85 = !DILocalVariable(name: "vb", scope: !2, file: !12, line: 373, type: !112) + !86 = !DILocalVariable(name: "vc", scope: !2, file: !12, line: 373, type: !112) + !87 = !DILocalVariable(name: "vd", scope: !2, file: !12, line: 373, type: !112) + !88 = !DILocalVariable(name: "ve", scope: !2, file: !12, line: 373, type: !112) + !89 = !DILocalVariable(name: "vf", scope: !2, file: !12, line: 373, type: !112) + !90 = !DILocalVariable(name: "v2", scope: !2, file: !12, line: 373, type: !112) + !91 = !DILocalVariable(name: "v4", 
scope: !2, file: !12, line: 373, type: !112) + !92 = !DILocalVariable(name: "v7", scope: !2, file: !12, line: 373, type: !112) + !93 = !DILocalVariable(name: "v0", scope: !2, file: !12, line: 374, type: !112) + !94 = !DILocalVariable(name: "v3", scope: !2, file: !12, line: 375, type: !112) + !95 = !DILocalVariable(name: "v5", scope: !2, file: !12, line: 376, type: !112) + !96 = !DILocalVariable(name: "v6", scope: !2, file: !12, line: 377, type: !112) + !97 = !DILocalVariable(name: "kIota", scope: !2, file: !12, line: 378, type: !112) + !98 = !DILocalVariable(name: "m8", scope: !2, file: !12, line: 379, type: !113) + !99 = !DILocalVariable(name: "m9", scope: !2, file: !12, line: 380, type: !113) + !100 = !DILocalVariable(name: "ma", scope: !2, file: !12, line: 381, type: !113) + !101 = !DILocalVariable(name: "mb", scope: !2, file: !12, line: 382, type: !113) + !102 = !DILocalVariable(name: "mc", scope: !2, file: !12, line: 383, type: !113) + !103 = !DILocalVariable(name: "md", scope: !2, file: !12, line: 384, type: !113) + !104 = !DILocalVariable(name: "me", scope: !2, file: !12, line: 385, type: !113) + !105 = !DILocalVariable(name: "mf", scope: !2, file: !12, line: 386, type: !113) + !106 = !DIDerivedType(tag: DW_TAG_typedef, name: "CappedTag", file: !12, line: 97, baseType: !107) + !107 = !DIDerivedType(tag: DW_TAG_typedef, name: "type", scope: !108, file: !12, line: 89, baseType: !43) + !108 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ClampNAndPow2", file: !12, line: 88, size: 8, flags: DIFlagTypePassByValue, elements: !50, templateParams: !109, identifier: "_ZTS13ClampNAndPow2I22Trans_NS_hwy_float16_tLi1EE") + !109 = !{!110, !111} + !110 = !DITemplateTypeParameter(name: "T", type: !55) + !111 = !DITemplateValueParameter(name: "N", type: !24, value: i32 1) + !112 = !DIDerivedType(tag: DW_TAG_typedef, name: "V", scope: !2, file: !12, line: 372, baseType: !41) + !113 = !DIDerivedType(tag: DW_TAG_typedef, name: "Mask >", file: !12, line: 271, baseType: !114) + !114 = !DIDerivedType(tag: DW_TAG_typedef, name: "svbool_t", file: !12, line: 28, baseType: !115) + !115 = !DIDerivedType(tag: DW_TAG_typedef, name: "__SVBool_t", file: !12, baseType: !116) + !116 = !DICompositeType(tag: DW_TAG_array_type, baseType: !117, flags: DIFlagVector, elements: !118) + !117 = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char) + !118 = !{!119} + !119 = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 1, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + !120 = !DITemplateValueParameter(name: "kKeysPerRow", type: !24, value: i32 6) + !121 = !DILocalVariable(name: "this", arg: 1, scope: !122, type: !123, flags: DIFlagArtificial | DIFlagObjectPointer) + !122 = distinct !DISubprogram(name: "Sort2", linkageName: "_ZN10TraitsLane5Sort2E4SimdI22Trans_NS_hwy_float16_tLi1ELi0EERu13__SVFloat16_tS4_", scope: !28, file: !12, line: 326, type: !70, scopeLine: 328, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !3, declaration: !31, retainedNodes: !124, keyInstructions: true) + !123 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !28, size: 64) + !124 = !{!121, !125, !126, !127, !128, !129, !130, !131, !132} + !125 = !DILocalVariable(name: "d", arg: 2, scope: !122, file: !12, line: 326, type: !43) + !126 = !DILocalVariable(name: "a", arg: 3, scope: !122, file: !12, line: 327, type: !73) + !127 = !DILocalVariable(name: "b", arg: 4, scope: !122, file: !12, line: 328, type: !73) + !128 = 
!DILocalVariable(name: "__trans_tmp_52", scope: !122, file: !12, line: 329, type: !41) + !129 = !DILocalVariable(name: "a_copy", scope: !122, file: !12, line: 329, type: !41) + !130 = !DILocalVariable(name: "__trans_tmp_45", scope: !122, file: !12, line: 330, type: !41) + !131 = !DILocalVariable(name: "__trans_tmp_53", scope: !133, file: !12, line: 334, type: !41) + !132 = !DILocalVariable(name: "__trans_tmp_29", scope: !134, file: !12, line: 336, type: !45) + !133 = distinct !DILexicalBlock(scope: !122, file: !12, line: 333, column: 5) + !134 = distinct !DILexicalBlock(scope: !133, file: !12, line: 335, column: 7) + !137 = distinct !DISubprogram(name: "SortPairsDistance1", linkageName: "_ZN10TraitsLane18SortPairsDistance1E4SimdI22Trans_NS_hwy_float16_tLi1ELi0EEu13__SVFloat16_t", scope: !28, file: !12, line: 344, type: !74, scopeLine: 345, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !3, declaration: !32, retainedNodes: !139, keyInstructions: true) + !139 = !{!140, !141, !142, !143} + !140 = !DILocalVariable(name: "this", arg: 1, scope: !137, type: !123, flags: DIFlagArtificial | DIFlagObjectPointer) + !141 = !DILocalVariable(name: "d", arg: 2, scope: !137, file: !12, line: 344, type: !43) + !142 = !DILocalVariable(name: "v", arg: 3, scope: !137, file: !12, line: 345, type: !41) + !143 = !DILocalVariable(name: "__trans_tmp_48", scope: !137, file: !12, line: 346, type: !41) + !144 = distinct !DISubprogram(name: "Merge16x16<6, SharedTraits, __SVFloat16_t>", linkageName: "_Z10Merge16x16ILi6E12SharedTraitsI10TraitsLaneEu13__SVFloat16_tEvT0_RT1_S6_S6_S6_S6_S6_S6_S6_S6_S6_S6_S6_", scope: !12, file: !12, line: 286, type: !146, scopeLine: 288, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !3, templateParams: !147, retainedNodes: !148, keyInstructions: true) + !145 = distinct !DILocation(line: 388, column: 3, scope: !2) + !146 = !DISubroutineType(types: !149) + !147 = !{!164, !165, !166} + !148 = !{!151, !152, !153, !154, !155, !156, !157, !158, !159, !160, !161, !162, !163} + !149 = !{null, !22, !150, !150, !150, !150, !150, !150, !150, !150, !150, !150, !150, !150} + !150 = !DIDerivedType(tag: DW_TAG_reference_type, baseType: !47, size: 64) + !151 = !DILocalVariable(name: "st", arg: 1, scope: !144, file: !12, line: 286, type: !22) + !152 = !DILocalVariable(name: "v0", arg: 2, scope: !144, file: !12, line: 286, type: !150) + !153 = !DILocalVariable(name: "v2", arg: 3, scope: !144, file: !12, line: 286, type: !150) + !154 = !DILocalVariable(name: "v5", arg: 4, scope: !144, file: !12, line: 286, type: !150) + !155 = !DILocalVariable(name: "v6", arg: 5, scope: !144, file: !12, line: 287, type: !150) + !156 = !DILocalVariable(name: "v7", arg: 6, scope: !144, file: !12, line: 287, type: !150) + !157 = !DILocalVariable(name: "v9", arg: 7, scope: !144, file: !12, line: 287, type: !150) + !158 = !DILocalVariable(name: "va", arg: 8, scope: !144, file: !12, line: 287, type: !150) + !159 = !DILocalVariable(name: "vb", arg: 9, scope: !144, file: !12, line: 287, type: !150) + !160 = !DILocalVariable(name: "vc", arg: 10, scope: !144, file: !12, line: 288, type: !150) + !161 = !DILocalVariable(name: "vd", arg: 11, scope: !144, file: !12, line: 288, type: !150) + !162 = !DILocalVariable(name: "ve", arg: 12, scope: !144, file: !12, line: 288, type: !150) + !163 = !DILocalVariable(name: "vf", arg: 13, scope: !144, file: !12, line: 288, type: !150) + !164 = !DITemplateValueParameter(type: !24, 
value: i32 6) + !165 = !DITemplateTypeParameter(name: "Traits", type: !22) + !166 = !DITemplateTypeParameter(name: "V", type: !47) + !184 = !DILocalVariable(name: "this", arg: 1, scope: !185, type: !186, flags: DIFlagArtificial | DIFlagObjectPointer) + !185 = distinct !DISubprogram(name: "SortPairsDistance2 >", linkageName: "_ZN12SharedTraitsI10TraitsLaneE18SortPairsDistance2I4SimdI22Trans_NS_hwy_float16_tLi1ELi0EEEEDTcl4ZerocvT__EEES6_S7_", scope: !22, file: !12, line: 273, type: !187, scopeLine: 273, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !3, templateParams: !188, declaration: !189, retainedNodes: !190, keyInstructions: true) + !186 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !22, size: 64) + !187 = !DISubroutineType(types: !191) + !188 = !{!193} + !189 = !DISubprogram(name: "SortPairsDistance2 >", linkageName: "_ZN12SharedTraitsI10TraitsLaneE18SortPairsDistance2I4SimdI22Trans_NS_hwy_float16_tLi1ELi0EEEEDTcl4ZerocvT__EEES6_S7_", scope: !22, file: !12, line: 273, type: !187, scopeLine: 273, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, templateParams: !188) + !190 = !{!184, !194, !195, !196, !197} + !191 = !{!41, !192, !43, !41} + !192 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !22, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) + !193 = !DITemplateTypeParameter(name: "D", type: !43) + !194 = !DILocalVariable(name: "d", arg: 2, scope: !185, file: !12, line: 273, type: !43) + !195 = !DILocalVariable(name: "v", arg: 3, scope: !185, file: !12, line: 273, type: !41) + !196 = !DILocalVariable(name: "base", scope: !185, file: !12, line: 274, type: !28) + !197 = !DILocalVariable(name: "swapped", scope: !185, file: !12, line: 275, type: !41) + !200 = !DILocation(line: 0, scope: !122, inlinedAt: !201) + !201 = distinct !DILocation(line: 358, column: 5, scope: !202, inlinedAt: !203) + !202 = distinct !DISubprogram(name: "SortPairsDistance4", linkageName: "_ZN10TraitsLane18SortPairsDistance4E4SimdI22Trans_NS_hwy_float16_tLi1ELi0EEu13__SVFloat16_t", scope: !28, file: !12, line: 352, type: !74, scopeLine: 353, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !3, declaration: !33, retainedNodes: !204, keyInstructions: true) + !203 = distinct !DILocation(line: 298, column: 11, scope: !144, inlinedAt: !145) + !204 = !{!205, !206, !207, !208, !209, !210, !211} + !205 = !DILocalVariable(name: "this", arg: 1, scope: !202, type: !123, flags: DIFlagArtificial | DIFlagObjectPointer) + !206 = !DILocalVariable(name: "d", arg: 2, scope: !202, file: !12, line: 352, type: !43) + !207 = !DILocalVariable(name: "v", arg: 3, scope: !202, file: !12, line: 353, type: !41) + !208 = !DILocalVariable(name: "__trans_tmp_42", scope: !202, file: !12, line: 354, type: !41) + !209 = !DILocalVariable(name: "__trans_tmp_39", scope: !202, file: !12, line: 354, type: !41) + !210 = !DILocalVariable(name: "dw", scope: !202, file: !12, line: 355, type: !212) + !211 = !DILocalVariable(name: "__trans_tmp_51", scope: !219, file: !12, line: 360, type: !44) + !212 = !DIDerivedType(tag: DW_TAG_typedef, name: "RepartitionToWide >", file: !12, line: 103, baseType: !213) + !213 = !DIDerivedType(tag: DW_TAG_typedef, name: "Repartition >", file: !12, line: 101, baseType: !214) + !214 = !DIDerivedType(tag: DW_TAG_typedef, name: "Repartition", scope: !43, file: !12, line: 86, baseType: !215) + !215 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Simd", file: !12, line: 83, 
size: 8, flags: DIFlagTypePassByValue, elements: !50, templateParams: !216, identifier: "_ZTS4SimdIfLi0ELi0EE") + !216 = !{!217, !218, !54} + !217 = !DITemplateTypeParameter(name: "Lane", type: !65) + !218 = !DITemplateValueParameter(type: !24, value: i32 0) + !219 = distinct !DILexicalBlock(scope: !202, file: !12, line: 359, column: 5) + !220 = !DILocalVariable(name: "this", arg: 1, scope: !221, type: !222, flags: DIFlagArtificial | DIFlagObjectPointer) + !221 = distinct !DISubprogram(name: "SwapAdjacentPairs", linkageName: "_ZN7KeyLane17SwapAdjacentPairsEu13__SVFloat32_t", scope: !34, file: !12, line: 314, type: !58, scopeLine: 314, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !3, declaration: !37, retainedNodes: !223, keyInstructions: true) + !222 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !34, size: 64) + !223 = !{!220, !224} + !224 = !DILocalVariable(name: "v", arg: 2, scope: !221, file: !12, line: 314, type: !60) + !225 = distinct !DILocation(line: 357, column: 38, scope: !202, inlinedAt: !203) + !226 = !DILocalVariable(name: "v", arg: 1, scope: !227, file: !12, line: 264, type: !64) + !227 = distinct !DISubprogram(name: "Shuffle1032<__SVFloat32_t>", linkageName: "_Z11Shuffle1032Iu13__SVFloat32_tET_S1_", scope: !12, file: !12, line: 264, type: !228, scopeLine: 264, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !3, templateParams: !229, retainedNodes: !230, keyInstructions: true) + !228 = !DISubroutineType(types: !231) + !229 = !{!262} + !230 = !{!226, !232, !233, !234} + !231 = !{!64, !64} + !232 = !DILocalVariable(name: "d", scope: !227, file: !12, line: 265, type: !235) + !233 = !DILocalVariable(name: "d8", scope: !227, file: !12, line: 266, type: !252) + !234 = !DILocalVariable(name: "v8", scope: !227, file: !12, line: 267, type: !257) + !235 = !DIDerivedType(tag: DW_TAG_typedef, name: "DFromV<__SVFloat32_t>", file: !12, line: 108, baseType: !236) + !236 = !DIDerivedType(tag: DW_TAG_typedef, name: "type", scope: !237, file: !12, line: 116, baseType: !238) + !237 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "DFromV_t<__SVFloat32_t>", file: !12, line: 115, size: 8, flags: DIFlagTypePassByValue, elements: !50, templateParams: !239, identifier: "_ZTS8DFromV_tIu13__SVFloat32_tE") + !238 = !DIDerivedType(tag: DW_TAG_typedef, name: "ScalableTag", file: !12, line: 95, baseType: !241) + !239 = !{!240} + !240 = !DITemplateTypeParameter(type: !64) + !241 = !DIDerivedType(tag: DW_TAG_typedef, name: "type", scope: !242, file: !12, line: 92, baseType: !243) + !242 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ScalableTagChecker", file: !12, line: 91, size: 8, flags: DIFlagTypePassByValue, elements: !50, templateParams: !244, identifier: "_ZTS18ScalableTagCheckerIfE") + !243 = !DIDerivedType(tag: DW_TAG_typedef, name: "type", scope: !246, file: !12, line: 89, baseType: !247) + !244 = !{!245} + !245 = !DITemplateTypeParameter(name: "T", type: !65) + !246 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "ClampNAndPow2", file: !12, line: 88, size: 8, flags: DIFlagTypePassByValue, elements: !50, templateParams: !248, identifier: "_ZTS13ClampNAndPow2IfLi64EE") + !247 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Simd", file: !12, line: 83, size: 8, flags: DIFlagTypePassByValue, elements: !50, templateParams: !250, identifier: "_ZTS4SimdIfLi64ELi0EE") + !248 = !{!245, !249} + !249 = 
!DITemplateValueParameter(name: "N", type: !24, value: i32 64) + !250 = !{!217, !251, !54} + !251 = !DITemplateValueParameter(type: !24, value: i32 64) + !252 = !DIDerivedType(tag: DW_TAG_typedef, name: "Repartition >", file: !12, line: 101, baseType: !253) + !253 = !DIDerivedType(tag: DW_TAG_typedef, name: "Repartition", scope: !247, file: !12, line: 86, baseType: !254) + !254 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Simd", file: !12, line: 83, size: 8, flags: DIFlagTypePassByValue, elements: !50, templateParams: !255, identifier: "_ZTS4SimdIhLi0ELi0EE") + !255 = !{!256, !218, !54} + !256 = !DITemplateTypeParameter(name: "Lane", type: !117) + !257 = !DIDerivedType(tag: DW_TAG_typedef, name: "svuint8_t", file: !12, line: 22, baseType: !258) + !258 = !DIDerivedType(tag: DW_TAG_typedef, name: "__SVUint8_t", file: !12, baseType: !259) + !259 = !DICompositeType(tag: DW_TAG_array_type, baseType: !117, flags: DIFlagVector, elements: !260) + !260 = !{!261} + !261 = !DISubrange(lowerBound: 0, upperBound: !DIExpression(DW_OP_constu, 8, DW_OP_bregx, 46, 0, DW_OP_mul, DW_OP_constu, 1, DW_OP_minus)) + !262 = !DITemplateTypeParameter(name: "V", type: !64) + !263 = !DILocalVariable(name: "hi", arg: 1, scope: !264, file: !12, line: 248, type: !259) + !264 = distinct !DISubprogram(name: "CombineShiftRightBytes<8, __SVUint8_t>", linkageName: "_Z22CombineShiftRightBytesILi8Eu11__SVUint8_tET0_S1_S1_", scope: !12, file: !12, line: 248, type: !265, scopeLine: 248, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !3, templateParams: !266, retainedNodes: !267, keyInstructions: true) + !265 = !DISubroutineType(types: !268) + !266 = !{!283, !284} + !267 = !{!263, !269, !270, !271, !272, !273, !274, !275, !276} + !268 = !{!259, !259, !259} + !269 = !DILocalVariable(name: "lo", arg: 2, scope: !264, file: !12, line: 248, type: !259) + !270 = !DILocalVariable(name: "__trans_tmp_33", scope: !264, file: !12, line: 249, type: !257) + !271 = !DILocalVariable(name: "__trans_tmp_15", scope: !264, file: !12, line: 249, type: !257) + !272 = !DILocalVariable(name: "__trans_tmp_32", scope: !264, file: !12, line: 250, type: !257) + !273 = !DILocalVariable(name: "d8", scope: !264, file: !12, line: 251, type: !277) + !274 = !DILocalVariable(name: "__trans_tmp_16", scope: !264, file: !12, line: 252, type: !114) + !275 = !DILocalVariable(name: "lo_down", scope: !264, file: !12, line: 254, type: !257) + !276 = !DILocalVariable(name: "__trans_tmp_34", scope: !264, file: !12, line: 255, type: !114) + !277 = !DIDerivedType(tag: DW_TAG_typedef, name: "Repartition >", file: !12, line: 101, baseType: !278) + !278 = !DIDerivedType(tag: DW_TAG_typedef, name: "Repartition", scope: !279, file: !12, line: 86, baseType: !254) + !279 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Simd", file: !12, line: 83, size: 8, flags: DIFlagTypePassByValue, elements: !50, templateParams: !280, identifier: "_ZTS4SimdIcLi0ELi0EE") + !280 = !{!281, !218, !54} + !281 = !DITemplateTypeParameter(name: "Lane", type: !282) + !282 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_unsigned_char) + !283 = !DITemplateValueParameter(name: "kBytes", type: !24, value: i32 8) + !284 = !DITemplateTypeParameter(name: "V", type: !259) + !285 = !DILocalVariable(name: "hi", arg: 1, scope: !286, file: !12, line: 216, type: !257) + !286 = distinct !DISubprogram(name: "Ext<8>", linkageName: "_Z3ExtILi8EEu11__SVUint8_tS0_S0_", scope: !12, file: !12, line: 216, type: !287, scopeLine: 
216, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !3, templateParams: !288, retainedNodes: !289, keyInstructions: true) + !287 = !DISubroutineType(types: !290) + !288 = !{!292} + !289 = !{!285, !291} + !290 = !{!257, !257, !257} + !291 = !DILocalVariable(name: "lo", arg: 2, scope: !286, file: !12, line: 216, type: !257) + !292 = !DITemplateValueParameter(name: "kIndex", type: !24, value: i32 8) + !293 = !DILocalVariable(name: "a", arg: 1, scope: !294, file: !12, line: 180, type: !47) + !294 = distinct !DISubprogram(name: "Min<__SVFloat16_t>", linkageName: "_Z3MinIu13__SVFloat16_tET_S1_S1_", scope: !12, file: !12, line: 180, type: !295, scopeLine: 180, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !3, templateParams: !296, retainedNodes: !297, keyInstructions: true) + !295 = !DISubroutineType(types: !298) + !296 = !{!166} + !297 = !{!293, !299, !300, !301, !302, !303, !304} + !298 = !{!47, !47, !47} + !299 = !DILocalVariable(name: "b", arg: 2, scope: !294, file: !12, line: 180, type: !47) + !300 = !DILocalVariable(name: "__trans_tmp_36", scope: !294, file: !12, line: 181, type: !45) + !301 = !DILocalVariable(name: "__trans_tmp_25", scope: !294, file: !12, line: 181, type: !45) + !302 = !DILocalVariable(name: "__trans_tmp_27", scope: !294, file: !12, line: 182, type: !114) + !303 = !DILocalVariable(name: "__trans_tmp_24", scope: !294, file: !12, line: 183, type: !114) + !304 = !DILocalVariable(name: "__trans_tmp_19", scope: !294, file: !12, line: 184, type: !114) + !308 = distinct !DILocation(line: 315, column: 12, scope: !221, inlinedAt: !225) + !309 = distinct !DILocation(line: 268, column: 21, scope: !227, inlinedAt: !308) + !311 = distinct !DILocation(line: 254, column: 18, scope: !264, inlinedAt: !309) + !312 = !DILocation(line: 217, column: 10, scope: !286, inlinedAt: !311, atomGroup: 1, atomRank: 2) + !313 = !DILocation(line: 257, column: 20, scope: !264, inlinedAt: !309, atomGroup: 5, atomRank: 2) + !314 = !DILocation(line: 0, scope: !294, inlinedAt: !315) + !315 = distinct !DILocation(line: 331, column: 22, scope: !122, inlinedAt: !201) + !316 = !DILocation(line: 185, column: 20, scope: !294, inlinedAt: !315) + !317 = !DILocation(line: 403, column: 1, scope: !2, atomGroup: 19449, atomRank: 1) + +... +--- +name: _Z10Sort16RowsILi6EEv12SharedTraitsI10TraitsLaneEP22Trans_NS_hwy_float16_tiS4_ +body: | + bb.0: + liveins: $x1, $z0, $z1, $p0 + + $z30 = LDR_ZXI $x1, -14 + $z31 = LDR_ZXI $x1, -13 + $z23 = ORR_ZZZ $z30, $z30 + renamable $z2 = EXT_ZZI_B renamable $z30_z31, 8, debug-location !312 + renamable $z7 = SEL_ZPZZ_B renamable $p0, renamable $z0, killed renamable $z1, debug-location !313 + DBG_VALUE $z30, $noreg, !129, !DIExpression(), debug-location !200 + renamable $p3 = nofpexcept FCMGT_PPzZZ_H renamable $p0, renamable $z0, undef renamable $z1, debug-location !316 + DBG_VALUE $z30_z31, $noreg, !129, !DIExpression(), debug-location !200 + DBG_VALUE $z30_z31, $noreg, !293, !DIExpression(), debug-location !314 + RET undef $lr, debug-location !317 +... 
+ diff --git a/llvm/test/CodeGen/AArch64/dp-3source.ll b/llvm/test/CodeGen/AArch64/dp-3source.ll index 313f671c19c5e..26ee07627e3e5 100644 --- a/llvm/test/CodeGen/AArch64/dp-3source.ll +++ b/llvm/test/CodeGen/AArch64/dp-3source.ll @@ -1,164 +1,212 @@ -; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI define i32 @test_madd32(i32 %val0, i32 %val1, i32 %val2) { ; CHECK-LABEL: test_madd32: +; CHECK: ; %bb.0: +; CHECK-NEXT: madd w0, w1, w2, w0 +; CHECK-NEXT: ret %mid = mul i32 %val1, %val2 %res = add i32 %val0, %mid -; CHECK: madd {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} ret i32 %res } define i64 @test_madd64(i64 %val0, i64 %val1, i64 %val2) { ; CHECK-LABEL: test_madd64: +; CHECK: ; %bb.0: +; CHECK-NEXT: madd x0, x1, x2, x0 +; CHECK-NEXT: ret %mid = mul i64 %val1, %val2 %res = add i64 %val0, %mid -; CHECK: madd {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} ret i64 %res } define i32 @test_msub32(i32 %val0, i32 %val1, i32 %val2) { ; CHECK-LABEL: test_msub32: +; CHECK: ; %bb.0: +; CHECK-NEXT: msub w0, w1, w2, w0 +; CHECK-NEXT: ret %mid = mul i32 %val1, %val2 %res = sub i32 %val0, %mid -; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} ret i32 %res } define i64 @test_msub64(i64 %val0, i64 %val1, i64 %val2) { ; CHECK-LABEL: test_msub64: +; CHECK: ; %bb.0: +; CHECK-NEXT: msub x0, x1, x2, x0 +; CHECK-NEXT: ret %mid = mul i64 %val1, %val2 %res = sub i64 %val0, %mid -; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} ret i64 %res } define i64 @test_smaddl(i64 %acc, i32 %val1, i32 %val2) { ; CHECK-LABEL: test_smaddl: +; CHECK: ; %bb.0: +; CHECK-NEXT: smaddl x0, w1, w2, x0 +; CHECK-NEXT: ret %ext1 = sext i32 %val1 to i64 %ext2 = sext i32 %val2 to i64 %prod = mul i64 %ext1, %ext2 %res = add i64 %acc, %prod -; CHECK: smaddl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} ret i64 %res } define i64 @test_smsubl(i64 %acc, i32 %val1, i32 %val2) { ; CHECK-LABEL: test_smsubl: +; CHECK: ; %bb.0: +; CHECK-NEXT: smsubl x0, w1, w2, x0 +; CHECK-NEXT: ret %ext1 = sext i32 %val1 to i64 %ext2 = sext i32 %val2 to i64 %prod = mul i64 %ext1, %ext2 %res = sub i64 %acc, %prod -; CHECK: smsubl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} ret i64 %res } define i64 @test_umaddl(i64 %acc, i32 %val1, i32 %val2) { ; CHECK-LABEL: test_umaddl: +; CHECK: ; %bb.0: +; CHECK-NEXT: umaddl x0, w1, w2, x0 +; CHECK-NEXT: ret %ext1 = zext i32 %val1 to i64 %ext2 = zext i32 %val2 to i64 %prod = mul i64 %ext1, %ext2 %res = add i64 %acc, %prod -; CHECK: umaddl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} ret i64 %res } define i64 @test_umsubl(i64 %acc, i32 %val1, i32 %val2) { ; CHECK-LABEL: test_umsubl: +; CHECK: ; %bb.0: +; CHECK-NEXT: umsubl x0, w1, w2, x0 +; CHECK-NEXT: ret %ext1 = zext i32 %val1 to i64 %ext2 = zext i32 %val2 to i64 %prod = mul i64 %ext1, %ext2 %res = sub i64 %acc, %prod -; CHECK: umsubl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{x[0-9]+}} ret i64 %res } define i64 @test_smulh(i64 %lhs, i64 %rhs) { -; CHECK-LABEL: test_smulh: +; CHECK-SD-LABEL: test_smulh: +; CHECK-SD: ; %bb.0: +; CHECK-SD-NEXT: smulh x0, x0, x1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_smulh: +; CHECK-GI: ; %bb.0: +; 
CHECK-GI-NEXT: asr x8, x1, #63 +; CHECK-GI-NEXT: asr x9, x0, #63 +; CHECK-GI-NEXT: umulh x10, x0, x1 +; CHECK-GI-NEXT: mul x8, x0, x8 +; CHECK-GI-NEXT: madd x8, x9, x1, x8 +; CHECK-GI-NEXT: add x0, x8, x10 +; CHECK-GI-NEXT: ret %ext1 = sext i64 %lhs to i128 %ext2 = sext i64 %rhs to i128 %res = mul i128 %ext1, %ext2 %high = lshr i128 %res, 64 %val = trunc i128 %high to i64 -; CHECK: smulh {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} ret i64 %val } define i64 @test_umulh(i64 %lhs, i64 %rhs) { ; CHECK-LABEL: test_umulh: +; CHECK: ; %bb.0: +; CHECK-NEXT: umulh x0, x0, x1 +; CHECK-NEXT: ret %ext1 = zext i64 %lhs to i128 %ext2 = zext i64 %rhs to i128 %res = mul i128 %ext1, %ext2 %high = lshr i128 %res, 64 %val = trunc i128 %high to i64 -; CHECK: umulh {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} ret i64 %val } define i32 @test_mul32(i32 %lhs, i32 %rhs) { ; CHECK-LABEL: test_mul32: +; CHECK: ; %bb.0: +; CHECK-NEXT: mul w0, w0, w1 +; CHECK-NEXT: ret %res = mul i32 %lhs, %rhs -; CHECK: mul {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} ret i32 %res } define i64 @test_mul64(i64 %lhs, i64 %rhs) { ; CHECK-LABEL: test_mul64: +; CHECK: ; %bb.0: +; CHECK-NEXT: mul x0, x0, x1 +; CHECK-NEXT: ret %res = mul i64 %lhs, %rhs -; CHECK: mul {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} ret i64 %res } define i32 @test_mneg32(i32 %lhs, i32 %rhs) { ; CHECK-LABEL: test_mneg32: +; CHECK: ; %bb.0: +; CHECK-NEXT: mneg w0, w0, w1 +; CHECK-NEXT: ret %prod = mul i32 %lhs, %rhs %res = sub i32 0, %prod -; CHECK: mneg {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} ret i32 %res } define i64 @test_mneg64(i64 %lhs, i64 %rhs) { ; CHECK-LABEL: test_mneg64: +; CHECK: ; %bb.0: +; CHECK-NEXT: mneg x0, x0, x1 +; CHECK-NEXT: ret %prod = mul i64 %lhs, %rhs %res = sub i64 0, %prod -; CHECK: mneg {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} ret i64 %res } define i64 @test_smull(i32 %lhs, i32 %rhs) { ; CHECK-LABEL: test_smull: +; CHECK: ; %bb.0: +; CHECK-NEXT: smull x0, w0, w1 +; CHECK-NEXT: ret %ext1 = sext i32 %lhs to i64 %ext2 = sext i32 %rhs to i64 %res = mul i64 %ext1, %ext2 -; CHECK: smull {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} ret i64 %res } define i64 @test_umull(i32 %lhs, i32 %rhs) { ; CHECK-LABEL: test_umull: +; CHECK: ; %bb.0: +; CHECK-NEXT: umull x0, w0, w1 +; CHECK-NEXT: ret %ext1 = zext i32 %lhs to i64 %ext2 = zext i32 %rhs to i64 %res = mul i64 %ext1, %ext2 -; CHECK: umull {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} ret i64 %res } define i64 @test_smnegl(i32 %lhs, i32 %rhs) { ; CHECK-LABEL: test_smnegl: +; CHECK: ; %bb.0: +; CHECK-NEXT: smnegl x0, w0, w1 +; CHECK-NEXT: ret %ext1 = sext i32 %lhs to i64 %ext2 = sext i32 %rhs to i64 %prod = mul i64 %ext1, %ext2 %res = sub i64 0, %prod -; CHECK: smnegl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} ret i64 %res } define i64 @test_umnegl(i32 %lhs, i32 %rhs) { ; CHECK-LABEL: test_umnegl: +; CHECK: ; %bb.0: +; CHECK-NEXT: umnegl x0, w0, w1 +; CHECK-NEXT: ret %ext1 = zext i32 %lhs to i64 %ext2 = zext i32 %rhs to i64 %prod = mul i64 %ext1, %ext2 %res = sub i64 0, %prod -; CHECK: umnegl {{x[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} ret i64 %res } @@ -168,11 +216,34 @@ define i64 @test_umnegl(i32 %lhs, i32 %rhs) { define void @test_mneg(){ ; CHECK-LABEL: test_mneg: +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh0: +; CHECK-NEXT: adrp x8, _a@GOTPAGE +; CHECK-NEXT: Lloh1: +; CHECK-NEXT: adrp x9, _b@GOTPAGE +; CHECK-NEXT: Lloh2: +; CHECK-NEXT: ldr x8, [x8, _a@GOTPAGEOFF] +; CHECK-NEXT: Lloh3: +; CHECK-NEXT: ldr x9, [x9, _b@GOTPAGEOFF] +; CHECK-NEXT: Lloh4: +; CHECK-NEXT: ldr w8, [x8] +; CHECK-NEXT: Lloh5: +; CHECK-NEXT: ldr w9, [x9] +; CHECK-NEXT: mneg w8, w8, w9 +; 
CHECK-NEXT: Lloh6: +; CHECK-NEXT: adrp x9, _c@GOTPAGE +; CHECK-NEXT: Lloh7: +; CHECK-NEXT: ldr x9, [x9, _c@GOTPAGEOFF] +; CHECK-NEXT: Lloh8: +; CHECK-NEXT: str w8, [x9] +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdrGotStr Lloh6, Lloh7, Lloh8 +; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh1, Lloh3, Lloh5 +; CHECK-NEXT: .loh AdrpLdrGotLdr Lloh0, Lloh2, Lloh4 %1 = load i32, ptr @a, align 4 %2 = load i32, ptr @b, align 4 %3 = sub i32 0, %1 %4 = mul i32 %2, %3 store i32 %4, ptr @c, align 4 -; CHECK: mneg {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} ret void } diff --git a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll index 832e34b664fbe..f5cf629b2a4a4 100644 --- a/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll +++ b/llvm/test/CodeGen/AArch64/extract-vector-cmp.ll @@ -75,10 +75,9 @@ define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) { ; CHECK-LABEL: vector_loop_with_icmp: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: mov w8, #2 // =0x2 -; CHECK-NEXT: mov w9, #16 // =0x10 -; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: mov z1.d, #2 // =0x2 ; CHECK-NEXT: add x8, x0, #4 +; CHECK-NEXT: mov w9, #16 // =0x10 ; CHECK-NEXT: mov w10, #1 // =0x1 ; CHECK-NEXT: b .LBB5_2 ; CHECK-NEXT: .LBB5_1: // %pred.store.continue6 diff --git a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll index 6ab703c08b837..121cc30692124 100644 --- a/llvm/test/CodeGen/AArch64/extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/extract-vector-elt.ll @@ -1114,16 +1114,10 @@ entry: } define ptr @v3ext(<3 x ptr> %a, <3 x ptr> %b, <3 x ptr> %x) { -; CHECK-SD-LABEL: v3ext: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldr d0, [sp] -; CHECK-SD-NEXT: fmov x0, d0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: v3ext: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr x0, [sp] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: v3ext: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr x0, [sp] +; CHECK-NEXT: ret entry: %c = extractelement <3 x ptr> %x, i32 2 ret ptr %c diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll index 18665bcbeae83..7195e2b2f1255 100644 --- a/llvm/test/CodeGen/AArch64/icmp.ll +++ b/llvm/test/CodeGen/AArch64/icmp.ll @@ -2093,3 +2093,54 @@ define <2 x i1> @icmp_slt_v2i64_Zero_LHS(<2 x i64> %a) { %c = icmp slt <2 x i64> , %a ret <2 x i1> %c } + +; Test TST optimization for i8 sign bit testing with cross-type select +; This tests the pattern: icmp slt i8 %val, 0; select i1 %cmp, i32 %a, i32 %b +; The optimization should convert sxtb+cmp to tst for sign bit testing. 
+ +define i32 @i8_signbit_tst_constants(i8 %x, i8 %y) { +; CHECK-SD-LABEL: i8_signbit_tst_constants: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w9, w0, w1 +; CHECK-SD-NEXT: mov w8, #42 // =0x2a +; CHECK-SD-NEXT: tst w9, #0x80 +; CHECK-SD-NEXT: mov w9, #20894 // =0x519e +; CHECK-SD-NEXT: csel w0, w9, w8, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: i8_signbit_tst_constants: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w8, w0, w1 +; CHECK-GI-NEXT: mov w9, #42 // =0x2a +; CHECK-GI-NEXT: mov w10, #20894 // =0x519e +; CHECK-GI-NEXT: sxtb w8, w8 +; CHECK-GI-NEXT: cmp w8, #0 +; CHECK-GI-NEXT: csel w0, w10, w9, mi +; CHECK-GI-NEXT: ret + %add = add i8 %x, %y + %cmp = icmp slt i8 %add, 0 + %sel = select i1 %cmp, i32 20894, i32 42 + ret i32 %sel +} + +; Test i8 sign bit testing with variable select values (problematic case) +define i32 @i8_signbit_variables(i8 %x, i8 %y, i32 %a, i32 %b) { +; CHECK-SD-LABEL: i8_signbit_variables: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add w8, w0, w1 +; CHECK-SD-NEXT: tst w8, #0x80 +; CHECK-SD-NEXT: csel w0, w2, w3, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: i8_signbit_variables: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add w8, w0, w1 +; CHECK-GI-NEXT: sxtb w8, w8 +; CHECK-GI-NEXT: cmp w8, #0 +; CHECK-GI-NEXT: csel w0, w2, w3, mi +; CHECK-GI-NEXT: ret + %add = add i8 %x, %y + %cmp = icmp slt i8 %add, 0 + %sel = select i1 %cmp, i32 %a, i32 %b + ret i32 %sel +} diff --git a/llvm/test/CodeGen/AArch64/itofp-bf16.ll b/llvm/test/CodeGen/AArch64/itofp-bf16.ll index 42641693c4081..0d3ae559449a4 100644 --- a/llvm/test/CodeGen/AArch64/itofp-bf16.ll +++ b/llvm/test/CodeGen/AArch64/itofp-bf16.ll @@ -740,162 +740,151 @@ entry: define <32 x bfloat> @stofp_v32i64_v32bf16(<32 x i64> %a) { ; CHECK-LABEL: stofp_v32i64_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov x10, d2 -; CHECK-NEXT: mov x9, v3.d[1] -; CHECK-NEXT: mov x8, v2.d[1] -; CHECK-NEXT: fmov x11, d3 -; CHECK-NEXT: fmov x12, d0 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: scvtf s2, x10 -; CHECK-NEXT: mov x10, v0.d[1] -; CHECK-NEXT: scvtf s19, x9 -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: scvtf s16, x11 -; CHECK-NEXT: mov x11, v6.d[1] -; CHECK-NEXT: scvtf s0, x12 -; CHECK-NEXT: scvtf s18, x8 -; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: ldp x8, x9, [sp, #32] +; CHECK-NEXT: mov x13, v2.d[1] +; CHECK-NEXT: ldp x10, x12, [sp, #96] +; CHECK-NEXT: fmov x14, d3 +; CHECK-NEXT: movi v17.4s, #1 +; CHECK-NEXT: scvtf s18, x9 +; CHECK-NEXT: scvtf s16, x8 +; CHECK-NEXT: ldp x8, x9, [sp, #48] +; CHECK-NEXT: scvtf s23, x12 ; CHECK-NEXT: scvtf s20, x10 -; CHECK-NEXT: scvtf s17, x9 -; CHECK-NEXT: mov x9, v7.d[1] -; CHECK-NEXT: mov x10, v4.d[1] -; CHECK-NEXT: scvtf s21, x11 -; CHECK-NEXT: fmov x11, d6 -; CHECK-NEXT: mov v2.s[1], v18.s[0] -; CHECK-NEXT: scvtf s25, x8 -; CHECK-NEXT: movi v6.4s, #127, msl #8 -; CHECK-NEXT: mov v0.s[1], v20.s[0] -; CHECK-NEXT: ldp q24, q20, [sp, #32] -; CHECK-NEXT: scvtf s22, x9 -; CHECK-NEXT: fmov x9, d4 -; CHECK-NEXT: scvtf s1, x11 -; CHECK-NEXT: scvtf s26, x10 -; CHECK-NEXT: fmov x11, d7 -; CHECK-NEXT: mov v2.s[2], v16.s[0] -; CHECK-NEXT: ldp q18, q16, [sp] -; CHECK-NEXT: mov x8, v24.d[1] -; CHECK-NEXT: scvtf s4, x9 -; CHECK-NEXT: fmov x9, d5 -; CHECK-NEXT: mov v0.s[2], v17.s[0] -; CHECK-NEXT: mov v1.s[1], v21.s[0] -; CHECK-NEXT: scvtf s23, x11 -; CHECK-NEXT: mov x11, v5.d[1] -; CHECK-NEXT: mov v2.s[3], v19.s[0] +; CHECK-NEXT: mov x10, v0.d[1] +; CHECK-NEXT: scvtf s27, x13 ; CHECK-NEXT: scvtf s21, x8 -; CHECK-NEXT: mov x8, v20.d[1] -; CHECK-NEXT: scvtf s17, x9 -; CHECK-NEXT: fmov x9, d24 -; 
CHECK-NEXT: mov v4.s[1], v26.s[0] -; CHECK-NEXT: mov v0.s[3], v25.s[0] -; CHECK-NEXT: ldp q26, q24, [sp, #96] -; CHECK-NEXT: mov v1.s[2], v23.s[0] -; CHECK-NEXT: ldp q25, q23, [sp, #64] -; CHECK-NEXT: scvtf s7, x11 -; CHECK-NEXT: scvtf s27, x8 -; CHECK-NEXT: fmov x8, d18 -; CHECK-NEXT: scvtf s5, x9 -; CHECK-NEXT: mov x10, v26.d[1] -; CHECK-NEXT: mov x9, v18.d[1] -; CHECK-NEXT: fmov x11, d20 -; CHECK-NEXT: mov v4.s[2], v17.s[0] -; CHECK-NEXT: mov v1.s[3], v22.s[0] -; CHECK-NEXT: ushr v19.4s, v2.4s, #16 -; CHECK-NEXT: scvtf s17, x8 -; CHECK-NEXT: fmov x8, d26 -; CHECK-NEXT: add v26.4s, v2.4s, v6.4s +; CHECK-NEXT: ldp x8, x11, [sp] +; CHECK-NEXT: mov v16.s[1], v18.s[0] +; CHECK-NEXT: scvtf s24, x9 +; CHECK-NEXT: movi v18.4s, #127, msl #8 +; CHECK-NEXT: mov v20.s[1], v23.s[0] ; CHECK-NEXT: scvtf s22, x11 -; CHECK-NEXT: mov x11, v25.d[1] -; CHECK-NEXT: mov v5.s[1], v21.s[0] -; CHECK-NEXT: scvtf s28, x10 -; CHECK-NEXT: fmov x10, d16 -; CHECK-NEXT: scvtf s21, x9 -; CHECK-NEXT: fmov x9, d25 -; CHECK-NEXT: scvtf s18, x8 -; CHECK-NEXT: mov x8, v16.d[1] -; CHECK-NEXT: mov v4.s[3], v7.s[0] -; CHECK-NEXT: and v19.16b, v19.16b, v3.16b -; CHECK-NEXT: scvtf s16, x10 -; CHECK-NEXT: fmov x10, d24 +; CHECK-NEXT: ldp x11, x12, [sp, #16] +; CHECK-NEXT: scvtf s19, x8 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: mov v16.s[2], v21.s[0] ; CHECK-NEXT: scvtf s25, x11 -; CHECK-NEXT: scvtf s20, x9 -; CHECK-NEXT: mov x9, v24.d[1] -; CHECK-NEXT: mov v17.s[1], v21.s[0] -; CHECK-NEXT: fmov x11, d23 -; CHECK-NEXT: mov v18.s[1], v28.s[0] -; CHECK-NEXT: scvtf s24, x8 -; CHECK-NEXT: scvtf s21, x10 -; CHECK-NEXT: mov x10, v23.d[1] -; CHECK-NEXT: mov v5.s[2], v22.s[0] -; CHECK-NEXT: ushr v22.4s, v1.4s, #16 -; CHECK-NEXT: ushr v28.4s, v0.4s, #16 +; CHECK-NEXT: ldp x9, x11, [sp, #112] +; CHECK-NEXT: mov v19.s[1], v22.s[0] +; CHECK-NEXT: scvtf s22, x12 +; CHECK-NEXT: scvtf s26, x9 +; CHECK-NEXT: ldp x9, x12, [sp, #64] ; CHECK-NEXT: scvtf s23, x11 -; CHECK-NEXT: mov v20.s[1], v25.s[0] -; CHECK-NEXT: scvtf s25, x9 -; CHECK-NEXT: mov v17.s[2], v16.s[0] -; CHECK-NEXT: add v16.4s, v19.4s, v26.4s -; CHECK-NEXT: ushr v26.4s, v4.4s, #16 -; CHECK-NEXT: mov v18.s[2], v21.s[0] -; CHECK-NEXT: scvtf s7, x10 -; CHECK-NEXT: and v22.16b, v22.16b, v3.16b -; CHECK-NEXT: mov v5.s[3], v27.s[0] -; CHECK-NEXT: and v21.16b, v28.16b, v3.16b -; CHECK-NEXT: fcmeq v19.4s, v2.4s, v2.4s -; CHECK-NEXT: mov v20.s[2], v23.s[0] -; CHECK-NEXT: add v23.4s, v0.4s, v6.4s +; CHECK-NEXT: mov v16.s[3], v24.s[0] +; CHECK-NEXT: fmov x11, d2 +; CHECK-NEXT: scvtf s24, x12 +; CHECK-NEXT: scvtf s2, x9 +; CHECK-NEXT: mov x9, v6.d[1] +; CHECK-NEXT: ldp x12, x13, [sp, #80] +; CHECK-NEXT: scvtf s21, x11 +; CHECK-NEXT: mov x11, v4.d[1] +; CHECK-NEXT: mov v19.s[2], v25.s[0] +; CHECK-NEXT: mov v20.s[2], v26.s[0] +; CHECK-NEXT: ushr v25.4s, v16.4s, #16 +; CHECK-NEXT: scvtf s26, x14 +; CHECK-NEXT: scvtf s3, x12 +; CHECK-NEXT: mov v2.s[1], v24.s[0] +; CHECK-NEXT: scvtf s24, x10 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: scvtf s6, x9 +; CHECK-NEXT: mov v21.s[1], v27.s[0] +; CHECK-NEXT: scvtf s27, x11 +; CHECK-NEXT: fmov x11, d7 +; CHECK-NEXT: mov v19.s[3], v22.s[0] +; CHECK-NEXT: mov v20.s[3], v23.s[0] +; CHECK-NEXT: add v22.4s, v16.4s, v18.4s +; CHECK-NEXT: mov v2.s[2], v3.s[0] +; CHECK-NEXT: scvtf s3, x10 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: scvtf s0, x12 +; CHECK-NEXT: and v23.16b, v25.16b, v17.16b +; CHECK-NEXT: mov x9, v1.d[1] +; CHECK-NEXT: fmov x12, d5 +; CHECK-NEXT: mov v21.s[2], v26.s[0] +; CHECK-NEXT: scvtf s25, x13 +; CHECK-NEXT: scvtf s4, 
x10 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: add v26.4s, v20.4s, v18.4s +; CHECK-NEXT: mov v3.s[1], v6.s[0] +; CHECK-NEXT: scvtf s6, x11 +; CHECK-NEXT: mov x11, v5.d[1] +; CHECK-NEXT: scvtf s5, x8 +; CHECK-NEXT: mov v0.s[1], v24.s[0] +; CHECK-NEXT: add v22.4s, v23.4s, v22.4s +; CHECK-NEXT: scvtf s1, x10 +; CHECK-NEXT: mov x10, v7.d[1] +; CHECK-NEXT: scvtf s7, x12 +; CHECK-NEXT: mov v4.s[1], v27.s[0] +; CHECK-NEXT: ushr v23.4s, v19.4s, #16 +; CHECK-NEXT: mov v2.s[3], v25.s[0] +; CHECK-NEXT: mov v3.s[2], v6.s[0] +; CHECK-NEXT: add v25.4s, v19.4s, v18.4s +; CHECK-NEXT: ushr v24.4s, v20.4s, #16 +; CHECK-NEXT: mov v21.s[3], v5.s[0] +; CHECK-NEXT: scvtf s5, x11 +; CHECK-NEXT: fcmeq v29.4s, v20.4s, v20.4s +; CHECK-NEXT: scvtf s6, x10 +; CHECK-NEXT: and v23.16b, v23.16b, v17.16b +; CHECK-NEXT: mov v0.s[2], v1.s[0] +; CHECK-NEXT: scvtf s1, x9 +; CHECK-NEXT: mov v4.s[2], v7.s[0] +; CHECK-NEXT: and v24.16b, v24.16b, v17.16b +; CHECK-NEXT: fcmeq v7.4s, v16.4s, v16.4s +; CHECK-NEXT: orr v16.4s, #64, lsl #16 +; CHECK-NEXT: fcmeq v31.4s, v2.4s, v2.4s +; CHECK-NEXT: add v27.4s, v21.4s, v18.4s +; CHECK-NEXT: orr v20.4s, #64, lsl #16 +; CHECK-NEXT: mov v3.s[3], v6.s[0] +; CHECK-NEXT: add v6.4s, v23.4s, v25.4s +; CHECK-NEXT: ushr v23.4s, v21.4s, #16 +; CHECK-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NEXT: mov v4.s[3], v5.s[0] +; CHECK-NEXT: ushr v1.4s, v2.4s, #16 +; CHECK-NEXT: add v24.4s, v24.4s, v26.4s +; CHECK-NEXT: add v25.4s, v2.4s, v18.4s +; CHECK-NEXT: fcmeq v5.4s, v19.4s, v19.4s +; CHECK-NEXT: and v23.16b, v23.16b, v17.16b +; CHECK-NEXT: orr v19.4s, #64, lsl #16 ; CHECK-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-NEXT: mov v17.s[3], v24.s[0] -; CHECK-NEXT: add v24.4s, v1.4s, v6.4s -; CHECK-NEXT: fcmeq v27.4s, v1.4s, v1.4s -; CHECK-NEXT: mov v18.s[3], v25.s[0] -; CHECK-NEXT: add v25.4s, v4.4s, v6.4s -; CHECK-NEXT: orr v1.4s, #64, lsl #16 -; CHECK-NEXT: bit v2.16b, v16.16b, v19.16b -; CHECK-NEXT: mov v20.s[3], v7.s[0] -; CHECK-NEXT: add v22.4s, v22.4s, v24.4s -; CHECK-NEXT: add v7.4s, v21.4s, v23.4s -; CHECK-NEXT: ushr v24.4s, v17.4s, #16 -; CHECK-NEXT: and v23.16b, v26.16b, v3.16b -; CHECK-NEXT: ushr v26.4s, v5.4s, #16 -; CHECK-NEXT: ushr v28.4s, v18.4s, #16 -; CHECK-NEXT: add v30.4s, v17.4s, v6.4s -; CHECK-NEXT: add v31.4s, v18.4s, v6.4s -; CHECK-NEXT: fcmeq v21.4s, v0.4s, v0.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v1.16b, v22.16b, v27.16b -; CHECK-NEXT: ushr v29.4s, v20.4s, #16 -; CHECK-NEXT: and v24.16b, v24.16b, v3.16b -; CHECK-NEXT: add v23.4s, v23.4s, v25.4s -; CHECK-NEXT: and v28.16b, v28.16b, v3.16b -; CHECK-NEXT: and v25.16b, v26.16b, v3.16b -; CHECK-NEXT: add v26.4s, v5.4s, v6.4s -; CHECK-NEXT: add v6.4s, v20.4s, v6.4s -; CHECK-NEXT: and v3.16b, v29.16b, v3.16b -; CHECK-NEXT: add v24.4s, v24.4s, v30.4s -; CHECK-NEXT: fcmeq v30.4s, v17.4s, v17.4s -; CHECK-NEXT: add v28.4s, v28.4s, v31.4s -; CHECK-NEXT: fcmeq v31.4s, v18.4s, v18.4s -; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s +; CHECK-NEXT: ushr v28.4s, v3.4s, #16 +; CHECK-NEXT: and v1.16b, v1.16b, v17.16b +; CHECK-NEXT: bsl v7.16b, v22.16b, v16.16b +; CHECK-NEXT: ushr v26.4s, v0.4s, #16 +; CHECK-NEXT: ushr v30.4s, v4.4s, #16 +; CHECK-NEXT: add v23.4s, v23.4s, v27.4s +; CHECK-NEXT: bsl v5.16b, v6.16b, v19.16b +; CHECK-NEXT: mov v6.16b, v29.16b +; CHECK-NEXT: and v27.16b, v28.16b, v17.16b +; CHECK-NEXT: add v28.4s, v3.4s, v18.4s +; CHECK-NEXT: add v1.4s, v1.4s, v25.4s +; CHECK-NEXT: and v25.16b, v26.16b, v17.16b +; CHECK-NEXT: add v26.4s, v0.4s, v18.4s +; CHECK-NEXT: and v17.16b, v30.16b, v17.16b +; CHECK-NEXT: add v18.4s, v4.4s, 
v18.4s +; CHECK-NEXT: fcmeq v30.4s, v21.4s, v21.4s +; CHECK-NEXT: orr v21.4s, #64, lsl #16 +; CHECK-NEXT: add v27.4s, v27.4s, v28.4s +; CHECK-NEXT: fcmeq v28.4s, v3.4s, v3.4s +; CHECK-NEXT: orr v3.4s, #64, lsl #16 ; CHECK-NEXT: add v25.4s, v25.4s, v26.4s -; CHECK-NEXT: fcmeq v26.4s, v5.4s, v5.4s +; CHECK-NEXT: fcmeq v26.4s, v0.4s, v0.4s +; CHECK-NEXT: orr v0.4s, #64, lsl #16 +; CHECK-NEXT: add v17.4s, v17.4s, v18.4s +; CHECK-NEXT: fcmeq v18.4s, v4.4s, v4.4s ; CHECK-NEXT: orr v4.4s, #64, lsl #16 -; CHECK-NEXT: add v3.4s, v3.4s, v6.4s -; CHECK-NEXT: fcmeq v6.4s, v20.4s, v20.4s -; CHECK-NEXT: orr v5.4s, #64, lsl #16 -; CHECK-NEXT: orr v17.4s, #64, lsl #16 -; CHECK-NEXT: orr v18.4s, #64, lsl #16 -; CHECK-NEXT: orr v20.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v7.16b, v21.16b -; CHECK-NEXT: mov v7.16b, v30.16b -; CHECK-NEXT: mov v16.16b, v31.16b -; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b -; CHECK-NEXT: bit v5.16b, v25.16b, v26.16b -; CHECK-NEXT: bif v3.16b, v20.16b, v6.16b -; CHECK-NEXT: bsl v7.16b, v24.16b, v17.16b -; CHECK-NEXT: bsl v16.16b, v28.16b, v18.16b -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: uzp2 v1.8h, v4.8h, v1.8h -; CHECK-NEXT: uzp2 v2.8h, v7.8h, v5.8h -; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h +; CHECK-NEXT: mov v16.16b, v30.16b +; CHECK-NEXT: bsl v6.16b, v24.16b, v20.16b +; CHECK-NEXT: bif v1.16b, v2.16b, v31.16b +; CHECK-NEXT: mov v19.16b, v28.16b +; CHECK-NEXT: uzp2 v2.8h, v5.8h, v7.8h +; CHECK-NEXT: bit v0.16b, v25.16b, v26.16b +; CHECK-NEXT: bsl v16.16b, v23.16b, v21.16b +; CHECK-NEXT: bit v4.16b, v17.16b, v18.16b +; CHECK-NEXT: bsl v19.16b, v27.16b, v3.16b +; CHECK-NEXT: uzp2 v3.8h, v1.8h, v6.8h +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v16.8h +; CHECK-NEXT: uzp2 v1.8h, v4.8h, v19.8h ; CHECK-NEXT: ret entry: %c = sitofp <32 x i64> %a to <32 x bfloat> @@ -905,162 +894,151 @@ entry: define <32 x bfloat> @utofp_v32i64_v32bf16(<32 x i64> %a) { ; CHECK-LABEL: utofp_v32i64_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov x10, d2 -; CHECK-NEXT: mov x9, v3.d[1] -; CHECK-NEXT: mov x8, v2.d[1] -; CHECK-NEXT: fmov x11, d3 -; CHECK-NEXT: fmov x12, d0 -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: ucvtf s2, x10 -; CHECK-NEXT: mov x10, v0.d[1] -; CHECK-NEXT: ucvtf s19, x9 -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: ucvtf s16, x11 -; CHECK-NEXT: mov x11, v6.d[1] -; CHECK-NEXT: ucvtf s0, x12 -; CHECK-NEXT: ucvtf s18, x8 -; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: ldp x8, x9, [sp, #32] +; CHECK-NEXT: mov x13, v2.d[1] +; CHECK-NEXT: ldp x10, x12, [sp, #96] +; CHECK-NEXT: fmov x14, d3 +; CHECK-NEXT: movi v17.4s, #1 +; CHECK-NEXT: ucvtf s18, x9 +; CHECK-NEXT: ucvtf s16, x8 +; CHECK-NEXT: ldp x8, x9, [sp, #48] +; CHECK-NEXT: ucvtf s23, x12 ; CHECK-NEXT: ucvtf s20, x10 -; CHECK-NEXT: ucvtf s17, x9 -; CHECK-NEXT: mov x9, v7.d[1] -; CHECK-NEXT: mov x10, v4.d[1] -; CHECK-NEXT: ucvtf s21, x11 -; CHECK-NEXT: fmov x11, d6 -; CHECK-NEXT: mov v2.s[1], v18.s[0] -; CHECK-NEXT: ucvtf s25, x8 -; CHECK-NEXT: movi v6.4s, #127, msl #8 -; CHECK-NEXT: mov v0.s[1], v20.s[0] -; CHECK-NEXT: ldp q24, q20, [sp, #32] -; CHECK-NEXT: ucvtf s22, x9 -; CHECK-NEXT: fmov x9, d4 -; CHECK-NEXT: ucvtf s1, x11 -; CHECK-NEXT: ucvtf s26, x10 -; CHECK-NEXT: fmov x11, d7 -; CHECK-NEXT: mov v2.s[2], v16.s[0] -; CHECK-NEXT: ldp q18, q16, [sp] -; CHECK-NEXT: mov x8, v24.d[1] -; CHECK-NEXT: ucvtf s4, x9 -; CHECK-NEXT: fmov x9, d5 -; CHECK-NEXT: mov v0.s[2], v17.s[0] -; CHECK-NEXT: mov v1.s[1], v21.s[0] -; CHECK-NEXT: ucvtf s23, x11 -; CHECK-NEXT: mov x11, v5.d[1] -; CHECK-NEXT: mov v2.s[3], v19.s[0] +; CHECK-NEXT: mov x10, 
v0.d[1] +; CHECK-NEXT: ucvtf s27, x13 ; CHECK-NEXT: ucvtf s21, x8 -; CHECK-NEXT: mov x8, v20.d[1] -; CHECK-NEXT: ucvtf s17, x9 -; CHECK-NEXT: fmov x9, d24 -; CHECK-NEXT: mov v4.s[1], v26.s[0] -; CHECK-NEXT: mov v0.s[3], v25.s[0] -; CHECK-NEXT: ldp q26, q24, [sp, #96] -; CHECK-NEXT: mov v1.s[2], v23.s[0] -; CHECK-NEXT: ldp q25, q23, [sp, #64] -; CHECK-NEXT: ucvtf s7, x11 -; CHECK-NEXT: ucvtf s27, x8 -; CHECK-NEXT: fmov x8, d18 -; CHECK-NEXT: ucvtf s5, x9 -; CHECK-NEXT: mov x10, v26.d[1] -; CHECK-NEXT: mov x9, v18.d[1] -; CHECK-NEXT: fmov x11, d20 -; CHECK-NEXT: mov v4.s[2], v17.s[0] -; CHECK-NEXT: mov v1.s[3], v22.s[0] -; CHECK-NEXT: ushr v19.4s, v2.4s, #16 -; CHECK-NEXT: ucvtf s17, x8 -; CHECK-NEXT: fmov x8, d26 -; CHECK-NEXT: add v26.4s, v2.4s, v6.4s +; CHECK-NEXT: ldp x8, x11, [sp] +; CHECK-NEXT: mov v16.s[1], v18.s[0] +; CHECK-NEXT: ucvtf s24, x9 +; CHECK-NEXT: movi v18.4s, #127, msl #8 +; CHECK-NEXT: mov v20.s[1], v23.s[0] ; CHECK-NEXT: ucvtf s22, x11 -; CHECK-NEXT: mov x11, v25.d[1] -; CHECK-NEXT: mov v5.s[1], v21.s[0] -; CHECK-NEXT: ucvtf s28, x10 -; CHECK-NEXT: fmov x10, d16 -; CHECK-NEXT: ucvtf s21, x9 -; CHECK-NEXT: fmov x9, d25 -; CHECK-NEXT: ucvtf s18, x8 -; CHECK-NEXT: mov x8, v16.d[1] -; CHECK-NEXT: mov v4.s[3], v7.s[0] -; CHECK-NEXT: and v19.16b, v19.16b, v3.16b -; CHECK-NEXT: ucvtf s16, x10 -; CHECK-NEXT: fmov x10, d24 +; CHECK-NEXT: ldp x11, x12, [sp, #16] +; CHECK-NEXT: ucvtf s19, x8 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: mov v16.s[2], v21.s[0] ; CHECK-NEXT: ucvtf s25, x11 -; CHECK-NEXT: ucvtf s20, x9 -; CHECK-NEXT: mov x9, v24.d[1] -; CHECK-NEXT: mov v17.s[1], v21.s[0] -; CHECK-NEXT: fmov x11, d23 -; CHECK-NEXT: mov v18.s[1], v28.s[0] -; CHECK-NEXT: ucvtf s24, x8 -; CHECK-NEXT: ucvtf s21, x10 -; CHECK-NEXT: mov x10, v23.d[1] -; CHECK-NEXT: mov v5.s[2], v22.s[0] -; CHECK-NEXT: ushr v22.4s, v1.4s, #16 -; CHECK-NEXT: ushr v28.4s, v0.4s, #16 +; CHECK-NEXT: ldp x9, x11, [sp, #112] +; CHECK-NEXT: mov v19.s[1], v22.s[0] +; CHECK-NEXT: ucvtf s22, x12 +; CHECK-NEXT: ucvtf s26, x9 +; CHECK-NEXT: ldp x9, x12, [sp, #64] ; CHECK-NEXT: ucvtf s23, x11 -; CHECK-NEXT: mov v20.s[1], v25.s[0] -; CHECK-NEXT: ucvtf s25, x9 -; CHECK-NEXT: mov v17.s[2], v16.s[0] -; CHECK-NEXT: add v16.4s, v19.4s, v26.4s -; CHECK-NEXT: ushr v26.4s, v4.4s, #16 -; CHECK-NEXT: mov v18.s[2], v21.s[0] -; CHECK-NEXT: ucvtf s7, x10 -; CHECK-NEXT: and v22.16b, v22.16b, v3.16b -; CHECK-NEXT: mov v5.s[3], v27.s[0] -; CHECK-NEXT: and v21.16b, v28.16b, v3.16b -; CHECK-NEXT: fcmeq v19.4s, v2.4s, v2.4s -; CHECK-NEXT: mov v20.s[2], v23.s[0] -; CHECK-NEXT: add v23.4s, v0.4s, v6.4s +; CHECK-NEXT: mov v16.s[3], v24.s[0] +; CHECK-NEXT: fmov x11, d2 +; CHECK-NEXT: ucvtf s24, x12 +; CHECK-NEXT: ucvtf s2, x9 +; CHECK-NEXT: mov x9, v6.d[1] +; CHECK-NEXT: ldp x12, x13, [sp, #80] +; CHECK-NEXT: ucvtf s21, x11 +; CHECK-NEXT: mov x11, v4.d[1] +; CHECK-NEXT: mov v19.s[2], v25.s[0] +; CHECK-NEXT: mov v20.s[2], v26.s[0] +; CHECK-NEXT: ushr v25.4s, v16.4s, #16 +; CHECK-NEXT: ucvtf s26, x14 +; CHECK-NEXT: ucvtf s3, x12 +; CHECK-NEXT: mov v2.s[1], v24.s[0] +; CHECK-NEXT: ucvtf s24, x10 +; CHECK-NEXT: fmov x10, d6 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: ucvtf s6, x9 +; CHECK-NEXT: mov v21.s[1], v27.s[0] +; CHECK-NEXT: ucvtf s27, x11 +; CHECK-NEXT: fmov x11, d7 +; CHECK-NEXT: mov v19.s[3], v22.s[0] +; CHECK-NEXT: mov v20.s[3], v23.s[0] +; CHECK-NEXT: add v22.4s, v16.4s, v18.4s +; CHECK-NEXT: mov v2.s[2], v3.s[0] +; CHECK-NEXT: ucvtf s3, x10 +; CHECK-NEXT: fmov x10, d4 +; CHECK-NEXT: ucvtf s0, x12 +; CHECK-NEXT: and v23.16b, v25.16b, 
v17.16b +; CHECK-NEXT: mov x9, v1.d[1] +; CHECK-NEXT: fmov x12, d5 +; CHECK-NEXT: mov v21.s[2], v26.s[0] +; CHECK-NEXT: ucvtf s25, x13 +; CHECK-NEXT: ucvtf s4, x10 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: add v26.4s, v20.4s, v18.4s +; CHECK-NEXT: mov v3.s[1], v6.s[0] +; CHECK-NEXT: ucvtf s6, x11 +; CHECK-NEXT: mov x11, v5.d[1] +; CHECK-NEXT: ucvtf s5, x8 +; CHECK-NEXT: mov v0.s[1], v24.s[0] +; CHECK-NEXT: add v22.4s, v23.4s, v22.4s +; CHECK-NEXT: ucvtf s1, x10 +; CHECK-NEXT: mov x10, v7.d[1] +; CHECK-NEXT: ucvtf s7, x12 +; CHECK-NEXT: mov v4.s[1], v27.s[0] +; CHECK-NEXT: ushr v23.4s, v19.4s, #16 +; CHECK-NEXT: mov v2.s[3], v25.s[0] +; CHECK-NEXT: mov v3.s[2], v6.s[0] +; CHECK-NEXT: add v25.4s, v19.4s, v18.4s +; CHECK-NEXT: ushr v24.4s, v20.4s, #16 +; CHECK-NEXT: mov v21.s[3], v5.s[0] +; CHECK-NEXT: ucvtf s5, x11 +; CHECK-NEXT: fcmeq v29.4s, v20.4s, v20.4s +; CHECK-NEXT: ucvtf s6, x10 +; CHECK-NEXT: and v23.16b, v23.16b, v17.16b +; CHECK-NEXT: mov v0.s[2], v1.s[0] +; CHECK-NEXT: ucvtf s1, x9 +; CHECK-NEXT: mov v4.s[2], v7.s[0] +; CHECK-NEXT: and v24.16b, v24.16b, v17.16b +; CHECK-NEXT: fcmeq v7.4s, v16.4s, v16.4s +; CHECK-NEXT: orr v16.4s, #64, lsl #16 +; CHECK-NEXT: fcmeq v31.4s, v2.4s, v2.4s +; CHECK-NEXT: add v27.4s, v21.4s, v18.4s +; CHECK-NEXT: orr v20.4s, #64, lsl #16 +; CHECK-NEXT: mov v3.s[3], v6.s[0] +; CHECK-NEXT: add v6.4s, v23.4s, v25.4s +; CHECK-NEXT: ushr v23.4s, v21.4s, #16 +; CHECK-NEXT: mov v0.s[3], v1.s[0] +; CHECK-NEXT: mov v4.s[3], v5.s[0] +; CHECK-NEXT: ushr v1.4s, v2.4s, #16 +; CHECK-NEXT: add v24.4s, v24.4s, v26.4s +; CHECK-NEXT: add v25.4s, v2.4s, v18.4s +; CHECK-NEXT: fcmeq v5.4s, v19.4s, v19.4s +; CHECK-NEXT: and v23.16b, v23.16b, v17.16b +; CHECK-NEXT: orr v19.4s, #64, lsl #16 ; CHECK-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-NEXT: mov v17.s[3], v24.s[0] -; CHECK-NEXT: add v24.4s, v1.4s, v6.4s -; CHECK-NEXT: fcmeq v27.4s, v1.4s, v1.4s -; CHECK-NEXT: mov v18.s[3], v25.s[0] -; CHECK-NEXT: add v25.4s, v4.4s, v6.4s -; CHECK-NEXT: orr v1.4s, #64, lsl #16 -; CHECK-NEXT: bit v2.16b, v16.16b, v19.16b -; CHECK-NEXT: mov v20.s[3], v7.s[0] -; CHECK-NEXT: add v22.4s, v22.4s, v24.4s -; CHECK-NEXT: add v7.4s, v21.4s, v23.4s -; CHECK-NEXT: ushr v24.4s, v17.4s, #16 -; CHECK-NEXT: and v23.16b, v26.16b, v3.16b -; CHECK-NEXT: ushr v26.4s, v5.4s, #16 -; CHECK-NEXT: ushr v28.4s, v18.4s, #16 -; CHECK-NEXT: add v30.4s, v17.4s, v6.4s -; CHECK-NEXT: add v31.4s, v18.4s, v6.4s -; CHECK-NEXT: fcmeq v21.4s, v0.4s, v0.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v1.16b, v22.16b, v27.16b -; CHECK-NEXT: ushr v29.4s, v20.4s, #16 -; CHECK-NEXT: and v24.16b, v24.16b, v3.16b -; CHECK-NEXT: add v23.4s, v23.4s, v25.4s -; CHECK-NEXT: and v28.16b, v28.16b, v3.16b -; CHECK-NEXT: and v25.16b, v26.16b, v3.16b -; CHECK-NEXT: add v26.4s, v5.4s, v6.4s -; CHECK-NEXT: add v6.4s, v20.4s, v6.4s -; CHECK-NEXT: and v3.16b, v29.16b, v3.16b -; CHECK-NEXT: add v24.4s, v24.4s, v30.4s -; CHECK-NEXT: fcmeq v30.4s, v17.4s, v17.4s -; CHECK-NEXT: add v28.4s, v28.4s, v31.4s -; CHECK-NEXT: fcmeq v31.4s, v18.4s, v18.4s -; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s +; CHECK-NEXT: ushr v28.4s, v3.4s, #16 +; CHECK-NEXT: and v1.16b, v1.16b, v17.16b +; CHECK-NEXT: bsl v7.16b, v22.16b, v16.16b +; CHECK-NEXT: ushr v26.4s, v0.4s, #16 +; CHECK-NEXT: ushr v30.4s, v4.4s, #16 +; CHECK-NEXT: add v23.4s, v23.4s, v27.4s +; CHECK-NEXT: bsl v5.16b, v6.16b, v19.16b +; CHECK-NEXT: mov v6.16b, v29.16b +; CHECK-NEXT: and v27.16b, v28.16b, v17.16b +; CHECK-NEXT: add v28.4s, v3.4s, v18.4s +; CHECK-NEXT: add v1.4s, v1.4s, v25.4s +; 
CHECK-NEXT: and v25.16b, v26.16b, v17.16b +; CHECK-NEXT: add v26.4s, v0.4s, v18.4s +; CHECK-NEXT: and v17.16b, v30.16b, v17.16b +; CHECK-NEXT: add v18.4s, v4.4s, v18.4s +; CHECK-NEXT: fcmeq v30.4s, v21.4s, v21.4s +; CHECK-NEXT: orr v21.4s, #64, lsl #16 +; CHECK-NEXT: add v27.4s, v27.4s, v28.4s +; CHECK-NEXT: fcmeq v28.4s, v3.4s, v3.4s +; CHECK-NEXT: orr v3.4s, #64, lsl #16 ; CHECK-NEXT: add v25.4s, v25.4s, v26.4s -; CHECK-NEXT: fcmeq v26.4s, v5.4s, v5.4s +; CHECK-NEXT: fcmeq v26.4s, v0.4s, v0.4s +; CHECK-NEXT: orr v0.4s, #64, lsl #16 +; CHECK-NEXT: add v17.4s, v17.4s, v18.4s +; CHECK-NEXT: fcmeq v18.4s, v4.4s, v4.4s ; CHECK-NEXT: orr v4.4s, #64, lsl #16 -; CHECK-NEXT: add v3.4s, v3.4s, v6.4s -; CHECK-NEXT: fcmeq v6.4s, v20.4s, v20.4s -; CHECK-NEXT: orr v5.4s, #64, lsl #16 -; CHECK-NEXT: orr v17.4s, #64, lsl #16 -; CHECK-NEXT: orr v18.4s, #64, lsl #16 -; CHECK-NEXT: orr v20.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v7.16b, v21.16b -; CHECK-NEXT: mov v7.16b, v30.16b -; CHECK-NEXT: mov v16.16b, v31.16b -; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b -; CHECK-NEXT: bit v5.16b, v25.16b, v26.16b -; CHECK-NEXT: bif v3.16b, v20.16b, v6.16b -; CHECK-NEXT: bsl v7.16b, v24.16b, v17.16b -; CHECK-NEXT: bsl v16.16b, v28.16b, v18.16b -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: uzp2 v1.8h, v4.8h, v1.8h -; CHECK-NEXT: uzp2 v2.8h, v7.8h, v5.8h -; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h +; CHECK-NEXT: mov v16.16b, v30.16b +; CHECK-NEXT: bsl v6.16b, v24.16b, v20.16b +; CHECK-NEXT: bif v1.16b, v2.16b, v31.16b +; CHECK-NEXT: mov v19.16b, v28.16b +; CHECK-NEXT: uzp2 v2.8h, v5.8h, v7.8h +; CHECK-NEXT: bit v0.16b, v25.16b, v26.16b +; CHECK-NEXT: bsl v16.16b, v23.16b, v21.16b +; CHECK-NEXT: bit v4.16b, v17.16b, v18.16b +; CHECK-NEXT: bsl v19.16b, v27.16b, v3.16b +; CHECK-NEXT: uzp2 v3.8h, v1.8h, v6.8h +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v16.8h +; CHECK-NEXT: uzp2 v1.8h, v4.8h, v19.8h ; CHECK-NEXT: ret entry: %c = uitofp <32 x i64> %a to <32 x bfloat> diff --git a/llvm/test/CodeGen/AArch64/ldexp.ll b/llvm/test/CodeGen/AArch64/ldexp.ll index 6019fa1490e3d..1e35bd627a199 100644 --- a/llvm/test/CodeGen/AArch64/ldexp.ll +++ b/llvm/test/CodeGen/AArch64/ldexp.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck -check-prefixes=SVE,SVELINUX %s +; RUN: llc -mtriple=aarch64 -global-isel < %s -o - | FileCheck -check-prefixes=GISEL %s ; RUN: llc -mtriple=aarch64-windows-msvc -mattr=+sve < %s -o - | FileCheck -check-prefixes=SVE,SVEWINDOWS %s ; RUN: llc -mtriple=aarch64-windows-msvc < %s -o - | FileCheck -check-prefixes=WINDOWS %s @@ -15,6 +16,10 @@ define double @testExp(double %val, i32 %a) { ; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE-NEXT: ret ; +; GISEL-LABEL: testExp: +; GISEL: // %bb.0: // %entry +; GISEL-NEXT: b ldexp +; ; WINDOWS-LABEL: testExp: ; WINDOWS: // %bb.0: // %entry ; WINDOWS-NEXT: b ldexp @@ -37,6 +42,10 @@ define double @testExpIntrinsic(double %val, i32 %a) { ; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0 ; SVE-NEXT: ret ; +; GISEL-LABEL: testExpIntrinsic: +; GISEL: // %bb.0: // %entry +; GISEL-NEXT: b ldexp +; ; WINDOWS-LABEL: testExpIntrinsic: ; WINDOWS: // %bb.0: // %entry ; WINDOWS-NEXT: b ldexp @@ -55,6 +64,10 @@ define float @testExpf(float %val, i32 %a) { ; SVELINUX-NEXT: // kill: def $s0 killed $s0 killed $z0 ; SVELINUX-NEXT: ret ; +; GISEL-LABEL: testExpf: +; GISEL: // %bb.0: // %entry +; GISEL-NEXT: b ldexpf +; ; SVEWINDOWS-LABEL: testExpf: ; 
SVEWINDOWS: // %bb.0: // %entry ; SVEWINDOWS-NEXT: b ldexpf @@ -77,6 +90,10 @@ define float @testExpfIntrinsic(float %val, i32 %a) { ; SVE-NEXT: // kill: def $s0 killed $s0 killed $z0 ; SVE-NEXT: ret ; +; GISEL-LABEL: testExpfIntrinsic: +; GISEL: // %bb.0: // %entry +; GISEL-NEXT: b ldexpf +; ; WINDOWS-LABEL: testExpfIntrinsic: ; WINDOWS: .seh_proc testExpfIntrinsic ; WINDOWS-NEXT: // %bb.0: // %entry @@ -98,6 +115,90 @@ entry: ret float %call } +define <2 x float> @test_ldexp_v2f32_v2i32(<2 x float> %Val, <2 x i32> %Exp) { +; SVE-LABEL: test_ldexp_v2f32_v2i32: +; SVE: // %bb.0: +; SVE-NEXT: // kill: def $d1 killed $d1 def $z1 +; SVE-NEXT: mov w8, v1.s[1] +; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: mov s2, v0.s[1] +; SVE-NEXT: ptrue p0.s +; SVE-NEXT: fscale z0.s, p0/m, z0.s, z1.s +; SVE-NEXT: fmov s3, w8 +; SVE-NEXT: fscale z2.s, p0/m, z2.s, z3.s +; SVE-NEXT: mov v0.s[1], v2.s[0] +; SVE-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE-NEXT: ret +; +; GISEL-LABEL: test_ldexp_v2f32_v2i32: +; GISEL: // %bb.0: +; GISEL-NEXT: sub sp, sp, #48 +; GISEL-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill +; GISEL-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 48 +; GISEL-NEXT: .cfi_offset w30, -16 +; GISEL-NEXT: .cfi_offset b8, -24 +; GISEL-NEXT: .cfi_offset b9, -32 +; GISEL-NEXT: // kill: def $d1 killed $d1 def $q1 +; GISEL-NEXT: fmov w0, s1 +; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 +; GISEL-NEXT: mov s8, v0.s[1] +; GISEL-NEXT: mov s9, v1.s[1] +; GISEL-NEXT: // kill: def $s0 killed $s0 killed $q0 +; GISEL-NEXT: bl ldexpf +; GISEL-NEXT: // kill: def $s0 killed $s0 def $q0 +; GISEL-NEXT: str q0, [sp] // 16-byte Folded Spill +; GISEL-NEXT: fmov w0, s9 +; GISEL-NEXT: fmov s0, s8 +; GISEL-NEXT: bl ldexpf +; GISEL-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; GISEL-NEXT: // kill: def $s0 killed $s0 def $q0 +; GISEL-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; GISEL-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; GISEL-NEXT: mov v1.s[1], v0.s[0] +; GISEL-NEXT: fmov d0, d1 +; GISEL-NEXT: add sp, sp, #48 +; GISEL-NEXT: ret +; +; WINDOWS-LABEL: test_ldexp_v2f32_v2i32: +; WINDOWS: .seh_proc test_ldexp_v2f32_v2i32 +; WINDOWS-NEXT: // %bb.0: +; WINDOWS-NEXT: sub sp, sp, #48 +; WINDOWS-NEXT: .seh_stackalloc 48 +; WINDOWS-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; WINDOWS-NEXT: .seh_save_reg x30, 32 +; WINDOWS-NEXT: .seh_endprologue +; WINDOWS-NEXT: // kill: def $d0 killed $d0 def $q0 +; WINDOWS-NEXT: mov s2, v0.s[1] +; WINDOWS-NEXT: // kill: def $d1 killed $d1 def $q1 +; WINDOWS-NEXT: stp q1, q0, [sp] // 32-byte Folded Spill +; WINDOWS-NEXT: mov w0, v1.s[1] +; WINDOWS-NEXT: fcvt d0, s2 +; WINDOWS-NEXT: bl ldexp +; WINDOWS-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; WINDOWS-NEXT: fcvt s0, d0 +; WINDOWS-NEXT: fcvt d1, s1 +; WINDOWS-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; WINDOWS-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; WINDOWS-NEXT: fmov w0, s0 +; WINDOWS-NEXT: fmov d0, d1 +; WINDOWS-NEXT: bl ldexp +; WINDOWS-NEXT: fcvt s0, d0 +; WINDOWS-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; WINDOWS-NEXT: mov v0.s[1], v1.s[0] +; WINDOWS-NEXT: // kill: def $d0 killed $d0 killed $q0 +; WINDOWS-NEXT: .seh_startepilogue +; WINDOWS-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; WINDOWS-NEXT: .seh_save_reg x30, 32 +; WINDOWS-NEXT: add sp, sp, #48 +; WINDOWS-NEXT: .seh_stackalloc 48 +; WINDOWS-NEXT: .seh_endepilogue +; WINDOWS-NEXT: ret +; WINDOWS-NEXT: .seh_endfunclet +; WINDOWS-NEXT: .seh_endproc + 
%result = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %Val, <2 x i32> %Exp) + ret <2 x float> %result +} declare float @ldexpf(float, i32) memory(none) @@ -106,6 +207,10 @@ define fp128 @testExpl(fp128 %val, i32 %a) { ; SVE: // %bb.0: // %entry ; SVE-NEXT: b ldexpl ; +; GISEL-LABEL: testExpl: +; GISEL: // %bb.0: // %entry +; GISEL-NEXT: b ldexpl +; ; WINDOWS-LABEL: testExpl: ; WINDOWS: // %bb.0: // %entry ; WINDOWS-NEXT: b ldexpl @@ -126,6 +231,17 @@ define half @testExpf16(half %val, i32 %a) { ; SVE-NEXT: fcvt h0, s0 ; SVE-NEXT: ret ; +; GISEL-LABEL: testExpf16: +; GISEL: // %bb.0: // %entry +; GISEL-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 16 +; GISEL-NEXT: .cfi_offset w30, -16 +; GISEL-NEXT: fcvt s0, h0 +; GISEL-NEXT: bl ldexpf +; GISEL-NEXT: fcvt h0, s0 +; GISEL-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; GISEL-NEXT: ret +; ; WINDOWS-LABEL: testExpf16: ; WINDOWS: .seh_proc testExpf16 ; WINDOWS-NEXT: // %bb.0: // %entry diff --git a/llvm/test/CodeGen/AArch64/madd-lohi.ll b/llvm/test/CodeGen/AArch64/madd-lohi.ll deleted file mode 100644 index e5d8fcdda326d..0000000000000 --- a/llvm/test/CodeGen/AArch64/madd-lohi.ll +++ /dev/null @@ -1,25 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=arm64-apple-ios7.0 %s -o - | FileCheck %s -; RUN: llc -mtriple=aarch64_be-linux-gnu %s -o - | FileCheck --check-prefix=CHECK-BE %s - -define i128 @test_128bitmul(i128 %lhs, i128 %rhs) { -; CHECK-LABEL: test_128bitmul: -; CHECK: ; %bb.0: -; CHECK-NEXT: umulh x8, x0, x2 -; CHECK-NEXT: madd x8, x0, x3, x8 -; CHECK-NEXT: mul x0, x0, x2 -; CHECK-NEXT: madd x1, x1, x2, x8 -; CHECK-NEXT: ret -; -; CHECK-BE-LABEL: test_128bitmul: -; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: umulh x8, x1, x3 -; CHECK-BE-NEXT: madd x8, x1, x2, x8 -; CHECK-BE-NEXT: mul x1, x1, x3 -; CHECK-BE-NEXT: madd x0, x0, x3, x8 -; CHECK-BE-NEXT: ret - - - %prod = mul i128 %lhs, %rhs - ret i128 %prod -} diff --git a/llvm/test/CodeGen/AArch64/movi64_sve.ll b/llvm/test/CodeGen/AArch64/movi64_sve.ll new file mode 100644 index 0000000000000..1d4e00d0c3d10 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/movi64_sve.ll @@ -0,0 +1,238 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=COMMON,NEON +; RUN: llc -mtriple=aarch64 -mattr=+neon,+sve < %s | FileCheck %s --check-prefixes=COMMON,SVE + +define <2 x i64> @movi_1_v2i64() { +; NEON-LABEL: movi_1_v2i64: +; NEON: // %bb.0: +; NEON-NEXT: mov w8, #1 // =0x1 +; NEON-NEXT: dup v0.2d, x8 +; NEON-NEXT: ret +; +; SVE-LABEL: movi_1_v2i64: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #1 // =0x1 +; SVE-NEXT: ret + ret <2 x i64> splat (i64 1) +} + +define <2 x i64> @movi_127_v2i64() { +; NEON-LABEL: movi_127_v2i64: +; NEON: // %bb.0: +; NEON-NEXT: mov w8, #127 // =0x7f +; NEON-NEXT: dup v0.2d, x8 +; NEON-NEXT: ret +; +; SVE-LABEL: movi_127_v2i64: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #127 // =0x7f +; SVE-NEXT: ret + ret <2 x i64> splat (i64 127) +} + +define <2 x i64> @movi_m128_v2i64() { +; NEON-LABEL: movi_m128_v2i64: +; NEON: // %bb.0: +; NEON-NEXT: mov x8, #-128 // =0xffffffffffffff80 +; NEON-NEXT: dup v0.2d, x8 +; NEON-NEXT: ret +; +; SVE-LABEL: movi_m128_v2i64: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #-128 // =0xffffffffffffff80 +; SVE-NEXT: ret + ret <2 x i64> splat (i64 -128) +} + +define <2 x i64> @movi_256_v2i64() { +; NEON-LABEL: movi_256_v2i64: +; NEON: // 
%bb.0: +; NEON-NEXT: mov w8, #256 // =0x100 +; NEON-NEXT: dup v0.2d, x8 +; NEON-NEXT: ret +; +; SVE-LABEL: movi_256_v2i64: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #256 // =0x100 +; SVE-NEXT: ret + ret <2 x i64> splat (i64 256) +} + +define <2 x i64> @movi_32512_v2i64() { +; NEON-LABEL: movi_32512_v2i64: +; NEON: // %bb.0: +; NEON-NEXT: mov w8, #32512 // =0x7f00 +; NEON-NEXT: dup v0.2d, x8 +; NEON-NEXT: ret +; +; SVE-LABEL: movi_32512_v2i64: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #32512 // =0x7f00 +; SVE-NEXT: ret + ret <2 x i64> splat (i64 32512) +} + +define <2 x i64> @movi_m32768_v2i64() { +; NEON-LABEL: movi_m32768_v2i64: +; NEON: // %bb.0: +; NEON-NEXT: mov x8, #-32768 // =0xffffffffffff8000 +; NEON-NEXT: dup v0.2d, x8 +; NEON-NEXT: ret +; +; SVE-LABEL: movi_m32768_v2i64: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #-32768 // =0xffffffffffff8000 +; SVE-NEXT: ret + ret <2 x i64> splat (i64 -32768) +} + +; Special cases where the destination vector does not have 64-bit elements + +define <4 x i32> @movi_v4i32_1() { +; NEON-LABEL: movi_v4i32_1: +; NEON: // %bb.0: +; NEON-NEXT: adrp x8, .LCPI6_0 +; NEON-NEXT: ldr q0, [x8, :lo12:.LCPI6_0] +; NEON-NEXT: ret +; +; SVE-LABEL: movi_v4i32_1: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #127 // =0x7f +; SVE-NEXT: ret + ret <4 x i32> +} + +define <4 x i32> @movi_v4i32_2() { +; NEON-LABEL: movi_v4i32_2: +; NEON: // %bb.0: +; NEON-NEXT: adrp x8, .LCPI7_0 +; NEON-NEXT: ldr q0, [x8, :lo12:.LCPI7_0] +; NEON-NEXT: ret +; +; SVE-LABEL: movi_v4i32_2: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #32512 // =0x7f00 +; SVE-NEXT: ret + ret <4 x i32> +} + +define <8 x i16> @movi_v8i16_1() { +; NEON-LABEL: movi_v8i16_1: +; NEON: // %bb.0: +; NEON-NEXT: adrp x8, .LCPI8_0 +; NEON-NEXT: ldr q0, [x8, :lo12:.LCPI8_0] +; NEON-NEXT: ret +; +; SVE-LABEL: movi_v8i16_1: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #127 // =0x7f +; SVE-NEXT: ret + ret <8 x i16> +} + +define <8 x i16> @movi_v8i16_2() { +; NEON-LABEL: movi_v8i16_2: +; NEON: // %bb.0: +; NEON-NEXT: adrp x8, .LCPI9_0 +; NEON-NEXT: ldr q0, [x8, :lo12:.LCPI9_0] +; NEON-NEXT: ret +; +; SVE-LABEL: movi_v8i16_2: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #32512 // =0x7f00 +; SVE-NEXT: ret + ret <8 x i16> +} + +define <16 x i8> @movi_v16i8_1() { +; NEON-LABEL: movi_v16i8_1: +; NEON: // %bb.0: +; NEON-NEXT: adrp x8, .LCPI10_0 +; NEON-NEXT: ldr q0, [x8, :lo12:.LCPI10_0] +; NEON-NEXT: ret +; +; SVE-LABEL: movi_v16i8_1: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #127 // =0x7f +; SVE-NEXT: ret + ret <16 x i8> +} + +define <16 x i8> @movi_v16i8_2() { +; NEON-LABEL: movi_v16i8_2: +; NEON: // %bb.0: +; NEON-NEXT: adrp x8, .LCPI11_0 +; NEON-NEXT: ldr q0, [x8, :lo12:.LCPI11_0] +; NEON-NEXT: ret +; +; SVE-LABEL: movi_v16i8_2: +; SVE: // %bb.0: +; SVE-NEXT: mov z0.d, #32512 // =0x7f00 +; SVE-NEXT: ret + ret <16 x i8> +} + +; Negative cases + +define <2 x i64> @movi_128_v2i64() { +; COMMON-LABEL: movi_128_v2i64: +; COMMON: // %bb.0: +; COMMON-NEXT: mov w8, #128 // =0x80 +; COMMON-NEXT: dup v0.2d, x8 +; COMMON-NEXT: ret + ret <2 x i64> splat (i64 128) +} + +define <2 x i64> @movi_m127_v2i64() { +; COMMON-LABEL: movi_m127_v2i64: +; COMMON: // %bb.0: +; COMMON-NEXT: mov x8, #-129 // =0xffffffffffffff7f +; COMMON-NEXT: dup v0.2d, x8 +; COMMON-NEXT: ret + ret <2 x i64> splat (i64 -129) +} + +define <2 x i64> @movi_32513_v2i64() { +; COMMON-LABEL: movi_32513_v2i64: +; COMMON: // %bb.0: +; COMMON-NEXT: mov w8, #32513 // =0x7f01 +; COMMON-NEXT: dup v0.2d, x8 +; COMMON-NEXT: ret + ret <2 x i64> splat (i64 32513) +} + +define <2 x i64> 
@movi_m32769_v2i64() { +; COMMON-LABEL: movi_m32769_v2i64: +; COMMON: // %bb.0: +; COMMON-NEXT: mov x8, #-32769 // =0xffffffffffff7fff +; COMMON-NEXT: dup v0.2d, x8 +; COMMON-NEXT: ret + ret <2 x i64> splat (i64 -32769) +} + +define <2 x i64> @movi_257_v2i64() { +; COMMON-LABEL: movi_257_v2i64: +; COMMON: // %bb.0: +; COMMON-NEXT: mov w8, #257 // =0x101 +; COMMON-NEXT: dup v0.2d, x8 +; COMMON-NEXT: ret + ret <2 x i64> splat (i64 257) +} + +define <4 x i32> @movi_v4i32_3() { +; COMMON-LABEL: movi_v4i32_3: +; COMMON: // %bb.0: +; COMMON-NEXT: adrp x8, .LCPI17_0 +; COMMON-NEXT: ldr q0, [x8, :lo12:.LCPI17_0] +; COMMON-NEXT: ret + ret <4 x i32> +} + +define <16 x i8> @movi_v16i8_3() { +; COMMON-LABEL: movi_v16i8_3: +; COMMON: // %bb.0: +; COMMON-NEXT: adrp x8, .LCPI18_0 +; COMMON-NEXT: ldr q0, [x8, :lo12:.LCPI18_0] +; COMMON-NEXT: ret + ret <16 x i8> +} diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll index c38516fc57bbd..d60c870003e4d 100644 --- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll @@ -1255,3 +1255,151 @@ entry: %partial.reduce = tail call <2 x i64> @llvm.vector.partial.reduce.add(<2 x i64> %acc, <8 x i64> %input.wide) ret <2 x i64> %partial.reduce } + +define <4 x i32> @partial_reduce_shl_sext_const_rhs6(<16 x i8> %l, <4 x i32> %part) { +; CHECK-COMMON-LABEL: partial_reduce_shl_sext_const_rhs6: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-COMMON-NEXT: sshll2 v0.8h, v0.16b, #0 +; CHECK-COMMON-NEXT: sshll v3.4s, v0.4h, #6 +; CHECK-COMMON-NEXT: sshll2 v4.4s, v2.8h, #6 +; CHECK-COMMON-NEXT: sshll v2.4s, v2.4h, #6 +; CHECK-COMMON-NEXT: sshll2 v0.4s, v0.8h, #6 +; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-COMMON-NEXT: add v2.4s, v4.4s, v3.4s +; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-COMMON-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-COMMON-NEXT: ret + %ext = sext <16 x i8> %l to <16 x i32> + %shift = shl nsw <16 x i32> %ext, splat (i32 6) + %red = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %part, <16 x i32> %shift) + ret <4 x i32> %red +} + +define <4 x i32> @partial_reduce_shl_sext_const_rhs8(<16 x i8> %l, <4 x i32> %part) { +; CHECK-COMMON-LABEL: partial_reduce_shl_sext_const_rhs8: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-COMMON-NEXT: sshll2 v0.8h, v0.16b, #0 +; CHECK-COMMON-NEXT: sshll v3.4s, v0.4h, #8 +; CHECK-COMMON-NEXT: sshll2 v4.4s, v2.8h, #8 +; CHECK-COMMON-NEXT: sshll v2.4s, v2.4h, #8 +; CHECK-COMMON-NEXT: sshll2 v0.4s, v0.8h, #8 +; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-COMMON-NEXT: add v2.4s, v4.4s, v3.4s +; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-COMMON-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-COMMON-NEXT: ret + %ext = sext <16 x i8> %l to <16 x i32> + %shift = shl nsw <16 x i32> %ext, splat (i32 8) + %red = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %part, <16 x i32> %shift) + ret <4 x i32> %red +} + +define <4 x i32> @partial_reduce_shl_sext_const_rhs_9(<16 x i8> %l, <4 x i32> %part) { +; CHECK-COMMON-LABEL: partial_reduce_shl_sext_const_rhs_9: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: ret + %ext = sext <16 x i8> %l to <16 x i32> + %shift = shl nsw <16 x i32> %ext, splat (i32 32) + %red = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %part, <16 x i32> %shift) + ret <4 x i32> %red +} + +define 
<4 x i32> @partial_reduce_shl_sext_non_const_rhs(<16 x i8> %l, <4 x i32> %part) { +; CHECK-COMMON-LABEL: partial_reduce_shl_sext_non_const_rhs: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: sshll v2.8h, v0.8b, #0 +; CHECK-COMMON-NEXT: sshll2 v0.8h, v0.16b, #0 +; CHECK-COMMON-NEXT: sshll v3.4s, v2.4h, #0 +; CHECK-COMMON-NEXT: sshll2 v2.4s, v2.8h, #0 +; CHECK-COMMON-NEXT: sshll v4.4s, v0.4h, #0 +; CHECK-COMMON-NEXT: sshll2 v0.4s, v0.8h, #0 +; CHECK-COMMON-NEXT: ushl v4.4s, v4.4s, v4.4s +; CHECK-COMMON-NEXT: ushl v2.4s, v2.4s, v2.4s +; CHECK-COMMON-NEXT: ushl v3.4s, v3.4s, v3.4s +; CHECK-COMMON-NEXT: ushl v0.4s, v0.4s, v0.4s +; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-COMMON-NEXT: add v2.4s, v2.4s, v4.4s +; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-COMMON-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-COMMON-NEXT: ret + %ext = sext <16 x i8> %l to <16 x i32> + %shift = shl nsw <16 x i32> %ext, %ext + %red = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %part, <16 x i32> %shift) + ret <4 x i32> %red +} + +define <4 x i32> @partial_reduce_shl_zext_const_rhs6(<16 x i8> %l, <4 x i32> %part) { +; CHECK-COMMON-LABEL: partial_reduce_shl_zext_const_rhs6: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-COMMON-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-COMMON-NEXT: ushll v3.4s, v0.4h, #6 +; CHECK-COMMON-NEXT: ushll2 v4.4s, v2.8h, #6 +; CHECK-COMMON-NEXT: ushll v2.4s, v2.4h, #6 +; CHECK-COMMON-NEXT: ushll2 v0.4s, v0.8h, #6 +; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-COMMON-NEXT: add v2.4s, v4.4s, v3.4s +; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-COMMON-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-COMMON-NEXT: ret + %ext = zext <16 x i8> %l to <16 x i32> + %shift = shl nsw <16 x i32> %ext, splat (i32 6) + %red = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %part, <16 x i32> %shift) + ret <4 x i32> %red +} + +define <4 x i32> @partial_reduce_shl_zext_const_rhs8(<16 x i8> %l, <4 x i32> %part) { +; CHECK-COMMON-LABEL: partial_reduce_shl_zext_const_rhs8: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-COMMON-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-COMMON-NEXT: ushll v3.4s, v0.4h, #8 +; CHECK-COMMON-NEXT: ushll2 v4.4s, v2.8h, #8 +; CHECK-COMMON-NEXT: ushll v2.4s, v2.4h, #8 +; CHECK-COMMON-NEXT: ushll2 v0.4s, v0.8h, #8 +; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-COMMON-NEXT: add v2.4s, v4.4s, v3.4s +; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-COMMON-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-COMMON-NEXT: ret + %ext = zext <16 x i8> %l to <16 x i32> + %shift = shl nsw <16 x i32> %ext, splat (i32 8) + %red = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %part, <16 x i32> %shift) + ret <4 x i32> %red +} + +define <4 x i32> @partial_reduce_shl_zext_const_rhs_9(<16 x i8> %l, <4 x i32> %part) { +; CHECK-COMMON-LABEL: partial_reduce_shl_zext_const_rhs_9: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: ret + %ext = zext <16 x i8> %l to <16 x i32> + %shift = shl nsw <16 x i32> %ext, splat (i32 32) + %red = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %part, <16 x i32> %shift) + ret <4 x i32> %red +} + +define <4 x i32> @partial_reduce_shl_zext_non_const_rhs(<16 x i8> %l, <4 x i32> %part) { +; CHECK-COMMON-LABEL: partial_reduce_shl_zext_non_const_rhs: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: ushll v2.8h, v0.8b, #0 +; CHECK-COMMON-NEXT: ushll2 v0.8h, v0.16b, #0 +; CHECK-COMMON-NEXT: 
ushll v3.4s, v2.4h, #0 +; CHECK-COMMON-NEXT: ushll2 v2.4s, v2.8h, #0 +; CHECK-COMMON-NEXT: ushll v4.4s, v0.4h, #0 +; CHECK-COMMON-NEXT: ushll2 v0.4s, v0.8h, #0 +; CHECK-COMMON-NEXT: ushl v4.4s, v4.4s, v4.4s +; CHECK-COMMON-NEXT: ushl v2.4s, v2.4s, v2.4s +; CHECK-COMMON-NEXT: ushl v3.4s, v3.4s, v3.4s +; CHECK-COMMON-NEXT: ushl v0.4s, v0.4s, v0.4s +; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v3.4s +; CHECK-COMMON-NEXT: add v2.4s, v2.4s, v4.4s +; CHECK-COMMON-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-COMMON-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-COMMON-NEXT: ret + %ext = zext <16 x i8> %l to <16 x i32> + %shift = shl nsw <16 x i32> %ext, %ext + %red = tail call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %part, <16 x i32> %shift) + ret <4 x i32> %red +} diff --git a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll index c91de8f3a0a47..e3c623371448b 100644 --- a/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll +++ b/llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll @@ -8,224 +8,209 @@ define dso_local void @run_test() local_unnamed_addr uwtable { ; CHECK-LABEL: run_test: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #208 -; CHECK-NEXT: .cfi_def_cfa_offset 208 -; CHECK-NEXT: stp d15, d14, [sp, #96] // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #112] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #128] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #144] // 16-byte Folded Spill -; CHECK-NEXT: str x23, [sp, #160] // 8-byte Folded Spill -; CHECK-NEXT: stp x22, x21, [sp, #176] // 16-byte Folded Spill -; CHECK-NEXT: stp x20, x19, [sp, #192] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #192 +; CHECK-NEXT: .cfi_def_cfa_offset 192 +; CHECK-NEXT: stp d15, d14, [sp, #112] // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #128] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #144] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #160] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #176] // 16-byte Folded Spill ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w20, -16 -; CHECK-NEXT: .cfi_offset w21, -24 -; CHECK-NEXT: .cfi_offset w22, -32 -; CHECK-NEXT: .cfi_offset w23, -48 -; CHECK-NEXT: .cfi_offset b8, -56 -; CHECK-NEXT: .cfi_offset b9, -64 -; CHECK-NEXT: .cfi_offset b10, -72 -; CHECK-NEXT: .cfi_offset b11, -80 -; CHECK-NEXT: .cfi_offset b12, -88 -; CHECK-NEXT: .cfi_offset b13, -96 -; CHECK-NEXT: .cfi_offset b14, -104 -; CHECK-NEXT: .cfi_offset b15, -112 -; CHECK-NEXT: movi v2.2d, #0000000000000000 -; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: .cfi_offset b8, -24 +; CHECK-NEXT: .cfi_offset b9, -32 +; CHECK-NEXT: .cfi_offset b10, -40 +; CHECK-NEXT: .cfi_offset b11, -48 +; CHECK-NEXT: .cfi_offset b12, -56 +; CHECK-NEXT: .cfi_offset b13, -64 +; CHECK-NEXT: .cfi_offset b14, -72 +; CHECK-NEXT: .cfi_offset b15, -80 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: adrp x14, B+48 +; CHECK-NEXT: add x14, x14, :lo12:B+48 +; CHECK-NEXT: // implicit-def: $q18 ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: adrp x9, B+48 -; CHECK-NEXT: add x9, x9, :lo12:B+48 +; CHECK-NEXT: mov w9, #8 // =0x8 ; CHECK-NEXT: adrp x10, A ; CHECK-NEXT: add x10, x10, :lo12:A ; CHECK-NEXT: mov x11, xzr -; CHECK-NEXT: // kill: killed $q1 -; CHECK-NEXT: // implicit-def: $q1 +; CHECK-NEXT: // kill: killed $q18 +; CHECK-NEXT: // implicit-def: $q18 ; CHECK-NEXT: mov x12, xzr +; CHECK-NEXT: mov x13, x14 ; CHECK-NEXT: // implicit-def: $q0 +; 
CHECK-NEXT: // implicit-def: $q2 ; CHECK-NEXT: // implicit-def: $q3 ; CHECK-NEXT: // implicit-def: $q4 ; CHECK-NEXT: // implicit-def: $q5 -; CHECK-NEXT: // implicit-def: $q7 -; CHECK-NEXT: // implicit-def: $q10 -; CHECK-NEXT: // implicit-def: $q17 ; CHECK-NEXT: // implicit-def: $q6 -; CHECK-NEXT: // implicit-def: $q18 +; CHECK-NEXT: // implicit-def: $q16 +; CHECK-NEXT: // implicit-def: $q17 +; CHECK-NEXT: // implicit-def: $q7 ; CHECK-NEXT: // implicit-def: $q19 ; CHECK-NEXT: // implicit-def: $q20 ; CHECK-NEXT: // implicit-def: $q21 ; CHECK-NEXT: // implicit-def: $q22 -; CHECK-NEXT: // implicit-def: $q23 ; CHECK-NEXT: // implicit-def: $q24 -; CHECK-NEXT: // implicit-def: $q9 +; CHECK-NEXT: // implicit-def: $q23 +; CHECK-NEXT: // implicit-def: $q25 +; CHECK-NEXT: // implicit-def: $q26 ; CHECK-NEXT: // implicit-def: $q27 -; CHECK-NEXT: // implicit-def: $q12 -; CHECK-NEXT: // implicit-def: $q28 -; CHECK-NEXT: // implicit-def: $q14 -; CHECK-NEXT: // implicit-def: $q15 -; CHECK-NEXT: // implicit-def: $q29 ; CHECK-NEXT: // implicit-def: $q30 +; CHECK-NEXT: // implicit-def: $q8 ; CHECK-NEXT: // implicit-def: $q11 -; CHECK-NEXT: // implicit-def: $q31 +; CHECK-NEXT: // implicit-def: $q12 +; CHECK-NEXT: // implicit-def: $q29 ; CHECK-NEXT: // implicit-def: $q13 -; CHECK-NEXT: // kill: killed $q1 -; CHECK-NEXT: // implicit-def: $q1 -; CHECK-NEXT: // kill: killed $q1 +; CHECK-NEXT: // implicit-def: $q10 +; CHECK-NEXT: // implicit-def: $q15 +; CHECK-NEXT: // kill: killed $q18 +; CHECK-NEXT: // implicit-def: $q18 +; CHECK-NEXT: // kill: killed $q18 ; CHECK-NEXT: .LBB0_1: // %for.cond1.preheader ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: stp q29, q15, [sp] // 32-byte Folded Spill -; CHECK-NEXT: ldr q15, [x8] +; CHECK-NEXT: ldr x17, [x8] ; CHECK-NEXT: ldr x15, [x8] -; CHECK-NEXT: str q14, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: add x20, x10, x11 -; CHECK-NEXT: mov v8.16b, v28.16b -; CHECK-NEXT: fmov x2, d15 -; CHECK-NEXT: mov x17, v15.d[1] -; CHECK-NEXT: ldr q14, [x8] +; CHECK-NEXT: mov v18.16b, v0.16b +; CHECK-NEXT: ldr x16, [x9] +; CHECK-NEXT: stp q15, q4, [sp] // 32-byte Folded Spill +; CHECK-NEXT: add x5, x10, x11 +; CHECK-NEXT: mul x1, x15, x17 +; CHECK-NEXT: ldr x2, [x13], #64 +; CHECK-NEXT: ldr x5, [x5, #128] +; CHECK-NEXT: stp q7, q23, [sp, #32] // 32-byte Folded Spill +; CHECK-NEXT: ldr x14, [x14, #8] +; CHECK-NEXT: mul x0, x17, x17 +; CHECK-NEXT: ldr q23, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: mov v9.16b, v30.16b +; CHECK-NEXT: mov v30.16b, v25.16b +; CHECK-NEXT: mov v25.16b, v20.16b +; CHECK-NEXT: mov v20.16b, v6.16b +; CHECK-NEXT: mul x18, x16, x17 +; CHECK-NEXT: mov v6.16b, v1.16b ; CHECK-NEXT: mov v28.16b, v24.16b -; CHECK-NEXT: mov v24.16b, v20.16b -; CHECK-NEXT: mov v20.16b, v17.16b -; CHECK-NEXT: fmov x13, d14 -; CHECK-NEXT: mov x16, v14.d[1] -; CHECK-NEXT: mov v17.16b, v5.16b -; CHECK-NEXT: mul x3, x2, x15 -; CHECK-NEXT: ldr q14, [x9], #64 -; CHECK-NEXT: ldr q5, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x6, [x8] -; CHECK-NEXT: ldr x20, [x20, #128] -; CHECK-NEXT: mul x1, x17, x15 -; CHECK-NEXT: mov x14, v14.d[1] -; CHECK-NEXT: fmov x5, d14 -; CHECK-NEXT: mov v29.16b, v21.16b -; CHECK-NEXT: mov v21.16b, v0.16b -; CHECK-NEXT: mov v25.16b, v6.16b -; CHECK-NEXT: mul x18, x13, x15 -; CHECK-NEXT: mov v6.16b, v2.16b -; CHECK-NEXT: mov v26.16b, v22.16b -; CHECK-NEXT: fmov d15, x3 -; CHECK-NEXT: mov v22.16b, v18.16b -; CHECK-NEXT: mov v18.16b, v7.16b -; CHECK-NEXT: mul x0, x16, x15 -; CHECK-NEXT: mov v7.16b, v3.16b -; CHECK-NEXT: mov v16.16b, 
v4.16b +; CHECK-NEXT: fmov d14, x1 +; CHECK-NEXT: mov v24.16b, v19.16b +; CHECK-NEXT: mov v19.16b, v5.16b +; CHECK-NEXT: mul x4, x2, x17 +; CHECK-NEXT: mov v31.16b, v26.16b +; CHECK-NEXT: mov v26.16b, v21.16b +; CHECK-NEXT: fmov d15, x0 +; CHECK-NEXT: mov v21.16b, v16.16b +; CHECK-NEXT: mov v16.16b, v2.16b +; CHECK-NEXT: mov v0.16b, v14.16b +; CHECK-NEXT: mul x20, x2, x5 +; CHECK-NEXT: mov v7.16b, v10.16b +; CHECK-NEXT: mov v10.16b, v17.16b +; CHECK-NEXT: mov v17.16b, v3.16b ; CHECK-NEXT: add x11, x11, #8 -; CHECK-NEXT: add x12, x12, #1 -; CHECK-NEXT: mov v15.d[1], x1 -; CHECK-NEXT: mul x4, x14, x15 +; CHECK-NEXT: mov v15.d[1], x18 +; CHECK-NEXT: mul x3, x14, x17 ; CHECK-NEXT: cmp x11, #64 -; CHECK-NEXT: fmov d14, x18 -; CHECK-NEXT: mul x15, x5, x15 -; CHECK-NEXT: add v5.2d, v5.2d, v15.2d -; CHECK-NEXT: mul x21, x2, x6 -; CHECK-NEXT: mov v14.d[1], x0 -; CHECK-NEXT: mul x2, x2, x20 -; CHECK-NEXT: fmov d0, x15 -; CHECK-NEXT: str q5, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: ldr q5, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: mul x22, x13, x20 -; CHECK-NEXT: add v5.2d, v5.2d, v14.2d -; CHECK-NEXT: fmov d3, x21 -; CHECK-NEXT: mul x19, x17, x6 -; CHECK-NEXT: mov v0.d[1], x4 -; CHECK-NEXT: fmov d1, x2 -; CHECK-NEXT: mul x17, x17, x20 -; CHECK-NEXT: str q5, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: add v5.2d, v13.2d, v14.2d -; CHECK-NEXT: fmov d2, x22 -; CHECK-NEXT: ldr q13, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: mul x7, x16, x6 -; CHECK-NEXT: ldp q15, q14, [sp, #16] // 32-byte Folded Reload -; CHECK-NEXT: mov v3.d[1], x19 -; CHECK-NEXT: add v13.2d, v13.2d, v0.2d -; CHECK-NEXT: mul x16, x16, x20 -; CHECK-NEXT: mov v1.d[1], x17 -; CHECK-NEXT: mul x23, x5, x20 -; CHECK-NEXT: str q13, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: mov v13.16b, v5.16b -; CHECK-NEXT: mov v5.16b, v17.16b -; CHECK-NEXT: mov v17.16b, v20.16b -; CHECK-NEXT: mov v20.16b, v24.16b -; CHECK-NEXT: mul x13, x13, x6 -; CHECK-NEXT: mov v24.16b, v28.16b -; CHECK-NEXT: add v11.2d, v11.2d, v3.2d -; CHECK-NEXT: mov v2.d[1], x16 +; CHECK-NEXT: mov v0.d[1], x1 +; CHECK-NEXT: fmov d1, x4 +; CHECK-NEXT: add x12, x12, #1 +; CHECK-NEXT: mul x17, x17, x5 +; CHECK-NEXT: fmov d5, x20 +; CHECK-NEXT: mul x6, x15, x15 +; CHECK-NEXT: add v23.2d, v23.2d, v0.2d +; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: mov v1.d[1], x3 +; CHECK-NEXT: mul x7, x15, x5 +; CHECK-NEXT: add v0.2d, v0.2d, v15.2d +; CHECK-NEXT: fmov d2, x17 +; CHECK-NEXT: mul x0, x14, x5 +; CHECK-NEXT: fmov d4, x6 +; CHECK-NEXT: mul x19, x16, x5 +; CHECK-NEXT: stp q0, q23, [sp, #64] // 32-byte Folded Spill +; CHECK-NEXT: ldr q0, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: fmov d3, x7 +; CHECK-NEXT: ldr q23, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: mul x17, x2, x15 +; CHECK-NEXT: add v0.2d, v0.2d, v15.2d +; CHECK-NEXT: ldr q15, [sp] // 16-byte Folded Reload +; CHECK-NEXT: mov v5.d[1], x0 +; CHECK-NEXT: mov v4.d[1], x6 +; CHECK-NEXT: mul x16, x16, x15 +; CHECK-NEXT: mov v3.d[1], x7 ; CHECK-NEXT: add v15.2d, v15.2d, v1.2d -; CHECK-NEXT: add v27.2d, v27.2d, v3.2d -; CHECK-NEXT: mul x18, x14, x20 -; CHECK-NEXT: add v23.2d, v23.2d, v3.2d -; CHECK-NEXT: add v19.2d, v19.2d, v3.2d -; CHECK-NEXT: fmov d4, x23 -; CHECK-NEXT: add v10.2d, v10.2d, v3.2d -; CHECK-NEXT: mul x15, x5, x6 -; CHECK-NEXT: fmov d0, x13 -; CHECK-NEXT: add v14.2d, v14.2d, v2.2d -; CHECK-NEXT: add v2.2d, v6.2d, v3.2d -; CHECK-NEXT: mul x14, x14, x6 -; CHECK-NEXT: mov v3.16b, v7.16b -; CHECK-NEXT: mov v7.16b, v18.16b -; CHECK-NEXT: mov v4.d[1], x18 -; 
CHECK-NEXT: mov v18.16b, v22.16b -; CHECK-NEXT: mov v0.d[1], x7 -; CHECK-NEXT: fmov d1, x15 -; CHECK-NEXT: add v28.2d, v8.2d, v4.2d -; CHECK-NEXT: mov v1.d[1], x14 -; CHECK-NEXT: add v31.2d, v31.2d, v0.2d -; CHECK-NEXT: add v30.2d, v30.2d, v0.2d +; CHECK-NEXT: mov v2.d[1], x19 +; CHECK-NEXT: str q0, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: mov v1.16b, v6.16b +; CHECK-NEXT: mul x14, x14, x15 +; CHECK-NEXT: mov v6.16b, v20.16b +; CHECK-NEXT: mov v20.16b, v25.16b +; CHECK-NEXT: fmov d0, x17 +; CHECK-NEXT: mov v25.16b, v30.16b +; CHECK-NEXT: add v30.2d, v9.2d, v5.2d +; CHECK-NEXT: mov v5.16b, v19.16b +; CHECK-NEXT: mov v19.16b, v24.16b +; CHECK-NEXT: add v11.2d, v11.2d, v3.2d +; CHECK-NEXT: mov v14.d[1], x16 +; CHECK-NEXT: mov v3.16b, v17.16b +; CHECK-NEXT: mov v17.16b, v10.16b +; CHECK-NEXT: mov v10.16b, v7.16b +; CHECK-NEXT: add v8.2d, v8.2d, v2.2d +; CHECK-NEXT: mov v2.16b, v16.16b +; CHECK-NEXT: mov v0.d[1], x14 +; CHECK-NEXT: mov v16.16b, v21.16b +; CHECK-NEXT: mov v21.16b, v26.16b +; CHECK-NEXT: add v13.2d, v13.2d, v4.2d +; CHECK-NEXT: add v26.2d, v31.2d, v4.2d +; CHECK-NEXT: add v24.2d, v28.2d, v4.2d +; CHECK-NEXT: add v19.2d, v19.2d, v4.2d +; CHECK-NEXT: add v6.2d, v6.2d, v4.2d +; CHECK-NEXT: add v1.2d, v1.2d, v4.2d +; CHECK-NEXT: ldp q4, q7, [sp, #16] // 32-byte Folded Reload +; CHECK-NEXT: add v10.2d, v10.2d, v14.2d +; CHECK-NEXT: add v29.2d, v29.2d, v14.2d +; CHECK-NEXT: add v27.2d, v27.2d, v14.2d +; CHECK-NEXT: add v23.2d, v23.2d, v14.2d +; CHECK-NEXT: add v22.2d, v22.2d, v14.2d +; CHECK-NEXT: add v20.2d, v20.2d, v14.2d +; CHECK-NEXT: add v16.2d, v16.2d, v14.2d +; CHECK-NEXT: add v7.2d, v7.2d, v14.2d +; CHECK-NEXT: add v5.2d, v5.2d, v14.2d +; CHECK-NEXT: add v3.2d, v3.2d, v14.2d +; CHECK-NEXT: add v2.2d, v2.2d, v14.2d ; CHECK-NEXT: add v12.2d, v12.2d, v0.2d -; CHECK-NEXT: add v24.2d, v24.2d, v0.2d -; CHECK-NEXT: add v22.2d, v26.2d, v0.2d -; CHECK-NEXT: add v20.2d, v20.2d, v0.2d -; CHECK-NEXT: add v18.2d, v18.2d, v0.2d +; CHECK-NEXT: add v25.2d, v25.2d, v0.2d +; CHECK-NEXT: add v21.2d, v21.2d, v0.2d ; CHECK-NEXT: add v17.2d, v17.2d, v0.2d -; CHECK-NEXT: add v7.2d, v7.2d, v0.2d -; CHECK-NEXT: add v4.2d, v16.2d, v0.2d -; CHECK-NEXT: add v3.2d, v3.2d, v0.2d -; CHECK-NEXT: mov v0.16b, v21.16b -; CHECK-NEXT: mov v21.16b, v29.16b -; CHECK-NEXT: ldr q29, [sp] // 16-byte Folded Reload -; CHECK-NEXT: add v9.2d, v9.2d, v1.2d -; CHECK-NEXT: add v6.2d, v25.2d, v1.2d -; CHECK-NEXT: add v5.2d, v5.2d, v1.2d -; CHECK-NEXT: add v29.2d, v29.2d, v1.2d -; CHECK-NEXT: add v21.2d, v21.2d, v1.2d -; CHECK-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-NEXT: add v4.2d, v4.2d, v0.2d +; CHECK-NEXT: add v0.2d, v18.2d, v0.2d +; CHECK-NEXT: mov x14, x13 ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup -; CHECK-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp q28, q18, [sp, #64] // 32-byte Folded Reload ; CHECK-NEXT: adrp x8, C ; CHECK-NEXT: add x8, x8, :lo12:C -; CHECK-NEXT: stp q11, q30, [x8, #80] -; CHECK-NEXT: ldp x20, x19, [sp, #192] // 16-byte Folded Reload -; CHECK-NEXT: str q1, [x8] -; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x23, [sp, #160] // 8-byte Folded Reload -; CHECK-NEXT: stp q15, q14, [x8, #144] -; CHECK-NEXT: ldp x22, x21, [sp, #176] // 16-byte Folded Reload -; CHECK-NEXT: stp q1, q13, [x8, #16] -; CHECK-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: stp q28, q12, [x8, #176] -; CHECK-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload -; CHECK-NEXT: stp q1, q31, [x8, #48] -; CHECK-NEXT: ldp d15, d14, 
[sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: stp q9, q24, [x8, #240] -; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload -; CHECK-NEXT: stp q19, q18, [x8, #336] -; CHECK-NEXT: stp q10, q7, [x8, #400] -; CHECK-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload -; CHECK-NEXT: str q29, [x8, #112] -; CHECK-NEXT: str q27, [x8, #208] -; CHECK-NEXT: stp q23, q22, [x8, #272] +; CHECK-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload +; CHECK-NEXT: stp q10, q13, [x8, #64] +; CHECK-NEXT: stp q28, q18, [x8] +; CHECK-NEXT: ldr q18, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: stp q29, q12, [x8, #96] +; CHECK-NEXT: ldp d13, d12, [sp, #128] // 16-byte Folded Reload +; CHECK-NEXT: stp q18, q15, [x8, #32] +; CHECK-NEXT: ldp d15, d14, [sp, #112] // 16-byte Folded Reload +; CHECK-NEXT: stp q11, q8, [x8, #144] +; CHECK-NEXT: ldp d9, d8, [sp, #160] // 16-byte Folded Reload +; CHECK-NEXT: stp q30, q27, [x8, #176] +; CHECK-NEXT: ldp d11, d10, [sp, #144] // 16-byte Folded Reload +; CHECK-NEXT: str q26, [x8, #208] +; CHECK-NEXT: stp q25, q23, [x8, #240] +; CHECK-NEXT: stp q24, q22, [x8, #272] ; CHECK-NEXT: stp q21, q20, [x8, #304] -; CHECK-NEXT: stp q6, q17, [x8, #368] -; CHECK-NEXT: stp q5, q4, [x8, #432] -; CHECK-NEXT: stp q2, q3, [x8, #464] +; CHECK-NEXT: stp q19, q7, [x8, #336] +; CHECK-NEXT: stp q17, q16, [x8, #368] +; CHECK-NEXT: stp q6, q5, [x8, #400] +; CHECK-NEXT: stp q4, q3, [x8, #432] +; CHECK-NEXT: stp q1, q2, [x8, #464] ; CHECK-NEXT: str q0, [x8, #496] -; CHECK-NEXT: add sp, sp, #208 +; CHECK-NEXT: add sp, sp, #192 ; CHECK-NEXT: .cfi_def_cfa_offset 0 ; CHECK-NEXT: .cfi_restore w19 ; CHECK-NEXT: .cfi_restore w20 -; CHECK-NEXT: .cfi_restore w21 -; CHECK-NEXT: .cfi_restore w22 -; CHECK-NEXT: .cfi_restore w23 ; CHECK-NEXT: .cfi_restore b8 ; CHECK-NEXT: .cfi_restore b9 ; CHECK-NEXT: .cfi_restore b10 diff --git a/llvm/test/CodeGen/AArch64/scalarize-vector-load.ll b/llvm/test/CodeGen/AArch64/scalarize-vector-load.ll index eb3a0391eb79e..0ed29b48cf2f8 100644 --- a/llvm/test/CodeGen/AArch64/scalarize-vector-load.ll +++ b/llvm/test/CodeGen/AArch64/scalarize-vector-load.ll @@ -4,36 +4,35 @@ define i8 @scalarize_v16i8(ptr %p) { ; CHECK-LABEL: scalarize_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: umov w8, v0.b[0] -; CHECK-NEXT: umov w9, v0.b[1] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: umov w15, v0.b[7] -; CHECK-NEXT: umov w16, v0.b[8] -; CHECK-NEXT: umov w17, v0.b[9] -; CHECK-NEXT: umov w18, v0.b[10] -; CHECK-NEXT: umov w0, v0.b[11] -; CHECK-NEXT: umov w1, v0.b[12] -; CHECK-NEXT: umov w2, v0.b[13] -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: umov w3, v0.b[14] -; CHECK-NEXT: umov w4, v0.b[15] -; CHECK-NEXT: add w9, w10, w11 -; CHECK-NEXT: add w10, w12, w13 -; CHECK-NEXT: add w11, w14, w15 +; CHECK-NEXT: ldrb w8, [x0, #3] +; CHECK-NEXT: ldrb w9, [x0, #2] +; CHECK-NEXT: ldrb w10, [x0, #1] +; CHECK-NEXT: ldrb w11, [x0] +; CHECK-NEXT: ldrb w13, [x0, #5] +; CHECK-NEXT: ldrb w14, [x0, #4] +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: ldrb w12, [x0, #15] +; CHECK-NEXT: ldrb w15, [x0, #11] +; CHECK-NEXT: add w10, w11, w10 +; CHECK-NEXT: add w9, w14, w13 +; CHECK-NEXT: ldrb w11, [x0, #10] +; CHECK-NEXT: ldrb w13, [x0, #9] +; CHECK-NEXT: add w8, w10, w8 +; CHECK-NEXT: ldrb w14, [x0, #8] +; CHECK-NEXT: ldrb w16, [x0, #7] +; CHECK-NEXT: add w11, w11, w15 +; CHECK-NEXT: ldrb w17, [x0, #6] +; CHECK-NEXT: ldrb w18, [x0, #14] +; 
CHECK-NEXT: add w13, w14, w13 +; CHECK-NEXT: ldrb w1, [x0, #13] +; CHECK-NEXT: ldrb w0, [x0, #12] +; CHECK-NEXT: add w16, w17, w16 +; CHECK-NEXT: add w10, w13, w11 +; CHECK-NEXT: add w12, w18, w12 +; CHECK-NEXT: add w9, w9, w16 +; CHECK-NEXT: add w14, w0, w1 ; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: add w12, w16, w17 -; CHECK-NEXT: add w13, w18, w0 -; CHECK-NEXT: add w9, w10, w11 -; CHECK-NEXT: add w14, w1, w2 -; CHECK-NEXT: add w10, w12, w13 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: add w15, w3, w4 -; CHECK-NEXT: add w11, w14, w15 +; CHECK-NEXT: add w11, w14, w12 ; CHECK-NEXT: add w9, w10, w11 ; CHECK-NEXT: add w0, w8, w9 ; CHECK-NEXT: ret @@ -75,22 +74,21 @@ define i8 @scalarize_v16i8(ptr %p) { define i8 @scalarize_v8i8(ptr %p) { ; CHECK-LABEL: scalarize_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: umov w8, v0.b[0] -; CHECK-NEXT: umov w9, v0.b[1] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: umov w15, v0.b[7] -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: add w9, w10, w11 -; CHECK-NEXT: add w10, w12, w13 -; CHECK-NEXT: add w11, w14, w15 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: add w9, w10, w11 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ldrb w8, [x0, #7] +; CHECK-NEXT: ldrb w9, [x0, #6] +; CHECK-NEXT: ldrb w10, [x0, #5] +; CHECK-NEXT: ldrb w11, [x0, #1] +; CHECK-NEXT: ldrb w12, [x0] +; CHECK-NEXT: ldrb w13, [x0, #4] +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: ldrb w14, [x0, #3] +; CHECK-NEXT: ldrb w15, [x0, #2] +; CHECK-NEXT: add w11, w12, w11 +; CHECK-NEXT: add w10, w13, w10 +; CHECK-NEXT: add w12, w15, w14 +; CHECK-NEXT: add w8, w10, w8 +; CHECK-NEXT: add w9, w11, w12 +; CHECK-NEXT: add w0, w9, w8 ; CHECK-NEXT: ret %wide.load = load <8 x i8>, ptr %p, align 4 %l0 = extractelement <8 x i8> %wide.load, i32 0 @@ -114,22 +112,21 @@ define i8 @scalarize_v8i8(ptr %p) { define i16 @scalarize_v8i16(ptr %p) { ; CHECK-LABEL: scalarize_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: umov w12, v0.h[4] -; CHECK-NEXT: umov w13, v0.h[5] -; CHECK-NEXT: umov w14, v0.h[6] -; CHECK-NEXT: umov w15, v0.h[7] -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: add w9, w10, w11 -; CHECK-NEXT: add w10, w12, w13 -; CHECK-NEXT: add w11, w14, w15 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: add w9, w10, w11 -; CHECK-NEXT: add w0, w8, w9 +; CHECK-NEXT: ldrh w8, [x0, #14] +; CHECK-NEXT: ldrh w9, [x0, #12] +; CHECK-NEXT: ldrh w10, [x0, #10] +; CHECK-NEXT: ldrh w11, [x0, #2] +; CHECK-NEXT: ldrh w12, [x0] +; CHECK-NEXT: ldrh w13, [x0, #8] +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: ldrh w14, [x0, #6] +; CHECK-NEXT: ldrh w15, [x0, #4] +; CHECK-NEXT: add w11, w12, w11 +; CHECK-NEXT: add w10, w13, w10 +; CHECK-NEXT: add w12, w15, w14 +; CHECK-NEXT: add w8, w10, w8 +; CHECK-NEXT: add w9, w11, w12 +; CHECK-NEXT: add w0, w9, w8 ; CHECK-NEXT: ret %wide.load = load <8 x i16>, ptr %p, align 4 %l0 = extractelement <8 x i16> %wide.load, i32 0 @@ -153,14 +150,13 @@ define i16 @scalarize_v8i16(ptr %p) { define i16 @scalarize_v4i16(ptr %p) { ; CHECK-LABEL: scalarize_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: umov w8, v0.h[0] -; CHECK-NEXT: umov w9, v0.h[1] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: add w9, w10, w11 -; CHECK-NEXT: add 
w0, w8, w9 +; CHECK-NEXT: ldrh w8, [x0, #6] +; CHECK-NEXT: ldrh w9, [x0, #4] +; CHECK-NEXT: ldrh w10, [x0, #2] +; CHECK-NEXT: ldrh w11, [x0] +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: add w10, w11, w10 +; CHECK-NEXT: add w0, w10, w8 ; CHECK-NEXT: ret %wide.load = load <4 x i16>, ptr %p, align 4 %l0 = extractelement <4 x i16> %wide.load, i32 0 @@ -176,13 +172,10 @@ define i16 @scalarize_v4i16(ptr %p) { define i32 @scalarize_v4i32(ptr %p) { ; CHECK-LABEL: scalarize_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: mov w9, v0.s[2] -; CHECK-NEXT: mov w10, v0.s[3] -; CHECK-NEXT: fmov w11, s0 -; CHECK-NEXT: add w8, w11, w8 -; CHECK-NEXT: add w9, w9, w10 +; CHECK-NEXT: ldp w9, w8, [x0] +; CHECK-NEXT: ldp w10, w11, [x0, #8] +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: add w9, w10, w11 ; CHECK-NEXT: add w0, w8, w9 ; CHECK-NEXT: ret %wide.load = load <4 x i32>, ptr %p, align 4 @@ -199,11 +192,10 @@ define i32 @scalarize_v4i32(ptr %p) { define i64 @scalarize_v4i64(ptr %p) { ; CHECK-LABEL: scalarize_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: addp d1, v1.2d -; CHECK-NEXT: addp d0, v0.2d -; CHECK-NEXT: fmov x8, d1 -; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: ldp x8, x9, [x0] +; CHECK-NEXT: ldp x10, x11, [x0, #16] +; CHECK-NEXT: add x8, x8, x9 +; CHECK-NEXT: add x9, x10, x11 ; CHECK-NEXT: add x0, x8, x9 ; CHECK-NEXT: ret %wide.load = load <4 x i64>, ptr %p, align 4 @@ -220,14 +212,11 @@ define i64 @scalarize_v4i64(ptr %p) { define i64 @scalarize_v4i32_sext(ptr %p) { ; CHECK-LABEL: scalarize_v4i32_sext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: sshll2 v1.2d, v0.4s, #0 -; CHECK-NEXT: sshll v0.2d, v0.2s, #0 -; CHECK-NEXT: addp d0, v0.2d -; CHECK-NEXT: addp d1, v1.2d -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: add x0, x8, x9 +; CHECK-NEXT: ldpsw x9, x8, [x0, #8] +; CHECK-NEXT: ldpsw x11, x10, [x0] +; CHECK-NEXT: add x8, x9, x8 +; CHECK-NEXT: add x10, x11, x10 +; CHECK-NEXT: add x0, x10, x8 ; CHECK-NEXT: ret %wide.load = load <4 x i32>, ptr %p, align 4 %ext = sext <4 x i32> %wide.load to <4 x i64> @@ -244,14 +233,11 @@ define i64 @scalarize_v4i32_sext(ptr %p) { define i64 @scalarize_v4i32_zext(ptr %p) { ; CHECK-LABEL: scalarize_v4i32_zext: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ushll2 v1.2d, v0.4s, #0 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: addp d0, v0.2d -; CHECK-NEXT: addp d1, v1.2d -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: add x0, x8, x9 +; CHECK-NEXT: ldp w9, w8, [x0, #8] +; CHECK-NEXT: ldp w11, w10, [x0] +; CHECK-NEXT: add x8, x9, x8 +; CHECK-NEXT: add x10, x11, x10 +; CHECK-NEXT: add x0, x10, x8 ; CHECK-NEXT: ret %wide.load = load <4 x i32>, ptr %p, align 4 %ext = zext <4 x i32> %wide.load to <4 x i64> @@ -340,55 +326,43 @@ define double @scalarize_v4f64(ptr %p) { define float @scalarize_into_load(i64 %22, ptr %23, ptr %rawA, ptr %rawB) { ; CHECK-LABEL: scalarize_into_load: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q1, q0, [x1] -; CHECK-NEXT: ldp q3, q2, [x1, #96] -; CHECK-NEXT: ldp q5, q4, [x1, #64] -; CHECK-NEXT: ldp q7, q6, [x1, #32] -; CHECK-NEXT: mov x8, v1.d[1] -; CHECK-NEXT: mov x10, v0.d[1] -; CHECK-NEXT: mov x1, v3.d[1] -; CHECK-NEXT: mov x4, v2.d[1] -; CHECK-NEXT: mov x16, v5.d[1] -; CHECK-NEXT: mov x18, v4.d[1] -; CHECK-NEXT: fmov x9, d1 -; CHECK-NEXT: mov x12, v7.d[1] -; CHECK-NEXT: mov x14, v6.d[1] -; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: fmov x13, d7 -; CHECK-NEXT: fmov x15, d6 -; CHECK-NEXT: fmov 
x17, d5 -; CHECK-NEXT: fmov x0, d4 -; CHECK-NEXT: fmov x3, d3 -; CHECK-NEXT: fmov x5, d2 -; CHECK-NEXT: ldr s0, [x2, x9, lsl #2] -; CHECK-NEXT: ldr s1, [x2, x8, lsl #2] -; CHECK-NEXT: ldr s2, [x2, x11, lsl #2] -; CHECK-NEXT: ldr s3, [x2, x10, lsl #2] -; CHECK-NEXT: ldr s4, [x2, x13, lsl #2] -; CHECK-NEXT: ldr s5, [x2, x12, lsl #2] -; CHECK-NEXT: ldr s6, [x2, x15, lsl #2] -; CHECK-NEXT: ldr s7, [x2, x14, lsl #2] -; CHECK-NEXT: ldr s16, [x2, x17, lsl #2] -; CHECK-NEXT: ldr s17, [x2, x16, lsl #2] -; CHECK-NEXT: ldr s18, [x2, x0, lsl #2] -; CHECK-NEXT: ldr s19, [x2, x18, lsl #2] -; CHECK-NEXT: ldr s20, [x2, x3, lsl #2] -; CHECK-NEXT: ldr s21, [x2, x1, lsl #2] -; CHECK-NEXT: ldr s22, [x2, x5, lsl #2] -; CHECK-NEXT: ldr s23, [x2, x4, lsl #2] +; CHECK-NEXT: ldp x8, x9, [x1] +; CHECK-NEXT: ldp x10, x11, [x1, #16] +; CHECK-NEXT: ldp x12, x13, [x1, #64] +; CHECK-NEXT: ldr s0, [x2, x8, lsl #2] +; CHECK-NEXT: ldr s1, [x2, x9, lsl #2] +; CHECK-NEXT: ldp x8, x9, [x1, #32] +; CHECK-NEXT: ldr s2, [x2, x10, lsl #2] +; CHECK-NEXT: ldr s3, [x2, x11, lsl #2] ; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ldr s6, [x2, x12, lsl #2] +; CHECK-NEXT: ldp x10, x11, [x1, #48] +; CHECK-NEXT: ldr s7, [x2, x13, lsl #2] ; CHECK-NEXT: fadd s1, s2, s3 -; CHECK-NEXT: fadd s2, s4, s5 -; CHECK-NEXT: fadd s3, s6, s7 -; CHECK-NEXT: fadd s4, s16, s17 -; CHECK-NEXT: fadd s5, s18, s19 -; CHECK-NEXT: fadd s6, s20, s21 -; CHECK-NEXT: fadd s7, s22, s23 +; CHECK-NEXT: ldr s2, [x2, x8, lsl #2] +; CHECK-NEXT: ldr s3, [x2, x9, lsl #2] +; CHECK-NEXT: ldp x14, x15, [x1, #80] +; CHECK-NEXT: fadd s2, s2, s3 +; CHECK-NEXT: ldr s4, [x2, x10, lsl #2] +; CHECK-NEXT: ldr s5, [x2, x11, lsl #2] +; CHECK-NEXT: ldp x16, x17, [x1, #96] +; CHECK-NEXT: fadd s3, s4, s5 +; CHECK-NEXT: fadd s4, s6, s7 ; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ldp x18, x0, [x1, #112] +; CHECK-NEXT: ldr s16, [x2, x14, lsl #2] +; CHECK-NEXT: ldr s17, [x2, x15, lsl #2] +; CHECK-NEXT: ldr s18, [x2, x16, lsl #2] +; CHECK-NEXT: ldr s19, [x2, x17, lsl #2] +; CHECK-NEXT: ldr s20, [x2, x18, lsl #2] +; CHECK-NEXT: ldr s21, [x2, x0, lsl #2] +; CHECK-NEXT: fadd s5, s16, s17 +; CHECK-NEXT: fadd s6, s18, s19 ; CHECK-NEXT: fadd s1, s2, s3 +; CHECK-NEXT: fadd s7, s20, s21 ; CHECK-NEXT: fadd s2, s4, s5 -; CHECK-NEXT: fadd s3, s6, s7 ; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: fadd s3, s6, s7 ; CHECK-NEXT: fadd s1, s2, s3 ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret @@ -463,57 +437,39 @@ entry: define float @scalarize_into_load_sext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) { ; CHECK-LABEL: scalarize_into_load_sext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q0, q2, [x1] -; CHECK-NEXT: ldp q4, q1, [x1, #32] -; CHECK-NEXT: sshll v3.2d, v0.2s, #0 -; CHECK-NEXT: sshll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: sshll2 v6.2d, v2.4s, #0 -; CHECK-NEXT: sshll2 v5.2d, v1.4s, #0 -; CHECK-NEXT: sshll v1.2d, v1.2s, #0 -; CHECK-NEXT: sshll v2.2d, v2.2s, #0 -; CHECK-NEXT: sshll2 v7.2d, v4.4s, #0 -; CHECK-NEXT: sshll v4.2d, v4.2s, #0 -; CHECK-NEXT: mov x8, v3.d[1] -; CHECK-NEXT: mov x10, v0.d[1] -; CHECK-NEXT: mov x14, v6.d[1] -; CHECK-NEXT: mov x12, v2.d[1] -; CHECK-NEXT: mov x1, v1.d[1] -; CHECK-NEXT: mov x4, v5.d[1] -; CHECK-NEXT: mov x16, v4.d[1] -; CHECK-NEXT: mov x18, v7.d[1] -; CHECK-NEXT: fmov x9, d3 -; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: fmov x13, d2 -; CHECK-NEXT: fmov x15, d6 -; CHECK-NEXT: fmov x17, d4 -; CHECK-NEXT: fmov x0, d7 -; CHECK-NEXT: ldr s2, [x2, x8, lsl #2] -; CHECK-NEXT: fmov x3, d1 -; CHECK-NEXT: fmov x5, d5 +; CHECK-NEXT: ldpsw x9, x8, [x1] +; CHECK-NEXT: ldpsw x11, x10, [x1, 
#8] +; CHECK-NEXT: ldpsw x13, x12, [x1, #24] ; CHECK-NEXT: ldr s0, [x2, x9, lsl #2] -; CHECK-NEXT: ldr s1, [x2, x11, lsl #2] +; CHECK-NEXT: ldr s1, [x2, x8, lsl #2] +; CHECK-NEXT: ldpsw x9, x8, [x1, #56] +; CHECK-NEXT: ldr s2, [x2, x11, lsl #2] ; CHECK-NEXT: ldr s3, [x2, x10, lsl #2] +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ldpsw x11, x10, [x1, #48] +; CHECK-NEXT: ldpsw x15, x14, [x1, #16] +; CHECK-NEXT: ldpsw x17, x16, [x1, #40] +; CHECK-NEXT: ldpsw x0, x18, [x1, #32] +; CHECK-NEXT: fadd s1, s2, s3 +; CHECK-NEXT: ldr s2, [x2, x15, lsl #2] +; CHECK-NEXT: ldr s3, [x2, x14, lsl #2] ; CHECK-NEXT: ldr s4, [x2, x13, lsl #2] ; CHECK-NEXT: ldr s5, [x2, x12, lsl #2] -; CHECK-NEXT: ldr s6, [x2, x15, lsl #2] -; CHECK-NEXT: ldr s7, [x2, x14, lsl #2] ; CHECK-NEXT: ldr s16, [x2, x17, lsl #2] +; CHECK-NEXT: ldr s6, [x2, x0, lsl #2] +; CHECK-NEXT: fadd s2, s2, s3 +; CHECK-NEXT: ldr s7, [x2, x18, lsl #2] ; CHECK-NEXT: ldr s17, [x2, x16, lsl #2] -; CHECK-NEXT: ldr s18, [x2, x0, lsl #2] -; CHECK-NEXT: ldr s19, [x2, x18, lsl #2] -; CHECK-NEXT: ldr s20, [x2, x3, lsl #2] -; CHECK-NEXT: ldr s21, [x2, x1, lsl #2] -; CHECK-NEXT: ldr s22, [x2, x5, lsl #2] -; CHECK-NEXT: ldr s23, [x2, x4, lsl #2] -; CHECK-NEXT: fadd s0, s0, s2 -; CHECK-NEXT: fadd s1, s1, s3 -; CHECK-NEXT: fadd s2, s4, s5 -; CHECK-NEXT: fadd s3, s6, s7 -; CHECK-NEXT: fadd s4, s16, s17 -; CHECK-NEXT: fadd s5, s18, s19 -; CHECK-NEXT: fadd s6, s20, s21 -; CHECK-NEXT: fadd s7, s22, s23 +; CHECK-NEXT: fadd s3, s4, s5 +; CHECK-NEXT: ldr s18, [x2, x11, lsl #2] +; CHECK-NEXT: ldr s19, [x2, x10, lsl #2] +; CHECK-NEXT: fadd s4, s6, s7 ; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ldr s20, [x2, x9, lsl #2] +; CHECK-NEXT: ldr s21, [x2, x8, lsl #2] +; CHECK-NEXT: fadd s5, s16, s17 +; CHECK-NEXT: fadd s6, s18, s19 +; CHECK-NEXT: fadd s7, s20, s21 ; CHECK-NEXT: fadd s1, s2, s3 ; CHECK-NEXT: fadd s2, s4, s5 ; CHECK-NEXT: fadd s3, s6, s7 @@ -593,57 +549,39 @@ entry: define float @scalarize_into_load_zext(i64 %22, ptr %23, ptr %rawA, ptr %rawB) { ; CHECK-LABEL: scalarize_into_load_zext: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q0, q2, [x1] -; CHECK-NEXT: ldp q4, q1, [x1, #32] -; CHECK-NEXT: ushll v3.2d, v0.2s, #0 -; CHECK-NEXT: ushll2 v0.2d, v0.4s, #0 -; CHECK-NEXT: ushll2 v6.2d, v2.4s, #0 -; CHECK-NEXT: ushll2 v5.2d, v1.4s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-NEXT: ushll v2.2d, v2.2s, #0 -; CHECK-NEXT: ushll2 v7.2d, v4.4s, #0 -; CHECK-NEXT: ushll v4.2d, v4.2s, #0 -; CHECK-NEXT: mov x8, v3.d[1] -; CHECK-NEXT: mov x10, v0.d[1] -; CHECK-NEXT: mov x14, v6.d[1] -; CHECK-NEXT: mov x12, v2.d[1] -; CHECK-NEXT: mov x1, v1.d[1] -; CHECK-NEXT: mov x4, v5.d[1] -; CHECK-NEXT: mov x16, v4.d[1] -; CHECK-NEXT: mov x18, v7.d[1] -; CHECK-NEXT: fmov x9, d3 -; CHECK-NEXT: fmov x11, d0 -; CHECK-NEXT: fmov x13, d2 -; CHECK-NEXT: fmov x15, d6 -; CHECK-NEXT: fmov x17, d4 -; CHECK-NEXT: fmov x0, d7 -; CHECK-NEXT: ldr s2, [x2, x8, lsl #2] -; CHECK-NEXT: fmov x3, d1 -; CHECK-NEXT: fmov x5, d5 +; CHECK-NEXT: ldp w9, w8, [x1] +; CHECK-NEXT: ldp w11, w10, [x1, #8] +; CHECK-NEXT: ldp w13, w12, [x1, #24] ; CHECK-NEXT: ldr s0, [x2, x9, lsl #2] -; CHECK-NEXT: ldr s1, [x2, x11, lsl #2] +; CHECK-NEXT: ldr s1, [x2, x8, lsl #2] +; CHECK-NEXT: ldp w9, w8, [x1, #56] +; CHECK-NEXT: ldr s2, [x2, x11, lsl #2] ; CHECK-NEXT: ldr s3, [x2, x10, lsl #2] +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ldp w11, w10, [x1, #48] +; CHECK-NEXT: ldp w15, w14, [x1, #16] +; CHECK-NEXT: ldp w17, w16, [x1, #40] +; CHECK-NEXT: ldp w0, w18, [x1, #32] +; CHECK-NEXT: fadd s1, s2, s3 +; CHECK-NEXT: ldr s2, 
[x2, x15, lsl #2] +; CHECK-NEXT: ldr s3, [x2, x14, lsl #2] ; CHECK-NEXT: ldr s4, [x2, x13, lsl #2] ; CHECK-NEXT: ldr s5, [x2, x12, lsl #2] -; CHECK-NEXT: ldr s6, [x2, x15, lsl #2] -; CHECK-NEXT: ldr s7, [x2, x14, lsl #2] ; CHECK-NEXT: ldr s16, [x2, x17, lsl #2] +; CHECK-NEXT: ldr s6, [x2, x0, lsl #2] +; CHECK-NEXT: fadd s2, s2, s3 +; CHECK-NEXT: ldr s7, [x2, x18, lsl #2] ; CHECK-NEXT: ldr s17, [x2, x16, lsl #2] -; CHECK-NEXT: ldr s18, [x2, x0, lsl #2] -; CHECK-NEXT: ldr s19, [x2, x18, lsl #2] -; CHECK-NEXT: ldr s20, [x2, x3, lsl #2] -; CHECK-NEXT: ldr s21, [x2, x1, lsl #2] -; CHECK-NEXT: ldr s22, [x2, x5, lsl #2] -; CHECK-NEXT: ldr s23, [x2, x4, lsl #2] -; CHECK-NEXT: fadd s0, s0, s2 -; CHECK-NEXT: fadd s1, s1, s3 -; CHECK-NEXT: fadd s2, s4, s5 -; CHECK-NEXT: fadd s3, s6, s7 -; CHECK-NEXT: fadd s4, s16, s17 -; CHECK-NEXT: fadd s5, s18, s19 -; CHECK-NEXT: fadd s6, s20, s21 -; CHECK-NEXT: fadd s7, s22, s23 +; CHECK-NEXT: fadd s3, s4, s5 +; CHECK-NEXT: ldr s18, [x2, x11, lsl #2] +; CHECK-NEXT: ldr s19, [x2, x10, lsl #2] +; CHECK-NEXT: fadd s4, s6, s7 ; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ldr s20, [x2, x9, lsl #2] +; CHECK-NEXT: ldr s21, [x2, x8, lsl #2] +; CHECK-NEXT: fadd s5, s16, s17 +; CHECK-NEXT: fadd s6, s18, s19 +; CHECK-NEXT: fadd s7, s20, s21 ; CHECK-NEXT: fadd s1, s2, s3 ; CHECK-NEXT: fadd s2, s4, s5 ; CHECK-NEXT: fadd s3, s6, s7 diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll index a0a14f2ffae3f..e3007a3723484 100644 --- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll +++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll @@ -169,8 +169,6 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou ; CHECK-NEWLOWERING-NEXT: smstop sm ; CHECK-NEWLOWERING-NEXT: mov x0, x8 ; CHECK-NEWLOWERING-NEXT: bl private_za_decl -; CHECK-NEWLOWERING-NEXT: smstart sm -; CHECK-NEWLOWERING-NEXT: smstop sm ; CHECK-NEWLOWERING-NEXT: bl private_za_decl ; CHECK-NEWLOWERING-NEXT: smstart sm ; CHECK-NEWLOWERING-NEXT: mov x8, x0 @@ -268,19 +266,11 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee( ; CHECK-NEWLOWERING-NEXT: .LBB5_2: ; CHECK-NEWLOWERING-NEXT: mov x0, x8 ; CHECK-NEWLOWERING-NEXT: bl private_za_decl +; CHECK-NEWLOWERING-NEXT: bl private_za_decl ; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_4 ; CHECK-NEWLOWERING-NEXT: // %bb.3: ; CHECK-NEWLOWERING-NEXT: smstart sm ; CHECK-NEWLOWERING-NEXT: .LBB5_4: -; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_6 -; CHECK-NEWLOWERING-NEXT: // %bb.5: -; CHECK-NEWLOWERING-NEXT: smstop sm -; CHECK-NEWLOWERING-NEXT: .LBB5_6: -; CHECK-NEWLOWERING-NEXT: bl private_za_decl -; CHECK-NEWLOWERING-NEXT: tbz w20, #0, .LBB5_8 -; CHECK-NEWLOWERING-NEXT: // %bb.7: -; CHECK-NEWLOWERING-NEXT: smstart sm -; CHECK-NEWLOWERING-NEXT: .LBB5_8: ; CHECK-NEWLOWERING-NEXT: mov x8, x0 ; CHECK-NEWLOWERING-NEXT: mov x0, x19 ; CHECK-NEWLOWERING-NEXT: bl __arm_sme_restore diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll index cab094e638cdf..ced0d41c22dab 100644 --- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll +++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-new-sme-abi -aarch64-streaming-hazard-size=0 -mattr=+sve,+sme2 < %s | FileCheck %s declare 
void @callee() declare void @callee_sm() "aarch64_pstate_sm_enabled" @@ -563,3 +563,128 @@ define void @test13(ptr %ptr) nounwind "aarch64_pstate_sm_enabled" { store %res1, ptr %ptr ret void } + +; normal caller -> streaming callees (with ZA state) +define void @test14(ptr %callee) nounwind "aarch64_inout_za" { +; CHECK-LABEL: test14: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #80 +; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: smstart sm +; CHECK-NEXT: bl callee_sm +; CHECK-NEXT: bl callee_sm +; CHECK-NEXT: smstop sm +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: cbnz x8, .LBB15_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB15_2: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee_sm() + call void @callee_sm() + ret void +} + +; normal caller -> streaming callees (with ZA agnostic state) +define void @test15(ptr %callee) nounwind "aarch64_za_state_agnostic" { +; CHECK-LABEL: test15: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: bl __arm_sme_state_size +; CHECK-NEXT: sub sp, sp, x0 +; CHECK-NEXT: mov x20, sp +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: smstart sm +; CHECK-NEXT: bl callee_sm +; CHECK-NEXT: bl callee_sm +; CHECK-NEXT: smstop sm +; CHECK-NEXT: mov x0, x20 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee_sm() + call void @callee_sm() + ret void +} + +; locally streaming caller -> normal callees (with ZA state) +define void @test16(ptr %callee) nounwind "aarch64_pstate_sm_body" "aarch64_new_za" { +; CHECK-LABEL: test16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-96]! 
// 16-byte Folded Spill +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: cbz x8, .LBB17_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: bl __arm_tpidr2_save +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: zero {za} +; CHECK-NEXT: .LBB17_2: +; CHECK-NEXT: smstart za +; CHECK-NEXT: smstart sm +; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: smstop sm +; CHECK-NEXT: bl callee +; CHECK-NEXT: bl callee +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: smstop za +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee() + call void @callee() + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll index 505a40c16653b..d00efa7d99d53 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -374,8 +374,8 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl foo ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ldr z0, [sp, #2, mul vl] -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: addvl x8, sp, #2 +; CHECK-NEXT: ldrb w0, [x8] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll index fc43c714d69b3..b6dee97ea2962 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -aarch64-new-sme-abi -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-SDAG ; A simple EH test case that corresponds to the following C++ source: ; @@ -87,6 +88,90 @@ define void @za_with_raii(i1 %fail) "aarch64_inout_za" personality ptr @__gxx_pe ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl _Unwind_Resume +; +; CHECK-SDAG-LABEL: za_with_raii: +; CHECK-SDAG: .Lfunc_begin0: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception0 +; CHECK-SDAG-NEXT: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w20, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: tbnz w0, #0, .LBB0_2 +; CHECK-SDAG-NEXT: // %bb.1: // %return_normally +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: b shared_za_call +; CHECK-SDAG-NEXT: .LBB0_2: // %throw_exception +; CHECK-SDAG-NEXT: sub x20, x29, #16 +; CHECK-SDAG-NEXT: mov w0, #8 // =0x8 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl __cxa_allocate_exception +; CHECK-SDAG-NEXT: mov x8, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x9, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x9, .LBB0_4 +; CHECK-SDAG-NEXT: // %bb.3: // %throw_exception +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB0_4: // %throw_exception +; CHECK-SDAG-NEXT: adrp x9, .L.str +; CHECK-SDAG-NEXT: add x9, x9, :lo12:.L.str +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: str x9, [x8] +; CHECK-SDAG-NEXT: .Ltmp0: // EH_LABEL +; CHECK-SDAG-NEXT: adrp x1, :got:typeinfo_for_char_const_ptr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: mov x0, x8 +; CHECK-SDAG-NEXT: ldr x1, [x1, :got_lo12:typeinfo_for_char_const_ptr] +; CHECK-SDAG-NEXT: mov x2, xzr +; CHECK-SDAG-NEXT: bl __cxa_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB0_6 +; CHECK-SDAG-NEXT: // %bb.5: // %throw_exception +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB0_6: // %throw_exception +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .Ltmp1: // EH_LABEL +; CHECK-SDAG-NEXT: // %bb.7: // %throw_fail +; CHECK-SDAG-NEXT: .LBB0_8: // %unwind_dtors +; CHECK-SDAG-NEXT: .Ltmp2: // EH_LABEL +; CHECK-SDAG-NEXT: mov x19, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB0_10 +; CHECK-SDAG-NEXT: // %bb.9: // %unwind_dtors +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB0_10: // %unwind_dtors +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x20 +; CHECK-SDAG-NEXT: bl _Unwind_Resume +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB0_12 +; CHECK-SDAG-NEXT: // %bb.11: // %unwind_dtors +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB0_12: // %unwind_dtors +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr br i1 %fail, label %throw_exception, label %return_normally throw_exception: @@ -124,7 +209,7 @@ throw_fail: ; } ; shared_za_call(); ; } -define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_personality_v0 { +define void @try_catch() "aarch64_inout_za" personality ptr @__gxx_personality_v0 { ; CHECK-LABEL: try_catch: ; CHECK: .Lfunc_begin1: ; CHECK-NEXT: 
.cfi_startproc @@ -142,11 +227,11 @@ define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_per ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: .Ltmp3: // EH_LABEL ; CHECK-NEXT: sub x8, x29, #16 ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl may_throw -; CHECK-NEXT: .Ltmp4: +; CHECK-NEXT: .Ltmp4: // EH_LABEL ; CHECK-NEXT: .LBB1_1: // %after_catch ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -160,7 +245,7 @@ define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_per ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: b shared_za_call ; CHECK-NEXT: .LBB1_4: // %catch -; CHECK-NEXT: .Ltmp5: +; CHECK-NEXT: .Ltmp5: // EH_LABEL ; CHECK-NEXT: bl __cxa_begin_catch ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -175,6 +260,78 @@ define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_per ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl __cxa_end_catch ; CHECK-NEXT: b .LBB1_1 +; +; CHECK-SDAG-LABEL: try_catch: +; CHECK-SDAG: .Lfunc_begin1: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception1 +; CHECK-SDAG-NEXT: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: .Ltmp3: // EH_LABEL +; CHECK-SDAG-NEXT: sub x19, x29, #16 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_2 +; CHECK-SDAG-NEXT: // %bb.1: +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB1_2: +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .Ltmp4: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB1_3: // %after_catch +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: b shared_za_call +; CHECK-SDAG-NEXT: .LBB1_4: // %catch +; CHECK-SDAG-NEXT: .Ltmp5: // EH_LABEL +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_6 +; CHECK-SDAG-NEXT: // %bb.5: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB1_6: // %catch +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_8 +; CHECK-SDAG-NEXT: // %bb.7: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB1_8: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl 
__cxa_end_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB1_10 +; CHECK-SDAG-NEXT: // %bb.9: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB1_10: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: b .LBB1_3 invoke void @may_throw() to label %after_catch unwind label %catch @@ -235,16 +392,16 @@ define void @try_catch_shared_za_callee() "aarch64_new_za" personality ptr @__gx ; CHECK-NEXT: zero {za} ; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: smstart za -; CHECK-NEXT: .Ltmp6: +; CHECK-NEXT: .Ltmp6: // EH_LABEL ; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: .Ltmp7: +; CHECK-NEXT: .Ltmp7: // EH_LABEL ; CHECK-NEXT: .LBB2_3: // %exit ; CHECK-NEXT: smstop za ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB2_4: // %catch -; CHECK-NEXT: .Ltmp8: +; CHECK-NEXT: .Ltmp8: // EH_LABEL ; CHECK-NEXT: bl __cxa_begin_catch ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -260,6 +417,78 @@ define void @try_catch_shared_za_callee() "aarch64_new_za" personality ptr @__gx ; CHECK-NEXT: bl __cxa_end_catch ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: b .LBB2_3 +; +; CHECK-SDAG-LABEL: try_catch_shared_za_callee: +; CHECK-SDAG: .Lfunc_begin2: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception2 +; CHECK-SDAG-NEXT: // %bb.0: // %prelude +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: sub sp, sp, #16 +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: rdsvl x8, #1 +; CHECK-SDAG-NEXT: mov x9, sp +; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 +; CHECK-SDAG-NEXT: mov sp, x9 +; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: cbz x8, .LBB2_2 +; CHECK-SDAG-NEXT: // %bb.1: // %save.za +; CHECK-SDAG-NEXT: bl __arm_tpidr2_save +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: .LBB2_2: +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: zero {za} +; CHECK-SDAG-NEXT: .Ltmp6: // EH_LABEL +; CHECK-SDAG-NEXT: bl shared_za_call +; CHECK-SDAG-NEXT: .Ltmp7: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB2_3: // %exit +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB2_4: // %catch +; CHECK-SDAG-NEXT: .Ltmp8: // EH_LABEL +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: sub x19, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB2_6 +; CHECK-SDAG-NEXT: // %bb.5: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB2_6: // %catch +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB2_8 +; CHECK-SDAG-NEXT: // %bb.7: // %catch +; CHECK-SDAG-NEXT: bl 
__arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB2_8: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: bl noexcept_shared_za_call +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x19 +; CHECK-SDAG-NEXT: bl __cxa_end_catch +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-SDAG-NEXT: sub x0, x29, #16 +; CHECK-SDAG-NEXT: cbnz x8, .LBB2_10 +; CHECK-SDAG-NEXT: // %bb.9: // %catch +; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore +; CHECK-SDAG-NEXT: .LBB2_10: // %catch +; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr +; CHECK-SDAG-NEXT: b .LBB2_3 invoke void @shared_za_call() #4 to label %exit unwind label %catch catch: @@ -275,6 +504,234 @@ exit: ret void } +; A simple ZT0 exception example that corresponds to: +; +; struct ZT0Resource { +; ~ZT0Resource() __arm_inout("zt0") { +; shared_zt0_call(); // simulate cleanup in destructor +; } +; }; +; +; void za_with_raii() __arm_inout("zt0") { +; ZT0Resource r; +; may_throw(); +; } +; +; This code may require reloading ZT0 in the cleanup for ~ZT0Resource(). +; +; FIXME: Codegen with `-aarch64-new-sme-abi` is broken with ZT0 (as it is not implemented). +define void @try_catch_shared_zt0_callee() "aarch64_inout_zt0" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_shared_zt0_callee: +; CHECK: .Lfunc_begin3: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception3 +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stp x9, x8, [x29, #-80] +; CHECK-NEXT: .Ltmp9: // EH_LABEL +; CHECK-NEXT: sub x19, x29, #64 +; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: smstop za +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: .Ltmp10: // EH_LABEL +; CHECK-NEXT: // %bb.1: // %return_normally +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_2: // %unwind_dtors +; CHECK-NEXT: .Ltmp11: // EH_LABEL +; CHECK-NEXT: sub x20, x29, #64 +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #80 +; CHECK-NEXT: cbnz x8, .LBB3_4 +; CHECK-NEXT: // %bb.3: // %unwind_dtors +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB3_4: // %unwind_dtors +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: bl shared_zt0_call +; CHECK-NEXT: str zt0, [x20] +; CHECK-NEXT: smstop za +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl _Unwind_Resume +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x20] +; +; CHECK-SDAG-LABEL: try_catch_shared_zt0_callee: +; CHECK-SDAG: .Lfunc_begin3: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception3 +; CHECK-SDAG-NEXT: // %bb.0: +; CHECK-SDAG-NEXT: sub sp, sp, #96 +; CHECK-SDAG-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-SDAG-NEXT: .cfi_def_cfa_offset 96 +; 
CHECK-SDAG-NEXT: .cfi_offset w19, -8 +; CHECK-SDAG-NEXT: .cfi_offset w20, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -32 +; CHECK-SDAG-NEXT: .Ltmp9: // EH_LABEL +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: str zt0, [x19] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x19] +; CHECK-SDAG-NEXT: .Ltmp10: // EH_LABEL +; CHECK-SDAG-NEXT: // %bb.1: // %return_normally +; CHECK-SDAG-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: add sp, sp, #96 +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB3_2: // %unwind_dtors +; CHECK-SDAG-NEXT: .Ltmp11: // EH_LABEL +; CHECK-SDAG-NEXT: mov x20, sp +; CHECK-SDAG-NEXT: mov x19, x0 +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x20] +; CHECK-SDAG-NEXT: bl shared_zt0_call +; CHECK-SDAG-NEXT: str zt0, [x20] +; CHECK-SDAG-NEXT: smstop za +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl _Unwind_Resume +; CHECK-SDAG-NEXT: smstart za +; CHECK-SDAG-NEXT: ldr zt0, [x20] + invoke void @may_throw() + to label %return_normally unwind label %unwind_dtors + +unwind_dtors: + %5 = landingpad { ptr, i32 } + cleanup + tail call void @shared_zt0_call() + resume { ptr, i32 } %5 + +return_normally: + ret void +} + +; This example corresponds to: +; +; __arm_agnostic("sme_za_state") void try_catch_agnostic_za() +; { +; try { +; may_throw(); +; } catch(...) { +; } +; } +; +; In this example we must execute __arm_sme_restore once we enter the catch block +; (before executing __arm_sme_save again, which would invalidate the prior save). +define void @try_catch_agnostic_za() "aarch64_za_state_agnostic" personality ptr @__gxx_personality_v0 { +; CHECK-LABEL: try_catch_agnostic_za: +; CHECK: .Lfunc_begin4: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-NEXT: .cfi_lsda 28, .Lexception4 +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: bl __arm_sme_state_size +; CHECK-NEXT: sub sp, sp, x0 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .Ltmp12: // EH_LABEL +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_save +; CHECK-NEXT: bl may_throw +; CHECK-NEXT: .Ltmp13: // EH_LABEL +; CHECK-NEXT: .LBB4_1: // %exit +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB4_2: // %catch +; CHECK-NEXT: .Ltmp14: // EH_LABEL +; CHECK-NEXT: bl __cxa_begin_catch +; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: b .LBB4_1 +; +; CHECK-SDAG-LABEL: try_catch_agnostic_za: +; CHECK-SDAG: .Lfunc_begin4: +; CHECK-SDAG-NEXT: .cfi_startproc +; CHECK-SDAG-NEXT: .cfi_personality 156, DW.ref.__gxx_personality_v0 +; CHECK-SDAG-NEXT: .cfi_lsda 28, .Lexception4 +; CHECK-SDAG-NEXT: // %bb.0: +; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-SDAG-NEXT: mov x29, sp +; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 32 +; CHECK-SDAG-NEXT: .cfi_offset w19, -16 +; CHECK-SDAG-NEXT: .cfi_offset w30, -24 +; CHECK-SDAG-NEXT: .cfi_offset w29, -32 +; CHECK-SDAG-NEXT: bl __arm_sme_state_size +; CHECK-SDAG-NEXT: sub sp, sp, x0 +; CHECK-SDAG-NEXT: mov x19, sp +; CHECK-SDAG-NEXT: .Ltmp12: // EH_LABEL +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: bl may_throw +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: .Ltmp13: // EH_LABEL +; CHECK-SDAG-NEXT: .LBB4_1: // %exit +; CHECK-SDAG-NEXT: mov sp, x29 +; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-SDAG-NEXT: ret +; CHECK-SDAG-NEXT: .LBB4_2: // %catch +; CHECK-SDAG-NEXT: .Ltmp14: // EH_LABEL +; CHECK-SDAG-NEXT: mov x1, x0 +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: mov x0, x1 +; CHECK-SDAG-NEXT: bl __cxa_begin_catch +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_save +; CHECK-SDAG-NEXT: bl __cxa_end_catch +; CHECK-SDAG-NEXT: mov x0, x19 +; CHECK-SDAG-NEXT: bl __arm_sme_restore +; CHECK-SDAG-NEXT: b .LBB4_1 + invoke void @may_throw() + to label %exit unwind label %catch +catch: + %eh_info = landingpad { ptr, i32 } + catch ptr null + %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0 + tail call ptr @__cxa_begin_catch(ptr %exception_ptr) + tail call void @__cxa_end_catch() + br label %exit + +exit: + ret void +} + declare ptr @__cxa_allocate_exception(i64) declare void @__cxa_throw(ptr, ptr, ptr) declare ptr @__cxa_begin_catch(ptr) @@ -284,3 +741,4 @@ declare i32 @__gxx_personality_v0(...) 
declare void @may_throw() declare void @shared_za_call() "aarch64_inout_za" declare void @noexcept_shared_za_call() "aarch64_inout_za" +declare void @shared_zt0_call() "aarch64_inout_zt0" diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll index f6ed2e6a787f0..ba7bee9a94bac 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll @@ -19,14 +19,12 @@ define <4 x i32> @load_zext_v4i16i32(ptr %ap) vscale_range(2,0) #0 { define <2 x i256> @load_zext_v2i64i256(ptr %ap) #0 { ; CHECK-LABEL: load_zext_v2i64i256: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldp x0, x4, [x0] ; CHECK-NEXT: mov x1, xzr ; CHECK-NEXT: mov x2, xzr ; CHECK-NEXT: mov x3, xzr ; CHECK-NEXT: mov x5, xzr ; CHECK-NEXT: mov x6, xzr -; CHECK-NEXT: mov x4, v0.d[1] -; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: mov x7, xzr ; CHECK-NEXT: ret %a = load <2 x i64>, ptr %ap diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index ebd32c73ec65b..6fd5b820a2242 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -438,8 +438,7 @@ define void @masked_gather_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 { define void @masked_gather_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ldr x8, [x0] ; CHECK-NEXT: // implicit-def: $d0 ; CHECK-NEXT: cbnz x8, .LBB15_2 ; CHECK-NEXT: // %bb.1: // %cond.load diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll index a69808d32ed73..4f5a5a6dee257 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll @@ -727,8 +727,8 @@ define void @load_splat_v4f64(ptr %a, ptr %b) vscale_range(2,2) #0 { define void @load_splat_v32i8(ptr %a, ptr %b) vscale_range(2,2) #0 { ; CHECK-LABEL: load_splat_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z0.b, b0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld1rb { z0.b }, p0/z, [x0] ; CHECK-NEXT: str z0, [x1] ; CHECK-NEXT: ret %v = load <32 x i8>, ptr %a @@ -740,8 +740,8 @@ define void @load_splat_v32i8(ptr %a, ptr %b) vscale_range(2,2) #0 { define void @load_splat_v16i16(ptr %a, ptr %b) vscale_range(2,2) #0 { ; CHECK-LABEL: load_splat_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1rh { z0.h }, p0/z, [x0] ; CHECK-NEXT: str z0, [x1] ; CHECK-NEXT: ret %v = load <16 x i16>, ptr %a @@ -753,8 +753,8 @@ define void @load_splat_v16i16(ptr %a, ptr %b) vscale_range(2,2) #0 { define void @load_splat_v8i32(ptr %a, ptr %b) vscale_range(2,2) #0 { ; CHECK-LABEL: load_splat_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z0.s, s0 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1rw { z0.s }, p0/z, [x0] ; CHECK-NEXT: str z0, [x1] ; CHECK-NEXT: ret %v = load <8 x i32>, ptr %a @@ -766,8 +766,8 @@ define void @load_splat_v8i32(ptr %a, ptr %b) vscale_range(2,2) #0 { define void @load_splat_v4i64(ptr %a, ptr %b) vscale_range(2,2) #0 { ; CHECK-LABEL: load_splat_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: mov z0.d, d0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: 
ld1rd { z0.d }, p0/z, [x0] ; CHECK-NEXT: str z0, [x1] ; CHECK-NEXT: ret %v = load <4 x i64>, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll index 4d524bc848de6..e433786cfdd1f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll @@ -99,16 +99,14 @@ define <2 x i64> @load_zext_v2i32i64(ptr %ap) { define <2 x i256> @load_zext_v2i64i256(ptr %ap) { ; CHECK-LABEL: load_zext_v2i64i256: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldp x8, x4, [x0] ; CHECK-NEXT: mov x1, xzr ; CHECK-NEXT: mov x2, xzr ; CHECK-NEXT: mov x3, xzr ; CHECK-NEXT: mov x5, xzr ; CHECK-NEXT: mov x6, xzr -; CHECK-NEXT: mov z1.d, z0.d[1] -; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: mov x7, xzr -; CHECK-NEXT: fmov x4, d1 +; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: load_zext_v2i64i256: @@ -282,14 +280,12 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) { define <2 x i256> @load_sext_v2i64i256(ptr %ap) { ; CHECK-LABEL: load_sext_v2i64i256: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: mov z1.d, z0.d[1] -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: fmov x4, d1 -; CHECK-NEXT: asr x1, x0, #63 +; CHECK-NEXT: ldp x8, x4, [x0] +; CHECK-NEXT: asr x1, x8, #63 +; CHECK-NEXT: asr x5, x4, #63 +; CHECK-NEXT: mov x0, x8 ; CHECK-NEXT: mov x2, x1 ; CHECK-NEXT: mov x3, x1 -; CHECK-NEXT: asr x5, x4, #63 ; CHECK-NEXT: mov x6, x5 ; CHECK-NEXT: mov x7, x5 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll index e6c6003ee6c69..094eaad0cfe80 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll @@ -115,9 +115,9 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind { ; CHECK-NEXT: adrp x8, .LCPI2_0 ; CHECK-NEXT: ldr q0, [sp] ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] +; CHECK-NEXT: ldrb w8, [sp, #16] ; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b -; CHECK-NEXT: ldr q1, [sp, #16] -; CHECK-NEXT: stur b1, [x19, #8] +; CHECK-NEXT: strb w8, [x19, #8] ; CHECK-NEXT: str d0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #48 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll index ea6123edc8b4c..7b9b69e0d9b4d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll @@ -101,15 +101,13 @@ define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: shuffle_ext_byone_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: mov z0.b, z0.b[15] -; CHECK-NEXT: mov z2.b, z1.b[15] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: insr z1.b, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: insr z3.b, w8 -; CHECK-NEXT: stp q1, q3, [x0] +; CHECK-NEXT: ldp q0, q2, [x1] +; CHECK-NEXT: ldrb w8, [x0, #31] +; CHECK-NEXT: mov z1.b, z0.b[15] +; CHECK-NEXT: insr z0.b, w8 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: insr z2.b, w8 +; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: 
shuffle_ext_byone_v32i8: @@ -238,15 +236,13 @@ define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: shuffle_ext_byone_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: mov z0.h, z0.h[7] -; CHECK-NEXT: mov z2.h, z1.h[7] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: insr z1.h, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: insr z3.h, w8 -; CHECK-NEXT: stp q1, q3, [x0] +; CHECK-NEXT: ldp q0, q2, [x1] +; CHECK-NEXT: ldrh w8, [x0, #30] +; CHECK-NEXT: mov z1.h, z0.h[7] +; CHECK-NEXT: insr z0.h, w8 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: insr z2.h, w8 +; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i16: @@ -341,15 +337,13 @@ define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: shuffle_ext_byone_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: mov z0.s, z0.s[3] -; CHECK-NEXT: mov z2.s, z1.s[3] -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: insr z1.s, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: insr z3.s, w8 -; CHECK-NEXT: stp q1, q3, [x0] +; CHECK-NEXT: ldp q0, q2, [x1] +; CHECK-NEXT: ldr w8, [x0, #28] +; CHECK-NEXT: mov z1.s, z0.s[3] +; CHECK-NEXT: insr z0.s, w8 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: insr z2.s, w8 +; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i32: @@ -409,15 +403,13 @@ define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: shuffle_ext_byone_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ldp q1, q3, [x1] -; CHECK-NEXT: mov z0.d, z0.d[1] -; CHECK-NEXT: mov z2.d, z1.d[1] -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: insr z1.d, x8 -; CHECK-NEXT: fmov x8, d2 -; CHECK-NEXT: insr z3.d, x8 -; CHECK-NEXT: stp q1, q3, [x0] +; CHECK-NEXT: ldp q0, q2, [x1] +; CHECK-NEXT: ldr x8, [x0, #24] +; CHECK-NEXT: mov z1.d, z0.d[1] +; CHECK-NEXT: insr z0.d, x8 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: insr z2.d, x8 +; CHECK-NEXT: stp q0, q2, [x0] ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i64: diff --git a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll index 62d41fca10db3..19e1aa5d152ce 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll @@ -26,9 +26,9 @@ define i32 @reduce_and_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_and_v1i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.b[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: tst w8, #0x80 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i8> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x) @@ -120,9 +120,9 @@ define i32 @reduce_and_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_and_v1i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: tst w8, #0x8000 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i16> %a0, zeroinitializer %y = call i1 
@llvm.vector.reduce.and.v1i1(<1 x i1> %x) @@ -305,9 +305,9 @@ define i32 @reduce_or_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_or_v1i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.b[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: tst w8, #0x80 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i8> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x) @@ -399,9 +399,9 @@ define i32 @reduce_or_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_or_v1i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: tst w8, #0x8000 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i16> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x) @@ -584,9 +584,9 @@ define i32 @reduce_xor_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_xor_v1i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.b[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.b[0] +; CHECK-NEXT: tst w8, #0x80 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i8> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x) @@ -679,9 +679,9 @@ define i32 @reduce_xor_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_xor_v1i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: smov w8, v0.h[0] -; CHECK-NEXT: cmp w8, #0 -; CHECK-NEXT: csel w0, w0, w1, mi +; CHECK-NEXT: umov w8, v0.h[0] +; CHECK-NEXT: tst w8, #0x8000 +; CHECK-NEXT: csel w0, w0, w1, ne ; CHECK-NEXT: ret %x = icmp slt <1 x i16> %a0, zeroinitializer %y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x) diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll index 9165493863729..55c343164a1b8 100644 --- a/llvm/test/CodeGen/AArch64/vector-compress.ll +++ b/llvm/test/CodeGen/AArch64/vector-compress.ll @@ -462,10 +462,9 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i ; CHECK-NEXT: orr x8, x9, x8, lsl #1 ; CHECK-NEXT: strh w1, [x10] ; CHECK-NEXT: strh w2, [x8] -; CHECK-NEXT: ldr d0, [sp, #8] -; CHECK-NEXT: umov.h w0, v0[0] -; CHECK-NEXT: umov.h w1, v0[1] -; CHECK-NEXT: umov.h w2, v0[2] +; CHECK-NEXT: ldrh w0, [sp, #8] +; CHECK-NEXT: ldrh w1, [sp, #10] +; CHECK-NEXT: ldrh w2, [sp, #12] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %out = call <3 x i3> @llvm.experimental.vector.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-divrem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-divrem.mir new file mode 100644 index 0000000000000..f75fa857448d7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-divrem.mir @@ -0,0 +1,125 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 -run-pass=legalizer -o - %s | FileCheck -check-prefix=GFX8 %s + +--- +name: test_sdivrem_s16 +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: test_sdivrem_s16 + ; GFX8: liveins: $vgpr0, $vgpr1 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: 
[[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX8-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY]], 16 + ; GFX8-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 16 + ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 + ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SEXT_INREG]], [[C]](s32) + ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SEXT_INREG1]], [[C]](s32) + ; GFX8-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG]], [[ASHR]] + ; GFX8-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[SEXT_INREG1]], [[ASHR1]] + ; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ADD]], [[ASHR]] + ; GFX8-NEXT: [[XOR1:%[0-9]+]]:_(s32) = G_XOR [[ADD1]], [[ASHR1]] + ; GFX8-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[XOR1]](s32) + ; GFX8-NEXT: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32) + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; GFX8-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]] + ; GFX8-NEXT: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32) + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[XOR1]] + ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]] + ; GFX8-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]] + ; GFX8-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]] + ; GFX8-NEXT: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[XOR]], [[ADD2]] + ; GFX8-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[XOR1]] + ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[XOR]], [[MUL1]] + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[XOR1]] + ; GFX8-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[UMULH1]], [[C3]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD3]], [[UMULH1]] + ; GFX8-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[XOR1]] + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]] + ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT1]](s32), [[XOR1]] + ; GFX8-NEXT: [[ADD4:%[0-9]+]]:_(s32) = G_ADD [[SELECT]], [[C3]] + ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[ADD4]], [[SELECT]] + ; GFX8-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT1]], [[XOR1]] + ; GFX8-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT1]] + ; GFX8-NEXT: [[XOR2:%[0-9]+]]:_(s32) = G_XOR [[ASHR]], [[ASHR1]] + ; GFX8-NEXT: [[XOR3:%[0-9]+]]:_(s32) = G_XOR [[SELECT2]], [[XOR2]] + ; GFX8-NEXT: [[SUB4:%[0-9]+]]:_(s32) = G_SUB [[XOR3]], [[XOR2]] + ; GFX8-NEXT: [[XOR4:%[0-9]+]]:_(s32) = G_XOR [[SELECT3]], [[ASHR]] + ; GFX8-NEXT: [[SUB5:%[0-9]+]]:_(s32) = G_SUB [[XOR4]], [[ASHR]] + ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SUB4]](s32) + ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SUB5]](s32) + ; GFX8-NEXT: S_NOP 0, implicit [[TRUNC]](s16), implicit [[TRUNC1]](s16) + ; GFX8-NEXT: $vgpr0 = COPY [[SUB4]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[SUB5]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s16), %5:_(s16) = G_SDIVREM %2, %3 + S_NOP 0, implicit %4, implicit %5 + %6:_(s32) = G_ANYEXT %4 + %7:_(s32) = G_ANYEXT %5 + $vgpr0 = COPY %6 + $vgpr0 = COPY %7 + +... 
+ +--- +name: test_udivrem_s16 +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX8-LABEL: name: test_udivrem_s16 + ; GFX8: liveins: $vgpr0, $vgpr1 + ; GFX8-NEXT: {{ $}} + ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GFX8-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; GFX8-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; GFX8-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]] + ; GFX8-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[AND1]](s32) + ; GFX8-NEXT: [[AMDGPU_RCP_IFLAG:%[0-9]+]]:_(s32) = G_AMDGPU_RCP_IFLAG [[UITOFP]](s32) + ; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 0x41EFFFFFC0000000 + ; GFX8-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[AMDGPU_RCP_IFLAG]], [[C1]] + ; GFX8-NEXT: [[FPTOUI:%[0-9]+]]:_(s32) = G_FPTOUI [[FMUL]](s32) + ; GFX8-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C2]], [[AND1]] + ; GFX8-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[SUB]], [[FPTOUI]] + ; GFX8-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI]], [[MUL]] + ; GFX8-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI]], [[UMULH]] + ; GFX8-NEXT: [[UMULH1:%[0-9]+]]:_(s32) = G_UMULH [[AND]], [[ADD]] + ; GFX8-NEXT: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UMULH1]], [[AND1]] + ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[AND]], [[MUL1]] + ; GFX8-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SUB1]](s32), [[AND1]] + ; GFX8-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[UMULH1]], [[C3]] + ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[ADD1]], [[UMULH1]] + ; GFX8-NEXT: [[SUB2:%[0-9]+]]:_(s32) = G_SUB [[SUB1]], [[AND1]] + ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s1), [[SUB2]], [[SUB1]] + ; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[SELECT1]](s32), [[AND1]] + ; GFX8-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[SELECT]], [[C3]] + ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[ADD2]], [[SELECT]] + ; GFX8-NEXT: [[SUB3:%[0-9]+]]:_(s32) = G_SUB [[SELECT1]], [[AND1]] + ; GFX8-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[SUB3]], [[SELECT1]] + ; GFX8-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[SELECT2]](s32) + ; GFX8-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SELECT3]](s32) + ; GFX8-NEXT: S_NOP 0, implicit [[TRUNC]](s16), implicit [[TRUNC1]](s16) + ; GFX8-NEXT: $vgpr0 = COPY [[SELECT2]](s32) + ; GFX8-NEXT: $vgpr0 = COPY [[SELECT3]](s32) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s16) = G_TRUNC %0 + %3:_(s16) = G_TRUNC %1 + %4:_(s16), %5:_(s16) = G_UDIVREM %2, %3 + S_NOP 0, implicit %4, implicit %5 + %6:_(s32) = G_ANYEXT %4 + %7:_(s32) = G_ANYEXT %5 + $vgpr0 = COPY %6 + $vgpr0 = COPY %7 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll index dd5a9ae48e207..6e85e6fc7297d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after=instruction-select < %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 -stop-after=instruction-select < %s | FileCheck --check-prefix=CHECK45 %s define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) { ; CHECK-LABEL: name: basic_raw_buffer @@ -25,7 +26,39 @@ define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) { ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 0, i32 1234, i32 5678) + ; + ; CHECK45-LABEL: name: basic_raw_buffer + ; CHECK45: bb.1 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -6629298651489370112 + ; CHECK45-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE]], [[S_MOV_B]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 9 + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -536870912 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE1]], [[S_MOV_B64_]], implicit-def dead $scc + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1 + ; CHECK45-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_1]].sub0 + ; CHECK45-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_1]].sub1 + ; CHECK45-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK45-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK45-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK45-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK45-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; CHECK45-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK45-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; CHECK45-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG 
implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 0, i64 1234, i32 5678) ret ptr addrspace(8) %rsrc } @@ -43,7 +76,23 @@ define amdgpu_ps float @read_raw_buffer(ptr addrspace(1) inreg %p) { ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 4, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0) + ; + ; CHECK45-LABEL: name: read_raw_buffer + ; CHECK45: bb.1 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 + ; CHECK45-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE1]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK45-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %p, i16 0, i64 0, i32 0) %loaded = call float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) ret float %loaded } @@ -74,19 +123,54 @@ define amdgpu_ps ptr addrspace(8) @basic_struct_buffer(ptr inreg %p) { ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i32 1234, i32 5678) + ; + ; CHECK45-LABEL: name: basic_struct_buffer + ; CHECK45: bb.1 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -6629298651489370112 + ; CHECK45-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE]], [[S_MOV_B]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 9 + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16384 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; CHECK45-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -536870912 
+ ; CHECK45-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_2]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE1]], [[S_MOV_B64_]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_OR_B64_2:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_OR_B64_1]], [[REG_SEQUENCE2]], implicit-def dead $scc + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1 + ; CHECK45-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub0 + ; CHECK45-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub1 + ; CHECK45-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec + ; CHECK45-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK45-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK45-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK45-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; CHECK45-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK45-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; CHECK45-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i64 1234, i32 5678) ret ptr addrspace(8) %rsrc } -define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i32 inreg %numVals, i32 inreg %flags) { +define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i64 inreg %numVals, i32 inreg %flags) { ; CHECK-LABEL: name: variable_top_half ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3 + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr4 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 262144 @@ -104,20 +188,64 @@ define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i32 inreg %nu ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i32 %numVals, i32 %flags) + ; + ; CHECK45-LABEL: name: variable_top_half + ; CHECK45: bb.1 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = 
REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 25 + ; CHECK45-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], [[S_MOV_B32_1]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_LSHL_B32_]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 7 + ; CHECK45-NEXT: [[S_LSHR_B64_:%[0-9]+]]:sreg_64 = S_LSHR_B64 [[REG_SEQUENCE1]], [[S_MOV_B32_3]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 16384 + ; CHECK45-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_4]], %subreg.sub1 + ; CHECK45-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 28 + ; CHECK45-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], [[S_MOV_B32_5]], implicit-def dead $scc + ; CHECK45-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_LSHL_B32_1]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_LSHR_B64_]], [[REG_SEQUENCE3]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_OR_B64_2:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_OR_B64_1]], [[REG_SEQUENCE4]], implicit-def dead $scc + ; CHECK45-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0 + ; CHECK45-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1 + ; CHECK45-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub0 + ; CHECK45-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub1 + ; CHECK45-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; CHECK45-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK45-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; CHECK45-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK45-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; CHECK45-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK45-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; CHECK45-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i64 %numVals, i32 %flags) ret ptr addrspace(8) %rsrc } -define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride, i32 inreg %numVals, i32 inreg %flags) { +define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride, i64 inreg %numVals, i32 inreg %flags) { ; CHECK-LABEL: name: 
general_case ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 @@ -136,20 +264,66 @@ define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride, ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + ; + ; CHECK45-LABEL: name: general_case + ; CHECK45: bb.1 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK45-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 25 + ; CHECK45-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], [[S_MOV_B32_1]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_LSHL_B32_]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 7 + ; CHECK45-NEXT: [[S_LSHR_B64_:%[0-9]+]]:sreg_64 = S_LSHR_B64 [[REG_SEQUENCE1]], [[S_MOV_B32_3]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 12 + ; CHECK45-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], [[S_MOV_B32_4]], implicit-def dead $scc + ; CHECK45-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_LSHL_B32_1]], %subreg.sub1 + ; CHECK45-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 28 + ; CHECK45-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], [[S_MOV_B32_5]], implicit-def dead $scc + ; CHECK45-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_LSHL_B32_2]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_LSHR_B64_]], [[REG_SEQUENCE3]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_OR_B64_2:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_OR_B64_1]], 
[[REG_SEQUENCE4]], implicit-def dead $scc + ; CHECK45-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0 + ; CHECK45-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1 + ; CHECK45-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub0 + ; CHECK45-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub1 + ; CHECK45-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; CHECK45-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK45-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; CHECK45-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK45-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; CHECK45-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK45-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; CHECK45-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i64 %numVals, i32 %flags) ret ptr addrspace(8) %rsrc } -define amdgpu_ps float @general_case_load(ptr inreg %p, i16 inreg %stride, i32 inreg %numVals, i32 inreg %flags) { +define amdgpu_ps float @general_case_load(ptr inreg %p, i16 inreg %stride, i64 inreg %numVals, i32 inreg %flags) { ; CHECK-LABEL: name: general_case_load ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 @@ -161,23 +335,61 @@ define amdgpu_ps float @general_case_load(ptr inreg %p, i16 inreg %stride, i32 i ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + ; + ; CHECK45-LABEL: name: general_case_load + ; CHECK45: bb.1 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; CHECK45-NEXT: 
[[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr3 + ; CHECK45-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr4 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr5 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 25 + ; CHECK45-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], [[S_MOV_B32_1]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_LSHL_B32_]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE]], [[REG_SEQUENCE2]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 7 + ; CHECK45-NEXT: [[S_LSHR_B64_:%[0-9]+]]:sreg_64 = S_LSHR_B64 [[REG_SEQUENCE1]], [[S_MOV_B32_3]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 12 + ; CHECK45-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], [[S_MOV_B32_4]], implicit-def dead $scc + ; CHECK45-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_LSHL_B32_1]], %subreg.sub1 + ; CHECK45-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 28 + ; CHECK45-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], [[S_MOV_B32_5]], implicit-def dead $scc + ; CHECK45-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_LSHL_B32_2]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_LSHR_B64_]], [[REG_SEQUENCE3]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_OR_B64_2:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_OR_B64_1]], [[REG_SEQUENCE4]], implicit-def dead $scc + ; CHECK45-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0 + ; CHECK45-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1 + ; CHECK45-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub0 + ; CHECK45-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub1 + ; CHECK45-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY7]], %subreg.sub1, [[COPY8]], %subreg.sub2, [[COPY9]], %subreg.sub3 + ; CHECK45-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK45-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN [[COPY10]], [[REG_SEQUENCE5]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK45-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i64 %numVals, i32 %flags) %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret float %value } ; None of the components are uniform due to the lack of an inreg -define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i32 %numVals, i32 %flags) { +define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i64 %numVals, i32 %flags) { ; CHECK-LABEL: name: general_case_load_with_waterfall ; CHECK: bb.1 (%ir-block.0): ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; CHECK-NEXT: {{ $}} ; 
CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16 @@ -221,7 +433,75 @@ define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i3 ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + ; + ; CHECK45-LABEL: name: general_case_load_with_waterfall + ; CHECK45: bb.1 (%ir-block.0): + ; CHECK45-NEXT: successors: %bb.2(0x80000000) + ; CHECK45-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK45-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 25 + ; CHECK45-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; CHECK45-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK45-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK45-NEXT: [[V_LSHL_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_OR_B32_e64 [[COPY3]], [[COPY6]], [[COPY8]], implicit $exec + ; CHECK45-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 7 + ; CHECK45-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; CHECK45-NEXT: [[V_LSHRREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHRREV_B64_e64 [[COPY9]], [[REG_SEQUENCE1]], implicit $exec + ; CHECK45-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 12 + ; CHECK45-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_3]] + ; CHECK45-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[COPY2]], implicit $exec + ; CHECK45-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 28 + ; CHECK45-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_4]] + ; CHECK45-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[COPY5]], implicit $exec + ; CHECK45-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_LSHRREV_B64_e64_]].sub0 + ; CHECK45-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[V_LSHRREV_B64_e64_]].sub1 + ; CHECK45-NEXT: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 [[COPY13]], [[V_LSHLREV_B32_e64_]], [[V_LSHLREV_B32_e64_1]], implicit $exec + ; CHECK45-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[V_LSHL_OR_B32_e64_]], %subreg.sub1, [[COPY12]], %subreg.sub2, [[V_OR3_B32_e64_]], %subreg.sub3 + ; CHECK45-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; CHECK45-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32_xm0_xexec = S_MOV_B32 
$exec_lo + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: bb.2: + ; CHECK45-NEXT: successors: %bb.3(0x80000000) + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_LSHL_OR_B32_e64_]], implicit $exec + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY12]], implicit $exec + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_OR3_B32_e64_]], implicit $exec + ; CHECK45-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 + ; CHECK45-NEXT: [[COPY15:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub0_sub1 + ; CHECK45-NEXT: [[COPY16:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE2]].sub2_sub3 + ; CHECK45-NEXT: [[COPY17:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub0_sub1 + ; CHECK45-NEXT: [[COPY18:%[0-9]+]]:sreg_64 = COPY [[REG_SEQUENCE3]].sub2_sub3 + ; CHECK45-NEXT: [[V_CMP_EQ_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY17]], [[COPY15]], implicit $exec + ; CHECK45-NEXT: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[COPY18]], [[COPY16]], implicit $exec + ; CHECK45-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_EQ_U64_e64_]], [[V_CMP_EQ_U64_e64_1]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_AND_SAVEEXEC_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[S_AND_B32_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: bb.3: + ; CHECK45-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN [[COPY14]], [[REG_SEQUENCE3]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK45-NEXT: $exec_lo = S_XOR_B32_term $exec_lo, [[S_AND_SAVEEXEC_B32_]], implicit-def $scc + ; CHECK45-NEXT: SI_WATERFALL_LOOP %bb.2, implicit $exec + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: bb.4: + ; CHECK45-NEXT: successors: %bb.5(0x80000000) + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: $exec_lo = S_MOV_B32_term [[S_MOV_B32_5]] + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: bb.5: + ; CHECK45-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i64 %numVals, i32 %flags) %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret float %value } @@ -240,7 +520,23 @@ define amdgpu_ps float @read_buffer_fat_ptr_p0(ptr inreg %p) { ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.ptr, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr %p, i16 0, i32 0, i32 0) + ; + ; CHECK45-LABEL: name: read_buffer_fat_ptr_p0 + ; CHECK45: bb.1 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY 
$sgpr0 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 + ; CHECK45-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE1]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.ptr, align 1, addrspace 8) + ; CHECK45-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr %p, i16 0, i64 0, i32 0) %loaded = load float, ptr addrspace(7) %ptr ret float %loaded } @@ -259,14 +555,30 @@ define amdgpu_ps float @read_buffer_fat_ptr_p1(ptr addrspace(1) inreg %p) { ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.ptr, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0) + ; + ; CHECK45-LABEL: name: read_buffer_fat_ptr_p1 + ; CHECK45: bb.1 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 + ; CHECK45-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE1]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.ptr, align 1, addrspace 8) + ; CHECK45-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %p, i16 0, i64 0, i32 0) %loaded = load float, ptr addrspace(7) %ptr ret float %loaded } -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr nocapture readnone, i16, i32, i32) -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) -declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr nocapture readnone, i16, i32, i32) -declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr nocapture readnone, i16, i64, i32) 
+declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) nocapture readnone, i16, i64, i32) +declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr nocapture readnone, i16, i64, i32) +declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) nocapture readnone, i16, i64, i32) declare float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) declare float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/add_u64.ll b/llvm/test/CodeGen/AMDGPU/add_u64.ll index 0373027201378..22acedc4d6e25 100644 --- a/llvm/test/CodeGen/AMDGPU/add_u64.ll +++ b/llvm/test/CodeGen/AMDGPU/add_u64.ll @@ -109,7 +109,7 @@ define amdgpu_ps <2 x float> @test_add_u64_v_64bit_imm(i64 %a) { ; ; GFX1250-LABEL: test_add_u64_v_64bit_imm: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0x13b9ac9ff), v[0:1] +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 0x13b9ac9ff, v[0:1] ; GFX1250-NEXT: ; return to shader part epilog %add = add i64 %a, 5294967295 %ret = bitcast i64 %add to <2 x float> diff --git a/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll b/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll index e65f401bcf68a..7fcb29d367006 100644 --- a/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll +++ b/llvm/test/CodeGen/AMDGPU/alignbit-pat.ll @@ -1,11 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s -; GCN-LABEL: {{^}}alignbit_shr_pat: -; GCN-DAG: s_load_dword s[[SHR:[0-9]+]] -; GCN-DAG: load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] -; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], s[[SHR]] - define amdgpu_kernel void @alignbit_shr_pat(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) { +; GCN-LABEL: alignbit_shr_pat: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: s_and_b32 s0, s8, 31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], s0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm bb: %tmp = load i64, ptr addrspace(1) %arg, align 8 %tmp3 = and i32 %arg2, 31 @@ -16,12 +29,24 @@ bb: ret void } -; GCN-LABEL: {{^}}alignbit_shr_pat_v: -; GCN-DAG: load_dword v[[SHR:[0-9]+]], -; GCN-DAG: load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] -; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], v[[SHR]] - define amdgpu_kernel void @alignbit_shr_pat_v(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) { +; GCN-LABEL: alignbit_shr_pat_v: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_alignbit_b32 v0, v4, v3, v0 +; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm bb: %tid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep1 = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %tid @@ -36,12 +61,24 @@ bb: ret void } -; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and30: -; Negative test, wrong constant -; GCN: v_lshr_b64 -; GCN-NOT: v_alignbit_b32 - define amdgpu_kernel void @alignbit_shr_pat_wrong_and30(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) { +; GCN-LABEL: alignbit_shr_pat_wrong_and30: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: s_and_b32 s0, s8, 30 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], s0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm bb: %tmp = load i64, ptr addrspace(1) %arg, align 8 %tmp3 = and i32 %arg2, 30 @@ -52,12 +89,23 @@ bb: ret void } -; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and63: -; Negative test, wrong constant -; GCN: v_lshr_b64 -; GCN-NOT: v_alignbit_b32 - define amdgpu_kernel void @alignbit_shr_pat_wrong_and63(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, i32 %arg2) { +; GCN-LABEL: alignbit_shr_pat_wrong_and63: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], s8 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm bb: %tmp = load i64, ptr addrspace(1) %arg, align 8 %tmp3 = and i32 %arg2, 63 @@ -68,11 +116,22 @@ bb: ret void } -; GCN-LABEL: {{^}}alignbit_shr_pat_const30: -; GCN: load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]] -; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], 30 - define amdgpu_kernel void @alignbit_shr_pat_const30(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) { +; GCN-LABEL: alignbit_shr_pat_const30: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], 30 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm bb: %tmp = load i64, ptr addrspace(1) %arg, align 8 %tmp5 = lshr i64 %tmp, 30 @@ -81,12 +140,22 @@ bb: ret void } -; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_const33: -; Negative test, shift amount more than 31 -; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} -; GCN-NOT: v_alignbit_b32 - define amdgpu_kernel void @alignbit_shr_pat_wrong_const33(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) { +; GCN-LABEL: alignbit_shr_pat_wrong_const33: +; GCN: ; %bb.0: ; %bb 
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: s_mov_b32 s2, s6 +; GCN-NEXT: s_mov_b32 s3, s7 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm bb: %tmp = load i64, ptr addrspace(1) %arg, align 8 %tmp5 = lshr i64 %tmp, 33 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index be08c4e33f072..815b9f294be8f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -7526,831 +7526,1167 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v32i32_to_v128i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v41, s30, 0 -; SI-NEXT: v_writelane_b32 v41, s31, 1 -; SI-NEXT: v_writelane_b32 v41, s34, 2 -; SI-NEXT: v_writelane_b32 v41, s35, 3 -; SI-NEXT: v_writelane_b32 v41, s36, 4 -; SI-NEXT: v_writelane_b32 v41, s37, 5 -; SI-NEXT: v_writelane_b32 v41, s38, 6 -; SI-NEXT: v_writelane_b32 v41, s39, 7 -; SI-NEXT: v_writelane_b32 v41, s48, 8 -; SI-NEXT: v_writelane_b32 v41, s49, 9 -; SI-NEXT: v_writelane_b32 v41, s50, 10 -; SI-NEXT: v_writelane_b32 v41, s51, 11 -; SI-NEXT: v_writelane_b32 v41, s52, 12 -; SI-NEXT: v_writelane_b32 v41, s53, 13 -; SI-NEXT: v_writelane_b32 v41, s54, 14 -; SI-NEXT: v_writelane_b32 v41, s55, 15 -; SI-NEXT: v_writelane_b32 v41, s64, 16 -; SI-NEXT: v_writelane_b32 v41, s65, 17 -; SI-NEXT: v_writelane_b32 v41, s66, 18 -; SI-NEXT: v_writelane_b32 v41, s67, 19 -; SI-NEXT: v_writelane_b32 v41, s68, 20 -; SI-NEXT: v_writelane_b32 v41, s69, 21 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_writelane_b32 v20, s49, 9 +; SI-NEXT: v_writelane_b32 v20, s50, 10 +; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_writelane_b32 v20, s52, 12 +; SI-NEXT: v_writelane_b32 v20, s53, 13 +; SI-NEXT: v_writelane_b32 v20, s54, 14 +; SI-NEXT: v_writelane_b32 v20, s55, 15 +; SI-NEXT: v_writelane_b32 v20, s64, 16 +; SI-NEXT: v_writelane_b32 v20, s65, 17 +; SI-NEXT: v_writelane_b32 v20, s66, 18 +; SI-NEXT: v_writelane_b32 v20, s67, 19 +; SI-NEXT: v_writelane_b32 v20, s68, 20 +; SI-NEXT: v_writelane_b32 v20, s69, 21 +; SI-NEXT: v_writelane_b32 v20, s70, 22 +; SI-NEXT: v_writelane_b32 v20, s71, 23 +; SI-NEXT: v_writelane_b32 v20, s80, 24 +; SI-NEXT: v_writelane_b32 v20, s81, 25 +; SI-NEXT: v_writelane_b32 v20, s82, 26 +; SI-NEXT: 
v_writelane_b32 v20, s83, 27 +; SI-NEXT: v_writelane_b32 v20, s84, 28 +; SI-NEXT: v_writelane_b32 v20, s85, 29 +; SI-NEXT: v_writelane_b32 v20, s86, 30 +; SI-NEXT: v_writelane_b32 v20, s87, 31 +; SI-NEXT: v_writelane_b32 v20, s96, 32 +; SI-NEXT: v_writelane_b32 v20, s97, 33 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_writelane_b32 v41, s70, 22 -; SI-NEXT: v_readfirstlane_b32 s47, v1 -; SI-NEXT: v_readfirstlane_b32 s46, v2 -; SI-NEXT: v_readfirstlane_b32 s45, v3 -; SI-NEXT: v_readfirstlane_b32 s44, v4 -; SI-NEXT: v_readfirstlane_b32 s43, v5 -; SI-NEXT: v_readfirstlane_b32 s42, v6 -; SI-NEXT: v_readfirstlane_b32 s41, v7 -; SI-NEXT: v_readfirstlane_b32 s40, v8 -; SI-NEXT: v_readfirstlane_b32 s15, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s13, v11 -; SI-NEXT: v_readfirstlane_b32 s12, v12 -; SI-NEXT: v_readfirstlane_b32 s11, v13 -; SI-NEXT: v_readfirstlane_b32 s10, v14 -; SI-NEXT: v_readfirstlane_b32 s9, v15 -; SI-NEXT: v_readfirstlane_b32 s8, v16 -; SI-NEXT: v_readfirstlane_b32 s7, v17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v18 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_writelane_b32 v41, s71, 23 +; SI-NEXT: v_writelane_b32 v20, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s44, v1 +; SI-NEXT: v_readfirstlane_b32 s45, v2 +; SI-NEXT: v_readfirstlane_b32 s42, v3 +; SI-NEXT: v_readfirstlane_b32 s43, v4 +; SI-NEXT: v_readfirstlane_b32 s40, v5 +; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_and_b64 s[46:47], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: v_writelane_b32 v20, s99, 35 +; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane +; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v22, s45 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: v_mov_b32_e32 v9, s11 -; SI-NEXT: v_mov_b32_e32 v12, s13 -; SI-NEXT: v_mov_b32_e32 v15, s15 -; SI-NEXT: v_mov_b32_e32 v18, s41 -; SI-NEXT: v_mov_b32_e32 v21, s43 -; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 -; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 -; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 -; SI-NEXT: v_mov_b32_e32 v22, s47 -; SI-NEXT: v_mov_b32_e32 v23, s28 -; SI-NEXT: v_mov_b32_e32 v29, s26 -; SI-NEXT: v_mov_b32_e32 v35, s24 -; SI-NEXT: v_mov_b32_e32 v39, s22 -; SI-NEXT: v_mov_b32_e32 v50, s20 -; SI-NEXT: v_mov_b32_e32 v53, s18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 -; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 -; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 -; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 -; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s10, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 -; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 -; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 -; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 -; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 
-; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 -; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 -; SI-NEXT: v_alignbit_b32 v16, s40, v18, 24 -; SI-NEXT: v_alignbit_b32 v17, s40, v18, 16 -; SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 -; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 -; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 -; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 -; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 -; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 -; SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 -; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 -; SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 -; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 -; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 -; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 -; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 -; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 -; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 -; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 -; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 -; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 -; SI-NEXT: v_alignbit_b32 v39, s23, v39, 8 -; SI-NEXT: v_alignbit_b32 v48, s21, v50, 24 -; SI-NEXT: v_alignbit_b32 v49, s21, v50, 16 -; SI-NEXT: v_alignbit_b32 v50, s21, v50, 8 -; SI-NEXT: v_alignbit_b32 v51, s19, v53, 24 -; SI-NEXT: v_alignbit_b32 v52, s19, v53, 16 -; SI-NEXT: v_alignbit_b32 v53, s19, v53, 8 -; SI-NEXT: v_alignbit_b32 v54, s17, v40, 24 -; SI-NEXT: v_alignbit_b32 v55, s17, v40, 16 -; SI-NEXT: v_alignbit_b32 v40, s17, v40, 8 -; SI-NEXT: s_lshr_b32 s56, s6, 24 -; SI-NEXT: s_lshr_b32 s57, s6, 16 -; SI-NEXT: s_lshr_b32 s58, s6, 8 -; SI-NEXT: s_lshr_b32 s59, s8, 24 -; SI-NEXT: s_lshr_b32 s60, s8, 16 -; SI-NEXT: s_lshr_b32 s61, s8, 8 -; SI-NEXT: s_lshr_b32 s62, s10, 24 -; SI-NEXT: s_lshr_b32 s63, s10, 16 -; SI-NEXT: s_lshr_b32 s72, s10, 8 -; SI-NEXT: s_lshr_b32 s73, s12, 24 -; SI-NEXT: s_lshr_b32 s74, s12, 16 -; SI-NEXT: s_lshr_b32 s75, s12, 8 -; SI-NEXT: s_lshr_b32 s76, s14, 24 -; SI-NEXT: s_lshr_b32 s77, s14, 16 -; SI-NEXT: s_lshr_b32 s78, s14, 8 -; SI-NEXT: s_lshr_b32 s79, s40, 24 -; SI-NEXT: s_lshr_b32 s88, s40, 16 -; SI-NEXT: s_lshr_b32 s89, s40, 8 -; SI-NEXT: s_lshr_b32 s90, s42, 24 -; SI-NEXT: s_lshr_b32 s91, s42, 16 -; SI-NEXT: s_lshr_b32 s92, s42, 8 -; SI-NEXT: s_lshr_b32 s93, s44, 24 -; SI-NEXT: s_lshr_b32 s94, s44, 16 -; SI-NEXT: s_lshr_b32 s95, s44, 8 -; SI-NEXT: s_lshr_b32 s30, s46, 24 -; SI-NEXT: s_lshr_b32 s31, s46, 16 -; SI-NEXT: s_lshr_b32 s34, s46, 8 -; SI-NEXT: s_lshr_b32 s35, s29, 24 -; SI-NEXT: s_lshr_b32 s36, s29, 16 -; SI-NEXT: s_lshr_b32 s37, s29, 8 -; SI-NEXT: s_lshr_b32 s38, s27, 24 -; SI-NEXT: s_lshr_b32 s39, s27, 16 -; SI-NEXT: s_lshr_b32 s48, s27, 8 -; SI-NEXT: s_lshr_b32 s49, s25, 24 -; SI-NEXT: s_lshr_b32 s50, s25, 16 -; SI-NEXT: s_lshr_b32 s51, s25, 8 -; SI-NEXT: s_lshr_b32 s52, s23, 24 -; SI-NEXT: s_lshr_b32 s53, s23, 16 -; SI-NEXT: s_lshr_b32 s54, s23, 8 -; SI-NEXT: s_lshr_b32 s55, s21, 24 -; SI-NEXT: s_lshr_b32 s64, s21, 16 -; SI-NEXT: s_lshr_b32 s65, s21, 8 -; SI-NEXT: s_lshr_b32 s66, s19, 24 -; SI-NEXT: s_lshr_b32 s67, s19, 16 -; SI-NEXT: s_lshr_b32 s68, s19, 8 -; SI-NEXT: s_lshr_b32 s69, s17, 24 -; SI-NEXT: s_lshr_b32 s70, s17, 16 -; SI-NEXT: s_lshr_b32 s71, s17, 8 +; SI-NEXT: s_lshr_b32 s46, s5, 24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v22, s46, 42 +; SI-NEXT: s_lshr_b32 s46, s5, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 43 +; SI-NEXT: s_lshr_b32 s46, s5, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 44 +; SI-NEXT: s_lshr_b32 s46, s7, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 45 +; SI-NEXT: s_lshr_b32 s46, s7, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 46 +; 
SI-NEXT: s_lshr_b32 s46, s7, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 47 +; SI-NEXT: s_lshr_b32 s46, s9, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 48 +; SI-NEXT: s_lshr_b32 s46, s9, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 49 +; SI-NEXT: s_lshr_b32 s46, s11, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 50 +; SI-NEXT: s_lshr_b32 s46, s11, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 51 +; SI-NEXT: s_lshr_b32 s46, s11, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 52 +; SI-NEXT: s_lshr_b32 s46, s13, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 53 +; SI-NEXT: s_lshr_b32 s46, s13, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 54 +; SI-NEXT: s_lshr_b32 s46, s13, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 55 +; SI-NEXT: s_lshr_b32 s46, s15, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 56 +; SI-NEXT: s_lshr_b32 s46, s15, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 57 +; SI-NEXT: s_lshr_b32 s46, s15, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 58 +; SI-NEXT: s_lshr_b32 s46, s41, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 59 +; SI-NEXT: s_lshr_b32 s46, s41, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 60 +; SI-NEXT: s_lshr_b32 s46, s41, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 61 +; SI-NEXT: s_lshr_b32 s46, s43, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 62 +; SI-NEXT: s_lshr_b32 s46, s43, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 63 +; SI-NEXT: s_lshr_b32 s46, s43, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 0 +; SI-NEXT: s_lshr_b32 s46, s45, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 1 +; SI-NEXT: s_lshr_b32 s46, s45, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 2 +; SI-NEXT: s_lshr_b32 s46, s45, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 3 +; SI-NEXT: s_lshr_b32 s46, s29, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 4 +; SI-NEXT: s_lshr_b32 s46, s29, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 5 +; SI-NEXT: s_lshr_b32 s46, s29, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 6 +; SI-NEXT: s_lshr_b32 s46, s27, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 7 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 8 +; SI-NEXT: s_lshr_b32 s46, s27, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 9 +; SI-NEXT: s_lshr_b32 s46, s25, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 10 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 11 +; SI-NEXT: s_lshr_b32 s46, s25, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 12 +; SI-NEXT: s_lshr_b32 s46, s23, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 13 +; SI-NEXT: s_lshr_b32 s46, s23, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 14 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 15 +; SI-NEXT: s_lshr_b32 s46, s21, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 17 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 40 +; SI-NEXT: v_writelane_b32 v22, s47, 41 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 38 +; SI-NEXT: v_writelane_b32 v22, s47, 39 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 36 +; SI-NEXT: v_writelane_b32 v22, s47, 37 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 34 +; SI-NEXT: v_writelane_b32 v22, s47, 35 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 32 +; SI-NEXT: v_writelane_b32 v22, s47, 33 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 30 +; SI-NEXT: v_writelane_b32 v22, s47, 31 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 28 +; SI-NEXT: v_writelane_b32 v22, s47, 29 +; 
SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 26 +; SI-NEXT: v_writelane_b32 v22, s47, 27 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 24 +; SI-NEXT: v_writelane_b32 v22, s47, 25 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 22 +; SI-NEXT: v_writelane_b32 v22, s47, 23 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 20 +; SI-NEXT: v_writelane_b32 v22, s47, 21 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 18 +; SI-NEXT: v_writelane_b32 v22, s47, 19 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 16 +; SI-NEXT: v_writelane_b32 v22, s47, 17 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 14 +; SI-NEXT: v_writelane_b32 v22, s47, 15 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 12 +; SI-NEXT: v_writelane_b32 v22, s47, 13 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 10 +; SI-NEXT: v_writelane_b32 v22, s47, 11 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 8 +; SI-NEXT: v_writelane_b32 v22, s47, 9 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 6 +; SI-NEXT: v_writelane_b32 v22, s47, 7 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 4 +; SI-NEXT: v_writelane_b32 v22, s47, 5 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 2 +; SI-NEXT: v_writelane_b32 v22, s47, 3 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 0 +; SI-NEXT: s_lshr_b32 s50, s9, 24 +; SI-NEXT: s_lshr_b32 s51, s21, 8 +; SI-NEXT: s_lshr_b32 s48, s19, 24 +; SI-NEXT: s_lshr_b32 s52, s19, 16 +; SI-NEXT: s_lshr_b32 s53, s19, 8 +; SI-NEXT: s_lshr_b32 s54, s17, 24 +; SI-NEXT: s_lshr_b32 s55, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s17, 8 +; SI-NEXT: v_writelane_b32 v22, s47, 1 +; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[84:85], s[28:29], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[96:97], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[98:99], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 40 +; SI-NEXT: v_writelane_b32 v22, s47, 41 +; 
SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 38 +; SI-NEXT: v_writelane_b32 v22, s47, 39 +; SI-NEXT: s_lshr_b32 s46, s5, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 42 +; SI-NEXT: s_lshr_b32 s46, s5, 16 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: v_writelane_b32 v22, s46, 43 +; SI-NEXT: s_lshr_b32 s46, s5, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 44 +; SI-NEXT: s_lshr_b32 s46, s7, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 45 +; SI-NEXT: s_lshr_b32 s46, s7, 16 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: v_writelane_b32 v22, s46, 46 +; SI-NEXT: s_lshr_b32 s46, s7, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 47 +; SI-NEXT: s_lshr_b32 s46, s9, 16 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: v_writelane_b32 v22, s46, 48 +; SI-NEXT: s_lshr_b32 s46, s9, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 49 +; SI-NEXT: s_lshr_b32 s46, s11, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 50 +; SI-NEXT: s_lshr_b32 s46, s11, 16 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: v_writelane_b32 v22, s46, 51 +; SI-NEXT: s_lshr_b32 s46, s11, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 52 +; SI-NEXT: s_lshr_b32 s46, s13, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 53 +; SI-NEXT: s_lshr_b32 s46, s13, 16 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: v_writelane_b32 v22, s46, 54 +; SI-NEXT: s_lshr_b32 s46, s13, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 55 +; SI-NEXT: s_lshr_b32 s46, s15, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 56 +; SI-NEXT: s_lshr_b32 s46, s15, 16 +; SI-NEXT: s_add_i32 s41, s41, 3 +; SI-NEXT: v_writelane_b32 v22, s46, 57 +; SI-NEXT: s_lshr_b32 s46, s15, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 58 +; SI-NEXT: s_lshr_b32 s46, s41, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 59 +; SI-NEXT: s_lshr_b32 s46, s41, 16 +; SI-NEXT: s_add_i32 s43, s43, 3 +; SI-NEXT: v_writelane_b32 v22, s46, 60 +; SI-NEXT: s_lshr_b32 s46, s41, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 61 +; SI-NEXT: s_lshr_b32 s46, s43, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 62 +; SI-NEXT: s_lshr_b32 s46, s43, 16 ; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: v_writelane_b32 v22, s46, 63 +; SI-NEXT: s_lshr_b32 s46, s43, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 0 +; SI-NEXT: s_lshr_b32 s46, s45, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 1 +; SI-NEXT: s_lshr_b32 s46, s45, 16 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: v_writelane_b32 v21, s46, 2 +; SI-NEXT: s_lshr_b32 s46, s45, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 3 +; SI-NEXT: s_lshr_b32 s46, s29, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 4 +; SI-NEXT: s_lshr_b32 s46, s29, 16 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: v_writelane_b32 v21, s46, 5 +; SI-NEXT: s_lshr_b32 s46, s29, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 6 +; SI-NEXT: s_lshr_b32 s46, s27, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 7 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: v_writelane_b32 v21, s46, 8 +; SI-NEXT: s_lshr_b32 s46, s27, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 9 +; SI-NEXT: s_lshr_b32 s46, s25, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 10 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: v_writelane_b32 v21, s46, 11 +; SI-NEXT: s_lshr_b32 s46, s25, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 12 +; SI-NEXT: s_lshr_b32 s46, s23, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 13 +; SI-NEXT: s_lshr_b32 s46, s23, 16 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: v_writelane_b32 v21, s46, 14 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 15 +; SI-NEXT: s_lshr_b32 s46, s21, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 16 +; SI-NEXT: s_lshr_b32 
s46, s21, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 17 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 8 +; SI-NEXT: s_add_i32 s6, s6, 3 +; SI-NEXT: v_writelane_b32 v22, s46, 36 +; SI-NEXT: v_writelane_b32 v22, s47, 37 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 34 +; SI-NEXT: v_writelane_b32 v22, s47, 35 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 32 +; SI-NEXT: v_writelane_b32 v22, s47, 33 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 8 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: v_writelane_b32 v22, s46, 30 +; SI-NEXT: v_writelane_b32 v22, s47, 31 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 28 +; SI-NEXT: v_writelane_b32 v22, s47, 29 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 26 +; SI-NEXT: v_writelane_b32 v22, s47, 27 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: v_writelane_b32 v22, s46, 24 +; SI-NEXT: v_writelane_b32 v22, s47, 25 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 22 +; SI-NEXT: v_writelane_b32 v22, s47, 23 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 20 +; SI-NEXT: v_writelane_b32 v22, s47, 21 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 8 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: v_writelane_b32 v22, s46, 18 +; SI-NEXT: v_writelane_b32 v22, s47, 19 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 16 +; SI-NEXT: v_writelane_b32 v22, s47, 17 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 14 +; SI-NEXT: v_writelane_b32 v22, s47, 15 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 8 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: v_writelane_b32 v22, s46, 12 +; SI-NEXT: v_writelane_b32 v22, s47, 13 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 10 +; SI-NEXT: v_writelane_b32 v22, s47, 11 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 8 +; SI-NEXT: v_writelane_b32 v22, s47, 9 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 8 +; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: v_writelane_b32 v22, s46, 6 +; SI-NEXT: v_writelane_b32 v22, s47, 7 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 4 +; SI-NEXT: v_writelane_b32 v22, s47, 5 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 2 +; SI-NEXT: v_writelane_b32 v22, s47, 3 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 8 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s47, s47, 3 ; SI-NEXT: s_add_i32 s44, s44, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: v_mov_b32_e32 v22, s45 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s46, s46, 3 ; SI-NEXT: s_add_i32 s42, s42, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 -; 
SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: v_mov_b32_e32 v9, s11 -; SI-NEXT: v_mov_b32_e32 v12, s13 -; SI-NEXT: v_mov_b32_e32 v15, s15 -; SI-NEXT: v_mov_b32_e32 v18, s41 -; SI-NEXT: v_mov_b32_e32 v21, s43 -; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 -; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 -; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 -; SI-NEXT: v_mov_b32_e32 v22, s47 -; SI-NEXT: v_mov_b32_e32 v23, s28 -; SI-NEXT: v_mov_b32_e32 v29, s26 -; SI-NEXT: v_mov_b32_e32 v35, s24 -; SI-NEXT: v_mov_b32_e32 v39, s22 -; SI-NEXT: v_mov_b32_e32 v50, s20 -; SI-NEXT: v_mov_b32_e32 v53, s18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 -; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 -; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 -; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 -; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s10, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 -; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 -; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 -; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 -; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 -; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 -; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 -; SI-NEXT: v_alignbit_b32 v16, s40, v18, 24 -; SI-NEXT: v_alignbit_b32 v17, s40, v18, 16 -; SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 -; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 -; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 -; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 -; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 -; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 -; SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 -; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 -; SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 -; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 -; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 -; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 -; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 -; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 -; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 -; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 -; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 -; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 -; SI-NEXT: v_alignbit_b32 v39, s23, v39, 8 -; SI-NEXT: v_alignbit_b32 v48, s21, v50, 24 -; SI-NEXT: v_alignbit_b32 v49, s21, v50, 16 -; SI-NEXT: v_alignbit_b32 v50, s21, v50, 8 -; SI-NEXT: v_alignbit_b32 v51, s19, v53, 24 -; SI-NEXT: v_alignbit_b32 v52, s19, v53, 16 -; SI-NEXT: v_alignbit_b32 v53, s19, v53, 8 -; SI-NEXT: v_alignbit_b32 v54, s17, v40, 24 -; SI-NEXT: v_alignbit_b32 v55, s17, v40, 16 -; SI-NEXT: v_alignbit_b32 v40, s17, v40, 8 -; SI-NEXT: s_lshr_b32 s56, s6, 24 -; SI-NEXT: s_lshr_b32 s57, s6, 16 -; SI-NEXT: s_lshr_b32 s58, s6, 8 -; SI-NEXT: s_lshr_b32 s59, s8, 24 -; SI-NEXT: s_lshr_b32 s60, s8, 16 -; SI-NEXT: s_lshr_b32 s61, s8, 8 -; SI-NEXT: s_lshr_b32 s62, s10, 24 -; SI-NEXT: s_lshr_b32 s63, s10, 16 -; SI-NEXT: s_lshr_b32 s72, s10, 8 -; SI-NEXT: s_lshr_b32 s73, s12, 24 -; SI-NEXT: s_lshr_b32 s74, s12, 16 -; SI-NEXT: s_lshr_b32 s75, s12, 8 -; SI-NEXT: s_lshr_b32 s76, s14, 24 -; SI-NEXT: s_lshr_b32 s77, s14, 16 -; SI-NEXT: s_lshr_b32 s78, s14, 8 -; SI-NEXT: s_lshr_b32 s79, s40, 24 -; SI-NEXT: s_lshr_b32 s88, s40, 16 -; SI-NEXT: s_lshr_b32 s89, s40, 8 -; SI-NEXT: s_lshr_b32 s90, s42, 24 -; SI-NEXT: s_lshr_b32 s91, s42, 16 -; SI-NEXT: 
s_lshr_b32 s92, s42, 8 -; SI-NEXT: s_lshr_b32 s93, s44, 24 -; SI-NEXT: s_lshr_b32 s94, s44, 16 -; SI-NEXT: s_lshr_b32 s95, s44, 8 -; SI-NEXT: s_lshr_b32 s30, s46, 24 -; SI-NEXT: s_lshr_b32 s31, s46, 16 -; SI-NEXT: s_lshr_b32 s34, s46, 8 -; SI-NEXT: s_lshr_b32 s35, s29, 24 -; SI-NEXT: s_lshr_b32 s36, s29, 16 -; SI-NEXT: s_lshr_b32 s37, s29, 8 -; SI-NEXT: s_lshr_b32 s38, s27, 24 -; SI-NEXT: s_lshr_b32 s39, s27, 16 -; SI-NEXT: s_lshr_b32 s48, s27, 8 -; SI-NEXT: s_lshr_b32 s49, s25, 24 -; SI-NEXT: s_lshr_b32 s50, s25, 16 -; SI-NEXT: s_lshr_b32 s51, s25, 8 -; SI-NEXT: s_lshr_b32 s52, s23, 24 -; SI-NEXT: s_lshr_b32 s53, s23, 16 -; SI-NEXT: s_lshr_b32 s54, s23, 8 -; SI-NEXT: s_lshr_b32 s55, s21, 24 -; SI-NEXT: s_lshr_b32 s64, s21, 16 -; SI-NEXT: s_lshr_b32 s65, s21, 8 -; SI-NEXT: s_lshr_b32 s66, s19, 24 -; SI-NEXT: s_lshr_b32 s67, s19, 16 -; SI-NEXT: s_lshr_b32 s68, s19, 8 -; SI-NEXT: s_lshr_b32 s69, s17, 24 -; SI-NEXT: s_lshr_b32 s70, s17, 16 -; SI-NEXT: s_lshr_b32 s71, s17, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 0 +; SI-NEXT: s_lshr_b32 s50, s9, 24 +; SI-NEXT: s_lshr_b32 s51, s21, 8 +; SI-NEXT: s_lshr_b32 s48, s19, 24 +; SI-NEXT: s_lshr_b32 s52, s19, 16 +; SI-NEXT: s_lshr_b32 s53, s19, 8 +; SI-NEXT: s_lshr_b32 s54, s17, 24 +; SI-NEXT: s_lshr_b32 s55, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s17, 8 +; SI-NEXT: v_writelane_b32 v22, s47, 1 +; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[84:85], s[28:29], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[96:97], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[98:99], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 -; SI-NEXT: v_or_b32_e32 v40, s4, v40 -; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s71, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s70, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s69, 24 -; SI-NEXT: v_and_b32_e32 v55, 0xff, v55 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v54 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_or_b32_e32 v54, v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v53 -; SI-NEXT: v_or_b32_e32 v53, s4, v53 -; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s5, s68, 8 -; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, 
s67, 0xff -; SI-NEXT: v_and_b32_e32 v40, 0xffff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v51 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s66, 24 -; SI-NEXT: v_or_b32_e32 v54, v40, v54 -; SI-NEXT: v_and_b32_e32 v53, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v51, v51, v52 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v54, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v51, v53, v51 -; SI-NEXT: v_add_i32_e32 v52, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v55, v54, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v51, v52, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v52, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v50 -; SI-NEXT: v_or_b32_e32 v50, s4, v50 -; SI-NEXT: s_and_b32 s4, s21, 0xff -; SI-NEXT: s_lshl_b32 s5, s65, 8 -; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s64, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v48 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s55, 24 -; SI-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v48, v48, v49 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_lshl_b32 s47, s38, 8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_or_b32 s16, s16, s47 +; SI-NEXT: s_and_b32 s47, s36, 0xff +; SI-NEXT: s_lshl_b32 s57, s34, 24 +; SI-NEXT: s_lshl_b32 s47, s47, 16 +; SI-NEXT: s_or_b32 s47, s57, s47 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s47 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xff +; SI-NEXT: s_lshl_b32 s17, s49, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s55, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s47, s54, 24 +; SI-NEXT: s_or_b32 s17, s47, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s16, s30, 8 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s94, 0xff +; SI-NEXT: s_lshl_b32 s18, s92, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: s_and_b32 s16, s19, 0xff +; SI-NEXT: s_lshl_b32 s17, s53, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s52, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s48, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: s_lshl_b32 s16, s90, 8 +; SI-NEXT: s_and_b32 s17, s20, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s88, 0xff +; SI-NEXT: s_lshl_b32 s18, s78, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: s_and_b32 s16, s21, 0xff +; SI-NEXT: s_lshl_b32 s17, s51, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 17 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 16 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 
0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: s_lshl_b32 s16, s76, 8 +; SI-NEXT: s_and_b32 s17, s22, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s74, 0xff +; SI-NEXT: s_lshl_b32 s18, s72, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 15 +; SI-NEXT: v_mov_b32_e32 v7, s16 +; SI-NEXT: s_and_b32 s16, s23, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 14 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 13 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: s_lshl_b32 s16, s62, 8 +; SI-NEXT: s_and_b32 s17, s24, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s60, 0xff +; SI-NEXT: s_lshl_b32 s18, s58, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 12 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: s_and_b32 s16, s25, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 11 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 10 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: s_lshl_b32 s16, s56, 8 +; SI-NEXT: s_and_b32 s17, s26, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s46, 0xff +; SI-NEXT: s_lshl_b32 s18, s98, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 9 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: s_and_b32 s16, s27, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 8 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 7 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: s_lshl_b32 s16, s96, 8 +; SI-NEXT: s_and_b32 s17, s28, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s86, 0xff +; SI-NEXT: s_lshl_b32 s18, s84, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 6 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: s_and_b32 s16, s29, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 5 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 4 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: s_lshl_b32 s16, s82, 8 +; SI-NEXT: s_and_b32 s17, s44, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: 
s_and_b32 s17, s80, 0xff +; SI-NEXT: s_lshl_b32 s18, s70, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 3 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: s_and_b32 s16, s45, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 2 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v48, v50, v48 -; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v52, v51, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v48, v49, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v49, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v39 -; SI-NEXT: v_or_b32_e32 v39, s4, v39 -; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s54, 8 -; SI-NEXT: v_and_b32_e32 v38, 0xff, v38 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s53, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v37 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s52, 24 -; SI-NEXT: v_and_b32_e32 v39, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v37, v37, v38 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: s_lshl_b32 s16, s68, 8 +; SI-NEXT: s_and_b32 s17, s42, 0xff +; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s66, 0xff +; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_lshl_b32 s18, s64, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: v_readlane_b32 s17, v21, 0 +; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s43, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v22, 63 +; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 62 +; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: 
s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_readlane_b32 s18, v22, 0 +; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_readlane_b32 s19, v22, 1 +; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_lshl_b32 s17, s18, 8 +; SI-NEXT: v_readlane_b32 s18, v22, 2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v48, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v37, v39, v37 -; SI-NEXT: v_add_i32_e32 v38, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v49, v48, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v37, v38, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v38, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; SI-NEXT: v_or_b32_e32 v35, s4, v35 -; SI-NEXT: s_and_b32 s4, s25, 0xff -; SI-NEXT: s_lshl_b32 s5, s51, 8 -; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s50, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s49, 24 -; SI-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xff +; SI-NEXT: v_readlane_b32 s19, v22, 3 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 4 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v22, 61 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v33, v35, v33 -; SI-NEXT: v_add_i32_e32 v34, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v38, v37, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v34, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; SI-NEXT: v_or_b32_e32 v29, s4, v29 -; SI-NEXT: s_and_b32 s4, s27, 0xff -; SI-NEXT: s_lshl_b32 s5, s48, 8 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s39, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s38, 24 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v22, 60 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 59 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: 
s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v34, v33, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v28, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; SI-NEXT: v_or_b32_e32 v23, s4, v23 -; SI-NEXT: s_and_b32 s4, s29, 0xff -; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_readlane_b32 s16, v22, 6 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: v_readlane_b32 s17, v22, 7 +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: v_readlane_b32 s19, v22, 5 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: v_readlane_b32 s16, v22, 8 +; SI-NEXT: v_readlane_b32 s17, v22, 9 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 10 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s18, 24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s36, 0xff -; SI-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v36 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s35, 24 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v22, v27, v22 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xff +; SI-NEXT: v_readlane_b32 s15, v22, 58 +; SI-NEXT: s_lshl_b32 s15, s15, 8 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_readlane_b32 s15, v22, 57 +; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: v_readlane_b32 s16, v22, 56 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_lshl_b32 s16, s16, 24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; SI-NEXT: v_mov_b32_e32 v23, s4 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s4, s47, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v32 -; SI-NEXT: v_or_b32_e32 v22, s4, v22 -; SI-NEXT: s_and_b32 s4, s46, 0xff -; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_readlane_b32 s14, v22, 12 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: v_readlane_b32 s15, v22, 13 +; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: v_readlane_b32 s14, v22, 14 +; SI-NEXT: v_readlane_b32 s15, v22, 15 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: v_readlane_b32 s16, v22, 16 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s16, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s12, s12, s14 +; 
SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v31 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s31, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v30 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s30, 24 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v23, v27, v23 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xff +; SI-NEXT: v_readlane_b32 s13, v22, 55 +; SI-NEXT: s_lshl_b32 s13, s13, 8 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: v_readlane_b32 s13, v22, 54 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: v_readlane_b32 s14, v22, 53 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s14, s14, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; SI-NEXT: v_mov_b32_e32 v23, s4 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v26 -; SI-NEXT: v_or_b32_e32 v22, s4, v22 -; SI-NEXT: s_and_b32 s4, s44, 0xff -; SI-NEXT: s_lshl_b32 s5, s95, 8 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: v_readlane_b32 s12, v22, 18 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: v_readlane_b32 s13, v22, 19 +; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: v_readlane_b32 s12, v22, 20 +; SI-NEXT: v_readlane_b32 s13, v22, 21 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: v_readlane_b32 s14, v22, 22 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s13, s14, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v25 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s94, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s93, 24 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v23, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; SI-NEXT: v_or_b32_e32 v21, s4, v21 -; SI-NEXT: s_and_b32 s4, s42, 0xff -; SI-NEXT: s_lshl_b32 s5, s92, 8 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s91, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s90, 24 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; 
SI-NEXT: s_and_b32 s10, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v22, 52 +; SI-NEXT: s_lshl_b32 s11, s11, 8 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_readlane_b32 s11, v22, 51 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: v_readlane_b32 s12, v22, 50 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s12, s12, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v20, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; SI-NEXT: v_or_b32_e32 v18, s4, v18 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_lshl_b32 s5, s89, 8 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s88, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s79, 24 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_readlane_b32 s10, v22, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s11, v22, 25 +; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: v_readlane_b32 s10, v22, 26 +; SI-NEXT: v_readlane_b32 s11, v22, 27 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: v_readlane_b32 s12, v22, 28 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s11, s12, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v17, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; SI-NEXT: v_or_b32_e32 v15, s4, v15 -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s5, s78, 8 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s77, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s14, s76, 24 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v22, 49 +; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: v_readlane_b32 s9, v22, 48 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s10, s50, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: 
buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s74, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s12, s73, 24 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_readlane_b32 s8, v22, 30 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: v_readlane_b32 s9, v22, 31 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: v_readlane_b32 s8, v22, 32 +; SI-NEXT: v_readlane_b32 s9, v22, 33 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s10, v22, 34 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s10, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x5c, v0 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s63, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s10, s62, 24 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s10, s5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v22, 47 +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_readlane_b32 s7, v22, 46 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: v_readlane_b32 s8, v22, 45 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x64, v0 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; 
SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s60, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s8, s59, 24 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_readlane_b32 s6, v22, 36 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s7, v22, 37 +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: v_readlane_b32 s6, v22, 38 +; SI-NEXT: v_readlane_b32 s7, v22, 39 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: v_readlane_b32 s8, v22, 40 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s8, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s58, 8 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xff +; SI-NEXT: v_readlane_b32 s5, v22, 44 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s57, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_readlane_b32 s5, v22, 43 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: v_readlane_b32 s6, v22, 42 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s56, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_lshl_b32 s6, s6, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x74, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_readlane_b32 s19, v22, 11 +; SI-NEXT: v_readlane_b32 s17, v22, 17 +; SI-NEXT: v_readlane_b32 s15, v22, 23 +; SI-NEXT: v_readlane_b32 s13, v22, 29 +; SI-NEXT: v_readlane_b32 s11, v22, 35 +; SI-NEXT: v_readlane_b32 s9, v22, 41 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s71, v41, 23 -; SI-NEXT: v_readlane_b32 s70, v41, 22 -; SI-NEXT: v_readlane_b32 s69, v41, 21 -; SI-NEXT: v_readlane_b32 s68, v41, 20 -; SI-NEXT: v_readlane_b32 s67, v41, 19 -; SI-NEXT: v_readlane_b32 s66, v41, 18 -; SI-NEXT: v_readlane_b32 s65, v41, 17 -; SI-NEXT: v_readlane_b32 s64, v41, 16 -; SI-NEXT: v_readlane_b32 s55, v41, 15 -; SI-NEXT: v_readlane_b32 s54, v41, 14 -; SI-NEXT: 
v_readlane_b32 s53, v41, 13 -; SI-NEXT: v_readlane_b32 s52, v41, 12 -; SI-NEXT: v_readlane_b32 s51, v41, 11 -; SI-NEXT: v_readlane_b32 s50, v41, 10 -; SI-NEXT: v_readlane_b32 s49, v41, 9 -; SI-NEXT: v_readlane_b32 s48, v41, 8 -; SI-NEXT: v_readlane_b32 s39, v41, 7 -; SI-NEXT: v_readlane_b32 s38, v41, 6 -; SI-NEXT: v_readlane_b32 s37, v41, 5 -; SI-NEXT: v_readlane_b32 s36, v41, 4 -; SI-NEXT: v_readlane_b32 s35, v41, 3 -; SI-NEXT: v_readlane_b32 s34, v41, 2 -; SI-NEXT: v_readlane_b32 s31, v41, 1 -; SI-NEXT: v_readlane_b32 s30, v41, 0 -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v20, 35 +; SI-NEXT: v_readlane_b32 s98, v20, 34 +; SI-NEXT: v_readlane_b32 s97, v20, 33 +; SI-NEXT: v_readlane_b32 s96, v20, 32 +; SI-NEXT: v_readlane_b32 s87, v20, 31 +; SI-NEXT: v_readlane_b32 s86, v20, 30 +; SI-NEXT: v_readlane_b32 s85, v20, 29 +; SI-NEXT: v_readlane_b32 s84, v20, 28 +; SI-NEXT: v_readlane_b32 s83, v20, 27 +; SI-NEXT: v_readlane_b32 s82, v20, 26 +; SI-NEXT: v_readlane_b32 s81, v20, 25 +; SI-NEXT: v_readlane_b32 s80, v20, 24 +; SI-NEXT: v_readlane_b32 s71, v20, 23 +; SI-NEXT: v_readlane_b32 s70, v20, 22 +; SI-NEXT: v_readlane_b32 s69, v20, 21 +; SI-NEXT: v_readlane_b32 s68, v20, 20 +; SI-NEXT: v_readlane_b32 s67, v20, 19 +; SI-NEXT: v_readlane_b32 s66, v20, 18 +; SI-NEXT: v_readlane_b32 s65, v20, 17 +; SI-NEXT: v_readlane_b32 s64, v20, 16 +; SI-NEXT: v_readlane_b32 s55, v20, 15 +; SI-NEXT: v_readlane_b32 s54, v20, 14 +; SI-NEXT: v_readlane_b32 s53, v20, 13 +; SI-NEXT: v_readlane_b32 s52, v20, 12 +; SI-NEXT: v_readlane_b32 s51, v20, 11 +; SI-NEXT: v_readlane_b32 s50, v20, 10 +; SI-NEXT: v_readlane_b32 s49, v20, 9 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v22, s50, 0 +; SI-NEXT: v_writelane_b32 v22, s51, 1 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 2 
+; SI-NEXT: v_writelane_b32 v22, s51, 3 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 4 +; SI-NEXT: v_writelane_b32 v22, s51, 5 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 6 +; SI-NEXT: v_writelane_b32 v22, s51, 7 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 8 +; SI-NEXT: v_writelane_b32 v22, s51, 9 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 10 +; SI-NEXT: v_writelane_b32 v22, s51, 11 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 12 +; SI-NEXT: v_writelane_b32 v22, s51, 13 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 14 +; SI-NEXT: v_writelane_b32 v22, s51, 15 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 16 +; SI-NEXT: v_writelane_b32 v22, s51, 17 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 18 +; SI-NEXT: v_writelane_b32 v22, s51, 19 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 20 +; SI-NEXT: v_writelane_b32 v22, s51, 21 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 22 +; SI-NEXT: v_writelane_b32 v22, s51, 23 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 24 +; SI-NEXT: v_writelane_b32 v22, s51, 25 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 26 +; SI-NEXT: v_writelane_b32 v22, s51, 27 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 28 +; SI-NEXT: v_writelane_b32 v22, s51, 29 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed 
$sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 30 +; SI-NEXT: v_writelane_b32 v22, s51, 31 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 32 +; SI-NEXT: v_writelane_b32 v22, s51, 33 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 34 +; SI-NEXT: v_writelane_b32 v22, s51, 35 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 36 +; SI-NEXT: v_writelane_b32 v22, s51, 37 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 38 +; SI-NEXT: v_writelane_b32 v22, s51, 39 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; kill: killed $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr31 ; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr95 ; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr93 ; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr91 ; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 
-; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s50, 40 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: v_writelane_b32 v22, s51, 41 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v32i32_to_v128i8_scalar: @@ -17296,8 +17632,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 @@ -17305,133 +17648,93 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 
+; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, 
v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v45 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v43 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v34 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 @@ -17440,31 +17743,31 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 @@ -17476,140 +17779,206 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:312 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v28, off, 
s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB15_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v30, 
v5 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v9 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -17618,306 +17987,277 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v0, 
0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:500 ; 4-byte Folded 
Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 -; SI-NEXT: v_mov_b32_e32 v43, v16 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt 
vmcnt(7) +; SI-NEXT: v_mov_b32_e32 v50, v16 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v16, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v17, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 -; SI-NEXT: v_mov_b32_e32 v55, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v18, v1 ; SI-NEXT: v_or_b32_e32 v18, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 -; SI-NEXT: v_mov_b32_e32 v44, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v19, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: v_mov_b32_e32 v61, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 +; SI-NEXT: v_mov_b32_e32 v54, v23 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v20, v1 ; SI-NEXT: v_or_b32_e32 v20, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v21, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 -; SI-NEXT: v_mov_b32_e32 v59, v24 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v22, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v45, v24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v34, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v23, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v24, v1 ; SI-NEXT: v_or_b32_e32 v24, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v1 
+; SI-NEXT: v_mov_b32_e32 v43, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v25, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 ; SI-NEXT: v_or_b32_e32 v26, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v37, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v27, v1 ; SI-NEXT: v_or_b32_e32 v27, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v28, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_mov_b32_e32 v36, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v29, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; 
SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v30, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v31, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v38, v63 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 @@ -17944,108 +18284,112 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB15_3 ; SI-NEXT: .LBB15_2: -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: .LBB15_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v35, v57 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB15_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 
0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, s4, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18054,17 +18398,17 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18074,15 +18418,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18092,15 +18436,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18110,15 +18454,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18128,15 +18472,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18146,15 +18490,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18164,15 +18508,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18181,16 +18525,17 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18200,15 +18545,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18218,84 +18563,79 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -18304,15 +18644,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -18321,15 +18661,15 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18339,9 +18679,9 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -18357,106 +18697,110 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: 
v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; 
SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -18464,14 +18808,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -18479,14 +18823,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -18516,7 +18860,7 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt 
vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v32i32_scalar: @@ -18538,113 +18882,115 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v44, off, 
s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 @@ -18653,29 +18999,28 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -18685,130 +19030,141 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v7 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 -; VI-NEXT: 
buffer_load_ushort v37, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:252 +; 
VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:704 ; 4-byte 
Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB15_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -18817,208 +19173,197 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v3, v7 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v29, v9 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v59, v0 -; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v1 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 
; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v37, v0 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v62, v0 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v1 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; 
VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_mov_b32_e32 v60, v0 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v34, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v59, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v54, v0 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v61 +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v0, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v41, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_or_b32_sdwa v0, v56, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v48, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v36, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v33, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v50, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v42 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v51, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v48, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v49, v51 +; VI-NEXT: v_mov_b32_e32 v40, v34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v57, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -19049,85 +19394,95 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB15_3 ; VI-NEXT: .LBB15_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_mov_b32_e32 v46, v61 -; VI-NEXT: v_mov_b32_e32 v47, v45 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_mov_b32_e32 v63, v42 -; VI-NEXT: v_mov_b32_e32 v51, v7 -; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_mov_b32_e32 v41, v33 +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: v_mov_b32_e32 v49, v51 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB15_3: ; %Flow +; VI-NEXT: v_mov_b32_e32 v51, v41 +; VI-NEXT: v_mov_b32_e32 v36, v44 +; VI-NEXT: v_mov_b32_e32 v53, v54 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v54, v60 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v44, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB15_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v0, s4, v0 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_add_i32 s20, s20, 3 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_add_i32 s24, s24, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff @@ -19136,26 +19491,25 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -19163,8 +19517,8 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -19176,9 +19530,9 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -19190,14 +19544,14 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -19205,280 +19559,280 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: 
v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: 
v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 
-; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: 
v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 
0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -19523,504 +19877,524 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; 
GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], 
s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:176 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX9-NEXT: s_waitcnt vmcnt(35) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; GFX9-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; GFX9-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: v_lshlrev_b32_e32 v41, 8, v41 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, 
off, s[0:3], s32 offset:288 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 -; GFX9-NEXT: 
buffer_load_ushort v63, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_waitcnt vmcnt(42) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], 
s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 +; 
GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], 
s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB15_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v38 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt 
vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v61, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v63, v57 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v47, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, 
s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v37, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v60 -; GFX9-NEXT: v_mov_b32_e32 v52, v56 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_mov_b32_e32 v34, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: 
v_mov_b32_e32 v51, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v53, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v45, v62 +; GFX9-NEXT: v_mov_b32_e32 v46, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v53 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -20051,32 +20425,48 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB15_3 ; GFX9-NEXT: .LBB15_2: -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_mov_b32_e32 v63, v57 -; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, 
s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v57, v38 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB15_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v62, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB15_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -20123,348 +20513,352 @@ define inreg <32 x i32> @bitcast_v128i8_to_v32i32_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_movk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded 
Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa 
v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v53 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: 
v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, 
v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; 
GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, 
v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -27132,24 +27526,23 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: v_mov_b32_e32 v52, v30 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], 
s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) @@ -27159,165 +27552,177 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: v_mov_b32_e32 v38, v12 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v14, v11 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, 
off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB19_4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB19_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; 
SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 @@ -27325,16 +27730,6 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 -; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 -; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 -; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 @@ -27342,212 +27737,238 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: 
v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 +; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 ; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 ; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 +; SI-NEXT: v_mov_b32_e32 v35, v37 ; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 +; SI-NEXT: v_mov_b32_e32 v41, v61 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 +; SI-NEXT: v_mov_b32_e32 v55, v59 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v11 -; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v11 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v12 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v14 -; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 
offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v53, v38 -; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: s_branch .LBB19_3 +; SI-NEXT: .LBB19_2: +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v43, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 -; SI-NEXT: s_cbranch_execnz .LBB19_3 -; SI-NEXT: .LBB19_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB19_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v39, v52 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB19_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: 
v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; 
SI-NEXT: s_waitcnt vmcnt(0) @@ -27561,105 +27982,107 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, 
v41 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 -; SI-NEXT: .LBB19_3: ; %end +; SI-NEXT: .LBB19_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -27678,41 +28101,6 @@ define inreg <32 x i32> @bitcast_v64bf16_to_v32i32_scalar(<64 x bfloat> inreg %a ; SI-NEXT: 
buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB19_4: -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB19_2 ; ; VI-LABEL: bitcast_v64bf16_to_v32i32_scalar: ; VI: ; %bb.0: @@ -29901,15 +30289,13 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -29919,13 +30305,15 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: 
$vgpr42 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -30008,87 +30396,82 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: buffer_store_dword v31, off, 
s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 @@ -30104,7 +30487,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 @@ -30125,7 +30508,7 @@ define <64 x half> 
@bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 @@ -30139,6 +30522,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 @@ -30147,17 +30532,19 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -30179,133 +30566,122 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB20_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 -; SI-NEXT: v_mov_b32_e32 v50, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 ; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v62 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v46 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; 
SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 ; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_add_i32_e32 v33, vcc, 3, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v63 -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 ; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 ; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 
v35, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 @@ -30321,14 +30697,16 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 @@ -30338,37 +30716,37 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v2 -; SI-NEXT: v_mov_b32_e32 v52, v29 -; SI-NEXT: v_mov_b32_e32 v48, v30 -; SI-NEXT: v_mov_b32_e32 v56, v28 -; SI-NEXT: v_mov_b32_e32 v34, v7 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_mov_b32_e32 v46, v8 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_mov_b32_e32 v50, v28 +; SI-NEXT: v_mov_b32_e32 v48, v29 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v56, v8 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: .LBB20_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 @@ -30379,41 
+30757,45 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 @@ -30422,7 +30804,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -30431,7 +30813,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -30440,7 +30822,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -30449,7 +30831,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -30458,7 +30840,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -30467,7 +30849,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -30476,7 +30858,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -30485,7 +30867,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -30495,20 +30877,9 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30519,7 +30890,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30530,7 +30901,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30541,7 +30912,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30552,7 +30923,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30563,7 +30934,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30572,9 +30943,9 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30584,8 +30955,8 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30596,7 +30967,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30607,7 +30978,7 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -30617,30 +30988,37 @@ define <64 x half> @bitcast_v32i32_to_v64f16(<32 x i32> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -34412,385 +34790,431 @@ define inreg <64 x i16> @bitcast_v32i32_to_v64i16_scalar(<32 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v32i32_to_v64i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: 
buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_writelane_b32 v20, s49, 9 +; SI-NEXT: v_writelane_b32 v20, s50, 10 +; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_writelane_b32 v20, s52, 12 +; SI-NEXT: v_writelane_b32 v20, s53, 13 +; SI-NEXT: v_writelane_b32 v20, s54, 14 +; SI-NEXT: v_writelane_b32 v20, s55, 15 +; SI-NEXT: v_writelane_b32 v20, s64, 16 +; SI-NEXT: v_writelane_b32 v20, s65, 17 +; SI-NEXT: v_writelane_b32 v20, s66, 18 +; SI-NEXT: v_writelane_b32 v20, s67, 19 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_readfirstlane_b32 s47, v1 -; SI-NEXT: v_readfirstlane_b32 s46, v2 -; SI-NEXT: v_readfirstlane_b32 s45, v3 -; SI-NEXT: v_readfirstlane_b32 s44, v4 -; SI-NEXT: v_readfirstlane_b32 s43, v5 -; SI-NEXT: v_readfirstlane_b32 s42, v6 -; SI-NEXT: v_readfirstlane_b32 s41, v7 -; SI-NEXT: v_readfirstlane_b32 s40, v8 -; SI-NEXT: v_readfirstlane_b32 s15, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s13, v11 -; SI-NEXT: v_readfirstlane_b32 s12, v12 -; SI-NEXT: v_readfirstlane_b32 s11, v13 -; SI-NEXT: v_readfirstlane_b32 s10, v14 -; SI-NEXT: v_readfirstlane_b32 s9, v15 -; SI-NEXT: v_readfirstlane_b32 s8, v16 -; SI-NEXT: v_readfirstlane_b32 s7, v17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: v_writelane_b32 v20, s68, 20 +; SI-NEXT: v_readfirstlane_b32 s44, v1 +; SI-NEXT: v_readfirstlane_b32 s45, v2 +; SI-NEXT: v_readfirstlane_b32 s42, v3 +; SI-NEXT: v_readfirstlane_b32 s43, v4 +; SI-NEXT: v_readfirstlane_b32 s40, v5 +; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_and_b64 s[46:47], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: v_writelane_b32 v20, s69, 21 ; SI-NEXT: s_cbranch_scc0 .LBB25_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s43 -; SI-NEXT: v_mov_b32_e32 v8, s45 -; SI-NEXT: v_mov_b32_e32 v9, s47 -; SI-NEXT: v_mov_b32_e32 v10, s28 -; SI-NEXT: v_mov_b32_e32 v11, s26 -; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: v_mov_b32_e32 v13, s22 -; SI-NEXT: v_mov_b32_e32 v14, s20 -; SI-NEXT: v_mov_b32_e32 v15, s18 -; SI-NEXT: v_mov_b32_e32 v16, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 -; 
SI-NEXT: v_alignbit_b32 v9, s46, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s29, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s27, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s25, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s23, v13, 16 -; SI-NEXT: v_alignbit_b32 v14, s21, v14, 16 -; SI-NEXT: v_alignbit_b32 v15, s19, v15, 16 -; SI-NEXT: v_alignbit_b32 v16, s17, v16, 16 -; SI-NEXT: s_lshr_b32 s56, s6, 16 -; SI-NEXT: s_lshr_b32 s57, s8, 16 -; SI-NEXT: s_lshr_b32 s58, s10, 16 -; SI-NEXT: s_lshr_b32 s59, s12, 16 -; SI-NEXT: s_lshr_b32 s60, s14, 16 -; SI-NEXT: s_lshr_b32 s61, s40, 16 -; SI-NEXT: s_lshr_b32 s62, s42, 16 -; SI-NEXT: s_lshr_b32 s63, s44, 16 -; SI-NEXT: s_lshr_b32 s72, s46, 16 -; SI-NEXT: s_lshr_b32 s73, s29, 16 -; SI-NEXT: s_lshr_b32 s74, s27, 16 -; SI-NEXT: s_lshr_b32 s75, s25, 16 -; SI-NEXT: s_lshr_b32 s76, s23, 16 -; SI-NEXT: s_lshr_b32 s77, s21, 16 -; SI-NEXT: s_lshr_b32 s78, s19, 16 -; SI-NEXT: s_lshr_b32 s79, s17, 16 +; SI-NEXT: s_lshr_b32 s38, s5, 16 +; SI-NEXT: s_lshr_b32 s39, s7, 16 +; SI-NEXT: s_lshr_b32 s48, s9, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s50, s13, 16 +; SI-NEXT: s_lshr_b32 s51, s15, 16 +; SI-NEXT: s_lshr_b32 s52, s41, 16 +; SI-NEXT: s_lshr_b32 s53, s43, 16 +; SI-NEXT: s_lshr_b32 s54, s45, 16 +; SI-NEXT: s_lshr_b32 s55, s29, 16 +; SI-NEXT: s_lshr_b32 s64, s27, 16 +; SI-NEXT: s_lshr_b32 s65, s25, 16 +; SI-NEXT: s_lshr_b32 s66, s23, 16 +; SI-NEXT: s_lshr_b32 s67, s21, 16 +; SI-NEXT: s_lshr_b32 s68, s19, 16 +; SI-NEXT: s_lshr_b32 s69, s17, 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB25_3 ; SI-NEXT: .LBB25_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s47, s47, 3 ; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s46, s46, 3 ; SI-NEXT: s_add_i32 s44, s44, 3 +; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; 
SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s43 -; SI-NEXT: v_mov_b32_e32 v8, s45 -; SI-NEXT: v_mov_b32_e32 v9, s47 -; SI-NEXT: v_mov_b32_e32 v10, s28 -; SI-NEXT: v_mov_b32_e32 v11, s26 -; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: v_mov_b32_e32 v13, s22 -; SI-NEXT: v_mov_b32_e32 v14, s20 -; SI-NEXT: v_mov_b32_e32 v15, s18 -; SI-NEXT: v_mov_b32_e32 v16, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s46, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s29, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s27, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s25, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s23, v13, 16 -; SI-NEXT: v_alignbit_b32 v14, s21, v14, 16 -; SI-NEXT: v_alignbit_b32 v15, s19, v15, 16 -; SI-NEXT: v_alignbit_b32 v16, s17, v16, 16 -; SI-NEXT: s_lshr_b32 s56, s6, 16 -; SI-NEXT: s_lshr_b32 s57, s8, 16 -; SI-NEXT: s_lshr_b32 s58, s10, 16 -; SI-NEXT: s_lshr_b32 s59, s12, 16 -; SI-NEXT: s_lshr_b32 s60, s14, 16 -; SI-NEXT: s_lshr_b32 s61, s40, 16 -; SI-NEXT: s_lshr_b32 s62, s42, 16 -; SI-NEXT: s_lshr_b32 s63, s44, 16 -; SI-NEXT: s_lshr_b32 s72, s46, 16 -; SI-NEXT: s_lshr_b32 s73, s29, 16 -; SI-NEXT: s_lshr_b32 s74, s27, 16 -; SI-NEXT: s_lshr_b32 s75, s25, 16 -; SI-NEXT: s_lshr_b32 s76, s23, 16 -; SI-NEXT: s_lshr_b32 s77, s21, 16 -; SI-NEXT: s_lshr_b32 s78, s19, 16 -; SI-NEXT: s_lshr_b32 s79, s17, 16 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[28:29], 16 +; SI-NEXT: s_lshr_b32 s38, s5, 16 +; SI-NEXT: s_lshr_b32 s39, s7, 16 +; SI-NEXT: s_lshr_b32 s48, s9, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s50, s13, 16 +; SI-NEXT: s_lshr_b32 s51, s15, 16 +; SI-NEXT: s_lshr_b32 s52, s41, 16 +; SI-NEXT: s_lshr_b32 s53, s43, 16 +; SI-NEXT: s_lshr_b32 s54, s45, 16 +; SI-NEXT: s_lshr_b32 s55, s29, 16 +; SI-NEXT: s_lshr_b32 s64, s27, 16 +; SI-NEXT: s_lshr_b32 s65, s25, 16 +; SI-NEXT: s_lshr_b32 s66, s23, 16 +; SI-NEXT: s_lshr_b32 s67, s21, 16 +; SI-NEXT: s_lshr_b32 s68, s19, 16 +; SI-NEXT: s_lshr_b32 s69, s17, 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 ; SI-NEXT: .LBB25_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; 
SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, s4, v16 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s79, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v17, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s47, s36, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s47 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s69, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s16, s34, 16 +; SI-NEXT: s_and_b32 s17, s18, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: s_and_b32 s16, s19, 0xffff +; SI-NEXT: s_lshl_b32 s17, s68, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: s_lshl_b32 s16, s30, 16 +; SI-NEXT: s_and_b32 s17, s20, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v15, s4, v15 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s78, 16 -; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v16, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_and_b32 s16, s21, 0xffff +; SI-NEXT: s_lshl_b32 s17, s67, 16 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s22, 0xffff +; SI-NEXT: s_lshl_b32 s17, s94, 16 +; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v14, s4, v14 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; SI-NEXT: s_lshl_b32 s5, s77, 16 -; SI-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v15, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s23, 0xffff +; SI-NEXT: s_lshl_b32 s17, s66, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v13, s4, v13 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s76, 16 -; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s24, 
0xffff -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s24, 0xffff +; SI-NEXT: s_lshl_b32 s17, s92, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s75, 16 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s25, 0xffff +; SI-NEXT: s_lshl_b32 s17, s65, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s74, 16 -; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s17, s90, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s73, 16 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s47, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s64, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s46, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s28, 0xffff +; SI-NEXT: s_lshl_b32 s17, s88, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s44, 0xffff -; SI-NEXT: s_lshl_b32 s5, s63, 16 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, 
s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s29, 0xffff +; SI-NEXT: s_lshl_b32 s17, s55, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s42, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s44, 0xffff +; SI-NEXT: s_lshl_b32 s17, s78, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s40, 0xffff -; SI-NEXT: s_lshl_b32 s5, s61, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s45, 0xffff +; SI-NEXT: s_lshl_b32 s17, s54, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s14, 0xffff -; SI-NEXT: s_lshl_b32 s5, s60, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s42, 0xffff +; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s12, 0xffff -; SI-NEXT: s_lshl_b32 s5, s59, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s43, 0xffff +; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s58, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 -; 
SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xffff +; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s57, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xffff +; SI-NEXT: s_lshl_b32 s17, s52, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s56, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s72, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff +; SI-NEXT: s_lshl_b32 s15, s51, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s62, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_lshl_b32 s13, s50, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s60, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff +; SI-NEXT: s_lshl_b32 s11, s49, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s58, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s48, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; 
SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s56, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s39, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s46, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s38, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s69, v20, 21 +; SI-NEXT: v_readlane_b32 s68, v20, 20 +; SI-NEXT: v_readlane_b32 s67, v20, 19 +; SI-NEXT: v_readlane_b32 s66, v20, 18 +; SI-NEXT: v_readlane_b32 s65, v20, 17 +; SI-NEXT: v_readlane_b32 s64, v20, 16 +; SI-NEXT: v_readlane_b32 s55, v20, 15 +; SI-NEXT: v_readlane_b32 s54, v20, 14 +; SI-NEXT: v_readlane_b32 s53, v20, 13 +; SI-NEXT: v_readlane_b32 s52, v20, 12 +; SI-NEXT: v_readlane_b32 s51, v20, 11 +; SI-NEXT: v_readlane_b32 s50, v20, 10 +; SI-NEXT: v_readlane_b32 s49, v20, 9 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB25_4: -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; 
implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: s_branch .LBB25_2 ; ; VI-LABEL: bitcast_v32i32_to_v64i16_scalar: @@ -35860,179 +36284,162 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v56, v10 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v8 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v38, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v32, off, 
s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, 
s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB27_4 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB27_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v7, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_or_b32_e32 v10, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v11, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v12, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v13, v0, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v19, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_or_b32_e32 v20, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v22, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v24, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v26, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: 
s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v27, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v27, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_mov_b32_e32 v33, v28 ; SI-NEXT: v_or_b32_e32 v28, v0, v5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_or_b32_e32 v29, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v29, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 @@ -36040,15 +36447,18 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_or_b32_e32 v8, v1, v55 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_or_b32_e32 v8, v1, v56 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_or_b32_e32 v31, v0, v31 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -36056,14 +36466,40 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_cbranch_execnz .LBB27_3 -; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB27_3 +; SI-NEXT: .LBB27_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB27_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: s_cbranch_vccnz .LBB27_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 
v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -36104,143 +36540,143 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; 
SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte 
Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -36249,7 +36685,7 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB27_3: ; %end +; SI-NEXT: .LBB27_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -36268,35 +36704,6 @@ define inreg <32 x i32> @bitcast_v64i16_to_v32i32_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB27_4: -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v39, v23 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB27_2 ; ; VI-LABEL: bitcast_v64i16_to_v32i32_scalar: ; VI: ; %bb.0: @@ -43243,1220 +43650,1742 @@ define inreg <128 x i8> @bitcast_v32f32_to_v128i8_scalar(<32 x float> inreg %a, ; SI-LABEL: bitcast_v32f32_to_v128i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_writelane_b32 v63, s36, 4 +; SI-NEXT: v_writelane_b32 v63, s37, 5 +; SI-NEXT: v_writelane_b32 v63, s38, 6 +; SI-NEXT: v_writelane_b32 v63, s39, 7 +; SI-NEXT: v_writelane_b32 v63, s48, 8 +; SI-NEXT: v_writelane_b32 v63, s49, 9 +; SI-NEXT: v_writelane_b32 v63, s50, 10 +; SI-NEXT: v_writelane_b32 v63, s51, 11 +; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: 
v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_writelane_b32 v63, s98, 34 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v56, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v47, s17 -; SI-NEXT: v_mov_b32_e32 v44, s18 -; SI-NEXT: v_mov_b32_e32 v42, s19 -; SI-NEXT: v_mov_b32_e32 v40, s20 -; SI-NEXT: v_mov_b32_e32 v53, s21 -; SI-NEXT: v_mov_b32_e32 v51, s22 -; SI-NEXT: v_mov_b32_e32 v48, s23 -; SI-NEXT: v_mov_b32_e32 v38, s24 -; SI-NEXT: v_mov_b32_e32 v35, s25 -; SI-NEXT: v_mov_b32_e32 v33, s26 -; SI-NEXT: v_mov_b32_e32 v30, s27 -; SI-NEXT: v_mov_b32_e32 v28, s28 -; SI-NEXT: v_mov_b32_e32 v25, s29 -; SI-NEXT: s_cbranch_scc0 .LBB37_4 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s44, v1 +; SI-NEXT: v_readfirstlane_b32 s45, v2 +; SI-NEXT: v_readfirstlane_b32 s42, v3 +; SI-NEXT: v_readfirstlane_b32 s43, v4 +; SI-NEXT: v_readfirstlane_b32 s40, v5 +; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_and_b64 s[46:47], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: s_cbranch_scc0 .LBB37_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v16, v15, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v16, v15, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v14, v13, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v14, v13, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v14, v13, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v12, v11, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v12, v11, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v12, v11, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v10, v9, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v10, v9, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v10, v9, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v8, v7, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v8, v7, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v8, v7, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v6, v5, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v6, v5, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v6, v5, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v4, v3, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v4, v3, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v4, v3, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v2, v1, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v2, v1, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v2, v1, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v25, v28, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v25, v28, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v25, v28, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v30, v33, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v30, v33, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v30, v33, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v35, v38, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v35, v38, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v35, v38, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v48, v51, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v48, v51, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v48, v51, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_lshr_b32 s46, s5, 24 +; SI-NEXT: v_writelane_b32 v62, s46, 17 +; SI-NEXT: s_lshr_b32 s46, s5, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 16 +; SI-NEXT: s_lshr_b32 s46, s5, 8 +; SI-NEXT: v_writelane_b32 v62, s46, 15 +; SI-NEXT: s_lshr_b32 s46, s7, 24 +; SI-NEXT: v_writelane_b32 v62, s46, 14 +; SI-NEXT: s_lshr_b32 s46, s7, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 13 +; SI-NEXT: s_lshr_b32 s46, s7, 8 +; SI-NEXT: v_writelane_b32 v62, s46, 12 +; SI-NEXT: s_lshr_b32 s46, s9, 24 +; SI-NEXT: v_writelane_b32 v62, s46, 11 +; SI-NEXT: s_lshr_b32 s46, s9, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 10 +; SI-NEXT: s_lshr_b32 s46, s9, 8 +; SI-NEXT: v_writelane_b32 v62, s46, 9 +; SI-NEXT: s_lshr_b32 s46, s11, 24 +; SI-NEXT: v_writelane_b32 v62, s46, 8 +; SI-NEXT: s_lshr_b32 s46, s11, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 7 +; SI-NEXT: s_lshr_b32 s46, s11, 8 +; SI-NEXT: v_writelane_b32 v62, s46, 6 +; SI-NEXT: s_lshr_b32 s46, s13, 24 +; SI-NEXT: v_writelane_b32 v62, s46, 5 +; SI-NEXT: s_lshr_b32 s46, s13, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 4 +; SI-NEXT: s_lshr_b32 s46, s13, 8 +; SI-NEXT: v_writelane_b32 v62, s46, 3 +; SI-NEXT: s_lshr_b32 s46, s15, 24 +; SI-NEXT: v_writelane_b32 v62, s46, 2 +; SI-NEXT: s_lshr_b32 s46, s15, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 1 +; SI-NEXT: s_lshr_b32 s46, s15, 8 +; SI-NEXT: v_writelane_b32 v62, s46, 0 +; SI-NEXT: s_lshr_b32 s46, s41, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 63 +; SI-NEXT: s_lshr_b32 s46, s41, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 62 +; SI-NEXT: s_lshr_b32 s46, s41, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 61 +; SI-NEXT: s_lshr_b32 s46, s43, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 60 +; SI-NEXT: s_lshr_b32 s46, s43, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 59 +; SI-NEXT: s_lshr_b32 s46, s43, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 58 +; SI-NEXT: s_lshr_b32 s46, s45, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 57 +; SI-NEXT: s_lshr_b32 s46, s45, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 56 +; SI-NEXT: s_lshr_b32 s46, s45, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 55 +; SI-NEXT: s_lshr_b32 s46, s29, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 54 +; SI-NEXT: s_lshr_b32 s46, s29, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 53 +; SI-NEXT: s_lshr_b32 s46, s29, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 52 +; SI-NEXT: s_lshr_b32 s46, s27, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 51 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 50 +; SI-NEXT: 
s_lshr_b32 s46, s27, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 49 +; SI-NEXT: s_lshr_b32 s46, s25, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 48 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 47 +; SI-NEXT: s_lshr_b32 s46, s25, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 46 +; SI-NEXT: s_lshr_b32 s46, s23, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 45 +; SI-NEXT: s_lshr_b32 s46, s23, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 44 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 43 +; SI-NEXT: s_lshr_b32 s46, s21, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 42 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 41 +; SI-NEXT: s_lshr_b32 s46, s21, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 40 +; SI-NEXT: s_lshr_b32 s46, s19, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 39 +; SI-NEXT: s_lshr_b32 s46, s19, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 38 +; SI-NEXT: s_lshr_b32 s46, s19, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 37 +; SI-NEXT: s_lshr_b32 s46, s17, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 36 +; SI-NEXT: s_lshr_b32 s46, s17, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 35 +; SI-NEXT: s_lshr_b32 s46, s17, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 34 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; SI-NEXT: v_writelane_b32 v61, s46, 32 +; SI-NEXT: v_writelane_b32 v61, s47, 33 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 8 +; SI-NEXT: v_writelane_b32 v61, s46, 30 +; SI-NEXT: v_writelane_b32 v61, s47, 31 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v61, s46, 28 +; SI-NEXT: v_writelane_b32 v61, s47, 29 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v61, s46, 26 +; SI-NEXT: v_writelane_b32 v61, s47, 27 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 8 +; SI-NEXT: v_writelane_b32 v61, s46, 24 +; SI-NEXT: v_writelane_b32 v61, s47, 25 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v61, s46, 22 +; SI-NEXT: v_writelane_b32 v61, s47, 23 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v61, s46, 20 +; SI-NEXT: v_writelane_b32 v61, s47, 21 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v61, s46, 18 +; SI-NEXT: v_writelane_b32 v61, s47, 19 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v61, s46, 16 +; SI-NEXT: v_writelane_b32 v61, s47, 17 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v61, s46, 14 +; SI-NEXT: v_writelane_b32 v61, s47, 15 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v61, s46, 12 +; SI-NEXT: v_writelane_b32 v61, s47, 13 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v61, s46, 10 +; SI-NEXT: v_writelane_b32 v61, s47, 11 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: v_writelane_b32 v61, s46, 8 +; SI-NEXT: v_writelane_b32 v61, s47, 9 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 8 +; SI-NEXT: v_writelane_b32 v61, s46, 6 +; SI-NEXT: v_writelane_b32 v61, s47, 7 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v61, s46, 4 +; SI-NEXT: v_writelane_b32 v61, s47, 5 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v61, s46, 2 +; SI-NEXT: v_writelane_b32 v61, s47, 3 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 8 +; SI-NEXT: v_writelane_b32 v61, s46, 0 +; SI-NEXT: s_lshr_b64 s[48:49], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v61, s47, 1 +; SI-NEXT: s_lshr_b64 s[50:51], s[40:41], 24 +; SI-NEXT: s_lshr_b64 s[52:53], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[40:41], 8 +; SI-NEXT: 
s_lshr_b64 s[64:65], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[84:85], s[28:29], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[96:97], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[98:99], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 +; SI-NEXT: s_cbranch_execnz .LBB37_4 +; SI-NEXT: .LBB37_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v2, s5, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s4, 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[1:2], 24 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[1:2], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[1:2], 8 +; SI-NEXT: v_add_f32_e64 v4, s7, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s6, 1.0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[3:4], 24 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[3:4], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[3:4], 8 +; SI-NEXT: v_add_f32_e64 v6, s9, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s8, 1.0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[5:6], 24 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[5:6], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[5:6], 8 +; SI-NEXT: 
v_add_f32_e64 v8, s11, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s10, 1.0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[7:8], 24 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[7:8], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v14 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[7:8], 8 +; SI-NEXT: v_add_f32_e64 v10, s13, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s12, 1.0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 24 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v14 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v12 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[9:10], 8 +; SI-NEXT: v_add_f32_e64 v12, s15, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s14, 1.0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[11:12], 24 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v12 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[11:12], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v10 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[11:12], 8 +; SI-NEXT: v_add_f32_e64 v16, s41, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s40, 1.0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[15:16], 24 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[15:16], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[15:16], 8 +; SI-NEXT: v_add_f32_e64 v21, s43, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s42, 1.0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 24 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v6 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v6 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v2 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 -; SI-NEXT: v_alignbit_b32 v23, v53, v40, 24 -; SI-NEXT: v_alignbit_b32 v26, v53, v40, 16 -; SI-NEXT: v_alignbit_b32 v29, v53, v40, 8 -; SI-NEXT: v_alignbit_b32 v32, v42, v44, 24 -; SI-NEXT: v_alignbit_b32 v36, v42, v44, 16 -; SI-NEXT: v_alignbit_b32 v39, v42, v44, 8 -; SI-NEXT: v_alignbit_b32 v50, v47, v56, 24 -; SI-NEXT: v_alignbit_b32 v54, v47, v56, 16 -; SI-NEXT: v_alignbit_b32 v41, v47, v56, 8 
-; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v25 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v25 -; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v30 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v30 -; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v35 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v35 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v48 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v48 -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v53 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v53 -; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v42 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v42 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v47 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v47 -; SI-NEXT: s_cbranch_execnz .LBB37_3 -; SI-NEXT: .LBB37_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 16 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[20:21], 8 +; SI-NEXT: v_add_f32_e64 v26, s45, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s44, 1.0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v18, v17, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[25:26], 24 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v16, v15, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[25:26], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v16, v15, 16 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[25:26], 8 +; SI-NEXT: v_add_f32_e64 v30, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s28, 1.0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[29:30], 24 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[29:30], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[29:30], 8 +; SI-NEXT: v_add_f32_e64 v36, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v35, s26, 1.0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[35:36], 24 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[35:36], 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[13:14], v[35:36], 8 +; SI-NEXT: v_add_f32_e64 v49, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v48, s24, 1.0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v16, v15, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 24 +; SI-NEXT: v_lshr_b64 v[17:18], v[48:49], 8 +; SI-NEXT: v_add_f32_e64 v53, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v52, s22, 1.0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 24 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 16 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[17:18], v[52:53], 8 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v2 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v14, v13, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v14, v13, 16 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v14, v13, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v4 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v12, v11, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v12, v11, 16 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v4 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v12, v11, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v6 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v10, v9, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v6 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v10, v9, 16 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v6 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v10, v9, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v8 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v8, v7, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v8, v7, 16 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v8 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v8, v7, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v10 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v6, v5, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v10 +; SI-NEXT: buffer_store_dword v14, 
off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v6, v5, 16 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v10 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v6, v5, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v12 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v4, v3, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v4, v3, 16 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v12 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v4, v3, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v2, v1, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v2, v1, 16 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v21 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v21 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v21 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v26 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v26 +; SI-NEXT: v_add_f32_e64 v41, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v40, s20, 1.0 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v26 +; SI-NEXT: v_add_f32_e64 v58, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v57, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v46, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v45, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[31:32], v[40:41], 16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v30 +; SI-NEXT: v_lshr_b64 v[32:33], v[40:41], 8 +; SI-NEXT: v_lshr_b64 v[37:38], v[45:46], 16 +; SI-NEXT: v_lshr_b64 v[42:43], v[57:58], 16 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v30 +; SI-NEXT: v_lshr_b64 v[27:28], v[40:41], 24 +; SI-NEXT: v_lshr_b64 v[33:34], v[45:46], 24 +; SI-NEXT: v_lshr_b64 v[38:39], v[45:46], 8 +; SI-NEXT: v_lshr_b64 v[50:51], v[57:58], 24 +; SI-NEXT: v_lshr_b64 v[43:44], v[57:58], 8 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v30 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v55, 24, v36 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v36 +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v49 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v49 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v53 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v17, 8, v53 +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v41 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v41 +; SI-NEXT: v_lshrrev_b32_e32 v18, 24, v46 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v46 +; SI-NEXT: v_lshrrev_b32_e32 v39, 24, v58 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v58 +; SI-NEXT: s_branch .LBB37_5 +; SI-NEXT: .LBB37_3: +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 0 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 1 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 2 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 3 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; 
kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 4 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 5 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 6 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 7 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 8 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 9 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 10 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 11 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 12 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 13 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 14 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 15 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 16 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 17 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 18 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 19 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 20 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 21 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 22 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 23 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 24 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 25 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; 
SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 26 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 27 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 28 +; SI-NEXT: v_writelane_b32 v61, s49, 29 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 30 +; SI-NEXT: v_writelane_b32 v61, s49, 31 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 32 +; SI-NEXT: v_writelane_b32 v61, s49, 33 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: s_branch .LBB37_2 +; SI-NEXT: .LBB37_4: +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 34 +; SI-NEXT: v_mov_b32_e32 v54, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 35 +; SI-NEXT: v_mov_b32_e32 v51, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 36 +; SI-NEXT: v_mov_b32_e32 v39, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 37 +; SI-NEXT: v_mov_b32_e32 v23, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 38 +; SI-NEXT: v_mov_b32_e32 v22, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 39 +; SI-NEXT: v_mov_b32_e32 v18, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 40 +; SI-NEXT: v_mov_b32_e32 v34, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 41 +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 42 +; SI-NEXT: v_mov_b32_e32 v24, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 43 +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 44 +; SI-NEXT: v_mov_b32_e32 v19, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 45 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 47 +; SI-NEXT: v_mov_b32_e32 v59, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 48 +; SI-NEXT: v_mov_b32_e32 v56, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 49 +; SI-NEXT: v_mov_b32_e32 v47, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 50 +; SI-NEXT: v_mov_b32_e32 v44, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 51 +; SI-NEXT: v_mov_b32_e32 v55, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 52 +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 53 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v2, v1, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 54 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v25, v28, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 55 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v25, v28, 16 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 -; SI-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 56 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v25, v28, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 57 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v30, v33, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 58 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v30, v33, 16 -; SI-NEXT: v_add_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v38, 1.0, v38 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 59 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v30, v33, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 60 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v35, v38, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 61 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v35, v38, 16 -; SI-NEXT: v_add_f32_e32 v48, 1.0, v48 -; SI-NEXT: v_add_f32_e32 v51, 1.0, v51 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 62 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v35, v38, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 63 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v48, v51, 24 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v48, v51, 16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 1 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v19, v48, v51, 8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 2 +; SI-NEXT: buffer_store_dword v13, off, 
s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 3 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 5 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 6 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v16 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 7 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v14 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 8 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 9 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v14 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 10 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v12 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 11 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v12 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 12 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v12 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 13 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v10 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 14 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v10 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 15 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v10 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 16 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 17 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 32 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v8 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v6 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v6 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v4 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v2 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; SI-NEXT: v_add_f32_e32 v47, 1.0, v47 -; SI-NEXT: v_add_f32_e32 v56, 1.0, v56 -; SI-NEXT: v_add_f32_e32 v42, 1.0, v42 -; SI-NEXT: v_add_f32_e32 v44, 1.0, v44 -; SI-NEXT: v_add_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_add_f32_e32 v40, 1.0, v40 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 -; SI-NEXT: v_alignbit_b32 v23, v53, v40, 24 -; SI-NEXT: v_alignbit_b32 v26, v53, v40, 16 -; SI-NEXT: v_alignbit_b32 v29, v53, v40, 8 -; SI-NEXT: v_alignbit_b32 v32, v42, v44, 24 -; SI-NEXT: 
v_alignbit_b32 v36, v42, v44, 16 -; SI-NEXT: v_alignbit_b32 v39, v42, v44, 8 -; SI-NEXT: v_alignbit_b32 v50, v47, v56, 24 -; SI-NEXT: v_alignbit_b32 v54, v47, v56, 16 -; SI-NEXT: v_alignbit_b32 v41, v47, v56, 8 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v25 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v25 -; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v30 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v30 -; SI-NEXT: v_lshrrev_b32_e32 v52, 24, v35 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v43, 8, v35 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v48 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v48 -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v53 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v53 -; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v42 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v42 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v47 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v47 -; SI-NEXT: .LBB37_3: ; %end -; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 -; SI-NEXT: v_and_b32_e32 v54, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v41, v56, v41 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s48 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 33 +; SI-NEXT: v_readlane_b32 s4, v61, 30 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 31 +; SI-NEXT: v_readlane_b32 s4, v61, 28 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 29 +; SI-NEXT: v_readlane_b32 s4, v61, 26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 27 +; SI-NEXT: v_readlane_b32 s4, v61, 24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 25 +; SI-NEXT: v_readlane_b32 s4, v61, 22 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 
23 +; SI-NEXT: v_readlane_b32 s4, v61, 20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 21 +; SI-NEXT: v_readlane_b32 s4, v61, 18 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 19 +; SI-NEXT: v_readlane_b32 s4, v61, 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 17 +; SI-NEXT: v_readlane_b32 s4, v61, 14 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 15 +; SI-NEXT: v_readlane_b32 s4, v61, 12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 13 +; SI-NEXT: v_readlane_b32 s4, v61, 10 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 11 +; SI-NEXT: v_readlane_b32 s4, v61, 8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 9 +; SI-NEXT: v_readlane_b32 s4, v61, 6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 7 +; SI-NEXT: v_readlane_b32 s4, v61, 4 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 5 +; SI-NEXT: v_readlane_b32 s4, v61, 2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 3 +; SI-NEXT: v_readlane_b32 s4, v61, 0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s50 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 
offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s52 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s54 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s64 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s66 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s68 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s70 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s80 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s82 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s84 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s86 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s96 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s98 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s46 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s56 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v13, s58 +; SI-NEXT: 
v_mov_b32_e32 v27, s62 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, s72 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, s74 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, s76 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v57, s16 +; SI-NEXT: v_mov_b32_e32 v58, s17 +; SI-NEXT: v_mov_b32_e32 v45, s18 +; SI-NEXT: v_mov_b32_e32 v46, s19 +; SI-NEXT: v_mov_b32_e32 v40, s20 +; SI-NEXT: v_mov_b32_e32 v41, s21 +; SI-NEXT: v_mov_b32_e32 v52, s22 +; SI-NEXT: v_mov_b32_e32 v53, s23 +; SI-NEXT: v_mov_b32_e32 v48, s24 +; SI-NEXT: v_mov_b32_e32 v49, s25 +; SI-NEXT: v_mov_b32_e32 v35, s26 +; SI-NEXT: v_mov_b32_e32 v36, s27 +; SI-NEXT: v_mov_b32_e32 v29, s28 +; SI-NEXT: v_mov_b32_e32 v30, s29 +; SI-NEXT: v_mov_b32_e32 v25, s44 +; SI-NEXT: v_mov_b32_e32 v26, s45 +; SI-NEXT: v_mov_b32_e32 v20, s42 +; SI-NEXT: v_mov_b32_e32 v21, s43 +; SI-NEXT: v_mov_b32_e32 v15, s40 +; SI-NEXT: v_mov_b32_e32 v16, s41 +; SI-NEXT: v_mov_b32_e32 v11, s14 +; SI-NEXT: v_mov_b32_e32 v12, s15 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v8, s11 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v6, s9 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_readlane_b32 s5, v61, 1 +; SI-NEXT: v_mov_b32_e32 v13, s60 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v27, s78 +; SI-NEXT: v_mov_b32_e32 v31, s88 +; SI-NEXT: v_mov_b32_e32 v32, s90 +; SI-NEXT: v_mov_b32_e32 v33, s92 +; SI-NEXT: v_mov_b32_e32 v37, s94 +; SI-NEXT: v_mov_b32_e32 v38, s30 +; SI-NEXT: v_mov_b32_e32 v50, s34 +; SI-NEXT: v_mov_b32_e32 v42, s36 +; SI-NEXT: v_mov_b32_e32 v43, s38 +; SI-NEXT: .LBB37_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v43 +; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 +; SI-NEXT: v_or_b32_e32 v43, v57, v43 ; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v50 -; SI-NEXT: v_or_b32_e32 v50, v50, v54 -; SI-NEXT: v_and_b32_e32 v54, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v50, v54, v50 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 +; SI-NEXT: v_or_b32_e32 v50, v50, v42 +; SI-NEXT: v_and_b32_e32 v42, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v50, v42, v50 ; SI-NEXT: buffer_store_dword v50, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v50, 0xff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v21, v50, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 
4, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v39 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v32 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v42 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v63 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v61 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v29 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v23 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v60 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v58 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v50, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v54, 8, v54 +; SI-NEXT: v_and_b32_e32 v51, 0xff, v51 +; SI-NEXT: v_or_b32_e32 v50, v50, v54 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v39, 24, v39 +; SI-NEXT: v_or_b32_e32 v39, v39, v51 +; SI-NEXT: v_and_b32_e32 v50, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v39, v50, v39 +; SI-NEXT: v_add_i32_e32 v50, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v39, v50, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v39, 0xff, v45 +; SI-NEXT: v_and_b32_e32 v37, 0xff, v37 +; SI-NEXT: v_or_b32_e32 v38, v39, v38 +; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_or_b32_e32 v33, v33, v37 +; SI-NEXT: v_and_b32_e32 v37, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v33, v37, v33 +; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v33, v37, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v33, 0xff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_or_b32_e32 v23, v33, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 
v18, v22, v18 +; SI-NEXT: v_add_i32_e32 v22, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v18, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v32 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v40 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v27 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v18, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v34 +; SI-NEXT: v_or_b32_e32 v18, v18, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v24 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v18, v22, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v52 +; SI-NEXT: v_or_b32_e32 v18, v22, v18 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 
offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_or_b32_e32 v18, v18, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v18, v22, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v14, v14, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v14, v17, v14 +; SI-NEXT: v_add_i32_e32 v17, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v14, v17, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v48 +; SI-NEXT: v_or_b32_e32 v14, v17, v14 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_or_b32_e32 v13, v17, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v57 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v45 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v60 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v56 +; SI-NEXT: v_or_b32_e32 v14, v17, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v38 -; SI-NEXT: buffer_load_dword v21, off, 
s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_or_b32_e32 v14, v17, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v43 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v52 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v47 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v55 +; SI-NEXT: v_or_b32_e32 v14, v17, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v33 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:264 
; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v49 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v34 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_or_b32_e32 v14, v17, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v28 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v30 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v31 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v24 -; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v17, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_or_b32_e32 v14, v17, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v26 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v17, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], 
s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_or_b32_e32 v14, v17, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v21 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v17, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 
-; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, 
v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, 
s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; 
SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 
-; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 
16, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB37_4: -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; 
implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; kill: killed $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: s_branch .LBB37_2 ; ; VI-LABEL: bitcast_v32f32_to_v128i8_scalar: ; VI: ; %bb.0: @@ -53997,8 +54926,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, 
off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 @@ -54006,133 +54942,93 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 -; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v45 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v43 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v34 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; SI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 @@ -54141,31 +55037,31 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 @@ -54177,140 +55073,206 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword 
v7, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v40, off, 
s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 
offset:260 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, 
off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB39_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v9 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -54319,306 +55281,277 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: 
v_or_b32_e32 v0, v0, v15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 ; SI-NEXT: v_lshlrev_b32_e32 v1, 
16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 -; SI-NEXT: v_mov_b32_e32 v43, v16 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mov_b32_e32 v50, v16 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v16, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v17, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 -; SI-NEXT: v_mov_b32_e32 v55, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v18, v1 ; SI-NEXT: v_or_b32_e32 v18, v0, v1 -; 
SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 -; SI-NEXT: v_mov_b32_e32 v44, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v19, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: v_mov_b32_e32 v61, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 +; SI-NEXT: v_mov_b32_e32 v54, v23 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v20, v1 ; SI-NEXT: v_or_b32_e32 v20, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v21, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 -; SI-NEXT: v_mov_b32_e32 v59, v24 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v22, v0, v1 
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v45, v24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v34, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v23, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v24, v1 ; SI-NEXT: v_or_b32_e32 v24, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_mov_b32_e32 v43, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v25, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 ; SI-NEXT: v_or_b32_e32 v26, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v37, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v27, v1 ; SI-NEXT: v_or_b32_e32 v27, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload 
+; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v28, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_mov_b32_e32 v36, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v29, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v30, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v31, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v38, v63 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 @@ -54645,108 +55578,112 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB39_3 ; SI-NEXT: .LBB39_2: -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: .LBB39_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v35, v57 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB39_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, s4, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: 
s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54755,17 +55692,17 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54775,15 +55712,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54793,15 +55730,15 @@ define inreg <32 x float> 
@bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54811,15 +55748,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54829,15 +55766,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54847,15 +55784,15 @@ define 
inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54865,15 +55802,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54882,16 +55819,17 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54901,15 +55839,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -54919,84 +55857,79 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, 
v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -55005,15 +55938,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -55022,15 +55955,15 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -55040,9 +55973,9 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -55058,106 +55991,110 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -55165,14 +56102,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, 
vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -55180,14 +56117,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -55217,7 +56154,7 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v32f32_scalar: @@ -55239,113 +56176,115 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 
4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; 
VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 @@ -55354,29 +56293,28 @@ define inreg <32 x float> 
@bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -55386,130 +56324,141 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v7 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 -; VI-NEXT: 
buffer_load_ushort v30, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded 
Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 
offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill 
+; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB39_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -55518,208 +56467,197 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v3, v7 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; 
VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v29, v9 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v59, v0 -; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v1 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v37, v0 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v62, v0 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, 
s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v1 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_mov_b32_e32 v60, v0 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v34, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v59, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v54, v0 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v61 +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v0, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v41, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_or_b32_sdwa v0, v56, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v48, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v36, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v33, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v50, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v42 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v51, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v48, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v49, v51 +; VI-NEXT: v_mov_b32_e32 v40, v34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v57, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -55750,85 +56688,95 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB39_3 ; VI-NEXT: .LBB39_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_mov_b32_e32 v46, v61 -; VI-NEXT: v_mov_b32_e32 v47, v45 -; VI-NEXT: buffer_load_dword v54, off, 
s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_mov_b32_e32 v63, v42 -; VI-NEXT: v_mov_b32_e32 v51, v7 -; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_mov_b32_e32 v41, v33 +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: v_mov_b32_e32 v49, v51 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB39_3: ; %Flow +; VI-NEXT: v_mov_b32_e32 v51, v41 +; VI-NEXT: v_mov_b32_e32 v36, v44 +; VI-NEXT: v_mov_b32_e32 v53, v54 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v54, v60 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v44, v47 -; VI-NEXT: 
v_mov_b32_e32 v47, v46 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB39_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v0, s4, v0 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_add_i32 s20, s20, 3 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_add_i32 s24, s24, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff @@ -55837,26 +56785,25 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, 
vcc, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -55864,8 +56811,8 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -55877,9 +56824,9 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -55891,14 +56838,14 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -55906,280 +56853,280 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte 
Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; 
VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded 
Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -56224,504 +57171,524 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: 
buffer_load_ushort v45, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:176 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; 
GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX9-NEXT: s_waitcnt vmcnt(35) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GFX9-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: 
s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: v_lshlrev_b32_e32 v41, 8, v41 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort 
v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_waitcnt vmcnt(42) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, 
v2 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill 
-; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292 +; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 
offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB39_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: 
v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte 
Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v38 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v61, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v63, v57 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v47, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v37, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v60 -; GFX9-NEXT: v_mov_b32_e32 v52, v56 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_mov_b32_e32 v34, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v51, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-NEXT: v_mov_b32_e32 v53, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v45, v62 +; GFX9-NEXT: v_mov_b32_e32 v46, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v53 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -56752,32 +57719,48 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB39_3 ; GFX9-NEXT: .LBB39_2: -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_mov_b32_e32 v63, v57 -; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v57, v38 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB39_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v62, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB39_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; 
GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -56824,348 +57807,352 @@ define inreg <32 x float> @bitcast_v128i8_to_v32f32_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_movk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded 
Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt 
vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword 
v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v53 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -63879,24 +64866,23 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 -; SI-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: v_mov_b32_e32 v52, v30 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) @@ -63906,165 +64892,177 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: v_mov_b32_e32 v38, v12 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v14, v11 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 
-; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 +; 
SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; 
SI-NEXT: s_cbranch_scc0 .LBB43_4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB43_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte 
Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 @@ -64072,16 +65070,6 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 -; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 -; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 -; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 @@ -64089,212 +65077,238 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 +; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 ; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 ; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 +; SI-NEXT: v_mov_b32_e32 v35, v37 ; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 +; SI-NEXT: v_mov_b32_e32 v41, v61 ; SI-NEXT: s_waitcnt vmcnt(2) 
-; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 +; SI-NEXT: v_mov_b32_e32 v55, v59 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v11 -; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v11 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v12 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v14 -; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v53, v38 -; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: s_branch .LBB43_3 +; SI-NEXT: .LBB43_2: +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v43, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 -; SI-NEXT: s_cbranch_execnz .LBB43_3 -; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB43_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v39, v52 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB43_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; 
SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: 
v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: 
v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -64308,105 +65322,107 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 ; SI-NEXT: 
v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 
; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 -; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: .LBB43_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -64425,41 +65441,6 @@ define inreg <32 x float> @bitcast_v64bf16_to_v32f32_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v64bf16_to_v32f32_scalar: ; VI: ; %bb.0: @@ -66648,15 +67629,13 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr53 @@ -66666,13 +67645,15 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -66755,87 +67736,82 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 ; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v29 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 +; SI-NEXT: 
v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v27 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], 
s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 @@ -66851,7 +67827,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 @@ -66872,7 +67848,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 @@ -66886,6 +67862,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 @@ -66894,17 +67872,19 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -66926,133 +67906,122 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB44_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v33, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 -; SI-NEXT: v_mov_b32_e32 v50, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_f32_e32 v44, 1.0, v63 +; SI-NEXT: v_add_f32_e32 v46, 1.0, v62 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v46 +; 
SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v23 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v58 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_mov_b32_e32 v34, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v32 +; SI-NEXT: v_mov_b32_e32 v32, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v56 ; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v61 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v59 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_add_f32_e32 v42, 1.0, v63 -; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_add_f32_e32 v44, 1.0, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v57 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 ; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 @@ -67071,8 +68040,13 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: 
v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 @@ -67082,40 +68056,37 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v2 -; SI-NEXT: v_mov_b32_e32 v52, v29 -; SI-NEXT: v_mov_b32_e32 v48, v30 -; SI-NEXT: v_mov_b32_e32 v56, v28 -; SI-NEXT: v_mov_b32_e32 v34, v7 -; SI-NEXT: v_mov_b32_e32 v32, v6 -; SI-NEXT: v_mov_b32_e32 v46, v8 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v38, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_mov_b32_e32 v50, v28 +; SI-NEXT: v_mov_b32_e32 v48, v29 +; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_mov_b32_e32 v58, v27 +; SI-NEXT: v_mov_b32_e32 v56, v8 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 
offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: .LBB44_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 @@ -67126,41 +68097,45 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 @@ -67169,7 +68144,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -67178,7 +68153,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -67187,7 +68162,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -67196,7 +68171,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -67205,7 +68180,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -67214,7 +68189,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -67223,7 +68198,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -67232,7 +68207,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -67242,20 +68217,9 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67266,7 +68230,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67277,7 +68241,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67288,7 +68252,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67299,7 +68263,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, 
vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67310,7 +68274,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67319,9 +68283,9 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67331,8 +68295,8 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67343,7 +68307,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67354,7 +68318,7 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -67364,30 +68328,37 @@ define <64 x half> @bitcast_v32f32_to_v64f16(<32 x float> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -67620,7 +68591,8 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 ; SI-NEXT: s_lshr_b32 s4, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s4 ; SI-NEXT: s_lshr_b32 s4, s7, 16 @@ -67638,10 +68610,10 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s4 ; SI-NEXT: s_lshr_b32 s4, s15, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s4 ; SI-NEXT: s_lshr_b32 s4, s40, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, s4 ; SI-NEXT: s_lshr_b32 s4, s41, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s4 ; SI-NEXT: s_lshr_b32 s4, s42, 16 @@ -67661,7 +68633,6 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s28, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s4 ; SI-NEXT: s_lshr_b32 s4, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s4 ; SI-NEXT: s_lshr_b32 s4, s26, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v52, s4 @@ -67670,9 +68641,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s24, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v40, s4 ; SI-NEXT: s_lshr_b32 s4, s23, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v42, s4 ; SI-NEXT: s_lshr_b32 s4, s22, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v44, s4 @@ -67681,12 +68650,12 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_lshr_b32 s4, s20, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 ; SI-NEXT: s_lshr_b32 s4, s19, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v57, s4 +; SI-NEXT: 
v_cvt_f32_f16_e32 v1, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s18, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v58, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s4 ; SI-NEXT: s_lshr_b32 s4, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v60, s4 ; SI-NEXT: s_lshr_b32 s4, s16, 16 @@ -67694,11 +68663,12 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v5, s9 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v59, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v61, s11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v20, s15 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s11 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v22, s40 ; SI-NEXT: v_cvt_f32_f16_e32 v24, s41 ; SI-NEXT: v_cvt_f32_f16_e32 v26, s42 @@ -67713,127 +68683,136 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v16, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v53, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s23 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s21 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s21 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s20 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e64 v1, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f32_e64 v2, s19, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_add_f32_e64 v22, s40, 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e64 v21, s28, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v22 -; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_add_f32_e64 v24, s41, 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e64 v23, s29, 1.0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_add_f32_e64 v37, s10, 1.0 +; SI-NEXT: v_add_f32_e64 v45, s9, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e64 v41, s6, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s14, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s12, 1.0 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 -; SI-NEXT: v_add_f32_e64 v6, s21, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v19 +; SI-NEXT: v_add_f32_e64 v18, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v49, s8, 1.0 ; SI-NEXT: v_add_f32_e64 v10, s23, 1.0 ; SI-NEXT: v_add_f32_e64 v14, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s29, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_add_f32_e64 v21, s28, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v49 +; SI-NEXT: v_add_f32_e64 v53, s7, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 +; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s22, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_add_f32_e64 v12, s24, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 +; SI-NEXT: v_add_f32_e64 v16, s26, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v21 +; SI-NEXT: v_add_f32_e64 v25, s47, 1.0 ; SI-NEXT: v_add_f32_e64 v27, s46, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s45, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s43, 1.0 ; SI-NEXT: v_add_f32_e64 v26, s42, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v23 -; SI-NEXT: v_add_f32_e64 v25, s47, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 -; SI-NEXT: v_add_f32_e64 v53, s7, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v57 -; SI-NEXT: v_add_f32_e64 v49, s8, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s15, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v53 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: v_add_f32_e64 v1, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v28, s43, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v41 -; SI-NEXT: v_add_f32_e64 v45, s9, 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v10 +; SI-NEXT: 
v_cvt_f32_f16_e32 v14, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 ; SI-NEXT: v_add_f32_e64 v1, s16, 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v20 +; SI-NEXT: v_add_f32_e64 v17, s13, 1.0 ; SI-NEXT: v_add_f32_e64 v34, s11, 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v6 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v6 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v56 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 -; SI-NEXT: v_add_f32_e64 v4, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s45, 1.0 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e64 v30, s44, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s41, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s15, 1.0 -; SI-NEXT: v_add_f32_e64 v37, s10, 1.0 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; SI-NEXT: v_add_f32_e64 v22, s40, 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v37 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v22 +; SI-NEXT: buffer_store_dword 
v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 @@ -67841,13 +68820,11 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v15, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 @@ -67856,43 +68833,38 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 ; SI-NEXT: .LBB45_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen @@ -67905,7 +68877,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -67919,7 +68891,7 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -68016,20 +68988,22 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 @@ -68038,41 +69012,37 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -68092,7 +69062,7 @@ define inreg <64 x half> 
@bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -68118,23 +69088,21 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr40 @@ -68163,27 +69131,28 @@ define inreg <64 x half> @bitcast_v32f32_to_v64f16_scalar(<32 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v32f32_to_v64f16_scalar: @@ -71117,21 +72086,21 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_mov_b32_e32 v36, s16 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v35, s17 -; SI-NEXT: v_mov_b32_e32 v33, s18 -; SI-NEXT: v_mov_b32_e32 v32, s19 -; SI-NEXT: v_mov_b32_e32 v31, s20 -; SI-NEXT: v_mov_b32_e32 v29, s21 -; SI-NEXT: v_mov_b32_e32 v28, s22 +; SI-NEXT: v_mov_b32_e32 v31, s16 +; SI-NEXT: v_mov_b32_e32 v32, s17 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v30, s19 +; SI-NEXT: 
v_mov_b32_e32 v27, s20 +; SI-NEXT: v_mov_b32_e32 v28, s21 +; SI-NEXT: v_mov_b32_e32 v25, s22 ; SI-NEXT: v_mov_b32_e32 v26, s23 -; SI-NEXT: v_mov_b32_e32 v25, s24 +; SI-NEXT: v_mov_b32_e32 v23, s24 ; SI-NEXT: v_mov_b32_e32 v24, s25 -; SI-NEXT: v_mov_b32_e32 v22, s26 -; SI-NEXT: v_mov_b32_e32 v21, s27 -; SI-NEXT: v_mov_b32_e32 v20, s28 -; SI-NEXT: v_mov_b32_e32 v19, s29 +; SI-NEXT: v_mov_b32_e32 v21, s26 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v20, s29 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill @@ -71150,234 +72119,242 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v23, v18, v17, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v37, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v52, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v54, v19, v20, 16 -; SI-NEXT: v_alignbit_b32 v41, v21, v22, 16 -; SI-NEXT: v_alignbit_b32 v43, v24, v25, 16 -; SI-NEXT: v_alignbit_b32 v45, v26, v28, 16 -; SI-NEXT: v_alignbit_b32 v47, v29, v31, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v32, v33, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v35, v36, 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v29 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v32 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v35 +; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[36:37], v[15:16], 16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[21:22], 16 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:84 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[36:37], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[7:8], 16 +; SI-NEXT: v_mov_b32_e32 v53, v40 +; SI-NEXT: v_lshr_b64 v[39:40], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshr_b64 v[54:55], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[27:28], 16 +; SI-NEXT: v_mov_b32_e32 v55, v48 +; SI-NEXT: v_lshr_b64 v[48:49], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[42:43], v[29:30], 16 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v32 +; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[31:32], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_add_f32_e32 v36, 1.0, v36 -; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 -; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[36:37], v[11:12], 16 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[7:8], 16 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 ; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_lshr_b64 v[39:40], v[23:24], 16 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[40:41], v[25:26], 16 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 
-; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v23, v18, v17, 16 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v27, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v34, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v37, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v48, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v50, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v52, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v54, v19, v20, 16 -; SI-NEXT: v_alignbit_b32 v41, v21, v22, 16 -; SI-NEXT: v_alignbit_b32 v43, v24, v25, 16 -; SI-NEXT: v_alignbit_b32 v45, v26, v28, 16 -; SI-NEXT: v_alignbit_b32 v47, v29, v31, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v32, v33, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v35, v36, 16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[33:34], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[27:28], 16 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_add_f32_e32 v31, 1.0, v31 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[42:43], v[29:30], 16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[54:55], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v28 +; 
SI-NEXT: v_lshrrev_b32_e32 v46, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v32 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_and_b32_e32 v36, 0xffff, v36 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 -; SI-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v36, v36, v60 -; SI-NEXT: v_or_b32_e32 v23, v35, v23 -; SI-NEXT: v_add_i32_e32 v35, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v23, v35, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v58 -; SI-NEXT: v_or_b32_e32 v23, v23, v33 -; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v23, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v63 -; SI-NEXT: v_or_b32_e32 v23, v23, v32 -; SI-NEXT: v_add_i32_e32 v32, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v23, v32, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v47 -; SI-NEXT: v_or_b32_e32 v23, v23, v31 -; SI-NEXT: v_add_i32_e32 v31, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v23, v31, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v62 -; SI-NEXT: v_or_b32_e32 v23, v23, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v23, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v28 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v43 +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v31, v31, v50 +; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v31, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v35 +; SI-NEXT: v_or_b32_e32 v31, v31, v32 +; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v42 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v29, v29, v31 +; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v46 +; SI-NEXT: v_or_b32_e32 v29, v29, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v41 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v29 +; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v45 -; SI-NEXT: v_or_b32_e32 v23, v23, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v23, v28, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v61 -; SI-NEXT: v_or_b32_e32 v23, v23, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v34 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v39 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v23, v23, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v33 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v41 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v57 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v54 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v49 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v62 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: 
v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -71389,7 +72366,7 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -71401,57 +72378,64 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -71474,39 +72458,44 @@ define inreg <64 x i16> @bitcast_v32f32_to_v64i16_scalar(<32 x float> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; kill: killed $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr50 +; 
SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v32f32_to_v64i16_scalar: @@ -72560,179 +73549,162 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v56, v10 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v8 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v38, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: 
v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; 
SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB51_4 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB51_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v7, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_or_b32_e32 v10, 
v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v11, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v12, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v13, v0, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v19, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_or_b32_e32 v20, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v22, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v24, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v26, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v27, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v27, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_mov_b32_e32 v33, v28 ; SI-NEXT: v_or_b32_e32 v28, v0, v5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_or_b32_e32 v29, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v29, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 
s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 @@ -72740,15 +73712,18 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_or_b32_e32 v30, v0, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_or_b32_e32 v8, v1, v55 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_or_b32_e32 v8, v1, v56 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_or_b32_e32 v31, v0, v31 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -72756,14 +73731,40 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_cbranch_execnz .LBB51_3 -; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB51_3 +; SI-NEXT: .LBB51_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB51_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: s_cbranch_vccnz .LBB51_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -72804,143 +73805,143 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, 
v38, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -72949,7 +73950,7 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB51_3: ; %end +; SI-NEXT: .LBB51_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -72968,35 +73969,6 @@ define inreg <32 x float> @bitcast_v64i16_to_v32f32_scalar(<64 x i16> inreg 
%a, ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB51_4: -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v39, v23 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v64i16_to_v32f32_scalar: ; VI: ; %bb.0: @@ -78979,894 +79951,1230 @@ define inreg <128 x i8> @bitcast_v16i64_to_v128i8_scalar(<16 x i64> inreg %a, i3 ; SI-LABEL: bitcast_v16i64_to_v128i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v41, s30, 0 -; SI-NEXT: v_writelane_b32 v41, s31, 1 -; SI-NEXT: v_writelane_b32 v41, s34, 2 -; SI-NEXT: v_writelane_b32 v41, s35, 3 -; SI-NEXT: v_writelane_b32 v41, s36, 4 -; SI-NEXT: v_writelane_b32 v41, s37, 5 -; SI-NEXT: v_writelane_b32 v41, s38, 6 -; SI-NEXT: v_writelane_b32 v41, s39, 7 -; SI-NEXT: v_writelane_b32 v41, s48, 8 -; SI-NEXT: v_writelane_b32 v41, s49, 9 -; SI-NEXT: v_writelane_b32 v41, s50, 10 -; SI-NEXT: v_writelane_b32 v41, s51, 11 -; SI-NEXT: v_writelane_b32 v41, s52, 12 -; SI-NEXT: v_writelane_b32 v41, s53, 13 -; SI-NEXT: v_writelane_b32 v41, s54, 14 -; SI-NEXT: v_writelane_b32 v41, s55, 15 -; SI-NEXT: v_writelane_b32 v41, s64, 16 -; SI-NEXT: v_writelane_b32 v41, s65, 17 -; SI-NEXT: v_writelane_b32 v41, s66, 18 -; SI-NEXT: v_writelane_b32 v41, s67, 19 -; SI-NEXT: v_writelane_b32 v41, s68, 20 -; SI-NEXT: v_writelane_b32 v41, s69, 21 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_writelane_b32 v20, s49, 9 +; SI-NEXT: v_writelane_b32 v20, s50, 10 +; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_writelane_b32 
v20, s52, 12 +; SI-NEXT: v_writelane_b32 v20, s53, 13 +; SI-NEXT: v_writelane_b32 v20, s54, 14 +; SI-NEXT: v_writelane_b32 v20, s55, 15 +; SI-NEXT: v_writelane_b32 v20, s64, 16 +; SI-NEXT: v_writelane_b32 v20, s65, 17 +; SI-NEXT: v_writelane_b32 v20, s66, 18 +; SI-NEXT: v_writelane_b32 v20, s67, 19 +; SI-NEXT: v_writelane_b32 v20, s68, 20 +; SI-NEXT: v_writelane_b32 v20, s69, 21 +; SI-NEXT: v_writelane_b32 v20, s70, 22 +; SI-NEXT: v_writelane_b32 v20, s71, 23 +; SI-NEXT: v_writelane_b32 v20, s80, 24 +; SI-NEXT: v_writelane_b32 v20, s81, 25 +; SI-NEXT: v_writelane_b32 v20, s82, 26 +; SI-NEXT: v_writelane_b32 v20, s83, 27 +; SI-NEXT: v_writelane_b32 v20, s84, 28 +; SI-NEXT: v_writelane_b32 v20, s85, 29 +; SI-NEXT: v_writelane_b32 v20, s86, 30 +; SI-NEXT: v_writelane_b32 v20, s87, 31 +; SI-NEXT: v_writelane_b32 v20, s96, 32 +; SI-NEXT: v_writelane_b32 v20, s97, 33 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_writelane_b32 v41, s70, 22 -; SI-NEXT: v_readfirstlane_b32 s47, v1 -; SI-NEXT: v_readfirstlane_b32 s46, v2 -; SI-NEXT: v_readfirstlane_b32 s45, v3 -; SI-NEXT: v_readfirstlane_b32 s44, v4 -; SI-NEXT: v_readfirstlane_b32 s43, v5 -; SI-NEXT: v_readfirstlane_b32 s42, v6 -; SI-NEXT: v_readfirstlane_b32 s41, v7 -; SI-NEXT: v_readfirstlane_b32 s40, v8 -; SI-NEXT: v_readfirstlane_b32 s15, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s13, v11 -; SI-NEXT: v_readfirstlane_b32 s12, v12 -; SI-NEXT: v_readfirstlane_b32 s11, v13 -; SI-NEXT: v_readfirstlane_b32 s10, v14 -; SI-NEXT: v_readfirstlane_b32 s9, v15 -; SI-NEXT: v_readfirstlane_b32 s8, v16 -; SI-NEXT: v_readfirstlane_b32 s7, v17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v18 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_writelane_b32 v41, s71, 23 +; SI-NEXT: v_writelane_b32 v20, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s44, v1 +; SI-NEXT: v_readfirstlane_b32 s45, v2 +; SI-NEXT: v_readfirstlane_b32 s42, v3 +; SI-NEXT: v_readfirstlane_b32 s43, v4 +; SI-NEXT: v_readfirstlane_b32 s40, v5 +; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_and_b64 s[46:47], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: v_writelane_b32 v20, s99, 35 +; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane +; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v22, s45 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: v_mov_b32_e32 v9, s11 -; SI-NEXT: v_mov_b32_e32 v12, s13 -; SI-NEXT: v_mov_b32_e32 v15, s15 -; SI-NEXT: v_mov_b32_e32 v18, s41 -; SI-NEXT: v_mov_b32_e32 v21, s43 -; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 -; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 -; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 -; SI-NEXT: v_mov_b32_e32 v22, s47 -; SI-NEXT: v_mov_b32_e32 v23, s28 -; SI-NEXT: v_mov_b32_e32 v29, s26 -; SI-NEXT: v_mov_b32_e32 v35, s24 -; SI-NEXT: v_mov_b32_e32 v39, s22 -; SI-NEXT: v_mov_b32_e32 v50, s20 -; SI-NEXT: v_mov_b32_e32 v53, s18 -; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 -; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 -; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 -; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 -; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s10, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 -; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 -; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 -; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 -; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 -; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 -; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 -; SI-NEXT: v_alignbit_b32 v16, s40, v18, 24 -; SI-NEXT: v_alignbit_b32 v17, s40, v18, 16 -; SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 -; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 -; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 -; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 -; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 -; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 -; SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 -; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 -; SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 -; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 -; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 -; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 -; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 -; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 -; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 -; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 -; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 -; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 -; SI-NEXT: v_alignbit_b32 v39, s23, v39, 8 -; SI-NEXT: v_alignbit_b32 v48, s21, v50, 24 -; SI-NEXT: v_alignbit_b32 v49, s21, v50, 16 -; SI-NEXT: v_alignbit_b32 v50, s21, v50, 8 -; SI-NEXT: v_alignbit_b32 v51, s19, v53, 24 -; SI-NEXT: v_alignbit_b32 v52, s19, v53, 16 -; SI-NEXT: v_alignbit_b32 v53, s19, v53, 8 -; SI-NEXT: v_alignbit_b32 v54, s17, v40, 24 -; SI-NEXT: v_alignbit_b32 v55, s17, v40, 16 -; SI-NEXT: v_alignbit_b32 v40, s17, v40, 8 -; SI-NEXT: s_lshr_b32 s56, s6, 24 -; SI-NEXT: s_lshr_b32 s57, s6, 16 -; SI-NEXT: s_lshr_b32 s58, s6, 8 -; SI-NEXT: s_lshr_b32 s59, s8, 24 -; SI-NEXT: s_lshr_b32 s60, s8, 16 -; SI-NEXT: s_lshr_b32 s61, s8, 8 -; SI-NEXT: s_lshr_b32 s62, s10, 24 -; SI-NEXT: s_lshr_b32 s63, s10, 16 -; SI-NEXT: s_lshr_b32 s72, s10, 8 -; SI-NEXT: s_lshr_b32 s73, s12, 24 -; SI-NEXT: s_lshr_b32 s74, s12, 16 -; SI-NEXT: s_lshr_b32 s75, s12, 8 -; SI-NEXT: s_lshr_b32 s76, s14, 24 -; SI-NEXT: s_lshr_b32 s77, s14, 16 -; SI-NEXT: s_lshr_b32 s78, s14, 8 -; SI-NEXT: s_lshr_b32 s79, s40, 24 -; SI-NEXT: s_lshr_b32 s88, s40, 16 -; SI-NEXT: s_lshr_b32 s89, s40, 8 -; SI-NEXT: s_lshr_b32 s90, s42, 24 -; SI-NEXT: s_lshr_b32 s91, s42, 16 -; SI-NEXT: s_lshr_b32 s92, s42, 8 -; SI-NEXT: s_lshr_b32 s93, s44, 24 -; SI-NEXT: s_lshr_b32 s94, s44, 16 -; SI-NEXT: s_lshr_b32 s95, s44, 8 -; SI-NEXT: s_lshr_b32 s30, s46, 24 -; SI-NEXT: s_lshr_b32 s31, s46, 16 -; SI-NEXT: s_lshr_b32 s34, s46, 8 -; SI-NEXT: s_lshr_b32 s35, s29, 24 -; SI-NEXT: s_lshr_b32 s36, s29, 16 -; SI-NEXT: s_lshr_b32 s37, s29, 8 -; SI-NEXT: s_lshr_b32 s38, s27, 24 -; SI-NEXT: s_lshr_b32 s39, s27, 16 -; SI-NEXT: s_lshr_b32 s48, s27, 8 -; SI-NEXT: s_lshr_b32 s49, s25, 24 -; SI-NEXT: s_lshr_b32 s50, s25, 16 -; SI-NEXT: s_lshr_b32 s51, s25, 8 -; SI-NEXT: s_lshr_b32 s52, s23, 24 -; SI-NEXT: s_lshr_b32 s53, s23, 16 -; SI-NEXT: s_lshr_b32 s54, s23, 8 -; SI-NEXT: s_lshr_b32 s55, s21, 24 -; SI-NEXT: s_lshr_b32 s64, s21, 16 -; SI-NEXT: s_lshr_b32 s65, s21, 8 -; SI-NEXT: s_lshr_b32 s66, 
s19, 24 -; SI-NEXT: s_lshr_b32 s67, s19, 16 -; SI-NEXT: s_lshr_b32 s68, s19, 8 -; SI-NEXT: s_lshr_b32 s69, s17, 24 -; SI-NEXT: s_lshr_b32 s70, s17, 16 -; SI-NEXT: s_lshr_b32 s71, s17, 8 +; SI-NEXT: s_lshr_b32 s46, s5, 24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v22, s46, 40 +; SI-NEXT: s_lshr_b32 s46, s5, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 41 +; SI-NEXT: s_lshr_b32 s46, s5, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 42 +; SI-NEXT: s_lshr_b32 s46, s7, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 43 +; SI-NEXT: s_lshr_b32 s46, s7, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 44 +; SI-NEXT: s_lshr_b32 s46, s7, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 45 +; SI-NEXT: s_lshr_b32 s46, s9, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 46 +; SI-NEXT: s_lshr_b32 s46, s9, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 47 +; SI-NEXT: s_lshr_b32 s46, s9, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 48 +; SI-NEXT: s_lshr_b32 s46, s11, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 49 +; SI-NEXT: s_lshr_b32 s46, s11, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 50 +; SI-NEXT: s_lshr_b32 s46, s11, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 51 +; SI-NEXT: s_lshr_b32 s46, s13, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 52 +; SI-NEXT: s_lshr_b32 s46, s13, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 53 +; SI-NEXT: s_lshr_b32 s46, s13, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 54 +; SI-NEXT: s_lshr_b32 s46, s15, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 55 +; SI-NEXT: s_lshr_b32 s46, s15, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 56 +; SI-NEXT: s_lshr_b32 s46, s15, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 57 +; SI-NEXT: s_lshr_b32 s46, s41, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 58 +; SI-NEXT: s_lshr_b32 s46, s41, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 59 +; SI-NEXT: s_lshr_b32 s46, s41, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 60 +; SI-NEXT: s_lshr_b32 s46, s43, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 61 +; SI-NEXT: s_lshr_b32 s46, s43, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 62 +; SI-NEXT: s_lshr_b32 s46, s43, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 63 +; SI-NEXT: s_lshr_b32 s46, s45, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 0 +; SI-NEXT: s_lshr_b32 s46, s45, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 1 +; SI-NEXT: s_lshr_b32 s46, s45, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 2 +; SI-NEXT: s_lshr_b32 s46, s29, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 3 +; SI-NEXT: s_lshr_b32 s46, s29, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 4 +; SI-NEXT: s_lshr_b32 s46, s29, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 5 +; SI-NEXT: s_lshr_b32 s46, s27, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 6 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 7 +; SI-NEXT: s_lshr_b32 s46, s27, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 8 +; SI-NEXT: s_lshr_b32 s46, s25, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 9 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 10 +; SI-NEXT: s_lshr_b32 s46, s25, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 11 +; SI-NEXT: s_lshr_b32 s46, s23, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 12 +; SI-NEXT: s_lshr_b32 s46, s23, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 13 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 14 +; SI-NEXT: s_lshr_b32 s46, s21, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 15 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 17 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 38 +; SI-NEXT: v_writelane_b32 v22, s47, 39 +; 
SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 36 +; SI-NEXT: v_writelane_b32 v22, s47, 37 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 34 +; SI-NEXT: v_writelane_b32 v22, s47, 35 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 32 +; SI-NEXT: v_writelane_b32 v22, s47, 33 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 30 +; SI-NEXT: v_writelane_b32 v22, s47, 31 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 28 +; SI-NEXT: v_writelane_b32 v22, s47, 29 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 26 +; SI-NEXT: v_writelane_b32 v22, s47, 27 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 24 +; SI-NEXT: v_writelane_b32 v22, s47, 25 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 22 +; SI-NEXT: v_writelane_b32 v22, s47, 23 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 20 +; SI-NEXT: v_writelane_b32 v22, s47, 21 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 18 +; SI-NEXT: v_writelane_b32 v22, s47, 19 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 16 +; SI-NEXT: v_writelane_b32 v22, s47, 17 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 14 +; SI-NEXT: v_writelane_b32 v22, s47, 15 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 12 +; SI-NEXT: v_writelane_b32 v22, s47, 13 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 10 +; SI-NEXT: v_writelane_b32 v22, s47, 11 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 8 +; SI-NEXT: v_writelane_b32 v22, s47, 9 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 6 +; SI-NEXT: v_writelane_b32 v22, s47, 7 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 4 +; SI-NEXT: v_writelane_b32 v22, s47, 5 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 2 +; SI-NEXT: v_writelane_b32 v22, s47, 3 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 0 +; SI-NEXT: s_lshr_b32 s49, s19, 24 +; SI-NEXT: s_lshr_b32 s48, s19, 16 +; SI-NEXT: s_lshr_b32 s50, s19, 8 +; SI-NEXT: s_lshr_b32 s51, s17, 24 +; SI-NEXT: s_lshr_b32 s52, s17, 16 +; SI-NEXT: s_lshr_b32 s53, s17, 8 +; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v22, s47, 1 +; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[84:85], s[28:29], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[96:97], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[98:99], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; SI-NEXT: s_lshr_b64 
s[88:89], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_add_u32 s44, s44, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 ; SI-NEXT: s_add_u32 s28, s28, 3 ; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s47, s47, 3 -; SI-NEXT: s_addc_u32 s46, s46, 0 -; SI-NEXT: s_add_u32 s45, s45, 3 -; SI-NEXT: s_addc_u32 s44, s44, 0 -; SI-NEXT: s_add_u32 s43, s43, 3 -; SI-NEXT: s_addc_u32 s42, s42, 0 -; SI-NEXT: s_add_u32 s41, s41, 3 -; SI-NEXT: s_addc_u32 s40, s40, 0 -; SI-NEXT: s_add_u32 s15, s15, 3 -; SI-NEXT: s_addc_u32 s14, s14, 0 -; SI-NEXT: s_add_u32 s13, s13, 3 -; SI-NEXT: s_addc_u32 s12, s12, 0 -; SI-NEXT: s_add_u32 s11, s11, 3 -; SI-NEXT: s_addc_u32 s10, s10, 0 -; SI-NEXT: s_add_u32 s9, s9, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: v_mov_b32_e32 v22, s45 -; SI-NEXT: s_addc_u32 s6, s6, 0 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: v_mov_b32_e32 v9, s11 -; SI-NEXT: v_mov_b32_e32 v12, s13 -; SI-NEXT: v_mov_b32_e32 v15, s15 -; SI-NEXT: v_mov_b32_e32 v18, s41 -; SI-NEXT: v_mov_b32_e32 v21, s43 -; SI-NEXT: v_alignbit_b32 v24, s44, v22, 24 -; SI-NEXT: v_alignbit_b32 v25, s44, v22, 16 -; SI-NEXT: v_alignbit_b32 v26, s44, v22, 8 -; SI-NEXT: v_mov_b32_e32 v22, s47 -; SI-NEXT: v_mov_b32_e32 v23, s28 -; SI-NEXT: v_mov_b32_e32 v29, s26 -; SI-NEXT: v_mov_b32_e32 v35, s24 -; SI-NEXT: v_mov_b32_e32 v39, s22 -; SI-NEXT: v_mov_b32_e32 v50, s20 -; SI-NEXT: v_mov_b32_e32 v53, s18 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 -; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 -; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 -; SI-NEXT: v_alignbit_b32 v4, s8, v6, 24 -; SI-NEXT: v_alignbit_b32 v5, s8, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, s8, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s10, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s10, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s10, v9, 8 -; SI-NEXT: v_alignbit_b32 v10, s12, v12, 24 -; SI-NEXT: v_alignbit_b32 v11, s12, v12, 16 -; SI-NEXT: v_alignbit_b32 v12, s12, v12, 8 -; SI-NEXT: v_alignbit_b32 v13, s14, v15, 24 -; SI-NEXT: v_alignbit_b32 v14, s14, v15, 16 -; SI-NEXT: v_alignbit_b32 v15, s14, v15, 8 -; SI-NEXT: v_alignbit_b32 v16, s40, v18, 24 -; SI-NEXT: v_alignbit_b32 
v17, s40, v18, 16 -; SI-NEXT: v_alignbit_b32 v18, s40, v18, 8 -; SI-NEXT: v_alignbit_b32 v19, s42, v21, 24 -; SI-NEXT: v_alignbit_b32 v20, s42, v21, 16 -; SI-NEXT: v_alignbit_b32 v21, s42, v21, 8 -; SI-NEXT: v_alignbit_b32 v30, s46, v22, 24 -; SI-NEXT: v_alignbit_b32 v31, s46, v22, 16 -; SI-NEXT: v_alignbit_b32 v32, s46, v22, 8 -; SI-NEXT: v_alignbit_b32 v36, s29, v23, 24 -; SI-NEXT: v_alignbit_b32 v22, s29, v23, 16 -; SI-NEXT: v_alignbit_b32 v23, s29, v23, 8 -; SI-NEXT: v_alignbit_b32 v27, s27, v29, 24 -; SI-NEXT: v_alignbit_b32 v28, s27, v29, 16 -; SI-NEXT: v_alignbit_b32 v29, s27, v29, 8 -; SI-NEXT: v_alignbit_b32 v33, s25, v35, 24 -; SI-NEXT: v_alignbit_b32 v34, s25, v35, 16 -; SI-NEXT: v_alignbit_b32 v35, s25, v35, 8 -; SI-NEXT: v_alignbit_b32 v37, s23, v39, 24 -; SI-NEXT: v_alignbit_b32 v38, s23, v39, 16 -; SI-NEXT: v_alignbit_b32 v39, s23, v39, 8 -; SI-NEXT: v_alignbit_b32 v48, s21, v50, 24 -; SI-NEXT: v_alignbit_b32 v49, s21, v50, 16 -; SI-NEXT: v_alignbit_b32 v50, s21, v50, 8 -; SI-NEXT: v_alignbit_b32 v51, s19, v53, 24 -; SI-NEXT: v_alignbit_b32 v52, s19, v53, 16 -; SI-NEXT: v_alignbit_b32 v53, s19, v53, 8 -; SI-NEXT: v_alignbit_b32 v54, s17, v40, 24 -; SI-NEXT: v_alignbit_b32 v55, s17, v40, 16 -; SI-NEXT: v_alignbit_b32 v40, s17, v40, 8 -; SI-NEXT: s_lshr_b32 s56, s6, 24 -; SI-NEXT: s_lshr_b32 s57, s6, 16 -; SI-NEXT: s_lshr_b32 s58, s6, 8 -; SI-NEXT: s_lshr_b32 s59, s8, 24 -; SI-NEXT: s_lshr_b32 s60, s8, 16 -; SI-NEXT: s_lshr_b32 s61, s8, 8 -; SI-NEXT: s_lshr_b32 s62, s10, 24 -; SI-NEXT: s_lshr_b32 s63, s10, 16 -; SI-NEXT: s_lshr_b32 s72, s10, 8 -; SI-NEXT: s_lshr_b32 s73, s12, 24 -; SI-NEXT: s_lshr_b32 s74, s12, 16 -; SI-NEXT: s_lshr_b32 s75, s12, 8 -; SI-NEXT: s_lshr_b32 s76, s14, 24 -; SI-NEXT: s_lshr_b32 s77, s14, 16 -; SI-NEXT: s_lshr_b32 s78, s14, 8 -; SI-NEXT: s_lshr_b32 s79, s40, 24 -; SI-NEXT: s_lshr_b32 s88, s40, 16 -; SI-NEXT: s_lshr_b32 s89, s40, 8 -; SI-NEXT: s_lshr_b32 s90, s42, 24 -; SI-NEXT: s_lshr_b32 s91, s42, 16 -; SI-NEXT: s_lshr_b32 s92, s42, 8 -; SI-NEXT: s_lshr_b32 s93, s44, 24 -; SI-NEXT: s_lshr_b32 s94, s44, 16 -; SI-NEXT: s_lshr_b32 s95, s44, 8 -; SI-NEXT: s_lshr_b32 s30, s46, 24 -; SI-NEXT: s_lshr_b32 s31, s46, 16 -; SI-NEXT: s_lshr_b32 s34, s46, 8 -; SI-NEXT: s_lshr_b32 s35, s29, 24 -; SI-NEXT: s_lshr_b32 s36, s29, 16 -; SI-NEXT: s_lshr_b32 s37, s29, 8 -; SI-NEXT: s_lshr_b32 s38, s27, 24 -; SI-NEXT: s_lshr_b32 s39, s27, 16 -; SI-NEXT: s_lshr_b32 s48, s27, 8 -; SI-NEXT: s_lshr_b32 s49, s25, 24 -; SI-NEXT: s_lshr_b32 s50, s25, 16 -; SI-NEXT: s_lshr_b32 s51, s25, 8 -; SI-NEXT: s_lshr_b32 s52, s23, 24 -; SI-NEXT: s_lshr_b32 s53, s23, 16 -; SI-NEXT: s_lshr_b32 s54, s23, 8 -; SI-NEXT: s_lshr_b32 s55, s21, 24 -; SI-NEXT: s_lshr_b32 s64, s21, 16 -; SI-NEXT: s_lshr_b32 s65, s21, 8 -; SI-NEXT: s_lshr_b32 s66, s19, 24 -; SI-NEXT: s_lshr_b32 s67, s19, 16 -; SI-NEXT: s_lshr_b32 s68, s19, 8 -; SI-NEXT: s_lshr_b32 s69, s17, 24 -; SI-NEXT: s_lshr_b32 s70, s17, 16 -; SI-NEXT: s_lshr_b32 s71, s17, 8 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s46, s5, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 40 +; SI-NEXT: s_lshr_b32 s46, s5, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 41 +; SI-NEXT: 
s_lshr_b32 s46, s5, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 42 +; SI-NEXT: s_lshr_b32 s46, s7, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 43 +; SI-NEXT: s_lshr_b32 s46, s7, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 44 +; SI-NEXT: s_lshr_b32 s46, s7, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 45 +; SI-NEXT: s_lshr_b32 s46, s9, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 46 +; SI-NEXT: s_lshr_b32 s46, s9, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 47 +; SI-NEXT: s_lshr_b32 s46, s9, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 48 +; SI-NEXT: s_lshr_b32 s46, s11, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 49 +; SI-NEXT: s_lshr_b32 s46, s11, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 50 +; SI-NEXT: s_lshr_b32 s46, s11, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 51 +; SI-NEXT: s_lshr_b32 s46, s13, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 52 +; SI-NEXT: s_lshr_b32 s46, s13, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 53 +; SI-NEXT: s_lshr_b32 s46, s13, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 54 +; SI-NEXT: s_lshr_b32 s46, s15, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 55 +; SI-NEXT: s_lshr_b32 s46, s15, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 56 +; SI-NEXT: s_lshr_b32 s46, s15, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 57 +; SI-NEXT: s_lshr_b32 s46, s41, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 58 +; SI-NEXT: s_lshr_b32 s46, s41, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 59 +; SI-NEXT: s_lshr_b32 s46, s41, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 60 +; SI-NEXT: s_lshr_b32 s46, s43, 24 +; SI-NEXT: v_writelane_b32 v22, s46, 61 +; SI-NEXT: s_lshr_b32 s46, s43, 16 +; SI-NEXT: v_writelane_b32 v22, s46, 62 +; SI-NEXT: s_lshr_b32 s46, s43, 8 +; SI-NEXT: v_writelane_b32 v22, s46, 63 +; SI-NEXT: s_lshr_b32 s46, s45, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 0 +; SI-NEXT: s_lshr_b32 s46, s45, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 1 +; SI-NEXT: s_lshr_b32 s46, s45, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 2 +; SI-NEXT: s_lshr_b32 s46, s29, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 3 +; SI-NEXT: s_lshr_b32 s46, s29, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 4 +; SI-NEXT: s_lshr_b32 s46, s29, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 5 +; SI-NEXT: s_lshr_b32 s46, s27, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 6 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 7 +; SI-NEXT: s_lshr_b32 s46, s27, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 8 +; SI-NEXT: s_lshr_b32 s46, s25, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 9 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 10 +; SI-NEXT: s_lshr_b32 s46, s25, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 11 +; SI-NEXT: s_lshr_b32 s46, s23, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 12 +; SI-NEXT: s_lshr_b32 s46, s23, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 13 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 14 +; SI-NEXT: s_lshr_b32 s46, s21, 24 +; SI-NEXT: v_writelane_b32 v21, s46, 15 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: v_writelane_b32 v21, s46, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 8 +; SI-NEXT: v_writelane_b32 v21, s46, 17 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 38 +; SI-NEXT: v_writelane_b32 v22, s47, 39 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 36 +; SI-NEXT: v_writelane_b32 v22, s47, 37 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 34 +; SI-NEXT: v_writelane_b32 v22, s47, 35 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 32 +; SI-NEXT: v_writelane_b32 v22, s47, 33 +; SI-NEXT: 
s_lshr_b64 s[46:47], s[6:7], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 30 +; SI-NEXT: v_writelane_b32 v22, s47, 31 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 28 +; SI-NEXT: v_writelane_b32 v22, s47, 29 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 26 +; SI-NEXT: v_writelane_b32 v22, s47, 27 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 24 +; SI-NEXT: v_writelane_b32 v22, s47, 25 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 22 +; SI-NEXT: v_writelane_b32 v22, s47, 23 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 20 +; SI-NEXT: v_writelane_b32 v22, s47, 21 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 18 +; SI-NEXT: v_writelane_b32 v22, s47, 19 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 16 +; SI-NEXT: v_writelane_b32 v22, s47, 17 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 14 +; SI-NEXT: v_writelane_b32 v22, s47, 15 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 12 +; SI-NEXT: v_writelane_b32 v22, s47, 13 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 10 +; SI-NEXT: v_writelane_b32 v22, s47, 11 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 8 +; SI-NEXT: v_writelane_b32 v22, s47, 9 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 6 +; SI-NEXT: v_writelane_b32 v22, s47, 7 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 +; SI-NEXT: v_writelane_b32 v22, s46, 4 +; SI-NEXT: v_writelane_b32 v22, s47, 5 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v22, s46, 2 +; SI-NEXT: v_writelane_b32 v22, s47, 3 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 8 +; SI-NEXT: v_writelane_b32 v22, s46, 0 +; SI-NEXT: s_lshr_b32 s49, s19, 24 +; SI-NEXT: s_lshr_b32 s48, s19, 16 +; SI-NEXT: s_lshr_b32 s50, s19, 8 +; SI-NEXT: s_lshr_b32 s51, s17, 24 +; SI-NEXT: s_lshr_b32 s52, s17, 16 +; SI-NEXT: s_lshr_b32 s53, s17, 8 +; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v22, s47, 1 +; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[84:85], s[28:29], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[96:97], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[98:99], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 ; SI-NEXT: .LBB57_3: ; %end -; SI-NEXT: s_and_b32 
s4, s16, 0xff -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 -; SI-NEXT: v_or_b32_e32 v40, s4, v40 -; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s71, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s70, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s69, 24 -; SI-NEXT: v_and_b32_e32 v55, 0xff, v55 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v54 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_or_b32_e32 v54, v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v53 -; SI-NEXT: v_or_b32_e32 v53, s4, v53 -; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s5, s68, 8 -; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s67, 0xff -; SI-NEXT: v_and_b32_e32 v40, 0xffff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v51 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s66, 24 -; SI-NEXT: v_or_b32_e32 v54, v40, v54 -; SI-NEXT: v_and_b32_e32 v53, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v51, v51, v52 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: buffer_store_dword v54, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v54, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v51, v53, v51 -; SI-NEXT: v_add_i32_e32 v52, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v55, v54, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v51, v52, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v52, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v50, 8, v50 -; SI-NEXT: v_or_b32_e32 v50, s4, v50 -; SI-NEXT: s_and_b32 s4, s21, 0xff -; SI-NEXT: s_lshl_b32 s5, s65, 8 -; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s64, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v48 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s55, 24 -; SI-NEXT: v_and_b32_e32 v50, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v48, v48, v49 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_lshl_b32 s47, s38, 8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_or_b32 s16, s16, s47 +; SI-NEXT: s_and_b32 s47, s36, 0xff +; SI-NEXT: s_lshl_b32 s57, s34, 24 +; SI-NEXT: s_lshl_b32 s47, s47, 16 +; SI-NEXT: s_or_b32 s47, s57, s47 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s47 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xff +; SI-NEXT: s_lshl_b32 s17, s53, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s52, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s47, s51, 24 +; SI-NEXT: s_or_b32 s17, s47, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s16, s30, 8 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s94, 0xff +; SI-NEXT: s_lshl_b32 s18, s92, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: s_and_b32 s16, s19, 0xff +; SI-NEXT: s_lshl_b32 s17, s50, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s48, 0xff +; SI-NEXT: 
s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s49, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: s_lshl_b32 s16, s90, 8 +; SI-NEXT: s_and_b32 s17, s20, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s88, 0xff +; SI-NEXT: s_lshl_b32 s18, s78, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 17 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: s_and_b32 s16, s21, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 16 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 15 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: s_lshl_b32 s16, s76, 8 +; SI-NEXT: s_and_b32 s17, s22, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s74, 0xff +; SI-NEXT: s_lshl_b32 s18, s72, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 14 +; SI-NEXT: v_mov_b32_e32 v7, s16 +; SI-NEXT: s_and_b32 s16, s23, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 13 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 12 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: s_lshl_b32 s16, s62, 8 +; SI-NEXT: s_and_b32 s17, s24, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s60, 0xff +; SI-NEXT: s_lshl_b32 s18, s58, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 11 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: s_and_b32 s16, s25, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 10 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 9 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: s_lshl_b32 s16, s56, 8 +; SI-NEXT: s_and_b32 s17, s26, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s46, 0xff +; SI-NEXT: s_lshl_b32 s18, s98, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 8 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: s_and_b32 s16, s27, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 7 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 6 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v12, s16 
+; SI-NEXT: s_lshl_b32 s16, s96, 8 +; SI-NEXT: s_and_b32 s17, s28, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s86, 0xff +; SI-NEXT: s_lshl_b32 s18, s84, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 5 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: s_and_b32 s16, s29, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 4 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 3 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v14, s16 +; SI-NEXT: s_lshl_b32 s16, s82, 8 +; SI-NEXT: s_and_b32 s17, s44, 0xff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s80, 0xff +; SI-NEXT: s_lshl_b32 s18, s70, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 2 +; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: s_and_b32 s16, s45, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v21, 1 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 0 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v48, v50, v48 -; SI-NEXT: v_add_i32_e32 v49, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v52, v51, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v48, v49, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v49, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v39 -; SI-NEXT: v_or_b32_e32 v39, s4, v39 -; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s54, 8 -; SI-NEXT: v_and_b32_e32 v38, 0xff, v38 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s53, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v37 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s52, 24 -; SI-NEXT: v_and_b32_e32 v39, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v37, v37, v38 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: s_lshl_b32 s16, s68, 8 +; SI-NEXT: s_and_b32 s17, s42, 0xff +; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s66, 0xff +; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_lshl_b32 s18, s64, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: 
s_and_b32 s16, s16, 0xffff +; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: v_readlane_b32 s17, v22, 63 +; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s43, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v22, 62 +; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 61 +; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_readlane_b32 s18, v22, 0 +; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_readlane_b32 s19, v22, 1 +; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_lshl_b32 s17, s18, 8 +; SI-NEXT: v_readlane_b32 s18, v22, 2 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v48, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v37, v39, v37 -; SI-NEXT: v_add_i32_e32 v38, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v49, v48, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v37, v38, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v38, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v35 -; SI-NEXT: v_or_b32_e32 v35, s4, v35 -; SI-NEXT: s_and_b32 s4, s25, 0xff -; SI-NEXT: s_lshl_b32 s5, s51, 8 -; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s50, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s49, 24 -; SI-NEXT: v_and_b32_e32 v35, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xff +; SI-NEXT: v_readlane_b32 s19, v22, 3 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 4 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v22, 60 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v33, v35, v33 -; SI-NEXT: v_add_i32_e32 v34, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v38, v37, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v34, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; SI-NEXT: v_or_b32_e32 v29, 
s4, v29 -; SI-NEXT: s_and_b32 s4, s27, 0xff -; SI-NEXT: s_lshl_b32 s5, s48, 8 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s39, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s38, 24 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v22, 59 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 58 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v27, v29, v27 -; SI-NEXT: v_add_i32_e32 v28, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v34, v33, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v28, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; SI-NEXT: v_or_b32_e32 v23, s4, v23 -; SI-NEXT: s_and_b32 s4, s29, 0xff -; SI-NEXT: s_lshl_b32 s5, s37, 8 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_readlane_b32 s16, v22, 6 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: v_readlane_b32 s17, v22, 7 +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: v_readlane_b32 s19, v22, 5 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: v_readlane_b32 s16, v22, 8 +; SI-NEXT: v_readlane_b32 s17, v22, 9 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: v_readlane_b32 s18, v22, 10 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s18, 24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 44, v0 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s36, 0xff -; SI-NEXT: buffer_store_dword v28, v27, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v36 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s35, 24 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_or_b32_e32 v22, v27, v22 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xff +; SI-NEXT: v_readlane_b32 s15, v22, 57 +; SI-NEXT: s_lshl_b32 s15, s15, 8 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_readlane_b32 s15, v22, 56 +; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: v_readlane_b32 s16, v22, 55 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_lshl_b32 s16, s16, 24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 52, v0 -; SI-NEXT: v_mov_b32_e32 v23, s4 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s4, s47, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v32 -; SI-NEXT: v_or_b32_e32 v22, s4, v22 -; SI-NEXT: s_and_b32 s4, s46, 0xff -; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_readlane_b32 s14, v22, 12 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: v_readlane_b32 s15, v22, 13 +; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: v_readlane_b32 s14, v22, 14 +; SI-NEXT: v_readlane_b32 s15, v22, 15 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: v_readlane_b32 s16, v22, 16 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s16, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v31 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s31, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v30 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s30, 24 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_or_b32_e32 v23, v27, v23 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xff +; SI-NEXT: v_readlane_b32 s13, v22, 54 +; SI-NEXT: s_lshl_b32 s13, s13, 8 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: v_readlane_b32 s13, v22, 53 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: v_readlane_b32 s14, v22, 52 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s14, s14, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 60, v0 -; SI-NEXT: v_mov_b32_e32 v23, s4 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v26 -; SI-NEXT: v_or_b32_e32 v22, s4, v22 -; SI-NEXT: s_and_b32 s4, s44, 0xff -; SI-NEXT: s_lshl_b32 s5, s95, 8 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: v_readlane_b32 s12, v22, 18 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: v_readlane_b32 s13, v22, 19 +; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: v_readlane_b32 s12, v22, 20 +; SI-NEXT: v_readlane_b32 s13, v22, 21 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: v_readlane_b32 s14, v22, 22 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s13, s14, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v25 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s94, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s93, 24 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; 
SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v23, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; SI-NEXT: v_or_b32_e32 v21, s4, v21 -; SI-NEXT: s_and_b32 s4, s42, 0xff -; SI-NEXT: s_lshl_b32 s5, s92, 8 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s91, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s90, 24 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v22, 51 +; SI-NEXT: s_lshl_b32 s11, s11, 8 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_readlane_b32 s11, v22, 50 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: v_readlane_b32 s12, v22, 49 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s12, s12, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v20, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; SI-NEXT: v_or_b32_e32 v18, s4, v18 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_lshl_b32 s5, s89, 8 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s88, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s79, 24 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_readlane_b32 s10, v22, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s11, v22, 25 +; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: v_readlane_b32 s10, v22, 26 +; SI-NEXT: v_readlane_b32 s11, v22, 27 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: v_readlane_b32 s12, v22, 28 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s11, s12, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 0x4c, v0 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v17, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; SI-NEXT: v_or_b32_e32 v15, s4, v15 -; SI-NEXT: 
s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s5, s78, 8 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s77, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s14, s76, 24 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v22, 48 +; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: v_readlane_b32 s9, v22, 47 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: v_readlane_b32 s10, v22, 46 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s10, s10, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s74, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s12, s73, 24 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_readlane_b32 s8, v22, 30 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: v_readlane_b32 s9, v22, 31 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: v_readlane_b32 s8, v22, 32 +; SI-NEXT: v_readlane_b32 s9, v22, 33 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s10, v22, 34 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s10, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x5c, v0 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s63, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s10, s62, 24 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s10, s5 +; SI-NEXT: 
v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v22, 45 +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_readlane_b32 s7, v22, 44 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: v_readlane_b32 s8, v22, 43 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x64, v0 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s60, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s8, s59, 24 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_readlane_b32 s6, v22, 36 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: v_readlane_b32 s7, v22, 37 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s54, 0xff +; SI-NEXT: v_readlane_b32 s8, v22, 38 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s8, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x6c, v0 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s58, 8 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xff +; SI-NEXT: v_readlane_b32 s5, v22, 42 +; SI-NEXT: s_lshl_b32 s5, s5, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s57, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: v_readlane_b32 s5, v22, 41 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: v_readlane_b32 s6, v22, 40 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s56, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_lshl_b32 s6, s6, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x74, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, 
s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_readlane_b32 s19, v22, 11 +; SI-NEXT: v_readlane_b32 s17, v22, 17 +; SI-NEXT: v_readlane_b32 s15, v22, 23 +; SI-NEXT: v_readlane_b32 s13, v22, 29 +; SI-NEXT: v_readlane_b32 s11, v22, 35 +; SI-NEXT: v_readlane_b32 s9, v22, 39 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s71, v41, 23 -; SI-NEXT: v_readlane_b32 s70, v41, 22 -; SI-NEXT: v_readlane_b32 s69, v41, 21 -; SI-NEXT: v_readlane_b32 s68, v41, 20 -; SI-NEXT: v_readlane_b32 s67, v41, 19 -; SI-NEXT: v_readlane_b32 s66, v41, 18 -; SI-NEXT: v_readlane_b32 s65, v41, 17 -; SI-NEXT: v_readlane_b32 s64, v41, 16 -; SI-NEXT: v_readlane_b32 s55, v41, 15 -; SI-NEXT: v_readlane_b32 s54, v41, 14 -; SI-NEXT: v_readlane_b32 s53, v41, 13 -; SI-NEXT: v_readlane_b32 s52, v41, 12 -; SI-NEXT: v_readlane_b32 s51, v41, 11 -; SI-NEXT: v_readlane_b32 s50, v41, 10 -; SI-NEXT: v_readlane_b32 s49, v41, 9 -; SI-NEXT: v_readlane_b32 s48, v41, 8 -; SI-NEXT: v_readlane_b32 s39, v41, 7 -; SI-NEXT: v_readlane_b32 s38, v41, 6 -; SI-NEXT: v_readlane_b32 s37, v41, 5 -; SI-NEXT: v_readlane_b32 s36, v41, 4 -; SI-NEXT: v_readlane_b32 s35, v41, 3 -; SI-NEXT: v_readlane_b32 s34, v41, 2 -; SI-NEXT: v_readlane_b32 s31, v41, 1 -; SI-NEXT: v_readlane_b32 s30, v41, 0 -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v20, 35 +; SI-NEXT: v_readlane_b32 s98, v20, 34 +; SI-NEXT: v_readlane_b32 s97, v20, 33 +; SI-NEXT: v_readlane_b32 s96, v20, 32 +; SI-NEXT: v_readlane_b32 s87, v20, 31 +; SI-NEXT: v_readlane_b32 s86, v20, 30 +; SI-NEXT: v_readlane_b32 s85, v20, 29 +; SI-NEXT: v_readlane_b32 s84, v20, 28 +; SI-NEXT: v_readlane_b32 s83, v20, 27 +; SI-NEXT: v_readlane_b32 s82, v20, 26 +; SI-NEXT: v_readlane_b32 s81, v20, 25 +; SI-NEXT: v_readlane_b32 s80, v20, 24 +; SI-NEXT: v_readlane_b32 s71, v20, 23 +; SI-NEXT: v_readlane_b32 s70, v20, 22 +; SI-NEXT: v_readlane_b32 s69, v20, 21 +; SI-NEXT: v_readlane_b32 s68, v20, 20 +; SI-NEXT: v_readlane_b32 s67, v20, 19 +; SI-NEXT: v_readlane_b32 s66, v20, 18 +; SI-NEXT: v_readlane_b32 s65, v20, 17 +; SI-NEXT: v_readlane_b32 s64, v20, 16 +; SI-NEXT: v_readlane_b32 s55, v20, 15 +; SI-NEXT: v_readlane_b32 s54, v20, 14 +; SI-NEXT: v_readlane_b32 s53, v20, 13 +; SI-NEXT: v_readlane_b32 s52, v20, 12 +; SI-NEXT: v_readlane_b32 s51, v20, 11 +; SI-NEXT: v_readlane_b32 s50, v20, 10 +; SI-NEXT: v_readlane_b32 s49, v20, 9 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $sgpr71 -; 
SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $sgpr66 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v22, s54, 0 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s55, 1 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr52 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr38 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr31 ; SI-NEXT: ; implicit-def: $sgpr30 -; SI-NEXT: ; implicit-def: $sgpr95 ; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr93 ; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr91 ; SI-NEXT: ; implicit-def: $sgpr90 -; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; 
implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_branch .LBB57_2 -; -; VI-LABEL: bitcast_v16i64_to_v128i8_scalar: -; VI: ; %bb.0: -; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: v_writelane_b32 v20, s30, 0 -; VI-NEXT: v_writelane_b32 v20, s31, 1 -; VI-NEXT: v_writelane_b32 v20, s34, 2 -; VI-NEXT: v_writelane_b32 v20, s35, 3 -; VI-NEXT: v_writelane_b32 v20, s36, 4 -; VI-NEXT: v_writelane_b32 v20, s37, 5 -; VI-NEXT: v_writelane_b32 v20, s38, 6 -; VI-NEXT: v_writelane_b32 v20, s39, 7 -; VI-NEXT: v_writelane_b32 v20, s48, 8 -; VI-NEXT: v_writelane_b32 v20, s49, 9 -; VI-NEXT: v_writelane_b32 v20, s50, 10 -; VI-NEXT: v_writelane_b32 v20, s51, 11 -; VI-NEXT: v_writelane_b32 v20, s52, 12 -; VI-NEXT: v_writelane_b32 v20, s53, 13 -; VI-NEXT: v_writelane_b32 v20, s54, 14 -; VI-NEXT: v_writelane_b32 v20, s55, 15 -; VI-NEXT: v_writelane_b32 v20, s64, 16 -; VI-NEXT: v_writelane_b32 v20, s65, 17 -; VI-NEXT: v_writelane_b32 v20, s66, 18 -; VI-NEXT: v_writelane_b32 v20, s67, 19 -; VI-NEXT: v_writelane_b32 v20, s68, 20 -; VI-NEXT: v_writelane_b32 v20, s69, 21 -; VI-NEXT: v_writelane_b32 v20, s70, 22 -; VI-NEXT: v_writelane_b32 v20, s71, 23 -; VI-NEXT: v_writelane_b32 v20, s80, 24 -; VI-NEXT: v_writelane_b32 v20, s81, 25 -; VI-NEXT: v_writelane_b32 v20, s82, 26 -; VI-NEXT: v_writelane_b32 v20, s83, 27 -; VI-NEXT: v_writelane_b32 v20, s84, 28 -; VI-NEXT: v_writelane_b32 v20, s85, 29 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; VI-NEXT: v_writelane_b32 v20, s86, 30 -; VI-NEXT: v_readfirstlane_b32 s44, v1 -; VI-NEXT: v_readfirstlane_b32 s45, v2 -; VI-NEXT: v_readfirstlane_b32 s42, v3 -; VI-NEXT: v_readfirstlane_b32 s43, v4 -; VI-NEXT: v_readfirstlane_b32 s40, v5 -; VI-NEXT: v_readfirstlane_b32 s41, v6 -; VI-NEXT: v_readfirstlane_b32 s14, v7 -; VI-NEXT: v_readfirstlane_b32 s15, v8 -; VI-NEXT: v_readfirstlane_b32 s12, v9 -; VI-NEXT: v_readfirstlane_b32 s13, v10 -; VI-NEXT: v_readfirstlane_b32 s10, v11 -; VI-NEXT: v_readfirstlane_b32 s11, v12 -; VI-NEXT: v_readfirstlane_b32 s8, v13 -; VI-NEXT: v_readfirstlane_b32 s9, v14 -; VI-NEXT: v_readfirstlane_b32 s6, v15 -; VI-NEXT: v_readfirstlane_b32 s7, v16 -; VI-NEXT: v_readfirstlane_b32 s4, v17 -; VI-NEXT: s_and_b64 s[46:47], vcc, exec -; VI-NEXT: v_readfirstlane_b32 s5, v18 -; VI-NEXT: v_writelane_b32 v20, s87, 31 -; VI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane -; VI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 2 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s55, 3 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 4 +; 
SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s55, 5 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 6 +; SI-NEXT: v_writelane_b32 v22, s55, 7 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 8 +; SI-NEXT: v_writelane_b32 v22, s55, 9 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 10 +; SI-NEXT: v_writelane_b32 v22, s55, 11 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 12 +; SI-NEXT: v_writelane_b32 v22, s55, 13 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 14 +; SI-NEXT: v_writelane_b32 v22, s55, 15 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 16 +; SI-NEXT: v_writelane_b32 v22, s55, 17 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 18 +; SI-NEXT: v_writelane_b32 v22, s55, 19 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 20 +; SI-NEXT: v_writelane_b32 v22, s55, 21 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 22 +; SI-NEXT: v_writelane_b32 v22, s55, 23 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 24 +; SI-NEXT: v_writelane_b32 v22, s55, 25 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 26 +; SI-NEXT: v_writelane_b32 v22, s55, 27 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 28 +; SI-NEXT: v_writelane_b32 v22, s55, 29 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 30 +; SI-NEXT: v_writelane_b32 v22, s55, 31 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 
+; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 32 +; SI-NEXT: v_writelane_b32 v22, s55, 33 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 34 +; SI-NEXT: v_writelane_b32 v22, s55, 35 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 36 +; SI-NEXT: v_writelane_b32 v22, s55, 37 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v22, s54, 38 +; SI-NEXT: v_writelane_b32 v22, s55, 39 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: s_branch .LBB57_2 +; +; VI-LABEL: bitcast_v16i64_to_v128i8_scalar: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; VI-NEXT: s_mov_b64 exec, s[4:5] +; VI-NEXT: v_writelane_b32 v20, s30, 0 +; VI-NEXT: v_writelane_b32 v20, s31, 1 +; VI-NEXT: v_writelane_b32 v20, s34, 2 +; VI-NEXT: v_writelane_b32 v20, s35, 3 +; VI-NEXT: v_writelane_b32 v20, s36, 4 +; VI-NEXT: v_writelane_b32 v20, s37, 5 +; VI-NEXT: v_writelane_b32 v20, s38, 6 +; VI-NEXT: v_writelane_b32 v20, s39, 7 +; VI-NEXT: v_writelane_b32 v20, s48, 8 +; VI-NEXT: v_writelane_b32 v20, s49, 9 +; VI-NEXT: v_writelane_b32 v20, s50, 10 +; VI-NEXT: v_writelane_b32 v20, s51, 11 +; VI-NEXT: v_writelane_b32 v20, s52, 12 +; VI-NEXT: v_writelane_b32 v20, s53, 13 +; VI-NEXT: v_writelane_b32 v20, s54, 14 +; VI-NEXT: v_writelane_b32 v20, s55, 15 +; VI-NEXT: v_writelane_b32 v20, s64, 16 +; VI-NEXT: v_writelane_b32 v20, s65, 17 +; VI-NEXT: v_writelane_b32 v20, s66, 18 +; VI-NEXT: v_writelane_b32 v20, s67, 19 +; VI-NEXT: v_writelane_b32 v20, s68, 20 +; VI-NEXT: v_writelane_b32 v20, s69, 21 +; VI-NEXT: v_writelane_b32 v20, s70, 22 +; VI-NEXT: v_writelane_b32 v20, s71, 23 +; VI-NEXT: v_writelane_b32 v20, s80, 24 +; VI-NEXT: v_writelane_b32 v20, s81, 25 +; VI-NEXT: v_writelane_b32 v20, s82, 26 +; VI-NEXT: v_writelane_b32 v20, s83, 27 +; VI-NEXT: v_writelane_b32 v20, s84, 28 +; VI-NEXT: v_writelane_b32 v20, s85, 29 +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; VI-NEXT: v_writelane_b32 v20, s86, 30 +; VI-NEXT: v_readfirstlane_b32 s44, v1 +; VI-NEXT: v_readfirstlane_b32 s45, v2 +; VI-NEXT: v_readfirstlane_b32 s42, v3 +; VI-NEXT: v_readfirstlane_b32 s43, v4 +; VI-NEXT: v_readfirstlane_b32 s40, v5 +; VI-NEXT: v_readfirstlane_b32 s41, v6 +; VI-NEXT: v_readfirstlane_b32 s14, v7 +; VI-NEXT: v_readfirstlane_b32 s15, v8 +; VI-NEXT: v_readfirstlane_b32 s12, v9 +; VI-NEXT: v_readfirstlane_b32 s13, v10 +; VI-NEXT: v_readfirstlane_b32 s10, v11 +; VI-NEXT: v_readfirstlane_b32 s11, v12 +; VI-NEXT: v_readfirstlane_b32 s8, v13 +; VI-NEXT: v_readfirstlane_b32 s9, v14 +; VI-NEXT: v_readfirstlane_b32 s6, v15 +; VI-NEXT: v_readfirstlane_b32 s7, v16 +; VI-NEXT: v_readfirstlane_b32 s4, v17 +; VI-NEXT: s_and_b64 s[46:47], vcc, exec +; VI-NEXT: v_readfirstlane_b32 s5, v18 +; VI-NEXT: v_writelane_b32 v20, s87, 31 +; VI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR 
lane +; VI-NEXT: s_cbranch_scc0 .LBB57_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: s_lshr_b32 s46, s5, 24 ; VI-NEXT: v_writelane_b32 v21, s46, 8 @@ -88751,8 +90059,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 @@ -88760,133 +90075,93 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], 
s32 offset:168 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v45 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v43 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 
24, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v34 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 @@ -88895,31 +90170,31 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 @@ -88931,140 +90206,206 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword 
v43, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, 
off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB59_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v9 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -89073,306 +90414,277 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v4, v0, v1 
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 
+; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: 
v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 -; SI-NEXT: v_mov_b32_e32 v43, v16 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mov_b32_e32 v50, v16 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v16, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v17, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 -; SI-NEXT: v_mov_b32_e32 v55, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v18, v1 ; SI-NEXT: v_or_b32_e32 v18, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 -; SI-NEXT: v_mov_b32_e32 v44, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v19, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: v_mov_b32_e32 v61, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 +; SI-NEXT: v_mov_b32_e32 v54, v23 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v20, v1 ; SI-NEXT: v_or_b32_e32 v20, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v21, v0, v1 
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 -; SI-NEXT: v_mov_b32_e32 v59, v24 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v22, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v45, v24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v34, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v23, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v24, v1 ; SI-NEXT: v_or_b32_e32 v24, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_mov_b32_e32 v43, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v25, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 ; SI-NEXT: v_or_b32_e32 v26, v0, v1 -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v37, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v27, v1 ; SI-NEXT: v_or_b32_e32 v27, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v28, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_mov_b32_e32 v36, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v29, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v30, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v31, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v38, v63 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 @@ -89399,108 +90711,112 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB59_3 ; SI-NEXT: .LBB59_2: -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: 
v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: .LBB59_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v35, v57 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB59_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, s4, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89509,17 +90825,17 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89529,15 +90845,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89547,15 +90863,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89565,15 +90881,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89583,15 +90899,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89601,15 +90917,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89619,15 +90935,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89636,16 +90952,17 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89655,15 +90972,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89673,84 +90990,79 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -89759,15 +91071,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -89776,15 +91088,15 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89794,9 +91106,9 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -89812,106 +91124,110 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: 
v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -89919,14 +91235,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -89934,14 +91250,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -89971,7 +91287,7 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v16i64_scalar: @@ -89993,113 +91309,115 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; 
VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 @@ -90108,29 +91426,28 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -90140,130 +91457,141 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v7 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 
offset:92 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword 
v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword 
v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB59_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -90272,208 +91600,197 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v3, v7 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v29, v9 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v59, v0 -; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v1 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded 
Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v37, v0 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v62, v0 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v1 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_mov_b32_e32 v60, v0 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v34, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v59, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v54, v0 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v61 +; VI-NEXT: 
v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v0, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v41, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_or_b32_sdwa v0, v56, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v48, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; 
VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v36, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v33, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v50, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v42 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v51, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v48, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v49, v51 +; VI-NEXT: v_mov_b32_e32 v40, v34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v57, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -90504,85 +91821,95 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB59_3 ; VI-NEXT: .LBB59_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte 
Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_mov_b32_e32 v46, v61 -; VI-NEXT: v_mov_b32_e32 v47, v45 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_mov_b32_e32 v63, v42 -; VI-NEXT: v_mov_b32_e32 v51, v7 -; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_mov_b32_e32 v41, v33 +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: v_mov_b32_e32 v49, v51 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB59_3: ; %Flow +; VI-NEXT: v_mov_b32_e32 v51, v41 +; VI-NEXT: v_mov_b32_e32 v36, v44 +; VI-NEXT: v_mov_b32_e32 v53, v54 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v54, v60 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v44, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB59_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v0, s4, v0 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_add_i32 s20, s20, 3 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_add_i32 s24, s24, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: 
s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff @@ -90591,26 +91918,25 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -90618,8 +91944,8 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -90631,9 +91957,9 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded 
Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -90645,14 +91971,14 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -90660,280 +91986,280 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 
4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa 
v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte 
Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: 
v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa 
v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -90978,504 +92304,524 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 
4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:176 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX9-NEXT: s_waitcnt vmcnt(35) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; 
GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; 
GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: v_lshlrev_b32_e32 
v41, 8, v41 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 
8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 -; 
GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_waitcnt vmcnt(42) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 
offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 +; GFX9-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB59_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, 
v2, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 
4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v38 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v61, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v63, v57 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v47, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v37, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v60 -; 
GFX9-NEXT: v_mov_b32_e32 v52, v56 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_mov_b32_e32 v34, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v51, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v53, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v45, v62 +; GFX9-NEXT: v_mov_b32_e32 v46, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v53 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -91506,32 +92852,48 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB59_3 ; GFX9-NEXT: .LBB59_2: -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 
offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_mov_b32_e32 v63, v57 -; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v57, v38 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB59_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 
; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v62, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB59_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -91578,348 +92940,352 @@ define inreg <16 x i64> @bitcast_v128i8_to_v16i64_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_movk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: 
v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: 
v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v53 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: 
v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: 
v_add_u32_e32 v0, 3, v43 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -98575,24 +99941,23 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: v_mov_b32_e32 v52, v30 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) @@ -98602,165 +99967,177 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: v_mov_b32_e32 v38, v12 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v14, v11 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: 
v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], 
s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB63_4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB63_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 +; SI-NEXT: s_waitcnt 
expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 @@ -98768,16 +100145,6 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 -; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 -; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 -; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 @@ -98785,212 +100152,238 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 +; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 ; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 ; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 
v26, v26, v49, 16 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 +; SI-NEXT: v_mov_b32_e32 v35, v37 ; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 +; SI-NEXT: v_mov_b32_e32 v41, v61 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 +; SI-NEXT: v_mov_b32_e32 v55, v59 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v11 -; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v11 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v12 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v14 -; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v53, v38 -; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: s_branch .LBB63_3 +; SI-NEXT: .LBB63_2: +; SI-NEXT: 
v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v43, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 -; SI-NEXT: s_cbranch_execnz .LBB63_3 -; SI-NEXT: .LBB63_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB63_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v39, v52 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB63_5 +; SI-NEXT: ; 
%bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: 
v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -99004,105 +100397,107 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: 
v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 -; SI-NEXT: .LBB63_3: ; %end +; SI-NEXT: .LBB63_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -99121,41 +100516,6 @@ define inreg <16 x i64> @bitcast_v64bf16_to_v16i64_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB63_4: -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB63_2 ; ; VI-LABEL: bitcast_v64bf16_to_v16i64_scalar: ; VI: ; %bb.0: @@ -101344,13 +102704,12 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr41 @@ -101360,15 +102719,16 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr32 @@ -101451,108 +102811,89 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v31 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 
v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v29 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v28 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v36, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v28 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v58, v2 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v35, v2 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v12 @@ -101582,6 +102923,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v31 @@ 
-101590,17 +102932,19 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v62 -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -101617,7 +102961,22 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: .LBB64_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB64_4 @@ -101626,8 +102985,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_addc_u32_e32 v35, vcc, 0, v2, vcc ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; 
SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc @@ -101646,108 +103005,95 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc ; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 ; SI-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_add_i32_e32 v25, vcc, 3, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 ; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v27 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 ; SI-NEXT: v_addc_u32_e32 v28, vcc, 0, v28, vcc -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v25 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v29 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v48 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v48 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v63 +; SI-NEXT: v_addc_u32_e32 v46, vcc, 0, v62, vcc ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v46 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v38 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], 
s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v58 +; SI-NEXT: v_mov_b32_e32 v58, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v34 +; SI-NEXT: v_mov_b32_e32 v34, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v32 +; SI-NEXT: v_mov_b32_e32 v32, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v56 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 -; SI-NEXT: v_addc_u32_e32 v44, vcc, 0, v62, vcc -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v57 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v28 -; SI-NEXT: 
v_lshrrev_b32_e32 v40, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 @@ -101765,12 +103111,17 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 @@ -101778,39 +103129,36 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v6 ; SI-NEXT: v_mov_b32_e32 v50, v29 ; SI-NEXT: v_mov_b32_e32 v48, v30 -; SI-NEXT: v_mov_b32_e32 v46, v28 -; SI-NEXT: v_mov_b32_e32 v34, v8 -; SI-NEXT: v_mov_b32_e32 v32, v7 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 +; SI-NEXT: v_mov_b32_e32 v56, v28 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: .LBB64_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 @@ -101821,32 +103169,34 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; 
SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 @@ -101855,7 +103205,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 @@ -101864,7 +103214,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 @@ -101873,7 +103223,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -101882,7 +103232,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -101891,7 +103241,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: 
v_add_i32_e32 v3, vcc, 40, v0 @@ -101900,7 +103250,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -101909,7 +103259,7 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 @@ -101918,16 +103268,16 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -101937,8 +103287,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -101948,8 +103298,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -101959,8 +103309,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 
offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -101970,8 +103320,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -101981,8 +103331,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -101992,8 +103342,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -102003,8 +103353,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -102014,8 +103364,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -102025,8 +103375,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -102036,8 +103386,8 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -102046,18 +103396,16 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 @@ -102068,20 +103416,20 @@ define <64 x half> @bitcast_v16i64_to_v64f16(<16 x i64> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -105876,385 +107224,431 @@ define inreg <64 x i16> @bitcast_v16i64_to_v64i16_scalar(<16 x i64> inreg %a, i3 ; SI-LABEL: bitcast_v16i64_to_v64i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_writelane_b32 v20, s49, 9 +; SI-NEXT: v_writelane_b32 v20, s50, 10 +; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_writelane_b32 v20, s52, 12 +; SI-NEXT: v_writelane_b32 v20, s53, 13 +; SI-NEXT: v_writelane_b32 v20, s54, 14 +; SI-NEXT: v_writelane_b32 v20, s55, 15 +; SI-NEXT: v_writelane_b32 v20, s64, 16 +; SI-NEXT: v_writelane_b32 v20, s65, 17 +; SI-NEXT: v_writelane_b32 v20, s66, 18 +; SI-NEXT: v_writelane_b32 v20, s67, 19 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_readfirstlane_b32 s47, v1 -; SI-NEXT: v_readfirstlane_b32 s46, v2 -; SI-NEXT: v_readfirstlane_b32 s45, v3 -; SI-NEXT: v_readfirstlane_b32 s44, v4 -; SI-NEXT: v_readfirstlane_b32 s43, v5 -; SI-NEXT: v_readfirstlane_b32 s42, v6 -; SI-NEXT: v_readfirstlane_b32 s41, v7 -; SI-NEXT: v_readfirstlane_b32 s40, v8 -; SI-NEXT: v_readfirstlane_b32 s15, v9 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s13, v11 -; SI-NEXT: v_readfirstlane_b32 s12, v12 -; SI-NEXT: v_readfirstlane_b32 s11, v13 -; SI-NEXT: v_readfirstlane_b32 s10, v14 -; SI-NEXT: v_readfirstlane_b32 s9, v15 -; SI-NEXT: v_readfirstlane_b32 s8, v16 -; SI-NEXT: v_readfirstlane_b32 s7, v17 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: v_writelane_b32 v20, s68, 20 +; SI-NEXT: v_readfirstlane_b32 s44, v1 +; SI-NEXT: v_readfirstlane_b32 s45, v2 +; SI-NEXT: v_readfirstlane_b32 s42, v3 +; SI-NEXT: v_readfirstlane_b32 s43, v4 +; SI-NEXT: v_readfirstlane_b32 s40, v5 +; SI-NEXT: v_readfirstlane_b32 s41, v6 +; SI-NEXT: v_readfirstlane_b32 s14, v7 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s10, v11 +; SI-NEXT: v_readfirstlane_b32 s11, v12 +; SI-NEXT: v_readfirstlane_b32 s8, v13 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: v_readfirstlane_b32 s6, v15 +; SI-NEXT: v_readfirstlane_b32 s7, v16 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_and_b64 s[46:47], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: v_writelane_b32 v20, s69, 21 ; SI-NEXT: s_cbranch_scc0 .LBB69_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s43 -; SI-NEXT: v_mov_b32_e32 v8, s45 -; SI-NEXT: v_mov_b32_e32 v9, s47 -; SI-NEXT: v_mov_b32_e32 v10, s28 -; SI-NEXT: v_mov_b32_e32 v11, s26 -; SI-NEXT: 
v_mov_b32_e32 v12, s24 -; SI-NEXT: v_mov_b32_e32 v13, s22 -; SI-NEXT: v_mov_b32_e32 v14, s20 -; SI-NEXT: v_mov_b32_e32 v15, s18 -; SI-NEXT: v_mov_b32_e32 v16, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s46, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s29, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s27, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s25, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s23, v13, 16 -; SI-NEXT: v_alignbit_b32 v14, s21, v14, 16 -; SI-NEXT: v_alignbit_b32 v15, s19, v15, 16 -; SI-NEXT: v_alignbit_b32 v16, s17, v16, 16 -; SI-NEXT: s_lshr_b32 s56, s6, 16 -; SI-NEXT: s_lshr_b32 s57, s8, 16 -; SI-NEXT: s_lshr_b32 s58, s10, 16 -; SI-NEXT: s_lshr_b32 s59, s12, 16 -; SI-NEXT: s_lshr_b32 s60, s14, 16 -; SI-NEXT: s_lshr_b32 s61, s40, 16 -; SI-NEXT: s_lshr_b32 s62, s42, 16 -; SI-NEXT: s_lshr_b32 s63, s44, 16 -; SI-NEXT: s_lshr_b32 s72, s46, 16 -; SI-NEXT: s_lshr_b32 s73, s29, 16 -; SI-NEXT: s_lshr_b32 s74, s27, 16 -; SI-NEXT: s_lshr_b32 s75, s25, 16 -; SI-NEXT: s_lshr_b32 s76, s23, 16 -; SI-NEXT: s_lshr_b32 s77, s21, 16 -; SI-NEXT: s_lshr_b32 s78, s19, 16 -; SI-NEXT: s_lshr_b32 s79, s17, 16 +; SI-NEXT: s_lshr_b32 s38, s5, 16 +; SI-NEXT: s_lshr_b32 s39, s7, 16 +; SI-NEXT: s_lshr_b32 s48, s9, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s50, s13, 16 +; SI-NEXT: s_lshr_b32 s51, s15, 16 +; SI-NEXT: s_lshr_b32 s52, s41, 16 +; SI-NEXT: s_lshr_b32 s53, s43, 16 +; SI-NEXT: s_lshr_b32 s54, s45, 16 +; SI-NEXT: s_lshr_b32 s55, s29, 16 +; SI-NEXT: s_lshr_b32 s64, s27, 16 +; SI-NEXT: s_lshr_b32 s65, s25, 16 +; SI-NEXT: s_lshr_b32 s66, s23, 16 +; SI-NEXT: s_lshr_b32 s67, s21, 16 +; SI-NEXT: s_lshr_b32 s68, s19, 16 +; SI-NEXT: s_lshr_b32 s69, s17, 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB69_3 ; SI-NEXT: .LBB69_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 
0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 +; SI-NEXT: s_add_u32 s44, s44, 3 +; SI-NEXT: s_addc_u32 s45, s45, 0 ; SI-NEXT: s_add_u32 s28, s28, 3 ; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s47, s47, 3 -; SI-NEXT: s_addc_u32 s46, s46, 0 -; SI-NEXT: s_add_u32 s45, s45, 3 -; SI-NEXT: s_addc_u32 s44, s44, 0 -; SI-NEXT: s_add_u32 s43, s43, 3 -; SI-NEXT: s_addc_u32 s42, s42, 0 -; SI-NEXT: s_add_u32 s41, s41, 3 -; SI-NEXT: s_addc_u32 s40, s40, 0 -; SI-NEXT: s_add_u32 s15, s15, 3 -; SI-NEXT: s_addc_u32 s14, s14, 0 -; SI-NEXT: s_add_u32 s13, s13, 3 -; SI-NEXT: s_addc_u32 s12, s12, 0 -; SI-NEXT: s_add_u32 s11, s11, 3 -; SI-NEXT: s_addc_u32 s10, s10, 0 -; SI-NEXT: s_add_u32 s9, s9, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s6, s6, 0 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s43 -; SI-NEXT: v_mov_b32_e32 v8, s45 -; SI-NEXT: v_mov_b32_e32 v9, s47 -; SI-NEXT: v_mov_b32_e32 v10, s28 -; SI-NEXT: v_mov_b32_e32 v11, s26 -; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: v_mov_b32_e32 v13, s22 -; SI-NEXT: v_mov_b32_e32 v14, s20 -; SI-NEXT: v_mov_b32_e32 v15, s18 -; SI-NEXT: v_mov_b32_e32 v16, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s46, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s29, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s27, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s25, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s23, v13, 16 -; SI-NEXT: v_alignbit_b32 v14, s21, v14, 16 -; SI-NEXT: v_alignbit_b32 v15, s19, v15, 16 -; SI-NEXT: v_alignbit_b32 v16, s17, v16, 16 -; SI-NEXT: s_lshr_b32 s56, s6, 16 -; SI-NEXT: s_lshr_b32 s57, s8, 16 -; SI-NEXT: s_lshr_b32 s58, s10, 16 -; SI-NEXT: s_lshr_b32 s59, s12, 16 -; SI-NEXT: s_lshr_b32 s60, s14, 16 -; SI-NEXT: s_lshr_b32 s61, s40, 16 -; SI-NEXT: s_lshr_b32 s62, s42, 16 -; SI-NEXT: s_lshr_b32 s63, s44, 16 -; SI-NEXT: s_lshr_b32 s72, s46, 16 -; SI-NEXT: s_lshr_b32 s73, s29, 16 -; SI-NEXT: s_lshr_b32 s74, s27, 16 -; SI-NEXT: s_lshr_b32 s75, s25, 16 -; SI-NEXT: s_lshr_b32 s76, s23, 16 -; SI-NEXT: s_lshr_b32 s77, s21, 16 -; SI-NEXT: s_lshr_b32 s78, s19, 16 -; SI-NEXT: s_lshr_b32 s79, s17, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s38, s5, 16 +; SI-NEXT: s_lshr_b32 s39, s7, 16 +; SI-NEXT: s_lshr_b32 s48, s9, 16 +; SI-NEXT: s_lshr_b32 s49, s11, 16 +; SI-NEXT: s_lshr_b32 s50, s13, 16 +; SI-NEXT: s_lshr_b32 s51, s15, 16 +; SI-NEXT: s_lshr_b32 s52, s41, 16 +; SI-NEXT: s_lshr_b32 s53, s43, 16 +; SI-NEXT: s_lshr_b32 s54, s45, 16 +; SI-NEXT: s_lshr_b32 s55, s29, 16 +; 
SI-NEXT: s_lshr_b32 s64, s27, 16 +; SI-NEXT: s_lshr_b32 s65, s25, 16 +; SI-NEXT: s_lshr_b32 s66, s23, 16 +; SI-NEXT: s_lshr_b32 s67, s21, 16 +; SI-NEXT: s_lshr_b32 s68, s19, 16 +; SI-NEXT: s_lshr_b32 s69, s17, 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 ; SI-NEXT: .LBB69_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v16, s4, v16 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s79, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v17, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s47, s36, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s47 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s69, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s16, s34, 16 +; SI-NEXT: s_and_b32 s17, s18, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: s_and_b32 s16, s19, 0xffff +; SI-NEXT: s_lshl_b32 s17, s68, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: s_lshl_b32 s16, s30, 16 +; SI-NEXT: s_and_b32 s17, s20, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v15, s4, v15 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s78, 16 -; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v16, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_and_b32 s16, s21, 0xffff +; SI-NEXT: s_lshl_b32 s17, s67, 16 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s22, 0xffff +; SI-NEXT: s_lshl_b32 s17, s94, 16 +; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v14, s4, v14 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; 
SI-NEXT: s_lshl_b32 s5, s77, 16 -; SI-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v15, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s23, 0xffff +; SI-NEXT: s_lshl_b32 s17, s66, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v13, s4, v13 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s76, 16 -; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s24, 0xffff +; SI-NEXT: s_lshl_b32 s17, s92, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s75, 16 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s25, 0xffff +; SI-NEXT: s_lshl_b32 s17, s65, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s74, 16 -; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v12, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s17, s90, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s73, 16 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s47, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s64, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 
v10, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s46, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s28, 0xffff +; SI-NEXT: s_lshl_b32 s17, s88, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s44, 0xffff -; SI-NEXT: s_lshl_b32 s5, s63, 16 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s29, 0xffff +; SI-NEXT: s_lshl_b32 s17, s55, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s42, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s44, 0xffff +; SI-NEXT: s_lshl_b32 s17, s78, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s40, 0xffff -; SI-NEXT: s_lshl_b32 s5, s61, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s45, 0xffff +; SI-NEXT: s_lshl_b32 s17, s54, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s14, 0xffff -; SI-NEXT: s_lshl_b32 s5, s60, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s42, 0xffff +; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 
offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s12, 0xffff -; SI-NEXT: s_lshl_b32 s5, s59, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s43, 0xffff +; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s58, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xffff +; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s57, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xffff +; SI-NEXT: s_lshl_b32 s17, s52, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s56, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s72, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff +; SI-NEXT: s_lshl_b32 s15, s51, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s62, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_lshl_b32 s13, s50, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt 
expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s60, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff +; SI-NEXT: s_lshl_b32 s11, s49, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s58, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s48, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s56, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s39, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s46, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s38, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s69, v20, 21 +; SI-NEXT: v_readlane_b32 s68, v20, 20 +; SI-NEXT: v_readlane_b32 s67, v20, 19 +; SI-NEXT: v_readlane_b32 s66, v20, 18 +; SI-NEXT: v_readlane_b32 s65, v20, 17 +; SI-NEXT: v_readlane_b32 s64, v20, 16 +; SI-NEXT: v_readlane_b32 s55, v20, 15 +; SI-NEXT: v_readlane_b32 s54, v20, 14 +; SI-NEXT: v_readlane_b32 s53, v20, 13 +; SI-NEXT: v_readlane_b32 s52, v20, 12 +; SI-NEXT: v_readlane_b32 s51, v20, 11 +; SI-NEXT: v_readlane_b32 s50, v20, 10 +; SI-NEXT: v_readlane_b32 s49, v20, 9 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB69_4: -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr79 -; SI-NEXT: ; 
implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr77 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: s_branch .LBB69_2 ; ; VI-LABEL: bitcast_v16i64_to_v64i16_scalar: @@ -107332,179 +108726,162 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v56, v10 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v8 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v38, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 ; SI-NEXT: s_waitcnt vmcnt(1) -; 
SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB71_4 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB71_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v7, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_or_b32_e32 v10, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v11, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v12, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v13, v0, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v19, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_or_b32_e32 v20, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v22, v0, v61 +; SI-NEXT: v_and_b32_e32 
v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v24, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v26, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v27, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v27, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_mov_b32_e32 v33, v28 ; SI-NEXT: v_or_b32_e32 v28, v0, v5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_or_b32_e32 v29, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v29, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 @@ -107512,15 +108889,18 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_or_b32_e32 v30, v0, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_or_b32_e32 v8, v1, v55 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_or_b32_e32 v8, v1, v56 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_or_b32_e32 v31, v0, v31 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -107528,14 +108908,40 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_cbranch_execnz .LBB71_3 -; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB71_3 +; SI-NEXT: .LBB71_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: 
v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB71_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: s_cbranch_vccnz .LBB71_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -107576,143 +108982,143 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte 
Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte 
Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -107721,7 +109127,7 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB71_3: ; %end +; SI-NEXT: .LBB71_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -107740,35 +109146,6 @@ define inreg <16 x i64> @bitcast_v64i16_to_v16i64_scalar(<64 x i16> inreg %a, i3 ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB71_4: -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v39, v23 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB71_2 ; ; VI-LABEL: bitcast_v64i16_to_v16i64_scalar: ; VI: ; %bb.0: @@ -112695,656 +114072,1312 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-LABEL: bitcast_v16f64_to_v128i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_writelane_b32 v63, s36, 4 +; SI-NEXT: v_writelane_b32 v63, s37, 5 +; SI-NEXT: v_writelane_b32 v63, s38, 6 +; SI-NEXT: v_writelane_b32 v63, s39, 7 +; SI-NEXT: v_writelane_b32 v63, s48, 8 +; SI-NEXT: v_writelane_b32 v63, s49, 9 +; SI-NEXT: v_writelane_b32 v63, s50, 10 +; SI-NEXT: v_writelane_b32 v63, s51, 11 +; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_writelane_b32 v63, s98, 34 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_mov_b32_e32 v31, s16 -; SI-NEXT: v_mov_b32_e32 v32, s17 -; SI-NEXT: v_mov_b32_e32 v29, s18 -; SI-NEXT: v_mov_b32_e32 v30, s19 -; SI-NEXT: v_mov_b32_e32 v27, s20 -; SI-NEXT: v_mov_b32_e32 v28, s21 -; SI-NEXT: v_mov_b32_e32 v25, s22 -; SI-NEXT: v_mov_b32_e32 v26, s23 -; SI-NEXT: v_mov_b32_e32 v23, s24 -; SI-NEXT: v_mov_b32_e32 v24, s25 -; SI-NEXT: v_mov_b32_e32 v21, s26 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v19, s28 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 
4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB73_4 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_readfirstlane_b32 s8, v5 +; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v8 +; SI-NEXT: v_readfirstlane_b32 s12, v9 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: v_readfirstlane_b32 s14, v11 +; SI-NEXT: v_readfirstlane_b32 s15, v12 +; SI-NEXT: v_readfirstlane_b32 s40, v13 +; SI-NEXT: v_readfirstlane_b32 s41, v14 +; SI-NEXT: v_readfirstlane_b32 s42, v15 +; SI-NEXT: v_readfirstlane_b32 s43, v16 +; SI-NEXT: v_readfirstlane_b32 s44, v17 +; SI-NEXT: s_and_b64 s[46:47], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s45, v18 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr61 : SGPR spill to VGPR lane +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: s_cbranch_scc0 .LBB73_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 
4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v4, v3, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v4, v3, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v4, v3, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 
4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_lshr_b32 s46, s45, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 34 +; SI-NEXT: s_lshr_b32 s46, s45, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 35 +; SI-NEXT: s_lshr_b32 s46, s45, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 36 +; SI-NEXT: s_lshr_b32 s46, s43, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 37 +; SI-NEXT: s_lshr_b32 s46, s43, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 38 +; SI-NEXT: s_lshr_b32 s46, s43, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 39 +; SI-NEXT: s_lshr_b32 s46, s41, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 40 +; SI-NEXT: s_lshr_b32 s46, s41, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 41 +; SI-NEXT: s_lshr_b32 s46, s41, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 42 +; SI-NEXT: s_lshr_b32 s46, s15, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 43 +; SI-NEXT: s_lshr_b32 s46, s15, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 44 +; SI-NEXT: s_lshr_b32 s46, s15, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 45 +; SI-NEXT: s_lshr_b32 s46, s13, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 46 +; SI-NEXT: s_lshr_b32 s46, s13, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 47 +; SI-NEXT: s_lshr_b32 s46, s13, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 48 +; SI-NEXT: s_lshr_b32 s46, s11, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 49 +; SI-NEXT: s_lshr_b32 s46, s11, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 50 +; SI-NEXT: s_lshr_b32 s46, s11, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 51 +; SI-NEXT: s_lshr_b32 s46, s9, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 52 +; SI-NEXT: s_lshr_b32 s46, s9, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 53 +; SI-NEXT: s_lshr_b32 s46, s9, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 54 +; SI-NEXT: s_lshr_b32 s46, s7, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 55 +; SI-NEXT: s_lshr_b32 s46, s7, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 56 +; SI-NEXT: s_lshr_b32 s46, s7, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 57 +; SI-NEXT: s_lshr_b32 s46, s5, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 58 +; SI-NEXT: s_lshr_b32 s46, s5, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 59 +; SI-NEXT: s_lshr_b32 s46, s5, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 60 +; SI-NEXT: s_lshr_b32 s46, s29, 24 +; SI-NEXT: v_writelane_b32 v61, s46, 61 +; SI-NEXT: s_lshr_b32 s46, s29, 16 +; SI-NEXT: v_writelane_b32 v61, s46, 62 +; SI-NEXT: s_lshr_b32 s46, s29, 8 +; SI-NEXT: v_writelane_b32 v61, s46, 63 +; SI-NEXT: s_lshr_b32 s46, s27, 24 +; SI-NEXT: v_writelane_b32 v62, s46, 0 +; SI-NEXT: s_lshr_b32 s46, s27, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 1 +; SI-NEXT: s_lshr_b32 s46, s27, 8 +; SI-NEXT: v_writelane_b32 v62, s46, 2 +; SI-NEXT: s_lshr_b32 s46, s25, 24 +; 
SI-NEXT: v_writelane_b32 v62, s46, 3 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 4 +; SI-NEXT: s_lshr_b32 s46, s25, 8 +; SI-NEXT: v_writelane_b32 v62, s46, 5 +; SI-NEXT: s_lshr_b32 s46, s23, 24 +; SI-NEXT: v_writelane_b32 v62, s46, 6 +; SI-NEXT: s_lshr_b32 s46, s23, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 7 +; SI-NEXT: s_lshr_b32 s46, s23, 8 +; SI-NEXT: v_writelane_b32 v62, s46, 8 +; SI-NEXT: s_lshr_b32 s46, s21, 24 +; SI-NEXT: v_writelane_b32 v62, s46, 9 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 10 +; SI-NEXT: s_lshr_b32 s46, s21, 8 +; SI-NEXT: v_writelane_b32 v62, s46, 11 +; SI-NEXT: s_lshr_b32 s46, s19, 24 +; SI-NEXT: v_writelane_b32 v62, s46, 12 +; SI-NEXT: s_lshr_b32 s46, s19, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 13 +; SI-NEXT: s_lshr_b32 s46, s19, 8 +; SI-NEXT: v_writelane_b32 v62, s46, 14 +; SI-NEXT: s_lshr_b32 s46, s17, 24 +; SI-NEXT: v_writelane_b32 v62, s46, 15 +; SI-NEXT: s_lshr_b32 s46, s17, 16 +; SI-NEXT: v_writelane_b32 v62, s46, 16 +; SI-NEXT: s_lshr_b32 s46, s17, 8 +; SI-NEXT: v_writelane_b32 v62, s46, 17 +; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 24 +; SI-NEXT: v_writelane_b32 v61, s46, 32 +; SI-NEXT: v_writelane_b32 v61, s47, 33 +; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 8 +; SI-NEXT: v_writelane_b32 v61, s46, 30 +; SI-NEXT: v_writelane_b32 v61, s47, 31 +; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 24 +; SI-NEXT: v_writelane_b32 v61, s46, 28 +; SI-NEXT: v_writelane_b32 v61, s47, 29 +; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 16 +; SI-NEXT: v_writelane_b32 v61, s46, 26 +; SI-NEXT: v_writelane_b32 v61, s47, 27 +; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 8 +; SI-NEXT: v_writelane_b32 v61, s46, 24 +; SI-NEXT: v_writelane_b32 v61, s47, 25 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 +; SI-NEXT: v_writelane_b32 v61, s46, 22 +; SI-NEXT: v_writelane_b32 v61, s47, 23 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v61, s46, 20 +; SI-NEXT: v_writelane_b32 v61, s47, 21 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 8 +; SI-NEXT: v_writelane_b32 v61, s46, 18 +; SI-NEXT: v_writelane_b32 v61, s47, 19 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v61, s46, 16 +; SI-NEXT: v_writelane_b32 v61, s47, 17 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v61, s46, 14 +; SI-NEXT: v_writelane_b32 v61, s47, 15 +; SI-NEXT: s_lshr_b64 s[46:47], s[14:15], 8 +; SI-NEXT: v_writelane_b32 v61, s46, 12 +; SI-NEXT: v_writelane_b32 v61, s47, 13 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v61, s46, 10 +; SI-NEXT: v_writelane_b32 v61, s47, 11 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: v_writelane_b32 v61, s46, 8 +; SI-NEXT: v_writelane_b32 v61, s47, 9 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 8 +; SI-NEXT: v_writelane_b32 v61, s46, 6 +; SI-NEXT: v_writelane_b32 v61, s47, 7 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v61, s46, 4 +; SI-NEXT: v_writelane_b32 v61, s47, 5 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v61, s46, 2 +; SI-NEXT: v_writelane_b32 v61, s47, 3 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v61, s46, 0 +; SI-NEXT: s_lshr_b64 s[48:49], s[44:45], 16 +; SI-NEXT: v_writelane_b32 v61, s47, 1 +; SI-NEXT: s_lshr_b64 s[50:51], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[52:53], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[64:65], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[66:67], s[6:7], 16 +; SI-NEXT: s_lshr_b64 
s[68:69], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[82:83], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[84:85], s[28:29], 24 +; SI-NEXT: s_lshr_b64 s[86:87], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[96:97], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[98:99], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 +; SI-NEXT: s_cbranch_execnz .LBB73_4 +; SI-NEXT: .LBB73_2: ; %cmp.true +; SI-NEXT: v_add_f64 v[5:6], s[40:41], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[14:15], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v6 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v8 +; SI-NEXT: v_add_f64 v[9:10], s[12:13], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v10 +; SI-NEXT: v_add_f64 v[11:12], s[10:11], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v10 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v10 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_lshrrev_b32_e32 v33, 24, v14 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v12 +; SI-NEXT: v_add_f64 v[13:14], s[8:9], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v12 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v12 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v14 +; SI-NEXT: v_add_f64 v[15:16], s[6:7], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v14 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v16 +; SI-NEXT: v_add_f64 v[17:18], s[4:5], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v10 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v18 +; SI-NEXT: v_add_f64 v[19:20], s[28:29], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v18 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v18 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], 
s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v20 +; SI-NEXT: v_add_f64 v[21:22], s[26:27], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v20 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v20 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v22 +; SI-NEXT: v_add_f64 v[23:24], s[24:25], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v22 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v24 +; SI-NEXT: v_add_f64 v[38:39], s[22:23], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 -; SI-NEXT: v_alignbit_b32 v38, v28, v27, 24 -; SI-NEXT: v_alignbit_b32 v48, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v50, v28, v27, 8 -; SI-NEXT: v_alignbit_b32 v52, v30, v29, 24 -; SI-NEXT: v_alignbit_b32 v54, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v40, v30, v29, 8 -; SI-NEXT: v_alignbit_b32 v42, v32, v31, 24 -; SI-NEXT: v_alignbit_b32 v44, v32, v31, 16 -; SI-NEXT: v_alignbit_b32 v46, v32, v31, 8 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v22 -; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v24 -; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v26 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 -; 
SI-NEXT: v_lshrrev_b32_e32 v57, 8, v26 -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v28 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v28 -; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v30 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v30 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v32 -; SI-NEXT: s_cbranch_execnz .LBB73_3 -; SI-NEXT: .LBB73_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_alignbit_b32 v33, v18, v17, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v39 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v39 +; SI-NEXT: v_add_f64 v[52:53], s[20:21], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v18, v17, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v39 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v16, v15, 24 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v53 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v53 +; SI-NEXT: v_add_f64 v[44:45], s[18:19], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v16, v15, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v53 +; SI-NEXT: v_add_f64 v[1:2], s[44:45], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v14, v13, 24 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v45 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v45 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v14, v13, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v12, v11, 24 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 8 +; SI-NEXT: v_add_f64 v[3:4], s[42:43], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v12, v11, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v10, v9, 24 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v10, v9, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v10, v9, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[5:6], 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v8, v7, 24 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[5:6], 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v8, v7, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[5:6], 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v8, v7, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[7:8], 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill 
+; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 24 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[7:8], 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[7:8], 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v4, v3, 24 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v4, v3, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v4, v3, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 24 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, 
off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v20, v19, 24 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v20, v19, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v20, v19, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[15:16], 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v22, v21, 24 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[15:16], 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v22, v21, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[15:16], 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v22, v21, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v24, v23, 24 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[25:26], v[19:20], 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[25:26], v[19:20], 16 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], 
s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[25:26], v[19:20], 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[25:26], v[21:22], 24 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[25:26], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[47:48], v[23:24], 16 +; SI-NEXT: v_add_f64 v[58:59], s[16:17], 1.0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[35:36], v[21:22], 8 +; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 8 +; SI-NEXT: v_lshrrev_b32_e32 v27, 24, v2 +; SI-NEXT: v_lshr_b64 v[36:37], v[23:24], 24 +; SI-NEXT: v_lshr_b64 v[49:50], v[38:39], 24 +; SI-NEXT: v_lshr_b64 v[40:41], v[38:39], 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[25:26], v[44:45], 8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v2 +; SI-NEXT: v_mov_b32_e32 v37, v27 +; SI-NEXT: v_lshr_b64 v[50:51], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[52:53], 24 +; SI-NEXT: v_lshr_b64 v[54:55], v[52:53], 8 +; SI-NEXT: v_lshr_b64 v[26:27], v[58:59], 24 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v2 +; SI-NEXT: v_mov_b32_e32 v51, v28 +; SI-NEXT: v_lshr_b64 v[42:43], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[55:56], v[44:45], 24 +; SI-NEXT: v_lshr_b64 v[27:28], v[58:59], 16 +; SI-NEXT: v_mov_b32_e32 v43, v29 +; SI-NEXT: v_lshr_b64 v[56:57], v[44:45], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[58:59], 8 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v45 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v59 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v59 +; SI-NEXT: v_lshrrev_b32_e32 v57, 24, v4 +; SI-NEXT: s_branch .LBB73_5 +; SI-NEXT: .LBB73_3: +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 0 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 1 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr66 +; 
SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 2 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 3 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 4 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 5 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 6 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 7 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 8 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 9 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 10 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 11 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 12 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 13 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 14 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 15 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 16 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 17 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 18 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 19 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 20 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 21 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 22 
+; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 23 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 24 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 25 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 26 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s49, 27 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 28 +; SI-NEXT: v_writelane_b32 v61, s49, 29 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 30 +; SI-NEXT: v_writelane_b32 v61, s49, 31 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: v_writelane_b32 v61, s48, 32 +; SI-NEXT: v_writelane_b32 v61, s49, 33 +; SI-NEXT: ; kill: killed $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: s_branch .LBB73_2 +; SI-NEXT: .LBB73_4: +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 34 +; SI-NEXT: v_mov_b32_e32 v37, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 35 +; SI-NEXT: v_mov_b32_e32 v51, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 36 +; SI-NEXT: v_mov_b32_e32 v43, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 37 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v57, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 38 +; SI-NEXT: v_mov_b32_e32 v33, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 39 +; SI-NEXT: v_mov_b32_e32 v30, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 40 +; SI-NEXT: v_mov_b32_e32 v34, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 41 +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 42 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v24, v23, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 43 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v24, v23, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 44 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v26, v25, 24 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 45 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v26, v25, 16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 
4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 46 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v33, v26, v25, 8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 47 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v18 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 48 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v18 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 49 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 50 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 51 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v16 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 52 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v14 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 53 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 54 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v14 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 55 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 56 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, 
s4 +; SI-NEXT: v_readlane_b32 s4, v61, 57 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v12 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 58 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v10 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 59 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 60 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v10 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 61 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 62 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 63 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v8 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v6 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 1 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 2 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v6 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 3 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 4 +; SI-NEXT: 
buffer_store_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 5 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v4 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 6 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v2 -; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 7 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 8 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v2 -; SI-NEXT: v_alignbit_b32 v38, v28, v27, 24 -; SI-NEXT: v_alignbit_b32 v48, v28, v27, 16 -; SI-NEXT: v_alignbit_b32 v50, v28, v27, 8 -; SI-NEXT: v_alignbit_b32 v52, v30, v29, 24 -; SI-NEXT: v_alignbit_b32 v54, v30, v29, 16 -; SI-NEXT: v_alignbit_b32 v40, v30, v29, 8 -; SI-NEXT: v_alignbit_b32 v42, v32, v31, 24 -; SI-NEXT: v_alignbit_b32 v44, v32, v31, 16 -; SI-NEXT: v_alignbit_b32 v46, v32, v31, 8 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v20 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v55, 8, v22 -; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v24 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v24 -; SI-NEXT: v_lshrrev_b32_e32 v47, 24, v26 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v26 -; SI-NEXT: v_lshrrev_b32_e32 v58, 24, v28 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v28 -; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v30 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v30 +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 9 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v35, 8, v32 -; SI-NEXT: .LBB73_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v46 -; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 -; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 -; SI-NEXT: v_or_b32_e32 v31, v31, v46 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v42 -; SI-NEXT: v_or_b32_e32 v42, v42, v44 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 
v31, v31, v42 -; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 10 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v35 -; SI-NEXT: v_or_b32_e32 v31, v31, v32 -; SI-NEXT: v_and_b32_e32 v32, 0xff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v33 -; SI-NEXT: v_or_b32_e32 v32, v33, v32 -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v31, v32 -; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 11 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v40 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; SI-NEXT: v_or_b32_e32 v29, v29, v31 -; SI-NEXT: v_and_b32_e32 v31, 0xff, v54 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v52 -; SI-NEXT: v_or_b32_e32 v31, v32, v31 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v29, v29, v31 -; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 12 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v63 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_and_b32_e32 v30, 0xff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v61 -; SI-NEXT: v_or_b32_e32 v30, v31, v30 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 13 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v50 +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_mov_b32_e32 v29, s46 +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s98 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s96 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s86 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s84 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte 
Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s82 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s80 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s70 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s68 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s66 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s64 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s54 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s52 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s50 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s4, v62, 14 +; SI-NEXT: v_mov_b32_e32 v60, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 15 +; SI-NEXT: v_mov_b32_e32 v31, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 16 +; SI-NEXT: v_mov_b32_e32 v32, s4 +; SI-NEXT: v_readlane_b32 s4, v62, 17 +; SI-NEXT: v_mov_b32_e32 v18, s5 +; SI-NEXT: v_mov_b32_e32 v46, s4 +; SI-NEXT: v_readlane_b32 s4, v61, 0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 1 +; SI-NEXT: v_readlane_b32 s4, v61, 2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 3 +; SI-NEXT: v_readlane_b32 s4, v61, 4 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 5 +; SI-NEXT: v_readlane_b32 s4, v61, 6 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: 
buffer_store_dword v29, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 7 +; SI-NEXT: v_readlane_b32 s4, v61, 8 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 9 +; SI-NEXT: v_readlane_b32 s4, v61, 10 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 11 +; SI-NEXT: v_readlane_b32 s4, v61, 12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 13 +; SI-NEXT: v_readlane_b32 s4, v61, 14 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 15 +; SI-NEXT: v_readlane_b32 s4, v61, 16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 17 +; SI-NEXT: v_readlane_b32 s4, v61, 18 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 19 +; SI-NEXT: v_readlane_b32 s4, v61, 20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 21 +; SI-NEXT: v_readlane_b32 s4, v61, 22 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 23 +; SI-NEXT: v_readlane_b32 s4, v61, 24 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 25 +; SI-NEXT: v_readlane_b32 s4, v61, 26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 27 +; SI-NEXT: v_readlane_b32 s4, v61, 28 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 29 +; SI-NEXT: v_readlane_b32 s4, v61, 30 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s48 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_readlane_b32 s5, v61, 31 +; SI-NEXT: v_readlane_b32 s4, v61, 32 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v29, s4 +; SI-NEXT: v_mov_b32_e32 v59, s17 +; SI-NEXT: v_mov_b32_e32 v58, s16 +; SI-NEXT: v_mov_b32_e32 v45, s19 +; SI-NEXT: v_mov_b32_e32 v44, s18 +; SI-NEXT: v_mov_b32_e32 v53, s21 +; SI-NEXT: v_mov_b32_e32 v52, s20 +; SI-NEXT: v_mov_b32_e32 v39, s23 +; SI-NEXT: v_mov_b32_e32 v38, s22 +; SI-NEXT: v_mov_b32_e32 v24, s25 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v21, s26 +; SI-NEXT: v_mov_b32_e32 v20, s29 +; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v16, s7 +; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v14, s9 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v12, s11 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v10, s13 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v8, s15 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v6, s41 +; SI-NEXT: v_mov_b32_e32 v5, s40 +; SI-NEXT: v_mov_b32_e32 v4, s43 +; SI-NEXT: v_mov_b32_e32 v3, s42 +; SI-NEXT: v_mov_b32_e32 v2, s45 +; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: v_mov_b32_e32 v28, s38 +; SI-NEXT: v_mov_b32_e32 v27, s36 +; SI-NEXT: v_mov_b32_e32 v26, s34 +; SI-NEXT: v_mov_b32_e32 v25, s30 +; SI-NEXT: v_mov_b32_e32 v56, s94 +; SI-NEXT: v_mov_b32_e32 v55, s92 +; SI-NEXT: v_mov_b32_e32 v54, s90 +; SI-NEXT: v_mov_b32_e32 v42, s88 +; SI-NEXT: v_mov_b32_e32 v41, s78 +; SI-NEXT: v_mov_b32_e32 v40, s76 +; SI-NEXT: v_mov_b32_e32 v50, s74 +; SI-NEXT: v_mov_b32_e32 v49, s72 +; SI-NEXT: v_mov_b32_e32 v48, s62 +; SI-NEXT: v_mov_b32_e32 v47, s60 +; SI-NEXT: v_mov_b32_e32 v36, s58 +; SI-NEXT: v_mov_b32_e32 v35, s56 +; SI-NEXT: v_readlane_b32 s5, v61, 33 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: .LBB73_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v28 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v29, 0xff, v58 ; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 -; SI-NEXT: v_or_b32_e32 v27, v27, v29 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v38 -; SI-NEXT: v_or_b32_e32 v29, v30, v29 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v27, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v60 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v58 ; SI-NEXT: v_or_b32_e32 v28, v29, v28 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, 
v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v59 +; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v46 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v31 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v26, 0xff, v44 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v55 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v60 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; 
SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 -; SI-NEXT: v_or_b32_e32 v25, v25, v27 -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v54 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v52 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v41 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v25, 0xff, v53 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v27, v28, v27 -; SI-NEXT: v_or_b32_e32 v25, v25, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v57 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v56 +; SI-NEXT: v_add_i32_e32 v26, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v40 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v38 +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v49 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v47 ; SI-NEXT: v_or_b32_e32 v26, v27, v26 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 
v25, 0xff, v39 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_lshlrev_b32_e32 v26, 8, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26 +; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; SI-NEXT: v_or_b32_e32 v26, v27, v26 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v48 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v36 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v23, v23, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v45 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v24 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v43 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v25 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v41 ; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v35 ; SI-NEXT: v_or_b32_e32 v21, v21, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 ; SI-NEXT: v_lshlrev_b32_e32 v23, 16, 
v23 ; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_or_b32_e32 v21, v21, v23 @@ -113352,28 +115385,37 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v55 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v53 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v51 ; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_or_b32_e32 v21, v22, v21 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 @@ -113381,518 +115423,370 @@ define inreg <128 x i8> @bitcast_v16f64_to_v128i8_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v49 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v39 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v37 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 -; 
SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v20 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_or_b32_e32 v1, v1, v19 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v19, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_or_b32_e32 v18, v19, v18 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; SI-NEXT: v_or_b32_e32 
v15, v15, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v17, v18, v17 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xff, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v17 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_or_b32_e32 v15, v15, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_or_b32_e32 v14, v15, v14 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v7 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_or_b32_e32 v12, v13, v12 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v8 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, 
off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: v_or_b32_e32 v11, v12, v11 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v9, v9, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x60, v0 +; SI-NEXT: 
buffer_store_dword v7, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v7, v7, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v15 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v57 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword 
v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v37 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB73_4: -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 
-; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: 
killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: s_branch .LBB73_2 ; ; VI-LABEL: bitcast_v16f64_to_v128i8_scalar: ; VI: ; %bb.0: @@ -123434,8 +125328,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 @@ -123443,133 +125344,93 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:144 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:176 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_lshlrev_b32_e32 v1, 8, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v21 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v62, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v29 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v25 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v45 +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v43 +; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v42 +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v41 +; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v53 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v40 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v44 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v51 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v42 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; 
SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v34 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v40 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v54 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v39 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v33 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v35 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v37 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:200 @@ -123578,31 +125439,31 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:224 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:232 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 @@ -123614,140 +125475,206 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:304 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, 
v13 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:36 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:196 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:212 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 -; 
SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:228 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:156 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v3 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:220 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:252 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:244 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:612 ; 4-byte Folded 
Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v17, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:276 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; 
SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB75_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v3, off, s[0:3], 
s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v57, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v0, v0, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v4, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v0, v0, v30 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, v0, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v30, v5 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v5, v2, v3 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: v_mov_b32_e32 v2, v9 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -123756,306 +125683,277 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, 
s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v58, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_mov_b32_e32 v2, v9 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v10, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 
4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v29, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v11, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v12, v1 +; SI-NEXT: v_or_b32_e32 v1, v44, v1 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v25 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v0, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v36 +; SI-NEXT: v_or_b32_e32 v0, v0, v40 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 +; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v62 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v18 +; SI-NEXT: v_or_b32_e32 v0, v0, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: 
buffer_load_dword v16, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_or_b32_e32 v1, v15, v1 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 -; SI-NEXT: v_mov_b32_e32 v43, v16 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mov_b32_e32 v50, v16 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v16, v1 ; SI-NEXT: v_or_b32_e32 v16, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_mov_b32_e32 v48, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v32, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v34 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v17, v1 ; SI-NEXT: v_or_b32_e32 v17, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 -; SI-NEXT: v_mov_b32_e32 v55, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v40, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v44 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v18, v1 ; SI-NEXT: v_or_b32_e32 v18, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v50 -; SI-NEXT: v_mov_b32_e32 v44, v23 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v57, v1 ; 
SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v19, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: v_mov_b32_e32 v61, v45 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v54 +; SI-NEXT: v_mov_b32_e32 v54, v23 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v20, v1 ; SI-NEXT: v_or_b32_e32 v20, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v52 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v21, v1 ; SI-NEXT: v_or_b32_e32 v21, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 -; SI-NEXT: v_mov_b32_e32 v59, v24 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v22, v1 ; SI-NEXT: v_or_b32_e32 v22, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xff, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v39, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_mov_b32_e32 v45, v24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v34, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: 
v_and_b32_e32 v1, 0xff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v23, v1 ; SI-NEXT: v_or_b32_e32 v23, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v59 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v24, v1 ; SI-NEXT: v_or_b32_e32 v24, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v42 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v1 +; SI-NEXT: v_mov_b32_e32 v43, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 ; SI-NEXT: v_or_b32_e32 v25, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v45 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v56 +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v63, v1 +; SI-NEXT: v_or_b32_e32 v1, v33, v1 ; SI-NEXT: v_or_b32_e32 v26, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v32, v1 +; SI-NEXT: v_mov_b32_e32 v37, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v27, v1 ; SI-NEXT: v_or_b32_e32 v27, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v28, v1 ; SI-NEXT: v_or_b32_e32 v28, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 
0xff, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v1 +; SI-NEXT: v_mov_b32_e32 v36, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v29, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: v_or_b32_e32 v0, v0, v30 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v49 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v30, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 ; SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v31, v0, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v40 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v0, 0xff, v44 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: v_mov_b32_e32 v38, v63 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 @@ -124082,108 +125980,112 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: v_mov_b32_e32 v57, v1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch .LBB75_3 ; SI-NEXT: .LBB75_2: -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword 
v39, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v56 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v45 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mov_b32_e32 v45, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: v_mov_b32_e32 v34, v35 -; SI-NEXT: v_mov_b32_e32 v35, v36 -; SI-NEXT: v_mov_b32_e32 v36, v54 -; SI-NEXT: v_mov_b32_e32 v54, v37 -; SI-NEXT: v_mov_b32_e32 v37, v41 -; SI-NEXT: v_mov_b32_e32 v41, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:552 ; 4-byte Folded 
Reload ; SI-NEXT: .LBB75_3: ; %Flow -; SI-NEXT: v_mov_b32_e32 v63, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v35, v57 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB75_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v0, s4, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_and_b32 s4, s16, 0xff ; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s6, s18, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, s4, v0 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s20, 0xff ; SI-NEXT: s_lshl_b32 s6, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_and_b32 s7, s22, 0xff ; SI-NEXT: s_addk_i32 s5, 0x300 ; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s6, s7 -; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s6, s24, 0xff ; SI-NEXT: s_lshl_b32 s7, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: s_and_b32 s8, s26, 0xff ; SI-NEXT: s_addk_i32 s6, 0x300 ; SI-NEXT: s_lshl_b32 s7, s27, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124192,17 +126094,17 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124212,15 +126114,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124230,15 +126132,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124248,15 +126150,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124266,15 +126168,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124284,15 +126186,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124302,15 +126204,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124319,16 +126221,17 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124338,15 +126241,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124356,84 +126259,79 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 +; SI-NEXT: v_or_b32_e32 v1, v50, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -124442,15 +126340,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: 
v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 @@ -124459,15 +126357,15 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124477,9 +126375,9 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 @@ -124495,106 +126393,110 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
v_add_i32_e32 v1, vcc, 3, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v55, v1 +; SI-NEXT: v_or_b32_e32 v1, v48, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x3000000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v60 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v44, v1 +; SI-NEXT: v_or_b32_e32 v1, v54, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_or_b32_e32 v1, v59, v1 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x3000000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v46, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v56 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -124602,14 +126504,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -124617,14 +126519,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -124654,7 +126556,7 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v128i8_to_v16f64_scalar: @@ -124676,113 +126578,115 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded 
Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:112 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:80 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:88 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:96 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:104 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:136 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:144 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:152 +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:160 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:168 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v15 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v19 ; VI-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v17 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v23 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v25 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v43 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v42 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v41 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v40 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v55 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v54 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v53 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v52 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v51 -; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v50 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v22 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 @@ -124791,29 +126695,28 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v52, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v38 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; 
VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v13 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v49, 8, v3 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v9 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v7 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 @@ -124823,130 +126726,141 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v48, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v11 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:12 +; 
VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(12) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v7 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v37, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:76 +; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:84 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:92 +; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:100 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:108 +; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:116 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 ; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:260 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:268 -; VI-NEXT: 
buffer_load_ushort v42, off, s[0:3], s32 offset:276 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:284 ; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:308 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:324 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 
offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:316 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v49, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, 
s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB75_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v2, v6, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -124955,208 +126869,197 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v3, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v3, v7 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: 
v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v29, v9 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte 
Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v59, v0 -; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v1 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v0 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v37, v0 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v62, v0 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v63, v1 +; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_mov_b32_e32 v60, v0 +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v21, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v51, v3 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_mov_b32_e32 v35, v0 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v34, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v22, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v34, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v45, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v23, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_or_b32_sdwa v0, v30, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v59, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v54, v0 -; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v32, v61 +; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v55, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v55, v43 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v46, v61 +; VI-NEXT: v_or_b32_sdwa v0, v42, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v53, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v45, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v41, v39 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v44, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v41, v33 ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v58, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v44, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v47, v45 +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_or_b32_sdwa v0, v56, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v48, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v42, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v36, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v1, v33, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v50, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v40, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v42 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v51, 
v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v48, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v45 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v49, v51 +; VI-NEXT: v_mov_b32_e32 v40, v34 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v57, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -125187,85 +127090,95 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_mov_b64 s[4:5], 0 ; VI-NEXT: s_branch .LBB75_3 ; VI-NEXT: .LBB75_2: -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v32, v54 -; VI-NEXT: v_mov_b32_e32 v43, v49 -; VI-NEXT: v_mov_b32_e32 v46, v61 -; VI-NEXT: v_mov_b32_e32 v47, v45 -; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v34, v26 -; VI-NEXT: v_mov_b32_e32 v58, v44 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_mov_b32_e32 v63, v42 -; 
VI-NEXT: v_mov_b32_e32 v51, v7 -; VI-NEXT: v_mov_b32_e32 v48, v29 +; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v44, v56 +; VI-NEXT: v_mov_b32_e32 v41, v33 +; VI-NEXT: v_mov_b32_e32 v50, v40 +; VI-NEXT: v_mov_b32_e32 v38, v39 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v54, v53 +; VI-NEXT: v_mov_b32_e32 v52, v36 +; VI-NEXT: v_mov_b32_e32 v49, v51 ; VI-NEXT: s_mov_b64 s[4:5], -1 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; VI-NEXT: .LBB75_3: ; %Flow +; VI-NEXT: v_mov_b32_e32 v51, v41 +; VI-NEXT: v_mov_b32_e32 v36, v44 +; VI-NEXT: v_mov_b32_e32 v53, v54 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_mov_b32_e32 v54, v60 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; VI-NEXT: v_mov_b32_e32 v44, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_mov_b32_e32 v46, v49 ; VI-NEXT: s_cbranch_vccnz .LBB75_5 ; VI-NEXT: ; %bb.4: ; %cmp.true -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, 
off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_e32 v0, s4, v0 ; VI-NEXT: s_add_i32 s16, s16, 3 -; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_add_i32 s18, s18, 3 +; VI-NEXT: s_lshl_b32 s6, s19, 8 +; VI-NEXT: s_add_i32 s20, s20, 3 +; VI-NEXT: s_add_i32 s22, s22, 3 +; VI-NEXT: s_lshl_b32 s7, s23, 8 +; VI-NEXT: s_add_i32 s24, s24, 3 +; VI-NEXT: s_add_i32 s26, s26, 3 +; VI-NEXT: s_lshl_b32 s8, s27, 8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v0, s4, v0 +; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s18, 0xff -; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 -; VI-NEXT: s_add_i32 s20, s20, 3 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s20, 0xff ; VI-NEXT: s_lshl_b32 s6, s21, 8 -; VI-NEXT: s_add_i32 s22, s22, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s22, 0xff -; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 -; VI-NEXT: s_add_i32 s24, s24, 3 ; VI-NEXT: s_or_b32 s5, s6, s5 ; VI-NEXT: s_and_b32 s6, s24, 0xff ; VI-NEXT: s_lshl_b32 s7, s25, 8 -; VI-NEXT: s_add_i32 s26, s26, 3 ; VI-NEXT: s_or_b32 s6, s7, s6 ; VI-NEXT: s_and_b32 s7, s26, 0xff -; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_or_b32 s7, s8, s7 ; VI-NEXT: s_and_b32 s6, s6, 0xffff @@ -125274,26 +127187,25 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -125301,8 +127213,8 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -125314,9 +127226,9 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -125328,14 +127240,14 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 @@ -125343,280 +127255,280 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; VI-NEXT: 
s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; 
VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: 
s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v36 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v58 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v0 -; 
VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v63 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v41 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v62 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v60 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v49 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v53 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, 
v48 +; VI-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v39 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -125661,504 +127573,524 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v53, off, 
s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:104 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:136 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:144 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:152 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:136 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:144 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:152 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:160 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:168 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:176 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v11 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v19 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v25 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v27 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v3 -; GFX9-NEXT: 
v_lshlrev_b32_e32 v8, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX9-NEXT: s_waitcnt vmcnt(35) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(34) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v27 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v45 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v43 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v34 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v42 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v49 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v39 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v37 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v24 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v40 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v36 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v38 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v48 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v50 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v51 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v52 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v53 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: v_lshlrev_b32_e32 v41, 8, v41 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v13 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v3 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v5 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v9 ; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; 
GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v49, 8, v4 -; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 +; GFX9-NEXT: v_lshlrev_b32_e32 v40, 8, v7 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 -; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 +; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v13 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v11 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; 
GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v5 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v9 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v7 ; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:164 -; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:244 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:300 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_waitcnt vmcnt(42) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v2 -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:632 ; 4-byte Folded 
Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:52 
+; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 +; GFX9-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:140 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:148 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:220 +; GFX9-NEXT: s_waitcnt vmcnt(29) +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:228 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:236 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:244 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:252 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:268 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:284 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:292 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:308 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, 
off, s[0:3], s32 offset:316 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 
offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB75_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v2, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v6, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v8, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff +; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: 
v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v13, v41 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v38 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v61, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v63, v57 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v37, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v27, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_mov_b32_e32 v47, v0 
+; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v53 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v21, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v38, v51 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v49 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v58 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v37, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v60 -; GFX9-NEXT: v_mov_b32_e32 v52, v56 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_mov_b32_e32 v34, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_or_b32_sdwa v1, v63, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_or_b32_sdwa v0, v54, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v43 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, 
v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_or_b32_sdwa v1, v32, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v51, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v1, v55, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v53, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v39, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v45, v62 +; GFX9-NEXT: v_mov_b32_e32 v46, v56 +; GFX9-NEXT: v_mov_b32_e32 v56, v58 +; GFX9-NEXT: v_mov_b32_e32 v58, v53 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -126189,32 +128121,48 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB75_3 ; GFX9-NEXT: .LBB75_2: -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v61, v0 -; GFX9-NEXT: v_mov_b32_e32 v63, v57 -; GFX9-NEXT: v_mov_b32_e32 v53, v3 +; GFX9-NEXT: v_mov_b32_e32 v38, v51 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v51, 
off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v33, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v35, v62 +; GFX9-NEXT: v_mov_b32_e32 v36, v31 +; GFX9-NEXT: v_mov_b32_e32 v40, v30 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-NEXT: v_mov_b32_e32 v57, v38 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB75_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v62, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB75_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v61 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -126261,348 +128209,352 @@ define inreg <16 x double> @bitcast_v128i8_to_v16f64_scalar(<128 x i8> inreg %a, ; GFX9-NEXT: s_movk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_and_b32 s8, s8, 0xffff +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v38 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; 
GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, 
off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, 
v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v17, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v63 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v53 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; 
GFX9-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v46 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v63 +; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 
v0, 3, v39 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v40 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v32 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v43 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v44 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v42 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v48 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: 
v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v54 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 @@ -129496,92 +131448,92 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_writelane_b32 v62, s46, 3 ; SI-NEXT: s_cbranch_execnz .LBB77_4 ; SI-NEXT: .LBB77_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 ; SI-NEXT: v_add_f64 v[3:4], s[6:7], 1.0 -; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v20 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v35 +; SI-NEXT: v_add_f64 v[1:2], s[22:23], 1.0 ; SI-NEXT: v_add_f64 v[41:42], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], 
s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v16 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16 +; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v28 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v28 +; SI-NEXT: v_and_b32_e32 v45, 0xffff0000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v27 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v54, 0xffff0000, v42 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v42 -; SI-NEXT: v_and_b32_e32 v40, 0xffff0000, v41 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v42 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v42 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v41 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v41 ; SI-NEXT: v_and_b32_e32 v42, 0xffff0000, v2 ; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v2 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[2:3], s[20:21], 1.0 ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; SI-NEXT: v_and_b32_e32 v46, 0xffff0000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v3 +; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v32 +; SI-NEXT: v_and_b32_e32 v57, 0xffff0000, v31 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v31 +; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v3 ; SI-NEXT: v_add_f64 v[3:4], s[16:17], 1.0 -; SI-NEXT: v_and_b32_e32 v43, 0xffff0000, v1 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v1 -; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v4 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v1 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; SI-NEXT: v_mov_b32_e32 v4, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_f64 v[51:52], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[49:50], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[35:36], s[44:45], 1.0 -; SI-NEXT: v_add_f64 v[31:32], s[42:43], 1.0 -; SI-NEXT: v_add_f64 v[27:28], s[40:41], 1.0 ; SI-NEXT: v_add_f64 v[23:24], s[14:15], 1.0 -; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[4:5], 1.0 +; SI-NEXT: v_add_f64 v[19:20], s[12:13], 1.0 ; SI-NEXT: v_add_f64 v[59:60], s[18:19], 1.0 ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v8 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v8 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v12 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v11 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 -; 
SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v16 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v16 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v15 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v15 +; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v20 ; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v20 -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v24 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v36 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v35 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v50 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v50 -; SI-NEXT: v_and_b32_e32 v48, 0xffff0000, v49 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v49 +; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v19 +; SI-NEXT: v_and_b32_e32 v53, 0xffff0000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v24 +; SI-NEXT: v_and_b32_e32 v55, 0xffff0000, v23 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v23 +; SI-NEXT: v_and_b32_e32 v61, 0xffff0000, v36 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v36 +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v49 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 ; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v52 ; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v52 ; SI-NEXT: v_and_b32_e32 v52, 0xffff0000, v51 ; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_and_b32_e32 v47, 0xffff0000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v2 -; SI-NEXT: v_and_b32_e32 v58, 0xffff0000, v60 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v60 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v60 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v60 ; SI-NEXT: v_and_b32_e32 v60, 0xffff0000, v59 ; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v59 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 @@ -129658,17 +131610,17 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: ; kill: killed $sgpr46 ; SI-NEXT: s_branch .LBB77_2 ; SI-NEXT: .LBB77_4: -; SI-NEXT: v_mov_b32_e32 v1, s71 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s69 +; SI-NEXT: v_mov_b32_e32 v1, s36 ; SI-NEXT: v_readlane_b32 s4, v62, 0 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s68 -; SI-NEXT: v_mov_b32_e32 v61, s4 +; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: v_readlane_b32 s4, v62, 2 @@ -129690,45 +131642,45 
@@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_mov_b32_e32 v26, s82 ; SI-NEXT: v_mov_b32_e32 v33, s81 ; SI-NEXT: v_mov_b32_e32 v30, s80 +; SI-NEXT: v_mov_b32_e32 v37, s71 ; SI-NEXT: v_mov_b32_e32 v34, s70 -; SI-NEXT: v_mov_b32_e32 v8, s67 -; SI-NEXT: v_mov_b32_e32 v7, s66 -; SI-NEXT: v_mov_b32_e32 v24, s65 -; SI-NEXT: v_mov_b32_e32 v23, s64 -; SI-NEXT: v_mov_b32_e32 v16, s55 -; SI-NEXT: v_mov_b32_e32 v15, s54 -; SI-NEXT: v_mov_b32_e32 v28, s53 -; SI-NEXT: v_mov_b32_e32 v27, s52 -; SI-NEXT: v_mov_b32_e32 v12, s51 -; SI-NEXT: v_mov_b32_e32 v11, s50 -; SI-NEXT: v_mov_b32_e32 v32, s49 -; SI-NEXT: v_mov_b32_e32 v31, s48 -; SI-NEXT: v_mov_b32_e32 v20, s39 -; SI-NEXT: v_mov_b32_e32 v19, s38 -; SI-NEXT: v_mov_b32_e32 v36, s37 -; SI-NEXT: v_mov_b32_e32 v35, s36 -; SI-NEXT: v_mov_b32_e32 v38, s35 -; SI-NEXT: v_mov_b32_e32 v37, s34 -; SI-NEXT: v_mov_b32_e32 v48, s31 -; SI-NEXT: v_mov_b32_e32 v39, s30 +; SI-NEXT: v_mov_b32_e32 v39, s69 +; SI-NEXT: v_mov_b32_e32 v38, s68 +; SI-NEXT: v_mov_b32_e32 v53, s67 +; SI-NEXT: v_mov_b32_e32 v48, s66 +; SI-NEXT: v_mov_b32_e32 v55, s65 +; SI-NEXT: v_mov_b32_e32 v54, s64 +; SI-NEXT: v_mov_b32_e32 v43, s55 +; SI-NEXT: v_mov_b32_e32 v40, s54 +; SI-NEXT: v_mov_b32_e32 v45, s53 +; SI-NEXT: v_mov_b32_e32 v44, s52 +; SI-NEXT: v_mov_b32_e32 v47, s51 +; SI-NEXT: v_mov_b32_e32 v46, s50 +; SI-NEXT: v_mov_b32_e32 v57, s49 +; SI-NEXT: v_mov_b32_e32 v56, s48 +; SI-NEXT: v_mov_b32_e32 v61, s39 +; SI-NEXT: v_mov_b32_e32 v58, s38 +; SI-NEXT: v_mov_b32_e32 v8, s35 +; SI-NEXT: v_mov_b32_e32 v24, s31 +; SI-NEXT: v_mov_b32_e32 v23, s30 ; SI-NEXT: v_mov_b32_e32 v50, s95 ; SI-NEXT: v_mov_b32_e32 v49, s94 ; SI-NEXT: v_mov_b32_e32 v52, s93 ; SI-NEXT: v_mov_b32_e32 v51, s92 -; SI-NEXT: v_mov_b32_e32 v54, s91 -; SI-NEXT: v_mov_b32_e32 v53, s90 -; SI-NEXT: v_mov_b32_e32 v40, s89 -; SI-NEXT: v_mov_b32_e32 v55, s88 +; SI-NEXT: v_mov_b32_e32 v16, s91 +; SI-NEXT: v_mov_b32_e32 v15, s90 +; SI-NEXT: v_mov_b32_e32 v28, s89 +; SI-NEXT: v_mov_b32_e32 v27, s88 ; SI-NEXT: v_mov_b32_e32 v42, s79 ; SI-NEXT: v_mov_b32_e32 v41, s78 -; SI-NEXT: v_mov_b32_e32 v43, s77 -; SI-NEXT: v_mov_b32_e32 v44, s76 -; SI-NEXT: v_mov_b32_e32 v46, s75 -; SI-NEXT: v_mov_b32_e32 v45, s74 -; SI-NEXT: v_mov_b32_e32 v47, s73 -; SI-NEXT: v_mov_b32_e32 v56, s72 -; SI-NEXT: v_mov_b32_e32 v58, s63 -; SI-NEXT: v_mov_b32_e32 v57, s62 +; SI-NEXT: v_mov_b32_e32 v11, s77 +; SI-NEXT: v_mov_b32_e32 v12, s76 +; SI-NEXT: v_mov_b32_e32 v32, s75 +; SI-NEXT: v_mov_b32_e32 v31, s74 +; SI-NEXT: v_mov_b32_e32 v19, s73 +; SI-NEXT: v_mov_b32_e32 v20, s72 +; SI-NEXT: v_mov_b32_e32 v36, s63 +; SI-NEXT: v_mov_b32_e32 v35, s62 ; SI-NEXT: v_mov_b32_e32 v60, s61 ; SI-NEXT: v_mov_b32_e32 v59, s60 ; SI-NEXT: v_mov_b32_e32 v3, s4 @@ -129739,7 +131691,7 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_alignbit_b32 v2, v2, v3, 16 ; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 @@ -129753,30 +131705,30 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: 
v_mul_f32_e32 v2, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v46 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v11 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -129788,16 +131740,16 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v55 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -129816,128 +131768,128 @@ define inreg <64 x bfloat> @bitcast_v16f64_to_v64bf16_scalar(<16 x double> inreg ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v37 +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: 
v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v36 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v61 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v57 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v56 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v46 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v45 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v44 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen 
; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v53 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s99, v63, 35 -; SI-NEXT: v_readlane_b32 s98, v63, 34 -; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: v_readlane_b32 s87, v63, 31 -; SI-NEXT: v_readlane_b32 s86, v63, 30 -; SI-NEXT: v_readlane_b32 s85, v63, 29 -; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: v_readlane_b32 s83, v63, 27 -; SI-NEXT: v_readlane_b32 s82, v63, 26 -; SI-NEXT: v_readlane_b32 s81, v63, 25 -; SI-NEXT: v_readlane_b32 s80, v63, 24 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s68, v63, 20 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s65, v63, 17 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s54, v63, 14 -; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s52, v63, 12 -; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: v_readlane_b32 s39, v63, 7 -; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v37 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -133206,24 +135158,23 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], 
s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 +; SI-NEXT: v_mov_b32_e32 v52, v30 +; SI-NEXT: v_mov_b32_e32 v53, v28 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) @@ -133233,165 +135184,177 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:68 -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v14 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v6 -; SI-NEXT: v_mov_b32_e32 v39, v10 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e32 v0, 1.0, v8 -; SI-NEXT: v_mov_b32_e32 v38, v12 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_mul_f32_e32 v0, 1.0, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v15 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v55 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v30 -; SI-NEXT: v_mov_b32_e32 v37, v14 -; SI-NEXT: v_mov_b32_e32 v14, v11 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 ; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v40 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v15 ; SI-NEXT: v_mul_f32_e32 v15, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: v_mul_f32_e32 v16, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v20 ; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v22 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v40, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v24 ; SI-NEXT: v_mul_f32_e32 v20, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v26 ; SI-NEXT: v_mul_f32_e32 v21, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v28 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v52 ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s26 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s29 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s28 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v42 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v51 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v43 -; SI-NEXT: 
v_mul_f32_e32 v52, 1.0, v44 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v45 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v46 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v47 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v56 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v57 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v58 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v59 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v60 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v61 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v38 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v31 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v60 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v42 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_mul_f32_e32 v37, 1.0, v62 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v63 +; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v0, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_mul_f32_e32 v31, 1.0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v36 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v42, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s22 -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, 
s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v35 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v36 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s24 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB79_4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB79_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v3, 16 +; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v2, v2, v8, 16 +; SI-NEXT: v_alignbit_b32 v3, v3, v9, 16 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_alignbit_b32 v6, v6, v7, 16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v5, v5, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v1, v1, v35, 16 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v4, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 -; SI-NEXT: v_alignbit_b32 v2, v2, v42, 16 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v3, v3, v33, 16 -; SI-NEXT: v_mov_b32_e32 v33, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v58 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v44 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_alignbit_b32 v5, v5, v11, 16 +; SI-NEXT: v_alignbit_b32 v7, v7, v14, 16 +; SI-NEXT: v_alignbit_b32 v8, v8, v54, 16 +; SI-NEXT: v_alignbit_b32 v9, v9, v46, 16 +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:168 ; 
4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_alignbit_b32 v13, v13, v47, 16 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v45 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v57 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 @@ -133399,16 +135362,6 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: v_alignbit_b32 v15, v15, v53, 16 -; SI-NEXT: v_alignbit_b32 v17, v17, v39, 16 -; SI-NEXT: v_alignbit_b32 v18, v18, v41, 16 -; SI-NEXT: v_alignbit_b32 v19, v19, v40, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v55, 16 -; SI-NEXT: v_alignbit_b32 v21, v21, v54, 16 -; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 @@ -133416,212 +135369,238 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_alignbit_b32 v30, v30, v31, 16 -; SI-NEXT: v_alignbit_b32 v23, v23, v52, 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_alignbit_b32 v4, v4, v34, 16 +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_alignbit_b32 v16, v16, v43, 16 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_alignbit_b32 v17, v17, v41, 16 +; SI-NEXT: v_alignbit_b32 v18, v18, v40, 16 +; SI-NEXT: v_mov_b32_e32 v40, v55 +; SI-NEXT: v_alignbit_b32 v19, v19, v55, 16 +; SI-NEXT: v_alignbit_b32 v20, v20, v48, 16 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_alignbit_b32 v21, v21, v53, 16 +; SI-NEXT: v_alignbit_b32 v22, v22, v52, 16 ; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_alignbit_b32 v24, v24, v51, 16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_alignbit_b32 v25, v25, v50, 16 +; SI-NEXT: v_alignbit_b32 v23, v23, v51, 16 +; SI-NEXT: v_alignbit_b32 v24, v24, v50, 16 ; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_alignbit_b32 v26, v26, v49, 16 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_alignbit_b32 v27, v27, v48, 16 -; SI-NEXT: v_mov_b32_e32 v48, v37 +; SI-NEXT: v_alignbit_b32 v25, v25, v49, 16 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: v_alignbit_b32 v26, v26, v39, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_alignbit_b32 v27, v27, v38, 16 +; SI-NEXT: v_mov_b32_e32 v35, v37 ; SI-NEXT: v_alignbit_b32 v28, v28, v37, 16 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: v_alignbit_b32 v29, v29, v32, 16 +; SI-NEXT: v_alignbit_b32 v31, v31, v42, 16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: s_waitcnt vmcnt(4) +; 
SI-NEXT: v_alignbit_b32 v10, v10, v61, 16 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v35, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_alignbit_b32 v12, v12, v54, 16 +; SI-NEXT: v_mov_b32_e32 v41, v61 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v43, v8 -; SI-NEXT: v_alignbit_b32 v7, v7, v8, 16 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v9 +; SI-NEXT: v_alignbit_b32 v11, v11, v59, 16 +; SI-NEXT: v_mov_b32_e32 v55, v59 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v32 -; SI-NEXT: v_alignbit_b32 v31, v31, v34, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v60, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_alignbit_b32 v8, v8, v9, 16 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v58, v11 -; SI-NEXT: v_alignbit_b32 v9, v9, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v14, v14, v45, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v11 -; SI-NEXT: v_alignbit_b32 v10, v10, v11, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v12 -; SI-NEXT: v_alignbit_b32 v11, v11, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v63, v14 -; SI-NEXT: v_alignbit_b32 v12, v12, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v14 -; SI-NEXT: v_alignbit_b32 v13, v13, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v36, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_alignbit_b32 v14, v14, v38, 16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v53, v38 -; SI-NEXT: v_alignbit_b32 v16, v16, v38, 16 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 +; SI-NEXT: v_alignbit_b32 v15, v15, v47, 16 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: s_branch .LBB79_3 +; SI-NEXT: .LBB79_2: +; SI-NEXT: v_mov_b32_e32 v63, v44 +; SI-NEXT: v_mov_b32_e32 v44, v43 +; SI-NEXT: v_mov_b32_e32 v43, v41 ; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v22, v54, 16 -; SI-NEXT: s_cbranch_execnz .LBB79_3 -; SI-NEXT: .LBB79_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: 
buffer_load_dword v16, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v35 +; SI-NEXT: v_mov_b32_e32 v48, v53 +; SI-NEXT: v_mov_b32_e32 v52, v51 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v61 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v58, v57 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v45 +; SI-NEXT: v_mov_b32_e32 v50, v49 +; SI-NEXT: v_mov_b32_e32 v36, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v34, v38 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v37, v32 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v33, v42 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB79_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v39, v52 +; SI-NEXT: v_mov_b32_e32 v49, v40 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB79_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: 
v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v57 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v47 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v45 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v60 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v58 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v36 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v29, off, 
s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 ; SI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v5, 
0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29 +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -133635,105 +135614,107 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_alignbit_b32 v3, v4, v3, 16 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_alignbit_b32 v6, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v43 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v45 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_alignbit_b32 v7, v8, v7, 16 ; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v42 ; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; SI-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 ; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_and_b32_e32 v10, 
0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v41 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_alignbit_b32 v10, v11, v10, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_alignbit_b32 v11, v12, v11, 16 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v63 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 ; SI-NEXT: v_alignbit_b32 v12, v13, v12, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v56 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 ; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v62 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_alignbit_b32 v14, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_alignbit_b32 v15, v16, v15, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_alignbit_b32 v16, v17, v16, 16 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v38 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v39 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 ; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; SI-NEXT: v_alignbit_b32 v18, v19, v18, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v49 ; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 ; SI-NEXT: v_alignbit_b32 v19, v20, v19, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v40 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_alignbit_b32 v20, v21, v20, 16 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_alignbit_b32 v22, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v39 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; SI-NEXT: v_alignbit_b32 v23, v24, v23, 16 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v52 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; SI-NEXT: v_alignbit_b32 v24, v25, v24, 16 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v38 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: 
v_and_b32_e32 v26, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v36 ; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; SI-NEXT: v_alignbit_b32 v26, v27, v26, 16 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v49 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v34 ; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; SI-NEXT: v_alignbit_b32 v27, v28, v27, 16 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v48 +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v35 ; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 ; SI-NEXT: v_alignbit_b32 v28, v29, v28, 16 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; SI-NEXT: v_alignbit_b32 v30, v31, v30, 16 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v33 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; SI-NEXT: v_alignbit_b32 v31, v32, v31, 16 -; SI-NEXT: .LBB79_3: ; %end +; SI-NEXT: .LBB79_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -133752,41 +135733,6 @@ define inreg <16 x double> @bitcast_v64bf16_to_v16f64_scalar(<64 x bfloat> inreg ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB79_4: -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v61, v53 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v59, v2 -; SI-NEXT: v_mov_b32_e32 v57, v11 -; SI-NEXT: v_mov_b32_e32 v47, v10 -; SI-NEXT: v_mov_b32_e32 v45, v12 -; SI-NEXT: v_mov_b32_e32 v33, v14 -; SI-NEXT: v_mov_b32_e32 v62, v38 -; SI-NEXT: v_mov_b32_e32 v38, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v40 -; SI-NEXT: v_mov_b32_e32 v40, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: 
v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB79_2 ; ; VI-LABEL: bitcast_v64bf16_to_v16f64_scalar: ; VI: ; %bb.0: @@ -136060,12 +138006,12 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; kill: killed $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 @@ -136074,38 +138020,41 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB80_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v11 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v26 ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v8, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 -; SI-NEXT: v_mov_b32_e32 v48, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v49 +; SI-NEXT: v_mov_b32_e32 v48, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v49 ; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v10 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v50 +; SI-NEXT: v_mov_b32_e32 v50, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v51 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, 
v10 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 @@ -136117,19 +138066,18 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v28 ; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 @@ -136152,7 +138100,6 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) @@ -136166,54 +138113,53 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v39 ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, 
s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v50, v9 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: v_mov_b32_e32 v53, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v35 -; SI-NEXT: v_mov_b32_e32 v35, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v62 +; 
SI-NEXT: v_cvt_f32_f16_e32 v58, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v36 ; SI-NEXT: v_mov_b32_e32 v36, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v40 -; SI-NEXT: v_mov_b32_e32 v40, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v35 +; SI-NEXT: v_mov_b32_e32 v35, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v6 @@ -136240,71 +138186,59 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: s_cbranch_execz .LBB80_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v27 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v26 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v3 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v63 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v23 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v60 ; SI-NEXT: v_add_f64 v[35:36], v[11:12], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v23 ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill 
; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v59 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v35 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v36 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 @@ -136315,17 +138249,19 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v32 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v4 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 @@ -136351,11 +138287,15 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 @@ -136367,41 +138307,43 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v5 -; SI-NEXT: s_waitcnt 
vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: .LBB80_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 @@ -136518,7 +138460,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -136527,7 +138469,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 @@ -136536,7 +138478,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 @@ -136545,7 +138487,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 @@ -136554,7 +138496,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 @@ -136563,7 +138505,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 @@ -136572,7 +138514,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 @@ -136583,7 +138525,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -136594,7 +138536,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -136605,7 +138547,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -136616,7 +138558,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -136627,7 +138569,7 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -136636,18 +138578,16 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 
; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 @@ -136656,25 +138596,25 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 @@ -136684,10 +138624,12 @@ define <64 x half> @bitcast_v16f64_to_v64f16(<16 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -136874,69 +138816,67 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: s_lshr_b32 s46, s5, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s46 ; SI-NEXT: s_lshr_b32 s46, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v55, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v51, s46 ; SI-NEXT: s_lshr_b32 s46, s7, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s46 ; SI-NEXT: s_lshr_b32 s46, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s46 ; SI-NEXT: s_lshr_b32 s46, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v9, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v42, s46 ; SI-NEXT: s_lshr_b32 s46, s8, 16 -; 
SI-NEXT: v_cvt_f32_f16_e32 v16, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v44, s46 ; SI-NEXT: s_lshr_b32 s46, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v23, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s46 ; SI-NEXT: s_lshr_b32 s46, s10, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v27, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v9, s46 ; SI-NEXT: s_lshr_b32 s46, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v31, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s46 ; SI-NEXT: s_lshr_b32 s46, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v38, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v23, s46 ; SI-NEXT: s_lshr_b32 s46, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v50, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v27, s46 ; SI-NEXT: s_lshr_b32 s46, s14, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v54, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v31, s46 ; SI-NEXT: s_lshr_b32 s46, s41, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v38, s46 ; SI-NEXT: s_lshr_b32 s46, s40, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v42, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v50, s46 ; SI-NEXT: s_lshr_b32 s46, s43, 16 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v32, s46 ; SI-NEXT: s_lshr_b32 s46, s42, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v22, s46 ; SI-NEXT: s_lshr_b32 s46, s45, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v30, s46 ; SI-NEXT: s_lshr_b32 s46, s44, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s46 ; SI-NEXT: s_lshr_b32 s46, s29, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v28, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s46 ; SI-NEXT: s_lshr_b32 s46, s28, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v37, s46 ; SI-NEXT: s_lshr_b32 s46, s27, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 -; SI-NEXT: v_cvt_f32_f16_e32 v24, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s25 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s46 ; SI-NEXT: s_lshr_b32 s46, s26, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s46 -; SI-NEXT: s_lshr_b32 s46, s25, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v13, s46 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v35, s46 ; SI-NEXT: s_lshr_b32 s46, s24, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v24, s46 ; SI-NEXT: s_lshr_b32 s46, s23, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s45 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s24 +; SI-NEXT: v_cvt_f32_f16_e32 v28, s46 ; SI-NEXT: s_lshr_b32 s46, s22, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v35, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s46 ; SI-NEXT: s_lshr_b32 s46, s21, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v46, s46 ; SI-NEXT: s_lshr_b32 s46, s20, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s46 +; SI-NEXT: v_cvt_f32_f16_e32 v56, s46 ; SI-NEXT: s_lshr_b32 s46, s19, 16 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s23 ; SI-NEXT: v_cvt_f32_f16_e32 v57, s46 ; SI-NEXT: s_lshr_b32 s46, s18, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v58, s46 @@ -136945,7 +138885,7 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: s_lshr_b32 s46, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v62, s46 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v56, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s9 @@ 
-136953,25 +138893,25 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v11, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v39, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v51, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v55, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v41, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s14 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s41 -; SI-NEXT: v_cvt_f32_f16_e32 v44, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v63, s43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v37, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v22, s28 -; SI-NEXT: v_cvt_f32_f16_e32 v49, s27 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s41 +; SI-NEXT: v_cvt_f32_f16_e32 v54, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s43 +; SI-NEXT: v_cvt_f32_f16_e32 v26, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s45 +; SI-NEXT: v_cvt_f32_f16_e32 v61, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v63, s29 +; SI-NEXT: v_cvt_f32_f16_e32 v20, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s27 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s26 -; SI-NEXT: v_cvt_f32_f16_e32 v53, s25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, s24 -; SI-NEXT: v_cvt_f32_f16_e32 v30, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s22 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v53, s21 ; SI-NEXT: v_cvt_f32_f16_e32 v47, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s19 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s18 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s18 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 @@ -136990,151 +138930,158 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v48 ; SI-NEXT: v_add_f64 v[36:37], s[28:29], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v49 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v36 ; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v36 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v13 +; 
SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: v_add_f64 v[4:5], s[4:5], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 ; SI-NEXT: v_add_f64 v[29:30], s[42:43], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v29 +; SI-NEXT: v_add_f64 v[6:7], s[6:7], 1.0 +; SI-NEXT: v_add_f64 v[21:22], s[14:15], 1.0 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v21 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v4, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v24 -; SI-NEXT: v_add_f64 v[10:11], s[8:9], 1.0 -; SI-NEXT: v_add_f64 v[25:26], s[40:41], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v17 -; SI-NEXT: v_add_f64 v[21:22], s[14:15], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v57 +; SI-NEXT: v_add_f64 v[46:47], s[20:21], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v47 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v63, v37 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 +; SI-NEXT: v_mov_b32_e32 v15, v12 +; SI-NEXT: v_mov_b32_e32 v12, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v56, v14 +; SI-NEXT: v_mov_b32_e32 v14, v5 +; SI-NEXT: v_mov_b32_e32 v5, v40 +; SI-NEXT: v_mov_b32_e32 v40, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f64 v[33:34], s[44:45], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v21 -; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 +; SI-NEXT: v_add_f64 v[25:26], s[40:41], 1.0 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v34 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v33 +; SI-NEXT: 
v_cvt_f32_f16_e32 v61, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v49 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v49, v1 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[1:2], s[18:19], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 +; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v25 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v7 +; SI-NEXT: v_mov_b32_e32 v7, v42 +; SI-NEXT: v_mov_b32_e32 v42, v20 +; SI-NEXT: v_mov_b32_e32 v20, v21 +; SI-NEXT: v_mov_b32_e32 v21, v26 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v48 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_cvt_f32_f16_e32 v35, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v34 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v52 ; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v1 ; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v19 ; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v7 -; SI-NEXT: v_mov_b32_e32 v7, v61 -; SI-NEXT: v_mov_b32_e32 v61, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v19 ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_add_f64 v[46:47], s[20:21], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v15 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v47 -; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v47 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v47, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v15 -; SI-NEXT: v_mov_b32_e32 v15, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v14 -; SI-NEXT: v_mov_b32_e32 v14, v12 -; SI-NEXT: v_mov_b32_e32 v12, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v4 ; SI-NEXT: v_mov_b32_e32 v18, v3 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v48, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v17 ; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v52, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: .LBB81_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -137151,19 +139098,19 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; 
SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 ; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -137171,188 +139118,188 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 ; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 ; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 ; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 ; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v29 ; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v33 ; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v20 ; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 ; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v61 ; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 ; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 ; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v59 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v41 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v55 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v19 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v11 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v5 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -137390,62 +139337,62 @@ define inreg <64 x half> @bitcast_v16f64_to_v64f16_scalar(<16 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: 
$vgpr17 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -140338,219 +142285,224 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB85_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v34, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v36, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v37, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v39, v6, v5, 16 -; 
SI-NEXT: v_alignbit_b32 v49, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v51, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v54, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v40, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v42, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v45, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v47, v28, v27, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v30, v29, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v32, v31, 16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[39:40], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[40:41], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[41:42], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[42:43], v[29:30], 16 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v32 +; SI-NEXT: v_lshr_b64 v[54:55], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[31:32], 16 ; SI-NEXT: s_cbranch_execnz .LBB85_3 ; SI-NEXT: .LBB85_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; 
SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_alignbit_b32 v33, v18, v17, 16 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v34, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v35, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v36, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v37, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v38, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v39, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v49, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v51, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v54, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v40, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v42, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v45, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v47, v28, v27, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v30, v29, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v60, v32, v31, 16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[33:34], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[13:14], 16 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_lshr_b64 v[36:37], v[11:12], 16 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_lshr_b64 v[38:39], v[7:8], 16 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_lshr_b64 v[39:40], v[23:24], 16 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_lshr_b64 v[40:41], v[25:26], 16 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; 
SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[31:32], v[31:32], 1.0 +; SI-NEXT: v_lshr_b64 v[41:42], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[42:43], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[43:44], v[31:32], 16 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v32 ; SI-NEXT: .LBB85_3: ; %end -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v43 ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v31, v31, v60 +; SI-NEXT: v_or_b32_e32 v31, v31, v50 ; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v31, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v57 ; SI-NEXT: v_or_b32_e32 v31, v31, v32 ; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v31, v32, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v42 ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 ; SI-NEXT: v_or_b32_e32 v29, v29, v31 ; SI-NEXT: v_add_i32_e32 v31, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v29, v31, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v29, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v56 ; SI-NEXT: v_or_b32_e32 v29, v29, v30 ; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v41 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 ; SI-NEXT: v_or_b32_e32 v27, v27, v29 ; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v27, v29, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v62 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v47 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v40 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v25, v25, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v46 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_add_i32_e32 v26, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v42 
+; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v39 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v23, v23, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v45 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 ; SI-NEXT: v_or_b32_e32 v21, v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v63 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v49 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v62 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -140562,7 +142514,7 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: buffer_store_dword 
v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -140572,9 +142524,11 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -140584,9 +142538,11 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -140596,9 +142552,11 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -140608,25 +142566,27 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 
0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -140647,39 +142607,43 @@ define inreg <64 x i16> @bitcast_v16f64_to_v64i16_scalar(<16 x double> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB85_4: -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; kill: killed $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; kill: killed $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_branch .LBB85_2 ; ; VI-LABEL: bitcast_v16f64_to_v64i16_scalar: @@ -141703,179 +143667,162 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v56, v10 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_mov_b32_e32 v57, v8 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], 
s32 offset:12 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v35, v8 +; SI-NEXT: v_mov_b32_e32 v38, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 ; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v19 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v40 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: 
v_lshlrev_b32_e32 v25, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v10 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v12 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v38 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v33 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v32 -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v25, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB87_4 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB87_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v7, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v7, v0, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v9, v0, v50 +; SI-NEXT: v_or_b32_e32 v9, v0, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v10, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 -; SI-NEXT: v_or_b32_e32 v12, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_or_b32_e32 v13, v0, v13 +; SI-NEXT: v_or_b32_e32 v10, v0, v47 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v11, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v12, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v13, v0, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v14 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_or_b32_e32 v14, v0, v48 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_or_b32_e32 v14, v0, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v16 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v18 -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_or_b32_e32 v16, v0, v37 +; SI-NEXT: v_or_b32_e32 v16, v0, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v20 ; SI-NEXT: v_or_b32_e32 v17, v0, v17 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, 
v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v18, v0, v35 +; SI-NEXT: v_or_b32_e32 v18, v0, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: v_or_b32_e32 v19, v0, v19 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_or_b32_e32 v20, v0, v33 +; SI-NEXT: v_or_b32_e32 v20, v0, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: v_or_b32_e32 v21, v0, v21 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v22, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v39, v23 +; SI-NEXT: v_or_b32_e32 v22, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v23, v0, v23 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 -; SI-NEXT: v_mov_b32_e32 v24, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v24, v0, v57 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: v_or_b32_e32 v25, v0, v25 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 -; SI-NEXT: v_mov_b32_e32 v26, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v26, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v27, v0, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v27, v0, v27 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_mov_b32_e32 v33, v28 ; SI-NEXT: v_or_b32_e32 v28, v0, v5 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_or_b32_e32 v29, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v29, v0, v29 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 @@ -141883,15 +143830,18 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_or_b32_e32 v30, v0, v3 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_or_b32_e32 v8, v1, v55 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: v_or_b32_e32 v31, v0, v34 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_or_b32_e32 v8, v1, v56 +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: v_or_b32_e32 v31, v0, v31 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -141899,14 
+143849,40 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: s_cbranch_execnz .LBB87_3 -; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB87_3 +; SI-NEXT: .LBB87_2: +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v42, v41 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v55, v61 +; SI-NEXT: v_mov_b32_e32 v38, v2 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v54 +; SI-NEXT: v_mov_b32_e32 v54, v14 +; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v62, v52 +; SI-NEXT: v_mov_b32_e32 v60, v59 +; SI-NEXT: v_mov_b32_e32 v49, v51 +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: .LBB87_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v58, v49 +; SI-NEXT: s_cbranch_vccnz .LBB87_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v32, v1 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v53 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -141947,143 +143923,143 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v38, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_or_b32_e32 v0, v39, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v50, v0 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v36, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: 
v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v49, v0 +; SI-NEXT: v_or_b32_e32 v0, v44, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; 
SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v39, v0 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v30, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -142092,7 +144068,7 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: v_add_i32_e32 v31, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: .LBB87_3: ; %end +; SI-NEXT: .LBB87_5: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -142111,35 +144087,6 @@ define inreg <16 x double> @bitcast_v64i16_to_v16f64_scalar(<64 x i16> inreg %a, ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB87_4: -; SI-NEXT: v_mov_b32_e32 v38, v61 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v63, v2 -; SI-NEXT: v_mov_b32_e32 v55, v4 -; SI-NEXT: v_mov_b32_e32 v53, v6 -; SI-NEXT: v_mov_b32_e32 v52, v57 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v61, v56 -; SI-NEXT: v_mov_b32_e32 v50, v43 -; SI-NEXT: v_mov_b32_e32 v36, v41 -; SI-NEXT: v_mov_b32_e32 v57, v40 -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v13 -; SI-NEXT: v_mov_b32_e32 v43, v48 -; SI-NEXT: v_mov_b32_e32 v48, v15 -; SI-NEXT: v_mov_b32_e32 v41, v14 -; SI-NEXT: v_mov_b32_e32 v56, v16 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v45, v44 -; SI-NEXT: v_mov_b32_e32 v59, v42 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v37, v20 -; SI-NEXT: v_mov_b32_e32 v39, v23 -; SI-NEXT: v_mov_b32_e32 v35, v24 -; SI-NEXT: v_mov_b32_e32 v33, v28 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: s_branch .LBB87_2 ; ; VI-LABEL: bitcast_v64i16_to_v16f64_scalar: ; VI: ; %bb.0: @@ -150763,22 +152710,22 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; VI-NEXT: 
buffer_load_ushort v4, off, s[0:3], s32 offset:8 @@ -150804,13 +152751,11 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 @@ -150819,49 +152764,46 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 @@ -150870,34 +152812,35 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:576 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -150907,131 +152850,155 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 
offset:68 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; 
VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB89_4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 
offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 
offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB89_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -151040,225 +153007,205 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: 
s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, v8 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v3, v8 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, v10 ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 
; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v38, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v3, v51, v3 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v32, v1 -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v34, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v37, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v42, v43 +; VI-NEXT: v_mov_b32_e32 v43, v37 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v51, v0 -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v47, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v54 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v61, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_or_b32_sdwa v1, v25, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v38, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v48, v1 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v28, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v40, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v51, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v41 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v52, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v63, v0 -; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v47, v1 -; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v63, v39 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v57, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v52, v60 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v53, v35 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -151287,14 +153234,54 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_cbranch_execnz .LBB89_3 -; VI-NEXT: .LBB89_2: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 -; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB89_3 +; VI-NEXT: .LBB89_2: +; VI-NEXT: v_mov_b32_e32 v47, v54 +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v58, v7 +; VI-NEXT: v_mov_b32_e32 v57, v5 +; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB89_3: ; %Flow +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB89_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -151313,351 +153300,356 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_lshl_b32 s9, s19, 8 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v29, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v30, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v28, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: 
v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62 -; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 -; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 -; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61 -; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v44, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 -; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: v_or_b32_sdwa v27, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v40, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 -; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: v_or_b32_sdwa v26, v61, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v34, v34, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 +; VI-NEXT: v_or_b32_sdwa v26, v26, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 -; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 +; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v21 +; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v24, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v32, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 -; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v23, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v61, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v61 +; VI-NEXT: v_or_b32_sdwa v23, v23, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v36, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 +; VI-NEXT: v_or_b32_sdwa v22, v22, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v63, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v38, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: v_or_b32_sdwa v21, v63, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57 -; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v39, v45, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v19, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v48, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v62, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v62 +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v51 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v50 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v49, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 +; VI-NEXT: v_or_b32_sdwa v15, v15, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, 
s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 -; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 +; VI-NEXT: v_or_b32_sdwa v14, v14, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v29, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v13, v59, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 -; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52 -; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54 -; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v52, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v52 +; VI-NEXT: v_or_b32_sdwa v13, v13, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v44 +; VI-NEXT: v_or_b32_sdwa v28, v28, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v21, vcc, 
0x3000000, v21 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59 -; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v54, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x300, v54 +; VI-NEXT: v_or_b32_sdwa v12, v12, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v50, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, 
s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v53, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v55, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v41 -; VI-NEXT: v_or_b32_sdwa v9, v9, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10 +; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v10 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 -; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53 -; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v49, v16, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v27, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 -; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 -; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v43, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v43 +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v11 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v41 +; VI-NEXT: v_or_b32_sdwa v17, v17, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v50, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v49 +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v30, v30, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: 
v_or_b32_sdwa v31, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v44, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v44, vcc, 0x300, v44 -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v7, v7, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 +; VI-NEXT: v_or_b32_sdwa v6, v6, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt 
vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v47, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v47, vcc, 0x300, v47 +; VI-NEXT: v_or_b32_sdwa v5, v5, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 ; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v47, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v47, s4, v47 +; VI-NEXT: v_add_u32_e32 v56, vcc, 3, v56 +; VI-NEXT: v_or_b32_sdwa v56, v57, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v56, s4, v56 ; VI-NEXT: s_and_b32 s4, s26, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s24, 0xff @@ -151670,35 +153662,26 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: s_or_b32 s8, s9, s8 ; VI-NEXT: s_and_b32 s9, s16, 0xff ; VI-NEXT: s_or_b32 s9, s10, s9 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v56 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_addk_i32 s7, 0x300 ; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v32, v16, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; 
VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v32 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v0 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s8, s8, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v47 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v56 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 -; VI-NEXT: .LBB89_3: ; %end +; VI-NEXT: .LBB89_5: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload @@ -151717,39 +153700,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB89_4: -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 -; VI-NEXT: v_mov_b32_e32 v45, v62 -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v57, v5 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v63, v3 -; VI-NEXT: v_mov_b32_e32 v53, v28 -; VI-NEXT: v_mov_b32_e32 v43, v27 -; VI-NEXT: v_mov_b32_e32 v55, v26 -; VI-NEXT: v_mov_b32_e32 v41, v24 -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_branch .LBB89_2 ; ; GFX9-LABEL: bitcast_v128i8_to_v64bf16_scalar: ; GFX9: ; %bb.0: @@ -151770,31 +153720,36 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v46, off, 
s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 @@ -151804,133 +153759,129 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v56 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v45 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v44 -; GFX9-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: 
v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 @@ -151938,148 +153889,149 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: 
v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:108 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:628 ; 4-byte 
Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte 
Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(36) -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(39) -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, 
off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB89_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -152087,19 +154039,12 @@ define 
inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v3, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -152121,272 +154066,291 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v57, v5 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v39, v16 -; GFX9-NEXT: v_or_b32_sdwa v17, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v46, v32 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v42, v61 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v55, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v17, v45, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v45, v59 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v53, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v52, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v50, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v49, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v48, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v55, v22 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v33, v45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v47, v32 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v51, v57 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v49, v39 +; GFX9-NEXT: v_mov_b32_e32 v59, v44 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v34, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_mov_b32_e32 v46, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v58, v50 +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v35, v45 -; GFX9-NEXT: v_mov_b32_e32 v45, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v42 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v54, v63 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v54, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; 
GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v57, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: 
s_branch .LBB89_3 ; GFX9-NEXT: .LBB89_2: -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v33, v45 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v58, v50 +; GFX9-NEXT: v_mov_b32_e32 v45, v59 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v49, v39 +; GFX9-NEXT: v_mov_b32_e32 v55, v22 +; GFX9-NEXT: v_mov_b32_e32 v51, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: v_mov_b32_e32 v46, v32 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: ; implicit-def: 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB89_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB89_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: s_lshl_b32 s5, s25, 8 ; GFX9-NEXT: s_add_i32 s26, s26, 3 @@ -152399,61 +154363,55 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_lshl_b32 s9, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_lshl_b32 s10, s19, 8 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496 ; 4-byte Folded 
Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v25, v37, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v37, v51, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v38, v38, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: 
v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 +; GFX9-NEXT: v_or_b32_sdwa v23, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_and_b32 s4, s24, 0xff ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_and_b32 s5, s26, 0xff @@ -152465,8 +154423,6 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: s_and_b32 s8, s16, 0xff ; GFX9-NEXT: s_or_b32 s8, s9, s8 ; GFX9-NEXT: s_and_b32 s9, s18, 0xff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_addk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s5, 0x300 @@ -152483,14 +154439,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -152498,9 +154454,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: 
v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -152510,254 +154466,277 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 ; GFX9-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v37, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v38, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v39, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v39, v36, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v48, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v51, v34, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v49, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, 
s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v58 +; GFX9-NEXT: v_or_b32_sdwa v19, v51, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v51, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v52, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 -; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 -; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 -; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v34 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v20, v35, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 +; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v55, v27, 
v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 ; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v47 -; GFX9-NEXT: v_or_b32_sdwa v23, v41, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v41, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v46 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v22, v36, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v22 ; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v52 -; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v42, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v24, v57, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 +; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 +; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; 
GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v33 -; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v45, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v23 ; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v25 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 ; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v38 ; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v50 ; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v39 @@ -152769,33 +154748,14 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 ; GFX9-NEXT: v_lshl_or_b32 v26, v37, 16, v26 ; GFX9-NEXT: v_lshl_or_b32 v27, v36, 
16, v27 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 -; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 -; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 -; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 ; GFX9-NEXT: .LBB89_5: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -158082,16 +160042,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v46, v15 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 -; GFX9-NEXT: ; implicit-def: $vgpr33 -; GFX9-NEXT: ; kill: killed $vgpr33 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 @@ -158124,50 +160083,69 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: v_mov_b32_e32 v46, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; 
GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; kill: killed $vgpr59 ; GFX9-NEXT: ; implicit-def: $vgpr59 ; GFX9-NEXT: ; kill: killed $vgpr58 ; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: v_mov_b32_e32 v47, v16 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr36 @@ -158180,21 +160158,21 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr51 ; GFX9-NEXT: ; implicit-def: $vgpr53 ; GFX9-NEXT: ; implicit-def: $vgpr48 -; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; implicit-def: $vgpr52 -; GFX9-NEXT: ; implicit-def: $vgpr16 +; GFX9-NEXT: ; implicit-def: $vgpr32 ; GFX9-NEXT: ; implicit-def: $vgpr42 ; GFX9-NEXT: ; implicit-def: $vgpr39 -; GFX9-NEXT: ; implicit-def: $vgpr45 +; GFX9-NEXT: ; kill: killed $vgpr0 +; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: ; implicit-def: $vgpr40 ; GFX9-NEXT: ; implicit-def: $vgpr54 -; GFX9-NEXT: ; implicit-def: $vgpr62 +; GFX9-NEXT: ; implicit-def: $vgpr43 ; GFX9-NEXT: ; implicit-def: $vgpr61 ; GFX9-NEXT: ; implicit-def: $vgpr60 ; GFX9-NEXT: ; implicit-def: $vgpr35 -; GFX9-NEXT: ; implicit-def: $vgpr44 +; GFX9-NEXT: ; implicit-def: $vgpr45 ; GFX9-NEXT: ; implicit-def: $vgpr34 -; GFX9-NEXT: ; implicit-def: $vgpr43 +; GFX9-NEXT: ; implicit-def: $vgpr47 ; GFX9-NEXT: ; kill: killed $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr33 ; GFX9-NEXT: ; implicit-def: $vgpr55 @@ -158204,11 +160182,15 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: ; implicit-def: $vgpr37 ; GFX9-NEXT: ; implicit-def: $vgpr57 ; GFX9-NEXT: ; implicit-def: $vgpr36 -; GFX9-NEXT: ; implicit-def: $vgpr63 +; GFX9-NEXT: ; implicit-def: $vgpr49 ; GFX9-NEXT: ; kill: killed $vgpr59 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:216 ; 4-byte Folded 
Spill +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -158218,6 +160200,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill @@ -158230,21 +160216,21 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: ; implicit-def: $vgpr58 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-NEXT: ; implicit-def: $vgpr58 ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 @@ -158261,555 +160247,551 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: ; implicit-def: $vgpr58 -; GFX9-NEXT: buffer_store_dword v58, off, 
s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 -; GFX9-NEXT: ; kill: killed $vgpr15 -; GFX9-NEXT: ; implicit-def: $vgpr15 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; GFX9-NEXT: ; implicit-def: $vgpr31 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB90_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v4 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v20 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v17 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v16 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v15 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v4 -; GFX9-NEXT: buffer_store_dword 
v33, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v32 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v32 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v47 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(38) -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v31 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v47 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v3 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v2 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v31 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v47 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v46 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v63 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v63 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v30 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v30 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v46 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v30 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v29 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v29 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: 
v_lshrrev_b32_e32 v15, 8, v14 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v28 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v13 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v28 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v27 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v12 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v13 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v26 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v26 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v26 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 
offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v10 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v12 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v25 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v11 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v25 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v10 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 16, v18 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v8 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v6 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v8 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v6 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v24 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[46:47] -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v5 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[15:16] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v9 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v5 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 
16, v20 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10] +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[13:14] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8] +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[11:12] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6] +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[9:10] +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[7:8] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4] +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[5:6] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[31:32] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[3:4] +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; 
GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[1:2] +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28] +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[62:63] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26] +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[29:30] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24] +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[27:28] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22] +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[25:26] ; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[23:24] +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[21:22] +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[19:20] -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b64 v[58:59], 24, v[17:18] -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v6 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v38, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v24 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v23 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v22 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v22 -; 
GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v21 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v20 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v20 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v19 -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v18 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v17 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-NEXT: .LBB90_2: ; %Flow -; GFX9-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: s_xor_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB90_4 ; GFX9-NEXT: ; %bb.3: ; %cmp.true -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v18 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v18 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 ; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v18 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v33, v16, v33, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v18, vcc +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_add3_u32 v32, v32, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_bfe_u32 v31, v18, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v47, v32, v33, vcc +; GFX9-NEXT: v_add3_u32 v31, v31, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; GFX9-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v56, v31, v32, vcc +; GFX9-NEXT: v_bfe_u32 v31, v18, 16, 1 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_add3_u32 v31, v31, v18, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v18 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v57, v31, v32, vcc +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 ; GFX9-NEXT: s_mov_b32 s7, 0x7060302 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v34, v15, v33, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v17 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, 
v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v17 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v18, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v33, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v20 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v20 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v31, vcc +; GFX9-NEXT: v_perm_b32 v13, v17, v57, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v20 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v31, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_perm_b32 v14, v56, v47, s7 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v19 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v18, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v19 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v19 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v17, v15, v20, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v22 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; 
GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v20, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v20, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v32, v13, v0, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v20, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_perm_b32 v31, v17, v0, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v22 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v22 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v21 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_mov_b32_e32 v59, v32 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v21 +; GFX9-NEXT: v_mov_b32_e32 v58, v31 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v14, v13, v0, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v13, v17, v0, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v24 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v24 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v23 +; GFX9-NEXT: buffer_store_dword v13, 
off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v22 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v18, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v21 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v21 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v61, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v17, v15, v61, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v24 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v14, v13, v0, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v13, v17, v0, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v26 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v26 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v25 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], 
s32 offset:448 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v24 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v59, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v23 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v23 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v58, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v26 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v26 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: v_perm_b32 v63, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v25 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v25 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v18, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v62, v15, v18, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v28 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v28 -; GFX9-NEXT: 
v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v60, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v27 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v28, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v27 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v26, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v56, v15, v26, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v30 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v30 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v27, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v29 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v30, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v29 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v25, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v33, v15, v25, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v29, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: s_waitcnt vmcnt(51) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v31 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v24, v16, v17, vcc -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: 
v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc -; GFX9-NEXT: v_perm_b32 v35, v15, v24, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v2 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v25 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v14, v13, v0, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: v_perm_b32 v13, v17, v0, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v28 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v28 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: v_cndmask_b32_e32 v0, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v27 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v28, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v27 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v25, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_perm_b32 v60, v17, v25, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v30 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v27, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v30, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 
v17, 0xffff0000, v29 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v24, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: v_perm_b32 v33, v17, v24, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v63 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v63 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v29, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: s_waitcnt vmcnt(52) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v62 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v44, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v62 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v23, v18, v19, vcc +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v17, v18, v19, vcc +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v35, v17, v23, s7 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX9-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v31, v16, v17, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v2, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2 +; GFX9-NEXT: v_add3_u32 v18, v18, v17, s6 +; GFX9-NEXT: v_or_b32_e32 v19, 0x400000, v17 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v43, v18, v19, vcc +; GFX9-NEXT: v_add3_u32 v17, v17, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v22, v15, v16, vcc -; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v21, v17, v18, vcc +; GFX9-NEXT: v_bfe_u32 v17, v2, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_add3_u32 v15, v15, v2, s6 -; GFX9-NEXT: 
v_or_b32_e32 v16, 0x400000, v2 +; GFX9-NEXT: v_add3_u32 v17, v17, v2, s6 +; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v2 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v23, v15, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v22, v17, v18, vcc ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v15, vcc -; GFX9-NEXT: v_perm_b32 v37, v1, v23, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v17, vcc +; GFX9-NEXT: v_perm_b32 v37, v1, v22, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v4 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v21, v2, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v20, v2, v17, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v18, v2, v4, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v3 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v20, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v19, v2, v4, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_perm_b32 v48, v1, v20, s7 +; GFX9-NEXT: v_perm_b32 v48, v1, v19, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v6 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -158818,8 +160800,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: v_cndmask_b32_e32 v18, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v17, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 @@ -158833,13 +160814,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v5 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-NEXT: v_perm_b32 v50, v1, v17, s7 +; GFX9-NEXT: 
v_perm_b32 v50, v1, v4, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v8 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -158862,13 +160843,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v7 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 ; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v7, vcc -; GFX9-NEXT: v_perm_b32 v52, v1, v4, s7 +; GFX9-NEXT: v_perm_b32 v52, v1, v3, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v10 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 @@ -158887,312 +160868,319 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc ; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v2, v2, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v31, vcc ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v15, vcc -; GFX9-NEXT: v_perm_b32 v39, v1, v3, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v31, vcc +; GFX9-NEXT: v_perm_b32 v39, v1, v2, s7 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v12 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1 ; GFX9-NEXT: v_add3_u32 v9, v9, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX9-NEXT: v_bfe_u32 v12, v1, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v31, vcc ; GFX9-NEXT: v_add3_u32 v12, v12, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX9-NEXT: v_or_b32_e32 v31, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v11 ; GFX9-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v15, vcc -; GFX9-NEXT: v_bfe_u32 v15, v1, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v31, vcc +; GFX9-NEXT: v_bfe_u32 v31, v1, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; GFX9-NEXT: v_add3_u32 v15, v15, v1, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v1 +; GFX9-NEXT: v_add3_u32 v31, v31, v1, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v1 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc -; GFX9-NEXT: v_bfe_u32 v15, v11, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v31, v32, vcc +; GFX9-NEXT: v_bfe_u32 v31, v11, 16, 1 +; GFX9-NEXT: v_add3_u32 v31, v31, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v15, 
v16, vcc -; GFX9-NEXT: v_perm_b32 v54, v11, v2, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v31, v32, vcc +; GFX9-NEXT: v_perm_b32 v54, v11, v1, s7 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v14 ; GFX9-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; GFX9-NEXT: v_bfe_u32 v15, v11, 16, 1 +; GFX9-NEXT: v_bfe_u32 v31, v11, 16, 1 ; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; GFX9-NEXT: v_add3_u32 v15, v15, v11, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v11 +; GFX9-NEXT: v_add3_u32 v31, v31, v11, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v11 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 ; GFX9-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v15, v16, vcc -; GFX9-NEXT: v_bfe_u32 v15, v14, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v14, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v11, v31, v32, vcc +; GFX9-NEXT: v_bfe_u32 v31, v14, 16, 1 +; GFX9-NEXT: v_add3_u32 v31, v31, v14, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v13 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX9-NEXT: v_cndmask_b32_e32 v14, v31, v32, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v13 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_bfe_u32 v15, v13, 16, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v16, v41, vcc -; GFX9-NEXT: v_add3_u32 v15, v15, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; GFX9-NEXT: v_add3_u32 v32, v32, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v41, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 +; GFX9-NEXT: v_perm_b32 v61, v28, v0, s7 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v32, v41, vcc +; GFX9-NEXT: v_add3_u32 v31, v31, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc -; GFX9-NEXT: v_perm_b32 v41, v13, v1, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v47 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc +; GFX9-NEXT: v_perm_b32 v41, v13, v0, s7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v16 ; GFX9-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX9-NEXT: v_bfe_u32 v15, v13, 16, 1 -; GFX9-NEXT: v_add3_u32 v15, v15, v13, s6 -; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v13 +; GFX9-NEXT: v_bfe_u32 v31, v13, 16, 1 +; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX9-NEXT: v_add3_u32 v31, v31, v13, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v15, v16, vcc -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v47 +; GFX9-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v31, v32, vcc +; GFX9-NEXT: v_bfe_u32 v31, v16, 16, 1 +; GFX9-NEXT: v_add3_u32 v31, v31, v16, s6 +; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v16 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v32, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX9-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX9-NEXT: v_bfe_u32 v26, v31, 
16, 1 ; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v16, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v16, v16, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v44, 0x400000, v15 +; GFX9-NEXT: v_add3_u32 v26, v26, v31, s6 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v31 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX9-NEXT: v_bfe_u32 v31, v15, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v26, v26, v45, vcc +; GFX9-NEXT: v_add3_u32 v31, v31, v15, s6 +; GFX9-NEXT: v_or_b32_e32 v45, 0x400000, v15 ; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v44, v16, v44, vcc -; GFX9-NEXT: v_perm_b32 v16, v44, v13, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v47, 16, v13 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v31, v45, vcc +; GFX9-NEXT: v_perm_b32 v31, v15, v26, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v16 +; GFX9-NEXT: v_perm_b32 v32, v16, v13, s7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v26 ; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v14 -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GFX9-NEXT: v_perm_b32 v53, v8, v5, s7 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v17 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v32 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v30 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v28 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27 -; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; GFX9-NEXT: v_perm_b32 v36, v32, v29, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; GFX9-NEXT: v_perm_b32 v38, v22, v31, s7 ; GFX9-NEXT: v_perm_b32 v42, v14, v11, s7 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v31 -; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v24 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v12 +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 
offset:88 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v10 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v8 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v21 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v44 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v30 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v28 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: v_perm_b32 v34, v30, v27, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v27 +; GFX9-NEXT: v_lshrrev_b32_e32 v27, 16, v25 +; GFX9-NEXT: v_perm_b32 v36, v44, v29, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v29 +; GFX9-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v20 -; GFX9-NEXT: v_perm_b32 v49, v19, v21, s7 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v46 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_bfe_u32 v45, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v45, v45, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v43, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v46 -; GFX9-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v43, v45, v43, vcc -; GFX9-NEXT: v_bfe_u32 v45, v15, 16, 1 -; GFX9-NEXT: v_add3_u32 v45, v45, v15, s6 -; GFX9-NEXT: v_or_b32_e32 v46, 0x400000, v15 -; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v45, v46, vcc -; GFX9-NEXT: v_perm_b32 v15, v15, v43, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 16, v44 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v46, 16, v43 -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[15:16] -; GFX9-NEXT: v_perm_b32 v51, v6, v18, s7 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; GFX9-NEXT: v_perm_b32 v38, v21, v43, s7 +; GFX9-NEXT: v_perm_b32 v49, v18, v20, s7 +; GFX9-NEXT: v_perm_b32 v53, v8, v5, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v5 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v20 +; GFX9-NEXT: v_perm_b32 v51, v6, v17, s7 ; GFX9-NEXT: v_perm_b32 v40, v10, v7, s7 -; GFX9-NEXT: v_perm_b32 v57, v28, v60, s7 ; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v7 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v21 -; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v60 
-; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v61 -; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v39 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v17 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v19 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v57 +; GFX9-NEXT: v_perm_b32 v55, v12, v9, s7 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v47 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v26, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v25, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v23, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 
4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v22, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v21, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v18, 16, v17 -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v19, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v56 +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[31:32] +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[41:42] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[41:42] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[54:55] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[54:55] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[39:40] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[39:40] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[52:53] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-NEXT: 
s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[52:53] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[50:51] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[50:51] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[48:49] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[48:49] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[37:38] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[37:38] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[35:36] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[35:36] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[33:34] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[33:34] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[60:61] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[56:57] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[62:63] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[58:59] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:108 ; 
4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v32 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[60:61] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[13:14] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[43:44] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[62:63] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b64 v[43:44], 24, v[43:44] -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 24, v16 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v16 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v42 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v42 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v41 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v41 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v55 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v55 -; GFX9-NEXT: 
buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v40 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v39 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v53 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[43:44] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b64 v[56:57], 24, v[58:59] +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v32 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v31 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v42 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v42 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v41 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v55 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v55 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v54 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 24, v40 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v39 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, 
v39 +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 24, v53 ; GFX9-NEXT: v_lshrrev_b32_e32 v39, 8, v53 -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 24, v51 +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 24, v51 ; GFX9-NEXT: v_lshrrev_b32_e32 v42, 8, v51 ; GFX9-NEXT: v_lshrrev_b32_e32 v53, 24, v38 ; GFX9-NEXT: v_lshrrev_b32_e32 v51, 8, v38 @@ -159200,439 +161188,449 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX9-NEXT: v_lshrrev_b32_e32 v41, 8, v37 ; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v36 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 8, v36 -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v35 ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v35 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v34 ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 8, v34 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v33 ; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v33 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v57 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v57 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v56 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v56 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v63 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v63 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v61 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v60 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v60 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v62 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 16, v50 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v62 -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 16, v52 -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v50 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v49 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v59 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 
offset:380 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v14 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v14 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v13 +; GFX9-NEXT: v_mov_b32_e32 v34, v62 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v52 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v50 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshrrev_b32_e32 v63, 8, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v61 -; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v61 -; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v60 -; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v60 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 -; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v52 +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 8, v13 +; GFX9-NEXT: v_mov_b32_e32 v35, v63 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 8, v52 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v32, 8, v50 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 24, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v52, 8, v49 ; GFX9-NEXT: v_lshrrev_b32_e32 v49, 16, v48 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v35 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v63, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v54, 8, v54 +; GFX9-NEXT: v_lshrrev_b32_e32 v40, 8, v40 ; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48 -; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58 -; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v58 -; GFX9-NEXT: 
s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v61 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v62 -; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v62 -; GFX9-NEXT: v_lshrrev_b32_e32 v44, 8, v61 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v60 -; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v60 -; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v59 -; GFX9-NEXT: v_lshrrev_b32_e32 v62, 8, v59 +; GFX9-NEXT: v_mov_b32_e32 v62, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v49, 8, v35 +; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v57, 8, v34 +; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v44 +; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v43 +; GFX9-NEXT: v_lshrrev_b32_e32 v35, 24, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v43, 8, v58 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v60 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v33, 24, v61 +; GFX9-NEXT: v_lshrrev_b32_e32 v47, 8, v61 +; GFX9-NEXT: v_lshrrev_b32_e32 v45, 8, v60 +; GFX9-NEXT: v_lshrrev_b32_e32 v60, 8, v59 +; GFX9-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; GFX9-NEXT: .LBB90_4: ; %end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v7, v7, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v39 -; GFX9-NEXT: v_or_b32_sdwa v8, v8, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v45 -; GFX9-NEXT: v_or_b32_sdwa v9, v9, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v40 -; GFX9-NEXT: v_or_b32_sdwa v10, v10, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v54 -; GFX9-NEXT: v_or_b32_sdwa v11, v11, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v9, 8, v40 +; GFX9-NEXT: v_or_b32_sdwa v9, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v10, 8, v54 +; GFX9-NEXT: v_or_b32_sdwa v10, v11, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v32 +; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v5, v5, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v32, 8, v42 +; GFX9-NEXT: v_or_b32_sdwa v7, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v31, 8, v39 +; GFX9-NEXT: v_or_b32_sdwa v6, v6, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v51, 8, v51 -; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v48 
-; GFX9-NEXT: v_or_b32_sdwa v5, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v42 ; GFX9-NEXT: v_or_b32_sdwa v2, v2, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v48 ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshlrev_b16_e32 v48, 8, v52 -; GFX9-NEXT: v_or_b32_sdwa v6, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v62 -; GFX9-NEXT: v_or_b32_sdwa v17, v17, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v60 -; GFX9-NEXT: v_lshlrev_b16_e32 v41, 8, v41 -; GFX9-NEXT: v_or_b32_sdwa v18, v18, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v12, v12, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b16_e32 v11, 8, v11 +; GFX9-NEXT: v_or_b32_sdwa v11, v12, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v12, 8, v12 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v16 -; GFX9-NEXT: v_or_b32_sdwa v16, v47, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v13, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v13, 8, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v14, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_lshlrev_b16_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v14, v15, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: 
v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v14, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v39, 8, v51 -; GFX9-NEXT: v_or_b32_sdwa v38, v38, v39 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v16, 8, v43 +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v17, 8, v60 +; GFX9-NEXT: v_or_b32_sdwa v17, v18, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b16_e32 v18, 8, v31 +; GFX9-NEXT: v_or_b32_sdwa v18, v38, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v53 -; GFX9-NEXT: v_lshlrev_b16_e32 v15, 8, v15 -; GFX9-NEXT: v_or_b32_sdwa v15, v46, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:8 +; GFX9-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:12 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:16 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:20 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword 
v1, v46, s[0:3], 0 offen offset:24 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen offset:28 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:32 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:36 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 
v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:40 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:44 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v12, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:48 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v13, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:52 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:56 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:60 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 
offen offset:64 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v35 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:64 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v35 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:68 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v44 -; GFX9-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v43 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v33 -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v45 +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:72 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v47 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v33 +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], 
s32 offset:120 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v55 -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:76 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v55 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v50 -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v37 -; GFX9-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:80 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v50 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v37 +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v58 -; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:84 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v57 +; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 
src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v63 -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:88 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v49 +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:92 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:96 +; GFX9-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:100 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v27, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:104 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:108 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:112 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:116 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:120 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; GFX9-NEXT: 
v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_store_dword v0, v46, s[0:3], 0 offen offset:124 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -162079,613 +164077,652 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-LABEL: bitcast_v64bf16_to_v128i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 
offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:72 +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_writelane_b32 v63, s36, 4 +; SI-NEXT: v_writelane_b32 v63, s37, 5 +; SI-NEXT: v_writelane_b32 v63, s38, 6 +; SI-NEXT: v_writelane_b32 v63, s39, 7 +; SI-NEXT: v_writelane_b32 v63, s48, 8 +; SI-NEXT: v_writelane_b32 v63, s49, 9 +; SI-NEXT: v_writelane_b32 v63, s50, 10 +; SI-NEXT: v_writelane_b32 
v63, s51, 11 +; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v9 +; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v14 +; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v19 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v24, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v30 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 +; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v18 +; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 +; SI-NEXT: v_writelane_b32 v63, s97, 
33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: v_mov_b32_e32 v46, v21 +; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v11 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v20, 1.0, v46 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v48 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v23 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v26, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v28, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v30, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v34 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v29, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v37 ; SI-NEXT: v_mul_f32_e32 v34, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v15, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v48 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v39 ; SI-NEXT: v_mul_f32_e32 v48, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v33, 1.0, v51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v54 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v50 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v55 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v40 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v42 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v53 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v43 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v54 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v44 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v55 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_mul_f32_e32 v41, 1.0, v41 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v43 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s29 -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword 
v59, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:456 ; 
4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB91_2 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mov_b32_e32 v43, v36 -; SI-NEXT: v_alignbit_b32 v36, v1, v2, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_alignbit_b32 v6, v1, v6, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_alignbit_b32 v2, v1, v13, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_alignbit_b32 v5, v1, v17, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_alignbit_b32 v4, v1, v3, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_alignbit_b32 v3, v1, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: v_alignbit_b32 v16, v1, v57, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v13, v1, v58, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_alignbit_b32 v10, v1, v60, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; SI-NEXT: v_alignbit_b32 v44, v19, v8, 16 -; SI-NEXT: v_alignbit_b32 v7, v1, v22, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24 -; SI-NEXT: v_alignbit_b32 v60, v1, v27, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; SI-NEXT: v_alignbit_b32 v57, v1, v30, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v35 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8 -; SI-NEXT: v_alignbit_b32 v58, v22, v9, 16 -; SI-NEXT: v_alignbit_b32 v40, v1, v37, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v45 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v49 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8 -; SI-NEXT: v_alignbit_b32 v47, v25, v12, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 8 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v38 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v46 -; SI-NEXT: v_alignbit_b32 v53, v1, v48, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50 -; SI-NEXT: v_alignbit_b32 v50, v8, v59, 16 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v52, v1, v52, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 -; SI-NEXT: v_mov_b32_e32 v17, v63 -; SI-NEXT: v_alignbit_b32 v1, v1, v41, 16 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v31 -; SI-NEXT: v_alignbit_b32 v62, v8, v61, 16 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v3, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v7, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v6, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v19, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v10, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v13, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB91_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: 
v_lshrrev_b32_e32 v44, 16, v3 +; SI-NEXT: v_mov_b32_e32 v42, v37 +; SI-NEXT: v_alignbit_b32 v37, v2, v11, 16 +; SI-NEXT: v_alignbit_b32 v11, v44, v4, 16 +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_writelane_b32 v62, s6, 0 +; SI-NEXT: v_alignbit_b32 v2, v2, v15, 16 +; SI-NEXT: v_writelane_b32 v62, s7, 1 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v56 -; SI-NEXT: v_alignbit_b32 v55, v8, v63, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v48, v62, v4, 8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 -; SI-NEXT: v_alignbit_b32 v38, v8, v45, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v38, v16, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v9 -; SI-NEXT: v_alignbit_b32 v35, v8, v18, 16 -; SI-NEXT: v_mov_b32_e32 v45, v8 -; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v29, v35, v13, 8 -; SI-NEXT: v_alignbit_b32 v61, v38, v16, 24 -; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16 -; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v59 -; SI-NEXT: v_alignbit_b32 v30, v8, v21, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v23 -; SI-NEXT: v_alignbit_b32 v27, v8, v24, 16 -; SI-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_alignbit_b32 v24, v8, v26, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_alignbit_b32 v21, v8, v14, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v34 -; SI-NEXT: v_alignbit_b32 v18, v8, v15, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v14, v52, v6, 16 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v40, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v19, v2, v8, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v40, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v8, v18, v40, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_readfirstlane_b32 s5, v19 +; SI-NEXT: v_alignbit_b32 v2, v2, v25, 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v56 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[18:19], s[4:5], 16 +; 
SI-NEXT: s_lshr_b64 s[22:23], s[4:5], 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: v_alignbit_b32 v47, v45, v47, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v28 -; SI-NEXT: v_alignbit_b32 v63, v8, v51, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_readfirstlane_b32 s5, v47 +; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v58 +; SI-NEXT: s_lshr_b64 s[20:21], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[24:25], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[4:5], 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: v_mov_b32_e32 v4, v58 +; SI-NEXT: v_alignbit_b32 v58, v8, v41, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v53, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v59 +; SI-NEXT: v_readfirstlane_b32 s5, v58 +; SI-NEXT: v_alignbit_b32 v2, v2, v61, 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v53, 16 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v33 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v2, v2, v60, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v23, v22 +; SI-NEXT: v_mov_b32_e32 v40, v36 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v56 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_alignbit_b32 v41, v15, v6, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v41 +; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 8 +; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v53, 8 -; SI-NEXT: v_alignbit_b32 v12, v40, v43, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_alignbit_b32 v59, v1, v13, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s5, v59 +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 +; SI-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[4:5], 8 +; SI-NEXT: v_alignbit_b32 v61, v1, v17, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v58 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v2, 
v2, v21, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_alignbit_b32 v2, v2, v12, 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_alignbit_b32 v60, v2, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_alignbit_b32 v1, v2, v46, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v60 +; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[4:5], 8 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v42 -; SI-NEXT: v_mov_b32_e32 v15, v9 -; SI-NEXT: v_alignbit_b32 v9, v8, v54, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v1, v5 +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v23 +; SI-NEXT: v_mov_b32_e32 v5, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v47 +; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v41 +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61 +; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v60 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v25, v2, v26, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 +; SI-NEXT: v_readfirstlane_b32 s5, v25 +; SI-NEXT: v_alignbit_b32 v2, v2, v16, 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: 
v_alignbit_b32 v22, v2, v30, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_readfirstlane_b32 s5, v22 +; SI-NEXT: v_alignbit_b32 v2, v2, v27, 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[4:5], 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v37 -; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v49 -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_alignbit_b32 v17, v2, v36, 16 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v37, 24, v46 -; SI-NEXT: v_lshrrev_b32_e32 v46, 24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v8 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_readfirstlane_b32 s5, v17 +; SI-NEXT: v_alignbit_b32 v2, v2, v34, 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[4:5], 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v34 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v38 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v28 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v15 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v29 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v29, v37 +; SI-NEXT: v_mov_b32_e32 v37, v42 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v33 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v59 -; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v20 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_lshrrev_b32_e32 v56, 8, v12 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v33 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v34, 24, v4 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 +; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v43 +; SI-NEXT: v_mov_b32_e32 v31, v20 +; SI-NEXT: v_mov_b32_e32 v20, v34 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_alignbit_b32 v30, v2, v36, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_alignbit_b32 v2, v2, v39, 16 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s5, v30 +; SI-NEXT: s_lshr_b64 s[54:55], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[4:5], 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v28, v36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_alignbit_b32 v57, v2, v39, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v20, 8, v24 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_readfirstlane_b32 s5, v57 +; SI-NEXT: v_alignbit_b32 v2, v2, v50, 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[70:71], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[82:83], s[4:5], 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v42 -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v39 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v46, v2, v38, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23 -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_readfirstlane_b32 s5, v46 +; SI-NEXT: v_alignbit_b32 v2, v2, v54, 16 +; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[84:85], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[96:97], s[4:5], 8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21 -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v57 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v38, v2, v53, 16 ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v9 -; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16 -; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58 -; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47 -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50 -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v49 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62 -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55 -; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46 +; SI-NEXT: v_readfirstlane_b32 s5, v38 +; SI-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, v28 -; SI-NEXT: v_mov_b32_e32 v23, v48 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[98:99], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 8 +; SI-NEXT: v_mov_b32_e32 v32, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v63 -; SI-NEXT: v_mov_b32_e32 v48, v33 -; SI-NEXT: v_mov_b32_e32 v34, v53 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v30 +; SI-NEXT: v_mov_b32_e32 v55, v49 +; SI-NEXT: v_mov_b32_e32 v49, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v8, v6 ; SI-NEXT: s_branch .LBB91_3 ; SI-NEXT: .LBB91_2: -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: 
$vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v55, v49 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: v_writelane_b32 v62, s4, 0 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: v_mov_b32_e32 v40, v36 +; SI-NEXT: ; implicit-def: 
$vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: s_mov_b64 vcc, -1 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: v_writelane_b32 v62, s5, 1 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; 
implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; kill: killed $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -162703,1130 +164740,975 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: v_mov_b32_e32 v53, v42 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mov_b32_e32 v48, v33 -; SI-NEXT: v_mov_b32_e32 v29, v28 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v17, v63 -; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; 
implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; kill: killed $vgpr56 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: .LBB91_3: ; %Flow +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v56, v17 -; SI-NEXT: v_mov_b32_e32 v54, v61 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v42, v32 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, vcc ; SI-NEXT: s_cbranch_vccnz .LBB91_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v1, 
0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_alignbit_b32 v7, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v8 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_alignbit_b32 v10, v6, v4, 16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s52, v10 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v9, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s86, v9 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v50 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v44 -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_alignbit_b32 v52, v3, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v12, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s80, v12 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_alignbit_b32 v13, v3, v2, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s66, v13 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v33 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v38, v4, v3, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v15 ; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_alignbit_b32 v51, v5, v4, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v5 -; 
SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v15, v7, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v42 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v46, v6, v3, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_alignbit_b32 v57, v7, v5, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v26 -; SI-NEXT: v_alignbit_b32 v9, v7, v6, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v32 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v23, v7, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v53 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v57, v6, v3, 16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11 -; SI-NEXT: v_alignbit_b32 v12, v8, v7, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v60, v10, v6, 16 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v16, v7, v6, 16 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v30, v6, v3, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v21 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; SI-NEXT: v_alignbit_b32 v7, v13, v7, 16 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v31 -; SI-NEXT: v_alignbit_b32 v63, v13, v10, 16 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v40 +; SI-NEXT: v_alignbit_b32 v18, v9, v7, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: 
v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_alignbit_b32 v20, v10, v9, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s38, v15 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s90, v16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s30, v23 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s76, v18 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s62, v20 +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 24, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s5, v38 +; SI-NEXT: v_readfirstlane_b32 s87, v46 +; SI-NEXT: v_readfirstlane_b32 s81, v57 +; SI-NEXT: v_readfirstlane_b32 s67, v30 +; SI-NEXT: s_lshr_b64 s[54:55], s[66:67], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[66:67], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[66:67], 8 +; SI-NEXT: s_lshr_b64 s[66:67], s[80:81], 24 +; SI-NEXT: s_lshr_b64 s[70:71], s[80:81], 16 +; SI-NEXT: s_lshr_b64 s[82:83], s[80:81], 8 +; SI-NEXT: s_lshr_b64 s[80:81], s[86:87], 24 +; SI-NEXT: s_lshr_b64 s[84:85], s[86:87], 16 +; SI-NEXT: s_lshr_b64 s[96:97], s[86:87], 8 +; SI-NEXT: s_lshr_b64 s[86:87], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[98:99], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 8 +; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v30 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v17, v7, v3, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s53, v17 +; SI-NEXT: s_lshr_b64 s[48:49], s[52:53], 24 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_alignbit_b32 v10, v14, v10, 16 -; SI-NEXT: 
v_lshrrev_b32_e32 v14, 16, v32 -; SI-NEXT: v_alignbit_b32 v18, v14, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_alignbit_b32 v21, v12, v10, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_readfirstlane_b32 s56, v21 +; SI-NEXT: s_lshr_b64 s[50:51], s[52:53], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 8 +; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v6 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_alignbit_b32 v22, v9, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v23, v13, v12, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v49, 0x40c00000, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v21, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s42, v23 +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v57 +; SI-NEXT: v_readfirstlane_b32 s39, v22 +; SI-NEXT: v_lshrrev_b32_e32 v9, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v46 +; SI-NEXT: s_lshr_b64 s[36:37], s[38:39], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v21 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_alignbit_b32 v13, v16, v13, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v16 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v24, v15, v13, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v25, v10, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s31, v25 +; SI-NEXT: v_readfirstlane_b32 s26, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_lshr_b64 s[94:95], s[30:31], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[30:31], 8 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v23 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_alignbit_b32 v26, v16, v15, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_readfirstlane_b32 s20, v26 +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v21 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; SI-NEXT: v_alignbit_b32 v16, v19, v16, 16 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 
offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v24, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v24 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v27, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v3, v22, v19, 16 -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v59 -; SI-NEXT: v_add_f32_e32 v54, 0x40c00000, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v54 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v44 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v30, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_alignbit_b32 v4, v25, v22, 16 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v40, 0x40c00000, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v40 -; SI-NEXT: v_alignbit_b32 v35, v45, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; SI-NEXT: v_alignbit_b32 v27, v18, v16, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v10 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; 
SI-NEXT: v_alignbit_b32 v60, v12, v3, 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s91, v60 +; SI-NEXT: v_readfirstlane_b32 s14, v27 +; SI-NEXT: v_lshrrev_b32_e32 v10, 24, v10 +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_lshr_b64 s[88:89], s[90:91], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[90:91], 8 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v29, v20, v18, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s8, v29 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v12 +; SI-NEXT: v_alignbit_b32 v61, v11, v3, 16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s77, v61 +; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[76:77], 8 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v18 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v59, v36, v3, 16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s63, v59 +; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8 +; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 +; SI-NEXT: v_lshrrev_b32_e32 v18, 8, v25 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_alignbit_b32 v5, v28, v25, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v28 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; 
SI-NEXT: v_lshrrev_b32_e32 v15, 16, v41 -; SI-NEXT: v_alignbit_b32 v38, v15, v14, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v56 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v20 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_alignbit_b32 v2, v33, v28, 16 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v3 +; SI-NEXT: v_alignbit_b32 v41, v49, v15, 16 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s57, v41 +; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v41 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v33 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v43 -; SI-NEXT: v_alignbit_b32 v55, v61, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_alignbit_b32 v6, v36, v33, 16 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 +; SI-NEXT: v_alignbit_b32 v58, v32, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s43, v58 +; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8 +; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v15 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v58 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 -; SI-NEXT: 
v_add_f32_e32 v46, 0x40c00000, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xffff0000, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v46 -; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; SI-NEXT: v_alignbit_b32 v62, v15, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_alignbit_b32 v36, v39, v36, 16 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v47, v45, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s27, v47 +; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v47 +; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v39 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v42 -; SI-NEXT: v_alignbit_b32 v50, v17, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v19, v11, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s21, v19 +; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v19 +; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v39 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v56 -; SI-NEXT: v_alignbit_b32 v47, v25, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v14, v52, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s15, v14 +; SI-NEXT: v_lshrrev_b32_e32 v7, 8, v14 +; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v39 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v39 -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v50 -; SI-NEXT: v_alignbit_b32 v58, v22, v14, 16 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:500 ; 4-byte Folded 
Reload -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v56 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v11, v44, v16, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v23, 8, v60 +; SI-NEXT: v_readfirstlane_b32 s9, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v11 +; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8 +; SI-NEXT: v_writelane_b32 v62, s6, 0 +; SI-NEXT: v_writelane_b32 v62, s7, 1 +; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 +; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24 +; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24 +; SI-NEXT: s_lshr_b64 s[76:77], s[90:91], 24 +; SI-NEXT: s_lshr_b64 s[90:91], s[30:31], 24 +; SI-NEXT: s_lshr_b64 s[30:31], s[38:39], 24 +; SI-NEXT: s_lshr_b64 s[38:39], s[38:39], 8 +; SI-NEXT: .LBB91_5: ; %end +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s6, 0xff +; SI-NEXT: v_readlane_b32 s6, v62, 0 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v42 -; SI-NEXT: v_lshrrev_b32_e32 v42, 8, v63 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xff, v11 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v16 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v46 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v55 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s5, s16, 8 +; SI-NEXT: s_lshl_b32 s6, s8, 24 +; SI-NEXT: v_add_i32_e32 v5, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: s_lshl_b32 s4, s4, 8 +; SI-NEXT: v_readlane_b32 s7, v62, 1 +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s51, v63, 
11 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s12, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v43 -; SI-NEXT: v_alignbit_b32 v43, v38, v16, 8 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v41 -; SI-NEXT: v_alignbit_b32 v41, v38, v16, 16 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s5, s22, 8 +; SI-NEXT: s_lshl_b32 s6, s14, 24 +; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s18, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v40 -; SI-NEXT: v_mov_b32_e32 v40, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v54 -; SI-NEXT: v_alignbit_b32 v54, v38, v16, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v1, 0xff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_lshl_b32 s5, s28, 8 +; SI-NEXT: s_lshl_b32 s6, s20, 24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v20 -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v20, v35, v13, 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v15, 8, v35 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v23 -; SI-NEXT: v_alignbit_b32 v23, v62, v4, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v49 -; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v47 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 
4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v31 -; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v62 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v58 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v26 -; SI-NEXT: v_alignbit_b32 v26, v24, v60, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_alignbit_b32 v44, v19, v14, 16 -; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v59 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v44, v36, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v58, v6, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v47, v2, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v50, v5, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v62, v4, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 16 -; SI-NEXT: buffer_store_dword v8, off, 
s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v55, v3, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v35, v13, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v35, v13, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v30, v10, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v27, v7, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v24, v60, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v21, v57, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v51, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v18, v51, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v8, v18, v51, 8 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 8, v44 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v21 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v34, 24 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v32, 8, v9 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v34, 16 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v63, v34, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v12, v52, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v9, v1, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v38 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v30 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v27 -; SI-NEXT: .LBB91_5: ; %end -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v28 -; SI-NEXT: v_or_b32_e32 v32, v36, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v36, 0xff, v29 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v29 -; SI-NEXT: v_or_b32_e32 v36, v56, v36 -; SI-NEXT: v_or_b32_e32 v32, v32, v36 -; SI-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v51 -; SI-NEXT: v_or_b32_e32 v32, v32, v36 -; SI-NEXT: v_and_b32_e32 v36, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_or_b32_e32 v14, v14, v36 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 -; SI-NEXT: v_or_b32_e32 v14, v32, v14 -; SI-NEXT: v_add_i32_e32 v32, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v6 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 
v32, 8, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v14, v14, v32 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xff, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v19 -; SI-NEXT: v_or_b32_e32 v32, v33, v32 -; SI-NEXT: v_or_b32_e32 v14, v14, v32 -; SI-NEXT: v_add_i32_e32 v32, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v14, v32, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v6 -; SI-NEXT: v_or_b32_e32 v14, v32, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v60 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v28, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v49 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v2 -; SI-NEXT: v_or_b32_e32 v14, v28, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v57 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v19 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 
-; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v25, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v39 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v2 -; SI-NEXT: v_or_b32_e32 v14, v25, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v47 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v23 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s5, s44, 8 +; SI-NEXT: s_lshl_b32 s6, s26, 24 +; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v22, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s40, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v31 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v22, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v11, v14, 
s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s5, s58, 8 +; SI-NEXT: s_lshl_b32 s6, s42, 24 +; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s46, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v46 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v61 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_lshl_b32 s6, s56, 24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v17 -; SI-NEXT: v_or_b32_e32 v14, v19, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v43 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v54 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 
v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v38 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v2 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v11, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s60, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v20 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: s_lshl_b32 s6, s62, 24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s74, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: 
s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_lshl_b32 s5, s92, 8 +; SI-NEXT: s_lshl_b32 s6, s76, 24 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v30 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v2 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v13, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s88, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; SI-NEXT: v_or_b32_e32 v7, v7, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: s_lshl_b32 s6, s90, 24 +; SI-NEXT: v_readlane_b32 s34, v63, 2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_or_b32_e32 v7, v7, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v27 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s94, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_lshl_b32 s5, s38, 8 +; SI-NEXT: s_lshl_b32 s6, s30, 24 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s30, v63, 0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_and_b32_e32 v6, 0xff, v24 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s36, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v22 +; SI-NEXT: s_lshl_b32 s5, s52, 8 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s6, s48, 24 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s36, v63, 4 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v21 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 
v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s50, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v17 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s5, s68, 8 +; SI-NEXT: s_lshl_b32 s6, s54, 24 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s50, v63, 10 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v52 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s64, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; 
SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_lshl_b32 s5, s82, 8 +; SI-NEXT: s_lshl_b32 s6, s66, 24 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s64, v63, 16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v42 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s70, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v57 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s5, s96, 8 +; SI-NEXT: s_lshl_b32 s6, s80, 24 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s70, v63, 22 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v12 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: s_and_b32 s5, s84, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v1, s5, v1 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s5, s86, 24 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s84, v63, 28 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 @@ -163835,19 +165717,30 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v9 -; SI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v35 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: s_and_b32 s4, s98, 0xff +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 @@ -163855,22 +165748,24 @@ define inreg <128 x i8> @bitcast_v64bf16_to_v128i8_scalar(<64 x bfloat> inreg %a ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, 
s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -168941,39 +170836,38 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v44, v19 -; SI-NEXT: v_mov_b32_e32 v43, v17 -; SI-NEXT: v_mov_b32_e32 v32, v14 -; SI-NEXT: v_mov_b32_e32 v14, v12 -; SI-NEXT: v_mov_b32_e32 v12, v10 -; SI-NEXT: v_mov_b32_e32 v41, v7 -; SI-NEXT: v_mov_b32_e32 v55, v5 -; SI-NEXT: v_mov_b32_e32 v54, v3 -; SI-NEXT: v_mov_b32_e32 v51, v1 -; SI-NEXT: v_mov_b32_e32 v10, v0 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:392 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 +; SI-NEXT: v_mov_b32_e32 v42, v29 +; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v44, v25 +; SI-NEXT: v_mov_b32_e32 v45, v23 +; SI-NEXT: v_mov_b32_e32 v46, v21 +; SI-NEXT: v_mov_b32_e32 v47, v19 +; SI-NEXT: v_mov_b32_e32 v56, v17 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v15 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v13 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v59, v11 +; SI-NEXT: v_mov_b32_e32 v55, v9 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:392 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 
+; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:140 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:148 @@ -168982,335 +170876,359 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:136 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:144 +; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, 
s[0:3], s32 offset:476 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v41, 8, v4 +; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v16 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v18 ; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 ; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_lshlrev_b32_e32 v49, 8, v49 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v50 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded 
Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:144 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v23 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v37 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v39 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v48 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:212 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:220 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:192 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:200 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:200 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: 
v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:208 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:244 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:252 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:224 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:224 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_lshlrev_b32_e32 v0, 8, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:260 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:272 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:256 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:272 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:292 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:316 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword 
v11, off, s[0:3], s32 offset:288 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v45, 8, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:324 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:340 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:340 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:348 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:320 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:372 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:372 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:380 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v2 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:360 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v4 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:368 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:384 
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:384 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:112 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:120 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:280 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:312 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 
offset:344 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:376 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:104 ; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 @@ -169319,118 +171237,129 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:64 ; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; 
implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; 
kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB92_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v51 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v2, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v53 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v41 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: v_or_b32_e32 v9, v2, v63 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v11, v2, v8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v55 +; SI-NEXT: v_or_b32_e32 v13, v2, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v59 +; SI-NEXT: v_or_b32_e32 v15, v2, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v17, v2, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v19, v2, v61 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v21, v2, v62 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v47 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v6, 0xff, v42 +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 ; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 @@ -169440,1780 +171369,1688 @@ define <64 x half> @bitcast_v128i8_to_v64f16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v39, 0xff, v50 -; SI-NEXT: v_and_b32_e32 v48, 0xff, v40 -; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 -; SI-NEXT: v_and_b32_e32 v52, 0xff, 
v52 -; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 -; SI-NEXT: v_or_b32_e32 v45, v46, v45 -; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 -; SI-NEXT: v_or_b32_e32 v56, v56, v61 -; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 -; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v3, v47, v3 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: v_and_b32_e32 v35, 0xff, v52 +; SI-NEXT: v_and_b32_e32 v36, 0xff, v40 +; SI-NEXT: v_and_b32_e32 v37, 0xff, v60 +; SI-NEXT: v_and_b32_e32 v39, 0xff, v31 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, v4, v49 +; SI-NEXT: v_and_b32_e32 v49, 0xff, v51 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v23, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v46 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v25, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v45 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v0 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v55 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v6 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v41 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v8 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v12 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v11 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v2, v14 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 -; SI-NEXT: v_mov_b32_e32 v8, v7 -; SI-NEXT: v_mov_b32_e32 v7, v19 -; SI-NEXT: v_or_b32_e32 v19, v2, v32 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 -; SI-NEXT: v_and_b32_e32 v35, 0xff, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v17, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v23, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v27, v2, v1 +; SI-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v56 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v31, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v21 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v31 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_or_b32_e32 v51, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v51 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v29 -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v43 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v2, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v5, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v32, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded 
Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v4, v4, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v33, v6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v33, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v6, v6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v8, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v8, v8, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v8 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v10, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v10, v10, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, 
off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v12, v12, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v12 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v14, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v14, v14, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 ; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v32, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v32, v32, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v18, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v18, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v18 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v22, v22, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v22 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v24, v24, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v24, v24, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v24 ; SI-NEXT: ; implicit-def: 
$vgpr24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v26, v26, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v26, v26, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v28, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v28, v28, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v30, v30, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v30, v30, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v34, v34, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v34, v34, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v16, v16, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v16, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v20, v20, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v20, v20, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v20 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v35, v35, v0 -; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v35, v35, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v36, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v36, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v36, v36, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v37, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v37, v37, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v37, v37, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v38, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v38, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v38, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v38, v38, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v39, v39, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v39, v39, v1 +; SI-NEXT: v_mov_b32_e32 v1, v48 +; SI-NEXT: v_and_b32_e32 v48, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v48, v48, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v51, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v7, v7, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword 
v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v48, v48, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v49, v49, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v51, v51, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v49, v49, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v52, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v50, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v50, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v50, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v52, v52, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v50, v50, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v50 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v9, v9, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v54, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v54, v54, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v54 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v52, v52, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], 
s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v2, v51 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v52 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v11, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v53, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v11, v11, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v53, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v53, v53, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v53, v53, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: v_and_b32_e32 v54, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v55, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v54, v54, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v39, v54 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v55, v55, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v55, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v55, v55, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v13, v13, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v41, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v40, v40, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v40 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v41, v41, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v41, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; 
SI-NEXT: v_and_b32_e32 v40, 0xff, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v41, v41, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v40, v40, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v48, v40 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: v_and_b32_e32 v42, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v42, v42, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v38, v42 +; SI-NEXT: v_or_b32_e32 v42, v42, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v42 ; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v43, 0xff, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v43, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v43, v43, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: v_or_b32_e32 v43, v43, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_or_b32_e32 v15, v15, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v36, v15 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: v_and_b32_e32 v44, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v44, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v44, v44, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v44, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v45, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v46, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v46, v46, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v46 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: v_or_b32_e32 v45, v45, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v58, 0xff, v21 -; SI-NEXT: 
buffer_load_dword v21, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v58, v58, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v58 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: v_and_b32_e32 v46, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v59, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v59, v59, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v59 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: v_or_b32_e32 v46, v46, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v60, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v60, v60, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v60 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: v_and_b32_e32 v47, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v61, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v25, v61, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v25 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: v_or_b32_e32 v47, v47, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v35, v47 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v61, 0xff, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v61, v61, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v61 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v56, v56, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v57, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v57, v57, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:564 
; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v44 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: 
; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; kill: killed $vgpr0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v13 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: .LBB92_2: ; %Flow -; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: s_cbranch_execz .LBB92_4 -; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v47 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v5, v3, v2 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v57 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_mov_b32_e32 v17, v43 -; SI-NEXT: v_mov_b32_e32 v19, v44 -; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 -; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v15 -; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v51 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_and_b32_e32 v58, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v23, v63, v2 -; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v54 -; SI-NEXT: v_and_b32_e32 v63, 0xff, v63 +; SI-NEXT: v_or_b32_e32 v58, v58, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v25, v25, v2 +; SI-NEXT: v_and_b32_e32 v59, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v31, v62, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v56 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v33, v61, v2 -; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v17 -; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 -; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v41 -; SI-NEXT: v_and_b32_e32 v61, 0xff, v61 -; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v55 -; SI-NEXT: v_and_b32_e32 v62, 0xff, v62 +; SI-NEXT: v_or_b32_e32 v59, v59, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v35, v60, v2 -; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v9 -; SI-NEXT: v_and_b32_e32 v60, 0xff, v60 -; SI-NEXT: v_or_b32_e32 v12, v12, v60 -; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v25 +; SI-NEXT: v_and_b32_e32 v60, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v37, v59, v2 -; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v11 -; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 -; SI-NEXT: v_or_b32_e32 v14, v14, v59 -; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v31 +; SI-NEXT: v_or_b32_e32 v60, v60, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v38, v58, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v46 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v39, v45, v2 -; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v21 -; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 -; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v13 -; SI-NEXT: v_and_b32_e32 v58, 0xff, v58 -; SI-NEXT: v_or_b32_e32 v32, v32, v58 -; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v33 -; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v32 +; SI-NEXT: v_and_b32_e32 v61, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v61, v61, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v48, v0, v2 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v62, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v62, v62, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v31, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v31, v31, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v42 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v63, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v63, v63, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v6, v3 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v55 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v43 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; 
kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; 
implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v9 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v11 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v13 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v17 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v19 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v5 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v56 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; 
implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v61 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v46 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v59 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: .LBB92_2: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB92_4 +; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v40 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_add_i32_e32 v29, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v29 +; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v34, vcc, 3, v34 +; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v30, 0xff, v30 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 +; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v26, 0xff, v26 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 +; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: v_mov_b32_e32 v7, v49 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v42 +; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 +; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v43 +; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 +; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v44 +; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 +; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v45 +; SI-NEXT: v_and_b32_e32 v45, 0xff, v45 +; SI-NEXT: v_add_i32_e32 v56, vcc, 3, v56 +; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 +; SI-NEXT: v_or_b32_e32 v56, v62, v56 +; SI-NEXT: v_add_i32_e32 v62, vcc, 3, v5 +; SI-NEXT: v_add_i32_e32 v46, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v62, 0xff, v62 +; SI-NEXT: v_and_b32_e32 v46, 0xff, v46 +; SI-NEXT: v_or_b32_e32 v62, v63, v62 +; SI-NEXT: v_add_i32_e32 v63, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 +; 
SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_add_i32_e32 v57, vcc, 3, v57 +; SI-NEXT: v_and_b32_e32 v57, 0xff, v57 +; SI-NEXT: v_or_b32_e32 v57, v61, v57 +; SI-NEXT: v_add_i32_e32 v61, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v58, vcc, 3, v58 +; SI-NEXT: v_and_b32_e32 v58, 0xff, v58 +; SI-NEXT: v_or_b32_e32 v58, v14, v58 +; SI-NEXT: v_add_i32_e32 v59, vcc, 3, v59 +; SI-NEXT: v_and_b32_e32 v59, 0xff, v59 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v11 +; SI-NEXT: v_or_b32_e32 v59, v12, v59 +; SI-NEXT: v_and_b32_e32 v61, 0xff, v61 +; SI-NEXT: v_or_b32_e32 v61, v8, v61 +; SI-NEXT: v_and_b32_e32 v63, 0xff, v63 +; SI-NEXT: v_add_i32_e32 v47, vcc, 3, v47 +; SI-NEXT: v_and_b32_e32 v47, 0xff, v47 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v62 +; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v61 +; SI-NEXT: v_add_i32_e32 v59, vcc, s6, v59 +; SI-NEXT: v_add_i32_e32 v58, vcc, s6, v58 +; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v57 +; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v56 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v15, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v17, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v17 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v19, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v21, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v23, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v25, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v25 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v32, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v32 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v33, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v35, v6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v35 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v52 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; 
SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v49 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v40 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v53 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v50 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v48 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v23 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v0, v2 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v51 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v50 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; 
SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v27, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v60, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v15 +; SI-NEXT: v_and_b32_e32 v60, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v60, v10, v60 +; SI-NEXT: v_add_i32_e32 v60, vcc, s6, v60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v6, v41 +; SI-NEXT: v_or_b32_e32 v63, v6, v63 +; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v63 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v29 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v20 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v16, v0, v16 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v34 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v30 -; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v36, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v36, 0xff, v36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v36 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v2, v2, v36 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v36, v7, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v42, v4, v42 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v49, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v49, 0xff, v49 +; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v42 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v43, v4, v43 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v43 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v41, v2, v49 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v21 +; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v41 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v44, v4, v44 +; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v50, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v50, 0xff, v50 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v51, v9, v50 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v19 +; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v51 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v52, vcc, 3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v52, 0xff, v52 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v53, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v45, v4, v45 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v45 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v52, v2, v52 +; SI-NEXT: v_mov_b32_e32 v2, v53 +; SI-NEXT: 
v_add_i32_e32 v53, vcc, 3, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v53, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v7, v7, v53 -; SI-NEXT: v_add_i32_e32 v54, vcc, s6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v40, 0xff, v40 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v46, v4, v46 +; SI-NEXT: v_or_b32_e32 v13, v2, v3 +; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v46 +; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v52 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v63 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v42, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v42, 0xff, v42 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v43, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v43, 0xff, v43 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v62 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v44, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v44, 0xff, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v61 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v45, vcc, 3, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte 
Folded Reload -; SI-NEXT: v_and_b32_e32 v45, 0xff, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v60 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v45, vcc, s6, v48 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v59 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v46, vcc, s6, v39 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v58 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v38 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v57 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v56, vcc, s6, v37 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v0, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v56 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_or_b32_e32 v53, v9, v53 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v53 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_or_b32_e32 v47, v4, v47 +; SI-NEXT: v_add_i32_e32 v47, vcc, s6, v47 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v8, v61 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v47 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v6, v62 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v46 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v57, vcc, s6, v35 -; SI-NEXT: v_add_i32_e32 v61, vcc, s6, v23 -; SI-NEXT: v_add_i32_e32 v62, vcc, s6, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v63, v0, v63 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; 
SI-NEXT: v_or_b32_e32 v0, v0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v45 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v44 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v43 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v42 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v36 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v40, vcc, 3, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v40, 0xff, v40 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v40, v9, v40 +; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v40 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v13, v51 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v13, v41 +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v41, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v55, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v51, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v31, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v44, vcc, s6, v0 -; 
SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v43, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v42, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v37 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v40, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v53, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v52, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v50, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v25, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v49, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v48, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v39, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v22, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v38, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v37, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v20, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v36, vcc, s6, v0 -; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v35, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v34, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v33, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v30, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v29, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v28, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v26, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v24, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v18, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v1 +; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v0 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, s6, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v2 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v6 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v8 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v16 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v1, vcc, s6, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v28 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v20 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v24 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v39 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v33 +; SI-NEXT: v_mov_b32_e32 v37, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v27 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v55 +; SI-NEXT: v_mov_b32_e32 v39, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v28 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v56 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v30 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v46 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v48 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v59 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v44 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v57 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v15 +; SI-NEXT: v_cvt_f32_f16_e32 
v15, v38 +; SI-NEXT: v_mov_b32_e32 v38, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v42 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v47 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v54 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v53 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v49 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_mov_b32_e32 v35, v2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v50 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v52 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v40 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v43 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, v45 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; 
SI-NEXT: v_add_i32_e32 v15, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v31 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v13 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v60 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v58 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v61 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v51 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v32, vcc, s6, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v62 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v12 -; 
SI-NEXT: v_add_i32_e32 v12, vcc, s6, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v14 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v32 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_add_i32_e32 v63, vcc, s6, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 ; SI-NEXT: .LBB92_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v10 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v10, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v10 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v10 -; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: 
v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, 
s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v10 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v10 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v10 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v10 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v10 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v10 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; 
SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v17 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x7c, v10 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload @@ -177051,22 +178888,22 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 @@ -177092,13 +178929,11 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 @@ -177107,49 +178942,46 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 
4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 @@ -177158,34 +178990,35 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -177195,131 +179028,155 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 -; VI-NEXT: s_waitcnt 
vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB93_4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: 
buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 
offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB93_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:844 ; 4-byte 
Folded Spill +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -177328,225 +179185,205 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, v8 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v3, v8 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, v10 ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v38, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v32, v1 -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v34, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v37, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v42, v43 +; VI-NEXT: v_mov_b32_e32 v43, v37 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v51, v0 -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v47, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v54 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v47, v22 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v61, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_or_b32_sdwa v1, v25, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v38, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v48, v1 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v28, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v40, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v51, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v41 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v52, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v63, v0 -; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v47, v1 -; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:840 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v63, v39 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v57, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v52, v60 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v53, v35 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -177575,14 +179412,54 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_cbranch_execnz .LBB93_3 -; VI-NEXT: .LBB93_2: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 -; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB93_3 +; VI-NEXT: .LBB93_2: +; VI-NEXT: v_mov_b32_e32 v47, v54 +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v58, v7 +; VI-NEXT: v_mov_b32_e32 v57, v5 +; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB93_3: ; %Flow +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB93_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 
4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -177601,351 +179478,356 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_lshl_b32 s9, s19, 8 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v29, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v30, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v28, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62 -; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 -; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 -; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61 -; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v44, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: 
v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 -; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: v_or_b32_sdwa v27, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v40, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 -; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: v_or_b32_sdwa v26, v61, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v34, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 +; VI-NEXT: v_or_b32_sdwa v26, v26, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 -; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 +; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v21 +; VI-NEXT: v_or_b32_sdwa v25, v25, v33 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v24, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v32, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 -; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v23, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v61, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v61 +; VI-NEXT: v_or_b32_sdwa v23, v23, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: 
v_or_b32_sdwa v22, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v36, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 +; VI-NEXT: v_or_b32_sdwa v22, v22, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v63, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v38, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: v_or_b32_sdwa v21, v63, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57 -; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v39, v45, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v19, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v48, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v62, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v62 +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v51 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v50 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v49, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 +; VI-NEXT: v_or_b32_sdwa v15, v15, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 -; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 +; VI-NEXT: v_or_b32_sdwa v14, v14, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v29, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v13, v59, v3 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 -; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52 -; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54 -; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v52, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v52 +; VI-NEXT: v_or_b32_sdwa v13, v13, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v44 +; VI-NEXT: v_or_b32_sdwa v28, v28, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59 -; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v54, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x300, v54 +; VI-NEXT: v_or_b32_sdwa v12, v12, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v50, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v53, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v55, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v41 -; VI-NEXT: v_or_b32_sdwa v9, v9, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10 +; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v10 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 -; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53 -; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v49, v16, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v27, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: 
v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 -; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 -; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v43, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v43 +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v11 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v41 +; VI-NEXT: v_or_b32_sdwa v17, v17, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v50, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v49 +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v30, v30, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v44, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v44, vcc, 0x300, v44 -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v7, v7, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 +; VI-NEXT: v_or_b32_sdwa v6, v6, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v47, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v47, vcc, 0x300, v47 +; VI-NEXT: v_or_b32_sdwa v5, v5, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt 
vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 ; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v47, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v47, s4, v47 +; VI-NEXT: v_add_u32_e32 v56, vcc, 3, v56 +; VI-NEXT: v_or_b32_sdwa v56, v57, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v56, s4, v56 ; VI-NEXT: s_and_b32 s4, s26, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s24, 0xff @@ -177958,35 +179840,26 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: s_or_b32 s8, s9, s8 ; VI-NEXT: s_and_b32 s9, s16, 0xff ; VI-NEXT: s_or_b32 s9, s10, s9 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v56 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_addk_i32 s7, 0x300 ; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v32, v16, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v32 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v0 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s8, s8, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v47 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v56 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 -; VI-NEXT: .LBB93_3: ; %end +; VI-NEXT: .LBB93_5: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload @@ -178005,39 +179878,6 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: 
.LBB93_4: -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 -; VI-NEXT: v_mov_b32_e32 v45, v62 -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v57, v5 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v63, v3 -; VI-NEXT: v_mov_b32_e32 v53, v28 -; VI-NEXT: v_mov_b32_e32 v43, v27 -; VI-NEXT: v_mov_b32_e32 v55, v26 -; VI-NEXT: v_mov_b32_e32 v41, v24 -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_branch .LBB93_2 ; ; GFX9-LABEL: bitcast_v128i8_to_v64f16_scalar: ; GFX9: ; %bb.0: @@ -178058,31 +179898,36 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded 
Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 @@ -178092,133 +179937,129 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; 
GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v56 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v45 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v44 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt 
vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 
s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v11, 
off, s[0:3], s32 offset:248 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 @@ -178226,148 +180067,149 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; 
GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:108 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, 
s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: 
buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(28) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(36) -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(39) -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; 
GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword 
v52, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB93_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -178375,19 +180217,12 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v3, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; 
GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -178409,272 +180244,291 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v57, v5 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v39, v16 -; GFX9-NEXT: v_or_b32_sdwa v17, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v46, v32 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v42, v61 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v55, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v17, v45, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v45, v59 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v53, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v52, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v50, 
v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v49, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v48, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v55, v22 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v33, v45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v47, v32 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v51, v57 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 
4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v49, v39 +; GFX9-NEXT: v_mov_b32_e32 v59, v44 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v34, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_mov_b32_e32 v46, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v58, v50 +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v35, v45 -; GFX9-NEXT: v_mov_b32_e32 v45, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v42 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v54, v63 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; 
GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v54, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v37, 
v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v57, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB93_3 ; GFX9-NEXT: .LBB93_2: -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v33, v45 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword 
v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v58, v50 +; GFX9-NEXT: v_mov_b32_e32 v45, v59 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v49, v39 +; GFX9-NEXT: v_mov_b32_e32 v55, v22 +; GFX9-NEXT: v_mov_b32_e32 v51, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: v_mov_b32_e32 v46, v32 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB93_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB93_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: s_lshl_b32 s5, s25, 8 ; GFX9-NEXT: s_add_i32 s26, s26, 3 @@ -178687,61 +180541,55 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_lshl_b32 s9, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_lshl_b32 s10, s19, 8 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: 
v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v25, v37, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v37, v51, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v38, v38, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 +; GFX9-NEXT: v_or_b32_sdwa v23, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_and_b32 s4, s24, 0xff ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_and_b32 s5, s26, 0xff @@ -178753,8 +180601,6 @@ define inreg 
<64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: s_and_b32 s8, s16, 0xff ; GFX9-NEXT: s_or_b32 s8, s9, s8 ; GFX9-NEXT: s_and_b32 s9, s18, 0xff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_addk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s5, 0x300 @@ -178771,14 +180617,14 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -178786,9 +180632,9 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -178798,254 +180644,277 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, 
off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; 
GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v37, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v38, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v39, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v39, v36, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; 
GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v48, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v51, v34, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, 
s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v49, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v58 +; GFX9-NEXT: v_or_b32_sdwa v19, v51, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v51, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v52, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 -; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 -; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 -; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v34 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v20, v35, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 +; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 ; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v47 -; GFX9-NEXT: v_or_b32_sdwa v23, v41, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: 
v_or_b32_sdwa v41, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v46 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v22, v36, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v22 ; GFX9-NEXT: v_add_u32_e32 v22, 0x300, v52 -; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v42, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v24, v57, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 +; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 +; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v33 -; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v45, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v27, 0x300, v23 ; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v25 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 ; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v38 ; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v50 ; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v39 @@ -179057,33 +180926,14 @@ define inreg <64 x half> @bitcast_v128i8_to_v64f16_scalar(<128 x i8> inreg %a, i ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 ; GFX9-NEXT: v_lshl_or_b32 v26, v37, 16, v26 ; GFX9-NEXT: v_lshl_or_b32 v27, v36, 16, v27 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 -; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 -; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 -; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 ; GFX9-NEXT: .LBB93_5: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -182779,1037 +184629,1093 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 
offset:4 -; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v15 -; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v12 -; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v11 -; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v10 -; VI-NEXT: v_lshrrev_b32_e32 v37, 16, v8 -; VI-NEXT: v_lshrrev_b32_e32 v47, 16, v6 -; VI-NEXT: v_lshrrev_b32_e32 v44, 16, v4 -; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v2 -; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v28 -; VI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 -; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v21 -; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 -; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v17 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr52 -; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: v_mov_b32_e32 v44, v12 +; VI-NEXT: v_mov_b32_e32 v12, v0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v43, v11 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; VI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 +; VI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v4 +; VI-NEXT: v_mov_b32_e32 v32, v20 +; VI-NEXT: v_mov_b32_e32 v55, v22 +; VI-NEXT: v_mov_b32_e32 v54, v21 +; VI-NEXT: v_mov_b32_e32 v31, v19 +; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v44 +; VI-NEXT: ; implicit-def: $vgpr20 +; VI-NEXT: ; implicit-def: $vgpr57 +; VI-NEXT: ; implicit-def: $vgpr51 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr56 +; VI-NEXT: ; implicit-def: $vgpr63 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr21 +; VI-NEXT: ; implicit-def: $vgpr47 +; VI-NEXT: ; implicit-def: 
$vgpr22 +; VI-NEXT: ; implicit-def: $vgpr46 ; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: s_waitcnt 
vmcnt(12) -; VI-NEXT: v_lshrrev_b32_e32 v36, 16, v61 -; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v60 -; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr52 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v14, 16, v43 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; VI-NEXT: ; implicit-def: $vgpr8 +; VI-NEXT: ; implicit-def: $vgpr15 +; VI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; VI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 +; VI-NEXT: v_lshrrev_b32_e32 v39, 16, v7 +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr13 +; VI-NEXT: ; implicit-def: $vgpr9 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr5 +; VI-NEXT: ; implicit-def: $vgpr4 +; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr30 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v29 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr29 +; VI-NEXT: ; implicit-def: $vgpr28 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: ; implicit-def: $vgpr27 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 +; VI-NEXT: ; implicit-def: $vgpr25 +; VI-NEXT: ; implicit-def: $vgpr24 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v55 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr23 +; VI-NEXT: ; implicit-def: $vgpr34 +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; 
VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: ; kill: killed $vgpr0 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr10 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr50 -; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr33 +; VI-NEXT: ; implicit-def: $vgpr61 ; VI-NEXT: ; implicit-def: $vgpr49 ; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr32 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr37 +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB94_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 
offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v14 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v4 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v4 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v3 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v2 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v2 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v1 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v61 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v61 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v60 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v30 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v30 -; VI-NEXT: 
buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v29 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v28 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v27 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v26 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v26 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v25 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v24 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v24 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v23 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v22 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v22 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v21 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v20 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v20 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v19 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v18 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v17 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[15:16] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[13:14] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[11:12] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[9:10] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[7:8] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v31, v33 -; VI-NEXT: v_mov_b32_e32 v33, v43 -; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[5:6] -; VI-NEXT: v_mov_b32_e32 v43, v33 -; VI-NEXT: v_mov_b32_e32 v33, v46 -; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[3:4] -; VI-NEXT: v_mov_b32_e32 v46, v33 -; VI-NEXT: v_mov_b32_e32 v33, v53 -; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[1:2] -; VI-NEXT: v_mov_b32_e32 v53, v33 -; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[60:61] -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[32:33], 24, v[29:30] -; VI-NEXT: v_lshrrev_b64 v[34:35], 24, v[27:28] -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v33, v36 -; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[25:26] -; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24] -; VI-NEXT: v_mov_b32_e32 v36, v33 -; VI-NEXT: v_mov_b32_e32 v33, v41 -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[21:22] -; VI-NEXT: v_mov_b32_e32 v34, v51 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v56, v38 +; VI-NEXT: v_mov_b32_e32 v45, v7 +; VI-NEXT: v_mov_b32_e32 v63, v53 +; VI-NEXT: v_mov_b32_e32 v15, v3 +; VI-NEXT: v_mov_b32_e32 v28, v48 +; VI-NEXT: v_mov_b32_e32 v48, v16 +; VI-NEXT: v_mov_b32_e32 v16, v40 +; VI-NEXT: v_mov_b32_e32 v47, v39 +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v32 +; VI-NEXT: v_lshrrev_b32_e32 v29, 24, v44 +; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v32 +; VI-NEXT: v_lshrrev_b32_e32 v13, 24, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v1 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v0 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v38 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v37 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v44 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v43 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword 
v11, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v62, v36 +; VI-NEXT: v_lshrrev_b32_e32 v41, 24, v38 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v11 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v10 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v8, 24, v11 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v52 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshrrev_b32_e32 v57, 24, v53 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v3 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v53 +; VI-NEXT: v_lshrrev_b32_e32 v19, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v59 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v59 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v58 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v26 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshrrev_b32_e32 v14, 24, v27 +; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v27 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v33 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v34 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v42, 24, v34 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v35 +; 
VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v36 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v31 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v18 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v61, 8, v17 +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[37:38] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[43:44] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[10:11] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[2:3] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[26:27] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[33:34] +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v36 +; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[35:36] +; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[52:53] +; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[58:59] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; VI-NEXT: v_mov_b32_e32 v53, v63 +; VI-NEXT: v_mov_b32_e32 v27, v19 +; VI-NEXT: v_mov_b32_e32 v34, v14 +; VI-NEXT: v_lshrrev_b32_e32 v9, 24, v55 +; VI-NEXT: v_mov_b32_e32 v7, v45 +; VI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; VI-NEXT: v_mov_b32_e32 v3, v15 +; VI-NEXT: v_mov_b32_e32 v15, v29 +; VI-NEXT: v_lshrrev_b32_e32 v26, 16, v17 +; VI-NEXT: v_mov_b32_e32 v38, v56 +; VI-NEXT: v_mov_b32_e32 v29, v41 +; VI-NEXT: v_mov_b32_e32 v45, v60 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v55 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v49 +; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v50 +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v50 +; VI-NEXT: 
v_lshrrev_b32_e32 v51, 24, v40 +; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[49:50] +; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[39:40] +; VI-NEXT: v_mov_b32_e32 v58, v51 +; VI-NEXT: v_mov_b32_e32 v36, v62 +; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[54:55] ; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18] -; VI-NEXT: v_mov_b32_e32 v41, v33 -; VI-NEXT: v_mov_b32_e32 v33, v31 -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[19:20] -; VI-NEXT: v_mov_b32_e32 v51, v34 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v40 +; VI-NEXT: v_mov_b32_e32 v40, v16 +; VI-NEXT: v_mov_b32_e32 v16, v48 +; VI-NEXT: v_mov_b32_e32 v48, v28 +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v39 +; VI-NEXT: v_mov_b32_e32 v39, v47 +; VI-NEXT: v_mov_b32_e32 v47, v4 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v54 ; VI-NEXT: .LBB94_2: ; %Flow ; VI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; VI-NEXT: s_cbranch_execz .LBB94_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v63, 0x200 -; VI-NEXT: v_add_f16_sdwa v31, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_add_f16_sdwa v21, v18, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v21 ; VI-NEXT: v_add_f16_e32 v18, 0x200, v18 -; VI-NEXT: v_or_b32_e32 v32, v18, v31 -; VI-NEXT: v_add_f16_sdwa v31, v17, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_add_f16_sdwa v20, v17, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v58, v18, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 ; VI-NEXT: v_add_f16_e32 v17, 0x200, v17 -; VI-NEXT: v_or_b32_e32 v31, v17, v31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_sdwa v31, v20, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v20, 0x200, v20 -; VI-NEXT: v_or_b32_e32 v32, v20, v31 -; VI-NEXT: v_add_f16_sdwa v31, v19, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v19, 0x200, v19 -; VI-NEXT: v_or_b32_e32 v31, v19, v31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_sdwa v34, v22, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v34 -; VI-NEXT: v_add_f16_e32 v22, 0x200, v22 -; VI-NEXT: v_or_b32_e32 v32, v22, v31 -; VI-NEXT: 
v_add_f16_sdwa v31, v21, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v21, 0x200, v21 -; VI-NEXT: v_or_b32_e32 v31, v21, v31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_sdwa v31, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 -; VI-NEXT: v_or_b32_e32 v32, v24, v31 -; VI-NEXT: v_add_f16_sdwa v31, v23, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v23, 0x200, v23 -; VI-NEXT: v_or_b32_e32 v31, v23, v31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_sdwa v31, v26, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v26, 0x200, v26 -; VI-NEXT: v_or_b32_e32 v36, v26, v31 -; VI-NEXT: v_add_f16_sdwa v31, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_add_f16_sdwa v22, v32, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v57, v17, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v22 +; VI-NEXT: v_add_f16_e32 v32, 0x200, v32 +; VI-NEXT: v_or_b32_e32 v15, v32, v0 +; VI-NEXT: v_add_f16_sdwa v0, v31, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_add_f16_e32 v31, 0x200, v31 +; VI-NEXT: v_add_f16_sdwa v23, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_e32 v14, v31, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_e32 v62, v55, v0 +; VI-NEXT: v_add_f16_sdwa v0, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 +; VI-NEXT: v_or_b32_e32 v61, v54, v0 +; VI-NEXT: v_mov_b32_e32 v26, v54 +; VI-NEXT: v_mov_b32_e32 v27, v55 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v60, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 ; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 -; VI-NEXT: v_or_b32_e32 v35, v25, v31 -; VI-NEXT: 
v_add_f16_sdwa v31, v28, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v28, 0x200, v28 -; VI-NEXT: v_or_b32_e32 v38, v28, v31 -; VI-NEXT: v_add_f16_sdwa v31, v27, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v27, 0x200, v27 -; VI-NEXT: v_or_b32_e32 v37, v27, v31 -; VI-NEXT: v_add_f16_sdwa v31, v30, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v30, 0x200, v30 -; VI-NEXT: v_add_f16_sdwa v32, v29, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v29, 0x200, v29 -; VI-NEXT: v_or_b32_e32 v49, v30, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_or_b32_e32 v48, v29, v31 -; VI-NEXT: v_add_f16_sdwa v31, v61, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_e32 v61, 0x200, v61 -; VI-NEXT: v_add_f16_sdwa v32, v60, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v60, 0x200, v60 -; VI-NEXT: v_or_b32_e32 v51, v61, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_or_b32_e32 v50, v60, v31 -; VI-NEXT: v_add_f16_sdwa v31, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v34, v25, v0 +; VI-NEXT: v_add_f16_sdwa v0, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_or_b32_e32 v33, v24, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 -; VI-NEXT: v_add_f16_sdwa v32, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: v_or_b32_e32 v36, v2, v0 +; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 -; VI-NEXT: v_or_b32_e32 v53, v2, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_or_b32_e32 v52, v1, v31 -; VI-NEXT: 
v_add_f16_sdwa v31, v4, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_e32 v4, 0x200, v4 -; VI-NEXT: v_add_f16_sdwa v32, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_or_b32_e32 v35, v1, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_or_b32_e32 v38, v2, v0 +; VI-NEXT: v_add_f16_sdwa v0, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_or_b32_e32 v37, v1, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_f16_sdwa v1, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v0, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 +; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_or_b32_e32 v49, v9, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v47, v3, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v3, 0x200, v3 -; VI-NEXT: v_or_b32_e32 v46, v4, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_or_b32_e32 v45, v3, v31 -; VI-NEXT: v_add_f16_sdwa v31, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_sdwa v1, v2, v63 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v48, v8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v47 +; VI-NEXT: v_mov_b32_e32 v9, v31 +; VI-NEXT: v_add_f16_sdwa v8, v43, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v10, v32 +; VI-NEXT: v_add_f16_e32 v43, 0x200, v43 +; VI-NEXT: v_or_b32_e32 v51, v3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_or_b32_e32 v50, v2, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_f16_sdwa v3, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v53, v2, v0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; VI-NEXT: v_add_f16_sdwa v3, v44, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v44, 0x200, v44 +; VI-NEXT: v_or_b32_e32 v52, v1, v0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_f16_sdwa v59, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v0, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_or_b32_e32 v46, v2, v0 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; VI-NEXT: v_or_b32_e32 v45, v1, v0 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_f16_sdwa v1, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v11, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 ; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 -; VI-NEXT: v_add_f16_sdwa v32, v5, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 
-; VI-NEXT: v_add_f16_e32 v5, 0x200, v5 -; VI-NEXT: v_or_b32_e32 v43, v6, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_add_f16_sdwa v44, v8, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v42, v5, v31 -; VI-NEXT: v_add_f16_e32 v8, 0x200, v8 -; VI-NEXT: v_add_f16_sdwa v32, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v44 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v5, v7, v0 +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; VI-NEXT: v_or_b32_e32 v4, v6, v0 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_f16_sdwa v39, v6, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f16_sdwa v56, v7, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_f16_e32 v7, 0x200, v7 -; VI-NEXT: v_or_b32_e32 v41, v8, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_or_b32_e32 v40, v7, v31 -; VI-NEXT: v_add_f16_sdwa v31, v10, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: v_add_f16_e32 v10, 0x200, v10 -; VI-NEXT: v_add_f16_sdwa v32, v9, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; VI-NEXT: v_add_f16_e32 v9, 0x200, v9 -; VI-NEXT: v_or_b32_e32 v55, v10, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v32 -; VI-NEXT: v_add_f16_sdwa v39, v12, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v54, v9, v31 -; VI-NEXT: v_add_f16_e32 v12, 0x200, v12 -; VI-NEXT: v_add_f16_sdwa v33, v11, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v39 -; VI-NEXT: v_add_f16_sdwa v47, v14, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v32, v12, v31 -; VI-NEXT: v_lshlrev_b32_e32 v31, 16, v33 -; VI-NEXT: v_add_f16_e32 v14, 0x200, v14 -; VI-NEXT: v_add_f16_sdwa v33, v13, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v47 -; VI-NEXT: v_or_b32_e32 v57, v14, v56 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v56, 16, v33 -; VI-NEXT: v_add_f16_sdwa v33, v16, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v16, 0x200, v16 -; VI-NEXT: v_add_f16_sdwa v63, v15, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v33 -; VI-NEXT: v_add_f16_e32 v15, 0x200, v15 
-; VI-NEXT: v_or_b32_e32 v59, v16, v58 -; VI-NEXT: v_lshlrev_b32_e32 v58, 16, v63 -; VI-NEXT: v_or_b32_e32 v58, v15, v58 -; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v59 -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v58 -; VI-NEXT: v_lshrrev_b64 v[58:59], 24, v[58:59] -; VI-NEXT: v_add_f16_e32 v13, 0x200, v13 -; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v56, v13, v56 -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v57 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v56 -; VI-NEXT: v_lshrrev_b64 v[56:57], 24, v[56:57] -; VI-NEXT: v_add_f16_e32 v11, 0x200, v11 -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v31, v11, v31 -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v32 -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v31 -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[31:32] -; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v55 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v54 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[54:55] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v41 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v40 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[40:41] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v43 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v42 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v46 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v45 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v53 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte 
Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v52 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v51 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v50 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[50:51] -; VI-NEXT: v_lshrrev_b64 v[42:43], 24, v[42:43] -; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[45:46] -; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[52:53] -; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v49 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v48 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[48:49] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v38 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v37 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[37:38] -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v36 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v35 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_e32 v6, 0x200, v6 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; VI-NEXT: v_or_b32_e32 v41, v7, v0 +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_e32 v40, v6, v0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_f16_sdwa v19, v24, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_f16_sdwa v42, v25, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v25, 0x200, v25 +; VI-NEXT: v_add_f16_e32 v24, 0x200, v24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v42 +; VI-NEXT: v_or_b32_e32 v7, v25, v0 +; 
VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_f16_sdwa v28, v2, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v2, 0x200, v2 +; VI-NEXT: v_add_f16_sdwa v16, v1, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v1, 0x200, v1 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_f16_sdwa v13, v54, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v54, 0x200, v54 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v25, 8, v46 +; VI-NEXT: v_or_b32_e32 v6, v24, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; VI-NEXT: v_or_b32_e32 v32, v44, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; VI-NEXT: v_or_b32_e32 v31, v43, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v28 +; VI-NEXT: v_or_b32_e32 v30, v2, v0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_add_f16_sdwa v2, v55, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_f16_e32 v55, 0x200, v55 +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; VI-NEXT: v_or_b32_e32 v29, v1, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v24, 8, v4 +; VI-NEXT: v_lshrrev_b32_e32 v63, 8, v34 +; VI-NEXT: v_or_b32_e32 v1, v55, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; VI-NEXT: v_or_b32_e32 v0, v54, v0 +; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v1 +; VI-NEXT: v_lshrrev_b32_e32 v54, 8, v0 +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v30 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v29 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[29:30] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v32 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v31 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[31:32] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v6 +; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[0:1], 24, v[6:7] +; VI-NEXT: v_mov_b32_e32 v32, v10 +; VI-NEXT: v_mov_b32_e32 v31, v9 +; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v41 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v7, v11 +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[40:41] +; VI-NEXT: v_mov_b32_e32 v55, v27 +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v54, v26 +; VI-NEXT: v_mov_b32_e32 v26, v20 +; VI-NEXT: v_lshrrev_b32_e32 v20, 8, v5 +; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[4:5] +; VI-NEXT: v_mov_b32_e32 v5, v22 +; VI-NEXT: v_mov_b32_e32 v13, v21 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[45:46] +; VI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v53 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v52 +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[50:51] +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v50 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v48 +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[21:22], 24, v[48:49] +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v49 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v36 +; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v35 ; VI-NEXT: v_lshrrev_b64 v[35:36], 24, v[35:36] -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v54, v39 -; VI-NEXT: v_mov_b32_e32 v37, v44 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_mov_b32_e32 v56, v58 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v49 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v48 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[48:49] -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v48, v33 -; VI-NEXT: 
s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v51 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v50 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[50:51] -; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v51 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v50 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[50:51] -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v51 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v50 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v33, 8, 8 -; VI-NEXT: v_mov_b32_e32 v33, v47 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v33, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v39, 8, 8 -; VI-NEXT: v_mov_b32_e32 v39, v63 -; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[50:51] -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v36, v2 +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v62 +; VI-NEXT: v_lshrrev_b32_e32 v4, 8, v61 +; VI-NEXT: v_lshrrev_b64 v[61:62], 24, v[61:62] +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v2, 8, v14 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v48, v56 +; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v33 +; VI-NEXT: v_lshrrev_b64 v[49:50], 24, v[33:34] +; VI-NEXT: v_lshrrev_b64 v[33:34], 24, v[14:15] +; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v58 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v57 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v23 +; VI-NEXT: v_lshrrev_b32_e32 v23, 8, v40 +; VI-NEXT: v_mov_b32_e32 v14, v8 +; VI-NEXT: v_mov_b32_e32 v40, v42 +; VI-NEXT: v_bfe_u32 v8, v42, 8, 8 +; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v38 +; VI-NEXT: v_lshrrev_b32_e32 v22, 8, v37 +; VI-NEXT: v_lshrrev_b64 v[37:38], 24, v[37:38] +; VI-NEXT: 
v_lshrrev_b32_e32 v27, 8, v45 +; VI-NEXT: v_lshrrev_b64 v[52:53], 24, v[52:53] +; VI-NEXT: v_lshrrev_b32_e32 v45, 8, v51 +; VI-NEXT: v_mov_b32_e32 v38, v28 +; VI-NEXT: v_lshrrev_b64 v[50:51], 24, v[57:58] +; VI-NEXT: v_bfe_u32 v28, v36, 8, 8 +; VI-NEXT: v_bfe_u32 v29, v38, 8, 8 +; VI-NEXT: v_mov_b32_e32 v53, v3 +; VI-NEXT: v_bfe_u32 v15, v3, 8, 8 +; VI-NEXT: v_mov_b32_e32 v3, v59 +; VI-NEXT: v_bfe_u32 v51, v48, 8, 8 +; VI-NEXT: v_bfe_u32 v57, v7, 8, 8 +; VI-NEXT: v_bfe_u32 v58, v60, 8, 8 ; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_bfe_u32 v32, v63, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v44, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v34, v62, 8, 8 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_bfe_u32 v2, v2, 8, 8 +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: v_bfe_u32 v34, v47, 8, 8 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v9, v9, 8, 8 +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v5, v5, 8, 8 +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v13, v13, 8, 8 +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_bfe_u32 v32, v47, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: v_bfe_u32 v42, v0, 8, 8 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v44, v32 -; VI-NEXT: v_bfe_u32 v32, v32, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_bfe_u32 v0, v0, 8, 8 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v55, v32 -; VI-NEXT: v_bfe_u32 v32, v32, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v36, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfe_u32 v32, v32, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v58, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v57, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v59, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v34, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 
offset:364 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v53, 8, 8 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v59 -; VI-NEXT: v_mov_b32_e32 v59, v34 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v32, v41, 8, 8 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v47, v0, 8, 8 ; VI-NEXT: .LBB94_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v2, v2, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v34, 8, v52 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v34, v32, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v45 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_lshlrev_b16_e32 v2, 8, v52 +; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v0, v12, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v44, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v27 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 
offset:168 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 8, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 12, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v24 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 16, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v20 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v57 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v7, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 20, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v23 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 24, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v51 +; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 28, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 32, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v8 +; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 36, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v14, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 40, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v15 +; VI-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:112 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 44, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v16, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v1, v14, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 48, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v29 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v15, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 52, v12 
+; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v16, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v50 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 56, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v28 +; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v18, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v31 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 60, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v50 +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 64, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v13 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v40 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x44, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 
4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v33 +; VI-NEXT: v_or_b32_sdwa v0, v11, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x48, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v22, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v49 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x4c, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v4 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v61 +; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x50, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v41 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v9 +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x54, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v56 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v49 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x58, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v63 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v58 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v26, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v58, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x5c, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v21 +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v35 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x60, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v30 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v47 +; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 
; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x64, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v22 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v37 ; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x68, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v46 +; VI-NEXT: v_or_b32_sdwa v0, v29, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x6c, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, 
s[0:3], s32 offset:84 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 -; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v3 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x70, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x74, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v2, v36, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; VI-NEXT: v_or_b32_sdwa v1, 
v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x78, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v45 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v34 +; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7c, v12 +; VI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload @@ -186028,23 +187934,24 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-LABEL: bitcast_v64f16_to_v128i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 @@ -186053,1830 +187960,1684 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:36 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:32 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 +; SI-NEXT: v_writelane_b32 v63, s30, 0 +; SI-NEXT: v_writelane_b32 v63, s31, 1 +; SI-NEXT: v_writelane_b32 v63, s34, 2 +; SI-NEXT: v_writelane_b32 v63, s35, 3 +; SI-NEXT: v_writelane_b32 v63, s36, 4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_writelane_b32 v63, s37, 5 +; SI-NEXT: v_writelane_b32 v63, s38, 6 +; SI-NEXT: v_writelane_b32 v63, s39, 7 +; SI-NEXT: v_writelane_b32 v63, s48, 8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 -; SI-NEXT: v_mov_b32_e32 v59, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v7 +; SI-NEXT: v_writelane_b32 v63, s49, 9 +; SI-NEXT: v_writelane_b32 v63, s50, 10 +; SI-NEXT: v_writelane_b32 v63, s51, 11 +; SI-NEXT: v_writelane_b32 v63, s52, 12 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: v_writelane_b32 v63, s53, 13 +; SI-NEXT: v_writelane_b32 v63, s54, 14 +; SI-NEXT: v_writelane_b32 v63, s55, 15 +; SI-NEXT: v_writelane_b32 v63, s64, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 +; SI-NEXT: v_writelane_b32 v63, s65, 17 +; SI-NEXT: v_writelane_b32 v63, s66, 18 +; SI-NEXT: v_writelane_b32 v63, s67, 19 +; SI-NEXT: v_writelane_b32 v63, s68, 20 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 +; SI-NEXT: v_writelane_b32 v63, s69, 21 +; SI-NEXT: v_writelane_b32 v63, s70, 22 +; SI-NEXT: v_writelane_b32 v63, s71, 23 +; SI-NEXT: v_writelane_b32 v63, s80, 24 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 +; SI-NEXT: v_writelane_b32 v63, s81, 25 +; SI-NEXT: v_writelane_b32 v63, s82, 26 +; SI-NEXT: v_writelane_b32 v63, s83, 27 +; SI-NEXT: v_writelane_b32 v63, s84, 28 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v63, s85, 29 +; SI-NEXT: v_writelane_b32 v63, s86, 30 +; SI-NEXT: v_writelane_b32 v63, s87, 31 +; SI-NEXT: v_mov_b32_e32 v46, v29 +; SI-NEXT: v_writelane_b32 v63, s96, 32 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v15 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v62, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v26 -; SI-NEXT: 
v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v36 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v60 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v50 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v52 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(13) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v51 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v51, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v53 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v40 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v49, v54 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v40, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s21 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v55 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v40 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_cvt_f16_f32_e32 v55, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v22, s22 +; SI-NEXT: 
v_cvt_f16_f32_e32 v15, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s28 +; SI-NEXT: v_writelane_b32 v63, s97, 33 +; SI-NEXT: v_writelane_b32 v63, s98, 34 +; SI-NEXT: v_writelane_b32 v63, s99, 35 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:624 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, 
s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB95_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v13, v13, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v44 -; SI-NEXT: v_or_b32_e32 v55, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; SI-NEXT: v_or_b32_e32 v57, v16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v20 -; SI-NEXT: v_or_b32_e32 v17, v14, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 -; SI-NEXT: v_or_b32_e32 v16, v19, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v24 -; SI-NEXT: v_or_b32_e32 v19, v23, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v29 -; SI-NEXT: v_or_b32_e32 v47, v60, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v43 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v43, v42, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 -; SI-NEXT: v_or_b32_e32 v14, v63, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v61 -; SI-NEXT: v_or_b32_e32 v42, v58, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v60, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; SI-NEXT: v_or_b32_e32 v22, v2, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_or_b32_e32 v12, v46, v12 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v24, v2, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; SI-NEXT: v_or_b32_e32 v4, v4, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_or_b32_e32 v34, v34, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_or_b32_e32 v3, v59, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_or_b32_e32 v59, v56, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; SI-NEXT: v_or_b32_e32 v6, v62, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; SI-NEXT: v_or_b32_e32 v62, v25, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_or_b32_e32 v2, v27, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v8 -; SI-NEXT: v_or_b32_e32 v25, v28, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v36, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; SI-NEXT: v_or_b32_e32 v23, v35, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v18, v39, v11 -; SI-NEXT: v_mov_b32_e32 v36, v2 -; SI-NEXT: v_mov_b32_e32 v35, v1 -; SI-NEXT: v_alignbit_b32 v1, v55, v13, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v55, v13, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v55, v13, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v57, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 
v1, v17, v57, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v57, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v16, v21, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v16, v21, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v16, v21, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v47, v19, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v47, v19, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v47, v19, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v14, v43, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v14, v43, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v14, v43, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v60, v42, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v60, v42, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v60, v42, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v22, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v22, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v24, v22, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v4, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v4, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v34, v4, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v59, v3, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v59, v3, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v59, v3, 8 -; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v26, 8, v34 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v38 -; SI-NEXT: v_or_b32_e32 v61, v50, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v49 -; SI-NEXT: v_or_b32_e32 v2, v48, v11 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v49, v6 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v62, v49, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v62, v49, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v62, v49, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v25, v36, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v25, v36, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v25, v36, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v23, v35, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v23, v35, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v23, v35, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v61, v18, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v61, v18, 16 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v48 -; SI-NEXT: v_or_b32_e32 v58, v54, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v51 -; SI-NEXT: v_or_b32_e32 v6, v53, v11 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v61, v18, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v58, v2, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v1, v58, v2, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v17 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v16 -; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v14 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v60 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v61 -; SI-NEXT: v_or_b32_e32 v54, v40, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v44, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v20, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v15, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v29, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v33, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v32, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v31, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v10, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v9, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v8, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v5, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v38, 8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v48, 8, 8 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v37, 8, 8 -; SI-NEXT: v_or_b32_e32 v11, v45, v11 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s4, v19 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: s_or_b32 s44, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v17 +; SI-NEXT: s_or_b32 s45, s5, s4 +; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 4 +; SI-NEXT: v_writelane_b32 v62, s5, 5 +; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 2 +; SI-NEXT: v_writelane_b32 v62, s5, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[44:45], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 0 +; SI-NEXT: v_writelane_b32 v62, s5, 1 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v37 +; SI-NEXT: s_or_b32 s42, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v22 +; SI-NEXT: s_or_b32 s43, s5, s4 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: v_writelane_b32 v62, s5, 11 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 8 +; SI-NEXT: v_writelane_b32 v62, s5, 9 +; SI-NEXT: s_lshr_b64 s[4:5], s[42:43], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 6 +; SI-NEXT: v_writelane_b32 v62, s5, 7 +; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s40, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v30 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v39 +; SI-NEXT: s_or_b32 s41, s5, s4 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: v_writelane_b32 v62, s5, 17 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_writelane_b32 v62, s5, 15 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: v_writelane_b32 v62, s5, 13 +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v20 +; SI-NEXT: s_or_b32 s28, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v47 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_mov_b32_e32 v39, v5 +; SI-NEXT: v_mov_b32_e32 v60, v16 +; SI-NEXT: v_readfirstlane_b32 s46, v55 +; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: v_mov_b32_e32 v40, v34 +; SI-NEXT: v_mov_b32_e32 v41, v21 +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: s_lshr_b32 s71, s45, 8 +; SI-NEXT: s_lshr_b32 s70, s43, 8 +; SI-NEXT: s_lshr_b32 s69, s41, 8 +; SI-NEXT: v_bfe_u32 v38, v47, 8, 8 +; SI-NEXT: v_bfe_u32 v37, v33, 8, 8 +; SI-NEXT: v_bfe_u32 v35, v32, 8, 8 +; SI-NEXT: v_bfe_u32 v20, v10, 8, 8 
+; SI-NEXT: v_bfe_u32 v19, v9, 8, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v1, v52, 8, 8 -; SI-NEXT: v_alignbit_b32 v28, v58, v2, 24 -; SI-NEXT: v_alignbit_b32 v2, v54, v6, 24 -; SI-NEXT: v_alignbit_b32 v39, v54, v6, 16 -; SI-NEXT: v_alignbit_b32 v40, v54, v6, 8 -; SI-NEXT: v_alignbit_b32 v27, v12, v11, 24 -; SI-NEXT: v_alignbit_b32 v56, v12, v11, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v30, v12, v11, 8 -; SI-NEXT: v_mov_b32_e32 v20, v29 -; SI-NEXT: v_mov_b32_e32 v15, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: s_or_b32 s29, s5, s4 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 22 +; SI-NEXT: v_writelane_b32 v62, s5, 23 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 20 +; SI-NEXT: v_writelane_b32 v62, s5, 21 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 18 +; SI-NEXT: v_writelane_b32 v62, s5, 19 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s26, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v43 +; SI-NEXT: s_or_b32 s27, s5, s4 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 28 +; SI-NEXT: v_writelane_b32 v62, s5, 29 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 26 +; SI-NEXT: v_writelane_b32 v62, s5, 27 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 24 +; SI-NEXT: v_writelane_b32 v62, s5, 25 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshr_b32 s68, s29, 8 +; SI-NEXT: s_lshr_b32 s66, s27, 8 +; SI-NEXT: v_bfe_u32 v43, v31, 8, 8 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: s_or_b32 s24, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_bfe_u32 v15, v5, 8, 8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: s_or_b32 s25, s5, s4 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 34 +; SI-NEXT: v_writelane_b32 v62, s5, 35 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 32 +; SI-NEXT: v_writelane_b32 v62, s5, 33 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 8 +; SI-NEXT: v_writelane_b32 v62, s4, 30 +; SI-NEXT: v_writelane_b32 v62, s5, 31 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v29 +; SI-NEXT: s_or_b32 s22, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v6 +; SI-NEXT: s_or_b32 s23, s5, s4 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 40 +; SI-NEXT: v_writelane_b32 v62, s5, 41 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: v_writelane_b32 v62, s4, 38 +; SI-NEXT: v_writelane_b32 v62, s5, 39 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 8 +; SI-NEXT: 
v_writelane_b32 v62, s4, 36 +; SI-NEXT: v_writelane_b32 v62, s5, 37 +; SI-NEXT: v_readfirstlane_b32 s4, v58 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v13 +; SI-NEXT: s_or_b32 s20, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: s_or_b32 s21, s5, s4 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 24 +; SI-NEXT: v_writelane_b32 v62, s4, 44 +; SI-NEXT: v_writelane_b32 v62, s5, 45 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: v_mov_b32_e32 v58, v11 +; SI-NEXT: v_writelane_b32 v62, s4, 42 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_writelane_b32 v62, s5, 43 +; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v7, v29 +; SI-NEXT: v_mov_b32_e32 v29, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_or_b32 s18, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v61 +; SI-NEXT: s_or_b32 s19, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v21 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v24 +; SI-NEXT: s_or_b32 s16, s5, s4 +; SI-NEXT: v_mov_b32_e32 v1, v53 +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_mov_b32_e32 v21, v24 +; SI-NEXT: s_lshr_b32 s64, s25, 8 +; SI-NEXT: s_lshr_b32 s54, s23, 8 +; SI-NEXT: s_lshr_b32 s52, s21, 8 +; SI-NEXT: s_lshr_b32 s50, s19, 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[98:99], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 8 +; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_bfe_u32 v24, v12, 8, 8 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_bfe_u32 v48, v48, 8, 8 +; SI-NEXT: v_bfe_u32 v61, v59, 8, 8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_bfe_u32 v18, v11, 8, 8 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: s_or_b32 s17, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v44 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v42 +; SI-NEXT: s_or_b32 s14, s5, s4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v46 +; SI-NEXT: s_or_b32 s15, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v23 +; SI-NEXT: s_or_b32 s12, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v3 +; SI-NEXT: s_or_b32 s13, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v27 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v28 +; SI-NEXT: s_or_b32 s10, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v45 +; SI-NEXT: s_or_b32 s11, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v57 +; SI-NEXT: s_or_b32 s8, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v59 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v56 +; SI-NEXT: s_or_b32 s9, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v26 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: 
v_readfirstlane_b32 s5, v25 +; SI-NEXT: s_or_b32 s6, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v50 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v49 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v54 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s5, v53 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_or_b32 s5, s46, s5 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 48 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46 +; SI-NEXT: s_lshr_b32 s48, s17, 8 +; SI-NEXT: s_lshr_b32 s67, s15, 8 +; SI-NEXT: s_lshr_b32 s65, s13, 8 +; SI-NEXT: s_lshr_b32 s55, s11, 8 +; SI-NEXT: s_lshr_b32 s53, s9, 8 +; SI-NEXT: s_lshr_b32 s51, s7, 8 +; SI-NEXT: s_lshr_b32 s49, s5, 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[86:87], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47 +; SI-NEXT: s_mov_b64 vcc, 0 +; SI-NEXT: v_mov_b32_e32 v57, v30 +; SI-NEXT: v_bfe_u32 v50, v30, 8, 8 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v46, v33 +; SI-NEXT: v_mov_b32_e32 v30, v32 ; SI-NEXT: v_mov_b32_e32 v32, v31 ; SI-NEXT: v_mov_b32_e32 v31, v10 -; SI-NEXT: v_mov_b32_e32 v10, v9 -; SI-NEXT: v_mov_b32_e32 v9, v7 -; SI-NEXT: v_bfe_u32 v29, v7, 8, 8 -; SI-NEXT: v_mov_b32_e32 v7, v8 -; SI-NEXT: v_mov_b32_e32 v8, v5 -; SI-NEXT: v_mov_b32_e32 v44, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_bfe_u32 v42, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v45, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v47, v52, 8, 8 +; SI-NEXT: v_bfe_u32 v33, v1, 8, 8 +; SI-NEXT: v_mov_b32_e32 v3, v14 +; SI-NEXT: v_mov_b32_e32 v25, v59 +; SI-NEXT: v_mov_b32_e32 v1, v52 +; SI-NEXT: v_mov_b32_e32 v44, v11 ; SI-NEXT: s_branch .LBB95_3 ; SI-NEXT: .LBB95_2: -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; 
SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; 
SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v20, v29 -; SI-NEXT: v_mov_b32_e32 v15, v33 -; SI-NEXT: v_mov_b32_e32 v33, v32 +; SI-NEXT: v_mov_b32_e32 v60, v16 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v7, v29 +; SI-NEXT: v_mov_b32_e32 v29, v6 +; SI-NEXT: v_mov_b32_e32 v39, v5 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: v_mov_b32_e32 v51, v42 +; SI-NEXT: v_writelane_b32 v62, s4, 0 +; SI-NEXT: v_writelane_b32 v62, s5, 1 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v41, v21 +; SI-NEXT: v_writelane_b32 v62, s4, 2 +; SI-NEXT: v_writelane_b32 v62, s5, 3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v21, v24 +; SI-NEXT: v_writelane_b32 v62, s4, 4 +; SI-NEXT: v_writelane_b32 v62, s5, 5 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v40, v34 +; SI-NEXT: v_writelane_b32 v62, s4, 6 +; SI-NEXT: v_writelane_b32 v62, s5, 7 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v34, v61 +; SI-NEXT: v_writelane_b32 v62, s4, 8 +; SI-NEXT: v_writelane_b32 v62, s5, 9 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v13, v12 +; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: v_writelane_b32 v62, s5, 11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_writelane_b32 v62, s4, 12 +; SI-NEXT: v_writelane_b32 v62, s5, 13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v57, v30 +; SI-NEXT: v_writelane_b32 v62, s4, 14 +; SI-NEXT: v_writelane_b32 v62, s5, 15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v58, v11 +; SI-NEXT: v_writelane_b32 v62, s4, 16 +; SI-NEXT: v_writelane_b32 v62, s5, 17 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_writelane_b32 v62, s4, 18 +; SI-NEXT: v_writelane_b32 v62, s5, 19 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v46, v33 +; SI-NEXT: v_writelane_b32 v62, s4, 20 +; SI-NEXT: v_writelane_b32 v62, s5, 21 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_writelane_b32 v62, s4, 22 +; SI-NEXT: v_writelane_b32 v62, s5, 23 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_mov_b32_e32 v32, v31 +; SI-NEXT: v_writelane_b32 v62, s4, 24 +; SI-NEXT: v_writelane_b32 v62, s5, 25 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: v_mov_b32_e32 v31, v10 -; SI-NEXT: v_mov_b32_e32 v10, v9 -; SI-NEXT: v_mov_b32_e32 v9, v7 -; SI-NEXT: 
v_mov_b32_e32 v7, v8 -; SI-NEXT: v_mov_b32_e32 v8, v5 -; SI-NEXT: v_mov_b32_e32 v44, v37 -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: v_writelane_b32 v62, s4, 26 +; SI-NEXT: v_writelane_b32 v62, s5, 27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_writelane_b32 v62, s4, 28 +; SI-NEXT: v_writelane_b32 v62, s5, 29 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_writelane_b32 v62, s4, 30 +; SI-NEXT: v_writelane_b32 v62, s5, 31 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v22, v2 +; SI-NEXT: v_writelane_b32 v62, s4, 32 +; SI-NEXT: v_writelane_b32 v62, s5, 33 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_writelane_b32 v62, s4, 34 +; SI-NEXT: v_writelane_b32 v62, s5, 35 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v17, v43 +; SI-NEXT: v_writelane_b32 v62, s4, 36 +; SI-NEXT: v_writelane_b32 v62, s5, 37 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_mov_b64 vcc, -1 +; SI-NEXT: v_writelane_b32 v62, s4, 38 +; SI-NEXT: v_writelane_b32 v62, s5, 39 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v25, v59 +; SI-NEXT: v_writelane_b32 v62, s4, 40 +; SI-NEXT: v_writelane_b32 v62, s5, 41 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_mov_b32_e32 v1, v52 +; SI-NEXT: v_writelane_b32 v62, s4, 42 +; SI-NEXT: v_writelane_b32 v62, s5, 43 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr71 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr64 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr96 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; 
implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: v_writelane_b32 v62, s4, 44 +; SI-NEXT: v_writelane_b32 v62, s5, 45 +; SI-NEXT: v_writelane_b32 v62, s80, 46 +; SI-NEXT: v_writelane_b32 v62, s81, 47 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v62, s80, 48 +; SI-NEXT: v_writelane_b32 v62, s81, 49 +; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: .LBB95_3: ; %Flow -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, v44 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v5, v8 -; SI-NEXT: v_mov_b32_e32 v6, v7 -; SI-NEXT: v_mov_b32_e32 v7, v9 -; SI-NEXT: v_mov_b32_e32 v8, v10 -; SI-NEXT: v_mov_b32_e32 v9, v31 -; SI-NEXT: v_mov_b32_e32 v31, v33 -; SI-NEXT: v_mov_b32_e32 v44, v15 -; SI-NEXT: v_mov_b32_e32 v33, v20 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v50, v2 -; SI-NEXT: v_mov_b32_e32 v53, v40 -; SI-NEXT: v_mov_b32_e32 v40, 
v28 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_mov_b32_e32 v2, v48 -; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v11, v27 -; SI-NEXT: v_mov_b32_e32 v38, v30 -; SI-NEXT: v_mov_b32_e32 v27, v52 -; SI-NEXT: v_mov_b32_e32 v30, v29 -; SI-NEXT: v_mov_b32_e32 v29, v26 +; SI-NEXT: v_mov_b32_e32 v14, v17 +; SI-NEXT: v_mov_b32_e32 v17, v39 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_andn2_b64 vcc, exec, vcc +; SI-NEXT: v_mov_b32_e32 v23, v2 +; SI-NEXT: v_mov_b32_e32 v59, v34 +; SI-NEXT: v_mov_b32_e32 v2, v25 ; SI-NEXT: s_cbranch_vccnz .LBB95_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v52, v29 +; SI-NEXT: v_mov_b32_e32 v29, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v18, v14 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mov_b32_e32 v15, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v55 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v31 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v2 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v39, 
off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v31, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 -; SI-NEXT: v_or_b32_e32 v56, v12, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_bfe_u32 v50, v57, 8, 8 +; SI-NEXT: v_bfe_u32 v48, v12, 8, 8 +; SI-NEXT: v_bfe_u32 v43, v32, 8, 8 +; SI-NEXT: v_bfe_u32 v24, v13, 8, 8 +; SI-NEXT: v_bfe_u32 v20, v31, 8, 8 +; SI-NEXT: v_bfe_u32 v19, v54, 8, 8 +; SI-NEXT: v_bfe_u32 v42, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v45, v55, 8, 8 +; SI-NEXT: v_bfe_u32 v61, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v33, v53, 8, 8 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s4, v14 +; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v44 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 +; SI-NEXT: v_readfirstlane_b32 s5, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s5, v53 ; SI-NEXT: 
v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v36, v14, v13 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[80:81], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s49, s5, 8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v54, v14, v16 +; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v16, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s7, v14 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v52, v17, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_lshr_b64 s[82:83], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[84:85], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[86:87], s[6:7], 8 +; SI-NEXT: s_lshr_b32 s51, s7, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v58, v17, v19 -; SI-NEXT: v_alignbit_b32 v40, v58, v52, 24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s8, v14 +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s9, v14 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_readfirstlane_b32 s9, v2 +; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded 
Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v11, v21, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 +; SI-NEXT: s_lshr_b32 s53, s9, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v27, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v61, v21, v22 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s10, v14 +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v16, v23, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s11, v14 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_readfirstlane_b32 s11, v55 +; SI-NEXT: s_lshl_b32 s11, s11, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s12, v14 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 +; SI-NEXT: s_lshr_b32 s55, s11, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded 
Reload -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s12, v14 +; SI-NEXT: s_lshl_b32 s12, s12, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v48, v25, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s13, v14 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_readfirstlane_b32 s13, v5 +; SI-NEXT: s_lshl_b32 s13, s13, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v3 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_or_b32_e32 v53, v26, v27 -; SI-NEXT: v_mov_b32_e32 v26, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v28, v4 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v62, v28, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s14, v14 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_lshr_b64 s[78:79], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[12:13], 8 +; SI-NEXT: s_lshr_b32 s65, s13, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v29, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v27, v34 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v8 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v63 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_or_b32_e32 v59, v29, v34 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v51 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v30, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; 
SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_or_b32_e32 v3, v30, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v44 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v4, v34, v30 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v32 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_mov_b32_e32 v30, v10 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: v_or_b32_e32 v34, v35, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v46 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v22, v35, v36 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v10 -; SI-NEXT: v_mov_b32_e32 v35, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v51 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s15, v14 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_readfirstlane_b32 s15, v6 +; SI-NEXT: s_lshl_b32 s15, s15, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_or_b32_e32 v24, v37, v36 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v38 -; SI-NEXT: v_or_b32_e32 v42, v39, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 -; SI-NEXT: v_mov_b32_e32 v36, v48 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: s_lshr_b64 s[92:93], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[14:15], 16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_lshr_b64 s[30:31], s[14:15], 8 +; SI-NEXT: s_lshr_b32 s67, s15, 8 +; SI-NEXT: v_readfirstlane_b32 s16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v21 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s17, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readfirstlane_b32 s17, v44 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s18, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: 
s_lshr_b64 s[34:35], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_lshr_b64 s[38:39], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s48, s17, 8 +; SI-NEXT: v_readfirstlane_b32 s18, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v60 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s19, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_readfirstlane_b32 s19, v54 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: s_lshr_b64 s[96:97], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[98:99], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 8 +; SI-NEXT: s_lshr_b32 s50, s19, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v13 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_or_b32_e32 v60, v37, v39 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v49 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_alignbit_b32 v39, v54, v29, 16 -; SI-NEXT: v_or_b32_e32 v43, v48, v37 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v44 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s20, v14 +; SI-NEXT: s_lshl_b32 s20, s20, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_or_b32_e32 v14, v49, v48 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v28, v14, v43, 8 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s21, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v58 +; SI-NEXT: s_or_b32 s20, s21, s20 +; SI-NEXT: v_readfirstlane_b32 s21, v31 +; SI-NEXT: s_lshl_b32 s21, s21, 16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s22, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v39 +; SI-NEXT: s_or_b32 s21, s22, s21 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 8 +; SI-NEXT: s_lshr_b32 s52, s21, 8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s22, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v29 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: 
v_readfirstlane_b32 s23, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v52 +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: v_readfirstlane_b32 s23, v32 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s24, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v17 +; SI-NEXT: s_or_b32 s23, s24, s23 +; SI-NEXT: s_lshr_b32 s54, s23, 8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s24, v14 +; SI-NEXT: s_lshl_b32 s24, s24, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_or_b32_e32 v19, v48, v37 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v33 -; SI-NEXT: v_or_b32_e32 v47, v49, v37 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s25, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v30 +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v14 +; SI-NEXT: v_readfirstlane_b32 s25, v30 +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: v_bfe_u32 v35, v30, 8, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 -; SI-NEXT: v_or_b32_e32 v21, v50, v37 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_or_b32_e32 v16, v37, v49 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v48 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_alignbit_b32 v50, v54, v29, 24 -; SI-NEXT: v_or_b32_e32 v57, v48, v37 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_or_b32_e32 v17, v49, v48 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_mov_b32_e32 v49, v53 -; SI-NEXT: v_alignbit_b32 v53, v54, v29, 8 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_or_b32_e32 v13, v48, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_or_b32_e32 v55, v51, v37 -; SI-NEXT: v_alignbit_b32 v10, v55, v13, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v55, v13, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v17, v57, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v17, v57, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v17, v57, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v16, v21, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v16, v21, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v16, v21, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v47, v19, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v47, v19, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v47, v19, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v14, v43, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v14, v43, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v60, v42, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v60, v42, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v60, v42, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v24, v22, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v24, v22, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v24, v22, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v34, v4, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v34, v4, 16 -; SI-NEXT: 
buffer_store_dword v10, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v34, v4, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v59, v3, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v59, v3, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v59, v3, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v62, v49, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v62, v49, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v62, v49, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v25, v36, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v25, v36, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v25, v36, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v23, v35, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v23, v35, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v23, v35, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v61, v11, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v61, v11, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v10, v61, v11, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v10, v58, v52, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v10, v58, v52, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v56 -; SI-NEXT: v_alignbit_b32 v11, v12, v10, 24 -; SI-NEXT: v_alignbit_b32 v56, v12, v10, 16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v38, v12, v10, 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v55 -; SI-NEXT: 
buffer_store_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v17 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v16 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v47 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v14 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v60 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v59 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v62 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v25 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v23 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v61 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v58 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v54 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v12 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v20, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v18, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v15, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v33, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v44, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v31, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v30, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v9, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill 
-; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v8, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v6, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v5, 8, 8 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_bfe_u32 v10, v26, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v2, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v1, 8, 8 -; SI-NEXT: v_alignbit_b32 v48, v55, v13, 24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 -; SI-NEXT: v_bfe_u32 v30, v7, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v10, v27, 8, 8 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: .LBB95_5: ; %end -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v37, 0xff, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v48 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v37, v37, v51 -; SI-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v51, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_or_b32_e32 v51, v52, v51 -; SI-NEXT: v_or_b32_e32 v37, v37, v51 -; SI-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v37, 0xff, v55 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v37, v37, v51 -; SI-NEXT: v_and_b32_e32 v37, 0xffff, v37 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v10 -; SI-NEXT: v_or_b32_e32 v20, v48, v20 -; SI-NEXT: v_or_b32_e32 v20, v37, v20 -; SI-NEXT: v_add_i32_e32 v37, 
vcc, 4, v0 -; SI-NEXT: buffer_store_dword v20, v37, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v57 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v20, v20, v37 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v37, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v48, 24, v10 -; SI-NEXT: v_or_b32_e32 v37, v48, v37 -; SI-NEXT: v_or_b32_e32 v20, v20, v37 -; SI-NEXT: v_add_i32_e32 v37, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v20, v37, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v37, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v20, v20, v37 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v10 -; SI-NEXT: v_or_b32_e32 v18, v37, v18 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v10 -; SI-NEXT: v_or_b32_e32 v20, v37, v20 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s26, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v22 +; SI-NEXT: s_or_b32 s25, s26, s25 +; SI-NEXT: s_lshr_b32 s64, s25, 8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s26, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v23 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s27, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v46 +; 
SI-NEXT: s_or_b32 s26, s27, s26 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v18 +; SI-NEXT: v_bfe_u32 v18, v44, 8, 8 +; SI-NEXT: v_readfirstlane_b32 s27, v46 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_lshl_b32 s27, s27, 16 +; SI-NEXT: v_bfe_u32 v37, v46, 8, 8 +; SI-NEXT: v_readfirstlane_b32 s28, v14 +; SI-NEXT: s_or_b32 s27, s28, s27 +; SI-NEXT: s_lshr_b32 s66, s27, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 -; SI-NEXT: v_or_b32_e32 v15, v20, v15 -; SI-NEXT: v_or_b32_e32 v15, v18, v15 -; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s28, v14 +; SI-NEXT: s_lshl_b32 s28, s28, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s29, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v56 +; SI-NEXT: s_or_b32 s28, s29, s28 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 +; SI-NEXT: v_bfe_u32 v15, v5, 8, 8 +; SI-NEXT: v_readfirstlane_b32 s29, v56 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_lshl_b32 s29, s29, 16 +; SI-NEXT: v_bfe_u32 v38, v56, 8, 8 +; SI-NEXT: v_readfirstlane_b32 s40, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_or_b32 s29, s40, s29 +; SI-NEXT: s_lshr_b32 s68, s29, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s40, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_lshl_b32 s40, s40, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s41, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_or_b32 s40, s41, s40 +; 
SI-NEXT: v_readfirstlane_b32 s41, v57 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_lshl_b32 s41, s41, 16 +; SI-NEXT: v_readfirstlane_b32 s42, v14 +; SI-NEXT: s_or_b32 s41, s42, s41 +; SI-NEXT: s_lshr_b32 s69, s41, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s42, v14 +; SI-NEXT: s_lshl_b32 s42, s42, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v43 -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v28 -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s43, v14 +; SI-NEXT: s_or_b32 s42, s43, s42 +; SI-NEXT: v_readfirstlane_b32 s43, v12 +; SI-NEXT: s_lshl_b32 s43, s43, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s44, v14 +; SI-NEXT: s_or_b32 s43, s44, s43 +; SI-NEXT: s_lshr_b32 s70, s43, 8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s44, v14 +; SI-NEXT: s_lshl_b32 s44, s44, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_bfe_u32 v47, v1, 8, 8 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s45, v14 +; 
SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: s_or_b32 s44, s45, s44 +; SI-NEXT: v_readfirstlane_b32 s45, v13 +; SI-NEXT: s_lshl_b32 s45, s45, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_readfirstlane_b32 s46, v14 +; SI-NEXT: s_or_b32 s45, s46, s45 +; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 24 +; SI-NEXT: v_writelane_b32 v62, s46, 4 +; SI-NEXT: v_writelane_b32 v62, s47, 5 +; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 16 +; SI-NEXT: v_writelane_b32 v62, s46, 2 +; SI-NEXT: v_writelane_b32 v62, s47, 3 +; SI-NEXT: s_lshr_b64 s[46:47], s[44:45], 8 +; SI-NEXT: v_writelane_b32 v62, s46, 0 +; SI-NEXT: v_writelane_b32 v62, s47, 1 +; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 24 +; SI-NEXT: v_writelane_b32 v62, s46, 10 +; SI-NEXT: v_writelane_b32 v62, s47, 11 +; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 16 +; SI-NEXT: v_writelane_b32 v62, s46, 8 +; SI-NEXT: v_writelane_b32 v62, s47, 9 +; SI-NEXT: s_lshr_b64 s[46:47], s[42:43], 8 +; SI-NEXT: v_writelane_b32 v62, s46, 6 +; SI-NEXT: v_writelane_b32 v62, s47, 7 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 24 +; SI-NEXT: v_writelane_b32 v62, s46, 16 +; SI-NEXT: v_writelane_b32 v62, s47, 17 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v62, s46, 14 +; SI-NEXT: v_writelane_b32 v62, s47, 15 +; SI-NEXT: s_lshr_b64 s[46:47], s[40:41], 8 +; SI-NEXT: v_writelane_b32 v62, s46, 12 +; SI-NEXT: v_writelane_b32 v62, s47, 13 +; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 24 +; SI-NEXT: v_writelane_b32 v62, s46, 22 +; SI-NEXT: v_writelane_b32 v62, s47, 23 +; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 16 +; SI-NEXT: v_writelane_b32 v62, s46, 20 +; SI-NEXT: v_writelane_b32 v62, s47, 21 +; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 8 +; SI-NEXT: v_writelane_b32 v62, s46, 18 +; SI-NEXT: v_writelane_b32 v62, s47, 19 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 24 +; SI-NEXT: v_writelane_b32 v62, s46, 28 +; SI-NEXT: v_writelane_b32 v62, s47, 29 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 +; SI-NEXT: v_writelane_b32 v62, s46, 26 +; SI-NEXT: v_writelane_b32 v62, s47, 27 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 +; SI-NEXT: v_writelane_b32 v62, s46, 24 +; SI-NEXT: v_writelane_b32 v62, s47, 25 +; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 24 +; SI-NEXT: v_writelane_b32 v62, s46, 34 +; SI-NEXT: v_writelane_b32 v62, s47, 35 +; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 16 +; SI-NEXT: v_writelane_b32 v62, s46, 32 +; SI-NEXT: v_writelane_b32 v62, s47, 33 +; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 8 +; SI-NEXT: v_writelane_b32 v62, s46, 30 +; SI-NEXT: v_writelane_b32 v62, s47, 31 +; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 24 +; SI-NEXT: v_writelane_b32 v62, s46, 40 +; SI-NEXT: v_writelane_b32 v62, s47, 41 +; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 16 +; SI-NEXT: v_writelane_b32 v62, s46, 38 +; SI-NEXT: v_writelane_b32 v62, s47, 39 +; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 8 +; SI-NEXT: v_writelane_b32 v62, s46, 36 +; SI-NEXT: v_writelane_b32 v62, s47, 37 +; SI-NEXT: s_lshr_b64 s[46:47], s[20:21], 24 +; SI-NEXT: v_writelane_b32 v62, s46, 44 +; SI-NEXT: v_writelane_b32 v62, s47, 45 +; SI-NEXT: s_lshr_b64 
s[46:47], s[20:21], 16 +; SI-NEXT: v_writelane_b32 v62, s46, 42 +; SI-NEXT: v_writelane_b32 v62, s47, 43 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 48 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 49 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 +; SI-NEXT: v_writelane_b32 v62, vcc_lo, 46 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v62, vcc_hi, 47 +; SI-NEXT: s_lshr_b32 s71, s45, 8 +; SI-NEXT: .LBB95_5: ; %end +; SI-NEXT: v_readlane_b32 vcc_lo, v62, 0 +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 1 +; SI-NEXT: s_lshl_b32 s47, vcc_lo, 8 +; SI-NEXT: v_readlane_b32 vcc_lo, v62, 2 +; SI-NEXT: s_and_b32 s44, s44, 0xff +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 3 +; SI-NEXT: s_or_b32 s44, s44, s47 +; SI-NEXT: s_and_b32 s47, vcc_lo, 0xff +; SI-NEXT: v_readlane_b32 vcc_lo, v62, 4 +; SI-NEXT: s_lshl_b32 s57, vcc_lo, 24 +; SI-NEXT: s_lshl_b32 s47, s47, 16 +; SI-NEXT: s_or_b32 s47, s57, s47 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: s_or_b32 s44, s44, s47 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mov_b32_e32 v16, s44 +; SI-NEXT: s_and_b32 s44, s45, 0xff +; SI-NEXT: s_lshl_b32 s45, s71, 8 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: s_or_b32 s44, s44, s45 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v24 +; SI-NEXT: s_and_b32 s44, s44, 0xffff +; SI-NEXT: v_or_b32_e32 v13, v21, v13 +; SI-NEXT: v_or_b32_e32 v13, s44, v13 +; SI-NEXT: v_readlane_b32 s44, v62, 6 +; SI-NEXT: v_readlane_b32 s45, v62, 7 +; SI-NEXT: s_lshl_b32 s44, s44, 8 +; SI-NEXT: s_and_b32 s42, s42, 0xff +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 5 +; SI-NEXT: s_or_b32 s42, s42, s44 +; SI-NEXT: v_readlane_b32 s44, v62, 8 +; SI-NEXT: v_readlane_b32 s45, v62, 9 +; SI-NEXT: s_and_b32 s44, s44, 0xff +; SI-NEXT: v_readlane_b32 vcc_lo, v62, 10 +; SI-NEXT: s_lshl_b32 s45, vcc_lo, 24 +; SI-NEXT: s_lshl_b32 s44, s44, 16 +; SI-NEXT: s_or_b32 s44, s45, s44 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: s_or_b32 s42, s42, s44 +; SI-NEXT: v_mov_b32_e32 v21, s42 +; SI-NEXT: s_and_b32 s42, s43, 0xff +; SI-NEXT: s_lshl_b32 s43, s70, 8 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 +; SI-NEXT: s_or_b32 s42, s42, s43 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v48 +; SI-NEXT: s_and_b32 s42, s42, 0xffff +; SI-NEXT: v_or_b32_e32 v12, v23, v12 +; SI-NEXT: v_or_b32_e32 v12, s42, v12 +; SI-NEXT: v_readlane_b32 s42, v62, 12 +; SI-NEXT: v_readlane_b32 s43, v62, 13 +; SI-NEXT: s_lshl_b32 s42, s42, 8 +; SI-NEXT: s_and_b32 s40, s40, 0xff +; SI-NEXT: s_or_b32 s40, s40, s42 +; SI-NEXT: v_readlane_b32 s42, v62, 14 +; SI-NEXT: v_readlane_b32 s43, v62, 15 +; SI-NEXT: s_and_b32 s42, s42, 0xff +; SI-NEXT: v_readlane_b32 s44, v62, 16 +; SI-NEXT: s_lshl_b32 s43, s44, 24 +; SI-NEXT: s_lshl_b32 s42, s42, 16 +; SI-NEXT: s_or_b32 s42, s43, s42 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: s_or_b32 s40, s40, s42 +; SI-NEXT: v_mov_b32_e32 v23, s40 +; SI-NEXT: s_and_b32 s40, s41, 0xff +; SI-NEXT: s_lshl_b32 s41, s69, 8 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v57 +; SI-NEXT: s_or_b32 s40, s40, s41 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v50 +; SI-NEXT: s_and_b32 s40, s40, 0xffff +; SI-NEXT: v_or_b32_e32 v11, v24, v11 +; SI-NEXT: v_or_b32_e32 v11, s40, v11 +; SI-NEXT: v_readlane_b32 s40, v62, 18 +; SI-NEXT: v_readlane_b32 s41, v62, 19 +; SI-NEXT: s_lshl_b32 s40, s40, 8 +; SI-NEXT: s_and_b32 s28, s28, 0xff +; SI-NEXT: s_or_b32 s28, s28, s40 +; SI-NEXT: v_readlane_b32 s40, v62, 20 +; SI-NEXT: v_readlane_b32 s41, v62, 21 +; SI-NEXT: s_and_b32 s40, s40, 
0xff +; SI-NEXT: v_readlane_b32 s42, v62, 22 +; SI-NEXT: s_lshl_b32 s41, s42, 24 +; SI-NEXT: s_lshl_b32 s40, s40, 16 +; SI-NEXT: s_or_b32 s40, s41, s40 +; SI-NEXT: s_and_b32 s28, s28, 0xffff +; SI-NEXT: s_or_b32 s28, s28, s40 +; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: s_and_b32 s28, s29, 0xff +; SI-NEXT: s_lshl_b32 s29, s68, 8 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v56 +; SI-NEXT: s_or_b32 s28, s28, s29 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v38 +; SI-NEXT: s_and_b32 s28, s28, 0xffff +; SI-NEXT: v_or_b32_e32 v25, v26, v25 +; SI-NEXT: v_or_b32_e32 v25, s28, v25 +; SI-NEXT: v_readlane_b32 s28, v62, 24 +; SI-NEXT: v_readlane_b32 s29, v62, 25 +; SI-NEXT: s_lshl_b32 s28, s28, 8 +; SI-NEXT: s_and_b32 s26, s26, 0xff +; SI-NEXT: s_or_b32 s26, s26, s28 +; SI-NEXT: v_readlane_b32 s28, v62, 26 +; SI-NEXT: v_readlane_b32 s29, v62, 27 +; SI-NEXT: s_and_b32 s28, s28, 0xff +; SI-NEXT: v_readlane_b32 s40, v62, 28 +; SI-NEXT: s_lshl_b32 s29, s40, 24 +; SI-NEXT: s_lshl_b32 s28, s28, 16 +; SI-NEXT: s_or_b32 s28, s29, s28 +; SI-NEXT: s_and_b32 s26, s26, 0xffff +; SI-NEXT: s_or_b32 s26, s26, s28 +; SI-NEXT: v_mov_b32_e32 v26, s26 +; SI-NEXT: s_and_b32 s26, s27, 0xff +; SI-NEXT: s_lshl_b32 s27, s66, 8 +; SI-NEXT: v_and_b32_e32 v27, 0xff, v46 +; SI-NEXT: s_or_b32 s26, s26, s27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v37 +; SI-NEXT: s_and_b32 s26, s26, 0xffff +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_or_b32_e32 v27, s26, v27 +; SI-NEXT: v_readlane_b32 s26, v62, 30 +; SI-NEXT: v_readlane_b32 s27, v62, 31 +; SI-NEXT: s_lshl_b32 s26, s26, 8 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: s_or_b32 s24, s24, s26 +; SI-NEXT: v_readlane_b32 s26, v62, 32 +; SI-NEXT: v_readlane_b32 s27, v62, 33 +; SI-NEXT: s_and_b32 s26, s26, 0xff +; SI-NEXT: v_readlane_b32 s28, v62, 34 +; SI-NEXT: s_lshl_b32 s27, s28, 24 +; SI-NEXT: s_lshl_b32 s26, s26, 16 +; SI-NEXT: v_readlane_b32 vcc_hi, v62, 11 +; SI-NEXT: s_or_b32 s26, s27, s26 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: s_or_b32 s24, s24, s26 +; SI-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v42 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v16, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v28, s24 +; SI-NEXT: s_and_b32 s24, s25, 0xff +; SI-NEXT: s_lshl_b32 s25, s64, 8 +; SI-NEXT: v_and_b32_e32 v29, 0xff, v30 +; SI-NEXT: buffer_store_dword v13, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_and_b32_e32 v18, 0xff, 
v31 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v0 +; SI-NEXT: s_or_b32 s24, s24, s25 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v35 +; SI-NEXT: buffer_store_dword v21, v13, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v0 +; SI-NEXT: s_and_b32 s24, s24, 0xffff +; SI-NEXT: v_or_b32_e32 v29, v30, v29 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v22 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xff, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v10 -; SI-NEXT: v_or_b32_e32 v18, v20, v18 -; SI-NEXT: v_or_b32_e32 v15, v15, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v12, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v29, s24, v29 +; SI-NEXT: buffer_store_dword v23, v12, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 20, v0 +; SI-NEXT: v_readlane_b32 s24, v62, 36 +; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 +; SI-NEXT: v_readlane_b32 s25, v62, 37 +; SI-NEXT: s_lshl_b32 s24, s24, 8 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: buffer_store_dword v24, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 +; SI-NEXT: s_or_b32 s22, s22, s24 +; SI-NEXT: v_readlane_b32 s24, v62, 38 +; SI-NEXT: buffer_store_dword v25, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: v_readlane_b32 s25, v62, 39 +; SI-NEXT: s_and_b32 s24, s24, 0xff +; SI-NEXT: v_readlane_b32 s26, v62, 40 +; SI-NEXT: buffer_store_dword v26, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; SI-NEXT: s_lshl_b32 s24, s24, 16 +; SI-NEXT: s_lshl_b32 s25, s26, 24 +; SI-NEXT: buffer_store_dword v27, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: s_or_b32 s24, s25, s24 +; SI-NEXT: buffer_store_dword v28, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 44, v0 +; SI-NEXT: s_or_b32 s22, s22, s24 +; SI-NEXT: buffer_store_dword v29, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 +; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: s_and_b32 s22, s23, 0xff +; SI-NEXT: s_lshl_b32 s23, s54, 8 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v32 -; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 
offen +; SI-NEXT: s_or_b32 s22, s22, s23 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v13 -; SI-NEXT: v_or_b32_e32 v10, v18, v10 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v10, v15, v10 -; SI-NEXT: v_add_i32_e32 v15, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v10, v15, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v10, v10, v15 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v13 -; SI-NEXT: v_or_b32_e32 v15, v18, v15 -; SI-NEXT: v_or_b32_e32 v10, v10, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v10, v15, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v43 +; SI-NEXT: s_and_b32 s22, s22, 0xffff +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_or_b32_e32 v10, s22, v10 +; SI-NEXT: s_and_b32 s20, s20, 0xff +; SI-NEXT: s_lshl_b32 s22, s62, 8 +; SI-NEXT: s_or_b32 s20, s20, s22 +; SI-NEXT: v_readlane_b32 s22, v62, 42 +; SI-NEXT: v_readlane_b32 s23, v62, 43 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: v_readlane_b32 s24, v62, 44 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s23, s24, 24 +; SI-NEXT: s_and_b32 s20, s20, 0xffff +; SI-NEXT: s_or_b32 s22, s23, s22 +; SI-NEXT: v_add_i32_e32 v11, vcc, 52, v0 +; SI-NEXT: s_or_b32 s20, s20, s22 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v29 -; SI-NEXT: v_or_b32_e32 v10, v10, v15 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v13 -; SI-NEXT: v_or_b32_e32 v9, v15, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 +; SI-NEXT: v_mov_b32_e32 v11, s20 +; SI-NEXT: s_and_b32 s20, s21, 0xff +; SI-NEXT: s_lshl_b32 s21, s52, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v31 +; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s20, s20, s21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v20 +; SI-NEXT: s_and_b32 s20, s20, 0xffff ; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v9, s20, v9 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s20, s58, 8 +; SI-NEXT: s_or_b32 s18, s18, s20 +; SI-NEXT: s_and_b32 s20, s98, 0xff +; SI-NEXT: s_lshl_b32 s20, s20, 16 +; SI-NEXT: s_lshl_b32 s21, s96, 24 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s20, s21, s20 ; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 +; SI-NEXT: s_or_b32 s18, s18, s20 ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v3 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; 
SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v13 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v15, v10 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v59 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v3 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xff +; SI-NEXT: s_lshl_b32 s19, s50, 8 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v54 +; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s18, s18, s19 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v19 +; SI-NEXT: s_and_b32 s18, s18, 0xffff ; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: v_or_b32_e32 v8, s18, v8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s18, s38, 8 +; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: s_and_b32 s18, s36, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s34, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s18, s19, s18 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s16, s16, s18 ; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v49 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v4 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v30 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 +; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xff +; SI-NEXT: s_lshl_b32 s17, s48, 8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v44 +; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: 
v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v7, s16, v7 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s16, s30, 8 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_and_b32 s16, s94, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s92, 24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v36 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v4 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v4 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v3 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v6, v8, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xff +; SI-NEXT: s_lshl_b32 s15, s67, 8 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v42 +; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v6, s14, v6 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s14, s90, 8 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_and_b32 s14, s88, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s78, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v35 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v4 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v3 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xff +; SI-NEXT: s_lshl_b32 s13, s65, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v15 +; SI-NEXT: s_and_b32 s12, s12, 0xffff ; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v5, s12, v5 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s12, s76, 8 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s12, s74, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s13, s72, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v61 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v4 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xff +; SI-NEXT: s_lshl_b32 s11, s55, 8 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v55 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v45 +; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v4, s10, v4 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s10, s60, 8 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_and_b32 
s10, s56, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s11, s46, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x64, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v40 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v58 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xff +; SI-NEXT: s_lshl_b32 s9, s53, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v61 +; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v2, s8, v2 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s8, s86, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s84, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s82, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v53 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v50 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v54 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_or_b32_e32 v1, v4, 
v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: s_lshl_b32 s7, s51, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v47 +; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, s6, v1 +; SI-NEXT: v_readlane_b32 s6, v62, 46 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: v_readlane_b32 s7, v62, 47 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s80, 0xff +; SI-NEXT: v_readlane_b32 s8, v62, 48 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s8, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v38 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s4, s5, 0xff +; SI-NEXT: s_lshl_b32 s5, s49, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v33 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s45, v62, 17 +; SI-NEXT: v_readlane_b32 s43, v62, 23 +; SI-NEXT: v_readlane_b32 s41, v62, 29 +; SI-NEXT: v_readlane_b32 s29, v62, 35 +; SI-NEXT: v_readlane_b32 s27, v62, 41 +; SI-NEXT: v_readlane_b32 s25, v62, 45 +; SI-NEXT: v_readlane_b32 s9, v62, 49 +; SI-NEXT: v_readlane_b32 s99, v63, 35 +; SI-NEXT: v_readlane_b32 s98, v63, 34 +; SI-NEXT: v_readlane_b32 s97, v63, 33 +; SI-NEXT: v_readlane_b32 s96, v63, 32 +; SI-NEXT: v_readlane_b32 s87, v63, 31 +; SI-NEXT: v_readlane_b32 s86, v63, 30 +; SI-NEXT: v_readlane_b32 s85, v63, 29 +; SI-NEXT: v_readlane_b32 s84, v63, 28 +; SI-NEXT: v_readlane_b32 s83, v63, 27 +; SI-NEXT: v_readlane_b32 s82, v63, 26 +; SI-NEXT: v_readlane_b32 s81, v63, 25 +; SI-NEXT: v_readlane_b32 s80, v63, 24 +; SI-NEXT: v_readlane_b32 s71, v63, 23 +; SI-NEXT: v_readlane_b32 s70, v63, 22 +; SI-NEXT: v_readlane_b32 s69, v63, 21 +; SI-NEXT: v_readlane_b32 s68, v63, 20 +; SI-NEXT: v_readlane_b32 s67, v63, 19 +; SI-NEXT: v_readlane_b32 s66, v63, 18 +; SI-NEXT: v_readlane_b32 s65, v63, 17 +; SI-NEXT: v_readlane_b32 s64, v63, 16 +; SI-NEXT: v_readlane_b32 s55, v63, 15 +; SI-NEXT: v_readlane_b32 s54, v63, 14 +; SI-NEXT: v_readlane_b32 s53, v63, 13 +; SI-NEXT: v_readlane_b32 s52, v63, 12 +; SI-NEXT: v_readlane_b32 s51, v63, 11 +; SI-NEXT: v_readlane_b32 s50, v63, 10 +; SI-NEXT: v_readlane_b32 s49, v63, 9 +; SI-NEXT: v_readlane_b32 s48, v63, 8 +; SI-NEXT: v_readlane_b32 s39, v63, 7 +; SI-NEXT: v_readlane_b32 s38, v63, 6 +; SI-NEXT: v_readlane_b32 s37, v63, 5 +; SI-NEXT: v_readlane_b32 s36, v63, 4 +; SI-NEXT: v_readlane_b32 s35, v63, 3 +; SI-NEXT: v_readlane_b32 s34, v63, 2 +; SI-NEXT: v_readlane_b32 s31, v63, 1 +; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: 
buffer_load_dword v63, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -188114,113 +189875,105 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: .LBB95_2: ; %cmp.true ; VI-NEXT: s_lshr_b32 s46, s45, 16 ; VI-NEXT: v_mov_b32_e32 v7, 0x200 -; VI-NEXT: v_add_f16_e32 v1, s46, v7 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v11, s46, v7 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 ; VI-NEXT: v_add_f16_e32 v2, s45, v7 ; VI-NEXT: s_lshr_b32 s45, s44, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v23, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s45, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v40, s45, v7 ; VI-NEXT: v_add_f16_e32 v2, s44, v7 ; VI-NEXT: s_lshr_b32 s44, s43, 16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; VI-NEXT: v_add_f16_e32 v43, s44, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v22, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s44, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 ; VI-NEXT: v_add_f16_e32 v2, s43, v7 ; VI-NEXT: s_lshr_b32 s43, s42, 16 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v25, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s43, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_add_f16_e32 v54, s43, v7 ; VI-NEXT: v_add_f16_e32 v2, s42, v7 ; VI-NEXT: s_lshr_b32 s42, s41, 16 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; VI-NEXT: v_add_f16_e32 v37, s42, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v24, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s42, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 ; VI-NEXT: v_add_f16_e32 v2, s41, v7 ; VI-NEXT: s_lshr_b32 s41, s40, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v27, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s41, v7 +; VI-NEXT: v_add_f16_e32 v49, s41, v7 ; VI-NEXT: v_add_f16_e32 v2, s40, v7 ; VI-NEXT: s_lshr_b32 s40, s15, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v53, s40, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; VI-NEXT: v_add_f16_e32 
v52, s40, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v26, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 ; VI-NEXT: v_add_f16_e32 v2, s15, v7 ; VI-NEXT: s_lshr_b32 s15, s14, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v29, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s15, v7 +; VI-NEXT: v_add_f16_e32 v55, s15, v7 ; VI-NEXT: v_add_f16_e32 v2, s14, v7 ; VI-NEXT: s_lshr_b32 s14, s13, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v43, s14, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v55 +; VI-NEXT: v_add_f16_e32 v53, s14, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v28, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v53 ; VI-NEXT: v_add_f16_e32 v2, s13, v7 ; VI-NEXT: s_lshr_b32 s13, s12, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v6, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s13, v7 ; VI-NEXT: v_add_f16_e32 v2, s12, v7 ; VI-NEXT: s_lshr_b32 s12, s11, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v37, s12, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: v_add_f16_e32 v39, s12, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v5, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v37 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 ; VI-NEXT: v_add_f16_e32 v2, s11, v7 ; VI-NEXT: s_lshr_b32 s11, s10, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v31, v2, v1 -; VI-NEXT: v_add_f16_e32 v1, s11, v7 +; VI-NEXT: v_add_f16_e32 v60, s11, v7 ; VI-NEXT: v_add_f16_e32 v2, s10, v7 ; VI-NEXT: s_lshr_b32 s10, s9, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; VI-NEXT: v_add_f16_e32 v52, s10, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v60 +; VI-NEXT: v_add_f16_e32 v48, s10, v7 +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v30, v2, v1 -; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 ; VI-NEXT: v_add_f16_e32 v2, s9, v7 ; VI-NEXT: s_lshr_b32 s9, s8, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v4, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s9, v7 ; VI-NEXT: v_add_f16_e32 v2, s8, v7 ; VI-NEXT: s_lshr_b32 s8, s7, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded 
Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v50, s8, v7 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v3, v2, v1 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 ; VI-NEXT: v_add_f16_e32 v2, s7, v7 ; VI-NEXT: s_lshr_b32 s7, s6, 16 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v2, v2, v1 ; VI-NEXT: v_add_f16_e32 v1, s7, v7 ; VI-NEXT: v_add_f16_e32 v8, s6, v7 ; VI-NEXT: s_lshr_b32 s6, s17, 16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_add_f16_e32 v36, s6, v7 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v1, v8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 ; VI-NEXT: v_add_f16_e32 v9, s17, v7 @@ -188228,12 +189981,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v33, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s6, v7 ; VI-NEXT: s_lshr_b32 s6, s19, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s16, v7 ; VI-NEXT: v_add_f16_e32 v38, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v32, v9, v8 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 ; VI-NEXT: v_add_f16_e32 v9, s19, v7 @@ -188241,12 +189994,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v21, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s6, v7 ; VI-NEXT: s_lshr_b32 s6, s21, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s18, v7 ; VI-NEXT: v_add_f16_e32 v61, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v20, v9, v8 ; VI-NEXT: s_lshr_b32 s7, s20, 16 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v61 @@ -188254,12 +190007,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v35, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s7, v7 ; VI-NEXT: s_lshr_b32 s6, s23, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s20, v7 ; VI-NEXT: v_add_f16_e32 v45, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v34, v9, v8 ; VI-NEXT: s_lshr_b32 s7, s22, 16 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v45 @@ -188267,12 +190020,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v19, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s7, v7 ; VI-NEXT: s_lshr_b32 s6, s25, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s22, v7 ; VI-NEXT: v_add_f16_e32 v47, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v18, v9, v8 ; VI-NEXT: s_lshr_b32 s7, s24, 16 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v47 @@ -188280,12 +190033,12 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v16, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s7, v7 ; VI-NEXT: s_lshr_b32 s6, s27, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s24, v7 ; VI-NEXT: v_add_f16_e32 v57, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v15, v9, v8 ; VI-NEXT: s_lshr_b32 s7, s26, 16 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v57 @@ -188293,112 +190046,116 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_e32 v13, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s7, v7 ; VI-NEXT: s_lshr_b32 s6, s29, 16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s26, v7 ; VI-NEXT: v_add_f16_e32 v59, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v12, v9, v8 ; VI-NEXT: s_lshr_b32 s7, s28, 16 ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v59 ; VI-NEXT: v_add_f16_e32 v9, s29, v7 -; VI-NEXT: s_lshr_b32 s6, 
s5, 16 ; VI-NEXT: v_or_b32_e32 v10, v9, v8 ; VI-NEXT: v_add_f16_e32 v8, s7, v7 -; VI-NEXT: s_lshr_b32 s7, s4, 16 -; VI-NEXT: v_add_f16_e32 v51, s6, v7 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: s_lshr_b32 s6, s5, 16 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; VI-NEXT: v_add_f16_e32 v9, s28, v7 -; VI-NEXT: v_add_f16_e32 v54, s5, v7 -; VI-NEXT: v_add_f16_e32 v11, s7, v7 -; VI-NEXT: v_add_f16_e32 v55, s4, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v51 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: s_lshr_b32 s7, s4, 16 +; VI-NEXT: v_add_f16_e32 v51, s6, v7 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v9, v9, v8 -; VI-NEXT: v_or_b32_e32 v8, v54, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v7, v55, v7 -; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v8 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v7 +; VI-NEXT: v_add_f16_e32 v8, s5, v7 +; VI-NEXT: v_add_f16_e32 v14, s7, v7 +; VI-NEXT: v_add_f16_e32 v17, s4, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v51 +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v8, v8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v14 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v7, v17, v7 +; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v8 +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v14, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v9 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v13 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v10 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v9 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v13 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v12 +; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13] +; VI-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: 
v_lshrrev_b32_e32 v12, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[1:2] -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v16 +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v19 +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4 -; VI-NEXT: v_lshrrev_b32_e32 v39, 8, v16 -; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3 -; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[12:13] -; VI-NEXT: v_lshrrev_b32_e32 v44, 8, v18 -; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[3:4] -; VI-NEXT: v_lshrrev_b32_e32 v49, 8, v15 -; VI-NEXT: v_lshrrev_b32_e32 v40, 8, v19 -; VI-NEXT: v_lshrrev_b32_e32 v60, 8, v35 +; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] +; VI-NEXT: v_lshrrev_b64 v[16:17], 24, v[15:16] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v18 +; VI-NEXT: v_lshrrev_b64 v[17:18], 24, v[18:19] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 +; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v35 ; VI-NEXT: v_lshrrev_b64 v[18:19], 24, v[34:35] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 ; VI-NEXT: v_lshrrev_b32_e32 v35, 8, v23 ; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[22:23] +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v23, v50, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v23, v52, 8, 8 -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v31 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v23, v37, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v23, v43, 8, 8 -; VI-NEXT: v_lshrrev_b64 v[10:11], 24, v[9:10] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v50, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: v_bfe_u32 v23, v53, 8, 8 -; VI-NEXT: v_lshrrev_b32_e32 v48, 8, v12 -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v34 -; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v24 -; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[24:25] -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v50, v11 +; VI-NEXT: v_bfe_u32 v11, v48, 8, 8 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[30:31] +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v6 +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v11, v39, 8, 8 ; VI-NEXT: v_lshrrev_b32_e32 v56, 8, v20 ; VI-NEXT: v_lshrrev_b64 v[19:20], 24, v[20:21] -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v5 ; VI-NEXT: v_lshrrev_b64 v[4:5], 24, v[5:6] +; VI-NEXT: v_lshrrev_b64 v[30:31], 24, v[26:27] +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v11, v53, 8, 8 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v34 ; VI-NEXT: v_lshrrev_b32_e32 v41, 8, v21 ; VI-NEXT: v_lshrrev_b64 v[20:21], 24, v[32:33] -; VI-NEXT: v_lshrrev_b32_e32 v30, 8, v6 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[28:29] -; VI-NEXT: v_lshrrev_b64 v[8:9], 24, v[26:27] +; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v25 +; VI-NEXT: v_lshrrev_b32_e32 v34, 8, v24 +; VI-NEXT: v_lshrrev_b64 v[24:25], 24, v[24:25] +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v11, v52, 8, 8 ; VI-NEXT: v_lshrrev_b32_e32 v46, 8, v33 ; VI-NEXT: v_lshrrev_b32_e32 v58, 8, v32 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v29 ; VI-NEXT: v_lshrrev_b32_e32 v33, 8, v28 ; VI-NEXT: v_lshrrev_b32_e32 v28, 8, v27 ; VI-NEXT: v_lshrrev_b32_e32 v29, 8, v26 -; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v25 ; VI-NEXT: v_lshrrev_b32_e32 v21, 8, v22 ; VI-NEXT: v_bfe_u32 v25, v51, 8, 8 ; VI-NEXT: v_bfe_u32 v27, v59, 8, 8 @@ -188408,12 +190165,11 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_bfe_u32 v1, v61, 8, 8 ; VI-NEXT: v_bfe_u32 v22, v38, 8, 8 ; VI-NEXT: v_bfe_u32 v2, v36, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_bfe_u32 v44, v37, 8, 8 +; VI-NEXT: v_bfe_u32 v11, v43, 8, 8 ; VI-NEXT: v_bfe_u32 v26, v50, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_bfe_u32 v23, v23, 8, 8 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bfe_u32 v24, v24, 8, 8 +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: s_branch .LBB95_5 ; VI-NEXT: .LBB95_3: ; VI-NEXT: ; implicit-def: $sgpr46 @@ -188573,133 +190329,120 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: s_branch .LBB95_2 ; VI-NEXT: .LBB95_4: ; 
VI-NEXT: v_mov_b32_e32 v1, s44 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s45 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s42 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s43 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s40 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s41 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s14 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s15 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s12 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s13 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s10 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s11 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s16 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s17 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s18 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s19 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s20 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s71 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s69 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s68 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s67 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s66 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s65 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s64 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s55 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s87 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 
; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s85 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s53 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s52 ; VI-NEXT: v_readlane_b32 s6, v62, 0 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 1 ; VI-NEXT: v_mov_b32_e32 v36, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 3 ; VI-NEXT: v_mov_b32_e32 v38, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 4 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 5 ; VI-NEXT: v_mov_b32_e32 v61, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 6 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 7 ; VI-NEXT: v_mov_b32_e32 v45, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 9 ; VI-NEXT: v_mov_b32_e32 v47, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 10 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 11 ; VI-NEXT: v_mov_b32_e32 v57, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 12 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 13 -; VI-NEXT: v_mov_b32_e32 v55, s4 -; VI-NEXT: v_readlane_b32 s4, v62, 16 ; VI-NEXT: v_mov_b32_e32 v59, s6 ; VI-NEXT: v_readlane_b32 s6, v62, 14 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_readlane_b32 s4, v62, 16 ; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 17 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v22, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 18 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded 
Spill ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 19 ; VI-NEXT: v_mov_b32_e32 v15, s4 @@ -188714,45 +190457,48 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_readlane_b32 s4, v62, 24 ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 25 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 26 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s80 ; VI-NEXT: v_mov_b32_e32 v46, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 27 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s81 ; VI-NEXT: v_mov_b32_e32 v41, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 28 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s80 -; VI-NEXT: v_mov_b32_e32 v60, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s82 +; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 29 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s81 -; VI-NEXT: v_mov_b32_e32 v40, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 30 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s82 -; VI-NEXT: v_mov_b32_e32 v39, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 31 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 32 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 33 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 34 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 35 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 36 -; VI-NEXT: v_mov_b32_e32 v48, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 37 -; VI-NEXT: v_mov_b32_e32 v49, s4 +; VI-NEXT: v_mov_b32_e32 v7, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 38 -; VI-NEXT: v_mov_b32_e32 v44, s4 +; VI-NEXT: v_mov_b32_e32 v10, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 39 ; VI-NEXT: v_mov_b32_e32 v42, s4 ; VI-NEXT: 
v_readlane_b32 s4, v62, 40 @@ -188760,31 +190506,36 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_readlane_b32 s4, v62, 41 ; VI-NEXT: v_mov_b32_e32 v58, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 42 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 43 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 44 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 45 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 46 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 47 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, s4 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v3, s78 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 48 -; VI-NEXT: v_mov_b32_e32 v31, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 49 -; VI-NEXT: v_mov_b32_e32 v30, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s46 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v3, s78 +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_readlane_b32 s4, v62, 50 ; VI-NEXT: v_mov_b32_e32 v33, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 51 @@ -188796,22 +190547,29 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_readlane_b32 s4, v62, 54 ; VI-NEXT: v_mov_b32_e32 v34, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 55 -; VI-NEXT: v_mov_b32_e32 v9, s4 +; VI-NEXT: v_mov_b32_e32 v31, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 56 ; VI-NEXT: v_mov_b32_e32 v3, s88 ; VI-NEXT: v_readlane_b32 s6, v62, 15 ; VI-NEXT: v_mov_b32_e32 v21, s4 ; VI-NEXT: v_readlane_b32 s4, v62, 57 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill 
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v40, s71 ; VI-NEXT: v_mov_b32_e32 v50, s70 -; VI-NEXT: v_mov_b32_e32 v43, s54 -; VI-NEXT: v_mov_b32_e32 v37, s86 -; VI-NEXT: v_mov_b32_e32 v52, s84 +; VI-NEXT: v_mov_b32_e32 v54, s69 +; VI-NEXT: v_mov_b32_e32 v43, s68 +; VI-NEXT: v_mov_b32_e32 v49, s67 +; VI-NEXT: v_mov_b32_e32 v37, s66 +; VI-NEXT: v_mov_b32_e32 v55, s65 +; VI-NEXT: v_mov_b32_e32 v52, s64 +; VI-NEXT: v_mov_b32_e32 v53, s54 +; VI-NEXT: v_mov_b32_e32 v60, s87 +; VI-NEXT: v_mov_b32_e32 v39, s86 +; VI-NEXT: v_mov_b32_e32 v48, s84 ; VI-NEXT: v_mov_b32_e32 v51, s6 -; VI-NEXT: v_mov_b32_e32 v54, s5 -; VI-NEXT: v_mov_b32_e32 v23, s83 -; VI-NEXT: v_mov_b32_e32 v24, s50 +; VI-NEXT: v_mov_b32_e32 v44, s83 +; VI-NEXT: v_mov_b32_e32 v11, s50 ; VI-NEXT: v_mov_b32_e32 v26, s51 ; VI-NEXT: v_mov_b32_e32 v35, s4 ; VI-NEXT: v_mov_b32_e32 v20, s76 @@ -188820,16 +190578,15 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_mov_b32_e32 v17, s62 ; VI-NEXT: v_mov_b32_e32 v16, s60 ; VI-NEXT: v_mov_b32_e32 v13, s58 -; VI-NEXT: v_mov_b32_e32 v10, s56 -; VI-NEXT: v_mov_b32_e32 v7, s46 +; VI-NEXT: v_mov_b32_e32 v9, s56 ; VI-NEXT: v_mov_b32_e32 v3, s90 ; VI-NEXT: v_mov_b32_e32 v4, s30 ; VI-NEXT: v_mov_b32_e32 v5, s34 -; VI-NEXT: v_mov_b32_e32 v8, s36 -; VI-NEXT: v_mov_b32_e32 v11, s38 +; VI-NEXT: v_mov_b32_e32 v30, s36 +; VI-NEXT: v_mov_b32_e32 v24, s38 ; VI-NEXT: v_mov_b32_e32 v14, s48 ; VI-NEXT: .LBB95_5: ; %end -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 @@ -188871,30 +190628,30 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_readlane_b32 s31, v63, 1 ; VI-NEXT: v_readlane_b32 s30, v63, 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v58, v53, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v58, v23, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v20, v53, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v20, v23, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v58, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v20, v0, s[0:3], 0 offen ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v46 -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v20, v46, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v20, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v2, v20, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v56 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v20, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v19, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v41 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -188903,36 +190660,38 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v2, v2, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v19, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v2, v19, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v42 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v19, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v2, v2, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v18, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v2, v18, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v60 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v8 +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v44 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v17 -; VI-NEXT: buffer_load_dword v17, off, 
s[0:3], s32 offset:236 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v17, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v8, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v40 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v15 @@ -188940,19 +190699,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v49 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v16 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v15, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v39 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v12 @@ -188960,19 +190721,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v48 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:184 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v12, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -188982,21 +190745,21 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v9 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -189006,38 +190769,43 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, 
s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v25 ; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v21 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v40, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v35 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -189046,130 +190814,121 @@ define 
inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v24 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 +; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v8 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v30 +; VI-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte 
Folded Reload -; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v28 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v23 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 +; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v33 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v4 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v30 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v43, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v53, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: 
v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v37, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -189177,28 +190936,28 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v48, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa 
v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -189206,15 +190965,15 @@ define inreg <128 x i8> @bitcast_v64f16_to_v128i8_scalar(<64 x half> inreg %a, i ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD @@ -191614,24 +193373,22 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded 
Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:392 -; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: v_mov_b32_e32 v57, v5 +; SI-NEXT: v_mov_b32_e32 v41, v3 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:392 ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 @@ -191648,160 +193405,164 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:164 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:188 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v14 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v12 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v30 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; 
SI-NEXT: v_lshlrev_b32_e32 v4, 24, v28 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v20 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; kill: killed $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:360 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v3 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 +; SI-NEXT: v_lshlrev_b32_e32 v36, 24, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v9 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v15 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_lshlrev_b32_e32 v3, 24, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v19 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v21 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v17 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v23 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v27 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v32 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 -; SI-NEXT: 
buffer_store_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:88 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v29 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v31 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:80 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:112 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:108 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v34 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v10 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; 
SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 @@ -191810,21 +193571,21 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:216 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:212 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:228 @@ -191833,23 +193594,23 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:248 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:244 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:240 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:796 ; 4-byte Folded 
Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:260 @@ -191858,23 +193619,23 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:272 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:268 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:292 @@ -191882,25 +193643,26 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v35, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:312 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:308 ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:304 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v9 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:916 ; 
4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v11 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:300 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:324 @@ -191911,66 +193673,65 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v63, 8, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:344 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:924 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v60, 24, v9 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:356 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:380 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:352 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:376 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:368 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 -; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:352 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:384 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:936 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:376 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:952 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v18 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:372 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:368 +; SI-NEXT: 
s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:956 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:364 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:388 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:384 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v26 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v31, 8, v15 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_lshlrev_b32_e32 v32, 24, v8 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v11 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(0) @@ -191978,658 +193739,661 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 
v2, 8, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v35 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:56 -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v5 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v6 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:960 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v12 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v12 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v13 +; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v15 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v13 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; kill: killed $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v14 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; kill: killed $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:944 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB96_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded 
Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v8, v16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v12, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v13, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v5, v2, v5 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v6, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v42, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v11, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v5, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v7, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v10, v22, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, 
off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v15, v24, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v17, v7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v18, v36, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v7, v16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v11, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v19, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v40, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v10, v24, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v15, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v20, v7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v26, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v18, v25, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; SI-NEXT: 
v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v19, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v30, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v30, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v21, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v21, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v29, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v23, v7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v33, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v33, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v27, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: 
v_or_b32_e32 v49, v7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v27, v2, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v38, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v50, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v38, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v52, v7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v52, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v14, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v1, v2 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v53, v2, v7 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:580 ; 4-byte 
Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v55, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v55, v7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v2, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v43, v3, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v14, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v44, v7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v44, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v16, v3, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v45, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v45, v3, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v25, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte 
Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v47, v7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v47, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v24, v3, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v58, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v58, v3, v7 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v16, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v59, v7, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v59, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v3, v34 +; SI-NEXT: v_or_b32_e32 v34, v6, v7 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v62, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v36, v6, v7 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:896 ; 
4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v24, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v48, v7, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v13, v1, v6 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v35, v6, v7 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v6, v6, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v54, v6, v7 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v23, v1, v8 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v41, v7, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v37, v37, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v36, v1, v8 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v57, v51, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v8, v8, v35 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v63, v7, v63 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v35, v37, v22 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v51, v56, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v37, v51, v22 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v56, v60, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v51, v22, v63 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v34, v56, v22 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v60, v7, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v7, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v9, v9, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v7, v7, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v26, v12, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v56, v60, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v54, v22, v4 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v9, v9, v22 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v57 +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v32, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v28, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v28, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v29, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v32, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v32, v12, v1 +; SI-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v39, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v62, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v41, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v46, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v57, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v61, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v60, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v3, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v63, v22, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v39, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v1, v22, v1 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: 
v_or_b32_e32 v48, v22, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v40, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v46, v22, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v31, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v49, v22, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v12, v12, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v50, v22, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v42, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v20 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v22, 0xff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v22, v22, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v53, v22, v53 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_or_b32_e32 v3, v22, v3 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v61 -; SI-NEXT: v_mov_b32_e32 v61, v42 -; SI-NEXT: v_or_b32_e32 v31, v22, v31 -; SI-NEXT: v_or_b32_e32 v22, v12, v61 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v28 -; SI-NEXT: v_or_b32_e32 v43, v12, v5 -; SI-NEXT: v_alignbit_b32 v5, v22, v5, 16 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v12, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v32 -; SI-NEXT: 
v_or_b32_e32 v7, v7, v11 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v32, v32, v59 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v25, v6, v13 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v5 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v7, v25, v5, 16 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v26 +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v5, v5, v12 -; SI-NEXT: v_alignbit_b32 v11, v5, v11, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v40 -; SI-NEXT: v_or_b32_e32 v42, v11, v10 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v40, v11, v15 -; SI-NEXT: v_alignbit_b32 v11, v42, v15, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v5, v5, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v26 -; SI-NEXT: v_or_b32_e32 v26, v11, v18 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v39, v11, v19 -; SI-NEXT: v_alignbit_b32 v11, v26, v19, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v6, v5, v11, 16 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v28, v11, v30 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v57 -; SI-NEXT: v_or_b32_e32 v11, v11, v21 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v6 +; SI-NEXT: v_or_b32_e32 v6, v11, v10 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v11, v6, v15, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v26, v11, v18 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v11, v11, v19 ; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v11, v26, v19, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v28, v11, v30 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v20, v11, v21 ; SI-NEXT: v_alignbit_b32 v11, v28, v21, 16 -; SI-NEXT: buffer_store_dword v11, off, 
s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v29 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v29, v11, v33 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v60 -; SI-NEXT: v_or_b32_e32 v21, v11, v27 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v23, v11, v27 ; SI-NEXT: v_alignbit_b32 v11, v29, v27, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v19, v11, v38 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v63 -; SI-NEXT: v_or_b32_e32 v27, v11, v52 -; SI-NEXT: v_alignbit_b32 v11, v19, v52, 16 -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v46 +; SI-NEXT: v_or_b32_e32 v27, v11, v50 +; SI-NEXT: v_alignbit_b32 v11, v19, v50, 16 +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v23 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v11, v11, v2 -; SI-NEXT: v_alignbit_b32 v1, v11, v55, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v11, v11, v14 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v21, v15, v53 +; SI-NEXT: v_alignbit_b32 v15, v11, v53, 16 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v55 +; SI-NEXT: v_or_b32_e32 v15, v15, v2 +; SI-NEXT: v_or_b32_e32 v46, v3, v43 +; SI-NEXT: v_alignbit_b32 v3, v15, v43, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v17, v3, v16 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v39, v3, v45 +; SI-NEXT: v_alignbit_b32 v3, v17, v45, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v15, v1, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v44 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v61, v3, v24 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v3, v3, v58 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v15, v44, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v3, v61, v58, 16 +; SI-NEXT: 
buffer_store_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; SI-NEXT: v_or_b32_e32 v17, v1, v25 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 -; SI-NEXT: v_or_b32_e32 v1, v1, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v17, v47, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v62, v3, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v31 +; SI-NEXT: v_or_b32_e32 v40, v3, v36 +; SI-NEXT: v_alignbit_b32 v3, v62, v36, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v59, v3, v35 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 +; SI-NEXT: v_or_b32_e32 v31, v3, v54 +; SI-NEXT: v_alignbit_b32 v3, v59, v54, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v47, v3, v37 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v25, v3, v57 +; SI-NEXT: v_alignbit_b32 v3, v47, v57, 16 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v16 -; SI-NEXT: v_alignbit_b32 v32, v1, v59, 16 -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v62 -; SI-NEXT: v_or_b32_e32 v59, v6, v23 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v20 -; SI-NEXT: v_or_b32_e32 v62, v32, v24 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v50 -; SI-NEXT: v_or_b32_e32 v50, v6, v36 -; SI-NEXT: v_alignbit_b32 v6, v59, v36, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v47, v6, v35 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v49, v6, v37 -; SI-NEXT: v_alignbit_b32 v6, v47, v37, 16 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v45, v6, v34 -; SI-NEXT: v_or_b32_e32 v48, v3, v56 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v45, v3, v51 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 +; SI-NEXT: v_or_b32_e32 v22, v3, v56 ; SI-NEXT: v_alignbit_b32 v3, v45, v56, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v44, v3, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v31 -; SI-NEXT: v_or_b32_e32 v3, v3, v9 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: v_mov_b32_e32 v14, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -192834,169 +194598,191 @@ define <64 x i16> 
@bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: v_or_b32_e32 v46, v32, v13 -; SI-NEXT: v_alignbit_b32 v13, v62, v13, 16 -; SI-NEXT: v_alignbit_b32 v6, v44, v9, 16 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:980 ; 4-byte Folded Reload +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_or_b32_e32 v12, v1, v9 +; SI-NEXT: v_alignbit_b32 v1, v44, v9, 16 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v34 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v13 +; SI-NEXT: v_mov_b32_e32 v13, v25 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 +; SI-NEXT: v_mov_b32_e32 v14, v31 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v35 +; SI-NEXT: v_mov_b32_e32 v35, v22 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; 
implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; kill: killed $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; kill: killed $vgpr42 ; SI-NEXT: .LBB96_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB96_4 ; SI-NEXT: ; %bb.3: ; %cmp.true ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v61 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v41 -; SI-NEXT: v_or_b32_e32 v1, v31, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:976 ; 4-byte Folded Reload +; SI-NEXT: s_movk_i32 s6, 0x300 +; SI-NEXT: v_mov_b32_e32 v30, v16 +; SI-NEXT: v_mov_b32_e32 v33, v31 +; SI-NEXT: v_mov_b32_e32 v31, v22 +; SI-NEXT: s_mov_b32 s7, 0x3000000 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x300, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v56, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 ; SI-NEXT: v_or_b32_e32 v2, v9, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v23, v2, v3 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: 
v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v57 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: s_movk_i32 s6, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_or_b32_e32 v4, v32, v4 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v4, v2 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:968 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v56, v5 -; SI-NEXT: v_mov_b32_e32 v30, v16 -; SI-NEXT: s_mov_b32 s7, 0x3000000 -; SI-NEXT: v_mov_b32_e32 v31, v24 +; SI-NEXT: v_mov_b32_e32 v32, v24 ; SI-NEXT: v_add_i32_e32 v44, vcc, s7, v2 -; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v1 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:916 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:952 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:936 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:972 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:964 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, s6, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v51, v6 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v37, v7 -; SI-NEXT: s_waitcnt 
vmcnt(7) -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v60, v4 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:956 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v32, vcc, s7, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v48, v32 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:924 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v35, vcc, s7, v3 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 ; SI-NEXT: v_or_b32_e32 v4, v63, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, s6, v4 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:948 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v45, vcc, s7, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v45 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v45 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v5, v53, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v5, vcc, s6, v5 ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_or_b32_e32 v6, v51, v6 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v33, vcc, s7, v5 -; SI-NEXT: v_mov_b32_e32 v49, v33 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s7, v5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_or_b32_e32 v6, v35, v6 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v6 ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_or_b32_e32 v7, v37, v7 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v47, vcc, s7, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, s6, v7 ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt 
vmcnt(0) @@ -193005,16 +194791,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 ; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:932 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v7 -; SI-NEXT: v_mov_b32_e32 v50, v6 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, s7, v7 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v8, vcc, s6, v8 ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193023,15 +194808,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:960 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v59, vcc, s7, v8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, s6, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193040,16 +194825,16 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 ; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v8, vcc, s7, v9 -; SI-NEXT: v_mov_b32_e32 v46, v8 +; SI-NEXT: v_mov_b32_e32 v40, v8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, s6, v10 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193058,15 +194843,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: buffer_load_dword 
v11, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:928 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v62, vcc, s7, v10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s6, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193075,15 +194860,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 ; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, s7, v11 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, s6, v12 ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193092,15 +194877,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 ; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v12 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v61, vcc, s7, v12 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, s6, v13 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193109,45 +194894,51 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 ; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v13 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload 
+; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v13 +; SI-NEXT: v_mov_b32_e32 v13, v2 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, s6, v14 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_or_b32_e32 v15, v17, v15 ; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v15, vcc, 3, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v15, v17, v15 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s6, v15 ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 ; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v13, vcc, s7, v15 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v46, vcc, s7, v15 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, s6, v16 ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193156,15 +194947,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_or_b32_e32 v17, v18, v17 ; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, s7, v16 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 
v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193173,32 +194964,32 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 ; SI-NEXT: v_or_b32_e32 v18, v19, v18 ; SI-NEXT: v_or_b32_e32 v18, v18, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v18 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v23 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 3, v19 ; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 ; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_or_b32_e32 v19, v19, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, s7, v19 ; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v11 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v20, v17 -; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193207,83 +194998,95 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 ; SI-NEXT: v_or_b32_e32 v20, v21, v20 ; SI-NEXT: v_or_b32_e32 v20, v20, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v20 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v17, v21, 
v17 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 ; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v22, v21 -; SI-NEXT: v_or_b32_e32 v21, v21, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v21 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v21, v26, v21 +; SI-NEXT: v_or_b32_e32 v22, v21, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v19, vcc, s7, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v19 +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v22, v17 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v17, v21, v17 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_or_b32_e32 v22, v22, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v22 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v26, v21 +; SI-NEXT: v_or_b32_e32 v24, v21, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v17, v23, v17 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v17, v21, v17 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, s6, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v23, vcc, 3, v23 -; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v23, v26, v23 -; SI-NEXT: v_or_b32_e32 v23, v23, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v26, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:576 ; 
4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v23 +; SI-NEXT: v_add_i32_e32 v21, vcc, 3, v21 +; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_or_b32_e32 v21, v26, v21 +; SI-NEXT: v_or_b32_e32 v29, v21, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v29, vcc, s7, v29 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: v_or_b32_e32 v24, v24, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v26, v21, v17 +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v17, vcc, 3, v17 ; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v25, v25, v17 -; SI-NEXT: v_or_b32_e32 v2, v25, v2 -; SI-NEXT: v_add_i32_e32 v18, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v14 -; SI-NEXT: v_mov_b32_e32 v14, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, s7, v20 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v28, v21, v17 +; SI-NEXT: v_or_b32_e32 v2, v28, v2 +; SI-NEXT: v_add_i32_e32 v20, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v23 +; SI-NEXT: v_mov_b32_e32 v12, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, s7, v14 +; SI-NEXT: v_mov_b32_e32 v14, v3 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v23, vcc, s7, v24 +; SI-NEXT: v_add_i32_e32 v21, vcc, s7, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193292,17 +195095,15 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v28, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v25 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 
offset:520 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v28 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193311,52 +195112,51 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v39, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v10, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v26, v3 +; SI-NEXT: v_or_b32_e32 v3, v36, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v26, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v26 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v26 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v3, v32, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v40, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v16, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, 
s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -193365,310 +195165,295 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v31, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v42, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v42 +; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v4, vcc, s7, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(2) ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v54 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v3, v30, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_add_i32_e32 v5, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: 
v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v2, v33, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v59 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v3, v1, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v43, vcc, s7, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: v_add_i32_e32 v7, vcc, s7, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:944 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v2, v1, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v2, vcc, s6, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v1, v3 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v22, vcc, s7, v2 -; SI-NEXT: v_alignbit_b32 v2, v22, v43, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: v_add_i32_e32 v1, vcc, s7, v2 +; SI-NEXT: v_alignbit_b32 v2, v5, v4, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v6, v16, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_alignbit_b32 v2, v26, v10, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v5, v7, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v28, v20, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v42, v40, 16 -; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v29, v23, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v26, v39, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v28, v18, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v19, v27, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v29, v21, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v11, v21, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v19, v27, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v11, v16, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v15, v13, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:884 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v17, v10, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:888 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v15, v46, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v17, v39, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v2, v1, v9, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:892 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v61, v9, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:856 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_alignbit_b32 v2, v62, v8, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v59, v6, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v59, v14, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v47, v33, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v47, v13, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 
v2, v45, v32, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v45, v35, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v2, v44, v14, 16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_alignbit_b32 v2, v44, v12, 16 +; SI-NEXT: v_alignbit_b32 v3, v1, v7, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v17 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v61 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v47 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v44 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:940 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:904 ; 4-byte Folded Spill ; SI-NEXT: .LBB96_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v43 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v58 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v7 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v39 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v20 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 
16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v52 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v29 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:880 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: 
v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:884 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v18 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:852 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:888 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:892 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:856 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt 
vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:896 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:860 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -193680,9 +195465,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:900 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:864 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -193694,9 +195479,9 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:868 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -193704,13 +195489,13 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:908 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:872 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -193718,19 +195503,19 @@ define <64 x i16> @bitcast_v128i8_to_v64i16(<128 x i8> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:912 ; 4-byte Folded Reload +; 
SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:876 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:940 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:904 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 @@ -197895,1598 +199680,1736 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:332 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:328 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:324 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:320 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:308 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:304 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:300 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:296 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:292 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:288 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:276 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:272 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 
offset:268 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:264 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:260 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:256 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:244 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:240 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:236 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:232 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:228 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:224 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:212 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:208 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:204 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:200 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:196 -; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: v_writelane_b32 v63, s30, 0 -; SI-NEXT: v_writelane_b32 v62, s28, 0 -; SI-NEXT: v_writelane_b32 v62, s25, 1 -; SI-NEXT: v_writelane_b32 v62, s24, 2 -; SI-NEXT: v_writelane_b32 v62, s23, 3 -; SI-NEXT: v_writelane_b32 v62, s22, 4 -; SI-NEXT: v_writelane_b32 v62, s21, 5 -; SI-NEXT: v_writelane_b32 v62, s18, 6 -; SI-NEXT: v_writelane_b32 v62, s16, 7 -; SI-NEXT: v_writelane_b32 v63, s31, 1 -; SI-NEXT: v_writelane_b32 v63, s34, 2 -; SI-NEXT: v_writelane_b32 v63, s35, 3 -; SI-NEXT: v_writelane_b32 v63, s36, 4 -; SI-NEXT: v_writelane_b32 v63, s37, 5 -; SI-NEXT: v_writelane_b32 v63, s38, 6 -; SI-NEXT: v_writelane_b32 v63, s39, 7 -; SI-NEXT: v_writelane_b32 v63, s48, 8 -; SI-NEXT: v_writelane_b32 v63, s49, 9 -; SI-NEXT: v_writelane_b32 v63, s50, 10 -; SI-NEXT: v_writelane_b32 v63, s51, 11 -; SI-NEXT: v_writelane_b32 v63, s52, 12 -; SI-NEXT: v_writelane_b32 v63, s53, 13 -; SI-NEXT: v_writelane_b32 v63, s54, 14 -; SI-NEXT: v_writelane_b32 v63, s55, 15 -; SI-NEXT: v_writelane_b32 v63, s64, 16 -; SI-NEXT: v_writelane_b32 v63, s65, 17 -; SI-NEXT: v_writelane_b32 v63, s66, 18 -; SI-NEXT: v_writelane_b32 v63, s67, 19 -; SI-NEXT: v_writelane_b32 v63, s68, 20 -; SI-NEXT: v_writelane_b32 v63, s69, 21 -; SI-NEXT: v_writelane_b32 v63, s70, 22 -; SI-NEXT: v_writelane_b32 v63, s71, 23 -; SI-NEXT: v_writelane_b32 v63, s80, 24 -; SI-NEXT: v_writelane_b32 v63, s81, 25 -; SI-NEXT: v_writelane_b32 v63, s82, 26 -; SI-NEXT: v_writelane_b32 v63, s83, 27 -; SI-NEXT: v_writelane_b32 v63, s84, 28 -; SI-NEXT: v_writelane_b32 v63, s85, 29 -; SI-NEXT: v_writelane_b32 v63, s86, 30 -; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: v_writelane_b32 v63, s96, 32 -; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: v_writelane_b32 v63, s99, 35 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:332 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:328 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:324 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:320 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:316 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:312 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:308 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:304 +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_writelane_b32 v41, 
s30, 0 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v29, v26 -; SI-NEXT: v_readfirstlane_b32 s15, v16 -; SI-NEXT: v_readfirstlane_b32 s18, v25 -; SI-NEXT: v_readfirstlane_b32 s43, v15 -; SI-NEXT: v_readfirstlane_b32 s42, v24 -; SI-NEXT: v_readfirstlane_b32 s44, v23 -; SI-NEXT: v_readfirstlane_b32 s49, v12 -; SI-NEXT: v_readfirstlane_b32 s8, v11 -; SI-NEXT: v_readfirstlane_b32 s53, v20 -; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_writelane_b32 v43, s29, 0 +; SI-NEXT: v_writelane_b32 v43, s28, 1 +; SI-NEXT: v_writelane_b32 v43, s27, 2 +; SI-NEXT: v_writelane_b32 v43, s26, 3 +; SI-NEXT: v_writelane_b32 v43, s25, 4 +; SI-NEXT: v_writelane_b32 v43, s24, 5 +; SI-NEXT: v_writelane_b32 v43, s23, 6 +; SI-NEXT: v_writelane_b32 v43, s22, 7 +; SI-NEXT: v_writelane_b32 v43, s21, 8 +; SI-NEXT: v_writelane_b32 v43, s20, 9 +; SI-NEXT: v_writelane_b32 v43, s19, 10 +; SI-NEXT: v_writelane_b32 v43, s18, 11 +; SI-NEXT: v_writelane_b32 v43, s17, 12 +; SI-NEXT: v_writelane_b32 v43, s16, 13 +; SI-NEXT: v_writelane_b32 v41, s31, 1 +; SI-NEXT: v_writelane_b32 v41, s34, 2 +; SI-NEXT: v_writelane_b32 v41, s35, 3 +; SI-NEXT: v_writelane_b32 v41, s36, 4 +; SI-NEXT: v_writelane_b32 v41, s37, 5 +; SI-NEXT: v_writelane_b32 v41, s38, 6 +; SI-NEXT: v_writelane_b32 v41, s39, 7 +; SI-NEXT: v_writelane_b32 v41, s48, 8 +; SI-NEXT: v_writelane_b32 v41, s49, 9 +; SI-NEXT: v_writelane_b32 v41, s50, 10 +; SI-NEXT: v_writelane_b32 v41, s51, 11 +; SI-NEXT: v_writelane_b32 v41, s52, 12 +; SI-NEXT: v_writelane_b32 v41, s53, 13 +; SI-NEXT: v_writelane_b32 v41, s54, 14 +; SI-NEXT: v_writelane_b32 v41, s55, 15 +; SI-NEXT: v_writelane_b32 v41, s64, 16 +; SI-NEXT: v_writelane_b32 v41, s65, 17 +; SI-NEXT: v_writelane_b32 v41, s66, 18 +; SI-NEXT: v_writelane_b32 v41, s67, 19 +; SI-NEXT: v_writelane_b32 v41, s68, 20 +; SI-NEXT: v_writelane_b32 v41, s69, 21 +; SI-NEXT: v_writelane_b32 v41, s70, 22 +; SI-NEXT: v_writelane_b32 v41, s71, 23 +; SI-NEXT: v_writelane_b32 v41, s80, 24 +; SI-NEXT: v_writelane_b32 v41, s81, 25 +; SI-NEXT: v_writelane_b32 v41, s82, 26 +; SI-NEXT: v_writelane_b32 v41, s83, 27 +; SI-NEXT: v_writelane_b32 v41, s84, 28 +; SI-NEXT: v_writelane_b32 v41, s85, 29 +; SI-NEXT: v_writelane_b32 v41, s86, 30 +; SI-NEXT: v_writelane_b32 v41, s87, 31 +; SI-NEXT: v_writelane_b32 v41, s96, 32 +; SI-NEXT: v_writelane_b32 v41, s97, 33 +; SI-NEXT: v_writelane_b32 v41, s98, 34 +; SI-NEXT: v_readfirstlane_b32 s39, v26 +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s47, v12 +; SI-NEXT: v_writelane_b32 v42, s39, 0 +; SI-NEXT: v_readfirstlane_b32 s56, v11 +; SI-NEXT: v_writelane_b32 v42, s47, 1 +; SI-NEXT: v_readfirstlane_b32 s48, v24 +; SI-NEXT: v_writelane_b32 v42, s56, 2 +; SI-NEXT: v_readfirstlane_b32 s49, v23 +; SI-NEXT: v_writelane_b32 v42, s48, 3 +; SI-NEXT: v_readfirstlane_b32 s50, v21 +; SI-NEXT: v_writelane_b32 v42, s49, 4 +; SI-NEXT: v_readfirstlane_b32 s51, v22 +; SI-NEXT: v_writelane_b32 v42, s50, 5 +; SI-NEXT: v_writelane_b32 v42, s51, 6 +; SI-NEXT: v_readfirstlane_b32 s57, v20 +; SI-NEXT: v_readfirstlane_b32 s58, v19 +; SI-NEXT: v_readfirstlane_b32 s64, v29 +; SI-NEXT: v_readfirstlane_b32 s65, v30 +; SI-NEXT: v_readfirstlane_b32 s59, v28 +; SI-NEXT: v_readfirstlane_b32 s60, v27 +; SI-NEXT: v_readfirstlane_b32 s11, v1 +; SI-NEXT: v_readfirstlane_b32 s12, v2 +; SI-NEXT: v_readfirstlane_b32 s13, v9 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_writelane_b32 v43, s4, 14 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:300 +; 
SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:296 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:292 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:288 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:284 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:280 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_writelane_b32 v43, s4, 15 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:276 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:272 +; SI-NEXT: v_writelane_b32 v43, s4, 16 +; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_readfirstlane_b32 s4, v34 -; SI-NEXT: v_writelane_b32 v62, s4, 8 -; SI-NEXT: v_readfirstlane_b32 s4, v38 -; SI-NEXT: v_writelane_b32 v62, s4, 9 +; SI-NEXT: v_writelane_b32 v43, s4, 17 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v43, s4, 18 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s44, v36 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s90, v37 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:268 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:264 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:260 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:256 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s6, v38 +; SI-NEXT: v_readfirstlane_b32 s14, v10 +; SI-NEXT: v_readfirstlane_b32 s15, v8 +; SI-NEXT: v_readfirstlane_b32 s18, v7 +; SI-NEXT: v_readfirstlane_b32 s21, v5 +; SI-NEXT: v_readfirstlane_b32 s22, v6 +; SI-NEXT: v_readfirstlane_b32 s40, v17 +; SI-NEXT: v_readfirstlane_b32 s41, v18 +; SI-NEXT: v_readfirstlane_b32 s42, v4 +; SI-NEXT: v_readfirstlane_b32 s43, v3 +; SI-NEXT: v_readfirstlane_b32 s76, v16 +; SI-NEXT: v_readfirstlane_b32 s77, v15 +; SI-NEXT: v_readfirstlane_b32 s38, v25 +; SI-NEXT: v_writelane_b32 v41, s99, 35 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_writelane_b32 v43, s4, 19 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_writelane_b32 v43, s4, 20 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s4, v48 +; SI-NEXT: v_writelane_b32 v43, s4, 21 +; SI-NEXT: s_waitcnt vmcnt(8) ; SI-NEXT: v_readfirstlane_b32 s4, v49 -; SI-NEXT: v_writelane_b32 v62, s4, 10 +; SI-NEXT: v_writelane_b32 v43, s4, 22 +; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_readfirstlane_b32 s4, v50 -; SI-NEXT: v_writelane_b32 v62, s4, 11 -; SI-NEXT: v_readfirstlane_b32 s79, v52 -; SI-NEXT: v_readfirstlane_b32 s88, v54 -; SI-NEXT: v_readfirstlane_b32 s4, v55 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:192 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:180 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:176 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:168 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:164 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:160 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:148 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:144 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:140 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:136 -; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: v_readfirstlane_b32 s77, v41 -; SI-NEXT: v_readfirstlane_b32 s4, v42 -; SI-NEXT: 
v_readfirstlane_b32 s94, v31 -; SI-NEXT: v_readfirstlane_b32 s70, v32 -; SI-NEXT: v_readfirstlane_b32 s51, v33 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s37, v45 -; SI-NEXT: v_readfirstlane_b32 s24, v56 -; SI-NEXT: v_readfirstlane_b32 s7, v57 -; SI-NEXT: v_readfirstlane_b32 s92, v58 -; SI-NEXT: v_readfirstlane_b32 s28, v59 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:80 -; SI-NEXT: v_readfirstlane_b32 s35, v43 -; SI-NEXT: v_readfirstlane_b32 s55, v46 -; SI-NEXT: v_readfirstlane_b32 s68, v35 -; SI-NEXT: v_readfirstlane_b32 s87, v37 -; SI-NEXT: v_readfirstlane_b32 s67, v39 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s74, v53 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:64 -; SI-NEXT: v_readfirstlane_b32 s85, v48 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 -; SI-NEXT: v_writelane_b32 v62, s4, 13 -; SI-NEXT: v_readfirstlane_b32 s98, v40 -; SI-NEXT: v_readfirstlane_b32 s69, v51 -; SI-NEXT: v_readfirstlane_b32 s21, v36 -; SI-NEXT: v_readfirstlane_b32 s40, v19 -; SI-NEXT: v_readfirstlane_b32 s23, v28 -; SI-NEXT: v_readfirstlane_b32 s34, v27 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v21, v13 -; SI-NEXT: v_mov_b32_e32 v13, v5 -; SI-NEXT: v_readfirstlane_b32 s97, v29 -; SI-NEXT: v_readfirstlane_b32 s80, v18 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v22 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v30 -; SI-NEXT: v_readfirstlane_b32 s96, v17 -; SI-NEXT: v_readfirstlane_b32 s64, v9 -; SI-NEXT: v_readfirstlane_b32 s25, v8 -; SI-NEXT: v_readfirstlane_b32 s83, v7 -; SI-NEXT: v_readfirstlane_b32 s84, v4 -; SI-NEXT: v_readfirstlane_b32 s93, v3 -; SI-NEXT: v_readfirstlane_b32 s76, v1 -; SI-NEXT: v_readfirstlane_b32 s58, v38 -; SI-NEXT: v_readfirstlane_b32 s65, v49 -; SI-NEXT: v_readfirstlane_b32 s62, v54 -; SI-NEXT: v_readfirstlane_b32 s81, v44 -; SI-NEXT: v_readfirstlane_b32 s71, v47 -; SI-NEXT: v_readfirstlane_b32 s38, v60 -; SI-NEXT: v_readfirstlane_b32 s86, v61 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:156 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:220 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s90, v50 -; SI-NEXT: v_readfirstlane_b32 s31, v52 -; SI-NEXT: v_readfirstlane_b32 s4, v55 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword 
v50, off, s[0:3], s32 offset:336 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 -; SI-NEXT: v_readfirstlane_b32 s72, v31 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:316 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:72 -; SI-NEXT: v_readfirstlane_b32 s82, v56 -; SI-NEXT: v_readfirstlane_b32 s95, v57 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s39, v58 -; SI-NEXT: v_readfirstlane_b32 s56, v59 -; SI-NEXT: v_readfirstlane_b32 s57, v41 -; SI-NEXT: v_readfirstlane_b32 s36, v42 -; SI-NEXT: v_readfirstlane_b32 s73, v45 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:284 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:252 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:188 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:124 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 -; SI-NEXT: v_readfirstlane_b32 s16, v34 -; SI-NEXT: v_readfirstlane_b32 s48, v32 -; SI-NEXT: v_readfirstlane_b32 s52, v33 -; SI-NEXT: v_writelane_b32 v62, s4, 14 -; SI-NEXT: v_readfirstlane_b32 s47, v35 -; SI-NEXT: v_readfirstlane_b32 s60, v37 -; SI-NEXT: v_readfirstlane_b32 s61, v39 -; SI-NEXT: v_readfirstlane_b32 s89, v43 +; SI-NEXT: v_writelane_b32 v43, s4, 23 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s4, v51 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:252 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:248 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:244 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:240 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:236 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:228 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s91, v32 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s8, v33 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 +; SI-NEXT: v_writelane_b32 v43, s4, 24 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v43, s4, 25 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v35 +; SI-NEXT: v_writelane_b32 v43, s4, 26 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v36 +; SI-NEXT: v_writelane_b32 v43, s4, 27 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:212 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:208 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:204 +; SI-NEXT: v_writelane_b32 v43, s4, 28 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_writelane_b32 v43, s4, 29 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s89, v38 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s78, v39 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s7, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s82, v49 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s4, v50 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 
s96, v51 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:200 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:196 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:192 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:188 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:184 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:176 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s70, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:168 +; SI-NEXT: v_writelane_b32 v43, s4, 30 +; SI-NEXT: v_readfirstlane_b32 s4, v32 +; SI-NEXT: v_writelane_b32 v43, s4, 31 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:164 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:160 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:156 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:152 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s99, v46 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:312 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:280 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:216 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:184 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:152 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 -; SI-NEXT: v_readfirstlane_b32 s54, v48 -; SI-NEXT: v_readfirstlane_b32 s50, v53 -; SI-NEXT: v_readfirstlane_b32 s78, v49 -; SI-NEXT: v_readfirstlane_b32 s30, v51 -; SI-NEXT: v_readfirstlane_b32 s66, v54 -; SI-NEXT: v_readfirstlane_b32 s91, v40 +; SI-NEXT: v_readfirstlane_b32 s4, v34 +; SI-NEXT: v_writelane_b32 v43, s4, 32 +; SI-NEXT: v_readfirstlane_b32 s9, v35 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s4, v37 +; SI-NEXT: v_writelane_b32 v43, s4, 33 +; SI-NEXT: v_readfirstlane_b32 s10, v36 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s4, v31 +; SI-NEXT: v_writelane_b32 v43, s4, 34 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: v_writelane_b32 v43, s4, 35 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s4, v39 +; SI-NEXT: v_writelane_b32 v43, s4, 36 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s69, v48 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s30, v49 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s16, v50 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s36, v51 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:144 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:136 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:132 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:128 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:124 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:120 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:116 +; SI-NEXT: 
buffer_load_dword v48, off, s[0:3], s32 offset:336 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:104 +; SI-NEXT: v_writelane_b32 v43, s4, 37 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s6, v44 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v50 +; SI-NEXT: v_readfirstlane_b32 s4, v52 +; SI-NEXT: v_writelane_b32 v43, s4, 38 +; SI-NEXT: v_readfirstlane_b32 s4, v53 +; SI-NEXT: v_writelane_b32 v43, s4, 39 +; SI-NEXT: v_readfirstlane_b32 s4, v54 +; SI-NEXT: v_writelane_b32 v43, s4, 40 +; SI-NEXT: v_writelane_b32 v43, s44, 41 +; SI-NEXT: v_writelane_b32 v43, s6, 42 +; SI-NEXT: v_writelane_b32 v43, s7, 43 +; SI-NEXT: v_writelane_b32 v43, s8, 44 +; SI-NEXT: v_writelane_b32 v43, s9, 45 +; SI-NEXT: v_writelane_b32 v43, s10, 46 +; SI-NEXT: v_writelane_b32 v43, s11, 47 +; SI-NEXT: v_writelane_b32 v43, s12, 48 +; SI-NEXT: v_writelane_b32 v43, s13, 49 +; SI-NEXT: v_writelane_b32 v43, s14, 50 +; SI-NEXT: v_writelane_b32 v43, s15, 51 +; SI-NEXT: v_writelane_b32 v43, s18, 52 +; SI-NEXT: v_writelane_b32 v43, s21, 53 +; SI-NEXT: v_writelane_b32 v43, s22, 54 +; SI-NEXT: v_writelane_b32 v43, s40, 55 +; SI-NEXT: v_writelane_b32 v43, s41, 56 +; SI-NEXT: v_writelane_b32 v43, s42, 57 +; SI-NEXT: v_writelane_b32 v43, s43, 58 +; SI-NEXT: v_writelane_b32 v43, s76, 59 +; SI-NEXT: v_writelane_b32 v43, s77, 60 +; SI-NEXT: v_readfirstlane_b32 s93, v55 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_readfirstlane_b32 s95, v40 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s17, v33 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s98, v34 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s23, v35 +; SI-NEXT: v_readfirstlane_b32 s25, v31 +; SI-NEXT: v_readfirstlane_b32 s28, v32 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s26, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s88, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s79, v38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s75, v39 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:88 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:84 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v48 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s24, v49 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s85, v50 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s66, v51 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s4, v10 -; SI-NEXT: v_writelane_b32 v62, s4, 15 -; SI-NEXT: v_readfirstlane_b32 s4, v2 -; SI-NEXT: v_writelane_b32 v62, s4, 16 -; SI-NEXT: v_writelane_b32 v62, s17, 17 -; SI-NEXT: v_writelane_b32 v62, s15, 18 -; SI-NEXT: v_writelane_b32 
v62, s18, 19 -; SI-NEXT: v_writelane_b32 v62, s43, 20 -; SI-NEXT: v_writelane_b32 v62, s42, 21 -; SI-NEXT: v_writelane_b32 v62, s44, 22 -; SI-NEXT: v_writelane_b32 v62, s16, 23 -; SI-NEXT: v_writelane_b32 v62, s49, 24 -; SI-NEXT: v_writelane_b32 v62, s8, 25 -; SI-NEXT: v_writelane_b32 v62, s6, 26 -; SI-NEXT: v_readfirstlane_b32 s45, v52 -; SI-NEXT: v_writelane_b32 v62, s56, 27 -; SI-NEXT: v_writelane_b32 v62, s45, 28 -; SI-NEXT: v_writelane_b32 v62, s53, 29 -; SI-NEXT: v_writelane_b32 v62, s94, 30 -; SI-NEXT: v_writelane_b32 v62, s57, 31 -; SI-NEXT: v_writelane_b32 v62, s58, 32 -; SI-NEXT: v_writelane_b32 v62, s47, 33 -; SI-NEXT: v_readfirstlane_b32 s46, v55 -; SI-NEXT: v_writelane_b32 v62, s40, 34 -; SI-NEXT: v_readfirstlane_b32 s59, v47 -; SI-NEXT: v_writelane_b32 v62, s46, 35 -; SI-NEXT: v_writelane_b32 v62, s59, 36 -; SI-NEXT: v_writelane_b32 v62, s60, 37 -; SI-NEXT: v_writelane_b32 v62, s36, 38 -; SI-NEXT: v_writelane_b32 v62, s65, 39 -; SI-NEXT: v_writelane_b32 v62, s61, 40 -; SI-NEXT: v_writelane_b32 v62, s73, 41 -; SI-NEXT: v_writelane_b32 v62, s62, 42 -; SI-NEXT: v_writelane_b32 v62, s72, 43 -; SI-NEXT: v_writelane_b32 v62, s23, 44 -; SI-NEXT: v_writelane_b32 v62, s48, 45 -; SI-NEXT: v_writelane_b32 v62, s34, 46 -; SI-NEXT: v_writelane_b32 v62, s78, 47 -; SI-NEXT: v_writelane_b32 v62, s30, 48 -; SI-NEXT: v_writelane_b32 v62, s54, 49 -; SI-NEXT: v_writelane_b32 v62, s50, 50 -; SI-NEXT: v_writelane_b32 v62, s52, 51 -; SI-NEXT: v_writelane_b32 v62, s82, 52 -; SI-NEXT: v_writelane_b32 v62, s66, 53 -; SI-NEXT: v_readfirstlane_b32 s22, v36 +; SI-NEXT: v_readfirstlane_b32 vcc_lo, v13 +; SI-NEXT: v_readfirstlane_b32 vcc_hi, v14 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 61 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 62 +; SI-NEXT: v_writelane_b32 v43, s38, 63 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v57 +; SI-NEXT: v_readfirstlane_b32 s20, v31 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v58 +; SI-NEXT: v_readfirstlane_b32 s19, v32 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v59 -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v56 -; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v60 -; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v45 -; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v61 -; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v42 -; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v41 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v31 -; SI-NEXT: v_writelane_b32 v62, s91, 54 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s27, v33 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s94, v34 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s72, v35 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s73, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s67, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s71, v38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s97, v39 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v38, off, 
s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s35, v48 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s83, v49 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s87, v50 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s63, v51 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s74, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s81, v32 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s80, v33 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s86, v34 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s34, v35 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s84, v36 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s31, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s61, v38 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s62, v39 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s53, v48 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_readfirstlane_b32 s52, v49 +; SI-NEXT: v_writelane_b32 v42, s52, 7 +; SI-NEXT: v_writelane_b32 v42, s53, 8 +; SI-NEXT: v_writelane_b32 v42, s57, 9 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_readfirstlane_b32 s54, v50 +; SI-NEXT: v_writelane_b32 v42, s58, 10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_readfirstlane_b32 s55, v51 +; SI-NEXT: v_writelane_b32 v42, s54, 11 +; SI-NEXT: v_writelane_b32 v42, s55, 12 +; SI-NEXT: v_writelane_b32 v42, s64, 13 +; SI-NEXT: v_writelane_b32 v42, s65, 14 +; SI-NEXT: v_writelane_b32 v42, s67, 15 +; SI-NEXT: v_writelane_b32 v42, s71, 16 +; SI-NEXT: v_writelane_b32 v42, s80, 17 +; SI-NEXT: v_writelane_b32 v42, s81, 18 +; SI-NEXT: v_writelane_b32 v42, s59, 19 +; SI-NEXT: v_writelane_b32 v42, s60, 20 +; SI-NEXT: v_writelane_b32 v42, s86, 21 +; SI-NEXT: v_writelane_b32 v42, s97, 22 +; SI-NEXT: v_writelane_b32 v42, s34, 23 +; SI-NEXT: v_writelane_b32 v42, s66, 24 +; SI-NEXT: v_writelane_b32 v42, s85, 25 +; SI-NEXT: v_writelane_b32 v42, s31, 26 +; SI-NEXT: v_writelane_b32 v42, s84, 27 +; SI-NEXT: v_writelane_b32 v42, s35, 28 +; SI-NEXT: v_writelane_b32 v42, s98, 29 +; SI-NEXT: v_writelane_b32 v42, s17, 30 +; SI-NEXT: v_writelane_b32 v42, s20, 31 +; SI-NEXT: v_writelane_b32 v42, s61, 32 +; SI-NEXT: v_writelane_b32 v42, s19, 33 +; SI-NEXT: v_writelane_b32 v42, s62, 34 +; SI-NEXT: v_writelane_b32 v42, s23, 35 +; SI-NEXT: v_writelane_b32 v42, s83, 36 +; SI-NEXT: v_writelane_b32 v42, s87, 37 +; SI-NEXT: v_writelane_b32 v42, s26, 38 +; SI-NEXT: v_writelane_b32 v42, s94, 39 +; SI-NEXT: v_writelane_b32 v42, s27, 40 +; SI-NEXT: v_writelane_b32 v42, s63, 41 +; SI-NEXT: v_writelane_b32 v42, s79, 42 +; SI-NEXT: v_writelane_b32 v42, s88, 43 +; SI-NEXT: v_writelane_b32 v42, s72, 44 +; SI-NEXT: v_writelane_b32 v42, s73, 45 +; SI-NEXT: v_writelane_b32 v42, s74, 46 +; SI-NEXT: 
v_writelane_b32 v42, s75, 47 +; SI-NEXT: v_writelane_b32 v42, s24, 48 +; SI-NEXT: v_writelane_b32 v42, s25, 49 +; SI-NEXT: v_writelane_b32 v42, s28, 50 ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v5, v13 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s5, v62, 5 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_writelane_b32 v62, s4, 55 -; SI-NEXT: v_readlane_b32 s4, v62, 4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: v_readlane_b32 s5, v62, 3 -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_or_b32 s63, s5, s4 -; SI-NEXT: v_readlane_b32 s4, v62, 6 -; SI-NEXT: s_and_b32 s5, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s9, s19, 24 -; SI-NEXT: v_readlane_b32 s4, v62, 0 -; SI-NEXT: s_or_b32 s9, s9, s5 -; SI-NEXT: s_and_b32 s5, s4, 0xff -; SI-NEXT: s_lshl_b32 s10, s29, 8 -; SI-NEXT: s_or_b32 s4, s5, s10 -; SI-NEXT: v_writelane_b32 v62, s4, 56 -; SI-NEXT: s_and_b32 s5, s76, 0xff -; SI-NEXT: v_readlane_b32 s10, v62, 16 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s11, s10, 24 -; SI-NEXT: s_or_b32 s5, s11, s5 -; SI-NEXT: s_and_b32 s11, s26, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s12, s27, 24 -; SI-NEXT: s_or_b32 s14, s12, s11 -; SI-NEXT: s_and_b32 s11, s83, 0xff -; SI-NEXT: s_lshl_b32 s12, s25, 8 -; SI-NEXT: s_or_b32 s10, s11, s12 -; SI-NEXT: v_writelane_b32 v62, s10, 57 -; SI-NEXT: s_and_b32 s11, s64, 0xff -; SI-NEXT: v_readlane_b32 s10, v62, 15 -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s13, s10, 24 -; SI-NEXT: s_or_b32 s41, s13, s11 -; SI-NEXT: s_and_b32 s11, s43, 0xff -; SI-NEXT: s_lshl_b32 s13, s15, 8 -; SI-NEXT: s_or_b32 s10, s11, s13 -; SI-NEXT: s_and_b32 s11, s96, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s80, 24 -; SI-NEXT: s_or_b32 s43, s15, s11 -; SI-NEXT: s_and_b32 s11, s44, 0xff -; SI-NEXT: s_lshl_b32 s15, s42, 8 -; SI-NEXT: s_or_b32 s13, s11, s15 -; SI-NEXT: s_and_b32 s11, s18, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s97, 24 -; SI-NEXT: s_or_b32 s44, s15, s11 -; SI-NEXT: s_and_b32 s11, s59, 0xff -; SI-NEXT: s_lshl_b32 s15, s46, 8 -; SI-NEXT: s_or_b32 s12, s11, s15 -; SI-NEXT: s_and_b32 s11, s45, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s6, 24 -; SI-NEXT: s_or_b32 s45, s15, s11 -; SI-NEXT: s_and_b32 s11, s30, 0xff -; SI-NEXT: s_lshl_b32 s15, s78, 8 -; SI-NEXT: v_writelane_b32 v62, s10, 58 -; SI-NEXT: s_or_b32 s10, s11, s15 -; SI-NEXT: s_and_b32 s11, s99, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s89, 24 -; SI-NEXT: s_or_b32 s46, s15, s11 -; SI-NEXT: s_and_b32 s11, s61, 0xff -; SI-NEXT: s_lshl_b32 s15, s60, 8 -; SI-NEXT: s_or_b32 s6, s11, s15 -; SI-NEXT: s_and_b32 s11, s22, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s47, 24 -; SI-NEXT: s_or_b32 s47, s15, s11 -; SI-NEXT: s_and_b32 s11, s57, 0xff -; SI-NEXT: s_lshl_b32 s15, s56, 8 -; SI-NEXT: v_writelane_b32 v62, s6, 59 -; SI-NEXT: s_or_b32 s6, s11, s15 -; SI-NEXT: s_and_b32 s11, s39, 0xff -; SI-NEXT: v_writelane_b32 v62, s6, 60 -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s95, 24 -; SI-NEXT: s_or_b32 s56, s15, s11 -; SI-NEXT: s_and_b32 s11, s48, 0xff -; SI-NEXT: s_lshl_b32 
s15, s72, 8 -; SI-NEXT: v_readlane_b32 s6, v62, 14 -; SI-NEXT: s_or_b32 s48, s11, s15 -; SI-NEXT: s_and_b32 s11, s6, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s31, 24 -; SI-NEXT: s_or_b32 vcc_lo, s15, s11 -; SI-NEXT: s_and_b32 s11, s86, 0xff -; SI-NEXT: s_lshl_b32 s15, s38, 8 -; SI-NEXT: s_or_b32 s72, s11, s15 -; SI-NEXT: s_and_b32 s11, s71, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s81, 24 -; SI-NEXT: s_or_b32 vcc_hi, s15, s11 -; SI-NEXT: s_and_b32 s11, s58, 0xff -; SI-NEXT: s_lshl_b32 s15, s85, 8 -; SI-NEXT: s_or_b32 s57, s11, s15 -; SI-NEXT: s_and_b32 s11, s69, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s74, 24 -; SI-NEXT: v_writelane_b32 v62, s74, 61 -; SI-NEXT: s_or_b32 s74, s15, s11 -; SI-NEXT: s_and_b32 s11, s87, 0xff -; SI-NEXT: s_lshl_b32 s15, s21, 8 -; SI-NEXT: s_or_b32 s58, s11, s15 -; SI-NEXT: s_and_b32 s11, s68, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s28, 24 -; SI-NEXT: s_or_b32 s75, s15, s11 -; SI-NEXT: s_and_b32 s11, s24, 0xff -; SI-NEXT: s_lshl_b32 s15, s55, 8 -; SI-NEXT: v_writelane_b32 v62, s25, 62 -; SI-NEXT: s_or_b32 s59, s11, s15 -; SI-NEXT: s_and_b32 s11, s37, 0xff -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s51, 24 -; SI-NEXT: v_readlane_b32 s4, v62, 13 -; SI-NEXT: s_mov_b32 s18, s21 -; SI-NEXT: s_mov_b32 s21, s97 -; SI-NEXT: s_mov_b32 s97, s37 -; SI-NEXT: s_mov_b32 s37, s76 -; SI-NEXT: s_or_b32 s76, s15, s11 -; SI-NEXT: s_and_b32 s11, s35, 0xff -; SI-NEXT: s_lshl_b32 s15, s4, 8 -; SI-NEXT: s_or_b32 s60, s11, s15 -; SI-NEXT: s_and_b32 s11, s77, 0xff -; SI-NEXT: v_readlane_b32 s4, v62, 12 -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s4, 24 -; SI-NEXT: v_readlane_b32 s4, v62, 11 -; SI-NEXT: s_mov_b32 s6, s95 -; SI-NEXT: s_mov_b32 s95, s39 -; SI-NEXT: s_mov_b32 s39, s89 -; SI-NEXT: s_mov_b32 s89, s99 -; SI-NEXT: s_mov_b32 s99, s83 -; SI-NEXT: s_mov_b32 s83, s55 -; SI-NEXT: s_mov_b32 s55, s64 -; SI-NEXT: s_mov_b32 s64, s35 -; SI-NEXT: s_mov_b32 s35, s77 -; SI-NEXT: s_or_b32 s77, s15, s11 -; SI-NEXT: s_and_b32 s11, s4, 0xff -; SI-NEXT: v_readlane_b32 s4, v62, 10 -; SI-NEXT: s_lshl_b32 s15, s4, 8 -; SI-NEXT: v_readlane_b32 s4, v62, 9 -; SI-NEXT: s_or_b32 s61, s11, s15 -; SI-NEXT: s_and_b32 s11, s4, 0xff -; SI-NEXT: v_readlane_b32 s4, v62, 8 -; SI-NEXT: s_lshl_b32 s11, s11, 16 -; SI-NEXT: s_lshl_b32 s15, s4, 24 -; SI-NEXT: s_or_b32 s78, s15, s11 -; SI-NEXT: v_readlane_b32 s11, v62, 7 -; SI-NEXT: s_and_b32 s11, s11, 0xff -; SI-NEXT: s_lshl_b32 s15, s17, 8 -; SI-NEXT: s_or_b32 s11, s11, s15 -; SI-NEXT: s_and_b32 s11, s11, 0xffff -; SI-NEXT: v_mov_b32_e32 v51, s9 -; SI-NEXT: s_or_b32 s17, s11, s9 -; SI-NEXT: v_readlane_b32 s9, v62, 2 -; SI-NEXT: v_readlane_b32 s11, v62, 1 -; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: s_lshl_b32 s15, s11, 8 -; SI-NEXT: s_or_b32 s9, s9, s15 -; SI-NEXT: s_and_b32 s9, s9, 0xffff -; SI-NEXT: s_mov_b32 s4, s96 -; SI-NEXT: s_mov_b32 s96, s24 -; SI-NEXT: v_mov_b32_e32 v52, s14 -; SI-NEXT: s_or_b32 s24, s9, s14 -; SI-NEXT: s_and_b32 s14, s93, 0xff -; SI-NEXT: s_lshl_b32 s15, s84, 8 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v53, v6, v1 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v50, s14, v53 -; SI-NEXT: s_and_b32 s14, s8, 0xff -; SI-NEXT: s_lshl_b32 s15, s49, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v21 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v54, v14, v1 -; SI-NEXT: 
s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v17, s14, v54 -; SI-NEXT: s_and_b32 s14, s40, 0xff -; SI-NEXT: s_lshl_b32 s15, s53, 8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v29 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v55, v18, v1 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v16, s14, v55 -; SI-NEXT: s_and_b32 s14, s34, 0xff -; SI-NEXT: s_lshl_b32 s15, s23, 8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v13 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v40, v19, v1 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v15, s14, v40 -; SI-NEXT: s_and_b32 s14, s91, 0xff -; SI-NEXT: s_lshl_b32 s15, s66, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v41, v22, v1 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v12, s14, v41 -; SI-NEXT: s_and_b32 s14, s50, 0xff -; SI-NEXT: s_lshl_b32 s15, s54, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v32 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v42, v23, v1 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v11, s14, v42 -; SI-NEXT: s_and_b32 s14, s73, 0xff -; SI-NEXT: s_lshl_b32 s15, s36, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v46 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v28, v59, v1 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v10, s14, v28 -; SI-NEXT: s_and_b32 s14, s82, 0xff -; SI-NEXT: s_lshl_b32 s15, s52, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v60, v24, v1 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v9, s14, v60 -; SI-NEXT: s_and_b32 s14, s90, 0xff -; SI-NEXT: s_lshl_b32 s15, s16, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v31, v44, v1 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v8, s14, v31 -; SI-NEXT: s_and_b32 s14, s62, 0xff -; SI-NEXT: s_lshl_b32 s15, s65, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v61, v45, v1 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v7, s14, v61 -; SI-NEXT: s_and_b32 s14, s98, 0xff -; SI-NEXT: s_lshl_b32 s15, s67, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v38 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v6, v47, v1 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: v_or_b32_e32 v4, s14, v6 -; SI-NEXT: s_and_b32 s14, s92, 0xff -; SI-NEXT: s_lshl_b32 s15, s7, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_readlane_b32 s8, v62, 55 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mov_b32_e32 v22, v14 -; SI-NEXT: v_or_b32_e32 v14, v56, v1 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: v_or_b32_e32 v2, s14, v14 -; SI-NEXT: s_and_b32 s14, s70, 0xff -; SI-NEXT: s_lshl_b32 s15, s94, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v39 -; SI-NEXT: s_or_b32 s42, s8, s63 -; SI-NEXT: v_readlane_b32 s8, v62, 56 -; SI-NEXT: s_or_b32 s14, s14, s15 
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: v_mov_b32_e32 v23, v18 -; SI-NEXT: v_or_b32_e32 v18, v57, v1 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_or_b32 s40, s8, s5 -; SI-NEXT: v_readlane_b32 s8, v62, 57 -; SI-NEXT: v_or_b32_e32 v1, s14, v18 -; SI-NEXT: s_and_b32 s14, s88, 0xff -; SI-NEXT: s_lshl_b32 s15, s79, 8 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v34 -; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: v_readlane_b32 s9, v62, 60 -; SI-NEXT: s_or_b32 s14, s14, s15 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_or_b32 s15, s8, s41 -; SI-NEXT: v_readlane_b32 s8, v62, 58 -; SI-NEXT: s_and_b32 s16, s9, 0xffff -; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: v_mov_b32_e32 v26, v24 -; SI-NEXT: v_mov_b32_e32 v24, v19 -; SI-NEXT: v_or_b32_e32 v19, v58, v3 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_or_b32 s36, s16, s56 -; SI-NEXT: s_and_b32 s16, s48, 0xffff -; SI-NEXT: v_or_b32_e32 v3, s14, v19 -; SI-NEXT: s_or_b32 s14, s8, s43 -; SI-NEXT: s_and_b32 s8, s13, 0xffff -; SI-NEXT: s_or_b32 s53, s16, vcc_lo -; SI-NEXT: s_and_b32 s16, s72, 0xffff -; SI-NEXT: s_or_b32 s13, s8, s44 -; SI-NEXT: s_and_b32 s8, s12, 0xffff -; SI-NEXT: s_or_b32 s94, s16, vcc_hi -; SI-NEXT: s_and_b32 s16, s57, 0xffff -; SI-NEXT: s_or_b32 s12, s8, s45 -; SI-NEXT: s_and_b32 s8, s10, 0xffff -; SI-NEXT: s_or_b32 s49, s16, s74 -; SI-NEXT: s_and_b32 s16, s58, 0xffff -; SI-NEXT: s_or_b32 s10, s8, s46 -; SI-NEXT: v_readlane_b32 s8, v62, 59 -; SI-NEXT: s_or_b32 s48, s16, s75 -; SI-NEXT: s_and_b32 s16, s59, 0xffff -; SI-NEXT: s_and_b32 s8, s8, 0xffff -; SI-NEXT: s_or_b32 s11, s16, s76 -; SI-NEXT: s_and_b32 s16, s60, 0xffff -; SI-NEXT: s_and_b32 s23, s61, 0xffff -; SI-NEXT: s_mov_b32 s30, s87 -; SI-NEXT: s_mov_b32 s87, s85 -; SI-NEXT: s_or_b32 s8, s8, s47 -; SI-NEXT: s_or_b32 s9, s16, s77 -; SI-NEXT: s_or_b32 s16, s23, s78 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v30, v37 -; SI-NEXT: v_mov_b32_e32 v35, v45 -; SI-NEXT: v_mov_b32_e32 v20, v47 -; SI-NEXT: v_mov_b32_e32 v49, v56 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v57 -; SI-NEXT: v_mov_b32_e32 v25, v58 -; SI-NEXT: v_alignbit_b32 v57, s42, v51, 16 -; SI-NEXT: v_alignbit_b32 v58, s40, v52, 16 -; SI-NEXT: v_alignbit_b32 v56, s15, v53, 16 -; SI-NEXT: v_alignbit_b32 v47, s14, v54, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v46, s13, v55, 16 -; SI-NEXT: v_alignbit_b32 v45, s12, v40, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v44, s10, v41, 16 -; SI-NEXT: v_alignbit_b32 v43, s8, v42, 16 -; SI-NEXT: v_alignbit_b32 v42, s36, v28, 16 -; SI-NEXT: v_alignbit_b32 v41, s53, v60, 16 -; SI-NEXT: v_alignbit_b32 v40, s94, v31, 16 -; SI-NEXT: v_alignbit_b32 v55, s49, v61, 16 -; SI-NEXT: v_alignbit_b32 v54, s48, v6, 16 -; SI-NEXT: v_alignbit_b32 v53, s11, v14, 16 -; SI-NEXT: v_mov_b32_e32 v14, v22 -; SI-NEXT: v_alignbit_b32 v52, s9, v18, 16 -; SI-NEXT: v_mov_b32_e32 v18, v23 -; SI-NEXT: v_alignbit_b32 v51, s16, v19, 16 -; SI-NEXT: v_mov_b32_e32 v19, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: s_lshr_b32 s73, s63, 16 -; SI-NEXT: s_lshr_b32 s72, s5, 16 -; SI-NEXT: s_lshr_b32 s63, s41, 16 -; SI-NEXT: s_lshr_b32 s62, s43, 16 -; SI-NEXT: s_lshr_b32 s61, s44, 16 -; SI-NEXT: s_lshr_b32 s60, s45, 16 -; SI-NEXT: s_lshr_b32 s59, s46, 16 -; SI-NEXT: s_lshr_b32 s58, s47, 16 -; SI-NEXT: s_lshr_b32 s57, s56, 16 -; SI-NEXT: s_lshr_b32 s56, vcc_lo, 16 -; SI-NEXT: 
s_lshr_b32 s47, vcc_hi, 16 -; SI-NEXT: s_lshr_b32 s46, s74, 16 -; SI-NEXT: v_readlane_b32 s25, v62, 62 -; SI-NEXT: v_readlane_b32 s74, v62, 61 -; SI-NEXT: s_lshr_b32 s45, s75, 16 -; SI-NEXT: s_lshr_b32 s44, s76, 16 -; SI-NEXT: s_mov_b32 s76, s37 -; SI-NEXT: s_mov_b32 s37, s97 -; SI-NEXT: s_mov_b32 s97, s21 -; SI-NEXT: s_mov_b32 s21, s18 -; SI-NEXT: s_mov_b32 s18, s17 -; SI-NEXT: s_mov_b32 s85, s87 -; SI-NEXT: s_mov_b32 s87, s30 -; SI-NEXT: s_mov_b32 s17, s24 -; SI-NEXT: s_lshr_b32 s43, s77, 16 -; SI-NEXT: s_mov_b32 s77, s35 -; SI-NEXT: s_mov_b32 s35, s64 -; SI-NEXT: s_mov_b32 s64, s55 -; SI-NEXT: s_mov_b32 s55, s83 -; SI-NEXT: s_mov_b32 s83, s99 -; SI-NEXT: s_mov_b32 s99, s89 -; SI-NEXT: s_mov_b32 s89, s39 -; SI-NEXT: s_mov_b32 s39, s95 -; SI-NEXT: s_mov_b32 s95, s6 -; SI-NEXT: s_lshr_b32 s41, s78, 16 -; SI-NEXT: s_mov_b32 s24, s96 -; SI-NEXT: s_mov_b32 s96, s4 -; SI-NEXT: s_cbranch_execnz .LBB97_3 -; SI-NEXT: .LBB97_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v36 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_mov_b32_e32 v6, v5 -; SI-NEXT: v_mov_b32_e32 v5, v27 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: s_add_i32 s4, s88, 3 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s79, 8 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: v_readlane_b32 s4, v62, 11 -; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 10 -; SI-NEXT: v_readlane_b32 s6, v62, 9 +; SI-NEXT: v_readlane_b32 s4, v43, 13 +; SI-NEXT: v_readlane_b32 s5, v43, 12 ; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s8, s6, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 8 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_add_i32 s70, s70, 3 -; SI-NEXT: v_readlane_b32 s6, v62, 30 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s5, s70, 0xff -; SI-NEXT: s_lshl_b32 s8, s6, 8 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v48 -; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v2, s5, v2 -; SI-NEXT: s_add_i32 s5, s35, 3 -; SI-NEXT: v_readlane_b32 s6, v62, 13 -; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: s_lshl_b32 s8, s6, 8 -; SI-NEXT: s_add_i32 s9, s77, 3 -; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: v_readlane_b32 s6, v62, 12 -; SI-NEXT: s_and_b32 s9, s9, 0xff -; SI-NEXT: s_lshl_b32 s8, s6, 24 -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_addk_i32 s5, 0x300 -; SI-NEXT: s_or_b32 s8, s8, s9 -; SI-NEXT: s_and_b32 s5, s5, 0xffff -; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: s_add_i32 s79, s92, 3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v1 -; SI-NEXT: s_add_i32 s16, s4, 0x3000000 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x3000000, v2 -; SI-NEXT: s_add_i32 s9, s5, 0x3000000 -; SI-NEXT: s_and_b32 s4, s79, 0xff -; SI-NEXT: s_lshl_b32 s5, s7, 8 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 
v2, 0xff, v2 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v2, v49, v2 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_add_i32 s4, s24, 3 +; SI-NEXT: s_or_b32 s29, s4, s5 +; SI-NEXT: v_readlane_b32 s4, v43, 5 +; SI-NEXT: v_readlane_b32 s5, v43, 4 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s55, 8 -; SI-NEXT: s_add_i32 s8, s37, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s51, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s52, s98, 3 -; SI-NEXT: s_add_i32 s11, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s52, 0xff -; SI-NEXT: s_lshl_b32 s5, s67, 8 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v38 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v4, v20, v4 -; SI-NEXT: s_add_i32 s30, s87, 3 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s30, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_add_i32 s8, s68, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s28, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s48, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 42 -; SI-NEXT: v_mov_b32_e32 v22, v30 -; SI-NEXT: s_add_i32 s87, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 39 -; SI-NEXT: s_and_b32 s4, s87, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v22 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v7, v35, v7 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: v_readlane_b32 s4, v62, 32 -; SI-NEXT: s_add_i32 s67, s4, 3 -; SI-NEXT: s_and_b32 s4, s67, 0xff -; SI-NEXT: s_lshl_b32 s5, s85, 8 -; SI-NEXT: s_add_i32 s8, s69, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s74, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s50, s90, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 23 -; SI-NEXT: s_add_i32 s49, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s50, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_add_i32 s94, s86, 3 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s94, 0xff -; SI-NEXT: s_lshl_b32 s5, s38, 8 -; SI-NEXT: s_add_i32 s8, s71, 3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s81, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: 
s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s94, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 52 -; SI-NEXT: s_add_i32 s18, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 51 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v9, v24, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: v_readlane_b32 s4, v62, 45 -; SI-NEXT: s_add_i32 s98, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 43 -; SI-NEXT: v_readlane_b32 s6, v62, 14 -; SI-NEXT: s_and_b32 s4, s98, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s8, s6, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s8, s8, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s31, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s53, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 41 -; SI-NEXT: s_add_i32 s86, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 38 -; SI-NEXT: s_and_b32 s4, s86, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v10, v59, v10 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: v_readlane_b32 s4, v62, 31 -; SI-NEXT: s_add_i32 s66, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 27 -; SI-NEXT: s_and_b32 s4, s66, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s37, s39, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s8, s37, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s95, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s36, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 50 -; SI-NEXT: s_add_i32 s21, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 49 -; SI-NEXT: s_and_b32 s4, s21, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_readlane_b32 s5, v62, 37 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_add_i32 s71, s22, 3 -; SI-NEXT: s_and_b32 s8, s71, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_add_i32 s35, s99, 3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v5 -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, v32, v11 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: v_readlane_b32 s4, v62, 40 -; SI-NEXT: s_add_i32 s85, s4, 3 -; SI-NEXT: s_and_b32 s4, s85, 0xff -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 33 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s8, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 54 -; SI-NEXT: s_add_i32 s17, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 53 -; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, 
s5, 8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_readlane_b32 s5, v62, 47 -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_and_b32 s6, s35, 0xff -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_mov_b32_e32 v30, s16 -; SI-NEXT: v_mov_b32_e32 v39, s9 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x3000000, v2 -; SI-NEXT: v_mov_b32_e32 v28, s11 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v4 -; SI-NEXT: v_mov_b32_e32 v27, s48 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v7 -; SI-NEXT: v_mov_b32_e32 v26, s49 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v8 -; SI-NEXT: v_mov_b32_e32 v25, s94 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v9 -; SI-NEXT: v_mov_b32_e32 v24, s53 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v10 -; SI-NEXT: v_mov_b32_e32 v23, s36 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v11 -; SI-NEXT: v_mov_b32_e32 v22, s8 -; SI-NEXT: v_alignbit_b32 v43, v22, v11, 16 -; SI-NEXT: v_alignbit_b32 v42, v23, v10, 16 -; SI-NEXT: v_alignbit_b32 v41, v24, v9, 16 -; SI-NEXT: v_alignbit_b32 v40, v25, v8, 16 -; SI-NEXT: v_alignbit_b32 v55, v26, v7, 16 -; SI-NEXT: v_alignbit_b32 v54, v27, v4, 16 -; SI-NEXT: v_alignbit_b32 v53, v28, v2, 16 -; SI-NEXT: v_alignbit_b32 v52, v39, v1, 16 -; SI-NEXT: v_alignbit_b32 v51, v30, v3, 16 -; SI-NEXT: s_lshr_b32 s58, s8, 16 -; SI-NEXT: s_lshr_b32 s57, s36, 16 -; SI-NEXT: s_lshr_b32 s56, s53, 16 -; SI-NEXT: s_lshr_b32 s47, s94, 16 -; SI-NEXT: s_lshr_b32 s46, s49, 16 -; SI-NEXT: s_lshr_b32 s45, s48, 16 -; SI-NEXT: s_lshr_b32 s44, s11, 16 -; SI-NEXT: s_lshr_b32 s43, s9, 16 -; SI-NEXT: s_lshr_b32 s41, s16, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v5, v12, v5 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 3, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s4, v62, 48 -; SI-NEXT: s_add_i32 s7, s4, 3 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s43, 0xff +; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s56, 0xff +; SI-NEXT: s_lshl_b32 s5, s47, 8 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s58, 0xff +; SI-NEXT: s_lshl_b32 s5, s57, 8 +; SI-NEXT: s_or_b32 s56, s4, s5 +; SI-NEXT: s_and_b32 s4, s60, 0xff +; SI-NEXT: s_lshl_b32 s5, s59, 8 +; SI-NEXT: s_or_b32 s57, s4, s5 +; SI-NEXT: s_and_b32 s4, s62, 0xff +; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: s_or_b32 s58, s4, s5 +; SI-NEXT: s_and_b32 s4, s74, 0xff +; SI-NEXT: s_lshl_b32 s5, s63, 8 +; SI-NEXT: s_or_b32 s59, s4, s5 +; SI-NEXT: s_and_b32 s4, s73, 0xff +; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: s_or_b32 s60, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: s_or_b32 s61, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s62, s4, s5 +; SI-NEXT: s_and_b32 s4, s36, 0xff +; SI-NEXT: s_lshl_b32 s5, s16, 8 +; SI-NEXT: s_or_b32 s63, s4, s5 +; SI-NEXT: s_and_b32 s4, s10, 0xff +; SI-NEXT: s_lshl_b32 s5, s9, 8 +; SI-NEXT: s_or_b32 s72, s4, s5 ; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s89, 24 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_or_b32 s4, s5, s4 -; 
SI-NEXT: s_add_i32 s10, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 46 -; SI-NEXT: s_add_i32 s99, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 44 -; SI-NEXT: s_and_b32 s4, s99, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v19, v5 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_readlane_b32 s4, v62, 36 -; SI-NEXT: s_add_i32 s81, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 35 -; SI-NEXT: v_readlane_b32 s6, v62, 28 -; SI-NEXT: s_and_b32 s4, s81, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s55, s6, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 26 -; SI-NEXT: s_and_b32 s6, s55, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s12, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 34 -; SI-NEXT: s_add_i32 s69, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 29 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v5 -; SI-NEXT: s_and_b32 s4, s69, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v29 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v18, v5 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_readlane_b32 s4, v62, 22 -; SI-NEXT: s_add_i32 s34, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 21 -; SI-NEXT: v_readlane_b32 s6, v62, 19 -; SI-NEXT: s_and_b32 s4, s34, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s92, s6, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s92, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s97, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s13, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 25 -; SI-NEXT: s_add_i32 s51, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 24 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x3000000, v5 -; SI-NEXT: s_and_b32 s4, s51, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v21 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v14, v5 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: v_readlane_b32 s4, v62, 20 -; SI-NEXT: s_add_i32 s95, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 18 -; SI-NEXT: s_and_b32 s4, s95, 0xff +; SI-NEXT: s_lshl_b32 s5, s78, 8 +; SI-NEXT: s_or_b32 s73, s4, s5 +; SI-NEXT: s_and_b32 s4, s8, 0xff +; SI-NEXT: s_lshl_b32 s5, s91, 8 +; SI-NEXT: s_or_b32 s74, s4, s5 +; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_or_b32 s75, s4, s5 +; SI-NEXT: v_readlane_b32 s4, v43, 9 +; SI-NEXT: v_readlane_b32 s5, v43, 8 +; SI-NEXT: s_and_b32 s4, s4, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s6, s96, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s80, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_or_b32 s4, s5, 
s4 -; SI-NEXT: s_add_i32 s14, s4, 0x3000000 -; SI-NEXT: s_add_i32 s4, s93, 3 +; SI-NEXT: s_or_b32 s5, s4, s5 +; SI-NEXT: v_readlane_b32 s4, v43, 7 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s84, 8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v6, v6, v13 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_add_i32 s4, s83, 3 +; SI-NEXT: v_readlane_b32 s6, v43, 6 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_or_b32 s7, s6, s4 +; SI-NEXT: v_readlane_b32 s4, v43, 11 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_add_i32 s6, s64, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 15 +; SI-NEXT: v_readlane_b32 s6, v43, 10 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: v_writelane_b32 v42, s7, 51 +; SI-NEXT: s_or_b32 s4, s6, s4 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_readlane_b32 s6, v43, 1 +; SI-NEXT: v_readlane_b32 s7, v43, 0 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s5, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_or_b32 s7, s6, s7 +; SI-NEXT: s_and_b32 s6, s11, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s15, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 2 -; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 1 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s6, s26, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_lshl_b32 s8, s12, 24 +; SI-NEXT: s_or_b32 s37, s8, s6 +; SI-NEXT: v_readlane_b32 s6, v43, 3 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: v_readlane_b32 s8, v43, 2 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s17, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 0 +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_or_b32 s6, s8, s6 +; SI-NEXT: s_and_b32 s8, s18, 0xff +; SI-NEXT: s_lshl_b32 s9, s15, 8 +; SI-NEXT: s_or_b32 s9, s8, s9 +; SI-NEXT: s_and_b32 s8, s13, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s10, s14, 24 +; SI-NEXT: s_or_b32 s68, s10, s8 +; SI-NEXT: s_and_b32 s8, s21, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s10, s22, 24 +; SI-NEXT: s_or_b32 s8, s10, s8 +; SI-NEXT: s_and_b32 s10, s77, 0xff +; SI-NEXT: s_lshl_b32 s11, s76, 8 +; SI-NEXT: s_or_b32 s11, s10, s11 +; SI-NEXT: s_and_b32 s10, s40, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s12, s41, 24 +; SI-NEXT: s_or_b32 s99, s12, s10 +; SI-NEXT: s_and_b32 s10, vcc_lo, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s12, vcc_hi, 24 +; SI-NEXT: s_or_b32 s10, s12, s10 +; SI-NEXT: s_and_b32 s12, s49, 0xff +; SI-NEXT: s_lshl_b32 s13, s48, 8 +; SI-NEXT: s_or_b32 s13, s12, s13 +; SI-NEXT: s_and_b32 s12, s38, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s14, s39, 24 +; SI-NEXT: s_or_b32 s92, s14, s12 +; SI-NEXT: s_and_b32 s12, s50, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s14, s51, 24 +; SI-NEXT: s_or_b32 s12, s14, s12 +; SI-NEXT: s_and_b32 
s14, s55, 0xff +; SI-NEXT: s_lshl_b32 s15, s54, 8 +; SI-NEXT: s_or_b32 s15, s14, s15 +; SI-NEXT: s_and_b32 s14, s52, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s24, s53, 24 +; SI-NEXT: s_mov_b32 s28, s90 +; SI-NEXT: s_or_b32 s90, s24, s14 +; SI-NEXT: s_and_b32 s14, s64, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s25, s65, 24 +; SI-NEXT: s_or_b32 s14, s25, s14 +; SI-NEXT: s_and_b32 s25, s34, 0xff +; SI-NEXT: s_lshl_b32 s40, s86, 8 +; SI-NEXT: s_or_b32 s41, s25, s40 +; SI-NEXT: s_and_b32 s25, s80, 0xff +; SI-NEXT: s_lshl_b32 s25, s25, 16 +; SI-NEXT: s_lshl_b32 s40, s81, 24 +; SI-NEXT: s_or_b32 s18, s40, s25 +; SI-NEXT: s_and_b32 s40, s31, 0xff +; SI-NEXT: s_lshl_b32 s40, s40, 16 +; SI-NEXT: s_lshl_b32 s42, s84, 24 +; SI-NEXT: s_or_b32 s40, s42, s40 +; SI-NEXT: s_and_b32 s42, s35, 0xff +; SI-NEXT: s_lshl_b32 s43, s97, 8 +; SI-NEXT: s_or_b32 s43, s42, s43 +; SI-NEXT: s_and_b32 s42, s71, 0xff +; SI-NEXT: s_lshl_b32 s42, s42, 16 +; SI-NEXT: s_lshl_b32 s76, s67, 24 +; SI-NEXT: s_or_b32 s35, s76, s42 +; SI-NEXT: s_and_b32 s42, s87, 0xff +; SI-NEXT: s_lshl_b32 s42, s42, 16 +; SI-NEXT: s_lshl_b32 s76, s83, 24 +; SI-NEXT: s_or_b32 s42, s76, s42 +; SI-NEXT: s_and_b32 s76, s19, 0xff +; SI-NEXT: s_lshl_b32 s77, s20, 8 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s66, 0xff +; SI-NEXT: v_writelane_b32 v42, s78, 52 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s85, 24 +; SI-NEXT: s_or_b32 s19, s78, s77 +; SI-NEXT: s_and_b32 s77, s94, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s27, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: s_or_b32 vcc_lo, s78, s77 +; SI-NEXT: s_or_b32 vcc_hi, s76, s19 +; SI-NEXT: s_and_b32 s76, s26, 0xff +; SI-NEXT: s_lshl_b32 s77, s23, 8 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s98, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s17, 24 +; SI-NEXT: s_or_b32 s71, s78, s77 +; SI-NEXT: s_and_b32 s77, s79, 0xff +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s17, v43, 40 +; SI-NEXT: s_and_b32 s41, s41, 0xffff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s88, 24 +; SI-NEXT: s_or_b32 s39, s76, s71 +; SI-NEXT: s_and_b32 s76, s17, 0xff +; SI-NEXT: v_readlane_b32 s17, v43, 39 +; SI-NEXT: s_or_b32 s41, s41, s18 +; SI-NEXT: s_mov_b32 s31, s18 +; SI-NEXT: s_or_b32 s38, s78, s77 +; SI-NEXT: s_lshl_b32 s77, s17, 8 +; SI-NEXT: v_readlane_b32 s18, v43, 38 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v43, 37 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s18, 24 +; SI-NEXT: s_or_b32 s80, s78, s77 +; SI-NEXT: s_and_b32 s77, s95, 0xff +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s17, v43, 36 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s93, 24 +; SI-NEXT: s_or_b32 s49, s76, s80 +; SI-NEXT: s_and_b32 s76, s17, 0xff +; SI-NEXT: v_readlane_b32 s17, v43, 35 +; SI-NEXT: s_or_b32 s48, s78, s77 +; SI-NEXT: s_lshl_b32 s77, s17, 8 +; SI-NEXT: v_readlane_b32 s17, v43, 34 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s17, 0xff +; SI-NEXT: v_readlane_b32 s17, v43, 33 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s17, 24 +; SI-NEXT: s_or_b32 s81, s78, s77 +; SI-NEXT: s_and_b32 s77, s30, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s69, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s17, v43, 31 +; SI-NEXT: s_or_b32 s50, s78, s77 +; 
SI-NEXT: s_or_b32 s51, s76, s81 +; SI-NEXT: s_and_b32 s76, s17, 0xff +; SI-NEXT: s_lshl_b32 s77, s96, 8 +; SI-NEXT: v_readlane_b32 s17, v43, 30 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s17, 0xff +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s82, 24 +; SI-NEXT: v_writelane_b32 v42, s96, 53 +; SI-NEXT: v_readlane_b32 s18, v43, 32 +; SI-NEXT: v_writelane_b32 v42, s82, 54 +; SI-NEXT: s_or_b32 s82, s78, s77 +; SI-NEXT: s_and_b32 s77, s18, 0xff +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s17, v43, 28 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s70, 24 +; SI-NEXT: s_or_b32 s53, s76, s82 +; SI-NEXT: s_and_b32 s76, s17, 0xff +; SI-NEXT: v_readlane_b32 s17, v43, 27 +; SI-NEXT: s_or_b32 s52, s78, s77 +; SI-NEXT: s_lshl_b32 s77, s17, 8 +; SI-NEXT: v_readlane_b32 s18, v43, 26 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s18, 0xff +; SI-NEXT: v_readlane_b32 s17, v43, 25 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s17, 24 +; SI-NEXT: v_writelane_b32 v42, s16, 55 +; SI-NEXT: s_or_b32 s16, s78, s77 +; SI-NEXT: s_and_b32 s77, s89, 0xff +; SI-NEXT: v_readlane_b32 s18, v43, 29 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s18, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s17, v43, 22 +; SI-NEXT: v_readlane_b32 s18, v43, 21 +; SI-NEXT: s_or_b32 s54, s78, s77 +; SI-NEXT: s_or_b32 s55, s76, s16 +; SI-NEXT: s_and_b32 s76, s17, 0xff +; SI-NEXT: s_lshl_b32 s77, s18, 8 +; SI-NEXT: v_readlane_b32 s17, v43, 20 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s17, 0xff +; SI-NEXT: v_readlane_b32 s17, v43, 19 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s17, 24 +; SI-NEXT: v_readlane_b32 s17, v43, 24 +; SI-NEXT: s_or_b32 s83, s78, s77 +; SI-NEXT: s_and_b32 s77, s17, 0xff +; SI-NEXT: v_readlane_b32 s17, v43, 23 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s17, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: v_readlane_b32 s17, v43, 17 +; SI-NEXT: v_readlane_b32 s18, v43, 16 +; SI-NEXT: s_or_b32 s64, s78, s77 +; SI-NEXT: s_or_b32 s65, s76, s83 +; SI-NEXT: s_and_b32 s76, s17, 0xff +; SI-NEXT: s_lshl_b32 s77, s18, 8 +; SI-NEXT: v_readlane_b32 s18, v43, 15 +; SI-NEXT: s_or_b32 s76, s76, s77 +; SI-NEXT: s_and_b32 s77, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v43, 14 +; SI-NEXT: v_writelane_b32 v42, s89, 56 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s18, 24 +; SI-NEXT: v_writelane_b32 v42, s70, 57 +; SI-NEXT: s_or_b32 s85, s78, s77 +; SI-NEXT: s_and_b32 s77, s44, 0xff +; SI-NEXT: v_readlane_b32 s18, v43, 18 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_and_b32 s43, s43, 0xffff +; SI-NEXT: v_writelane_b32 v42, s69, 58 +; SI-NEXT: s_lshl_b32 s77, s77, 16 +; SI-NEXT: s_lshl_b32 s78, s18, 24 +; SI-NEXT: s_and_b32 s76, s76, 0xffff +; SI-NEXT: s_and_b32 s44, s29, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s37 +; SI-NEXT: s_or_b32 s9, s9, s68 +; SI-NEXT: s_or_b32 s11, s11, s99 +; SI-NEXT: s_or_b32 s13, s13, s92 +; SI-NEXT: s_or_b32 s15, s15, s90 +; SI-NEXT: s_or_b32 s43, s43, s35 +; SI-NEXT: v_writelane_b32 v42, s30, 59 +; SI-NEXT: s_mov_b32 s23, s91 +; SI-NEXT: s_mov_b32 s91, s36 +; SI-NEXT: s_or_b32 s66, s78, s77 +; SI-NEXT: s_or_b32 s67, s76, s85 +; SI-NEXT: s_and_b32 s45, s45, 0xffff +; SI-NEXT: s_and_b32 s46, s46, 0xffff +; SI-NEXT: s_and_b32 
s47, s47, 0xffff +; SI-NEXT: s_and_b32 s56, s56, 0xffff +; SI-NEXT: s_and_b32 s57, s57, 0xffff +; SI-NEXT: s_and_b32 s30, s58, 0xffff +; SI-NEXT: s_and_b32 s34, s59, 0xffff +; SI-NEXT: s_and_b32 s36, s60, 0xffff +; SI-NEXT: s_and_b32 s97, s61, 0xffff +; SI-NEXT: s_and_b32 s86, s62, 0xffff +; SI-NEXT: s_and_b32 s98, s63, 0xffff +; SI-NEXT: s_and_b32 s17, s72, 0xffff +; SI-NEXT: s_and_b32 s87, s73, 0xffff +; SI-NEXT: s_and_b32 s96, s74, 0xffff +; SI-NEXT: s_and_b32 s22, s75, 0xffff +; SI-NEXT: s_or_b32 s74, s44, s4 +; SI-NEXT: s_mov_b32 s75, s5 +; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 +; SI-NEXT: s_mov_b32 s70, s93 +; SI-NEXT: s_mov_b32 s69, s95 +; SI-NEXT: s_mov_b32 s93, s28 +; SI-NEXT: s_or_b32 s72, s45, s6 +; SI-NEXT: s_mov_b32 s73, s7 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 16 +; SI-NEXT: s_or_b32 s62, s46, s8 +; SI-NEXT: s_mov_b32 s63, s9 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_or_b32 s60, s47, s10 +; SI-NEXT: s_mov_b32 s61, s11 +; SI-NEXT: s_lshr_b64 s[88:89], s[10:11], 16 +; SI-NEXT: s_or_b32 s58, s56, s12 +; SI-NEXT: s_mov_b32 s59, s13 +; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 16 +; SI-NEXT: s_or_b32 s56, s57, s14 +; SI-NEXT: s_mov_b32 s57, s15 +; SI-NEXT: s_lshr_b64 s[24:25], s[14:15], 16 +; SI-NEXT: s_or_b32 s46, s30, s40 +; SI-NEXT: s_mov_b32 s47, s41 +; SI-NEXT: s_or_b32 s44, s34, s42 +; SI-NEXT: s_mov_b32 s34, s4 +; SI-NEXT: s_mov_b32 s45, s43 +; SI-NEXT: s_lshr_b64 s[94:95], s[42:43], 16 +; SI-NEXT: s_or_b32 s42, s36, vcc_lo +; SI-NEXT: s_mov_b32 s43, vcc_hi +; SI-NEXT: s_lshr_b64 vcc, vcc, 16 +; SI-NEXT: s_or_b32 s40, s97, s38 +; SI-NEXT: s_mov_b32 s41, s39 +; SI-NEXT: s_lshr_b64 s[38:39], s[38:39], 16 +; SI-NEXT: s_or_b32 s14, s86, s48 +; SI-NEXT: s_mov_b32 s15, s49 +; SI-NEXT: s_lshr_b64 s[48:49], s[48:49], 16 +; SI-NEXT: s_or_b32 s12, s98, s50 +; SI-NEXT: s_mov_b32 s13, s51 +; SI-NEXT: s_lshr_b64 s[50:51], s[50:51], 16 +; SI-NEXT: s_or_b32 s10, s17, s52 +; SI-NEXT: s_mov_b32 s11, s53 +; SI-NEXT: s_lshr_b64 s[52:53], s[52:53], 16 +; SI-NEXT: s_or_b32 s8, s87, s54 +; SI-NEXT: s_mov_b32 s9, s55 +; SI-NEXT: s_lshr_b64 s[54:55], s[54:55], 16 +; SI-NEXT: s_or_b32 s6, s96, s64 +; SI-NEXT: s_mov_b32 s7, s65 +; SI-NEXT: s_lshr_b64 s[64:65], s[64:65], 16 +; SI-NEXT: s_or_b32 s4, s22, s66 +; SI-NEXT: s_mov_b32 s5, s67 +; SI-NEXT: s_lshr_b64 s[66:67], s[66:67], 16 +; SI-NEXT: v_readlane_b32 s17, v42, 51 +; SI-NEXT: s_lshr_b32 s55, s17, 16 +; SI-NEXT: s_lshr_b32 s53, s37, 16 +; SI-NEXT: s_lshr_b32 s51, s68, 16 +; SI-NEXT: s_lshr_b32 s49, s99, 16 +; SI-NEXT: s_lshr_b32 s86, s92, 16 +; SI-NEXT: s_lshr_b32 s39, s90, 16 +; SI-NEXT: s_lshr_b32 s18, s31, 16 +; SI-NEXT: s_lshr_b32 s22, s35, 16 +; SI-NEXT: s_lshr_b32 s97, s19, 16 +; SI-NEXT: s_lshr_b32 s65, s71, 16 +; SI-NEXT: s_lshr_b32 s19, s80, 16 +; SI-NEXT: s_lshr_b32 s71, s81, 16 +; SI-NEXT: s_lshr_b32 s67, s82, 16 +; SI-NEXT: v_readlane_b32 s82, v42, 54 +; SI-NEXT: v_readlane_b32 s96, v42, 53 +; SI-NEXT: s_lshr_b32 s80, s16, 16 +; SI-NEXT: v_readlane_b32 s16, v42, 55 +; SI-NEXT: s_lshr_b32 s81, s83, 16 +; SI-NEXT: s_mov_b32 s90, s93 +; SI-NEXT: v_readlane_b32 s78, v42, 52 +; SI-NEXT: s_mov_b32 s95, s69 +; SI-NEXT: s_mov_b32 s93, s70 +; SI-NEXT: v_readlane_b32 s30, v42, 59 +; SI-NEXT: v_readlane_b32 s69, v42, 58 +; SI-NEXT: v_readlane_b32 s70, v42, 57 +; SI-NEXT: v_readlane_b32 s89, v42, 56 +; SI-NEXT: s_lshr_b32 s77, s85, 16 +; SI-NEXT: s_mov_b32 s84, vcc_lo +; SI-NEXT: s_mov_b32 s36, s91 +; SI-NEXT: s_mov_b32 s91, s23 +; SI-NEXT: s_cbranch_execnz .LBB97_3 +; SI-NEXT: 
.LBB97_2: ; %cmp.true +; SI-NEXT: v_readlane_b32 s4, v43, 42 ; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: v_readlane_b32 s6, v43, 41 ; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_add_i32 s6, s76, 3 +; SI-NEXT: s_lshl_b32 s5, s90, 8 +; SI-NEXT: s_add_i32 s6, s6, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 16 +; SI-NEXT: v_readlane_b32 s5, v43, 18 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s5, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s40, s4, 0x3000000 -; SI-NEXT: v_readlane_b32 s4, v62, 7 -; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 17 -; SI-NEXT: v_readlane_b32 s6, v62, 6 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s6, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s18, s4, 0x3000000 -; SI-NEXT: s_add_i32 s4, s20, 3 -; SI-NEXT: v_readlane_b32 s5, v62, 5 -; SI-NEXT: v_readlane_b32 s6, v62, 4 -; SI-NEXT: s_and_b32 s4, s4, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 +; SI-NEXT: v_readlane_b32 s5, v43, 17 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: v_readlane_b32 s6, v43, 16 +; SI-NEXT: v_readlane_b32 s7, v43, 15 +; SI-NEXT: s_and_b32 s5, s5, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_readlane_b32 s6, v43, 14 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_readlane_b32 s6, v43, 44 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 3 +; SI-NEXT: v_readlane_b32 s8, v43, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s5, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s42, s4, 0x3000000 -; SI-NEXT: v_mov_b32_e32 v13, s18 -; SI-NEXT: v_mov_b32_e32 v20, s10 -; SI-NEXT: v_mov_b32_e32 v19, s12 -; SI-NEXT: v_mov_b32_e32 v18, s13 -; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v5 -; SI-NEXT: v_mov_b32_e32 v5, s14 -; SI-NEXT: v_add_i32_e32 v50, vcc, 0x3000000, v6 -; SI-NEXT: v_mov_b32_e32 v6, s15 -; SI-NEXT: v_alignbit_b32 v57, s42, v13, 16 -; SI-NEXT: v_mov_b32_e32 v13, s17 -; SI-NEXT: v_alignbit_b32 v58, s40, v13, 16 -; SI-NEXT: v_alignbit_b32 v56, v6, v50, 16 -; SI-NEXT: v_alignbit_b32 v47, v5, v17, 16 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v46, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v45, v19, v15, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v44, v20, v12, 16 -; SI-NEXT: s_lshr_b32 s73, s42, 16 -; SI-NEXT: s_lshr_b32 s72, s40, 16 -; SI-NEXT: s_lshr_b32 s63, s15, 16 -; SI-NEXT: s_lshr_b32 s62, s14, 16 -; SI-NEXT: s_lshr_b32 s61, s13, 16 -; SI-NEXT: s_lshr_b32 s60, s12, 16 -; SI-NEXT: s_lshr_b32 s59, s10, 16 +; SI-NEXT: s_lshl_b32 s7, s91, 8 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: 
v_readlane_b32 s7, v43, 23 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_readlane_b32 s7, v43, 22 +; SI-NEXT: s_add_i32 s7, s7, 3 +; SI-NEXT: v_readlane_b32 s8, v43, 21 +; SI-NEXT: v_readlane_b32 s9, v43, 20 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_readlane_b32 s8, v43, 19 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_readlane_b32 s8, v43, 43 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s9, s78, 8 +; SI-NEXT: s_add_i32 s10, s89, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_readlane_b32 s9, v43, 29 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 24 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_addk_i32 s8, 0x300 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_readlane_b32 s9, v43, 28 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: v_readlane_b32 s10, v43, 27 +; SI-NEXT: v_readlane_b32 s11, v43, 26 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: v_readlane_b32 s10, v43, 25 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 24 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_addk_i32 s9, 0x300 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: v_readlane_b32 s10, v43, 46 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: v_readlane_b32 s11, v43, 45 +; SI-NEXT: v_readlane_b32 s12, v43, 32 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 8 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s11, s70, 24 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_addk_i32 s10, 0x300 +; SI-NEXT: s_or_b32 s11, s11, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_readlane_b32 s11, v43, 31 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: v_readlane_b32 s13, v43, 30 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: s_lshl_b32 s12, s96, 8 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: s_lshl_b32 s12, s82, 24 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_addk_i32 s11, 0x300 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_add_i32 s12, s36, 3 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s13, s16, 8 +; SI-NEXT: s_add_i32 s14, s30, 3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s13, s69, 24 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_addk_i32 s12, 0x300 +; SI-NEXT: s_or_b32 s13, s13, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_readlane_b32 s13, v43, 36 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: v_readlane_b32 s14, v43, 35 +; SI-NEXT: v_readlane_b32 s15, v43, 34 +; SI-NEXT: s_and_b32 s13, s13, 0xff 
+; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: v_readlane_b32 s14, v43, 33 +; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 24 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_addk_i32 s13, 0x300 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: v_readlane_b32 s14, v42, 50 +; SI-NEXT: s_add_i32 s17, s14, 3 +; SI-NEXT: v_readlane_b32 s15, v42, 49 +; SI-NEXT: s_and_b32 s14, s17, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 8 +; SI-NEXT: s_add_i32 s16, s95, 3 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s15, s93, 24 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_addk_i32 s14, 0x300 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_readlane_b32 s15, v43, 40 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: v_readlane_b32 s16, v43, 39 +; SI-NEXT: v_readlane_b32 s17, v43, 38 +; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_readlane_b32 s16, v43, 37 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_addk_i32 s15, 0x300 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_readlane_b32 s16, v42, 48 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 47 +; SI-NEXT: v_readlane_b32 s18, v42, 42 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s99, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 43 +; SI-NEXT: s_and_b32 s18, s99, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 38 +; SI-NEXT: s_add_i32 s87, s17, 3 +; SI-NEXT: v_readlane_b32 s18, v42, 35 +; SI-NEXT: v_readlane_b32 s19, v42, 29 +; SI-NEXT: s_and_b32 s17, s87, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: s_add_i32 s23, s19, 3 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_readlane_b32 s18, v42, 30 +; SI-NEXT: s_and_b32 s23, s23, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s23, s23, 16 +; SI-NEXT: s_addk_i32 s17, 0x300 +; SI-NEXT: s_or_b32 s18, s18, s23 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_add_i32 s40, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 45 +; SI-NEXT: s_add_i32 s41, s17, 0x3000000 +; SI-NEXT: s_add_i32 s68, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 44 +; SI-NEXT: v_readlane_b32 s18, v42, 39 +; SI-NEXT: s_and_b32 s16, s68, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s96, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 40 +; SI-NEXT: s_and_b32 s18, s96, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 33 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: v_readlane_b32 s18, v42, 31 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 8 +; SI-NEXT: s_or_b32 s17, s18, 
s17 +; SI-NEXT: v_readlane_b32 s18, v42, 24 +; SI-NEXT: s_addk_i32 s17, 0x300 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s42, s16, 0x3000000 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: v_readlane_b32 s17, v42, 25 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s43, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 46 +; SI-NEXT: s_add_i32 s23, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 41 +; SI-NEXT: v_readlane_b32 s18, v42, 37 +; SI-NEXT: s_and_b32 s16, s23, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s86, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 36 +; SI-NEXT: s_and_b32 s18, s86, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s44, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 28 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 22 +; SI-NEXT: v_readlane_b32 s18, v42, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 15 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s45, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 34 +; SI-NEXT: s_add_i32 s83, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 32 +; SI-NEXT: v_readlane_b32 s18, v42, 26 +; SI-NEXT: s_and_b32 s16, s83, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 27 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s46, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 23 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 21 +; SI-NEXT: v_readlane_b32 s18, v42, 17 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 18 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s47, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 20 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 19 +; SI-NEXT: v_readlane_b32 s18, v42, 13 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 14 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, 
s16 +; SI-NEXT: s_add_i32 s56, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 12 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 11 +; SI-NEXT: v_readlane_b32 s18, v42, 7 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 8 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s57, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 10 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 9 +; SI-NEXT: v_readlane_b32 s18, v42, 5 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 6 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s58, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 3 +; SI-NEXT: v_readlane_b32 s18, v43, 63 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v42, 0 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s59, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v42, 2 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v42, 1 +; SI-NEXT: v_readlane_b32 s18, v43, 61 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 62 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s60, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 60 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 59 +; SI-NEXT: v_readlane_b32 s18, v43, 55 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 56 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s61, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 58 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 57 +; SI-NEXT: v_readlane_b32 s18, v43, 53 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 54 +; SI-NEXT: s_and_b32 s18, s18, 0xff 
+; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s62, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 52 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 51 +; SI-NEXT: v_readlane_b32 s18, v43, 49 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 50 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s63, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 5 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 4 +; SI-NEXT: v_readlane_b32 s18, v43, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 2 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s72, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 1 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 0 +; SI-NEXT: v_readlane_b32 s18, v43, 47 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 48 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s73, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 13 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 12 +; SI-NEXT: v_readlane_b32 s18, v43, 11 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 10 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s74, s16, 0x3000000 +; SI-NEXT: v_readlane_b32 s16, v43, 9 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v43, 8 +; SI-NEXT: v_readlane_b32 s18, v43, 7 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readlane_b32 s17, v43, 6 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_addk_i32 s16, 0x300 +; SI-NEXT: s_lshl_b32 s17, s17, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_add_i32 s8, s8, 0x3000000 +; 
SI-NEXT: s_add_i32 s9, s9, 0x3000000 +; SI-NEXT: s_add_i32 s10, s10, 0x3000000 +; SI-NEXT: s_add_i32 s11, s11, 0x3000000 +; SI-NEXT: s_add_i32 s12, s12, 0x3000000 +; SI-NEXT: s_add_i32 s13, s13, 0x3000000 +; SI-NEXT: s_add_i32 s14, s14, 0x3000000 +; SI-NEXT: s_add_i32 s15, s15, 0x3000000 +; SI-NEXT: s_add_i32 s75, s16, 0x3000000 +; SI-NEXT: s_lshr_b64 s[76:77], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[50:51], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[64:65], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[72:73], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[60:61], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[24:25], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[46:47], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[84:85], s[42:43], 16 +; SI-NEXT: s_lshr_b32 s55, s75, 16 +; SI-NEXT: s_lshr_b32 s53, s73, 16 +; SI-NEXT: s_lshr_b32 s51, s63, 16 +; SI-NEXT: s_lshr_b32 s49, s61, 16 +; SI-NEXT: s_lshr_b32 s86, s59, 16 +; SI-NEXT: s_lshr_b32 s39, s57, 16 +; SI-NEXT: s_lshr_b32 s18, s47, 16 +; SI-NEXT: s_lshr_b32 s22, s45, 16 +; SI-NEXT: s_lshr_b32 s97, s43, 16 +; SI-NEXT: s_lshr_b32 s65, s41, 16 +; SI-NEXT: s_lshr_b32 s19, s15, 16 +; SI-NEXT: s_lshr_b32 s71, s13, 16 +; SI-NEXT: s_lshr_b32 s67, s11, 16 +; SI-NEXT: s_lshr_b32 s80, s9, 16 +; SI-NEXT: s_lshr_b32 s81, s7, 16 +; SI-NEXT: s_lshr_b32 s77, s5, 16 ; SI-NEXT: .LBB97_3: ; %end -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s42, 0xffff -; SI-NEXT: s_lshl_b32 s5, s73, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s16, s74, 0xffff +; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: s_and_b32 s16, s75, 0xffff +; SI-NEXT: s_lshl_b32 s17, s55, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s72, 0xffff +; SI-NEXT: s_lshl_b32 s17, s26, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v58 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s40, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s73, 0xffff +; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 12, v0 -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v50 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s62, 0xffff +; SI-NEXT: s_lshl_b32 s17, s28, 16 +; 
SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v56 -; SI-NEXT: s_and_b32 s4, s15, 0xffff -; SI-NEXT: s_lshl_b32 s5, s63, 16 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s63, 0xffff +; SI-NEXT: s_lshl_b32 s17, s51, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v17 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s60, 0xffff +; SI-NEXT: s_lshl_b32 s17, s88, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v47 -; SI-NEXT: s_and_b32 s4, s14, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s61, 0xffff +; SI-NEXT: s_lshl_b32 s17, s49, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s58, 0xffff +; SI-NEXT: s_lshl_b32 s17, s20, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v46 -; SI-NEXT: s_and_b32 s4, s13, 0xffff -; SI-NEXT: s_lshl_b32 s5, s61, 16 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s59, 0xffff +; SI-NEXT: s_lshl_b32 s17, s86, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s56, 0xffff +; SI-NEXT: s_lshl_b32 s17, s24, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v45 -; SI-NEXT: s_and_b32 s4, s12, 0xffff -; SI-NEXT: s_lshl_b32 s5, s60, 16 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s57, 0xffff +; SI-NEXT: s_lshl_b32 s17, s39, 16 +; SI-NEXT: 
v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s46, 0xffff +; SI-NEXT: s_lshl_b32 s17, s34, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v44 -; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s59, 16 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s47, 0xffff +; SI-NEXT: s_lshl_b32 s17, s18, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v11 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s44, 0xffff +; SI-NEXT: s_lshl_b32 s17, s94, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v43 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s58, 16 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s45, 0xffff +; SI-NEXT: s_lshl_b32 s17, s22, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s42, 0xffff +; SI-NEXT: s_lshl_b32 s17, s84, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v42 -; SI-NEXT: s_and_b32 s4, s36, 0xffff -; SI-NEXT: s_lshl_b32 s5, s57, 16 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s43, 0xffff +; SI-NEXT: s_lshl_b32 s17, s97, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xffff +; SI-NEXT: s_lshl_b32 s17, s38, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen 
; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v41 -; SI-NEXT: s_and_b32 s4, s53, 0xffff -; SI-NEXT: s_lshl_b32 s5, s56, 16 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xffff +; SI-NEXT: s_lshl_b32 s17, s65, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s48, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v40 -; SI-NEXT: s_and_b32 s4, s94, 0xffff -; SI-NEXT: s_lshl_b32 s5, s47, 16 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff +; SI-NEXT: s_lshl_b32 s15, s19, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v7 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s50, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v55 -; SI-NEXT: s_and_b32 s4, s49, 0xffff -; SI-NEXT: s_lshl_b32 s5, s46, 16 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_lshl_b32 s13, s71, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v54 -; SI-NEXT: s_and_b32 s4, s48, 0xffff -; SI-NEXT: s_lshl_b32 s5, s45, 16 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s52, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x64, v0 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 -; SI-NEXT: 
s_and_b32 s4, s11, 0xffff -; SI-NEXT: s_lshl_b32 s5, s44, 16 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff +; SI-NEXT: s_lshl_b32 s11, s67, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: s_lshl_b32 s5, s43, 16 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s54, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s80, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s64, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s81, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s66, 16 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 -; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s41, 16 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v47, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s99, v63, 35 -; SI-NEXT: v_readlane_b32 s98, v63, 34 -; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: v_readlane_b32 s87, v63, 31 -; SI-NEXT: v_readlane_b32 s86, v63, 30 -; SI-NEXT: v_readlane_b32 s85, v63, 29 -; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: v_readlane_b32 s83, v63, 27 -; SI-NEXT: v_readlane_b32 s82, v63, 26 -; SI-NEXT: v_readlane_b32 s81, v63, 25 -; SI-NEXT: v_readlane_b32 s80, v63, 24 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s68, v63, 20 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s65, v63, 17 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s54, v63, 14 -; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s52, v63, 12 -; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: v_readlane_b32 s39, v63, 7 -; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s99, v41, 35 +; SI-NEXT: v_readlane_b32 s98, v41, 34 +; SI-NEXT: v_readlane_b32 s97, v41, 33 +; SI-NEXT: v_readlane_b32 s96, v41, 32 +; SI-NEXT: v_readlane_b32 s87, v41, 31 +; SI-NEXT: v_readlane_b32 s86, v41, 30 +; SI-NEXT: v_readlane_b32 s85, v41, 29 +; SI-NEXT: v_readlane_b32 s84, v41, 28 +; SI-NEXT: v_readlane_b32 s83, v41, 27 +; SI-NEXT: v_readlane_b32 s82, v41, 26 +; SI-NEXT: v_readlane_b32 s81, v41, 25 +; SI-NEXT: v_readlane_b32 s80, v41, 24 +; SI-NEXT: v_readlane_b32 s71, v41, 23 +; SI-NEXT: v_readlane_b32 s70, v41, 22 +; SI-NEXT: v_readlane_b32 s69, v41, 21 +; SI-NEXT: v_readlane_b32 s68, v41, 20 +; SI-NEXT: v_readlane_b32 s67, v41, 19 +; SI-NEXT: v_readlane_b32 s66, v41, 18 +; SI-NEXT: v_readlane_b32 s65, v41, 17 +; SI-NEXT: v_readlane_b32 s64, v41, 16 +; SI-NEXT: v_readlane_b32 s55, v41, 15 +; SI-NEXT: v_readlane_b32 s54, v41, 14 +; SI-NEXT: v_readlane_b32 s53, v41, 13 +; SI-NEXT: v_readlane_b32 s52, v41, 12 +; SI-NEXT: v_readlane_b32 s51, v41, 11 +; SI-NEXT: v_readlane_b32 s50, v41, 10 +; SI-NEXT: v_readlane_b32 s49, v41, 9 +; SI-NEXT: v_readlane_b32 s48, v41, 8 +; SI-NEXT: v_readlane_b32 s39, v41, 7 +; SI-NEXT: v_readlane_b32 s38, v41, 6 +; SI-NEXT: v_readlane_b32 s37, v41, 5 +; SI-NEXT: v_readlane_b32 s36, v41, 4 +; SI-NEXT: v_readlane_b32 s35, v41, 3 +; SI-NEXT: v_readlane_b32 s34, v41, 2 +; 
SI-NEXT: v_readlane_b32 s31, v41, 1 +; SI-NEXT: v_readlane_b32 s30, v41, 0 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: -; SI-NEXT: v_mov_b32_e32 v5, v13 -; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v25, v58 -; SI-NEXT: v_mov_b32_e32 v48, v39 -; SI-NEXT: v_mov_b32_e32 v39, v57 -; SI-NEXT: v_mov_b32_e32 v49, v56 -; SI-NEXT: v_mov_b32_e32 v20, v47 -; SI-NEXT: v_mov_b32_e32 v30, v37 -; SI-NEXT: v_mov_b32_e32 v36, v35 -; SI-NEXT: v_mov_b32_e32 v35, v45 -; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v32, v23 -; SI-NEXT: ; implicit-def: $sgpr18 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $sgpr17 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr55 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr53 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr86 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $sgpr94 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $sgpr48 
-; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr18 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $sgpr16 -; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr97 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v128i8_to_v64i16_scalar: @@ -199508,22 +201431,22 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v24, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 ; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 @@ -199549,13 +201472,11 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v43, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 @@ -199564,49 +201485,46 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v27 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v29 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded 
Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v10 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v12 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v26 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v28 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v30 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v31 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v32 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v33 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v34 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v35 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v36 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v37 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v38 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:184 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:200 @@ -199615,34 +201533,35 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:232 ; VI-NEXT: 
buffer_load_ushort v7, off, s[0:3], s32 offset:240 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v22 ; VI-NEXT: v_lshlrev_b32_e32 v8, 8, v24 +; VI-NEXT: v_lshlrev_b32_e32 v10, 8, v26 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec ; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 ; VI-NEXT: v_lshlrev_b32_e32 v16, 8, v16 ; VI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 ; VI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v0 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v3 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v5 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v6 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v24, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v36, 8, v2 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:248 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:264 @@ -199652,131 +201571,155 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:296 ; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v26, 8, v0 -; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v2 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v4 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v2 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:312 ; VI-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 ; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 offset:328 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], 
s32 offset:4 -; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 ; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:36 +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: v_lshlrev_b32_e32 v54, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v4 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v5 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v6 -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 -; VI-NEXT: s_waitcnt vmcnt(10) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v6, 8, v0 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:68 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:84 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:92 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:100 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:108 -; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:116 -; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:124 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:132 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:140 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:148 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:156 -; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:164 -; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:172 -; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:180 -; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:188 -; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:196 -; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:204 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:212 -; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:220 -; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:228 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:236 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:244 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:252 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:260 -; VI-NEXT: s_waitcnt vmcnt(14) -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v0 +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:68 +; VI-NEXT: s_waitcnt vmcnt(10) +; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 
offset:268 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:76 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:284 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:292 -; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:300 -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:308 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:316 -; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:324 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v56, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill -; VI-NEXT: s_cbranch_scc0 .LBB97_4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:84 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:92 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:100 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:108 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v0, off, s[0:3], s32 offset:116 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:124 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:132 +; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:140 +; VI-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:148 +; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:156 +; VI-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:164 +; VI-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:172 +; VI-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:180 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:188 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:196 +; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:204 +; VI-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:212 +; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:220 +; VI-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:228 +; VI-NEXT: 
buffer_load_ushort v40, off, s[0:3], s32 offset:236 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:244 +; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:252 +; VI-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:260 +; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:268 +; VI-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:276 +; VI-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:284 +; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:292 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:300 +; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:308 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:316 +; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:324 +; VI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; VI-NEXT: s_waitcnt vmcnt(14) +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; VI-NEXT: 
buffer_store_dword v47, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:832 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:836 ; 4-byte Folded Spill +; VI-NEXT: s_cbranch_scc0 .LBB97_2 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:844 ; 4-byte Folded Spill +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:848 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 @@ -199785,225 +201728,205 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v1, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(6) +; VI-NEXT: v_or_b32_sdwa v2, v2, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v2, v8 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v0, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v3, v3, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v3, v8 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, 
off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, v10 ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v2, v50, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:540 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v12, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v34, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v35, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v36, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v37, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v38, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v38, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v39, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v49, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v48, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v49, v3 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v45, v62 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v2, v35, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v0, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v16, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_or_b32_sdwa v3, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v17, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v0, v54, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v32, v1 -; VI-NEXT: v_or_b32_sdwa v1, v41, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: v_mov_b32_e32 v41, v24 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_or_b32_sdwa v0, v55, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v18, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v34, v0 -; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v37, v1 -; VI-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v55, v26 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v42, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v1, v41, v37 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v19, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v0, v39, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v42, v43 +; VI-NEXT: v_mov_b32_e32 v43, v37 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v39, v0 -; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v49, v1 -; VI-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v27 +; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v20, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v51, v0 -; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v47, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v47, v54 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v35, v1 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v53, v28 +; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v21, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v47, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v33, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v61, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v58, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v22, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v57, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: 
s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v36, v0 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v1, v24, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_or_b32_sdwa v0, v26, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v23, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v0, v34, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v56, v0 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v58, v1 -; VI-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 +; VI-NEXT: v_or_b32_sdwa v1, v25, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v24, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v38, v0 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v48, v1 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v29, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v25, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v45, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v50, v0 -; VI-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v28, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v0, v40, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v26, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v1, v62, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v51, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v40, v41 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v52, v0 -; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v27, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v29, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:844 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v0, v31, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v54, v33 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v46, v1 +; VI-NEXT: v_mov_b32_e32 v56, v1 ; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v28, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v63, v0 -; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v47, v1 -; VI-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v29, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:840 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:848 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v63, v39 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_mov_b32_e32 v57, v1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v57, v0 +; VI-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v58, v1 +; VI-NEXT: v_or_b32_sdwa v1, v44, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v30, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v52, v60 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v31, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v53, v35 +; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -200032,14 +201955,54 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: s_cbranch_execnz .LBB97_3 -; VI-NEXT: .LBB97_2: ; %cmp.true -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v59 -; VI-NEXT: v_or_b32_sdwa v29, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: s_branch .LBB97_3 +; VI-NEXT: .LBB97_2: +; VI-NEXT: v_mov_b32_e32 v47, v54 +; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v45, off, s[0:3], s32 
offset:552 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; VI-NEXT: v_mov_b32_e32 v58, v7 +; VI-NEXT: v_mov_b32_e32 v57, v5 +; VI-NEXT: v_mov_b32_e32 v56, v3 +; VI-NEXT: s_mov_b64 s[4:5], -1 +; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; VI-NEXT: .LBB97_3: ; %Flow +; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; VI-NEXT: s_cbranch_vccnz .LBB97_5 +; VI-NEXT: ; %bb.4: ; %cmp.true +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 @@ -200058,351 +202021,356 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_lshl_b32 s9, s19, 8 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_lshl_b32 s10, s17, 8 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v29, v41, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v31, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:836 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v33 +; VI-NEXT: v_or_b32_sdwa v30, v58, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 +; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v28, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v26, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v62 -; VI-NEXT: v_or_b32_sdwa v28, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_or_b32_sdwa v53, v52, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v45 -; VI-NEXT: v_or_b32_sdwa v27, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_or_b32_sdwa v52, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v40 -; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v59, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v61 -; VI-NEXT: v_or_b32_sdwa v24, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v44, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v48, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 -; VI-NEXT: v_or_b32_sdwa v24, v24, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 +; VI-NEXT: v_or_b32_sdwa v27, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v23, v41, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v40, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v38, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 -; VI-NEXT: v_or_b32_sdwa v23, v23, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 +; VI-NEXT: v_or_b32_sdwa v26, v61, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v22, v54, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v34, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 +; VI-NEXT: v_or_b32_sdwa v26, v26, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v50, v33, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v50 -; VI-NEXT: v_or_b32_sdwa v22, v22, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 +; VI-NEXT: v_or_b32_sdwa v25, v48, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v21, v35, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v21, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v21 +; VI-NEXT: v_or_b32_sdwa v25, v25, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v54, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v24, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v20, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: 
buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v32, v32, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v32 +; VI-NEXT: v_or_b32_sdwa v24, v24, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x3000000, v24 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v49, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 -; VI-NEXT: v_or_b32_sdwa v20, v20, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 -; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_or_b32_sdwa v23, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v19, v37, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v61, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v61 +; VI-NEXT: v_or_b32_sdwa v23, v23, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x3000000, v23 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v22, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v60 +; VI-NEXT: v_or_b32_sdwa v36, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 +; VI-NEXT: v_or_b32_sdwa v22, v22, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v22, vcc, 0x3000000, v22 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v37, v34, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v37 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v63, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v31, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v19, v19, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 -; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: v_or_b32_sdwa v18, v32, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_or_b32_sdwa v38, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v38, vcc, 0x300, v38 +; VI-NEXT: v_or_b32_sdwa v21, v63, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v57, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v35, vcc, 0x300, v57 -; VI-NEXT: v_or_b32_sdwa v18, v18, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: v_or_b32_sdwa v20, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v39, v45, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v39 +; VI-NEXT: v_or_b32_sdwa v20, v20, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v20, vcc, 0x3000000, v20 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v19, v43, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v16, v4, v3 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v48, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v18, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 +; VI-NEXT: v_add_u32_e32 v48, vcc, 0x300, v48 +; VI-NEXT: v_or_b32_sdwa v19, v19, v48 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x3000000, v19 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v62, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v37, vcc, 0x300, v62 +; VI-NEXT: v_or_b32_sdwa v18, v18, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v18, vcc, 0x3000000, v18 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v16, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v10, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v17, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v51 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v11, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v50 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v15, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v56, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v49, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v49 +; VI-NEXT: v_or_b32_sdwa v15, v15, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v14, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v34, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v34 -; VI-NEXT: v_or_b32_sdwa v14, v14, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 +; VI-NEXT: v_or_b32_sdwa v14, v14, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v29, v29, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v14 +; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_or_b32_sdwa v13, v59, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v13, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v36, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload -; 
VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v36 -; VI-NEXT: v_or_b32_sdwa v13, v13, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v36, vcc, 0x300, v26 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v52 -; VI-NEXT: v_or_b32_sdwa v26, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v54 -; VI-NEXT: v_or_b32_sdwa v21, v21, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v52, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v52 +; VI-NEXT: v_or_b32_sdwa v13, v13, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v52, vcc, 0x300, v44 +; VI-NEXT: v_or_b32_sdwa v28, v28, v52 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 -; VI-NEXT: v_add_u32_e32 v21, vcc, 0x3000000, v21 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x3000000, v26 +; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v12, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v51, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v51 -; VI-NEXT: v_or_b32_sdwa v12, v12, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v51, vcc, 0x300, v59 -; VI-NEXT: v_or_b32_sdwa v25, v25, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v54, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v54, vcc, 0x300, v54 +; VI-NEXT: v_or_b32_sdwa v12, v12, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 -; VI-NEXT: v_add_u32_e32 v25, vcc, 0x3000000, v25 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v33, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v50, v4, v3 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v40, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v30, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_or_b32_sdwa v39, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2 -; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v34, vcc, 0x300, v2 +; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v53, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v55, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v9, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: 
v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v41, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v41 -; VI-NEXT: v_or_b32_sdwa v9, v9, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v41, vcc, 0x300, v10 +; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 +; VI-NEXT: v_or_b32_sdwa v9, v9, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v10 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x300, v55 -; VI-NEXT: v_or_b32_sdwa v10, v39, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v53 -; VI-NEXT: v_or_b32_sdwa v27, v28, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v28, v29, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v29, v30, v34 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v49, v16, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v53, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v53, vcc, 0x300, v40 +; VI-NEXT: v_or_b32_sdwa v27, v27, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 ; VI-NEXT: v_add_u32_e32 v27, vcc, 0x3000000, v27 -; VI-NEXT: v_add_u32_e32 v28, vcc, 0x3000000, v28 -; VI-NEXT: v_add_u32_e32 v29, vcc, 0x3000000, v29 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v8, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v42, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v42 -; VI-NEXT: v_or_b32_sdwa v8, v8, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v42, vcc, 0x300, v11 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v40 -; VI-NEXT: v_or_b32_sdwa v11, v33, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v33, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v30, v31, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v17, v17, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v43, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v43 +; VI-NEXT: v_or_b32_sdwa v8, v8, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v43, vcc, 0x300, v11 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x300, v41 +; VI-NEXT: v_or_b32_sdwa v17, v17, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v50, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 +; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v49 +; VI-NEXT: v_add_u32_e32 v49, vcc, 0x300, v0 +; VI-NEXT: v_add_u32_e32 v50, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v30, v30, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v31, v31, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 ; VI-NEXT: v_add_u32_e32 v30, vcc, 0x3000000, v30 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v7, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v44, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v44, vcc, 0x300, v44 -; VI-NEXT: v_or_b32_sdwa v7, v7, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 +; VI-NEXT: v_or_b32_sdwa v7, v7, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v7 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v6, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; 
VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v45, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v45, vcc, 0x300, v45 -; VI-NEXT: v_or_b32_sdwa v6, v6, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 +; VI-NEXT: v_or_b32_sdwa v6, v6, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v6 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v5, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v46, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v46, vcc, 0x300, v46 -; VI-NEXT: v_or_b32_sdwa v5, v5, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v47, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v47, vcc, 0x300, v47 +; VI-NEXT: v_or_b32_sdwa v5, v5, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v4, vcc, 3, v4 -; VI-NEXT: v_or_b32_sdwa v4, v47, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v47, vcc, 3, v32 -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v4, v56, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 
offset:420 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x300, v4 ; VI-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v4 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v47, v32, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_e32 v47, s4, v47 +; VI-NEXT: v_add_u32_e32 v56, vcc, 3, v56 +; VI-NEXT: v_or_b32_sdwa v56, v57, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_e32 v56, s4, v56 ; VI-NEXT: s_and_b32 s4, s26, 0xff ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_and_b32 s5, s24, 0xff @@ -200415,35 +202383,26 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: s_or_b32 s8, s9, s8 ; VI-NEXT: s_and_b32 s9, s16, 0xff ; VI-NEXT: s_or_b32 s9, s10, s9 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v56 ; VI-NEXT: s_addk_i32 s5, 0x300 ; VI-NEXT: s_addk_i32 s7, 0x300 ; VI-NEXT: s_addk_i32 s9, 0x300 -; VI-NEXT: v_or_b32_sdwa v15, v15, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v32, v16, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: s_lshl_b32 s4, s4, 16 ; VI-NEXT: s_lshl_b32 s6, s6, 16 ; VI-NEXT: s_lshl_b32 s8, s8, 16 ; VI-NEXT: s_and_b32 s9, s9, 0xffff ; VI-NEXT: s_and_b32 s7, s7, 0xffff ; VI-NEXT: s_and_b32 s5, s5, 0xffff -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x3000000, v17 -; VI-NEXT: v_add_u32_e32 v17, vcc, 0x3000000, v32 -; VI-NEXT: v_add_u32_e32 v32, vcc, 0x300, v0 ; VI-NEXT: s_or_b32 s8, s8, s9 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_add_i32 s8, s8, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 -; VI-NEXT: v_or_b32_sdwa v31, v31, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v47 -; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v15 +; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v56 ; VI-NEXT: v_mov_b32_e32 v0, s8 ; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_add_u32_e32 v31, vcc, 0x3000000, v31 -; VI-NEXT: .LBB97_3: ; %end +; VI-NEXT: .LBB97_5: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload @@ -200462,39 +202421,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] -; VI-NEXT: .LBB97_4: -; VI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v33, off, s[0:3], 
s32 offset:580 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v61, v60 -; VI-NEXT: v_mov_b32_e32 v60, v59 -; VI-NEXT: v_mov_b32_e32 v45, v62 -; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v57, v5 -; VI-NEXT: v_mov_b32_e32 v47, v4 -; VI-NEXT: v_mov_b32_e32 v63, v3 -; VI-NEXT: v_mov_b32_e32 v53, v28 -; VI-NEXT: v_mov_b32_e32 v43, v27 -; VI-NEXT: v_mov_b32_e32 v55, v26 -; VI-NEXT: v_mov_b32_e32 v41, v24 -; VI-NEXT: v_mov_b32_e32 v54, v22 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 -; VI-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; VI-NEXT: s_branch .LBB97_2 ; ; GFX9-LABEL: bitcast_v128i8_to_v64i16_scalar: ; GFX9: ; %bb.0: @@ -200515,31 +202441,36 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:332 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 -; 
GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:104 ; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:112 ; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:120 ; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:128 @@ -200549,133 +202480,129 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:160 ; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:168 ; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:176 -; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v29 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v22, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 -; GFX9-NEXT: s_waitcnt vmcnt(24) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v43 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v63, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v26, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v44, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v18, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v43, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v20, 8, v29 ; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v47 +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v24, 8, v56 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v45 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v44 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v42 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v41 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v40 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v55 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v54 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v53 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v52 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v51 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v50 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; 
GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v49 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v48 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v39 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:612 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v28 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v30 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v31 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v32 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v33 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v34 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v35 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v36 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v37 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(20) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v38 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], 
s32 offset:184 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:192 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:200 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:200 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:208 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:216 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:216 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:224 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:232 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:240 +; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v46 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v7 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:248 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:256 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:264 +; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:264 ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:272 -; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:280 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:280 ; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:288 ; GFX9-NEXT: buffer_load_ushort v9, off, s[0:3], s32 offset:296 ; GFX9-NEXT: buffer_load_ushort v7, off, s[0:3], s32 offset:304 @@ -200683,148 +202610,149 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; 
GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:556 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v15 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:572 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v13 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Spill +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v3 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v4 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v9 -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:564 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:580 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_ushort v3, off, s[0:3], s32 offset:312 ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:320 -; GFX9-NEXT: buffer_load_ushort v5, off, s[0:3], s32 offset:328 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:328 ; GFX9-NEXT: buffer_load_ushort v11, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:28 ; GFX9-NEXT: buffer_load_ushort v13, off, s[0:3], s32 offset:36 -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v1 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v15, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:116 -; GFX9-NEXT: 
buffer_load_ushort v43, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:132 -; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:140 -; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:148 -; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:156 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:164 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: buffer_load_ushort v21, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v25, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v23, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:108 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:576 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:172 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:180 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:188 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:196 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:204 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:212 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:220 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:228 -; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:236 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:244 +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:124 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:132 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:140 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:148 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:156 +; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:164 +; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:172 +; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:180 +; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:188 +; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:196 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:204 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:212 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:220 +; GFX9-NEXT: buffer_load_ushort v61, off, s[0:3], s32 offset:228 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:236 +; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:244 ; GFX9-NEXT: buffer_load_ushort v29, off, s[0:3], s32 offset:252 -; GFX9-NEXT: buffer_load_ushort v27, off, s[0:3], s32 
offset:260 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:268 -; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:276 -; GFX9-NEXT: buffer_load_ushort v59, off, s[0:3], s32 offset:284 -; GFX9-NEXT: buffer_load_ushort v63, off, s[0:3], s32 offset:292 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:300 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:260 +; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:268 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:276 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:284 +; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:292 +; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:300 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:308 -; GFX9-NEXT: buffer_load_ushort v58, off, s[0:3], s32 offset:316 -; GFX9-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:324 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:620 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:624 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:628 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:632 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:636 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:640 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:644 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:316 +; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:324 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:648 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:652 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(23) +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:584 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:532 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(24) +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:588 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:592 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:536 ; 4-byte Folded Spill ; GFX9-NEXT: s_waitcnt 
vmcnt(28) -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:656 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:660 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:664 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:668 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(30) -; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:672 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:676 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:680 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:684 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(33) -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:688 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:692 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:516 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(36) -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:520 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(39) -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(41) -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:692 ; 4-byte 
Folded Spill +; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:696 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:700 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(31) +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:596 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(34) +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:600 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(35) +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:776 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:784 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:828 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:832 ; 4-byte Folded 
Spill +; GFX9-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:804 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:808 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:812 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:816 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:820 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:824 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:604 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:608 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB97_2 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff @@ -200832,19 +202760,12 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v3, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v0, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_or_b32_sdwa v2, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v6, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 @@ -200866,272 +202787,291 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v4, v0, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v1 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: 
v_mov_b32_e32 v57, v5 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v0, v0, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v43 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v2, v39, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v39, v16 -; GFX9-NEXT: v_or_b32_sdwa v17, v34, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v36, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v46, v32 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v1, v28, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v42, v61 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v55, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v13, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v17, v45, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v45, v59 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v53, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v52, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v19, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v50, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v49, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v49, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v15, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v48, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v16, v2, 16, v0 ; GFX9-NEXT: v_lshl_or_b32 v17, v17, 16, v1 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v55, v22 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v33, v45 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v18, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v43, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v47, v32 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v19, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, 
s[0:3], s32 offset:488 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v60, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v20, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v51, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v21, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v51, v57 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v22, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v49, v39 +; GFX9-NEXT: v_mov_b32_e32 v59, v44 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v34, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v30, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v50, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v23, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_mov_b32_e32 v46, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v58, v50 +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v35, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v24, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v35, v45 -; GFX9-NEXT: v_mov_b32_e32 v45, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v42 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_mov_b32_e32 v38, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v36, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v54, v63 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v25, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v54, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v37 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 
-; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v29, v41 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v26, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v29, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v35 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v27, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v32 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v44 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v36, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v31, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_lshl_or_b32 v28, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v29, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_mov_b32_e32 v57, v35 +; GFX9-NEXT: v_mov_b32_e32 v35, v38 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v30, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v31, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch .LBB97_3 ; GFX9-NEXT: .LBB97_2: -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:612 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v33, v45 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v58, v50 +; GFX9-NEXT: v_mov_b32_e32 v45, v59 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:616 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:620 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 
offset:612 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v34, v35 +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: v_mov_b32_e32 v49, v39 +; GFX9-NEXT: v_mov_b32_e32 v55, v22 +; GFX9-NEXT: v_mov_b32_e32 v51, v5 ; GFX9-NEXT: s_mov_b64 s[4:5], -1 +; GFX9-NEXT: v_mov_b32_e32 v46, v32 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: ; implicit-def: $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX9-NEXT: .LBB97_3: ; %Flow -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:608 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:604 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:600 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:596 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:588 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:584 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:592 ; 4-byte Folded Reload ; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GFX9-NEXT: s_cbranch_vccnz .LBB97_5 ; GFX9-NEXT: ; %bb.4: ; %cmp.true -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:644 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:828 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:804 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:576 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; 
GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_addk_i32 s4, 0x300 +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:580 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s24, s24, 3 ; GFX9-NEXT: s_lshl_b32 s5, s25, 8 ; GFX9-NEXT: s_add_i32 s26, s26, 3 @@ -201144,61 +203084,55 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_lshl_b32 s9, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 ; GFX9-NEXT: s_lshl_b32 s10, s19, 8 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:824 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:748 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:740 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:736 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(15) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(14) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:720 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(12) ; GFX9-NEXT: v_add_u32_e32 v25, 3, v25 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v25, v37, v25 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v37, v51, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v25, v26, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:708 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:640 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v4, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:628 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v38, v38, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 +; GFX9-NEXT: v_or_b32_sdwa v23, v50, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:632 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshl_or_b32 v3, v2, 16, v3 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:728 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_and_b32 s4, s24, 0xff ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_and_b32 s5, s26, 0xff @@ -201210,8 +203144,6 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: s_and_b32 s8, s16, 0xff ; GFX9-NEXT: s_or_b32 s8, s9, s8 ; GFX9-NEXT: s_and_b32 s9, s18, 0xff -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_addk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s5, 0x300 @@ -201228,14 +203160,14 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:620 ; 4-byte 
Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:704 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -201243,9 +203175,9 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:700 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -201255,254 +203187,277 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v53, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, 
v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:668 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:652 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v63, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v1, 3, v42 +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:832 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:820 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:816 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v40, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v52, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:812 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:808 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v42, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v55, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:800 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v37, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:696 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:572 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v37, 0x300, v37 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v62, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: 
v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:792 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:788 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v38, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:568 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v50, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v49, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:784 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_or_b32_sdwa v39, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:684 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:564 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) ; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v39, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v39, v36, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:692 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v48, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:560 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:776 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, 
s[0:3], s32 offset:768 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 +; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:744 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:680 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 +; GFX9-NEXT: s_waitcnt vmcnt(4) ; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v48, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v48, v46, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:688 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v49, v35, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:676 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:540 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v16, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:772 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:528 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v51, v34, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v16, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v16, v17, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_lshl_or_b32 v17, v1, 16, v0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:520 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:768 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v16 ; GFX9-NEXT: v_lshl_or_b32 v16, v16, 16, v2 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:536 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; 
GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 +; GFX9-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:752 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 +; GFX9-NEXT: v_or_b32_sdwa v2, v49, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v49, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:672 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:556 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v50, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:552 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v58 +; GFX9-NEXT: v_or_b32_sdwa v19, v51, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 +; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v51, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v52, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:656 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:532 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:664 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:548 ; 4-byte Folded Reload ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v53, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v61 -; GFX9-NEXT: v_or_b32_sdwa v24, v54, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 -; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 -; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 -; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; GFX9-NEXT: 
v_add_u32_e32 v26, 3, v55 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v54, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v45 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:660 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v34 ; GFX9-NEXT: v_add_u32_e32 v20, 3, v20 -; GFX9-NEXT: v_or_b32_sdwa v20, v57, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v20, v35, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v34, 0x300, v20 +; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v55, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v21, 3, v21 -; GFX9-NEXT: v_or_b32_sdwa v21, v32, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v26, 3, v60 +; GFX9-NEXT: v_or_b32_sdwa v21, v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 ; GFX9-NEXT: v_add_u32_e32 v28, 0x300, v21 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v54 +; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; GFX9-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; GFX9-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v40, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v23, 3, v23 -; GFX9-NEXT: v_add_u32_e32 v26, 3, v47 -; GFX9-NEXT: v_or_b32_sdwa v23, v41, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v40 -; GFX9-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_or_b32_sdwa v41, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v46 +; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_add_u32_e32 v22, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v22, v44, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v22, v36, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v35, 0x300, v22 ; 
GFX9-NEXT: v_add_u32_e32 v22, 0x300, v52 -; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 ; GFX9-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 ; GFX9-NEXT: v_lshl_or_b32 v28, v35, 16, v28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v42, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:636 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v26, 3, v43 +; GFX9-NEXT: v_add_u32_e32 v24, 3, v24 +; GFX9-NEXT: v_or_b32_sdwa v24, v57, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 +; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v24 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v48 +; GFX9-NEXT: v_add_u32_e32 v48, 0x300, v51 +; GFX9-NEXT: v_add_u32_e32 v51, 0x300, v41 +; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX9-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX9-NEXT: v_lshl_or_b32 v20, v51, 16, v20 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v43, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:624 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:524 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v31, 0x300, v0 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; GFX9-NEXT: v_add_u32_e32 v52, 0x300, v43 -; GFX9-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v44, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:764 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v26, 3, v33 -; GFX9-NEXT: v_add_u32_e32 v32, 0x300, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v31, v32, 16, v31 +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:648 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 +; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_add_u32_e32 v26, 3, v26 +; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v45, v27, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; 
GFX9-NEXT: v_add_u32_e32 v27, 0x300, v23 ; GFX9-NEXT: v_add_u32_e32 v26, 0x300, v25 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v2, 3, v2 -; GFX9-NEXT: v_or_b32_sdwa v2, v18, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:760 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v30, 0x300, v2 ; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v38 ; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v50 ; GFX9-NEXT: v_add_u32_e32 v38, 0x300, v39 @@ -201514,33 +203469,14 @@ define inreg <64 x i16> @bitcast_v128i8_to_v64i16_scalar(<128 x i8> inreg %a, i3 ; GFX9-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; GFX9-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX9-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX9-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_lshl_or_b32 v21, v50, 16, v21 ; GFX9-NEXT: v_lshl_or_b32 v22, v49, 16, v22 -; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v24, v39, 16, v24 +; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 +; GFX9-NEXT: v_lshl_or_b32 v23, v48, 16, v23 ; GFX9-NEXT: v_lshl_or_b32 v25, v38, 16, v25 ; GFX9-NEXT: v_lshl_or_b32 v26, v37, 16, v26 ; GFX9-NEXT: v_lshl_or_b32 v27, v36, 16, v27 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v18, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v18, v19, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v33, 0x300, v18 -; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v44 -; GFX9-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX9-NEXT: v_lshl_or_b32 v18, v53, 16, v18 -; GFX9-NEXT: v_lshl_or_b32 v30, v33, 16, v30 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v19, 3, v19 -; GFX9-NEXT: v_or_b32_sdwa v19, v60, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v19 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v42 -; GFX9-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX9-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX9-NEXT: v_lshl_or_b32 v19, v52, 16, v19 -; GFX9-NEXT: v_lshl_or_b32 v29, v34, 16, v29 ; GFX9-NEXT: .LBB97_5: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload @@ -205289,36 +207225,88 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:4 ; VI-NEXT: buffer_load_dword v36, off, s[0:3], s32 -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v15 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; VI-NEXT: buffer_store_dword v32, 
off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v5 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v3 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v1 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v28 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v26 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v25 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; 
VI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 ; VI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 +; VI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 ; VI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 ; VI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 ; VI-NEXT: v_lshrrev_b32_e32 v59, 16, v10 @@ -205327,20 +207315,68 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshrrev_b32_e32 v56, 16, v4 ; VI-NEXT: v_lshrrev_b32_e32 v57, 16, v2 ; VI-NEXT: v_lshrrev_b32_e32 v46, 16, v30 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v23 ; VI-NEXT: v_lshrrev_b32_e32 v32, 16, v22 ; VI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 ; VI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; VI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 +; VI-NEXT: v_lshrrev_b32_e32 v38, 16, v18 +; VI-NEXT: v_lshrrev_b32_e32 v54, 16, v17 +; VI-NEXT: ; implicit-def: $vgpr42 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr48 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr32 +; VI-NEXT: ; implicit-def: $vgpr61 +; VI-NEXT: ; implicit-def: $vgpr58 +; VI-NEXT: ; implicit-def: $vgpr39 +; VI-NEXT: ; kill: killed $vgpr35 +; VI-NEXT: ; implicit-def: $vgpr35 +; VI-NEXT: ; kill: killed $vgpr40 +; VI-NEXT: ; implicit-def: $vgpr43 +; VI-NEXT: ; implicit-def: $vgpr40 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr45 +; VI-NEXT: ; implicit-def: $vgpr44 +; VI-NEXT: ; implicit-def: $vgpr41 +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; 
VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; VI-NEXT: ; implicit-def: $vgpr48 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v37 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 -; VI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 @@ -205411,7 +207447,6 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr32 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 @@ -205422,151 +207457,53 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr55 +; VI-NEXT: ; implicit-def: $vgpr31 +; VI-NEXT: v_lshrrev_b32_e32 v52, 16, v37 +; VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36 +; VI-NEXT: ; kill: killed $vgpr31 ; VI-NEXT: ; implicit-def: $vgpr31 ; VI-NEXT: ; kill: killed $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr62 ; VI-NEXT: ; implicit-def: $vgpr31 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr38 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr42 -; VI-NEXT: ; implicit-def: $vgpr61 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr58 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr39 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; kill: killed $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr35 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; 
VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: ; kill: killed $vgpr48 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; kill: killed $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr43 -; VI-NEXT: ; implicit-def: $vgpr40 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr45 -; VI-NEXT: ; implicit-def: $vgpr44 -; VI-NEXT: ; implicit-def: $vgpr41 -; VI-NEXT: ; implicit-def: $vgpr54 -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; VI-NEXT: ; implicit-def: $vgpr48 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; VI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; VI-NEXT: s_cbranch_execz .LBB98_2 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte 
Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v14 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v14 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v12 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v11 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v10 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v9 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v8 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v7 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v6 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v5 -; 
VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[31:32], 24, v[15:16] ; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -205580,143 +207517,148 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v31, v7 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v10 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v11 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v12 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v13 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v14 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v9, v15 +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v16 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v9, v8 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v5 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v7, v6 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 24, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v5, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v5, v3 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v2 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v37 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v36 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v37 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v3, v2 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v5, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 
4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[23:24] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v21 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v20 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v19 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v20 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v17 ; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v18 -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] ; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_mov_b32_e32 v1, v46 ; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] -; VI-NEXT: v_mov_b32_e32 v32, v15 ; VI-NEXT: v_lshrrev_b32_e32 v35, 24, v26 ; VI-NEXT: v_lshrrev_b32_e32 v39, 24, v24 ; VI-NEXT: v_lshrrev_b32_e32 v58, 24, v22 ; VI-NEXT: v_lshrrev_b32_e32 v61, 24, v20 -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19 ; VI-NEXT: v_lshrrev_b32_e32 v31, 24, v18 -; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v17 ; VI-NEXT: v_mov_b32_e32 v46, v1 ; VI-NEXT: ; implicit-def: $vgpr1 ; VI-NEXT: ; implicit-def: $vgpr3 @@ -205739,93 +207681,89 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: 
s_cbranch_execz .LBB98_4 ; VI-NEXT: ; %bb.3: ; %cmp.true ; VI-NEXT: v_mov_b32_e32 v31, 3 -; VI-NEXT: v_add_u16_sdwa v51, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_add_u16_sdwa v55, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v32, 3, v18 -; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 +; VI-NEXT: v_add_u16_sdwa v54, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v18, v32, v18 ; VI-NEXT: v_add_u16_e32 v32, 3, v17 -; VI-NEXT: v_add_u16_sdwa v17, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 16, v54 +; VI-NEXT: v_add_u16_sdwa v38, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v17, v32, v17 ; VI-NEXT: v_add_u16_e32 v32, 3, v20 -; VI-NEXT: v_add_u16_sdwa v20, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:512 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v20, 16, v38 +; VI-NEXT: v_add_u16_sdwa v62, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v20, v32, v20 ; VI-NEXT: v_add_u16_e32 v32, 3, v19 -; VI-NEXT: v_add_u16_sdwa v19, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; VI-NEXT: v_lshlrev_b32_e32 v19, 16, v62 ; VI-NEXT: v_add_u16_sdwa v48, v22, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v19, v32, v19 ; VI-NEXT: v_add_u16_e32 v32, 3, v22 ; VI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 ; VI-NEXT: v_add_u16_sdwa v53, v21, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v22, v32, v22 ; VI-NEXT: v_add_u16_e32 v32, 3, v21 ; VI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 ; VI-NEXT: v_add_u16_sdwa v61, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v21, v32, v21 ; VI-NEXT: v_add_u16_e32 v32, 3, v24 ; VI-NEXT: v_lshlrev_b32_e32 v24, 16, v61 -; VI-NEXT: buffer_store_dword 
v32, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_sdwa v49, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v24, v32, v24 ; VI-NEXT: v_add_u16_e32 v32, 3, v23 -; VI-NEXT: v_add_u16_sdwa v23, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; VI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 ; VI-NEXT: v_add_u16_sdwa v58, v26, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v23, v32, v23 ; VI-NEXT: v_add_u16_e32 v32, 3, v26 ; VI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v26, v32, v26 ; VI-NEXT: v_add_u16_e32 v32, 3, v25 ; VI-NEXT: v_add_u16_sdwa v25, v25, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 ; VI-NEXT: v_add_u16_sdwa v39, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v25, v32, v25 ; VI-NEXT: v_add_u16_e32 v32, 3, v28 ; VI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v28, v32, v28 ; VI-NEXT: v_add_u16_e32 v32, 3, v27 ; VI-NEXT: v_add_u16_sdwa v27, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 ; VI-NEXT: v_add_u16_sdwa v35, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v27, v32, v27 ; VI-NEXT: v_add_u16_e32 v33, 3, v30 ; VI-NEXT: v_add_u16_e32 v34, 3, v29 ; VI-NEXT: v_add_u16_sdwa v32, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v35 ; VI-NEXT: v_add_u16_sdwa v52, v37, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v30, v33, v29 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded 
Spill ; VI-NEXT: v_lshlrev_b32_e32 v29, 16, v32 ; VI-NEXT: v_add_u16_e32 v33, 3, v37 ; VI-NEXT: v_add_u16_sdwa v50, v36, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v52 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v29, v34, v29 ; VI-NEXT: v_add_u16_e32 v34, 3, v36 ; VI-NEXT: v_or_b32_e32 v37, v33, v32 ; VI-NEXT: v_lshlrev_b32_e32 v32, 16, v50 ; VI-NEXT: v_add_u16_sdwa v57, v2, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v36, v34, v32 ; VI-NEXT: v_add_u16_e32 v33, 3, v2 ; VI-NEXT: v_add_u16_e32 v34, 3, v1 @@ -205834,9 +207772,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v2, v33, v1 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 ; VI-NEXT: v_add_u16_sdwa v56, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v1, v34, v1 ; VI-NEXT: v_add_u16_e32 v33, 3, v4 ; VI-NEXT: v_add_u16_e32 v34, 3, v3 @@ -205845,9 +207783,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_e32 v4, v33, v3 ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 ; VI-NEXT: v_add_u16_sdwa v47, v6, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v3, v34, v3 ; VI-NEXT: v_add_u16_e32 v33, 3, v6 ; VI-NEXT: v_add_u16_e32 v34, 3, v5 @@ -205855,203 +207793,206 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 ; VI-NEXT: v_or_b32_e32 v6, v33, v5 ; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v5, v34, v5 ; VI-NEXT: v_add_u16_sdwa v34, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; VI-NEXT: 
buffer_store_dword v32, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; VI-NEXT: v_add_u16_e32 v38, 3, v8 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; VI-NEXT: v_add_u16_e32 v40, 3, v8 ; VI-NEXT: v_add_u16_e32 v33, 3, v7 ; VI-NEXT: v_add_u16_sdwa v32, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; VI-NEXT: v_or_b32_e32 v8, v38, v7 +; VI-NEXT: v_or_b32_e32 v8, v40, v7 ; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v32 ; VI-NEXT: v_add_u16_sdwa v59, v10, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v7, v33, v7 ; VI-NEXT: v_add_u16_e32 v33, 3, v10 -; VI-NEXT: v_add_u16_e32 v38, 3, v9 +; VI-NEXT: v_add_u16_e32 v40, 3, v9 ; VI-NEXT: v_add_u16_sdwa v32, v9, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v59 ; VI-NEXT: v_or_b32_e32 v10, v33, v9 ; VI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 ; VI-NEXT: v_add_u16_sdwa v63, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v9, v38, v9 +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v9, v40, v9 ; VI-NEXT: v_add_u16_e32 v33, 3, v12 -; VI-NEXT: v_add_u16_e32 v38, 3, v11 +; VI-NEXT: v_add_u16_e32 v40, 3, v11 ; VI-NEXT: v_add_u16_sdwa v32, v11, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v63 -; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v12, v33, v11 ; VI-NEXT: v_lshlrev_b32_e32 v11, 16, v32 ; VI-NEXT: v_add_u16_sdwa v33, v14, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v11, v38, v11 -; VI-NEXT: v_add_u16_e32 v38, 3, v14 -; VI-NEXT: v_add_u16_e32 v49, 3, v13 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v11, v40, v11 +; VI-NEXT: v_add_u16_e32 v40, 3, v14 +; VI-NEXT: v_add_u16_e32 v41, 3, v13 ; VI-NEXT: v_add_u16_sdwa v32, v13, v31 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 ; VI-NEXT: v_add_u16_sdwa v60, v16, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v14, v38, v13 -; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; VI-NEXT: v_or_b32_e32 v14, v40, v13 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; VI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 ; VI-NEXT: v_add_u16_sdwa v31, v15, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_e32 v16, 3, v16 ; VI-NEXT: v_add_u16_e32 v32, 3, v15 ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v60 -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v16, v16, v15 ; VI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; VI-NEXT: v_or_b32_e32 v15, v32, v15 -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v16 -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; VI-NEXT: v_lshrrev_b32_e32 v31, 8, v15 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v16 +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v15 ; VI-NEXT: v_lshrrev_b64 v[15:16], 24, v[15:16] -; VI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_or_b32_e32 v13, v49, v13 +; VI-NEXT: v_or_b32_e32 v13, v41, v13 ; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v14 -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v15, 8, v13 ; VI-NEXT: v_lshrrev_b64 v[13:14], 24, v[13:14] -; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v11 ; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[11:12] -; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; 
VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v10 -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v11, 8, v9 ; VI-NEXT: v_lshrrev_b64 v[9:10], 24, v[9:10] -; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v9, 8, v7 ; VI-NEXT: v_lshrrev_b64 v[7:8], 24, v[7:8] -; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v6 -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v7, 8, v5 ; VI-NEXT: v_lshrrev_b64 v[5:6], 24, v[5:6] -; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3 ; VI-NEXT: v_lshrrev_b64 v[40:41], 24, v[3:4] ; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; VI-NEXT: v_lshrrev_b64 v[43:44], 24, v[1:2] ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v37 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v36 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[36:37] -; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; VI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v30 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v29 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[29:30] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v28 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v27 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[27:28] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v26 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v25 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[25:26] ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v24 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v23 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[1:2], 24, v[23:24] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v22 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill ; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v21 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v20 ; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v19 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v60, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v33, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword 
v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v63, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v59, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v34, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v47, 8, 8 ; VI-NEXT: v_lshrrev_b64 v[44:45], 24, v[19:20] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v56, 8, 8 ; VI-NEXT: v_lshrrev_b64 v[45:46], 24, v[17:18] -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v57, 8, 8 ; VI-NEXT: v_mov_b32_e32 v46, v35 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v52, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v46, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v1, v39, 8, 8 -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: v_mov_b32_e32 v51, v49 ; VI-NEXT: v_mov_b32_e32 v49, v53 -; VI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:512 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v52, v51 -; VI-NEXT: v_bfe_u32 v31, v51, 8, 8 -; VI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload -; VI-NEXT: v_lshrrev_b64 v[54:55], 24, v[23:24] -; VI-NEXT: v_lshrrev_b64 v[41:42], 24, v[21:22] -; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v20 -; VI-NEXT: v_lshrrev_b32_e32 v38, 8, v19 -; VI-NEXT: v_lshrrev_b32_e32 v62, 8, v18 -; VI-NEXT: v_lshrrev_b32_e32 v55, 8, v17 +; VI-NEXT: v_mov_b32_e32 v53, v38 +; VI-NEXT: v_mov_b32_e32 v38, v55 +; VI-NEXT: v_lshrrev_b32_e32 v32, 8, v18 +; VI-NEXT: v_lshrrev_b32_e32 v42, 8, v17 +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:492 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v35, v58, 8, 8 -; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; VI-NEXT: v_bfe_u32 v39, v61, 8, 8 ; VI-NEXT: v_bfe_u32 v58, v48, 8, 8 -; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; VI-NEXT: s_waitcnt vmcnt(3) +; 
VI-NEXT: v_mov_b32_e32 v55, v31 ; VI-NEXT: v_bfe_u32 v61, v53, 8, 8 +; VI-NEXT: v_bfe_u32 v31, v38, 8, 8 +; VI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; VI-NEXT: .LBB98_4: ; %end ; VI-NEXT: s_or_b64 exec, exec, s[4:5] -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206061,22 +208002,22 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v57, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206087,28 +208028,28 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; 
VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v56, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206116,28 +208057,28 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v47, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: 
v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206145,28 +208086,28 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 24, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v34, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 28, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206174,28 +208115,28 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; 
VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v59, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 36, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206203,28 +208144,28 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206232,76 +208173,79 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 48, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v33, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 52, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; VI-NEXT: v_or_b32_sdwa v1, v32, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v55, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 56, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; VI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v60, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v55 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v45 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v62 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v31 -; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x44, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v38 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:508 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v44 -; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v62, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x48, v0 ; VI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v42 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v61 @@ -206309,8 +208253,8 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x4c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:504 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206320,9 +208264,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x50, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:500 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206333,22 +208277,23 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x54, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v54 +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload 
+; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 +; VI-NEXT: v_or_b32_sdwa v2, v51, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x58, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206359,15 +208304,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x5c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206375,9 +208320,9 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x60, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -206388,15 +208333,15 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x64, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; 
VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206404,29 +208349,29 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x68, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:496 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x6c, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(2) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206434,21 +208379,21 @@ define <128 x i8> 
@bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x70, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 ; VI-NEXT: v_or_b32_sdwa v2, v46, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x74, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -206461,18 +208406,17 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_add_u32_e32 v2, vcc, 0x78, v0 ; VI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x7c, v0 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v2, v52, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload @@ -208639,1444 +210583,1726 @@ define <128 x 
i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-FAKE16-NEXT: v_or_b32_e32 v20, v20, v21 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, v22, v23 ; GFX11-FAKE16-NEXT: v_or_b32_e32 v22, v24, v25 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 -; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 -; GFX11-FAKE16-NEXT: s_clause 0x5 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 -; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 -; GFX11-FAKE16-NEXT: s_clause 0x13 -; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 -; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 -; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 -; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 -; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:28 -; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 -; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 -; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 -; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 -; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:48 -; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 -; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 -; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 -; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 -; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 -; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 -; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 -; GFX11-FAKE16-NEXT: scratch_load_b32 v42, off, s32 offset:80 -; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 -; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] - %cmp = icmp eq i32 %b, 0 - br i1 %cmp, label %cmp.true, label %cmp.false - -cmp.true: - %a1 = add <64 x i16> %a, splat (i16 3) - %a2 = bitcast <64 x i16> %a1 to <128 x i8> - br label %end - -cmp.false: - %a3 = bitcast <64 x i16> %a to <128 x i8> - br label %end - -end: - %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] - ret <128 x i8> %phi -} - -define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i32 inreg %b) { -; SI-LABEL: bitcast_v64i16_to_v128i8_scalar: -; SI: ; %bb.0: -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; 
SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; SI-NEXT: v_writelane_b32 v63, s30, 0 -; SI-NEXT: v_writelane_b32 v63, s31, 1 -; SI-NEXT: v_writelane_b32 v63, s34, 2 -; SI-NEXT: v_writelane_b32 v63, s35, 3 -; SI-NEXT: v_writelane_b32 v63, s36, 4 -; SI-NEXT: v_writelane_b32 v63, s37, 5 -; SI-NEXT: v_writelane_b32 v63, s38, 6 -; SI-NEXT: v_writelane_b32 v63, s39, 7 -; SI-NEXT: v_writelane_b32 v63, s48, 8 -; SI-NEXT: v_writelane_b32 v63, s49, 9 -; SI-NEXT: v_writelane_b32 v63, s50, 10 -; SI-NEXT: v_writelane_b32 v63, s51, 11 -; SI-NEXT: v_writelane_b32 v63, s52, 12 -; SI-NEXT: v_writelane_b32 v63, s53, 13 -; SI-NEXT: v_writelane_b32 v63, s54, 14 -; SI-NEXT: v_writelane_b32 v63, s55, 15 -; SI-NEXT: v_writelane_b32 v63, s64, 16 -; SI-NEXT: v_writelane_b32 v63, s65, 17 -; SI-NEXT: v_writelane_b32 v63, s66, 18 -; SI-NEXT: v_writelane_b32 v63, s67, 19 -; SI-NEXT: v_writelane_b32 v63, s68, 20 -; SI-NEXT: v_writelane_b32 v63, s69, 21 -; SI-NEXT: v_writelane_b32 v63, s70, 22 -; SI-NEXT: v_writelane_b32 v63, s71, 23 -; SI-NEXT: v_writelane_b32 v63, s80, 24 -; SI-NEXT: v_writelane_b32 v63, s81, 25 -; SI-NEXT: v_writelane_b32 v63, s82, 26 -; SI-NEXT: v_writelane_b32 v63, s83, 27 -; SI-NEXT: v_writelane_b32 v63, s84, 28 -; SI-NEXT: v_writelane_b32 v63, s85, 29 -; SI-NEXT: v_writelane_b32 v63, s86, 30 -; SI-NEXT: v_writelane_b32 v63, s87, 31 -; SI-NEXT: v_writelane_b32 v63, s96, 32 -; SI-NEXT: v_writelane_b32 v63, s97, 33 -; SI-NEXT: v_writelane_b32 v63, s98, 34 -; SI-NEXT: s_mov_b32 s6, s18 -; SI-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane -; SI-NEXT: v_writelane_b32 v63, s99, 35 -; SI-NEXT: v_readfirstlane_b32 s62, v30 -; SI-NEXT: v_readfirstlane_b32 s63, v29 -; SI-NEXT: 
v_readfirstlane_b32 s59, v26 -; SI-NEXT: v_readfirstlane_b32 s60, v25 -; SI-NEXT: v_readfirstlane_b32 s98, v22 -; SI-NEXT: v_readfirstlane_b32 s61, v21 -; SI-NEXT: v_readfirstlane_b32 s99, v18 -; SI-NEXT: v_readfirstlane_b32 s58, v17 -; SI-NEXT: v_readfirstlane_b32 s96, v14 -; SI-NEXT: v_readfirstlane_b32 s97, v13 -; SI-NEXT: v_readfirstlane_b32 s86, v10 -; SI-NEXT: v_readfirstlane_b32 s87, v9 -; SI-NEXT: v_readfirstlane_b32 s84, v6 -; SI-NEXT: v_readfirstlane_b32 s85, v5 -; SI-NEXT: v_readfirstlane_b32 s81, v2 -; SI-NEXT: v_readfirstlane_b32 s82, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v24 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s88, v36 -; SI-NEXT: v_readfirstlane_b32 s18, v37 -; SI-NEXT: v_readfirstlane_b32 s78, v38 -; SI-NEXT: v_readfirstlane_b32 s79, v39 -; SI-NEXT: v_readfirstlane_b32 s76, v48 -; SI-NEXT: v_readfirstlane_b32 s77, v49 -; SI-NEXT: v_readfirstlane_b32 s74, v50 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_readfirstlane_b32 s75, v51 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_readfirstlane_b32 s72, v52 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_readfirstlane_b32 s73, v53 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v54 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v43 -; SI-NEXT: v_writelane_b32 v62, s6, 0 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB99_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s56, s4, s5 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 -; SI-NEXT: s_or_b32 s57, s4, s5 -; SI-NEXT: v_mov_b32_e32 v1, s56 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: s_lshl_b32 s5, s21, 16 -; SI-NEXT: v_alignbit_b32 v8, s57, v1, 24 -; SI-NEXT: v_alignbit_b32 v50, s57, v1, 16 -; SI-NEXT: v_alignbit_b32 v1, s57, v1, 8 -; SI-NEXT: s_or_b32 s46, s4, s5 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: s_lshl_b32 s5, s23, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_or_b32 s47, s4, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s46 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: s_lshl_b32 s5, s25, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s47, v1, 24 -; SI-NEXT: s_or_b32 s44, s4, s5 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: s_lshl_b32 s5, s27, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s47, v1, 16 -; SI-NEXT: v_alignbit_b32 v51, s47, v1, 8 -; SI-NEXT: s_or_b32 s45, s4, s5 -; SI-NEXT: v_mov_b32_e32 v1, s44 -; 
SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: s_lshl_b32 s5, s29, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s45, v1, 24 -; SI-NEXT: s_or_b32 s42, s4, s5 -; SI-NEXT: s_and_b32 s4, s82, 0xffff -; SI-NEXT: s_lshl_b32 s5, s81, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s45, v1, 16 -; SI-NEXT: v_alignbit_b32 v49, s45, v1, 8 -; SI-NEXT: s_or_b32 s43, s4, s5 -; SI-NEXT: v_mov_b32_e32 v1, s42 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s43, v1, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, s43, v1, 16 -; SI-NEXT: v_alignbit_b32 v48, s43, v1, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: s_and_b32 s4, s85, 0xffff -; SI-NEXT: s_lshl_b32 s5, s84, 16 -; SI-NEXT: v_or_b32_e32 v16, v1, v2 -; SI-NEXT: s_or_b32 s41, s4, s5 -; SI-NEXT: v_alignbit_b32 v1, s41, v16, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, s41, v16, 16 -; SI-NEXT: s_and_b32 s4, s87, 0xffff -; SI-NEXT: s_lshl_b32 s5, s86, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, s41, v16, 8 -; SI-NEXT: s_or_b32 s40, s4, s5 -; SI-NEXT: s_and_b32 s4, s97, 0xffff -; SI-NEXT: s_lshl_b32 s5, s96, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: s_or_b32 s15, s4, s5 -; SI-NEXT: s_and_b32 s4, s58, 0xffff -; SI-NEXT: s_lshl_b32 s5, s99, 16 -; SI-NEXT: v_or_b32_e32 v14, v1, v4 -; SI-NEXT: s_or_b32 s14, s4, s5 -; SI-NEXT: s_and_b32 s4, s61, 0xffff -; SI-NEXT: s_lshl_b32 s5, s98, 16 -; SI-NEXT: v_alignbit_b32 v1, s40, v14, 24 -; SI-NEXT: s_or_b32 s13, s4, s5 -; SI-NEXT: s_and_b32 s4, s60, 0xffff -; SI-NEXT: s_lshl_b32 s5, s59, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, s40, v14, 16 -; SI-NEXT: s_or_b32 s12, s4, s5 -; SI-NEXT: s_and_b32 s4, s63, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, s40, v14, 8 -; SI-NEXT: s_or_b32 s11, s4, s5 -; SI-NEXT: s_and_b32 s4, s73, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: s_or_b32 s10, s4, s5 -; SI-NEXT: s_and_b32 s4, s75, 0xffff -; SI-NEXT: s_lshl_b32 s5, s74, 16 -; SI-NEXT: v_or_b32_e32 v12, v1, v5 -; SI-NEXT: s_or_b32 s9, s4, s5 -; SI-NEXT: s_and_b32 s4, s77, 0xffff -; SI-NEXT: s_lshl_b32 s5, s76, 16 -; SI-NEXT: v_alignbit_b32 v1, s15, v12, 24 -; SI-NEXT: s_or_b32 s8, s4, s5 -; SI-NEXT: s_and_b32 s4, s79, 0xffff -; SI-NEXT: s_lshl_b32 s5, s78, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, s15, v12, 16 -; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: 
s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s88, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, s15, v12, 8 -; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: s_lshr_b32 s4, s11, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_writelane_b32 v62, s4, 1 -; SI-NEXT: s_lshr_b32 s4, s10, 8 -; SI-NEXT: v_or_b32_e32 v10, v1, v6 -; SI-NEXT: v_writelane_b32 v62, s4, 3 -; SI-NEXT: s_lshr_b32 s4, s9, 8 -; SI-NEXT: v_alignbit_b32 v1, s14, v10, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 6 -; SI-NEXT: s_lshr_b32 s4, s8, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, s14, v10, 16 -; SI-NEXT: v_writelane_b32 v62, s4, 9 -; SI-NEXT: s_lshr_b32 s4, s7, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, s14, v10, 8 -; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: s_lshr_b32 s4, s6, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v19 -; SI-NEXT: v_writelane_b32 v62, s4, 15 -; SI-NEXT: s_and_b32 s4, s72, 0xffff -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v8, v1, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_writelane_b32 v62, s4, 2 -; SI-NEXT: s_and_b32 s4, s74, 0xffff -; SI-NEXT: v_or_b32_e32 v5, v1, v13 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 -; SI-NEXT: v_writelane_b32 v62, s4, 5 -; SI-NEXT: s_and_b32 s4, s76, 0xffff -; SI-NEXT: v_mov_b32_e32 v28, v13 -; SI-NEXT: v_or_b32_e32 v13, v1, v17 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 -; SI-NEXT: v_writelane_b32 v62, s4, 8 -; SI-NEXT: s_and_b32 s4, s78, 0xffff -; SI-NEXT: v_mov_b32_e32 v26, v9 -; SI-NEXT: v_or_b32_e32 v9, v1, v18 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 -; SI-NEXT: v_writelane_b32 v62, s4, 11 -; SI-NEXT: s_and_b32 s4, s88, 0xffff -; SI-NEXT: v_mov_b32_e32 v25, v6 -; SI-NEXT: v_or_b32_e32 v6, v1, v20 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 -; SI-NEXT: v_writelane_b32 v62, s4, 14 -; SI-NEXT: s_bfe_u32 s4, s74, 0x80008 -; SI-NEXT: v_or_b32_e32 v4, v1, v21 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 -; SI-NEXT: v_writelane_b32 v62, s4, 4 -; SI-NEXT: s_bfe_u32 s4, s76, 0x80008 -; SI-NEXT: v_or_b32_e32 v2, v1, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 -; SI-NEXT: v_writelane_b32 v62, s4, 7 -; SI-NEXT: s_bfe_u32 s4, s78, 0x80008 -; SI-NEXT: v_or_b32_e32 v1, v1, v24 -; SI-NEXT: v_writelane_b32 v62, s4, 10 -; SI-NEXT: s_bfe_u32 s4, s88, 0x80008 -; SI-NEXT: v_mov_b32_e32 v29, v17 -; SI-NEXT: v_mov_b32_e32 v30, v18 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v21 -; SI-NEXT: v_mov_b32_e32 v38, v22 -; SI-NEXT: v_mov_b32_e32 v39, v24 -; SI-NEXT: s_lshr_b32 s68, s57, 8 -; SI-NEXT: s_lshr_b32 s65, s47, 8 -; SI-NEXT: s_lshr_b32 s54, s45, 8 -; SI-NEXT: s_lshr_b32 s51, s43, 8 -; SI-NEXT: s_lshr_b32 s48, s41, 8 -; SI-NEXT: s_lshr_b32 s37, s40, 8 -; SI-NEXT: s_lshr_b32 s34, s15, 8 -; SI-NEXT: s_lshr_b32 s95, s14, 8 -; SI-NEXT: s_lshr_b32 s92, s13, 8 -; SI-NEXT: s_lshr_b32 s89, s12, 8 -; SI-NEXT: s_and_b32 s71, s19, 0xffff -; SI-NEXT: s_and_b32 s69, s23, 0xffff -; 
SI-NEXT: s_and_b32 s66, s27, 0xffff -; SI-NEXT: s_and_b32 s55, s81, 0xffff -; SI-NEXT: s_and_b32 s52, s84, 0xffff -; SI-NEXT: s_and_b32 s49, s86, 0xffff -; SI-NEXT: s_and_b32 s38, s96, 0xffff -; SI-NEXT: s_and_b32 s35, s99, 0xffff -; SI-NEXT: s_and_b32 s30, s98, 0xffff -; SI-NEXT: s_and_b32 s93, s59, 0xffff -; SI-NEXT: s_and_b32 s90, s62, 0xffff -; SI-NEXT: s_bfe_u32 s83, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s80, s23, 0x80008 -; SI-NEXT: s_bfe_u32 s70, s27, 0x80008 -; SI-NEXT: s_bfe_u32 s67, s81, 0x80008 -; SI-NEXT: s_bfe_u32 s64, s84, 0x80008 -; SI-NEXT: s_bfe_u32 s53, s86, 0x80008 -; SI-NEXT: s_bfe_u32 s50, s96, 0x80008 -; SI-NEXT: s_bfe_u32 s39, s99, 0x80008 -; SI-NEXT: s_bfe_u32 s36, s98, 0x80008 -; SI-NEXT: s_bfe_u32 s31, s59, 0x80008 -; SI-NEXT: s_bfe_u32 s94, s62, 0x80008 -; SI-NEXT: s_bfe_u32 s91, s72, 0x80008 -; SI-NEXT: v_writelane_b32 v62, s4, 13 -; SI-NEXT: v_alignbit_b32 v45, s13, v8, 24 -; SI-NEXT: v_alignbit_b32 v47, s13, v8, 16 -; SI-NEXT: v_alignbit_b32 v57, s13, v8, 8 -; SI-NEXT: v_alignbit_b32 v41, s12, v5, 24 -; SI-NEXT: v_alignbit_b32 v43, s12, v5, 16 -; SI-NEXT: v_alignbit_b32 v44, s12, v5, 8 -; SI-NEXT: v_alignbit_b32 v21, s11, v13, 24 -; SI-NEXT: v_alignbit_b32 v22, s11, v13, 16 -; SI-NEXT: v_alignbit_b32 v24, s11, v13, 8 -; SI-NEXT: v_alignbit_b32 v17, s10, v9, 24 -; SI-NEXT: v_alignbit_b32 v18, s10, v9, 16 -; SI-NEXT: v_alignbit_b32 v20, s10, v9, 8 -; SI-NEXT: v_alignbit_b32 v59, s9, v6, 24 -; SI-NEXT: v_alignbit_b32 v60, s9, v6, 16 -; SI-NEXT: v_alignbit_b32 v61, s9, v6, 8 -; SI-NEXT: v_alignbit_b32 v46, s8, v4, 24 -; SI-NEXT: v_alignbit_b32 v56, s8, v4, 16 -; SI-NEXT: v_alignbit_b32 v58, s8, v4, 8 -; SI-NEXT: v_alignbit_b32 v55, s7, v2, 24 -; SI-NEXT: v_alignbit_b32 v40, s7, v2, 16 -; SI-NEXT: v_alignbit_b32 v42, s7, v2, 8 -; SI-NEXT: v_alignbit_b32 v52, s6, v1, 24 -; SI-NEXT: v_alignbit_b32 v53, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v54, s6, v1, 8 -; SI-NEXT: s_cbranch_execnz .LBB99_3 -; SI-NEXT: .LBB99_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s88, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s79, s79, 3 -; SI-NEXT: s_add_i32 s6, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s79, 0xffff -; SI-NEXT: s_lshl_b32 s5, s78, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s77, s77, 3 -; SI-NEXT: s_add_i32 s7, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s77, 0xffff -; SI-NEXT: s_lshl_b32 s5, s76, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s75, s75, 3 -; SI-NEXT: s_add_i32 s8, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s75, 0xffff -; SI-NEXT: s_lshl_b32 s5, s74, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s73, s73, 3 -; SI-NEXT: s_add_i32 s9, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s73, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s63, s63, 3 -; SI-NEXT: s_add_i32 s10, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s63, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s60, s60, 3 -; SI-NEXT: s_add_i32 s11, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s60, 0xffff -; SI-NEXT: s_lshl_b32 s5, s59, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s61, s61, 3 -; SI-NEXT: s_add_i32 s12, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s61, 0xffff -; 
SI-NEXT: s_lshl_b32 s5, s98, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s58, s58, 3 -; SI-NEXT: s_add_i32 s13, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s58, 0xffff -; SI-NEXT: s_lshl_b32 s5, s99, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s97, s97, 3 -; SI-NEXT: s_add_i32 s14, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s97, 0xffff -; SI-NEXT: s_lshl_b32 s5, s96, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s87, s87, 3 -; SI-NEXT: s_add_i32 s15, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s87, 0xffff -; SI-NEXT: s_lshl_b32 s5, s86, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s85, s85, 3 -; SI-NEXT: s_add_i32 s40, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s85, 0xffff -; SI-NEXT: s_lshl_b32 s5, s84, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: s_lshl_b32 s5, s29, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s82, s82, 3 -; SI-NEXT: s_add_i32 s42, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s82, 0xffff -; SI-NEXT: s_lshl_b32 s5, s81, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s43, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: s_lshl_b32 s5, s25, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s44, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: s_lshl_b32 s5, s27, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s45, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: s_lshl_b32 s5, s21, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s46, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: s_lshl_b32 s5, s23, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s47, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s56, s4, 0x30000 -; SI-NEXT: v_readlane_b32 s4, v62, 0 -; SI-NEXT: s_add_i32 s4, s4, 3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s57, s4, 0x30000 -; SI-NEXT: v_mov_b32_e32 v21, s56 -; SI-NEXT: v_alignbit_b32 v22, s57, v21, 24 -; SI-NEXT: v_alignbit_b32 v50, s57, v21, 16 -; SI-NEXT: v_alignbit_b32 v21, s57, v21, 8 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v21, s46 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s47, v21, 24 -; SI-NEXT: s_lshr_b32 s4, s11, 8 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s47, v21, 16 -; SI-NEXT: v_alignbit_b32 v51, s47, v21, 8 -; SI-NEXT: v_mov_b32_e32 v21, s44 -; SI-NEXT: v_writelane_b32 v62, s4, 1 -; SI-NEXT: s_lshr_b32 s4, s10, 16 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v32 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s45, v21, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 2 -; SI-NEXT: s_lshr_b32 s4, s10, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: 
v_add_i32_e32 v7, vcc, 3, v7 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_or_b32_e32 v3, v16, v3 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s45, v21, 16 -; SI-NEXT: v_alignbit_b32 v49, s45, v21, 8 -; SI-NEXT: v_mov_b32_e32 v21, s42 -; SI-NEXT: v_writelane_b32 v62, s4, 3 -; SI-NEXT: s_lshr_b32 s4, s9, 24 -; SI-NEXT: v_or_b32_e32 v5, v36, v5 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v3 -; SI-NEXT: v_mov_b32_e32 v3, s41 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s43, v21, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 4 -; SI-NEXT: s_lshr_b32 s4, s9, 16 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x30000, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v31 -; SI-NEXT: v_or_b32_e32 v7, v14, v7 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, s43, v21, 16 -; SI-NEXT: v_alignbit_b32 v48, s43, v21, 8 -; SI-NEXT: v_alignbit_b32 v21, v3, v16, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 5 -; SI-NEXT: s_lshr_b32 s4, s9, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_add_i32_e32 v11, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v7 -; SI-NEXT: v_mov_b32_e32 v7, s40 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v3, v16, 16 -; SI-NEXT: v_alignbit_b32 v3, v3, v16, 8 -; SI-NEXT: v_writelane_b32 v62, s4, 6 -; SI-NEXT: s_lshr_b32 s4, s8, 24 -; SI-NEXT: v_or_b32_e32 v5, v30, v5 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v7, v14, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 7 -; SI-NEXT: s_lshr_b32 s4, s8, 16 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v27 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v7, v14, 16 -; SI-NEXT: v_writelane_b32 v62, s4, 8 -; SI-NEXT: s_lshr_b32 s4, s8, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v11 -; SI-NEXT: v_mov_b32_e32 v11, s15 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v7, v14, 8 -; SI-NEXT: v_writelane_b32 v62, s4, 9 -; SI-NEXT: s_lshr_b32 s4, s7, 24 -; SI-NEXT: v_or_b32_e32 v5, v29, v5 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v11, v12, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 10 -; SI-NEXT: s_lshr_b32 s4, s7, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v34 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v33 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v23 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v19 -; SI-NEXT: v_or_b32_e32 v10, v25, v10 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v11, v12, 16 -; 
SI-NEXT: v_writelane_b32 v62, s4, 11 -; SI-NEXT: s_lshr_b32 s4, s7, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v10 -; SI-NEXT: v_mov_b32_e32 v15, s14 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v11, v12, 8 -; SI-NEXT: v_writelane_b32 v62, s4, 12 -; SI-NEXT: s_lshr_b32 s4, s6, 24 -; SI-NEXT: v_or_b32_e32 v1, v39, v1 -; SI-NEXT: v_or_b32_e32 v2, v38, v2 -; SI-NEXT: v_or_b32_e32 v4, v37, v4 -; SI-NEXT: v_or_b32_e32 v5, v28, v5 -; SI-NEXT: v_or_b32_e32 v8, v26, v8 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v15, v10, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 13 -; SI-NEXT: s_lshr_b32 s4, s6, 16 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; SI-NEXT: v_mov_b32_e32 v35, s6 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; SI-NEXT: v_mov_b32_e32 v34, s7 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; SI-NEXT: v_mov_b32_e32 v33, s8 -; SI-NEXT: v_mov_b32_e32 v32, s9 -; SI-NEXT: v_mov_b32_e32 v20, s10 -; SI-NEXT: v_mov_b32_e32 v17, s11 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v5 -; SI-NEXT: v_mov_b32_e32 v18, s12 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v8 -; SI-NEXT: v_mov_b32_e32 v19, s13 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v15, v10, 16 -; SI-NEXT: v_writelane_b32 v62, s4, 14 -; SI-NEXT: s_lshr_b32 s4, s6, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v15, v10, 8 -; SI-NEXT: v_alignbit_b32 v45, v19, v8, 24 -; SI-NEXT: v_alignbit_b32 v47, v19, v8, 16 -; SI-NEXT: v_alignbit_b32 v57, v19, v8, 8 -; SI-NEXT: v_alignbit_b32 v41, v18, v5, 24 -; SI-NEXT: v_alignbit_b32 v43, v18, v5, 16 -; SI-NEXT: v_alignbit_b32 v44, v18, v5, 8 -; SI-NEXT: v_alignbit_b32 v21, v17, v13, 24 -; SI-NEXT: v_alignbit_b32 v22, v17, v13, 16 -; SI-NEXT: v_alignbit_b32 v24, v17, v13, 8 -; SI-NEXT: v_alignbit_b32 v17, v20, v9, 24 -; SI-NEXT: v_alignbit_b32 v18, v20, v9, 16 -; SI-NEXT: v_alignbit_b32 v20, v20, v9, 8 -; SI-NEXT: v_alignbit_b32 v59, v32, v6, 24 -; SI-NEXT: v_alignbit_b32 v60, v32, v6, 16 -; SI-NEXT: v_alignbit_b32 v61, v32, v6, 8 -; SI-NEXT: v_alignbit_b32 v46, v33, v4, 24 -; SI-NEXT: v_alignbit_b32 v56, v33, v4, 16 -; SI-NEXT: v_alignbit_b32 v58, v33, v4, 8 -; SI-NEXT: v_alignbit_b32 v55, v34, v2, 24 -; SI-NEXT: v_alignbit_b32 v40, v34, v2, 16 -; SI-NEXT: v_alignbit_b32 v42, v34, v2, 8 -; SI-NEXT: v_alignbit_b32 v52, v35, v1, 24 -; SI-NEXT: v_alignbit_b32 v53, v35, v1, 16 -; SI-NEXT: v_alignbit_b32 v54, v35, v1, 8 -; SI-NEXT: s_lshr_b32 s83, s57, 24 -; SI-NEXT: s_lshr_b32 s71, s57, 16 -; SI-NEXT: s_lshr_b32 s68, s57, 8 -; SI-NEXT: s_lshr_b32 s80, s47, 24 -; SI-NEXT: s_lshr_b32 s69, s47, 16 -; SI-NEXT: s_lshr_b32 s65, s47, 8 -; SI-NEXT: s_lshr_b32 s70, s45, 24 -; SI-NEXT: s_lshr_b32 s66, s45, 16 -; SI-NEXT: s_lshr_b32 s54, s45, 8 -; SI-NEXT: s_lshr_b32 s67, s43, 24 -; SI-NEXT: s_lshr_b32 s55, s43, 16 -; SI-NEXT: 
s_lshr_b32 s51, s43, 8 -; SI-NEXT: s_lshr_b32 s64, s41, 24 -; SI-NEXT: s_lshr_b32 s52, s41, 16 -; SI-NEXT: s_lshr_b32 s48, s41, 8 -; SI-NEXT: s_lshr_b32 s53, s40, 24 -; SI-NEXT: s_lshr_b32 s49, s40, 16 -; SI-NEXT: s_lshr_b32 s37, s40, 8 -; SI-NEXT: s_lshr_b32 s50, s15, 24 -; SI-NEXT: s_lshr_b32 s38, s15, 16 -; SI-NEXT: s_lshr_b32 s34, s15, 8 -; SI-NEXT: s_lshr_b32 s39, s14, 24 -; SI-NEXT: s_lshr_b32 s35, s14, 16 -; SI-NEXT: s_lshr_b32 s95, s14, 8 -; SI-NEXT: s_lshr_b32 s36, s13, 24 -; SI-NEXT: s_lshr_b32 s30, s13, 16 -; SI-NEXT: s_lshr_b32 s92, s13, 8 -; SI-NEXT: s_lshr_b32 s31, s12, 24 -; SI-NEXT: s_lshr_b32 s93, s12, 16 -; SI-NEXT: s_lshr_b32 s89, s12, 8 -; SI-NEXT: s_lshr_b32 s94, s11, 24 -; SI-NEXT: s_lshr_b32 s90, s11, 16 -; SI-NEXT: s_lshr_b32 s91, s10, 24 -; SI-NEXT: v_writelane_b32 v62, s4, 15 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: .LBB99_3: ; %end -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: s_and_b32 s4, s56, 0xff -; SI-NEXT: s_lshl_b32 s5, s68, 8 -; SI-NEXT: s_lshl_b32 s16, s83, 24 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_readlane_b32 s99, v63, 35 -; SI-NEXT: v_readlane_b32 s98, v63, 34 -; SI-NEXT: v_readlane_b32 s97, v63, 33 -; SI-NEXT: v_readlane_b32 s96, v63, 32 -; SI-NEXT: v_readlane_b32 s87, v63, 31 -; SI-NEXT: v_readlane_b32 s86, v63, 30 -; SI-NEXT: v_readlane_b32 s85, v63, 29 -; SI-NEXT: v_readlane_b32 s84, v63, 28 -; SI-NEXT: v_readlane_b32 s83, v63, 27 -; SI-NEXT: v_readlane_b32 s82, v63, 26 -; SI-NEXT: v_readlane_b32 s81, v63, 25 -; SI-NEXT: v_readlane_b32 s68, v63, 20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s57, 0xff -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s71, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: s_and_b32 s4, s46, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v51 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s47, 0xff -; SI-NEXT: s_lshl_b32 s5, s65, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s69, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s80, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s54, 8 -; SI-NEXT: s_lshl_b32 s16, s70, 24 -; SI-NEXT: v_readlane_b32 s80, v63, 24 -; SI-NEXT: v_readlane_b32 s71, v63, 23 -; SI-NEXT: v_readlane_b32 s70, v63, 22 -; SI-NEXT: v_readlane_b32 s69, v63, 21 -; SI-NEXT: v_readlane_b32 s65, 
v63, 17 -; SI-NEXT: v_readlane_b32 s54, v63, 14 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: s_and_b32 s4, s44, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v49 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s66, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s51, 8 -; SI-NEXT: s_lshl_b32 s16, s67, 24 -; SI-NEXT: v_readlane_b32 s67, v63, 19 -; SI-NEXT: v_readlane_b32 s66, v63, 18 -; SI-NEXT: v_readlane_b32 s51, v63, 11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: s_and_b32 s4, s42, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v48 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s55, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s48, 8 -; SI-NEXT: s_lshl_b32 s16, s64, 24 -; SI-NEXT: v_readlane_b32 s64, v63, 16 -; SI-NEXT: v_readlane_b32 s55, v63, 15 -; SI-NEXT: v_readlane_b32 s48, v63, 8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s52, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: 
s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s37, 8 -; SI-NEXT: s_lshl_b32 s16, s53, 24 -; SI-NEXT: v_readlane_b32 s53, v63, 13 -; SI-NEXT: v_readlane_b32 s52, v63, 12 -; SI-NEXT: v_readlane_b32 s37, v63, 5 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s49, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s34, 8 -; SI-NEXT: v_readlane_b32 s49, v63, 9 -; SI-NEXT: v_readlane_b32 s34, v63, 2 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v23, v26, v27 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v24, v28, v29 +; GFX11-FAKE16-NEXT: s_clause 0x5 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 +; GFX11-FAKE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-FAKE16-NEXT: s_clause 0x13 +; GFX11-FAKE16-NEXT: scratch_load_b32 v75, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v74, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v73, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v72, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v63, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v62, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v61, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v60, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v59, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v58, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v57, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v56, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v47, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v46, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v45, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v44, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v43, off, s32 offset:76 +; GFX11-FAKE16-NEXT: 
scratch_load_b32 v42, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v41, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v40, off, s32 offset:88 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %b, 0 + br i1 %cmp, label %cmp.true, label %cmp.false + +cmp.true: + %a1 = add <64 x i16> %a, splat (i16 3) + %a2 = bitcast <64 x i16> %a1 to <128 x i8> + br label %end + +cmp.false: + %a3 = bitcast <64 x i16> %a to <128 x i8> + br label %end + +end: + %phi = phi <128 x i8> [ %a2, %cmp.true ], [ %a3, %cmp.false ] + ret <128 x i8> %phi +} + +define inreg <128 x i8> @bitcast_v64i16_to_v128i8_scalar(<64 x i16> inreg %a, i32 inreg %b) { +; SI-LABEL: bitcast_v64i16_to_v128i8_scalar: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_writelane_b32 v40, s30, 0 +; SI-NEXT: v_writelane_b32 v40, s31, 1 +; SI-NEXT: v_writelane_b32 v40, s34, 2 +; SI-NEXT: v_writelane_b32 v40, s35, 3 +; SI-NEXT: v_writelane_b32 v40, s36, 4 +; SI-NEXT: v_writelane_b32 v40, s37, 5 +; SI-NEXT: v_writelane_b32 v40, s38, 6 +; SI-NEXT: v_writelane_b32 v40, s39, 7 +; SI-NEXT: v_writelane_b32 v40, s48, 8 +; SI-NEXT: v_writelane_b32 v40, s49, 9 +; SI-NEXT: v_writelane_b32 v40, s50, 10 +; SI-NEXT: v_writelane_b32 v40, s51, 11 +; SI-NEXT: v_writelane_b32 v40, s52, 12 +; SI-NEXT: v_writelane_b32 v40, s53, 13 +; SI-NEXT: v_writelane_b32 v40, s54, 14 +; SI-NEXT: v_writelane_b32 v40, s55, 15 +; SI-NEXT: v_writelane_b32 v40, s64, 16 +; SI-NEXT: v_writelane_b32 v40, s65, 17 +; SI-NEXT: v_writelane_b32 v40, s66, 18 +; SI-NEXT: v_writelane_b32 v40, s67, 19 +; SI-NEXT: v_writelane_b32 v40, s68, 20 +; SI-NEXT: v_writelane_b32 v40, s69, 21 +; SI-NEXT: v_writelane_b32 v40, s70, 22 +; SI-NEXT: s_mov_b32 s88, s17 +; SI-NEXT: v_writelane_b32 v40, s71, 23 +; SI-NEXT: v_writelane_b32 v40, s80, 24 +; SI-NEXT: v_writelane_b32 v40, s81, 25 +; SI-NEXT: v_writelane_b32 v40, s82, 26 +; SI-NEXT: v_writelane_b32 v40, s83, 27 +; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: ; implicit-def: $vgpr41 : SGPR spill to VGPR lane +; SI-NEXT: v_readfirstlane_b32 s7, v15 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_writelane_b32 v41, s6, 0 +; SI-NEXT: v_readfirstlane_b32 s8, v21 +; SI-NEXT: v_writelane_b32 v41, s7, 1 +; SI-NEXT: v_readfirstlane_b32 s9, v20 +; SI-NEXT: v_writelane_b32 v41, s8, 2 +; SI-NEXT: v_readfirstlane_b32 s10, v19 +; SI-NEXT: v_writelane_b32 v41, s9, 3 +; SI-NEXT: v_readfirstlane_b32 s11, v25 +; SI-NEXT: v_writelane_b32 v41, s10, 4 +; SI-NEXT: v_readfirstlane_b32 s12, v24 +; SI-NEXT: v_writelane_b32 v41, s11, 5 +; 
SI-NEXT: v_readfirstlane_b32 s13, v23 +; SI-NEXT: v_writelane_b32 v41, s12, 6 +; SI-NEXT: v_readfirstlane_b32 s15, v29 +; SI-NEXT: v_writelane_b32 v41, s13, 7 +; SI-NEXT: v_readfirstlane_b32 s14, v28 +; SI-NEXT: v_writelane_b32 v41, s15, 8 +; SI-NEXT: s_mov_b32 s79, s16 +; SI-NEXT: v_readfirstlane_b32 s16, v27 +; SI-NEXT: v_writelane_b32 v41, s14, 9 +; SI-NEXT: v_writelane_b32 v41, s16, 10 +; SI-NEXT: v_writelane_b32 v40, s84, 28 +; SI-NEXT: v_writelane_b32 v40, s85, 29 +; SI-NEXT: v_writelane_b32 v40, s86, 30 +; SI-NEXT: v_writelane_b32 v40, s87, 31 +; SI-NEXT: v_writelane_b32 v40, s96, 32 +; SI-NEXT: v_writelane_b32 v40, s97, 33 +; SI-NEXT: v_writelane_b32 v40, s98, 34 +; SI-NEXT: v_writelane_b32 v40, s99, 35 +; SI-NEXT: v_readfirstlane_b32 s98, v30 +; SI-NEXT: v_readfirstlane_b32 s97, v26 +; SI-NEXT: v_readfirstlane_b32 s96, v22 +; SI-NEXT: v_readfirstlane_b32 s87, v18 +; SI-NEXT: v_readfirstlane_b32 s81, v17 +; SI-NEXT: v_readfirstlane_b32 s86, v14 +; SI-NEXT: v_readfirstlane_b32 s67, v13 +; SI-NEXT: v_readfirstlane_b32 s69, v12 +; SI-NEXT: v_readfirstlane_b32 s71, v11 +; SI-NEXT: v_readfirstlane_b32 s85, v10 +; SI-NEXT: v_readfirstlane_b32 s51, v9 +; SI-NEXT: v_readfirstlane_b32 s53, v8 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s89, v31 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s91, v32 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_readfirstlane_b32 s93, v33 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s55, v34 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s17, v35 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s95, v36 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s35, v37 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s83, v38 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:80 +; SI-NEXT: v_readfirstlane_b32 s65, v7 +; SI-NEXT: v_readfirstlane_b32 s84, v6 +; SI-NEXT: v_readfirstlane_b32 s31, v5 +; SI-NEXT: v_readfirstlane_b32 s37, v4 +; SI-NEXT: v_readfirstlane_b32 s49, v3 +; SI-NEXT: v_readfirstlane_b32 s78, v2 +; SI-NEXT: v_readfirstlane_b32 s39, v1 +; SI-NEXT: ; implicit-def: $vgpr43 : SGPR spill to VGPR lane +; SI-NEXT: ; implicit-def: $vgpr42 : SGPR spill to VGPR lane +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_readfirstlane_b32 s77, v31 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_readfirstlane_b32 s38, v32 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_readfirstlane_b32 s48, v33 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_readfirstlane_b32 s50, v39 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_readfirstlane_b32 s76, v48 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_readfirstlane_b32 s30, v49 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_readfirstlane_b32 s34, v50 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: 
v_readfirstlane_b32 s36, v51 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_readfirstlane_b32 s99, v34 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_readfirstlane_b32 s90, v35 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_readfirstlane_b32 s92, v36 +; SI-NEXT: v_writelane_b32 v41, s90, 11 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_readfirstlane_b32 s94, v37 +; SI-NEXT: v_writelane_b32 v41, s92, 12 +; SI-NEXT: v_writelane_b32 v41, s94, 13 +; SI-NEXT: v_writelane_b32 v41, s30, 14 +; SI-NEXT: v_writelane_b32 v41, s34, 15 +; SI-NEXT: v_writelane_b32 v41, s36, 16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 -; SI-NEXT: s_and_b32 s4, s15, 0xff +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v38 +; SI-NEXT: v_writelane_b32 v41, s38, 17 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_writelane_b32 v41, s48, 18 +; SI-NEXT: v_writelane_b32 v41, s50, 19 +; SI-NEXT: s_cbranch_scc0 .LBB99_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_and_b32 s4, s79, 0xffff +; SI-NEXT: s_lshl_b32 s5, s88, 16 +; SI-NEXT: s_or_b32 s60, s4, s5 +; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_lshl_b32 s5, s19, 16 +; SI-NEXT: s_or_b32 s61, s4, s5 +; SI-NEXT: s_and_b32 s4, s20, 0xffff +; SI-NEXT: s_lshl_b32 s5, s21, 16 +; SI-NEXT: s_or_b32 s56, s4, s5 +; SI-NEXT: s_and_b32 s4, s22, 0xffff +; SI-NEXT: s_lshl_b32 s5, s23, 16 +; SI-NEXT: s_or_b32 s57, s4, s5 +; SI-NEXT: s_and_b32 s4, s24, 0xffff +; SI-NEXT: s_lshl_b32 s5, s25, 16 +; SI-NEXT: s_or_b32 s44, s4, s5 +; SI-NEXT: s_and_b32 s4, s26, 0xffff +; SI-NEXT: s_lshl_b32 s5, s27, 16 +; SI-NEXT: s_or_b32 s45, s4, s5 +; SI-NEXT: s_and_b32 s4, s28, 0xffff +; SI-NEXT: s_lshl_b32 s5, s29, 16 +; SI-NEXT: s_or_b32 s74, s4, s5 +; SI-NEXT: s_and_b32 s4, s39, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: s_or_b32 s75, s4, s5 +; SI-NEXT: s_and_b32 s4, s49, 0xffff +; SI-NEXT: s_lshl_b32 s5, s37, 16 +; SI-NEXT: s_or_b32 s72, s4, s5 +; SI-NEXT: s_and_b32 s4, s31, 0xffff +; SI-NEXT: s_lshl_b32 s5, s84, 16 +; SI-NEXT: s_or_b32 s73, s4, s5 +; SI-NEXT: s_and_b32 s4, s65, 0xffff +; SI-NEXT: s_lshl_b32 s5, s53, 16 +; SI-NEXT: s_or_b32 s62, s4, s5 +; SI-NEXT: s_and_b32 s4, s51, 0xffff +; SI-NEXT: s_lshl_b32 s5, s85, 16 +; SI-NEXT: s_or_b32 s63, s4, s5 +; SI-NEXT: s_and_b32 s4, s71, 0xffff +; SI-NEXT: s_lshl_b32 s5, s69, 16 +; SI-NEXT: s_or_b32 s58, s4, s5 +; SI-NEXT: s_and_b32 s4, s67, 0xffff +; SI-NEXT: s_lshl_b32 s5, s86, 16 +; SI-NEXT: s_or_b32 s59, s4, s5 +; SI-NEXT: s_and_b32 s4, s7, 0xffff +; SI-NEXT: s_lshl_b32 s5, s6, 16 +; SI-NEXT: s_or_b32 s46, s4, s5 +; SI-NEXT: s_and_b32 s4, s81, 0xffff +; SI-NEXT: s_lshl_b32 s5, s87, 16 +; SI-NEXT: s_or_b32 s47, s4, s5 +; SI-NEXT: s_and_b32 s4, s10, 0xffff +; SI-NEXT: s_lshl_b32 s5, s9, 16 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s8, 0xffff +; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s13, 0xffff +; SI-NEXT: s_lshl_b32 s5, s12, 16 +; SI-NEXT: s_or_b32 s40, s4, s5 +; SI-NEXT: s_and_b32 s4, s11, 
0xffff +; SI-NEXT: s_lshl_b32 s5, s97, 16 +; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_and_b32 s4, s16, 0xffff +; SI-NEXT: s_lshl_b32 s5, s14, 16 +; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_and_b32 s4, s15, 0xffff +; SI-NEXT: s_lshl_b32 s5, s98, 16 +; SI-NEXT: s_or_b32 s15, s4, s5 +; SI-NEXT: s_and_b32 s4, s94, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s90, 0xffff +; SI-NEXT: s_lshl_b32 s5, s99, 16 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s36, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 16 +; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_and_b32 s4, s30, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_and_b32 s4, s50, 0xffff +; SI-NEXT: s_lshl_b32 s5, s48, 16 +; SI-NEXT: s_or_b32 s8, s4, s5 +; SI-NEXT: s_and_b32 s4, s38, 0xffff +; SI-NEXT: s_lshl_b32 s5, s77, 16 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s83, 0xffff +; SI-NEXT: s_lshl_b32 s5, s35, 16 +; SI-NEXT: s_or_b32 s6, s4, s5 +; SI-NEXT: s_and_b32 s4, s95, 0xffff +; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s55, 0xffff +; SI-NEXT: s_lshl_b32 s5, s93, 16 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s38, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s15, s50, 24 +; SI-NEXT: s_and_b32 s5, s91, 0xffff +; SI-NEXT: s_lshl_b32 s16, s89, 16 +; SI-NEXT: s_or_b32 s5, s5, s16 +; SI-NEXT: s_lshr_b32 s16, s61, 8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v43, s16, 20 +; SI-NEXT: s_lshr_b32 s16, s57, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 23 +; SI-NEXT: s_lshr_b32 s16, s45, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 26 +; SI-NEXT: s_lshr_b32 s16, s75, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 29 +; SI-NEXT: s_lshr_b32 s16, s73, 8 +; SI-NEXT: s_lshr_b64 vcc, s[60:61], 24 +; SI-NEXT: v_writelane_b32 v43, s16, 32 +; SI-NEXT: s_lshr_b32 s16, s63, 8 +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 22 +; SI-NEXT: v_writelane_b32 v43, s16, 35 +; SI-NEXT: s_lshr_b32 s16, s59, 8 +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 23 +; SI-NEXT: s_lshr_b64 vcc, s[60:61], 16 +; SI-NEXT: v_writelane_b32 v43, s16, 38 +; SI-NEXT: s_lshr_b32 s16, s47, 8 +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 20 +; SI-NEXT: v_writelane_b32 v43, s16, 41 +; SI-NEXT: s_lshr_b32 s16, s43, 8 +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 21 +; SI-NEXT: s_lshr_b64 vcc, s[56:57], 24 +; SI-NEXT: v_writelane_b32 v43, s16, 44 +; SI-NEXT: s_lshr_b32 s16, s41, 8 +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 28 +; SI-NEXT: v_writelane_b32 v43, s16, 47 +; SI-NEXT: s_lshr_b32 s16, s15, 8 +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 29 +; SI-NEXT: s_lshr_b64 vcc, s[56:57], 16 +; SI-NEXT: v_writelane_b32 v43, s16, 50 +; SI-NEXT: s_lshr_b32 s16, s13, 8 +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 26 +; SI-NEXT: v_writelane_b32 v43, s16, 53 +; SI-NEXT: s_lshr_b32 s16, s11, 8 +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 27 +; SI-NEXT: s_lshr_b64 vcc, s[56:57], 8 +; SI-NEXT: v_writelane_b32 v43, s16, 56 +; SI-NEXT: s_lshr_b32 s16, s9, 8 +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 59 +; SI-NEXT: s_lshr_b32 s16, s7, 8 +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 25 +; SI-NEXT: s_lshr_b64 vcc, s[46:47], 24 +; SI-NEXT: v_writelane_b32 v43, s16, 62 +; SI-NEXT: s_lshr_b32 s16, s5, 8 +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 32 +; SI-NEXT: v_writelane_b32 v42, s16, 1 +; SI-NEXT: s_and_b32 s16, s19, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 33 +; SI-NEXT: s_lshr_b64 vcc, s[46:47], 16 +; 
SI-NEXT: v_writelane_b32 v43, s16, 19 +; SI-NEXT: s_and_b32 s16, s23, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 30 +; SI-NEXT: v_writelane_b32 v43, s16, 22 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 31 +; SI-NEXT: s_lshr_b64 vcc, s[42:43], 24 +; SI-NEXT: v_writelane_b32 v43, s16, 25 +; SI-NEXT: s_and_b32 s16, s78, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 38 +; SI-NEXT: v_writelane_b32 v43, s16, 28 +; SI-NEXT: s_and_b32 s16, s84, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 39 +; SI-NEXT: s_lshr_b64 vcc, s[42:43], 16 +; SI-NEXT: v_writelane_b32 v43, s16, 31 +; SI-NEXT: s_and_b32 s16, s85, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 36 +; SI-NEXT: v_writelane_b32 v43, s16, 34 +; SI-NEXT: s_and_b32 s16, s86, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 37 +; SI-NEXT: s_lshr_b64 vcc, s[42:43], 8 +; SI-NEXT: v_writelane_b32 v43, s16, 37 +; SI-NEXT: s_and_b32 s16, s87, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 34 +; SI-NEXT: v_writelane_b32 v43, s16, 40 +; SI-NEXT: s_and_b32 s16, s96, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 35 +; SI-NEXT: s_lshr_b64 vcc, s[40:41], 24 +; SI-NEXT: v_writelane_b32 v43, s16, 43 +; SI-NEXT: s_and_b32 s16, s97, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 44 +; SI-NEXT: v_writelane_b32 v43, s16, 46 +; SI-NEXT: s_and_b32 s16, s98, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 45 +; SI-NEXT: s_lshr_b64 vcc, s[40:41], 16 +; SI-NEXT: v_writelane_b32 v43, s16, 49 +; SI-NEXT: s_and_b32 s16, s99, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 42 +; SI-NEXT: v_writelane_b32 v43, s16, 52 +; SI-NEXT: s_and_b32 s16, s76, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 43 +; SI-NEXT: s_lshr_b64 vcc, s[40:41], 8 +; SI-NEXT: v_writelane_b32 v43, s16, 55 +; SI-NEXT: s_and_b32 s16, s77, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 40 +; SI-NEXT: v_writelane_b32 v43, s16, 58 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 41 +; SI-NEXT: s_lshr_b64 vcc, s[14:15], 24 +; SI-NEXT: v_writelane_b32 v43, s16, 61 +; SI-NEXT: s_and_b32 s16, s89, 0xffff +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 50 +; SI-NEXT: v_writelane_b32 v42, s16, 0 +; SI-NEXT: s_bfe_u32 s16, s19, 0x80008 +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 51 +; SI-NEXT: s_lshr_b64 vcc, s[14:15], 16 +; SI-NEXT: v_writelane_b32 v43, s16, 18 +; SI-NEXT: s_bfe_u32 s16, s23, 0x80008 +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 48 +; SI-NEXT: v_writelane_b32 v43, s16, 21 +; SI-NEXT: s_bfe_u32 s16, s27, 0x80008 +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 49 +; SI-NEXT: s_lshr_b64 vcc, s[14:15], 8 +; SI-NEXT: v_writelane_b32 v43, s16, 24 +; SI-NEXT: s_bfe_u32 s16, s78, 0x80008 +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 46 +; SI-NEXT: v_writelane_b32 v43, s16, 27 +; SI-NEXT: s_bfe_u32 s16, s84, 0x80008 +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 47 +; SI-NEXT: s_lshr_b64 vcc, s[12:13], 24 +; SI-NEXT: v_writelane_b32 v43, s16, 30 +; SI-NEXT: s_bfe_u32 s16, s85, 0x80008 +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 56 +; SI-NEXT: v_writelane_b32 v43, s16, 33 +; SI-NEXT: s_bfe_u32 s16, s86, 0x80008 +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 57 +; SI-NEXT: s_lshr_b64 vcc, s[12:13], 16 +; SI-NEXT: v_writelane_b32 v43, s16, 36 +; SI-NEXT: s_bfe_u32 s16, s87, 0x80008 +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 54 +; SI-NEXT: v_writelane_b32 v43, s16, 39 +; SI-NEXT: s_bfe_u32 s16, s96, 0x80008 +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 55 +; SI-NEXT: s_lshr_b64 vcc, s[12:13], 8 +; SI-NEXT: v_writelane_b32 v43, s16, 42 +; SI-NEXT: s_bfe_u32 s16, s97, 0x80008 +; 
SI-NEXT: v_writelane_b32 v41, vcc_lo, 52 +; SI-NEXT: v_writelane_b32 v43, s16, 45 +; SI-NEXT: s_bfe_u32 s16, s98, 0x80008 +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 53 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 24 +; SI-NEXT: v_writelane_b32 v43, s16, 48 +; SI-NEXT: s_bfe_u32 s16, s99, 0x80008 +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 62 +; SI-NEXT: v_writelane_b32 v43, s16, 51 +; SI-NEXT: s_bfe_u32 s16, s76, 0x80008 +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 63 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 16 +; SI-NEXT: v_writelane_b32 v43, s16, 54 +; SI-NEXT: s_bfe_u32 s16, s77, 0x80008 +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 60 +; SI-NEXT: v_writelane_b32 v43, s16, 57 +; SI-NEXT: s_bfe_u32 s16, s17, 0x80008 +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 61 +; SI-NEXT: s_lshr_b64 vcc, s[10:11], 8 +; SI-NEXT: v_writelane_b32 v43, s16, 60 +; SI-NEXT: s_bfe_u32 s16, s89, 0x80008 +; SI-NEXT: v_writelane_b32 v41, vcc_lo, 58 +; SI-NEXT: v_writelane_b32 v43, s16, 63 +; SI-NEXT: v_writelane_b32 v41, vcc_hi, 59 +; SI-NEXT: s_lshr_b64 vcc, s[8:9], 24 +; SI-NEXT: s_mov_b32 s16, s93 +; SI-NEXT: s_lshr_b64 s[92:93], s[60:61], 8 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 4 +; SI-NEXT: s_mov_b32 s93, s16 +; SI-NEXT: s_mov_b32 s16, s71 +; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 5 +; SI-NEXT: s_lshr_b64 vcc, s[8:9], 16 +; SI-NEXT: s_mov_b32 s71, s16 +; SI-NEXT: s_mov_b32 s16, s81 +; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 2 +; SI-NEXT: s_mov_b32 s81, s16 +; SI-NEXT: s_mov_b32 s16, s83 +; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 3 +; SI-NEXT: s_lshr_b64 vcc, s[8:9], 8 +; SI-NEXT: s_mov_b32 s83, s16 +; SI-NEXT: s_mov_b32 s16, s65 +; SI-NEXT: s_lshr_b64 s[64:65], s[74:75], 24 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 0 +; SI-NEXT: s_mov_b32 s65, s16 +; SI-NEXT: s_mov_b32 s16, s67 +; SI-NEXT: s_lshr_b64 s[66:67], s[74:75], 16 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 1 +; SI-NEXT: s_lshr_b64 vcc, s[6:7], 24 +; SI-NEXT: s_mov_b32 s67, s16 +; SI-NEXT: s_mov_b32 s16, s69 +; SI-NEXT: s_lshr_b64 s[68:69], s[74:75], 8 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 10 +; SI-NEXT: s_mov_b32 s69, s16 +; SI-NEXT: s_mov_b32 s16, s51 +; SI-NEXT: s_lshr_b64 s[50:51], s[72:73], 24 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 11 +; SI-NEXT: s_lshr_b64 vcc, s[6:7], 16 +; SI-NEXT: s_mov_b32 s51, s16 +; SI-NEXT: s_mov_b32 s16, s53 +; SI-NEXT: s_lshr_b64 s[52:53], s[72:73], 16 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 8 +; SI-NEXT: s_mov_b32 s53, s16 +; SI-NEXT: s_mov_b32 s16, s55 +; SI-NEXT: s_lshr_b64 s[54:55], s[72:73], 8 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 9 +; SI-NEXT: s_lshr_b64 vcc, s[6:7], 8 +; SI-NEXT: s_mov_b32 s55, s16 +; SI-NEXT: s_mov_b32 s16, s37 +; SI-NEXT: s_lshr_b64 s[36:37], s[62:63], 24 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 6 +; SI-NEXT: s_mov_b32 s37, s16 +; SI-NEXT: s_mov_b32 s16, s39 +; SI-NEXT: s_lshr_b64 s[38:39], s[62:63], 16 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 7 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 24 +; SI-NEXT: s_mov_b32 s39, s16 +; SI-NEXT: s_mov_b32 s16, s49 +; SI-NEXT: s_lshr_b64 s[48:49], s[62:63], 8 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 16 +; SI-NEXT: s_mov_b32 s49, s16 +; SI-NEXT: s_mov_b32 s16, s95 +; SI-NEXT: s_lshr_b64 s[94:95], s[58:59], 24 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 17 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 16 +; SI-NEXT: s_mov_b32 s95, s16 +; SI-NEXT: s_mov_b32 s16, s31 +; SI-NEXT: s_lshr_b64 s[30:31], s[58:59], 16 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 14 +; SI-NEXT: 
s_mov_b32 s31, s16 +; SI-NEXT: s_mov_b32 s16, s35 +; SI-NEXT: s_lshr_b64 s[34:35], s[58:59], 8 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 15 +; SI-NEXT: s_lshr_b64 vcc, s[4:5], 8 +; SI-NEXT: s_mov_b32 s35, s16 +; SI-NEXT: s_mov_b32 s16, s91 +; SI-NEXT: s_lshr_b64 s[90:91], s[46:47], 8 +; SI-NEXT: v_writelane_b32 v43, vcc_lo, 12 +; SI-NEXT: s_mov_b32 s91, s16 +; SI-NEXT: v_writelane_b32 v43, vcc_hi, 13 +; SI-NEXT: s_cbranch_execnz .LBB99_3 +; SI-NEXT: .LBB99_2: ; %cmp.true +; SI-NEXT: s_add_i32 s4, s55, 3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s15, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s95, 8 -; SI-NEXT: v_readlane_b32 s50, v63, 10 -; SI-NEXT: v_readlane_b32 s38, v63, 6 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s5, s93, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s5, s91, 3 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_lshl_b32 s6, s89, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s6, s83, 3 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s7, s35, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s7, s95, 3 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_lshl_b32 s8, s17, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_readlane_b32 s8, v41, 19 +; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: v_readlane_b32 s9, v41, 18 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_readlane_b32 s9, v41, 17 +; SI-NEXT: s_add_i32 s9, s9, 3 +; SI-NEXT: s_and_b32 s9, s9, 0xffff +; SI-NEXT: s_lshl_b32 s10, s77, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: v_readlane_b32 s10, v41, 16 +; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: v_readlane_b32 s11, v41, 15 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_readlane_b32 s11, v41, 14 +; SI-NEXT: s_add_i32 s11, s11, 3 +; SI-NEXT: s_and_b32 s11, s11, 0xffff +; SI-NEXT: s_lshl_b32 s12, s76, 16 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_readlane_b32 s12, v41, 13 +; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: v_readlane_b32 s13, v41, 12 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_readlane_b32 s13, v41, 11 +; SI-NEXT: s_add_i32 s13, s13, 3 +; SI-NEXT: s_and_b32 s13, s13, 0xffff +; SI-NEXT: s_lshl_b32 s14, s99, 16 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: v_readlane_b32 s14, v41, 10 +; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: v_readlane_b32 s15, v41, 9 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_readlane_b32 s15, v41, 8 +; SI-NEXT: s_add_i32 s15, s15, 3 +; SI-NEXT: 
s_and_b32 s15, s15, 0xffff +; SI-NEXT: s_lshl_b32 s16, s98, 16 +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_readlane_b32 s16, v41, 7 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 6 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s40, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 5 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s97, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s41, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 4 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s42, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 2 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s96, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s43, s16, 0x30000 +; SI-NEXT: v_readlane_b32 s16, v41, 1 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: v_readlane_b32 s17, v41, 0 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s46, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s81, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s87, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s47, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s71, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s69, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s58, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s67, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s86, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s59, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s65, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s62, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s51, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s85, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s63, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s49, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s37, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s72, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s31, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s84, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s73, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s28, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s29, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s74, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s39, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s78, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s75, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s24, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s25, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s44, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s26, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s27, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s45, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s20, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s21, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s56, s16, 0x30000 +; SI-NEXT: 
s_add_i32 s16, s22, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s23, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s57, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s79, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s88, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s60, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s18, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s19, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s61, s16, 0x30000 +; SI-NEXT: s_lshr_b64 s[16:17], s[60:61], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 22 +; SI-NEXT: v_writelane_b32 v41, s17, 23 +; SI-NEXT: s_lshr_b64 s[16:17], s[60:61], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 20 +; SI-NEXT: v_writelane_b32 v41, s17, 21 +; SI-NEXT: s_lshr_b32 s16, s61, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 18 +; SI-NEXT: s_lshr_b32 s16, s61, 16 +; SI-NEXT: v_writelane_b32 v43, s16, 19 +; SI-NEXT: s_lshr_b32 s16, s61, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 20 +; SI-NEXT: s_lshr_b32 s16, s57, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 21 +; SI-NEXT: s_lshr_b32 s16, s57, 16 +; SI-NEXT: v_writelane_b32 v43, s16, 22 +; SI-NEXT: s_lshr_b32 s16, s57, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 23 +; SI-NEXT: s_lshr_b32 s16, s45, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 24 +; SI-NEXT: s_lshr_b32 s16, s45, 16 +; SI-NEXT: v_writelane_b32 v43, s16, 25 +; SI-NEXT: s_lshr_b32 s16, s45, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 26 +; SI-NEXT: s_lshr_b32 s16, s75, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 27 +; SI-NEXT: s_lshr_b32 s16, s75, 16 +; SI-NEXT: v_writelane_b32 v43, s16, 28 +; SI-NEXT: s_lshr_b32 s16, s75, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 29 +; SI-NEXT: s_lshr_b32 s16, s73, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 30 +; SI-NEXT: s_lshr_b32 s16, s73, 16 +; SI-NEXT: v_writelane_b32 v43, s16, 31 +; SI-NEXT: s_lshr_b32 s16, s73, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 32 +; SI-NEXT: s_lshr_b32 s16, s63, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 33 +; SI-NEXT: s_lshr_b32 s16, s63, 16 +; SI-NEXT: v_writelane_b32 v43, s16, 34 +; SI-NEXT: s_lshr_b32 s16, s63, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 35 +; SI-NEXT: s_lshr_b32 s16, s59, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 36 +; SI-NEXT: s_lshr_b32 s16, s59, 16 +; SI-NEXT: v_writelane_b32 v43, s16, 37 +; SI-NEXT: s_lshr_b32 s16, s59, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 38 +; SI-NEXT: s_lshr_b32 s16, s47, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 39 +; SI-NEXT: s_lshr_b32 s16, s47, 16 +; SI-NEXT: v_writelane_b32 v43, s16, 40 +; SI-NEXT: s_lshr_b32 s16, s47, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 41 +; SI-NEXT: s_lshr_b32 s16, s43, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 42 +; SI-NEXT: s_lshr_b32 s16, s43, 16 +; SI-NEXT: v_writelane_b32 v43, s16, 43 +; SI-NEXT: s_lshr_b32 s16, s43, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 44 +; SI-NEXT: s_lshr_b32 s16, s41, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 45 +; SI-NEXT: s_lshr_b32 s16, s41, 16 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: v_writelane_b32 v43, s16, 46 +; SI-NEXT: s_lshr_b32 s16, s41, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 47 +; SI-NEXT: s_lshr_b32 s16, s15, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 48 +; SI-NEXT: s_lshr_b32 s16, s15, 16 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: v_writelane_b32 v43, s16, 49 +; SI-NEXT: s_lshr_b32 s16, s15, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 50 +; SI-NEXT: s_lshr_b32 s16, s13, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 51 +; SI-NEXT: s_lshr_b32 s16, s13, 16 +; SI-NEXT: s_add_i32 s11, s11, 
0x30000 +; SI-NEXT: v_writelane_b32 v43, s16, 52 +; SI-NEXT: s_lshr_b32 s16, s13, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 53 +; SI-NEXT: s_lshr_b32 s16, s11, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 54 +; SI-NEXT: s_lshr_b32 s16, s11, 16 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: v_writelane_b32 v43, s16, 55 +; SI-NEXT: s_lshr_b32 s16, s11, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 56 +; SI-NEXT: s_lshr_b32 s16, s9, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 57 +; SI-NEXT: s_lshr_b32 s16, s9, 16 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: v_writelane_b32 v43, s16, 58 +; SI-NEXT: s_lshr_b32 s16, s9, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 59 +; SI-NEXT: s_lshr_b32 s16, s7, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 60 +; SI-NEXT: s_lshr_b32 s16, s7, 16 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: v_writelane_b32 v43, s16, 61 +; SI-NEXT: s_lshr_b32 s16, s7, 8 +; SI-NEXT: v_writelane_b32 v43, s16, 62 +; SI-NEXT: s_lshr_b32 s16, s5, 24 +; SI-NEXT: v_writelane_b32 v43, s16, 63 +; SI-NEXT: s_lshr_b32 s16, s5, 16 +; SI-NEXT: v_writelane_b32 v42, s16, 0 +; SI-NEXT: s_lshr_b32 s16, s5, 8 +; SI-NEXT: v_writelane_b32 v42, s16, 1 +; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 28 +; SI-NEXT: v_writelane_b32 v41, s17, 29 +; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 26 +; SI-NEXT: v_writelane_b32 v41, s17, 27 +; SI-NEXT: s_lshr_b64 s[16:17], s[56:57], 8 +; SI-NEXT: v_writelane_b32 v41, s16, 24 +; SI-NEXT: v_writelane_b32 v41, s17, 25 +; SI-NEXT: s_lshr_b64 s[16:17], s[46:47], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 32 +; SI-NEXT: v_writelane_b32 v41, s17, 33 +; SI-NEXT: s_lshr_b64 s[16:17], s[46:47], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 30 +; SI-NEXT: v_writelane_b32 v41, s17, 31 +; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 38 +; SI-NEXT: v_writelane_b32 v41, s17, 39 +; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 36 +; SI-NEXT: v_writelane_b32 v41, s17, 37 +; SI-NEXT: s_lshr_b64 s[16:17], s[42:43], 8 +; SI-NEXT: v_writelane_b32 v41, s16, 34 +; SI-NEXT: v_writelane_b32 v41, s17, 35 +; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 44 +; SI-NEXT: v_writelane_b32 v41, s17, 45 +; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 42 +; SI-NEXT: v_writelane_b32 v41, s17, 43 +; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 8 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s16, 40 +; SI-NEXT: v_writelane_b32 v41, s17, 41 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 50 +; SI-NEXT: v_writelane_b32 v41, s17, 51 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 48 +; SI-NEXT: v_writelane_b32 v41, s17, 49 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s16, 46 +; SI-NEXT: v_writelane_b32 v41, s17, 47 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 56 +; SI-NEXT: v_writelane_b32 v41, s17, 57 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 54 +; SI-NEXT: v_writelane_b32 v41, s17, 55 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 8 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s16, 52 +; SI-NEXT: v_writelane_b32 v41, s17, 53 +; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v41, s16, 62 +; SI-NEXT: v_writelane_b32 
v41, s17, 63 +; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v41, s16, 60 +; SI-NEXT: v_writelane_b32 v41, s17, 61 +; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 8 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: v_writelane_b32 v41, s16, 58 +; SI-NEXT: v_writelane_b32 v41, s17, 59 +; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 24 +; SI-NEXT: v_writelane_b32 v43, s16, 4 +; SI-NEXT: v_writelane_b32 v43, s17, 5 +; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 16 +; SI-NEXT: v_writelane_b32 v43, s16, 2 +; SI-NEXT: v_writelane_b32 v43, s17, 3 +; SI-NEXT: s_lshr_b64 s[16:17], s[8:9], 8 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: v_writelane_b32 v43, s16, 0 +; SI-NEXT: v_writelane_b32 v43, s17, 1 +; SI-NEXT: s_lshr_b64 s[16:17], s[6:7], 24 +; SI-NEXT: v_writelane_b32 v43, s16, 10 +; SI-NEXT: v_writelane_b32 v43, s17, 11 +; SI-NEXT: s_lshr_b64 s[16:17], s[6:7], 16 +; SI-NEXT: v_writelane_b32 v43, s16, 8 +; SI-NEXT: v_writelane_b32 v43, s17, 9 +; SI-NEXT: s_lshr_b64 s[16:17], s[6:7], 8 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: v_writelane_b32 v43, s16, 6 +; SI-NEXT: v_writelane_b32 v43, s17, 7 +; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 24 +; SI-NEXT: v_writelane_b32 v43, s16, 16 +; SI-NEXT: v_writelane_b32 v43, s17, 17 +; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 16 +; SI-NEXT: v_writelane_b32 v43, s16, 14 +; SI-NEXT: v_writelane_b32 v43, s17, 15 +; SI-NEXT: s_lshr_b64 s[16:17], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[60:61], 8 +; SI-NEXT: s_lshr_b64 s[70:71], s[44:45], 24 +; SI-NEXT: s_lshr_b64 s[80:81], s[44:45], 16 +; SI-NEXT: s_lshr_b64 s[82:83], s[44:45], 8 +; SI-NEXT: s_lshr_b64 s[64:65], s[74:75], 24 +; SI-NEXT: s_lshr_b64 s[66:67], s[74:75], 16 +; SI-NEXT: s_lshr_b64 s[68:69], s[74:75], 8 +; SI-NEXT: s_lshr_b64 s[50:51], s[72:73], 24 +; SI-NEXT: s_lshr_b64 s[52:53], s[72:73], 16 +; SI-NEXT: s_lshr_b64 s[54:55], s[72:73], 8 +; SI-NEXT: s_lshr_b64 s[36:37], s[62:63], 24 +; SI-NEXT: s_lshr_b64 s[38:39], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[48:49], s[62:63], 8 +; SI-NEXT: s_lshr_b64 s[94:95], s[58:59], 24 +; SI-NEXT: s_lshr_b64 s[30:31], s[58:59], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[58:59], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[46:47], 8 +; SI-NEXT: v_writelane_b32 v43, s16, 12 +; SI-NEXT: v_writelane_b32 v43, s17, 13 +; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: s_lshl_b32 s17, s92, 8 +; SI-NEXT: s_and_b32 s18, s60, 0xff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_readlane_b32 s18, v41, 20 +; SI-NEXT: v_readlane_b32 s19, v41, 21 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: v_readlane_b32 s20, v41, 22 +; SI-NEXT: s_lshl_b32 s19, s20, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_readlane_b32 s16, v43, 20 +; SI-NEXT: v_mov_b32_e32 v1, s17 +; SI-NEXT: s_and_b32 s17, s61, 0xff +; SI-NEXT: s_lshl_b32 s18, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 19 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 18 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s16, 24 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_readlane_b32 s16, v41, 24 +; SI-NEXT: v_readlane_b32 s17, v41, 25 +; SI-NEXT: s_lshl_b32 s17, s16, 8 +; SI-NEXT: s_and_b32 s18, s56, 0xff +; SI-NEXT: v_readlane_b32 s21, v41, 23 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_readlane_b32 s18, v41, 26 +; SI-NEXT: 
v_readlane_b32 s19, v41, 27 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: v_readlane_b32 s20, v41, 28 +; SI-NEXT: s_lshl_b32 s19, s20, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_readlane_b32 s16, v43, 23 +; SI-NEXT: v_mov_b32_e32 v3, s17 +; SI-NEXT: s_and_b32 s17, s57, 0xff +; SI-NEXT: s_lshl_b32 s18, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 22 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 21 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s16, 24 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: s_lshl_b32 s17, s82, 8 +; SI-NEXT: s_and_b32 s18, s44, 0xff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s80, 0xff +; SI-NEXT: s_lshl_b32 s19, s70, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_readlane_b32 s16, v43, 26 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: s_and_b32 s17, s45, 0xff +; SI-NEXT: s_lshl_b32 s18, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 25 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s16, 24 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_mov_b32_e32 v6, s17 +; SI-NEXT: s_lshl_b32 s17, s68, 8 +; SI-NEXT: s_and_b32 s18, s74, 0xff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s66, 0xff +; SI-NEXT: s_lshl_b32 s19, s64, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_readlane_b32 s16, v43, 29 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: s_and_b32 s17, s75, 0xff +; SI-NEXT: s_lshl_b32 s18, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 28 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 27 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s16, 24 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_mov_b32_e32 v8, s17 +; SI-NEXT: s_lshl_b32 s17, s54, 8 +; SI-NEXT: s_and_b32 s18, s72, 0xff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s52, 0xff +; SI-NEXT: s_lshl_b32 s19, s50, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_readlane_b32 s16, v43, 32 +; SI-NEXT: v_mov_b32_e32 v9, s17 +; SI-NEXT: s_and_b32 s17, s73, 0xff +; SI-NEXT: s_lshl_b32 s18, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 31 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 30 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s16, 24 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_mov_b32_e32 v10, s17 +; SI-NEXT: s_lshl_b32 s17, s48, 8 +; SI-NEXT: s_and_b32 s18, s62, 0xff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s38, 0xff +; SI-NEXT: s_lshl_b32 s19, s36, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 
s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_readlane_b32 s16, v43, 35 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: s_and_b32 s17, s63, 0xff +; SI-NEXT: s_lshl_b32 s18, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 34 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 33 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s16, 24 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_mov_b32_e32 v12, s17 +; SI-NEXT: s_lshl_b32 s17, s34, 8 +; SI-NEXT: s_and_b32 s18, s58, 0xff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s18, s30, 0xff +; SI-NEXT: s_lshl_b32 s19, s94, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_readlane_b32 s16, v43, 38 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: s_and_b32 s17, s59, 0xff +; SI-NEXT: s_lshl_b32 s18, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 37 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 36 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s16, 24 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_mov_b32_e32 v14, s17 +; SI-NEXT: s_lshl_b32 s17, s90, 8 +; SI-NEXT: s_and_b32 s18, s46, 0xff +; SI-NEXT: v_readlane_b32 s21, v41, 29 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_readlane_b32 s18, v41, 30 +; SI-NEXT: v_readlane_b32 s19, v41, 31 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: v_readlane_b32 s20, v41, 32 +; SI-NEXT: s_lshl_b32 s19, s20, 24 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: v_readlane_b32 s16, v43, 41 +; SI-NEXT: v_mov_b32_e32 v15, s17 +; SI-NEXT: s_and_b32 s17, s47, 0xff +; SI-NEXT: s_lshl_b32 s18, s16, 8 +; SI-NEXT: v_readlane_b32 s16, v43, 40 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: s_and_b32 s18, s16, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 39 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s16, 24 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: s_and_b32 s17, s17, 0xffff +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s35, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s14, s39, 24 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s14, s5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_lshl_b32 s5, s92, 8 -; SI-NEXT: v_readlane_b32 s39, v63, 7 -; SI-NEXT: v_readlane_b32 s35, v63, 3 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; 
SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s17, s17, s18 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: v_mov_b32_e32 v16, s17 +; SI-NEXT: v_readlane_b32 s16, v41, 34 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: v_readlane_b32 s17, v41, 35 +; SI-NEXT: v_readlane_b32 s18, v41, 36 +; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: s_and_b32 s17, s42, 0xff +; SI-NEXT: v_readlane_b32 s19, v41, 37 +; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v41, 38 +; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: v_readlane_b32 s17, v43, 44 +; SI-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s43, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v43, 43 +; SI-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v43, 42 +; SI-NEXT: v_readlane_b32 s19, v41, 39 +; SI-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_readlane_b32 s18, v41, 40 +; SI-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: v_readlane_b32 s19, v41, 41 +; SI-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_lshl_b32 s17, s18, 8 +; SI-NEXT: v_readlane_b32 s18, v41, 42 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xff +; SI-NEXT: v_readlane_b32 s19, v41, 43 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v41, 44 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 
s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v43, 47 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v57 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v47 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s30, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v45 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s13, s36, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s13, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_readlane_b32 s17, v43, 46 +; SI-NEXT: s_and_b32 s17, s17, 0xff +; SI-NEXT: v_readlane_b32 s18, v43, 45 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v44 -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: s_lshl_b32 s5, s89, 8 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v43 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s93, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_readlane_b32 s16, v41, 46 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: v_readlane_b32 s17, v41, 47 +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: v_readlane_b32 s19, v41, 45 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: v_readlane_b32 s16, v41, 48 +; SI-NEXT: v_readlane_b32 s17, v41, 49 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: v_readlane_b32 s18, v41, 50 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s18, 24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v41 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s12, s31, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s12, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xff +; SI-NEXT: v_readlane_b32 s15, v43, 50 +; SI-NEXT: s_lshl_b32 s15, s15, 8 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_readlane_b32 s15, v43, 49 +; SI-NEXT: s_and_b32 s15, s15, 0xff +; SI-NEXT: v_readlane_b32 s16, v43, 48 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_lshl_b32 s16, s16, 24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: 
buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: v_readlane_b32 s5, v62, 1 -; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xff, v13 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_readlane_b32 s14, v41, 52 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: v_readlane_b32 s15, v41, 53 +; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: v_readlane_b32 s14, v41, 54 +; SI-NEXT: v_readlane_b32 s15, v41, 55 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: v_readlane_b32 s16, v41, 56 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s16, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v24 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v22 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s90, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v21 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s11, s94, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s11, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 3 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xff +; SI-NEXT: v_readlane_b32 s13, v43, 53 +; SI-NEXT: s_lshl_b32 s13, s13, 8 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: v_readlane_b32 s13, v43, 52 +; SI-NEXT: s_and_b32 s13, s13, 0xff +; SI-NEXT: v_readlane_b32 s14, v43, 51 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s14, s14, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xff, v9 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: v_readlane_b32 s12, v41, 58 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: v_readlane_b32 s13, v41, 59 +; SI-NEXT: s_lshl_b32 s12, s12, 8 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: v_readlane_b32 s12, v41, 60 +; SI-NEXT: v_readlane_b32 s13, v41, 61 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: v_readlane_b32 s14, v41, 62 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s13, s14, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v20 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 2 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v18 -; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: 
v_lshlrev_b32_e32 v7, 24, v17 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s10, s91, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s10, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 6 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xff +; SI-NEXT: v_readlane_b32 s11, v43, 56 +; SI-NEXT: s_lshl_b32 s11, s11, 8 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_readlane_b32 s11, v43, 55 +; SI-NEXT: s_and_b32 s11, s11, 0xff +; SI-NEXT: v_readlane_b32 s12, v43, 54 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s12, s12, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xff, v6 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_readlane_b32 s10, v43, 0 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s11, v43, 1 +; SI-NEXT: s_lshl_b32 s10, s10, 8 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: v_readlane_b32 s10, v43, 2 +; SI-NEXT: v_readlane_b32 s11, v43, 3 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: v_readlane_b32 s12, v43, 4 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s11, s12, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v61 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v60 -; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_readlane_b32 s9, v62, 4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v59 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s9, s9, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s9, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 9 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 59 +; SI-NEXT: s_lshl_b32 s9, s9, 8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: v_readlane_b32 s9, v43, 58 +; SI-NEXT: s_and_b32 s9, s9, 0xff +; SI-NEXT: v_readlane_b32 s10, v43, 57 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s10, s10, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 
v3, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v58 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 8 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v56 -; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_readlane_b32 s8, v62, 7 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_readlane_b32 s8, v43, 6 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: v_readlane_b32 s9, v43, 7 +; SI-NEXT: s_lshl_b32 s8, s8, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: v_readlane_b32 s8, v43, 8 +; SI-NEXT: v_readlane_b32 s9, v43, 9 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: v_readlane_b32 s10, v43, 10 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s10, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v46 -; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 62 +; SI-NEXT: s_lshl_b32 s7, s7, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_readlane_b32 s7, v43, 61 +; SI-NEXT: s_and_b32 s7, s7, 0xff +; SI-NEXT: v_readlane_b32 s8, v43, 60 +; SI-NEXT: s_lshl_b32 s7, s7, 16 ; SI-NEXT: s_lshl_b32 s8, s8, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 12 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v42 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 11 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v40 -; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_readlane_b32 s7, v62, 10 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v55 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s7, 24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_readlane_b32 s6, v43, 12 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: v_readlane_b32 s7, v43, 13 +; SI-NEXT: s_lshl_b32 s6, s6, 8 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: v_readlane_b32 s6, v43, 14 +; SI-NEXT: v_readlane_b32 s7, v43, 15 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: v_readlane_b32 s8, v43, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s8, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s7, s5 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 15 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, 
v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xff +; SI-NEXT: v_readlane_b32 s5, v42, 1 ; SI-NEXT: s_lshl_b32 s5, s5, 8 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v54 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_readlane_b32 s5, v62, 14 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v53 +; SI-NEXT: v_readlane_b32 s5, v42, 0 ; SI-NEXT: s_and_b32 s5, s5, 0xff -; SI-NEXT: v_readlane_b32 s6, v62, 13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v52 +; SI-NEXT: v_readlane_b32 s6, v43, 63 ; SI-NEXT: s_lshl_b32 s5, s5, 16 ; SI-NEXT: s_lshl_b32 s6, s6, 24 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_readlane_b32 s21, v41, 33 +; SI-NEXT: v_readlane_b32 s19, v41, 51 +; SI-NEXT: v_readlane_b32 s17, v41, 57 +; SI-NEXT: v_readlane_b32 s15, v41, 63 +; SI-NEXT: v_readlane_b32 s13, v43, 5 +; SI-NEXT: v_readlane_b32 s11, v43, 11 +; SI-NEXT: v_readlane_b32 s9, v43, 17 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_readlane_b32 s36, v63, 4 -; SI-NEXT: v_readlane_b32 s31, v63, 1 -; SI-NEXT: v_readlane_b32 s30, v63, 0 +; SI-NEXT: v_readlane_b32 s99, v40, 35 +; SI-NEXT: v_readlane_b32 s98, v40, 34 +; SI-NEXT: v_readlane_b32 s97, v40, 33 +; SI-NEXT: v_readlane_b32 s96, v40, 32 +; SI-NEXT: v_readlane_b32 s87, v40, 31 +; SI-NEXT: v_readlane_b32 s86, v40, 30 +; SI-NEXT: v_readlane_b32 s85, v40, 29 +; SI-NEXT: v_readlane_b32 s84, v40, 28 +; SI-NEXT: v_readlane_b32 s83, v40, 27 +; SI-NEXT: v_readlane_b32 s82, v40, 26 +; SI-NEXT: v_readlane_b32 s81, v40, 25 +; SI-NEXT: v_readlane_b32 s80, v40, 24 +; SI-NEXT: v_readlane_b32 s71, 
v40, 23 +; SI-NEXT: v_readlane_b32 s70, v40, 22 +; SI-NEXT: v_readlane_b32 s69, v40, 21 +; SI-NEXT: v_readlane_b32 s68, v40, 20 +; SI-NEXT: v_readlane_b32 s67, v40, 19 +; SI-NEXT: v_readlane_b32 s66, v40, 18 +; SI-NEXT: v_readlane_b32 s65, v40, 17 +; SI-NEXT: v_readlane_b32 s64, v40, 16 +; SI-NEXT: v_readlane_b32 s55, v40, 15 +; SI-NEXT: v_readlane_b32 s54, v40, 14 +; SI-NEXT: v_readlane_b32 s53, v40, 13 +; SI-NEXT: v_readlane_b32 s52, v40, 12 +; SI-NEXT: v_readlane_b32 s51, v40, 11 +; SI-NEXT: v_readlane_b32 s50, v40, 10 +; SI-NEXT: v_readlane_b32 s49, v40, 9 +; SI-NEXT: v_readlane_b32 s48, v40, 8 +; SI-NEXT: v_readlane_b32 s39, v40, 7 +; SI-NEXT: v_readlane_b32 s38, v40, 6 +; SI-NEXT: v_readlane_b32 s37, v40, 5 +; SI-NEXT: v_readlane_b32 s36, v40, 4 +; SI-NEXT: v_readlane_b32 s35, v40, 3 +; SI-NEXT: v_readlane_b32 s34, v40, 2 +; SI-NEXT: v_readlane_b32 s31, v40, 1 +; SI-NEXT: v_readlane_b32 s30, v40, 0 ; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $vcc_lo -; SI-NEXT: v_mov_b32_e32 v39, v24 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v38, v22 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v37, v21 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v30, v18 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v29, v17 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v28, v13 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v26, v9 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: v_mov_b32_e32 v25, v6 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; kill: killed $vcc_lo -; SI-NEXT: ; implicit-def: $vcc_lo +; SI-NEXT: ; 
implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: v_writelane_b32 v41, s4, 20 +; SI-NEXT: v_writelane_b32 v41, s5, 21 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr68 -; SI-NEXT: ; implicit-def: $sgpr71 -; SI-NEXT: ; implicit-def: $sgpr83 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $sgpr65 -; SI-NEXT: ; implicit-def: $sgpr69 -; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $sgpr54 -; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr80 ; SI-NEXT: ; implicit-def: $sgpr70 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $sgpr51 -; SI-NEXT: ; implicit-def: $sgpr55 -; SI-NEXT: ; implicit-def: $sgpr67 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $sgpr48 -; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr66 ; SI-NEXT: ; implicit-def: $sgpr64 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr37 -; SI-NEXT: ; implicit-def: $sgpr49 -; SI-NEXT: ; implicit-def: $sgpr53 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr34 -; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr52 ; SI-NEXT: ; implicit-def: $sgpr50 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr95 -; SI-NEXT: ; implicit-def: $sgpr35 -; SI-NEXT: ; implicit-def: $sgpr39 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr92 -; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr36 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr89 -; SI-NEXT: ; implicit-def: $sgpr93 -; SI-NEXT: ; implicit-def: $sgpr31 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr91 -; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; kill: killed $vcc_lo -; SI-NEXT: ; implicit-def: $vcc_lo -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; 
implicit-def: $sgpr6 -; SI-NEXT: ; kill: killed $sgpr6 -; SI-NEXT: ; kill: killed $vcc_lo -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: v_writelane_b32 v41, s4, 22 +; SI-NEXT: v_writelane_b32 v41, s5, 23 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 24 +; SI-NEXT: v_writelane_b32 v41, s5, 25 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 26 +; SI-NEXT: v_writelane_b32 v41, s5, 27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 28 +; SI-NEXT: v_writelane_b32 v41, s5, 29 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; 
SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; kill: killed $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 30 +; SI-NEXT: v_writelane_b32 v41, s5, 31 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 32 +; SI-NEXT: v_writelane_b32 v41, s5, 33 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 34 +; SI-NEXT: v_writelane_b32 v41, s5, 35 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 36 +; SI-NEXT: v_writelane_b32 v41, s5, 37 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 38 +; SI-NEXT: v_writelane_b32 v41, s5, 39 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 40 +; SI-NEXT: v_writelane_b32 v41, s5, 41 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 42 +; SI-NEXT: v_writelane_b32 v41, s5, 43 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 44 +; SI-NEXT: v_writelane_b32 v41, s5, 45 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 46 +; SI-NEXT: v_writelane_b32 v41, s5, 47 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 48 +; SI-NEXT: v_writelane_b32 v41, s5, 49 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 50 +; SI-NEXT: v_writelane_b32 v41, s5, 51 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 52 +; SI-NEXT: v_writelane_b32 v41, s5, 53 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 54 +; SI-NEXT: v_writelane_b32 v41, s5, 55 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 56 +; SI-NEXT: v_writelane_b32 v41, s5, 57 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 58 +; SI-NEXT: v_writelane_b32 v41, s5, 59 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 60 +; SI-NEXT: v_writelane_b32 v41, 
s5, 61 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v41, s4, 62 +; SI-NEXT: v_writelane_b32 v41, s5, 63 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v43, s4, 0 +; SI-NEXT: v_writelane_b32 v43, s5, 1 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v43, s4, 2 +; SI-NEXT: v_writelane_b32 v43, s5, 3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v43, s4, 4 +; SI-NEXT: v_writelane_b32 v43, s5, 5 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v43, s4, 6 +; SI-NEXT: v_writelane_b32 v43, s5, 7 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v43, s4, 8 +; SI-NEXT: v_writelane_b32 v43, s5, 9 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v43, s4, 10 +; SI-NEXT: v_writelane_b32 v43, s5, 11 +; SI-NEXT: v_writelane_b32 v43, s16, 12 +; SI-NEXT: v_writelane_b32 v43, s17, 13 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v43, s16, 14 +; SI-NEXT: v_writelane_b32 v43, s17, 15 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: v_writelane_b32 v43, s16, 16 +; SI-NEXT: v_writelane_b32 v43, s17, 17 ; SI-NEXT: s_branch .LBB99_2 ; ; VI-LABEL: bitcast_v64i16_to_v128i8_scalar: @@ -217279,1071 +219505,408 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 
offset:12 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:40 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v14 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:48 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v15 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v16 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:56 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v17 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:60 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v18 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:64 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:76 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v30 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v54, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; SI-NEXT: v_mul_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v14, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v6, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v8, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v9, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v28 ; SI-NEXT: v_mul_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v27, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v30, 1.0, s29 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 
v23, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v27, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s28 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v36 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v38 -; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v48 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v55 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v50 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v53 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v54 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v42 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v28, 1.0, s28 -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:328 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB101_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v42, v51 -; SI-NEXT: v_mov_b32_e32 v55, v50 -; SI-NEXT: v_mov_b32_e32 v40, v52 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v24, 
v47 -; SI-NEXT: v_mov_b32_e32 v23, v46 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v25, v56 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v26, v57 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v36, v8 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v38 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v35 -; SI-NEXT: v_mov_b32_e32 v35, v43 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v38, v10 -; SI-NEXT: 
buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v39 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_mov_b32_e32 v19, v28 -; SI-NEXT: v_mov_b32_e32 v28, v14 -; SI-NEXT: v_mov_b32_e32 v39, v22 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 -; SI-NEXT: v_mov_b32_e32 v47, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v49 -; SI-NEXT: v_mov_b32_e32 v49, v15 -; SI-NEXT: v_mov_b32_e32 v15, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v50 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_mov_b32_e32 v51, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v54 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_mov_b32_e32 v20, v2 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v57 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; SI-NEXT: v_mov_b32_e32 v37, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v2 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v9 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v39 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v49, 
1.0, v49 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v42 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v43 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v44 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v45 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v46 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v47 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v56 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14 +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v57 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v54 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: s_cbranch_execnz .LBB101_3 -; SI-NEXT: .LBB101_2: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v24 -; 
SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v38, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v43 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt 
vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v37, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v6, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v50 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v36, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v25 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v26 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v59 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v60 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_cvt_f32_f16_e32 v21, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v24 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v30, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v61 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v62 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v39, v54 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 
-; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v42 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v53 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v48 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v31 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v7 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v33 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v40 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v13 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v52 -; SI-NEXT: 
buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v33, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v51 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v63 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v43, v14 -; SI-NEXT: .LBB101_3: ; %end -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_or_b32_e32 v14, v14, v53 -; SI-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v34 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s29 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v23, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: s_cbranch_scc0 .LBB101_2 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v49 -; SI-NEXT: v_add_i32_e32 v16, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_mov_b32_e32 v32, v53 +; SI-NEXT: v_mov_b32_e32 v53, v54 +; 
SI-NEXT: v_mov_b32_e32 v54, v40 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v18 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_mov_b32_e32 v40, v35 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v16, vcc, 16, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v39 -; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v35 -; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 -; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v20 -; SI-NEXT: v_add_i32_e32 v16, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v21 -; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) 
-; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v36 -; SI-NEXT: v_add_i32_e32 v16, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v37 -; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v47 -; SI-NEXT: v_add_i32_e32 v16, vcc, 48, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v59 -; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v60 -; SI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v61 -; SI-NEXT: v_add_i32_e32 v16, vcc, 60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded 
Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v62 -; SI-NEXT: v_add_i32_e32 v16, vcc, 64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v44 -; SI-NEXT: v_add_i32_e32 v16, vcc, 0x44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v46 -; SI-NEXT: v_add_i32_e32 v15, vcc, 0x48, v0 -; SI-NEXT: v_or_b32_e32 v1, v14, v1 -; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v56 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x4c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v14, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v31 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v10 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v18 +; SI-NEXT: v_mov_b32_e32 v31, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: 
v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB101_4: -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v40, v52 -; SI-NEXT: v_mov_b32_e32 v55, v50 -; SI-NEXT: v_mov_b32_e32 v42, v51 -; SI-NEXT: v_mov_b32_e32 v28, v14 -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:432 ; 4-byte Folded 
Reload -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v16 +; SI-NEXT: v_mov_b32_e32 v46, v54 +; SI-NEXT: v_mov_b32_e32 v16, v20 +; SI-NEXT: v_mov_b32_e32 v20, v32 +; SI-NEXT: v_mov_b32_e32 v32, v53 +; SI-NEXT: v_mov_b32_e32 v45, v40 +; SI-NEXT: v_mov_b32_e32 v56, v37 +; SI-NEXT: v_mov_b32_e32 v47, v11 +; SI-NEXT: s_branch .LBB101_3 +; SI-NEXT: .LBB101_2: +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed 
$vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 @@ -218369,72 +219932,775 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg % ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v26, v57 -; SI-NEXT: v_mov_b32_e32 v25, v56 -; SI-NEXT: v_mov_b32_e32 v24, v47 -; SI-NEXT: v_mov_b32_e32 v23, v46 +; SI-NEXT: s_mov_b64 s[4:5], -1 ; SI-NEXT: ; kill: killed $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: v_mov_b32_e32 v20, v53 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v46, v40 +; SI-NEXT: v_mov_b32_e32 v45, v35 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v56, v37 +; SI-NEXT: v_mov_b32_e32 v47, v11 +; SI-NEXT: v_mov_b32_e32 v31, v15 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; kill: killed $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; kill: killed $vgpr8 +; 
SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; kill: killed $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; kill: killed $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; kill: killed $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: s_branch .LBB101_2 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: .LBB101_3: ; %Flow +; SI-NEXT: v_mov_b32_e32 v5, v59 +; SI-NEXT: v_mov_b32_e32 v10, v63 +; SI-NEXT: v_mov_b32_e32 v59, v28 +; SI-NEXT: v_mov_b32_e32 v63, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v7, v60 +; SI-NEXT: v_mov_b32_e32 v11, v57 +; SI-NEXT: v_mov_b32_e32 v12, v38 +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v35, v1 +; SI-NEXT: v_mov_b32_e32 v37, v55 +; SI-NEXT: v_mov_b32_e32 v15, v36 +; SI-NEXT: v_mov_b32_e32 v38, v4 +; SI-NEXT: v_mov_b32_e32 v53, v6 +; SI-NEXT: v_mov_b32_e32 v54, v8 +; SI-NEXT: v_mov_b32_e32 v40, v9 +; SI-NEXT: v_mov_b32_e32 v57, v13 +; SI-NEXT: v_mov_b32_e32 v60, v41 +; SI-NEXT: v_mov_b32_e32 v36, v2 +; SI-NEXT: s_cbranch_vccnz .LBB101_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v51 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v62 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v61 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: 
v_lshrrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: 
v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v51 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v12 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v31 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v2 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v21, v43 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v27 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v19 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v19 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; 
SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v23 +; SI-NEXT: v_and_b32_e32 v44, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v44 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v44 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v8 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v24, v10 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v15 +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v32 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v31 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: v_add_f32_e32 v15, 
0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v15, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v14 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v22, v4 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v62 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v45 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v17, v45 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v38 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v56 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v50 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v50, v6 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v11 +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v61 +; 
SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v17, v7 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v16, v13 +; SI-NEXT: .LBB101_5: ; %end +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; 
SI-NEXT: v_cvt_f16_f32_e32 v1, v26 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; 
SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v35 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v38 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; 
SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword 
v41, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v64bf16_to_v64f16_scalar: ; VI: ; %bb.0: @@ -220488,6 +222754,7 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v0 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 @@ -220511,26 +222778,149 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:84 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:92 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed 
$vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 @@ -220546,23 +222936,8 
@@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:104 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:108 -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:112 -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:116 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:120 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 @@ -220576,21 +222951,34 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 +; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v45, v45 -; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; kill: killed $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v56, v1 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -220615,333 +223003,222 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; 
SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], 
exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v31 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v56 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v9 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v60 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 
offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v13 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v15 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v16 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v8 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v19 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v10 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 -; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v11 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v14 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v16 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v18 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v21 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v22 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v27 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: 
v_lshlrev_b32_e32 v5, 16, v12 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v29 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v15 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v30 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v33 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v35 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v25 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v38 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v28 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v39 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v49 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v34 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v52 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v54 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v55 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v57 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:424 ; 4-byte 
Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v41 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v40 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v60 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v43 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v61 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v42 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v44 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v45 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v62 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v46 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 @@ -220991,110 +223268,99 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: .LBB102_2: ; %Flow ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB102_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v2, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_add_f32_e32 v62, 0x38000000, v63 ; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v46 ; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 ; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v61 ; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v47 ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v57 -; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v45 -; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v56 +; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v31 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v56 ; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v45, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v42 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 +; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 ; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v44 ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v45 ; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v54 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 ; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v40 ; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v41 ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v50 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v49 ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v52 ; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v53 ; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v38 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 ; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v49 ; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v34 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v36 ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v37 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; 
SI-NEXT: v_add_f32_e32 v29, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v32 ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v33 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v27 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v58 -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 @@ -221104,8 +223370,13 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -221127,624 +223398,626 @@ define <64 x bfloat> @bitcast_v64f16_to_v64bf16(<64 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_cvt_f16_f32_e32 v3, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v58 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v23 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: 
buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v38 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v53 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v50 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v41 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 
s32 offset:376 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v46 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v59 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v59 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v62 -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v2, v31 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v58 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; SI-NEXT: v_mov_b32_e32 v3, v24 -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; SI-NEXT: .LBB102_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword 
v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: buffer_store_dword v0, v47, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v3 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 
1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; 
SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: 
v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 
v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v0, 
v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v0, 
v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v0, 
v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v47 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword 
v61, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload @@ -222018,13 +224291,14 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v37, v0 ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 ; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 ; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 @@ -222042,66 +224316,64 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:72 ; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:76 ; SI-NEXT: v_cvt_f16_f32_e32 v40, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_mov_b32_e32 v46, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v46, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v43, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v25, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v29, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s24 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, 
v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v60, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v39 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v61, v48 -; SI-NEXT: s_waitcnt vmcnt(11) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v48 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 +; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v54 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v55 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 ; SI-NEXT: s_waitcnt vmcnt(5) @@ -222111,17 +224383,19 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v50 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v44, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v51, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill @@ -222129,737 +224403,717 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; 
SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword 
v36, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB103_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_mov_b32_e32 v23, v6 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v24 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v25 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v29 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v29 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v30 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v37 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:176 ; 4-byte 
Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v39 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v48 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v48 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v49 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v33 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v50 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v50 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v51 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v51 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v52 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v40 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v43 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v43 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v43 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 ; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mov_b32_e32 v50, v19 -; SI-NEXT: v_mov_b32_e32 v51, v22 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_mov_b32_e32 v37, v45 -; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 -; SI-NEXT: v_mov_b32_e32 v49, v47 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v59 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v60 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_mov_b32_e32 v51, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_mov_b32_e32 v36, v22 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v44 +; SI-NEXT: v_mov_b32_e32 v50, v26 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v38 +; SI-NEXT: v_mov_b32_e32 v38, v7 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v54 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v62 +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v11 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v11 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v12 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v13 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_lshlrev_b32_e32 v5, 16, v13 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v14 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v31 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v31 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v17 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v18 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v19 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v20 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v46 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v45 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v35 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v57 -; SI-NEXT: v_mov_b32_e32 v57, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v32 -; SI-NEXT: v_mov_b32_e32 v32, v7 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v22 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v56 -; SI-NEXT: v_mov_b32_e32 v33, v12 -; SI-NEXT: v_mov_b32_e32 v34, v5 -; SI-NEXT: v_mov_b32_e32 v58, v7 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v61 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v63 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v44 -; SI-NEXT: v_mov_b32_e32 v44, v18 -; SI-NEXT: v_mov_b32_e32 v5, v43 -; SI-NEXT: v_mov_b32_e32 v18, v6 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v58 +; SI-NEXT: v_mov_b32_e32 v58, v5 +; SI-NEXT: v_mov_b32_e32 v59, v11 +; SI-NEXT: v_mov_b32_e32 v60, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, 
v55 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v9 +; SI-NEXT: v_mov_b32_e32 v5, v23 +; SI-NEXT: v_mov_b32_e32 v7, v6 ; SI-NEXT: s_branch .LBB103_3 ; SI-NEXT: .LBB103_2: -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: v_mov_b32_e32 v35, v28 -; SI-NEXT: v_mov_b32_e32 v49, v47 -; SI-NEXT: v_mov_b32_e32 v27, v26 -; SI-NEXT: v_mov_b32_e32 v37, v45 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v51, v22 -; SI-NEXT: v_mov_b32_e32 v50, v19 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; 
SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: v_mov_b32_e32 v33, v28 +; SI-NEXT: v_mov_b32_e32 v50, v26 +; SI-NEXT: v_mov_b32_e32 v36, v22 +; SI-NEXT: v_mov_b32_e32 v51, v21 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; kill: killed $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr5 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: v_mov_b32_e32 v5, v6 -; SI-NEXT: ; kill: killed $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; kill: killed $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; kill: killed $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr60 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; 
implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: .LBB103_3: ; %Flow -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v36, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v43, v9 -; SI-NEXT: v_mov_b32_e32 v12, v31 +; SI-NEXT: v_mov_b32_e32 v19, v20 +; SI-NEXT: v_mov_b32_e32 v6, v27 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v61, v2 ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v31, v11 -; SI-NEXT: v_mov_b32_e32 v9, v17 ; SI-NEXT: s_cbranch_vccnz .LBB103_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v62 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v8 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v10 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v2 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v55 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v15, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v8 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v10 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v61 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v15 -; SI-NEXT: v_mov_b32_e32 v6, v37 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v39 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v53 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v51 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v10 -; SI-NEXT: 
v_cvt_f32_f16_e32 v10, v60 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v50 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v35 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v10 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v10, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v7 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v13 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v18 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v32 ; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v49 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v15, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v10 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v14, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v33 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v15, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v57 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v35 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v8 -; SI-NEXT: 
v_cvt_f32_f16_e32 v8, v38 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v56 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v44 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v36 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v21, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v28, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v32, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v32, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v34, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v33, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v34, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v36, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v51, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; SI-NEXT: 
v_cvt_f32_f16_e32 v50, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v51, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v40, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v40, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v43, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v44, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v46, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v47, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v47, 0x38000000, v47 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v56, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v57, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v58, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v26, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v26, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v1 -; SI-NEXT: buffer_load_dword 
v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v22, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v19, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v35, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v35, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v13, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v12, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v7, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v5, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v59, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v59, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v59, 0x38000000, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v59 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v60, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v60, 0x38000000, v60 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v61, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v61, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v61, 0x38000000, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v61 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v62, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v62, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload ; SI-NEXT: 
v_add_f32_e32 v62, 0x38000000, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v62, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v63, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v63, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v63, 0x38000000, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v63, v63 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v63 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v63 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v62 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v61 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v0, v60 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v59 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v19 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 
16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v57 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v46 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v43 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v42 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v41 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v36 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v31 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v7, v32 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v12 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v12 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v21 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v16 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v20 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v10 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v39 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v27 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v2 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v24 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v14 -; SI-NEXT: v_mov_b32_e32 v16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v37 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v48 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v39 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v55 -; SI-NEXT: v_lshlrev_b32_e32 v24, 
16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v49 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v5 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 +; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v54 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v5 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_mov_b32_e32 v4, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 +; SI-NEXT: v_mov_b32_e32 v17, v11 +; SI-NEXT: v_mov_b32_e32 v16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 -; SI-NEXT: v_mov_b32_e32 v3, v13 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; SI-NEXT: v_mov_b32_e32 v3, v19 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v4 +; SI-NEXT: v_mov_b32_e32 v4, v22 +; SI-NEXT: v_mov_b32_e32 v22, v6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v2 ; SI-NEXT: .LBB103_5: ; %end ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v37, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload @@ -222869,7 +225123,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload @@ -222880,7 +225134,7 @@ 
define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload @@ -222891,7 +225145,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload @@ -222902,7 +225156,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 16, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload @@ -222913,10 +225167,10 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -222924,10 +225178,10 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 24, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 @@ -222935,47 +225189,47 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: 
v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v4 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload @@ -222986,7 +225240,7 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload @@ -222997,148 +225251,145 @@ define inreg <64 x bfloat> @bitcast_v64f16_to_v64bf16_scalar(<64 x half> inreg % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v31 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v17 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v19 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v40 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v13 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v8 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v59 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v57 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v38 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v33 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v60 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v37 ; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v44 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v18 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v10 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v31 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v58 +; SI-NEXT: v_mul_f32_e32 v2, 1.0, v15 ; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v37 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v22 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v24 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v40 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v24 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v48 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v45 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v48 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v39 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v21 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v25 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v49 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v25 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v28 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v29 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v52 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v29 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v47 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v30 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v28 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v30 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x74, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v15 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v26 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v12 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v43 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x78, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v14 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, 16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 -; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v11 +; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v26 +; SI-NEXT: v_alignbit_b32 v0, v0, v1, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x7c, v37 +; SI-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload @@ -226977,999 +229228,1089 @@ define inreg <64 x i16> @bitcast_v64bf16_to_v64i16_scalar(<64 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:16 ; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:76 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v1, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v3 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], 
s32 offset:40 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:72 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:76 +; SI-NEXT: v_mul_f32_e32 v38, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v6 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v7 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v8 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v12 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v61, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v45, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v5, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v12 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v20 +; SI-NEXT: v_mul_f32_e32 v59, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v4, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v7 ; SI-NEXT: v_mul_f32_e32 v8, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v6, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v62, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v46, 1.0, v13 -; SI-NEXT: v_mul_f32_e32 v13, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v60, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v57, 1.0, v17 -; SI-NEXT: v_mul_f32_e32 v16, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v19 -; SI-NEXT: v_mul_f32_e32 v47, 1.0, v20 -; SI-NEXT: v_mul_f32_e32 v17, 1.0, v21 -; SI-NEXT: v_mul_f32_e32 v19, 1.0, v22 -; SI-NEXT: v_mul_f32_e32 v18, 1.0, v23 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v24 -; SI-NEXT: v_mul_f32_e32 v21, 1.0, v25 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v26 -; SI-NEXT: v_mul_f32_e32 v63, 1.0, v27 -; SI-NEXT: v_mul_f32_e32 v58, 1.0, v28 -; SI-NEXT: v_mul_f32_e32 v26, 1.0, v29 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v30 -; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v29, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 -; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v10, 1.0, v32 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v33 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v36 -; 
SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v34 -; SI-NEXT: v_mul_f32_e32 v27, 1.0, v35 -; SI-NEXT: v_mul_f32_e32 v59, 1.0, v37 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v38 -; SI-NEXT: v_mul_f32_e32 v30, 1.0, v39 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v48 -; SI-NEXT: v_mul_f32_e32 v4, 1.0, v49 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v50 -; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v54 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e32 v36, 1.0, v51 -; SI-NEXT: v_mul_f32_e32 v37, 1.0, v52 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v53 -; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) -; SI-NEXT: v_mul_f32_e32 v2, 1.0, v43 -; SI-NEXT: v_mul_f32_e32 v49, 1.0, v55 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v40 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v41 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v42 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; SI-NEXT: v_mul_f32_e64 v53, 1.0, s17 +; SI-NEXT: v_mul_f32_e32 v45, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v12, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v10, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v14, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v18, 1.0, v21 +; SI-NEXT: v_mul_f32_e32 v16, 1.0, v22 +; SI-NEXT: v_mul_f32_e32 v19, 1.0, v23 +; SI-NEXT: v_mul_f32_e32 v23, 1.0, v25 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_mul_f32_e32 v63, 1.0, v28 +; SI-NEXT: v_mul_f32_e32 v7, 1.0, v29 +; SI-NEXT: v_mul_f32_e32 v11, 1.0, v30 +; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v30, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v28, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v21, 1.0, s28 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_mul_f32_e32 v31, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v24 +; SI-NEXT: v_mul_f32_e32 v24, 1.0, v26 +; SI-NEXT: v_mul_f32_e32 v62, 1.0, v32 +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v33 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v34 +; SI-NEXT: v_mul_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v36 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v37 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v39 +; SI-NEXT: v_mul_f32_e32 v36, 1.0, v48 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s26 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, 
s[0:3], s32 offset:304 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v49 +; SI-NEXT: v_mul_f32_e32 v56, 1.0, v51 +; SI-NEXT: v_mul_f32_e32 v5, 1.0, v52 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v54 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_mul_f32_e32 v51, 1.0, v55 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v40 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_mul_f32_e32 v42, 1.0, v41 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_mul_f32_e32 v48, 1.0, v43 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v44 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_mul_f32_e32 v47, 1.0, v46 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v57 +; SI-NEXT: 
s_waitcnt vmcnt(6) +; SI-NEXT: v_mul_f32_e32 v58, 1.0, v58 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v34, 1.0, s25 +; SI-NEXT: v_mul_f32_e64 v33, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s29 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:348 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:352 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:356 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:360 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:364 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:368 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:372 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:376 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:380 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:384 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:388 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:392 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:396 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:400 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:404 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:408 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:412 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:416 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:420 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:424 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:428 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:476 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:480 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:488 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB105_2 ; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v54 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v52 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v50 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v45 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v53 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v14 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v24 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v5 -; SI-NEXT: v_mov_b32_e32 v42, v62 -; SI-NEXT: v_mov_b32_e32 v43, v63 -; SI-NEXT: v_mov_b32_e32 v55, v12 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v25 -; SI-NEXT: v_mov_b32_e32 v25, v60 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; 
SI-NEXT: v_mov_b32_e32 v40, v20 -; SI-NEXT: v_mov_b32_e32 v51, v61 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v46 -; SI-NEXT: v_mov_b32_e32 v29, v31 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_mov_b32_e32 v24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_mov_b32_e32 v52, v10 -; SI-NEXT: v_mov_b32_e32 v53, v59 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v49 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v45 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v50, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v62 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v35 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v62 -; SI-NEXT: v_mov_b32_e32 v62, v5 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v36 +; SI-NEXT: 
buffer_store_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(5) expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v39 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_mov_b32_e32 v41, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v20 -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v33 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v34 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_mov_b32_e32 v39, v4 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v37 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v48 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v48 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v37, v38 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_mov_b32_e32 v25, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_mov_b32_e32 v57, v13 +; SI-NEXT: 
v_mov_b32_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v54, v50 +; SI-NEXT: v_mov_b32_e32 v46, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v9 +; SI-NEXT: v_mov_b32_e32 v44, v15 +; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v59 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v10 +; SI-NEXT: v_mov_b32_e32 v41, v27 +; SI-NEXT: v_mov_b32_e32 v52, v62 +; SI-NEXT: v_mov_b32_e32 v21, v58 +; SI-NEXT: v_mov_b32_e32 v58, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; SI-NEXT: v_mov_b32_e32 v55, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v42 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v53, v5 +; SI-NEXT: v_mov_b32_e32 v42, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v13 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v3 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v50 +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_mov_b32_e32 v5, v19 +; SI-NEXT: v_mov_b32_e32 v7, v15 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v47 +; SI-NEXT: v_mov_b32_e32 v47, v3 +; SI-NEXT: v_mov_b32_e32 v3, v17 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_mov_b32_e32 v1, v13 ; SI-NEXT: s_branch .LBB105_3 ; SI-NEXT: .LBB105_2: -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v55, v12 -; SI-NEXT: v_mov_b32_e32 v33, v34 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v25, v1 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v21, v58 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v52, v62 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; 
implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr16 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; kill: killed $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v7, v5 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; kill: killed $vgpr1 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: v_mov_b32_e32 v51, v61 -; SI-NEXT: v_mov_b32_e32 v42, v62 -; SI-NEXT: v_mov_b32_e32 v29, v31 -; SI-NEXT: v_mov_b32_e32 v25, v60 -; SI-NEXT: v_mov_b32_e32 v24, v56 -; SI-NEXT: v_mov_b32_e32 v54, v47 -; SI-NEXT: v_mov_b32_e32 v40, v20 -; SI-NEXT: v_mov_b32_e32 v43, v63 -; SI-NEXT: v_mov_b32_e32 v52, v10 -; SI-NEXT: v_mov_b32_e32 v53, v59 -; SI-NEXT: v_mov_b32_e32 v39, v4 -; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 
4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v54, v50 +; SI-NEXT: v_mov_b32_e32 v56, v47 +; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v53, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v40, v3 +; SI-NEXT: v_mov_b32_e32 v44, v15 +; SI-NEXT: v_mov_b32_e32 v57, v13 +; SI-NEXT: v_mov_b32_e32 v46, v19 +; SI-NEXT: v_mov_b32_e32 v41, v27 ; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_mov_b32_e32 v42, v43 +; SI-NEXT: v_mov_b32_e32 v3, v17 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; kill: killed $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: .LBB105_3: ; %Flow -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload ; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; SI-NEXT: s_cbranch_vccnz .LBB105_5 ; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v40 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v19 -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v55 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v39 -; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v37 -; SI-NEXT: v_and_b32_e32 v30, 
0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v30 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v32 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:360 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:484 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v31 +; SI-NEXT: v_mov_b32_e32 v38, v9 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v31 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:424 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:488 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v30 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; SI-NEXT: s_waitcnt vmcnt(8) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill +; SI-NEXT: 
s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v28 +; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:480 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v34 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 +; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:476 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; 
SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v51 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; SI-NEXT: v_alignbit_b32 v1, v9, v1, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v4 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v26 +; SI-NEXT: v_alignbit_b32 v2, v4, v2, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_alignbit_b32 v1, v11, v1, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v41 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v19 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v15 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_alignbit_b32 v1, v14, v1, 16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v25 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v50 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v14 -; SI-NEXT: v_alignbit_b32 v51, v16, v1, 16 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v54 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v44 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v16 -; SI-NEXT: v_alignbit_b32 v1, v18, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v16 +; SI-NEXT: 
v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:396 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v23 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v27 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v28 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 -; SI-NEXT: v_and_b32_e32 v38, 0xffff0000, v2 -; SI-NEXT: v_and_b32_e32 v34, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v37, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:356 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v18, v20, v1, 16 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_alignbit_b32 v1, v22, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v13 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v23, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v11 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v26, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:256 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v32 -; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v29 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v33 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v33 -; SI-NEXT: v_and_b32_e32 v33, 0xffff0000, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 ; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, v27, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v12 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v25 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 -; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v24 -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v1, 
v28, v1, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:384 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v52, v30, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v36, v35, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 -; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v8 -; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v32 -; SI-NEXT: v_alignbit_b32 v48, v49, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v31 -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v10 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v25 -; SI-NEXT: v_and_b32_e32 v32, 0xffff0000, v20 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v42 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v28, v59, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v26, v28, v26, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v2, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_alignbit_b32 v46, v61, v31, 16 -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_alignbit_b32 v4, v6, v4, 16 +; SI-NEXT: 
v_and_b32_e32 v6, 0xffff0000, v53 +; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_alignbit_b32 v6, v8, v6, 16 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v21, v30, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21 +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v31 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v23, v10, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v63, v23, v27, 16 -; SI-NEXT: v_alignbit_b32 v27, v21, v12, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_alignbit_b32 v57, v58, v1, 16 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:400 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_alignbit_b32 v8, v10, v8, 16 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v17, v1, v20, 16 -; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:376 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 -; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v10, v12, v10, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:412 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v24, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 -; SI-NEXT: v_alignbit_b32 v56, v47, v20, 16 -; SI-NEXT: v_alignbit_b32 v20, v62, v11, 16 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v16, v56, v16, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; SI-NEXT: v_alignbit_b32 v12, v14, v12, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_alignbit_b32 v14, v18, v14, 16 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:388 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, 
off, s[0:3], s32 offset:360 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 +; SI-NEXT: v_alignbit_b32 v18, v20, v18, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:380 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v23 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_alignbit_b32 v24, v24, v23, 16 +; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v26 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:348 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v11 -; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v22, v45, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v29 -; SI-NEXT: v_alignbit_b32 v13, v60, v25, 16 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v22, v21, v20, 16 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:368 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:372 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[48:49], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[7:8], 16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9 -; SI-NEXT: v_alignbit_b32 v24, v44, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:352 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v26 +; SI-NEXT: v_alignbit_b32 v26, v59, v25, 16 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[51:52], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[1:2], 16 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v39, 0xffff0000, v15 -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v39, 0x40c00000, v39 +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:364 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; 
SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; SI-NEXT: v_alignbit_b32 v16, v45, v16, 16 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v9, v11, v9, 16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; SI-NEXT: v_alignbit_b32 v28, v58, v27, 16 +; SI-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v50, 0xffff0000, v15 -; SI-NEXT: v_mov_b32_e32 v15, v24 -; SI-NEXT: v_add_f32_e32 v50, 0x40c00000, v50 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v3, v3, v39, 16 -; SI-NEXT: v_alignbit_b32 v4, v3, v4, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; SI-NEXT: v_alignbit_b32 v20, v55, v20, 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[19:20], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v29 +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:404 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 +; SI-NEXT: v_alignbit_b32 v35, v43, v32, 16 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v30 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:392 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[62:63], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; SI-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v30 +; SI-NEXT: v_alignbit_b32 v39, v29, v32, 16 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[31:32], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[5:6], 16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v9, v5, 16 -; SI-NEXT: v_alignbit_b32 v5, v36, v7, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[31:32], v[27:28], 16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v2, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, v46, v33, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[31:32], v[23:24], 16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v24, v38, 16 -; SI-NEXT: v_alignbit_b32 v38, 
v48, v8, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v4, v22, v37, 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[17:18], 16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v57, v32, 16 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_alignbit_b32 v4, v20, v34, 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[13:14], 16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v20, v52 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[31:32], v[11:12], 16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v4, v13, v14, 16 -; SI-NEXT: v_mov_b32_e32 v14, v51 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 ; SI-NEXT: .LBB105_5: ; %end +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: 
buffer_load_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v7, v4 -; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v51 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, 
vcc, 24, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 +; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v33 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v14 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; SI-NEXT: 
buffer_load_dword v15, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v63 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v59 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 -; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -233273,632 +235614,740 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:80 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:76 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:56 ; 
SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:68 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:72 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v3 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v30 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:36 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:68 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v9 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; 
SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s19 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v47 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v2, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v33 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v33, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v35 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v59 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v9, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v61 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v63 -; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v37 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v21, v48 -; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v51 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v54 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v51, v55 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v37, v40 -; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v25, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s16 -; SI-NEXT: 
v_cvt_f16_f32_e32 v57, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v63, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v46, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v47, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v62, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v45, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v61, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v39 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v56 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v4, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB109_4 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v53 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v43 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v59 +; SI-NEXT: s_cbranch_scc0 .LBB109_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_cbranch_execnz .LBB109_3 -; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_branch .LBB109_3 +; SI-NEXT: .LBB109_2: +; SI-NEXT: s_mov_b64 s[4:5], -1 +; SI-NEXT: .LBB109_3: ; %Flow +; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: s_cbranch_vccnz .LBB109_5 +; SI-NEXT: ; %bb.4: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v5, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_mov_b32_e32 v47, v38 +; SI-NEXT: v_add_f32_e32 v5, 
0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v39, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v36, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 +; SI-NEXT: v_mov_b32_e32 v45, v35 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_mov_b32_e32 v57, v33 +; SI-NEXT: v_or_b32_e32 v34, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v56 +; SI-NEXT: v_or_b32_e32 v32, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v9, v31 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshr_b64 v[58:59], v[33:34], 16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25 -; SI-NEXT: 
v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_or_b32_e32 v1, v1, v48 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v2, v2, v48 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v5, v5, v48 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_or_b32_e32 v8, v8, v48 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_or_b32_e32 v11, v11, v48 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_or_b32_e32 v15, v15, v48 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_or_b32_e32 v18, v18, v48 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_or_b32_e32 v31, v31, v48 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v33 -; SI-NEXT: v_or_b32_e32 v32, v32, v48 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v36 -; SI-NEXT: v_or_b32_e32 v35, v35, v48 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v39 -; SI-NEXT: v_or_b32_e32 v38, v38, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, v51 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v48 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v52 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_or_b32_e32 v49, v48, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v48 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v53 -; SI-NEXT: v_or_b32_e32 v52, v48, v51 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v29, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v46 -; SI-NEXT: 
v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v59 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v48 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v30, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v26, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v22, v3, v5 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v18, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v54, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v12 +; SI-NEXT: v_mov_b32_e32 v12, v42 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v16, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; 
SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v14, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v60 +; SI-NEXT: v_or_b32_e32 v43, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 +; SI-NEXT: v_or_b32_e32 v10, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v41, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 +; SI-NEXT: v_lshr_b64 v[62:63], v[38:39], 16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v46, v48, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v57 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v48 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v61 -; SI-NEXT: v_or_b32_e32 v57, v48, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v63 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v49 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v60, v48, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v62 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_or_b32_e32 v59, v54, v51 -; 
SI-NEXT: v_cvt_f32_f16_e32 v54, v56 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_or_b32_e32 v56, v54, v48 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v45, v40, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v7 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_or_b32_e32 v7, v41, v55 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v51, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v51 +; SI-NEXT: v_or_b32_e32 v4, v3, v4 +; SI-NEXT: v_mov_b32_e32 v63, v51 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v45 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v47 +; SI-NEXT: v_or_b32_e32 v44, v28, v33 +; SI-NEXT: v_lshr_b64 v[46:47], v[29:30], 16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v49, v24, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v11 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v52 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: 
v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v52, v20, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v61 +; SI-NEXT: v_or_b32_e32 v61, v24, v29 +; SI-NEXT: v_mov_b32_e32 v38, v49 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v37, v20, v31 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 +; SI-NEXT: v_or_b32_e32 v12, v28, v25 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v14, v14, v13 -; SI-NEXT: v_or_b32_e32 v23, v23, v17 -; SI-NEXT: v_or_b32_e32 v34, v34, v21 -; SI-NEXT: v_alignbit_b32 v4, v57, v4, 16 -; SI-NEXT: v_alignbit_b32 v63, v46, v51, 16 -; SI-NEXT: v_alignbit_b32 v62, v29, v48, 16 -; SI-NEXT: v_alignbit_b32 v61, v52, v54, 16 -; SI-NEXT: v_alignbit_b32 v44, v49, v55, 16 -; SI-NEXT: v_alignbit_b32 v13, v32, v13, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v21, v2, v21, 16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v7, v7, v40 -; SI-NEXT: buffer_store_dword v7, off, 
s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v7, v41, v10 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v28, v12 +; SI-NEXT: v_or_b32_e32 v12, v20, v21 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v10, v35, v10, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v7 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v7, v41, v20 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v20, v12 +; SI-NEXT: v_or_b32_e32 v12, v24, v17 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v20, v31, v20, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v7 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_or_b32_e32 v7, v41, v28 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_or_b32_e32 v12, v28, v15 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v28, v15, v28, 16 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v12 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v7 -; SI-NEXT: v_or_b32_e32 v7, v41, v27 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v27, v11, v27, 16 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v12 +; SI-NEXT: v_or_b32_e32 v12, v20, v13 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v7, v41, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: v_or_b32_e32 v43, v42, v24 -; SI-NEXT: v_alignbit_b32 v26, v8, v26, 16 +; SI-NEXT: 
v_or_b32_e32 v12, v24, v42 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v20, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v54 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v7, v41, v37 -; SI-NEXT: v_mov_b32_e32 v51, v7 -; SI-NEXT: v_alignbit_b32 v7, v38, v40, 16 -; SI-NEXT: v_alignbit_b32 v24, v5, v24, 16 +; SI-NEXT: v_or_b32_e32 v12, v28, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[56:57], v[31:32], 16 +; SI-NEXT: v_or_b32_e32 v54, v20, v40 +; SI-NEXT: v_or_b32_e32 v20, v24, v5 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v31, v55 +; SI-NEXT: v_lshr_b64 v[54:55], v[15:16], 16 +; SI-NEXT: v_mov_b32_e32 v15, v20 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_alignbit_b32 v37, v1, v37, 16 -; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: v_and_b32_e32 v48, 0xffff, v60 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v4, v48, v4 -; SI-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v58 -; SI-NEXT: v_or_b32_e32 v4, v4, v48 -; SI-NEXT: v_add_i32_e32 v48, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v4, v48, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v63 -; SI-NEXT: v_or_b32_e32 v4, v4, v48 -; SI-NEXT: v_add_i32_e32 v48, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v4, v48, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v47 -; SI-NEXT: v_or_b32_e32 v4, v4, v48 -; SI-NEXT: v_add_i32_e32 v48, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v4, v48, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v56 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v62 -; SI-NEXT: v_or_b32_e32 v4, v4, v48 -; SI-NEXT: v_add_i32_e32 v48, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v4, v48, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v30 -; SI-NEXT: v_or_b32_e32 v4, v4, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v45 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v61 -; SI-NEXT: v_or_b32_e32 v4, v4, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v52 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v53 -; SI-NEXT: v_or_b32_e32 v4, v4, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v50, v8 +; SI-NEXT: v_or_b32_e32 v8, v28, v3 +; SI-NEXT: v_lshr_b64 v[28:29], v[5:6], 16 +; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v50 -; SI-NEXT: v_or_b32_e32 v4, v4, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v4, v29, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v12, v50, v1 +; SI-NEXT: v_lshr_b64 v[49:50], v[35:36], 16 +; SI-NEXT: v_mov_b32_e32 v35, v44 +; SI-NEXT: v_lshr_b64 v[44:45], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[42:43], 16 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_lshr_b64 v[20:21], v[9:10], 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v38 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v39 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_lshr_b64 v[20:21], v[40:41], 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[12:13], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 +; SI-NEXT: v_mov_b32_e32 v42, v61 +; SI-NEXT: v_mov_b32_e32 v61, v37 +; SI-NEXT: v_mov_b32_e32 v37, v53 +; SI-NEXT: v_mov_b32_e32 v51, v43 +; SI-NEXT: .LBB109_5: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v62 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; 
SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v49 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v38 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v33 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v20 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v56 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v22 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: s_waitcnt vmcnt(0) 
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v17 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v19 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v28 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; SI-NEXT: 
v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v43 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v24 -; SI-NEXT: v_or_b32_e32 v4, v4, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x6c, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x70, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v3, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v12 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x74, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x78, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v11 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -233920,8 +236369,6 @@ define inreg <64 x i16> @bitcast_v64f16_to_v64i16_scalar(<64 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB109_4: -; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v64f16_to_v64i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 9b28fd9e7b6fd..64b5ecc8f6b8e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -1209,37 +1209,35 @@ define inreg <8 x i16> @bitcast_v4i32_to_v8i16_scalar(<4 x i32> inreg %a, i32 in ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 ; SI-NEXT: .LBB13_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v3, s11 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v7, s10 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v4i32_to_v8i16_scalar: @@ -3544,65 +3542,67 @@ define inreg <16 x i8> @bitcast_v4i32_to_v16i8_scalar(<4 x i32> inreg %a, i32 in ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB25_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s6, s19, 24 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s19, 8 -; SI-NEXT: s_lshr_b32 s9, s17, 24 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: s_lshr_b32 s11, s17, 8 +; SI-NEXT: s_lshr_b32 s22, s19, 24 +; SI-NEXT: s_lshr_b32 s23, s19, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 8 +; SI-NEXT: s_lshr_b32 s25, s17, 24 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; 
SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB25_3 ; SI-NEXT: .LBB25_2: ; %cmp.true -; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s6, s19, 24 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s19, 8 -; SI-NEXT: s_lshr_b32 s9, s17, 24 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: s_lshr_b32 s11, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s22, s19, 24 +; SI-NEXT: s_lshr_b32 s23, s19, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 8 +; SI-NEXT: s_lshr_b32 s25, s17, 24 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s17, 8 ; SI-NEXT: .LBB25_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s14 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: v_mov_b32_e32 v3, s10 ; SI-NEXT: v_mov_b32_e32 v4, s17 -; SI-NEXT: v_mov_b32_e32 v5, s11 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v5, s27 +; SI-NEXT: v_mov_b32_e32 v6, s26 +; SI-NEXT: v_mov_b32_e32 v7, s25 ; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s8 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_mov_b32_e32 v11, s4 ; SI-NEXT: v_mov_b32_e32 v12, s19 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v14, s7 -; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s22 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB25_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: s_branch .LBB25_2 ; ; VI-LABEL: bitcast_v4i32_to_v16i8_scalar: @@ -5664,36 +5664,41 @@ define inreg <8 x i16> @bitcast_v4f32_to_v8i16_scalar(<4 x float> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB37_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], 
s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB37_4 ; SI-NEXT: .LBB37_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v11, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[5:6], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[10:11], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; SI-NEXT: s_branch .LBB37_5 ; SI-NEXT: .LBB37_3: -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: s_branch .LBB37_2 ; SI-NEXT: .LBB37_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s19 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: .LBB37_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v10 +; SI-NEXT: v_mov_b32_e32 v2, v11 +; SI-NEXT: v_mov_b32_e32 v4, v8 +; SI-NEXT: v_mov_b32_e32 v6, v9 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v8i16_scalar: @@ -7997,64 +8002,75 @@ define inreg <16 x i8> @bitcast_v4f32_to_v16i8_scalar(<4 x float> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB49_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s9, s19, 24 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: s_lshr_b32 s11, s19, 8 -; SI-NEXT: s_lshr_b32 s6, s17, 24 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_lshr_b32 s24, s19, 24 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 8 +; SI-NEXT: s_lshr_b32 s22, s17, 24 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB49_4 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v4, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s18, 1.0 -; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: 
v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v21, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[0:1], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[18:19], 24 +; SI-NEXT: v_lshr_b64 v[16:17], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[18:19], 8 +; SI-NEXT: v_lshr_b64 v[3:4], v[20:21], 24 +; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v21 +; SI-NEXT: s_branch .LBB49_5 ; SI-NEXT: .LBB49_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: s_branch .LBB49_2 ; SI-NEXT: .LBB49_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v4, s17 -; SI-NEXT: v_mov_b32_e32 v8, s18 -; SI-NEXT: v_mov_b32_e32 v12, s19 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v6, s7 -; SI-NEXT: v_mov_b32_e32 v7, s6 -; SI-NEXT: v_mov_b32_e32 v13, s11 -; SI-NEXT: v_mov_b32_e32 v14, s10 -; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v21, s17 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v5, s25 +; SI-NEXT: v_mov_b32_e32 v6, s23 +; SI-NEXT: v_mov_b32_e32 v7, s22 +; SI-NEXT: v_mov_b32_e32 v13, s27 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v15, s24 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v16, s12 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: .LBB49_5: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_mov_b32_e32 v0, v20 +; SI-NEXT: v_mov_b32_e32 v4, v21 +; SI-NEXT: v_mov_b32_e32 v8, v18 +; SI-NEXT: v_mov_b32_e32 v10, v16 +; SI-NEXT: v_mov_b32_e32 v12, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f32_to_v16i8_scalar: @@ -9769,37 +9785,35 @@ define inreg <8 x i16> @bitcast_v2i64_to_v8i16_scalar(<2 x i64> inreg %a, i32 in ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: 
s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 ; SI-NEXT: s_add_u32 s18, s18, 3 ; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s10, s19, 16 +; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: .LBB57_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v3, s11 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v7, s10 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v2i64_to_v8i16_scalar: @@ -12106,65 +12120,67 @@ define inreg <16 x i8> @bitcast_v2i64_to_v16i8_scalar(<2 x i64> inreg %a, i32 in ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB69_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s6, s19, 24 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s19, 8 -; SI-NEXT: s_lshr_b32 s9, s17, 24 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: s_lshr_b32 s11, s17, 8 +; SI-NEXT: s_lshr_b32 s22, s19, 24 +; SI-NEXT: s_lshr_b32 s23, s19, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 8 +; SI-NEXT: s_lshr_b32 s25, s17, 24 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB69_3 ; SI-NEXT: .LBB69_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 ; SI-NEXT: s_add_u32 s18, s18, 3 ; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s6, s19, 24 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s19, 8 -; SI-NEXT: s_lshr_b32 s9, s17, 24 -; SI-NEXT: s_lshr_b32 s10, s17, 16 -; SI-NEXT: s_lshr_b32 s11, s17, 8 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s22, s19, 24 +; SI-NEXT: 
s_lshr_b32 s23, s19, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 8 +; SI-NEXT: s_lshr_b32 s25, s17, 24 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 8 ; SI-NEXT: .LBB69_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s14 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: v_mov_b32_e32 v3, s10 ; SI-NEXT: v_mov_b32_e32 v4, s17 -; SI-NEXT: v_mov_b32_e32 v5, s11 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v5, s27 +; SI-NEXT: v_mov_b32_e32 v6, s26 +; SI-NEXT: v_mov_b32_e32 v7, s25 ; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s8 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_mov_b32_e32 v11, s4 ; SI-NEXT: v_mov_b32_e32 v12, s19 -; SI-NEXT: v_mov_b32_e32 v13, s8 -; SI-NEXT: v_mov_b32_e32 v14, s7 -; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s22 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB69_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr22 ; SI-NEXT: s_branch .LBB69_2 ; ; VI-LABEL: bitcast_v2i64_to_v16i8_scalar: @@ -13498,34 +13514,34 @@ define inreg <8 x i16> @bitcast_v2f64_to_v8i16_scalar(<2 x double> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB73_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s11, s19, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB73_4 ; SI-NEXT: .LBB73_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[8:9], s[18:19], 1.0 ; SI-NEXT: v_add_f64 v[10:11], s[16:17], 1.0 -; SI-NEXT: v_alignbit_b32 v5, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v11, v10, 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[10:11], 16 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; SI-NEXT: s_branch .LBB73_5 ; SI-NEXT: .LBB73_3: -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: s_branch .LBB73_2 ; SI-NEXT: .LBB73_4: -; SI-NEXT: v_mov_b32_e32 v11, s17 ; SI-NEXT: v_mov_b32_e32 v9, s19 -; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 
v11, s17 ; SI-NEXT: v_mov_b32_e32 v10, s16 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: .LBB73_5: ; %end ; SI-NEXT: v_mov_b32_e32 v0, v10 ; SI-NEXT: v_mov_b32_e32 v2, v11 @@ -15789,67 +15805,73 @@ define inreg <16 x i8> @bitcast_v2f64_to_v16i8_scalar(<2 x double> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB85_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s11, s19, 24 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 8 -; SI-NEXT: s_lshr_b32 s8, s17, 24 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 8 +; SI-NEXT: s_lshr_b32 s27, s19, 24 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s19, 8 +; SI-NEXT: s_lshr_b32 s24, s17, 24 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s22, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB85_4 ; SI-NEXT: .LBB85_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[16:17], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[18:19], s[16:17], 1.0 -; SI-NEXT: v_alignbit_b32 v11, v17, v16, 24 -; SI-NEXT: v_alignbit_b32 v10, v17, v16, 16 -; SI-NEXT: v_alignbit_b32 v9, v17, v16, 8 -; SI-NEXT: v_alignbit_b32 v3, v19, v18, 24 -; SI-NEXT: v_alignbit_b32 v2, v19, v18, 16 -; SI-NEXT: v_alignbit_b32 v1, v19, v18, 8 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v17 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v17 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v19 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v19 +; SI-NEXT: v_add_f64 v[20:21], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[18:19], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[0:1], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[18:19], 24 +; SI-NEXT: v_lshr_b64 v[16:17], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[18:19], 8 +; SI-NEXT: v_lshr_b64 v[3:4], v[20:21], 24 +; SI-NEXT: v_lshr_b64 v[1:2], v[20:21], 8 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v19 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v19 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v21 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v21 ; SI-NEXT: s_branch .LBB85_5 ; SI-NEXT: .LBB85_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; 
implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: s_branch .LBB85_2 ; SI-NEXT: .LBB85_4: -; SI-NEXT: v_mov_b32_e32 v19, s17 -; SI-NEXT: v_mov_b32_e32 v17, s19 -; SI-NEXT: v_mov_b32_e32 v16, s18 -; SI-NEXT: v_mov_b32_e32 v18, s16 -; SI-NEXT: v_mov_b32_e32 v7, s8 -; SI-NEXT: v_mov_b32_e32 v6, s7 -; SI-NEXT: v_mov_b32_e32 v5, s6 -; SI-NEXT: v_mov_b32_e32 v15, s11 -; SI-NEXT: v_mov_b32_e32 v14, s10 -; SI-NEXT: v_mov_b32_e32 v13, s9 +; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_mov_b32_e32 v21, s17 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v15, s27 +; SI-NEXT: v_mov_b32_e32 v14, s26 +; SI-NEXT: v_mov_b32_e32 v13, s25 +; SI-NEXT: v_mov_b32_e32 v7, s24 +; SI-NEXT: v_mov_b32_e32 v6, s23 +; SI-NEXT: v_mov_b32_e32 v5, s22 +; SI-NEXT: v_mov_b32_e32 v1, s14 +; SI-NEXT: v_mov_b32_e32 v0, s12 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v9, s8 +; SI-NEXT: v_mov_b32_e32 v16, s6 +; SI-NEXT: v_mov_b32_e32 v11, s4 ; SI-NEXT: .LBB85_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v18 -; SI-NEXT: v_mov_b32_e32 v4, v19 -; SI-NEXT: v_mov_b32_e32 v8, v16 -; SI-NEXT: v_mov_b32_e32 v12, v17 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_mov_b32_e32 v0, v20 +; SI-NEXT: v_mov_b32_e32 v4, v21 +; SI-NEXT: v_mov_b32_e32 v8, v18 +; SI-NEXT: v_mov_b32_e32 v10, v16 +; SI-NEXT: v_mov_b32_e32 v12, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f64_to_v16i8_scalar: @@ -17515,11 +17537,11 @@ define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: s_cmp_lg_u32 s24, 0 @@ -17530,8 +17552,8 @@ define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 i ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 @@ -17545,10 +17567,10 @@ define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 i ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 @@ -17557,11 +17579,13 @@ define inreg <8 x i16> @bitcast_v8f16_to_v8i16_scalar(<8 x half> inreg %a, i32 i ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v2, v2, v8 +; SI-NEXT: v_lshr_b64 v[10:11], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[5:6], 16 ; SI-NEXT: 
v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 ; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v10 +; SI-NEXT: v_mov_b32_e32 v5, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB91_4: ; SI-NEXT: s_branch .LBB91_2 @@ -18405,60 +18429,62 @@ define inreg <8 x i16> @bitcast_v8bf16_to_v8i16_scalar(<8 x bfloat> inreg %a, i3 ; SI-NEXT: v_mul_f32_e64 v15, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v14, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20 ; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v15 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v12 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 -; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v11 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v6, v7, v3, 16 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_lshr_b64 v[10:11], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[8:9], v[5:6], 16 +; SI-NEXT: v_alignbit_b32 v4, v12, v13, 16 ; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v10 +; SI-NEXT: 
v_mov_b32_e32 v5, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB95_2 @@ -19152,30 +19178,28 @@ define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 in ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 -; SI-NEXT: s_or_b32 s8, s4, s5 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: s_lshl_b32 s5, s21, 16 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: s_lshl_b32 s5, s23, 16 -; SI-NEXT: v_alignbit_b32 v3, s8, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s8, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s8, v0, 8 -; SI-NEXT: s_or_b32 s9, s4, s5 -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_alignbit_b32 v11, s9, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s9, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s9, v0, 8 -; SI-NEXT: s_lshr_b32 s10, s8, 8 -; SI-NEXT: s_lshr_b32 s13, s9, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 24 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s9, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 +; SI-NEXT: s_lshr_b32 s9, s5, 8 +; SI-NEXT: s_lshr_b32 s15, s7, 8 ; SI-NEXT: s_and_b32 s11, s19, 0xffff -; SI-NEXT: s_and_b32 s14, s23, 0xffff -; SI-NEXT: s_bfe_u32 s12, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s15, s23, 0x80008 +; SI-NEXT: s_and_b32 s25, s23, 0xffff +; SI-NEXT: s_bfe_u32 s13, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s27, s23, 0x80008 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true ; SI-NEXT: s_add_i32 s20, s20, 3 @@ -19183,64 +19207,66 @@ define inreg <16 x i8> @bitcast_v8i16_to_v16i8_scalar(<8 x i16> inreg %a, i32 in ; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: s_add_i32 s6, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s22, 0xffff ; SI-NEXT: s_lshl_b32 s5, s23, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s9, s4, 0x30000 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s6, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s8, s4, 0x30000 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_alignbit_b32 v3, s8, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s8, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s8, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s7 -; SI-NEXT: v_alignbit_b32 v11, s9, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s9, v0, 16 -; SI-NEXT: 
v_alignbit_b32 v9, s9, v0, 8 -; SI-NEXT: s_lshr_b32 s12, s8, 24 -; SI-NEXT: s_lshr_b32 s11, s8, 16 -; SI-NEXT: s_lshr_b32 s10, s8, 8 -; SI-NEXT: s_lshr_b32 s15, s9, 24 -; SI-NEXT: s_lshr_b32 s14, s9, 16 -; SI-NEXT: s_lshr_b32 s13, s9, 8 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s8, s19, 16 +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[24:25], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[6:7], 8 +; SI-NEXT: s_lshr_b32 s13, s5, 24 +; SI-NEXT: s_lshr_b32 s11, s5, 16 +; SI-NEXT: s_lshr_b32 s9, s5, 8 +; SI-NEXT: s_lshr_b32 s27, s7, 24 +; SI-NEXT: s_lshr_b32 s25, s7, 16 +; SI-NEXT: s_lshr_b32 s15, s7, 8 ; SI-NEXT: .LBB97_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_mov_b32_e32 v6, s11 -; SI-NEXT: v_mov_b32_e32 v7, s12 -; SI-NEXT: v_mov_b32_e32 v8, s7 -; SI-NEXT: v_mov_b32_e32 v12, s9 -; SI-NEXT: v_mov_b32_e32 v13, s13 -; SI-NEXT: v_mov_b32_e32 v14, s14 -; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s24 +; SI-NEXT: v_mov_b32_e32 v11, s14 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s15 +; SI-NEXT: v_mov_b32_e32 v14, s25 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v8i16_to_v16i8_scalar: @@ -20067,53 +20093,53 @@ define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 in ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s8, v0 +; SI-NEXT: v_readfirstlane_b32 s15, v0 ; SI-NEXT: s_cbranch_scc0 .LBB99_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, 
s23, 24 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s7, s5 -; SI-NEXT: s_or_b32 s10, s4, s5 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s7, s19, 24 -; SI-NEXT: s_or_b32 s4, s7, s4 -; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_or_b32 s40, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s40 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_or_b32 s42, s7, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s41, s5, s7 +; SI-NEXT: s_lshr_b64 s[8:9], s[40:41], 16 +; SI-NEXT: s_and_b32 s5, s28, 0xff ; SI-NEXT: s_lshl_b32 s9, s29, 8 -; SI-NEXT: s_or_b32 s7, s7, s9 -; SI-NEXT: s_and_b32 s9, s8, 0xff -; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s6, 24 -; SI-NEXT: s_or_b32 s13, s11, s9 -; SI-NEXT: s_and_b32 s9, s26, 0xff +; SI-NEXT: s_or_b32 s5, s5, s9 +; SI-NEXT: s_and_b32 s9, s15, 0xff ; SI-NEXT: s_lshl_b32 s9, s9, 16 -; SI-NEXT: s_lshl_b32 s11, s27, 24 -; SI-NEXT: s_or_b32 s9, s11, s9 -; SI-NEXT: s_and_b32 s11, s16, 0xff -; SI-NEXT: s_lshl_b32 s12, s17, 8 -; SI-NEXT: s_or_b32 s11, s11, s12 -; SI-NEXT: s_and_b32 s11, s11, 0xffff -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_or_b32 s11, s11, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s12, s25, 8 -; SI-NEXT: s_and_b32 s7, s7, 0xffff -; SI-NEXT: s_or_b32 s4, s4, s12 -; SI-NEXT: v_alignbit_b32 v1, s10, v0, 16 -; SI-NEXT: s_or_b32 s7, s7, s13 -; SI-NEXT: v_mov_b32_e32 v0, s9 +; SI-NEXT: s_lshl_b32 s10, s14, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s12, s10, s9 +; SI-NEXT: s_or_b32 s43, s5, s12 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_alignbit_b32 v5, s7, v0, 16 -; SI-NEXT: s_or_b32 s9, s4, s9 -; SI-NEXT: s_lshr_b32 s12, s5, 16 -; SI-NEXT: s_lshr_b32 s13, s13, 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[42:43], 16 +; SI-NEXT: s_or_b32 s4, s4, s42 +; SI-NEXT: s_lshr_b32 s9, s7, 16 +; SI-NEXT: s_lshr_b32 s11, s12, 16 +; SI-NEXT: s_mov_b32 s7, s41 +; SI-NEXT: s_mov_b32 s5, s43 ; SI-NEXT: s_cbranch_execnz .LBB99_3 ; SI-NEXT: .LBB99_2: ; %cmp.true ; SI-NEXT: s_add_i32 s24, s24, 3 @@ -20121,76 +20147,74 @@ define inreg <8 x i16> @bitcast_v16i8_to_v8i16_scalar(<16 x i8> inreg %a, i32 in ; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_and_b32 s6, s26, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s27, 24 -; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s7 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s9, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_add_i32 s8, s8, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_lshl_b32 s5, s6, 24 -; SI-NEXT: s_and_b32 s6, s8, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s28, 0xff +; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: 
s_add_i32 s15, s15, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s15, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s14, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s7, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s7, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s18, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s11, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s20, 0xff +; SI-NEXT: s_lshl_b32 s8, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s22, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s23, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s10, s4, 0x3000000 -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_alignbit_b32 v1, s10, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s9 -; SI-NEXT: v_alignbit_b32 v5, s7, v0, 16 -; SI-NEXT: s_lshr_b32 s12, s10, 16 -; SI-NEXT: s_lshr_b32 s13, s7, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s9, s22, 0xff +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s9, s7, 16 +; SI-NEXT: s_lshr_b32 s11, s5, 16 ; SI-NEXT: .LBB99_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v2, s10 -; SI-NEXT: v_mov_b32_e32 v3, s12 -; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_mov_b32_e32 v6, s7 -; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v6, s5 +; SI-NEXT: v_mov_b32_e32 v7, s11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: s_branch .LBB99_2 ; ; VI-LABEL: bitcast_v16i8_to_v8i16_scalar: @@ -22076,41 +22100,41 
@@ define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v8f16_to_v16i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v18, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s22 ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB105_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v8, v20, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_or_b32_e32 v0, v17, v0 -; SI-NEXT: v_or_b32_e32 v4, v16, v1 -; SI-NEXT: v_or_b32_e32 v12, v19, v5 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v23 +; SI-NEXT: v_or_b32_e32 v19, v16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_or_b32_e32 v20, v8, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v26 +; SI-NEXT: v_or_b32_e32 v17, v25, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 +; SI-NEXT: v_or_b32_e32 v18, v24, v1 +; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v18 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 +; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[17:18], 24 +; SI-NEXT: v_lshr_b64 v[21:22], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[17:18], 8 ; SI-NEXT: s_cbranch_execnz .LBB105_3 ; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -22120,13 +22144,13 @@ define inreg <16 x i8> @bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 i ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v8, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v18 +; SI-NEXT: v_or_b32_e32 v17, v1, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v14 -; SI-NEXT: v_or_b32_e32 v12, v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_or_b32_e32 v18, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -22136,34 +22160,38 @@ define inreg <16 x i8> 
@bitcast_v8f16_to_v16i8_scalar(<8 x half> inreg %a, i32 i ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v2, v1 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_or_b32_e32 v19, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_or_b32_e32 v20, v2, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 +; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[17:18], 24 +; SI-NEXT: v_lshr_b64 v[21:22], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[17:18], 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v18 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v4, v20 +; SI-NEXT: v_mov_b32_e32 v8, v17 +; SI-NEXT: v_mov_b32_e32 v10, v21 +; SI-NEXT: v_mov_b32_e32 v12, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB105_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_branch .LBB105_2 @@ -24073,89 +24101,94 @@ define inreg <16 x i8> @bitcast_v8bf16_to_v16i8_scalar(<8 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s24, 0 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v23, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v25, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s22 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v20 -; SI-NEXT: v_alignbit_b32 v0, v0, v19, 16 -; SI-NEXT: v_alignbit_b32 v4, v6, v17, 16 -; SI-NEXT: v_alignbit_b32 v8, v5, v23, 16 -; SI-NEXT: v_alignbit_b32 v12, v14, v21, 16 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; SI-NEXT: 
v_alignbit_b32 v10, v12, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v20 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_alignbit_b32 v19, v1, v16, 16 +; SI-NEXT: v_alignbit_b32 v20, v6, v8, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 +; SI-NEXT: v_alignbit_b32 v21, v2, v26, 16 +; SI-NEXT: v_alignbit_b32 v22, v14, v24, 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v23 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v22 +; SI-NEXT: v_lshr_b64 v[17:18], v[21:22], 24 +; SI-NEXT: v_lshr_b64 v[11:12], v[21:22], 8 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v21, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_alignbit_b32 v22, v14, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_alignbit_b32 v19, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_alignbit_b32 v20, v6, v1, 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[19:20], 24 +; SI-NEXT: v_lshr_b64 v[10:11], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 8 +; 
SI-NEXT: v_lshr_b64 v[17:18], v[21:22], 24 +; SI-NEXT: v_lshr_b64 v[11:12], v[21:22], 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v20 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 ; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v4, v20 +; SI-NEXT: v_mov_b32_e32 v8, v21 +; SI-NEXT: v_mov_b32_e32 v9, v11 +; SI-NEXT: v_mov_b32_e32 v11, v17 +; SI-NEXT: v_mov_b32_e32 v12, v22 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v8bf16_to_v16i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll index c87d52c1e6907..ee209f84efe7c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.160bit.ll @@ -520,44 +520,41 @@ define inreg <10 x i16> @bitcast_v5i32_to_v10i16_scalar(<5 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s21, 0 ; SI-NEXT: s_cbranch_scc0 .LBB5_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s4, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s17, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %cmp.true -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_mov_b32_e32 v0, s18 ; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s4, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s19, 16 -; SI-NEXT: s_lshr_b32 s7, s17, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s12, s19, 16 +; SI-NEXT: s_lshr_b32 s13, s17, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 ; SI-NEXT: .LBB5_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v3, s13 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 
v7, s12 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_branch .LBB5_2 ; ; VI-LABEL: bitcast_v5i32_to_v10i16_scalar: @@ -1731,42 +1728,47 @@ define inreg <10 x i16> @bitcast_v5f32_to_v10i16_scalar(<5 x float> inreg %a, i3 ; SI-NEXT: s_cmp_lg_u32 s21, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s4, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_4 ; SI-NEXT: .LBB13_2: ; %cmp.true ; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s4, v8, 16 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v14, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s18, 1.0 +; SI-NEXT: v_lshr_b64 v[5:6], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[8:9], 16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v14 +; SI-NEXT: s_branch .LBB13_5 ; SI-NEXT: .LBB13_3: -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: s_branch .LBB13_2 ; SI-NEXT: .LBB13_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_mov_b32_e32 v14, s17 +; SI-NEXT: v_mov_b32_e32 v11, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 ; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v3, s12 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: .LBB13_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v13 +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: v_mov_b32_e32 v4, v11 +; SI-NEXT: v_mov_b32_e32 v6, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f32_to_v10i16_scalar: @@ -3319,11 +3321,11 @@ define inreg <10 x i16> @bitcast_v10f16_to_v10i16_scalar(<10 x half> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; 
SI-NEXT: v_cvt_f16_f32_e32 v10, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 @@ -3333,49 +3335,51 @@ define inreg <10 x i16> @bitcast_v10f16_to_v10i16_scalar(<10 x half> inreg %a, i ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_or_b32_e32 v8, v8, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_or_b32_e32 v6, v6, v10 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshr_b64 v[10:11], v[1:2], 16 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v9 +; SI-NEXT: v_lshr_b64 v[11:12], v[5:6], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_or_b32_e32 v8, v8, v13 ; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v10 +; SI-NEXT: v_mov_b32_e32 v5, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: s_branch .LBB23_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll index c3ace0ac5af71..57eae8600dc4a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.192bit.ll @@ -1360,50 +1360,47 @@ define inreg <12 x i16> @bitcast_v6i32_to_v12i16_scalar(<6 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; 
%bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 ; SI-NEXT: .LBB13_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v3, s14 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s6 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v7, s13 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s6 +; SI-NEXT: v_mov_b32_e32 v11, s12 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v6i32_to_v12i16_scalar: @@ -3505,48 +3502,55 @@ define inreg <12 x i16> @bitcast_v6f32_to_v12i16_scalar(<6 x float> inreg %a, i3 ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB29_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_4 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 
-; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v17, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v14, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s20, 1.0 +; SI-NEXT: v_lshr_b64 v[9:10], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 +; SI-NEXT: s_branch .LBB29_5 ; SI-NEXT: .LBB29_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: s_branch .LBB29_2 ; SI-NEXT: .LBB29_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v14, s18 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v12, s20 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v3, s12 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v11, s14 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: .LBB29_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v2, v17 +; SI-NEXT: v_mov_b32_e32 v4, v14 +; SI-NEXT: v_mov_b32_e32 v6, v15 +; SI-NEXT: v_mov_b32_e32 v8, v12 +; SI-NEXT: v_mov_b32_e32 v10, v13 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f32_to_v12i16_scalar: @@ -5249,50 +5253,47 @@ define inreg <12 x i16> @bitcast_v3i64_to_v12i16_scalar(<3 x i64> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 ; SI-NEXT: s_add_u32 s20, s20, 3 ; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; 
SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s12, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: .LBB41_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s8 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v3, s14 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s6 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 v7, s13 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s6 +; SI-NEXT: v_mov_b32_e32 v11, s12 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v3i64_to_v12i16_scalar: @@ -6578,45 +6579,45 @@ define inreg <12 x i16> @bitcast_v3f64_to_v12i16_scalar(<3 x double> inreg %a, i ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB49_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s13, s19, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_4 ; SI-NEXT: .LBB49_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[16:17], s[16:17], 1.0 ; SI-NEXT: v_add_f64 v[12:13], s[20:21], 1.0 ; SI-NEXT: v_add_f64 v[14:15], s[18:19], 1.0 -; SI-NEXT: v_alignbit_b32 v9, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v5, v15, v14, 16 -; SI-NEXT: v_alignbit_b32 v1, v17, v16, 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[16:17], 16 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v17 ; SI-NEXT: s_branch .LBB49_5 ; SI-NEXT: .LBB49_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: s_branch .LBB49_2 ; SI-NEXT: .LBB49_4: -; SI-NEXT: v_mov_b32_e32 v17, s17 ; SI-NEXT: 
v_mov_b32_e32 v16, s16 -; SI-NEXT: v_mov_b32_e32 v15, s19 ; SI-NEXT: v_mov_b32_e32 v14, s18 -; SI-NEXT: v_mov_b32_e32 v13, s21 ; SI-NEXT: v_mov_b32_e32 v12, s20 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v17, s17 +; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v3, s12 +; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v11, s14 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: .LBB49_5: ; %end ; SI-NEXT: v_mov_b32_e32 v0, v16 ; SI-NEXT: v_mov_b32_e32 v2, v17 @@ -8296,15 +8297,15 @@ define inreg <12 x i16> @bitcast_v12f16_to_v12i16_scalar(<12 x half> inreg %a, i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 ; SI-NEXT: s_cmp_lg_u32 s28, 0 @@ -8317,53 +8318,56 @@ define inreg <12 x i16> @bitcast_v12f16_to_v12i16_scalar(<12 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_lshlrev_b32_e32 
v12, 16, v7 ; SI-NEXT: v_or_b32_e32 v6, v6, v12 ; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v12 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v2, v2, v12 +; SI-NEXT: v_lshr_b64 v[14:15], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[9:10], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 ; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v14 +; SI-NEXT: v_mov_b32_e32 v5, v15 +; SI-NEXT: v_mov_b32_e32 v9, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: s_branch .LBB59_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll index c830d6b344b6f..7d0897bb2151b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.224bit.ll @@ -585,57 +585,53 @@ define inreg <14 x i16> @bitcast_v7i32_to_v14i16_scalar(<7 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s23, 0 ; SI-NEXT: s_cbranch_scc0 .LBB5_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s4, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %cmp.true -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s4, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_lshr_b32 s23, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s14, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 ; SI-NEXT: .LBB5_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v3, s23 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s8 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_mov_b32_e32 
v7, s15 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s6 +; SI-NEXT: v_mov_b32_e32 v11, s14 ; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr23 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: s_branch .LBB5_2 ; ; VI-LABEL: bitcast_v7i32_to_v14i16_scalar: @@ -2048,54 +2044,61 @@ define inreg <14 x i16> @bitcast_v7f32_to_v14i16_scalar(<7 x float> inreg %a, i3 ; SI-NEXT: s_cmp_lg_u32 s23, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s4, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s23, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_4 ; SI-NEXT: .LBB13_2: ; %cmp.true ; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v13, s4, v12, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v20, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s20, 1.0 +; SI-NEXT: v_lshr_b64 v[9:10], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[12:13], 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v20 +; SI-NEXT: s_branch .LBB13_5 ; SI-NEXT: .LBB13_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB13_2 ; SI-NEXT: .LBB13_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; 
SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v19, s16 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_mov_b32_e32 v17, s18 +; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: v_mov_b32_e32 v16, s21 ; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v3, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v13, s6 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v9, s4 +; SI-NEXT: .LBB13_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v19 +; SI-NEXT: v_mov_b32_e32 v2, v20 +; SI-NEXT: v_mov_b32_e32 v4, v17 +; SI-NEXT: v_mov_b32_e32 v6, v18 +; SI-NEXT: v_mov_b32_e32 v8, v15 +; SI-NEXT: v_mov_b32_e32 v10, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f32_to_v14i16_scalar: @@ -3965,22 +3968,21 @@ define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i ; SI-LABEL: bitcast_v14f16_to_v14i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -3993,17 +3995,14 @@ define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -4012,40 +4011,46 @@ define inreg <14 x i16> @bitcast_v14f16_to_v14i16_scalar(<14 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: 
v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v10, v10, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_or_b32_e32 v6, v6, v14 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v2, v2, v14 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshr_b64 v[14:15], v[1:2], 16 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v2, v2, v14 +; SI-NEXT: v_lshr_b64 v[15:16], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[9:10], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 ; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v14 +; SI-NEXT: v_mov_b32_e32 v5, v15 +; SI-NEXT: v_mov_b32_e32 v9, v16 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: s_branch .LBB23_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index 52e125d0d658f..cb4b3bd4382a4 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -1514,63 +1514,59 @@ define inreg <16 x i16> @bitcast_v8i32_to_v16i16_scalar(<8 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: s_lshr_b32 s7, s21, 16 -; SI-NEXT: s_lshr_b32 s8, s19, 16 -; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; 
SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: s_lshr_b32 s7, s21, 16 -; SI-NEXT: s_lshr_b32 s8, s19, 16 -; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 16 ; SI-NEXT: .LBB13_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v3, s25 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s8 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v7, s24 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s6 ; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: v_mov_b32_e32 v11, s15 ; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v15, s14 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v8i32_to_v16i16_scalar: @@ -5255,119 +5251,123 @@ define inreg <32 x i8> @bitcast_v8i32_to_v32i8_scalar(<8 x i32> inreg %a, i32 in ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB25_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 -; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 -; SI-NEXT: v_alignbit_b32 v25, s23, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 -; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 -; SI-NEXT: v_alignbit_b32 v17, s21, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s6, s23, 24 -; SI-NEXT: s_lshr_b32 s7, s23, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 8 -; SI-NEXT: s_lshr_b32 s8, s21, 24 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 8 -; SI-NEXT: s_lshr_b32 s11, s19, 24 -; SI-NEXT: s_lshr_b32 s13, s19, 16 -; SI-NEXT: s_lshr_b32 s14, s19, 8 -; SI-NEXT: s_lshr_b32 s15, s17, 24 -; SI-NEXT: s_lshr_b32 s24, s17, 16 -; SI-NEXT: s_lshr_b32 s25, s17, 8 +; SI-NEXT: s_lshr_b32 s56, s23, 24 +; SI-NEXT: 
s_lshr_b32 s57, s23, 16 +; SI-NEXT: s_lshr_b32 s58, s23, 8 +; SI-NEXT: s_lshr_b32 s59, s21, 24 +; SI-NEXT: s_lshr_b32 s60, s21, 16 +; SI-NEXT: s_lshr_b32 s61, s21, 8 +; SI-NEXT: s_lshr_b32 s62, s19, 24 +; SI-NEXT: s_lshr_b32 s63, s19, 16 +; SI-NEXT: s_lshr_b32 s72, s19, 8 +; SI-NEXT: s_lshr_b32 s73, s17, 24 +; SI-NEXT: s_lshr_b32 s74, s17, 16 +; SI-NEXT: s_lshr_b32 s75, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[24:25], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB25_3 ; SI-NEXT: .LBB25_2: ; %cmp.true -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 -; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 -; SI-NEXT: v_alignbit_b32 v25, s23, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 -; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 -; SI-NEXT: v_alignbit_b32 v17, s21, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s6, s23, 24 -; SI-NEXT: s_lshr_b32 s7, s23, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 8 -; SI-NEXT: s_lshr_b32 s8, s21, 24 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 8 -; SI-NEXT: s_lshr_b32 s11, s19, 24 -; SI-NEXT: s_lshr_b32 s13, s19, 16 -; SI-NEXT: s_lshr_b32 s14, s19, 8 -; SI-NEXT: s_lshr_b32 s15, s17, 24 -; SI-NEXT: s_lshr_b32 s24, s17, 16 -; SI-NEXT: s_lshr_b32 s25, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[24:25], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s56, s23, 24 +; SI-NEXT: s_lshr_b32 s57, s23, 16 +; SI-NEXT: s_lshr_b32 s58, s23, 8 +; SI-NEXT: s_lshr_b32 s59, s21, 24 +; SI-NEXT: s_lshr_b32 s60, s21, 16 +; SI-NEXT: s_lshr_b32 s61, s21, 8 +; SI-NEXT: s_lshr_b32 s62, s19, 24 +; SI-NEXT: s_lshr_b32 s63, s19, 16 +; SI-NEXT: s_lshr_b32 s72, s19, 8 +; SI-NEXT: s_lshr_b32 s73, s17, 24 +; SI-NEXT: s_lshr_b32 s74, s17, 16 +; SI-NEXT: s_lshr_b32 s75, s17, 8 ; SI-NEXT: .LBB25_3: ; %end ; SI-NEXT: 
v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: v_mov_b32_e32 v2, s42 +; SI-NEXT: v_mov_b32_e32 v3, s40 ; SI-NEXT: v_mov_b32_e32 v4, s17 -; SI-NEXT: v_mov_b32_e32 v5, s25 -; SI-NEXT: v_mov_b32_e32 v6, s24 -; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v5, s75 +; SI-NEXT: v_mov_b32_e32 v6, s74 +; SI-NEXT: v_mov_b32_e32 v7, s73 ; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s24 ; SI-NEXT: v_mov_b32_e32 v12, s19 -; SI-NEXT: v_mov_b32_e32 v13, s14 -; SI-NEXT: v_mov_b32_e32 v14, s13 -; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v13, s72 +; SI-NEXT: v_mov_b32_e32 v14, s63 +; SI-NEXT: v_mov_b32_e32 v15, s62 ; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s14 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_mov_b32_e32 v19, s10 ; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_mov_b32_e32 v21, s12 -; SI-NEXT: v_mov_b32_e32 v22, s10 -; SI-NEXT: v_mov_b32_e32 v23, s8 +; SI-NEXT: v_mov_b32_e32 v21, s61 +; SI-NEXT: v_mov_b32_e32 v22, s60 +; SI-NEXT: v_mov_b32_e32 v23, s59 ; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v25, s8 +; SI-NEXT: v_mov_b32_e32 v26, s6 +; SI-NEXT: v_mov_b32_e32 v27, s4 ; SI-NEXT: v_mov_b32_e32 v28, s23 -; SI-NEXT: v_mov_b32_e32 v29, s9 -; SI-NEXT: v_mov_b32_e32 v30, s7 -; SI-NEXT: v_mov_b32_e32 v31, s6 +; SI-NEXT: v_mov_b32_e32 v29, s58 +; SI-NEXT: v_mov_b32_e32 v30, s57 +; SI-NEXT: v_mov_b32_e32 v31, s56 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB25_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: s_branch .LBB25_2 ; ; VI-LABEL: bitcast_v8i32_to_v32i8_scalar: @@ -8503,60 +8503,69 @@ define inreg <16 x i16> @bitcast_v8f32_to_v16i16_scalar(<8 x float> inreg %a, i3 ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB37_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, 
v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s25, s23, 16 +; SI-NEXT: s_lshr_b32 s24, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB37_4 ; SI-NEXT: .LBB37_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v23, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v17, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s22, 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[22:23], 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 +; SI-NEXT: s_branch .LBB37_5 ; SI-NEXT: .LBB37_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: s_branch .LBB37_2 ; SI-NEXT: .LBB37_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_mov_b32_e32 v11, s8 -; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v23, s17 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v17, s23 +; SI-NEXT: v_mov_b32_e32 v3, s14 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: 
v_mov_b32_e32 v1, s10 +; SI-NEXT: .LBB37_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v22 +; SI-NEXT: v_mov_b32_e32 v2, v23 +; SI-NEXT: v_mov_b32_e32 v4, v20 +; SI-NEXT: v_mov_b32_e32 v6, v21 +; SI-NEXT: v_mov_b32_e32 v8, v18 +; SI-NEXT: v_mov_b32_e32 v10, v19 +; SI-NEXT: v_mov_b32_e32 v12, v16 +; SI-NEXT: v_mov_b32_e32 v14, v17 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v16i16_scalar: @@ -12246,116 +12255,137 @@ define inreg <32 x i8> @bitcast_v8f32_to_v32i8_scalar(<8 x float> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB49_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 -; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 -; SI-NEXT: v_alignbit_b32 v25, s23, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 -; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 -; SI-NEXT: v_alignbit_b32 v17, s21, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s15, s23, 24 -; SI-NEXT: s_lshr_b32 s24, s23, 16 -; SI-NEXT: s_lshr_b32 s25, s23, 8 -; SI-NEXT: s_lshr_b32 s12, s21, 24 -; SI-NEXT: s_lshr_b32 s13, s21, 16 -; SI-NEXT: s_lshr_b32 s14, s21, 8 -; SI-NEXT: s_lshr_b32 s9, s19, 24 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: s_lshr_b32 s11, s19, 8 -; SI-NEXT: s_lshr_b32 s6, s17, 24 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_lshr_b32 s72, s23, 24 +; SI-NEXT: s_lshr_b32 s74, s23, 16 +; SI-NEXT: s_lshr_b32 s75, s23, 8 +; SI-NEXT: s_lshr_b32 s61, s21, 24 +; SI-NEXT: s_lshr_b32 s63, s21, 16 +; SI-NEXT: s_lshr_b32 s73, s21, 8 +; SI-NEXT: s_lshr_b32 s58, s19, 24 +; SI-NEXT: s_lshr_b32 s60, s19, 16 +; SI-NEXT: s_lshr_b32 s62, s19, 8 +; SI-NEXT: s_lshr_b32 s56, s17, 24 +; SI-NEXT: s_lshr_b32 s57, s17, 16 +; SI-NEXT: s_lshr_b32 s59, s17, 8 +; SI-NEXT: s_lshr_b64 s[40:41], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[24:25], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB49_4 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v4, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v28, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s22, 1.0 -; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: 
v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v20 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v12 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v39, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v38, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v49, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v48, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v35, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v34, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v37, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v36, s20, 1.0 +; SI-NEXT: v_lshr_b64 v[27:28], v[48:49], 24 +; SI-NEXT: v_lshr_b64 v[24:25], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[48:49], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[36:37], 24 +; SI-NEXT: v_lshr_b64 v[32:33], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[36:37], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[34:35], 24 +; SI-NEXT: v_lshr_b64 v[28:29], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[34:35], 8 +; SI-NEXT: v_lshr_b64 v[3:4], v[38:39], 24 +; SI-NEXT: v_lshr_b64 v[1:2], v[38:39], 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v49 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v49 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v37 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v37 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v35 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v39 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v39 +; SI-NEXT: s_branch .LBB49_5 ; SI-NEXT: .LBB49_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: s_branch .LBB49_2 ; 
SI-NEXT: .LBB49_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v4, s17 -; SI-NEXT: v_mov_b32_e32 v8, s18 -; SI-NEXT: v_mov_b32_e32 v12, s19 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_mov_b32_e32 v24, s22 -; SI-NEXT: v_mov_b32_e32 v28, s23 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v6, s7 -; SI-NEXT: v_mov_b32_e32 v7, s6 -; SI-NEXT: v_mov_b32_e32 v13, s11 -; SI-NEXT: v_mov_b32_e32 v14, s10 -; SI-NEXT: v_mov_b32_e32 v15, s9 -; SI-NEXT: v_mov_b32_e32 v21, s14 -; SI-NEXT: v_mov_b32_e32 v22, s13 -; SI-NEXT: v_mov_b32_e32 v23, s12 -; SI-NEXT: v_mov_b32_e32 v29, s25 -; SI-NEXT: v_mov_b32_e32 v30, s24 -; SI-NEXT: v_mov_b32_e32 v31, s15 +; SI-NEXT: v_mov_b32_e32 v38, s16 +; SI-NEXT: v_mov_b32_e32 v39, s17 +; SI-NEXT: v_mov_b32_e32 v34, s18 +; SI-NEXT: v_mov_b32_e32 v35, s19 +; SI-NEXT: v_mov_b32_e32 v36, s20 +; SI-NEXT: v_mov_b32_e32 v37, s21 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v5, s59 +; SI-NEXT: v_mov_b32_e32 v6, s57 +; SI-NEXT: v_mov_b32_e32 v7, s56 +; SI-NEXT: v_mov_b32_e32 v13, s62 +; SI-NEXT: v_mov_b32_e32 v14, s60 +; SI-NEXT: v_mov_b32_e32 v15, s58 +; SI-NEXT: v_mov_b32_e32 v21, s73 +; SI-NEXT: v_mov_b32_e32 v22, s63 +; SI-NEXT: v_mov_b32_e32 v23, s61 +; SI-NEXT: v_mov_b32_e32 v29, s75 +; SI-NEXT: v_mov_b32_e32 v30, s74 +; SI-NEXT: v_mov_b32_e32 v31, s72 +; SI-NEXT: v_mov_b32_e32 v27, s40 +; SI-NEXT: v_mov_b32_e32 v24, s42 +; SI-NEXT: v_mov_b32_e32 v25, s44 +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_mov_b32_e32 v32, s26 +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v28, s12 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: .LBB49_5: ; %end +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_mov_b32_e32 v0, v38 +; SI-NEXT: v_mov_b32_e32 v4, v39 +; SI-NEXT: v_mov_b32_e32 v10, v28 +; SI-NEXT: v_mov_b32_e32 v8, v34 +; SI-NEXT: v_mov_b32_e32 v12, v35 +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: v_mov_b32_e32 v16, v36 +; SI-NEXT: v_mov_b32_e32 v20, v37 +; SI-NEXT: v_mov_b32_e32 v26, v24 +; SI-NEXT: v_mov_b32_e32 v24, v48 +; SI-NEXT: v_mov_b32_e32 v28, v49 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v8f32_to_v32i8_scalar: @@ -15064,63 +15094,59 @@ define inreg <16 x i16> @bitcast_v4i64_to_v16i16_scalar(<4 x i64> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: s_lshr_b32 s7, s21, 16 -; SI-NEXT: s_lshr_b32 s8, s19, 16 -; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, 
s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 ; SI-NEXT: s_add_u32 s22, s22, 3 ; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: s_lshr_b32 s7, s21, 16 -; SI-NEXT: s_lshr_b32 s8, s19, 16 -; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s14, s23, 16 +; SI-NEXT: s_lshr_b32 s15, s21, 16 +; SI-NEXT: s_lshr_b32 s24, s19, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: .LBB57_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s10 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v3, s25 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s8 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v7, s24 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s6 ; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: v_mov_b32_e32 v11, s15 ; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v15, s14 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr24 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v4i64_to_v16i16_scalar: @@ -18815,119 +18841,123 @@ define inreg <32 x i8> @bitcast_v4i64_to_v32i8_scalar(<4 x i64> inreg %a, i32 in ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB69_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 -; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 -; SI-NEXT: v_alignbit_b32 v25, s23, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 -; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 -; SI-NEXT: v_alignbit_b32 v17, s21, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s6, s23, 24 -; SI-NEXT: s_lshr_b32 s7, s23, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 8 -; SI-NEXT: s_lshr_b32 s8, s21, 24 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 8 -; SI-NEXT: s_lshr_b32 s11, s19, 24 -; SI-NEXT: s_lshr_b32 s13, s19, 
16 -; SI-NEXT: s_lshr_b32 s14, s19, 8 -; SI-NEXT: s_lshr_b32 s15, s17, 24 -; SI-NEXT: s_lshr_b32 s24, s17, 16 -; SI-NEXT: s_lshr_b32 s25, s17, 8 +; SI-NEXT: s_lshr_b32 s56, s23, 24 +; SI-NEXT: s_lshr_b32 s57, s23, 16 +; SI-NEXT: s_lshr_b32 s58, s23, 8 +; SI-NEXT: s_lshr_b32 s59, s21, 24 +; SI-NEXT: s_lshr_b32 s60, s21, 16 +; SI-NEXT: s_lshr_b32 s61, s21, 8 +; SI-NEXT: s_lshr_b32 s62, s19, 24 +; SI-NEXT: s_lshr_b32 s63, s19, 16 +; SI-NEXT: s_lshr_b32 s72, s19, 8 +; SI-NEXT: s_lshr_b32 s73, s17, 24 +; SI-NEXT: s_lshr_b32 s74, s17, 16 +; SI-NEXT: s_lshr_b32 s75, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[24:25], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB69_3 ; SI-NEXT: .LBB69_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 ; SI-NEXT: s_add_u32 s22, s22, 3 ; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 -; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 -; SI-NEXT: v_alignbit_b32 v25, s23, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 -; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 -; SI-NEXT: v_alignbit_b32 v17, s21, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s19, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s6, s23, 24 -; SI-NEXT: s_lshr_b32 s7, s23, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 8 -; SI-NEXT: s_lshr_b32 s8, s21, 24 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 8 -; SI-NEXT: s_lshr_b32 s11, s19, 24 -; SI-NEXT: s_lshr_b32 s13, s19, 16 -; SI-NEXT: s_lshr_b32 s14, s19, 8 -; SI-NEXT: s_lshr_b32 s15, s17, 24 -; SI-NEXT: s_lshr_b32 s24, s17, 16 -; SI-NEXT: s_lshr_b32 s25, s17, 8 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s56, s23, 24 +; SI-NEXT: s_lshr_b32 s57, s23, 16 +; SI-NEXT: s_lshr_b32 s58, s23, 8 +; SI-NEXT: s_lshr_b32 s59, s21, 24 +; SI-NEXT: s_lshr_b32 s60, s21, 16 +; SI-NEXT: s_lshr_b32 s61, s21, 8 +; SI-NEXT: s_lshr_b32 s62, s19, 24 +; SI-NEXT: s_lshr_b32 s63, s19, 16 +; SI-NEXT: s_lshr_b32 s72, s19, 8 +; SI-NEXT: s_lshr_b32 s73, s17, 24 +; SI-NEXT: s_lshr_b32 s74, s17, 16 +; SI-NEXT: s_lshr_b32 s75, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[24:25], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16 +; SI-NEXT: 
s_lshr_b64 s[28:29], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 8 ; SI-NEXT: .LBB69_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: v_mov_b32_e32 v2, s42 +; SI-NEXT: v_mov_b32_e32 v3, s40 ; SI-NEXT: v_mov_b32_e32 v4, s17 -; SI-NEXT: v_mov_b32_e32 v5, s25 -; SI-NEXT: v_mov_b32_e32 v6, s24 -; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v5, s75 +; SI-NEXT: v_mov_b32_e32 v6, s74 +; SI-NEXT: v_mov_b32_e32 v7, s73 ; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s26 +; SI-NEXT: v_mov_b32_e32 v11, s24 ; SI-NEXT: v_mov_b32_e32 v12, s19 -; SI-NEXT: v_mov_b32_e32 v13, s14 -; SI-NEXT: v_mov_b32_e32 v14, s13 -; SI-NEXT: v_mov_b32_e32 v15, s11 +; SI-NEXT: v_mov_b32_e32 v13, s72 +; SI-NEXT: v_mov_b32_e32 v14, s63 +; SI-NEXT: v_mov_b32_e32 v15, s62 ; SI-NEXT: v_mov_b32_e32 v16, s20 +; SI-NEXT: v_mov_b32_e32 v17, s14 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_mov_b32_e32 v19, s10 ; SI-NEXT: v_mov_b32_e32 v20, s21 -; SI-NEXT: v_mov_b32_e32 v21, s12 -; SI-NEXT: v_mov_b32_e32 v22, s10 -; SI-NEXT: v_mov_b32_e32 v23, s8 +; SI-NEXT: v_mov_b32_e32 v21, s61 +; SI-NEXT: v_mov_b32_e32 v22, s60 +; SI-NEXT: v_mov_b32_e32 v23, s59 ; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v25, s8 +; SI-NEXT: v_mov_b32_e32 v26, s6 +; SI-NEXT: v_mov_b32_e32 v27, s4 ; SI-NEXT: v_mov_b32_e32 v28, s23 -; SI-NEXT: v_mov_b32_e32 v29, s9 -; SI-NEXT: v_mov_b32_e32 v30, s7 -; SI-NEXT: v_mov_b32_e32 v31, s6 +; SI-NEXT: v_mov_b32_e32 v29, s58 +; SI-NEXT: v_mov_b32_e32 v30, s57 +; SI-NEXT: v_mov_b32_e32 v31, s56 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB69_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr59 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: s_branch .LBB69_2 ; ; VI-LABEL: bitcast_v4i64_to_v32i8_scalar: @@ -21155,56 +21185,56 @@ define inreg <16 x i16> @bitcast_v4f64_to_v16i16_scalar(<4 x double> inreg %a, i ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB73_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: 
v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s25, s23, 16 +; SI-NEXT: s_lshr_b32 s24, s21, 16 +; SI-NEXT: s_lshr_b32 s15, s19, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB73_4 ; SI-NEXT: .LBB73_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[22:23], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[20:21], s[18:19], 1.0 ; SI-NEXT: v_add_f64 v[16:17], s[22:23], 1.0 ; SI-NEXT: v_add_f64 v[18:19], s[20:21], 1.0 -; SI-NEXT: v_alignbit_b32 v13, v17, v16, 16 -; SI-NEXT: v_alignbit_b32 v9, v19, v18, 16 -; SI-NEXT: v_alignbit_b32 v5, v21, v20, 16 -; SI-NEXT: v_alignbit_b32 v1, v23, v22, 16 +; SI-NEXT: v_add_f64 v[20:21], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[22:23], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[18:19], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[22:23], 16 ; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v23 ; SI-NEXT: s_branch .LBB73_5 ; SI-NEXT: .LBB73_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr25 ; SI-NEXT: s_branch .LBB73_2 ; SI-NEXT: .LBB73_4: -; SI-NEXT: v_mov_b32_e32 v23, s17 -; SI-NEXT: v_mov_b32_e32 v21, s19 -; SI-NEXT: v_mov_b32_e32 v19, s21 ; SI-NEXT: v_mov_b32_e32 v17, s23 -; SI-NEXT: v_mov_b32_e32 v16, s22 -; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v19, s21 +; SI-NEXT: v_mov_b32_e32 v21, s19 +; SI-NEXT: v_mov_b32_e32 v23, s17 ; SI-NEXT: v_mov_b32_e32 v22, s16 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_mov_b32_e32 v11, s8 -; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v20, s18 +; SI-NEXT: v_mov_b32_e32 v18, s20 +; SI-NEXT: v_mov_b32_e32 v16, s22 +; SI-NEXT: v_mov_b32_e32 v15, s25 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v7, s15 +; SI-NEXT: v_mov_b32_e32 v3, s14 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: .LBB73_5: ; %end ; SI-NEXT: v_mov_b32_e32 v0, v22 ; SI-NEXT: v_mov_b32_e32 v2, v23 @@ -24819,121 +24849,133 @@ define inreg <32 x i8> @bitcast_v4f64_to_v32i8_scalar(<4 x double> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB85_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v27, s23, v0, 24 -; SI-NEXT: 
v_alignbit_b32 v26, s23, v0, 16 -; SI-NEXT: v_alignbit_b32 v32, s23, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v19, s21, v0, 24 -; SI-NEXT: v_alignbit_b32 v18, s21, v0, 16 -; SI-NEXT: v_alignbit_b32 v33, s21, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v11, s19, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s19, v0, 16 -; SI-NEXT: v_alignbit_b32 v34, s19, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v35, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s25, s23, 24 -; SI-NEXT: s_lshr_b32 s24, s23, 16 -; SI-NEXT: s_lshr_b32 s15, s23, 8 -; SI-NEXT: s_lshr_b32 s14, s21, 24 -; SI-NEXT: s_lshr_b32 s13, s21, 16 -; SI-NEXT: s_lshr_b32 s12, s21, 8 -; SI-NEXT: s_lshr_b32 s11, s19, 24 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 8 -; SI-NEXT: s_lshr_b32 s8, s17, 24 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 8 +; SI-NEXT: s_lshr_b32 s75, s23, 24 +; SI-NEXT: s_lshr_b32 s74, s23, 16 +; SI-NEXT: s_lshr_b32 s73, s23, 8 +; SI-NEXT: s_lshr_b32 s72, s21, 24 +; SI-NEXT: s_lshr_b32 s63, s21, 16 +; SI-NEXT: s_lshr_b32 s62, s21, 8 +; SI-NEXT: s_lshr_b32 s61, s19, 24 +; SI-NEXT: s_lshr_b32 s60, s19, 16 +; SI-NEXT: s_lshr_b32 s59, s19, 8 +; SI-NEXT: s_lshr_b32 s58, s17, 24 +; SI-NEXT: s_lshr_b32 s57, s17, 16 +; SI-NEXT: s_lshr_b32 s56, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[24:25], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB85_4 ; SI-NEXT: .LBB85_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[24:25], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[16:17], s[20:21], 1.0 -; SI-NEXT: v_alignbit_b32 v27, v25, v24, 24 -; SI-NEXT: v_alignbit_b32 v26, v25, v24, 16 -; SI-NEXT: v_alignbit_b32 v32, v25, v24, 8 -; SI-NEXT: v_alignbit_b32 v19, v17, v16, 24 -; SI-NEXT: v_alignbit_b32 v18, v17, v16, 16 -; SI-NEXT: v_alignbit_b32 v33, v17, v16, 8 -; SI-NEXT: v_alignbit_b32 v11, v9, v8, 24 -; SI-NEXT: v_alignbit_b32 v10, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v34, v9, v8, 8 -; SI-NEXT: v_alignbit_b32 v3, v1, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v1, v0, 16 -; SI-NEXT: v_alignbit_b32 v35, v1, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v25 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v25 -; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v17 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v9 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; SI-NEXT: v_add_f64 v[50:51], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[37:38], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[48:49], s[16:17], 1.0 +; SI-NEXT: v_add_f64 v[35:36], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[24:25], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[0:1], v[48:49], 
16 +; SI-NEXT: v_lshr_b64 v[27:28], v[50:51], 24 +; SI-NEXT: v_lshr_b64 v[25:26], v[50:51], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[37:38], 24 +; SI-NEXT: v_lshr_b64 v[17:18], v[37:38], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[35:36], 24 +; SI-NEXT: v_lshr_b64 v[33:34], v[35:36], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[35:36], 8 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v51 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v51 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v38 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v38 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v36 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v36 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v49 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 ; SI-NEXT: s_branch .LBB85_5 ; SI-NEXT: .LBB85_3: -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr24 -; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: s_branch .LBB85_2 ; SI-NEXT: .LBB85_4: -; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: v_mov_b32_e32 v9, s19 -; SI-NEXT: v_mov_b32_e32 v17, s21 -; SI-NEXT: v_mov_b32_e32 v25, s23 -; SI-NEXT: v_mov_b32_e32 v24, s22 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: v_mov_b32_e32 v8, s18 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v7, s8 -; SI-NEXT: v_mov_b32_e32 v6, s7 -; SI-NEXT: v_mov_b32_e32 v5, s6 -; SI-NEXT: v_mov_b32_e32 v15, s11 -; SI-NEXT: v_mov_b32_e32 v14, s10 -; SI-NEXT: v_mov_b32_e32 v13, s9 -; SI-NEXT: v_mov_b32_e32 v23, s14 -; SI-NEXT: v_mov_b32_e32 v22, s13 -; SI-NEXT: v_mov_b32_e32 v21, s12 -; SI-NEXT: v_mov_b32_e32 v31, s25 -; SI-NEXT: v_mov_b32_e32 v30, s24 -; SI-NEXT: v_mov_b32_e32 v29, s15 +; SI-NEXT: v_mov_b32_e32 v51, s23 +; SI-NEXT: v_mov_b32_e32 v38, s21 +; SI-NEXT: v_mov_b32_e32 v36, s19 +; SI-NEXT: v_mov_b32_e32 v49, s17 +; SI-NEXT: v_mov_b32_e32 
v48, s16 +; SI-NEXT: v_mov_b32_e32 v35, s18 +; SI-NEXT: v_mov_b32_e32 v37, s20 +; SI-NEXT: v_mov_b32_e32 v50, s22 +; SI-NEXT: v_mov_b32_e32 v31, s75 +; SI-NEXT: v_mov_b32_e32 v30, s74 +; SI-NEXT: v_mov_b32_e32 v29, s73 +; SI-NEXT: v_mov_b32_e32 v23, s72 +; SI-NEXT: v_mov_b32_e32 v22, s63 +; SI-NEXT: v_mov_b32_e32 v21, s62 +; SI-NEXT: v_mov_b32_e32 v15, s61 +; SI-NEXT: v_mov_b32_e32 v14, s60 +; SI-NEXT: v_mov_b32_e32 v13, s59 +; SI-NEXT: v_mov_b32_e32 v7, s58 +; SI-NEXT: v_mov_b32_e32 v6, s57 +; SI-NEXT: v_mov_b32_e32 v5, s56 +; SI-NEXT: v_mov_b32_e32 v1, s44 +; SI-NEXT: v_mov_b32_e32 v0, s42 +; SI-NEXT: v_mov_b32_e32 v3, s40 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v33, s26 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v17, s14 +; SI-NEXT: v_mov_b32_e32 v32, s12 +; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: v_mov_b32_e32 v25, s8 +; SI-NEXT: v_mov_b32_e32 v24, s6 +; SI-NEXT: v_mov_b32_e32 v27, s4 ; SI-NEXT: .LBB85_5: ; %end -; SI-NEXT: v_mov_b32_e32 v4, v1 -; SI-NEXT: v_mov_b32_e32 v12, v9 -; SI-NEXT: v_mov_b32_e32 v20, v17 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v1, v35 -; SI-NEXT: v_mov_b32_e32 v9, v34 -; SI-NEXT: v_mov_b32_e32 v17, v33 -; SI-NEXT: v_mov_b32_e32 v25, v32 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: v_mov_b32_e32 v0, v48 +; SI-NEXT: v_mov_b32_e32 v4, v49 +; SI-NEXT: v_mov_b32_e32 v10, v33 +; SI-NEXT: v_mov_b32_e32 v8, v35 +; SI-NEXT: v_mov_b32_e32 v12, v36 +; SI-NEXT: v_mov_b32_e32 v18, v32 +; SI-NEXT: v_mov_b32_e32 v16, v37 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: v_mov_b32_e32 v26, v24 +; SI-NEXT: v_mov_b32_e32 v24, v50 +; SI-NEXT: v_mov_b32_e32 v28, v51 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v4f64_to_v32i8_scalar: @@ -27681,26 +27723,24 @@ define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i ; SI-LABEL: bitcast_v16f16_to_v16i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v15, v1 -; SI-NEXT: v_mov_b32_e32 v14, v0 -; SI-NEXT: v_mov_b32_e32 v16, v2 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB91_4 ; SI-NEXT: ; %bb.1: ; %cmp.false @@ -27713,16 +27753,12 @@ define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v1, 
v18 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 @@ -27730,51 +27766,59 @@ define inreg <16 x i16> @bitcast_v16f16_to_v16i16_scalar(<16 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v10, v10, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v14, v14, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v7 ; SI-NEXT: v_or_b32_e32 v6, v6, v16 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v16 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v2, v2, v16 +; SI-NEXT: v_lshr_b64 v[18:19], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[16:17], v[13:14], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v18 +; SI-NEXT: v_mov_b32_e32 v5, v21 +; SI-NEXT: v_mov_b32_e32 v9, v19 +; SI-NEXT: v_mov_b32_e32 v13, v16 ; SI-NEXT: s_setpc_b64 
s[30:31] ; SI-NEXT: .LBB91_4: ; SI-NEXT: s_branch .LBB91_2 @@ -29151,115 +29195,119 @@ define inreg <16 x i16> @bitcast_v16bf16_to_v16i16_scalar(<16 x bfloat> inreg %a ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mul_f32_e64 v31, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v30, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v20, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v16, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v29, 1.0, s20 ; SI-NEXT: v_mul_f32_e64 v28, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v19, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v18, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v22, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v27, 1.0, s24 ; SI-NEXT: v_mul_f32_e64 v26, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v21, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v20, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v23, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v25, 1.0, s28 ; SI-NEXT: v_mul_f32_e64 v24, 1.0, s29 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v22, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v29 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v13 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v30 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v28 ; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v29 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v26 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v26 ; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v27 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: 
v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v23 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v8, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v24 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v20 -; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v21 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v23 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v18 -; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_alignbit_b32 v10, v11, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v22 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v16 -; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_alignbit_b32 v14, v15, v7, 16 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v16 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 +; SI-NEXT: v_alignbit_b32 v6, v7, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_alignbit_b32 v2, v3, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_lshr_b64 v[17:18], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[18:19], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[13:14], 16 +; SI-NEXT: v_alignbit_b32 v12, v24, v25, 16 ; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v17 +; SI-NEXT: v_mov_b32_e32 v5, v18 +; SI-NEXT: v_mov_b32_e32 v9, v21 +; SI-NEXT: v_mov_b32_e32 v13, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; 
implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v16bf16_to_v16i16_scalar: @@ -30446,80 +30494,83 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v16i16_to_v32i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v4, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s78, v1 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_readfirstlane_b32 s79, v0 ; SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s11, s4, s5 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 -; SI-NEXT: s_or_b32 s12, s4, s5 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: s_lshl_b32 s5, s21, 16 -; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: s_lshl_b32 s5, s23, 16 -; SI-NEXT: s_or_b32 s9, s4, s5 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: s_lshl_b32 s5, s25, 16 -; SI-NEXT: v_mov_b32_e32 v6, s6 -; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: s_lshl_b32 s5, s27, 16 -; SI-NEXT: v_alignbit_b32 v11, s9, v6, 24 -; SI-NEXT: v_alignbit_b32 v10, s9, v6, 16 -; SI-NEXT: v_alignbit_b32 v9, s9, v6, 8 -; SI-NEXT: s_or_b32 s10, s4, s5 -; SI-NEXT: v_mov_b32_e32 v6, s7 -; SI-NEXT: v_alignbit_b32 v19, s10, v6, 24 -; SI-NEXT: v_alignbit_b32 v18, s10, v6, 16 -; SI-NEXT: v_alignbit_b32 v17, s10, v6, 8 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: s_lshl_b32 s5, s29, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: s_or_b32 s8, s4, s5 -; SI-NEXT: v_or_b32_e32 v28, v6, v5 -; SI-NEXT: v_alignbit_b32 v3, s12, v1, 24 -; SI-NEXT: v_alignbit_b32 v2, s12, v1, 16 -; SI-NEXT: v_alignbit_b32 v1, s12, v1, 8 -; SI-NEXT: v_alignbit_b32 v27, v28, s8, 24 -; SI-NEXT: v_alignbit_b32 v26, v28, s8, 16 -; SI-NEXT: v_alignbit_b32 v25, v28, s8, 8 -; SI-NEXT: s_lshr_b32 s44, s12, 8 -; SI-NEXT: s_lshr_b32 s14, s9, 8 -; SI-NEXT: s_lshr_b32 s41, s10, 8 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 -; SI-NEXT: s_and_b32 s45, s19, 0xffff -; SI-NEXT: s_and_b32 s15, s23, 0xffff -; SI-NEXT: s_and_b32 s42, s27, 0xffff -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v4 -; SI-NEXT: s_bfe_u32 s13, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s40, s23, 0x80008 -; SI-NEXT: s_bfe_u32 s43, s27, 0x80008 -; SI-NEXT: v_bfe_u32 v31, v4, 8, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xffff +; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 24 +; SI-NEXT: s_or_b32 s9, s9, s10 +; SI-NEXT: s_and_b32 
s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s79, 0xffff +; SI-NEXT: s_lshl_b32 s13, s78, 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: s_lshr_b32 s13, s5, 8 +; SI-NEXT: s_lshr_b32 s41, s7, 8 +; SI-NEXT: s_lshr_b32 s47, s9, 8 +; SI-NEXT: s_lshr_b32 s88, s11, 8 +; SI-NEXT: s_and_b32 s15, s19, 0xffff +; SI-NEXT: s_and_b32 s45, s23, 0xffff +; SI-NEXT: s_and_b32 s59, s27, 0xffff +; SI-NEXT: s_and_b32 s90, s78, 0xffff +; SI-NEXT: s_bfe_u32 s43, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s57, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s89, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s91, s78, 0x80008 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[62:63], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 8 ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_and_b32 s4, s28, 0xffff ; SI-NEXT: s_lshl_b32 s5, s29, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s79, s79, 3 +; SI-NEXT: s_add_i32 s10, s4, 0x30000 +; SI-NEXT: s_and_b32 s4, s79, 0xffff +; SI-NEXT: s_lshl_b32 s5, s78, 16 +; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s8, s4, 0x30000 +; SI-NEXT: s_add_i32 s11, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s24, 0xffff ; SI-NEXT: s_lshl_b32 s5, s25, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s7, s4, 0x30000 +; SI-NEXT: s_add_i32 s8, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s26, 0xffff ; SI-NEXT: s_lshl_b32 s5, s27, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s10, s4, 0x30000 +; SI-NEXT: s_add_i32 s9, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s20, 0xffff ; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 @@ -30529,99 +30580,103 @@ define inreg <32 x i8> @bitcast_v16i16_to_v32i8_scalar(<16 x i16> inreg %a, i32 ; SI-NEXT: s_lshl_b32 s5, s23, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s9, s4, 0x30000 +; SI-NEXT: s_add_i32 s7, s4, 0x30000 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_add_i32 s11, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 -; SI-NEXT: v_or_b32_e32 v0, v5, v0 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_mov_b32_e32 v4, s6 -; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_mov_b32_e32 v0, s8 -; SI-NEXT: s_add_i32 s12, s4, 0x30000 -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_alignbit_b32 v11, s9, v4, 24 -; SI-NEXT: v_alignbit_b32 v10, s9, v4, 16 -; SI-NEXT: v_alignbit_b32 v9, s9, v4, 8 -; SI-NEXT: v_mov_b32_e32 v4, s7 -; SI-NEXT: v_alignbit_b32 v3, s12, v1, 24 -; SI-NEXT: v_alignbit_b32 v2, s12, v1, 16 -; SI-NEXT: v_alignbit_b32 v1, s12, v1, 8 -; SI-NEXT: v_alignbit_b32 v19, s10, v4, 24 -; SI-NEXT: v_alignbit_b32 v18, s10, v4, 16 -; SI-NEXT: v_alignbit_b32 v17, s10, v4, 8 -; SI-NEXT: v_alignbit_b32 v27, v28, v0, 24 -; SI-NEXT: v_alignbit_b32 v26, v28, v0, 16 -; SI-NEXT: v_alignbit_b32 
v25, v28, v0, 8 -; SI-NEXT: s_lshr_b32 s13, s12, 24 -; SI-NEXT: s_lshr_b32 s45, s12, 16 -; SI-NEXT: s_lshr_b32 s44, s12, 8 -; SI-NEXT: s_lshr_b32 s40, s9, 24 -; SI-NEXT: s_lshr_b32 s15, s9, 16 -; SI-NEXT: s_lshr_b32 s14, s9, 8 -; SI-NEXT: s_lshr_b32 s43, s10, 24 -; SI-NEXT: s_lshr_b32 s42, s10, 16 -; SI-NEXT: s_lshr_b32 s41, s10, 8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v28 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s12, s19, 16 +; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[62:63], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[10:11], 8 +; SI-NEXT: s_lshr_b32 s43, s5, 24 +; SI-NEXT: s_lshr_b32 s15, s5, 16 +; SI-NEXT: s_lshr_b32 s13, s5, 8 +; SI-NEXT: s_lshr_b32 s57, s7, 24 +; SI-NEXT: s_lshr_b32 s45, s7, 16 +; SI-NEXT: s_lshr_b32 s41, s7, 8 +; SI-NEXT: s_lshr_b32 s89, s9, 24 +; SI-NEXT: s_lshr_b32 s59, s9, 16 +; SI-NEXT: s_lshr_b32 s47, s9, 8 +; SI-NEXT: s_lshr_b32 s91, s11, 24 +; SI-NEXT: s_lshr_b32 s90, s11, 16 +; SI-NEXT: s_lshr_b32 s88, s11, 8 ; SI-NEXT: .LBB97_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s11 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v5, s44 -; SI-NEXT: v_mov_b32_e32 v6, s45 -; SI-NEXT: v_mov_b32_e32 v7, s13 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s40 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_mov_b32_e32 v3, s12 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_mov_b32_e32 v6, s15 +; SI-NEXT: v_mov_b32_e32 v7, s43 ; SI-NEXT: v_mov_b32_e32 v8, s6 -; SI-NEXT: v_mov_b32_e32 v12, s9 -; SI-NEXT: v_mov_b32_e32 v13, s14 -; SI-NEXT: v_mov_b32_e32 v14, s15 -; SI-NEXT: v_mov_b32_e32 v15, s40 -; SI-NEXT: v_mov_b32_e32 v16, s7 -; SI-NEXT: v_mov_b32_e32 v20, s10 -; SI-NEXT: v_mov_b32_e32 v21, s41 -; SI-NEXT: v_mov_b32_e32 v22, s42 -; SI-NEXT: v_mov_b32_e32 v23, s43 -; SI-NEXT: v_mov_b32_e32 v24, s8 +; SI-NEXT: v_mov_b32_e32 v9, s46 +; SI-NEXT: v_mov_b32_e32 v10, s44 +; SI-NEXT: v_mov_b32_e32 v11, s42 +; SI-NEXT: v_mov_b32_e32 v12, s7 +; SI-NEXT: v_mov_b32_e32 v13, s41 +; SI-NEXT: v_mov_b32_e32 v14, s45 +; SI-NEXT: v_mov_b32_e32 v15, s57 +; SI-NEXT: v_mov_b32_e32 v16, s8 +; SI-NEXT: v_mov_b32_e32 v17, s60 +; SI-NEXT: v_mov_b32_e32 v18, s58 +; SI-NEXT: v_mov_b32_e32 v19, s56 +; SI-NEXT: v_mov_b32_e32 v20, s9 +; SI-NEXT: v_mov_b32_e32 v21, s47 +; SI-NEXT: v_mov_b32_e32 v22, s59 +; SI-NEXT: v_mov_b32_e32 v23, s89 +; SI-NEXT: v_mov_b32_e32 v24, s10 +; SI-NEXT: v_mov_b32_e32 v25, s74 +; SI-NEXT: v_mov_b32_e32 v26, s62 +; SI-NEXT: v_mov_b32_e32 v27, s72 +; SI-NEXT: v_mov_b32_e32 v28, s11 +; SI-NEXT: v_mov_b32_e32 v29, s88 +; SI-NEXT: v_mov_b32_e32 v30, s90 +; SI-NEXT: v_mov_b32_e32 v31, s91 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: 
; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v16i16_to_v32i8_scalar: @@ -32019,234 +32074,229 @@ define inreg <16 x i16> @bitcast_v32i8_to_v16i16_scalar(<32 x i8> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; SI-NEXT: v_mov_b32_e32 v19, v14 -; SI-NEXT: v_mov_b32_e32 v20, v12 -; SI-NEXT: v_readfirstlane_b32 s13, v11 -; SI-NEXT: v_readfirstlane_b32 s14, v10 -; SI-NEXT: v_readfirstlane_b32 s9, v3 -; SI-NEXT: v_readfirstlane_b32 s10, v2 -; SI-NEXT: v_readfirstlane_b32 s7, v1 -; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_mov_b32_e32 v22, v14 +; SI-NEXT: v_mov_b32_e32 v21, v10 +; SI-NEXT: v_readfirstlane_b32 s43, v1 +; SI-NEXT: v_readfirstlane_b32 s42, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v17 ; SI-NEXT: s_cbranch_scc0 .LBB99_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: s_or_b32 s11, s4, s5 -; 
SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s8, s19, 24 -; SI-NEXT: s_or_b32 s4, s8, s4 -; SI-NEXT: s_and_b32 s8, s28, 0xff -; SI-NEXT: s_lshl_b32 s12, s29, 8 -; SI-NEXT: s_or_b32 s8, s8, s12 -; SI-NEXT: s_and_b32 s12, s6, 0xff -; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s15, s7, 24 -; SI-NEXT: s_or_b32 s41, s15, s12 -; SI-NEXT: s_and_b32 s12, s26, 0xff -; SI-NEXT: s_lshl_b32 s12, s12, 16 -; SI-NEXT: s_lshl_b32 s15, s27, 24 -; SI-NEXT: s_or_b32 s12, s15, s12 -; SI-NEXT: s_and_b32 s15, s16, 0xff -; SI-NEXT: s_lshl_b32 s40, s17, 8 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v8 -; SI-NEXT: s_or_b32 s15, s15, s40 -; SI-NEXT: v_or_b32_e32 v9, v9, v2 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_and_b32 s15, s15, 0xffff -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v11, v0, v10 -; SI-NEXT: s_or_b32 s15, s15, s4 +; SI-NEXT: s_or_b32 s12, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s12 ; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s40, s25, 8 -; SI-NEXT: v_or_b32_e32 v10, v9, v11 +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_or_b32 s14, s7, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_or_b32 s13, s5, s7 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 16 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v12 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v8 +; SI-NEXT: s_and_b32 s5, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v16 +; SI-NEXT: s_or_b32 s5, s5, s9 +; SI-NEXT: s_and_b32 s9, s42, 0xff +; SI-NEXT: v_or_b32_e32 v9, v9, v23 +; SI-NEXT: v_or_b32_e32 v13, v24, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v15, v0, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s10, s43, 24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 -; SI-NEXT: s_or_b32 s4, s4, s40 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v5, s12 -; SI-NEXT: v_or_b32_e32 v12, v3, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v19 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v16 -; SI-NEXT: s_or_b32 s12, s4, s12 -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s40, s9, 8 -; SI-NEXT: v_or_b32_e32 v9, v9, v21 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: s_or_b32 s4, s4, s40 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v15, v7, v13 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v14, v9, v15 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v18, s4, v12 -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s40, s13, 8 -; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: v_or_b32_e32 v10, v10, v1 +; SI-NEXT: v_or_b32_e32 v14, v14, v7 +; SI-NEXT: v_or_b32_e32 v26, v5, v17 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v21 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s12, s10, s9 ; SI-NEXT: v_lshlrev_b32_e32 
v9, 16, v9 -; SI-NEXT: s_or_b32 s4, s4, s40 -; SI-NEXT: s_or_b32 s8, s8, s41 -; SI-NEXT: v_or_b32_e32 v22, v17, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v17, v17, v25 +; SI-NEXT: s_or_b32 s15, s5, s12 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_alignbit_b32 v1, s11, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, s8, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v22, 16 -; SI-NEXT: v_or_b32_e32 v12, s4, v22 -; SI-NEXT: s_lshr_b32 s40, s5, 16 -; SI-NEXT: s_lshr_b32 s41, s41, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_or_b32_e32 v9, v3, v9 +; SI-NEXT: v_or_b32_e32 v10, v10, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v26 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 16 +; SI-NEXT: s_or_b32 s4, s4, s14 +; SI-NEXT: v_or_b32_e32 v19, v11, v9 +; SI-NEXT: v_mov_b32_e32 v20, v10 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_or_b32_e32 v17, v17, v13 +; SI-NEXT: v_mov_b32_e32 v18, v14 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: s_lshr_b32 s9, s7, 16 +; SI-NEXT: s_lshr_b32 s11, s12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; SI-NEXT: s_mov_b32 s7, s13 +; SI-NEXT: s_mov_b32 s5, s15 ; SI-NEXT: s_cbranch_execnz .LBB99_3 ; SI-NEXT: .LBB99_2: ; %cmp.true -; SI-NEXT: s_add_i32 s14, s14, 3 -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s5, s13, 8 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v19 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: s_add_i32 s10, s10, 3 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v1 -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s9, 8 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_and_b32 s6, s26, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s27, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s12, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s7, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; 
SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v21 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s28, 0xff +; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v12 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s42, 0xff +; SI-NEXT: v_or_b32_e32 v9, v25, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v2, v23, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s43, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s8, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v24, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s7, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s18, 0xff +; SI-NEXT: v_add_i32_e32 v17, vcc, 0x3000000, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v22 +; SI-NEXT: v_add_i32_e32 v19, vcc, 0x3000000, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v6 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 -; SI-NEXT: s_add_i32 s15, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s20, 0xff +; SI-NEXT: s_lshl_b32 s8, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s9, s22, 0xff +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s23, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 ; 
SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: s_add_i32 s11, s4, 0x3000000 -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_alignbit_b32 v1, s11, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: v_alignbit_b32 v5, s8, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v18, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: s_lshr_b32 s40, s11, 16 -; SI-NEXT: s_lshr_b32 s41, s8, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v5 +; SI-NEXT: v_add_i32_e32 v20, vcc, 0x3000000, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[17:18], 16 +; SI-NEXT: s_lshr_b32 s9, s7, 16 +; SI-NEXT: s_lshr_b32 s11, s5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v18 ; SI-NEXT: .LBB99_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: v_mov_b32_e32 v3, s40 -; SI-NEXT: v_mov_b32_e32 v4, s12 -; SI-NEXT: v_mov_b32_e32 v6, s8 -; SI-NEXT: v_mov_b32_e32 v7, s41 -; SI-NEXT: v_mov_b32_e32 v8, v18 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v6, s5 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, v19 +; SI-NEXT: v_mov_b32_e32 v10, v20 +; SI-NEXT: v_mov_b32_e32 v12, v17 +; SI-NEXT: v_mov_b32_e32 v14, v18 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: s_branch .LBB99_2 ; @@ -35384,116 +35434,121 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-LABEL: bitcast_v16f16_to_v32i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v34, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s16 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: 
v_cvt_f16_f32_e32 v32, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v32, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v37, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v53, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v14, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v48, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s22 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f16_f32_e32 v40, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v22, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v38, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v51, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s26 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v42, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v41, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB105_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 -; SI-NEXT: v_or_b32_e32 v8, v36, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v14 -; SI-NEXT: v_or_b32_e32 v12, v35, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 -; SI-NEXT: v_or_b32_e32 v16, v39, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_or_b32_e32 v20, v38, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_or_b32_e32 v48, v16, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v24, v50, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v30 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 -; SI-NEXT: v_or_b32_e32 v4, v32, v1 -; SI-NEXT: v_or_b32_e32 v28, v49, v5 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_or_b32_e32 v49, v8, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_or_b32_e32 v35, v52, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 +; SI-NEXT: v_or_b32_e32 v36, v39, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_or_b32_e32 v37, v55, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_or_b32_e32 v38, v54, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 +; SI-NEXT: v_lshr_b64 v[11:12], v[35:36], 24 +; SI-NEXT: v_or_b32_e32 v33, v41, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[35:36], 16 +; SI-NEXT: v_or_b32_e32 v34, v0, v2 +; SI-NEXT: v_lshr_b64 v[24:25], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[35:36], 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v36 +; SI-NEXT: v_lshrrev_b32_e32 v21, 
8, v38 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[37:38], 24 +; SI-NEXT: v_lshr_b64 v[17:18], v[37:38], 8 +; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 +; SI-NEXT: v_lshr_b64 v[50:51], v[33:34], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 ; SI-NEXT: s_cbranch_execnz .LBB105_3 ; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v49 -; SI-NEXT: v_or_b32_e32 v24, v1, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v33, v2, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v55 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v30 -; SI-NEXT: v_or_b32_e32 v28, v2, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_or_b32_e32 v34, v0, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v22 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v16, v3, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v37 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; SI-NEXT: v_or_b32_e32 v37, v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v53 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v0 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 -; SI-NEXT: v_or_b32_e32 v20, v0, v2 +; SI-NEXT: v_or_b32_e32 v38, v0, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_or_b32_e32 v8, v2, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v34 +; SI-NEXT: v_or_b32_e32 v35, v2, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v32 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v14 -; SI-NEXT: v_or_b32_e32 v12, v1, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_or_b32_e32 v36, v1, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 @@ -35503,60 +35558,72 @@ define inreg <32 x i8> @bitcast_v16f16_to_v32i8_scalar(<16 x half> inreg %a, i32 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v2, v1 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_or_b32_e32 v48, v1, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v6 +; SI-NEXT: v_or_b32_e32 v49, v2, v0 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 +; SI-NEXT: v_lshr_b64 v[11:12], v[35:36], 24 +; SI-NEXT: v_lshr_b64 v[24:25], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 +; SI-NEXT: v_lshr_b64 v[12:13], v[35:36], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[35:36], 8 +; SI-NEXT: v_lshr_b64 v[19:20], v[37:38], 24 +; SI-NEXT: v_lshr_b64 v[17:18], v[37:38], 8 +; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 +; SI-NEXT: v_lshr_b64 v[50:51], v[33:34], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v36 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v38 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v15, v14, 8, 8 ; SI-NEXT: v_bfe_u32 v23, v22, 8, 8 ; SI-NEXT: v_bfe_u32 v31, v30, 8, 8 ; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v48 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v4, v49 +; SI-NEXT: v_mov_b32_e32 v8, v35 +; SI-NEXT: v_mov_b32_e32 v10, v12 +; SI-NEXT: v_mov_b32_e32 v12, v36 +; SI-NEXT: v_mov_b32_e32 v16, v37 +; SI-NEXT: v_mov_b32_e32 v18, v24 +; SI-NEXT: v_mov_b32_e32 v20, v38 +; SI-NEXT: v_mov_b32_e32 v24, v33 +; SI-NEXT: v_mov_b32_e32 v26, v50 +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB105_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 -; 
SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_branch .LBB105_2 ; ; VI-LABEL: bitcast_v16f16_to_v32i8_scalar: @@ -38793,166 +38860,186 @@ define inreg <32 x i8> @bitcast_v16bf16_to_v32i8_scalar(<16 x bfloat> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mul_f32_e64 v34, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v35, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v38, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v39, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s23 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v50, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v51, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s27 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v54, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v55, 1.0, s28 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v0 +; SI-NEXT: v_mul_f32_e64 v24, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v52, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v39, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v50, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s25 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mul_f32_e64 v40, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mul_f32_e64 v42, 1.0, s29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mul_f32_e64 v43, 1.0, s28 +; SI-NEXT: v_mul_f32_e32 v41, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v38 -; SI-NEXT: v_alignbit_b32 v8, v5, v39, 16 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v50 -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v32 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v36 -; SI-NEXT: v_alignbit_b32 v16, v5, v51, 16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v48 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v54 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v52 -; SI-NEXT: v_alignbit_b32 v0, v0, v35, 16 -; SI-NEXT: v_alignbit_b32 v4, v6, v33, 16 -; SI-NEXT: v_alignbit_b32 v12, v14, v37, 16 -; SI-NEXT: v_alignbit_b32 v20, v22, v49, 16 -; SI-NEXT: v_alignbit_b32 v24, v5, 
v55, 16 -; SI-NEXT: v_alignbit_b32 v28, v30, v53, 16 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v32 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v36 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v48 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v52 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_alignbit_b32 v48, v1, v32, 16 +; SI-NEXT: v_alignbit_b32 v49, v6, v16, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_alignbit_b32 v37, v2, v52, 16 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v39 +; SI-NEXT: v_alignbit_b32 v35, v2, v40, 16 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v41 +; SI-NEXT: v_alignbit_b32 v38, v14, v50, 16 +; SI-NEXT: v_alignbit_b32 v36, v22, v54, 16 +; SI-NEXT: v_alignbit_b32 v33, v2, v43, 16 +; SI-NEXT: v_alignbit_b32 v34, v30, v0, 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 +; SI-NEXT: v_lshr_b64 v[11:12], v[37:38], 24 +; SI-NEXT: v_lshr_b64 v[19:20], v[35:36], 24 +; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[35:36], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[33:34], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[37:38], 8 +; SI-NEXT: v_lshr_b64 v[17:18], v[35:36], 8 +; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 +; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v39 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v38 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v53 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v36 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v41 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v54 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v55 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v43 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v52 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v53 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v31 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v28, v30, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v55 +; SI-NEXT: 
v_alignbit_b32 v34, v30, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v16, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v48 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v49 +; SI-NEXT: v_alignbit_b32 v35, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v54 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v23 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v20, v22, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v51 +; SI-NEXT: v_alignbit_b32 v36, v22, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v52 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_alignbit_b32 v8, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v36 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v37 +; SI-NEXT: v_alignbit_b32 v37, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v50 ; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v15 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 -; SI-NEXT: v_alignbit_b32 v12, v14, v0, 16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v24 +; SI-NEXT: v_alignbit_b32 v38, v14, v0, 16 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v32 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v48, v1, v0, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_alignbit_b32 v11, v12, v8, 24 -; SI-NEXT: v_alignbit_b32 v10, v12, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, v12, v8, 8 -; SI-NEXT: v_alignbit_b32 v19, v20, v16, 24 -; SI-NEXT: v_alignbit_b32 v18, v20, v16, 16 -; SI-NEXT: v_alignbit_b32 v17, v20, v16, 8 -; SI-NEXT: v_alignbit_b32 v27, v28, v24, 24 -; SI-NEXT: v_alignbit_b32 v26, v28, v24, 16 -; SI-NEXT: v_alignbit_b32 v25, v28, v24, 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v20 -; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v28 +; SI-NEXT: v_alignbit_b32 v49, v6, v0, 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[48:49], 24 +; SI-NEXT: v_lshr_b64 v[11:12], v[37:38], 24 +; SI-NEXT: v_lshr_b64 v[19:20], v[35:36], 24 +; SI-NEXT: v_lshr_b64 v[27:28], v[33:34], 24 +; SI-NEXT: v_lshr_b64 v[4:5], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 8 +; SI-NEXT: v_lshr_b64 v[12:13], v[37:38], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[37:38], 8 +; SI-NEXT: v_lshr_b64 v[20:21], v[35:36], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[35:36], 8 +; SI-NEXT: v_lshr_b64 v[28:29], 
v[33:34], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[33:34], 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v49 +; SI-NEXT: v_lshrrev_b32_e32 v13, 8, v38 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v36 +; SI-NEXT: v_lshrrev_b32_e32 v29, 8, v34 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 ; SI-NEXT: v_lshrrev_b32_e32 v15, 24, v15 ; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v23 ; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 ; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v0, v48 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v4, v49 +; SI-NEXT: v_mov_b32_e32 v8, v37 +; SI-NEXT: v_mov_b32_e32 v10, v12 +; SI-NEXT: v_mov_b32_e32 v12, v38 +; SI-NEXT: v_mov_b32_e32 v16, v35 +; SI-NEXT: v_mov_b32_e32 v18, v20 +; SI-NEXT: v_mov_b32_e32 v20, v36 +; SI-NEXT: v_mov_b32_e32 v24, v33 +; SI-NEXT: v_mov_b32_e32 v26, v28 +; SI-NEXT: v_mov_b32_e32 v28, v34 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr12 +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v16bf16_to_v32i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll index 6cf53d187fcab..57de868ad37b3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.288bit.ll @@ -647,70 +647,65 @@ define inreg <18 x i16> @bitcast_v9i32_to_v18i16_scalar(<9 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s25, 0 ; SI-NEXT: s_cbranch_scc0 .LBB5_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s4, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: 
v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: s_lshr_b32 s7, s21, 16 -; SI-NEXT: s_lshr_b32 s8, s19, 16 -; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: s_lshr_b32 s25, s23, 16 +; SI-NEXT: s_lshr_b32 s26, s21, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s17, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %cmp.true -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s4, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s23, 16 -; SI-NEXT: s_lshr_b32 s7, s21, 16 -; SI-NEXT: s_lshr_b32 s8, s19, 16 -; SI-NEXT: s_lshr_b32 s9, s17, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_lshr_b32 s25, s23, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s26, s21, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s28, s17, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 ; SI-NEXT: .LBB5_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s12 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v3, s28 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s10 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v7, s27 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s6 ; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s7 +; SI-NEXT: v_mov_b32_e32 v11, s26 ; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s4 ; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s6 +; SI-NEXT: v_mov_b32_e32 v15, s25 ; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: 
$sgpr26 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB5_2 ; ; VI-LABEL: bitcast_v9i32_to_v18i16_scalar: @@ -2361,66 +2356,75 @@ define inreg <18 x i16> @bitcast_v9f32_to_v18i16_scalar(<9 x float> inreg %a, i3 ; SI-NEXT: s_cmp_lg_u32 s25, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s4, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s25, s17, 16 +; SI-NEXT: s_lshr_b32 s28, s23, 16 +; SI-NEXT: s_lshr_b32 s27, s21, 16 +; SI-NEXT: s_lshr_b32 s26, s19, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_4 ; SI-NEXT: .LBB13_2: ; %cmp.true ; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v17, s4, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v26, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v19, s22, 1.0 +; SI-NEXT: v_lshr_b64 v[13:14], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v26 +; SI-NEXT: s_branch .LBB13_5 ; SI-NEXT: .LBB13_3: -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: s_branch .LBB13_2 ; SI-NEXT: .LBB13_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: 
v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v25, s16 +; SI-NEXT: v_mov_b32_e32 v26, s17 +; SI-NEXT: v_mov_b32_e32 v23, s18 +; SI-NEXT: v_mov_b32_e32 v24, s19 +; SI-NEXT: v_mov_b32_e32 v21, s20 +; SI-NEXT: v_mov_b32_e32 v22, s21 +; SI-NEXT: v_mov_b32_e32 v19, s22 +; SI-NEXT: v_mov_b32_e32 v20, s23 ; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_mov_b32_e32 v11, s8 -; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v3, s25 +; SI-NEXT: v_mov_b32_e32 v7, s26 +; SI-NEXT: v_mov_b32_e32 v11, s27 +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v17, s8 +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v9, s6 +; SI-NEXT: v_mov_b32_e32 v13, s4 +; SI-NEXT: .LBB13_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v25 +; SI-NEXT: v_mov_b32_e32 v2, v26 +; SI-NEXT: v_mov_b32_e32 v4, v23 +; SI-NEXT: v_mov_b32_e32 v6, v24 +; SI-NEXT: v_mov_b32_e32 v8, v21 +; SI-NEXT: v_mov_b32_e32 v10, v22 +; SI-NEXT: v_mov_b32_e32 v12, v19 +; SI-NEXT: v_mov_b32_e32 v14, v20 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v9f32_to_v18i16_scalar: @@ -4659,112 +4663,114 @@ define inreg <18 x i16> @bitcast_v18f16_to_v18i16_scalar(<18 x half> inreg %a, i ; SI-LABEL: bitcast_v18f16_to_v18i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 -; SI-NEXT: v_mov_b32_e32 v15, v1 -; SI-NEXT: v_mov_b32_e32 v14, v0 -; SI-NEXT: v_mov_b32_e32 v18, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v9, v2 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: 
v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v18 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_or_b32_e32 v16, v16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_or_b32_e32 v10, v10, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v14, v14, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v7 ; SI-NEXT: v_or_b32_e32 v6, v6, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: 
v_or_b32_e32 v2, v2, v18 +; SI-NEXT: v_lshr_b64 v[18:19], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[13:14], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 ; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v23 +; SI-NEXT: v_mov_b32_e32 v5, v18 +; SI-NEXT: v_mov_b32_e32 v9, v21 +; SI-NEXT: v_mov_b32_e32 v13, v19 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: s_branch .LBB23_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 2abb2f3b9de52..3aaf25423a184 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -681,76 +681,71 @@ define inreg <20 x i16> @bitcast_v10i32_to_v20i16_scalar(<10 x i32> inreg %a, i3 ; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB5_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: s_lshr_b32 s7, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s27, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %cmp.true -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: s_lshr_b32 s7, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 
s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s27, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 ; SI-NEXT: .LBB5_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s12 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v3, s40 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s10 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v7, s29 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s8 ; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v11, s28 ; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s4 ; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s6 +; SI-NEXT: v_mov_b32_e32 v19, s26 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB5_2 ; ; VI-LABEL: bitcast_v10i32_to_v20i16_scalar: @@ -3371,241 +3366,239 @@ define inreg <40 x i8> @bitcast_v10i32_to_v40i8_scalar(<10 x i32> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v3, s24 -; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v9, s20 -; SI-NEXT: v_mov_b32_e32 v12, s18 -; SI-NEXT: v_mov_b32_e32 v15, s16 -; SI-NEXT: v_alignbit_b32 v1, s25, v3, 24 -; SI-NEXT: v_alignbit_b32 v2, s25, v3, 16 -; SI-NEXT: v_alignbit_b32 v3, s25, v3, 8 -; SI-NEXT: v_alignbit_b32 v4, s23, v6, 24 -; SI-NEXT: v_alignbit_b32 v5, s23, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, s23, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s21, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s21, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s21, v9, 8 -; SI-NEXT: v_alignbit_b32 v10, s19, v12, 24 -; SI-NEXT: v_alignbit_b32 v11, s19, v12, 16 -; SI-NEXT: v_alignbit_b32 v12, s19, v12, 8 -; SI-NEXT: v_alignbit_b32 v13, s17, v15, 24 -; SI-NEXT: v_alignbit_b32 v14, s17, v15, 16 -; SI-NEXT: v_alignbit_b32 v15, s17, v15, 8 -; SI-NEXT: s_lshr_b32 s6, s25, 24 -; SI-NEXT: s_lshr_b32 s7, s25, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 8 -; SI-NEXT: s_lshr_b32 s9, s23, 24 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s23, 8 -; SI-NEXT: s_lshr_b32 s12, s21, 24 -; SI-NEXT: s_lshr_b32 s13, s21, 16 -; SI-NEXT: s_lshr_b32 s14, s21, 8 -; SI-NEXT: s_lshr_b32 s15, s19, 24 -; SI-NEXT: s_lshr_b32 s26, s19, 16 -; SI-NEXT: s_lshr_b32 s27, s19, 8 -; SI-NEXT: s_lshr_b32 s28, s17, 24 -; SI-NEXT: s_lshr_b32 s29, s17, 16 -; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: s_lshr_b32 s72, s25, 24 +; SI-NEXT: s_lshr_b32 s73, s25, 16 +; SI-NEXT: s_lshr_b32 s74, 
s25, 8 +; SI-NEXT: s_lshr_b32 s75, s23, 24 +; SI-NEXT: s_lshr_b32 s76, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s23, 8 +; SI-NEXT: s_lshr_b32 s78, s21, 24 +; SI-NEXT: s_lshr_b32 s79, s21, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 8 +; SI-NEXT: s_lshr_b32 s89, s19, 24 +; SI-NEXT: s_lshr_b32 s90, s19, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 8 +; SI-NEXT: s_lshr_b32 s92, s17, 24 +; SI-NEXT: s_lshr_b32 s93, s17, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 ; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: v_mov_b32_e32 v3, s24 -; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v9, s20 -; SI-NEXT: v_mov_b32_e32 v12, s18 -; SI-NEXT: v_mov_b32_e32 v15, s16 -; SI-NEXT: v_alignbit_b32 v1, s25, v3, 24 -; SI-NEXT: v_alignbit_b32 v2, s25, v3, 16 -; SI-NEXT: v_alignbit_b32 v3, s25, v3, 8 -; SI-NEXT: v_alignbit_b32 v4, s23, v6, 24 -; SI-NEXT: v_alignbit_b32 v5, s23, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, s23, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s21, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s21, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s21, v9, 8 -; SI-NEXT: v_alignbit_b32 v10, s19, v12, 24 -; SI-NEXT: v_alignbit_b32 v11, s19, v12, 16 -; SI-NEXT: v_alignbit_b32 v12, s19, v12, 8 -; SI-NEXT: v_alignbit_b32 v13, s17, v15, 24 -; SI-NEXT: v_alignbit_b32 v14, s17, v15, 16 -; SI-NEXT: v_alignbit_b32 v15, s17, v15, 8 -; SI-NEXT: s_lshr_b32 s6, s25, 24 -; SI-NEXT: s_lshr_b32 s7, s25, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 8 -; SI-NEXT: s_lshr_b32 s9, s23, 24 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s23, 8 -; SI-NEXT: s_lshr_b32 s12, s21, 24 -; SI-NEXT: s_lshr_b32 s13, s21, 16 -; SI-NEXT: s_lshr_b32 s14, s21, 8 -; SI-NEXT: s_lshr_b32 s15, s19, 24 -; SI-NEXT: s_lshr_b32 s26, s19, 16 -; SI-NEXT: s_lshr_b32 s27, s19, 8 -; SI-NEXT: s_lshr_b32 s28, s17, 24 -; SI-NEXT: s_lshr_b32 s29, s17, 16 -; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; SI-NEXT: s_lshr_b32 s72, s25, 24 +; SI-NEXT: s_lshr_b32 
s73, s25, 16 +; SI-NEXT: s_lshr_b32 s74, s25, 8 +; SI-NEXT: s_lshr_b32 s75, s23, 24 +; SI-NEXT: s_lshr_b32 s76, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s23, 8 +; SI-NEXT: s_lshr_b32 s78, s21, 24 +; SI-NEXT: s_lshr_b32 s79, s21, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 8 +; SI-NEXT: s_lshr_b32 s89, s19, 24 +; SI-NEXT: s_lshr_b32 s90, s19, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 8 +; SI-NEXT: s_lshr_b32 s92, s17, 24 +; SI-NEXT: s_lshr_b32 s93, s17, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 8 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 8 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; SI-NEXT: v_or_b32_e32 v15, s4, v15 -; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s29, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s28, 24 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 -; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s26, 0xff -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s15, s15, 24 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s15, s5 -; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s5, s60, 8 +; SI-NEXT: s_and_b32 s7, s16, 0xff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_and_b32 s7, s58, 0xff +; SI-NEXT: s_lshl_b32 s9, s56, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xff +; SI-NEXT: s_lshl_b32 s7, s94, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s93, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s92, 24 +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: s_and_b32 s7, s18, 0xff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_and_b32 s7, s44, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s42, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, 
v9 -; SI-NEXT: s_and_b32 s4, s21, 0xff -; SI-NEXT: s_lshl_b32 s5, s14, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s13, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s12, s12, 24 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s11, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s10, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s9, s9, 24 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s9, s5 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s19, 0xff +; SI-NEXT: s_lshl_b32 s7, s91, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s90, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s89, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s40, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s26, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s21, 0xff +; SI-NEXT: s_lshl_b32 s7, s88, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s79, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s78, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s14, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s12, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: 
s_lshl_b32 s9, s10, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s23, 0xff +; SI-NEXT: s_lshl_b32 s7, s77, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s76, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s75, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s8, 8 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s4, s4, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s25, 0xff -; SI-NEXT: s_lshl_b32 s5, s8, 8 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_lshl_b32 s5, s74, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s7, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_and_b32 s5, s73, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s6, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_lshl_b32 s6, s72, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr14 -; 
SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v10i32_to_v40i8_scalar: @@ -7591,72 +7584,83 @@ define inreg <20 x i16> @bitcast_v10f32_to_v20i16_scalar(<10 x float> inreg %a, ; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB25_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s29, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB25_4 ; SI-NEXT: .LBB25_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v29, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v21, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v20, s24, 1.0 +; SI-NEXT: v_lshr_b64 v[17:18], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[28:29], 16 +; SI-NEXT: 
v_lshrrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 +; SI-NEXT: s_branch .LBB25_5 ; SI-NEXT: .LBB25_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: s_branch .LBB25_2 ; SI-NEXT: .LBB25_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_mov_b32_e32 v11, s8 -; SI-NEXT: v_mov_b32_e32 v15, s9 -; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: v_mov_b32_e32 v28, s16 +; SI-NEXT: v_mov_b32_e32 v29, s17 +; SI-NEXT: v_mov_b32_e32 v26, s18 +; SI-NEXT: v_mov_b32_e32 v27, s19 +; SI-NEXT: v_mov_b32_e32 v24, s20 +; SI-NEXT: v_mov_b32_e32 v25, s21 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_mov_b32_e32 v21, s25 +; SI-NEXT: v_mov_b32_e32 v3, s26 +; SI-NEXT: v_mov_b32_e32 v7, s27 +; SI-NEXT: v_mov_b32_e32 v11, s28 +; SI-NEXT: v_mov_b32_e32 v15, s29 +; SI-NEXT: v_mov_b32_e32 v19, s40 +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v9, s8 +; SI-NEXT: v_mov_b32_e32 v13, s6 +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: .LBB25_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v28 +; SI-NEXT: v_mov_b32_e32 v2, v29 +; SI-NEXT: v_mov_b32_e32 v4, v26 +; SI-NEXT: v_mov_b32_e32 v6, v27 +; SI-NEXT: v_mov_b32_e32 v8, v24 +; SI-NEXT: v_mov_b32_e32 v10, v25 +; SI-NEXT: v_mov_b32_e32 v12, v22 +; SI-NEXT: v_mov_b32_e32 v14, v23 +; SI-NEXT: v_mov_b32_e32 v16, v20 +; SI-NEXT: v_mov_b32_e32 v18, v21 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v10f32_to_v20i16_scalar: @@ -10305,256 +10309,261 @@ define inreg <40 x i8> @bitcast_v10f32_to_v40i8_scalar(<10 x float> inreg %a, i3 ; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB33_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v3, s24 -; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v9, s20 -; SI-NEXT: v_mov_b32_e32 v12, s18 -; SI-NEXT: v_mov_b32_e32 v15, s16 -; SI-NEXT: v_alignbit_b32 v1, s25, v3, 24 -; SI-NEXT: v_alignbit_b32 v2, s25, v3, 16 -; SI-NEXT: v_alignbit_b32 v3, s25, v3, 8 -; SI-NEXT: v_alignbit_b32 v4, s23, v6, 24 -; SI-NEXT: v_alignbit_b32 v5, s23, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, s23, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s21, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s21, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s21, v9, 8 -; SI-NEXT: v_alignbit_b32 v10, s19, v12, 24 -; SI-NEXT: v_alignbit_b32 v11, s19, v12, 16 -; SI-NEXT: v_alignbit_b32 v12, s19, v12, 8 -; SI-NEXT: 
v_alignbit_b32 v13, s17, v15, 24 -; SI-NEXT: v_alignbit_b32 v14, s17, v15, 16 -; SI-NEXT: v_alignbit_b32 v15, s17, v15, 8 -; SI-NEXT: s_lshr_b32 s28, s25, 24 -; SI-NEXT: s_lshr_b32 s29, s25, 16 -; SI-NEXT: s_lshr_b32 s40, s25, 8 -; SI-NEXT: s_lshr_b32 s15, s23, 24 -; SI-NEXT: s_lshr_b32 s26, s23, 16 -; SI-NEXT: s_lshr_b32 s27, s23, 8 -; SI-NEXT: s_lshr_b32 s12, s21, 24 -; SI-NEXT: s_lshr_b32 s13, s21, 16 -; SI-NEXT: s_lshr_b32 s14, s21, 8 -; SI-NEXT: s_lshr_b32 s9, s19, 24 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: s_lshr_b32 s11, s19, 8 -; SI-NEXT: s_lshr_b32 s6, s17, 24 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_lshr_b32 s91, s25, 24 +; SI-NEXT: s_lshr_b32 s93, s25, 16 +; SI-NEXT: s_lshr_b32 s94, s25, 8 +; SI-NEXT: s_lshr_b32 s88, s23, 24 +; SI-NEXT: s_lshr_b32 s90, s23, 16 +; SI-NEXT: s_lshr_b32 s92, s23, 8 +; SI-NEXT: s_lshr_b32 s77, s21, 24 +; SI-NEXT: s_lshr_b32 s79, s21, 16 +; SI-NEXT: s_lshr_b32 s89, s21, 8 +; SI-NEXT: s_lshr_b32 s74, s19, 24 +; SI-NEXT: s_lshr_b32 s76, s19, 16 +; SI-NEXT: s_lshr_b32 s78, s19, 8 +; SI-NEXT: s_lshr_b32 s72, s17, 24 +; SI-NEXT: s_lshr_b32 s73, s17, 16 +; SI-NEXT: s_lshr_b32 s75, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB33_4 ; SI-NEXT: .LBB33_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v31, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v34, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v28, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v29, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v23, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v21, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v17, s24, 1.0 -; SI-NEXT: v_alignbit_b32 v1, v16, v17, 24 -; SI-NEXT: v_alignbit_b32 v2, v16, v17, 16 -; SI-NEXT: v_alignbit_b32 v3, v16, v17, 8 -; SI-NEXT: v_alignbit_b32 v4, v18, v21, 24 -; SI-NEXT: v_alignbit_b32 v5, v18, v21, 16 -; SI-NEXT: v_alignbit_b32 v6, v18, v21, 8 -; SI-NEXT: v_alignbit_b32 v7, v23, v24, 24 -; SI-NEXT: v_alignbit_b32 v8, v23, v24, 16 -; SI-NEXT: v_alignbit_b32 v9, v23, v24, 8 -; SI-NEXT: v_alignbit_b32 v10, v28, v29, 24 -; SI-NEXT: v_alignbit_b32 v11, v28, v29, 16 -; SI-NEXT: v_alignbit_b32 v12, v28, v29, 8 -; SI-NEXT: v_alignbit_b32 v13, v31, v34, 24 -; SI-NEXT: v_alignbit_b32 v14, v31, v34, 16 -; SI-NEXT: v_alignbit_b32 v15, v31, v34, 8 -; SI-NEXT: v_lshrrev_b32_e32 v19, 24, v16 -; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 24, v18 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v18 -; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v23 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v33, 8, v23 -; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v28 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v28 -; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v31 
-; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v31 +; SI-NEXT: v_add_f32_e64 v5, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v12, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v13, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s18, 1.0 +; SI-NEXT: v_readfirstlane_b32 s16, v6 +; SI-NEXT: v_readfirstlane_b32 s17, v5 +; SI-NEXT: v_add_f32_e64 v3, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s24, 1.0 +; SI-NEXT: s_lshr_b64 s[26:27], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 8 +; SI-NEXT: v_readfirstlane_b32 s16, v10 +; SI-NEXT: v_readfirstlane_b32 s17, v7 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[16:17], 8 +; SI-NEXT: v_readfirstlane_b32 s16, v13 +; SI-NEXT: v_readfirstlane_b32 s17, v12 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 8 +; SI-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v11, 8, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v5 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v19, 8, v5 +; SI-NEXT: v_lshrrev_b32_e32 v20, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v12 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v12 ; SI-NEXT: s_branch .LBB33_5 ; SI-NEXT: .LBB33_3: -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr27 -; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr78 +; 
SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB33_2 ; SI-NEXT: .LBB33_4: -; SI-NEXT: v_mov_b32_e32 v34, s16 -; SI-NEXT: v_mov_b32_e32 v31, s17 -; SI-NEXT: v_mov_b32_e32 v29, s18 -; SI-NEXT: v_mov_b32_e32 v28, s19 -; SI-NEXT: v_mov_b32_e32 v24, s20 -; SI-NEXT: v_mov_b32_e32 v23, s21 -; SI-NEXT: v_mov_b32_e32 v21, s22 -; SI-NEXT: v_mov_b32_e32 v18, s23 -; SI-NEXT: v_mov_b32_e32 v17, s24 -; SI-NEXT: v_mov_b32_e32 v16, s25 -; SI-NEXT: v_mov_b32_e32 v48, s8 -; SI-NEXT: v_mov_b32_e32 v39, s7 -; SI-NEXT: v_mov_b32_e32 v38, s6 -; SI-NEXT: v_mov_b32_e32 v37, s11 -; SI-NEXT: v_mov_b32_e32 v36, s10 -; SI-NEXT: v_mov_b32_e32 v35, s9 -; SI-NEXT: v_mov_b32_e32 v33, s14 -; SI-NEXT: v_mov_b32_e32 v32, s13 -; SI-NEXT: v_mov_b32_e32 v30, s12 -; SI-NEXT: v_mov_b32_e32 v27, s27 -; SI-NEXT: v_mov_b32_e32 v26, s26 -; SI-NEXT: v_mov_b32_e32 v25, s15 -; SI-NEXT: v_mov_b32_e32 v22, s40 -; SI-NEXT: v_mov_b32_e32 v20, s29 -; SI-NEXT: v_mov_b32_e32 v19, s28 +; SI-NEXT: v_mov_b32_e32 v13, s16 +; SI-NEXT: v_mov_b32_e32 v12, s17 +; SI-NEXT: v_mov_b32_e32 v10, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v6, s20 +; SI-NEXT: v_mov_b32_e32 v5, s21 +; SI-NEXT: v_mov_b32_e32 v4, s22 +; SI-NEXT: v_mov_b32_e32 v3, s23 +; SI-NEXT: v_mov_b32_e32 v2, s24 +; SI-NEXT: v_mov_b32_e32 v1, s25 +; SI-NEXT: v_mov_b32_e32 v25, s75 +; SI-NEXT: v_mov_b32_e32 v24, s73 +; SI-NEXT: v_mov_b32_e32 v23, s72 +; SI-NEXT: v_mov_b32_e32 v22, s78 +; SI-NEXT: v_mov_b32_e32 v21, s76 +; SI-NEXT: v_mov_b32_e32 v20, s74 +; SI-NEXT: v_mov_b32_e32 v19, s89 +; SI-NEXT: v_mov_b32_e32 v18, s79 +; SI-NEXT: v_mov_b32_e32 v17, s77 +; SI-NEXT: v_mov_b32_e32 v16, s92 +; SI-NEXT: v_mov_b32_e32 v15, s90 +; SI-NEXT: v_mov_b32_e32 v14, s88 +; SI-NEXT: v_mov_b32_e32 v11, s94 +; SI-NEXT: v_mov_b32_e32 v9, s93 +; SI-NEXT: v_mov_b32_e32 v8, s91 ; SI-NEXT: .LBB33_5: ; %end -; SI-NEXT: v_and_b32_e32 v34, 0xff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: v_or_b32_e32 v15, v34, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v48 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v38 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: s_lshl_b32 s5, s60, 8 +; SI-NEXT: v_or_b32_e32 v13, s5, v13 +; SI-NEXT: s_and_b32 s5, s58, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s56, 24 ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 
v14, v15, v14 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: v_or_b32_e32 v13, s5, v13 +; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_or_b32_e32 v12, v13, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v25 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 +; SI-NEXT: v_or_b32_e32 v10, s5, v10 +; SI-NEXT: s_and_b32 s5, s44, 0xff ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v37 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v35 +; SI-NEXT: v_or_b32_e32 v13, v23, v13 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s42, 24 +; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v0 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v10, s5, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v12, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v33 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v30 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v22 +; SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v21 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: s_lshl_b32 s5, s40, 8 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v20 +; SI-NEXT: v_or_b32_e32 v6, s5, v6 +; SI-NEXT: s_and_b32 s5, s28, 0xff ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s26, 24 +; 
SI-NEXT: v_or_b32_e32 v7, v7, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v6, s5, v6 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v27 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v25 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v19 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v18 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_lshl_b32 s5, s14, 8 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_or_b32_e32 v4, s5, v4 +; SI-NEXT: s_and_b32 s5, s12, 0xff +; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s10, 24 +; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 ; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v4, s5, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v16 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v15 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v14 +; SI-NEXT: v_or_b32_e32 v2, s5, v2 +; SI-NEXT: s_and_b32 s5, s6, 0xff ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s4, s4, 24 +; SI-NEXT: v_or_b32_e32 v3, v3, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v2, s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 ; 
SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v19 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v8 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -15184,124 +15193,127 @@ define inreg <20 x i16> @bitcast_v20f16_to_v20i16_scalar(<20 x half> inreg %a, i ; SI-LABEL: bitcast_v20f16_to_v20i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 -; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v9, v4 +; SI-NEXT: v_mov_b32_e32 v10, v3 +; SI-NEXT: v_mov_b32_e32 v13, v2 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB47_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: 
v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v24 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v25 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v21 +; SI-NEXT: v_or_b32_e32 v14, v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v18, v18, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_or_b32_e32 v14, v14, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v10, v10, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v20 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 ; SI-NEXT: v_lshlrev_b32_e32 v13, 
16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v19 +; SI-NEXT: v_lshr_b64 v[24:25], v[9:10], 16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v18, v18, v20 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshr_b64 v[25:26], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[17:18], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 ; SI-NEXT: .LBB47_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v23 +; SI-NEXT: v_mov_b32_e32 v5, v21 +; SI-NEXT: v_mov_b32_e32 v9, v24 +; SI-NEXT: v_mov_b32_e32 v13, v25 +; SI-NEXT: v_mov_b32_e32 v17, v26 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: ; SI-NEXT: s_branch .LBB47_2 @@ -16674,330 +16686,353 @@ define inreg <40 x i8> @bitcast_v20i16_to_v40i8_scalar(<20 x i16> inreg %a, i32 ; SI-LABEL: bitcast_v20i16_to_v40i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v8, s30, 0 +; SI-NEXT: v_writelane_b32 v8, s31, 1 +; SI-NEXT: v_writelane_b32 v8, s34, 2 +; SI-NEXT: v_writelane_b32 v8, s35, 3 +; SI-NEXT: v_writelane_b32 v8, s36, 4 +; SI-NEXT: v_writelane_b32 v8, s37, 5 +; SI-NEXT: v_writelane_b32 v8, s38, 6 +; SI-NEXT: v_writelane_b32 v8, s39, 7 +; SI-NEXT: v_writelane_b32 v8, s48, 8 +; SI-NEXT: v_writelane_b32 v8, s49, 9 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_readfirstlane_b32 s72, v6 -; SI-NEXT: v_readfirstlane_b32 s73, v5 -; SI-NEXT: v_readfirstlane_b32 s62, v2 -; SI-NEXT: v_readfirstlane_b32 s63, v1 +; SI-NEXT: v_writelane_b32 v8, s50, 10 +; SI-NEXT: v_readfirstlane_b32 s39, v6 +; SI-NEXT: v_readfirstlane_b32 s48, v5 +; SI-NEXT: v_readfirstlane_b32 s49, v4 +; SI-NEXT: v_readfirstlane_b32 s50, v3 +; SI-NEXT: v_readfirstlane_b32 s35, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v4 +; SI-NEXT: v_readfirstlane_b32 s38, v1 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_or_b32 s12, s4, s5 ; SI-NEXT: s_and_b32 s4, s18, 0xffff ; SI-NEXT: s_lshl_b32 s5, s19, 16 -; SI-NEXT: s_or_b32 s14, s4, s5 +; SI-NEXT: s_or_b32 s13, s4, s5 ; SI-NEXT: s_and_b32 s4, s20, 0xffff ; SI-NEXT: s_lshl_b32 s5, s21, 16 -; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_or_b32 s10, s4, s5 ; SI-NEXT: s_and_b32 s4, s22, 0xffff ; SI-NEXT: s_lshl_b32 s5, s23, 16 -; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_or_b32 s11, s4, s5 ; SI-NEXT: s_and_b32 s4, s24, 0xffff ; SI-NEXT: s_lshl_b32 s5, s25, 16 -; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_or_b32 s8, s4, s5 ; SI-NEXT: s_and_b32 s4, s26, 0xffff ; SI-NEXT: s_lshl_b32 s5, s27, 16 -; SI-NEXT: v_alignbit_b32 v7, s14, v1, 24 -; SI-NEXT: v_alignbit_b32 v12, s14, v1, 16 -; SI-NEXT: v_alignbit_b32 v16, s14, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: s_or_b32 s10, s4, s5 +; SI-NEXT: s_or_b32 s9, s4, s5 ; SI-NEXT: s_and_b32 s4, s28, 0xffff ; SI-NEXT: s_lshl_b32 s5, s29, 16 -; SI-NEXT: 
v_alignbit_b32 v8, s12, v1, 24 -; SI-NEXT: v_alignbit_b32 v13, s12, v1, 16 -; SI-NEXT: v_alignbit_b32 v17, s12, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s9 -; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: s_and_b32 s4, s63, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 -; SI-NEXT: v_alignbit_b32 v6, s10, v1, 24 -; SI-NEXT: v_alignbit_b32 v11, s10, v1, 16 -; SI-NEXT: v_alignbit_b32 v15, s10, v1, 8 -; SI-NEXT: s_or_b32 s8, s4, s5 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_alignbit_b32 v5, s8, v1, 24 -; SI-NEXT: v_alignbit_b32 v9, s8, v1, 16 -; SI-NEXT: v_alignbit_b32 v14, s8, v1, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: s_and_b32 s4, s73, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 -; SI-NEXT: v_or_b32_e32 v1, v1, v18 ; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: v_alignbit_b32 v2, s6, v1, 24 -; SI-NEXT: v_alignbit_b32 v4, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v10, s6, v1, 8 -; SI-NEXT: s_lshr_b32 s59, s14, 8 -; SI-NEXT: s_lshr_b32 s56, s12, 8 -; SI-NEXT: s_lshr_b32 s45, s10, 8 -; SI-NEXT: s_lshr_b32 s42, s8, 8 -; SI-NEXT: s_lshr_b32 s15, s6, 8 -; SI-NEXT: s_and_b32 s60, s19, 0xffff -; SI-NEXT: s_and_b32 s57, s23, 0xffff -; SI-NEXT: s_and_b32 s46, s27, 0xffff -; SI-NEXT: s_and_b32 s43, s62, 0xffff -; SI-NEXT: s_and_b32 s40, s72, 0xffff -; SI-NEXT: s_bfe_u32 s61, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s58, s23, 0x80008 -; SI-NEXT: s_bfe_u32 s47, s27, 0x80008 -; SI-NEXT: s_bfe_u32 s44, s62, 0x80008 -; SI-NEXT: s_bfe_u32 s41, s72, 0x80008 +; SI-NEXT: s_and_b32 s4, s38, 0xffff +; SI-NEXT: s_lshl_b32 s5, s35, 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s50, 0xffff +; SI-NEXT: s_lshl_b32 s5, s49, 16 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s48, 0xffff +; SI-NEXT: s_lshl_b32 s15, s39, 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 +; SI-NEXT: s_or_b32 s5, s5, s15 +; SI-NEXT: s_lshr_b32 s34, s13, 8 +; SI-NEXT: s_lshr_b32 s95, s11, 8 +; SI-NEXT: s_lshr_b32 s59, s9, 8 +; SI-NEXT: s_lshr_b32 s45, s7, 8 +; SI-NEXT: s_lshr_b32 s15, s5, 8 +; SI-NEXT: s_and_b32 s36, s19, 0xffff +; SI-NEXT: s_and_b32 s30, s23, 0xffff +; SI-NEXT: s_and_b32 s61, s27, 0xffff +; SI-NEXT: s_and_b32 s47, s35, 0xffff +; SI-NEXT: s_and_b32 s41, s39, 0xffff +; SI-NEXT: s_bfe_u32 s37, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s31, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s94, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s57, s35, 0x80008 +; SI-NEXT: s_bfe_u32 s43, s39, 0x80008 +; SI-NEXT: s_lshr_b64 s[88:89], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[90:91], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 8 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: s_add_i32 s73, s73, 3 -; SI-NEXT: s_and_b32 s4, s73, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: s_add_i32 s50, s50, 3 +; SI-NEXT: s_and_b32 s4, s50, 0xffff +; SI-NEXT: s_lshl_b32 s5, s49, 16 +; SI-NEXT: s_add_i32 s48, s48, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s48, 0xffff +; SI-NEXT: s_lshl_b32 s6, s39, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s6, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: 
s_lshl_b32 s5, s29, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s63, s63, 3 -; SI-NEXT: s_add_i32 s7, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s63, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s28, 0xffff +; SI-NEXT: s_lshl_b32 s7, s29, 16 +; SI-NEXT: s_add_i32 s38, s38, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s38, 0xffff +; SI-NEXT: s_lshl_b32 s8, s35, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s8, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: s_lshl_b32 s5, s25, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s24, 0xffff +; SI-NEXT: s_lshl_b32 s9, s25, 16 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s9, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: s_lshl_b32 s5, s27, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s10, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: s_lshl_b32 s5, s21, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s20, 0xffff +; SI-NEXT: s_lshl_b32 s11, s21, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s11, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: s_lshl_b32 s5, s23, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s22, 0xffff +; SI-NEXT: s_lshl_b32 s12, s23, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s12, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: s_and_b32 s12, s16, 0xffff +; SI-NEXT: s_lshl_b32 s13, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s13, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v3 -; SI-NEXT: s_add_i32 s14, s4, 0x30000 -; SI-NEXT: v_mov_b32_e32 v2, s13 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_alignbit_b32 v7, s14, v2, 24 -; SI-NEXT: v_alignbit_b32 v12, s14, v2, 16 -; SI-NEXT: v_alignbit_b32 v16, s14, v2, 8 -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: v_or_b32_e32 v1, v18, v1 -; SI-NEXT: v_alignbit_b32 v8, s12, v2, 24 -; SI-NEXT: v_alignbit_b32 v13, s12, v2, 16 -; SI-NEXT: v_alignbit_b32 v17, s12, v2, 8 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_alignbit_b32 v6, s10, v2, 24 -; SI-NEXT: v_alignbit_b32 v11, s10, v2, 16 -; SI-NEXT: v_alignbit_b32 v15, s10, v2, 8 -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: v_alignbit_b32 v5, s8, v2, 24 -; SI-NEXT: v_alignbit_b32 v9, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v14, s8, v2, 8 -; SI-NEXT: v_alignbit_b32 v2, v3, v1, 24 -; SI-NEXT: v_alignbit_b32 v4, v3, v1, 16 -; SI-NEXT: v_alignbit_b32 v10, v3, v1, 8 -; SI-NEXT: s_lshr_b32 s61, s14, 24 -; SI-NEXT: s_lshr_b32 s60, s14, 16 -; SI-NEXT: s_lshr_b32 s59, s14, 8 -; SI-NEXT: s_lshr_b32 s58, s12, 24 -; SI-NEXT: s_lshr_b32 s57, s12, 16 -; SI-NEXT: s_lshr_b32 s56, s12, 8 -; SI-NEXT: s_lshr_b32 s47, s10, 24 -; SI-NEXT: s_lshr_b32 s46, s10, 16 -; SI-NEXT: s_lshr_b32 s45, s10, 8 -; SI-NEXT: s_lshr_b32 s44, s8, 24 -; SI-NEXT: s_lshr_b32 s43, s8, 16 -; SI-NEXT: s_lshr_b32 s42, s8, 8 -; SI-NEXT: s_lshr_b32 s41, s6, 24 -; 
SI-NEXT: s_lshr_b32 s40, s6, 16 -; SI-NEXT: s_lshr_b32 s15, s6, 8 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: s_and_b32 s13, s18, 0xffff +; SI-NEXT: s_lshl_b32 s14, s19, 16 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: s_add_i32 s12, s12, 0x30000 +; SI-NEXT: s_add_i32 s13, s13, 0x30000 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[40:41], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[78:79], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[90:91], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s37, s13, 24 +; SI-NEXT: s_lshr_b32 s36, s13, 16 +; SI-NEXT: s_lshr_b32 s34, s13, 8 +; SI-NEXT: s_lshr_b32 s31, s11, 24 +; SI-NEXT: s_lshr_b32 s30, s11, 16 +; SI-NEXT: s_lshr_b32 s95, s11, 8 +; SI-NEXT: s_lshr_b32 s94, s9, 24 +; SI-NEXT: s_lshr_b32 s61, s9, 16 +; SI-NEXT: s_lshr_b32 s59, s9, 8 +; SI-NEXT: s_lshr_b32 s57, s7, 24 +; SI-NEXT: s_lshr_b32 s47, s7, 16 +; SI-NEXT: s_lshr_b32 s45, s7, 8 +; SI-NEXT: s_lshr_b32 s43, s5, 24 +; SI-NEXT: s_lshr_b32 s41, s5, 16 +; SI-NEXT: s_lshr_b32 s15, s5, 8 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v16 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: s_lshl_b32 s5, s59, 8 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s60, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s13, s61, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v7, v7, v12 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s13, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s16, s56, 8 +; SI-NEXT: s_or_b32 s12, s12, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s14, s14, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xff +; SI-NEXT: s_lshl_b32 s13, s34, 8 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: s_and_b32 s13, s36, 0xff +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: s_lshl_b32 s14, s37, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s12, s60, 8 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s12, s46, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s13, s42, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s12, s13, s12 +; 
SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: s_lshl_b32 s5, s56, 8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v13 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s57, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s11, s58, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s11, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xff +; SI-NEXT: s_lshl_b32 s11, s95, 8 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s30, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s12, s31, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v15 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: s_lshl_b32 s5, s45, 8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s10, s88, 8 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_and_b32 s10, s58, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s11, s44, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v11 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s46, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s9, s47, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s9, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xff +; SI-NEXT: s_lshl_b32 s9, s59, 8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s61, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s10, s94, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: buffer_store_dword v6, 
v3, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v14 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s8, s74, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s90, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s78, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v9 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s43, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s44, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: s_lshl_b32 s7, s45, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s47, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s57, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s6, s76, 8 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s72, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s62, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s7, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v10 -; SI-NEXT: s_and_b32 s4, s6, 0xff +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xff ; SI-NEXT: s_lshl_b32 s5, s15, 8 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s40, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: s_and_b32 s5, s41, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s41, 24 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: s_lshl_b32 s6, s43, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s50, v8, 10 +; SI-NEXT: v_readlane_b32 s49, v8, 9 +; SI-NEXT: v_readlane_b32 s48, v8, 8 +; SI-NEXT: v_readlane_b32 
s39, v8, 7 +; SI-NEXT: v_readlane_b32 s38, v8, 6 +; SI-NEXT: v_readlane_b32 s37, v8, 5 +; SI-NEXT: v_readlane_b32 s36, v8, 4 +; SI-NEXT: v_readlane_b32 s35, v8, 3 +; SI-NEXT: v_readlane_b32 s34, v8, 2 +; SI-NEXT: v_readlane_b32 s31, v8, 1 +; SI-NEXT: v_readlane_b32 s30, v8, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v20i16_to_v40i8_scalar: @@ -19045,286 +19080,280 @@ define inreg <20 x i16> @bitcast_v40i8_to_v20i16_scalar(<40 x i8> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26 -; SI-NEXT: v_readfirstlane_b32 s14, v19 -; SI-NEXT: v_readfirstlane_b32 s40, v18 -; SI-NEXT: v_readfirstlane_b32 s12, v11 -; SI-NEXT: v_readfirstlane_b32 s13, v10 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_readfirstlane_b32 s9, v2 -; SI-NEXT: v_readfirstlane_b32 s7, v1 -; SI-NEXT: v_readfirstlane_b32 s6, v0 +; SI-NEXT: v_mov_b32_e32 v31, v18 +; SI-NEXT: v_mov_b32_e32 v32, v14 
+; SI-NEXT: v_mov_b32_e32 v33, v10 +; SI-NEXT: v_readfirstlane_b32 s43, v1 +; SI-NEXT: v_readfirstlane_b32 s42, v0 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v34, 8, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v37, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 ; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v27, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v25 -; SI-NEXT: v_lshlrev_b32_e32 v31, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v25 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s10, s23, 24 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s10, s5 -; SI-NEXT: s_or_b32 s11, s4, s5 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s10, s19, 24 -; SI-NEXT: s_or_b32 s4, s10, s4 -; SI-NEXT: s_and_b32 s10, s28, 0xff -; SI-NEXT: s_lshl_b32 s15, s29, 8 -; SI-NEXT: s_or_b32 s10, s10, s15 -; SI-NEXT: s_and_b32 s15, s6, 0xff -; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: s_lshl_b32 s41, s7, 24 -; SI-NEXT: s_or_b32 s43, s41, s15 -; SI-NEXT: s_and_b32 s15, s26, 0xff -; SI-NEXT: s_lshl_b32 s15, s15, 16 -; SI-NEXT: s_lshl_b32 s41, s27, 24 -; SI-NEXT: s_or_b32 s15, s41, s15 -; SI-NEXT: s_and_b32 s41, s16, 0xff -; SI-NEXT: s_lshl_b32 s42, s17, 8 -; SI-NEXT: s_or_b32 s41, s41, s42 -; SI-NEXT: s_and_b32 s41, s41, 0xffff -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v8 -; SI-NEXT: s_or_b32 s41, s41, s4 +; SI-NEXT: s_or_b32 s12, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s12 ; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s42, s25, 8 -; SI-NEXT: v_or_b32_e32 v9, v9, v2 +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_or_b32 s14, s7, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: v_and_b32_e32 v10, 0xff, v33 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: v_or_b32_e32 v10, v10, v36 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v12 +; SI-NEXT: s_or_b32 s13, s5, s7 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_or_b32 s4, s4, s42 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v11, v0, v10 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_or_b32_e32 v10, v9, v11 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 16 +; SI-NEXT: v_or_b32_e32 v13, v35, v10 +; SI-NEXT: v_and_b32_e32 
v10, 0xff, v20 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v8 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v16 +; SI-NEXT: s_and_b32 s5, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v24 +; SI-NEXT: s_or_b32 s5, s5, s9 +; SI-NEXT: s_and_b32 s9, s42, 0xff +; SI-NEXT: v_or_b32_e32 v9, v9, v34 +; SI-NEXT: v_or_b32_e32 v17, v37, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v19, v0, v14 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v32 +; SI-NEXT: v_or_b32_e32 v39, v5, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s10, s43, 24 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v9 ; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v16 -; SI-NEXT: s_or_b32 s15, s4, s15 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s42, s8, 8 +; SI-NEXT: v_or_b32_e32 v10, v10, v3 +; SI-NEXT: v_or_b32_e32 v14, v14, v7 +; SI-NEXT: v_or_b32_e32 v18, v18, v23 +; SI-NEXT: v_or_b32_e32 v48, v21, v25 +; SI-NEXT: v_and_b32_e32 v25, 0xff, v31 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s12, s10, s9 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v13, v13, v27 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_or_b32 s4, s4, s42 -; SI-NEXT: v_or_b32_e32 v15, v3, v9 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v19, v7, v17 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v25, v13, v19 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v24 -; SI-NEXT: v_or_b32_e32 v23, s4, v15 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s42, s12, 8 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v17, v17, v30 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: s_or_b32 s4, s4, s42 -; SI-NEXT: v_or_b32_e32 v21, v28, v13 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v32, v29, v18 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v18, v17, v32 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v20 -; SI-NEXT: v_or_b32_e32 v26, s4, v21 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_lshl_b32 s42, s14, 8 -; SI-NEXT: s_and_b32 s10, s10, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_or_b32 s4, s4, s42 -; SI-NEXT: s_or_b32 s10, s10, s43 -; SI-NEXT: v_or_b32_e32 v33, v31, v17 +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v25, v25, v38 +; SI-NEXT: s_or_b32 s15, s5, s12 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_alignbit_b32 v1, s11, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, s10, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v15, 16 -; SI-NEXT: v_alignbit_b32 v13, v25, v21, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v33, 16 -; SI-NEXT: v_or_b32_e32 v21, s4, v33 -; SI-NEXT: s_lshr_b32 s42, s5, 16 -; SI-NEXT: s_lshr_b32 s43, s43, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_or_b32_e32 v9, v1, v9 +; SI-NEXT: v_or_b32_e32 v10, v10, v19 +; SI-NEXT: v_or_b32_e32 v14, v14, v39 +; SI-NEXT: v_or_b32_e32 v18, v18, v48 +; SI-NEXT: v_and_b32_e32 v29, 0xffff, v25 +; SI-NEXT: s_lshr_b64 
s[10:11], s[14:15], 16 +; SI-NEXT: s_or_b32 s4, s4, s14 +; SI-NEXT: v_or_b32_e32 v25, v11, v9 +; SI-NEXT: v_mov_b32_e32 v26, v10 +; SI-NEXT: v_lshr_b64 v[9:10], v[9:10], 16 +; SI-NEXT: v_or_b32_e32 v27, v15, v13 +; SI-NEXT: v_mov_b32_e32 v28, v14 +; SI-NEXT: v_lshr_b64 v[13:14], v[13:14], 16 +; SI-NEXT: v_or_b32_e32 v29, v29, v17 +; SI-NEXT: v_mov_b32_e32 v30, v18 +; SI-NEXT: v_lshr_b64 v[17:18], v[17:18], 16 +; SI-NEXT: s_lshr_b32 s9, s7, 16 +; SI-NEXT: s_lshr_b32 s11, s12, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v48 +; SI-NEXT: s_mov_b32 s7, s13 +; SI-NEXT: s_mov_b32 s5, s15 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_lshl_b32 s5, s14, 8 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v31, v1 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: v_add_i32_e32 v21, vcc, 0x3000000, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 -; SI-NEXT: v_or_b32_e32 v1, v30, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v5, v29, v5 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v1 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s5, s12, 8 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v28, v1 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v1 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s8, 8 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v31 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v9, v38, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v37, v10 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_add_i32_e32 v29, vcc, 0x3000000, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v24 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 +; 
SI-NEXT: v_or_b32_e32 v9, v23, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 ; SI-NEXT: s_and_b32 s4, s24, 0xff ; SI-NEXT: s_lshl_b32 s5, s25, 8 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s8, s26, 0xff +; SI-NEXT: s_and_b32 s6, s26, 0xff +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v21, v10 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: s_lshl_b32 s5, s27, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s15, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s7, 24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: v_add_i32_e32 v30, vcc, 0x3000000, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v33 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s28, 0xff +; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v12 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s42, 0xff +; SI-NEXT: v_or_b32_e32 v9, v36, v9 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s43, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x300, v9 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_or_b32_e32 v2, v34, v2 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s10, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_lshl_b32 s5, s17, 8 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: v_or_b32_e32 v10, v35, v10 +; SI-NEXT: v_add_i32_e32 v2, vcc, 0x300, v2 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s7, s17, 8 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_add_i32_e32 v23, vcc, 0x3000000, v1 +; SI-NEXT: v_or_b32_e32 v9, v10, v9 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s18, 0xff +; SI-NEXT: v_add_i32_e32 v27, vcc, 0x3000000, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_add_i32_e32 v25, vcc, 0x3000000, v1 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v7, 
v7, v9 +; SI-NEXT: v_add_i32_e32 v9, vcc, 3, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v8 -; SI-NEXT: s_add_i32 s41, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s20, 0xff +; SI-NEXT: s_lshl_b32 s8, s21, 8 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s9, s22, 0xff +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x300, v7 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s23, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v0, v0, v2 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: v_or_b32_e32 v5, v5, v7 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: s_add_i32 s11, s4, 0x3000000 -; SI-NEXT: v_mov_b32_e32 v0, s41 -; SI-NEXT: v_alignbit_b32 v1, s11, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s15 -; SI-NEXT: v_alignbit_b32 v5, s10, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v23, 16 -; SI-NEXT: v_alignbit_b32 v13, v25, v26, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v21, 16 -; SI-NEXT: s_lshr_b32 s42, s11, 16 -; SI-NEXT: s_lshr_b32 s43, s10, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_add_i32_e32 v28, vcc, 0x3000000, v5 +; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v0 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[29:30], 16 +; SI-NEXT: s_lshr_b32 s9, s7, 16 +; SI-NEXT: s_lshr_b32 s11, s5, 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v30 ; SI-NEXT: .LBB51_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s41 -; SI-NEXT: v_mov_b32_e32 v2, s11 -; SI-NEXT: v_mov_b32_e32 v3, s42 -; SI-NEXT: v_mov_b32_e32 v4, s15 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: v_mov_b32_e32 v7, s43 -; SI-NEXT: v_mov_b32_e32 v8, v23 -; SI-NEXT: v_mov_b32_e32 v12, v26 -; SI-NEXT: v_mov_b32_e32 v14, v25 -; SI-NEXT: v_mov_b32_e32 v16, v21 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v6, s5 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, v25 +; SI-NEXT: v_mov_b32_e32 v10, v26 +; SI-NEXT: v_mov_b32_e32 v12, v27 +; SI-NEXT: v_mov_b32_e32 v14, v28 +; SI-NEXT: v_mov_b32_e32 v16, v29 +; SI-NEXT: v_mov_b32_e32 v18, v30 ; SI-NEXT: s_setpc_b64 
s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: s_branch .LBB51_2 ; @@ -20574,78 +20603,78 @@ define inreg <20 x i16> @bitcast_v5f64_to_v20i16_scalar(<5 x double> inreg %a, i ; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB55_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v20, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v21, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v22, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v23, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v24, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s25, 16 +; SI-NEXT: s_lshr_b32 s29, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s27, s19, 16 +; SI-NEXT: s_lshr_b32 s26, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB55_4 ; SI-NEXT: .LBB55_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[16:17], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[12:13], s[22:23], 1.0 -; SI-NEXT: v_alignbit_b32 v20, v17, v16, 16 -; SI-NEXT: v_alignbit_b32 v21, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v23, v5, v4, 16 -; SI-NEXT: v_alignbit_b32 v24, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_add_f64 v[20:21], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[22:23], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[24:25], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[26:27], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[28:29], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[17:18], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[22:23], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[28:29], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v25 +; SI-NEXT: 
v_lshrrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29 ; SI-NEXT: s_branch .LBB55_5 ; SI-NEXT: .LBB55_3: -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: s_branch .LBB55_2 ; SI-NEXT: .LBB55_4: -; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: v_mov_b32_e32 v5, s19 -; SI-NEXT: v_mov_b32_e32 v9, s21 -; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_mov_b32_e32 v11, s8 -; SI-NEXT: v_mov_b32_e32 v15, s9 -; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: v_mov_b32_e32 v21, s25 +; SI-NEXT: v_mov_b32_e32 v23, s23 +; SI-NEXT: v_mov_b32_e32 v25, s21 +; SI-NEXT: v_mov_b32_e32 v27, s19 +; SI-NEXT: v_mov_b32_e32 v29, s17 +; SI-NEXT: v_mov_b32_e32 v28, s16 +; SI-NEXT: v_mov_b32_e32 v26, s18 +; SI-NEXT: v_mov_b32_e32 v24, s20 +; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v20, s24 +; SI-NEXT: v_mov_b32_e32 v19, s40 +; SI-NEXT: v_mov_b32_e32 v15, s29 +; SI-NEXT: v_mov_b32_e32 v11, s28 +; SI-NEXT: v_mov_b32_e32 v7, s27 +; SI-NEXT: v_mov_b32_e32 v3, s26 +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v9, s8 +; SI-NEXT: v_mov_b32_e32 v13, s6 +; SI-NEXT: v_mov_b32_e32 v17, s4 ; SI-NEXT: .LBB55_5: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v6, v5 -; SI-NEXT: v_mov_b32_e32 v10, v9 -; SI-NEXT: v_mov_b32_e32 v14, v13 -; SI-NEXT: v_mov_b32_e32 v18, v17 -; SI-NEXT: v_mov_b32_e32 v1, v24 -; SI-NEXT: v_mov_b32_e32 v5, v23 -; SI-NEXT: v_mov_b32_e32 v9, v22 -; SI-NEXT: v_mov_b32_e32 v13, v21 -; SI-NEXT: v_mov_b32_e32 v17, v20 +; SI-NEXT: v_mov_b32_e32 v0, v28 +; SI-NEXT: v_mov_b32_e32 v2, v29 +; SI-NEXT: v_mov_b32_e32 v4, v26 +; SI-NEXT: v_mov_b32_e32 v6, v27 +; SI-NEXT: v_mov_b32_e32 v8, v24 +; SI-NEXT: v_mov_b32_e32 v10, v25 +; SI-NEXT: v_mov_b32_e32 v12, v22 +; SI-NEXT: v_mov_b32_e32 v14, v23 +; SI-NEXT: v_mov_b32_e32 v16, v20 +; SI-NEXT: v_mov_b32_e32 v18, v21 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v5f64_to_v20i16_scalar: @@ -21435,76 +21464,71 @@ define inreg <20 x i16> @bitcast_v5i64_to_v20i16_scalar(<5 x i64> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: s_lshr_b32 s7, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: 
s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s27, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 ; SI-NEXT: s_add_u32 s24, s24, 3 ; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: s_lshr_b32 s7, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s26, s25, 16 +; SI-NEXT: s_lshr_b32 s27, s23, 16 +; SI-NEXT: s_lshr_b32 s28, s21, 16 +; SI-NEXT: s_lshr_b32 s29, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 16 ; SI-NEXT: .LBB59_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s12 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v3, s40 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s10 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v7, s29 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s8 ; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v11, s28 ; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v15, s27 ; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s4 ; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s6 +; SI-NEXT: v_mov_b32_e32 v19, s26 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr28 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; 
implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr26 ; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v5i64_to_v20i16_scalar: @@ -22779,357 +22803,375 @@ define inreg <40 x i8> @bitcast_v20f16_to_v40i8_scalar(<20 x half> inreg %a, i32 ; SI-LABEL: bitcast_v20f16_to_v40i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v16, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v39, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v9, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v54, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v53, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v50, s26 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v43, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s29 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v47, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s28 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB61_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v16 -; SI-NEXT: v_or_b32_e32 v28, v15, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v10 -; SI-NEXT: v_or_b32_e32 v24, v12, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 -; SI-NEXT: v_or_b32_e32 v14, v33, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; SI-NEXT: v_or_b32_e32 v13, v20, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 -; SI-NEXT: v_or_b32_e32 v7, v53, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; SI-NEXT: v_or_b32_e32 v11, v50, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v43 -; SI-NEXT: v_or_b32_e32 v5, v44, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_or_b32_e32 v6, v41, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 -; SI-NEXT: v_or_b32_e32 v4, v46, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_or_b32_e32 v3, v45, v3 -; SI-NEXT: v_alignbit_b32 v30, v24, v28, 24 -; SI-NEXT: v_alignbit_b32 v35, v24, v28, 16 -; SI-NEXT: v_alignbit_b32 v37, v24, v28, 8 -; SI-NEXT: 
v_alignbit_b32 v29, v13, v14, 24 -; SI-NEXT: v_alignbit_b32 v31, v13, v14, 16 -; SI-NEXT: v_alignbit_b32 v36, v13, v14, 8 -; SI-NEXT: v_alignbit_b32 v23, v11, v7, 24 -; SI-NEXT: v_alignbit_b32 v26, v11, v7, 16 -; SI-NEXT: v_alignbit_b32 v32, v11, v7, 8 -; SI-NEXT: v_alignbit_b32 v19, v6, v5, 24 -; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v27, v6, v5, 8 -; SI-NEXT: v_alignbit_b32 v17, v3, v4, 24 -; SI-NEXT: v_alignbit_b32 v18, v3, v4, 16 -; SI-NEXT: v_alignbit_b32 v22, v3, v4, 8 -; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v24 -; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v13 -; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 -; SI-NEXT: v_bfe_u32 v42, v10, 8, 8 -; SI-NEXT: v_bfe_u32 v55, v9, 8, 8 -; SI-NEXT: v_bfe_u32 v51, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v48, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v34, v1, 8, 8 +; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v12 +; SI-NEXT: s_or_b32 s12, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v10 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: s_or_b32 s13, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v16 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v15 +; SI-NEXT: s_or_b32 s10, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: s_or_b32 s11, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v19 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v18 +; SI-NEXT: s_or_b32 s8, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v8 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v17 +; SI-NEXT: s_or_b32 s9, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v21 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v5 +; SI-NEXT: s_or_b32 s6, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v20 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v24 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v23 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_readfirstlane_b32 s15, v22 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[18:19], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[22:23], s[10:11], 16 +; SI-NEXT: s_or_b32 s5, s15, s5 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s23, s13, 8 +; SI-NEXT: s_lshr_b32 s21, s11, 8 +; SI-NEXT: s_lshr_b32 s19, s9, 8 +; SI-NEXT: s_lshr_b32 s17, s7, 8 +; SI-NEXT: s_lshr_b32 s15, s5, 8 +; SI-NEXT: v_bfe_u32 v25, v10, 8, 8 +; SI-NEXT: v_bfe_u32 v7, v9, 8, 8 +; SI-NEXT: v_bfe_u32 v6, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v4, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v3, v1, 8, 8 ; SI-NEXT: s_cbranch_execnz .LBB61_3 ; SI-NEXT: .LBB61_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v3, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v3, 
v24 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: v_readfirstlane_b32 s5, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v4, v4, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v43 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v54 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v5 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v19 +; SI-NEXT: v_readfirstlane_b32 s7, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v20 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v53 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v50 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v33 -; SI-NEXT: v_or_b32_e32 v7, v13, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_or_b32_e32 v11, v11, v14 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; 
SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v14, v14, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v9 -; SI-NEXT: v_or_b32_e32 v28, v15, v16 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v10 -; SI-NEXT: v_or_b32_e32 v13, v17, v13 -; SI-NEXT: v_or_b32_e32 v24, v12, v15 -; SI-NEXT: v_alignbit_b32 v30, v24, v28, 24 -; SI-NEXT: v_alignbit_b32 v35, v24, v28, 16 -; SI-NEXT: v_alignbit_b32 v37, v24, v28, 8 -; SI-NEXT: v_alignbit_b32 v29, v13, v14, 24 -; SI-NEXT: v_alignbit_b32 v31, v13, v14, 16 -; SI-NEXT: v_alignbit_b32 v36, v13, v14, 8 -; SI-NEXT: v_alignbit_b32 v23, v11, v7, 24 -; SI-NEXT: v_alignbit_b32 v26, v11, v7, 16 -; SI-NEXT: v_alignbit_b32 v32, v11, v7, 8 -; SI-NEXT: v_alignbit_b32 v19, v6, v5, 24 -; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v27, v6, v5, 8 -; SI-NEXT: v_alignbit_b32 v17, v3, v4, 24 -; SI-NEXT: v_alignbit_b32 v18, v3, v4, 16 -; SI-NEXT: v_alignbit_b32 v22, v3, v4, 8 -; SI-NEXT: v_lshrrev_b32_e32 v40, 8, v24 -; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v13 -; SI-NEXT: v_lshrrev_b32_e32 v49, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v38, 8, v6 -; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v3 -; SI-NEXT: v_bfe_u32 v42, v10, 8, 8 -; SI-NEXT: v_bfe_u32 v55, v9, 8, 8 -; SI-NEXT: v_bfe_u32 v51, v8, 8, 8 -; SI-NEXT: v_bfe_u32 v48, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v34, v1, 8, 8 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_readfirstlane_b32 s7, v2 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_readfirstlane_b32 s8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v8 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: v_readfirstlane_b32 s9, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v17 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v16 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_readfirstlane_b32 s10, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v15 +; SI-NEXT: v_readfirstlane_b32 s9, v8 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_readfirstlane_b32 s10, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v12 +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v13 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_readfirstlane_b32 s11, v9 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: v_readfirstlane_b32 s12, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: 
s_or_b32 s11, s12, s11 +; SI-NEXT: v_readfirstlane_b32 s12, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: v_readfirstlane_b32 s13, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_readfirstlane_b32 s13, v10 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_readfirstlane_b32 s14, v3 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[16:17], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[20:21], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[18:19], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[22:23], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[24:25], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s23, s13, 8 +; SI-NEXT: s_lshr_b32 s21, s11, 8 +; SI-NEXT: s_lshr_b32 s19, s9, 8 +; SI-NEXT: s_lshr_b32 s17, s7, 8 +; SI-NEXT: s_lshr_b32 s15, s5, 8 +; SI-NEXT: v_bfe_u32 v25, v10, 8, 8 +; SI-NEXT: v_bfe_u32 v7, v9, 8, 8 +; SI-NEXT: v_bfe_u32 v6, v8, 8, 8 +; SI-NEXT: v_bfe_u32 v4, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v3, v1, 8, 8 ; SI-NEXT: .LBB61_3: ; %end -; SI-NEXT: v_and_b32_e32 v12, 0xff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v37 -; SI-NEXT: v_or_b32_e32 v12, v12, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v30 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v15, v16, v15 -; SI-NEXT: v_or_b32_e32 v12, v12, v15 -; SI-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v40 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_or_b32_e32 v12, v12, v15 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v42 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v10, v15, v10 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_add_i32_e32 v12, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v36 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v29 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v52 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v55 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v9, v12, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v9, v10, 
s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v32 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v23 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s20, s20, 8 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_or_b32 s12, s12, s20 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s14, s14, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: v_mov_b32_e32 v5, s12 +; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s12, s13, 0xff +; SI-NEXT: s_lshl_b32 s13, s23, 8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v49 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v51 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v8, v9, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v10 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v25 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: v_or_b32_e32 v5, v10, v5 +; SI-NEXT: v_or_b32_e32 v5, s12, v5 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s12, s26, 8 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s12, s22, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s13, s18, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v5, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v27 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v19 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 8, v0 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: buffer_store_dword v10, v5, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s10, s11, 0xff +; SI-NEXT: s_lshl_b32 s11, s21, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v9 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_or_b32_e32 v5, s10, v5 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s10, s40, 8 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_and_b32 s10, s28, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s11, s24, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 -; SI-NEXT: 
v_lshlrev_b32_e32 v6, 8, v38 +; SI-NEXT: v_add_i32_e32 v5, vcc, 16, v0 +; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: buffer_store_dword v7, v5, s[0:3], 0 offen +; SI-NEXT: s_and_b32 s8, s9, 0xff +; SI-NEXT: s_lshl_b32 s9, s19, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v5, s8, v5 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s8, s44, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s42, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s46, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: s_lshl_b32 s7, s17, 8 ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v48 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 -; SI-NEXT: v_or_b32_e32 v2, v5, v2 -; SI-NEXT: v_add_i32_e32 v5, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_or_b32_e32 v2, s6, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s6, s60, 8 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s58, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s56, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v17 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v2, v2, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v25 +; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xff +; SI-NEXT: s_lshl_b32 s5, s15, 8 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v34 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB61_4: -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr21 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr19 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr17 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: s_branch .LBB61_2 ; ; VI-LABEL: bitcast_v20f16_to_v40i8_scalar: @@ -31153,232 +31195,246 @@ define inreg <40 x i8> @bitcast_v5f64_to_v40i8_scalar(<5 x double> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB75_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s24 -; SI-NEXT: v_alignbit_b32 v2, s25, v1, 24 -; SI-NEXT: v_alignbit_b32 v11, s25, v1, 16 -; SI-NEXT: v_alignbit_b32 v12, s25, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s22 -; SI-NEXT: v_alignbit_b32 v4, s23, v1, 24 -; SI-NEXT: v_alignbit_b32 v13, s23, v1, 16 -; SI-NEXT: v_alignbit_b32 v14, s23, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s20 -; SI-NEXT: v_alignbit_b32 v6, s21, v1, 24 -; SI-NEXT: v_alignbit_b32 v15, s21, v1, 16 -; SI-NEXT: v_alignbit_b32 v16, s21, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s18 -; SI-NEXT: v_alignbit_b32 v8, s19, v1, 24 -; SI-NEXT: 
v_alignbit_b32 v10, s19, v1, 16 -; SI-NEXT: v_alignbit_b32 v17, s19, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s16 -; SI-NEXT: v_alignbit_b32 v18, s17, v1, 24 -; SI-NEXT: v_alignbit_b32 v19, s17, v1, 16 -; SI-NEXT: v_alignbit_b32 v20, s17, v1, 8 -; SI-NEXT: s_lshr_b32 s6, s25, 24 -; SI-NEXT: s_lshr_b32 s7, s25, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 8 -; SI-NEXT: s_lshr_b32 s9, s23, 24 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s23, 8 -; SI-NEXT: s_lshr_b32 s12, s21, 24 -; SI-NEXT: s_lshr_b32 s13, s21, 16 -; SI-NEXT: s_lshr_b32 s14, s21, 8 -; SI-NEXT: s_lshr_b32 s15, s19, 24 -; SI-NEXT: s_lshr_b32 s26, s19, 16 -; SI-NEXT: s_lshr_b32 s27, s19, 8 -; SI-NEXT: s_lshr_b32 s28, s17, 24 -; SI-NEXT: s_lshr_b32 s29, s17, 16 -; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: s_lshr_b32 s72, s25, 24 +; SI-NEXT: s_lshr_b32 s73, s25, 16 +; SI-NEXT: s_lshr_b32 s74, s25, 8 +; SI-NEXT: s_lshr_b32 s75, s23, 24 +; SI-NEXT: s_lshr_b32 s76, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s23, 8 +; SI-NEXT: s_lshr_b32 s78, s21, 24 +; SI-NEXT: s_lshr_b32 s79, s21, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 8 +; SI-NEXT: s_lshr_b32 s89, s19, 24 +; SI-NEXT: s_lshr_b32 s90, s19, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 8 +; SI-NEXT: s_lshr_b32 s92, s17, 24 +; SI-NEXT: s_lshr_b32 s93, s17, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB75_4 ; SI-NEXT: .LBB75_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[9:10], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[5:6], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[15:16], s[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[22:23], v[8:9], 8 ; SI-NEXT: v_add_f64 v[1:2], s[24:25], 1.0 +; SI-NEXT: v_lshr_b64 v[23:24], v[15:16], 24 ; SI-NEXT: v_add_f64 v[3:4], s[22:23], 1.0 -; SI-NEXT: v_readfirstlane_b32 s25, v2 +; SI-NEXT: v_add_f64 v[20:21], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[24:25], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[10:11], v[1:2], 8 +; SI-NEXT: v_lshr_b64 v[25:26], v[15:16], 8 +; SI-NEXT: v_lshr_b64 v[11:12], v[3:4], 24 +; SI-NEXT: v_lshr_b64 v[26:27], v[20:21], 24 +; SI-NEXT: v_lshr_b64 v[5:6], v[1:2], 24 +; SI-NEXT: v_lshr_b64 v[12:13], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[8:9], 24 +; SI-NEXT: v_lshr_b64 v[27:28], v[20:21], 16 +; SI-NEXT: v_readfirstlane_b32 s17, v21 +; SI-NEXT: v_readfirstlane_b32 s19, v16 +; SI-NEXT: v_readfirstlane_b32 s21, v9 ; SI-NEXT: v_readfirstlane_b32 s23, v4 -; SI-NEXT: v_readfirstlane_b32 s21, v6 -; SI-NEXT: v_readfirstlane_b32 s19, v8 -; SI-NEXT: v_readfirstlane_b32 s17, v10 -; SI-NEXT: v_alignbit_b32 v2, s25, v1, 24 -; SI-NEXT: v_alignbit_b32 v11, s25, v1, 16 -; SI-NEXT: v_alignbit_b32 v12, s25, v1, 8 -; SI-NEXT: v_alignbit_b32 v4, s23, v3, 24 -; SI-NEXT: v_alignbit_b32 v13, s23, v3, 16 -; SI-NEXT: v_alignbit_b32 v14, s23, v3, 8 -; SI-NEXT: v_alignbit_b32 v6, s21, v5, 24 -; SI-NEXT: v_alignbit_b32 
v15, s21, v5, 16 -; SI-NEXT: v_alignbit_b32 v16, s21, v5, 8 -; SI-NEXT: v_alignbit_b32 v8, s19, v7, 24 -; SI-NEXT: s_lshr_b32 s6, s25, 24 -; SI-NEXT: s_lshr_b32 s7, s25, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 8 -; SI-NEXT: s_lshr_b32 s9, s23, 24 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s23, 8 -; SI-NEXT: s_lshr_b32 s12, s21, 24 -; SI-NEXT: s_lshr_b32 s13, s21, 16 -; SI-NEXT: s_lshr_b32 s14, s21, 8 -; SI-NEXT: s_lshr_b32 s15, s19, 24 -; SI-NEXT: s_lshr_b32 s26, s19, 16 -; SI-NEXT: s_lshr_b32 s27, s19, 8 -; SI-NEXT: s_lshr_b32 s28, s17, 24 -; SI-NEXT: s_lshr_b32 s29, s17, 16 -; SI-NEXT: s_lshr_b32 s40, s17, 8 -; SI-NEXT: v_alignbit_b32 v10, s19, v7, 16 -; SI-NEXT: v_alignbit_b32 v17, s19, v7, 8 -; SI-NEXT: v_alignbit_b32 v18, s17, v9, 24 -; SI-NEXT: v_alignbit_b32 v19, s17, v9, 16 -; SI-NEXT: v_alignbit_b32 v20, s17, v9, 8 +; SI-NEXT: v_readfirstlane_b32 s25, v2 +; SI-NEXT: v_lshr_b64 v[6:7], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[3:4], 8 +; SI-NEXT: v_lshr_b64 v[18:19], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[20:21], 8 +; SI-NEXT: s_lshr_b32 s72, s25, 24 +; SI-NEXT: s_lshr_b32 s73, s25, 16 +; SI-NEXT: s_lshr_b32 s74, s25, 8 +; SI-NEXT: s_lshr_b32 s75, s23, 24 +; SI-NEXT: s_lshr_b32 s76, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s23, 8 +; SI-NEXT: s_lshr_b32 s78, s21, 24 +; SI-NEXT: s_lshr_b32 s79, s21, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 8 +; SI-NEXT: s_lshr_b32 s89, s19, 24 +; SI-NEXT: s_lshr_b32 s90, s19, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 8 +; SI-NEXT: s_lshr_b32 s92, s17, 24 +; SI-NEXT: s_lshr_b32 s93, s17, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 8 ; SI-NEXT: s_branch .LBB75_5 ; SI-NEXT: .LBB75_3: -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: 
s_branch .LBB75_2 ; SI-NEXT: .LBB75_4: -; SI-NEXT: v_mov_b32_e32 v1, s24 +; SI-NEXT: v_mov_b32_e32 v20, s16 +; SI-NEXT: v_mov_b32_e32 v15, s18 +; SI-NEXT: v_mov_b32_e32 v8, s20 ; SI-NEXT: v_mov_b32_e32 v3, s22 -; SI-NEXT: v_mov_b32_e32 v5, s20 -; SI-NEXT: v_mov_b32_e32 v7, s18 -; SI-NEXT: v_mov_b32_e32 v9, s16 +; SI-NEXT: v_mov_b32_e32 v1, s24 +; SI-NEXT: v_mov_b32_e32 v28, s60 +; SI-NEXT: v_mov_b32_e32 v27, s58 +; SI-NEXT: v_mov_b32_e32 v26, s56 +; SI-NEXT: v_mov_b32_e32 v25, s46 +; SI-NEXT: v_mov_b32_e32 v24, s44 +; SI-NEXT: v_mov_b32_e32 v23, s42 +; SI-NEXT: v_mov_b32_e32 v22, s40 +; SI-NEXT: v_mov_b32_e32 v18, s28 +; SI-NEXT: v_mov_b32_e32 v17, s26 +; SI-NEXT: v_mov_b32_e32 v13, s14 +; SI-NEXT: v_mov_b32_e32 v12, s12 +; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v10, s6 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_mov_b32_e32 v5, s10 ; SI-NEXT: .LBB75_5: ; %end +; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v28 ; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v20 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: s_lshl_b32 s5, s94, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v27 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s29, 0xff -; SI-NEXT: v_or_b32_e32 v9, v9, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 +; SI-NEXT: s_and_b32 s5, s93, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v26 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s28, 24 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: s_lshl_b32 s6, s92, 24 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v9, v9, v18 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v18, s4 -; SI-NEXT: buffer_store_dword v18, v9, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v17 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v25 ; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v10 +; SI-NEXT: s_lshl_b32 s5, s91, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v24 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s26, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_and_b32 s5, s90, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v23 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s15, s15, 24 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: s_lshl_b32 s6, s89, 24 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s15, s5 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: 
v_add_i32_e32 v8, vcc, 8, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v22 ; SI-NEXT: s_and_b32 s4, s21, 0xff -; SI-NEXT: s_lshl_b32 s5, s14, 8 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v15 +; SI-NEXT: s_lshl_b32 s5, s88, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v18 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s13, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_and_b32 s5, s79, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s12, s12, 24 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_lshl_b32 s6, s78, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v7, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s12, s5 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 20, v0 -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v14 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 ; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s11, 8 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v13 +; SI-NEXT: s_lshl_b32 s5, s77, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s10, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; SI-NEXT: s_and_b32 s5, s76, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v11 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s9, s9, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: s_lshl_b32 s6, s75, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s9, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; 
SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v12 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v10 ; SI-NEXT: s_and_b32 s4, s25, 0xff -; SI-NEXT: s_lshl_b32 s5, s8, 8 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v11 +; SI-NEXT: s_lshl_b32 s5, s74, 8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s7, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: s_and_b32 s5, s73, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s6, 24 +; SI-NEXT: s_lshl_b32 s6, s72, 24 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -35291,241 +35347,239 @@ define inreg <40 x i8> @bitcast_v5i64_to_v40i8_scalar(<5 x i64> inreg %a, i32 in ; SI-NEXT: s_cmp_lg_u32 s26, 0 ; SI-NEXT: s_cbranch_scc0 .LBB79_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v3, s24 -; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v9, s20 -; SI-NEXT: v_mov_b32_e32 v12, s18 -; SI-NEXT: v_mov_b32_e32 v15, s16 -; SI-NEXT: v_alignbit_b32 v1, s25, v3, 24 -; SI-NEXT: v_alignbit_b32 v2, s25, v3, 16 -; SI-NEXT: v_alignbit_b32 v3, s25, v3, 8 -; SI-NEXT: v_alignbit_b32 v4, s23, v6, 24 -; SI-NEXT: v_alignbit_b32 v5, s23, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, s23, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s21, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s21, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s21, v9, 8 -; SI-NEXT: v_alignbit_b32 v10, s19, v12, 24 -; SI-NEXT: v_alignbit_b32 v11, s19, v12, 16 -; SI-NEXT: v_alignbit_b32 v12, s19, v12, 8 -; SI-NEXT: v_alignbit_b32 v13, s17, v15, 24 -; SI-NEXT: v_alignbit_b32 v14, s17, v15, 16 -; SI-NEXT: v_alignbit_b32 v15, s17, v15, 8 -; SI-NEXT: s_lshr_b32 s6, s25, 24 -; SI-NEXT: s_lshr_b32 s7, s25, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 8 -; SI-NEXT: s_lshr_b32 s9, s23, 24 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s23, 8 -; SI-NEXT: s_lshr_b32 s12, s21, 24 -; SI-NEXT: s_lshr_b32 s13, s21, 16 -; SI-NEXT: s_lshr_b32 s14, s21, 8 -; SI-NEXT: s_lshr_b32 s15, s19, 24 -; SI-NEXT: s_lshr_b32 s26, s19, 16 -; SI-NEXT: s_lshr_b32 s27, s19, 8 -; SI-NEXT: s_lshr_b32 s28, s17, 24 -; SI-NEXT: s_lshr_b32 s29, s17, 16 -; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: s_lshr_b32 s72, s25, 24 +; SI-NEXT: s_lshr_b32 s73, s25, 16 +; SI-NEXT: s_lshr_b32 s74, s25, 8 +; SI-NEXT: s_lshr_b32 s75, s23, 24 +; SI-NEXT: s_lshr_b32 s76, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s23, 8 +; SI-NEXT: s_lshr_b32 s78, s21, 24 +; SI-NEXT: s_lshr_b32 s79, s21, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 8 +; SI-NEXT: s_lshr_b32 s89, s19, 24 +; SI-NEXT: s_lshr_b32 s90, s19, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 8 +; SI-NEXT: s_lshr_b32 s92, s17, 24 +; SI-NEXT: s_lshr_b32 s93, s17, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], 
s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB79_3 ; SI-NEXT: .LBB79_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 ; SI-NEXT: s_add_u32 s24, s24, 3 ; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: v_mov_b32_e32 v3, s24 -; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v9, s20 -; SI-NEXT: v_mov_b32_e32 v12, s18 -; SI-NEXT: v_mov_b32_e32 v15, s16 -; SI-NEXT: v_alignbit_b32 v1, s25, v3, 24 -; SI-NEXT: v_alignbit_b32 v2, s25, v3, 16 -; SI-NEXT: v_alignbit_b32 v3, s25, v3, 8 -; SI-NEXT: v_alignbit_b32 v4, s23, v6, 24 -; SI-NEXT: v_alignbit_b32 v5, s23, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, s23, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s21, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s21, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s21, v9, 8 -; SI-NEXT: v_alignbit_b32 v10, s19, v12, 24 -; SI-NEXT: v_alignbit_b32 v11, s19, v12, 16 -; SI-NEXT: v_alignbit_b32 v12, s19, v12, 8 -; SI-NEXT: v_alignbit_b32 v13, s17, v15, 24 -; SI-NEXT: v_alignbit_b32 v14, s17, v15, 16 -; SI-NEXT: v_alignbit_b32 v15, s17, v15, 8 -; SI-NEXT: s_lshr_b32 s6, s25, 24 -; SI-NEXT: s_lshr_b32 s7, s25, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 8 -; SI-NEXT: s_lshr_b32 s9, s23, 24 -; SI-NEXT: s_lshr_b32 s10, s23, 16 -; SI-NEXT: s_lshr_b32 s11, s23, 8 -; SI-NEXT: s_lshr_b32 s12, s21, 24 -; SI-NEXT: s_lshr_b32 s13, s21, 16 -; SI-NEXT: s_lshr_b32 s14, s21, 8 -; SI-NEXT: s_lshr_b32 s15, s19, 24 -; SI-NEXT: s_lshr_b32 s26, s19, 16 -; SI-NEXT: s_lshr_b32 s27, s19, 8 -; SI-NEXT: s_lshr_b32 s28, s17, 24 -; SI-NEXT: s_lshr_b32 s29, s17, 16 -; SI-NEXT: s_lshr_b32 s40, s17, 8 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s72, s25, 24 +; SI-NEXT: s_lshr_b32 s73, s25, 16 +; SI-NEXT: s_lshr_b32 s74, s25, 8 +; SI-NEXT: s_lshr_b32 s75, s23, 24 +; SI-NEXT: s_lshr_b32 s76, s23, 16 +; SI-NEXT: s_lshr_b32 s77, s23, 8 +; SI-NEXT: s_lshr_b32 s78, s21, 24 +; SI-NEXT: s_lshr_b32 s79, s21, 16 +; SI-NEXT: s_lshr_b32 s88, s21, 8 +; SI-NEXT: s_lshr_b32 s89, s19, 24 +; SI-NEXT: s_lshr_b32 s90, s19, 16 +; SI-NEXT: s_lshr_b32 s91, s19, 8 +; SI-NEXT: s_lshr_b32 s92, s17, 24 +; SI-NEXT: s_lshr_b32 s93, s17, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 8 +; 
SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 8 ; SI-NEXT: .LBB79_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; SI-NEXT: v_or_b32_e32 v15, s4, v15 -; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s29, 0xff -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s28, 24 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v13 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 -; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s5, s27, 8 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s26, 0xff -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s15, s15, 24 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s15, s5 -; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s5, s60, 8 +; SI-NEXT: s_and_b32 s7, s16, 0xff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_and_b32 s7, s58, 0xff +; SI-NEXT: s_lshl_b32 s9, s56, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_and_b32 s5, s17, 0xff +; SI-NEXT: s_lshl_b32 s7, s94, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s93, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s92, 24 +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_lshl_b32 s5, s46, 8 +; SI-NEXT: s_and_b32 s7, s18, 0xff +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: s_and_b32 s7, s44, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s42, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s21, 0xff -; SI-NEXT: s_lshl_b32 s5, s14, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s13, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s12, s12, 24 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_and_b32 
s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s12, s5 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s11, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s10, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s9, s9, 24 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s9, s5 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s19, 0xff +; SI-NEXT: s_lshl_b32 s7, s91, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s90, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s89, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s40, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s26, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s21, 0xff +; SI-NEXT: s_lshl_b32 s7, s88, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s79, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s78, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s7, s14, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s12, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s10, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s23, 0xff +; SI-NEXT: s_lshl_b32 s7, s77, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s76, 0xff +; 
SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s9, s75, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s8, 8 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s4, s4, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: s_and_b32 s4, s25, 0xff -; SI-NEXT: s_lshl_b32 s5, s8, 8 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_lshl_b32 s5, s74, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s7, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_and_b32 s5, s73, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s6, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_lshl_b32 s6, s72, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 36, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB79_4: -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr29 ; SI-NEXT: ; implicit-def: $sgpr28 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr27 ; SI-NEXT: ; implicit-def: $sgpr26 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; 
implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB79_2 ; ; VI-LABEL: bitcast_v5i64_to_v40i8_scalar: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll index 6fc9a35cd9ee6..f335b48ba4ae1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.352bit.ll @@ -716,83 +716,77 @@ define inreg <22 x i16> @bitcast_v11i32_to_v22i16_scalar(<11 x i32> inreg %a, i3 ; SI-NEXT: s_cmp_lg_u32 s27, 0 ; SI-NEXT: s_cbranch_scc0 .LBB5_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s26 -; SI-NEXT: v_alignbit_b32 v21, s4, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: s_lshr_b32 s7, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: s_lshr_b32 s40, s23, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s19, 16 +; SI-NEXT: s_lshr_b32 s43, s17, 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %cmp.true -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 ; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s26 -; SI-NEXT: v_alignbit_b32 v21, s4, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s25, 16 -; SI-NEXT: s_lshr_b32 s7, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s9, s19, 16 -; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_lshr_b32 s27, s25, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 +; 
SI-NEXT: s_lshr_b32 s40, s23, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s42, s19, 16 +; SI-NEXT: s_lshr_b32 s43, s17, 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 ; SI-NEXT: .LBB5_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s14 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v3, s43 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s12 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_mov_b32_e32 v7, s42 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s8 ; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s8 +; SI-NEXT: v_mov_b32_e32 v11, s41 ; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s6 ; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s7 +; SI-NEXT: v_mov_b32_e32 v15, s40 ; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s4 ; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s6 +; SI-NEXT: v_mov_b32_e32 v19, s27 ; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v21, s10 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: s_branch .LBB5_2 ; ; VI-LABEL: bitcast_v11i32_to_v22i16_scalar: @@ -2676,78 +2670,89 @@ define inreg <22 x i16> @bitcast_v11f32_to_v22i16_scalar(<11 x float> inreg %a, ; SI-NEXT: s_cmp_lg_u32 s27, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s26 -; SI-NEXT: v_alignbit_b32 v21, s4, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s27, s17, 16 +; SI-NEXT: s_lshr_b32 s43, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s41, s21, 16 +; SI-NEXT: s_lshr_b32 s40, s19, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_4 ; SI-NEXT: .LBB13_2: ; %cmp.true ; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v10, 
s21, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v21, s4, v20, 16 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v32, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v31, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v30, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v23, s24, 1.0 +; SI-NEXT: v_lshr_b64 v[17:18], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[27:28], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[31:32], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[20:21], 16 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v32 +; SI-NEXT: s_branch .LBB13_5 ; SI-NEXT: .LBB13_3: -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_branch .LBB13_2 ; SI-NEXT: .LBB13_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v31, s16 +; SI-NEXT: v_mov_b32_e32 v32, s17 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v30, s19 +; SI-NEXT: v_mov_b32_e32 v27, s20 +; SI-NEXT: v_mov_b32_e32 v28, s21 +; SI-NEXT: v_mov_b32_e32 v25, s22 +; SI-NEXT: v_mov_b32_e32 v26, s23 +; SI-NEXT: v_mov_b32_e32 v23, s24 +; SI-NEXT: v_mov_b32_e32 v24, s25 ; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_mov_b32_e32 v11, s8 -; SI-NEXT: v_mov_b32_e32 v15, s9 -; SI-NEXT: v_mov_b32_e32 v19, s10 +; SI-NEXT: v_mov_b32_e32 v3, s27 +; SI-NEXT: v_mov_b32_e32 v7, s40 +; SI-NEXT: v_mov_b32_e32 v11, s41 +; SI-NEXT: v_mov_b32_e32 v15, s42 +; SI-NEXT: v_mov_b32_e32 v19, s43 +; SI-NEXT: v_mov_b32_e32 v21, s8 +; SI-NEXT: v_mov_b32_e32 v1, s14 +; 
SI-NEXT: v_mov_b32_e32 v5, s12 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v13, s6 +; SI-NEXT: v_mov_b32_e32 v17, s4 +; SI-NEXT: .LBB13_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v31 +; SI-NEXT: v_mov_b32_e32 v2, v32 +; SI-NEXT: v_mov_b32_e32 v4, v29 +; SI-NEXT: v_mov_b32_e32 v6, v30 +; SI-NEXT: v_mov_b32_e32 v8, v27 +; SI-NEXT: v_mov_b32_e32 v10, v28 +; SI-NEXT: v_mov_b32_e32 v12, v25 +; SI-NEXT: v_mov_b32_e32 v14, v26 +; SI-NEXT: v_mov_b32_e32 v16, v23 +; SI-NEXT: v_mov_b32_e32 v18, v24 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v11f32_to_v22i16_scalar: @@ -5293,136 +5298,137 @@ define inreg <22 x i16> @bitcast_v22f16_to_v22i16_scalar(<22 x half> inreg %a, i ; SI-LABEL: bitcast_v22f16_to_v22i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v12, v7 -; SI-NEXT: v_mov_b32_e32 v13, v6 -; SI-NEXT: v_mov_b32_e32 v19, v5 -; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 -; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v9, v7 +; SI-NEXT: v_mov_b32_e32 v10, v4 +; SI-NEXT: v_mov_b32_e32 v13, v3 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB23_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB23_3 ; SI-NEXT: .LBB23_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 
v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v24 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v25 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v26 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_or_b32_e32 v14, v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v18, v18, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: 
v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_or_b32_e32 v14, v14, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v11 +; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_or_b32_e32 v10, v10, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v22 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v19 +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 +; SI-NEXT: v_or_b32_e32 v18, v18, v22 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshr_b64 v[26:27], v[13:14], 16 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v21 +; SI-NEXT: v_lshr_b64 v[27:28], v[17:18], 16 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 ; SI-NEXT: .LBB23_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v23 +; SI-NEXT: v_mov_b32_e32 v5, v24 +; SI-NEXT: v_mov_b32_e32 v9, v25 +; SI-NEXT: v_mov_b32_e32 v13, v26 +; SI-NEXT: v_mov_b32_e32 v17, v27 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB23_4: ; SI-NEXT: s_branch .LBB23_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll index c9860dbb7d72c..2cde373ec130c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.384bit.ll @@ -1824,89 +1824,83 @@ define inreg <24 x i16> @bitcast_v12i32_to_v24i16_scalar(<12 x i32> inreg %a, i3 ; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s26 -; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s25, 16 -; SI-NEXT: s_lshr_b32 s8, s23, 16 -; SI-NEXT: s_lshr_b32 s9, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: 
s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: v_mov_b32_e32 v0, s26 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s25, 16 -; SI-NEXT: s_lshr_b32 s8, s23, 16 -; SI-NEXT: s_lshr_b32 s9, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 ; SI-NEXT: .LBB13_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s14 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v3, s45 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s12 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v7, s44 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s10 ; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v11, s43 ; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s8 ; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s8 +; SI-NEXT: v_mov_b32_e32 v15, s42 ; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s6 ; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s7 +; SI-NEXT: v_mov_b32_e32 v19, s41 ; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v21, s4 ; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v23, s6 +; SI-NEXT: v_mov_b32_e32 v23, s40 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; 
SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v12i32_to_v24i16_scalar: @@ -5008,84 +5002,97 @@ define inreg <24 x i16> @bitcast_v12f32_to_v24i16_scalar(<12 x float> inreg %a, ; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB29_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s26 -; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s11, s27, 16 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s23, 16 +; SI-NEXT: s_lshr_b32 s42, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_4 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v35, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v34, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v33, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v32, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v31, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v30, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v28, 
s22, 1.0 +; SI-NEXT: v_add_f32_e64 v27, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v26, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v25, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v24, s26, 1.0 +; SI-NEXT: v_lshr_b64 v[21:22], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[30:31], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: s_branch .LBB29_5 ; SI-NEXT: .LBB29_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: s_branch .LBB29_2 ; SI-NEXT: .LBB29_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_mov_b32_e32 v11, s8 -; SI-NEXT: v_mov_b32_e32 v15, s9 -; SI-NEXT: v_mov_b32_e32 v19, s10 -; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: v_mov_b32_e32 v34, s16 +; SI-NEXT: v_mov_b32_e32 v35, s17 +; SI-NEXT: v_mov_b32_e32 v32, s18 +; SI-NEXT: v_mov_b32_e32 v33, s19 +; SI-NEXT: v_mov_b32_e32 v30, s20 +; SI-NEXT: v_mov_b32_e32 v31, s21 +; SI-NEXT: v_mov_b32_e32 v28, s22 +; SI-NEXT: v_mov_b32_e32 v29, s23 +; SI-NEXT: v_mov_b32_e32 v26, s24 +; SI-NEXT: v_mov_b32_e32 v27, s25 +; SI-NEXT: v_mov_b32_e32 v24, s26 +; SI-NEXT: v_mov_b32_e32 v25, s27 +; SI-NEXT: v_mov_b32_e32 v3, s40 +; SI-NEXT: v_mov_b32_e32 v7, s41 +; SI-NEXT: v_mov_b32_e32 v11, s42 +; SI-NEXT: v_mov_b32_e32 v15, s43 +; SI-NEXT: v_mov_b32_e32 v19, s44 +; SI-NEXT: v_mov_b32_e32 v23, s45 +; SI-NEXT: v_mov_b32_e32 v1, s14 +; SI-NEXT: v_mov_b32_e32 v5, s12 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v17, s6 +; SI-NEXT: v_mov_b32_e32 v21, s4 +; SI-NEXT: .LBB29_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v34 +; SI-NEXT: v_mov_b32_e32 v2, v35 +; SI-NEXT: v_mov_b32_e32 v4, v32 +; SI-NEXT: v_mov_b32_e32 v6, v33 +; SI-NEXT: v_mov_b32_e32 v8, v30 +; SI-NEXT: v_mov_b32_e32 v10, v31 +; SI-NEXT: v_mov_b32_e32 v12, v28 +; SI-NEXT: v_mov_b32_e32 v14, v29 +; SI-NEXT: v_mov_b32_e32 v16, v26 +; SI-NEXT: v_mov_b32_e32 v18, v27 +; SI-NEXT: v_mov_b32_e32 v20, v24 +; SI-NEXT: v_mov_b32_e32 v22, v25 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: 
bitcast_v12f32_to_v24i16_scalar: @@ -7630,91 +7637,91 @@ define inreg <24 x i16> @bitcast_v6f64_to_v24i16_scalar(<6 x double> inreg %a, i ; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB41_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s26 -; SI-NEXT: v_alignbit_b32 v24, s27, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v25, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v26, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v27, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v28, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v29, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s11, s27, 16 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s44, s25, 16 +; SI-NEXT: s_lshr_b32 s43, s23, 16 +; SI-NEXT: s_lshr_b32 s42, s21, 16 +; SI-NEXT: s_lshr_b32 s41, s19, 16 +; SI-NEXT: s_lshr_b32 s40, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_4 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[12:13], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[20:21], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[16:17], s[24:25], 1.0 -; SI-NEXT: v_alignbit_b32 v24, v21, v20, 16 -; SI-NEXT: v_alignbit_b32 v25, v17, v16, 16 -; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v27, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v28, v5, v4, 16 -; SI-NEXT: v_alignbit_b32 v29, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_add_f64 v[24:25], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[26:27], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[28:29], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[30:31], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[32:33], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[34:35], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[21:22], v[24:25], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[26:27], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[30:31], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 ; SI-NEXT: s_branch .LBB41_5 ; SI-NEXT: .LBB41_3: -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; 
implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr43 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: s_branch .LBB41_2 ; SI-NEXT: .LBB41_4: -; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: v_mov_b32_e32 v5, s19 -; SI-NEXT: v_mov_b32_e32 v9, s21 -; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_mov_b32_e32 v21, s27 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_mov_b32_e32 v11, s8 -; SI-NEXT: v_mov_b32_e32 v15, s9 -; SI-NEXT: v_mov_b32_e32 v19, s10 -; SI-NEXT: v_mov_b32_e32 v23, s11 +; SI-NEXT: v_mov_b32_e32 v25, s27 +; SI-NEXT: v_mov_b32_e32 v27, s25 +; SI-NEXT: v_mov_b32_e32 v29, s23 +; SI-NEXT: v_mov_b32_e32 v31, s21 +; SI-NEXT: v_mov_b32_e32 v33, s19 +; SI-NEXT: v_mov_b32_e32 v35, s17 +; SI-NEXT: v_mov_b32_e32 v34, s16 +; SI-NEXT: v_mov_b32_e32 v32, s18 +; SI-NEXT: v_mov_b32_e32 v30, s20 +; SI-NEXT: v_mov_b32_e32 v28, s22 +; SI-NEXT: v_mov_b32_e32 v26, s24 +; SI-NEXT: v_mov_b32_e32 v24, s26 +; SI-NEXT: v_mov_b32_e32 v23, s45 +; SI-NEXT: v_mov_b32_e32 v19, s44 +; SI-NEXT: v_mov_b32_e32 v15, s43 +; SI-NEXT: v_mov_b32_e32 v11, s42 +; SI-NEXT: v_mov_b32_e32 v7, s41 +; SI-NEXT: v_mov_b32_e32 v3, s40 +; SI-NEXT: v_mov_b32_e32 v1, s14 +; SI-NEXT: v_mov_b32_e32 v5, s12 +; SI-NEXT: v_mov_b32_e32 v9, s10 +; SI-NEXT: v_mov_b32_e32 v13, s8 +; SI-NEXT: v_mov_b32_e32 v17, s6 +; SI-NEXT: v_mov_b32_e32 v21, s4 ; SI-NEXT: .LBB41_5: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v6, v5 -; SI-NEXT: v_mov_b32_e32 v10, v9 -; SI-NEXT: v_mov_b32_e32 v14, v13 -; SI-NEXT: v_mov_b32_e32 v18, v17 -; SI-NEXT: v_mov_b32_e32 v22, v21 -; SI-NEXT: v_mov_b32_e32 v1, v29 -; SI-NEXT: v_mov_b32_e32 v5, v28 -; SI-NEXT: v_mov_b32_e32 v9, v27 -; SI-NEXT: v_mov_b32_e32 v13, v26 -; SI-NEXT: v_mov_b32_e32 v17, v25 -; SI-NEXT: v_mov_b32_e32 v21, v24 +; SI-NEXT: v_mov_b32_e32 v0, v34 +; SI-NEXT: v_mov_b32_e32 v2, v35 +; SI-NEXT: v_mov_b32_e32 v4, v32 +; SI-NEXT: v_mov_b32_e32 v6, v33 +; SI-NEXT: v_mov_b32_e32 v8, v30 +; SI-NEXT: v_mov_b32_e32 v10, v31 +; SI-NEXT: v_mov_b32_e32 v12, v28 +; SI-NEXT: v_mov_b32_e32 v14, v29 +; SI-NEXT: v_mov_b32_e32 v16, v26 +; SI-NEXT: v_mov_b32_e32 v18, v27 +; SI-NEXT: v_mov_b32_e32 v20, v24 +; SI-NEXT: v_mov_b32_e32 v22, v25 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v6f64_to_v24i16_scalar: @@ -9690,89 +9697,83 @@ define inreg <24 x i16> @bitcast_v6i64_to_v24i16_scalar(<6 x i64> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s26 -; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s25, 16 -; 
SI-NEXT: s_lshr_b32 s8, s23, 16 -; SI-NEXT: s_lshr_b32 s9, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 ; SI-NEXT: s_add_u32 s26, s26, 3 ; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: v_mov_b32_e32 v0, s26 -; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s27, 16 -; SI-NEXT: s_lshr_b32 s7, s25, 16 -; SI-NEXT: s_lshr_b32 s8, s23, 16 -; SI-NEXT: s_lshr_b32 s9, s21, 16 -; SI-NEXT: s_lshr_b32 s10, s19, 16 -; SI-NEXT: s_lshr_b32 s11, s17, 16 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s40, s27, 16 +; SI-NEXT: s_lshr_b32 s41, s25, 16 +; SI-NEXT: s_lshr_b32 s42, s23, 16 +; SI-NEXT: s_lshr_b32 s43, s21, 16 +; SI-NEXT: s_lshr_b32 s44, s19, 16 +; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[16:17], 16 ; SI-NEXT: .LBB49_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s14 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_mov_b32_e32 v3, s45 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s12 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s10 +; SI-NEXT: v_mov_b32_e32 v7, s44 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s10 ; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s9 +; SI-NEXT: v_mov_b32_e32 v11, s43 ; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s8 ; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s8 +; SI-NEXT: v_mov_b32_e32 v15, s42 ; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s6 ; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s7 +; SI-NEXT: v_mov_b32_e32 v19, s41 ; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 
v21, s4 ; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v23, s6 +; SI-NEXT: v_mov_b32_e32 v23, s40 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr41 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v6i64_to_v24i16_scalar: @@ -12464,149 +12465,150 @@ define inreg <24 x i16> @bitcast_v24f16_to_v24i16_scalar(<24 x half> inreg %a, i ; SI-LABEL: bitcast_v24f16_to_v24i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v11, v9 -; SI-NEXT: v_mov_b32_e32 v12, v8 -; SI-NEXT: v_mov_b32_e32 v13, v7 -; SI-NEXT: v_mov_b32_e32 v20, v6 -; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v13, v8 +; SI-NEXT: v_mov_b32_e32 v17, v7 ; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 -; SI-NEXT: v_mov_b32_e32 v15, v1 +; SI-NEXT: v_mov_b32_e32 v19, v3 ; SI-NEXT: v_mov_b32_e32 v14, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v27, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; 
SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v19 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v29 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_or_b32_e32 v14, v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v25 +; SI-NEXT: v_or_b32_e32 v18, v18, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v9, 
0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_or_b32_e32 v18, v18, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v14, v14, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v11 +; SI-NEXT: v_lshr_b64 v[27:28], v[1:2], 16 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_or_b32_e32 v10, v10, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v24 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshr_b64 v[28:29], v[9:10], 16 ; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v22, v22, v24 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[21:22], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 ; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v27 +; SI-NEXT: v_mov_b32_e32 v5, v25 +; SI-NEXT: v_mov_b32_e32 v9, v28 +; SI-NEXT: v_mov_b32_e32 v13, v29 +; SI-NEXT: v_mov_b32_e32 v17, v30 +; SI-NEXT: v_mov_b32_e32 v21, v31 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: s_branch .LBB59_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll 
b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll index eaf314d4b65dc..718851f97bade 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.448bit.ll @@ -2004,102 +2004,95 @@ define inreg <28 x i16> @bitcast_v14i32_to_v28i16_scalar(<14 x i32> inreg %a, i3 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s28 -; SI-NEXT: v_alignbit_b32 v25, s29, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s26 -; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: s_lshr_b32 s7, s27, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 16 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: s_lshr_b32 s11, s19, 16 -; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: v_mov_b32_e32 v0, s28 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: v_alignbit_b32 v25, s29, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s26 -; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: s_lshr_b32 s7, s27, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 16 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: s_lshr_b32 s11, s19, 16 -; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_lshr_b64 s[4:5], 
s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 ; SI-NEXT: .LBB13_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s40 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s12 +; SI-NEXT: v_mov_b32_e32 v3, s58 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s14 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v7, s57 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s12 ; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v11, s56 ; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s10 ; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v15, s47 ; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s8 ; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s8 +; SI-NEXT: v_mov_b32_e32 v19, s46 ; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v21, s6 ; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v23, s45 ; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_mov_b32_e32 v26, s29 -; SI-NEXT: v_mov_b32_e32 v27, s6 +; SI-NEXT: v_mov_b32_e32 v27, s44 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v14i32_to_v28i16_scalar: @@ -5543,96 +5536,111 @@ define inreg <28 x i16> @bitcast_v14f32_to_v28i16_scalar(<14 x float> inreg %a, ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB29_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s28 -; SI-NEXT: v_alignbit_b32 v25, s29, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s26 -; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s12, s29, 16 -; SI-NEXT: s_lshr_b32 
s11, s27, 16 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s58, s29, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s56, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s45, s19, 16 +; SI-NEXT: s_lshr_b32 s44, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_4 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_add_f32_e64 v6, s19, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v10, s21, 1.0 -; SI-NEXT: v_add_f32_e64 v8, s20, 1.0 -; SI-NEXT: v_add_f32_e64 v14, s23, 1.0 -; SI-NEXT: v_add_f32_e64 v12, s22, 1.0 -; SI-NEXT: v_add_f32_e64 v18, s25, 1.0 -; SI-NEXT: v_add_f32_e64 v16, s24, 1.0 -; SI-NEXT: v_add_f32_e64 v22, s27, 1.0 -; SI-NEXT: v_add_f32_e64 v20, s26, 1.0 -; SI-NEXT: v_add_f32_e64 v26, s29, 1.0 -; SI-NEXT: v_add_f32_e64 v24, s28, 1.0 -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v49, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v48, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v39, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v38, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v37, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v36, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v35, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v34, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v33, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v32, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v31, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v30, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v29, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v28, s28, 1.0 +; SI-NEXT: v_lshr_b64 v[25:26], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[30:31], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v49 +; SI-NEXT: s_branch .LBB29_5 ; SI-NEXT: .LBB29_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: 
$vgpr17 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: s_branch .LBB29_2 ; SI-NEXT: .LBB29_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v24, s28 -; SI-NEXT: v_mov_b32_e32 v26, s29 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; SI-NEXT: v_mov_b32_e32 v11, s8 -; SI-NEXT: v_mov_b32_e32 v15, s9 -; SI-NEXT: v_mov_b32_e32 v19, s10 -; SI-NEXT: v_mov_b32_e32 v23, s11 -; SI-NEXT: v_mov_b32_e32 v27, s12 +; SI-NEXT: v_mov_b32_e32 v48, s16 +; SI-NEXT: v_mov_b32_e32 v49, s17 +; SI-NEXT: v_mov_b32_e32 v38, s18 +; SI-NEXT: v_mov_b32_e32 v39, s19 +; SI-NEXT: v_mov_b32_e32 v36, s20 +; SI-NEXT: v_mov_b32_e32 v37, s21 +; SI-NEXT: v_mov_b32_e32 v34, s22 +; SI-NEXT: v_mov_b32_e32 v35, s23 +; SI-NEXT: v_mov_b32_e32 v32, s24 +; SI-NEXT: v_mov_b32_e32 v33, s25 +; SI-NEXT: v_mov_b32_e32 v30, s26 +; SI-NEXT: v_mov_b32_e32 v31, s27 +; SI-NEXT: v_mov_b32_e32 v28, s28 +; SI-NEXT: v_mov_b32_e32 v29, s29 +; SI-NEXT: v_mov_b32_e32 v3, s44 +; SI-NEXT: v_mov_b32_e32 v7, s45 +; SI-NEXT: v_mov_b32_e32 v11, s46 +; SI-NEXT: v_mov_b32_e32 v15, s47 +; SI-NEXT: v_mov_b32_e32 v19, s56 +; SI-NEXT: v_mov_b32_e32 v23, s57 +; SI-NEXT: v_mov_b32_e32 v27, s58 +; SI-NEXT: v_mov_b32_e32 v25, s4 +; SI-NEXT: v_mov_b32_e32 v21, s6 +; SI-NEXT: v_mov_b32_e32 v17, s8 +; SI-NEXT: v_mov_b32_e32 v13, s10 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v1, s40 +; SI-NEXT: .LBB29_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v48 +; SI-NEXT: v_mov_b32_e32 v2, v49 +; SI-NEXT: v_mov_b32_e32 v4, v38 +; SI-NEXT: v_mov_b32_e32 v6, v39 +; SI-NEXT: v_mov_b32_e32 v8, v36 +; SI-NEXT: v_mov_b32_e32 v10, v37 +; SI-NEXT: v_mov_b32_e32 v12, v34 +; SI-NEXT: v_mov_b32_e32 v14, v35 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: v_mov_b32_e32 v18, v33 +; SI-NEXT: v_mov_b32_e32 v20, v30 +; SI-NEXT: v_mov_b32_e32 v22, v31 +; SI-NEXT: v_mov_b32_e32 v24, v28 +; SI-NEXT: v_mov_b32_e32 v26, v29 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v14f32_to_v28i16_scalar: @@ -8475,102 +8483,95 @@ define inreg <28 x i16> @bitcast_v7i64_to_v28i16_scalar(<7 x i64> inreg %a, i32 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s28 -; SI-NEXT: v_alignbit_b32 v25, s29, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s26 -; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; 
SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: s_lshr_b32 s7, s27, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 16 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: s_lshr_b32 s11, s19, 16 -; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 ; SI-NEXT: s_add_u32 s28, s28, 3 ; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: v_mov_b32_e32 v0, s28 -; SI-NEXT: v_alignbit_b32 v25, s29, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s26 -; SI-NEXT: v_alignbit_b32 v21, s27, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v17, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v13, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v9, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s29, 16 -; SI-NEXT: s_lshr_b32 s7, s27, 16 -; SI-NEXT: s_lshr_b32 s8, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 16 -; SI-NEXT: s_lshr_b32 s10, s21, 16 -; SI-NEXT: s_lshr_b32 s11, s19, 16 -; SI-NEXT: s_lshr_b32 s12, s17, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s44, s29, 16 +; SI-NEXT: s_lshr_b32 s45, s27, 16 +; SI-NEXT: s_lshr_b32 s46, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s56, s21, 16 +; SI-NEXT: s_lshr_b32 s57, s19, 16 +; SI-NEXT: s_lshr_b32 s58, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 16 ; SI-NEXT: .LBB41_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s40 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s12 +; SI-NEXT: v_mov_b32_e32 v3, s58 ; SI-NEXT: v_mov_b32_e32 
v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s14 ; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v7, s57 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s12 ; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v11, s56 ; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s10 ; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v15, s9 +; SI-NEXT: v_mov_b32_e32 v15, s47 ; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s8 ; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v19, s8 +; SI-NEXT: v_mov_b32_e32 v19, s46 ; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v21, s6 ; SI-NEXT: v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v23, s7 +; SI-NEXT: v_mov_b32_e32 v23, s45 ; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: v_mov_b32_e32 v26, s29 -; SI-NEXT: v_mov_b32_e32 v27, s6 +; SI-NEXT: v_mov_b32_e32 v27, s44 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $sgpr46 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v7i64_to_v28i16_scalar: @@ -10809,104 +10810,104 @@ define inreg <28 x i16> @bitcast_v7f64_to_v28i16_scalar(<7 x double> inreg %a, i ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB49_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s28 -; SI-NEXT: v_alignbit_b32 v28, s29, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s26 -; SI-NEXT: v_alignbit_b32 v29, s27, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s24 -; SI-NEXT: v_alignbit_b32 v30, s25, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s22 -; SI-NEXT: v_alignbit_b32 v31, s23, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s20 -; SI-NEXT: v_alignbit_b32 v32, s21, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v33, s19, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v34, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s12, s29, 16 -; SI-NEXT: s_lshr_b32 s11, s27, 16 -; SI-NEXT: s_lshr_b32 s10, s25, 16 -; SI-NEXT: s_lshr_b32 s9, s23, 16 -; SI-NEXT: s_lshr_b32 s8, s21, 16 -; SI-NEXT: s_lshr_b32 s7, s19, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s58, s29, 16 +; SI-NEXT: s_lshr_b32 s57, s27, 16 +; SI-NEXT: s_lshr_b32 s56, s25, 16 +; SI-NEXT: s_lshr_b32 s47, s23, 16 +; SI-NEXT: s_lshr_b32 s46, s21, 16 +; SI-NEXT: s_lshr_b32 s45, s19, 16 +; SI-NEXT: s_lshr_b32 s44, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[16:17], 
16 ; SI-NEXT: s_cbranch_execnz .LBB49_4 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[4:5], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[8:9], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[12:13], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[16:17], s[24:25], 1.0 -; SI-NEXT: v_add_f64 v[24:25], s[28:29], 1.0 -; SI-NEXT: v_add_f64 v[20:21], s[26:27], 1.0 -; SI-NEXT: v_alignbit_b32 v28, v25, v24, 16 -; SI-NEXT: v_alignbit_b32 v29, v21, v20, 16 -; SI-NEXT: v_alignbit_b32 v30, v17, v16, 16 -; SI-NEXT: v_alignbit_b32 v31, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v32, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v33, v5, v4, 16 -; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_add_f64 v[28:29], s[28:29], 1.0 +; SI-NEXT: v_add_f64 v[30:31], s[26:27], 1.0 +; SI-NEXT: v_add_f64 v[32:33], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[34:35], s[22:23], 1.0 +; SI-NEXT: v_add_f64 v[36:37], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[38:39], s[18:19], 1.0 +; SI-NEXT: v_add_f64 v[48:49], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[25:26], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[30:31], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[34:35], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[48:49], 16 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v35 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v49 ; SI-NEXT: s_branch .LBB49_5 ; SI-NEXT: .LBB49_3: -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr47 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr58 ; SI-NEXT: s_branch .LBB49_2 ; SI-NEXT: .LBB49_4: -; SI-NEXT: v_mov_b32_e32 v5, s19 -; SI-NEXT: v_mov_b32_e32 v9, s21 -; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_mov_b32_e32 v21, s27 -; SI-NEXT: v_mov_b32_e32 v25, s29 -; SI-NEXT: v_mov_b32_e32 v24, s28 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v3, s6 -; SI-NEXT: v_mov_b32_e32 v7, s7 -; 
SI-NEXT: v_mov_b32_e32 v11, s8 -; SI-NEXT: v_mov_b32_e32 v15, s9 -; SI-NEXT: v_mov_b32_e32 v19, s10 -; SI-NEXT: v_mov_b32_e32 v23, s11 -; SI-NEXT: v_mov_b32_e32 v27, s12 +; SI-NEXT: v_mov_b32_e32 v29, s29 +; SI-NEXT: v_mov_b32_e32 v31, s27 +; SI-NEXT: v_mov_b32_e32 v33, s25 +; SI-NEXT: v_mov_b32_e32 v35, s23 +; SI-NEXT: v_mov_b32_e32 v37, s21 +; SI-NEXT: v_mov_b32_e32 v39, s19 +; SI-NEXT: v_mov_b32_e32 v49, s17 +; SI-NEXT: v_mov_b32_e32 v48, s16 +; SI-NEXT: v_mov_b32_e32 v38, s18 +; SI-NEXT: v_mov_b32_e32 v36, s20 +; SI-NEXT: v_mov_b32_e32 v34, s22 +; SI-NEXT: v_mov_b32_e32 v32, s24 +; SI-NEXT: v_mov_b32_e32 v30, s26 +; SI-NEXT: v_mov_b32_e32 v28, s28 +; SI-NEXT: v_mov_b32_e32 v27, s58 +; SI-NEXT: v_mov_b32_e32 v23, s57 +; SI-NEXT: v_mov_b32_e32 v19, s56 +; SI-NEXT: v_mov_b32_e32 v15, s47 +; SI-NEXT: v_mov_b32_e32 v11, s46 +; SI-NEXT: v_mov_b32_e32 v7, s45 +; SI-NEXT: v_mov_b32_e32 v3, s44 +; SI-NEXT: v_mov_b32_e32 v1, s40 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v9, s12 +; SI-NEXT: v_mov_b32_e32 v13, s10 +; SI-NEXT: v_mov_b32_e32 v17, s8 +; SI-NEXT: v_mov_b32_e32 v21, s6 +; SI-NEXT: v_mov_b32_e32 v25, s4 ; SI-NEXT: .LBB49_5: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v6, v5 -; SI-NEXT: v_mov_b32_e32 v10, v9 -; SI-NEXT: v_mov_b32_e32 v14, v13 -; SI-NEXT: v_mov_b32_e32 v18, v17 -; SI-NEXT: v_mov_b32_e32 v22, v21 -; SI-NEXT: v_mov_b32_e32 v26, v25 -; SI-NEXT: v_mov_b32_e32 v1, v34 -; SI-NEXT: v_mov_b32_e32 v5, v33 -; SI-NEXT: v_mov_b32_e32 v9, v32 -; SI-NEXT: v_mov_b32_e32 v13, v31 -; SI-NEXT: v_mov_b32_e32 v17, v30 -; SI-NEXT: v_mov_b32_e32 v21, v29 -; SI-NEXT: v_mov_b32_e32 v25, v28 +; SI-NEXT: v_mov_b32_e32 v0, v48 +; SI-NEXT: v_mov_b32_e32 v2, v49 +; SI-NEXT: v_mov_b32_e32 v4, v38 +; SI-NEXT: v_mov_b32_e32 v6, v39 +; SI-NEXT: v_mov_b32_e32 v8, v36 +; SI-NEXT: v_mov_b32_e32 v10, v37 +; SI-NEXT: v_mov_b32_e32 v12, v34 +; SI-NEXT: v_mov_b32_e32 v14, v35 +; SI-NEXT: v_mov_b32_e32 v16, v32 +; SI-NEXT: v_mov_b32_e32 v18, v33 +; SI-NEXT: v_mov_b32_e32 v20, v30 +; SI-NEXT: v_mov_b32_e32 v22, v31 +; SI-NEXT: v_mov_b32_e32 v24, v28 +; SI-NEXT: v_mov_b32_e32 v26, v29 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v7f64_to_v28i16_scalar: @@ -13866,83 +13867,107 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-LABEL: bitcast_v28f16_to_v28i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v19, v5 +; SI-NEXT: v_mov_b32_e32 v17, v12 +; SI-NEXT: v_mov_b32_e32 v21, v11 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v25, v7 ; SI-NEXT: v_mov_b32_e32 v18, v4 -; SI-NEXT: v_mov_b32_e32 v17, v3 -; SI-NEXT: v_mov_b32_e32 v16, v2 -; SI-NEXT: v_mov_b32_e32 v15, v1 -; SI-NEXT: v_mov_b32_e32 v20, v0 +; SI-NEXT: v_mov_b32_e32 v26, v3 +; SI-NEXT: v_mov_b32_e32 v28, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 +; SI-NEXT: 
v_cvt_f16_f32_e32 v19, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s21 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s25 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; 
SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v29 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_or_b32_e32 v18, v18, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 +; SI-NEXT: v_or_b32_e32 v14, v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v30 +; SI-NEXT: v_or_b32_e32 v22, v22, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 @@ -13950,48 +13975,30 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_or_b32_e32 v26, v26, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v22, v22, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_or_b32_e32 v18, v18, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: 
v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_or_b32_e32 v14, v14, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -13999,18 +14006,20 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v10, v10, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v28 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshr_b64 v[34:35], v[1:2], 16 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; SI-NEXT: v_lshr_b64 v[35:36], v[13:14], 16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_or_b32_e32 v26, v26, v28 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshr_b64 v[29:30], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[25:26], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 @@ -14018,14 +14027,14 @@ define inreg <28 x i16> @bitcast_v28f16_to_v28i16_scalar(<28 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 ; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v34 +; SI-NEXT: v_mov_b32_e32 v5, v32 +; SI-NEXT: v_mov_b32_e32 v9, v29 +; SI-NEXT: v_mov_b32_e32 v13, v35 +; SI-NEXT: v_mov_b32_e32 v17, v30 +; SI-NEXT: v_mov_b32_e32 v21, v36 +; SI-NEXT: v_mov_b32_e32 v25, v37 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: ; SI-NEXT: s_branch .LBB59_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll index fd190b23dd8ca..acc02472c7161 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll @@ -903,31 +903,32 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v4, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB5_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; SI-NEXT: 
v_lshrrev_b32_e32 v1, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshr_b64 v[3:4], v[1:2], 16 +; SI-NEXT: v_alignbit_b32 v0, v5, v0, 16 ; SI-NEXT: .LBB5_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_branch .LBB5_2 ; @@ -1391,26 +1392,27 @@ define inreg <3 x i16> @bitcast_v3f16_to_v3i16_scalar(<3 x half> inreg %a, i32 i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB9_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB9_3 ; SI-NEXT: .LBB9_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshr_b64 v[3:4], v[1:2], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: .LBB9_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v3 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB9_4: ; SI-NEXT: s_branch .LBB9_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 352b2cb7123b1..e33493c6a760e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -2143,96 +2143,113 @@ define inreg <32 x i16> @bitcast_v16i32_to_v32i16_scalar(<16 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v16i32_to_v32i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_mov_b32_e32 v30, v1 -; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v0 +; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: s_cbranch_scc0 .LBB13_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s29, 16 +; SI-NEXT: s_lshr_b32 s58, s27, 16 +; SI-NEXT: 
s_lshr_b32 s59, s25, 16 +; SI-NEXT: s_lshr_b32 s60, s23, 16 +; SI-NEXT: s_lshr_b32 s61, s21, 16 +; SI-NEXT: s_lshr_b32 s62, s19, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB13_3 +; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s29, 16 +; SI-NEXT: s_lshr_b32 s58, s27, 16 +; SI-NEXT: s_lshr_b32 s59, s25, 16 +; SI-NEXT: s_lshr_b32 s60, s23, 16 +; SI-NEXT: s_lshr_b32 s61, s21, 16 +; SI-NEXT: s_lshr_b32 s62, s19, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 16 +; SI-NEXT: .LBB13_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s44 ; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s63 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s42 ; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s62 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s40 ; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s61 ; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s14 ; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s60 ; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s12 ; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s59 ; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v21, s10 ; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v23, s58 ; SI-NEXT: v_mov_b32_e32 v24, s28 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v25, s8 ; SI-NEXT: v_mov_b32_e32 v26, s29 -; SI-NEXT: s_cbranch_scc0 .LBB13_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: 
s_cbranch_execnz .LBB13_3 -; SI-NEXT: .LBB13_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v2 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_add_i32_e32 v6, vcc, 3, v6 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_add_i32_e32 v10, vcc, 3, v10 -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_add_i32_e32 v14, vcc, 3, v14 -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_add_i32_e32 v18, vcc, 3, v18 -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_add_i32_e32 v22, vcc, 3, v22 -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_add_i32_e32 v26, vcc, 3, v26 -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_add_i32_e32 v30, vcc, 3, v30 -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: .LBB13_3: ; %end +; SI-NEXT: v_mov_b32_e32 v27, s57 +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: v_mov_b32_e32 v29, s6 +; SI-NEXT: v_mov_b32_e32 v30, s5 +; SI-NEXT: v_mov_b32_e32 v31, s56 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v16i32_to_v32i16_scalar: @@ -9385,386 +9402,449 @@ define inreg <64 x i8> @bitcast_v16i32_to_v64i8_scalar(<16 x i32> inreg %a, i32 ; SI-LABEL: bitcast_v16i32_to_v64i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v4, s30, 0 +; SI-NEXT: v_writelane_b32 v4, s31, 1 +; SI-NEXT: v_writelane_b32 v4, s34, 2 +; SI-NEXT: v_writelane_b32 v4, s35, 3 +; SI-NEXT: v_writelane_b32 v4, s36, 4 +; SI-NEXT: v_writelane_b32 v4, s37, 5 +; SI-NEXT: 
v_writelane_b32 v4, s38, 6 +; SI-NEXT: v_writelane_b32 v4, s39, 7 +; SI-NEXT: v_writelane_b32 v4, s48, 8 +; SI-NEXT: v_writelane_b32 v4, s49, 9 +; SI-NEXT: v_writelane_b32 v4, s50, 10 +; SI-NEXT: v_writelane_b32 v4, s51, 11 +; SI-NEXT: v_writelane_b32 v4, s52, 12 +; SI-NEXT: v_writelane_b32 v4, s53, 13 +; SI-NEXT: v_writelane_b32 v4, s54, 14 +; SI-NEXT: v_writelane_b32 v4, s55, 15 +; SI-NEXT: v_writelane_b32 v4, s64, 16 +; SI-NEXT: v_writelane_b32 v4, s65, 17 +; SI-NEXT: v_writelane_b32 v4, s66, 18 +; SI-NEXT: v_writelane_b32 v4, s67, 19 +; SI-NEXT: v_writelane_b32 v4, s68, 20 +; SI-NEXT: v_writelane_b32 v4, s69, 21 +; SI-NEXT: v_writelane_b32 v4, s70, 22 +; SI-NEXT: v_writelane_b32 v4, s71, 23 +; SI-NEXT: v_writelane_b32 v4, s80, 24 +; SI-NEXT: v_writelane_b32 v4, s81, 25 +; SI-NEXT: v_writelane_b32 v4, s82, 26 +; SI-NEXT: v_writelane_b32 v4, s83, 27 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_readfirstlane_b32 s7, v1 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_writelane_b32 v4, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: v_writelane_b32 v4, s85, 29 ; SI-NEXT: s_cbranch_scc0 .LBB25_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v9, s26 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v6, s28 -; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s27, v9, 8 -; SI-NEXT: v_mov_b32_e32 v9, s24 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_mov_b32_e32 v21, s18 -; SI-NEXT: v_mov_b32_e32 v22, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 -; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 -; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 -; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 -; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 -; SI-NEXT: v_alignbit_b32 v13, s25, v9, 24 -; SI-NEXT: v_alignbit_b32 v15, s25, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s25, v9, 8 -; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 -; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 -; SI-NEXT: v_alignbit_b32 v14, s23, v14, 8 -; SI-NEXT: v_alignbit_b32 v16, s21, v18, 24 -; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 -; SI-NEXT: v_alignbit_b32 v18, s21, v18, 8 -; SI-NEXT: v_alignbit_b32 v19, s19, v21, 24 -; SI-NEXT: v_alignbit_b32 v20, s19, v21, 16 -; SI-NEXT: v_alignbit_b32 v21, s19, v21, 8 -; SI-NEXT: v_alignbit_b32 v23, s17, v22, 24 -; SI-NEXT: v_alignbit_b32 v24, s17, v22, 16 -; SI-NEXT: v_alignbit_b32 v22, s17, v22, 8 -; SI-NEXT: s_lshr_b32 s8, s6, 24 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: s_lshr_b32 s10, s6, 8 -; SI-NEXT: s_lshr_b32 s11, s29, 24 -; SI-NEXT: s_lshr_b32 s12, s29, 16 -; SI-NEXT: s_lshr_b32 s13, s29, 8 -; SI-NEXT: s_lshr_b32 s14, s27, 24 -; SI-NEXT: s_lshr_b32 s15, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s27, 8 -; SI-NEXT: s_lshr_b32 s41, s25, 24 -; SI-NEXT: s_lshr_b32 s42, s25, 16 -; SI-NEXT: s_lshr_b32 s43, s25, 8 -; SI-NEXT: s_lshr_b32 s44, s23, 24 -; SI-NEXT: s_lshr_b32 s45, s23, 16 -; SI-NEXT: s_lshr_b32 s46, s23, 8 -; SI-NEXT: s_lshr_b32 s47, s21, 24 -; SI-NEXT: s_lshr_b32 s56, s21, 16 -; SI-NEXT: s_lshr_b32 s57, s21, 8 -; SI-NEXT: s_lshr_b32 s58, s19, 24 -; SI-NEXT: s_lshr_b32 s59, s19, 16 -; SI-NEXT: s_lshr_b32 s60, s19, 8 -; SI-NEXT: s_lshr_b32 s61, s17, 24 -; SI-NEXT: s_lshr_b32 s62, s17, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: s_lshr_b32 s38, s5, 24 +; SI-NEXT: s_lshr_b32 s39, s5, 16 +; SI-NEXT: s_lshr_b32 
s48, s5, 8 +; SI-NEXT: s_lshr_b32 s49, s29, 24 +; SI-NEXT: s_lshr_b32 s50, s29, 16 +; SI-NEXT: s_lshr_b32 s51, s29, 8 +; SI-NEXT: s_lshr_b32 s52, s27, 24 +; SI-NEXT: s_lshr_b32 s53, s27, 16 +; SI-NEXT: s_lshr_b32 s54, s27, 8 +; SI-NEXT: s_lshr_b32 s55, s25, 24 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s25, 8 +; SI-NEXT: s_lshr_b32 s66, s23, 24 +; SI-NEXT: s_lshr_b32 s67, s23, 16 +; SI-NEXT: s_lshr_b32 s68, s23, 8 +; SI-NEXT: s_lshr_b32 s69, s21, 24 +; SI-NEXT: s_lshr_b32 s70, s21, 16 +; SI-NEXT: s_lshr_b32 s71, s21, 8 +; SI-NEXT: s_lshr_b32 s80, s19, 24 +; SI-NEXT: s_lshr_b32 s81, s19, 16 +; SI-NEXT: s_lshr_b32 s82, s19, 8 +; SI-NEXT: s_lshr_b32 s83, s17, 24 +; SI-NEXT: s_lshr_b32 s84, s17, 16 +; SI-NEXT: s_lshr_b32 s85, s17, 8 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24 +; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB25_3 ; SI-NEXT: .LBB25_2: ; %cmp.true -; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: v_mov_b32_e32 v9, s26 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 +; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v6, s28 -; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s27, v9, 8 -; SI-NEXT: v_mov_b32_e32 v9, s24 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_mov_b32_e32 v21, s18 -; SI-NEXT: v_mov_b32_e32 v22, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 -; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 -; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 -; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 -; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 -; SI-NEXT: v_alignbit_b32 v13, s25, v9, 24 -; SI-NEXT: v_alignbit_b32 v15, s25, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s25, v9, 8 -; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 -; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 -; SI-NEXT: 
v_alignbit_b32 v14, s23, v14, 8 -; SI-NEXT: v_alignbit_b32 v16, s21, v18, 24 -; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 -; SI-NEXT: v_alignbit_b32 v18, s21, v18, 8 -; SI-NEXT: v_alignbit_b32 v19, s19, v21, 24 -; SI-NEXT: v_alignbit_b32 v20, s19, v21, 16 -; SI-NEXT: v_alignbit_b32 v21, s19, v21, 8 -; SI-NEXT: v_alignbit_b32 v23, s17, v22, 24 -; SI-NEXT: v_alignbit_b32 v24, s17, v22, 16 -; SI-NEXT: v_alignbit_b32 v22, s17, v22, 8 -; SI-NEXT: s_lshr_b32 s8, s6, 24 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: s_lshr_b32 s10, s6, 8 -; SI-NEXT: s_lshr_b32 s11, s29, 24 -; SI-NEXT: s_lshr_b32 s12, s29, 16 -; SI-NEXT: s_lshr_b32 s13, s29, 8 -; SI-NEXT: s_lshr_b32 s14, s27, 24 -; SI-NEXT: s_lshr_b32 s15, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s27, 8 -; SI-NEXT: s_lshr_b32 s41, s25, 24 -; SI-NEXT: s_lshr_b32 s42, s25, 16 -; SI-NEXT: s_lshr_b32 s43, s25, 8 -; SI-NEXT: s_lshr_b32 s44, s23, 24 -; SI-NEXT: s_lshr_b32 s45, s23, 16 -; SI-NEXT: s_lshr_b32 s46, s23, 8 -; SI-NEXT: s_lshr_b32 s47, s21, 24 -; SI-NEXT: s_lshr_b32 s56, s21, 16 -; SI-NEXT: s_lshr_b32 s57, s21, 8 -; SI-NEXT: s_lshr_b32 s58, s19, 24 -; SI-NEXT: s_lshr_b32 s59, s19, 16 -; SI-NEXT: s_lshr_b32 s60, s19, 8 -; SI-NEXT: s_lshr_b32 s61, s17, 24 -; SI-NEXT: s_lshr_b32 s62, s17, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24 +; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 16 +; SI-NEXT: s_lshr_b32 s38, s5, 24 +; SI-NEXT: s_lshr_b32 s39, s5, 16 +; SI-NEXT: s_lshr_b32 s48, s5, 8 +; SI-NEXT: s_lshr_b32 s49, s29, 24 +; SI-NEXT: s_lshr_b32 s50, s29, 16 +; SI-NEXT: s_lshr_b32 s51, s29, 8 +; SI-NEXT: s_lshr_b32 s52, s27, 24 +; SI-NEXT: s_lshr_b32 s53, s27, 16 +; SI-NEXT: s_lshr_b32 s54, s27, 8 +; SI-NEXT: s_lshr_b32 s55, s25, 24 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s25, 8 +; SI-NEXT: s_lshr_b32 s66, s23, 24 +; SI-NEXT: s_lshr_b32 s67, s23, 16 +; SI-NEXT: s_lshr_b32 s68, s23, 8 +; SI-NEXT: s_lshr_b32 s69, s21, 24 +; SI-NEXT: s_lshr_b32 s70, s21, 16 +; SI-NEXT: s_lshr_b32 s71, s21, 8 +; SI-NEXT: s_lshr_b32 s80, s19, 24 +; SI-NEXT: s_lshr_b32 s81, s19, 16 +; SI-NEXT: s_lshr_b32 s82, s19, 8 +; SI-NEXT: s_lshr_b32 s83, s17, 24 +; SI-NEXT: s_lshr_b32 s84, s17, 16 +; SI-NEXT: s_lshr_b32 s85, s17, 8 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8 ; SI-NEXT: .LBB25_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; SI-NEXT: v_or_b32_e32 v22, s4, v22 -; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s63, 8 -; SI-NEXT: 
s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s62, 0xff -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s61, 24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_mov_b32_e32 v23, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; SI-NEXT: v_or_b32_e32 v21, s4, v21 -; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s5, s60, 8 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s59, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s58, 24 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v20, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; SI-NEXT: v_or_b32_e32 v18, s4, v18 -; SI-NEXT: s_and_b32 s4, s21, 0xff -; SI-NEXT: s_lshl_b32 s5, s57, 8 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s56, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s47, 24 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v17, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v14, s4, v14 -; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s46, 8 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s45, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s44, 24 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_lshl_b32 s7, s36, 8 +; SI-NEXT: s_and_b32 s9, s16, 0xff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: s_and_b32 s9, s34, 0xff +; SI-NEXT: s_lshl_b32 s11, s30, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xff +; SI-NEXT: s_lshl_b32 s9, s85, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s84, 0xff +; SI-NEXT: s_lshl_b32 s9, 
s9, 16 +; SI-NEXT: s_lshl_b32 s11, s83, 24 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_lshl_b32 s7, s94, 8 +; SI-NEXT: s_and_b32 s9, s18, 0xff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: s_and_b32 s9, s92, 0xff +; SI-NEXT: s_lshl_b32 s11, s90, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xff +; SI-NEXT: s_lshl_b32 s9, s82, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s81, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s80, 24 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s25, 0xff -; SI-NEXT: s_lshl_b32 s5, s43, 8 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xff, v15 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s42, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s20, 0xff +; SI-NEXT: s_lshl_b32 s9, s78, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s76, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s74, 24 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s21, 0xff +; SI-NEXT: s_lshl_b32 s9, s71, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s70, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s69, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v13 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s41, 24 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s9, s88, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s72, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s62, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 
s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s23, 0xff +; SI-NEXT: s_lshl_b32 s9, s68, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s67, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s66, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s24, 0xff +; SI-NEXT: s_lshl_b32 s9, s60, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s58, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s56, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s25, 0xff +; SI-NEXT: s_lshl_b32 s9, s65, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s64, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s55, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: buffer_store_dword v11, v9, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v10 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s27, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s15, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s14, s14, 24 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s14, s5 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s29, 0xff -; SI-NEXT: s_lshl_b32 s5, s13, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s12, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s11, s11, 24 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s9, s46, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s44, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s42, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 
s7, s27, 0xff +; SI-NEXT: s_lshl_b32 s9, s54, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s53, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s52, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s40, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s14, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s12, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s29, 0xff +; SI-NEXT: s_lshl_b32 s9, s51, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s50, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s49, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s7, s10, 8 +; SI-NEXT: s_or_b32 s4, s4, s7 +; SI-NEXT: s_and_b32 s7, s8, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s11, s5 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xff +; SI-NEXT: s_lshl_b32 s5, s48, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s9, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_and_b32 s5, s39, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s8, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_lshl_b32 s6, s38, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s85, v4, 29 +; SI-NEXT: v_readlane_b32 s84, v4, 28 +; SI-NEXT: 
v_readlane_b32 s83, v4, 27 +; SI-NEXT: v_readlane_b32 s82, v4, 26 +; SI-NEXT: v_readlane_b32 s81, v4, 25 +; SI-NEXT: v_readlane_b32 s80, v4, 24 +; SI-NEXT: v_readlane_b32 s71, v4, 23 +; SI-NEXT: v_readlane_b32 s70, v4, 22 +; SI-NEXT: v_readlane_b32 s69, v4, 21 +; SI-NEXT: v_readlane_b32 s68, v4, 20 +; SI-NEXT: v_readlane_b32 s67, v4, 19 +; SI-NEXT: v_readlane_b32 s66, v4, 18 +; SI-NEXT: v_readlane_b32 s65, v4, 17 +; SI-NEXT: v_readlane_b32 s64, v4, 16 +; SI-NEXT: v_readlane_b32 s55, v4, 15 +; SI-NEXT: v_readlane_b32 s54, v4, 14 +; SI-NEXT: v_readlane_b32 s53, v4, 13 +; SI-NEXT: v_readlane_b32 s52, v4, 12 +; SI-NEXT: v_readlane_b32 s51, v4, 11 +; SI-NEXT: v_readlane_b32 s50, v4, 10 +; SI-NEXT: v_readlane_b32 s49, v4, 9 +; SI-NEXT: v_readlane_b32 s48, v4, 8 +; SI-NEXT: v_readlane_b32 s39, v4, 7 +; SI-NEXT: v_readlane_b32 s38, v4, 6 +; SI-NEXT: v_readlane_b32 s37, v4, 5 +; SI-NEXT: v_readlane_b32 s36, v4, 4 +; SI-NEXT: v_readlane_b32 s35, v4, 3 +; SI-NEXT: v_readlane_b32 s34, v4, 2 +; SI-NEXT: v_readlane_b32 s31, v4, 1 +; SI-NEXT: v_readlane_b32 s30, v4, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB25_4: -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr59 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; 
implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB25_2 ; ; VI-LABEL: bitcast_v16i32_to_v64i8_scalar: @@ -13390,42 +13470,46 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v48, v28 +; SI-NEXT: v_mov_b32_e32 v38, v26 +; SI-NEXT: v_mov_b32_e32 v49, v24 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v44, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 
offset:24 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v17 ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 @@ -13437,49 +13521,48 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB27_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, 
v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 @@ -13487,113 +13570,80 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v26, v8 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_or_b32_e32 v0, v0, v45 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_mov_b32_e32 v44, v10 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 -; SI-NEXT: v_mov_b32_e32 v36, v12 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, 
v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v62 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -13602,6 +13652,7 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s22, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_or_b32 s5, s5, s6 @@ -13610,65 +13661,116 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, 
s6, s7 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_cbranch_execnz .LBB27_3 ; SI-NEXT: .LBB27_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v63, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v0, s4, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v61, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 
v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v48 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -13710,100 +13812,58 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; 
SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -13826,47 +13886,12 @@ define inreg 
<16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB27_4: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_mov_b32_e32 v29, v8 -; SI-NEXT: v_mov_b32_e32 v44, v10 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v52, v14 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_mov_b32_e32 v40, v5 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v56, v9 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch .LBB27_2 ; @@ -13889,142 +13914,121 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v37, v30 -; VI-NEXT: v_mov_b32_e32 v61, v28 -; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: v_mov_b32_e32 v36, v28 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v34, v24 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: v_mov_b32_e32 v48, v12 +; VI-NEXT: v_mov_b32_e32 v49, v10 +; VI-NEXT: v_mov_b32_e32 v50, v8 +; VI-NEXT: v_mov_b32_e32 v51, v6 +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v45, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], 
s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; VI-NEXT: v_mov_b32_e32 v37, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v17 ; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; VI-NEXT: 
v_lshlrev_b32_e32 v17, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB27_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v26, v4 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v20, v47 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v61, v23 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -14054,47 +14058,91 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v30, v37 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB27_3 ; VI-NEXT: .LBB27_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v45 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 
v1, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 @@ -14134,76 +14182,35 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB27_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -14224,43 +14231,9 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB27_4: -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v35, v4 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_mov_b32_e32 v39, v10 -; VI-NEXT: v_mov_b32_e32 v43, v12 -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v30, v37 -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_mov_b32_e32 v41, v5 -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 -; VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v61, v23 -; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 v27, v42 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB27_2 ; @@ -14283,147 +14256,124 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v37, v30 -; GFX9-NEXT: v_mov_b32_e32 v61, v28 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: v_mov_b32_e32 v36, v28 +; GFX9-NEXT: v_mov_b32_e32 v35, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v24 +; GFX9-NEXT: 
v_mov_b32_e32 v39, v14 +; GFX9-NEXT: v_mov_b32_e32 v48, v12 +; GFX9-NEXT: v_mov_b32_e32 v49, v10 +; GFX9-NEXT: v_mov_b32_e32 v50, v8 +; GFX9-NEXT: v_mov_b32_e32 v51, v6 +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v45, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 
8, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB27_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v26, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa 
v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: 
v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -14453,48 +14403,78 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB27_3 ; GFX9-NEXT: .LBB27_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v1, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v26 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s5, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s29, 8 ; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 -; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 ; GFX9-NEXT: s_addk_i32 s5, 0x300 -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v35 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_and_b32 s5, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s17, 8 @@ 
-14518,6 +14498,20 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s7, s8, s7 ; GFX9-NEXT: s_addk_i32 s6, 0x300 ; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_add_i32 s24, s24, 3 @@ -14534,76 +14528,35 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s7, s7, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v16 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: 
v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: 
v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: .LBB27_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -14624,43 +14577,9 @@ define inreg <16 x i32> @bitcast_v64i8_to_v16i32_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB27_4: -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v35, v4 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 -; GFX9-NEXT: v_mov_b32_e32 v43, v12 -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_mov_b32_e32 v41, v5 -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v7 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB27_2 ; @@ -16911,78 +16830,93 @@ define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a, ; SI-LABEL: bitcast_v16f32_to_v32i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_mov_b32_e32 v30, v1 -; SI-NEXT: v_mov_b32_e32 v28, v0 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v6, s19 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v10, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v14, s23 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v18, s25 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: 
v_mov_b32_e32 v22, s27 -; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_mov_b32_e32 v33, v1 +; SI-NEXT: v_mov_b32_e32 v32, v0 +; SI-NEXT: v_mov_b32_e32 v34, s16 +; SI-NEXT: v_mov_b32_e32 v35, s17 +; SI-NEXT: v_mov_b32_e32 v36, s18 +; SI-NEXT: v_mov_b32_e32 v37, s19 +; SI-NEXT: v_mov_b32_e32 v38, s20 +; SI-NEXT: v_mov_b32_e32 v39, s21 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v50, s24 +; SI-NEXT: v_mov_b32_e32 v51, s25 +; SI-NEXT: v_mov_b32_e32 v52, s26 +; SI-NEXT: v_mov_b32_e32 v53, s27 +; SI-NEXT: v_mov_b32_e32 v54, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v26, s29 +; SI-NEXT: v_mov_b32_e32 v55, s29 ; SI-NEXT: s_cbranch_scc0 .LBB37_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 ; SI-NEXT: s_cbranch_execnz .LBB37_3 ; SI-NEXT: .LBB37_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v0, 1.0, v0 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; 
SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: v_add_f32_e32 v35, 1.0, v35 +; SI-NEXT: v_add_f32_e32 v34, 1.0, v34 +; SI-NEXT: v_add_f32_e32 v37, 1.0, v37 +; SI-NEXT: v_add_f32_e32 v36, 1.0, v36 +; SI-NEXT: v_add_f32_e32 v39, 1.0, v39 +; SI-NEXT: v_add_f32_e32 v38, 1.0, v38 +; SI-NEXT: v_add_f32_e32 v49, 1.0, v49 +; SI-NEXT: v_add_f32_e32 v48, 1.0, v48 +; SI-NEXT: v_add_f32_e32 v51, 1.0, v51 +; SI-NEXT: v_add_f32_e32 v50, 1.0, v50 +; SI-NEXT: v_add_f32_e32 v53, 1.0, v53 +; SI-NEXT: v_add_f32_e32 v52, 1.0, v52 +; SI-NEXT: v_add_f32_e32 v55, 1.0, v55 +; SI-NEXT: v_add_f32_e32 v54, 1.0, v54 +; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 +; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 +; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 ; SI-NEXT: .LBB37_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v34 +; SI-NEXT: v_mov_b32_e32 v2, v35 +; SI-NEXT: v_mov_b32_e32 v4, v36 +; SI-NEXT: v_mov_b32_e32 v6, v37 +; SI-NEXT: v_mov_b32_e32 v8, v38 +; SI-NEXT: v_mov_b32_e32 v10, v39 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v14, v49 +; SI-NEXT: v_mov_b32_e32 v16, v50 +; SI-NEXT: v_mov_b32_e32 v18, v51 +; SI-NEXT: v_mov_b32_e32 v20, v52 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v24, v54 +; SI-NEXT: v_mov_b32_e32 v26, v55 +; SI-NEXT: v_mov_b32_e32 v28, v32 +; SI-NEXT: v_mov_b32_e32 v30, v33 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB37_4: ; SI-NEXT: ; implicit-def: $vgpr1 @@ -16997,10 +16931,10 @@ define inreg <32 x i16> @bitcast_v16f32_to_v32i16_scalar(<16 x float> inreg %a, ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_branch .LBB37_2 ; ; VI-LABEL: bitcast_v16f32_to_v32i16_scalar: @@ -24115,433 +24049,494 @@ define inreg <64 x i8> @bitcast_v16f32_to_v64i8_scalar(<16 x float> inreg %a, i3 ; SI-LABEL: bitcast_v16f32_to_v64i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v40, s30, 0 +; SI-NEXT: v_writelane_b32 v40, s31, 1 +; SI-NEXT: v_writelane_b32 v40, s34, 2 +; SI-NEXT: v_writelane_b32 v40, s35, 3 +; SI-NEXT: v_writelane_b32 v40, s36, 4 +; SI-NEXT: v_writelane_b32 v40, s37, 5 +; SI-NEXT: v_writelane_b32 v40, s38, 6 +; SI-NEXT: v_writelane_b32 v40, s39, 7 +; SI-NEXT: v_writelane_b32 v40, s48, 8 +; SI-NEXT: v_writelane_b32 v40, s49, 9 +; SI-NEXT: v_writelane_b32 v40, s50, 10 +; SI-NEXT: v_writelane_b32 v40, s51, 11 +; SI-NEXT: v_writelane_b32 v40, s52, 12 +; SI-NEXT: v_writelane_b32 v40, s53, 13 +; SI-NEXT: v_writelane_b32 v40, 
s54, 14 +; SI-NEXT: v_writelane_b32 v40, s55, 15 +; SI-NEXT: v_writelane_b32 v40, s64, 16 +; SI-NEXT: v_writelane_b32 v40, s65, 17 +; SI-NEXT: v_writelane_b32 v40, s66, 18 +; SI-NEXT: v_writelane_b32 v40, s67, 19 +; SI-NEXT: v_writelane_b32 v40, s68, 20 +; SI-NEXT: v_writelane_b32 v40, s69, 21 +; SI-NEXT: v_writelane_b32 v40, s70, 22 +; SI-NEXT: v_writelane_b32 v40, s71, 23 +; SI-NEXT: v_writelane_b32 v40, s80, 24 +; SI-NEXT: v_writelane_b32 v40, s81, 25 +; SI-NEXT: v_writelane_b32 v40, s82, 26 +; SI-NEXT: v_writelane_b32 v40, s83, 27 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_mov_b32_e32 v28, s16 -; SI-NEXT: v_mov_b32_e32 v25, s17 -; SI-NEXT: v_mov_b32_e32 v20, s18 -; SI-NEXT: v_mov_b32_e32 v19, s19 +; SI-NEXT: v_writelane_b32 v40, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s36, v1 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s37, v2 +; SI-NEXT: v_writelane_b32 v40, s85, 29 +; SI-NEXT: s_cbranch_scc0 .LBB49_3 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s82, s37, 24 +; SI-NEXT: s_lshr_b32 s84, s37, 16 +; SI-NEXT: s_lshr_b32 s85, s37, 8 +; SI-NEXT: s_lshr_b32 s71, s29, 24 +; SI-NEXT: s_lshr_b32 s81, s29, 16 +; SI-NEXT: s_lshr_b32 s83, s29, 8 +; SI-NEXT: s_lshr_b32 s68, s27, 24 +; SI-NEXT: s_lshr_b32 s70, s27, 16 +; SI-NEXT: s_lshr_b32 s80, s27, 8 +; SI-NEXT: s_lshr_b32 s65, s25, 24 +; SI-NEXT: s_lshr_b32 s67, s25, 16 +; SI-NEXT: s_lshr_b32 s69, s25, 8 +; SI-NEXT: s_lshr_b32 s54, s23, 24 +; SI-NEXT: s_lshr_b32 s64, s23, 16 +; SI-NEXT: s_lshr_b32 s66, s23, 8 +; SI-NEXT: s_lshr_b32 s51, s21, 24 +; SI-NEXT: s_lshr_b32 s53, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s21, 8 +; SI-NEXT: s_lshr_b32 s48, s19, 24 +; SI-NEXT: s_lshr_b32 s50, s19, 16 +; SI-NEXT: s_lshr_b32 s52, s19, 8 +; SI-NEXT: s_lshr_b32 s38, s17, 24 +; SI-NEXT: s_lshr_b32 s39, s17, 16 +; SI-NEXT: s_lshr_b32 s49, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[36:37], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[36:37], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[36:37], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[28:29], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[88:89], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 8 +; SI-NEXT: s_cbranch_execnz .LBB49_4 +; SI-NEXT: .LBB49_2: ; %cmp.true +; SI-NEXT: v_add_f32_e64 v20, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v22, s16, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s19, 1.0 +; SI-NEXT: v_add_f32_e64 v18, s18, 1.0 +; SI-NEXT: v_add_f32_e64 v11, s21, 1.0 +; SI-NEXT: v_add_f32_e64 v15, s20, 1.0 +; SI-NEXT: v_add_f32_e64 v9, s23, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s22, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s25, 1.0 +; SI-NEXT: v_add_f32_e64 v8, s24, 1.0 +; SI-NEXT: v_add_f32_e64 v5, s27, 1.0 +; SI-NEXT: v_add_f32_e64 v6, s26, 1.0 +; SI-NEXT: v_add_f32_e64 v3, s29, 1.0 +; SI-NEXT: v_add_f32_e64 v4, 
s28, 1.0 +; SI-NEXT: v_add_f32_e64 v1, s37, 1.0 +; SI-NEXT: v_add_f32_e64 v2, s36, 1.0 +; SI-NEXT: v_readfirstlane_b32 s16, v22 +; SI-NEXT: v_readfirstlane_b32 s17, v20 +; SI-NEXT: v_readfirstlane_b32 s18, v18 +; SI-NEXT: v_readfirstlane_b32 s19, v16 +; SI-NEXT: v_readfirstlane_b32 s20, v15 +; SI-NEXT: v_readfirstlane_b32 s21, v11 +; SI-NEXT: v_readfirstlane_b32 s22, v10 +; SI-NEXT: v_readfirstlane_b32 s23, v9 +; SI-NEXT: v_readfirstlane_b32 s24, v8 +; SI-NEXT: v_readfirstlane_b32 s25, v7 +; SI-NEXT: v_readfirstlane_b32 s26, v6 +; SI-NEXT: v_readfirstlane_b32 s27, v5 +; SI-NEXT: v_readfirstlane_b32 s14, v4 +; SI-NEXT: v_readfirstlane_b32 s15, v3 +; SI-NEXT: v_readfirstlane_b32 s8, v2 +; SI-NEXT: v_readfirstlane_b32 s9, v1 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[40:41], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[72:73], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[76:77], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[88:89], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[94:95], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 8 +; SI-NEXT: v_lshrrev_b32_e32 v12, 24, v1 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v14, 8, v1 +; SI-NEXT: v_lshrrev_b32_e32 v17, 24, v3 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v21, 8, v3 +; SI-NEXT: v_lshrrev_b32_e32 v23, 24, v5 +; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v25, 8, v5 +; SI-NEXT: v_lshrrev_b32_e32 v26, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v7 +; SI-NEXT: v_lshrrev_b32_e32 v29, 24, v9 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v11 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v11 +; SI-NEXT: v_lshrrev_b32_e32 v35, 24, v16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v37, 8, v16 +; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v20 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v20 +; SI-NEXT: s_branch .LBB49_5 +; SI-NEXT: .LBB49_3: +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: 
$sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_branch .LBB49_2 +; SI-NEXT: .LBB49_4: +; SI-NEXT: v_mov_b32_e32 v22, s16 +; SI-NEXT: v_mov_b32_e32 v20, s17 +; SI-NEXT: v_mov_b32_e32 v18, s18 +; SI-NEXT: v_mov_b32_e32 v16, s19 ; SI-NEXT: v_mov_b32_e32 v15, s20 -; SI-NEXT: v_mov_b32_e32 v14, s21 -; SI-NEXT: v_mov_b32_e32 v11, s22 +; SI-NEXT: v_mov_b32_e32 v11, s21 +; SI-NEXT: v_mov_b32_e32 v10, s22 ; SI-NEXT: v_mov_b32_e32 v9, s23 ; SI-NEXT: v_mov_b32_e32 v8, s24 ; SI-NEXT: v_mov_b32_e32 v7, s25 ; SI-NEXT: v_mov_b32_e32 v6, s26 ; SI-NEXT: v_mov_b32_e32 v5, s27 ; SI-NEXT: v_mov_b32_e32 v4, s28 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v3, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: s_cbranch_scc0 .LBB49_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v10, v2, v1, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v13, v2, v1, 8 -; SI-NEXT: v_alignbit_b32 v16, v3, v4, 24 -; SI-NEXT: v_alignbit_b32 v17, v3, v4, 16 -; SI-NEXT: v_alignbit_b32 v18, v3, v4, 8 -; SI-NEXT: v_alignbit_b32 v21, v5, v6, 24 -; SI-NEXT: v_alignbit_b32 v22, v5, v6, 16 -; SI-NEXT: 
v_alignbit_b32 v23, v5, v6, 8 -; SI-NEXT: v_alignbit_b32 v29, v7, v8, 24 -; SI-NEXT: v_alignbit_b32 v30, v7, v8, 16 -; SI-NEXT: v_alignbit_b32 v31, v7, v8, 8 -; SI-NEXT: v_alignbit_b32 v35, v9, v11, 24 -; SI-NEXT: v_alignbit_b32 v36, v9, v11, 16 -; SI-NEXT: v_alignbit_b32 v37, v9, v11, 8 -; SI-NEXT: v_alignbit_b32 v49, v14, v15, 24 -; SI-NEXT: v_alignbit_b32 v50, v14, v15, 16 -; SI-NEXT: v_alignbit_b32 v52, v14, v15, 8 -; SI-NEXT: v_alignbit_b32 v55, v19, v20, 24 -; SI-NEXT: v_alignbit_b32 v41, v19, v20, 16 -; SI-NEXT: v_alignbit_b32 v43, v19, v20, 8 -; SI-NEXT: v_alignbit_b32 v46, v25, v28, 24 -; SI-NEXT: v_alignbit_b32 v56, v25, v28, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v25, v28, 8 -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 -; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v3 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v5 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v5 -; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v7 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v9 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v14 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v14 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v19 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v19 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v25 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v25 -; SI-NEXT: s_cbranch_execnz .LBB49_3 -; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_alignbit_b32 v10, v2, v1, 24 -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: v_alignbit_b32 v12, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v13, v2, v1, 8 -; SI-NEXT: v_alignbit_b32 v16, v3, v4, 24 -; SI-NEXT: v_alignbit_b32 v17, v3, v4, 16 -; SI-NEXT: v_alignbit_b32 v18, v3, v4, 8 -; SI-NEXT: v_alignbit_b32 v21, v5, v6, 24 -; SI-NEXT: v_alignbit_b32 v22, v5, v6, 16 -; SI-NEXT: v_alignbit_b32 v23, v5, v6, 8 -; SI-NEXT: v_alignbit_b32 v29, v7, v8, 24 -; SI-NEXT: v_alignbit_b32 v30, v7, v8, 16 -; SI-NEXT: v_alignbit_b32 v31, v7, v8, 8 -; SI-NEXT: v_alignbit_b32 v35, v9, v11, 24 -; SI-NEXT: v_alignbit_b32 v36, v9, v11, 16 -; SI-NEXT: v_alignbit_b32 v37, v9, v11, 8 -; SI-NEXT: v_alignbit_b32 v49, v14, v15, 24 -; SI-NEXT: v_alignbit_b32 v50, v14, v15, 16 -; SI-NEXT: v_alignbit_b32 v52, v14, v15, 8 -; SI-NEXT: v_alignbit_b32 v55, v19, v20, 24 -; SI-NEXT: v_alignbit_b32 
v41, v19, v20, 16 -; SI-NEXT: v_alignbit_b32 v43, v19, v20, 8 -; SI-NEXT: v_alignbit_b32 v46, v25, v28, 24 -; SI-NEXT: v_alignbit_b32 v56, v25, v28, 16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v58, v25, v28, 8 -; SI-NEXT: v_lshrrev_b32_e32 v24, 24, v2 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v2 -; SI-NEXT: v_lshrrev_b32_e32 v32, 24, v3 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v3 -; SI-NEXT: v_lshrrev_b32_e32 v34, 8, v3 -; SI-NEXT: v_lshrrev_b32_e32 v38, 24, v5 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v5 -; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v7 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v7 -; SI-NEXT: v_lshrrev_b32_e32 v40, 24, v9 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v9 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v14 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v14 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v19 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v19 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v61, 8, v19 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v25 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25 +; SI-NEXT: v_mov_b32_e32 v2, s36 +; SI-NEXT: v_mov_b32_e32 v1, s37 +; SI-NEXT: v_mov_b32_e32 v48, s49 +; SI-NEXT: v_mov_b32_e32 v39, s39 +; SI-NEXT: v_mov_b32_e32 v38, s38 +; SI-NEXT: v_mov_b32_e32 v37, s52 +; SI-NEXT: v_mov_b32_e32 v36, s50 +; SI-NEXT: v_mov_b32_e32 v35, s48 +; SI-NEXT: v_mov_b32_e32 v34, s55 +; SI-NEXT: v_mov_b32_e32 v33, s53 +; SI-NEXT: v_mov_b32_e32 v32, s51 +; SI-NEXT: v_mov_b32_e32 v31, s66 +; SI-NEXT: v_mov_b32_e32 v30, s64 +; SI-NEXT: v_mov_b32_e32 v29, s54 +; SI-NEXT: v_mov_b32_e32 v28, s69 +; SI-NEXT: v_mov_b32_e32 v27, s67 +; SI-NEXT: v_mov_b32_e32 v26, s65 +; SI-NEXT: v_mov_b32_e32 v25, s80 +; SI-NEXT: v_mov_b32_e32 v24, s70 +; SI-NEXT: v_mov_b32_e32 v23, s68 +; SI-NEXT: v_mov_b32_e32 v21, s83 +; SI-NEXT: v_mov_b32_e32 v19, s81 +; SI-NEXT: v_mov_b32_e32 v17, s71 +; SI-NEXT: v_mov_b32_e32 v14, s85 +; SI-NEXT: v_mov_b32_e32 v13, s84 +; SI-NEXT: v_mov_b32_e32 v12, s82 +; SI-NEXT: .LBB49_5: ; %end +; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: s_lshl_b32 s5, s34, 8 +; SI-NEXT: v_or_b32_e32 v22, s5, v22 +; SI-NEXT: s_and_b32 s5, s30, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s94, 24 +; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: v_or_b32_e32 v22, s5, v22 +; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v10, 8, v25 -; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_and_b32_e32 v28, 0xff, v28 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v58 -; SI-NEXT: v_and_b32_e32 v56, 0xff, v56 -; SI-NEXT: v_or_b32_e32 v28, v28, v58 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v46, 24, v46 -; SI-NEXT: v_or_b32_e32 v46, v46, v56 -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; SI-NEXT: v_or_b32_e32 v28, v28, v46 -; SI-NEXT: v_or_b32_e32 v10, v25, v10 -; SI-NEXT: v_and_b32_e32 v25, 0xff, v63 -; SI-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v48 +; SI-NEXT: v_or_b32_e32 
v20, v20, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v18 +; SI-NEXT: s_lshl_b32 s5, s92, 8 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v38, 24, v38 +; SI-NEXT: v_or_b32_e32 v18, s5, v18 +; SI-NEXT: s_and_b32 s5, s90, 0xff +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v22, v38, v22 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s88, 24 +; SI-NEXT: v_or_b32_e32 v20, v20, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v18, s5, v18 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 24, v62 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v25, v28, v25 -; SI-NEXT: v_or_b32_e32 v10, v10, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v10, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 8, v43 -; SI-NEXT: v_or_b32_e32 v10, v10, v20 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v41 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v55 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v20, v25, v20 -; SI-NEXT: v_or_b32_e32 v10, v10, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v10, v20, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v18, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v61 -; SI-NEXT: v_or_b32_e32 v10, v10, v19 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v60 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v59 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_or_b32_e32 v10, v10, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v10, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v52 -; SI-NEXT: v_or_b32_e32 v10, v10, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v37 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_and_b32_e32 v18, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 +; SI-NEXT: s_lshl_b32 s5, s74, 8 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 24, v35 +; SI-NEXT: v_or_b32_e32 v15, s5, v15 +; SI-NEXT: s_and_b32 s5, s62, 0xff +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_or_b32_e32 v18, v20, v18 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s60, 24 +; SI-NEXT: v_or_b32_e32 v16, v16, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: buffer_store_dword v16, v18, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v15, s5, v15 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v16, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v34 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: s_lshl_b32 s5, s78, 8 ; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v49 -; SI-NEXT: v_and_b32_e32 
v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v15, v19, v15 -; SI-NEXT: v_or_b32_e32 v10, v10, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v10, v15, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v57 -; SI-NEXT: v_or_b32_e32 v10, v10, v14 -; SI-NEXT: v_and_b32_e32 v14, 0xff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v45 +; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v32 +; SI-NEXT: v_or_b32_e32 v10, s5, v10 +; SI-NEXT: s_and_b32 s5, s76, 0xff +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v15, v16, v15 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s72, 24 +; SI-NEXT: v_or_b32_e32 v11, v11, v15 +; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0 ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v14, v15, v14 -; SI-NEXT: v_or_b32_e32 v10, v10, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v10, v14, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s5, s7, s5 +; SI-NEXT: buffer_store_dword v11, v15, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v10, s5, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v37 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v35 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v44 +; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v31 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v42 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_lshl_b32 s5, s58, 8 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v40 +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v29 +; SI-NEXT: v_or_b32_e32 v8, s5, v8 +; SI-NEXT: s_and_b32 s5, s56, 0xff ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s46, 24 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_or_b32 s5, s7, s5 ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v8, s5, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v31 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v29 -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v54 +; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v28 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v53 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v27 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: s_lshl_b32 s5, s44, 8 ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v51 +; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v26 +; 
SI-NEXT: v_or_b32_e32 v6, s5, v6 +; SI-NEXT: s_and_b32 s5, s42, 0xff ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: v_or_b32_e32 v8, v9, v8 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s40, 24 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: s_or_b32 s5, s7, s5 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v6, s5, v6 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v23 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v21 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v7, v8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v48 +; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v25 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v39 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_lshl_b32 s5, s14, 8 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v38 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v23 +; SI-NEXT: v_or_b32_e32 v4, s5, v4 +; SI-NEXT: s_and_b32 s5, s12, 0xff ; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s10, 24 ; SI-NEXT: v_or_b32_e32 v5, v5, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_or_b32 s5, s7, s5 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: v_or_b32_e32 v4, s5, v4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v18 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v16 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v34 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v21 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v33 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v19 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: s_lshl_b32 s5, s8, 8 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v17 +; SI-NEXT: v_or_b32_e32 v2, s5, v2 +; SI-NEXT: s_and_b32 s5, s6, 0xff ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s4, s4, 24 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v2, s4, v2 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v13 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v14 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v26 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v12 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s85, v40, 29 +; SI-NEXT: v_readlane_b32 s84, v40, 28 +; SI-NEXT: v_readlane_b32 s83, v40, 27 +; SI-NEXT: v_readlane_b32 s82, v40, 26 +; SI-NEXT: v_readlane_b32 s81, v40, 25 +; SI-NEXT: v_readlane_b32 s80, v40, 24 +; SI-NEXT: v_readlane_b32 s71, v40, 23 +; SI-NEXT: v_readlane_b32 s70, v40, 22 +; SI-NEXT: v_readlane_b32 s69, v40, 21 +; SI-NEXT: v_readlane_b32 s68, v40, 20 +; SI-NEXT: v_readlane_b32 s67, v40, 19 +; SI-NEXT: v_readlane_b32 s66, v40, 18 +; SI-NEXT: v_readlane_b32 s65, v40, 17 +; SI-NEXT: v_readlane_b32 s64, v40, 16 +; SI-NEXT: v_readlane_b32 s55, v40, 15 +; SI-NEXT: v_readlane_b32 s54, v40, 14 +; SI-NEXT: v_readlane_b32 s53, v40, 13 +; SI-NEXT: v_readlane_b32 s52, v40, 12 +; SI-NEXT: v_readlane_b32 s51, v40, 11 +; SI-NEXT: v_readlane_b32 s50, v40, 10 +; SI-NEXT: v_readlane_b32 s49, v40, 9 +; SI-NEXT: v_readlane_b32 s48, v40, 8 +; SI-NEXT: v_readlane_b32 s39, v40, 7 +; SI-NEXT: v_readlane_b32 s38, v40, 6 +; SI-NEXT: v_readlane_b32 s37, v40, 5 +; SI-NEXT: v_readlane_b32 s36, v40, 4 +; SI-NEXT: v_readlane_b32 s35, v40, 3 +; SI-NEXT: v_readlane_b32 s34, v40, 2 +; SI-NEXT: v_readlane_b32 s31, v40, 1 +; SI-NEXT: v_readlane_b32 s30, v40, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; 
SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] -; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; kill: killed $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v16f32_to_v64i8_scalar: ; VI: ; %bb.0: @@ -28259,42 +28254,46 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v48, v28 +; SI-NEXT: v_mov_b32_e32 v38, v26 +; SI-NEXT: v_mov_b32_e32 v49, v24 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v44, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 -; 
SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v17 ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 @@ -28306,49 +28305,48 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 +; SI-NEXT: 
s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 @@ -28356,113 +28354,80 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v26, v8 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_or_b32_e32 v0, v0, v45 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; 
SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_mov_b32_e32 v44, v10 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 -; SI-NEXT: v_mov_b32_e32 v36, v12 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v62 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: 
v_or_b32_e32 v5, v2, v3 -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -28471,6 +28436,7 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s22, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_or_b32 s5, s5, s6 @@ -28479,65 +28445,116 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v63, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v0, s4, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v61, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, 
v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v48 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -28579,100 +28596,58 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, 
v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: 
v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -28695,47 +28670,12 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_mov_b32_e32 v29, v8 -; SI-NEXT: v_mov_b32_e32 v44, v10 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v52, v14 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_mov_b32_e32 v40, v5 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v56, v9 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch .LBB51_2 ; @@ -28758,142 +28698,121 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, 
off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v37, v30 -; VI-NEXT: v_mov_b32_e32 v61, v28 -; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: v_mov_b32_e32 v36, v28 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v34, v24 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: v_mov_b32_e32 v48, v12 +; VI-NEXT: v_mov_b32_e32 v49, v10 +; VI-NEXT: v_mov_b32_e32 v50, v8 +; VI-NEXT: v_mov_b32_e32 v51, v6 +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v45, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; VI-NEXT: v_mov_b32_e32 v37, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; VI-NEXT: 
v_lshlrev_b32_e32 v62, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v17 ; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB51_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v26, v4 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: 
v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v61, v23 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -28923,47 +28842,91 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v30, 
v37 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB51_3 ; VI-NEXT: .LBB51_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v45 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte 
Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 @@ -29003,76 +28966,35 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; 
VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v17, 
v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB51_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -29093,43 +29015,9 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB51_4: -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v35, v4 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_mov_b32_e32 v39, v10 -; VI-NEXT: v_mov_b32_e32 v43, v12 -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v30, v37 -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_mov_b32_e32 v41, v5 -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 -; 
VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v61, v23 -; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 v27, v42 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB51_2 ; @@ -29152,147 +29040,124 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v37, v30 -; GFX9-NEXT: v_mov_b32_e32 v61, v28 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: v_mov_b32_e32 v36, v28 +; GFX9-NEXT: v_mov_b32_e32 v35, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v14 +; GFX9-NEXT: v_mov_b32_e32 v48, v12 +; GFX9-NEXT: v_mov_b32_e32 v49, v10 +; GFX9-NEXT: v_mov_b32_e32 v50, v8 +; GFX9-NEXT: v_mov_b32_e32 v51, v6 +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v45, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; GFX9-NEXT: 
buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; 
GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB51_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v26, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, 
v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -29322,48 +29187,78 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB51_3 ; GFX9-NEXT: .LBB51_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v1, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v26 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s5, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s29, 8 ; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 -; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 ; GFX9-NEXT: s_addk_i32 s5, 0x300 -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v35 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_and_b32 s5, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s17, 8 @@ -29387,6 +29282,20 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: s_or_b32 s7, s8, s7 ; GFX9-NEXT: s_addk_i32 s6, 0x300 ; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_add_i32 s24, s24, 3 @@ -29403,76 +29312,35 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: s_and_b32 s7, s7, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; 
GFX9-NEXT: v_add_u32_e32 v0, 3, v16 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: .LBB51_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -29493,43 +29361,9 @@ define inreg <16 x float> @bitcast_v64i8_to_v16f32_scalar(<64 x i8> inreg %a, i3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB51_4: -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v35, v4 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 -; GFX9-NEXT: v_mov_b32_e32 v43, v12 -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_mov_b32_e32 v41, v5 -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v7 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; 
GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB51_2 ; @@ -31135,96 +30969,113 @@ define inreg <32 x i16> @bitcast_v8i64_to_v32i16_scalar(<8 x i64> inreg %a, i32 ; SI-LABEL: bitcast_v8i64_to_v32i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v3, v2 -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_mov_b32_e32 v30, v1 -; SI-NEXT: v_mov_b32_e32 v28, v0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v0 +; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: s_cbranch_scc0 .LBB57_4 +; SI-NEXT: ; %bb.1: ; %cmp.false +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s29, 16 +; SI-NEXT: s_lshr_b32 s58, s27, 16 +; SI-NEXT: s_lshr_b32 s59, s25, 16 +; SI-NEXT: s_lshr_b32 s60, s23, 16 +; SI-NEXT: s_lshr_b32 s61, s21, 16 +; SI-NEXT: s_lshr_b32 s62, s19, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: s_cbranch_execnz .LBB57_3 +; SI-NEXT: .LBB57_2: ; %cmp.true +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_addc_u32 s29, s29, 0 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s56, s5, 16 +; SI-NEXT: s_lshr_b32 s57, s29, 16 +; SI-NEXT: s_lshr_b32 s58, s27, 16 +; SI-NEXT: s_lshr_b32 s59, s25, 16 +; SI-NEXT: s_lshr_b32 s60, s23, 16 +; SI-NEXT: s_lshr_b32 s61, s21, 16 +; SI-NEXT: s_lshr_b32 s62, s19, 16 +; SI-NEXT: s_lshr_b32 s63, s17, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[16:17], 16 +; SI-NEXT: .LBB57_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s44 ; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v3, s63 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s42 ; SI-NEXT: v_mov_b32_e32 v6, s19 +; SI-NEXT: v_mov_b32_e32 v7, s62 ; SI-NEXT: v_mov_b32_e32 v8, s20 +; SI-NEXT: v_mov_b32_e32 v9, s40 ; SI-NEXT: v_mov_b32_e32 v10, s21 +; SI-NEXT: v_mov_b32_e32 v11, s61 ; SI-NEXT: v_mov_b32_e32 v12, s22 +; SI-NEXT: v_mov_b32_e32 v13, s14 ; SI-NEXT: 
v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v15, s60 ; SI-NEXT: v_mov_b32_e32 v16, s24 +; SI-NEXT: v_mov_b32_e32 v17, s12 ; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v19, s59 ; SI-NEXT: v_mov_b32_e32 v20, s26 +; SI-NEXT: v_mov_b32_e32 v21, s10 ; SI-NEXT: v_mov_b32_e32 v22, s27 +; SI-NEXT: v_mov_b32_e32 v23, s58 ; SI-NEXT: v_mov_b32_e32 v24, s28 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v25, s8 ; SI-NEXT: v_mov_b32_e32 v26, s29 -; SI-NEXT: s_cbranch_scc0 .LBB57_4 -; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_cbranch_execnz .LBB57_3 -; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 -; SI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v4 -; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; SI-NEXT: v_add_i32_e32 v8, vcc, 3, v8 -; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc -; SI-NEXT: v_add_i32_e32 v12, vcc, 3, v12 -; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc -; SI-NEXT: v_add_i32_e32 v16, vcc, 3, v16 -; SI-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; SI-NEXT: v_add_i32_e32 v20, vcc, 3, v20 -; SI-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; SI-NEXT: v_add_i32_e32 v24, vcc, 3, v24 -; SI-NEXT: v_addc_u32_e32 v26, vcc, 0, v26, vcc -; SI-NEXT: v_add_i32_e32 v28, vcc, 3, v28 -; SI-NEXT: v_addc_u32_e32 v30, vcc, 0, v30, vcc -; SI-NEXT: v_alignbit_b32 v29, v30, v28, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v24, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v20, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v16, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v8, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_mov_b32_e32 v27, s57 +; SI-NEXT: v_mov_b32_e32 v28, s4 +; SI-NEXT: v_mov_b32_e32 v29, s6 +; SI-NEXT: v_mov_b32_e32 v30, s5 +; SI-NEXT: v_mov_b32_e32 v31, s56 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr31 
+; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr59 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr56 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v8i64_to_v32i16_scalar: @@ -38395,386 +38246,449 @@ define inreg <64 x i8> @bitcast_v8i64_to_v64i8_scalar(<8 x i64> inreg %a, i32 in ; SI-LABEL: bitcast_v8i64_to_v64i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v4, s30, 0 +; SI-NEXT: v_writelane_b32 v4, s31, 1 +; SI-NEXT: v_writelane_b32 v4, s34, 2 +; SI-NEXT: v_writelane_b32 v4, s35, 3 +; SI-NEXT: v_writelane_b32 v4, s36, 4 +; SI-NEXT: v_writelane_b32 v4, s37, 5 +; SI-NEXT: v_writelane_b32 v4, s38, 6 +; SI-NEXT: v_writelane_b32 v4, s39, 7 +; SI-NEXT: v_writelane_b32 v4, s48, 8 +; SI-NEXT: v_writelane_b32 v4, s49, 9 +; SI-NEXT: v_writelane_b32 v4, s50, 10 +; SI-NEXT: v_writelane_b32 v4, s51, 11 +; SI-NEXT: v_writelane_b32 v4, s52, 12 +; SI-NEXT: v_writelane_b32 v4, s53, 13 +; SI-NEXT: v_writelane_b32 v4, s54, 14 +; SI-NEXT: v_writelane_b32 v4, s55, 15 +; SI-NEXT: v_writelane_b32 v4, s64, 16 +; SI-NEXT: v_writelane_b32 v4, s65, 17 +; SI-NEXT: v_writelane_b32 v4, s66, 18 +; SI-NEXT: v_writelane_b32 v4, s67, 19 +; SI-NEXT: v_writelane_b32 v4, s68, 20 +; SI-NEXT: v_writelane_b32 v4, s69, 21 +; SI-NEXT: v_writelane_b32 v4, s70, 22 +; SI-NEXT: v_writelane_b32 v4, s71, 23 +; SI-NEXT: v_writelane_b32 v4, s80, 24 +; SI-NEXT: v_writelane_b32 v4, s81, 25 +; SI-NEXT: v_writelane_b32 v4, s82, 26 +; SI-NEXT: v_writelane_b32 v4, s83, 27 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; SI-NEXT: v_readfirstlane_b32 s7, v1 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v2 +; SI-NEXT: v_writelane_b32 v4, s84, 28 +; SI-NEXT: v_readfirstlane_b32 s4, v1 +; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: v_writelane_b32 v4, s85, 29 ; SI-NEXT: s_cbranch_scc0 .LBB69_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v6, s28 -; SI-NEXT: v_mov_b32_e32 v9, s26 -; SI-NEXT: v_mov_b32_e32 v10, s24 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_mov_b32_e32 v21, s18 -; SI-NEXT: v_mov_b32_e32 v22, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 -; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 -; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 -; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 -; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s27, v9, 8 -; SI-NEXT: v_alignbit_b32 v13, s25, v10, 24 -; SI-NEXT: v_alignbit_b32 v15, s25, v10, 16 -; SI-NEXT: v_alignbit_b32 v10, s25, v10, 8 -; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 -; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 -; SI-NEXT: v_alignbit_b32 v14, s23, v14, 8 -; SI-NEXT: v_alignbit_b32 v16, s21, v18, 24 
-; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 -; SI-NEXT: v_alignbit_b32 v18, s21, v18, 8 -; SI-NEXT: v_alignbit_b32 v19, s19, v21, 24 -; SI-NEXT: v_alignbit_b32 v20, s19, v21, 16 -; SI-NEXT: v_alignbit_b32 v21, s19, v21, 8 -; SI-NEXT: v_alignbit_b32 v23, s17, v22, 24 -; SI-NEXT: v_alignbit_b32 v24, s17, v22, 16 -; SI-NEXT: v_alignbit_b32 v22, s17, v22, 8 -; SI-NEXT: s_lshr_b32 s8, s6, 24 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: s_lshr_b32 s10, s6, 8 -; SI-NEXT: s_lshr_b32 s11, s29, 24 -; SI-NEXT: s_lshr_b32 s12, s29, 16 -; SI-NEXT: s_lshr_b32 s13, s29, 8 -; SI-NEXT: s_lshr_b32 s14, s27, 24 -; SI-NEXT: s_lshr_b32 s15, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s27, 8 -; SI-NEXT: s_lshr_b32 s41, s25, 24 -; SI-NEXT: s_lshr_b32 s42, s25, 16 -; SI-NEXT: s_lshr_b32 s43, s25, 8 -; SI-NEXT: s_lshr_b32 s44, s23, 24 -; SI-NEXT: s_lshr_b32 s45, s23, 16 -; SI-NEXT: s_lshr_b32 s46, s23, 8 -; SI-NEXT: s_lshr_b32 s47, s21, 24 -; SI-NEXT: s_lshr_b32 s56, s21, 16 -; SI-NEXT: s_lshr_b32 s57, s21, 8 -; SI-NEXT: s_lshr_b32 s58, s19, 24 -; SI-NEXT: s_lshr_b32 s59, s19, 16 -; SI-NEXT: s_lshr_b32 s60, s19, 8 -; SI-NEXT: s_lshr_b32 s61, s17, 24 -; SI-NEXT: s_lshr_b32 s62, s17, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: s_lshr_b32 s38, s5, 24 +; SI-NEXT: s_lshr_b32 s39, s5, 16 +; SI-NEXT: s_lshr_b32 s48, s5, 8 +; SI-NEXT: s_lshr_b32 s49, s29, 24 +; SI-NEXT: s_lshr_b32 s50, s29, 16 +; SI-NEXT: s_lshr_b32 s51, s29, 8 +; SI-NEXT: s_lshr_b32 s52, s27, 24 +; SI-NEXT: s_lshr_b32 s53, s27, 16 +; SI-NEXT: s_lshr_b32 s54, s27, 8 +; SI-NEXT: s_lshr_b32 s55, s25, 24 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s25, 8 +; SI-NEXT: s_lshr_b32 s66, s23, 24 +; SI-NEXT: s_lshr_b32 s67, s23, 16 +; SI-NEXT: s_lshr_b32 s68, s23, 8 +; SI-NEXT: s_lshr_b32 s69, s21, 24 +; SI-NEXT: s_lshr_b32 s70, s21, 16 +; SI-NEXT: s_lshr_b32 s71, s21, 8 +; SI-NEXT: s_lshr_b32 s80, s19, 24 +; SI-NEXT: s_lshr_b32 s81, s19, 16 +; SI-NEXT: s_lshr_b32 s82, s19, 8 +; SI-NEXT: s_lshr_b32 s83, s17, 24 +; SI-NEXT: s_lshr_b32 s84, s17, 16 +; SI-NEXT: s_lshr_b32 s85, s17, 8 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24 +; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB69_3 ; SI-NEXT: .LBB69_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, 
s25, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 ; SI-NEXT: s_add_u32 s28, s28, 3 ; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s6, s6, 0 -; SI-NEXT: v_mov_b32_e32 v3, s7 -; SI-NEXT: v_mov_b32_e32 v6, s28 -; SI-NEXT: v_mov_b32_e32 v9, s26 -; SI-NEXT: v_mov_b32_e32 v10, s24 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_mov_b32_e32 v21, s18 -; SI-NEXT: v_mov_b32_e32 v22, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v3, 24 -; SI-NEXT: v_alignbit_b32 v2, s6, v3, 16 -; SI-NEXT: v_alignbit_b32 v3, s6, v3, 8 -; SI-NEXT: v_alignbit_b32 v4, s29, v6, 24 -; SI-NEXT: v_alignbit_b32 v5, s29, v6, 16 -; SI-NEXT: v_alignbit_b32 v6, s29, v6, 8 -; SI-NEXT: v_alignbit_b32 v7, s27, v9, 24 -; SI-NEXT: v_alignbit_b32 v8, s27, v9, 16 -; SI-NEXT: v_alignbit_b32 v9, s27, v9, 8 -; SI-NEXT: v_alignbit_b32 v13, s25, v10, 24 -; SI-NEXT: v_alignbit_b32 v15, s25, v10, 16 -; SI-NEXT: v_alignbit_b32 v10, s25, v10, 8 -; SI-NEXT: v_alignbit_b32 v11, s23, v14, 24 -; SI-NEXT: v_alignbit_b32 v12, s23, v14, 16 -; SI-NEXT: v_alignbit_b32 v14, s23, v14, 8 -; SI-NEXT: v_alignbit_b32 v16, s21, v18, 24 -; SI-NEXT: v_alignbit_b32 v17, s21, v18, 16 -; SI-NEXT: v_alignbit_b32 v18, s21, v18, 8 -; SI-NEXT: v_alignbit_b32 v19, s19, v21, 24 -; SI-NEXT: v_alignbit_b32 v20, s19, v21, 16 -; SI-NEXT: v_alignbit_b32 v21, s19, v21, 8 -; SI-NEXT: v_alignbit_b32 v23, s17, v22, 24 -; SI-NEXT: v_alignbit_b32 v24, s17, v22, 16 -; SI-NEXT: v_alignbit_b32 v22, s17, v22, 8 -; SI-NEXT: s_lshr_b32 s8, s6, 24 -; SI-NEXT: s_lshr_b32 s9, s6, 16 -; SI-NEXT: s_lshr_b32 s10, s6, 8 -; SI-NEXT: s_lshr_b32 s11, s29, 24 -; SI-NEXT: s_lshr_b32 s12, s29, 16 -; SI-NEXT: s_lshr_b32 s13, s29, 8 -; SI-NEXT: s_lshr_b32 s14, s27, 24 -; SI-NEXT: s_lshr_b32 s15, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s27, 8 -; SI-NEXT: s_lshr_b32 s41, s25, 24 -; SI-NEXT: s_lshr_b32 s42, s25, 16 -; SI-NEXT: s_lshr_b32 s43, s25, 8 -; SI-NEXT: s_lshr_b32 s44, s23, 24 -; SI-NEXT: s_lshr_b32 s45, s23, 16 -; SI-NEXT: s_lshr_b32 s46, s23, 8 -; SI-NEXT: s_lshr_b32 s47, s21, 24 -; SI-NEXT: s_lshr_b32 s56, s21, 16 -; SI-NEXT: s_lshr_b32 s57, s21, 8 -; SI-NEXT: s_lshr_b32 s58, s19, 24 -; SI-NEXT: s_lshr_b32 s59, s19, 16 -; SI-NEXT: s_lshr_b32 s60, s19, 8 -; SI-NEXT: s_lshr_b32 s61, s17, 24 -; SI-NEXT: s_lshr_b32 s62, s17, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s38, s5, 24 +; SI-NEXT: s_lshr_b32 s39, s5, 16 +; SI-NEXT: s_lshr_b32 s48, s5, 8 +; SI-NEXT: s_lshr_b32 s49, s29, 24 +; SI-NEXT: s_lshr_b32 s50, s29, 16 +; SI-NEXT: s_lshr_b32 s51, s29, 8 +; SI-NEXT: s_lshr_b32 s52, s27, 24 +; SI-NEXT: s_lshr_b32 s53, s27, 16 +; SI-NEXT: s_lshr_b32 s54, s27, 8 +; SI-NEXT: s_lshr_b32 s55, s25, 24 +; SI-NEXT: s_lshr_b32 s64, s25, 16 +; SI-NEXT: s_lshr_b32 s65, s25, 8 +; SI-NEXT: s_lshr_b32 s66, s23, 24 +; SI-NEXT: s_lshr_b32 s67, s23, 16 +; SI-NEXT: s_lshr_b32 s68, s23, 8 +; SI-NEXT: s_lshr_b32 s69, s21, 24 +; SI-NEXT: s_lshr_b32 s70, s21, 16 +; SI-NEXT: s_lshr_b32 s71, s21, 8 +; SI-NEXT: s_lshr_b32 s80, s19, 24 +; SI-NEXT: 
s_lshr_b32 s81, s19, 16 +; SI-NEXT: s_lshr_b32 s82, s19, 8 +; SI-NEXT: s_lshr_b32 s83, s17, 24 +; SI-NEXT: s_lshr_b32 s84, s17, 16 +; SI-NEXT: s_lshr_b32 s85, s17, 8 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 24 +; SI-NEXT: s_lshr_b64 s[14:15], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[16:17], 8 ; SI-NEXT: .LBB69_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; SI-NEXT: v_or_b32_e32 v22, s4, v22 -; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s5, s63, 8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s62, 0xff -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s61, 24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v23, 24, v23 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_mov_b32_e32 v23, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; SI-NEXT: v_or_b32_e32 v21, s4, v21 -; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s5, s60, 8 -; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s59, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s58, 24 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v19, v21, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v23, v22, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v20, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v18 -; SI-NEXT: v_or_b32_e32 v18, s4, v18 -; SI-NEXT: s_and_b32 s4, s21, 0xff -; SI-NEXT: s_lshl_b32 s5, s57, 8 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s56, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v16 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s47, 24 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; 
SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v16, v18, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v20, v19, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v17, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; SI-NEXT: v_or_b32_e32 v14, s4, v14 -; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s5, s46, 8 -; SI-NEXT: v_and_b32_e32 v12, 0xff, v12 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s45, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s44, 24 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 +; SI-NEXT: s_lshl_b32 s7, s36, 8 +; SI-NEXT: s_and_b32 s9, s16, 0xff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: s_and_b32 s9, s34, 0xff +; SI-NEXT: s_lshl_b32 s11, s30, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_and_b32 s7, s17, 0xff +; SI-NEXT: s_lshl_b32 s9, s85, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s84, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s83, 24 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_lshl_b32 s7, s94, 8 +; SI-NEXT: s_and_b32 s9, s18, 0xff +; SI-NEXT: s_or_b32 s7, s9, s7 +; SI-NEXT: s_and_b32 s9, s92, 0xff +; SI-NEXT: s_lshl_b32 s11, s90, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: s_and_b32 s7, s19, 0xff +; SI-NEXT: s_lshl_b32 s9, s82, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s81, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s80, 24 +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v11, v14, v11 -; SI-NEXT: v_add_i32_e32 v12, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v17, v16, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s25, 0xff -; SI-NEXT: s_lshl_b32 s5, s43, 8 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xff, v15 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s42, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s20, 0xff +; SI-NEXT: s_lshl_b32 s9, s76, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s72, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s62, 24 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; 
SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s21, 0xff +; SI-NEXT: s_lshl_b32 s9, s71, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s70, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s69, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v13 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s41, 24 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s27, 0xff -; SI-NEXT: s_lshl_b32 s5, s40, 8 -; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s15, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s14, s14, 24 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s14, s5 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s22, 0xff +; SI-NEXT: s_lshl_b32 s9, s88, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s78, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s74, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s23, 0xff +; SI-NEXT: s_lshl_b32 s9, s68, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s67, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s66, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s24, 0xff +; SI-NEXT: s_lshl_b32 s9, s60, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s58, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s56, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s25, 0xff +; SI-NEXT: s_lshl_b32 s9, s65, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s64, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s55, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: 
v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_add_i32_e32 v8, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s29, 0xff -; SI-NEXT: s_lshl_b32 s5, s13, 8 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s12, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s11, s11, 24 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s26, 0xff +; SI-NEXT: s_lshl_b32 s9, s46, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s44, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s42, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s27, 0xff +; SI-NEXT: s_lshl_b32 s9, s54, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s53, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s52, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s40, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s14, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s12, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s7, s29, 0xff +; SI-NEXT: s_lshl_b32 s9, s51, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s50, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s11, s49, 24 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s7, s10, 8 +; SI-NEXT: s_or_b32 s4, s4, s7 +; SI-NEXT: s_and_b32 s7, s8, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s6, s6, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s11, s5 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 -; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 
offen -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xff +; SI-NEXT: s_lshl_b32 s5, s48, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s9, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; SI-NEXT: s_and_b32 s5, s39, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s8, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: s_lshl_b32 s6, s38, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s85, v4, 29 +; SI-NEXT: v_readlane_b32 s84, v4, 28 +; SI-NEXT: v_readlane_b32 s83, v4, 27 +; SI-NEXT: v_readlane_b32 s82, v4, 26 +; SI-NEXT: v_readlane_b32 s81, v4, 25 +; SI-NEXT: v_readlane_b32 s80, v4, 24 +; SI-NEXT: v_readlane_b32 s71, v4, 23 +; SI-NEXT: v_readlane_b32 s70, v4, 22 +; SI-NEXT: v_readlane_b32 s69, v4, 21 +; SI-NEXT: v_readlane_b32 s68, v4, 20 +; SI-NEXT: v_readlane_b32 s67, v4, 19 +; SI-NEXT: v_readlane_b32 s66, v4, 18 +; SI-NEXT: v_readlane_b32 s65, v4, 17 +; SI-NEXT: v_readlane_b32 s64, v4, 16 +; SI-NEXT: v_readlane_b32 s55, v4, 15 +; SI-NEXT: v_readlane_b32 s54, v4, 14 +; SI-NEXT: v_readlane_b32 s53, v4, 13 +; SI-NEXT: v_readlane_b32 s52, v4, 12 +; SI-NEXT: v_readlane_b32 s51, v4, 11 +; SI-NEXT: v_readlane_b32 s50, v4, 10 +; SI-NEXT: v_readlane_b32 s49, v4, 9 +; SI-NEXT: v_readlane_b32 s48, v4, 8 +; SI-NEXT: v_readlane_b32 s39, v4, 7 +; SI-NEXT: v_readlane_b32 s38, v4, 6 +; SI-NEXT: v_readlane_b32 s37, v4, 5 +; SI-NEXT: v_readlane_b32 s36, v4, 4 +; SI-NEXT: v_readlane_b32 s35, v4, 3 +; SI-NEXT: v_readlane_b32 s34, v4, 2 +; SI-NEXT: v_readlane_b32 s31, v4, 1 +; SI-NEXT: v_readlane_b32 s30, v4, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB69_4: -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr85 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr21 
-; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr59 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr57 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr15 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB69_2 ; ; VI-LABEL: bitcast_v8i64_to_v64i8_scalar: @@ -42400,42 +42314,46 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v48, v28 +; SI-NEXT: v_mov_b32_e32 v38, v26 +; SI-NEXT: v_mov_b32_e32 v49, v24 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v44, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], 
s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v17 ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 @@ -42447,49 +42365,48 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 +; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB71_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 @@ -42497,113 +42414,80 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v26, v8 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; 
SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_or_b32_e32 v0, v0, v45 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_mov_b32_e32 v44, v10 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 -; SI-NEXT: v_mov_b32_e32 v36, v12 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v62 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; 
SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -42612,6 +42496,7 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s22, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_or_b32 s5, s5, s6 @@ -42620,65 +42505,116 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_cbranch_execnz .LBB71_3 ; SI-NEXT: .LBB71_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v63, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v0, s4, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v61, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v48 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -42720,100 +42656,58 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 
0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 
16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -42836,47 +42730,12 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB71_4: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_mov_b32_e32 v29, v8 -; SI-NEXT: v_mov_b32_e32 v44, v10 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v52, v14 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_mov_b32_e32 v40, v5 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v56, v9 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch .LBB71_2 ; @@ -42899,142 +42758,121 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v37, v30 -; VI-NEXT: v_mov_b32_e32 v61, v28 -; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: v_mov_b32_e32 v36, v28 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v34, v24 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: v_mov_b32_e32 v48, v12 +; VI-NEXT: v_mov_b32_e32 v49, v10 +; VI-NEXT: v_mov_b32_e32 v50, v8 +; VI-NEXT: v_mov_b32_e32 v51, v6 +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v45, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; 
VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; VI-NEXT: v_mov_b32_e32 v37, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v17 ; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB71_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_mov_b32_e32 v26, v4 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa 
v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v61, v23 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -43064,47 +42902,91 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v30, v37 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 v27, v42 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB71_3 ; VI-NEXT: .LBB71_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v45 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte 
Folded Reload +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 @@ -43144,76 +43026,35 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 0x3000000 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; 
VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB71_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -43234,43 +43075,9 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB71_4: -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v35, v4 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_mov_b32_e32 v39, v10 -; VI-NEXT: v_mov_b32_e32 v43, v12 -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v30, v37 -; VI-NEXT: v_mov_b32_e32 v38, v1 
-; VI-NEXT: v_mov_b32_e32 v41, v5 -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 -; VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v61, v23 -; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 v27, v42 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB71_2 ; @@ -43293,147 +43100,124 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v37, v30 -; GFX9-NEXT: v_mov_b32_e32 v61, v28 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: v_mov_b32_e32 v36, v28 +; GFX9-NEXT: v_mov_b32_e32 v35, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v14 +; GFX9-NEXT: v_mov_b32_e32 v48, v12 +; GFX9-NEXT: v_mov_b32_e32 v49, v10 +; GFX9-NEXT: v_mov_b32_e32 v50, v8 +; GFX9-NEXT: v_mov_b32_e32 v51, v6 +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v45, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: buffer_load_ushort v2, off, 
s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: 
v_lshlrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB71_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v26, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: 
v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -43463,48 +43247,78 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB71_3 ; GFX9-NEXT: .LBB71_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v1, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v26 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s5, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s29, 8 ; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 -; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 ; GFX9-NEXT: s_addk_i32 s5, 0x300 -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v35 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_and_b32 s5, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s17, 8 @@ -43528,6 +43342,20 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: s_or_b32 s7, s8, s7 ; GFX9-NEXT: s_addk_i32 s6, 0x300 ; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_add_i32 s24, s24, 3 @@ -43544,76 +43372,35 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: s_and_b32 s7, s7, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v16 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 -; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: .LBB71_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -43634,43 +43421,9 @@ define inreg <8 x i64> @bitcast_v64i8_to_v8i64_scalar(<64 x i8> inreg %a, i32 in ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB71_4: -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v35, v4 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 -; GFX9-NEXT: v_mov_b32_e32 v43, v12 -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_mov_b32_e32 v41, v5 -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: 
v_mov_b32_e32 v57, v7 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB71_2 ; @@ -44650,100 +44403,100 @@ define inreg <32 x i16> @bitcast_v8f64_to_v32i16_scalar(<8 x double> inreg %a, i ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; SI-NEXT: v_mov_b32_e32 v33, v1 ; SI-NEXT: v_mov_b32_e32 v32, v0 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v1, s17 -; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v5, s19 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s21 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_mov_b32_e32 v16, s24 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v21, s27 -; SI-NEXT: v_mov_b32_e32 v24, s28 +; SI-NEXT: v_mov_b32_e32 v34, s16 +; SI-NEXT: v_mov_b32_e32 v35, s17 +; SI-NEXT: v_mov_b32_e32 v36, s18 +; SI-NEXT: v_mov_b32_e32 v37, s19 +; SI-NEXT: v_mov_b32_e32 v38, s20 +; SI-NEXT: v_mov_b32_e32 v39, s21 +; SI-NEXT: v_mov_b32_e32 v48, s22 +; SI-NEXT: v_mov_b32_e32 v49, s23 +; SI-NEXT: v_mov_b32_e32 v50, s24 +; SI-NEXT: v_mov_b32_e32 v51, s25 +; SI-NEXT: v_mov_b32_e32 v52, s26 +; SI-NEXT: v_mov_b32_e32 v53, s27 +; SI-NEXT: v_mov_b32_e32 v54, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v25, s29 +; SI-NEXT: v_mov_b32_e32 v55, s29 ; SI-NEXT: s_cbranch_scc0 .LBB73_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v29, v33, v32, 16 -; SI-NEXT: v_alignbit_b32 v48, v25, v24, 16 -; SI-NEXT: v_alignbit_b32 v39, v21, v20, 16 -; SI-NEXT: v_alignbit_b32 v38, v17, v16, 16 -; SI-NEXT: v_alignbit_b32 v37, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v36, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v35, v5, v4, 16 -; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 ; SI-NEXT: s_cbranch_execnz .LBB73_3 ; SI-NEXT: 
.LBB73_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 -; SI-NEXT: v_add_f64 v[4:5], v[4:5], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[20:21], v[20:21], 1.0 ; SI-NEXT: v_add_f64 v[32:33], v[32:33], 1.0 -; SI-NEXT: v_add_f64 v[24:25], v[24:25], 1.0 -; SI-NEXT: v_alignbit_b32 v29, v33, v32, 16 -; SI-NEXT: v_alignbit_b32 v48, v25, v24, 16 -; SI-NEXT: v_alignbit_b32 v39, v21, v20, 16 -; SI-NEXT: v_alignbit_b32 v38, v17, v16, 16 -; SI-NEXT: v_alignbit_b32 v37, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v36, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v35, v5, v4, 16 -; SI-NEXT: v_alignbit_b32 v34, v1, v0, 16 +; SI-NEXT: v_add_f64 v[54:55], v[54:55], 1.0 +; SI-NEXT: v_add_f64 v[52:53], v[52:53], 1.0 +; SI-NEXT: v_add_f64 v[50:51], v[50:51], 1.0 +; SI-NEXT: v_add_f64 v[48:49], v[48:49], 1.0 +; SI-NEXT: v_add_f64 v[38:39], v[38:39], 1.0 +; SI-NEXT: v_add_f64 v[36:37], v[36:37], 1.0 +; SI-NEXT: v_add_f64 v[34:35], v[34:35], 1.0 +; SI-NEXT: v_lshr_b64 v[29:30], v[32:33], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[54:55], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[52:53], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[13:14], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[38:39], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[36:37], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[34:35], 16 ; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v37 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v35 ; SI-NEXT: .LBB73_3: ; %end -; SI-NEXT: v_mov_b32_e32 v2, v1 -; SI-NEXT: v_mov_b32_e32 v6, v5 -; SI-NEXT: v_mov_b32_e32 v10, v9 -; SI-NEXT: v_mov_b32_e32 v14, v13 -; SI-NEXT: v_mov_b32_e32 v18, v17 -; SI-NEXT: v_mov_b32_e32 v22, v21 -; SI-NEXT: v_mov_b32_e32 v26, v25 +; SI-NEXT: v_mov_b32_e32 v0, v34 +; SI-NEXT: v_mov_b32_e32 v2, v35 +; SI-NEXT: v_mov_b32_e32 v4, v36 +; SI-NEXT: v_mov_b32_e32 v6, v37 +; SI-NEXT: v_mov_b32_e32 v8, v38 +; SI-NEXT: v_mov_b32_e32 v10, v39 +; SI-NEXT: v_mov_b32_e32 v12, v48 +; SI-NEXT: v_mov_b32_e32 v14, v49 +; SI-NEXT: v_mov_b32_e32 v16, v50 +; SI-NEXT: v_mov_b32_e32 v18, v51 +; SI-NEXT: v_mov_b32_e32 v20, v52 +; SI-NEXT: v_mov_b32_e32 v22, v53 +; SI-NEXT: v_mov_b32_e32 v24, v54 +; SI-NEXT: v_mov_b32_e32 v26, v55 ; SI-NEXT: v_mov_b32_e32 v28, v32 ; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v1, v34 -; SI-NEXT: v_mov_b32_e32 v5, v35 -; SI-NEXT: v_mov_b32_e32 v9, v36 -; SI-NEXT: v_mov_b32_e32 v13, v37 -; SI-NEXT: v_mov_b32_e32 v17, v38 -; SI-NEXT: v_mov_b32_e32 v21, v39 -; SI-NEXT: v_mov_b32_e32 v25, v48 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB73_4: -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: 
$vgpr13 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_branch .LBB73_2 ; ; VI-LABEL: bitcast_v8f64_to_v32i16_scalar: @@ -51674,376 +51427,435 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-LABEL: bitcast_v8f64_to_v64i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v40, s30, 0 +; SI-NEXT: v_writelane_b32 v40, s31, 1 +; SI-NEXT: v_writelane_b32 v40, s34, 2 +; SI-NEXT: v_writelane_b32 v40, s35, 3 +; SI-NEXT: v_writelane_b32 v40, s36, 4 +; SI-NEXT: v_writelane_b32 v40, s37, 5 +; SI-NEXT: v_writelane_b32 v40, s38, 6 +; SI-NEXT: v_writelane_b32 v40, s39, 7 +; SI-NEXT: v_writelane_b32 v40, s48, 8 +; SI-NEXT: v_writelane_b32 v40, s49, 9 +; SI-NEXT: v_writelane_b32 v40, s50, 10 +; SI-NEXT: v_writelane_b32 v40, s51, 11 +; SI-NEXT: v_writelane_b32 v40, s52, 12 +; SI-NEXT: v_writelane_b32 v40, s53, 13 +; SI-NEXT: v_writelane_b32 v40, s54, 14 +; SI-NEXT: v_writelane_b32 v40, s55, 15 +; SI-NEXT: v_writelane_b32 v40, s64, 16 +; SI-NEXT: v_writelane_b32 v40, s65, 17 +; SI-NEXT: v_writelane_b32 v40, s66, 18 +; SI-NEXT: v_writelane_b32 v40, s67, 19 +; SI-NEXT: v_writelane_b32 v40, s68, 20 +; SI-NEXT: v_writelane_b32 v40, s69, 21 +; SI-NEXT: v_writelane_b32 v40, s70, 22 +; SI-NEXT: v_writelane_b32 v40, s71, 23 +; SI-NEXT: v_writelane_b32 v40, s80, 24 +; SI-NEXT: v_writelane_b32 v40, s81, 25 +; SI-NEXT: v_writelane_b32 v40, s82, 26 +; SI-NEXT: v_writelane_b32 v40, s83, 27 +; SI-NEXT: v_writelane_b32 v40, s84, 28 +; SI-NEXT: v_writelane_b32 v40, s85, 29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; SI-NEXT: v_writelane_b32 v40, s86, 30 ; SI-NEXT: v_readfirstlane_b32 s4, v1 ; SI-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: v_writelane_b32 v40, s87, 31 ; SI-NEXT: s_cbranch_scc0 .LBB85_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_alignbit_b32 v2, s5, v1, 24 -; SI-NEXT: v_alignbit_b32 v17, s5, v1, 16 -; SI-NEXT: v_alignbit_b32 v18, s5, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s28 -; SI-NEXT: v_alignbit_b32 v20, s29, v1, 24 -; SI-NEXT: v_alignbit_b32 v4, s29, v1, 16 -; SI-NEXT: v_alignbit_b32 v19, s29, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s26 -; SI-NEXT: v_alignbit_b32 v6, s27, v1, 24 -; SI-NEXT: v_alignbit_b32 v21, s27, v1, 16 -; SI-NEXT: v_alignbit_b32 v22, s27, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s24 -; SI-NEXT: v_alignbit_b32 v8, s25, v1, 24 -; SI-NEXT: v_alignbit_b32 v23, s25, v1, 16 -; SI-NEXT: v_alignbit_b32 v24, s25, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s22 -; SI-NEXT: v_alignbit_b32 v10, s23, v1, 24 -; SI-NEXT: v_alignbit_b32 v25, s23, v1, 16 -; SI-NEXT: v_alignbit_b32 v26, s23, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s20 -; SI-NEXT: v_alignbit_b32 v12, s21, v1, 24 -; SI-NEXT: v_alignbit_b32 v14, s21, v1, 16 -; SI-NEXT: v_alignbit_b32 v16, s21, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s18 -; SI-NEXT: v_alignbit_b32 v27, s19, v1, 24 -; SI-NEXT: 
v_alignbit_b32 v28, s19, v1, 16 -; SI-NEXT: v_alignbit_b32 v29, s19, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s16 -; SI-NEXT: v_alignbit_b32 v30, s17, v1, 24 -; SI-NEXT: v_alignbit_b32 v31, s17, v1, 16 -; SI-NEXT: v_alignbit_b32 v32, s17, v1, 8 -; SI-NEXT: s_lshr_b32 s8, s5, 24 -; SI-NEXT: s_lshr_b32 s9, s5, 16 -; SI-NEXT: s_lshr_b32 s10, s5, 8 -; SI-NEXT: s_lshr_b32 s11, s29, 24 -; SI-NEXT: s_lshr_b32 s12, s29, 16 -; SI-NEXT: s_lshr_b32 s13, s29, 8 -; SI-NEXT: s_lshr_b32 s14, s27, 24 -; SI-NEXT: s_lshr_b32 s15, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s27, 8 -; SI-NEXT: s_lshr_b32 s41, s25, 24 -; SI-NEXT: s_lshr_b32 s42, s25, 16 -; SI-NEXT: s_lshr_b32 s43, s25, 8 -; SI-NEXT: s_lshr_b32 s44, s23, 24 -; SI-NEXT: s_lshr_b32 s45, s23, 16 -; SI-NEXT: s_lshr_b32 s46, s23, 8 -; SI-NEXT: s_lshr_b32 s47, s21, 24 -; SI-NEXT: s_lshr_b32 s56, s21, 16 -; SI-NEXT: s_lshr_b32 s57, s21, 8 -; SI-NEXT: s_lshr_b32 s58, s19, 24 -; SI-NEXT: s_lshr_b32 s59, s19, 16 -; SI-NEXT: s_lshr_b32 s60, s19, 8 -; SI-NEXT: s_lshr_b32 s61, s17, 24 -; SI-NEXT: s_lshr_b32 s62, s17, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 8 +; SI-NEXT: s_lshr_b32 s48, s5, 24 +; SI-NEXT: s_lshr_b32 s49, s5, 16 +; SI-NEXT: s_lshr_b32 s50, s5, 8 +; SI-NEXT: s_lshr_b32 s51, s29, 24 +; SI-NEXT: s_lshr_b32 s52, s29, 16 +; SI-NEXT: s_lshr_b32 s53, s29, 8 +; SI-NEXT: s_lshr_b32 s54, s27, 24 +; SI-NEXT: s_lshr_b32 s55, s27, 16 +; SI-NEXT: s_lshr_b32 s64, s27, 8 +; SI-NEXT: s_lshr_b32 s65, s25, 24 +; SI-NEXT: s_lshr_b32 s66, s25, 16 +; SI-NEXT: s_lshr_b32 s67, s25, 8 +; SI-NEXT: s_lshr_b32 s68, s23, 24 +; SI-NEXT: s_lshr_b32 s69, s23, 16 +; SI-NEXT: s_lshr_b32 s70, s23, 8 +; SI-NEXT: s_lshr_b32 s71, s21, 24 +; SI-NEXT: s_lshr_b32 s80, s21, 16 +; SI-NEXT: s_lshr_b32 s81, s21, 8 +; SI-NEXT: s_lshr_b32 s82, s19, 24 +; SI-NEXT: s_lshr_b32 s83, s19, 16 +; SI-NEXT: s_lshr_b32 s84, s19, 8 +; SI-NEXT: s_lshr_b32 s85, s17, 24 +; SI-NEXT: s_lshr_b32 s86, s17, 16 +; SI-NEXT: s_lshr_b32 s87, s17, 8 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[28:29], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[28:29], 8 +; SI-NEXT: s_lshr_b64 s[76:77], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[24:25], 24 +; SI-NEXT: s_lshr_b64 s[94:95], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[24:25], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[22:23], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[38:39], s[22:23], 8 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[58:59], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[62:63], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB85_4 ; SI-NEXT: .LBB85_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[15:16], s[16:17], 1.0 -; SI-NEXT: v_add_f64 v[13:14], s[18:19], 1.0 -; SI-NEXT: v_add_f64 v[11:12], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[9:10], s[22:23], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[28:29], s[18:19], 1.0 ; SI-NEXT: v_add_f64 v[5:6], s[26:27], 1.0 -; SI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; SI-NEXT: v_add_f64 v[13:14], s[22:23], 1.0 ; SI-NEXT: v_add_f64 v[3:4], s[28:29], 1.0 -; 
SI-NEXT: v_readfirstlane_b32 s5, v2 -; SI-NEXT: v_readfirstlane_b32 s29, v4 -; SI-NEXT: v_readfirstlane_b32 s27, v6 +; SI-NEXT: v_lshr_b64 v[48:49], v[28:29], 24 +; SI-NEXT: v_add_f64 v[1:2], s[4:5], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[24:25], 1.0 +; SI-NEXT: v_add_f64 v[20:21], s[20:21], 1.0 +; SI-NEXT: v_add_f64 v[32:33], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[22:23], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[28:29], 16 +; SI-NEXT: v_lshr_b64 v[15:16], v[3:4], 24 +; SI-NEXT: v_lshr_b64 v[23:24], v[5:6], 8 +; SI-NEXT: v_lshr_b64 v[35:36], v[13:14], 8 +; SI-NEXT: v_lshr_b64 v[50:51], v[28:29], 8 +; SI-NEXT: v_lshr_b64 v[9:10], v[1:2], 24 +; SI-NEXT: v_lshr_b64 v[16:17], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 24 +; SI-NEXT: v_lshr_b64 v[36:37], v[20:21], 24 +; SI-NEXT: v_lshr_b64 v[51:52], v[32:33], 24 +; SI-NEXT: v_lshr_b64 v[10:11], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[17:18], v[3:4], 8 +; SI-NEXT: v_lshr_b64 v[25:26], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[32:33], 16 +; SI-NEXT: v_readfirstlane_b32 s17, v33 +; SI-NEXT: v_readfirstlane_b32 s19, v29 +; SI-NEXT: v_readfirstlane_b32 s21, v21 +; SI-NEXT: v_readfirstlane_b32 s23, v14 ; SI-NEXT: v_readfirstlane_b32 s25, v8 -; SI-NEXT: v_readfirstlane_b32 s23, v10 -; SI-NEXT: v_readfirstlane_b32 s21, v12 -; SI-NEXT: v_readfirstlane_b32 s19, v14 -; SI-NEXT: v_readfirstlane_b32 s17, v16 -; SI-NEXT: v_alignbit_b32 v2, s5, v1, 24 -; SI-NEXT: v_alignbit_b32 v17, s5, v1, 16 -; SI-NEXT: v_alignbit_b32 v18, s5, v1, 8 -; SI-NEXT: v_alignbit_b32 v20, s29, v3, 24 -; SI-NEXT: v_alignbit_b32 v4, s29, v3, 16 -; SI-NEXT: v_alignbit_b32 v19, s29, v3, 8 -; SI-NEXT: v_alignbit_b32 v6, s27, v5, 24 -; SI-NEXT: v_alignbit_b32 v21, s27, v5, 16 -; SI-NEXT: v_alignbit_b32 v22, s27, v5, 8 -; SI-NEXT: v_alignbit_b32 v8, s25, v7, 24 -; SI-NEXT: v_alignbit_b32 v23, s25, v7, 16 -; SI-NEXT: v_alignbit_b32 v24, s25, v7, 8 -; SI-NEXT: v_alignbit_b32 v10, s23, v9, 24 -; SI-NEXT: v_alignbit_b32 v25, s23, v9, 16 -; SI-NEXT: v_alignbit_b32 v26, s23, v9, 8 -; SI-NEXT: v_alignbit_b32 v12, s21, v11, 24 -; SI-NEXT: s_lshr_b32 s8, s5, 24 -; SI-NEXT: s_lshr_b32 s9, s5, 16 -; SI-NEXT: s_lshr_b32 s10, s5, 8 -; SI-NEXT: s_lshr_b32 s11, s29, 24 -; SI-NEXT: s_lshr_b32 s12, s29, 16 -; SI-NEXT: s_lshr_b32 s13, s29, 8 -; SI-NEXT: s_lshr_b32 s14, s27, 24 -; SI-NEXT: s_lshr_b32 s15, s27, 16 -; SI-NEXT: s_lshr_b32 s40, s27, 8 -; SI-NEXT: s_lshr_b32 s41, s25, 24 -; SI-NEXT: s_lshr_b32 s42, s25, 16 -; SI-NEXT: s_lshr_b32 s43, s25, 8 -; SI-NEXT: s_lshr_b32 s44, s23, 24 -; SI-NEXT: s_lshr_b32 s45, s23, 16 -; SI-NEXT: s_lshr_b32 s46, s23, 8 -; SI-NEXT: s_lshr_b32 s47, s21, 24 -; SI-NEXT: s_lshr_b32 s56, s21, 16 -; SI-NEXT: s_lshr_b32 s57, s21, 8 -; SI-NEXT: s_lshr_b32 s58, s19, 24 -; SI-NEXT: s_lshr_b32 s59, s19, 16 -; SI-NEXT: s_lshr_b32 s60, s19, 8 -; SI-NEXT: s_lshr_b32 s61, s17, 24 -; SI-NEXT: s_lshr_b32 s62, s17, 16 -; SI-NEXT: s_lshr_b32 s63, s17, 8 -; SI-NEXT: v_alignbit_b32 v14, s21, v11, 16 -; SI-NEXT: v_alignbit_b32 v16, s21, v11, 8 -; SI-NEXT: v_alignbit_b32 v27, s19, v13, 24 -; SI-NEXT: v_alignbit_b32 v28, s19, v13, 16 -; SI-NEXT: v_alignbit_b32 v29, s19, v13, 8 -; SI-NEXT: v_alignbit_b32 v30, s17, v15, 24 -; SI-NEXT: v_alignbit_b32 v31, s17, v15, 16 -; SI-NEXT: v_alignbit_b32 v32, s17, v15, 8 +; SI-NEXT: v_readfirstlane_b32 s27, v6 +; SI-NEXT: v_readfirstlane_b32 s29, v4 +; SI-NEXT: v_readfirstlane_b32 s5, v2 +; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 8 +; SI-NEXT: v_lshr_b64 
v[18:19], v[5:6], 24 +; SI-NEXT: v_lshr_b64 v[26:27], v[7:8], 8 +; SI-NEXT: v_lshr_b64 v[30:31], v[13:14], 24 +; SI-NEXT: v_lshr_b64 v[38:39], v[20:21], 8 +; SI-NEXT: v_lshr_b64 v[53:54], v[32:33], 8 +; SI-NEXT: s_lshr_b32 s48, s5, 24 +; SI-NEXT: s_lshr_b32 s49, s5, 16 +; SI-NEXT: s_lshr_b32 s50, s5, 8 +; SI-NEXT: s_lshr_b32 s51, s29, 24 +; SI-NEXT: s_lshr_b32 s52, s29, 16 +; SI-NEXT: s_lshr_b32 s53, s29, 8 +; SI-NEXT: s_lshr_b32 s54, s27, 24 +; SI-NEXT: s_lshr_b32 s55, s27, 16 +; SI-NEXT: s_lshr_b32 s64, s27, 8 +; SI-NEXT: s_lshr_b32 s65, s25, 24 +; SI-NEXT: s_lshr_b32 s66, s25, 16 +; SI-NEXT: s_lshr_b32 s67, s25, 8 +; SI-NEXT: s_lshr_b32 s68, s23, 24 +; SI-NEXT: s_lshr_b32 s69, s23, 16 +; SI-NEXT: s_lshr_b32 s70, s23, 8 +; SI-NEXT: s_lshr_b32 s71, s21, 24 +; SI-NEXT: s_lshr_b32 s80, s21, 16 +; SI-NEXT: s_lshr_b32 s81, s21, 8 +; SI-NEXT: s_lshr_b32 s82, s19, 24 +; SI-NEXT: s_lshr_b32 s83, s19, 16 +; SI-NEXT: s_lshr_b32 s84, s19, 8 +; SI-NEXT: s_lshr_b32 s85, s17, 24 +; SI-NEXT: s_lshr_b32 s86, s17, 16 +; SI-NEXT: s_lshr_b32 s87, s17, 8 ; SI-NEXT: s_branch .LBB85_5 ; SI-NEXT: .LBB85_3: -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $sgpr63 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr59 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $sgpr57 +; SI-NEXT: ; implicit-def: $sgpr87 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr85 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr45 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr83 +; SI-NEXT: ; implicit-def: $sgpr82 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $sgpr9 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr67 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr65 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; 
implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr6 ; SI-NEXT: s_branch .LBB85_2 ; SI-NEXT: .LBB85_4: -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_mov_b32_e32 v3, s28 -; SI-NEXT: v_mov_b32_e32 v5, s26 +; SI-NEXT: v_mov_b32_e32 v32, s16 +; SI-NEXT: v_mov_b32_e32 v28, s18 +; SI-NEXT: v_mov_b32_e32 v20, s20 +; SI-NEXT: v_mov_b32_e32 v13, s22 ; SI-NEXT: v_mov_b32_e32 v7, s24 -; SI-NEXT: v_mov_b32_e32 v9, s22 -; SI-NEXT: v_mov_b32_e32 v11, s20 -; SI-NEXT: v_mov_b32_e32 v13, s18 -; SI-NEXT: v_mov_b32_e32 v15, s16 +; SI-NEXT: v_mov_b32_e32 v5, s26 +; SI-NEXT: v_mov_b32_e32 v3, s28 +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_mov_b32_e32 v53, s74 +; SI-NEXT: v_mov_b32_e32 v52, s62 +; SI-NEXT: v_mov_b32_e32 v51, s58 +; SI-NEXT: v_mov_b32_e32 v50, s56 +; SI-NEXT: v_mov_b32_e32 v49, s44 +; SI-NEXT: v_mov_b32_e32 v48, s42 +; SI-NEXT: v_mov_b32_e32 v38, s40 +; SI-NEXT: v_mov_b32_e32 v37, s12 +; SI-NEXT: v_mov_b32_e32 v36, s8 +; SI-NEXT: v_mov_b32_e32 v35, s38 +; SI-NEXT: v_mov_b32_e32 v34, s36 +; SI-NEXT: v_mov_b32_e32 v30, s34 +; SI-NEXT: v_mov_b32_e32 v26, s30 +; SI-NEXT: v_mov_b32_e32 v25, s94 +; SI-NEXT: v_mov_b32_e32 v24, s92 +; SI-NEXT: v_mov_b32_e32 v23, s88 +; SI-NEXT: v_mov_b32_e32 v22, s78 +; SI-NEXT: v_mov_b32_e32 v18, s76 +; SI-NEXT: v_mov_b32_e32 v17, s72 +; SI-NEXT: v_mov_b32_e32 v16, s60 +; SI-NEXT: v_mov_b32_e32 v15, s46 +; SI-NEXT: v_mov_b32_e32 v11, s14 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: v_mov_b32_e32 v9, s6 ; SI-NEXT: .LBB85_5: ; %end +; SI-NEXT: v_and_b32_e32 v2, 0xff, v32 +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v53 ; SI-NEXT: s_and_b32 s4, s17, 0xff -; SI-NEXT: s_lshl_b32 s6, s63, 8 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v32, 8, v32 -; SI-NEXT: v_and_b32_e32 v31, 0xff, v31 +; SI-NEXT: s_lshl_b32 s6, s87, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v52 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s62, 0xff -; SI-NEXT: v_or_b32_e32 v15, v15, v32 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v30 +; SI-NEXT: s_and_b32 s6, s86, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v51 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s61, 24 -; SI-NEXT: v_or_b32_e32 v30, v30, v31 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: s_lshl_b32 s7, s85, 24 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_or_b32_e32 v15, v15, v30 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v30, s4 -; SI-NEXT: buffer_store_dword v30, v15, s[0:3], 0 offen -; SI-NEXT: 
v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v29 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v50 ; SI-NEXT: s_and_b32 s4, s19, 0xff -; SI-NEXT: s_lshl_b32 s6, s60, 8 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v28 +; SI-NEXT: s_lshl_b32 s6, s84, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v49 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s59, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v27 +; SI-NEXT: s_and_b32 s6, s83, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s58, 24 -; SI-NEXT: v_or_b32_e32 v15, v27, v15 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: s_lshl_b32 s7, s82, 24 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v0 -; SI-NEXT: v_mov_b32_e32 v15, s4 -; SI-NEXT: buffer_store_dword v15, v13, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v16 +; SI-NEXT: v_add_i32_e32 v2, vcc, 12, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v20 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v38 ; SI-NEXT: s_and_b32 s4, s21, 0xff -; SI-NEXT: s_lshl_b32 s6, s57, 8 -; SI-NEXT: v_or_b32_e32 v11, v11, v13 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 +; SI-NEXT: s_lshl_b32 s6, s81, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v37 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s56, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; SI-NEXT: s_and_b32 s6, s80, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v36 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s47, 24 -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: s_lshl_b32 s7, s71, 24 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 16, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 16, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 20, v0 -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v26 +; SI-NEXT: v_add_i32_e32 v2, vcc, 20, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v13 +; 
SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v35 ; SI-NEXT: s_and_b32 s4, s23, 0xff -; SI-NEXT: s_lshl_b32 s6, s46, 8 -; SI-NEXT: v_or_b32_e32 v9, v9, v11 -; SI-NEXT: v_and_b32_e32 v11, 0xff, v25 +; SI-NEXT: s_lshl_b32 s6, s70, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v34 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s45, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; SI-NEXT: s_and_b32 s6, s69, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v30 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s44, 24 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: s_lshl_b32 s7, s68, 24 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 24, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v24 +; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v26 ; SI-NEXT: s_and_b32 s4, s25, 0xff -; SI-NEXT: s_lshl_b32 s6, s43, 8 -; SI-NEXT: v_or_b32_e32 v7, v7, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v23 +; SI-NEXT: s_lshl_b32 s6, s67, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v25 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s42, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; SI-NEXT: s_and_b32 s6, s66, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v24 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s41, 24 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: s_lshl_b32 s7, s65, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v6, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 32, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v22 +; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v5 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v23 ; SI-NEXT: s_and_b32 s4, s27, 0xff -; SI-NEXT: s_lshl_b32 s6, s40, 8 -; SI-NEXT: v_or_b32_e32 v5, v5, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v21 +; SI-NEXT: s_lshl_b32 s6, s64, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 
+; SI-NEXT: v_and_b32_e32 v4, 0xff, v22 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s15, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; SI-NEXT: s_and_b32 s6, s55, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v18 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s14, 24 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 +; SI-NEXT: s_lshl_b32 s7, s54, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 -; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v2, 0xff, v3 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17 ; SI-NEXT: s_and_b32 s4, s29, 0xff -; SI-NEXT: s_lshl_b32 s6, s13, 8 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v19 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_lshl_b32 s6, s53, 8 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v16 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: s_and_b32 s6, s12, 0xff -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v20 +; SI-NEXT: s_and_b32 s6, s52, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v15 ; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_lshl_b32 s7, s11, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: s_lshl_b32 s7, s51, 24 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 ; SI-NEXT: s_or_b32 s4, s4, s6 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v11 ; SI-NEXT: s_and_b32 s4, s5, 0xff -; SI-NEXT: s_lshl_b32 s5, s10, 8 -; SI-NEXT: v_or_b32_e32 v1, v1, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v17 +; SI-NEXT: s_lshl_b32 s5, s50, 8 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v10 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s9, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; SI-NEXT: s_and_b32 s5, s49, 0xff +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v9 ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; 
SI-NEXT: s_lshl_b32 s6, s8, 24 +; SI-NEXT: s_lshl_b32 s6, s48, 24 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 @@ -52054,6 +51866,41 @@ define inreg <64 x i8> @bitcast_v8f64_to_v64i8_scalar(<8 x double> inreg %a, i32 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s87, v40, 31 +; SI-NEXT: v_readlane_b32 s86, v40, 30 +; SI-NEXT: v_readlane_b32 s85, v40, 29 +; SI-NEXT: v_readlane_b32 s84, v40, 28 +; SI-NEXT: v_readlane_b32 s83, v40, 27 +; SI-NEXT: v_readlane_b32 s82, v40, 26 +; SI-NEXT: v_readlane_b32 s81, v40, 25 +; SI-NEXT: v_readlane_b32 s80, v40, 24 +; SI-NEXT: v_readlane_b32 s71, v40, 23 +; SI-NEXT: v_readlane_b32 s70, v40, 22 +; SI-NEXT: v_readlane_b32 s69, v40, 21 +; SI-NEXT: v_readlane_b32 s68, v40, 20 +; SI-NEXT: v_readlane_b32 s67, v40, 19 +; SI-NEXT: v_readlane_b32 s66, v40, 18 +; SI-NEXT: v_readlane_b32 s65, v40, 17 +; SI-NEXT: v_readlane_b32 s64, v40, 16 +; SI-NEXT: v_readlane_b32 s55, v40, 15 +; SI-NEXT: v_readlane_b32 s54, v40, 14 +; SI-NEXT: v_readlane_b32 s53, v40, 13 +; SI-NEXT: v_readlane_b32 s52, v40, 12 +; SI-NEXT: v_readlane_b32 s51, v40, 11 +; SI-NEXT: v_readlane_b32 s50, v40, 10 +; SI-NEXT: v_readlane_b32 s49, v40, 9 +; SI-NEXT: v_readlane_b32 s48, v40, 8 +; SI-NEXT: v_readlane_b32 s39, v40, 7 +; SI-NEXT: v_readlane_b32 s38, v40, 6 +; SI-NEXT: v_readlane_b32 s37, v40, 5 +; SI-NEXT: v_readlane_b32 s36, v40, 4 +; SI-NEXT: v_readlane_b32 s35, v40, 3 +; SI-NEXT: v_readlane_b32 s34, v40, 2 +; SI-NEXT: v_readlane_b32 s31, v40, 1 +; SI-NEXT: v_readlane_b32 s30, v40, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -55687,42 +55534,46 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v48, v30 +; SI-NEXT: v_mov_b32_e32 v48, v28 +; SI-NEXT: v_mov_b32_e32 v38, v26 +; SI-NEXT: v_mov_b32_e32 v49, v24 +; SI-NEXT: v_mov_b32_e32 v51, v14 +; SI-NEXT: v_mov_b32_e32 v54, v12 +; SI-NEXT: v_mov_b32_e32 v34, v10 +; SI-NEXT: v_mov_b32_e32 v44, v6 ; SI-NEXT: v_mov_b32_e32 v33, v4 ; SI-NEXT: v_mov_b32_e32 v32, v2 ; SI-NEXT: v_mov_b32_e32 v31, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 -; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:48 
-; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 -; SI-NEXT: v_lshlrev_b32_e32 v40, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 +; SI-NEXT: v_lshlrev_b32_e32 v50, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v57, 24, v17 ; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v21 ; SI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 @@ -55734,49 +55585,48 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v23, 8, v2 ; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v51 -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v39 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 -; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_lshlrev_b32_e32 v51, 8, v30 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v27, 24, v42 -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v44 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v6 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v52 
+; SI-NEXT: s_waitcnt vmcnt(11) +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v10 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v12 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 8, v14 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_lshlrev_b32_e32 v14, 24, v28 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_lshlrev_b32_e32 v28, 8, v26 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v24 +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB87_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v33 +; SI-NEXT: v_or_b32_e32 v0, v0, v39 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: v_or_b32_e32 v4, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v12 -; SI-NEXT: v_or_b32_e32 v0, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_or_b32_e32 v0, v0, v60 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v6 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v6, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v51 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v16 -; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_or_b32_e32 v0, v0, v58 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v7, v1 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: v_or_b32_e32 v7, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v18 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v20 @@ -55784,113 +55634,80 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v47, v1 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_mov_b32_e32 v29, v8 +; SI-NEXT: v_mov_b32_e32 v26, v8 ; SI-NEXT: v_and_b32_e32 v3, 0xff, v8 ; SI-NEXT: v_or_b32_e32 v8, v0, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v24 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 ; SI-NEXT: v_or_b32_e32 v0, v0, v46 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v9, v1 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v56, v9 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_or_b32_e32 v9, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v26 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v38 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v48 ; SI-NEXT: v_or_b32_e32 v0, v0, v45 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_mov_b32_e32 
v44, v10 ; SI-NEXT: v_or_b32_e32 v10, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v48 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v42 ; SI-NEXT: v_or_b32_e32 v0, v0, v23 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v11, v1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v25, v11 ; SI-NEXT: v_or_b32_e32 v11, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v60 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v40 ; SI-NEXT: v_or_b32_e32 v0, v0, v21 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v19, v1 -; SI-NEXT: v_mov_b32_e32 v36, v12 ; SI-NEXT: v_or_b32_e32 v12, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v34 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v55 ; SI-NEXT: v_or_b32_e32 v0, v0, v17 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v1, v13, v1 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 ; SI-NEXT: v_or_b32_e32 v13, v0, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v55 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 -; SI-NEXT: v_or_b32_e32 v0, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v36 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v53 +; SI-NEXT: v_or_b32_e32 v0, v0, v29 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 -; SI-NEXT: v_mov_b32_e32 v52, v14 +; SI-NEXT: v_or_b32_e32 v1, v14, v1 ; SI-NEXT: v_or_b32_e32 v14, v0, v1 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v37 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v43 +; SI-NEXT: v_or_b32_e32 v0, v0, v28 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: v_or_b32_e32 v1, v42, v1 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v0, 0xff, v53 -; SI-NEXT: v_or_b32_e32 v0, v0, v15 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_or_b32_e32 v1, v15, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v44 ; SI-NEXT: v_or_b32_e32 v15, v0, v1 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v62 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_or_b32_e32 v3, v61, v3 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v40, v5 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v5, v2, v3 -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: 
s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_lshl_b32 s7, s23, 24 -; SI-NEXT: s_lshl_b32 s8, s27, 24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_or_b32_e32 v3, s4, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_or_b32 s4, s4, s5 @@ -55899,6 +55716,7 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s22, 0xff ; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_or_b32 s5, s5, s6 @@ -55907,65 +55725,116 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s26, 0xff ; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s27, 24 ; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_cbranch_execnz .LBB87_3 ; SI-NEXT: .LBB87_2: ; %cmp.true +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 +; SI-NEXT: v_or_b32_e32 v1, v39, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v2, v63, v2 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: s_add_i32 s28, s28, 3 ; SI-NEXT: s_and_b32 s4, s28, 0xff ; SI-NEXT: s_lshl_b32 s5, s29, 8 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v32 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v33 ; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v1, v38, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v42, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_or_b32_e32 v0, v50, v0 ; SI-NEXT: v_or_b32_e32 v0, s4, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v39, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v27 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x3000000, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v29 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v40, v1 +; SI-NEXT: v_or_b32_e32 v1, v61, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: 
v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v36 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v59, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v47, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v48 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v25, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v23, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -56007,100 +55876,58 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: s_add_i32 s4, s4, 0x3000000 ; SI-NEXT: s_add_i32 s5, s5, 0x3000000 ; SI-NEXT: s_add_i32 s6, s6, 0x3000000 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v57, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x3000000, v0 -; 
SI-NEXT: v_add_i32_e32 v0, vcc, 3, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v18 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 +; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v26 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_or_b32_e32 v0, v21, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v28, v1 +; SI-NEXT: v_or_b32_e32 v1, v19, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v50 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v25, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v49 -; SI-NEXT: v_or_b32_e32 v0, v23, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v21, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 -; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v35 -; SI-NEXT: v_or_b32_e32 v0, v19, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v17, v1 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 
3, v54 -; SI-NEXT: v_or_b32_e32 v0, v48, v0 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x3000000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v37 -; SI-NEXT: v_or_b32_e32 v0, v51, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v43 +; SI-NEXT: v_or_b32_e32 v0, v28, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v27, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x3000000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -56123,47 +55950,12 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB87_4: -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v27, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v38, v1 -; SI-NEXT: v_mov_b32_e32 v43, v6 -; SI-NEXT: v_mov_b32_e32 v29, v8 -; SI-NEXT: v_mov_b32_e32 v44, v10 -; SI-NEXT: v_mov_b32_e32 v36, v12 -; SI-NEXT: v_mov_b32_e32 v52, v14 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v16, v18 -; SI-NEXT: v_mov_b32_e32 v18, v20 -; SI-NEXT: v_mov_b32_e32 v20, v22 -; SI-NEXT: v_mov_b32_e32 v22, v24 -; SI-NEXT: v_mov_b32_e32 v24, v26 -; SI-NEXT: v_mov_b32_e32 v26, v28 -; SI-NEXT: v_mov_b32_e32 v30, v48 -; SI-NEXT: v_mov_b32_e32 v39, v40 -; SI-NEXT: v_mov_b32_e32 v41, v3 -; SI-NEXT: v_mov_b32_e32 v40, v5 -; SI-NEXT: v_mov_b32_e32 v63, v59 -; SI-NEXT: v_mov_b32_e32 v62, v58 -; SI-NEXT: v_mov_b32_e32 v61, v57 -; SI-NEXT: v_mov_b32_e32 v57, v7 -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v46 -; SI-NEXT: v_mov_b32_e32 v56, v9 -; SI-NEXT: v_mov_b32_e32 v46, v45 -; SI-NEXT: v_mov_b32_e32 v28, v25 -; SI-NEXT: v_mov_b32_e32 v45, v23 -; SI-NEXT: v_mov_b32_e32 v25, v11 -; SI-NEXT: v_mov_b32_e32 v23, v21 -; SI-NEXT: v_mov_b32_e32 v21, v19 -; SI-NEXT: v_mov_b32_e32 v19, v17 -; SI-NEXT: v_mov_b32_e32 v17, v13 -; SI-NEXT: v_mov_b32_e32 v48, v51 -; SI-NEXT: v_mov_b32_e32 v51, v15 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v26, v8 +; SI-NEXT: v_mov_b32_e32 v52, v42 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; SI-NEXT: s_branch .LBB87_2 ; @@ -56186,142 +55978,121 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: 
buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v37, v30 -; VI-NEXT: v_mov_b32_e32 v61, v28 -; VI-NEXT: v_mov_b32_e32 v31, v0 +; VI-NEXT: v_mov_b32_e32 v36, v28 +; VI-NEXT: v_mov_b32_e32 v35, v26 +; VI-NEXT: v_mov_b32_e32 v34, v24 +; VI-NEXT: v_mov_b32_e32 v39, v14 +; VI-NEXT: v_mov_b32_e32 v48, v12 +; VI-NEXT: v_mov_b32_e32 v49, v10 +; VI-NEXT: v_mov_b32_e32 v50, v8 +; VI-NEXT: v_mov_b32_e32 v51, v6 +; VI-NEXT: v_mov_b32_e32 v44, v2 +; VI-NEXT: v_mov_b32_e32 v45, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; VI-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; VI-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; VI-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 -; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; VI-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; VI-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; VI-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; VI-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; VI-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; VI-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; VI-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; VI-NEXT: v_mov_b32_e32 v37, v30 +; VI-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; VI-NEXT: v_lshlrev_b32_e32 v61, 
8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v57, 8, v17 ; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; VI-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; VI-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; VI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v29 ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v33 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v35 -; VI-NEXT: s_waitcnt vmcnt(12) -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; VI-NEXT: s_waitcnt vmcnt(10) -; VI-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; VI-NEXT: s_waitcnt vmcnt(8) -; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; VI-NEXT: s_waitcnt vmcnt(6) -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; VI-NEXT: s_waitcnt vmcnt(4) -; VI-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; VI-NEXT: s_waitcnt vmcnt(13) +; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; VI-NEXT: s_waitcnt vmcnt(11) +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; VI-NEXT: s_waitcnt vmcnt(9) +; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; VI-NEXT: s_waitcnt vmcnt(7) +; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v24 +; VI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; VI-NEXT: s_cbranch_scc0 .LBB87_4 ; VI-NEXT: ; %bb.1: ; %cmp.false -; VI-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v35, v4 +; VI-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v26, v4 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: 
v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 +; VI-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 +; VI-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v39, v10 +; VI-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v25, v11 +; VI-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: 
v_mov_b32_e32 v43, v12 +; VI-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v61, v23 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 +; VI-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_waitcnt vmcnt(3) -; VI-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v41, v5 +; VI-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff @@ -56351,47 +56122,91 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 -; VI-NEXT: v_mov_b32_e32 v30, v37 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 
v27, v42 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_cbranch_execnz .LBB87_3 ; VI-NEXT: .LBB87_2: ; %cmp.true +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v26 +; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; VI-NEXT: s_add_i32 s28, s28, 3 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_or_b32 s4, s5, s4 ; VI-NEXT: s_addk_i32 s4, 0x300 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v31 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v44 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v45 ; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v0, s4, v0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 ; VI-NEXT: v_add_u32_e32 v4, vcc, 0x3000000, v1 -; VI-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 +; VI-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v5, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 -; VI-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v48 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v6, vcc, 0x3000000, v0 -; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 
offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v16 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v18 +; VI-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v20 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v22 +; VI-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v34 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v35 +; VI-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v36 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v37 +; VI-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v40 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 +; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 +; VI-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 ; VI-NEXT: s_add_i32 s16, s16, 3 ; VI-NEXT: s_and_b32 s4, s16, 0xff ; VI-NEXT: s_lshl_b32 s5, s17, 8 @@ -56431,76 +56246,35 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_add_i32 s4, s4, 0x3000000 ; VI-NEXT: s_add_i32 s5, s5, 0x3000000 ; VI-NEXT: s_add_i32 s6, s6, 
0x3000000 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v0 -; VI-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v1 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v16 -; VI-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v18 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v20 -; VI-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v22 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v24 -; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v26 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v30 -; VI-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v62 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v60 -; VI-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v29 -; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v54 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v52 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v33 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v41 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v51 -; VI-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v50 +; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v43 ; VI-NEXT: v_add_u32_e32 v0, vcc, 0x300, v0 -; VI-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x3000000, v0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: .LBB87_3: ; %end ; VI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; VI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -56521,43 +56295,9 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB87_4: -; VI-NEXT: v_mov_b32_e32 v44, v2 -; VI-NEXT: v_mov_b32_e32 v34, v39 -; VI-NEXT: v_mov_b32_e32 v35, v4 -; VI-NEXT: v_mov_b32_e32 v29, v33 -; VI-NEXT: v_mov_b32_e32 v49, v6 -; VI-NEXT: v_mov_b32_e32 v48, v8 -; VI-NEXT: v_mov_b32_e32 v39, v10 -; VI-NEXT: v_mov_b32_e32 v43, v12 -; VI-NEXT: v_mov_b32_e32 v16, v18 -; VI-NEXT: v_mov_b32_e32 v18, v20 -; VI-NEXT: v_mov_b32_e32 v20, v22 -; VI-NEXT: v_mov_b32_e32 v22, v24 -; VI-NEXT: v_mov_b32_e32 v24, v26 -; VI-NEXT: v_mov_b32_e32 v26, v61 -; VI-NEXT: v_mov_b32_e32 v30, v37 -; VI-NEXT: v_mov_b32_e32 v38, v1 -; VI-NEXT: v_mov_b32_e32 v41, v5 -; VI-NEXT: v_mov_b32_e32 v40, v3 -; VI-NEXT: v_mov_b32_e32 v63, v59 -; VI-NEXT: v_mov_b32_e32 v36, v58 -; VI-NEXT: v_mov_b32_e32 v58, v57 -; VI-NEXT: v_mov_b32_e32 v57, v7 -; VI-NEXT: v_mov_b32_e32 v59, v56 -; VI-NEXT: v_mov_b32_e32 v56, v47 -; VI-NEXT: v_mov_b32_e32 v47, v46 -; VI-NEXT: v_mov_b32_e32 v46, v9 -; VI-NEXT: v_mov_b32_e32 v45, v25 -; VI-NEXT: v_mov_b32_e32 v61, 
v23 -; VI-NEXT: v_mov_b32_e32 v25, v11 -; VI-NEXT: v_mov_b32_e32 v23, v21 -; VI-NEXT: v_mov_b32_e32 v21, v19 -; VI-NEXT: v_mov_b32_e32 v19, v17 -; VI-NEXT: v_mov_b32_e32 v17, v13 -; VI-NEXT: v_mov_b32_e32 v37, v27 -; VI-NEXT: v_mov_b32_e32 v27, v42 -; VI-NEXT: v_mov_b32_e32 v33, v28 -; VI-NEXT: v_mov_b32_e32 v28, v15 +; VI-NEXT: v_mov_b32_e32 v28, v44 +; VI-NEXT: v_mov_b32_e32 v26, v4 +; VI-NEXT: v_mov_b32_e32 v33, v42 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB87_2 ; @@ -56580,147 +56320,124 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v37, v30 -; GFX9-NEXT: v_mov_b32_e32 v61, v28 -; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: v_mov_b32_e32 v36, v28 +; GFX9-NEXT: v_mov_b32_e32 v35, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v24 +; GFX9-NEXT: v_mov_b32_e32 v39, v14 +; GFX9-NEXT: v_mov_b32_e32 v48, v12 +; GFX9-NEXT: v_mov_b32_e32 v49, v10 +; GFX9-NEXT: v_mov_b32_e32 v50, v8 +; GFX9-NEXT: v_mov_b32_e32 v51, v6 +; GFX9-NEXT: v_mov_b32_e32 v44, v2 +; GFX9-NEXT: v_mov_b32_e32 v45, v0 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_load_ushort v62, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_ushort v60, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v34, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_ushort v51, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_load_ushort v50, off, s[0:3], s32 offset:68 -; GFX9-NEXT: v_lshlrev_b32_e32 v32, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v39, 8, v3 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v17 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:4 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v55, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:20 +; GFX9-NEXT: 
buffer_load_ushort v10, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v53, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_load_ushort v28, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_load_ushort v26, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:68 +; GFX9-NEXT: v_mov_b32_e32 v37, v30 +; GFX9-NEXT: v_lshlrev_b32_e32 v30, 8, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v9 +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v57, 8, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v56, 8, v19 ; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v21 ; GFX9-NEXT: v_lshlrev_b32_e32 v46, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v25 -; GFX9-NEXT: v_lshlrev_b32_e32 v45, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v29 +; GFX9-NEXT: s_waitcnt vmcnt(19) ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_waitcnt vmcnt(21) -; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v48 -; GFX9-NEXT: s_waitcnt vmcnt(20) -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v28 -; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec ; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v38 -; GFX9-NEXT: s_waitcnt vmcnt(16) -; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v36 -; GFX9-NEXT: s_waitcnt vmcnt(14) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v35 -; GFX9-NEXT: s_waitcnt vmcnt(12) -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v34 -; GFX9-NEXT: s_waitcnt vmcnt(10) -; GFX9-NEXT: v_lshlrev_b32_e32 v28, 8, v30 -; GFX9-NEXT: s_waitcnt vmcnt(8) -; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v42 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v43 -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v44 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(17) +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v33 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_waitcnt vmcnt(15) +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(13) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(11) +; GFX9-NEXT: v_lshlrev_b32_e32 v33, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(9) +; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v28 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v26 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, 
v24 +; GFX9-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-NEXT: s_cbranch_scc0 .LBB87_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v35, v4 +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v4, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v26, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v48, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v57 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v7 +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v57 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v18, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v20, v47 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v22, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: 
v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v45 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v61, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_or_b32_sdwa v0, v35, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v37, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v62, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v60, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v33, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v43, v12 +; GFX9-NEXT: v_or_b32_sdwa v0, v55, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v54, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v55, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v54, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_mov_b32_e32 v17, v13 +; GFX9-NEXT: v_or_b32_sdwa v0, v53, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v53, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v42, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v41, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v51, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_or_b32_sdwa v1, v50, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_or_b32_sdwa v2, v51, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff -; GFX9-NEXT: v_or_b32_sdwa v0, v31, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v41, v5 +; GFX9-NEXT: v_or_b32_sdwa v0, v45, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s4, v0 ; GFX9-NEXT: s_and_b32 s4, s16, 0xff @@ -56750,48 +56467,78 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB87_3 ; GFX9-NEXT: .LBB87_2: ; %cmp.true +; GFX9-NEXT: v_add_u32_e32 v1, 3, v28 +; GFX9-NEXT: v_add_u32_e32 v2, 3, v26 +; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_or_b32_sdwa v1, v38, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 +; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s5, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s29, 8 ; GFX9-NEXT: s_or_b32 s5, s6, s5 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v31 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v44 -; GFX9-NEXT: v_add_u32_e32 v2, 3, v35 -; GFX9-NEXT: s_movk_i32 s4, 0x300 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v45 ; GFX9-NEXT: s_addk_i32 s5, 0x300 -; GFX9-NEXT: v_or_b32_sdwa v0, v32, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v34, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v30, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_and_b32 s5, s5, 0xffff ; GFX9-NEXT: v_add_u32_sdwa v0, v0, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 
v1, 0x300, v1 -; GFX9-NEXT: v_add_u32_sdwa v2, v2, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, s5, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 +; GFX9-NEXT: v_or_b32_sdwa v0, v62, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v61, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v49 ; GFX9-NEXT: v_add_u32_e32 v1, 3, v48 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v60, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_add_u32_e32 v0, 3, v39 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v0, v63, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v16 +; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v18 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v20 +; GFX9-NEXT: v_or_b32_sdwa v0, v56, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v47, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v22 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v35 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v36 +; GFX9-NEXT: v_or_b32_sdwa v0, v25, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v23, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v37 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v40 +; GFX9-NEXT: v_or_b32_sdwa v0, v21, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: s_add_i32 s16, s16, 3 ; GFX9-NEXT: s_and_b32 s5, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s17, 8 @@ -56815,6 +56562,20 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_or_b32 s7, s8, s7 ; GFX9-NEXT: s_addk_i32 s6, 0x300 ; GFX9-NEXT: s_addk_i32 s7, 0x300 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 +; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v1, v29, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 ; GFX9-NEXT: s_and_b32 s6, s6, 0xffff ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 ; GFX9-NEXT: s_add_i32 s24, s24, 3 @@ -56831,76 +56592,35 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s7, s7, 0xffff ; GFX9-NEXT: s_lshl_b32 s8, s8, 16 ; GFX9-NEXT: s_or_b32 s7, s7, s8 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v0, 3, v0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v58, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v57, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v16 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v18 
-; GFX9-NEXT: v_or_b32_sdwa v0, v59, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v20 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v22 -; GFX9-NEXT: v_or_b32_sdwa v0, v47, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v46, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_add_u32_e32 v0, 3, v24 -; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v26 -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v30 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v62 -; GFX9-NEXT: v_or_b32_sdwa v0, v61, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v25, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v60 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v29 -; GFX9-NEXT: v_or_b32_sdwa v0, v23, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v21, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 -; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v55 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v54 -; GFX9-NEXT: v_or_b32_sdwa v0, v19, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v52 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v53 -; GFX9-NEXT: 
v_add_u32_e32 v1, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v0, v33, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v37, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_add_u32_e32 v0, 3, v33 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v41 +; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v0, 3, v51 -; GFX9-NEXT: v_add_u32_e32 v1, 3, v50 -; GFX9-NEXT: v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v1, v27, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v1, 3, v43 +; GFX9-NEXT: v_or_b32_sdwa v0, v31, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 ; GFX9-NEXT: v_add_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: .LBB87_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload @@ -56921,43 +56641,9 @@ define inreg <8 x double> @bitcast_v64i8_to_v8f64_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB87_4: -; GFX9-NEXT: v_mov_b32_e32 v44, v2 -; GFX9-NEXT: v_mov_b32_e32 v34, v39 -; GFX9-NEXT: v_mov_b32_e32 v35, v4 -; GFX9-NEXT: v_mov_b32_e32 v29, v33 -; GFX9-NEXT: v_mov_b32_e32 v49, v6 -; GFX9-NEXT: v_mov_b32_e32 v48, v8 -; GFX9-NEXT: v_mov_b32_e32 v39, v10 -; GFX9-NEXT: v_mov_b32_e32 v43, v12 -; GFX9-NEXT: v_mov_b32_e32 v16, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v22 -; GFX9-NEXT: v_mov_b32_e32 v22, v24 -; GFX9-NEXT: v_mov_b32_e32 v24, v26 -; GFX9-NEXT: v_mov_b32_e32 v26, v61 -; GFX9-NEXT: v_mov_b32_e32 v30, v37 -; GFX9-NEXT: v_mov_b32_e32 v38, v1 -; GFX9-NEXT: v_mov_b32_e32 v41, v5 -; GFX9-NEXT: v_mov_b32_e32 v40, v3 -; GFX9-NEXT: v_mov_b32_e32 v63, v59 -; GFX9-NEXT: v_mov_b32_e32 v36, v58 -; GFX9-NEXT: v_mov_b32_e32 v58, v57 -; GFX9-NEXT: v_mov_b32_e32 v57, v7 -; GFX9-NEXT: v_mov_b32_e32 v59, v56 -; GFX9-NEXT: v_mov_b32_e32 v56, v47 -; GFX9-NEXT: v_mov_b32_e32 v47, v46 -; GFX9-NEXT: v_mov_b32_e32 v46, v9 -; GFX9-NEXT: v_mov_b32_e32 v45, v25 -; GFX9-NEXT: v_mov_b32_e32 v61, v23 -; GFX9-NEXT: v_mov_b32_e32 v25, v11 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v21, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: 
v_mov_b32_e32 v17, v13 -; GFX9-NEXT: v_mov_b32_e32 v37, v27 -; GFX9-NEXT: v_mov_b32_e32 v27, v42 -; GFX9-NEXT: v_mov_b32_e32 v33, v28 -; GFX9-NEXT: v_mov_b32_e32 v28, v15 +; GFX9-NEXT: v_mov_b32_e32 v28, v44 +; GFX9-NEXT: v_mov_b32_e32 v26, v4 +; GFX9-NEXT: v_mov_b32_e32 v33, v42 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB87_2 ; @@ -58845,95 +58531,120 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-LABEL: bitcast_v32f16_to_v32i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v31, v17 -; SI-NEXT: v_mov_b32_e32 v30, v16 -; SI-NEXT: v_mov_b32_e32 v29, v15 -; SI-NEXT: v_mov_b32_e32 v28, v14 -; SI-NEXT: v_mov_b32_e32 v15, v1 -; SI-NEXT: v_mov_b32_e32 v14, v0 +; SI-NEXT: v_mov_b32_e32 v21, v16 +; SI-NEXT: v_mov_b32_e32 v25, v15 +; SI-NEXT: v_mov_b32_e32 v26, v12 +; SI-NEXT: v_mov_b32_e32 v29, v11 +; SI-NEXT: v_mov_b32_e32 v22, v8 +; SI-NEXT: v_mov_b32_e32 v30, v7 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v33, v3 +; SI-NEXT: v_mov_b32_e32 v34, v0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s23 ; SI-NEXT: v_cvt_f16_f32_e32 v8, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 ; SI-NEXT: v_cvt_f16_f32_e32 v11, s27 ; SI-NEXT: v_cvt_f16_f32_e32 v12, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v37, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB91_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB91_3 ; SI-NEXT: .LBB91_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; 
SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v15 +; SI-NEXT: v_or_b32_e32 v14, v14, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v11 +; SI-NEXT: v_or_b32_e32 v18, v18, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v27 +; 
SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v35 +; SI-NEXT: v_or_b32_e32 v6, v6, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v33 +; SI-NEXT: v_or_b32_e32 v10, v10, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v38 +; SI-NEXT: v_or_b32_e32 v22, v22, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 +; SI-NEXT: v_or_b32_e32 v26, v26, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 @@ -58942,52 +58653,34 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v30, v30, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_or_b32_e32 v26, v26, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_or_b32_e32 v22, v22, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_or_b32_e32 v18, v18, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_or_b32_e32 v14, v14, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 @@ -58996,19 +58689,23 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x 
half> inreg %a, i ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v10, v10, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v32 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v32 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v31 +; SI-NEXT: v_lshr_b64 v[48:49], v[17:18], 16 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; SI-NEXT: v_or_b32_e32 v30, v30, v32 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshr_b64 v[37:38], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[29:30], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_or_b32_e32 v4, v4, v5 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 @@ -59017,15 +58714,15 @@ define inreg <32 x i16> @bitcast_v32f16_to_v32i16_scalar(<32 x half> inreg %a, i ; SI-NEXT: v_or_b32_e32 v20, v20, v21 ; SI-NEXT: v_or_b32_e32 v24, v24, v25 ; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 ; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v37 +; SI-NEXT: v_mov_b32_e32 v5, v35 +; SI-NEXT: v_mov_b32_e32 v9, v33 +; SI-NEXT: v_mov_b32_e32 v13, v38 +; SI-NEXT: v_mov_b32_e32 v17, v48 +; SI-NEXT: v_mov_b32_e32 v21, v49 +; SI-NEXT: v_mov_b32_e32 v25, v52 +; SI-NEXT: v_mov_b32_e32 v29, v50 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB91_4: ; SI-NEXT: s_branch .LBB91_2 @@ -61430,185 +61127,185 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_mul_f32_e64 v57, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v56, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v0 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v0 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v1 ; SI-NEXT: v_mul_f32_e32 v47, 1.0, v2 ; SI-NEXT: v_mul_f32_e32 v46, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v38, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v53, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v5 ; SI-NEXT: v_mul_f32_e32 v45, 1.0, v6 ; SI-NEXT: v_mul_f32_e32 v44, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v51, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v50, 1.0, v9 +; SI-NEXT: v_mul_f32_e32 v55, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v21, 1.0, v9 ; SI-NEXT: v_mul_f32_e32 v43, 1.0, v10 ; SI-NEXT: v_mul_f32_e32 v42, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v53, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v52, 1.0, v13 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v13, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v41, 1.0, v14 ; SI-NEXT: v_mul_f32_e32 v40, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v55, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v54, 1.0, v17 -; SI-NEXT: v_mul_f32_e64 v33, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v32, 1.0, s19 +; SI-NEXT: v_mul_f32_e32 
v29, 1.0, v16 +; SI-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_mul_f32_e64 v32, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mul_f32_e64 v63, 1.0, s20 ; SI-NEXT: v_mul_f32_e64 v62, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v37, 1.0, s22 -; SI-NEXT: v_mul_f32_e64 v36, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v51, 1.0, s22 +; SI-NEXT: v_mul_f32_e64 v5, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v61, 1.0, s24 ; SI-NEXT: v_mul_f32_e64 v60, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v49, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v48, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v54, 1.0, s26 +; SI-NEXT: v_mul_f32_e64 v9, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v59, 1.0, s28 ; SI-NEXT: v_mul_f32_e64 v58, 1.0, s29 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v57 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v33 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v56 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v63 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v37 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v62 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v51 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v61 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v60 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v48 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v60 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v59 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v58 -; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v35 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v39 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v47 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v46 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v39 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v52 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v45 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v44 -; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v51 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v50 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v42 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v53 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v52 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v40 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v58 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v46 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v53 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v42 +; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v29 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v56 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v57 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v28, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; 
SI-NEXT: v_lshrrev_b32_e32 v2, 16, v28 ; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v62 ; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v63 ; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v60 ; SI-NEXT: v_alignbit_b32 v4, v4, v2, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v61 -; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_alignbit_b32 v8, v6, v2, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v58 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6 -; SI-NEXT: v_alignbit_b32 v12, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 +; SI-NEXT: v_alignbit_b32 v8, v7, v2, 16 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v58 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v59 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; SI-NEXT: v_alignbit_b32 v16, v9, v2, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v44 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 +; SI-NEXT: v_alignbit_b32 v12, v10, v2, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v46 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v47 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 +; SI-NEXT: v_alignbit_b32 v16, v11, v2, 16 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v44 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v45 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9 -; SI-NEXT: v_alignbit_b32 v20, v10, v2, 16 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v42 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 +; SI-NEXT: v_alignbit_b32 v20, v14, v2, 16 +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v42 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v43 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_alignbit_b32 v24, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v40 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14 +; SI-NEXT: v_alignbit_b32 v24, v15, v2, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v41 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v40 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; SI-NEXT: v_alignbit_b32 v28, v13, v2, 16 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v54 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v29 +; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v21 +; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v55 ; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v17, 
0xffff0000, v17 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v13 -; SI-NEXT: v_and_b32_e32 v29, 0xffff0000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v13 +; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v11 ; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v52 -; SI-NEXT: v_alignbit_b32 v30, v31, v2, 16 +; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 ; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v17 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11 -; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v50 -; SI-NEXT: v_alignbit_b32 v26, v27, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 -; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; SI-NEXT: v_and_b32_e32 v21, 0xffff0000, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v38 -; SI-NEXT: v_alignbit_b32 v22, v23, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v39 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v9 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v7 -; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v34 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; SI-NEXT: v_alignbit_b32 v18, v19, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; SI-NEXT: v_alignbit_b32 v30, v31, v15, 16 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v48 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v10 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v9 +; SI-NEXT: v_and_b32_e32 v25, 0xffff0000, v14 ; SI-NEXT: v_alignbit_b32 v14, v15, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v49 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v54 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v36 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; SI-NEXT: v_alignbit_b32 v10, v11, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v37 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v51 ; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 ; SI-NEXT: v_alignbit_b32 v6, v7, v2, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; 
SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v13, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v21, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v25, 16 -; SI-NEXT: v_alignbit_b32 v29, v30, v29, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v28 +; SI-NEXT: v_lshr_b64 v[33:34], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[29:30], 16 +; SI-NEXT: v_alignbit_b32 v28, v40, v41, 16 ; SI-NEXT: .LBB95_3: ; %end ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload @@ -61626,41 +61323,49 @@ define inreg <32 x i16> @bitcast_v32bf16_to_v32i16_scalar(<32 x bfloat> inreg %a ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v1, v33 +; SI-NEXT: v_mov_b32_e32 v5, v34 +; SI-NEXT: v_mov_b32_e32 v9, v35 +; SI-NEXT: v_mov_b32_e32 v13, v36 +; SI-NEXT: v_mov_b32_e32 v17, v37 +; SI-NEXT: v_mov_b32_e32 v21, v38 +; SI-NEXT: v_mov_b32_e32 v25, v50 +; SI-NEXT: v_mov_b32_e32 v29, v48 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_branch .LBB95_2 ; ; VI-LABEL: bitcast_v32bf16_to_v32i16_scalar: @@ -64884,534 +64589,686 @@ define inreg <64 x i8> @bitcast_v32i16_to_v64i8_scalar(<32 x i16> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_store_dword v37, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[4:5] -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v37, s30, 0 -; SI-NEXT: v_writelane_b32 v37, s31, 1 -; SI-NEXT: v_writelane_b32 v37, s34, 2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_writelane_b32 v20, s30, 0 +; SI-NEXT: v_writelane_b32 v20, s31, 1 +; SI-NEXT: v_writelane_b32 v20, s34, 2 +; SI-NEXT: v_writelane_b32 v20, s35, 3 +; SI-NEXT: v_writelane_b32 v20, s36, 4 +; SI-NEXT: v_writelane_b32 v20, s37, 5 +; SI-NEXT: v_writelane_b32 v20, s38, 6 +; SI-NEXT: v_writelane_b32 v20, s39, 7 +; SI-NEXT: v_writelane_b32 v20, s48, 8 +; SI-NEXT: v_writelane_b32 v20, s49, 9 +; SI-NEXT: v_writelane_b32 v20, s50, 10 +; SI-NEXT: v_writelane_b32 v20, s51, 11 +; SI-NEXT: v_writelane_b32 v20, s52, 12 +; SI-NEXT: v_writelane_b32 v20, s53, 13 +; SI-NEXT: v_writelane_b32 v20, s54, 14 +; SI-NEXT: v_writelane_b32 v20, s55, 15 +; SI-NEXT: v_writelane_b32 v20, s64, 16 +; SI-NEXT: v_writelane_b32 v20, s65, 17 +; SI-NEXT: v_writelane_b32 v20, s66, 18 +; SI-NEXT: v_writelane_b32 v20, s67, 19 +; SI-NEXT: v_writelane_b32 v20, s68, 20 +; SI-NEXT: v_writelane_b32 v20, s69, 21 +; SI-NEXT: v_writelane_b32 v20, s70, 22 +; SI-NEXT: v_writelane_b32 v20, s71, 23 +; SI-NEXT: v_writelane_b32 v20, s80, 24 +; SI-NEXT: v_writelane_b32 v20, s81, 25 +; SI-NEXT: v_writelane_b32 v20, s82, 26 +; SI-NEXT: v_writelane_b32 v20, s83, 27 +; SI-NEXT: v_writelane_b32 v20, s84, 28 +; SI-NEXT: v_writelane_b32 v20, s85, 29 +; SI-NEXT: v_writelane_b32 v20, s86, 30 +; SI-NEXT: v_writelane_b32 v20, s87, 31 +; SI-NEXT: v_writelane_b32 v20, s96, 32 +; SI-NEXT: v_writelane_b32 v20, s97, 33 +; SI-NEXT: v_writelane_b32 v20, s98, 34 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 -; SI-NEXT: v_writelane_b32 v37, s35, 3 -; SI-NEXT: v_readfirstlane_b32 s34, v18 -; SI-NEXT: v_readfirstlane_b32 s35, v17 -; SI-NEXT: v_readfirstlane_b32 s30, v14 -; SI-NEXT: v_readfirstlane_b32 s31, v13 -; SI-NEXT: v_readfirstlane_b32 s94, v10 -; SI-NEXT: v_readfirstlane_b32 s95, v9 -; SI-NEXT: v_readfirstlane_b32 s92, v6 -; SI-NEXT: v_readfirstlane_b32 s93, v5 -; SI-NEXT: v_readfirstlane_b32 s90, v2 -; SI-NEXT: v_readfirstlane_b32 s91, v1 +; SI-NEXT: v_writelane_b32 v20, s99, 35 +; SI-NEXT: s_mov_b32 s93, s18 +; SI-NEXT: s_mov_b32 s31, s17 +; SI-NEXT: v_readfirstlane_b32 s59, v18 +; SI-NEXT: v_readfirstlane_b32 s18, v17 +; SI-NEXT: v_readfirstlane_b32 s63, v16 +; SI-NEXT: v_readfirstlane_b32 s17, v15 +; SI-NEXT: v_readfirstlane_b32 s72, v14 +; SI-NEXT: v_readfirstlane_b32 s76, v13 +; SI-NEXT: v_readfirstlane_b32 s57, v12 +; SI-NEXT: v_readfirstlane_b32 s61, v11 +; SI-NEXT: v_readfirstlane_b32 s44, v10 +; SI-NEXT: v_readfirstlane_b32 s58, v9 +; SI-NEXT: v_readfirstlane_b32 s62, v8 +; SI-NEXT: v_readfirstlane_b32 s45, v7 +; SI-NEXT: v_readfirstlane_b32 s96, v6 +; SI-NEXT: v_readfirstlane_b32 s97, v5 +; SI-NEXT: v_readfirstlane_b32 s99, v4 +; SI-NEXT: v_readfirstlane_b32 s46, v3 +; SI-NEXT: v_readfirstlane_b32 s83, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v4 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v16 +; SI-NEXT: v_readfirstlane_b32 s85, v1 +; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane ; 
SI-NEXT: s_cbranch_scc0 .LBB97_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: s_lshl_b32 s5, s31, 16 ; SI-NEXT: s_or_b32 s40, s4, s5 -; SI-NEXT: s_and_b32 s4, s18, 0xffff +; SI-NEXT: s_and_b32 s4, s93, 0xffff ; SI-NEXT: s_lshl_b32 s5, s19, 16 ; SI-NEXT: s_or_b32 s41, s4, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 24 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v21, s4, 4 +; SI-NEXT: v_writelane_b32 v21, s5, 5 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v21, s4, 2 +; SI-NEXT: v_writelane_b32 v21, s5, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[40:41], 8 +; SI-NEXT: v_writelane_b32 v21, s4, 0 +; SI-NEXT: v_writelane_b32 v21, s5, 1 ; SI-NEXT: s_and_b32 s4, s20, 0xffff ; SI-NEXT: s_lshl_b32 s5, s21, 16 ; SI-NEXT: s_or_b32 s14, s4, s5 ; SI-NEXT: s_and_b32 s4, s22, 0xffff ; SI-NEXT: s_lshl_b32 s5, s23, 16 ; SI-NEXT: s_or_b32 s15, s4, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v21, s4, 10 +; SI-NEXT: v_writelane_b32 v21, s5, 11 +; SI-NEXT: s_lshr_b64 s[4:5], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v21, s4, 8 +; SI-NEXT: v_writelane_b32 v21, s5, 9 +; SI-NEXT: s_lshr_b64 s[4:5], s[14:15], 8 +; SI-NEXT: v_writelane_b32 v21, s4, 6 +; SI-NEXT: v_writelane_b32 v21, s5, 7 ; SI-NEXT: s_and_b32 s4, s24, 0xffff ; SI-NEXT: s_lshl_b32 s5, s25, 16 -; SI-NEXT: v_mov_b32_e32 v1, s40 -; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_or_b32 s10, s4, s5 ; SI-NEXT: s_and_b32 s4, s26, 0xffff ; SI-NEXT: s_lshl_b32 s5, s27, 16 -; SI-NEXT: v_alignbit_b32 v18, s41, v1, 24 -; SI-NEXT: v_alignbit_b32 v25, s41, v1, 16 -; SI-NEXT: v_alignbit_b32 v30, s41, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s14 -; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_or_b32 s11, s4, s5 +; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v21, s4, 16 +; SI-NEXT: v_writelane_b32 v21, s5, 17 +; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 16 +; SI-NEXT: v_writelane_b32 v21, s4, 14 +; SI-NEXT: v_writelane_b32 v21, s5, 15 +; SI-NEXT: s_lshr_b64 s[4:5], s[10:11], 8 +; SI-NEXT: v_writelane_b32 v21, s4, 12 +; SI-NEXT: v_writelane_b32 v21, s5, 13 ; SI-NEXT: s_and_b32 s4, s28, 0xffff ; SI-NEXT: s_lshl_b32 s5, s29, 16 -; SI-NEXT: v_alignbit_b32 v19, s15, v1, 24 -; SI-NEXT: v_alignbit_b32 v26, s15, v1, 16 -; SI-NEXT: v_alignbit_b32 v31, s15, v1, 8 -; SI-NEXT: v_mov_b32_e32 v1, s12 -; SI-NEXT: s_or_b32 s10, s4, s5 -; SI-NEXT: s_and_b32 s4, s91, 0xffff -; SI-NEXT: s_lshl_b32 s5, s90, 16 -; SI-NEXT: v_alignbit_b32 v17, s13, v1, 24 -; SI-NEXT: v_alignbit_b32 v23, s13, v1, 16 -; SI-NEXT: v_alignbit_b32 v29, s13, v1, 8 -; SI-NEXT: s_or_b32 s11, s4, s5 -; SI-NEXT: v_mov_b32_e32 v1, s10 -; SI-NEXT: v_alignbit_b32 v16, s11, v1, 24 -; SI-NEXT: v_alignbit_b32 v20, s11, v1, 16 -; SI-NEXT: v_alignbit_b32 v27, s11, v1, 8 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: s_and_b32 s4, s93, 0xffff -; SI-NEXT: s_lshl_b32 s5, s92, 16 -; SI-NEXT: v_or_b32_e32 v5, v1, v33 -; SI-NEXT: s_or_b32 s9, s4, s5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: s_and_b32 s4, s95, 0xffff -; SI-NEXT: s_lshl_b32 s5, s94, 16 -; SI-NEXT: v_or_b32_e32 v4, v1, v34 +; SI-NEXT: s_or_b32 s42, s4, s5 +; SI-NEXT: s_and_b32 s4, s85, 0xffff +; SI-NEXT: s_lshl_b32 s5, s83, 16 +; SI-NEXT: s_or_b32 s43, s4, s5 +; SI-NEXT: s_and_b32 s4, s46, 0xffff +; SI-NEXT: s_lshl_b32 s5, s99, 16 +; SI-NEXT: s_or_b32 s12, s4, s5 +; SI-NEXT: s_and_b32 s4, s97, 0xffff +; SI-NEXT: s_lshl_b32 s5, s96, 16 +; SI-NEXT: s_or_b32 s13, s4, s5 +; SI-NEXT: s_and_b32 s4, s45, 0xffff 
+; SI-NEXT: s_lshl_b32 s5, s62, 16 ; SI-NEXT: s_or_b32 s8, s4, s5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: s_and_b32 s4, s31, 0xffff -; SI-NEXT: s_lshl_b32 s5, s30, 16 -; SI-NEXT: v_or_b32_e32 v2, v1, v35 -; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: s_and_b32 s4, s35, 0xffff -; SI-NEXT: s_lshl_b32 s5, s34, 16 -; SI-NEXT: v_or_b32_e32 v1, v1, v36 +; SI-NEXT: s_and_b32 s4, s58, 0xffff +; SI-NEXT: s_lshl_b32 s5, s44, 16 +; SI-NEXT: s_or_b32 s9, s4, s5 +; SI-NEXT: s_and_b32 s4, s61, 0xffff +; SI-NEXT: s_lshl_b32 s5, s57, 16 ; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: v_alignbit_b32 v9, s9, v5, 24 -; SI-NEXT: v_alignbit_b32 v12, s9, v5, 16 -; SI-NEXT: v_alignbit_b32 v21, s9, v5, 8 -; SI-NEXT: v_alignbit_b32 v6, s8, v4, 24 -; SI-NEXT: v_alignbit_b32 v8, s8, v4, 16 -; SI-NEXT: v_alignbit_b32 v13, s8, v4, 8 -; SI-NEXT: v_alignbit_b32 v24, s7, v2, 24 -; SI-NEXT: v_alignbit_b32 v28, s7, v2, 16 -; SI-NEXT: v_alignbit_b32 v32, s7, v2, 8 -; SI-NEXT: v_alignbit_b32 v10, s6, v1, 24 -; SI-NEXT: v_alignbit_b32 v14, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v22, s6, v1, 8 -; SI-NEXT: s_lshr_b32 s78, s41, 8 -; SI-NEXT: s_lshr_b32 s75, s15, 8 -; SI-NEXT: s_lshr_b32 s72, s13, 8 -; SI-NEXT: s_lshr_b32 s61, s11, 8 -; SI-NEXT: s_lshr_b32 s58, s9, 8 -; SI-NEXT: s_lshr_b32 s47, s8, 8 -; SI-NEXT: s_lshr_b32 s45, s7, 8 -; SI-NEXT: s_lshr_b32 s42, s6, 8 -; SI-NEXT: s_and_b32 s88, s19, 0xffff -; SI-NEXT: s_and_b32 s77, s23, 0xffff -; SI-NEXT: s_and_b32 s74, s27, 0xffff -; SI-NEXT: s_and_b32 s63, s90, 0xffff -; SI-NEXT: s_and_b32 s60, s92, 0xffff -; SI-NEXT: s_and_b32 s57, s94, 0xffff -; SI-NEXT: s_and_b32 s46, s30, 0xffff -; SI-NEXT: s_and_b32 s43, s34, 0xffff -; SI-NEXT: s_bfe_u32 s89, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s79, s23, 0x80008 -; SI-NEXT: s_bfe_u32 s76, s27, 0x80008 -; SI-NEXT: s_bfe_u32 s73, s90, 0x80008 -; SI-NEXT: s_bfe_u32 s62, s92, 0x80008 -; SI-NEXT: s_bfe_u32 s59, s94, 0x80008 -; SI-NEXT: s_bfe_u32 s56, s30, 0x80008 -; SI-NEXT: s_bfe_u32 s44, s34, 0x80008 +; SI-NEXT: s_and_b32 s4, s76, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: s_or_b32 s7, s4, s5 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: s_and_b32 s78, s72, 0xffff +; SI-NEXT: s_lshr_b64 s[34:35], s[8:9], 24 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s47, s59, 16 +; SI-NEXT: s_mov_b32 s35, s78 +; SI-NEXT: s_mov_b32 s78, s93 +; SI-NEXT: s_lshr_b64 s[92:93], s[6:7], 24 +; SI-NEXT: s_or_b32 s5, s5, s47 +; SI-NEXT: s_lshr_b32 s79, s7, 8 +; SI-NEXT: s_mov_b32 s93, s78 +; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 16 +; SI-NEXT: s_mov_b32 s78, s31 +; SI-NEXT: s_lshr_b64 s[30:31], s[6:7], 8 +; SI-NEXT: s_lshr_b32 s88, s5, 8 +; SI-NEXT: s_bfe_u32 s89, s72, 0x80008 +; SI-NEXT: s_lshr_b64 s[36:37], s[8:9], 16 +; SI-NEXT: s_mov_b32 s95, s79 +; SI-NEXT: s_mov_b32 s31, s78 +; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 +; SI-NEXT: s_and_b32 s90, s59, 0xffff +; SI-NEXT: s_mov_b32 s37, s89 +; SI-NEXT: s_mov_b32 s79, s88 +; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 +; SI-NEXT: s_bfe_u32 vcc_lo, s59, 0x80008 +; SI-NEXT: s_mov_b32 s89, s90 +; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s60, s41, 8 +; SI-NEXT: s_lshr_b32 s87, s15, 8 +; SI-NEXT: s_lshr_b32 s82, s11, 8 +; SI-NEXT: s_lshr_b32 s71, s43, 8 +; SI-NEXT: s_lshr_b32 s68, s13, 8 +; SI-NEXT: s_lshr_b32 s73, s9, 8 +; SI-NEXT: s_and_b32 s74, s19, 0xffff +; SI-NEXT: s_and_b32 s98, s23, 0xffff +; SI-NEXT: s_and_b32 s84, s27, 0xffff +; SI-NEXT: s_and_b32 
s80, s83, 0xffff +; SI-NEXT: s_and_b32 s69, s96, 0xffff +; SI-NEXT: s_and_b32 s75, s44, 0xffff +; SI-NEXT: s_bfe_u32 s47, s19, 0x80008 +; SI-NEXT: s_bfe_u32 s56, s23, 0x80008 +; SI-NEXT: s_bfe_u32 s86, s27, 0x80008 +; SI-NEXT: s_bfe_u32 s81, s83, 0x80008 +; SI-NEXT: s_bfe_u32 s70, s96, 0x80008 +; SI-NEXT: s_bfe_u32 s77, s44, 0x80008 +; SI-NEXT: s_lshr_b64 s[54:55], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[48:49], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[38:39], s[8:9], 8 +; SI-NEXT: s_mov_b32 s91, vcc_lo ; SI-NEXT: s_cbranch_execnz .LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true -; SI-NEXT: s_add_i32 s35, s35, 3 -; SI-NEXT: s_and_b32 s4, s35, 0xffff -; SI-NEXT: s_lshl_b32 s5, s34, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s31, s31, 3 -; SI-NEXT: s_add_i32 s6, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s31, 0xffff -; SI-NEXT: s_lshl_b32 s5, s30, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s95, s95, 3 -; SI-NEXT: s_add_i32 s7, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s95, 0xffff -; SI-NEXT: s_lshl_b32 s5, s94, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s93, s93, 3 -; SI-NEXT: s_add_i32 s8, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s93, 0xffff -; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: s_add_i32 s17, s17, 3 +; SI-NEXT: s_and_b32 s4, s17, 0xffff +; SI-NEXT: s_lshl_b32 s5, s63, 16 +; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s59, 16 +; SI-NEXT: s_add_i32 s61, s61, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s61, 0xffff +; SI-NEXT: s_lshl_b32 s7, s57, 16 +; SI-NEXT: s_add_i32 s76, s76, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s76, 0xffff +; SI-NEXT: s_lshl_b32 s8, s72, 16 +; SI-NEXT: s_add_i32 s45, s45, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s8, s45, 0xffff +; SI-NEXT: s_lshl_b32 s9, s62, 16 +; SI-NEXT: s_add_i32 s58, s58, 3 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: s_and_b32 s9, s58, 0xffff +; SI-NEXT: s_lshl_b32 s10, s44, 16 +; SI-NEXT: s_add_i32 s46, s46, 3 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: s_and_b32 s10, s46, 0xffff +; SI-NEXT: s_lshl_b32 s11, s99, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s97, s97, 3 +; SI-NEXT: s_add_i32 s12, s10, 0x30000 +; SI-NEXT: s_and_b32 s10, s97, 0xffff +; SI-NEXT: s_lshl_b32 s11, s96, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s9, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: s_lshl_b32 s5, s29, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s91, s91, 3 -; SI-NEXT: s_add_i32 s10, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s91, 0xffff -; SI-NEXT: s_lshl_b32 s5, s90, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s13, s10, 0x30000 +; SI-NEXT: s_and_b32 s10, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s29, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_add_i32 s85, s85, 3 +; SI-NEXT: s_add_i32 s42, s10, 0x30000 +; SI-NEXT: s_and_b32 s10, s85, 0xffff +; SI-NEXT: s_lshl_b32 s11, s83, 16 +; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s11, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: s_lshl_b32 s5, s25, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_add_i32 s43, s10, 0x30000 +; SI-NEXT: s_and_b32 s10, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s25, 16 ; SI-NEXT: 
s_add_i32 s26, s26, 3 -; SI-NEXT: s_add_i32 s12, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: s_lshl_b32 s5, s27, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: s_and_b32 s11, s26, 0xffff +; SI-NEXT: s_lshl_b32 s14, s27, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s13, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: s_lshl_b32 s5, s21, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_or_b32 s11, s14, s11 +; SI-NEXT: s_and_b32 s14, s20, 0xffff +; SI-NEXT: s_lshl_b32 s15, s21, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: s_add_i32 s14, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: s_lshl_b32 s5, s23, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: s_and_b32 s15, s22, 0xffff +; SI-NEXT: s_lshl_b32 s17, s23, 16 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s15, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s40, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v15 -; SI-NEXT: v_add_i32_e32 v2, vcc, 3, v11 -; SI-NEXT: v_add_i32_e32 v4, vcc, 3, v7 -; SI-NEXT: v_add_i32_e32 v3, vcc, 3, v3 -; SI-NEXT: s_add_i32 s41, s4, 0x30000 -; SI-NEXT: v_mov_b32_e32 v6, s40 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_alignbit_b32 v18, s41, v6, 24 -; SI-NEXT: v_alignbit_b32 v25, s41, v6, 16 -; SI-NEXT: v_alignbit_b32 v30, s41, v6, 8 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_or_b32_e32 v1, v36, v1 -; SI-NEXT: v_or_b32_e32 v2, v35, v2 -; SI-NEXT: v_or_b32_e32 v4, v34, v4 -; SI-NEXT: v_or_b32_e32 v3, v33, v3 -; SI-NEXT: v_alignbit_b32 v19, s15, v6, 24 -; SI-NEXT: v_alignbit_b32 v26, s15, v6, 16 -; SI-NEXT: v_alignbit_b32 v31, s15, v6, 8 -; SI-NEXT: v_mov_b32_e32 v6, s12 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x30000, v1 -; SI-NEXT: v_mov_b32_e32 v15, s6 -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x30000, v2 -; SI-NEXT: v_mov_b32_e32 v10, s7 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x30000, v4 -; SI-NEXT: v_mov_b32_e32 v7, s8 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x30000, v3 -; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: v_alignbit_b32 v17, s13, v6, 24 -; SI-NEXT: v_alignbit_b32 v23, s13, v6, 16 -; SI-NEXT: v_alignbit_b32 v29, s13, v6, 8 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: v_alignbit_b32 v16, s11, v6, 24 -; SI-NEXT: v_alignbit_b32 v20, s11, v6, 16 -; SI-NEXT: v_alignbit_b32 v27, s11, v6, 8 -; SI-NEXT: v_alignbit_b32 v9, v3, v5, 24 -; SI-NEXT: v_alignbit_b32 v12, v3, v5, 16 -; SI-NEXT: v_alignbit_b32 v21, v3, v5, 8 -; SI-NEXT: v_alignbit_b32 v6, v7, v4, 24 -; SI-NEXT: v_alignbit_b32 v8, v7, v4, 16 -; SI-NEXT: v_alignbit_b32 v13, v7, v4, 8 -; SI-NEXT: v_alignbit_b32 v24, v10, v2, 24 -; SI-NEXT: v_alignbit_b32 v28, v10, v2, 16 -; SI-NEXT: v_alignbit_b32 v32, v10, v2, 8 -; SI-NEXT: v_alignbit_b32 v10, v15, v1, 24 -; SI-NEXT: v_alignbit_b32 v14, v15, v1, 16 -; SI-NEXT: v_alignbit_b32 v22, v15, v1, 8 -; SI-NEXT: s_lshr_b32 s89, s41, 24 -; SI-NEXT: s_lshr_b32 s88, s41, 16 -; SI-NEXT: s_lshr_b32 s78, s41, 8 -; SI-NEXT: s_lshr_b32 s79, s15, 24 -; SI-NEXT: s_lshr_b32 s77, s15, 16 -; SI-NEXT: s_lshr_b32 s75, s15, 8 -; SI-NEXT: s_lshr_b32 s76, s13, 24 -; SI-NEXT: s_lshr_b32 s74, s13, 16 -; SI-NEXT: s_lshr_b32 s72, s13, 8 -; SI-NEXT: 
s_lshr_b32 s73, s11, 24 -; SI-NEXT: s_lshr_b32 s63, s11, 16 -; SI-NEXT: s_lshr_b32 s61, s11, 8 -; SI-NEXT: s_lshr_b32 s62, s9, 24 -; SI-NEXT: s_lshr_b32 s60, s9, 16 -; SI-NEXT: s_lshr_b32 s58, s9, 8 -; SI-NEXT: s_lshr_b32 s59, s8, 24 -; SI-NEXT: s_lshr_b32 s57, s8, 16 -; SI-NEXT: s_lshr_b32 s47, s8, 8 -; SI-NEXT: s_lshr_b32 s56, s7, 24 -; SI-NEXT: s_lshr_b32 s46, s7, 16 -; SI-NEXT: s_lshr_b32 s45, s7, 8 -; SI-NEXT: s_lshr_b32 s44, s6, 24 -; SI-NEXT: s_lshr_b32 s43, s6, 16 -; SI-NEXT: s_lshr_b32 s42, s6, 8 +; SI-NEXT: s_or_b32 s15, s17, s15 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s31, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s40, s16, 0x30000 +; SI-NEXT: s_add_i32 s16, s93, 3 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_lshl_b32 s17, s19, 16 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: s_add_i32 s41, s16, 0x30000 +; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 24 +; SI-NEXT: v_writelane_b32 v21, s16, 4 +; SI-NEXT: v_writelane_b32 v21, s17, 5 +; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 16 +; SI-NEXT: v_writelane_b32 v21, s16, 2 +; SI-NEXT: v_writelane_b32 v21, s17, 3 +; SI-NEXT: s_lshr_b64 s[16:17], s[40:41], 8 +; SI-NEXT: s_add_i32 s14, s14, 0x30000 +; SI-NEXT: s_add_i32 s15, s15, 0x30000 +; SI-NEXT: v_writelane_b32 v21, s16, 0 +; SI-NEXT: v_writelane_b32 v21, s17, 1 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 24 +; SI-NEXT: v_writelane_b32 v21, s16, 10 +; SI-NEXT: v_writelane_b32 v21, s17, 11 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 16 +; SI-NEXT: v_writelane_b32 v21, s16, 8 +; SI-NEXT: v_writelane_b32 v21, s17, 9 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 +; SI-NEXT: s_add_i32 s10, s10, 0x30000 +; SI-NEXT: s_add_i32 s11, s11, 0x30000 +; SI-NEXT: v_writelane_b32 v21, s16, 6 +; SI-NEXT: v_writelane_b32 v21, s17, 7 +; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 24 +; SI-NEXT: v_writelane_b32 v21, s16, 16 +; SI-NEXT: v_writelane_b32 v21, s17, 17 +; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 16 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_add_i32 s6, s6, 0x30000 +; SI-NEXT: s_add_i32 s7, s7, 0x30000 +; SI-NEXT: s_add_i32 s8, s8, 0x30000 +; SI-NEXT: s_add_i32 s9, s9, 0x30000 +; SI-NEXT: v_writelane_b32 v21, s16, 14 +; SI-NEXT: v_writelane_b32 v21, s17, 15 +; SI-NEXT: s_lshr_b64 s[16:17], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[34:35], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[36:37], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[88:89], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[4:5], 8 +; SI-NEXT: v_writelane_b32 v21, s16, 12 +; SI-NEXT: s_lshr_b64 s[54:55], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[64:65], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[66:67], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[48:49], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[50:51], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[52:53], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[38:39], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[92:93], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[30:31], s[6:7], 8 +; SI-NEXT: s_lshr_b32 s47, s41, 24 +; SI-NEXT: s_lshr_b32 s74, s41, 16 +; SI-NEXT: s_lshr_b32 s60, s41, 8 +; SI-NEXT: s_lshr_b32 s56, s15, 24 +; SI-NEXT: s_lshr_b32 s98, s15, 16 +; SI-NEXT: s_lshr_b32 s87, s15, 8 +; SI-NEXT: s_lshr_b32 s86, s11, 24 +; SI-NEXT: s_lshr_b32 s84, s11, 16 +; SI-NEXT: s_lshr_b32 s82, s11, 8 +; SI-NEXT: s_lshr_b32 s81, s43, 24 +; SI-NEXT: s_lshr_b32 s80, s43, 16 +; SI-NEXT: s_lshr_b32 s71, s43, 8 +; SI-NEXT: s_lshr_b32 s70, s13, 24 +; SI-NEXT: s_lshr_b32 s69, s13, 16 +; SI-NEXT: 
s_lshr_b32 s68, s13, 8 +; SI-NEXT: s_lshr_b32 s77, s9, 24 +; SI-NEXT: s_lshr_b32 s75, s9, 16 +; SI-NEXT: s_lshr_b32 s73, s9, 8 +; SI-NEXT: s_lshr_b32 s37, s7, 24 +; SI-NEXT: s_lshr_b32 s35, s7, 16 +; SI-NEXT: s_lshr_b32 s95, s7, 8 +; SI-NEXT: s_lshr_b32 s91, s5, 24 +; SI-NEXT: s_lshr_b32 s89, s5, 16 +; SI-NEXT: s_lshr_b32 s79, s5, 8 +; SI-NEXT: v_writelane_b32 v21, s17, 13 ; SI-NEXT: .LBB97_3: ; %end -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v30 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s41, 0xff -; SI-NEXT: s_lshl_b32 s5, s78, 8 -; SI-NEXT: v_and_b32_e32 v7, 0xff, v25 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s88, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v18 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s16, s89, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s16, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s4, s14, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v31 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s15, 0xff -; SI-NEXT: s_lshl_b32 s5, s75, 8 +; SI-NEXT: v_readlane_b32 s18, v21, 0 +; SI-NEXT: v_readlane_b32 s19, v21, 1 +; SI-NEXT: s_lshl_b32 s17, s18, 8 +; SI-NEXT: v_readlane_b32 s18, v21, 2 +; SI-NEXT: s_and_b32 s16, s40, 0xff +; SI-NEXT: v_readlane_b32 s19, v21, 3 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s18, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 4 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s18, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xff +; SI-NEXT: s_lshl_b32 s17, s60, 8 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: s_and_b32 s17, s74, 0xff +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: s_lshl_b32 s18, s47, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: v_readlane_b32 s16, v21, 6 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: v_readlane_b32 s17, v21, 7 +; SI-NEXT: s_lshl_b32 s16, s16, 8 +; SI-NEXT: v_readlane_b32 s19, v21, 5 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: v_readlane_b32 s16, v21, 8 +; SI-NEXT: v_readlane_b32 s17, v21, 9 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: v_readlane_b32 s18, v21, 10 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s18, 24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v26 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s77, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v19 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s14, s79, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s14, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 
0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s4, s12, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v29 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s5, s72, 8 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xff +; SI-NEXT: s_lshl_b32 s15, s87, 8 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: s_and_b32 s15, s98, 0xff +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: s_lshl_b32 s16, s56, 24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_readlane_b32 s14, v21, 12 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: v_readlane_b32 s15, v21, 13 +; SI-NEXT: s_lshl_b32 s14, s14, 8 +; SI-NEXT: s_or_b32 s10, s10, s14 +; SI-NEXT: v_readlane_b32 s14, v21, 14 +; SI-NEXT: v_readlane_b32 s15, v21, 15 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: v_readlane_b32 s16, v21, 16 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s16, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s10, s10, s14 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v23 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s74, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v17 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s12, s76, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s12, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xff +; SI-NEXT: s_lshl_b32 s11, s82, 8 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s84, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s14, s86, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s14, s11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: s_and_b32 s4, s10, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v27 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s61, 8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s42, 0xff +; SI-NEXT: s_lshl_b32 s11, s66, 8 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s64, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s14, s54, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s14, s11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, 
v20 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s63, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v16 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s10, s73, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v7, v11, v7 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s10, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s43, 0xff +; SI-NEXT: s_lshl_b32 s11, s71, 8 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s80, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s14, s81, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s14, s11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: buffer_store_dword v7, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v21 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s58, 8 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v12 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s60, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s12, 0xff +; SI-NEXT: s_lshl_b32 s11, s52, 8 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s50, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s12, s48, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v9 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s9, s62, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s9, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s13, 0xff +; SI-NEXT: s_lshl_b32 s11, s68, 8 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: s_and_b32 s11, s69, 0xff +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: s_lshl_b32 s12, s70, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s10, s38, 8 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_and_b32 s10, s36, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s11, s34, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v3, 0xff, v4 -; SI-NEXT: 
v_lshlrev_b32_e32 v4, 8, v13 -; SI-NEXT: s_and_b32 s4, s8, 0xff -; SI-NEXT: s_lshl_b32 s5, s47, 8 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v8 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s57, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xff +; SI-NEXT: s_lshl_b32 s9, s73, 8 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_and_b32 s9, s75, 0xff +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_lshl_b32 s10, s77, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v6 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s8, s59, 24 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s8, s5 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s8, s30, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s94, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s92, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v32 -; SI-NEXT: s_and_b32 s4, s7, 0xff -; SI-NEXT: s_lshl_b32 s5, s45, 8 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v28 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s46, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: s_lshl_b32 s7, s95, 8 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_and_b32 s7, s35, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s8, s37, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v24 -; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s7, s56, 24 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s6, s90, 8 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s88, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s78, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s7, s5 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen 
-; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v22 -; SI-NEXT: s_and_b32 s4, s6, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v14 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xff +; SI-NEXT: s_lshl_b32 s5, s79, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s43, 0xff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v10 +; SI-NEXT: s_and_b32 s5, s89, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s44, 24 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: s_lshl_b32 s6, s91, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_readlane_b32 s19, v21, 11 +; SI-NEXT: v_readlane_b32 s17, v21, 17 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: v_readlane_b32 s35, v37, 3 -; SI-NEXT: v_readlane_b32 s34, v37, 2 -; SI-NEXT: v_readlane_b32 s31, v37, 1 -; SI-NEXT: v_readlane_b32 s30, v37, 0 +; SI-NEXT: v_readlane_b32 s99, v20, 35 +; SI-NEXT: v_readlane_b32 s98, v20, 34 +; SI-NEXT: v_readlane_b32 s97, v20, 33 +; SI-NEXT: v_readlane_b32 s96, v20, 32 +; SI-NEXT: v_readlane_b32 s87, v20, 31 +; SI-NEXT: v_readlane_b32 s86, v20, 30 +; SI-NEXT: v_readlane_b32 s85, v20, 29 +; SI-NEXT: v_readlane_b32 s84, v20, 28 +; SI-NEXT: v_readlane_b32 s83, v20, 27 +; SI-NEXT: v_readlane_b32 s82, v20, 26 +; SI-NEXT: v_readlane_b32 s81, v20, 25 +; SI-NEXT: v_readlane_b32 s80, v20, 24 +; SI-NEXT: v_readlane_b32 s71, v20, 23 +; SI-NEXT: v_readlane_b32 s70, v20, 22 +; SI-NEXT: v_readlane_b32 s69, v20, 21 +; SI-NEXT: v_readlane_b32 s68, v20, 20 +; SI-NEXT: v_readlane_b32 s67, v20, 19 +; SI-NEXT: v_readlane_b32 s66, v20, 18 +; SI-NEXT: v_readlane_b32 s65, v20, 17 +; SI-NEXT: v_readlane_b32 s64, v20, 16 +; SI-NEXT: v_readlane_b32 s55, v20, 15 +; SI-NEXT: v_readlane_b32 s54, v20, 14 +; SI-NEXT: v_readlane_b32 s53, v20, 13 +; SI-NEXT: v_readlane_b32 s52, v20, 12 +; SI-NEXT: v_readlane_b32 s51, v20, 11 +; SI-NEXT: v_readlane_b32 s50, v20, 10 +; SI-NEXT: v_readlane_b32 s49, v20, 9 +; SI-NEXT: v_readlane_b32 s48, v20, 8 +; SI-NEXT: v_readlane_b32 s39, v20, 7 +; SI-NEXT: v_readlane_b32 s38, v20, 6 +; SI-NEXT: v_readlane_b32 s37, v20, 5 +; SI-NEXT: v_readlane_b32 s36, v20, 4 +; SI-NEXT: v_readlane_b32 s35, v20, 3 +; SI-NEXT: v_readlane_b32 s34, v20, 2 +; SI-NEXT: v_readlane_b32 s31, v20, 1 +; SI-NEXT: v_readlane_b32 s30, v20, 0 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v21, s4, 0 +; SI-NEXT: v_writelane_b32 v21, s5, 1 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; 
implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $sgpr78 -; SI-NEXT: ; implicit-def: $sgpr88 -; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr47 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr87 +; SI-NEXT: ; implicit-def: $sgpr98 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr82 +; SI-NEXT: ; implicit-def: $sgpr84 +; SI-NEXT: ; implicit-def: $sgpr86 +; SI-NEXT: ; implicit-def: $sgpr71 +; SI-NEXT: ; implicit-def: $sgpr80 +; SI-NEXT: ; implicit-def: $sgpr81 +; SI-NEXT: ; implicit-def: $sgpr68 +; SI-NEXT: ; implicit-def: $sgpr69 +; SI-NEXT: ; implicit-def: $sgpr70 +; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr95 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr91 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr66 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr54 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $sgpr10 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr6 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; 
implicit-def: $sgpr78 +; SI-NEXT: v_writelane_b32 v21, s4, 2 +; SI-NEXT: v_writelane_b32 v21, s5, 3 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 4 +; SI-NEXT: v_writelane_b32 v21, s5, 5 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 6 +; SI-NEXT: v_writelane_b32 v21, s5, 7 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 8 +; SI-NEXT: v_writelane_b32 v21, s5, 9 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 10 +; SI-NEXT: v_writelane_b32 v21, s5, 11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 12 +; SI-NEXT: v_writelane_b32 v21, s5, 13 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 14 +; SI-NEXT: v_writelane_b32 v21, s5, 15 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: v_writelane_b32 v21, s4, 16 +; SI-NEXT: v_writelane_b32 v21, s5, 17 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v32i16_to_v64i8_scalar: @@ -69207,433 +69064,541 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v46, v30 ; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:20 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:68 -; SI-NEXT: v_readfirstlane_b32 s15, v27 -; SI-NEXT: v_readfirstlane_b32 s40, v26 -; SI-NEXT: v_readfirstlane_b32 s12, v19 -; SI-NEXT: v_readfirstlane_b32 s13, v18 -; SI-NEXT: v_readfirstlane_b32 s10, v11 -; SI-NEXT: v_readfirstlane_b32 s11, v10 -; SI-NEXT: v_readfirstlane_b32 s8, v3 -; SI-NEXT: v_readfirstlane_b32 s9, v2 -; SI-NEXT: v_readfirstlane_b32 s7, v1 -; SI-NEXT: v_readfirstlane_b32 s6, v0 -; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v9 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v5 -; SI-NEXT: v_lshlrev_b32_e32 
v49, 8, v15 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v17 -; SI-NEXT: v_lshlrev_b32_e32 v51, 24, v13 -; SI-NEXT: v_lshlrev_b32_e32 v53, 8, v23 -; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v25 -; SI-NEXT: v_lshlrev_b32_e32 v54, 24, v21 -; SI-NEXT: v_lshlrev_b32_e32 v44, 24, v29 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:56 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:72 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 +; SI-NEXT: v_readfirstlane_b32 s43, v1 +; SI-NEXT: v_readfirstlane_b32 s42, v0 +; SI-NEXT: v_lshlrev_b32_e32 v42, 8, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v5 +; SI-NEXT: v_lshlrev_b32_e32 v40, 8, v11 +; SI-NEXT: v_lshlrev_b32_e32 v53, 24, v13 +; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v19 +; SI-NEXT: v_lshlrev_b32_e32 v52, 24, v21 +; SI-NEXT: v_lshlrev_b32_e32 v59, 8, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v29 +; SI-NEXT: v_lshlrev_b32_e32 v47, 8, v7 +; SI-NEXT: v_lshlrev_b32_e32 v45, 24, v9 +; SI-NEXT: v_lshlrev_b32_e32 v57, 8, v15 +; SI-NEXT: v_lshlrev_b32_e32 v56, 24, v17 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v23 +; SI-NEXT: v_lshlrev_b32_e32 v41, 24, v25 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_readfirstlane_b32 s44, v31 -; SI-NEXT: v_readfirstlane_b32 s45, v32 -; SI-NEXT: v_readfirstlane_b32 s42, v33 -; SI-NEXT: v_readfirstlane_b32 s43, v34 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v35 -; SI-NEXT: v_lshlrev_b32_e32 v43, 8, v36 -; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v42, 24, v37 +; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v38 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v56, 8, v38 +; SI-NEXT: v_lshlrev_b32_e32 v13, 24, v36 +; SI-NEXT: v_lshlrev_b32_e32 v19, 8, v48 +; SI-NEXT: s_waitcnt vmcnt(12) +; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v39 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_lshlrev_b32_e32 v61, 8, v37 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v58, 24, v39 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_lshlrev_b32_e32 v47, 24, v48 +; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v30 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v60, 8, v59 +; SI-NEXT: v_lshlrev_b32_e32 v30, 24, v31 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v61, 24, v61 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v59, 24, v62 +; SI-NEXT: v_lshlrev_b32_e32 v38, 8, v33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshlrev_b32_e32 v29, 24, v34 +; SI-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:188 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB99_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v2 +; SI-NEXT: v_or_b32_e32 v0, v0, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v0, v18, v0 +; SI-NEXT: v_or_b32_e32 v37, v1, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v12 +; SI-NEXT: v_or_b32_e32 v1, v1, v40 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(10) expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v44 +; SI-NEXT: v_or_b32_e32 v44, v53, v9 +; SI-NEXT: v_or_b32_e32 v33, v1, v44 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v9, 0xff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mov_b32_e32 v15, v46 +; SI-NEXT: v_or_b32_e32 v46, v52, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_or_b32_e32 v55, v3, v9 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v27, v13 +; SI-NEXT: v_mov_b32_e32 v58, v8 +; SI-NEXT: v_mov_b32_e32 v49, v45 +; SI-NEXT: v_mov_b32_e32 v36, v24 +; SI-NEXT: v_mov_b32_e32 v34, v26 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s14, s23, 24 +; SI-NEXT: s_lshl_b32 s6, s19, 24 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s14, s5 -; SI-NEXT: s_or_b32 s41, s4, s5 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s4, s4, 16 -; SI-NEXT: s_lshl_b32 s14, s19, 24 -; SI-NEXT: s_or_b32 s4, s14, s4 -; SI-NEXT: s_and_b32 s14, s28, 0xff -; SI-NEXT: s_lshl_b32 s46, s29, 8 -; SI-NEXT: s_or_b32 s14, s14, s46 -; SI-NEXT: s_and_b32 s46, s6, 0xff -; SI-NEXT: s_lshl_b32 s46, s46, 16 -; SI-NEXT: s_lshl_b32 s47, s7, 24 -; SI-NEXT: s_or_b32 s57, s47, s46 -; SI-NEXT: s_and_b32 s46, s26, 0xff -; SI-NEXT: s_lshl_b32 s46, s46, 16 -; SI-NEXT: s_lshl_b32 s47, s27, 24 -; SI-NEXT: s_or_b32 s46, s47, s46 -; SI-NEXT: s_and_b32 s47, s16, 0xff -; SI-NEXT: s_lshl_b32 s56, s17, 8 -; SI-NEXT: s_or_b32 s47, s47, s56 -; SI-NEXT: s_and_b32 s47, s47, 0xffff -; SI-NEXT: v_mov_b32_e32 v1, s4 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v6 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v8 -; SI-NEXT: s_or_b32 s47, s47, s4 +; SI-NEXT: s_or_b32 s12, s6, s5 +; SI-NEXT: s_or_b32 s6, s4, s12 ; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s56, s25, 8 -; SI-NEXT: v_or_b32_e32 v9, v9, v0 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_or_b32 s4, s4, s56 -; SI-NEXT: v_and_b32_e32 v9, 
0xffff, v9 -; SI-NEXT: v_or_b32_e32 v11, v2, v10 +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s26, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_or_b32 s14, s7, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s7, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s7 +; SI-NEXT: s_and_b32 s7, s5, 0xffff +; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_or_b32 s5, s8, s5 +; SI-NEXT: s_or_b32 s13, s7, s5 +; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 16 +; SI-NEXT: s_and_b32 s7, s28, 0xff +; SI-NEXT: s_lshl_b32 s9, s29, 8 +; SI-NEXT: s_or_b32 s7, s7, s9 +; SI-NEXT: s_and_b32 s9, s7, 0xffff +; SI-NEXT: s_and_b32 s7, s42, 0xff +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_lshl_b32 s10, s43, 24 +; SI-NEXT: s_or_b32 s7, s10, s7 +; SI-NEXT: s_or_b32 s15, s9, s7 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v5, s46 -; SI-NEXT: v_or_b32_e32 v10, v9, v11 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v4 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v14 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v16 -; SI-NEXT: s_or_b32 s46, s4, s46 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s56, s8, 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 16 +; SI-NEXT: s_or_b32 s4, s4, s14 +; SI-NEXT: v_mov_b32_e32 v39, v32 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_lshr_b32 s11, s7, 16 +; SI-NEXT: s_mov_b32 s7, s13 +; SI-NEXT: s_mov_b32 s5, s15 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v48, v1, v46 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v59 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v35, v1, v55 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v43, v13, v9 +; SI-NEXT: v_or_b32_e32 v50, v1, v43 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v54 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v19 ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v13, v13, v49 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: s_or_b32 s4, s4, s56 -; SI-NEXT: v_or_b32_e32 v15, v3, v9 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_or_b32_e32 v19, v7, v17 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v36, v13, v19 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v12 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v22 -; SI-NEXT: v_and_b32_e32 v18, 0xff, v24 -; SI-NEXT: v_and_b32_e32 v32, 0xff, v55 -; SI-NEXT: v_or_b32_e32 v35, s4, v15 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s56, s10, 8 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v17, v17, v53 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: s_or_b32 s4, s4, s56 -; SI-NEXT: v_or_b32_e32 v23, v51, v13 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_or_b32_e32 v27, v52, v18 -; SI-NEXT: v_or_b32_e32 v62, v47, v32 -; SI-NEXT: v_and_b32_e32 v32, 0xff, v41 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v18, v17, v27 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v20 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v30 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_or_b32_e32 v37, s4, v23 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s56, s12, 8 -; 
SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v21, v21, v43 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; SI-NEXT: v_or_b32_e32 v33, v58, v32 -; SI-NEXT: v_and_b32_e32 v32, 0xff, v45 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v34, 0xff, v46 -; SI-NEXT: s_or_b32 s4, s4, s56 -; SI-NEXT: v_or_b32_e32 v25, v54, v17 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v31, v42, v26 -; SI-NEXT: v_or_b32_e32 v32, v32, v60 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v38, v21, v31 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v28 -; SI-NEXT: v_and_b32_e32 v32, 0xffff, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_mov_b32_e32 v13, v4 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v59 +; SI-NEXT: v_mov_b32_e32 v59, v3 +; SI-NEXT: v_mov_b32_e32 v3, v63 +; SI-NEXT: v_mov_b32_e32 v63, v40 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_or_b32_e32 v42, v11, v9 +; SI-NEXT: v_or_b32_e32 v54, v1, v42 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v9, 0xff, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v47 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v11, v45, v9 +; SI-NEXT: v_or_b32_e32 v1, v1, v11 +; SI-NEXT: v_mov_b32_e32 v19, v10 +; SI-NEXT: v_lshr_b64 v[9:10], v[0:1], 16 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xff, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v63, v59, v34 -; SI-NEXT: v_or_b32_e32 v39, s4, v25 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_lshl_b32 s56, s15, 8 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v48, v32, v63 -; SI-NEXT: v_and_b32_e32 v32, 0xff, v57 -; SI-NEXT: s_or_b32 s4, s4, s56 -; SI-NEXT: v_or_b32_e32 v29, v44, v21 -; SI-NEXT: v_and_b32_e32 v26, 0xff, v40 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v26, v26, v56 -; SI-NEXT: v_or_b32_e32 v34, v61, v32 -; SI-NEXT: v_or_b32_e32 v32, s4, v29 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s56, s42, 8 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: s_or_b32 s4, s4, s56 -; SI-NEXT: v_or_b32_e32 v26, v26, v62 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_alignbit_b32 v17, v18, v25, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v33, 16 -; SI-NEXT: v_or_b32_e32 v33, s4, v33 -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_lshl_b32 s56, s44, 8 -; SI-NEXT: s_and_b32 s14, s14, 0xffff -; SI-NEXT: s_or_b32 s4, s4, s56 -; SI-NEXT: s_or_b32 s14, s14, s57 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_alignbit_b32 v1, s41, v1, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v15, 16 -; SI-NEXT: v_alignbit_b32 v13, v36, v23, 16 -; SI-NEXT: v_alignbit_b32 v21, v38, v29, 16 -; SI-NEXT: v_alignbit_b32 v29, v48, v34, 16 -; SI-NEXT: v_or_b32_e32 v34, s4, v34 -; SI-NEXT: s_lshr_b32 s56, s5, 16 -; SI-NEXT: s_lshr_b32 s57, s57, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v16 +; SI-NEXT: v_or_b32_e32 v0, v0, v57 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v23, v56, v10 +; SI-NEXT: v_mov_b32_e32 v8, v6 +; SI-NEXT: v_mov_b32_e32 v6, v14 +; SI-NEXT: v_or_b32_e32 v45, v0, v23 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v24 +; SI-NEXT: v_lshr_b64 
v[9:10], v[44:45], 16 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v14, v41, v14 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v24, v17 +; SI-NEXT: v_mov_b32_e32 v17, v47 +; SI-NEXT: v_or_b32_e32 v47, v0, v14 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[9:10], v[46:47], 16 +; SI-NEXT: v_mov_b32_e32 v46, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v46 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v62 +; SI-NEXT: v_or_b32_e32 v0, v0, v61 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v31, v7, v15 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v9, v61 +; SI-NEXT: v_mov_b32_e32 v61, v7 +; SI-NEXT: v_mov_b32_e32 v7, v5 +; SI-NEXT: v_mov_b32_e32 v5, v52 +; SI-NEXT: v_mov_b32_e32 v52, v41 +; SI-NEXT: v_mov_b32_e32 v41, v62 +; SI-NEXT: v_mov_b32_e32 v62, v57 +; SI-NEXT: v_mov_b32_e32 v57, v53 +; SI-NEXT: v_mov_b32_e32 v53, v56 +; SI-NEXT: v_or_b32_e32 v56, v0, v31 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v51, v22 +; SI-NEXT: v_lshr_b64 v[21:22], v[55:56], 16 ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v27 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xff, v10 +; SI-NEXT: v_or_b32_e32 v0, v0, v25 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v22, v30, v15 +; SI-NEXT: v_or_b32_e32 v44, v0, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v32 +; SI-NEXT: v_and_b32_e32 v15, 0xff, v60 +; SI-NEXT: v_or_b32_e32 v0, v0, v38 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v38, v29, v15 +; SI-NEXT: v_lshr_b64 v[25:26], v[43:44], 16 +; SI-NEXT: v_or_b32_e32 v43, v0, v38 +; SI-NEXT: v_mov_b32_e32 v0, v30 +; SI-NEXT: v_lshr_b64 v[29:30], v[42:43], 16 +; SI-NEXT: v_mov_b32_e32 v42, v40 +; SI-NEXT: v_mov_b32_e32 v40, v63 +; SI-NEXT: v_mov_b32_e32 v63, v3 +; SI-NEXT: v_mov_b32_e32 v3, v59 +; SI-NEXT: v_mov_b32_e32 v59, v2 +; SI-NEXT: v_mov_b32_e32 v10, v19 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v4, v13 +; SI-NEXT: v_mov_b32_e32 v13, v27 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v22 +; SI-NEXT: v_mov_b32_e32 v14, v6 +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v8, v58 +; SI-NEXT: v_mov_b32_e32 v22, v51 +; SI-NEXT: v_mov_b32_e32 v51, v44 +; SI-NEXT: v_mov_b32_e32 v44, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v55, v43 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v0 +; SI-NEXT: v_mov_b32_e32 v26, v34 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 
v23, 16, v31 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v62 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v63 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v38 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_mov_b32_e32 v34, v45 +; SI-NEXT: v_mov_b32_e32 v45, v49 +; SI-NEXT: v_mov_b32_e32 v49, v47 +; SI-NEXT: v_mov_b32_e32 v47, v17 +; SI-NEXT: v_mov_b32_e32 v17, v24 +; SI-NEXT: v_mov_b32_e32 v24, v36 +; SI-NEXT: v_mov_b32_e32 v36, v56 +; SI-NEXT: v_mov_b32_e32 v56, v53 +; SI-NEXT: v_mov_b32_e32 v53, v57 +; SI-NEXT: v_mov_b32_e32 v57, v62 +; SI-NEXT: v_mov_b32_e32 v62, v41 +; SI-NEXT: v_mov_b32_e32 v41, v52 +; SI-NEXT: v_mov_b32_e32 v52, v5 +; SI-NEXT: v_mov_b32_e32 v5, v7 +; SI-NEXT: v_mov_b32_e32 v7, v61 +; SI-NEXT: v_mov_b32_e32 v61, v9 ; SI-NEXT: s_cbranch_execnz .LBB99_3 ; SI-NEXT: .LBB99_2: ; %cmp.true -; SI-NEXT: s_add_i32 s45, s45, 3 -; SI-NEXT: s_and_b32 s4, s45, 0xff -; SI-NEXT: s_lshl_b32 s5, s44, 8 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v57 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_add_i32 s26, s26, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: s_and_b32 s6, s26, 0xff ; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v61, v1 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: v_add_i32_e32 v34, vcc, 0x3000000, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v45 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_add_i32 s28, s28, 3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s28, 0xff +; SI-NEXT: s_lshl_b32 s6, s29, 8 +; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s7, s42, 0xff +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s43, 24 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s16, 0xff +; SI-NEXT: s_lshl_b32 s7, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s8, s18, 0xff +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s19, 24 +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_and_b32 s7, s20, 0xff +; SI-NEXT: s_lshl_b32 s8, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_and_b32 s9, s22, 0xff +; SI-NEXT: s_addk_i32 s7, 0x300 +; SI-NEXT: s_lshl_b32 s8, s23, 24 +; SI-NEXT: s_lshl_b32 s9, s9, 16 +; SI-NEXT: s_and_b32 s7, s7, 0xffff +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_add_i32 s6, s6, 0x3000000 +; SI-NEXT: s_add_i32 s7, s7, 0x3000000 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s9, s7, 16 +; SI-NEXT: s_lshr_b32 s11, s5, 16 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_add_i32_e32 v0, 
vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v54, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v46 -; SI-NEXT: v_or_b32_e32 v1, v60, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v5, v59, v5 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: v_add_i32_e32 v48, vcc, 0x3000000, v1 -; SI-NEXT: s_and_b32 s4, s43, 0xff -; SI-NEXT: s_lshl_b32 s5, s42, 8 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v41 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v1, v9, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v55, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v58, v1 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: v_add_i32_e32 v33, vcc, 0x3000000, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v40 +; SI-NEXT: v_or_b32_e32 v1, v13, v1 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v55 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v17, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v50, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v55 -; SI-NEXT: v_or_b32_e32 v1, v56, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v5, v47, v5 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: s_add_i32 s40, s40, 3 -; SI-NEXT: v_add_i32_e32 v26, vcc, 0x3000000, v1 -; SI-NEXT: s_and_b32 s4, s40, 0xff -; SI-NEXT: s_lshl_b32 s5, s15, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: 
v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v51, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v28 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v44, v1 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: v_add_i32_e32 v32, vcc, 0x3000000, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v30 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v35, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v62 +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v50 -; SI-NEXT: v_or_b32_e32 v1, v43, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v5, v42, v5 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: v_add_i32_e32 v38, vcc, 0x3000000, v1 -; SI-NEXT: s_and_b32 s4, s13, 0xff -; SI-NEXT: s_lshl_b32 s5, s12, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v7, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v36, vcc, 0x3000000, v0 +; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v20 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_addk_i32 s4, 0x300 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v54, v1 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: v_add_i32_e32 v39, vcc, 0x3000000, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v22 +; SI-NEXT: v_or_b32_e32 v1, v52, v1 +; SI-NEXT: v_lshr_b64 v[25:26], v[50:51], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[54:55], 16 +; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v36 +; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v48, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v22 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v24 +; SI-NEXT: v_or_b32_e32 v0, v5, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v24 -; SI-NEXT: v_or_b32_e32 v1, v53, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v5, v52, v5 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: v_add_i32_e32 v18, vcc, 0x3000000, v1 -; SI-NEXT: s_and_b32 s4, s11, 0xff -; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; 
SI-NEXT: v_or_b32_e32 v1, v41, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v49, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v10 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v12 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v51, v1 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: v_add_i32_e32 v37, vcc, 0x3000000, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v53, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v33, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v14 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v16 +; SI-NEXT: v_or_b32_e32 v0, v57, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_add_i32_e32 v5, vcc, 3, v16 -; SI-NEXT: v_or_b32_e32 v1, v49, v1 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_add_i32_e32 v1, vcc, 0x300, v1 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 -; SI-NEXT: v_or_b32_e32 v1, v5, v1 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: v_add_i32_e32 v36, vcc, 0x3000000, v1 -; SI-NEXT: s_and_b32 s4, s9, 0xff -; SI-NEXT: s_lshl_b32 s5, s8, 8 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v56, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v34, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v4 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_addk_i32 s4, 0x300 +; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: v_or_b32_e32 v1, v3, v1 -; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 -; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s8, s26, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s27, 24 -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s46, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s28, 0xff -; SI-NEXT: s_lshl_b32 s5, s29, 8 -; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s6, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s7, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s14, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s16, 0xff -; SI-NEXT: s_lshl_b32 s5, s17, 8 -; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s18, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s19, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 -; SI-NEXT: v_add_i32_e32 v35, vcc, 0x3000000, v1 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v6 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 
s5, s5, s6 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v1, v18, v1 +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_add_i32_e32 v37, vcc, 0x3000000, v0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v6 +; SI-NEXT: v_and_b32_e32 v0, 0xff, v0 ; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v8 -; SI-NEXT: s_add_i32 s47, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: v_or_b32_e32 v0, v47, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_and_b32 s6, s22, 0xff ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x300, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s23, 24 -; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: v_or_b32_e32 v1, v45, v1 ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x3000000, v0 -; SI-NEXT: s_add_i32 s41, s4, 0x3000000 -; SI-NEXT: v_mov_b32_e32 v0, s47 -; SI-NEXT: v_alignbit_b32 v1, s41, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s46 -; SI-NEXT: v_alignbit_b32 v5, s14, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, v10, v35, 16 -; SI-NEXT: v_alignbit_b32 v13, v36, v37, 16 -; SI-NEXT: v_alignbit_b32 v17, v18, v39, 16 -; SI-NEXT: v_alignbit_b32 v21, v38, v32, 16 -; SI-NEXT: v_alignbit_b32 v25, v26, v33, 16 -; SI-NEXT: v_alignbit_b32 v29, v48, v34, 16 -; SI-NEXT: s_lshr_b32 s56, s41, 16 -; SI-NEXT: s_lshr_b32 s57, s14, 16 -; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v36 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v38 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v48 -; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: v_add_i32_e32 v38, vcc, 0x3000000, v0 +; SI-NEXT: v_lshr_b64 v[0:1], v[37:38], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[0:1], v[33:34], 16 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[0:1], v[48:49], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[35:36], 16 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v38 +; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v49 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: .LBB99_3: ; %end +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v63, off, s[0:3], 
s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload @@ -69650,55 +69615,62 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v0, s47 -; SI-NEXT: v_mov_b32_e32 v2, s41 -; SI-NEXT: v_mov_b32_e32 v3, s56 -; SI-NEXT: v_mov_b32_e32 v4, s46 -; SI-NEXT: v_mov_b32_e32 v6, s14 -; SI-NEXT: v_mov_b32_e32 v7, s57 -; SI-NEXT: v_mov_b32_e32 v8, v35 -; SI-NEXT: v_mov_b32_e32 v12, v37 -; SI-NEXT: v_mov_b32_e32 v14, v36 -; SI-NEXT: v_mov_b32_e32 v16, v39 -; SI-NEXT: v_mov_b32_e32 v20, v32 -; SI-NEXT: v_mov_b32_e32 v22, v38 -; SI-NEXT: v_mov_b32_e32 v24, v33 -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v6, s5 +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_mov_b32_e32 v8, v37 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_mov_b32_e32 v10, v38 +; SI-NEXT: v_mov_b32_e32 v12, v33 +; SI-NEXT: v_mov_b32_e32 v14, v34 +; SI-NEXT: v_mov_b32_e32 v16, v48 +; SI-NEXT: v_mov_b32_e32 v18, v49 +; SI-NEXT: v_mov_b32_e32 v20, v35 +; SI-NEXT: v_mov_b32_e32 v22, v36 +; SI-NEXT: v_mov_b32_e32 v24, v50 +; SI-NEXT: v_mov_b32_e32 v26, v51 +; SI-NEXT: v_mov_b32_e32 v28, v54 +; SI-NEXT: v_mov_b32_e32 v30, v55 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: v_mov_b32_e32 v39, v32 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_branch .LBB99_2 ; @@ -69721,139 +69693,126 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, v20 -; VI-NEXT: v_mov_b32_e32 v53, v14 -; VI-NEXT: v_mov_b32_e32 v34, v12 -; VI-NEXT: v_mov_b32_e32 v32, v0 +; VI-NEXT: v_mov_b32_e32 v31, v30 +; VI-NEXT: v_mov_b32_e32 v38, v28 +; VI-NEXT: v_mov_b32_e32 v32, v26 +; VI-NEXT: v_mov_b32_e32 v30, v24 +; VI-NEXT: v_mov_b32_e32 v26, v22 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v48, v14 +; VI-NEXT: v_mov_b32_e32 v39, v12 +; VI-NEXT: v_mov_b32_e32 v20, v10 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v36, v4 +; VI-NEXT: v_mov_b32_e32 v37, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 ; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:56 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:64 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:68 -; VI-NEXT: v_mov_b32_e32 v51, v23 -; VI-NEXT: v_mov_b32_e32 v30, v26 -; VI-NEXT: v_mov_b32_e32 v26, v22 ; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v1 ; 
VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v17 -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v19 -; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v21 -; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v51 -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v40, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v31 -; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v33 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v4 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v35 -; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v8 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v33 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB99_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v34 -; VI-NEXT: v_or_b32_sdwa v1, v34, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; 
VI-NEXT: v_mov_b32_e32 v35, v6 -; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v49, v7 -; VI-NEXT: v_or_b32_sdwa v3, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v55, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v26, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v0, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v19, v13 -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_mov_b32_e32 v39, v14 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v21, v15 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_mov_b32_e32 v20, v5 -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: v_mov_b32_e32 v25, v23 -; VI-NEXT: v_mov_b32_e32 v48, v51 -; VI-NEXT: 
v_mov_b32_e32 v23, v26 -; VI-NEXT: v_mov_b32_e32 v26, v30 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v34, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v54, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v41, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v42, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v35, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v37, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 @@ -69862,6 +69821,7 @@ 
define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 @@ -69870,70 +69830,75 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v24, v36 +; VI-NEXT: v_mov_b32_e32 v28, v26 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v30, v34 ; VI-NEXT: s_cbranch_execnz .LBB99_3 ; VI-NEXT: .LBB99_2: ; %cmp.true +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 -; VI-NEXT: v_or_b32_sdwa v13, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v13, v61, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v41 -; VI-NEXT: v_or_b32_sdwa v12, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v12, v59, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 -; VI-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v11, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 -; VI-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 -; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 -; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24 -; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 -; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v22, 
v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v11, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v26, v24 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v38 +; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v9, v25, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v8, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 -; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 -; VI-NEXT: v_or_b32_sdwa v7, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v36 -; VI-NEXT: v_or_b32_sdwa v6, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v7, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 ; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v6, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 -; VI-NEXT: v_or_b32_sdwa v5, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 -; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v5, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; 
VI-NEXT: v_add_u32_e32 v3, vcc, 3, v34 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 @@ -69961,12 +69926,11 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s10, s16, 0xff ; VI-NEXT: s_lshl_b32 s11, s17, 8 ; VI-NEXT: s_or_b32 s10, s11, s10 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 ; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_lshl_b32 s5, s5, 16 @@ -69976,7 +69940,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_or_b32 s9, s9, s10 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_or_b32 s5, s5, s6 @@ -69988,10 +69952,10 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v10, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v11, v11, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v12, v12, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v10, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v12, v12, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -70001,12 +69965,11 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: 
v_add_u32_e32 v10, vcc, 0x3000000, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v2 @@ -70014,11 +69977,12 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 ; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v3, s4, v3 ; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -70044,22 +70008,8 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB99_4: -; VI-NEXT: v_mov_b32_e32 v25, v23 -; VI-NEXT: v_mov_b32_e32 v23, v26 -; VI-NEXT: v_mov_b32_e32 v26, v30 -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v48, v51 -; VI-NEXT: v_mov_b32_e32 v31, v10 -; VI-NEXT: v_mov_b32_e32 v36, v34 -; VI-NEXT: v_mov_b32_e32 v35, v6 -; VI-NEXT: v_mov_b32_e32 v37, v8 -; VI-NEXT: v_mov_b32_e32 v39, v14 -; VI-NEXT: v_mov_b32_e32 v21, v15 -; VI-NEXT: v_mov_b32_e32 v19, v13 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_mov_b32_e32 v49, v7 -; VI-NEXT: v_mov_b32_e32 v20, v5 +; VI-NEXT: v_mov_b32_e32 v24, v36 +; VI-NEXT: v_mov_b32_e32 v28, v26 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB99_2 ; @@ -70082,244 +70032,228 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v34, v30 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v48, v30 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v37, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v24 +; GFX9-NEXT: v_mov_b32_e32 v32, v22 +; GFX9-NEXT: v_mov_b32_e32 v30, v20 +; GFX9-NEXT: v_mov_b32_e32 v49, v14 +; GFX9-NEXT: v_mov_b32_e32 v22, v12 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: 
v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-NEXT: v_mov_b32_e32 v26, v2 +; GFX9-NEXT: v_mov_b32_e32 v24, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 -; GFX9-NEXT: v_mov_b32_e32 v51, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v51 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v32 -; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v38 +; GFX9-NEXT: v_lshlrev_b32_e32 v54, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v55, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 
; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v4 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v30 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v31 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v33 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v37 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v35 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v39 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v49 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v36 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v31 ; GFX9-NEXT: s_cbranch_scc0 .LBB99_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v50, v3 -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v0, v20, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v31, v5 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v16, v22 -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v37, v24 -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v47 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 -; GFX9-NEXT: v_mov_b32_e32 v55, v11 ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: v_mov_b32_e32 v33, v12 ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v19, v13 ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_or_b32_sdwa v2, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v42, v15 -; GFX9-NEXT: v_mov_b32_e32 v27, v25 -; GFX9-NEXT: v_mov_b32_e32 v30, v18 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: v_mov_b32_e32 v39, v26 -; GFX9-NEXT: v_mov_b32_e32 v35, v28 -; 
GFX9-NEXT: v_mov_b32_e32 v54, v31 -; GFX9-NEXT: v_mov_b32_e32 v31, v51 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v18, v22 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v20, v24 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB99_3 ; GFX9-NEXT: .LBB99_2: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v13, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v15, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v41 -; GFX9-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v40 -; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 -; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 -; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: 
v_add_u32_e32 v25, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 -; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 -; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 -; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v3, 
v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v28 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 @@ -70344,13 +70278,18 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s9, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 ; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_and_b32 s10, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s19, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_or_b32 s10, s11, s10 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: s_addk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s5, 0x300 @@ -70359,69 +70298,48 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_addk_i32 s9, 0x300 ; GFX9-NEXT: s_addk_i32 s10, 0x300 -; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v8, v16, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v16, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v18, 16, v8 ; GFX9-NEXT: v_lshl_or_b32 v9, v21, 16, v9 ; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 -; GFX9-NEXT: v_lshl_or_b32 v12, v36, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v29, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v24, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 
offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v22, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff ; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 ; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v22 ; GFX9-NEXT: .LBB99_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -70443,27 +70361,7 @@ define inreg <32 x i16> @bitcast_v64i8_to_v32i16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB99_4: -; GFX9-NEXT: v_mov_b32_e32 v30, v18 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v54, v31 -; GFX9-NEXT: v_mov_b32_e32 v29, v14 -; GFX9-NEXT: v_mov_b32_e32 v48, v10 -; GFX9-NEXT: v_mov_b32_e32 v39, v26 -; GFX9-NEXT: v_mov_b32_e32 v32, v16 -; GFX9-NEXT: v_mov_b32_e32 v16, v22 -; GFX9-NEXT: v_mov_b32_e32 v33, v12 -; GFX9-NEXT: v_mov_b32_e32 v35, v28 -; GFX9-NEXT: v_mov_b32_e32 v37, v24 -; GFX9-NEXT: v_mov_b32_e32 v31, v51 -; GFX9-NEXT: v_mov_b32_e32 v27, v25 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v42, v15 -; GFX9-NEXT: v_mov_b32_e32 v19, v13 -; GFX9-NEXT: v_mov_b32_e32 v55, v11 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 -; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v31, v5 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB99_2 ; @@ 
-76984,716 +76882,611 @@ define inreg <64 x i8> @bitcast_v32f16_to_v64i8_scalar(<32 x half> inreg %a, i32 ; SI-LABEL: bitcast_v32f16_to_v64i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v22, s17 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v40, s30, 0 +; SI-NEXT: v_writelane_b32 v40, s31, 1 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v10, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v9, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v7, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s20 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v11, s22 -; SI-NEXT: 
v_cvt_f16_f32_e32 v46, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v10, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s24 ; SI-NEXT: v_cvt_f16_f32_e32 v5, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s29 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v13, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v17, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s28 +; SI-NEXT: v_writelane_b32 v40, s34, 2 +; SI-NEXT: v_writelane_b32 v40, s35, 3 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; SI-NEXT: v_writelane_b32 v40, s36, 4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: v_writelane_b32 v40, s37, 5 ; SI-NEXT: s_cbranch_scc0 .LBB105_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 -; SI-NEXT: v_or_b32_e32 v37, v10, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v7 -; SI-NEXT: v_or_b32_e32 v32, v9, v8 -; SI-NEXT: v_alignbit_b32 v8, v32, v37, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v32, v37, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v32, v37, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; SI-NEXT: v_or_b32_e32 v24, v12, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v6 -; SI-NEXT: v_or_b32_e32 v23, v11, v8 -; SI-NEXT: v_alignbit_b32 v8, v23, v24, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v23, v24, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v23, v24, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v46 -; SI-NEXT: v_or_b32_e32 v18, v42, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; SI-NEXT: v_or_b32_e32 v19, v14, v8 -; SI-NEXT: v_alignbit_b32 v8, v19, v18, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v19, v18, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v19, v18, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v26 -; SI-NEXT: v_or_b32_e32 v16, v25, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v20 -; SI-NEXT: v_or_b32_e32 v17, v28, v8 -; SI-NEXT: v_alignbit_b32 
v8, v17, v16, 24 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v17, v16, 16 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v8, v17, v16, 8 -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v27 -; SI-NEXT: v_or_b32_e32 v15, v21, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; SI-NEXT: v_or_b32_e32 v14, v62, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 -; SI-NEXT: v_or_b32_e32 v12, v34, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; SI-NEXT: v_or_b32_e32 v13, v30, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v53 -; SI-NEXT: v_or_b32_e32 v10, v50, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; SI-NEXT: v_or_b32_e32 v11, v48, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 -; SI-NEXT: v_or_b32_e32 v9, v40, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 24 -; SI-NEXT: v_or_b32_e32 v8, v55, v8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v8, v9, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v8, v9, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v8, v9, 8 -; SI-NEXT: v_alignbit_b32 v57, v14, v15, 24 -; SI-NEXT: v_alignbit_b32 v58, v14, v15, 16 -; SI-NEXT: v_alignbit_b32 v61, v14, v15, 8 -; SI-NEXT: v_alignbit_b32 v44, v13, v12, 24 -; SI-NEXT: v_alignbit_b32 v47, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v56, v13, v12, 8 -; SI-NEXT: v_alignbit_b32 v43, v11, v10, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v32 -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v23 -; SI-NEXT: v_lshrrev_b32_e32 v36, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v17 -; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v14 -; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v13 -; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v8 -; SI-NEXT: v_bfe_u32 v54, v7, 8, 8 -; SI-NEXT: v_bfe_u32 v51, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v49, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v38, v20, 8, 8 -; SI-NEXT: v_bfe_u32 v33, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v29, v3, 8, 8 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_bfe_u32 v22, v2, 8, 8 -; SI-NEXT: v_bfe_u32 v60, v1, 8, 8 +; SI-NEXT: v_readfirstlane_b32 s4, v21 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v9 +; SI-NEXT: s_or_b32 s18, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v8 +; SI-NEXT: s_or_b32 s19, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v12 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v11 +; SI-NEXT: s_or_b32 s16, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v6 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v10 +; SI-NEXT: s_or_b32 s17, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: s_or_b32 s14, 
s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v13 +; SI-NEXT: s_or_b32 s15, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v17 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: s_or_b32 s12, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v20 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v22 +; SI-NEXT: s_or_b32 s13, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v25 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v24 +; SI-NEXT: s_or_b32 s10, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v4 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v23 +; SI-NEXT: s_or_b32 s11, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v28 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v27 +; SI-NEXT: s_or_b32 s8, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v26 +; SI-NEXT: s_or_b32 s9, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v33 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v31 +; SI-NEXT: s_or_b32 s6, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v2 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v29 +; SI-NEXT: s_or_b32 s7, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s4, v38 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s5, v37 +; SI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_readfirstlane_b32 s21, v35 +; SI-NEXT: s_lshr_b64 s[22:23], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[24:25], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_or_b32 s5, s21, s5 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[88:89], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s45, s19, 8 +; SI-NEXT: s_lshr_b32 s43, s17, 8 +; SI-NEXT: s_lshr_b32 s41, s15, 8 +; SI-NEXT: s_lshr_b32 s29, s13, 8 +; SI-NEXT: s_lshr_b32 s27, s11, 8 +; SI-NEXT: s_lshr_b32 s25, s9, 8 +; SI-NEXT: s_lshr_b32 s23, s7, 8 +; SI-NEXT: s_lshr_b32 s21, s5, 8 +; SI-NEXT: v_bfe_u32 v48, v7, 8, 8 +; SI-NEXT: v_bfe_u32 v39, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v36, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v34, v20, 8, 8 +; SI-NEXT: v_bfe_u32 v32, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v30, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v19, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v18, v1, 8, 8 ; SI-NEXT: s_cbranch_execnz .LBB105_3 ; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v18, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v9, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v19, 
v37 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v10, v55 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_readfirstlane_b32 s4, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v35 +; SI-NEXT: v_readfirstlane_b32 s5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v9, v9, v8 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v11, v53 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; SI-NEXT: v_or_b32_e32 v8, v10, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v48 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: v_readfirstlane_b32 s6, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v31 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: v_readfirstlane_b32 s5, v1 +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: v_readfirstlane_b32 s6, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v28 +; SI-NEXT: v_readfirstlane_b32 s7, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v29 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v34 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v2 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: v_readfirstlane_b32 s8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v27 +; SI-NEXT: v_readfirstlane_b32 s7, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_or_b32_e32 v11, v12, v11 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v30 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: s_lshl_b32 s7, s7, 16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_or_b32 s7, s8, s7 +; SI-NEXT: v_readfirstlane_b32 s8, v19 +; SI-NEXT: 
v_cvt_f32_f16_e32 v19, v25 +; SI-NEXT: v_readfirstlane_b32 s9, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v21 -; SI-NEXT: v_or_b32_e32 v12, v14, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v27 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v13, v13, v15 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v62 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_readfirstlane_b32 s10, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: s_lshl_b32 s8, s8, 16 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: s_or_b32 s8, s9, s8 +; SI-NEXT: v_readfirstlane_b32 s9, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v15, v15, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v26 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; SI-NEXT: v_or_b32_e32 v14, v16, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 +; SI-NEXT: v_readfirstlane_b32 s11, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v23 +; SI-NEXT: s_lshl_b32 s9, s9, 16 ; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: s_or_b32 s9, s10, s9 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_readfirstlane_b32 s10, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: s_lshl_b32 s10, s10, 16 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v42 +; SI-NEXT: s_or_b32 s10, s11, s10 +; SI-NEXT: v_readfirstlane_b32 s11, v4 +; SI-NEXT: s_lshl_b32 s11, s11, 16 +; SI-NEXT: v_readfirstlane_b32 s12, v18 +; SI-NEXT: s_or_b32 s11, s12, s11 +; SI-NEXT: v_readfirstlane_b32 s12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v20 +; SI-NEXT: v_readfirstlane_b32 s13, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v17 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 
v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: s_or_b32 s12, s13, s12 +; SI-NEXT: v_readfirstlane_b32 s13, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v5 +; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: s_lshl_b32 s13, s13, 16 +; SI-NEXT: v_readfirstlane_b32 s14, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_readfirstlane_b32 s17, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v21 +; SI-NEXT: s_or_b32 s13, s14, s13 +; SI-NEXT: v_readfirstlane_b32 s14, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: v_readfirstlane_b32 s15, v14 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v19 -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v18, v21, v18 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_alignbit_b32 v57, v14, v15, 24 -; SI-NEXT: v_alignbit_b32 v58, v14, v15, 16 -; SI-NEXT: v_alignbit_b32 v61, v14, v15, 8 -; SI-NEXT: v_alignbit_b32 v44, v13, v12, 24 -; SI-NEXT: v_alignbit_b32 v47, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v56, v13, v12, 8 -; SI-NEXT: v_alignbit_b32 v43, v11, v10, 8 -; SI-NEXT: v_lshrrev_b32_e32 v31, 8, v17 -; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v14 -; SI-NEXT: v_lshrrev_b32_e32 v63, 8, v13 -; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v45, 8, v8 -; SI-NEXT: v_bfe_u32 v54, v7, 8, 8 -; SI-NEXT: v_bfe_u32 v51, v6, 8, 8 -; SI-NEXT: v_bfe_u32 v49, v5, 8, 8 -; SI-NEXT: v_bfe_u32 v38, v20, 8, 8 -; SI-NEXT: v_bfe_u32 v33, v4, 8, 8 -; SI-NEXT: v_bfe_u32 v29, v3, 8, 8 -; SI-NEXT: v_bfe_u32 v60, v1, 8, 8 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v19, v19, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_lshrrev_b32_e32 v36, 8, v19 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v24, v22, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v23, 
v23, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v39, 8, v23 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v37, v22, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 -; SI-NEXT: v_or_b32_e32 v32, v25, v21 -; SI-NEXT: v_alignbit_b32 v21, v32, v37, 24 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v32, v37, 16 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v32, v37, 8 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v23, v24, 24 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v23, v24, 16 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v23, v24, 8 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v19, v18, 24 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v19, v18, 16 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v19, v18, 8 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v17, v16, 24 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v17, v16, 16 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v17, v16, 8 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v11, v10, 24 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v11, v10, 16 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v8, v9, 24 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v8, v9, 16 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v21, v8, v9, 8 -; SI-NEXT: v_lshrrev_b32_e32 v52, 8, v32 -; SI-NEXT: v_bfe_u32 v22, v2, 8, 8 -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_or_b32 s14, s15, s14 +; SI-NEXT: v_readfirstlane_b32 s15, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, 
v6 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: s_lshl_b32 s15, s15, 16 +; SI-NEXT: v_readfirstlane_b32 s16, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_readfirstlane_b32 s16, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_readfirstlane_b32 s17, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: s_lshl_b32 s17, s17, 16 +; SI-NEXT: v_readfirstlane_b32 s18, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: s_or_b32 s17, s18, s17 +; SI-NEXT: v_readfirstlane_b32 s18, v11 +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: v_readfirstlane_b32 s19, v9 +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_readfirstlane_b32 s19, v7 +; SI-NEXT: s_lshl_b32 s19, s19, 16 +; SI-NEXT: v_readfirstlane_b32 s20, v8 +; SI-NEXT: s_or_b32 s19, s20, s19 +; SI-NEXT: s_lshr_b64 s[20:21], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[22:23], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[26:27], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[24:25], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[28:29], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[16:17], 8 +; SI-NEXT: s_lshr_b64 s[40:41], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[44:45], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 24 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[12:13], 8 +; SI-NEXT: s_lshr_b64 s[60:61], s[10:11], 24 +; SI-NEXT: s_lshr_b64 s[72:73], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[10:11], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[78:79], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[88:89], s[6:7], 24 +; SI-NEXT: s_lshr_b64 s[92:93], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[6:7], 8 +; SI-NEXT: s_lshr_b64 s[30:31], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[34:35], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[36:37], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s45, s19, 8 +; SI-NEXT: s_lshr_b32 s43, s17, 8 +; SI-NEXT: s_lshr_b32 s41, s15, 8 +; SI-NEXT: s_lshr_b32 s29, s13, 8 +; SI-NEXT: s_lshr_b32 s27, s11, 8 +; SI-NEXT: s_lshr_b32 s25, s9, 8 +; SI-NEXT: s_lshr_b32 s23, s7, 8 +; SI-NEXT: s_lshr_b32 s21, s5, 8 +; SI-NEXT: v_bfe_u32 v48, v7, 8, 8 +; SI-NEXT: v_bfe_u32 v39, v6, 8, 8 +; SI-NEXT: v_bfe_u32 v36, v5, 8, 8 +; SI-NEXT: v_bfe_u32 v34, v20, 8, 8 +; SI-NEXT: v_bfe_u32 v32, v4, 8, 8 +; SI-NEXT: v_bfe_u32 v30, v3, 8, 8 +; SI-NEXT: v_bfe_u32 v19, v2, 8, 8 +; SI-NEXT: v_bfe_u32 v18, v1, 8, 8 ; SI-NEXT: .LBB105_3: ; %end -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xff, v37 +; SI-NEXT: s_and_b32 s18, s18, 0xff +; SI-NEXT: s_lshl_b32 s26, s26, 8 +; SI-NEXT: s_and_b32 s22, s22, 0xff +; SI-NEXT: s_or_b32 s18, s18, s26 +; SI-NEXT: s_lshl_b32 s22, s22, 16 +; SI-NEXT: s_lshl_b32 s20, s20, 24 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: s_or_b32 s20, s20, s22 +; SI-NEXT: s_or_b32 s18, s18, s20 +; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: s_and_b32 s18, s19, 0xff +; SI-NEXT: s_lshl_b32 s19, s45, 8 ; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s18, s18, s19 ; SI-NEXT: 
v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 -; SI-NEXT: v_or_b32_e32 v21, v21, v25 -; SI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v26, 24, v26 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v25, 0xff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v25, v26, v25 -; SI-NEXT: v_or_b32_e32 v21, v21, v25 -; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v25, 8, v52 -; SI-NEXT: v_or_b32_e32 v21, v21, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 24, v54 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v7, v25, v7 -; SI-NEXT: v_or_b32_e32 v7, v21, v7 -; SI-NEXT: v_add_i32_e32 v21, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v7, v21, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v8, 24, v48 +; SI-NEXT: s_and_b32 s18, s18, 0xffff +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_or_b32_e32 v7, s18, v7 +; SI-NEXT: s_and_b32 s16, s16, 0xff +; SI-NEXT: s_lshl_b32 s18, s42, 8 +; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: s_and_b32 s18, s28, 0xff +; SI-NEXT: s_lshl_b32 s18, s18, 16 +; SI-NEXT: s_lshl_b32 s19, s24, 24 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s18, s19, s18 +; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0 +; SI-NEXT: s_or_b32 s16, s16, s18 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v24 -; SI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; SI-NEXT: v_or_b32_e32 v7, v7, v21 -; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v24 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v24, v21 -; SI-NEXT: v_or_b32_e32 v7, v7, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v7, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v39 -; SI-NEXT: v_or_b32_e32 v7, v7, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v51 -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v6, v21, v6 +; SI-NEXT: v_add_i32_e32 v7, vcc, 8, v0 +; SI-NEXT: v_mov_b32_e32 v8, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xff +; SI-NEXT: s_lshl_b32 s17, s43, 8 +; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 +; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: 
v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v39 +; SI-NEXT: s_and_b32 s16, s16, 0xffff ; SI-NEXT: v_or_b32_e32 v6, v7, v6 +; SI-NEXT: v_or_b32_e32 v6, s16, v6 +; SI-NEXT: s_and_b32 s14, s14, 0xff +; SI-NEXT: s_lshl_b32 s16, s56, 8 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: s_and_b32 s16, s44, 0xff +; SI-NEXT: s_lshl_b32 s16, s16, 16 +; SI-NEXT: s_lshl_b32 s17, s40, 24 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 ; SI-NEXT: v_add_i32_e32 v7, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v18 -; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v18 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v18, v7 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 16, v0 +; SI-NEXT: s_or_b32 s14, s14, s16 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v36 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v49 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v5, v7, v5 +; SI-NEXT: v_add_i32_e32 v6, vcc, 16, v0 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xff +; SI-NEXT: s_lshl_b32 s15, s41, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 +; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v36 +; SI-NEXT: s_and_b32 s14, s14, 0xffff ; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v5, s14, v5 +; SI-NEXT: s_and_b32 s12, s12, 0xff +; SI-NEXT: s_lshl_b32 s14, s62, 8 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: s_and_b32 s14, s58, 0xff +; SI-NEXT: s_lshl_b32 s14, s14, 16 +; SI-NEXT: s_lshl_b32 s15, s46, 24 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_or_b32 s14, s15, s14 ; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 +; SI-NEXT: s_or_b32 s12, s12, s14 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v16 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v5, vcc, 24, v0 +; SI-NEXT: v_mov_b32_e32 v6, s12 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: 
s_and_b32 s12, s13, 0xff +; SI-NEXT: s_lshl_b32 s13, s29, 8 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v20 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v31 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v38 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v34 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_or_b32_e32 v5, s12, v5 +; SI-NEXT: s_and_b32 s10, s10, 0xff +; SI-NEXT: s_lshl_b32 s12, s76, 8 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: s_and_b32 s12, s72, 0xff +; SI-NEXT: s_lshl_b32 s12, s12, 16 +; SI-NEXT: s_lshl_b32 s13, s60, 24 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_or_b32 s12, s13, s12 ; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 ; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v61 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 24, v57 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v6, v7, v6 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v28 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v33 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_or_b32_e32 v4, v6, v4 +; SI-NEXT: v_add_i32_e32 v5, vcc, 32, v0 +; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xff +; SI-NEXT: s_lshl_b32 s11, s27, 8 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v32 +; SI-NEXT: s_and_b32 s10, s10, 0xffff ; SI-NEXT: v_or_b32_e32 v4, v5, v4 +; SI-NEXT: v_or_b32_e32 v4, s10, v4 +; SI-NEXT: s_and_b32 s8, s8, 0xff +; SI-NEXT: s_lshl_b32 s10, s90, 8 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: s_and_b32 s10, s78, 0xff +; SI-NEXT: s_lshl_b32 s10, s10, 16 +; SI-NEXT: s_lshl_b32 s11, s74, 24 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 ; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v56 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_and_b32_e32 v5, 0xff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v44 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v5, v6, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v63 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v29 -; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: v_add_i32_e32 v4, vcc, 40, 
v0 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xff +; SI-NEXT: s_lshl_b32 s9, s25, 8 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v30 +; SI-NEXT: s_and_b32 s8, s8, 0xffff ; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: v_or_b32_e32 v3, s8, v3 +; SI-NEXT: s_and_b32 s6, s6, 0xff +; SI-NEXT: s_lshl_b32 s8, s94, 8 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: s_and_b32 s8, s92, 0xff +; SI-NEXT: s_lshl_b32 s8, s8, 16 +; SI-NEXT: s_lshl_b32 s9, s88, 24 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s8, s9, s8 ; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v43 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v59 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v22 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v2, v4, v2 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xff +; SI-NEXT: s_lshl_b32 s7, s23, 8 +; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 +; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v19 +; SI-NEXT: s_and_b32 s6, s6, 0xffff ; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_or_b32_e32 v2, s6, v2 +; SI-NEXT: s_and_b32 s4, s4, 0xff +; SI-NEXT: s_lshl_b32 s6, s36, 8 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: s_and_b32 s6, s34, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s30, 24 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v9 -; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 
0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v45 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v60 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xff +; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v2, 24, v18 +; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, s4, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: v_readlane_b32 s37, v40, 5 +; SI-NEXT: v_readlane_b32 s36, v40, 4 +; SI-NEXT: v_readlane_b32 s35, v40, 3 +; SI-NEXT: v_readlane_b32 s34, v40, 2 +; SI-NEXT: v_readlane_b32 s31, v40, 1 +; SI-NEXT: v_readlane_b32 s30, v40, 0 +; SI-NEXT: s_or_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB105_4: -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $sgpr45 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr43 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr56 
+; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr41 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr63 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; kill: killed $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr29 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr27 +; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr25 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr21 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $sgpr36 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: s_branch .LBB105_2 ; ; VI-LABEL: bitcast_v32f16_to_v64i8_scalar: @@ -81898,139 +81691,126 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: 
buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, v20 -; VI-NEXT: v_mov_b32_e32 v53, v14 -; VI-NEXT: v_mov_b32_e32 v34, v12 -; VI-NEXT: v_mov_b32_e32 v32, v0 +; VI-NEXT: v_mov_b32_e32 v31, v30 +; VI-NEXT: v_mov_b32_e32 v38, v28 +; VI-NEXT: v_mov_b32_e32 v32, v26 +; VI-NEXT: v_mov_b32_e32 v30, v24 +; VI-NEXT: v_mov_b32_e32 v26, v22 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v48, v14 +; VI-NEXT: v_mov_b32_e32 v39, v12 +; VI-NEXT: v_mov_b32_e32 v20, v10 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v36, v4 +; VI-NEXT: v_mov_b32_e32 v37, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], s32 offset:12 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 ; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:56 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:64 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:68 -; VI-NEXT: v_mov_b32_e32 v51, v23 -; VI-NEXT: v_mov_b32_e32 v30, v26 -; VI-NEXT: v_mov_b32_e32 v26, v22 ; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v17 -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v19 -; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v21 -; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v51 -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v40, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: 
v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v31 -; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v33 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v4 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v35 -; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v8 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v33 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB107_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v34 -; VI-NEXT: v_or_b32_sdwa v1, v34, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v35, v6 -; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v49, v7 -; VI-NEXT: v_or_b32_sdwa v3, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v40 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v55, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v26, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v0, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v19, v13 -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_mov_b32_e32 v39, v14 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v21, v15 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_mov_b32_e32 v20, v5 -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: v_mov_b32_e32 v25, v23 -; VI-NEXT: v_mov_b32_e32 v48, v51 -; VI-NEXT: v_mov_b32_e32 v23, v26 -; VI-NEXT: v_mov_b32_e32 v26, v30 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v34, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v54, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v41, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: 
v_or_b32_sdwa v1, v41, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v42, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v35, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v37, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 @@ -82039,6 +81819,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 @@ -82047,70 +81828,75 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v24, v36 +; VI-NEXT: v_mov_b32_e32 v28, v26 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 
v2, s6 -; VI-NEXT: v_mov_b32_e32 v30, v34 ; VI-NEXT: s_cbranch_execnz .LBB107_3 ; VI-NEXT: .LBB107_2: ; %cmp.true +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 -; VI-NEXT: v_or_b32_sdwa v13, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v13, v61, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v41 -; VI-NEXT: v_or_b32_sdwa v12, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v12, v59, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 -; VI-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v11, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 -; VI-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 -; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 -; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24 -; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 -; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v11, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v26, v24 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v38 +; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; 
VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v9, v25, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v8, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 -; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 -; VI-NEXT: v_or_b32_sdwa v7, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v36 -; VI-NEXT: v_or_b32_sdwa v6, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v7, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 ; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v6, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 -; VI-NEXT: v_or_b32_sdwa v5, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 -; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v5, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v34 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 @@ -82138,12 +81924,11 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s10, s16, 0xff ; VI-NEXT: s_lshl_b32 s11, s17, 8 ; VI-NEXT: s_or_b32 s10, s11, s10 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 ; VI-NEXT: 
s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 ; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_lshl_b32 s5, s5, 16 @@ -82153,7 +81938,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_or_b32 s9, s9, s10 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_or_b32 s5, s5, s6 @@ -82165,10 +81950,10 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v10, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v11, v11, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v12, v12, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v10, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v12, v12, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -82178,12 +81963,11 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v2 @@ -82191,11 +81975,12 @@ define inreg <32 
x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 ; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v3, s4, v3 ; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -82221,22 +82006,8 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB107_4: -; VI-NEXT: v_mov_b32_e32 v25, v23 -; VI-NEXT: v_mov_b32_e32 v23, v26 -; VI-NEXT: v_mov_b32_e32 v26, v30 -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v48, v51 -; VI-NEXT: v_mov_b32_e32 v31, v10 -; VI-NEXT: v_mov_b32_e32 v36, v34 -; VI-NEXT: v_mov_b32_e32 v35, v6 -; VI-NEXT: v_mov_b32_e32 v37, v8 -; VI-NEXT: v_mov_b32_e32 v39, v14 -; VI-NEXT: v_mov_b32_e32 v21, v15 -; VI-NEXT: v_mov_b32_e32 v19, v13 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_mov_b32_e32 v49, v7 -; VI-NEXT: v_mov_b32_e32 v20, v5 +; VI-NEXT: v_mov_b32_e32 v24, v36 +; VI-NEXT: v_mov_b32_e32 v28, v26 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB107_2 ; @@ -82259,244 +82030,228 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v34, v30 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v48, v30 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v37, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v24 +; GFX9-NEXT: v_mov_b32_e32 v32, v22 +; GFX9-NEXT: v_mov_b32_e32 v30, v20 +; GFX9-NEXT: v_mov_b32_e32 v49, v14 +; GFX9-NEXT: v_mov_b32_e32 v22, v12 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-NEXT: v_mov_b32_e32 v26, v2 +; GFX9-NEXT: v_mov_b32_e32 v24, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 -; GFX9-NEXT: 
buffer_load_ushort v33, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 -; GFX9-NEXT: v_mov_b32_e32 v51, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v51 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v32 -; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v38 +; GFX9-NEXT: v_lshlrev_b32_e32 v54, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v55, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v4 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v30 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v31 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v33 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v37 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v35 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; 
GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v39 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v49 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v36 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v31 ; GFX9-NEXT: s_cbranch_scc0 .LBB107_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v50, v3 -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v31, v5 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; 
GFX9-NEXT: v_or_b32_sdwa v1, v22, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v16, v22 -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v37, v24 -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 -; GFX9-NEXT: v_mov_b32_e32 v55, v11 ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; 
GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: v_mov_b32_e32 v33, v12 ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v19, v13 ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_or_b32_sdwa v2, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v42, v15 -; GFX9-NEXT: v_mov_b32_e32 v27, v25 -; GFX9-NEXT: v_mov_b32_e32 v30, v18 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: v_mov_b32_e32 v39, v26 -; GFX9-NEXT: v_mov_b32_e32 v35, v28 -; GFX9-NEXT: v_mov_b32_e32 v54, v31 -; GFX9-NEXT: v_mov_b32_e32 v31, v51 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v18, v22 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v20, v24 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB107_3 ; GFX9-NEXT: .LBB107_2: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v13, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v15, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v41 -; GFX9-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v40 -; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 -; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 -; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 -; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 -; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 -; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 +; GFX9-NEXT: v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v28 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 @@ -82521,13 +82276,18 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_and_b32 s9, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 ; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_and_b32 s10, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s19, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_or_b32 s10, s11, s10 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: s_addk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s5, 0x300 @@ -82536,69 +82296,48 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_addk_i32 s9, 0x300 ; GFX9-NEXT: s_addk_i32 s10, 0x300 -; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v8, v16, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v16, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v18, 16, v8 ; GFX9-NEXT: v_lshl_or_b32 v9, v21, 16, v9 ; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 -; GFX9-NEXT: v_lshl_or_b32 v12, v36, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v29, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v24, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 
offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v22, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff ; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 ; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v22 ; GFX9-NEXT: .LBB107_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -82620,27 +82359,7 @@ define inreg <32 x half> @bitcast_v64i8_to_v32f16_scalar(<64 x i8> inreg %a, i32 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB107_4: -; GFX9-NEXT: v_mov_b32_e32 v30, v18 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v54, v31 -; GFX9-NEXT: v_mov_b32_e32 v29, v14 -; GFX9-NEXT: v_mov_b32_e32 v48, v10 -; GFX9-NEXT: v_mov_b32_e32 v39, v26 -; GFX9-NEXT: v_mov_b32_e32 v32, v16 -; GFX9-NEXT: v_mov_b32_e32 v16, v22 -; GFX9-NEXT: v_mov_b32_e32 v33, v12 -; GFX9-NEXT: v_mov_b32_e32 v35, v28 -; GFX9-NEXT: v_mov_b32_e32 v37, v24 -; GFX9-NEXT: v_mov_b32_e32 v31, v51 -; GFX9-NEXT: v_mov_b32_e32 v27, v25 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v42, v15 -; GFX9-NEXT: v_mov_b32_e32 v19, v13 -; GFX9-NEXT: v_mov_b32_e32 v55, v11 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 -; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v31, v5 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB107_2 ; @@ -86513,598 +86232,499 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mul_f32_e64 v19, 1.0, s17 -; SI-NEXT: v_mul_f32_e32 v42, 1.0, v2 -; SI-NEXT: v_mul_f32_e32 v20, 1.0, v1 -; SI-NEXT: v_mul_f32_e32 v25, 1.0, v4 -; SI-NEXT: v_mul_f32_e32 v28, 1.0, v3 -; SI-NEXT: v_mul_f32_e32 v43, 1.0, v6 -; SI-NEXT: v_mul_f32_e32 v23, 1.0, v5 -; SI-NEXT: v_mul_f32_e32 v31, 1.0, v8 -; SI-NEXT: v_mul_f32_e32 v34, 1.0, v7 -; SI-NEXT: v_mul_f32_e32 v44, 1.0, v10 -; SI-NEXT: v_mul_f32_e32 v29, 1.0, v9 -; SI-NEXT: v_mul_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_mul_f32_e32 v35, 1.0, v11 -; SI-NEXT: v_mul_f32_e32 v56, 1.0, v14 -; SI-NEXT: v_mul_f32_e32 v33, 1.0, v13 +; 
SI-NEXT: v_mul_f32_e32 v33, 1.0, v2 +; SI-NEXT: v_mul_f32_e32 v27, 1.0, v1 +; SI-NEXT: v_mul_f32_e32 v50, 1.0, v4 +; SI-NEXT: v_mul_f32_e32 v52, 1.0, v3 +; SI-NEXT: v_mul_f32_e32 v39, 1.0, v6 +; SI-NEXT: v_mul_f32_e32 v49, 1.0, v5 +; SI-NEXT: v_mul_f32_e32 v44, 1.0, v8 +; SI-NEXT: v_mul_f32_e32 v46, 1.0, v7 +; SI-NEXT: v_mul_f32_e32 v40, 1.0, v10 +; SI-NEXT: v_mul_f32_e32 v43, 1.0, v9 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mul_f32_e32 v61, 1.0, v12 +; SI-NEXT: v_mul_f32_e32 v25, 1.0, v11 +; SI-NEXT: v_mul_f32_e32 v57, 1.0, v14 +; SI-NEXT: v_mul_f32_e32 v60, 1.0, v13 ; SI-NEXT: v_mul_f32_e32 v36, 1.0, v16 -; SI-NEXT: v_mul_f32_e32 v39, 1.0, v15 -; SI-NEXT: v_mul_f32_e32 v48, 1.0, v18 -; SI-NEXT: v_mul_f32_e32 v32, 1.0, v17 +; SI-NEXT: v_mul_f32_e32 v37, 1.0, v15 +; SI-NEXT: v_mul_f32_e32 v32, 1.0, v18 +; SI-NEXT: v_mul_f32_e32 v34, 1.0, v17 ; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v22, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v26, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s21 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s20 -; SI-NEXT: v_mul_f32_e64 v41, 1.0, s23 +; SI-NEXT: v_mul_f32_e64 v29, 1.0, s23 ; SI-NEXT: v_mul_f32_e64 v4, 1.0, s22 ; SI-NEXT: v_mul_f32_e64 v8, 1.0, s25 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s24 -; SI-NEXT: v_mul_f32_e64 v26, 1.0, s27 +; SI-NEXT: v_mul_f32_e64 v38, 1.0, s24 +; SI-NEXT: v_mul_f32_e64 v35, 1.0, s27 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s26 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s29 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s28 -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill +; SI-NEXT: v_mul_f32_e64 v53, 1.0, s29 +; SI-NEXT: v_mul_f32_e64 v55, 1.0, s28 +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v22 -; SI-NEXT: v_alignbit_b32 v27, v1, v3, 16 -; SI-NEXT: v_alignbit_b32 v30, v24, v2, 16 -; SI-NEXT: v_alignbit_b32 v1, v30, v27, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v27, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v30, v27, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v23, v1, v3, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 -; SI-NEXT: v_alignbit_b32 v21, v1, v6, 16 -; SI-NEXT: v_alignbit_b32 v19, v17, v4, 16 -; SI-NEXT: v_alignbit_b32 v1, v19, v21, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v19, v21, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v19, v21, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_alignbit_b32 v20, v1, v6, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v26 -; SI-NEXT: v_alignbit_b32 v15, v1, v9, 16 -; SI-NEXT: v_alignbit_b32 v16, v13, v7, 16 -; SI-NEXT: v_alignbit_b32 v1, v16, v15, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v16, v15, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v16, v15, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v42 -; SI-NEXT: v_alignbit_b32 v10, v1, v11, 16 -; SI-NEXT: v_alignbit_b32 v11, v9, v20, 16 -; SI-NEXT: v_alignbit_b32 v1, v11, v10, 24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v11, v10, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v1, v11, v10, 8 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v17, v1, v38, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v53 +; SI-NEXT: v_alignbit_b32 v14, v1, v55, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_alignbit_b32 v11, v1, v52, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v25 -; SI-NEXT: 
v_alignbit_b32 v6, v1, v28, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; SI-NEXT: v_alignbit_b32 v3, v1, v34, 16 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v56 -; SI-NEXT: v_alignbit_b32 v2, v1, v35, 16 -; SI-NEXT: v_alignbit_b32 v8, v7, v33, 16 -; SI-NEXT: v_alignbit_b32 v4, v8, v2, 24 +; SI-NEXT: v_alignbit_b32 v8, v1, v46, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v61 +; SI-NEXT: v_alignbit_b32 v21, v19, v4, 16 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v35 +; SI-NEXT: v_alignbit_b32 v4, v1, v25, 16 ; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v36 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v48 -; SI-NEXT: v_alignbit_b32 v1, v1, v39, 16 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v43 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44 -; SI-NEXT: v_alignbit_b32 v5, v4, v32, 16 -; SI-NEXT: v_mov_b32_e32 v31, v23 -; SI-NEXT: v_alignbit_b32 v20, v18, v23, 16 -; SI-NEXT: v_alignbit_b32 v14, v12, v29, 16 -; SI-NEXT: v_alignbit_b32 v23, v5, v1, 24 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: v_alignbit_b32 v36, v20, v6, 24 -; SI-NEXT: v_alignbit_b32 v25, v14, v3, 24 -; SI-NEXT: v_alignbit_b32 v50, v8, v2, 16 -; SI-NEXT: v_mov_b32_e32 v53, v32 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v23, v5, v1, 16 -; SI-NEXT: v_alignbit_b32 v32, v5, v1, 8 -; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16 -; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8 -; SI-NEXT: v_mov_b32_e32 v35, v29 -; SI-NEXT: v_alignbit_b32 v52, v14, v3, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8 -; SI-NEXT: v_mov_b32_e32 v37, v33 -; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8 -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v23, v41 -; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19 -; SI-NEXT: v_mov_b32_e32 v28, v26 -; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v26 -; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; SI-NEXT: v_mov_b32_e32 v26, v42 -; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v42 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 -; SI-NEXT: v_mov_b32_e32 v29, v43 -; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v43 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20 -; SI-NEXT: v_mov_b32_e32 v34, v44 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14 -; SI-NEXT: v_mov_b32_e32 v33, v56 -; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v48 -; SI-NEXT: v_mov_b32_e32 v48, v32 -; SI-NEXT: v_mov_b32_e32 v32, v50 -; SI-NEXT: v_mov_b32_e32 v50, v25 -; SI-NEXT: v_mov_b32_e32 v25, v36 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_alignbit_b32 v18, v16, v7, 16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v39 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v40 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v57 +; SI-NEXT: v_alignbit_b32 v3, v1, v37, 16 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_alignbit_b32 v24, v22, v2, 16 +; SI-NEXT: v_alignbit_b32 v15, v13, v27, 16 +; SI-NEXT: v_alignbit_b32 v12, v10, v49, 16 +; SI-NEXT: v_alignbit_b32 v9, v7, v43, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v60, 16 +; 
SI-NEXT: v_alignbit_b32 v2, v1, v34, 16 +; SI-NEXT: v_readfirstlane_b32 s8, v23 +; SI-NEXT: v_readfirstlane_b32 s9, v24 +; SI-NEXT: v_readfirstlane_b32 s14, v20 +; SI-NEXT: v_readfirstlane_b32 s15, v21 +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s26, v14 +; SI-NEXT: v_readfirstlane_b32 s27, v15 +; SI-NEXT: v_readfirstlane_b32 s42, v11 +; SI-NEXT: v_readfirstlane_b32 s43, v12 +; SI-NEXT: v_readfirstlane_b32 s56, v8 +; SI-NEXT: v_readfirstlane_b32 s57, v9 +; SI-NEXT: v_readfirstlane_b32 s62, v4 +; SI-NEXT: v_readfirstlane_b32 s63, v5 +; SI-NEXT: v_readfirstlane_b32 s76, v3 +; SI-NEXT: v_readfirstlane_b32 s77, v2 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8 +; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 +; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8 +; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24 +; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8 +; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24 +; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 8 +; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v26 +; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v24 +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29 +; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v21 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v35 +; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v18 +; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v33 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v15 +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v39 +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v40 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v57 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v5 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v32 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true +; SI-NEXT: v_and_b32_e32 v14, 0xffff0000, v53 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v55 +; SI-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; SI-NEXT: v_alignbit_b32 v14, v14, v13, 16 +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v38 +; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v36 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v39 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff0000, v44 +; SI-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v37 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v46 +; SI-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v52 +; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v11 ; 
SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v49 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v53 -; SI-NEXT: v_add_f32_e32 v42, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v42 -; SI-NEXT: v_alignbit_b32 v5, v4, v2, 16 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v33 -; SI-NEXT: v_add_f32_e32 v43, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v43 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v34 -; SI-NEXT: v_add_f32_e32 v44, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v44 -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v29 -; SI-NEXT: v_add_f32_e32 v45, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v45 -; SI-NEXT: v_alignbit_b32 v48, v5, v1, 8 -; SI-NEXT: v_lshrrev_b32_e32 v43, 24, v43 -; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v42 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v37 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v8, v7, v3, 16 -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v32, v8, v2, 16 -; SI-NEXT: v_alignbit_b32 v51, v8, v2, 8 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v61 +; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_alignbit_b32 v3, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v34 +; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_alignbit_b32 v8, v8, v7, 16 +; SI-NEXT: v_and_b32_e32 v7, 
0xffff0000, v43 +; SI-NEXT: v_alignbit_b32 v11, v11, v10, 16 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v49 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v32 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v57 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v7 +; SI-NEXT: v_and_b32_e32 v7, 0xffff0000, v40 +; SI-NEXT: v_add_f32_e32 v12, 0x40c00000, v10 +; SI-NEXT: v_and_b32_e32 v10, 0xffff0000, v39 +; SI-NEXT: v_add_f32_e32 v32, 0x40c00000, v1 +; SI-NEXT: v_alignbit_b32 v4, v5, v4, 16 +; SI-NEXT: v_and_b32_e32 v5, 0xffff0000, v60 +; SI-NEXT: v_add_f32_e32 v25, 0x40c00000, v6 +; SI-NEXT: v_add_f32_e32 v34, 0x40c00000, v7 +; SI-NEXT: v_add_f32_e32 v36, 0x40c00000, v10 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v34 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v36 +; SI-NEXT: v_alignbit_b32 v2, v1, v2, 16 +; SI-NEXT: v_alignbit_b32 v5, v6, v5, 16 +; SI-NEXT: v_alignbit_b32 v9, v7, v9, 16 +; SI-NEXT: v_alignbit_b32 v12, v10, v12, 16 +; SI-NEXT: v_readfirstlane_b32 s76, v3 +; SI-NEXT: v_readfirstlane_b32 s77, v2 +; SI-NEXT: v_readfirstlane_b32 s62, v4 +; SI-NEXT: v_readfirstlane_b32 s63, v5 +; SI-NEXT: v_readfirstlane_b32 s56, v8 +; SI-NEXT: v_readfirstlane_b32 s57, v9 +; SI-NEXT: v_readfirstlane_b32 s42, v11 +; SI-NEXT: v_readfirstlane_b32 s43, v12 +; SI-NEXT: v_readfirstlane_b32 s26, v14 +; SI-NEXT: s_lshr_b64 s[40:41], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[42:43], 8 +; SI-NEXT: s_lshr_b64 s[46:47], s[56:57], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[56:57], 8 +; SI-NEXT: s_lshr_b64 s[60:61], s[62:63], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[62:63], 8 +; SI-NEXT: s_lshr_b64 s[74:75], s[76:77], 16 +; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v12 +; SI-NEXT: v_lshrrev_b32_e32 v41, 8, v9 +; SI-NEXT: v_lshrrev_b32_e32 v54, 8, v5 +; SI-NEXT: v_lshrrev_b32_e32 v48, 8, v2 +; SI-NEXT: v_lshrrev_b32_e32 v56, 24, v36 +; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v34 +; SI-NEXT: v_lshrrev_b32_e32 v42, 24, v25 +; SI-NEXT: v_lshrrev_b32_e32 v51, 24, v32 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; SI-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15 ; SI-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_alignbit_b32 v15, v15, v13, 16 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v21, v19, v17, 16 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; SI-NEXT: v_alignbit_b32 v3, v6, v3, 16 -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v35 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v14, v12, v6, 16 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v50, v14, v3, 24 -; SI-NEXT: v_alignbit_b32 
v52, v14, v3, 16 -; SI-NEXT: v_alignbit_b32 v54, v14, v3, 8 -; SI-NEXT: v_lshrrev_b32_e32 v60, 8, v14 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17 +; SI-NEXT: v_alignbit_b32 v17, v17, v16, 16 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: v_readfirstlane_b32 s20, v17 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v16, 0x40c00000, v13 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 -; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v17 -; SI-NEXT: v_and_b32_e32 v17, 0xffff0000, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v28 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_add_f32_e32 v41, 0x40c00000, v17 -; SI-NEXT: v_add_f32_e32 v56, 0x40c00000, v13 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v41 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v56 -; SI-NEXT: v_lshrrev_b32_e32 v41, 24, v41 -; SI-NEXT: v_lshrrev_b32_e32 v61, 24, v56 -; SI-NEXT: v_lshrrev_b32_e32 v56, 8, v8 -; SI-NEXT: v_alignbit_b32 v19, v17, v19, 16 -; SI-NEXT: v_alignbit_b32 v16, v13, v16, 16 -; SI-NEXT: v_lshrrev_b32_e32 v46, 8, v19 -; SI-NEXT: v_lshrrev_b32_e32 v57, 8, v16 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 -; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; SI-NEXT: v_alignbit_b32 v6, v9, v6, 16 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v31 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v20, v18, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_alignbit_b32 v25, v20, v6, 24 -; SI-NEXT: v_alignbit_b32 v55, v20, v6, 16 -; SI-NEXT: v_alignbit_b32 v40, v20, v6, 8 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; SI-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 +; SI-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 ; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_alignbit_b32 v27, v23, v22, 16 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v9 -; SI-NEXT: v_alignbit_b32 v10, v10, v9, 16 -; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v20, v20, v19, 16 +; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_alignbit_b32 v23, v23, v22, 16 +; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; SI-NEXT: v_add_f32_e32 v15, 0x40c00000, v13 +; SI-NEXT: v_and_b32_e32 v13, 0xffff0000, v33 +; SI-NEXT: v_add_f32_e32 v18, 0x40c00000, v16 +; SI-NEXT: v_and_b32_e32 v16, 0xffff0000, v35 +; SI-NEXT: v_add_f32_e32 v33, 0x40c00000, v13 +; SI-NEXT: v_add_f32_e32 v31, 0x40c00000, v16 +; SI-NEXT: v_lshrrev_b32_e32 v13, 
16, v33 +; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v31 +; SI-NEXT: v_alignbit_b32 v15, v13, v15, 16 +; SI-NEXT: v_alignbit_b32 v18, v16, v18, 16 +; SI-NEXT: v_readfirstlane_b32 s27, v15 +; SI-NEXT: v_readfirstlane_b32 s21, v18 +; SI-NEXT: v_readfirstlane_b32 s14, v20 +; SI-NEXT: v_readfirstlane_b32 s8, v23 +; SI-NEXT: s_lshr_b64 s[18:19], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], 8 +; SI-NEXT: s_lshr_b64 s[24:25], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[28:29], s[26:27], 8 +; SI-NEXT: v_lshrrev_b32_e32 v28, 8, v18 +; SI-NEXT: v_lshrrev_b32_e32 v59, 8, v15 +; SI-NEXT: v_lshrrev_b32_e32 v31, 24, v31 +; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v33 ; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; SI-NEXT: v_add_f32_e32 v21, 0x40c00000, v19 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 -; SI-NEXT: v_add_f32_e32 v59, 0x40c00000, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v59 -; SI-NEXT: v_alignbit_b32 v30, v24, v22, 16 -; SI-NEXT: v_alignbit_b32 v22, v30, v27, 24 -; SI-NEXT: v_lshrrev_b32_e32 v62, 8, v30 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v30, v27, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v30, v27, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v19, v21, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v19, v21, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v19, v21, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v16, v15, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v16, v15, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xffff0000, v26 -; SI-NEXT: v_add_f32_e32 v47, 0x40c00000, v9 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v47 -; SI-NEXT: v_alignbit_b32 v11, v9, v11, 16 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 24 -; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v11 -; SI-NEXT: v_lshrrev_b32_e32 v63, 24, v47 -; SI-NEXT: v_lshrrev_b32_e32 v47, 8, v20 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v11, v10, 8 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v8, v2, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 
offset:120 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v5, v1, 24 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_alignbit_b32 v22, v5, v1, 16 -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v22, 24, v59 -; SI-NEXT: v_lshrrev_b32_e32 v59, 24, v45 -; SI-NEXT: v_lshrrev_b32_e32 v45, 24, v44 -; SI-NEXT: v_lshrrev_b32_e32 v44, 8, v5 +; SI-NEXT: v_and_b32_e32 v19, 0xffff0000, v29 +; SI-NEXT: v_add_f32_e32 v24, 0x40c00000, v22 +; SI-NEXT: v_and_b32_e32 v22, 0xffff0000, v26 +; SI-NEXT: v_add_f32_e32 v29, 0x40c00000, v19 +; SI-NEXT: v_add_f32_e32 v26, 0x40c00000, v22 +; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v26 +; SI-NEXT: v_alignbit_b32 v21, v19, v21, 16 +; SI-NEXT: v_alignbit_b32 v24, v22, v24, 16 +; SI-NEXT: v_readfirstlane_b32 s15, v21 +; SI-NEXT: v_readfirstlane_b32 s9, v24 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], 8 +; SI-NEXT: s_lshr_b64 s[8:9], s[14:15], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[16:17], s[14:15], 8 +; SI-NEXT: s_lshr_b64 s[14:15], s[20:21], 24 +; SI-NEXT: s_lshr_b64 s[20:21], s[26:27], 24 +; SI-NEXT: s_lshr_b64 s[26:27], s[42:43], 24 +; SI-NEXT: s_lshr_b64 s[42:43], s[56:57], 24 +; SI-NEXT: s_lshr_b64 s[56:57], s[62:63], 24 +; SI-NEXT: s_lshr_b64 s[62:63], s[76:77], 24 +; SI-NEXT: s_lshr_b64 s[76:77], s[76:77], 8 +; SI-NEXT: v_lshrrev_b32_e32 v27, 8, v24 +; SI-NEXT: v_lshrrev_b32_e32 v58, 8, v21 +; SI-NEXT: v_lshrrev_b32_e32 v30, 24, v26 +; SI-NEXT: v_lshrrev_b32_e32 v62, 24, v29 ; SI-NEXT: .LBB109_3: ; %end -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v27, 0xff, v27 -; SI-NEXT: v_and_b32_e32 v24, 0xff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 24, v22 -; SI-NEXT: v_or_b32_e32 v22, v22, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 -; SI-NEXT: v_and_b32_e32 v21, 0xff, v21 -; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_and_b32_e32 v15, 0xff, v15 -; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v6 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v36, 8, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_or_b32_e32 v27, v27, v36 -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v35, 0xff, v23 -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v33, 24, v23 -; SI-NEXT: v_or_b32_e32 v33, v33, v35 -; SI-NEXT: v_or_b32_e32 v27, v27, v33 -; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v30, 8, v62 -; SI-NEXT: v_or_b32_e32 v27, v27, v30 -; SI-NEXT: 
v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v22, v27, v22 -; SI-NEXT: buffer_store_dword v22, v24, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v22, 8, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v23 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v23 +; SI-NEXT: s_lshl_b32 s5, s10, 8 +; SI-NEXT: v_or_b32_e32 v23, s5, v23 +; SI-NEXT: s_and_b32 s5, s6, 0xff +; SI-NEXT: s_lshl_b32 s5, s5, 16 +; SI-NEXT: s_lshl_b32 s4, s4, 24 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: v_or_b32_e32 v23, s4, v23 +; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v23, 0xff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 8, v27 ; SI-NEXT: v_and_b32_e32 v22, 0xff, v22 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v20 +; SI-NEXT: s_lshl_b32 s4, s16, 8 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v24, 24, v30 +; SI-NEXT: v_or_b32_e32 v20, s4, v20 +; SI-NEXT: s_and_b32 s4, s12, 0xff +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v22, v24, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s8, 24 +; SI-NEXT: v_or_b32_e32 v22, v23, v22 +; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v20, s4, v20 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v46 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v41 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_or_b32_e32 v17, v21, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 +; SI-NEXT: v_and_b32_e32 v20, 0xff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 8, v58 +; SI-NEXT: v_and_b32_e32 v19, 0xff, v19 +; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 +; SI-NEXT: s_lshl_b32 s4, s22, 8 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v21, 24, v62 +; SI-NEXT: v_or_b32_e32 v17, s4, v17 +; SI-NEXT: s_and_b32 s4, s18, 0xff +; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_or_b32_e32 v19, v21, v19 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s14, 24 +; SI-NEXT: v_or_b32_e32 v19, v20, v19 +; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v17, s4, v17 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:68 ; 4-byte 
Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v19, 24, v19 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v17, 0xff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v19, v17 -; SI-NEXT: v_or_b32_e32 v15, v15, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 8, v28 +; SI-NEXT: v_and_b32_e32 v16, 0xff, v16 +; SI-NEXT: v_and_b32_e32 v14, 0xff, v14 +; SI-NEXT: s_lshl_b32 s4, s28, 8 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v18, 24, v31 +; SI-NEXT: v_or_b32_e32 v14, s4, v14 +; SI-NEXT: s_and_b32 s4, s24, 0xff +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v16, v18, v16 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s20, 24 +; SI-NEXT: v_or_b32_e32 v16, v17, v16 +; SI-NEXT: v_add_i32_e32 v17, vcc, 20, v0 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v14, s4, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 8, v57 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 24, v61 -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; SI-NEXT: v_or_b32_e32 v13, v16, v13 -; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v16, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v15 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v14, 0xff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 8, v59 ; SI-NEXT: v_and_b32_e32 v13, 0xff, v13 +; SI-NEXT: v_and_b32_e32 v11, 0xff, v11 +; SI-NEXT: s_lshl_b32 s4, s44, 8 +; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v15, 24, v63 +; SI-NEXT: v_or_b32_e32 v11, s4, v11 +; SI-NEXT: s_and_b32 s4, s40, 0xff +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v13, v15, v13 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s26, 24 +; SI-NEXT: v_or_b32_e32 v13, v14, v13 +; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v11, s4, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 8, v58 -; SI-NEXT: 
v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 24, v63 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_or_b32_e32 v9, v11, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v40 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v25 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 8, v47 +; SI-NEXT: v_and_b32_e32 v10, 0xff, v10 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v8 +; SI-NEXT: s_lshl_b32 s4, s58, 8 +; SI-NEXT: v_or_b32_e32 v11, v11, v12 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v12, 24, v56 +; SI-NEXT: v_or_b32_e32 v8, s4, v8 +; SI-NEXT: s_and_b32 s4, s46, 0xff +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v10, v12, v10 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s42, 24 +; SI-NEXT: v_or_b32_e32 v10, v11, v10 +; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v8, s4, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v47 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_and_b32_e32 v9, 0xff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 24, v59 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_or_b32_e32 v6, v6, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v6, v9, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v54 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v52 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v50 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v60 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_and_b32_e32 v6, 0xff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_and_b32_e32 v8, 0xff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v41 +; SI-NEXT: v_and_b32_e32 v7, 0xff, v7 +; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 +; SI-NEXT: s_lshl_b32 s4, s72, 8 +; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v9, 24, v45 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v4, s4, v4 +; SI-NEXT: s_and_b32 s4, s60, 0xff +; 
SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_or_b32_e32 v7, v9, v7 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s56, 24 +; SI-NEXT: v_or_b32_e32 v7, v8, v7 +; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v4, s4, v4 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v51 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v56 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v43 -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v4, 0xff, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v54 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_and_b32_e32 v5, 0xff, v6 +; SI-NEXT: v_and_b32_e32 v3, 0xff, v3 +; SI-NEXT: s_lshl_b32 s4, s76, 8 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v6, 24, v42 +; SI-NEXT: v_or_b32_e32 v3, s4, v3 +; SI-NEXT: s_and_b32 s4, s74, 0xff +; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: s_lshl_b32 s4, s4, 16 +; SI-NEXT: s_lshl_b32 s5, s62, 24 +; SI-NEXT: v_or_b32_e32 v4, v4, v5 +; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v3, s4, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v48 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v44 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 -; SI-NEXT: v_and_b32_e32 v2, 0xff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v42 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_or_b32_e32 v1, v1, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 24, v51 +; 
SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v0, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload @@ -87126,97 +86746,70 @@ define inreg <64 x i8> @bitcast_v32bf16_to_v64i8_scalar(<32 x bfloat> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: v_mov_b32_e32 v53, v32 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v49, v48 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v37, v33 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v33, v56 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v35, v29 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v34, v44 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v31, v23 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v29, v43 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v28, v26 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v26, v42 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: v_mov_b32_e32 v23, v41 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr23 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr22 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr20 +; SI-NEXT: ; implicit-def: $sgpr16 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr19 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr18 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr14 +; SI-NEXT: ; implicit-def: $sgpr28 +; SI-NEXT: ; implicit-def: $sgpr24 +; SI-NEXT: ; implicit-def: $sgpr20 +; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr61 -; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr26 +; SI-NEXT: ; implicit-def: $vgpr12 +; 
SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr42 ; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; kill: killed $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; kill: killed $vgpr4 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: s_branch .LBB109_2 ; ; VI-LABEL: bitcast_v32bf16_to_v64i8_scalar: @@ -92395,139 +91988,126 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; VI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; VI-NEXT: v_mov_b32_e32 v55, v20 -; VI-NEXT: v_mov_b32_e32 v53, v14 -; VI-NEXT: v_mov_b32_e32 v34, v12 -; VI-NEXT: v_mov_b32_e32 v32, v0 +; VI-NEXT: v_mov_b32_e32 v31, v30 +; VI-NEXT: v_mov_b32_e32 v38, v28 +; VI-NEXT: v_mov_b32_e32 v32, v26 +; VI-NEXT: v_mov_b32_e32 v30, v24 +; VI-NEXT: v_mov_b32_e32 v26, v22 +; VI-NEXT: v_mov_b32_e32 v49, v20 +; VI-NEXT: v_mov_b32_e32 v48, v14 +; VI-NEXT: v_mov_b32_e32 v39, v12 +; VI-NEXT: v_mov_b32_e32 v20, v10 +; VI-NEXT: v_mov_b32_e32 v35, v8 +; VI-NEXT: v_mov_b32_e32 v34, v6 +; VI-NEXT: v_mov_b32_e32 v36, v4 +; VI-NEXT: v_mov_b32_e32 v37, v0 ; VI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 -; VI-NEXT: buffer_load_ushort v31, off, s[0:3], s32 -; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_ushort v28, off, s[0:3], s32 +; VI-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 ; VI-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 -; VI-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; VI-NEXT: buffer_load_ushort v54, off, s[0:3], 
s32 offset:12 -; VI-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:24 +; VI-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; VI-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:20 -; VI-NEXT: buffer_load_ushort v20, off, s[0:3], s32 offset:32 +; VI-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 ; VI-NEXT: buffer_load_ushort v42, off, s[0:3], s32 offset:28 ; VI-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; VI-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 ; VI-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; VI-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:44 -; VI-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; VI-NEXT: buffer_load_ushort v22, off, s[0:3], s32 offset:56 ; VI-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 -; VI-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; VI-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:64 ; VI-NEXT: buffer_load_ushort v47, off, s[0:3], s32 offset:60 -; VI-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; VI-NEXT: buffer_load_ushort v24, off, s[0:3], s32 offset:72 ; VI-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:68 -; VI-NEXT: v_mov_b32_e32 v51, v23 -; VI-NEXT: v_mov_b32_e32 v30, v26 -; VI-NEXT: v_mov_b32_e32 v26, v22 ; VI-NEXT: v_lshlrev_b32_e32 v50, 8, v1 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v3 ; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v5 ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v7 -; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v9 -; VI-NEXT: v_lshlrev_b32_e32 v9, 8, v11 -; VI-NEXT: v_lshlrev_b32_e32 v11, 8, v13 -; VI-NEXT: v_lshlrev_b32_e32 v13, 8, v15 -; VI-NEXT: v_lshlrev_b32_e32 v15, 8, v17 -; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v19 -; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v21 -; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v51 -; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v25 +; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v9 +; VI-NEXT: v_lshlrev_b32_e32 v53, 8, v11 +; VI-NEXT: v_lshlrev_b32_e32 v55, 8, v13 +; VI-NEXT: v_lshlrev_b32_e32 v40, 8, v15 +; VI-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; VI-NEXT: v_lshlrev_b32_e32 v19, 8, v19 +; VI-NEXT: v_lshlrev_b32_e32 v21, 8, v21 +; VI-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; VI-NEXT: v_lshlrev_b32_e32 v25, 8, v25 ; VI-NEXT: v_lshlrev_b32_e32 v27, 8, v27 ; VI-NEXT: v_lshlrev_b32_e32 v29, 8, v29 -; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; VI-NEXT: s_waitcnt vmcnt(14) ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v31 -; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v33 +; VI-NEXT: v_lshlrev_b32_e32 v46, 8, v28 +; VI-NEXT: v_lshlrev_b32_e32 v56, 8, v4 ; VI-NEXT: s_and_b64 s[4:5], vcc, exec -; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v35 -; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v37 -; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v20 +; VI-NEXT: v_lshlrev_b32_e32 v58, 8, v6 +; VI-NEXT: v_lshlrev_b32_e32 v59, 8, v8 +; VI-NEXT: v_lshlrev_b32_e32 v60, 8, v10 ; VI-NEXT: s_waitcnt vmcnt(13) -; VI-NEXT: 
v_lshlrev_b32_e32 v33, 8, v12 +; VI-NEXT: v_lshlrev_b32_e32 v61, 8, v12 ; VI-NEXT: s_waitcnt vmcnt(11) -; VI-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; VI-NEXT: v_lshlrev_b32_e32 v62, 8, v14 ; VI-NEXT: s_waitcnt vmcnt(9) -; VI-NEXT: v_lshlrev_b32_e32 v38, 8, v39 +; VI-NEXT: v_lshlrev_b32_e32 v63, 8, v22 ; VI-NEXT: s_waitcnt vmcnt(7) -; VI-NEXT: v_lshlrev_b32_e32 v51, 8, v48 +; VI-NEXT: v_lshlrev_b32_e32 v33, 8, v33 ; VI-NEXT: s_waitcnt vmcnt(5) -; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v49 +; VI-NEXT: v_lshlrev_b32_e32 v22, 8, v24 ; VI-NEXT: s_cbranch_scc0 .LBB111_4 ; VI-NEXT: ; %bb.1: ; %cmp.false ; VI-NEXT: v_or_b32_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v36, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v36, v34 -; VI-NEXT: v_or_b32_sdwa v1, v34, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_or_b32_sdwa v0, v10, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v35, v6 -; VI-NEXT: v_or_b32_sdwa v2, v6, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v20, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v39, v55 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v53, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v16, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v49, v7 -; VI-NEXT: v_or_b32_sdwa v3, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v48, v40 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v7, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v18, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v55, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v37, v8 +; VI-NEXT: v_or_b32_sdwa v0, v18, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v49, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v8, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v26, v56 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v24, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v40, v9 +; VI-NEXT: v_or_b32_sdwa v0, v26, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v30, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v9, v0, v1 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v30, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v28, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v31, v10 +; VI-NEXT: v_or_b32_sdwa v0, v32, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v38, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v10, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v19, v13 -; VI-NEXT: s_and_b32 s4, s28, 0xff -; VI-NEXT: s_lshl_b32 s5, s29, 8 -; VI-NEXT: v_mov_b32_e32 v39, v14 -; VI-NEXT: s_or_b32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v21, v15 -; VI-NEXT: s_and_b32 s4, s4, 0xffff -; VI-NEXT: v_mov_b32_e32 v20, v5 -; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: s_lshl_b32 s5, s17, 8 -; VI-NEXT: s_lshl_b32 s6, s19, 8 -; VI-NEXT: s_lshl_b32 s7, s23, 8 -; VI-NEXT: s_lshl_b32 s8, s27, 8 -; VI-NEXT: v_mov_b32_e32 v25, v23 -; VI-NEXT: v_mov_b32_e32 v48, v51 -; VI-NEXT: v_mov_b32_e32 v23, v26 -; VI-NEXT: v_mov_b32_e32 v26, v30 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_sdwa v0, v34, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v31, v46 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v52, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v11, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v54, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v41, v62 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v54, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v41, v59 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v12, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v42, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v43, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v42, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v43, v61 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v13, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v44, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v1, v45, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v44, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v1, v45, v63 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s28, 0xff +; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: v_or_b32_sdwa v14, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: 
v_or_b32_sdwa v0, v47, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v47, v33 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_waitcnt vmcnt(4) ; VI-NEXT: v_or_b32_sdwa v1, v57, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: v_or_b32_sdwa v2, v34, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v3, v35, v51 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa v15, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_or_b32_sdwa v0, v32, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: v_or_b32_sdwa v0, v37, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v5, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v3, s4, v0 ; VI-NEXT: s_and_b32 s4, s16, 0xff +; VI-NEXT: s_lshl_b32 s5, s17, 8 ; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_and_b32 s5, s18, 0xff +; VI-NEXT: s_lshl_b32 s6, s19, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: s_lshl_b32 s5, s5, 16 @@ -92536,6 +92116,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s6, s21, 8 ; VI-NEXT: s_or_b32 s5, s5, s6 ; VI-NEXT: s_and_b32 s6, s22, 0xff +; VI-NEXT: s_lshl_b32 s7, s23, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_lshl_b32 s6, s6, 16 @@ -92544,70 +92125,75 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_lshl_b32 s7, s25, 8 ; VI-NEXT: s_or_b32 s6, s6, s7 ; VI-NEXT: s_and_b32 s7, s26, 0xff +; VI-NEXT: s_lshl_b32 s8, s27, 8 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: s_lshl_b32 s7, s7, 16 ; VI-NEXT: s_or_b32 s6, s6, s7 +; VI-NEXT: v_mov_b32_e32 v24, v36 +; VI-NEXT: v_mov_b32_e32 v28, v26 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v30, v34 ; VI-NEXT: s_cbranch_execnz .LBB111_3 ; VI-NEXT: .LBB111_2: ; %cmp.true +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v44 -; VI-NEXT: v_or_b32_sdwa v3, v39, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v43 -; VI-NEXT: v_or_b32_sdwa v13, v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v13, v61, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v42 -; VI-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v15, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v41 -; VI-NEXT: v_or_b32_sdwa v12, v62, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v12, v59, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v54 -; 
VI-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v39, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 -; VI-NEXT: v_or_b32_sdwa v11, v60, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 -; VI-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v30, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 -; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 -; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v26, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v24 -; VI-NEXT: v_or_b32_sdwa v9, v58, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v23 +; VI-NEXT: s_waitcnt vmcnt(5) ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v57 -; VI-NEXT: v_or_b32_sdwa v3, v56, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v0, v22, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v22, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v55 -; VI-NEXT: v_or_b32_sdwa v8, v46, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v52 +; VI-NEXT: v_or_b32_sdwa v11, v56, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v3, v46, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_mov_b32_e32 v26, v24 +; VI-NEXT: v_add_u32_e32 v24, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v38 +; VI-NEXT: v_or_b32_sdwa v10, v29, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v27, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v30 +; VI-NEXT: v_or_b32_sdwa v9, v25, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v28 +; VI-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v23, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v49 +; VI-NEXT: v_or_b32_sdwa v8, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v18 -; VI-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v18, vcc, 0x300, v3 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v16 -; VI-NEXT: v_or_b32_sdwa v7, v21, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v53 -; VI-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v36 -; VI-NEXT: v_or_b32_sdwa v6, v17, 
v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v31 +; VI-NEXT: v_or_b32_sdwa v7, v17, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v48 ; VI-NEXT: v_or_b32_sdwa v3, v40, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v16, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v39 +; VI-NEXT: v_or_b32_sdwa v6, v55, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v20 +; VI-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v17, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 -; VI-NEXT: v_or_b32_sdwa v5, v49, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v35 -; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; VI-NEXT: v_or_b32_sdwa v5, v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v34 ; VI-NEXT: s_add_i32 s28, s28, 3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v19, vcc, 0x300, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v26 ; VI-NEXT: s_and_b32 s4, s28, 0xff ; VI-NEXT: s_lshl_b32 s5, s29, 8 ; VI-NEXT: s_add_i32 s26, s26, 3 @@ -92635,12 +92221,11 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_and_b32 s10, s16, 0xff ; VI-NEXT: s_lshl_b32 s11, s17, 8 ; VI-NEXT: s_or_b32 s10, s11, s10 -; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; VI-NEXT: v_add_u32_e32 v1, vcc, 3, v47 ; VI-NEXT: s_addk_i32 s6, 0x300 ; VI-NEXT: s_addk_i32 s8, 0x300 ; VI-NEXT: s_addk_i32 s10, 0x300 -; VI-NEXT: v_or_b32_sdwa v1, v48, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v1, v33, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v45 ; VI-NEXT: s_addk_i32 s4, 0x300 ; VI-NEXT: s_lshl_b32 s5, s5, 16 @@ -92650,7 +92235,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_and_b32 s8, s8, 0xffff ; VI-NEXT: s_and_b32 s6, s6, 0xffff ; VI-NEXT: v_add_u32_e32 v1, vcc, 0x300, v1 -; VI-NEXT: v_or_b32_sdwa v2, v38, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; VI-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: s_or_b32 s9, s9, s10 ; VI-NEXT: s_or_b32 s7, s7, s8 ; VI-NEXT: s_or_b32 s5, s5, s6 @@ -92662,10 +92247,10 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: v_or_b32_sdwa v6, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v7, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v8, v8, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; 
VI-NEXT: v_or_b32_sdwa v9, v9, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v10, v10, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v11, v11, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; VI-NEXT: v_or_b32_sdwa v12, v12, v39 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v9, v9, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v10, v10, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v11, v11, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_or_b32_sdwa v12, v12, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v13, v13, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v2, v2, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -92675,12 +92260,11 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: v_add_u32_e32 v8, vcc, 0x3000000, v8 ; VI-NEXT: v_add_u32_e32 v9, vcc, 0x3000000, v9 ; VI-NEXT: v_add_u32_e32 v10, vcc, 0x3000000, v10 -; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 -; VI-NEXT: s_waitcnt vmcnt(2) -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 -; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; VI-NEXT: v_add_u32_e32 v11, vcc, 0x3000000, v11 ; VI-NEXT: v_add_u32_e32 v12, vcc, 0x3000000, v12 ; VI-NEXT: v_add_u32_e32 v13, vcc, 0x3000000, v13 ; VI-NEXT: v_add_u32_e32 v14, vcc, 0x3000000, v2 @@ -92688,11 +92272,12 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: v_mov_b32_e32 v0, s9 ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s5 -; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v3 +; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u32_e32 v20, vcc, 0x300, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v32 +; VI-NEXT: v_add_u32_e32 v3, vcc, 3, v37 ; VI-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_e32 v3, s4, v3 ; VI-NEXT: v_or_b32_sdwa v4, v4, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 @@ -92718,22 +92303,8 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; VI-NEXT: .LBB111_4: -; VI-NEXT: v_mov_b32_e32 v25, v23 -; VI-NEXT: v_mov_b32_e32 v23, v26 -; VI-NEXT: v_mov_b32_e32 v26, v30 -; VI-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; VI-NEXT: v_mov_b32_e32 v48, v51 -; VI-NEXT: v_mov_b32_e32 v31, v10 -; VI-NEXT: v_mov_b32_e32 v36, v34 -; VI-NEXT: v_mov_b32_e32 v35, v6 -; VI-NEXT: v_mov_b32_e32 v37, v8 -; VI-NEXT: v_mov_b32_e32 v39, 
v14 -; VI-NEXT: v_mov_b32_e32 v21, v15 -; VI-NEXT: v_mov_b32_e32 v19, v13 -; VI-NEXT: v_mov_b32_e32 v17, v11 -; VI-NEXT: v_mov_b32_e32 v40, v9 -; VI-NEXT: v_mov_b32_e32 v49, v7 -; VI-NEXT: v_mov_b32_e32 v20, v5 +; VI-NEXT: v_mov_b32_e32 v24, v36 +; VI-NEXT: v_mov_b32_e32 v28, v26 ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; VI-NEXT: s_branch .LBB111_2 ; @@ -92756,244 +92327,228 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v34, v30 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_load_ushort v32, off, s[0:3], s32 -; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:8 +; GFX9-NEXT: v_mov_b32_e32 v48, v30 +; GFX9-NEXT: v_mov_b32_e32 v33, v28 +; GFX9-NEXT: v_mov_b32_e32 v37, v26 +; GFX9-NEXT: v_mov_b32_e32 v34, v24 +; GFX9-NEXT: v_mov_b32_e32 v32, v22 +; GFX9-NEXT: v_mov_b32_e32 v30, v20 +; GFX9-NEXT: v_mov_b32_e32 v49, v14 +; GFX9-NEXT: v_mov_b32_e32 v22, v12 +; GFX9-NEXT: v_mov_b32_e32 v39, v10 +; GFX9-NEXT: v_mov_b32_e32 v35, v8 +; GFX9-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-NEXT: v_mov_b32_e32 v28, v4 +; GFX9-NEXT: v_mov_b32_e32 v26, v2 +; GFX9-NEXT: v_mov_b32_e32 v24, v0 +; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_load_ushort v2, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_ushort v4, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_ushort v52, off, s[0:3], s32 offset:4 -; GFX9-NEXT: buffer_load_ushort v30, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_ushort v6, off, s[0:3], s32 offset:16 ; GFX9-NEXT: buffer_load_ushort v41, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_ushort v8, off, s[0:3], s32 offset:24 ; GFX9-NEXT: buffer_load_ushort v40, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_ushort v33, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_ushort v10, off, s[0:3], s32 offset:32 ; GFX9-NEXT: buffer_load_ushort v44, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_load_ushort v37, off, s[0:3], s32 offset:40 +; GFX9-NEXT: buffer_load_ushort v12, off, s[0:3], s32 offset:40 ; GFX9-NEXT: buffer_load_ushort v43, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_ushort v35, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_ushort v14, off, s[0:3], s32 offset:48 ; GFX9-NEXT: buffer_load_ushort v46, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_load_ushort v39, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_load_ushort v36, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_ushort v45, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_load_ushort v48, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_ushort v38, off, s[0:3], s32 offset:64 ; GFX9-NEXT: buffer_load_ushort v57, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_ushort v49, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_load_ushort v31, off, s[0:3], s32 offset:72 ; GFX9-NEXT: buffer_load_ushort v56, off, s[0:3], s32 offset:68 -; GFX9-NEXT: v_mov_b32_e32 v51, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v50, 8, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v7 ; GFX9-NEXT: 
v_lshlrev_b32_e32 v5, 8, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 8, v13 -; GFX9-NEXT: v_lshlrev_b32_e32 v13, 8, v17 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX9-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v19 -; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v21 -; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v23 -; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v51 -; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v27 -; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v29 -; GFX9-NEXT: s_waitcnt vmcnt(19) -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v36 -; GFX9-NEXT: s_waitcnt vmcnt(18) -; GFX9-NEXT: v_lshlrev_b32_e32 v61, 8, v32 -; GFX9-NEXT: s_waitcnt vmcnt(17) -; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v38 +; GFX9-NEXT: v_lshlrev_b32_e32 v54, 8, v11 +; GFX9-NEXT: v_lshlrev_b32_e32 v53, 8, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v55, 8, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v42, 8, v19 +; GFX9-NEXT: v_lshlrev_b32_e32 v19, 8, v21 +; GFX9-NEXT: v_lshlrev_b32_e32 v23, 8, v23 +; GFX9-NEXT: v_lshlrev_b32_e32 v21, 8, v25 +; GFX9-NEXT: v_lshlrev_b32_e32 v27, 8, v27 +; GFX9-NEXT: v_lshlrev_b32_e32 v25, 8, v29 +; GFX9-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(22) +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(21) +; GFX9-NEXT: v_lshlrev_b32_e32 v47, 8, v2 +; GFX9-NEXT: s_waitcnt vmcnt(20) +; GFX9-NEXT: v_lshlrev_b32_e32 v29, 8, v4 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v30 -; GFX9-NEXT: s_waitcnt vmcnt(13) -; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v31 -; GFX9-NEXT: s_waitcnt vmcnt(11) -; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v33 -; GFX9-NEXT: s_waitcnt vmcnt(9) -; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v37 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_lshlrev_b32_e32 v31, 8, v35 -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_lshlrev_b32_e32 v51, 8, v39 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v48 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshlrev_b32_e32 v17, 8, v49 -; GFX9-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-NEXT: s_waitcnt vmcnt(18) +; GFX9-NEXT: v_lshlrev_b32_e32 v59, 8, v6 +; GFX9-NEXT: s_waitcnt vmcnt(16) +; GFX9-NEXT: v_lshlrev_b32_e32 v58, 8, v8 +; GFX9-NEXT: s_waitcnt vmcnt(14) +; GFX9-NEXT: 
v_lshlrev_b32_e32 v61, 8, v10 +; GFX9-NEXT: s_waitcnt vmcnt(12) +; GFX9-NEXT: v_lshlrev_b32_e32 v60, 8, v12 +; GFX9-NEXT: s_waitcnt vmcnt(10) +; GFX9-NEXT: v_lshlrev_b32_e32 v63, 8, v14 +; GFX9-NEXT: s_waitcnt vmcnt(8) +; GFX9-NEXT: v_lshlrev_b32_e32 v62, 8, v36 +; GFX9-NEXT: s_waitcnt vmcnt(6) +; GFX9-NEXT: v_lshlrev_b32_e32 v38, 8, v38 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_lshlrev_b32_e32 v36, 8, v31 ; GFX9-NEXT: s_cbranch_scc0 .LBB111_4 ; GFX9-NEXT: ; %bb.1: ; %cmp.false ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 ; GFX9-NEXT: s_or_b32 s4, s4, s5 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v24, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_or_b32_sdwa v2, v2, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v50, v3 -; GFX9-NEXT: v_or_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v4, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v3, v0, 16, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v20, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v35, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_mov_b32_e32 v31, v5 ; GFX9-NEXT: v_lshl_or_b32 v5, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v10, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v39, v54 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v12, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v22, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v6, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v14, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v49, v55 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v16, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v16, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v7, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v18, v42 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v32, v16 +; GFX9-NEXT: v_or_b32_sdwa v1, v30, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v8, v1, 16, v0 -; GFX9-NEXT: v_mov_b32_e32 v16, v22 -; GFX9-NEXT: v_or_b32_sdwa v0, v22, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v37, v24 -; GFX9-NEXT: v_or_b32_sdwa v1, v24, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v0, v32, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 +; GFX9-NEXT: v_or_b32_sdwa v1, v34, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v9, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v26, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v37, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v28, v53 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_mov_b32_e32 v48, v10 +; GFX9-NEXT: v_or_b32_sdwa v1, v33, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v10, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v34, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v48, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_and_b32 s4, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s17, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v52, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v52, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s19, 8 -; GFX9-NEXT: v_mov_b32_e32 v55, v11 ; GFX9-NEXT: v_lshl_or_b32 v11, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v41, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v41, v59 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v40, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v40, v58 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_and_b32 s5, s20, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s21, 8 -; GFX9-NEXT: v_mov_b32_e32 v33, v12 ; GFX9-NEXT: v_lshl_or_b32 v12, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v44, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v44, v61 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s22, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s23, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v43, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v43, v60 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 -; GFX9-NEXT: v_mov_b32_e32 v19, v13 ; GFX9-NEXT: v_lshl_or_b32 v13, v1, 16, v0 -; GFX9-NEXT: v_or_b32_sdwa v0, v46, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v0, v46, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 
src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_and_b32 s6, s24, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s25, 8 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_or_b32_sdwa v1, v45, v51 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v45, v62 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_and_b32 s7, s26, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s27, 8 -; GFX9-NEXT: v_mov_b32_e32 v29, v14 +; GFX9-NEXT: v_or_b32_sdwa v2, v26, v50 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v14, v1, 16, v0 +; GFX9-NEXT: v_or_b32_sdwa v0, v57, v38 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_or_b32_sdwa v1, v56, v36 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 ; GFX9-NEXT: v_lshl_or_b32 v4, v4, 16, v2 -; GFX9-NEXT: v_mov_b32_e32 v42, v15 -; GFX9-NEXT: v_mov_b32_e32 v27, v25 -; GFX9-NEXT: v_mov_b32_e32 v30, v18 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: v_mov_b32_e32 v39, v26 -; GFX9-NEXT: v_mov_b32_e32 v35, v28 -; GFX9-NEXT: v_mov_b32_e32 v54, v31 -; GFX9-NEXT: v_mov_b32_e32 v31, v51 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v0, v57, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_mov_b32_e32 v18, v22 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v1, v56, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v20, v24 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: s_cbranch_execnz .LBB111_3 ; GFX9-NEXT: .LBB111_2: ; %cmp.true ; GFX9-NEXT: v_add_u32_e32 v3, 3, v45 -; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v14, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v44 -; GFX9-NEXT: v_or_b32_sdwa v3, v38, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v13, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v43 -; GFX9-NEXT: v_or_b32_sdwa v3, v36, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v15, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v41 -; GFX9-NEXT: v_or_b32_sdwa v3, v63, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v12, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v40 -; GFX9-NEXT: v_or_b32_sdwa v3, v62, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v36, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 -; GFX9-NEXT: v_or_b32_sdwa v3, 
v61, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v11, 0x300, v3 ; GFX9-NEXT: v_add_u32_e32 v3, 3, v52 -; GFX9-NEXT: v_or_b32_sdwa v3, v60, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v24, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 -; GFX9-NEXT: v_or_b32_sdwa v3, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v29, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v29, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 +; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v10, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 -; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_or_b32_sdwa v3, v25, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v25, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 -; GFX9-NEXT: v_or_b32_sdwa v3, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v9, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v37 -; GFX9-NEXT: v_or_b32_sdwa v3, v47, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v34 +; GFX9-NEXT: v_or_b32_sdwa v3, v21, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v21, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 -; GFX9-NEXT: v_or_b32_sdwa v3, v27, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 -; GFX9-NEXT: v_or_b32_sdwa v3, v23, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v29 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v18 ; GFX9-NEXT: v_or_b32_sdwa v3, v42, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v32 +; GFX9-NEXT: v_add_u32_e32 v8, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v30 ; GFX9-NEXT: v_or_b32_sdwa v3, v19, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v23, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v48 +; GFX9-NEXT: v_add_u32_e32 v18, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v49 ; GFX9-NEXT: v_or_b32_sdwa v3, v55, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 3, v33 +; GFX9-NEXT: v_add_u32_e32 v7, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v16 ; GFX9-NEXT: v_or_b32_sdwa v3, v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v16, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v39 +; GFX9-NEXT: 
v_or_b32_sdwa v3, v54, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v6, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v22 +; GFX9-NEXT: v_or_b32_sdwa v3, v53, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v17, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(15) -; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 -; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_or_b32_sdwa v1, v20, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_or_b32_sdwa v0, v18, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v20 +; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-NEXT: v_or_b32_sdwa v3, v51, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v35 +; GFX9-NEXT: v_or_b32_sdwa v3, v31, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v26 +; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 3, v28 ; GFX9-NEXT: s_add_i32 s28, s28, 3 ; GFX9-NEXT: s_and_b32 s4, s28, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s29, 8 @@ -93018,13 +92573,18 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: s_and_b32 s9, s16, 0xff ; GFX9-NEXT: s_lshl_b32 s10, s17, 8 ; GFX9-NEXT: s_add_i32 s18, s18, 3 +; GFX9-NEXT: v_add_u32_e32 v0, 3, v57 ; GFX9-NEXT: v_add_u32_e32 v2, 3, v46 ; GFX9-NEXT: s_or_b32 s9, s10, s9 ; GFX9-NEXT: s_and_b32 s10, s18, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s19, 8 -; GFX9-NEXT: v_or_b32_sdwa v2, v54, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v0, v38, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_add_u32_e32 v1, 3, v56 +; GFX9-NEXT: v_or_b32_sdwa v2, v63, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_or_b32 s10, s11, s10 ; GFX9-NEXT: v_add_u32_e32 v0, 0x300, v0 +; GFX9-NEXT: v_or_b32_sdwa v1, v36, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u32_e32 v2, 0x300, v2 ; GFX9-NEXT: s_addk_i32 s4, 0x300 ; GFX9-NEXT: s_addk_i32 s5, 0x300 @@ -93033,69 +92593,48 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: s_addk_i32 s8, 0x300 ; GFX9-NEXT: s_addk_i32 s9, 0x300 ; GFX9-NEXT: s_addk_i32 s10, 0x300 -; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff ; GFX9-NEXT: v_add_u32_e32 v1, 0x300, v1 ; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX9-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-NEXT: 
v_and_b32_e32 v10, 0xffff, v10 ; GFX9-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; GFX9-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; GFX9-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 ; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6 -; GFX9-NEXT: v_lshl_or_b32 v7, v23, 16, v7 -; GFX9-NEXT: v_lshl_or_b32 v8, v16, 16, v8 +; GFX9-NEXT: v_lshl_or_b32 v7, v16, 16, v7 +; GFX9-NEXT: v_lshl_or_b32 v8, v18, 16, v8 ; GFX9-NEXT: v_lshl_or_b32 v9, v21, 16, v9 ; GFX9-NEXT: v_lshl_or_b32 v10, v25, 16, v10 -; GFX9-NEXT: v_lshl_or_b32 v11, v24, 16, v11 -; GFX9-NEXT: v_lshl_or_b32 v12, v36, 16, v12 +; GFX9-NEXT: v_lshl_or_b32 v11, v29, 16, v11 +; GFX9-NEXT: v_lshl_or_b32 v12, v24, 16, v12 ; GFX9-NEXT: v_lshl_or_b32 v13, v15, 16, v13 ; GFX9-NEXT: v_lshl_or_b32 v14, v14, 16, v2 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_or_b32_sdwa v3, v20, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: v_lshl_or_b32 v15, v1, 16, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v5, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v19, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v5, v19, 16, v5 -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v4, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v50, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: v_add_u32_e32 v20, 0x300, v3 -; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-NEXT: v_lshl_or_b32 v4, v20, 16, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v3, 3, v3 -; GFX9-NEXT: v_or_b32_sdwa v3, v18, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_or_b32_sdwa v3, v22, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: v_mov_b32_e32 v22, 0xffff ; GFX9-NEXT: v_add_u32_e32 v3, 0x300, v3 +; GFX9-NEXT: v_and_b32_e32 v22, s4, v22 ; GFX9-NEXT: v_lshl_or_b32 v3, v3, 16, v22 ; GFX9-NEXT: 
.LBB111_3: ; %end ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload @@ -93117,27 +92656,7 @@ define inreg <32 x bfloat> @bitcast_v64i8_to_v32bf16_scalar(<64 x i8> inreg %a, ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: .LBB111_4: -; GFX9-NEXT: v_mov_b32_e32 v30, v18 -; GFX9-NEXT: v_mov_b32_e32 v49, v20 -; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-NEXT: v_mov_b32_e32 v54, v31 -; GFX9-NEXT: v_mov_b32_e32 v29, v14 -; GFX9-NEXT: v_mov_b32_e32 v48, v10 -; GFX9-NEXT: v_mov_b32_e32 v39, v26 -; GFX9-NEXT: v_mov_b32_e32 v32, v16 -; GFX9-NEXT: v_mov_b32_e32 v16, v22 -; GFX9-NEXT: v_mov_b32_e32 v33, v12 -; GFX9-NEXT: v_mov_b32_e32 v35, v28 -; GFX9-NEXT: v_mov_b32_e32 v37, v24 -; GFX9-NEXT: v_mov_b32_e32 v31, v51 -; GFX9-NEXT: v_mov_b32_e32 v27, v25 -; GFX9-NEXT: v_mov_b32_e32 v23, v21 -; GFX9-NEXT: v_mov_b32_e32 v42, v15 -; GFX9-NEXT: v_mov_b32_e32 v19, v13 -; GFX9-NEXT: v_mov_b32_e32 v55, v11 -; GFX9-NEXT: v_mov_b32_e32 v17, v9 -; GFX9-NEXT: v_mov_b32_e32 v50, v3 +; GFX9-NEXT: v_mov_b32_e32 v31, v5 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX9-NEXT: s_branch .LBB111_2 ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll index e66762f1e02c2..a1c0a87b65e02 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll @@ -2760,216 +2760,214 @@ define inreg <36 x i16> @bitcast_v18i32_to_v36i16_scalar(<18 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: v_readfirstlane_b32 s9, v1 -; SI-NEXT: v_readfirstlane_b32 s8, v2 -; SI-NEXT: v_readfirstlane_b32 s7, v3 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: v_readfirstlane_b32 s7, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: s_and_b64 s[8:9], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v4 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s28 -; SI-NEXT: v_mov_b32_e32 v4, s26 -; SI-NEXT: v_mov_b32_e32 v5, s24 -; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s20 -; SI-NEXT: v_mov_b32_e32 v8, s18 -; SI-NEXT: v_mov_b32_e32 v9, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s29, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s27, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s25, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s23, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s21, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s19, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s17, v9, 16 -; SI-NEXT: s_lshr_b32 s10, s6, 16 -; SI-NEXT: s_lshr_b32 s11, s8, 16 -; SI-NEXT: s_lshr_b32 s12, s29, 16 -; SI-NEXT: s_lshr_b32 s13, s27, 16 -; SI-NEXT: s_lshr_b32 s14, s25, 16 -; SI-NEXT: s_lshr_b32 s15, s23, 16 -; SI-NEXT: s_lshr_b32 s40, s21, 16 -; SI-NEXT: s_lshr_b32 s41, s19, 16 -; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_lshr_b32 s60, s5, 16 +; SI-NEXT: s_lshr_b32 s61, s7, 16 +; SI-NEXT: s_lshr_b32 s62, s29, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_lshr_b32 s72, s25, 16 +; SI-NEXT: s_lshr_b32 s73, s23, 16 +; 
SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s8, s8, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s28 -; SI-NEXT: v_mov_b32_e32 v4, s26 -; SI-NEXT: v_mov_b32_e32 v5, s24 -; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s20 -; SI-NEXT: v_mov_b32_e32 v8, s18 -; SI-NEXT: v_mov_b32_e32 v9, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s29, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s27, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s25, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s23, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s21, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s19, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s17, v9, 16 -; SI-NEXT: s_lshr_b32 s10, s6, 16 -; SI-NEXT: s_lshr_b32 s11, s8, 16 -; SI-NEXT: s_lshr_b32 s12, s29, 16 -; SI-NEXT: s_lshr_b32 s13, s27, 16 -; SI-NEXT: s_lshr_b32 s14, s25, 16 -; SI-NEXT: s_lshr_b32 s15, s23, 16 -; SI-NEXT: s_lshr_b32 s40, s21, 16 -; SI-NEXT: s_lshr_b32 s41, s19, 16 -; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s60, s5, 16 +; SI-NEXT: s_lshr_b32 s61, s7, 16 +; SI-NEXT: s_lshr_b32 s62, s29, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_lshr_b32 s72, s25, 16 +; SI-NEXT: s_lshr_b32 s73, s23, 16 +; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s42, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: 
v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s41, 16 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshl_b32 s9, s56, 16 +; SI-NEXT: s_and_b32 s11, s16, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: s_and_b32 s9, s17, 0xffff +; SI-NEXT: s_lshl_b32 s11, s76, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s46, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; SI-NEXT: s_lshl_b32 s5, s40, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s15, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s14, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s11, s75, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s13, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s20, 0xffff +; SI-NEXT: s_lshl_b32 s11, s44, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 
44, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s12, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s11, s74, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s11, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s42, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s10, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s23, 0xffff +; SI-NEXT: s_lshl_b32 s11, s73, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s40, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s25, 0xffff +; SI-NEXT: s_lshl_b32 s11, s72, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s27, 0xffff +; SI-NEXT: s_lshl_b32 s11, s63, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s12, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s29, 0xffff +; SI-NEXT: s_lshl_b32 s11, s62, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s9 
+; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s9, s10, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s6, s6, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s61, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v18i32_to_v36i16_scalar: @@ -9746,207 +9744,207 @@ define inreg <36 x i16> @bitcast_v18f32_to_v36i16_scalar(<18 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: v_mov_b32_e32 v19, s16 -; SI-NEXT: v_mov_b32_e32 v18, s17 +; SI-NEXT: v_mov_b32_e32 v18, s16 +; SI-NEXT: v_mov_b32_e32 v19, s17 ; SI-NEXT: v_mov_b32_e32 v16, s18 -; SI-NEXT: v_mov_b32_e32 v15, s19 +; SI-NEXT: v_mov_b32_e32 v17, s19 ; SI-NEXT: v_mov_b32_e32 v14, s20 -; SI-NEXT: v_mov_b32_e32 v13, s21 +; SI-NEXT: v_mov_b32_e32 v15, s21 ; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_mov_b32_e32 v13, s23 ; SI-NEXT: v_mov_b32_e32 v10, s24 -; SI-NEXT: v_mov_b32_e32 v9, s25 +; SI-NEXT: v_mov_b32_e32 v11, s25 ; SI-NEXT: v_mov_b32_e32 v8, s26 -; SI-NEXT: v_mov_b32_e32 v7, s27 +; SI-NEXT: v_mov_b32_e32 v9, s27 ; SI-NEXT: v_mov_b32_e32 v6, s28 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v5, s29 +; SI-NEXT: v_mov_b32_e32 v7, s29 ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; 
%cmp.false -; SI-NEXT: v_alignbit_b32 v17, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v20, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v21, v5, v6, 16 -; SI-NEXT: v_alignbit_b32 v22, v7, v8, 16 -; SI-NEXT: v_alignbit_b32 v25, v9, v10, 16 -; SI-NEXT: v_alignbit_b32 v27, v11, v12, 16 -; SI-NEXT: v_alignbit_b32 v29, v13, v14, 16 -; SI-NEXT: v_alignbit_b32 v31, v15, v16, 16 -; SI-NEXT: v_alignbit_b32 v33, v18, v19, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_lshr_b64 v[20:21], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 ; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 ; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_alignbit_b32 v17, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v20, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v21, v5, v6, 16 -; SI-NEXT: v_alignbit_b32 v22, v7, v8, 16 -; SI-NEXT: v_alignbit_b32 v25, v9, v10, 16 -; SI-NEXT: v_alignbit_b32 v27, v11, v12, 16 -; SI-NEXT: v_alignbit_b32 v29, v13, v14, 16 -; SI-NEXT: v_alignbit_b32 v31, v15, v16, 16 -; SI-NEXT: v_alignbit_b32 v33, v18, v19, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v11 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[20:21], v[3:4], 16 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_lshr_b64 v[21:22], v[1:2], 16 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_lshr_b64 v[23:24], v[8:9], 16 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, 
v15 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_lshr_b64 v[25:26], v[12:13], 16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_lshr_b64 v[27:28], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 ; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_or_b32_e32 v19, v19, v33 -; SI-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_or_b32_e32 v18, v18, v28 +; SI-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v37 ; SI-NEXT: v_or_b32_e32 v18, v18, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v16, v16, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v16, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v35 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_or_b32_e32 v14, v14, v16 +; SI-NEXT: v_add_i32_e32 v16, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 -; SI-NEXT: v_add_i32_e32 v15, vcc, 16, v0 +; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v34 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 +; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v12, v14, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; SI-NEXT: 
v_lshlrev_b32_e32 v13, 16, v34 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 +; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v32 -; SI-NEXT: v_or_b32_e32 v11, v11, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v24 +; SI-NEXT: v_or_b32_e32 v10, v10, v12 +; SI-NEXT: v_add_i32_e32 v12, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 +; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v30 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 +; SI-NEXT: v_or_b32_e32 v8, v8, v10 +; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v28 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 +; SI-NEXT: v_or_b32_e32 v6, v6, v8 +; SI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v31 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 +; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v26 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v20 -; SI-NEXT: v_or_b32_e32 v1, v1, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 ; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr17 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v18f32_to_v36i16_scalar: @@ -15972,216 +15970,214 @@ define inreg <36 x i16> @bitcast_v9i64_to_v36i16_scalar(<9 x i64> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; SI-NEXT: v_readfirstlane_b32 s9, v1 -; SI-NEXT: v_readfirstlane_b32 s8, v2 -; SI-NEXT: v_readfirstlane_b32 s7, v3 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v4 +; SI-NEXT: v_readfirstlane_b32 s6, v1 +; SI-NEXT: v_readfirstlane_b32 s7, v2 +; SI-NEXT: v_readfirstlane_b32 s4, v3 +; SI-NEXT: s_and_b64 s[8:9], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v4 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s28 -; SI-NEXT: v_mov_b32_e32 v4, s26 -; SI-NEXT: v_mov_b32_e32 v5, s24 -; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s20 -; SI-NEXT: v_mov_b32_e32 v8, s18 -; SI-NEXT: v_mov_b32_e32 v9, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s29, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s27, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s25, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s23, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s21, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s19, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s17, v9, 16 -; SI-NEXT: s_lshr_b32 s10, s6, 16 -; SI-NEXT: s_lshr_b32 s11, s8, 16 -; SI-NEXT: s_lshr_b32 s12, s29, 16 -; SI-NEXT: s_lshr_b32 s13, s27, 16 -; SI-NEXT: s_lshr_b32 s14, s25, 16 -; SI-NEXT: s_lshr_b32 s15, s23, 16 -; SI-NEXT: s_lshr_b32 s40, s21, 16 -; SI-NEXT: s_lshr_b32 s41, s19, 16 -; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: 
s_lshr_b32 s60, s5, 16 +; SI-NEXT: s_lshr_b32 s61, s7, 16 +; SI-NEXT: s_lshr_b32 s62, s29, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_lshr_b32 s72, s25, 16 +; SI-NEXT: s_lshr_b32 s73, s23, 16 +; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 ; SI-NEXT: s_add_u32 s28, s28, 3 ; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s9, s9, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s6, s6, 0 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s28 -; SI-NEXT: v_mov_b32_e32 v4, s26 -; SI-NEXT: v_mov_b32_e32 v5, s24 -; SI-NEXT: v_mov_b32_e32 v6, s22 -; SI-NEXT: v_mov_b32_e32 v7, s20 -; SI-NEXT: v_mov_b32_e32 v8, s18 -; SI-NEXT: v_mov_b32_e32 v9, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s29, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s27, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s25, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s23, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s21, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s19, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s17, v9, 16 -; SI-NEXT: s_lshr_b32 s10, s6, 16 -; SI-NEXT: s_lshr_b32 s11, s8, 16 -; SI-NEXT: s_lshr_b32 s12, s29, 16 -; SI-NEXT: s_lshr_b32 s13, s27, 16 -; SI-NEXT: s_lshr_b32 s14, s25, 16 -; SI-NEXT: s_lshr_b32 s15, s23, 16 -; SI-NEXT: s_lshr_b32 s40, s21, 16 -; SI-NEXT: s_lshr_b32 s41, s19, 16 -; SI-NEXT: s_lshr_b32 s42, s17, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s60, s5, 16 +; SI-NEXT: s_lshr_b32 s61, s7, 16 +; SI-NEXT: s_lshr_b32 s62, s29, 16 +; SI-NEXT: s_lshr_b32 s63, s27, 16 +; SI-NEXT: s_lshr_b32 s72, s25, 16 +; SI-NEXT: s_lshr_b32 s73, s23, 16 +; SI-NEXT: s_lshr_b32 s74, s21, 16 +; SI-NEXT: s_lshr_b32 s75, s19, 16 +; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[22:23], 16 +; 
SI-NEXT: s_lshr_b64 s[44:45], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[16:17], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s42, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s41, 16 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: s_lshl_b32 s9, s56, 16 +; SI-NEXT: s_and_b32 s11, s16, 0xffff +; SI-NEXT: s_or_b32 s9, s11, s9 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: s_and_b32 s9, s17, 0xffff +; SI-NEXT: s_lshl_b32 s11, s76, 16 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s18, 0xffff +; SI-NEXT: s_lshl_b32 s11, s46, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; SI-NEXT: s_lshl_b32 s5, s40, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v8, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s15, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s14, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_lshl_b32 s11, s75, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s13, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; 
SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s20, 0xffff +; SI-NEXT: s_lshl_b32 s11, s44, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s12, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s21, 0xffff +; SI-NEXT: s_lshl_b32 s11, s74, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s11, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s22, 0xffff +; SI-NEXT: s_lshl_b32 s11, s42, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s10, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s23, 0xffff +; SI-NEXT: s_lshl_b32 s11, s73, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s24, 0xffff +; SI-NEXT: s_lshl_b32 s11, s40, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s25, 0xffff +; SI-NEXT: s_lshl_b32 s11, s72, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s26, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s27, 0xffff +; SI-NEXT: s_lshl_b32 s11, s63, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s28, 0xffff +; SI-NEXT: s_lshl_b32 s11, s12, 
16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s9, s29, 0xffff +; SI-NEXT: s_lshl_b32 s11, s62, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s9, s9, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s9 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s9, s10, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s6, s6, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s61, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s8, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s60, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr75 +; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr73 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr63 ; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr62 ; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr61 +; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr60 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v9i64_to_v36i16_scalar: @@ -21460,97 +21456,97 @@ define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i ; SI-NEXT: v_mov_b32_e32 v7, s29 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v5, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v20, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v21, v7, v6, 16 -; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 -; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v29, v15, v14, 16 -; SI-NEXT: v_alignbit_b32 v31, v17, v16, 16 -; SI-NEXT: v_alignbit_b32 v34, v19, v18, 16 
-; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_lshr_b64 v[20:21], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[16:17], 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 -; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 -; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 -; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 -; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 -; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 -; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_alignbit_b32 v5, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v20, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v21, v7, v6, 16 -; SI-NEXT: v_alignbit_b32 v22, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v24, v11, v10, 16 -; SI-NEXT: v_alignbit_b32 v26, v13, v12, 16 -; SI-NEXT: v_alignbit_b32 v29, v15, v14, 16 -; SI-NEXT: v_alignbit_b32 v31, v17, v16, 16 -; SI-NEXT: v_alignbit_b32 v34, v19, v18, 16 -; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v19 +; SI-NEXT: v_add_f64 v[6:7], v[6:7], 1.0 +; SI-NEXT: v_lshr_b64 v[20:21], v[3:4], 16 +; SI-NEXT: v_add_f64 v[8:9], v[8:9], 1.0 +; SI-NEXT: v_lshr_b64 v[21:22], v[1:2], 16 +; SI-NEXT: v_add_f64 v[10:11], v[10:11], 1.0 +; SI-NEXT: v_lshr_b64 v[22:23], v[6:7], 16 +; SI-NEXT: v_add_f64 v[12:13], v[12:13], 1.0 +; SI-NEXT: v_lshr_b64 v[23:24], v[8:9], 16 +; SI-NEXT: v_add_f64 v[14:15], v[14:15], 1.0 +; SI-NEXT: v_lshr_b64 v[24:25], v[10:11], 16 +; SI-NEXT: v_add_f64 v[16:17], v[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[25:26], v[12:13], 16 +; SI-NEXT: v_add_f64 v[18:19], v[18:19], 1.0 +; SI-NEXT: v_lshr_b64 v[26:27], v[14:15], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[18:19], 16 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v19 ; SI-NEXT: .LBB49_3: ; %end -; 
SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_or_b32_e32 v18, v18, v34 +; SI-NEXT: v_or_b32_e32 v18, v18, v28 ; SI-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v37 ; SI-NEXT: v_or_b32_e32 v18, v18, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v27 +; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: v_or_b32_e32 v16, v16, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v16, v18, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v16, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 ; SI-NEXT: v_or_b32_e32 v16, v16, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v26 +; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 ; SI-NEXT: v_or_b32_e32 v14, v14, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v14, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v14, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 ; SI-NEXT: v_or_b32_e32 v14, v14, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 ; SI-NEXT: v_or_b32_e32 v12, v12, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v12, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v34 ; SI-NEXT: v_or_b32_e32 v12, v12, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen @@ -21562,79 +21558,79 @@ define inreg <36 x i16> @bitcast_v9f64_to_v36i16_scalar(<9 x double> inreg %a, i ; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v33 ; SI-NEXT: v_or_b32_e32 v10, v10, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v23 ; SI-NEXT: v_or_b32_e32 v8, v8, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v8, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v32 ; SI-NEXT: v_or_b32_e32 v8, v8, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 
+; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v22 ; SI-NEXT: v_or_b32_e32 v6, v6, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v6, v8, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v31 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v21 ; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v5 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr24 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr26 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr24 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v9f64_to_v36i16_scalar: @@ -28520,302 +28516,321 @@ define inreg <36 x i16> @bitcast_v36f16_to_v36i16_scalar(<36 x half> inreg %a, i ; SI-LABEL: bitcast_v36f16_to_v36i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v12 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte 
Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v45, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v41, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v55, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v36, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v35, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v34, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v33, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v21, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v18, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v30, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s29 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v23 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 +; SI-NEXT: 
v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v21, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_or_b32_e32 v1, v1, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v6, v6, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_or_b32_e32 v23, v23, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v30, 
0x38000000, v30 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v18, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v16, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v14, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v50 +; SI-NEXT: v_or_b32_e32 v10, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_or_b32_e32 v8, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v22 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v39 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v6, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v4, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v38 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v11 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 
-; SI-NEXT: v_or_b32_e32 v24, v24, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v36 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v45 +; SI-NEXT: v_or_b32_e32 v43, v12, v17 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v54 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_or_b32_e32 v45, v11, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 +; SI-NEXT: v_or_b32_e32 v42, v12, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v53 +; SI-NEXT: v_or_b32_e32 v54, v19, v9 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_or_b32_e32 v40, v11, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v51 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v27, v27, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v13, v13, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v17 -; SI-NEXT: v_or_b32_e32 v16, v16, v37 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v20 -; SI-NEXT: v_or_b32_e32 v19, v19, v37 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v35 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v34 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v31 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v22, v22, v36 -; SI-NEXT: v_or_b32_e32 v21, v21, v35 -; SI-NEXT: v_or_b32_e32 v18, v18, v34 -; SI-NEXT: v_or_b32_e32 v15, v15, v33 -; SI-NEXT: v_or_b32_e32 v29, v29, v32 -; SI-NEXT: v_or_b32_e32 v26, v26, v31 -; SI-NEXT: v_or_b32_e32 v11, v11, v30 -; SI-NEXT: v_or_b32_e32 v8, v8, v12 -; SI-NEXT: v_or_b32_e32 v5, v5, v9 -; SI-NEXT: v_alignbit_b32 v36, v19, v36, 16 -; SI-NEXT: v_alignbit_b32 v35, v16, v35, 16 -; SI-NEXT: v_alignbit_b32 v34, v13, v34, 16 -; SI-NEXT: v_alignbit_b32 v33, v27, v33, 16 -; SI-NEXT: v_alignbit_b32 v32, v24, v32, 16 -; 
SI-NEXT: v_alignbit_b32 v31, v23, v31, 16 -; SI-NEXT: v_alignbit_b32 v30, v6, v30, 16 -; SI-NEXT: v_alignbit_b32 v12, v3, v12, 16 -; SI-NEXT: v_alignbit_b32 v9, v1, v9, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v53, v12, v7 +; SI-NEXT: v_or_b32_e32 v51, v11, v5 +; SI-NEXT: v_or_b32_e32 v48, v19, v3 +; SI-NEXT: v_or_b32_e32 v38, v22, v1 +; SI-NEXT: v_lshr_b64 v[34:35], v[20:21], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v36 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v22, v22, v36 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v45 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v33 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v32 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v28 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword 
v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v30 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v25 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v28 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v30 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 -; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v12 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v9 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 64, 
v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v22 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v39 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v38 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll index b8091d8256457..47cb6bd3b3bb6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll @@ -2838,240 +2838,238 @@ define inreg <40 x i16> @bitcast_v20i32_to_v40i16_scalar(<20 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_readfirstlane_b32 s11, v1 -; SI-NEXT: v_readfirstlane_b32 s10, v2 -; SI-NEXT: v_readfirstlane_b32 s9, v3 -; SI-NEXT: v_readfirstlane_b32 s8, v4 -; SI-NEXT: v_readfirstlane_b32 s7, v5 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: v_readfirstlane_b32 s9, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_readfirstlane_b32 s4, 
v5 +; SI-NEXT: s_and_b64 s[10:11], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v6 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s28 -; SI-NEXT: v_mov_b32_e32 v5, s26 -; SI-NEXT: v_mov_b32_e32 v6, s24 -; SI-NEXT: v_mov_b32_e32 v7, s22 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s18 -; SI-NEXT: v_mov_b32_e32 v10, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s29, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s27, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s25, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s23, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s21, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s19, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s17, v10, 16 -; SI-NEXT: s_lshr_b32 s12, s6, 16 -; SI-NEXT: s_lshr_b32 s13, s8, 16 -; SI-NEXT: s_lshr_b32 s14, s10, 16 -; SI-NEXT: s_lshr_b32 s15, s29, 16 -; SI-NEXT: s_lshr_b32 s40, s27, 16 -; SI-NEXT: s_lshr_b32 s41, s25, 16 -; SI-NEXT: s_lshr_b32 s42, s23, 16 -; SI-NEXT: s_lshr_b32 s43, s21, 16 -; SI-NEXT: s_lshr_b32 s44, s19, 16 -; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b32 s72, s5, 16 +; SI-NEXT: s_lshr_b32 s73, s7, 16 +; SI-NEXT: s_lshr_b32 s74, s9, 16 +; SI-NEXT: s_lshr_b32 s75, s29, 16 +; SI-NEXT: s_lshr_b32 s76, s27, 16 +; SI-NEXT: s_lshr_b32 s77, s25, 16 +; SI-NEXT: s_lshr_b32 s78, s23, 16 +; SI-NEXT: s_lshr_b32 s79, s21, 16 +; SI-NEXT: s_lshr_b32 s88, s19, 16 +; SI-NEXT: s_lshr_b32 s89, s17, 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s10, s10, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s28 -; SI-NEXT: v_mov_b32_e32 v5, s26 -; SI-NEXT: v_mov_b32_e32 v6, s24 -; SI-NEXT: v_mov_b32_e32 v7, s22 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s18 -; SI-NEXT: v_mov_b32_e32 v10, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 
16 -; SI-NEXT: v_alignbit_b32 v4, s29, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s27, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s25, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s23, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s21, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s19, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s17, v10, 16 -; SI-NEXT: s_lshr_b32 s12, s6, 16 -; SI-NEXT: s_lshr_b32 s13, s8, 16 -; SI-NEXT: s_lshr_b32 s14, s10, 16 -; SI-NEXT: s_lshr_b32 s15, s29, 16 -; SI-NEXT: s_lshr_b32 s40, s27, 16 -; SI-NEXT: s_lshr_b32 s41, s25, 16 -; SI-NEXT: s_lshr_b32 s42, s23, 16 -; SI-NEXT: s_lshr_b32 s43, s21, 16 -; SI-NEXT: s_lshr_b32 s44, s19, 16 -; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 16 +; SI-NEXT: s_lshr_b32 s72, s5, 16 +; SI-NEXT: s_lshr_b32 s73, s7, 16 +; SI-NEXT: s_lshr_b32 s74, s9, 16 +; SI-NEXT: s_lshr_b32 s75, s29, 16 +; SI-NEXT: s_lshr_b32 s76, s27, 16 +; SI-NEXT: s_lshr_b32 s77, s25, 16 +; SI-NEXT: s_lshr_b32 s78, s23, 16 +; SI-NEXT: s_lshr_b32 s79, s21, 16 +; SI-NEXT: s_lshr_b32 s88, s19, 16 +; SI-NEXT: s_lshr_b32 s89, s17, 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s45, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s44, 16 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_lshl_b32 s11, s60, 16 +; SI-NEXT: s_and_b32 s13, s16, 0xffff +; SI-NEXT: s_or_b32 s11, s13, s11 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: s_and_b32 s11, s17, 0xffff +; SI-NEXT: s_lshl_b32 s13, s89, 16 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_lshl_b32 s11, s58, 16 +; SI-NEXT: s_and_b32 s13, s18, 0xffff +; SI-NEXT: s_or_b32 s11, s13, s11 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; SI-NEXT: s_lshl_b32 s5, s43, 16 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 -; 
SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s42, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s13, s88, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s41, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s20, 0xffff +; SI-NEXT: s_lshl_b32 s13, s56, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s40, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s21, 0xffff +; SI-NEXT: s_lshl_b32 s13, s79, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s15, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s46, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s14, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s13, s78, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s13, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s44, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s12, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s25, 0xffff +; SI-NEXT: s_lshl_b32 s13, s77, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s26, 0xffff +; SI-NEXT: s_lshl_b32 s13, s42, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s27, 0xffff +; SI-NEXT: s_lshl_b32 s13, s76, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s28, 0xffff +; SI-NEXT: s_lshl_b32 s13, s40, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s29, 0xffff +; SI-NEXT: s_lshl_b32 s13, s75, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s74, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s12, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s73, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; 
SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v20i32_to_v40i16_scalar: @@ -10571,165 +10569,165 @@ define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_mov_b32_e32 v21, s16 +; SI-NEXT: v_mov_b32_e32 v19, s16 ; SI-NEXT: v_mov_b32_e32 v20, s17 -; SI-NEXT: v_mov_b32_e32 v19, s18 -; SI-NEXT: v_mov_b32_e32 v17, s19 -; SI-NEXT: v_mov_b32_e32 v16, s20 -; SI-NEXT: v_mov_b32_e32 v15, s21 -; SI-NEXT: v_mov_b32_e32 v14, s22 -; SI-NEXT: v_mov_b32_e32 v13, s23 -; SI-NEXT: v_mov_b32_e32 v12, s24 -; SI-NEXT: v_mov_b32_e32 v11, s25 -; SI-NEXT: v_mov_b32_e32 v10, s26 -; SI-NEXT: v_mov_b32_e32 v9, s27 -; SI-NEXT: v_mov_b32_e32 v8, s28 +; SI-NEXT: v_mov_b32_e32 v17, s18 +; SI-NEXT: v_mov_b32_e32 v18, s19 +; SI-NEXT: v_mov_b32_e32 v15, s20 +; SI-NEXT: v_mov_b32_e32 v16, s21 +; SI-NEXT: v_mov_b32_e32 v13, s22 +; SI-NEXT: v_mov_b32_e32 v14, s23 +; SI-NEXT: v_mov_b32_e32 v11, s24 +; SI-NEXT: v_mov_b32_e32 v12, s25 +; SI-NEXT: v_mov_b32_e32 v9, s26 +; SI-NEXT: v_mov_b32_e32 v10, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v7, s29 +; SI-NEXT: v_mov_b32_e32 v7, s28 +; SI-NEXT: v_mov_b32_e32 v8, s29 ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v18, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v22, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v23, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v24, v7, v8, 16 -; SI-NEXT: v_alignbit_b32 v26, v9, v10, 16 -; SI-NEXT: v_alignbit_b32 v28, v11, v12, 16 -; SI-NEXT: v_alignbit_b32 v31, v13, v14, 16 -; 
SI-NEXT: v_alignbit_b32 v33, v15, v16, 16 -; SI-NEXT: v_alignbit_b32 v35, v17, v19, 16 -; SI-NEXT: v_alignbit_b32 v37, v20, v21, 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[19:20], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_alignbit_b32 v18, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v22, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v23, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v24, v7, v8, 16 -; SI-NEXT: v_alignbit_b32 v26, v9, v10, 16 -; SI-NEXT: v_alignbit_b32 v28, v11, v12, 16 -; SI-NEXT: v_alignbit_b32 v31, v13, v14, 16 -; SI-NEXT: v_alignbit_b32 v33, v15, v16, 16 -; SI-NEXT: v_alignbit_b32 v35, v17, v19, 16 -; SI-NEXT: v_alignbit_b32 v37, v20, v21, 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; 
SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[22:23], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[19:20], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_or_b32_e32 v21, v21, v37 -; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v31 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 +; SI-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 8, v0 +; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v19, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 +; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 +; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: 
v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v31 -; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v24 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 +; SI-NEXT: v_or_b32_e32 v7, v7, v9 ; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v30 +; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen @@ -10741,7 +10739,7 @@ define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -10753,45 +10751,45 @@ define inreg <40 x i16> @bitcast_v20f32_to_v40i16_scalar(<20 x float> inreg 
%a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v18 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v20f32_to_v40i16_scalar: @@ -17582,240 +17580,238 @@ define inreg <40 x i16> @bitcast_v10i64_to_v40i16_scalar(<10 x i64> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; SI-NEXT: v_readfirstlane_b32 s11, v1 -; SI-NEXT: v_readfirstlane_b32 s10, v2 -; SI-NEXT: v_readfirstlane_b32 s9, v3 -; SI-NEXT: v_readfirstlane_b32 s8, v4 -; SI-NEXT: v_readfirstlane_b32 s7, v5 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v1 +; SI-NEXT: v_readfirstlane_b32 s9, v2 +; SI-NEXT: v_readfirstlane_b32 s6, v3 +; SI-NEXT: v_readfirstlane_b32 s7, v4 +; SI-NEXT: v_readfirstlane_b32 s4, v5 +; SI-NEXT: s_and_b64 s[10:11], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v6 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s28 -; SI-NEXT: v_mov_b32_e32 v5, s26 -; SI-NEXT: v_mov_b32_e32 v6, s24 -; SI-NEXT: v_mov_b32_e32 v7, s22 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s18 -; SI-NEXT: v_mov_b32_e32 v10, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: 
v_alignbit_b32 v4, s29, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s27, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s25, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s23, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s21, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s19, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s17, v10, 16 -; SI-NEXT: s_lshr_b32 s12, s6, 16 -; SI-NEXT: s_lshr_b32 s13, s8, 16 -; SI-NEXT: s_lshr_b32 s14, s10, 16 -; SI-NEXT: s_lshr_b32 s15, s29, 16 -; SI-NEXT: s_lshr_b32 s40, s27, 16 -; SI-NEXT: s_lshr_b32 s41, s25, 16 -; SI-NEXT: s_lshr_b32 s42, s23, 16 -; SI-NEXT: s_lshr_b32 s43, s21, 16 -; SI-NEXT: s_lshr_b32 s44, s19, 16 -; SI-NEXT: s_lshr_b32 s45, s17, 16 +; SI-NEXT: s_lshr_b32 s72, s5, 16 +; SI-NEXT: s_lshr_b32 s73, s7, 16 +; SI-NEXT: s_lshr_b32 s74, s9, 16 +; SI-NEXT: s_lshr_b32 s75, s29, 16 +; SI-NEXT: s_lshr_b32 s76, s27, 16 +; SI-NEXT: s_lshr_b32 s77, s25, 16 +; SI-NEXT: s_lshr_b32 s78, s23, 16 +; SI-NEXT: s_lshr_b32 s79, s21, 16 +; SI-NEXT: s_lshr_b32 s88, s19, 16 +; SI-NEXT: s_lshr_b32 s89, s17, 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 -; SI-NEXT: s_add_u32 s28, s28, 3 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_add_u32 s28, s28, 3 ; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s11, s11, 3 -; SI-NEXT: s_addc_u32 s10, s10, 0 -; SI-NEXT: s_add_u32 s9, s9, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s6, s6, 0 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s28 -; SI-NEXT: v_mov_b32_e32 v5, s26 -; SI-NEXT: v_mov_b32_e32 v6, s24 -; SI-NEXT: v_mov_b32_e32 v7, s22 -; SI-NEXT: v_mov_b32_e32 v8, s20 -; SI-NEXT: v_mov_b32_e32 v9, s18 -; SI-NEXT: v_mov_b32_e32 v10, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s29, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s27, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s25, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s23, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s21, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s19, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s17, v10, 16 -; SI-NEXT: s_lshr_b32 s12, s6, 16 -; SI-NEXT: s_lshr_b32 s13, s8, 16 -; SI-NEXT: s_lshr_b32 s14, s10, 16 -; SI-NEXT: s_lshr_b32 s15, s29, 16 -; SI-NEXT: s_lshr_b32 s40, s27, 16 -; SI-NEXT: s_lshr_b32 s41, s25, 16 -; SI-NEXT: s_lshr_b32 s42, s23, 16 -; SI-NEXT: s_lshr_b32 s43, s21, 16 -; SI-NEXT: s_lshr_b32 s44, s19, 16 -; SI-NEXT: s_lshr_b32 
s45, s17, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s72, s5, 16 +; SI-NEXT: s_lshr_b32 s73, s7, 16 +; SI-NEXT: s_lshr_b32 s74, s9, 16 +; SI-NEXT: s_lshr_b32 s75, s29, 16 +; SI-NEXT: s_lshr_b32 s76, s27, 16 +; SI-NEXT: s_lshr_b32 s77, s25, 16 +; SI-NEXT: s_lshr_b32 s78, s23, 16 +; SI-NEXT: s_lshr_b32 s79, s21, 16 +; SI-NEXT: s_lshr_b32 s88, s19, 16 +; SI-NEXT: s_lshr_b32 s89, s17, 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[16:17], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s45, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s44, 16 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_lshl_b32 s11, s60, 16 +; SI-NEXT: s_and_b32 s13, s16, 0xffff +; SI-NEXT: s_or_b32 s11, s13, s11 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: s_and_b32 s11, s17, 0xffff +; SI-NEXT: s_lshl_b32 s13, s89, 16 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_lshl_b32 s11, s58, 16 +; SI-NEXT: s_and_b32 s13, s18, 0xffff +; SI-NEXT: s_or_b32 s11, s13, s11 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; SI-NEXT: s_lshl_b32 s5, s43, 16 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s42, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v8, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: 
v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_lshl_b32 s13, s88, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s41, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s20, 0xffff +; SI-NEXT: s_lshl_b32 s13, s56, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s40, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s21, 0xffff +; SI-NEXT: s_lshl_b32 s13, s79, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s15, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s22, 0xffff +; SI-NEXT: s_lshl_b32 s13, s46, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s14, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s23, 0xffff +; SI-NEXT: s_lshl_b32 s13, s78, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s13, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: 
buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s24, 0xffff +; SI-NEXT: s_lshl_b32 s13, s44, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s12, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x48, v0 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s25, 0xffff +; SI-NEXT: s_lshl_b32 s13, s77, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s26, 0xffff +; SI-NEXT: s_lshl_b32 s13, s42, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s27, 0xffff +; SI-NEXT: s_lshl_b32 s13, s76, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s28, 0xffff +; SI-NEXT: s_lshl_b32 s13, s40, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s11, s29, 0xffff +; SI-NEXT: s_lshl_b32 s13, s75, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s11, s11, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s11, s14, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s8, s8, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s74, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s12, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s73, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s10, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s72, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 ; SI-NEXT: s_or_b32 s4, s4, 
s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr56 +; SI-NEXT: ; implicit-def: $sgpr79 +; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr77 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr75 ; SI-NEXT: ; implicit-def: $sgpr14 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr13 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr74 ; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr73 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr72 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v10i64_to_v40i16_scalar: @@ -23902,126 +23898,126 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v8, s29 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v22, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v23, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v24, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v28, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v37, v20, v19, 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[22:23], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 +; 
SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[19:20], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_alignbit_b32 v21, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v22, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v23, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v24, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v28, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v30, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v33, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v35, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v37, v20, v19, 16 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v20 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_lshr_b64 v[21:22], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_lshr_b64 v[22:23], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_lshr_b64 v[23:24], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[19:20], 16 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v20 ; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v31 ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 -; SI-NEXT: v_or_b32_e32 v19, v19, v37 +; SI-NEXT: v_or_b32_e32 v19, v19, v26 ; SI-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v50 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, 
vcc, 8, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v49 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 ; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v25 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v38 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 ; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen @@ -24033,7 +24029,7 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; SI-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v36 ; SI-NEXT: v_or_b32_e32 v7, v7, v8 ; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen @@ -24045,7 +24041,7 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -24057,7 +24053,7 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -24069,33 +24065,33 @@ define inreg <40 x i16> @bitcast_v10f64_to_v40i16_scalar(<10 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v10f64_to_v40i16_scalar: @@ -32185,338 +32181,367 @@ define inreg <40 x i16> @bitcast_v40f16_to_v40i16_scalar(<40 x half> inreg %a, i ; SI-LABEL: bitcast_v40f16_to_v40i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v35, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v49, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s18 
-; SI-NEXT: v_cvt_f16_f32_e32 v18, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v15, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v42, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v26 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v58, s16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v59, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v56, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v47, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v46, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v12, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v25, s29 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v27, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 
-; SI-NEXT: v_cvt_f16_f32_e32 v49, v26 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v48 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v39 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v25 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v34, v25, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v38 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v23 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v45, v22, v17 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v54 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, 
v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_or_b32_e32 v54, v22, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v51 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v57, v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_or_b32_e32 v58, v1, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v43, v3, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_or_b32_e32 v47, v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v41 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v40, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_or_b32_e32 v41, v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v5, v5, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v8 -; SI-NEXT: v_or_b32_e32 v9, v9, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v7 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_or_b32_e32 v51, v22, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_or_b32_e32 v12, v12, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v10 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: 
v_or_b32_e32 v2, v2, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v53 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v49 +; SI-NEXT: v_or_b32_e32 v4, v4, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v52 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_or_b32_e32 v28, v28, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v33 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 +; SI-NEXT: v_or_b32_e32 v6, v6, v21 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; SI-NEXT: v_or_b32_e32 v8, v8, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v44 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 +; SI-NEXT: v_or_b32_e32 v10, v10, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v42 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v21 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v21 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v44 +; SI-NEXT: v_or_b32_e32 v14, v14, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v42 +; SI-NEXT: v_or_b32_e32 v18, v18, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_or_b32_e32 v33, v25, v33 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v22 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v15, 
v15 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v35, v35, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_or_b32_e32 v14, v14, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v17 -; SI-NEXT: v_or_b32_e32 v20, v20, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v18 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v39 -; SI-NEXT: v_or_b32_e32 v19, v19, v25 -; SI-NEXT: v_or_b32_e32 v22, v22, v26 -; SI-NEXT: v_or_b32_e32 v21, v21, v27 -; SI-NEXT: v_or_b32_e32 v16, v16, v24 -; SI-NEXT: v_or_b32_e32 v15, v15, v48 -; SI-NEXT: v_or_b32_e32 v30, v30, v38 -; SI-NEXT: v_or_b32_e32 v29, v29, v37 -; SI-NEXT: v_or_b32_e32 v11, v11, v51 -; SI-NEXT: v_or_b32_e32 v6, v6, v23 -; SI-NEXT: v_or_b32_e32 v4, v4, v52 -; SI-NEXT: v_alignbit_b32 v49, v19, v26, 16 -; SI-NEXT: v_alignbit_b32 v26, v20, v27, 16 -; SI-NEXT: v_alignbit_b32 v25, v14, v24, 16 -; SI-NEXT: v_alignbit_b32 v24, v35, v48, 16 -; SI-NEXT: v_alignbit_b32 v48, v33, v50, 16 -; SI-NEXT: v_alignbit_b32 v39, v28, v38, 16 -; SI-NEXT: v_alignbit_b32 v38, v12, v37, 16 -; SI-NEXT: v_alignbit_b32 v37, v9, v51, 16 -; SI-NEXT: v_alignbit_b32 v36, v3, v23, 16 -; SI-NEXT: v_alignbit_b32 v23, v5, v52, 16 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v21 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v46 +; SI-NEXT: v_or_b32_e32 v16, v16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v59 +; SI-NEXT: v_or_b32_e32 v12, v12, v22 +; SI-NEXT: v_or_b32_e32 v20, v20, v21 +; SI-NEXT: v_lshr_b64 v[25:26], v[17:18], 16 +; SI-NEXT: v_or_b32_e32 v50, v23, v1 +; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[23:24], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[21:22], v[1:2], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v22, v22, v27 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v22, v0, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v58 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v26 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; 
SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v36 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v25 -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v13, v14, v13 -; SI-NEXT: v_add_i32_e32 v14, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v24 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v34 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v31 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v32 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v39 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v10, v13, v10 -; SI-NEXT: v_add_i32_e32 v13, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v38 -; SI-NEXT: v_or_b32_e32 v10, v10, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v37 -; SI-NEXT: v_or_b32_e32 v7, v7, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v36 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v26 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: 
buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v21 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index ab1f8606cffd7..67c9bfe9d9f3b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -1065,24 +1065,23 @@ define inreg <4 x i16> @bitcast_i64_to_v4i16_scalar(i64 inreg %a, i32 inreg %b) ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true ; SI-NEXT: s_add_u32 s16, s16, 3 ; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: .LBB13_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v3, s8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_i64_to_v4i16_scalar: @@ -2708,38 +2707,39 @@ define inreg <8 x i8> @bitcast_i64_to_v8i8_scalar(i64 inreg %a, i32 inreg %b) { ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB25_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: 
v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s6, s17, 24 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_lshr_b32 s12, s17, 24 +; SI-NEXT: s_lshr_b32 s13, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB25_3 ; SI-NEXT: .LBB25_2: ; %cmp.true ; SI-NEXT: s_add_u32 s16, s16, 3 ; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s6, s17, 24 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_lshr_b32 s12, s17, 24 +; SI-NEXT: s_lshr_b32 s13, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 8 ; SI-NEXT: .LBB25_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: v_mov_b32_e32 v4, s17 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v6, s7 -; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB25_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: s_branch .LBB25_2 ; ; VI-LABEL: bitcast_i64_to_v8i8_scalar: @@ -4222,23 +4222,23 @@ define inreg <4 x i16> @bitcast_f64_to_v4i16_scalar(double inreg %a, i32 inreg % ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB37_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB37_4 ; SI-NEXT: .LBB37_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[4:5], s[16:17], 1.0 -; SI-NEXT: v_alignbit_b32 v1, v5, v4, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[4:5], 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; SI-NEXT: s_branch .LBB37_5 ; SI-NEXT: .LBB37_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB37_2 ; SI-NEXT: .LBB37_4: ; SI-NEXT: v_mov_b32_e32 v5, s17 ; SI-NEXT: v_mov_b32_e32 v4, s16 -; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: .LBB37_5: ; %end ; SI-NEXT: v_mov_b32_e32 v0, v4 ; SI-NEXT: v_mov_b32_e32 v2, v5 @@ -5836,40 +5836,43 @@ define inreg <8 x i8> @bitcast_f64_to_v8i8_scalar(double inreg %a, i32 inreg %b) ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB49_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s8, s17, 24 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 8 +; SI-NEXT: s_lshr_b32 s14, 
s17, 24 +; SI-NEXT: s_lshr_b32 s13, s17, 16 +; SI-NEXT: s_lshr_b32 s12, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB49_4 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[8:9], s[16:17], 1.0 -; SI-NEXT: v_alignbit_b32 v3, v9, v8, 24 -; SI-NEXT: v_alignbit_b32 v2, v9, v8, 16 -; SI-NEXT: v_alignbit_b32 v1, v9, v8, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v9 +; SI-NEXT: v_add_f64 v[10:11], s[16:17], 1.0 +; SI-NEXT: v_lshr_b64 v[3:4], v[10:11], 24 +; SI-NEXT: v_lshr_b64 v[8:9], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[10:11], 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v11 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v11 ; SI-NEXT: s_branch .LBB49_5 ; SI-NEXT: .LBB49_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr14 ; SI-NEXT: s_branch .LBB49_2 ; SI-NEXT: .LBB49_4: -; SI-NEXT: v_mov_b32_e32 v9, s17 -; SI-NEXT: v_mov_b32_e32 v8, s16 -; SI-NEXT: v_mov_b32_e32 v7, s8 -; SI-NEXT: v_mov_b32_e32 v6, s7 -; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v7, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v5, s12 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: .LBB49_5: ; %end -; SI-NEXT: v_mov_b32_e32 v0, v8 -; SI-NEXT: v_mov_b32_e32 v4, v9 +; SI-NEXT: v_mov_b32_e32 v0, v10 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v4, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_f64_to_v8i8_scalar: @@ -7049,24 +7052,23 @@ define inreg <4 x i16> @bitcast_v2i32_to_v4i16_scalar(<2 x i32> inreg %a, i32 in ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB57_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 ; SI-NEXT: .LBB57_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v3, s8 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB57_2 ; ; VI-LABEL: bitcast_v2i32_to_v4i16_scalar: @@ -8688,38 +8690,39 @@ define inreg <8 x i8> @bitcast_v2i32_to_v8i8_scalar(<2 x i32> inreg %a, i32 inre ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB69_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, 
s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s6, s17, 24 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_lshr_b32 s12, s17, 24 +; SI-NEXT: s_lshr_b32 s13, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB69_3 ; SI-NEXT: .LBB69_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s6, s17, 24 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 8 +; SI-NEXT: s_lshr_b32 s12, s17, 24 +; SI-NEXT: s_lshr_b32 s13, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 8 ; SI-NEXT: .LBB69_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: v_mov_b32_e32 v4, s17 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v6, s7 -; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB69_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: s_branch .LBB69_2 ; ; VI-LABEL: bitcast_v2i32_to_v8i8_scalar: @@ -9564,24 +9567,27 @@ define inreg <4 x i16> @bitcast_v2f32_to_v4i16_scalar(<2 x float> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB73_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s8, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB73_4 ; SI-NEXT: .LBB73_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v5, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v4, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[1:2], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5 +; SI-NEXT: s_branch .LBB73_5 ; SI-NEXT: .LBB73_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB73_2 ; SI-NEXT: .LBB73_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: v_mov_b32_e32 v5, s17 +; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: .LBB73_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v4 +; SI-NEXT: v_mov_b32_e32 v2, v5 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v4i16_scalar: @@ -11206,38 +11212,44 @@ define inreg <8 x i8> @bitcast_v2f32_to_v8i8_scalar(<2 x float> inreg %a, 
i32 in ; SI-NEXT: s_cmp_lg_u32 s18, 0 ; SI-NEXT: s_cbranch_scc0 .LBB85_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s6, s17, 24 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_lshr_b32 s12, s17, 24 +; SI-NEXT: s_lshr_b32 s13, s17, 16 +; SI-NEXT: s_lshr_b32 s14, s17, 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB85_4 ; SI-NEXT: .LBB85_2: ; %cmp.true -; SI-NEXT: v_add_f32_e64 v4, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v11, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v10, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[3:4], v[10:11], 24 +; SI-NEXT: v_lshr_b64 v[8:9], v[10:11], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[10:11], 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v11 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v11 +; SI-NEXT: s_branch .LBB85_5 ; SI-NEXT: .LBB85_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr13 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: s_branch .LBB85_2 ; SI-NEXT: .LBB85_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v4, s17 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v6, s7 -; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v10, s16 +; SI-NEXT: v_mov_b32_e32 v11, s17 +; SI-NEXT: v_mov_b32_e32 v5, s14 +; SI-NEXT: v_mov_b32_e32 v6, s13 +; SI-NEXT: v_mov_b32_e32 v7, s12 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_mov_b32_e32 v8, s6 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: .LBB85_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v10 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v4, v11 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v2f32_to_v8i8_scalar: @@ -12327,7 +12339,7 @@ define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: s_cmp_lg_u32 s20, 0 @@ -12336,23 +12348,24 @@ define inreg <4 x i16> @bitcast_v4f16_to_v4i16_scalar(<4 x half> inreg %a, i32 i ; SI-NEXT: s_cbranch_execnz .LBB91_3 ; SI-NEXT: .LBB91_2: ; %cmp.true ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: 
v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v4 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: .LBB91_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB91_4: ; SI-NEXT: s_branch .LBB91_2 @@ -12938,34 +12951,35 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v6, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v4, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 ; SI-NEXT: s_cbranch_scc0 .LBB95_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: s_cbranch_execnz .LBB95_3 ; SI-NEXT: .LBB95_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 -; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 +; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16 +; SI-NEXT: v_alignbit_b32 v0, v6, v0, 16 ; SI-NEXT: .LBB95_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB95_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: s_branch .LBB95_2 @@ -13395,52 +13409,52 @@ define inreg <8 x i8> @bitcast_v4i16_to_v8i8_scalar(<4 x i16> inreg %a, i32 inre ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 -; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_alignbit_b32 v3, s7, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s7, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s7, v0, 8 -; SI-NEXT: s_lshr_b32 s9, s7, 8 -; SI-NEXT: s_and_b32 s10, s19, 0xffff -; SI-NEXT: s_bfe_u32 s8, s19, 0x80008 +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s9, s5, 8 +; SI-NEXT: s_and_b32 s11, s19, 0xffff +; SI-NEXT: s_bfe_u32 s7, s19, 0x80008 ; SI-NEXT: s_cbranch_execnz 
.LBB97_3 ; SI-NEXT: .LBB97_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s6, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s7, s4, 0x30000 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_alignbit_b32 v3, s7, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s7, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s7, v0, 8 -; SI-NEXT: s_lshr_b32 s8, s7, 24 -; SI-NEXT: s_lshr_b32 s10, s7, 16 -; SI-NEXT: s_lshr_b32 s9, s7, 8 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s7, s5, 24 +; SI-NEXT: s_lshr_b32 s11, s5, 16 +; SI-NEXT: s_lshr_b32 s9, s5, 8 ; SI-NEXT: .LBB97_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v4, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v4, s5 ; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 -; SI-NEXT: v_mov_b32_e32 v7, s8 +; SI-NEXT: v_mov_b32_e32 v6, s11 +; SI-NEXT: v_mov_b32_e32 v7, s7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB97_4: -; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 -; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr8 +; SI-NEXT: ; implicit-def: $sgpr6 +; SI-NEXT: ; implicit-def: $sgpr9 +; SI-NEXT: ; implicit-def: $sgpr11 +; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: s_branch .LBB97_2 ; ; VI-LABEL: bitcast_v4i16_to_v8i8_scalar: @@ -13970,27 +13984,27 @@ define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inre ; SI-NEXT: s_cmp_lg_u32 s24, 0 ; SI-NEXT: s_cbranch_scc0 .LBB99_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_or_b32 s4, s6, s4 -; SI-NEXT: s_and_b32 s6, s16, 0xff -; SI-NEXT: s_lshl_b32 s8, s17, 8 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; SI-NEXT: v_alignbit_b32 v1, s7, v0, 16 -; SI-NEXT: s_or_b32 s6, s6, s4 -; SI-NEXT: s_lshr_b32 s8, s5, 16 +; SI-NEXT: s_or_b32 s10, s6, s5 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s7, s6 +; SI-NEXT: s_or_b32 s11, s5, s8 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshr_b64 s[6:7], s[10:11], 16 +; SI-NEXT: s_or_b32 s4, s4, 
s10 +; SI-NEXT: s_lshr_b32 s7, s8, 16 +; SI-NEXT: s_mov_b32 s5, s11 ; SI-NEXT: s_cbranch_execnz .LBB99_3 ; SI-NEXT: .LBB99_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -14004,34 +14018,33 @@ define inreg <4 x i16> @bitcast_v8i8_to_v4i16_scalar(<8 x i8> inreg %a, i32 inre ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s6, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s7 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s7, s4, 0x3000000 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_alignbit_b32 v1, s7, v0, 16 -; SI-NEXT: s_lshr_b32 s8, s7, 16 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 ; SI-NEXT: .LBB99_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: v_mov_b32_e32 v3, s8 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB99_4: +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: s_branch .LBB99_2 ; ; VI-LABEL: bitcast_v8i8_to_v4i16_scalar: @@ -15220,53 +15233,55 @@ define inreg <8 x i8> @bitcast_v4f16_to_v8i8_scalar(<4 x half> inreg %a, i32 inr ; SI-LABEL: bitcast_v4f16_to_v8i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v9, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v8, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s18 ; SI-NEXT: s_cmp_lg_u32 s20, 0 ; SI-NEXT: s_cbranch_scc0 .LBB105_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_or_b32_e32 v9, v8, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v0, v9, v0 -; SI-NEXT: v_or_b32_e32 v4, v8, v1 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24 +; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: s_cbranch_execnz .LBB105_3 ; SI-NEXT: .LBB105_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; 
SI-NEXT: v_cvt_f32_f16_e32 v2, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v9, v2, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v2, v1 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_or_b32_e32 v10, v0, v1 +; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24 +; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: .LBB105_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v9 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v4, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB105_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: s_branch .LBB105_2 @@ -16420,48 +16435,50 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s20, 0 -; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v9, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v11, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v12, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18 ; SI-NEXT: s_cbranch_scc0 .LBB109_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 -; SI-NEXT: v_alignbit_b32 v0, v0, v11, 16 -; SI-NEXT: v_alignbit_b32 v4, v6, v9, 16 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_alignbit_b32 v9, v1, v12, 16 +; SI-NEXT: v_alignbit_b32 v10, v6, v8, 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24 +; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 ; SI-NEXT: s_cbranch_execnz .LBB109_3 ; SI-NEXT: .LBB109_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_alignbit_b32 v9, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 
0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 -; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; SI-NEXT: v_alignbit_b32 v10, v6, v1, 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24 +; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; SI-NEXT: .LBB109_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v9 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v4, v10 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB109_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll index 9f5c9c4c509ed..2cc7c448b2e11 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll @@ -3022,264 +3022,260 @@ define inreg <44 x i16> @bitcast_v22i32_to_v44i16_scalar(<22 x i32> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: v_readfirstlane_b32 s13, v1 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s11, v3 -; SI-NEXT: v_readfirstlane_b32 s10, v4 -; SI-NEXT: v_readfirstlane_b32 s9, v5 -; SI-NEXT: v_readfirstlane_b32 s8, v6 -; SI-NEXT: v_readfirstlane_b32 s7, v7 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: v_readfirstlane_b32 s11, v2 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_readfirstlane_b32 s7, v6 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: s_and_b64 s[12:13], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v8 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s28 -; SI-NEXT: v_mov_b32_e32 v6, s26 -; SI-NEXT: v_mov_b32_e32 v7, s24 -; SI-NEXT: v_mov_b32_e32 v8, s22 -; SI-NEXT: v_mov_b32_e32 v9, s20 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: v_mov_b32_e32 v11, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s29, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s27, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s25, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s23, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s21, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s19, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s17, v11, 16 -; SI-NEXT: 
s_lshr_b32 s14, s6, 16 -; SI-NEXT: s_lshr_b32 s15, s8, 16 -; SI-NEXT: s_lshr_b32 s40, s10, 16 -; SI-NEXT: s_lshr_b32 s41, s12, 16 -; SI-NEXT: s_lshr_b32 s42, s29, 16 -; SI-NEXT: s_lshr_b32 s43, s27, 16 -; SI-NEXT: s_lshr_b32 s44, s25, 16 -; SI-NEXT: s_lshr_b32 s45, s23, 16 -; SI-NEXT: s_lshr_b32 s46, s21, 16 -; SI-NEXT: s_lshr_b32 s47, s19, 16 -; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: s_lshr_b32 s76, s5, 16 +; SI-NEXT: s_lshr_b32 s77, s7, 16 +; SI-NEXT: s_lshr_b32 s78, s9, 16 +; SI-NEXT: s_lshr_b32 s79, s11, 16 +; SI-NEXT: s_lshr_b32 s88, s29, 16 +; SI-NEXT: s_lshr_b32 s89, s27, 16 +; SI-NEXT: s_lshr_b32 s90, s25, 16 +; SI-NEXT: s_lshr_b32 s91, s23, 16 +; SI-NEXT: s_lshr_b32 s92, s21, 16 +; SI-NEXT: s_lshr_b32 s93, s19, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s12, s12, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s28 -; SI-NEXT: v_mov_b32_e32 v6, s26 -; SI-NEXT: v_mov_b32_e32 v7, s24 -; SI-NEXT: v_mov_b32_e32 v8, s22 -; SI-NEXT: v_mov_b32_e32 v9, s20 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: v_mov_b32_e32 v11, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s29, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s27, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s25, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s23, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s21, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s19, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s17, v11, 16 -; SI-NEXT: s_lshr_b32 s14, s6, 16 -; SI-NEXT: s_lshr_b32 s15, s8, 16 -; SI-NEXT: s_lshr_b32 s40, s10, 16 -; SI-NEXT: s_lshr_b32 s41, s12, 16 -; SI-NEXT: s_lshr_b32 s42, s29, 16 -; SI-NEXT: s_lshr_b32 s43, s27, 16 -; SI-NEXT: s_lshr_b32 s44, s25, 16 -; SI-NEXT: s_lshr_b32 s45, s23, 16 -; SI-NEXT: s_lshr_b32 s46, s21, 16 -; 
SI-NEXT: s_lshr_b32 s47, s19, 16 -; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 +; SI-NEXT: s_lshr_b32 s76, s5, 16 +; SI-NEXT: s_lshr_b32 s77, s7, 16 +; SI-NEXT: s_lshr_b32 s78, s9, 16 +; SI-NEXT: s_lshr_b32 s79, s11, 16 +; SI-NEXT: s_lshr_b32 s88, s29, 16 +; SI-NEXT: s_lshr_b32 s89, s27, 16 +; SI-NEXT: s_lshr_b32 s90, s25, 16 +; SI-NEXT: s_lshr_b32 s91, s23, 16 +; SI-NEXT: s_lshr_b32 s92, s21, 16 +; SI-NEXT: s_lshr_b32 s93, s19, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s56, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s47, 16 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; SI-NEXT: s_lshl_b32 s5, s46, 16 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v10, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_lshl_b32 s13, s72, 16 +; SI-NEXT: s_and_b32 s15, s16, 0xffff +; SI-NEXT: s_or_b32 s13, s15, s13 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: s_and_b32 s13, s17, 0xffff +; SI-NEXT: s_lshl_b32 s15, s94, 16 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_lshl_b32 s13, s62, 16 +; SI-NEXT: s_and_b32 s15, s18, 0xffff +; SI-NEXT: s_or_b32 s13, s15, s13 +; SI-NEXT: v_mov_b32_e32 v3, s13 +; SI-NEXT: s_and_b32 s13, s19, 0xffff +; SI-NEXT: s_lshl_b32 s15, s93, 16 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: s_lshl_b32 s13, s60, 16 +; SI-NEXT: s_and_b32 s15, s20, 0xffff +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s45, 16 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, 
s24, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s13, s15, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s21, 0xffff +; SI-NEXT: s_lshl_b32 s15, s92, 16 +; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s44, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v8, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s22, 0xffff +; SI-NEXT: s_lshl_b32 s15, s58, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s43, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s23, 0xffff +; SI-NEXT: s_lshl_b32 s15, s91, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s42, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s24, 0xffff +; SI-NEXT: s_lshl_b32 s15, s56, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s12, 0xffff -; SI-NEXT: s_lshl_b32 s5, s41, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s90, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s40, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s26, 0xffff +; SI-NEXT: s_lshl_b32 s15, s46, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s15, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s27, 0xffff +; SI-NEXT: s_lshl_b32 s15, s89, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s14, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s28, 0xffff +; SI-NEXT: s_lshl_b32 s15, s44, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s29, 0xffff +; SI-NEXT: s_lshl_b32 s15, s88, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s13, s42, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s10, s10, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff +; SI-NEXT: s_lshl_b32 s11, s79, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s40, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s78, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 
0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s77, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr91 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v22i32_to_v44i16_scalar: @@ -11524,171 +11520,171 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: v_mov_b32_e32 v23, s16 +; SI-NEXT: v_mov_b32_e32 v21, s16 ; SI-NEXT: v_mov_b32_e32 v22, s17 -; SI-NEXT: v_mov_b32_e32 v21, s18 -; SI-NEXT: v_mov_b32_e32 v19, s19 -; SI-NEXT: v_mov_b32_e32 v18, s20 -; SI-NEXT: v_mov_b32_e32 v17, s21 -; SI-NEXT: v_mov_b32_e32 v16, s22 -; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: v_mov_b32_e32 v14, s24 -; SI-NEXT: v_mov_b32_e32 v12, s25 -; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v19, s18 +; SI-NEXT: v_mov_b32_e32 v20, s19 +; SI-NEXT: v_mov_b32_e32 v17, s20 +; SI-NEXT: v_mov_b32_e32 v18, s21 +; SI-NEXT: v_mov_b32_e32 v15, s22 +; SI-NEXT: v_mov_b32_e32 v16, s23 +; SI-NEXT: v_mov_b32_e32 v13, s24 +; SI-NEXT: v_mov_b32_e32 v14, s25 +; SI-NEXT: v_mov_b32_e32 v11, s26 +; SI-NEXT: v_mov_b32_e32 v12, s27 ; SI-NEXT: s_and_b64 
s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v11, s27 -; SI-NEXT: v_mov_b32_e32 v10, s28 -; SI-NEXT: v_mov_b32_e32 v9, s29 +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_mov_b32_e32 v10, s29 ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v20, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v28, v9, v10, 16 -; SI-NEXT: v_alignbit_b32 v30, v11, v13, 16 -; SI-NEXT: v_alignbit_b32 v32, v12, v14, 16 -; SI-NEXT: v_alignbit_b32 v34, v15, v16, 16 -; SI-NEXT: v_alignbit_b32 v37, v17, v18, 16 -; SI-NEXT: v_alignbit_b32 v39, v19, v21, 16 -; SI-NEXT: v_alignbit_b32 v49, v22, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshr_b64 v[23:24], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[19:20], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 +; SI-NEXT: v_lshr_b64 v[33:34], v[21:22], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[23:24], v[7:8], 16 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 16 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[26:27], v[1:2], 16 ; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 ; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v4, 
1.0, v4 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_alignbit_b32 v20, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v28, v9, v10, 16 -; SI-NEXT: v_alignbit_b32 v30, v11, v13, 16 -; SI-NEXT: v_alignbit_b32 v32, v12, v14, 16 -; SI-NEXT: v_alignbit_b32 v34, v15, v16, 16 -; SI-NEXT: v_alignbit_b32 v37, v17, v18, 16 -; SI-NEXT: v_alignbit_b32 v39, v19, v21, 16 -; SI-NEXT: v_alignbit_b32 v49, v22, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshr_b64 v[28:29], v[11:12], 16 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_lshr_b64 v[30:31], v[15:16], 16 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_lshr_b64 v[31:32], v[17:18], 16 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_lshr_b64 v[32:33], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v22 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_or_b32_e32 v23, v23, v49 -; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v33 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_or_b32_e32 v21, v21, v33 +; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v39 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v53 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 +; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 12, v0 +; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 ; SI-NEXT: 
buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v37 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v52 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v31 ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 +; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v51 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v30 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 -; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v29 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 -; SI-NEXT: v_or_b32_e32 v12, v12, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v12, v14, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v30 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v28 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v36 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v48 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; 
SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v28 -; SI-NEXT: v_or_b32_e32 v10, v10, v11 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v27 +; SI-NEXT: v_or_b32_e32 v9, v9, v11 ; SI-NEXT: v_add_i32_e32 v11, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v39 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_add_i32_e32 v10, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen @@ -11700,7 +11696,7 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -11712,7 +11708,7 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -11724,47 +11720,47 @@ define inreg <44 x i16> @bitcast_v22f32_to_v44i16_scalar(<22 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v20 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: 
; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr27 +; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v22f32_to_v44i16_scalar: @@ -19266,264 +19262,260 @@ define inreg <44 x i16> @bitcast_v11i64_to_v44i16_scalar(<11 x i64> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; SI-NEXT: v_readfirstlane_b32 s13, v1 -; SI-NEXT: v_readfirstlane_b32 s12, v2 -; SI-NEXT: v_readfirstlane_b32 s11, v3 -; SI-NEXT: v_readfirstlane_b32 s10, v4 -; SI-NEXT: v_readfirstlane_b32 s9, v5 -; SI-NEXT: v_readfirstlane_b32 s8, v6 -; SI-NEXT: v_readfirstlane_b32 s7, v7 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v8 +; SI-NEXT: v_readfirstlane_b32 s10, v1 +; SI-NEXT: v_readfirstlane_b32 s11, v2 +; SI-NEXT: v_readfirstlane_b32 s8, v3 +; SI-NEXT: v_readfirstlane_b32 s9, v4 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: v_readfirstlane_b32 s7, v6 +; SI-NEXT: v_readfirstlane_b32 s4, v7 +; SI-NEXT: s_and_b64 s[12:13], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v8 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s28 -; SI-NEXT: v_mov_b32_e32 v6, s26 -; SI-NEXT: v_mov_b32_e32 v7, s24 -; SI-NEXT: v_mov_b32_e32 v8, s22 -; SI-NEXT: v_mov_b32_e32 v9, s20 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: v_mov_b32_e32 v11, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s29, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s27, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s25, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s23, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s21, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s19, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s17, v11, 16 -; SI-NEXT: s_lshr_b32 s14, s6, 16 -; SI-NEXT: s_lshr_b32 s15, s8, 16 -; SI-NEXT: s_lshr_b32 s40, s10, 16 -; SI-NEXT: s_lshr_b32 s41, s12, 16 -; SI-NEXT: s_lshr_b32 s42, s29, 16 -; SI-NEXT: s_lshr_b32 s43, s27, 16 -; SI-NEXT: s_lshr_b32 s44, s25, 16 -; SI-NEXT: s_lshr_b32 s45, s23, 16 -; SI-NEXT: s_lshr_b32 s46, s21, 16 -; SI-NEXT: s_lshr_b32 s47, s19, 16 -; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: s_lshr_b32 s76, s5, 16 +; SI-NEXT: s_lshr_b32 s77, s7, 16 +; SI-NEXT: s_lshr_b32 s78, s9, 16 +; SI-NEXT: s_lshr_b32 s79, s11, 16 +; SI-NEXT: s_lshr_b32 s88, s29, 16 +; SI-NEXT: s_lshr_b32 s89, s27, 16 +; SI-NEXT: s_lshr_b32 s90, s25, 16 +; SI-NEXT: s_lshr_b32 s91, s23, 16 +; SI-NEXT: s_lshr_b32 s92, s21, 16 +; SI-NEXT: s_lshr_b32 s93, s19, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: 
s_lshr_b64 s[72:73], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 ; SI-NEXT: s_add_u32 s28, s28, 3 ; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s13, s13, 3 -; SI-NEXT: s_addc_u32 s12, s12, 0 -; SI-NEXT: s_add_u32 s11, s11, 3 -; SI-NEXT: s_addc_u32 s10, s10, 0 -; SI-NEXT: s_add_u32 s9, s9, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s6, s6, 0 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s28 -; SI-NEXT: v_mov_b32_e32 v6, s26 -; SI-NEXT: v_mov_b32_e32 v7, s24 -; SI-NEXT: v_mov_b32_e32 v8, s22 -; SI-NEXT: v_mov_b32_e32 v9, s20 -; SI-NEXT: v_mov_b32_e32 v10, s18 -; SI-NEXT: v_mov_b32_e32 v11, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s29, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s27, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s25, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s23, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s21, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s19, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s17, v11, 16 -; SI-NEXT: s_lshr_b32 s14, s6, 16 -; SI-NEXT: s_lshr_b32 s15, s8, 16 -; SI-NEXT: s_lshr_b32 s40, s10, 16 -; SI-NEXT: s_lshr_b32 s41, s12, 16 -; SI-NEXT: s_lshr_b32 s42, s29, 16 -; SI-NEXT: s_lshr_b32 s43, s27, 16 -; SI-NEXT: s_lshr_b32 s44, s25, 16 -; SI-NEXT: s_lshr_b32 s45, s23, 16 -; SI-NEXT: s_lshr_b32 s46, s21, 16 -; SI-NEXT: s_lshr_b32 s47, s19, 16 -; SI-NEXT: s_lshr_b32 s56, s17, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s76, s5, 16 +; SI-NEXT: s_lshr_b32 s77, s7, 16 +; SI-NEXT: s_lshr_b32 s78, s9, 16 +; SI-NEXT: s_lshr_b32 s79, s11, 16 +; SI-NEXT: s_lshr_b32 s88, s29, 16 +; SI-NEXT: s_lshr_b32 s89, s27, 16 +; SI-NEXT: s_lshr_b32 s90, s25, 16 +; SI-NEXT: s_lshr_b32 s91, s23, 16 +; SI-NEXT: s_lshr_b32 s92, s21, 16 +; SI-NEXT: s_lshr_b32 s93, s19, 16 +; SI-NEXT: s_lshr_b32 s94, s17, 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[22:23], 16 +; SI-NEXT: s_lshr_b64 
s[60:61], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[16:17], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s56, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: buffer_store_dword v11, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s47, 16 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; SI-NEXT: s_lshl_b32 s5, s46, 16 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v10, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: s_lshl_b32 s13, s72, 16 +; SI-NEXT: s_and_b32 s15, s16, 0xffff +; SI-NEXT: s_or_b32 s13, s15, s13 +; SI-NEXT: v_mov_b32_e32 v1, s13 +; SI-NEXT: s_and_b32 s13, s17, 0xffff +; SI-NEXT: s_lshl_b32 s15, s94, 16 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_lshl_b32 s13, s62, 16 +; SI-NEXT: s_and_b32 s15, s18, 0xffff +; SI-NEXT: s_or_b32 s13, s15, s13 +; SI-NEXT: v_mov_b32_e32 v3, s13 +; SI-NEXT: s_and_b32 s13, s19, 0xffff +; SI-NEXT: s_lshl_b32 s15, s93, 16 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: v_mov_b32_e32 v4, s13 +; SI-NEXT: s_lshl_b32 s13, s60, 16 +; SI-NEXT: s_and_b32 s15, s20, 0xffff +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s45, 16 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s13, s15, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s21, 0xffff +; SI-NEXT: s_lshl_b32 s15, s92, 16 +; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s44, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 
v8, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s22, 0xffff +; SI-NEXT: s_lshl_b32 s15, s58, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s43, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s23, 0xffff +; SI-NEXT: s_lshl_b32 s15, s91, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s42, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s24, 0xffff +; SI-NEXT: s_lshl_b32 s15, s56, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s12, 0xffff -; SI-NEXT: s_lshl_b32 s5, s41, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s25, 0xffff +; SI-NEXT: s_lshl_b32 s15, s90, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s40, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s26, 0xffff +; SI-NEXT: s_lshl_b32 s15, s46, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s15, 16 -; SI-NEXT: buffer_store_dword v4, v3, 
s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s27, 0xffff +; SI-NEXT: s_lshl_b32 s15, s89, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s14, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s28, 0xffff +; SI-NEXT: s_lshl_b32 s15, s44, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s13, s29, 0xffff +; SI-NEXT: s_lshl_b32 s15, s88, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s13, s13, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s13, s42, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s10, s10, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff +; SI-NEXT: s_lshl_b32 s11, s79, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s40, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s78, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s14, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s77, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s12, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s76, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; 
SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr58 +; SI-NEXT: ; implicit-def: $sgpr91 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr89 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr79 ; SI-NEXT: ; implicit-def: $sgpr40 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr15 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr78 ; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr77 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr76 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v11i64_to_v44i16_scalar: @@ -26272,131 +26264,131 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: v_mov_b32_e32 v10, s29 ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v23, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v27, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v34, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v36, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_lshr_b64 v[23:24], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[19:20], 16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; 
SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v22 +; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[21:22], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_alignbit_b32 v23, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v24, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v25, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v26, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v27, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v29, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v34, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v36, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v39, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v22 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_lshr_b64 v[23:24], v[7:8], 16 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_lshr_b64 v[24:25], v[5:6], 16 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_lshr_b64 v[25:26], v[3:4], 16 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_lshr_b64 v[26:27], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[11:12], 16 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_lshr_b64 v[27:28], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v22 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v34 ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_or_b32_e32 v21, v21, v49 +; SI-NEXT: v_or_b32_e32 v21, v21, v30 ; SI-NEXT: buffer_store_dword v21, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v54 ; 
SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v53 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v52 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v29 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v28 ; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v50 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v49 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen @@ -26408,7 +26400,7 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v48 ; SI-NEXT: v_or_b32_e32 v9, v9, v10 ; SI-NEXT: v_add_i32_e32 v10, 
vcc, 52, v0 ; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen @@ -26420,7 +26412,7 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -26432,7 +26424,7 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -26444,7 +26436,7 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -26456,35 +26448,35 @@ define inreg <44 x i16> @bitcast_v11f64_to_v44i16_scalar(<11 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr28 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v11f64_to_v44i16_scalar: @@ -35515,368 +35507,413 @@ define inreg <44 x i16> @bitcast_v44f16_to_v44i16_scalar(<44 x half> inreg %a, i ; SI-LABEL: bitcast_v44f16_to_v44i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; 
SI-NEXT: v_cvt_f16_f32_e32 v39, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_cvt_f16_f32_e32 v57, v2 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_cvt_f16_f32_e32 v58, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v40, s16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v63, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v60, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; 
SI-NEXT: v_cvt_f16_f32_e32 v16, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v52, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_or_b32_e32 v26, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v24 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; 
SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_or_b32_e32 v1, v1, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v3, v3, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v6, v6, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v24, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v16, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v29 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v22, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v18, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v14, v3, v5 +; 
SI-NEXT: v_cvt_f32_f16_e32 v5, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v42 +; SI-NEXT: v_or_b32_e32 v10, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v8 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v41 +; SI-NEXT: v_or_b32_e32 v8, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v27 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_or_b32_e32 v6, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v53 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v4 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_or_b32_e32 v4, v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v33 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v9, v9, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v52 +; SI-NEXT: v_or_b32_e32 v2, v2, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_or_b32_e32 v13, v13, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_or_b32_e32 v31, v31, v53 -; SI-NEXT: 
v_lshlrev_b32_e32 v53, 16, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_or_b32_e32 v54, v12, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v58 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v60, v19, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v46 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_or_b32_e32 v34, v34, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_or_b32_e32 v40, v11, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_or_b32_e32 v58, v12, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v45 +; SI-NEXT: v_or_b32_e32 v46, v19, v9 +; SI-NEXT: v_or_b32_e32 v62, v11, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v19, v35 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_or_b32_e32 v56, v11, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v43 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_or_b32_e32 v37, v37, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_or_b32_e32 v18, v18, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v22 -; SI-NEXT: v_or_b32_e32 v21, v21, v53 -; 
SI-NEXT: v_lshlrev_b32_e32 v53, 16, v25 -; SI-NEXT: v_or_b32_e32 v24, v24, v53 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v49, 16, v49 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v27, v27, v52 -; SI-NEXT: v_or_b32_e32 v26, v26, v30 -; SI-NEXT: v_or_b32_e32 v23, v23, v29 -; SI-NEXT: v_or_b32_e32 v20, v20, v28 -; SI-NEXT: v_or_b32_e32 v39, v39, v51 -; SI-NEXT: v_or_b32_e32 v36, v36, v50 -; SI-NEXT: v_or_b32_e32 v33, v33, v49 -; SI-NEXT: v_or_b32_e32 v15, v15, v48 -; SI-NEXT: v_or_b32_e32 v11, v11, v17 -; SI-NEXT: v_or_b32_e32 v8, v8, v16 -; SI-NEXT: v_or_b32_e32 v5, v5, v12 -; SI-NEXT: v_alignbit_b32 v52, v24, v52, 16 -; SI-NEXT: v_alignbit_b32 v30, v21, v30, 16 -; SI-NEXT: v_alignbit_b32 v29, v18, v29, 16 -; SI-NEXT: v_alignbit_b32 v28, v37, v28, 16 -; SI-NEXT: v_alignbit_b32 v51, v34, v51, 16 -; SI-NEXT: v_alignbit_b32 v50, v31, v50, 16 -; SI-NEXT: v_alignbit_b32 v49, v13, v49, 16 -; SI-NEXT: v_alignbit_b32 v48, v9, v48, 16 -; SI-NEXT: v_alignbit_b32 v17, v6, v17, 16 -; SI-NEXT: v_alignbit_b32 v16, v3, v16, 16 -; SI-NEXT: v_alignbit_b32 v12, v1, v12, 16 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v45, v12, v7 +; SI-NEXT: v_or_b32_e32 v12, v19, v3 +; SI-NEXT: v_or_b32_e32 v43, v11, v5 +; SI-NEXT: v_or_b32_e32 v11, v20, v1 +; SI-NEXT: v_lshr_b64 v[29:30], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 +; SI-NEXT: v_mov_b32_e32 v35, v12 +; SI-NEXT: v_mov_b32_e32 v33, v11 +; SI-NEXT: v_lshr_b64 v[30:31], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[19:20], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[11:12], v[1:2], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_or_b32_e32 v27, v27, v52 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v30 -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v29 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v21, v21, 
v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v28 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v51 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v35 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v50 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v48 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v54 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v48 -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v13, v13, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 56, v0 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v9, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v17 -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; 
SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v9, v9, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 64, v0 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v12 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v29 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v60 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v44 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v45 +; SI-NEXT: v_lshlrev_b32_e32 v3, 
16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v43 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v53 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll index 152a48bec2636..c35e183fa787f 100644 --- 
a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll @@ -3189,289 +3189,301 @@ define inreg <48 x i16> @bitcast_v24i32_to_v48i16_scalar(<24 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v24i32_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v12, s30, 0 +; SI-NEXT: v_writelane_b32 v12, s31, 1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_readfirstlane_b32 s15, v1 -; SI-NEXT: v_readfirstlane_b32 s14, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v3 -; SI-NEXT: v_readfirstlane_b32 s12, v4 -; SI-NEXT: v_readfirstlane_b32 s11, v5 -; SI-NEXT: v_readfirstlane_b32 s10, v6 -; SI-NEXT: v_readfirstlane_b32 s9, v7 -; SI-NEXT: v_readfirstlane_b32 s8, v8 -; SI-NEXT: v_readfirstlane_b32 s7, v9 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: v_writelane_b32 v12, s34, 2 +; SI-NEXT: v_readfirstlane_b32 s12, v1 +; SI-NEXT: v_readfirstlane_b32 s13, v2 +; SI-NEXT: v_readfirstlane_b32 s10, v3 +; SI-NEXT: v_readfirstlane_b32 s11, v4 +; SI-NEXT: v_readfirstlane_b32 s8, v5 +; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: v_readfirstlane_b32 s7, v8 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: s_and_b64 s[14:15], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v10 +; SI-NEXT: v_writelane_b32 v12, s35, 3 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s28 -; SI-NEXT: v_mov_b32_e32 v7, s26 -; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s22 -; SI-NEXT: v_mov_b32_e32 v10, s20 -; SI-NEXT: v_mov_b32_e32 v11, s18 -; SI-NEXT: v_mov_b32_e32 v12, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s29, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s27, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s25, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s23, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s21, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s19, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s17, v12, 16 -; SI-NEXT: s_lshr_b32 s40, s6, 16 -; SI-NEXT: s_lshr_b32 s41, s8, 16 -; SI-NEXT: s_lshr_b32 s42, s10, 16 -; SI-NEXT: s_lshr_b32 s43, s12, 16 -; SI-NEXT: s_lshr_b32 s44, s14, 16 -; SI-NEXT: s_lshr_b32 s45, s29, 16 -; SI-NEXT: s_lshr_b32 s46, s27, 16 -; SI-NEXT: s_lshr_b32 s47, s25, 16 -; SI-NEXT: s_lshr_b32 s56, s23, 16 -; SI-NEXT: s_lshr_b32 s57, s21, 16 -; SI-NEXT: s_lshr_b32 s58, s19, 16 -; SI-NEXT: s_lshr_b32 s59, s17, 16 +; SI-NEXT: s_lshr_b32 s88, s5, 16 +; SI-NEXT: s_lshr_b32 s89, s7, 16 +; SI-NEXT: s_lshr_b32 s90, s9, 16 +; SI-NEXT: s_lshr_b32 s91, s11, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 16 +; SI-NEXT: s_lshr_b32 s93, s29, 16 +; SI-NEXT: s_lshr_b32 s94, s27, 16 +; SI-NEXT: s_lshr_b32 s95, s25, 16 +; SI-NEXT: s_lshr_b32 s30, s23, 16 +; SI-NEXT: s_lshr_b32 s31, s21, 16 +; SI-NEXT: s_lshr_b32 s34, s19, 16 +; SI-NEXT: s_lshr_b32 s35, s17, 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 16 +; SI-NEXT: s_lshr_b64 
s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s14, s14, 3 ; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s28 -; SI-NEXT: v_mov_b32_e32 v7, s26 -; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s22 -; SI-NEXT: v_mov_b32_e32 v10, s20 -; SI-NEXT: v_mov_b32_e32 v11, s18 -; SI-NEXT: v_mov_b32_e32 v12, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s29, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s27, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s25, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s23, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s21, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s19, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s17, v12, 16 -; SI-NEXT: s_lshr_b32 s40, s6, 16 -; SI-NEXT: s_lshr_b32 s41, s8, 16 -; SI-NEXT: s_lshr_b32 s42, s10, 16 -; SI-NEXT: s_lshr_b32 s43, s12, 16 -; SI-NEXT: s_lshr_b32 s44, s14, 16 -; SI-NEXT: s_lshr_b32 s45, s29, 16 -; SI-NEXT: s_lshr_b32 s46, s27, 16 -; SI-NEXT: s_lshr_b32 s47, s25, 16 -; SI-NEXT: s_lshr_b32 s56, s23, 16 -; SI-NEXT: s_lshr_b32 s57, s21, 16 -; SI-NEXT: s_lshr_b32 s58, s19, 16 -; SI-NEXT: s_lshr_b32 s59, s17, 16 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 +; SI-NEXT: s_lshr_b32 s88, s5, 16 +; SI-NEXT: s_lshr_b32 s89, s7, 16 +; SI-NEXT: s_lshr_b32 s90, s9, 16 +; SI-NEXT: s_lshr_b32 s91, 
s11, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 16 +; SI-NEXT: s_lshr_b32 s93, s29, 16 +; SI-NEXT: s_lshr_b32 s94, s27, 16 +; SI-NEXT: s_lshr_b32 s95, s25, 16 +; SI-NEXT: s_lshr_b32 s30, s23, 16 +; SI-NEXT: s_lshr_b32 s31, s21, 16 +; SI-NEXT: s_lshr_b32 s34, s19, 16 +; SI-NEXT: s_lshr_b32 s35, s17, 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[16:17], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s59, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s58, 16 -; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v12, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_lshl_b32 s15, s76, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_mov_b32_e32 v1, s15 +; SI-NEXT: s_and_b32 s15, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s35, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_lshl_b32 s15, s74, 16 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: s_and_b32 s15, s19, 0xffff +; SI-NEXT: s_lshl_b32 s16, s34, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; SI-NEXT: s_lshl_b32 s5, s57, 16 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s20, 0xffff +; SI-NEXT: s_lshl_b32 s16, s72, 16 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s56, 16 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s21, 0xffff +; SI-NEXT: s_lshl_b32 s16, s31, 16 +; SI-NEXT: 
v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s47, 16 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s22, 0xffff +; SI-NEXT: s_lshl_b32 s16, s62, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s46, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v8, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s23, 0xffff +; SI-NEXT: s_lshl_b32 s16, s30, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s45, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s24, 0xffff +; SI-NEXT: s_lshl_b32 s16, s60, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s14, 0xffff -; SI-NEXT: s_lshl_b32 s5, s44, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s25, 0xffff +; SI-NEXT: s_lshl_b32 s16, s95, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s12, 0xffff -; SI-NEXT: s_lshl_b32 s5, s43, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s26, 0xffff +; 
SI-NEXT: s_lshl_b32 s16, s58, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s42, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s16, s94, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s41, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s28, 0xffff +; SI-NEXT: s_lshl_b32 s16, s56, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s40, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s29, 0xffff +; SI-NEXT: s_lshl_b32 s16, s93, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s15, s46, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s12, s12, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_lshl_b32 s13, s92, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff +; SI-NEXT: s_lshl_b32 s11, s91, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s42, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 
v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s90, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s40, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s89, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s14, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s88, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s35, v12, 3 +; SI-NEXT: v_readlane_b32 s34, v12, 2 +; SI-NEXT: v_readlane_b32 s31, v12, 1 +; SI-NEXT: v_readlane_b32 s30, v12, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr95 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr93 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr91 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v24i32_to_v48i16_scalar: @@ -5100,88 +5112,88 @@ define inreg <24 x i32> 
@bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v56, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v39, v14 -; SI-NEXT: v_mov_b32_e32 v48, v12 -; SI-NEXT: v_mov_b32_e32 v49, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v31, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v50, v6 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; 
SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v7, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v9, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v10, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_or_b32_e32 v11, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_or_b32_e32 v12, v0, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_or_b32_e32 v13, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; SI-NEXT: v_or_b32_e32 v19, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_or_b32_e32 v20, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff @@ -5191,13 +5203,13 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v22, v0, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v58 -; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v57 +; SI-NEXT: v_or_b32_e32 v23, v0, v25 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -5207,60 +5219,60 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 @@ -5291,17 +5303,17 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, 
s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -5309,7 +5321,7 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -5339,66 +5351,64 @@ define inreg <24 x i32> @bitcast_v48i16_to_v24i32_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v25 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: v_mov_b32_e32 v46, v27 -; SI-NEXT: v_mov_b32_e32 v47, v33 -; SI-NEXT: v_mov_b32_e32 v56, v32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v39, v37 -; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v48 -; SI-NEXT: v_mov_b32_e32 v48, v38 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: v_mov_b32_e32 v36, v24 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v39 -; SI-NEXT: v_mov_b32_e32 v39, v49 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v25, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v57 +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v39 +; SI-NEXT: v_mov_b32_e32 v39, v34 +; SI-NEXT: v_mov_b32_e32 v34, v30 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v52 +; SI-NEXT: v_mov_b32_e32 v52, v48 +; SI-NEXT: v_mov_b32_e32 v48, v35 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v57 ; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: v_mov_b32_e32 v41, v42 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v24, v36 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v38, v48 -; SI-NEXT: v_mov_b32_e32 v48, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: s_waitcnt vmcnt(2) 
+; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v26 +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v37, v24 +; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v33, v47 -; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v56, v40 +; SI-NEXT: v_mov_b32_e32 v40, v38 +; SI-NEXT: v_mov_b32_e32 v38, v31 +; SI-NEXT: v_mov_b32_e32 v43, v25 +; SI-NEXT: v_mov_b32_e32 v44, v27 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v26, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v34 +; SI-NEXT: v_mov_b32_e32 v34, v39 +; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v25, v43 +; SI-NEXT: v_mov_b32_e32 v31, v38 +; SI-NEXT: v_mov_b32_e32 v38, v40 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v24, v37 +; SI-NEXT: v_mov_b32_e32 v37, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: v_mov_b32_e32 v28, v35 +; SI-NEXT: v_mov_b32_e32 v35, v48 +; SI-NEXT: v_mov_b32_e32 v48, v52 +; SI-NEXT: v_mov_b32_e32 v52, v47 ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v48i16_to_v24i32_scalar: @@ -12563,180 +12573,186 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_mov_b32_e32 v26, s16 +; SI-NEXT: v_mov_b32_e32 v23, s16 ; SI-NEXT: v_mov_b32_e32 v24, s17 -; SI-NEXT: v_mov_b32_e32 v23, s18 +; SI-NEXT: v_mov_b32_e32 v21, s18 ; SI-NEXT: v_mov_b32_e32 v22, s19 -; SI-NEXT: v_mov_b32_e32 v20, s20 -; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_mov_b32_e32 v18, s22 -; SI-NEXT: v_mov_b32_e32 v15, s23 -; SI-NEXT: v_mov_b32_e32 v17, s24 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_mov_b32_e32 v17, s22 +; SI-NEXT: v_mov_b32_e32 v18, s23 +; SI-NEXT: v_mov_b32_e32 v15, s24 ; SI-NEXT: v_mov_b32_e32 v16, s25 -; SI-NEXT: v_mov_b32_e32 v14, s26 -; SI-NEXT: v_mov_b32_e32 v13, s27 -; SI-NEXT: v_mov_b32_e32 v12, s28 -; SI-NEXT: v_mov_b32_e32 v11, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v13, s26 +; SI-NEXT: v_mov_b32_e32 v14, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v11, s28 +; SI-NEXT: v_mov_b32_e32 v12, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; 
SI-NEXT: v_alignbit_b32 v21, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v25, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v27, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v31, v11, v12, 16 -; SI-NEXT: v_alignbit_b32 v34, v13, v14, 16 -; SI-NEXT: v_alignbit_b32 v36, v16, v17, 16 -; SI-NEXT: v_alignbit_b32 v38, v15, v18, 16 -; SI-NEXT: v_alignbit_b32 v48, v19, v20, 16 -; SI-NEXT: v_alignbit_b32 v51, v22, v23, 16 -; SI-NEXT: v_alignbit_b32 v53, v24, v26, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_lshr_b64 v[30:31], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[23:24], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[11:12], 16 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 ; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 ; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 ; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v10, 1.0, 
v10 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_alignbit_b32 v21, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v25, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v27, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v31, v11, v12, 16 -; SI-NEXT: v_alignbit_b32 v34, v13, v14, 16 -; SI-NEXT: v_alignbit_b32 v36, v16, v17, 16 -; SI-NEXT: v_alignbit_b32 v38, v15, v18, 16 -; SI-NEXT: v_alignbit_b32 v48, v19, v20, 16 -; SI-NEXT: v_alignbit_b32 v51, v22, v23, 16 -; SI-NEXT: v_alignbit_b32 v53, v24, v26, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; SI-NEXT: v_lshr_b64 v[26:27], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[27:28], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[28:29], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[19:20], 16 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_lshr_b64 v[29:30], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: v_or_b32_e32 v26, v26, v53 -; SI-NEXT: buffer_store_dword v26, v0, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v40 -; SI-NEXT: v_or_b32_e32 v24, v24, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v24, v26, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v37 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v31 +; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v51 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v42 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 8, v0 +; SI-NEXT: 
v_add_i32_e32 v24, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v55 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v36 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v48 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v41 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v35 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v52 -; SI-NEXT: v_or_b32_e32 v15, v15, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 +; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v15, v18, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v54 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 -; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: 
buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 +; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen @@ -12748,7 +12764,7 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -12760,7 +12776,7 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -12772,62 +12788,64 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; 
SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v24f32_to_v48i16_scalar: @@ -14442,88 +14460,88 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v56, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v39, v14 -; SI-NEXT: v_mov_b32_e32 v48, v12 -; SI-NEXT: v_mov_b32_e32 v49, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v31, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 
+; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v50, v6 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v7, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v9, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v10, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_or_b32_e32 v11, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_or_b32_e32 v12, v0, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_or_b32_e32 v13, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; 
SI-NEXT: v_or_b32_e32 v19, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_or_b32_e32 v20, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff @@ -14533,13 +14551,13 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v22, v0, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v58 -; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v57 +; SI-NEXT: v_or_b32_e32 v23, v0, v25 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -14549,60 +14567,60 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: 
v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 @@ -14633,17 +14651,17 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -14651,7 +14669,7 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -14681,66 +14699,64 @@ define inreg <24 x float> @bitcast_v48i16_to_v24f32_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v25 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: v_mov_b32_e32 v46, v27 -; SI-NEXT: v_mov_b32_e32 v47, v33 -; SI-NEXT: v_mov_b32_e32 v56, v32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v39, v37 -; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v48 -; SI-NEXT: v_mov_b32_e32 v48, v38 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: 
v_mov_b32_e32 v36, v24 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v39 -; SI-NEXT: v_mov_b32_e32 v39, v49 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v25, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v57 +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v39 +; SI-NEXT: v_mov_b32_e32 v39, v34 +; SI-NEXT: v_mov_b32_e32 v34, v30 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v52 +; SI-NEXT: v_mov_b32_e32 v52, v48 +; SI-NEXT: v_mov_b32_e32 v48, v35 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v57 ; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: v_mov_b32_e32 v41, v42 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v24, v36 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v38, v48 -; SI-NEXT: v_mov_b32_e32 v48, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v26 +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v37, v24 +; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v33, v47 -; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v56, v40 +; SI-NEXT: v_mov_b32_e32 v40, v38 +; SI-NEXT: v_mov_b32_e32 v38, v31 +; SI-NEXT: v_mov_b32_e32 v43, v25 +; SI-NEXT: v_mov_b32_e32 v44, v27 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v26, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v34 +; SI-NEXT: v_mov_b32_e32 v34, v39 +; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v25, v43 +; SI-NEXT: v_mov_b32_e32 v31, v38 +; SI-NEXT: v_mov_b32_e32 v38, v40 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 
v56, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v24, v37 +; SI-NEXT: v_mov_b32_e32 v37, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: v_mov_b32_e32 v28, v35 +; SI-NEXT: v_mov_b32_e32 v35, v48 +; SI-NEXT: v_mov_b32_e32 v48, v52 +; SI-NEXT: v_mov_b32_e32 v52, v47 ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v48i16_to_v24f32_scalar: @@ -21132,289 +21148,301 @@ define inreg <48 x i16> @bitcast_v12i64_to_v48i16_scalar(<12 x i64> inreg %a, i3 ; SI-LABEL: bitcast_v12i64_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v12, s30, 0 +; SI-NEXT: v_writelane_b32 v12, s31, 1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; SI-NEXT: v_readfirstlane_b32 s15, v1 -; SI-NEXT: v_readfirstlane_b32 s14, v2 -; SI-NEXT: v_readfirstlane_b32 s13, v3 -; SI-NEXT: v_readfirstlane_b32 s12, v4 -; SI-NEXT: v_readfirstlane_b32 s11, v5 -; SI-NEXT: v_readfirstlane_b32 s10, v6 -; SI-NEXT: v_readfirstlane_b32 s9, v7 -; SI-NEXT: v_readfirstlane_b32 s8, v8 -; SI-NEXT: v_readfirstlane_b32 s7, v9 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v10 +; SI-NEXT: v_writelane_b32 v12, s34, 2 +; SI-NEXT: v_readfirstlane_b32 s12, v1 +; SI-NEXT: v_readfirstlane_b32 s13, v2 +; SI-NEXT: v_readfirstlane_b32 s10, v3 +; SI-NEXT: v_readfirstlane_b32 s11, v4 +; SI-NEXT: v_readfirstlane_b32 s8, v5 +; SI-NEXT: v_readfirstlane_b32 s9, v6 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: v_readfirstlane_b32 s7, v8 +; SI-NEXT: v_readfirstlane_b32 s4, v9 +; SI-NEXT: s_and_b64 s[14:15], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v10 +; SI-NEXT: v_writelane_b32 v12, s35, 3 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s28 -; SI-NEXT: v_mov_b32_e32 v7, s26 -; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s22 -; SI-NEXT: v_mov_b32_e32 v10, s20 -; SI-NEXT: v_mov_b32_e32 v11, s18 -; SI-NEXT: v_mov_b32_e32 v12, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s29, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s27, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s25, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s23, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s21, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s19, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s17, v12, 16 -; SI-NEXT: s_lshr_b32 s40, s6, 16 -; SI-NEXT: s_lshr_b32 s41, s8, 16 -; SI-NEXT: s_lshr_b32 s42, s10, 16 -; SI-NEXT: s_lshr_b32 s43, s12, 16 -; SI-NEXT: s_lshr_b32 s44, s14, 16 -; SI-NEXT: s_lshr_b32 s45, s29, 16 -; SI-NEXT: s_lshr_b32 s46, s27, 16 -; SI-NEXT: s_lshr_b32 s47, s25, 16 -; SI-NEXT: s_lshr_b32 s56, s23, 16 -; SI-NEXT: s_lshr_b32 s57, s21, 16 -; SI-NEXT: s_lshr_b32 s58, s19, 16 -; SI-NEXT: s_lshr_b32 s59, s17, 16 +; SI-NEXT: s_lshr_b32 s88, s5, 16 +; SI-NEXT: s_lshr_b32 s89, s7, 16 +; SI-NEXT: s_lshr_b32 s90, s9, 16 +; SI-NEXT: s_lshr_b32 s91, s11, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 16 +; SI-NEXT: s_lshr_b32 s93, s29, 16 +; SI-NEXT: s_lshr_b32 s94, s27, 16 +; SI-NEXT: s_lshr_b32 s95, s25, 16 
+; SI-NEXT: s_lshr_b32 s30, s23, 16 +; SI-NEXT: s_lshr_b32 s31, s21, 16 +; SI-NEXT: s_lshr_b32 s34, s19, 16 +; SI-NEXT: s_lshr_b32 s35, s17, 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_add_u32 s28, s28, 3 ; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s15, s15, 3 -; SI-NEXT: s_addc_u32 s14, s14, 0 -; SI-NEXT: s_add_u32 s13, s13, 3 -; SI-NEXT: s_addc_u32 s12, s12, 0 -; SI-NEXT: s_add_u32 s11, s11, 3 -; SI-NEXT: s_addc_u32 s10, s10, 0 -; SI-NEXT: s_add_u32 s9, s9, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s6, s6, 0 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s28 -; SI-NEXT: v_mov_b32_e32 v7, s26 -; SI-NEXT: v_mov_b32_e32 v8, s24 -; SI-NEXT: v_mov_b32_e32 v9, s22 -; SI-NEXT: v_mov_b32_e32 v10, s20 -; SI-NEXT: v_mov_b32_e32 v11, s18 -; SI-NEXT: v_mov_b32_e32 v12, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s29, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s27, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s25, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s23, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s21, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s19, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s17, v12, 16 -; SI-NEXT: s_lshr_b32 s40, s6, 16 -; SI-NEXT: s_lshr_b32 s41, s8, 16 -; SI-NEXT: s_lshr_b32 s42, s10, 16 -; SI-NEXT: s_lshr_b32 s43, s12, 16 -; SI-NEXT: s_lshr_b32 s44, s14, 16 -; SI-NEXT: s_lshr_b32 s45, s29, 16 -; SI-NEXT: s_lshr_b32 s46, s27, 16 -; SI-NEXT: s_lshr_b32 s47, s25, 16 -; SI-NEXT: s_lshr_b32 s56, s23, 16 -; SI-NEXT: s_lshr_b32 s57, s21, 16 -; SI-NEXT: s_lshr_b32 s58, s19, 16 -; SI-NEXT: s_lshr_b32 s59, s17, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; 
SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s88, s5, 16 +; SI-NEXT: s_lshr_b32 s89, s7, 16 +; SI-NEXT: s_lshr_b32 s90, s9, 16 +; SI-NEXT: s_lshr_b32 s91, s11, 16 +; SI-NEXT: s_lshr_b32 s92, s13, 16 +; SI-NEXT: s_lshr_b32 s93, s29, 16 +; SI-NEXT: s_lshr_b32 s94, s27, 16 +; SI-NEXT: s_lshr_b32 s95, s25, 16 +; SI-NEXT: s_lshr_b32 s30, s23, 16 +; SI-NEXT: s_lshr_b32 s31, s21, 16 +; SI-NEXT: s_lshr_b32 s34, s19, 16 +; SI-NEXT: s_lshr_b32 s35, s17, 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[16:17], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s59, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; SI-NEXT: buffer_store_dword v12, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s58, 16 -; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v12, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: s_lshl_b32 s15, s76, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_mov_b32_e32 v1, s15 +; SI-NEXT: s_and_b32 s15, s17, 0xffff +; SI-NEXT: s_lshl_b32 s16, s35, 16 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_lshl_b32 s15, s74, 16 +; SI-NEXT: s_and_b32 s16, s18, 0xffff +; SI-NEXT: s_or_b32 s15, s16, s15 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: s_and_b32 s15, s19, 0xffff +; SI-NEXT: s_lshl_b32 s16, s34, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; SI-NEXT: s_lshl_b32 s5, s57, 16 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s20, 0xffff +; SI-NEXT: s_lshl_b32 s16, s72, 16 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s56, 16 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s21, 0xffff +; SI-NEXT: s_lshl_b32 s16, s31, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s47, 16 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s22, 0xffff +; SI-NEXT: s_lshl_b32 s16, s62, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s46, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v8, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s23, 0xffff +; SI-NEXT: s_lshl_b32 s16, s30, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s45, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s24, 0xffff +; SI-NEXT: s_lshl_b32 s16, s60, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s14, 0xffff -; SI-NEXT: s_lshl_b32 s5, s44, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s25, 0xffff +; SI-NEXT: s_lshl_b32 s16, s95, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s15, 
s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s12, 0xffff -; SI-NEXT: s_lshl_b32 s5, s43, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s26, 0xffff +; SI-NEXT: s_lshl_b32 s16, s58, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s42, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s27, 0xffff +; SI-NEXT: s_lshl_b32 s16, s94, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s41, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s28, 0xffff +; SI-NEXT: s_lshl_b32 s16, s56, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s40, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s15, s29, 0xffff +; SI-NEXT: s_lshl_b32 s16, s93, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s15, s15, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s15 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s15, s46, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s12, s12, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_lshl_b32 s13, s92, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s44, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v2, v1, 
s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff +; SI-NEXT: s_lshl_b32 s11, s91, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s42, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s90, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s40, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s89, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s14, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s88, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s35, v12, 3 +; SI-NEXT: v_readlane_b32 s34, v12, 2 +; SI-NEXT: v_readlane_b32 s31, v12, 1 +; SI-NEXT: v_readlane_b32 s30, v12, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr62 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr60 +; SI-NEXT: ; implicit-def: $sgpr95 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr94 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr93 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr44 -; 
SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr91 ; SI-NEXT: ; implicit-def: $sgpr42 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr41 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr90 ; SI-NEXT: ; implicit-def: $sgpr40 +; SI-NEXT: ; implicit-def: $sgpr89 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr88 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v12i64_to_v48i16_scalar: @@ -23043,88 +23071,88 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v56, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v39, v14 -; SI-NEXT: v_mov_b32_e32 v48, v12 -; SI-NEXT: v_mov_b32_e32 v49, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v31, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v50, v6 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, 
exec ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v7, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v9, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v10, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_or_b32_e32 v11, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_or_b32_e32 v12, v0, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_or_b32_e32 v13, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v16, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; SI-NEXT: v_or_b32_e32 v19, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_or_b32_e32 v20, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff @@ -23134,13 +23162,13 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v22, v0, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v58 -; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v57 +; SI-NEXT: v_or_b32_e32 v23, v0, v25 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -23150,60 +23178,60 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: 
.LBB43_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 @@ -23234,17 +23262,17 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; 
SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -23252,7 +23280,7 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -23282,66 +23310,64 @@ define inreg <12 x i64> @bitcast_v48i16_to_v12i64_scalar(<48 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v25 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: v_mov_b32_e32 v46, v27 -; SI-NEXT: v_mov_b32_e32 v47, v33 -; SI-NEXT: v_mov_b32_e32 v56, v32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v39, v37 -; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v48 -; SI-NEXT: v_mov_b32_e32 v48, v38 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: v_mov_b32_e32 v36, v24 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v39 -; SI-NEXT: v_mov_b32_e32 v39, v49 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v25, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v57 +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v39 +; SI-NEXT: v_mov_b32_e32 v39, v34 +; SI-NEXT: v_mov_b32_e32 v34, v30 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v52 +; SI-NEXT: v_mov_b32_e32 v52, v48 +; SI-NEXT: v_mov_b32_e32 v48, v35 +; SI-NEXT: 
v_mov_b32_e32 v35, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v57 ; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: v_mov_b32_e32 v41, v42 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v24, v36 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v38, v48 -; SI-NEXT: v_mov_b32_e32 v48, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v26 +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v37, v24 +; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v33, v47 -; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v56, v40 +; SI-NEXT: v_mov_b32_e32 v40, v38 +; SI-NEXT: v_mov_b32_e32 v38, v31 +; SI-NEXT: v_mov_b32_e32 v43, v25 +; SI-NEXT: v_mov_b32_e32 v44, v27 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v26, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v34 +; SI-NEXT: v_mov_b32_e32 v34, v39 +; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v25, v43 +; SI-NEXT: v_mov_b32_e32 v31, v38 +; SI-NEXT: v_mov_b32_e32 v38, v40 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v24, v37 +; SI-NEXT: v_mov_b32_e32 v37, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: v_mov_b32_e32 v28, v35 +; SI-NEXT: v_mov_b32_e32 v35, v48 +; SI-NEXT: v_mov_b32_e32 v48, v52 +; SI-NEXT: v_mov_b32_e32 v52, v47 ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v48i16_to_v12i64_scalar: @@ -28937,153 +28963,159 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v11, s28 ; SI-NEXT: v_mov_b32_e32 v12, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v27, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v35, v16, v15, 16 
-; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v50, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v53, v24, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[26:27], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 +; SI-NEXT: v_lshr_b64 v[30:31], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[23:24], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_alignbit_b32 v25, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v26, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v27, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v28, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v29, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v33, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v35, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v38, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v48, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v50, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v53, v24, v23, 16 -; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v22 -; SI-NEXT: s_waitcnt expcnt(0) 
-; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v24 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_lshr_b64 v[25:26], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[11:12], 16 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_lshr_b64 v[26:27], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_lshr_b64 v[27:28], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshr_b64 v[28:29], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v24 ; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v37 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: v_or_b32_e32 v23, v23, v53 +; SI-NEXT: v_or_b32_e32 v23, v23, v31 ; SI-NEXT: buffer_store_dword v23, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v42 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v36 ; SI-NEXT: v_or_b32_e32 v21, v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v41 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v35 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v40 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 
; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v54 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v30 ; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v53 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v11 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 ; SI-NEXT: v_or_b32_e32 v11, v11, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v11, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 ; SI-NEXT: v_or_b32_e32 v11, v11, v12 ; SI-NEXT: v_add_i32_e32 v12, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen @@ -29095,7 +29127,7 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v11, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -29107,7 +29139,7 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -29119,7 +29151,7 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -29131,7 +29163,7 @@ define inreg <48 x i16> 
@bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -29143,38 +29175,40 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr54 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr52 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr31 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v12f64_to_v48i16_scalar: @@ -30765,88 +30799,88 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v4 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v2 +; SI-NEXT: v_mov_b32_e32 v56, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:4 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; 
SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_mov_b32_e32 v38, v16 -; SI-NEXT: v_mov_b32_e32 v39, v14 -; SI-NEXT: v_mov_b32_e32 v48, v12 -; SI-NEXT: v_mov_b32_e32 v49, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v1 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v13 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:4 +; SI-NEXT: v_mov_b32_e32 v31, v22 +; SI-NEXT: v_mov_b32_e32 v34, v20 +; SI-NEXT: v_mov_b32_e32 v35, v18 +; SI-NEXT: v_mov_b32_e32 v36, v16 +; SI-NEXT: v_mov_b32_e32 v37, v14 +; SI-NEXT: v_mov_b32_e32 v38, v12 +; SI-NEXT: v_mov_b32_e32 v39, v10 +; SI-NEXT: v_mov_b32_e32 v48, v8 +; SI-NEXT: v_mov_b32_e32 v50, v6 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v13 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v27 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v2 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v4 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v40 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v41 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v12, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v7, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v9, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v10, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v13, v0, v61 +; SI-NEXT: v_or_b32_e32 v11, v0, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_or_b32_e32 v12, v0, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v15, v0, v44 +; SI-NEXT: v_or_b32_e32 v13, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v14, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v15, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: 
v_or_b32_e32 v16, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v16, v0, v58 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v17, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_or_b32_e32 v17, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v33 +; SI-NEXT: v_or_b32_e32 v19, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v27 +; SI-NEXT: v_or_b32_e32 v20, v0, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff @@ -30856,13 +30890,13 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v22, v0, v25 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v22, v0, v27 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v58 -; SI-NEXT: v_or_b32_e32 v23, v0, v34 +; SI-NEXT: v_or_b32_e32 v8, v1, v57 +; SI-NEXT: v_or_b32_e32 v23, v0, v25 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -30872,60 +30906,60 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_or_b32_e32 v0, v40, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; 
SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v41, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v24 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v33, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 @@ -30956,17 +30990,17 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v25, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v63 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v58, v1 +; SI-NEXT: v_or_b32_e32 v1, v57, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -30974,7 +31008,7 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v34, v0 +; SI-NEXT: v_or_b32_e32 v0, v25, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -31004,66 +31038,64 @@ define inreg <12 x double> @bitcast_v48i16_to_v12f64_scalar(<48 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: v_mov_b32_e32 v43, v34 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v44, v25 -; SI-NEXT: v_mov_b32_e32 
v45, v29 -; SI-NEXT: v_mov_b32_e32 v46, v27 -; SI-NEXT: v_mov_b32_e32 v47, v33 -; SI-NEXT: v_mov_b32_e32 v56, v32 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v40, v49 -; SI-NEXT: v_mov_b32_e32 v49, v39 -; SI-NEXT: v_mov_b32_e32 v39, v37 -; SI-NEXT: v_mov_b32_e32 v37, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v48 -; SI-NEXT: v_mov_b32_e32 v48, v38 -; SI-NEXT: v_mov_b32_e32 v38, v36 -; SI-NEXT: v_mov_b32_e32 v36, v24 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v42, v41 -; SI-NEXT: v_mov_b32_e32 v41, v57 -; SI-NEXT: v_mov_b32_e32 v57, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v37 -; SI-NEXT: v_mov_b32_e32 v37, v39 -; SI-NEXT: v_mov_b32_e32 v39, v49 -; SI-NEXT: v_mov_b32_e32 v49, v40 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v25, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v31, v57 +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v46, v51 +; SI-NEXT: v_mov_b32_e32 v51, v39 +; SI-NEXT: v_mov_b32_e32 v39, v34 +; SI-NEXT: v_mov_b32_e32 v34, v30 +; SI-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v52 +; SI-NEXT: v_mov_b32_e32 v52, v48 +; SI-NEXT: v_mov_b32_e32 v48, v35 +; SI-NEXT: v_mov_b32_e32 v35, v28 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v55, v57 ; SI-NEXT: v_mov_b32_e32 v57, v41 -; SI-NEXT: v_mov_b32_e32 v41, v42 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v24, v36 -; SI-NEXT: v_mov_b32_e32 v36, v38 -; SI-NEXT: v_mov_b32_e32 v38, v48 -; SI-NEXT: v_mov_b32_e32 v48, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v32 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v41, v49 +; SI-NEXT: v_mov_b32_e32 v49, v36 +; SI-NEXT: v_mov_b32_e32 v36, v26 +; SI-NEXT: v_mov_b32_e32 v42, v50 +; SI-NEXT: v_mov_b32_e32 v50, v37 +; SI-NEXT: v_mov_b32_e32 v37, v24 +; SI-NEXT: v_mov_b32_e32 v33, v32 ; SI-NEXT: v_mov_b32_e32 v32, v56 -; SI-NEXT: v_mov_b32_e32 v33, v47 -; SI-NEXT: v_mov_b32_e32 v27, v46 +; SI-NEXT: v_mov_b32_e32 v56, v40 +; SI-NEXT: v_mov_b32_e32 v40, v38 +; SI-NEXT: v_mov_b32_e32 v38, v31 +; SI-NEXT: v_mov_b32_e32 v43, v25 +; SI-NEXT: v_mov_b32_e32 v44, v27 +; SI-NEXT: v_mov_b32_e32 v53, v45 +; SI-NEXT: v_mov_b32_e32 v45, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 
v26, v36 +; SI-NEXT: v_mov_b32_e32 v36, v49 +; SI-NEXT: v_mov_b32_e32 v49, v41 +; SI-NEXT: v_mov_b32_e32 v41, v57 +; SI-NEXT: v_mov_b32_e32 v57, v55 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v34 +; SI-NEXT: v_mov_b32_e32 v34, v39 +; SI-NEXT: v_mov_b32_e32 v39, v51 +; SI-NEXT: v_mov_b32_e32 v51, v46 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v34, v43 +; SI-NEXT: v_mov_b32_e32 v45, v53 +; SI-NEXT: v_mov_b32_e32 v27, v44 +; SI-NEXT: v_mov_b32_e32 v25, v43 +; SI-NEXT: v_mov_b32_e32 v31, v38 +; SI-NEXT: v_mov_b32_e32 v38, v40 +; SI-NEXT: v_mov_b32_e32 v40, v56 +; SI-NEXT: v_mov_b32_e32 v56, v32 +; SI-NEXT: v_mov_b32_e32 v32, v33 +; SI-NEXT: v_mov_b32_e32 v24, v37 +; SI-NEXT: v_mov_b32_e32 v37, v50 +; SI-NEXT: v_mov_b32_e32 v50, v42 +; SI-NEXT: v_mov_b32_e32 v28, v35 +; SI-NEXT: v_mov_b32_e32 v35, v48 +; SI-NEXT: v_mov_b32_e32 v48, v52 +; SI-NEXT: v_mov_b32_e32 v52, v47 ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v48i16_to_v12f64_scalar: @@ -38404,13 +38436,13 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v54, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v47 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v57 ; SI-NEXT: s_waitcnt vmcnt(13) @@ -38455,74 +38487,83 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v29, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v60 +; SI-NEXT: v_mov_b32_e32 v35, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v43 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v29 -; SI-NEXT: v_or_b32_e32 v60, v30, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v26 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v30 -; SI-NEXT: v_or_b32_e32 v58, v33, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v40 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; SI-NEXT: v_or_b32_e32 v60, v28, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v35 -; SI-NEXT: 
v_cvt_f32_f16_e32 v35, v52 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v40 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v54 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v52 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v54 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v52, v35, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v25 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v57 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v59 -; SI-NEXT: v_or_b32_e32 v57, v33, v23 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v34 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v31 +; SI-NEXT: v_or_b32_e32 v52, v32, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v35 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v25 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v46 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v33 +; SI-NEXT: v_or_b32_e32 v49, v31, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v61 +; SI-NEXT: v_or_b32_e32 v35, v31, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v23 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v56 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v57, v32, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v31 
+; SI-NEXT: v_cvt_f32_f16_e32 v31, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 ; SI-NEXT: v_or_b32_e32 v56, v24, v22 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v31 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 @@ -38544,7 +38585,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 @@ -38557,7 +38598,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v38 ; SI-NEXT: v_or_b32_e32 v59, v2, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 @@ -38580,121 +38621,115 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v36 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v36 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_or_b32_e32 v14, v14, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v58 ; SI-NEXT: v_or_b32_e32 v20, v20, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v18 ; SI-NEXT: v_or_b32_e32 v19, v19, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v16 ; SI-NEXT: v_or_b32_e32 v17, v17, v24 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v15 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v35 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v37 -; SI-NEXT: v_or_b32_e32 v38, v33, v24 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v32 +; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v33 +; SI-NEXT: v_or_b32_e32 v38, v31, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v42 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v36 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_or_b32_e32 v37, v35, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v51 +; SI-NEXT: v_or_b32_e32 v37, v32, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v51 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v48 +; SI-NEXT: v_or_b32_e32 v58, v30, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 ; SI-NEXT: v_add_f32_e32 
v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v48, v24 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v32 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v39 -; SI-NEXT: v_or_b32_e32 v51, v33, v35 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v48 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v39 +; SI-NEXT: v_or_b32_e32 v51, v31, v32 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v48 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v50, v24, v33 +; SI-NEXT: v_or_b32_e32 v50, v24, v31 ; SI-NEXT: v_or_b32_e32 v8, v8, v29 ; SI-NEXT: v_or_b32_e32 v7, v7, v30 ; SI-NEXT: v_or_b32_e32 v6, v6, v55 ; SI-NEXT: v_or_b32_e32 v21, v21, v45 -; SI-NEXT: v_or_b32_e32 v28, v28, v25 -; SI-NEXT: v_or_b32_e32 v27, v27, v46 -; SI-NEXT: v_alignbit_b32 v44, v50, v31, 16 -; SI-NEXT: v_alignbit_b32 v43, v51, v32, 16 +; SI-NEXT: v_alignbit_b32 v44, v50, v27, 16 +; SI-NEXT: v_alignbit_b32 v43, v51, v28, 16 ; SI-NEXT: v_alignbit_b32 v42, v37, v29, 16 +; SI-NEXT: v_mov_b32_e32 v29, v49 ; SI-NEXT: v_alignbit_b32 v41, v38, v30, 16 ; SI-NEXT: v_alignbit_b32 v40, v17, v55, 16 ; SI-NEXT: v_alignbit_b32 v55, v19, v45, 16 ; SI-NEXT: v_alignbit_b32 v54, v20, v26, 16 ; SI-NEXT: v_alignbit_b32 v26, v14, v25, 16 ; SI-NEXT: v_alignbit_b32 v25, v10, v46, 16 +; SI-NEXT: v_mov_b32_e32 v46, v35 ; SI-NEXT: v_alignbit_b32 v24, v11, v23, 16 ; SI-NEXT: v_alignbit_b32 v23, v5, v22, 16 ; SI-NEXT: v_alignbit_b32 v22, v59, v47, 16 ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v60 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v44 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: buffer_store_dword v29, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v48 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v58 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v39 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v60 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v44 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v50 +; SI-NEXT: 
v_lshlrev_b32_e32 v28, 16, v48 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v58 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v43 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v39 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v42 -; SI-NEXT: v_or_b32_e32 v8, v8, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v8, v29, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v42 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v8, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v8, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v36 -; SI-NEXT: v_or_b32_e32 v8, v8, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v8, v29, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v36 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v8, v27, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 @@ -38744,7 +38779,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v28 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v29 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v26 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 @@ -38756,7 +38791,7 @@ define <48 x i16> @bitcast_v48f16_to_v48i16(<48 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v27 +; SI-NEXT: v_and_b32_e32 v6, 0xffff, v46 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v25 ; SI-NEXT: v_or_b32_e32 v6, v6, v7 ; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 @@ -39282,428 +39317,464 @@ define inreg <48 x i16> @bitcast_v48f16_to_v48i16_scalar(<48 x half> inreg %a, i ; SI-LABEL: bitcast_v48f16_to_v48i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:72 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:8 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v42, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v59, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v57, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v46, 
s27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v17, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v16, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v52, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v50, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v29, s29 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v26, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v32 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v20 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v40, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v35 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v30 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v53 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v51 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v53 -; SI-NEXT: 
v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v29 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v50 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v57 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v59, v3, v19 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v62, v1, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v55 +; SI-NEXT: v_or_b32_e32 v57, v3, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v44 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v21 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v35, v5, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v3, v3, v13 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 
; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v53 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_or_b32_e32 v33, v5, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v32 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v32, v1, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v30 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_or_b32_e32 v29, v3, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v28, v26, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v31 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_or_b32_e32 v27, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v43 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v25 +; SI-NEXT: v_or_b32_e32 v44, v26, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v43 +; SI-NEXT: v_or_b32_e32 v2, v2, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v40 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v41, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v41 +; SI-NEXT: v_or_b32_e32 v4, v4, v25 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v40 +; SI-NEXT: v_or_b32_e32 v6, v6, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v56 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v45 +; SI-NEXT: v_or_b32_e32 v8, v8, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v47 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v60 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: 
v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v25 ; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_or_b32_e32 v2, v2, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v26 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v56 +; SI-NEXT: v_or_b32_e32 v10, v10, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v63 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v60 +; SI-NEXT: v_or_b32_e32 v14, v14, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v5, v5, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_or_b32_e32 v11, v11, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v10, v10, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_or_b32_e32 v14, v14, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v25 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_or_b32_e32 v35, v35, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v33 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_or_b32_e32 v34, v34, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v25 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 +; SI-NEXT: v_or_b32_e32 v18, v18, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v61 +; SI-NEXT: v_or_b32_e32 v22, v22, v25 +; SI-NEXT: 
v_cvt_f32_f16_e32 v25, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v42 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v26 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_or_b32_e32 v38, v38, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_or_b32_e32 v49, v49, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v17 -; SI-NEXT: v_or_b32_e32 v18, v18, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v19 -; SI-NEXT: v_or_b32_e32 v23, v23, v50 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v20 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v53 -; SI-NEXT: v_or_b32_e32 v22, v22, v50 -; SI-NEXT: v_or_b32_e32 v25, v25, v30 -; SI-NEXT: v_or_b32_e32 v24, v24, v29 -; SI-NEXT: v_or_b32_e32 v21, v21, v41 -; SI-NEXT: v_or_b32_e32 v16, v16, v28 -; SI-NEXT: v_or_b32_e32 v48, v48, v54 -; SI-NEXT: v_or_b32_e32 v39, v39, v42 -; SI-NEXT: v_or_b32_e32 v32, v32, v52 -; SI-NEXT: v_or_b32_e32 v31, v31, v51 -; SI-NEXT: v_or_b32_e32 v15, v15, v43 -; SI-NEXT: v_or_b32_e32 v8, v8, v27 -; SI-NEXT: v_or_b32_e32 v7, v7, v26 -; SI-NEXT: v_or_b32_e32 v6, v6, v44 -; SI-NEXT: v_alignbit_b32 v40, v22, v30, 16 -; SI-NEXT: v_alignbit_b32 v30, v23, v29, 16 -; SI-NEXT: v_alignbit_b32 v29, v18, v41, 16 -; SI-NEXT: v_alignbit_b32 v28, v49, v28, 16 -; SI-NEXT: v_alignbit_b32 v55, v38, v54, 16 -; SI-NEXT: v_alignbit_b32 v54, v34, v42, 16 -; SI-NEXT: v_alignbit_b32 v53, v35, v52, 16 -; SI-NEXT: v_alignbit_b32 v52, v14, v51, 16 -; SI-NEXT: v_alignbit_b32 v51, v10, v43, 16 -; SI-NEXT: v_alignbit_b32 v50, v11, v27, 16 -; SI-NEXT: v_alignbit_b32 v27, v5, v26, 16 -; SI-NEXT: v_alignbit_b32 v26, v2, v44, 16 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v25 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_or_b32_e32 v16, v16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v58 +; SI-NEXT: v_lshr_b64 v[50:51], v[15:16], 16 +; SI-NEXT: v_or_b32_e32 v20, v20, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v42 +; SI-NEXT: 
v_mov_b32_e32 v51, v29 +; SI-NEXT: v_lshr_b64 v[29:30], v[21:22], 16 +; SI-NEXT: v_or_b32_e32 v24, v24, v25 +; SI-NEXT: v_lshr_b64 v[30:31], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[25:26], v[1:2], 16 +; SI-NEXT: v_mov_b32_e32 v31, v44 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[54:55], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[13:14], 16 +; SI-NEXT: v_mov_b32_e32 v55, v35 +; SI-NEXT: v_mov_b32_e32 v53, v32 +; SI-NEXT: v_mov_b32_e32 v49, v28 +; SI-NEXT: v_mov_b32_e32 v39, v27 +; SI-NEXT: v_lshr_b64 v[36:37], v[11:12], 16 +; SI-NEXT: v_mov_b32_e32 v11, v33 +; SI-NEXT: v_lshr_b64 v[34:35], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[3:4], 16 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; SI-NEXT: v_or_b32_e32 v25, v25, v40 -; SI-NEXT: v_or_b32_e32 v20, v22, v20 -; SI-NEXT: v_add_i32_e32 v22, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v30 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v52 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v29 -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v17, v18, v17 -; SI-NEXT: v_add_i32_e32 v18, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v28 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v50 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 
v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v36 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(6) expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v55 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v38 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v39 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v54 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v38 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v53 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 -; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: v_or_b32_e32 v12, v16, v12 -; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v12, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v36 +; 
SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v52 -; SI-NEXT: v_or_b32_e32 v12, v12, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v12, v16, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v34 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v51 -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v12, v12, v13 -; SI-NEXT: v_add_i32_e32 v13, vcc, 64, v0 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v56 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v50 -; SI-NEXT: v_or_b32_e32 v8, v8, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v8, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v8, v3 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v3, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v27 -; SI-NEXT: v_or_b32_e32 v3, v3, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v3, v7, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v49 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v26 -; SI-NEXT: 
v_and_b32_e32 v2, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v40 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v41 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v31 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v25 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll index 97d040b545c09..29005a42d8860 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll +++ 
b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll @@ -3408,313 +3408,333 @@ define inreg <52 x i16> @bitcast_v26i32_to_v52i16_scalar(<26 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v26i32_to_v52i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v14, s30, 0 +; SI-NEXT: v_writelane_b32 v14, s31, 1 +; SI-NEXT: v_writelane_b32 v14, s34, 2 +; SI-NEXT: v_writelane_b32 v14, s35, 3 +; SI-NEXT: v_writelane_b32 v14, s36, 4 +; SI-NEXT: v_writelane_b32 v14, s37, 5 +; SI-NEXT: v_writelane_b32 v14, s38, 6 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_readfirstlane_b32 s41, v1 -; SI-NEXT: v_readfirstlane_b32 s40, v2 -; SI-NEXT: v_readfirstlane_b32 s15, v3 -; SI-NEXT: v_readfirstlane_b32 s14, v4 -; SI-NEXT: v_readfirstlane_b32 s13, v5 -; SI-NEXT: v_readfirstlane_b32 s12, v6 -; SI-NEXT: v_readfirstlane_b32 s11, v7 -; SI-NEXT: v_readfirstlane_b32 s10, v8 -; SI-NEXT: v_readfirstlane_b32 s9, v9 -; SI-NEXT: v_readfirstlane_b32 s8, v10 -; SI-NEXT: v_readfirstlane_b32 s7, v11 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: v_writelane_b32 v14, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s14, v1 +; SI-NEXT: v_readfirstlane_b32 s15, v2 +; SI-NEXT: v_readfirstlane_b32 s12, v3 +; SI-NEXT: v_readfirstlane_b32 s13, v4 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_readfirstlane_b32 s9, v8 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: v_readfirstlane_b32 s7, v10 +; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: s_and_b64 s[40:41], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v12 +; SI-NEXT: v_writelane_b32 v14, s48, 8 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s28 -; SI-NEXT: v_mov_b32_e32 v8, s26 -; SI-NEXT: v_mov_b32_e32 v9, s24 -; SI-NEXT: v_mov_b32_e32 v10, s22 -; SI-NEXT: v_mov_b32_e32 v11, s20 -; SI-NEXT: v_mov_b32_e32 v12, s18 -; SI-NEXT: v_mov_b32_e32 v13, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s29, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s27, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s25, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s23, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s21, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s19, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s17, v13, 16 -; SI-NEXT: s_lshr_b32 s42, s6, 16 -; SI-NEXT: s_lshr_b32 s43, s8, 16 -; SI-NEXT: s_lshr_b32 s44, s10, 16 -; SI-NEXT: s_lshr_b32 s45, s12, 16 -; SI-NEXT: s_lshr_b32 s46, s14, 16 -; SI-NEXT: s_lshr_b32 s47, s40, 16 -; SI-NEXT: s_lshr_b32 s56, s29, 16 -; SI-NEXT: s_lshr_b32 s57, s27, 16 -; SI-NEXT: s_lshr_b32 s58, s25, 16 -; SI-NEXT: s_lshr_b32 s59, s23, 16 -; SI-NEXT: s_lshr_b32 s60, s21, 16 -; SI-NEXT: s_lshr_b32 s61, s19, 16 -; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s92, s5, 16 +; SI-NEXT: s_lshr_b32 s93, s7, 16 +; SI-NEXT: s_lshr_b32 s94, s9, 16 +; SI-NEXT: 
s_lshr_b32 s95, s11, 16 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s31, s15, 16 +; SI-NEXT: s_lshr_b32 s34, s29, 16 +; SI-NEXT: s_lshr_b32 s35, s27, 16 +; SI-NEXT: s_lshr_b32 s36, s25, 16 +; SI-NEXT: s_lshr_b32 s37, s23, 16 +; SI-NEXT: s_lshr_b32 s38, s21, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s40, s40, 3 ; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s28 -; SI-NEXT: v_mov_b32_e32 v8, s26 -; SI-NEXT: v_mov_b32_e32 v9, s24 -; SI-NEXT: v_mov_b32_e32 v10, s22 -; SI-NEXT: v_mov_b32_e32 v11, s20 -; SI-NEXT: v_mov_b32_e32 v12, s18 -; SI-NEXT: v_mov_b32_e32 v13, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s29, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s27, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s25, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s23, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s21, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s19, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s17, v13, 16 -; SI-NEXT: s_lshr_b32 s42, s6, 16 -; SI-NEXT: s_lshr_b32 s43, s8, 16 -; SI-NEXT: s_lshr_b32 s44, s10, 16 -; SI-NEXT: s_lshr_b32 s45, s12, 16 -; SI-NEXT: s_lshr_b32 s46, s14, 16 -; SI-NEXT: s_lshr_b32 s47, s40, 16 -; SI-NEXT: s_lshr_b32 s56, s29, 16 -; SI-NEXT: s_lshr_b32 s57, s27, 16 -; SI-NEXT: 
s_lshr_b32 s58, s25, 16 -; SI-NEXT: s_lshr_b32 s59, s23, 16 -; SI-NEXT: s_lshr_b32 s60, s21, 16 -; SI-NEXT: s_lshr_b32 s61, s19, 16 -; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[28:29], 16 +; SI-NEXT: s_lshr_b32 s92, s5, 16 +; SI-NEXT: s_lshr_b32 s93, s7, 16 +; SI-NEXT: s_lshr_b32 s94, s9, 16 +; SI-NEXT: s_lshr_b32 s95, s11, 16 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s31, s15, 16 +; SI-NEXT: s_lshr_b32 s34, s29, 16 +; SI-NEXT: s_lshr_b32 s35, s27, 16 +; SI-NEXT: s_lshr_b32 s36, s25, 16 +; SI-NEXT: s_lshr_b32 s37, s23, 16 +; SI-NEXT: s_lshr_b32 s38, s21, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[16:17], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, s4, v13 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s61, 16 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_lshl_b32 s41, s88, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s41 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s48, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s16, s78, 16 +; SI-NEXT: s_and_b32 s17, s18, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: s_and_b32 s16, s19, 0xffff +; SI-NEXT: s_lshl_b32 s17, s39, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: s_lshl_b32 s16, s76, 16 +; SI-NEXT: s_and_b32 s17, s20, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; SI-NEXT: s_lshl_b32 s5, s60, 16 -; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v12, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: s_and_b32 s16, s21, 0xffff +; 
SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_lshl_b32 s17, s38, 16 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s22, 0xffff +; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s59, 16 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s23, 0xffff +; SI-NEXT: s_lshl_b32 s17, s37, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s58, 16 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s24, 0xffff +; SI-NEXT: s_lshl_b32 s17, s72, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s57, 16 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s25, 0xffff +; SI-NEXT: s_lshl_b32 s17, s36, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s56, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s17, s62, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword 
v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s40, 0xffff -; SI-NEXT: s_lshl_b32 s5, s47, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s35, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s14, 0xffff -; SI-NEXT: s_lshl_b32 s5, s46, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s28, 0xffff +; SI-NEXT: s_lshl_b32 s17, s60, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s12, 0xffff -; SI-NEXT: s_lshl_b32 s5, s45, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s29, 0xffff +; SI-NEXT: s_lshl_b32 s17, s34, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s44, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s58, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s43, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff +; SI-NEXT: s_lshl_b32 s15, s31, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: 
s_or_b32 s14, s14, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s42, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s56, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_lshl_b32 s13, s30, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s46, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff +; SI-NEXT: s_lshl_b32 s11, s95, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s44, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s94, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s42, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s93, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s40, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s48, v14, 8 +; SI-NEXT: v_readlane_b32 s39, v14, 7 +; SI-NEXT: v_readlane_b32 s38, v14, 6 +; SI-NEXT: v_readlane_b32 s37, v14, 5 +; SI-NEXT: v_readlane_b32 s36, v14, 4 +; SI-NEXT: v_readlane_b32 s35, v14, 3 +; SI-NEXT: v_readlane_b32 s34, v14, 2 +; SI-NEXT: v_readlane_b32 s31, 
v14, 1 +; SI-NEXT: v_readlane_b32 s30, v14, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr31 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr95 ; SI-NEXT: ; implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v26i32_to_v52i16_scalar: @@ -5490,116 +5510,119 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v6 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v37, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_mov_b32_e32 v39, v12 -; 
SI-NEXT: v_mov_b32_e32 v48, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v30 +; SI-NEXT: v_mov_b32_e32 v30, v24 +; SI-NEXT: v_mov_b32_e32 v38, v22 +; SI-NEXT: v_mov_b32_e32 v39, v20 +; SI-NEXT: v_mov_b32_e32 v48, v18 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: v_mov_b32_e32 v50, v14 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v9, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v10, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v11, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v12, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v15, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_or_b32_e32 v16, v0, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: 
v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_or_b32_e32 v17, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v16, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v18, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_or_b32_e32 v19, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v31 +; SI-NEXT: v_or_b32_e32 v20, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v22, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v24, v0, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v62 -; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_or_b32_e32 v8, v1, v63 +; SI-NEXT: v_or_b32_e32 v25, v0, v27 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -5609,72 +5632,74 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v31, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 
v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 @@ -5684,13 +5709,13 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: 
s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 @@ -5700,17 +5725,17 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -5718,7 +5743,7 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -5748,85 +5773,87 @@ define inreg <26 x i32> @bitcast_v52i16_to_v26i32_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v47, v43 +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v38 +; SI-NEXT: v_mov_b32_e32 v38, v62 +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v62, v56 +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: v_mov_b32_e32 v44, v40 +; SI-NEXT: v_mov_b32_e32 v40, v39 +; SI-NEXT: v_mov_b32_e32 v39, v28 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v45 +; SI-NEXT: v_mov_b32_e32 v45, v41 +; SI-NEXT: v_mov_b32_e32 v41, v48 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v52 +; SI-NEXT: v_mov_b32_e32 v52, v46 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mov_b32_e32 v46, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v61, v63 ; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v32 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v50 -; SI-NEXT: v_mov_b32_e32 v50, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; 
SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v40 -; SI-NEXT: v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v41, v49 -; SI-NEXT: v_mov_b32_e32 v49, v38 -; SI-NEXT: v_mov_b32_e32 v38, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v43 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v44, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v43, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v43 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v44 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v43, v46 -; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v57, v27 +; SI-NEXT: v_mov_b32_e32 v53, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v55 ; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v29 +; SI-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v53 +; SI-NEXT: v_mov_b32_e32 v27, v57 ; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v30, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v52 +; SI-NEXT: v_mov_b32_e32 v52, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v48 +; SI-NEXT: v_mov_b32_e32 v48, v41 +; SI-NEXT: v_mov_b32_e32 v41, v45 +; SI-NEXT: v_mov_b32_e32 v45, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v39 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: v_mov_b32_e32 v44, v56 +; SI-NEXT: v_mov_b32_e32 v56, v62 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v52i16_to_v26i32_scalar: @@ -13639,211 +13666,217 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_mov_b32_e32 v28, s16 -; SI-NEXT: v_mov_b32_e32 v27, s17 -; SI-NEXT: v_mov_b32_e32 v25, s18 +; SI-NEXT: v_mov_b32_e32 v25, s16 +; SI-NEXT: v_mov_b32_e32 v26, s17 +; SI-NEXT: v_mov_b32_e32 v23, s18 ; SI-NEXT: v_mov_b32_e32 v24, s19 -; SI-NEXT: v_mov_b32_e32 v21, s20 -; SI-NEXT: v_mov_b32_e32 v19, s21 -; SI-NEXT: v_mov_b32_e32 v22, s22 +; SI-NEXT: v_mov_b32_e32 v19, s20 +; SI-NEXT: v_mov_b32_e32 v20, s21 +; SI-NEXT: v_mov_b32_e32 v21, s22 +; SI-NEXT: v_mov_b32_e32 v22, s23 +; SI-NEXT: v_mov_b32_e32 v17, s24 +; SI-NEXT: v_mov_b32_e32 v18, s25 +; SI-NEXT: v_mov_b32_e32 v15, s26 +; SI-NEXT: v_mov_b32_e32 v16, s27 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v20, s23 -; SI-NEXT: v_mov_b32_e32 v18, s24 -; SI-NEXT: v_mov_b32_e32 v17, s25 -; SI-NEXT: v_mov_b32_e32 v16, s26 -; SI-NEXT: v_mov_b32_e32 v15, s27 -; SI-NEXT: v_mov_b32_e32 v14, s28 -; SI-NEXT: v_mov_b32_e32 v13, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; 
SI-NEXT: v_mov_b32_e32 v13, s28 +; SI-NEXT: v_mov_b32_e32 v14, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v29, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v35, v13, v14, 16 -; SI-NEXT: v_alignbit_b32 v37, v15, v16, 16 -; SI-NEXT: v_alignbit_b32 v48, v17, v18, 16 -; SI-NEXT: v_alignbit_b32 v50, v20, v22, 16 -; SI-NEXT: v_alignbit_b32 v52, v19, v21, 16 -; SI-NEXT: v_alignbit_b32 v54, v24, v25, 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v40, v27, v28, 16 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 +; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_add_f32_e32 
v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[28:29], v[9:10], 16 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[29:30], v[7:8], 16 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[30:31], v[5:6], 16 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_lshr_b64 v[32:33], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 ; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 ; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_alignbit_b32 v23, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v26, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v29, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v33, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v35, v13, v14, 16 -; SI-NEXT: v_alignbit_b32 v37, v15, v16, 16 -; SI-NEXT: v_alignbit_b32 v48, v17, v18, 16 -; SI-NEXT: v_alignbit_b32 v50, v20, v22, 16 -; SI-NEXT: v_alignbit_b32 v52, v19, v21, 16 -; SI-NEXT: v_alignbit_b32 v54, v24, v25, 16 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v40, v27, v28, 16 -; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v2 -; 
SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_or_b32_e32 v28, v28, v40 -; SI-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v44 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 +; SI-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v54 -; SI-NEXT: v_or_b32_e32 v25, v25, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v47 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v43 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v48 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v52 -; SI-NEXT: v_or_b32_e32 v21, v21, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v46 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v42 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v50 -; SI-NEXT: v_or_b32_e32 v19, v19, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v38 +; SI-NEXT: v_or_b32_e32 v19, v19, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 
v20, 16, v45 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v37 +; SI-NEXT: v_or_b32_e32 v19, v19, v20 +; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v48 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v43 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v35 -; SI-NEXT: v_or_b32_e32 v14, v14, v15 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 +; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v13 ; SI-NEXT: v_add_i32_e32 v13, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen @@ -13855,7 +13888,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -13867,7 +13900,7 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -13879,68 +13912,71 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 
offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 -; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr36 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr32 +; SI-NEXT: ; implicit-def: $vgpr28 +; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v26f32_to_v52i16_scalar: @@ -15715,116 +15751,119 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v6 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v37, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_mov_b32_e32 v39, v12 -; SI-NEXT: v_mov_b32_e32 v48, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 
-; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v30 +; SI-NEXT: v_mov_b32_e32 v30, v24 +; SI-NEXT: v_mov_b32_e32 v38, v22 +; SI-NEXT: v_mov_b32_e32 v39, v20 +; SI-NEXT: v_mov_b32_e32 v48, v18 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: v_mov_b32_e32 v50, v14 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v9, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v10, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v11, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v12, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v15, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_or_b32_e32 v16, v0, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_or_b32_e32 v17, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: 
v_or_b32_e32 v16, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v18, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_or_b32_e32 v19, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v31 +; SI-NEXT: v_or_b32_e32 v20, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v22, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v24, v0, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v62 -; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_or_b32_e32 v8, v1, v63 +; SI-NEXT: v_or_b32_e32 v25, v0, v27 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -15834,72 +15873,74 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v31, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 
v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 @@ -15909,13 +15950,13 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: 
s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 @@ -15925,17 +15966,17 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -15943,7 +15984,7 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -15973,85 +16014,87 @@ define inreg <26 x float> @bitcast_v52i16_to_v26f32_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v47, v43 +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v38 +; SI-NEXT: v_mov_b32_e32 v38, v62 +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v62, v56 +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: v_mov_b32_e32 v44, v40 +; SI-NEXT: v_mov_b32_e32 v40, v39 +; SI-NEXT: v_mov_b32_e32 v39, v28 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v45 +; SI-NEXT: v_mov_b32_e32 v45, v41 +; SI-NEXT: v_mov_b32_e32 v41, v48 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v52 +; SI-NEXT: v_mov_b32_e32 v52, v46 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mov_b32_e32 v46, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v61, v63 ; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v32 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v50 -; SI-NEXT: v_mov_b32_e32 v50, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v40 -; SI-NEXT: 
v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v41, v49 -; SI-NEXT: v_mov_b32_e32 v49, v38 -; SI-NEXT: v_mov_b32_e32 v38, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v43 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v44, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v43, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v43 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v44 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v43, v46 -; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v57, v27 +; SI-NEXT: v_mov_b32_e32 v53, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v55 ; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; 
SI-NEXT: v_mov_b32_e32 v55, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v53 +; SI-NEXT: v_mov_b32_e32 v27, v57 ; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v30, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v52 +; SI-NEXT: v_mov_b32_e32 v52, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v48 +; SI-NEXT: v_mov_b32_e32 v48, v41 +; SI-NEXT: v_mov_b32_e32 v41, v45 +; SI-NEXT: v_mov_b32_e32 v45, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v39 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: v_mov_b32_e32 v44, v56 +; SI-NEXT: v_mov_b32_e32 v56, v62 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v52i16_to_v26f32_scalar: @@ -23059,313 +23102,333 @@ define inreg <52 x i16> @bitcast_v13i64_to_v52i16_scalar(<13 x i64> inreg %a, i3 ; SI-LABEL: bitcast_v13i64_to_v52i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v14, s30, 0 +; SI-NEXT: v_writelane_b32 v14, s31, 1 +; SI-NEXT: v_writelane_b32 v14, s34, 2 +; SI-NEXT: v_writelane_b32 v14, s35, 3 +; SI-NEXT: v_writelane_b32 v14, s36, 4 +; SI-NEXT: v_writelane_b32 v14, s37, 5 +; SI-NEXT: v_writelane_b32 v14, s38, 6 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; SI-NEXT: v_readfirstlane_b32 s41, v1 -; SI-NEXT: v_readfirstlane_b32 s40, v2 -; SI-NEXT: v_readfirstlane_b32 s15, v3 -; SI-NEXT: v_readfirstlane_b32 s14, v4 -; SI-NEXT: v_readfirstlane_b32 s13, v5 -; SI-NEXT: v_readfirstlane_b32 s12, v6 -; SI-NEXT: v_readfirstlane_b32 s11, v7 -; SI-NEXT: v_readfirstlane_b32 s10, v8 -; SI-NEXT: v_readfirstlane_b32 s9, v9 -; SI-NEXT: v_readfirstlane_b32 s8, v10 -; SI-NEXT: v_readfirstlane_b32 s7, v11 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v12 +; SI-NEXT: v_writelane_b32 v14, s39, 7 +; SI-NEXT: v_readfirstlane_b32 s14, v1 +; SI-NEXT: v_readfirstlane_b32 s15, v2 +; SI-NEXT: v_readfirstlane_b32 s12, v3 +; SI-NEXT: v_readfirstlane_b32 s13, v4 +; SI-NEXT: v_readfirstlane_b32 s10, v5 +; SI-NEXT: v_readfirstlane_b32 s11, v6 +; SI-NEXT: v_readfirstlane_b32 s8, v7 +; SI-NEXT: v_readfirstlane_b32 s9, v8 +; SI-NEXT: v_readfirstlane_b32 s6, v9 +; SI-NEXT: v_readfirstlane_b32 s7, v10 +; SI-NEXT: v_readfirstlane_b32 s4, v11 +; SI-NEXT: s_and_b64 s[40:41], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v12 +; SI-NEXT: v_writelane_b32 v14, s48, 8 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 
v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s28 -; SI-NEXT: v_mov_b32_e32 v8, s26 -; SI-NEXT: v_mov_b32_e32 v9, s24 -; SI-NEXT: v_mov_b32_e32 v10, s22 -; SI-NEXT: v_mov_b32_e32 v11, s20 -; SI-NEXT: v_mov_b32_e32 v12, s18 -; SI-NEXT: v_mov_b32_e32 v13, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s29, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s27, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s25, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s23, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s21, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s19, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s17, v13, 16 -; SI-NEXT: s_lshr_b32 s42, s6, 16 -; SI-NEXT: s_lshr_b32 s43, s8, 16 -; SI-NEXT: s_lshr_b32 s44, s10, 16 -; SI-NEXT: s_lshr_b32 s45, s12, 16 -; SI-NEXT: s_lshr_b32 s46, s14, 16 -; SI-NEXT: s_lshr_b32 s47, s40, 16 -; SI-NEXT: s_lshr_b32 s56, s29, 16 -; SI-NEXT: s_lshr_b32 s57, s27, 16 -; SI-NEXT: s_lshr_b32 s58, s25, 16 -; SI-NEXT: s_lshr_b32 s59, s23, 16 -; SI-NEXT: s_lshr_b32 s60, s21, 16 -; SI-NEXT: s_lshr_b32 s61, s19, 16 -; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_lshr_b32 s92, s5, 16 +; SI-NEXT: s_lshr_b32 s93, s7, 16 +; SI-NEXT: s_lshr_b32 s94, s9, 16 +; SI-NEXT: s_lshr_b32 s95, s11, 16 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s31, s15, 16 +; SI-NEXT: s_lshr_b32 s34, s29, 16 +; SI-NEXT: s_lshr_b32 s35, s27, 16 +; SI-NEXT: s_lshr_b32 s36, s25, 16 +; SI-NEXT: s_lshr_b32 s37, s23, 16 +; SI-NEXT: s_lshr_b32 s38, s21, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 ; SI-NEXT: s_add_u32 s28, s28, 3 ; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s41, s41, 3 -; SI-NEXT: s_addc_u32 s40, s40, 0 -; SI-NEXT: s_add_u32 s15, 
s15, 3 -; SI-NEXT: s_addc_u32 s14, s14, 0 -; SI-NEXT: s_add_u32 s13, s13, 3 -; SI-NEXT: s_addc_u32 s12, s12, 0 -; SI-NEXT: s_add_u32 s11, s11, 3 -; SI-NEXT: s_addc_u32 s10, s10, 0 -; SI-NEXT: s_add_u32 s9, s9, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s6, s6, 0 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s28 -; SI-NEXT: v_mov_b32_e32 v8, s26 -; SI-NEXT: v_mov_b32_e32 v9, s24 -; SI-NEXT: v_mov_b32_e32 v10, s22 -; SI-NEXT: v_mov_b32_e32 v11, s20 -; SI-NEXT: v_mov_b32_e32 v12, s18 -; SI-NEXT: v_mov_b32_e32 v13, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s29, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s27, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s25, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s23, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s21, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s19, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s17, v13, 16 -; SI-NEXT: s_lshr_b32 s42, s6, 16 -; SI-NEXT: s_lshr_b32 s43, s8, 16 -; SI-NEXT: s_lshr_b32 s44, s10, 16 -; SI-NEXT: s_lshr_b32 s45, s12, 16 -; SI-NEXT: s_lshr_b32 s46, s14, 16 -; SI-NEXT: s_lshr_b32 s47, s40, 16 -; SI-NEXT: s_lshr_b32 s56, s29, 16 -; SI-NEXT: s_lshr_b32 s57, s27, 16 -; SI-NEXT: s_lshr_b32 s58, s25, 16 -; SI-NEXT: s_lshr_b32 s59, s23, 16 -; SI-NEXT: s_lshr_b32 s60, s21, 16 -; SI-NEXT: s_lshr_b32 s61, s19, 16 -; SI-NEXT: s_lshr_b32 s62, s17, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s92, s5, 16 +; SI-NEXT: s_lshr_b32 s93, s7, 16 +; SI-NEXT: s_lshr_b32 s94, s9, 16 +; SI-NEXT: s_lshr_b32 s95, s11, 16 +; SI-NEXT: s_lshr_b32 s30, s13, 16 +; SI-NEXT: s_lshr_b32 s31, s15, 16 +; SI-NEXT: s_lshr_b32 s34, s29, 16 +; SI-NEXT: s_lshr_b32 s35, s27, 16 +; SI-NEXT: s_lshr_b32 s36, s25, 16 +; SI-NEXT: s_lshr_b32 s37, s23, 16 +; SI-NEXT: s_lshr_b32 s38, s21, 16 +; SI-NEXT: s_lshr_b32 s39, s19, 16 +; SI-NEXT: s_lshr_b32 s48, s17, 16 +; SI-NEXT: s_lshr_b64 s[40:41], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[16:17], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: v_or_b32_e32 v13, s4, v13 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: 
s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; SI-NEXT: buffer_store_dword v13, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s61, 16 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: s_lshl_b32 s41, s88, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s41 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s48, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s16, s78, 16 +; SI-NEXT: s_and_b32 s17, s18, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: s_and_b32 s16, s19, 0xffff +; SI-NEXT: s_lshl_b32 s17, s39, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: s_lshl_b32 s16, s76, 16 +; SI-NEXT: s_and_b32 s17, s20, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; SI-NEXT: s_lshl_b32 s5, s60, 16 -; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v12, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: s_and_b32 s16, s21, 0xffff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_lshl_b32 s17, s38, 16 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s22, 0xffff +; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s59, 16 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s23, 0xffff +; SI-NEXT: s_lshl_b32 s17, s37, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s58, 
16 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s24, 0xffff +; SI-NEXT: s_lshl_b32 s17, s72, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s57, 16 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s25, 0xffff +; SI-NEXT: s_lshl_b32 s17, s36, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s56, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s17, s62, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s40, 0xffff -; SI-NEXT: s_lshl_b32 s5, s47, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s35, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s14, 0xffff -; SI-NEXT: s_lshl_b32 s5, s46, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s28, 0xffff +; SI-NEXT: s_lshl_b32 s17, s60, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 
s4, s12, 0xffff -; SI-NEXT: s_lshl_b32 s5, s45, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s29, 0xffff +; SI-NEXT: s_lshl_b32 s17, s34, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s44, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s58, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s43, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff +; SI-NEXT: s_lshl_b32 s15, s31, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s42, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s56, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_lshl_b32 s13, s30, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s46, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff +; SI-NEXT: s_lshl_b32 s11, s95, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s44, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 
0x4c, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s94, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s42, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s93, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s40, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s92, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s48, v14, 8 +; SI-NEXT: v_readlane_b32 s39, v14, 7 +; SI-NEXT: v_readlane_b32 s38, v14, 6 +; SI-NEXT: v_readlane_b32 s37, v14, 5 +; SI-NEXT: v_readlane_b32 s36, v14, 4 +; SI-NEXT: v_readlane_b32 s35, v14, 3 +; SI-NEXT: v_readlane_b32 s34, v14, 2 +; SI-NEXT: v_readlane_b32 s31, v14, 1 +; SI-NEXT: v_readlane_b32 s30, v14, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr48 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr39 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr38 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr37 +; SI-NEXT: ; implicit-def: $sgpr72 +; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr31 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr95 ; SI-NEXT: ; 
implicit-def: $sgpr44 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr43 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr93 +; SI-NEXT: ; implicit-def: $sgpr92 ; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr40 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v13i64_to_v52i16_scalar: @@ -25141,116 +25204,119 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v6 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v37, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_mov_b32_e32 v39, v12 -; SI-NEXT: v_mov_b32_e32 v48, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v30 +; SI-NEXT: v_mov_b32_e32 v30, v24 +; SI-NEXT: v_mov_b32_e32 v38, v22 +; SI-NEXT: v_mov_b32_e32 v39, v20 +; SI-NEXT: v_mov_b32_e32 v48, v18 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: v_mov_b32_e32 v50, v14 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: 
v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v9, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v10, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v11, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v12, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v15, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_or_b32_e32 v16, v0, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_or_b32_e32 v17, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v16, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v18, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_or_b32_e32 v19, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v31 +; SI-NEXT: v_or_b32_e32 v20, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v22, v0, v45 +; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v44 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v24, v0, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v62 -; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_or_b32_e32 v8, v1, v63 +; SI-NEXT: v_or_b32_e32 v25, v0, v27 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -25260,72 +25326,74 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v31, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: 
v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 @@ -25335,13 +25403,13 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 @@ -25351,17 +25419,17 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -25369,7 +25437,7 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; 
SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -25399,85 +25467,87 @@ define inreg <13 x i64> @bitcast_v52i16_to_v13i64_scalar(<52 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v47, v43 +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v38 +; SI-NEXT: v_mov_b32_e32 v38, v62 +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v62, v56 +; SI-NEXT: v_mov_b32_e32 v56, v44 +; SI-NEXT: v_mov_b32_e32 v44, v40 +; SI-NEXT: v_mov_b32_e32 v40, v39 +; SI-NEXT: v_mov_b32_e32 v39, v28 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v45 +; SI-NEXT: v_mov_b32_e32 v45, v41 +; SI-NEXT: v_mov_b32_e32 v41, v48 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v52 +; SI-NEXT: v_mov_b32_e32 v52, v46 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mov_b32_e32 v46, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v61, v63 ; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v32 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v50 -; SI-NEXT: v_mov_b32_e32 v50, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v40 -; SI-NEXT: v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v41, v49 -; SI-NEXT: v_mov_b32_e32 v49, v38 -; SI-NEXT: v_mov_b32_e32 v38, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v43 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v44, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v43, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; 
SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v43 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v44 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v43, v46 -; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v57, v27 +; SI-NEXT: v_mov_b32_e32 v53, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v55 ; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v53 +; SI-NEXT: v_mov_b32_e32 v27, v57 ; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v30, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v52 +; SI-NEXT: v_mov_b32_e32 v52, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v48 +; SI-NEXT: v_mov_b32_e32 v48, v41 +; SI-NEXT: v_mov_b32_e32 v41, v45 +; SI-NEXT: v_mov_b32_e32 v45, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v39 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: v_mov_b32_e32 v44, v56 +; SI-NEXT: v_mov_b32_e32 v56, v62 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte 
Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v52i16_to_v13i64_scalar: @@ -31639,171 +31709,177 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v13, s28 ; SI-NEXT: v_mov_b32_e32 v14, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v27, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v29, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v32, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v39, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v54, v24, v23, 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[28:29], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 
v43, 16, v18 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 +; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_alignbit_b32 v27, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v28, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v29, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v30, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v31, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v32, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v34, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v37, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v39, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v49, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v54, v24, v23, 16 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_lshr_b64 v[27:28], v[11:12], 16 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_lshr_b64 v[28:29], v[9:10], 16 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_lshr_b64 v[29:30], v[7:8], 16 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_lshr_b64 v[30:31], v[5:6], 16 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshr_b64 v[31:32], v[3:4], 16 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_lshr_b64 v[32:33], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v2 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v6 -; SI-NEXT: 
v_lshrrev_b32_e32 v48, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v18 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v26 ; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v49 ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_or_b32_e32 v25, v25, v40 +; SI-NEXT: v_or_b32_e32 v25, v25, v35 ; SI-NEXT: buffer_store_dword v25, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v47 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_add_i32_e32 v26, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v48 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v23, v23, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v46 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v38 ; SI-NEXT: v_or_b32_e32 v19, v19, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v19, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v45 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v37 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v34 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v55 +; 
SI-NEXT: v_lshlrev_b32_e32 v18, 16, v43 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v33 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v42 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v34 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 ; SI-NEXT: v_or_b32_e32 v13, v13, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v13, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v41 ; SI-NEXT: v_or_b32_e32 v13, v13, v14 ; SI-NEXT: v_add_i32_e32 v14, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen @@ -31815,7 +31891,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -31827,7 +31903,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -31839,7 +31915,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -31851,7 +31927,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -31863,7 +31939,7 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: 
buffer_store_dword v1, v2, s[0:3], 0 offen @@ -31875,44 +31951,47 @@ define inreg <52 x i16> @bitcast_v13f64_to_v52i16_scalar(<13 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr34 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr37 +; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v13f64_to_v52i16_scalar: @@ -33661,116 +33740,119 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 
v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: v_mov_b32_e32 v47, v8 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v6 +; SI-NEXT: v_mov_b32_e32 v32, v4 +; SI-NEXT: v_mov_b32_e32 v34, v2 +; SI-NEXT: v_mov_b32_e32 v37, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:20 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_mov_b32_e32 v39, v12 -; SI-NEXT: v_mov_b32_e32 v48, v10 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v5 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v7 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v9 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v29 +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:20 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v62, v30 +; SI-NEXT: v_mov_b32_e32 v30, v24 +; SI-NEXT: v_mov_b32_e32 v38, v22 +; SI-NEXT: v_mov_b32_e32 v39, v20 +; SI-NEXT: v_mov_b32_e32 v48, v18 +; SI-NEXT: v_mov_b32_e32 v49, v16 +; SI-NEXT: v_mov_b32_e32 v50, v14 +; SI-NEXT: v_mov_b32_e32 v40, v12 +; SI-NEXT: v_mov_b32_e32 v41, v10 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v5 +; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v9 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v11 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v29 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v2 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v8 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; 
%bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v63 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v10, v0, v42 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 -; SI-NEXT: v_or_b32_e32 v11, v0, v41 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v9, v0, v61 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v10, v0, v60 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 +; SI-NEXT: v_or_b32_e32 v11, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v12, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v13, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v14, v0, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_or_b32_e32 v15, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 -; SI-NEXT: v_or_b32_e32 v12, v0, v56 +; SI-NEXT: v_or_b32_e32 v16, v0, v54 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 -; SI-NEXT: v_or_b32_e32 v13, v0, v47 +; SI-NEXT: v_or_b32_e32 v17, v0, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v14, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 -; SI-NEXT: v_or_b32_e32 v16, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_or_b32_e32 v17, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_or_b32_e32 v18, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 +; SI-NEXT: v_or_b32_e32 v19, v0, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v26 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v20, v0, v31 +; SI-NEXT: v_or_b32_e32 v20, v0, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 +; SI-NEXT: v_or_b32_e32 v21, v0, v46 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v29 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v22, v0, v45 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_or_b32_e32 v23, v0, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v24, v0, v27 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v24, v0, v29 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v8, v1, v62 -; SI-NEXT: v_or_b32_e32 v25, v0, v32 +; SI-NEXT: v_or_b32_e32 v8, v1, v63 +; 
SI-NEXT: v_or_b32_e32 v25, v0, v27 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -33780,72 +33862,74 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_or_b32_e32 v0, v31, v0 ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_or_b32_e32 v0, v61, v0 ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v42, v0 +; SI-NEXT: v_or_b32_e32 v0, v60, v0 ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v41, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v58, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v56, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v26 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, 
v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 @@ -33855,13 +33939,13 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 ; SI-NEXT: s_add_i32 s22, s22, 3 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v51, v0 ; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 ; SI-NEXT: s_add_i32 s24, s24, 3 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: s_or_b32 s7, s8, s7 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 @@ -33871,17 +33955,17 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: v_or_b32_e32 v0, v27, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v34 ; SI-NEXT: s_or_b32 s9, s10, s9 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_or_b32 s10, s11, s10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v1, v62, v1 +; SI-NEXT: v_or_b32_e32 v1, v63, v1 ; SI-NEXT: s_add_i32 s4, s4, 0x30000 ; SI-NEXT: s_add_i32 s5, s5, 0x30000 ; SI-NEXT: s_add_i32 s6, s6, 0x30000 @@ -33889,7 +33973,7 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_add_i32 s8, s8, 0x30000 ; SI-NEXT: s_add_i32 s9, s9, 0x30000 ; SI-NEXT: s_add_i32 s10, s10, 0x30000 -; SI-NEXT: v_or_b32_e32 v0, v32, v0 +; SI-NEXT: v_or_b32_e32 v0, v27, v0 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -33919,85 +34003,87 @@ define inreg <13 x double> @bitcast_v52i16_to_v13f64_scalar(<52 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v47, v43 +; SI-NEXT: v_mov_b32_e32 v43, v50 +; SI-NEXT: v_mov_b32_e32 v50, v38 +; SI-NEXT: v_mov_b32_e32 v38, v62 +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v62, v56 +; SI-NEXT: 
v_mov_b32_e32 v56, v44 +; SI-NEXT: v_mov_b32_e32 v44, v40 +; SI-NEXT: v_mov_b32_e32 v40, v39 +; SI-NEXT: v_mov_b32_e32 v39, v28 +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v59, v45 +; SI-NEXT: v_mov_b32_e32 v45, v41 +; SI-NEXT: v_mov_b32_e32 v41, v48 +; SI-NEXT: v_mov_b32_e32 v48, v26 +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v60, v52 +; SI-NEXT: v_mov_b32_e32 v52, v46 +; SI-NEXT: s_waitcnt vmcnt(6) +; SI-NEXT: v_mov_b32_e32 v46, v42 +; SI-NEXT: v_mov_b32_e32 v42, v49 +; SI-NEXT: v_mov_b32_e32 v49, v30 +; SI-NEXT: v_mov_b32_e32 v61, v63 ; SI-NEXT: v_mov_b32_e32 v63, v57 -; SI-NEXT: v_mov_b32_e32 v57, v32 -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v42, v50 -; SI-NEXT: v_mov_b32_e32 v50, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: v_mov_b32_e32 v32, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v40 -; SI-NEXT: v_mov_b32_e32 v40, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(3) expcnt(0) -; SI-NEXT: v_mov_b32_e32 v41, v49 -; SI-NEXT: v_mov_b32_e32 v49, v38 -; SI-NEXT: v_mov_b32_e32 v38, v35 -; SI-NEXT: v_mov_b32_e32 v35, v26 -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_mov_b32_e32 v47, v44 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v46, v43 -; SI-NEXT: v_mov_b32_e32 v45, v58 -; SI-NEXT: v_mov_b32_e32 v58, v27 -; SI-NEXT: v_mov_b32_e32 v44, v60 -; SI-NEXT: v_mov_b32_e32 v60, v29 -; SI-NEXT: v_mov_b32_e32 v43, v62 -; SI-NEXT: v_mov_b32_e32 v62, v31 -; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v62 -; SI-NEXT: v_mov_b32_e32 v62, v43 -; SI-NEXT: v_mov_b32_e32 v29, v60 -; SI-NEXT: v_mov_b32_e32 v60, v44 -; SI-NEXT: v_mov_b32_e32 v27, v58 -; SI-NEXT: v_mov_b32_e32 v58, v45 -; SI-NEXT: v_mov_b32_e32 v43, v46 -; SI-NEXT: v_mov_b32_e32 v44, v47 -; SI-NEXT: v_mov_b32_e32 v45, v56 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v26, v35 -; SI-NEXT: v_mov_b32_e32 v35, v38 -; SI-NEXT: v_mov_b32_e32 v38, v49 -; SI-NEXT: v_mov_b32_e32 v49, v41 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 
-; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v40 -; SI-NEXT: v_mov_b32_e32 v40, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 +; SI-NEXT: v_mov_b32_e32 v57, v27 +; SI-NEXT: v_mov_b32_e32 v53, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v55 ; SI-NEXT: v_mov_b32_e32 v55, v32 -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v50 -; SI-NEXT: v_mov_b32_e32 v50, v42 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v32, v57 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v29 +; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 +; SI-NEXT: v_mov_b32_e32 v29, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v55 +; SI-NEXT: v_mov_b32_e32 v55, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v53 +; SI-NEXT: v_mov_b32_e32 v27, v57 ; SI-NEXT: v_mov_b32_e32 v57, v63 -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v63, v61 +; SI-NEXT: v_mov_b32_e32 v30, v49 +; SI-NEXT: v_mov_b32_e32 v49, v42 +; SI-NEXT: v_mov_b32_e32 v42, v46 +; SI-NEXT: v_mov_b32_e32 v46, v52 +; SI-NEXT: v_mov_b32_e32 v52, v60 +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v26, v48 +; SI-NEXT: v_mov_b32_e32 v48, v41 +; SI-NEXT: v_mov_b32_e32 v41, v45 +; SI-NEXT: v_mov_b32_e32 v45, v59 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v39 +; SI-NEXT: v_mov_b32_e32 v39, v40 +; SI-NEXT: v_mov_b32_e32 v40, v44 +; SI-NEXT: v_mov_b32_e32 v44, v56 +; SI-NEXT: v_mov_b32_e32 v56, v62 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v62, v38 +; SI-NEXT: v_mov_b32_e32 v38, v50 +; SI-NEXT: v_mov_b32_e32 v50, v43 +; SI-NEXT: v_mov_b32_e32 v43, v47 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v52i16_to_v13f64_scalar: @@ -35773,11 +35859,12 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: s_lshr_b32 s40, s6, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s40 ; SI-NEXT: s_lshr_b32 s40, s9, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s40 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v63, s40 ; SI-NEXT: s_lshr_b32 s40, s8, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s40 ; SI-NEXT: s_lshr_b32 s40, s11, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v11, s40 +; SI-NEXT: v_cvt_f32_f16_e32 v62, s40 ; SI-NEXT: 
s_lshr_b32 s40, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s40 ; SI-NEXT: s_lshr_b32 s40, s13, 16 @@ -35816,20 +35903,20 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v42, s40 ; SI-NEXT: s_lshr_b32 s40, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v44, s40 -; SI-NEXT: v_cvt_f32_f16_e32 v46, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v8, s7 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v17, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s6 ; SI-NEXT: v_cvt_f32_f16_e32 v12, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v3, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v17, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s12 +; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s14 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s14 ; SI-NEXT: v_cvt_f32_f16_e32 v27, s29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s28 +; SI-NEXT: v_cvt_f32_f16_e32 v11, s28 ; SI-NEXT: v_cvt_f32_f16_e32 v30, s27 ; SI-NEXT: v_cvt_f32_f16_e32 v32, s26 ; SI-NEXT: v_cvt_f32_f16_e32 v34, s25 @@ -35846,22 +35933,6 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: .LBB53_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 ; SI-NEXT: v_add_f64 v[54:55], s[18:19], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v55 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v42 ; SI-NEXT: v_add_f64 v[49:50], s[20:21], 1.0 ; SI-NEXT: v_add_f64 v[37:38], s[22:23], 1.0 ; SI-NEXT: v_add_f64 v[33:34], s[24:25], 1.0 @@ -35872,41 +35943,54 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_add_f64 v[14:15], s[10:11], 1.0 ; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 ; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v49 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v50 +; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v1 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v54 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v55 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v49 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v37 ; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v38 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v33 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v34 +; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v31 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v32 +; SI-NEXT: s_waitcnt expcnt(5) ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 +; SI-NEXT: s_waitcnt expcnt(4) ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v20, 
16, v23 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v19 +; SI-NEXT: s_waitcnt expcnt(2) ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v14 +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v11 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v12 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 @@ -35915,13 +35999,17 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v48, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v62 +; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v60 @@ -35935,14 +36023,12 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v51 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v42, v10 -; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v51, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v44 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: .LBB53_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v10, v44 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v43 @@ -36028,7 +36114,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v10, v28, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 +; SI-NEXT: 
v_cvt_f16_f32_e32 v13, v11 ; SI-NEXT: v_add_i32_e32 v26, vcc, 48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v13, v10 @@ -36042,7 +36128,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v10, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v4 ; SI-NEXT: v_add_i32_e32 v22, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v13, v10 @@ -36056,7 +36142,7 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v10, v20, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v8 ; SI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v13, v10 @@ -36070,56 +36156,56 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: buffer_store_dword v10, v16, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v10, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v13, v7 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; SI-NEXT: v_or_b32_e32 v10, v13, v10 ; SI-NEXT: buffer_store_dword v10, v14, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 -; SI-NEXT: v_add_i32_e32 v13, vcc, 0x4c, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_add_i32_e32 v13, vcc, 0x4c, v0 ; SI-NEXT: v_or_b32_e32 v10, v11, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; SI-NEXT: buffer_store_dword v10, v13, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v45 -; SI-NEXT: v_add_i32_e32 v11, vcc, 0x50, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: buffer_store_dword v9, v11, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v10, vcc, 0x50, v0 +; SI-NEXT: v_or_b32_e32 v3, v3, v9 +; SI-NEXT: buffer_store_dword v3, v10, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v7, v9, v7 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v3, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 +; SI-NEXT: v_add_i32_e32 v9, vcc, 0x54, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 -; SI-NEXT: v_or_b32_e32 v4, v4, v5 -; SI-NEXT: buffer_store_dword v4, v7, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v25 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v6 -; SI-NEXT: 
v_cvt_f16_f32_e32 v5, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v21 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x5c, v0 -; SI-NEXT: v_or_b32_e32 v4, v5, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -36167,33 +36253,33 @@ define inreg <52 x half> @bitcast_v13f64_to_v52f16_scalar(<13 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr28 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: ; implicit-def: $vgpr26 ; SI-NEXT: ; implicit-def: $vgpr27 ; SI-NEXT: ; implicit-def: $vgpr24 -; SI-NEXT: ; implicit-def: $vgpr25 +; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 -; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr8 ; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr15 -; SI-NEXT: ; implicit-def: $vgpr11 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr9 ; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr8 +; SI-NEXT: ; implicit-def: $vgpr21 ; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr45 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_branch .LBB53_2 ; @@ -42201,23 +42287,22 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:88 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v51, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v52, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v10 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, 
v9 +; SI-NEXT: v_cvt_f16_f32_e32 v43, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f16_f32_e32 v7, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v42, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v41, v18 @@ -42227,6 +42312,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v40, v22 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v23 ; SI-NEXT: v_cvt_f16_f32_e32 v19, v24 +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v55, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v27 @@ -42236,35 +42322,35 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v18, v46 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v47 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v56 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v12, v58 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v59 ; SI-NEXT: v_cvt_f16_f32_e32 v59, v60 ; SI-NEXT: v_cvt_f16_f32_e32 v27, v61 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v62 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v63 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v63 ; SI-NEXT: v_cvt_f16_f32_e32 v58, v31 ; SI-NEXT: s_waitcnt vmcnt(13) ; SI-NEXT: v_cvt_f16_f32_e32 v26, v36 ; SI-NEXT: s_waitcnt vmcnt(12) ; SI-NEXT: v_cvt_f16_f32_e32 v9, v38 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v63, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v48 ; SI-NEXT: s_waitcnt vmcnt(10) ; SI-NEXT: v_cvt_f16_f32_e32 v36, v53 ; SI-NEXT: s_waitcnt vmcnt(9) ; SI-NEXT: v_cvt_f16_f32_e32 v25, v54 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v32 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v48, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v33 ; SI-NEXT: s_waitcnt vmcnt(6) ; SI-NEXT: v_cvt_f16_f32_e32 v31, v34 ; SI-NEXT: s_waitcnt vmcnt(5) ; SI-NEXT: v_cvt_f16_f32_e32 v24, v35 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v54, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v37 ; SI-NEXT: s_waitcnt vmcnt(3) ; SI-NEXT: v_cvt_f16_f32_e32 v32, v39 ; SI-NEXT: s_waitcnt vmcnt(2) @@ -42276,38 +42362,46 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_mov_b32_e32 v22, v2 ; SI-NEXT: v_mov_b32_e32 v39, v3 ; SI-NEXT: v_mov_b32_e32 v49, v5 -; SI-NEXT: v_mov_b32_e32 v60, v7 -; SI-NEXT: v_mov_b32_e32 v62, v8 +; SI-NEXT: v_mov_b32_e32 v54, v7 +; SI-NEXT: v_mov_b32_e32 v61, v8 +; SI-NEXT: v_mov_b32_e32 v63, v4 ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v33, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 
v34, v34 ; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v33 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v34 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v61 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v62 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 +; SI-NEXT: v_or_b32_e32 v63, v34, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v42 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v48 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 @@ -42319,13 +42413,13 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v8 ; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v61, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v7 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 @@ -42340,8 +42434,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v11 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 @@ -42355,94 +42448,82 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v34, v2 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_add_f32_e32 v34, 
0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 +; SI-NEXT: v_or_b32_e32 v2, v33, v30 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v2, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v33, v2 -; SI-NEXT: v_or_b32_e32 v2, v34, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v43 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v2, v33, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v42 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v34 -; SI-NEXT: v_or_b32_e32 v62, v35, v42 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v54 +; SI-NEXT: v_or_b32_e32 v61, v35, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v41 +; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v34 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v49 ; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v49 +; SI-NEXT: v_or_b32_e32 v54, v33, v41 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v40 ; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v35 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_or_b32_e32 v60, v34, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v40 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; SI-NEXT: v_or_b32_e32 v49, v33, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v55 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v2, v48 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 +; SI-NEXT: v_or_b32_e32 v49, v34, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v55 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v39 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v34, v29 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 ; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v37 ; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v56 +; SI-NEXT: v_or_b32_e32 v39, v35, v55 ; SI-NEXT: 
v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v39 +; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_or_b32_e32 v22, v22, v29 ; SI-NEXT: v_or_b32_e32 v37, v33, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v59 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_or_b32_e32 v35, v34, v27 ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v26 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v58 ; SI-NEXT: v_or_b32_e32 v59, v28, v26 -; SI-NEXT: v_or_b32_e32 v39, v35, v55 -; SI-NEXT: v_or_b32_e32 v30, v30, v27 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v34 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v36 @@ -42461,18 +42542,22 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 ; SI-NEXT: v_or_b32_e32 v31, v25, v57 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 -; SI-NEXT: v_or_b32_e32 v54, v5, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v48 -; SI-NEXT: v_or_b32_e32 v53, v3, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v63 +; SI-NEXT: v_or_b32_e32 v53, v5, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v11, v2 +; SI-NEXT: v_or_b32_e32 v48, v3, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v62 ; SI-NEXT: v_or_b32_e32 v9, v9, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v60 ; SI-NEXT: v_or_b32_e32 v14, v14, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v10 ; SI-NEXT: v_or_b32_e32 v12, v12, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v17 +; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 ; SI-NEXT: v_or_b32_e32 v18, v18, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 ; SI-NEXT: v_or_b32_e32 v23, v23, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 @@ -42480,8 +42565,8 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v20 ; SI-NEXT: v_or_b32_e32 v15, v15, v25 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v13 -; SI-NEXT: v_or_b32_e32 v4, v4, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v11 +; SI-NEXT: v_or_b32_e32 v11, v4, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v2 ; SI-NEXT: v_or_b32_e32 v6, v6, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v52 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v50 @@ -42497,10 +42582,10 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v52, v25, v33 ; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v50 ; SI-NEXT: v_or_b32_e32 v51, v28, v25 -; SI-NEXT: v_alignbit_b32 v45, v51, v38, 16 +; SI-NEXT: v_alignbit_b32 v45, v51, v30, 16 ; SI-NEXT: v_alignbit_b32 v44, v52, v44, 16 ; SI-NEXT: v_alignbit_b32 v43, v6, v42, 16 -; SI-NEXT: v_alignbit_b32 v42, v4, v41, 16 +; SI-NEXT: v_alignbit_b32 v42, v11, v41, 16 ; SI-NEXT: v_alignbit_b32 v41, v15, v46, 16 ; SI-NEXT: v_alignbit_b32 v40, v21, v55, 16 ; SI-NEXT: v_alignbit_b32 v55, v23, v29, 16 @@ -42508,56 +42593,60 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v28, v12, v27, 16 ; 
SI-NEXT: v_alignbit_b32 v27, v14, v26, 16 ; SI-NEXT: v_alignbit_b32 v26, v9, v56, 16 -; SI-NEXT: v_alignbit_b32 v25, v53, v24, 16 -; SI-NEXT: v_alignbit_b32 v24, v54, v57, 16 +; SI-NEXT: v_mov_b32_e32 v56, v35 +; SI-NEXT: v_alignbit_b32 v25, v48, v24, 16 +; SI-NEXT: v_alignbit_b32 v24, v53, v57, 16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v45 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v50 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: v_add_i32_e32 v34, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v2 +; SI-NEXT: v_or_b32_e32 v30, v30, v33 +; SI-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v51 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v50 +; SI-NEXT: v_or_b32_e32 v30, v30, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v44 +; SI-NEXT: v_or_b32_e32 v30, v30, v33 +; SI-NEXT: v_add_i32_e32 v33, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v30, v33, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v30, 0xffff, v52 +; SI-NEXT: v_or_b32_e32 v1, v30, v1 +; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v61 +; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v43 +; SI-NEXT: v_or_b32_e32 v1, v1, v30 +; SI-NEXT: v_add_i32_e32 v30, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v30, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v44 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff, v2 -; SI-NEXT: v_or_b32_e32 v33, v33, v34 -; SI-NEXT: v_add_i32_e32 v34, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v33, v34, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v33, 0xffff, v52 -; SI-NEXT: v_or_b32_e32 v1, v33, v1 -; SI-NEXT: v_add_i32_e32 v33, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v43 -; SI-NEXT: v_or_b32_e32 v1, v1, v33 -; SI-NEXT: v_add_i32_e32 v33, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v1, v33, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v42 
; SI-NEXT: v_or_b32_e32 v1, v1, v6 ; SI-NEXT: v_add_i32_e32 v6, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v1, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v13 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 ; SI-NEXT: v_or_b32_e32 v1, v1, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen @@ -42610,7 +42699,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v4, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v28 ; SI-NEXT: v_or_b32_e32 v1, v1, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 64, v0 @@ -42629,7 +42718,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v61 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 ; SI-NEXT: v_or_b32_e32 v1, v1, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen @@ -42641,7 +42730,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v63 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v62 ; SI-NEXT: v_or_b32_e32 v1, v1, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen @@ -42652,8 +42741,8 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -42664,7 +42753,7 @@ define <52 x i16> @bitcast_v52f16_to_v52i16(<52 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 @@ -43203,482 +43292,533 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i ; SI-LABEL: bitcast_v52f16_to_v52i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill 
-; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:4 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:24 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v10 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:32 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v12 -; SI-NEXT: 
v_cvt_f16_f32_e32 v36, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v18, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s28 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v14 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v18 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v44, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v51, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v46, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s23 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v11, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v41, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v54, s29 +; SI-NEXT: s_waitcnt vmcnt(10) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_cvt_f16_f32_e32 v53, v32 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v33 ; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v28 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v36 ; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v38 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v28, v47 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: 
v_cvt_f16_f32_e32 v5, v56 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v42, s25 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v39 +; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v39, s25 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v43 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v42 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v53 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v53 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v40 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v51 -; SI-NEXT: v_lshlrev_b32_e32 v51, 16, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v43 +; SI-NEXT: v_mov_b32_e32 v38, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v41 +; SI-NEXT: v_mov_b32_e32 v28, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v36, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v39 +; SI-NEXT: v_mov_b32_e32 v9, v15 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; 
SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v39, v7, v19 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v38 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v38, v7, v15 +; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_mov_b32_e32 v29, v11 +; SI-NEXT: v_or_b32_e32 v5, v5, v23 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v29 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v5, v5, v25 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_or_b32_e32 v5, v5, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_or_b32_e32 v3, v3, v29 -; 
SI-NEXT: v_lshlrev_b32_e32 v29, 16, v8 -; SI-NEXT: v_or_b32_e32 v9, v9, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v7 -; SI-NEXT: v_or_b32_e32 v14, v14, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 -; SI-NEXT: v_or_b32_e32 v12, v12, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v16 -; SI-NEXT: v_or_b32_e32 v17, v17, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v31 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_or_b32_e32 v36, v36, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v32 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v34, v34, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v29 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v48 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 +; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 ; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; 
SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_or_b32_e32 v48, v29, v48 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_or_b32_e32 v50, v50, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v18 -; SI-NEXT: v_or_b32_e32 v19, v19, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v22 -; SI-NEXT: v_or_b32_e32 v25, v25, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v45 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v40 -; SI-NEXT: v_or_b32_e32 v24, v24, v29 -; SI-NEXT: v_or_b32_e32 v27, v27, v43 -; SI-NEXT: v_or_b32_e32 v26, v26, v45 -; SI-NEXT: v_or_b32_e32 v21, v21, v30 -; SI-NEXT: v_or_b32_e32 v20, v20, v41 -; SI-NEXT: v_or_b32_e32 v49, v49, v46 -; SI-NEXT: v_or_b32_e32 v37, v37, v55 -; SI-NEXT: v_or_b32_e32 v35, v35, v54 -; SI-NEXT: v_or_b32_e32 v33, v33, v47 -; SI-NEXT: v_or_b32_e32 v15, v15, v52 -; SI-NEXT: v_or_b32_e32 v13, v13, v51 -; SI-NEXT: v_or_b32_e32 v11, v11, v56 -; SI-NEXT: v_or_b32_e32 v6, v6, v28 -; SI-NEXT: v_or_b32_e32 v4, v4, v57 -; SI-NEXT: v_alignbit_b32 v44, v24, v43, 16 -; SI-NEXT: v_alignbit_b32 v43, v25, v45, 16 -; SI-NEXT: v_alignbit_b32 v42, v19, v30, 16 -; SI-NEXT: v_alignbit_b32 v30, v50, v41, 16 -; SI-NEXT: v_alignbit_b32 v41, v48, v46, 16 -; SI-NEXT: v_alignbit_b32 v40, v34, v55, 16 -; SI-NEXT: v_alignbit_b32 v55, v36, v54, 16 -; SI-NEXT: v_alignbit_b32 v54, v17, v47, 16 -; SI-NEXT: v_alignbit_b32 v53, v12, v52, 16 -; SI-NEXT: v_alignbit_b32 v52, v14, v51, 16 -; SI-NEXT: v_alignbit_b32 v51, v9, v56, 16 -; SI-NEXT: v_alignbit_b32 v29, v3, v28, 16 -; SI-NEXT: v_alignbit_b32 v28, v5, v57, 16 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v28 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v7, v7, v17 +; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v9, v44 +; SI-NEXT: v_or_b32_e32 v5, v5, v21 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded 
Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v56 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v9 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v5, v5, v11 +; SI-NEXT: v_or_b32_e32 v56, v7, v13 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v31 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v40 +; SI-NEXT: v_or_b32_e32 v36, v1, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v55 +; SI-NEXT: v_or_b32_e32 v37, v28, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v53 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v35, v3, v5 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 +; SI-NEXT: v_or_b32_e32 v33, v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v30 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_or_b32_e32 v31, v29, v1 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v29, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v30 +; SI-NEXT: v_or_b32_e32 v2, v2, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v57 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v29 +; SI-NEXT: v_or_b32_e32 v4, v4, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v51 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v57 +; SI-NEXT: v_or_b32_e32 v6, v6, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v47 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_or_b32_e32 v10, v10, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v63 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v47 +; SI-NEXT: v_or_b32_e32 v12, v12, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v63 +; SI-NEXT: v_or_b32_e32 v14, v14, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v62 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 +; SI-NEXT: v_or_b32_e32 v18, v18, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v58 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v27 +; SI-NEXT: v_lshr_b64 v[50:51], v[17:18], 16 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 +; SI-NEXT: 
v_lshlrev_b32_e32 v27, 16, v62 +; SI-NEXT: v_or_b32_e32 v22, v22, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v61 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v58 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v45 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v28 +; SI-NEXT: v_lshr_b64 v[54:55], v[25:26], 16 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v45, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v46 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v61 +; SI-NEXT: v_lshr_b64 v[52:53], v[21:22], 16 +; SI-NEXT: v_or_b32_e32 v16, v16, v28 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v45 +; SI-NEXT: v_or_b32_e32 v20, v20, v27 +; SI-NEXT: v_mov_b32_e32 v53, v33 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v46 +; SI-NEXT: v_or_b32_e32 v24, v24, v27 +; SI-NEXT: v_lshr_b64 v[43:44], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[7:8], 16 +; SI-NEXT: v_mov_b32_e32 v7, v56 +; SI-NEXT: v_lshr_b64 v[55:56], v[3:4], 16 +; SI-NEXT: v_mov_b32_e32 v44, v37 +; SI-NEXT: v_lshr_b64 v[41:42], v[19:20], 16 +; SI-NEXT: v_mov_b32_e32 v19, v39 +; SI-NEXT: v_lshr_b64 v[39:40], v[15:16], 16 +; SI-NEXT: v_mov_b32_e32 v15, v38 +; SI-NEXT: v_lshr_b64 v[37:38], v[11:12], 16 +; SI-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v42, v36 +; SI-NEXT: v_mov_b32_e32 v40, v35 +; SI-NEXT: v_mov_b32_e32 v51, v32 +; SI-NEXT: v_lshr_b64 v[48:49], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[9:10], 16 +; SI-NEXT: v_mov_b32_e32 v34, v31 +; SI-NEXT: v_lshr_b64 v[31:32], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[1:2], 16 +; SI-NEXT: v_mov_b32_e32 v32, v29 ; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; SI-NEXT: v_or_b32_e32 v27, v27, v44 -; SI-NEXT: v_or_b32_e32 v23, v24, v23 -; SI-NEXT: v_add_i32_e32 v24, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v43 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v46 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; SI-NEXT: v_or_b32_e32 v22, v23, v22 -; SI-NEXT: v_add_i32_e32 v23, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v42 -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; SI-NEXT: v_or_b32_e32 v21, v21, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v18, v19, v18 -; SI-NEXT: v_add_i32_e32 v19, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v30 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v49 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v39 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v40 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v35 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v55 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v31 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v54 -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; SI-NEXT: v_or_b32_e32 v16, v17, v16 -; SI-NEXT: v_add_i32_e32 v17, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v18, v19, 
s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v54 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v53 -; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; SI-NEXT: v_or_b32_e32 v15, v15, v16 -; SI-NEXT: v_add_i32_e32 v16, vcc, 64, v0 -; SI-NEXT: v_or_b32_e32 v10, v12, v10 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v52 -; SI-NEXT: v_or_b32_e32 v10, v10, v12 -; SI-NEXT: v_add_i32_e32 v12, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v10, v12, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v10, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; SI-NEXT: v_or_b32_e32 v7, v10, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v51 -; SI-NEXT: v_or_b32_e32 v7, v7, v10 -; SI-NEXT: v_add_i32_e32 v10, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v7, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; SI-NEXT: v_or_b32_e32 v7, v7, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v29 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: v_or_b32_e32 v6, v6, v7 -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x58, v0 -; SI-NEXT: v_or_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v7 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: 
v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v42 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v40 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v28 -; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, 
v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 +; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB59_4: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll index a43ce77b20631..8ee5b966f40b8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll @@ -3637,337 +3637,368 @@ define inreg <56 x i16> @bitcast_v28i32_to_v56i16_scalar(<28 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v28i32_to_v56i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s30, 0 +; SI-NEXT: v_writelane_b32 v16, s31, 1 +; SI-NEXT: v_writelane_b32 v16, s34, 2 +; SI-NEXT: v_writelane_b32 v16, s35, 3 +; SI-NEXT: v_writelane_b32 v16, s36, 4 +; SI-NEXT: v_writelane_b32 v16, s37, 5 +; SI-NEXT: v_writelane_b32 v16, s38, 6 +; SI-NEXT: v_writelane_b32 v16, s39, 
7 +; SI-NEXT: v_writelane_b32 v16, s48, 8 +; SI-NEXT: v_writelane_b32 v16, s49, 9 +; SI-NEXT: v_writelane_b32 v16, s50, 10 +; SI-NEXT: v_writelane_b32 v16, s51, 11 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_readfirstlane_b32 s43, v1 -; SI-NEXT: v_readfirstlane_b32 s42, v2 -; SI-NEXT: v_readfirstlane_b32 s41, v3 -; SI-NEXT: v_readfirstlane_b32 s40, v4 -; SI-NEXT: v_readfirstlane_b32 s15, v5 -; SI-NEXT: v_readfirstlane_b32 s14, v6 -; SI-NEXT: v_readfirstlane_b32 s13, v7 -; SI-NEXT: v_readfirstlane_b32 s12, v8 -; SI-NEXT: v_readfirstlane_b32 s11, v9 -; SI-NEXT: v_readfirstlane_b32 s10, v10 -; SI-NEXT: v_readfirstlane_b32 s9, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v12 -; SI-NEXT: v_readfirstlane_b32 s7, v13 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: v_writelane_b32 v16, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s40, v1 +; SI-NEXT: v_readfirstlane_b32 s41, v2 +; SI-NEXT: v_readfirstlane_b32 s14, v3 +; SI-NEXT: v_readfirstlane_b32 s15, v4 +; SI-NEXT: v_readfirstlane_b32 s12, v5 +; SI-NEXT: v_readfirstlane_b32 s13, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v9 +; SI-NEXT: v_readfirstlane_b32 s9, v10 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: v_readfirstlane_b32 s7, v12 +; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: s_and_b64 s[42:43], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: v_writelane_b32 v16, s53, 13 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s43 -; SI-NEXT: v_mov_b32_e32 v8, s28 -; SI-NEXT: v_mov_b32_e32 v9, s26 -; SI-NEXT: v_mov_b32_e32 v10, s24 -; SI-NEXT: v_mov_b32_e32 v11, s22 -; SI-NEXT: v_mov_b32_e32 v12, s20 -; SI-NEXT: v_mov_b32_e32 v13, s18 -; SI-NEXT: v_mov_b32_e32 v14, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s29, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s27, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s25, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s23, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s21, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s19, v13, 16 -; SI-NEXT: v_alignbit_b32 v14, s17, v14, 16 -; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: s_lshr_b32 s45, s8, 16 -; SI-NEXT: s_lshr_b32 s46, s10, 16 -; SI-NEXT: s_lshr_b32 s47, s12, 16 -; SI-NEXT: s_lshr_b32 s56, s14, 16 -; SI-NEXT: s_lshr_b32 s57, s40, 16 -; SI-NEXT: s_lshr_b32 s58, s42, 16 -; SI-NEXT: s_lshr_b32 s59, s29, 16 -; SI-NEXT: s_lshr_b32 s60, s27, 16 -; SI-NEXT: s_lshr_b32 s61, s25, 16 -; SI-NEXT: s_lshr_b32 s62, s23, 16 -; SI-NEXT: s_lshr_b32 s63, s21, 16 -; SI-NEXT: s_lshr_b32 s72, s19, 16 -; SI-NEXT: s_lshr_b32 s73, s17, 16 +; SI-NEXT: s_lshr_b32 s30, s5, 16 +; SI-NEXT: s_lshr_b32 s31, s7, 16 +; SI-NEXT: s_lshr_b32 s34, s9, 16 +; SI-NEXT: s_lshr_b32 s35, s11, 16 +; SI-NEXT: s_lshr_b32 s36, s13, 16 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s38, s41, 16 +; SI-NEXT: s_lshr_b32 s39, s29, 16 +; SI-NEXT: s_lshr_b32 s48, s27, 16 +; SI-NEXT: s_lshr_b32 s49, s25, 16 +; SI-NEXT: s_lshr_b32 s50, s23, 16 +; SI-NEXT: 
s_lshr_b32 s51, s21, 16 +; SI-NEXT: s_lshr_b32 s52, s19, 16 +; SI-NEXT: s_lshr_b32 s53, s17, 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s43, s43, 3 ; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s42, s42, 3 ; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s43 -; SI-NEXT: v_mov_b32_e32 v8, s28 -; SI-NEXT: v_mov_b32_e32 v9, s26 -; SI-NEXT: v_mov_b32_e32 v10, s24 -; SI-NEXT: v_mov_b32_e32 v11, s22 -; SI-NEXT: v_mov_b32_e32 v12, s20 -; SI-NEXT: v_mov_b32_e32 v13, s18 -; SI-NEXT: v_mov_b32_e32 v14, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s29, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s27, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s25, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s23, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s21, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s19, v13, 16 -; SI-NEXT: v_alignbit_b32 v14, s17, v14, 16 -; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: s_lshr_b32 s45, s8, 16 -; SI-NEXT: s_lshr_b32 s46, s10, 16 -; SI-NEXT: s_lshr_b32 s47, s12, 16 -; SI-NEXT: s_lshr_b32 s56, s14, 16 -; SI-NEXT: s_lshr_b32 s57, s40, 16 -; SI-NEXT: s_lshr_b32 s58, s42, 16 -; SI-NEXT: s_lshr_b32 s59, s29, 16 -; SI-NEXT: s_lshr_b32 s60, s27, 16 -; 
SI-NEXT: s_lshr_b32 s61, s25, 16 -; SI-NEXT: s_lshr_b32 s62, s23, 16 -; SI-NEXT: s_lshr_b32 s63, s21, 16 -; SI-NEXT: s_lshr_b32 s72, s19, 16 -; SI-NEXT: s_lshr_b32 s73, s17, 16 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[26:27], 16 +; SI-NEXT: s_lshr_b32 s30, s5, 16 +; SI-NEXT: s_lshr_b32 s31, s7, 16 +; SI-NEXT: s_lshr_b32 s34, s9, 16 +; SI-NEXT: s_lshr_b32 s35, s11, 16 +; SI-NEXT: s_lshr_b32 s36, s13, 16 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s38, s41, 16 +; SI-NEXT: s_lshr_b32 s39, s29, 16 +; SI-NEXT: s_lshr_b32 s48, s27, 16 +; SI-NEXT: s_lshr_b32 s49, s25, 16 +; SI-NEXT: s_lshr_b32 s50, s23, 16 +; SI-NEXT: s_lshr_b32 s51, s21, 16 +; SI-NEXT: s_lshr_b32 s52, s19, 16 +; SI-NEXT: s_lshr_b32 s53, s17, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, s4, v14 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s73, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v13, s4, v13 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 -; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v14, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_lshl_b32 s43, s92, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s43 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s16, s90, 16 +; SI-NEXT: s_and_b32 s17, s18, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: s_and_b32 s16, s19, 0xffff +; SI-NEXT: s_lshl_b32 s17, s52, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; SI-NEXT: s_lshl_b32 s5, s63, 16 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_lshl_b32 s16, s88, 16 +; SI-NEXT: s_and_b32 s17, s20, 0xffff +; SI-NEXT: 
buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s21, 0xffff +; SI-NEXT: s_lshl_b32 s17, s51, 16 +; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 -; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v12, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s22, 0xffff +; SI-NEXT: s_lshl_b32 s17, s78, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s61, 16 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s23, 0xffff +; SI-NEXT: s_lshl_b32 s17, s50, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s60, 16 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s24, 0xffff +; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s59, 16 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s25, 0xffff +; SI-NEXT: s_lshl_b32 s17, s49, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v7, s4, 
v7 -; SI-NEXT: s_and_b32 s4, s42, 0xffff -; SI-NEXT: s_lshl_b32 s5, s58, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v8, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s40, 0xffff -; SI-NEXT: s_lshl_b32 s5, s57, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s48, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s14, 0xffff -; SI-NEXT: s_lshl_b32 s5, s56, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s28, 0xffff +; SI-NEXT: s_lshl_b32 s17, s72, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s12, 0xffff -; SI-NEXT: s_lshl_b32 s5, s47, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s29, 0xffff +; SI-NEXT: s_lshl_b32 s17, s39, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s46, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xffff +; SI-NEXT: s_lshl_b32 s17, s62, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s45, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xffff +; SI-NEXT: s_lshl_b32 s17, s38, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s44, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s60, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff +; SI-NEXT: s_lshl_b32 s15, s37, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s58, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_lshl_b32 s13, s36, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s56, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff +; SI-NEXT: s_lshl_b32 s11, s35, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s46, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s34, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s44, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s31, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 
0x60, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s42, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s30, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s53, v16, 13 +; SI-NEXT: v_readlane_b32 s52, v16, 12 +; SI-NEXT: v_readlane_b32 s51, v16, 11 +; SI-NEXT: v_readlane_b32 s50, v16, 10 +; SI-NEXT: v_readlane_b32 s49, v16, 9 +; SI-NEXT: v_readlane_b32 s48, v16, 8 +; SI-NEXT: v_readlane_b32 s39, v16, 7 +; SI-NEXT: v_readlane_b32 s38, v16, 6 +; SI-NEXT: v_readlane_b32 s37, v16, 5 +; SI-NEXT: v_readlane_b32 s36, v16, 4 +; SI-NEXT: v_readlane_b32 s35, v16, 3 +; SI-NEXT: v_readlane_b32 s34, v16, 2 +; SI-NEXT: v_readlane_b32 s31, v16, 1 +; SI-NEXT: v_readlane_b32 s30, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v28i32_to_v56i16_scalar: @@ -5900,48 +5931,52 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x 
i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 -; SI-NEXT: v_mov_b32_e32 v32, v26 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: v_mov_b32_e32 v31, v26 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: v_mov_b32_e32 v42, v22 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: v_mov_b32_e32 v44, v16 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 
v34, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -5950,87 +5985,87 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v7, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v11, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: v_or_b32_e32 v12, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v13, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v13, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v14, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v15, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v17, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: v_or_b32_e32 v16, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v17, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v19, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: 
v_or_b32_e32 v20, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_or_b32_e32 v21, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v25, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: v_or_b32_e32 v26, v0, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; SI-NEXT: v_or_b32_e32 v27, v0, v29 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -6040,9 +6075,10 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -6085,96 +6121,96 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 
v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -6198,74 +6234,83 @@ define inreg <28 x i32> @bitcast_v56i16_to_v28i32_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mov_b32_e32 v41, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v41, v30 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mov_b32_e32 v56, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v42 +; SI-NEXT: v_mov_b32_e32 v42, v28 +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v57 ; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v61 -; SI-NEXT: v_mov_b32_e32 v61, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_mov_b32_e32 v46, v49 +; SI-NEXT: v_mov_b32_e32 v49, v43 +; 
SI-NEXT: v_mov_b32_e32 v43, v31 +; SI-NEXT: v_mov_b32_e32 v53, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v47 -; SI-NEXT: v_mov_b32_e32 v47, v58 -; SI-NEXT: v_mov_b32_e32 v58, v61 -; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v32 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v53 +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v43, v49 +; SI-NEXT: v_mov_b32_e32 v49, v46 ; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v60 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v43 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v57, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v42 +; SI-NEXT: v_mov_b32_e32 v42, v45 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v56 ; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v47, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v56i16_to_v28i32_scalar: @@ -14755,223 +14800,227 @@ 
define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_mov_b32_e32 v30, s16 -; SI-NEXT: v_mov_b32_e32 v29, s17 -; SI-NEXT: v_mov_b32_e32 v25, s18 -; SI-NEXT: v_mov_b32_e32 v23, s19 -; SI-NEXT: v_mov_b32_e32 v28, s20 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v27, s16 +; SI-NEXT: v_mov_b32_e32 v28, s17 +; SI-NEXT: v_mov_b32_e32 v23, s18 +; SI-NEXT: v_mov_b32_e32 v24, s19 +; SI-NEXT: v_mov_b32_e32 v25, s20 ; SI-NEXT: v_mov_b32_e32 v26, s21 -; SI-NEXT: v_mov_b32_e32 v24, s22 +; SI-NEXT: v_mov_b32_e32 v21, s22 ; SI-NEXT: v_mov_b32_e32 v22, s23 -; SI-NEXT: v_mov_b32_e32 v20, s24 -; SI-NEXT: v_mov_b32_e32 v19, s25 -; SI-NEXT: v_mov_b32_e32 v18, s26 -; SI-NEXT: v_mov_b32_e32 v17, s27 -; SI-NEXT: v_mov_b32_e32 v16, s28 -; SI-NEXT: v_mov_b32_e32 v15, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v19, s24 +; SI-NEXT: v_mov_b32_e32 v20, s25 +; SI-NEXT: v_mov_b32_e32 v17, s26 +; SI-NEXT: v_mov_b32_e32 v18, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v15, s28 +; SI-NEXT: v_mov_b32_e32 v16, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v27, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v32, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v39, v15, v16, 16 -; SI-NEXT: v_alignbit_b32 v49, v17, v18, 16 -; SI-NEXT: v_alignbit_b32 v51, v19, v20, 16 -; SI-NEXT: v_alignbit_b32 v54, v22, v24, 16 -; SI-NEXT: v_alignbit_b32 v40, v26, v28, 16 +; SI-NEXT: 
v_lshr_b64 v[29:30], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v23, v25, 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v44, v29, v30, 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 +; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[27:28], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 -; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 +; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 +; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 ; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 +; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[32:33], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 ; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: 
v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 ; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_lshr_b64 v[34:35], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 ; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 ; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 -; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 -; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_alignbit_b32 v21, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v27, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v32, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v37, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v39, v15, v16, 16 -; SI-NEXT: v_alignbit_b32 v49, v17, v18, 16 -; SI-NEXT: v_alignbit_b32 v51, v19, v20, 16 -; SI-NEXT: v_alignbit_b32 v54, v22, v24, 16 -; SI-NEXT: v_alignbit_b32 v40, v26, v28, 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v23, v25, 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v44, v29, v30, 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_or_b32_e32 v30, v30, v44 -; SI-NEXT: buffer_store_dword v30, v0, s[0:3], 0 
offen -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v56 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v52 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v37 +; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v42 -; SI-NEXT: v_or_b32_e32 v25, v25, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v25, v29, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v51 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v27 +; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v47 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v58 +; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v40 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 +; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v46 -; SI-NEXT: v_or_b32_e32 v23, v23, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v57 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 24, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v45 -; SI-NEXT: v_or_b32_e32 v22, v22, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 +; SI-NEXT: v_or_b32_e32 v21, v21, v23 +; SI-NEXT: v_add_i32_e32 v23, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v51 -; SI-NEXT: v_or_b32_e32 v20, v20, v22 -; SI-NEXT: v_add_i32_e32 v22, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v20, v22, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v56 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 
+; SI-NEXT: v_add_i32_e32 v22, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v49 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 -; SI-NEXT: v_or_b32_e32 v16, v16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 +; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v45 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -14983,7 +15032,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -14995,7 +15044,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 ; SI-NEXT: 
v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -15007,7 +15056,7 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -15019,74 +15068,77 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v30 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v29 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 
offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr49 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr27 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr35 +; SI-NEXT: ; implicit-def: $vgpr30 +; SI-NEXT: ; implicit-def: $vgpr29 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v28f32_to_v56i16_scalar: @@ -17015,48 +17067,52 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: 
buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 -; SI-NEXT: v_mov_b32_e32 v32, v26 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: v_mov_b32_e32 v31, v26 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: v_mov_b32_e32 v42, v22 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: v_mov_b32_e32 v44, v16 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -17065,87 +17121,87 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; 
%cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v7, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v11, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: v_or_b32_e32 v12, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v13, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v13, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v14, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v15, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v17, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: v_or_b32_e32 v16, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v17, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v19, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_or_b32_e32 v20, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_or_b32_e32 v21, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v25, v0, v53 +; 
SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: v_or_b32_e32 v26, v0, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; SI-NEXT: v_or_b32_e32 v27, v0, v29 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -17155,9 +17211,10 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -17200,96 +17257,96 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; 
SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -17313,74 +17370,83 @@ define inreg <28 x float> @bitcast_v56i16_to_v28f32_scalar(<56 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB31_4: -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v56 -; 
SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mov_b32_e32 v41, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v41, v30 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mov_b32_e32 v56, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v42 +; SI-NEXT: v_mov_b32_e32 v42, v28 +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v57 ; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v61 -; SI-NEXT: v_mov_b32_e32 v61, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_mov_b32_e32 v46, v49 +; SI-NEXT: v_mov_b32_e32 v49, v43 +; SI-NEXT: v_mov_b32_e32 v43, v31 +; SI-NEXT: v_mov_b32_e32 v53, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v47 -; SI-NEXT: v_mov_b32_e32 v47, v58 -; SI-NEXT: v_mov_b32_e32 v58, v61 -; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v32 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; 
SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v53 +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v43, v49 +; SI-NEXT: v_mov_b32_e32 v49, v46 ; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v60 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v43 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v57, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v42 +; SI-NEXT: v_mov_b32_e32 v42, v45 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v56 ; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v47, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v56i16_to_v28f32_scalar: @@ -19525,8 +19591,8 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v24 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v29 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v23 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v17 @@ -19565,8 +19631,8 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a, ; SI-NEXT: v_cvt_f32_f16_e32 v4, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v47 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v45 ; SI-NEXT: v_mov_b32_e32 v45, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v43 @@ -25008,337 +25074,368 @@ define inreg <56 x i16> @bitcast_v14i64_to_v56i16_scalar(<14 x i64> inreg %a, i3 ; SI-LABEL: bitcast_v14i64_to_v56i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v16, off, s[0:3], 
s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v16, s30, 0 +; SI-NEXT: v_writelane_b32 v16, s31, 1 +; SI-NEXT: v_writelane_b32 v16, s34, 2 +; SI-NEXT: v_writelane_b32 v16, s35, 3 +; SI-NEXT: v_writelane_b32 v16, s36, 4 +; SI-NEXT: v_writelane_b32 v16, s37, 5 +; SI-NEXT: v_writelane_b32 v16, s38, 6 +; SI-NEXT: v_writelane_b32 v16, s39, 7 +; SI-NEXT: v_writelane_b32 v16, s48, 8 +; SI-NEXT: v_writelane_b32 v16, s49, 9 +; SI-NEXT: v_writelane_b32 v16, s50, 10 +; SI-NEXT: v_writelane_b32 v16, s51, 11 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; SI-NEXT: v_readfirstlane_b32 s43, v1 -; SI-NEXT: v_readfirstlane_b32 s42, v2 -; SI-NEXT: v_readfirstlane_b32 s41, v3 -; SI-NEXT: v_readfirstlane_b32 s40, v4 -; SI-NEXT: v_readfirstlane_b32 s15, v5 -; SI-NEXT: v_readfirstlane_b32 s14, v6 -; SI-NEXT: v_readfirstlane_b32 s13, v7 -; SI-NEXT: v_readfirstlane_b32 s12, v8 -; SI-NEXT: v_readfirstlane_b32 s11, v9 -; SI-NEXT: v_readfirstlane_b32 s10, v10 -; SI-NEXT: v_readfirstlane_b32 s9, v11 -; SI-NEXT: v_readfirstlane_b32 s8, v12 -; SI-NEXT: v_readfirstlane_b32 s7, v13 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v14 +; SI-NEXT: v_writelane_b32 v16, s52, 12 +; SI-NEXT: v_readfirstlane_b32 s40, v1 +; SI-NEXT: v_readfirstlane_b32 s41, v2 +; SI-NEXT: v_readfirstlane_b32 s14, v3 +; SI-NEXT: v_readfirstlane_b32 s15, v4 +; SI-NEXT: v_readfirstlane_b32 s12, v5 +; SI-NEXT: v_readfirstlane_b32 s13, v6 +; SI-NEXT: v_readfirstlane_b32 s10, v7 +; SI-NEXT: v_readfirstlane_b32 s11, v8 +; SI-NEXT: v_readfirstlane_b32 s8, v9 +; SI-NEXT: v_readfirstlane_b32 s9, v10 +; SI-NEXT: v_readfirstlane_b32 s6, v11 +; SI-NEXT: v_readfirstlane_b32 s7, v12 +; SI-NEXT: v_readfirstlane_b32 s4, v13 +; SI-NEXT: s_and_b64 s[42:43], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v14 +; SI-NEXT: v_writelane_b32 v16, s53, 13 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s43 -; SI-NEXT: v_mov_b32_e32 v8, s28 -; SI-NEXT: v_mov_b32_e32 v9, s26 -; SI-NEXT: v_mov_b32_e32 v10, s24 -; SI-NEXT: v_mov_b32_e32 v11, s22 -; SI-NEXT: v_mov_b32_e32 v12, s20 -; SI-NEXT: v_mov_b32_e32 v13, s18 -; SI-NEXT: v_mov_b32_e32 v14, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s29, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s27, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s25, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s23, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s21, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s19, v13, 16 -; SI-NEXT: v_alignbit_b32 v14, s17, v14, 16 -; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: s_lshr_b32 s45, s8, 16 -; SI-NEXT: s_lshr_b32 s46, s10, 16 -; SI-NEXT: s_lshr_b32 s47, s12, 16 -; SI-NEXT: s_lshr_b32 s56, s14, 16 -; SI-NEXT: s_lshr_b32 s57, s40, 16 -; SI-NEXT: s_lshr_b32 s58, s42, 16 -; SI-NEXT: s_lshr_b32 s59, s29, 16 -; SI-NEXT: s_lshr_b32 s60, s27, 16 -; SI-NEXT: s_lshr_b32 s61, s25, 16 -; SI-NEXT: s_lshr_b32 s62, s23, 16 -; SI-NEXT: s_lshr_b32 s63, s21, 16 -; SI-NEXT: s_lshr_b32 s72, s19, 16 -; SI-NEXT: s_lshr_b32 s73, s17, 
16 +; SI-NEXT: s_lshr_b32 s30, s5, 16 +; SI-NEXT: s_lshr_b32 s31, s7, 16 +; SI-NEXT: s_lshr_b32 s34, s9, 16 +; SI-NEXT: s_lshr_b32 s35, s11, 16 +; SI-NEXT: s_lshr_b32 s36, s13, 16 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s38, s41, 16 +; SI-NEXT: s_lshr_b32 s39, s29, 16 +; SI-NEXT: s_lshr_b32 s48, s27, 16 +; SI-NEXT: s_lshr_b32 s49, s25, 16 +; SI-NEXT: s_lshr_b32 s50, s23, 16 +; SI-NEXT: s_lshr_b32 s51, s21, 16 +; SI-NEXT: s_lshr_b32 s52, s19, 16 +; SI-NEXT: s_lshr_b32 s53, s17, 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 ; SI-NEXT: s_add_u32 s28, s28, 3 ; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s43, s43, 3 -; SI-NEXT: s_addc_u32 s42, s42, 0 -; SI-NEXT: s_add_u32 s41, s41, 3 -; SI-NEXT: s_addc_u32 s40, s40, 0 -; SI-NEXT: s_add_u32 s15, s15, 3 -; SI-NEXT: s_addc_u32 s14, s14, 0 -; SI-NEXT: s_add_u32 s13, s13, 3 -; SI-NEXT: s_addc_u32 s12, s12, 0 -; SI-NEXT: s_add_u32 s11, s11, 3 -; SI-NEXT: s_addc_u32 s10, s10, 0 -; SI-NEXT: s_add_u32 s9, s9, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s6, s6, 0 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s43 -; SI-NEXT: v_mov_b32_e32 v8, s28 -; SI-NEXT: v_mov_b32_e32 v9, s26 -; SI-NEXT: v_mov_b32_e32 v10, s24 -; SI-NEXT: v_mov_b32_e32 v11, s22 -; SI-NEXT: v_mov_b32_e32 v12, s20 -; SI-NEXT: v_mov_b32_e32 v13, s18 -; SI-NEXT: v_mov_b32_e32 v14, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s29, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s27, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s25, v10, 16 -; 
SI-NEXT: v_alignbit_b32 v11, s23, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s21, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s19, v13, 16 -; SI-NEXT: v_alignbit_b32 v14, s17, v14, 16 -; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: s_lshr_b32 s45, s8, 16 -; SI-NEXT: s_lshr_b32 s46, s10, 16 -; SI-NEXT: s_lshr_b32 s47, s12, 16 -; SI-NEXT: s_lshr_b32 s56, s14, 16 -; SI-NEXT: s_lshr_b32 s57, s40, 16 -; SI-NEXT: s_lshr_b32 s58, s42, 16 -; SI-NEXT: s_lshr_b32 s59, s29, 16 -; SI-NEXT: s_lshr_b32 s60, s27, 16 -; SI-NEXT: s_lshr_b32 s61, s25, 16 -; SI-NEXT: s_lshr_b32 s62, s23, 16 -; SI-NEXT: s_lshr_b32 s63, s21, 16 -; SI-NEXT: s_lshr_b32 s72, s19, 16 -; SI-NEXT: s_lshr_b32 s73, s17, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s30, s5, 16 +; SI-NEXT: s_lshr_b32 s31, s7, 16 +; SI-NEXT: s_lshr_b32 s34, s9, 16 +; SI-NEXT: s_lshr_b32 s35, s11, 16 +; SI-NEXT: s_lshr_b32 s36, s13, 16 +; SI-NEXT: s_lshr_b32 s37, s15, 16 +; SI-NEXT: s_lshr_b32 s38, s41, 16 +; SI-NEXT: s_lshr_b32 s39, s29, 16 +; SI-NEXT: s_lshr_b32 s48, s27, 16 +; SI-NEXT: s_lshr_b32 s49, s25, 16 +; SI-NEXT: s_lshr_b32 s50, s23, 16 +; SI-NEXT: s_lshr_b32 s51, s21, 16 +; SI-NEXT: s_lshr_b32 s52, s19, 16 +; SI-NEXT: s_lshr_b32 s53, s17, 16 +; SI-NEXT: s_lshr_b64 s[42:43], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[16:17], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v14, s4, v14 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s73, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; SI-NEXT: buffer_store_dword v14, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v13, s4, v13 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 -; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v14, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: s_lshl_b32 s43, s92, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s43 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s16, s90, 16 +; SI-NEXT: s_and_b32 s17, s18, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: 
v_mov_b32_e32 v3, s16 +; SI-NEXT: s_and_b32 s16, s19, 0xffff +; SI-NEXT: s_lshl_b32 s17, s52, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; SI-NEXT: s_lshl_b32 s5, s63, 16 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_lshl_b32 s16, s88, 16 +; SI-NEXT: s_and_b32 s17, s20, 0xffff +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s21, 0xffff +; SI-NEXT: s_lshl_b32 s17, s51, 16 +; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 -; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v12, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s22, 0xffff +; SI-NEXT: s_lshl_b32 s17, s78, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s61, 16 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s23, 0xffff +; SI-NEXT: s_lshl_b32 s17, s50, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s60, 16 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s24, 0xffff +; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, 
s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s59, 16 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s25, 0xffff +; SI-NEXT: s_lshl_b32 s17, s49, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s42, 0xffff -; SI-NEXT: s_lshl_b32 s5, s58, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v8, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s40, 0xffff -; SI-NEXT: s_lshl_b32 s5, s57, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s48, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s14, 0xffff -; SI-NEXT: s_lshl_b32 s5, s56, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s28, 0xffff +; SI-NEXT: s_lshl_b32 s17, s72, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s12, 0xffff -; SI-NEXT: s_lshl_b32 s5, s47, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s29, 0xffff +; SI-NEXT: s_lshl_b32 s17, s39, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 
s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s46, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xffff +; SI-NEXT: s_lshl_b32 s17, s62, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s45, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xffff +; SI-NEXT: s_lshl_b32 s17, s38, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s44, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s60, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff +; SI-NEXT: s_lshl_b32 s15, s37, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s58, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_lshl_b32 s13, s36, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s56, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff +; SI-NEXT: s_lshl_b32 s11, s35, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s46, 16 +; 
SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s34, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s44, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s31, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s42, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s30, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s53, v16, 13 +; SI-NEXT: v_readlane_b32 s52, v16, 12 +; SI-NEXT: v_readlane_b32 s51, v16, 11 +; SI-NEXT: v_readlane_b32 s50, v16, 10 +; SI-NEXT: v_readlane_b32 s49, v16, 9 +; SI-NEXT: v_readlane_b32 s48, v16, 8 +; SI-NEXT: v_readlane_b32 s39, v16, 7 +; SI-NEXT: v_readlane_b32 s38, v16, 6 +; SI-NEXT: v_readlane_b32 s37, v16, 5 +; SI-NEXT: v_readlane_b32 s36, v16, 4 +; SI-NEXT: v_readlane_b32 s35, v16, 3 +; SI-NEXT: v_readlane_b32 s34, v16, 2 +; SI-NEXT: v_readlane_b32 s31, v16, 1 +; SI-NEXT: v_readlane_b32 s30, v16, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr51 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr50 +; SI-NEXT: ; implicit-def: $sgpr76 +; SI-NEXT: ; implicit-def: $sgpr49 +; SI-NEXT: ; implicit-def: $sgpr74 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr37 ; 
SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr35 ; SI-NEXT: ; implicit-def: $sgpr46 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr45 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr34 ; SI-NEXT: ; implicit-def: $sgpr44 +; SI-NEXT: ; implicit-def: $sgpr31 +; SI-NEXT: ; implicit-def: $sgpr42 +; SI-NEXT: ; implicit-def: $sgpr30 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v14i64_to_v56i16_scalar: @@ -27271,48 +27368,52 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v61, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 -; SI-NEXT: v_mov_b32_e32 v32, v26 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: v_mov_b32_e32 v31, v26 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: v_mov_b32_e32 v42, v22 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: v_mov_b32_e32 v44, v16 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 
v3, 16, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -27321,87 +27422,87 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v7, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v11, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: v_or_b32_e32 v12, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v13, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v13, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v14, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v15, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: 
v_or_b32_e32 v17, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: v_or_b32_e32 v16, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v17, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v19, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_or_b32_e32 v20, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_or_b32_e32 v21, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v25, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: v_or_b32_e32 v26, v0, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; SI-NEXT: v_or_b32_e32 v27, v0, v29 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -27411,9 +27512,10 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -27456,96 +27558,96 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> 
inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: 
v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -27569,74 +27671,83 @@ define inreg <14 x i64> @bitcast_v56i16_to_v14i64_scalar(<56 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mov_b32_e32 v41, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: 
v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v41, v30 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mov_b32_e32 v56, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v42 +; SI-NEXT: v_mov_b32_e32 v42, v28 +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v57 ; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v61 -; SI-NEXT: v_mov_b32_e32 v61, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_mov_b32_e32 v46, v49 +; SI-NEXT: v_mov_b32_e32 v49, v43 +; SI-NEXT: v_mov_b32_e32 v43, v31 +; SI-NEXT: v_mov_b32_e32 v53, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v47 -; SI-NEXT: v_mov_b32_e32 v47, v58 -; SI-NEXT: v_mov_b32_e32 v58, v61 -; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v32 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v53 +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v43, v49 +; SI-NEXT: v_mov_b32_e32 v49, v46 ; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v60 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v43 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v57, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: 
v_mov_b32_e32 v28, v42 +; SI-NEXT: v_mov_b32_e32 v42, v45 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v56 ; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v47, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v56i16_to_v14i64_scalar: @@ -34376,194 +34487,198 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v15, s28 ; SI-NEXT: v_mov_b32_e32 v16, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v29, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v32, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v36, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v38, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v48, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v51, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v53, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 +; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 +; SI-NEXT: 
v_lshr_b64 v[32:33], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v24, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v44, v28, v27, 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 +; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[27:28], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_lshr_b64 v[29:30], v[13:14], 16 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 ; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_lshr_b64 v[30:31], v[11:12], 16 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_lshr_b64 v[31:32], v[9:10], 16 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_lshr_b64 v[32:33], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 ; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 -; SI-NEXT: v_alignbit_b32 v29, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v30, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v31, v10, v9, 16 -; SI-NEXT: 
v_alignbit_b32 v32, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v33, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v34, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v36, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v38, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v48, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v51, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v53, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v40, v26, v25, 16 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_lshr_b64 v[34:35], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v42, v24, v23, 16 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v16 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v18 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v44, v28, v27, 16 -; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v20 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v22 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v26 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v28 ; SI-NEXT: .LBB49_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v52 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v44 -; SI-NEXT: v_or_b32_e32 v27, v27, v44 +; SI-NEXT: v_or_b32_e32 v27, v27, v37 ; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v51 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v23, v23, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v58 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; 
SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v40 -; SI-NEXT: v_or_b32_e32 v23, v23, v24 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 +; SI-NEXT: v_and_b32_e32 v24, 0xffff, v25 +; SI-NEXT: v_or_b32_e32 v23, v24, v23 ; SI-NEXT: v_add_i32_e32 v24, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v46 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v57 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 ; SI-NEXT: v_or_b32_e32 v21, v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v56 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v47 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v36 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v46 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v35 ; SI-NEXT: v_or_b32_e32 v15, v15, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v15, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v15, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v45 ; SI-NEXT: v_or_b32_e32 v15, v15, v16 ; SI-NEXT: v_add_i32_e32 v16, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v15, v16, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v36 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v15 ; SI-NEXT: v_add_i32_e32 v15, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v15, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; 
SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -34575,7 +34690,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -34587,7 +34702,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -34599,7 +34714,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -34611,7 +34726,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -34623,7 +34738,7 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -34635,50 +34750,53 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr58 +; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr57 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr46 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 -; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr41 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr55 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr39 ; SI-NEXT: ; implicit-def: $vgpr30 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr35 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v14f64_to_v56i16_scalar: @@ -36579,48 +36697,52 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(6) +; SI-NEXT: v_mov_b32_e32 v57, v12 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_mov_b32_e32 v58, v10 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v60, v8 +; SI-NEXT: v_mov_b32_e32 v33, v6 +; SI-NEXT: v_mov_b32_e32 v35, v4 +; SI-NEXT: v_mov_b32_e32 v39, v2 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: 
v_mov_b32_e32 v61, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:36 -; SI-NEXT: v_mov_b32_e32 v32, v26 -; SI-NEXT: v_mov_b32_e32 v33, v24 -; SI-NEXT: v_mov_b32_e32 v34, v22 -; SI-NEXT: v_mov_b32_e32 v35, v20 -; SI-NEXT: v_mov_b32_e32 v36, v18 -; SI-NEXT: v_mov_b32_e32 v37, v16 -; SI-NEXT: v_mov_b32_e32 v38, v14 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v3 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:36 +; SI-NEXT: v_mov_b32_e32 v31, v26 +; SI-NEXT: v_mov_b32_e32 v41, v24 +; SI-NEXT: v_mov_b32_e32 v42, v22 +; SI-NEXT: v_mov_b32_e32 v43, v20 +; SI-NEXT: v_mov_b32_e32 v49, v18 +; SI-NEXT: v_mov_b32_e32 v44, v16 +; SI-NEXT: v_mov_b32_e32 v45, v14 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v16, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v11 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v13 -; SI-NEXT: v_lshlrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v21 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v23 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v29 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v13 +; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v17 +; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v19 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 +; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v25 +; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill @@ -36629,87 +36751,87 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v6 ; 
SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v8 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v10 -; SI-NEXT: s_waitcnt vmcnt(7) expcnt(6) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v52, 16, v10 +; SI-NEXT: s_waitcnt vmcnt(7) +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v12 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v7, v0, v18 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 ; SI-NEXT: v_or_b32_e32 v9, v0, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 ; SI-NEXT: v_or_b32_e32 v10, v0, v14 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v11, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: v_or_b32_e32 v12, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 -; SI-NEXT: v_or_b32_e32 v13, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 +; SI-NEXT: v_or_b32_e32 v13, v0, v63 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v14, v0, v44 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 -; SI-NEXT: v_or_b32_e32 v15, v0, v43 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v14, v0, v62 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v15, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v16, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 -; SI-NEXT: v_or_b32_e32 v17, v0, v59 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 -; SI-NEXT: v_or_b32_e32 v8, v1, v18 +; SI-NEXT: v_or_b32_e32 v16, v0, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 +; SI-NEXT: v_or_b32_e32 v17, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v18, v0, v58 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 -; SI-NEXT: v_or_b32_e32 v19, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v18, v0, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 +; SI-NEXT: v_or_b32_e32 v19, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v31 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v20, v0, v56 +; SI-NEXT: v_or_b32_e32 v20, v0, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v28 ; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_and_b32 s5, s18, 0xffff ; SI-NEXT: s_lshl_b32 s6, s19, 16 -; SI-NEXT: v_or_b32_e32 v21, v0, v47 +; SI-NEXT: v_or_b32_e32 v21, v0, v34 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 ; SI-NEXT: s_or_b32 s5, s5, s6 ; SI-NEXT: s_and_b32 s6, s20, 0xffff ; SI-NEXT: s_lshl_b32 s7, s21, 16 -; SI-NEXT: v_or_b32_e32 v22, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v22, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 -; SI-NEXT: v_or_b32_e32 v23, v0, v45 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v23, v0, v32 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: 
s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 -; SI-NEXT: v_or_b32_e32 v24, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v24, v0, v59 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_or_b32_e32 v25, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v25, v0, v53 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v46 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v26, v0, v29 +; SI-NEXT: v_or_b32_e32 v26, v0, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: s_or_b32 s10, s10, s11 -; SI-NEXT: v_or_b32_e32 v27, v0, v63 +; SI-NEXT: v_or_b32_e32 v27, v0, v29 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -36719,9 +36841,10 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -36764,96 +36887,96 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 
-; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v63, v0 ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v44, v0 +; SI-NEXT: v_or_b32_e32 v0, v62, v0 ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v43, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v48, v0 ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v37, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v31 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v55, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v28 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v46, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v45, v0 +; SI-NEXT: v_or_b32_e32 v0, v32, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v59, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v53, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v46 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v29, v0 +; SI-NEXT: v_or_b32_e32 v0, v52, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v29, v0 ; SI-NEXT: 
v_add_i32_e32 v27, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 @@ -36877,74 +37000,83 @@ define inreg <14 x double> @bitcast_v56i16_to_v14f64_scalar(<56 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v56 -; SI-NEXT: v_mov_b32_e32 v56, v45 -; SI-NEXT: v_mov_b32_e32 v45, v29 -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v62, v58 +; SI-NEXT: v_mov_b32_e32 v58, v51 +; SI-NEXT: v_mov_b32_e32 v51, v47 +; SI-NEXT: v_mov_b32_e32 v47, v44 ; SI-NEXT: v_mov_b32_e32 v44, v41 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_mov_b32_e32 v41, v39 -; SI-NEXT: v_mov_b32_e32 v39, v36 -; SI-NEXT: v_mov_b32_e32 v36, v33 -; SI-NEXT: v_mov_b32_e32 v33, v30 -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v43, v55 -; SI-NEXT: v_mov_b32_e32 v55, v54 -; SI-NEXT: v_mov_b32_e32 v54, v53 -; SI-NEXT: v_mov_b32_e32 v53, v52 -; SI-NEXT: v_mov_b32_e32 v52, v51 -; SI-NEXT: v_mov_b32_e32 v51, v50 -; SI-NEXT: v_mov_b32_e32 v50, v49 -; SI-NEXT: v_mov_b32_e32 v49, v42 -; SI-NEXT: v_mov_b32_e32 v42, v48 -; SI-NEXT: v_mov_b32_e32 v48, v37 -; SI-NEXT: v_mov_b32_e32 v37, v34 -; SI-NEXT: v_mov_b32_e32 v34, v28 -; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v60, v57 +; SI-NEXT: v_mov_b32_e32 v41, v30 +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v63, v59 +; SI-NEXT: v_mov_b32_e32 v59, v56 +; SI-NEXT: s_waitcnt vmcnt(9) +; SI-NEXT: v_mov_b32_e32 v56, v50 +; SI-NEXT: v_mov_b32_e32 v50, v45 +; SI-NEXT: v_mov_b32_e32 v45, v42 +; SI-NEXT: v_mov_b32_e32 v42, v28 +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v57 ; SI-NEXT: v_mov_b32_e32 v57, v46 -; SI-NEXT: v_mov_b32_e32 v46, v61 -; SI-NEXT: v_mov_b32_e32 v61, v58 -; SI-NEXT: v_mov_b32_e32 v58, v47 -; SI-NEXT: v_mov_b32_e32 v47, v31 +; SI-NEXT: v_mov_b32_e32 v46, v49 +; SI-NEXT: v_mov_b32_e32 v49, v43 +; SI-NEXT: v_mov_b32_e32 v43, v31 +; SI-NEXT: v_mov_b32_e32 v53, v40 +; SI-NEXT: v_mov_b32_e32 v40, v48 +; SI-NEXT: v_mov_b32_e32 v48, v39 +; SI-NEXT: v_mov_b32_e32 v39, v38 +; SI-NEXT: v_mov_b32_e32 v38, v37 +; SI-NEXT: v_mov_b32_e32 v37, v36 +; SI-NEXT: v_mov_b32_e32 v36, v35 +; SI-NEXT: v_mov_b32_e32 v35, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v34, v33 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v32 +; SI-NEXT: v_mov_b32_e32 v32, v60 +; SI-NEXT: v_mov_b32_e32 v60, v29 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v31, v47 -; SI-NEXT: v_mov_b32_e32 v47, v58 -; SI-NEXT: 
v_mov_b32_e32 v58, v61 -; SI-NEXT: v_mov_b32_e32 v61, v46 +; SI-NEXT: v_mov_b32_e32 v29, v60 +; SI-NEXT: v_mov_b32_e32 v60, v32 +; SI-NEXT: v_mov_b32_e32 v32, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v33, v34 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v35 +; SI-NEXT: v_mov_b32_e32 v35, v36 +; SI-NEXT: v_mov_b32_e32 v36, v37 +; SI-NEXT: v_mov_b32_e32 v37, v38 +; SI-NEXT: v_mov_b32_e32 v38, v39 +; SI-NEXT: v_mov_b32_e32 v39, v48 +; SI-NEXT: v_mov_b32_e32 v48, v40 +; SI-NEXT: v_mov_b32_e32 v40, v53 +; SI-NEXT: v_mov_b32_e32 v31, v43 +; SI-NEXT: v_mov_b32_e32 v43, v49 +; SI-NEXT: v_mov_b32_e32 v49, v46 ; SI-NEXT: v_mov_b32_e32 v46, v57 -; SI-NEXT: v_mov_b32_e32 v57, v60 -; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v28, v34 -; SI-NEXT: v_mov_b32_e32 v34, v37 -; SI-NEXT: v_mov_b32_e32 v37, v48 -; SI-NEXT: v_mov_b32_e32 v48, v42 -; SI-NEXT: v_mov_b32_e32 v42, v49 -; SI-NEXT: v_mov_b32_e32 v49, v50 -; SI-NEXT: v_mov_b32_e32 v50, v51 -; SI-NEXT: v_mov_b32_e32 v51, v52 -; SI-NEXT: v_mov_b32_e32 v52, v53 -; SI-NEXT: v_mov_b32_e32 v53, v54 -; SI-NEXT: v_mov_b32_e32 v54, v55 -; SI-NEXT: v_mov_b32_e32 v55, v43 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v33 -; SI-NEXT: v_mov_b32_e32 v33, v36 -; SI-NEXT: v_mov_b32_e32 v36, v39 -; SI-NEXT: v_mov_b32_e32 v39, v41 -; SI-NEXT: v_mov_b32_e32 v41, v44 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v29, v45 -; SI-NEXT: v_mov_b32_e32 v45, v56 +; SI-NEXT: v_mov_b32_e32 v57, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v28, v42 +; SI-NEXT: v_mov_b32_e32 v42, v45 +; SI-NEXT: v_mov_b32_e32 v45, v50 +; SI-NEXT: v_mov_b32_e32 v50, v56 ; SI-NEXT: v_mov_b32_e32 v56, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v59, v63 +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v30, v41 +; SI-NEXT: v_mov_b32_e32 v41, v44 +; SI-NEXT: v_mov_b32_e32 v44, v47 +; SI-NEXT: v_mov_b32_e32 v47, v51 +; SI-NEXT: v_mov_b32_e32 v51, v58 +; SI-NEXT: v_mov_b32_e32 v58, v62 +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v56i16_to_v14f64_scalar: @@ -37772,7 +37904,6 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr30 ; SI-NEXT: ; implicit-def: $vgpr63 @@ -37784,7 +37915,6 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr57 ; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr42 ; SI-NEXT: ; implicit-def: $vgpr62 ; SI-NEXT: ; kill: killed $vgpr29 ; SI-NEXT: ; implicit-def: $vgpr29 @@ -37794,23 +37924,25 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; 
implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v29 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v62, v29 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v60, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v21 @@ -37819,31 +37951,31 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v57, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v17 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 ; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v42, v28 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v27 -; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v43, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v26 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v29 @@ -37875,6 +38007,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v29 @@ -37882,26 +38015,25 @@ define <56 x half> 
@bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v54, v29 ; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v29 -; SI-NEXT: v_mov_b32_e32 v29, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v26 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 +; SI-NEXT: v_mov_b32_e32 v29, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 @@ -37936,34 +38068,24 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_add_f64 v[54:55], 
v[1:2], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v18 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 -; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 -; SI-NEXT: v_mov_b32_e32 v42, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v43 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 +; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 ; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v54 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v55 ; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v4 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v5 @@ -37977,14 +38099,16 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v14 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v24 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 @@ -38005,13 +38129,14 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v52, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v40, v55 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 @@ -38020,6 
+38145,9 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v60, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v3, v42 ; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v46, v46 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 @@ -38034,29 +38162,29 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v47, v26 -; SI-NEXT: v_mov_b32_e32 v45, v27 -; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v55, v1 -; SI-NEXT: v_mov_b32_e32 v43, v28 -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v47, v25 +; SI-NEXT: v_mov_b32_e32 v45, v26 +; SI-NEXT: v_mov_b32_e32 v43, v27 +; SI-NEXT: v_mov_b32_e32 v42, v28 +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:124 ; 4-byte Folded 
Spill +; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v1, v55 @@ -38107,14 +38235,16 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 ; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v30 ; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 @@ -38123,7 +38253,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 @@ -38132,7 +38262,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 @@ -38141,7 +38271,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 @@ -38150,7 +38280,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v56 ; SI-NEXT: v_add_i32_e32 v3, vcc, 
48, v0 @@ -38159,7 +38289,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v46 ; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 @@ -38168,7 +38298,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v44 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -38178,8 +38308,8 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -38189,8 +38319,8 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -38200,8 +38330,8 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -38211,8 +38341,8 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; 
SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -38221,7 +38351,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 @@ -38230,7 +38360,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 @@ -38239,7 +38369,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 @@ -38248,19 +38378,10 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v42 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_or_b32_e32 v1, v2, v1 -; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -38269,7 +38390,7 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 @@ -38278,20 +38399,27 @@ define <56 x half> @bitcast_v14f64_to_v56f16(<14 x double> %a, i32 %b) { ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v47 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt 
expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v42 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -38889,17 +39017,17 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s42, s5, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v41, s42 ; SI-NEXT: s_lshr_b32 s42, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s42 ; SI-NEXT: s_lshr_b32 s42, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s42 ; SI-NEXT: s_lshr_b32 s42, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v25, s42 ; SI-NEXT: s_lshr_b32 s42, s9, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s42 ; SI-NEXT: s_lshr_b32 s42, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v29, s42 ; SI-NEXT: s_lshr_b32 s42, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s42 ; SI-NEXT: s_lshr_b32 s42, s10, 16 @@ -38907,7 +39035,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: s_lshr_b32 s42, s13, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v48, s42 ; SI-NEXT: s_lshr_b32 s42, s12, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v33, s42 ; SI-NEXT: s_lshr_b32 s42, s15, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v20, s42 ; SI-NEXT: s_lshr_b32 s42, s14, 16 @@ -38944,18 +39072,18 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v46, s42 ; SI-NEXT: s_lshr_b32 s42, s16, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v56, s42 +; SI-NEXT: v_cvt_f32_f16_e32 v21, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v34, s4 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_cvt_f32_f16_e32 v57, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v33, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v34, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v57, s7 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s9 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_cvt_f32_f16_e32 v58, s6 +; SI-NEXT: v_cvt_f32_f16_e32 v58, s8 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v59, s11 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_cvt_f32_f16_e32 v60, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v16, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v7, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v60, s10 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s13 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s12 ; SI-NEXT: v_cvt_f32_f16_e32 v23, s15 @@ -38969,13 +39097,13 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v38, s25 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s24 ; SI-NEXT: v_cvt_f32_f16_e32 v50, s23 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s22 +; SI-NEXT: v_cvt_f32_f16_e32 v16, s22 ; SI-NEXT: v_cvt_f32_f16_e32 v54, s21 -; SI-NEXT: v_cvt_f32_f16_e32 
v40, s20 -; SI-NEXT: v_cvt_f32_f16_e32 v41, s19 +; SI-NEXT: v_cvt_f32_f16_e32 v7, s20 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s19 ; SI-NEXT: v_cvt_f32_f16_e32 v43, s18 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s17 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s17 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s16 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true ; SI-NEXT: v_add_f64 v[1:2], s[16:17], 1.0 @@ -38995,37 +39123,41 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v4 +; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v22, v5 ; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v41, v43 +; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 +; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v18 +; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v57, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v43 ; SI-NEXT: v_cvt_f32_f16_e32 v43, v42 ; SI-NEXT: v_add_f64 v[49:50], s[22:23], 1.0 ; SI-NEXT: v_add_f64 v[37:38], s[24:25], 1.0 ; SI-NEXT: v_add_f64 v[15:16], s[10:11], 1.0 -; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v49 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v37 ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v38 ; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v15 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v39, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v44 ; SI-NEXT: v_add_f64 v[53:54], s[20:21], 1.0 -; SI-NEXT: v_add_f64 v[35:36], s[26:27], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v7 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v54 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v46 +; SI-NEXT: v_add_f64 v[35:36], s[26:27], 1.0 ; SI-NEXT: v_add_f64 v[30:31], s[28:29], 1.0 ; SI-NEXT: v_add_f64 v[26:27], s[40:41], 1.0 -; SI-NEXT: v_add_f64 v[18:19], s[12:13], 1.0 ; SI-NEXT: v_add_f64 v[11:12], s[8:9], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v50 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v35 @@ -39035,20 +39167,16 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 ; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v18 ; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v19 ; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v60, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v12 +; 
SI-NEXT: v_cvt_f32_f16_e32 v58, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 @@ -39058,20 +39186,20 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v36, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 ; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v48 +; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 ; SI-NEXT: v_cvt_f32_f16_e32 v48, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v63 @@ -39094,9 +39222,9 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 ; SI-NEXT: .LBB53_3: ; %end ; SI-NEXT: v_cvt_f16_f32_e32 v5, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v9, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v2 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v9 @@ -39114,14 +39242,14 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v42 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v18 ; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 ; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 ; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 @@ -39135,7 +39263,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v16 ; SI-NEXT: v_add_i32_e32 v9, vcc, 24, v0 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; SI-NEXT: v_or_b32_e32 v5, v6, v5 @@ -39218,7 +39346,7 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v33 ; SI-NEXT: v_cvt_f16_f32_e32 v5, v8 ; SI-NEXT: v_add_i32_e32 v6, 
vcc, 0x48, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 @@ -39233,16 +39361,16 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 ; SI-NEXT: buffer_store_dword v4, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v4, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v58 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0x54, v0 ; SI-NEXT: v_or_b32_e32 v4, v5, v4 @@ -39254,35 +39382,35 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v40 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_or_b32_e32 v2, v3, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 ; SI-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v58 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v34 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v21 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -39306,19 +39434,19 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr56 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr18 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr55 ; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr53 -; 
SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr16 ; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr49 @@ -39343,25 +39471,25 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr23 ; SI-NEXT: ; implicit-def: $vgpr20 ; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr19 ; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr7 -; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr16 -; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr2 +; SI-NEXT: ; implicit-def: $vgpr13 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr10 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr21 -; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr17 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr25 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr29 +; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr34 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr21 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v14f64_to_v56f16_scalar: @@ -45958,30 +46086,30 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:100 ; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:104 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v56, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 ; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v13 ; SI-NEXT: v_cvt_f16_f32_e32 v45, v14 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v44, v18 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v15 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v20 
; SI-NEXT: v_cvt_f16_f32_e32 v11, v21 @@ -45992,9 +46120,9 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v21, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v18, v28 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v7, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cvt_f16_f32_e32 v22, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v57 @@ -46008,69 +46136,76 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v63, v35 ; SI-NEXT: v_cvt_f16_f32_e32 v61, v49 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v59, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v55 ; SI-NEXT: v_cvt_f16_f32_e32 v49, v31 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v34 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v50, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v35, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v37 ; SI-NEXT: s_waitcnt vmcnt(11) ; SI-NEXT: v_cvt_f16_f32_e32 v27, v38 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v55, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v53, v39 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v32, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v48 ; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v33, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v51 ; SI-NEXT: s_waitcnt vmcnt(7) ; SI-NEXT: v_cvt_f16_f32_e32 v26, v52 ; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v54 ; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v34, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v41 ; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v42 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_or_saveexec_b64 s[4:5], s[4:5] -; SI-NEXT: v_mov_b32_e32 v48, v7 -; SI-NEXT: v_mov_b32_e32 v51, v9 -; SI-NEXT: v_mov_b32_e32 v52, v11 -; SI-NEXT: v_mov_b32_e32 v54, v13 -; SI-NEXT: v_mov_b32_e32 v41, v12 +; SI-NEXT: v_mov_b32_e32 v51, v57 +; SI-NEXT: v_mov_b32_e32 v52, v7 +; SI-NEXT: v_mov_b32_e32 v54, v9 +; SI-NEXT: v_mov_b32_e32 v55, v11 +; SI-NEXT: v_mov_b32_e32 v41, v13 +; SI-NEXT: v_mov_b32_e32 v48, v5 ; SI-NEXT: s_xor_b64 exec, exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB58_2 ; SI-NEXT: ; %bb.1: ; %cmp.true -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v31, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v47 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v37, v56 +; SI-NEXT: v_mov_b32_e32 v7, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v39 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 
+; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v38 ; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v33 +; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v21 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 ; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 ; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v11 ; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 ; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 @@ -46085,6 +46220,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 ; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 @@ -46092,7 +46228,6 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 ; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 @@ -46103,108 +46238,100 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 ; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 ; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v37, v7 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v39, v7 -; SI-NEXT: v_or_b32_e32 v7, v37, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v46 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v31 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v39 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; 
SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_or_b32_e32 v7, v38, v46 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f32_f16_e32 v38, v45 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v54 +; SI-NEXT: v_or_b32_e32 v5, v38, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v46 +; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v37 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v37, v9 +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v48, v39, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v38 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 +; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v39 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v41 ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v31, v7 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_or_b32_e32 v7, v31, v45 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v44 -; SI-NEXT: v_or_b32_e32 v41, v37, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v43 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v9 +; SI-NEXT: v_or_b32_e32 v9, v37, v45 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v7, v55 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v52 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v51 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_or_b32_e32 v9, v38, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_or_b32_e32 v54, v39, v43 -; SI-NEXT: v_or_b32_e32 v52, v38, v42 -; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v25 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v37 +; SI-NEXT: v_or_b32_e32 v41, v39, v43 +; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v38 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v54 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: 
v_cvt_f32_f16_e32 v9, v63 +; SI-NEXT: v_or_b32_e32 v55, v37, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v52 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v38 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v48 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v51, v25, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 +; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 ; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 +; SI-NEXT: v_or_b32_e32 v54, v25, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v40 ; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v51 +; SI-NEXT: v_or_b32_e32 v52, v37, v40 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v48, v37, v40 ; SI-NEXT: v_cvt_f16_f32_e32 v37, v30 ; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v25 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v62 -; SI-NEXT: v_or_b32_e32 v57, v21, v30 +; SI-NEXT: v_or_b32_e32 v51, v21, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v29 -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v37 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v61 -; SI-NEXT: v_or_b32_e32 v62, v25, v37 +; SI-NEXT: v_or_b32_e32 v62, v25, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v28 ; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v49 @@ -46217,56 +46344,61 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v61, v29, v28 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v27 ; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v60 ; SI-NEXT: v_or_b32_e32 v49, v21, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v33 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v7 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v29 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 ; SI-NEXT: v_cvt_f16_f32_e32 v29, v26 -; SI-NEXT: v_or_b32_e32 v35, v25, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v34 +; SI-NEXT: v_or_b32_e32 v37, v25, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v32 ; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v21 -; SI-NEXT: v_or_b32_e32 v33, v29, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v21, v34 +; SI-NEXT: v_or_b32_e32 v39, v29, v26 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v25 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v53 -; SI-NEXT: v_or_b32_e32 v36, v21, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v32 -; SI-NEXT: v_or_b32_e32 v55, v7, v21 -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: 
v_cvt_f32_f16_e32 v25, v50 -; SI-NEXT: v_alignbit_b32 v26, v36, v26, 16 +; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v25 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v32 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 +; SI-NEXT: v_or_b32_e32 v34, v21, v25 +; SI-NEXT: v_cvt_f32_f16_e32 v25, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v50 +; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v25 ; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v50 -; SI-NEXT: v_or_b32_e32 v53, v25, v21 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v58 -; SI-NEXT: v_or_b32_e32 v59, v13, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v9 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v31 +; SI-NEXT: v_or_b32_e32 v53, v7, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v36 +; SI-NEXT: v_or_b32_e32 v50, v25, v21 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; SI-NEXT: v_or_b32_e32 v35, v13, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v63 ; SI-NEXT: v_or_b32_e32 v16, v16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v14 ; SI-NEXT: v_or_b32_e32 v15, v15, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v21, v22 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v20 -; SI-NEXT: v_alignbit_b32 v29, v59, v28, 16 -; SI-NEXT: v_alignbit_b32 v28, v53, v27, 16 +; SI-NEXT: v_alignbit_b32 v29, v35, v28, 16 ; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 ; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_alignbit_b32 v27, v55, v60, 16 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_alignbit_b32 v28, v50, v27, 16 ; SI-NEXT: v_or_b32_e32 v22, v21, v22 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v18 ; SI-NEXT: v_or_b32_e32 v24, v24, v21 @@ -46274,16 +46406,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v17 ; SI-NEXT: v_or_b32_e32 v10, v10, v21 -; SI-NEXT: v_alignbit_b32 v44, v10, v43, 16 -; SI-NEXT: v_alignbit_b32 v43, v19, v42, 16 -; SI-NEXT: v_alignbit_b32 v25, v22, v40, 16 -; SI-NEXT: v_alignbit_b32 v40, v15, v30, 16 -; SI-NEXT: v_alignbit_b32 v30, v16, v37, 16 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v12 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v7 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v12 ; SI-NEXT: v_or_b32_e32 v1, v1, v21 ; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v8 ; SI-NEXT: v_or_b32_e32 v5, v5, v21 @@ -46294,14 +46417,22 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_alignbit_b32 v56, v3, v47, 16 ; SI-NEXT: v_alignbit_b32 v47, v6, v46, 16 ; SI-NEXT: v_alignbit_b32 v46, v5, v45, 16 -; SI-NEXT: v_alignbit_b32 v45, v1, v31, 16 -; SI-NEXT: v_alignbit_b32 v21, v24, v38, 16 -; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: v_alignbit_b32 v45, v1, v57, 16 +; SI-NEXT: v_alignbit_b32 v44, v10, v43, 16 +; SI-NEXT: v_alignbit_b32 v43, v19, v42, 16 +; SI-NEXT: v_alignbit_b32 v21, v24, v58, 16 +; SI-NEXT: v_alignbit_b32 v25, v22, v40, 16 +; SI-NEXT: v_alignbit_b32 v40, v15, v30, 16 +; SI-NEXT: v_alignbit_b32 v30, v16, v59, 16 +; SI-NEXT: v_alignbit_b32 v27, v53, v60, 16 +; SI-NEXT: v_mov_b32_e32 v60, v37 +; SI-NEXT: v_alignbit_b32 v26, v34, v26, 16 +; SI-NEXT: 
buffer_store_dword v5, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: .LBB58_2: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v37, 16, v56 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v56 ; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 @@ -46309,15 +46440,13 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v31, 0xffff, v7 -; SI-NEXT: v_or_b32_e32 v31, v31, v37 -; SI-NEXT: buffer_store_dword v31, v0, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v37, 0xffff, v5 +; SI-NEXT: v_or_b32_e32 v37, v37, v38 +; SI-NEXT: buffer_store_dword v37, v0, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v48 ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v47 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-NEXT: v_or_b32_e32 v3, v3, v4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen @@ -46335,26 +46464,28 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v5 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v2, 0xffff, v41 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v12 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v41 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v0 @@ -46366,7 +46497,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 40, v0 @@ -46378,7 +46509,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x 
half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v21 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 48, v0 @@ -46390,7 +46521,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v52 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v25 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 56, v0 @@ -46402,7 +46533,7 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v51 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 @@ -46432,8 +46563,8 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x50, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v59 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v58 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -46444,32 +46575,32 @@ define <56 x i16> @bitcast_v56f16_to_v56i16(<56 x half> %a, i32 %b) { ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x58, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v36 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v60 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v55 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v39 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v34 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -47071,501 +47202,582 @@ define inreg <56 x i16> 
@bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:40 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 -; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v25, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v24, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s20 -; 
SI-NEXT: v_cvt_f16_f32_e32 v27, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v21, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v23, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v20, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v19, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v22, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v26 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v56, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v3, s16 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v61, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v58, s23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v12 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v17 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v21 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v25 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v8, v29 +; SI-NEXT: v_cvt_f16_f32_e32 v47, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v45, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v20, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v43, s25 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s26 +; SI-NEXT: v_cvt_f16_f32_e32 v41, s29 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v46, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v44, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v34, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v36, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v35, v28 +; SI-NEXT: s_waitcnt vmcnt(13) +; SI-NEXT: v_cvt_f16_f32_e32 v33, v38 ; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v49 ; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v53, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v62, v51 ; SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v53 ; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v5, v47 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v8, v56 -; SI-NEXT: s_waitcnt vmcnt(7) -; 
SI-NEXT: v_cvt_f16_f32_e32 v30, v57 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v58 -; SI-NEXT: s_waitcnt vmcnt(5) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v57, v60 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v29, v61 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v63 -; SI-NEXT: v_cvt_f16_f32_e32 v56, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v47, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v46, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v45, s29 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v55 +; SI-NEXT: s_waitcnt vmcnt(8) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v24, s18 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB59_3 ; SI-NEXT: .LBB59_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v33, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v44 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v47 -; SI-NEXT: v_add_f32_e32 v45, 0x38000000, v45 -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v44 -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v33 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v33 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v43 -; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v44 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_add_f32_e32 v44, 0x38000000, v56 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v40 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v41 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v44, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v53 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v54 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_mov_b32_e32 v28, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v45 +; SI-NEXT: v_mov_b32_e32 v51, v23 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v43 +; SI-NEXT: v_mov_b32_e32 v49, v19 +; SI-NEXT: v_mov_b32_e32 v53, v36 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v36, 
v7 +; SI-NEXT: v_cvt_f32_f16_e32 v7, v41 +; SI-NEXT: v_mov_b32_e32 v38, v15 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v54 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_or_b32_e32 v5, v5, v23 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v50 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v48 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v37 +; SI-NEXT: v_mov_b32_e32 v37, v11 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v35 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v62 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 +; SI-NEXT: v_mov_b32_e32 v33, v31 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v53 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: 
v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v29 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: v_or_b32_e32 v57, v54, v29 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 +; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 +; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 ; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v49 -; SI-NEXT: v_or_b32_e32 v4, v4, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 ; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v39 -; SI-NEXT: v_or_b32_e32 v7, v7, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_or_b32_e32 v6, v6, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 +; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 +; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_or_b32_e32 v13, v13, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_add_f32_e32 v49, 0x38000000, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_or_b32_e32 v16, v16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v49 -; SI-NEXT: v_add_f32_e32 v51, 0x38000000, v51 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; 
SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v15, v15, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v52, 0x38000000, v52 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_or_b32_e32 v34, v34, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v52 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 +; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v28, v19 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v31 +; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_or_b32_e32 v37, v37, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v35 +; SI-NEXT: v_or_b32_e32 v28, v28, v15 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 ; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_or_b32_e32 v36, v36, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 ; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 +; SI-NEXT: v_or_b32_e32 v39, v5, v29 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v5, v31, v25 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v38 ; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v48, 0x38000000, v48 -; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_or_b32_e32 v51, v51, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 ; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v48 -; SI-NEXT: 
v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_or_b32_e32 v52, v52, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_or_b32_e32 v20, v20, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v21 -; SI-NEXT: v_or_b32_e32 v27, v27, v54 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v56 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v44 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v41 -; SI-NEXT: v_or_b32_e32 v25, v25, v54 -; SI-NEXT: v_or_b32_e32 v28, v28, v47 -; SI-NEXT: v_or_b32_e32 v26, v26, v46 -; SI-NEXT: v_or_b32_e32 v23, v23, v45 -; SI-NEXT: v_or_b32_e32 v22, v22, v33 -; SI-NEXT: v_or_b32_e32 v50, v50, v43 -; SI-NEXT: v_or_b32_e32 v48, v48, v42 -; SI-NEXT: v_or_b32_e32 v38, v38, v58 -; SI-NEXT: v_or_b32_e32 v3, v3, v40 -; SI-NEXT: v_or_b32_e32 v18, v18, v55 -; SI-NEXT: v_or_b32_e32 v17, v17, v59 -; SI-NEXT: v_or_b32_e32 v12, v12, v53 -; SI-NEXT: v_or_b32_e32 v10, v10, v30 -; SI-NEXT: v_or_b32_e32 v8, v8, v60 -; SI-NEXT: v_alignbit_b32 v56, v25, v47, 16 -; SI-NEXT: v_alignbit_b32 v47, v27, v46, 16 -; SI-NEXT: v_alignbit_b32 v46, v20, v45, 16 -; SI-NEXT: v_alignbit_b32 v45, v52, v33, 16 -; SI-NEXT: v_alignbit_b32 v44, v51, v43, 16 -; SI-NEXT: v_alignbit_b32 v43, v36, v42, 16 -; SI-NEXT: v_alignbit_b32 v42, v37, v58, 16 -; SI-NEXT: v_alignbit_b32 v41, v34, v40, 16 -; SI-NEXT: v_alignbit_b32 v40, v15, v55, 16 -; SI-NEXT: v_alignbit_b32 v55, v16, v59, 16 -; SI-NEXT: v_alignbit_b32 v54, v13, v53, 16 -; SI-NEXT: v_alignbit_b32 v53, v6, v30, 16 -; SI-NEXT: v_alignbit_b32 v30, v7, v60, 16 -; SI-NEXT: v_alignbit_b32 v29, v4, v29, 16 -; SI-NEXT: .LBB59_3: ; %end -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v56 -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_or_b32_e32 v28, v28, v33 -; SI-NEXT: v_or_b32_e32 v24, v25, v24 -; SI-NEXT: v_add_i32_e32 v25, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v28, v0, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v47 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: v_add_i32_e32 v25, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v27 -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; SI-NEXT: v_or_b32_e32 v21, v24, v21 -; SI-NEXT: v_add_i32_e32 v24, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v21, v24, s[0:3], 0 offen +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v28, v28, v21 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v21, 0xffff, v23 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v46 -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_or_b32_e32 v21, v21, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v19, v20, v19 -; SI-NEXT: v_add_i32_e32 v20, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v28, v37 +; SI-NEXT: v_or_b32_e32 v38, v31, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v31, 
v56 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 +; SI-NEXT: v_or_b32_e32 v37, v28, v11 +; SI-NEXT: v_or_b32_e32 v62, v31, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v33 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v17 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v45 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_or_b32_e32 v5, v5, v9 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v52 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v39 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_cvt_f32_f16_e32 v5, v46 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v5 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_or_b32_e32 v35, v28, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v34 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v34, v1, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v31 +; SI-NEXT: v_or_b32_e32 v56, v28, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v42 +; SI-NEXT: v_cvt_f16_f32_e32 v32, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v33, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v32 +; SI-NEXT: v_or_b32_e32 v4, v4, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v60 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v33 +; SI-NEXT: v_or_b32_e32 v6, v6, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v57 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v36 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v60 +; SI-NEXT: v_or_b32_e32 v8, v8, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v57 +; SI-NEXT: v_or_b32_e32 v10, v10, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v63 +; SI-NEXT: v_or_b32_e32 v12, v12, v28 +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, 
v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v36 +; SI-NEXT: v_or_b32_e32 v14, v14, v27 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v28 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v50 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v44 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: v_or_b32_e32 v18, v18, v27 +; SI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshr_b64 v[52:53], v[17:18], 16 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v27 +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v49 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; SI-NEXT: v_or_b32_e32 v22, v22, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v59 +; SI-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_lshr_b64 v[54:55], v[21:22], 16 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v28 +; SI-NEXT: v_cvt_f32_f16_e32 v28, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v30, v51 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v36 +; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 +; SI-NEXT: v_or_b32_e32 v26, v26, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v59 +; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v43 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v36, v30 +; SI-NEXT: v_or_b32_e32 v30, v28, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v58 +; SI-NEXT: v_lshr_b64 v[41:42], v[29:30], 16 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v36 +; SI-NEXT: v_or_b32_e32 v16, v16, v28 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v58, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v27, v61 +; SI-NEXT: v_lshr_b64 v[43:44], v[15:16], 16 +; SI-NEXT: v_mov_b32_e32 v44, v34 +; SI-NEXT: v_mov_b32_e32 v42, v33 +; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v27 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v58 +; SI-NEXT: v_or_b32_e32 v20, v20, v27 +; SI-NEXT: v_lshr_b64 v[45:46], v[19:20], 16 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v61 +; SI-NEXT: v_or_b32_e32 v24, v24, v27 +; SI-NEXT: v_lshr_b64 v[33:34], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[47:48], v[23:24], 16 +; SI-NEXT: v_mov_b32_e32 v23, v36 +; SI-NEXT: v_mov_b32_e32 v46, v35 +; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 +; SI-NEXT: v_mov_b32_e32 v7, v63 +; SI-NEXT: v_mov_b32_e32 v34, v56 +; SI-NEXT: v_mov_b32_e32 v56, v62 +; SI-NEXT: v_lshr_b64 v[62:63], v[3:4], 16 +; SI-NEXT: 
buffer_load_dword v36, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: v_mov_b32_e32 v19, v39 +; SI-NEXT: v_mov_b32_e32 v15, v38 +; SI-NEXT: v_lshr_b64 v[39:40], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[11:12], 16 +; SI-NEXT: v_mov_b32_e32 v11, v37 +; SI-NEXT: v_lshr_b64 v[37:38], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[27:28], v[1:2], 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v47 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v36 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v35 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v38 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v42 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v45 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v19, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 -; SI-NEXT: v_or_b32_e32 v19, v19, v20 -; SI-NEXT: v_add_i32_e32 v20, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v58 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v43 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v36 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v41 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v39 +; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 
v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v41 -; SI-NEXT: v_or_b32_e32 v3, v3, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v3, v19, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v32 -; SI-NEXT: v_or_b32_e32 v3, v3, v19 -; SI-NEXT: v_add_i32_e32 v19, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v3, v19, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v54 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v40 -; SI-NEXT: v_or_b32_e32 v3, v3, v18 -; SI-NEXT: v_add_i32_e32 v18, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v3, v18, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v3, v3, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v17 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v55 -; SI-NEXT: v_or_b32_e32 v3, v3, v14 -; SI-NEXT: v_add_i32_e32 v14, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v3, v14, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v3, v3, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v54 -; SI-NEXT: v_or_b32_e32 v3, v3, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt 
expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v11 -; SI-NEXT: v_or_b32_e32 v3, v3, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v53 -; SI-NEXT: v_or_b32_e32 v3, v3, v9 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v30 -; SI-NEXT: v_or_b32_e32 v3, v3, v5 -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v48 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v7 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v63 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v57 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v56 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v35 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v46 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v33 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v42 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 
0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v62 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v32 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v29 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v34 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v27 ; SI-NEXT: v_or_b32_e32 v1, v1, v3 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll index 4f46875076809..967f1a9b442b0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll @@ -3847,361 +3847,396 @@ define inreg <60 x i16> @bitcast_v30i32_to_v60i16_scalar(<30 x i32> inreg %a, i3 ; SI-LABEL: bitcast_v30i32_to_v60i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v18, s30, 0 +; SI-NEXT: v_writelane_b32 v18, s31, 1 +; SI-NEXT: v_writelane_b32 v18, s34, 2 +; SI-NEXT: v_writelane_b32 v18, s35, 3 +; SI-NEXT: v_writelane_b32 v18, s36, 4 +; SI-NEXT: v_writelane_b32 v18, s37, 5 +; SI-NEXT: v_writelane_b32 v18, s38, 6 +; SI-NEXT: v_writelane_b32 v18, s39, 7 +; SI-NEXT: v_writelane_b32 v18, s48, 8 +; SI-NEXT: v_writelane_b32 v18, s49, 9 +; SI-NEXT: v_writelane_b32 v18, s50, 10 +; SI-NEXT: v_writelane_b32 v18, s51, 11 +; SI-NEXT: v_writelane_b32 v18, s52, 12 +; SI-NEXT: v_writelane_b32 v18, s53, 13 +; SI-NEXT: v_writelane_b32 v18, s54, 14 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_readfirstlane_b32 s45, v1 -; SI-NEXT: v_readfirstlane_b32 s44, v2 -; SI-NEXT: v_readfirstlane_b32 s43, v3 -; SI-NEXT: v_readfirstlane_b32 s42, v4 -; SI-NEXT: v_readfirstlane_b32 s41, v5 -; SI-NEXT: v_readfirstlane_b32 s40, v6 -; SI-NEXT: v_readfirstlane_b32 s15, v7 -; SI-NEXT: v_readfirstlane_b32 s14, v8 -; SI-NEXT: v_readfirstlane_b32 s13, v9 -; SI-NEXT: v_readfirstlane_b32 s12, v10 -; SI-NEXT: v_readfirstlane_b32 s11, v11 -; SI-NEXT: v_readfirstlane_b32 s10, v12 -; SI-NEXT: v_readfirstlane_b32 s9, v13 -; SI-NEXT: v_readfirstlane_b32 s8, v14 -; SI-NEXT: v_readfirstlane_b32 s7, v15 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: v_writelane_b32 v18, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s42, v1 +; SI-NEXT: v_readfirstlane_b32 s43, v2 +; SI-NEXT: v_readfirstlane_b32 s40, v3 +; SI-NEXT: v_readfirstlane_b32 s41, v4 +; SI-NEXT: v_readfirstlane_b32 s14, v5 +; SI-NEXT: v_readfirstlane_b32 s15, v6 +; SI-NEXT: v_readfirstlane_b32 s12, v7 +; SI-NEXT: 
v_readfirstlane_b32 s13, v8 +; SI-NEXT: v_readfirstlane_b32 s10, v9 +; SI-NEXT: v_readfirstlane_b32 s11, v10 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: v_readfirstlane_b32 s9, v12 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: v_readfirstlane_b32 s7, v14 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: s_and_b64 s[44:45], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: v_writelane_b32 v18, s64, 16 ; SI-NEXT: s_cbranch_scc0 .LBB13_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s43 -; SI-NEXT: v_mov_b32_e32 v8, s45 -; SI-NEXT: v_mov_b32_e32 v9, s28 -; SI-NEXT: v_mov_b32_e32 v10, s26 -; SI-NEXT: v_mov_b32_e32 v11, s24 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s20 -; SI-NEXT: v_mov_b32_e32 v14, s18 -; SI-NEXT: v_mov_b32_e32 v15, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s29, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s27, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s25, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s23, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s21, v13, 16 -; SI-NEXT: v_alignbit_b32 v14, s19, v14, 16 -; SI-NEXT: v_alignbit_b32 v15, s17, v15, 16 -; SI-NEXT: s_lshr_b32 s46, s6, 16 -; SI-NEXT: s_lshr_b32 s47, s8, 16 -; SI-NEXT: s_lshr_b32 s56, s10, 16 -; SI-NEXT: s_lshr_b32 s57, s12, 16 -; SI-NEXT: s_lshr_b32 s58, s14, 16 -; SI-NEXT: s_lshr_b32 s59, s40, 16 -; SI-NEXT: s_lshr_b32 s60, s42, 16 -; SI-NEXT: s_lshr_b32 s61, s44, 16 -; SI-NEXT: s_lshr_b32 s62, s29, 16 -; SI-NEXT: s_lshr_b32 s63, s27, 16 -; SI-NEXT: s_lshr_b32 s72, s25, 16 -; SI-NEXT: s_lshr_b32 s73, s23, 16 -; SI-NEXT: s_lshr_b32 s74, s21, 16 -; SI-NEXT: s_lshr_b32 s75, s19, 16 -; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: s_lshr_b32 s34, s5, 16 +; SI-NEXT: s_lshr_b32 s35, s7, 16 +; SI-NEXT: s_lshr_b32 s36, s9, 16 +; SI-NEXT: s_lshr_b32 s37, s11, 16 +; SI-NEXT: s_lshr_b32 s38, s13, 16 +; SI-NEXT: s_lshr_b32 s39, s15, 16 +; SI-NEXT: s_lshr_b32 s48, s41, 16 +; SI-NEXT: s_lshr_b32 s49, s43, 16 +; SI-NEXT: s_lshr_b32 s50, s29, 16 +; SI-NEXT: s_lshr_b32 s51, s27, 16 +; SI-NEXT: s_lshr_b32 s52, s25, 16 +; SI-NEXT: s_lshr_b32 s53, s23, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s19, 16 +; SI-NEXT: s_lshr_b32 s64, s17, 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB13_3 ; SI-NEXT: .LBB13_2: ; %cmp.true +; SI-NEXT: s_add_i32 s17, s17, 3 ; SI-NEXT: s_add_i32 
s16, s16, 3 +; SI-NEXT: s_add_i32 s19, s19, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s21, s21, 3 ; SI-NEXT: s_add_i32 s20, s20, 3 +; SI-NEXT: s_add_i32 s23, s23, 3 ; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_add_i32 s25, s25, 3 ; SI-NEXT: s_add_i32 s24, s24, 3 +; SI-NEXT: s_add_i32 s27, s27, 3 ; SI-NEXT: s_add_i32 s26, s26, 3 +; SI-NEXT: s_add_i32 s29, s29, 3 ; SI-NEXT: s_add_i32 s28, s28, 3 -; SI-NEXT: s_add_i32 s45, s45, 3 ; SI-NEXT: s_add_i32 s43, s43, 3 -; SI-NEXT: s_add_i32 s41, s41, 3 -; SI-NEXT: s_add_i32 s15, s15, 3 -; SI-NEXT: s_add_i32 s13, s13, 3 -; SI-NEXT: s_add_i32 s11, s11, 3 -; SI-NEXT: s_add_i32 s9, s9, 3 -; SI-NEXT: s_add_i32 s7, s7, 3 -; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: s_add_i32 s19, s19, 3 -; SI-NEXT: s_add_i32 s21, s21, 3 -; SI-NEXT: s_add_i32 s23, s23, 3 -; SI-NEXT: s_add_i32 s25, s25, 3 -; SI-NEXT: s_add_i32 s27, s27, 3 -; SI-NEXT: s_add_i32 s29, s29, 3 -; SI-NEXT: s_add_i32 s44, s44, 3 ; SI-NEXT: s_add_i32 s42, s42, 3 +; SI-NEXT: s_add_i32 s41, s41, 3 ; SI-NEXT: s_add_i32 s40, s40, 3 +; SI-NEXT: s_add_i32 s15, s15, 3 ; SI-NEXT: s_add_i32 s14, s14, 3 +; SI-NEXT: s_add_i32 s13, s13, 3 ; SI-NEXT: s_add_i32 s12, s12, 3 +; SI-NEXT: s_add_i32 s11, s11, 3 ; SI-NEXT: s_add_i32 s10, s10, 3 +; SI-NEXT: s_add_i32 s9, s9, 3 ; SI-NEXT: s_add_i32 s8, s8, 3 +; SI-NEXT: s_add_i32 s7, s7, 3 ; SI-NEXT: s_add_i32 s6, s6, 3 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s43 -; SI-NEXT: v_mov_b32_e32 v8, s45 -; SI-NEXT: v_mov_b32_e32 v9, s28 -; SI-NEXT: v_mov_b32_e32 v10, s26 -; SI-NEXT: v_mov_b32_e32 v11, s24 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s20 -; SI-NEXT: v_mov_b32_e32 v14, s18 -; SI-NEXT: v_mov_b32_e32 v15, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s29, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s27, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s25, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s23, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s21, v13, 16 -; SI-NEXT: v_alignbit_b32 v14, s19, v14, 16 -; SI-NEXT: v_alignbit_b32 v15, s17, v15, 16 -; SI-NEXT: s_lshr_b32 s46, s6, 16 -; SI-NEXT: s_lshr_b32 s47, s8, 16 -; SI-NEXT: s_lshr_b32 s56, s10, 16 -; SI-NEXT: s_lshr_b32 s57, s12, 16 -; SI-NEXT: s_lshr_b32 s58, s14, 16 -; SI-NEXT: s_lshr_b32 s59, s40, 16 -; SI-NEXT: s_lshr_b32 s60, s42, 16 -; SI-NEXT: s_lshr_b32 s61, s44, 16 -; SI-NEXT: s_lshr_b32 s62, s29, 16 -; SI-NEXT: s_lshr_b32 s63, s27, 16 -; SI-NEXT: s_lshr_b32 s72, s25, 16 -; SI-NEXT: s_lshr_b32 s73, s23, 16 -; SI-NEXT: s_lshr_b32 s74, s21, 16 -; SI-NEXT: s_lshr_b32 s75, s19, 16 -; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: s_add_i32 s5, s5, 3 +; SI-NEXT: s_add_i32 s4, s4, 3 +; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[42:43], 16 +; SI-NEXT: s_lshr_b32 s34, s5, 16 +; 
SI-NEXT: s_lshr_b32 s35, s7, 16 +; SI-NEXT: s_lshr_b32 s36, s9, 16 +; SI-NEXT: s_lshr_b32 s37, s11, 16 +; SI-NEXT: s_lshr_b32 s38, s13, 16 +; SI-NEXT: s_lshr_b32 s39, s15, 16 +; SI-NEXT: s_lshr_b32 s48, s41, 16 +; SI-NEXT: s_lshr_b32 s49, s43, 16 +; SI-NEXT: s_lshr_b32 s50, s29, 16 +; SI-NEXT: s_lshr_b32 s51, s27, 16 +; SI-NEXT: s_lshr_b32 s52, s25, 16 +; SI-NEXT: s_lshr_b32 s53, s23, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s19, 16 +; SI-NEXT: s_lshr_b32 s64, s17, 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16 ; SI-NEXT: .LBB13_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, s4, v15 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s76, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v14, s4, v14 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s75, 16 -; SI-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_lshl_b32 s45, s30, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s45 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s64, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s16, s94, 16 +; SI-NEXT: s_and_b32 s17, s18, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: s_and_b32 s16, s19, 0xffff +; SI-NEXT: s_lshl_b32 s17, s55, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: s_lshl_b32 s16, s92, 16 +; SI-NEXT: s_and_b32 s17, s20, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: s_and_b32 s16, s21, 0xffff +; SI-NEXT: s_lshl_b32 s17, s54, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v13, s4, v13 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; SI-NEXT: s_lshl_b32 s5, s74, 16 -; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_lshl_b32 s16, s90, 16 +; SI-NEXT: s_and_b32 s17, s22, 0xffff +; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; 
SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s23, 0xffff +; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s73, 16 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s24, 0xffff +; SI-NEXT: s_lshl_b32 s17, s88, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 -; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s25, 0xffff +; SI-NEXT: s_lshl_b32 s17, s52, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s63, 16 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s17, s78, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v10, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s51, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s44, 0xffff -; SI-NEXT: s_lshl_b32 s5, s61, 16 -; 
SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s28, 0xffff +; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s42, 0xffff -; SI-NEXT: s_lshl_b32 s5, s60, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v8, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s29, 0xffff +; SI-NEXT: s_lshl_b32 s17, s50, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s40, 0xffff -; SI-NEXT: s_lshl_b32 s5, s59, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s42, 0xffff +; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s14, 0xffff -; SI-NEXT: s_lshl_b32 s5, s58, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s43, 0xffff +; SI-NEXT: s_lshl_b32 s17, s49, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s12, 0xffff -; SI-NEXT: s_lshl_b32 s5, s57, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xffff +; SI-NEXT: s_lshl_b32 s17, s72, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: 
s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s56, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xffff +; SI-NEXT: s_lshl_b32 s17, s48, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s47, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s62, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s46, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff +; SI-NEXT: s_lshl_b32 s15, s39, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s60, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_lshl_b32 s13, s38, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s58, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff +; SI-NEXT: s_lshl_b32 s11, s37, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s56, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s36, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; 
SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s46, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s35, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s44, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x70, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s64, v18, 16 +; SI-NEXT: v_readlane_b32 s55, v18, 15 +; SI-NEXT: v_readlane_b32 s54, v18, 14 +; SI-NEXT: v_readlane_b32 s53, v18, 13 +; SI-NEXT: v_readlane_b32 s52, v18, 12 +; SI-NEXT: v_readlane_b32 s51, v18, 11 +; SI-NEXT: v_readlane_b32 s50, v18, 10 +; SI-NEXT: v_readlane_b32 s49, v18, 9 +; SI-NEXT: v_readlane_b32 s48, v18, 8 +; SI-NEXT: v_readlane_b32 s39, v18, 7 +; SI-NEXT: v_readlane_b32 s38, v18, 6 +; SI-NEXT: v_readlane_b32 s37, v18, 5 +; SI-NEXT: v_readlane_b32 s36, v18, 4 +; SI-NEXT: v_readlane_b32 s35, v18, 3 +; SI-NEXT: v_readlane_b32 s34, v18, 2 +; SI-NEXT: v_readlane_b32 s31, v18, 1 +; SI-NEXT: v_readlane_b32 s30, v18, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB13_4: -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: 
$vgpr4 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: s_branch .LBB13_2 ; ; VI-LABEL: bitcast_v30i32_to_v60i16_scalar: @@ -6300,41 +6335,44 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v60, v16 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v44, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v33, v26 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v48, v24 +; SI-NEXT: v_mov_b32_e32 v49, v22 +; SI-NEXT: v_mov_b32_e32 v47, v20 +; SI-NEXT: v_mov_b32_e32 v50, v18 +; SI-NEXT: v_lshlrev_b32_e32 
v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 @@ -6345,8 +6383,8 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -6358,51 +6396,51 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB15_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v9, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v10, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 
v0, 0xffff, v47 ; SI-NEXT: v_or_b32_e32 v17, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v18, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: v_or_b32_e32 v19, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_or_b32_e32 v20, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: v_or_b32_e32 v21, v0, v43 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -6415,17 +6453,17 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v26 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -6436,30 +6474,30 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: v_or_b32_e32 v23, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v22, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v23, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v25, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v24, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v25, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v27, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v26, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v27, v0, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_or_b32_e32 v28, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v29, v0, v33 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_cbranch_execnz .LBB15_3 ; SI-NEXT: .LBB15_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: 
s_and_b32 s4, s16, 0xffff @@ -6502,119 +6540,119 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; 
SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: .LBB15_3: ; %end @@ -6637,35 +6675,67 @@ define inreg <30 x i32> @bitcast_v60i16_to_v30i32_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB15_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; 
SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v58 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v57, v56 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v45, v43 +; SI-NEXT: v_mov_b32_e32 v44, v42 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v42, v40 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: v_mov_b32_e32 v33, v62 ; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v53 +; SI-NEXT: v_mov_b32_e32 v53, v61 +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v59 +; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: v_mov_b32_e32 v51, v57 +; SI-NEXT: v_mov_b32_e32 v57, v50 +; SI-NEXT: v_mov_b32_e32 v50, v47 +; SI-NEXT: v_mov_b32_e32 v47, v48 +; SI-NEXT: v_mov_b32_e32 v48, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: v_mov_b32_e32 v31, v61 -; SI-NEXT: v_mov_b32_e32 v61, v63 -; SI-NEXT: v_mov_b32_e32 v63, v60 ; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v62, v47 -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: v_mov_b32_e32 v56, v57 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v58, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v47 +; SI-NEXT: v_mov_b32_e32 v47, v50 +; SI-NEXT: v_mov_b32_e32 v50, v57 +; SI-NEXT: v_mov_b32_e32 v57, v51 +; SI-NEXT: v_mov_b32_e32 v51, v59 +; SI-NEXT: v_mov_b32_e32 v59, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v35 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v41, v43 +; SI-NEXT: v_mov_b32_e32 v42, v44 +; SI-NEXT: v_mov_b32_e32 v43, v45 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB15_2 ; ; VI-LABEL: bitcast_v60i16_to_v30i32_scalar: @@ -15867,245 +15937,248 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_mov_b32_e32 v30, s16 +; SI-NEXT: v_mov_b32_e32 v27, s16 ; SI-NEXT: v_mov_b32_e32 v28, s17 -; SI-NEXT: v_mov_b32_e32 v33, s18 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_mov_b32_e32 v32, s19 -; SI-NEXT: v_mov_b32_e32 v29, s20 -; SI-NEXT: v_mov_b32_e32 v27, s21 -; SI-NEXT: v_mov_b32_e32 v25, s22 +; SI-NEXT: v_mov_b32_e32 v29, s18 +; SI-NEXT: v_mov_b32_e32 v30, s19 +; SI-NEXT: v_mov_b32_e32 v25, s20 +; SI-NEXT: v_mov_b32_e32 v26, s21 +; SI-NEXT: v_mov_b32_e32 v23, s22 ; SI-NEXT: v_mov_b32_e32 v24, s23 -; SI-NEXT: v_mov_b32_e32 v23, s24 -; SI-NEXT: v_mov_b32_e32 v21, s25 -; SI-NEXT: v_mov_b32_e32 v20, s26 -; SI-NEXT: v_mov_b32_e32 v19, s27 -; SI-NEXT: v_mov_b32_e32 v18, s28 -; SI-NEXT: v_mov_b32_e32 v17, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v21, s24 +; SI-NEXT: v_mov_b32_e32 v22, s25 +; SI-NEXT: v_mov_b32_e32 v19, s26 +; SI-NEXT: v_mov_b32_e32 v20, s27 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: v_mov_b32_e32 v17, s28 +; SI-NEXT: v_mov_b32_e32 v18, s29 +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB29_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v26, v14, v13, 16 -; SI-NEXT: 
v_alignbit_b32 v31, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v34, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v38, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v48, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v51, v17, v18, 16 -; SI-NEXT: v_alignbit_b32 v53, v19, v20, 16 -; SI-NEXT: v_alignbit_b32 v55, v21, v23, 16 -; SI-NEXT: v_alignbit_b32 v41, v24, v25, 16 -; SI-NEXT: v_alignbit_b32 v44, v27, v29, 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[29:30], 16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v32, v33, 16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v28, v30, 16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 +; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[27:28], 16 ; SI-NEXT: s_cbranch_execnz .LBB29_3 ; SI-NEXT: .LBB29_2: ; %cmp.true -; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 -; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 -; SI-NEXT: v_add_f32_e32 v32, 1.0, v32 -; SI-NEXT: v_add_f32_e32 v33, 1.0, v33 -; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 -; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 -; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 -; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 +; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 +; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 +; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 +; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 +; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 +; SI-NEXT: v_lshr_b64 v[31:32], v[15:16], 16 +; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 +; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 +; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 +; SI-NEXT: 
v_add_f32_e32 v11, 1.0, v11 +; SI-NEXT: v_lshr_b64 v[32:33], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[17:18], 16 +; SI-NEXT: v_add_f32_e32 v22, 1.0, v22 ; SI-NEXT: v_add_f32_e32 v21, 1.0, v21 +; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 +; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 +; SI-NEXT: v_lshr_b64 v[33:34], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[19:20], 16 +; SI-NEXT: v_add_f32_e32 v24, 1.0, v24 ; SI-NEXT: v_add_f32_e32 v23, 1.0, v23 -; SI-NEXT: v_add_f32_e32 v19, 1.0, v19 -; SI-NEXT: v_add_f32_e32 v20, 1.0, v20 -; SI-NEXT: v_add_f32_e32 v17, 1.0, v17 -; SI-NEXT: v_add_f32_e32 v18, 1.0, v18 -; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 -; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 -; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 -; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 -; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 -; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 ; SI-NEXT: v_add_f32_e32 v8, 1.0, v8 ; SI-NEXT: v_add_f32_e32 v7, 1.0, v7 -; SI-NEXT: v_add_f32_e32 v10, 1.0, v10 -; SI-NEXT: v_add_f32_e32 v9, 1.0, v9 -; SI-NEXT: v_add_f32_e32 v12, 1.0, v12 -; SI-NEXT: v_add_f32_e32 v11, 1.0, v11 -; SI-NEXT: v_add_f32_e32 v14, 1.0, v14 -; SI-NEXT: v_add_f32_e32 v13, 1.0, v13 -; SI-NEXT: v_add_f32_e32 v16, 1.0, v16 -; SI-NEXT: v_add_f32_e32 v15, 1.0, v15 -; SI-NEXT: v_alignbit_b32 v22, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v26, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v31, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v34, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v38, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v48, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v51, v17, v18, 16 -; SI-NEXT: v_alignbit_b32 v53, v19, v20, 16 -; SI-NEXT: v_alignbit_b32 v55, v21, v23, 16 -; SI-NEXT: v_alignbit_b32 v41, v24, v25, 16 -; SI-NEXT: v_alignbit_b32 v44, v27, v29, 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[50:51], v[21:22], 16 +; SI-NEXT: v_add_f32_e32 v26, 1.0, v26 +; SI-NEXT: v_add_f32_e32 v25, 1.0, v25 +; SI-NEXT: v_add_f32_e32 v6, 1.0, v6 +; SI-NEXT: v_add_f32_e32 v5, 1.0, v5 +; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[23:24], 16 +; SI-NEXT: v_add_f32_e32 v30, 1.0, v30 +; SI-NEXT: v_add_f32_e32 v29, 1.0, v29 +; SI-NEXT: v_add_f32_e32 v4, 1.0, v4 +; SI-NEXT: v_add_f32_e32 v3, 1.0, v3 +; SI-NEXT: v_lshr_b64 v[36:37], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[25:26], 16 +; SI-NEXT: v_add_f32_e32 v28, 1.0, v28 +; SI-NEXT: v_add_f32_e32 v27, 1.0, v27 +; SI-NEXT: v_add_f32_e32 v2, 1.0, v2 +; SI-NEXT: v_add_f32_e32 v1, 1.0, v1 +; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v32, v33, 16 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v18 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v28, v30, 16 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v50, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v8 -; 
SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v22 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v24 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v32 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v28 ; SI-NEXT: .LBB29_3: ; %end -; SI-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; SI-NEXT: v_or_b32_e32 v30, v30, v56 -; SI-NEXT: buffer_store_dword v30, v0, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v60 -; SI-NEXT: v_or_b32_e32 v28, v28, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v46 -; SI-NEXT: v_or_b32_e32 v28, v28, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 8, v0 -; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v54 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v27, v39 +; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v28, 0xffff, v32 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v59 -; SI-NEXT: v_or_b32_e32 v28, v28, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v28, v30, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v62 +; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 ; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v44 -; SI-NEXT: v_or_b32_e32 v28, v28, v29 -; SI-NEXT: v_add_i32_e32 v29, vcc, 16, v0 -; SI-NEXT: buffer_store_dword v28, v29, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 +; SI-NEXT: v_add_i32_e32 v28, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v58 +; SI-NEXT: v_and_b32_e32 v27, 0xffff, v30 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v61 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 -; SI-NEXT: v_add_i32_e32 v28, vcc, 20, v0 +; SI-NEXT: v_add_i32_e32 v28, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v52 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v25, v25, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 +; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v24, 0xffff, v24 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v57 -; SI-NEXT: v_or_b32_e32 v24, v24, v25 -; SI-NEXT: 
v_add_i32_e32 v25, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v24, v25, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v60 +; SI-NEXT: v_or_b32_e32 v25, v25, v26 +; SI-NEXT: v_add_i32_e32 v26, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v51 ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; SI-NEXT: v_or_b32_e32 v23, v23, v25 +; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v55 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v59 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 -; SI-NEXT: v_add_i32_e32 v24, vcc, 32, v0 +; SI-NEXT: v_add_i32_e32 v24, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v50 ; SI-NEXT: v_or_b32_e32 v21, v21, v23 -; SI-NEXT: v_add_i32_e32 v23, vcc, 36, v0 +; SI-NEXT: v_add_i32_e32 v23, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v53 -; SI-NEXT: v_or_b32_e32 v20, v20, v21 -; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen +; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v58 +; SI-NEXT: v_or_b32_e32 v21, v21, v22 +; SI-NEXT: v_add_i32_e32 v22, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v49 +; SI-NEXT: v_or_b32_e32 v19, v19, v21 +; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v57 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v51 -; SI-NEXT: v_or_b32_e32 v18, v18, v19 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v48 +; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v43 +; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v56 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v38 ; SI-NEXT: v_or_b32_e32 v1, v1, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: 
v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -16117,7 +16190,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -16129,7 +16202,7 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -16141,92 +16214,94 @@ define inreg <60 x i16> @bitcast_v30f32_to_v60i16_scalar(<30 x float> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v33 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v13 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v26 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v32 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x68, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v22 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v31 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v37 +; 
SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB29_4: -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr44 +; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr48 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr51 +; SI-NEXT: ; implicit-def: $vgpr44 ; SI-NEXT: ; implicit-def: $vgpr43 -; 
SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr42 -; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr41 ; SI-NEXT: ; implicit-def: $vgpr40 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr50 +; SI-NEXT: ; implicit-def: $vgpr33 +; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr49 -; SI-NEXT: ; implicit-def: $vgpr26 -; SI-NEXT: ; implicit-def: $vgpr39 -; SI-NEXT: ; implicit-def: $vgpr22 -; SI-NEXT: ; implicit-def: $vgpr37 ; SI-NEXT: s_branch .LBB29_2 ; ; VI-LABEL: bitcast_v30f32_to_v60i16_scalar: @@ -18310,41 +18385,44 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v60, v16 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v44, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v33, v26 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: 
v_lshlrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v48, v24 +; SI-NEXT: v_mov_b32_e32 v49, v22 +; SI-NEXT: v_mov_b32_e32 v47, v20 +; SI-NEXT: v_mov_b32_e32 v50, v18 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 @@ -18355,8 +18433,8 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -18368,51 +18446,51 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB31_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v9, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v10, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 
v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: v_or_b32_e32 v17, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v18, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: v_or_b32_e32 v19, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_or_b32_e32 v20, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: v_or_b32_e32 v21, v0, v43 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -18425,17 +18503,17 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v26 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -18446,30 +18524,30 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: v_or_b32_e32 v23, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v22, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v23, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v25, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v24, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v25, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v27, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v26, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v27, v0, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_or_b32_e32 v28, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v29, v0, v33 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_cbranch_execnz .LBB31_3 ; SI-NEXT: .LBB31_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: 
buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -18512,119 +18590,119 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; 
SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: .LBB31_3: ; %end @@ -18647,35 +18725,67 @@ define inreg <30 x float> @bitcast_v60i16_to_v30f32_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] 
; SI-NEXT: .LBB31_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v58 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v57, v56 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v45, v43 +; SI-NEXT: v_mov_b32_e32 v44, v42 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v42, v40 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: v_mov_b32_e32 v33, v62 ; SI-NEXT: v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v53 +; SI-NEXT: v_mov_b32_e32 v53, v61 +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v59 +; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: v_mov_b32_e32 v51, v57 +; SI-NEXT: v_mov_b32_e32 v57, v50 +; SI-NEXT: v_mov_b32_e32 v50, v47 +; SI-NEXT: v_mov_b32_e32 v47, v48 +; SI-NEXT: v_mov_b32_e32 v48, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: v_mov_b32_e32 v31, v61 -; SI-NEXT: v_mov_b32_e32 v61, v63 -; SI-NEXT: v_mov_b32_e32 v63, v60 ; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v62, v47 -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: v_mov_b32_e32 v56, v57 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v58, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v47 +; SI-NEXT: v_mov_b32_e32 v47, v50 +; SI-NEXT: v_mov_b32_e32 v50, v57 +; SI-NEXT: v_mov_b32_e32 v57, v51 +; SI-NEXT: v_mov_b32_e32 v51, v59 +; SI-NEXT: v_mov_b32_e32 v59, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v35 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v41, v43 +; SI-NEXT: v_mov_b32_e32 v42, v44 +; SI-NEXT: v_mov_b32_e32 v43, v45 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, 
s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB31_2 ; ; VI-LABEL: bitcast_v60i16_to_v30f32_scalar: @@ -26969,361 +27079,396 @@ define inreg <60 x i16> @bitcast_v15i64_to_v60i16_scalar(<15 x i64> inreg %a, i3 ; SI-LABEL: bitcast_v15i64_to_v60i16_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: s_mov_b64 exec, s[4:5] +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_writelane_b32 v18, s30, 0 +; SI-NEXT: v_writelane_b32 v18, s31, 1 +; SI-NEXT: v_writelane_b32 v18, s34, 2 +; SI-NEXT: v_writelane_b32 v18, s35, 3 +; SI-NEXT: v_writelane_b32 v18, s36, 4 +; SI-NEXT: v_writelane_b32 v18, s37, 5 +; SI-NEXT: v_writelane_b32 v18, s38, 6 +; SI-NEXT: v_writelane_b32 v18, s39, 7 +; SI-NEXT: v_writelane_b32 v18, s48, 8 +; SI-NEXT: v_writelane_b32 v18, s49, 9 +; SI-NEXT: v_writelane_b32 v18, s50, 10 +; SI-NEXT: v_writelane_b32 v18, s51, 11 +; SI-NEXT: v_writelane_b32 v18, s52, 12 +; SI-NEXT: v_writelane_b32 v18, s53, 13 +; SI-NEXT: v_writelane_b32 v18, s54, 14 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 -; SI-NEXT: v_readfirstlane_b32 s45, v1 -; SI-NEXT: v_readfirstlane_b32 s44, v2 -; SI-NEXT: v_readfirstlane_b32 s43, v3 -; SI-NEXT: v_readfirstlane_b32 s42, v4 -; SI-NEXT: v_readfirstlane_b32 s41, v5 -; SI-NEXT: v_readfirstlane_b32 s40, v6 -; SI-NEXT: v_readfirstlane_b32 s15, v7 -; SI-NEXT: v_readfirstlane_b32 s14, v8 -; SI-NEXT: v_readfirstlane_b32 s13, v9 -; SI-NEXT: v_readfirstlane_b32 s12, v10 -; SI-NEXT: v_readfirstlane_b32 s11, v11 -; SI-NEXT: v_readfirstlane_b32 s10, v12 -; SI-NEXT: v_readfirstlane_b32 s9, v13 -; SI-NEXT: v_readfirstlane_b32 s8, v14 -; SI-NEXT: v_readfirstlane_b32 s7, v15 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_readfirstlane_b32 s6, v16 +; SI-NEXT: v_writelane_b32 v18, s55, 15 +; SI-NEXT: v_readfirstlane_b32 s42, v1 +; SI-NEXT: v_readfirstlane_b32 s43, v2 +; SI-NEXT: v_readfirstlane_b32 s40, v3 +; SI-NEXT: v_readfirstlane_b32 s41, v4 +; SI-NEXT: v_readfirstlane_b32 s14, v5 +; SI-NEXT: v_readfirstlane_b32 s15, v6 +; SI-NEXT: v_readfirstlane_b32 s12, v7 +; SI-NEXT: v_readfirstlane_b32 s13, v8 +; SI-NEXT: v_readfirstlane_b32 s10, v9 +; SI-NEXT: v_readfirstlane_b32 s11, v10 +; SI-NEXT: v_readfirstlane_b32 s8, v11 +; SI-NEXT: v_readfirstlane_b32 s9, v12 +; SI-NEXT: v_readfirstlane_b32 s6, v13 +; SI-NEXT: v_readfirstlane_b32 s7, v14 +; SI-NEXT: v_readfirstlane_b32 s4, v15 +; SI-NEXT: s_and_b64 s[44:45], vcc, exec +; SI-NEXT: v_readfirstlane_b32 s5, v16 +; SI-NEXT: v_writelane_b32 v18, s64, 16 ; SI-NEXT: s_cbranch_scc0 .LBB41_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s43 -; SI-NEXT: v_mov_b32_e32 v8, s45 -; SI-NEXT: v_mov_b32_e32 v9, s28 -; SI-NEXT: v_mov_b32_e32 v10, s26 -; SI-NEXT: v_mov_b32_e32 v11, s24 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s20 -; SI-NEXT: v_mov_b32_e32 v14, s18 -; SI-NEXT: v_mov_b32_e32 v15, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 -; SI-NEXT: 
v_alignbit_b32 v9, s29, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s27, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s25, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s23, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s21, v13, 16 -; SI-NEXT: v_alignbit_b32 v14, s19, v14, 16 -; SI-NEXT: v_alignbit_b32 v15, s17, v15, 16 -; SI-NEXT: s_lshr_b32 s46, s6, 16 -; SI-NEXT: s_lshr_b32 s47, s8, 16 -; SI-NEXT: s_lshr_b32 s56, s10, 16 -; SI-NEXT: s_lshr_b32 s57, s12, 16 -; SI-NEXT: s_lshr_b32 s58, s14, 16 -; SI-NEXT: s_lshr_b32 s59, s40, 16 -; SI-NEXT: s_lshr_b32 s60, s42, 16 -; SI-NEXT: s_lshr_b32 s61, s44, 16 -; SI-NEXT: s_lshr_b32 s62, s29, 16 -; SI-NEXT: s_lshr_b32 s63, s27, 16 -; SI-NEXT: s_lshr_b32 s72, s25, 16 -; SI-NEXT: s_lshr_b32 s73, s23, 16 -; SI-NEXT: s_lshr_b32 s74, s21, 16 -; SI-NEXT: s_lshr_b32 s75, s19, 16 -; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: s_lshr_b32 s34, s5, 16 +; SI-NEXT: s_lshr_b32 s35, s7, 16 +; SI-NEXT: s_lshr_b32 s36, s9, 16 +; SI-NEXT: s_lshr_b32 s37, s11, 16 +; SI-NEXT: s_lshr_b32 s38, s13, 16 +; SI-NEXT: s_lshr_b32 s39, s15, 16 +; SI-NEXT: s_lshr_b32 s48, s41, 16 +; SI-NEXT: s_lshr_b32 s49, s43, 16 +; SI-NEXT: s_lshr_b32 s50, s29, 16 +; SI-NEXT: s_lshr_b32 s51, s27, 16 +; SI-NEXT: s_lshr_b32 s52, s25, 16 +; SI-NEXT: s_lshr_b32 s53, s23, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s19, 16 +; SI-NEXT: s_lshr_b32 s64, s17, 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB41_3 ; SI-NEXT: .LBB41_2: ; %cmp.true -; SI-NEXT: s_add_u32 s16, s16, 3 -; SI-NEXT: s_addc_u32 s17, s17, 0 -; SI-NEXT: s_add_u32 s18, s18, 3 -; SI-NEXT: s_addc_u32 s19, s19, 0 -; SI-NEXT: s_add_u32 s20, s20, 3 -; SI-NEXT: s_addc_u32 s21, s21, 0 -; SI-NEXT: s_add_u32 s22, s22, 3 -; SI-NEXT: s_addc_u32 s23, s23, 0 -; SI-NEXT: s_add_u32 s24, s24, 3 -; SI-NEXT: s_addc_u32 s25, s25, 0 -; SI-NEXT: s_add_u32 s26, s26, 3 -; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s4, s4, 3 +; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: s_add_u32 s6, s6, 3 +; SI-NEXT: s_addc_u32 s7, s7, 0 +; SI-NEXT: s_add_u32 s8, s8, 3 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_add_u32 s10, s10, 3 +; SI-NEXT: s_addc_u32 s11, s11, 0 +; SI-NEXT: s_add_u32 s12, s12, 3 +; SI-NEXT: s_addc_u32 s13, s13, 0 +; SI-NEXT: s_add_u32 s14, s14, 3 +; SI-NEXT: s_addc_u32 s15, s15, 0 +; SI-NEXT: s_add_u32 s40, s40, 3 +; SI-NEXT: s_addc_u32 s41, s41, 0 +; SI-NEXT: s_add_u32 s42, s42, 3 +; SI-NEXT: s_addc_u32 s43, s43, 0 ; SI-NEXT: s_add_u32 s28, s28, 3 ; SI-NEXT: s_addc_u32 s29, s29, 0 -; SI-NEXT: s_add_u32 s45, s45, 3 -; SI-NEXT: s_addc_u32 s44, s44, 0 -; SI-NEXT: s_add_u32 s43, s43, 3 -; SI-NEXT: s_addc_u32 s42, s42, 0 -; SI-NEXT: s_add_u32 s41, s41, 3 -; SI-NEXT: s_addc_u32 s40, s40, 0 -; SI-NEXT: s_add_u32 s15, s15, 3 -; SI-NEXT: s_addc_u32 s14, s14, 0 -; SI-NEXT: s_add_u32 s13, s13, 3 -; SI-NEXT: s_addc_u32 s12, s12, 0 -; SI-NEXT: s_add_u32 s11, s11, 3 -; SI-NEXT: s_addc_u32 
s10, s10, 0 -; SI-NEXT: s_add_u32 s9, s9, 3 -; SI-NEXT: s_addc_u32 s8, s8, 0 -; SI-NEXT: s_add_u32 s7, s7, 3 -; SI-NEXT: s_addc_u32 s6, s6, 0 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: v_mov_b32_e32 v2, s9 -; SI-NEXT: v_mov_b32_e32 v3, s11 -; SI-NEXT: v_mov_b32_e32 v4, s13 -; SI-NEXT: v_mov_b32_e32 v5, s15 -; SI-NEXT: v_mov_b32_e32 v6, s41 -; SI-NEXT: v_mov_b32_e32 v7, s43 -; SI-NEXT: v_mov_b32_e32 v8, s45 -; SI-NEXT: v_mov_b32_e32 v9, s28 -; SI-NEXT: v_mov_b32_e32 v10, s26 -; SI-NEXT: v_mov_b32_e32 v11, s24 -; SI-NEXT: v_mov_b32_e32 v12, s22 -; SI-NEXT: v_mov_b32_e32 v13, s20 -; SI-NEXT: v_mov_b32_e32 v14, s18 -; SI-NEXT: v_mov_b32_e32 v15, s16 -; SI-NEXT: v_alignbit_b32 v1, s6, v1, 16 -; SI-NEXT: v_alignbit_b32 v2, s8, v2, 16 -; SI-NEXT: v_alignbit_b32 v3, s10, v3, 16 -; SI-NEXT: v_alignbit_b32 v4, s12, v4, 16 -; SI-NEXT: v_alignbit_b32 v5, s14, v5, 16 -; SI-NEXT: v_alignbit_b32 v6, s40, v6, 16 -; SI-NEXT: v_alignbit_b32 v7, s42, v7, 16 -; SI-NEXT: v_alignbit_b32 v8, s44, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s29, v9, 16 -; SI-NEXT: v_alignbit_b32 v10, s27, v10, 16 -; SI-NEXT: v_alignbit_b32 v11, s25, v11, 16 -; SI-NEXT: v_alignbit_b32 v12, s23, v12, 16 -; SI-NEXT: v_alignbit_b32 v13, s21, v13, 16 -; SI-NEXT: v_alignbit_b32 v14, s19, v14, 16 -; SI-NEXT: v_alignbit_b32 v15, s17, v15, 16 -; SI-NEXT: s_lshr_b32 s46, s6, 16 -; SI-NEXT: s_lshr_b32 s47, s8, 16 -; SI-NEXT: s_lshr_b32 s56, s10, 16 -; SI-NEXT: s_lshr_b32 s57, s12, 16 -; SI-NEXT: s_lshr_b32 s58, s14, 16 -; SI-NEXT: s_lshr_b32 s59, s40, 16 -; SI-NEXT: s_lshr_b32 s60, s42, 16 -; SI-NEXT: s_lshr_b32 s61, s44, 16 -; SI-NEXT: s_lshr_b32 s62, s29, 16 -; SI-NEXT: s_lshr_b32 s63, s27, 16 -; SI-NEXT: s_lshr_b32 s72, s25, 16 -; SI-NEXT: s_lshr_b32 s73, s23, 16 -; SI-NEXT: s_lshr_b32 s74, s21, 16 -; SI-NEXT: s_lshr_b32 s75, s19, 16 -; SI-NEXT: s_lshr_b32 s76, s17, 16 +; SI-NEXT: s_add_u32 s26, s26, 3 +; SI-NEXT: s_addc_u32 s27, s27, 0 +; SI-NEXT: s_add_u32 s24, s24, 3 +; SI-NEXT: s_addc_u32 s25, s25, 0 +; SI-NEXT: s_add_u32 s22, s22, 3 +; SI-NEXT: s_addc_u32 s23, s23, 0 +; SI-NEXT: s_add_u32 s20, s20, 3 +; SI-NEXT: s_addc_u32 s21, s21, 0 +; SI-NEXT: s_add_u32 s18, s18, 3 +; SI-NEXT: s_addc_u32 s19, s19, 0 +; SI-NEXT: s_add_u32 s16, s16, 3 +; SI-NEXT: s_addc_u32 s17, s17, 0 +; SI-NEXT: s_lshr_b32 s34, s5, 16 +; SI-NEXT: s_lshr_b32 s35, s7, 16 +; SI-NEXT: s_lshr_b32 s36, s9, 16 +; SI-NEXT: s_lshr_b32 s37, s11, 16 +; SI-NEXT: s_lshr_b32 s38, s13, 16 +; SI-NEXT: s_lshr_b32 s39, s15, 16 +; SI-NEXT: s_lshr_b32 s48, s41, 16 +; SI-NEXT: s_lshr_b32 s49, s43, 16 +; SI-NEXT: s_lshr_b32 s50, s29, 16 +; SI-NEXT: s_lshr_b32 s51, s27, 16 +; SI-NEXT: s_lshr_b32 s52, s25, 16 +; SI-NEXT: s_lshr_b32 s53, s23, 16 +; SI-NEXT: s_lshr_b32 s54, s21, 16 +; SI-NEXT: s_lshr_b32 s55, s19, 16 +; SI-NEXT: s_lshr_b32 s64, s17, 16 +; SI-NEXT: s_lshr_b64 s[44:45], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[46:47], s[6:7], 16 +; SI-NEXT: s_lshr_b64 s[56:57], s[8:9], 16 +; SI-NEXT: s_lshr_b64 s[58:59], s[10:11], 16 +; SI-NEXT: s_lshr_b64 s[60:61], s[12:13], 16 +; SI-NEXT: s_lshr_b64 s[62:63], s[14:15], 16 +; SI-NEXT: s_lshr_b64 s[72:73], s[40:41], 16 +; SI-NEXT: s_lshr_b64 s[74:75], s[42:43], 16 +; SI-NEXT: s_lshr_b64 s[76:77], s[28:29], 16 +; SI-NEXT: s_lshr_b64 s[78:79], s[26:27], 16 +; SI-NEXT: s_lshr_b64 s[88:89], s[24:25], 16 +; SI-NEXT: s_lshr_b64 s[90:91], s[22:23], 16 +; SI-NEXT: s_lshr_b64 s[92:93], s[20:21], 16 +; SI-NEXT: s_lshr_b64 s[94:95], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[30:31], s[16:17], 16 ; SI-NEXT: .LBB41_3: ; %end -; SI-NEXT: s_and_b32 s4, s16, 0xffff -; 
SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; SI-NEXT: v_or_b32_e32 v15, s4, v15 -; SI-NEXT: s_and_b32 s4, s17, 0xffff -; SI-NEXT: s_lshl_b32 s5, s76, 16 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: v_mov_b32_e32 v16, s4 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: buffer_store_dword v15, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v15, vcc, 4, v0 -; SI-NEXT: v_or_b32_e32 v14, s4, v14 -; SI-NEXT: s_and_b32 s4, s19, 0xffff -; SI-NEXT: s_lshl_b32 s5, s75, 16 -; SI-NEXT: buffer_store_dword v16, v15, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v15, vcc, 8, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v14, v15, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v15, s4 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; SI-NEXT: s_lshl_b32 s45, s30, 16 +; SI-NEXT: s_and_b32 s16, s16, 0xffff +; SI-NEXT: s_or_b32 s16, s16, s45 +; SI-NEXT: v_mov_b32_e32 v1, s16 +; SI-NEXT: s_and_b32 s16, s17, 0xffff +; SI-NEXT: s_lshl_b32 s17, s64, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_lshl_b32 s16, s94, 16 +; SI-NEXT: s_and_b32 s17, s18, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_mov_b32_e32 v3, s16 +; SI-NEXT: s_and_b32 s16, s19, 0xffff +; SI-NEXT: s_lshl_b32 s17, s55, 16 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: v_mov_b32_e32 v4, s16 +; SI-NEXT: s_lshl_b32 s16, s92, 16 +; SI-NEXT: s_and_b32 s17, s20, 0xffff +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: v_mov_b32_e32 v5, s16 +; SI-NEXT: s_and_b32 s16, s21, 0xffff +; SI-NEXT: s_lshl_b32 s17, s54, 16 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v14, vcc, 12, v0 -; SI-NEXT: v_or_b32_e32 v13, s4, v13 -; SI-NEXT: s_and_b32 s4, s21, 0xffff -; SI-NEXT: s_lshl_b32 s5, s74, 16 -; SI-NEXT: buffer_store_dword v15, v14, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v14, vcc, 16, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v13, v14, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v14, s4 -; SI-NEXT: s_and_b32 s4, s22, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; SI-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 12, v0 +; SI-NEXT: s_lshl_b32 s16, s90, 16 +; SI-NEXT: s_and_b32 s17, s22, 0xffff +; SI-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 16, v0 +; SI-NEXT: s_or_b32 s16, s17, s16 +; SI-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 20, v0 +; SI-NEXT: s_waitcnt expcnt(3) +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s23, 0xffff +; SI-NEXT: s_lshl_b32 s17, s53, 16 +; SI-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen +; SI-NEXT: v_add_i32_e32 v1, vcc, 24, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v13, vcc, 20, v0 -; SI-NEXT: v_or_b32_e32 v12, s4, v12 -; SI-NEXT: s_and_b32 s4, s23, 0xffff -; SI-NEXT: s_lshl_b32 s5, s73, 16 -; SI-NEXT: buffer_store_dword v14, v13, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v13, vcc, 24, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v12, v13, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v13, s4 -; SI-NEXT: s_and_b32 s4, s24, 
0xffff -; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s24, 0xffff +; SI-NEXT: s_lshl_b32 s17, s88, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 28, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v12, vcc, 28, v0 -; SI-NEXT: v_or_b32_e32 v11, s4, v11 -; SI-NEXT: s_and_b32 s4, s25, 0xffff -; SI-NEXT: s_lshl_b32 s5, s72, 16 -; SI-NEXT: buffer_store_dword v13, v12, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v12, vcc, 32, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v11, v12, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v12, s4 -; SI-NEXT: s_and_b32 s4, s26, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s25, 0xffff +; SI-NEXT: s_lshl_b32 s17, s52, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 32, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v11, vcc, 36, v0 -; SI-NEXT: v_or_b32_e32 v10, s4, v10 -; SI-NEXT: s_and_b32 s4, s27, 0xffff -; SI-NEXT: s_lshl_b32 s5, s63, 16 -; SI-NEXT: buffer_store_dword v12, v11, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v11, vcc, 40, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v10, v11, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v11, s4 -; SI-NEXT: s_and_b32 s4, s28, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s26, 0xffff +; SI-NEXT: s_lshl_b32 s17, s78, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 36, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 -; SI-NEXT: v_or_b32_e32 v9, s4, v9 -; SI-NEXT: s_and_b32 s4, s29, 0xffff -; SI-NEXT: s_lshl_b32 s5, s62, 16 -; SI-NEXT: buffer_store_dword v11, v10, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v10, vcc, 48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v9, v10, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v10, s4 -; SI-NEXT: s_and_b32 s4, s45, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s27, 0xffff +; SI-NEXT: s_lshl_b32 s17, s51, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 40, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 -; SI-NEXT: v_or_b32_e32 v8, s4, v8 -; SI-NEXT: s_and_b32 s4, s44, 0xffff -; SI-NEXT: s_lshl_b32 s5, s61, 16 -; SI-NEXT: buffer_store_dword v10, v9, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v8, v9, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v9, s4 -; SI-NEXT: s_and_b32 s4, s43, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s28, 0xffff +; SI-NEXT: s_lshl_b32 s17, s76, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 44, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 -; SI-NEXT: v_or_b32_e32 v7, s4, v7 -; SI-NEXT: s_and_b32 s4, s42, 0xffff -; SI-NEXT: s_lshl_b32 s5, s60, 16 -; SI-NEXT: buffer_store_dword v9, v8, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v8, vcc, 64, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v7, v8, s[0:3], 0 
offen -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_and_b32 s4, s41, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s29, 0xffff +; SI-NEXT: s_lshl_b32 s17, s50, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 48, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x44, v0 -; SI-NEXT: v_or_b32_e32 v6, s4, v6 -; SI-NEXT: s_and_b32 s4, s40, 0xffff -; SI-NEXT: s_lshl_b32 s5, s59, 16 -; SI-NEXT: buffer_store_dword v8, v7, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v7, vcc, 0x48, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v7, s4 -; SI-NEXT: s_and_b32 s4, s15, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s42, 0xffff +; SI-NEXT: s_lshl_b32 s17, s74, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 52, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 -; SI-NEXT: v_or_b32_e32 v5, s4, v5 -; SI-NEXT: s_and_b32 s4, s14, 0xffff -; SI-NEXT: s_lshl_b32 s5, s58, 16 -; SI-NEXT: buffer_store_dword v7, v6, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v6, s4 -; SI-NEXT: s_and_b32 s4, s13, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s43, 0xffff +; SI-NEXT: s_lshl_b32 s17, s49, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 56, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 -; SI-NEXT: v_or_b32_e32 v4, s4, v4 -; SI-NEXT: s_and_b32 s4, s12, 0xffff -; SI-NEXT: s_lshl_b32 s5, s57, 16 -; SI-NEXT: buffer_store_dword v6, v5, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v5, vcc, 0x58, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v4, v5, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v5, s4 -; SI-NEXT: s_and_b32 s4, s11, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s40, 0xffff +; SI-NEXT: s_lshl_b32 s17, s72, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 60, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 -; SI-NEXT: v_or_b32_e32 v3, s4, v3 -; SI-NEXT: s_and_b32 s4, s10, 0xffff -; SI-NEXT: s_lshl_b32 s5, s56, 16 -; SI-NEXT: buffer_store_dword v5, v4, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x60, v0 -; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v4, s4 -; SI-NEXT: s_and_b32 s4, s9, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s16, s41, 0xffff +; SI-NEXT: s_lshl_b32 s17, s48, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 64, v0 +; SI-NEXT: s_or_b32 s16, s16, s17 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 -; SI-NEXT: v_or_b32_e32 v2, s4, v2 -; SI-NEXT: s_and_b32 s4, s8, 0xffff -; SI-NEXT: s_lshl_b32 s5, s47, 16 -; SI-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: s_or_b32 
s4, s4, s5 -; SI-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; SI-NEXT: v_mov_b32_e32 v3, s4 -; SI-NEXT: s_and_b32 s4, s7, 0xffff -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_mov_b32_e32 v2, s16 +; SI-NEXT: s_and_b32 s14, s14, 0xffff +; SI-NEXT: s_lshl_b32 s16, s62, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x44, v0 +; SI-NEXT: s_or_b32 s14, s14, s16 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 -; SI-NEXT: v_or_b32_e32 v1, s4, v1 -; SI-NEXT: s_and_b32 s4, s6, 0xffff -; SI-NEXT: s_lshl_b32 s5, s46, 16 -; SI-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen -; SI-NEXT: v_add_i32_e32 v2, vcc, 0x70, v0 +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s14, s15, 0xffff +; SI-NEXT: s_lshl_b32 s15, s39, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x48, v0 +; SI-NEXT: s_or_b32 s14, s14, s15 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: s_and_b32 s12, s12, 0xffff +; SI-NEXT: s_lshl_b32 s14, s60, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x4c, v0 +; SI-NEXT: s_or_b32 s12, s12, s14 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s12, s13, 0xffff +; SI-NEXT: s_lshl_b32 s13, s38, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x50, v0 +; SI-NEXT: s_or_b32 s12, s12, s13 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s12 +; SI-NEXT: s_and_b32 s10, s10, 0xffff +; SI-NEXT: s_lshl_b32 s12, s58, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x54, v0 +; SI-NEXT: s_or_b32 s10, s10, s12 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s10, s11, 0xffff +; SI-NEXT: s_lshl_b32 s11, s37, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x58, v0 +; SI-NEXT: s_or_b32 s10, s10, s11 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: s_and_b32 s8, s8, 0xffff +; SI-NEXT: s_lshl_b32 s10, s56, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x5c, v0 +; SI-NEXT: s_or_b32 s8, s8, s10 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s8, s9, 0xffff +; SI-NEXT: s_lshl_b32 s9, s36, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x60, v0 +; SI-NEXT: s_or_b32 s8, s8, s9 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_lshl_b32 s8, s46, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x64, v0 +; SI-NEXT: s_or_b32 s6, s6, s8 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s6, s7, 0xffff +; SI-NEXT: s_lshl_b32 s7, s35, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x68, v0 +; SI-NEXT: s_or_b32 s6, s6, s7 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: s_and_b32 s4, s4, 0xffff +; SI-NEXT: s_lshl_b32 s6, s44, 16 +; SI-NEXT: v_add_i32_e32 v1, vcc, 0x6c, v0 +; SI-NEXT: s_or_b32 s4, s4, s6 +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_lshl_b32 s5, s34, 16 +; SI-NEXT: v_add_i32_e32 v1, 
vcc, 0x70, v0 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s4 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: v_readlane_b32 s64, v18, 16 +; SI-NEXT: v_readlane_b32 s55, v18, 15 +; SI-NEXT: v_readlane_b32 s54, v18, 14 +; SI-NEXT: v_readlane_b32 s53, v18, 13 +; SI-NEXT: v_readlane_b32 s52, v18, 12 +; SI-NEXT: v_readlane_b32 s51, v18, 11 +; SI-NEXT: v_readlane_b32 s50, v18, 10 +; SI-NEXT: v_readlane_b32 s49, v18, 9 +; SI-NEXT: v_readlane_b32 s48, v18, 8 +; SI-NEXT: v_readlane_b32 s39, v18, 7 +; SI-NEXT: v_readlane_b32 s38, v18, 6 +; SI-NEXT: v_readlane_b32 s37, v18, 5 +; SI-NEXT: v_readlane_b32 s36, v18, 4 +; SI-NEXT: v_readlane_b32 s35, v18, 3 +; SI-NEXT: v_readlane_b32 s34, v18, 2 +; SI-NEXT: v_readlane_b32 s31, v18, 1 +; SI-NEXT: v_readlane_b32 s30, v18, 0 +; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; SI-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: s_mov_b64 exec, s[4:5] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB41_4: -; SI-NEXT: ; implicit-def: $vgpr15 +; SI-NEXT: ; implicit-def: $sgpr30 +; SI-NEXT: ; implicit-def: $sgpr64 +; SI-NEXT: ; implicit-def: $sgpr94 +; SI-NEXT: ; implicit-def: $sgpr55 +; SI-NEXT: ; implicit-def: $sgpr92 +; SI-NEXT: ; implicit-def: $sgpr54 +; SI-NEXT: ; implicit-def: $sgpr90 +; SI-NEXT: ; implicit-def: $sgpr53 +; SI-NEXT: ; implicit-def: $sgpr88 +; SI-NEXT: ; implicit-def: $sgpr52 +; SI-NEXT: ; implicit-def: $sgpr78 +; SI-NEXT: ; implicit-def: $sgpr51 ; SI-NEXT: ; implicit-def: $sgpr76 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $sgpr75 -; SI-NEXT: ; implicit-def: $vgpr13 +; SI-NEXT: ; implicit-def: $sgpr50 ; SI-NEXT: ; implicit-def: $sgpr74 -; SI-NEXT: ; implicit-def: $vgpr12 -; SI-NEXT: ; implicit-def: $sgpr73 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr49 ; SI-NEXT: ; implicit-def: $sgpr72 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $sgpr63 -; SI-NEXT: ; implicit-def: $vgpr9 +; SI-NEXT: ; implicit-def: $sgpr48 ; SI-NEXT: ; implicit-def: $sgpr62 -; SI-NEXT: ; implicit-def: $vgpr8 -; SI-NEXT: ; implicit-def: $sgpr61 -; SI-NEXT: ; implicit-def: $vgpr7 +; SI-NEXT: ; implicit-def: $sgpr39 ; SI-NEXT: ; implicit-def: $sgpr60 -; SI-NEXT: ; implicit-def: $vgpr6 -; SI-NEXT: ; implicit-def: $sgpr59 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr38 ; SI-NEXT: ; implicit-def: $sgpr58 -; SI-NEXT: ; implicit-def: $vgpr4 -; SI-NEXT: ; implicit-def: $sgpr57 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr37 ; SI-NEXT: ; implicit-def: $sgpr56 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $sgpr47 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $sgpr36 ; SI-NEXT: ; implicit-def: $sgpr46 +; SI-NEXT: ; implicit-def: $sgpr35 +; SI-NEXT: ; implicit-def: $sgpr34 +; SI-NEXT: ; implicit-def: $sgpr44 ; SI-NEXT: s_branch .LBB41_2 ; ; VI-LABEL: bitcast_v15i64_to_v60i16_scalar: @@ -29422,41 +29567,44 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, 
v16 -; SI-NEXT: v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v60, v16 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v44, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v33, v26 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v48, v24 +; SI-NEXT: v_mov_b32_e32 v49, v22 +; SI-NEXT: v_mov_b32_e32 v47, v20 +; SI-NEXT: v_mov_b32_e32 v50, v18 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 @@ -29467,8 +29615,8 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -29480,51 +29628,51 @@ 
define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v9, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v10, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: v_or_b32_e32 v17, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v18, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: v_or_b32_e32 v19, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_or_b32_e32 v20, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: v_or_b32_e32 v21, v0, v43 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -29537,17 +29685,17 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_or_b32 s6, s6, s7 ; 
SI-NEXT: s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v26 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -29558,30 +29706,30 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: v_or_b32_e32 v23, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v22, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v23, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v25, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v24, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v25, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v27, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v26, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v27, v0, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_or_b32_e32 v28, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v29, v0, v33 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; SI-NEXT: .LBB43_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -29624,119 +29772,119 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, 
v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: 
v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: .LBB43_3: ; %end @@ -29759,35 +29907,67 @@ define inreg <15 x i64> @bitcast_v60i16_to_v15i64_scalar(<60 x i16> inreg %a, i3 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v58 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v57, v56 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v45, v43 +; SI-NEXT: v_mov_b32_e32 v44, v42 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v42, v40 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: v_mov_b32_e32 v33, v62 ; SI-NEXT: 
v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v53 +; SI-NEXT: v_mov_b32_e32 v53, v61 +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v59 +; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: v_mov_b32_e32 v51, v57 +; SI-NEXT: v_mov_b32_e32 v57, v50 +; SI-NEXT: v_mov_b32_e32 v50, v47 +; SI-NEXT: v_mov_b32_e32 v47, v48 +; SI-NEXT: v_mov_b32_e32 v48, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: v_mov_b32_e32 v31, v61 -; SI-NEXT: v_mov_b32_e32 v61, v63 -; SI-NEXT: v_mov_b32_e32 v63, v60 ; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v62, v47 -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: v_mov_b32_e32 v56, v57 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v58, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v47 +; SI-NEXT: v_mov_b32_e32 v47, v50 +; SI-NEXT: v_mov_b32_e32 v50, v57 +; SI-NEXT: v_mov_b32_e32 v57, v51 +; SI-NEXT: v_mov_b32_e32 v51, v59 +; SI-NEXT: v_mov_b32_e32 v59, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v35 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v41, v43 +; SI-NEXT: v_mov_b32_e32 v42, v44 +; SI-NEXT: v_mov_b32_e32 v43, v45 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB43_2 ; ; VI-LABEL: bitcast_v60i16_to_v15i64_scalar: @@ -37157,203 +37337,207 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: v_mov_b32_e32 v17, s28 ; SI-NEXT: v_mov_b32_e32 v18, s29 -; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; SI-NEXT: 
buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_cbranch_scc0 .LBB49_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_alignbit_b32 v31, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v34, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v37, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v39, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v55, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v41, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v43, v26, v25, 16 +; SI-NEXT: v_lshr_b64 v[31:32], v[15:16], 16 +; SI-NEXT: v_lshr_b64 v[32:33], v[13:14], 16 +; SI-NEXT: v_lshr_b64 v[33:34], v[11:12], 16 +; SI-NEXT: v_lshr_b64 v[34:35], v[9:10], 16 +; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[36:37], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[1:2], 16 +; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[25:26], 16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v30, v29, 16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v20 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v28, v27, 16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; 
SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v28 +; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[39:40], v[27:28], 16 ; SI-NEXT: s_cbranch_execnz .LBB49_3 ; SI-NEXT: .LBB49_2: ; %cmp.true -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 -; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 -; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 -; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 -; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 -; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 -; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 -; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_add_f64 v[15:16], v[15:16], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 -; SI-NEXT: v_alignbit_b32 v31, v16, v15, 16 -; SI-NEXT: v_alignbit_b32 v32, v14, v13, 16 -; SI-NEXT: v_alignbit_b32 v33, v12, v11, 16 -; SI-NEXT: v_alignbit_b32 v34, v10, v9, 16 -; SI-NEXT: v_alignbit_b32 v35, v8, v7, 16 -; SI-NEXT: v_alignbit_b32 v36, v6, v5, 16 -; SI-NEXT: v_alignbit_b32 v37, v4, v3, 16 -; SI-NEXT: v_alignbit_b32 v39, v2, v1, 16 -; SI-NEXT: v_alignbit_b32 v50, v18, v17, 16 -; SI-NEXT: v_alignbit_b32 v52, v20, v19, 16 -; SI-NEXT: v_alignbit_b32 v55, v22, v21, 16 -; SI-NEXT: v_alignbit_b32 v41, v24, v23, 16 -; SI-NEXT: v_alignbit_b32 v43, v26, v25, 16 +; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 +; SI-NEXT: v_lshr_b64 v[31:32], v[15:16], 16 +; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 +; SI-NEXT: v_lshr_b64 v[32:33], v[13:14], 16 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 +; SI-NEXT: v_lshr_b64 v[33:34], v[11:12], 16 +; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 +; SI-NEXT: v_add_f64 v[1:2], v[1:2], 1.0 +; SI-NEXT: v_lshr_b64 v[34:35], v[9:10], 16 +; SI-NEXT: v_add_f64 v[3:4], v[3:4], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 +; SI-NEXT: v_lshr_b64 v[35:36], v[7:8], 16 +; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 +; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 +; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 +; SI-NEXT: v_lshr_b64 v[36:37], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[51:52], v[1:2], 16 +; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_lshr_b64 v[37:38], v[3:4], 16 +; SI-NEXT: v_lshr_b64 v[52:53], v[23:24], 16 +; SI-NEXT: v_lshr_b64 v[38:39], v[17:18], 16 +; SI-NEXT: v_lshr_b64 v[48:49], v[19:20], 16 +; SI-NEXT: v_lshr_b64 v[53:54], v[25:26], 16 +; SI-NEXT: v_lshr_b64 v[49:50], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[54:55], v[29:30], 16 +; SI-NEXT: v_lshr_b64 v[39:40], v[27:28], 16 +; SI-NEXT: v_lshrrev_b32_e32 v41, 
16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v4 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v2 ; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: v_alignbit_b32 v46, v30, v29, 16 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v18 +; SI-NEXT: s_waitcnt expcnt(5) +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v20 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_alignbit_b32 v56, v28, v27, 16 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v2 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v22 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v24 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v26 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v30 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v28 ; SI-NEXT: .LBB49_3: ; %end -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 +; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; SI-NEXT: v_or_b32_e32 v27, v27, v56 +; SI-NEXT: v_or_b32_e32 v27, v27, v39 ; SI-NEXT: buffer_store_dword v27, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v60 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v63 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 4, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v27, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v46 -; SI-NEXT: v_or_b32_e32 v27, v27, v28 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v54 +; SI-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; SI-NEXT: v_or_b32_e32 v27, v28, v27 ; SI-NEXT: v_add_i32_e32 v28, vcc, 8, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v27, 0xffff, v30 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v59 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v62 ; SI-NEXT: v_or_b32_e32 v27, v27, v28 ; SI-NEXT: v_add_i32_e32 v28, vcc, 12, v0 ; SI-NEXT: buffer_store_dword v27, v28, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v43 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v53 +; SI-NEXT: v_and_b32_e32 v25, 0xffff, v25 ; SI-NEXT: v_or_b32_e32 v25, v25, v27 ; SI-NEXT: v_add_i32_e32 v27, vcc, 16, v0 ; SI-NEXT: buffer_store_dword v25, v27, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v58 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v61 ; SI-NEXT: v_or_b32_e32 v25, v25, v26 ; SI-NEXT: v_add_i32_e32 v26, vcc, 20, v0 ; SI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen -; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: s_waitcnt expcnt(0) -; 
SI-NEXT: v_lshlrev_b32_e32 v25, 16, v41 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v52 +; SI-NEXT: v_and_b32_e32 v23, 0xffff, v23 ; SI-NEXT: v_or_b32_e32 v23, v23, v25 ; SI-NEXT: v_add_i32_e32 v25, vcc, 24, v0 ; SI-NEXT: buffer_store_dword v23, v25, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v23, 0xffff, v24 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v57 +; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v60 ; SI-NEXT: v_or_b32_e32 v23, v23, v24 ; SI-NEXT: v_add_i32_e32 v24, vcc, 28, v0 ; SI-NEXT: buffer_store_dword v23, v24, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v21 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v55 +; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v49 ; SI-NEXT: v_or_b32_e32 v21, v21, v23 ; SI-NEXT: v_add_i32_e32 v23, vcc, 32, v0 ; SI-NEXT: buffer_store_dword v21, v23, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v47 +; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v59 ; SI-NEXT: v_or_b32_e32 v21, v21, v22 ; SI-NEXT: v_add_i32_e32 v22, vcc, 36, v0 ; SI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v19 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v52 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v48 ; SI-NEXT: v_or_b32_e32 v19, v19, v21 ; SI-NEXT: v_add_i32_e32 v21, vcc, 40, v0 ; SI-NEXT: buffer_store_dword v19, v21, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v45 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v58 ; SI-NEXT: v_or_b32_e32 v19, v19, v20 ; SI-NEXT: v_add_i32_e32 v20, vcc, 44, v0 ; SI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v17 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v50 +; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v38 ; SI-NEXT: v_or_b32_e32 v17, v17, v19 ; SI-NEXT: v_add_i32_e32 v19, vcc, 48, v0 ; SI-NEXT: buffer_store_dword v17, v19, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v44 +; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v57 ; SI-NEXT: v_or_b32_e32 v17, v17, v18 ; SI-NEXT: v_add_i32_e32 v18, vcc, 52, v0 ; SI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v39 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v51 ; SI-NEXT: v_or_b32_e32 v1, v1, v17 ; SI-NEXT: v_add_i32_e32 v17, vcc, 56, v0 ; SI-NEXT: buffer_store_dword v1, v17, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v56 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 60, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -37365,7 +37549,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v40 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v47 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x44, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -37377,7 +37561,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: 
v_and_b32_e32 v1, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v54 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v46 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x4c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -37389,7 +37573,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v53 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v45 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x54, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -37401,7 +37585,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v51 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v44 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x5c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -37413,7 +37597,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v49 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v43 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x64, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -37425,7 +37609,7 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v48 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v42 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v2, vcc, 0x6c, v0 ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen @@ -37437,56 +37621,59 @@ define inreg <60 x i16> @bitcast_v15f64_to_v60i16_scalar(<15 x double> inreg %a, ; SI-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v38 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v41 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 
4-byte Folded Reload +; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB49_4: -; SI-NEXT: ; implicit-def: $vgpr56 +; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr54 +; SI-NEXT: ; implicit-def: $vgpr62 +; SI-NEXT: ; implicit-def: $vgpr53 +; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr52 ; SI-NEXT: ; implicit-def: $vgpr60 -; SI-NEXT: ; implicit-def: $vgpr46 +; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr59 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr57 -; SI-NEXT: ; implicit-def: $vgpr55 +; SI-NEXT: ; implicit-def: $vgpr56 ; SI-NEXT: ; implicit-def: $vgpr47 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr46 ; SI-NEXT: ; implicit-def: $vgpr45 -; SI-NEXT: ; implicit-def: $vgpr50 ; SI-NEXT: ; implicit-def: $vgpr44 -; SI-NEXT: ; implicit-def: $vgpr39 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr42 +; SI-NEXT: ; implicit-def: $vgpr41 +; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr37 -; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr54 ; SI-NEXT: ; implicit-def: $vgpr35 -; SI-NEXT: ; implicit-def: $vgpr53 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr51 ; SI-NEXT: ; implicit-def: $vgpr33 -; SI-NEXT: ; implicit-def: $vgpr49 ; SI-NEXT: ; implicit-def: $vgpr32 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: s_branch .LBB49_2 ; ; VI-LABEL: bitcast_v15f64_to_v60i16_scalar: @@ -39540,41 +39727,44 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: v_mov_b32_e32 v39, v16 -; SI-NEXT: 
v_mov_b32_e32 v48, v14 -; SI-NEXT: v_mov_b32_e32 v49, v12 -; SI-NEXT: v_mov_b32_e32 v50, v10 -; SI-NEXT: v_mov_b32_e32 v51, v8 -; SI-NEXT: v_mov_b32_e32 v52, v6 -; SI-NEXT: v_mov_b32_e32 v53, v4 -; SI-NEXT: v_mov_b32_e32 v54, v2 -; SI-NEXT: v_mov_b32_e32 v55, v0 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_mov_b32_e32 v60, v16 +; SI-NEXT: v_mov_b32_e32 v53, v14 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_mov_b32_e32 v62, v12 +; SI-NEXT: v_mov_b32_e32 v32, v10 +; SI-NEXT: v_mov_b32_e32 v55, v8 +; SI-NEXT: v_mov_b32_e32 v37, v6 +; SI-NEXT: v_mov_b32_e32 v41, v4 +; SI-NEXT: v_mov_b32_e32 v44, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_mov_b32_e32 v63, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:4 ; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:16 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:12 ; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:24 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:32 -; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:28 ; SI-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:40 -; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 ; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:44 ; SI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v30, v28 -; SI-NEXT: v_mov_b32_e32 v33, v26 -; SI-NEXT: v_mov_b32_e32 v34, v24 -; SI-NEXT: v_mov_b32_e32 v35, v22 -; SI-NEXT: v_mov_b32_e32 v36, v20 -; SI-NEXT: v_mov_b32_e32 v37, v18 -; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v3 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v5 +; SI-NEXT: v_mov_b32_e32 v39, v26 +; SI-NEXT: v_mov_b32_e32 v48, v24 +; SI-NEXT: v_mov_b32_e32 v49, v22 +; SI-NEXT: v_mov_b32_e32 v47, v20 +; SI-NEXT: v_mov_b32_e32 v50, v18 +; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v3 +; SI-NEXT: v_lshlrev_b32_e32 v28, 16, v5 ; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v22, 16, v9 ; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v11 @@ -39585,8 +39775,8 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v21 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 ; SI-NEXT: v_lshlrev_b32_e32 v46, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v59, 16, v27 -; SI-NEXT: v_lshlrev_b32_e32 v58, 16, v29 +; SI-NEXT: v_lshlrev_b32_e32 v45, 16, v27 +; SI-NEXT: v_lshlrev_b32_e32 v43, 16, v29 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill @@ -39598,51 +39788,51 @@ define inreg 
<15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v2 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v4 +; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v4 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: v_lshlrev_b32_e32 v47, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v62, 16, v8 -; SI-NEXT: v_lshlrev_b32_e32 v61, 16, v10 -; SI-NEXT: v_lshlrev_b32_e32 v60, 16, v12 -; SI-NEXT: v_lshlrev_b32_e32 v31, 16, v14 +; SI-NEXT: v_lshlrev_b32_e32 v38, 16, v6 +; SI-NEXT: v_lshlrev_b32_e32 v36, 16, v8 +; SI-NEXT: v_lshlrev_b32_e32 v35, 16, v10 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v14 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_lshlrev_b32_e32 v63, 16, v16 +; SI-NEXT: v_lshlrev_b32_e32 v33, 16, v16 ; SI-NEXT: s_cbranch_scc0 .LBB51_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 -; SI-NEXT: v_or_b32_e32 v7, v0, v32 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 -; SI-NEXT: v_or_b32_e32 v9, v0, v26 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v63 +; SI-NEXT: v_or_b32_e32 v7, v0, v31 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 +; SI-NEXT: v_or_b32_e32 v9, v0, v28 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 ; SI-NEXT: v_or_b32_e32 v10, v0, v24 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v55 ; SI-NEXT: v_or_b32_e32 v11, v0, v22 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v32 ; SI-NEXT: v_or_b32_e32 v12, v0, v20 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v62 ; SI-NEXT: s_waitcnt expcnt(6) ; SI-NEXT: v_or_b32_e32 v13, v0, v13 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v53 ; SI-NEXT: v_or_b32_e32 v14, v0, v18 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v60 ; SI-NEXT: v_or_b32_e32 v15, v0, v15 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v37 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v50 ; SI-NEXT: v_or_b32_e32 v16, v0, v17 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v47 ; SI-NEXT: v_or_b32_e32 v17, v0, v5 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v49 ; SI-NEXT: v_or_b32_e32 v18, v0, v3 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v48 ; SI-NEXT: v_or_b32_e32 v19, v0, v46 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v33 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v39 ; SI-NEXT: s_waitcnt expcnt(5) -; SI-NEXT: v_or_b32_e32 v20, v0, v59 +; SI-NEXT: v_or_b32_e32 v20, v0, v45 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v30 -; SI-NEXT: v_or_b32_e32 v21, v0, v58 +; SI-NEXT: v_or_b32_e32 v21, v0, v43 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 @@ -39655,17 +39845,17 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: 
s_and_b32 s7, s22, 0xffff ; SI-NEXT: s_lshl_b32 s8, s23, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v44 ; SI-NEXT: s_or_b32 s7, s7, s8 ; SI-NEXT: s_and_b32 s8, s24, 0xffff ; SI-NEXT: s_lshl_b32 s9, s25, 16 +; SI-NEXT: v_or_b32_e32 v8, v1, v26 ; SI-NEXT: s_or_b32 s8, s8, s9 ; SI-NEXT: s_and_b32 s9, s26, 0xffff ; SI-NEXT: s_lshl_b32 s10, s27, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 ; SI-NEXT: s_or_b32 s9, s9, s10 ; SI-NEXT: s_and_b32 s10, s28, 0xffff ; SI-NEXT: s_lshl_b32 s11, s29, 16 -; SI-NEXT: v_or_b32_e32 v8, v1, v28 ; SI-NEXT: s_or_b32 s10, s10, s11 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 @@ -39676,30 +39866,30 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_or_b32_e32 v22, v0, v57 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v45 -; SI-NEXT: v_or_b32_e32 v23, v0, v56 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v44 +; SI-NEXT: v_or_b32_e32 v22, v0, v42 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v61 +; SI-NEXT: v_or_b32_e32 v23, v0, v40 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v52 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: v_or_b32_e32 v24, v0, v47 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v43 -; SI-NEXT: v_or_b32_e32 v25, v0, v62 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v42 +; SI-NEXT: v_or_b32_e32 v24, v0, v38 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v59 +; SI-NEXT: v_or_b32_e32 v25, v0, v36 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v58 ; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_or_b32_e32 v26, v0, v61 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v41 -; SI-NEXT: v_or_b32_e32 v27, v0, v60 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v40 +; SI-NEXT: v_or_b32_e32 v26, v0, v35 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v51 +; SI-NEXT: v_or_b32_e32 v27, v0, v34 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v57 ; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: v_or_b32_e32 v28, v0, v31 -; SI-NEXT: v_and_b32_e32 v0, 0xffff, v38 -; SI-NEXT: v_or_b32_e32 v29, v0, v63 +; SI-NEXT: v_or_b32_e32 v28, v0, v54 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v56 +; SI-NEXT: v_or_b32_e32 v29, v0, v33 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_cbranch_execnz .LBB51_3 ; SI-NEXT: .LBB51_2: ; %cmp.true ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v63 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff @@ -39742,119 +39932,119 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: v_mov_b32_e32 v6, s10 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v54 +; SI-NEXT: v_add_i32_e32 v1, vcc, 3, v44 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: v_add_i32_e32 v8, vcc, 0x30000, v1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v7, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; 
SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v9, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v10, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v55 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v11, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v32 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v12, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v62 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v13, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v53 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v14, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v60 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v15, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v37 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v50 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v16, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v36 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v47 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v17, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v35 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v49 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v1, v0 ; SI-NEXT: v_add_i32_e32 v18, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v34 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v48 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-NEXT: v_or_b32_e32 v0, v46, v0 ; SI-NEXT: v_add_i32_e32 v19, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v33 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v39 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v59, v0 +; SI-NEXT: v_or_b32_e32 v0, v45, v0 ; SI-NEXT: v_add_i32_e32 v20, vcc, 0x30000, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v30 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v58, v0 +; SI-NEXT: v_or_b32_e32 v0, v43, v0 ; SI-NEXT: 
v_add_i32_e32 v21, vcc, 0x30000, v0 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v0 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v57, v0 +; SI-NEXT: v_or_b32_e32 v0, v42, v0 ; SI-NEXT: v_add_i32_e32 v22, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v45 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v61 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v56, v0 +; SI-NEXT: v_or_b32_e32 v0, v40, v0 ; SI-NEXT: v_add_i32_e32 v23, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v44 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v52 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v47, v0 +; SI-NEXT: v_or_b32_e32 v0, v38, v0 ; SI-NEXT: v_add_i32_e32 v24, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v43 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v59 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v62, v0 +; SI-NEXT: v_or_b32_e32 v0, v36, v0 ; SI-NEXT: v_add_i32_e32 v25, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v42 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v58 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v61, v0 +; SI-NEXT: v_or_b32_e32 v0, v35, v0 ; SI-NEXT: v_add_i32_e32 v26, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v41 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v51 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v60, v0 +; SI-NEXT: v_or_b32_e32 v0, v34, v0 ; SI-NEXT: v_add_i32_e32 v27, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v40 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v57 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v31, v0 +; SI-NEXT: v_or_b32_e32 v0, v54, v0 ; SI-NEXT: v_add_i32_e32 v28, vcc, 0x30000, v0 -; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v38 +; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v56 ; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SI-NEXT: v_or_b32_e32 v0, v63, v0 +; SI-NEXT: v_or_b32_e32 v0, v33, v0 ; SI-NEXT: v_add_i32_e32 v29, vcc, 0x30000, v0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: .LBB51_3: ; %end @@ -39877,35 +40067,67 @@ define inreg <15 x double> @bitcast_v60i16_to_v15f64_scalar(<60 x i16> inreg %a, ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB51_4: -; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_mov_b32_e32 v46, v44 -; SI-NEXT: v_mov_b32_e32 v32, v30 -; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v59, v58 -; SI-NEXT: v_mov_b32_e32 v58, v57 -; SI-NEXT: v_mov_b32_e32 v57, v56 -; SI-NEXT: v_mov_b32_e32 v56, v47 -; SI-NEXT: v_mov_b32_e32 v47, v62 +; SI-NEXT: v_mov_b32_e32 v45, v43 +; SI-NEXT: v_mov_b32_e32 v44, v42 +; SI-NEXT: v_mov_b32_e32 v43, v41 +; SI-NEXT: v_mov_b32_e32 v42, v40 +; SI-NEXT: v_mov_b32_e32 v41, v38 +; SI-NEXT: v_mov_b32_e32 v40, v37 +; SI-NEXT: v_mov_b32_e32 v38, v36 +; SI-NEXT: v_mov_b32_e32 v37, v35 +; SI-NEXT: v_mov_b32_e32 v36, v55 +; SI-NEXT: v_mov_b32_e32 v55, v34 +; SI-NEXT: v_mov_b32_e32 v35, v54 +; SI-NEXT: v_mov_b32_e32 v54, v33 +; SI-NEXT: v_mov_b32_e32 v34, v32 +; SI-NEXT: v_mov_b32_e32 v33, v62 ; SI-NEXT: 
v_mov_b32_e32 v62, v60 -; SI-NEXT: v_mov_b32_e32 v60, v63 -; SI-NEXT: v_mov_b32_e32 v63, v61 -; SI-NEXT: v_mov_b32_e32 v61, v31 +; SI-NEXT: v_mov_b32_e32 v32, v63 +; SI-NEXT: v_mov_b32_e32 v63, v53 +; SI-NEXT: v_mov_b32_e32 v53, v61 +; SI-NEXT: v_mov_b32_e32 v61, v52 +; SI-NEXT: v_mov_b32_e32 v52, v59 +; SI-NEXT: v_mov_b32_e32 v59, v51 +; SI-NEXT: v_mov_b32_e32 v51, v57 +; SI-NEXT: v_mov_b32_e32 v57, v50 +; SI-NEXT: v_mov_b32_e32 v50, v47 +; SI-NEXT: v_mov_b32_e32 v47, v48 +; SI-NEXT: v_mov_b32_e32 v48, v30 ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 -; SI-NEXT: v_mov_b32_e32 v44, v46 -; SI-NEXT: v_mov_b32_e32 v31, v61 -; SI-NEXT: v_mov_b32_e32 v61, v63 -; SI-NEXT: v_mov_b32_e32 v63, v60 ; SI-NEXT: v_mov_b32_e32 v60, v62 -; SI-NEXT: v_mov_b32_e32 v62, v47 -; SI-NEXT: v_mov_b32_e32 v47, v56 -; SI-NEXT: v_mov_b32_e32 v56, v57 -; SI-NEXT: v_mov_b32_e32 v57, v58 -; SI-NEXT: v_mov_b32_e32 v58, v59 -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; SI-NEXT: v_mov_b32_e32 v30, v32 +; SI-NEXT: v_mov_b32_e32 v30, v48 +; SI-NEXT: v_mov_b32_e32 v48, v47 +; SI-NEXT: v_mov_b32_e32 v47, v50 +; SI-NEXT: v_mov_b32_e32 v50, v57 +; SI-NEXT: v_mov_b32_e32 v57, v51 +; SI-NEXT: v_mov_b32_e32 v51, v59 +; SI-NEXT: v_mov_b32_e32 v59, v52 +; SI-NEXT: v_mov_b32_e32 v52, v61 +; SI-NEXT: v_mov_b32_e32 v61, v53 +; SI-NEXT: v_mov_b32_e32 v53, v63 +; SI-NEXT: v_mov_b32_e32 v63, v32 +; SI-NEXT: v_mov_b32_e32 v62, v33 +; SI-NEXT: v_mov_b32_e32 v32, v34 +; SI-NEXT: v_mov_b32_e32 v33, v54 +; SI-NEXT: v_mov_b32_e32 v54, v35 +; SI-NEXT: v_mov_b32_e32 v34, v55 +; SI-NEXT: v_mov_b32_e32 v55, v36 +; SI-NEXT: v_mov_b32_e32 v35, v37 +; SI-NEXT: v_mov_b32_e32 v36, v38 +; SI-NEXT: v_mov_b32_e32 v37, v40 +; SI-NEXT: v_mov_b32_e32 v38, v41 +; SI-NEXT: v_mov_b32_e32 v40, v42 +; SI-NEXT: v_mov_b32_e32 v41, v43 +; SI-NEXT: v_mov_b32_e32 v42, v44 +; SI-NEXT: v_mov_b32_e32 v43, v45 +; SI-NEXT: v_mov_b32_e32 v44, v46 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: s_branch .LBB51_2 ; ; VI-LABEL: bitcast_v60i16_to_v15f64_scalar: @@ -40737,6 +40959,10 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: ; kill: killed $vgpr38 ; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 +; SI-NEXT: ; kill: killed $vgpr38 +; SI-NEXT: ; implicit-def: $vgpr38 ; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill @@ -40777,16 +41003,14 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr32 ; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr34 -; SI-NEXT: ; implicit-def: $vgpr59 ; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr63 +; SI-NEXT: ; implicit-def: $vgpr58 ; SI-NEXT: ; implicit-def: $vgpr60 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; kill: killed $vgpr38 +; 
SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr38 -; SI-NEXT: ; implicit-def: $vgpr58 -; SI-NEXT: ; kill: killed $vgpr58 ; SI-NEXT: s_waitcnt vmcnt(14) ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v31 ; SI-NEXT: ; implicit-def: $vgpr31 @@ -40824,55 +41048,53 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: ; kill: killed $vgpr31 ; SI-NEXT: ; implicit-def: $vgpr31 -; SI-NEXT: ; kill: killed $vgpr31 -; SI-NEXT: ; implicit-def: $vgpr31 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_2 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 -; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v55, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v32 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v28 -; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v32 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v6 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v5 -; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v33 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v52, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v41, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v31 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v37 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v34 ; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v19 -; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v18 -; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v17 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v15 -; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v13 -; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v12 -; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v10 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v9 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v8 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v2 +; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v20 +; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v19 +; SI-NEXT: v_lshrrev_b32_e32 v54, 16, v18 +; 
SI-NEXT: v_lshrrev_b32_e32 v40, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v42, 16, v16 +; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v14 +; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v13 +; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v12 +; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v9 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 @@ -40899,30 +41121,30 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v55, v2 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v50, v4 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v1, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v35 -; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v4, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v51 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v4, v36 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v6, v38 +; SI-NEXT: v_mov_b32_e32 v38, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v39 +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte 
Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -40931,33 +41153,32 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v31, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v53 -; SI-NEXT: v_cvt_f32_f16_e32 v36, v54 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v40 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v51, v47 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v57 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v60 -; SI-NEXT: v_mov_b32_e32 v60, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v62 -; SI-NEXT: v_mov_b32_e32 v62, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v44, v59 -; SI-NEXT: v_mov_b32_e32 v59, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v61 -; SI-NEXT: v_mov_b32_e32 v61, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v63 -; SI-NEXT: v_mov_b32_e32 v63, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v38 -; SI-NEXT: v_mov_b32_e32 v38, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v34, v51 +; SI-NEXT: v_cvt_f32_f16_e32 v36, v53 +; SI-NEXT: v_cvt_f32_f16_e32 v32, v54 +; SI-NEXT: v_cvt_f32_f16_e32 v33, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v35, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v37, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v39, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v49, v45 +; SI-NEXT: v_cvt_f32_f16_e32 v51, v46 +; SI-NEXT: v_cvt_f32_f16_e32 v53, v47 +; SI-NEXT: v_cvt_f32_f16_e32 v54, v56 +; SI-NEXT: v_cvt_f32_f16_e32 v40, v57 +; SI-NEXT: v_cvt_f32_f16_e32 v42, v58 +; SI-NEXT: v_mov_b32_e32 v58, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v60 +; SI-NEXT: v_mov_b32_e32 v60, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v44, v62 +; SI-NEXT: v_mov_b32_e32 v62, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v46, v61 +; SI-NEXT: v_mov_b32_e32 v61, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v63 +; SI-NEXT: v_mov_b32_e32 v63, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v48 -; SI-NEXT: v_mov_b32_e32 v48, v29 +; SI-NEXT: v_mov_b32_e32 v48, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v3 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $vgpr3 @@ -40978,32 +41199,24 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; SI-NEXT: s_cbranch_execz .LBB52_4 ; SI-NEXT: ; %bb.3: ; %cmp.true -; SI-NEXT: v_add_f64 v[32:33], v[1:2], 1.0 -; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v24 -; SI-NEXT: 
v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_add_f64 v[5:6], v[5:6], 1.0 ; SI-NEXT: v_add_f64 v[29:30], v[29:30], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v30 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v29 +; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 ; SI-NEXT: v_add_f64 v[49:50], v[3:4], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 ; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v36 -; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; SI-NEXT: v_lshrrev_b32_e32 v34, 16, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v34 +; SI-NEXT: v_add_f64 v[32:33], v[1:2], 1.0 +; SI-NEXT: v_add_f64 v[7:8], v[7:8], 1.0 ; SI-NEXT: v_add_f64 v[9:10], v[9:10], 1.0 ; SI-NEXT: v_add_f64 v[11:12], v[11:12], 1.0 ; SI-NEXT: v_add_f64 v[13:14], v[13:14], 1.0 @@ -41011,8 +41224,9 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_add_f64 v[17:18], v[17:18], 1.0 ; SI-NEXT: v_add_f64 v[19:20], v[19:20], 1.0 ; SI-NEXT: v_add_f64 v[21:22], v[21:22], 1.0 +; SI-NEXT: v_add_f64 v[23:24], v[23:24], 1.0 ; SI-NEXT: v_add_f64 v[25:26], v[25:26], 1.0 -; SI-NEXT: v_add_f64 v[27:28], v[27:28], 1.0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v32 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v49 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v50 ; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7 @@ -41032,15 +41246,16 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 ; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v22 ; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v25 -; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v28 +; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v24 +; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v25 +; SI-NEXT: v_lshrrev_b32_e32 v62, 16, v26 +; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; SI-NEXT: v_lshrrev_b32_e32 v38, 16, v30 ; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v30 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v29 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v30 +; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 ; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 @@ -41067,15 +41282,16 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v52, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v55, v33 ; SI-NEXT: v_cvt_f32_f16_e32 v41, v32 -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v6, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v62 ; SI-NEXT: v_cvt_f32_f16_e32 v60, 
v60 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v58 +; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v63 ; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 -; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v31, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v57 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v56 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v47 @@ -41095,22 +41311,22 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f32_f16_e32 v46, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v47, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 -; SI-NEXT: s_waitcnt vmcnt(4) ; SI-NEXT: v_cvt_f32_f16_e32 v57, v1 -; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v29, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v28, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v24, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v23, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill @@ -41119,7 +41335,7 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; SI-NEXT: 
buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; SI-NEXT: .LBB52_4: ; %end ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_cvt_f16_f32_e32 v1, v57 @@ -41238,7 +41454,7 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v37 ; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 @@ -41247,7 +41463,7 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v35 ; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 @@ -41256,7 +41472,7 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 ; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 @@ -41292,16 +41508,16 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v61 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 @@ -41310,7 +41526,7 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v63 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 @@ -41319,47 +41535,47 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f16_f32_e32 v1, v58 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v31 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v62 +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -41370,19 +41586,19 @@ define <60 x half> @bitcast_v15f64_to_v60f16(<15 x double> %a, i32 %b) { ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v2, 
v38 -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v48 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen @@ -42027,29 +42243,28 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_lshr_b32 s44, s5, 16 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: v_cvt_f32_f16_e32 v59, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v40, s44 ; SI-NEXT: s_lshr_b32 s44, s4, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v36, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v52, s44 ; SI-NEXT: s_lshr_b32 s44, s7, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 ; SI-NEXT: s_lshr_b32 s44, s6, 16 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: v_cvt_f32_f16_e32 v61, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v36, s44 ; SI-NEXT: s_lshr_b32 s44, s9, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v33, s44 ; SI-NEXT: s_lshr_b32 s44, s8, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v48, s44 +; SI-NEXT: s_waitcnt expcnt(4) +; SI-NEXT: v_cvt_f32_f16_e32 v59, s44 ; SI-NEXT: s_lshr_b32 s44, s11, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v25, s44 ; SI-NEXT: s_lshr_b32 s44, s10, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s44 ; SI-NEXT: s_lshr_b32 s44, s13, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v18, s44 ; SI-NEXT: s_lshr_b32 s44, s12, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v17, s44 ; SI-NEXT: s_lshr_b32 s44, s15, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v43, s44 +; SI-NEXT: v_cvt_f32_f16_e32 v14, s44 ; SI-NEXT: s_lshr_b32 s44, s14, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v22, s44 ; SI-NEXT: s_lshr_b32 s44, s41, 16 @@ -42087,15 +42302,17 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: s_lshr_b32 s44, s17, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v58, s44 ; SI-NEXT: s_lshr_b32 s44, s16, 16 +; SI-NEXT: s_waitcnt expcnt(3) ; SI-NEXT: v_cvt_f32_f16_e32 v60, s44 -; SI-NEXT: v_cvt_f32_f16_e32 v14, s5 -; SI-NEXT: v_cvt_f32_f16_e32 v18, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v19, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v29, s6 -; SI-NEXT: v_cvt_f32_f16_e32 v52, s9 -; SI-NEXT: v_cvt_f32_f16_e32 v40, s8 -; SI-NEXT: v_cvt_f32_f16_e32 v45, s11 -; SI-NEXT: v_cvt_f32_f16_e32 v47, s10 +; SI-NEXT: v_cvt_f32_f16_e32 v43, s9 +; SI-NEXT: v_cvt_f32_f16_e32 v45, s8 +; SI-NEXT: v_cvt_f32_f16_e32 v47, s11 +; SI-NEXT: s_waitcnt expcnt(2) +; SI-NEXT: v_cvt_f32_f16_e32 v61, s10 ; SI-NEXT: s_waitcnt expcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v62, s13 ; SI-NEXT: s_waitcnt expcnt(0) @@ -42146,9 +42363,9 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v53 ; SI-NEXT: v_add_f64 v[20:21], s[12:13], 1.0 ; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v15 ; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v20 -; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21 +; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v62, v21 ; SI-NEXT: v_cvt_f32_f16_e32 v63, v20 ; SI-NEXT: v_cvt_f32_f16_e32 v20, v58 @@ -42156,17 +42373,17 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v58, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v57, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 -; SI-NEXT: 
buffer_load_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v1, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; SI-NEXT: v_add_f64 v[37:38], s[26:27], 1.0 -; SI-NEXT: v_cvt_f32_f16_e32 v47, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v61, v15 ; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v38 ; SI-NEXT: v_lshrrev_b32_e32 v44, 16, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v15, v37 ; SI-NEXT: v_cvt_f32_f16_e32 v37, v55 ; SI-NEXT: v_add_f64 v[7:8], s[6:7], 1.0 ; SI-NEXT: v_add_f64 v[34:35], s[28:29], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v41 ; SI-NEXT: v_add_f64 v[30:31], s[42:43], 1.0 @@ -42174,7 +42391,7 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v35 ; SI-NEXT: v_lshrrev_b32_e32 v32, 16, v30 ; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v52, v12 +; SI-NEXT: v_cvt_f32_f16_e32 v43, v12 ; SI-NEXT: v_cvt_f32_f16_e32 v12, v30 ; SI-NEXT: v_cvt_f32_f16_e32 v30, v32 ; SI-NEXT: v_cvt_f32_f16_e32 v32, v39 @@ -42183,37 +42400,35 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v16 ; SI-NEXT: v_lshrrev_b32_e32 v46, 16, v50 ; SI-NEXT: v_lshrrev_b32_e32 v56, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v45, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v47, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v16, v49 ; SI-NEXT: v_cvt_f32_f16_e32 v49, v46 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_add_f64 v[3:4], s[4:5], 1.0 ; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v34 -; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v3 +; SI-NEXT: v_lshrrev_b32_e32 v52, 16, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v34 ; SI-NEXT: v_cvt_f32_f16_e32 v34, v51 ; SI-NEXT: v_cvt_f32_f16_e32 v51, v56 ; SI-NEXT: v_add_f64 v[26:27], s[40:41], 1.0 ; SI-NEXT: v_add_f64 v[22:23], s[14:15], 1.0 -; SI-NEXT: v_lshrrev_b32_e32 v48, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v26 -; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v22 -; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v14, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v5 ; SI-NEXT: v_lshrrev_b32_e32 v60, 16, v54 ; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v31 +; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v26 ; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v27 -; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v23 -; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v22 +; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v23 +; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v36, 16, v7 +; SI-NEXT: v_lshrrev_b32_e32 v40, 16, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v10, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v29, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v45, v11 ; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v22 ; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v26 ; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 ; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 ; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 @@ -42221,244 +42436,244 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v53 ; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 +; SI-NEXT: v_cvt_f32_f16_e32 v40, 
v40 +; SI-NEXT: v_cvt_f32_f16_e32 v52, v52 +; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 ; SI-NEXT: v_cvt_f32_f16_e32 v36, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v61, v61 ; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v43 +; SI-NEXT: v_cvt_f32_f16_e32 v59, v59 ; SI-NEXT: v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 -; SI-NEXT: v_mov_b32_e32 v59, v22 +; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 ; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v13 -; SI-NEXT: v_mov_b32_e32 v13, v26 +; SI-NEXT: v_cvt_f32_f16_e32 v14, v14 +; SI-NEXT: v_cvt_f32_f16_e32 v22, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v10 +; SI-NEXT: v_cvt_f32_f16_e32 v26, v9 ; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 ; SI-NEXT: v_cvt_f32_f16_e32 v53, v60 ; SI-NEXT: v_cvt_f32_f16_e32 v58, v58 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v55, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v55, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v41, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v41, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v44, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v44, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v46, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v46, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v56, v6 -; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v56, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v60, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v60, v5 ; SI-NEXT: .LBB53_3: ; %end -; SI-NEXT: v_cvt_f16_f32_e32 v6, v60 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v57 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v60 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v9, v57 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v58 -; SI-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v6, vcc, 4, v0 -; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; SI-NEXT: v_or_b32_e32 v9, v10, v9 -; SI-NEXT: buffer_store_dword v9, v6, s[0:3], 0 offen -; SI-NEXT: v_cvt_f16_f32_e32 v6, v56 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 +; SI-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v9, v21 -; SI-NEXT: v_add_i32_e32 v10, vcc, 8, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 4, v0 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v6, v5, 
s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v56 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v20 -; SI-NEXT: v_add_i32_e32 v10, vcc, 12, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v6, v21 +; SI-NEXT: v_add_i32_e32 v9, vcc, 8, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v8 -; SI-NEXT: v_add_i32_e32 v10, vcc, 16, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v20 +; SI-NEXT: v_add_i32_e32 v9, vcc, 12, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v42 -; SI-NEXT: v_add_i32_e32 v10, vcc, 20, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v44 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v8 +; SI-NEXT: v_add_i32_e32 v9, vcc, 16, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v55 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v7 -; SI-NEXT: v_add_i32_e32 v10, vcc, 24, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v42 +; SI-NEXT: v_add_i32_e32 v9, vcc, 20, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v9, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v54 -; SI-NEXT: v_add_i32_e32 v10, vcc, 28, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v7 +; SI-NEXT: v_add_i32_e32 v7, vcc, 24, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v51 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v16 -; SI-NEXT: v_add_i32_e32 v10, vcc, 32, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v53 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v54 +; SI-NEXT: v_add_i32_e32 v7, vcc, 28, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v49 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v50 -; SI-NEXT: v_add_i32_e32 v10, vcc, 36, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v51 +; SI-NEXT: 
v_cvt_f16_f32_e32 v6, v16 +; SI-NEXT: v_add_i32_e32 v7, vcc, 32, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v15 -; SI-NEXT: v_add_i32_e32 v10, vcc, 40, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v50 +; SI-NEXT: v_add_i32_e32 v7, vcc, 36, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v37 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v38 -; SI-NEXT: v_add_i32_e32 v10, vcc, 44, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v6, v9, v6 -; SI-NEXT: buffer_store_dword v6, v10, s[0:3], 0 offen +; SI-NEXT: v_cvt_f16_f32_e32 v5, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v15 +; SI-NEXT: v_add_i32_e32 v7, vcc, 40, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v34 -; SI-NEXT: v_add_i32_e32 v9, vcc, 48, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v37 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v38 +; SI-NEXT: v_add_i32_e32 v7, vcc, 44, v0 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v5, v6, v5 +; SI-NEXT: buffer_store_dword v5, v7, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v5, v34 +; SI-NEXT: v_add_i32_e32 v6, vcc, 48, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; SI-NEXT: v_or_b32_e32 v3, v3, v6 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v3, v3, v5 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v32 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v35 -; SI-NEXT: v_add_i32_e32 v9, vcc, 52, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v35 +; SI-NEXT: v_add_i32_e32 v6, vcc, 52, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v12 -; SI-NEXT: v_add_i32_e32 v9, vcc, 56, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v12 +; SI-NEXT: v_add_i32_e32 v6, vcc, 56, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v31 -; SI-NEXT: v_add_i32_e32 v9, vcc, 60, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v31 +; SI-NEXT: v_add_i32_e32 v6, vcc, 60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v11 -; SI-NEXT: v_add_i32_e32 v9, vcc, 64, v0 +; SI-NEXT: 
v_cvt_f16_f32_e32 v5, v11 +; SI-NEXT: v_add_i32_e32 v6, vcc, 64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v27 -; SI-NEXT: v_add_i32_e32 v9, vcc, 0x44, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v27 +; SI-NEXT: v_add_i32_e32 v6, vcc, 0x44, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_or_b32_e32 v3, v6, v3 -; SI-NEXT: buffer_store_dword v3, v9, s[0:3], 0 offen +; SI-NEXT: v_or_b32_e32 v3, v5, v3 +; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v22 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x48, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x48, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v23 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x4c, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x4c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v3, v17 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x50, v0 +; SI-NEXT: v_add_i32_e32 v5, vcc, 0x50, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 -; SI-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v18 ; SI-NEXT: v_cvt_f16_f32_e32 v4, v62 ; SI-NEXT: v_add_i32_e32 v5, vcc, 0x54, v0 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_or_b32_e32 v3, v4, v3 ; SI-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v61 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x58, v0 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v47 ; SI-NEXT: v_add_i32_e32 v4, vcc, 0x5c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v45 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v1, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v43 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: 
v_cvt_f16_f32_e32 v1, v61 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v29 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v48 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v19 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v13 ; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 ; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v59 -; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v40 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v1 @@ -42521,27 +42736,27 @@ define inreg <60 x half> @bitcast_v15f64_to_v60f16_scalar(<15 x double> inreg %a ; SI-NEXT: ; implicit-def: $vgpr4 ; SI-NEXT: ; implicit-def: $vgpr22 ; SI-NEXT: ; implicit-def: $vgpr23 -; SI-NEXT: ; implicit-def: $vgpr43 +; SI-NEXT: ; implicit-def: $vgpr14 ; SI-NEXT: ; implicit-def: $vgpr63 ; SI-NEXT: ; implicit-def: $vgpr17 ; SI-NEXT: ; implicit-def: $vgpr62 -; SI-NEXT: ; implicit-def: $vgpr5 -; SI-NEXT: ; implicit-def: $vgpr47 +; SI-NEXT: ; implicit-def: $vgpr18 +; SI-NEXT: ; implicit-def: $vgpr61 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr47 ; SI-NEXT: ; implicit-def: $vgpr25 -; SI-NEXT: ; implicit-def: $vgpr40 -; SI-NEXT: ; implicit-def: $vgpr48 -; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr45 +; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr43 ; SI-NEXT: ; implicit-def: $vgpr33 ; SI-NEXT: ; implicit-def: $vgpr29 -; SI-NEXT: ; implicit-def: $vgpr61 +; SI-NEXT: ; implicit-def: $vgpr36 ; SI-NEXT: ; implicit-def: $vgpr19 +; SI-NEXT: ; implicit-def: $vgpr48 ; SI-NEXT: ; implicit-def: $vgpr13 -; SI-NEXT: ; implicit-def: $vgpr18 -; SI-NEXT: ; implicit-def: $vgpr36 -; SI-NEXT: ; implicit-def: $vgpr14 -; SI-NEXT: ; implicit-def: $vgpr59 +; SI-NEXT: ; implicit-def: $vgpr52 +; SI-NEXT: ; implicit-def: $vgpr10 +; SI-NEXT: ; implicit-def: $vgpr40 ; SI-NEXT: s_branch .LBB53_2 ; ; VI-LABEL: bitcast_v15f64_to_v60f16_scalar: @@ -50964,566 +51179,651 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; SI-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 -; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 -; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:4 -; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:8 -; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:12 -; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:16 -; SI-NEXT: s_waitcnt expcnt(6) -; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:20 -; SI-NEXT: s_waitcnt 
expcnt(5) -; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 -; SI-NEXT: s_waitcnt expcnt(4) -; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; SI-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:28 +; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:32 +; SI-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 +; SI-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:48 +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:4 +; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 +; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:40 +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:52 +; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:56 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: s_waitcnt expcnt(3) -; SI-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:32 -; SI-NEXT: s_waitcnt expcnt(2) -; SI-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:36 -; SI-NEXT: s_waitcnt expcnt(1) -; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:40 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v30 +; SI-NEXT: v_cvt_f16_f32_e32 v30, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v24 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v18 +; SI-NEXT: v_cvt_f16_f32_e32 v54, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v22 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v36, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v26 +; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v62, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v23, s19 +; SI-NEXT: v_cvt_f16_f32_e32 v19, s23 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v10 +; SI-NEXT: v_cvt_f16_f32_e32 v42, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v26, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v40, v8 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v11 +; SI-NEXT: v_cvt_f16_f32_e32 v22, v9 +; SI-NEXT: v_cvt_f16_f32_e32 v47, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v18, v13 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:44 -; SI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 -; SI-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:52 -; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:56 -; SI-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:60 -; SI-NEXT: v_cvt_f16_f32_e32 v51, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 -; SI-NEXT: 
v_cvt_f16_f32_e32 v56, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v6 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f16_f32_e32 v42, v7 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v14 +; SI-NEXT: v_cvt_f16_f32_e32 v52, v16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, v17 +; SI-NEXT: v_cvt_f16_f32_e32 v14, v21 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v1, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v38, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v14 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v19 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v20, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v24 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v25 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v29, s22 -; SI-NEXT: v_cvt_f16_f32_e32 v30, s23 -; SI-NEXT: v_cvt_f16_f32_e32 v26, s26 -; SI-NEXT: v_cvt_f16_f32_e32 v27, s27 -; SI-NEXT: v_cvt_f16_f32_e32 v28, s28 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v15 +; SI-NEXT: v_cvt_f16_f32_e32 v12, v25 +; SI-NEXT: v_cvt_f16_f32_e32 v8, v28 +; SI-NEXT: v_cvt_f16_f32_e32 v10, v29 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f16_f32_e32 v58, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v28, s26 ; SI-NEXT: s_waitcnt vmcnt(14) -; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v23 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v44 -; SI-NEXT: v_cvt_f16_f32_e32 v9, v45 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v32 +; SI-NEXT: v_cvt_f16_f32_e32 v15, v33 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v34 +; SI-NEXT: v_cvt_f16_f32_e32 v11, v35 +; SI-NEXT: v_cvt_f16_f32_e32 v59, v38 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v39 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v39, v50 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v49 +; SI-NEXT: v_cvt_f16_f32_e32 v34, s24 +; SI-NEXT: v_cvt_f16_f32_e32 v49, s27 +; SI-NEXT: v_cvt_f16_f32_e32 v32, s28 +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v50, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v57, v53 +; SI-NEXT: s_waitcnt vmcnt(14) +; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v41 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v43 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v44 ; SI-NEXT: s_waitcnt vmcnt(13) -; SI-NEXT: v_cvt_f16_f32_e32 v10, v46 -; SI-NEXT: s_waitcnt vmcnt(12) -; SI-NEXT: v_cvt_f16_f32_e32 v15, v47 -; SI-NEXT: s_waitcnt vmcnt(11) -; SI-NEXT: v_cvt_f16_f32_e32 v23, v57 -; 
SI-NEXT: s_waitcnt vmcnt(10) -; SI-NEXT: v_cvt_f16_f32_e32 v6, v58 -; SI-NEXT: s_waitcnt vmcnt(9) -; SI-NEXT: v_cvt_f16_f32_e32 v7, v59 -; SI-NEXT: s_waitcnt vmcnt(8) -; SI-NEXT: v_cvt_f16_f32_e32 v14, v60 -; SI-NEXT: s_waitcnt vmcnt(7) -; SI-NEXT: v_cvt_f16_f32_e32 v19, v61 -; SI-NEXT: s_waitcnt vmcnt(6) -; SI-NEXT: v_cvt_f16_f32_e32 v3, v62 -; SI-NEXT: s_waitcnt vmcnt(4) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v63 -; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f16_f32_e32 v36, v39 -; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: v_cvt_f16_f32_e32 v62, v50 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: s_waitcnt vmcnt(2) expcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v2, v53 -; SI-NEXT: v_cvt_f16_f32_e32 v47, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v60, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v44, s18 -; SI-NEXT: v_cvt_f16_f32_e32 v45, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v46, s20 -; SI-NEXT: v_cvt_f16_f32_e32 v59, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v43, s24 -; SI-NEXT: v_cvt_f16_f32_e32 v58, s25 -; SI-NEXT: v_cvt_f16_f32_e32 v57, s29 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v45 +; SI-NEXT: v_cvt_f16_f32_e32 v35, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v38, s21 +; SI-NEXT: v_cvt_f16_f32_e32 v33, s22 +; SI-NEXT: v_cvt_f16_f32_e32 v44, s29 ; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cbranch_scc0 .LBB59_2 +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt vmcnt(14) expcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v46 +; SI-NEXT: v_cvt_f16_f32_e32 v46, s25 +; SI-NEXT: s_cbranch_scc0 .LBB59_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_branch .LBB59_3 -; SI-NEXT: .LBB59_2: -; SI-NEXT: s_mov_b64 s[4:5], -1 -; SI-NEXT: .LBB59_3: ; %Flow -; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; SI-NEXT: v_mov_b32_e32 v61, v14 -; SI-NEXT: v_mov_b32_e32 v63, v15 -; SI-NEXT: v_mov_b32_e32 v15, v18 -; SI-NEXT: v_mov_b32_e32 v18, v22 -; SI-NEXT: v_mov_b32_e32 v22, v33 -; SI-NEXT: v_mov_b32_e32 v33, v11 -; SI-NEXT: v_mov_b32_e32 v11, v8 -; SI-NEXT: v_mov_b32_e32 v8, v5 -; SI-NEXT: v_mov_b32_e32 v5, v42 -; SI-NEXT: v_mov_b32_e32 v42, v1 -; SI-NEXT: s_cbranch_vccnz .LBB59_5 -; SI-NEXT: ; %bb.4: ; %cmp.true -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: s_cbranch_execnz .LBB59_3 +; SI-NEXT: .LBB59_2: ; %cmp.true +; SI-NEXT: v_cvt_f32_f16_e32 v5, v23 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v35 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v58 +; SI-NEXT: v_mov_b32_e32 v43, v34 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v23, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v23 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v35, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v19 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v33 +; SI-NEXT: v_lshlrev_b32_e32 v34, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v19, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v19 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_or_b32_e32 v33, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v49 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v28 +; SI-NEXT: 
v_mov_b32_e32 v41, v32 +; SI-NEXT: v_lshlrev_b32_e32 v32, 16, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v49, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v46 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v49 +; SI-NEXT: v_mov_b32_e32 v53, v27 +; SI-NEXT: v_or_b32_e32 v28, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v30 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_mov_b32_e32 v38, v55 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v62 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v10, v10 +; SI-NEXT: v_lshlrev_b32_e32 v29, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v42 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_cvt_f32_f16_e32 v9, v9 -; SI-NEXT: v_cvt_f32_f16_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 -; SI-NEXT: v_cvt_f32_f16_e32 v12, v12 -; SI-NEXT: v_cvt_f32_f16_e32 v17, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v40 +; SI-NEXT: v_mov_b32_e32 v40, v54 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v21, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v47 +; SI-NEXT: v_lshr_b64 v[46:47], v[27:28], 16 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v17, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v52 +; SI-NEXT: v_mov_b32_e32 v52, v15 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v30, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v26 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 -; SI-NEXT: v_add_f32_e32 v7, 0x38000000, v7 -; SI-NEXT: v_cvt_f32_f16_e32 v16, v16 -; SI-NEXT: v_cvt_f32_f16_e32 v21, v21 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 -; SI-NEXT: v_add_f32_e32 v6, 0x38000000, v6 -; SI-NEXT: v_add_f32_e32 v10, 0x38000000, v10 -; SI-NEXT: v_cvt_f32_f16_e32 v20, v20 -; SI-NEXT: v_cvt_f32_f16_e32 v32, v32 -; SI-NEXT: v_or_b32_e32 v62, v1, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 -; SI-NEXT: v_add_f32_e32 v9, 0x38000000, v9 -; SI-NEXT: 
v_add_f32_e32 v13, 0x38000000, v13 -; SI-NEXT: v_cvt_f32_f16_e32 v31, v31 -; SI-NEXT: v_cvt_f32_f16_e32 v35, v35 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_cvt_f16_f32_e32 v9, v9 -; SI-NEXT: v_cvt_f16_f32_e32 v13, v13 -; SI-NEXT: v_add_f32_e32 v12, 0x38000000, v12 -; SI-NEXT: v_add_f32_e32 v17, 0x38000000, v17 -; SI-NEXT: v_cvt_f32_f16_e32 v34, v34 -; SI-NEXT: v_cvt_f32_f16_e32 v38, v38 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v4 -; SI-NEXT: v_cvt_f16_f32_e32 v12, v12 -; SI-NEXT: v_cvt_f16_f32_e32 v17, v17 -; SI-NEXT: v_add_f32_e32 v16, 0x38000000, v16 -; SI-NEXT: v_add_f32_e32 v21, 0x38000000, v21 -; SI-NEXT: v_cvt_f32_f16_e32 v37, v37 -; SI-NEXT: v_or_b32_e32 v3, v3, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v7 -; SI-NEXT: v_cvt_f16_f32_e32 v16, v16 -; SI-NEXT: v_cvt_f16_f32_e32 v21, v21 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v26, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v22 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v22, v3, v5 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v3, v18 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; SI-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; SI-NEXT: v_or_b32_e32 v18, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v37 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_mov_b32_e32 v51, v11 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v55, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v55 +; SI-NEXT: v_or_b32_e32 v16, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v63 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v14 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v37, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v37 +; SI-NEXT: v_or_b32_e32 v14, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v61 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v63, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v63 +; SI-NEXT: v_or_b32_e32 v12, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v60 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v10 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v61, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v61 +; SI-NEXT: v_or_b32_e32 v10, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v7 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v57 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v7, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; SI-NEXT: v_or_b32_e32 v57, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v59 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_lshr_b64 v[58:59], v[34:35], 16 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v50 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v8 +; SI-NEXT: v_mov_b32_e32 v8, v48 +; SI-NEXT: v_cvt_f16_f32_e32 v48, v5 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v48 +; SI-NEXT: v_or_b32_e32 v6, v3, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v4 +; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v1 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cvt_f32_f16_e32 v1, v44 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v31 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v60, v4 +; SI-NEXT: v_mov_b32_e32 v59, v48 +; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v38 +; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v60 +; SI-NEXT: v_or_b32_e32 v4, v3, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v38, v43 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshr_b64 v[47:48], v[17:18], 16 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshr_b64 v[44:45], v[29:30], 16 +; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v24 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v39 +; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v31, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_lshlrev_b32_e32 v20, 16, v31 +; SI-NEXT: v_or_b32_e32 v2, v2, v20 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v62 +; SI-NEXT: v_or_b32_e32 v62, v24, v32 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 ; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 -; SI-NEXT: v_add_f32_e32 v32, 0x38000000, v32 -; SI-NEXT: v_or_b32_e32 v6, v6, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 -; SI-NEXT: v_cvt_f16_f32_e32 v32, v32 -; SI-NEXT: v_add_f32_e32 v31, 0x38000000, v31 -; SI-NEXT: v_add_f32_e32 v35, 0x38000000, v35 -; 
SI-NEXT: v_or_b32_e32 v9, v9, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v31, v31 -; SI-NEXT: v_cvt_f16_f32_e32 v35, v35 -; SI-NEXT: v_add_f32_e32 v34, 0x38000000, v34 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v42, v24, v27 +; SI-NEXT: v_mov_b32_e32 v48, v62 +; SI-NEXT: v_or_b32_e32 v50, v20, v34 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v41 +; SI-NEXT: v_mov_b32_e32 v34, v42 +; SI-NEXT: v_lshr_b64 v[42:43], v[25:26], 16 +; SI-NEXT: v_mov_b32_e32 v62, v50 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v41, v20, v29 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 -; SI-NEXT: v_or_b32_e32 v12, v12, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v17 -; SI-NEXT: v_cvt_f16_f32_e32 v34, v34 ; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 -; SI-NEXT: v_add_f32_e32 v37, 0x38000000, v37 -; SI-NEXT: v_or_b32_e32 v16, v16, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v21 -; SI-NEXT: v_cvt_f16_f32_e32 v37, v37 -; SI-NEXT: v_or_b32_e32 v20, v20, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v32 -; SI-NEXT: v_or_b32_e32 v31, v31, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v35 -; SI-NEXT: v_or_b32_e32 v34, v34, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v38 -; SI-NEXT: v_or_b32_e32 v37, v37, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v48, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v51 -; SI-NEXT: v_cvt_f32_f16_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v49, v39 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v48 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v26, v26 -; SI-NEXT: v_lshlrev_b32_e32 v48, 16, v49 -; SI-NEXT: v_cvt_f32_f16_e32 v30, v30 -; SI-NEXT: v_or_b32_e32 v48, v39, v48 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v52 -; SI-NEXT: v_cvt_f32_f16_e32 v29, v29 -; SI-NEXT: v_add_f32_e32 v27, 0x38000000, v27 -; SI-NEXT: v_cvt_f16_f32_e32 v27, v27 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v52, v39 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v26, 0x38000000, v26 -; SI-NEXT: v_add_f32_e32 v30, 0x38000000, v30 -; SI-NEXT: v_cvt_f16_f32_e32 v26, v26 -; SI-NEXT: v_cvt_f16_f32_e32 v30, v30 -; SI-NEXT: v_add_f32_e32 v29, 0x38000000, v29 -; SI-NEXT: v_cvt_f16_f32_e32 v29, v29 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v52 -; SI-NEXT: v_or_b32_e32 v51, v39, v50 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v27 -; SI-NEXT: v_or_b32_e32 v26, v26, v39 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v30 -; SI-NEXT: v_or_b32_e32 v29, v29, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v45 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v44 -; SI-NEXT: v_cvt_f32_f16_e32 v53, v59 -; SI-NEXT: v_cvt_f32_f16_e32 v56, v56 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 -; SI-NEXT: v_cvt_f16_f32_e32 v45, v39 -; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v53, 0x38000000, v53 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v45 -; SI-NEXT: v_cvt_f16_f32_e32 v53, v53 -; SI-NEXT: v_or_b32_e32 v44, v39, v50 -; SI-NEXT: v_cvt_f32_f16_e32 v39, v60 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v47 -; SI-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; SI-NEXT: v_add_f32_e32 v56, 0x38000000, v56 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v24, v8 +; SI-NEXT: v_or_b32_e32 v8, v38, v25 +; 
SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v20, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 +; SI-NEXT: v_or_b32_e32 v8, v24, v21 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v20, v17 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; SI-NEXT: v_cvt_f32_f16_e32 v24, v40 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v36 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v38, v15 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_cvt_f32_f16_e32 v38, v53 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: v_or_b32_e32 v54, v24, v13 +; SI-NEXT: v_cvt_f32_f16_e32 v24, v52 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v52, v38, v9 +; SI-NEXT: v_mov_b32_e32 v27, v52 +; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 +; SI-NEXT: v_or_b32_e32 v36, v20, v11 +; SI-NEXT: v_cvt_f32_f16_e32 v20, v51 +; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 +; SI-NEXT: v_lshr_b64 v[52:53], v[15:16], 16 +; SI-NEXT: v_add_f32_e32 v20, 0x38000000, v20 +; SI-NEXT: v_cvt_f16_f32_e32 v20, v20 +; SI-NEXT: v_or_b32_e32 v51, v24, v56 +; SI-NEXT: v_mov_b32_e32 v15, v51 +; SI-NEXT: v_lshr_b64 v[50:51], v[13:14], 16 +; SI-NEXT: v_or_b32_e32 v24, v20, v5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v38, v8 +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_add_f32_e32 v38, 0x38000000, v38 +; SI-NEXT: v_cvt_f16_f32_e32 v38, v38 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v39, v8 +; SI-NEXT: v_or_b32_e32 v8, v38, v3 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; SI-NEXT: v_add_f32_e32 v39, 0x38000000, v39 ; SI-NEXT: v_cvt_f16_f32_e32 v39, v39 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v56, v56 -; SI-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v23, v23 -; SI-NEXT: v_or_b32_e32 v47, v50, v39 -; SI-NEXT: v_cvt_f32_f16_e32 v50, v46 -; SI-NEXT: v_cvt_f32_f16_e32 v46, v58 -; SI-NEXT: v_lshlrev_b32_e32 v56, 16, v56 -; SI-NEXT: v_add_f32_e32 v23, 0x38000000, v23 -; SI-NEXT: v_add_f32_e32 v50, 0x38000000, v50 -; SI-NEXT: v_add_f32_e32 v46, 0x38000000, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v50, v50 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v46 -; SI-NEXT: v_cvt_f16_f32_e32 v23, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v57, v57 -; SI-NEXT: v_or_b32_e32 v46, v50, v53 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v58 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v8, v39, v1 +; SI-NEXT: 
v_lshr_b64 v[38:39], v[32:33], 16 +; SI-NEXT: v_mov_b32_e32 v32, v41 +; SI-NEXT: v_lshr_b64 v[40:41], v[21:22], 16 +; SI-NEXT: v_lshr_b64 v[20:21], v[11:12], 16 +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[56:57], 16 +; SI-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; SI-NEXT: v_mov_b32_e32 v11, v24 +; SI-NEXT: v_lshr_b64 v[8:9], v[9:10], 16 +; SI-NEXT: v_mov_b32_e32 v39, v31 +; SI-NEXT: v_mov_b32_e32 v31, v60 +; SI-NEXT: v_mov_b32_e32 v60, v61 +; SI-NEXT: v_mov_b32_e32 v61, v63 +; SI-NEXT: v_mov_b32_e32 v63, v37 +; SI-NEXT: v_mov_b32_e32 v37, v55 +; SI-NEXT: v_lshr_b64 v[55:56], v[5:6], 16 +; SI-NEXT: v_lshr_b64 v[24:25], v[3:4], 16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshr_b64 v[20:21], v[1:2], 16 +; SI-NEXT: .LBB59_3: ; %end +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v58 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v62 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v35 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v23 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 4, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v38 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v48 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v33 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v19 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v46 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v34 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v28 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v49 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 20, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v44 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v32 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 24, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v30 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 28, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v42 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 32, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; 
SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v26 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 36, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v40 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; SI-NEXT: v_or_b32_e32 v1, v3, v1 +; SI-NEXT: v_add_i32_e32 v3, vcc, 40, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v47 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 52, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v52 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 56, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v16 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v37 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v54 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v50 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v14 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v63 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x44, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v36 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v58, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v42, v42 -; SI-NEXT: v_cvt_f32_f16_e32 v41, v41 -; SI-NEXT: v_cvt_f32_f16_e32 v40, v40 -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_cvt_f32_f16_e32 v55, v55 -; SI-NEXT: v_cvt_f32_f16_e32 v54, v54 -; SI-NEXT: 
v_cvt_f32_f16_e32 v25, v25 -; SI-NEXT: v_or_b32_e32 v14, v58, v56 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v63 -; SI-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; SI-NEXT: v_cvt_f32_f16_e32 v24, v24 -; SI-NEXT: v_cvt_f32_f16_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x48, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v14, v36 -; SI-NEXT: v_cvt_f32_f16_e32 v43, v43 -; SI-NEXT: v_cvt_f32_f16_e32 v28, v28 -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: v_cvt_f32_f16_e32 v33, v33 -; SI-NEXT: v_cvt_f32_f16_e32 v22, v22 -; SI-NEXT: v_cvt_f32_f16_e32 v18, v18 -; SI-NEXT: v_cvt_f32_f16_e32 v15, v15 -; SI-NEXT: v_or_b32_e32 v63, v58, v23 -; SI-NEXT: v_cvt_f32_f16_e32 v58, v61 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v12 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v61 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x4c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v27 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x50, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v10 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v60 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x54, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v15 ; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: v_cvt_f32_f16_e32 v36, v1 -; SI-NEXT: v_add_f32_e32 v57, 0x38000000, v57 -; SI-NEXT: v_add_f32_e32 v42, 0x38000000, v42 -; SI-NEXT: v_add_f32_e32 v41, 0x38000000, v41 -; SI-NEXT: v_add_f32_e32 v40, 0x38000000, v40 -; SI-NEXT: v_add_f32_e32 v55, 0x38000000, v55 -; SI-NEXT: v_add_f32_e32 v54, 0x38000000, v54 -; SI-NEXT: v_add_f32_e32 v25, 0x38000000, v25 -; SI-NEXT: v_add_f32_e32 v24, 0x38000000, v24 -; SI-NEXT: v_add_f32_e32 v19, 0x38000000, v19 -; SI-NEXT: v_add_f32_e32 v14, 0x38000000, v14 -; SI-NEXT: v_add_f32_e32 v43, 0x38000000, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v57, v57 -; SI-NEXT: v_add_f32_e32 v28, 0x38000000, v28 -; SI-NEXT: v_cvt_f16_f32_e32 v42, v42 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v41, v41 -; SI-NEXT: v_add_f32_e32 v8, 0x38000000, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v40, v40 -; SI-NEXT: v_add_f32_e32 v11, 0x38000000, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v55, v55 -; SI-NEXT: v_add_f32_e32 v33, 0x38000000, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v54, v54 -; SI-NEXT: v_add_f32_e32 v22, 0x38000000, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v25, v25 -; SI-NEXT: v_add_f32_e32 v18, 0x38000000, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v24, v24 -; SI-NEXT: v_add_f32_e32 v15, 0x38000000, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v19, v19 -; SI-NEXT: v_add_f32_e32 v58, 0x38000000, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v14, v14 -; SI-NEXT: v_add_f32_e32 v36, 0x38000000, v36 -; SI-NEXT: v_cvt_f16_f32_e32 v43, v43 -; SI-NEXT: v_cvt_f16_f32_e32 v28, v28 
-; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 -; SI-NEXT: v_cvt_f16_f32_e32 v11, v11 -; SI-NEXT: v_cvt_f16_f32_e32 v33, v33 -; SI-NEXT: v_cvt_f16_f32_e32 v22, v22 -; SI-NEXT: v_cvt_f16_f32_e32 v18, v18 -; SI-NEXT: v_cvt_f16_f32_e32 v15, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v58, v58 -; SI-NEXT: v_cvt_f16_f32_e32 v36, v36 -; SI-NEXT: v_lshlrev_b32_e32 v57, 16, v57 -; SI-NEXT: v_lshlrev_b32_e32 v42, 16, v42 -; SI-NEXT: v_lshlrev_b32_e32 v41, 16, v41 -; SI-NEXT: v_lshlrev_b32_e32 v40, 16, v40 -; SI-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; SI-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; SI-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; SI-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; SI-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; SI-NEXT: v_or_b32_e32 v43, v43, v50 -; SI-NEXT: v_or_b32_e32 v28, v28, v57 -; SI-NEXT: v_or_b32_e32 v5, v5, v42 -; SI-NEXT: v_or_b32_e32 v8, v8, v41 -; SI-NEXT: v_or_b32_e32 v11, v11, v40 -; SI-NEXT: v_or_b32_e32 v33, v33, v55 -; SI-NEXT: v_or_b32_e32 v22, v22, v54 -; SI-NEXT: v_or_b32_e32 v18, v18, v25 -; SI-NEXT: v_or_b32_e32 v15, v15, v24 -; SI-NEXT: v_or_b32_e32 v61, v58, v19 -; SI-NEXT: v_or_b32_e32 v1, v36, v14 -; SI-NEXT: v_alignbit_b32 v60, v44, v39, 16 -; SI-NEXT: v_alignbit_b32 v59, v29, v53, 16 -; SI-NEXT: v_alignbit_b32 v58, v26, v50, 16 -; SI-NEXT: v_alignbit_b32 v57, v51, v57, 16 -; SI-NEXT: v_alignbit_b32 v56, v48, v56, 16 -; SI-NEXT: v_alignbit_b32 v42, v37, v42, 16 -; SI-NEXT: v_alignbit_b32 v41, v34, v41, 16 -; SI-NEXT: v_alignbit_b32 v40, v31, v40, 16 -; SI-NEXT: v_alignbit_b32 v55, v20, v55, 16 -; SI-NEXT: v_alignbit_b32 v54, v16, v54, 16 -; SI-NEXT: v_alignbit_b32 v25, v12, v25, 16 -; SI-NEXT: v_alignbit_b32 v24, v9, v24, 16 -; SI-NEXT: v_alignbit_b32 v23, v6, v23, 16 -; SI-NEXT: v_alignbit_b32 v19, v3, v19, 16 -; SI-NEXT: v_alignbit_b32 v36, v62, v14, 16 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; SI-NEXT: .LBB59_5: ; %end -; SI-NEXT: v_and_b32_e32 v39, 0xffff, v47 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v60 -; SI-NEXT: v_or_b32_e32 v39, v39, v50 -; SI-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v39, 0xffff, v44 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v45 -; SI-NEXT: v_or_b32_e32 v39, v39, v50 -; SI-NEXT: v_add_i32_e32 v50, vcc, 4, v0 -; SI-NEXT: buffer_store_dword v39, v50, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v39, 0xffff, v46 -; SI-NEXT: v_lshlrev_b32_e32 v50, 16, v59 -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; SI-NEXT: v_or_b32_e32 v39, v39, v50 -; SI-NEXT: v_add_i32_e32 v50, vcc, 8, v0 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 12, v0 -; SI-NEXT: buffer_store_dword v39, v50, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v29, 0xffff, v43 -; SI-NEXT: v_lshlrev_b32_e32 v30, 16, v58 -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; SI-NEXT: v_or_b32_e32 v29, v29, v30 -; SI-NEXT: v_add_i32_e32 v30, vcc, 16, v0 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 20, v0 -; SI-NEXT: buffer_store_dword v29, v30, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v28 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v57 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; 
SI-NEXT: v_add_i32_e32 v27, vcc, 24, v0 -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v51 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v52 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 28, v0 -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v56 -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x6c, v0 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v14 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 32, v0 -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v26, 0xffff, v48 -; SI-NEXT: v_lshlrev_b32_e32 v27, 16, v49 -; SI-NEXT: v_or_b32_e32 v26, v26, v27 -; SI-NEXT: v_add_i32_e32 v27, vcc, 36, v0 -; SI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v42 -; SI-NEXT: v_or_b32_e32 v5, v5, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 40, v0 -; SI-NEXT: buffer_store_dword v5, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v37 -; SI-NEXT: v_lshlrev_b32_e32 v26, 16, v38 -; SI-NEXT: v_or_b32_e32 v5, v5, v26 -; SI-NEXT: v_add_i32_e32 v26, vcc, 44, v0 -; SI-NEXT: buffer_store_dword v5, v26, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v8 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v41 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 48, v0 -; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v34 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v35 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 52, v0 -; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v11 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v40 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 56, v0 -; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v31 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v32 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 60, v0 -; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v33 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v55 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 64, v0 -; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v20 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v21 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x44, v0 -; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v22 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v54 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x48, v0 -; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v16 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v17 -; SI-NEXT: 
v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x4c, v0 -; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v18 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v25 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x50, v0 -; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v12 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x54, v0 -; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v15 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v24 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x58, v0 -; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v9 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x5c, v0 -; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v63 -; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v23 -; SI-NEXT: v_or_b32_e32 v5, v5, v8 -; SI-NEXT: v_add_i32_e32 v8, vcc, 0x60, v0 -; SI-NEXT: buffer_store_dword v5, v8, s[0:3], 0 offen -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x64, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x58, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_and_b32_e32 v5, 0xffff, v61 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v19 -; SI-NEXT: v_or_b32_e32 v5, v5, v6 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0x68, v0 -; SI-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v36 -; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1 -; SI-NEXT: v_and_b32_e32 v1, 0xffff, v62 -; SI-NEXT: v_or_b32_e32 v3, v3, v4 -; SI-NEXT: v_add_i32_e32 v4, vcc, 0x70, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v57 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x5c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v11 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v55 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x60, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v6 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v59 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x64, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v24 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x68, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) 
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v31 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x6c, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v20 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, 0x70, v0 +; SI-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v39 ; SI-NEXT: v_or_b32_e32 v1, v1, v2 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x74, v0 -; SI-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; SI-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; SI-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload @@ -51543,6 +51843,8 @@ define inreg <60 x i16> @bitcast_v60f16_to_v60i16_scalar(<60 x half> inreg %a, i ; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: .LBB59_4: +; SI-NEXT: s_branch .LBB59_2 ; ; VI-LABEL: bitcast_v60f16_to_v60i16_scalar: ; VI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 4aded5da3668a..685e2fbdecfad 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -632,51 +632,53 @@ define inreg <12 x i8> @bitcast_v3i32_to_v12i8_scalar(<3 x i32> inreg %a, i32 in ; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB5_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v11, s4, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s4, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s4, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s6, s17, 24 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_lshr_b32 s19, s17, 24 +; SI-NEXT: s_lshr_b32 s22, s17, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 8 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB5_3 ; SI-NEXT: .LBB5_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 -; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v11, s4, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s4, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s4, v0, 8 -; SI-NEXT: s_lshr_b32 s6, s17, 24 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_add_i32 s18, s18, 3 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b32 s19, s17, 24 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[12:13], s[16:17], 8 +; SI-NEXT: s_lshr_b32 
s22, s17, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 8 +; SI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 8 ; SI-NEXT: .LBB5_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s4 ; SI-NEXT: v_mov_b32_e32 v4, s17 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v6, s7 -; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v5, s23 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s19 ; SI-NEXT: v_mov_b32_e32 v8, s18 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v10, s10 +; SI-NEXT: v_mov_b32_e32 v11, s6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB5_4: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 +; SI-NEXT: ; implicit-def: $sgpr12 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 ; SI-NEXT: s_branch .LBB5_2 ; ; VI-LABEL: bitcast_v3i32_to_v12i8_scalar: @@ -3133,31 +3135,29 @@ define inreg <6 x i16> @bitcast_v3i32_to_v6i16_scalar(<3 x i32> inreg %a, i32 in ; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB17_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s4, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB17_3 ; SI-NEXT: .LBB17_2: ; %cmp.true -; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_add_i32 s18, s18, 3 ; SI-NEXT: s_add_i32 s17, s17, 3 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s4, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_add_i32 s16, s16, 3 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 ; SI-NEXT: .LBB17_3: ; %end ; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: v_mov_b32_e32 v2, s17 -; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v3, s10 ; SI-NEXT: v_mov_b32_e32 v4, s18 +; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB17_4: -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB17_2 ; ; VI-LABEL: bitcast_v3i32_to_v6i16_scalar: @@ -3762,50 +3762,59 @@ define inreg <12 x i8> @bitcast_v3f32_to_v12i8_scalar(<3 x float> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB21_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v11, s4, v0, 24 -; SI-NEXT: v_alignbit_b32 v10, s4, v0, 16 -; SI-NEXT: v_alignbit_b32 v9, s4, v0, 8 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v3, s17, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s17, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 8 -; SI-NEXT: s_lshr_b32 s6, 
s17, 24 -; SI-NEXT: s_lshr_b32 s7, s17, 16 -; SI-NEXT: s_lshr_b32 s8, s17, 8 +; SI-NEXT: s_lshr_b32 s19, s17, 24 +; SI-NEXT: s_lshr_b32 s22, s17, 16 +; SI-NEXT: s_lshr_b32 s23, s17, 8 +; SI-NEXT: s_lshr_b64 s[10:11], s[18:19], 24 +; SI-NEXT: s_lshr_b64 s[12:13], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[14:15], s[18:19], 8 +; SI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 +; SI-NEXT: s_lshr_b64 s[8:9], s[16:17], 8 ; SI-NEXT: s_cbranch_execnz .LBB21_4 ; SI-NEXT: .LBB21_2: ; %cmp.true ; SI-NEXT: v_add_f32_e64 v8, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v4, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_alignbit_b32 v11, s4, v8, 24 -; SI-NEXT: v_alignbit_b32 v10, s4, v8, 16 -; SI-NEXT: v_alignbit_b32 v9, s4, v8, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v17, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v16, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[11:12], v[8:9], 24 +; SI-NEXT: v_lshr_b64 v[3:4], v[16:17], 24 +; SI-NEXT: v_lshr_b64 v[14:15], v[16:17], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[16:17], 8 +; SI-NEXT: v_lshr_b64 v[12:13], v[8:9], 16 +; SI-NEXT: v_lshr_b64 v[9:10], v[8:9], 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v17 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v17 +; SI-NEXT: s_branch .LBB21_5 ; SI-NEXT: .LBB21_3: -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr9 -; SI-NEXT: ; implicit-def: $vgpr10 -; SI-NEXT: ; implicit-def: $vgpr11 +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr23 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr19 +; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr12 +; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: s_branch .LBB21_2 ; SI-NEXT: .LBB21_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v4, s17 +; SI-NEXT: v_mov_b32_e32 v16, s16 +; SI-NEXT: v_mov_b32_e32 v17, s17 ; SI-NEXT: v_mov_b32_e32 v8, s18 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: v_mov_b32_e32 v6, s7 -; SI-NEXT: v_mov_b32_e32 v7, s6 +; SI-NEXT: v_mov_b32_e32 v5, s23 +; SI-NEXT: v_mov_b32_e32 v6, s22 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_mov_b32_e32 v11, s10 +; SI-NEXT: v_mov_b32_e32 v12, s12 +; SI-NEXT: v_mov_b32_e32 v9, s14 +; SI-NEXT: v_mov_b32_e32 v3, s4 +; SI-NEXT: v_mov_b32_e32 v14, s6 +; SI-NEXT: v_mov_b32_e32 v1, s8 +; SI-NEXT: .LBB21_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v16 +; SI-NEXT: v_mov_b32_e32 v2, v14 +; SI-NEXT: v_mov_b32_e32 v4, v17 +; SI-NEXT: v_mov_b32_e32 v10, v12 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v12i8_scalar: @@ -6282,30 +6291,33 @@ define inreg <6 x i16> @bitcast_v3f32_to_v6i16_scalar(<3 x float> inreg %a, i32 ; SI-NEXT: s_cmp_lg_u32 s19, 0 ; SI-NEXT: s_cbranch_scc0 .LBB33_3 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_mov_b32_e32 v0, s18 -; SI-NEXT: v_alignbit_b32 v5, s4, v0, 16 -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_alignbit_b32 v1, s17, v0, 16 -; SI-NEXT: s_lshr_b32 s6, s17, 16 +; SI-NEXT: s_lshr_b32 s10, s17, 16 +; SI-NEXT: s_lshr_b64 s[4:5], s[18:19], 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[16:17], 16 ; SI-NEXT: s_cbranch_execnz .LBB33_4 ; 
SI-NEXT: .LBB33_2: ; %cmp.true ; SI-NEXT: v_add_f32_e64 v4, s18, 1.0 -; SI-NEXT: v_add_f32_e64 v2, s17, 1.0 -; SI-NEXT: v_add_f32_e64 v0, s16, 1.0 -; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 -; SI-NEXT: v_alignbit_b32 v5, s4, v4, 16 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-NEXT: s_setpc_b64 s[30:31] +; SI-NEXT: v_add_f32_e64 v8, s17, 1.0 +; SI-NEXT: v_add_f32_e64 v7, s16, 1.0 +; SI-NEXT: v_lshr_b64 v[1:2], v[7:8], 16 +; SI-NEXT: v_lshr_b64 v[5:6], v[4:5], 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8 +; SI-NEXT: s_branch .LBB33_5 ; SI-NEXT: .LBB33_3: -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr5 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: s_branch .LBB33_2 ; SI-NEXT: .LBB33_4: -; SI-NEXT: v_mov_b32_e32 v0, s16 -; SI-NEXT: v_mov_b32_e32 v2, s17 +; SI-NEXT: v_mov_b32_e32 v7, s16 +; SI-NEXT: v_mov_b32_e32 v8, s17 ; SI-NEXT: v_mov_b32_e32 v4, s18 -; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v3, s10 +; SI-NEXT: v_mov_b32_e32 v5, s4 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: .LBB33_5: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: bitcast_v3f32_to_v6i16_scalar: @@ -7981,62 +7993,64 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s22, 0 -; SI-NEXT: v_mul_f32_e64 v16, 1.0, s17 -; SI-NEXT: v_mul_f32_e64 v17, 1.0, s16 -; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19 -; SI-NEXT: v_mul_f32_e64 v15, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v12, 1.0, s21 -; SI-NEXT: v_mul_f32_e64 v13, 1.0, s20 +; SI-NEXT: v_mul_f32_e64 v17, 1.0, s17 +; SI-NEXT: v_mul_f32_e64 v18, 1.0, s16 +; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18 +; SI-NEXT: v_mul_f32_e64 v0, 1.0, s21 +; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20 ; SI-NEXT: s_cbranch_scc0 .LBB39_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v16 -; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v12 -; SI-NEXT: v_alignbit_b32 v0, v0, v17, 16 -; SI-NEXT: v_alignbit_b32 v4, v6, v15, 16 -; SI-NEXT: v_alignbit_b32 v8, v10, v13, 16 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15 +; SI-NEXT: v_alignbit_b32 v12, v1, v18, 16 +; SI-NEXT: v_alignbit_b32 v13, v6, v16, 16 +; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 +; SI-NEXT: v_alignbit_b32 v8, v10, v14, 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[12:13], 8 +; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v15 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13 ; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; SI-NEXT: s_cbranch_execnz .LBB39_3 ; SI-NEXT: .LBB39_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 -; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18 +; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14 -; SI-NEXT: 
v_alignbit_b32 v0, v1, v0, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_alignbit_b32 v12, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 ; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7 -; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 -; SI-NEXT: v_alignbit_b32 v4, v6, v1, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 -; SI-NEXT: v_add_f32_e32 v11, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; SI-NEXT: v_alignbit_b32 v13, v6, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14 +; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v11 +; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24 ; SI-NEXT: v_alignbit_b32 v8, v10, v1, 16 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[12:13], 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13 ; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7 -; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v11 +; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v0 ; SI-NEXT: .LBB39_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v12 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v4, v13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB39_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr7 @@ -9524,69 +9538,71 @@ define inreg <12 x i8> @bitcast_v6f16_to_v12i8_scalar(<6 x half> inreg %a, i32 i ; SI-LABEL: bitcast_v6f16_to_v12i8_scalar: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_cvt_f16_f32_e32 v15, s17 -; SI-NEXT: v_cvt_f16_f32_e32 v14, s16 +; SI-NEXT: v_cvt_f16_f32_e32 v16, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v15, s16 ; SI-NEXT: v_cvt_f16_f32_e32 v6, s19 -; SI-NEXT: v_cvt_f16_f32_e32 v13, s18 +; SI-NEXT: v_cvt_f16_f32_e32 v14, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v10, s21 -; SI-NEXT: v_cvt_f16_f32_e32 v12, s20 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s20 ; SI-NEXT: s_cmp_lg_u32 s22, 0 ; SI-NEXT: s_cbranch_scc0 .LBB43_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v15 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v16 +; SI-NEXT: v_or_b32_e32 v12, v15, v1 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v10 -; SI-NEXT: v_or_b32_e32 v0, v14, v0 -; SI-NEXT: v_or_b32_e32 v4, v13, v1 -; SI-NEXT: v_or_b32_e32 v8, v12, v7 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_or_b32_e32 v13, v14, v1 +; SI-NEXT: v_lshr_b64 v[1:2], v[12:13], 8 +; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24 +; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 +; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 +; SI-NEXT: v_or_b32_e32 v8, v0, v2 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13 ; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v11, v10, 8, 8 ; SI-NEXT: s_cbranch_execnz .LBB43_3 ; 
SI-NEXT: .LBB43_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v0, v15 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v14 -; SI-NEXT: v_cvt_f32_f16_e32 v2, v6 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v13 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v16 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v15 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v6 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v14 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_cvt_f16_f32_e32 v6, v2 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v12, v2, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v10 -; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v3, v12 +; SI-NEXT: v_cvt_f16_f32_e32 v6, v3 +; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 ; SI-NEXT: v_cvt_f16_f32_e32 v10, v1 -; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; SI-NEXT: v_or_b32_e32 v4, v2, v3 -; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v10 -; SI-NEXT: v_or_b32_e32 v8, v1, v2 -; SI-NEXT: v_alignbit_b32 v3, v4, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, v4, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, v4, v0, 8 -; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v6 +; SI-NEXT: v_or_b32_e32 v13, v2, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24 +; SI-NEXT: v_or_b32_e32 v8, v0, v1 +; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16 +; SI-NEXT: v_lshr_b64 v[1:2], v[12:13], 8 +; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13 ; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8 ; SI-NEXT: v_bfe_u32 v7, v6, 8, 8 ; SI-NEXT: v_bfe_u32 v11, v10, 8, 8 ; SI-NEXT: .LBB43_3: ; %end +; SI-NEXT: v_mov_b32_e32 v0, v12 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v4, v13 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB43_4: -; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: ; implicit-def: $vgpr12 ; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 +; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr5 ; SI-NEXT: ; implicit-def: $vgpr7 ; SI-NEXT: ; implicit-def: $vgpr8 @@ -10274,37 +10290,37 @@ define inreg <6 x i16> @bitcast_v12i8_to_v6i16_scalar(<12 x i8> inreg %a, i32 in ; SI-NEXT: s_cmp_lg_u32 s28, 0 ; SI-NEXT: s_cbranch_scc0 .LBB45_4 ; SI-NEXT: ; %bb.1: ; %cmp.false -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 +; SI-NEXT: s_and_b32 s4, s16, 0xff +; SI-NEXT: s_lshl_b32 s5, s17, 8 ; SI-NEXT: s_or_b32 s4, s4, s5 -; SI-NEXT: s_and_b32 s5, s22, 0xff +; SI-NEXT: s_and_b32 s5, s18, 0xff ; SI-NEXT: s_lshl_b32 s5, s5, 16 -; SI-NEXT: s_lshl_b32 s6, s23, 24 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s6, s5 -; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: s_and_b32 s4, s18, 0xff -; SI-NEXT: s_lshl_b32 s4, s4, 16 ; SI-NEXT: s_lshl_b32 s6, s19, 24 -; SI-NEXT: s_or_b32 s4, s6, s4 -; SI-NEXT: s_and_b32 s6, s16, 0xff -; SI-NEXT: s_lshl_b32 s8, s17, 8 -; SI-NEXT: s_or_b32 s6, s6, s8 -; SI-NEXT: s_and_b32 s6, s6, 0xffff -; 
SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_or_b32 s6, s6, s4 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s8, s25, 8 -; SI-NEXT: s_or_b32 s4, s4, s8 -; SI-NEXT: s_and_b32 s8, s26, 0xff -; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_lshl_b32 s9, s27, 24 +; SI-NEXT: s_or_b32 s12, s6, s5 +; SI-NEXT: s_and_b32 s5, s24, 0xff +; SI-NEXT: s_lshl_b32 s6, s25, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s26, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s27, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s8, s7, s6 +; SI-NEXT: s_or_b32 s10, s5, s8 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_and_b32 s6, s22, 0xff +; SI-NEXT: s_lshl_b32 s6, s6, 16 +; SI-NEXT: s_lshl_b32 s7, s23, 24 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s9, s7, s6 +; SI-NEXT: s_or_b32 s13, s5, s9 ; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s10, s9, s8 -; SI-NEXT: v_alignbit_b32 v1, s7, v0, 16 -; SI-NEXT: s_or_b32 s8, s4, s10 -; SI-NEXT: s_lshr_b32 s9, s5, 16 -; SI-NEXT: s_lshr_b32 s10, s10, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[12:13], 16 +; SI-NEXT: s_or_b32 s4, s4, s12 +; SI-NEXT: s_lshr_b32 s7, s9, 16 +; SI-NEXT: s_lshr_b32 s11, s8, 16 +; SI-NEXT: s_mov_b32 s5, s13 ; SI-NEXT: s_cbranch_execnz .LBB45_3 ; SI-NEXT: .LBB45_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 @@ -10318,52 +10334,51 @@ define inreg <6 x i16> @bitcast_v12i8_to_v6i16_scalar(<12 x i8> inreg %a, i32 in ; SI-NEXT: s_lshl_b32 s6, s6, 16 ; SI-NEXT: s_and_b32 s4, s4, 0xffff ; SI-NEXT: s_or_b32 s5, s5, s6 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s6, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s20, 0xff -; SI-NEXT: s_lshl_b32 s5, s21, 8 -; SI-NEXT: s_add_i32 s22, s22, 3 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s20, 0xff +; SI-NEXT: s_lshl_b32 s6, s21, 8 +; SI-NEXT: s_add_i32 s22, s22, 3 +; SI-NEXT: s_or_b32 s5, s6, s5 ; SI-NEXT: s_and_b32 s7, s22, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s23, 24 +; SI-NEXT: s_addk_i32 s5, 0x300 +; SI-NEXT: s_lshl_b32 s6, s23, 24 ; SI-NEXT: s_lshl_b32 s7, s7, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s7 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s5, 0xffff +; SI-NEXT: s_or_b32 s6, s6, s7 ; SI-NEXT: s_add_i32 s24, s24, 3 -; SI-NEXT: s_add_i32 s7, s4, 0x3000000 -; SI-NEXT: s_and_b32 s4, s24, 0xff -; SI-NEXT: s_lshl_b32 s5, s25, 8 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s24, 0xff +; SI-NEXT: s_lshl_b32 s7, s25, 8 ; SI-NEXT: s_add_i32 s26, s26, 3 -; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_or_b32 s6, s7, s6 ; SI-NEXT: s_and_b32 s8, s26, 0xff -; SI-NEXT: s_addk_i32 s4, 0x300 -; SI-NEXT: s_lshl_b32 s5, s27, 24 +; SI-NEXT: s_addk_i32 s6, 0x300 +; SI-NEXT: s_lshl_b32 s7, s27, 24 ; SI-NEXT: s_lshl_b32 s8, s8, 16 -; SI-NEXT: s_and_b32 s4, s4, 0xffff -; SI-NEXT: s_or_b32 s5, s5, s8 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s8, s4, 0x3000000 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_alignbit_b32 v1, s7, v0, 16 -; SI-NEXT: s_lshr_b32 s9, s7, 16 -; SI-NEXT: s_lshr_b32 s10, s8, 16 +; SI-NEXT: s_and_b32 s6, s6, 0xffff +; SI-NEXT: s_or_b32 s7, s7, s8 +; SI-NEXT: s_add_i32 s4, s4, 0x3000000 +; SI-NEXT: s_add_i32 s5, s5, 0x3000000 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s10, s6, 0x3000000 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 16 +; SI-NEXT: s_lshr_b32 s7, s5, 16 +; SI-NEXT: s_lshr_b32 s11, s10, 16 ; SI-NEXT: 
.LBB45_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 -; SI-NEXT: v_mov_b32_e32 v3, s9 -; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v5, s10 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: v_mov_b32_e32 v4, s10 +; SI-NEXT: v_mov_b32_e32 v5, s11 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB45_4: +; SI-NEXT: ; implicit-def: $sgpr4 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: ; implicit-def: $sgpr7 -; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr11 ; SI-NEXT: s_branch .LBB45_2 ; ; VI-LABEL: bitcast_v12i8_to_v6i16_scalar: @@ -10954,74 +10969,74 @@ define inreg <12 x i8> @bitcast_v6i16_to_v12i8_scalar(<6 x i16> inreg %a, i32 in ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s6, s4, s5 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 -; SI-NEXT: s_or_b32 s7, s4, s5 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: s_lshl_b32 s5, s21, 16 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: s_or_b32 s8, s4, s5 -; SI-NEXT: v_alignbit_b32 v3, s7, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s7, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s7, v0, 8 -; SI-NEXT: s_lshr_b32 s9, s7, 8 -; SI-NEXT: s_lshr_b32 s12, s8, 8 -; SI-NEXT: s_and_b32 s10, s19, 0xffff -; SI-NEXT: s_and_b32 s13, s21, 0xffff +; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 +; SI-NEXT: s_or_b32 s5, s5, s6 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 +; SI-NEXT: s_and_b32 s9, s20, 0xffff +; SI-NEXT: s_lshl_b32 s11, s21, 16 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: s_or_b32 s14, s9, s11 +; SI-NEXT: s_lshr_b32 s7, s5, 8 +; SI-NEXT: s_lshr_b32 s15, s14, 8 +; SI-NEXT: s_and_b32 s9, s19, 0xffff +; SI-NEXT: s_and_b32 s22, s21, 0xffff ; SI-NEXT: s_bfe_u32 s11, s19, 0x80008 -; SI-NEXT: s_bfe_u32 s14, s21, 0x80008 +; SI-NEXT: s_bfe_u32 s23, s21, 0x80008 ; SI-NEXT: s_cbranch_execnz .LBB47_3 ; SI-NEXT: .LBB47_2: ; %cmp.true ; SI-NEXT: s_add_i32 s16, s16, 3 ; SI-NEXT: s_and_b32 s4, s16, 0xffff ; SI-NEXT: s_lshl_b32 s5, s17, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 ; SI-NEXT: s_add_i32 s18, s18, 3 -; SI-NEXT: s_add_i32 s6, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s18, 0xffff -; SI-NEXT: s_lshl_b32 s5, s19, 16 ; SI-NEXT: s_or_b32 s4, s5, s4 +; SI-NEXT: s_and_b32 s5, s18, 0xffff +; SI-NEXT: s_lshl_b32 s6, s19, 16 ; SI-NEXT: s_add_i32 s20, s20, 3 -; SI-NEXT: s_add_i32 s7, s4, 0x30000 -; SI-NEXT: s_and_b32 s4, s20, 0xffff -; SI-NEXT: s_lshl_b32 s5, s21, 16 -; SI-NEXT: s_or_b32 s4, s5, s4 -; SI-NEXT: s_add_i32 s8, s4, 0x30000 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_alignbit_b32 v3, s7, v0, 24 -; SI-NEXT: v_alignbit_b32 v2, s7, v0, 16 -; SI-NEXT: v_alignbit_b32 v1, s7, v0, 8 -; SI-NEXT: s_lshr_b32 s11, s7, 24 -; SI-NEXT: s_lshr_b32 s10, s7, 16 -; SI-NEXT: s_lshr_b32 s9, s7, 8 -; SI-NEXT: s_lshr_b32 s14, s8, 24 -; SI-NEXT: s_lshr_b32 s13, s8, 16 -; SI-NEXT: s_lshr_b32 s12, s8, 8 +; SI-NEXT: s_or_b32 s5, s6, s5 +; SI-NEXT: s_and_b32 s6, s20, 0xffff +; SI-NEXT: s_lshl_b32 s7, s21, 16 +; SI-NEXT: s_add_i32 s4, s4, 0x30000 +; SI-NEXT: s_add_i32 s5, s5, 0x30000 +; SI-NEXT: s_or_b32 s6, s7, s6 +; SI-NEXT: s_add_i32 s14, s6, 0x30000 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], 24 +; SI-NEXT: s_lshr_b64 s[8:9], s[4:5], 16 +; 
SI-NEXT: s_lshr_b64 s[10:11], s[4:5], 8 +; SI-NEXT: s_lshr_b32 s11, s5, 24 +; SI-NEXT: s_lshr_b32 s9, s5, 16 +; SI-NEXT: s_lshr_b32 s7, s5, 8 +; SI-NEXT: s_lshr_b32 s23, s14, 24 +; SI-NEXT: s_lshr_b32 s22, s14, 16 +; SI-NEXT: s_lshr_b32 s15, s14, 8 ; SI-NEXT: .LBB47_3: ; %end -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v4, s7 -; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_mov_b32_e32 v6, s10 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s8 +; SI-NEXT: v_mov_b32_e32 v3, s6 +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v6, s9 ; SI-NEXT: v_mov_b32_e32 v7, s11 -; SI-NEXT: v_mov_b32_e32 v8, s8 -; SI-NEXT: v_mov_b32_e32 v9, s12 -; SI-NEXT: v_mov_b32_e32 v10, s13 -; SI-NEXT: v_mov_b32_e32 v11, s14 +; SI-NEXT: v_mov_b32_e32 v8, s14 +; SI-NEXT: v_mov_b32_e32 v9, s15 +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB47_4: +; SI-NEXT: ; implicit-def: $sgpr4 +; SI-NEXT: ; implicit-def: $sgpr10 +; SI-NEXT: ; implicit-def: $sgpr8 ; SI-NEXT: ; implicit-def: $sgpr6 -; SI-NEXT: ; implicit-def: $vgpr1 -; SI-NEXT: ; implicit-def: $vgpr2 -; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $sgpr7 ; SI-NEXT: ; implicit-def: $sgpr9 -; SI-NEXT: ; implicit-def: $sgpr10 ; SI-NEXT: ; implicit-def: $sgpr11 -; SI-NEXT: ; implicit-def: $sgpr8 -; SI-NEXT: ; implicit-def: $sgpr12 -; SI-NEXT: ; implicit-def: $sgpr13 ; SI-NEXT: ; implicit-def: $sgpr14 +; SI-NEXT: ; implicit-def: $sgpr15 +; SI-NEXT: ; implicit-def: $sgpr22 +; SI-NEXT: ; implicit-def: $sgpr23 ; SI-NEXT: s_branch .LBB47_2 ; ; VI-LABEL: bitcast_v6i16_to_v12i8_scalar: @@ -12541,44 +12556,45 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3 ; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16 ; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17 ; SI-NEXT: v_mul_f32_e64 v7, 1.0, s18 -; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19 +; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19 ; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20 ; SI-NEXT: v_mul_f32_e64 v8, 1.0, s21 ; SI-NEXT: s_cbranch_scc0 .LBB53_4 ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11 -; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10 +; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10 ; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9 ; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8 ; SI-NEXT: s_cbranch_execnz .LBB53_3 ; SI-NEXT: .LBB53_2: ; %cmp.true -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10 ; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11 -; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 ; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v8 +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6 -; SI-NEXT: v_alignbit_b32 v4, v5, v2, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 ; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7 -; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; SI-NEXT: v_add_f32_e32 v1, 
0x40c00000, v1 ; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16 -; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; SI-NEXT: v_lshr_b64 v[6:7], v[1:2], 16 +; SI-NEXT: v_alignbit_b32 v4, v5, v9, 16 ; SI-NEXT: .LBB53_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB53_4: ; SI-NEXT: ; implicit-def: $vgpr0 -; SI-NEXT: ; implicit-def: $vgpr1 +; SI-NEXT: ; implicit-def: $vgpr6 ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: ; implicit-def: $vgpr3 ; SI-NEXT: ; implicit-def: $vgpr4 @@ -13210,7 +13226,7 @@ define inreg <6 x i16> @bitcast_v6f16_to_v6i16_scalar(<6 x half> inreg %a, i32 i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: v_cvt_f16_f32_e32 v0, s16 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s17 +; SI-NEXT: v_cvt_f16_f32_e32 v6, s17 ; SI-NEXT: v_cvt_f16_f32_e32 v2, s18 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s19 ; SI-NEXT: v_cvt_f16_f32_e32 v4, s20 @@ -13220,32 +13236,33 @@ define inreg <6 x i16> @bitcast_v6f16_to_v6i16_scalar(<6 x half> inreg %a, i32 i ; SI-NEXT: ; %bb.1: ; %cmp.false ; SI-NEXT: s_cbranch_execnz .LBB57_3 ; SI-NEXT: .LBB57_2: ; %cmp.true -; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 -; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v6 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_add_f32_e32 v3, 0x38000000, v3 ; SI-NEXT: v_add_f32_e32 v1, 0x38000000, v1 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v2, 0x38000000, v2 -; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, 0x38000000, v5 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_add_f32_e32 v4, 0x38000000, v4 ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v5 -; SI-NEXT: v_or_b32_e32 v4, v4, v6 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; SI-NEXT: v_or_b32_e32 v2, v2, v6 +; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v5 +; SI-NEXT: v_lshr_b64 v[6:7], v[1:2], 16 ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 +; SI-NEXT: v_or_b32_e32 v4, v4, v8 ; SI-NEXT: .LBB57_3: ; %end +; SI-NEXT: v_mov_b32_e32 v1, v6 ; SI-NEXT: s_setpc_b64 s[30:31] ; SI-NEXT: .LBB57_4: ; SI-NEXT: s_branch .LBB57_2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index b2dcd77274989..e27164c2d6d69 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -4610,50 +4610,48 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshr_b64 s[6:7], s[10:11], 30 +; GFX6-NEXT: s_mov_b32 s0, s8 +; GFX6-NEXT: s_and_b32 s8, s6, 0x7fff +; GFX6-NEXT: s_and_b32 s6, s4, 0x7fff +; GFX6-NEXT: 
v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: s_and_b32 s6, s10, 0x7fff -; GFX6-NEXT: s_and_b32 s7, s4, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], 30 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: s_bfe_u32 s7, s10, 0xf000f -; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 30 -; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s7 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 -; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mad_f32 v4, -v1, v5, v6 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc -; GFX6-NEXT: v_mul_f32_e32 v1, v0, v6 -; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v1 -; GFX6-NEXT: v_mad_f32 v0, -v1, v2, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc -; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX6-NEXT: s_mov_b32 s1, s9 +; GFX6-NEXT: s_bfe_u32 s9, s10, 0xf000f +; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: s_and_b32 s5, s6, 0x7fff +; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_mul_f32_e32 v0, v4, v5 +; GFX6-NEXT: v_trunc_f32_e32 v0, v0 +; GFX6-NEXT: v_mad_f32 v4, -v0, v3, v4 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s8 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v1 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc +; GFX6-NEXT: v_mul_f32_e32 v0, v5, v6 +; GFX6-NEXT: v_trunc_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v0 +; GFX6-NEXT: v_mad_f32 v0, -v0, v1, v5 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v4, vcc ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s0, s8 -; GFX6-NEXT: s_mov_b32 s1, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -4667,46 +4665,44 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_and_b32 s5, s6, 0x7fff +; GFX9-NEXT: s_lshr_b64 
s[4:5], s[2:3], 30 +; GFX9-NEXT: s_and_b32 s3, s6, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s5, s2, 0x7fff +; GFX9-NEXT: s_bfe_u32 s8, s2, 0xf000f +; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 30 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX9-NEXT: s_and_b32 s4, s2, 0x7fff -; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s3 -; GFX9-NEXT: s_bfe_u32 s2, s2, 0xf000f -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 -; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 -; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, v0, v7 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v1 -; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc -; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 +; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff +; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v0, v5, v6 +; GFX9-NEXT: s_and_b32 s4, s4, 0x7fff +; GFX9-NEXT: v_trunc_f32_e32 v0, v0 +; GFX9-NEXT: v_mad_f32 v5, -v0, v4, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc +; GFX9-NEXT: v_mul_f32_e32 v0, v6, v7 +; GFX9-NEXT: v_trunc_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v0 +; GFX9-NEXT: v_mad_f32 v0, -v0, v1, v6 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v1 +; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v5, vcc ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 @@ -4797,58 +4793,56 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshr_b64 s[6:7], s[10:11], 30 ; GFX6-NEXT: s_mov_b32 s0, s8 -; GFX6-NEXT: s_and_b32 s8, s4, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX6-NEXT: s_and_b32 s7, s10, 0x7fff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s7 -; GFX6-NEXT: 
v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 30 -; GFX6-NEXT: s_bfe_u32 s5, s4, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5 -; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: s_bfe_u32 s8, s10, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s10, v1 -; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 -; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 -; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 -; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 +; GFX6-NEXT: s_and_b32 s8, s6, 0x7fff +; GFX6-NEXT: s_and_b32 s6, s4, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: s_and_b32 s6, s10, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], 30 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_and_b32 s5, s6, 0x7fff +; GFX6-NEXT: s_bfe_u32 s6, s4, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GFX6-NEXT: s_bfe_u32 s11, s10, 0xf000f +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GFX6-NEXT: s_lshr_b32 s4, s4, 15 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s4 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 -; GFX6-NEXT: s_lshr_b32 s6, s10, 15 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v1 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 -; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 -; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s10, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, v1, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GFX6-NEXT: v_trunc_f32_e32 v0, v0 +; GFX6-NEXT: v_mad_f32 v1, -v0, v3, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 ; GFX6-NEXT: s_mov_b32 s1, s9 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v0, vcc +; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 +; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 +; GFX6-NEXT: v_mad_f32 v1, -v1, v2, v5 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 +; GFX6-NEXT: s_lshr_b32 s9, s10, 15 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v0 +; GFX6-NEXT: v_sub_i32_e32 
v0, vcc, s8, v1 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 +; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -4862,54 +4856,52 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX9-NEXT: s_and_b32 s3, s6, 0x7fff +; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], 30 +; GFX9-NEXT: s_and_b32 s5, s6, 0x7fff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 +; GFX9-NEXT: s_and_b32 s3, s2, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: s_and_b32 s4, s2, 0x7fff -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX9-NEXT: s_bfe_u32 s4, s6, 0xf000f -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0xf000f -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 +; GFX9-NEXT: s_and_b32 s8, s4, 0x7fff +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 30 +; GFX9-NEXT: s_bfe_u32 s5, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s5 +; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0xf000f +; GFX9-NEXT: s_and_b32 s3, s4, 0x7fff +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s9 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 +; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s8 +; GFX9-NEXT: v_trunc_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 +; GFX9-NEXT: v_mad_f32 v5, -v1, v4, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 +; GFX9-NEXT: v_mul_f32_e32 v4, v6, v7 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mad_f32 v7, -v4, v6, v7 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 -; GFX9-NEXT: v_mul_f32_e32 v6, v8, v9 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8 -; GFX9-NEXT: s_lshr_b32 s3, s6, 15 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 -; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mad_f32 v4, -v4, v3, v6 +; GFX9-NEXT: s_lshr_b32 s4, s6, 15 +; GFX9-NEXT: 
v_cmp_ge_f32_e64 vcc, |v4|, v3 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s3 ; GFX9-NEXT: s_lshr_b32 s3, s2, 15 -; GFX9-NEXT: v_sub_u32_e32 v4, s3, v4 -; GFX9-NEXT: v_sub_u32_e32 v5, s2, v1 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: v_sub_u32_e32 v4, s2, v0 +; GFX9-NEXT: v_sub_u32_e32 v5, s3, v1 +; GFX9-NEXT: v_sub_u32_e32 v0, s8, v3 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 @@ -5006,64 +4998,63 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: s_bfe_i32 s6, s4, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 -; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 30 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[10:11], 30 +; GFX6-NEXT: s_bfe_i32 s7, s4, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s7 +; GFX6-NEXT: s_mov_b32 s0, s8 +; GFX6-NEXT: s_mov_b32 s1, s9 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], 30 ; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: s_xor_b32 s5, s5, s6 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_xor_b32 s5, s5, s7 ; GFX6-NEXT: s_ashr_i32 s5, s5, 30 ; GFX6-NEXT: s_or_b32 s5, s5, 1 -; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v2| -; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[12:13], s[12:13], exec +; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 ; GFX6-NEXT: s_cselect_b32 s5, s5, 0 ; GFX6-NEXT: s_bfe_i32 s4, s4, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, s5, v4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s5, v2 ; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 -; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 +; GFX6-NEXT: s_or_b32 s7, s4, 1 +; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX6-NEXT: s_cselect_b32 s4, s7, 0 +; GFX6-NEXT: s_bfe_i32 s5, s8, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: s_bfe_i32 s4, s6, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX6-NEXT: s_xor_b32 s4, s4, s5 +; GFX6-NEXT: 
s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 -; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, v1 -; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 +; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 +; GFX6-NEXT: v_trunc_f32_e32 v4, v4 +; GFX6-NEXT: v_mad_f32 v1, -v4, v0, v1 +; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v0| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: s_cselect_b32 s4, s6, 0 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s4, v5 -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 -; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 -; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_mad_f32 v5, -v1, v2, v5 -; GFX6-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v2| -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 ; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX6-NEXT: s_mov_b32 s0, s8 -; GFX6-NEXT: s_mov_b32 s1, s9 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -5077,60 +5068,59 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_bfe_i32 s4, s6, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX9-NEXT: s_bfe_i32 s3, s2, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], 30 +; GFX9-NEXT: s_bfe_i32 s3, s6, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: s_bfe_i32 s5, s2, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX9-NEXT: s_xor_b32 s3, s5, s3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s3, s3, 30 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], 30 ; GFX9-NEXT: s_or_b32 s3, s3, 1 -; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 -; GFX9-NEXT: v_trunc_f32_e32 v5, v5 -; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v3| -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[10:11], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[10:11], s[10:11], exec ; GFX9-NEXT: s_cselect_b32 s3, s3, 0 -; GFX9-NEXT: s_bfe_i32 s4, s6, 0xf000f -; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX9-NEXT: s_bfe_i32 s5, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 ; GFX9-NEXT: s_bfe_i32 s2, s2, 0xf000f -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_add_u32_e32 v4, s3, v5 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 +; 
GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: s_xor_b32 s2, s2, s5 +; GFX9-NEXT: s_ashr_i32 s2, s2, 30 +; GFX9-NEXT: v_add_u32_e32 v3, s3, v3 +; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 +; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 +; GFX9-NEXT: s_or_b32 s5, s2, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX9-NEXT: s_cselect_b32 s2, s5, 0 +; GFX9-NEXT: s_bfe_i32 s3, s8, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s3 +; GFX9-NEXT: v_add_u32_e32 v4, s2, v4 +; GFX9-NEXT: s_bfe_i32 s2, s4, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 +; GFX9-NEXT: s_xor_b32 s2, s2, s3 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 -; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 -; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 ; GFX9-NEXT: s_or_b32 s4, s2, 1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v5|, |v3| -; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 +; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 +; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: s_cselect_b32 s2, s4, 0 -; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX9-NEXT: v_add_u32_e32 v5, s2, v6 -; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v1 -; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| -; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX9-NEXT: v_add_u32_e32 v0, v7, v0 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 -; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v5 +; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 @@ -5233,74 +5223,73 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s6, s10, 0xf0000 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 30 -; GFX6-NEXT: s_bfe_i32 s5, s4, 0xf0000 -; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s6 -; GFX6-NEXT: s_xor_b32 s5, s6, s5 -; GFX6-NEXT: s_ashr_i32 s5, s5, 30 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 ; GFX6-NEXT: s_mov_b32 s0, s8 ; GFX6-NEXT: s_mov_b32 s1, s9 -; GFX6-NEXT: s_lshr_b32 s8, s10, 15 -; GFX6-NEXT: v_mul_f32_e32 v6, v5, v6 -; GFX6-NEXT: v_trunc_f32_e32 v6, v6 -; GFX6-NEXT: v_mad_f32 v5, -v6, v4, v5 -; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX6-NEXT: s_lshr_b32 s9, s4, 15 -; GFX6-NEXT: s_or_b32 s5, s5, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[6:7], |v5|, |v4| -; GFX6-NEXT: s_and_b64 s[6:7], s[6:7], exec -; GFX6-NEXT: s_cselect_b32 s5, s5, 0 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, s5, v6 -; GFX6-NEXT: v_mul_lo_u32 v4, v4, s4 -; GFX6-NEXT: s_bfe_i32 s4, s4, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s4 -; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf000f -; GFX6-NEXT: v_cvt_f32_i32_e32 
v6, s5 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2 +; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], 30 +; GFX6-NEXT: s_bfe_i32 s5, s4, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX6-NEXT: s_bfe_i32 s12, s10, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s12 +; GFX6-NEXT: s_xor_b32 s5, s12, s5 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: s_lshr_b64 s[6:7], s[10:11], 30 +; GFX6-NEXT: s_ashr_i32 s5, s5, 30 +; GFX6-NEXT: s_and_b32 s7, s6, 0x7fff +; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX6-NEXT: s_lshr_b32 s11, s10, 15 +; GFX6-NEXT: s_and_b32 s9, s8, 0x7fff +; GFX6-NEXT: s_lshr_b32 s14, s4, 15 +; GFX6-NEXT: s_or_b32 s5, s5, 1 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, |v0| +; GFX6-NEXT: s_and_b64 s[12:13], s[12:13], exec +; GFX6-NEXT: s_cselect_b32 s5, s5, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s5, v2 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 +; GFX6-NEXT: s_bfe_i32 s4, s4, 0xf000f +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf000f +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v0 +; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: s_ashr_i32 s4, s4, 30 +; GFX6-NEXT: s_or_b32 s10, s4, 1 +; GFX6-NEXT: v_mul_f32_e32 v3, v0, v3 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_mad_f32 v0, -v3, v1, v0 +; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, |v1| +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec +; GFX6-NEXT: s_cselect_b32 s4, s10, 0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v3 +; GFX6-NEXT: s_bfe_i32 s4, s8, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 +; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf0000 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 +; GFX6-NEXT: s_xor_b32 s4, s5, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 -; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7 -; GFX6-NEXT: v_trunc_f32_e32 v7, v7 -; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6 ; GFX6-NEXT: s_or_b32 s6, s4, 1 -; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5| -; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_alignbit_b32 v0, s11, v0, 30 +; GFX6-NEXT: v_mul_lo_u32 v0, v0, s14 +; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX6-NEXT: v_trunc_f32_e32 v4, v4 +; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 +; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v1| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX6-NEXT: s_cselect_b32 s4, s6, 0 -; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, s4, v7 -; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 -; GFX6-NEXT: v_mul_f32_e32 v2, v7, v8 -; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_mad_f32 v7, -v2, v6, v7 -; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| -; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, v5, s9 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s8, v5 -; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; 
GFX6-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s11, v0 +; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s7, v1 ; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 -; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 -; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt expcnt(0) @@ -5312,78 +5301,77 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX9-NEXT: s_bfe_i32 s3, s6, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s3 -; GFX9-NEXT: s_bfe_i32 s4, s2, 0xf0000 -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s4 -; GFX9-NEXT: s_xor_b32 s3, s4, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: s_ashr_i32 s3, s3, 30 -; GFX9-NEXT: s_lshr_b32 s8, s2, 15 -; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 -; GFX9-NEXT: v_trunc_f32_e32 v6, v6 -; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 -; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 -; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 -; GFX9-NEXT: s_lshr_b32 s7, s6, 15 -; GFX9-NEXT: s_or_b32 s3, s3, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, |v4| -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s3, s3, 0 -; GFX9-NEXT: v_add_u32_e32 v4, s3, v6 -; GFX9-NEXT: s_bfe_i32 s3, s6, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s3 -; GFX9-NEXT: s_bfe_i32 s4, s2, 0xf000f -; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s4 -; GFX9-NEXT: s_xor_b32 s3, s4, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v1 -; GFX9-NEXT: s_ashr_i32 s3, s3, 30 -; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 -; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 -; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 -; GFX9-NEXT: s_or_b32 s3, s3, 1 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v6|, |v5| -; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[2:3], 30 +; GFX9-NEXT: s_bfe_i32 s5, s6, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 +; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], 30 +; GFX9-NEXT: s_bfe_i32 s7, s2, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX9-NEXT: s_xor_b32 s5, s7, s5 +; GFX9-NEXT: s_ashr_i32 s5, s5, 30 +; GFX9-NEXT: s_lshr_b32 s3, s2, 15 +; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX9-NEXT: s_and_b32 s9, s4, 0x7fff +; GFX9-NEXT: s_and_b32 s12, s8, 0x7fff +; GFX9-NEXT: s_lshr_b32 s13, s6, 15 +; GFX9-NEXT: s_or_b32 s5, s5, 1 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[10:11], |v1|, |v0| +; GFX9-NEXT: s_and_b64 s[10:11], s[10:11], exec +; GFX9-NEXT: s_cselect_b32 s5, s5, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s5, v2 +; GFX9-NEXT: s_bfe_i32 s5, s6, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s5 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 +; GFX9-NEXT: s_bfe_i32 s6, s2, 0xf000f +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s6 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v1 
+; GFX9-NEXT: s_xor_b32 s5, s6, s5 +; GFX9-NEXT: s_ashr_i32 s5, s5, 30 +; GFX9-NEXT: s_or_b32 s5, s5, 1 +; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_mad_f32 v2, -v3, v1, v2 +; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v2|, |v1| +; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GFX9-NEXT: s_cselect_b32 s5, s5, 0 +; GFX9-NEXT: v_add_u32_e32 v1, s5, v3 +; GFX9-NEXT: s_bfe_i32 s5, s8, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s5 +; GFX9-NEXT: s_bfe_i32 s4, s4, 0xf0000 +; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 +; GFX9-NEXT: s_xor_b32 s4, s4, s5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX9-NEXT: s_ashr_i32 s4, s4, 30 +; GFX9-NEXT: s_or_b32 s6, s4, 1 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s13 +; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX9-NEXT: v_trunc_f32_e32 v4, v4 +; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 +; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s3, s3, 0 -; GFX9-NEXT: v_add_u32_e32 v5, s3, v7 -; GFX9-NEXT: v_bfe_i32 v7, v0, 0, 15 -; GFX9-NEXT: v_cvt_f32_i32_e32 v8, v7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v6 -; GFX9-NEXT: v_xor_b32_e32 v1, v7, v1 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 -; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_mul_f32_e32 v7, v8, v9 -; GFX9-NEXT: v_trunc_f32_e32 v7, v7 -; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v7 -; GFX9-NEXT: v_mad_f32 v7, -v7, v6, v8 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| -; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 -; GFX9-NEXT: v_mul_lo_u32 v5, v5, s7 -; GFX9-NEXT: v_add_u32_e32 v1, v9, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 -; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4 -; GFX9-NEXT: v_sub_u32_e32 v4, s8, v5 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 +; GFX9-NEXT: s_cselect_b32 s4, s6, 0 +; GFX9-NEXT: v_add_u32_e32 v2, s4, v4 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s12 +; GFX9-NEXT: v_sub_u32_e32 v4, s2, v0 +; GFX9-NEXT: v_sub_u32_e32 v5, s3, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_sub_u32_e32 v0, s9, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 0x7fff, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 ; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 -; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 -; GFX9-NEXT: global_store_short v2, v0, s[0:1] offset:4 +; GFX9-NEXT: global_store_short v3, v0, s[0:1] offset:4 ; GFX9-NEXT: s_endpgm %r = srem <3 x i15> %x, %y store <3 x i15> %r, ptr addrspace(1) %out @@ -7792,8 +7780,9 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 ; GFX6-NEXT: s_ashr_i32 s8, s1, 31 @@ -7803,143 +7792,175 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_xor_b64 
s[10:11], s[0:1], s[8:9] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_sub_u32 s4, 0, s10 -; GFX6-NEXT: s_subb_u32 s5, 0, s11 +; GFX6-NEXT: s_sub_u32 s12, 0, s10 +; GFX6-NEXT: s_subb_u32 s13, 0, s11 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s12, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s12 -; GFX6-NEXT: s_mov_b32 s13, s12 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_addc_u32 s3, s3, s12 -; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] -; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, s11 -; GFX6-NEXT: v_add_i32_e32 
v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 -; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s10, v3 -; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 -; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 -; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v6, s3 -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[8:9] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s1, s12, s14 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_mul_i32 s15, s13, s0 +; GFX6-NEXT: s_mul_i32 s16, s12, s0 +; GFX6-NEXT: s_add_i32 s1, s17, s1 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16 +; GFX6-NEXT: s_add_i32 s1, s1, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16 +; GFX6-NEXT: v_readfirstlane_b32 s15, v3 +; GFX6-NEXT: s_mul_i32 s17, s0, s1 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1 +; GFX6-NEXT: s_add_u32 s15, s15, s17 +; GFX6-NEXT: v_readfirstlane_b32 s17, v0 +; GFX6-NEXT: s_addc_u32 s17, 0, s17 +; GFX6-NEXT: s_mul_i32 s16, s14, s16 +; GFX6-NEXT: v_readfirstlane_b32 s18, v4 +; GFX6-NEXT: s_add_u32 s15, s15, s16 +; GFX6-NEXT: s_addc_u32 s15, s17, s18 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: s_addc_u32 s16, s16, 0 +; GFX6-NEXT: s_mul_i32 s1, s14, s1 +; GFX6-NEXT: s_add_u32 s1, s15, s1 +; GFX6-NEXT: s_addc_u32 s15, 0, s16 +; GFX6-NEXT: s_add_u32 s16, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_cmp_lg_u32 s0, 0 +; GFX6-NEXT: s_addc_u32 s14, s14, s15 +; GFX6-NEXT: s_mul_i32 s0, s12, s14 +; GFX6-NEXT: v_readfirstlane_b32 s1, v0 +; GFX6-NEXT: s_add_i32 s0, s1, s0 +; GFX6-NEXT: s_mul_i32 s13, s13, s16 +; GFX6-NEXT: s_mul_i32 s1, s12, s16 +; GFX6-NEXT: s_add_i32 s0, s0, s13 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s16, 
v0 +; GFX6-NEXT: s_mul_i32 s13, s16, s0 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s13, s17, s13 +; GFX6-NEXT: v_readfirstlane_b32 s15, v0 +; GFX6-NEXT: s_mul_i32 s1, s14, s1 +; GFX6-NEXT: s_addc_u32 s15, 0, s15 +; GFX6-NEXT: v_readfirstlane_b32 s12, v3 +; GFX6-NEXT: s_add_u32 s1, s13, s1 +; GFX6-NEXT: s_addc_u32 s1, s15, s12 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: s_mul_i32 s0, s14, s0 +; GFX6-NEXT: s_add_u32 s0, s1, s0 +; GFX6-NEXT: s_addc_u32 s12, 0, s12 +; GFX6-NEXT: s_add_u32 s15, s16, s0 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_cmp_lg_u32 s0, 0 +; GFX6-NEXT: s_addc_u32 s14, s14, s12 +; GFX6-NEXT: s_ashr_i32 s12, s7, 31 +; GFX6-NEXT: s_add_u32 s0, s6, s12 +; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: s_addc_u32 s1, s7, s12 +; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, s15 +; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2 +; GFX6-NEXT: s_mul_i32 s1, s6, s14 +; GFX6-NEXT: v_readfirstlane_b32 s16, v3 +; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX6-NEXT: s_add_u32 s1, s16, s1 +; GFX6-NEXT: s_addc_u32 s4, 0, s4 +; GFX6-NEXT: s_mul_i32 s15, s7, s15 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: s_add_u32 s1, s1, s15 +; GFX6-NEXT: s_addc_u32 s1, s4, s16 +; GFX6-NEXT: v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_addc_u32 s4, s4, 0 +; GFX6-NEXT: s_mul_i32 s14, s7, s14 +; GFX6-NEXT: s_add_u32 s14, s1, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 +; GFX6-NEXT: s_addc_u32 s15, 0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mul_i32 s4, s10, s15 +; GFX6-NEXT: v_readfirstlane_b32 s5, v0 +; GFX6-NEXT: s_add_i32 s4, s5, s4 +; GFX6-NEXT: s_mul_i32 s5, s11, s14 +; GFX6-NEXT: s_add_i32 s16, s4, s5 +; GFX6-NEXT: s_sub_i32 s17, s7, s16 +; GFX6-NEXT: s_mul_i32 s4, s10, s14 +; GFX6-NEXT: s_sub_u32 s6, s6, s4 +; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_or_b32 s18, s4, s5 +; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_subb_u32 s17, s17, s11 +; GFX6-NEXT: s_sub_u32 s19, s6, s10 +; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_or_b32 s4, s4, s5 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_subb_u32 s4, s17, 0 +; GFX6-NEXT: s_cmp_ge_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s5, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s19, s10 +; GFX6-NEXT: s_cselect_b32 s17, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s4, s11 +; GFX6-NEXT: s_cselect_b32 s4, s17, s5 +; GFX6-NEXT: s_add_u32 s5, s14, 1 +; GFX6-NEXT: s_addc_u32 s17, s15, 0 +; GFX6-NEXT: s_add_u32 s19, s14, 2 +; GFX6-NEXT: s_addc_u32 s20, s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_cselect_b32 s4, s19, s5 +; GFX6-NEXT: s_cselect_b32 s5, s20, s17 +; GFX6-NEXT: s_cmp_lg_u32 s18, 0 +; GFX6-NEXT: s_subb_u32 s7, s7, s16 +; GFX6-NEXT: s_cmp_ge_u32 s7, s11 +; GFX6-NEXT: s_cselect_b32 s16, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s6, s10 +; GFX6-NEXT: s_cselect_b32 s6, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s7, s11 +; GFX6-NEXT: s_cselect_b32 s6, s6, s16 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_cselect_b32 s5, s5, s15 +; GFX6-NEXT: s_cselect_b32 s4, s4, s14 +; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] +; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] +; GFX6-NEXT: s_sub_u32 s4, s4, s6 +; GFX6-NEXT: s_subb_u32 s5, s5, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, 
s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 -; GFX9-NEXT: s_ashr_i32 s2, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s1, s1, s2 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_u32 s0, 0, s6 -; GFX9-NEXT: s_subb_u32 s1, 0, s7 +; GFX9-NEXT: s_ashr_i32 s6, s1, 31 +; GFX9-NEXT: s_add_u32 s0, s0, s6 +; GFX9-NEXT: s_mov_b32 s7, s6 +; GFX9-NEXT: s_addc_u32 s1, s1, s6 +; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_sub_u32 s10, 0, s8 +; GFX9-NEXT: s_subb_u32 s11, 0, s9 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -7949,130 +7970,122 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s4, v2 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 -; GFX9-NEXT: s_mul_i32 s12, s0, s4 -; GFX9-NEXT: s_mul_hi_u32 s14, s0, s5 -; GFX9-NEXT: s_mul_i32 s13, s1, s5 -; GFX9-NEXT: s_add_i32 s12, s14, s12 -; GFX9-NEXT: s_mul_i32 s15, s0, s5 -; GFX9-NEXT: s_add_i32 s12, s12, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s5, s15 -; GFX9-NEXT: s_mul_hi_u32 s13, s5, s12 -; GFX9-NEXT: s_mul_i32 s5, s5, s12 -; GFX9-NEXT: s_add_u32 s5, s14, s5 +; GFX9-NEXT: v_readfirstlane_b32 s12, v2 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: s_mul_i32 s5, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s4 +; GFX9-NEXT: s_mul_i32 s13, s11, s4 +; GFX9-NEXT: s_add_i32 s5, s14, s5 +; GFX9-NEXT: s_mul_i32 s15, s10, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s4, s15 +; GFX9-NEXT: s_mul_i32 s16, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s13, s4, s5 +; GFX9-NEXT: s_add_u32 s14, s14, s16 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15 -; GFX9-NEXT: s_mul_i32 s15, s4, s15 -; GFX9-NEXT: s_add_u32 s5, s5, s15 -; GFX9-NEXT: s_mul_hi_u32 s14, s4, s12 -; GFX9-NEXT: s_addc_u32 s5, s13, s16 -; GFX9-NEXT: s_addc_u32 s13, s14, 0 -; GFX9-NEXT: s_mul_i32 s12, s4, s12 -; GFX9-NEXT: s_add_u32 s5, s5, s12 -; GFX9-NEXT: s_addc_u32 s12, 0, s13 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s5, v1 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s4, s4, s12 -; GFX9-NEXT: v_readfirstlane_b32 s12, v1 -; GFX9-NEXT: s_mul_i32 s5, s0, s4 -; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 -; GFX9-NEXT: s_add_i32 s5, s13, s5 -; GFX9-NEXT: s_mul_i32 s1, s1, s12 -; GFX9-NEXT: s_add_i32 s5, s5, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, s12 -; GFX9-NEXT: s_mul_hi_u32 s13, s4, s0 -; GFX9-NEXT: s_mul_i32 s14, s4, s0 -; GFX9-NEXT: s_mul_i32 s16, s12, s5 -; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s12, s5 -; GFX9-NEXT: s_add_u32 s0, s0, s16 -; GFX9-NEXT: s_addc_u32 s12, 0, s15 -; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_mul_hi_u32 s1, s4, s5 -; GFX9-NEXT: s_addc_u32 s0, s12, s13 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s5, s4, s5 -; GFX9-NEXT: s_add_u32 s0, s0, s5 -; GFX9-NEXT: s_addc_u32 
s1, 0, s1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s12, s4, s1 -; GFX9-NEXT: s_ashr_i32 s4, s11, 31 -; GFX9-NEXT: s_add_u32 s0, s10, s4 +; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 +; GFX9-NEXT: s_mul_i32 s15, s12, s15 +; GFX9-NEXT: s_add_u32 s14, s14, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s12, s5 +; GFX9-NEXT: s_addc_u32 s13, s13, s17 +; GFX9-NEXT: s_addc_u32 s14, s16, 0 +; GFX9-NEXT: s_mul_i32 s5, s12, s5 +; GFX9-NEXT: s_add_u32 s5, s13, s5 +; GFX9-NEXT: s_addc_u32 s13, 0, s14 +; GFX9-NEXT: s_add_u32 s14, s4, s5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s12, s12, s13 +; GFX9-NEXT: s_mul_i32 s4, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s5, s10, s14 +; GFX9-NEXT: s_add_i32 s4, s5, s4 +; GFX9-NEXT: s_mul_i32 s11, s11, s14 +; GFX9-NEXT: s_add_i32 s4, s4, s11 +; GFX9-NEXT: s_mul_i32 s10, s10, s14 +; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10 +; GFX9-NEXT: s_mul_i32 s13, s12, s10 +; GFX9-NEXT: s_mul_i32 s16, s14, s4 +; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10 +; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4 +; GFX9-NEXT: s_add_u32 s10, s10, s16 +; GFX9-NEXT: s_addc_u32 s15, 0, s15 +; GFX9-NEXT: s_add_u32 s10, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4 +; GFX9-NEXT: s_addc_u32 s10, s15, s11 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_mul_i32 s4, s12, s4 +; GFX9-NEXT: s_add_u32 s4, s10, s4 +; GFX9-NEXT: s_addc_u32 s10, 0, s5 +; GFX9-NEXT: s_add_u32 s11, s14, s4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s10, s12, s10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s1, s11, s4 -; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s13, v1 -; GFX9-NEXT: s_mul_i32 s1, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s13 -; GFX9-NEXT: s_mul_hi_u32 s0, s10, s12 -; GFX9-NEXT: s_add_u32 s1, s14, s1 -; GFX9-NEXT: s_addc_u32 s0, 0, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s11, s13 -; GFX9-NEXT: s_mul_i32 s13, s11, s13 -; GFX9-NEXT: s_add_u32 s1, s1, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s11, s12 -; GFX9-NEXT: s_addc_u32 s0, s0, s15 -; GFX9-NEXT: s_addc_u32 s1, s14, 0 -; GFX9-NEXT: s_mul_i32 s12, s11, s12 -; GFX9-NEXT: s_add_u32 s12, s0, s12 -; GFX9-NEXT: s_addc_u32 s13, 0, s1 -; GFX9-NEXT: s_mul_i32 s0, s6, s13 -; GFX9-NEXT: s_mul_hi_u32 s1, s6, s12 -; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s7, s12 -; GFX9-NEXT: s_add_i32 s14, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s6, s12 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_sub_i32 s0, s11, s14 -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s10, v1 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s10, s0, s7 -; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s6, v1 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s10, s10, 0 -; GFX9-NEXT: s_cmp_ge_u32 s10, s7 -; GFX9-NEXT: s_cselect_b32 s15, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v2 -; GFX9-NEXT: s_cmp_eq_u32 s10, s7 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s15 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s12, 1 -; GFX9-NEXT: s_addc_u32 s10, s13, 0 -; GFX9-NEXT: s_add_u32 s1, s12, 2 -; GFX9-NEXT: s_addc_u32 s15, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v3, s0 -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX9-NEXT: 
v_cndmask_b32_e64 v2, v3, v4, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-NEXT: v_mov_b32_e32 v4, s15 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; GFX9-NEXT: s_subb_u32 s0, s11, s14 -; GFX9-NEXT: s_cmp_ge_u32 s0, s7 -; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 -; GFX9-NEXT: s_cmp_eq_u32 s0, s7 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s12 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3] -; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 -; GFX9-NEXT: v_xor_b32_e32 v3, s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9] +; GFX9-NEXT: s_addc_u32 s3, s3, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_mul_i32 s13, s2, s10 +; GFX9-NEXT: s_mul_hi_u32 s14, s2, s11 +; GFX9-NEXT: s_mul_hi_u32 s12, s2, s10 +; GFX9-NEXT: s_add_u32 s13, s14, s13 +; GFX9-NEXT: s_addc_u32 s12, 0, s12 +; GFX9-NEXT: s_mul_hi_u32 s15, s3, s11 +; GFX9-NEXT: s_mul_i32 s11, s3, s11 +; GFX9-NEXT: s_add_u32 s11, s13, s11 +; GFX9-NEXT: s_mul_hi_u32 s14, s3, s10 +; GFX9-NEXT: s_addc_u32 s11, s12, s15 +; GFX9-NEXT: s_addc_u32 s12, s14, 0 +; GFX9-NEXT: s_mul_i32 s10, s3, s10 +; GFX9-NEXT: s_add_u32 s14, s11, s10 +; GFX9-NEXT: s_addc_u32 s15, 0, s12 +; GFX9-NEXT: s_mul_i32 s10, s8, s15 +; GFX9-NEXT: s_mul_hi_u32 s11, s8, s14 +; GFX9-NEXT: s_add_i32 s10, s11, s10 +; GFX9-NEXT: s_mul_i32 s11, s9, s14 +; GFX9-NEXT: s_add_i32 s16, s10, s11 +; GFX9-NEXT: s_sub_i32 s12, s3, s16 +; GFX9-NEXT: s_mul_i32 s10, s8, s14 +; GFX9-NEXT: s_sub_u32 s2, s2, s10 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s17, s12, s9 +; GFX9-NEXT: s_sub_u32 s18, s2, s8 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GFX9-NEXT: s_subb_u32 s12, s17, 0 +; GFX9-NEXT: s_cmp_ge_u32 s12, s9 +; GFX9-NEXT: s_cselect_b32 s13, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s18, s8 +; GFX9-NEXT: s_cselect_b32 s17, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s12, s9 +; GFX9-NEXT: s_cselect_b32 s12, s17, s13 +; GFX9-NEXT: s_add_u32 s13, s14, 1 +; GFX9-NEXT: s_addc_u32 s17, s15, 0 +; GFX9-NEXT: s_add_u32 s18, s14, 2 +; GFX9-NEXT: s_addc_u32 s19, s15, 0 +; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cselect_b32 s12, s18, s13 +; GFX9-NEXT: s_cselect_b32 s13, s19, s17 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s3, s3, s16 +; GFX9-NEXT: s_cmp_ge_u32 s3, s9 +; GFX9-NEXT: s_cselect_b32 s10, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s8 +; GFX9-NEXT: s_cselect_b32 s2, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s3, s9 +; GFX9-NEXT: s_cselect_b32 s2, s2, s10 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s3, s13, s15 +; GFX9-NEXT: s_cselect_b32 s2, s12, s14 +; GFX9-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_sub_u32 s2, s2, s4 +; GFX9-NEXT: s_subb_u32 s3, s3, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i64 4096, %y %r = sdiv i64 %x, %shl.y @@ -8276,276 
+8289,343 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s12 -; GFX6-NEXT: s_lshl_b64 s[14:15], 0x1000, s14 -; GFX6-NEXT: s_ashr_i32 s12, s1, 31 -; GFX6-NEXT: s_add_u32 s0, s0, s12 -; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: s_addc_u32 s1, s1, s12 -; GFX6-NEXT: s_xor_b64 s[2:3], s[0:1], s[12:13] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX6-NEXT: s_sub_u32 s0, 0, s2 -; GFX6-NEXT: s_subb_u32 s1, 0, s3 -; GFX6-NEXT: s_ashr_i32 s16, s9, 31 +; GFX6-NEXT: s_lshl_b64 s[6:7], 0x1000, s12 +; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s14 +; GFX6-NEXT: s_ashr_i32 s2, s7, 31 +; GFX6-NEXT: s_add_u32 s6, s6, s2 +; GFX6-NEXT: s_mov_b32 s3, s2 +; GFX6-NEXT: s_addc_u32 s7, s7, s2 +; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX6-NEXT: s_sub_u32 s14, 0, s6 +; GFX6-NEXT: s_subb_u32 s15, 0, s7 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s17, s16 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v4 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: s_add_u32 s0, s8, s16 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_addc_u32 s1, s9, s16 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: 
s_xor_b64 s[8:9], s[0:1], s[16:17] -; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0 -; GFX6-NEXT: v_mul_hi_u32 v4, s8, v1 -; GFX6-NEXT: v_mul_hi_u32 v5, s9, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s9, v1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s9, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, s14, v0 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: v_readfirstlane_b32 s12, v0 +; GFX6-NEXT: s_mul_i32 s13, s14, s16 +; GFX6-NEXT: v_readfirstlane_b32 s19, v2 +; GFX6-NEXT: s_mul_i32 s17, s15, s12 +; GFX6-NEXT: s_mul_i32 s18, s14, s12 +; GFX6-NEXT: s_add_i32 s13, s19, s13 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s18 +; GFX6-NEXT: s_add_i32 s13, s13, s17 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, s18 +; GFX6-NEXT: v_readfirstlane_b32 s17, v3 +; GFX6-NEXT: s_mul_i32 s20, s12, s13 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13 +; GFX6-NEXT: s_add_u32 s17, s17, s20 +; GFX6-NEXT: v_readfirstlane_b32 s20, v0 +; GFX6-NEXT: s_mul_i32 s18, s16, s18 +; GFX6-NEXT: s_addc_u32 s20, 0, s20 +; GFX6-NEXT: v_readfirstlane_b32 s19, v4 +; GFX6-NEXT: s_add_u32 s17, s17, s18 +; GFX6-NEXT: s_addc_u32 s17, s20, s19 +; GFX6-NEXT: v_readfirstlane_b32 s18, v1 +; GFX6-NEXT: s_addc_u32 s18, s18, 0 +; GFX6-NEXT: s_mul_i32 s13, s16, s13 +; GFX6-NEXT: s_add_u32 s13, s17, s13 +; GFX6-NEXT: s_addc_u32 s17, 0, s18 +; GFX6-NEXT: s_add_u32 s18, s12, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s18 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_addc_u32 s16, s16, s17 +; GFX6-NEXT: s_mul_i32 s12, s14, s16 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 +; GFX6-NEXT: s_add_i32 s12, s13, s12 +; GFX6-NEXT: s_mul_i32 s15, s15, s18 +; GFX6-NEXT: s_mul_i32 s13, s14, s18 +; GFX6-NEXT: s_add_i32 s12, s12, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mul_hi_u32 v3, s16, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s18, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s16, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s18, v0 +; GFX6-NEXT: s_mul_i32 s15, s18, s12 +; GFX6-NEXT: v_readfirstlane_b32 s19, v2 +; GFX6-NEXT: s_add_u32 s15, s19, s15 +; GFX6-NEXT: v_readfirstlane_b32 s17, v0 +; GFX6-NEXT: s_mul_i32 s13, s16, s13 +; GFX6-NEXT: s_addc_u32 s17, 0, s17 +; GFX6-NEXT: v_readfirstlane_b32 s14, v3 +; GFX6-NEXT: s_add_u32 s13, s15, s13 +; GFX6-NEXT: s_addc_u32 s13, s17, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_addc_u32 s14, s14, 0 +; GFX6-NEXT: s_mul_i32 s12, s16, s12 +; GFX6-NEXT: s_add_u32 s12, s13, s12 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: s_add_u32 s15, s18, s12 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_addc_u32 s14, s16, s14 +; GFX6-NEXT: s_ashr_i32 s12, s9, 31 +; GFX6-NEXT: s_add_u32 s8, s8, s12 +; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: s_addc_u32 s9, s9, s12 +; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, s15 +; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2 +; GFX6-NEXT: s_mul_i32 s16, s8, s14 +; GFX6-NEXT: v_readfirstlane_b32 s17, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2 +; GFX6-NEXT: v_readfirstlane_b32 s18, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; 
GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 -; GFX6-NEXT: v_mov_b32_e32 v5, s3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s9, v2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 -; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 -; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 -; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 -; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] -; GFX6-NEXT: s_xor_b64 s[0:1], s[16:17], s[12:13] -; GFX6-NEXT: s_ashr_i32 s8, s15, 31 -; GFX6-NEXT: s_add_u32 s12, s14, s8 -; GFX6-NEXT: v_mov_b32_e32 v6, s9 -; GFX6-NEXT: s_mov_b32 s9, s8 -; GFX6-NEXT: s_addc_u32 s13, s15, s8 -; GFX6-NEXT: s_xor_b64 s[12:13], s[12:13], s[8:9] -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s12 -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s13 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 -; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 -; GFX6-NEXT: v_rcp_f32_e32 v6, v6 -; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v6 -; GFX6-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: s_sub_u32 s2, 0, s12 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 -; GFX6-NEXT: s_subb_u32 s3, 0, s13 -; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 -; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, v2, v4 -; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v8, v2, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, v3, v4 -; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc -; GFX6-NEXT: v_mul_lo_u32 v8, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v5, v3, v5 -; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: 
v_mul_lo_u32 v4, s2, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 -; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4 -; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: s_ashr_i32 s2, s11, 31 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: s_add_u32 s10, s10, s2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: s_mov_b32 s3, s2 -; GFX6-NEXT: s_addc_u32 s11, s11, s2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: s_xor_b64 s[10:11], s[10:11], s[2:3] -; GFX6-NEXT: v_mul_lo_u32 v4, s10, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s10, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, s10, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, s11, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s11, v3 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v7, s11, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, s1 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s11, v4 -; GFX6-NEXT: v_mov_b32_e32 v7, s13 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s10, v5 -; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc -; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 -; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 -; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 -; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v2 -; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 2, v2 -; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX6-NEXT: v_cndmask_b32_e64 v6, v7, v9, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v7, v8, v10, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v8, s11 -; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX6-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] -; GFX6-NEXT: v_cndmask_b32_e32 v3, 
v3, v7, vcc -; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 -; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, s1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX6-NEXT: s_add_u32 s16, s18, s16 +; GFX6-NEXT: s_addc_u32 s17, 0, s17 +; GFX6-NEXT: s_mul_i32 s15, s9, s15 +; GFX6-NEXT: v_readfirstlane_b32 s18, v1 +; GFX6-NEXT: s_add_u32 s15, s16, s15 +; GFX6-NEXT: s_addc_u32 s15, s17, s18 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: s_addc_u32 s16, s16, 0 +; GFX6-NEXT: s_mul_i32 s14, s9, s14 +; GFX6-NEXT: s_add_u32 s17, s15, s14 +; GFX6-NEXT: v_mov_b32_e32 v0, s17 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_mul_i32 s14, s6, s16 +; GFX6-NEXT: v_readfirstlane_b32 s15, v0 +; GFX6-NEXT: s_add_i32 s14, s15, s14 +; GFX6-NEXT: s_mul_i32 s15, s7, s17 +; GFX6-NEXT: s_add_i32 s18, s14, s15 +; GFX6-NEXT: s_sub_i32 s19, s9, s18 +; GFX6-NEXT: s_mul_i32 s14, s6, s17 +; GFX6-NEXT: s_sub_u32 s8, s8, s14 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s20, s14, s15 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_subb_u32 s19, s19, s7 +; GFX6-NEXT: s_sub_u32 s21, s8, s6 +; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX6-NEXT: s_or_b32 s14, s14, s15 +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_subb_u32 s14, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s14, s7 +; GFX6-NEXT: s_cselect_b32 s15, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s21, s6 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s14, s7 +; GFX6-NEXT: s_cselect_b32 s14, s19, s15 +; GFX6-NEXT: s_add_u32 s15, s17, 1 +; GFX6-NEXT: s_addc_u32 s19, s16, 0 +; GFX6-NEXT: s_add_u32 s21, s17, 2 +; GFX6-NEXT: s_addc_u32 s22, s16, 0 +; GFX6-NEXT: s_cmp_lg_u32 s14, 0 +; GFX6-NEXT: s_cselect_b32 s14, s21, s15 +; GFX6-NEXT: s_cselect_b32 s15, s22, s19 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_subb_u32 s9, s9, s18 +; GFX6-NEXT: s_cmp_ge_u32 s9, s7 +; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s8, s6 +; GFX6-NEXT: s_cselect_b32 s6, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s9, s7 +; GFX6-NEXT: s_cselect_b32 s6, s6, s18 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_cselect_b32 s7, s15, s16 +; GFX6-NEXT: s_cselect_b32 s6, s14, s17 +; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] +; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] +; GFX6-NEXT: s_sub_u32 s14, s6, s2 +; GFX6-NEXT: s_subb_u32 s15, s7, s3 +; GFX6-NEXT: s_ashr_i32 s6, s1, 31 +; GFX6-NEXT: s_add_u32 s0, s0, s6 +; GFX6-NEXT: s_mov_b32 s7, s6 +; GFX6-NEXT: s_addc_u32 s1, s1, s6 +; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GFX6-NEXT: s_sub_u32 s12, 0, s8 +; GFX6-NEXT: s_subb_u32 s13, 0, s9 +; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s1, s12, s16 +; GFX6-NEXT: v_readfirstlane_b32 s3, v2 +; GFX6-NEXT: s_mul_i32 s0, s13, s2 +; GFX6-NEXT: s_add_i32 s1, s3, s1 +; GFX6-NEXT: s_add_i32 s3, s1, s0 +; GFX6-NEXT: s_mul_i32 s17, s12, s2 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 +; GFX6-NEXT: 
v_mul_hi_u32 v0, v0, s17 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mul_i32 s4, s2, s3 +; GFX6-NEXT: v_readfirstlane_b32 s5, v2 +; GFX6-NEXT: v_readfirstlane_b32 s18, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s17 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 +; GFX6-NEXT: s_add_u32 s4, s18, s4 +; GFX6-NEXT: s_addc_u32 s5, 0, s5 +; GFX6-NEXT: s_mul_i32 s17, s16, s17 +; GFX6-NEXT: v_readfirstlane_b32 s18, v0 +; GFX6-NEXT: s_add_u32 s4, s4, s17 +; GFX6-NEXT: s_addc_u32 s4, s5, s18 +; GFX6-NEXT: v_readfirstlane_b32 s5, v1 +; GFX6-NEXT: s_addc_u32 s5, s5, 0 +; GFX6-NEXT: s_mul_i32 s3, s16, s3 +; GFX6-NEXT: s_add_u32 s3, s4, s3 +; GFX6-NEXT: s_addc_u32 s4, 0, s5 +; GFX6-NEXT: s_add_u32 s5, s2, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_addc_u32 s4, s16, s4 +; GFX6-NEXT: s_mul_i32 s2, s12, s4 +; GFX6-NEXT: v_readfirstlane_b32 s3, v0 +; GFX6-NEXT: s_add_i32 s2, s3, s2 +; GFX6-NEXT: s_mul_i32 s13, s13, s5 +; GFX6-NEXT: s_mul_i32 s3, s12, s5 +; GFX6-NEXT: s_add_i32 s2, s2, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mul_hi_u32 v3, s4, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: s_mul_i32 s13, s5, s2 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s13, s17, s13 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: s_mul_i32 s3, s4, s3 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: v_readfirstlane_b32 s12, v3 +; GFX6-NEXT: s_add_u32 s3, s13, s3 +; GFX6-NEXT: s_addc_u32 s3, s16, s12 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: s_mul_i32 s2, s4, s2 +; GFX6-NEXT: s_add_u32 s2, s3, s2 +; GFX6-NEXT: s_addc_u32 s12, 0, s12 +; GFX6-NEXT: s_add_u32 s13, s5, s2 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_addc_u32 s12, s4, s12 +; GFX6-NEXT: s_ashr_i32 s4, s11, 31 +; GFX6-NEXT: s_add_u32 s2, s10, s4 +; GFX6-NEXT: s_mov_b32 s5, s4 +; GFX6-NEXT: s_addc_u32 s3, s11, s4 +; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 +; GFX6-NEXT: s_mul_i32 s2, s10, s12 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2 +; GFX6-NEXT: v_readfirstlane_b32 s17, v3 +; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 +; GFX6-NEXT: s_add_u32 s2, s17, s2 +; GFX6-NEXT: s_addc_u32 s16, 0, s16 +; GFX6-NEXT: s_mul_i32 s13, s11, s13 +; GFX6-NEXT: v_readfirstlane_b32 s17, v1 +; GFX6-NEXT: s_add_u32 s2, s2, s13 +; GFX6-NEXT: s_addc_u32 s2, s16, s17 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 +; GFX6-NEXT: s_addc_u32 s13, s13, 0 +; GFX6-NEXT: s_mul_i32 s12, s11, s12 +; GFX6-NEXT: s_add_u32 s16, s2, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: s_addc_u32 s17, 0, s13 +; GFX6-NEXT: s_mul_i32 s12, s8, s17 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 +; GFX6-NEXT: s_add_i32 s12, s13, s12 +; GFX6-NEXT: s_mul_i32 s13, s9, s16 +; GFX6-NEXT: s_add_i32 s18, s12, s13 +; GFX6-NEXT: s_sub_i32 s19, s11, s18 +; GFX6-NEXT: s_mul_i32 s12, s8, s16 +; GFX6-NEXT: s_sub_u32 s10, s10, s12 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s20, s12, s13 +; GFX6-NEXT: 
s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_subb_u32 s19, s19, s9 +; GFX6-NEXT: s_sub_u32 s21, s10, s8 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s12, s19, 0 +; GFX6-NEXT: s_cmp_ge_u32 s12, s9 +; GFX6-NEXT: s_cselect_b32 s13, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s21, s8 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s12, s9 +; GFX6-NEXT: s_cselect_b32 s12, s19, s13 +; GFX6-NEXT: s_add_u32 s13, s16, 1 +; GFX6-NEXT: s_addc_u32 s19, s17, 0 +; GFX6-NEXT: s_add_u32 s21, s16, 2 +; GFX6-NEXT: s_addc_u32 s22, s17, 0 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_cselect_b32 s12, s21, s13 +; GFX6-NEXT: s_cselect_b32 s13, s22, s19 +; GFX6-NEXT: s_cmp_lg_u32 s20, 0 +; GFX6-NEXT: s_subb_u32 s11, s11, s18 +; GFX6-NEXT: s_cmp_ge_u32 s11, s9 +; GFX6-NEXT: s_cselect_b32 s18, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s10, s8 +; GFX6-NEXT: s_cselect_b32 s8, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s11, s9 +; GFX6-NEXT: s_cselect_b32 s8, s8, s18 +; GFX6-NEXT: s_cmp_lg_u32 s8, 0 +; GFX6-NEXT: s_cselect_b32 s9, s13, s17 +; GFX6-NEXT: s_cselect_b32 s8, s12, s16 +; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] +; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5] +; GFX6-NEXT: s_sub_u32 s4, s6, s4 +; GFX6-NEXT: s_subb_u32 s5, s7, s5 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: v_mov_b32_e32 v1, s15 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s12 -; GFX9-NEXT: s_lshl_b64 s[6:7], 0x1000, s14 -; GFX9-NEXT: s_ashr_i32 s12, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s12 -; GFX9-NEXT: s_mov_b32 s13, s12 -; GFX9-NEXT: s_addc_u32 s1, s1, s12 -; GFX9-NEXT: s_xor_b64 s[14:15], s[0:1], s[12:13] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX9-NEXT: s_sub_u32 s0, 0, s14 -; GFX9-NEXT: s_subb_u32 s1, 0, s15 +; GFX9-NEXT: s_lshl_b64 s[6:7], 0x1000, s12 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s14 +; GFX9-NEXT: s_ashr_i32 s2, s7, 31 +; GFX9-NEXT: s_add_u32 s6, s6, s2 +; GFX9-NEXT: s_mov_b32 s3, s2 +; GFX9-NEXT: s_addc_u32 s7, s7, s2 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_sub_u32 s14, 0, s6 +; GFX9-NEXT: s_subb_u32 s15, 0, s7 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -8554,270 +8634,255 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_readfirstlane_b32 s16, v1 +; GFX9-NEXT: v_readfirstlane_b32 s12, v0 +; GFX9-NEXT: s_mul_i32 s13, s14, s16 +; GFX9-NEXT: s_mul_hi_u32 s18, s14, s12 +; GFX9-NEXT: s_mul_i32 s17, s15, s12 +; GFX9-NEXT: s_add_i32 s13, s18, s13 +; GFX9-NEXT: s_mul_i32 s19, s14, s12 +; GFX9-NEXT: s_add_i32 s13, s13, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s12, s19 +; GFX9-NEXT: s_mul_i32 s20, s12, s13 +; GFX9-NEXT: s_mul_hi_u32 s17, s12, s13 +; GFX9-NEXT: s_add_u32 s18, s18, s20 +; GFX9-NEXT: s_addc_u32 s17, 0, s17 +; 
GFX9-NEXT: s_mul_hi_u32 s20, s16, s19 +; GFX9-NEXT: s_mul_i32 s19, s16, s19 +; GFX9-NEXT: s_add_u32 s18, s18, s19 +; GFX9-NEXT: s_mul_hi_u32 s21, s16, s13 +; GFX9-NEXT: s_addc_u32 s17, s17, s20 +; GFX9-NEXT: s_addc_u32 s18, s21, 0 +; GFX9-NEXT: s_mul_i32 s13, s16, s13 +; GFX9-NEXT: s_add_u32 s13, s17, s13 +; GFX9-NEXT: s_addc_u32 s17, 0, s18 +; GFX9-NEXT: s_add_u32 s18, s12, s13 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GFX9-NEXT: s_addc_u32 s16, s16, s17 +; GFX9-NEXT: s_mul_i32 s12, s14, s16 +; GFX9-NEXT: s_mul_hi_u32 s13, s14, s18 +; GFX9-NEXT: s_add_i32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s15, s15, s18 +; GFX9-NEXT: s_add_i32 s12, s12, s15 +; GFX9-NEXT: s_mul_i32 s14, s14, s18 +; GFX9-NEXT: s_mul_hi_u32 s15, s16, s14 +; GFX9-NEXT: s_mul_i32 s17, s16, s14 +; GFX9-NEXT: s_mul_i32 s20, s18, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s18, s14 +; GFX9-NEXT: s_mul_hi_u32 s19, s18, s12 +; GFX9-NEXT: s_add_u32 s14, s14, s20 +; GFX9-NEXT: s_addc_u32 s19, 0, s19 +; GFX9-NEXT: s_add_u32 s14, s14, s17 +; GFX9-NEXT: s_mul_hi_u32 s13, s16, s12 +; GFX9-NEXT: s_addc_u32 s14, s19, s15 +; GFX9-NEXT: s_addc_u32 s13, s13, 0 +; GFX9-NEXT: s_mul_i32 s12, s16, s12 +; GFX9-NEXT: s_add_u32 s12, s14, s12 +; GFX9-NEXT: s_addc_u32 s14, 0, s13 +; GFX9-NEXT: s_add_u32 s15, s18, s12 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GFX9-NEXT: s_addc_u32 s14, s16, s14 +; GFX9-NEXT: s_ashr_i32 s12, s9, 31 +; GFX9-NEXT: s_add_u32 s8, s8, s12 +; GFX9-NEXT: s_mov_b32 s13, s12 +; GFX9-NEXT: s_addc_u32 s9, s9, s12 +; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] +; GFX9-NEXT: s_mul_i32 s17, s8, s14 +; GFX9-NEXT: s_mul_hi_u32 s18, s8, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s8, s14 +; GFX9-NEXT: s_add_u32 s17, s18, s17 +; GFX9-NEXT: s_addc_u32 s16, 0, s16 +; GFX9-NEXT: s_mul_hi_u32 s19, s9, s15 +; GFX9-NEXT: s_mul_i32 s15, s9, s15 +; GFX9-NEXT: s_add_u32 s15, s17, s15 +; GFX9-NEXT: s_mul_hi_u32 s18, s9, s14 +; GFX9-NEXT: s_addc_u32 s15, s16, s19 +; GFX9-NEXT: s_addc_u32 s16, s18, 0 +; GFX9-NEXT: s_mul_i32 s14, s9, s14 +; GFX9-NEXT: s_add_u32 s18, s15, s14 +; GFX9-NEXT: s_addc_u32 s19, 0, s16 +; GFX9-NEXT: s_mul_i32 s14, s6, s19 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s18 +; GFX9-NEXT: s_add_i32 s14, s15, s14 +; GFX9-NEXT: s_mul_i32 s15, s7, s18 +; GFX9-NEXT: s_add_i32 s20, s14, s15 +; GFX9-NEXT: s_sub_i32 s16, s9, s20 +; GFX9-NEXT: s_mul_i32 s14, s6, s18 +; GFX9-NEXT: s_sub_u32 s8, s8, s14 +; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GFX9-NEXT: s_subb_u32 s21, s16, s7 +; GFX9-NEXT: s_sub_u32 s22, s8, s6 +; GFX9-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GFX9-NEXT: s_subb_u32 s16, s21, 0 +; GFX9-NEXT: s_cmp_ge_u32 s16, s7 +; GFX9-NEXT: s_cselect_b32 s17, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s22, s6 +; GFX9-NEXT: s_cselect_b32 s21, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s16, s7 +; GFX9-NEXT: s_cselect_b32 s16, s21, s17 +; GFX9-NEXT: s_add_u32 s17, s18, 1 +; GFX9-NEXT: s_addc_u32 s21, s19, 0 +; GFX9-NEXT: s_add_u32 s22, s18, 2 +; GFX9-NEXT: s_addc_u32 s23, s19, 0 +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: s_cselect_b32 s16, s22, s17 +; GFX9-NEXT: s_cselect_b32 s17, s23, s21 +; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GFX9-NEXT: s_subb_u32 s9, s9, s20 +; GFX9-NEXT: s_cmp_ge_u32 s9, s7 +; GFX9-NEXT: s_cselect_b32 s14, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s8, s6 +; GFX9-NEXT: s_cselect_b32 s6, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s9, s7 +; GFX9-NEXT: s_cselect_b32 s6, s6, s14 +; GFX9-NEXT: s_cmp_lg_u32 
s6, 0 +; GFX9-NEXT: s_cselect_b32 s7, s17, s19 +; GFX9-NEXT: s_cselect_b32 s6, s16, s18 +; GFX9-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3] +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] +; GFX9-NEXT: s_sub_u32 s14, s6, s2 +; GFX9-NEXT: s_subb_u32 s15, s7, s3 +; GFX9-NEXT: s_ashr_i32 s2, s1, 31 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_mov_b32 s3, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s2 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_sub_u32 s8, 0, s6 +; GFX9-NEXT: s_subb_u32 s9, 0, s7 +; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s16, s0, s4 -; GFX9-NEXT: s_mul_hi_u32 s18, s0, s5 -; GFX9-NEXT: s_mul_i32 s17, s1, s5 -; GFX9-NEXT: s_add_i32 s16, s18, s16 -; GFX9-NEXT: s_mul_i32 s19, s0, s5 -; GFX9-NEXT: s_add_i32 s16, s16, s17 -; GFX9-NEXT: s_mul_hi_u32 s17, s5, s16 -; GFX9-NEXT: s_mul_i32 s18, s5, s16 -; GFX9-NEXT: s_mul_hi_u32 s5, s5, s19 -; GFX9-NEXT: s_add_u32 s5, s5, s18 +; GFX9-NEXT: v_readfirstlane_b32 s13, v2 +; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4 +; GFX9-NEXT: s_mul_i32 s16, s8, s13 +; GFX9-NEXT: s_mul_i32 s5, s9, s4 +; GFX9-NEXT: s_add_i32 s12, s12, s16 +; GFX9-NEXT: s_add_i32 s12, s12, s5 +; GFX9-NEXT: s_mul_i32 s17, s8, s4 +; GFX9-NEXT: s_mul_i32 s16, s4, s12 +; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s12 +; GFX9-NEXT: s_add_u32 s16, s18, s16 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_mul_hi_u32 s19, s13, s17 +; GFX9-NEXT: s_mul_i32 s17, s13, s17 +; GFX9-NEXT: s_add_u32 s16, s16, s17 +; GFX9-NEXT: s_mul_hi_u32 s18, s13, s12 +; GFX9-NEXT: s_addc_u32 s5, s5, s19 +; GFX9-NEXT: s_addc_u32 s16, s18, 0 +; GFX9-NEXT: s_mul_i32 s12, s13, s12 +; GFX9-NEXT: s_add_u32 s5, s5, s12 +; GFX9-NEXT: s_addc_u32 s12, 0, s16 +; GFX9-NEXT: s_add_u32 s16, s4, s5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s12, s13, s12 +; GFX9-NEXT: s_mul_i32 s4, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s16 +; GFX9-NEXT: s_add_i32 s4, s5, s4 +; GFX9-NEXT: s_mul_i32 s9, s9, s16 +; GFX9-NEXT: s_add_i32 s4, s4, s9 +; GFX9-NEXT: s_mul_i32 s8, s8, s16 +; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8 +; GFX9-NEXT: s_mul_i32 s13, s12, s8 +; GFX9-NEXT: s_mul_i32 s18, s16, s4 +; GFX9-NEXT: s_mul_hi_u32 s8, s16, s8 +; GFX9-NEXT: s_mul_hi_u32 s17, s16, s4 +; GFX9-NEXT: s_add_u32 s8, s8, s18 ; GFX9-NEXT: s_addc_u32 s17, 0, s17 -; GFX9-NEXT: s_mul_hi_u32 s20, s4, s19 -; GFX9-NEXT: s_mul_i32 s19, s4, s19 -; GFX9-NEXT: s_add_u32 s5, s5, s19 -; GFX9-NEXT: s_mul_hi_u32 s18, s4, s16 -; GFX9-NEXT: s_addc_u32 s5, s17, s20 -; GFX9-NEXT: s_addc_u32 s17, s18, 0 -; GFX9-NEXT: s_mul_i32 s16, s4, s16 -; GFX9-NEXT: s_add_u32 s5, s5, s16 -; GFX9-NEXT: s_addc_u32 s16, 0, s17 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s4, s4, s16 -; GFX9-NEXT: v_readfirstlane_b32 s16, v0 -; GFX9-NEXT: s_mul_i32 s5, s0, s4 -; GFX9-NEXT: s_mul_hi_u32 s17, s0, s16 -; GFX9-NEXT: s_add_i32 s5, s17, s5 -; GFX9-NEXT: s_mul_i32 s1, s1, s16 -; 
GFX9-NEXT: s_add_i32 s5, s5, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, s16 -; GFX9-NEXT: s_mul_hi_u32 s17, s4, s0 -; GFX9-NEXT: s_mul_i32 s18, s4, s0 -; GFX9-NEXT: s_mul_i32 s20, s16, s5 -; GFX9-NEXT: s_mul_hi_u32 s0, s16, s0 -; GFX9-NEXT: s_mul_hi_u32 s19, s16, s5 -; GFX9-NEXT: s_add_u32 s0, s0, s20 -; GFX9-NEXT: s_addc_u32 s16, 0, s19 -; GFX9-NEXT: s_add_u32 s0, s0, s18 -; GFX9-NEXT: s_mul_hi_u32 s1, s4, s5 -; GFX9-NEXT: s_addc_u32 s0, s16, s17 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s5, s4, s5 -; GFX9-NEXT: s_add_u32 s0, s0, s5 -; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s16, s4, s1 -; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_add_u32 s0, s8, s4 +; GFX9-NEXT: s_add_u32 s8, s8, s13 +; GFX9-NEXT: s_mul_hi_u32 s5, s12, s4 +; GFX9-NEXT: s_addc_u32 s8, s17, s9 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_mul_i32 s4, s12, s4 +; GFX9-NEXT: s_add_u32 s4, s8, s4 +; GFX9-NEXT: s_addc_u32 s8, 0, s5 +; GFX9-NEXT: s_add_u32 s13, s16, s4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s12, s12, s8 +; GFX9-NEXT: s_ashr_i32 s4, s11, 31 +; GFX9-NEXT: s_add_u32 s8, s10, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s1, s9, s4 -; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s17, v0 -; GFX9-NEXT: s_mul_i32 s1, s8, s16 -; GFX9-NEXT: s_mul_hi_u32 s18, s8, s17 -; GFX9-NEXT: s_mul_hi_u32 s0, s8, s16 -; GFX9-NEXT: s_add_u32 s1, s18, s1 -; GFX9-NEXT: s_addc_u32 s0, 0, s0 -; GFX9-NEXT: s_mul_hi_u32 s19, s9, s17 -; GFX9-NEXT: s_mul_i32 s17, s9, s17 -; GFX9-NEXT: s_add_u32 s1, s1, s17 -; GFX9-NEXT: s_mul_hi_u32 s18, s9, s16 -; GFX9-NEXT: s_addc_u32 s0, s0, s19 -; GFX9-NEXT: s_addc_u32 s1, s18, 0 -; GFX9-NEXT: s_mul_i32 s16, s9, s16 -; GFX9-NEXT: s_add_u32 s16, s0, s16 -; GFX9-NEXT: s_addc_u32 s17, 0, s1 -; GFX9-NEXT: s_mul_i32 s0, s14, s17 -; GFX9-NEXT: s_mul_hi_u32 s1, s14, s16 -; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s15, s16 -; GFX9-NEXT: s_add_i32 s18, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s14, s16 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_sub_i32 s0, s9, s18 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s8, s0, s15 -; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s14, v0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s8, s8, 0 -; GFX9-NEXT: s_cmp_ge_u32 s8, s15 +; GFX9-NEXT: s_addc_u32 s9, s11, s4 +; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] +; GFX9-NEXT: s_mul_i32 s11, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13 +; GFX9-NEXT: s_mul_hi_u32 s10, s8, s12 +; GFX9-NEXT: s_add_u32 s11, s16, s11 +; GFX9-NEXT: s_addc_u32 s10, 0, s10 +; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13 +; GFX9-NEXT: s_mul_i32 s13, s9, s13 +; GFX9-NEXT: s_add_u32 s11, s11, s13 +; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12 +; GFX9-NEXT: s_addc_u32 s10, s10, s17 +; GFX9-NEXT: s_addc_u32 s11, s16, 0 +; GFX9-NEXT: s_mul_i32 s12, s9, s12 +; GFX9-NEXT: s_add_u32 s16, s10, s12 +; GFX9-NEXT: s_addc_u32 s17, 0, s11 +; GFX9-NEXT: s_mul_i32 s10, s6, s17 +; GFX9-NEXT: s_mul_hi_u32 s11, s6, s16 +; GFX9-NEXT: s_add_i32 s10, s11, s10 +; GFX9-NEXT: s_mul_i32 s11, s7, s16 +; GFX9-NEXT: s_add_i32 s18, s10, s11 +; GFX9-NEXT: s_sub_i32 s12, s9, s18 +; GFX9-NEXT: s_mul_i32 s10, s6, s16 +; GFX9-NEXT: s_sub_u32 s8, s8, s10 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s19, s12, s7 +; 
GFX9-NEXT: s_sub_u32 s20, s8, s6 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GFX9-NEXT: s_subb_u32 s12, s19, 0 +; GFX9-NEXT: s_cmp_ge_u32 s12, s7 +; GFX9-NEXT: s_cselect_b32 s13, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s20, s6 ; GFX9-NEXT: s_cselect_b32 s19, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v1 -; GFX9-NEXT: s_cmp_eq_u32 s8, s15 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s19 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s16, 1 -; GFX9-NEXT: s_addc_u32 s8, s17, 0 -; GFX9-NEXT: s_add_u32 s1, s16, 2 +; GFX9-NEXT: s_cmp_eq_u32 s12, s7 +; GFX9-NEXT: s_cselect_b32 s12, s19, s13 +; GFX9-NEXT: s_add_u32 s13, s16, 1 ; GFX9-NEXT: s_addc_u32 s19, s17, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX9-NEXT: s_subb_u32 s0, s9, s18 -; GFX9-NEXT: s_cmp_ge_u32 s0, s15 -; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v0 -; GFX9-NEXT: s_cmp_eq_u32 s0, s15 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: s_xor_b64 s[0:1], s[4:5], s[12:13] -; GFX9-NEXT: s_ashr_i32 s4, s7, 31 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: s_add_u32 s6, s6, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s7, s7, s4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 -; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX9-NEXT: v_xor_b32_e32 v5, s1, v0 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v1 -; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 -; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: s_sub_u32 s0, 0, s6 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: s_subb_u32 s1, 0, s7 -; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v6, vcc -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: v_readfirstlane_b32 s13, v3 -; GFX9-NEXT: s_mul_hi_u32 s12, s0, s8 -; GFX9-NEXT: s_mul_i32 s14, s0, s13 -; GFX9-NEXT: s_mul_i32 s9, s1, s8 -; GFX9-NEXT: s_add_i32 s12, s12, s14 -; GFX9-NEXT: s_add_i32 s12, s12, s9 -; GFX9-NEXT: s_mul_i32 s15, s0, s8 -; GFX9-NEXT: s_mul_hi_u32 s9, s8, s12 -; GFX9-NEXT: s_mul_i32 s14, s8, s12 -; GFX9-NEXT: s_mul_hi_u32 s8, s8, s15 -; GFX9-NEXT: s_add_u32 s8, s8, s14 -; GFX9-NEXT: s_addc_u32 s9, 0, s9 -; GFX9-NEXT: s_mul_hi_u32 s16, s13, s15 -; GFX9-NEXT: s_mul_i32 s15, s13, s15 -; GFX9-NEXT: s_add_u32 s8, s8, s15 -; GFX9-NEXT: s_mul_hi_u32 s14, s13, s12 -; GFX9-NEXT: s_addc_u32 s8, s9, s16 -; GFX9-NEXT: s_addc_u32 s9, s14, 0 -; GFX9-NEXT: s_mul_i32 s12, s13, s12 -; GFX9-NEXT: s_add_u32 s8, s8, s12 -; GFX9-NEXT: s_addc_u32 s9, 0, s9 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; 
GFX9-NEXT: s_addc_u32 s8, s13, s9 -; GFX9-NEXT: v_readfirstlane_b32 s12, v2 -; GFX9-NEXT: s_mul_i32 s9, s0, s8 -; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 -; GFX9-NEXT: s_add_i32 s9, s13, s9 -; GFX9-NEXT: s_mul_i32 s1, s1, s12 -; GFX9-NEXT: s_add_i32 s9, s9, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, s12 -; GFX9-NEXT: s_mul_hi_u32 s13, s8, s0 -; GFX9-NEXT: s_mul_i32 s14, s8, s0 -; GFX9-NEXT: s_mul_i32 s16, s12, s9 -; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s12, s9 -; GFX9-NEXT: s_add_u32 s0, s0, s16 -; GFX9-NEXT: s_addc_u32 s12, 0, s15 -; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_mul_hi_u32 s1, s8, s9 -; GFX9-NEXT: s_addc_u32 s0, s12, s13 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s9, s8, s9 -; GFX9-NEXT: s_add_u32 s0, s0, s9 -; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s12, s8, s1 -; GFX9-NEXT: s_ashr_i32 s8, s11, 31 -; GFX9-NEXT: s_add_u32 s0, s10, s8 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s1, s11, s8 -; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] -; GFX9-NEXT: v_readfirstlane_b32 s13, v2 -; GFX9-NEXT: s_mul_i32 s1, s10, s12 -; GFX9-NEXT: s_mul_hi_u32 s14, s10, s13 -; GFX9-NEXT: s_mul_hi_u32 s0, s10, s12 -; GFX9-NEXT: s_add_u32 s1, s14, s1 -; GFX9-NEXT: s_addc_u32 s0, 0, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s11, s13 -; GFX9-NEXT: s_mul_i32 s13, s11, s13 -; GFX9-NEXT: s_add_u32 s1, s1, s13 -; GFX9-NEXT: s_mul_hi_u32 s14, s11, s12 -; GFX9-NEXT: s_addc_u32 s0, s0, s15 -; GFX9-NEXT: s_addc_u32 s1, s14, 0 -; GFX9-NEXT: s_mul_i32 s12, s11, s12 -; GFX9-NEXT: s_add_u32 s12, s0, s12 -; GFX9-NEXT: s_addc_u32 s13, 0, s1 -; GFX9-NEXT: s_mul_i32 s0, s6, s13 -; GFX9-NEXT: s_mul_hi_u32 s1, s6, s12 -; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s7, s12 -; GFX9-NEXT: s_add_i32 s14, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s6, s12 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: s_sub_i32 s0, s11, s14 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v2 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s10, s0, s7 -; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s6, v2 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s10, s10, 0 -; GFX9-NEXT: s_cmp_ge_u32 s10, s7 -; GFX9-NEXT: s_cselect_b32 s15, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v3 -; GFX9-NEXT: s_cmp_eq_u32 s10, s7 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s12, 1 -; GFX9-NEXT: s_addc_u32 s10, s13, 0 -; GFX9-NEXT: s_add_u32 s1, s12, 2 -; GFX9-NEXT: s_addc_u32 s15, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-NEXT: v_mov_b32_e32 v6, s15 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] -; GFX9-NEXT: s_subb_u32 s0, s11, s14 -; GFX9-NEXT: s_cmp_ge_u32 s0, s7 -; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 -; GFX9-NEXT: s_cmp_eq_u32 s0, s7 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, s12 -; 
GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], s[4:5] -; GFX9-NEXT: v_xor_b32_e32 v3, s0, v3 -; GFX9-NEXT: v_xor_b32_e32 v5, s1, v2 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] +; GFX9-NEXT: s_add_u32 s20, s16, 2 +; GFX9-NEXT: s_addc_u32 s21, s17, 0 +; GFX9-NEXT: s_cmp_lg_u32 s12, 0 +; GFX9-NEXT: s_cselect_b32 s12, s20, s13 +; GFX9-NEXT: s_cselect_b32 s13, s21, s19 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s9, s9, s18 +; GFX9-NEXT: s_cmp_ge_u32 s9, s7 +; GFX9-NEXT: s_cselect_b32 s10, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s8, s6 +; GFX9-NEXT: s_cselect_b32 s6, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s9, s7 +; GFX9-NEXT: s_cselect_b32 s6, s6, s10 +; GFX9-NEXT: s_cmp_lg_u32 s6, 0 +; GFX9-NEXT: s_cselect_b32 s7, s13, s17 +; GFX9-NEXT: s_cselect_b32 s6, s12, s16 +; GFX9-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] +; GFX9-NEXT: s_xor_b64 s[4:5], s[6:7], s[2:3] +; GFX9-NEXT: s_sub_u32 s2, s4, s2 +; GFX9-NEXT: s_subb_u32 s3, s5, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y %r = sdiv <2 x i64> %x, %shl.y @@ -8983,8 +9048,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-LABEL: srem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 ; GFX6-NEXT: s_ashr_i32 s2, s1, 31 @@ -8994,130 +9058,167 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX6-NEXT: s_sub_u32 s4, 0, s8 -; GFX6-NEXT: s_subb_u32 s5, 0, s9 +; GFX6-NEXT: s_sub_u32 s10, 0, s8 +; GFX6-NEXT: s_subb_u32 s11, 0, s9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s10, s3, 31 -; GFX6-NEXT: s_add_u32 s2, s2, s10 -; GFX6-NEXT: s_mov_b32 s11, s10 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_addc_u32 s3, s3, s10 -; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] -; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, 
vcc, v3, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 -; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 -; GFX6-NEXT: v_mul_hi_u32 v4, s12, v1 -; GFX6-NEXT: v_mul_hi_u32 v5, s13, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s13, v1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 -; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, s8, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 -; GFX6-NEXT: v_mul_lo_u32 v3, s9, v0 -; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s13, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 -; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc -; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 -; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v4 -; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v5 -; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s8, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] -; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v4, s13 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0 -; 
GFX6-NEXT: v_xor_b32_e32 v1, s10, v1 -; GFX6-NEXT: v_mov_b32_e32 v2, s10 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, s10, v0 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: v_readfirstlane_b32 s0, v0 +; GFX6-NEXT: s_mul_i32 s1, s10, s12 +; GFX6-NEXT: v_readfirstlane_b32 s15, v2 +; GFX6-NEXT: s_mul_i32 s13, s11, s0 +; GFX6-NEXT: s_mul_i32 s14, s10, s0 +; GFX6-NEXT: s_add_i32 s1, s15, s1 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s14 +; GFX6-NEXT: s_add_i32 s1, s1, s13 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, s14 +; GFX6-NEXT: v_readfirstlane_b32 s13, v3 +; GFX6-NEXT: s_mul_i32 s15, s0, s1 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1 +; GFX6-NEXT: s_add_u32 s13, s13, s15 +; GFX6-NEXT: v_readfirstlane_b32 s15, v0 +; GFX6-NEXT: s_addc_u32 s15, 0, s15 +; GFX6-NEXT: s_mul_i32 s14, s12, s14 +; GFX6-NEXT: v_readfirstlane_b32 s16, v4 +; GFX6-NEXT: s_add_u32 s13, s13, s14 +; GFX6-NEXT: s_addc_u32 s13, s15, s16 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_addc_u32 s14, s14, 0 +; GFX6-NEXT: s_mul_i32 s1, s12, s1 +; GFX6-NEXT: s_add_u32 s1, s13, s1 +; GFX6-NEXT: s_addc_u32 s13, 0, s14 +; GFX6-NEXT: s_add_u32 s14, s0, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_cmp_lg_u32 s0, 0 +; GFX6-NEXT: s_addc_u32 s12, s12, s13 +; GFX6-NEXT: s_mul_i32 s0, s10, s12 +; GFX6-NEXT: v_readfirstlane_b32 s1, v0 +; GFX6-NEXT: s_add_i32 s0, s1, s0 +; GFX6-NEXT: s_mul_i32 s11, s11, s14 +; GFX6-NEXT: s_mul_i32 s1, s10, s14 +; GFX6-NEXT: s_add_i32 s0, s0, s11 +; GFX6-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s14, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0 +; GFX6-NEXT: s_mul_i32 s11, s14, s0 +; GFX6-NEXT: v_readfirstlane_b32 s15, v2 +; GFX6-NEXT: s_add_u32 s11, s15, s11 +; GFX6-NEXT: v_readfirstlane_b32 s13, v0 +; GFX6-NEXT: s_mul_i32 s1, s12, s1 +; GFX6-NEXT: s_addc_u32 s13, 0, s13 +; GFX6-NEXT: v_readfirstlane_b32 s10, v3 +; GFX6-NEXT: s_add_u32 s1, s11, s1 +; GFX6-NEXT: s_addc_u32 s1, s13, s10 +; GFX6-NEXT: v_readfirstlane_b32 s10, v1 +; GFX6-NEXT: s_addc_u32 s10, s10, 0 +; GFX6-NEXT: s_mul_i32 s0, s12, s0 +; GFX6-NEXT: s_add_u32 s0, s1, s0 +; GFX6-NEXT: s_addc_u32 s10, 0, s10 +; GFX6-NEXT: s_add_u32 s13, s14, s0 +; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_cmp_lg_u32 s0, 0 +; GFX6-NEXT: s_addc_u32 s12, s12, s10 +; GFX6-NEXT: s_ashr_i32 s10, s7, 31 +; GFX6-NEXT: s_add_u32 s0, s6, s10 +; GFX6-NEXT: s_mov_b32 s11, s10 +; GFX6-NEXT: s_addc_u32 s1, s7, s10 +; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2 +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: v_readfirstlane_b32 s4, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2 +; GFX6-NEXT: s_mul_i32 s1, s6, s12 +; GFX6-NEXT: v_readfirstlane_b32 s14, v3 +; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX6-NEXT: s_add_u32 s1, s14, s1 +; GFX6-NEXT: s_addc_u32 s4, 0, s4 +; GFX6-NEXT: s_mul_i32 s13, s7, s13 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_add_u32 s1, s1, s13 +; GFX6-NEXT: s_addc_u32 s1, s4, s14 +; GFX6-NEXT: 
v_readfirstlane_b32 s4, v0 +; GFX6-NEXT: s_addc_u32 s4, s4, 0 +; GFX6-NEXT: s_mul_i32 s12, s7, s12 +; GFX6-NEXT: s_add_u32 s12, s1, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: s_addc_u32 s4, 0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_mul_i32 s4, s8, s4 +; GFX6-NEXT: v_readfirstlane_b32 s5, v0 +; GFX6-NEXT: s_add_i32 s4, s5, s4 +; GFX6-NEXT: s_mul_i32 s5, s9, s12 +; GFX6-NEXT: s_add_i32 s13, s4, s5 +; GFX6-NEXT: s_sub_i32 s14, s7, s13 +; GFX6-NEXT: s_mul_i32 s4, s8, s12 +; GFX6-NEXT: s_sub_u32 s6, s6, s4 +; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s4, s5 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s14, s14, s9 +; GFX6-NEXT: s_sub_u32 s15, s6, s8 +; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_or_b32 s4, s4, s5 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_subb_u32 s16, s14, 0 +; GFX6-NEXT: s_cmp_ge_u32 s16, s9 +; GFX6-NEXT: s_cselect_b32 s5, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s15, s8 +; GFX6-NEXT: s_cselect_b32 s17, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s16, s9 +; GFX6-NEXT: s_cselect_b32 s17, s17, s5 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_subb_u32 s14, s14, s9 +; GFX6-NEXT: s_sub_u32 s18, s15, s8 +; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX6-NEXT: s_or_b32 s4, s4, s5 +; GFX6-NEXT: s_cmp_lg_u32 s4, 0 +; GFX6-NEXT: s_subb_u32 s4, s14, 0 +; GFX6-NEXT: s_cmp_lg_u32 s17, 0 +; GFX6-NEXT: s_cselect_b32 s14, s18, s15 +; GFX6-NEXT: s_cselect_b32 s4, s4, s16 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s5, s7, s13 +; GFX6-NEXT: s_cmp_ge_u32 s5, s9 +; GFX6-NEXT: s_cselect_b32 s7, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s6, s8 +; GFX6-NEXT: s_cselect_b32 s8, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s5, s9 +; GFX6-NEXT: s_cselect_b32 s7, s8, s7 +; GFX6-NEXT: s_cmp_lg_u32 s7, 0 +; GFX6-NEXT: s_cselect_b32 s5, s4, s5 +; GFX6-NEXT: s_cselect_b32 s4, s14, s6 +; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11] +; GFX6-NEXT: s_sub_u32 s4, s4, s10 +; GFX6-NEXT: s_subb_u32 s5, s5, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 ; GFX9-NEXT: s_ashr_i32 s2, s1, 31 @@ -9127,8 +9228,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_u32 s0, 0, s6 -; GFX9-NEXT: s_subb_u32 s1, 0, s7 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_sub_u32 s8, 0, s6 +; GFX9-NEXT: s_subb_u32 s9, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -9138,127 +9240,123 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; GFX9-NEXT: v_madmk_f32 v1, v2, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 -; GFX9-NEXT: s_mul_i32 s4, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s12, s0, s3 -; GFX9-NEXT: s_mul_i32 s5, s1, s3 -; GFX9-NEXT: s_add_i32 s4, s12, s4 -; GFX9-NEXT: s_mul_i32 s13, s0, s3 -; GFX9-NEXT: s_add_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s12, s3, s13 -; GFX9-NEXT: 
s_mul_hi_u32 s5, s3, s4 -; GFX9-NEXT: s_mul_i32 s3, s3, s4 -; GFX9-NEXT: s_add_u32 s3, s12, s3 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_mul_hi_u32 s14, s2, s13 -; GFX9-NEXT: s_mul_i32 s13, s2, s13 -; GFX9-NEXT: s_add_u32 s3, s3, s13 -; GFX9-NEXT: s_mul_hi_u32 s12, s2, s4 -; GFX9-NEXT: s_addc_u32 s3, s5, s14 -; GFX9-NEXT: s_addc_u32 s5, s12, 0 -; GFX9-NEXT: s_mul_i32 s4, s2, s4 -; GFX9-NEXT: s_add_u32 s3, s3, s4 -; GFX9-NEXT: s_addc_u32 s4, 0, s5 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s3, v1 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s2, s2, s4 +; GFX9-NEXT: v_readfirstlane_b32 s10, v2 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: s_mul_i32 s3, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s5, s0, s4 -; GFX9-NEXT: s_add_i32 s3, s5, s3 -; GFX9-NEXT: s_mul_i32 s1, s1, s4 -; GFX9-NEXT: s_add_i32 s3, s3, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, s4 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s0 -; GFX9-NEXT: s_mul_i32 s12, s2, s0 -; GFX9-NEXT: s_mul_i32 s14, s4, s3 -; GFX9-NEXT: s_mul_hi_u32 s0, s4, s0 -; GFX9-NEXT: s_mul_hi_u32 s13, s4, s3 -; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_addc_u32 s4, 0, s13 -; GFX9-NEXT: s_add_u32 s0, s0, s12 -; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 -; GFX9-NEXT: s_addc_u32 s0, s4, s5 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s3, s2, s3 -; GFX9-NEXT: s_add_u32 s0, s0, s3 -; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s2, s2, s1 -; GFX9-NEXT: s_ashr_i32 s4, s11, 31 -; GFX9-NEXT: s_add_u32 s0, s10, s4 +; GFX9-NEXT: s_mul_i32 s5, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s12, s8, s4 +; GFX9-NEXT: s_mul_i32 s11, s9, s4 +; GFX9-NEXT: s_add_i32 s5, s12, s5 +; GFX9-NEXT: s_mul_i32 s13, s8, s4 +; GFX9-NEXT: s_add_i32 s5, s5, s11 +; GFX9-NEXT: s_mul_hi_u32 s12, s4, s13 +; GFX9-NEXT: s_mul_i32 s14, s4, s5 +; GFX9-NEXT: s_mul_hi_u32 s11, s4, s5 +; GFX9-NEXT: s_add_u32 s12, s12, s14 +; GFX9-NEXT: s_addc_u32 s11, 0, s11 +; GFX9-NEXT: s_mul_hi_u32 s15, s10, s13 +; GFX9-NEXT: s_mul_i32 s13, s10, s13 +; GFX9-NEXT: s_add_u32 s12, s12, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s5 +; GFX9-NEXT: s_addc_u32 s11, s11, s15 +; GFX9-NEXT: s_addc_u32 s12, s14, 0 +; GFX9-NEXT: s_mul_i32 s5, s10, s5 +; GFX9-NEXT: s_add_u32 s5, s11, s5 +; GFX9-NEXT: s_addc_u32 s11, 0, s12 +; GFX9-NEXT: s_add_u32 s12, s4, s5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s10, s10, s11 +; GFX9-NEXT: s_mul_i32 s4, s8, s10 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s12 +; GFX9-NEXT: s_add_i32 s4, s5, s4 +; GFX9-NEXT: s_mul_i32 s9, s9, s12 +; GFX9-NEXT: s_add_i32 s4, s4, s9 +; GFX9-NEXT: s_mul_i32 s8, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s9, s10, s8 +; GFX9-NEXT: s_mul_i32 s11, s10, s8 +; GFX9-NEXT: s_mul_i32 s14, s12, s4 +; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 +; GFX9-NEXT: s_mul_hi_u32 s13, s12, s4 +; GFX9-NEXT: s_add_u32 s8, s8, s14 +; GFX9-NEXT: s_addc_u32 s13, 0, s13 +; GFX9-NEXT: s_add_u32 s8, s8, s11 +; GFX9-NEXT: s_mul_hi_u32 s5, s10, s4 +; GFX9-NEXT: s_addc_u32 s8, s13, s9 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_mul_i32 s4, s10, s4 +; GFX9-NEXT: s_add_u32 s4, s8, s4 +; GFX9-NEXT: s_addc_u32 s8, 0, s5 +; GFX9-NEXT: s_add_u32 s9, s12, s4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s8, s10, s8 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s4, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s4 ; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s1, s11, s4 -; GFX9-NEXT: 
s_xor_b64 s[10:11], s[0:1], s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s3, v1 -; GFX9-NEXT: s_mul_i32 s1, s10, s2 -; GFX9-NEXT: s_mul_hi_u32 s5, s10, s3 -; GFX9-NEXT: s_mul_hi_u32 s0, s10, s2 -; GFX9-NEXT: s_add_u32 s1, s5, s1 -; GFX9-NEXT: s_addc_u32 s0, 0, s0 -; GFX9-NEXT: s_mul_hi_u32 s12, s11, s3 -; GFX9-NEXT: s_mul_i32 s3, s11, s3 -; GFX9-NEXT: s_add_u32 s1, s1, s3 -; GFX9-NEXT: s_mul_hi_u32 s5, s11, s2 -; GFX9-NEXT: s_addc_u32 s0, s0, s12 -; GFX9-NEXT: s_addc_u32 s1, s5, 0 -; GFX9-NEXT: s_mul_i32 s2, s11, s2 -; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: s_mul_i32 s1, s6, s1 -; GFX9-NEXT: s_mul_hi_u32 s2, s6, s0 -; GFX9-NEXT: s_add_i32 s1, s2, s1 -; GFX9-NEXT: s_mul_i32 s2, s7, s0 -; GFX9-NEXT: s_mul_i32 s0, s6, s0 -; GFX9-NEXT: s_add_i32 s5, s1, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-NEXT: s_sub_i32 s1, s11, s5 -; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s10, v1 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s10, s1, s7 -; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s6, v1 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s12, s10, 0 -; GFX9-NEXT: s_cmp_ge_u32 s12, s7 -; GFX9-NEXT: s_cselect_b32 s13, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s6, v2 -; GFX9-NEXT: s_cmp_eq_u32 s12, s7 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v4, s13 -; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[2:3] -; GFX9-NEXT: s_subb_u32 s2, s10, s7 -; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s6, v2 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s2, s2, 0 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s12 -; GFX9-NEXT: v_mov_b32_e32 v4, s2 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; GFX9-NEXT: s_subb_u32 s0, s11, s5 -; GFX9-NEXT: s_cmp_ge_u32 s0, s7 -; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 -; GFX9-NEXT: s_cmp_eq_u32 s0, s7 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, s4, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s4, v1 -; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc -; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[8:9] +; GFX9-NEXT: s_addc_u32 s3, s3, s4 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_mul_i32 s11, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s12, s2, s9 +; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8 +; GFX9-NEXT: s_add_u32 s11, s12, s11 +; GFX9-NEXT: s_addc_u32 s10, 0, s10 +; GFX9-NEXT: s_mul_hi_u32 s13, s3, s9 +; GFX9-NEXT: s_mul_i32 s9, s3, s9 +; GFX9-NEXT: s_add_u32 s9, s11, s9 +; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8 +; GFX9-NEXT: s_addc_u32 s9, s10, s13 +; GFX9-NEXT: s_addc_u32 s10, s12, 0 +; GFX9-NEXT: s_mul_i32 s8, s3, s8 +; GFX9-NEXT: s_add_u32 s8, s9, s8 +; GFX9-NEXT: s_addc_u32 s9, 0, s10 +; GFX9-NEXT: s_mul_i32 s9, s6, s9 +; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8 +; GFX9-NEXT: s_add_i32 s9, s10, s9 +; GFX9-NEXT: s_mul_i32 s10, s7, s8 +; GFX9-NEXT: s_add_i32 s12, s9, s10 +; GFX9-NEXT: s_sub_i32 s10, s3, s12 +; GFX9-NEXT: s_mul_i32 s8, s6, 
s8 +; GFX9-NEXT: s_sub_u32 s2, s2, s8 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_subb_u32 s13, s10, s7 +; GFX9-NEXT: s_sub_u32 s14, s2, s6 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s15, s13, 0 +; GFX9-NEXT: s_cmp_ge_u32 s15, s7 +; GFX9-NEXT: s_cselect_b32 s16, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s14, s6 +; GFX9-NEXT: s_cselect_b32 s17, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s15, s7 +; GFX9-NEXT: s_cselect_b32 s16, s17, s16 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s13, s13, s7 +; GFX9-NEXT: s_sub_u32 s17, s14, s6 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s10, s13, 0 +; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: s_cselect_b32 s11, s17, s14 +; GFX9-NEXT: s_cselect_b32 s10, s10, s15 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_subb_u32 s3, s3, s12 +; GFX9-NEXT: s_cmp_ge_u32 s3, s7 +; GFX9-NEXT: s_cselect_b32 s8, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s6 +; GFX9-NEXT: s_cselect_b32 s6, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s3, s7 +; GFX9-NEXT: s_cselect_b32 s6, s6, s8 +; GFX9-NEXT: s_cmp_lg_u32 s6, 0 +; GFX9-NEXT: s_cselect_b32 s3, s10, s3 +; GFX9-NEXT: s_cselect_b32 s2, s11, s2 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_sub_u32 s2, s2, s4 +; GFX9-NEXT: s_subb_u32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl i64 4096, %y %r = srem i64 %x, %shl.y @@ -9353,272 +9451,347 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-LABEL: srem_v2i64_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s12 -; GFX6-NEXT: s_lshl_b64 s[16:17], 0x1000, s14 +; GFX6-NEXT: s_lshl_b64 s[2:3], 0x1000, s12 +; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s14 +; GFX6-NEXT: s_ashr_i32 s6, s3, 31 +; GFX6-NEXT: s_add_u32 s2, s2, s6 +; GFX6-NEXT: s_mov_b32 s7, s6 +; GFX6-NEXT: s_addc_u32 s3, s3, s6 +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX6-NEXT: s_sub_u32 s12, 0, s2 +; GFX6-NEXT: s_subb_u32 s13, 0, s3 +; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GFX6-NEXT: v_trunc_f32_e32 v1, v1 +; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s6, v0 +; GFX6-NEXT: s_mul_i32 s7, s12, s14 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_mul_i32 s15, s13, s6 +; GFX6-NEXT: s_mul_i32 s16, s12, s6 +; GFX6-NEXT: s_add_i32 s7, s17, s7 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16 +; GFX6-NEXT: s_add_i32 s7, s7, s15 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s7 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16 +; GFX6-NEXT: v_readfirstlane_b32 s15, v3 +; GFX6-NEXT: s_mul_i32 s18, s6, s7 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s7 +; GFX6-NEXT: s_add_u32 s15, s15, s18 +; GFX6-NEXT: v_readfirstlane_b32 s18, v0 +; GFX6-NEXT: s_mul_i32 s16, s14, s16 +; GFX6-NEXT: s_addc_u32 s18, 0, 
s18 +; GFX6-NEXT: v_readfirstlane_b32 s17, v4 +; GFX6-NEXT: s_add_u32 s15, s15, s16 +; GFX6-NEXT: s_addc_u32 s15, s18, s17 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: s_addc_u32 s16, s16, 0 +; GFX6-NEXT: s_mul_i32 s7, s14, s7 +; GFX6-NEXT: s_add_u32 s7, s15, s7 +; GFX6-NEXT: s_addc_u32 s15, 0, s16 +; GFX6-NEXT: s_add_u32 s16, s6, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s16 +; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX6-NEXT: s_or_b32 s6, s6, s7 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_addc_u32 s14, s14, s15 +; GFX6-NEXT: s_mul_i32 s6, s12, s14 +; GFX6-NEXT: v_readfirstlane_b32 s7, v0 +; GFX6-NEXT: s_add_i32 s6, s7, s6 +; GFX6-NEXT: s_mul_i32 s13, s13, s16 +; GFX6-NEXT: s_mul_i32 s7, s12, s16 +; GFX6-NEXT: s_add_i32 s6, s6, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0 +; GFX6-NEXT: s_mul_i32 s13, s16, s6 +; GFX6-NEXT: v_readfirstlane_b32 s17, v2 +; GFX6-NEXT: s_add_u32 s13, s17, s13 +; GFX6-NEXT: v_readfirstlane_b32 s15, v0 +; GFX6-NEXT: s_mul_i32 s7, s14, s7 +; GFX6-NEXT: s_addc_u32 s15, 0, s15 +; GFX6-NEXT: v_readfirstlane_b32 s12, v3 +; GFX6-NEXT: s_add_u32 s7, s13, s7 +; GFX6-NEXT: s_addc_u32 s7, s15, s12 +; GFX6-NEXT: v_readfirstlane_b32 s12, v1 +; GFX6-NEXT: s_addc_u32 s12, s12, 0 +; GFX6-NEXT: s_mul_i32 s6, s14, s6 +; GFX6-NEXT: s_add_u32 s6, s7, s6 +; GFX6-NEXT: s_addc_u32 s12, 0, s12 +; GFX6-NEXT: s_add_u32 s13, s16, s6 +; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX6-NEXT: s_or_b32 s6, s6, s7 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_addc_u32 s12, s14, s12 +; GFX6-NEXT: s_ashr_i32 s6, s9, 31 +; GFX6-NEXT: s_add_u32 s8, s8, s6 +; GFX6-NEXT: s_mov_b32 s7, s6 +; GFX6-NEXT: s_addc_u32 s9, s9, s6 +; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7] +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, s13 +; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2 +; GFX6-NEXT: s_mul_i32 s14, s8, s12 +; GFX6-NEXT: v_readfirstlane_b32 s15, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2 +; GFX6-NEXT: v_readfirstlane_b32 s16, v3 +; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX6-NEXT: s_add_u32 s14, s16, s14 +; GFX6-NEXT: s_addc_u32 s15, 0, s15 +; GFX6-NEXT: s_mul_i32 s13, s9, s13 +; GFX6-NEXT: v_readfirstlane_b32 s16, v1 +; GFX6-NEXT: s_add_u32 s13, s14, s13 +; GFX6-NEXT: s_addc_u32 s13, s15, s16 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: s_addc_u32 s14, s14, 0 +; GFX6-NEXT: s_mul_i32 s12, s9, s12 +; GFX6-NEXT: s_add_u32 s12, s13, s12 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: s_addc_u32 s13, 0, s14 +; GFX6-NEXT: s_mul_i32 s13, s2, s13 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: s_add_i32 s13, s14, s13 +; GFX6-NEXT: s_mul_i32 s14, s3, s12 +; GFX6-NEXT: s_add_i32 s14, s13, s14 +; GFX6-NEXT: s_sub_i32 s15, s9, s14 +; GFX6-NEXT: s_mul_i32 s12, s2, s12 +; GFX6-NEXT: s_sub_u32 s8, s8, s12 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s16, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_subb_u32 s15, s15, s3 +; GFX6-NEXT: s_sub_u32 s17, s8, s2 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s18, s15, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s3 +; GFX6-NEXT: s_cselect_b32 s13, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s17, s2 +; GFX6-NEXT: s_cselect_b32 s19, -1, 
0 +; GFX6-NEXT: s_cmp_eq_u32 s18, s3 +; GFX6-NEXT: s_cselect_b32 s19, s19, s13 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s15, s15, s3 +; GFX6-NEXT: s_sub_u32 s20, s17, s2 +; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX6-NEXT: s_or_b32 s12, s12, s13 +; GFX6-NEXT: s_cmp_lg_u32 s12, 0 +; GFX6-NEXT: s_subb_u32 s12, s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b32 s13, s20, s17 +; GFX6-NEXT: s_cselect_b32 s12, s12, s18 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_subb_u32 s9, s9, s14 +; GFX6-NEXT: s_cmp_ge_u32 s9, s3 +; GFX6-NEXT: s_cselect_b32 s14, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s8, s2 +; GFX6-NEXT: s_cselect_b32 s2, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s9, s3 +; GFX6-NEXT: s_cselect_b32 s2, s2, s14 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_cselect_b32 s3, s12, s9 +; GFX6-NEXT: s_cselect_b32 s2, s13, s8 +; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] +; GFX6-NEXT: s_sub_u32 s12, s2, s6 +; GFX6-NEXT: s_subb_u32 s13, s3, s6 ; GFX6-NEXT: s_ashr_i32 s2, s1, 31 ; GFX6-NEXT: s_add_u32 s0, s0, s2 ; GFX6-NEXT: s_mov_b32 s3, s2 ; GFX6-NEXT: s_addc_u32 s1, s1, s2 -; GFX6-NEXT: s_xor_b64 s[14:15], s[0:1], s[2:3] -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15 -; GFX6-NEXT: s_sub_u32 s0, 0, s14 -; GFX6-NEXT: s_subb_u32 s1, 0, s15 -; GFX6-NEXT: s_ashr_i32 s12, s9, 31 +; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX6-NEXT: s_sub_u32 s8, 0, s6 +; GFX6-NEXT: s_subb_u32 s9, 0, s7 ; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 -; GFX6-NEXT: s_mov_b32 s13, s12 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s0, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v4 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s0, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc -; GFX6-NEXT: 
v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: s_add_u32 s0, s8, s12 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: s_addc_u32 s1, s9, s12 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] -; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0 -; GFX6-NEXT: v_mul_hi_u32 v4, s8, v1 -; GFX6-NEXT: v_mul_hi_u32 v5, s9, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s9, v1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s9, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: v_readfirstlane_b32 s2, v0 +; GFX6-NEXT: s_mul_i32 s1, s8, s14 +; GFX6-NEXT: v_readfirstlane_b32 s3, v2 +; GFX6-NEXT: s_mul_i32 s0, s9, s2 +; GFX6-NEXT: s_add_i32 s1, s3, s1 +; GFX6-NEXT: s_add_i32 s3, s1, s0 +; GFX6-NEXT: s_mul_i32 s15, s8, s2 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 +; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mul_i32 s4, s2, s3 +; GFX6-NEXT: v_readfirstlane_b32 s5, v2 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15 +; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3 +; GFX6-NEXT: s_add_u32 s4, s16, s4 +; GFX6-NEXT: s_addc_u32 s5, 0, s5 +; GFX6-NEXT: s_mul_i32 s15, s14, s15 +; GFX6-NEXT: v_readfirstlane_b32 s16, v0 +; GFX6-NEXT: s_add_u32 s4, s4, s15 +; GFX6-NEXT: s_addc_u32 s4, s5, s16 +; GFX6-NEXT: v_readfirstlane_b32 s5, v1 +; GFX6-NEXT: s_addc_u32 s5, s5, 0 +; GFX6-NEXT: s_mul_i32 s3, s14, s3 +; GFX6-NEXT: s_add_u32 s3, s4, s3 +; GFX6-NEXT: s_addc_u32 s4, 0, s5 +; GFX6-NEXT: s_add_u32 s5, s2, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s5 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_addc_u32 s4, s14, s4 +; GFX6-NEXT: s_mul_i32 s2, s8, s4 +; GFX6-NEXT: v_readfirstlane_b32 s3, v0 +; GFX6-NEXT: s_add_i32 s2, s3, s2 +; GFX6-NEXT: s_mul_i32 s9, s9, s5 +; GFX6-NEXT: s_mul_i32 s3, s8, s5 +; GFX6-NEXT: s_add_i32 s2, s2, s9 +; GFX6-NEXT: v_mov_b32_e32 v2, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mul_hi_u32 v3, s4, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX6-NEXT: s_mul_i32 s9, s5, s2 +; GFX6-NEXT: v_readfirstlane_b32 s15, v2 +; GFX6-NEXT: s_add_u32 s9, s15, s9 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: s_mul_i32 s3, s4, s3 +; GFX6-NEXT: s_addc_u32 s14, 0, s14 +; GFX6-NEXT: v_readfirstlane_b32 s8, v3 +; GFX6-NEXT: s_add_u32 s3, s9, s3 +; GFX6-NEXT: s_addc_u32 s3, s14, s8 +; GFX6-NEXT: v_readfirstlane_b32 s8, v1 +; GFX6-NEXT: s_addc_u32 s8, s8, 0 +; GFX6-NEXT: s_mul_i32 s2, s4, s2 +; GFX6-NEXT: s_add_u32 s2, s3, s2 +; GFX6-NEXT: s_addc_u32 s8, 0, s8 +; GFX6-NEXT: s_add_u32 s14, s5, s2 +; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_addc_u32 s15, s4, s8 +; GFX6-NEXT: s_ashr_i32 s4, s11, 31 +; GFX6-NEXT: s_add_u32 s2, s10, s4 +; GFX6-NEXT: s_mov_b32 s5, s4 +; GFX6-NEXT: s_addc_u32 s3, s11, s4 +; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, s15 +; GFX6-NEXT: v_mul_hi_u32 v1, s8, v0 +; GFX6-NEXT: v_mov_b32_e32 v2, s14 +; GFX6-NEXT: v_mul_hi_u32 v3, s8, v2 +; GFX6-NEXT: s_mul_i32 s2, s8, s15 +; GFX6-NEXT: 
v_readfirstlane_b32 s10, v1 +; GFX6-NEXT: v_mul_hi_u32 v1, s9, v2 +; GFX6-NEXT: v_readfirstlane_b32 s11, v3 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, s14, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, s14, v0 -; GFX6-NEXT: v_mul_lo_u32 v3, s15, v0 -; GFX6-NEXT: v_mul_lo_u32 v0, s14, v0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s15 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 -; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc -; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s14, v0 -; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s15, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s14, v4 -; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s15, v5 -; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s14, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] -; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GFX6-NEXT: s_ashr_i32 s0, s17, 31 -; GFX6-NEXT: s_add_u32 s2, s16, s0 -; GFX6-NEXT: s_mov_b32 s1, s0 -; GFX6-NEXT: s_addc_u32 s3, s17, s0 -; GFX6-NEXT: v_mov_b32_e32 v4, s9 -; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[0:1] -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s9 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s15, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s14, v0 -; GFX6-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 -; GFX6-NEXT: v_rcp_f32_e32 v4, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s15, v1 -; GFX6-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX6-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v4 -; GFX6-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2 -; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX6-NEXT: s_sub_u32 s0, 0, s8 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 -; GFX6-NEXT: s_subb_u32 s1, 0, s9 -; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 -; GFX6-NEXT: s_ashr_i32 s14, s11, 31 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v8, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v4, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v4, v3 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc -; GFX6-NEXT: v_mul_lo_u32 v8, v4, v5 -; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 -; GFX6-NEXT: s_mov_b32 s15, s14 -; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc -; GFX6-NEXT: 
v_add_i32_e32 v3, vcc, v5, v3 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s0, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 -; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 -; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 -; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4 -; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: s_add_u32 s0, s10, s14 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: s_addc_u32 s1, s11, s14 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] -; GFX6-NEXT: v_mul_lo_u32 v4, s10, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, s10, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, s10, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, s11, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s11, v3 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v7, s11, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 -; GFX6-NEXT: v_mov_b32_e32 v6, s12 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s8, v3 -; GFX6-NEXT: v_mul_hi_u32 v4, s8, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s8, v2 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v3 -; GFX6-NEXT: v_mov_b32_e32 v5, s9 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 -; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v2 -; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v7 -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v6 -; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v7 -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v6 -; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] -; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX6-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v6, s11 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 -; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX6-NEXT: 
v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX6-NEXT: v_xor_b32_e32 v2, s14, v2 -; GFX6-NEXT: v_xor_b32_e32 v3, s14, v3 -; GFX6-NEXT: v_mov_b32_e32 v4, s14 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc -; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GFX6-NEXT: s_add_u32 s2, s11, s2 +; GFX6-NEXT: s_addc_u32 s10, 0, s10 +; GFX6-NEXT: s_mul_i32 s11, s9, s14 +; GFX6-NEXT: v_readfirstlane_b32 s14, v1 +; GFX6-NEXT: s_add_u32 s2, s2, s11 +; GFX6-NEXT: s_addc_u32 s2, s10, s14 +; GFX6-NEXT: v_readfirstlane_b32 s10, v0 +; GFX6-NEXT: s_addc_u32 s10, s10, 0 +; GFX6-NEXT: s_mul_i32 s11, s9, s15 +; GFX6-NEXT: s_add_u32 s11, s2, s11 +; GFX6-NEXT: v_mov_b32_e32 v0, s11 +; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX6-NEXT: s_addc_u32 s10, 0, s10 +; GFX6-NEXT: s_mul_i32 s10, s6, s10 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_readfirstlane_b32 s14, v0 +; GFX6-NEXT: s_add_i32 s10, s14, s10 +; GFX6-NEXT: s_mul_i32 s14, s7, s11 +; GFX6-NEXT: s_add_i32 s14, s10, s14 +; GFX6-NEXT: s_sub_i32 s15, s9, s14 +; GFX6-NEXT: s_mul_i32 s10, s6, s11 +; GFX6-NEXT: s_sub_u32 s8, s8, s10 +; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX6-NEXT: s_or_b32 s16, s10, s11 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_subb_u32 s15, s15, s7 +; GFX6-NEXT: s_sub_u32 s17, s8, s6 +; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX6-NEXT: s_or_b32 s10, s10, s11 +; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_subb_u32 s18, s15, 0 +; GFX6-NEXT: s_cmp_ge_u32 s18, s7 +; GFX6-NEXT: s_cselect_b32 s11, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s17, s6 +; GFX6-NEXT: s_cselect_b32 s19, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s18, s7 +; GFX6-NEXT: s_cselect_b32 s19, s19, s11 +; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_subb_u32 s15, s15, s7 +; GFX6-NEXT: s_sub_u32 s20, s17, s6 +; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX6-NEXT: s_or_b32 s10, s10, s11 +; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_subb_u32 s10, s15, 0 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 +; GFX6-NEXT: s_cselect_b32 s11, s20, s17 +; GFX6-NEXT: s_cselect_b32 s10, s10, s18 +; GFX6-NEXT: s_cmp_lg_u32 s16, 0 +; GFX6-NEXT: s_subb_u32 s9, s9, s14 +; GFX6-NEXT: s_cmp_ge_u32 s9, s7 +; GFX6-NEXT: s_cselect_b32 s14, -1, 0 +; GFX6-NEXT: s_cmp_ge_u32 s8, s6 +; GFX6-NEXT: s_cselect_b32 s6, -1, 0 +; GFX6-NEXT: s_cmp_eq_u32 s9, s7 +; GFX6-NEXT: s_cselect_b32 s6, s6, s14 +; GFX6-NEXT: s_cmp_lg_u32 s6, 0 +; GFX6-NEXT: s_cselect_b32 s7, s10, s9 +; GFX6-NEXT: s_cselect_b32 s6, s11, s8 +; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] +; GFX6-NEXT: s_sub_u32 s5, s6, s4 +; GFX6-NEXT: s_subb_u32 s4, s7, s4 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s12 +; GFX6-NEXT: v_mov_b32_e32 v1, s13 +; GFX6-NEXT: v_mov_b32_e32 v2, s5 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: srem_v2i64_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s12 -; GFX9-NEXT: s_lshl_b64 s[14:15], 0x1000, s14 -; GFX9-NEXT: s_ashr_i32 s2, s1, 31 -; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_mov_b32 s3, s2 -; GFX9-NEXT: s_addc_u32 s1, s1, s2 -; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3] -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GFX9-NEXT: s_sub_u32 s0, 0, 
s12 -; GFX9-NEXT: s_subb_u32 s1, 0, s13 +; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s12 +; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s14 +; GFX9-NEXT: s_ashr_i32 s6, s3, 31 +; GFX9-NEXT: s_add_u32 s2, s2, s6 +; GFX9-NEXT: s_mov_b32 s7, s6 +; GFX9-NEXT: s_addc_u32 s3, s3, s6 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: s_sub_u32 s12, 0, s2 +; GFX9-NEXT: s_subb_u32 s13, 0, s3 ; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -9627,264 +9800,257 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s2, v1 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s4, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s16, s0, s3 -; GFX9-NEXT: s_mul_i32 s5, s1, s3 -; GFX9-NEXT: s_add_i32 s4, s16, s4 -; GFX9-NEXT: s_mul_i32 s17, s0, s3 -; GFX9-NEXT: s_add_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4 -; GFX9-NEXT: s_mul_i32 s16, s3, s4 -; GFX9-NEXT: s_mul_hi_u32 s3, s3, s17 -; GFX9-NEXT: s_add_u32 s3, s3, s16 -; GFX9-NEXT: s_addc_u32 s5, 0, s5 -; GFX9-NEXT: s_mul_hi_u32 s18, s2, s17 -; GFX9-NEXT: s_mul_i32 s17, s2, s17 -; GFX9-NEXT: s_add_u32 s3, s3, s17 -; GFX9-NEXT: s_mul_hi_u32 s16, s2, s4 -; GFX9-NEXT: s_addc_u32 s3, s5, s18 -; GFX9-NEXT: s_addc_u32 s5, s16, 0 -; GFX9-NEXT: s_mul_i32 s4, s2, s4 -; GFX9-NEXT: s_add_u32 s3, s3, s4 -; GFX9-NEXT: s_addc_u32 s4, 0, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s3, v0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s2, s2, s4 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_mul_i32 s3, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s5, s0, s4 -; GFX9-NEXT: s_add_i32 s3, s5, s3 -; GFX9-NEXT: s_mul_i32 s1, s1, s4 -; GFX9-NEXT: s_add_i32 s3, s3, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, s4 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s0 -; GFX9-NEXT: s_mul_i32 s16, s2, s0 -; GFX9-NEXT: s_mul_i32 s18, s4, s3 -; GFX9-NEXT: s_mul_hi_u32 s0, s4, s0 -; GFX9-NEXT: s_mul_hi_u32 s17, s4, s3 -; GFX9-NEXT: s_add_u32 s0, s0, s18 -; GFX9-NEXT: s_addc_u32 s4, 0, s17 -; GFX9-NEXT: s_add_u32 s0, s0, s16 -; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 -; GFX9-NEXT: s_addc_u32 s0, s4, s5 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s3, s2, s3 -; GFX9-NEXT: s_add_u32 s0, s0, s3 -; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s2, s2, s1 -; GFX9-NEXT: s_ashr_i32 s16, s9, 31 -; GFX9-NEXT: s_add_u32 s0, s8, s16 -; GFX9-NEXT: s_mov_b32 s17, s16 -; GFX9-NEXT: s_addc_u32 s1, s9, s16 -; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], s[16:17] -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s1, s4, s2 -; GFX9-NEXT: s_mul_hi_u32 s8, s4, s3 -; GFX9-NEXT: s_mul_hi_u32 s0, s4, s2 -; GFX9-NEXT: s_add_u32 s1, s8, s1 -; GFX9-NEXT: s_addc_u32 s0, 0, s0 -; GFX9-NEXT: s_mul_hi_u32 s9, s5, s3 -; GFX9-NEXT: s_mul_i32 s3, s5, s3 -; GFX9-NEXT: s_add_u32 s1, s1, s3 -; GFX9-NEXT: s_mul_hi_u32 s8, s5, s2 -; GFX9-NEXT: s_addc_u32 s0, s0, s9 -; GFX9-NEXT: s_addc_u32 s1, s8, 0 -; GFX9-NEXT: s_mul_i32 s2, s5, s2 +; GFX9-NEXT: v_readfirstlane_b32 s14, v1 +; GFX9-NEXT: v_readfirstlane_b32 s6, v0 +; GFX9-NEXT: s_mul_i32 s7, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s16, s12, s6 +; GFX9-NEXT: s_mul_i32 s15, s13, s6 +; GFX9-NEXT: s_add_i32 s7, s16, s7 +; GFX9-NEXT: s_mul_i32 
s17, s12, s6 +; GFX9-NEXT: s_add_i32 s7, s7, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s6, s17 +; GFX9-NEXT: s_mul_i32 s18, s6, s7 +; GFX9-NEXT: s_mul_hi_u32 s15, s6, s7 +; GFX9-NEXT: s_add_u32 s16, s16, s18 +; GFX9-NEXT: s_addc_u32 s15, 0, s15 +; GFX9-NEXT: s_mul_hi_u32 s18, s14, s17 +; GFX9-NEXT: s_mul_i32 s17, s14, s17 +; GFX9-NEXT: s_add_u32 s16, s16, s17 +; GFX9-NEXT: s_mul_hi_u32 s19, s14, s7 +; GFX9-NEXT: s_addc_u32 s15, s15, s18 +; GFX9-NEXT: s_addc_u32 s16, s19, 0 +; GFX9-NEXT: s_mul_i32 s7, s14, s7 +; GFX9-NEXT: s_add_u32 s7, s15, s7 +; GFX9-NEXT: s_addc_u32 s15, 0, s16 +; GFX9-NEXT: s_add_u32 s16, s6, s7 +; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-NEXT: s_addc_u32 s14, s14, s15 +; GFX9-NEXT: s_mul_i32 s6, s12, s14 +; GFX9-NEXT: s_mul_hi_u32 s7, s12, s16 +; GFX9-NEXT: s_add_i32 s6, s7, s6 +; GFX9-NEXT: s_mul_i32 s13, s13, s16 +; GFX9-NEXT: s_add_i32 s6, s6, s13 +; GFX9-NEXT: s_mul_i32 s12, s12, s16 +; GFX9-NEXT: s_mul_hi_u32 s13, s14, s12 +; GFX9-NEXT: s_mul_i32 s15, s14, s12 +; GFX9-NEXT: s_mul_i32 s18, s16, s6 +; GFX9-NEXT: s_mul_hi_u32 s12, s16, s12 +; GFX9-NEXT: s_mul_hi_u32 s17, s16, s6 +; GFX9-NEXT: s_add_u32 s12, s12, s18 +; GFX9-NEXT: s_addc_u32 s17, 0, s17 +; GFX9-NEXT: s_add_u32 s12, s12, s15 +; GFX9-NEXT: s_mul_hi_u32 s7, s14, s6 +; GFX9-NEXT: s_addc_u32 s12, s17, s13 +; GFX9-NEXT: s_addc_u32 s7, s7, 0 +; GFX9-NEXT: s_mul_i32 s6, s14, s6 +; GFX9-NEXT: s_add_u32 s6, s12, s6 +; GFX9-NEXT: s_addc_u32 s12, 0, s7 +; GFX9-NEXT: s_add_u32 s13, s16, s6 +; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX9-NEXT: s_addc_u32 s12, s14, s12 +; GFX9-NEXT: s_ashr_i32 s6, s9, 31 +; GFX9-NEXT: s_add_u32 s8, s8, s6 +; GFX9-NEXT: s_mov_b32 s7, s6 +; GFX9-NEXT: s_addc_u32 s9, s9, s6 +; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7] +; GFX9-NEXT: s_mul_i32 s15, s8, s12 +; GFX9-NEXT: s_mul_hi_u32 s16, s8, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s8, s12 +; GFX9-NEXT: s_add_u32 s15, s16, s15 +; GFX9-NEXT: s_addc_u32 s14, 0, s14 +; GFX9-NEXT: s_mul_hi_u32 s17, s9, s13 +; GFX9-NEXT: s_mul_i32 s13, s9, s13 +; GFX9-NEXT: s_add_u32 s13, s15, s13 +; GFX9-NEXT: s_mul_hi_u32 s16, s9, s12 +; GFX9-NEXT: s_addc_u32 s13, s14, s17 +; GFX9-NEXT: s_addc_u32 s14, s16, 0 +; GFX9-NEXT: s_mul_i32 s12, s9, s12 +; GFX9-NEXT: s_add_u32 s12, s13, s12 +; GFX9-NEXT: s_addc_u32 s13, 0, s14 +; GFX9-NEXT: s_mul_i32 s13, s2, s13 +; GFX9-NEXT: s_mul_hi_u32 s14, s2, s12 +; GFX9-NEXT: s_add_i32 s13, s14, s13 +; GFX9-NEXT: s_mul_i32 s14, s3, s12 +; GFX9-NEXT: s_add_i32 s16, s13, s14 +; GFX9-NEXT: s_sub_i32 s14, s9, s16 +; GFX9-NEXT: s_mul_i32 s12, s2, s12 +; GFX9-NEXT: s_sub_u32 s8, s8, s12 +; GFX9-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GFX9-NEXT: s_subb_u32 s17, s14, s3 +; GFX9-NEXT: s_sub_u32 s18, s8, s2 +; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GFX9-NEXT: s_subb_u32 s19, s17, 0 +; GFX9-NEXT: s_cmp_ge_u32 s19, s3 +; GFX9-NEXT: s_cselect_b32 s20, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s18, s2 +; GFX9-NEXT: s_cselect_b32 s21, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s19, s3 +; GFX9-NEXT: s_cselect_b32 s20, s21, s20 +; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GFX9-NEXT: s_subb_u32 s17, s17, s3 +; GFX9-NEXT: s_sub_u32 s21, s18, s2 +; GFX9-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GFX9-NEXT: s_subb_u32 s14, s17, 0 +; GFX9-NEXT: s_cmp_lg_u32 s20, 0 +; GFX9-NEXT: s_cselect_b32 s15, s21, s18 +; GFX9-NEXT: s_cselect_b32 s14, s14, s19 +; GFX9-NEXT: s_cmp_lg_u64 
s[12:13], 0 +; GFX9-NEXT: s_subb_u32 s9, s9, s16 +; GFX9-NEXT: s_cmp_ge_u32 s9, s3 +; GFX9-NEXT: s_cselect_b32 s12, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s8, s2 +; GFX9-NEXT: s_cselect_b32 s2, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s9, s3 +; GFX9-NEXT: s_cselect_b32 s2, s2, s12 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s3, s14, s9 +; GFX9-NEXT: s_cselect_b32 s2, s15, s8 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: s_sub_u32 s12, s2, s6 +; GFX9-NEXT: s_subb_u32 s13, s3, s6 +; GFX9-NEXT: s_ashr_i32 s2, s1, 31 ; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: s_mul_i32 s1, s12, s1 -; GFX9-NEXT: s_mul_hi_u32 s2, s12, s0 -; GFX9-NEXT: s_add_i32 s1, s2, s1 -; GFX9-NEXT: s_mul_i32 s2, s13, s0 -; GFX9-NEXT: s_mul_i32 s0, s12, s0 -; GFX9-NEXT: s_add_i32 s8, s1, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: s_sub_i32 s1, s5, s8 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s4, s1, s13 -; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s12, v0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s9, s4, 0 -; GFX9-NEXT: s_cmp_ge_u32 s9, s13 -; GFX9-NEXT: s_cselect_b32 s17, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v1 -; GFX9-NEXT: s_cmp_eq_u32 s9, s13 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[2:3] -; GFX9-NEXT: s_subb_u32 s2, s4, s13 -; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v1 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s2, s2, 0 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX9-NEXT: s_subb_u32 s0, s5, s8 -; GFX9-NEXT: s_cmp_ge_u32 s0, s13 -; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; GFX9-NEXT: s_cmp_eq_u32 s0, s13 -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, s0 -; GFX9-NEXT: s_ashr_i32 s0, s15, 31 -; GFX9-NEXT: s_add_u32 s2, s14, s0 -; GFX9-NEXT: s_mov_b32 s1, s0 -; GFX9-NEXT: s_addc_u32 s3, s15, s0 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX9-NEXT: v_xor_b32_e32 v0, s16, v0 -; GFX9-NEXT: v_xor_b32_e32 v2, s16, v2 -; GFX9-NEXT: v_mac_f32_e32 v1, 0x4f800000, v3 -; GFX9-NEXT: v_rcp_f32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v5, s16 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v5, vcc -; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v3 -; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 +; GFX9-NEXT: s_mov_b32 s3, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s2 +; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: s_sub_u32 s6, 0, s2 +; GFX9-NEXT: s_subb_u32 s7, 0, s3 +; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, 
v1 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: s_sub_u32 s0, 0, s4 -; GFX9-NEXT: s_subb_u32 s1, 0, s5 -; GFX9-NEXT: v_readfirstlane_b32 s2, v2 -; GFX9-NEXT: v_readfirstlane_b32 s9, v3 -; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 -; GFX9-NEXT: s_mul_i32 s12, s0, s9 -; GFX9-NEXT: s_mul_i32 s3, s1, s2 -; GFX9-NEXT: s_add_i32 s8, s8, s12 -; GFX9-NEXT: s_add_i32 s8, s8, s3 -; GFX9-NEXT: s_mul_i32 s13, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s3, s2, s8 -; GFX9-NEXT: s_mul_i32 s12, s2, s8 -; GFX9-NEXT: s_mul_hi_u32 s2, s2, s13 -; GFX9-NEXT: s_add_u32 s2, s2, s12 -; GFX9-NEXT: s_addc_u32 s3, 0, s3 -; GFX9-NEXT: s_mul_hi_u32 s14, s9, s13 -; GFX9-NEXT: s_mul_i32 s13, s9, s13 -; GFX9-NEXT: s_add_u32 s2, s2, s13 -; GFX9-NEXT: s_mul_hi_u32 s12, s9, s8 -; GFX9-NEXT: s_addc_u32 s2, s3, s14 -; GFX9-NEXT: s_addc_u32 s3, s12, 0 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_readfirstlane_b32 s9, v2 +; GFX9-NEXT: s_mul_hi_u32 s8, s6, s4 +; GFX9-NEXT: s_mul_i32 s14, s6, s9 +; GFX9-NEXT: s_mul_i32 s5, s7, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s14 +; GFX9-NEXT: s_add_i32 s8, s8, s5 +; GFX9-NEXT: s_mul_i32 s15, s6, s4 +; GFX9-NEXT: s_mul_i32 s14, s4, s8 +; GFX9-NEXT: s_mul_hi_u32 s16, s4, s15 +; GFX9-NEXT: s_mul_hi_u32 s5, s4, s8 +; GFX9-NEXT: s_add_u32 s14, s16, s14 +; GFX9-NEXT: s_addc_u32 s5, 0, s5 +; GFX9-NEXT: s_mul_hi_u32 s17, s9, s15 +; GFX9-NEXT: s_mul_i32 s15, s9, s15 +; GFX9-NEXT: s_add_u32 s14, s14, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s9, s8 +; GFX9-NEXT: s_addc_u32 s5, s5, s17 +; GFX9-NEXT: s_addc_u32 s14, s16, 0 ; GFX9-NEXT: s_mul_i32 s8, s9, s8 -; GFX9-NEXT: s_add_u32 s2, s2, s8 -; GFX9-NEXT: s_addc_u32 s3, 0, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s2, s9, s3 -; GFX9-NEXT: v_readfirstlane_b32 s8, v2 -; GFX9-NEXT: s_mul_i32 s3, s0, s2 -; GFX9-NEXT: s_mul_hi_u32 s9, s0, s8 -; GFX9-NEXT: s_add_i32 s3, s9, s3 -; GFX9-NEXT: s_mul_i32 s1, s1, s8 -; GFX9-NEXT: s_add_i32 s3, s3, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, s8 -; GFX9-NEXT: s_mul_hi_u32 s9, s2, s0 -; GFX9-NEXT: s_mul_i32 s12, s2, s0 -; GFX9-NEXT: s_mul_i32 s14, s8, s3 -; GFX9-NEXT: s_mul_hi_u32 s0, s8, s0 -; GFX9-NEXT: s_mul_hi_u32 s13, s8, s3 -; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_addc_u32 s8, 0, s13 -; GFX9-NEXT: s_add_u32 s0, s0, s12 -; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 -; GFX9-NEXT: s_addc_u32 s0, s8, s9 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s3, s2, s3 -; GFX9-NEXT: s_add_u32 s0, s0, s3 -; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s2, s2, s1 -; GFX9-NEXT: s_ashr_i32 s8, s11, 31 -; GFX9-NEXT: s_add_u32 s0, s10, s8 -; GFX9-NEXT: s_mov_b32 s9, s8 -; GFX9-NEXT: s_addc_u32 s1, s11, s8 -; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] -; GFX9-NEXT: v_readfirstlane_b32 s3, v2 -; GFX9-NEXT: s_mul_i32 s1, s10, s2 -; GFX9-NEXT: s_mul_hi_u32 s9, s10, s3 -; GFX9-NEXT: s_mul_hi_u32 s0, s10, s2 -; GFX9-NEXT: s_add_u32 s1, s9, s1 -; GFX9-NEXT: s_addc_u32 s0, 0, s0 -; GFX9-NEXT: s_mul_hi_u32 s12, s11, s3 -; GFX9-NEXT: s_mul_i32 s3, s11, s3 -; GFX9-NEXT: s_add_u32 s1, s1, s3 -; GFX9-NEXT: s_mul_hi_u32 s9, s11, s2 -; GFX9-NEXT: s_addc_u32 s0, s0, 
s12 -; GFX9-NEXT: s_addc_u32 s1, s9, 0 -; GFX9-NEXT: s_mul_i32 s2, s11, s2 -; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: s_mul_i32 s1, s4, s1 -; GFX9-NEXT: s_mul_hi_u32 s2, s4, s0 -; GFX9-NEXT: s_add_i32 s1, s2, s1 -; GFX9-NEXT: s_mul_i32 s2, s5, s0 -; GFX9-NEXT: s_mul_i32 s0, s4, s0 -; GFX9-NEXT: s_add_i32 s9, s1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: s_sub_i32 s1, s11, s9 -; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v2 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s10, s1, s5 -; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s4, v2 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s12, s10, 0 -; GFX9-NEXT: s_cmp_ge_u32 s12, s5 -; GFX9-NEXT: s_cselect_b32 s13, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v3 -; GFX9-NEXT: s_cmp_eq_u32 s12, s5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v6, s13 -; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3] -; GFX9-NEXT: s_subb_u32 s2, s10, s5 -; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s4, v3 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s2, s2, 0 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s12 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[0:1] -; GFX9-NEXT: s_subb_u32 s0, s11, s9 -; GFX9-NEXT: s_cmp_ge_u32 s0, s5 -; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 -; GFX9-NEXT: s_cmp_eq_u32 s0, s5 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v7, s1 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_mov_b32_e32 v7, s0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; GFX9-NEXT: v_xor_b32_e32 v2, s8, v2 -; GFX9-NEXT: v_xor_b32_e32 v3, s8, v5 -; GFX9-NEXT: v_mov_b32_e32 v5, s8 -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s8, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] +; GFX9-NEXT: s_add_u32 s5, s5, s8 +; GFX9-NEXT: s_addc_u32 s8, 0, s14 +; GFX9-NEXT: s_add_u32 s14, s4, s5 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s8, s9, s8 +; GFX9-NEXT: s_mul_i32 s4, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s5, s6, s14 +; GFX9-NEXT: s_add_i32 s4, s5, s4 +; GFX9-NEXT: s_mul_i32 s7, s7, s14 +; GFX9-NEXT: s_add_i32 s4, s4, s7 +; GFX9-NEXT: s_mul_i32 s6, s6, s14 +; GFX9-NEXT: s_mul_hi_u32 s7, s8, s6 +; GFX9-NEXT: s_mul_i32 s9, s8, s6 +; GFX9-NEXT: s_mul_i32 s16, s14, s4 +; GFX9-NEXT: s_mul_hi_u32 s6, s14, s6 +; GFX9-NEXT: s_mul_hi_u32 s15, s14, s4 +; GFX9-NEXT: s_add_u32 s6, s6, s16 +; GFX9-NEXT: s_addc_u32 s15, 0, s15 +; GFX9-NEXT: s_add_u32 s6, s6, s9 +; GFX9-NEXT: s_mul_hi_u32 s5, s8, s4 +; GFX9-NEXT: s_addc_u32 s6, s15, s7 +; GFX9-NEXT: s_addc_u32 s5, s5, 0 +; GFX9-NEXT: s_mul_i32 s4, s8, s4 +; GFX9-NEXT: s_add_u32 s4, s6, s4 +; GFX9-NEXT: s_addc_u32 s6, 0, s5 +; GFX9-NEXT: s_add_u32 s9, s14, s4 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9-NEXT: s_addc_u32 s8, s8, s6 +; GFX9-NEXT: s_ashr_i32 s4, s11, 31 +; GFX9-NEXT: s_add_u32 s6, s10, s4 +; GFX9-NEXT: s_mov_b32 s5, s4 +; GFX9-NEXT: s_addc_u32 s7, s11, s4 +; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] +; 
GFX9-NEXT: s_mul_i32 s11, s6, s8 +; GFX9-NEXT: s_mul_hi_u32 s14, s6, s9 +; GFX9-NEXT: s_mul_hi_u32 s10, s6, s8 +; GFX9-NEXT: s_add_u32 s11, s14, s11 +; GFX9-NEXT: s_addc_u32 s10, 0, s10 +; GFX9-NEXT: s_mul_hi_u32 s15, s7, s9 +; GFX9-NEXT: s_mul_i32 s9, s7, s9 +; GFX9-NEXT: s_add_u32 s9, s11, s9 +; GFX9-NEXT: s_mul_hi_u32 s14, s7, s8 +; GFX9-NEXT: s_addc_u32 s9, s10, s15 +; GFX9-NEXT: s_addc_u32 s10, s14, 0 +; GFX9-NEXT: s_mul_i32 s8, s7, s8 +; GFX9-NEXT: s_add_u32 s8, s9, s8 +; GFX9-NEXT: s_addc_u32 s9, 0, s10 +; GFX9-NEXT: s_mul_i32 s9, s2, s9 +; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8 +; GFX9-NEXT: s_add_i32 s9, s10, s9 +; GFX9-NEXT: s_mul_i32 s10, s3, s8 +; GFX9-NEXT: s_add_i32 s14, s9, s10 +; GFX9-NEXT: s_sub_i32 s10, s7, s14 +; GFX9-NEXT: s_mul_i32 s8, s2, s8 +; GFX9-NEXT: s_sub_u32 s6, s6, s8 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_subb_u32 s15, s10, s3 +; GFX9-NEXT: s_sub_u32 s16, s6, s2 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s17, s15, 0 +; GFX9-NEXT: s_cmp_ge_u32 s17, s3 +; GFX9-NEXT: s_cselect_b32 s18, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s16, s2 +; GFX9-NEXT: s_cselect_b32 s19, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s17, s3 +; GFX9-NEXT: s_cselect_b32 s18, s19, s18 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s15, s15, s3 +; GFX9-NEXT: s_sub_u32 s19, s16, s2 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s10, s15, 0 +; GFX9-NEXT: s_cmp_lg_u32 s18, 0 +; GFX9-NEXT: s_cselect_b32 s11, s19, s16 +; GFX9-NEXT: s_cselect_b32 s10, s10, s17 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_subb_u32 s7, s7, s14 +; GFX9-NEXT: s_cmp_ge_u32 s7, s3 +; GFX9-NEXT: s_cselect_b32 s8, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s6, s2 +; GFX9-NEXT: s_cselect_b32 s2, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s7, s3 +; GFX9-NEXT: s_cselect_b32 s2, s2, s8 +; GFX9-NEXT: s_cmp_lg_u32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s3, s10, s7 +; GFX9-NEXT: s_cselect_b32 s2, s11, s6 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_sub_u32 s2, s2, s4 +; GFX9-NEXT: s_subb_u32 s3, s3, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i64> , %y %r = srem <2 x i64> %x, %shl.y diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll index d301f16512a60..37040123ee20c 100644 --- a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll @@ -7,8 +7,8 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 ; CHECK-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x0 ; CHECK-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x10 -; CHECK-NEXT: v_mov_b32_e32 v30, 0x9037ab78 -; CHECK-NEXT: v_mov_b32_e32 v31, 0x3e21eeb6 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x3e21eeb6 +; CHECK-NEXT: v_mov_b32_e32 v20, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s0, 0 ; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0 @@ -16,9 +16,12 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: s_bitcmp1_b32 s0, 8 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; CHECK-NEXT: s_xor_b64 
s[20:21], s[2:3], -1 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78 +; CHECK-NEXT: v_accvgpr_write_b32 a3, v1 +; CHECK-NEXT: s_xor_b64 s[20:21], s[2:3], -1 ; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] +; CHECK-NEXT: v_accvgpr_write_b32 a2, v0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0xa17f65f6 ; CHECK-NEXT: v_mov_b32_e32 v3, 0xbe927e4f ; CHECK-NEXT: v_mov_b32_e32 v4, 0x19f4ec90 @@ -34,15 +37,14 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: v_mov_b32_e32 v14, 0x8427b883 ; CHECK-NEXT: v_mov_b32_e32 v15, 0x3fae1bb4 ; CHECK-NEXT: s_mov_b64 s[22:23], 0 -; CHECK-NEXT: v_mov_b32_e32 v20, 0x57b87036 -; CHECK-NEXT: v_mov_b32_e32 v21, 0x3fb3b136 +; CHECK-NEXT: v_mov_b32_e32 v0, 0x57b87036 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x3fb3b136 ; CHECK-NEXT: s_and_b64 s[4:5], exec, s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v18, 0x55555523 ; CHECK-NEXT: v_mov_b32_e32 v19, 0xbfd55555 ; CHECK-NEXT: s_and_b64 s[6:7], exec, s[18:19] -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_mov_b64_e32 v[16:17], 0 -; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 +; CHECK-NEXT: v_mov_b32_e32 v21, v20 +; CHECK-NEXT: ; implicit-def: $vgpr30_vgpr31 ; CHECK-NEXT: ; implicit-def: $vgpr22_vgpr23 ; CHECK-NEXT: s_branch .LBB0_2 ; CHECK-NEXT: .LBB0_1: ; %Flow9 @@ -62,11 +64,12 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[14:15] ; CHECK-NEXT: flat_load_dwordx2 v[24:25], v[24:25] -; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[30:31] +; CHECK-NEXT: v_accvgpr_read_b32 v27, a3 +; CHECK-NEXT: v_accvgpr_read_b32 v26, a2 ; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[2:3] -; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[20:21] -; CHECK-NEXT: v_accvgpr_write_b32 a2, 0 -; CHECK-NEXT: v_accvgpr_write_b32 a3, 0 +; CHECK-NEXT: v_mov_b64_e32 v[16:17], v[0:1] +; CHECK-NEXT: v_accvgpr_write_b32 a0, 0 +; CHECK-NEXT: v_accvgpr_write_b32 a1, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[24:25] ; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] @@ -93,32 +96,30 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: .LBB0_6: ; %.preheader1855.i.i.i3329 ; CHECK-NEXT: ; Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 -; CHECK-NEXT: v_accvgpr_read_b32 v29, a3 -; CHECK-NEXT: v_accvgpr_read_b32 v28, a2 +; CHECK-NEXT: v_accvgpr_read_b32 v29, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v28, a0 ; CHECK-NEXT: s_mov_b64 s[24:25], -1 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 ; CHECK-NEXT: s_mov_b64 vcc, s[2:3] -; CHECK-NEXT: ; implicit-def: $agpr2_agpr3 +; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 ; CHECK-NEXT: ; %bb.7: ; %.lr.ph2070.i.i.i3291 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 -; CHECK-NEXT: v_accvgpr_mov_b32 a3, a1 -; CHECK-NEXT: v_accvgpr_mov_b32 a2, a0 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v30 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v31 ; CHECK-NEXT: s_mov_b64 s[8:9], s[18:19] ; CHECK-NEXT: s_mov_b64 vcc, s[6:7] ; CHECK-NEXT: s_cbranch_vccz .LBB0_5 ; CHECK-NEXT: ; %bb.8: ; %.preheader1856.preheader.i.i.i3325 ; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 -; CHECK-NEXT: v_accvgpr_write_b32 a2, v26 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v26 ; CHECK-NEXT: s_mov_b64 s[24:25], 0 -; CHECK-NEXT: v_accvgpr_write_b32 a3, v27 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v27 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: s_branch .LBB0_5 ; CHECK-NEXT: 
.LBB0_9: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[10:11] -; CHECK-NEXT: v_accvgpr_write_b32 a0, v24 ; CHECK-NEXT: s_mov_b64 s[22:23], 0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v25 +; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11] ; CHECK-NEXT: s_mov_b64 s[8:9], s[20:21] ; CHECK-NEXT: s_branch .LBB0_15 ; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_2 Depth=1 @@ -135,21 +136,19 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: v_cndmask_b32_e64 v23, v23, 0, s[16:17] ; CHECK-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[16:17] ; CHECK-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[8:9] -; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v17, v16 +; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17] +; CHECK-NEXT: global_store_dwordx2 v20, v[16:17], s[12:13] ; CHECK-NEXT: s_cselect_b32 s23, s23, 0 ; CHECK-NEXT: s_cselect_b32 s22, s22, 0 ; CHECK-NEXT: s_mov_b64 s[8:9], -1 -; CHECK-NEXT: global_store_dwordx2 v0, v[16:17], s[12:13] ; CHECK-NEXT: s_branch .LBB0_14 ; CHECK-NEXT: .LBB0_13: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: v_mov_b64_e32 v[22:23], 0 -; CHECK-NEXT: .LBB0_14: ; %Flow8 +; CHECK-NEXT: .LBB0_14: ; %Flow6 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v24 -; CHECK-NEXT: v_mov_b64_e32 v[16:17], 0 -; CHECK-NEXT: v_accvgpr_write_b32 a1, v25 +; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[24:25] ; CHECK-NEXT: .LBB0_15: ; %Flow6 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[24:25], -1 @@ -158,7 +157,7 @@ define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, ; CHECK-NEXT: ; %bb.16: ; %._crit_edge2105.i.i.i2330 ; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: s_mov_b64 s[24:25], 0 -; CHECK-NEXT: global_store_dwordx2 v0, v[16:17], s[12:13] +; CHECK-NEXT: global_store_dwordx2 v20, v[20:21], s[12:13] ; CHECK-NEXT: s_branch .LBB0_1 ; CHECK-NEXT: .LBB0_17: ; %DummyReturnBlock ; CHECK-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir index c456f9c4b16e5..a2ec87053a8d5 100644 --- a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir +++ b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir @@ -49,7 +49,7 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: liveins: $exec, $sgpr30, $sgpr31, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr40, $sgpr30_sgpr31, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr41_vgpr42:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F, $vgpr45_vgpr46:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F + ; GCN-NEXT: liveins: $exec, $sgpr30, $sgpr31, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr40, $sgpr30_sgpr31, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr57 = COPY $vgpr9, implicit $exec ; GCN-NEXT: renamable $vgpr56 = COPY $vgpr8, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 44c719f3635c8..94ba5cdd09df4 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -7,6 +7,10 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10 ; 
RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16 +; xUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250TRUE16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX1250,GFX1250FAKE16 + +; FIXME: real-true16 version of gfx1250 test fails define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GCN-LABEL: test_load_store: @@ -76,6 +80,15 @@ define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v0, off ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b16 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load bfloat, ptr addrspace(1) %in store bfloat %val, ptr addrspace(1) %out ret void @@ -135,6 +148,14 @@ define <2 x bfloat> @v_load_global_v2bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_b32 v0, v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <2 x bfloat>, ptr addrspace(1) %ptr ret <2 x bfloat> %load } @@ -195,6 +216,14 @@ define <3 x bfloat> @v_load_global_v3bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <3 x bfloat>, ptr addrspace(1) %ptr ret <3 x bfloat> %load } @@ -257,6 +286,14 @@ define <4 x bfloat> @v_load_global_v4bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <4 x bfloat>, ptr addrspace(1) %ptr ret <4 x bfloat> %load } @@ -323,6 +360,14 @@ define <6 x bfloat> @v_load_global_v6bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_b96 v[0:2], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v6bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <6 x bfloat>, ptr addrspace(1) %ptr ret <6 x bfloat> %load } @@ -393,6 +438,14 @@ define <8 x bfloat> @v_load_global_v8bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: 
global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b128 v[0:3], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <8 x bfloat>, ptr addrspace(1) %ptr ret <8 x bfloat> %load } @@ -511,6 +564,17 @@ define <16 x bfloat> @v_load_global_v16bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_load_b128 v[0:3], v[4:5], off +; GFX1250-NEXT: global_load_b128 v[4:7], v[4:5], off offset:16 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <16 x bfloat>, ptr addrspace(1) %ptr ret <16 x bfloat> %load } @@ -683,6 +747,19 @@ define <32 x bfloat> @v_load_global_v32bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v12, v0 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_load_b128 v[0:3], v[12:13], off +; GFX1250-NEXT: global_load_b128 v[4:7], v[12:13], off offset:16 +; GFX1250-NEXT: global_load_b128 v[8:11], v[12:13], off offset:32 +; GFX1250-NEXT: global_load_b128 v[12:15], v[12:13], off offset:48 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <32 x bfloat>, ptr addrspace(1) %ptr ret <32 x bfloat> %load } @@ -973,6 +1050,23 @@ define <64 x bfloat> @v_load_global_v64bf16(ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_load_global_v64bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_mov_b32 v28, v0 +; GFX1250-NEXT: s_clause 0x7 +; GFX1250-NEXT: global_load_b128 v[0:3], v[28:29], off +; GFX1250-NEXT: global_load_b128 v[4:7], v[28:29], off offset:16 +; GFX1250-NEXT: global_load_b128 v[8:11], v[28:29], off offset:32 +; GFX1250-NEXT: global_load_b128 v[12:15], v[28:29], off offset:48 +; GFX1250-NEXT: global_load_b128 v[16:19], v[28:29], off offset:64 +; GFX1250-NEXT: global_load_b128 v[20:23], v[28:29], off offset:80 +; GFX1250-NEXT: global_load_b128 v[24:27], v[28:29], off offset:96 +; GFX1250-NEXT: global_load_b128 v[28:31], v[28:29], off offset:112 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <64 x bfloat>, ptr addrspace(1) %ptr ret <64 x bfloat> %load } @@ -1042,6 +1136,14 @@ define void @v_store_global_v2bf16(<2 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v[1:2], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_store_global_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <2 x bfloat> %val, ptr addrspace(1) %ptr ret void } @@ -1115,6 +1217,15 @@ define void @v_store_global_v3bf16(<3 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_store_global_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b16 v[2:3], v1, off offset:4 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <3 x bfloat> %val, ptr addrspace(1) %ptr ret void } @@ -1183,6 +1294,13 @@ define void @v_store_global_v4bf16(<4 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_store_global_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <4 x bfloat> %val, ptr addrspace(1) %ptr ret void } @@ -1267,6 +1385,13 @@ define void @v_store_global_v8bf16(<8 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_store_global_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <8 x bfloat> %val, ptr addrspace(1) %ptr ret void } @@ -1393,6 +1518,15 @@ define void @v_store_global_v16bf16(<16 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_store_global_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16 +; GFX1250-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <16 x bfloat> %val, ptr addrspace(1) %ptr ret void } @@ -1610,6 +1744,17 @@ define void @v_store_global_v32bf16(<32 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_store_b128 v[16:17], v[4:7], off offset:16 ; GFX11-NEXT: global_store_b128 v[16:17], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_store_global_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[16:17], v[12:15], off offset:48 +; GFX1250-NEXT: global_store_b128 v[16:17], v[8:11], off offset:32 +; GFX1250-NEXT: global_store_b128 v[16:17], v[4:7], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[0:3], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <32 x bfloat> %val, ptr addrspace(1) %ptr ret void } @@ -2148,6 +2293,26 @@ define void @v_store_global_v64bf16(<64 x bfloat> %val, ptr addrspace(1) %ptr) { ; GFX11-NEXT: global_store_b128 v[32:33], v[4:7], off offset:16 ; GFX11-NEXT: global_store_b128 
v[32:33], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_store_global_v64bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x2 +; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_clause 0x7 +; GFX1250-NEXT: global_store_b128 v[32:33], v[28:31], off offset:112 +; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off offset:96 +; GFX1250-NEXT: global_store_b128 v[32:33], v[20:23], off offset:80 +; GFX1250-NEXT: global_store_b128 v[32:33], v[16:19], off offset:64 +; GFX1250-NEXT: global_store_b128 v[32:33], v[12:15], off offset:48 +; GFX1250-NEXT: global_store_b128 v[32:33], v[8:11], off offset:32 +; GFX1250-NEXT: global_store_b128 v[32:33], v[4:7], off offset:16 +; GFX1250-NEXT: global_store_b128 v[32:33], v[0:3], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <64 x bfloat> %val, ptr addrspace(1) %ptr ret void } @@ -2227,6 +2392,16 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) { ; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v4, off ; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v5, off ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_store_fpimm: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v4, 0x3f80 +; GFX1250-NEXT: v_mov_b32_e32 v5, 0x4228 +; GFX1250-NEXT: global_store_b16 v[0:1], v4, off +; GFX1250-NEXT: global_store_b16 v[2:3], v5, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store bfloat 1.0, ptr addrspace(1) %ptr0 store bfloat 42.0, ptr addrspace(1) %ptr1 ret void @@ -2330,6 +2505,16 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo ; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store_f32_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: global_store_b16 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load float, ptr addrspace(1) %in %val.bf16 = fptrunc float %val to bfloat store bfloat %val.bf16, ptr addrspace(1) %out @@ -2488,6 +2673,29 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc_lo ; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store_f64_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 +; GFX1250-NEXT: v_cmp_gt_f64_e64 s0, |v[0:1]|, |v[4:5]| +; GFX1250-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s0 +; GFX1250-NEXT: v_dual_add_nc_u32 v0, v6, v0 :: v_dual_bitop2_b32 v7, 1, v6 
bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e64 s0, 1, v7 +; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: global_store_b16 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load double, ptr addrspace(1) %in %val.bf16 = fptrunc double %val to bfloat store bfloat %val.bf16, ptr addrspace(1) %out @@ -2560,6 +2768,16 @@ define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store_bf16_to_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load bfloat, ptr addrspace(1) %in %val.f32 = fpext bfloat %val to float store float %val.f32, ptr addrspace(1) %out @@ -2639,6 +2857,18 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store_bf16_to_f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load bfloat, ptr addrspace(1) %in %val.f64 = fpext bfloat %val to double store double %val.f64, ptr addrspace(1) %out @@ -2705,6 +2935,15 @@ define void @test_load_store_v2bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load <2 x bfloat>, ptr addrspace(1) %in store <2 x bfloat> %val, ptr addrspace(1) %out ret void @@ -2770,6 +3009,15 @@ define void @test_load_store_v4bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load <4 x bfloat>, ptr addrspace(1) %in store <4 x bfloat> %val, ptr addrspace(1) %out ret void @@ -2835,6 +3083,15 @@ define void @test_load_store_v8bf16(ptr addrspace(1) %in, ptr addrspace(1) %out) ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[2:3], v[4:7], off ; GFX11-NEXT: s_setpc_b64 
s[30:31] +; +; GFX1250-LABEL: test_load_store_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load <8 x bfloat>, ptr addrspace(1) %in store <8 x bfloat> %val, ptr addrspace(1) %out ret void @@ -2924,6 +3181,19 @@ define void @test_load_store_v16bf16(ptr addrspace(1) %in, ptr addrspace(1) %out ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[2:3], v[8:11], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_load_store_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16 +; GFX1250-NEXT: global_load_b128 v[8:11], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x1 +; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off offset:16 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[2:3], v[8:11], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load <16 x bfloat>, ptr addrspace(1) %in store <16 x bfloat> %val, ptr addrspace(1) %out ret void @@ -2990,6 +3260,14 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b16 v[1:2], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_arg_store: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: global_store_b16 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store bfloat %in, ptr addrspace(1) %out ret void } @@ -3059,6 +3337,14 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v[1:2], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_arg_store_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <2 x bfloat> %in, ptr addrspace(1) %out ret void } @@ -3132,6 +3418,15 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) { ; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4 ; GFX11-NEXT: global_store_b32 v[2:3], v0, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_arg_store_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b16 v[2:3], v1, off offset:4 +; GFX1250-NEXT: global_store_b32 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <3 x bfloat> %in, ptr addrspace(1) %out ret void } @@ -3200,6 +3495,13 @@ define void @test_arg_store_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_arg_store_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <4 x bfloat> 
%in, ptr addrspace(1) %out ret void } @@ -3284,6 +3586,13 @@ define void @test_arg_store_v8bf16(<8 x bfloat> %in, ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_arg_store_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b128 v[4:5], v[0:3], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <8 x bfloat> %in, ptr addrspace(1) %out ret void } @@ -3410,6 +3719,15 @@ define void @test_arg_store_v16bf16(<16 x bfloat> %in, ptr addrspace(1) %out) { ; GFX11-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16 ; GFX11-NEXT: global_store_b128 v[8:9], v[0:3], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_arg_store_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[8:9], v[4:7], off offset:16 +; GFX1250-NEXT: global_store_b128 v[8:9], v[0:3], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store <16 x bfloat> %in, ptr addrspace(1) %out ret void } @@ -3477,6 +3795,14 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1) ; GFX11FAKE16-NEXT: v_mov_b32_e32 v2, s4 ; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_inreg_arg_store: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s4 +; GFX1250-NEXT: global_store_b16 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store bfloat %in, ptr addrspace(1) %out ret void } @@ -3539,6 +3865,13 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) { ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: scratch_store_b16 off, v0, s32 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_byval: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_store_b16 off, v0, s32 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store bfloat %val, ptr addrspace(5) %bv %retval = load bfloat, ptr addrspace(5) %bv ret bfloat %retval @@ -3595,6 +3928,13 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_store_b16 v0, v1, off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_sret: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_store_b16 v0, v1, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] store bfloat %val, ptr addrspace(5) %sret ret void } @@ -3667,6 +4007,15 @@ define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %ou ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v0, off ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_bitcast_from_bfloat: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b16 v[2:3], v0, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load bfloat, ptr addrspace(1) %in %val_int = bitcast bfloat %val to i16 store i16 %val_int, ptr addrspace(1) %out @@ -3741,6 +4090,15 @@ define void 
@test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_bitcast_to_bfloat: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v2, v[2:3], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b16 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %val = load i16, ptr addrspace(1) %in %val_fp = bitcast i16 %val to bfloat store bfloat %val_fp, ptr addrspace(1) %out @@ -3777,6 +4135,12 @@ define bfloat @test_ret(bfloat %in) { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_ret: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: ret bfloat %in } @@ -3811,6 +4175,12 @@ define <2 x bfloat> @test_ret_v2bf16(<2 x bfloat> %in) { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_ret_v2bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: ret <2 x bfloat> %in } @@ -3845,6 +4215,12 @@ define <3 x bfloat> @test_ret_v3bf16(<3 x bfloat> %in) { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_ret_v3bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: ret <3 x bfloat> %in } @@ -3879,6 +4255,12 @@ define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_ret_v4bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: ret <4 x bfloat> %in } @@ -3913,6 +4295,12 @@ define <8 x bfloat> @test_ret_v8bf16(<8 x bfloat> %in) { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_ret_v8bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: ret <8 x bfloat> %in } @@ -3947,6 +4335,12 @@ define <16 x bfloat> @test_ret_v16bf16(<16 x bfloat> %in) { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_ret_v16bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: ret <16 x bfloat> %in } @@ -4161,6 +4555,38 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_call: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, s33 +; GFX1250-NEXT: s_mov_b32 s33, s32 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_store_b32 off, v4, s33 ; 
4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_get_pc_i64 s[0:1] +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store@gotpcrel+4 +; GFX1250-NEXT: v_writelane_b32 v4, s30, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 +; GFX1250-NEXT: v_writelane_b32 v4, s31, 1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: scratch_store_b16 v1, v0, off scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: v_readlane_b32 s31, v4, 1 +; GFX1250-NEXT: v_readlane_b32 s30, v4, 0 +; GFX1250-NEXT: s_mov_b32 s32, s33 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_mov_b32 s33, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %result = call bfloat @test_arg_store(bfloat %in) store volatile bfloat %result, ptr addrspace(5) %out @@ -4387,6 +4813,38 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_call_v2bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, s33 +; GFX1250-NEXT: s_mov_b32 s33, s32 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_store_b32 off, v4, s33 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_get_pc_i64 s[0:1] +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4 +; GFX1250-NEXT: v_writelane_b32 v4, s30, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 +; GFX1250-NEXT: v_writelane_b32 v4, s31, 1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: scratch_store_b32 v1, v0, off scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: v_readlane_b32 s31, v4, 1 +; GFX1250-NEXT: v_readlane_b32 s30, v4, 0 +; GFX1250-NEXT: s_mov_b32 s32, s33 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_mov_b32 s33, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %result = call <2 x bfloat> @test_arg_store_v2bf16(<2 x bfloat> %in) store volatile <2 x bfloat> %result, ptr addrspace(5) %out @@ -4629,6 +5087,41 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_call_v3bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, s33 +; GFX1250-NEXT: s_mov_b32 s33, s32 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_get_pc_i64 s[0:1] +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4 +; GFX1250-NEXT: v_writelane_b32 v5, s30, 0 +; GFX1250-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 +; GFX1250-NEXT: v_mov_b32_e32 v4, v2 +; GFX1250-NEXT: v_writelane_b32 v5, s31, 1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: scratch_store_b16 v4, v1, off offset:4 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: scratch_store_b32 v4, v0, off scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: v_readlane_b32 s31, v5, 1 +; GFX1250-NEXT: v_readlane_b32 s30, v5, 0 +; GFX1250-NEXT: s_mov_b32 s32, s33 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_mov_b32 s33, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %result = call <3 x bfloat> @test_arg_store_v2bf16(<3 x bfloat> %in) store volatile <3 x bfloat> %result, ptr addrspace(5) %out @@ -4883,6 +5376,39 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_call_v4bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, s33 +; GFX1250-NEXT: s_mov_b32 s33, s32 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_get_pc_i64 s[0:1] +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4 +; GFX1250-NEXT: v_writelane_b32 v5, s30, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 +; GFX1250-NEXT: v_mov_b32_e32 v4, v2 +; GFX1250-NEXT: v_writelane_b32 v5, s31, 1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: scratch_store_b64 v4, v[0:1], off scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: v_readlane_b32 s31, v5, 1 +; GFX1250-NEXT: v_readlane_b32 s30, v5, 0 +; GFX1250-NEXT: s_mov_b32 s32, s33 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_mov_b32 s33, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %result = call <4 x bfloat> @test_arg_store_v2bf16(<4 x bfloat> %in) store volatile <4 x bfloat> %result, ptr addrspace(5) %out @@ -5190,6 +5716,38 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_call_v8bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, s33 +; GFX1250-NEXT: s_mov_b32 s33, s32 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_store_b32 off, v5, s33 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_get_pc_i64 s[0:1] +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4 +; GFX1250-NEXT: v_writelane_b32 v5, s30, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: 
s_add_co_i32 s32, s32, 16 +; GFX1250-NEXT: v_writelane_b32 v5, s31, 1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: scratch_store_b128 v4, v[0:3], off scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: v_readlane_b32 s31, v5, 1 +; GFX1250-NEXT: v_readlane_b32 s30, v5, 0 +; GFX1250-NEXT: s_mov_b32 s32, s33 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_mov_b32 s33, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %result = call <8 x bfloat> @test_arg_store_v2bf16(<8 x bfloat> %in) store volatile <8 x bfloat> %result, ptr addrspace(5) %out @@ -5609,6 +6167,40 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_call_v16bf16: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, s33 +; GFX1250-NEXT: s_mov_b32 s33, s32 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_store_b32 off, v9, s33 ; 4-byte Folded Spill +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_get_pc_i64 s[0:1] +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], test_arg_store_v2bf16@gotpcrel+4 +; GFX1250-NEXT: v_writelane_b32 v9, s30, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX1250-NEXT: s_add_co_i32 s32, s32, 16 +; GFX1250-NEXT: v_writelane_b32 v9, s31, 1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_swap_pc_i64 s[30:31], s[0:1] +; GFX1250-NEXT: scratch_store_b128 v8, v[4:7], off offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: scratch_store_b128 v8, v[0:3], off scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: v_readlane_b32 s31, v9, 1 +; GFX1250-NEXT: v_readlane_b32 s30, v9, 0 +; GFX1250-NEXT: s_mov_b32 s32, s33 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-NEXT: s_mov_b32 s33, s2 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %result = call <16 x bfloat> @test_arg_store_v2bf16(<16 x bfloat> %in) store volatile <16 x bfloat> %result, ptr addrspace(5) %out @@ -5693,6 +6285,16 @@ define bfloat @test_alloca_load_store_ret(bfloat %in) { ; GFX11FAKE16-NEXT: scratch_load_u16 v0, off, s32 glc dlc ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_alloca_load_store_ret: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_store_b16 off, v0, s32 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: scratch_load_u16 v0, off, s32 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] entry: %in.addr = alloca bfloat, align 2, addrspace(5) store volatile bfloat %in, ptr addrspace(5) %in.addr, align 2 @@ -6105,6 +6707,28 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) { ; GFX11FAKE16-NEXT: scratch_store_b128 v0, v[26:29], off offset:96 ; GFX11FAKE16-NEXT: 
scratch_store_b16 v0, v1, off offset:128 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_overflow_stack: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x2 +; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: s_clause 0x5 +; GFX1250-NEXT: scratch_store_b128 v0, v[22:25], off offset:80 +; GFX1250-NEXT: scratch_store_b128 v0, v[18:21], off offset:64 +; GFX1250-NEXT: scratch_store_b128 v0, v[14:17], off offset:48 +; GFX1250-NEXT: scratch_store_b128 v0, v[10:13], off offset:32 +; GFX1250-NEXT: scratch_store_b128 v0, v[6:9], off offset:16 +; GFX1250-NEXT: scratch_store_b128 v0, v[2:5], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_clause 0x2 +; GFX1250-NEXT: scratch_store_b128 v0, v[30:33], off offset:112 +; GFX1250-NEXT: scratch_store_b128 v0, v[26:29], off offset:96 +; GFX1250-NEXT: scratch_store_b16 v0, v1, off offset:128 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %ins.0 = insertvalue { <32 x i32>, bfloat } poison, <32 x i32> %b, 0 %ins.1 = insertvalue { <32 x i32>, bfloat } %ins.0 ,bfloat %a, 1 ret { <32 x i32>, bfloat } %ins.1 @@ -6172,6 +6796,16 @@ define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v2bf16_to_v2f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <2 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <2 x bfloat> %load to <2 x float> ret <2 x float> %fpext @@ -6255,6 +6889,17 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v3bf16_to_v3f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <3 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <3 x bfloat> %load to <3 x float> ret <3 x float> %fpext @@ -6334,6 +6979,18 @@ define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v4bf16_to_v4f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <4 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <4 x bfloat> %load to <4 x float> ret <4 x 
float> %fpext @@ -6423,6 +7080,19 @@ define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v5bf16_to_v5f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b128 v[2:5], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <5 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <5 x bfloat> %load to <5 x float> ret <5 x float> %fpext @@ -6527,6 +7197,19 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v6bf16_to_v6f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b96 v[4:6], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <6 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <6 x bfloat> %load to <6 x float> ret <6 x float> %fpext @@ -6630,6 +7313,21 @@ define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v8bf16_to_v8f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX1250-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <8 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <8 x bfloat> %load to <8 x float> ret <8 x float> %fpext @@ -6797,6 +7495,32 @@ define <16 x float> @global_extload_v16bf16_to_v16f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_lshlrev_b32_e32 v14, 16, v15 ; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v16bf16_to_v16f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX1250-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16 +; GFX1250-NEXT: s_wait_loadcnt 0x1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 
+; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX1250-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v8, 16, v12 :: v_dual_lshlrev_b32 v10, 16, v13 +; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX1250-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX1250-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX1250-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <16 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <16 x bfloat> %load to <16 x float> ret <16 x float> %fpext @@ -7088,6 +7812,50 @@ define <32 x float> @global_extload_v32bf16_to_v32f32(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_lshlrev_b32_e32 v30, 16, v31 ; GFX11-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v32bf16_to_v32f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off +; GFX1250-NEXT: global_load_b128 v[12:15], v[0:1], off offset:16 +; GFX1250-NEXT: global_load_b128 v[20:23], v[0:1], off offset:32 +; GFX1250-NEXT: global_load_b128 v[28:31], v[0:1], off offset:48 +; GFX1250-NEXT: s_wait_loadcnt 0x3 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 +; GFX1250-NEXT: v_lshlrev_b32_e32 v6, 16, v7 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX1250-NEXT: s_wait_loadcnt 0x2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v8, 16, v12 :: v_dual_lshlrev_b32 v10, 16, v13 +; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v12 +; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v13 +; GFX1250-NEXT: v_lshlrev_b32_e32 v12, 16, v14 +; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v14 +; GFX1250-NEXT: v_lshlrev_b32_e32 v14, 16, v15 +; GFX1250-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX1250-NEXT: s_wait_loadcnt 0x1 +; GFX1250-NEXT: v_dual_lshlrev_b32 v16, 16, v20 :: v_dual_lshlrev_b32 v18, 16, v21 +; GFX1250-NEXT: v_and_b32_e32 v17, 0xffff0000, v20 +; GFX1250-NEXT: v_and_b32_e32 v19, 0xffff0000, v21 +; GFX1250-NEXT: v_lshlrev_b32_e32 v20, 16, v22 +; GFX1250-NEXT: v_and_b32_e32 v21, 0xffff0000, v22 +; GFX1250-NEXT: v_lshlrev_b32_e32 v22, 16, v23 +; GFX1250-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v24, 16, v28 :: v_dual_lshlrev_b32 v26, 16, v29 +; GFX1250-NEXT: v_and_b32_e32 v25, 0xffff0000, v28 +; GFX1250-NEXT: v_and_b32_e32 v27, 0xffff0000, v29 +; GFX1250-NEXT: v_lshlrev_b32_e32 v28, 16, v30 +; GFX1250-NEXT: v_and_b32_e32 v29, 0xffff0000, v30 +; GFX1250-NEXT: v_lshlrev_b32_e32 v30, 16, v31 +; GFX1250-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <32 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <32 x bfloat> %load to <32 x float> ret <32 x float> %fpext @@ -7179,6 +7947,19 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX1250-LABEL: global_extload_v2bf16_to_v2f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <2 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <2 x bfloat> %load to <2 x double> ret <2 x double> %fpext @@ -7285,6 +8066,22 @@ define <3 x double> @global_extload_v3bf16_to_v3f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v3bf16_to_v3f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <3 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <3 x bfloat> %load to <3 x double> ret <3 x double> %fpext @@ -7390,6 +8187,23 @@ define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v4bf16_to_v4f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v4, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <4 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <4 x bfloat> %load to <4 x double> ret <4 x double> %fpext @@ -7509,6 +8323,24 @@ define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v5bf16_to_v5f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b128 v[2:5], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v5, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 +; GFX1250-NEXT: 
v_lshlrev_b32_e32 v8, 16, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <5 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <5 x bfloat> %load to <5 x double> ret <5 x double> %fpext @@ -7636,6 +8468,26 @@ define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v6bf16_to_v6f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b96 v[4:6], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v4 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v4 +; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v8, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <6 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <6 x bfloat> %load to <6 x double> ret <6 x double> %fpext @@ -7787,6 +8639,28 @@ define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v8bf16_to_v8f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b128 v[8:11], v[0:1], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v8 :: v_dual_lshlrev_b32 v4, 16, v9 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v9 +; GFX1250-NEXT: v_dual_lshlrev_b32 v8, 16, v10 :: v_dual_lshlrev_b32 v12, 16, v11 +; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX1250-NEXT: v_and_b32_e32 v14, 0xffff0000, v11 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <8 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <8 x bfloat> %load to <8 x double> ret <8 x double> %fpext @@ -8050,6 +8924,46 @@ define <16 x double> @global_extload_v16bf16_to_v16f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v16bf16_to_v16f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_load_b128 v[8:11], v[0:1], off +; GFX1250-NEXT: global_load_b128 v[24:27], 
v[0:1], off offset:16 +; GFX1250-NEXT: s_wait_loadcnt 0x1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v8 :: v_dual_lshlrev_b32 v4, 16, v9 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v8 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v9 +; GFX1250-NEXT: v_dual_lshlrev_b32 v8, 16, v10 :: v_dual_lshlrev_b32 v12, 16, v11 +; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX1250-NEXT: v_and_b32_e32 v14, 0xffff0000, v11 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v16, 16, v24 :: v_dual_lshlrev_b32 v20, 16, v25 +; GFX1250-NEXT: v_and_b32_e32 v18, 0xffff0000, v24 +; GFX1250-NEXT: v_and_b32_e32 v22, 0xffff0000, v25 +; GFX1250-NEXT: v_dual_lshlrev_b32 v24, 16, v26 :: v_dual_lshlrev_b32 v28, 16, v27 +; GFX1250-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX1250-NEXT: v_and_b32_e32 v30, 0xffff0000, v27 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[14:15], v14 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[16:17], v16 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[18:19], v18 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[20:21], v20 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[22:23], v22 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[24:25], v24 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[26:27], v26 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[28:29], v28 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[30:31], v30 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <16 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <16 x bfloat> %load to <16 x double> ret <16 x double> %fpext @@ -9570,6 +10484,131 @@ define <32 x double> @global_extload_v32bf16_to_v32f64(ptr addrspace(1) %ptr) { ; GFX11-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: global_extload_v32bf16_to_v32f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: s_clause 0x1f +; GFX1250-NEXT: global_load_u16 v1, v[2:3], off offset:2 +; GFX1250-NEXT: global_load_u16 v4, v[2:3], off offset:12 +; GFX1250-NEXT: global_load_u16 v5, v[2:3], off offset:8 +; GFX1250-NEXT: global_load_u16 v6, v[2:3], off offset:4 +; GFX1250-NEXT: global_load_u16 v7, v[2:3], off +; GFX1250-NEXT: global_load_u16 v8, v[2:3], off offset:6 +; GFX1250-NEXT: global_load_u16 v9, v[2:3], off offset:10 +; GFX1250-NEXT: global_load_u16 v10, v[2:3], off offset:14 +; GFX1250-NEXT: global_load_u16 v11, v[2:3], off offset:18 +; GFX1250-NEXT: global_load_u16 v12, v[2:3], off offset:62 +; GFX1250-NEXT: global_load_u16 v13, v[2:3], off offset:60 +; GFX1250-NEXT: global_load_u16 v14, v[2:3], off offset:58 +; GFX1250-NEXT: global_load_u16 v15, v[2:3], off offset:56 +; GFX1250-NEXT: global_load_u16 v16, v[2:3], off offset:28 +; GFX1250-NEXT: global_load_u16 v17, v[2:3], off offset:24 +; GFX1250-NEXT: global_load_u16 v18, v[2:3], off offset:20 +; GFX1250-NEXT: global_load_u16 v19, v[2:3], off offset:16 +; GFX1250-NEXT: global_load_u16 v20, v[2:3], off offset:22 +; GFX1250-NEXT: global_load_u16 v21, v[2:3], off offset:26 +; GFX1250-NEXT: global_load_u16 v22, v[2:3], off offset:30 +; GFX1250-NEXT: global_load_u16 v23, v[2:3], off offset:34 +; 
GFX1250-NEXT: global_load_u16 v24, v[2:3], off offset:44 +; GFX1250-NEXT: global_load_u16 v25, v[2:3], off offset:40 +; GFX1250-NEXT: global_load_u16 v26, v[2:3], off offset:36 +; GFX1250-NEXT: global_load_u16 v27, v[2:3], off offset:32 +; GFX1250-NEXT: global_load_u16 v28, v[2:3], off offset:38 +; GFX1250-NEXT: global_load_u16 v29, v[2:3], off offset:42 +; GFX1250-NEXT: global_load_u16 v30, v[2:3], off offset:46 +; GFX1250-NEXT: global_load_u16 v31, v[2:3], off offset:50 +; GFX1250-NEXT: global_load_u16 v32, v[2:3], off offset:52 +; GFX1250-NEXT: global_load_u16 v33, v[2:3], off offset:48 +; GFX1250-NEXT: global_load_u16 v34, v[2:3], off offset:54 +; GFX1250-NEXT: s_wait_loadcnt 0x1e +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v37, 16, v4 +; GFX1250-NEXT: s_wait_loadcnt 0x1c +; GFX1250-NEXT: v_dual_lshlrev_b32 v81, 16, v5 :: v_dual_lshlrev_b32 v85, 16, v6 +; GFX1250-NEXT: s_wait_loadcnt 0x1a +; GFX1250-NEXT: v_dual_lshlrev_b32 v84, 16, v7 :: v_dual_lshlrev_b32 v35, 16, v8 +; GFX1250-NEXT: s_wait_loadcnt 0x18 +; GFX1250-NEXT: v_dual_lshlrev_b32 v80, 16, v9 :: v_dual_lshlrev_b32 v36, 16, v10 +; GFX1250-NEXT: s_wait_loadcnt 0x15 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v12 :: v_dual_lshlrev_b32 v3, 16, v13 +; GFX1250-NEXT: s_wait_loadcnt 0x14 +; GFX1250-NEXT: v_dual_lshlrev_b32 v12, 16, v11 :: v_dual_lshlrev_b32 v6, 16, v14 +; GFX1250-NEXT: s_wait_loadcnt 0x13 +; GFX1250-NEXT: v_lshlrev_b32_e32 v7, 16, v15 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v2 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; GFX1250-NEXT: s_wait_loadcnt 0x11 +; GFX1250-NEXT: v_dual_lshlrev_b32 v68, 16, v17 :: v_dual_lshlrev_b32 v39, 16, v16 +; GFX1250-NEXT: s_wait_loadcnt 0xe +; GFX1250-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 +; GFX1250-NEXT: s_wait_loadcnt 0xc +; GFX1250-NEXT: v_dual_lshlrev_b32 v21, 16, v21 :: v_dual_lshlrev_b32 v38, 16, v22 +; GFX1250-NEXT: s_wait_loadcnt 0x9 +; GFX1250-NEXT: v_dual_lshlrev_b32 v48, 16, v23 :: v_dual_lshlrev_b32 v25, 16, v25 +; GFX1250-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX1250-NEXT: s_wait_loadcnt 0x5 +; GFX1250-NEXT: v_dual_lshlrev_b32 v49, 16, v28 :: v_dual_lshlrev_b32 v64, 16, v29 +; GFX1250-NEXT: s_wait_loadcnt 0x3 +; GFX1250-NEXT: v_dual_lshlrev_b32 v50, 16, v30 :: v_dual_lshlrev_b32 v51, 16, v31 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v33, 16, v33 :: v_dual_lshlrev_b32 v52, 16, v34 +; GFX1250-NEXT: v_dual_lshlrev_b32 v32, 16, v32 :: v_dual_lshlrev_b32 v69, 16, v27 +; GFX1250-NEXT: v_lshlrev_b32_e32 v70, 16, v26 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[14:15], v35 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[54:55], v52 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[52:53], v32 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[30:31], v38 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[28:29], v39 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[34:35], v48 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[38:39], v49 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[48:49], v33 +; GFX1250-NEXT: v_dual_lshlrev_b32 v13, 16, v19 :: v_dual_lshlrev_b32 v82, 16, v18 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[66:67], v64 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[64:65], v25 +; GFX1250-NEXT: scratch_store_b128 v0, v[2:5], off offset:240 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[4:5], v50 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[50:51], v51 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[2:3], 
v24 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[18:19], v36 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[16:17], v37 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[36:37], v70 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[32:33], v69 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[70:71], v21 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[68:69], v68 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[26:27], v20 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[24:25], v82 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[22:23], v12 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[20:21], v13 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[82:83], v80 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[80:81], v81 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[12:13], v85 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 +; GFX1250-NEXT: scratch_store_b128 v0, v[6:9], off offset:224 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_cvt_f64_f32_e32 v[8:9], v84 +; GFX1250-NEXT: s_clause 0xd +; GFX1250-NEXT: scratch_store_b128 v0, v[52:55], off offset:208 +; GFX1250-NEXT: scratch_store_b128 v0, v[48:51], off offset:192 +; GFX1250-NEXT: scratch_store_b128 v0, v[2:5], off offset:176 +; GFX1250-NEXT: scratch_store_b128 v0, v[64:67], off offset:160 +; GFX1250-NEXT: scratch_store_b128 v0, v[36:39], off offset:144 +; GFX1250-NEXT: scratch_store_b128 v0, v[32:35], off offset:128 +; GFX1250-NEXT: scratch_store_b128 v0, v[28:31], off offset:112 +; GFX1250-NEXT: scratch_store_b128 v0, v[68:71], off offset:96 +; GFX1250-NEXT: scratch_store_b128 v0, v[24:27], off offset:80 +; GFX1250-NEXT: scratch_store_b128 v0, v[20:23], off offset:64 +; GFX1250-NEXT: scratch_store_b128 v0, v[16:19], off offset:48 +; GFX1250-NEXT: scratch_store_b128 v0, v[80:83], off offset:32 +; GFX1250-NEXT: scratch_store_b128 v0, v[12:15], off offset:16 +; GFX1250-NEXT: scratch_store_b128 v0, v[8:11], off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %load = load <32 x bfloat>, ptr addrspace(1) %ptr %fpext = fpext <32 x bfloat> %load to <32 x double> ret <32 x double> %fpext @@ -9686,6 +10725,16 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd bfloat %a, %b ret bfloat %op } @@ -9859,6 +10908,13 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <2 x bfloat> %a, %b ret <2 x bfloat> %op } @@ -10093,6 +11149,14 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v2 +; GFX1250-NEXT: 
v_pk_add_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <3 x bfloat> %a, %b ret <3 x bfloat> %op } @@ -10383,6 +11447,14 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <4 x bfloat> %a, %b ret <4 x bfloat> %op } @@ -10921,6 +11993,16 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v4 +; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v5 +; GFX1250-NEXT: v_pk_add_bf16 v2, v2, v6 +; GFX1250-NEXT: v_pk_add_bf16 v3, v3, v7 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <8 x bfloat> %a, %b ret <8 x bfloat> %op } @@ -11951,6 +13033,20 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v8 +; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v9 +; GFX1250-NEXT: v_pk_add_bf16 v2, v2, v10 +; GFX1250-NEXT: v_pk_add_bf16 v3, v3, v11 +; GFX1250-NEXT: v_pk_add_bf16 v4, v4, v12 +; GFX1250-NEXT: v_pk_add_bf16 v5, v5, v13 +; GFX1250-NEXT: v_pk_add_bf16 v6, v6, v14 +; GFX1250-NEXT: v_pk_add_bf16 v7, v7, v15 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <16 x bfloat> %a, %b ret <16 x bfloat> %op } @@ -14043,6 +15139,30 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v16 +; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v17 +; GFX1250-NEXT: v_pk_add_bf16 v2, v2, v18 +; GFX1250-NEXT: v_pk_add_bf16 v3, v3, v19 +; GFX1250-NEXT: v_pk_add_bf16 v4, v4, v20 +; GFX1250-NEXT: v_pk_add_bf16 v5, v5, v21 +; GFX1250-NEXT: v_pk_add_bf16 v6, v6, v22 +; GFX1250-NEXT: v_pk_add_bf16 v7, v7, v23 +; GFX1250-NEXT: v_pk_add_bf16 v8, v8, v24 +; GFX1250-NEXT: v_pk_add_bf16 v9, v9, v25 +; GFX1250-NEXT: v_pk_add_bf16 v10, v10, v26 +; GFX1250-NEXT: v_pk_add_bf16 v11, v11, v27 +; GFX1250-NEXT: v_pk_add_bf16 v12, v12, v28 +; GFX1250-NEXT: v_pk_add_bf16 v13, v13, v29 +; GFX1250-NEXT: v_pk_add_bf16 v14, v14, v30 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v15, v15, v31 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fadd <32 x bfloat> %a, %b ret <32 x bfloat> %op } @@ -14147,6 +15267,16 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 
v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_bf16_fpimm_0: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %add = fadd bfloat %arg0, 1.0 ret bfloat %add } @@ -14251,6 +15381,16 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fadd_bf16_fpimm_1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %add = fadd bfloat %arg0, 42.0 ret bfloat %add } @@ -14366,6 +15506,16 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fsub_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fsub bfloat %a, %b ret bfloat %op } @@ -14539,6 +15689,13 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fsub_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fsub <2 x bfloat> %a, %b ret <2 x bfloat> %op } @@ -14773,6 +15930,22 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fsub_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fsub <3 x bfloat> %a, %b ret <3 x bfloat> %op } @@ -15063,6 +16236,25 @@ define <4 x bfloat> 
@v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fsub_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: v_dual_sub_f32 v4, v5, v4 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_sub_f32_e32 v5, v7, v6 +; GFX1250-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fsub <4 x bfloat> %a, %b ret <4 x bfloat> %op } @@ -15178,6 +16370,13 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmul_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, 0 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fmul bfloat %a, %b ret bfloat %op } @@ -15351,6 +16550,13 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmul_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fmul <2 x bfloat> %a, %b ret <2 x bfloat> %op } @@ -15585,6 +16791,14 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmul_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_mul_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fmul <3 x bfloat> %a, %b ret <3 x bfloat> %op } @@ -15875,6 +17089,14 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmul_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_mul_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fmul <4 x bfloat> %a, %b ret <4 x bfloat> %op } @@ -16413,6 +17635,16 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 
s[30:31] +; +; GFX1250-LABEL: v_fmul_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v4 +; GFX1250-NEXT: v_pk_mul_bf16 v1, v1, v5 +; GFX1250-NEXT: v_pk_mul_bf16 v2, v2, v6 +; GFX1250-NEXT: v_pk_mul_bf16 v3, v3, v7 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fmul <8 x bfloat> %a, %b ret <8 x bfloat> %op } @@ -17443,6 +18675,20 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmul_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v8 +; GFX1250-NEXT: v_pk_mul_bf16 v1, v1, v9 +; GFX1250-NEXT: v_pk_mul_bf16 v2, v2, v10 +; GFX1250-NEXT: v_pk_mul_bf16 v3, v3, v11 +; GFX1250-NEXT: v_pk_mul_bf16 v4, v4, v12 +; GFX1250-NEXT: v_pk_mul_bf16 v5, v5, v13 +; GFX1250-NEXT: v_pk_mul_bf16 v6, v6, v14 +; GFX1250-NEXT: v_pk_mul_bf16 v7, v7, v15 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fmul <16 x bfloat> %a, %b ret <16 x bfloat> %op } @@ -19535,6 +20781,30 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmul_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v16 +; GFX1250-NEXT: v_pk_mul_bf16 v1, v1, v17 +; GFX1250-NEXT: v_pk_mul_bf16 v2, v2, v18 +; GFX1250-NEXT: v_pk_mul_bf16 v3, v3, v19 +; GFX1250-NEXT: v_pk_mul_bf16 v4, v4, v20 +; GFX1250-NEXT: v_pk_mul_bf16 v5, v5, v21 +; GFX1250-NEXT: v_pk_mul_bf16 v6, v6, v22 +; GFX1250-NEXT: v_pk_mul_bf16 v7, v7, v23 +; GFX1250-NEXT: v_pk_mul_bf16 v8, v8, v24 +; GFX1250-NEXT: v_pk_mul_bf16 v9, v9, v25 +; GFX1250-NEXT: v_pk_mul_bf16 v10, v10, v26 +; GFX1250-NEXT: v_pk_mul_bf16 v11, v11, v27 +; GFX1250-NEXT: v_pk_mul_bf16 v12, v12, v28 +; GFX1250-NEXT: v_pk_mul_bf16 v13, v13, v29 +; GFX1250-NEXT: v_pk_mul_bf16 v14, v14, v30 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_mul_bf16 v15, v15, v31 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fmul <32 x bfloat> %a, %b ret <32 x bfloat> %op } @@ -19741,6 +21011,32 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fdiv_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 +; GFX1250-NEXT: v_rcp_f32_e32 v3, v2 +; GFX1250-NEXT: v_nop +; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_fma_f32 v4, -v2, v3, 1.0 +; GFX1250-NEXT: v_fmac_f32_e32 v3, v4, v3 +; GFX1250-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mul_f32_e32 v5, v4, v3 +; GFX1250-NEXT: v_fma_f32 v6, -v2, v5, v4 +; GFX1250-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_fmac_f32_e32 v5, v6, v3 +; GFX1250-NEXT: v_fma_f32 v2, -v2, v5, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; GFX1250-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fdiv bfloat %a, %b ret bfloat %op } @@ -19795,6 +21091,13 @@ define bfloat @v_fabs_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fabs_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.fabs.bf16(bfloat %a) ret bfloat %op } @@ -19838,6 +21141,13 @@ define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_fabs_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_and_b32 s0, s0, 0x7fff +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX1250-NEXT: ; return to shader part epilog %op = call bfloat @llvm.fabs.bf16(bfloat %a) %cast = bitcast bfloat %op to i16 %zext = zext i16 %cast to i32 @@ -19887,6 +21197,13 @@ define bfloat @v_fneg_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fneg_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fneg bfloat %a ret bfloat %op } @@ -19933,6 +21250,13 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_fneg_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_xor_b32 s0, s0, 0x8000 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX1250-NEXT: ; return to shader part epilog %op = fneg bfloat %a %cast = bitcast bfloat %op to i16 %zext = zext i16 %cast to i32 @@ -19992,6 +21316,13 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fneg_fabs_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %fabs = call bfloat @llvm.fabs.bf16(bfloat %a) %op = fneg bfloat %fabs ret bfloat %op @@ -20045,6 +21376,13 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_fneg_fabs_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_bitset1_b32 s0, 15 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0 +; GFX1250-NEXT: ; return to shader part epilog %fabs = call bfloat 
@llvm.fabs.bf16(bfloat %a) %op = fneg bfloat %fabs %cast = bitcast bfloat %op to i16 @@ -20172,6 +21510,16 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_minnum_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_min_num_f32_e32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.minnum.bf16(bfloat %a, bfloat %b) ret bfloat %op } @@ -20345,6 +21693,13 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_minnum_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %op } @@ -20579,6 +21934,14 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_minnum_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) ret <3 x bfloat> %op } @@ -20869,6 +22232,14 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_minnum_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) ret <4 x bfloat> %op } @@ -21407,6 +22778,16 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_minnum_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, v4 +; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, v5 +; GFX1250-NEXT: v_pk_min_num_bf16 v2, v2, v6 +; GFX1250-NEXT: v_pk_min_num_bf16 v3, v3, v7 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %op } @@ -22437,6 +23818,20 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX1250-LABEL: v_minnum_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, v8 +; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, v9 +; GFX1250-NEXT: v_pk_min_num_bf16 v2, v2, v10 +; GFX1250-NEXT: v_pk_min_num_bf16 v3, v3, v11 +; GFX1250-NEXT: v_pk_min_num_bf16 v4, v4, v12 +; GFX1250-NEXT: v_pk_min_num_bf16 v5, v5, v13 +; GFX1250-NEXT: v_pk_min_num_bf16 v6, v6, v14 +; GFX1250-NEXT: v_pk_min_num_bf16 v7, v7, v15 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <16 x bfloat> @llvm.minnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) ret <16 x bfloat> %op } @@ -24529,6 +25924,30 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_minnum_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, v16 +; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, v17 +; GFX1250-NEXT: v_pk_min_num_bf16 v2, v2, v18 +; GFX1250-NEXT: v_pk_min_num_bf16 v3, v3, v19 +; GFX1250-NEXT: v_pk_min_num_bf16 v4, v4, v20 +; GFX1250-NEXT: v_pk_min_num_bf16 v5, v5, v21 +; GFX1250-NEXT: v_pk_min_num_bf16 v6, v6, v22 +; GFX1250-NEXT: v_pk_min_num_bf16 v7, v7, v23 +; GFX1250-NEXT: v_pk_min_num_bf16 v8, v8, v24 +; GFX1250-NEXT: v_pk_min_num_bf16 v9, v9, v25 +; GFX1250-NEXT: v_pk_min_num_bf16 v10, v10, v26 +; GFX1250-NEXT: v_pk_min_num_bf16 v11, v11, v27 +; GFX1250-NEXT: v_pk_min_num_bf16 v12, v12, v28 +; GFX1250-NEXT: v_pk_min_num_bf16 v13, v13, v29 +; GFX1250-NEXT: v_pk_min_num_bf16 v14, v14, v30 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_min_num_bf16 v15, v15, v31 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <32 x bfloat> @llvm.minnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) ret <32 x bfloat> %op } @@ -24653,6 +26072,16 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_maxnum_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.maxnum.bf16(bfloat %a, bfloat %b) ret bfloat %op } @@ -24826,6 +26255,13 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_maxnum_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) ret <2 x bfloat> %op } @@ -25060,6 +26496,14 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 
s[30:31] +; +; GFX1250-LABEL: v_maxnum_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) ret <3 x bfloat> %op } @@ -25350,6 +26794,14 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_maxnum_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v2 +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) ret <4 x bfloat> %op } @@ -25888,6 +27340,16 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_maxnum_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v4 +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v5 +; GFX1250-NEXT: v_pk_max_num_bf16 v2, v2, v6 +; GFX1250-NEXT: v_pk_max_num_bf16 v3, v3, v7 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) ret <8 x bfloat> %op } @@ -26918,6 +28380,20 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v4, v4, v13, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_maxnum_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v8 +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v9 +; GFX1250-NEXT: v_pk_max_num_bf16 v2, v2, v10 +; GFX1250-NEXT: v_pk_max_num_bf16 v3, v3, v11 +; GFX1250-NEXT: v_pk_max_num_bf16 v4, v4, v12 +; GFX1250-NEXT: v_pk_max_num_bf16 v5, v5, v13 +; GFX1250-NEXT: v_pk_max_num_bf16 v6, v6, v14 +; GFX1250-NEXT: v_pk_max_num_bf16 v7, v7, v15 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <16 x bfloat> @llvm.maxnum.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) ret <16 x bfloat> %op } @@ -29010,6 +30486,30 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_maxnum_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v16 +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v17 +; GFX1250-NEXT: v_pk_max_num_bf16 v2, v2, v18 +; GFX1250-NEXT: v_pk_max_num_bf16 v3, v3, v19 +; GFX1250-NEXT: v_pk_max_num_bf16 v4, v4, v20 +; GFX1250-NEXT: v_pk_max_num_bf16 v5, v5, v21 +; GFX1250-NEXT: v_pk_max_num_bf16 v6, v6, v22 +; GFX1250-NEXT: v_pk_max_num_bf16 v7, v7, v23 +; GFX1250-NEXT: v_pk_max_num_bf16 v8, v8, v24 +; GFX1250-NEXT: v_pk_max_num_bf16 v9, v9, v25 +; GFX1250-NEXT: v_pk_max_num_bf16 v10, v10, v26 +; 
GFX1250-NEXT: v_pk_max_num_bf16 v11, v11, v27 +; GFX1250-NEXT: v_pk_max_num_bf16 v12, v12, v28 +; GFX1250-NEXT: v_pk_max_num_bf16 v13, v13, v29 +; GFX1250-NEXT: v_pk_max_num_bf16 v14, v14, v30 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_max_num_bf16 v15, v15, v31 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <32 x bfloat> @llvm.maxnum.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) ret <32 x bfloat> %op } @@ -29263,6 +30763,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sqrt_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_sqrt_bf16_e32 v0, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.sqrt.bf16(bfloat %a) ret bfloat %op } @@ -29369,6 +30876,16 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_ldexp_bf16_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.ldexp.bf16.i32(bfloat %a, i32 %b) ret bfloat %op } @@ -29487,6 +31004,17 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_frexp_bf16_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_frexp_mant_f32_e32 v0, v1 +; GFX1250-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a) ret { bfloat, i16 } %op } @@ -29725,6 +31253,32 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_log_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1250-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-NEXT: v_log_f32_e32 v0, v0 +; GFX1250-NEXT: v_nop +; GFX1250-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 +; GFX1250-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 +; GFX1250-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_f32_e32 v1, v1, 
v2 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 +; GFX1250-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.log.bf16(bfloat %a) ret bfloat %op } @@ -29884,6 +31438,13 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_log2_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_log_bf16_e32 v0, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.log2.bf16(bfloat %a) ret bfloat %op } @@ -30117,6 +31678,32 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_log10_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1250-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-NEXT: v_log_f32_e32 v0, v0 +; GFX1250-NEXT: v_nop +; GFX1250-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 +; GFX1250-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 +; GFX1250-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 +; GFX1250-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.log10.bf16(bfloat %a) ret bfloat %op } @@ -30358,6 +31945,35 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_exp_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: s_mov_b32 s0, 0x3fb8aa3b +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v1 +; GFX1250-NEXT: v_rndne_f32_e32 v3, v2 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v4, v0, s0, -v2 op_sel_hi:[1,0,0] +; GFX1250-NEXT: s_mov_b32 s0, 0x32a5705f +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, s0, v4 op_sel_hi:[1,0,0] +; GFX1250-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: 
v_add_f32_e32 v0, v2, v0 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX1250-NEXT: v_exp_f32_e32 v0, v0 +; GFX1250-NEXT: v_nop +; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX1250-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v0, vcc_lo +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.exp.bf16(bfloat %a) ret bfloat %op } @@ -30521,6 +32137,13 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_exp2_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_exp_bf16_e32 v0, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.exp2.bf16(bfloat %a) ret bfloat %op } @@ -30758,6 +32381,35 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_exp10_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: s_mov_b32 s0, 0x40549a78 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mul_f32_e32 v2, 0x40549a78, v1 +; GFX1250-NEXT: v_rndne_f32_e32 v3, v2 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v4, v0, s0, -v2 op_sel_hi:[1,0,0] +; GFX1250-NEXT: s_mov_b32 s0, 0x33979a37 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, s0, v4 op_sel_hi:[1,0,0] +; GFX1250-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX1250-NEXT: v_exp_f32_e32 v0, v0 +; GFX1250-NEXT: v_nop +; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX1250-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v0, vcc_lo +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.exp10.bf16(bfloat %a) ret bfloat %op } @@ -30864,6 +32516,16 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_ceil_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ceil_f32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.ceil.bf16(bfloat %a) ret bfloat %op 
} @@ -30970,6 +32632,16 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_trunc_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_trunc_f32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.trunc.bf16(bfloat %a) ret bfloat %op } @@ -31076,6 +32748,16 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_rint_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_rndne_f32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.rint.bf16(bfloat %a) ret bfloat %op } @@ -31182,6 +32864,16 @@ define bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_nearbyint_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_rndne_f32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.nearbyint.bf16(bfloat %a) ret bfloat %op } @@ -31338,6 +33030,24 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_round_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_trunc_f32_e32 v1, v0 +; GFX1250-NEXT: v_sub_f32_e32 v2, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5 +; GFX1250-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 +; GFX1250-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.round.bf16(bfloat %a) ret bfloat %op } @@ -31444,6 +33154,16 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_roundeven_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_rndne_f32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.roundeven.bf16(bfloat %a) ret bfloat %op } @@ -31550,6 +33270,16 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_floor_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_floor_f32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.floor.bf16(bfloat %a) ret bfloat %op } @@ -31654,6 +33384,16 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_canonicalize_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.canonicalize.bf16(bfloat %a) ret bfloat %op } @@ -31702,6 +33442,13 @@ define i1 @v_fcmp_false_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_false_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp false bfloat %a, %b ret i1 %op } @@ -31787,6 +33534,16 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_oeq_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp oeq bfloat %a, %b ret i1 %op } @@ -31872,6 +33629,16 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_ogt_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp ogt bfloat %a, %b ret i1 %op } @@ -31957,6 +33724,16 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_oge_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp oge bfloat %a, %b ret i1 %op } @@ -32042,6 +33819,16 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_olt_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp olt bfloat %a, %b ret i1 %op } @@ -32127,6 +33914,16 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_ole_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp ole bfloat %a, %b ret i1 %op } @@ -32212,6 +34009,16 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_one_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp one bfloat %a, %b ret i1 %op } @@ -32297,6 +34104,16 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_uno_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp uno bfloat %a, %b ret i1 %op } @@ -32382,6 +34199,16 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_ueq_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp ueq bfloat %a, %b ret i1 %op } @@ -32467,6 +34294,16 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_ugt_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp ugt bfloat %a, %b ret i1 %op } @@ -32552,6 +34389,16 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_uge_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp uge bfloat %a, %b ret i1 %op } @@ -32637,6 +34484,16 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_ult_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp ult bfloat %a, %b ret i1 %op } @@ -32722,6 +34579,16 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_ule_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp ule bfloat %a, %b ret i1 %op } @@ -32807,6 +34674,16 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1 ; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_une_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, 
v1 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp une bfloat %a, %b ret i1 %op } @@ -32847,6 +34724,13 @@ define i1 @v_fcmp_true_bf16(bfloat %a, bfloat %b) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, 1 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fcmp_true_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fcmp true bfloat %a, %b ret i1 %op } @@ -32905,6 +34789,15 @@ define i16 @v_fptosi_bf16_to_i16(bfloat %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_bf16_to_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi bfloat %x to i16 ret i16 %op } @@ -33005,6 +34898,19 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v2bf16_to_v2i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <2 x bfloat> %x to <2 x i16> ret <2 x i16> %op } @@ -33125,6 +35031,20 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) { ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1 ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v3bf16_to_v3i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX1250-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <3 x bfloat> %x to <3 x i16> ret <3 x i16> %op } @@ -33277,6 +35197,24 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) { ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v4bf16_to_v4i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v1 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX1250-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v1, v1, v2, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <4 x bfloat> %x to <4 x i16> ret <4 x i16> %op } @@ -33335,6 +35273,15 @@ define i32 @v_fptosi_bf16_to_i32(bfloat %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_bf16_to_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi bfloat %x to i32 ret i32 %op } @@ -33400,6 +35347,17 @@ define <2 x i32> @v_fptosi_v2bf16_to_v2i32(<2 x bfloat> %x) { ; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v1 ; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v2bf16_to_v2i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v1 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <2 x bfloat> %x to <2 x i32> ret <2 x i32> %op } @@ -33482,6 +35440,19 @@ define <3 x i32> @v_fptosi_v3bf16_to_v3i32(<3 x bfloat> %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v3bf16_to_v3i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v2 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v2, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <3 x bfloat> %x to <3 x i32> ret <3 x i32> %op } @@ -33578,6 +35549,21 @@ define <4 x i32> @v_fptosi_v4bf16_to_v4i32(<4 x bfloat> %x) { ; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v4 ; GFX11-NEXT: v_cvt_i32_f32_e32 v3, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v4bf16_to_v4i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v0, v2 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v2, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, 
v3 +; GFX1250-NEXT: v_cvt_i32_f32_e32 v3, v5 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <4 x bfloat> %x to <4 x i32> ret <4 x i32> %op } @@ -33742,6 +35728,27 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_bf16_to_i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_trunc_f32_e32 v0, v0 +; GFX1250-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0| +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_floor_f32_e32 v1, v1 +; GFX1250-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0| +; GFX1250-NEXT: v_ashrrev_i32_e32 v0, 31, v0 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v3, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_bitop2_b32 v3, v3, v0 bitop3:0x14 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v2, v2, v0 +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[2:3], v[0:1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi bfloat %x to i64 ret i64 %op } @@ -33973,6 +35980,42 @@ define <2 x i64> @v_fptosi_v2bf16_to_v2i64(<2 x bfloat> %x) { ; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v4, v6 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, null, v3, v6, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v2bf16_to_v2i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_trunc_f32_e32 v3, v0 +; GFX1250-NEXT: v_mul_f32_e64 v2, 0x2f800000, |v3| +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_floor_f32_e32 v5, v2 +; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v3 +; GFX1250-NEXT: v_trunc_f32_e32 v1, v1 +; GFX1250-NEXT: v_fma_f32 v3, 0xcf800000, v5, |v3| +; GFX1250-NEXT: v_cvt_u32_f32_e32 v7, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_mul_f32_e64 v0, 0x2f800000, |v1| +; GFX1250-NEXT: v_cvt_u32_f32_e32 v8, v3 +; GFX1250-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_floor_f32_e32 v4, v0 +; GFX1250-NEXT: v_dual_ashrrev_i32 v0, 31, v1 :: v_dual_bitop2_b32 v7, v7, v2 bitop3:0x14 +; GFX1250-NEXT: v_fma_f32 v6, 0xcf800000, v4, |v1| +; GFX1250-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_xor_b32_e32 v5, v4, v0 +; GFX1250-NEXT: v_xor_b32_e32 v4, v6, v0 +; GFX1250-NEXT: v_xor_b32_e32 v6, v8, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], 
v[6:7], v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <2 x bfloat> %x to <2 x i64> ret <2 x i64> %op } @@ -34293,6 +36336,52 @@ define <3 x i64> @v_fptosi_v3bf16_to_v3i64(<3 x bfloat> %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v6, v8, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v3bf16_to_v3i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_trunc_f32_e32 v6, v2 +; GFX1250-NEXT: v_trunc_f32_e32 v8, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_trunc_f32_e32 v7, v0 +; GFX1250-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v6| +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_mul_f32_e64 v5, 0x2f800000, |v8| +; GFX1250-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v7| +; GFX1250-NEXT: v_dual_ashrrev_i32 v0, 31, v6 :: v_dual_ashrrev_i32 v2, 31, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_floor_f32_e32 v9, v1 +; GFX1250-NEXT: v_floor_f32_e32 v11, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_floor_f32_e32 v10, v3 +; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_ashrrev_i32 v4, 31, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_fma_f32 v6, 0xcf800000, v9, |v6| +; GFX1250-NEXT: v_fma_f32 v8, 0xcf800000, v11, |v8| +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_fma_f32 v7, 0xcf800000, v10, |v7| +; GFX1250-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v6, v6 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v12, v7 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v13, v8 +; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v5, v4 +; GFX1250-NEXT: v_xor_b32_e32 v7, v9, v0 +; GFX1250-NEXT: v_xor_b32_e32 v6, v6, v0 +; GFX1250-NEXT: v_xor_b32_e32 v9, v10, v2 +; GFX1250-NEXT: v_xor_b32_e32 v8, v12, v2 +; GFX1250-NEXT: v_xor_b32_e32 v11, v11, v4 +; GFX1250-NEXT: v_xor_b32_e32 v10, v13, v4 +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[6:7], v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], v[8:9], v[2:3] +; GFX1250-NEXT: v_sub_nc_u64_e32 v[4:5], v[10:11], v[4:5] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <3 x bfloat> %x to <3 x i64> ret <3 x i64> %op } @@ -34698,6 +36787,61 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_co_ci_u32_e64 v7, null, v7, v13, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fptosi_v4bf16_to_v4i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v3, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_trunc_f32_e32 v7, v2 +; GFX1250-NEXT: 
v_trunc_f32_e32 v9, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_trunc_f32_e32 v8, v0 +; GFX1250-NEXT: v_trunc_f32_e32 v10, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v7| +; GFX1250-NEXT: v_mul_f32_e64 v5, 0x2f800000, |v9| +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mul_f32_e64 v3, 0x2f800000, |v8| +; GFX1250-NEXT: v_mul_f32_e64 v11, 0x2f800000, |v10| +; GFX1250-NEXT: v_dual_ashrrev_i32 v0, 31, v7 :: v_dual_ashrrev_i32 v2, 31, v8 +; GFX1250-NEXT: v_floor_f32_e32 v12, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_floor_f32_e32 v13, v3 +; GFX1250-NEXT: v_floor_f32_e32 v14, v5 +; GFX1250-NEXT: v_floor_f32_e32 v11, v11 +; GFX1250-NEXT: v_dual_ashrrev_i32 v4, 31, v9 :: v_dual_ashrrev_i32 v6, 31, v10 +; GFX1250-NEXT: v_fma_f32 v7, 0xcf800000, v12, |v7| +; GFX1250-NEXT: v_cvt_u32_f32_e32 v12, v12 +; GFX1250-NEXT: v_fma_f32 v8, 0xcf800000, v13, |v8| +; GFX1250-NEXT: v_fma_f32 v15, 0xcf800000, v14, |v9| +; GFX1250-NEXT: v_fma_f32 v16, 0xcf800000, v11, |v10| +; GFX1250-NEXT: v_cvt_u32_f32_e32 v7, v7 +; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v3, v2 +; GFX1250-NEXT: v_mov_b32_e32 v5, v4 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v14, v14 +; GFX1250-NEXT: v_xor_b32_e32 v9, v12, v0 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v10, v8 +; GFX1250-NEXT: v_xor_b32_e32 v8, v7, v0 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v12, v11 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v7, v15 +; GFX1250-NEXT: v_cvt_u32_f32_e32 v15, v16 +; GFX1250-NEXT: v_xor_b32_e32 v11, v13, v2 +; GFX1250-NEXT: v_xor_b32_e32 v10, v10, v2 +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[8:9], v[0:1] +; GFX1250-NEXT: v_xor_b32_e32 v9, v14, v4 +; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_bitop2_b32 v8, v7, v4 bitop3:0x14 +; GFX1250-NEXT: v_xor_b32_e32 v13, v12, v6 +; GFX1250-NEXT: v_xor_b32_e32 v12, v15, v6 +; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], v[10:11], v[2:3] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_sub_nc_u64_e32 v[4:5], v[8:9], v[4:5] +; GFX1250-NEXT: v_sub_nc_u64_e32 v[6:7], v[12:13], v[6:7] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = fptosi <4 x bfloat> %x to <4 x i64> ret <4 x i64> %op } @@ -34795,6 +36939,16 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_i16_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp i16 %x to bfloat ret bfloat %op } @@ -34936,6 +37090,19 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v2i16_to_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 +; 
GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <2 x i16> %x to <2 x bfloat> ret <2 x bfloat> %op } @@ -35125,6 +37292,23 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v3i16_to_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 16, v0 +; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <3 x i16> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -35355,6 +37539,24 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v4i16_to_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_ashrrev_i32 v2, 16, v1 :: v_dual_ashrrev_i32 v3, 16, v0 +; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <4 x i16> %x to <4 x bfloat> ret <4 x bfloat> %op } @@ -35446,6 +37648,15 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_i32_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp i32 %x to bfloat ret bfloat %op } @@ -35577,6 +37788,16 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v2i32_to_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <2 x i32> %x to <2 x bfloat> ret <2 x bfloat> %op } @@ -35750,6 +37971,18 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v3i32_to_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <3 x i32> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -35959,6 +38192,19 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v4i32_to_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v3, v3 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <4 x i32> %x to <4 x bfloat> ret <4 x bfloat> %op } @@ -36148,6 +38394,28 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_i64_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v2, v0, v1 +; GFX1250-NEXT: v_cls_i32_e32 v3, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v2 +; GFX1250-NEXT: v_add_nc_u32_e32 v2, 32, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_min_u32_e64 v2, v3, -1, v2 +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp i64 %x to bfloat ret bfloat %op } @@ -36474,6 +38742,40 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: 
v_sitofp_v2i64_to_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v5, v0, v1 +; GFX1250-NEXT: v_xor_b32_e32 v4, v2, v3 +; GFX1250-NEXT: v_cls_i32_e32 v6, v3 +; GFX1250-NEXT: v_cls_i32_e32 v7, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_ashrrev_i32 v5, 31, v5 :: v_dual_ashrrev_i32 v4, 31, v4 +; GFX1250-NEXT: v_dual_add_nc_u32 v5, 32, v5 :: v_dual_add_nc_u32 v4, 32, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_add_min_u32_e64 v5, v7, -1, v5 +; GFX1250-NEXT: v_add_min_u32_e64 v4, v6, -1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v5, v[0:1] +; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v4 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54 +; GFX1250-NEXT: v_sub_nc_u32_e32 v3, 32, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1250-NEXT: v_ldexp_f32 v1, v2, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <2 x i64> %x to <2 x bfloat> ret <2 x bfloat> %op } @@ -36929,6 +39231,53 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v3i64_to_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v8, v4, v5 +; GFX1250-NEXT: v_xor_b32_e32 v6, v2, v3 +; GFX1250-NEXT: v_cls_i32_e32 v10, v3 +; GFX1250-NEXT: v_cls_i32_e32 v9, v5 +; GFX1250-NEXT: v_cls_i32_e32 v11, v1 +; GFX1250-NEXT: v_dual_ashrrev_i32 v8, 31, v8 :: v_dual_bitop2_b32 v7, v0, v1 bitop3:0x14 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_ashrrev_i32 v6, 31, v6 :: v_dual_ashrrev_i32 v7, 31, v7 +; GFX1250-NEXT: v_dual_add_nc_u32 v6, 32, v6 :: v_dual_add_nc_u32 v7, 32, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_add_min_u32_e64 v6, v10, -1, v6 +; GFX1250-NEXT: v_add_min_u32_e64 v7, v11, -1, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v6, v[2:3] +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v7, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX1250-NEXT: v_add_nc_u32_e32 v8, 32, v8 +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX1250-NEXT: 
v_add_min_u32_e64 v8, v9, -1, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v8, v[4:5] +; GFX1250-NEXT: v_sub_nc_u32_e32 v8, 32, v8 +; GFX1250-NEXT: v_ldexp_f32 v2, v2, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 32, v7 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_ldexp_f32 v1, v1, v8 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <3 x i64> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -37509,6 +39858,64 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v9, vcc_lo ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_sitofp_v4i64_to_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v9, v4, v5 +; GFX1250-NEXT: v_xor_b32_e32 v8, v6, v7 +; GFX1250-NEXT: v_cls_i32_e32 v12, v7 +; GFX1250-NEXT: v_cls_i32_e32 v13, v5 +; GFX1250-NEXT: v_cls_i32_e32 v14, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_ashrrev_i32 v9, 31, v9 :: v_dual_ashrrev_i32 v8, 31, v8 +; GFX1250-NEXT: v_xor_b32_e32 v10, v2, v3 +; GFX1250-NEXT: v_cls_i32_e32 v15, v1 +; GFX1250-NEXT: v_dual_add_nc_u32 v9, 32, v9 :: v_dual_add_nc_u32 v8, 32, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_ashrrev_i32 v10, 31, v10 :: v_dual_bitop2_b32 v11, v0, v1 bitop3:0x14 +; GFX1250-NEXT: v_add_min_u32_e64 v9, v13, -1, v9 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_add_min_u32_e64 v8, v12, -1, v8 +; GFX1250-NEXT: v_dual_ashrrev_i32 v11, 31, v11 :: v_dual_add_nc_u32 v10, 32, v10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v9, v[4:5] +; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_add_nc_u32_e32 v11, 32, v11 +; GFX1250-NEXT: v_add_min_u32_e64 v10, v14, -1, v10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_add_min_u32_e64 v11, v15, -1, v11 +; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v10, v[2:3] +; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v11, v[0:1] +; GFX1250-NEXT: v_or_b32_e32 v6, v7, v6 +; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 +; 
GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v10 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54 +; GFX1250-NEXT: v_sub_nc_u32_e32 v7, 32, v9 +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: v_dual_sub_nc_u32 v3, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v4, v4 +; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v1, v6 +; GFX1250-NEXT: v_sub_nc_u32_e32 v6, 32, v11 +; GFX1250-NEXT: v_cvt_f32_i32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX1250-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX1250-NEXT: v_ldexp_f32 v3, v4, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_ldexp_f32 v2, v2, v5 +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v6 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v3, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = sitofp <4 x i64> %x to <4 x bfloat> ret <4 x bfloat> %op } @@ -37607,6 +40014,16 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_i16_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp i16 %x to bfloat ret bfloat %op } @@ -37749,6 +40166,19 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v2i16_to_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <2 x i16> %x to <2 x bfloat> ret <2 x bfloat> %op } @@ -37942,6 +40372,23 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v3i16_to_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <3 x i16> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -38178,6 +40625,24 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo ; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v4i16_to_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshrrev_b32 v2, 16, v1 :: v_dual_lshrrev_b32 v3, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <4 x i16> %x to <4 x bfloat> ret <4 x bfloat> %op } @@ -38269,6 +40734,15 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_i32_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp i32 %x to bfloat ret bfloat %op } @@ -38400,6 +40874,16 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v2i32_to_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <2 x i32> %x to <2 x bfloat> ret <2 x bfloat> %op } @@ -38573,6 +41057,18 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v3i32_to_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <3 x i32> %x to <3 x bfloat> ret <3 x bfloat> 
%op } @@ -38782,6 +41278,19 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v4i32_to_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <4 x i32> %x to <4 x bfloat> ret <4 x bfloat> %op } @@ -38935,6 +41444,24 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) { ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_i64_to_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_clz_i32_u32_e32 v2, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_min_u32_e32 v2, 32, v2 +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp i64 %x to bfloat ret bfloat %op } @@ -39190,6 +41717,35 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v2i64_to_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_clz_i32_u32_e32 v4, v3 +; GFX1250-NEXT: v_clz_i32_u32_e32 v5, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_min_u32_e32 v4, 32, v4 +; GFX1250-NEXT: v_min_u32_e32 v5, 32, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v4, v[2:3] +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v5, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX1250-NEXT: v_dual_sub_nc_u32 v3, 32, v5 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 +; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 32, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_ldexp_f32 v1, v2, 
v1 +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <2 x i64> %x to <2 x bfloat> ret <2 x bfloat> %op } @@ -39548,6 +42104,45 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v1, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v3i64_to_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_clz_i32_u32_e32 v6, v3 +; GFX1250-NEXT: v_clz_i32_u32_e32 v7, v1 +; GFX1250-NEXT: v_clz_i32_u32_e32 v8, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_min_u32_e32 v6, 32, v6 +; GFX1250-NEXT: v_min_u32_e32 v7, 32, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_min_u32_e32 v8, 32, v8 +; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v6, v[2:3] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v7, v[0:1] +; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v8, v[4:5] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX1250-NEXT: v_dual_sub_nc_u32 v8, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 +; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 32, v7 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: v_ldexp_f32 v2, v2, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v4 +; GFX1250-NEXT: v_ldexp_f32 v1, v1, v8 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <3 x i64> %x to <3 x bfloat> ret <3 x bfloat> %op } @@ -39996,6 +42591,54 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_uitofp_v4i64_to_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_clz_i32_u32_e32 v8, v7 +; GFX1250-NEXT: v_clz_i32_u32_e32 v9, v3 +; GFX1250-NEXT: v_clz_i32_u32_e32 v10, v1 +; GFX1250-NEXT: v_clz_i32_u32_e32 v11, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_min_u32_e32 v8, 32, v8 +; GFX1250-NEXT: v_min_u32_e32 v9, 32, v9 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; 
GFX1250-NEXT: v_min_u32_e32 v10, 32, v10 +; GFX1250-NEXT: v_min_u32_e32 v11, 32, v11 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[6:7], v8, v[6:7] +; GFX1250-NEXT: v_lshlrev_b64_e32 v[2:3], v9, v[2:3] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v10, v[0:1] +; GFX1250-NEXT: v_lshlrev_b64_e32 v[4:5], v11, v[4:5] +; GFX1250-NEXT: v_dual_sub_nc_u32 v8, 32, v8 :: v_dual_sub_nc_u32 v11, 32, v11 +; GFX1250-NEXT: v_min_u32_e32 v6, 1, v6 +; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2 +; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0 +; GFX1250-NEXT: v_min_u32_e32 v4, 1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_sub_nc_u32 v9, 32, v9 :: v_dual_bitop2_b32 v6, v7, v6 bitop3:0x54 +; GFX1250-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_sub_nc_u32 v3, 32, v10 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54 +; GFX1250-NEXT: v_or_b32_e32 v1, v5, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v4, v6 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v2, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_ldexp_f32 v4, v4, v8 +; GFX1250-NEXT: v_ldexp_f32 v2, v2, v9 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1250-NEXT: v_ldexp_f32 v1, v1, v11 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = uitofp <4 x i64> %x to <4 x bfloat> ret <4 x bfloat> %op } @@ -40073,6 +42716,16 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, bfloat %a, bfloat %b ret bfloat %op } @@ -40156,6 +42809,17 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_fneg_lhs_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %neg.a = fneg bfloat %a %op = select i1 %cond, bfloat %neg.a, 
bfloat %b ret bfloat %op @@ -40240,6 +42904,17 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) { ; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_fneg_rhs_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %neg.b = fneg bfloat %b %op = select i1 %cond, bfloat %a, bfloat %neg.b ret bfloat %op @@ -40349,6 +43024,19 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b) ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshrrev_b32 v3, 16, v1 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_lshrrev_b32 v4, 16, v2 :: v_dual_cndmask_b32 v0, v2, v1, vcc_lo +; GFX1250-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b ret <2 x bfloat> %op } @@ -40466,6 +43154,21 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo ; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_vselect_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshrrev_b32 v4, 16, v2 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v5, 16, v3 :: v_dual_bitop2_b32 v1, 1, v1 bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b ret <2 x bfloat> %op } @@ -40552,6 +43255,17 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_select_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v0, s1, v1, vcc_lo +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) 
+; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 %op = select i1 %cond, bfloat %a, bfloat %b %cast = bitcast bfloat %op to i16 @@ -40687,6 +43401,21 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_select_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_lshr_b32 s2, s0, 16 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0 +; GFX1250-NEXT: s_lshr_b32 s3, s1, 16 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v1, s1, v2, vcc_lo +; GFX1250-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b %cast = bitcast <2 x bfloat> %op to i32 @@ -40824,6 +43553,22 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_vselect_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_lshr_b32 s2, s0, 16 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1250-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0 +; GFX1250-NEXT: s_lshr_b32 s0, s1, 16 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v0, s1, v3, vcc_lo +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: ; return to shader part epilog %cond = icmp eq <2 x i32> %c, zeroinitializer %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b %cast = bitcast <2 x bfloat> %op to i32 @@ -40925,6 +43670,16 @@ define <3 x bfloat> @v_select_v3bf16(i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b ret <3 x bfloat> %op } @@ -41033,6 +43788,16 @@ define <4 x bfloat> @v_select_v4bf16(i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b) ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b ret <4 x bfloat> %op } @@ -41168,6 +43933,17 @@ define <6 x bfloat> @v_select_v6bf16(i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b) ; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_v6bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2 +; GFX1250-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, <6 x bfloat> %a, <6 x bfloat> %b ret <6 x bfloat> %op } @@ -41329,6 +44105,17 @@ define <8 x bfloat> @v_select_v8bf16(i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b) ; GFX11-NEXT: v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v1, v6, v2 ; GFX11-NEXT: v_dual_cndmask_b32 v2, v7, v3 :: v_dual_cndmask_b32 v3, v8, v4 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_cndmask_b32 v0, v5, v1 :: v_dual_cndmask_b32 v1, v6, v2 +; GFX1250-NEXT: v_dual_cndmask_b32 v2, v7, v3 :: v_dual_cndmask_b32 v3, v8, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, <8 x bfloat> %a, <8 x bfloat> %b ret <8 x bfloat> %op } @@ -41604,6 +44391,19 @@ define <16 x bfloat> @v_select_v16bf16(i1 %cond, <16 x bfloat> %a, <16 x bfloat> ; GFX11-NEXT: v_dual_cndmask_b32 v4, v13, v5 :: v_dual_cndmask_b32 v5, v14, v6 ; GFX11-NEXT: v_dual_cndmask_b32 v6, v15, v7 :: v_dual_cndmask_b32 v7, v16, v8 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_cndmask_b32 v0, v9, v1 :: v_dual_cndmask_b32 v1, v10, v2 +; GFX1250-NEXT: v_dual_cndmask_b32 v2, v11, v3 :: v_dual_cndmask_b32 v3, v12, v4 +; GFX1250-NEXT: v_dual_cndmask_b32 v4, v13, v5 :: v_dual_cndmask_b32 v5, v14, v6 +; GFX1250-NEXT: v_dual_cndmask_b32 v6, v15, v7 :: v_dual_cndmask_b32 v7, v16, v8 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, <16 x bfloat> %a, <16 x bfloat> %b ret <16 x bfloat> %op } @@ -42234,6 +45034,27 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat> ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_dual_cndmask_b32 v14, v31, v15 :: v_dual_cndmask_b32 v15, v32, v16 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_select_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: scratch_load_b32 v32, off, s32 
offset:4 +; GFX1250-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_cndmask_b32 v0, v17, v1 :: v_dual_cndmask_b32 v1, v18, v2 +; GFX1250-NEXT: v_dual_cndmask_b32 v2, v19, v3 :: v_dual_cndmask_b32 v3, v20, v4 +; GFX1250-NEXT: v_dual_cndmask_b32 v4, v21, v5 :: v_dual_cndmask_b32 v5, v22, v6 +; GFX1250-NEXT: v_dual_cndmask_b32 v6, v23, v7 :: v_dual_cndmask_b32 v7, v24, v8 +; GFX1250-NEXT: v_dual_cndmask_b32 v8, v25, v9 :: v_dual_cndmask_b32 v9, v26, v10 +; GFX1250-NEXT: v_dual_cndmask_b32 v10, v27, v11 :: v_dual_cndmask_b32 v11, v28, v12 +; GFX1250-NEXT: v_dual_cndmask_b32 v12, v29, v13 :: v_dual_cndmask_b32 v13, v30, v14 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_cndmask_b32 v14, v31, v15 :: v_dual_cndmask_b32 v15, v32, v16 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select i1 %cond, <32 x bfloat> %a, <32 x bfloat> %b ret <32 x bfloat> %op } @@ -42349,6 +45170,20 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_select_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cndmask_b32_e32 v0, s2, v1, vcc_lo +; GFX1250-NEXT: v_cndmask_b32_e32 v1, s3, v2, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1250-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 %op = select i1 %cond, <3 x bfloat> %a, <3 x bfloat> %b %cast = bitcast <3 x bfloat> %op to i48 @@ -42475,6 +45310,18 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX11-NEXT: v_readfirstlane_b32 s1, v0 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1 ; GFX11-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_select_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cndmask_b32_e32 v0, s3, v1, vcc_lo +; GFX1250-NEXT: v_cndmask_b32_e32 v1, s2, v2, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: ; return to shader part epilog %cond = icmp eq i32 %c, 0 %op = select i1 %cond, <4 x bfloat> %a, <4 x bfloat> %b %cast = bitcast <4 x bfloat> %op to <2 x i32> @@ -42709,6 +45556,33 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_readfirstlane_b32 s1, v1 ; GFX11FAKE16-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: s_vselect_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_lshr_b32 s4, s1, 16 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1 +; GFX1250-NEXT: s_lshr_b32 s4, s3, 16 +; GFX1250-NEXT: s_lshr_b32 s5, s0, 16 +; GFX1250-NEXT: v_mov_b32_e32 v6, s0 +; 
GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_cndmask_b32_e32 v3, s4, v4, vcc_lo +; GFX1250-NEXT: v_mov_b32_e32 v4, s5 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1250-NEXT: s_lshr_b32 s0, s2, 16 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_cndmask_b32_e32 v1, s0, v4, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, s2, v6, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: v_cndmask_b32_e32 v2, s3, v5, vcc_lo +; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v1 +; GFX1250-NEXT: ; return to shader part epilog %cond = icmp eq <4 x i32> %c, zeroinitializer %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b %cast = bitcast <4 x bfloat> %op to <2 x i32> @@ -42912,6 +45786,28 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_vselect_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX1250-NEXT: v_dual_lshrrev_b32 v8, 16, v4 :: v_dual_bitop2_b32 v1, 1, v1 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v9, 16, v6 :: v_dual_bitop2_b32 v3, 1, v3 bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX1250-NEXT: v_dual_cndmask_b32 v2, v7, v5, vcc_lo :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_lshrrev_b32 v7, 16, v7 :: v_dual_lshrrev_b32 v5, 16, v5 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1250-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX1250-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b ret <4 x bfloat> %op } @@ -43264,6 +46160,46 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_vselect_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX1250-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX1250-NEXT: v_dual_lshrrev_b32 v17, 16, v14 :: v_dual_bitop2_b32 v5, 1, v5 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v16, 16, v10 :: v_dual_bitop2_b32 v3, 1, v3 bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX1250-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1250-NEXT: v_dual_cndmask_b32 v6, v15, v11, vcc_lo :: v_dual_bitop2_b32 v0, 1, v0 
bitop3:0x40 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX1250-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX1250-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX1250-NEXT: v_dual_cndmask_b32 v4, v14, v10 :: v_dual_lshrrev_b32 v15, 16, v15 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX1250-NEXT: v_dual_lshrrev_b32 v14, 16, v12 :: v_dual_bitop2_b32 v2, 1, v2 bitop3:0x40 +; GFX1250-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX1250-NEXT: v_cndmask_b32_e32 v5, v17, v16, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX1250-NEXT: v_cndmask_b32_e32 v2, v13, v9, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX1250-NEXT: v_dual_cndmask_b32 v0, v12, v8 :: v_dual_lshrrev_b32 v13, 16, v13 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1250-NEXT: v_cndmask_b32_e32 v1, v14, v10, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: v_cndmask_b32_e32 v3, v13, v9, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX1250-NEXT: v_cndmask_b32_e32 v7, v15, v11, vcc_lo +; GFX1250-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b ret <8 x bfloat> %op } @@ -44002,6 +46938,74 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_vselect_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: v_dual_lshrrev_b32 v52, 16, v25 :: v_dual_bitop2_b32 v12, 1, v12 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v53, 16, v16 :: v_dual_bitop2_b32 v13, 1, v13 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v33, 16, v22 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX1250-NEXT: v_dual_lshrrev_b32 v34, 16, v30 :: v_dual_bitop2_b32 v3, 1, v3 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v51, 16, v17 :: v_dual_bitop2_b32 v10, 1, v10 bitop3:0x40 +; GFX1250-NEXT: v_cndmask_b32_e32 v12, v30, v22, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 +; GFX1250-NEXT: v_dual_lshrrev_b32 v50, 16, v26 :: v_dual_bitop2_b32 v11, 1, v11 bitop3:0x40 +; GFX1250-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX1250-NEXT: v_dual_lshrrev_b32 v35, 16, v21 :: v_dual_bitop2_b32 v2, 1, v2 bitop3:0x40 +; GFX1250-NEXT: v_cndmask_b32_e32 v13, v34, v33, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX1250-NEXT: v_dual_lshrrev_b32 v36, 16, v29 :: v_dual_bitop2_b32 v4, 1, v4 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v49, 16, v18 :: v_dual_bitop2_b32 v8, 1, v8 bitop3:0x40 +; GFX1250-NEXT: v_cndmask_b32_e32 v10, v29, v21, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX1250-NEXT: v_dual_lshrrev_b32 v37, 16, v20 :: v_dual_bitop2_b32 v5, 1, v5 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v38, 16, v28 :: v_dual_bitop2_b32 v7, 1, v7 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v48, 16, v27 :: 
v_dual_bitop2_b32 v9, 1, v9 bitop3:0x40 +; GFX1250-NEXT: v_cndmask_b32_e32 v11, v36, v35, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX1250-NEXT: v_dual_lshrrev_b32 v39, 16, v19 :: v_dual_bitop2_b32 v6, 1, v6 bitop3:0x40 +; GFX1250-NEXT: v_dual_lshrrev_b32 v32, 16, v23 :: v_dual_bitop2_b32 v1, 1, v1 bitop3:0x40 +; GFX1250-NEXT: v_cndmask_b32_e32 v8, v28, v20, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX1250-NEXT: v_dual_lshrrev_b32 v54, 16, v24 :: v_dual_bitop2_b32 v15, 1, v15 bitop3:0x40 +; GFX1250-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX1250-NEXT: v_cndmask_b32_e32 v6, v27, v19, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX1250-NEXT: v_cndmask_b32_e32 v4, v26, v18, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX1250-NEXT: v_cndmask_b32_e32 v2, v25, v17, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX1250-NEXT: v_cndmask_b32_e32 v3, v52, v51, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_cndmask_b32_e32 v0, v24, v16, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1250-NEXT: v_cndmask_b32_e32 v1, v54, v53, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX1250-NEXT: v_cndmask_b32_e32 v5, v50, v49, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v3, 16, v31 +; GFX1250-NEXT: v_cndmask_b32_e32 v7, v48, v39, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 +; GFX1250-NEXT: v_cndmask_b32_e32 v14, v31, v23, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX1250-NEXT: v_cndmask_b32_e32 v15, v3, v32, vcc_lo +; GFX1250-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX1250-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b ret <16 x bfloat> %op } @@ -45856,6 +48860,178 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX11FAKE16-NEXT: v_perm_b32 v14, v29, v28, 0x5040100 ; GFX11FAKE16-NEXT: v_perm_b32 v15, v31, v30, 0x5040100 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_vselect_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x1b +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 offset:60 +; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:124 +; GFX1250-NEXT: scratch_load_u16 v33, off, s32 +; GFX1250-NEXT: scratch_load_b32 v34, off, s32 offset:128 +; GFX1250-NEXT: scratch_load_b32 v35, off, s32 offset:64 +; GFX1250-NEXT: scratch_load_b32 v36, off, s32 offset:120 +; GFX1250-NEXT: scratch_load_b32 v37, off, s32 offset:56 +; GFX1250-NEXT: scratch_load_b32 v38, off, s32 offset:116 +; GFX1250-NEXT: scratch_load_b32 v39, off, s32 offset:52 +; GFX1250-NEXT: scratch_load_b32 v48, off, s32 offset:112 +; GFX1250-NEXT: scratch_load_b32 v49, off, s32 offset:48 +; GFX1250-NEXT: scratch_load_b32 v50, off, s32 offset:108 +; GFX1250-NEXT: scratch_load_b32 v51, off, s32 
offset:44 +; GFX1250-NEXT: scratch_load_b32 v52, off, s32 offset:104 +; GFX1250-NEXT: scratch_load_b32 v53, off, s32 offset:40 +; GFX1250-NEXT: scratch_load_b32 v54, off, s32 offset:100 +; GFX1250-NEXT: scratch_load_b32 v55, off, s32 offset:36 +; GFX1250-NEXT: scratch_load_b32 v64, off, s32 offset:76 +; GFX1250-NEXT: scratch_load_b32 v65, off, s32 offset:12 +; GFX1250-NEXT: scratch_load_b32 v66, off, s32 offset:96 +; GFX1250-NEXT: scratch_load_b32 v67, off, s32 offset:32 +; GFX1250-NEXT: scratch_load_b32 v68, off, s32 offset:80 +; GFX1250-NEXT: scratch_load_b32 v69, off, s32 offset:84 +; GFX1250-NEXT: scratch_load_b32 v70, off, s32 offset:92 +; GFX1250-NEXT: scratch_load_b32 v71, off, s32 offset:28 +; GFX1250-NEXT: scratch_load_b32 v80, off, s32 offset:20 +; GFX1250-NEXT: scratch_load_b32 v81, off, s32 offset:88 +; GFX1250-NEXT: scratch_load_b32 v82, off, s32 offset:24 +; GFX1250-NEXT: v_and_b32_e32 v30, 1, v30 +; GFX1250-NEXT: v_and_b32_e32 v29, 1, v29 +; GFX1250-NEXT: v_and_b32_e32 v26, 1, v26 +; GFX1250-NEXT: v_and_b32_e32 v24, 1, v24 +; GFX1250-NEXT: v_and_b32_e32 v22, 1, v22 +; GFX1250-NEXT: v_and_b32_e32 v20, 1, v20 +; GFX1250-NEXT: v_and_b32_e32 v18, 1, v18 +; GFX1250-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX1250-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX1250-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX1250-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX1250-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1250-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX1250-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX1250-NEXT: v_and_b32_e32 v23, 1, v23 +; GFX1250-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX1250-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX1250-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX1250-NEXT: v_and_b32_e32 v21, 1, v21 +; GFX1250-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX1250-NEXT: v_and_b32_e32 v19, 1, v19 +; GFX1250-NEXT: s_wait_loadcnt 0x1a +; GFX1250-NEXT: v_dual_lshrrev_b32 v83, 16, v32 :: v_dual_bitop2_b32 v17, 1, v17 bitop3:0x40 +; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, 1, v30 +; GFX1250-NEXT: v_and_b32_e32 v28, 1, v28 +; GFX1250-NEXT: s_wait_loadcnt 0x17 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_cndmask_b32 v30, v34, v35, s1 :: v_dual_bitop2_b32 v33, 1, v33 bitop3:0x40 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28 +; GFX1250-NEXT: v_lshrrev_b32_e32 v28, 16, v31 +; GFX1250-NEXT: v_cmp_eq_u32_e64 s0, 1, v29 +; GFX1250-NEXT: scratch_load_b32 v29, off, s32 offset:16 +; GFX1250-NEXT: v_dual_lshrrev_b32 v35, 16, v35 :: v_dual_lshrrev_b32 v34, 16, v34 +; GFX1250-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v33 +; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:72 +; GFX1250-NEXT: v_cndmask_b32_e64 v28, v83, v28, s0 +; GFX1250-NEXT: scratch_load_b32 v83, off, s32 offset:4 +; GFX1250-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: scratch_load_b32 v35, off, s32 offset:68 +; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26 +; GFX1250-NEXT: s_wait_loadcnt 0x1a +; GFX1250-NEXT: v_dual_cndmask_b32 v26, v36, v37, vcc_lo :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24 +; GFX1250-NEXT: v_dual_lshrrev_b32 v37, 16, v37 :: v_dual_bitop2_b32 v2, 1, v2 bitop3:0x40 +; GFX1250-NEXT: s_wait_loadcnt 0x18 +; GFX1250-NEXT: v_dual_lshrrev_b32 v36, 16, v36 :: v_dual_cndmask_b32 v24, v38, v39, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22 +; GFX1250-NEXT: v_dual_lshrrev_b32 v38, 16, v38 :: v_dual_bitop2_b32 
v7, 1, v7 bitop3:0x40 +; GFX1250-NEXT: s_wait_loadcnt 0x16 +; GFX1250-NEXT: v_dual_cndmask_b32 v22, v48, v49 :: v_dual_lshrrev_b32 v39, 16, v39 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20 +; GFX1250-NEXT: v_dual_lshrrev_b32 v49, 16, v49 :: v_dual_bitop2_b32 v8, 1, v8 bitop3:0x40 +; GFX1250-NEXT: s_wait_loadcnt 0x14 +; GFX1250-NEXT: v_dual_lshrrev_b32 v48, 16, v48 :: v_dual_cndmask_b32 v20, v50, v51, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18 +; GFX1250-NEXT: v_dual_lshrrev_b32 v51, 16, v51 :: v_dual_bitop2_b32 v12, 1, v12 bitop3:0x40 +; GFX1250-NEXT: s_wait_loadcnt 0x12 +; GFX1250-NEXT: v_dual_lshrrev_b32 v50, 16, v50 :: v_dual_cndmask_b32 v18, v52, v53, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16 +; GFX1250-NEXT: v_dual_lshrrev_b32 v53, 16, v53 :: v_dual_bitop2_b32 v14, 1, v14 bitop3:0x40 +; GFX1250-NEXT: s_wait_loadcnt 0x10 +; GFX1250-NEXT: v_dual_lshrrev_b32 v52, 16, v52 :: v_dual_cndmask_b32 v16, v54, v55, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14 +; GFX1250-NEXT: v_dual_lshrrev_b32 v55, 16, v55 :: v_dual_lshrrev_b32 v54, 16, v54 +; GFX1250-NEXT: s_wait_loadcnt 0xc +; GFX1250-NEXT: v_cndmask_b32_e32 v14, v66, v67, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 +; GFX1250-NEXT: v_dual_lshrrev_b32 v67, 16, v67 :: v_dual_lshrrev_b32 v66, 16, v66 +; GFX1250-NEXT: s_wait_loadcnt 0x8 +; GFX1250-NEXT: v_cndmask_b32_e32 v12, v70, v71, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10 +; GFX1250-NEXT: v_dual_lshrrev_b32 v70, 16, v70 :: v_dual_bitop2_b32 v25, 1, v25 bitop3:0x40 +; GFX1250-NEXT: s_wait_loadcnt 0x5 +; GFX1250-NEXT: v_dual_cndmask_b32 v10, v81, v82 :: v_dual_lshrrev_b32 v71, 16, v71 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX1250-NEXT: v_dual_lshrrev_b32 v82, 16, v82 :: v_dual_bitop2_b32 v27, 1, v27 bitop3:0x40 +; GFX1250-NEXT: v_dual_cndmask_b32 v8, v69, v80 :: v_dual_lshrrev_b32 v81, 16, v81 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 +; GFX1250-NEXT: v_dual_lshrrev_b32 v80, 16, v80 :: v_dual_lshrrev_b32 v69, 16, v69 +; GFX1250-NEXT: s_wait_loadcnt 0x4 +; GFX1250-NEXT: v_dual_cndmask_b32 v6, v68, v29 :: v_dual_lshrrev_b32 v29, 16, v29 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX1250-NEXT: v_dual_lshrrev_b32 v68, 16, v68 :: v_dual_cndmask_b32 v4, v64, v65, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX1250-NEXT: v_dual_lshrrev_b32 v65, 16, v65 :: v_dual_lshrrev_b32 v64, 16, v64 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_dual_cndmask_b32 v2, v32, v33 :: v_dual_lshrrev_b32 v33, 16, v33 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1250-NEXT: v_dual_lshrrev_b32 v32, 16, v32 :: v_dual_cndmask_b32 v0, v35, v83, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27 +; GFX1250-NEXT: v_dual_lshrrev_b32 v83, 16, v83 :: v_dual_cndmask_b32 v27, v36, v37, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25 +; GFX1250-NEXT: v_cndmask_b32_e32 v25, v38, v39, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23 +; GFX1250-NEXT: v_dual_lshrrev_b32 v35, 16, v35 :: v_dual_cndmask_b32 v23, v48, v49, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21 +; GFX1250-NEXT: v_cndmask_b32_e32 v21, v50, v51, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19 +; GFX1250-NEXT: v_cndmask_b32_e32 v19, v52, v53, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17 +; GFX1250-NEXT: v_cndmask_b32_e32 v17, v54, v55, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15 +; GFX1250-NEXT: v_cndmask_b32_e32 v15, v66, v67, vcc_lo +; 
GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13 +; GFX1250-NEXT: v_cndmask_b32_e32 v13, v70, v71, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11 +; GFX1250-NEXT: v_cndmask_b32_e32 v11, v81, v82, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 +; GFX1250-NEXT: v_cndmask_b32_e32 v7, v68, v29, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX1250-NEXT: v_cndmask_b32_e32 v3, v32, v33, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1250-NEXT: v_cndmask_b32_e32 v1, v35, v83, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 +; GFX1250-NEXT: v_cndmask_b32_e32 v5, v64, v65, vcc_lo +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9 +; GFX1250-NEXT: v_cndmask_b32_e32 v9, v69, v80, vcc_lo +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v1, v3, v2, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v2, v5, v4, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v3, v7, v6, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v4, v9, v8, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v5, v11, v10, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v6, v13, v12, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v7, v15, v14, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v8, v17, v16, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v9, v19, v18, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v10, v21, v20, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v11, v23, v22, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v12, v25, v24, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v13, v27, v26, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v14, v28, v31, 0x5040100 +; GFX1250-NEXT: v_perm_b32 v15, v34, v30, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b ret <32 x bfloat> %op } @@ -45864,6 +49040,9 @@ declare bfloat @llvm.fma.bf16(bfloat, bfloat, bfloat) declare <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>) declare <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>) declare <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>) +declare <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>) +declare <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat>, <16 x bfloat>, <16 x bfloat>) +declare <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat>, <32 x bfloat>, <32 x bfloat>) define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GCN-LABEL: v_fma_bf16: @@ -45987,6 +49166,13 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fma_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c) ret bfloat %op } @@ -46178,6 +49364,13 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fma_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) ret <2 x bfloat> %op } @@ -46446,6 +49639,14 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x 
bfloat> %b, <3 x bfloat> ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fma_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <3 x bfloat> @llvm.fma.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) ret <3 x bfloat> %op } @@ -46780,144 +49981,4825 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fma_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) ret <4 x bfloat> %op } -declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat) -declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>) -declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>) -declare <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>) - -define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { -; GCN-LABEL: v_fmuladd_bf16: +define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) { +; GCN-LABEL: v_fma_v8bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 ; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 ; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; 
GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 ; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GCN-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-NEXT: v_fma_f32 v7, v7, v15, v23 +; GCN-NEXT: v_fma_f32 v6, v6, v14, v22 +; GCN-NEXT: v_fma_f32 v5, v5, v13, v21 +; GCN-NEXT: v_fma_f32 v4, v4, v12, v20 +; GCN-NEXT: v_fma_f32 v3, v3, v11, v19 +; GCN-NEXT: v_fma_f32 v2, v2, v10, v18 +; GCN-NEXT: v_fma_f32 v1, v1, v9, v17 +; GCN-NEXT: v_fma_f32 v0, v0, v8, v16 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX7-LABEL: v_fmuladd_bf16: +; GFX7-LABEL: v_fma_v8bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_fma_f32 v7, v7, v15, v23 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_fma_f32 v6, v6, v14, v15 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v21 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_fma_f32 v5, v5, v13, v14 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v20 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_fma_f32 v4, v4, v12, v13 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v19 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_fma_f32 v3, v3, v11, v12 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v18 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_fma_f32 v2, v2, v10, v11 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v16 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: 
v_and_b32_e32 v9, 0xffff0000, v9 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_fma_f32 v1, v1, v9, v11 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v10 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 -; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 -; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_fma_f32 v0, v0, v8, v9 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX8-LABEL: v_fmuladd_bf16: +; GFX8-LABEL: v_fma_v8bf16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 -; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; GFX8-NEXT: v_fma_f32 v12, v14, v13, v12 +; GFX8-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v12 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13 +; GFX8-NEXT: v_fma_f32 v3, v3, v7, v11 +; GFX8-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v11, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v2 +; GFX8-NEXT: v_fma_f32 v7, v13, v11, v7 +; GFX8-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v7 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11 +; GFX8-NEXT: v_fma_f32 v2, v2, v6, v10 +; GFX8-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v10, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX8-NEXT: v_fma_f32 v6, v11, v10, v6 +; GFX8-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v6 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_and_b32_e32 v5, 
0xffff0000, v5 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10 +; GFX8-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5 +; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; GFX8-NEXT: v_fma_f32 v5, v10, v9, v5 +; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9 +; GFX8-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v5, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v6, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v7, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX900-LABEL: v_fmuladd_bf16: +; GFX900-LABEL: v_fma_v8bf16: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2 -; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; GFX900-NEXT: v_fma_f32 v12, v14, v13, v12 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_bfe_u32 v13, v12, 16, 1 ; GFX900-NEXT: s_movk_i32 s4, 0x7fff -; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_fma_f32 v3, v3, v7, v11 +; GFX900-NEXT: v_add3_u32 v13, v13, v12, s4 +; GFX900-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_bfe_u32 v7, v3, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v13, v14, vcc +; GFX900-NEXT: v_add3_u32 v7, v7, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v7, v11, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v6 +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v2 +; GFX900-NEXT: v_fma_f32 v7, v13, v11, v7 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_bfe_u32 v11, v7, 16, 1 +; GFX900-NEXT: 
v_fma_f32 v2, v2, v6, v10 +; GFX900-NEXT: v_add3_u32 v11, v11, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_bfe_u32 v6, v2, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v11, v13, vcc +; GFX900-NEXT: v_add3_u32 v6, v6, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v6, v10, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v5 +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX900-NEXT: v_fma_f32 v6, v11, v10, v6 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX900-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX900-NEXT: v_add3_u32 v10, v10, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v10, v11, vcc +; GFX900-NEXT: v_add3_u32 v5, v5, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX900-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v0 +; GFX900-NEXT: v_fma_f32 v5, v10, v9, v5 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_bfe_u32 v9, v5, 16, 1 +; GFX900-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX900-NEXT: v_add3_u32 v9, v9, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc +; GFX900-NEXT: v_add3_u32 v4, v4, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v5, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v6, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v7, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX950-LABEL: v_fmuladd_bf16: +; GFX950-LABEL: v_fma_v8bf16: ; GFX950: ; %bb.0: ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_fmac_f32_e32 v12, v14, v13 +; GFX950-NEXT: v_fmac_f32_e32 v11, v3, v7 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v10 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v6 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 +; GFX950-NEXT: v_fmac_f32_e32 v3, v13, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_fmac_f32_e32 v7, v2, v6 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 +; GFX950-NEXT: v_fmac_f32_e32 v2, v10, v6 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; GFX950-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 ; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_fmac_f32_e32 v6, v1, v5 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v4 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 +; GFX950-NEXT: v_fmac_f32_e32 v1, v9, v5 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX950-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v2, s0 +; GFX950-NEXT: v_fmac_f32_e32 v5, v0, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v5, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v6, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v7, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v11, v12 ; GFX950-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: v_fmuladd_bf16: +; GFX10-LABEL: v_fma_v8bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GFX10-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; GFX10-NEXT: v_fmac_f32_e32 v12, v14, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_fmac_f32_e32 v11, v3, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; GFX10-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_add3_u32 v13, v13, v12, 0x7fff +; GFX10-NEXT: v_fmac_f32_e32 v3, v14, v7 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v10 +; GFX10-NEXT: v_bfe_u32 v16, v11, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v13, v15, vcc_lo +; GFX10-NEXT: v_bfe_u32 v13, v3, 16, 1 +; GFX10-NEXT: v_fmac_f32_e32 v7, v2, v6 +; GFX10-NEXT: v_add3_u32 v12, v16, v11, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5 +; GFX10-NEXT: v_add3_u32 v13, v13, v3, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v16, v7, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_fmac_f32_e32 v2, v14, v6 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_add3_u32 v6, v16, v7, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v15, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX10-NEXT: v_bfe_u32 v14, v2, 16, 1 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX10-NEXT: v_fmac_f32_e32 v9, v1, v5 +; GFX10-NEXT: v_fmac_f32_e32 v15, v18, v16 ; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; GFX10-NEXT: v_fmac_f32_e32 v8, v0, v4 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc_lo +; GFX10-NEXT: v_add3_u32 v0, v14, v2, 0x7fff +; GFX10-NEXT: v_bfe_u32 v4, v9, 16, 1 +; 
GFX10-NEXT: v_bfe_u32 v5, v15, 16, 1 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX10-NEXT: v_add3_u32 v0, v0, v2, 0x7fff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v11 +; GFX10-NEXT: v_add3_u32 v2, v5, v15, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo +; GFX10-NEXT: v_add3_u32 v0, v4, v9, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v15 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX10-NEXT: v_add3_u32 v5, v7, v8, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v13, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX10-NEXT: v_perm_b32 v0, v4, v2, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v6, v3, 0x7060302 +; GFX10-NEXT: v_perm_b32 v1, v5, v1, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v12, v17, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v7, v10, 0x7060302 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11TRUE16-LABEL: v_fmuladd_bf16: +; GFX11TRUE16-LABEL: v_fma_v8bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l -; GFX11TRUE16-NEXT: v_fmac_f32_e32 v3, v1, v2 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_bfe_u32 v0, v3, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3 -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11FAKE16-LABEL: v_fmuladd_bf16: -; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GFX11FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1 -; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v2 -; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v2, 0x7fff -; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] - %op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c) - ret bfloat %op -} +; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX11TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v7 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11TRUE16-NEXT: 
v_lshlrev_b32_e32 v7, 16, v7 +; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v6 +; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v2 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v3 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v11, v3, v7 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v10 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v7, v2, v6 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v12, v14, v13 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v11 +; GFX11TRUE16-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v12 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_add3_u32 v13, v13, v12, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v1 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v10 +; GFX11TRUE16-NEXT: v_bfe_u32 v10, v11, 16, 1 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11TRUE16-NEXT: v_bfe_u32 v13, v7, 16, 1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v14, v16, v15 +; GFX11TRUE16-NEXT: v_add3_u32 v2, v10, v11, 0x7fff +; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v5 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11TRUE16-NEXT: v_bfe_u32 v15, v14, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v2, v6, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_add3_u32 v10, v15, v14, 0x7fff +; GFX11TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v9 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v14 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.h +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v9, v1, v5 +; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v4 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v12, vcc_lo +; GFX11TRUE16-NEXT: v_add3_u32 v10, v13, v7, 0x7fff +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11TRUE16-NEXT: v_bfe_u32 v7, v9, 16, 1 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v4, v10, v11 :: v_dual_and_b32 v5, 0xffff0000, v8 +; GFX11TRUE16-NEXT: v_add3_u32 v7, v7, v9, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v15, v17, v16 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_bfe_u32 v12, v15, 16, 1 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v13, v16, 
v14 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v5, v0, v1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v15 +; GFX11TRUE16-NEXT: v_add3_u32 v8, v12, v15, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_bfe_u32 v0, v13, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v13 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 +; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v13, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v9, v11, v5, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v0, v12, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fma_v8bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v7 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v3 +; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v12, v14, v13 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v11, v3, v7 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v6 +; GFX11FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v12 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11FAKE16-NEXT: v_add3_u32 v13, v13, v12, 0x7fff +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v10 +; GFX11FAKE16-NEXT: v_bfe_u32 v16, v11, 16, 1 +; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v11 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v3, v14, v7 +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v10, v13, v15 :: v_dual_and_b32 v7, 0xffff0000, v10 +; GFX11FAKE16-NEXT: v_add3_u32 v12, v16, v11, 0x7fff +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v1 +; GFX11FAKE16-NEXT: v_bfe_u32 v13, v3, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v15, 0x400000, v3 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_add3_u32 v13, v13, v3, 0x7fff +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v3, v13, v15 :: v_dual_and_b32 v2, 0xffff0000, v2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v7, v2, v6 :: v_dual_lshlrev_b32 v6, 16, v5 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v8 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v9 +; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11FAKE16-NEXT: v_bfe_u32 v16, v7, 
16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v2, v14, v6 :: v_dual_and_b32 v5, 0xffff0000, v5 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11FAKE16-NEXT: v_add3_u32 v6, v16, v7, 0x7fff +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v9, v1, v5 :: v_dual_and_b32 v8, 0xffff0000, v8 +; GFX11FAKE16-NEXT: v_bfe_u32 v14, v2, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v9 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v8, v0, v4 +; GFX11FAKE16-NEXT: v_add3_u32 v0, v14, v2, 0x7fff +; GFX11FAKE16-NEXT: v_bfe_u32 v4, v9, 16, 1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_bfe_u32 v7, v8, 16, 1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v15, v18, v16 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_add3_u32 v0, v4, v9, 0x7fff +; GFX11FAKE16-NEXT: v_bfe_u32 v5, v15, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v15 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_add3_u32 v2, v5, v15, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v5, v7, v8, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v0, v13, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11FAKE16-NEXT: v_perm_b32 v0, v4, v2, 0x7060302 +; GFX11FAKE16-NEXT: v_perm_b32 v2, v6, v3, 0x7060302 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_perm_b32 v1, v5, v1, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v12, v17, vcc_lo +; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v10, 0x7060302 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fma_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v4, v8 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v5, v9 +; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v6, v10 +; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v7, v11 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %op = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) + ret <8 x bfloat> %op +} + +define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) { +; GCN-LABEL: v_fma_v16bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GCN-NEXT: 
v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: v_fma_f32 v15, v15, v31, v32 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: v_fma_f32 v14, v14, v30, v31 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: v_fma_f32 v13, v13, v29, v30 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: v_fma_f32 v12, v12, v28, v29 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: v_fma_f32 v11, v11, v27, v28 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_fma_f32 v10, v10, v26, v27 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_fma_f32 v9, v9, v25, v26 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_fma_f32 v8, v8, v24, v25 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:32 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: 
v_fma_f32 v7, v7, v23, v24 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:28 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_fma_f32 v6, v6, v22, v23 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_fma_f32 v5, v5, v21, v22 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:20 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_fma_f32 v4, v4, v20, v21 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_fma_f32 v3, v3, v19, v20 +; GCN-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_fma_f32 v2, v2, v18, v19 +; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v20 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_fma_f32 v1, v1, v17, v18 +; GCN-NEXT: v_fma_f32 v0, v0, v16, v19 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, 
v15 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fma_v16bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_fma_f32 v15, v15, v31, v32 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: v_and_b32_e32 v31, 
0xffff0000, v31 +; GFX7-NEXT: v_fma_f32 v14, v14, v30, v31 +; GFX7-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_fma_f32 v13, v13, v29, v30 +; GFX7-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:52 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_fma_f32 v12, v12, v28, v29 +; GFX7-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: v_fma_f32 v11, v11, v27, v28 +; GFX7-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_fma_f32 v10, v10, v26, v27 +; GFX7-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: v_fma_f32 v9, v9, v25, v26 +; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_fma_f32 v8, v8, v24, v25 +; GFX7-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:32 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: v_fma_f32 v7, v7, v23, v24 +; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:28 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: v_fma_f32 v6, v6, v22, v23 +; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_fma_f32 v5, v5, v21, v22 +; GFX7-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:20 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: v_fma_f32 v4, v4, v20, v21 +; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_fma_f32 v3, v3, v19, v20 +; GFX7-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12 +; GFX7-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_fma_f32 v2, v2, v18, v19 +; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; 
GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v20 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_fma_f32 v1, v1, v17, v18 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v19 +; GFX7-NEXT: v_fma_f32 v0, v0, v16, v17 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v16bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v15 +; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: v_fma_f32 v24, v26, v25, v24 +; GFX8-NEXT: v_fma_f32 v7, v7, v15, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v14 +; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: v_fma_f32 v15, v25, v23, v15 +; GFX8-NEXT: v_fma_f32 v6, v6, v14, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: v_fma_f32 v14, v23, v22, v14 +; GFX8-NEXT: v_fma_f32 v5, v5, v13, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v12 +; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: v_fma_f32 v13, v22, v21, v13 +; GFX8-NEXT: v_fma_f32 v4, v4, v12, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: v_fma_f32 v12, v21, v20, v12 +; GFX8-NEXT: v_fma_f32 v3, v3, v11, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: v_fma_f32 v11, v20, v19, v11 +; GFX8-NEXT: v_fma_f32 v2, v2, v10, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: v_fma_f32 v10, v19, v18, v10 +; GFX8-NEXT: v_fma_f32 v1, v1, v9, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v8 +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: v_fma_f32 v0, v0, v8, v16 +; GFX8-NEXT: v_bfe_u32 v8, v24, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v24 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; 
GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8 +; GFX8-NEXT: v_or_b32_e32 v16, 0x400000, v24 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v16, vcc +; GFX8-NEXT: v_bfe_u32 v16, v7, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v7 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_fma_f32 v9, v18, v17, v9 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v15 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v16, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v6 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v16, v14, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v14 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v14 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v16, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v5 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v16, v13, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v13 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v16, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v4 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v16, v12, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v12 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v12 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v16, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v3 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v16, v11, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v11 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v11 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v16, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v2 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v16, v10, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v10 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v16, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, 
v1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v16, v9, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v9 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v17, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v9, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v10, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v11, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v12, 16 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v13, 16 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v14, 16 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v15, 16 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v8, 16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_fma_v16bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v15 +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: v_fma_f32 v24, v26, v25, v24 +; GFX900-NEXT: v_fma_f32 v7, v7, v15, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v15, 16, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v14 +; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v6 +; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: v_fma_f32 v15, v25, v23, v15 +; GFX900-NEXT: v_fma_f32 v6, v6, v14, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v13 +; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: v_fma_f32 v14, v23, v22, v14 +; GFX900-NEXT: v_fma_f32 v5, v5, v13, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v12 +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v4 +; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: v_fma_f32 v13, v22, v21, v13 +; GFX900-NEXT: v_fma_f32 v4, v4, v12, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v11 +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v3 +; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: v_fma_f32 v12, v21, v20, v12 +; GFX900-NEXT: v_fma_f32 v3, v3, v11, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, 
v10 +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v2 +; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: v_fma_f32 v11, v20, v19, v11 +; GFX900-NEXT: v_fma_f32 v2, v2, v10, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v10, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v9 +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v1 +; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: v_fma_f32 v10, v19, v18, v10 +; GFX900-NEXT: v_fma_f32 v1, v1, v9, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v9, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v8 +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v0 +; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX900-NEXT: v_fma_f32 v0, v0, v8, v16 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_bfe_u32 v8, v24, 16, 1 +; GFX900-NEXT: v_add3_u32 v8, v8, v24, s4 +; GFX900-NEXT: v_or_b32_e32 v16, 0x400000, v24 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v8, v16, vcc +; GFX900-NEXT: v_bfe_u32 v16, v7, 16, 1 +; GFX900-NEXT: v_fma_f32 v9, v18, v17, v9 +; GFX900-NEXT: v_add3_u32 v16, v16, v7, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v7 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc +; GFX900-NEXT: v_bfe_u32 v16, v15, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v15, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v15 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc +; GFX900-NEXT: v_bfe_u32 v16, v6, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v6, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v6 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v16, v17, vcc +; GFX900-NEXT: v_bfe_u32 v16, v14, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v14, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v16, v17, vcc +; GFX900-NEXT: v_bfe_u32 v16, v5, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v5, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v5 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v16, v17, vcc +; GFX900-NEXT: v_bfe_u32 v16, v13, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v13, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v13 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v16, v17, vcc +; GFX900-NEXT: v_bfe_u32 v16, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v4, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v16, v17, vcc +; GFX900-NEXT: v_bfe_u32 v16, v12, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v12, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v12 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v16, v17, vcc +; GFX900-NEXT: v_bfe_u32 v16, v3, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v3, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v3 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc +; GFX900-NEXT: v_bfe_u32 v16, v11, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v11, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v11 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, 
v11, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v16, v17, vcc +; GFX900-NEXT: v_bfe_u32 v16, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v2, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v2 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; GFX900-NEXT: v_bfe_u32 v16, v10, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v10, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v10 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v16, v17, vcc +; GFX900-NEXT: v_bfe_u32 v16, v1, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v1, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v1 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; GFX900-NEXT: v_bfe_u32 v16, v9, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v9, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v16, v17, vcc +; GFX900-NEXT: v_bfe_u32 v16, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v16, v17, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v9, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v10, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v11, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v12, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v13, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v14, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v15, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v8, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fma_v16bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GFX950-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 +; GFX950-NEXT: v_and_b32_e32 v26, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_fmac_f32_e32 v24, v26, v25 +; GFX950-NEXT: v_fmac_f32_e32 v23, v7, v15 +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v22 +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v14 +; GFX950-NEXT: v_and_b32_e32 v25, 0xffff0000, v6 +; GFX950-NEXT: v_fmac_f32_e32 v7, v25, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v15, 16, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_fmac_f32_e32 v15, v6, v14 +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v21 +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v13 +; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v5 +; GFX950-NEXT: v_fmac_f32_e32 v6, v22, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v14, 16, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_fmac_f32_e32 v14, v5, v13 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v20 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v12 +; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v4 +; GFX950-NEXT: v_fmac_f32_e32 v5, v21, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v13, 16, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_fmac_f32_e32 v13, v4, v12 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v19 +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v11 +; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v3 +; GFX950-NEXT: v_fmac_f32_e32 v4, v20, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v12, 16, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; 
GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_fmac_f32_e32 v12, v3, v11 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v18 +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v10 +; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v2 +; GFX950-NEXT: v_fmac_f32_e32 v3, v19, v11 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_fmac_f32_e32 v11, v2, v10 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v17 +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v9 +; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 +; GFX950-NEXT: v_fmac_f32_e32 v2, v18, v10 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_fmac_f32_e32 v10, v1, v9 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v16 +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v8 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v0 +; GFX950-NEXT: v_fmac_f32_e32 v1, v17, v9 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v16 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_fmac_f32_e32 v9, v0, v8 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v9, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v10, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v11, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v12, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v13, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v14, v6 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v15, v7 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v23, v24 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_v16bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_fmac_f32_e32 v24, v26, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_fmac_f32_e32 v23, v7, v15 +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; GFX10-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX10-NEXT: v_bfe_u32 v28, v23, 16, 1 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX10-NEXT: v_add3_u32 v25, v25, v24, 0x7fff +; GFX10-NEXT: v_fmac_f32_e32 v7, v26, v15 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v22 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX10-NEXT: v_add3_u32 v24, v28, v23, 0x7fff +; GFX10-NEXT: v_bfe_u32 v26, v7, 16, 1 +; GFX10-NEXT: v_fmac_f32_e32 v15, v6, v14 +; GFX10-NEXT: v_cndmask_b32_e32 v22, v25, v27, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v14, 16, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v5 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_fmac_f32_e32 v6, v27, v14 +; GFX10-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc_lo +; GFX10-NEXT: v_add3_u32 v24, v26, v7, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v7 +; GFX10-NEXT: v_bfe_u32 v26, v15, 16, 1 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v21 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX10-NEXT: v_add3_u32 v21, v26, v15, 0x7fff +; GFX10-NEXT: 
v_fmac_f32_e32 v14, v5, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v7, v24, v25, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v15 +; GFX10-NEXT: v_bfe_u32 v25, v6, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_fmac_f32_e32 v5, v26, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v15, v21, v24, vcc_lo +; GFX10-NEXT: v_add3_u32 v21, v25, v6, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v6 +; GFX10-NEXT: v_bfe_u32 v25, v14, 16, 1 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_add3_u32 v20, v25, v14, 0x7fff +; GFX10-NEXT: v_fmac_f32_e32 v13, v4, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v6, v21, v24, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v14 +; GFX10-NEXT: v_bfe_u32 v24, v5, 16, 1 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v3 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_fmac_f32_e32 v4, v25, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v14, v20, v21, vcc_lo +; GFX10-NEXT: v_add3_u32 v20, v24, v5, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v5 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v10 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX10-NEXT: v_bfe_u32 v24, v13, 16, 1 +; GFX10-NEXT: v_fmac_f32_e32 v12, v3, v11 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_fmac_f32_e32 v19, v26, v25 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v20, v21, vcc_lo +; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX10-NEXT: v_add3_u32 v21, v24, v13, 0x7fff +; GFX10-NEXT: v_bfe_u32 v24, v12, 16, 1 +; GFX10-NEXT: v_bfe_u32 v25, v19, 16, 1 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX10-NEXT: v_add3_u32 v11, v20, v4, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v4 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v19 +; GFX10-NEXT: v_fmac_f32_e32 v18, v2, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v11, v20, vcc_lo +; GFX10-NEXT: v_add3_u32 v11, v24, v12, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v12 +; GFX10-NEXT: v_add3_u32 v24, v25, v19, 0x7fff +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_fmac_f32_e32 v2, v25, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v8 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_bfe_u32 v20, v2, 16, 1 +; GFX10-NEXT: v_fmac_f32_e32 v17, v1, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v24, v26, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v16 +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: 
v_add3_u32 v1, v20, v2, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX10-NEXT: v_fmac_f32_e32 v24, v26, v25 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_fmac_f32_e32 v16, v0, v8 +; GFX10-NEXT: v_bfe_u32 v0, v17, 16, 1 +; GFX10-NEXT: v_bfe_u32 v27, v18, 16, 1 +; GFX10-NEXT: v_bfe_u32 v8, v24, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v17 +; GFX10-NEXT: v_add3_u32 v0, v0, v17, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX10-NEXT: v_bfe_u32 v2, v16, 16, 1 +; GFX10-NEXT: v_add3_u32 v8, v8, v24, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v24 +; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v9, v0, v9, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX10-NEXT: v_add3_u32 v2, v2, v16, 0x7fff +; GFX10-NEXT: v_add3_u32 v12, v27, v18, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v18 +; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX10-NEXT: v_perm_b32 v1, v9, v1, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v25, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v8, v12, v19, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX10-NEXT: v_perm_b32 v2, v8, v10, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v12, v21, v3, vcc_lo +; GFX10-NEXT: v_perm_b32 v3, v11, v4, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v12, v5, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v14, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v15, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v23, v22, 0x7060302 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11TRUE16-LABEL: v_fma_v16bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v7 +; GFX11TRUE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v14 +; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v6 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v15 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v24, v26, v25 :: v_dual_lshlrev_b32 v7, 16, v7 +; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v22 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11TRUE16-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v26, v28, v27 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v22, v6, v14 +; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v21 +; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 0xffff0000, v13 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11TRUE16-NEXT: v_add3_u32 v25, v25, v24, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v29, 0x400000, v24 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v23, v7, v15 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v25, v29, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v5 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_bfe_u32 v15, v23, 16, 1 +; 
GFX11TRUE16-NEXT: v_bfe_u32 v24, v26, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v26 +; GFX11TRUE16-NEXT: v_add3_u32 v15, v15, v23, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v24, v24, v26, 0x7fff +; GFX11TRUE16-NEXT: v_bfe_u32 v23, v22, 16, 1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v14, v29, v28 :: v_dual_cndmask_b32 v15, v15, v25 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v12 +; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v4 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v24, v27, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v20 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11TRUE16-NEXT: v_add3_u32 v23, v23, v22, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v27, 0x400000, v22 +; GFX11TRUE16-NEXT: v_bfe_u32 v28, v14, 16, 1 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v20, v4, v12 +; GFX11TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v19 +; GFX11TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v11 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v24, v26, v25 +; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v14 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v21, v5, v13 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v23, v27, vcc_lo +; GFX11TRUE16-NEXT: v_add3_u32 v5, v28, v14, 0x7fff +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11TRUE16-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v23, v21, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v27, v20, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v24 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v22, vcc_lo +; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21 +; GFX11TRUE16-NEXT: v_add3_u32 v14, v23, v21, 0x7fff +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11TRUE16-NEXT: v_add3_u32 v23, v25, v24, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v21, v27, v20, 0x7fff +; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v13.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v14, v22, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v3 +; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v18 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v14.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v12, v25, v4 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v23, v26, vcc_lo +; GFX11TRUE16-NEXT: v_and_b32_e32 v25, 0xffff0000, v10 +; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 0xffff0000, v2 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11TRUE16-NEXT: v_bfe_u32 v23, v12, 16, 1 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v24, v26, v25 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v20, v21, v22 :: v_dual_and_b32 v25, 0xffff0000, v1 +; GFX11TRUE16-NEXT: v_add3_u32 v21, v23, v12, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_bfe_u32 v23, v24, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v12 +; 
GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v20.h +; GFX11TRUE16-NEXT: v_add3_u32 v12, v23, v24, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v9 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v19, v3, v11 +; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v3, v21, v22 :: v_dual_and_b32 v22, 0xffff0000, v17 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v18 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11TRUE16-NEXT: v_bfe_u32 v18, v19, 16, 1 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v22, v25, v23 :: v_dual_fmac_f32 v11, v2, v10 +; GFX11TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v19 +; GFX11TRUE16-NEXT: v_add3_u32 v2, v18, v19, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v24 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8 +; GFX11TRUE16-NEXT: v_bfe_u32 v21, v11, 16, 1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v17, v1, v9 :: v_dual_cndmask_b32 v10, v2, v10 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v16 +; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v8 +; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v12, v18, vcc_lo +; GFX11TRUE16-NEXT: v_add3_u32 v12, v21, v11, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16 +; GFX11TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v11 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11TRUE16-NEXT: v_bfe_u32 v11, v17, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v21, v24, v23 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v9, v0, v1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v12, v18, vcc_lo +; GFX11TRUE16-NEXT: v_add3_u32 v11, v11, v17, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v17 +; GFX11TRUE16-NEXT: v_bfe_u32 v0, v21, 16, 1 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11TRUE16-NEXT: v_add3_u32 v12, v19, v22, 0x7fff +; GFX11TRUE16-NEXT: v_bfe_u32 v18, v9, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v21 +; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v21, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v16, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v22 +; GFX11TRUE16-NEXT: v_add3_u32 v16, v18, v9, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v9 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v8.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v0, v19, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v12, v1, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v11.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v16, v17, vcc_lo +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v18.h +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fma_v16bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v23 +; 
GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v7 +; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v24, v26, v25 :: v_dual_and_b32 v23, 0xffff0000, v23 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v23, v7, v15 :: v_dual_lshlrev_b32 v26, 16, v6 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v14 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_bfe_u32 v25, v24, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v27, 0x400000, v24 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11FAKE16-NEXT: v_bfe_u32 v28, v23, 16, 1 +; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX11FAKE16-NEXT: v_add3_u32 v25, v25, v24, 0x7fff +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v22 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_add3_u32 v24, v28, v23, 0x7fff +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v7, v26, v15 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v22, v25, v27 :: v_dual_and_b32 v15, 0xffff0000, v22 +; GFX11FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 +; GFX11FAKE16-NEXT: v_bfe_u32 v26, v7, 16, 1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v27, 16, v5 +; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v24, v25, vcc_lo +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_add3_u32 v24, v26, v7, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v7 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v7, v24, v25 :: v_dual_and_b32 v6, 0xffff0000, v6 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v15, v6, v14 :: v_dual_lshlrev_b32 v14, 16, v13 +; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX11FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v15 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v21 +; GFX11FAKE16-NEXT: v_bfe_u32 v26, v15, 16, 1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v6, v27, v14 +; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v21 +; GFX11FAKE16-NEXT: v_add3_u32 v21, v26, v15, 0x7fff +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v4 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_bfe_u32 v25, v6, 16, 1 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v14, v5, v13 :: v_dual_lshlrev_b32 v5, 16, v20 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v21, v24, vcc_lo +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_add3_u32 v21, v25, v6, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v6 +; GFX11FAKE16-NEXT: v_bfe_u32 v25, v14, 16, 1 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v5, v26, v13 :: v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v20 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v2 +; GFX11FAKE16-NEXT: v_and_b32_e32 
v4, 0xffff0000, v4 +; GFX11FAKE16-NEXT: v_add3_u32 v20, v25, v14, 0x7fff +; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v6, v21, v24 :: v_dual_lshlrev_b32 v25, 16, v3 +; GFX11FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v14 +; GFX11FAKE16-NEXT: v_bfe_u32 v24, v5, 16, 1 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v13, v4, v12 :: v_dual_lshlrev_b32 v4, 16, v19 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v12, 16, v11 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v4, v25, v12 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v20, v21, vcc_lo +; GFX11FAKE16-NEXT: v_add3_u32 v20, v24, v5, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v21, 0x400000, v5 +; GFX11FAKE16-NEXT: v_bfe_u32 v24, v13, 16, 1 +; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v19 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v10 +; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v12, v3, v11 :: v_dual_cndmask_b32 v5, v20, v21 +; GFX11FAKE16-NEXT: v_add3_u32 v21, v24, v13, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v13 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v19, 16, v18 +; GFX11FAKE16-NEXT: v_bfe_u32 v20, v4, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v24, v12, 16, 1 +; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v19, v26, v25 +; GFX11FAKE16-NEXT: v_add3_u32 v11, v20, v4, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v4 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v18, v2, v10 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v17 +; GFX11FAKE16-NEXT: v_bfe_u32 v25, v19, 16, 1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v9 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v11, v20, vcc_lo +; GFX11FAKE16-NEXT: v_add3_u32 v11, v24, v12, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v12 +; GFX11FAKE16-NEXT: v_add3_u32 v24, v25, v19, 0x7fff +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v1 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v19 +; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v2, v25, v10 :: v_dual_and_b32 v9, 0xffff0000, v9 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v11, v20, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v25, 16, v8 +; GFX11FAKE16-NEXT: v_bfe_u32 v20, v2, 16, 1 +; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v24, v26, vcc_lo +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v24, 16, v16 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v26, 16, v0 +; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v17, v1, v9 :: v_dual_and_b32 v0, 0xffff0000, v0 +; GFX11FAKE16-NEXT: v_add3_u32 v1, v20, v2, 0x7fff +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v24, v26, v25 +; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v16, v0, v8 +; GFX11FAKE16-NEXT: v_bfe_u32 v0, v17, 16, 1 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; 
GFX11FAKE16-NEXT: v_bfe_u32 v8, v24, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v24 +; GFX11FAKE16-NEXT: v_bfe_u32 v2, v16, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v17, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo +; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v17 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11FAKE16-NEXT: v_add3_u32 v8, v8, v24, 0x7fff +; GFX11FAKE16-NEXT: v_bfe_u32 v27, v18, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v16, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v16 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v0, v9, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX11FAKE16-NEXT: v_add3_u32 v12, v27, v18, 0x7fff +; GFX11FAKE16-NEXT: v_or_b32_e32 v19, 0x400000, v18 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v1, v9, v1, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v20, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v25, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v12, v19, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v2, v8, v10, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v21, v3, vcc_lo +; GFX11FAKE16-NEXT: v_perm_b32 v3, v11, v4, 0x7060302 +; GFX11FAKE16-NEXT: v_perm_b32 v4, v12, v5, 0x7060302 +; GFX11FAKE16-NEXT: v_perm_b32 v5, v14, v6, 0x7060302 +; GFX11FAKE16-NEXT: v_perm_b32 v6, v15, v7, 0x7060302 +; GFX11FAKE16-NEXT: v_perm_b32 v7, v23, v22, 0x7060302 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fma_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v8, v16 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v9, v17 +; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v10, v18 +; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v11, v19 +; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v12, v20 +; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v13, v21 +; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v14, v22 +; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v15, v23 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %op = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) + ret <16 x bfloat> %op +} + +define <32 x bfloat> @v_fma_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) { +; GCN-LABEL: v_fma_v32bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:256 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: v_fma_f32 v31, v31, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:252 +; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: s_waitcnt 
vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v30, v30, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248 +; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v29, v29, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:244 +; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v28, v28, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 +; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v27, v27, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:236 +; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v26, v26, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:232 +; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v25, v25, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228 +; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v24, v24, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224 +; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, 
v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v23, v23, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 +; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v22, v22, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 +; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v21, v21, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 +; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v20, v20, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 +; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v19, v19, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 +; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v18, v18, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:200 +; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v17, v17, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196 +; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: 
v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v16, v16, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:192 +; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v15, v15, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 +; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v14, v14, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184 +; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v13, v13, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180 +; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v12, v12, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 +; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v11, v11, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 +; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v10, v10, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:168 +; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 
v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v9, v9, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164 +; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v8, v8, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160 +; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v7, v7, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 +; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v6, v6, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:152 +; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v5, v5, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 +; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v4, v4, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 +; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v3, v3, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v2, 
v2, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v1, v1, v32, v33 +; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GCN-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GCN-NEXT: v_fma_f32 v0, v0, v32, v33 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fma_v32bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:256 +; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29 +; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23 +; GFX7-NEXT: v_and_b32_e32 v23, 
0xffff0000, v23 +; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_waitcnt vmcnt(2) +; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: v_fma_f32 v31, v31, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:252 +; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v30, v30, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:248 +; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v29, v29, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:244 
+; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v28, v28, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 +; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v27, v27, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:236 +; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v26, v26, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:232 +; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v25, v25, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228 +; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v24, v24, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:224 +; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v23, v23, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:220 +; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v22, v22, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:216 +; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: 
v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v21, v21, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 +; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v20, v20, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 +; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v19, v19, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:204 +; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v18, v18, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:200 +; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v17, v17, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:196 +; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v16, v16, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:192 +; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v15, v15, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:188 +; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v14, v14, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:184 +; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX7-NEXT: 
s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v13, v13, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:180 +; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v12, v12, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:176 +; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v11, v11, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:172 +; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v10, v10, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:168 +; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v9, v9, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:164 +; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v8, v8, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:160 +; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v7, v7, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:156 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v6, v6, v32, v33 +; 
GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:152 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v5, v5, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:148 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v4, v4, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:144 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v3, v3, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:140 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v2, v2, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:136 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v1, v1, v32, v33 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_waitcnt vmcnt(1) +; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v33, 1.0, v33 +; GFX7-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX7-NEXT: v_fma_f32 v0, v0, v32, v33 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v32bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX8-NEXT: s_movk_i32 s4, 0x7fff +; GFX8-NEXT: s_waitcnt vmcnt(1) +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v15, v15, v33, v32 +; GFX8-NEXT: buffer_load_dword v33, 
off, s[0:3], s32 offset:60 +; GFX8-NEXT: v_fma_f32 v31, v31, v35, v34 +; GFX8-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v14, v14, v30, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; GFX8-NEXT: v_fma_f32 v32, v34, v32, v35 +; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; GFX8-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v13, v13, v29, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; GFX8-NEXT: v_fma_f32 v30, v34, v30, v35 +; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v12, v12, v28, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; GFX8-NEXT: v_fma_f32 v29, v34, v29, v35 +; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v11 +; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v11, v11, v27, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 +; GFX8-NEXT: v_fma_f32 v28, v34, v28, v35 +; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v10, v10, v26, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; GFX8-NEXT: v_fma_f32 v27, v34, v27, v35 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v9 +; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v9, v9, v25, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GFX8-NEXT: v_fma_f32 v26, v35, v34, v26 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v8 +; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v8, v8, v24, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; GFX8-NEXT: v_fma_f32 v25, v35, v34, v25 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: 
v_lshlrev_b32_e32 v24, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v7, v7, v23, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GFX8-NEXT: v_fma_f32 v24, v35, v34, v24 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v6 +; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v6, v6, v22, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; GFX8-NEXT: v_fma_f32 v23, v35, v34, v23 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v5 +; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v5, v5, v21, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GFX8-NEXT: v_fma_f32 v22, v35, v34, v22 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v4, v4, v20, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; GFX8-NEXT: v_fma_f32 v21, v35, v34, v21 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v3, v3, v19, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX8-NEXT: v_fma_f32 v20, v35, v34, v20 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v2 +; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v2, v2, v18, v33 +; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX8-NEXT: v_fma_f32 v19, v35, v34, v19 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; GFX8-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX8-NEXT: v_fma_f32 v1, v1, v17, v33 +; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4 +; GFX8-NEXT: v_fma_f32 v18, v35, v34, v18 +; GFX8-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v35, 16, v0 +; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v33, 16, v17 +; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX8-NEXT: v_fma_f32 v0, v0, v16, v17 +; GFX8-NEXT: v_bfe_u32 v16, v31, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v31 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX8-NEXT: v_or_b32_e32 v17, 
0x400000, v31 +; GFX8-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc +; GFX8-NEXT: v_bfe_u32 v17, v15, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX8-NEXT: v_or_b32_e32 v15, 0x400000, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc +; GFX8-NEXT: v_bfe_u32 v17, v32, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v32 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX8-NEXT: v_or_b32_e32 v31, 0x400000, v32 +; GFX8-NEXT: v_cndmask_b32_e32 v17, v17, v31, vcc +; GFX8-NEXT: v_bfe_u32 v31, v14, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v14 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX8-NEXT: v_or_b32_e32 v14, 0x400000, v14 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v31, v14, vcc +; GFX8-NEXT: v_bfe_u32 v31, v30, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v30 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX8-NEXT: v_or_b32_e32 v30, 0x400000, v30 +; GFX8-NEXT: v_cndmask_b32_e32 v30, v31, v30, vcc +; GFX8-NEXT: v_bfe_u32 v31, v13, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v13 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX8-NEXT: v_or_b32_e32 v13, 0x400000, v13 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v31, v13, vcc +; GFX8-NEXT: v_bfe_u32 v31, v29, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v29 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX8-NEXT: v_or_b32_e32 v29, 0x400000, v29 +; GFX8-NEXT: v_cndmask_b32_e32 v29, v31, v29, vcc +; GFX8-NEXT: v_bfe_u32 v31, v12, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v12 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX8-NEXT: v_or_b32_e32 v12, 0x400000, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v31, v12, vcc +; GFX8-NEXT: v_bfe_u32 v31, v28, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v28 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX8-NEXT: v_or_b32_e32 v28, 0x400000, v28 +; GFX8-NEXT: v_cndmask_b32_e32 v28, v31, v28, vcc +; GFX8-NEXT: v_bfe_u32 v31, v11, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v11 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v31, v11, vcc +; GFX8-NEXT: v_bfe_u32 v31, v27, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v27 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX8-NEXT: v_or_b32_e32 v27, 0x400000, v27 +; GFX8-NEXT: v_cndmask_b32_e32 v27, v31, v27, vcc +; GFX8-NEXT: v_bfe_u32 v31, v10, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v10 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v31, v10, vcc +; GFX8-NEXT: v_bfe_u32 v31, v26, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v26 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX8-NEXT: v_or_b32_e32 v26, 0x400000, v26 +; GFX8-NEXT: v_cndmask_b32_e32 v26, v31, v26, vcc +; GFX8-NEXT: v_bfe_u32 v31, v9, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v9 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX8-NEXT: v_or_b32_e32 
v9, 0x400000, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v31, v9, vcc +; GFX8-NEXT: v_bfe_u32 v31, v25, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v25 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; GFX8-NEXT: v_or_b32_e32 v25, 0x400000, v25 +; GFX8-NEXT: v_cndmask_b32_e32 v25, v31, v25, vcc +; GFX8-NEXT: v_bfe_u32 v31, v8, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v8 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v31, v8, vcc +; GFX8-NEXT: v_bfe_u32 v31, v24, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v24 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX8-NEXT: v_or_b32_e32 v24, 0x400000, v24 +; GFX8-NEXT: v_cndmask_b32_e32 v24, v31, v24, vcc +; GFX8-NEXT: v_bfe_u32 v31, v7, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v7 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v31, v7, vcc +; GFX8-NEXT: v_bfe_u32 v31, v23, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v23 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX8-NEXT: v_or_b32_e32 v23, 0x400000, v23 +; GFX8-NEXT: v_cndmask_b32_e32 v23, v31, v23, vcc +; GFX8-NEXT: v_bfe_u32 v31, v6, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v6 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v31, v6, vcc +; GFX8-NEXT: v_bfe_u32 v31, v22, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v22 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX8-NEXT: v_or_b32_e32 v22, 0x400000, v22 +; GFX8-NEXT: v_cndmask_b32_e32 v22, v31, v22, vcc +; GFX8-NEXT: v_bfe_u32 v31, v5, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v5 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v31, v5, vcc +; GFX8-NEXT: v_bfe_u32 v31, v21, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v21 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX8-NEXT: v_or_b32_e32 v21, 0x400000, v21 +; GFX8-NEXT: v_cndmask_b32_e32 v21, v31, v21, vcc +; GFX8-NEXT: v_bfe_u32 v31, v4, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v4 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v31, v4, vcc +; GFX8-NEXT: v_bfe_u32 v31, v20, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v20 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX8-NEXT: v_or_b32_e32 v20, 0x400000, v20 +; GFX8-NEXT: v_cndmask_b32_e32 v20, v31, v20, vcc +; GFX8-NEXT: v_bfe_u32 v31, v3, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v3 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v31, v3, vcc +; GFX8-NEXT: v_bfe_u32 v31, v19, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v19 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX8-NEXT: v_or_b32_e32 v19, 0x400000, v19 +; GFX8-NEXT: 
v_cndmask_b32_e32 v19, v31, v19, vcc +; GFX8-NEXT: v_bfe_u32 v31, v2, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v2 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v31, v2, vcc +; GFX8-NEXT: v_bfe_u32 v31, v18, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v18 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX8-NEXT: v_cndmask_b32_e32 v18, v31, v18, vcc +; GFX8-NEXT: v_bfe_u32 v31, v1, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_fma_f32 v33, v35, v34, v33 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v31, v1, vcc +; GFX8-NEXT: v_bfe_u32 v31, v33, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, v31, v33 +; GFX8-NEXT: v_add_u32_e32 v31, vcc, s4, v31 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33 +; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v33 +; GFX8-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc +; GFX8-NEXT: v_bfe_u32 v32, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v0 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_or_b32_e32 v0, 0x400000, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v15 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v13 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX8-NEXT: v_alignbit_b32 v0, v0, v31, 16 +; GFX8-NEXT: v_alignbit_b32 v1, v1, v18, 16 +; GFX8-NEXT: v_alignbit_b32 v2, v2, v19, 16 +; GFX8-NEXT: v_alignbit_b32 v3, v3, v20, 16 +; GFX8-NEXT: v_alignbit_b32 v4, v4, v21, 16 +; GFX8-NEXT: v_alignbit_b32 v5, v5, v22, 16 +; GFX8-NEXT: v_alignbit_b32 v6, v6, v23, 16 +; GFX8-NEXT: v_alignbit_b32 v7, v7, v24, 16 +; GFX8-NEXT: v_alignbit_b32 v8, v8, v25, 16 +; GFX8-NEXT: v_alignbit_b32 v9, v9, v26, 16 +; GFX8-NEXT: v_alignbit_b32 v10, v10, v27, 16 +; GFX8-NEXT: v_alignbit_b32 v11, v11, v28, 16 +; GFX8-NEXT: v_alignbit_b32 v12, v12, v29, 16 +; GFX8-NEXT: v_alignbit_b32 v13, v13, v30, 16 +; GFX8-NEXT: v_alignbit_b32 v14, v14, v17, 16 +; GFX8-NEXT: v_alignbit_b32 v15, v15, v16, 16 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_fma_v32bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GFX900-NEXT: v_lshlrev_b32_e32 v31, 16, v15 +; GFX900-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v32 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 
v15, v15, v33, v32 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 +; GFX900-NEXT: v_fma_f32 v31, v31, v35, v34 +; GFX900-NEXT: v_lshlrev_b32_e32 v32, 16, v30 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v14 +; GFX900-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX900-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v31, v31 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v14, v14, v30, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56 +; GFX900-NEXT: v_fma_f32 v32, v34, v32, v35 +; GFX900-NEXT: v_lshlrev_b32_e32 v30, 16, v29 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v13 +; GFX900-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX900-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v13, v13, v29, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52 +; GFX900-NEXT: v_fma_f32 v30, v34, v30, v35 +; GFX900-NEXT: v_lshlrev_b32_e32 v29, 16, v28 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v12 +; GFX900-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX900-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v12, v12, v28, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48 +; GFX900-NEXT: v_fma_f32 v29, v34, v29, v35 +; GFX900-NEXT: v_lshlrev_b32_e32 v28, 16, v27 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v11 +; GFX900-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX900-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v11, v11, v27, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44 +; GFX900-NEXT: v_fma_f32 v28, v34, v28, v35 +; GFX900-NEXT: v_lshlrev_b32_e32 v27, 16, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v10 +; GFX900-NEXT: v_and_b32_e32 v26, 0xffff0000, v26 +; GFX900-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v10, v10, v26, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40 +; GFX900-NEXT: v_fma_f32 v27, v34, v27, v35 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v9 +; GFX900-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX900-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v26, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v9, v9, v25, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 +; GFX900-NEXT: v_fma_f32 v26, v35, v34, v26 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v8 +; GFX900-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX900-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v25, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v8, v8, v24, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; GFX900-NEXT: v_fma_f32 v25, v35, 
v34, v25 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v7 +; GFX900-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX900-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v24, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v7, v7, v23, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 +; GFX900-NEXT: v_fma_f32 v24, v35, v34, v24 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v6 +; GFX900-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX900-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v23, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v6, v6, v22, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24 +; GFX900-NEXT: v_fma_f32 v23, v35, v34, v23 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v5 +; GFX900-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX900-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v22, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v5, v5, v21, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GFX900-NEXT: v_fma_f32 v22, v35, v34, v22 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v4 +; GFX900-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX900-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v21, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v4, v4, v20, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; GFX900-NEXT: v_fma_f32 v21, v35, v34, v21 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v3 +; GFX900-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX900-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v20, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v3, v3, v19, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 +; GFX900-NEXT: v_fma_f32 v20, v35, v34, v20 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v2 +; GFX900-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v2, v2, v18, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; GFX900-NEXT: v_fma_f32 v19, v35, v34, v19 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v17 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v1 +; GFX900-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX900-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v18, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v1, v1, v17, v33 +; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 +; GFX900-NEXT: v_fma_f32 v18, v35, v34, v18 +; GFX900-NEXT: v_lshlrev_b32_e32 v34, 16, v16 +; GFX900-NEXT: v_lshlrev_b32_e32 v35, 16, v0 +; GFX900-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX900-NEXT: v_and_b32_e32 v0, 
0xffff0000, v0 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v17, 16, v33 +; GFX900-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX900-NEXT: v_fma_f32 v0, v0, v16, v33 +; GFX900-NEXT: v_bfe_u32 v16, v31, 16, 1 +; GFX900-NEXT: v_add3_u32 v16, v16, v31, s4 +; GFX900-NEXT: v_or_b32_e32 v31, 0x400000, v31 +; GFX900-NEXT: v_cndmask_b32_e32 v16, v16, v31, vcc +; GFX900-NEXT: v_bfe_u32 v31, v15, 16, 1 +; GFX900-NEXT: v_add3_u32 v31, v31, v15, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v15, v15 +; GFX900-NEXT: v_or_b32_e32 v15, 0x400000, v15 +; GFX900-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc +; GFX900-NEXT: v_bfe_u32 v31, v32, 16, 1 +; GFX900-NEXT: v_add3_u32 v31, v31, v32, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v32, v32 +; GFX900-NEXT: v_or_b32_e32 v32, 0x400000, v32 +; GFX900-NEXT: v_cndmask_b32_e32 v31, v31, v32, vcc +; GFX900-NEXT: v_bfe_u32 v32, v14, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v14, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v14, v14 +; GFX900-NEXT: v_or_b32_e32 v14, 0x400000, v14 +; GFX900-NEXT: v_cndmask_b32_e32 v14, v32, v14, vcc +; GFX900-NEXT: v_bfe_u32 v32, v30, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v30, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v30, v30 +; GFX900-NEXT: v_or_b32_e32 v30, 0x400000, v30 +; GFX900-NEXT: v_cndmask_b32_e32 v30, v32, v30, vcc +; GFX900-NEXT: v_bfe_u32 v32, v13, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v13, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v13, v13 +; GFX900-NEXT: v_or_b32_e32 v13, 0x400000, v13 +; GFX900-NEXT: v_cndmask_b32_e32 v13, v32, v13, vcc +; GFX900-NEXT: v_bfe_u32 v32, v29, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v29, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v29, v29 +; GFX900-NEXT: v_or_b32_e32 v29, 0x400000, v29 +; GFX900-NEXT: v_cndmask_b32_e32 v29, v32, v29, vcc +; GFX900-NEXT: v_bfe_u32 v32, v12, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v12, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX900-NEXT: v_or_b32_e32 v12, 0x400000, v12 +; GFX900-NEXT: v_cndmask_b32_e32 v12, v32, v12, vcc +; GFX900-NEXT: v_bfe_u32 v32, v28, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v28, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v28, v28 +; GFX900-NEXT: v_or_b32_e32 v28, 0x400000, v28 +; GFX900-NEXT: v_cndmask_b32_e32 v28, v32, v28, vcc +; GFX900-NEXT: v_bfe_u32 v32, v11, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v11, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 +; GFX900-NEXT: v_or_b32_e32 v11, 0x400000, v11 +; GFX900-NEXT: v_cndmask_b32_e32 v11, v32, v11, vcc +; GFX900-NEXT: v_bfe_u32 v32, v27, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v27, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v27, v27 +; GFX900-NEXT: v_or_b32_e32 v27, 0x400000, v27 +; GFX900-NEXT: v_cndmask_b32_e32 v27, v32, v27, vcc +; GFX900-NEXT: v_bfe_u32 v32, v10, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v10, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v10, v10 +; GFX900-NEXT: v_or_b32_e32 v10, 0x400000, v10 +; GFX900-NEXT: v_cndmask_b32_e32 v10, v32, v10, vcc +; GFX900-NEXT: v_bfe_u32 v32, v26, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v26, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v26, v26 +; GFX900-NEXT: v_or_b32_e32 v26, 0x400000, v26 +; GFX900-NEXT: v_cndmask_b32_e32 v26, v32, v26, vcc +; GFX900-NEXT: v_bfe_u32 v32, v9, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v9, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v9, v9 +; GFX900-NEXT: v_or_b32_e32 v9, 0x400000, v9 +; GFX900-NEXT: v_cndmask_b32_e32 v9, v32, v9, vcc +; GFX900-NEXT: v_bfe_u32 v32, v25, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v25, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v25, v25 +; 
GFX900-NEXT: v_or_b32_e32 v25, 0x400000, v25 +; GFX900-NEXT: v_cndmask_b32_e32 v25, v32, v25, vcc +; GFX900-NEXT: v_bfe_u32 v32, v8, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v8, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX900-NEXT: v_or_b32_e32 v8, 0x400000, v8 +; GFX900-NEXT: v_cndmask_b32_e32 v8, v32, v8, vcc +; GFX900-NEXT: v_bfe_u32 v32, v24, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v24, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v24, v24 +; GFX900-NEXT: v_or_b32_e32 v24, 0x400000, v24 +; GFX900-NEXT: v_cndmask_b32_e32 v24, v32, v24, vcc +; GFX900-NEXT: v_bfe_u32 v32, v7, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v7, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 +; GFX900-NEXT: v_or_b32_e32 v7, 0x400000, v7 +; GFX900-NEXT: v_cndmask_b32_e32 v7, v32, v7, vcc +; GFX900-NEXT: v_bfe_u32 v32, v23, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v23, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v23, v23 +; GFX900-NEXT: v_or_b32_e32 v23, 0x400000, v23 +; GFX900-NEXT: v_cndmask_b32_e32 v23, v32, v23, vcc +; GFX900-NEXT: v_bfe_u32 v32, v6, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v6, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 +; GFX900-NEXT: v_or_b32_e32 v6, 0x400000, v6 +; GFX900-NEXT: v_cndmask_b32_e32 v6, v32, v6, vcc +; GFX900-NEXT: v_bfe_u32 v32, v22, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v22, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v22, v22 +; GFX900-NEXT: v_or_b32_e32 v22, 0x400000, v22 +; GFX900-NEXT: v_cndmask_b32_e32 v22, v32, v22, vcc +; GFX900-NEXT: v_bfe_u32 v32, v5, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v5, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 +; GFX900-NEXT: v_or_b32_e32 v5, 0x400000, v5 +; GFX900-NEXT: v_cndmask_b32_e32 v5, v32, v5, vcc +; GFX900-NEXT: v_bfe_u32 v32, v21, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v21, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v21, v21 +; GFX900-NEXT: v_or_b32_e32 v21, 0x400000, v21 +; GFX900-NEXT: v_cndmask_b32_e32 v21, v32, v21, vcc +; GFX900-NEXT: v_bfe_u32 v32, v4, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v4, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 +; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4 +; GFX900-NEXT: v_cndmask_b32_e32 v4, v32, v4, vcc +; GFX900-NEXT: v_bfe_u32 v32, v20, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v20, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v20, v20 +; GFX900-NEXT: v_or_b32_e32 v20, 0x400000, v20 +; GFX900-NEXT: v_cndmask_b32_e32 v20, v32, v20, vcc +; GFX900-NEXT: v_bfe_u32 v32, v3, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v3, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX900-NEXT: v_or_b32_e32 v3, 0x400000, v3 +; GFX900-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc +; GFX900-NEXT: v_bfe_u32 v32, v19, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v19, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v19, v19 +; GFX900-NEXT: v_or_b32_e32 v19, 0x400000, v19 +; GFX900-NEXT: v_cndmask_b32_e32 v19, v32, v19, vcc +; GFX900-NEXT: v_bfe_u32 v32, v2, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v2, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v2, v32, v2, vcc +; GFX900-NEXT: v_bfe_u32 v32, v18, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v18, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v18, v18 +; GFX900-NEXT: v_or_b32_e32 v18, 0x400000, v18 +; GFX900-NEXT: v_cndmask_b32_e32 v18, v32, v18, vcc +; GFX900-NEXT: v_bfe_u32 v32, v1, 16, 1 +; GFX900-NEXT: v_fma_f32 v17, v35, v34, v17 +; GFX900-NEXT: v_add3_u32 v32, v32, v1, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX900-NEXT: v_or_b32_e32 v1, 0x400000, v1 +; GFX900-NEXT: 
v_cndmask_b32_e32 v1, v32, v1, vcc +; GFX900-NEXT: v_bfe_u32 v32, v17, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v17, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v17, v17 +; GFX900-NEXT: v_or_b32_e32 v17, 0x400000, v17 +; GFX900-NEXT: v_cndmask_b32_e32 v17, v32, v17, vcc +; GFX900-NEXT: v_bfe_u32 v32, v0, 16, 1 +; GFX900-NEXT: v_add3_u32 v32, v32, v0, s4 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_or_b32_e32 v0, 0x400000, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc +; GFX900-NEXT: s_mov_b32 s4, 0x7060302 +; GFX900-NEXT: v_perm_b32 v0, v0, v17, s4 +; GFX900-NEXT: v_perm_b32 v1, v1, v18, s4 +; GFX900-NEXT: v_perm_b32 v2, v2, v19, s4 +; GFX900-NEXT: v_perm_b32 v3, v3, v20, s4 +; GFX900-NEXT: v_perm_b32 v4, v4, v21, s4 +; GFX900-NEXT: v_perm_b32 v5, v5, v22, s4 +; GFX900-NEXT: v_perm_b32 v6, v6, v23, s4 +; GFX900-NEXT: v_perm_b32 v7, v7, v24, s4 +; GFX900-NEXT: v_perm_b32 v8, v8, v25, s4 +; GFX900-NEXT: v_perm_b32 v9, v9, v26, s4 +; GFX900-NEXT: v_perm_b32 v10, v10, v27, s4 +; GFX900-NEXT: v_perm_b32 v11, v11, v28, s4 +; GFX900-NEXT: v_perm_b32 v12, v12, v29, s4 +; GFX900-NEXT: v_perm_b32 v13, v13, v30, s4 +; GFX900-NEXT: v_perm_b32 v14, v14, v31, s4 +; GFX900-NEXT: v_perm_b32 v15, v15, v16, s4 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fma_v32bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:64 +; GFX950-NEXT: scratch_load_dword v36, off, s32 +; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:60 +; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:56 +; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:52 +; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:48 +; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:44 +; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:40 +; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:36 +; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:32 +; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:28 +; GFX950-NEXT: scratch_load_dword v31, off, s32 offset:4 +; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:8 +; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:12 +; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:16 +; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:20 +; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:24 +; GFX950-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v43, 0xffff0000, v14 +; GFX950-NEXT: v_lshlrev_b32_e32 v45, 16, v14 +; GFX950-NEXT: v_and_b32_e32 v46, 0xffff0000, v29 +; GFX950-NEXT: v_lshlrev_b32_e32 v56, 16, v29 +; GFX950-NEXT: v_and_b32_e32 v59, 0xffff0000, v12 +; GFX950-NEXT: v_lshlrev_b32_e32 v61, 16, v12 +; GFX950-NEXT: v_and_b32_e32 v62, 0xffff0000, v27 +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX950-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v42, 0xffff0000, v30 +; GFX950-NEXT: 
v_lshlrev_b32_e32 v44, 16, v30 +; GFX950-NEXT: v_and_b32_e32 v47, 0xffff0000, v13 +; GFX950-NEXT: v_lshlrev_b32_e32 v57, 16, v13 +; GFX950-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v40, 0xffff0000, v15 +; GFX950-NEXT: v_lshlrev_b32_e32 v41, 16, v15 +; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse +; GFX950-NEXT: v_and_b32_e32 v58, 0xffff0000, v28 +; GFX950-NEXT: v_lshlrev_b32_e32 v60, 16, v28 +; GFX950-NEXT: s_waitcnt vmcnt(16) +; GFX950-NEXT: v_and_b32_e32 v15, 0xffff0000, v35 +; GFX950-NEXT: s_waitcnt vmcnt(15) +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v36 +; GFX950-NEXT: v_lshlrev_b32_e32 v63, 16, v36 +; GFX950-NEXT: s_waitcnt vmcnt(14) +; GFX950-NEXT: v_and_b32_e32 v14, 0xffff0000, v38 +; GFX950-NEXT: v_lshlrev_b32_e32 v29, 16, v38 +; GFX950-NEXT: s_waitcnt vmcnt(11) +; GFX950-NEXT: v_and_b32_e32 v36, 0xffff0000, v49 +; GFX950-NEXT: v_and_b32_e32 v38, 0xffff0000, v11 +; GFX950-NEXT: v_fmac_f32_e32 v36, v38, v62 +; GFX950-NEXT: v_lshlrev_b32_e32 v38, 16, v49 +; GFX950-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX950-NEXT: v_and_b32_e32 v13, 0xffff0000, v39 +; GFX950-NEXT: v_lshlrev_b32_e32 v30, 16, v39 +; GFX950-NEXT: v_fmac_f32_e32 v38, v11, v27 +; GFX950-NEXT: s_waitcnt vmcnt(10) +; GFX950-NEXT: v_and_b32_e32 v11, 0xffff0000, v50 +; GFX950-NEXT: v_and_b32_e32 v27, 0xffff0000, v26 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v10 +; GFX950-NEXT: v_fmac_f32_e32 v11, v39, v27 +; GFX950-NEXT: v_lshlrev_b32_e32 v27, 16, v50 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX950-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX950-NEXT: v_fmac_f32_e32 v27, v10, v26 +; GFX950-NEXT: s_waitcnt vmcnt(9) +; GFX950-NEXT: v_and_b32_e32 v10, 0xffff0000, v51 +; GFX950-NEXT: v_and_b32_e32 v26, 0xffff0000, v25 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v9 +; GFX950-NEXT: v_fmac_f32_e32 v10, v39, v26 +; GFX950-NEXT: v_lshlrev_b32_e32 v26, 16, v51 +; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX950-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX950-NEXT: v_fmac_f32_e32 v26, v9, v25 +; GFX950-NEXT: s_waitcnt vmcnt(8) +; GFX950-NEXT: v_and_b32_e32 v9, 0xffff0000, v52 +; GFX950-NEXT: v_and_b32_e32 v25, 0xffff0000, v24 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v8 +; GFX950-NEXT: v_fmac_f32_e32 v9, v39, v25 +; GFX950-NEXT: v_lshlrev_b32_e32 v25, 16, v52 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX950-NEXT: v_fmac_f32_e32 v25, v8, v24 +; GFX950-NEXT: s_waitcnt vmcnt(7) +; GFX950-NEXT: v_and_b32_e32 v8, 0xffff0000, v53 +; GFX950-NEXT: v_and_b32_e32 v24, 0xffff0000, v23 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v7 +; GFX950-NEXT: v_fmac_f32_e32 v8, v39, v24 +; GFX950-NEXT: v_lshlrev_b32_e32 v24, 16, v53 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX950-NEXT: v_fmac_f32_e32 v24, v7, v23 +; GFX950-NEXT: s_waitcnt vmcnt(6) +; GFX950-NEXT: v_and_b32_e32 v7, 0xffff0000, v54 +; GFX950-NEXT: v_and_b32_e32 v23, 0xffff0000, v22 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v6 +; GFX950-NEXT: v_fmac_f32_e32 v7, v39, v23 +; GFX950-NEXT: v_lshlrev_b32_e32 v23, 16, v54 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX950-NEXT: v_fmac_f32_e32 v23, v6, v22 +; GFX950-NEXT: s_waitcnt vmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v6, 0xffff0000, v55 +; GFX950-NEXT: v_and_b32_e32 v22, 0xffff0000, v21 +; 
GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v5 +; GFX950-NEXT: v_fmac_f32_e32 v6, v39, v22 +; GFX950-NEXT: v_lshlrev_b32_e32 v22, 16, v55 +; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX950-NEXT: v_fmac_f32_e32 v22, v5, v21 +; GFX950-NEXT: v_and_b32_e32 v5, 0xffff0000, v37 +; GFX950-NEXT: v_and_b32_e32 v21, 0xffff0000, v20 +; GFX950-NEXT: v_and_b32_e32 v39, 0xffff0000, v4 +; GFX950-NEXT: v_fmac_f32_e32 v5, v39, v21 +; GFX950-NEXT: v_lshlrev_b32_e32 v21, 16, v37 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX950-NEXT: v_fmac_f32_e32 v21, v4, v20 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v34 +; GFX950-NEXT: v_and_b32_e32 v20, 0xffff0000, v19 +; GFX950-NEXT: v_and_b32_e32 v37, 0xffff0000, v3 +; GFX950-NEXT: v_fmac_f32_e32 v4, v37, v20 +; GFX950-NEXT: v_lshlrev_b32_e32 v20, 16, v34 +; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX950-NEXT: v_fmac_f32_e32 v20, v3, v19 +; GFX950-NEXT: v_and_b32_e32 v3, 0xffff0000, v33 +; GFX950-NEXT: v_and_b32_e32 v19, 0xffff0000, v18 +; GFX950-NEXT: v_and_b32_e32 v34, 0xffff0000, v2 +; GFX950-NEXT: v_fmac_f32_e32 v3, v34, v19 +; GFX950-NEXT: v_lshlrev_b32_e32 v19, 16, v33 +; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_fmac_f32_e32 v19, v2, v18 +; GFX950-NEXT: v_and_b32_e32 v2, 0xffff0000, v32 +; GFX950-NEXT: v_and_b32_e32 v18, 0xffff0000, v17 +; GFX950-NEXT: v_and_b32_e32 v33, 0xffff0000, v1 +; GFX950-NEXT: v_fmac_f32_e32 v2, v33, v18 +; GFX950-NEXT: v_lshlrev_b32_e32 v18, 16, v32 +; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_fmac_f32_e32 v18, v1, v17 +; GFX950-NEXT: v_and_b32_e32 v1, 0xffff0000, v31 +; GFX950-NEXT: v_and_b32_e32 v17, 0xffff0000, v16 +; GFX950-NEXT: v_and_b32_e32 v32, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v28, 16, v35 +; GFX950-NEXT: v_fmac_f32_e32 v15, v40, v12 +; GFX950-NEXT: v_and_b32_e32 v12, 0xffff0000, v48 +; GFX950-NEXT: v_lshlrev_b32_e32 v35, 16, v48 +; GFX950-NEXT: v_fmac_f32_e32 v1, v32, v17 +; GFX950-NEXT: v_lshlrev_b32_e32 v17, 16, v31 +; GFX950-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_fmac_f32_e32 v28, v41, v63 +; GFX950-NEXT: v_fmac_f32_e32 v14, v43, v42 +; GFX950-NEXT: v_fmac_f32_e32 v29, v45, v44 +; GFX950-NEXT: v_fmac_f32_e32 v13, v47, v46 +; GFX950-NEXT: v_fmac_f32_e32 v30, v57, v56 +; GFX950-NEXT: v_fmac_f32_e32 v12, v59, v58 +; GFX950-NEXT: v_fmac_f32_e32 v35, v61, v60 +; GFX950-NEXT: v_fmac_f32_e32 v17, v0, v16 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v17, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v1, v18, v2 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v2, v19, v3 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v3, v20, v4 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v4, v21, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v5, v22, v6 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v23, v7 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v7, v24, v8 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v8, v25, v9 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v9, v26, v10 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v10, v27, v11 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v11, v38, v36 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v12, v35, v12 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v13, v30, v13 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v14, v29, v14 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v15, v28, v15 +; GFX950-NEXT: v_accvgpr_read_b32 v63, a15 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v62, a14 ; Reload Reuse +; GFX950-NEXT: 
v_accvgpr_read_b32 v61, a13 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse +; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fma_v32bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_clause 0x8 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:40 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:36 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v15 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v15 +; GFX10-NEXT: v_and_b32_e32 v52, 0xffff0000, v10 +; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v32 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v33 +; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v32 +; GFX10-NEXT: v_and_b32_e32 v32, 0xffff0000, v33 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32 +; GFX10-NEXT: v_fmac_f32_e32 v31, v49, v50 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v30 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v14 +; GFX10-NEXT: v_fmac_f32_e32 v15, v51, v32 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v34 +; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v14 +; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v34 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:28 +; GFX10-NEXT: v_fmac_f32_e32 v32, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v29 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v13 +; GFX10-NEXT: v_fmac_f32_e32 v14, v51, v30 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v30, 16, v35 +; GFX10-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v13 +; GFX10-NEXT: v_and_b32_e32 v13, 0xffff0000, v35 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:24 +; GFX10-NEXT: v_fmac_f32_e32 v30, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v28 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v12 +; GFX10-NEXT: v_fmac_f32_e32 v13, v51, v29 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v29, 16, v36 +; GFX10-NEXT: v_and_b32_e32 v28, 0xffff0000, v28 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v12 +; GFX10-NEXT: v_and_b32_e32 v12, 0xffff0000, v36 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20 +; GFX10-NEXT: v_fmac_f32_e32 v29, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v27 +; GFX10-NEXT: 
v_lshlrev_b32_e32 v50, 16, v11 +; GFX10-NEXT: v_fmac_f32_e32 v12, v51, v28 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v28, 16, v37 +; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v11 +; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v37 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:16 +; GFX10-NEXT: v_fmac_f32_e32 v28, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v26 +; GFX10-NEXT: v_lshlrev_b32_e32 v50, 16, v10 +; GFX10-NEXT: v_fmac_f32_e32 v11, v51, v27 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v27, 16, v38 +; GFX10-NEXT: v_and_b32_e32 v51, 0xffff0000, v26 +; GFX10-NEXT: v_and_b32_e32 v10, 0xffff0000, v38 +; GFX10-NEXT: v_lshlrev_b32_e32 v38, 16, v25 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_lshlrev_b32_e32 v26, 16, v39 +; GFX10-NEXT: v_fmac_f32_e32 v27, v50, v49 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v9 +; GFX10-NEXT: v_fmac_f32_e32 v10, v52, v51 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:8 +; GFX10-NEXT: v_and_b32_e32 v25, 0xffff0000, v25 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v31 +; GFX10-NEXT: v_fmac_f32_e32 v26, v49, v38 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:4 +; GFX10-NEXT: v_and_b32_e32 v49, 0xffff0000, v9 +; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v24 +; GFX10-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v31, v31 +; GFX10-NEXT: v_fmac_f32_e32 v9, v49, v25 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v8 +; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: v_lshlrev_b32_e32 v25, 16, v48 +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX10-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 +; GFX10-NEXT: v_fmac_f32_e32 v25, v49, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v7 +; GFX10-NEXT: v_fmac_f32_e32 v48, v8, v24 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v22 +; GFX10-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v33 +; GFX10-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX10-NEXT: v_fmac_f32_e32 v8, v49, v39 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v6 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX10-NEXT: v_fmac_f32_e32 v33, v7, v23 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v21 +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v34 +; GFX10-NEXT: v_and_b32_e32 v34, 0xffff0000, v34 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v5 +; GFX10-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX10-NEXT: v_fmac_f32_e32 v7, v39, v24 +; GFX10-NEXT: v_fmac_f32_e32 v34, v6, v22 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v4 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v35 +; GFX10-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v19 +; GFX10-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX10-NEXT: v_fmac_f32_e32 v6, v23, v49 +; GFX10-NEXT: v_fmac_f32_e32 v35, v5, v21 +; GFX10-NEXT: v_lshlrev_b32_e32 v23, 16, v3 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX10-NEXT: 
s_waitcnt vmcnt(4) +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v36 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX10-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 +; GFX10-NEXT: v_lshlrev_b32_e32 v49, 16, v18 +; GFX10-NEXT: v_lshlrev_b32_e32 v21, 16, v2 +; GFX10-NEXT: v_fmac_f32_e32 v5, v39, v24 +; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX10-NEXT: v_fmac_f32_e32 v36, v4, v20 +; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v16 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v37 +; GFX10-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v24, 16, v17 +; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX10-NEXT: v_fmac_f32_e32 v39, v23, v22 +; GFX10-NEXT: v_and_b32_e32 v23, 0xffff0000, v37 +; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX10-NEXT: v_fmac_f32_e32 v23, v3, v19 +; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_lshlrev_b32_e32 v37, 16, v50 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v51 +; GFX10-NEXT: v_and_b32_e32 v19, 0xffff0000, v51 +; GFX10-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 +; GFX10-NEXT: v_cmp_u_f32_e64 s5, v33, v33 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v51, 16, v38 +; GFX10-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 +; GFX10-NEXT: v_fmac_f32_e32 v37, v21, v49 +; GFX10-NEXT: v_fmac_f32_e32 v50, v2, v18 +; GFX10-NEXT: v_fmac_f32_e32 v19, v1, v17 +; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v48 +; GFX10-NEXT: v_fmac_f32_e32 v38, v0, v16 +; GFX10-NEXT: v_bfe_u32 v0, v48, 16, 1 +; GFX10-NEXT: v_bfe_u32 v16, v33, 16, 1 +; GFX10-NEXT: v_bfe_u32 v2, v8, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v33 +; GFX10-NEXT: v_bfe_u32 v18, v7, 16, 1 +; GFX10-NEXT: v_bfe_u32 v21, v34, 16, 1 +; GFX10-NEXT: v_add3_u32 v0, v0, v48, 0x7fff +; GFX10-NEXT: v_bfe_u32 v48, v35, 16, 1 +; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff +; GFX10-NEXT: v_bfe_u32 v33, v5, 16, 1 +; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v24 +; GFX10-NEXT: v_fmac_f32_e32 v51, v22, v20 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v8 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v7 +; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v34 +; GFX10-NEXT: v_bfe_u32 v24, v6, 16, 1 +; GFX10-NEXT: v_add3_u32 v2, v2, v8, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s4, v8, v8 +; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v35 +; GFX10-NEXT: v_add3_u32 v18, v18, v7, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s6, v7, v7 +; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX10-NEXT: v_add3_u32 v21, v21, v34, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s7, v34, v34 +; GFX10-NEXT: v_bfe_u32 v34, v39, 16, 1 +; GFX10-NEXT: v_add3_u32 v48, v48, v35, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s9, v35, v35 +; GFX10-NEXT: v_bfe_u32 v35, v23, 16, 1 +; GFX10-NEXT: v_add3_u32 v33, v33, v5, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s10, v5, v5 +; GFX10-NEXT: v_bfe_u32 v5, v37, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v49, 0x400000, v6 +; GFX10-NEXT: v_add3_u32 v24, v24, v6, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s8, v6, v6 +; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v39 +; GFX10-NEXT: v_add3_u32 v34, v34, v39, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s11, v39, v39 +; GFX10-NEXT: v_or_b32_e32 v39, 0x400000, v23 +; GFX10-NEXT: v_add3_u32 v35, v35, v23, 0x7fff +; GFX10-NEXT: v_cmp_u_f32_e64 s12, v23, v23 +; GFX10-NEXT: v_or_b32_e32 v23, 0x400000, v37 +; GFX10-NEXT: v_add3_u32 v5, v5, v37, 0x7fff +; GFX10-NEXT: 
v_cmp_u_f32_e64 s13, v37, v37 +; GFX10-NEXT: v_bfe_u32 v37, v31, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v53, v2, v4, s4 +; GFX10-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v17, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v18, v20, s6 +; GFX10-NEXT: v_add3_u32 v37, v37, v31, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v18, v21, v22, s7 +; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3 +; GFX10-NEXT: v_bfe_u32 v22, v19, 16, 1 +; GFX10-NEXT: v_add3_u32 v4, v4, v3, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v31, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v15, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v15 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v15, v15 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v24, v49, s8 +; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v19 +; GFX10-NEXT: v_add3_u32 v37, v37, v15, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v7, v33, v7, s10 +; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1 +; GFX10-NEXT: v_add3_u32 v22, v22, v19, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v6, v34, v6, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v32, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v32 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v32, v32 +; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v51 +; GFX10-NEXT: v_cndmask_b32_e64 v35, v35, v39, s12 +; GFX10-NEXT: v_add3_u32 v37, v37, v32, 0x7fff +; GFX10-NEXT: v_bfe_u32 v39, v38, 16, 1 +; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v23, s13 +; GFX10-NEXT: v_or_b32_e32 v23, 0x400000, v38 +; GFX10-NEXT: v_cndmask_b32_e64 v32, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v14, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v14 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v14, v14 +; GFX10-NEXT: v_add3_u32 v39, v39, v38, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v50 +; GFX10-NEXT: v_add3_u32 v37, v37, v14, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v8, v48, v8, s9 +; GFX10-NEXT: v_perm_b32 v15, v15, v31, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e64 v14, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v30, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v30 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v30, v30 +; GFX10-NEXT: v_perm_b32 v14, v14, v32, 0x7060302 +; GFX10-NEXT: v_add3_u32 v37, v37, v30, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v30, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v13, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v13 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v13, v13 +; GFX10-NEXT: v_add3_u32 v37, v37, v13, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v13, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v29, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v29 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v29, v29 +; GFX10-NEXT: v_perm_b32 v13, v13, v30, 0x7060302 +; GFX10-NEXT: v_add3_u32 v37, v37, v29, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v29, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v12, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v12, v12 +; GFX10-NEXT: v_add3_u32 v37, v37, v12, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v12, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v28, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v28 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v28, v28 +; GFX10-NEXT: v_perm_b32 v12, v12, v29, 0x7060302 +; GFX10-NEXT: v_add3_u32 v37, v37, v28, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v28, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v11, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v11 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v11, v11 +; GFX10-NEXT: v_add3_u32 v37, v37, v11, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v11, v37, 
v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v27, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v27 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v27, v27 +; GFX10-NEXT: v_perm_b32 v11, v11, v28, 0x7060302 +; GFX10-NEXT: v_add3_u32 v37, v37, v27, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v27, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v10, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v10 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v10, v10 +; GFX10-NEXT: v_add3_u32 v37, v37, v10, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v10, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v26, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v26 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v26, v26 +; GFX10-NEXT: v_perm_b32 v10, v10, v27, 0x7060302 +; GFX10-NEXT: v_add3_u32 v37, v37, v26, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v26, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v9, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v9 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9 +; GFX10-NEXT: v_add3_u32 v37, v37, v9, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v9, v37, v52, s14 +; GFX10-NEXT: v_bfe_u32 v37, v25, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v25 +; GFX10-NEXT: v_cmp_u_f32_e64 s14, v25, v25 +; GFX10-NEXT: v_perm_b32 v9, v9, v26, 0x7060302 +; GFX10-NEXT: v_add3_u32 v37, v37, v25, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e64 v25, v37, v52, s14 +; GFX10-NEXT: v_cndmask_b32_e32 v52, v0, v1, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX10-NEXT: v_bfe_u32 v1, v50, 16, 1 +; GFX10-NEXT: v_bfe_u32 v37, v36, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v36 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v20, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX10-NEXT: v_add3_u32 v1, v1, v50, 0x7fff +; GFX10-NEXT: v_add3_u32 v37, v37, v36, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v4, v22, v24, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX10-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX10-NEXT: v_cndmask_b32_e32 v20, v39, v23, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX10-NEXT: v_perm_b32 v1, v4, v3, 0x7060302 +; GFX10-NEXT: v_perm_b32 v3, v35, v6, 0x7060302 +; GFX10-NEXT: v_perm_b32 v6, v18, v17, 0x7060302 +; GFX10-NEXT: v_perm_b32 v2, v2, v5, 0x7060302 +; GFX10-NEXT: v_cndmask_b32_e32 v22, v37, v0, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v20, v19, 0x7060302 +; GFX10-NEXT: v_perm_b32 v5, v8, v21, 0x7060302 +; GFX10-NEXT: v_perm_b32 v8, v52, v25, 0x7060302 +; GFX10-NEXT: v_perm_b32 v4, v22, v7, 0x7060302 +; GFX10-NEXT: v_perm_b32 v7, v16, v53, 0x7060302 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11TRUE16-LABEL: v_fma_v32bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: s_clause 0x10 +; GFX11TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:64 +; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 +; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:60 +; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:56 +; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:52 +; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:48 +; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:44 +; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:40 +; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:36 +; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:32 +; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:28 +; GFX11TRUE16-NEXT: scratch_load_b32 
v50, off, s32 offset:24 +; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:20 +; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:16 +; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:12 +; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:8 +; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:4 +; GFX11TRUE16-NEXT: v_and_b32_e32 v99, 0xffff0000, v21 +; GFX11TRUE16-NEXT: v_and_b32_e32 v100, 0xffff0000, v5 +; GFX11TRUE16-NEXT: v_and_b32_e32 v101, 0xffff0000, v20 +; GFX11TRUE16-NEXT: v_and_b32_e32 v102, 0xffff0000, v4 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11TRUE16-NEXT: v_and_b32_e32 v115, 0xffff0000, v17 +; GFX11TRUE16-NEXT: v_and_b32_e32 v116, 0xffff0000, v1 +; GFX11TRUE16-NEXT: v_and_b32_e32 v97, 0xffff0000, v22 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX11TRUE16-NEXT: v_and_b32_e32 v117, 0xffff0000, v16 +; GFX11TRUE16-NEXT: v_and_b32_e32 v118, 0xffff0000, v0 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_and_b32_e32 v103, 0xffff0000, v19 +; GFX11TRUE16-NEXT: v_and_b32_e32 v112, 0xffff0000, v3 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v24 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX11TRUE16-NEXT: v_and_b32_e32 v113, 0xffff0000, v18 +; GFX11TRUE16-NEXT: v_and_b32_e32 v114, 0xffff0000, v2 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16) +; GFX11TRUE16-NEXT: v_and_b32_e32 v119, 0xffff0000, v31 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(15) +; GFX11TRUE16-NEXT: v_and_b32_e32 v128, 0xffff0000, v32 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14) +; GFX11TRUE16-NEXT: v_and_b32_e32 v129, 0xffff0000, v33 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v33 +; GFX11TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v13 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11TRUE16-NEXT: v_and_b32_e32 v131, 0xffff0000, v35 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10) +; GFX11TRUE16-NEXT: v_and_b32_e32 v133, 0xffff0000, v37 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(9) +; GFX11TRUE16-NEXT: v_and_b32_e32 v134, 0xffff0000, v38 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v37 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11TRUE16-NEXT: v_and_b32_e32 v144, 0xffff0000, v48 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v48, 16, v48 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11TRUE16-NEXT: v_and_b32_e32 v146, 0xffff0000, v50 +; GFX11TRUE16-NEXT: v_and_b32_e32 v145, 0xffff0000, v49 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v49, 16, v49 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11TRUE16-NEXT: v_and_b32_e32 v147, 0xffff0000, v51 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX11TRUE16-NEXT: v_and_b32_e32 v96, 0xffff0000, v7 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11TRUE16-NEXT: v_and_b32_e32 v148, 0xffff0000, v55 +; GFX11TRUE16-NEXT: v_and_b32_e32 v87, 0xffff0000, v23 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v25 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v146, v100, v99 :: v_dual_lshlrev_b32 v25, 16, v25 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 +; GFX11TRUE16-NEXT: v_and_b32_e32 v98, 0xffff0000, v6 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v9 +; 
GFX11TRUE16-NEXT: v_fmac_f32_e32 v48, v7, v23 +; GFX11TRUE16-NEXT: v_and_b32_e32 v135, 0xffff0000, v39 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v49, v6, v22 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v134, v84, v83 :: v_dual_lshlrev_b32 v13, 16, v13 +; GFX11TRUE16-NEXT: v_bfe_u32 v83, v146, 16, 1 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v51, v4, v20 :: v_dual_fmac_f32 v148, v118, v117 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v144, v96, v87 :: v_dual_and_b32 v81, 0xffff0000, v26 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v55, v0, v16 :: v_dual_lshlrev_b32 v26, 16, v26 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v145, v98, v97 +; GFX11TRUE16-NEXT: v_or_b32_e32 v84, 0x400000, v146 +; GFX11TRUE16-NEXT: v_add3_u32 v83, v83, v146, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v8 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX11TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v10 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v147, v102, v101 :: v_dual_lshlrev_b32 v10, 16, v10 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v38 +; GFX11TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v28 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v37, v10, v26 :: v_dual_lshlrev_b32 v28, 16, v28 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v39, v8, v24 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v133, v82, v81 :: v_dual_and_b32 v70, 0xffff0000, v12 +; GFX11TRUE16-NEXT: v_bfe_u32 v97, v51, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v23, v37, 16, 1 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v135, v86, v85 :: v_dual_lshlrev_b32 v12, 16, v12 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v35, 16, v35 +; GFX11TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v11 +; GFX11TRUE16-NEXT: v_and_b32_e32 v132, 0xffff0000, v36 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v36 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 +; GFX11TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v133 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v37 +; GFX11TRUE16-NEXT: v_or_b32_e32 v98, 0x400000, v51 +; GFX11TRUE16-NEXT: v_add3_u32 v23, v23, v37, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v27 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11TRUE16-NEXT: v_add3_u32 v97, v97, v51, 0x7fff +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v15 +; GFX11TRUE16-NEXT: v_and_b32_e32 v130, 0xffff0000, v34 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v35, v12, v28 :: v_dual_lshlrev_b32 v34, 16, v34 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v36, v11, v27 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v50, v5, v21 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v132, v80, v71 :: v_dual_and_b32 v67, 0xffff0000, v29 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v130, v68, v67 :: v_dual_and_b32 v65, 0xffff0000, v30 +; GFX11TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v36 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v34, v13, v29 :: v_dual_fmac_f32 v31, v15, v32 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v119, v64, v128 :: 
v_dual_and_b32 v66, 0xffff0000, v14 +; GFX11TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v52 +; GFX11TRUE16-NEXT: v_and_b32_e32 v128, 0xffff0000, v53 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v129, v66, v65 :: v_dual_lshlrev_b32 v30, 16, v30 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 +; GFX11TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v54 +; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v64, v112, v103 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v38, v9, v25 +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v131, v70, v69 :: v_dual_lshlrev_b32 v14, 16, v14 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v53, v2, v18 +; GFX11TRUE16-NEXT: v_bfe_u32 v0, v119, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v2, v31, 16, 1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_dual_fmac_f32 v33, v14, v30 :: v_dual_fmac_f32 v52, v3, v19 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v54, v1, v17 +; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v119 +; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v31 +; GFX11TRUE16-NEXT: v_bfe_u32 v4, v129, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v119, 0x7fff +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v119, v119 +; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v31, 0x7fff +; GFX11TRUE16-NEXT: v_cmp_u_f32_e64 s0, v31, v31 +; GFX11TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v129 +; GFX11TRUE16-NEXT: v_bfe_u32 v6, v33, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v14, v132, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v15, v0, v1, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v149, v2, v3, s0 +; GFX11TRUE16-NEXT: v_add3_u32 v2, v4, v129, 0x7fff +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v129, v129 +; GFX11TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v33 +; GFX11TRUE16-NEXT: v_bfe_u32 v8, v130, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v3, v6, v33, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v150, v14, v132, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v14, v2, v5, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v130 +; GFX11TRUE16-NEXT: v_bfe_u32 v10, v34, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v13, v35, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v4, v8, v130, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v33, v3, v7, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v130, v130 +; GFX11TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v34 +; GFX11TRUE16-NEXT: v_bfe_u32 v12, v131, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v6, v10, v34, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v10, v13, v35, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v13, v4, v9, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v131 +; GFX11TRUE16-NEXT: v_add3_u32 v8, v12, v131, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v35 +; GFX11TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v132 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v34, v6, v11, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v131, v131 +; GFX11TRUE16-NEXT: v_bfe_u32 v19, v36, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v21, v133, 16, 1 +; GFX11TRUE16-NEXT: v_bfe_u32 v25, v134, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v134 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v12, v8, v16, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11TRUE16-NEXT: v_add3_u32 v19, v19, v36, 0x7fff +; GFX11TRUE16-NEXT: v_add3_u32 v21, v21, v133, 0x7fff +; GFX11TRUE16-NEXT: v_bfe_u32 v27, v38, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v25, v25, v134, 
0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v16, v10, v17, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v132, v132 +; GFX11TRUE16-NEXT: v_or_b32_e32 v28, 0x400000, v38 +; GFX11TRUE16-NEXT: v_bfe_u32 v29, v135, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v27, v27, v38, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v30, 0x400000, v135 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v11, v150, v18, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11TRUE16-NEXT: v_bfe_u32 v65, v39, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v29, v29, v135, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v66, 0x400000, v39 +; GFX11TRUE16-NEXT: v_bfe_u32 v67, v144, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v17, v19, v20, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133 +; GFX11TRUE16-NEXT: v_add3_u32 v65, v65, v39, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v68, 0x400000, v144 +; GFX11TRUE16-NEXT: v_bfe_u32 v69, v48, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v67, v67, v144, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v10, v21, v22, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11TRUE16-NEXT: v_or_b32_e32 v70, 0x400000, v48 +; GFX11TRUE16-NEXT: v_bfe_u32 v71, v145, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v69, v69, v48, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v80, 0x400000, v145 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v18, v23, v24, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v134, v134 +; GFX11TRUE16-NEXT: v_bfe_u32 v81, v49, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v71, v71, v145, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v82, 0x400000, v49 +; GFX11TRUE16-NEXT: v_bfe_u32 v85, v50, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v9, v25, v26, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11TRUE16-NEXT: v_add3_u32 v81, v81, v49, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v86, 0x400000, v50 +; GFX11TRUE16-NEXT: v_bfe_u32 v87, v147, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v85, v85, v50, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v19, v27, v28, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v135, v135 +; GFX11TRUE16-NEXT: v_or_b32_e32 v96, 0x400000, v147 +; GFX11TRUE16-NEXT: v_add3_u32 v87, v87, v147, 0x7fff +; GFX11TRUE16-NEXT: v_bfe_u32 v99, v64, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v100, 0x400000, v64 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v8, v29, v30, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11TRUE16-NEXT: v_bfe_u32 v101, v52, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v99, v99, v64, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v102, 0x400000, v52 +; GFX11TRUE16-NEXT: v_bfe_u32 v117, v54, 16, 1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v20, v65, v66, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v144, v144 +; GFX11TRUE16-NEXT: v_add3_u32 v101, v101, v52, 0x7fff +; GFX11TRUE16-NEXT: v_or_b32_e32 v118, 0x400000, v54 +; GFX11TRUE16-NEXT: v_bfe_u32 v0, v55, 16, 1 +; GFX11TRUE16-NEXT: v_add3_u32 v117, v117, v54, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v7, v67, v68, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v55 +; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v55, 0x7fff +; GFX11TRUE16-NEXT: v_bfe_u32 v119, v148, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v31, 0x400000, v148 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v21, v69, v70, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v145, v145 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v8.l, v20.h +; GFX11TRUE16-NEXT: v_add3_u32 v119, v119, v148, 0x7fff +; GFX11TRUE16-NEXT: v_mov_b16_e32 v9.l, v19.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h 
+; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v6, v71, v80, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v10.l, v18.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v11.l, v17.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v12.l, v16.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v13.l, v34.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v22, v81, v82, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v14.l, v33.h +; GFX11TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v6.l, v22.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v5, v83, v84, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v23, v85, v86, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v147, v147 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v23.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v4, v87, v96, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v128, v114, v113 +; GFX11TRUE16-NEXT: v_bfe_u32 v113, v53, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v114, 0x400000, v53 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v24, v97, v98, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11TRUE16-NEXT: v_bfe_u32 v103, v128, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v112, 0x400000, v128 +; GFX11TRUE16-NEXT: v_add3_u32 v113, v113, v53, 0x7fff +; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v3, v99, v100, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11TRUE16-NEXT: v_add3_u32 v103, v103, v128, 0x7fff +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v25, v101, v102, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v128, v128 +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v32, v116, v115 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v2, v103, v112, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11TRUE16-NEXT: v_bfe_u32 v115, v32, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v116, 0x400000, v32 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v26, v113, v114, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_add3_u32 v115, v115, v32, 0x7fff +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v26.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v27, v117, v118, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v28, v0, v1, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, v115, v116, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v148, v148 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v27.h +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v119, v31, vcc_lo +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v28.h +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fma_v32bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: s_clause 0x10 +; GFX11FAKE16-NEXT: scratch_load_b32 v31, off, s32 offset:64 +; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32 +; GFX11FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:60 +; GFX11FAKE16-NEXT: scratch_load_b32 v34, off, s32 offset:56 +; GFX11FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:52 +; 
GFX11FAKE16-NEXT: scratch_load_b32 v36, off, s32 offset:48 +; GFX11FAKE16-NEXT: scratch_load_b32 v37, off, s32 offset:44 +; GFX11FAKE16-NEXT: scratch_load_b32 v38, off, s32 offset:40 +; GFX11FAKE16-NEXT: scratch_load_b32 v39, off, s32 offset:36 +; GFX11FAKE16-NEXT: scratch_load_b32 v48, off, s32 offset:32 +; GFX11FAKE16-NEXT: scratch_load_b32 v49, off, s32 offset:28 +; GFX11FAKE16-NEXT: scratch_load_b32 v50, off, s32 offset:24 +; GFX11FAKE16-NEXT: scratch_load_b32 v51, off, s32 offset:20 +; GFX11FAKE16-NEXT: scratch_load_b32 v52, off, s32 offset:16 +; GFX11FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:12 +; GFX11FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:8 +; GFX11FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:4 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v99, 16, v21 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v100, 16, v5 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v97, 16, v22 +; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v22 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v101, 16, v20 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v102, 16, v4 +; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v20 +; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v117, 16, v16 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v118, 16, v0 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v87, 16, v23 +; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v23 +; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v16 +; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v98, 16, v6 +; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v103, 16, v19 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v112, 16, v3 +; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v19 +; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v85, 16, v24 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v113, 16, v18 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v114, 16, v2 +; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v115, 16, v17 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v116, 16, v1 +; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v24 +; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v17 +; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v128, 16, v32 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v129, 16, v33 +; GFX11FAKE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v33 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v68, 16, v13 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v131, 16, v35 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v133, 16, v37 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v134, 16, v38 +; GFX11FAKE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v37 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v144, 16, v48 +; GFX11FAKE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v48 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v146, 16, v50 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v145, 16, v49 +; GFX11FAKE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v49 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v84, 16, v9 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v147, 16, v51 +; GFX11FAKE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v51 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v96, 16, v7 +; GFX11FAKE16-NEXT: 
v_and_b32_e32 v7, 0xffff0000, v7 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v83, 16, v25 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v146, v100, v99 :: v_dual_and_b32 v25, 0xffff0000, v25 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v135, 16, v39 +; GFX11FAKE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v39 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v48, v7, v23 :: v_dual_fmac_f32 v49, v6, v22 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v134, v84, v83 :: v_dual_and_b32 v13, 0xffff0000, v13 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v51, v4, v20 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v144, v96, v87 :: v_dual_lshlrev_b32 v81, 16, v26 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v145, v98, v97 :: v_dual_and_b32 v26, 0xffff0000, v26 +; GFX11FAKE16-NEXT: v_or_b32_e32 v84, 0x400000, v146 +; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v18 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v86, 16, v8 +; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v8 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v82, 16, v10 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v147, v102, v101 :: v_dual_and_b32 v10, 0xffff0000, v10 +; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v9 +; GFX11FAKE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v38 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v69, 16, v28 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v37, v10, v26 :: v_dual_and_b32 v28, 0xffff0000, v28 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v39, v8, v24 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v133, v82, v81 :: v_dual_lshlrev_b32 v70, 16, v12 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v135, v86, v85 :: v_dual_and_b32 v12, 0xffff0000, v12 +; GFX11FAKE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v35 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v80, 16, v11 +; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v11 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v132, 16, v36 +; GFX11FAKE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v36 +; GFX11FAKE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v50 +; GFX11FAKE16-NEXT: v_or_b32_e32 v22, 0x400000, v133 +; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v21 +; GFX11FAKE16-NEXT: v_or_b32_e32 v24, 0x400000, v37 +; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v71, 16, v27 +; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 0xffff0000, v27 +; GFX11FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v32 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v130, 16, v34 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v35, v12, v28 :: v_dual_and_b32 v34, 0xffff0000, v34 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v36, v11, v27 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v50, v5, v21 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v132, v80, v71 :: v_dual_lshlrev_b32 v67, 16, v29 +; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v29 +; GFX11FAKE16-NEXT: v_or_b32_e32 v98, 0x400000, v51 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v119, 16, v31 +; GFX11FAKE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v31 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v15 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v34, v13, v29 :: v_dual_and_b32 v15, 0xffff0000, v15 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v130, v68, v67 :: v_dual_lshlrev_b32 v65, 16, v30 +; GFX11FAKE16-NEXT: v_bfe_u32 v23, v37, 16, 1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v31, v15, v32 :: v_dual_lshlrev_b32 v66, 16, v14 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v119, v64, v128 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v64, 16, v52 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v128, 16, v53 +; 
GFX11FAKE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v53 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v15, 16, v54 +; GFX11FAKE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v54 +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v54, 16, v55 +; GFX11FAKE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v55 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v129, v66, v65 :: v_dual_and_b32 v30, 0xffff0000, v30 +; GFX11FAKE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v52 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v64, v112, v103 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v38, v9, v25 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v131, v70, v69 :: v_dual_and_b32 v14, 0xffff0000, v14 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v53, v2, v18 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v55, v0, v16 +; GFX11FAKE16-NEXT: v_bfe_u32 v0, v119, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v2, v31, 16, 1 +; GFX11FAKE16-NEXT: v_dual_fmac_f32 v33, v14, v30 :: v_dual_fmac_f32 v52, v3, v19 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v32, v1, v17 +; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v119 +; GFX11FAKE16-NEXT: v_or_b32_e32 v3, 0x400000, v31 +; GFX11FAKE16-NEXT: v_bfe_u32 v4, v129, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v119, 0x7fff +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v119, v119 +; GFX11FAKE16-NEXT: v_add3_u32 v2, v2, v31, 0x7fff +; GFX11FAKE16-NEXT: v_cmp_u_f32_e64 s0, v31, v31 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v128, v114, v113 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v54, v118, v117 +; GFX11FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v129 +; GFX11FAKE16-NEXT: v_bfe_u32 v6, v33, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v10, v34, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v14, v35, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v19, v36, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v27, v38, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v65, v39, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v69, v48, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v81, v49, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v85, v50, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v97, v51, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v101, v52, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v113, v53, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v117, v32, 16, 1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v148, v0, v1, vcc_lo +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v149, v2, v3, s0 +; GFX11FAKE16-NEXT: v_add3_u32 v2, v4, v129, 0x7fff +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v129, v129 +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v15, v116, v115 +; GFX11FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v33 +; GFX11FAKE16-NEXT: v_bfe_u32 v8, v130, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v3, v6, v33, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v6, v10, v34, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v10, v14, v35, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v14, v19, v36, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v19, v23, v37, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v23, v27, v38, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v27, v65, v39, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v65, v69, v48, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v69, v81, v49, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v81, v85, v50, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v85, v97, v51, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v97, v101, v52, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v101, v113, v53, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v113, v117, v32, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v117, v2, v5, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v130 +; GFX11FAKE16-NEXT: v_bfe_u32 v12, v131, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v17, v132, 16, 1 +; 
GFX11FAKE16-NEXT: v_bfe_u32 v21, v133, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v25, v134, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v29, v135, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v67, v144, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v71, v145, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v83, v146, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v87, v147, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v99, v64, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v103, v128, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v115, v15, 16, 1 +; GFX11FAKE16-NEXT: v_bfe_u32 v119, v54, 16, 1 +; GFX11FAKE16-NEXT: v_add3_u32 v4, v8, v130, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v33, v3, v7, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v130, v130 +; GFX11FAKE16-NEXT: v_or_b32_e32 v11, 0x400000, v34 +; GFX11FAKE16-NEXT: v_add3_u32 v8, v12, v131, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v12, v17, v132, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v17, v21, v133, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v21, v25, v134, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v25, v29, v135, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v29, v67, v144, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v67, v71, v145, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v71, v83, v146, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v83, v87, v147, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v87, v99, v64, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v99, v103, v128, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v103, v115, v15, 0x7fff +; GFX11FAKE16-NEXT: v_add3_u32 v115, v119, v54, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v119, v4, v9, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11FAKE16-NEXT: v_or_b32_e32 v13, 0x400000, v131 +; GFX11FAKE16-NEXT: v_or_b32_e32 v16, 0x400000, v35 +; GFX11FAKE16-NEXT: v_or_b32_e32 v18, 0x400000, v132 +; GFX11FAKE16-NEXT: v_or_b32_e32 v20, 0x400000, v36 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v34, v6, v11, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v131, v131 +; GFX11FAKE16-NEXT: v_or_b32_e32 v26, 0x400000, v134 +; GFX11FAKE16-NEXT: v_or_b32_e32 v28, 0x400000, v38 +; GFX11FAKE16-NEXT: v_or_b32_e32 v30, 0x400000, v135 +; GFX11FAKE16-NEXT: v_or_b32_e32 v66, 0x400000, v39 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v8, v13, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11FAKE16-NEXT: v_or_b32_e32 v68, 0x400000, v144 +; GFX11FAKE16-NEXT: v_or_b32_e32 v70, 0x400000, v48 +; GFX11FAKE16-NEXT: v_or_b32_e32 v80, 0x400000, v145 +; GFX11FAKE16-NEXT: v_or_b32_e32 v82, 0x400000, v49 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v10, v16, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v132, v132 +; GFX11FAKE16-NEXT: v_or_b32_e32 v86, 0x400000, v50 +; GFX11FAKE16-NEXT: v_or_b32_e32 v96, 0x400000, v147 +; GFX11FAKE16-NEXT: v_or_b32_e32 v100, 0x400000, v64 +; GFX11FAKE16-NEXT: v_or_b32_e32 v102, 0x400000, v52 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v18, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11FAKE16-NEXT: v_or_b32_e32 v112, 0x400000, v128 +; GFX11FAKE16-NEXT: v_or_b32_e32 v116, 0x400000, v15 +; GFX11FAKE16-NEXT: v_or_b32_e32 v118, 0x400000, v32 +; GFX11FAKE16-NEXT: v_or_b32_e32 v31, 0x400000, v54 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v20, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v133, v133 +; GFX11FAKE16-NEXT: v_bfe_u32 v0, v55, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v55 +; GFX11FAKE16-NEXT: v_or_b32_e32 v114, 0x400000, v53 +; GFX11FAKE16-NEXT: v_perm_b32 v11, v12, v11, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v17, v22, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; 
GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v55, 0x7fff +; GFX11FAKE16-NEXT: v_perm_b32 v12, v16, v13, 0x7060302 +; GFX11FAKE16-NEXT: v_perm_b32 v13, v34, v119, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v14, v19, v24, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v134, v134 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v10, v14, v10, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v21, v26, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v38, v38 +; GFX11FAKE16-NEXT: v_perm_b32 v14, v33, v117, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v23, v28, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v135, v135 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v9, v17, v9, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v25, v30, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v39, v39 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v18, v27, v66, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v144, v144 +; GFX11FAKE16-NEXT: v_perm_b32 v8, v18, v8, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v29, v68, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v48, v48 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v65, v70, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v145, v145 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v7, v19, v7, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v67, v80, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v49, v49 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v20, v69, v82, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v146, v146 +; GFX11FAKE16-NEXT: v_perm_b32 v6, v20, v6, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v71, v84, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v21, v81, v86, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v147, v147 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v5, v21, v5, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v83, v96, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v64, v64 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v87, v100, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v97, v102, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v128, v128 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v3, v22, v3, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v99, v112, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v103, v116, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v113, v118, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v54, v54 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v115, v31, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v53, v53 +; GFX11FAKE16-NEXT: v_perm_b32 v1, v23, v15, 0x7060302 +; GFX11FAKE16-NEXT: v_perm_b32 v15, v149, v148, 0x7060302 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v24, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v101, v114, vcc_lo +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v51, v51 +; GFX11FAKE16-NEXT: 
v_perm_b32 v2, v25, v2, 0x7060302 +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v85, v98, vcc_lo +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_perm_b32 v4, v26, v4, 0x7060302 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fma_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_clause 0x10 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 offset:64 +; GFX1250-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX1250-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX1250-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX1250-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX1250-NEXT: scratch_load_b32 v36, off, s32 offset:20 +; GFX1250-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX1250-NEXT: scratch_load_b32 v38, off, s32 offset:28 +; GFX1250-NEXT: scratch_load_b32 v39, off, s32 offset:32 +; GFX1250-NEXT: scratch_load_b32 v48, off, s32 offset:36 +; GFX1250-NEXT: scratch_load_b32 v49, off, s32 offset:40 +; GFX1250-NEXT: scratch_load_b32 v50, off, s32 offset:44 +; GFX1250-NEXT: scratch_load_b32 v51, off, s32 offset:48 +; GFX1250-NEXT: scratch_load_b32 v52, off, s32 offset:52 +; GFX1250-NEXT: scratch_load_b32 v53, off, s32 offset:56 +; GFX1250-NEXT: scratch_load_b32 v54, off, s32 offset:60 +; GFX1250-NEXT: scratch_load_b32 v55, off, s32 +; GFX1250-NEXT: s_wait_loadcnt 0xf +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v16, v32 +; GFX1250-NEXT: s_wait_loadcnt 0xe +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v17, v33 +; GFX1250-NEXT: s_wait_loadcnt 0xd +; GFX1250-NEXT: v_pk_fma_bf16 v2, v2, v18, v34 +; GFX1250-NEXT: s_wait_loadcnt 0xc +; GFX1250-NEXT: v_pk_fma_bf16 v3, v3, v19, v35 +; GFX1250-NEXT: s_wait_loadcnt 0xb +; GFX1250-NEXT: v_pk_fma_bf16 v4, v4, v20, v36 +; GFX1250-NEXT: s_wait_loadcnt 0xa +; GFX1250-NEXT: v_pk_fma_bf16 v5, v5, v21, v37 +; GFX1250-NEXT: s_wait_loadcnt 0x9 +; GFX1250-NEXT: v_pk_fma_bf16 v6, v6, v22, v38 +; GFX1250-NEXT: s_wait_loadcnt 0x8 +; GFX1250-NEXT: v_pk_fma_bf16 v7, v7, v23, v39 +; GFX1250-NEXT: s_wait_loadcnt 0x7 +; GFX1250-NEXT: v_pk_fma_bf16 v8, v8, v24, v48 +; GFX1250-NEXT: s_wait_loadcnt 0x6 +; GFX1250-NEXT: v_pk_fma_bf16 v9, v9, v25, v49 +; GFX1250-NEXT: s_wait_loadcnt 0x5 +; GFX1250-NEXT: v_pk_fma_bf16 v10, v10, v26, v50 +; GFX1250-NEXT: s_wait_loadcnt 0x4 +; GFX1250-NEXT: v_pk_fma_bf16 v11, v11, v27, v51 +; GFX1250-NEXT: s_wait_loadcnt 0x3 +; GFX1250-NEXT: v_pk_fma_bf16 v12, v12, v28, v52 +; GFX1250-NEXT: s_wait_loadcnt 0x2 +; GFX1250-NEXT: v_pk_fma_bf16 v13, v13, v29, v53 +; GFX1250-NEXT: s_wait_loadcnt 0x1 +; GFX1250-NEXT: v_pk_fma_bf16 v14, v14, v30, v54 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v15, v15, v55, v31 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %op = call <32 x bfloat> @llvm.fma.v32bf16(<32 x bfloat> %a, <32 x bfloat> %b, <32 x bfloat> %c) + ret <32 x bfloat> %op +} + +declare bfloat @llvm.fmuladd.bf16(bfloat, bfloat, bfloat) +declare <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat>, <2 x bfloat>, <2 x bfloat>) +declare <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat>, <3 x bfloat>, <3 x bfloat>) +declare <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat>, <4 x bfloat>, <4 x bfloat>) + +define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { +; GCN-LABEL: v_fmuladd_bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; 
GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GCN-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: v_fmuladd_bf16: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX7-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fmuladd_bf16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1 +; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-LABEL: v_fmuladd_bf16: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX900-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX900-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX900-NEXT: s_movk_i32 s4, 0x7fff +; GFX900-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX900-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-LABEL: v_fmuladd_bf16: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX950-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v2, s0 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_fmuladd_bf16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX10-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX10-NEXT: v_add3_u32 v0, v0, v2, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11TRUE16-LABEL: v_fmuladd_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 
v1.l, v3.l +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_bfe_u32 v0, v3, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fmuladd_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_fmac_f32_e32 v2, v0, v1 +; GFX11FAKE16-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX11FAKE16-NEXT: v_or_b32_e32 v1, 0x400000, v2 +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_add3_u32 v0, v0, v2, 0x7fff +; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmuladd_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %op = call bfloat @llvm.fmuladd.bf16(bfloat %a, bfloat %b, bfloat %c) + ret bfloat %op +} define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) { ; GCN-LABEL: v_fmuladd_v2bf16: @@ -47114,6 +54996,13 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmuladd_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) ret <2 x bfloat> %op } @@ -47394,6 +55283,14 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_alignbit_b32 v1, s0, v3, 16 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmuladd_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <3 x bfloat> @llvm.fmuladd.v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat> %c) ret <3 x bfloat> %op } @@ -47744,6 +55641,16 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl ; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302 ; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_fmuladd_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: 
s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4 +; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c) ret <4 x bfloat> %op } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX1250FAKE16: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 12f8a59f0b84b..d89b39348ad9a 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -17,50 +17,50 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr8_sgpr9, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4) ; GFX90A-NEXT: renamable $sgpr24_sgpr25_sgpr26_sgpr27 = S_LOAD_DWORDX4_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_LOAD_DWORDX2_IMM renamable $sgpr8_sgpr9, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4) ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 0, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_XOR_B64 renamable $sgpr12_sgpr13, -1, implicit-def dead $scc ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 8, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr18_sgpr19, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_CSELECT_B64 -1, 0, implicit killed $scc + ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr30_sgpr31, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr5 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr4 = DS_READ_B32_gfx9 renamable $vgpr5, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3) - ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr28_sgpr29, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.2, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.1.bb103: ; GFX90A-NEXT: successors: %bb.58(0x40000000), %bb.2(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr46_sgpr47:0x000000000000000F, 
$sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr30_sgpr31, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.58, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.2: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46, $sgpr47, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr4, $vgpr5 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56, $sgpr57, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr4, $vgpr5 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 ; GFX90A-NEXT: renamable $vgpr3 = IMPLICIT_DEF implicit-def $vgpr2 - ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20 - ; GFX90A-NEXT: renamable $vgpr23 = IMPLICIT_DEF implicit-def $vgpr22 ; GFX90A-NEXT: renamable $vgpr25 = IMPLICIT_DEF implicit-def $vgpr24 + ; GFX90A-NEXT: renamable $vgpr27 = IMPLICIT_DEF implicit-def $vgpr26 + ; GFX90A-NEXT: renamable $vgpr29 = IMPLICIT_DEF implicit-def $vgpr28 ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.3.Flow17: ; GFX90A-NEXT: successors: %bb.4(0x40000000), %bb.57(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 
; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.57, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: ; GFX90A-NEXT: successors: %bb.35(0x40000000), %bb.5(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = V_LSHLREV_B64_e64 2, $vgpr4_vgpr5, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = COPY renamable $sgpr25, implicit $exec @@ -75,12 +75,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.5: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 @@ -98,47 +98,49 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr32 = 
IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.6.Flow20: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr20 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr22 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr24 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr25 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; 
GFX90A-NEXT: renamable $vgpr26 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr28 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr29 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr15 = COPY killed renamable $sgpr18, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr21 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr23 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr25 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr27 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 - ; GFX90A-NEXT: $sgpr24_sgpr25 = 
S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $sgpr36_sgpr37, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.62, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.8.Flow32: ; GFX90A-NEXT: successors: %bb.9(0x40000000), %bb.10(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc - ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr18_sgpr19, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.10, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.9.bb89: ; GFX90A-NEXT: successors: %bb.10(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, 
$sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr10, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) @@ -146,16 +148,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.10.Flow33: ; GFX90A-NEXT: successors: %bb.11(0x40000000), %bb.12(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc - ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr46_sgpr47, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.12, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.11.bb84: ; GFX90A-NEXT: successors: %bb.12(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, 
$sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr9, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr8, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) @@ -163,16 +165,16 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.12.Flow34: ; GFX90A-NEXT: successors: %bb.13(0x40000000), %bb.14(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc - ; GFX90A-NEXT: 
$sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr44_sgpr45, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.14, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.13.bb79: ; GFX90A-NEXT: successors: %bb.14(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET renamable $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) @@ -180,10 +182,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.14.Flow35: ; GFX90A-NEXT: successors: %bb.15(0x40000000), %bb.16(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $vgpr0_vgpr1:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, 
$vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc - ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr40_sgpr41, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $sgpr42_sgpr43, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_XOR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_EXECZ %bb.16, implicit $exec ; GFX90A-NEXT: {{ $}} @@ -357,15 +359,15 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.35.bb20: ; GFX90A-NEXT: successors: %bb.37(0x40000000), %bb.36(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_SBYTE renamable $vgpr40_vgpr41, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i23) ; GFX90A-NEXT: renamable $vgpr42 = V_ADD_CO_U32_e32 1024, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 @@ -383,33 +385,33 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr44_vgpr45 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def 
$vgpr14 - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr24_sgpr25 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.37, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.36.Flow21: ; GFX90A-NEXT: successors: %bb.6(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.6 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.37.bb27: ; GFX90A-NEXT: successors: %bb.39(0x40000000), %bb.38(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, 
$sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr44_sgpr45, $sgpr42_sgpr43, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr40_sgpr41, $sgpr46_sgpr47, $sgpr44_sgpr45, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr66_sgpr67, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr40_vgpr41, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i30) ; GFX90A-NEXT: renamable $vgpr44 = V_ADD_CO_U32_e32 2048, $vgpr40, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr45, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec @@ -422,49 +424,51 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr56_vgpr57 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.39, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.38.Flow22: ; GFX90A-NEXT: successors: %bb.36(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, 
$sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr38_sgpr39, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr38_sgpr39 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable 
$sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr40_sgpr41, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_ANDN2_B64 killed renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr46_sgpr47, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_OR_B64 killed renamable $sgpr36_sgpr37, killed renamable $sgpr56_sgpr57, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.36 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.39.bb34: ; GFX90A-NEXT: successors: %bb.41(0x40000000), %bb.40(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr18_sgpr19, $sgpr44_sgpr45, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr52_sgpr53, $sgpr64_sgpr65, $sgpr66_sgpr67 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr46_sgpr47, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr52_sgpr53, $sgpr66_sgpr67 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr40_vgpr41, 3072, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i37) ; GFX90A-NEXT: renamable $vgpr56 = V_ADD_CO_U32_e32 3072, $vgpr40, 
implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr57, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF @@ -472,48 +476,48 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr58_vgpr59 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.41, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.40.Flow23: ; GFX90A-NEXT: successors: %bb.38(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, 
$sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr68_sgpr69, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr40_sgpr41, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr42_sgpr43, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; 
GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_OR_B64 killed renamable $sgpr48_sgpr49, killed renamable $sgpr50_sgpr51, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.38 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.41.bb41: ; GFX90A-NEXT: successors: %bb.46(0x40000000), %bb.42(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr52_sgpr53, $sgpr50_sgpr51, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr52_sgpr53, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr54_sgpr55, $sgpr66_sgpr67, $sgpr68_sgpr69 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr58 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc ; GFX90A-NEXT: renamable $vgpr59, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE renamable $vgpr58_vgpr59, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i44) - ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF @@ -522,271 +526,273 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = 
IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr60_vgpr61 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr42_sgpr43 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.46, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.42.Flow24: ; GFX90A-NEXT: successors: %bb.40(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, 
$vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr42_sgpr43, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr59 = COPY killed renamable $vgpr3, implicit $exec - ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr18_sgpr19, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr42_sgpr43, killed renamable $sgpr50_sgpr51, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.40 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.43.bb55: ; GFX90A-NEXT: successors: %bb.48(0x40000000), %bb.44(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, 
$vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: S_BITCMP1_B32 killed renamable $sgpr17, 16, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_XOR_B64 renamable $sgpr64_sgpr65, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_CSELECT_B64 -1, 0, implicit killed $scc + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_XOR_B64 renamable $sgpr66_sgpr67, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr62 = V_ADD_CO_U32_e32 6144, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr63, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr50_sgpr51, implicit-def dead $scc + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr18_sgpr19, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.48, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.44: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr57, $vgpr56, $vgpr30, $vgpr31, $vgpr60, $vgpr62, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $vgpr61, $vgpr58, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr47, $vgpr46, $vgpr2, $vgpr4, $vgpr5, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr63 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr62, $vgpr56, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8, $sgpr9, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $vgpr57, $vgpr61, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22, $sgpr22_sgpr23, $sgpr24_sgpr25_sgpr26, $sgpr26_sgpr27, $vgpr47, $vgpr46, $vgpr2, $vgpr4, $vgpr5, $vgpr45, $vgpr44, $vgpr43, $vgpr42, $vgpr41, $vgpr40, $vgpr60, $vgpr63, $vgpr58 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; 
GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.45.Flow26: ; GFX90A-NEXT: successors: %bb.47(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr44_sgpr45, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr56_sgpr57, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, 
$vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr46_sgpr47, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr50_sgpr51, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.47 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.46.bb48: ; GFX90A-NEXT: successors: %bb.43(0x40000000), %bb.47(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr64_sgpr65, $sgpr50_sgpr51, $sgpr66_sgpr67, $sgpr44_sgpr45, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, 
$sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr66_sgpr67, $sgpr58_sgpr59, $sgpr68_sgpr69, $sgpr64_sgpr65, $sgpr46_sgpr47, $sgpr54_sgpr55, $sgpr60_sgpr61 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr60 = V_ADD_CO_U32_e32 5120, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = COPY $vcc ; GFX90A-NEXT: renamable $vgpr0 = V_ADD_CO_U32_e32 4096, $vgpr40, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr1, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr0 = FLAT_LOAD_UBYTE killed renamable $vgpr0_vgpr1, 1024, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i51) - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr61, dead renamable $vcc = V_ADDC_U32_e64 0, $vgpr41, killed $sgpr18_sgpr19, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr0, implicit $exec - ; GFX90A-NEXT: renamable $sgpr70_sgpr71 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr0_vgpr1 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr62_vgpr63 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 - ; GFX90A-NEXT: $sgpr18_sgpr19 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: $sgpr44_sgpr45 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.47.Flow25: ; GFX90A-NEXT: successors: %bb.42(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, 
$sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr60_sgpr61, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr58_sgpr59, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr68_sgpr69, $sgpr70_sgpr71, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr44_sgpr45, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_AND_B64 killed renamable $sgpr70_sgpr71, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr68_sgpr69, $exec, 
implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr58_sgpr59, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr66_sgpr67, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_AND_B64 killed renamable $sgpr50_sgpr51, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_AND_B64 killed renamable $sgpr48_sgpr49, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr48_sgpr49, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_OR_B64 killed renamable $sgpr44_sgpr45, killed renamable $sgpr50_sgpr51, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.42 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.48.bb63: ; GFX90A-NEXT: successors: %bb.50(0x40000000), %bb.49(0x40000000) - ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57, $sgpr48_sgpr49 + ; GFX90A-NEXT: liveins: $vcc, $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59, $sgpr48_sgpr49 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.50, implicit $vcc ; GFX90A-NEXT: {{ $}} ; 
GFX90A-NEXT: bb.49: ; GFX90A-NEXT: successors: %bb.44(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr48_sgpr49 = S_MOV_B64 -1 ; GFX90A-NEXT: S_BRANCH %bb.44 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.50.bb68: ; GFX90A-NEXT: successors: %bb.54(0x40000000), %bb.51(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, 
$vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec ; GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def dead $scc + ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr18_sgpr19, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.54, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.51: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr58_sgpr59 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr6_vgpr7 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF + ; GFX90A-NEXT: 
renamable $vgpr12 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.52.bb80: ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr8 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr9, dead renamable $sgpr52_sgpr53 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr9, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, 
$vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 -1 - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: S_BRANCH %bb.61 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.54.bb73: ; GFX90A-NEXT: successors: %bb.52(0x40000000), %bb.55(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr56_sgpr57:0x000000000000000F, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr54_sgpr55, $sgpr60_sgpr61 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr8 = FLAT_LOAD_UBYTE renamable $vgpr0_vgpr1, 2048, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i76) ; GFX90A-NEXT: renamable $vgpr6 = V_ADD_CO_U32_e32 2048, $vgpr0, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $sgpr50_sgpr51 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr58_sgpr59 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr7, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr8, implicit $exec - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vgpr10_vgpr11 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr8_vgpr9 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 - ; GFX90A-NEXT: $sgpr60_sgpr61 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF + ; GFX90A-NEXT: $sgpr62_sgpr63 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.52, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.55.Flow29: ; GFX90A-NEXT: successors: %bb.45(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, 
$sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr56_sgpr57, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr60_sgpr61, implicit-def $scc + ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr62_sgpr63, implicit-def $scc ; GFX90A-NEXT: S_BRANCH %bb.45 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.56.bb90: ; GFX90A-NEXT: successors: %bb.60(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; 
GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr12, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $sgpr21, implicit $exec - ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr12, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr12, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $sgpr46, implicit $exec - ; GFX90A-NEXT: renamable $vgpr13 = V_ALIGNBIT_B32_opsel_e64 0, killed $sgpr47, 0, killed $vgpr12, 0, 1, 0, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr19, 0, $vgpr18, 0, 1, 0, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr19 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr17 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr17, 0, $vgpr16, 0, 1, 0, 0, implicit $exec + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $vgpr12 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr66_sgpr67, implicit $exec + ; GFX90A-NEXT: renamable $vgpr13 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr14 = COPY renamable $sgpr21, implicit $exec + ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr14, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr13, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr13 = COPY renamable $sgpr22, implicit $exec + ; GFX90A-NEXT: renamable $vgpr18_vgpr19 = DS_READ_B64_gfx9 killed renamable $vgpr13, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) + ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_LSHR_B64 killed renamable $sgpr56_sgpr57, 1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = V_LSHRREV_B64_e64 1, $vgpr22_vgpr23, implicit $exec + ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr32_vgpr33 = 
V_LSHRREV_B64_e64 1, $vgpr20_vgpr21, implicit $exec ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $vgpr12 = COPY renamable $vgpr16, implicit $exec + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $vgpr14 = COPY renamable $vgpr20, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.60 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.57: ; GFX90A-NEXT: successors: %bb.7(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr36_sgpr37, $sgpr40_sgpr41, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr19 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr22 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr18 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr14 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr19 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr20 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr16 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr15 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr53 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr42_sgpr43 = S_MOV_B64 0 - ; GFX90A-NEXT: renamable $sgpr40_sgpr41 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr66_sgpr67 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 @@ -812,79 +818,80 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.58.bb105: ; GFX90A-NEXT: successors: %bb.3(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46_sgpr47:0x000000000000000F, 
$sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr40_sgpr41, $sgpr56_sgpr57:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr4_vgpr5:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr26_vgpr27 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec - ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.419, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr2_vgpr3 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr33, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.420, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr24_vgpr25 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr28_vgpr29 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 ; GFX90A-NEXT: S_BRANCH %bb.3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.59.bb85: ; GFX90A-NEXT: successors: %bb.56(0x40000000), %bb.60(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; 
GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 1, $vgpr8, implicit $exec ; GFX90A-NEXT: renamable $vgpr11 = COPY renamable $vgpr9, implicit $exec ; GFX90A-NEXT: renamable $vgpr12 = FLAT_LOAD_UBYTE renamable $vgpr10_vgpr11, 0, 0, implicit $exec, implicit $flat_scr :: (load (s8) from %ir.i86) ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U16_e64 0, killed $vgpr12, implicit $exec - ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = COPY renamable $sgpr36_sgpr37 - ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $sgpr64_sgpr65 = COPY renamable $sgpr36_sgpr37 ; GFX90A-NEXT: renamable $vgpr17 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr32 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr20 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr18 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr15 = IMPLICIT_DEF implicit-def $vgpr14 - ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12 + ; GFX90A-NEXT: renamable $vgpr22 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr12 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18 + ; GFX90A-NEXT: renamable $sgpr18 = IMPLICIT_DEF + ; GFX90A-NEXT: renamable $vgpr14 = IMPLICIT_DEF ; GFX90A-NEXT: $sgpr54_sgpr55 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.56, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.60.Flow31: ; GFX90A-NEXT: successors: %bb.61(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, 
$vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr54_sgpr55, implicit-def $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_MOV_B64 0 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.61.Flow30: ; GFX90A-NEXT: successors: %bb.55(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr60_sgpr61, $sgpr62_sgpr63, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr3, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19:0x0000000000000003, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, 
$sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_XOR_B64 $exec, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_AND_B64 killed renamable $sgpr46_sgpr47, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr58_sgpr59 = S_XOR_B64 $exec, -1, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr60_sgpr61 = S_AND_B64 killed renamable $sgpr54_sgpr55, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_AND_B64 killed renamable $sgpr52_sgpr53, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr46_sgpr47 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_AND_B64 killed renamable $sgpr62_sgpr63, $exec, implicit-def dead $scc - ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr46_sgpr47, killed renamable $sgpr52_sgpr53, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_ANDN2_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr56_sgpr57 = S_AND_B64 killed renamable $sgpr64_sgpr65, $exec, implicit-def dead $scc + ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_OR_B64 killed renamable $sgpr52_sgpr53, killed renamable $sgpr56_sgpr57, implicit-def dead $scc ; GFX90A-NEXT: S_BRANCH %bb.55 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.62.bb140: ; GFX90A-NEXT: successors: %bb.68(0x40000000), %bb.63(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, 
$vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 -1 + ; GFX90A-NEXT: renamable $sgpr24_sgpr25 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr30_sgpr31, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.68, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.63.Flow13: ; GFX90A-NEXT: successors: %bb.64(0x40000000), %bb.66(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, 
$vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc + ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr24_sgpr25, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.66, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.64.bb159: ; GFX90A-NEXT: successors: %bb.67(0x40000000), %bb.65(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec ; GFX90A-NEXT: $sgpr12_sgpr13 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec @@ -893,104 +900,106 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.65.Flow10: ; GFX90A-NEXT: successors: %bb.66(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, 
$sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr12_sgpr13 = S_ANDN2_SAVEEXEC_B64 $sgpr12_sgpr13, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX90A-NEXT: $exec = S_OR_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.66.Flow14: ; GFX90A-NEXT: successors: %bb.8(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $sgpr68_sgpr69 = COPY $exec ; GFX90A-NEXT: S_BRANCH %bb.8 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.67.bb161: ; GFX90A-NEXT: 
successors: %bb.65(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr21, killed $vgpr23, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr13, killed $vgpr3, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr25, killed $vgpr27, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr29, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr15, killed $vgpr3, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr12, 0, $vgpr3, 0, 0, 6, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr4 = V_OR_B32_e32 killed $vgpr52, killed $vgpr15, implicit $exec + ; GFX90A-NEXT: renamable $vgpr4 = V_OR_B32_e32 killed $vgpr16, killed $vgpr19, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr4, killed $vgpr2, implicit $exec - ; GFX90A-NEXT: 
renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr19, 0, $vgpr3, 0, 0, 6, implicit $exec + ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr17, 0, $vgpr3, 0, 0, 6, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr17, implicit $exec + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr32, implicit $exec ; GFX90A-NEXT: DS_WRITE2_B32_gfx9 killed renamable $vgpr3, killed renamable $vgpr2, renamable $vgpr3, 0, 1, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, align 4, addrspace 3) ; GFX90A-NEXT: S_BRANCH %bb.65 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.68.bb174: ; GFX90A-NEXT: successors: %bb.72(0x40000000), %bb.69(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000F, $vgpr22_vgpr23:0x000000000000000F, $vgpr24_vgpr25:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr26 = V_OR_B32_e32 1, $vgpr24, implicit $exec - ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr26, $vgpr22, implicit $exec - ; GFX90A-NEXT: renamable $vgpr34 = V_OR_B32_e32 $vgpr48, $vgpr20, implicit $exec - ; GFX90A-NEXT: renamable $vgpr28 = V_CNDMASK_B32_e64 0, $vgpr34, 0, 0, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr28, $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr38, $vgpr12, implicit $exec - ; GFX90A-NEXT: renamable $vgpr32 = V_OR_B32_e32 $vgpr36, $vgpr14, implicit $exec - ; GFX90A-NEXT: renamable $vgpr50 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr32, killed $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, 
$vgpr14_vgpr15:0x000000000000000F, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000F, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000F, $vgpr26_vgpr27:0x000000000000000F, $vgpr28_vgpr29:0x000000000000000F, $vgpr32_vgpr33:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: renamable $agpr0 = COPY killed renamable $vgpr32, implicit $exec + ; GFX90A-NEXT: renamable $vgpr32 = V_OR_B32_e32 1, $vgpr28, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = V_OR_B32_e32 $vgpr32, $vgpr26, implicit $exec + ; GFX90A-NEXT: renamable $vgpr38 = V_OR_B32_e32 $vgpr52, $vgpr24, implicit $exec + ; GFX90A-NEXT: renamable $vgpr34 = V_CNDMASK_B32_e64 0, $vgpr38, 0, 0, $sgpr12_sgpr13, implicit $exec + ; GFX90A-NEXT: renamable $vgpr50 = V_OR_B32_e32 $vgpr34, $vgpr2, implicit $exec + ; GFX90A-NEXT: renamable $vgpr48 = V_OR_B32_e32 $vgpr50, $vgpr14, implicit $exec + ; GFX90A-NEXT: renamable $vgpr36 = V_OR_B32_e32 $vgpr48, $vgpr18, implicit $exec + ; GFX90A-NEXT: renamable $vgpr54 = V_CNDMASK_B32_e64 0, 0, 0, $vgpr36, killed $sgpr12_sgpr13, implicit $exec ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 -1 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr28_sgpr29, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.72, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.69.Flow: ; GFX90A-NEXT: successors: %bb.70(0x40000000), %bb.71(0x40000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, 
$sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $vcc = S_ANDN2_B64 $exec, killed renamable $sgpr12_sgpr13, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.71, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.70.bb186: ; GFX90A-NEXT: successors: %bb.71(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, 
$vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr4_vgpr5 = V_LSHLREV_B64_e64 3, killed $vgpr4_vgpr5, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = COPY renamable $sgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr4, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr4, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr2, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr2, killed $vgpr5, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr27 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr37 = COPY renamable $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr29 = COPY renamable $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr51 = COPY renamable $vgpr27, implicit $exec - ; GFX90A-NEXT: renamable $vgpr33 = COPY renamable $vgpr27, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, renamable $vgpr26_vgpr27, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr33 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr53 = COPY renamable $vgpr33, implicit $exec + ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr33, implicit $exec + ; GFX90A-NEXT: renamable $vgpr51 = COPY renamable $vgpr33, implicit $exec + ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr33, implicit $exec + ; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr33, implicit $exec + ; GFX90A-NEXT: renamable $vgpr55 = COPY renamable $vgpr33, implicit $exec + ; GFX90A-NEXT: renamable $vgpr37 = COPY renamable $vgpr33, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr33, renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr5 = COPY renamable $sgpr21, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr52_vgpr53, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr13 = COPY killed renamable $sgpr22, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr13, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr33, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr48_vgpr49, 0, 0, 
implicit $exec :: (store (s64) into %ir.7, addrspace 3) - ; GFX90A-NEXT: renamable $vgpr12 = COPY killed renamable $sgpr22, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr12, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into %ir.8, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr38_vgpr39, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr5, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr27, killed renamable $vgpr28_vgpr29, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr50_vgpr51, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr27, killed renamable $vgpr32_vgpr33, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 renamable $vgpr33, killed renamable $vgpr34_vgpr35, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr5, killed renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into %ir.7, addrspace 3) + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr33, killed renamable $vgpr36_vgpr37, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null` + 4, basealign 8, addrspace 5) ; GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(5) null`, align 8, addrspace 5) ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.71.Flow9: ; GFX90A-NEXT: successors: %bb.63(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, 
$vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr36_sgpr37 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $sgpr24_sgpr25 = S_MOV_B64 0 + ; GFX90A-NEXT: renamable $vgpr32 = COPY killed renamable $agpr0, implicit $exec ; GFX90A-NEXT: S_BRANCH %bb.63 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.72.bb196: ; GFX90A-NEXT: successors: %bb.69(0x80000000) - ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr17, $vgpr19, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000C, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x0000000000000003, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x0000000000000003, $vgpr28_vgpr29:0x0000000000000003, $vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr12, $vgpr17, $vgpr30, $vgpr31, $agpr0_agpr1:0x0000000000000003, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000C, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000C, $vgpr16_vgpr17:0x0000000000000003, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x0000000000000003, $vgpr22_vgpr23:0x0000000000000003, $vgpr24_vgpr25:0x000000000000000C, $vgpr26_vgpr27:0x000000000000000C, $vgpr28_vgpr29:0x000000000000000C, 
$vgpr32_vgpr33:0x0000000000000003, $vgpr34_vgpr35:0x0000000000000003, $vgpr36_vgpr37:0x0000000000000003, $vgpr38_vgpr39:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr48_vgpr49:0x0000000000000003, $vgpr50_vgpr51:0x0000000000000003, $vgpr52_vgpr53:0x0000000000000003, $vgpr54_vgpr55:0x0000000000000003, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr50, killed $vgpr18, implicit $exec - ; GFX90A-NEXT: renamable $vgpr54 = V_OR_B32_e32 killed $vgpr2, killed $vgpr16, implicit $exec - ; GFX90A-NEXT: renamable $vgpr55 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec - ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) + ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 $vgpr54, killed $vgpr22, implicit $exec + ; GFX90A-NEXT: renamable $vgpr20 = V_OR_B32_e32 killed $vgpr2, killed $vgpr20, implicit $exec + ; GFX90A-NEXT: renamable $vgpr21 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr21, renamable $vgpr20_vgpr21, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.69 bb: diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 306fe33bfb7ac..8e12e7e03947b 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -2523,7 +2523,7 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; CI-NEXT: s_add_u32 s36, s36, s3 ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 -; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0 +; CI-NEXT: buffer_load_dwordx2 v[3:4], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] @@ -2533,9 +2533,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; CI-NEXT: v_mov_b32_e32 v0, v2 -; CI-NEXT: v_mov_b32_e32 v2, v3 +; CI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 +; CI-NEXT: v_mov_b32_e32 v0, v3 +; CI-NEXT: v_mov_b32_e32 v2, v4 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll index b71885b54b5a2..51652a09863e0 100644 --- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll +++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll @@ -231,7 +231,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) { ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], lit64(0x123456789876) +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0x123456789876 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -434,7 +434,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) { ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], lit64(0x123456789876), v[0:1] +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], 0x123456789876, v[0:1] ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1] ; GFX1250-NEXT: s_endpgm @@ -1210,7 +1210,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) { ; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_sub_nc_u64 s[2:3], lit64(0x123456789876), s[2:3] +; GFX1250-NEXT: s_sub_nc_u64 s[2:3], 0x123456789876, s[2:3] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] ; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -1413,7 +1413,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) { ; GFX1250-NEXT: v_mov_b32_e32 v1, 0 ; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], lit64(0x123456789876), v[0:1] +; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], 0x123456789876, v[0:1] ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: global_store_b64 v1, v[2:3], s[0:1] ; GFX1250-NEXT: s_endpgm @@ -1973,9 +1973,9 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car ; GCN-ISEL-LABEL: name: sudiv64 ; GCN-ISEL-LABEL: body: ; GCN-ISEL-LABEL: bb.3 -; GCN-ISEL: %[[CARRY:[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 +; GCN-ISEL: %[[CARRY:[0-9]+]]:sreg_64_xexec = S_UADDO_PSEUDO ; GCN-ISEL: S_ADD_CO_PSEUDO %{{[0-9]+}}, killed %{{[0-9]+}}, killed %[[CARRY]] -; GCN-ISEL: %[[CARRY:[0-9]+]]:sreg_64_xexec = V_SUB_CO_U32_e64 +; GCN-ISEL: %[[CARRY:[0-9]+]]:sreg_64_xexec = S_USUBO_PSEUDO ; GCN-ISEL: S_SUB_CO_PSEUDO killed %{{[0-9]+}}, %{{[0-9]+}}, %[[CARRY]] define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { @@ -2029,7 +2029,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; CISI-NEXT: v_mul_lo_u32 v4, s1, v0 ; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CISI-NEXT: v_mul_lo_u32 v3, s0, v0 -; CISI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; CISI-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CISI-NEXT: v_mul_lo_u32 v6, v0, v2 ; CISI-NEXT: v_mul_hi_u32 v7, v0, v3 ; CISI-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -2132,18 +2132,18 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; VI-LABEL: sudiv64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_or_b64 s[0:1], s[10:11], s[2:3] -; VI-NEXT: s_mov_b32 s0, 0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_cbranch_scc0 .LBB16_4 +; VI-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] +; VI-NEXT: s_mov_b32 s6, 0 +; VI-NEXT: s_cmp_lg_u64 s[6:7], 0 +; VI-NEXT: s_cbranch_scc0 .LBB16_3 ; VI-NEXT: ; %bb.1: -; VI-NEXT: v_cvt_f32_u32_e32 v0, s2 -; VI-NEXT: v_cvt_f32_u32_e32 v1, s3 -; VI-NEXT: s_sub_u32 s4, 0, s2 -; VI-NEXT: s_subb_u32 s5, 0, s3 +; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; VI-NEXT: v_cvt_f32_u32_e32 v1, s5 +; VI-NEXT: s_sub_u32 s8, 0, s4 +; VI-NEXT: s_subb_u32 s9, 0, s5 ; VI-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2152,17 +2152,17 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: v_madmk_f32 v0, 
v1, 0xcf800000, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v4, v1 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v0 -; VI-NEXT: v_mul_lo_u32 v2, s4, v4 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v5, 0 -; VI-NEXT: v_mul_lo_u32 v3, s5, v5 +; VI-NEXT: v_mul_lo_u32 v2, s8, v4 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s8, v5, 0 +; VI-NEXT: v_mul_lo_u32 v3, s9, v5 ; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 ; VI-NEXT: v_mul_hi_u32 v6, v5, v0 -; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], v5, v3, 0 +; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v3, 0 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v1 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v0, 0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v2, vcc -; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v3, 0 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v3, 0 ; VI-NEXT: v_add_u32_e32 v0, vcc, v6, v0 ; VI-NEXT: v_addc_u32_e32 v0, vcc, v7, v1, vcc ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc @@ -2170,15 +2170,15 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, v5, v0 ; VI-NEXT: v_addc_u32_e32 v7, vcc, v4, v1, vcc -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v6, 0 -; VI-NEXT: v_mul_lo_u32 v4, s4, v7 -; VI-NEXT: v_mul_lo_u32 v5, s5, v6 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s8, v6, 0 +; VI-NEXT: v_mul_lo_u32 v4, s8, v7 +; VI-NEXT: v_mul_lo_u32 v5, s9, v6 ; VI-NEXT: v_mul_hi_u32 v8, v6, v0 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v0, 0 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v7, v0, 0 ; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v1 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v5 -; VI-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v1, 0 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v1, 0 +; VI-NEXT: v_add_u32_e32 v1, vcc, v5, v1 +; VI-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v6, v1, 0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v7, v1, 0 ; VI-NEXT: v_add_u32_e32 v4, vcc, v8, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, v4, v2 @@ -2188,119 +2188,117 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, v6, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, v7, v1, vcc -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; VI-NEXT: v_mul_hi_u32 v4, s10, v2 -; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: v_readfirstlane_b32 s5, v0 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s11, v3, 0 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s11, v2, 0 -; VI-NEXT: v_readfirstlane_b32 s6, v4 -; VI-NEXT: s_add_u32 s0, s6, s5 -; VI-NEXT: s_addc_u32 s1, 0, s4 -; VI-NEXT: v_readfirstlane_b32 s6, v2 -; VI-NEXT: v_readfirstlane_b32 s5, v3 -; VI-NEXT: s_add_u32 s0, s0, s6 -; VI-NEXT: v_readfirstlane_b32 s4, v1 -; VI-NEXT: s_addc_u32 s0, s1, s5 -; VI-NEXT: s_addc_u32 s6, s4, 0 -; VI-NEXT: v_readfirstlane_b32 s1, v0 -; VI-NEXT: s_add_u32 s7, s0, s1 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, 0 -; VI-NEXT: s_addc_u32 s6, 0, s6 -; VI-NEXT: s_mul_i32 s0, s2, s6 -; VI-NEXT: v_readfirstlane_b32 s1, v1 -; VI-NEXT: s_add_i32 s0, s1, s0 -; VI-NEXT: s_mul_i32 s1, s3, s7 -; VI-NEXT: s_add_i32 s12, s0, s1 -; VI-NEXT: s_sub_i32 s0, s11, s12 -; VI-NEXT: v_sub_u32_e32 v0, vcc, s10, v0 -; VI-NEXT: s_cmp_lg_u64 vcc, 0 -; VI-NEXT: s_subb_u32 s13, s0, s3 -; VI-NEXT: v_subrev_u32_e64 v1, s[0:1], s2, v0 -; VI-NEXT: s_cmp_lg_u64 s[0:1], 0 -; VI-NEXT: s_subb_u32 s13, s13, 0 -; VI-NEXT: s_cmp_ge_u32 
s13, s3 -; VI-NEXT: s_cselect_b32 s14, -1, 0 -; VI-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1 -; VI-NEXT: s_cmp_eq_u32 s13, s3 -; VI-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s14 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] -; VI-NEXT: s_add_u32 s0, s7, 1 -; VI-NEXT: s_addc_u32 s13, s6, 0 -; VI-NEXT: s_add_u32 s1, s7, 2 -; VI-NEXT: s_addc_u32 s7, s6, 0 -; VI-NEXT: v_mov_b32_e32 v3, s0 -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; VI-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s13 -; VI-NEXT: v_mov_b32_e32 v4, s7 -; VI-NEXT: s_cmp_lg_u64 vcc, 0 -; VI-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] -; VI-NEXT: s_subb_u32 s0, s11, s12 -; VI-NEXT: s_cmp_ge_u32 s0, s3 -; VI-NEXT: s_cselect_b32 s1, -1, 0 -; VI-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; VI-NEXT: s_cmp_eq_u32 s0, s3 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; VI-NEXT: s_cbranch_execnz .LBB16_3 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v3, 0 +; VI-NEXT: v_mul_hi_u32 v4, s2, v2 +; VI-NEXT: v_readfirstlane_b32 s8, v1 +; VI-NEXT: v_readfirstlane_b32 s9, v0 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s3, v3, 0 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[6:7], s3, v2, 0 +; VI-NEXT: v_readfirstlane_b32 s10, v4 +; VI-NEXT: s_add_u32 s6, s10, s9 +; VI-NEXT: s_addc_u32 s7, 0, s8 +; VI-NEXT: v_readfirstlane_b32 s10, v2 +; VI-NEXT: v_readfirstlane_b32 s9, v3 +; VI-NEXT: s_add_u32 s6, s6, s10 +; VI-NEXT: v_readfirstlane_b32 s8, v1 +; VI-NEXT: s_addc_u32 s6, s7, s9 +; VI-NEXT: s_addc_u32 s8, s8, 0 +; VI-NEXT: v_readfirstlane_b32 s7, v0 +; VI-NEXT: s_add_u32 s12, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s12 +; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v0, 0 +; VI-NEXT: s_addc_u32 s13, 0, s8 +; VI-NEXT: s_mul_i32 s8, s4, s13 +; VI-NEXT: v_readfirstlane_b32 s9, v1 +; VI-NEXT: s_add_i32 s8, s9, s8 +; VI-NEXT: s_mul_i32 s9, s5, s12 +; VI-NEXT: s_add_i32 s14, s8, s9 +; VI-NEXT: s_sub_i32 s10, s3, s14 +; VI-NEXT: v_readfirstlane_b32 s8, v0 +; VI-NEXT: s_sub_u32 s15, s2, s8 +; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 +; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 +; VI-NEXT: s_subb_u32 s16, s10, s5 +; VI-NEXT: s_sub_u32 s17, s15, s4 +; VI-NEXT: s_cselect_b64 s[10:11], -1, 0 +; VI-NEXT: s_cmp_lg_u64 s[10:11], 0 +; VI-NEXT: s_subb_u32 s10, s16, 0 +; VI-NEXT: s_cmp_ge_u32 s10, s5 +; VI-NEXT: s_cselect_b32 s11, -1, 0 +; VI-NEXT: s_cmp_ge_u32 s17, s4 +; VI-NEXT: s_cselect_b32 s16, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s10, s5 +; VI-NEXT: s_cselect_b32 s10, s16, s11 +; VI-NEXT: s_add_u32 s11, s12, 1 +; VI-NEXT: s_addc_u32 s16, s13, 0 +; VI-NEXT: s_add_u32 s17, s12, 2 +; VI-NEXT: s_addc_u32 s18, s13, 0 +; VI-NEXT: s_cmp_lg_u32 s10, 0 +; VI-NEXT: s_cselect_b32 s10, s17, s11 +; VI-NEXT: s_cselect_b32 s11, s18, s16 +; VI-NEXT: s_cmp_lg_u64 s[8:9], 0 +; VI-NEXT: s_subb_u32 s3, s3, s14 +; VI-NEXT: s_cmp_ge_u32 s3, s5 +; VI-NEXT: s_cselect_b32 s8, -1, 0 +; VI-NEXT: s_cmp_ge_u32 s15, s4 +; VI-NEXT: s_cselect_b32 s9, -1, 0 +; VI-NEXT: s_cmp_eq_u32 s3, s5 +; VI-NEXT: s_cselect_b32 s3, s9, s8 +; VI-NEXT: s_cmp_lg_u32 s3, 0 +; VI-NEXT: s_cselect_b32 s9, s11, s13 +; VI-NEXT: s_cselect_b32 s8, s10, s12 +; VI-NEXT: s_cbranch_execnz .LBB16_4 ; VI-NEXT: .LBB16_2: -; VI-NEXT: v_cvt_f32_u32_e32 v0, s2 -; 
VI-NEXT: s_sub_i32 s0, 0, s2 +; VI-NEXT: v_cvt_f32_u32_e32 v0, s4 +; VI-NEXT: s_sub_i32 s3, 0, s4 ; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; VI-NEXT: v_mul_lo_u32 v1, s0, v0 +; VI-NEXT: v_mul_lo_u32 v1, s3, v0 ; VI-NEXT: v_mul_hi_u32 v1, v0, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; VI-NEXT: v_mul_hi_u32 v0, s10, v0 -; VI-NEXT: v_readfirstlane_b32 s0, v0 -; VI-NEXT: s_mul_i32 s0, s0, s2 -; VI-NEXT: s_sub_i32 s0, s10, s0 -; VI-NEXT: s_sub_i32 s1, s0, s2 +; VI-NEXT: v_mul_hi_u32 v0, s2, v0 +; VI-NEXT: v_readfirstlane_b32 s3, v0 +; VI-NEXT: s_mul_i32 s3, s3, s4 +; VI-NEXT: s_sub_i32 s2, s2, s3 +; VI-NEXT: s_sub_i32 s3, s2, s4 ; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 -; VI-NEXT: s_cmp_ge_u32 s0, s2 +; VI-NEXT: s_cmp_ge_u32 s2, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 -; VI-NEXT: s_cselect_b32 s0, s1, s0 +; VI-NEXT: s_cselect_b32 s2, s3, s2 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 -; VI-NEXT: s_cmp_ge_u32 s0, s2 +; VI-NEXT: s_cmp_ge_u32 s2, s4 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_branch .LBB16_5 ; VI-NEXT: .LBB16_3: -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_mov_b32_e32 v3, s9 +; VI-NEXT: ; implicit-def: $sgpr8_sgpr9 +; VI-NEXT: s_branch .LBB16_2 +; VI-NEXT: .LBB16_4: +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: .LBB16_5: +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm -; VI-NEXT: .LBB16_4: -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; VI-NEXT: s_branch .LBB16_2 ; ; GFX9-LABEL: sudiv64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_or_b64 s[0:1], s[10:11], s[2:3] -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX9-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7] +; GFX9-NEXT: s_mov_b32 s4, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX9-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX9-NEXT: s_sub_u32 s0, 0, s2 -; GFX9-NEXT: s_subb_u32 s1, 0, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX9-NEXT: s_sub_u32 s10, 0, s6 +; GFX9-NEXT: s_subb_u32 s11, 0, s7 ; GFX9-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2309,166 +2307,157 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX9-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s6, v1 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 -; GFX9-NEXT: s_mul_i32 s12, s0, s6 -; GFX9-NEXT: s_mul_hi_u32 s14, s0, s7 -; GFX9-NEXT: s_mul_i32 s13, s1, s7 -; GFX9-NEXT: s_add_i32 s12, s14, s12 -; GFX9-NEXT: s_add_i32 s12, s12, s13 -; GFX9-NEXT: s_mul_i32 s15, s0, s7 -; GFX9-NEXT: s_mul_hi_u32 s13, s7, s12 -; GFX9-NEXT: s_mul_i32 s14, s7, s12 -; GFX9-NEXT: s_mul_hi_u32 s7, s7, s15 -; GFX9-NEXT: s_add_u32 s7, s7, s14 +; GFX9-NEXT: v_readfirstlane_b32 s12, v1 +; GFX9-NEXT: v_readfirstlane_b32 s8, v0 +; GFX9-NEXT: s_mul_i32 s9, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s14, s10, s8 +; 
GFX9-NEXT: s_mul_i32 s13, s11, s8 +; GFX9-NEXT: s_add_i32 s9, s14, s9 +; GFX9-NEXT: s_add_i32 s9, s9, s13 +; GFX9-NEXT: s_mul_i32 s15, s10, s8 +; GFX9-NEXT: s_mul_i32 s14, s8, s9 +; GFX9-NEXT: s_mul_hi_u32 s16, s8, s15 +; GFX9-NEXT: s_mul_hi_u32 s13, s8, s9 +; GFX9-NEXT: s_add_u32 s14, s16, s14 ; GFX9-NEXT: s_addc_u32 s13, 0, s13 -; GFX9-NEXT: s_mul_hi_u32 s16, s6, s15 -; GFX9-NEXT: s_mul_i32 s15, s6, s15 -; GFX9-NEXT: s_add_u32 s7, s7, s15 -; GFX9-NEXT: s_mul_hi_u32 s14, s6, s12 -; GFX9-NEXT: s_addc_u32 s7, s13, s16 -; GFX9-NEXT: s_addc_u32 s13, s14, 0 -; GFX9-NEXT: s_mul_i32 s12, s6, s12 -; GFX9-NEXT: s_add_u32 s7, s7, s12 -; GFX9-NEXT: s_addc_u32 s12, 0, s13 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s7, v0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s6, s6, s12 -; GFX9-NEXT: v_readfirstlane_b32 s12, v0 -; GFX9-NEXT: s_mul_i32 s7, s0, s6 -; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 -; GFX9-NEXT: s_add_i32 s7, s13, s7 -; GFX9-NEXT: s_mul_i32 s1, s1, s12 -; GFX9-NEXT: s_add_i32 s7, s7, s1 -; GFX9-NEXT: s_mul_i32 s0, s0, s12 -; GFX9-NEXT: s_mul_hi_u32 s13, s6, s0 -; GFX9-NEXT: s_mul_i32 s14, s6, s0 -; GFX9-NEXT: s_mul_i32 s16, s12, s7 -; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 -; GFX9-NEXT: s_mul_hi_u32 s15, s12, s7 -; GFX9-NEXT: s_add_u32 s0, s0, s16 -; GFX9-NEXT: s_addc_u32 s12, 0, s15 -; GFX9-NEXT: s_add_u32 s0, s0, s14 -; GFX9-NEXT: s_mul_hi_u32 s1, s6, s7 -; GFX9-NEXT: s_addc_u32 s0, s12, s13 -; GFX9-NEXT: s_addc_u32 s1, s1, 0 -; GFX9-NEXT: s_mul_i32 s7, s6, s7 -; GFX9-NEXT: s_add_u32 s0, s0, s7 -; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_addc_u32 s0, s6, s1 -; GFX9-NEXT: v_readfirstlane_b32 s7, v0 -; GFX9-NEXT: s_mul_i32 s6, s10, s0 -; GFX9-NEXT: s_mul_hi_u32 s12, s10, s7 -; GFX9-NEXT: s_mul_hi_u32 s1, s10, s0 -; GFX9-NEXT: s_add_u32 s6, s12, s6 -; GFX9-NEXT: s_addc_u32 s1, 0, s1 -; GFX9-NEXT: s_mul_hi_u32 s13, s11, s7 -; GFX9-NEXT: s_mul_i32 s7, s11, s7 -; GFX9-NEXT: s_add_u32 s6, s6, s7 -; GFX9-NEXT: s_mul_hi_u32 s12, s11, s0 -; GFX9-NEXT: s_addc_u32 s1, s1, s13 -; GFX9-NEXT: s_addc_u32 s6, s12, 0 -; GFX9-NEXT: s_mul_i32 s0, s11, s0 -; GFX9-NEXT: s_add_u32 s7, s1, s0 -; GFX9-NEXT: s_addc_u32 s6, 0, s6 -; GFX9-NEXT: s_mul_i32 s0, s2, s6 -; GFX9-NEXT: s_mul_hi_u32 s1, s2, s7 -; GFX9-NEXT: s_add_i32 s0, s1, s0 -; GFX9-NEXT: s_mul_i32 s1, s3, s7 -; GFX9-NEXT: s_add_i32 s12, s0, s1 -; GFX9-NEXT: s_mul_i32 s1, s2, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: s_sub_i32 s0, s11, s12 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s10, v0 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: s_subb_u32 s13, s0, s3 -; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s2, v0 -; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: s_subb_u32 s13, s13, 0 -; GFX9-NEXT: s_cmp_ge_u32 s13, s3 -; GFX9-NEXT: s_cselect_b32 s14, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v1 -; GFX9-NEXT: s_cmp_eq_u32 s13, s3 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1] -; GFX9-NEXT: s_add_u32 s0, s7, 1 -; GFX9-NEXT: s_addc_u32 s13, s6, 0 -; GFX9-NEXT: s_add_u32 s1, s7, 2 -; GFX9-NEXT: s_addc_u32 s14, s6, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v1, s13 -; GFX9-NEXT: v_mov_b32_e32 v3, s14 -; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 
v1, v3, s[0:1] -; GFX9-NEXT: s_subb_u32 s0, s11, s12 -; GFX9-NEXT: s_cmp_ge_u32 s0, s3 -; GFX9-NEXT: s_cselect_b32 s1, -1, 0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX9-NEXT: s_cmp_eq_u32 s0, s3 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s6 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: s_mul_hi_u32 s17, s12, s15 +; GFX9-NEXT: s_mul_i32 s15, s12, s15 +; GFX9-NEXT: s_add_u32 s14, s14, s15 +; GFX9-NEXT: s_mul_hi_u32 s16, s12, s9 +; GFX9-NEXT: s_addc_u32 s13, s13, s17 +; GFX9-NEXT: s_addc_u32 s14, s16, 0 +; GFX9-NEXT: s_mul_i32 s9, s12, s9 +; GFX9-NEXT: s_add_u32 s9, s13, s9 +; GFX9-NEXT: s_addc_u32 s13, 0, s14 +; GFX9-NEXT: s_add_u32 s14, s8, s9 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_addc_u32 s12, s12, s13 +; GFX9-NEXT: s_mul_i32 s8, s10, s12 +; GFX9-NEXT: s_mul_hi_u32 s9, s10, s14 +; GFX9-NEXT: s_add_i32 s8, s9, s8 +; GFX9-NEXT: s_mul_i32 s11, s11, s14 +; GFX9-NEXT: s_add_i32 s8, s8, s11 +; GFX9-NEXT: s_mul_i32 s10, s10, s14 +; GFX9-NEXT: s_mul_hi_u32 s11, s12, s10 +; GFX9-NEXT: s_mul_i32 s13, s12, s10 +; GFX9-NEXT: s_mul_i32 s16, s14, s8 +; GFX9-NEXT: s_mul_hi_u32 s10, s14, s10 +; GFX9-NEXT: s_mul_hi_u32 s15, s14, s8 +; GFX9-NEXT: s_add_u32 s10, s10, s16 +; GFX9-NEXT: s_addc_u32 s15, 0, s15 +; GFX9-NEXT: s_add_u32 s10, s10, s13 +; GFX9-NEXT: s_mul_hi_u32 s9, s12, s8 +; GFX9-NEXT: s_addc_u32 s10, s15, s11 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mul_i32 s8, s12, s8 +; GFX9-NEXT: s_add_u32 s8, s10, s8 +; GFX9-NEXT: s_addc_u32 s10, 0, s9 +; GFX9-NEXT: s_add_u32 s11, s14, s8 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_addc_u32 s8, s12, s10 +; GFX9-NEXT: s_mul_i32 s10, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s12, s2, s11 +; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8 +; GFX9-NEXT: s_add_u32 s10, s12, s10 +; GFX9-NEXT: s_addc_u32 s9, 0, s9 +; GFX9-NEXT: s_mul_hi_u32 s13, s3, s11 +; GFX9-NEXT: s_mul_i32 s11, s3, s11 +; GFX9-NEXT: s_add_u32 s10, s10, s11 +; GFX9-NEXT: s_mul_hi_u32 s12, s3, s8 +; GFX9-NEXT: s_addc_u32 s9, s9, s13 +; GFX9-NEXT: s_addc_u32 s10, s12, 0 +; GFX9-NEXT: s_mul_i32 s8, s3, s8 +; GFX9-NEXT: s_add_u32 s12, s9, s8 +; GFX9-NEXT: s_addc_u32 s13, 0, s10 +; GFX9-NEXT: s_mul_i32 s8, s6, s13 +; GFX9-NEXT: s_mul_hi_u32 s9, s6, s12 +; GFX9-NEXT: s_add_i32 s8, s9, s8 +; GFX9-NEXT: s_mul_i32 s9, s7, s12 +; GFX9-NEXT: s_add_i32 s14, s8, s9 +; GFX9-NEXT: s_sub_i32 s10, s3, s14 +; GFX9-NEXT: s_mul_i32 s8, s6, s12 +; GFX9-NEXT: s_sub_u32 s15, s2, s8 +; GFX9-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_subb_u32 s16, s10, s7 +; GFX9-NEXT: s_sub_u32 s17, s15, s6 +; GFX9-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GFX9-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GFX9-NEXT: s_subb_u32 s10, s16, 0 +; GFX9-NEXT: s_cmp_ge_u32 s10, s7 +; GFX9-NEXT: s_cselect_b32 s11, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s17, s6 +; GFX9-NEXT: s_cselect_b32 s16, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s10, s7 +; GFX9-NEXT: s_cselect_b32 s10, s16, s11 +; GFX9-NEXT: s_add_u32 s11, s12, 1 +; GFX9-NEXT: s_addc_u32 s16, s13, 0 +; GFX9-NEXT: s_add_u32 s17, s12, 2 +; GFX9-NEXT: s_addc_u32 s18, s13, 0 +; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cselect_b32 s10, s17, s11 +; GFX9-NEXT: s_cselect_b32 s11, 
s18, s16 +; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX9-NEXT: s_subb_u32 s3, s3, s14 +; GFX9-NEXT: s_cmp_ge_u32 s3, s7 +; GFX9-NEXT: s_cselect_b32 s8, -1, 0 +; GFX9-NEXT: s_cmp_ge_u32 s15, s6 +; GFX9-NEXT: s_cselect_b32 s9, -1, 0 +; GFX9-NEXT: s_cmp_eq_u32 s3, s7 +; GFX9-NEXT: s_cselect_b32 s3, s9, s8 +; GFX9-NEXT: s_cmp_lg_u32 s3, 0 +; GFX9-NEXT: s_cselect_b32 s9, s11, s13 +; GFX9-NEXT: s_cselect_b32 s8, s10, s12 ; GFX9-NEXT: s_cbranch_execnz .LBB16_3 ; GFX9-NEXT: .LBB16_2: -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: s_sub_i32 s0, 0, s2 -; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_sub_i32 s3, 0, s6 +; GFX9-NEXT: s_mov_b32 s9, 0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s3, v0 -; GFX9-NEXT: s_mul_i32 s0, s0, s3 -; GFX9-NEXT: s_mul_hi_u32 s0, s3, s0 -; GFX9-NEXT: s_add_i32 s3, s3, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s10, s3 -; GFX9-NEXT: s_mul_i32 s4, s0, s2 -; GFX9-NEXT: s_sub_i32 s4, s10, s4 -; GFX9-NEXT: s_add_i32 s3, s0, 1 -; GFX9-NEXT: s_sub_i32 s5, s4, s2 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s0, s3, s0 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s3, s0, 1 -; GFX9-NEXT: s_cmp_ge_u32 s4, s2 -; GFX9-NEXT: s_cselect_b32 s0, s3, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: s_mul_i32 s3, s3, s4 +; GFX9-NEXT: s_mul_hi_u32 s3, s4, s3 +; GFX9-NEXT: s_add_i32 s4, s4, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, s4 +; GFX9-NEXT: s_mul_i32 s5, s3, s6 +; GFX9-NEXT: s_sub_i32 s2, s2, s5 +; GFX9-NEXT: s_add_i32 s4, s3, 1 +; GFX9-NEXT: s_sub_i32 s5, s2, s6 +; GFX9-NEXT: s_cmp_ge_u32 s2, s6 +; GFX9-NEXT: s_cselect_b32 s3, s4, s3 +; GFX9-NEXT: s_cselect_b32 s2, s5, s2 +; GFX9-NEXT: s_add_i32 s4, s3, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s6 +; GFX9-NEXT: s_cselect_b32 s8, s4, s3 ; GFX9-NEXT: .LBB16_3: +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .LBB16_4: -; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX9-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX9-NEXT: s_branch .LBB16_2 ; ; GFX1010-LABEL: sudiv64: ; GFX1010: ; %bb.0: ; GFX1010-NEXT: s_clause 0x1 -; GFX1010-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX1010-NEXT: s_waitcnt lgkmcnt(0) -; GFX1010-NEXT: s_or_b64 s[4:5], s[10:11], s[2:3] +; GFX1010-NEXT: s_or_b64 s[4:5], s[2:3], s[6:7] ; GFX1010-NEXT: s_mov_b32 s4, 0 ; GFX1010-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1010-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1010-NEXT: ; %bb.1: -; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX1010-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX1010-NEXT: s_sub_u32 s5, 0, s2 -; GFX1010-NEXT: s_subb_u32 s6, 0, s3 +; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX1010-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX1010-NEXT: s_sub_u32 s9, 0, s6 +; GFX1010-NEXT: s_subb_u32 s10, 0, s7 ; GFX1010-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1010-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1010-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2477,160 +2466,158 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1010-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX1010-NEXT: 
v_cvt_u32_f32_e32 v1, v1 ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1010-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1010-NEXT: s_mul_i32 s7, s5, s0 -; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s1 -; GFX1010-NEXT: s_mul_i32 s12, s6, s1 -; GFX1010-NEXT: s_add_i32 s7, s13, s7 -; GFX1010-NEXT: s_mul_i32 s14, s5, s1 -; GFX1010-NEXT: s_add_i32 s7, s7, s12 -; GFX1010-NEXT: s_mul_hi_u32 s13, s1, s14 -; GFX1010-NEXT: s_mul_hi_u32 s15, s0, s14 -; GFX1010-NEXT: s_mul_i32 s12, s0, s14 -; GFX1010-NEXT: s_mul_hi_u32 s14, s1, s7 -; GFX1010-NEXT: s_mul_i32 s1, s1, s7 -; GFX1010-NEXT: s_mul_hi_u32 s16, s0, s7 -; GFX1010-NEXT: s_add_u32 s1, s13, s1 -; GFX1010-NEXT: s_addc_u32 s13, 0, s14 -; GFX1010-NEXT: s_add_u32 s1, s1, s12 -; GFX1010-NEXT: s_mul_i32 s7, s0, s7 -; GFX1010-NEXT: s_addc_u32 s1, s13, s15 -; GFX1010-NEXT: s_addc_u32 s12, s16, 0 -; GFX1010-NEXT: s_add_u32 s1, s1, s7 -; GFX1010-NEXT: s_addc_u32 s7, 0, s12 -; GFX1010-NEXT: v_add_co_u32 v0, s1, v0, s1 -; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1010-NEXT: s_addc_u32 s0, s0, s7 -; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1010-NEXT: s_mul_i32 s7, s5, s0 -; GFX1010-NEXT: s_mul_hi_u32 s12, s5, s1 -; GFX1010-NEXT: s_mul_i32 s6, s6, s1 -; GFX1010-NEXT: s_add_i32 s7, s12, s7 -; GFX1010-NEXT: s_mul_i32 s5, s5, s1 -; GFX1010-NEXT: s_add_i32 s7, s7, s6 -; GFX1010-NEXT: s_mul_hi_u32 s12, s0, s5 -; GFX1010-NEXT: s_mul_i32 s13, s0, s5 -; GFX1010-NEXT: s_mul_hi_u32 s5, s1, s5 -; GFX1010-NEXT: s_mul_hi_u32 s14, s1, s7 -; GFX1010-NEXT: s_mul_i32 s1, s1, s7 -; GFX1010-NEXT: s_mul_hi_u32 s6, s0, s7 -; GFX1010-NEXT: s_add_u32 s1, s5, s1 -; GFX1010-NEXT: s_addc_u32 s5, 0, s14 -; GFX1010-NEXT: s_add_u32 s1, s1, s13 -; GFX1010-NEXT: s_mul_i32 s7, s0, s7 -; GFX1010-NEXT: s_addc_u32 s1, s5, s12 -; GFX1010-NEXT: s_addc_u32 s5, s6, 0 -; GFX1010-NEXT: s_add_u32 s1, s1, s7 -; GFX1010-NEXT: s_addc_u32 s5, 0, s5 -; GFX1010-NEXT: v_add_co_u32 v0, s1, v0, s1 -; GFX1010-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1010-NEXT: s_addc_u32 s0, s0, s5 -; GFX1010-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1010-NEXT: s_mul_i32 s6, s10, s0 -; GFX1010-NEXT: s_mul_hi_u32 s5, s10, s0 -; GFX1010-NEXT: s_mul_hi_u32 s7, s11, s0 -; GFX1010-NEXT: s_mul_i32 s0, s11, s0 -; GFX1010-NEXT: s_mul_hi_u32 s12, s10, s1 -; GFX1010-NEXT: s_mul_hi_u32 s13, s11, s1 -; GFX1010-NEXT: s_mul_i32 s1, s11, s1 -; GFX1010-NEXT: s_add_u32 s6, s12, s6 -; GFX1010-NEXT: s_addc_u32 s5, 0, s5 -; GFX1010-NEXT: s_add_u32 s1, s6, s1 -; GFX1010-NEXT: s_addc_u32 s1, s5, s13 -; GFX1010-NEXT: s_addc_u32 s5, s7, 0 -; GFX1010-NEXT: s_add_u32 s1, s1, s0 -; GFX1010-NEXT: s_addc_u32 s5, 0, s5 -; GFX1010-NEXT: s_mul_hi_u32 s0, s2, s1 -; GFX1010-NEXT: s_mul_i32 s7, s2, s5 -; GFX1010-NEXT: s_mul_i32 s12, s2, s1 -; GFX1010-NEXT: s_add_i32 s0, s0, s7 -; GFX1010-NEXT: v_sub_co_u32 v0, s7, s10, s12 -; GFX1010-NEXT: s_mul_i32 s6, s3, s1 -; GFX1010-NEXT: s_add_i32 s0, s0, s6 -; GFX1010-NEXT: v_sub_co_u32 v1, s12, v0, s2 -; GFX1010-NEXT: s_sub_i32 s6, s11, s0 -; GFX1010-NEXT: s_cmp_lg_u32 s7, 0 -; GFX1010-NEXT: s_subb_u32 s6, s6, s3 -; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 -; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 -; GFX1010-NEXT: s_subb_u32 s6, s6, 0 -; GFX1010-NEXT: s_cmp_ge_u32 s6, s3 -; GFX1010-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX1010-NEXT: v_readfirstlane_b32 s5, v1 +; GFX1010-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1010-NEXT: s_mul_i32 s11, s9, s5 +; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1010-NEXT: s_mul_i32 s12, s10, s8 +; GFX1010-NEXT: s_add_i32 s11, s13, s11 +; GFX1010-NEXT: s_mul_i32 s14, 
s9, s8 +; GFX1010-NEXT: s_add_i32 s11, s11, s12 +; GFX1010-NEXT: s_mul_hi_u32 s13, s8, s14 +; GFX1010-NEXT: s_mul_i32 s16, s8, s11 +; GFX1010-NEXT: s_mul_hi_u32 s15, s5, s14 +; GFX1010-NEXT: s_mul_i32 s12, s5, s14 +; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s11 +; GFX1010-NEXT: s_add_u32 s13, s13, s16 +; GFX1010-NEXT: s_addc_u32 s14, 0, s14 +; GFX1010-NEXT: s_mul_hi_u32 s17, s5, s11 +; GFX1010-NEXT: s_add_u32 s12, s13, s12 +; GFX1010-NEXT: s_mul_i32 s11, s5, s11 +; GFX1010-NEXT: s_addc_u32 s12, s14, s15 +; GFX1010-NEXT: s_addc_u32 s13, s17, 0 +; GFX1010-NEXT: s_add_u32 s11, s12, s11 +; GFX1010-NEXT: s_addc_u32 s12, 0, s13 +; GFX1010-NEXT: s_add_u32 s8, s8, s11 +; GFX1010-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1010-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1010-NEXT: s_mul_i32 s11, s9, s8 +; GFX1010-NEXT: s_addc_u32 s5, s5, s12 +; GFX1010-NEXT: s_mul_i32 s10, s10, s8 +; GFX1010-NEXT: s_mul_i32 s9, s9, s5 +; GFX1010-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX1010-NEXT: s_add_i32 s9, s13, s9 +; GFX1010-NEXT: s_mul_hi_u32 s13, s5, s11 +; GFX1010-NEXT: s_add_i32 s9, s9, s10 +; GFX1010-NEXT: s_mul_i32 s10, s5, s11 +; GFX1010-NEXT: s_mul_i32 s15, s8, s9 +; GFX1010-NEXT: s_mul_hi_u32 s14, s8, s9 +; GFX1010-NEXT: s_add_u32 s12, s12, s15 +; GFX1010-NEXT: s_addc_u32 s14, 0, s14 +; GFX1010-NEXT: s_mul_hi_u32 s11, s5, s9 +; GFX1010-NEXT: s_add_u32 s10, s12, s10 +; GFX1010-NEXT: s_mul_i32 s9, s5, s9 +; GFX1010-NEXT: s_addc_u32 s10, s14, s13 +; GFX1010-NEXT: s_addc_u32 s11, s11, 0 +; GFX1010-NEXT: s_add_u32 s9, s10, s9 +; GFX1010-NEXT: s_addc_u32 s10, 0, s11 +; GFX1010-NEXT: s_add_u32 s8, s8, s9 +; GFX1010-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1010-NEXT: s_mul_hi_u32 s11, s2, s8 +; GFX1010-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1010-NEXT: s_mul_hi_u32 s9, s3, s8 +; GFX1010-NEXT: s_addc_u32 s5, s5, s10 +; GFX1010-NEXT: s_mul_i32 s8, s3, s8 +; GFX1010-NEXT: s_mul_i32 s12, s2, s5 +; GFX1010-NEXT: s_mul_hi_u32 s10, s2, s5 +; GFX1010-NEXT: s_add_u32 s11, s11, s12 +; GFX1010-NEXT: s_addc_u32 s10, 0, s10 +; GFX1010-NEXT: s_mul_hi_u32 s13, s3, s5 +; GFX1010-NEXT: s_add_u32 s8, s11, s8 +; GFX1010-NEXT: s_mul_i32 s5, s3, s5 +; GFX1010-NEXT: s_addc_u32 s8, s10, s9 +; GFX1010-NEXT: s_addc_u32 s9, s13, 0 +; GFX1010-NEXT: s_add_u32 s5, s8, s5 +; GFX1010-NEXT: s_addc_u32 s8, 0, s9 +; GFX1010-NEXT: s_mul_hi_u32 s9, s6, s5 +; GFX1010-NEXT: s_mul_i32 s10, s6, s8 +; GFX1010-NEXT: s_mul_i32 s11, s7, s5 +; GFX1010-NEXT: s_add_i32 s9, s9, s10 +; GFX1010-NEXT: s_mul_i32 s10, s6, s5 +; GFX1010-NEXT: s_add_i32 s9, s9, s11 +; GFX1010-NEXT: s_sub_i32 s11, s3, s9 +; GFX1010-NEXT: s_sub_u32 s10, s2, s10 ; GFX1010-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1010-NEXT: s_cmp_eq_u32 s6, s3 -; GFX1010-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1010-NEXT: s_add_u32 s6, s1, 1 -; GFX1010-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1010-NEXT: s_addc_u32 s12, s5, 0 -; GFX1010-NEXT: s_add_u32 s13, s1, 2 -; GFX1010-NEXT: s_addc_u32 s14, s5, 0 -; GFX1010-NEXT: s_cmp_lg_u32 s7, 0 -; GFX1010-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 -; GFX1010-NEXT: s_subb_u32 s0, s11, s0 -; GFX1010-NEXT: v_mov_b32_e32 v2, s13 -; GFX1010-NEXT: s_cmp_ge_u32 s0, s3 -; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX1010-NEXT: s_cselect_b32 s7, -1, 0 -; GFX1010-NEXT: s_cmp_eq_u32 s0, s3 -; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX1010-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1010-NEXT: v_mov_b32_e32 v1, s14 -; GFX1010-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0 -; GFX1010-NEXT: v_cndmask_b32_e32 v2, s6, v2, vcc_lo -; GFX1010-NEXT: v_cndmask_b32_e32 v1, 
s12, v1, vcc_lo -; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1010-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo -; GFX1010-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo +; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 +; GFX1010-NEXT: s_subb_u32 s11, s11, s7 +; GFX1010-NEXT: s_sub_u32 s13, s10, s6 +; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s14, 0 +; GFX1010-NEXT: s_subb_u32 s11, s11, 0 +; GFX1010-NEXT: s_cmp_ge_u32 s11, s7 +; GFX1010-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1010-NEXT: s_cmp_ge_u32 s13, s6 +; GFX1010-NEXT: s_cselect_b32 s13, -1, 0 +; GFX1010-NEXT: s_cmp_eq_u32 s11, s7 +; GFX1010-NEXT: s_cselect_b32 s11, s13, s14 +; GFX1010-NEXT: s_add_u32 s13, s5, 1 +; GFX1010-NEXT: s_addc_u32 s14, s8, 0 +; GFX1010-NEXT: s_add_u32 s15, s5, 2 +; GFX1010-NEXT: s_addc_u32 s16, s8, 0 +; GFX1010-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1010-NEXT: s_cselect_b32 s11, s15, s13 +; GFX1010-NEXT: s_cselect_b32 s13, s16, s14 +; GFX1010-NEXT: s_cmp_lg_u32 s12, 0 +; GFX1010-NEXT: s_subb_u32 s3, s3, s9 +; GFX1010-NEXT: s_cmp_ge_u32 s3, s7 +; GFX1010-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1010-NEXT: s_cmp_ge_u32 s10, s6 +; GFX1010-NEXT: s_cselect_b32 s10, -1, 0 +; GFX1010-NEXT: s_cmp_eq_u32 s3, s7 +; GFX1010-NEXT: s_cselect_b32 s3, s10, s9 +; GFX1010-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1010-NEXT: s_cselect_b32 s9, s13, s8 +; GFX1010-NEXT: s_cselect_b32 s8, s11, s5 ; GFX1010-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX1010-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX1010-NEXT: .LBB16_2: -; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX1010-NEXT: s_sub_i32 s1, 0, s2 +; GFX1010-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX1010-NEXT: s_sub_i32 s4, 0, s6 +; GFX1010-NEXT: s_mov_b32 s9, 0 ; GFX1010-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1010-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1010-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1010-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1010-NEXT: s_mul_i32 s1, s1, s0 -; GFX1010-NEXT: s_mul_hi_u32 s1, s0, s1 -; GFX1010-NEXT: s_add_i32 s0, s0, s1 -; GFX1010-NEXT: s_mul_hi_u32 s0, s10, s0 -; GFX1010-NEXT: s_mul_i32 s1, s0, s2 -; GFX1010-NEXT: s_add_i32 s3, s0, 1 -; GFX1010-NEXT: s_sub_i32 s1, s10, s1 -; GFX1010-NEXT: s_sub_i32 s4, s1, s2 -; GFX1010-NEXT: s_cmp_ge_u32 s1, s2 -; GFX1010-NEXT: s_cselect_b32 s0, s3, s0 -; GFX1010-NEXT: s_cselect_b32 s1, s4, s1 -; GFX1010-NEXT: s_add_i32 s3, s0, 1 -; GFX1010-NEXT: s_cmp_ge_u32 s1, s2 -; GFX1010-NEXT: s_mov_b32 s1, 0 -; GFX1010-NEXT: s_cselect_b32 s0, s3, s0 -; GFX1010-NEXT: v_mov_b32_e32 v0, s0 -; GFX1010-NEXT: v_mov_b32_e32 v1, s1 +; GFX1010-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1010-NEXT: s_mul_i32 s4, s4, s3 +; GFX1010-NEXT: s_mul_hi_u32 s4, s3, s4 +; GFX1010-NEXT: s_add_i32 s3, s3, s4 +; GFX1010-NEXT: s_mul_hi_u32 s3, s2, s3 +; GFX1010-NEXT: s_mul_i32 s4, s3, s6 +; GFX1010-NEXT: s_sub_i32 s2, s2, s4 +; GFX1010-NEXT: s_add_i32 s4, s3, 1 +; GFX1010-NEXT: s_sub_i32 s5, s2, s6 +; GFX1010-NEXT: s_cmp_ge_u32 s2, s6 +; GFX1010-NEXT: s_cselect_b32 s3, s4, s3 +; GFX1010-NEXT: s_cselect_b32 s2, s5, s2 +; GFX1010-NEXT: s_add_i32 s4, s3, 1 +; GFX1010-NEXT: s_cmp_ge_u32 s2, s6 +; GFX1010-NEXT: s_cselect_b32 s8, s4, s3 ; GFX1010-NEXT: .LBB16_3: +; GFX1010-NEXT: v_mov_b32_e32 v0, s8 ; GFX1010-NEXT: v_mov_b32_e32 v2, 0 -; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX1010-NEXT: v_mov_b32_e32 v1, s9 +; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1010-NEXT: s_endpgm ; GFX1010-NEXT: .LBB16_4: -; GFX1010-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1010-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX1010-NEXT: s_branch .LBB16_2 ; ; GFX1030W32-LABEL: 
sudiv64: ; GFX1030W32: ; %bb.0: ; GFX1030W32-NEXT: s_clause 0x1 -; GFX1030W32-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 +; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W32-NEXT: s_or_b64 s[4:5], s[10:11], s[2:3] -; GFX1030W32-NEXT: s_mov_b32 s4, 0 -; GFX1030W32-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1030W32-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] +; GFX1030W32-NEXT: s_mov_b32 s6, 0 +; GFX1030W32-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX1030W32-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1030W32-NEXT: ; %bb.1: -; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX1030W32-NEXT: s_sub_u32 s5, 0, s2 -; GFX1030W32-NEXT: s_subb_u32 s6, 0, s3 +; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX1030W32-NEXT: s_sub_u32 s9, 0, s4 +; GFX1030W32-NEXT: s_subb_u32 s10, 0, s5 ; GFX1030W32-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 ; GFX1030W32-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030W32-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2639,160 +2626,158 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W32-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 ; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1030W32-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1030W32-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1030W32-NEXT: s_mul_i32 s7, s5, s0 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s5, s1 -; GFX1030W32-NEXT: s_mul_i32 s12, s6, s1 -; GFX1030W32-NEXT: s_add_i32 s7, s13, s7 -; GFX1030W32-NEXT: s_mul_i32 s14, s5, s1 -; GFX1030W32-NEXT: s_add_i32 s7, s7, s12 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s1, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s15, s0, s14 -; GFX1030W32-NEXT: s_mul_i32 s12, s0, s14 -; GFX1030W32-NEXT: s_mul_hi_u32 s14, s1, s7 -; GFX1030W32-NEXT: s_mul_i32 s1, s1, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s16, s0, s7 -; GFX1030W32-NEXT: s_add_u32 s1, s13, s1 -; GFX1030W32-NEXT: s_addc_u32 s13, 0, s14 -; GFX1030W32-NEXT: s_add_u32 s1, s1, s12 -; GFX1030W32-NEXT: s_mul_i32 s7, s0, s7 -; GFX1030W32-NEXT: s_addc_u32 s1, s13, s15 -; GFX1030W32-NEXT: s_addc_u32 s12, s16, 0 -; GFX1030W32-NEXT: s_add_u32 s1, s1, s7 -; GFX1030W32-NEXT: s_addc_u32 s7, 0, s12 -; GFX1030W32-NEXT: v_add_co_u32 v0, s1, v0, s1 -; GFX1030W32-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1030W32-NEXT: s_addc_u32 s0, s0, s7 -; GFX1030W32-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1030W32-NEXT: s_mul_i32 s7, s5, s0 -; GFX1030W32-NEXT: s_mul_hi_u32 s12, s5, s1 -; GFX1030W32-NEXT: s_mul_i32 s6, s6, s1 -; GFX1030W32-NEXT: s_add_i32 s7, s12, s7 -; GFX1030W32-NEXT: s_mul_i32 s5, s5, s1 -; GFX1030W32-NEXT: s_add_i32 s7, s7, s6 -; GFX1030W32-NEXT: s_mul_hi_u32 s12, s0, s5 -; GFX1030W32-NEXT: s_mul_i32 s13, s0, s5 -; GFX1030W32-NEXT: s_mul_hi_u32 s5, s1, s5 -; GFX1030W32-NEXT: s_mul_hi_u32 s14, s1, s7 -; GFX1030W32-NEXT: s_mul_i32 s1, s1, s7 -; GFX1030W32-NEXT: s_mul_hi_u32 s6, s0, s7 -; GFX1030W32-NEXT: s_add_u32 s1, s5, s1 -; GFX1030W32-NEXT: s_addc_u32 s5, 0, s14 -; GFX1030W32-NEXT: s_add_u32 s1, s1, s13 -; GFX1030W32-NEXT: s_mul_i32 s7, s0, s7 -; GFX1030W32-NEXT: s_addc_u32 s1, s5, s12 -; GFX1030W32-NEXT: s_addc_u32 s5, s6, 0 -; GFX1030W32-NEXT: s_add_u32 s1, s1, s7 -; GFX1030W32-NEXT: s_addc_u32 s5, 0, s5 -; GFX1030W32-NEXT: v_add_co_u32 v0, s1, v0, s1 -; GFX1030W32-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1030W32-NEXT: s_addc_u32 s0, s0, s5 -; GFX1030W32-NEXT: v_readfirstlane_b32 s1, v0 -; 
GFX1030W32-NEXT: s_mul_i32 s6, s10, s0 -; GFX1030W32-NEXT: s_mul_hi_u32 s5, s10, s0 -; GFX1030W32-NEXT: s_mul_hi_u32 s7, s11, s0 -; GFX1030W32-NEXT: s_mul_i32 s0, s11, s0 -; GFX1030W32-NEXT: s_mul_hi_u32 s12, s10, s1 -; GFX1030W32-NEXT: s_mul_hi_u32 s13, s11, s1 -; GFX1030W32-NEXT: s_mul_i32 s1, s11, s1 -; GFX1030W32-NEXT: s_add_u32 s6, s12, s6 -; GFX1030W32-NEXT: s_addc_u32 s5, 0, s5 -; GFX1030W32-NEXT: s_add_u32 s1, s6, s1 -; GFX1030W32-NEXT: s_addc_u32 s1, s5, s13 -; GFX1030W32-NEXT: s_addc_u32 s5, s7, 0 -; GFX1030W32-NEXT: s_add_u32 s1, s1, s0 -; GFX1030W32-NEXT: s_addc_u32 s5, 0, s5 -; GFX1030W32-NEXT: s_mul_hi_u32 s0, s2, s1 -; GFX1030W32-NEXT: s_mul_i32 s7, s2, s5 -; GFX1030W32-NEXT: s_mul_i32 s12, s2, s1 -; GFX1030W32-NEXT: s_add_i32 s0, s0, s7 -; GFX1030W32-NEXT: v_sub_co_u32 v0, s7, s10, s12 -; GFX1030W32-NEXT: s_mul_i32 s6, s3, s1 -; GFX1030W32-NEXT: s_add_i32 s0, s0, s6 -; GFX1030W32-NEXT: v_sub_co_u32 v1, s12, v0, s2 -; GFX1030W32-NEXT: s_sub_i32 s6, s11, s0 -; GFX1030W32-NEXT: s_cmp_lg_u32 s7, 0 -; GFX1030W32-NEXT: s_subb_u32 s6, s6, s3 -; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0 -; GFX1030W32-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 -; GFX1030W32-NEXT: s_subb_u32 s6, s6, 0 -; GFX1030W32-NEXT: s_cmp_ge_u32 s6, s3 -; GFX1030W32-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX1030W32-NEXT: v_readfirstlane_b32 s7, v1 +; GFX1030W32-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1030W32-NEXT: s_mul_i32 s11, s9, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1030W32-NEXT: s_mul_i32 s12, s10, s8 +; GFX1030W32-NEXT: s_add_i32 s11, s13, s11 +; GFX1030W32-NEXT: s_mul_i32 s14, s9, s8 +; GFX1030W32-NEXT: s_add_i32 s11, s11, s12 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s8, s14 +; GFX1030W32-NEXT: s_mul_i32 s16, s8, s11 +; GFX1030W32-NEXT: s_mul_hi_u32 s15, s7, s14 +; GFX1030W32-NEXT: s_mul_i32 s12, s7, s14 +; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s11 +; GFX1030W32-NEXT: s_add_u32 s13, s13, s16 +; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14 +; GFX1030W32-NEXT: s_mul_hi_u32 s17, s7, s11 +; GFX1030W32-NEXT: s_add_u32 s12, s13, s12 +; GFX1030W32-NEXT: s_mul_i32 s11, s7, s11 +; GFX1030W32-NEXT: s_addc_u32 s12, s14, s15 +; GFX1030W32-NEXT: s_addc_u32 s13, s17, 0 +; GFX1030W32-NEXT: s_add_u32 s11, s12, s11 +; GFX1030W32-NEXT: s_addc_u32 s12, 0, s13 +; GFX1030W32-NEXT: s_add_u32 s8, s8, s11 +; GFX1030W32-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1030W32-NEXT: s_mul_i32 s11, s9, s8 +; GFX1030W32-NEXT: s_addc_u32 s7, s7, s12 +; GFX1030W32-NEXT: s_mul_i32 s10, s10, s8 +; GFX1030W32-NEXT: s_mul_i32 s9, s9, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX1030W32-NEXT: s_add_i32 s9, s13, s9 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s7, s11 +; GFX1030W32-NEXT: s_add_i32 s9, s9, s10 +; GFX1030W32-NEXT: s_mul_i32 s10, s7, s11 +; GFX1030W32-NEXT: s_mul_i32 s15, s8, s9 +; GFX1030W32-NEXT: s_mul_hi_u32 s14, s8, s9 +; GFX1030W32-NEXT: s_add_u32 s12, s12, s15 +; GFX1030W32-NEXT: s_addc_u32 s14, 0, s14 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s7, s9 +; GFX1030W32-NEXT: s_add_u32 s10, s12, s10 +; GFX1030W32-NEXT: s_mul_i32 s9, s7, s9 +; GFX1030W32-NEXT: s_addc_u32 s10, s14, s13 +; GFX1030W32-NEXT: s_addc_u32 s11, s11, 0 +; GFX1030W32-NEXT: s_add_u32 s9, s10, s9 +; GFX1030W32-NEXT: s_addc_u32 s10, 0, s11 +; GFX1030W32-NEXT: s_add_u32 s8, s8, s9 +; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1030W32-NEXT: s_mul_hi_u32 s11, s2, s8 +; GFX1030W32-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1030W32-NEXT: s_mul_hi_u32 s9, s3, s8 +; GFX1030W32-NEXT: s_addc_u32 s7, 
s7, s10 +; GFX1030W32-NEXT: s_mul_i32 s8, s3, s8 +; GFX1030W32-NEXT: s_mul_i32 s12, s2, s7 +; GFX1030W32-NEXT: s_mul_hi_u32 s10, s2, s7 +; GFX1030W32-NEXT: s_add_u32 s11, s11, s12 +; GFX1030W32-NEXT: s_addc_u32 s10, 0, s10 +; GFX1030W32-NEXT: s_mul_hi_u32 s13, s3, s7 +; GFX1030W32-NEXT: s_add_u32 s8, s11, s8 +; GFX1030W32-NEXT: s_mul_i32 s7, s3, s7 +; GFX1030W32-NEXT: s_addc_u32 s8, s10, s9 +; GFX1030W32-NEXT: s_addc_u32 s9, s13, 0 +; GFX1030W32-NEXT: s_add_u32 s7, s8, s7 +; GFX1030W32-NEXT: s_addc_u32 s8, 0, s9 +; GFX1030W32-NEXT: s_mul_hi_u32 s9, s4, s7 +; GFX1030W32-NEXT: s_mul_i32 s10, s4, s8 +; GFX1030W32-NEXT: s_mul_i32 s11, s5, s7 +; GFX1030W32-NEXT: s_add_i32 s9, s9, s10 +; GFX1030W32-NEXT: s_mul_i32 s10, s4, s7 +; GFX1030W32-NEXT: s_add_i32 s9, s9, s11 +; GFX1030W32-NEXT: s_sub_i32 s11, s3, s9 +; GFX1030W32-NEXT: s_sub_u32 s10, s2, s10 ; GFX1030W32-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1030W32-NEXT: s_cmp_eq_u32 s6, s3 -; GFX1030W32-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1030W32-NEXT: s_add_u32 s6, s1, 1 -; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1030W32-NEXT: s_addc_u32 s12, s5, 0 -; GFX1030W32-NEXT: s_add_u32 s13, s1, 2 -; GFX1030W32-NEXT: s_addc_u32 s14, s5, 0 -; GFX1030W32-NEXT: s_cmp_lg_u32 s7, 0 -; GFX1030W32-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 -; GFX1030W32-NEXT: s_subb_u32 s0, s11, s0 -; GFX1030W32-NEXT: v_mov_b32_e32 v2, s13 -; GFX1030W32-NEXT: s_cmp_ge_u32 s0, s3 -; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX1030W32-NEXT: s_cselect_b32 s7, -1, 0 -; GFX1030W32-NEXT: s_cmp_eq_u32 s0, s3 -; GFX1030W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX1030W32-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1030W32-NEXT: v_mov_b32_e32 v1, s14 -; GFX1030W32-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0 -; GFX1030W32-NEXT: v_cndmask_b32_e32 v2, s6, v2, vcc_lo -; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1030W32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1030W32-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo -; GFX1030W32-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo -; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 +; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0 +; GFX1030W32-NEXT: s_subb_u32 s11, s11, s5 +; GFX1030W32-NEXT: s_sub_u32 s13, s10, s4 +; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1030W32-NEXT: s_cmp_lg_u32 s14, 0 +; GFX1030W32-NEXT: s_subb_u32 s11, s11, 0 +; GFX1030W32-NEXT: s_cmp_ge_u32 s11, s5 +; GFX1030W32-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1030W32-NEXT: s_cmp_ge_u32 s13, s4 +; GFX1030W32-NEXT: s_cselect_b32 s13, -1, 0 +; GFX1030W32-NEXT: s_cmp_eq_u32 s11, s5 +; GFX1030W32-NEXT: s_cselect_b32 s11, s13, s14 +; GFX1030W32-NEXT: s_add_u32 s13, s7, 1 +; GFX1030W32-NEXT: s_addc_u32 s14, s8, 0 +; GFX1030W32-NEXT: s_add_u32 s15, s7, 2 +; GFX1030W32-NEXT: s_addc_u32 s16, s8, 0 +; GFX1030W32-NEXT: s_cmp_lg_u32 s11, 0 +; GFX1030W32-NEXT: s_cselect_b32 s11, s15, s13 +; GFX1030W32-NEXT: s_cselect_b32 s13, s16, s14 +; GFX1030W32-NEXT: s_cmp_lg_u32 s12, 0 +; GFX1030W32-NEXT: s_subb_u32 s3, s3, s9 +; GFX1030W32-NEXT: s_cmp_ge_u32 s3, s5 +; GFX1030W32-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1030W32-NEXT: s_cmp_ge_u32 s10, s4 +; GFX1030W32-NEXT: s_cselect_b32 s10, -1, 0 +; GFX1030W32-NEXT: s_cmp_eq_u32 s3, s5 +; GFX1030W32-NEXT: s_cselect_b32 s3, s10, s9 +; GFX1030W32-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1030W32-NEXT: s_cselect_b32 s9, s13, s8 +; GFX1030W32-NEXT: s_cselect_b32 s8, s11, s7 +; GFX1030W32-NEXT: s_andn2_b32 vcc_lo, exec_lo, s6 ; GFX1030W32-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX1030W32-NEXT: .LBB16_2: -; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s2 -; 
GFX1030W32-NEXT: s_sub_i32 s1, 0, s2 +; GFX1030W32-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX1030W32-NEXT: s_sub_i32 s5, 0, s4 +; GFX1030W32-NEXT: s_mov_b32 s9, 0 ; GFX1030W32-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1030W32-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1030W32-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1030W32-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1030W32-NEXT: s_mul_i32 s1, s1, s0 -; GFX1030W32-NEXT: s_mul_hi_u32 s1, s0, s1 -; GFX1030W32-NEXT: s_add_i32 s0, s0, s1 -; GFX1030W32-NEXT: s_mul_hi_u32 s0, s10, s0 -; GFX1030W32-NEXT: s_mul_i32 s1, s0, s2 -; GFX1030W32-NEXT: s_add_i32 s3, s0, 1 -; GFX1030W32-NEXT: s_sub_i32 s1, s10, s1 -; GFX1030W32-NEXT: s_sub_i32 s4, s1, s2 -; GFX1030W32-NEXT: s_cmp_ge_u32 s1, s2 -; GFX1030W32-NEXT: s_cselect_b32 s0, s3, s0 -; GFX1030W32-NEXT: s_cselect_b32 s1, s4, s1 -; GFX1030W32-NEXT: s_add_i32 s3, s0, 1 -; GFX1030W32-NEXT: s_cmp_ge_u32 s1, s2 -; GFX1030W32-NEXT: s_mov_b32 s1, 0 -; GFX1030W32-NEXT: s_cselect_b32 s0, s3, s0 -; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0 -; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1 +; GFX1030W32-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1030W32-NEXT: s_mul_i32 s5, s5, s3 +; GFX1030W32-NEXT: s_mul_hi_u32 s5, s3, s5 +; GFX1030W32-NEXT: s_add_i32 s3, s3, s5 +; GFX1030W32-NEXT: s_mul_hi_u32 s3, s2, s3 +; GFX1030W32-NEXT: s_mul_i32 s5, s3, s4 +; GFX1030W32-NEXT: s_sub_i32 s2, s2, s5 +; GFX1030W32-NEXT: s_add_i32 s5, s3, 1 +; GFX1030W32-NEXT: s_sub_i32 s6, s2, s4 +; GFX1030W32-NEXT: s_cmp_ge_u32 s2, s4 +; GFX1030W32-NEXT: s_cselect_b32 s3, s5, s3 +; GFX1030W32-NEXT: s_cselect_b32 s2, s6, s2 +; GFX1030W32-NEXT: s_add_i32 s5, s3, 1 +; GFX1030W32-NEXT: s_cmp_ge_u32 s2, s4 +; GFX1030W32-NEXT: s_cselect_b32 s8, s5, s3 ; GFX1030W32-NEXT: .LBB16_3: +; GFX1030W32-NEXT: v_mov_b32_e32 v0, s8 ; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0 -; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX1030W32-NEXT: v_mov_b32_e32 v1, s9 +; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W32-NEXT: s_endpgm ; GFX1030W32-NEXT: .LBB16_4: -; GFX1030W32-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1030W32-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX1030W32-NEXT: s_branch .LBB16_2 ; ; GFX1030W64-LABEL: sudiv64: ; GFX1030W64: ; %bb.0: ; GFX1030W64-NEXT: s_clause 0x1 -; GFX1030W64-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x34 +; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030W64-NEXT: s_or_b64 s[0:1], s[10:11], s[2:3] -; GFX1030W64-NEXT: s_mov_b32 s0, 0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1030W64-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] +; GFX1030W64-NEXT: s_mov_b32 s6, 0 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX1030W64-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1030W64-NEXT: ; %bb.1: -; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX1030W64-NEXT: s_sub_u32 s5, 0, s2 -; GFX1030W64-NEXT: s_subb_u32 s6, 0, s3 +; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX1030W64-NEXT: s_sub_u32 s9, 0, s4 +; GFX1030W64-NEXT: s_subb_u32 s10, 0, s5 ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 ; GFX1030W64-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2801,160 +2786,158 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX1030W64-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1030W64-NEXT: 
v_cvt_u32_f32_e32 v0, v0 -; GFX1030W64-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1030W64-NEXT: s_mul_i32 s1, s5, s4 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s5, s0 -; GFX1030W64-NEXT: s_mul_i32 s7, s6, s0 -; GFX1030W64-NEXT: s_add_i32 s1, s12, s1 -; GFX1030W64-NEXT: s_mul_i32 s13, s5, s0 -; GFX1030W64-NEXT: s_add_i32 s1, s1, s7 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s0, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s14, s4, s13 -; GFX1030W64-NEXT: s_mul_i32 s7, s4, s13 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s0, s1 -; GFX1030W64-NEXT: s_mul_i32 s0, s0, s1 -; GFX1030W64-NEXT: s_mul_hi_u32 s15, s4, s1 -; GFX1030W64-NEXT: s_add_u32 s0, s12, s0 -; GFX1030W64-NEXT: s_addc_u32 s12, 0, s13 -; GFX1030W64-NEXT: s_add_u32 s0, s0, s7 -; GFX1030W64-NEXT: s_mul_i32 s1, s4, s1 -; GFX1030W64-NEXT: s_addc_u32 s0, s12, s14 -; GFX1030W64-NEXT: s_addc_u32 s7, s15, 0 -; GFX1030W64-NEXT: s_add_u32 s0, s0, s1 -; GFX1030W64-NEXT: s_addc_u32 s7, 0, s7 -; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1030W64-NEXT: s_addc_u32 s4, s4, s7 -; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1030W64-NEXT: s_mul_i32 s1, s5, s4 -; GFX1030W64-NEXT: s_mul_hi_u32 s7, s5, s0 -; GFX1030W64-NEXT: s_mul_i32 s6, s6, s0 -; GFX1030W64-NEXT: s_add_i32 s1, s7, s1 -; GFX1030W64-NEXT: s_mul_i32 s5, s5, s0 -; GFX1030W64-NEXT: s_add_i32 s1, s1, s6 -; GFX1030W64-NEXT: s_mul_hi_u32 s7, s4, s5 -; GFX1030W64-NEXT: s_mul_i32 s12, s4, s5 -; GFX1030W64-NEXT: s_mul_hi_u32 s5, s0, s5 -; GFX1030W64-NEXT: s_mul_hi_u32 s13, s0, s1 -; GFX1030W64-NEXT: s_mul_i32 s0, s0, s1 -; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s1 -; GFX1030W64-NEXT: s_add_u32 s0, s5, s0 -; GFX1030W64-NEXT: s_addc_u32 s5, 0, s13 -; GFX1030W64-NEXT: s_add_u32 s0, s0, s12 -; GFX1030W64-NEXT: s_mul_i32 s1, s4, s1 -; GFX1030W64-NEXT: s_addc_u32 s0, s5, s7 -; GFX1030W64-NEXT: s_addc_u32 s5, s6, 0 -; GFX1030W64-NEXT: s_add_u32 s0, s0, s1 -; GFX1030W64-NEXT: s_addc_u32 s5, 0, s5 -; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1030W64-NEXT: s_addc_u32 s0, s4, s5 -; GFX1030W64-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1030W64-NEXT: s_mul_i32 s5, s10, s0 -; GFX1030W64-NEXT: s_mul_hi_u32 s4, s10, s0 -; GFX1030W64-NEXT: s_mul_hi_u32 s6, s11, s0 -; GFX1030W64-NEXT: s_mul_i32 s0, s11, s0 -; GFX1030W64-NEXT: s_mul_hi_u32 s7, s10, s1 -; GFX1030W64-NEXT: s_mul_hi_u32 s12, s11, s1 -; GFX1030W64-NEXT: s_mul_i32 s1, s11, s1 -; GFX1030W64-NEXT: s_add_u32 s5, s7, s5 -; GFX1030W64-NEXT: s_addc_u32 s4, 0, s4 -; GFX1030W64-NEXT: s_add_u32 s1, s5, s1 -; GFX1030W64-NEXT: s_addc_u32 s1, s4, s12 -; GFX1030W64-NEXT: s_addc_u32 s4, s6, 0 -; GFX1030W64-NEXT: s_add_u32 s6, s1, s0 -; GFX1030W64-NEXT: s_addc_u32 s7, 0, s4 -; GFX1030W64-NEXT: s_mul_hi_u32 s0, s2, s6 -; GFX1030W64-NEXT: s_mul_i32 s1, s2, s7 -; GFX1030W64-NEXT: s_mul_i32 s5, s2, s6 -; GFX1030W64-NEXT: s_add_i32 s12, s0, s1 -; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], s10, s5 -; GFX1030W64-NEXT: s_mul_i32 s4, s3, s6 -; GFX1030W64-NEXT: s_add_i32 s12, s12, s4 -; GFX1030W64-NEXT: v_sub_co_u32 v1, s[4:5], v0, s2 -; GFX1030W64-NEXT: s_sub_i32 s13, s11, s12 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1030W64-NEXT: s_subb_u32 s13, s13, s3 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX1030W64-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 -; GFX1030W64-NEXT: s_subb_u32 s4, s13, 0 -; GFX1030W64-NEXT: s_cmp_ge_u32 s4, s3 -; GFX1030W64-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX1030W64-NEXT: s_cselect_b32 s5, -1, 0 -; GFX1030W64-NEXT: 
s_cmp_eq_u32 s4, s3 -; GFX1030W64-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX1030W64-NEXT: s_add_u32 s4, s6, 1 -; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc -; GFX1030W64-NEXT: s_addc_u32 s5, s7, 0 -; GFX1030W64-NEXT: s_add_u32 s13, s6, 2 -; GFX1030W64-NEXT: s_addc_u32 s14, s7, 0 -; GFX1030W64-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1030W64-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX1030W64-NEXT: s_subb_u32 s0, s11, s12 -; GFX1030W64-NEXT: v_mov_b32_e32 v2, s13 -; GFX1030W64-NEXT: s_cmp_ge_u32 s0, s3 -; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GFX1030W64-NEXT: s_cselect_b32 s11, -1, 0 -; GFX1030W64-NEXT: s_cmp_eq_u32 s0, s3 -; GFX1030W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX1030W64-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX1030W64-NEXT: v_mov_b32_e32 v1, s14 -; GFX1030W64-NEXT: v_cndmask_b32_e64 v0, s11, v0, s[0:1] -; GFX1030W64-NEXT: v_cndmask_b32_e32 v2, s4, v2, vcc -; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc -; GFX1030W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX1030W64-NEXT: v_cndmask_b32_e32 v1, s7, v1, vcc -; GFX1030W64-NEXT: v_cndmask_b32_e32 v0, s6, v2, vcc +; GFX1030W64-NEXT: v_readfirstlane_b32 s8, v1 +; GFX1030W64-NEXT: v_readfirstlane_b32 s6, v0 +; GFX1030W64-NEXT: s_mul_i32 s7, s9, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s9, s6 +; GFX1030W64-NEXT: s_mul_i32 s11, s10, s6 +; GFX1030W64-NEXT: s_add_i32 s7, s12, s7 +; GFX1030W64-NEXT: s_mul_i32 s13, s9, s6 +; GFX1030W64-NEXT: s_add_i32 s7, s7, s11 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s6, s13 +; GFX1030W64-NEXT: s_mul_i32 s15, s6, s7 +; GFX1030W64-NEXT: s_mul_hi_u32 s14, s8, s13 +; GFX1030W64-NEXT: s_mul_i32 s11, s8, s13 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s6, s7 +; GFX1030W64-NEXT: s_add_u32 s12, s12, s15 +; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 +; GFX1030W64-NEXT: s_mul_hi_u32 s16, s8, s7 +; GFX1030W64-NEXT: s_add_u32 s11, s12, s11 +; GFX1030W64-NEXT: s_mul_i32 s7, s8, s7 +; GFX1030W64-NEXT: s_addc_u32 s11, s13, s14 +; GFX1030W64-NEXT: s_addc_u32 s12, s16, 0 +; GFX1030W64-NEXT: s_add_u32 s7, s11, s7 +; GFX1030W64-NEXT: s_addc_u32 s11, 0, s12 +; GFX1030W64-NEXT: s_add_u32 s12, s6, s7 +; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s9, s12 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1030W64-NEXT: s_mul_i32 s6, s9, s12 +; GFX1030W64-NEXT: s_addc_u32 s8, s8, s11 +; GFX1030W64-NEXT: s_mul_i32 s10, s10, s12 +; GFX1030W64-NEXT: s_mul_i32 s9, s9, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s7, s12, s6 +; GFX1030W64-NEXT: s_add_i32 s9, s13, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s11, s8, s6 +; GFX1030W64-NEXT: s_add_i32 s9, s9, s10 +; GFX1030W64-NEXT: s_mul_i32 s6, s8, s6 +; GFX1030W64-NEXT: s_mul_i32 s14, s12, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s13, s12, s9 +; GFX1030W64-NEXT: s_add_u32 s7, s7, s14 +; GFX1030W64-NEXT: s_addc_u32 s13, 0, s13 +; GFX1030W64-NEXT: s_mul_hi_u32 s10, s8, s9 +; GFX1030W64-NEXT: s_add_u32 s6, s7, s6 +; GFX1030W64-NEXT: s_mul_i32 s9, s8, s9 +; GFX1030W64-NEXT: s_addc_u32 s6, s13, s11 +; GFX1030W64-NEXT: s_addc_u32 s7, s10, 0 +; GFX1030W64-NEXT: s_add_u32 s6, s6, s9 +; GFX1030W64-NEXT: s_addc_u32 s9, 0, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s12, s6 +; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX1030W64-NEXT: s_mul_hi_u32 s11, s2, s10 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1030W64-NEXT: s_mul_hi_u32 s6, s3, s10 +; GFX1030W64-NEXT: s_addc_u32 s7, s8, s9 +; GFX1030W64-NEXT: s_mul_i32 s8, s3, s10 +; GFX1030W64-NEXT: s_mul_i32 s10, s2, s7 +; GFX1030W64-NEXT: s_mul_hi_u32 s9, s2, s7 +; GFX1030W64-NEXT: s_add_u32 s10, s11, s10 +; GFX1030W64-NEXT: 
s_addc_u32 s9, 0, s9 +; GFX1030W64-NEXT: s_mul_hi_u32 s12, s3, s7 +; GFX1030W64-NEXT: s_add_u32 s8, s10, s8 +; GFX1030W64-NEXT: s_mul_i32 s7, s3, s7 +; GFX1030W64-NEXT: s_addc_u32 s6, s9, s6 +; GFX1030W64-NEXT: s_addc_u32 s8, s12, 0 +; GFX1030W64-NEXT: s_add_u32 s10, s6, s7 +; GFX1030W64-NEXT: s_addc_u32 s11, 0, s8 +; GFX1030W64-NEXT: s_mul_hi_u32 s6, s4, s10 +; GFX1030W64-NEXT: s_mul_i32 s7, s4, s11 +; GFX1030W64-NEXT: s_mul_i32 s8, s5, s10 +; GFX1030W64-NEXT: s_add_i32 s6, s6, s7 +; GFX1030W64-NEXT: s_add_i32 s12, s6, s8 +; GFX1030W64-NEXT: s_mul_i32 s6, s4, s10 +; GFX1030W64-NEXT: s_sub_i32 s8, s3, s12 +; GFX1030W64-NEXT: s_sub_u32 s13, s2, s6 +; GFX1030W64-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1030W64-NEXT: s_subb_u32 s14, s8, s5 +; GFX1030W64-NEXT: s_sub_u32 s15, s13, s4 +; GFX1030W64-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1030W64-NEXT: s_subb_u32 s8, s14, 0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s8, s5 +; GFX1030W64-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s15, s4 +; GFX1030W64-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1030W64-NEXT: s_cmp_eq_u32 s8, s5 +; GFX1030W64-NEXT: s_cselect_b32 s8, s14, s9 +; GFX1030W64-NEXT: s_add_u32 s9, s10, 1 +; GFX1030W64-NEXT: s_addc_u32 s14, s11, 0 +; GFX1030W64-NEXT: s_add_u32 s15, s10, 2 +; GFX1030W64-NEXT: s_addc_u32 s16, s11, 0 +; GFX1030W64-NEXT: s_cmp_lg_u32 s8, 0 +; GFX1030W64-NEXT: s_cselect_b32 s15, s15, s9 +; GFX1030W64-NEXT: s_cselect_b32 s14, s16, s14 +; GFX1030W64-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GFX1030W64-NEXT: s_subb_u32 s3, s3, s12 +; GFX1030W64-NEXT: s_cmp_ge_u32 s3, s5 +; GFX1030W64-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1030W64-NEXT: s_cmp_ge_u32 s13, s4 +; GFX1030W64-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1030W64-NEXT: s_cmp_eq_u32 s3, s5 +; GFX1030W64-NEXT: s_cselect_b32 s3, s7, s6 +; GFX1030W64-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1030W64-NEXT: s_cselect_b32 s7, s14, s11 +; GFX1030W64-NEXT: s_cselect_b32 s6, s15, s10 ; GFX1030W64-NEXT: s_cbranch_execnz .LBB16_3 ; GFX1030W64-NEXT: .LBB16_2: -; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX1030W64-NEXT: s_sub_i32 s1, 0, s2 +; GFX1030W64-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX1030W64-NEXT: s_sub_i32 s5, 0, s4 +; GFX1030W64-NEXT: s_mov_b32 s7, 0 ; GFX1030W64-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1030W64-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1030W64-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1030W64-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1030W64-NEXT: s_mul_i32 s1, s1, s0 -; GFX1030W64-NEXT: s_mul_hi_u32 s1, s0, s1 -; GFX1030W64-NEXT: s_add_i32 s0, s0, s1 -; GFX1030W64-NEXT: s_mul_hi_u32 s0, s10, s0 -; GFX1030W64-NEXT: s_mul_i32 s1, s0, s2 -; GFX1030W64-NEXT: s_add_i32 s3, s0, 1 -; GFX1030W64-NEXT: s_sub_i32 s1, s10, s1 -; GFX1030W64-NEXT: s_sub_i32 s4, s1, s2 -; GFX1030W64-NEXT: s_cmp_ge_u32 s1, s2 -; GFX1030W64-NEXT: s_cselect_b32 s0, s3, s0 -; GFX1030W64-NEXT: s_cselect_b32 s1, s4, s1 -; GFX1030W64-NEXT: s_add_i32 s3, s0, 1 -; GFX1030W64-NEXT: s_cmp_ge_u32 s1, s2 -; GFX1030W64-NEXT: s_mov_b32 s1, 0 -; GFX1030W64-NEXT: s_cselect_b32 s0, s3, s0 -; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0 -; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1 +; GFX1030W64-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1030W64-NEXT: s_mul_i32 s5, s5, s3 +; GFX1030W64-NEXT: s_mul_hi_u32 s5, s3, s5 +; GFX1030W64-NEXT: s_add_i32 s3, s3, s5 +; GFX1030W64-NEXT: s_mul_hi_u32 s3, s2, s3 +; GFX1030W64-NEXT: s_mul_i32 s5, s3, s4 +; GFX1030W64-NEXT: s_sub_i32 s2, s2, s5 +; GFX1030W64-NEXT: s_add_i32 s5, s3, 1 +; GFX1030W64-NEXT: s_sub_i32 s6, s2, s4 
+; GFX1030W64-NEXT: s_cmp_ge_u32 s2, s4 +; GFX1030W64-NEXT: s_cselect_b32 s3, s5, s3 +; GFX1030W64-NEXT: s_cselect_b32 s2, s6, s2 +; GFX1030W64-NEXT: s_add_i32 s5, s3, 1 +; GFX1030W64-NEXT: s_cmp_ge_u32 s2, s4 +; GFX1030W64-NEXT: s_cselect_b32 s6, s5, s3 ; GFX1030W64-NEXT: .LBB16_3: +; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6 ; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0 -; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7 +; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX1030W64-NEXT: s_endpgm ; GFX1030W64-NEXT: .LBB16_4: -; GFX1030W64-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1030W64-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GFX1030W64-NEXT: s_branch .LBB16_2 ; ; GFX11-LABEL: sudiv64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[2:3], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b64 s[4:5], s[10:11], s[2:3] -; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] +; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX11-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX11-NEXT: ; %bb.1: -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX11-NEXT: s_sub_u32 s5, 0, s2 -; GFX11-NEXT: s_subb_u32 s6, 0, s3 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX11-NEXT: s_sub_u32 s9, 0, s4 +; GFX11-NEXT: s_subb_u32 s10, 0, s5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0 ; GFX11-NEXT: v_rcp_f32_e32 v0, v0 @@ -2968,310 +2951,308 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s0, v1 -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: s_mul_i32 s7, s5, s0 -; GFX11-NEXT: s_mul_hi_u32 s13, s5, s1 -; GFX11-NEXT: s_mul_i32 s12, s6, s1 -; GFX11-NEXT: s_add_i32 s7, s13, s7 -; GFX11-NEXT: s_mul_i32 s14, s5, s1 -; GFX11-NEXT: s_add_i32 s7, s7, s12 -; GFX11-NEXT: s_mul_hi_u32 s13, s1, s14 -; GFX11-NEXT: s_mul_hi_u32 s15, s0, s14 -; GFX11-NEXT: s_mul_i32 s12, s0, s14 -; GFX11-NEXT: s_mul_hi_u32 s14, s1, s7 -; GFX11-NEXT: s_mul_i32 s1, s1, s7 -; GFX11-NEXT: s_mul_hi_u32 s16, s0, s7 -; GFX11-NEXT: s_add_u32 s1, s13, s1 -; GFX11-NEXT: s_addc_u32 s13, 0, s14 -; GFX11-NEXT: s_add_u32 s1, s1, s12 -; GFX11-NEXT: s_mul_i32 s7, s0, s7 -; GFX11-NEXT: s_addc_u32 s1, s13, s15 -; GFX11-NEXT: s_addc_u32 s12, s16, 0 -; GFX11-NEXT: s_add_u32 s1, s1, s7 -; GFX11-NEXT: s_addc_u32 s7, 0, s12 -; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1 -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_addc_u32 s0, s0, s7 -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: s_mul_i32 s7, s5, s0 -; GFX11-NEXT: s_mul_hi_u32 s12, s5, s1 -; GFX11-NEXT: s_mul_i32 s6, s6, s1 -; GFX11-NEXT: s_add_i32 s7, s12, s7 -; GFX11-NEXT: s_mul_i32 s5, s5, s1 -; GFX11-NEXT: s_add_i32 s7, s7, s6 -; GFX11-NEXT: s_mul_hi_u32 s12, s0, s5 -; GFX11-NEXT: s_mul_i32 s13, s0, s5 -; GFX11-NEXT: s_mul_hi_u32 s5, s1, s5 -; GFX11-NEXT: s_mul_hi_u32 s14, s1, s7 -; GFX11-NEXT: s_mul_i32 s1, s1, s7 -; GFX11-NEXT: s_mul_hi_u32 s6, s0, s7 -; 
GFX11-NEXT: s_add_u32 s1, s5, s1 -; GFX11-NEXT: s_addc_u32 s5, 0, s14 -; GFX11-NEXT: s_add_u32 s1, s1, s13 -; GFX11-NEXT: s_mul_i32 s7, s0, s7 -; GFX11-NEXT: s_addc_u32 s1, s5, s12 -; GFX11-NEXT: s_addc_u32 s5, s6, 0 -; GFX11-NEXT: s_add_u32 s1, s1, s7 -; GFX11-NEXT: s_addc_u32 s5, 0, s5 -; GFX11-NEXT: v_add_co_u32 v0, s1, v0, s1 -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_addc_u32 s0, s0, s5 -; GFX11-NEXT: v_readfirstlane_b32 s1, v0 -; GFX11-NEXT: s_mul_i32 s6, s10, s0 -; GFX11-NEXT: s_mul_hi_u32 s5, s10, s0 -; GFX11-NEXT: s_mul_hi_u32 s7, s11, s0 -; GFX11-NEXT: s_mul_i32 s0, s11, s0 -; GFX11-NEXT: s_mul_hi_u32 s12, s10, s1 -; GFX11-NEXT: s_mul_hi_u32 s13, s11, s1 -; GFX11-NEXT: s_mul_i32 s1, s11, s1 -; GFX11-NEXT: s_add_u32 s6, s12, s6 -; GFX11-NEXT: s_addc_u32 s5, 0, s5 -; GFX11-NEXT: s_add_u32 s1, s6, s1 -; GFX11-NEXT: s_addc_u32 s1, s5, s13 -; GFX11-NEXT: s_addc_u32 s5, s7, 0 -; GFX11-NEXT: s_add_u32 s1, s1, s0 -; GFX11-NEXT: s_addc_u32 s5, 0, s5 -; GFX11-NEXT: s_mul_hi_u32 s0, s2, s1 -; GFX11-NEXT: s_mul_i32 s7, s2, s5 -; GFX11-NEXT: s_mul_i32 s12, s2, s1 -; GFX11-NEXT: s_add_i32 s0, s0, s7 -; GFX11-NEXT: v_sub_co_u32 v0, s7, s10, s12 -; GFX11-NEXT: s_mul_i32 s6, s3, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s0, s0, s6 -; GFX11-NEXT: v_sub_co_u32 v1, s12, v0, s2 -; GFX11-NEXT: s_sub_i32 s6, s11, s0 -; GFX11-NEXT: s_cmp_lg_u32 s7, 0 -; GFX11-NEXT: s_subb_u32 s6, s6, s3 +; GFX11-NEXT: v_readfirstlane_b32 s7, v1 +; GFX11-NEXT: v_readfirstlane_b32 s8, v0 +; GFX11-NEXT: s_mul_i32 s11, s9, s7 +; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX11-NEXT: s_mul_i32 s12, s10, s8 +; GFX11-NEXT: s_add_i32 s11, s13, s11 +; GFX11-NEXT: s_mul_i32 s14, s9, s8 +; GFX11-NEXT: s_add_i32 s11, s11, s12 +; GFX11-NEXT: s_mul_hi_u32 s13, s8, s14 +; GFX11-NEXT: s_mul_i32 s16, s8, s11 +; GFX11-NEXT: s_mul_hi_u32 s15, s7, s14 +; GFX11-NEXT: s_mul_i32 s12, s7, s14 +; GFX11-NEXT: s_mul_hi_u32 s14, s8, s11 +; GFX11-NEXT: s_add_u32 s13, s13, s16 +; GFX11-NEXT: s_addc_u32 s14, 0, s14 +; GFX11-NEXT: s_mul_hi_u32 s17, s7, s11 +; GFX11-NEXT: s_add_u32 s12, s13, s12 +; GFX11-NEXT: s_mul_i32 s11, s7, s11 +; GFX11-NEXT: s_addc_u32 s12, s14, s15 +; GFX11-NEXT: s_addc_u32 s13, s17, 0 +; GFX11-NEXT: s_add_u32 s11, s12, s11 +; GFX11-NEXT: s_addc_u32 s12, 0, s13 +; GFX11-NEXT: s_add_u32 s8, s8, s11 +; GFX11-NEXT: s_cselect_b32 s11, -1, 0 +; GFX11-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: s_mul_i32 s11, s9, s8 +; GFX11-NEXT: s_addc_u32 s7, s7, s12 +; GFX11-NEXT: s_mul_i32 s10, s10, s8 +; GFX11-NEXT: s_mul_i32 s9, s9, s7 +; GFX11-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX11-NEXT: s_add_i32 s9, s13, s9 +; GFX11-NEXT: s_mul_hi_u32 s13, s7, s11 +; GFX11-NEXT: s_add_i32 s9, s9, s10 +; GFX11-NEXT: s_mul_i32 s10, s7, s11 +; GFX11-NEXT: s_mul_i32 s15, s8, s9 +; GFX11-NEXT: s_mul_hi_u32 s14, s8, s9 +; GFX11-NEXT: s_add_u32 s12, s12, s15 +; GFX11-NEXT: s_addc_u32 s14, 0, s14 +; GFX11-NEXT: s_mul_hi_u32 s11, s7, s9 +; GFX11-NEXT: s_add_u32 s10, s12, s10 +; GFX11-NEXT: s_mul_i32 s9, s7, s9 +; GFX11-NEXT: s_addc_u32 s10, s14, s13 +; GFX11-NEXT: s_addc_u32 s11, s11, 0 +; GFX11-NEXT: s_add_u32 s9, s10, s9 +; GFX11-NEXT: s_addc_u32 s10, 0, s11 +; GFX11-NEXT: s_add_u32 s8, s8, s9 +; GFX11-NEXT: s_cselect_b32 s9, -1, 0 +; GFX11-NEXT: s_mul_hi_u32 s11, s2, s8 +; GFX11-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-NEXT: s_mul_hi_u32 s9, s3, s8 +; GFX11-NEXT: s_addc_u32 s7, s7, s10 +; GFX11-NEXT: s_mul_i32 s8, s3, s8 +; GFX11-NEXT: s_mul_i32 s12, s2, s7 +; GFX11-NEXT: s_mul_hi_u32 s10, s2, 
s7 +; GFX11-NEXT: s_add_u32 s11, s11, s12 +; GFX11-NEXT: s_addc_u32 s10, 0, s10 +; GFX11-NEXT: s_mul_hi_u32 s13, s3, s7 +; GFX11-NEXT: s_add_u32 s8, s11, s8 +; GFX11-NEXT: s_mul_i32 s7, s3, s7 +; GFX11-NEXT: s_addc_u32 s8, s10, s9 +; GFX11-NEXT: s_addc_u32 s9, s13, 0 +; GFX11-NEXT: s_add_u32 s7, s8, s7 +; GFX11-NEXT: s_addc_u32 s8, 0, s9 +; GFX11-NEXT: s_mul_hi_u32 s9, s4, s7 +; GFX11-NEXT: s_mul_i32 s10, s4, s8 +; GFX11-NEXT: s_mul_i32 s11, s5, s7 +; GFX11-NEXT: s_add_i32 s9, s9, s10 +; GFX11-NEXT: s_mul_i32 s10, s4, s7 +; GFX11-NEXT: s_add_i32 s9, s9, s11 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_sub_i32 s11, s3, s9 +; GFX11-NEXT: s_sub_u32 s10, s2, s10 +; GFX11-NEXT: s_cselect_b32 s12, -1, 0 ; GFX11-NEXT: s_cmp_lg_u32 s12, 0 -; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 -; GFX11-NEXT: s_subb_u32 s6, s6, 0 +; GFX11-NEXT: s_subb_u32 s11, s11, s5 +; GFX11-NEXT: s_sub_u32 s13, s10, s4 +; GFX11-NEXT: s_cselect_b32 s14, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u32 s14, 0 +; GFX11-NEXT: s_subb_u32 s11, s11, 0 +; GFX11-NEXT: s_cmp_ge_u32 s11, s5 +; GFX11-NEXT: s_cselect_b32 s14, -1, 0 +; GFX11-NEXT: s_cmp_ge_u32 s13, s4 +; GFX11-NEXT: s_cselect_b32 s13, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s11, s5 +; GFX11-NEXT: s_cselect_b32 s11, s13, s14 +; GFX11-NEXT: s_add_u32 s13, s7, 1 +; GFX11-NEXT: s_addc_u32 s14, s8, 0 +; GFX11-NEXT: s_add_u32 s15, s7, 2 +; GFX11-NEXT: s_addc_u32 s16, s8, 0 +; GFX11-NEXT: s_cmp_lg_u32 s11, 0 +; GFX11-NEXT: s_cselect_b32 s11, s15, s13 +; GFX11-NEXT: s_cselect_b32 s13, s16, s14 +; GFX11-NEXT: s_cmp_lg_u32 s12, 0 +; GFX11-NEXT: s_subb_u32 s3, s3, s9 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_ge_u32 s6, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-NEXT: s_cselect_b32 s12, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s6, s3 -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: s_add_u32 s6, s1, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX11-NEXT: s_addc_u32 s12, s5, 0 -; GFX11-NEXT: s_add_u32 s13, s1, 2 -; GFX11-NEXT: s_addc_u32 s14, s5, 0 -; GFX11-NEXT: s_cmp_lg_u32 s7, 0 -; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 -; GFX11-NEXT: s_subb_u32 s0, s11, s0 -; GFX11-NEXT: v_mov_b32_e32 v2, s13 -; GFX11-NEXT: s_cmp_ge_u32 s0, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX11-NEXT: s_cselect_b32 s7, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s0, s3 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-NEXT: v_mov_b32_e32 v1, s14 -; GFX11-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v2, s6, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v1, s5, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, s1, v2, vcc_lo -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX11-NEXT: s_cmp_ge_u32 s3, s5 +; GFX11-NEXT: s_cselect_b32 s9, -1, 0 +; GFX11-NEXT: s_cmp_ge_u32 s10, s4 +; GFX11-NEXT: s_cselect_b32 s10, -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s3, s5 +; GFX11-NEXT: s_cselect_b32 s3, s10, s9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: s_cselect_b32 s9, s13, s8 +; GFX11-NEXT: s_cselect_b32 s8, s11, s7 +; 
GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s6 ; GFX11-NEXT: s_cbranch_vccnz .LBB16_3 ; GFX11-NEXT: .LBB16_2: -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX11-NEXT: s_sub_i32 s1, 0, s2 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX11-NEXT: s_sub_i32 s5, 0, s4 +; GFX11-NEXT: s_mov_b32 s9, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: s_mul_i32 s1, s1, s0 -; GFX11-NEXT: s_mul_hi_u32 s1, s0, s1 +; GFX11-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-NEXT: s_mul_i32 s5, s5, s3 +; GFX11-NEXT: s_mul_hi_u32 s5, s3, s5 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_add_i32 s0, s0, s1 -; GFX11-NEXT: s_mul_hi_u32 s0, s10, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s1, s0, s2 -; GFX11-NEXT: s_add_i32 s3, s0, 1 -; GFX11-NEXT: s_sub_i32 s1, s10, s1 -; GFX11-NEXT: s_sub_i32 s4, s1, s2 -; GFX11-NEXT: s_cmp_ge_u32 s1, s2 -; GFX11-NEXT: s_cselect_b32 s0, s3, s0 -; GFX11-NEXT: s_cselect_b32 s1, s4, s1 -; GFX11-NEXT: s_add_i32 s3, s0, 1 -; GFX11-NEXT: s_cmp_ge_u32 s1, s2 -; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: s_cselect_b32 s0, s3, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_add_i32 s3, s3, s5 +; GFX11-NEXT: s_mul_hi_u32 s3, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s5, s3, s4 +; GFX11-NEXT: s_sub_i32 s2, s2, s5 +; GFX11-NEXT: s_add_i32 s5, s3, 1 +; GFX11-NEXT: s_sub_i32 s6, s2, s4 +; GFX11-NEXT: s_cmp_ge_u32 s2, s4 +; GFX11-NEXT: s_cselect_b32 s3, s5, s3 +; GFX11-NEXT: s_cselect_b32 s2, s6, s2 +; GFX11-NEXT: s_add_i32 s5, s3, 1 +; GFX11-NEXT: s_cmp_ge_u32 s2, s4 +; GFX11-NEXT: s_cselect_b32 s8, s5, s3 ; GFX11-NEXT: .LBB16_3: -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[8:9] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s8 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; GFX11-NEXT: .LBB16_4: -; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX11-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX11-NEXT: s_branch .LBB16_2 ; ; GFX1250-LABEL: sudiv64: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x34 +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: s_or_b64 s[0:1], s[10:11], s[2:3] +; GFX1250-NEXT: s_or_b64 s[6:7], s[2:3], s[4:5] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_and_b64 s[0:1], s[0:1], lit64(0xffffffff00000000) -; GFX1250-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-NEXT: s_and_b64 s[6:7], s[6:7], 0xffffffff00000000 +; GFX1250-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GFX1250-NEXT: s_cbranch_scc0 .LBB16_4 ; GFX1250-NEXT: ; %bb.1: -; GFX1250-NEXT: s_cvt_f32_u32 s0, s2 -; GFX1250-NEXT: s_cvt_f32_u32 s1, s3 -; GFX1250-NEXT: 
s_sub_nc_u64 s[6:7], 0, s[2:3] +; GFX1250-NEXT: s_cvt_f32_u32 s6, s4 +; GFX1250-NEXT: s_cvt_f32_u32 s7, s5 +; GFX1250-NEXT: s_sub_nc_u64 s[10:11], 0, s[4:5] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_3) -; GFX1250-NEXT: s_fmac_f32 s0, s1, 0x4f800000 -; GFX1250-NEXT: v_s_rcp_f32 s0, s0 +; GFX1250-NEXT: s_fmac_f32 s6, s7, 0x4f800000 +; GFX1250-NEXT: v_s_rcp_f32 s6, s6 ; GFX1250-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) -; GFX1250-NEXT: s_mul_f32 s0, s0, 0x5f7ffffc -; GFX1250-NEXT: s_mul_f32 s1, s0, 0x2f800000 +; GFX1250-NEXT: s_mul_f32 s6, s6, 0x5f7ffffc +; GFX1250-NEXT: s_mul_f32 s7, s6, 0x2f800000 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3) -; GFX1250-NEXT: s_trunc_f32 s1, s1 -; GFX1250-NEXT: s_fmac_f32 s0, s1, 0xcf800000 -; GFX1250-NEXT: s_cvt_u32_f32 s5, s1 -; GFX1250-NEXT: s_mov_b32 s1, 0 +; GFX1250-NEXT: s_trunc_f32 s7, s7 +; GFX1250-NEXT: s_fmac_f32 s6, s7, 0xcf800000 +; GFX1250-NEXT: s_cvt_u32_f32 s9, s7 +; GFX1250-NEXT: s_mov_b32 s7, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) -; GFX1250-NEXT: s_cvt_u32_f32 s4, s0 -; GFX1250-NEXT: s_mul_u64 s[12:13], s[6:7], s[4:5] +; GFX1250-NEXT: s_cvt_u32_f32 s8, s6 +; GFX1250-NEXT: s_mul_u64 s[12:13], s[10:11], s[8:9] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_mul_hi_u32 s15, s4, s13 -; GFX1250-NEXT: s_mul_i32 s14, s4, s13 -; GFX1250-NEXT: s_mul_hi_u32 s0, s4, s12 -; GFX1250-NEXT: s_mul_i32 s17, s5, s12 -; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[0:1], s[14:15] -; GFX1250-NEXT: s_mul_hi_u32 s16, s5, s12 -; GFX1250-NEXT: s_mul_hi_u32 s18, s5, s13 -; GFX1250-NEXT: s_add_co_u32 s0, s14, s17 -; GFX1250-NEXT: s_add_co_ci_u32 s0, s15, s16 -; GFX1250-NEXT: s_mul_i32 s12, s5, s13 +; GFX1250-NEXT: s_mul_hi_u32 s15, s8, s13 +; GFX1250-NEXT: s_mul_i32 s14, s8, s13 +; GFX1250-NEXT: s_mul_hi_u32 s6, s8, s12 +; GFX1250-NEXT: s_mul_i32 s17, s9, s12 +; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[6:7], s[14:15] +; GFX1250-NEXT: s_mul_hi_u32 s16, s9, s12 +; GFX1250-NEXT: s_mul_hi_u32 s18, s9, s13 +; GFX1250-NEXT: s_add_co_u32 s6, s14, s17 +; GFX1250-NEXT: s_add_co_ci_u32 s6, s15, s16 +; GFX1250-NEXT: s_mul_i32 s12, s9, s13 ; GFX1250-NEXT: s_add_co_ci_u32 s13, s18, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[0:1], s[12:13] -; GFX1250-NEXT: v_add_co_u32 v0, s0, s4, s12 -; GFX1250-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1250-NEXT: s_add_co_ci_u32 s5, s5, s13 -; GFX1250-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1250-NEXT: s_mul_u64 s[6:7], s[6:7], s[4:5] +; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[6:7], s[12:13] +; GFX1250-NEXT: s_add_co_u32 s8, s8, s12 +; GFX1250-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_cmp_lg_u32 s6, 0 +; GFX1250-NEXT: s_add_co_ci_u32 s9, s9, s13 +; GFX1250-NEXT: s_mul_u64 s[10:11], s[10:11], s[8:9] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_mul_hi_u32 s13, s4, s7 -; GFX1250-NEXT: s_mul_i32 s12, s4, s7 -; GFX1250-NEXT: s_mul_hi_u32 s0, s4, s6 -; GFX1250-NEXT: s_mul_i32 s15, s5, s6 -; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[0:1], s[12:13] -; GFX1250-NEXT: s_mul_hi_u32 s14, s5, s6 -; GFX1250-NEXT: s_mul_hi_u32 s4, s5, s7 -; GFX1250-NEXT: s_add_co_u32 s0, s12, s15 -; GFX1250-NEXT: s_add_co_ci_u32 s0, s13, s14 -; GFX1250-NEXT: s_mul_i32 s6, s5, s7 -; 
GFX1250-NEXT: s_add_co_ci_u32 s7, s4, 0 +; GFX1250-NEXT: s_mul_hi_u32 s13, s8, s11 +; GFX1250-NEXT: s_mul_i32 s12, s8, s11 +; GFX1250-NEXT: s_mul_hi_u32 s6, s8, s10 +; GFX1250-NEXT: s_mul_i32 s15, s9, s10 +; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[6:7], s[12:13] +; GFX1250-NEXT: s_mul_hi_u32 s14, s9, s10 +; GFX1250-NEXT: s_mul_hi_u32 s16, s9, s11 +; GFX1250-NEXT: s_add_co_u32 s6, s12, s15 +; GFX1250-NEXT: s_add_co_ci_u32 s6, s13, s14 +; GFX1250-NEXT: s_mul_i32 s10, s9, s11 +; GFX1250-NEXT: s_add_co_ci_u32 s11, s16, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[0:1], s[6:7] -; GFX1250-NEXT: v_add_co_u32 v0, s0, v0, s6 -; GFX1250-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1250-NEXT: s_add_co_ci_u32 s0, s5, s7 -; GFX1250-NEXT: v_readfirstlane_b32 s7, v0 -; GFX1250-NEXT: s_mul_hi_u32 s5, s10, s0 -; GFX1250-NEXT: s_mul_i32 s4, s10, s0 -; GFX1250-NEXT: s_mul_hi_u32 s12, s11, s0 -; GFX1250-NEXT: s_mul_i32 s6, s11, s0 -; GFX1250-NEXT: s_mul_hi_u32 s0, s10, s7 -; GFX1250-NEXT: s_mul_i32 s13, s11, s7 -; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[0:1], s[4:5] -; GFX1250-NEXT: s_mul_hi_u32 s0, s11, s7 -; GFX1250-NEXT: s_add_co_u32 s4, s4, s13 -; GFX1250-NEXT: s_add_co_ci_u32 s0, s5, s0 -; GFX1250-NEXT: s_add_co_ci_u32 s7, s12, 0 +; GFX1250-NEXT: s_add_nc_u64 s[10:11], s[6:7], s[10:11] +; GFX1250-NEXT: s_add_co_u32 s8, s8, s10 +; GFX1250-NEXT: s_cselect_b32 s10, -1, 0 +; GFX1250-NEXT: s_mul_hi_u32 s6, s2, s8 +; GFX1250-NEXT: s_cmp_lg_u32 s10, 0 +; GFX1250-NEXT: s_mul_hi_u32 s12, s3, s8 +; GFX1250-NEXT: s_add_co_ci_u32 s10, s9, s11 +; GFX1250-NEXT: s_mul_i32 s11, s3, s8 +; GFX1250-NEXT: s_mul_hi_u32 s9, s2, s10 +; GFX1250-NEXT: s_mul_i32 s8, s2, s10 +; GFX1250-NEXT: s_mul_hi_u32 s13, s3, s10 +; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[6:7], s[8:9] +; GFX1250-NEXT: s_mul_i32 s10, s3, s10 +; GFX1250-NEXT: s_add_co_u32 s6, s8, s11 +; GFX1250-NEXT: s_add_co_ci_u32 s6, s9, s12 +; GFX1250-NEXT: s_add_co_ci_u32 s11, s13, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[0:1], s[6:7] -; GFX1250-NEXT: s_and_b64 s[6:7], s[4:5], lit64(0xffffffff00000000) +; GFX1250-NEXT: s_add_nc_u64 s[8:9], s[6:7], s[10:11] +; GFX1250-NEXT: s_and_b64 s[10:11], s[8:9], 0xffffffff00000000 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_or_b32 s6, s6, s4 -; GFX1250-NEXT: s_mul_u64 s[4:5], s[2:3], s[6:7] -; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[6:7], 2 -; GFX1250-NEXT: v_sub_co_u32 v0, s0, s10, s4 -; GFX1250-NEXT: s_sub_co_i32 s4, s11, s5 -; GFX1250-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1250-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v3, s15 -; GFX1250-NEXT: v_sub_co_u32 v1, s12, v0, s2 -; GFX1250-NEXT: s_sub_co_ci_u32 s4, s4, s3 -; GFX1250-NEXT: s_cmp_lg_u32 s12, 0 -; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[6:7], 1 -; GFX1250-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v1 -; GFX1250-NEXT: s_sub_co_ci_u32 s4, s4, 0 +; GFX1250-NEXT: s_or_b32 s10, s10, s8 +; GFX1250-NEXT: s_mul_u64 s[8:9], s[4:5], s[10:11] ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_ge_u32 s4, s3 -; GFX1250-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX1250-NEXT: s_sub_co_u32 s6, s2, s8 +; GFX1250-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1250-NEXT: s_sub_co_i32 s12, s3, s9 +; GFX1250-NEXT: s_cmp_lg_u32 s8, 0 +; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, s5 +; GFX1250-NEXT: s_sub_co_u32 s13, s6, s4 ; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 -; 
GFX1250-NEXT: s_cmp_eq_u32 s4, s3 -; GFX1250-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1250-NEXT: s_cmp_lg_u32 s0, 0 -; GFX1250-NEXT: v_cndmask_b32_e32 v1, s14, v1, vcc_lo -; GFX1250-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v0 -; GFX1250-NEXT: s_sub_co_ci_u32 s0, s11, s5 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_cmp_lg_u32 s14, 0 +; GFX1250-NEXT: s_sub_co_ci_u32 s12, s12, 0 +; GFX1250-NEXT: s_cmp_ge_u32 s12, s5 +; GFX1250-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1250-NEXT: s_cmp_ge_u32 s13, s4 +; GFX1250-NEXT: s_cselect_b32 s15, -1, 0 +; GFX1250-NEXT: s_cmp_eq_u32 s12, s5 +; GFX1250-NEXT: s_add_nc_u64 s[12:13], s[10:11], 1 +; GFX1250-NEXT: s_cselect_b32 s16, s15, s14 +; GFX1250-NEXT: s_add_nc_u64 s[14:15], s[10:11], 2 +; GFX1250-NEXT: s_cmp_lg_u32 s16, 0 +; GFX1250-NEXT: s_cselect_b32 s12, s14, s12 +; GFX1250-NEXT: s_cselect_b32 s13, s15, s13 +; GFX1250-NEXT: s_cmp_lg_u32 s8, 0 +; GFX1250-NEXT: s_sub_co_ci_u32 s3, s3, s9 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_cmp_ge_u32 s3, s5 +; GFX1250-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1250-NEXT: s_cmp_ge_u32 s6, s4 +; GFX1250-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1250-NEXT: s_cmp_eq_u32 s3, s5 +; GFX1250-NEXT: s_cselect_b32 s3, s6, s8 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: s_cmp_ge_u32 s0, s3 -; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX1250-NEXT: s_cselect_b32 s4, -1, 0 -; GFX1250-NEXT: s_cmp_eq_u32 s0, s3 -; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX1250-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_cndmask_b32_e64 v0, s4, v0, s0 -; GFX1250-NEXT: v_cndmask_b32_e32 v2, s12, v2, vcc_lo -; GFX1250-NEXT: v_cndmask_b32_e32 v1, s13, v3, vcc_lo -; GFX1250-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_cndmask_b32_e32 v1, s7, v1, vcc_lo -; GFX1250-NEXT: v_cndmask_b32_e32 v0, s6, v2, vcc_lo +; GFX1250-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1250-NEXT: s_cselect_b32 s9, s13, s11 +; GFX1250-NEXT: s_cselect_b32 s8, s12, s10 ; GFX1250-NEXT: s_cbranch_execnz .LBB16_3 ; GFX1250-NEXT: .LBB16_2: -; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX1250-NEXT: s_sub_co_i32 s1, 0, s2 +; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX1250-NEXT: s_sub_co_i32 s5, 0, s4 +; GFX1250-NEXT: s_mov_b32 s9, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) ; GFX1250-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1250-NEXT: v_nop ; GFX1250-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1250-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1250-NEXT: s_mul_i32 s1, s1, s0 +; GFX1250-NEXT: v_readfirstlane_b32 s3, v0 +; GFX1250-NEXT: s_mul_i32 s5, s5, s3 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_mul_hi_u32 s1, s0, s1 -; GFX1250-NEXT: s_add_co_i32 s0, s0, s1 +; GFX1250-NEXT: s_mul_hi_u32 s5, s3, s5 +; GFX1250-NEXT: s_add_co_i32 s3, s3, s5 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: s_mul_hi_u32 s0, s10, s0 -; GFX1250-NEXT: s_mul_i32 s1, s0, s2 -; GFX1250-NEXT: s_add_co_i32 s3, s0, 1 -; GFX1250-NEXT: s_sub_co_i32 s1, s10, s1 -; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: 
s_sub_co_i32 s4, s1, s2 -; GFX1250-NEXT: s_cmp_ge_u32 s1, s2 -; GFX1250-NEXT: s_cselect_b32 s0, s3, s0 -; GFX1250-NEXT: s_cselect_b32 s1, s4, s1 -; GFX1250-NEXT: s_add_co_i32 s3, s0, 1 -; GFX1250-NEXT: s_cmp_ge_u32 s1, s2 -; GFX1250-NEXT: s_mov_b32 s1, 0 -; GFX1250-NEXT: s_cselect_b32 s0, s3, s0 +; GFX1250-NEXT: s_mul_hi_u32 s3, s2, s3 +; GFX1250-NEXT: s_mul_i32 s5, s3, s4 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-NEXT: s_sub_co_i32 s2, s2, s5 +; GFX1250-NEXT: s_add_co_i32 s5, s3, 1 +; GFX1250-NEXT: s_sub_co_i32 s6, s2, s4 +; GFX1250-NEXT: s_cmp_ge_u32 s2, s4 +; GFX1250-NEXT: s_cselect_b32 s3, s5, s3 +; GFX1250-NEXT: s_cselect_b32 s2, s6, s2 +; GFX1250-NEXT: s_add_co_i32 s5, s3, 1 +; GFX1250-NEXT: s_cmp_ge_u32 s2, s4 +; GFX1250-NEXT: s_cselect_b32 s8, s5, s3 ; GFX1250-NEXT: .LBB16_3: +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[8:9] ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 -; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[8:9] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX1250-NEXT: s_endpgm ; GFX1250-NEXT: .LBB16_4: -; GFX1250-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX1250-NEXT: s_branch .LBB16_2 %result = udiv i64 %x, %y store i64 %result, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll index a92b99aa1e2c1..1c5f8c84e447d 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -173,11 +173,12 @@ ret: ; GCN-LABEL: {{^}}sink_ubfe_i64_span_midpoint: ; GCN: s_cbranch_scc{{[0-1]}} .LBB3_2 -; GCN: v_alignbit_b32 v[[LO:[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}, 30 -; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7f, v[[LO]] +; GCN: s_lshr_b64 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s[[[LO2:[0-9]+]]:[[HI2:[0-9]+]]], 30 +; GCN: s_and_b32 s{{[0-9]+}}, s[[LO]], 0x7f ; GCN: .LBB3_3: -; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xff, v[[LO]] +; GCN: s_lshr_b64 s[[[LO3:[0-9]+]]:[[HI3:[0-9]+]]], s[[[LO4:[0-9]+]]:[[HI4:[0-9]+]]], 30 +; GCN: s_and_b32 s{{[0-9]+}}, s[[LO3]], 0xff ; GCN: buffer_store_dwordx2 define amdgpu_kernel void @sink_ubfe_i64_span_midpoint(ptr addrspace(1) %out, i64 %arg1, i1 %arg) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll index f9fae025e0bf8..79b44d6a92caa 100644 --- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll +++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll @@ -686,7 +686,7 @@ define double @v_mul_f64_vop2_literal_64(double %x) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] -; GFX1250-NEXT: v_mul_f64_e32 v[0:1], lit64(0x405ec66666666666), v[0:1] ; encoding: [0xfe,0x00,0x00,0x0c,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40] +; GFX1250-NEXT: v_mul_f64_e32 v[0:1], 0x405ec66666666666, v[0:1] ; encoding: [0xfe,0x00,0x00,0x0c,0x66,0x66,0x66,0x66,0x66,0xc6,0x5e,0x40] ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %mul = fmul double %x, 123.1 ret double %mul @@ -788,7 +788,7 @@ define i64 @v_add_u64_vop2_literal_64(i64 %x) { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; encoding: [0x00,0x00,0xc8,0xbf] ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; encoding: [0x00,0x00,0xc7,0xbf] -; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0x112345678), v[0:1] ; encoding: 
[0xfe,0x00,0x00,0x50,0x78,0x56,0x34,0x12,0x01,0x00,0x00,0x00] +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 0x112345678, v[0:1] ; encoding: [0xfe,0x00,0x00,0x50,0x78,0x56,0x34,0x12,0x01,0x00,0x00,0x00] ; GFX1250-NEXT: s_set_pc_i64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe] %add = add i64 %x, 4600387192 ret i64 %add diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index f82bb59eb7906..be60a00145c8a 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -1012,7 +1012,7 @@ define amdgpu_kernel void @store_constant_adjacent_offsets() { ; ; GFX1250-LABEL: store_constant_adjacent_offsets: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: v_mov_b64_e32 v[0:1], lit64(0x7b0000007b) +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0x7b0000007b ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: ds_store_b64 v2, v[0:1] ; GFX1250-NEXT: s_endpgm @@ -1350,7 +1350,7 @@ define amdgpu_kernel void @write2_v2i32_align1_odd_offset() { ; ; GFX1250-LABEL: write2_v2i32_align1_odd_offset: ; GFX1250: ; %bb.0: ; %entry -; GFX1250-NEXT: v_mov_b64_e32 v[0:1], lit64(0x1c80000007b) +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], 0x1c80000007b ; GFX1250-NEXT: v_mov_b32_e32 v2, 0 ; GFX1250-NEXT: ds_store_b64 v2, v[0:1] offset:65 ; GFX1250-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll index ac30297770807..bcccf50e3805c 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll @@ -7,7 +7,7 @@ define amdgpu_cs void @amdgpu_cs() #0 { ; CHECK-LABEL: amdgpu_cs: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) +; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; CHECK-NEXT: s_cmp_lg_u32 0, s33 ; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0 @@ -19,7 +19,7 @@ define amdgpu_cs void @amdgpu_cs() #0 { define amdgpu_kernel void @kernel() #0 { ; CHECK-LABEL: kernel: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) +; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; CHECK-NEXT: s_cmp_lg_u32 0, s33 ; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0 @@ -31,7 +31,7 @@ define amdgpu_kernel void @kernel() #0 { define amdgpu_cs void @with_local() #0 { ; CHECK-TRUE16-LABEL: with_local: ; CHECK-TRUE16: ; %bb.0: -; CHECK-TRUE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) +; CHECK-TRUE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; CHECK-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13 ; CHECK-TRUE16-NEXT: s_cmp_lg_u32 0, s33 ; CHECK-TRUE16-NEXT: s_cmovk_i32 s33, 0x1c0 @@ -42,7 +42,7 @@ define amdgpu_cs void @with_local() #0 { ; ; CHECK-FAKE16-LABEL: with_local: ; CHECK-FAKE16: ; %bb.0: -; CHECK-FAKE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) +; CHECK-FAKE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; CHECK-FAKE16-NEXT: v_mov_b32_e32 v0, 13 ; CHECK-FAKE16-NEXT: s_cmp_lg_u32 0, s33 ; CHECK-FAKE16-NEXT: s_cmovk_i32 s33, 0x1c0 @@ -60,7 +60,7 @@ define amdgpu_cs void @with_local() #0 { define amdgpu_cs void @with_calls_inline_const() #0 { ; CHECK-TRUE16-LABEL: with_calls_inline_const: ; CHECK-TRUE16: ; %bb.0: -; CHECK-TRUE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) +; CHECK-TRUE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; CHECK-TRUE16-NEXT: v_mov_b16_e32 v0.l, 15 ; CHECK-TRUE16-NEXT: 
s_cmp_lg_u32 0, s33 ; CHECK-TRUE16-NEXT: s_mov_b32 s1, callee@abs32@hi @@ -76,7 +76,7 @@ define amdgpu_cs void @with_calls_inline_const() #0 { ; ; CHECK-FAKE16-LABEL: with_calls_inline_const: ; CHECK-FAKE16: ; %bb.0: -; CHECK-FAKE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) +; CHECK-FAKE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; CHECK-FAKE16-NEXT: v_mov_b32_e32 v0, 15 ; CHECK-FAKE16-NEXT: s_cmp_lg_u32 0, s33 ; CHECK-FAKE16-NEXT: s_mov_b32 s1, callee@abs32@hi @@ -100,7 +100,7 @@ define amdgpu_cs void @with_calls_inline_const() #0 { define amdgpu_cs void @with_calls_no_inline_const() #0 { ; CHECK-TRUE16-LABEL: with_calls_no_inline_const: ; CHECK-TRUE16: ; %bb.0: -; CHECK-TRUE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) +; CHECK-TRUE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; CHECK-TRUE16-NEXT: v_mov_b16_e32 v0.l, 15 ; CHECK-TRUE16-NEXT: s_cmp_lg_u32 0, s33 ; CHECK-TRUE16-NEXT: s_mov_b32 s1, callee@abs32@hi @@ -117,7 +117,7 @@ define amdgpu_cs void @with_calls_no_inline_const() #0 { ; ; CHECK-FAKE16-LABEL: with_calls_no_inline_const: ; CHECK-FAKE16: ; %bb.0: -; CHECK-FAKE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) +; CHECK-FAKE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; CHECK-FAKE16-NEXT: v_mov_b32_e32 v0, 15 ; CHECK-FAKE16-NEXT: s_cmp_lg_u32 0, s33 ; CHECK-FAKE16-NEXT: s_mov_b32 s1, callee@abs32@hi @@ -140,7 +140,7 @@ define amdgpu_cs void @with_calls_no_inline_const() #0 { define amdgpu_cs void @with_spills() #0 { ; CHECK-LABEL: with_spills: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) +; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; CHECK-NEXT: s_cmp_lg_u32 0, s33 ; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0 @@ -153,7 +153,7 @@ define amdgpu_cs void @with_spills() #0 { define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 { ; CHECK-LABEL: realign_stack: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) +; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; CHECK-NEXT: v_mov_b32_e32 v32, 0 ; CHECK-NEXT: s_cmp_lg_u32 0, s33 ; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi @@ -187,7 +187,7 @@ define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 { define amdgpu_cs void @frame_pointer_none() #1 { ; CHECK-TRUE16-LABEL: frame_pointer_none: ; CHECK-TRUE16: ; %bb.0: -; CHECK-TRUE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) +; CHECK-TRUE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; CHECK-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13 ; CHECK-TRUE16-NEXT: s_cmp_lg_u32 0, s33 ; CHECK-TRUE16-NEXT: s_cmovk_i32 s33, 0x1c0 @@ -198,7 +198,7 @@ define amdgpu_cs void @frame_pointer_none() #1 { ; ; CHECK-FAKE16-LABEL: frame_pointer_none: ; CHECK-FAKE16: ; %bb.0: -; CHECK-FAKE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) +; CHECK-FAKE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; CHECK-FAKE16-NEXT: v_mov_b32_e32 v0, 13 ; CHECK-FAKE16-NEXT: s_cmp_lg_u32 0, s33 ; CHECK-FAKE16-NEXT: s_cmovk_i32 s33, 0x1c0 @@ -214,7 +214,7 @@ define amdgpu_cs void @frame_pointer_none() #1 { define amdgpu_cs void @frame_pointer_all() #2 { ; CHECK-TRUE16-LABEL: frame_pointer_all: ; CHECK-TRUE16: ; %bb.0: -; CHECK-TRUE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) +; CHECK-TRUE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; CHECK-TRUE16-NEXT: v_mov_b16_e32 v0.l, 13 ; CHECK-TRUE16-NEXT: s_cmp_lg_u32 0, s33 ; CHECK-TRUE16-NEXT: s_cmovk_i32 s33, 0x1c0 @@ -225,7 +225,7 @@ define 
amdgpu_cs void @frame_pointer_all() #2 { ; ; CHECK-FAKE16-LABEL: frame_pointer_all: ; CHECK-FAKE16: ; %bb.0: -; CHECK-FAKE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 2) +; CHECK-FAKE16-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) ; CHECK-FAKE16-NEXT: v_mov_b32_e32 v0, 13 ; CHECK-FAKE16-NEXT: s_cmp_lg_u32 0, s33 ; CHECK-FAKE16-NEXT: s_cmovk_i32 s33, 0x1c0 diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll index e6f02295e67d5..d8a5e7fa3b029 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll @@ -11,17 +11,17 @@ define i32 @s_add_co_select_user() { ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e64 v0, s[4:5], s6, s6 +; GFX7-NEXT: s_add_u32 s7, s6, s6 +; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_or_b32 s4, s4, s5 ; GFX7-NEXT: s_cmp_lg_u32 s4, 0 -; GFX7-NEXT: s_addc_u32 s7, s6, 0 +; GFX7-NEXT: s_addc_u32 s8, s6, 0 ; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX7-NEXT: s_cselect_b32 s4, s7, 0 +; GFX7-NEXT: s_cselect_b32 s4, s8, 0 ; GFX7-NEXT: s_cmp_gt_u32 s6, 31 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 -; GFX7-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX7-NEXT: s_cselect_b32 s4, s7, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: s_add_co_select_user: @@ -30,16 +30,16 @@ define i32 @s_add_co_select_user() { ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e64 v0, s[4:5], s6, s6 +; GFX9-NEXT: s_add_u32 s7, s6, s6 +; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0 -; GFX9-NEXT: s_addc_u32 s7, s6, 0 +; GFX9-NEXT: s_addc_u32 s8, s6, 0 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX9-NEXT: s_cselect_b32 s4, s7, 0 +; GFX9-NEXT: s_cselect_b32 s4, s8, 0 ; GFX9-NEXT: s_cmp_gt_u32 s6, 31 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX9-NEXT: s_cselect_b32 s4, s7, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_add_co_select_user: @@ -48,15 +48,16 @@ define i32 @s_add_co_select_user() { ; GFX10-NEXT: s_mov_b64 s[4:5], 0 ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, s5, s4, s4 -; GFX10-NEXT: s_cmp_lg_u32 s5, 0 -; GFX10-NEXT: s_addc_u32 s5, s4, 0 +; GFX10-NEXT: s_add_u32 s5, s4, s4 ; GFX10-NEXT: s_cselect_b32 s6, -1, 0 -; GFX10-NEXT: s_and_b32 s6, s6, exec_lo -; GFX10-NEXT: s_cselect_b32 s5, s5, 0 +; GFX10-NEXT: s_cmp_lg_u32 s6, 0 +; GFX10-NEXT: s_addc_u32 s6, s4, 0 +; GFX10-NEXT: s_cselect_b32 s7, -1, 0 +; GFX10-NEXT: s_and_b32 s7, s7, exec_lo +; GFX10-NEXT: s_cselect_b32 s6, s6, 0 ; GFX10-NEXT: s_cmp_gt_u32 s4, 31 -; GFX10-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo +; GFX10-NEXT: s_cselect_b32 s4, s5, s6 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: s_add_co_select_user: @@ -65,16 +66,18 @@ define i32 @s_add_co_select_user() { ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, 
s1, s0, s0 -; GFX11-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-NEXT: s_addc_u32 s1, s0, 0 +; GFX11-NEXT: s_add_u32 s1, s0, s0 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, exec_lo -; GFX11-NEXT: s_cselect_b32 s1, s1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_cmp_lg_u32 s2, 0 +; GFX11-NEXT: s_addc_u32 s2, s0, 0 +; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_and_b32 s3, s3, exec_lo +; GFX11-NEXT: s_cselect_b32 s2, s2, 0 ; GFX11-NEXT: s_cmp_gt_u32 s0, 31 -; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, s1, v0, vcc_lo +; GFX11-NEXT: s_cselect_b32 s0, s1, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: %i = load volatile i32, ptr addrspace(4) null, align 8 @@ -98,14 +101,13 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_add_i32 s0, s2, s2 -; GFX7-NEXT: s_cmp_lt_u32 s0, s2 +; GFX7-NEXT: s_add_u32 s0, s2, s2 ; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_cmp_lg_u32 s0, 0 ; GFX7-NEXT: s_addc_u32 s0, s2, 0 -; GFX7-NEXT: v_cmp_ge_u32_e32 vcc, s0, v0 +; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX7-NEXT: s_cbranch_vccnz .LBB1_2 ; GFX7-NEXT: ; %bb.1: ; %bb0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 @@ -125,13 +127,12 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[8:9], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s0, s2, s2 -; GFX9-NEXT: s_cmp_lt_u32 s0, s2 +; GFX9-NEXT: s_add_u32 s0, s2, s2 ; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9-NEXT: s_addc_u32 s0, s2, 0 -; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s0, v0 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1] ; GFX9-NEXT: s_cbranch_vccnz .LBB1_2 ; GFX9-NEXT: ; %bb.1: ; %bb0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -151,13 +152,12 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_load_dword s0, s[8:9], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_i32 s1, s0, s0 -; GFX10-NEXT: s_cmp_lt_u32 s1, s0 +; GFX10-NEXT: s_add_u32 s1, s0, s0 ; GFX10-NEXT: s_cselect_b32 s1, -1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 ; GFX10-NEXT: s_cmp_lg_u32 s1, 0 ; GFX10-NEXT: s_addc_u32 s0, s0, 0 -; GFX10-NEXT: v_cmp_ge_u32_e32 vcc_lo, s0, v0 +; GFX10-NEXT: s_cselect_b32 s0, -1, 0 +; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s0 ; GFX10-NEXT: s_cbranch_vccnz .LBB1_2 ; GFX10-NEXT: ; %bb.1: ; %bb0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -177,15 +177,13 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) { ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_add_i32 s1, s0, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_cmp_lt_u32 s1, s0 +; GFX11-NEXT: s_add_u32 s1, s0, s0 ; GFX11-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_cmp_lg_u32 
s1, 0 ; GFX11-NEXT: s_addc_u32 s0, s0, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_ge_u32_e32 vcc_lo, s0, v0 +; GFX11-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX11-NEXT: s_cbranch_vccnz .LBB1_2 ; GFX11-NEXT: ; %bb.1: ; %bb0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll new file mode 100644 index 0000000000000..d747fb7cce7dc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll @@ -0,0 +1,1347 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250 %s + +declare bfloat @llvm.fabs.bf16(bfloat) #0 +declare bfloat @llvm.canonicalize.bf16(bfloat) #0 +declare <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat>) #0 +declare <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat>) #0 +declare <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat>) #0 +declare <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat>) #0 +declare <6 x bfloat> @llvm.canonicalize.v6bf16(<6 x bfloat>) #0 +declare <8 x bfloat> @llvm.canonicalize.v8bf16(<8 x bfloat>) #0 +declare <12 x bfloat> @llvm.canonicalize.v12bf16(<12 x bfloat>) #0 +declare <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat>) #0 +declare <32 x bfloat> @llvm.canonicalize.v32bf16(<32 x bfloat>) #0 +declare <64 x bfloat> @llvm.canonicalize.v64bf16(<64 x bfloat>) #0 +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +define amdgpu_kernel void @test_fold_canonicalize_undef_value_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_undef_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat undef) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_canonicalize_var_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: global_store_b16 v[0:1], v0, off +; GFX1250-NEXT: s_endpgm + %val = load bfloat, ptr addrspace(1) %out + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val) + store bfloat %canonicalized, ptr addrspace(1) poison + ret void +} + +define amdgpu_kernel void @s_test_canonicalize_var_bf16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 { +; GFX1250-LABEL: s_test_canonicalize_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e64 v0, s2, s2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %val = bitcast i16 %val.arg 
to bfloat + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define <2 x bfloat> @v_test_canonicalize_build_vector_v2bf16(bfloat %lo, bfloat %hi) #1 { +; GFX1250-LABEL: v_test_canonicalize_build_vector_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %ins0 = insertelement <2 x bfloat> poison, bfloat %lo, i32 0 + %ins1 = insertelement <2 x bfloat> %ins0, bfloat %hi, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %ins1) + ret <2 x bfloat> %canonicalized +} + + +define amdgpu_kernel void @v_test_canonicalize_fabs_var_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_fabs_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %val = load bfloat, ptr addrspace(1) %out + %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val) + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fabs) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + + +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %val = load bfloat, ptr addrspace(1) %out + %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val) + %val.fabs.fneg = fneg bfloat %val.fabs + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fabs.fneg) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_fneg_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %val = load bfloat, ptr addrspace(1) %out + %val.fneg = fneg bfloat %val + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fneg) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #2 { +; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %val = load bfloat, ptr addrspace(1) %out + %val.fneg = fneg bfloat %val + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fneg) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #2 { +; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %val = load bfloat, ptr addrspace(1) %out + %val.fabs = call bfloat @llvm.fabs.bf16(bfloat %val) + %val.fabs.fneg = fneg bfloat %val.fabs + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat %val.fabs.fneg) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_p0_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_p0_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0.0) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_n0_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_n0_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -0.0) + store bfloat %canonicalized, ptr 
addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_p1_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_p1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 1.0) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_n1_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_n1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbf80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat -1.0) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_literal_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_literal_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4180 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 16.0) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal0_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR03FF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #3 { +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR03FF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR83FF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #3 { +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR83FF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_qnan_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_qnan_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7C00) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 -1 to bfloat)) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat bitcast (i16 -2 to bfloat)) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan0_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7C01) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan1_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xR7DFF) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan2_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffdff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xRFDFF) + store 
bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan3_value_bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffc01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call bfloat @llvm.canonicalize.bf16(bfloat 0xRFC01) + store bfloat %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_canonicalize_var_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x bfloat>, ptr addrspace(1) %gep + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_fabs_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x bfloat>, ptr addrspace(1) %gep + %val.fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %val) + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val.fabs) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x bfloat>, ptr addrspace(1) %gep + %val.fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %val) + %val.fabs.fneg = fneg <2 x bfloat> %val.fabs + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val.fabs.fneg) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: v_test_canonicalize_fneg_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %out, i32 %tid + %val = load <2 x bfloat>, ptr addrspace(1) %gep + %fneg.val = fneg <2 x bfloat> %val + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %fneg.val) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @s_test_canonicalize_var_v2bf16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 { +; GFX1250-LABEL: s_test_canonicalize_var_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_and_b32 s3, s2, 0xffff0000 +; GFX1250-NEXT: s_lshl_b32 s2, s2, 16 +; GFX1250-NEXT: v_max_num_f32_e64 v0, s3, s3 +; GFX1250-NEXT: v_max_num_f32_e64 v1, s2, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v1, v0 +; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %val = bitcast i32 %val.arg to <2 x bfloat> + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %val) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + 
+define amdgpu_kernel void @test_fold_canonicalize_p0_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_p0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> zeroinitializer) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_n0_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_n0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat -0.0, bfloat -0.0>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_p1_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_p1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f803f80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 1.0, bfloat 1.0>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_n1_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_n1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbf80bf80 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat -1.0, bfloat -1.0>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_literal_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_literal_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41804180 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 16.0, bfloat 16.0>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR03FF, bfloat 0xR03FF>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2bf16(ptr addrspace(1) %out) #3 { +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +;
GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR03FF, bfloat 0xR03FF>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR83FF, bfloat 0xR83FF>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2bf16(ptr addrspace(1) %out) #3 { +; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR83FF, bfloat 0xR83FF>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_qnan_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_qnan_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR7C00, bfloat 0xR7C00>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> bitcast (i32 -1 to <2 x bfloat>)) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> ) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan0_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c017c01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +;
GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR7C01, bfloat 0xR7C01>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan1_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff7dff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xR7DFF, bfloat 0xR7DFF>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan2_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfdfffdff +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xRFDFF, bfloat 0xRFDFF>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: test_fold_canonicalize_snan3_value_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfc01fc01 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> <bfloat 0xRFC01, bfloat 0xRFC01>) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define <3 x bfloat> @v_test_canonicalize_var_v3bf16(<3 x bfloat> %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v3bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: v_max_num_f32_e32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %canonicalized = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> %val) + ret <3 x bfloat> %canonicalized +} + +define <4 x bfloat> @v_test_canonicalize_var_v4bf16(<4 x bfloat> %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) |
instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v3 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %val) + ret <4 x bfloat> %canonicalized +} + +define amdgpu_kernel void @s_test_canonicalize_undef_v2bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: s_test_canonicalize_undef_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) + store <2 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define <2 x bfloat> @v_test_canonicalize_reg_undef_v2bf16(bfloat %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_reg_undef_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %vec = insertelement <2 x bfloat> poison, bfloat %val, i32 0 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} + +define <2 x bfloat> @v_test_canonicalize_undef_reg_v2bf16(bfloat %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_undef_reg_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, v0, s0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %vec = insertelement <2 x bfloat> poison, bfloat %val, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} + +define <2 x bfloat> @v_test_canonicalize_undef_lo_imm_hi_v2bf16() #1 { +; GFX1250-LABEL: v_test_canonicalize_undef_lo_imm_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 1.0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %vec = insertelement <2 x bfloat> undef, bfloat 1.0, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} + +define <2 x bfloat> @v_test_canonicalize_imm_lo_undef_hi_v2bf16() #1 { +; GFX1250-LABEL: v_test_canonicalize_imm_lo_undef_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0x3f80 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %vec = insertelement <2 x bfloat> undef, bfloat 1.0, i32 0 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} + +define <2 x bfloat> @v_test_canonicalize_undef_lo_k_hi_v2bf16() #1 { +; GFX1250-LABEL: 
v_test_canonicalize_undef_lo_k_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0x41800000 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %vec = insertelement <2 x bfloat> undef, bfloat 16.0, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} + +define <2 x bfloat> @v_test_canonicalize_k_lo_undef_hi_v2bf16() #1 { +; GFX1250-LABEL: v_test_canonicalize_k_lo_undef_hi_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0x4180 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %vec = insertelement <2 x bfloat> undef, bfloat 16.0, i32 0 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec) + ret <2 x bfloat> %canonicalized +} + +define <2 x bfloat> @v_test_canonicalize_reg_k_v2bf16(bfloat %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_reg_k_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x4000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %vec0 = insertelement <2 x bfloat> poison, bfloat %val, i32 0 + %vec1 = insertelement <2 x bfloat> %vec0, bfloat 2.0, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec1) + ret <2 x bfloat> %canonicalized +} + +define <2 x bfloat> @v_test_canonicalize_k_reg_v2bf16(bfloat %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_k_reg_v2bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x4000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, v0, s0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %vec0 = insertelement <2 x bfloat> poison, bfloat 2.0, i32 0 + %vec1 = insertelement <2 x bfloat> %vec0, bfloat %val, i32 1 + %canonicalized = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> %vec1) + ret <2 x bfloat> %canonicalized +} + +define amdgpu_kernel void @s_test_canonicalize_undef_v4bf16(ptr addrspace(1) %out) #1 { +; GFX1250-LABEL: s_test_canonicalize_undef_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b64 v0, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm + %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef) + store <4 x bfloat> %canonicalized, ptr addrspace(1) %out + ret void +} + +define <4 x bfloat> @v_test_canonicalize_reg_undef_undef_undef_v4bf16(bfloat %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-NEXT: v_dual_mov_b32 v1, 0x7fc07fc0 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %vec = insertelement <4 x bfloat> poison, bfloat %val, i32 0 + %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec) + ret <4 x bfloat> %canonicalized +} + +define <4 x bfloat> @v_test_canonicalize_reg_reg_undef_undef_v4bf16(bfloat %val0, bfloat %val1) #1 { +; GFX1250-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0x7fc07fc0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %vec0 = insertelement <4 x bfloat> poison, bfloat %val0, i32 0 + %vec1 = insertelement <4 x bfloat> %vec0, bfloat %val1, i32 1 + %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec1) + ret <4 x bfloat> %canonicalized +} + +define <4 x bfloat> @v_test_canonicalize_reg_undef_reg_reg_v4bf16(bfloat %val0, bfloat %val1, bfloat %val2) #1 { +; GFX1250-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v1, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_movk_i32 s0, 0x7fc0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_perm_b32 v0, s0, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %vec0 = insertelement <4 x bfloat> poison, bfloat %val0, i32 0 + %vec1 = insertelement <4 x bfloat> %vec0, bfloat %val1, i32 2 + %vec2 = insertelement <4 x bfloat> %vec1, bfloat %val2, i32 3 + %canonicalized = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> %vec2) + ret <4 x bfloat> %canonicalized +} + +define <6 x bfloat> @v_test_canonicalize_var_v6bf16(<6 x bfloat> %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v6bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_max_num_f32 v0, 
v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v4 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %canonicalized = call <6 x bfloat> @llvm.canonicalize.v6bf16(<6 x bfloat> %val) + ret <6 x bfloat> %canonicalized +} + +define <8 x bfloat> @v_test_canonicalize_var_v8bf16(<8 x bfloat> %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v8bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 +; GFX1250-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v6, v6, v6 +; GFX1250-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_max_num_f32 v0, v0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_max_num_f32_e32 v3, v3, v3 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v7 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v6 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v4 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %canonicalized = call <8 x bfloat> @llvm.canonicalize.v8bf16(<8 x bfloat> %val) + ret <8 x bfloat> %canonicalized +} + +define <12 x bfloat> @v_test_canonicalize_var_v12bf16(<12 x bfloat> %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v12bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v5, v5, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_max_num_f32 v8, v8, v8 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v10, v10, v10 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v11, v11, v11 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v11 +; 
GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v9 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v8 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v7 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v6 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %canonicalized = call <12 x bfloat> @llvm.canonicalize.v12bf16(<12 x bfloat> %val) + ret <12 x bfloat> %canonicalized +} + +define <16 x bfloat> @v_test_canonicalize_var_v16bf16(<16 x bfloat> %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v16bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v8, 0xffff0000, v7 +; GFX1250-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v6 +; GFX1250-NEXT: v_and_b32_e32 v10, 0xffff0000, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_max_num_f32 v8, v8, v8 :: v_dual_lshlrev_b32 v5, 16, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v7, v7, v7 :: v_dual_lshlrev_b32 v6, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v14, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: v_max_num_f32_e32 v6, v6, v6 +; GFX1250-NEXT: v_dual_max_num_f32 v10, v10, v10 :: v_dual_max_num_f32 v5, v5, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v11, v11, v11 :: v_dual_max_num_f32 v12, v12, v12 +; GFX1250-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v14, v14, v14 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v15, v15, v15 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v15 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v14 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v13 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v12 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v11 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v10 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v6, v6, v9 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v7, v7, v8 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %canonicalized = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> %val) + ret <16 x bfloat> %canonicalized +} + +define <32 x bfloat> @v_test_canonicalize_var_v32bf16(<32 x bfloat> %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v32bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v16, 0xffff0000, v15 +; GFX1250-NEXT: v_and_b32_e32 v18, 0xffff0000, v13 +; GFX1250-NEXT: v_and_b32_e32 v20, 0xffff0000, v11 +; GFX1250-NEXT: v_and_b32_e32 v22, 0xffff0000, v9 +; GFX1250-NEXT: v_and_b32_e32 v24, 0xffff0000, v7 +; GFX1250-NEXT: v_dual_max_num_f32 v16, v16, v16 :: v_dual_lshlrev_b32 v15, 16, v15 +; GFX1250-NEXT: v_and_b32_e32 v17, 0xffff0000, v14 +; GFX1250-NEXT: 
v_dual_lshlrev_b32 v14, 16, v14 :: v_dual_lshlrev_b32 v13, 16, v13 +; GFX1250-NEXT: v_max_num_f32_e32 v18, v18, v18 +; GFX1250-NEXT: v_and_b32_e32 v19, 0xffff0000, v12 +; GFX1250-NEXT: v_dual_lshlrev_b32 v12, 16, v12 :: v_dual_lshlrev_b32 v11, 16, v11 +; GFX1250-NEXT: v_max_num_f32_e32 v20, v20, v20 +; GFX1250-NEXT: v_and_b32_e32 v21, 0xffff0000, v10 +; GFX1250-NEXT: v_dual_lshlrev_b32 v10, 16, v10 :: v_dual_lshlrev_b32 v9, 16, v9 +; GFX1250-NEXT: v_max_num_f32_e32 v22, v22, v22 +; GFX1250-NEXT: v_and_b32_e32 v23, 0xffff0000, v8 +; GFX1250-NEXT: v_dual_lshlrev_b32 v8, 16, v8 :: v_dual_lshlrev_b32 v7, 16, v7 +; GFX1250-NEXT: v_max_num_f32_e32 v24, v24, v24 +; GFX1250-NEXT: v_and_b32_e32 v25, 0xffff0000, v6 +; GFX1250-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v26, 0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v27, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v28, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v29, 0xffff0000, v2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v4 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v30, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v31, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_dual_max_num_f32 v15, v15, v15 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v17, v17, v17 :: v_dual_max_num_f32 v14, v14, v14 +; GFX1250-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v19, v19, v19 +; GFX1250-NEXT: v_dual_max_num_f32 v12, v12, v12 :: v_dual_max_num_f32 v11, v11, v11 +; GFX1250-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_max_num_f32 v10, v10, v10 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v23, v23, v23 +; GFX1250-NEXT: v_dual_max_num_f32 v8, v8, v8 :: v_dual_max_num_f32 v7, v7, v7 +; GFX1250-NEXT: v_dual_max_num_f32 v25, v25, v25 :: v_dual_max_num_f32 v6, v6, v6 +; GFX1250-NEXT: v_dual_max_num_f32 v26, v26, v26 :: v_dual_max_num_f32 v5, v5, v5 +; GFX1250-NEXT: v_dual_max_num_f32 v27, v27, v27 :: v_dual_max_num_f32 v28, v28, v28 +; GFX1250-NEXT: v_dual_max_num_f32 v29, v29, v29 :: v_dual_max_num_f32 v30, v30, v30 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v31, v31, v31 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v2, v2, v2 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v31 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v30 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v29 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v28 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v27 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v26 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v6, v6, v25 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v7, v7, v24 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v8, v8, v23 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v9, v9, v22 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v10, v10, v21 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v11, v11, v20 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v12, v12, v19 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v13, v13, v18 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v14, v14, v17 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v15, v15, v16 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %canonicalized = call <32 x bfloat> @llvm.canonicalize.v32bf16(<32 x bfloat> %val) + ret <32 x bfloat> %canonicalized +} + 
+define <64 x bfloat> @v_test_canonicalize_var_v64bf16(<64 x bfloat> %val) #1 { +; GFX1250-LABEL: v_test_canonicalize_var_v64bf16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v31, off, s32 +; GFX1250-NEXT: v_and_b32_e32 v81, 0xffff0000, v0 +; GFX1250-NEXT: v_and_b32_e32 v38, 0xffff0000, v24 +; GFX1250-NEXT: v_lshlrev_b32_e32 v24, 16, v24 +; GFX1250-NEXT: v_and_b32_e32 v39, 0xffff0000, v23 +; GFX1250-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX1250-NEXT: v_and_b32_e32 v80, 0xffff0000, v6 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v6 +; GFX1250-NEXT: v_and_b32_e32 v82, 0xffff0000, v1 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: v_max_num_f32_e32 v81, v81, v81 +; GFX1250-NEXT: v_and_b32_e32 v83, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-NEXT: v_and_b32_e32 v34, 0xffff0000, v28 +; GFX1250-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX1250-NEXT: v_and_b32_e32 v35, 0xffff0000, v27 +; GFX1250-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX1250-NEXT: v_and_b32_e32 v36, 0xffff0000, v26 +; GFX1250-NEXT: v_lshlrev_b32_e32 v26, 16, v26 +; GFX1250-NEXT: v_and_b32_e32 v48, 0xffff0000, v22 +; GFX1250-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v82, v82, v82 +; GFX1250-NEXT: v_dual_max_num_f32 v1, v1, v1 :: v_dual_max_num_f32 v83, v83, v83 +; GFX1250-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v24, v24, v24 +; GFX1250-NEXT: v_max_num_f32_e32 v39, v39, v39 +; GFX1250-NEXT: v_dual_max_num_f32 v23, v23, v23 :: v_dual_max_num_f32 v48, v48, v48 +; GFX1250-NEXT: v_and_b32_e32 v32, 0xffff0000, v30 +; GFX1250-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX1250-NEXT: v_and_b32_e32 v33, 0xffff0000, v29 +; GFX1250-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX1250-NEXT: v_and_b32_e32 v37, 0xffff0000, v25 +; GFX1250-NEXT: v_dual_lshlrev_b32 v25, 16, v25 :: v_dual_lshlrev_b32 v22, 16, v22 +; GFX1250-NEXT: v_and_b32_e32 v49, 0xffff0000, v21 +; GFX1250-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX1250-NEXT: v_and_b32_e32 v50, 0xffff0000, v20 +; GFX1250-NEXT: v_lshlrev_b32_e32 v20, 16, v20 +; GFX1250-NEXT: v_and_b32_e32 v51, 0xffff0000, v19 +; GFX1250-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX1250-NEXT: v_and_b32_e32 v52, 0xffff0000, v18 +; GFX1250-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX1250-NEXT: v_and_b32_e32 v53, 0xffff0000, v17 +; GFX1250-NEXT: v_lshlrev_b32_e32 v17, 16, v17 +; GFX1250-NEXT: v_and_b32_e32 v54, 0xffff0000, v16 +; GFX1250-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX1250-NEXT: v_and_b32_e32 v55, 0xffff0000, v15 +; GFX1250-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX1250-NEXT: v_and_b32_e32 v64, 0xffff0000, v14 +; GFX1250-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX1250-NEXT: v_and_b32_e32 v65, 0xffff0000, v13 +; GFX1250-NEXT: v_lshlrev_b32_e32 v13, 16, v13 +; GFX1250-NEXT: v_and_b32_e32 v66, 0xffff0000, v12 +; GFX1250-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX1250-NEXT: v_and_b32_e32 v67, 0xffff0000, v11 +; GFX1250-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX1250-NEXT: v_and_b32_e32 v68, 0xffff0000, v10 +; GFX1250-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX1250-NEXT: v_and_b32_e32 v69, 0xffff0000, v9 +; GFX1250-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX1250-NEXT: v_and_b32_e32 v70, 0xffff0000, v8 +; GFX1250-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX1250-NEXT: v_and_b32_e32 v71, 0xffff0000, v7 +; GFX1250-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v81 +; GFX1250-NEXT: v_and_b32_e32 v81, 
0xffff0000, v5 +; GFX1250-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, v82 +; GFX1250-NEXT: v_and_b32_e32 v82, 0xffff0000, v4 +; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v2, v2, v83 +; GFX1250-NEXT: v_and_b32_e32 v83, 0xffff0000, v3 +; GFX1250-NEXT: v_dual_max_num_f32 v32, v32, v32 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX1250-NEXT: v_dual_max_num_f32 v27, v27, v27 :: v_dual_max_num_f32 v36, v36, v36 +; GFX1250-NEXT: v_dual_max_num_f32 v26, v26, v26 :: v_dual_max_num_f32 v37, v37, v37 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v23, v23, v39 +; GFX1250-NEXT: v_dual_max_num_f32 v30, v30, v30 :: v_dual_max_num_f32 v33, v33, v33 +; GFX1250-NEXT: v_dual_max_num_f32 v29, v29, v29 :: v_dual_max_num_f32 v34, v34, v34 +; GFX1250-NEXT: v_dual_max_num_f32 v28, v28, v28 :: v_dual_max_num_f32 v35, v35, v35 +; GFX1250-NEXT: v_dual_max_num_f32 v25, v25, v25 :: v_dual_max_num_f32 v38, v38, v38 +; GFX1250-NEXT: v_dual_max_num_f32 v22, v22, v22 :: v_dual_max_num_f32 v49, v49, v49 +; GFX1250-NEXT: v_dual_max_num_f32 v21, v21, v21 :: v_dual_max_num_f32 v50, v50, v50 +; GFX1250-NEXT: v_dual_max_num_f32 v20, v20, v20 :: v_dual_max_num_f32 v51, v51, v51 +; GFX1250-NEXT: v_dual_max_num_f32 v19, v19, v19 :: v_dual_max_num_f32 v52, v52, v52 +; GFX1250-NEXT: v_dual_max_num_f32 v18, v18, v18 :: v_dual_max_num_f32 v53, v53, v53 +; GFX1250-NEXT: v_dual_max_num_f32 v17, v17, v17 :: v_dual_max_num_f32 v54, v54, v54 +; GFX1250-NEXT: v_dual_max_num_f32 v16, v16, v16 :: v_dual_max_num_f32 v55, v55, v55 +; GFX1250-NEXT: v_dual_max_num_f32 v15, v15, v15 :: v_dual_max_num_f32 v64, v64, v64 +; GFX1250-NEXT: v_dual_max_num_f32 v14, v14, v14 :: v_dual_max_num_f32 v65, v65, v65 +; GFX1250-NEXT: v_dual_max_num_f32 v13, v13, v13 :: v_dual_max_num_f32 v66, v66, v66 +; GFX1250-NEXT: v_dual_max_num_f32 v12, v12, v12 :: v_dual_max_num_f32 v67, v67, v67 +; GFX1250-NEXT: v_dual_max_num_f32 v11, v11, v11 :: v_dual_max_num_f32 v68, v68, v68 +; GFX1250-NEXT: v_dual_max_num_f32 v10, v10, v10 :: v_dual_max_num_f32 v69, v69, v69 +; GFX1250-NEXT: v_dual_max_num_f32 v9, v9, v9 :: v_dual_max_num_f32 v70, v70, v70 +; GFX1250-NEXT: v_dual_max_num_f32 v8, v8, v8 :: v_dual_max_num_f32 v71, v71, v71 +; GFX1250-NEXT: v_dual_max_num_f32 v80, v80, v80 :: v_dual_max_num_f32 v81, v81, v81 +; GFX1250-NEXT: v_dual_max_num_f32 v82, v82, v82 :: v_dual_max_num_f32 v83, v83, v83 +; GFX1250-NEXT: v_dual_max_num_f32 v3, v3, v3 :: v_dual_max_num_f32 v4, v4, v4 +; GFX1250-NEXT: v_dual_max_num_f32 v5, v5, v5 :: v_dual_max_num_f32 v6, v6, v6 +; GFX1250-NEXT: v_max_num_f32_e32 v7, v7, v7 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v26, v26, v36 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v3, v3, v83 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v4, v4, v82 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v5, v5, v81 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v6, v6, v80 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v7, v7, v71 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v8, v8, v70 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v9, v9, v69 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v10, v10, v68 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v11, v11, v67 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v12, v12, v66 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v13, v13, v65 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v14, v14, v64 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v15, v15, v55 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v16, v16, v54 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v17, v17, v53 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v18, v18, v52 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v19, v19, v51 +; GFX1250-NEXT: 
v_cvt_pk_bf16_f32 v20, v20, v50 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v21, v21, v49 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v22, v22, v48 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v24, v24, v38 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v25, v25, v37 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v27, v27, v35 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v28, v28, v34 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v29, v29, v33 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v30, v30, v32 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v39, 0xffff0000, v31 +; GFX1250-NEXT: v_lshlrev_b32_e32 v31, 16, v31 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_max_num_f32_e32 v36, v39, v39 +; GFX1250-NEXT: v_max_num_f32_e32 v31, v31, v31 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v31, v31, v36 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %canonicalized = call <64 x bfloat> @llvm.canonicalize.v64bf16(<64 x bfloat> %val) + ret <64 x bfloat> %canonicalized +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #3 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll index 6a898fa799f3e..30bcdf97e26fd 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll @@ -231,22 +231,13 @@ define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) { ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11TRUE16-LABEL: v_copysign_bf16_f32: -; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v1 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11FAKE16-LABEL: v_copysign_bf16_f32: -; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: v_copysign_bf16_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %sign = fptrunc float %sign.f32 to bfloat %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) ret bfloat %op @@ -298,22 +289,13 @@ define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) { ; GFX10-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11TRUE16-LABEL: v_copysign_bf16_f64: -; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v2 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11FAKE16-LABEL: v_copysign_bf16_f64: -; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: v_copysign_bf16_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %sign = fptrunc double %sign.f64 to bfloat %op = call bfloat @llvm.copysign.bf16(bfloat %mag, bfloat %sign) ret bfloat %op @@ -499,9 +481,10 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f ; ; GFX11TRUE16-LABEL: s_copysign_bf16_f32: ; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s1 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s1 +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 @@ -575,9 +558,10 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign. ; ; GFX11TRUE16-LABEL: s_copysign_bf16_f64: ; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s2 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s2 +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 @@ -1153,12 +1137,12 @@ define amdgpu_ps i32 @s_copysign_v2bf16(<2 x bfloat> inreg %arg_mag, <2 x bfloat define amdgpu_ps <3 x i16> @s_copysign_v3bf16(<3 x bfloat> inreg %arg_mag, <3 x bfloat> inreg %arg_sign) { ; GCN-LABEL: s_copysign_v3bf16: ; GCN: ; %bb.0: -; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s5 +; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s3 ; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s3 -; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s2 +; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s5 +; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s0 ; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s1 -; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s0 +; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2 ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 @@ -1166,45 +1150,45 @@ define amdgpu_ps <3 x i16> @s_copysign_v3bf16(<3 x bfloat> inreg %arg_mag, <3 x ; GCN-NEXT: v_bfe_u32 v4, v4, 16, 15 ; GCN-NEXT: v_bfe_u32 v3, v3, 16, 15 ; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2 -; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; GCN-NEXT: v_and_b32_e32 v6, 0x8000, v1 ; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0 -; GCN-NEXT: v_or_b32_e32 v2, v5, v2 -; GCN-NEXT: v_or_b32_e32 v1, v4, v1 -; GCN-NEXT: v_or_b32_e32 v0, v3, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_or_b32_e32 v2, v2, v1 -; GCN-NEXT: v_alignbit_b32 v1, v0, v1, 16 -; GCN-NEXT: v_readfirstlane_b32 s1, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v2 -; GCN-NEXT: v_readfirstlane_b32 s2, v0 +; GCN-NEXT: v_or_b32_e32 v1, v5, v2 +; GCN-NEXT: v_or_b32_e32 v2, v4, v6 +; GCN-NEXT: v_or_b32_e32 v3, v3, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GCN-NEXT: v_or_b32_e32 v4, 
v3, v0 +; GCN-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 +; GCN-NEXT: v_readfirstlane_b32 s0, v4 +; GCN-NEXT: v_readfirstlane_b32 s1, v2 +; GCN-NEXT: v_readfirstlane_b32 s2, v1 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_copysign_v3bf16: ; GFX7: ; %bb.0: -; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s4 -; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s5 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3 -; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1 +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s3 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s4 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2 -; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s0 -; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 +; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s5 +; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0 +; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s2 +; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 15 -; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15 -; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v1 -; GFX7-NEXT: v_alignbit_b32 v1, v0, v1, 16 -; GFX7-NEXT: v_readfirstlane_b32 s1, v1 -; GFX7-NEXT: v_readfirstlane_b32 s0, v2 -; GFX7-NEXT: v_readfirstlane_b32 s2, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7-NEXT: v_or_b32_e32 v4, v2, v0 +; GFX7-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 +; GFX7-NEXT: v_readfirstlane_b32 s0, v4 +; GFX7-NEXT: v_readfirstlane_b32 s1, v2 +; GFX7-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_copysign_v3bf16: @@ -3677,9 +3661,10 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f32(bfloat inreg %mag, f ; ; GFX11TRUE16-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32: ; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s1 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s1 +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; @@ -3744,9 +3729,10 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f64(bfloat inreg %mag, d ; ; GFX11TRUE16-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64: ; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s2 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s2 +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; @@ -6700,15 +6686,16 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64(<3 x bfloat> %m ; GFX11TRUE16-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: ; 
GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v5 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v4 +; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v3 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff0000, v1, v3 -; GFX11TRUE16-NEXT: v_bfi_b32 v2, 0x7fff0000, v2, v7 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v4 ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11FAKE16-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f64: diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 574c1042859aa..62847b15d3443 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -776,22 +776,13 @@ define half @v_copysign_out_f16_mag_f16_sign_f32(half %mag, float %sign) { ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: v_copysign_out_f16_mag_f16_sign_f32: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: v_copysign_out_f16_mag_f16_sign_f32: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %sign.trunc = fptrunc float %sign to half %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc) ret half %out @@ -823,22 +814,13 @@ define half @v_copysign_out_f16_mag_f16_sign_f64(half %mag, double %sign) { ; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: v_copysign_out_f16_mag_f16_sign_f64: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: v_copysign_out_f16_mag_f16_sign_f64: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] %sign.trunc = fptrunc double %sign to half %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc) ret half %out @@ -1450,29 +1432,29 @@ define amdgpu_ps <3 x i16> @s_copysign_v3f16(<3 x half> inreg %arg_mag, <3 x hal ; SI: ; %bb.0: ; SI-NEXT: v_cvt_f16_f32_e32 v2, s4 ; SI-NEXT: v_cvt_f16_f32_e32 v3, s1 -; SI-NEXT: v_cvt_f16_f32_e32 v0, s5 -; SI-NEXT: v_cvt_f16_f32_e32 v1, s2 -; SI-NEXT: v_cvt_f16_f32_e32 v4, s3 -; SI-NEXT: v_cvt_f16_f32_e32 v5, s0 +; SI-NEXT: v_cvt_f16_f32_e32 v4, s5 +; SI-NEXT: v_cvt_f16_f32_e32 v5, s2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s3 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SI-NEXT: s_brev_b32 s0, -2 ; SI-NEXT: v_bfi_b32 v2, s0, v3, v2 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_bfi_b32 v3, s0, v5, v4 ; SI-NEXT: v_bfi_b32 v0, s0, v1, v0 -; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; SI-NEXT: v_or_b32_e32 v2, v3, v1 -; SI-NEXT: v_alignbit_b32 v1, v0, v1, 16 -; SI-NEXT: v_readfirstlane_b32 s1, v1 -; SI-NEXT: v_readfirstlane_b32 s0, v2 -; SI-NEXT: v_readfirstlane_b32 s2, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v3 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16 +; SI-NEXT: v_or_b32_e32 v4, v4, v0 +; SI-NEXT: v_readfirstlane_b32 s0, v4 +; SI-NEXT: v_readfirstlane_b32 s1, v2 +; SI-NEXT: v_readfirstlane_b32 s2, v1 ; SI-NEXT: ; return to shader part epilog ; ; VI-LABEL: s_copysign_v3f16: @@ -2832,9 +2814,10 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f32(half inreg %mag, float ; ; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f16_sign_f32: ; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ -2883,9 +2866,10 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f64(half inreg %mag, doubl ; ; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f16_sign_f64: ; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, s2 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-TRUE16-NEXT: ; return to shader part epilog ; @@ 
-5590,15 +5574,16 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f16_sign_v3f64(<3 x half> %mag, <3 ; GFX11-TRUE16-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff0000, v0, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v7 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff0000, v1, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff0000, v2, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64: diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index 210e09fd9169a..7f6a920d25016 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -137,33 +137,31 @@ define amdgpu_kernel void @v_fdiv_f16( ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc +; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] glc dlc +; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] glc dlc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.l -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v3, v3 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v2.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v3 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v4, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v0 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v3 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v4, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v5, v0 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v4, v2 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v5, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v1.l, v0.l -; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: v_fdiv_f16: diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 605026614c614..1e7855ccb3642 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -723,7 +723,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] @@ -1065,7 +1065,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] @@ -1586,7 +1586,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] @@ -1946,7 +1946,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] @@ -2483,7 +2483,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG: ; %bb.0: ; 
GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] @@ -2847,7 +2847,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] @@ -3386,7 +3386,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] @@ -3750,7 +3750,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] @@ -4289,7 +4289,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] @@ -4653,7 +4653,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] @@ -5192,7 +5192,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: 
v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] @@ -5556,7 +5556,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] @@ -6057,7 +6057,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] @@ -6405,7 +6405,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] @@ -6898,7 +6898,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] @@ -7246,7 +7246,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] @@ -7739,7 +7739,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; 
GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] @@ -8087,7 +8087,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] @@ -8580,7 +8580,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] @@ -8928,7 +8928,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] @@ -9480,7 +9480,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] @@ -9864,7 +9864,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] @@ -10382,7 +10382,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] @@ -10750,7 +10750,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] @@ -11264,7 +11264,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] @@ -11644,7 +11644,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0xffffffffffffff80 ; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1] ; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 4eaa1965c66f1..fc8883924dfbc 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -4152,7 +4152,8 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { ; GFX942-LABEL: store_load_i64_aligned: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], 15 +; GFX942-NEXT: v_mov_b32_e32 v2, 15 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 @@ -4262,7 +4263,8 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX942-LABEL: store_load_i64_unaligned: ; GFX942: ; %bb.0: ; %bb ; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942-NEXT: v_mov_b64_e32 v[2:3], 15 +; GFX942-NEXT: v_mov_b32_e32 v2, 15 +; GFX942-NEXT: v_mov_b32_e32 v3, 0 ; GFX942-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index e74ad3d62bea4..47161954cc332 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -8946,8 +8946,7 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) { ; GCN1-NEXT: .LBB141_1: ; %atomicrmw.start ; GCN1-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4 -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, 1, v4 ; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc @@ -8971,8 +8970,7 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) { ; GCN2-NEXT: .LBB141_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4 -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, 1, v4 ; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc @@ -8996,9 +8994,8 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) { ; GCN3-NEXT: .LBB141_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4 ; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 -; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc @@ -9027,8 +9024,7 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: .LBB142_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4 -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, 1, v4 ; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc @@ -9054,8 +9050,7 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: .LBB142_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4 -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, 1, v4 ; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc @@ -9079,9 +9074,8 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: .LBB142_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4 ; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 -; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc @@ -9110,8 +9104,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) { ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v4, v3 -; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4 -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, 1, v4 ; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc @@ -9136,8 +9129,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) { ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v4, v3 -; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4 -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, 1, v4 ; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc @@ -9162,9 +9154,8 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) { ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4 ; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 -; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc @@ -9194,8 +9185,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v1, v0 -; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v1 -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, 1, v1 ; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -9221,8 +9211,7 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v1, v0 -; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v1 -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, 1, v1 ; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -9246,9 +9235,8 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) { ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v4, v3 -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4 ; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 -; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc @@ -9279,8 +9267,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: .LBB145_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3 -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, 1, v3 ; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -9307,8 +9294,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: .LBB145_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3 -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, 1, v3 ; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -9335,9 +9321,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: .LBB145_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN3-NEXT: v_subrev_co_u32_e32 v2, vcc, 1, v3 ; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 -; GCN3-NEXT: v_add_u32_e32 v2, -1, v3 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9369,8 +9354,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg ; GCN1-NEXT: .LBB146_1: ; %atomicrmw.start ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3 -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, 1, v3 ; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -9399,8 +9383,7 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg ; GCN2-NEXT: .LBB146_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3 -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, 1, v3 ; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -9427,9 +9410,8 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg ; GCN3-NEXT: .LBB146_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN3-NEXT: v_subrev_co_u32_e32 v2, vcc, 1, v3 ; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 -; GCN3-NEXT: v_add_u32_e32 v2, -1, v3 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc @@ -9463,8 +9445,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v5, v0 -; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v5 -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, 1, v5 ; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc @@ -9493,8 +9474,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v5, v0 -; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v5 -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, 1, v5 ; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc @@ -9523,9 +9503,8 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v5, v0 -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v5 ; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 -; GCN3-NEXT: v_add_u32_e32 v0, -1, v5 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GCN3-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc ; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc @@ -9557,8 +9536,7 @@ define amdgpu_gfx i32 
@flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v5, v0 -; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v5 -; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, 1, v5 ; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 ; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc @@ -9587,8 +9565,7 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v5, v0 -; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v5 -; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, 1, v5 ; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 ; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc @@ -9617,9 +9594,8 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v5, v0 -; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v5 ; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 -; GCN3-NEXT: v_add_u32_e32 v0, -1, v5 ; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GCN3-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc ; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] offset:16 glc diff --git a/llvm/test/CodeGen/AMDGPU/fma-mix.gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/fma-mix.gfx11plus.ll new file mode 100644 index 0000000000000..1ba13b287be46 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fma-mix.gfx11plus.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11-REAL16 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -stop-after=amdgpu-isel | FileCheck %s --check-prefixes=GFX11-FAKE16 + +; Make sure no "vgpr32 = copy vgpr16" is generated + +define amdgpu_kernel void @fma_mix_f16 (ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) { + ; GFX11-REAL16-LABEL: name: fma_mix_f16 + ; GFX11-REAL16: bb.0.entry: + ; GFX11-REAL16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 + ; GFX11-REAL16-NEXT: {{ $}} + ; GFX11-REAL16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GFX11-REAL16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX11-REAL16-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.a.kernarg.offset, align 4, addrspace 4) + ; GFX11-REAL16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1 + ; GFX11-REAL16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0 + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1 + ; GFX11-REAL16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3 + ; GFX11-REAL16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2 + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1 + ; GFX11-REAL16-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5 + ; GFX11-REAL16-NEXT: 
[[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4 + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1 + ; GFX11-REAL16-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7 + ; GFX11-REAL16-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6 + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1 + ; GFX11-REAL16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-REAL16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023 + ; GFX11-REAL16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec + ; GFX11-REAL16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2 + ; GFX11-REAL16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec + ; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep1, addrspace 1) + ; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE1]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep2, addrspace 1) + ; GFX11-REAL16-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 killed [[REG_SEQUENCE2]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.in.gep3, addrspace 1) + ; GFX11-REAL16-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 14336, 0, implicit $exec + ; GFX11-REAL16-NEXT: [[V_ADD_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_ADD_F16_t16_e64 0, killed [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], 0, killed [[V_MOV_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-REAL16-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX11-REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX11-REAL16-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_ADD_F16_t16_e64_]], %subreg.lo16, killed [[DEF]], %subreg.hi16 + ; GFX11-REAL16-NEXT: [[V_FMA_MIX_F16_t16_:%[0-9]+]]:vgpr_16 = nofpexcept V_FMA_MIX_F16_t16 0, killed [[GLOBAL_LOAD_DWORD_SADDR]], 0, killed [[GLOBAL_LOAD_DWORD_SADDR1]], 8, killed [[REG_SEQUENCE4]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-REAL16-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 killed [[V_MOV_B32_e32_]], killed [[V_FMA_MIX_F16_t16_]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s16) into %ir.4, addrspace 1) + ; GFX11-REAL16-NEXT: S_ENDPGM 0 + ; + ; GFX11-FAKE16-LABEL: name: fma_mix_f16 + ; GFX11-FAKE16: bb.0.entry: + ; GFX11-FAKE16-NEXT: liveins: $vgpr0, $sgpr4_sgpr5 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s256) from %ir.a.kernarg.offset, align 4, addrspace 4) + ; GFX11-FAKE16-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub1 + ; GFX11-FAKE16-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub0 + ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], 
%subreg.sub1 + ; GFX11-FAKE16-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub3 + ; GFX11-FAKE16-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub2 + ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1 + ; GFX11-FAKE16-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub5 + ; GFX11-FAKE16-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub4 + ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1 + ; GFX11-FAKE16-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub7 + ; GFX11-FAKE16-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX8_IMM]].sub6 + ; GFX11-FAKE16-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1 + ; GFX11-FAKE16-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023 + ; GFX11-FAKE16-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]](s32), killed [[S_MOV_B32_]], implicit $exec + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2 + ; GFX11-FAKE16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 killed [[S_MOV_B32_1]], killed [[V_AND_B32_e64_]], implicit $exec + ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep1, addrspace 1) + ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_DWORD_SADDR1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR killed [[REG_SEQUENCE1]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s32) from %ir.in.gep2, addrspace 1) + ; GFX11-FAKE16-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR killed [[REG_SEQUENCE2]], [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s16) from %ir.in.gep3, addrspace 1) + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 14336 + ; GFX11-FAKE16-NEXT: [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, killed [[GLOBAL_LOAD_USHORT_SADDR]], 0, killed [[S_MOV_B32_2]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX11-FAKE16-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[DEF]] + ; GFX11-FAKE16-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 0, killed [[GLOBAL_LOAD_DWORD_SADDR]], 0, killed [[GLOBAL_LOAD_DWORD_SADDR1]], 8, killed [[V_ADD_F16_fake16_e64_]], 0, [[COPY10]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: GLOBAL_STORE_SHORT_SADDR killed [[V_MOV_B32_e32_]], killed [[V_FMA_MIXLO_F16_]], killed [[REG_SEQUENCE3]], 0, 0, implicit $exec :: (store (s16) into %ir.4, addrspace 1) + ; GFX11-FAKE16-NEXT: S_ENDPGM 0 +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %in.gep1 = getelementptr i32, ptr addrspace(1) %a, i32 %tid + %in.gep2 = getelementptr i32, ptr addrspace(1) %b, i32 %tid + %in.gep3 = getelementptr i32, ptr addrspace(1) %c, i32 %tid + %load.a = load float, ptr addrspace(1) %in.gep1 + %load.b = load float, ptr addrspace(1) %in.gep2 + %load.c = load half, ptr addrspace(1) %in.gep3 + %add.c = fadd half %load.c, 0.5 + %load.float.c = fpext half %add.c to float + %result = tail call float @llvm.fmuladd.f32(float %load.a, float %load.b, float 
%load.float.c) + %half = fptrunc float %result to half + store half %half, ptr addrspace(1) %out + ret void +} + diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index 462d7748b86cd..b14e8c44ffcce 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -581,145 +581,63 @@ define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c ret { half, half } %insert.1 } -; This one asserted with -enable-no-signed-zeros-fp-math -define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #0 { -; SI-SAFE-LABEL: fneg_fadd_0_f16: -; SI-SAFE: ; %bb.0: ; %.entry -; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-SAFE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0 -; SI-SAFE-NEXT: v_rcp_f32_e32 v3, v2 -; SI-SAFE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-SAFE-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; SI-SAFE-NEXT: v_fma_f32 v3, v5, v3, v3 -; SI-SAFE-NEXT: v_mul_f32_e32 v5, v4, v3 -; SI-SAFE-NEXT: v_fma_f32 v6, -v2, v5, v4 -; SI-SAFE-NEXT: v_fma_f32 v5, v6, v3, v5 -; SI-SAFE-NEXT: v_fma_f32 v2, -v2, v5, v4 -; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-SAFE-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; SI-SAFE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; SI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0 -; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; SI-SAFE-NEXT: ; return to shader part epilog -; -; SI-NSZ-LABEL: fneg_fadd_0_f16: -; SI-NSZ: ; %bb.0: ; %.entry -; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0 -; SI-NSZ-NEXT: v_rcp_f32_e32 v3, v2 -; SI-NSZ-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NSZ-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; SI-NSZ-NEXT: v_fma_f32 v3, v5, v3, v3 -; SI-NSZ-NEXT: v_mul_f32_e32 v5, v4, v3 -; SI-NSZ-NEXT: v_fma_f32 v6, -v2, v5, v4 -; SI-NSZ-NEXT: v_fma_f32 v5, v6, v3, v5 -; SI-NSZ-NEXT: v_fma_f32 v2, -v2, v5, v4 -; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NSZ-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; SI-NSZ-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 -; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1 -; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; SI-NSZ-NEXT: ; return to shader part epilog -; -; VI-SAFE-LABEL: fneg_fadd_0_f16: -; VI-SAFE: ; %bb.0: ; %.entry -; VI-SAFE-NEXT: v_rcp_f16_e32 v0, s1 -; VI-SAFE-NEXT: v_mov_b32_e32 v1, s0 -; VI-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0 -; VI-SAFE-NEXT: v_add_f16_e32 v0, 0, v0 -; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, s0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-SAFE-NEXT: 
v_mov_b32_e32 v1, 0x7e00 -; VI-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-SAFE-NEXT: ; return to shader part epilog -; -; VI-NSZ-LABEL: fneg_fadd_0_f16: -; VI-NSZ: ; %bb.0: ; %.entry -; VI-NSZ-NEXT: v_rcp_f16_e32 v0, s1 -; VI-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; VI-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 -; VI-NSZ-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7e00 -; VI-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-NSZ-NEXT: ; return to shader part epilog -; -; GFX11-SAFE-LABEL: fneg_fadd_0_f16: -; GFX11-SAFE: ; %bb.0: ; %.entry -; GFX11-SAFE-NEXT: v_rcp_f16_e32 v0, s1 -; GFX11-SAFE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_add_f16_e32 v0, 0, v0 -; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo -; GFX11-SAFE-NEXT: ; return to shader part epilog -; -; GFX11-NSZ-LABEL: fneg_fadd_0_f16: -; GFX11-NSZ: ; %bb.0: ; %.entry -; GFX11-NSZ-NEXT: v_rcp_f16_e32 v0, s1 -; GFX11-NSZ-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0 -; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo -; GFX11-NSZ-NEXT: ; return to shader part epilog -; GFX11-SAFE-TRUE16-LABEL: fneg_fadd_0_f16: -; GFX11-SAFE-TRUE16: ; %bb.0: ; %.entry -; GFX11-SAFE-TRUE16-NEXT: v_rcp_f16_e32 v0.l, s1 -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0, v0.l -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, 0, v0.l -; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0.l -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0/*Invalid register, operand has 'VS_16' register class*/, s0, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: ; return to shader part epilog -; GFX11-NSZ-TRUE16-LABEL: fneg_fadd_0_f16: -; GFX11-NSZ-TRUE16: ; %bb.0: ; %.entry -; GFX11-NSZ-TRUE16-NEXT: v_rcp_f16_e32 v0.l, s1 -; GFX11-NSZ-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x8000, v0.l -; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-TRUE16-NEXT: v_cmp_nlt_f16_e64 s1, -v0.l, s0 -; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, s1 -; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-TRUE16-NEXT: 
v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l -; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo -; GFX11-NSZ-TRUE16-NEXT: ; return to shader part epilog +define amdgpu_ps half @fneg_fadd_0_safe_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #0 { +; SI-LABEL: fneg_fadd_0_safe_f16: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0 +; SI-NEXT: v_rcp_f32_e32 v3, v2 +; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; SI-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v5, v3, v3 +; SI-NEXT: v_mul_f32_e32 v5, v4, v3 +; SI-NEXT: v_fma_f32 v6, -v2, v5, v4 +; SI-NEXT: v_fma_f32 v5, v6, v3, v5 +; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; SI-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; SI-NEXT: v_mad_f32 v0, v0, 0, 0 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: fneg_fadd_0_safe_f16: +; VI: ; %bb.0: ; %.entry +; VI-NEXT: v_rcp_f16_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mul_f16_e32 v0, 0, v0 +; VI-NEXT: v_add_f16_e32 v0, 0, v0 +; VI-NEXT: v_xor_b32_e32 v2, 0x8000, v0 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, s0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x7e00 +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: fneg_fadd_0_safe_f16: +; GFX11: ; %bb.0: ; %.entry +; GFX11-NEXT: v_rcp_f16_e32 v0, s1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f16_e32 v0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v0, 0, v0 +; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v0 +; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo +; GFX11-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv half 1.000000e+00, %tmp6 %tmp8 = fmul half 0.000000e+00, %tmp7 @@ -733,108 +651,51 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x ret half %.i198 } -; This is a workaround because -enable-no-signed-zeros-fp-math does not set up -; function attribute unsafe-fp-math automatically. Combine with the previous test -; when that is done. 
define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #2 { -; SI-SAFE-LABEL: fneg_fadd_0_nsz_f16: -; SI-SAFE: ; %bb.0: ; %.entry -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-SAFE-NEXT: s_brev_b32 s0, 1 -; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, 0, v0 -; SI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; SI-SAFE-NEXT: ; return to shader part epilog -; -; SI-NSZ-LABEL: fneg_fadd_0_nsz_f16: -; SI-NSZ: ; %bb.0: ; %.entry -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-NSZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_rcp_f32_e32 v0, v0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 -; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1 -; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; SI-NSZ-NEXT: ; return to shader part epilog -; -; VI-SAFE-LABEL: fneg_fadd_0_nsz_f16: -; VI-SAFE: ; %bb.0: ; %.entry -; VI-SAFE-NEXT: v_mov_b32_e32 v0, 0x8000 -; VI-SAFE-NEXT: v_mov_b32_e32 v1, s0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e64 vcc, s0, 0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7e00 -; VI-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-SAFE-NEXT: ; return to shader part epilog -; -; VI-NSZ-LABEL: fneg_fadd_0_nsz_f16: -; VI-NSZ: ; %bb.0: ; %.entry -; VI-NSZ-NEXT: v_rcp_f16_e32 v0, s1 -; VI-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; VI-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 -; VI-NSZ-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7e00 -; VI-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-NSZ-NEXT: ; return to shader part epilog -; -; GFX11-SAFE-LABEL: fneg_fadd_0_nsz_f16: -; GFX11-SAFE: ; %bb.0: ; %.entry -; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 vcc_lo, s0, 0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x8000, v0, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo -; GFX11-SAFE-NEXT: ; return to shader part epilog -; -; GFX11-NSZ-LABEL: fneg_fadd_0_nsz_f16: -; GFX11-NSZ: ; %bb.0: ; %.entry -; GFX11-NSZ-NEXT: v_rcp_f16_e32 v0, s1 -; GFX11-NSZ-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0 -; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo -; GFX11-NSZ-NEXT: ; return to shader part epilog -; GFX11-SAFE-TRUE16-LABEL: fneg_fadd_0_nsz_f16: -; GFX11-SAFE-TRUE16: ; %bb.0: ; %.entry -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, s0, 0 -; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x8000 -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, s1 -; 
GFX11-SAFE-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: ; return to shader part epilog -; GFX11-NSZ-TRUE16-LABEL: fneg_fadd_0_nsz_f16: -; GFX11-NSZ-TRUE16: ; %bb.0: ; %.entry -; GFX11-NSZ-TRUE16-NEXT: v_rcp_f16_e32 v0.l, s1 -; GFX11-NSZ-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x8000, v0.l -; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-TRUE16-NEXT: v_cmp_nlt_f16_e64 s1, -v0.l, s0 -; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, s1 -; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l -; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo -; GFX11-NSZ-TRUE16-NEXT: ; return to shader part epilog +; SI-LABEL: fneg_fadd_0_nsz_f16: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 +; SI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_rcp_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: fneg_fadd_0_nsz_f16: +; VI: ; %bb.0: ; %.entry +; VI-NEXT: v_rcp_f16_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mul_f16_e32 v0, 0x8000, v0 +; VI-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x7e00 +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: fneg_fadd_0_nsz_f16: +; GFX11: ; %bb.0: ; %.entry +; GFX11-NEXT: v_rcp_f16_e32 v0, s1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f16_e32 v0, 0x8000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo +; GFX11-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv afn half 1.000000e+00, %tmp6 %tmp8 = fmul contract half 0.000000e+00, %tmp7 %tmp9 = fmul reassoc nnan arcp contract half 0.000000e+00, %tmp8 - %.i188 = fadd nnan ninf contract half %tmp9, 0.000000e+00 + %.i188 = fadd nsz half %tmp9, 0.000000e+00 %tmp10 = fcmp uge half %.i188, %tmp2 %tmp11 = fneg half %.i188 %.i092 = select i1 %tmp10, half %tmp2, half %tmp11 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index ba34e9245f39c..aaea4f76ea49b 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -880,102 +880,54 @@ define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(ptr addrspace(1) %out } ; This one asserted with -enable-no-signed-zeros-fp-math -define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 { -; SI-SAFE-LABEL: fneg_fadd_0: -; SI-SAFE: ; %bb.0: ; %.entry -; SI-SAFE-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; SI-SAFE-NEXT: v_rcp_f32_e32 v1, v0 -; SI-SAFE-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0 -; SI-SAFE-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 3 -; SI-SAFE-NEXT: v_fma_f32 v3, -v0, v1, 1.0 -; SI-SAFE-NEXT: v_fma_f32 v1, v3, v1, v1 -; SI-SAFE-NEXT: v_mul_f32_e32 v3, v2, v1 -; SI-SAFE-NEXT: v_fma_f32 v4, -v0, v3, v2 -; SI-SAFE-NEXT: v_fma_f32 v3, v4, v1, v3 -; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v3, v2 -; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-SAFE-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; SI-SAFE-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; SI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0 -; SI-SAFE-NEXT: v_mov_b32_e32 v1, s0 -; SI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; SI-SAFE-NEXT: ; return to shader part epilog -; -; SI-NSZ-LABEL: fneg_fadd_0: -; SI-NSZ: ; %bb.0: ; %.entry -; SI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; SI-NSZ-NEXT: v_rcp_f32_e32 v1, v0 -; SI-NSZ-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0 -; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NSZ-NEXT: v_fma_f32 v3, -v0, v1, 1.0 -; SI-NSZ-NEXT: v_fma_f32 v1, v3, v1, v1 -; SI-NSZ-NEXT: v_mul_f32_e32 v3, v2, v1 -; SI-NSZ-NEXT: v_fma_f32 v4, -v0, v3, v2 -; SI-NSZ-NEXT: v_fma_f32 v3, v4, v1, v3 -; SI-NSZ-NEXT: v_fma_f32 v0, -v0, v3, v2 -; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; SI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 -; SI-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; SI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; SI-NSZ-NEXT: ; return to shader part epilog -; -; VI-SAFE-LABEL: fneg_fadd_0: -; VI-SAFE: ; %bb.0: ; %.entry -; VI-SAFE-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; VI-SAFE-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0 -; VI-SAFE-NEXT: v_rcp_f32_e32 v2, v0 -; VI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-SAFE-NEXT: v_fma_f32 v3, -v0, v2, 1.0 -; VI-SAFE-NEXT: v_fma_f32 v2, v3, v2, v2 -; VI-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2 -; VI-SAFE-NEXT: v_fma_f32 v4, -v0, v3, v1 -; VI-SAFE-NEXT: v_fma_f32 v3, v4, v2, v3 -; VI-SAFE-NEXT: v_fma_f32 v0, -v0, v3, v1 -; VI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-SAFE-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; VI-SAFE-NEXT: v_mov_b32_e32 v2, s0 -; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; VI-SAFE-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; VI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0 -; VI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc -; VI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-SAFE-NEXT: ; return to shader part epilog -; -; VI-NSZ-LABEL: fneg_fadd_0: -; VI-NSZ: ; %bb.0: ; %.entry -; VI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; VI-NSZ-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0 -; VI-NSZ-NEXT: v_rcp_f32_e32 v2, v0 -; VI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NSZ-NEXT: v_fma_f32 v3, -v0, v2, 1.0 -; VI-NSZ-NEXT: v_fma_f32 v2, v3, v2, v2 -; VI-NSZ-NEXT: v_mul_f32_e32 v3, v1, v2 -; VI-NSZ-NEXT: v_fma_f32 v4, -v0, v3, v1 -; VI-NSZ-NEXT: v_fma_f32 v3, v4, v2, v3 -; VI-NSZ-NEXT: v_fma_f32 v0, -v0, v3, v1 -; VI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; 
VI-NSZ-NEXT: v_mov_b32_e32 v2, s0 -; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; VI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 -; VI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc -; VI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-NSZ-NEXT: ; return to shader part epilog +define amdgpu_ps float @fneg_fadd_0_safe(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 { +; SI-LABEL: fneg_fadd_0_safe: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 +; SI-NEXT: v_rcp_f32_e32 v1, v0 +; SI-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; SI-NEXT: v_fma_f32 v3, -v0, v1, 1.0 +; SI-NEXT: v_fma_f32 v1, v3, v1, v1 +; SI-NEXT: v_mul_f32_e32 v3, v2, v1 +; SI-NEXT: v_fma_f32 v4, -v0, v3, v2 +; SI-NEXT: v_fma_f32 v3, v4, v1, v3 +; SI-NEXT: v_fma_f32 v0, -v0, v3, v2 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_fmas_f32 v0, v0, v1, v3 +; SI-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 +; SI-NEXT: v_mad_f32 v0, v0, 0, 0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: fneg_fadd_0_safe: +; VI: ; %bb.0: ; %.entry +; VI-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 +; VI-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v3, -v0, v2, 1.0 +; VI-NEXT: v_fma_f32 v2, v3, v2, v2 +; VI-NEXT: v_mul_f32_e32 v3, v1, v2 +; VI-NEXT: v_fma_f32 v4, -v0, v3, v1 +; VI-NEXT: v_fma_f32 v3, v4, v2, v3 +; VI-NEXT: v_fma_f32 v0, -v0, v3, v1 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v0, v0, v2, v3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; VI-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 +; VI-NEXT: v_mad_f32 v0, v0, 0, 0 +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc +; VI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; VI-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv float 1.000000e+00, %tmp6 %tmp8 = fmul float 0.000000e+00, %tmp7 @@ -989,39 +941,23 @@ define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i ret float %.i198 } -; This is a workaround because -enable-no-signed-zeros-fp-math does not set up -; function attribute unsafe-fp-math automatically. Combine with the previous test -; when that is done. 
-define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 { -; GCN-SAFE-LABEL: fneg_fadd_0_nsz: -; GCN-SAFE: ; %bb.0: ; %.entry -; GCN-SAFE-NEXT: v_rcp_f32_e32 v0, s1 -; GCN-SAFE-NEXT: v_mov_b32_e32 v1, s0 -; GCN-SAFE-NEXT: v_mul_f32_e32 v0, 0, v0 -; GCN-SAFE-NEXT: v_add_f32_e32 v0, 0, v0 -; GCN-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; GCN-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; GCN-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GCN-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; GCN-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GCN-SAFE-NEXT: ; return to shader part epilog -; -; GCN-NSZ-LABEL: fneg_fadd_0_nsz: -; GCN-NSZ: ; %bb.0: ; %.entry -; GCN-NSZ-NEXT: v_rcp_f32_e32 v0, s1 -; GCN-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 -; GCN-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; GCN-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GCN-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GCN-NSZ-NEXT: ; return to shader part epilog +define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr { +; GCN-LABEL: fneg_fadd_0_nsz: +; GCN: ; %bb.0: ; %.entry +; GCN-NEXT: v_rcp_f32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mul_f32_e32 v0, 0, v0 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; GCN-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv afn float 1.000000e+00, %tmp6 %tmp8 = fmul float 0.000000e+00, %tmp7 %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8 - %.i188 = fadd float %tmp9, 0.000000e+00 + %.i188 = fadd nsz float %tmp9, 0.000000e+00 %tmp10 = fcmp uge float %.i188, %tmp2 %tmp11 = fneg float %.i188 %.i092 = select i1 %tmp10, float %tmp2, float %tmp11 @@ -5079,7 +5015,7 @@ define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(ptr addrspace(1) %out %a.gep = getelementptr inbounds double, ptr addrspace(1) %a.ptr, i64 %tid.ext %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i64 %tid.ext %a = load volatile double, ptr addrspace(1) %a.gep - %fneg.a = fsub double -0.000000e+00, %a + %fneg.a = fsub nsz double -0.000000e+00, %a %fpround = fptrunc double %fneg.a to float %fneg = fneg float %fpround store float %fneg, ptr addrspace(1) %out.gep @@ -8072,3 +8008,6 @@ attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" attributes #1 = { nounwind readnone } attributes #2 = { nounwind "unsafe-fp-math"="true" } attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN-NSZ: {{.*}} +; GCN-SAFE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index e687745469014..3de6df211ac7c 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -175,103 +175,54 @@ define { float, float } @v_fneg_add_multi_use_fneg_x_f32(float %a, float %b, flo ret { float, float } %insert.1 } -; This one asserted with -enable-no-signed-zeros-fp-math -define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) #0 { -; SI-SAFE-LABEL: fneg_fadd_0_f32: -; SI-SAFE: ; %bb.0: ; %.entry -; SI-SAFE-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; SI-SAFE-NEXT: v_rcp_f32_e32 v1, v0 -; SI-SAFE-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0 -; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-SAFE-NEXT: v_fma_f32 v3, -v0, v1, 1.0 -; SI-SAFE-NEXT: v_fma_f32 v1, v3, v1, v1 -; SI-SAFE-NEXT: v_mul_f32_e32 v3, v2, v1 -; SI-SAFE-NEXT: v_fma_f32 v4, -v0, v3, v2 -; SI-SAFE-NEXT: v_fma_f32 v3, v4, v1, v3 -; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v3, v2 -; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-SAFE-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; SI-SAFE-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; SI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0 -; SI-SAFE-NEXT: v_mov_b32_e32 v1, s0 -; SI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; SI-SAFE-NEXT: ; return to shader part epilog -; -; SI-NSZ-LABEL: fneg_fadd_0_f32: -; SI-NSZ: ; %bb.0: ; %.entry -; SI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; SI-NSZ-NEXT: v_rcp_f32_e32 v1, v0 -; SI-NSZ-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0 -; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NSZ-NEXT: v_fma_f32 v3, -v0, v1, 1.0 -; SI-NSZ-NEXT: v_fma_f32 v1, v3, v1, v1 -; SI-NSZ-NEXT: v_mul_f32_e32 v3, v2, v1 -; SI-NSZ-NEXT: v_fma_f32 v4, -v0, v3, v2 -; SI-NSZ-NEXT: v_fma_f32 v3, v4, v1, v3 -; SI-NSZ-NEXT: v_fma_f32 v0, -v0, v3, v2 -; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; SI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 -; SI-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; SI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; SI-NSZ-NEXT: ; return to shader part epilog -; -; VI-SAFE-LABEL: fneg_fadd_0_f32: -; VI-SAFE: ; %bb.0: ; %.entry -; VI-SAFE-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; VI-SAFE-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0 -; VI-SAFE-NEXT: v_rcp_f32_e32 v2, v0 -; VI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-SAFE-NEXT: v_fma_f32 v3, -v0, v2, 1.0 -; VI-SAFE-NEXT: v_fma_f32 v2, v3, v2, v2 -; VI-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2 -; VI-SAFE-NEXT: v_fma_f32 v4, -v0, v3, v1 -; VI-SAFE-NEXT: v_fma_f32 v3, v4, v2, v3 -; VI-SAFE-NEXT: v_fma_f32 v0, -v0, v3, v1 -; VI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-SAFE-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; VI-SAFE-NEXT: v_mov_b32_e32 v2, s0 -; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; VI-SAFE-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; VI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0 -; 
VI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc -; VI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-SAFE-NEXT: ; return to shader part epilog +define amdgpu_ps float @fneg_fadd_0_safe_f32(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) #0 { +; SI-LABEL: fneg_fadd_0_safe_f32: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 +; SI-NEXT: v_rcp_f32_e32 v1, v0 +; SI-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; SI-NEXT: v_fma_f32 v3, -v0, v1, 1.0 +; SI-NEXT: v_fma_f32 v1, v3, v1, v1 +; SI-NEXT: v_mul_f32_e32 v3, v2, v1 +; SI-NEXT: v_fma_f32 v4, -v0, v3, v2 +; SI-NEXT: v_fma_f32 v3, v4, v1, v3 +; SI-NEXT: v_fma_f32 v0, -v0, v3, v2 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_fmas_f32 v0, v0, v1, v3 +; SI-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 +; SI-NEXT: v_mad_f32 v0, v0, 0, 0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; SI-NEXT: ; return to shader part epilog ; -; VI-NSZ-LABEL: fneg_fadd_0_f32: -; VI-NSZ: ; %bb.0: ; %.entry -; VI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; VI-NSZ-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0 -; VI-NSZ-NEXT: v_rcp_f32_e32 v2, v0 -; VI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NSZ-NEXT: v_fma_f32 v3, -v0, v2, 1.0 -; VI-NSZ-NEXT: v_fma_f32 v2, v3, v2, v2 -; VI-NSZ-NEXT: v_mul_f32_e32 v3, v1, v2 -; VI-NSZ-NEXT: v_fma_f32 v4, -v0, v3, v1 -; VI-NSZ-NEXT: v_fma_f32 v3, v4, v2, v3 -; VI-NSZ-NEXT: v_fma_f32 v0, -v0, v3, v1 -; VI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; VI-NSZ-NEXT: v_mov_b32_e32 v2, s0 -; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; VI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 -; VI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc -; VI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-NSZ-NEXT: ; return to shader part epilog +; VI-LABEL: fneg_fadd_0_safe_f32: +; VI: ; %bb.0: ; %.entry +; VI-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 +; VI-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v3, -v0, v2, 1.0 +; VI-NEXT: v_fma_f32 v2, v3, v2, v2 +; VI-NEXT: v_mul_f32_e32 v3, v1, v2 +; VI-NEXT: v_fma_f32 v4, -v0, v3, v1 +; VI-NEXT: v_fma_f32 v3, v4, v2, v3 +; VI-NEXT: v_fma_f32 v0, -v0, v3, v1 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v0, v0, v2, v3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; VI-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 +; VI-NEXT: v_mad_f32 v0, v0, 0, 0 +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc +; VI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; VI-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv float 1.000000e+00, %tmp6 %tmp8 = fmul float 0.000000e+00, %tmp7 @@ -289,35 +240,22 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4 ; function attribute unsafe-fp-math automatically. 
Combine with the previous test ; when that is done. define amdgpu_ps float @fneg_fadd_0_nsz_f32(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) #2 { -; GCN-SAFE-LABEL: fneg_fadd_0_nsz_f32: -; GCN-SAFE: ; %bb.0: ; %.entry -; GCN-SAFE-NEXT: v_rcp_f32_e32 v0, s1 -; GCN-SAFE-NEXT: v_mov_b32_e32 v1, s0 -; GCN-SAFE-NEXT: v_mul_f32_e32 v0, 0, v0 -; GCN-SAFE-NEXT: v_add_f32_e32 v0, 0, v0 -; GCN-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; GCN-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; GCN-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GCN-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; GCN-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GCN-SAFE-NEXT: ; return to shader part epilog -; -; GCN-NSZ-LABEL: fneg_fadd_0_nsz_f32: -; GCN-NSZ: ; %bb.0: ; %.entry -; GCN-NSZ-NEXT: v_rcp_f32_e32 v0, s1 -; GCN-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 -; GCN-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; GCN-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GCN-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GCN-NSZ-NEXT: ; return to shader part epilog +; GCN-LABEL: fneg_fadd_0_nsz_f32: +; GCN: ; %bb.0: ; %.entry +; GCN-NEXT: v_rcp_f32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mul_f32_e32 v0, 0, v0 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; GCN-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv afn float 1.000000e+00, %tmp6 %tmp8 = fmul float 0.000000e+00, %tmp7 %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8 - %.i188 = fadd float %tmp9, 0.000000e+00 + %.i188 = fadd nsz float %tmp9, 0.000000e+00 %tmp10 = fcmp uge float %.i188, %tmp2 %tmp11 = fneg float %.i188 %.i092 = select i1 %tmp10, float %tmp2, float %tmp11 @@ -569,8 +507,6 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6, ; SI-NSZ-LABEL: fneg_fadd_0_f64: ; SI-NSZ: ; %bb.0: ; %.entry ; SI-NSZ-NEXT: v_div_scale_f64 v[0:1], s[4:5], s[2:3], s[2:3], 1.0 -; SI-NSZ-NEXT: s_mov_b32 s4, 0 -; SI-NSZ-NEXT: s_brev_b32 s5, 1 ; SI-NSZ-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] ; SI-NSZ-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 ; SI-NSZ-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3] @@ -583,7 +519,10 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6, ; SI-NSZ-NEXT: v_mov_b32_e32 v2, s1 ; SI-NSZ-NEXT: v_mov_b32_e32 v3, s0 ; SI-NSZ-NEXT: v_div_fixup_f64 v[0:1], v[0:1], s[2:3], 1.0 -; SI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; SI-NSZ-NEXT: s_mov_b32 s2, 0 +; SI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], 0 +; SI-NSZ-NEXT: s_brev_b32 s3, 1 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[0:1], s[2:3], s[2:3] ; SI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] ; SI-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc @@ -637,7 +576,8 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6, ; VI-NSZ-NEXT: v_div_fixup_f64 v[0:1], v[0:1], s[2:3], 1.0 ; VI-NSZ-NEXT: s_mov_b32 s2, 0 ; VI-NSZ-NEXT: s_brev_b32 s3, 1 -; VI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] +; VI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], 0 +; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[0:1], s[2:3], s[2:3] ; VI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] ; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc @@ -663,102 +603,56 @@ 
define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6, ; function attribute unsafe-fp-math automatically. Combine with the previous test ; when that is done. define amdgpu_ps double @fneg_fadd_0_nsz_f64(double inreg %tmp2, double inreg %tmp6, <4 x i32> %arg) #2 { -; SI-SAFE-LABEL: fneg_fadd_0_nsz_f64: -; SI-SAFE: ; %bb.0: ; %.entry -; SI-SAFE-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] -; SI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; SI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; SI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; SI-SAFE-NEXT: v_mov_b32_e32 v2, s1 -; SI-SAFE-NEXT: v_mul_f64 v[0:1], v[0:1], 0 -; SI-SAFE-NEXT: v_mov_b32_e32 v3, s0 -; SI-SAFE-NEXT: v_add_f64 v[0:1], v[0:1], 0 -; SI-SAFE-NEXT: v_cmp_ngt_f64_e32 vcc, s[0:1], v[0:1] -; SI-SAFE-NEXT: v_xor_b32_e32 v4, 0x80000000, v1 -; SI-SAFE-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; SI-SAFE-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; SI-SAFE-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-SAFE-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 -; SI-SAFE-NEXT: s_mov_b32 s0, 0 -; SI-SAFE-NEXT: ; return to shader part epilog -; -; SI-NSZ-LABEL: fneg_fadd_0_nsz_f64: -; SI-NSZ: ; %bb.0: ; %.entry -; SI-NSZ-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] -; SI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; SI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; SI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; SI-NSZ-NEXT: s_mov_b32 s2, 0 -; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; SI-NSZ-NEXT: s_brev_b32 s3, 1 -; SI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] -; SI-NSZ-NEXT: v_mov_b32_e32 v2, s1 -; SI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] -; SI-NSZ-NEXT: v_mov_b32_e32 v3, s0 -; SI-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; SI-NSZ-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; SI-NSZ-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NSZ-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 -; SI-NSZ-NEXT: s_mov_b32 s0, 0 -; SI-NSZ-NEXT: ; return to shader part epilog -; -; VI-SAFE-LABEL: fneg_fadd_0_nsz_f64: -; VI-SAFE: ; %bb.0: ; %.entry -; VI-SAFE-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] -; VI-SAFE-NEXT: v_mov_b32_e32 v4, s0 -; VI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; VI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; VI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; VI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-SAFE-NEXT: v_mov_b32_e32 v2, s1 -; VI-SAFE-NEXT: v_mul_f64 v[0:1], v[0:1], 0 -; VI-SAFE-NEXT: v_add_f64 v[0:1], v[0:1], 0 -; VI-SAFE-NEXT: v_cmp_ngt_f64_e32 vcc, s[0:1], v[0:1] -; VI-SAFE-NEXT: v_xor_b32_e32 v3, 0x80000000, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; VI-SAFE-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; VI-SAFE-NEXT: s_and_b64 s[0:1], vcc, exec -; VI-SAFE-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 -; VI-SAFE-NEXT: s_mov_b32 s0, 0 -; VI-SAFE-NEXT: ; return to shader part epilog +; SI-LABEL: fneg_fadd_0_nsz_f64: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] +; SI-NEXT: v_fma_f64 v[2:3], 
-s[2:3], v[0:1], 1.0 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] +; SI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] +; SI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] +; SI-NEXT: s_brev_b32 s3, 1 +; SI-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] +; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] +; SI-NEXT: v_mov_b32_e32 v3, s0 +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: ; return to shader part epilog ; -; VI-NSZ-LABEL: fneg_fadd_0_nsz_f64: -; VI-NSZ: ; %bb.0: ; %.entry -; VI-NSZ-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] -; VI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; VI-NSZ-NEXT: s_mov_b32 s2, 0 -; VI-NSZ-NEXT: s_brev_b32 s3, 1 -; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NSZ-NEXT: v_mov_b32_e32 v2, s1 -; VI-NSZ-NEXT: v_mov_b32_e32 v3, s0 -; VI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] -; VI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] -; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; VI-NSZ-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; VI-NSZ-NEXT: s_and_b64 s[0:1], vcc, exec -; VI-NSZ-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 -; VI-NSZ-NEXT: s_mov_b32 s0, 0 -; VI-NSZ-NEXT: ; return to shader part epilog +; VI-LABEL: fneg_fadd_0_nsz_f64: +; VI: ; %bb.0: ; %.entry +; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] +; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 +; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] +; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 +; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] +; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_brev_b32 s3, 1 +; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] +; VI-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; VI-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] +; VI-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv afn double 1.000000e+00, %tmp6 %tmp8 = fmul double 0.000000e+00, %tmp7 %tmp9 = fmul reassoc nnan arcp contract double 0.000000e+00, %tmp8 - %.i188 = fadd double %tmp9, 0.000000e+00 + %.i188 = fadd nsz double %tmp9, 0.000000e+00 %tmp10 = fcmp uge double %.i188, %tmp2 %tmp11 = fneg double %.i188 %.i092 = select i1 %tmp10, double %tmp2, double %tmp11 @@ -4547,25 +4441,40 @@ define float @v_fneg_fabs_select_infloop_regression(float %arg, i1 %arg1) { ret float %i3 } -define float @v_fmul_0_fsub_0_infloop_regression(float %arg) { -; GCN-SAFE-LABEL: v_fmul_0_fsub_0_infloop_regression: +define float @v_fmul_0_fsub_0_safe_infloop_regression(float %arg) { +; GCN-SAFE-LABEL: v_fmul_0_fsub_0_safe_infloop_regression: ; GCN-SAFE: ; %bb.0: ; %bb ; GCN-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GCN-SAFE-NEXT: v_mul_f32_e32 v0, 0, v0 ; GCN-SAFE-NEXT: v_sub_f32_e32 v0, 0, v0 ; GCN-SAFE-NEXT: s_setpc_b64 s[30:31] ; -; GCN-NSZ-LABEL: v_fmul_0_fsub_0_infloop_regression: -; GCN-NSZ: ; %bb.0: ; %bb -; GCN-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 -; GCN-NSZ-NEXT: s_setpc_b64 s[30:31] +; SI-NSZ-LABEL: v_fmul_0_fsub_0_safe_infloop_regression: +; SI-NSZ: ; %bb.0: ; %bb +; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NSZ-NEXT: s_brev_b32 s4, 1 +; SI-NSZ-NEXT: v_fma_f32 v0, v0, s4, 0 +; SI-NSZ-NEXT: s_setpc_b64 s[30:31] +; FIXME: utils/update_llc_test_checks.py will generate redundant VI +; labels, remove them, they will cause test failure. bb: %i = fmul float %arg, 0.0 %i1 = fsub float 0.0, %i ret float %i1 } +define float @v_fmul_0_fsub_0_nsz_infloop_regression(float %arg) { +; GCN-LABEL: v_fmul_0_fsub_0_nsz_infloop_regression: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] +bb: + %i = fmul float %arg, 0.0 + %i1 = fsub nsz float 0.0, %i + ret float %i1 +} + declare i32 @llvm.amdgcn.workitem.id.x() #1 declare float @llvm.fma.f32(float, float, float) #1 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir index 73cdcddbef135..a3b2191695734 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy-agpr.mir @@ -209,8 +209,8 @@ body: | bb.0: ; GCN-LABEL: name: s_mov_b32_imm_65_copy_to_av_32 ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65, implicit $exec - ; GCN-NEXT: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO [[S_MOV_B32_]], implicit $exec - ; GCN-NEXT: S_ENDPGM 0, implicit [[AV_MOV_]] + ; GCN-NEXT: [[COPY:%[0-9]+]]:av_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: S_ENDPGM 0, implicit [[COPY]] %0:sreg_32 = S_MOV_B32 65, implicit $exec %1:av_32 = COPY %0 S_ENDPGM 0, implicit %1 diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir index dfcf9a1f5c5ae..bec188e4e8378 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir @@ -240,8 +240,8 @@ body: | bb.0: ; GCN-LABEL: name: s_mov_b32_imm_literal_copy_s_to_av_32 ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 999 - ; GCN-NEXT: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO [[S_MOV_B32_]], implicit $exec - ; GCN-NEXT: $agpr0 = COPY [[AV_MOV_]] + ; GCN-NEXT: [[COPY:%[0-9]+]]:av_32 = COPY [[S_MOV_B32_]] + ; GCN-NEXT: $agpr0 = COPY [[COPY]] ; GCN-NEXT: S_ENDPGM 0 %0:sreg_32 = S_MOV_B32 999 %1:av_32 = COPY %0 @@ -257,8 +257,8 @@ body: | bb.0: ; GCN-LABEL: name: v_mov_b32_imm_literal_copy_v_to_av_32 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 999, implicit $exec - ; GCN-NEXT: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO [[V_MOV_B32_e32_]], implicit $exec - ; GCN-NEXT: $agpr0 = COPY [[AV_MOV_]] + ; GCN-NEXT: [[COPY:%[0-9]+]]:av_32 = COPY [[V_MOV_B32_e32_]] + ; GCN-NEXT: $agpr0 = COPY [[COPY]] ; GCN-NEXT: S_ENDPGM 0 %0:vgpr_32 = V_MOV_B32_e32 999, implicit $exec %1:av_32 = COPY %0 diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll index a859cc91b7fde..fe95d4561d0cd 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll @@ -1571,25 +1571,24 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind { ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0x46000000 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, 0x46000000, v1 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, 0x46000000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v4, -v3, v2, s0 op_sel_hi:[1,0,0] -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v4, v1 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v3, v2, s0 op_sel_hi:[1,0,0] -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0] +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v3, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v0.l, 0x7000 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v1.l, 0x7000 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bounds: @@ -1739,25 +1738,24 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.l, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 2.0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v1, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v1, v1 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v4, -v3, v2, s0 op_sel_hi:[1,0,0] +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v0, v0 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v4, v1 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v3, v2, s0 op_sel_hi:[1,0,0] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v2, v3, v0 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v3, -v1, v2, s0 op_sel_hi:[1,0,0] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xff800000, v1 +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v3, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xff800000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v0.l, 2.0 +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v1.l, 2.0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: fdiv_pow_shl_cnt_fail_out_of_bound2: diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index-agpr.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index-agpr.mir new file mode 100644 index 0000000000000..32a209608a4d0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fold-operands-frame-index-agpr.mir @@ -0,0 +1,131 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass=si-fold-operands %s -o - | FileCheck %s + +--- +name: fold_frame_index_av_mov_b32_imm_pseudo_from_s_mov_b32_fi_to_av +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 16384 +stack: + - { id: 0, size: 16384, alignment: 4, local-offset: 0 } +body: | + bb.0: + ; CHECK-LABEL: name: fold_frame_index_av_mov_b32_imm_pseudo_from_s_mov_b32_fi_to_av + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0 + ; CHECK-NEXT: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO [[S_MOV_B32_]], implicit $exec + ; CHECK-NEXT: SI_RETURN implicit [[AV_MOV_]] + %0:sreg_32 = S_MOV_B32 %stack.0 + %1:av_32 = AV_MOV_B32_IMM_PSEUDO %0, implicit $exec + SI_RETURN implicit %1 + +... + +--- +name: fold_frame_index_av_mov_b32_imm_pseudo_from_s_mov_b32_fi_to_v +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 16384 +stack: + - { id: 0, size: 16384, alignment: 4, local-offset: 0 } +body: | + bb.0: + ; CHECK-LABEL: name: fold_frame_index_av_mov_b32_imm_pseudo_from_s_mov_b32_fi_to_v + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0 + ; CHECK-NEXT: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO [[S_MOV_B32_]], implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[COPY]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32 = S_MOV_B32 %stack.0 + %1:av_32 = AV_MOV_B32_IMM_PSEUDO %0, implicit $exec + %2:vgpr_32 = COPY %1, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN implicit $vgpr0 + +... 
+ +--- +name: fold_frame_index_av_mov_b32_imm_pseudo_from_s_mov_b32_lit_to_v +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: fold_frame_index_av_mov_b32_imm_pseudo_from_s_mov_b32_lit_to_v + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1234 + ; CHECK-NEXT: [[AV_MOV_:%[0-9]+]]:av_32 = AV_MOV_B32_IMM_PSEUDO [[S_MOV_B32_]], implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[COPY]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32 = S_MOV_B32 1234 + %1:av_32 = AV_MOV_B32_IMM_PSEUDO %0, implicit $exec + %2:vgpr_32 = COPY %1, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN implicit $vgpr0 + +... + +--- +name: fold_frame_index_av_mov_b32_imm_pseudo_from_s_mov_b32_imm_to_v +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: fold_frame_index_av_mov_b32_imm_pseudo_from_s_mov_b32_imm_to_v + ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[V_MOV_B32_e32_]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32 = S_MOV_B32 8 + %1:av_32 = AV_MOV_B32_IMM_PSEUDO %0, implicit $exec + %2:vgpr_32 = COPY %1, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN implicit $vgpr0 + +... + +--- +name: fold_frame_index_av_regression_0 +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 16384 +stack: + - { id: 0, size: 16384, alignment: 4, local-offset: 0 } +body: | + bb.0: + ; CHECK-LABEL: name: fold_frame_index_av_regression_0 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[COPY]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32 = S_MOV_B32 %stack.0 + %1:av_32 = COPY %0 + %2:vgpr_32 = COPY %1, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN implicit $vgpr0 + +... + +--- +name: fold_frame_index_av_regression_1 +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 16384 +stack: + - { id: 0, size: 16384, alignment: 4, local-offset: 0 } +body: | + bb.0: + ; CHECK-LABEL: name: fold_frame_index_av_regression_1 + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 %stack.0 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]], implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY [[COPY]] + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + %0:sreg_32 = S_MOV_B32 %stack.0 + %1:sreg_32 = S_MOV_B32 killed %0 + %2:sreg_64 = S_MOV_B64 0 + %3:av_32 = COPY %1 + %4:vgpr_32 = COPY %3, implicit $exec + $vgpr0 = COPY %4 + SI_RETURN implicit $vgpr0 + +... 
+ diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index c4a38dcd7b5f3..78a961ea0da17 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -1433,37 +1433,35 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] +; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v0.l -; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: fast_frem_f16: @@ -1507,38 +1505,36 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1150-TRUE16-NEXT: s_clause 0x1 ; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; 
GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-TRUE16-NEXT: s_clause 0x1 -; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8 ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l -; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l +; GFX1150-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1] ; GFX1150-TRUE16-NEXT: s_endpgm ; ; GFX1150-FAKE16-LABEL: fast_frem_f16: @@ -1583,38 +1579,36 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1) ; GFX1200-TRUE16-NEXT: s_clause 0x1 ; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1200-TRUE16-NEXT: s_clause 0x1 -; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1200-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] +; GFX1200-TRUE16-NEXT: global_load_d16_b16 v3, 
v1, s[4:5] offset:8 ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l -; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l +; GFX1200-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1] ; GFX1200-TRUE16-NEXT: s_endpgm ; ; GFX1200-FAKE16-LABEL: fast_frem_f16: @@ -1840,37 +1834,35 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX11-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] +; GFX11-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v6.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 -; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 +; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 -; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v0.l -; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: unsafe_frem_f16: @@ -1914,38 +1906,36 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1150-TRUE16-NEXT: s_clause 0x1 ; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1150-TRUE16-NEXT: s_clause 0x1 -; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] +; GFX1150-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8 ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l ; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l +; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: 
v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 -; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l +; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h -; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l -; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l +; GFX1150-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1] ; GFX1150-TRUE16-NEXT: s_endpgm ; ; GFX1150-FAKE16-LABEL: unsafe_frem_f16: @@ -1990,38 +1980,36 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace( ; GFX1200-TRUE16-NEXT: s_clause 0x1 ; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1200-TRUE16-NEXT: s_clause 0x1 -; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] -; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8 +; GFX1200-TRUE16-NEXT: global_load_d16_b16 v2, v1, s[2:3] +; GFX1200-TRUE16-NEXT: global_load_d16_b16 v3, v1, s[4:5] offset:8 ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l ; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.l +; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4 -; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1] -; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4 +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1] +; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel_hi:[1,0,1] ; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4 -; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3 +; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3 -; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l +; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h -; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3 +; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 ; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l -; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1] +; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l +; GFX1200-TRUE16-NEXT: global_store_b16 v1, v2, s[0:1] ; GFX1200-TRUE16-NEXT: s_endpgm ; ; GFX1200-FAKE16-LABEL: unsafe_frem_f16: diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index 38003f6075c35..0084d936ec03b 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -895,11 +895,11 @@ define <3 x i16> @v3i16_func_void() #0 { ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: s_mov_b32 s7, 0xf000 ; CI-NEXT: s_mov_b32 s6, -1 -; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; CI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_alignbit_b32 v1, v3, v2, 16 -; CI-NEXT: v_mov_b32_e32 v0, v2 -; CI-NEXT: v_mov_b32_e32 v2, v3 +; CI-NEXT: v_lshr_b64 v[1:2], v[3:4], 16 +; CI-NEXT: v_mov_b32_e32 v0, v3 +; CI-NEXT: v_mov_b32_e32 v2, v4 ; CI-NEXT: s_setpc_b64 s[30:31] ; ; GFX89-LABEL: v3i16_func_void: @@ -1008,7 +1008,7 @@ define <5 x i16> @v5i16_func_void() #0 { ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: buffer_load_sshort v4, off, s[4:7], 0 offset:8 ; CI-NEXT: s_waitcnt vmcnt(1) -; CI-NEXT: v_alignbit_b32 v5, v1, v0, 16 +; CI-NEXT: v_lshr_b64 v[5:6], v[0:1], 16 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; CI-NEXT: v_mov_b32_e32 v2, v1 ; CI-NEXT: v_mov_b32_e32 v1, v5 diff --git a/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll b/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll deleted file mode 100644 index f0c9258358316..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/gfx1250-scratch-scope-se.ll +++ /dev/null @@ -1,94 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GCN-GISEL %s - -; Test that stores that may hit scratch are correctly promoted to SCOPE_SE. 
- -define void @test_scratch_store(ptr addrspace(5) %ptr, i32 %val) { -; GCN-LABEL: test_scratch_store: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: scratch_store_b32 v0, v1, off -; GCN-NEXT: s_set_pc_i64 s[30:31] - store i32 %val, ptr addrspace(5) %ptr - ret void -} - -define void @test_unknown_flat_store(ptr %ptr, i32 %val) { -; GCN-LABEL: test_unknown_flat_store: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: flat_store_b32 v[0:1], v2 -; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: s_set_pc_i64 s[30:31] - store i32 %val, ptr %ptr - ret void -} - -define void @test_flat_store_no_scratch_alloc(ptr %ptr, i32 %val) #0 { -; GCN-LABEL: test_flat_store_no_scratch_alloc: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: flat_store_b32 v[0:1], v2 -; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: s_set_pc_i64 s[30:31] - store i32 %val, ptr %ptr - ret void -} - -define void @test_flat_store_noalias_addrspace(ptr %ptr, i32 %val) { -; GCN-LABEL: test_flat_store_noalias_addrspace: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: flat_store_b32 v[0:1], v2 -; GCN-NEXT: s_wait_dscnt 0x0 -; GCN-NEXT: s_set_pc_i64 s[30:31] - store i32 %val, ptr %ptr, !noalias.addrspace !{i32 5, i32 6} - ret void -} - -; TODO: would be nice to handle -define void @test_flat_store_select(ptr addrspace(1) %a, ptr addrspace(3) %b, i1 %cond, i32 %val) { -; GCN-SDAG-LABEL: test_flat_store_select: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 -; GCN-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2 -; GCN-SDAG-NEXT: v_and_b32_e32 v3, 1, v3 -; GCN-SDAG-NEXT: s_mov_b64 s[0:1], src_shared_base -; GCN-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo -; GCN-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo -; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GCN-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 -; GCN-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v2, v0 -; GCN-SDAG-NEXT: flat_store_b32 v[0:1], v4 -; GCN-SDAG-NEXT: s_wait_dscnt 0x0 -; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] -; -; GCN-GISEL-LABEL: test_flat_store_select: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 -; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, -1, v2 -; GCN-GISEL-NEXT: v_and_b32_e32 v3, 1, v3 -; GCN-GISEL-NEXT: s_mov_b64 s[0:1], src_shared_base -; GCN-GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo -; GCN-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, s1, vcc_lo -; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GCN-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 -; GCN-GISEL-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v5, v1 -; GCN-GISEL-NEXT: flat_store_b32 v[0:1], v4 -; GCN-GISEL-NEXT: s_wait_dscnt 0x0 -; GCN-GISEL-NEXT: s_set_pc_i64 s[30:31] - %a.ascast = addrspacecast ptr addrspace(1) %a to ptr - %b.ascast = addrspacecast ptr addrspace(3) %b to ptr - %ptr = select i1 %cond, ptr %a.ascast, ptr %b.ascast - store i32 %val, ptr %ptr - ret void -} - -attributes #0 = { "amdgpu-no-flat-scratch-init" } diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index b67a1c513c49f..05403f008276c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -9818,7 
+9818,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX1250-TRUE16: ; %bb.0: ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-TRUE16-NEXT: s_mov_b64 s[0:1], 0xfffffffffffff800 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 @@ -9861,7 +9861,7 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX1250-FAKE16: ; %bb.0: ; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], 0xfffffffffffff800 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 @@ -11339,7 +11339,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX1250-TRUE16: ; %bb.0: ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-TRUE16-NEXT: s_mov_b64 s[0:1], 0xfffffffffffff800 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 @@ -11382,7 +11382,7 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX1250-FAKE16: ; %bb.0: ; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], 0xfffffffffffff800 ; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 @@ -14855,7 +14855,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-TRUE16: ; %bb.0: ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-TRUE16-NEXT: s_mov_b64 s[0:1], 0xfffffffffffff800 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 @@ -14905,7 +14905,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-FAKE16: ; %bb.0: ; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], 0xfffffffffffff800 ; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 @@ -16648,7 +16648,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-TRUE16: ; %bb.0: ; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-TRUE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-TRUE16-NEXT: s_mov_b64 
s[0:1], 0xfffffffffffff800 ; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0 @@ -16697,7 +16697,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX1250-FAKE16: ; %bb.0: ; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800) +; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], 0xfffffffffffff800 ; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 ; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1] ; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index ffab56847edca..1a45bd978ccc1 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -10195,8 +10195,7 @@ define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: .LBB144_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v4 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_subrev_i32_e32 v3, vcc, 1, v4 ; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 ; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc @@ -10224,8 +10223,7 @@ define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: .LBB144_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; VI-NEXT: v_subrev_u32_e32 v3, vcc, 1, v4 ; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 ; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] ; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc @@ -10249,9 +10247,8 @@ define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 -; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc @@ -10282,8 +10279,7 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 ; SI-NEXT: .LBB145_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v4 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_subrev_i32_e32 v3, vcc, 1, v4 ; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 ; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc @@ -10313,8 +10309,7 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 ; VI-NEXT: .LBB145_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; VI-NEXT: v_subrev_u32_e32 v3, vcc, 1, v4 ; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 ; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] ; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc @@ -10338,9 +10333,8 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr 
addrspace(1) %out, i32 ; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 -; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc @@ -10374,8 +10368,7 @@ define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v5 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_subrev_i32_e32 v3, vcc, 1, v5 ; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v5, v2 ; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] ; SI-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc @@ -10403,8 +10396,7 @@ define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; VI-NEXT: v_subrev_u32_e32 v3, vcc, 1, v4 ; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 ; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] ; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc @@ -10429,9 +10421,8 @@ define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 -; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc @@ -10464,8 +10455,7 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v5, v3 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v5 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_subrev_i32_e32 v3, vcc, 1, v5 ; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v5, v2 ; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] ; SI-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc @@ -10495,8 +10485,7 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v1 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v1 ; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 ; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -10520,9 +10509,8 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, 1, v4 ; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 -; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc @@ -10560,8 +10548,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1 ; SI-NEXT: .LBB148_1: ; %atomicrmw.start ; SI-NEXT: 
; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v1 ; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v1 ; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -10597,8 +10584,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1 ; VI-NEXT: .LBB148_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; VI-NEXT: v_subrev_u32_e32 v2, vcc, 1, v3 ; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 ; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -10624,9 +10610,8 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1 ; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v1 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 -; GFX9-NEXT: v_add_u32_e32 v0, -1, v1 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc @@ -10663,8 +10648,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr ; SI-NEXT: .LBB149_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v1 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v1 ; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v1 ; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc @@ -10702,8 +10686,7 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr ; VI-NEXT: .LBB149_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v3 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; VI-NEXT: v_subrev_u32_e32 v2, vcc, 1, v3 ; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 ; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -10729,9 +10712,8 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr ; GFX9-NEXT: .LBB149_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v1 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 -; GFX9-NEXT: v_add_u32_e32 v0, -1, v1 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc @@ -10771,8 +10753,7 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v4 ; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v4 ; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc @@ -10809,8 +10790,7 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, v0 -; 
VI-NEXT: v_add_u32_e32 v0, vcc, -1, v5 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v5 ; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 ; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] ; VI-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc @@ -10836,9 +10816,8 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v4 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v4 -; GFX9-NEXT: v_add_u32_e32 v0, -1, v4 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[3:4], s[4:5] glc @@ -10876,8 +10855,7 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4 -; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, 1, v4 ; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v4 ; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] ; SI-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc @@ -10914,8 +10892,7 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v5, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v5 -; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; VI-NEXT: v_subrev_u32_e32 v0, vcc, 1, v5 ; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 ; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] ; VI-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc @@ -10941,9 +10918,8 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, 1, v4 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v4 -; GFX9-NEXT: v_add_u32_e32 v0, -1, v4 ; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] ; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc ; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[3:4], s[4:5] offset:16 glc diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir index 1341a5916df4b..ff8ca8688bb85 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses.mir @@ -2,6 +2,7 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefix=GFX11 # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefix=GFX12 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - -debugify-and-strip-all-safe | FileCheck %s -check-prefix=GFX12 --- name: nop1 diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 2c03113e8af47..9b4539c0bb4de 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -103,55 +103,47 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; 
CHECK-NEXT: .LBB0_3: ; %Flow14 -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CHECK-NEXT: s_andn2_saveexec_b64 s[20:21], s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %bb32 ; CHECK-NEXT: s_and_saveexec_b64 s[16:17], s[4:5] -; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[16:17] +; CHECK-NEXT: s_xor_b64 s[22:23], exec, s[16:17] ; CHECK-NEXT: s_cbranch_execz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb43 -; CHECK-NEXT: s_mov_b32 s16, 0 -; CHECK-NEXT: s_mov_b32 s17, s16 -; CHECK-NEXT: v_mov_b32_e32 v2, s16 -; CHECK-NEXT: v_mov_b32_e32 v3, s17 -; CHECK-NEXT: s_mov_b32 s18, s16 -; CHECK-NEXT: s_mov_b32 s19, s16 -; CHECK-NEXT: image_sample_lz v1, v[2:3], s[8:15], s[16:19] dmask:0x1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] -; CHECK-NEXT: s_mov_b64 s[10:11], s[38:39] -; CHECK-NEXT: s_mov_b64 s[12:13], s[40:41] -; CHECK-NEXT: s_mov_b64 s[14:15], s[42:43] -; CHECK-NEXT: v_readlane_b32 s36, v6, 0 -; CHECK-NEXT: v_readlane_b32 s44, v6, 8 -; CHECK-NEXT: v_readlane_b32 s45, v6, 9 -; CHECK-NEXT: v_readlane_b32 s46, v6, 10 -; CHECK-NEXT: v_readlane_b32 s47, v6, 11 -; CHECK-NEXT: v_readlane_b32 s48, v6, 12 -; CHECK-NEXT: v_readlane_b32 s49, v6, 13 -; CHECK-NEXT: v_readlane_b32 s50, v6, 14 -; CHECK-NEXT: v_readlane_b32 s51, v6, 15 -; CHECK-NEXT: v_readlane_b32 s37, v6, 1 -; CHECK-NEXT: v_readlane_b32 s38, v6, 2 -; CHECK-NEXT: v_readlane_b32 s39, v6, 3 -; CHECK-NEXT: v_readlane_b32 s40, v6, 4 -; CHECK-NEXT: v_readlane_b32 s41, v6, 5 -; CHECK-NEXT: image_sample_lz v0, v[2:3], s[44:51], s[24:27] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s42, v6, 6 -; CHECK-NEXT: v_readlane_b32 s43, v6, 7 +; CHECK-NEXT: s_mov_b32 s44, 0 +; CHECK-NEXT: s_mov_b32 s45, s44 +; CHECK-NEXT: v_mov_b32_e32 v2, s44 +; CHECK-NEXT: v_mov_b32_e32 v3, s45 +; CHECK-NEXT: s_mov_b32 s46, s44 +; CHECK-NEXT: s_mov_b32 s47, s44 +; CHECK-NEXT: image_sample_lz v1, v[2:3], s[8:15], s[44:47] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s4, v6, 0 +; CHECK-NEXT: v_readlane_b32 s12, v6, 8 +; CHECK-NEXT: v_readlane_b32 s13, v6, 9 +; CHECK-NEXT: v_readlane_b32 s14, v6, 10 +; CHECK-NEXT: v_readlane_b32 s15, v6, 11 +; CHECK-NEXT: v_readlane_b32 s16, v6, 12 +; CHECK-NEXT: v_readlane_b32 s17, v6, 13 +; CHECK-NEXT: v_readlane_b32 s18, v6, 14 +; CHECK-NEXT: v_readlane_b32 s19, v6, 15 +; CHECK-NEXT: v_readlane_b32 s5, v6, 1 +; CHECK-NEXT: v_readlane_b32 s6, v6, 2 +; CHECK-NEXT: v_readlane_b32 s7, v6, 3 +; CHECK-NEXT: v_readlane_b32 s8, v6, 4 +; CHECK-NEXT: v_readlane_b32 s9, v6, 5 +; CHECK-NEXT: image_sample_lz v0, v[2:3], s[12:19], s[24:27] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: s_mov_b64 s[42:43], s[14:15] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 -; CHECK-NEXT: s_mov_b64 s[40:41], s[12:13] -; CHECK-NEXT: s_mov_b64 s[38:39], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: v_readlane_b32 s10, v6, 6 +; CHECK-NEXT: v_readlane_b32 s11, v6, 7 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[16:19], 0 +; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[44:47], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[44:47], 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: .LBB0_6: ; %Flow12 -; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[22:23] ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.7: ; %bb33.preheader ; CHECK-NEXT: s_mov_b32 s8, 0 @@ -179,7 +171,7 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: 
.LBB0_9: ; %Flow13 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock -; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] +; CHECK-NEXT: s_or_b64 exec, exec, s[20:21] ; CHECK-NEXT: v_readlane_b32 s71, v5, 21 ; CHECK-NEXT: v_readlane_b32 s70, v5, 20 ; CHECK-NEXT: v_readlane_b32 s69, v5, 19 diff --git a/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll b/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll index 9f5bbf834fdff..83e34906fa30c 100644 --- a/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll +++ b/llvm/test/CodeGen/AMDGPU/iglp-no-clobber.ll @@ -43,7 +43,7 @@ loop: ; preds = %1, %.lr.ph %addr = phi ptr addrspace(1) [ null, %.lr.ph ], [ %gep, %loop ] %offset = phi i64 [ 0, %.lr.ph ], [ %nextOff, %loop ] %inc = phi i32 [0, %.lr.ph], [ %incCond, %loop ] - %rsrc = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %addr, i16 0, i32 0, i32 0) + %rsrc = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %addr, i16 0, i64 0, i32 0) %load = tail call <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) %load.bc = bitcast <2 x i32> %load to <8 x i8> %load.elem = extractelement <8 x i8> %load.bc, i64 0 @@ -63,6 +63,6 @@ end: ret void } -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32) #0 +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i64, i32) #0 declare <2 x i32> @llvm.amdgcn.raw.ptr.buffer.load.v2i32(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #1 diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll index b764ee50c3978..21390003ee565 100644 --- a/llvm/test/CodeGen/AMDGPU/imm.ll +++ b/llvm/test/CodeGen/AMDGPU/imm.ll @@ -1969,9 +1969,10 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x ; GFX942-LABEL: add_inline_imm_neg_1_f64: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, -1 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], -1 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2008,7 +2009,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], -2 +; GFX942-NEXT: v_mov_b32_e32 v0, -2 +; GFX942-NEXT: v_mov_b32_e32 v1, -1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2045,7 +2047,8 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], -16 +; GFX942-NEXT: v_mov_b32_e32 v0, -16 +; GFX942-NEXT: v_mov_b32_e32 v1, -1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2160,9 +2163,10 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) { ; GFX942-LABEL: store_inline_imm_0.0_f64: ; GFX942: ; %bb.0: ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b64_e32 
v[0:1], 0 +; GFX942-NEXT: v_mov_b32_e32 v1, v0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2235,7 +2239,8 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0.5 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x3fe00000 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2271,7 +2276,8 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], -0.5 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0xbfe00000 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2307,7 +2313,8 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], 1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2343,7 +2350,8 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], -1.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0xbff00000 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2379,7 +2387,8 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], 2.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2415,7 +2424,8 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], -2.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, -2.0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2451,7 +2461,8 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2487,7 +2498,8 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: 
s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], -4.0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0xc0100000 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm @@ -2523,7 +2535,8 @@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) { ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX942-NEXT: s_mov_b32 s3, 0xf000 ; GFX942-NEXT: s_mov_b32 s2, -1 -; GFX942-NEXT: v_mov_b64_e32 v[0:1], 0.15915494309189532 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x6dc9c882 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x3fc45f30 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX942-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index eb5c5ef15ed56..6b094247e113c 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -111,22 +111,20 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-NEXT: ; %bb.5: ; %bb18.preheader ; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_hi_u32 s0, s29, s28 -; GFX11-NEXT: s_mul_i32 s1, s29, s28 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1 +; GFX11-NEXT: s_mul_hi_u32 s1, s29, s28 +; GFX11-NEXT: s_mul_i32 s0, s29, s28 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], 1 ; GFX11-NEXT: s_mov_b32 s1, 0 -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_or_b32 s0, s0, 1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshr_b32 s0, s0, s30 -; GFX11-NEXT: s_mul_i32 s0, s0, s22 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s0, s0, s22 ; GFX11-NEXT: s_mul_i32 s0, s0, s20 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s0, s19, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b64 s[20:21], s[0:1], 1 ; GFX11-NEXT: s_mov_b32 s0, s1 ; GFX11-NEXT: global_load_u16 v1, v0, s[20:21] diff --git a/llvm/test/CodeGen/AMDGPU/lds-run-twice-absolute-md.ll b/llvm/test/CodeGen/AMDGPU/lds-run-twice-absolute-md.ll index 3f1dda53ef1b6..1fe29f699f31b 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-run-twice-absolute-md.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-run-twice-absolute-md.ll @@ -1,5 +1,3 @@ -; XFAIL: target={{.*}}-aix{{.*}} - ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds %s -o %t.ll ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds %t.ll -o %t.second.ll ; RUN: diff -ub %t.ll %t.second.ll -I ".*ModuleID.*" diff --git a/llvm/test/CodeGen/AMDGPU/lds-run-twice.ll b/llvm/test/CodeGen/AMDGPU/lds-run-twice.ll index 55280129c49ad..58228fd252322 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-run-twice.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-run-twice.ll @@ -1,5 +1,3 @@ -; XFAIL: target={{.*}}-aix{{.*}} - ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds %s -o %t.ll ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds %t.ll -o %t.second.ll ; RUN: diff -ub %t.ll %t.second.ll -I 
".*ModuleID.*" diff --git a/llvm/test/CodeGen/AMDGPU/literal64.ll b/llvm/test/CodeGen/AMDGPU/literal64.ll index 98691d394abb3..20b876836082e 100644 --- a/llvm/test/CodeGen/AMDGPU/literal64.ll +++ b/llvm/test/CodeGen/AMDGPU/literal64.ll @@ -5,7 +5,7 @@ define amdgpu_ps i64 @s_add_u64(i64 inreg %a) { ; GCN-LABEL: s_add_u64: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], lit64(0xf12345678) +; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0xf12345678 ; GCN-NEXT: ; return to shader part epilog %result = add i64 %a, 64729929336 ret i64 %result @@ -14,7 +14,7 @@ define amdgpu_ps i64 @s_add_u64(i64 inreg %a) { define amdgpu_ps void @v_add_u64(i64 %a, ptr addrspace(1) %out) { ; GCN-LABEL: v_add_u64: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0xf12345678), v[0:1] +; GCN-NEXT: v_add_nc_u64_e32 v[0:1], 0xf12345678, v[0:1] ; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off ; GCN-NEXT: s_endpgm %result = add i64 %a, 64729929336 @@ -25,7 +25,7 @@ define amdgpu_ps void @v_add_u64(i64 %a, ptr addrspace(1) %out) { define amdgpu_ps i64 @s_add_neg_u64(i64 inreg %a) { ; GCN-LABEL: s_add_neg_u64: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], lit64(0xfffffff0edcba988) +; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0xfffffff0edcba988 ; GCN-NEXT: ; return to shader part epilog %result = sub i64 %a, 64729929336 ret i64 %result @@ -34,7 +34,7 @@ define amdgpu_ps i64 @s_add_neg_u64(i64 inreg %a) { define amdgpu_ps void @v_add_neg_u64(i64 %a, ptr addrspace(1) %out) { ; GCN-LABEL: v_add_neg_u64: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_nc_u64_e32 v[0:1], lit64(0xfffffff0edcba988), v[0:1] +; GCN-NEXT: v_add_nc_u64_e32 v[0:1], 0xfffffff0edcba988, v[0:1] ; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off ; GCN-NEXT: s_endpgm %result = sub i64 %a, 64729929336 @@ -45,7 +45,7 @@ define amdgpu_ps void @v_add_neg_u64(i64 %a, ptr addrspace(1) %out) { define amdgpu_ps i64 @s_sub_u64(i64 inreg %a) { ; GCN-LABEL: s_sub_u64: ; GCN: ; %bb.0: -; GCN-NEXT: s_sub_nc_u64 s[0:1], lit64(0xf12345678), s[0:1] +; GCN-NEXT: s_sub_nc_u64 s[0:1], 0xf12345678, s[0:1] ; GCN-NEXT: ; return to shader part epilog %result = sub i64 64729929336, %a ret i64 %result @@ -54,7 +54,7 @@ define amdgpu_ps i64 @s_sub_u64(i64 inreg %a) { define amdgpu_ps void @v_sub_u64(i64 %a, ptr addrspace(1) %out) { ; GCN-LABEL: v_sub_u64: ; GCN: ; %bb.0: -; GCN-NEXT: v_sub_nc_u64_e32 v[0:1], lit64(0xf12345678), v[0:1] +; GCN-NEXT: v_sub_nc_u64_e32 v[0:1], 0xf12345678, v[0:1] ; GCN-NEXT: global_store_b64 v[2:3], v[0:1], off ; GCN-NEXT: s_endpgm %result = sub i64 64729929336, %a @@ -67,7 +67,7 @@ define void @v_mov_b64_double(ptr addrspace(1) %ptr) { ; GCN: ; %bb.0: ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4063233333333333) +; GCN-NEXT: v_mov_b64_e32 v[2:3], 0x4063233333333333 ; GCN-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off scope:SCOPE_SYS ; GCN-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, double 153.1 monotonic @@ -79,7 +79,7 @@ define void @v_mov_b64_int(ptr addrspace(1) %ptr) { ; GCN: ; %bb.0: ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_mov_b64_e32 v[2:3], lit64(0xf12345678) +; GCN-NEXT: v_mov_b64_e32 v[2:3], 0xf12345678 ; GCN-NEXT: global_atomic_add_u64 v[0:1], v[2:3], off scope:SCOPE_SYS ; GCN-NEXT: s_set_pc_i64 s[30:31] %result = atomicrmw add ptr addrspace(1) %ptr, i64 64729929336 monotonic @@ -91,7 +91,7 @@ define void @store_double(ptr addrspace(1) %ptr) { ; GCN: ; %bb.0: ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 
; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4063233333333333) +; GCN-NEXT: v_mov_b64_e32 v[2:3], 0x4063233333333333 ; GCN-NEXT: global_store_b64 v[0:1], v[2:3], off ; GCN-NEXT: s_set_pc_i64 s[30:31] store double 153.1, ptr addrspace(1) %ptr @@ -104,7 +104,7 @@ define i1 @class_f64() noinline optnone { ; GCN-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-SDAG-NEXT: s_wait_kmcnt 0x0 ; GCN-SDAG-NEXT: s_mov_b32 s2, 1 -; GCN-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0x4063233333333333) +; GCN-SDAG-NEXT: s_mov_b64 s[0:1], 0x4063233333333333 ; GCN-SDAG-NEXT: v_cmp_class_f64_e64 s0, s[0:1], s2 ; GCN-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GCN-SDAG-NEXT: s_set_pc_i64 s[30:31] @@ -114,7 +114,7 @@ define i1 @class_f64() noinline optnone { ; GCN-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-GISEL-NEXT: s_wait_kmcnt 0x0 ; GCN-GISEL-NEXT: s_mov_b32 s2, 1 -; GCN-GISEL-NEXT: s_mov_b64 s[0:1], lit64(0x4063233333333333) +; GCN-GISEL-NEXT: s_mov_b64 s[0:1], 0x4063233333333333 ; GCN-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] ; GCN-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GCN-GISEL-NEXT: v_cmp_class_f64_e64 s0, v[0:1], v2 @@ -131,7 +131,7 @@ define double @rsq_f64() { ; GCN: ; %bb.0: ; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_rsq_f64_e32 v[0:1], lit64(0x4063233333333333) +; GCN-NEXT: v_rsq_f64_e32 v[0:1], 0x4063233333333333 ; GCN-NEXT: s_set_pc_i64 s[30:31] %result = call double @llvm.amdgcn.rsq.f64(double 153.1) nounwind readnone ret double %result @@ -140,7 +140,7 @@ define double @rsq_f64() { define amdgpu_ps i64 @s_and_b64(i64 inreg %a) { ; GCN-LABEL: s_and_b64: ; GCN: ; %bb.0: -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], lit64(0xf12345678) +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], 0xf12345678 ; GCN-NEXT: ; return to shader part epilog %result = and i64 %a, 64729929336 ret i64 %result @@ -170,7 +170,7 @@ define amdgpu_ps void @v_and_b64(i64 %a, ptr addrspace(1) %out) { define amdgpu_ps <2 x float> @v_add_f64_200.1(double %a) { ; GCN-LABEL: v_add_f64_200.1: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_f64_e32 v[0:1], lit64(0x4069033333333333), v[0:1] +; GCN-NEXT: v_add_f64_e32 v[0:1], 0x4069033333333333, v[0:1] ; GCN-NEXT: ; return to shader part epilog %add = fadd double %a, 200.1 %ret = bitcast double %add to <2 x float> @@ -194,14 +194,14 @@ define amdgpu_ps <2 x float> @v_add_f64_200.0(double %a) { define amdgpu_ps <2 x float> @v_lshl_add_u64(i64 %a) { ; GCN-SDAG-LABEL: v_lshl_add_u64: ; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xf12345678) +; GCN-SDAG-NEXT: s_mov_b64 s[0:1], 0xf12345678 ; GCN-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, s[0:1] ; GCN-SDAG-NEXT: ; return to shader part epilog ; ; GCN-GISEL-LABEL: v_lshl_add_u64: ; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], lit64(0xf12345678) +; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0xf12345678 ; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3] ; GCN-GISEL-NEXT: ; return to shader part epilog @@ -216,10 +216,10 @@ define amdgpu_ps <2 x float> @v_lshl_add_u64(i64 %a) { define amdgpu_ps <2 x float> @v_fma_f64(double %a, double %b) { ; GCN-SDAG-LABEL: v_fma_f64: ; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: v_fmaak_f64 v[4:5], v[0:1], v[2:3], lit64(0x4063233333333333) -; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4069033333333333) +; GCN-SDAG-NEXT: v_fmaak_f64 v[4:5], v[0:1], v[2:3], 0x4063233333333333 +; GCN-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x4069033333333333 ; GCN-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-SDAG-NEXT: v_fmaak_f64 v[0:1], v[0:1], v[4:5], lit64(0x4069033333333333) +; GCN-SDAG-NEXT: v_fmaak_f64 v[0:1], v[0:1], v[4:5], 0x4069033333333333 ; GCN-SDAG-NEXT: v_fmac_f64_e32 v[2:3], v[0:1], v[4:5] ; GCN-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 @@ -227,11 +227,11 @@ define amdgpu_ps <2 x float> @v_fma_f64(double %a, double %b) { ; ; GCN-GISEL-LABEL: v_fma_f64: ; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: v_mov_b64_e32 v[4:5], lit64(0x4063233333333333) +; GCN-GISEL-NEXT: v_mov_b64_e32 v[4:5], 0x4063233333333333 ; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GCN-GISEL-NEXT: v_fmac_f64_e32 v[4:5], v[0:1], v[2:3] -; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4069033333333333) -; GCN-GISEL-NEXT: v_fmaak_f64 v[0:1], v[0:1], v[4:5], lit64(0x4069033333333333) +; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0x4069033333333333 +; GCN-GISEL-NEXT: v_fmaak_f64 v[0:1], v[0:1], v[4:5], 0x4069033333333333 ; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GCN-GISEL-NEXT: v_fmac_f64_e32 v[2:3], v[0:1], v[4:5] ; GCN-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 @@ -246,7 +246,7 @@ define amdgpu_ps <2 x float> @v_fma_f64(double %a, double %b) { define amdgpu_ps <2 x float> @v_add_neg_f64(double %a) { ; GCN-SDAG-LABEL: v_add_neg_f64: ; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0x4069033333333333) +; GCN-SDAG-NEXT: s_mov_b64 s[0:1], 0x4069033333333333 ; GCN-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-SDAG-NEXT: v_add_f64_e64 v[0:1], -v[0:1], s[0:1] ; GCN-SDAG-NEXT: ; return to shader part epilog @@ -254,7 +254,7 @@ define amdgpu_ps <2 x float> @v_add_neg_f64(double %a) { ; GCN-GISEL-LABEL: v_add_neg_f64: ; GCN-GISEL: ; %bb.0: ; GCN-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4069033333333333) +; GCN-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0x4069033333333333 ; GCN-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GCN-GISEL-NEXT: v_add_f64_e64 v[0:1], -v[0:1], v[2:3] ; GCN-GISEL-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll index bb4a607fc62d0..44a4e8171ff33 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.gfx90a.ll @@ -18,22 +18,19 @@ define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, ptr addrspace(1 ; GCN-LABEL: load_1d_lwe: ; GCN: ; %bb.0: ; %main_body ; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v6, v0 ; GCN-NEXT: v_mov_b32_e32 v9, v8 ; GCN-NEXT: v_mov_b32_e32 v10, v8 ; GCN-NEXT: v_mov_b32_e32 v11, v8 ; GCN-NEXT: v_mov_b32_e32 v12, v8 -; GCN-NEXT: v_mov_b32_e32 v2, v8 -; GCN-NEXT: v_mov_b32_e32 v3, v9 -; GCN-NEXT: v_mov_b32_e32 v4, v10 -; GCN-NEXT: v_mov_b32_e32 v5, v11 -; GCN-NEXT: v_mov_b32_e32 v6, v12 -; GCN-NEXT: image_load v[2:6], v0, s[0:7] dmask:0xf unorm lwe +; GCN-NEXT: v_mov_b32_e32 v0, v8 +; GCN-NEXT: v_mov_b32_e32 v1, v9 +; GCN-NEXT: v_mov_b32_e32 v2, v10 +; GCN-NEXT: v_mov_b32_e32 v3, v11 +; GCN-NEXT: v_mov_b32_e32 v4, v12 +; GCN-NEXT: image_load v[0:4], v6, s[0:7] dmask:0xf unorm lwe ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v2 -; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: v_mov_b32_e32 v2, v4 -; GCN-NEXT: v_mov_b32_e32 v3, v5 -; GCN-NEXT: global_store_dword v8, 
v6, s[8:9] +; GCN-NEXT: global_store_dword v8, v4, s[8:9] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll index 847957dab72d9..fa6d878ad7556 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -stop-after=amdgpu-isel < %s | FileCheck --check-prefix=CHECK45 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -enable-new-pm -stop-after=amdgpu-isel < %s | FileCheck --check-prefix=CHECK45 %s define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) { ; CHECK-LABEL: name: basic_raw_buffer @@ -24,7 +26,32 @@ define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) { ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_2]] ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_4]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 0, i32 1234, i32 5678) + ; + ; CHECK45-LABEL: name: basic_raw_buffer + ; CHECK45: bb.0 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; CHECK45-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -6629298651489370112 + ; CHECK45-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE]], killed [[S_MOV_B]], implicit-def dead $scc + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK45-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec + ; CHECK45-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 9 + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_]] + ; CHECK45-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -536870912 + ; CHECK45-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_2]] + ; CHECK45-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK45-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK45-NEXT: $sgpr2 = COPY [[S_MOV_B32_1]] + ; CHECK45-NEXT: $sgpr3 = COPY [[S_MOV_B32_3]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 0, i64 1234, i32 5678) ret ptr addrspace(8) %rsrc } @@ -42,7 +69,22 @@ define amdgpu_ps float @read_raw_buffer(ptr addrspace(1) inreg %p) { ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 4, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 
= COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0) + ; + ; CHECK45-LABEL: name: read_raw_buffer + ; CHECK45: bb.0 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 + ; CHECK45-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET killed [[REG_SEQUENCE1]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK45-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %p, i16 0, i64 0, i32 0) %loaded = call float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) ret float %loaded } @@ -71,117 +113,345 @@ define amdgpu_ps ptr addrspace(8) @basic_struct_buffer(ptr inreg %p) { ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_3]] ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_5]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i32 1234, i32 5678) + ; + ; CHECK45-LABEL: name: basic_struct_buffer + ; CHECK45: bb.0 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; CHECK45-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -6629298651489370112 + ; CHECK45-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE]], killed [[S_MOV_B]], implicit-def dead $scc + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK45-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec + ; CHECK45-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY2]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 9 + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_]] + ; CHECK45-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -536854528 + ; CHECK45-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 killed [[S_MOV_B32_2]] + ; CHECK45-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK45-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK45-NEXT: $sgpr2 = COPY [[S_MOV_B32_1]] + ; CHECK45-NEXT: $sgpr3 = COPY [[S_MOV_B32_3]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, 
$sgpr2, $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i64 1234, i32 5678) ret ptr addrspace(8) %rsrc } -define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i32 inreg %numVals, i32 inreg %flags) { +define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i64 inreg %numVals, i32 inreg %flags) { ; CHECK-LABEL: name: variable_top_half ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3 + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr4 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[DEF]], %subreg.sub1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY2]], killed [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 262144 ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]], implicit-def dead $scc - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec - ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY6]], implicit $exec + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_]] - ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_1]] ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i32 %numVals, i32 %flags) + ; + ; CHECK45-LABEL: name: variable_top_half + ; CHECK45: bb.0 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; CHECK45-NEXT: {{ 
$}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY3]], %subreg.sub1 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 25 + ; CHECK45-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 killed [[COPY5]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_LSHL_B32_]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE]], killed [[REG_SEQUENCE2]], implicit-def dead $scc + ; CHECK45-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1 + ; CHECK45-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 28 + ; CHECK45-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY]], killed [[S_MOV_B32_2]], implicit-def dead $scc + ; CHECK45-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK45-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK45-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[DEF]], %subreg.sub0, killed [[S_LSHL_B32_1]], %subreg.sub1 + ; CHECK45-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 7 + ; CHECK45-NEXT: [[S_LSHR_B64_:%[0-9]+]]:sreg_64 = S_LSHR_B64 [[REG_SEQUENCE1]], killed [[S_MOV_B32_3]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_LSHR_B64_]], killed [[REG_SEQUENCE3]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 16384 + ; CHECK45-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK45-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK45-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[DEF2]], %subreg.sub0, killed [[S_MOV_B32_4]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_2:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[S_OR_B64_1]], killed [[REG_SEQUENCE4]], implicit-def dead $scc + ; CHECK45-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub1 + ; CHECK45-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY8]], implicit $exec + ; CHECK45-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY9]], implicit $exec + ; CHECK45-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK45-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec + ; CHECK45-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_LSHR_B64_]].sub0 + ; CHECK45-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec + ; CHECK45-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK45-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK45-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_3]] + ; CHECK45-NEXT: $sgpr3 = COPY 
[[V_READFIRSTLANE_B32_]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i64 %numVals, i32 %flags) ret ptr addrspace(8) %rsrc } -define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride, i32 inreg %numVals, i32 inreg %flags) { +define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride, i64 inreg %numVals, i32 inreg %flags) { ; CHECK-LABEL: name: general_case ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr5 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[DEF]], %subreg.sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY3]], killed [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_LSHL_B32_]], implicit-def dead $scc - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY1]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec - ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY6]], implicit $exec + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_]] - ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_1]] ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 - %rsrc = call ptr addrspace(8) 
@llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + ; + ; CHECK45-LABEL: name: general_case + ; CHECK45: bb.0 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; CHECK45-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 7 + ; CHECK45-NEXT: [[S_LSHR_B64_:%[0-9]+]]:sreg_64 = S_LSHR_B64 [[REG_SEQUENCE1]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK45-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]] + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 12 + ; CHECK45-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 killed [[COPY6]], killed [[S_MOV_B32_1]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, killed [[S_LSHL_B32_]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[S_LSHR_B64_]], killed [[REG_SEQUENCE2]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 28 + ; CHECK45-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY]], killed [[S_MOV_B32_3]], implicit-def dead $scc + ; CHECK45-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, killed [[S_LSHL_B32_1]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[S_OR_B64_]], killed [[REG_SEQUENCE3]], implicit-def dead $scc + ; CHECK45-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_1]].sub1 + ; CHECK45-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; CHECK45-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 25 + ; CHECK45-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 killed [[COPY8]], killed [[S_MOV_B32_4]], implicit-def dead $scc + ; CHECK45-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, killed [[S_LSHL_B32_2]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_2:%[0-9]+]]:sreg_64 = S_OR_B64 [[REG_SEQUENCE]], killed [[REG_SEQUENCE4]], implicit-def dead $scc + ; CHECK45-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub1 + ; CHECK45-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY10]], implicit $exec + ; CHECK45-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY11]], implicit $exec + ; CHECK45-NEXT: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK45-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY13]], implicit $exec + ; CHECK45-NEXT: [[COPY14:%[0-9]+]]:sreg_32 = COPY [[S_LSHR_B64_]].sub0 + ; CHECK45-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY14]] + ; CHECK45-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32_xm0 = 
V_READFIRSTLANE_B32 killed [[COPY15]], implicit $exec + ; CHECK45-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_2]] + ; CHECK45-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_]] + ; CHECK45-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_3]] + ; CHECK45-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_1]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i64 %numVals, i32 %flags) ret ptr addrspace(8) %rsrc } -define amdgpu_ps float @general_case_load(ptr inreg %p, i16 inreg %stride, i32 inreg %numVals, i32 inreg %flags) { +define amdgpu_ps float @general_case_load(ptr inreg %p, i16 inreg %stride, i64 inreg %numVals, i32 inreg %flags) { ; CHECK-LABEL: name: general_case_load ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4 + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr5 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[DEF]], %subreg.sub1 ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY3]], killed [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 16, implicit-def dead $scc ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 killed [[S_AND_B32_]], killed [[S_LSHL_B32_]], implicit-def dead $scc - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, killed [[S_OR_B32_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, killed [[S_OR_B32_]], %subreg.sub1, killed [[COPY5]], %subreg.sub2, [[COPY]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY5]], killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY6]], killed [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + ; + ; CHECK45-LABEL: name: general_case_load + ; CHECK45: bb.0 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr5 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr4 + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr3 + ; 
CHECK45-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr2 + ; CHECK45-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 7 + ; CHECK45-NEXT: [[S_LSHR_B64_:%[0-9]+]]:sreg_64 = S_LSHR_B64 [[REG_SEQUENCE1]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK45-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY3]] + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 12 + ; CHECK45-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 killed [[COPY6]], killed [[S_MOV_B32_1]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, killed [[S_LSHL_B32_]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[S_LSHR_B64_]], killed [[REG_SEQUENCE2]], implicit-def dead $scc + ; CHECK45-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 28 + ; CHECK45-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY]], killed [[S_MOV_B32_3]], implicit-def dead $scc + ; CHECK45-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, killed [[S_LSHL_B32_1]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_1:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[S_OR_B64_]], killed [[REG_SEQUENCE3]], implicit-def dead $scc + ; CHECK45-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_1]].sub1 + ; CHECK45-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_1]].sub0 + ; CHECK45-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE1]].sub0 + ; CHECK45-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 25 + ; CHECK45-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 killed [[COPY9]], killed [[S_MOV_B32_4]], implicit-def dead $scc + ; CHECK45-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, killed [[S_LSHL_B32_2]], %subreg.sub1 + ; CHECK45-NEXT: [[S_OR_B64_2:%[0-9]+]]:sreg_64 = S_OR_B64 killed [[REG_SEQUENCE]], killed [[REG_SEQUENCE4]], implicit-def dead $scc + ; CHECK45-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub1 + ; CHECK45-NEXT: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_2]].sub0 + ; CHECK45-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY11]], %subreg.sub0, killed [[COPY10]], %subreg.sub1, killed [[COPY8]], %subreg.sub2, killed [[COPY7]], %subreg.sub3 + ; CHECK45-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_2]] + ; CHECK45-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN [[COPY12]], killed [[REG_SEQUENCE5]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK45-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i64 %numVals, i32 %flags) %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret float %value } ; None of the components are uniform due to the lack of an inreg -define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i32 %numVals, i32 %flags) { +define amdgpu_ps float @general_case_load_with_waterfall(ptr 
%p, i16 %stride, i64 %numVals, i32 %flags) { ; CHECK-LABEL: name: general_case_load_with_waterfall ; CHECK: bb.0 (%ir-block.0): - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[DEF]], %subreg.sub1 ; CHECK-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY2]], implicit $exec ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; CHECK-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[COPY3]], killed [[S_MOV_B32_]], killed [[V_LSHLREV_B32_e64_]], implicit $exec - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, killed [[V_AND_OR_B32_e64_]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, killed [[V_AND_OR_B32_e64_]], %subreg.sub1, killed [[COPY5]], %subreg.sub2, [[COPY]], %subreg.sub3 ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY5]], killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY6]], killed [[REG_SEQUENCE1]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + ; + ; CHECK45-LABEL: name: general_case_load_with_waterfall + ; CHECK45: bb.0 (%ir-block.0): + ; CHECK45-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr5 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; CHECK45-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; CHECK45-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY5]], %subreg.sub0, [[COPY4]], %subreg.sub1 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 28 + ; CHECK45-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_]], [[COPY]], implicit $exec + ; CHECK45-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE 
[[S_MOV_B32_1]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1 + ; CHECK45-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 7 + ; CHECK45-NEXT: [[COPY7:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; CHECK45-NEXT: [[V_LSHRREV_B64_e64_:%[0-9]+]]:vreg_64_align2 = V_LSHRREV_B64_e64 killed [[S_MOV_B32_2]], [[COPY7]], implicit $exec + ; CHECK45-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_LSHRREV_B64_e64_]].sub1 + ; CHECK45-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY3]] + ; CHECK45-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 12 + ; CHECK45-NEXT: [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_3]], killed [[COPY9]], implicit $exec + ; CHECK45-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_1]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE3]].sub1 + ; CHECK45-NEXT: [[V_OR3_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 killed [[COPY8]], killed [[COPY10]], killed [[COPY6]], implicit $exec + ; CHECK45-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 + ; CHECK45-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_LSHRREV_B64_e64_]].sub0 + ; CHECK45-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE3]].sub0 + ; CHECK45-NEXT: [[V_OR3_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR3_B32_e64 killed [[COPY12]], killed [[COPY13]], killed [[COPY11]], implicit $exec + ; CHECK45-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR3_B32_e64_1]], %subreg.sub0, killed [[V_OR3_B32_e64_]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub1 + ; CHECK45-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE4]].sub0 + ; CHECK45-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK45-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; CHECK45-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 25 + ; CHECK45-NEXT: [[V_LSHLREV_B32_e64_2:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_4]], killed [[COPY17]], implicit $exec + ; CHECK45-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_1]], %subreg.sub0, killed [[V_LSHLREV_B32_e64_2]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE5]].sub1 + ; CHECK45-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY16]], killed [[COPY18]], implicit $exec + ; CHECK45-NEXT: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK45-NEXT: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE5]].sub0 + ; CHECK45-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY19]], killed [[COPY20]], implicit $exec + ; CHECK45-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR_B32_e64_1]], %subreg.sub0, killed [[V_OR_B32_e64_]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE6]].sub1 + ; CHECK45-NEXT: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE6]].sub0 + ; CHECK45-NEXT: [[REG_SEQUENCE7:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY22]], %subreg.sub0, killed [[COPY21]], %subreg.sub1, killed [[COPY15]], %subreg.sub2, killed [[COPY14]], %subreg.sub3 + ; CHECK45-NEXT: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; CHECK45-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN [[COPY23]], killed [[REG_SEQUENCE7]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, 
addrspace 8) + ; CHECK45-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i64 %numVals, i32 %flags) %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret float %value } @@ -200,7 +470,22 @@ define amdgpu_ps float @read_buffer_fat_ptr_p0(ptr inreg %p) { ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.ptr, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr %p, i16 0, i32 0, i32 0) + ; + ; CHECK45-LABEL: name: read_buffer_fat_ptr_p0 + ; CHECK45: bb.0 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 + ; CHECK45-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET killed [[REG_SEQUENCE1]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.ptr, align 1, addrspace 8) + ; CHECK45-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr %p, i16 0, i64 0, i32 0) %loaded = load float, ptr addrspace(7) %ptr ret float %loaded } @@ -219,14 +504,29 @@ define amdgpu_ps float @read_buffer_fat_ptr_p1(ptr addrspace(1) inreg %p) { ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.ptr, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0) + ; + ; CHECK45-LABEL: name: read_buffer_fat_ptr_p1 + ; CHECK45: bb.0 (%ir-block.0): + ; CHECK45-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK45-NEXT: {{ $}} + ; CHECK45-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK45-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK45-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; CHECK45-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK45-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK45-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK45-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY3]], %subreg.sub0, killed [[COPY2]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 + ; CHECK45-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_DWORD_VBUFFER_OFFSET killed [[REG_SEQUENCE1]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.ptr, align 1, addrspace 8) + ; CHECK45-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET]] + ; CHECK45-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %p, i16 0, i64 0, i32 0) %loaded = load float, ptr addrspace(7) %ptr ret float %loaded } -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr nocapture readnone, i16, i32, i32) -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) -declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr nocapture readnone, i16, i32, i32) -declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr nocapture readnone, i16, i64, i32) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) nocapture readnone, i16, i64, i32) +declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr nocapture readnone, i16, i64, i32) +declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) nocapture readnone, i16, i64, i32) declare float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) declare float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll index 6e24a6a348f2c..c265b05813ee7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll @@ -87,7 +87,7 @@ define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr ; ; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_data_sgpr_min_offset: ; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry -; GFX1250-SPREFETCH-SDAG-NEXT: s_mov_b64 s[2:3], lit64(0xffffffffff800000) +; GFX1250-SPREFETCH-SDAG-NEXT: s_mov_b64 s[2:3], 0xffffffffff800000 ; GFX1250-SPREFETCH-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_data s[0:1], 0x0, null, 0 @@ -424,7 +424,7 @@ define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr ; ; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_inst_sgpr_min_offset: ; GFX1250-SPREFETCH-SDAG: ; %bb.0: ; %entry -; GFX1250-SPREFETCH-SDAG-NEXT: s_mov_b64 s[2:3], lit64(0xffffffffff800000) +; GFX1250-SPREFETCH-SDAG-NEXT: s_mov_b64 s[2:3], 0xffffffffff800000 ; GFX1250-SPREFETCH-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-SPREFETCH-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3] ; GFX1250-SPREFETCH-SDAG-NEXT: s_prefetch_inst s[0:1], 0x0, null, 0 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 5b2213592f495..f93e5f06beff9 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -10159,14 +10159,14 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; ; GFX8-LABEL: constant_sextload_v64i1_to_v64i64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: ; implicit-def: $vgpr62 : SGPR spill to VGPR lane ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dwordx2 s[2:3], s[10:11], 0x0 +; 
GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshr_b32 s0, s3, 8 +; GFX8-NEXT: s_lshr_b32 s4, s3, 8 ; GFX8-NEXT: s_lshr_b32 s48, s3, 15 -; GFX8-NEXT: v_writelane_b32 v62, s0, 0 +; GFX8-NEXT: v_writelane_b32 v62, s4, 0 ; GFX8-NEXT: s_lshr_b32 s74, s3, 30 ; GFX8-NEXT: s_lshr_b32 s30, s3, 31 ; GFX8-NEXT: s_lshr_b32 s72, s3, 28 @@ -10186,11 +10186,11 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_lshr_b32 s58, s3, 14 ; GFX8-NEXT: s_lshr_b32 s62, s3, 12 ; GFX8-NEXT: s_lshr_b32 s54, s3, 10 -; GFX8-NEXT: v_writelane_b32 v62, s1, 1 -; GFX8-NEXT: s_lshr_b32 s0, s3, 9 +; GFX8-NEXT: v_writelane_b32 v62, s5, 1 +; GFX8-NEXT: s_lshr_b32 s4, s3, 9 ; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000 ; GFX8-NEXT: s_lshr_b32 s52, s3, 11 -; GFX8-NEXT: v_writelane_b32 v62, s0, 2 +; GFX8-NEXT: v_writelane_b32 v62, s4, 2 ; GFX8-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000 @@ -10213,8 +10213,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v34, s48 ; GFX8-NEXT: s_lshr_b32 s48, s2, 1 ; GFX8-NEXT: s_lshr_b32 s50, s3, 13 -; GFX8-NEXT: v_writelane_b32 v62, s1, 3 -; GFX8-NEXT: s_lshr_b32 s6, s3, 6 +; GFX8-NEXT: v_writelane_b32 v62, s5, 3 +; GFX8-NEXT: s_lshr_b32 s8, s3, 6 ; GFX8-NEXT: s_lshr_b32 s10, s3, 7 ; GFX8-NEXT: s_lshr_b32 s12, s3, 4 ; GFX8-NEXT: s_lshr_b32 s14, s3, 5 @@ -10264,8 +10264,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_lshr_b32 s54, s2, 13 ; GFX8-NEXT: s_lshr_b32 s52, s2, 10 ; GFX8-NEXT: v_mov_b32_e32 v30, s46 -; GFX8-NEXT: s_lshr_b32 s4, s2, 11 -; GFX8-NEXT: s_lshr_b32 s0, s2, 8 +; GFX8-NEXT: s_lshr_b32 s6, s2, 11 +; GFX8-NEXT: s_lshr_b32 s4, s2, 8 ; GFX8-NEXT: s_lshr_b32 s46, s2, 9 ; GFX8-NEXT: s_lshr_b32 s44, s2, 6 ; GFX8-NEXT: s_lshr_b32 s42, s2, 7 @@ -10278,14 +10278,12 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_writelane_b32 v62, s2, 4 ; GFX8-NEXT: v_writelane_b32 v62, s3, 5 ; GFX8-NEXT: v_readlane_b32 s2, v62, 2 -; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX8-NEXT: v_readlane_b32 s3, v62, 3 -; GFX8-NEXT: v_mov_b32_e32 v38, s50 -; GFX8-NEXT: v_mov_b32_e32 v39, s51 -; GFX8-NEXT: s_bfe_i64 s[50:51], s[4:5], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[4:5], s[6:7], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000 +; GFX8-NEXT: v_mov_b32_e32 v35, s49 +; GFX8-NEXT: s_bfe_i64 s[48:49], s[4:5], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000 ; GFX8-NEXT: v_readlane_b32 s2, v62, 0 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000 ; GFX8-NEXT: v_readlane_b32 s3, v62, 1 ; GFX8-NEXT: v_mov_b32_e32 v5, s75 ; GFX8-NEXT: v_mov_b32_e32 v13, s73 @@ -10303,8 +10301,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v29, s57 ; GFX8-NEXT: v_mov_b32_e32 v31, s47 ; GFX8-NEXT: v_mov_b32_e32 v33, s59 -; GFX8-NEXT: v_mov_b32_e32 v35, s49 ; GFX8-NEXT: v_mov_b32_e32 v37, s63 +; GFX8-NEXT: v_mov_b32_e32 v38, s50 +; GFX8-NEXT: v_mov_b32_e32 v39, s51 ; GFX8-NEXT: v_mov_b32_e32 v41, s55 ; GFX8-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000 @@ -10313,7 +10312,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[44:45], 
s[44:45], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[48:49], s[0:1], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[50:51], s[6:7], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000 @@ -10341,83 +10340,84 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000 ; GFX8-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[0:1], s[10:11], 0x10000 -; GFX8-NEXT: s_bfe_i64 s[10:11], s[2:3], 0x10000 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1f0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000 +; GFX8-NEXT: s_bfe_i64 s[6:7], s[2:3], 0x10000 +; GFX8-NEXT: s_add_u32 s2, s0, 0x1f0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v43, s3 ; GFX8-NEXT: v_mov_b32_e32 v42, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1e0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x1e0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v45, s3 ; GFX8-NEXT: v_mov_b32_e32 v44, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1d0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x1d0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v47, s3 ; GFX8-NEXT: v_mov_b32_e32 v46, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1c0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x1c0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v49, s3 ; GFX8-NEXT: v_mov_b32_e32 v48, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1b0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x1b0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v51, s3 ; GFX8-NEXT: v_mov_b32_e32 v50, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x1a0 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x1a0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v53, s3 ; GFX8-NEXT: v_mov_b32_e32 v52, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x190 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x190 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v55, s3 ; GFX8-NEXT: v_mov_b32_e32 v54, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x180 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x180 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v57, s3 ; GFX8-NEXT: v_mov_b32_e32 v56, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x170 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x170 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v59, s3 ; GFX8-NEXT: v_mov_b32_e32 v58, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x160 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x160 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v61, s3 ; GFX8-NEXT: v_mov_b32_e32 v60, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x150 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 +; GFX8-NEXT: s_add_u32 s2, s0, 0x150 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[44:45], v[12:15] -; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] +; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] ; GFX8-NEXT: v_mov_b32_e32 v13, s3 ; GFX8-NEXT: v_mov_b32_e32 v12, s2 -; GFX8-NEXT: s_add_u32 s2, s8, 0x140 -; GFX8-NEXT: s_addc_u32 s3, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x130 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: 
s_addc_u32 s1, s9, 0 -; GFX8-NEXT: flat_store_dwordx4 v[42:43], v[4:7] +; GFX8-NEXT: s_add_u32 s2, s0, 0x140 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v15, s3 +; GFX8-NEXT: v_mov_b32_e32 v14, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x130 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: flat_store_dwordx4 v[46:47], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[48:49], v[8:11] ; GFX8-NEXT: flat_store_dwordx4 v[50:51], v[16:19] -; GFX8-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NEXT: v_mov_b32_e32 v17, s1 -; GFX8-NEXT: v_mov_b32_e32 v16, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x120 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v19, s1 -; GFX8-NEXT: v_mov_b32_e32 v18, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x110 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_mov_b32_e32 v15, s3 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v17, s3 +; GFX8-NEXT: v_mov_b32_e32 v16, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x120 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v19, s3 +; GFX8-NEXT: v_mov_b32_e32 v18, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x110 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v42, vcc_lo ; GFX8-NEXT: v_mov_b32_e32 v43, vcc_hi -; GFX8-NEXT: v_mov_b32_e32 v14, s2 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 -; GFX8-NEXT: v_mov_b32_e32 v7, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v6, s4 +; GFX8-NEXT: v_mov_b32_e32 v7, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: flat_store_dwordx4 v[52:53], v[20:23] +; GFX8-NEXT: v_mov_b32_e32 v2, s10 +; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: flat_store_dwordx4 v[54:55], v[24:27] ; GFX8-NEXT: v_mov_b32_e32 v10, s14 @@ -10429,165 +10429,165 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o ; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[8:11] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x100 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x100 ; GFX8-NEXT: v_mov_b32_e32 v0, s16 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xf0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0xf0 ; GFX8-NEXT: v_mov_b32_e32 v0, s22 ; GFX8-NEXT: v_mov_b32_e32 v1, s23 ; GFX8-NEXT: v_mov_b32_e32 v2, s20 ; GFX8-NEXT: v_mov_b32_e32 v3, s21 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xe0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0xe0 ; GFX8-NEXT: v_mov_b32_e32 v0, s24 ; GFX8-NEXT: v_mov_b32_e32 v1, s25 ; GFX8-NEXT: v_mov_b32_e32 v2, s26 ; GFX8-NEXT: v_mov_b32_e32 v3, s27 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; 
GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xd0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0xd0 ; GFX8-NEXT: v_mov_b32_e32 v0, s28 ; GFX8-NEXT: v_mov_b32_e32 v1, s29 ; GFX8-NEXT: v_mov_b32_e32 v2, s86 ; GFX8-NEXT: v_mov_b32_e32 v3, s87 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xc0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0xc0 ; GFX8-NEXT: v_mov_b32_e32 v0, s84 ; GFX8-NEXT: v_mov_b32_e32 v1, s85 ; GFX8-NEXT: v_mov_b32_e32 v2, s82 ; GFX8-NEXT: v_mov_b32_e32 v3, s83 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xb0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0xb0 ; GFX8-NEXT: v_mov_b32_e32 v0, s80 ; GFX8-NEXT: v_mov_b32_e32 v1, s81 ; GFX8-NEXT: v_mov_b32_e32 v2, s78 ; GFX8-NEXT: v_mov_b32_e32 v3, s79 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0xa0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0xa0 ; GFX8-NEXT: v_mov_b32_e32 v0, s76 ; GFX8-NEXT: v_mov_b32_e32 v1, s77 ; GFX8-NEXT: v_mov_b32_e32 v2, s74 ; GFX8-NEXT: v_mov_b32_e32 v3, s75 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x90 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x90 ; GFX8-NEXT: v_mov_b32_e32 v0, s72 ; GFX8-NEXT: v_mov_b32_e32 v1, s73 ; GFX8-NEXT: v_mov_b32_e32 v2, s70 ; GFX8-NEXT: v_mov_b32_e32 v3, s71 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x80 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x80 ; GFX8-NEXT: v_mov_b32_e32 v0, s68 ; GFX8-NEXT: v_mov_b32_e32 v1, s69 ; GFX8-NEXT: v_mov_b32_e32 v2, s66 ; GFX8-NEXT: v_mov_b32_e32 v3, s67 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x70 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x70 ; GFX8-NEXT: v_mov_b32_e32 v0, s64 ; GFX8-NEXT: v_mov_b32_e32 v1, s65 ; GFX8-NEXT: v_mov_b32_e32 v2, s62 ; GFX8-NEXT: v_mov_b32_e32 v3, s63 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x60 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x60 ; GFX8-NEXT: v_mov_b32_e32 v0, s60 ; 
GFX8-NEXT: v_mov_b32_e32 v1, s61 ; GFX8-NEXT: v_mov_b32_e32 v2, s58 ; GFX8-NEXT: v_mov_b32_e32 v3, s59 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 0x50 ; GFX8-NEXT: v_mov_b32_e32 v0, s56 ; GFX8-NEXT: v_mov_b32_e32 v1, s57 ; GFX8-NEXT: v_mov_b32_e32 v2, s54 ; GFX8-NEXT: v_mov_b32_e32 v3, s55 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 64 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 64 ; GFX8-NEXT: v_mov_b32_e32 v0, s52 ; GFX8-NEXT: v_mov_b32_e32 v1, s53 ; GFX8-NEXT: v_mov_b32_e32 v2, s50 ; GFX8-NEXT: v_mov_b32_e32 v3, s51 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 48 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 48 ; GFX8-NEXT: v_mov_b32_e32 v0, s48 ; GFX8-NEXT: v_mov_b32_e32 v1, s49 ; GFX8-NEXT: v_mov_b32_e32 v2, s46 ; GFX8-NEXT: v_mov_b32_e32 v3, s47 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 32 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 32 ; GFX8-NEXT: v_mov_b32_e32 v0, s44 ; GFX8-NEXT: v_mov_b32_e32 v1, s45 ; GFX8-NEXT: v_mov_b32_e32 v2, s42 ; GFX8-NEXT: v_mov_b32_e32 v3, s43 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: s_add_u32 s0, s8, 16 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: s_add_u32 s2, s0, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s40 ; GFX8-NEXT: v_mov_b32_e32 v1, s41 ; GFX8-NEXT: v_mov_b32_e32 v2, s38 ; GFX8-NEXT: v_mov_b32_e32 v3, s39 -; GFX8-NEXT: s_addc_u32 s1, s9, 0 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s36 ; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: v_mov_b32_e32 v4, s0 -; GFX8-NEXT: v_readlane_b32 s0, v62, 4 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_readlane_b32 s2, v62, 4 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GFX8-NEXT: v_readlane_b32 s1, v62, 5 -; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_readlane_b32 s3, v62, 5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v0, s30 ; GFX8-NEXT: v_mov_b32_e32 v1, s31 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll 
b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index f44a0b0ac2c65..bd191a37582c0 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -3807,53 +3807,64 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; VI-DS128-NEXT: v_mov_b32_e32 v4, v3 +; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v10 ; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v17 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v16 -; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17 -; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v8 ; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill -; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v23 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v23 -; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 ; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill -; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v21 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v20 -; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v21 -; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v20 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) +; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 +; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 +; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 
16, v23 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v23 +; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 +; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 ; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 @@ -3864,24 +3875,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8 -; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11 -; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10 -; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9 +; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 +; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 +; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; VI-DS128-NEXT: v_mov_b32_e32 v24, s0 -; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 -; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 -; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 -; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 -; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 @@ -3905,27 +3908,27 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144 ; VI-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96 ; VI-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 -; VI-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:64 +; VI-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:64 +; VI-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:80 +; VI-DS128-NEXT: ds_write_b128 v24, v[12:15] offset:32 ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:36 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword 
v2, off, s[88:91], 0 offset:40 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:44 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:80 +; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:32 +; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 -; VI-DS128-NEXT: ds_write_b128 v24, v[12:15] -; VI-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:16 +; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_zextload_v64i16_to_v64i32: @@ -3941,58 +3944,67 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v11 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 -; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 -; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v11 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v10 +; GFX9-DS128-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v11 +; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v10 ; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_nop 0 ; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v17 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v16 -; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17 -; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v16 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v9 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v8 +; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v8 ; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_nop 0 ; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 -; 
GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v23 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v22 -; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v23 -; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v22 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18 +; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19 +; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18 +; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32 ; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_nop 0 ; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v21 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v20 -; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v21 -; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v20 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48 +; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v16 +; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v17 +; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v23 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v22 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v20 +; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v23 +; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v22 +; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21 +; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v20 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27 -; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64 ; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26 ; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26 @@ -4003,23 +4015,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24 ; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112 -; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11 -; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10 -; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9 -; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8 +; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 +; 
GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 +; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 +; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24 ; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25 ; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24 ; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37 -; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36 -; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39 -; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38 -; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37 ; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58 ; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57 @@ -4043,27 +4048,27 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96 ; GFX9-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[12:15] offset:32 ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v24, v[12:15] -; GFX9-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(3) %in %ext = zext <64 x i16> %load to <64 x i32> @@ -4844,8 +4849,8 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-LABEL: local_sextload_v64i16_to_v64i32: ; VI-DS128: ; %bb.0: ; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-DS128-NEXT: s_mov_b32 m0, -1 ; VI-DS128-NEXT: 
s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; VI-DS128-NEXT: s_mov_b32 s90, -1 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) @@ -4855,66 +4860,67 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000 ; VI-DS128-NEXT: s_add_u32 s88, s88, s11 ; VI-DS128-NEXT: s_addc_u32 s89, s89, 0 -; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v19 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v18 -; VI-DS128-NEXT: v_bfe_i32 v2, v19, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v0, v18, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 +; VI-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v0, v10, 0, 16 ; VI-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill -; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v17 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v16 -; VI-DS128-NEXT: v_bfe_i32 v5, v17, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v3, v16, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v9 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 +; VI-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 ; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill ; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill +; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 ; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 -; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v17 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v16 +; VI-DS128-NEXT: v_bfe_i32 v10, v19, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v8, v18, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v14, v17, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v12, v16, 0, 16 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v26 ; VI-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36 ; VI-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16 ; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 +; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(2) -; VI-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 -; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v11 -; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v37 ; VI-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 
v50, v37, 0, 16 +; VI-DS128-NEXT: s_waitcnt lgkmcnt(1) +; VI-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 +; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; VI-DS128-NEXT: v_mov_b32_e32 v32, s0 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v10 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8 +; VI-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 +; VI-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 +; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: s_waitcnt lgkmcnt(0) ; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v12, v8, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v25 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v24 -; VI-DS128-NEXT: v_bfe_i32 v10, v25, 0, 16 -; VI-DS128-NEXT: v_bfe_i32 v8, v24, 0, 16 -; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 @@ -4950,22 +4956,22 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; VI-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144 ; VI-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96 ; VI-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112 -; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:64 +; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:64 ; VI-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80 +; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:32 +; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:48 ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:32 +; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] ; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload ; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload ; VI-DS128-NEXT: s_waitcnt vmcnt(0) -; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:48 -; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] -; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:16 +; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:16 ; VI-DS128-NEXT: s_endpgm ; ; GFX9-DS128-LABEL: local_sextload_v64i16_to_v64i32: @@ -4981,69 +4987,69 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16 ; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0 -; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v11 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v19 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v18 -; GFX9-DS128-NEXT: 
v_bfe_i32 v2, v19, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v0, v18, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v11 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v10 +; GFX9-DS128-NEXT: v_bfe_i32 v2, v11, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v0, v10, 0, 16 ; GFX9-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_nop 0 ; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v17 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v16 -; GFX9-DS128-NEXT: v_bfe_i32 v5, v17, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v3, v16, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v9 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v8 +; GFX9-DS128-NEXT: v_bfe_i32 v5, v9, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v3, v8, 0, 16 ; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: s_nop 0 ; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill ; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill +; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32 ; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48 -; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v19 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v18 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v17 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v16 +; GFX9-DS128-NEXT: v_bfe_i32 v10, v19, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v8, v18, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v14, v17, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v12, v16, 0, 16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v26 ; GFX9-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36 ; GFX9-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16 ; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64 +; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80 ; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v25 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v24 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2) -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 -; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v10 -; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v37 ; GFX9-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16 +; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40 +; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16 ; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112 ; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8 -; GFX9-DS128-NEXT: 
v_bfe_i32 v22, v11, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v22, v25, 0, 16 +; GFX9-DS128-NEXT: v_bfe_i32 v20, v24, 0, 16 +; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37 ; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16 ; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v12, v8, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v25 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v24 -; GFX9-DS128-NEXT: v_bfe_i32 v10, v25, 0, 16 -; GFX9-DS128-NEXT: v_bfe_i32 v8, v24, 0, 16 -; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34 ; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33 ; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16 @@ -5079,22 +5085,22 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out ; GFX9-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:64 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:64 ; GFX9-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:48 ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:32 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] ; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload ; GFX9-DS128-NEXT: s_waitcnt vmcnt(0) -; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:48 -; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] -; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:16 +; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:16 ; GFX9-DS128-NEXT: s_endpgm %load = load <64 x i16>, ptr addrspace(3) %in %ext = sext <64 x i16> %load to <64 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/lower-brcond-with-xor.ll b/llvm/test/CodeGen/AMDGPU/lower-brcond-with-xor.ll new file mode 100644 index 0000000000000..e2f8df0448f82 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-brcond-with-xor.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a --debug-counter=dagcombine=0 -start-before=si-annotate-control-flow %s -o - | FileCheck %s + +define amdgpu_kernel void @test(i32 %N, ptr addrspace(1) %p) { +; CHECK-LABEL: test: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; CHECK-NEXT: v_cmp_gt_i32_e32 vcc, 1, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc +; CHECK-NEXT: s_endpgm +entry: + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + 
%cmp2 = icmp slt i32 %id.x, 1 + br i1 %cmp2, label %if.then, label %exit + +if.then: + %idx.ext = zext i32 %N to i64 + %add.ptr = getelementptr i8, ptr addrspace(1) %p, i64 %idx.ext + ret void + +exit: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll index 66de953043f10..610c3e2c02867 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll @@ -282,9 +282,9 @@ define i160 @ptrtoaddr_ext(ptr addrspace(7) %ptr) { ; CHECK-LABEL: define i160 @ptrtoaddr_ext ; CHECK-SAME: ({ ptr addrspace(8), i32 } [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 -; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 -; CHECK-NEXT: [[RET:%.*]] = zext i32 [[PTR_OFF]] to i160 -; CHECK-NEXT: ret i160 [[RET]] +; CHECK-NEXT: [[ADDR:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: [[EXT:%.*]] = zext i32 [[ADDR]] to i160 +; CHECK-NEXT: ret i160 [[EXT]] ; %addr = ptrtoaddr ptr addrspace(7) %ptr to i32 %ext = zext i32 %addr to i160 @@ -296,9 +296,9 @@ define i16 @ptrtoaddr_trunc(ptr addrspace(7) %ptr) { ; CHECK-LABEL: define i16 @ptrtoaddr_trunc ; CHECK-SAME: ({ ptr addrspace(8), i32 } [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0 -; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 -; CHECK-NEXT: [[RET:%.*]] = trunc i32 [[PTR_OFF]] to i16 -; CHECK-NEXT: ret i16 [[RET]] +; CHECK-NEXT: [[ADDR:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[ADDR]] to i16 +; CHECK-NEXT: ret i16 [[TRUNC]] ; %addr = ptrtoaddr ptr addrspace(7) %ptr to i32 %trunc = trunc i32 %addr to i16 @@ -450,17 +450,17 @@ define <2 x ptr addrspace(7)> @addrspacecast_poison_vec() { ret <2 x ptr addrspace(7)> %ret } -declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1), i16, i32, i32) +declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1), i16, i64, i32) -define ptr addrspace(7) @make_buffer_rsrc(ptr addrspace(1) %buf, i16 %stride, i32 %numRecords, i32 %flags) { +define ptr addrspace(7) @make_buffer_rsrc(ptr addrspace(1) %buf, i16 %stride, i64 %numRecords, i32 %flags) { ; CHECK-LABEL: define { ptr addrspace(8), i32 } @make_buffer_rsrc -; CHECK-SAME: (ptr addrspace(1) [[BUF:%.*]], i16 [[STRIDE:%.*]], i32 [[NUMRECORDS:%.*]], i32 [[FLAGS:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[BUF]], i16 [[STRIDE]], i32 [[NUMRECORDS]], i32 [[FLAGS]]) +; CHECK-SAME: (ptr addrspace(1) [[BUF:%.*]], i16 [[STRIDE:%.*]], i64 [[NUMRECORDS:%.*]], i32 [[FLAGS:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[BUF]], i16 [[STRIDE]], i64 [[NUMRECORDS]], i32 [[FLAGS]]) ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[RET]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 0, 1 ; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP2]] ; - %ret = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %buf, i16 %stride, i32 %numRecords, i32 %flags) + %ret = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %buf, i16 
%stride, i64 %numRecords, i32 %flags) ret ptr addrspace(7) %ret } diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll index 1ae3434db6da5..3f66c23e1a73b 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -65,10 +65,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s ; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: ; SDAG-GFX11-TRUE16: ; %bb.0: ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x3c00 -; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; SDAG-GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, 1.0, v0.l ; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo: @@ -137,13 +136,20 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s } define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 { -; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: ; GFX9: ; %bb.0: @@ -172,6 +178,14 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src ; SDAG-CI-NEXT: v_mov_b32_e32 v0, v3 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -196,10 +210,8 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha ; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: ; SDAG-GFX11-TRUE16: ; %bb.0: ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, 
v0, v1, v2 op_sel_hi:[1,1,1] ; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: @@ -277,10 +289,8 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src ; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: ; SDAG-GFX11-TRUE16: ; %bb.0: ; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: @@ -499,14 +509,25 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half } define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 { -; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v0, v3, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-TRUE16-NEXT: v_fma_mixhi_f16 v0, v3, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v0, off dlc +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-FAKE16-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v3, off dlc +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: ; GFX9: ; %bb.0: @@ -542,6 +563,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi ; SDAG-CI-NEXT: s_waitcnt vmcnt(0) ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: +; GISEL-GFX11: ; %bb.0: +; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; GISEL-GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GISEL-GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc +; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] 
+; ; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index eab92668c536b..21e6faf46f58d 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -412,11 +412,9 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v3.l ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32: @@ -535,12 +533,10 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v2, v4 op_sel_hi:[1,1,1] ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v6 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v6.l ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32: @@ -704,16 +700,13 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32: ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v2, v4 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v6.h +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, v6.l ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32: @@ -914,14 +907,23 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half ; FIXME (DAG): Fold clamp define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_mov_b32_e32 v0, v3 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt: ; GFX900: ; %bb.0: @@ -978,6 +980,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v3 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: v_mad_mix_v2f32_clamp_postcvt: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1040,13 +1051,13 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v1, v1.l, 0 -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v3f32_clamp_postcvt: @@ -1247,17 +1258,29 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s } define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 { -; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v6 +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; GFX900: ; %bb.0: @@ -1358,6 +1381,18 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: ; GISEL-VI: ; %bb.0: ; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -1452,10 +1487,10 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> ; SDAG-GFX1100-TRUE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v3.l ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: @@ -1618,9 +1653,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3 +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-TRUE16-NEXT: v_pack_b32_f16 v0, v3.l, v0.l ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: @@ -2385,10 +2420,8 @@ define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 { ; SDAG-GFX1100-TRUE16-LABEL: mixlo_zext: ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 ; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 -; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: mixlo_zext: diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll index a4878539b1c74..95df131e21358 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -2253,9 +2253,10 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half % ; SDAG-GFX1100-TRUE16-LABEL: 
v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: ; SDAG-GFX1100-TRUE16: ; %bb.0: ; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX1100-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.h +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; SDAG-GFX1100-TRUE16-NEXT: v_xor_b16 v2.l, 0x8000, v0.h ; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mix_f32 v0, |v2|, v1, v0 op_sel_hi:[1,1,1] ; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX1100-FAKE16-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo: diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index e6960a3f710da..dbcd3700a1605 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -2233,7 +2233,7 @@ define amdgpu_ps i64 @lshr_mad_i64_sgpr(i64 inreg %arg0) #0 { ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_mov_b32 s3, 0 ; GFX1250-NEXT: s_mov_b32 s2, s1 -; GFX1250-NEXT: s_mov_b64 s[4:5], lit64(0xffffffffffff1c18) +; GFX1250-NEXT: s_mov_b64 s[4:5], 0xffffffffffff1c18 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1250-NEXT: s_mul_u64 s[2:3], s[2:3], s[4:5] ; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/madak-inline-constant.mir b/llvm/test/CodeGen/AMDGPU/madak-inline-constant.mir index 1ab3cf60a1c97..6e7f5b5492148 100644 --- a/llvm/test/CodeGen/AMDGPU/madak-inline-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/madak-inline-constant.mir @@ -1,5 +1,5 @@ # RUN: llc -mtriple=amdgcn -run-pass peephole-opt -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s - +# RUN: llc -mtriple=amdgcn -run-pass peephole-opt -verify-machineinstrs -o - %s -debugify-and-strip-all-safe | FileCheck -check-prefix=GCN %s # GCN-LABEL: bb.0: # GCN: S_MOV_B32 1082130432 diff --git a/llvm/test/CodeGen/AMDGPU/make-buffer-rsrc-lds-fails.ll b/llvm/test/CodeGen/AMDGPU/make-buffer-rsrc-lds-fails.ll index 4f88077e3b0ee..74f15ac6e074e 100644 --- a/llvm/test/CodeGen/AMDGPU/make-buffer-rsrc-lds-fails.ll +++ b/llvm/test/CodeGen/AMDGPU/make-buffer-rsrc-lds-fails.ll @@ -3,7 +3,7 @@ ; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr addrspace(3) inreg %p) { - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p3(ptr addrspace(3) %p, i16 0, i32 1234, i32 5678) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p3(ptr addrspace(3) %p, i16 0, i64 1234, i32 5678) ret ptr addrspace(8) %rsrc } -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p3(ptr addrspace(3) nocapture readnone, i16, i32, i32) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p3(ptr addrspace(3) nocapture readnone, i16, i64, i32) diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll index 4c0ab91b7d622..02f39e25cb447 100644 --- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll @@ -3749,7 +3749,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: 
buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:30 @@ -3953,7 +3953,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:704 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:708 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) @@ -4185,8 +4185,12 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:146 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:151 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1020 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 ; ALIGNED-NEXT: s_clause 0x1 @@ -4198,6 +4202,10 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1036 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) @@ -4210,376 +4218,346 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1056 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1048 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1060 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, 
s[0:3], s32 offset:1052 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1044 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1064 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1084 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1072 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x5 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:151 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v125 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1104 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1116 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1152 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v5 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 8, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen 
offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 8, v121 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v127, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v107, 8, v108 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v127, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v93, 8, v105 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v123, 8, v125 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v106, 8, v91 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v93, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v78, 8, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v110 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v93, 8, v94 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, 
v77 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v75, 8, v79 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v104 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v108 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v63 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v90 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v88 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:170 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v56, 8, v58 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v76 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v57, 8, v47 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: 
buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v46 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 8, v62 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v119 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v57 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v118, 8, v42 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v41, 8, v44 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v47, 8, v58 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:187 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v115, 8, v117 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v114, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v44 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:186 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v119, 8, v41 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v112, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v118 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:193 -; 
ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v98, 8, v100 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v86, 8, v87 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v99 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v96 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v83 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v82 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v69, 8, v70 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v71, 8, v68 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v54, 8, v67 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v65 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:223 -; 
ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v53, 8, v66 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v49 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v64 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v39, 8, v50 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v55, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v35 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v29 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v31, 8, v34 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v28, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x17 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen 
offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v86, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v70, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 ; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 ; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen 
offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 ; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 ; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: v_lshl_or_b32 v123, v4, 16, v3 -; ALIGNED-NEXT: s_clause 0x5 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_clause 0x6 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(28) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v25, 8, v27 -; ALIGNED-NEXT: s_waitcnt vmcnt(26) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v24, 8, v26 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v113, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v102, 8, v101 +; ALIGNED-NEXT: v_lshl_or_b32 v106, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v103, 8, v114 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v100, 8, v112 +; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(60) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v97, 8, v98 +; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v96 ; ALIGNED-NEXT: s_waitcnt vmcnt(14) -; ALIGNED-NEXT: v_lshl_or_b32 v45, v12, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v73, v13, 8, v16 ; ALIGNED-NEXT: s_waitcnt vmcnt(10) -; ALIGNED-NEXT: v_lshl_or_b32 v60, v8, 8, v10 -; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v21, 8, v22 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v23, 8, v20 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v92, off, s[0:3], s32 offset:1100 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v76, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v17, 8, v19 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v13 -; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1108 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v101, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v15, 8, v18 -; ALIGNED-NEXT: v_lshl_or_b32 v84, v45, 16, v4 -; ALIGNED-NEXT: v_lshl_or_b32 v45, v9, 8, v11 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v60, 16, v45 -; ALIGNED-NEXT: v_lshl_or_b32 v45, v5, 8, v6 -; ALIGNED-NEXT: v_lshl_or_b32 v60, v7, 8, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v60, 16, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v9, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1088 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v89, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v84, 8, v86 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v85, 8, v83 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1112 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1096 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1132 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v74, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v81 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v65, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v46, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v80 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v54, 8, v53 +; ALIGNED-NEXT: v_lshl_or_b32 v117, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v70 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v52, 8, v64 +; ALIGNED-NEXT: v_lshl_or_b32 v115, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v48 +; ALIGNED-NEXT: v_lshl_or_b32 v99, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v36, 8, v38 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v32, 8, v33 +; ALIGNED-NEXT: v_lshl_or_b32 v82, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v35, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v31, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v66, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v29, 8, v30 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v26, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v51, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v24 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v49, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v20 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v27, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v19 +; ALIGNED-NEXT: v_lshl_or_b32 v22, v73, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v73, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v77, 16, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v73, v6, 8, v8 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v7, 8, v5 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v77, 16, v73 ; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:3 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1068 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v94, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
v_lshl_or_b32 v45, v45, 8, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v60, v60, 8, v94 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 -; ALIGNED-NEXT: v_lshl_or_b32 v45, v90, 8, v88 -; ALIGNED-NEXT: v_lshl_or_b32 v60, v104, 8, v92 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v45, v111, 8, v122 +; ALIGNED-NEXT: buffer_store_dword v121, off, s[0:3], s32 offset:1092 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v60, v110, 8, v120 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 +; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1076 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:1080 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v77, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v73, v109, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v1, 8, v120 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:13 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1156 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:1168 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v73, v73, 8, v1 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v77, v107, 8, v0 +; ALIGNED-NEXT: v_mov_b32_e32 v1, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v45, v92, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v73, v120, 8, v122 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: 
v_lshl_or_b32 v60, v94, 8, v90 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v60, 16, v45 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v77, v121, 8, v109 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 16, v73 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:232 ; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v27, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v60 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v77 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v127, v45, 8, v88 -; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 +; ALIGNED-NEXT: v_lshl_or_b32 v126, v73, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v126, v0, 16, v126 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1228 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_add_co_u32 v3, vcc_lo, v0, s4 @@ -4587,190 +4565,184 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v0, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:250 +; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:250 ; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:251 -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:249 -; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:255 -; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:253 +; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:249 +; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:255 +; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:253 ; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:254 -; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:252 -; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:248 -; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:242 +; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:252 +; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:248 +; ALIGNED-NEXT: flat_store_byte v[3:4], v15 offset:242 ; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:243 -; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:241 -; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:247 -; ALIGNED-NEXT: flat_store_byte v[3:4], v15 offset:245 +; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:241 +; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:247 +; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:245 ; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:246 -; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:244 -; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:240 -; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 
offset:248 -; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:244 +; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:240 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v66, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v82, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[3:4], v21 offset:234 +; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:235 +; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:233 +; ALIGNED-NEXT: flat_store_byte v[3:4], v26 offset:239 +; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:237 +; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:238 +; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:236 +; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:232 +; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:226 +; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:227 +; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:225 +; ALIGNED-NEXT: flat_store_byte v[3:4], v31 offset:231 +; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:229 +; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:230 +; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:228 +; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:224 +; ALIGNED-NEXT: buffer_store_dword v99, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v115, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v117, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:213 +; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:215 +; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:209 +; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:211 +; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:210 +; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:214 +; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:212 +; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:218 +; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:219 +; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:217 +; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:223 +; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:221 +; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:222 +; ALIGNED-NEXT: flat_store_byte v[3:4], v70 offset:220 +; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:216 +; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:208 +; ALIGNED-NEXT: buffer_store_dword v74, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v89, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:202 +; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:203 +; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:201 +; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:207 +; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:205 +; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:206 +; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:204 +; ALIGNED-NEXT: flat_store_byte v[3:4], v86 offset:200 +; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:194 +; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:195 +; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:193 +; ALIGNED-NEXT: flat_store_byte v[3:4], v100 
offset:199 +; ALIGNED-NEXT: flat_store_byte v[3:4], v103 offset:197 +; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:198 +; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:196 +; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:192 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1220 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5] ; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:234 -; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:235 -; ALIGNED-NEXT: flat_store_byte v[3:4], v21 offset:233 -; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:239 -; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:237 -; ALIGNED-NEXT: flat_store_byte v[3:4], v26 offset:238 -; ALIGNED-NEXT: flat_store_byte v[3:4], v27 offset:236 -; ALIGNED-NEXT: flat_store_byte v[3:4], v22 offset:232 -; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:226 -; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:227 -; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:225 -; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:231 -; ALIGNED-NEXT: flat_store_byte v[3:4], v31 offset:229 -; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:230 -; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:228 -; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:224 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1216 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1212 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1208 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:186 +; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:187 +; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:185 +; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:191 +; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:189 +; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:190 +; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:188 +; ALIGNED-NEXT: flat_store_byte v[3:4], v41 offset:184 +; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:178 +; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:179 +; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:177 +; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:183 +; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:181 +; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:182 +; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:180 +; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:176 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1204 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:213 -; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:215 -; ALIGNED-NEXT: flat_store_byte 
v[3:4], v36 offset:209 -; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:211 -; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:210 -; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:214 -; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:212 -; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:218 -; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:219 -; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:217 -; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:223 -; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:221 -; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:222 -; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:220 -; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:216 -; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:208 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1200 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1196 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1192 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:202 -; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:203 -; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:201 -; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:207 -; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:205 -; ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:206 -; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:204 -; ALIGNED-NEXT: flat_store_byte v[3:4], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:194 -; ALIGNED-NEXT: flat_store_byte v[3:4], v86 offset:195 -; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:193 -; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:199 -; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:197 -; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:198 -; ALIGNED-NEXT: flat_store_byte v[3:4], v99 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:192 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:170 +; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:171 +; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:169 +; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:175 +; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:173 +; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:174 +; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:172 +; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:168 +; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:162 +; ALIGNED-NEXT: flat_store_byte v[3:4], v93 offset:163 +; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:161 +; ALIGNED-NEXT: flat_store_byte v[3:4], v92 
offset:167 +; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:165 +; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:166 +; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:164 +; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:160 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1180 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1172 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:186 -; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:187 -; ALIGNED-NEXT: flat_store_byte v[3:4], v103 offset:185 -; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:191 -; ALIGNED-NEXT: flat_store_byte v[3:4], v115 offset:189 -; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:190 -; ALIGNED-NEXT: flat_store_byte v[3:4], v117 offset:188 -; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:184 -; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:178 -; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:179 -; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:177 -; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:183 -; ALIGNED-NEXT: flat_store_byte v[3:4], v41 offset:181 -; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:182 -; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:180 -; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:154 +; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:153 +; ALIGNED-NEXT: flat_store_byte v[3:4], v127 offset:159 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1160 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:170 -; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:171 -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v56 offset:169 -; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:175 -; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:173 -; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:174 -; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:172 -; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:168 -; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:162 -; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:163 -; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:161 -; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:167 -; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:165 -; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:166 -; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:164 -; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:160 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:158 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1144 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1136 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:156 +; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:152 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:146 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte v[3:4], v91 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v93 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:159 -; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:157 -; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:158 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:156 -; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:152 -; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:146 -; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:147 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:147 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1084 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:151 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1124 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:149 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:151 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1116 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:149 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:150 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 
offset:1104 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -5207,7 +5179,7 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:516 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:388 -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:384 +; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:384 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:492 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:26 @@ -5232,11 +5204,11 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:18 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1152 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:18 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1188 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:17 +; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:17 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 @@ -5249,27 +5221,33 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:16 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1148 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:16 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1184 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1176 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1140 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1120 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1128 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:11 -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:13 -; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:9 -; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:15 -; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:14 -; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:12 -; 
ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:8 +; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:10 +; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:11 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1168 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 +; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:9 +; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:15 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1164 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:14 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1156 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:12 +; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:8 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1092 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 @@ -5279,13 +5257,13 @@ define void @memcpy_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5) ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1076 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1108 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1112 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:7 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1096 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:5 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1100 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1132 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1088 ; 4-byte Folded Reload @@ -12939,7 +12917,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v127, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1032 ; 4-byte Folded Spill @@ -13170,9 +13148,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1252 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:146 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: 
buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:142 @@ -13181,584 +13163,557 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:141 ; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:139 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(5) +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v5, 8, v3 ; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:138 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:149 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:137 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:136 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v3 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:147 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1420 ; 4-byte Folded 
Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:145 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:144 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x3 -; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:145 -; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:146 -; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:147 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:148 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v124, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v120, 8, v111 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:149 -; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:150 -; ALIGNED-NEXT: buffer_load_ubyte v110, v2, s[0:3], 0 offen offset:151 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v4, 8, v7 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v3 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:158 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:156 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v3 +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v110, 8, v122 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v2, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v3 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 
0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v104, 8, v106 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v122, v2, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v121, v2, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v92, 8, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v122, 8, v123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v90 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v121 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v120, v2, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v108, v2, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v104, v2, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v105, v2, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v111, v2, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v106, v2, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v94, v2, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v108, 8, v120 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v73 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v105, 8, v104 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v75 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v94, 8, v107 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v106, 8, v111 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v58, v2, s[0:3], 0 offen offset:171 +; ALIGNED-NEXT: buffer_load_ubyte v92, v2, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v89, v2, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v90, v2, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v88, v2, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v78, v2, s[0:3], 0 offen offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v63 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v89, 8, v92 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v88, 8, v90 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v79, v2, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v76, v2, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v75, v2, s[0:3], 0 offen offset:170 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v59 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 8, v79 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v58, 8, v56 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v78, 8, v75 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v41, v2, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v74, v2, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v72, v2, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v61, v2, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v60, v2, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v73, v2, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v63, v2, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v62, v2, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v59, v2, s[0:3], 0 offen offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v45, 8, v47 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 ; ALIGNED-NEXT: 
s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v42, 8, v41 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v61 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v44 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v46 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v59, 8, v62 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v63, 8, v73 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v119, v2, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v57, v2, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v47, v2, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v56, v2, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v46, v2, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v44, v2, s[0:3], 0 offen offset:187 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v117, 8, v119 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v47, 8, v57 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v116, 8, v118 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v56 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v45, v2, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v43, v2, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v42, v2, s[0:3], 0 offen offset:186 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v113, 8, v115 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v45 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v114, 8, v112 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v44, 8, v42 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v3, 
v100, 8, v102 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v96, 8, v97 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v101 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v87, 8, v98 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v83, 8, v85 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v82, 8, v84 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v80 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v81, 8, v69 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v53, v2, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v55, v2, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v54, 8, v68 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v53, 8, v66 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v64, 8, v67 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v50, 8, v49 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v65 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v48, 8, v52 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill -; ALIGNED-NEXT: 
v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v37, 8, v39 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v55, 8, v38 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v34, 8, v36 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v31 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v4, 16, v3 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v35 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v32 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x17 -; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:239 -; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:234 -; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:240 -; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:241 -; ALIGNED-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:242 -; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:243 -; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:244 -; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:245 -; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:246 -; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:247 -; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:252 -; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:253 -; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:254 -; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:255 -; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:251 -; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:248 -; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:249 -; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 16, v3 -; ALIGNED-NEXT: s_clause 0x4 -; 
ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v40, v2, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v117, v2, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v113, v2, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v114, v2, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v118, v2, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v115, v2, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v116, v2, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v112, v2, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v102, v2, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v101, v2, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v100, v2, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v99, v2, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v97, v2, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v98, v2, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v96, v2, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v87, v2, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v85, v2, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v71, v2, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v83, v2, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v69, v2, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v80, v2, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v84, v2, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v81, v2, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v65, v2, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v66, v2, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v82, v2, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v67, v2, s[0:3], 0 offen offset:221 +; ALIGNED-NEXT: buffer_load_ubyte v68, v2, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v64, v2, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v54, v2, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v51, v2, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v52, v2, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v50, v2, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v48, v2, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v37, v2, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v36, v2, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v49, v2, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v39, v2, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v38, v2, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v35, v2, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v34, v2, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:235 +; ALIGNED-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: 
buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:240 +; ALIGNED-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:241 +; ALIGNED-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:242 +; ALIGNED-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:243 +; ALIGNED-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:244 +; ALIGNED-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:245 +; ALIGNED-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:246 +; ALIGNED-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:247 +; ALIGNED-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:252 +; ALIGNED-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:253 +; ALIGNED-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:254 +; ALIGNED-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:255 +; ALIGNED-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:251 +; ALIGNED-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:248 +; ALIGNED-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_clause 0x5 +; ALIGNED-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:250 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:2 ; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: s_waitcnt vmcnt(27) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 -; ALIGNED-NEXT: s_waitcnt vmcnt(25) -; ALIGNED-NEXT: v_lshl_or_b32 v4, v25, 8, v26 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v117, 8, v40 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v114, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v110, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v115, 8, v118 +; ALIGNED-NEXT: s_waitcnt vmcnt(61) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v112, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 +; ALIGNED-NEXT: s_waitcnt vmcnt(59) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v101, 8, v102 +; ALIGNED-NEXT: s_waitcnt vmcnt(57) +; ALIGNED-NEXT: v_lshl_or_b32 v4, v99, 8, v100 ; ALIGNED-NEXT: s_waitcnt vmcnt(13) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v13, 8, v16 +; ALIGNED-NEXT: v_lshl_or_b32 v95, v16, 8, v20 ; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshl_or_b32 v91, v9, 8, v10 -; ALIGNED-NEXT: v_lshl_or_b32 v93, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v23, 8, v21 -; ALIGNED-NEXT: v_lshl_or_b32 v78, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v18, 8, v20 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v14, 8, v15 +; ALIGNED-NEXT: v_lshl_or_b32 v109, v11, 8, v12 +; ALIGNED-NEXT: v_lshl_or_b32 v91, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v96, 8, v98 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v97, 8, v87 +; ALIGNED-NEXT: v_lshl_or_b32 v77, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v85 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v69, 8, v83 +; ALIGNED-NEXT: v_lshl_or_b32 v58, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v81, 8, v84 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v66, 8, v65 +; ALIGNED-NEXT: v_lshl_or_b32 v41, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v67, 8, v82 +; ALIGNED-NEXT: 
v_lshl_or_b32 v4, v64, 8, v68 +; ALIGNED-NEXT: v_lshl_or_b32 v119, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v80, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v103, v4, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v17, 8, v19 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v50 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v36, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v86, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v49 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v35, 8, v38 +; ALIGNED-NEXT: v_lshl_or_b32 v70, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v33, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v30, 8, v32 +; ALIGNED-NEXT: v_lshl_or_b32 v55, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v27, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v29, 8, v25 +; ALIGNED-NEXT: v_lshl_or_b32 v53, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v22, 8, v24 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v18, 8, v17 +; ALIGNED-NEXT: v_lshl_or_b32 v31, v4, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v4, v19, 8, v23 +; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen +; ALIGNED-NEXT: v_lshl_or_b32 v26, v95, 16, v4 +; ALIGNED-NEXT: v_lshl_or_b32 v95, v13, 8, v14 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(4) +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v21, v109, 16, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v95, v8, 8, v10 +; ALIGNED-NEXT: v_lshl_or_b32 v109, v9, 8, v7 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v107, off, s[0:3], s32 offset:1296 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:1304 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v86, v77, 16, v4 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v11, 8, v12 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1308 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v70, v91, 16, v77 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v6, 8, v8 -; ALIGNED-NEXT: v_lshl_or_b32 v91, v7, 8, v5 -; ALIGNED-NEXT: v_lshl_or_b32 v4, v91, 16, v77 -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1332 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v15, v109, 16, v95 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1320 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1260 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v77, v77, 8, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v91, v0, 8, v91 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77 -; ALIGNED-NEXT: v_lshl_or_b32 v77, v123, 8, v107 -; ALIGNED-NEXT: v_lshl_or_b32 v91, v3, 8, v125 -; ALIGNED-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77 -; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_store_dword v95, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v95, v95, 8, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v95, v5, 8, v125 +; ALIGNED-NEXT: v_lshl_or_b32 v109, v4, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x1 -; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_load_ubyte v126, v2, s[0:3], 0 offen offset:11 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x4 +; ALIGNED-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v3, 8, v1 -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v91, v91, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v109, v0, 8, v1 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v123, v2, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v1, v2, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v2, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt 
vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v77, v125, 8, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v109, v5, 8, v1 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v91, v126, 8, v123 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v91, 16, v77 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v95, v4, 8, v0 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v109, 16, v95 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v91, v2, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v107, v2, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v77, v2, s[0:3], 0 offen offset:17 -; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232 -; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:236 -; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:228 -; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v109, v2, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v125, v2, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v95, v2, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:232 +; ALIGNED-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:236 +; ALIGNED-NEXT: buffer_store_dword v26, off, s[0:3], s32 offset:228 +; ALIGNED-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:224 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v2, 0x100, v2 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v91 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v127, 8, v109 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v127, v77, 8, v107 +; ALIGNED-NEXT: v_lshl_or_b32 v127, v95, 8, v125 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_add_co_u32 v3, vcc_lo, v3, s4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: v_add_co_ci_u32_e64 v4, null, s5, v4, vcc_lo -; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:250 -; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:251 -; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:249 -; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:255 -; ALIGNED-NEXT: flat_store_byte v[3:4], v11 offset:253 -; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:254 -; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:252 -; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:248 -; ALIGNED-NEXT: flat_store_byte v[3:4], v15 offset:242 -; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:243 -; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:241 -; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:247 -; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:245 -; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:246 -; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:244 -; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:240 -; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:248 -; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:252 -; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: flat_store_byte v[3:4], v7 offset:250 +; ALIGNED-NEXT: flat_store_byte v[3:4], v9 offset:251 +; ALIGNED-NEXT: flat_store_byte v[3:4], v8 offset:249 +; ALIGNED-NEXT: 
flat_store_byte v[3:4], v11 offset:255 +; ALIGNED-NEXT: flat_store_byte v[3:4], v13 offset:253 +; ALIGNED-NEXT: flat_store_byte v[3:4], v12 offset:254 +; ALIGNED-NEXT: flat_store_byte v[3:4], v14 offset:252 +; ALIGNED-NEXT: flat_store_byte v[3:4], v10 offset:248 +; ALIGNED-NEXT: flat_store_byte v[3:4], v17 offset:242 +; ALIGNED-NEXT: flat_store_byte v[3:4], v18 offset:243 +; ALIGNED-NEXT: flat_store_byte v[3:4], v22 offset:241 +; ALIGNED-NEXT: flat_store_byte v[3:4], v16 offset:247 +; ALIGNED-NEXT: flat_store_byte v[3:4], v19 offset:245 +; ALIGNED-NEXT: flat_store_byte v[3:4], v20 offset:246 +; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:244 +; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:240 +; ALIGNED-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:248 +; ALIGNED-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:252 +; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:244 +; ALIGNED-NEXT: buffer_store_dword v86, off, s[0:3], s32 offset:240 +; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:234 +; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:235 +; ALIGNED-NEXT: flat_store_byte v[3:4], v27 offset:233 +; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:239 +; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:237 +; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:238 +; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:236 +; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:232 +; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:226 +; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:227 +; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:225 +; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:231 +; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:229 +; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:230 +; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:228 +; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:224 +; ALIGNED-NEXT: buffer_store_dword v103, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v119, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:196 +; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:213 +; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:215 +; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:209 +; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:211 +; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:210 +; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:214 +; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:212 +; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:218 +; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:219 +; ALIGNED-NEXT: flat_store_byte v[3:4], v81 offset:217 +; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:223 +; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:221 +; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:222 +; ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:220 +; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:216 +; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:208 +; ALIGNED-NEXT: buffer_store_dword v77, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v93, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:208 +; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:202 +; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:203 +; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:201 +; ALIGNED-NEXT: flat_store_byte v[3:4], v99 
offset:207 +; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:205 +; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:206 +; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:204 +; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:200 +; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:194 +; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:195 +; ALIGNED-NEXT: flat_store_byte v[3:4], v117 offset:193 +; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:199 +; ALIGNED-NEXT: flat_store_byte v[3:4], v115 offset:197 +; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:198 +; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:196 +; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:192 ; ALIGNED-NEXT: v_lshl_or_b32 v127, v0, 16, v127 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1492 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100 ; ALIGNED-NEXT: s_addc_u32 s5, s5, 0 ; ALIGNED-NEXT: s_cmp_lg_u64 s[4:5], 0x800 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:240 -; ALIGNED-NEXT: flat_store_byte v[3:4], v21 offset:234 -; ALIGNED-NEXT: flat_store_byte v[3:4], v23 offset:235 -; ALIGNED-NEXT: flat_store_byte v[3:4], v22 offset:233 -; ALIGNED-NEXT: flat_store_byte v[3:4], v25 offset:239 -; ALIGNED-NEXT: flat_store_byte v[3:4], v27 offset:237 -; ALIGNED-NEXT: flat_store_byte v[3:4], v26 offset:238 -; ALIGNED-NEXT: flat_store_byte v[3:4], v28 offset:236 -; ALIGNED-NEXT: flat_store_byte v[3:4], v24 offset:232 -; ALIGNED-NEXT: flat_store_byte v[3:4], v31 offset:226 -; ALIGNED-NEXT: flat_store_byte v[3:4], v30 offset:227 -; ALIGNED-NEXT: flat_store_byte v[3:4], v34 offset:225 -; ALIGNED-NEXT: flat_store_byte v[3:4], v29 offset:231 -; ALIGNED-NEXT: flat_store_byte v[3:4], v33 offset:229 -; ALIGNED-NEXT: flat_store_byte v[3:4], v32 offset:230 -; ALIGNED-NEXT: flat_store_byte v[3:4], v35 offset:228 -; ALIGNED-NEXT: flat_store_byte v[3:4], v36 offset:224 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1488 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:192 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:204 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 +; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:186 +; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:187 +; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:185 +; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:191 +; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:189 +; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:190 +; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:188 +; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:184 +; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:178 +; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:179 +; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:177 +; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:183 +; ALIGNED-NEXT: flat_store_byte v[3:4], v63 
offset:181 +; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:182 +; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:180 +; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:176 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:200 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v54 offset:213 -; ALIGNED-NEXT: flat_store_byte v[3:4], v53 offset:215 -; ALIGNED-NEXT: flat_store_byte v[3:4], v37 offset:209 -; ALIGNED-NEXT: flat_store_byte v[3:4], v55 offset:211 -; ALIGNED-NEXT: flat_store_byte v[3:4], v38 offset:210 -; ALIGNED-NEXT: flat_store_byte v[3:4], v66 offset:214 -; ALIGNED-NEXT: flat_store_byte v[3:4], v68 offset:212 -; ALIGNED-NEXT: flat_store_byte v[3:4], v49 offset:218 -; ALIGNED-NEXT: flat_store_byte v[3:4], v50 offset:219 -; ALIGNED-NEXT: flat_store_byte v[3:4], v64 offset:217 -; ALIGNED-NEXT: flat_store_byte v[3:4], v48 offset:223 -; ALIGNED-NEXT: flat_store_byte v[3:4], v51 offset:221 -; ALIGNED-NEXT: flat_store_byte v[3:4], v52 offset:222 -; ALIGNED-NEXT: flat_store_byte v[3:4], v65 offset:220 -; ALIGNED-NEXT: flat_store_byte v[3:4], v67 offset:216 -; ALIGNED-NEXT: flat_store_byte v[3:4], v39 offset:208 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:220 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 +; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:170 +; ALIGNED-NEXT: flat_store_byte v[3:4], v78 offset:171 +; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:169 +; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:175 +; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:173 +; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:174 +; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:172 +; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:168 +; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:162 +; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:163 +; ALIGNED-NEXT: flat_store_byte v[3:4], v108 offset:161 +; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:167 +; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:165 +; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:166 +; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:164 +; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:160 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:208 -; ALIGNED-NEXT: flat_store_byte v[3:4], v69 offset:202 -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v81 offset:203 -; ALIGNED-NEXT: flat_store_byte v[3:4], v71 offset:201 -; ALIGNED-NEXT: flat_store_byte v[3:4], v82 offset:207 -; ALIGNED-NEXT: flat_store_byte v[3:4], v83 offset:205 -; ALIGNED-NEXT: flat_store_byte v[3:4], v84 offset:206 -; ALIGNED-NEXT: flat_store_byte v[3:4], v85 offset:204 -; ALIGNED-NEXT: flat_store_byte v[3:4], v80 offset:200 -; ALIGNED-NEXT: flat_store_byte v[3:4], v97 offset:194 -; ALIGNED-NEXT: flat_store_byte v[3:4], v96 offset:195 -; ALIGNED-NEXT: flat_store_byte v[3:4], v100 offset:193 -; ALIGNED-NEXT: flat_store_byte v[3:4], v87 offset:199 -; ALIGNED-NEXT: flat_store_byte v[3:4], v99 offset:197 -; ALIGNED-NEXT: flat_store_byte v[3:4], v98 offset:198 -; ALIGNED-NEXT: flat_store_byte v[3:4], v101 offset:196 -; ALIGNED-NEXT: flat_store_byte v[3:4], v102 offset:192 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:296 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 +; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:154 +; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:153 +; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:159 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:157 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:292 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:158 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:288 -; ALIGNED-NEXT: flat_store_byte v[3:4], v112 offset:186 -; ALIGNED-NEXT: flat_store_byte v[3:4], v114 offset:187 -; ALIGNED-NEXT: flat_store_byte v[3:4], v113 offset:185 -; ALIGNED-NEXT: flat_store_byte v[3:4], v116 offset:191 -; ALIGNED-NEXT: flat_store_byte v[3:4], v117 offset:189 -; ALIGNED-NEXT: flat_store_byte v[3:4], v118 offset:190 -; ALIGNED-NEXT: flat_store_byte v[3:4], v119 offset:188 -; ALIGNED-NEXT: flat_store_byte v[3:4], v115 offset:184 -; ALIGNED-NEXT: flat_store_byte v[3:4], v41 offset:178 -; ALIGNED-NEXT: flat_store_byte v[3:4], v42 offset:179 -; ALIGNED-NEXT: flat_store_byte v[3:4], v45 offset:177 -; ALIGNED-NEXT: flat_store_byte v[3:4], v40 offset:183 -; ALIGNED-NEXT: flat_store_byte v[3:4], v43 offset:181 -; ALIGNED-NEXT: flat_store_byte v[3:4], v44 offset:182 -; ALIGNED-NEXT: flat_store_byte v[3:4], v46 offset:180 -; ALIGNED-NEXT: flat_store_byte v[3:4], v47 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:156 +; ALIGNED-NEXT: flat_store_byte v[3:4], v123 
offset:152 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:312 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:146 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:316 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:147 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:308 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:145 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:151 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:304 -; ALIGNED-NEXT: flat_store_byte v[3:4], v56 offset:170 -; ALIGNED-NEXT: flat_store_byte v[3:4], v58 offset:171 -; ALIGNED-NEXT: flat_store_byte v[3:4], v57 offset:169 -; ALIGNED-NEXT: flat_store_byte v[3:4], v60 offset:175 -; ALIGNED-NEXT: flat_store_byte v[3:4], v61 offset:173 -; ALIGNED-NEXT: flat_store_byte v[3:4], v62 offset:174 -; ALIGNED-NEXT: flat_store_byte v[3:4], v63 offset:172 -; ALIGNED-NEXT: flat_store_byte v[3:4], v59 offset:168 -; ALIGNED-NEXT: flat_store_byte v[3:4], v73 offset:162 -; ALIGNED-NEXT: flat_store_byte v[3:4], v74 offset:163 -; ALIGNED-NEXT: flat_store_byte v[3:4], v79 offset:161 -; ALIGNED-NEXT: flat_store_byte v[3:4], v72 offset:167 -; ALIGNED-NEXT: flat_store_byte v[3:4], v76 offset:165 -; ALIGNED-NEXT: flat_store_byte v[3:4], v75 offset:166 -; ALIGNED-NEXT: flat_store_byte v[3:4], v88 offset:164 -; ALIGNED-NEXT: flat_store_byte v[3:4], v89 offset:160 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:149 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:264 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:150 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:268 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:260 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:256 -; ALIGNED-NEXT: flat_store_byte v[3:4], v90 offset:154 -; ALIGNED-NEXT: flat_store_byte v[3:4], v94 offset:155 -; ALIGNED-NEXT: flat_store_byte v[3:4], v92 offset:153 -; ALIGNED-NEXT: flat_store_byte v[3:4], v104 offset:159 -; ALIGNED-NEXT: flat_store_byte v[3:4], v105 offset:157 -; ALIGNED-NEXT: flat_store_byte v[3:4], v106 offset:158 -; ALIGNED-NEXT: 
flat_store_byte v[3:4], v108 offset:156 -; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:152 -; ALIGNED-NEXT: flat_store_byte v[3:4], v111 offset:146 -; ALIGNED-NEXT: flat_store_byte v[3:4], v120 offset:147 -; ALIGNED-NEXT: flat_store_byte v[3:4], v124 offset:145 -; ALIGNED-NEXT: flat_store_byte v[3:4], v110 offset:151 -; ALIGNED-NEXT: flat_store_byte v[3:4], v121 offset:149 -; ALIGNED-NEXT: flat_store_byte v[3:4], v122 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:148 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:144 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload @@ -13767,31 +13722,31 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:284 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:276 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:272 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:138 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:139 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:137 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:143 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:141 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:142 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1324 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte 
Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:140 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload @@ -14215,11 +14170,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[3:4], v91 offset:18 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v109 offset:18 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[3:4], v77 offset:17 +; ALIGNED-NEXT: flat_store_byte v[3:4], v95 offset:17 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:23 @@ -14232,35 +14187,37 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[3:4], v107 offset:16 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:16 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:408 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:412 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:404 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1316 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1312 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:400 -; ALIGNED-NEXT: flat_store_byte v[3:4], v123 offset:10 -; ALIGNED-NEXT: flat_store_byte v[3:4], v126 offset:11 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:10 +; ALIGNED-NEXT: flat_store_byte v[3:4], v5 offset:11 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:13 -; ALIGNED-NEXT: flat_store_byte v[3:4], v125 offset:9 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:15 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:9 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload ; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:15 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:14 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[3:4], v6 offset:12 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:12 -; ALIGNED-NEXT: flat_store_byte v[3:4], v1 offset:8 +; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:8 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1300 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:2 @@ -14270,13 +14227,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:1 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1320 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:7 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1304 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:5 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1308 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[3:4], v0 offset:6 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1296 ; 4-byte Folded Reload @@ -14296,7 +14253,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_mov_b32 s7, -1 ; ALIGNED-NEXT: .LBB9_4: ; %memmove_bwd_loop ; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1 -; ALIGNED-NEXT: s_clause 0x3a +; ALIGNED-NEXT: s_clause 0x39 ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:20 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:21 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:22 @@ -14304,7 +14261,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:24 ; ALIGNED-NEXT: buffer_load_ubyte v10, v4, s[0:3], 0 offen offset:25 ; ALIGNED-NEXT: buffer_load_ubyte v12, v4, s[0:3], 0 offen offset:26 -; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:19 +; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:19 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:28 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:29 ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:30 @@ -14355,55 +14312,54 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:75 ; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:78 ; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:79 -; ALIGNED-NEXT: buffer_load_ubyte v125, v4, s[0:3], 0 offen offset:151 -; ALIGNED-NEXT: s_waitcnt vmcnt(58) -; 
ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(57) -; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(56) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:716 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(55) -; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:720 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(54) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:724 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(53) -; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:732 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(52) +; ALIGNED-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:748 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(51) ; ALIGNED-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:756 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(50) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(49) -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:728 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(48) -; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:736 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(47) -; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:740 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(46) +; ALIGNED-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:744 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(45) ; ALIGNED-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:768 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v7, 8, v5 -; ALIGNED-NEXT: s_waitcnt vmcnt(43) +; ALIGNED-NEXT: s_waitcnt vmcnt(42) ; ALIGNED-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:752 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v3, v9, 8, v8 -; ALIGNED-NEXT: s_waitcnt vmcnt(41) +; ALIGNED-NEXT: s_waitcnt vmcnt(40) ; ALIGNED-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:760 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v5, v10, 8, v6 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v11, 8, v12 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v15, 8, v14 ; ALIGNED-NEXT: v_lshl_or_b32 v8, v19, 8, v17 -; ALIGNED-NEXT: s_waitcnt vmcnt(40) +; ALIGNED-NEXT: s_waitcnt vmcnt(39) ; ALIGNED-NEXT: v_lshl_or_b32 v9, v16, 8, v13 -; ALIGNED-NEXT: s_waitcnt vmcnt(38) +; ALIGNED-NEXT: s_waitcnt vmcnt(37) ; ALIGNED-NEXT: v_lshl_or_b32 v10, v20, 8, v18 -; ALIGNED-NEXT: s_waitcnt vmcnt(36) +; ALIGNED-NEXT: s_waitcnt vmcnt(35) ; ALIGNED-NEXT: v_lshl_or_b32 v11, v23, 8, v22 -; ALIGNED-NEXT: s_waitcnt vmcnt(34) +; ALIGNED-NEXT: 
s_waitcnt vmcnt(33) ; ALIGNED-NEXT: v_lshl_or_b32 v12, v28, 8, v25 -; ALIGNED-NEXT: s_waitcnt vmcnt(32) +; ALIGNED-NEXT: s_waitcnt vmcnt(31) ; ALIGNED-NEXT: v_lshl_or_b32 v13, v24, 8, v21 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(30) +; ALIGNED-NEXT: s_waitcnt vmcnt(29) ; ALIGNED-NEXT: v_lshl_or_b32 v14, v27, 8, v26 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v6, 16, v5 @@ -14412,27 +14368,27 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v6, v12, 16, v11 ; ALIGNED-NEXT: v_lshl_or_b32 v7, v14, 16, v13 ; ALIGNED-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:772 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(28) +; ALIGNED-NEXT: s_waitcnt vmcnt(27) ; ALIGNED-NEXT: v_lshl_or_b32 v15, v31, 8, v30 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(26) +; ALIGNED-NEXT: s_waitcnt vmcnt(25) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v34, 8, v33 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:796 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(24) +; ALIGNED-NEXT: s_waitcnt vmcnt(23) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v37, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:800 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(22) +; ALIGNED-NEXT: s_waitcnt vmcnt(21) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v35 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:840 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(17) +; ALIGNED-NEXT: s_waitcnt vmcnt(16) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v50, 8, v38 ; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:852 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(15) +; ALIGNED-NEXT: s_waitcnt vmcnt(14) ; ALIGNED-NEXT: v_lshl_or_b32 v5, v49, 8, v39 ; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:868 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v6, v51, 8, v48 ; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:880 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v7, v53, 8, v52 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v0, 16, v15 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -14442,13 +14398,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:916 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v55, 8, v29 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:920 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(11) +; ALIGNED-NEXT: s_waitcnt vmcnt(10) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v67, 8, v66 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:928 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(9) +; ALIGNED-NEXT: s_waitcnt vmcnt(8) ; ALIGNED-NEXT: v_lshl_or_b32 v2, v64, 8, v54 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:932 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v3, v68, 8, v65 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_clause 0x1 @@ -14457,13 +14413,13 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:788 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 16, v2 ; ALIGNED-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:976 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(7) +; ALIGNED-NEXT: s_waitcnt vmcnt(6) ; ALIGNED-NEXT: v_lshl_or_b32 v0, v70, 8, v69 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:83 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:74 ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:988 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: s_waitcnt vmcnt(5) ; ALIGNED-NEXT: v_lshl_or_b32 v1, v80, 8, v71 ; ALIGNED-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:792 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:764 ; 4-byte Folded Spill @@ -14509,7 +14465,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_store_dword v70, off, s[0:3], s32 offset:968 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v71, off, s[0:3], s32 offset:972 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v80, off, s[0:3], s32 offset:980 ; 4-byte Folded Spill -; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:1000 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:87 ; ALIGNED-NEXT: s_waitcnt vmcnt(7) @@ -14744,7 +14700,9 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:146 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:150 +; ALIGNED-NEXT: buffer_load_ubyte v8, v4, s[0:3], 0 offen offset:151 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1284 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 ; ALIGNED-NEXT: s_clause 0x1 @@ -14756,8 +14714,10 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:139 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1292 ; 4-byte Folded Spill ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:140 +; ALIGNED-NEXT: s_waitcnt vmcnt(6) +; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(4) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1300 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(3) @@ -14778,7 +14738,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1324 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:1372 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1316 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14788,7 +14748,7 @@ define void 
@memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_ubyte v3, v4, s[0:3], 0 offen offset:147 ; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:148 -; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:145 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1328 ; 4-byte Folded Spill @@ -14796,7 +14756,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:1348 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1356 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1340 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -14804,231 +14764,171 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 ; ALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v7 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: v_lshl_or_b32 v1, v125, 8, v6 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v1, v8, 8, v6 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v0, v5, 8, v2 +; ALIGNED-NEXT: buffer_load_ubyte v2, v4, s[0:3], 0 offen offset:158 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:156 -; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:157 -; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:158 -; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:159 -; ALIGNED-NEXT: buffer_load_ubyte v106, v4, s[0:3], 0 offen offset:155 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v111, 8, v122 +; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:157 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:156 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v109, 8, v120 +; ALIGNED-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:1416 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v126, v4, s[0:3], 0 offen offset:159 +; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:155 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v1, v126, 8, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte 
Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:152 -; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:153 -; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:154 +; ALIGNED-NEXT: buffer_load_ubyte v123, v4, s[0:3], 0 offen offset:152 +; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:153 +; ALIGNED-NEXT: buffer_load_ubyte v111, v4, s[0:3], 0 offen offset:154 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v94, 8, v105 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v121, 8, v123 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v106, 8, v92 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v124, 8, v111 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:160 -; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:161 -; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:162 -; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:163 -; ALIGNED-NEXT: buffer_load_ubyte v88, v4, s[0:3], 0 offen offset:164 -; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:165 -; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:166 -; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:167 +; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:160 +; ALIGNED-NEXT: buffer_load_ubyte v105, v4, s[0:3], 0 offen offset:161 +; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:162 +; ALIGNED-NEXT: buffer_load_ubyte v92, v4, s[0:3], 0 offen offset:163 +; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:164 +; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:165 +; ALIGNED-NEXT: buffer_load_ubyte v94, v4, s[0:3], 0 offen offset:166 +; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:167 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v105, 8, v108 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v74, 8, v75 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v92, 8, v93 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v72, 8, v76 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v77, 8, v88 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v91, 8, v94 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v95, 8, v107 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:172 -; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:173 -; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:174 -; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:175 -; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:171 +; 
ALIGNED-NEXT: buffer_load_ubyte v89, v4, s[0:3], 0 offen offset:172 +; ALIGNED-NEXT: buffer_load_ubyte v79, v4, s[0:3], 0 offen offset:173 +; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:174 +; ALIGNED-NEXT: buffer_load_ubyte v77, v4, s[0:3], 0 offen offset:175 +; ALIGNED-NEXT: buffer_load_ubyte v75, v4, s[0:3], 0 offen offset:171 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v61, 8, v63 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v79, 8, v89 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v60, 8, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v77, 8, v78 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:168 -; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:169 -; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:170 +; ALIGNED-NEXT: buffer_load_ubyte v74, v4, s[0:3], 0 offen offset:168 +; ALIGNED-NEXT: buffer_load_ubyte v72, v4, s[0:3], 0 offen offset:169 +; ALIGNED-NEXT: buffer_load_ubyte v63, v4, s[0:3], 0 offen offset:170 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v56, 8, v59 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v72, 8, v74 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v58, 8, v47 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v75, 8, v63 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:176 -; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:177 -; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:178 -; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:179 -; ALIGNED-NEXT: buffer_load_ubyte v45, v4, s[0:3], 0 offen offset:180 -; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:181 -; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:182 -; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:183 +; ALIGNED-NEXT: buffer_load_ubyte v61, v4, s[0:3], 0 offen offset:176 +; ALIGNED-NEXT: buffer_load_ubyte v59, v4, s[0:3], 0 offen offset:177 +; ALIGNED-NEXT: buffer_load_ubyte v47, v4, s[0:3], 0 offen offset:178 +; ALIGNED-NEXT: buffer_load_ubyte v56, v4, s[0:3], 0 offen offset:179 +; ALIGNED-NEXT: buffer_load_ubyte v60, v4, s[0:3], 0 offen offset:180 +; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:181 +; ALIGNED-NEXT: buffer_load_ubyte v58, v4, s[0:3], 0 offen offset:182 +; ALIGNED-NEXT: buffer_load_ubyte v46, v4, s[0:3], 0 offen offset:183 ; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v44, 8, v46 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v59, 8, v61 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v119 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v56, 8, v47 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v118, 8, v42 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v41, 8, v45 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v46, 8, v58 +; ALIGNED-NEXT: buffer_store_dword 
v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 8, v60 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:188 -; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:189 -; ALIGNED-NEXT: buffer_load_ubyte v116, v4, s[0:3], 0 offen offset:190 -; ALIGNED-NEXT: buffer_load_ubyte v114, v4, s[0:3], 0 offen offset:191 -; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:187 +; ALIGNED-NEXT: buffer_load_ubyte v44, v4, s[0:3], 0 offen offset:188 +; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:189 +; ALIGNED-NEXT: buffer_load_ubyte v42, v4, s[0:3], 0 offen offset:190 +; ALIGNED-NEXT: buffer_load_ubyte v41, v4, s[0:3], 0 offen offset:191 +; ALIGNED-NEXT: buffer_load_ubyte v40, v4, s[0:3], 0 offen offset:187 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v115, 8, v117 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v43, 8, v44 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v114, 8, v116 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v41, 8, v42 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:184 -; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:185 -; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:186 +; ALIGNED-NEXT: buffer_load_ubyte v119, v4, s[0:3], 0 offen offset:184 +; ALIGNED-NEXT: buffer_load_ubyte v118, v4, s[0:3], 0 offen offset:185 +; ALIGNED-NEXT: buffer_load_ubyte v117, v4, s[0:3], 0 offen offset:186 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v103, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v118, 8, v119 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v1, v112, 8, v102 +; ALIGNED-NEXT: v_lshl_or_b32 v1, v40, 8, v117 ; ALIGNED-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:192 -; ALIGNED-NEXT: buffer_load_ubyte v98, v4, s[0:3], 0 offen offset:193 -; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:194 -; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:195 -; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:196 -; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:197 -; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:198 -; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:199 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v98, 8, v100 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v87 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v97, 8, v99 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v96 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; 
ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:204 -; ALIGNED-NEXT: buffer_load_ubyte v81, v4, s[0:3], 0 offen offset:205 -; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:206 -; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:207 -; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:203 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v81, 8, v83 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v80, 8, v82 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:200 -; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:201 -; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:202 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v69, 8, v70 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v71, 8, v68 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:212 -; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:213 -; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:214 -; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:215 -; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:211 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v54, 8, v67 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v65 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:216 -; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:217 -; ALIGNED-NEXT: buffer_load_ubyte v49, v4, s[0:3], 0 offen offset:218 -; ALIGNED-NEXT: buffer_load_ubyte v48, v4, s[0:3], 0 offen offset:219 -; ALIGNED-NEXT: buffer_load_ubyte v64, v4, s[0:3], 0 offen offset:220 -; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:221 -; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:222 -; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:223 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v53, 8, v66 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v48, 8, v49 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v51, 8, v64 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v39, 8, v50 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:208 -; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:209 -; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:210 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v36, 8, v38 -; 
ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v55, 8, v37 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x7 -; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:224 -; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:225 -; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:226 -; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:227 -; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:228 -; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:229 -; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:230 -; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:231 -; ALIGNED-NEXT: s_waitcnt vmcnt(6) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v35 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v29 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v31, 8, v34 -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v28, 8, v32 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x17 -; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:236 -; ALIGNED-NEXT: buffer_load_ubyte v25, v4, s[0:3], 0 offen offset:237 -; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:238 -; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:239 +; ALIGNED-NEXT: s_clause 0x3e +; ALIGNED-NEXT: buffer_load_ubyte v115, v4, s[0:3], 0 offen offset:192 +; ALIGNED-NEXT: buffer_load_ubyte v112, v4, s[0:3], 0 offen offset:193 +; ALIGNED-NEXT: buffer_load_ubyte v101, v4, s[0:3], 0 offen offset:194 +; ALIGNED-NEXT: buffer_load_ubyte v100, v4, s[0:3], 0 offen offset:195 +; ALIGNED-NEXT: buffer_load_ubyte v113, v4, s[0:3], 0 offen offset:196 +; ALIGNED-NEXT: buffer_load_ubyte v103, v4, s[0:3], 0 offen offset:197 +; ALIGNED-NEXT: buffer_load_ubyte v102, v4, s[0:3], 0 offen offset:198 +; ALIGNED-NEXT: buffer_load_ubyte v99, v4, s[0:3], 0 offen offset:199 +; ALIGNED-NEXT: buffer_load_ubyte v97, v4, s[0:3], 0 offen offset:204 +; ALIGNED-NEXT: buffer_load_ubyte v87, v4, s[0:3], 0 offen offset:205 +; ALIGNED-NEXT: buffer_load_ubyte v96, v4, s[0:3], 0 offen offset:206 +; ALIGNED-NEXT: buffer_load_ubyte v86, v4, s[0:3], 0 offen offset:207 +; ALIGNED-NEXT: buffer_load_ubyte v85, v4, s[0:3], 0 offen offset:203 +; ALIGNED-NEXT: buffer_load_ubyte v84, v4, s[0:3], 0 offen offset:200 +; ALIGNED-NEXT: buffer_load_ubyte v83, v4, s[0:3], 0 offen offset:201 +; ALIGNED-NEXT: buffer_load_ubyte v82, v4, s[0:3], 0 offen offset:202 +; ALIGNED-NEXT: buffer_load_ubyte v80, v4, s[0:3], 0 offen offset:212 +; ALIGNED-NEXT: buffer_load_ubyte v68, v4, s[0:3], 0 offen offset:213 +; ALIGNED-NEXT: buffer_load_ubyte v70, v4, s[0:3], 0 offen offset:214 +; ALIGNED-NEXT: buffer_load_ubyte v65, v4, s[0:3], 0 offen offset:215 +; ALIGNED-NEXT: buffer_load_ubyte v66, v4, s[0:3], 0 offen offset:211 +; ALIGNED-NEXT: buffer_load_ubyte v71, v4, s[0:3], 0 offen offset:216 +; ALIGNED-NEXT: buffer_load_ubyte v67, v4, s[0:3], 0 offen offset:217 +; ALIGNED-NEXT: buffer_load_ubyte v53, v4, s[0:3], 0 offen offset:218 +; ALIGNED-NEXT: buffer_load_ubyte v52, v4, s[0:3], 0 offen offset:219 +; ALIGNED-NEXT: buffer_load_ubyte v69, v4, s[0:3], 0 offen offset:220 +; ALIGNED-NEXT: buffer_load_ubyte v55, v4, s[0:3], 0 offen offset:221 
+; ALIGNED-NEXT: buffer_load_ubyte v54, v4, s[0:3], 0 offen offset:222 +; ALIGNED-NEXT: buffer_load_ubyte v51, v4, s[0:3], 0 offen offset:223 +; ALIGNED-NEXT: buffer_load_ubyte v50, v4, s[0:3], 0 offen offset:208 +; ALIGNED-NEXT: buffer_load_ubyte v38, v4, s[0:3], 0 offen offset:209 +; ALIGNED-NEXT: buffer_load_ubyte v39, v4, s[0:3], 0 offen offset:210 +; ALIGNED-NEXT: buffer_load_ubyte v37, v4, s[0:3], 0 offen offset:224 +; ALIGNED-NEXT: buffer_load_ubyte v35, v4, s[0:3], 0 offen offset:225 +; ALIGNED-NEXT: buffer_load_ubyte v31, v4, s[0:3], 0 offen offset:226 +; ALIGNED-NEXT: buffer_load_ubyte v32, v4, s[0:3], 0 offen offset:227 +; ALIGNED-NEXT: buffer_load_ubyte v36, v4, s[0:3], 0 offen offset:228 +; ALIGNED-NEXT: buffer_load_ubyte v33, v4, s[0:3], 0 offen offset:229 +; ALIGNED-NEXT: buffer_load_ubyte v34, v4, s[0:3], 0 offen offset:230 +; ALIGNED-NEXT: buffer_load_ubyte v30, v4, s[0:3], 0 offen offset:231 +; ALIGNED-NEXT: buffer_load_ubyte v29, v4, s[0:3], 0 offen offset:236 +; ALIGNED-NEXT: buffer_load_ubyte v27, v4, s[0:3], 0 offen offset:237 +; ALIGNED-NEXT: buffer_load_ubyte v28, v4, s[0:3], 0 offen offset:238 +; ALIGNED-NEXT: buffer_load_ubyte v26, v4, s[0:3], 0 offen offset:239 ; ALIGNED-NEXT: buffer_load_ubyte v23, v4, s[0:3], 0 offen offset:235 -; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:232 -; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:233 -; ALIGNED-NEXT: buffer_load_ubyte v20, v4, s[0:3], 0 offen offset:234 +; ALIGNED-NEXT: buffer_load_ubyte v24, v4, s[0:3], 0 offen offset:232 +; ALIGNED-NEXT: buffer_load_ubyte v22, v4, s[0:3], 0 offen offset:233 +; ALIGNED-NEXT: buffer_load_ubyte v21, v4, s[0:3], 0 offen offset:234 ; ALIGNED-NEXT: buffer_load_ubyte v19, v4, s[0:3], 0 offen offset:240 ; ALIGNED-NEXT: buffer_load_ubyte v17, v4, s[0:3], 0 offen offset:241 ; ALIGNED-NEXT: buffer_load_ubyte v13, v4, s[0:3], 0 offen offset:242 @@ -15044,100 +14944,135 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_ubyte v7, v4, s[0:3], 0 offen offset:251 ; ALIGNED-NEXT: buffer_load_ubyte v6, v4, s[0:3], 0 offen offset:248 ; ALIGNED-NEXT: buffer_load_ubyte v5, v4, s[0:3], 0 offen offset:249 +; ALIGNED-NEXT: s_clause 0x6 ; ALIGNED-NEXT: buffer_load_ubyte v1, v4, s[0:3], 0 offen offset:250 -; ALIGNED-NEXT: v_lshl_or_b32 v123, v3, 16, v2 ; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen -; ALIGNED-NEXT: s_waitcnt vmcnt(23) -; ALIGNED-NEXT: v_lshl_or_b32 v2, v25, 8, v27 -; ALIGNED-NEXT: s_waitcnt vmcnt(21) -; ALIGNED-NEXT: v_lshl_or_b32 v3, v24, 8, v26 -; ALIGNED-NEXT: s_waitcnt vmcnt(9) -; ALIGNED-NEXT: v_lshl_or_b32 v43, v12, 8, v16 -; ALIGNED-NEXT: s_waitcnt vmcnt(5) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v8, 8, v10 -; ALIGNED-NEXT: v_lshl_or_b32 v104, v3, 16, v2 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v21, 8, v22 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v20 +; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:2 +; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:4 +; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:5 +; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:6 +; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:7 +; ALIGNED-NEXT: s_waitcnt vmcnt(62) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v112, 8, v115 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v100, 8, v101 +; ALIGNED-NEXT: v_lshl_or_b32 v106, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v103, 8, v113 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v99, 8, v102 +; 
ALIGNED-NEXT: v_lshl_or_b32 v90, v3, 16, v2 +; ALIGNED-NEXT: s_waitcnt vmcnt(60) +; ALIGNED-NEXT: v_lshl_or_b32 v2, v87, 8, v97 +; ALIGNED-NEXT: s_waitcnt vmcnt(58) +; ALIGNED-NEXT: v_lshl_or_b32 v3, v86, 8, v96 +; ALIGNED-NEXT: s_waitcnt vmcnt(14) +; ALIGNED-NEXT: v_lshl_or_b32 v62, v12, 8, v16 +; ALIGNED-NEXT: s_waitcnt vmcnt(10) +; ALIGNED-NEXT: v_lshl_or_b32 v76, v8, 8, v10 +; ALIGNED-NEXT: s_waitcnt vmcnt(3) +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v88, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v83, 8, v84 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v85, 8, v82 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v109, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: buffer_store_dword v110, off, s[0:3], s32 offset:1384 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v122, off, s[0:3], s32 offset:1392 ; 4-byte Folded Spill ; ALIGNED-NEXT: v_lshl_or_b32 v73, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v68, 8, v80 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v65, 8, v70 +; ALIGNED-NEXT: v_lshl_or_b32 v45, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v67, 8, v71 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v52, 8, v53 +; ALIGNED-NEXT: v_lshl_or_b32 v116, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v55, 8, v69 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v51, 8, v54 +; ALIGNED-NEXT: v_lshl_or_b32 v114, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v38, 8, v50 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v66, 8, v39 +; ALIGNED-NEXT: v_lshl_or_b32 v98, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v35, 8, v37 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v32, 8, v31 +; ALIGNED-NEXT: v_lshl_or_b32 v81, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v33, 8, v36 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v30, 8, v34 +; ALIGNED-NEXT: v_lshl_or_b32 v64, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v27, 8, v29 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v26, 8, v28 +; ALIGNED-NEXT: v_lshl_or_b32 v49, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v22, 8, v24 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v23, 8, v21 +; ALIGNED-NEXT: v_lshl_or_b32 v48, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v2, v17, 8, v19 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v14, 8, v13 -; ALIGNED-NEXT: v_lshl_or_b32 v101, v3, 16, v2 +; ALIGNED-NEXT: v_lshl_or_b32 v25, v3, 16, v2 ; ALIGNED-NEXT: v_lshl_or_b32 v3, v15, 8, v18 -; ALIGNED-NEXT: v_lshl_or_b32 v84, v43, 16, v3 -; ALIGNED-NEXT: v_lshl_or_b32 v43, v9, 8, v11 -; ALIGNED-NEXT: v_lshl_or_b32 v3, v57, 16, v43 -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v43, v5, 8, v6 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v7, 8, v1 -; ALIGNED-NEXT: v_lshl_or_b32 v2, v57, 16, v43 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:1 -; ALIGNED-NEXT: s_waitcnt vmcnt(1) +; ALIGNED-NEXT: v_lshl_or_b32 v20, v62, 16, v3 +; ALIGNED-NEXT: v_lshl_or_b32 v62, v9, 8, v11 +; ALIGNED-NEXT: v_lshl_or_b32 v3, v76, 16, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v62, v5, 8, v6 +; ALIGNED-NEXT: v_lshl_or_b32 v76, v7, 8, v1 +; ALIGNED-NEXT: v_lshl_or_b32 v2, v76, 16, v62 +; ALIGNED-NEXT: s_clause 0x1 +; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:1 +; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:3 ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1336 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:1344 
; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x5 -; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:2 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:3 -; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:4 -; ALIGNED-NEXT: buffer_load_ubyte v90, v4, s[0:3], 0 offen offset:5 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:6 -; ALIGNED-NEXT: buffer_load_ubyte v124, v4, s[0:3], 0 offen offset:7 -; ALIGNED-NEXT: v_lshl_or_b32 v43, v43, 8, v0 -; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:1360 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v57, v57, 8, v127 -; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: buffer_store_dword v78, off, s[0:3], s32 offset:1364 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:1376 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_store_dword v120, off, s[0:3], s32 offset:1368 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: buffer_store_dword v91, off, s[0:3], s32 offset:1380 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 -; ALIGNED-NEXT: v_lshl_or_b32 v43, v90, 8, v78 +; ALIGNED-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:1344 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v124, 8, v91 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Spill -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Spill -; ALIGNED-NEXT: s_clause 0x4 -; ALIGNED-NEXT: buffer_load_ubyte v121, v4, s[0:3], 0 offen offset:12 -; ALIGNED-NEXT: buffer_load_ubyte v107, v4, s[0:3], 0 offen offset:13 -; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:14 -; ALIGNED-NEXT: buffer_load_ubyte v108, v4, s[0:3], 0 offen offset:15 -; ALIGNED-NEXT: buffer_load_ubyte v93, v4, s[0:3], 0 offen offset:11 +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1352 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v62, v62, 8, v0 +; ALIGNED-NEXT: v_lshl_or_b32 v76, v76, 8, v120 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: v_lshl_or_b32 v62, v109, 8, v104 +; ALIGNED-NEXT: v_lshl_or_b32 v76, v122, 8, v110 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill +; ALIGNED-NEXT: buffer_load_ubyte v0, v4, s[0:3], 0 offen offset:12 +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Spill +; ALIGNED-NEXT: s_clause 0x3 +; ALIGNED-NEXT: buffer_load_ubyte v127, v4, s[0:3], 0 offen offset:13 +; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:14 +; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:15 +; ALIGNED-NEXT: buffer_load_ubyte v120, v4, s[0:3], 0 offen offset:11 ; ALIGNED-NEXT: s_waitcnt vmcnt(3) -; ALIGNED-NEXT: v_lshl_or_b32 v43, v107, 8, v121 +; ALIGNED-NEXT: v_lshl_or_b32 v62, v127, 8, v0 +; ALIGNED-NEXT: s_waitcnt vmcnt(2) +; ALIGNED-NEXT: buffer_store_dword v76, off, s[0:3], s32 offset:1428 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v108, 8, v110 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 -; ALIGNED-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v76, v104, 8, v76 +; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:1432 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v95, v4, s[0:3], 0 offen offset:8 -; ALIGNED-NEXT: buffer_load_ubyte v91, v4, s[0:3], 0 offen offset:9 -; ALIGNED-NEXT: buffer_load_ubyte v90, v4, s[0:3], 0 offen offset:10 +; ALIGNED-NEXT: buffer_load_ubyte v122, v4, s[0:3], 0 offen offset:8 +; ALIGNED-NEXT: buffer_load_ubyte v110, v4, s[0:3], 0 offen offset:9 +; ALIGNED-NEXT: buffer_load_ubyte v109, v4, s[0:3], 0 offen offset:10 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) -; ALIGNED-NEXT: v_lshl_or_b32 v43, v91, 8, v95 +; ALIGNED-NEXT: v_lshl_or_b32 v62, v110, 8, v122 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: v_lshl_or_b32 v57, v93, 8, v90 -; ALIGNED-NEXT: v_lshl_or_b32 v0, v57, 16, v43 -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Spill +; ALIGNED-NEXT: v_lshl_or_b32 v76, v120, 8, v109 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v76, 16, v62 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Spill ; ALIGNED-NEXT: s_clause 0x2 -; ALIGNED-NEXT: buffer_load_ubyte v57, v4, s[0:3], 0 offen offset:18 -; ALIGNED-NEXT: buffer_load_ubyte v78, v4, s[0:3], 0 offen offset:16 -; ALIGNED-NEXT: buffer_load_ubyte v43, v4, s[0:3], 0 offen offset:17 +; ALIGNED-NEXT: buffer_load_ubyte v62, v4, s[0:3], 0 offen offset:18 +; ALIGNED-NEXT: buffer_load_ubyte v104, v4, s[0:3], 0 offen offset:16 +; ALIGNED-NEXT: buffer_load_ubyte v76, v4, s[0:3], 0 offen offset:17 ; ALIGNED-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:488 ; ALIGNED-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:492 -; ALIGNED-NEXT: buffer_store_dword v84, off, s[0:3], s32 offset:484 -; ALIGNED-NEXT: buffer_store_dword v101, off, s[0:3], s32 offset:480 +; ALIGNED-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:484 +; ALIGNED-NEXT: buffer_store_dword v25, off, s[0:3], s32 offset:480 ; ALIGNED-NEXT: s_clause 0x1 ; ALIGNED-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:704 ; ALIGNED-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:708 ; ALIGNED-NEXT: v_add_nc_u32_e32 v4, 0xffffff00, v4 ; ALIGNED-NEXT: s_waitcnt vmcnt(4) -; ALIGNED-NEXT: v_lshl_or_b32 v0, v126, 8, v57 +; ALIGNED-NEXT: v_lshl_or_b32 v0, v125, 8, v62 ; ALIGNED-NEXT: s_waitcnt vmcnt(2) -; ALIGNED-NEXT: v_lshl_or_b32 v126, v43, 8, v78 +; ALIGNED-NEXT: v_lshl_or_b32 v125, v76, 8, v104 ; ALIGNED-NEXT: s_waitcnt vmcnt(1) ; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v2, s4 ; ALIGNED-NEXT: s_waitcnt vmcnt(0) @@ -15158,165 +15093,153 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: flat_store_byte v[2:3], v16 offset:246 ; ALIGNED-NEXT: flat_store_byte v[2:3], v18 offset:244 ; ALIGNED-NEXT: flat_store_byte v[2:3], v19 offset:240 -; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:504 -; ALIGNED-NEXT: buffer_store_dword v104, off, s[0:3], s32 offset:508 -; ALIGNED-NEXT: buffer_store_dword v123, off, s[0:3], s32 offset:500 -; ALIGNED-NEXT: v_lshl_or_b32 v126, v0, 16, v126 +; ALIGNED-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:504 +; ALIGNED-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:508 +; ALIGNED-NEXT: buffer_store_dword v64, off, s[0:3], s32 offset:500 +; 
ALIGNED-NEXT: buffer_store_dword v81, off, s[0:3], s32 offset:496 +; ALIGNED-NEXT: flat_store_byte v[2:3], v21 offset:234 +; ALIGNED-NEXT: flat_store_byte v[2:3], v23 offset:235 +; ALIGNED-NEXT: flat_store_byte v[2:3], v22 offset:233 +; ALIGNED-NEXT: flat_store_byte v[2:3], v26 offset:239 +; ALIGNED-NEXT: flat_store_byte v[2:3], v27 offset:237 +; ALIGNED-NEXT: flat_store_byte v[2:3], v28 offset:238 +; ALIGNED-NEXT: flat_store_byte v[2:3], v29 offset:236 +; ALIGNED-NEXT: flat_store_byte v[2:3], v24 offset:232 +; ALIGNED-NEXT: flat_store_byte v[2:3], v31 offset:226 +; ALIGNED-NEXT: flat_store_byte v[2:3], v32 offset:227 +; ALIGNED-NEXT: flat_store_byte v[2:3], v35 offset:225 +; ALIGNED-NEXT: flat_store_byte v[2:3], v30 offset:231 +; ALIGNED-NEXT: flat_store_byte v[2:3], v33 offset:229 +; ALIGNED-NEXT: flat_store_byte v[2:3], v34 offset:230 +; ALIGNED-NEXT: flat_store_byte v[2:3], v36 offset:228 +; ALIGNED-NEXT: flat_store_byte v[2:3], v37 offset:224 +; ALIGNED-NEXT: buffer_store_dword v98, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v114, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_store_dword v116, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:452 +; ALIGNED-NEXT: flat_store_byte v[2:3], v68 offset:213 +; ALIGNED-NEXT: flat_store_byte v[2:3], v65 offset:215 +; ALIGNED-NEXT: flat_store_byte v[2:3], v38 offset:209 +; ALIGNED-NEXT: flat_store_byte v[2:3], v66 offset:211 +; ALIGNED-NEXT: flat_store_byte v[2:3], v39 offset:210 +; ALIGNED-NEXT: flat_store_byte v[2:3], v70 offset:214 +; ALIGNED-NEXT: flat_store_byte v[2:3], v80 offset:212 +; ALIGNED-NEXT: flat_store_byte v[2:3], v53 offset:218 +; ALIGNED-NEXT: flat_store_byte v[2:3], v52 offset:219 +; ALIGNED-NEXT: flat_store_byte v[2:3], v67 offset:217 +; ALIGNED-NEXT: flat_store_byte v[2:3], v51 offset:223 +; ALIGNED-NEXT: flat_store_byte v[2:3], v55 offset:221 +; ALIGNED-NEXT: flat_store_byte v[2:3], v54 offset:222 +; ALIGNED-NEXT: flat_store_byte v[2:3], v69 offset:220 +; ALIGNED-NEXT: flat_store_byte v[2:3], v71 offset:216 +; ALIGNED-NEXT: flat_store_byte v[2:3], v50 offset:208 +; ALIGNED-NEXT: buffer_store_dword v73, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v88, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_store_dword v90, off, s[0:3], s32 offset:468 +; ALIGNED-NEXT: buffer_store_dword v106, off, s[0:3], s32 offset:464 +; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:202 +; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:203 +; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:201 +; ALIGNED-NEXT: flat_store_byte v[2:3], v86 offset:207 +; ALIGNED-NEXT: flat_store_byte v[2:3], v87 offset:205 +; ALIGNED-NEXT: flat_store_byte v[2:3], v96 offset:206 +; ALIGNED-NEXT: flat_store_byte v[2:3], v97 offset:204 +; ALIGNED-NEXT: flat_store_byte v[2:3], v84 offset:200 +; ALIGNED-NEXT: flat_store_byte v[2:3], v101 offset:194 +; ALIGNED-NEXT: flat_store_byte v[2:3], v100 offset:195 +; ALIGNED-NEXT: flat_store_byte v[2:3], v112 offset:193 +; ALIGNED-NEXT: flat_store_byte v[2:3], v99 offset:199 +; ALIGNED-NEXT: flat_store_byte v[2:3], v103 offset:197 +; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:198 +; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:196 +; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:192 +; ALIGNED-NEXT: v_lshl_or_b32 v125, v0, 16, v125 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1484 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_add_u32 s4, s4, 0xffffff00 ; ALIGNED-NEXT: s_addc_u32 s5, s5, -1 ; ALIGNED-NEXT: 
s_cmp_eq_u64 s[4:5], s[6:7] ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:496 -; ALIGNED-NEXT: flat_store_byte v[2:3], v20 offset:234 -; ALIGNED-NEXT: flat_store_byte v[2:3], v23 offset:235 -; ALIGNED-NEXT: flat_store_byte v[2:3], v21 offset:233 -; ALIGNED-NEXT: flat_store_byte v[2:3], v24 offset:239 -; ALIGNED-NEXT: flat_store_byte v[2:3], v25 offset:237 -; ALIGNED-NEXT: flat_store_byte v[2:3], v26 offset:238 -; ALIGNED-NEXT: flat_store_byte v[2:3], v27 offset:236 -; ALIGNED-NEXT: flat_store_byte v[2:3], v22 offset:232 -; ALIGNED-NEXT: flat_store_byte v[2:3], v29 offset:226 -; ALIGNED-NEXT: flat_store_byte v[2:3], v30 offset:227 -; ALIGNED-NEXT: flat_store_byte v[2:3], v33 offset:225 -; ALIGNED-NEXT: flat_store_byte v[2:3], v28 offset:231 -; ALIGNED-NEXT: flat_store_byte v[2:3], v31 offset:229 -; ALIGNED-NEXT: flat_store_byte v[2:3], v32 offset:230 -; ALIGNED-NEXT: flat_store_byte v[2:3], v34 offset:228 -; ALIGNED-NEXT: flat_store_byte v[2:3], v35 offset:224 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1480 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1476 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:460 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1472 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:456 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 +; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:186 +; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:187 +; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:185 +; ALIGNED-NEXT: flat_store_byte v[2:3], v41 offset:191 +; ALIGNED-NEXT: flat_store_byte v[2:3], v43 offset:189 +; ALIGNED-NEXT: flat_store_byte v[2:3], v42 offset:190 +; ALIGNED-NEXT: flat_store_byte v[2:3], v44 offset:188 +; ALIGNED-NEXT: flat_store_byte v[2:3], v119 offset:184 +; ALIGNED-NEXT: flat_store_byte v[2:3], v47 offset:178 +; ALIGNED-NEXT: flat_store_byte v[2:3], v56 offset:179 +; ALIGNED-NEXT: flat_store_byte v[2:3], v59 offset:177 +; ALIGNED-NEXT: flat_store_byte v[2:3], v46 offset:183 +; ALIGNED-NEXT: flat_store_byte v[2:3], v57 offset:181 +; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:182 +; ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:180 +; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:176 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1468 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:452 -; ALIGNED-NEXT: flat_store_byte v[2:3], v54 offset:213 -; ALIGNED-NEXT: flat_store_byte v[2:3], v52 offset:215 -; ALIGNED-NEXT: flat_store_byte v[2:3], v36 offset:209 -; ALIGNED-NEXT: flat_store_byte v[2:3], v55 offset:211 -; ALIGNED-NEXT: flat_store_byte v[2:3], v37 offset:210 -; ALIGNED-NEXT: flat_store_byte v[2:3], v65 offset:214 -; ALIGNED-NEXT: flat_store_byte v[2:3], v67 offset:212 -; ALIGNED-NEXT: flat_store_byte v[2:3], v49 offset:218 -; ALIGNED-NEXT: flat_store_byte v[2:3], v48 offset:219 -; ALIGNED-NEXT: flat_store_byte v[2:3], v53 offset:217 -; ALIGNED-NEXT: 
flat_store_byte v[2:3], v39 offset:223 -; ALIGNED-NEXT: flat_store_byte v[2:3], v51 offset:221 -; ALIGNED-NEXT: flat_store_byte v[2:3], v50 offset:222 -; ALIGNED-NEXT: flat_store_byte v[2:3], v64 offset:220 -; ALIGNED-NEXT: flat_store_byte v[2:3], v66 offset:216 -; ALIGNED-NEXT: flat_store_byte v[2:3], v38 offset:208 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1464 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1460 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:476 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1456 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:468 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:464 -; ALIGNED-NEXT: flat_store_byte v[2:3], v68 offset:202 -; ALIGNED-NEXT: flat_store_byte v[2:3], v71 offset:203 -; ALIGNED-NEXT: flat_store_byte v[2:3], v69 offset:201 -; ALIGNED-NEXT: flat_store_byte v[2:3], v80 offset:207 -; ALIGNED-NEXT: flat_store_byte v[2:3], v81 offset:205 -; ALIGNED-NEXT: flat_store_byte v[2:3], v82 offset:206 -; ALIGNED-NEXT: flat_store_byte v[2:3], v83 offset:204 -; ALIGNED-NEXT: flat_store_byte v[2:3], v70 offset:200 -; ALIGNED-NEXT: flat_store_byte v[2:3], v87 offset:194 -; ALIGNED-NEXT: flat_store_byte v[2:3], v86 offset:195 -; ALIGNED-NEXT: flat_store_byte v[2:3], v98 offset:193 -; ALIGNED-NEXT: flat_store_byte v[2:3], v85 offset:199 -; ALIGNED-NEXT: flat_store_byte v[2:3], v97 offset:197 -; ALIGNED-NEXT: flat_store_byte v[2:3], v96 offset:198 -; ALIGNED-NEXT: flat_store_byte v[2:3], v99 offset:196 -; ALIGNED-NEXT: flat_store_byte v[2:3], v100 offset:192 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:552 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 +; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:170 +; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:171 +; ALIGNED-NEXT: flat_store_byte v[2:3], v72 offset:169 +; ALIGNED-NEXT: flat_store_byte v[2:3], v77 offset:175 +; ALIGNED-NEXT: flat_store_byte v[2:3], v79 offset:173 +; ALIGNED-NEXT: flat_store_byte v[2:3], v78 offset:174 +; ALIGNED-NEXT: flat_store_byte v[2:3], v89 offset:172 +; ALIGNED-NEXT: flat_store_byte v[2:3], v74 offset:168 +; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:162 +; ALIGNED-NEXT: flat_store_byte v[2:3], v92 offset:163 +; ALIGNED-NEXT: flat_store_byte v[2:3], v105 offset:161 +; ALIGNED-NEXT: flat_store_byte v[2:3], v91 offset:167 +; ALIGNED-NEXT: flat_store_byte v[2:3], v95 offset:165 +; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:166 +; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:164 +; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:160 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1444 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:556 -; 
ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:548 +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1436 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:544 -; ALIGNED-NEXT: flat_store_byte v[2:3], v102 offset:186 -; ALIGNED-NEXT: flat_store_byte v[2:3], v112 offset:187 -; ALIGNED-NEXT: flat_store_byte v[2:3], v103 offset:185 -; ALIGNED-NEXT: flat_store_byte v[2:3], v114 offset:191 -; ALIGNED-NEXT: flat_store_byte v[2:3], v115 offset:189 -; ALIGNED-NEXT: flat_store_byte v[2:3], v116 offset:190 -; ALIGNED-NEXT: flat_store_byte v[2:3], v117 offset:188 -; ALIGNED-NEXT: flat_store_byte v[2:3], v113 offset:184 -; ALIGNED-NEXT: flat_store_byte v[2:3], v119 offset:178 -; ALIGNED-NEXT: flat_store_byte v[2:3], v40 offset:179 -; ALIGNED-NEXT: flat_store_byte v[2:3], v44 offset:177 -; ALIGNED-NEXT: flat_store_byte v[2:3], v118 offset:183 -; ALIGNED-NEXT: flat_store_byte v[2:3], v41 offset:181 -; ALIGNED-NEXT: flat_store_byte v[2:3], v42 offset:182 -; ALIGNED-NEXT: flat_store_byte v[2:3], v45 offset:180 -; ALIGNED-NEXT: flat_store_byte v[2:3], v46 offset:176 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:568 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:572 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 +; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:154 +; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:155 +; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:153 +; ALIGNED-NEXT: flat_store_byte v[2:3], v126 offset:159 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:564 +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:157 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1420 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:560 -; ALIGNED-NEXT: flat_store_byte v[2:3], v47 offset:170 -; ALIGNED-NEXT: flat_store_byte v[2:3], v58 offset:171 -; ALIGNED-NEXT: flat_store_byte v[2:3], v56 offset:169 -; ALIGNED-NEXT: flat_store_byte v[2:3], v60 offset:175 -; ALIGNED-NEXT: flat_store_byte v[2:3], v61 offset:173 -; ALIGNED-NEXT: flat_store_byte v[2:3], v62 offset:174 -; ALIGNED-NEXT: flat_store_byte v[2:3], v63 offset:172 -; ALIGNED-NEXT: flat_store_byte v[2:3], v59 offset:168 -; ALIGNED-NEXT: flat_store_byte v[2:3], v75 offset:162 -; ALIGNED-NEXT: flat_store_byte v[2:3], v74 offset:163 -; ALIGNED-NEXT: flat_store_byte v[2:3], v79 offset:161 -; ALIGNED-NEXT: flat_store_byte v[2:3], v72 
offset:167 -; ALIGNED-NEXT: flat_store_byte v[2:3], v77 offset:165 -; ALIGNED-NEXT: flat_store_byte v[2:3], v76 offset:166 -; ALIGNED-NEXT: flat_store_byte v[2:3], v88 offset:164 -; ALIGNED-NEXT: flat_store_byte v[2:3], v89 offset:160 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1408 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:520 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:524 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload -; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:516 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:158 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:512 -; ALIGNED-NEXT: flat_store_byte v[2:3], v92 offset:154 -; ALIGNED-NEXT: flat_store_byte v[2:3], v106 offset:155 -; ALIGNED-NEXT: flat_store_byte v[2:3], v94 offset:153 -; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:159 -; ALIGNED-NEXT: flat_store_byte v[2:3], v111 offset:157 -; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:158 -; ALIGNED-NEXT: flat_store_byte v[2:3], v122 offset:156 -; ALIGNED-NEXT: flat_store_byte v[2:3], v105 offset:152 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:156 +; ALIGNED-NEXT: flat_store_byte v[2:3], v123 offset:152 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:146 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1348 ; 4-byte Folded Reload @@ -15325,14 +15248,16 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1340 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:145 -; ALIGNED-NEXT: flat_store_byte v[2:3], v125 offset:151 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:149 +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:151 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1372 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:149 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:150 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1356 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:148 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1332 ; 4-byte Folded Reload @@ -15767,7 +15692,7 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) 
align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:780 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:644 -; ALIGNED-NEXT: buffer_store_dword v126, off, s[0:3], s32 offset:640 +; ALIGNED-NEXT: buffer_store_dword v125, off, s[0:3], s32 offset:640 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:756 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:26 @@ -15792,11 +15717,11 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:732 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:24 -; ALIGNED-NEXT: flat_store_byte v[2:3], v57 offset:18 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1416 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[2:3], v62 offset:18 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1452 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:19 -; ALIGNED-NEXT: flat_store_byte v[2:3], v43 offset:17 +; ALIGNED-NEXT: flat_store_byte v[2:3], v76 offset:17 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:724 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:23 @@ -15809,40 +15734,50 @@ define void @memmove_p0_p5_sz2048(ptr addrspace(0) align 1 %dst, ptr addrspace(5 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:712 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:20 -; ALIGNED-NEXT: flat_store_byte v[2:3], v78 offset:16 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[2:3], v104 offset:16 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1448 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:664 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1404 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1440 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:668 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1396 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1412 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:660 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1388 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1400 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:656 -; ALIGNED-NEXT: flat_store_byte v[2:3], v90 offset:10 -; ALIGNED-NEXT: flat_store_byte v[2:3], v93 offset:11 -; ALIGNED-NEXT: flat_store_byte v[2:3], v107 offset:13 -; ALIGNED-NEXT: flat_store_byte v[2:3], v91 offset:9 -; ALIGNED-NEXT: flat_store_byte v[2:3], v108 offset:15 -; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:14 -; ALIGNED-NEXT: flat_store_byte v[2:3], v121 offset:12 -; ALIGNED-NEXT: flat_store_byte v[2:3], v95 offset:8 -; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:2 -; ALIGNED-NEXT: buffer_load_dword 
v0, off, s[0:3], s32 offset:1360 ; 4-byte Folded Reload +; ALIGNED-NEXT: flat_store_byte v[2:3], v109 offset:10 +; ALIGNED-NEXT: flat_store_byte v[2:3], v120 offset:11 +; ALIGNED-NEXT: flat_store_byte v[2:3], v127 offset:13 +; ALIGNED-NEXT: flat_store_byte v[2:3], v110 offset:9 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1432 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:15 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1428 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:14 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1424 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:12 +; ALIGNED-NEXT: flat_store_byte v[2:3], v122 offset:8 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1368 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:2 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1352 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:3 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1344 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:1 -; ALIGNED-NEXT: flat_store_byte v[2:3], v124 offset:7 -; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1376 ; 4-byte Folded Reload +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1392 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) -; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:5 +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:7 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1380 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) +; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:5 +; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1384 ; 4-byte Folded Reload +; ALIGNED-NEXT: s_waitcnt vmcnt(0) ; ALIGNED-NEXT: flat_store_byte v[2:3], v0 offset:6 ; ALIGNED-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:1364 ; 4-byte Folded Reload ; ALIGNED-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers-mmra.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers-mmra.ll new file mode 100644 index 0000000000000..1e6dc4e06ef4d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers-mmra.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s + + +define float @test_barrier_workgroup_local_mmra(ptr addrspace(3) noundef %x, ptr addrspace(3) noundef %y, float %val) { +; GFX10-WGP-LABEL: test_barrier_workgroup_local_mmra: +; GFX10-WGP: ; %bb.0: +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_barrier +; GFX10-WGP-NEXT: ds_read_b32 v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-WGP-LABEL: 
test_barrier_workgroup_local_mmra: +; GFX11-WGP: ; %bb.0: +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: ds_store_b32 v0, v2 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_barrier +; GFX11-WGP-NEXT: ds_load_b32 v0, v1 +; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-WGP-LABEL: test_barrier_workgroup_local_mmra: +; GFX12-WGP: ; %bb.0: +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_expcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: ds_store_b32 v0, v2 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: s_barrier_signal -1 +; GFX12-WGP-NEXT: s_barrier_wait -1 +; GFX12-WGP-NEXT: ds_load_b32 v0, v1 +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 +; GFX12-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_barrier_workgroup_local_mmra: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_barrier_signal -1 +; GFX1250-NEXT: s_barrier_wait -1 +; GFX1250-NEXT: ds_load_b32 v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + store float %val, ptr addrspace(3) %x + fence syncscope("workgroup") release, !mmra !0 + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire, !mmra !0 + %ret = load float, ptr addrspace(3) %y + ret float %ret +} + +define float @test_barrier_workgroup_global_mmra(ptr addrspace(1) noundef %x, ptr addrspace(1) noundef %y, float %val) { +; GFX10-WGP-LABEL: test_barrier_workgroup_global_mmra: +; GFX10-WGP: ; %bb.0: +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v4, off +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_barrier +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_load_dword v0, v[2:3], off +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-WGP-LABEL: test_barrier_workgroup_global_mmra: +; GFX11-WGP: ; %bb.0: +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-WGP-NEXT: global_store_b32 v[0:1], v4, off +; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_barrier +; GFX11-WGP-NEXT: buffer_gl0_inv +; GFX11-WGP-NEXT: global_load_b32 v0, v[2:3], off +; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX11-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-WGP-LABEL: test_barrier_workgroup_global_mmra: +; GFX12-WGP: ; %bb.0: +; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-WGP-NEXT: s_wait_expcnt 0x0 +; GFX12-WGP-NEXT: s_wait_samplecnt 0x0 +; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0 +; GFX12-WGP-NEXT: s_wait_kmcnt 0x0 +; GFX12-WGP-NEXT: global_store_b32 v[0:1], v4, off +; GFX12-WGP-NEXT: s_wait_storecnt 0x0 +; GFX12-WGP-NEXT: s_barrier_signal -1 +; GFX12-WGP-NEXT: s_barrier_wait -1 +; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE +; GFX12-WGP-NEXT: global_load_b32 v0, v[2:3], off +; GFX12-WGP-NEXT: s_wait_loadcnt 0x0 +; GFX12-WGP-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: test_barrier_workgroup_global_mmra: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_store_b32 v[0:1], v4, off +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_barrier_signal -1 +; GFX1250-NEXT: s_barrier_wait -1 +; GFX1250-NEXT: global_load_b32 v0, v[2:3], off +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: 
s_set_pc_i64 s[30:31] + store float %val, ptr addrspace(1) %x + fence syncscope("workgroup") release, !mmra !1 + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire, !mmra !1 + %ret = load float, ptr addrspace(1) %y + ret float %ret +} + +!0 = !{!"amdgpu-synchronize-as", !"local"} +!1 = !{!"amdgpu-synchronize-as", !"global"} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll index cc42428e1aa06..8b0b099999f06 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll @@ -143,14 +143,17 @@ define amdgpu_kernel void @workgroup_release_fence() { ; ; GFX12-WGP-LABEL: workgroup_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: workgroup_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -213,14 +216,17 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; ; GFX12-WGP-LABEL: workgroup_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: workgroup_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -283,14 +289,17 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; ; GFX12-WGP-LABEL: workgroup_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: workgroup_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: workgroup_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -670,14 +679,17 @@ define amdgpu_kernel void @agent_release_fence() { ; ; GFX12-WGP-LABEL: agent_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: agent_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: agent_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -740,14 +752,17 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; ; GFX12-WGP-LABEL: agent_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: agent_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: agent_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -810,14 +825,17 @@ define amdgpu_kernel void 
@agent_seq_cst_fence() { ; ; GFX12-WGP-LABEL: agent_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: agent_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: agent_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -1197,14 +1215,17 @@ define amdgpu_kernel void @system_release_fence() { ; ; GFX12-WGP-LABEL: system_release_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: system_release_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: system_release_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence release, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -1267,14 +1288,17 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; ; GFX12-WGP-LABEL: system_acq_rel_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: system_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: system_acq_rel_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} @@ -1337,14 +1361,17 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; ; GFX12-WGP-LABEL: system_seq_cst_fence: ; GFX12-WGP: ; %bb.0: ; %entry +; GFX12-WGP-NEXT: s_wait_dscnt 0x0 ; GFX12-WGP-NEXT: s_endpgm ; ; GFX12-CU-LABEL: system_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry +; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; ; GFX1250-LABEL: system_seq_cst_fence: ; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm entry: fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll index bc905fa564f8a..80ea48be0b893 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: private_last_use_load_0: @@ -13,6 +14,17 @@ define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: private_last_use_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_LU +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm entry: %val = load i32, ptr addrspace(5) %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr addrspace(1) %out @@ -36,6 +48,20 @@ define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr add ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: private_last_use_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v1, v1, s2 scale_offset th:TH_LOAD_LU +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %val.gep = getelementptr inbounds i32, ptr addrspace(5) %in, i32 %tid @@ -57,6 +83,17 @@ define amdgpu_kernel void @private_last_use_and_volatile_load(ptr addrspace(5) % ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: private_last_use_and_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm entry: %val = load volatile i32, ptr addrspace(5) %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr addrspace(1) %out @@ -74,6 +111,17 @@ define amdgpu_kernel void @private_last_use_and_nontemporal_load(ptr addrspace(5 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: private_last_use_and_nontemporal_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_LU +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm entry: %val = load i32, ptr addrspace(5) %in, align 4, !amdgpu.last.use !{}, !nontemporal !0 store i32 %val, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index 2aa4f021c259c..89de17ecbd1e8 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @private_nontemporal_load_0( ; GFX6-LABEL: private_nontemporal_load_0: @@ -201,6 +202,17 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_nontemporal_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(5) %in, align 4, !nontemporal !0 @@ -450,6 +462,20 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_nontemporal_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v1, v1, s2 scale_offset th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -627,6 +653,17 @@ define amdgpu_kernel void @private_nontemporal_store_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_nontemporal_store_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 th:TH_STORE_NT +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -846,6 +883,20 @@ define amdgpu_kernel void @private_nontemporal_store_1( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX12-CU-NEXT: scratch_store_b32 v1, v0, s0 th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_nontemporal_store_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v0, s2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: scratch_store_b32 v1, v0, s0 scale_offset th:TH_STORE_NT +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1047,6 +1098,17 @@ define amdgpu_kernel void @private_nontemporal_volatile_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_nontemporal_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_NT scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(5) %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index df4193969f8a0..7faa0621aa6d0 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -8,6 +8,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @private_volatile_load_0( ; GFX6-LABEL: private_volatile_load_0: @@ -155,6 +156,17 @@ define amdgpu_kernel void @private_volatile_load_0( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_volatile_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v1, off, s2 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(5) %in, align 4 @@ -340,6 +352,20 @@ define amdgpu_kernel void @private_volatile_load_1( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_volatile_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: scratch_load_b32 v1, v1, s2 scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(5) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -490,6 +516,18 @@ define amdgpu_kernel void @private_volatile_store_0( ; GFX12-CU-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: private_volatile_store_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: scratch_store_b32 off, v0, s0 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -664,6 +702,21 @@ define amdgpu_kernel void @private_volatile_store_1( ; GFX12-CU-NEXT: scratch_store_b32 v1, v0, s0 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; 
GFX1250-LABEL: private_volatile_store_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v0, s2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: scratch_store_b32 v1, v0, s0 scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(5) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 7e3d5c97391e1..baccb4c7d0859 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -3221,7 +3221,7 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a, ; GFX1250-NEXT: s_load_b128 s[12:15], s[4:5], 0x4c ; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-NEXT: s_wait_xcnt 0x0 -; GFX1250-NEXT: s_mov_b64 s[4:5], lit64(0xffffffff) +; GFX1250-NEXT: s_mov_b64 s[4:5], 0xffffffff ; GFX1250-NEXT: s_mov_b32 s3, 0 ; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-NEXT: s_mov_b32 s7, s3 diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index bf8994e005fc5..3d9c2a29cb9c1 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -815,9 +815,10 @@ define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) { ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s2, s2, 9 -; SI-NEXT: v_mul_hi_u32 v0, s2, v0 -; SI-NEXT: s_mul_i32 s2, s2, 0xff803fe1 -; SI-NEXT: v_alignbit_b32 v0, v0, s2, 1 +; SI-NEXT: s_mul_i32 s4, s2, 0xff803fe1 +; SI-NEXT: v_mul_hi_u32 v1, s2, v0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 1 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -832,7 +833,7 @@ define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) { ; VI-NEXT: s_lshr_b32 s0, s0, 9 ; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v0, 0 ; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_alignbit_b32 v0, v1, v0, 1 +; VI-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1] ; VI-NEXT: s_nop 2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -844,11 +845,11 @@ define amdgpu_kernel void @test_umul_i24(ptr addrspace(1) %out, i32 %arg) { ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s2, 9 -; GFX9-NEXT: s_mul_hi_u32 s4, s2, 0xff803fe1 -; GFX9-NEXT: s_mul_i32 s2, s2, 0xff803fe1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1 +; GFX9-NEXT: s_mul_hi_u32 s5, s2, 0xff803fe1 +; GFX9-NEXT: s_mul_i32 s4, s2, 0xff803fe1 +; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 ; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 9f27e1ffd9130..b0651ef53dd1b 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -791,7 +791,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 
-; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x3f80000000000000) +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x3f80000000000000 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -803,7 +803,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x3f80000000000000) +; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x3f80000000000000 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -851,7 +851,7 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x400000003f800000) +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x400000003f800000 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -863,7 +863,7 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000) +; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x400000003f800000 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -1989,7 +1989,7 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4040000040800000) +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x4040000040800000 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v4, s[0:1] scale_offset ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -2001,7 +2001,7 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0 -; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000) +; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x4040000040800000 ; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -2907,8 +2907,8 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; GFX1250-SDAG: ; %bb.0: ; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-SDAG-NEXT: v_and_b32_e32 v6, 0x3ff, v0 -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], lit64(0x4040000040800000) -; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[4:5], lit64(0x400000003f800000) +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[2:3], 0x4040000040800000 +; GFX1250-SDAG-NEXT: v_mov_b64_e32 v[4:5], 0x400000003f800000 ; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v6, s[0:1] scale_offset ; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 @@ -2920,9 +2920,9 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) { ; GFX1250-GISEL: ; %bb.0: ; GFX1250-GISEL-NEXT: 
s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1250-GISEL-NEXT: v_and_b32_e32 v6, 0x3ff, v0 -; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], lit64(0x4040000040800000) +; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x4040000040800000 ; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: s_mov_b64 s[4:5], lit64(0x400000003f800000) +; GFX1250-GISEL-NEXT: s_mov_b64 s[4:5], 0x400000003f800000 ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] ; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] ; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/postra-machine-sink-livein-subrange.mir b/llvm/test/CodeGen/AMDGPU/postra-machine-sink-livein-subrange.mir new file mode 100644 index 0000000000000..eb48ff08f1b7c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/postra-machine-sink-livein-subrange.mir @@ -0,0 +1,113 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -run-pass=postra-machine-sink -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s + +# Test live-in with subrange is updated accordingly in postra-machine-sink. +--- +name: test_postra_machine_sink_livein_update +tracksRegLiveness: true +frameInfo: + adjustsStack: true +stack: + - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: 0, size: 8, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: 0, size: 8, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' +body: | + ; GCN-LABEL: name: test_postra_machine_sink_livein_update + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $sgpr30, $sgpr31, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr40, $sgpr30_sgpr31 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr44 = COPY $vgpr13, implicit $exec + ; GCN-NEXT: renamable $vgpr43 = COPY $vgpr12, implicit $exec + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit undef $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $exec, $sgpr30, $sgpr31, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr40, $sgpr30_sgpr31, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr57 = COPY $vgpr9, implicit $exec + ; GCN-NEXT: renamable $vgpr56 = COPY $vgpr8, implicit $exec + ; GCN-NEXT: renamable $vgpr59 = COPY $vgpr7, implicit $exec + ; GCN-NEXT: renamable $vgpr58 = COPY $vgpr6, implicit $exec + ; GCN-NEXT: renamable $vgpr61 = COPY $vgpr5, implicit $exec + ; GCN-NEXT: renamable $vgpr60 = COPY $vgpr4, implicit $exec + ; GCN-NEXT: renamable $vgpr42 = COPY $vgpr3, implicit $exec + ; GCN-NEXT: renamable $vgpr41 = COPY $vgpr2, implicit $exec + ; GCN-NEXT: renamable $vgpr46 = COPY $vgpr1, implicit 
$exec + ; GCN-NEXT: renamable $vgpr45 = COPY $vgpr0, implicit $exec + ; GCN-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GCN-NEXT: renamable $sgpr16_sgpr17 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31 + ; GCN-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31 + ; GCN-NEXT: SI_SPILL_AV64_SAVE killed $vgpr14_vgpr15, %stack.1, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.1, align 4, addrspace 5) + ; GCN-NEXT: SI_SPILL_AV64_SAVE killed $vgpr10_vgpr11, %stack.2, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.2, align 4, addrspace 5) + ; GCN-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu, implicit-def dead $vgpr0 + ; GCN-NEXT: renamable $vgpr14_vgpr15 = SI_SPILL_AV64_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.1, align 4, addrspace 5) + ; GCN-NEXT: renamable $vgpr0_vgpr1 = nofpexcept V_FMA_F64_e64 0, killed $vgpr45_vgpr46, 0, killed $vgpr41_vgpr42, 0, killed $vgpr60_vgpr61, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GCN-NEXT: FLAT_STORE_DWORDX2 killed renamable $vgpr58_vgpr59, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) + ; GCN-NEXT: renamable $vgpr0_vgpr1 = SI_SPILL_AV64_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.2, align 4, addrspace 5) + ; GCN-NEXT: FLAT_STORE_DWORDX2 killed renamable $vgpr0_vgpr1, killed renamable $vgpr56_vgpr57, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $vgpr40, $vgpr14_vgpr15:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: renamable $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 0, implicit $exec + ; GCN-NEXT: FLAT_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr43_vgpr44, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) + ; GCN-NEXT: FLAT_STORE_DWORDX2 killed renamable $vgpr0_vgpr1, killed renamable $vgpr14_vgpr15, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) + ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31 + bb.0: + successors: %bb.2(0x40000000), %bb.1(0x40000000) + liveins: $sgpr30, $sgpr31, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr40, $sgpr30_sgpr31 + + renamable $vgpr44 = COPY $vgpr13, implicit $exec + renamable $vgpr43 = COPY $vgpr12, implicit $exec + renamable $vgpr57 = COPY $vgpr9, implicit $exec + renamable $vgpr56 = COPY $vgpr8, implicit $exec + renamable $vgpr59 = COPY $vgpr7, implicit $exec + renamable $vgpr58 = COPY $vgpr6, implicit $exec + renamable $vgpr61 = COPY $vgpr5, implicit $exec + renamable $vgpr60 = COPY $vgpr4, implicit $exec + renamable $vgpr42 = COPY $vgpr3, implicit $exec + renamable $vgpr41 = COPY $vgpr2, implicit $exec + renamable $vgpr46 = COPY $vgpr1, implicit $exec + renamable $vgpr45 = COPY $vgpr0, implicit $exec + S_CBRANCH_SCC1 %bb.2, implicit undef $scc + S_BRANCH %bb.1 + + bb.1: + successors: %bb.2(0x80000000) + liveins: $sgpr30, $sgpr31, $vgpr40, $sgpr30_sgpr31, $vgpr10_vgpr11:0x000000000000000F, $vgpr14_vgpr15:0x000000000000000F, $vgpr41_vgpr42:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F, $vgpr45_vgpr46:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, 
$vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F + + ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + renamable $sgpr16_sgpr17 = IMPLICIT_DEF + $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr40, implicit-def $sgpr30_sgpr31, implicit $sgpr30_sgpr31 + $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr31, 1, $vgpr40, implicit $sgpr30_sgpr31 + SI_SPILL_AV64_SAVE killed $vgpr14_vgpr15, %stack.1, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.1, align 4, addrspace 5) + SI_SPILL_AV64_SAVE killed $vgpr10_vgpr11, %stack.2, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.2, align 4, addrspace 5) + dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, 0, csr_amdgpu, implicit-def dead $vgpr0 + renamable $vgpr14_vgpr15 = SI_SPILL_AV64_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.1, align 4, addrspace 5) + renamable $vgpr0_vgpr1 = nofpexcept V_FMA_F64_e64 0, killed $vgpr45_vgpr46, 0, killed $vgpr41_vgpr42, 0, killed $vgpr60_vgpr61, 0, 0, implicit $mode, implicit $exec + ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + FLAT_STORE_DWORDX2 killed renamable $vgpr58_vgpr59, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) + renamable $vgpr0_vgpr1 = SI_SPILL_AV64_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.2, align 4, addrspace 5) + FLAT_STORE_DWORDX2 killed renamable $vgpr0_vgpr1, killed renamable $vgpr56_vgpr57, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) + + bb.2: + liveins: $vgpr40, $vgpr14_vgpr15:0x000000000000000F, $vgpr43_vgpr44:0x000000000000000F + + renamable $vgpr0_vgpr1 = V_MOV_B64_PSEUDO 0, implicit $exec + FLAT_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr43_vgpr44, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) + FLAT_STORE_DWORDX2 killed renamable $vgpr0_vgpr1, killed renamable $vgpr14_vgpr15, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64)) + S_SETPC_B64_return undef $sgpr30_sgpr31 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll index 79910af5c0434..93f4ea37117ba 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll @@ -929,9 +929,8 @@ define i32 @zext_fptrunc_fma_f16(float %x, float %y, float %z) { ; GFX11-TRUE16-LABEL: zext_fptrunc_fma_f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 +; GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: zext_fptrunc_fma_f16: diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll index c572185e7bbf6..4ea58a5890d35 100644 --- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -619,8 +619,7 @@ define i32 @atomicrmw_dec_private_i32(ptr addrspace(5) %ptr) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_i32_e32 v2, vcc, -1, v1 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, 1, v1 ; GCN-NEXT: v_cmp_lt_u32_e64 s[4:5], 4, v1 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, 4, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 83c521043025c..aa131ed6c9db1 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -377,63 +377,63 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: v_mov_b32_e32 v10, 0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v11, 0 -; GFX8-NEXT: s_movk_i32 s0, 0x7f +; GFX8-NEXT: v_mov_b32_e32 v13, 0x7f ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_mov_b32 s0, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_mov_b32 s1, 0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffb000, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[4:5] +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[4:5] ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffb800, v2 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[6:7] +; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[6:7] ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffc000, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[4:5] +; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[4:5] ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v19, vcc, 0xffffd800, v2 -; GFX8-NEXT: v_addc_u32_e32 v20, vcc, -1, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v21, vcc, 0xffffe000, v2 -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, -1, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffd800, v2 +; 
GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe000, v2 +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v3, vcc ; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5] -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[19:20] -; GFX8-NEXT: s_addk_i32 s1, 0x2000 -; GFX8-NEXT: s_cmp_gt_u32 s1, 0x3fffff +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[20:21] +; GFX8-NEXT: s_addk_i32 s0, 0x2000 +; GFX8-NEXT: s_cmp_gt_u32 s0, 0x3fffff ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v23, vcc, v13, v10 -; GFX8-NEXT: v_addc_u32_e32 v24, vcc, v14, v11, vcc +; GFX8-NEXT: v_add_u32_e32 v24, vcc, v14, v10 +; GFX8-NEXT: v_addc_u32_e32 v25, vcc, v15, v11, vcc ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffe800, v2 ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v13, vcc, 0xfffff000, v2 -; GFX8-NEXT: flat_load_dwordx2 v[19:20], v[21:22] +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xfffff000, v2 +; GFX8-NEXT: flat_load_dwordx2 v[20:21], v[22:23] ; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] -; GFX8-NEXT: v_addc_u32_e32 v14, vcc, -1, v3, vcc +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v21, vcc, v15, v23 -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v16, v24, vcc -; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0xfffff800, v2 -; GFX8-NEXT: flat_load_dwordx2 v[13:14], v[13:14] -; GFX8-NEXT: v_addc_u32_e32 v16, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[15:16], v[15:16] +; GFX8-NEXT: v_add_u32_e32 v22, vcc, v16, v24 +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v17, v25, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xfffff800, v2 +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[14:15] +; GFX8-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[16:17] ; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_add_u32_e32 v21, vcc, v17, v21 -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, v18, v22, vcc -; GFX8-NEXT: flat_load_dwordx2 v[17:18], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v22, vcc, v18, v22 +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v19, v23, vcc +; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[2:3] ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x10000, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v21 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v22, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v22 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v23, vcc ; GFX8-NEXT: s_waitcnt vmcnt(6) ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc @@ -441,30 +441,27 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v19, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v20, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v20, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v21, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v10, v4 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v13, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v14, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v14, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v16, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v16, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v17, v5, vcc ; 
GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v17, v4 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v18, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v18, v4 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v19, v5, vcc ; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX8-NEXT: s_add_i32 s1, s0, -1 -; GFX8-NEXT: s_cmp_eq_u32 s0, 0 -; GFX8-NEXT: s_cbranch_scc1 .LBB1_5 -; GFX8-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 -; GFX8-NEXT: s_mov_b32 s0, s1 -; GFX8-NEXT: s_branch .LBB1_1 -; GFX8-NEXT: .LBB1_5: ; %while.end +; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, 1, v13 +; GFX8-NEXT: s_and_b64 vcc, exec, vcc +; GFX8-NEXT: s_cbranch_vccz .LBB1_1 +; GFX8-NEXT: ; %bb.4: ; %while.end ; GFX8-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v12 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -503,7 +500,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX900-NEXT: v_mov_b32_e32 v5, 0 -; GFX900-NEXT: s_movk_i32 s5, 0x7f +; GFX900-NEXT: v_mov_b32_e32 v7, 0x7f ; GFX900-NEXT: s_movk_i32 s2, 0xd000 ; GFX900-NEXT: s_movk_i32 s3, 0xe000 ; GFX900-NEXT: s_movk_i32 s4, 0xf000 @@ -511,77 +508,74 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: ; =>This Loop Header: Depth=1 ; GFX900-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 +; GFX900-NEXT: s_mov_b32 s5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 -; GFX900-NEXT: s_mov_b32 s6, 0 ; GFX900-NEXT: .LBB1_2: ; %for.body ; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, 0xffffb000, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[9:10], v[2:3], off offset:-4096 -; GFX900-NEXT: global_load_dwordx2 v[11:12], v[2:3], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, 0xffffc000, v2 -; GFX900-NEXT: global_load_dwordx2 v[7:8], v[7:8], off -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 -; GFX900-NEXT: global_load_dwordx2 v[19:20], v[13:14], off -; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, s2, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, -1, v3, vcc -; GFX900-NEXT: v_add_co_u32_e32 v13, vcc, s3, v2 -; GFX900-NEXT: global_load_dwordx2 v[15:16], v[15:16], off offset:-2048 -; GFX900-NEXT: v_addc_co_u32_e32 v14, vcc, -1, v3, vcc -; GFX900-NEXT: s_addk_i32 s6, 0x2000 -; GFX900-NEXT: s_cmp_gt_u32 s6, 0x3fffff +; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048 +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2 +; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off +; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, s2, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s3, v2 +; GFX900-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc +; GFX900-NEXT: s_addk_i32 s5, 0x2000 +; 
GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff ; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_co_u32_e32 v21, vcc, v7, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc -; GFX900-NEXT: global_load_dwordx2 v[7:8], v[13:14], off offset:-4096 +; GFX900-NEXT: v_add_co_u32_e32 v22, vcc, v8, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:-4096 ; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_co_u32_e64 v23, s[0:1], v17, v21 -; GFX900-NEXT: v_addc_co_u32_e64 v24, s[0:1], v18, v5, s[0:1] -; GFX900-NEXT: global_load_dwordx2 v[17:18], v[13:14], off offset:-2048 -; GFX900-NEXT: global_load_dwordx2 v[21:22], v[13:14], off +; GFX900-NEXT: v_add_co_u32_e64 v24, s[0:1], v18, v22 +; GFX900-NEXT: v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1] +; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[22:23], v[14:15], off ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s4, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048 ; GFX900-NEXT: s_waitcnt vmcnt(5) -; GFX900-NEXT: v_add_co_u32_e32 v19, vcc, v19, v23 -; GFX900-NEXT: global_load_dwordx2 v[13:14], v[2:3], off -; GFX900-NEXT: v_addc_co_u32_e32 v20, vcc, v20, v24, vcc +; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, v20, v24 +; GFX900-NEXT: global_load_dwordx2 v[14:15], v[2:3], off +; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, v21, v25, vcc ; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX900-NEXT: s_waitcnt vmcnt(5) -; GFX900-NEXT: v_add_co_u32_e32 v15, vcc, v15, v19 -; GFX900-NEXT: v_addc_co_u32_e32 v16, vcc, v16, v20, vcc +; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v16, v20 +; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v21, vcc ; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v7, v15 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v8, v16, vcc +; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v8, v16 +; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v17, vcc ; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v17, v7 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v18, v8, vcc +; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v18, v8 +; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v19, v9, vcc ; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v7, vcc, v21, v7 -; GFX900-NEXT: v_addc_co_u32_e32 v8, vcc, v22, v8, vcc +; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v22, v8 +; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v23, v9, vcc ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v8, vcc -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v9, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v10, v5, vcc -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v11, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v12, v5, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v9, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v13, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v14, v5, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc ; GFX900-NEXT: s_cbranch_scc0 .LBB1_2 ; 
GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX900-NEXT: s_add_i32 s0, s5, -1 -; GFX900-NEXT: s_cmp_eq_u32 s5, 0 -; GFX900-NEXT: s_cbranch_scc1 .LBB1_5 -; GFX900-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 -; GFX900-NEXT: s_mov_b32 s5, s0 -; GFX900-NEXT: s_branch .LBB1_1 -; GFX900-NEXT: .LBB1_5: ; %while.end +; GFX900-NEXT: v_subrev_co_u32_e32 v7, vcc, 1, v7 +; GFX900-NEXT: s_and_b64 vcc, exec, vcc +; GFX900-NEXT: s_cbranch_vccz .LBB1_1 +; GFX900-NEXT: ; %bb.4: ; %while.end ; GFX900-NEXT: v_mov_b32_e32 v1, s35 ; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v6 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -612,7 +606,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: s_movk_i32 s1, 0x7f +; GFX10-NEXT: v_mov_b32_e32 v7, 0x7f ; GFX10-NEXT: v_and_b32_e32 v6, 0xfe000000, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 3, v6 ; GFX10-NEXT: v_add_co_u32 v0, s0, v0, s34 @@ -624,77 +618,74 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v4, v0 -; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: s_mov_b32 s1, 0 ; GFX10-NEXT: .LBB1_2: ; %for.body ; GFX10-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v4, 0xffffb800 -; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v4, 0xffffc800 -; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, -1, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v4, 0xffffd800 -; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, -1, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v17, vcc_lo, v4, 0xffffe800 +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, 0xffffb800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, -1, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v4, 0xffffc800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, -1, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, v4, 0xffffd800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, -1, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v18, vcc_lo, v4, 0xffffe800 ; GFX10-NEXT: s_clause 0x2 -; GFX10-NEXT: global_load_dwordx2 v[11:12], v[7:8], off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[15:16], v[9:10], off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[19:20], v[13:14], off offset:-2048 -; GFX10-NEXT: v_add_co_ci_u32_e32 v18, vcc_lo, -1, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v21, vcc_lo, 0xfffff000, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, -1, v5, vcc_lo +; GFX10-NEXT: global_load_dwordx2 v[12:13], v[8:9], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[16:17], v[10:11], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[20:21], v[14:15], off offset:-2048 +; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, -1, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v22, vcc_lo, 0xfffff000, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v23, vcc_lo, -1, v5, vcc_lo ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_dwordx2 v[23:24], v[17:18], off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[7:8], v[7:8], off -; GFX10-NEXT: global_load_dwordx2 v[9:10], v[9:10], off -; GFX10-NEXT: global_load_dwordx2 v[13:14], v[13:14], off -; GFX10-NEXT: global_load_dwordx2 v[25:26], v[17:18], off -; GFX10-NEXT: global_load_dwordx2 v[27:28], v[21:22], off -; GFX10-NEXT: global_load_dwordx2 v[29:30], v[4:5], 
off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[31:32], v[4:5], off +; GFX10-NEXT: global_load_dwordx2 v[24:25], v[18:19], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX10-NEXT: global_load_dwordx2 v[10:11], v[10:11], off +; GFX10-NEXT: global_load_dwordx2 v[14:15], v[14:15], off +; GFX10-NEXT: global_load_dwordx2 v[26:27], v[18:19], off +; GFX10-NEXT: global_load_dwordx2 v[28:29], v[22:23], off +; GFX10-NEXT: global_load_dwordx2 v[30:31], v[4:5], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[32:33], v[4:5], off ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x10000, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo -; GFX10-NEXT: s_addk_i32 s2, 0x2000 -; GFX10-NEXT: s_cmp_gt_u32 s2, 0x3fffff +; GFX10-NEXT: s_addk_i32 s1, 0x2000 +; GFX10-NEXT: s_cmp_gt_u32 s1, 0x3fffff ; GFX10-NEXT: s_waitcnt vmcnt(10) -; GFX10-NEXT: v_add_co_u32 v2, s0, v11, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v12, v3, s0 +; GFX10-NEXT: v_add_co_u32 v2, s0, v12, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v13, v3, s0 ; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_add_co_u32 v2, s0, v7, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v8, v3, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v15, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v16, v3, s0 +; GFX10-NEXT: v_add_co_u32 v2, s0, v8, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v9, v3, s0 +; GFX10-NEXT: v_add_co_u32 v2, s0, v16, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v17, v3, s0 ; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_add_co_u32 v2, s0, v9, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v10, v3, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v19, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v20, v3, s0 +; GFX10-NEXT: v_add_co_u32 v2, s0, v10, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v11, v3, s0 +; GFX10-NEXT: v_add_co_u32 v2, s0, v20, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v21, v3, s0 ; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_add_co_u32 v2, s0, v13, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v14, v3, s0 -; GFX10-NEXT: v_add_co_u32 v2, s0, v23, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v24, v3, s0 +; GFX10-NEXT: v_add_co_u32 v2, s0, v14, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v15, v3, s0 +; GFX10-NEXT: v_add_co_u32 v2, s0, v24, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v25, v3, s0 ; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_add_co_u32 v2, s0, v25, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v26, v3, s0 +; GFX10-NEXT: v_add_co_u32 v2, s0, v26, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v27, v3, s0 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_add_co_u32 v2, s0, v27, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v28, v3, s0 +; GFX10-NEXT: v_add_co_u32 v2, s0, v28, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v29, v3, s0 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_add_co_u32 v2, s0, v29, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v30, v3, s0 +; GFX10-NEXT: v_add_co_u32 v2, s0, v30, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v31, v3, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v31, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v32, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v32, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v33, v3, vcc_lo ; GFX10-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX10-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX10-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX10-NEXT: s_add_i32 s0, s1, -1 -; GFX10-NEXT: s_cmp_eq_u32 s1, 0 -; GFX10-NEXT: s_cbranch_scc1 .LBB1_5 -; GFX10-NEXT: ; %bb.4: 
; in Loop: Header=BB1_1 Depth=1 -; GFX10-NEXT: s_mov_b32 s1, s0 -; GFX10-NEXT: s_branch .LBB1_1 -; GFX10-NEXT: .LBB1_5: ; %while.end +; GFX10-NEXT: v_sub_co_u32 v7, s0, v7, 1 +; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX10-NEXT: s_cbranch_vccz .LBB1_1 +; GFX10-NEXT: ; %bb.4: ; %while.end ; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v6 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off @@ -731,15 +722,15 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, s0, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0 -; GFX90A-NEXT: s_movk_i32 s3, 0x7f +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x7f ; GFX90A-NEXT: s_movk_i32 s0, 0xd000 ; GFX90A-NEXT: s_movk_i32 s1, 0xe000 ; GFX90A-NEXT: s_movk_i32 s2, 0xf000 ; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB1_2 Depth 2 +; GFX90A-NEXT: s_mov_b32 s3, 0 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: s_mov_b32 s4, 0 ; GFX90A-NEXT: .LBB1_2: ; %for.body ; GFX90A-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 @@ -766,49 +757,46 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX90A-NEXT: s_addk_i32 s4, 0x2000 -; GFX90A-NEXT: s_cmp_gt_u32 s4, 0x3fffff +; GFX90A-NEXT: s_addk_i32 s3, 0x2000 +; GFX90A-NEXT: s_cmp_gt_u32 s3, 0x3fffff ; GFX90A-NEXT: s_waitcnt vmcnt(8) -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v12, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(7) -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v18, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v19, v4, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(6) -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v20, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v21, v4, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(5) -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v16, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v17, v4, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(4) -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v24, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v25, v4, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(3) -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v26, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v27, v4, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(2) -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v28, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v29, v4, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v14, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v15, v4, vcc -; 
GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v8, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, v10, v1 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v4, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc ; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX90A-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX90A-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX90A-NEXT: s_add_i32 s4, s3, -1 -; GFX90A-NEXT: s_cmp_eq_u32 s3, 0 -; GFX90A-NEXT: s_cbranch_scc1 .LBB1_5 -; GFX90A-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 -; GFX90A-NEXT: s_mov_b32 s3, s4 -; GFX90A-NEXT: s_branch .LBB1_1 -; GFX90A-NEXT: .LBB1_5: ; %while.end +; GFX90A-NEXT: v_subrev_co_u32_e32 v1, vcc, 1, v1 +; GFX90A-NEXT: s_and_b64 vcc, exec, vcc +; GFX90A-NEXT: s_cbranch_vccz .LBB1_1 +; GFX90A-NEXT: ; %bb.4: ; %while.end ; GFX90A-NEXT: v_mov_b32_e32 v1, s35 ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -828,8 +816,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 17, v0 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0xff, v0 -; GFX11-NEXT: s_movk_i32 s1, 0x7f -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v7, 0x7f +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_and_b32_e32 v6, 0xfe000000, v1 ; GFX11-NEXT: v_lshl_or_b32 v0, v0, 3, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -843,95 +831,92 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB1_2: ; %for.body ; GFX11-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v4, 0xffffc000 -; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, -1, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v9, vcc_lo, 0xffffc000, v4 +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, 0xffffc000 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, 0xffffc000, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v5, vcc_lo -; GFX11-NEXT: global_load_b64 v[13:14], v[7:8], off offset:-4096 -; GFX11-NEXT: v_add_co_u32 v11, vcc_lo, 0xffffd000, v4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, -1, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v15, vcc_lo, v4, 0xffffe000 -; GFX11-NEXT: global_load_b64 v[9:10], v[9:10], off offset:-2048 -; GFX11-NEXT: v_add_co_ci_u32_e64 v16, null, -1, v5, vcc_lo -; GFX11-NEXT: global_load_b64 v[11:12], v[11:12], off offset:-2048 
-; GFX11-NEXT: v_add_co_u32 v17, vcc_lo, 0xffffe000, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v5, vcc_lo +; GFX11-NEXT: global_load_b64 v[14:15], v[8:9], off offset:-4096 +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, 0xffffd000, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v4, 0xffffe000 +; GFX11-NEXT: global_load_b64 v[10:11], v[10:11], off offset:-2048 +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, -1, v5, vcc_lo +; GFX11-NEXT: global_load_b64 v[12:13], v[12:13], off offset:-2048 +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, 0xffffe000, v4 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[19:20], v[15:16], off offset:-4096 -; GFX11-NEXT: global_load_b64 v[7:8], v[7:8], off -; GFX11-NEXT: v_add_co_ci_u32_e64 v18, null, -1, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v21, vcc_lo, 0xfffff000, v4 +; GFX11-NEXT: global_load_b64 v[20:21], v[16:17], off offset:-4096 +; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, -1, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, 0xfffff000, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, -1, v5, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, -1, v5, vcc_lo ; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: global_load_b64 v[17:18], v[17:18], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[15:16], v[15:16], off -; GFX11-NEXT: global_load_b64 v[21:22], v[21:22], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[23:24], v[4:5], off offset:-4096 -; GFX11-NEXT: global_load_b64 v[25:26], v[4:5], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[27:28], v[4:5], off +; GFX11-NEXT: global_load_b64 v[18:19], v[18:19], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[16:17], v[16:17], off +; GFX11-NEXT: global_load_b64 v[22:23], v[22:23], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[24:25], v[4:5], off offset:-4096 +; GFX11-NEXT: global_load_b64 v[26:27], v[4:5], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[28:29], v[4:5], off ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x10000, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo -; GFX11-NEXT: s_addk_i32 s2, 0x2000 -; GFX11-NEXT: s_cmp_gt_u32 s2, 0x3fffff +; GFX11-NEXT: s_addk_i32 s1, 0x2000 +; GFX11-NEXT: s_cmp_gt_u32 s1, 0x3fffff ; GFX11-NEXT: s_waitcnt vmcnt(10) -; GFX11-NEXT: v_add_co_u32 v2, s0, v13, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v14, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v14, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v15, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(9) -; GFX11-NEXT: v_add_co_u32 v2, s0, v9, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v10, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v10, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v11, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: v_add_co_u32 v2, s0, v7, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v8, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v8, v3, s0 -; GFX11-NEXT: v_add_co_u32 v2, s0, v11, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v3, s0 +; GFX11-NEXT: v_add_co_u32 v2, s0, v12, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v12, v3, s0 -; GFX11-NEXT: v_add_co_u32 v2, s0, v19, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, s0 +; GFX11-NEXT: v_add_co_u32 v2, s0, v20, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v20, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v21, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: v_add_co_u32 v2, s0, v17, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v18, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v18, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v19, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: v_add_co_u32 v2, s0, v15, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v16, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v17, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_add_co_u32 v2, s0, v21, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v22, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v22, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v23, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_add_co_u32 v2, s0, v23, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v24, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v24, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v25, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_add_co_u32 v2, s0, v25, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v26, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v26, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v27, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v27, v2 +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v28, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v28, v3, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v29, v3, vcc_lo ; GFX11-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX11-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX11-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX11-NEXT: s_add_i32 s0, s1, -1 -; GFX11-NEXT: s_cmp_eq_u32 s1, 0 -; GFX11-NEXT: s_cbranch_scc1 .LBB1_5 -; GFX11-NEXT: ; %bb.4: ; in Loop: Header=BB1_1 Depth=1 -; GFX11-NEXT: s_mov_b32 s1, s0 -; GFX11-NEXT: s_branch .LBB1_1 -; GFX11-NEXT: .LBB1_5: ; %while.end +; GFX11-NEXT: v_sub_co_u32 v7, s0, v7, 1 +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_vccz .LBB1_1 +; GFX11-NEXT: ; %bb.4: ; %while.end ; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll index e674fafb79d9f..4355495621593 100644 --- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll +++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll @@ -85,8 +85,8 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr ; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3 ; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GISEL-NEXT: s_endpgm - %a = call ptr addrspace(8) 
@llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %a.flat, i16 0, i32 16, i32 0) - %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %b.flat, i16 0, i32 16, i32 0) + %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %a.flat, i16 0, i64 16, i32 0) + %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %b.flat, i16 0, i64 16, i32 0) %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0) %s0 = fmul float %l0, %l0 @@ -211,4 +211,4 @@ declare i32 @llvm.amdgcn.workitem.id.x() declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr readnone nocapture, i16, i32, i32) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr readnone nocapture, i16, i64, i32) diff --git a/llvm/test/CodeGen/AMDGPU/remat-sop.mir b/llvm/test/CodeGen/AMDGPU/remat-sop.mir index 1da55cf535449..cb652db425421 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-sop.mir +++ b/llvm/test/CodeGen/AMDGPU/remat-sop.mir @@ -91,15 +91,17 @@ body: | bb.0: ; GCN-LABEL: name: test_no_remat_s_mov_b32_vreg_src_short_lr ; GCN: renamable $sgpr0 = IMPLICIT_DEF - ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0 - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 renamable $sgpr0 - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 killed renamable $sgpr0 + ; GCN-NEXT: renamable $sgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $sgpr0 = S_MOV_B32 killed renamable $sgpr0 - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) - ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.1, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = IMPLICIT_DEF + ; GCN-NEXT: renamable $sgpr0 = S_MOV_B32 killed renamable $sgpr0 + ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 + ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_ENDPGM 0 %0:sreg_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/s-cluster-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-cluster-barrier.ll new file mode 100644 index 0000000000000..dc2e09dda2193 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/s-cluster-barrier.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck 
-check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX12,GFX12-ISEL %s + +define amdgpu_kernel void @kernel1() #0 { +; GFX12-LABEL: kernel1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_cmp_eq_u32 0, 0 +; GFX12-NEXT: s_barrier_signal_isfirst -1 +; GFX12-NEXT: s_barrier_wait -1 +; GFX12-NEXT: s_cselect_b32 s0, -1, 0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_vccnz .LBB0_2 +; GFX12-NEXT: ; %bb.1: +; GFX12-NEXT: s_barrier_signal -3 +; GFX12-NEXT: .LBB0_2: +; GFX12-NEXT: s_barrier_wait -3 +; GFX12-NEXT: s_get_barrier_state s0, -3 +; GFX12-NEXT: s_endpgm + call void @llvm.amdgcn.s.cluster.barrier() + %state3 = call i32 @llvm.amdgcn.s.get.barrier.state(i32 -3) + ret void +} + +declare void @llvm.amdgcn.s.cluster.barrier() #1 +declare i32 @llvm.amdgcn.s.get.barrier.state(i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { convergent nounwind } +attributes #2 = { nounwind readnone } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX12-ISEL: {{.*}} +; GFX12-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll new file mode 100644 index 0000000000000..a828ee0a7883c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/s_uaddo_usubo_pseudo.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -stop-after=amdgpu-isel < %s | FileCheck -check-prefixes=GCN-ISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s +; Ensure that S_UADDO_PSEUDO is selected when carryout user is S_ADD_CO_PSEUDO + +; GCN-ISEL-LABEL: name: s_uaddo_pseudo +; GCN-ISEL-LABEL: body: +; GCN-ISEL: S_UADDO_PSEUDO +; GCN-ISEL: S_ADD_CO_PSEUDO + +define amdgpu_ps i32 @s_uaddo_pseudo(i32 inreg %val0) { +; CHECK-LABEL: s_uaddo_pseudo: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_add_u32 s0, s0, 1 +; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 +; CHECK-NEXT: s_addc_u32 s0, 1, 0 +; CHECK-NEXT: ; return to shader part epilog + %pair = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %val0, i32 1) + %carryout = extractvalue { i32, i1 } %pair, 1 + %zext_carryout = zext i1 %carryout to i32 + %result = add i32 %zext_carryout, 1 + ret i32 %result +} + +; GCN-ISEL-LABEL: name: s_usubo_pseudo +; GCN-ISEL-LABEL: body: +; GCN-ISEL: S_USUBO_PSEUDO +; GCN-ISEL: S_SUB_CO_PSEUDO + +define amdgpu_ps i32 @s_usubo_pseudo(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: s_usubo_pseudo: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_sub_u32 s0, s0, 1 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_cmp_lg_u64 s[2:3], 0 +; CHECK-NEXT: s_subb_u32 s0, s1, 0 +; CHECK-NEXT: ; return to shader part epilog + %pair = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %val0, i32 1) + %carryout = extractvalue { i32, i1 } %pair, 1 + %zext_carryout = zext i1 %carryout to i32 + %result = sub i32 %val1, %zext_carryout + ret i32 %result +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN-ISEL: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index 0b58b328bbfb6..68c33487b0596 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -67,9 +67,9 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: v_sad_u32 v2, s0, v0, v1 +; GCN-NEXT: v_sad_u32 v2, s1, v0, v1 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_dword v[0:1], v2 @@ -249,10 +249,10 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i ; GCN-NEXT: s_addc_u32 s21, s21, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s3, s0, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_sad_u32 v3, s0, v0, v1 +; GCN-NEXT: v_sad_u32 v3, s1, v0, v1 ; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 @@ -284,8 +284,8 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out ; GCN-NEXT: s_add_u32 s20, s20, s17 ; GCN-NEXT: s_addc_u32 s21, s21, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_min_u32 s3, s0, s1 -; GCN-NEXT: s_max_u32 s0, s0, s1 +; GCN-NEXT: s_min_u32 s3, s1, s0 +; GCN-NEXT: s_max_u32 s0, s1, s0 ; GCN-NEXT: s_sub_i32 s0, s0, s3 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s0 @@ -583,17 +583,17 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; GCN-NEXT: s_add_i32 s12, s12, s17 -; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s3, s0, s3 -; GCN-NEXT: s_sub_i32 s6, s1, s0 -; GCN-NEXT: s_cmp_lt_u32 s1, s0 -; GCN-NEXT: s_cselect_b32 s0, s3, s6 -; GCN-NEXT: s_add_i32 s0, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_sub_i32 s0, s0, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, s2, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm %icmp0 = icmp ugt i32 %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 5e76c7d7c734f..697bcc3b8fb47 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -6,8 +6,9 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_sdiv: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s8, s1, 31 ; GCN-NEXT: s_add_u32 s0, s0, s8 @@ -16,126 +17,158 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9] ; 
GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_sub_u32 s4, 0, s10 -; GCN-NEXT: s_subb_u32 s5, 0, s11 +; GCN-NEXT: s_sub_u32 s12, 0, s10 +; GCN-NEXT: s_subb_u32 s13, 0, s11 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s12, s3, 31 -; GCN-NEXT: s_add_u32 s2, s2, s12 -; GCN-NEXT: s_mov_b32 s13, s12 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: s_addc_u32 s3, s3, s12 -; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] -; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s3, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s3, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 -; GCN-NEXT: v_mov_b32_e32 v5, s11 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s10, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: 
v_sub_i32_e32 v4, vcc, s3, v2 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 -; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s10, v3 -; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v5 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4 -; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] -; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 1, v0 -; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] -; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 -; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v5, v6, v8, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v6, s3 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GCN-NEXT: s_xor_b64 s[0:1], s[12:13], s[8:9] -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s1, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 +; GCN-NEXT: v_readfirstlane_b32 s14, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_mul_i32 s1, s12, s14 +; GCN-NEXT: v_readfirstlane_b32 s17, v2 +; GCN-NEXT: s_mul_i32 s15, s13, s0 +; GCN-NEXT: s_mul_i32 s16, s12, s0 +; GCN-NEXT: s_add_i32 s1, s17, s1 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s16 +; GCN-NEXT: s_add_i32 s1, s1, s15 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s16 +; GCN-NEXT: v_readfirstlane_b32 s15, v3 +; GCN-NEXT: s_mul_i32 s17, s0, s1 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 +; GCN-NEXT: s_add_u32 s15, s15, s17 +; GCN-NEXT: v_readfirstlane_b32 s17, v0 +; GCN-NEXT: s_addc_u32 s17, 0, s17 +; GCN-NEXT: s_mul_i32 s16, s14, s16 +; GCN-NEXT: v_readfirstlane_b32 s18, v4 +; GCN-NEXT: s_add_u32 s15, s15, s16 +; GCN-NEXT: s_addc_u32 s15, s17, s18 +; GCN-NEXT: v_readfirstlane_b32 s16, v1 +; GCN-NEXT: s_addc_u32 s16, s16, 0 +; GCN-NEXT: s_mul_i32 s1, s14, s1 +; GCN-NEXT: s_add_u32 s1, s15, s1 +; GCN-NEXT: s_addc_u32 s15, 0, s16 +; GCN-NEXT: s_add_u32 s16, s0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s12, v0 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_addc_u32 s14, s14, s15 +; GCN-NEXT: s_mul_i32 s0, s12, s14 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_add_i32 s0, s1, s0 +; GCN-NEXT: s_mul_i32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s1, s12, s16 +; GCN-NEXT: s_add_i32 s0, s0, s13 ; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mul_hi_u32 v3, s14, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s16, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s14, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s16, v0 +; GCN-NEXT: s_mul_i32 s13, s16, s0 +; GCN-NEXT: v_readfirstlane_b32 s17, v2 +; GCN-NEXT: s_add_u32 s13, s17, s13 +; GCN-NEXT: v_readfirstlane_b32 s15, v0 +; GCN-NEXT: s_mul_i32 s1, s14, s1 
+; GCN-NEXT: s_addc_u32 s15, 0, s15 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_add_u32 s1, s13, s1 +; GCN-NEXT: s_addc_u32 s1, s15, s12 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s0, s14, s0 +; GCN-NEXT: s_add_u32 s0, s1, s0 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: s_add_u32 s15, s16, s0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_addc_u32 s14, s14, s12 +; GCN-NEXT: s_ashr_i32 s12, s7, 31 +; GCN-NEXT: s_add_u32 s0, s6, s12 +; GCN-NEXT: s_mov_b32 s13, s12 +; GCN-NEXT: s_addc_u32 s1, s7, s12 +; GCN-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] +; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: v_mul_hi_u32 v1, s7, v2 +; GCN-NEXT: s_mul_i32 s1, s6, s14 +; GCN-NEXT: v_readfirstlane_b32 s16, v3 +; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 +; GCN-NEXT: s_add_u32 s1, s16, s1 +; GCN-NEXT: s_addc_u32 s4, 0, s4 +; GCN-NEXT: s_mul_i32 s15, s7, s15 +; GCN-NEXT: v_readfirstlane_b32 s16, v1 +; GCN-NEXT: s_add_u32 s1, s1, s15 +; GCN-NEXT: s_addc_u32 s1, s4, s16 +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: s_addc_u32 s4, s4, 0 +; GCN-NEXT: s_mul_i32 s14, s7, s14 +; GCN-NEXT: s_add_u32 s14, s1, s14 +; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_addc_u32 s15, 0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_mul_i32 s4, s10, s15 +; GCN-NEXT: v_readfirstlane_b32 s5, v0 +; GCN-NEXT: s_add_i32 s4, s5, s4 +; GCN-NEXT: s_mul_i32 s5, s11, s14 +; GCN-NEXT: s_add_i32 s16, s4, s5 +; GCN-NEXT: s_sub_i32 s17, s7, s16 +; GCN-NEXT: s_mul_i32 s4, s10, s14 +; GCN-NEXT: s_sub_u32 s6, s6, s4 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s18, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s18, 0 +; GCN-NEXT: s_subb_u32 s17, s17, s11 +; GCN-NEXT: s_sub_u32 s19, s6, s10 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s4, s17, 0 +; GCN-NEXT: s_cmp_ge_u32 s4, s11 +; GCN-NEXT: s_cselect_b32 s5, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s19, s10 +; GCN-NEXT: s_cselect_b32 s17, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s4, s11 +; GCN-NEXT: s_cselect_b32 s4, s17, s5 +; GCN-NEXT: s_add_u32 s5, s14, 1 +; GCN-NEXT: s_addc_u32 s17, s15, 0 +; GCN-NEXT: s_add_u32 s19, s14, 2 +; GCN-NEXT: s_addc_u32 s20, s15, 0 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cselect_b32 s4, s19, s5 +; GCN-NEXT: s_cselect_b32 s5, s20, s17 +; GCN-NEXT: s_cmp_lg_u32 s18, 0 +; GCN-NEXT: s_subb_u32 s7, s7, s16 +; GCN-NEXT: s_cmp_ge_u32 s7, s11 +; GCN-NEXT: s_cselect_b32 s16, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s6, s10 +; GCN-NEXT: s_cselect_b32 s6, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s7, s11 +; GCN-NEXT: s_cselect_b32 s6, s6, s16 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cselect_b32 s5, s5, s15 +; GCN-NEXT: s_cselect_b32 s4, s4, s14 +; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9] +; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7] +; GCN-NEXT: s_sub_u32 s4, s4, s6 +; GCN-NEXT: s_subb_u32 s5, s5, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv: @@ -1040,27 +1073,26 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; 
GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_sext_i32_i16 s1, s9 -; GCN-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NEXT: v_alignbit_b32 v0, s1, v0, 24 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_sext_i32_i16 s0, s3 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_alignbit_b32 v2, s0, v2, 24 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GCN-NEXT: v_or_b32_e32 v0, 1, v0 -; GCN-NEXT: v_mul_f32_e32 v2, v3, v4 +; GCN-NEXT: s_sext_i32_i16 s9, s9 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_lshr_b64 s[0:1], s[8:9], 24 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-NEXT: s_sext_i32_i16 s3, s3 +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_xor_b32 s0, s2, s0 +; GCN-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-NEXT: s_or_b32 s2, s0, 1 +; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mad_f32 v3, -v2, v1, v3 +; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -1074,27 +1106,26 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: s_sext_i32_i16 s1, s9 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: v_alignbit_b32 v0, s1, v0, 24 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_sext_i32_i16 s0, s3 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s2 -; GCN-IR-NEXT: v_alignbit_b32 v2, s0, v2, 24 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v2 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GCN-IR-NEXT: v_xor_b32_e32 v0, v2, v0 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GCN-IR-NEXT: v_or_b32_e32 v0, 1, v0 -; GCN-IR-NEXT: v_mul_f32_e32 v2, v3, v4 +; GCN-IR-NEXT: s_sext_i32_i16 s9, s9 +; GCN-IR-NEXT: s_mov_b32 s5, s1 +; GCN-IR-NEXT: s_lshr_b64 s[0:1], s[8:9], 24 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 +; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_xor_b32 s0, s2, s0 +; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 +; GCN-IR-NEXT: s_or_b32 s2, s0, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-IR-NEXT: v_mad_f32 v3, -v2, v1, v3 +; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-IR-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v2 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -1111,116 +1142,145 @@ define 
amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-LABEL: s_test_sdiv_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i32 s8, s3, 31 -; GCN-NEXT: s_add_u32 s2, s2, s8 -; GCN-NEXT: s_mov_b32 s9, s8 -; GCN-NEXT: s_addc_u32 s3, s3, s8 -; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GCN-NEXT: s_sub_u32 s4, 0, s2 -; GCN-NEXT: s_subb_u32 s5, 0, s3 +; GCN-NEXT: s_ashr_i32 s4, s3, 31 +; GCN-NEXT: s_add_u32 s2, s2, s4 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_addc_u32 s3, s3, s4 +; GCN-NEXT: s_xor_b64 s[6:7], s[2:3], s[4:5] +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GCN-NEXT: s_sub_u32 s2, 0, s6 +; GCN-NEXT: s_subb_u32 s10, 0, s7 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 -; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 -; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 -; GCN-NEXT: v_mov_b32_e32 v4, s3 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-NEXT: 
v_mul_lo_u32 v2, s2, v0 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, 24, v2 -; GCN-NEXT: v_subb_u32_e64 v3, s[0:1], v3, v4, vcc -; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s2, v2 -; GCN-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v3 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v4 -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] -; GCN-NEXT: v_add_i32_e64 v4, s[0:1], 1, v0 -; GCN-NEXT: v_addc_u32_e64 v5, s[0:1], 0, 0, s[0:1] -; GCN-NEXT: v_add_i32_e64 v6, s[0:1], 2, v0 -; GCN-NEXT: v_addc_u32_e64 v7, s[0:1], 0, 0, s[0:1] -; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v6, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 -; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s8, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_readfirstlane_b32 s11, v1 +; GCN-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-NEXT: s_mul_i32 s9, s2, s11 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_mul_i32 s12, s10, s8 +; GCN-NEXT: s_mul_i32 s13, s2, s8 +; GCN-NEXT: s_add_i32 s9, s14, s9 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 +; GCN-NEXT: s_add_i32 s9, s9, s12 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s9 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: s_mul_i32 s15, s8, s9 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s9 +; GCN-NEXT: s_add_u32 s12, s12, s15 +; GCN-NEXT: v_readfirstlane_b32 s15, v0 +; GCN-NEXT: s_mul_i32 s13, s11, s13 +; GCN-NEXT: s_addc_u32 s15, 0, s15 +; GCN-NEXT: v_readfirstlane_b32 s14, v4 +; GCN-NEXT: s_add_u32 s12, s12, s13 +; GCN-NEXT: s_addc_u32 s12, s15, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s9, s11, s9 +; GCN-NEXT: s_add_u32 s9, s12, s9 +; GCN-NEXT: s_addc_u32 s12, 0, s13 +; GCN-NEXT: s_add_u32 s13, s8, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s13 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_addc_u32 s11, s11, s12 +; GCN-NEXT: s_mul_i32 s8, s2, s11 +; GCN-NEXT: v_readfirstlane_b32 s9, v0 +; GCN-NEXT: s_add_i32 s8, s9, s8 +; GCN-NEXT: s_mul_i32 s10, s10, s13 +; GCN-NEXT: s_mul_i32 s2, s2, s13 +; GCN-NEXT: s_add_i32 s8, s8, s10 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mul_hi_u32 v3, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s13, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s11, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s13, v0 +; GCN-NEXT: s_mul_i32 s10, s13, s8 +; GCN-NEXT: v_readfirstlane_b32 s14, v2 +; GCN-NEXT: s_add_u32 s10, s14, s10 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s2, s11, s2 +; GCN-NEXT: s_addc_u32 s12, 0, s12 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: s_add_u32 s2, 
s10, s2 +; GCN-NEXT: s_addc_u32 s2, s12, s9 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_mul_i32 s8, s11, s8 +; GCN-NEXT: s_add_u32 s2, s2, s8 +; GCN-NEXT: s_addc_u32 s10, 0, s9 +; GCN-NEXT: s_add_u32 s2, s13, s2 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_addc_u32 s8, s11, s10 +; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 +; GCN-NEXT: v_mul_hi_u32 v0, s8, 24 +; GCN-NEXT: s_mul_i32 s8, s8, 24 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_readfirstlane_b32 s9, v0 +; GCN-NEXT: s_add_u32 s8, s10, s8 +; GCN-NEXT: s_addc_u32 s10, 0, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-NEXT: s_mul_i32 s8, s7, s10 +; GCN-NEXT: v_readfirstlane_b32 s9, v0 +; GCN-NEXT: s_add_i32 s11, s9, s8 +; GCN-NEXT: s_sub_i32 s12, 0, s11 +; GCN-NEXT: s_mul_i32 s8, s6, s10 +; GCN-NEXT: s_sub_u32 s13, 24, s8 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s14, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_subb_u32 s12, s12, s7 +; GCN-NEXT: s_sub_u32 s15, s13, s6 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_subb_u32 s8, s12, 0 +; GCN-NEXT: s_cmp_ge_u32 s8, s7 +; GCN-NEXT: s_cselect_b32 s9, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s15, s6 +; GCN-NEXT: s_cselect_b32 s12, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, s7 +; GCN-NEXT: s_cselect_b32 s8, s12, s9 +; GCN-NEXT: s_add_u32 s9, s10, 1 +; GCN-NEXT: s_addc_u32 s12, 0, 0 +; GCN-NEXT: s_add_u32 s15, s10, 2 +; GCN-NEXT: s_addc_u32 s16, 0, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s8, s15, s9 +; GCN-NEXT: s_cselect_b32 s9, s16, s12 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_subb_u32 s11, 0, s11 +; GCN-NEXT: s_cmp_ge_u32 s11, s7 +; GCN-NEXT: s_cselect_b32 s12, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s6 +; GCN-NEXT: s_cselect_b32 s6, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s11, s7 +; GCN-NEXT: s_cselect_b32 s6, s6, s12 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cselect_b32 s7, s9, 0 +; GCN-NEXT: s_cselect_b32 s6, s8, s10 +; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] +; GCN-NEXT: s_sub_u32 s6, s6, s4 +; GCN-NEXT: s_subb_u32 s7, s7, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_sdiv_k_num_i64: diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll index 92d3277d5d3e3..bb22144b815a1 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll @@ -4148,28 +4148,28 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; -------------------------------------------------------------------------------- define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y) { -; CI-SAFE-LABEL: select_fneg_posk_src_add_v2f16: -; CI-SAFE: ; %bb.0: -; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_add_f32_e32 v3, 4.0, v3 -; CI-SAFE-NEXT: v_add_f32_e32 v2, 4.0, v2 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 
-; CI-SAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-SAFE-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-SAFE-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc -; CI-SAFE-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: select_fneg_posk_src_add_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_add_f32_e32 v3, 4.0, v3 +; CI-NEXT: v_add_f32_e32 v2, 4.0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc +; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_add_v2f16: ; VI-SAFE: ; %bb.0: @@ -4229,21 +4229,6 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; CI-NSZ-LABEL: select_fneg_posk_src_add_v2f16: -; CI-NSZ: ; %bb.0: -; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NSZ-NEXT: v_sub_f32_e32 v2, -4.0, v2 -; CI-NSZ-NEXT: v_sub_f32_e32 v3, -4.0, v3 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc -; CI-NSZ-NEXT: s_setpc_b64 s[30:31] -; ; VI-NSZ-LABEL: select_fneg_posk_src_add_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4302,6 +4287,105 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ret <2 x half> %select } +define <2 x half> @select_fneg_posk_src_add_v2f16_nsz(<2 x i32> %c, <2 x half> %x, <2 x half> %y) { +; CI-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v2, -4.0, v2 +; CI-NEXT: v_sub_f32_e32 v3, -4.0, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_mov_b32_e32 v1, 0xc400 +; VI-NEXT: v_sub_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_sub_f16_e32 v2, -4.0, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 
v3, v2, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_pk_add_f16 v1, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_add_v2f16_nsz: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; 
GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq <2 x i32> %c, zeroinitializer + %add = fadd nsz <2 x half> %x, + %fneg = fneg <2 x half> %add + %select = select <2 x i1> %cmp, <2 x half> %fneg, <2 x half> + ret <2 x half> %select +} + define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-SAFE-LABEL: select_fneg_posk_src_sub_v2f16: ; CI-SAFE: ; %bb.0: @@ -4704,34 +4788,34 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < } define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %z) { -; CI-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16: -; CI-SAFE: ; %bb.0: -; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-SAFE-NEXT: v_mul_f32_e32 v3, 4.0, v3 -; CI-SAFE-NEXT: v_add_f32_e32 v3, v3, v5 -; CI-SAFE-NEXT: v_mul_f32_e32 v2, 4.0, v2 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-SAFE-NEXT: v_add_f32_e32 v2, v2, v4 -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-SAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-SAFE-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-SAFE-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-SAFE-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc -; CI-SAFE-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: select_fneg_posk_src_fmad_v2f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_mul_f32_e32 v3, 4.0, v3 +; CI-NEXT: v_add_f32_e32 v3, v3, v5 +; CI-NEXT: v_mul_f32_e32 v2, 4.0, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_add_f32_e32 v2, v2, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_or_b32_e32 v2, v2, v3 +; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc +; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16: ; VI-SAFE: ; %bb.0: @@ -4793,27 +4877,6 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; CI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: -; CI-NSZ: ; %bb.0: -; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
CI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v4, v4 -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v5, v5 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v4 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v5, v5 -; CI-NSZ-NEXT: v_mul_f32_e32 v2, -4.0, v2 -; CI-NSZ-NEXT: v_mul_f32_e32 v3, -4.0, v3 -; CI-NSZ-NEXT: v_sub_f32_e32 v2, v2, v4 -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NSZ-NEXT: v_sub_f32_e32 v3, v3, v5 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc -; CI-NSZ-NEXT: s_setpc_b64 s[30:31] -; ; VI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: ; VI-NSZ: ; %bb.0: ; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -4873,6 +4936,112 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ret <2 x half> %select } +define <2 x half> @select_fneg_posk_src_fmad_v2f16_nsz(<2 x i32> %c, <2 x half> %x, <2 x half> %z) { +; CI-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 +; CI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; CI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; CI-NEXT: v_cvt_f32_f16_e32 v5, v5 +; CI-NEXT: v_mul_f32_e32 v2, -4.0, v2 +; CI-NEXT: v_mul_f32_e32 v3, -4.0, v3 +; CI-NEXT: v_sub_f32_e32 v2, v2, v4 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_sub_f32_e32 v3, v3, v5 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc +; CI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; VI-NEXT: v_fma_f16 v1, v4, -4.0, -v1 +; VI-NEXT: v_fma_f16 v2, v2, -4.0, -v3 +; VI-NEXT: v_mov_b32_e32 v3, 0x4000 +; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_pk_fma_f16 v1, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_pk_fma_f16 v0, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 
0, v1 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_pk_fma_f16 v0, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_fmad_v2f16_nsz: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq <2 x i32> %c, zeroinitializer + %fmad = call nsz <2 x half> @llvm.fmuladd.v2f16(<2 x half> %x, <2 x half> , <2 x half> %z) + %fneg = fneg <2 x half> %fmad + %select = select <2 x i1> %cmp, <2 x half> %fneg, <2 x half> + ret <2 x half> %select +} + declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0 declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index 59a884c829312..760a126afa995 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -281,9 +281,9 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64(ptr addrspace(1) %out, ptr a ; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64 ; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] ; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 31 -; GCN-NEXT: v_and_b32_e32 v2, 3, v2 +; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], 31 ; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_and_b32_e32 v2, 3, v2 ; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_endpgm %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index f614f58d8e1dc..5944342b2642a 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1491,29 +1491,29 @@ define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[10:11] +; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s7, v1 -; GCN-NEXT: v_readfirstlane_b32 s6, v0 -; GCN-NEXT: v_readfirstlane_b32 s5, v3 -; GCN-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[4:5] -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN-NEXT: v_readfirstlane_b32 s5, v1 +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: v_readfirstlane_b32 s3, v3 +; GCN-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: s_cmp_lg_u64 s[6:7], 0 ; GCN-NEXT: s_cbranch_scc0 .LBB8_4 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_ashr_i32 s0, s5, 31 -; GCN-NEXT: s_add_u32 s2, s4, s0 -; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_addc_u32 s3, s5, s0 -; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[0:1] -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GCN-NEXT: s_sub_u32 s0, 0, s12 -; GCN-NEXT: s_subb_u32 s1, 0, s13 +; GCN-NEXT: s_ashr_i32 s6, s3, 31 +; GCN-NEXT: s_add_u32 s8, s2, s6 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: s_addc_u32 s9, s3, s6 +; GCN-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7] +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GCN-NEXT: s_sub_u32 s3, 0, s8 +; GCN-NEXT: s_subb_u32 s12, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -1522,155 +1522,148 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s2, v1 -; GCN-NEXT: v_readfirstlane_b32 s3, v0 -; GCN-NEXT: s_mul_i32 s5, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s15, s0, s3 -; GCN-NEXT: s_mul_i32 s14, s1, s3 -; GCN-NEXT: s_add_i32 s5, s15, s5 -; GCN-NEXT: s_add_i32 s5, s5, s14 -; GCN-NEXT: s_mul_i32 s16, s0, s3 -; GCN-NEXT: s_mul_hi_u32 s14, s3, s5 -; GCN-NEXT: s_mul_i32 s15, s3, s5 -; GCN-NEXT: s_mul_hi_u32 s3, s3, s16 -; GCN-NEXT: s_add_u32 s3, s3, s15 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_mul_i32 s11, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s15, s3, s10 +; GCN-NEXT: s_mul_i32 s14, s12, s10 +; GCN-NEXT: s_add_i32 s11, s15, s11 +; GCN-NEXT: s_add_i32 s11, s11, s14 +; GCN-NEXT: s_mul_i32 s16, s3, s10 +; GCN-NEXT: s_mul_i32 s15, s10, s11 +; GCN-NEXT: s_mul_hi_u32 s17, s10, s16 +; GCN-NEXT: s_mul_hi_u32 s14, s10, s11 +; 
GCN-NEXT: s_add_u32 s15, s17, s15 ; GCN-NEXT: s_addc_u32 s14, 0, s14 -; GCN-NEXT: s_mul_hi_u32 s17, s2, s16 -; GCN-NEXT: s_mul_i32 s16, s2, s16 -; GCN-NEXT: s_add_u32 s3, s3, s16 -; GCN-NEXT: s_mul_hi_u32 s15, s2, s5 -; GCN-NEXT: s_addc_u32 s3, s14, s17 -; GCN-NEXT: s_addc_u32 s14, s15, 0 -; GCN-NEXT: s_mul_i32 s5, s2, s5 -; GCN-NEXT: s_add_u32 s3, s3, s5 -; GCN-NEXT: s_addc_u32 s5, 0, s14 -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s3, v0 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_addc_u32 s2, s2, s5 -; GCN-NEXT: v_readfirstlane_b32 s5, v0 -; GCN-NEXT: s_mul_i32 s3, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s14, s0, s5 -; GCN-NEXT: s_add_i32 s3, s14, s3 -; GCN-NEXT: s_mul_i32 s1, s1, s5 -; GCN-NEXT: s_add_i32 s3, s3, s1 -; GCN-NEXT: s_mul_i32 s0, s0, s5 -; GCN-NEXT: s_mul_hi_u32 s14, s2, s0 -; GCN-NEXT: s_mul_i32 s15, s2, s0 -; GCN-NEXT: s_mul_i32 s17, s5, s3 -; GCN-NEXT: s_mul_hi_u32 s0, s5, s0 -; GCN-NEXT: s_mul_hi_u32 s16, s5, s3 -; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s5, 0, s16 -; GCN-NEXT: s_add_u32 s0, s0, s15 -; GCN-NEXT: s_mul_hi_u32 s1, s2, s3 -; GCN-NEXT: s_addc_u32 s0, s5, s14 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mul_i32 s3, s2, s3 -; GCN-NEXT: s_add_u32 s0, s0, s3 -; GCN-NEXT: s_addc_u32 s1, 0, s1 -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_addc_u32 s2, s2, s1 -; GCN-NEXT: s_ashr_i32 s14, s7, 31 -; GCN-NEXT: s_add_u32 s0, s6, s14 -; GCN-NEXT: s_mov_b32 s15, s14 -; GCN-NEXT: s_addc_u32 s1, s7, s14 -; GCN-NEXT: s_xor_b64 s[16:17], s[0:1], s[14:15] -; GCN-NEXT: v_readfirstlane_b32 s3, v0 -; GCN-NEXT: s_mul_i32 s1, s16, s2 -; GCN-NEXT: s_mul_hi_u32 s5, s16, s3 -; GCN-NEXT: s_mul_hi_u32 s0, s16, s2 -; GCN-NEXT: s_add_u32 s1, s5, s1 -; GCN-NEXT: s_addc_u32 s0, 0, s0 -; GCN-NEXT: s_mul_hi_u32 s7, s17, s3 -; GCN-NEXT: s_mul_i32 s3, s17, s3 -; GCN-NEXT: s_add_u32 s1, s1, s3 -; GCN-NEXT: s_mul_hi_u32 s5, s17, s2 -; GCN-NEXT: s_addc_u32 s0, s0, s7 -; GCN-NEXT: s_addc_u32 s1, s5, 0 -; GCN-NEXT: s_mul_i32 s2, s17, s2 -; GCN-NEXT: s_add_u32 s0, s0, s2 -; GCN-NEXT: s_addc_u32 s1, 0, s1 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_mul_hi_u32 s2, s12, s0 -; GCN-NEXT: s_add_i32 s1, s2, s1 -; GCN-NEXT: s_mul_i32 s2, s13, s0 -; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: s_add_i32 s5, s1, s2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_sub_i32 s1, s17, s5 -; GCN-NEXT: v_sub_co_u32_e32 v0, vcc, s16, v0 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_subb_u32 s7, s1, s13 -; GCN-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s12, v0 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: s_subb_u32 s15, s7, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s13 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v1 -; GCN-NEXT: s_cmp_eq_u32 s15, s13 -; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v3, s16 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[2:3] -; GCN-NEXT: s_subb_u32 s2, s7, s13 -; GCN-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v1 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: s_subb_u32 s2, s2, 0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: s_subb_u32 s0, s17, s5 -; GCN-NEXT: s_cmp_ge_u32 s0, s13 -; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; GCN-NEXT: 
s_cmp_eq_u32 s0, s13 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s1 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s14, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s14, v2 -; GCN-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NEXT: v_subrev_co_u32_e32 v0, vcc, s14, v0 -; GCN-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GCN-NEXT: s_mul_hi_u32 s18, s13, s16 +; GCN-NEXT: s_mul_i32 s16, s13, s16 +; GCN-NEXT: s_add_u32 s15, s15, s16 +; GCN-NEXT: s_mul_hi_u32 s17, s13, s11 +; GCN-NEXT: s_addc_u32 s14, s14, s18 +; GCN-NEXT: s_addc_u32 s15, s17, 0 +; GCN-NEXT: s_mul_i32 s11, s13, s11 +; GCN-NEXT: s_add_u32 s11, s14, s11 +; GCN-NEXT: s_addc_u32 s14, 0, s15 +; GCN-NEXT: s_add_u32 s15, s10, s11 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GCN-NEXT: s_addc_u32 s13, s13, s14 +; GCN-NEXT: s_mul_i32 s10, s3, s13 +; GCN-NEXT: s_mul_hi_u32 s11, s3, s15 +; GCN-NEXT: s_add_i32 s10, s11, s10 +; GCN-NEXT: s_mul_i32 s12, s12, s15 +; GCN-NEXT: s_add_i32 s10, s10, s12 +; GCN-NEXT: s_mul_i32 s3, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s12, s13, s3 +; GCN-NEXT: s_mul_i32 s14, s13, s3 +; GCN-NEXT: s_mul_i32 s17, s15, s10 +; GCN-NEXT: s_mul_hi_u32 s3, s15, s3 +; GCN-NEXT: s_mul_hi_u32 s16, s15, s10 +; GCN-NEXT: s_add_u32 s3, s3, s17 +; GCN-NEXT: s_addc_u32 s16, 0, s16 +; GCN-NEXT: s_add_u32 s3, s3, s14 +; GCN-NEXT: s_mul_hi_u32 s11, s13, s10 +; GCN-NEXT: s_addc_u32 s3, s16, s12 +; GCN-NEXT: s_addc_u32 s11, s11, 0 +; GCN-NEXT: s_mul_i32 s10, s13, s10 +; GCN-NEXT: s_add_u32 s3, s3, s10 +; GCN-NEXT: s_addc_u32 s12, 0, s11 +; GCN-NEXT: s_add_u32 s3, s15, s3 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[10:11], 0 +; GCN-NEXT: s_addc_u32 s14, s13, s12 +; GCN-NEXT: s_ashr_i32 s10, s5, 31 +; GCN-NEXT: s_add_u32 s12, s4, s10 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: s_addc_u32 s13, s5, s10 +; GCN-NEXT: s_xor_b64 s[12:13], s[12:13], s[10:11] +; GCN-NEXT: s_mul_i32 s15, s12, s14 +; GCN-NEXT: s_mul_hi_u32 s16, s12, s3 +; GCN-NEXT: s_mul_hi_u32 s5, s12, s14 +; GCN-NEXT: s_add_u32 s15, s16, s15 +; GCN-NEXT: s_addc_u32 s5, 0, s5 +; GCN-NEXT: s_mul_hi_u32 s17, s13, s3 +; GCN-NEXT: s_mul_i32 s3, s13, s3 +; GCN-NEXT: s_add_u32 s3, s15, s3 +; GCN-NEXT: s_mul_hi_u32 s16, s13, s14 +; GCN-NEXT: s_addc_u32 s3, s5, s17 +; GCN-NEXT: s_addc_u32 s5, s16, 0 +; GCN-NEXT: s_mul_i32 s14, s13, s14 +; GCN-NEXT: s_add_u32 s3, s3, s14 +; GCN-NEXT: s_addc_u32 s5, 0, s5 +; GCN-NEXT: s_mul_i32 s5, s8, s5 +; GCN-NEXT: s_mul_hi_u32 s14, s8, s3 +; GCN-NEXT: s_add_i32 s5, s14, s5 +; GCN-NEXT: s_mul_i32 s14, s9, s3 +; GCN-NEXT: s_add_i32 s5, s5, s14 +; GCN-NEXT: s_sub_i32 s16, s13, s5 +; GCN-NEXT: s_mul_i32 s3, s8, s3 +; GCN-NEXT: s_sub_u32 s3, s12, s3 +; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GCN-NEXT: s_subb_u32 s12, s16, s9 +; GCN-NEXT: s_sub_u32 s18, s3, s8 +; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_subb_u32 s19, s12, 0 +; GCN-NEXT: s_cmp_ge_u32 s19, s9 +; GCN-NEXT: s_cselect_b32 s20, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s18, s8 +; GCN-NEXT: s_cselect_b32 s21, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s19, s9 +; GCN-NEXT: s_cselect_b32 s20, s21, s20 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_subb_u32 s12, s12, s9 +; GCN-NEXT: s_sub_u32 s21, s18, s8 +; 
GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_subb_u32 s12, s12, 0 +; GCN-NEXT: s_cmp_lg_u32 s20, 0 +; GCN-NEXT: s_cselect_b32 s16, s21, s18 +; GCN-NEXT: s_cselect_b32 s12, s12, s19 +; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GCN-NEXT: s_subb_u32 s5, s13, s5 +; GCN-NEXT: s_cmp_ge_u32 s5, s9 +; GCN-NEXT: s_cselect_b32 s13, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s3, s8 +; GCN-NEXT: s_cselect_b32 s8, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s5, s9 +; GCN-NEXT: s_cselect_b32 s8, s8, s13 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s9, s12, s5 +; GCN-NEXT: s_cselect_b32 s8, s16, s3 +; GCN-NEXT: s_xor_b64 s[8:9], s[8:9], s[10:11] +; GCN-NEXT: s_sub_u32 s8, s8, s10 +; GCN-NEXT: s_subb_u32 s9, s9, s10 ; GCN-NEXT: s_cbranch_execnz .LBB8_3 ; GCN-NEXT: .LBB8_2: -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GCN-NEXT: s_sub_i32 s0, 0, s4 -; GCN-NEXT: s_mov_b32 s1, 0 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_sub_i32 s3, 0, s2 +; GCN-NEXT: s_mov_b32 s9, 0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s2, v0 -; GCN-NEXT: s_mul_i32 s0, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s0, s2, s0 -; GCN-NEXT: s_add_i32 s2, s2, s0 -; GCN-NEXT: s_mul_hi_u32 s0, s6, s2 -; GCN-NEXT: s_mul_i32 s0, s0, s4 -; GCN-NEXT: s_sub_i32 s0, s6, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s4 -; GCN-NEXT: s_cmp_ge_u32 s0, s4 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s4 -; GCN-NEXT: s_cmp_ge_u32 s0, s4 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_readfirstlane_b32 s5, v0 +; GCN-NEXT: s_mul_i32 s3, s3, s5 +; GCN-NEXT: s_mul_hi_u32 s3, s5, s3 +; GCN-NEXT: s_add_i32 s5, s5, s3 +; GCN-NEXT: s_mul_hi_u32 s3, s4, s5 +; GCN-NEXT: s_mul_i32 s3, s3, s2 +; GCN-NEXT: s_sub_i32 s3, s4, s3 +; GCN-NEXT: s_sub_i32 s4, s3, s2 +; GCN-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-NEXT: s_cselect_b32 s3, s4, s3 +; GCN-NEXT: s_sub_i32 s4, s3, s2 +; GCN-NEXT: s_cmp_ge_u32 s3, s2 +; GCN-NEXT: s_cselect_b32 s8, s4, s3 ; GCN-NEXT: .LBB8_3: +; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GCN-NEXT: s_endpgm ; GCN-NEXT: .LBB8_4: -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GCN-NEXT: s_branch .LBB8_2 ; ; TAHITI-LABEL: srem_i64: @@ -1732,7 +1725,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TAHITI-NEXT: v_mul_lo_u32 v8, v8, v5 ; TAHITI-NEXT: v_mul_lo_u32 v7, v7, v5 ; TAHITI-NEXT: v_add_i32_e32 v9, vcc, v9, v10 -; TAHITI-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; TAHITI-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; TAHITI-NEXT: v_mul_lo_u32 v11, v5, v8 ; TAHITI-NEXT: v_mul_hi_u32 v12, v5, v7 ; TAHITI-NEXT: v_mul_hi_u32 v13, v5, v8 @@ -1819,7 +1812,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TAHITI-NEXT: v_mul_hi_u32 v1, v0, v1 ; TAHITI-NEXT: v_mul_lo_u32 v1, v1, v2 ; TAHITI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; TAHITI-NEXT: v_subrev_i32_e32 v1, vcc, v2, v0 +; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 ; TAHITI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; TAHITI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 @@ -1836,150 +1829,175 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; ; TONGA-LABEL: srem_i64: 
; TONGA: ; %bb.0: -; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; TONGA-NEXT: v_mov_b32_e32 v4, 0 +; TONGA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) -; TONGA-NEXT: v_mov_b32_e32 v0, s6 -; TONGA-NEXT: v_mov_b32_e32 v1, s7 +; TONGA-NEXT: v_mov_b32_e32 v0, s2 +; TONGA-NEXT: v_mov_b32_e32 v1, s3 ; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_or_b32_e32 v5, v1, v3 -; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; TONGA-NEXT: s_cbranch_vccz .LBB8_4 +; TONGA-NEXT: v_readfirstlane_b32 s5, v1 +; TONGA-NEXT: v_readfirstlane_b32 s4, v0 +; TONGA-NEXT: v_readfirstlane_b32 s3, v3 +; TONGA-NEXT: v_readfirstlane_b32 s2, v2 +; TONGA-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] +; TONGA-NEXT: s_mov_b32 s6, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[6:7], 0 +; TONGA-NEXT: s_cbranch_scc0 .LBB8_3 ; TONGA-NEXT: ; %bb.1: -; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v2, v4 -; TONGA-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc -; TONGA-NEXT: v_xor_b32_e32 v9, v5, v4 -; TONGA-NEXT: v_xor_b32_e32 v10, v3, v4 -; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v9 -; TONGA-NEXT: v_cvt_f32_u32_e32 v4, v10 -; TONGA-NEXT: v_sub_u32_e32 v11, vcc, 0, v9 -; TONGA-NEXT: v_subb_u32_e32 v12, vcc, 0, v10, vcc -; TONGA-NEXT: v_madmk_f32 v3, v4, 0x4f800000, v3 -; TONGA-NEXT: v_rcp_f32_e32 v3, v3 -; TONGA-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; TONGA-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; TONGA-NEXT: v_trunc_f32_e32 v4, v4 -; TONGA-NEXT: v_madmk_f32 v3, v4, 0xcf800000, v3 -; TONGA-NEXT: v_cvt_u32_f32_e32 v7, v4 -; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v3 -; TONGA-NEXT: v_mul_lo_u32 v5, v11, v7 -; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v11, v8, 0 -; TONGA-NEXT: v_mul_lo_u32 v6, v12, v8 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v5 -; TONGA-NEXT: v_add_u32_e32 v6, vcc, v4, v6 -; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v8, v6, 0 -; TONGA-NEXT: v_mul_hi_u32 v13, v8, v3 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v4 -; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v3, 0 -; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v5, vcc -; TONGA-NEXT: v_mad_u64_u32 v[5:6], s[0:1], v7, v6, 0 -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v13, v3 -; TONGA-NEXT: v_addc_u32_e32 v3, vcc, v14, v4, vcc -; TONGA-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v5 -; TONGA-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v8, v3 -; TONGA-NEXT: v_addc_u32_e32 v14, vcc, v7, v4, vcc -; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v11, v13, 0 -; TONGA-NEXT: v_mul_lo_u32 v7, v11, v14 -; TONGA-NEXT: v_mul_lo_u32 v8, v12, v13 -; TONGA-NEXT: v_mul_hi_u32 v11, v13, v3 -; TONGA-NEXT: v_mad_u64_u32 v[5:6], s[0:1], v14, v3, 0 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v4, v8 -; TONGA-NEXT: v_mad_u64_u32 v[7:8], s[0:1], v13, v4, 0 -; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v14, v4, 0 -; TONGA-NEXT: v_add_u32_e32 v7, vcc, v11, v7 -; TONGA-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; TONGA-NEXT: v_addc_u32_e32 v5, vcc, v8, v6, vcc -; TONGA-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v5, v3 -; TONGA-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v13, v3 -; TONGA-NEXT: v_addc_u32_e32 v6, vcc, v14, v4, vcc -; TONGA-NEXT: v_ashrrev_i32_e32 v7, 31, v1 -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v0, v7 -; TONGA-NEXT: v_xor_b32_e32 v8, v3, v7 -; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v8, 
v6, 0 -; TONGA-NEXT: v_mul_hi_u32 v11, v8, v5 -; TONGA-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v7 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v3 -; TONGA-NEXT: v_addc_u32_e32 v12, vcc, 0, v4, vcc -; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v1, v5, 0 -; TONGA-NEXT: v_mad_u64_u32 v[5:6], s[0:1], v1, v6, 0 -; TONGA-NEXT: v_add_u32_e32 v3, vcc, v11, v3 -; TONGA-NEXT: v_addc_u32_e32 v3, vcc, v12, v4, vcc -; TONGA-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc -; TONGA-NEXT: v_add_u32_e32 v5, vcc, v3, v5 -; TONGA-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; TONGA-NEXT: v_mul_lo_u32 v6, v9, v3 -; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v9, v5, 0 -; TONGA-NEXT: v_mul_lo_u32 v5, v10, v5 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; TONGA-NEXT: v_add_u32_e32 v4, vcc, v5, v4 -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, v1, v4 -; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v8, v3 -; TONGA-NEXT: v_subb_u32_e64 v5, s[0:1], v5, v10, vcc -; TONGA-NEXT: v_sub_u32_e64 v6, s[0:1], v3, v9 -; TONGA-NEXT: v_subbrev_u32_e64 v8, s[2:3], 0, v5, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v8, v10 -; TONGA-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v6, v9 -; TONGA-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v8, v10 -; TONGA-NEXT: v_subb_u32_e64 v5, s[0:1], v5, v10, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[2:3] -; TONGA-NEXT: v_sub_u32_e64 v12, s[0:1], v6, v9 -; TONGA-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc -; TONGA-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 -; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 -; TONGA-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 -; TONGA-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v1, v10 -; TONGA-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[0:1] -; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; TONGA-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; TONGA-NEXT: v_xor_b32_e32 v3, v3, v7 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v7 -; TONGA-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 -; TONGA-NEXT: v_subb_u32_e32 v4, vcc, v1, v7, vcc -; TONGA-NEXT: s_cbranch_execnz .LBB8_3 +; TONGA-NEXT: s_ashr_i32 s6, s3, 31 +; TONGA-NEXT: s_add_u32 s8, s2, s6 +; TONGA-NEXT: s_mov_b32 s7, s6 +; TONGA-NEXT: s_addc_u32 s9, s3, s6 +; TONGA-NEXT: s_xor_b64 s[6:7], s[8:9], s[6:7] +; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s6 +; TONGA-NEXT: v_cvt_f32_u32_e32 v1, s7 +; TONGA-NEXT: s_sub_u32 s3, 0, s6 +; TONGA-NEXT: s_subb_u32 s10, 0, s7 +; TONGA-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 +; TONGA-NEXT: v_rcp_f32_e32 v0, v0 +; TONGA-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; TONGA-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; TONGA-NEXT: v_trunc_f32_e32 v1, v1 +; TONGA-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 +; TONGA-NEXT: v_cvt_u32_f32_e32 v4, v1 +; TONGA-NEXT: v_cvt_u32_f32_e32 v5, v0 +; TONGA-NEXT: v_mul_lo_u32 v2, s3, v4 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s3, v5, 0 +; TONGA-NEXT: v_mul_lo_u32 v3, s10, v5 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; TONGA-NEXT: v_add_u32_e32 v3, vcc, v1, v3 +; TONGA-NEXT: v_mul_hi_u32 v6, v5, v0 +; TONGA-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v5, v3, 0 +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v6, v1 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v0, 0 +; TONGA-NEXT: v_addc_u32_e32 v7, vcc, 0, v2, vcc +; TONGA-NEXT: 
v_mad_u64_u32 v[2:3], s[8:9], v4, v3, 0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; TONGA-NEXT: v_addc_u32_e32 v0, vcc, v7, v1, vcc +; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; TONGA-NEXT: v_add_u32_e32 v6, vcc, v5, v0 +; TONGA-NEXT: v_addc_u32_e32 v7, vcc, v4, v1, vcc +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s3, v6, 0 +; TONGA-NEXT: v_mul_lo_u32 v4, s3, v7 +; TONGA-NEXT: v_mul_lo_u32 v5, s10, v6 +; TONGA-NEXT: v_mul_hi_u32 v8, v6, v0 +; TONGA-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v7, v0, 0 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v4, v1 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v5, v1 +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[8:9], v6, v1, 0 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v7, v1, 0 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v8, v4 +; TONGA-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v4, v2 +; TONGA-NEXT: v_addc_u32_e32 v2, vcc, v5, v3, vcc +; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; TONGA-NEXT: s_ashr_i32 s10, s5, 31 +; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; TONGA-NEXT: s_add_u32 s8, s4, s10 +; TONGA-NEXT: v_add_u32_e32 v2, vcc, v6, v0 +; TONGA-NEXT: s_mov_b32 s11, s10 +; TONGA-NEXT: s_addc_u32 s9, s5, s10 +; TONGA-NEXT: v_addc_u32_e32 v3, vcc, v7, v1, vcc +; TONGA-NEXT: s_xor_b64 s[12:13], s[8:9], s[10:11] +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s12, v3, 0 +; TONGA-NEXT: v_mul_hi_u32 v4, s12, v2 +; TONGA-NEXT: v_readfirstlane_b32 s3, v1 +; TONGA-NEXT: v_readfirstlane_b32 s5, v0 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s13, v3, 0 +; TONGA-NEXT: v_mad_u64_u32 v[2:3], s[8:9], s13, v2, 0 +; TONGA-NEXT: v_readfirstlane_b32 s14, v4 +; TONGA-NEXT: s_add_u32 s5, s14, s5 +; TONGA-NEXT: s_addc_u32 s3, 0, s3 +; TONGA-NEXT: v_readfirstlane_b32 s14, v2 +; TONGA-NEXT: v_readfirstlane_b32 s9, v3 +; TONGA-NEXT: s_add_u32 s5, s5, s14 +; TONGA-NEXT: v_readfirstlane_b32 s8, v1 +; TONGA-NEXT: s_addc_u32 s3, s3, s9 +; TONGA-NEXT: s_addc_u32 s5, s8, 0 +; TONGA-NEXT: v_readfirstlane_b32 s8, v0 +; TONGA-NEXT: s_add_u32 s3, s3, s8 +; TONGA-NEXT: v_mov_b32_e32 v0, s3 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s6, v0, 0 +; TONGA-NEXT: s_addc_u32 s5, 0, s5 +; TONGA-NEXT: s_mul_i32 s5, s6, s5 +; TONGA-NEXT: v_readfirstlane_b32 s14, v1 +; TONGA-NEXT: s_add_i32 s5, s14, s5 +; TONGA-NEXT: s_mul_i32 s3, s7, s3 +; TONGA-NEXT: s_add_i32 s5, s5, s3 +; TONGA-NEXT: s_sub_i32 s3, s13, s5 +; TONGA-NEXT: v_readfirstlane_b32 s14, v0 +; TONGA-NEXT: s_sub_u32 s12, s12, s14 +; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 +; TONGA-NEXT: s_subb_u32 s3, s3, s7 +; TONGA-NEXT: s_sub_u32 s18, s12, s6 +; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_subb_u32 s19, s3, 0 +; TONGA-NEXT: s_cmp_ge_u32 s19, s7 +; TONGA-NEXT: s_cselect_b32 s20, -1, 0 +; TONGA-NEXT: s_cmp_ge_u32 s18, s6 +; TONGA-NEXT: s_cselect_b32 s21, -1, 0 +; TONGA-NEXT: s_cmp_eq_u32 s19, s7 +; TONGA-NEXT: s_cselect_b32 s20, s21, s20 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_subb_u32 s3, s3, s7 +; TONGA-NEXT: s_sub_u32 s21, s18, s6 +; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_subb_u32 s3, s3, 0 +; TONGA-NEXT: s_cmp_lg_u32 s20, 0 +; TONGA-NEXT: s_cselect_b32 s16, s21, s18 +; TONGA-NEXT: s_cselect_b32 s3, s3, s19 +; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 +; TONGA-NEXT: s_subb_u32 s5, s13, s5 +; TONGA-NEXT: 
s_cmp_ge_u32 s5, s7 +; TONGA-NEXT: s_cselect_b32 s13, -1, 0 +; TONGA-NEXT: s_cmp_ge_u32 s12, s6 +; TONGA-NEXT: s_cselect_b32 s6, -1, 0 +; TONGA-NEXT: s_cmp_eq_u32 s5, s7 +; TONGA-NEXT: s_cselect_b32 s6, s6, s13 +; TONGA-NEXT: s_cmp_lg_u32 s6, 0 +; TONGA-NEXT: s_cselect_b32 s7, s3, s5 +; TONGA-NEXT: s_cselect_b32 s6, s16, s12 +; TONGA-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; TONGA-NEXT: s_sub_u32 s6, s6, s10 +; TONGA-NEXT: s_subb_u32 s7, s7, s10 +; TONGA-NEXT: s_cbranch_execnz .LBB8_4 ; TONGA-NEXT: .LBB8_2: -; TONGA-NEXT: v_cvt_f32_u32_e32 v1, v2 -; TONGA-NEXT: v_sub_u32_e32 v3, vcc, 0, v2 -; TONGA-NEXT: v_mov_b32_e32 v4, 0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; TONGA-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1 -; TONGA-NEXT: v_mul_lo_u32 v3, v3, v1 -; TONGA-NEXT: v_mul_hi_u32 v3, v1, v3 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s2 +; TONGA-NEXT: s_sub_i32 s3, 0, s2 +; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0 +; TONGA-NEXT: v_mul_lo_u32 v1, s3, v0 ; TONGA-NEXT: v_mul_hi_u32 v1, v0, v1 -; TONGA-NEXT: v_mul_lo_u32 v1, v1, v2 -; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 -; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v2, v0 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; TONGA-NEXT: v_mul_hi_u32 v0, s4, v0 +; TONGA-NEXT: v_mul_lo_u32 v0, v0, s2 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s2, v0 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 ; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v0, v2 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; TONGA-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s2, v0 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; TONGA-NEXT: v_mov_b32_e32 v1, 0 +; TONGA-NEXT: s_branch .LBB8_5 ; TONGA-NEXT: .LBB8_3: -; TONGA-NEXT: v_mov_b32_e32 v0, s4 -; TONGA-NEXT: v_mov_b32_e32 v1, s5 -; TONGA-NEXT: flat_store_dwordx2 v[0:1], v[3:4] -; TONGA-NEXT: s_endpgm -; TONGA-NEXT: .LBB8_4: -; TONGA-NEXT: ; implicit-def: $vgpr3_vgpr4 +; TONGA-NEXT: ; implicit-def: $sgpr6_sgpr7 ; TONGA-NEXT: s_branch .LBB8_2 +; TONGA-NEXT: .LBB8_4: +; TONGA-NEXT: v_mov_b32_e32 v0, s6 +; TONGA-NEXT: v_mov_b32_e32 v1, s7 +; TONGA-NEXT: .LBB8_5: +; TONGA-NEXT: v_mov_b32_e32 v2, s0 +; TONGA-NEXT: v_mov_b32_e32 v3, s1 +; TONGA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; TONGA-NEXT: s_endpgm ; ; EG-LABEL: srem_i64: ; EG: ; %bb.0: @@ -2684,35 +2702,35 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v2i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[10:11] offset:16 -; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[10:11] +; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16 +; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_readfirstlane_b32 s11, v1 -; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: v_readfirstlane_b32 s8, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s13, v5 -; GCN-NEXT: v_readfirstlane_b32 s12, v4 -; 
GCN-NEXT: s_or_b64 s[0:1], s[12:13], s[10:11] -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: v_readfirstlane_b32 s5, v3 -; GCN-NEXT: v_readfirstlane_b32 s4, v2 -; GCN-NEXT: v_readfirstlane_b32 s7, v7 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: v_readfirstlane_b32 s6, v6 -; GCN-NEXT: s_cbranch_scc0 .LBB10_7 +; GCN-NEXT: v_readfirstlane_b32 s11, v5 +; GCN-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[8:9] +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_readfirstlane_b32 s3, v3 +; GCN-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-NEXT: v_readfirstlane_b32 s5, v7 +; GCN-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GCN-NEXT: v_readfirstlane_b32 s4, v6 +; GCN-NEXT: s_cbranch_scc0 .LBB10_6 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_ashr_i32 s0, s11, 31 -; GCN-NEXT: s_add_u32 s2, s10, s0 -; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_addc_u32 s3, s11, s0 -; GCN-NEXT: s_xor_b64 s[16:17], s[2:3], s[0:1] -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s16 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s17 -; GCN-NEXT: s_sub_u32 s0, 0, s16 -; GCN-NEXT: s_subb_u32 s1, 0, s17 +; GCN-NEXT: s_ashr_i32 s6, s9, 31 +; GCN-NEXT: s_add_u32 s12, s8, s6 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: s_addc_u32 s13, s9, s6 +; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[6:7] +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GCN-NEXT: s_sub_u32 s9, 0, s6 +; GCN-NEXT: s_subb_u32 s16, 0, s7 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -2721,321 +2739,312 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s2, v1 -; GCN-NEXT: v_readfirstlane_b32 s3, v0 -; GCN-NEXT: s_mul_i32 s11, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s19, s0, s3 -; GCN-NEXT: s_mul_i32 s18, s1, s3 -; GCN-NEXT: s_add_i32 s11, s19, s11 -; GCN-NEXT: s_add_i32 s11, s11, s18 -; GCN-NEXT: s_mul_i32 s20, s0, s3 -; GCN-NEXT: s_mul_hi_u32 s18, s3, s11 -; GCN-NEXT: s_mul_i32 s19, s3, s11 -; GCN-NEXT: s_mul_hi_u32 s3, s3, s20 -; GCN-NEXT: s_add_u32 s3, s3, s19 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_mul_i32 s15, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s19, s9, s14 +; GCN-NEXT: s_mul_i32 s18, s16, s14 +; GCN-NEXT: s_add_i32 s15, s19, s15 +; GCN-NEXT: s_add_i32 s15, s15, s18 +; GCN-NEXT: s_mul_i32 s20, s9, s14 +; GCN-NEXT: s_mul_i32 s19, s14, s15 +; GCN-NEXT: s_mul_hi_u32 s21, s14, s20 +; GCN-NEXT: s_mul_hi_u32 s18, s14, s15 +; GCN-NEXT: s_add_u32 s19, s21, s19 ; GCN-NEXT: s_addc_u32 s18, 0, s18 -; GCN-NEXT: s_mul_hi_u32 s21, s2, s20 -; GCN-NEXT: s_mul_i32 s20, s2, s20 -; GCN-NEXT: s_add_u32 s3, s3, s20 -; GCN-NEXT: s_mul_hi_u32 s19, s2, s11 -; GCN-NEXT: s_addc_u32 s3, s18, s21 -; GCN-NEXT: s_addc_u32 s18, s19, 0 -; GCN-NEXT: s_mul_i32 s11, s2, s11 -; GCN-NEXT: s_add_u32 s3, s3, s11 -; GCN-NEXT: s_addc_u32 s11, 0, s18 -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s3, v0 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_addc_u32 s2, s2, s11 -; GCN-NEXT: v_readfirstlane_b32 s11, v0 -; GCN-NEXT: s_mul_i32 s3, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s18, s0, s11 -; GCN-NEXT: s_add_i32 s3, s18, s3 -; GCN-NEXT: s_mul_i32 s1, s1, s11 -; GCN-NEXT: s_add_i32 s3, s3, s1 -; GCN-NEXT: s_mul_i32 s0, s0, s11 -; GCN-NEXT: s_mul_hi_u32 s18, s2, s0 -; GCN-NEXT: s_mul_i32 s19, s2, s0 -; GCN-NEXT: s_mul_i32 s21, s11, s3 -; GCN-NEXT: s_mul_hi_u32 s0, s11, s0 -; GCN-NEXT: s_mul_hi_u32 s20, s11, s3 
-; GCN-NEXT: s_add_u32 s0, s0, s21 -; GCN-NEXT: s_addc_u32 s11, 0, s20 -; GCN-NEXT: s_add_u32 s0, s0, s19 -; GCN-NEXT: s_mul_hi_u32 s1, s2, s3 -; GCN-NEXT: s_addc_u32 s0, s11, s18 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mul_i32 s3, s2, s3 -; GCN-NEXT: s_add_u32 s0, s0, s3 -; GCN-NEXT: s_addc_u32 s1, 0, s1 -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_addc_u32 s2, s2, s1 -; GCN-NEXT: s_ashr_i32 s18, s13, 31 -; GCN-NEXT: s_add_u32 s0, s12, s18 -; GCN-NEXT: s_mov_b32 s19, s18 -; GCN-NEXT: s_addc_u32 s1, s13, s18 -; GCN-NEXT: s_xor_b64 s[20:21], s[0:1], s[18:19] -; GCN-NEXT: v_readfirstlane_b32 s3, v0 -; GCN-NEXT: s_mul_i32 s1, s20, s2 -; GCN-NEXT: s_mul_hi_u32 s11, s20, s3 -; GCN-NEXT: s_mul_hi_u32 s0, s20, s2 -; GCN-NEXT: s_add_u32 s1, s11, s1 -; GCN-NEXT: s_addc_u32 s0, 0, s0 -; GCN-NEXT: s_mul_hi_u32 s13, s21, s3 -; GCN-NEXT: s_mul_i32 s3, s21, s3 -; GCN-NEXT: s_add_u32 s1, s1, s3 -; GCN-NEXT: s_mul_hi_u32 s11, s21, s2 -; GCN-NEXT: s_addc_u32 s0, s0, s13 -; GCN-NEXT: s_addc_u32 s1, s11, 0 -; GCN-NEXT: s_mul_i32 s2, s21, s2 -; GCN-NEXT: s_add_u32 s0, s0, s2 -; GCN-NEXT: s_addc_u32 s1, 0, s1 -; GCN-NEXT: s_mul_i32 s1, s16, s1 -; GCN-NEXT: s_mul_hi_u32 s2, s16, s0 -; GCN-NEXT: s_add_i32 s1, s2, s1 -; GCN-NEXT: s_mul_i32 s2, s17, s0 -; GCN-NEXT: s_mul_i32 s0, s16, s0 -; GCN-NEXT: s_add_i32 s11, s1, s2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_sub_i32 s1, s21, s11 -; GCN-NEXT: v_sub_co_u32_e32 v0, vcc, s20, v0 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_subb_u32 s13, s1, s17 -; GCN-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s16, v0 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: s_subb_u32 s19, s13, 0 -; GCN-NEXT: s_cmp_ge_u32 s19, s17 -; GCN-NEXT: s_cselect_b32 s20, -1, 0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v1 -; GCN-NEXT: s_cmp_eq_u32 s19, s17 -; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v3, s20 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[2:3] -; GCN-NEXT: s_subb_u32 s2, s13, s17 -; GCN-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s16, v1 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: s_subb_u32 s2, s2, 0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v2, s19 -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: s_subb_u32 s0, s21, s11 -; GCN-NEXT: s_cmp_ge_u32 s0, s17 -; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 -; GCN-NEXT: s_cmp_eq_u32 s0, s17 -; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_mov_b32_e32 v4, s1 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s18, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s18, v2 -; GCN-NEXT: v_mov_b32_e32 v2, s18 -; GCN-NEXT: v_subrev_co_u32_e32 v0, vcc, s18, v0 -; GCN-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GCN-NEXT: s_mul_hi_u32 s22, s17, s20 +; GCN-NEXT: s_mul_i32 s20, s17, s20 +; GCN-NEXT: s_add_u32 s19, s19, s20 +; GCN-NEXT: s_mul_hi_u32 s21, s17, s15 +; GCN-NEXT: s_addc_u32 s18, s18, s22 +; GCN-NEXT: s_addc_u32 s19, s21, 0 +; GCN-NEXT: s_mul_i32 s15, s17, s15 +; GCN-NEXT: s_add_u32 s15, s18, s15 +; GCN-NEXT: s_addc_u32 s18, 0, s19 +; GCN-NEXT: s_add_u32 
s19, s14, s15 +; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GCN-NEXT: s_addc_u32 s17, s17, s18 +; GCN-NEXT: s_mul_i32 s14, s9, s17 +; GCN-NEXT: s_mul_hi_u32 s15, s9, s19 +; GCN-NEXT: s_add_i32 s14, s15, s14 +; GCN-NEXT: s_mul_i32 s16, s16, s19 +; GCN-NEXT: s_add_i32 s14, s14, s16 +; GCN-NEXT: s_mul_i32 s9, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s16, s17, s9 +; GCN-NEXT: s_mul_i32 s18, s17, s9 +; GCN-NEXT: s_mul_i32 s21, s19, s14 +; GCN-NEXT: s_mul_hi_u32 s9, s19, s9 +; GCN-NEXT: s_mul_hi_u32 s20, s19, s14 +; GCN-NEXT: s_add_u32 s9, s9, s21 +; GCN-NEXT: s_addc_u32 s20, 0, s20 +; GCN-NEXT: s_add_u32 s9, s9, s18 +; GCN-NEXT: s_mul_hi_u32 s15, s17, s14 +; GCN-NEXT: s_addc_u32 s9, s20, s16 +; GCN-NEXT: s_addc_u32 s15, s15, 0 +; GCN-NEXT: s_mul_i32 s14, s17, s14 +; GCN-NEXT: s_add_u32 s9, s9, s14 +; GCN-NEXT: s_addc_u32 s16, 0, s15 +; GCN-NEXT: s_add_u32 s9, s19, s9 +; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[14:15], 0 +; GCN-NEXT: s_addc_u32 s18, s17, s16 +; GCN-NEXT: s_ashr_i32 s14, s11, 31 +; GCN-NEXT: s_add_u32 s16, s10, s14 +; GCN-NEXT: s_mov_b32 s15, s14 +; GCN-NEXT: s_addc_u32 s17, s11, s14 +; GCN-NEXT: s_xor_b64 s[16:17], s[16:17], s[14:15] +; GCN-NEXT: s_mul_i32 s19, s16, s18 +; GCN-NEXT: s_mul_hi_u32 s20, s16, s9 +; GCN-NEXT: s_mul_hi_u32 s11, s16, s18 +; GCN-NEXT: s_add_u32 s19, s20, s19 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: s_mul_hi_u32 s21, s17, s9 +; GCN-NEXT: s_mul_i32 s9, s17, s9 +; GCN-NEXT: s_add_u32 s9, s19, s9 +; GCN-NEXT: s_mul_hi_u32 s20, s17, s18 +; GCN-NEXT: s_addc_u32 s9, s11, s21 +; GCN-NEXT: s_addc_u32 s11, s20, 0 +; GCN-NEXT: s_mul_i32 s18, s17, s18 +; GCN-NEXT: s_add_u32 s9, s9, s18 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: s_mul_i32 s11, s6, s11 +; GCN-NEXT: s_mul_hi_u32 s18, s6, s9 +; GCN-NEXT: s_add_i32 s11, s18, s11 +; GCN-NEXT: s_mul_i32 s18, s7, s9 +; GCN-NEXT: s_add_i32 s11, s11, s18 +; GCN-NEXT: s_sub_i32 s20, s17, s11 +; GCN-NEXT: s_mul_i32 s9, s6, s9 +; GCN-NEXT: s_sub_u32 s9, s16, s9 +; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_subb_u32 s16, s20, s7 +; GCN-NEXT: s_sub_u32 s22, s9, s6 +; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_subb_u32 s23, s16, 0 +; GCN-NEXT: s_cmp_ge_u32 s23, s7 +; GCN-NEXT: s_cselect_b32 s24, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s22, s6 +; GCN-NEXT: s_cselect_b32 s25, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s23, s7 +; GCN-NEXT: s_cselect_b32 s24, s25, s24 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_subb_u32 s16, s16, s7 +; GCN-NEXT: s_sub_u32 s25, s22, s6 +; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_subb_u32 s16, s16, 0 +; GCN-NEXT: s_cmp_lg_u32 s24, 0 +; GCN-NEXT: s_cselect_b32 s20, s25, s22 +; GCN-NEXT: s_cselect_b32 s16, s16, s23 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_subb_u32 s11, s17, s11 +; GCN-NEXT: s_cmp_ge_u32 s11, s7 +; GCN-NEXT: s_cselect_b32 s17, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s9, s6 +; GCN-NEXT: s_cselect_b32 s6, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s11, s7 +; GCN-NEXT: s_cselect_b32 s6, s6, s17 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cselect_b32 s7, s16, s11 +; GCN-NEXT: s_cselect_b32 s6, s20, s9 +; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[14:15] +; GCN-NEXT: s_sub_u32 s6, s6, s14 +; GCN-NEXT: s_subb_u32 s7, s7, s14 ; GCN-NEXT: s_cbranch_execnz .LBB10_3 ; GCN-NEXT: .LBB10_2: -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GCN-NEXT: s_sub_i32 s0, 0, s10 -; GCN-NEXT: s_mov_b32 s1, 0 +; 
GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GCN-NEXT: s_sub_i32 s6, 0, s8 +; GCN-NEXT: s_mov_b32 s7, 0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_readfirstlane_b32 s2, v0 -; GCN-NEXT: s_mul_i32 s0, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s0, s2, s0 -; GCN-NEXT: s_add_i32 s2, s2, s0 -; GCN-NEXT: s_mul_hi_u32 s0, s12, s2 -; GCN-NEXT: s_mul_i32 s0, s0, s10 -; GCN-NEXT: s_sub_i32 s0, s12, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s10 -; GCN-NEXT: s_cmp_ge_u32 s0, s10 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s10 -; GCN-NEXT: s_cmp_ge_u32 s0, s10 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_readfirstlane_b32 s9, v0 +; GCN-NEXT: s_mul_i32 s6, s6, s9 +; GCN-NEXT: s_mul_hi_u32 s6, s9, s6 +; GCN-NEXT: s_add_i32 s9, s9, s6 +; GCN-NEXT: s_mul_hi_u32 s6, s10, s9 +; GCN-NEXT: s_mul_i32 s6, s6, s8 +; GCN-NEXT: s_sub_i32 s6, s10, s6 +; GCN-NEXT: s_sub_i32 s9, s6, s8 +; GCN-NEXT: s_cmp_ge_u32 s6, s8 +; GCN-NEXT: s_cselect_b32 s6, s9, s6 +; GCN-NEXT: s_sub_i32 s9, s6, s8 +; GCN-NEXT: s_cmp_ge_u32 s6, s8 +; GCN-NEXT: s_cselect_b32 s6, s9, s6 ; GCN-NEXT: .LBB10_3: -; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[4:5] -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: s_cbranch_scc0 .LBB10_8 +; GCN-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s8, 0 +; GCN-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GCN-NEXT: s_cbranch_scc0 .LBB10_7 ; GCN-NEXT: ; %bb.4: -; GCN-NEXT: s_ashr_i32 s0, s5, 31 -; GCN-NEXT: s_add_u32 s2, s4, s0 -; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_addc_u32 s3, s5, s0 -; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[0:1] -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s12 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, s13 -; GCN-NEXT: s_sub_u32 s0, 0, s12 -; GCN-NEXT: s_subb_u32 s1, 0, s13 -; GCN-NEXT: v_madmk_f32 v2, v3, 0x4f800000, v2 -; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 -; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 -; GCN-NEXT: v_trunc_f32_e32 v3, v3 -; GCN-NEXT: v_madmk_f32 v2, v3, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_readfirstlane_b32 s2, v3 -; GCN-NEXT: v_readfirstlane_b32 s3, v2 -; GCN-NEXT: s_mul_i32 s5, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s15, s0, s3 -; GCN-NEXT: s_mul_i32 s14, s1, s3 -; GCN-NEXT: s_add_i32 s5, s15, s5 -; GCN-NEXT: s_add_i32 s5, s5, s14 -; GCN-NEXT: s_mul_i32 s16, s0, s3 -; GCN-NEXT: s_mul_hi_u32 s14, s3, s5 -; GCN-NEXT: s_mul_i32 s15, s3, s5 -; GCN-NEXT: s_mul_hi_u32 s3, s3, s16 -; GCN-NEXT: s_add_u32 s3, s3, s15 -; GCN-NEXT: s_addc_u32 s14, 0, s14 -; GCN-NEXT: s_mul_hi_u32 s17, s2, s16 -; GCN-NEXT: s_mul_i32 s16, s2, s16 +; GCN-NEXT: s_ashr_i32 s8, s3, 31 +; GCN-NEXT: s_add_u32 s10, s2, s8 +; GCN-NEXT: s_mov_b32 s9, s8 +; GCN-NEXT: s_addc_u32 s11, s3, s8 +; GCN-NEXT: s_xor_b64 s[10:11], s[10:11], s[8:9] +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 +; GCN-NEXT: s_sub_u32 s3, 0, s10 +; GCN-NEXT: s_subb_u32 s14, 0, s11 +; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s13, s3, s15 +; 
GCN-NEXT: s_mul_hi_u32 s17, s3, s12 +; GCN-NEXT: s_mul_i32 s16, s14, s12 +; GCN-NEXT: s_add_i32 s13, s17, s13 +; GCN-NEXT: s_add_i32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s12 +; GCN-NEXT: s_mul_i32 s17, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 +; GCN-NEXT: s_add_u32 s17, s19, s17 +; GCN-NEXT: s_addc_u32 s16, 0, s16 +; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 +; GCN-NEXT: s_mul_i32 s18, s15, s18 +; GCN-NEXT: s_add_u32 s17, s17, s18 +; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 +; GCN-NEXT: s_addc_u32 s16, s16, s20 +; GCN-NEXT: s_addc_u32 s17, s19, 0 +; GCN-NEXT: s_mul_i32 s13, s15, s13 +; GCN-NEXT: s_add_u32 s13, s16, s13 +; GCN-NEXT: s_addc_u32 s16, 0, s17 +; GCN-NEXT: s_add_u32 s17, s12, s13 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s12, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s14, s14, s17 +; GCN-NEXT: s_add_i32 s12, s12, s14 +; GCN-NEXT: s_mul_i32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 +; GCN-NEXT: s_mul_i32 s16, s15, s3 +; GCN-NEXT: s_mul_i32 s19, s17, s12 +; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 +; GCN-NEXT: s_add_u32 s3, s3, s19 +; GCN-NEXT: s_addc_u32 s18, 0, s18 ; GCN-NEXT: s_add_u32 s3, s3, s16 -; GCN-NEXT: s_mul_hi_u32 s15, s2, s5 -; GCN-NEXT: s_addc_u32 s3, s14, s17 -; GCN-NEXT: s_addc_u32 s14, s15, 0 -; GCN-NEXT: s_mul_i32 s5, s2, s5 -; GCN-NEXT: s_add_u32 s3, s3, s5 -; GCN-NEXT: s_addc_u32 s5, 0, s14 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s3, v2 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_addc_u32 s2, s2, s5 -; GCN-NEXT: v_readfirstlane_b32 s5, v2 -; GCN-NEXT: s_mul_i32 s3, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s14, s0, s5 -; GCN-NEXT: s_add_i32 s3, s14, s3 -; GCN-NEXT: s_mul_i32 s1, s1, s5 -; GCN-NEXT: s_add_i32 s3, s3, s1 -; GCN-NEXT: s_mul_i32 s0, s0, s5 -; GCN-NEXT: s_mul_hi_u32 s14, s2, s0 -; GCN-NEXT: s_mul_i32 s15, s2, s0 -; GCN-NEXT: s_mul_i32 s17, s5, s3 -; GCN-NEXT: s_mul_hi_u32 s0, s5, s0 -; GCN-NEXT: s_mul_hi_u32 s16, s5, s3 -; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s5, 0, s16 -; GCN-NEXT: s_add_u32 s0, s0, s15 -; GCN-NEXT: s_mul_hi_u32 s1, s2, s3 -; GCN-NEXT: s_addc_u32 s0, s5, s14 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mul_i32 s3, s2, s3 -; GCN-NEXT: s_add_u32 s0, s0, s3 -; GCN-NEXT: s_addc_u32 s1, 0, s1 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_addc_u32 s2, s2, s1 -; GCN-NEXT: s_ashr_i32 s14, s7, 31 -; GCN-NEXT: s_add_u32 s0, s6, s14 -; GCN-NEXT: s_mov_b32 s15, s14 -; GCN-NEXT: s_addc_u32 s1, s7, s14 -; GCN-NEXT: s_xor_b64 s[16:17], s[0:1], s[14:15] -; GCN-NEXT: v_readfirstlane_b32 s3, v2 -; GCN-NEXT: s_mul_i32 s1, s16, s2 -; GCN-NEXT: s_mul_hi_u32 s5, s16, s3 -; GCN-NEXT: s_mul_hi_u32 s0, s16, s2 -; GCN-NEXT: s_add_u32 s1, s5, s1 -; GCN-NEXT: s_addc_u32 s0, 0, s0 -; GCN-NEXT: s_mul_hi_u32 s7, s17, s3 -; GCN-NEXT: s_mul_i32 s3, s17, s3 -; GCN-NEXT: s_add_u32 s1, s1, s3 -; GCN-NEXT: s_mul_hi_u32 s5, s17, s2 -; GCN-NEXT: s_addc_u32 s0, s0, s7 -; GCN-NEXT: s_addc_u32 s1, s5, 0 -; GCN-NEXT: s_mul_i32 s2, s17, s2 -; GCN-NEXT: s_add_u32 s0, s0, s2 -; GCN-NEXT: s_addc_u32 s1, 0, s1 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_mul_hi_u32 s2, s12, s0 -; GCN-NEXT: s_add_i32 s1, s2, s1 -; GCN-NEXT: s_mul_i32 s2, s13, s0 -; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: s_add_i32 s5, s1, s2 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: 
s_sub_i32 s1, s17, s5 -; GCN-NEXT: v_sub_co_u32_e32 v2, vcc, s16, v2 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_subb_u32 s7, s1, s13 -; GCN-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v2 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: s_subb_u32 s15, s7, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s13 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v3 -; GCN-NEXT: s_cmp_eq_u32 s15, s13 -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v5, s16 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[2:3] -; GCN-NEXT: s_subb_u32 s2, s7, s13 -; GCN-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v3 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: s_subb_u32 s2, s2, 0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, s15 -; GCN-NEXT: v_mov_b32_e32 v5, s2 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GCN-NEXT: s_subb_u32 s0, s17, s5 -; GCN-NEXT: s_cmp_ge_u32 s0, s13 -; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 -; GCN-NEXT: s_cmp_eq_u32 s0, s13 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_mov_b32_e32 v6, s1 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GCN-NEXT: v_mov_b32_e32 v6, s0 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GCN-NEXT: v_xor_b32_e32 v2, s14, v2 -; GCN-NEXT: v_xor_b32_e32 v3, s14, v4 -; GCN-NEXT: v_mov_b32_e32 v4, s14 -; GCN-NEXT: v_subrev_co_u32_e32 v2, vcc, s14, v2 -; GCN-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc -; GCN-NEXT: s_cbranch_execnz .LBB10_6 +; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 +; GCN-NEXT: s_addc_u32 s3, s18, s14 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s12, s15, s12 +; GCN-NEXT: s_add_u32 s3, s3, s12 +; GCN-NEXT: s_addc_u32 s14, 0, s13 +; GCN-NEXT: s_add_u32 s3, s17, s3 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s16, s15, s14 +; GCN-NEXT: s_ashr_i32 s12, s5, 31 +; GCN-NEXT: s_add_u32 s14, s4, s12 +; GCN-NEXT: s_mov_b32 s13, s12 +; GCN-NEXT: s_addc_u32 s15, s5, s12 +; GCN-NEXT: s_xor_b64 s[14:15], s[14:15], s[12:13] +; GCN-NEXT: s_mul_i32 s17, s14, s16 +; GCN-NEXT: s_mul_hi_u32 s18, s14, s3 +; GCN-NEXT: s_mul_hi_u32 s5, s14, s16 +; GCN-NEXT: s_add_u32 s17, s18, s17 +; GCN-NEXT: s_addc_u32 s5, 0, s5 +; GCN-NEXT: s_mul_hi_u32 s19, s15, s3 +; GCN-NEXT: s_mul_i32 s3, s15, s3 +; GCN-NEXT: s_add_u32 s3, s17, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s15, s16 +; GCN-NEXT: s_addc_u32 s3, s5, s19 +; GCN-NEXT: s_addc_u32 s5, s18, 0 +; GCN-NEXT: s_mul_i32 s16, s15, s16 +; GCN-NEXT: s_add_u32 s3, s3, s16 +; GCN-NEXT: s_addc_u32 s5, 0, s5 +; GCN-NEXT: s_mul_i32 s5, s10, s5 +; GCN-NEXT: s_mul_hi_u32 s16, s10, s3 +; GCN-NEXT: s_add_i32 s5, s16, s5 +; GCN-NEXT: s_mul_i32 s16, s11, s3 +; GCN-NEXT: s_add_i32 s5, s5, s16 +; GCN-NEXT: s_sub_i32 s18, s15, s5 +; GCN-NEXT: s_mul_i32 s3, s10, s3 +; GCN-NEXT: s_sub_u32 s3, s14, s3 +; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_subb_u32 s14, s18, s11 +; GCN-NEXT: s_sub_u32 s20, s3, s10 +; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_subb_u32 s21, s14, 0 +; GCN-NEXT: s_cmp_ge_u32 s21, s11 +; GCN-NEXT: s_cselect_b32 s22, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s20, s10 +; 
GCN-NEXT: s_cselect_b32 s23, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s21, s11 +; GCN-NEXT: s_cselect_b32 s22, s23, s22 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_subb_u32 s14, s14, s11 +; GCN-NEXT: s_sub_u32 s23, s20, s10 +; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_subb_u32 s14, s14, 0 +; GCN-NEXT: s_cmp_lg_u32 s22, 0 +; GCN-NEXT: s_cselect_b32 s18, s23, s20 +; GCN-NEXT: s_cselect_b32 s14, s14, s21 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_subb_u32 s5, s15, s5 +; GCN-NEXT: s_cmp_ge_u32 s5, s11 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s3, s10 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s5, s11 +; GCN-NEXT: s_cselect_b32 s10, s10, s15 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s11, s14, s5 +; GCN-NEXT: s_cselect_b32 s10, s18, s3 +; GCN-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GCN-NEXT: s_sub_u32 s10, s10, s12 +; GCN-NEXT: s_subb_u32 s11, s11, s12 +; GCN-NEXT: s_cbranch_execnz .LBB10_8 ; GCN-NEXT: .LBB10_5: -; GCN-NEXT: v_cvt_f32_u32_e32 v2, s4 -; GCN-NEXT: s_sub_i32 s0, 0, s4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v3, v2, v3 -; GCN-NEXT: v_add_u32_e32 v2, v2, v3 -; GCN-NEXT: v_mul_hi_u32 v2, s6, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v2, s4 -; GCN-NEXT: v_sub_u32_e32 v2, s6, v2 -; GCN-NEXT: v_subrev_u32_e32 v3, s4, v2 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GCN-NEXT: v_subrev_u32_e32 v3, s4, v2 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_sub_i32 s3, 0, s2 ; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-NEXT: v_add_u32_e32 v0, v0, v1 +; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 +; GCN-NEXT: v_sub_u32_e32 v0, s4, v0 +; GCN-NEXT: v_subrev_u32_e32 v1, s2, v0 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_subrev_u32_e32 v1, s2, v0 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GCN-NEXT: s_branch .LBB10_9 ; GCN-NEXT: .LBB10_6: -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] -; GCN-NEXT: s_endpgm -; GCN-NEXT: .LBB10_7: -; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 ; GCN-NEXT: s_branch .LBB10_2 -; GCN-NEXT: .LBB10_8: +; GCN-NEXT: .LBB10_7: +; GCN-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GCN-NEXT: s_branch .LBB10_5 +; GCN-NEXT: .LBB10_8: +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NEXT: .LBB10_9: +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v2i64: ; TAHITI: ; %bb.0: @@ -3097,7 +3106,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: v_mul_lo_u32 v12, v12, v9 ; TAHITI-NEXT: v_mul_lo_u32 v11, v11, v9 ; TAHITI-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; TAHITI-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; TAHITI-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; TAHITI-NEXT: 
v_mul_lo_u32 v15, v9, v12 ; TAHITI-NEXT: v_mul_hi_u32 v16, v9, v11 ; TAHITI-NEXT: v_mul_hi_u32 v17, v9, v12 @@ -3240,7 +3249,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: v_mul_lo_u32 v10, v10, v3 ; TAHITI-NEXT: v_mul_lo_u32 v5, v5, v3 ; TAHITI-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; TAHITI-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; TAHITI-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; TAHITI-NEXT: v_mul_lo_u32 v13, v3, v10 ; TAHITI-NEXT: v_mul_hi_u32 v14, v3, v5 ; TAHITI-NEXT: v_mul_hi_u32 v15, v3, v10 @@ -3347,152 +3356,181 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-LABEL: srem_v2i64: ; TONGA: ; %bb.0: ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; TONGA-NEXT: v_mov_b32_e32 v8, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s0, s6, 16 -; TONGA-NEXT: v_mov_b32_e32 v4, s6 ; TONGA-NEXT: s_addc_u32 s1, s7, 0 ; TONGA-NEXT: v_mov_b32_e32 v0, s0 -; TONGA-NEXT: v_mov_b32_e32 v5, s7 +; TONGA-NEXT: v_mov_b32_e32 v4, s6 ; TONGA-NEXT: v_mov_b32_e32 v1, s1 +; TONGA-NEXT: v_mov_b32_e32 v5, s7 ; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; TONGA-NEXT: s_waitcnt vmcnt(1) +; TONGA-NEXT: v_readfirstlane_b32 s1, v1 +; TONGA-NEXT: v_readfirstlane_b32 s0, v0 ; TONGA-NEXT: s_waitcnt vmcnt(0) -; TONGA-NEXT: v_or_b32_e32 v9, v5, v1 -; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; TONGA-NEXT: s_cbranch_vccz .LBB10_7 +; TONGA-NEXT: v_readfirstlane_b32 s3, v5 +; TONGA-NEXT: v_readfirstlane_b32 s2, v4 +; TONGA-NEXT: s_or_b64 s[6:7], s[2:3], s[0:1] +; TONGA-NEXT: s_mov_b32 s6, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[6:7], 0 +; TONGA-NEXT: s_cbranch_scc0 .LBB10_3 ; TONGA-NEXT: ; %bb.1: -; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v1 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v0, v8 -; TONGA-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc -; TONGA-NEXT: v_xor_b32_e32 v14, v9, v8 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v8 -; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v14 -; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v1 -; TONGA-NEXT: v_sub_u32_e32 v15, vcc, 0, v14 -; TONGA-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc -; TONGA-NEXT: v_madmk_f32 v8, v9, 0x4f800000, v8 -; TONGA-NEXT: v_rcp_f32_e32 v8, v8 -; TONGA-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 -; TONGA-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 -; TONGA-NEXT: v_trunc_f32_e32 v9, v9 -; TONGA-NEXT: v_madmk_f32 v8, v9, 0xcf800000, v8 -; TONGA-NEXT: v_cvt_u32_f32_e32 v12, v9 -; TONGA-NEXT: v_cvt_u32_f32_e32 v13, v8 -; TONGA-NEXT: v_mul_lo_u32 v10, v15, v12 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v15, v13, 0 -; TONGA-NEXT: v_mul_lo_u32 v11, v16, v13 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v9, v10 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v9, v11 -; TONGA-NEXT: v_mad_u64_u32 v[9:10], s[0:1], v13, v11, 0 -; TONGA-NEXT: v_mul_hi_u32 v17, v13, v8 -; TONGA-NEXT: v_add_u32_e32 v17, vcc, v17, v9 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v12, v8, 0 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v10, vcc -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v11, 0 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v17, v8 -; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v18, v9, vcc -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v10 -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v17, vcc, v13, v8 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v12, v9, vcc -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v15, v17, 0 -; TONGA-NEXT: v_mul_lo_u32 v12, v15, v18 -; TONGA-NEXT: v_mul_lo_u32 v13, v16, v17 -; TONGA-NEXT: 
v_mul_hi_u32 v15, v17, v8 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v18, v8, 0 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v12, v9 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v9, v13 -; TONGA-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v17, v9, 0 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v18, v9, 0 -; TONGA-NEXT: v_add_u32_e32 v12, vcc, v15, v12 -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v12, v10 -; TONGA-NEXT: v_addc_u32_e32 v10, vcc, v13, v11, vcc -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v10, v8 +; TONGA-NEXT: s_ashr_i32 s6, s1, 31 +; TONGA-NEXT: s_add_u32 s8, s0, s6 +; TONGA-NEXT: s_mov_b32 s7, s6 +; TONGA-NEXT: s_addc_u32 s9, s1, s6 +; TONGA-NEXT: s_xor_b64 s[6:7], s[8:9], s[6:7] +; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s6 +; TONGA-NEXT: v_cvt_f32_u32_e32 v1, s7 +; TONGA-NEXT: s_sub_u32 s1, 0, s6 +; TONGA-NEXT: s_subb_u32 s10, 0, s7 +; TONGA-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 +; TONGA-NEXT: v_rcp_f32_e32 v0, v0 +; TONGA-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; TONGA-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; TONGA-NEXT: v_trunc_f32_e32 v1, v1 +; TONGA-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 +; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v1 +; TONGA-NEXT: v_cvt_u32_f32_e32 v9, v0 +; TONGA-NEXT: v_mul_lo_u32 v4, s1, v8 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s1, v9, 0 +; TONGA-NEXT: v_mul_lo_u32 v5, s10, v9 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v4 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v1, v5 +; TONGA-NEXT: v_mul_hi_u32 v10, v9, v0 +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[8:9], v9, v11, 0 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v8, v0, 0 +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v4 +; TONGA-NEXT: v_addc_u32_e32 v12, vcc, 0, v5, vcc +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[8:9], v8, v11, 0 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v10, v0 +; TONGA-NEXT: v_addc_u32_e32 v0, vcc, v12, v1, vcc +; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v4 +; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v9, v0 +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v8, v1, vcc +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s1, v10, 0 +; TONGA-NEXT: v_mul_lo_u32 v8, s1, v11 +; TONGA-NEXT: v_mul_lo_u32 v9, s10, v10 +; TONGA-NEXT: v_mul_hi_u32 v12, v10, v0 +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[8:9], v11, v0, 0 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v8, v1 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v9, v1 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], v10, v1, 0 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v1, 0 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v12, v8 ; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v17, v8 -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v18, v9, vcc -; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v5 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v4, v12 -; TONGA-NEXT: v_xor_b32_e32 v13, v8, v12 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v13, v11, 0 -; TONGA-NEXT: v_mul_hi_u32 v15, v13, v10 -; TONGA-NEXT: v_addc_u32_e32 v5, vcc, v5, v12, vcc -; TONGA-NEXT: v_xor_b32_e32 v5, v5, v12 -; TONGA-NEXT: v_add_u32_e32 v15, vcc, v15, v8 -; TONGA-NEXT: v_addc_u32_e32 v16, vcc, 0, v9, vcc -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v5, v10, 0 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v5, v11, 0 -; TONGA-NEXT: v_add_u32_e32 v8, vcc, v15, v8 -; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v16, v9, vcc -; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v8, v10 -; TONGA-NEXT: v_addc_u32_e32 v8, vcc, 
0, v9, vcc -; TONGA-NEXT: v_mul_lo_u32 v11, v14, v8 -; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v10, 0 -; TONGA-NEXT: v_mul_lo_u32 v10, v1, v10 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v11, v9 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v10, v9 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v5, v9 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v13, v8 -; TONGA-NEXT: v_subb_u32_e64 v10, s[0:1], v10, v1, vcc -; TONGA-NEXT: v_sub_u32_e64 v11, s[0:1], v8, v14 -; TONGA-NEXT: v_subbrev_u32_e64 v13, s[2:3], 0, v10, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v13, v1 -; TONGA-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v11, v14 -; TONGA-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v13, v1 -; TONGA-NEXT: v_subb_u32_e64 v10, s[0:1], v10, v1, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[2:3] -; TONGA-NEXT: v_sub_u32_e64 v16, s[0:1], v11, v14 -; TONGA-NEXT: v_subb_u32_e32 v5, vcc, v5, v9, vcc -; TONGA-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v10, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 -; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v15 -; TONGA-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v14 -; TONGA-NEXT: v_cndmask_b32_e64 v10, v13, v10, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc -; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v9, v13, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v11, v11, v16, s[0:1] -; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v5, v10, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v5, v8, v11, vcc -; TONGA-NEXT: v_xor_b32_e32 v5, v5, v12 -; TONGA-NEXT: v_xor_b32_e32 v1, v1, v12 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v5, v12 -; TONGA-NEXT: v_subb_u32_e32 v9, vcc, v1, v12, vcc -; TONGA-NEXT: s_cbranch_execnz .LBB10_3 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v8, v4 +; TONGA-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc +; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v4, v0 +; TONGA-NEXT: s_ashr_i32 s10, s3, 31 +; TONGA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; TONGA-NEXT: s_add_u32 s8, s2, s10 +; TONGA-NEXT: v_add_u32_e32 v4, vcc, v10, v0 +; TONGA-NEXT: s_mov_b32 s11, s10 +; TONGA-NEXT: s_addc_u32 s9, s3, s10 +; TONGA-NEXT: v_addc_u32_e32 v5, vcc, v11, v1, vcc +; TONGA-NEXT: s_xor_b64 s[12:13], s[8:9], s[10:11] +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s12, v5, 0 +; TONGA-NEXT: v_mul_hi_u32 v8, s12, v4 +; TONGA-NEXT: v_readfirstlane_b32 s1, v1 +; TONGA-NEXT: v_readfirstlane_b32 s3, v0 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s13, v5, 0 +; TONGA-NEXT: v_mad_u64_u32 v[4:5], s[8:9], s13, v4, 0 +; TONGA-NEXT: v_readfirstlane_b32 s14, v8 +; TONGA-NEXT: s_add_u32 s3, s14, s3 +; TONGA-NEXT: s_addc_u32 s1, 0, s1 +; TONGA-NEXT: v_readfirstlane_b32 s14, v4 +; TONGA-NEXT: v_readfirstlane_b32 s9, v5 +; TONGA-NEXT: s_add_u32 s3, s3, s14 +; TONGA-NEXT: v_readfirstlane_b32 s8, v1 +; TONGA-NEXT: s_addc_u32 s1, s1, s9 +; TONGA-NEXT: s_addc_u32 s3, s8, 0 +; TONGA-NEXT: v_readfirstlane_b32 s8, v0 +; TONGA-NEXT: s_add_u32 s1, s1, s8 +; TONGA-NEXT: v_mov_b32_e32 v0, s1 +; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s6, v0, 0 +; TONGA-NEXT: s_addc_u32 s3, 0, s3 +; TONGA-NEXT: s_mul_i32 s3, s6, s3 +; TONGA-NEXT: v_readfirstlane_b32 s14, v1 +; TONGA-NEXT: s_add_i32 s3, s14, s3 +; TONGA-NEXT: s_mul_i32 s1, s7, s1 +; TONGA-NEXT: s_add_i32 s3, s3, s1 +; TONGA-NEXT: s_sub_i32 s1, s13, s3 +; TONGA-NEXT: v_readfirstlane_b32 s14, v0 +; TONGA-NEXT: s_sub_u32 s12, s12, s14 +; TONGA-NEXT: s_cselect_b64 
s[14:15], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 +; TONGA-NEXT: s_subb_u32 s1, s1, s7 +; TONGA-NEXT: s_sub_u32 s18, s12, s6 +; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_subb_u32 s19, s1, 0 +; TONGA-NEXT: s_cmp_ge_u32 s19, s7 +; TONGA-NEXT: s_cselect_b32 s20, -1, 0 +; TONGA-NEXT: s_cmp_ge_u32 s18, s6 +; TONGA-NEXT: s_cselect_b32 s21, -1, 0 +; TONGA-NEXT: s_cmp_eq_u32 s19, s7 +; TONGA-NEXT: s_cselect_b32 s20, s21, s20 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_subb_u32 s1, s1, s7 +; TONGA-NEXT: s_sub_u32 s21, s18, s6 +; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_subb_u32 s1, s1, 0 +; TONGA-NEXT: s_cmp_lg_u32 s20, 0 +; TONGA-NEXT: s_cselect_b32 s16, s21, s18 +; TONGA-NEXT: s_cselect_b32 s1, s1, s19 +; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 +; TONGA-NEXT: s_subb_u32 s3, s13, s3 +; TONGA-NEXT: s_cmp_ge_u32 s3, s7 +; TONGA-NEXT: s_cselect_b32 s13, -1, 0 +; TONGA-NEXT: s_cmp_ge_u32 s12, s6 +; TONGA-NEXT: s_cselect_b32 s6, -1, 0 +; TONGA-NEXT: s_cmp_eq_u32 s3, s7 +; TONGA-NEXT: s_cselect_b32 s6, s6, s13 +; TONGA-NEXT: s_cmp_lg_u32 s6, 0 +; TONGA-NEXT: s_cselect_b32 s7, s1, s3 +; TONGA-NEXT: s_cselect_b32 s6, s16, s12 +; TONGA-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; TONGA-NEXT: s_sub_u32 s6, s6, s10 +; TONGA-NEXT: s_subb_u32 s7, s7, s10 +; TONGA-NEXT: s_cbranch_execnz .LBB10_4 ; TONGA-NEXT: .LBB10_2: -; TONGA-NEXT: v_cvt_f32_u32_e32 v1, v0 -; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 +; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s0 +; TONGA-NEXT: s_sub_i32 s1, 0, s0 ; TONGA-NEXT: v_mov_b32_e32 v9, 0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; TONGA-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1 -; TONGA-NEXT: v_mul_lo_u32 v5, v5, v1 -; TONGA-NEXT: v_mul_hi_u32 v5, v1, v5 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5 -; TONGA-NEXT: v_mul_hi_u32 v1, v4, v1 -; TONGA-NEXT: v_mul_lo_u32 v1, v1, v0 -; TONGA-NEXT: v_sub_u32_e32 v1, vcc, v4, v1 -; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v0, v1 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 -; TONGA-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; TONGA-NEXT: v_sub_u32_e32 v4, vcc, v1, v0 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 -; TONGA-NEXT: v_cndmask_b32_e32 v8, v1, v4, vcc +; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0 +; TONGA-NEXT: v_mul_lo_u32 v1, s1, v0 +; TONGA-NEXT: v_mul_hi_u32 v1, v0, v1 +; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; TONGA-NEXT: v_mul_hi_u32 v0, s2, v0 +; TONGA-NEXT: v_mul_lo_u32 v0, v0, s0 +; TONGA-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s0, v0 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 +; TONGA-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, s0, v0 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 +; TONGA-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc +; TONGA-NEXT: s_branch .LBB10_5 ; TONGA-NEXT: .LBB10_3: +; TONGA-NEXT: ; implicit-def: $sgpr6_sgpr7 +; TONGA-NEXT: s_branch .LBB10_2 +; TONGA-NEXT: .LBB10_4: +; TONGA-NEXT: v_mov_b32_e32 v9, s7 +; TONGA-NEXT: v_mov_b32_e32 v8, s6 +; TONGA-NEXT: .LBB10_5: ; TONGA-NEXT: v_or_b32_e32 v1, v7, v3 ; TONGA-NEXT: v_mov_b32_e32 v0, 0 ; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; TONGA-NEXT: s_cbranch_vccz .LBB10_8 -; TONGA-NEXT: ; %bb.4: +; TONGA-NEXT: s_cbranch_vccz .LBB10_9 +; TONGA-NEXT: ; %bb.6: ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v2, v0 ; 
TONGA-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc @@ -3534,7 +3572,7 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_hi_u32 v13, v15, v0 ; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v16, v0, 0 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v10, v1 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v11 +; TONGA-NEXT: v_add_u32_e32 v1, vcc, v11, v1 ; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v15, v1, 0 ; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v16, v1, 0 ; TONGA-NEXT: v_add_u32_e32 v10, vcc, v13, v10 @@ -3598,8 +3636,8 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v11 ; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v0, v11 ; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v1, v11, vcc -; TONGA-NEXT: s_cbranch_execnz .LBB10_6 -; TONGA-NEXT: .LBB10_5: +; TONGA-NEXT: s_cbranch_execnz .LBB10_8 +; TONGA-NEXT: .LBB10_7: ; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v2 ; TONGA-NEXT: v_sub_u32_e32 v1, vcc, 0, v2 ; TONGA-NEXT: v_mov_b32_e32 v11, 0 @@ -3618,16 +3656,13 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v2, v0 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v10, v0, v1, vcc -; TONGA-NEXT: .LBB10_6: +; TONGA-NEXT: .LBB10_8: ; TONGA-NEXT: v_mov_b32_e32 v0, s4 ; TONGA-NEXT: v_mov_b32_e32 v1, s5 ; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; TONGA-NEXT: s_endpgm -; TONGA-NEXT: .LBB10_7: -; TONGA-NEXT: ; implicit-def: $vgpr8_vgpr9 -; TONGA-NEXT: s_branch .LBB10_2 -; TONGA-NEXT: .LBB10_8: -; TONGA-NEXT: s_branch .LBB10_5 +; TONGA-NEXT: .LBB10_9: +; TONGA-NEXT: s_branch .LBB10_7 ; ; EG-LABEL: srem_v2i64: ; EG: ; %bb.0: @@ -4860,629 +4895,687 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1) define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GCN-LABEL: srem_v4i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v16, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[10:13], v8, s[10:11] offset:32 -; GCN-NEXT: global_load_dwordx4 v[14:17], v8, s[10:11] -; GCN-NEXT: global_load_dwordx4 v[0:3], v8, s[10:11] offset:48 -; GCN-NEXT: global_load_dwordx4 v[4:7], v8, s[10:11] offset:16 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] offset:48 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:32 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[2:3] offset:16 +; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(3) +; GCN-NEXT: v_readfirstlane_b32 s3, v3 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_readfirstlane_b32 s17, v5 +; GCN-NEXT: v_readfirstlane_b32 s16, v4 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s19, v13 +; GCN-NEXT: v_readfirstlane_b32 s18, v12 +; GCN-NEXT: s_or_b64 s[6:7], s[18:19], s[16:17] +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_readfirstlane_b32 s2, v2 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-NEXT: v_readfirstlane_b32 s13, v7 +; GCN-NEXT: v_readfirstlane_b32 s12, v6 ; GCN-NEXT: v_readfirstlane_b32 s5, v11 ; GCN-NEXT: v_readfirstlane_b32 s4, v10 -; GCN-NEXT: s_waitcnt vmcnt(2) -; GCN-NEXT: v_readfirstlane_b32 s7, v15 -; GCN-NEXT: v_readfirstlane_b32 s6, v14 -; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[4:5] -; GCN-NEXT: s_mov_b32 s0, 0 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: 
s_cbranch_scc0 .LBB12_13 +; GCN-NEXT: v_readfirstlane_b32 s11, v9 +; GCN-NEXT: v_readfirstlane_b32 s10, v8 +; GCN-NEXT: v_readfirstlane_b32 s15, v15 +; GCN-NEXT: s_cmp_lg_u64 s[6:7], 0 +; GCN-NEXT: v_readfirstlane_b32 s14, v14 +; GCN-NEXT: s_cbranch_scc0 .LBB12_6 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_ashr_i32 s0, s5, 31 -; GCN-NEXT: s_add_u32 s2, s4, s0 -; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_addc_u32 s3, s5, s0 -; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[0:1] -; GCN-NEXT: v_cvt_f32_u32_e32 v8, s12 -; GCN-NEXT: v_cvt_f32_u32_e32 v9, s13 -; GCN-NEXT: s_sub_u32 s0, 0, s12 -; GCN-NEXT: s_subb_u32 s1, 0, s13 -; GCN-NEXT: v_madmk_f32 v8, v9, 0x4f800000, v8 -; GCN-NEXT: v_rcp_f32_e32 v8, v8 -; GCN-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 -; GCN-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 -; GCN-NEXT: v_trunc_f32_e32 v9, v9 -; GCN-NEXT: v_madmk_f32 v8, v9, 0xcf800000, v8 -; GCN-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GCN-NEXT: v_readfirstlane_b32 s2, v9 -; GCN-NEXT: v_readfirstlane_b32 s3, v8 -; GCN-NEXT: s_mul_i32 s5, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s15, s0, s3 -; GCN-NEXT: s_mul_i32 s14, s1, s3 -; GCN-NEXT: s_add_i32 s5, s15, s5 -; GCN-NEXT: s_add_i32 s5, s5, s14 -; GCN-NEXT: s_mul_i32 s16, s0, s3 -; GCN-NEXT: s_mul_hi_u32 s14, s3, s5 -; GCN-NEXT: s_mul_i32 s15, s3, s5 -; GCN-NEXT: s_mul_hi_u32 s3, s3, s16 -; GCN-NEXT: s_add_u32 s3, s3, s15 -; GCN-NEXT: s_addc_u32 s14, 0, s14 -; GCN-NEXT: s_mul_hi_u32 s17, s2, s16 -; GCN-NEXT: s_mul_i32 s16, s2, s16 -; GCN-NEXT: s_add_u32 s3, s3, s16 -; GCN-NEXT: s_mul_hi_u32 s15, s2, s5 -; GCN-NEXT: s_addc_u32 s3, s14, s17 -; GCN-NEXT: s_addc_u32 s14, s15, 0 -; GCN-NEXT: s_mul_i32 s5, s2, s5 -; GCN-NEXT: s_add_u32 s3, s3, s5 -; GCN-NEXT: s_addc_u32 s5, 0, s14 -; GCN-NEXT: v_add_co_u32_e32 v8, vcc, s3, v8 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_addc_u32 s2, s2, s5 -; GCN-NEXT: v_readfirstlane_b32 s5, v8 -; GCN-NEXT: s_mul_i32 s3, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s14, s0, s5 -; GCN-NEXT: s_add_i32 s3, s14, s3 -; GCN-NEXT: s_mul_i32 s1, s1, s5 -; GCN-NEXT: s_add_i32 s3, s3, s1 -; GCN-NEXT: s_mul_i32 s0, s0, s5 -; GCN-NEXT: s_mul_hi_u32 s14, s2, s0 -; GCN-NEXT: s_mul_i32 s15, s2, s0 -; GCN-NEXT: s_mul_i32 s17, s5, s3 -; GCN-NEXT: s_mul_hi_u32 s0, s5, s0 -; GCN-NEXT: s_mul_hi_u32 s16, s5, s3 -; GCN-NEXT: s_add_u32 s0, s0, s17 -; GCN-NEXT: s_addc_u32 s5, 0, s16 -; GCN-NEXT: s_add_u32 s0, s0, s15 -; GCN-NEXT: s_mul_hi_u32 s1, s2, s3 -; GCN-NEXT: s_addc_u32 s0, s5, s14 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mul_i32 s3, s2, s3 -; GCN-NEXT: s_add_u32 s0, s0, s3 -; GCN-NEXT: s_addc_u32 s1, 0, s1 -; GCN-NEXT: v_add_co_u32_e32 v8, vcc, s0, v8 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_addc_u32 s2, s2, s1 -; GCN-NEXT: s_ashr_i32 s14, s7, 31 -; GCN-NEXT: s_add_u32 s0, s6, s14 -; GCN-NEXT: s_mov_b32 s15, s14 -; GCN-NEXT: s_addc_u32 s1, s7, s14 -; GCN-NEXT: s_xor_b64 s[16:17], s[0:1], s[14:15] -; GCN-NEXT: v_readfirstlane_b32 s3, v8 -; GCN-NEXT: s_mul_i32 s1, s16, s2 -; GCN-NEXT: s_mul_hi_u32 s5, s16, s3 -; GCN-NEXT: s_mul_hi_u32 s0, s16, s2 -; GCN-NEXT: s_add_u32 s1, s5, s1 -; GCN-NEXT: s_addc_u32 s0, 0, s0 -; GCN-NEXT: s_mul_hi_u32 s7, s17, s3 -; GCN-NEXT: s_mul_i32 s3, s17, s3 -; GCN-NEXT: s_add_u32 s1, s1, s3 -; GCN-NEXT: s_mul_hi_u32 s5, s17, s2 -; GCN-NEXT: s_addc_u32 s0, s0, s7 -; GCN-NEXT: s_addc_u32 s1, s5, 0 -; GCN-NEXT: s_mul_i32 s2, s17, s2 -; GCN-NEXT: s_add_u32 s0, s0, s2 -; GCN-NEXT: s_addc_u32 s1, 0, s1 -; GCN-NEXT: s_mul_i32 s1, s12, s1 -; GCN-NEXT: s_mul_hi_u32 s2, s12, s0 -; GCN-NEXT: s_add_i32 s1, s2, s1 -; 
GCN-NEXT: s_mul_i32 s2, s13, s0 -; GCN-NEXT: s_mul_i32 s0, s12, s0 -; GCN-NEXT: s_add_i32 s5, s1, s2 -; GCN-NEXT: v_mov_b32_e32 v8, s0 -; GCN-NEXT: s_sub_i32 s1, s17, s5 -; GCN-NEXT: v_sub_co_u32_e32 v8, vcc, s16, v8 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: s_subb_u32 s7, s1, s13 -; GCN-NEXT: v_subrev_co_u32_e64 v9, s[0:1], s12, v8 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: s_subb_u32 s15, s7, 0 -; GCN-NEXT: s_cmp_ge_u32 s15, s13 -; GCN-NEXT: s_cselect_b32 s16, -1, 0 -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v9 -; GCN-NEXT: s_cmp_eq_u32 s15, s13 -; GCN-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v11, s16 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e64 v10, v11, v10, s[2:3] -; GCN-NEXT: s_subb_u32 s2, s7, s13 -; GCN-NEXT: v_subrev_co_u32_e64 v11, s[0:1], s12, v9 -; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GCN-NEXT: s_subb_u32 s2, s2, 0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v10, s15 -; GCN-NEXT: v_mov_b32_e32 v11, s2 -; GCN-NEXT: s_cmp_lg_u64 vcc, 0 -; GCN-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] -; GCN-NEXT: s_subb_u32 s0, s17, s5 -; GCN-NEXT: s_cmp_ge_u32 s0, s13 -; GCN-NEXT: s_cselect_b32 s1, -1, 0 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v8 -; GCN-NEXT: s_cmp_eq_u32 s0, s13 -; GCN-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GCN-NEXT: v_mov_b32_e32 v14, s1 -; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_cndmask_b32_e32 v11, v14, v11, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GCN-NEXT: v_mov_b32_e32 v14, s0 -; GCN-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; GCN-NEXT: v_cndmask_b32_e32 v10, v14, v10, vcc -; GCN-NEXT: v_xor_b32_e32 v8, s14, v8 -; GCN-NEXT: v_xor_b32_e32 v9, s14, v10 -; GCN-NEXT: v_mov_b32_e32 v10, s14 -; GCN-NEXT: v_subrev_co_u32_e32 v8, vcc, s14, v8 -; GCN-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v10, vcc +; GCN-NEXT: s_ashr_i32 s6, s17, 31 +; GCN-NEXT: s_add_u32 s20, s16, s6 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: s_addc_u32 s21, s17, s6 +; GCN-NEXT: s_xor_b64 s[6:7], s[20:21], s[6:7] +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GCN-NEXT: s_sub_u32 s17, 0, s6 +; GCN-NEXT: s_subb_u32 s24, 0, s7 +; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_readfirstlane_b32 s25, v1 +; GCN-NEXT: v_readfirstlane_b32 s22, v0 +; GCN-NEXT: s_mul_i32 s23, s17, s25 +; GCN-NEXT: s_mul_hi_u32 s27, s17, s22 +; GCN-NEXT: s_mul_i32 s26, s24, s22 +; GCN-NEXT: s_add_i32 s23, s27, s23 +; GCN-NEXT: s_add_i32 s23, s23, s26 +; GCN-NEXT: s_mul_i32 s28, s17, s22 +; GCN-NEXT: s_mul_i32 s27, s22, s23 +; GCN-NEXT: s_mul_hi_u32 s29, s22, s28 +; GCN-NEXT: s_mul_hi_u32 s26, s22, s23 +; GCN-NEXT: s_add_u32 s27, s29, s27 +; GCN-NEXT: s_addc_u32 s26, 0, s26 +; GCN-NEXT: s_mul_hi_u32 s30, s25, s28 +; GCN-NEXT: s_mul_i32 s28, s25, s28 +; GCN-NEXT: s_add_u32 s27, s27, s28 +; GCN-NEXT: s_mul_hi_u32 s29, s25, s23 +; GCN-NEXT: s_addc_u32 s26, s26, s30 +; GCN-NEXT: s_addc_u32 s27, s29, 0 +; GCN-NEXT: s_mul_i32 s23, s25, s23 +; GCN-NEXT: s_add_u32 s23, s26, s23 +; GCN-NEXT: s_addc_u32 s26, 0, s27 +; GCN-NEXT: s_add_u32 s27, s22, s23 +; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 
s[22:23], 0 +; GCN-NEXT: s_addc_u32 s25, s25, s26 +; GCN-NEXT: s_mul_i32 s22, s17, s25 +; GCN-NEXT: s_mul_hi_u32 s23, s17, s27 +; GCN-NEXT: s_add_i32 s22, s23, s22 +; GCN-NEXT: s_mul_i32 s24, s24, s27 +; GCN-NEXT: s_add_i32 s22, s22, s24 +; GCN-NEXT: s_mul_i32 s17, s17, s27 +; GCN-NEXT: s_mul_hi_u32 s24, s25, s17 +; GCN-NEXT: s_mul_i32 s26, s25, s17 +; GCN-NEXT: s_mul_i32 s29, s27, s22 +; GCN-NEXT: s_mul_hi_u32 s17, s27, s17 +; GCN-NEXT: s_mul_hi_u32 s28, s27, s22 +; GCN-NEXT: s_add_u32 s17, s17, s29 +; GCN-NEXT: s_addc_u32 s28, 0, s28 +; GCN-NEXT: s_add_u32 s17, s17, s26 +; GCN-NEXT: s_mul_hi_u32 s23, s25, s22 +; GCN-NEXT: s_addc_u32 s17, s28, s24 +; GCN-NEXT: s_addc_u32 s23, s23, 0 +; GCN-NEXT: s_mul_i32 s22, s25, s22 +; GCN-NEXT: s_add_u32 s17, s17, s22 +; GCN-NEXT: s_addc_u32 s24, 0, s23 +; GCN-NEXT: s_add_u32 s17, s27, s17 +; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 +; GCN-NEXT: s_addc_u32 s26, s25, s24 +; GCN-NEXT: s_ashr_i32 s22, s19, 31 +; GCN-NEXT: s_add_u32 s24, s18, s22 +; GCN-NEXT: s_mov_b32 s23, s22 +; GCN-NEXT: s_addc_u32 s25, s19, s22 +; GCN-NEXT: s_xor_b64 s[24:25], s[24:25], s[22:23] +; GCN-NEXT: s_mul_i32 s27, s24, s26 +; GCN-NEXT: s_mul_hi_u32 s28, s24, s17 +; GCN-NEXT: s_mul_hi_u32 s19, s24, s26 +; GCN-NEXT: s_add_u32 s27, s28, s27 +; GCN-NEXT: s_addc_u32 s19, 0, s19 +; GCN-NEXT: s_mul_hi_u32 s29, s25, s17 +; GCN-NEXT: s_mul_i32 s17, s25, s17 +; GCN-NEXT: s_add_u32 s17, s27, s17 +; GCN-NEXT: s_mul_hi_u32 s28, s25, s26 +; GCN-NEXT: s_addc_u32 s17, s19, s29 +; GCN-NEXT: s_addc_u32 s19, s28, 0 +; GCN-NEXT: s_mul_i32 s26, s25, s26 +; GCN-NEXT: s_add_u32 s17, s17, s26 +; GCN-NEXT: s_addc_u32 s19, 0, s19 +; GCN-NEXT: s_mul_i32 s19, s6, s19 +; GCN-NEXT: s_mul_hi_u32 s26, s6, s17 +; GCN-NEXT: s_add_i32 s19, s26, s19 +; GCN-NEXT: s_mul_i32 s26, s7, s17 +; GCN-NEXT: s_add_i32 s19, s19, s26 +; GCN-NEXT: s_sub_i32 s28, s25, s19 +; GCN-NEXT: s_mul_i32 s17, s6, s17 +; GCN-NEXT: s_sub_u32 s17, s24, s17 +; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 +; GCN-NEXT: s_subb_u32 s24, s28, s7 +; GCN-NEXT: s_sub_u32 s30, s17, s6 +; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 +; GCN-NEXT: s_subb_u32 s31, s24, 0 +; GCN-NEXT: s_cmp_ge_u32 s31, s7 +; GCN-NEXT: s_cselect_b32 s33, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s30, s6 +; GCN-NEXT: s_cselect_b32 s34, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s31, s7 +; GCN-NEXT: s_cselect_b32 s33, s34, s33 +; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 +; GCN-NEXT: s_subb_u32 s24, s24, s7 +; GCN-NEXT: s_sub_u32 s34, s30, s6 +; GCN-NEXT: s_cselect_b64 s[28:29], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[28:29], 0 +; GCN-NEXT: s_subb_u32 s24, s24, 0 +; GCN-NEXT: s_cmp_lg_u32 s33, 0 +; GCN-NEXT: s_cselect_b32 s28, s34, s30 +; GCN-NEXT: s_cselect_b32 s24, s24, s31 +; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 +; GCN-NEXT: s_subb_u32 s19, s25, s19 +; GCN-NEXT: s_cmp_ge_u32 s19, s7 +; GCN-NEXT: s_cselect_b32 s25, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s17, s6 +; GCN-NEXT: s_cselect_b32 s6, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s19, s7 +; GCN-NEXT: s_cselect_b32 s6, s6, s25 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cselect_b32 s7, s24, s19 +; GCN-NEXT: s_cselect_b32 s6, s28, s17 +; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[22:23] +; GCN-NEXT: s_sub_u32 s6, s6, s22 +; GCN-NEXT: s_subb_u32 s7, s7, s22 ; GCN-NEXT: s_cbranch_execnz .LBB12_3 ; GCN-NEXT: .LBB12_2: -; GCN-NEXT: v_cvt_f32_u32_e32 v8, s4 -; GCN-NEXT: s_sub_i32 s0, 0, s4 -; GCN-NEXT: s_mov_b32 s1, 0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; GCN-NEXT: 
v_mul_f32_e32 v8, 0x4f7ffffe, v8 -; GCN-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GCN-NEXT: v_readfirstlane_b32 s2, v8 -; GCN-NEXT: s_mul_i32 s0, s0, s2 -; GCN-NEXT: s_mul_hi_u32 s0, s2, s0 -; GCN-NEXT: s_add_i32 s2, s2, s0 -; GCN-NEXT: s_mul_hi_u32 s0, s6, s2 -; GCN-NEXT: s_mul_i32 s0, s0, s4 -; GCN-NEXT: s_sub_i32 s0, s6, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s4 -; GCN-NEXT: s_cmp_ge_u32 s0, s4 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: s_sub_i32 s2, s0, s4 -; GCN-NEXT: s_cmp_ge_u32 s0, s4 -; GCN-NEXT: s_cselect_b32 s0, s2, s0 -; GCN-NEXT: v_mov_b32_e32 v9, s1 -; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s16 +; GCN-NEXT: s_sub_i32 s6, 0, s16 +; GCN-NEXT: s_mov_b32 s7, 0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_readfirstlane_b32 s17, v0 +; GCN-NEXT: s_mul_i32 s6, s6, s17 +; GCN-NEXT: s_mul_hi_u32 s6, s17, s6 +; GCN-NEXT: s_add_i32 s17, s17, s6 +; GCN-NEXT: s_mul_hi_u32 s6, s18, s17 +; GCN-NEXT: s_mul_i32 s6, s6, s16 +; GCN-NEXT: s_sub_i32 s6, s18, s6 +; GCN-NEXT: s_sub_i32 s17, s6, s16 +; GCN-NEXT: s_cmp_ge_u32 s6, s16 +; GCN-NEXT: s_cselect_b32 s6, s17, s6 +; GCN-NEXT: s_sub_i32 s17, s6, s16 +; GCN-NEXT: s_cmp_ge_u32 s6, s16 +; GCN-NEXT: s_cselect_b32 s6, s17, s6 ; GCN-NEXT: .LBB12_3: -; GCN-NEXT: v_or_b32_e32 v11, v17, v13 -; GCN-NEXT: v_mov_b32_e32 v10, 0 -; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; GCN-NEXT: s_cbranch_vccz .LBB12_14 +; GCN-NEXT: s_or_b64 s[16:17], s[14:15], s[12:13] +; GCN-NEXT: s_mov_b32 s16, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_cbranch_scc0 .LBB12_7 ; GCN-NEXT: ; %bb.4: -; GCN-NEXT: v_ashrrev_i32_e32 v10, 31, v13 -; GCN-NEXT: v_add_co_u32_e32 v11, vcc, v12, v10 -; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v10, vcc -; GCN-NEXT: v_xor_b32_e32 v11, v11, v10 -; GCN-NEXT: v_xor_b32_e32 v10, v13, v10 -; GCN-NEXT: v_cvt_f32_u32_e32 v13, v11 -; GCN-NEXT: v_cvt_f32_u32_e32 v14, v10 -; GCN-NEXT: v_sub_co_u32_e32 v15, vcc, 0, v11 -; GCN-NEXT: v_subb_co_u32_e32 v18, vcc, 0, v10, vcc -; GCN-NEXT: v_madmk_f32 v13, v14, 0x4f800000, v13 -; GCN-NEXT: v_rcp_f32_e32 v13, v13 -; GCN-NEXT: v_mul_f32_e32 v13, 0x5f7ffffc, v13 -; GCN-NEXT: v_mul_f32_e32 v14, 0x2f800000, v13 -; GCN-NEXT: v_trunc_f32_e32 v14, v14 -; GCN-NEXT: v_madmk_f32 v13, v14, 0xcf800000, v13 -; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GCN-NEXT: v_mul_lo_u32 v20, v15, v14 -; GCN-NEXT: v_mul_hi_u32 v19, v15, v13 -; GCN-NEXT: v_mul_lo_u32 v21, v18, v13 -; GCN-NEXT: v_mul_lo_u32 v22, v15, v13 -; GCN-NEXT: v_add_u32_e32 v19, v19, v20 -; GCN-NEXT: v_add_u32_e32 v19, v19, v21 -; GCN-NEXT: v_mul_lo_u32 v20, v13, v19 -; GCN-NEXT: v_mul_hi_u32 v21, v13, v22 -; GCN-NEXT: v_mul_hi_u32 v23, v13, v19 -; GCN-NEXT: v_mul_hi_u32 v24, v14, v19 -; GCN-NEXT: v_mul_lo_u32 v19, v14, v19 -; GCN-NEXT: v_add_co_u32_e32 v20, vcc, v21, v20 -; GCN-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v23, vcc -; GCN-NEXT: v_mul_lo_u32 v23, v14, v22 -; GCN-NEXT: v_mul_hi_u32 v22, v14, v22 -; GCN-NEXT: v_add_co_u32_e32 v20, vcc, v20, v23 -; GCN-NEXT: v_addc_co_u32_e32 v20, vcc, v21, v22, vcc -; GCN-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v24, vcc -; GCN-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19 -; GCN-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v21, vcc -; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v13, v19 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v20, vcc -; GCN-NEXT: v_mul_lo_u32 v19, v15, v14 -; GCN-NEXT: v_mul_hi_u32 v20, v15, v13 -; GCN-NEXT: v_mul_lo_u32 v18, v18, v13 -; GCN-NEXT: v_mul_lo_u32 
v15, v15, v13 -; GCN-NEXT: v_add_u32_e32 v19, v20, v19 -; GCN-NEXT: v_add_u32_e32 v18, v19, v18 -; GCN-NEXT: v_mul_lo_u32 v21, v13, v18 -; GCN-NEXT: v_mul_hi_u32 v22, v13, v15 -; GCN-NEXT: v_mul_hi_u32 v23, v13, v18 -; GCN-NEXT: v_mul_hi_u32 v20, v14, v15 -; GCN-NEXT: v_mul_lo_u32 v15, v14, v15 -; GCN-NEXT: v_mul_hi_u32 v19, v14, v18 -; GCN-NEXT: v_add_co_u32_e32 v21, vcc, v22, v21 -; GCN-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v23, vcc -; GCN-NEXT: v_mul_lo_u32 v18, v14, v18 -; GCN-NEXT: v_add_co_u32_e32 v15, vcc, v21, v15 -; GCN-NEXT: v_addc_co_u32_e32 v15, vcc, v22, v20, vcc -; GCN-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v19, vcc -; GCN-NEXT: v_add_co_u32_e32 v15, vcc, v15, v18 -; GCN-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v19, vcc -; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v13, v15 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v18, vcc -; GCN-NEXT: v_ashrrev_i32_e32 v15, 31, v17 -; GCN-NEXT: v_add_co_u32_e32 v18, vcc, v16, v15 -; GCN-NEXT: v_xor_b32_e32 v18, v18, v15 -; GCN-NEXT: v_mul_lo_u32 v19, v18, v14 -; GCN-NEXT: v_mul_hi_u32 v20, v18, v13 -; GCN-NEXT: v_mul_hi_u32 v21, v18, v14 -; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v15, vcc -; GCN-NEXT: v_xor_b32_e32 v17, v17, v15 -; GCN-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19 -; GCN-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v21, vcc -; GCN-NEXT: v_mul_lo_u32 v21, v17, v13 -; GCN-NEXT: v_mul_hi_u32 v13, v17, v13 -; GCN-NEXT: v_mul_hi_u32 v22, v17, v14 -; GCN-NEXT: v_mul_lo_u32 v14, v17, v14 -; GCN-NEXT: v_add_co_u32_e32 v19, vcc, v19, v21 -; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, v20, v13, vcc -; GCN-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v22, vcc -; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v13, v14 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v19, vcc -; GCN-NEXT: v_mul_lo_u32 v14, v11, v14 -; GCN-NEXT: v_mul_hi_u32 v19, v11, v13 -; GCN-NEXT: v_mul_lo_u32 v20, v10, v13 -; GCN-NEXT: v_mul_lo_u32 v13, v11, v13 -; GCN-NEXT: v_add_u32_e32 v14, v19, v14 -; GCN-NEXT: v_add_u32_e32 v14, v14, v20 -; GCN-NEXT: v_sub_u32_e32 v19, v17, v14 -; GCN-NEXT: v_sub_co_u32_e32 v13, vcc, v18, v13 -; GCN-NEXT: v_subb_co_u32_e64 v18, s[0:1], v19, v10, vcc -; GCN-NEXT: v_sub_co_u32_e64 v19, s[0:1], v13, v11 -; GCN-NEXT: v_subbrev_co_u32_e64 v20, s[2:3], 0, v18, s[0:1] -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v20, v10 -; GCN-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v19, v11 -; GCN-NEXT: v_subb_co_u32_e32 v14, vcc, v17, v14, vcc -; GCN-NEXT: v_cndmask_b32_e64 v22, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], v20, v10 -; GCN-NEXT: v_subb_co_u32_e64 v18, s[0:1], v18, v10, s[0:1] -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v14, v10 -; GCN-NEXT: v_cndmask_b32_e64 v21, v21, v22, s[2:3] -; GCN-NEXT: v_sub_co_u32_e64 v22, s[0:1], v19, v11 -; GCN-NEXT: v_cndmask_b32_e64 v17, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v13, v11 -; GCN-NEXT: v_subbrev_co_u32_e64 v18, s[0:1], 0, v18, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v14, v10 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v21 -; GCN-NEXT: v_cndmask_b32_e32 v10, v17, v11, vcc -; GCN-NEXT: v_cndmask_b32_e64 v19, v19, v22, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; GCN-NEXT: v_cndmask_b32_e64 v18, v20, v18, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v11, v13, v19, vcc -; GCN-NEXT: v_cndmask_b32_e32 v10, v14, v18, vcc -; GCN-NEXT: v_xor_b32_e32 v11, v11, v15 -; GCN-NEXT: v_xor_b32_e32 v13, v10, v15 -; GCN-NEXT: v_sub_co_u32_e32 v10, vcc, v11, v15 -; GCN-NEXT: v_subb_co_u32_e32 v11, vcc, v13, v15, vcc -; GCN-NEXT: s_cbranch_execnz .LBB12_6 +; GCN-NEXT: 
s_ashr_i32 s16, s13, 31 +; GCN-NEXT: s_add_u32 s18, s12, s16 +; GCN-NEXT: s_mov_b32 s17, s16 +; GCN-NEXT: s_addc_u32 s19, s13, s16 +; GCN-NEXT: s_xor_b64 s[18:19], s[18:19], s[16:17] +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s18 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s19 +; GCN-NEXT: s_sub_u32 s13, 0, s18 +; GCN-NEXT: s_subb_u32 s22, 0, s19 +; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_readfirstlane_b32 s23, v1 +; GCN-NEXT: v_readfirstlane_b32 s20, v0 +; GCN-NEXT: s_mul_i32 s21, s13, s23 +; GCN-NEXT: s_mul_hi_u32 s25, s13, s20 +; GCN-NEXT: s_mul_i32 s24, s22, s20 +; GCN-NEXT: s_add_i32 s21, s25, s21 +; GCN-NEXT: s_add_i32 s21, s21, s24 +; GCN-NEXT: s_mul_i32 s26, s13, s20 +; GCN-NEXT: s_mul_i32 s25, s20, s21 +; GCN-NEXT: s_mul_hi_u32 s27, s20, s26 +; GCN-NEXT: s_mul_hi_u32 s24, s20, s21 +; GCN-NEXT: s_add_u32 s25, s27, s25 +; GCN-NEXT: s_addc_u32 s24, 0, s24 +; GCN-NEXT: s_mul_hi_u32 s28, s23, s26 +; GCN-NEXT: s_mul_i32 s26, s23, s26 +; GCN-NEXT: s_add_u32 s25, s25, s26 +; GCN-NEXT: s_mul_hi_u32 s27, s23, s21 +; GCN-NEXT: s_addc_u32 s24, s24, s28 +; GCN-NEXT: s_addc_u32 s25, s27, 0 +; GCN-NEXT: s_mul_i32 s21, s23, s21 +; GCN-NEXT: s_add_u32 s21, s24, s21 +; GCN-NEXT: s_addc_u32 s24, 0, s25 +; GCN-NEXT: s_add_u32 s25, s20, s21 +; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_addc_u32 s23, s23, s24 +; GCN-NEXT: s_mul_i32 s20, s13, s23 +; GCN-NEXT: s_mul_hi_u32 s21, s13, s25 +; GCN-NEXT: s_add_i32 s20, s21, s20 +; GCN-NEXT: s_mul_i32 s22, s22, s25 +; GCN-NEXT: s_add_i32 s20, s20, s22 +; GCN-NEXT: s_mul_i32 s13, s13, s25 +; GCN-NEXT: s_mul_hi_u32 s22, s23, s13 +; GCN-NEXT: s_mul_i32 s24, s23, s13 +; GCN-NEXT: s_mul_i32 s27, s25, s20 +; GCN-NEXT: s_mul_hi_u32 s13, s25, s13 +; GCN-NEXT: s_mul_hi_u32 s26, s25, s20 +; GCN-NEXT: s_add_u32 s13, s13, s27 +; GCN-NEXT: s_addc_u32 s26, 0, s26 +; GCN-NEXT: s_add_u32 s13, s13, s24 +; GCN-NEXT: s_mul_hi_u32 s21, s23, s20 +; GCN-NEXT: s_addc_u32 s13, s26, s22 +; GCN-NEXT: s_addc_u32 s21, s21, 0 +; GCN-NEXT: s_mul_i32 s20, s23, s20 +; GCN-NEXT: s_add_u32 s13, s13, s20 +; GCN-NEXT: s_addc_u32 s22, 0, s21 +; GCN-NEXT: s_add_u32 s13, s25, s13 +; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_addc_u32 s24, s23, s22 +; GCN-NEXT: s_ashr_i32 s20, s15, 31 +; GCN-NEXT: s_add_u32 s22, s14, s20 +; GCN-NEXT: s_mov_b32 s21, s20 +; GCN-NEXT: s_addc_u32 s23, s15, s20 +; GCN-NEXT: s_xor_b64 s[22:23], s[22:23], s[20:21] +; GCN-NEXT: s_mul_i32 s25, s22, s24 +; GCN-NEXT: s_mul_hi_u32 s26, s22, s13 +; GCN-NEXT: s_mul_hi_u32 s15, s22, s24 +; GCN-NEXT: s_add_u32 s25, s26, s25 +; GCN-NEXT: s_addc_u32 s15, 0, s15 +; GCN-NEXT: s_mul_hi_u32 s27, s23, s13 +; GCN-NEXT: s_mul_i32 s13, s23, s13 +; GCN-NEXT: s_add_u32 s13, s25, s13 +; GCN-NEXT: s_mul_hi_u32 s26, s23, s24 +; GCN-NEXT: s_addc_u32 s13, s15, s27 +; GCN-NEXT: s_addc_u32 s15, s26, 0 +; GCN-NEXT: s_mul_i32 s24, s23, s24 +; GCN-NEXT: s_add_u32 s13, s13, s24 +; GCN-NEXT: s_addc_u32 s15, 0, s15 +; GCN-NEXT: s_mul_i32 s15, s18, s15 +; GCN-NEXT: s_mul_hi_u32 s24, s18, s13 +; GCN-NEXT: s_add_i32 s15, s24, s15 +; GCN-NEXT: s_mul_i32 s24, s19, s13 +; GCN-NEXT: s_add_i32 s15, s15, s24 +; GCN-NEXT: s_sub_i32 s26, s23, s15 +; GCN-NEXT: s_mul_i32 s13, s18, s13 +; 
GCN-NEXT: s_sub_u32 s13, s22, s13 +; GCN-NEXT: s_cselect_b64 s[24:25], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 +; GCN-NEXT: s_subb_u32 s22, s26, s19 +; GCN-NEXT: s_sub_u32 s28, s13, s18 +; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 +; GCN-NEXT: s_subb_u32 s29, s22, 0 +; GCN-NEXT: s_cmp_ge_u32 s29, s19 +; GCN-NEXT: s_cselect_b32 s30, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s28, s18 +; GCN-NEXT: s_cselect_b32 s31, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s29, s19 +; GCN-NEXT: s_cselect_b32 s30, s31, s30 +; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 +; GCN-NEXT: s_subb_u32 s22, s22, s19 +; GCN-NEXT: s_sub_u32 s31, s28, s18 +; GCN-NEXT: s_cselect_b64 s[26:27], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[26:27], 0 +; GCN-NEXT: s_subb_u32 s22, s22, 0 +; GCN-NEXT: s_cmp_lg_u32 s30, 0 +; GCN-NEXT: s_cselect_b32 s26, s31, s28 +; GCN-NEXT: s_cselect_b32 s22, s22, s29 +; GCN-NEXT: s_cmp_lg_u64 s[24:25], 0 +; GCN-NEXT: s_subb_u32 s15, s23, s15 +; GCN-NEXT: s_cmp_ge_u32 s15, s19 +; GCN-NEXT: s_cselect_b32 s23, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s18 +; GCN-NEXT: s_cselect_b32 s18, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s15, s19 +; GCN-NEXT: s_cselect_b32 s18, s18, s23 +; GCN-NEXT: s_cmp_lg_u32 s18, 0 +; GCN-NEXT: s_cselect_b32 s19, s22, s15 +; GCN-NEXT: s_cselect_b32 s18, s26, s13 +; GCN-NEXT: s_xor_b64 s[18:19], s[18:19], s[20:21] +; GCN-NEXT: s_sub_u32 s18, s18, s20 +; GCN-NEXT: s_subb_u32 s19, s19, s20 +; GCN-NEXT: s_cbranch_execnz .LBB12_8 ; GCN-NEXT: .LBB12_5: -; GCN-NEXT: v_cvt_f32_u32_e32 v10, v12 -; GCN-NEXT: v_sub_u32_e32 v11, 0, v12 -; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 -; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GCN-NEXT: v_mul_lo_u32 v11, v11, v10 -; GCN-NEXT: v_mul_hi_u32 v11, v10, v11 -; GCN-NEXT: v_add_u32_e32 v10, v10, v11 -; GCN-NEXT: v_mul_hi_u32 v10, v16, v10 -; GCN-NEXT: v_mul_lo_u32 v10, v10, v12 -; GCN-NEXT: v_sub_u32_e32 v10, v16, v10 -; GCN-NEXT: v_sub_u32_e32 v11, v10, v12 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v10, v12 -; GCN-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; GCN-NEXT: v_sub_u32_e32 v11, v10, v12 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v10, v12 -; GCN-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc -; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 +; GCN-NEXT: s_sub_i32 s13, 0, s12 +; GCN-NEXT: v_mov_b32_e32 v3, 0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_mul_lo_u32 v1, s13, v0 +; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-NEXT: v_add_u32_e32 v0, v0, v1 +; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s12 +; GCN-NEXT: v_sub_u32_e32 v0, s14, v0 +; GCN-NEXT: v_subrev_u32_e32 v1, s12, v0 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_subrev_u32_e32 v1, s12, v0 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 +; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GCN-NEXT: s_branch .LBB12_9 ; GCN-NEXT: .LBB12_6: -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_or_b32_e32 v13, v5, v1 -; GCN-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] -; GCN-NEXT: s_cbranch_vccz .LBB12_15 -; GCN-NEXT: ; %bb.7: -; GCN-NEXT: v_ashrrev_i32_e32 v13, 31, v1 -; GCN-NEXT: v_add_co_u32_e32 v12, vcc, v0, v13 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v13, vcc -; GCN-NEXT: v_xor_b32_e32 v12, v12, v13 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v13 -; GCN-NEXT: v_cvt_f32_u32_e32 v13, v12 -; GCN-NEXT: v_cvt_f32_u32_e32 v14, v1 -; GCN-NEXT: v_sub_co_u32_e32 v15, 
vcc, 0, v12 -; GCN-NEXT: v_subb_co_u32_e32 v16, vcc, 0, v1, vcc -; GCN-NEXT: v_madmk_f32 v13, v14, 0x4f800000, v13 -; GCN-NEXT: v_rcp_f32_e32 v13, v13 -; GCN-NEXT: v_mul_f32_e32 v13, 0x5f7ffffc, v13 -; GCN-NEXT: v_mul_f32_e32 v14, 0x2f800000, v13 -; GCN-NEXT: v_trunc_f32_e32 v14, v14 -; GCN-NEXT: v_madmk_f32 v13, v14, 0xcf800000, v13 -; GCN-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GCN-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GCN-NEXT: v_mul_lo_u32 v18, v15, v14 -; GCN-NEXT: v_mul_hi_u32 v17, v15, v13 -; GCN-NEXT: v_mul_lo_u32 v19, v16, v13 -; GCN-NEXT: v_mul_lo_u32 v20, v15, v13 -; GCN-NEXT: v_add_u32_e32 v17, v17, v18 -; GCN-NEXT: v_add_u32_e32 v17, v17, v19 -; GCN-NEXT: v_mul_lo_u32 v18, v13, v17 -; GCN-NEXT: v_mul_hi_u32 v19, v13, v20 -; GCN-NEXT: v_mul_hi_u32 v21, v13, v17 -; GCN-NEXT: v_mul_hi_u32 v22, v14, v17 -; GCN-NEXT: v_mul_lo_u32 v17, v14, v17 -; GCN-NEXT: v_add_co_u32_e32 v18, vcc, v19, v18 -; GCN-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v21, vcc -; GCN-NEXT: v_mul_lo_u32 v21, v14, v20 -; GCN-NEXT: v_mul_hi_u32 v20, v14, v20 -; GCN-NEXT: v_add_co_u32_e32 v18, vcc, v18, v21 -; GCN-NEXT: v_addc_co_u32_e32 v18, vcc, v19, v20, vcc -; GCN-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v22, vcc -; GCN-NEXT: v_add_co_u32_e32 v17, vcc, v18, v17 -; GCN-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v19, vcc -; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v13, v17 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v18, vcc -; GCN-NEXT: v_mul_lo_u32 v17, v15, v14 -; GCN-NEXT: v_mul_hi_u32 v18, v15, v13 -; GCN-NEXT: v_mul_lo_u32 v16, v16, v13 -; GCN-NEXT: v_mul_lo_u32 v15, v15, v13 -; GCN-NEXT: v_add_u32_e32 v17, v18, v17 -; GCN-NEXT: v_add_u32_e32 v16, v17, v16 -; GCN-NEXT: v_mul_lo_u32 v19, v13, v16 -; GCN-NEXT: v_mul_hi_u32 v20, v13, v15 -; GCN-NEXT: v_mul_hi_u32 v21, v13, v16 -; GCN-NEXT: v_mul_hi_u32 v18, v14, v15 -; GCN-NEXT: v_mul_lo_u32 v15, v14, v15 -; GCN-NEXT: v_mul_hi_u32 v17, v14, v16 -; GCN-NEXT: v_add_co_u32_e32 v19, vcc, v20, v19 -; GCN-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v21, vcc -; GCN-NEXT: v_mul_lo_u32 v16, v14, v16 -; GCN-NEXT: v_add_co_u32_e32 v15, vcc, v19, v15 -; GCN-NEXT: v_addc_co_u32_e32 v15, vcc, v20, v18, vcc -; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc -; GCN-NEXT: v_add_co_u32_e32 v15, vcc, v15, v16 -; GCN-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v17, vcc -; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v13, v15 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, v14, v16, vcc -; GCN-NEXT: v_ashrrev_i32_e32 v15, 31, v5 -; GCN-NEXT: v_add_co_u32_e32 v16, vcc, v4, v15 -; GCN-NEXT: v_xor_b32_e32 v16, v16, v15 -; GCN-NEXT: v_mul_lo_u32 v17, v16, v14 -; GCN-NEXT: v_mul_hi_u32 v18, v16, v13 -; GCN-NEXT: v_mul_hi_u32 v19, v16, v14 -; GCN-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v15, vcc -; GCN-NEXT: v_xor_b32_e32 v5, v5, v15 -; GCN-NEXT: v_add_co_u32_e32 v17, vcc, v18, v17 -; GCN-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v19, vcc -; GCN-NEXT: v_mul_lo_u32 v19, v5, v13 -; GCN-NEXT: v_mul_hi_u32 v13, v5, v13 -; GCN-NEXT: v_mul_hi_u32 v20, v5, v14 -; GCN-NEXT: v_mul_lo_u32 v14, v5, v14 -; GCN-NEXT: v_add_co_u32_e32 v17, vcc, v17, v19 -; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, v18, v13, vcc -; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v20, vcc -; GCN-NEXT: v_add_co_u32_e32 v13, vcc, v13, v14 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v17, vcc -; GCN-NEXT: v_mul_lo_u32 v14, v12, v14 -; GCN-NEXT: v_mul_hi_u32 v17, v12, v13 -; GCN-NEXT: v_mul_lo_u32 v18, v1, v13 -; GCN-NEXT: v_mul_lo_u32 v13, v12, v13 -; GCN-NEXT: v_add_u32_e32 v14, v17, v14 -; GCN-NEXT: v_add_u32_e32 v14, v14, v18 -; GCN-NEXT: v_sub_u32_e32 v17, v5, v14 -; GCN-NEXT: 
v_sub_co_u32_e32 v13, vcc, v16, v13 -; GCN-NEXT: v_subb_co_u32_e64 v16, s[0:1], v17, v1, vcc -; GCN-NEXT: v_sub_co_u32_e64 v17, s[0:1], v13, v12 -; GCN-NEXT: v_subbrev_co_u32_e64 v18, s[2:3], 0, v16, s[0:1] -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v18, v1 -; GCN-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v17, v12 -; GCN-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v14, vcc -; GCN-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], v18, v1 -; GCN-NEXT: v_subb_co_u32_e64 v16, s[0:1], v16, v1, s[0:1] -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 -; GCN-NEXT: v_cndmask_b32_e64 v19, v19, v20, s[2:3] -; GCN-NEXT: v_sub_co_u32_e64 v20, s[0:1], v17, v12 -; GCN-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v13, v12 -; GCN-NEXT: v_subbrev_co_u32_e64 v16, s[0:1], 0, v16, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v19 -; GCN-NEXT: v_cndmask_b32_e32 v1, v14, v12, vcc -; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v16, v18, v16, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v16, vcc -; GCN-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc -; GCN-NEXT: v_xor_b32_e32 v5, v5, v15 -; GCN-NEXT: v_xor_b32_e32 v1, v1, v15 -; GCN-NEXT: v_sub_co_u32_e32 v12, vcc, v5, v15 -; GCN-NEXT: v_subb_co_u32_e32 v13, vcc, v1, v15, vcc -; GCN-NEXT: s_cbranch_execnz .LBB12_9 +; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7 +; GCN-NEXT: s_branch .LBB12_2 +; GCN-NEXT: .LBB12_7: +; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19 +; GCN-NEXT: s_branch .LBB12_5 ; GCN-NEXT: .LBB12_8: -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: v_sub_u32_e32 v5, 0, v0 -; GCN-NEXT: v_mov_b32_e32 v13, 0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GCN-NEXT: v_mul_lo_u32 v5, v5, v1 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 -; GCN-NEXT: v_add_u32_e32 v1, v1, v5 -; GCN-NEXT: v_mul_hi_u32 v1, v4, v1 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v0 -; GCN-NEXT: v_sub_u32_e32 v1, v4, v1 -; GCN-NEXT: v_sub_u32_e32 v4, v1, v0 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN-NEXT: v_sub_u32_e32 v4, v1, v0 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 -; GCN-NEXT: v_cndmask_b32_e32 v12, v1, v4, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_mov_b32_e32 v3, s19 ; GCN-NEXT: .LBB12_9: -; GCN-NEXT: v_or_b32_e32 v1, v7, v3 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GCN-NEXT: s_cbranch_vccz .LBB12_16 +; GCN-NEXT: s_or_b64 s[12:13], s[10:11], s[8:9] +; GCN-NEXT: s_mov_b32 s12, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_cbranch_scc0 .LBB12_12 ; GCN-NEXT: ; %bb.10: -; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GCN-NEXT: v_add_co_u32_e32 v1, vcc, v2, v0 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v0, vcc -; GCN-NEXT: v_xor_b32_e32 v1, v1, v0 -; GCN-NEXT: v_xor_b32_e32 v0, v3, v0 -; GCN-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GCN-NEXT: v_cvt_f32_u32_e32 v4, v0 -; GCN-NEXT: v_sub_co_u32_e32 v5, vcc, 0, v1 -; GCN-NEXT: v_subb_co_u32_e32 v14, vcc, 0, v0, vcc -; GCN-NEXT: v_madmk_f32 v3, v4, 0x4f800000, v3 -; GCN-NEXT: v_rcp_f32_e32 v3, v3 -; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 -; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_madmk_f32 v3, v4, 0xcf800000, v3 -; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 
-; GCN-NEXT: v_mul_lo_u32 v16, v5, v4 -; GCN-NEXT: v_mul_hi_u32 v15, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v17, v14, v3 -; GCN-NEXT: v_mul_lo_u32 v18, v5, v3 -; GCN-NEXT: v_add_u32_e32 v15, v15, v16 -; GCN-NEXT: v_add_u32_e32 v15, v15, v17 -; GCN-NEXT: v_mul_lo_u32 v16, v3, v15 -; GCN-NEXT: v_mul_hi_u32 v17, v3, v18 -; GCN-NEXT: v_mul_hi_u32 v19, v3, v15 -; GCN-NEXT: v_mul_hi_u32 v20, v4, v15 -; GCN-NEXT: v_mul_lo_u32 v15, v4, v15 -; GCN-NEXT: v_add_co_u32_e32 v16, vcc, v17, v16 -; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v19, vcc -; GCN-NEXT: v_mul_lo_u32 v19, v4, v18 -; GCN-NEXT: v_mul_hi_u32 v18, v4, v18 -; GCN-NEXT: v_add_co_u32_e32 v16, vcc, v16, v19 -; GCN-NEXT: v_addc_co_u32_e32 v16, vcc, v17, v18, vcc -; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v20, vcc -; GCN-NEXT: v_add_co_u32_e32 v15, vcc, v16, v15 -; GCN-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v17, vcc -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v3, v15 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v16, vcc -; GCN-NEXT: v_mul_lo_u32 v15, v5, v4 -; GCN-NEXT: v_mul_hi_u32 v16, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v14, v14, v3 -; GCN-NEXT: v_mul_lo_u32 v5, v5, v3 -; GCN-NEXT: v_add_u32_e32 v15, v16, v15 -; GCN-NEXT: v_add_u32_e32 v14, v15, v14 -; GCN-NEXT: v_mul_lo_u32 v17, v3, v14 -; GCN-NEXT: v_mul_hi_u32 v18, v3, v5 -; GCN-NEXT: v_mul_hi_u32 v19, v3, v14 -; GCN-NEXT: v_mul_hi_u32 v16, v4, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v4, v5 -; GCN-NEXT: v_mul_hi_u32 v15, v4, v14 -; GCN-NEXT: v_add_co_u32_e32 v17, vcc, v18, v17 -; GCN-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v19, vcc -; GCN-NEXT: v_mul_lo_u32 v14, v4, v14 -; GCN-NEXT: v_add_co_u32_e32 v5, vcc, v17, v5 -; GCN-NEXT: v_addc_co_u32_e32 v5, vcc, v18, v16, vcc -; GCN-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GCN-NEXT: v_add_co_u32_e32 v5, vcc, v5, v14 -; GCN-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v15, vcc -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v14, vcc -; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v7 -; GCN-NEXT: v_add_co_u32_e32 v14, vcc, v6, v5 -; GCN-NEXT: v_xor_b32_e32 v14, v14, v5 -; GCN-NEXT: v_mul_lo_u32 v15, v14, v4 -; GCN-NEXT: v_mul_hi_u32 v16, v14, v3 -; GCN-NEXT: v_mul_hi_u32 v17, v14, v4 -; GCN-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v5, vcc -; GCN-NEXT: v_xor_b32_e32 v7, v7, v5 -; GCN-NEXT: v_add_co_u32_e32 v15, vcc, v16, v15 -; GCN-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v17, vcc -; GCN-NEXT: v_mul_lo_u32 v17, v7, v3 -; GCN-NEXT: v_mul_hi_u32 v3, v7, v3 -; GCN-NEXT: v_mul_hi_u32 v18, v7, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v7, v4 -; GCN-NEXT: v_add_co_u32_e32 v15, vcc, v15, v17 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v16, v3, vcc -; GCN-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v18, vcc -; GCN-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v15, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v15, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v16, v0, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_add_u32_e32 v4, v15, v4 -; GCN-NEXT: v_add_u32_e32 v4, v4, v16 -; GCN-NEXT: v_sub_u32_e32 v15, v7, v4 -; GCN-NEXT: v_sub_co_u32_e32 v3, vcc, v14, v3 -; GCN-NEXT: v_subb_co_u32_e64 v14, s[0:1], v15, v0, vcc -; GCN-NEXT: v_sub_co_u32_e64 v15, s[0:1], v3, v1 -; GCN-NEXT: v_subbrev_co_u32_e64 v16, s[2:3], 0, v14, s[0:1] -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v16, v0 -; GCN-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_ge_u32_e64 s[2:3], v15, v1 -; GCN-NEXT: v_subb_co_u32_e32 v4, vcc, v7, v4, vcc -; GCN-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], v16, v0 -; GCN-NEXT: 
v_subb_co_u32_e64 v14, s[0:1], v14, v0, s[0:1] -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v4, v0 -; GCN-NEXT: v_cndmask_b32_e64 v17, v17, v18, s[2:3] -; GCN-NEXT: v_sub_co_u32_e64 v18, s[0:1], v15, v1 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1 -; GCN-NEXT: v_subbrev_co_u32_e64 v14, s[0:1], 0, v14, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v4, v0 -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v17 -; GCN-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc -; GCN-NEXT: v_cndmask_b32_e64 v15, v15, v18, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GCN-NEXT: v_cndmask_b32_e64 v14, v16, v14, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v15, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v14, vcc -; GCN-NEXT: v_xor_b32_e32 v1, v1, v5 -; GCN-NEXT: v_xor_b32_e32 v0, v0, v5 -; GCN-NEXT: v_sub_co_u32_e32 v14, vcc, v1, v5 -; GCN-NEXT: v_subb_co_u32_e32 v15, vcc, v0, v5, vcc -; GCN-NEXT: s_cbranch_execnz .LBB12_12 +; GCN-NEXT: s_ashr_i32 s12, s9, 31 +; GCN-NEXT: s_add_u32 s14, s8, s12 +; GCN-NEXT: s_mov_b32 s13, s12 +; GCN-NEXT: s_addc_u32 s15, s9, s12 +; GCN-NEXT: s_xor_b64 s[14:15], s[14:15], s[12:13] +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15 +; GCN-NEXT: s_sub_u32 s9, 0, s14 +; GCN-NEXT: s_subb_u32 s18, 0, s15 +; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_readfirstlane_b32 s19, v1 +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: s_mul_i32 s17, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s21, s9, s16 +; GCN-NEXT: s_mul_i32 s20, s18, s16 +; GCN-NEXT: s_add_i32 s17, s21, s17 +; GCN-NEXT: s_add_i32 s17, s17, s20 +; GCN-NEXT: s_mul_i32 s22, s9, s16 +; GCN-NEXT: s_mul_i32 s21, s16, s17 +; GCN-NEXT: s_mul_hi_u32 s23, s16, s22 +; GCN-NEXT: s_mul_hi_u32 s20, s16, s17 +; GCN-NEXT: s_add_u32 s21, s23, s21 +; GCN-NEXT: s_addc_u32 s20, 0, s20 +; GCN-NEXT: s_mul_hi_u32 s24, s19, s22 +; GCN-NEXT: s_mul_i32 s22, s19, s22 +; GCN-NEXT: s_add_u32 s21, s21, s22 +; GCN-NEXT: s_mul_hi_u32 s23, s19, s17 +; GCN-NEXT: s_addc_u32 s20, s20, s24 +; GCN-NEXT: s_addc_u32 s21, s23, 0 +; GCN-NEXT: s_mul_i32 s17, s19, s17 +; GCN-NEXT: s_add_u32 s17, s20, s17 +; GCN-NEXT: s_addc_u32 s20, 0, s21 +; GCN-NEXT: s_add_u32 s21, s16, s17 +; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_addc_u32 s19, s19, s20 +; GCN-NEXT: s_mul_i32 s16, s9, s19 +; GCN-NEXT: s_mul_hi_u32 s17, s9, s21 +; GCN-NEXT: s_add_i32 s16, s17, s16 +; GCN-NEXT: s_mul_i32 s18, s18, s21 +; GCN-NEXT: s_add_i32 s16, s16, s18 +; GCN-NEXT: s_mul_i32 s9, s9, s21 +; GCN-NEXT: s_mul_hi_u32 s18, s19, s9 +; GCN-NEXT: s_mul_i32 s20, s19, s9 +; GCN-NEXT: s_mul_i32 s23, s21, s16 +; GCN-NEXT: s_mul_hi_u32 s9, s21, s9 +; GCN-NEXT: s_mul_hi_u32 s22, s21, s16 +; GCN-NEXT: s_add_u32 s9, s9, s23 +; GCN-NEXT: s_addc_u32 s22, 0, s22 +; GCN-NEXT: s_add_u32 s9, s9, s20 +; GCN-NEXT: s_mul_hi_u32 s17, s19, s16 +; GCN-NEXT: s_addc_u32 s9, s22, s18 +; GCN-NEXT: s_addc_u32 s17, s17, 0 +; GCN-NEXT: s_mul_i32 s16, s19, s16 +; GCN-NEXT: s_add_u32 s9, s9, s16 +; GCN-NEXT: s_addc_u32 s18, 0, s17 +; GCN-NEXT: s_add_u32 s9, s21, s9 +; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_addc_u32 s20, s19, s18 +; GCN-NEXT: 
s_ashr_i32 s16, s11, 31 +; GCN-NEXT: s_add_u32 s18, s10, s16 +; GCN-NEXT: s_mov_b32 s17, s16 +; GCN-NEXT: s_addc_u32 s19, s11, s16 +; GCN-NEXT: s_xor_b64 s[18:19], s[18:19], s[16:17] +; GCN-NEXT: s_mul_i32 s21, s18, s20 +; GCN-NEXT: s_mul_hi_u32 s22, s18, s9 +; GCN-NEXT: s_mul_hi_u32 s11, s18, s20 +; GCN-NEXT: s_add_u32 s21, s22, s21 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: s_mul_hi_u32 s23, s19, s9 +; GCN-NEXT: s_mul_i32 s9, s19, s9 +; GCN-NEXT: s_add_u32 s9, s21, s9 +; GCN-NEXT: s_mul_hi_u32 s22, s19, s20 +; GCN-NEXT: s_addc_u32 s9, s11, s23 +; GCN-NEXT: s_addc_u32 s11, s22, 0 +; GCN-NEXT: s_mul_i32 s20, s19, s20 +; GCN-NEXT: s_add_u32 s9, s9, s20 +; GCN-NEXT: s_addc_u32 s11, 0, s11 +; GCN-NEXT: s_mul_i32 s11, s14, s11 +; GCN-NEXT: s_mul_hi_u32 s20, s14, s9 +; GCN-NEXT: s_add_i32 s11, s20, s11 +; GCN-NEXT: s_mul_i32 s20, s15, s9 +; GCN-NEXT: s_add_i32 s11, s11, s20 +; GCN-NEXT: s_sub_i32 s22, s19, s11 +; GCN-NEXT: s_mul_i32 s9, s14, s9 +; GCN-NEXT: s_sub_u32 s9, s18, s9 +; GCN-NEXT: s_cselect_b64 s[20:21], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_subb_u32 s18, s22, s15 +; GCN-NEXT: s_sub_u32 s24, s9, s14 +; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 +; GCN-NEXT: s_subb_u32 s25, s18, 0 +; GCN-NEXT: s_cmp_ge_u32 s25, s15 +; GCN-NEXT: s_cselect_b32 s26, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s24, s14 +; GCN-NEXT: s_cselect_b32 s27, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s25, s15 +; GCN-NEXT: s_cselect_b32 s26, s27, s26 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 +; GCN-NEXT: s_subb_u32 s18, s18, s15 +; GCN-NEXT: s_sub_u32 s27, s24, s14 +; GCN-NEXT: s_cselect_b64 s[22:23], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[22:23], 0 +; GCN-NEXT: s_subb_u32 s18, s18, 0 +; GCN-NEXT: s_cmp_lg_u32 s26, 0 +; GCN-NEXT: s_cselect_b32 s22, s27, s24 +; GCN-NEXT: s_cselect_b32 s18, s18, s25 +; GCN-NEXT: s_cmp_lg_u64 s[20:21], 0 +; GCN-NEXT: s_subb_u32 s11, s19, s11 +; GCN-NEXT: s_cmp_ge_u32 s11, s15 +; GCN-NEXT: s_cselect_b32 s19, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s9, s14 +; GCN-NEXT: s_cselect_b32 s14, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s11, s15 +; GCN-NEXT: s_cselect_b32 s14, s14, s19 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_cselect_b32 s15, s18, s11 +; GCN-NEXT: s_cselect_b32 s14, s22, s9 +; GCN-NEXT: s_xor_b64 s[14:15], s[14:15], s[16:17] +; GCN-NEXT: s_sub_u32 s14, s14, s16 +; GCN-NEXT: s_subb_u32 s15, s15, s16 +; GCN-NEXT: s_cbranch_execnz .LBB12_13 ; GCN-NEXT: .LBB12_11: -; GCN-NEXT: v_cvt_f32_u32_e32 v0, v2 -; GCN-NEXT: v_sub_u32_e32 v1, 0, v2 -; GCN-NEXT: v_mov_b32_e32 v15, 0 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GCN-NEXT: s_sub_i32 s9, 0, s8 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v0 +; GCN-NEXT: v_mul_lo_u32 v1, s9, v0 ; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 ; GCN-NEXT: v_add_u32_e32 v0, v0, v1 -; GCN-NEXT: v_mul_hi_u32 v0, v6, v0 -; GCN-NEXT: v_mul_lo_u32 v0, v0, v2 -; GCN-NEXT: v_sub_u32_e32 v0, v6, v0 -; GCN-NEXT: v_sub_u32_e32 v1, v0, v2 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 +; GCN-NEXT: v_sub_u32_e32 v0, s10, v0 +; GCN-NEXT: v_subrev_u32_e32 v1, s8, v0 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_sub_u32_e32 v1, v0, v2 -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 -; GCN-NEXT: v_cndmask_b32_e32 v14, v0, v1, vcc +; GCN-NEXT: v_subrev_u32_e32 v1, s8, v0 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, 
s8, v0 +; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GCN-NEXT: s_branch .LBB12_14 ; GCN-NEXT: .LBB12_12: -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dwordx4 v0, v[12:15], s[8:9] offset:16 -; GCN-NEXT: global_store_dwordx4 v0, v[8:11], s[8:9] -; GCN-NEXT: s_endpgm +; GCN-NEXT: ; implicit-def: $sgpr14_sgpr15 +; GCN-NEXT: s_branch .LBB12_11 ; GCN-NEXT: .LBB12_13: -; GCN-NEXT: ; implicit-def: $vgpr8_vgpr9 -; GCN-NEXT: s_branch .LBB12_2 +; GCN-NEXT: v_mov_b32_e32 v4, s14 +; GCN-NEXT: v_mov_b32_e32 v5, s15 ; GCN-NEXT: .LBB12_14: -; GCN-NEXT: s_branch .LBB12_5 -; GCN-NEXT: .LBB12_15: -; GCN-NEXT: ; implicit-def: $vgpr12_vgpr13 -; GCN-NEXT: s_branch .LBB12_8 +; GCN-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3] +; GCN-NEXT: s_mov_b32 s8, 0 +; GCN-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GCN-NEXT: s_cbranch_scc0 .LBB12_17 +; GCN-NEXT: ; %bb.15: +; GCN-NEXT: s_ashr_i32 s8, s3, 31 +; GCN-NEXT: s_add_u32 s10, s2, s8 +; GCN-NEXT: s_mov_b32 s9, s8 +; GCN-NEXT: s_addc_u32 s11, s3, s8 +; GCN-NEXT: s_xor_b64 s[10:11], s[10:11], s[8:9] +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11 +; GCN-NEXT: s_sub_u32 s3, 0, s10 +; GCN-NEXT: s_subb_u32 s14, 0, s11 +; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_readfirstlane_b32 s15, v1 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_mul_i32 s13, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s17, s3, s12 +; GCN-NEXT: s_mul_i32 s16, s14, s12 +; GCN-NEXT: s_add_i32 s13, s17, s13 +; GCN-NEXT: s_add_i32 s13, s13, s16 +; GCN-NEXT: s_mul_i32 s18, s3, s12 +; GCN-NEXT: s_mul_i32 s17, s12, s13 +; GCN-NEXT: s_mul_hi_u32 s19, s12, s18 +; GCN-NEXT: s_mul_hi_u32 s16, s12, s13 +; GCN-NEXT: s_add_u32 s17, s19, s17 +; GCN-NEXT: s_addc_u32 s16, 0, s16 +; GCN-NEXT: s_mul_hi_u32 s20, s15, s18 +; GCN-NEXT: s_mul_i32 s18, s15, s18 +; GCN-NEXT: s_add_u32 s17, s17, s18 +; GCN-NEXT: s_mul_hi_u32 s19, s15, s13 +; GCN-NEXT: s_addc_u32 s16, s16, s20 +; GCN-NEXT: s_addc_u32 s17, s19, 0 +; GCN-NEXT: s_mul_i32 s13, s15, s13 +; GCN-NEXT: s_add_u32 s13, s16, s13 +; GCN-NEXT: s_addc_u32 s16, 0, s17 +; GCN-NEXT: s_add_u32 s17, s12, s13 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s15, s15, s16 +; GCN-NEXT: s_mul_i32 s12, s3, s15 +; GCN-NEXT: s_mul_hi_u32 s13, s3, s17 +; GCN-NEXT: s_add_i32 s12, s13, s12 +; GCN-NEXT: s_mul_i32 s14, s14, s17 +; GCN-NEXT: s_add_i32 s12, s12, s14 +; GCN-NEXT: s_mul_i32 s3, s3, s17 +; GCN-NEXT: s_mul_hi_u32 s14, s15, s3 +; GCN-NEXT: s_mul_i32 s16, s15, s3 +; GCN-NEXT: s_mul_i32 s19, s17, s12 +; GCN-NEXT: s_mul_hi_u32 s3, s17, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s17, s12 +; GCN-NEXT: s_add_u32 s3, s3, s19 +; GCN-NEXT: s_addc_u32 s18, 0, s18 +; GCN-NEXT: s_add_u32 s3, s3, s16 +; GCN-NEXT: s_mul_hi_u32 s13, s15, s12 +; GCN-NEXT: s_addc_u32 s3, s18, s14 +; GCN-NEXT: s_addc_u32 s13, s13, 0 +; GCN-NEXT: s_mul_i32 s12, s15, s12 +; GCN-NEXT: s_add_u32 s3, s3, s12 +; GCN-NEXT: s_addc_u32 s14, 0, s13 +; GCN-NEXT: s_add_u32 s3, s17, s3 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[12:13], 0 +; GCN-NEXT: s_addc_u32 s16, s15, s14 +; GCN-NEXT: s_ashr_i32 s12, s5, 31 +; GCN-NEXT: s_add_u32 s14, s4, s12 +; GCN-NEXT: s_mov_b32 s13, s12 +; GCN-NEXT: s_addc_u32 s15, s5, s12 +; GCN-NEXT: 
s_xor_b64 s[14:15], s[14:15], s[12:13] +; GCN-NEXT: s_mul_i32 s17, s14, s16 +; GCN-NEXT: s_mul_hi_u32 s18, s14, s3 +; GCN-NEXT: s_mul_hi_u32 s5, s14, s16 +; GCN-NEXT: s_add_u32 s17, s18, s17 +; GCN-NEXT: s_addc_u32 s5, 0, s5 +; GCN-NEXT: s_mul_hi_u32 s19, s15, s3 +; GCN-NEXT: s_mul_i32 s3, s15, s3 +; GCN-NEXT: s_add_u32 s3, s17, s3 +; GCN-NEXT: s_mul_hi_u32 s18, s15, s16 +; GCN-NEXT: s_addc_u32 s3, s5, s19 +; GCN-NEXT: s_addc_u32 s5, s18, 0 +; GCN-NEXT: s_mul_i32 s16, s15, s16 +; GCN-NEXT: s_add_u32 s3, s3, s16 +; GCN-NEXT: s_addc_u32 s5, 0, s5 +; GCN-NEXT: s_mul_i32 s5, s10, s5 +; GCN-NEXT: s_mul_hi_u32 s16, s10, s3 +; GCN-NEXT: s_add_i32 s5, s16, s5 +; GCN-NEXT: s_mul_i32 s16, s11, s3 +; GCN-NEXT: s_add_i32 s5, s5, s16 +; GCN-NEXT: s_sub_i32 s18, s15, s5 +; GCN-NEXT: s_mul_i32 s3, s10, s3 +; GCN-NEXT: s_sub_u32 s3, s14, s3 +; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_subb_u32 s14, s18, s11 +; GCN-NEXT: s_sub_u32 s20, s3, s10 +; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_subb_u32 s21, s14, 0 +; GCN-NEXT: s_cmp_ge_u32 s21, s11 +; GCN-NEXT: s_cselect_b32 s22, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s20, s10 +; GCN-NEXT: s_cselect_b32 s23, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s21, s11 +; GCN-NEXT: s_cselect_b32 s22, s23, s22 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_subb_u32 s14, s14, s11 +; GCN-NEXT: s_sub_u32 s23, s20, s10 +; GCN-NEXT: s_cselect_b64 s[18:19], -1, 0 +; GCN-NEXT: s_cmp_lg_u64 s[18:19], 0 +; GCN-NEXT: s_subb_u32 s14, s14, 0 +; GCN-NEXT: s_cmp_lg_u32 s22, 0 +; GCN-NEXT: s_cselect_b32 s18, s23, s20 +; GCN-NEXT: s_cselect_b32 s14, s14, s21 +; GCN-NEXT: s_cmp_lg_u64 s[16:17], 0 +; GCN-NEXT: s_subb_u32 s5, s15, s5 +; GCN-NEXT: s_cmp_ge_u32 s5, s11 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s3, s10 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s5, s11 +; GCN-NEXT: s_cselect_b32 s10, s10, s15 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_cselect_b32 s11, s14, s5 +; GCN-NEXT: s_cselect_b32 s10, s18, s3 +; GCN-NEXT: s_xor_b64 s[10:11], s[10:11], s[12:13] +; GCN-NEXT: s_sub_u32 s10, s10, s12 +; GCN-NEXT: s_subb_u32 s11, s11, s12 +; GCN-NEXT: s_cbranch_execnz .LBB12_18 ; GCN-NEXT: .LBB12_16: -; GCN-NEXT: s_branch .LBB12_11 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: s_sub_i32 s3, 0, s2 +; GCN-NEXT: v_mov_b32_e32 v7, 0 +; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 +; GCN-NEXT: v_add_u32_e32 v0, v0, v1 +; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 +; GCN-NEXT: v_sub_u32_e32 v0, s4, v0 +; GCN-NEXT: v_subrev_u32_e32 v1, s2, v0 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_subrev_u32_e32 v1, s2, v0 +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GCN-NEXT: s_branch .LBB12_19 +; GCN-NEXT: .LBB12_17: +; GCN-NEXT: ; implicit-def: $sgpr10_sgpr11 +; GCN-NEXT: s_branch .LBB12_16 +; GCN-NEXT: .LBB12_18: +; GCN-NEXT: v_mov_b32_e32 v6, s10 +; GCN-NEXT: v_mov_b32_e32 v7, s11 +; GCN-NEXT: .LBB12_19: +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GCN-NEXT: s_endpgm ; ; TAHITI-LABEL: srem_v4i64: ; TAHITI: ; %bb.0: 
@@ -5546,7 +5639,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: v_mul_lo_u32 v20, v20, v11 ; TAHITI-NEXT: v_mul_lo_u32 v19, v19, v11 ; TAHITI-NEXT: v_add_i32_e32 v21, vcc, v21, v22 -; TAHITI-NEXT: v_add_i32_e32 v20, vcc, v21, v20 +; TAHITI-NEXT: v_add_i32_e32 v20, vcc, v20, v21 ; TAHITI-NEXT: v_mul_lo_u32 v23, v11, v20 ; TAHITI-NEXT: v_mul_hi_u32 v24, v11, v19 ; TAHITI-NEXT: v_mul_hi_u32 v25, v11, v20 @@ -5689,7 +5782,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: v_mul_lo_u32 v18, v18, v13 ; TAHITI-NEXT: v_mul_lo_u32 v15, v15, v13 ; TAHITI-NEXT: v_add_i32_e32 v19, vcc, v19, v20 -; TAHITI-NEXT: v_add_i32_e32 v18, vcc, v19, v18 +; TAHITI-NEXT: v_add_i32_e32 v18, vcc, v18, v19 ; TAHITI-NEXT: v_mul_lo_u32 v21, v13, v18 ; TAHITI-NEXT: v_mul_hi_u32 v22, v13, v15 ; TAHITI-NEXT: v_mul_hi_u32 v23, v13, v18 @@ -5833,7 +5926,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: v_mul_lo_u32 v16, v16, v13 ; TAHITI-NEXT: v_mul_lo_u32 v15, v15, v13 ; TAHITI-NEXT: v_add_i32_e32 v17, vcc, v17, v18 -; TAHITI-NEXT: v_add_i32_e32 v16, vcc, v17, v16 +; TAHITI-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; TAHITI-NEXT: v_mul_lo_u32 v19, v13, v16 ; TAHITI-NEXT: v_mul_hi_u32 v20, v13, v15 ; TAHITI-NEXT: v_mul_hi_u32 v21, v13, v16 @@ -5976,7 +6069,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: v_mul_lo_u32 v14, v14, v3 ; TAHITI-NEXT: v_mul_lo_u32 v5, v5, v3 ; TAHITI-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; TAHITI-NEXT: v_add_i32_e32 v14, vcc, v15, v14 +; TAHITI-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; TAHITI-NEXT: v_mul_lo_u32 v17, v3, v14 ; TAHITI-NEXT: v_mul_hi_u32 v18, v3, v5 ; TAHITI-NEXT: v_mul_hi_u32 v19, v3, v14 @@ -6089,7 +6182,6 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-LABEL: srem_v4i64: ; TONGA: ; %bb.0: ; TONGA-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 -; TONGA-NEXT: v_mov_b32_e32 v8, 0 ; TONGA-NEXT: s_waitcnt lgkmcnt(0) ; TONGA-NEXT: s_add_u32 s0, s6, 48 ; TONGA-NEXT: v_mov_b32_e32 v0, s6 @@ -6109,249 +6201,279 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mov_b32_e32 v4, s0 ; TONGA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; TONGA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] +; TONGA-NEXT: s_waitcnt vmcnt(3) +; TONGA-NEXT: v_readfirstlane_b32 s3, v15 +; TONGA-NEXT: v_readfirstlane_b32 s2, v14 ; TONGA-NEXT: s_waitcnt vmcnt(2) -; TONGA-NEXT: v_or_b32_e32 v9, v15, v11 -; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; TONGA-NEXT: s_cbranch_vccz .LBB12_13 +; TONGA-NEXT: v_readfirstlane_b32 s1, v11 +; TONGA-NEXT: v_readfirstlane_b32 s0, v10 +; TONGA-NEXT: s_or_b64 s[6:7], s[2:3], s[0:1] +; TONGA-NEXT: s_mov_b32 s6, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[6:7], 0 +; TONGA-NEXT: s_cbranch_scc0 .LBB12_3 ; TONGA-NEXT: ; %bb.1: -; TONGA-NEXT: v_ashrrev_i32_e32 v8, 31, v11 -; TONGA-NEXT: v_add_u32_e32 v9, vcc, v10, v8 -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v11, v8, vcc -; TONGA-NEXT: v_xor_b32_e32 v9, v9, v8 -; TONGA-NEXT: v_xor_b32_e32 v8, v11, v8 -; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v9 -; TONGA-NEXT: v_cvt_f32_u32_e32 v18, v8 -; TONGA-NEXT: v_sub_u32_e32 v23, vcc, 0, v9 -; TONGA-NEXT: v_subb_u32_e32 v24, vcc, 0, v8, vcc -; TONGA-NEXT: v_madmk_f32 v11, v18, 0x4f800000, v11 -; TONGA-NEXT: v_rcp_f32_e32 v11, v11 -; TONGA-NEXT: v_mul_f32_e32 v11, 0x5f7ffffc, v11 -; TONGA-NEXT: v_mul_f32_e32 v18, 0x2f800000, v11 -; 
TONGA-NEXT: v_trunc_f32_e32 v18, v18 -; TONGA-NEXT: v_madmk_f32 v11, v18, 0xcf800000, v11 -; TONGA-NEXT: v_cvt_u32_f32_e32 v22, v18 -; TONGA-NEXT: v_cvt_u32_f32_e32 v11, v11 -; TONGA-NEXT: v_mul_lo_u32 v20, v23, v22 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0 -; TONGA-NEXT: v_mul_lo_u32 v21, v24, v11 -; TONGA-NEXT: v_add_u32_e32 v19, vcc, v19, v20 -; TONGA-NEXT: v_add_u32_e32 v21, vcc, v19, v21 -; TONGA-NEXT: v_mad_u64_u32 v[19:20], s[0:1], v11, v21, 0 -; TONGA-NEXT: v_mul_hi_u32 v25, v11, v18 -; TONGA-NEXT: v_add_u32_e32 v25, vcc, v25, v19 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v22, v18, 0 -; TONGA-NEXT: v_addc_u32_e32 v26, vcc, 0, v20, vcc -; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v22, v21, 0 -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v25, v18 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v26, v19, vcc -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v21, vcc -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v18, v20 -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18 -; TONGA-NEXT: v_addc_u32_e32 v25, vcc, v22, v19, vcc -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0 -; TONGA-NEXT: v_mul_lo_u32 v22, v23, v25 -; TONGA-NEXT: v_mul_lo_u32 v23, v24, v11 -; TONGA-NEXT: v_mul_hi_u32 v24, v11, v18 -; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v25, v18, 0 -; TONGA-NEXT: v_add_u32_e32 v19, vcc, v22, v19 -; TONGA-NEXT: v_add_u32_e32 v19, vcc, v19, v23 -; TONGA-NEXT: v_mad_u64_u32 v[22:23], s[0:1], v11, v19, 0 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v25, v19, 0 -; TONGA-NEXT: v_add_u32_e32 v22, vcc, v24, v22 -; TONGA-NEXT: v_addc_u32_e32 v23, vcc, 0, v23, vcc -; TONGA-NEXT: v_add_u32_e32 v20, vcc, v22, v20 -; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v23, v21, vcc -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v20, v18 -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v18 -; TONGA-NEXT: v_addc_u32_e32 v20, vcc, v25, v19, vcc -; TONGA-NEXT: v_ashrrev_i32_e32 v22, 31, v15 -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v14, v22 -; TONGA-NEXT: v_xor_b32_e32 v23, v18, v22 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v20, 0 -; TONGA-NEXT: v_mul_hi_u32 v21, v23, v11 -; TONGA-NEXT: v_addc_u32_e32 v15, vcc, v15, v22, vcc -; TONGA-NEXT: v_xor_b32_e32 v15, v15, v22 -; TONGA-NEXT: v_add_u32_e32 v24, vcc, v21, v18 -; TONGA-NEXT: v_addc_u32_e32 v25, vcc, 0, v19, vcc -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v11, 0 -; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v15, v20, 0 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v24, v18 -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v25, v19, vcc -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v21, vcc -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v20 -; TONGA-NEXT: v_addc_u32_e32 v18, vcc, 0, v18, vcc -; TONGA-NEXT: v_mul_lo_u32 v20, v9, v18 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v11, 0 -; TONGA-NEXT: v_mul_lo_u32 v11, v8, v11 -; TONGA-NEXT: v_add_u32_e32 v19, vcc, v20, v19 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v19 -; TONGA-NEXT: v_sub_u32_e32 v19, vcc, v15, v11 -; TONGA-NEXT: v_sub_u32_e32 v18, vcc, v23, v18 -; TONGA-NEXT: v_subb_u32_e64 v19, s[0:1], v19, v8, vcc -; TONGA-NEXT: v_sub_u32_e64 v20, s[0:1], v18, v9 -; TONGA-NEXT: v_subbrev_u32_e64 v21, s[2:3], 0, v19, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v21, v8 -; TONGA-NEXT: v_cndmask_b32_e64 v23, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v20, v9 -; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v15, v11, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v24, 0, -1, 
s[2:3] -; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v21, v8 -; TONGA-NEXT: v_subb_u32_e64 v19, s[0:1], v19, v8, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v11, v8 -; TONGA-NEXT: v_cndmask_b32_e64 v23, v23, v24, s[2:3] -; TONGA-NEXT: v_sub_u32_e64 v24, s[0:1], v20, v9 -; TONGA-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v18, v9 -; TONGA-NEXT: v_subbrev_u32_e64 v19, s[0:1], 0, v19, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v11, v8 -; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v23 -; TONGA-NEXT: v_cndmask_b32_e32 v8, v15, v9, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v20, v20, v24, s[0:1] -; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; TONGA-NEXT: v_cndmask_b32_e64 v19, v21, v19, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e32 v9, v18, v20, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v8, v11, v19, vcc -; TONGA-NEXT: v_xor_b32_e32 v9, v9, v22 -; TONGA-NEXT: v_xor_b32_e32 v11, v8, v22 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v9, v22 -; TONGA-NEXT: v_subb_u32_e32 v9, vcc, v11, v22, vcc -; TONGA-NEXT: s_cbranch_execnz .LBB12_3 +; TONGA-NEXT: s_ashr_i32 s6, s1, 31 +; TONGA-NEXT: s_add_u32 s8, s0, s6 +; TONGA-NEXT: s_mov_b32 s7, s6 +; TONGA-NEXT: s_addc_u32 s9, s1, s6 +; TONGA-NEXT: s_xor_b64 s[6:7], s[8:9], s[6:7] +; TONGA-NEXT: v_cvt_f32_u32_e32 v8, s6 +; TONGA-NEXT: v_cvt_f32_u32_e32 v9, s7 +; TONGA-NEXT: s_sub_u32 s1, 0, s6 +; TONGA-NEXT: s_subb_u32 s10, 0, s7 +; TONGA-NEXT: v_madmk_f32 v8, v9, 0x4f800000, v8 +; TONGA-NEXT: v_rcp_f32_e32 v8, v8 +; TONGA-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; TONGA-NEXT: v_mul_f32_e32 v9, 0x2f800000, v8 +; TONGA-NEXT: v_trunc_f32_e32 v9, v9 +; TONGA-NEXT: v_madmk_f32 v8, v9, 0xcf800000, v8 +; TONGA-NEXT: v_cvt_u32_f32_e32 v14, v9 +; TONGA-NEXT: v_cvt_u32_f32_e32 v15, v8 +; TONGA-NEXT: v_mul_lo_u32 v10, s1, v14 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], s1, v15, 0 +; TONGA-NEXT: v_mul_lo_u32 v11, s10, v15 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, v9, v10 +; TONGA-NEXT: v_add_u32_e32 v11, vcc, v9, v11 +; TONGA-NEXT: v_mul_hi_u32 v18, v15, v8 +; TONGA-NEXT: v_mad_u64_u32 v[9:10], s[8:9], v15, v11, 0 +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v18, v9 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], v14, v8, 0 +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v10, vcc +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v11, 0 +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v18, v8 +; TONGA-NEXT: v_addc_u32_e32 v8, vcc, v19, v9, vcc +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v11, vcc +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v10 +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v15, v8 +; TONGA-NEXT: v_addc_u32_e32 v19, vcc, v14, v9, vcc +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], s1, v18, 0 +; TONGA-NEXT: v_mul_lo_u32 v14, s1, v19 +; TONGA-NEXT: v_mul_lo_u32 v15, s10, v18 +; TONGA-NEXT: v_mul_hi_u32 v20, v18, v8 +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v19, v8, 0 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, v14, v9 +; TONGA-NEXT: v_add_u32_e32 v9, vcc, v15, v9 +; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[8:9], v18, v9, 0 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], v19, v9, 0 +; TONGA-NEXT: v_add_u32_e32 v14, vcc, v20, v14 +; TONGA-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; TONGA-NEXT: v_add_u32_e32 v10, vcc, v14, v10 +; TONGA-NEXT: v_addc_u32_e32 v10, vcc, v15, v11, vcc +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; TONGA-NEXT: v_add_u32_e32 v8, vcc, v10, v8 +; TONGA-NEXT: s_ashr_i32 s10, s3, 31 +; TONGA-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc +; TONGA-NEXT: s_add_u32 s8, s2, s10 +; TONGA-NEXT: 
v_add_u32_e32 v10, vcc, v18, v8 +; TONGA-NEXT: s_mov_b32 s11, s10 +; TONGA-NEXT: s_addc_u32 s9, s3, s10 +; TONGA-NEXT: v_addc_u32_e32 v11, vcc, v19, v9, vcc +; TONGA-NEXT: s_xor_b64 s[12:13], s[8:9], s[10:11] +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], s12, v11, 0 +; TONGA-NEXT: v_mul_hi_u32 v14, s12, v10 +; TONGA-NEXT: v_readfirstlane_b32 s1, v9 +; TONGA-NEXT: v_readfirstlane_b32 s3, v8 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], s13, v11, 0 +; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[8:9], s13, v10, 0 +; TONGA-NEXT: v_readfirstlane_b32 s14, v14 +; TONGA-NEXT: s_add_u32 s3, s14, s3 +; TONGA-NEXT: s_addc_u32 s1, 0, s1 +; TONGA-NEXT: v_readfirstlane_b32 s14, v10 +; TONGA-NEXT: v_readfirstlane_b32 s9, v11 +; TONGA-NEXT: s_add_u32 s3, s3, s14 +; TONGA-NEXT: v_readfirstlane_b32 s8, v9 +; TONGA-NEXT: s_addc_u32 s1, s1, s9 +; TONGA-NEXT: s_addc_u32 s3, s8, 0 +; TONGA-NEXT: v_readfirstlane_b32 s8, v8 +; TONGA-NEXT: s_add_u32 s1, s1, s8 +; TONGA-NEXT: v_mov_b32_e32 v8, s1 +; TONGA-NEXT: v_mad_u64_u32 v[8:9], s[8:9], s6, v8, 0 +; TONGA-NEXT: s_addc_u32 s3, 0, s3 +; TONGA-NEXT: s_mul_i32 s3, s6, s3 +; TONGA-NEXT: v_readfirstlane_b32 s14, v9 +; TONGA-NEXT: s_add_i32 s3, s14, s3 +; TONGA-NEXT: s_mul_i32 s1, s7, s1 +; TONGA-NEXT: s_add_i32 s3, s3, s1 +; TONGA-NEXT: s_sub_i32 s1, s13, s3 +; TONGA-NEXT: v_readfirstlane_b32 s14, v8 +; TONGA-NEXT: s_sub_u32 s12, s12, s14 +; TONGA-NEXT: s_cselect_b64 s[14:15], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 +; TONGA-NEXT: s_subb_u32 s1, s1, s7 +; TONGA-NEXT: s_sub_u32 s18, s12, s6 +; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_subb_u32 s19, s1, 0 +; TONGA-NEXT: s_cmp_ge_u32 s19, s7 +; TONGA-NEXT: s_cselect_b32 s20, -1, 0 +; TONGA-NEXT: s_cmp_ge_u32 s18, s6 +; TONGA-NEXT: s_cselect_b32 s21, -1, 0 +; TONGA-NEXT: s_cmp_eq_u32 s19, s7 +; TONGA-NEXT: s_cselect_b32 s20, s21, s20 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_subb_u32 s1, s1, s7 +; TONGA-NEXT: s_sub_u32 s21, s18, s6 +; TONGA-NEXT: s_cselect_b64 s[16:17], -1, 0 +; TONGA-NEXT: s_cmp_lg_u64 s[16:17], 0 +; TONGA-NEXT: s_subb_u32 s1, s1, 0 +; TONGA-NEXT: s_cmp_lg_u32 s20, 0 +; TONGA-NEXT: s_cselect_b32 s16, s21, s18 +; TONGA-NEXT: s_cselect_b32 s1, s1, s19 +; TONGA-NEXT: s_cmp_lg_u64 s[14:15], 0 +; TONGA-NEXT: s_subb_u32 s3, s13, s3 +; TONGA-NEXT: s_cmp_ge_u32 s3, s7 +; TONGA-NEXT: s_cselect_b32 s13, -1, 0 +; TONGA-NEXT: s_cmp_ge_u32 s12, s6 +; TONGA-NEXT: s_cselect_b32 s6, -1, 0 +; TONGA-NEXT: s_cmp_eq_u32 s3, s7 +; TONGA-NEXT: s_cselect_b32 s6, s6, s13 +; TONGA-NEXT: s_cmp_lg_u32 s6, 0 +; TONGA-NEXT: s_cselect_b32 s7, s1, s3 +; TONGA-NEXT: s_cselect_b32 s6, s16, s12 +; TONGA-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] +; TONGA-NEXT: s_sub_u32 s6, s6, s10 +; TONGA-NEXT: s_subb_u32 s7, s7, s10 +; TONGA-NEXT: s_cbranch_execnz .LBB12_4 ; TONGA-NEXT: .LBB12_2: -; TONGA-NEXT: v_cvt_f32_u32_e32 v8, v10 -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v10 +; TONGA-NEXT: v_cvt_f32_u32_e32 v8, s0 +; TONGA-NEXT: s_sub_i32 s1, 0, s0 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; TONGA-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; TONGA-NEXT: v_cvt_u32_f32_e32 v8, v8 -; TONGA-NEXT: v_mul_lo_u32 v9, v9, v8 +; TONGA-NEXT: v_mul_lo_u32 v9, s1, v8 ; TONGA-NEXT: v_mul_hi_u32 v9, v8, v9 ; TONGA-NEXT: v_add_u32_e32 v8, vcc, v8, v9 -; TONGA-NEXT: v_mul_hi_u32 v8, v14, v8 -; TONGA-NEXT: v_mul_lo_u32 v8, v8, v10 -; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v14, v8 -; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, v10, v8 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 +; TONGA-NEXT: v_mul_hi_u32 v8, 
s2, v8 +; TONGA-NEXT: v_mul_lo_u32 v8, v8, s0 +; TONGA-NEXT: v_sub_u32_e32 v8, vcc, s2, v8 +; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, s0, v8 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s0, v8 ; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc -; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 +; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, s0, v8 +; TONGA-NEXT: v_cmp_le_u32_e32 vcc, s0, v8 ; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; TONGA-NEXT: v_mov_b32_e32 v9, 0 +; TONGA-NEXT: s_branch .LBB12_5 ; TONGA-NEXT: .LBB12_3: +; TONGA-NEXT: ; implicit-def: $sgpr6_sgpr7 +; TONGA-NEXT: s_branch .LBB12_2 +; TONGA-NEXT: .LBB12_4: +; TONGA-NEXT: v_mov_b32_e32 v9, s7 +; TONGA-NEXT: v_mov_b32_e32 v8, s6 +; TONGA-NEXT: .LBB12_5: ; TONGA-NEXT: v_or_b32_e32 v11, v17, v13 ; TONGA-NEXT: v_mov_b32_e32 v10, 0 ; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; TONGA-NEXT: s_cbranch_vccz .LBB12_14 -; TONGA-NEXT: ; %bb.4: +; TONGA-NEXT: s_cbranch_vccz .LBB12_15 +; TONGA-NEXT: ; %bb.6: ; TONGA-NEXT: v_ashrrev_i32_e32 v10, 31, v13 ; TONGA-NEXT: v_add_u32_e32 v11, vcc, v12, v10 ; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v13, v10, vcc -; TONGA-NEXT: v_xor_b32_e32 v15, v11, v10 -; TONGA-NEXT: v_xor_b32_e32 v20, v13, v10 -; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v15 -; TONGA-NEXT: v_cvt_f32_u32_e32 v11, v20 -; TONGA-NEXT: v_sub_u32_e32 v21, vcc, 0, v15 -; TONGA-NEXT: v_subb_u32_e32 v22, vcc, 0, v20, vcc -; TONGA-NEXT: v_madmk_f32 v10, v11, 0x4f800000, v10 -; TONGA-NEXT: v_rcp_f32_e32 v10, v10 -; TONGA-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 -; TONGA-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 -; TONGA-NEXT: v_trunc_f32_e32 v11, v11 -; TONGA-NEXT: v_madmk_f32 v10, v11, 0xcf800000, v10 -; TONGA-NEXT: v_cvt_u32_f32_e32 v18, v11 -; TONGA-NEXT: v_cvt_u32_f32_e32 v19, v10 -; TONGA-NEXT: v_mul_lo_u32 v13, v21, v18 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v21, v19, 0 -; TONGA-NEXT: v_mul_lo_u32 v14, v22, v19 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v13 -; TONGA-NEXT: v_add_u32_e32 v23, vcc, v11, v14 -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v19, v23, 0 -; TONGA-NEXT: v_mul_hi_u32 v11, v19, v10 -; TONGA-NEXT: v_add_u32_e32 v24, vcc, v11, v13 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v18, v10, 0 -; TONGA-NEXT: v_addc_u32_e32 v25, vcc, 0, v14, vcc -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v18, v23, 0 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v24, v10 -; TONGA-NEXT: v_addc_u32_e32 v10, vcc, v25, v11, vcc -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v14, vcc -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v10, v13 -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; TONGA-NEXT: v_add_u32_e32 v23, vcc, v19, v10 -; TONGA-NEXT: v_addc_u32_e32 v24, vcc, v18, v11, vcc -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v21, v23, 0 -; TONGA-NEXT: v_mul_lo_u32 v18, v21, v24 -; TONGA-NEXT: v_mul_lo_u32 v19, v22, v23 -; TONGA-NEXT: v_mul_hi_u32 v21, v23, v10 -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v24, v10, 0 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v18, v11 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v11, v19 -; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v24, v11, 0 -; TONGA-NEXT: v_add_u32_e32 v18, vcc, v21, v18 -; TONGA-NEXT: v_addc_u32_e32 v19, vcc, 0, v19, vcc -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v18, v13 -; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v19, v14, vcc -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v13, v10 -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v23, 
v10 -; TONGA-NEXT: v_addc_u32_e32 v14, vcc, v24, v11, vcc -; TONGA-NEXT: v_ashrrev_i32_e32 v18, 31, v17 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v16, v18 -; TONGA-NEXT: v_xor_b32_e32 v19, v10, v18 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v19, v14, 0 -; TONGA-NEXT: v_mul_hi_u32 v21, v19, v13 -; TONGA-NEXT: v_addc_u32_e32 v17, vcc, v17, v18, vcc -; TONGA-NEXT: v_xor_b32_e32 v17, v17, v18 -; TONGA-NEXT: v_add_u32_e32 v21, vcc, v21, v10 -; TONGA-NEXT: v_addc_u32_e32 v22, vcc, 0, v11, vcc -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v17, v13, 0 -; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v17, v14, 0 -; TONGA-NEXT: v_add_u32_e32 v10, vcc, v21, v10 -; TONGA-NEXT: v_addc_u32_e32 v10, vcc, v22, v11, vcc -; TONGA-NEXT: v_addc_u32_e32 v11, vcc, 0, v14, vcc -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v10, v13 -; TONGA-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc -; TONGA-NEXT: v_mul_lo_u32 v14, v15, v10 -; TONGA-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v15, v13, 0 -; TONGA-NEXT: v_mul_lo_u32 v13, v20, v13 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v14, v11 -; TONGA-NEXT: v_add_u32_e32 v11, vcc, v13, v11 -; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v17, v11 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v19, v10 -; TONGA-NEXT: v_subb_u32_e64 v13, s[0:1], v13, v20, vcc -; TONGA-NEXT: v_sub_u32_e64 v14, s[0:1], v10, v15 -; TONGA-NEXT: v_subbrev_u32_e64 v19, s[2:3], 0, v13, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v19, v20 +; TONGA-NEXT: v_xor_b32_e32 v11, v11, v10 +; TONGA-NEXT: v_xor_b32_e32 v10, v13, v10 +; TONGA-NEXT: v_cvt_f32_u32_e32 v13, v11 +; TONGA-NEXT: v_cvt_f32_u32_e32 v14, v10 +; TONGA-NEXT: v_sub_u32_e32 v22, vcc, 0, v11 +; TONGA-NEXT: v_subb_u32_e32 v23, vcc, 0, v10, vcc +; TONGA-NEXT: v_madmk_f32 v13, v14, 0x4f800000, v13 +; TONGA-NEXT: v_rcp_f32_e32 v13, v13 +; TONGA-NEXT: v_mul_f32_e32 v13, 0x5f7ffffc, v13 +; TONGA-NEXT: v_mul_f32_e32 v14, 0x2f800000, v13 +; TONGA-NEXT: v_trunc_f32_e32 v14, v14 +; TONGA-NEXT: v_madmk_f32 v13, v14, 0xcf800000, v13 +; TONGA-NEXT: v_cvt_u32_f32_e32 v20, v14 +; TONGA-NEXT: v_cvt_u32_f32_e32 v21, v13 +; TONGA-NEXT: v_mul_lo_u32 v15, v22, v20 +; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v22, v21, 0 +; TONGA-NEXT: v_mul_lo_u32 v18, v23, v21 +; TONGA-NEXT: v_add_u32_e32 v14, vcc, v14, v15 +; TONGA-NEXT: v_add_u32_e32 v18, vcc, v14, v18 +; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v21, v18, 0 +; TONGA-NEXT: v_mul_hi_u32 v19, v21, v13 +; TONGA-NEXT: v_add_u32_e32 v24, vcc, v19, v14 +; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v20, v13, 0 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v20, v18, 0 +; TONGA-NEXT: v_addc_u32_e32 v15, vcc, 0, v15, vcc +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v24, v13 +; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v15, v14, vcc +; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v19, vcc +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v18 +; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; TONGA-NEXT: v_add_u32_e32 v24, vcc, v21, v13 +; TONGA-NEXT: v_addc_u32_e32 v25, vcc, v20, v14, vcc +; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v22, v24, 0 +; TONGA-NEXT: v_mul_lo_u32 v15, v22, v25 +; TONGA-NEXT: v_mul_lo_u32 v20, v23, v24 +; TONGA-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v25, v13, 0 +; TONGA-NEXT: v_add_u32_e32 v14, vcc, v15, v14 +; TONGA-NEXT: v_add_u32_e32 v20, vcc, v20, v14 +; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v24, v20, 0 +; TONGA-NEXT: v_mul_hi_u32 v13, v24, v13 +; TONGA-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v25, v20, 0 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v14 +; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v15, vcc +; TONGA-NEXT: v_add_u32_e32 
v13, vcc, v13, v18 +; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v14, v19, vcc +; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v21, vcc +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v20 +; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc +; TONGA-NEXT: v_add_u32_e32 v15, vcc, v24, v13 +; TONGA-NEXT: v_addc_u32_e32 v18, vcc, v25, v14, vcc +; TONGA-NEXT: v_ashrrev_i32_e32 v19, 31, v17 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v16, v19 +; TONGA-NEXT: v_xor_b32_e32 v20, v13, v19 +; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v20, v18, 0 +; TONGA-NEXT: v_mul_hi_u32 v21, v20, v15 +; TONGA-NEXT: v_addc_u32_e32 v17, vcc, v17, v19, vcc +; TONGA-NEXT: v_xor_b32_e32 v22, v17, v19 +; TONGA-NEXT: v_add_u32_e32 v21, vcc, v21, v13 +; TONGA-NEXT: v_addc_u32_e32 v23, vcc, 0, v14, vcc +; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v22, v15, 0 +; TONGA-NEXT: v_mad_u64_u32 v[17:18], s[0:1], v22, v18, 0 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v21, v13 +; TONGA-NEXT: v_addc_u32_e32 v13, vcc, v23, v14, vcc +; TONGA-NEXT: v_addc_u32_e32 v14, vcc, 0, v18, vcc +; TONGA-NEXT: v_add_u32_e32 v15, vcc, v13, v17 +; TONGA-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc +; TONGA-NEXT: v_mul_lo_u32 v17, v11, v13 +; TONGA-NEXT: v_mad_u64_u32 v[13:14], s[0:1], v11, v15, 0 +; TONGA-NEXT: v_mul_lo_u32 v15, v10, v15 +; TONGA-NEXT: v_add_u32_e32 v14, vcc, v17, v14 +; TONGA-NEXT: v_add_u32_e32 v14, vcc, v15, v14 +; TONGA-NEXT: v_sub_u32_e32 v15, vcc, v22, v14 +; TONGA-NEXT: v_sub_u32_e32 v13, vcc, v20, v13 +; TONGA-NEXT: v_subb_u32_e64 v15, s[0:1], v15, v10, vcc +; TONGA-NEXT: v_sub_u32_e64 v17, s[0:1], v13, v11 +; TONGA-NEXT: v_subbrev_u32_e64 v18, s[2:3], 0, v15, s[0:1] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v18, v10 +; TONGA-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[2:3] +; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v17, v11 ; TONGA-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_ge_u32_e64 s[2:3], v14, v15 -; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v17, v11, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v22, 0, -1, s[2:3] -; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v19, v20 -; TONGA-NEXT: v_subb_u32_e64 v13, s[0:1], v13, v20, s[0:1] -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v11, v20 -; TONGA-NEXT: v_cndmask_b32_e64 v21, v21, v22, s[2:3] -; TONGA-NEXT: v_sub_u32_e64 v22, s[0:1], v14, v15 -; TONGA-NEXT: v_cndmask_b32_e64 v17, 0, -1, vcc -; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v10, v15 -; TONGA-NEXT: v_subbrev_u32_e64 v13, s[0:1], 0, v13, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc -; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v11, v20 -; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v21 -; TONGA-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc -; TONGA-NEXT: v_cndmask_b32_e64 v14, v14, v22, s[0:1] -; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 -; TONGA-NEXT: v_cndmask_b32_e64 v13, v19, v13, s[0:1] -; TONGA-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc -; TONGA-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc -; TONGA-NEXT: v_xor_b32_e32 v10, v10, v18 -; TONGA-NEXT: v_xor_b32_e32 v11, v11, v18 -; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v10, v18 -; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc -; TONGA-NEXT: s_cbranch_execnz .LBB12_6 -; TONGA-NEXT: .LBB12_5: +; TONGA-NEXT: v_cmp_eq_u32_e64 s[2:3], v18, v10 +; TONGA-NEXT: v_subb_u32_e64 v15, s[0:1], v15, v10, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[2:3] +; TONGA-NEXT: v_sub_u32_e64 v21, s[0:1], v17, v11 +; TONGA-NEXT: v_subbrev_u32_e64 v15, s[0:1], 0, v15, s[0:1] +; TONGA-NEXT: v_subb_u32_e32 v14, vcc, v22, v14, vcc +; TONGA-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v20 +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v14, v10 +; TONGA-NEXT: 
v_cndmask_b32_e64 v15, v18, v15, s[0:1] +; TONGA-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc +; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v13, v11 +; TONGA-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; TONGA-NEXT: v_cmp_eq_u32_e32 vcc, v14, v10 +; TONGA-NEXT: v_cndmask_b32_e32 v10, v18, v11, vcc +; TONGA-NEXT: v_cndmask_b32_e64 v17, v17, v21, s[0:1] +; TONGA-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; TONGA-NEXT: v_cndmask_b32_e32 v11, v13, v17, vcc +; TONGA-NEXT: v_cndmask_b32_e32 v10, v14, v15, vcc +; TONGA-NEXT: v_xor_b32_e32 v11, v11, v19 +; TONGA-NEXT: v_xor_b32_e32 v13, v10, v19 +; TONGA-NEXT: v_sub_u32_e32 v10, vcc, v11, v19 +; TONGA-NEXT: v_subb_u32_e32 v11, vcc, v13, v19, vcc +; TONGA-NEXT: s_cbranch_execnz .LBB12_8 +; TONGA-NEXT: .LBB12_7: ; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v12 ; TONGA-NEXT: v_sub_u32_e32 v11, vcc, 0, v12 ; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 @@ -6370,13 +6492,13 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v10, v12 ; TONGA-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc ; TONGA-NEXT: v_mov_b32_e32 v11, 0 -; TONGA-NEXT: .LBB12_6: +; TONGA-NEXT: .LBB12_8: ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_or_b32_e32 v13, v5, v1 ; TONGA-NEXT: v_mov_b32_e32 v12, 0 ; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] -; TONGA-NEXT: s_cbranch_vccz .LBB12_15 -; TONGA-NEXT: ; %bb.7: +; TONGA-NEXT: s_cbranch_vccz .LBB12_16 +; TONGA-NEXT: ; %bb.9: ; TONGA-NEXT: v_ashrrev_i32_e32 v12, 31, v1 ; TONGA-NEXT: v_add_u32_e32 v13, vcc, v0, v12 ; TONGA-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc @@ -6418,7 +6540,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_hi_u32 v19, v21, v12 ; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v22, v12, 0 ; TONGA-NEXT: v_add_u32_e32 v13, vcc, v16, v13 -; TONGA-NEXT: v_add_u32_e32 v13, vcc, v13, v17 +; TONGA-NEXT: v_add_u32_e32 v13, vcc, v17, v13 ; TONGA-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v21, v13, 0 ; TONGA-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v22, v13, 0 ; TONGA-NEXT: v_add_u32_e32 v16, vcc, v19, v16 @@ -6482,8 +6604,8 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v16 ; TONGA-NEXT: v_sub_u32_e32 v12, vcc, v5, v16 ; TONGA-NEXT: v_subb_u32_e32 v13, vcc, v1, v16, vcc -; TONGA-NEXT: s_cbranch_execnz .LBB12_9 -; TONGA-NEXT: .LBB12_8: +; TONGA-NEXT: s_cbranch_execnz .LBB12_11 +; TONGA-NEXT: .LBB12_10: ; TONGA-NEXT: v_cvt_f32_u32_e32 v1, v0 ; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 ; TONGA-NEXT: v_mov_b32_e32 v13, 0 @@ -6502,12 +6624,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_subrev_u32_e32 v4, vcc, v0, v1 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v1, v0 ; TONGA-NEXT: v_cndmask_b32_e32 v12, v1, v4, vcc -; TONGA-NEXT: .LBB12_9: +; TONGA-NEXT: .LBB12_11: ; TONGA-NEXT: v_or_b32_e32 v1, v7, v3 ; TONGA-NEXT: v_mov_b32_e32 v0, 0 ; TONGA-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; TONGA-NEXT: s_cbranch_vccz .LBB12_16 -; TONGA-NEXT: ; %bb.10: +; TONGA-NEXT: s_cbranch_vccz .LBB12_17 +; TONGA-NEXT: ; %bb.12: ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 31, v3 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v2, v0 ; TONGA-NEXT: v_addc_u32_e32 v3, vcc, v3, v0, vcc @@ -6549,7 +6671,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_hi_u32 v17, v19, v0 ; TONGA-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v20, v0, 0 ; TONGA-NEXT: v_add_u32_e32 v1, vcc, v14, v1 -; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v15 +; 
TONGA-NEXT: v_add_u32_e32 v1, vcc, v15, v1 ; TONGA-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v19, v1, 0 ; TONGA-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v20, v1, 0 ; TONGA-NEXT: v_add_u32_e32 v14, vcc, v17, v14 @@ -6613,8 +6735,8 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_xor_b32_e32 v1, v1, v15 ; TONGA-NEXT: v_sub_u32_e32 v14, vcc, v0, v15 ; TONGA-NEXT: v_subb_u32_e32 v15, vcc, v1, v15, vcc -; TONGA-NEXT: s_cbranch_execnz .LBB12_12 -; TONGA-NEXT: .LBB12_11: +; TONGA-NEXT: s_cbranch_execnz .LBB12_14 +; TONGA-NEXT: .LBB12_13: ; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v2 ; TONGA-NEXT: v_sub_u32_e32 v1, vcc, 0, v2 ; TONGA-NEXT: v_mov_b32_e32 v15, 0 @@ -6633,7 +6755,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_subrev_u32_e32 v1, vcc, v2, v0 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; TONGA-NEXT: v_cndmask_b32_e32 v14, v0, v1, vcc -; TONGA-NEXT: .LBB12_12: +; TONGA-NEXT: .LBB12_14: ; TONGA-NEXT: v_mov_b32_e32 v0, s4 ; TONGA-NEXT: v_mov_b32_e32 v1, s5 ; TONGA-NEXT: s_add_u32 s0, s4, 16 @@ -6643,16 +6765,13 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mov_b32_e32 v1, s1 ; TONGA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; TONGA-NEXT: s_endpgm -; TONGA-NEXT: .LBB12_13: -; TONGA-NEXT: ; implicit-def: $vgpr8_vgpr9 -; TONGA-NEXT: s_branch .LBB12_2 -; TONGA-NEXT: .LBB12_14: -; TONGA-NEXT: s_branch .LBB12_5 ; TONGA-NEXT: .LBB12_15: -; TONGA-NEXT: ; implicit-def: $vgpr12_vgpr13 -; TONGA-NEXT: s_branch .LBB12_8 +; TONGA-NEXT: s_branch .LBB12_7 ; TONGA-NEXT: .LBB12_16: -; TONGA-NEXT: s_branch .LBB12_11 +; TONGA-NEXT: ; implicit-def: $vgpr12_vgpr13 +; TONGA-NEXT: s_branch .LBB12_10 +; TONGA-NEXT: .LBB12_17: +; TONGA-NEXT: s_branch .LBB12_13 ; ; EG-LABEL: srem_v4i64: ; EG: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index c7b690fbd4a21..465024a699d43 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -5,119 +5,159 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd -; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GCN-NEXT: s_sub_u32 s0, 0, s12 -; GCN-NEXT: s_subb_u32 s1, 0, s13 -; GCN-NEXT: s_mov_b32 s4, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GCN-NEXT: s_sub_u32 s10, 0, s8 +; GCN-NEXT: s_subb_u32 s11, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s5, s9 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: 
v_mul_hi_u32 v7, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 -; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 -; GCN-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc -; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 -; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 -; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] -; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, s11 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; 
GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_mul_i32 s1, s10, s12 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_mul_i32 s13, s11, s0 +; GCN-NEXT: s_mul_i32 s14, s10, s0 +; GCN-NEXT: s_add_i32 s1, s15, s1 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s14 +; GCN-NEXT: s_add_i32 s1, s1, s13 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v3 +; GCN-NEXT: s_mul_i32 s15, s0, s1 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 +; GCN-NEXT: s_add_u32 s13, s13, s15 +; GCN-NEXT: v_readfirstlane_b32 s15, v0 +; GCN-NEXT: s_mul_i32 s14, s12, s14 +; GCN-NEXT: s_addc_u32 s15, 0, s15 +; GCN-NEXT: v_readfirstlane_b32 s16, v4 +; GCN-NEXT: s_add_u32 s13, s13, s14 +; GCN-NEXT: s_addc_u32 s13, s15, s16 +; GCN-NEXT: v_readfirstlane_b32 s14, v1 +; GCN-NEXT: s_addc_u32 s14, s14, 0 +; GCN-NEXT: s_mul_i32 s1, s12, s1 +; GCN-NEXT: s_add_u32 s1, s13, s1 +; GCN-NEXT: s_addc_u32 s13, 0, s14 +; GCN-NEXT: s_add_u32 s14, s0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_addc_u32 s12, s12, s13 +; GCN-NEXT: s_mul_i32 s0, s10, s12 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_add_i32 s0, s1, s0 +; GCN-NEXT: s_mul_i32 s11, s11, s14 +; GCN-NEXT: s_mul_i32 s1, s10, s14 +; GCN-NEXT: s_add_i32 s0, s0, s11 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 +; GCN-NEXT: s_mul_i32 s11, s14, s0 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: v_readfirstlane_b32 s13, v0 +; GCN-NEXT: s_mul_i32 s1, s12, s1 +; GCN-NEXT: s_addc_u32 s13, 0, s13 +; GCN-NEXT: v_readfirstlane_b32 s10, v3 +; GCN-NEXT: s_add_u32 s1, s11, s1 +; GCN-NEXT: s_addc_u32 s1, s13, s10 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: s_addc_u32 s10, s10, 0 +; GCN-NEXT: s_mul_i32 s0, s12, s0 +; GCN-NEXT: s_add_u32 s0, s1, s0 +; GCN-NEXT: s_addc_u32 s10, 0, s10 +; GCN-NEXT: s_add_u32 s11, s14, s0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_addc_u32 s1, s12, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_mul_hi_u32 v1, s7, v2 +; GCN-NEXT: s_mul_i32 s4, s6, s1 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 +; GCN-NEXT: s_add_u32 s4, s12, s4 +; GCN-NEXT: s_addc_u32 s10, 0, s10 +; GCN-NEXT: s_mul_i32 s11, s7, s11 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_add_u32 s4, s4, s11 +; GCN-NEXT: s_addc_u32 s4, s10, s12 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_addc_u32 s10, s10, 0 +; GCN-NEXT: s_mul_i32 s1, s7, s1 +; GCN-NEXT: s_add_u32 s4, s4, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_addc_u32 s5, 0, s10 +; GCN-NEXT: s_mul_i32 s5, s8, s5 +; GCN-NEXT: 
v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_add_i32 s5, s10, s5 +; GCN-NEXT: s_mul_i32 s10, s9, s4 +; GCN-NEXT: s_add_i32 s10, s5, s10 +; GCN-NEXT: s_sub_i32 s11, s7, s10 +; GCN-NEXT: s_mul_i32 s4, s8, s4 +; GCN-NEXT: s_sub_u32 s6, s6, s4 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s12, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s11, s11, s9 +; GCN-NEXT: s_sub_u32 s13, s6, s8 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s14, s11, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s9 +; GCN-NEXT: s_cselect_b32 s5, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s8 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s9 +; GCN-NEXT: s_cselect_b32 s15, s15, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s11, s11, s9 +; GCN-NEXT: s_sub_u32 s16, s13, s8 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s4, s11, 0 +; GCN-NEXT: s_cmp_lg_u32 s15, 0 +; GCN-NEXT: s_cselect_b32 s5, s16, s13 +; GCN-NEXT: s_cselect_b32 s4, s4, s14 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s7, s7, s10 +; GCN-NEXT: s_cmp_ge_u32 s7, s9 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s6, s8 +; GCN-NEXT: s_cselect_b32 s8, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s7, s9 +; GCN-NEXT: s_cselect_b32 s8, s8, s10 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s4, s4, s7 +; GCN-NEXT: s_cselect_b32 s5, s5, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem: @@ -921,133 +961,169 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 31 -; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 31 -; GCN-NEXT: s_ashr_i32 s6, s5, 31 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_mov_b32 s7, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s6 -; GCN-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s4, 0, s8 -; GCN-NEXT: s_subb_u32 s5, 0, s9 -; GCN-NEXT: s_ashr_i32 s10, s3, 31 +; GCN-NEXT: s_ashr_i64 s[6:7], s[2:3], 31 +; GCN-NEXT: s_ashr_i64 s[2:3], s[4:5], 31 +; GCN-NEXT: s_ashr_i32 s4, s3, 31 +; GCN-NEXT: s_add_u32 s2, s2, s4 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_addc_u32 s3, s3, s4 +; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-NEXT: s_sub_u32 s10, 0, s4 +; GCN-NEXT: s_subb_u32 s11, 0, s5 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_add_u32 s2, s2, s10 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: s_addc_u32 s3, s3, s10 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: s_mov_b32 s6, -1 -; 
GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 -; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s12, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s12, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s12, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s13, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s13, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s13, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s13, v0 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v1, s8, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s9, v0 -; GCN-NEXT: v_mul_lo_u32 v0, s8, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s13, v1 -; GCN-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc -; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 -; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v5 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v4 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v5 -; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s8, v4 -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] -; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, s13 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GCN-NEXT: 
v_cmp_le_u32_e32 vcc, s8, v0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s10, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s10, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-NEXT: s_mul_i32 s9, s10, s12 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_mul_i32 s13, s11, s8 +; GCN-NEXT: s_mul_i32 s14, s10, s8 +; GCN-NEXT: s_add_i32 s9, s15, s9 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s14 +; GCN-NEXT: s_add_i32 s9, s9, s13 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s9 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v3 +; GCN-NEXT: s_mul_i32 s15, s8, s9 +; GCN-NEXT: s_add_u32 s13, s13, s15 +; GCN-NEXT: v_readfirstlane_b32 s15, v0 +; GCN-NEXT: v_mul_hi_u32 v0, v1, s9 +; GCN-NEXT: s_addc_u32 s15, 0, s15 +; GCN-NEXT: s_mul_i32 s14, s12, s14 +; GCN-NEXT: v_readfirstlane_b32 s16, v4 +; GCN-NEXT: s_add_u32 s13, s13, s14 +; GCN-NEXT: s_addc_u32 s13, s15, s16 +; GCN-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-NEXT: s_addc_u32 s14, s14, 0 +; GCN-NEXT: s_mul_i32 s9, s12, s9 +; GCN-NEXT: s_add_u32 s9, s13, s9 +; GCN-NEXT: s_addc_u32 s13, 0, s14 +; GCN-NEXT: s_add_u32 s14, s8, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_addc_u32 s12, s12, s13 +; GCN-NEXT: s_mul_i32 s8, s10, s12 +; GCN-NEXT: v_readfirstlane_b32 s9, v0 +; GCN-NEXT: s_add_i32 s8, s9, s8 +; GCN-NEXT: s_mul_i32 s11, s11, s14 +; GCN-NEXT: s_mul_i32 s9, s10, s14 +; GCN-NEXT: s_add_i32 s8, s8, s11 +; GCN-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 +; GCN-NEXT: s_mul_i32 s11, s14, s8 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: v_readfirstlane_b32 s13, v0 +; GCN-NEXT: s_mul_i32 s9, s12, s9 +; GCN-NEXT: s_addc_u32 s13, 0, s13 +; GCN-NEXT: v_readfirstlane_b32 s10, v3 +; GCN-NEXT: s_add_u32 s9, s11, s9 +; GCN-NEXT: s_addc_u32 s9, s13, s10 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: s_addc_u32 s10, s10, 0 +; GCN-NEXT: s_mul_i32 s8, s12, s8 +; GCN-NEXT: s_add_u32 s8, s9, s8 +; GCN-NEXT: s_addc_u32 s10, 0, s10 +; GCN-NEXT: s_add_u32 s11, s14, s8 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_or_b32 s8, s8, s9 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_addc_u32 s10, s12, s10 +; GCN-NEXT: s_ashr_i32 s8, s7, 31 +; GCN-NEXT: s_add_u32 s6, s6, s8 +; GCN-NEXT: s_mov_b32 s9, s8 +; GCN-NEXT: s_addc_u32 s7, s7, s8 +; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[8:9] +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: s_mul_i32 s12, s6, s10 +; GCN-NEXT: v_readfirstlane_b32 s13, v1 +; GCN-NEXT: v_mul_hi_u32 v1, s7, v2 +; GCN-NEXT: v_readfirstlane_b32 s14, v3 +; GCN-NEXT: 
v_mul_hi_u32 v0, s7, v0 +; GCN-NEXT: s_add_u32 s12, s14, s12 +; GCN-NEXT: s_addc_u32 s13, 0, s13 +; GCN-NEXT: s_mul_i32 s11, s7, s11 +; GCN-NEXT: v_readfirstlane_b32 s14, v1 +; GCN-NEXT: s_add_u32 s11, s12, s11 +; GCN-NEXT: s_addc_u32 s11, s13, s14 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_addc_u32 s12, s12, 0 +; GCN-NEXT: s_mul_i32 s10, s7, s10 +; GCN-NEXT: s_add_u32 s10, s11, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: s_addc_u32 s11, 0, s12 +; GCN-NEXT: s_mul_i32 s11, s4, s11 +; GCN-NEXT: v_readfirstlane_b32 s12, v0 +; GCN-NEXT: s_add_i32 s11, s12, s11 +; GCN-NEXT: s_mul_i32 s12, s5, s10 +; GCN-NEXT: s_add_i32 s12, s11, s12 +; GCN-NEXT: s_sub_i32 s13, s7, s12 +; GCN-NEXT: s_mul_i32 s10, s4, s10 +; GCN-NEXT: s_sub_u32 s6, s6, s10 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s14, s10, s11 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_subb_u32 s13, s13, s5 +; GCN-NEXT: s_sub_u32 s15, s6, s4 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_subb_u32 s16, s13, 0 +; GCN-NEXT: s_cmp_ge_u32 s16, s5 +; GCN-NEXT: s_cselect_b32 s11, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s15, s4 +; GCN-NEXT: s_cselect_b32 s17, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s16, s5 +; GCN-NEXT: s_cselect_b32 s17, s17, s11 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_subb_u32 s13, s13, s5 +; GCN-NEXT: s_sub_u32 s18, s15, s4 +; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 +; GCN-NEXT: s_or_b32 s10, s10, s11 +; GCN-NEXT: s_cmp_lg_u32 s10, 0 +; GCN-NEXT: s_subb_u32 s10, s13, 0 +; GCN-NEXT: s_cmp_lg_u32 s17, 0 +; GCN-NEXT: s_cselect_b32 s11, s18, s15 +; GCN-NEXT: s_cselect_b32 s10, s10, s16 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_subb_u32 s7, s7, s12 +; GCN-NEXT: s_cmp_ge_u32 s7, s5 +; GCN-NEXT: s_cselect_b32 s12, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s6, s4 +; GCN-NEXT: s_cselect_b32 s4, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s7, s5 +; GCN-NEXT: s_cselect_b32 s4, s4, s12 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cselect_b32 s5, s10, s7 +; GCN-NEXT: s_cselect_b32 s4, s11, s6 +; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[8:9] +; GCN-NEXT: s_sub_u32 s4, s4, s8 +; GCN-NEXT: s_subb_u32 s5, s5, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem33_64: @@ -1156,34 +1232,33 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %y) { ; GCN-LABEL: s_test_srem24_48: ; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sext_i32_i16 s9, s9 +; GCN-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s4 ; GCN-NEXT: s_sext_i32_i16 s3, s3 -; GCN-NEXT: s_sext_i32_i16 s5, s5 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_alignbit_b32 v0, s5, v0, 24 -; GCN-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_alignbit_b32 v2, s3, v2, 24 -; GCN-NEXT: v_cvt_f32_i32_e32 v3, v2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GCN-NEXT: v_xor_b32_e32 v5, v2, v0 -; GCN-NEXT: v_ashrrev_i32_e32 v5, 30, v5 -; GCN-NEXT: v_or_b32_e32 v5, 1, v5 -; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 -; GCN-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-NEXT: v_mad_f32 
v3, -v4, v1, v3 -; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 +; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: s_xor_b32 s3, s2, s4 +; GCN-NEXT: s_ashr_i32 s3, s3, 30 +; GCN-NEXT: s_or_b32 s3, s3, 1 +; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| +; GCN-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GCN-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s3, v2 +; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -1192,34 +1267,33 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; ; GCN-IR-LABEL: s_test_srem24_48: ; GCN-IR: ; %bb.0: +; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) +; GCN-IR-NEXT: s_sext_i32_i16 s9, s9 +; GCN-IR-NEXT: s_lshr_b64 s[4:5], s[8:9], 24 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s4 ; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 -; GCN-IR-NEXT: s_sext_i32_i16 s5, s5 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 -; GCN-IR-NEXT: v_alignbit_b32 v0, s5, v0, 24 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s2 -; GCN-IR-NEXT: v_alignbit_b32 v2, s3, v2, 24 -; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v2 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GCN-IR-NEXT: v_xor_b32_e32 v5, v2, v0 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 30, v5 -; GCN-IR-NEXT: v_or_b32_e32 v5, 1, v5 -; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 -; GCN-IR-NEXT: v_trunc_f32_e32 v4, v4 -; GCN-IR-NEXT: v_mad_f32 v3, -v4, v1, v3 -; GCN-IR-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v1| -; GCN-IR-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 +; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s2 +; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: s_xor_b32 s3, s2, s4 +; GCN-IR-NEXT: s_ashr_i32 s3, s3, 30 +; GCN-IR-NEXT: s_or_b32 s3, s3, 1 +; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 +; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 +; GCN-IR-NEXT: v_mad_f32 v1, -v2, v0, v1 +; GCN-IR-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| +; GCN-IR-NEXT: s_and_b64 s[8:9], s[8:9], exec +; GCN-IR-NEXT: s_cselect_b32 s3, s3, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s3, v2 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s4 ; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v1, v0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, v0, v2 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -1236,110 +1310,145 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-LABEL: s_test_srem_k_num_i64: ; GCN: ; %bb.0: ; GCN-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s4, s3, 31 ; GCN-NEXT: s_add_u32 s2, s2, s4 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_addc_u32 s3, s3, s4 -; GCN-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s2, 0, s8 -; GCN-NEXT: s_subb_u32 s3, 0, s9 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GCN-NEXT: s_sub_u32 s2, 0, s4 +; GCN-NEXT: s_subb_u32 s8, 0, s5 +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 -; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 -; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 -; GCN-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v1, s9, v0 -; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-NEXT: v_mul_lo_u32 v0, s8, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc -; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 -; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v5 
-; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v4 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v5 -; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s8, v4 -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] -; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 -; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NEXT: s_mul_i32 s7, s2, s9 +; GCN-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NEXT: s_mul_i32 s10, s8, s6 +; GCN-NEXT: s_mul_i32 s11, s2, s6 +; GCN-NEXT: s_add_i32 s7, s12, s7 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 +; GCN-NEXT: s_add_i32 s7, s7, s10 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s7 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 +; GCN-NEXT: v_readfirstlane_b32 s10, v3 +; GCN-NEXT: s_mul_i32 s13, s6, s7 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s7 +; GCN-NEXT: s_add_u32 s10, s10, s13 +; GCN-NEXT: v_readfirstlane_b32 s13, v0 +; GCN-NEXT: s_mul_i32 s11, s9, s11 +; GCN-NEXT: s_addc_u32 s13, 0, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v4 +; GCN-NEXT: s_add_u32 s10, s10, s11 +; GCN-NEXT: s_addc_u32 s10, s13, s12 +; GCN-NEXT: v_readfirstlane_b32 s11, v1 +; GCN-NEXT: s_addc_u32 s11, s11, 0 +; GCN-NEXT: s_mul_i32 s7, s9, s7 +; GCN-NEXT: s_add_u32 s7, s10, s7 +; GCN-NEXT: s_addc_u32 s10, 0, s11 +; GCN-NEXT: s_add_u32 s11, s6, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: s_or_b32 s6, s6, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_addc_u32 s9, s9, s10 +; GCN-NEXT: s_mul_i32 s6, s2, s9 +; GCN-NEXT: v_readfirstlane_b32 s7, v0 +; GCN-NEXT: s_add_i32 s6, s7, s6 +; GCN-NEXT: s_mul_i32 s8, s8, s11 +; GCN-NEXT: s_mul_i32 s2, s2, s11 +; GCN-NEXT: s_add_i32 s6, s6, s8 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s8, s11, s6 +; GCN-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NEXT: s_add_u32 s8, s12, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_mul_i32 s2, s9, s2 +; GCN-NEXT: s_addc_u32 s10, 0, s10 +; GCN-NEXT: v_readfirstlane_b32 s7, v3 +; GCN-NEXT: s_add_u32 s2, s8, s2 +; GCN-NEXT: s_addc_u32 s2, s10, s7 +; GCN-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NEXT: s_addc_u32 s7, s7, 0 +; GCN-NEXT: s_mul_i32 s6, s9, s6 +; GCN-NEXT: s_add_u32 s2, s2, s6 +; GCN-NEXT: s_addc_u32 s8, 0, s7 +; GCN-NEXT: s_add_u32 s2, s11, s2 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_or_b32 s6, s6, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_addc_u32 s6, s9, s8 +; GCN-NEXT: v_mul_hi_u32 v1, s2, 24 +; GCN-NEXT: v_mul_hi_u32 v0, s6, 24 +; GCN-NEXT: s_mul_i32 
s6, s6, 24 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_readfirstlane_b32 s8, v1 +; GCN-NEXT: v_readfirstlane_b32 s7, v0 +; GCN-NEXT: s_add_u32 s6, s8, s6 +; GCN-NEXT: s_addc_u32 s6, 0, s7 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 +; GCN-NEXT: s_mul_i32 s7, s5, s6 +; GCN-NEXT: s_mul_i32 s6, s4, s6 +; GCN-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-NEXT: s_add_i32 s8, s8, s7 +; GCN-NEXT: s_sub_i32 s9, 0, s8 +; GCN-NEXT: s_sub_u32 s10, 24, s6 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_or_b32 s11, s6, s7 +; GCN-NEXT: s_cmp_lg_u32 s11, 0 +; GCN-NEXT: s_subb_u32 s9, s9, s5 +; GCN-NEXT: s_sub_u32 s12, s10, s4 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_or_b32 s6, s6, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_subb_u32 s13, s9, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s5 +; GCN-NEXT: s_cselect_b32 s7, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s12, s4 +; GCN-NEXT: s_cselect_b32 s14, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s13, s5 +; GCN-NEXT: s_cselect_b32 s14, s14, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_subb_u32 s9, s9, s5 +; GCN-NEXT: s_sub_u32 s15, s12, s4 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_or_b32 s6, s6, s7 +; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_subb_u32 s6, s9, 0 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_cselect_b32 s7, s15, s12 +; GCN-NEXT: s_cselect_b32 s6, s6, s13 +; GCN-NEXT: s_cmp_lg_u32 s11, 0 +; GCN-NEXT: s_subb_u32 s8, 0, s8 +; GCN-NEXT: s_cmp_ge_u32 s8, s5 +; GCN-NEXT: s_cselect_b32 s9, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s10, s4 +; GCN-NEXT: s_cselect_b32 s4, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s8, s5 +; GCN-NEXT: s_cselect_b32 s4, s4, s9 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cselect_b32 s4, s6, s8 +; GCN-NEXT: s_cselect_b32 s5, s7, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem_k_num_i64: diff --git a/llvm/test/CodeGen/AMDGPU/sub_u64.ll b/llvm/test/CodeGen/AMDGPU/sub_u64.ll index baaca4ddeaf05..f79fbd98f1e09 100644 --- a/llvm/test/CodeGen/AMDGPU/sub_u64.ll +++ b/llvm/test/CodeGen/AMDGPU/sub_u64.ll @@ -126,7 +126,7 @@ define amdgpu_ps <2 x float> @test_sub_u64_64bit_imm_v(i64 %a) { ; ; GFX1250-LABEL: test_sub_u64_64bit_imm_v: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], lit64(0x13b9ac9ff), v[0:1] +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], 0x13b9ac9ff, v[0:1] ; GFX1250-NEXT: ; return to shader part epilog %sub = sub i64 5294967295, %a %ret = bitcast i64 %sub to <2 x float> diff --git a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir index 0430c8349f350..1dbeccf7cf984 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir +++ b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir @@ -107,7 +107,7 @@ body: | successors: %bb.13(0x80000000) ; GCN-LABEL: bb.7: - ; GCN: undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec + ; GCN: undef %{{.+}}.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 @@ -123,7 +123,7 @@ body: | successors: %bb.12(0x80000000) ; GCN-LABEL: bb.9: - ; GCN: undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec + ; GCN: undef %{{.+}}.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 @@ -134,7 +134,7 @@ body: | successors: %bb.12(0x80000000) ; GCN-LABEL: bb.10: - ; GCN: undef 
%15.sub0:vreg_128 = V_MOV_B32_e32 2143289344, implicit $exec + ; GCN: undef %{{.+}}.sub0:vreg_128 = V_MOV_B32_e32 2143289344, implicit $exec undef %15.sub0:vreg_128 = V_MOV_B32_e32 2143289344, implicit $exec %15.sub1:vreg_128 = COPY %15.sub0 diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll index 4a5dc8f300af3..d4a8a0d762afd 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -14,16 +14,15 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce) { ; CHECK-LABEL: _Z6kernelILi4000ELi1EEvPd: ; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_mov_b64 s[2:3], 0x100 +; CHECK-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 ; CHECK-NEXT: s_mov_b64 s[0:1], 0 -; CHECK-NEXT: s_load_dword s2, s[0:1], 0x0 -; CHECK-NEXT: s_mov_b64 s[0:1], 0x100 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; CHECK-NEXT: s_load_dword s0, s[0:1], 0x0 +; CHECK-NEXT: s_mov_b32 s2, 0 ; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s1, s2 -; CHECK-NEXT: s_mov_b32 s2, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, s6 +; CHECK-NEXT: s_mov_b32 s1, 0 ; CHECK-NEXT: s_mov_b32 s3, 0x40260000 ; CHECK-NEXT: s_mov_b32 s5, 0x40280000 ; CHECK-NEXT: v_mov_b32_e32 v1, s7 @@ -32,8 +31,8 @@ define amdgpu_kernel void @_Z6kernelILi4000ELi1EEvPd(ptr addrspace(1) %x.coerce) ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 0 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s7, 0x40140000 -; CHECK-NEXT: s_add_i32 s0, s0, s1 -; CHECK-NEXT: s_cmpk_lt_i32 s0, 0xa00 +; CHECK-NEXT: s_add_i32 s1, s1, s0 +; CHECK-NEXT: s_cmpk_lt_i32 s1, 0xa00 ; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7] ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s7, 0x40180000 diff --git a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir index 6966c3d8b6d6a..bc8a383a285b2 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir +++ b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir @@ -36,24 +36,18 @@ body: | ; CHECK-NEXT: SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr6, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: SI_SPILL_AV128_SAVE $vgpr4_vgpr5_vgpr6_vgpr7, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = V_TRUNC_F32_e32 killed $vgpr0, implicit $mode, implicit $exec ; CHECK-NEXT: SI_SPILL_AV32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr5 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, 
killed $vgpr7, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr7 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, $vgpr7, 0, killed $vgpr3, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = SI_SPILL_AV32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) - ; CHECK-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr5 - ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr8 = nofpexcept V_FMA_F32_e64 1, killed $vgpr0, 0, killed $vgpr6, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr2_vgpr3 = COPY killed renamable $vgpr8_vgpr9 - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr0 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, killed $vgpr4, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr6 = nofpexcept V_FMA_F32_e64 1, killed $vgpr0, 0, $vgpr6, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr2_vgpr3 = COPY killed renamable $vgpr6_vgpr7 ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $vgpr0 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, $vgpr4, 0, $vgpr6, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: dead renamable $vgpr1 = V_FMA_F32_e64 0, killed $vgpr5, 0, $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load (s128), addrspace 5) ; CHECK-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index bf1f6980fe25a..1ed04f8782d5d 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -50,7 +50,7 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 @@ -721,16 +721,14 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mov_b32_e32 v3, 0 
; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s2, s2, 0xff000000 -; GCN-NEXT: s_and_b32 s4, s4, 0xff000000 -; GCN-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_alignbit_b32 v0, s5, v0, 24 -; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 24 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NEXT: s_and_b32 s4, s4, 0xff000000 +; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], 24 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-NEXT: s_and_b32 s2, s2, 0xff000000 +; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 @@ -753,16 +751,14 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_and_b32 s2, s2, 0xff000000 -; GCN-IR-NEXT: s_and_b32 s4, s4, 0xff000000 -; GCN-IR-NEXT: s_and_b32 s5, s5, 0xffff -; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 -; GCN-IR-NEXT: v_alignbit_b32 v0, s5, v0, 24 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-IR-NEXT: s_and_b32 s3, s3, 0xffff -; GCN-IR-NEXT: v_mov_b32_e32 v1, s2 -; GCN-IR-NEXT: v_alignbit_b32 v1, s3, v1, 24 -; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GCN-IR-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-IR-NEXT: s_and_b32 s4, s4, 0xff000000 +; GCN-IR-NEXT: s_lshr_b64 s[4:5], s[4:5], 24 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GCN-IR-NEXT: s_and_b32 s2, s2, 0xff000000 +; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 +; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 @@ -788,104 +784,137 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GCN-NEXT: s_sub_u32 s4, 0, s2 -; GCN-NEXT: s_subb_u32 s5, 0, s3 +; GCN-NEXT: s_sub_u32 s6, 0, s2 +; GCN-NEXT: s_subb_u32 s8, 0, s3 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; 
GCN-NEXT: v_mul_lo_u32 v2, s4, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 -; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 -; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 -; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: s_mul_i32 s5, s6, s9 +; GCN-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NEXT: s_mul_i32 s10, s8, s4 +; GCN-NEXT: s_mul_i32 s11, s6, s4 +; GCN-NEXT: s_add_i32 s5, s12, s5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 +; GCN-NEXT: s_add_i32 s5, s5, s10 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s5 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 +; GCN-NEXT: v_readfirstlane_b32 s10, v3 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s5 +; GCN-NEXT: s_mul_i32 s13, s4, s5 +; GCN-NEXT: s_add_u32 s10, s10, s13 +; GCN-NEXT: v_readfirstlane_b32 s13, v0 +; GCN-NEXT: s_mul_i32 s11, s9, s11 +; GCN-NEXT: s_addc_u32 s13, 0, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v4 +; GCN-NEXT: s_add_u32 s10, s10, s11 +; GCN-NEXT: v_readfirstlane_b32 s14, v1 +; GCN-NEXT: s_addc_u32 s10, s13, s12 +; GCN-NEXT: s_addc_u32 s11, s14, 0 +; GCN-NEXT: s_mul_i32 s5, s9, s5 +; GCN-NEXT: s_add_u32 s5, s10, s5 +; GCN-NEXT: s_addc_u32 s10, 0, s11 +; GCN-NEXT: s_add_u32 s11, s4, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_addc_u32 s9, s9, s10 +; GCN-NEXT: s_mul_i32 s4, s6, s9 +; GCN-NEXT: v_readfirstlane_b32 s5, v0 +; GCN-NEXT: s_add_i32 s4, s5, s4 +; GCN-NEXT: s_mul_i32 s8, s8, s11 +; GCN-NEXT: s_mul_i32 s5, s6, s11 +; GCN-NEXT: s_add_i32 s4, s4, s8 +; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s8, s11, s4 +; GCN-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NEXT: s_add_u32 s8, s12, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_mul_i32 s5, s9, s5 +; GCN-NEXT: s_addc_u32 s10, 0, s10 +; GCN-NEXT: v_readfirstlane_b32 s6, v3 +; GCN-NEXT: s_add_u32 s5, s8, s5 +; GCN-NEXT: s_addc_u32 s5, s10, s6 +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: s_addc_u32 s6, s6, 0 +; GCN-NEXT: s_mul_i32 s4, s9, s4 +; GCN-NEXT: s_add_u32 s4, s5, s4 +; GCN-NEXT: s_addc_u32 s6, 0, s6 +; GCN-NEXT: s_add_u32 s8, s11, s4 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_addc_u32 s4, s9, s6 +; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 +; GCN-NEXT: v_mul_hi_u32 v0, 
s4, 24 +; GCN-NEXT: s_mul_i32 s4, s4, 24 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_readfirstlane_b32 s8, v1 +; GCN-NEXT: v_readfirstlane_b32 s5, v0 +; GCN-NEXT: s_add_u32 s4, s8, s4 +; GCN-NEXT: s_addc_u32 s8, 0, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 -; GCN-NEXT: v_mul_hi_u32 v2, s2, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, s2, v0 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, 24, v2 -; GCN-NEXT: v_subb_u32_e64 v3, s[0:1], v3, v4, vcc -; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s2, v2 -; GCN-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v3 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v4 -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] -; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v3 -; GCN-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] -; GCN-NEXT: v_add_i32_e64 v4, s[0:1], 1, v0 -; GCN-NEXT: v_addc_u32_e64 v5, s[0:1], 0, 0, s[0:1] -; GCN-NEXT: v_add_i32_e64 v6, s[0:1], 2, v0 -; GCN-NEXT: v_addc_u32_e64 v7, s[0:1], 0, 0, s[0:1] -; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 -; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v6, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 -; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mul_i32 s0, s3, s8 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_add_i32 s9, s1, s0 +; GCN-NEXT: s_sub_i32 s10, 0, s9 +; GCN-NEXT: s_mul_i32 s0, s2, s8 +; GCN-NEXT: s_sub_u32 s11, 24, s0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s12, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s10, s10, s3 +; GCN-NEXT: s_sub_u32 s13, s11, s2 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_subb_u32 s0, s10, 0 +; GCN-NEXT: s_cmp_ge_u32 s0, s3 +; GCN-NEXT: s_cselect_b32 s1, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s2 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s0, s3 +; GCN-NEXT: s_cselect_b32 s0, s10, s1 +; GCN-NEXT: s_add_u32 s1, s8, 1 +; GCN-NEXT: s_addc_u32 s10, 0, 0 +; GCN-NEXT: s_add_u32 s13, s8, 2 +; GCN-NEXT: s_addc_u32 s14, 0, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_cselect_b32 s0, s13, s1 +; GCN-NEXT: s_cselect_b32 s1, s14, s10 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s9, 0, s9 +; GCN-NEXT: s_cmp_ge_u32 s9, s3 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s11, s2 +; GCN-NEXT: s_cselect_b32 s2, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s9, s3 +; GCN-NEXT: s_cselect_b32 s2, s2, s10 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cselect_b32 s1, s1, 0 +; GCN-NEXT: s_cselect_b32 s0, s0, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/uniform-alignbit.ll b/llvm/test/CodeGen/AMDGPU/uniform-alignbit.ll new file mode 
100644 index 0000000000000..fe8c90ee7b686 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/uniform-alignbit.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN %s + +define amdgpu_kernel void @uniform_build_vector(i64 %in, ptr addrspace(1) %out) { +; GCN-LABEL: uniform_build_vector: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshr_b64 s[4:5], s[0:1], 1 +; GCN-NEXT: s_mov_b32 s5, 0 +; GCN-NEXT: s_mov_b32 s6, s5 +; GCN-NEXT: s_mov_b32 s7, s5 +; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: ; sched_barrier mask(0x00000000) +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; GCN-NEXT: global_store_dword v1, v0, s[2:3] +; GCN-NEXT: s_endpgm +entry: + %shifted = lshr i64 %in, 1 + %trunc = trunc i64 %shifted to i32 + %insert = insertelement <4 x i32> zeroinitializer, i32 %trunc, i64 0 + %load = tail call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %insert, i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 0) + %extract = extractelement <4 x i32> %load, i64 0 + %and = and i32 %extract, 1 + %convert = sitofp i32 %and to float + store float %convert, ptr addrspace(1) %out + ret void +} + +; Function Attrs: convergent nocallback nofree nounwind willreturn +declare void @llvm.amdgcn.sched.barrier(i32 immarg) #0 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read) +declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg) #1 \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index c4d928185d8f4..b846ce7f12466 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -5,119 +5,159 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_urem_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd -; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 -; GCN-NEXT: s_sub_u32 s0, 0, s12 -; GCN-NEXT: s_subb_u32 s1, 0, s13 -; GCN-NEXT: s_mov_b32 s4, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 +; GCN-NEXT: s_sub_u32 s10, 0, s8 +; GCN-NEXT: s_subb_u32 s11, 0, s9 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s5, s9 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 -; GCN-NEXT: 
v_mul_hi_u32 v6, v1, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 -; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 -; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 -; GCN-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc -; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 -; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 -; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] -; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, s11 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; 
GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_mul_hi_u32 v2, s10, v0 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: s_mul_i32 s1, s10, s12 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_mul_i32 s13, s11, s0 +; GCN-NEXT: s_mul_i32 s14, s10, s0 +; GCN-NEXT: s_add_i32 s1, s15, s1 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s14 +; GCN-NEXT: s_add_i32 s1, s1, s13 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s1 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s14 +; GCN-NEXT: v_readfirstlane_b32 s13, v3 +; GCN-NEXT: s_mul_i32 s15, s0, s1 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s1 +; GCN-NEXT: s_add_u32 s13, s13, s15 +; GCN-NEXT: v_readfirstlane_b32 s15, v0 +; GCN-NEXT: s_mul_i32 s14, s12, s14 +; GCN-NEXT: s_addc_u32 s15, 0, s15 +; GCN-NEXT: v_readfirstlane_b32 s16, v4 +; GCN-NEXT: s_add_u32 s13, s13, s14 +; GCN-NEXT: s_addc_u32 s13, s15, s16 +; GCN-NEXT: v_readfirstlane_b32 s14, v1 +; GCN-NEXT: s_addc_u32 s14, s14, 0 +; GCN-NEXT: s_mul_i32 s1, s12, s1 +; GCN-NEXT: s_add_u32 s1, s13, s1 +; GCN-NEXT: s_addc_u32 s13, 0, s14 +; GCN-NEXT: s_add_u32 s14, s0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s10, v0 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_addc_u32 s12, s12, s13 +; GCN-NEXT: s_mul_i32 s0, s10, s12 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_add_i32 s0, s1, s0 +; GCN-NEXT: s_mul_i32 s11, s11, s14 +; GCN-NEXT: s_mul_i32 s1, s10, s14 +; GCN-NEXT: s_add_i32 s0, s0, s11 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mul_hi_u32 v3, s12, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s14, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s12, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s14, v0 +; GCN-NEXT: s_mul_i32 s11, s14, s0 +; GCN-NEXT: v_readfirstlane_b32 s15, v2 +; GCN-NEXT: s_add_u32 s11, s15, s11 +; GCN-NEXT: v_readfirstlane_b32 s13, v0 +; GCN-NEXT: s_mul_i32 s1, s12, s1 +; GCN-NEXT: s_addc_u32 s13, 0, s13 +; GCN-NEXT: v_readfirstlane_b32 s10, v3 +; GCN-NEXT: s_add_u32 s1, s11, s1 +; GCN-NEXT: s_addc_u32 s1, s13, s10 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: s_addc_u32 s10, s10, 0 +; GCN-NEXT: s_mul_i32 s0, s12, s0 +; GCN-NEXT: s_add_u32 s0, s1, s0 +; GCN-NEXT: s_addc_u32 s10, 0, s10 +; GCN-NEXT: s_add_u32 s11, s14, s0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_addc_u32 s1, s12, s10 +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mul_hi_u32 v1, s6, v0 +; GCN-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NEXT: v_mul_hi_u32 v3, s6, v2 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-NEXT: v_mul_hi_u32 v1, s7, v2 +; GCN-NEXT: s_mul_i32 s4, s6, s1 +; GCN-NEXT: v_readfirstlane_b32 s12, v3 +; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 +; GCN-NEXT: s_add_u32 s4, s12, s4 +; GCN-NEXT: s_addc_u32 s10, 0, s10 +; GCN-NEXT: s_mul_i32 s11, s7, s11 +; GCN-NEXT: v_readfirstlane_b32 s12, v1 +; GCN-NEXT: s_add_u32 s4, s4, s11 +; GCN-NEXT: s_addc_u32 s4, s10, s12 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_addc_u32 s10, s10, 0 +; GCN-NEXT: s_mul_i32 s1, s7, s1 +; GCN-NEXT: s_add_u32 s4, s4, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: s_addc_u32 s5, 0, s10 +; GCN-NEXT: s_mul_i32 s5, s8, s5 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_add_i32 
s5, s10, s5 +; GCN-NEXT: s_mul_i32 s10, s9, s4 +; GCN-NEXT: s_add_i32 s10, s5, s10 +; GCN-NEXT: s_sub_i32 s11, s7, s10 +; GCN-NEXT: s_mul_i32 s4, s8, s4 +; GCN-NEXT: s_sub_u32 s6, s6, s4 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s12, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s11, s11, s9 +; GCN-NEXT: s_sub_u32 s13, s6, s8 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s14, s11, 0 +; GCN-NEXT: s_cmp_ge_u32 s14, s9 +; GCN-NEXT: s_cselect_b32 s5, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s8 +; GCN-NEXT: s_cselect_b32 s15, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s14, s9 +; GCN-NEXT: s_cselect_b32 s15, s15, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s11, s11, s9 +; GCN-NEXT: s_sub_u32 s16, s13, s8 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_subb_u32 s4, s11, 0 +; GCN-NEXT: s_cmp_lg_u32 s15, 0 +; GCN-NEXT: s_cselect_b32 s5, s16, s13 +; GCN-NEXT: s_cselect_b32 s4, s4, s14 +; GCN-NEXT: s_cmp_lg_u32 s12, 0 +; GCN-NEXT: s_subb_u32 s7, s7, s10 +; GCN-NEXT: s_cmp_ge_u32 s7, s9 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s6, s8 +; GCN-NEXT: s_cselect_b32 s8, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s7, s9 +; GCN-NEXT: s_cselect_b32 s8, s8, s10 +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b32 s4, s4, s7 +; GCN-NEXT: s_cselect_b32 s5, s5, s6 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem_i64: @@ -764,106 +804,143 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) { ; GCN-LABEL: s_test_urem_k_num_i64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 -; GCN-NEXT: s_mov_b32 s11, 0xf000 -; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GCN-NEXT: s_sub_u32 s0, 0, s6 -; GCN-NEXT: s_subb_u32 s1, 0, s7 -; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GCN-NEXT: s_sub_u32 s6, 0, s2 +; GCN-NEXT: s_subb_u32 s8, 0, s3 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc -; GCN-NEXT: v_add_i32_e32 v2, 
vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 -; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 -; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v1, s7, v0 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 -; GCN-NEXT: v_mul_lo_u32 v0, s6, v0 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc -; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s6, v0 -; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s7, v5 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s6, v4 -; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s7, v5 -; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s6, v4 -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] -; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] -; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 -; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s7, v1 -; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-NEXT: s_mul_i32 s5, s6, s9 +; GCN-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NEXT: s_mul_i32 s10, s8, s4 +; GCN-NEXT: s_mul_i32 s11, s6, s4 +; GCN-NEXT: s_add_i32 s5, s12, s5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, s11 +; GCN-NEXT: s_add_i32 s5, s5, s10 +; GCN-NEXT: v_mul_hi_u32 v0, v0, s5 +; GCN-NEXT: v_mul_hi_u32 v4, v1, s11 +; GCN-NEXT: v_readfirstlane_b32 s10, v3 +; GCN-NEXT: v_mul_hi_u32 v1, v1, s5 +; GCN-NEXT: s_mul_i32 s13, s4, s5 +; GCN-NEXT: s_add_u32 s10, s10, s13 +; GCN-NEXT: v_readfirstlane_b32 s13, v0 +; GCN-NEXT: s_mul_i32 s11, s9, s11 +; GCN-NEXT: s_addc_u32 s13, 0, s13 +; GCN-NEXT: v_readfirstlane_b32 s12, v4 +; GCN-NEXT: 
s_add_u32 s10, s10, s11 +; GCN-NEXT: v_readfirstlane_b32 s14, v1 +; GCN-NEXT: s_addc_u32 s10, s13, s12 +; GCN-NEXT: s_addc_u32 s11, s14, 0 +; GCN-NEXT: s_mul_i32 s5, s9, s5 +; GCN-NEXT: s_add_u32 s5, s10, s5 +; GCN-NEXT: s_addc_u32 s10, 0, s11 +; GCN-NEXT: s_add_u32 s11, s4, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_addc_u32 s9, s9, s10 +; GCN-NEXT: s_mul_i32 s4, s6, s9 +; GCN-NEXT: v_readfirstlane_b32 s5, v0 +; GCN-NEXT: s_add_i32 s4, s5, s4 +; GCN-NEXT: s_mul_i32 s8, s8, s11 +; GCN-NEXT: s_mul_i32 s5, s6, s11 +; GCN-NEXT: s_add_i32 s4, s4, s8 +; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mul_hi_u32 v3, s9, v2 +; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 +; GCN-NEXT: v_mul_hi_u32 v1, s9, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: s_mul_i32 s8, s11, s4 +; GCN-NEXT: v_readfirstlane_b32 s12, v2 +; GCN-NEXT: s_add_u32 s8, s12, s8 +; GCN-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-NEXT: s_mul_i32 s5, s9, s5 +; GCN-NEXT: s_addc_u32 s10, 0, s10 +; GCN-NEXT: v_readfirstlane_b32 s6, v3 +; GCN-NEXT: s_add_u32 s5, s8, s5 +; GCN-NEXT: s_addc_u32 s5, s10, s6 +; GCN-NEXT: v_readfirstlane_b32 s6, v1 +; GCN-NEXT: s_addc_u32 s6, s6, 0 +; GCN-NEXT: s_mul_i32 s4, s9, s4 +; GCN-NEXT: s_add_u32 s4, s5, s4 +; GCN-NEXT: s_addc_u32 s6, 0, s6 +; GCN-NEXT: s_add_u32 s8, s11, s4 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b32 s4, s4, s5 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_addc_u32 s4, s9, s6 +; GCN-NEXT: v_mul_hi_u32 v1, s8, 24 +; GCN-NEXT: v_mul_hi_u32 v0, s4, 24 +; GCN-NEXT: s_mul_i32 s4, s4, 24 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_readfirstlane_b32 s8, v1 +; GCN-NEXT: v_readfirstlane_b32 s5, v0 +; GCN-NEXT: s_add_u32 s4, s8, s4 +; GCN-NEXT: s_addc_u32 s8, 0, s5 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_mul_i32 s0, s3, s8 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_add_i32 s9, s1, s0 +; GCN-NEXT: s_sub_i32 s10, 0, s9 +; GCN-NEXT: s_mul_i32 s0, s2, s8 +; GCN-NEXT: s_sub_u32 s8, 24, s0 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s11, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s11, 0 +; GCN-NEXT: s_subb_u32 s10, s10, s3 +; GCN-NEXT: s_sub_u32 s12, s8, s2 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_subb_u32 s13, s10, 0 +; GCN-NEXT: s_cmp_ge_u32 s13, s3 +; GCN-NEXT: s_cselect_b32 s1, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s12, s2 +; GCN-NEXT: s_cselect_b32 s14, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s13, s3 +; GCN-NEXT: s_cselect_b32 s14, s14, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_subb_u32 s10, s10, s3 +; GCN-NEXT: s_sub_u32 s15, s12, s2 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_cmp_lg_u32 s0, 0 +; GCN-NEXT: s_subb_u32 s0, s10, 0 +; GCN-NEXT: s_cmp_lg_u32 s14, 0 +; GCN-NEXT: s_cselect_b32 s1, s15, s12 +; GCN-NEXT: s_cselect_b32 s0, s0, s13 +; GCN-NEXT: s_cmp_lg_u32 s11, 0 +; GCN-NEXT: s_subb_u32 s9, 0, s9 +; GCN-NEXT: s_cmp_ge_u32 s9, s3 +; GCN-NEXT: s_cselect_b32 s10, -1, 0 +; GCN-NEXT: s_cmp_ge_u32 s8, s2 +; GCN-NEXT: s_cselect_b32 s2, -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s9, s3 +; GCN-NEXT: s_cselect_b32 s2, s2, s10 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_cselect_b32 s0, s0, s9 +; GCN-NEXT: s_cselect_b32 s1, s1, s8 +; GCN-NEXT: v_mov_b32_e32 v0, s1 
+; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_urem_k_num_i64: @@ -956,30 +1033,30 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mul_hi_u32 v3, s2, v2 ; GCN-NEXT: v_mul_hi_u32 v2, s3, v2 -; GCN-NEXT: v_mul_hi_u32 v1, s2, v0 ; GCN-NEXT: s_mul_i32 s5, s3, 0xaaaaaaab +; GCN-NEXT: v_mul_hi_u32 v1, s2, v0 ; GCN-NEXT: v_add_i32_e32 v3, vcc, s5, v3 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mul_i32 s0, s2, 0xaaaaaaaa ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v3 -; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 +; GCN-NEXT: v_mul_hi_u32 v3, s3, v0 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-NEXT: s_mul_i32 s0, s3, 0xaaaaaaaa ; GCN-NEXT: v_addc_u32_e64 v2, s[8:9], 0, 0, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v1 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v0, v2, vcc -; GCN-NEXT: v_alignbit_b32 v1, v0, v1, 4 -; GCN-NEXT: v_lshrrev_b32_e32 v0, 4, v0 -; GCN-NEXT: v_mul_hi_u32 v2, v1, 24 +; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v2, vcc +; GCN-NEXT: v_lshrrev_b32_e32 v2, 4, v1 +; GCN-NEXT: v_lshr_b64 v[0:1], v[0:1], 4 +; GCN-NEXT: v_mul_lo_u32 v1, v2, 24 +; GCN-NEXT: v_mul_hi_u32 v2, v0, 24 ; GCN-NEXT: v_mul_lo_u32 v0, v0, 24 -; GCN-NEXT: v_mul_lo_u32 v1, v1, 24 -; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v1 -; GCN-NEXT: v_subb_u32_e32 v1, vcc, v3, v2, vcc +; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll index c12871536bafa..f5dc824aae35f 100644 --- a/llvm/test/CodeGen/AMDGPU/v_mac.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll @@ -116,7 +116,7 @@ entry: ; GCN-LABEL: {{^}}nsz_mad_sub0_src0: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define amdgpu_kernel void @nsz_mad_sub0_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +define amdgpu_kernel void @nsz_mad_sub0_src0(ptr addrspace(1) %out, ptr addrspace(1) %in) { entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -125,7 +125,7 @@ entry: %b = load float, ptr addrspace(1) %b_ptr %c = load float, ptr addrspace(1) %c_ptr - %neg_a = fsub float 0.0, %a + %neg_a = fsub nsz float 0.0, %a %tmp0 = fmul float %neg_a, %b %tmp1 = fadd float %tmp0, %c @@ -176,7 +176,7 @@ entry: ; GCN-LABEL: {{^}}nsz_mad_sub0_src1: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define amdgpu_kernel void @nsz_mad_sub0_src1(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 { +define amdgpu_kernel void @nsz_mad_sub0_src1(ptr addrspace(1) %out, ptr addrspace(1) %in) { entry: %b_ptr = getelementptr float, ptr addrspace(1) %in, i32 1 %c_ptr = getelementptr float, ptr addrspace(1) %in, i32 2 @@ -185,7 +185,7 @@ entry: %b = load float, ptr addrspace(1) %b_ptr %c = load float, ptr addrspace(1) %c_ptr - %neg_b = fsub float 0.0, %b + %neg_b = fsub nsz float 0.0, %b %tmp0 = fmul float %a, %neg_b %tmp1 = fadd float %tmp0, %c @@ -310,6 +310,5 @@ define float 
@v_mac_f32_dynamic_ftz(float %a, float %b, float %c) "denormal-fp-m declare i32 @llvm.amdgcn.workitem.id.x() #2 attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" } -attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" } attributes #2 = { nounwind readnone } attributes #3 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll index bcc60b06db291..8da6f2348690a 100644 --- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll @@ -236,7 +236,7 @@ entry: %b.val = load half, ptr addrspace(1) %b %c.val = load half, ptr addrspace(1) %c - %a.neg = fsub half 0.0, %a.val + %a.neg = fsub nsz half 0.0, %a.val %t.val = fmul half %a.neg, %b.val %r.val = fadd half %t.val, %c.val @@ -263,7 +263,7 @@ entry: %b.val = load half, ptr addrspace(1) %b %c.val = load half, ptr addrspace(1) %c - %b.neg = fsub half 0.0, %b.val + %b.neg = fsub nsz half 0.0, %b.val %t.val = fmul half %a.val, %b.neg %r.val = fadd half %t.val, %c.val @@ -290,7 +290,7 @@ entry: %b.val = load half, ptr addrspace(1) %b %c.val = load half, ptr addrspace(1) %c - %c.neg = fsub half 0.0, %c.val + %c.neg = fsub nsz half 0.0, %c.val %t.val = fmul half %a.val, %b.val %r.val = fadd half %t.val, %c.neg @@ -601,7 +601,7 @@ entry: %b.val = load <2 x half>, ptr addrspace(1) %b %c.val = load <2 x half>, ptr addrspace(1) %c - %a.neg = fsub <2 x half> <half 0.0, half 0.0>, %a.val + %a.neg = fsub nsz <2 x half> <half 0.0, half 0.0>, %a.val %t.val = fmul <2 x half> %a.neg, %b.val %r.val = fadd <2 x half> %t.val, %c.val @@ -634,7 +634,7 @@ entry: %b.val = load <2 x half>, ptr addrspace(1) %b %c.val = load <2 x half>, ptr addrspace(1) %c - %b.neg = fsub <2 x half> <half 0.0, half 0.0>, %b.val + %b.neg = fsub nsz <2 x half> <half 0.0, half 0.0>, %b.val %t.val = fmul <2 x half> %a.val, %b.neg %r.val = fadd <2 x half> %t.val, %c.val @@ -667,7 +667,7 @@ entry: %b.val = load <2 x half>, ptr addrspace(1) %b %c.val = load <2 x half>, ptr addrspace(1) %c - %c.neg = fsub <2 x half> <half 0.0, half 0.0>, %c.val + %c.neg = fsub nsz <2 x half> <half 0.0, half 0.0>, %c.val %t.val = fmul <2 x half> %a.val, %b.val %r.val = fadd <2 x half> %t.val, %c.neg @@ -678,5 +678,5 @@ entry: declare void @llvm.amdgcn.s.barrier() #2 attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" } -attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #1 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } attributes #2 = { nounwind convergent } diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir b/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir index 95aaea6ea8091..27229cd518028 100644 --- a/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir +++ b/llvm/test/CodeGen/AMDGPU/v_swap_b32.mir @@ -1,4 +1,5 @@ # RUN: llc -simplify-mir -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -simplify-mir -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-shrink-instructions -verify-machineinstrs %s -o - -debugify-and-strip-all-safe | FileCheck -check-prefix=GCN %s # RUN: llc -simplify-mir -mtriple=amdgcn -mcpu=gfx900 -passes=si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: swap_phys_condensed diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir new file mode 100644 index 0000000000000..8a70a8acd28d3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-lowering-gfx1250-t16.mir @@ -0,0 +1,66 @@ +# RUN: llc
-mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -start-before=amdgpu-lower-vgpr-encoding -o - %s | FileCheck -check-prefixes=GCN,ASM %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -start-before=amdgpu-lower-vgpr-encoding -o - %s | llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -filetype=obj -o - | llvm-objdump -d --mcpu=gfx1250 --mattr=+real-true16 - | FileCheck -check-prefixes=GCN,DIS %s + +# ASM-LABEL: {{^}}high_vgprs: +# DIS-LABEL: <high_vgprs>: +--- +name: high_vgprs +tracksRegLiveness: true +body: | + bb.0: + ; ASM: %bb.0: + + ; GCN-NEXT: v_add_f16_e64 v0.h, v1.h, v2.h + $vgpr0_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr1_hi16, 0, undef $vgpr2_hi16, 0, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: v_add_f16_e64 v0.l, v1.l, v2.l + $vgpr0_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr1_lo16, 0, undef $vgpr2_lo16, 0, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: v_add_f16_e64 v128.h, v129.h, v130.h + $vgpr128_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr129_hi16, 0, undef $vgpr130_hi16, 0, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: v_add_f16_e64 v128.l, v129.l, v130.l + $vgpr128_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr129_lo16, 0, undef $vgpr130_lo16, 0, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: s_set_vgpr_msb 0x45 + ; ASM-SAME: ; msbs: dst=1 src0=1 src1=1 src2=0 + ; GCN-NEXT: v_add_f16_e64 v0.h /*v256.h*/, v1.h /*v257.h*/, v2.h /*v258.h*/ + $vgpr256_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr257_hi16, 0, undef $vgpr258_hi16, 0, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: v_add_f16_e64 v0.l /*v256.l*/, v1.l /*v257.l*/, v2.l /*v258.l*/ + $vgpr256_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr257_lo16, 0, undef $vgpr258_lo16, 0, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: v_add_f16_e64 v128.h /*v384.h*/, v129.h /*v385.h*/, v130.h /*v386.h*/ + $vgpr384_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr385_hi16, 0, undef $vgpr386_hi16, 0, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: v_add_f16_e64 v128.l /*v384.l*/, v129.l /*v385.l*/, v130.l /*v386.l*/ + $vgpr384_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr385_lo16, 0, undef $vgpr386_lo16, 0, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: s_set_vgpr_msb 0x8a + ; ASM-SAME: ; msbs: dst=2 src0=2 src1=2 src2=0 + ; GCN-NEXT: v_add_f16_e64 v0.h /*v512.h*/, v1.h /*v513.h*/, v2.h /*v514.h*/ + $vgpr512_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr513_hi16, 0, undef $vgpr514_hi16, 0, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: v_add_f16_e64 v0.l /*v512.l*/, v1.l /*v513.l*/, v2.l /*v514.l*/ + $vgpr512_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr513_lo16, 0, undef $vgpr514_lo16, 0, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: v_add_f16_e64 v128.h /*v640.h*/, v129.h /*v641.h*/, v130.h /*v642.h*/ + $vgpr640_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr641_hi16, 0, undef $vgpr642_hi16, 0, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: v_add_f16_e64 v128.l /*v640.l*/, v129.l /*v641.l*/, v130.l /*v642.l*/ + $vgpr640_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr641_lo16, 0, undef $vgpr642_lo16, 0, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: s_set_vgpr_msb 0xcf + ; ASM-SAME: ; msbs: dst=3 src0=3 src1=3 src2=0 + ; GCN-NEXT: v_add_f16_e64 v0.h /*v768.h*/, v1.h /*v769.h*/, v2.h /*v770.h*/ + $vgpr768_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr769_hi16, 0, undef $vgpr770_hi16, 0, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: v_add_f16_e64 v0.l /*v768.l*/, v1.l /*v769.l*/, v2.l /*v770.l*/ + $vgpr768_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr769_lo16, 0, undef $vgpr770_lo16, 0, 0, 0, implicit $exec, implicit $mode + + ;
GCN-NEXT: v_add_f16_e64 v128.h /*v896.h*/, v129.h /*v897.h*/, v130.h /*v898.h*/ + $vgpr896_hi16 = V_ADD_F16_t16_e64 0, undef $vgpr897_hi16, 0, undef $vgpr898_hi16, 0, 0, 0, implicit $exec, implicit $mode + + ; GCN-NEXT: v_add_f16_e64 v128.l /*v896.l*/, v129.l /*v897.l*/, v130.l /*v898.l*/ + $vgpr896_lo16 = V_ADD_F16_t16_e64 0, undef $vgpr897_lo16, 0, undef $vgpr898_lo16, 0, 0, 0, implicit $exec, implicit $mode +... diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir index 0ddd2aa285b26..0d54bfaed8130 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s -debugify-and-strip-all-safe | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX12 %s diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 2a76d83cd7dac..75db3879e7b03 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -730,19 +730,19 @@ bb: define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_udiv64: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_or_b64 s[8:9], s[6:7], s[4:5] -; GFX1032-NEXT: s_mov_b32 s8, 0 -; GFX1032-NEXT: s_cmp_lg_u64 s[8:9], 0 +; GFX1032-NEXT: s_or_b64 s[4:5], s[2:3], s[0:1] +; GFX1032-NEXT: s_mov_b32 s4, 0 +; GFX1032-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1032-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX1032-NEXT: s_sub_u32 s9, 0, s4 -; GFX1032-NEXT: s_subb_u32 s10, 0, s5 +; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX1032-NEXT: s_sub_u32 s9, 0, s0 +; GFX1032-NEXT: s_subb_u32 s10, 0, s1 ; GFX1032-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1032-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1032-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -751,160 +751,158 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 -; GFX1032-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1032-NEXT: s_mul_i32 s11, s9, s0 -; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s1 -; GFX1032-NEXT: s_mul_i32 s12, s10, s1 +; GFX1032-NEXT: v_readfirstlane_b32 s5, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s8, v0 +; GFX1032-NEXT: s_mul_i32 s11, s9, s5 +; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8 +; GFX1032-NEXT: s_mul_i32 s12, s10, s8 ; GFX1032-NEXT: s_add_i32 s11, s13, s11 -; GFX1032-NEXT: s_mul_i32 s14, s9, s1 +; GFX1032-NEXT: s_mul_i32 s14, s9, s8 ; GFX1032-NEXT: s_add_i32 s11, s11, s12 -; GFX1032-NEXT: s_mul_hi_u32 s13, s1, s14 -; GFX1032-NEXT: s_mul_hi_u32 s15, s0, 
s14 -; GFX1032-NEXT: s_mul_i32 s12, s0, s14 -; GFX1032-NEXT: s_mul_hi_u32 s14, s1, s11 -; GFX1032-NEXT: s_mul_i32 s1, s1, s11 -; GFX1032-NEXT: s_mul_hi_u32 s16, s0, s11 -; GFX1032-NEXT: s_add_u32 s1, s13, s1 -; GFX1032-NEXT: s_addc_u32 s13, 0, s14 -; GFX1032-NEXT: s_add_u32 s1, s1, s12 -; GFX1032-NEXT: s_mul_i32 s11, s0, s11 -; GFX1032-NEXT: s_addc_u32 s1, s13, s15 -; GFX1032-NEXT: s_addc_u32 s12, s16, 0 -; GFX1032-NEXT: s_add_u32 s1, s1, s11 -; GFX1032-NEXT: s_addc_u32 s11, 0, s12 -; GFX1032-NEXT: v_add_co_u32 v0, s1, v0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_addc_u32 s0, s0, s11 -; GFX1032-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1032-NEXT: s_mul_i32 s11, s9, s0 -; GFX1032-NEXT: s_mul_hi_u32 s12, s9, s1 -; GFX1032-NEXT: s_mul_i32 s10, s10, s1 -; GFX1032-NEXT: s_add_i32 s11, s12, s11 -; GFX1032-NEXT: s_mul_i32 s9, s9, s1 -; GFX1032-NEXT: s_add_i32 s11, s11, s10 -; GFX1032-NEXT: s_mul_hi_u32 s12, s0, s9 -; GFX1032-NEXT: s_mul_i32 s13, s0, s9 -; GFX1032-NEXT: s_mul_hi_u32 s9, s1, s9 -; GFX1032-NEXT: s_mul_hi_u32 s14, s1, s11 -; GFX1032-NEXT: s_mul_i32 s1, s1, s11 -; GFX1032-NEXT: s_mul_hi_u32 s10, s0, s11 -; GFX1032-NEXT: s_add_u32 s1, s9, s1 -; GFX1032-NEXT: s_addc_u32 s9, 0, s14 -; GFX1032-NEXT: s_add_u32 s1, s1, s13 -; GFX1032-NEXT: s_mul_i32 s11, s0, s11 -; GFX1032-NEXT: s_addc_u32 s1, s9, s12 -; GFX1032-NEXT: s_addc_u32 s9, s10, 0 -; GFX1032-NEXT: s_add_u32 s1, s1, s11 -; GFX1032-NEXT: s_addc_u32 s9, 0, s9 -; GFX1032-NEXT: v_add_co_u32 v0, s1, v0, s1 -; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 -; GFX1032-NEXT: s_addc_u32 s0, s0, s9 -; GFX1032-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1032-NEXT: s_mul_i32 s10, s6, s0 -; GFX1032-NEXT: s_mul_hi_u32 s9, s6, s0 -; GFX1032-NEXT: s_mul_hi_u32 s11, s7, s0 -; GFX1032-NEXT: s_mul_i32 s0, s7, s0 -; GFX1032-NEXT: s_mul_hi_u32 s12, s6, s1 -; GFX1032-NEXT: s_mul_hi_u32 s13, s7, s1 -; GFX1032-NEXT: s_mul_i32 s1, s7, s1 -; GFX1032-NEXT: s_add_u32 s10, s12, s10 -; GFX1032-NEXT: s_addc_u32 s9, 0, s9 -; GFX1032-NEXT: s_add_u32 s1, s10, s1 -; GFX1032-NEXT: s_addc_u32 s1, s9, s13 -; GFX1032-NEXT: s_addc_u32 s9, s11, 0 -; GFX1032-NEXT: s_add_u32 s1, s1, s0 -; GFX1032-NEXT: s_addc_u32 s9, 0, s9 -; GFX1032-NEXT: s_mul_hi_u32 s0, s4, s1 -; GFX1032-NEXT: s_mul_i32 s11, s4, s9 -; GFX1032-NEXT: s_mul_i32 s12, s4, s1 -; GFX1032-NEXT: s_add_i32 s0, s0, s11 -; GFX1032-NEXT: v_sub_co_u32 v0, s11, s6, s12 -; GFX1032-NEXT: s_mul_i32 s10, s5, s1 -; GFX1032-NEXT: s_add_i32 s0, s0, s10 -; GFX1032-NEXT: v_sub_co_u32 v1, s12, v0, s4 -; GFX1032-NEXT: s_sub_i32 s10, s7, s0 +; GFX1032-NEXT: s_mul_hi_u32 s13, s8, s14 +; GFX1032-NEXT: s_mul_i32 s16, s8, s11 +; GFX1032-NEXT: s_mul_hi_u32 s15, s5, s14 +; GFX1032-NEXT: s_mul_i32 s12, s5, s14 +; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s11 +; GFX1032-NEXT: s_add_u32 s13, s13, s16 +; GFX1032-NEXT: s_addc_u32 s14, 0, s14 +; GFX1032-NEXT: s_mul_hi_u32 s17, s5, s11 +; GFX1032-NEXT: s_add_u32 s12, s13, s12 +; GFX1032-NEXT: s_mul_i32 s11, s5, s11 +; GFX1032-NEXT: s_addc_u32 s12, s14, s15 +; GFX1032-NEXT: s_addc_u32 s13, s17, 0 +; GFX1032-NEXT: s_add_u32 s11, s12, s11 +; GFX1032-NEXT: s_addc_u32 s12, 0, s13 +; GFX1032-NEXT: s_add_u32 s8, s8, s11 +; GFX1032-NEXT: s_cselect_b32 s11, -1, 0 +; GFX1032-NEXT: s_mul_hi_u32 s13, s9, s8 ; GFX1032-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1032-NEXT: s_subb_u32 s10, s10, s5 -; GFX1032-NEXT: s_cmp_lg_u32 s12, 0 -; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v1 -; GFX1032-NEXT: s_subb_u32 s10, s10, 0 -; GFX1032-NEXT: s_cmp_ge_u32 s10, s5 -; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo +; GFX1032-NEXT: 
s_mul_i32 s11, s9, s8 +; GFX1032-NEXT: s_addc_u32 s5, s5, s12 +; GFX1032-NEXT: s_mul_i32 s10, s10, s8 +; GFX1032-NEXT: s_mul_i32 s9, s9, s5 +; GFX1032-NEXT: s_mul_hi_u32 s12, s8, s11 +; GFX1032-NEXT: s_add_i32 s9, s13, s9 +; GFX1032-NEXT: s_mul_hi_u32 s13, s5, s11 +; GFX1032-NEXT: s_add_i32 s9, s9, s10 +; GFX1032-NEXT: s_mul_i32 s10, s5, s11 +; GFX1032-NEXT: s_mul_i32 s15, s8, s9 +; GFX1032-NEXT: s_mul_hi_u32 s14, s8, s9 +; GFX1032-NEXT: s_add_u32 s12, s12, s15 +; GFX1032-NEXT: s_addc_u32 s14, 0, s14 +; GFX1032-NEXT: s_mul_hi_u32 s11, s5, s9 +; GFX1032-NEXT: s_add_u32 s10, s12, s10 +; GFX1032-NEXT: s_mul_i32 s9, s5, s9 +; GFX1032-NEXT: s_addc_u32 s10, s14, s13 +; GFX1032-NEXT: s_addc_u32 s11, s11, 0 +; GFX1032-NEXT: s_add_u32 s9, s10, s9 +; GFX1032-NEXT: s_addc_u32 s10, 0, s11 +; GFX1032-NEXT: s_add_u32 s8, s8, s9 +; GFX1032-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1032-NEXT: s_mul_hi_u32 s11, s2, s8 +; GFX1032-NEXT: s_cmp_lg_u32 s9, 0 +; GFX1032-NEXT: s_mul_hi_u32 s9, s3, s8 +; GFX1032-NEXT: s_addc_u32 s5, s5, s10 +; GFX1032-NEXT: s_mul_i32 s8, s3, s8 +; GFX1032-NEXT: s_mul_i32 s12, s2, s5 +; GFX1032-NEXT: s_mul_hi_u32 s10, s2, s5 +; GFX1032-NEXT: s_add_u32 s11, s11, s12 +; GFX1032-NEXT: s_addc_u32 s10, 0, s10 +; GFX1032-NEXT: s_mul_hi_u32 s13, s3, s5 +; GFX1032-NEXT: s_add_u32 s8, s11, s8 +; GFX1032-NEXT: s_mul_i32 s5, s3, s5 +; GFX1032-NEXT: s_addc_u32 s8, s10, s9 +; GFX1032-NEXT: s_addc_u32 s9, s13, 0 +; GFX1032-NEXT: s_add_u32 s5, s8, s5 +; GFX1032-NEXT: s_addc_u32 s8, 0, s9 +; GFX1032-NEXT: s_mul_hi_u32 s9, s0, s5 +; GFX1032-NEXT: s_mul_i32 s10, s0, s8 +; GFX1032-NEXT: s_mul_i32 s11, s1, s5 +; GFX1032-NEXT: s_add_i32 s9, s9, s10 +; GFX1032-NEXT: s_mul_i32 s10, s0, s5 +; GFX1032-NEXT: s_add_i32 s9, s9, s11 +; GFX1032-NEXT: s_sub_i32 s11, s3, s9 +; GFX1032-NEXT: s_sub_u32 s10, s2, s10 ; GFX1032-NEXT: s_cselect_b32 s12, -1, 0 -; GFX1032-NEXT: s_cmp_eq_u32 s10, s5 -; GFX1032-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GFX1032-NEXT: s_add_u32 s10, s1, 1 -; GFX1032-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1032-NEXT: s_addc_u32 s12, s9, 0 -; GFX1032-NEXT: s_add_u32 s13, s1, 2 -; GFX1032-NEXT: s_addc_u32 s14, s9, 0 +; GFX1032-NEXT: s_cmp_lg_u32 s12, 0 +; GFX1032-NEXT: s_subb_u32 s11, s11, s1 +; GFX1032-NEXT: s_sub_u32 s13, s10, s0 +; GFX1032-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1032-NEXT: s_cmp_lg_u32 s14, 0 +; GFX1032-NEXT: s_subb_u32 s11, s11, 0 +; GFX1032-NEXT: s_cmp_ge_u32 s11, s1 +; GFX1032-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1032-NEXT: s_cmp_ge_u32 s13, s0 +; GFX1032-NEXT: s_cselect_b32 s13, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s11, s1 +; GFX1032-NEXT: s_cselect_b32 s11, s13, s14 +; GFX1032-NEXT: s_add_u32 s13, s5, 1 +; GFX1032-NEXT: s_addc_u32 s14, s8, 0 +; GFX1032-NEXT: s_add_u32 s15, s5, 2 +; GFX1032-NEXT: s_addc_u32 s16, s8, 0 ; GFX1032-NEXT: s_cmp_lg_u32 s11, 0 -; GFX1032-NEXT: v_cmp_le_u32_e32 vcc_lo, s4, v0 -; GFX1032-NEXT: s_subb_u32 s0, s7, s0 -; GFX1032-NEXT: v_mov_b32_e32 v2, s13 -; GFX1032-NEXT: s_cmp_ge_u32 s0, s5 -; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo -; GFX1032-NEXT: s_cselect_b32 s7, -1, 0 -; GFX1032-NEXT: s_cmp_eq_u32 s0, s5 -; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v1, s14 -; GFX1032-NEXT: v_cndmask_b32_e64 v0, s7, v0, s0 -; GFX1032-NEXT: v_cndmask_b32_e32 v2, s10, v2, vcc_lo -; GFX1032-NEXT: v_cndmask_b32_e32 v1, s12, v1, vcc_lo -; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc_lo -; GFX1032-NEXT: v_cndmask_b32_e32 v0, s1, v2, 
vcc_lo -; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 +; GFX1032-NEXT: s_cselect_b32 s11, s15, s13 +; GFX1032-NEXT: s_cselect_b32 s13, s16, s14 +; GFX1032-NEXT: s_cmp_lg_u32 s12, 0 +; GFX1032-NEXT: s_subb_u32 s3, s3, s9 +; GFX1032-NEXT: s_cmp_ge_u32 s3, s1 +; GFX1032-NEXT: s_cselect_b32 s9, -1, 0 +; GFX1032-NEXT: s_cmp_ge_u32 s10, s0 +; GFX1032-NEXT: s_cselect_b32 s10, -1, 0 +; GFX1032-NEXT: s_cmp_eq_u32 s3, s1 +; GFX1032-NEXT: s_cselect_b32 s1, s10, s9 +; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1032-NEXT: s_cselect_b32 s9, s13, s8 +; GFX1032-NEXT: s_cselect_b32 s8, s11, s5 +; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 ; GFX1032-NEXT: s_cbranch_vccnz .LBB15_3 ; GFX1032-NEXT: .LBB15_2: -; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX1032-NEXT: s_sub_i32 s1, 0, s4 +; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX1032-NEXT: s_sub_i32 s3, 0, s0 +; GFX1032-NEXT: s_mov_b32 s9, 0 ; GFX1032-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1032-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1032-NEXT: s_mul_i32 s1, s1, s0 -; GFX1032-NEXT: s_mul_hi_u32 s1, s0, s1 -; GFX1032-NEXT: s_add_i32 s0, s0, s1 -; GFX1032-NEXT: s_mul_hi_u32 s0, s6, s0 -; GFX1032-NEXT: s_mul_i32 s1, s0, s4 -; GFX1032-NEXT: s_add_i32 s5, s0, 1 -; GFX1032-NEXT: s_sub_i32 s1, s6, s1 -; GFX1032-NEXT: s_sub_i32 s6, s1, s4 -; GFX1032-NEXT: s_cmp_ge_u32 s1, s4 -; GFX1032-NEXT: s_cselect_b32 s0, s5, s0 -; GFX1032-NEXT: s_cselect_b32 s1, s6, s1 -; GFX1032-NEXT: s_add_i32 s5, s0, 1 -; GFX1032-NEXT: s_cmp_ge_u32 s1, s4 -; GFX1032-NEXT: s_mov_b32 s1, 0 -; GFX1032-NEXT: s_cselect_b32 s0, s5, s0 -; GFX1032-NEXT: v_mov_b32_e32 v0, s0 -; GFX1032-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1032-NEXT: s_mul_i32 s3, s3, s1 +; GFX1032-NEXT: s_mul_hi_u32 s3, s1, s3 +; GFX1032-NEXT: s_add_i32 s1, s1, s3 +; GFX1032-NEXT: s_mul_hi_u32 s1, s2, s1 +; GFX1032-NEXT: s_mul_i32 s3, s1, s0 +; GFX1032-NEXT: s_sub_i32 s2, s2, s3 +; GFX1032-NEXT: s_add_i32 s3, s1, 1 +; GFX1032-NEXT: s_sub_i32 s4, s2, s0 +; GFX1032-NEXT: s_cmp_ge_u32 s2, s0 +; GFX1032-NEXT: s_cselect_b32 s1, s3, s1 +; GFX1032-NEXT: s_cselect_b32 s2, s4, s2 +; GFX1032-NEXT: s_add_i32 s3, s1, 1 +; GFX1032-NEXT: s_cmp_ge_u32 s2, s0 +; GFX1032-NEXT: s_cselect_b32 s8, s3, s1 ; GFX1032-NEXT: .LBB15_3: +; GFX1032-NEXT: v_mov_b32_e32 v0, s8 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0 -; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:16 +; GFX1032-NEXT: v_mov_b32_e32 v1, s9 +; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] offset:16 ; GFX1032-NEXT: s_endpgm ; GFX1032-NEXT: .LBB15_4: -; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1032-NEXT: ; implicit-def: $sgpr8_sgpr9 ; GFX1032-NEXT: s_branch .LBB15_2 ; ; GFX1064-LABEL: test_udiv64: ; GFX1064: ; %bb.0: ; %bb -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_or_b64 s[0:1], s[6:7], s[4:5] -; GFX1064-NEXT: s_mov_b32 s0, 0 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_or_b64 s[4:5], s[2:3], s[0:1] +; GFX1064-NEXT: s_mov_b32 s4, 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GFX1064-NEXT: s_cbranch_scc0 .LBB15_4 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX1064-NEXT: s_sub_u32 s9, 0, s4 -; GFX1064-NEXT: 
s_subb_u32 s10, 0, s5 +; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX1064-NEXT: s_sub_u32 s9, 0, s0 +; GFX1064-NEXT: s_subb_u32 s10, 0, s1 ; GFX1064-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GFX1064-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1064-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 @@ -914,141 +912,139 @@ define amdgpu_kernel void @test_udiv64(ptr addrspace(1) %arg) #0 { ; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s8, v1 -; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064-NEXT: s_mul_i32 s1, s9, s8 -; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s0 -; GFX1064-NEXT: s_mul_i32 s11, s10, s0 -; GFX1064-NEXT: s_add_i32 s1, s12, s1 -; GFX1064-NEXT: s_mul_i32 s13, s9, s0 -; GFX1064-NEXT: s_add_i32 s1, s1, s11 -; GFX1064-NEXT: s_mul_hi_u32 s12, s0, s13 +; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 +; GFX1064-NEXT: s_mul_i32 s5, s9, s8 +; GFX1064-NEXT: s_mul_hi_u32 s12, s9, s4 +; GFX1064-NEXT: s_mul_i32 s11, s10, s4 +; GFX1064-NEXT: s_add_i32 s5, s12, s5 +; GFX1064-NEXT: s_mul_i32 s13, s9, s4 +; GFX1064-NEXT: s_add_i32 s5, s5, s11 +; GFX1064-NEXT: s_mul_hi_u32 s12, s4, s13 +; GFX1064-NEXT: s_mul_i32 s15, s4, s5 ; GFX1064-NEXT: s_mul_hi_u32 s14, s8, s13 ; GFX1064-NEXT: s_mul_i32 s11, s8, s13 -; GFX1064-NEXT: s_mul_hi_u32 s13, s0, s1 -; GFX1064-NEXT: s_mul_i32 s0, s0, s1 -; GFX1064-NEXT: s_mul_hi_u32 s15, s8, s1 -; GFX1064-NEXT: s_add_u32 s0, s12, s0 -; GFX1064-NEXT: s_addc_u32 s12, 0, s13 -; GFX1064-NEXT: s_add_u32 s0, s0, s11 -; GFX1064-NEXT: s_mul_i32 s1, s8, s1 -; GFX1064-NEXT: s_addc_u32 s0, s12, s14 -; GFX1064-NEXT: s_addc_u32 s11, s15, 0 -; GFX1064-NEXT: s_add_u32 s0, s0, s1 -; GFX1064-NEXT: s_addc_u32 s11, 0, s11 -; GFX1064-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1064-NEXT: s_mul_hi_u32 s13, s4, s5 +; GFX1064-NEXT: s_add_u32 s12, s12, s15 +; GFX1064-NEXT: s_addc_u32 s13, 0, s13 +; GFX1064-NEXT: s_mul_hi_u32 s16, s8, s5 +; GFX1064-NEXT: s_add_u32 s11, s12, s11 +; GFX1064-NEXT: s_mul_i32 s5, s8, s5 +; GFX1064-NEXT: s_addc_u32 s11, s13, s14 +; GFX1064-NEXT: s_addc_u32 s12, s16, 0 +; GFX1064-NEXT: s_add_u32 s5, s11, s5 +; GFX1064-NEXT: s_addc_u32 s11, 0, s12 +; GFX1064-NEXT: s_add_u32 s12, s4, s5 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_mul_hi_u32 s13, s9, s12 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_mul_i32 s4, s9, s12 ; GFX1064-NEXT: s_addc_u32 s8, s8, s11 -; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064-NEXT: s_mul_i32 s1, s9, s8 -; GFX1064-NEXT: s_mul_hi_u32 s11, s9, s0 -; GFX1064-NEXT: s_mul_i32 s10, s10, s0 -; GFX1064-NEXT: s_add_i32 s1, s11, s1 -; GFX1064-NEXT: s_mul_i32 s9, s9, s0 -; GFX1064-NEXT: s_add_i32 s1, s1, s10 -; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s9 -; GFX1064-NEXT: s_mul_i32 s12, s8, s9 -; GFX1064-NEXT: s_mul_hi_u32 s9, s0, s9 -; GFX1064-NEXT: s_mul_hi_u32 s13, s0, s1 -; GFX1064-NEXT: s_mul_i32 s0, s0, s1 -; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s1 -; GFX1064-NEXT: s_add_u32 s0, s9, s0 -; GFX1064-NEXT: s_addc_u32 s9, 0, s13 -; GFX1064-NEXT: s_add_u32 s0, s0, s12 -; GFX1064-NEXT: s_mul_i32 s1, s8, s1 -; GFX1064-NEXT: s_addc_u32 s0, s9, s11 -; GFX1064-NEXT: s_addc_u32 s9, s10, 0 -; GFX1064-NEXT: s_add_u32 s0, s0, s1 +; GFX1064-NEXT: s_mul_i32 s10, s10, s12 +; GFX1064-NEXT: s_mul_i32 s9, s9, s8 +; GFX1064-NEXT: s_mul_hi_u32 s5, s12, s4 +; GFX1064-NEXT: s_add_i32 s9, s13, s9 +; GFX1064-NEXT: s_mul_hi_u32 s11, s8, s4 +; GFX1064-NEXT: s_add_i32 s9, s9, s10 +; GFX1064-NEXT: s_mul_i32 s4, s8, s4 
+; GFX1064-NEXT: s_mul_i32 s14, s12, s9 +; GFX1064-NEXT: s_mul_hi_u32 s13, s12, s9 +; GFX1064-NEXT: s_add_u32 s5, s5, s14 +; GFX1064-NEXT: s_addc_u32 s13, 0, s13 +; GFX1064-NEXT: s_mul_hi_u32 s10, s8, s9 +; GFX1064-NEXT: s_add_u32 s4, s5, s4 +; GFX1064-NEXT: s_mul_i32 s9, s8, s9 +; GFX1064-NEXT: s_addc_u32 s4, s13, s11 +; GFX1064-NEXT: s_addc_u32 s5, s10, 0 +; GFX1064-NEXT: s_add_u32 s4, s4, s9 +; GFX1064-NEXT: s_addc_u32 s9, 0, s5 +; GFX1064-NEXT: s_add_u32 s10, s12, s4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_mul_hi_u32 s11, s2, s10 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_mul_hi_u32 s4, s3, s10 +; GFX1064-NEXT: s_addc_u32 s5, s8, s9 +; GFX1064-NEXT: s_mul_i32 s8, s3, s10 +; GFX1064-NEXT: s_mul_i32 s10, s2, s5 +; GFX1064-NEXT: s_mul_hi_u32 s9, s2, s5 +; GFX1064-NEXT: s_add_u32 s10, s11, s10 ; GFX1064-NEXT: s_addc_u32 s9, 0, s9 -; GFX1064-NEXT: v_add_co_u32 v0, s[0:1], v0, s0 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_addc_u32 s0, s8, s9 -; GFX1064-NEXT: v_readfirstlane_b32 s1, v0 -; GFX1064-NEXT: s_mul_i32 s9, s6, s0 -; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s0 -; GFX1064-NEXT: s_mul_hi_u32 s10, s7, s0 -; GFX1064-NEXT: s_mul_i32 s0, s7, s0 -; GFX1064-NEXT: s_mul_hi_u32 s11, s6, s1 -; GFX1064-NEXT: s_mul_hi_u32 s12, s7, s1 -; GFX1064-NEXT: s_mul_i32 s1, s7, s1 -; GFX1064-NEXT: s_add_u32 s9, s11, s9 -; GFX1064-NEXT: s_addc_u32 s8, 0, s8 -; GFX1064-NEXT: s_add_u32 s1, s9, s1 -; GFX1064-NEXT: s_addc_u32 s1, s8, s12 -; GFX1064-NEXT: s_addc_u32 s8, s10, 0 -; GFX1064-NEXT: s_add_u32 s10, s1, s0 +; GFX1064-NEXT: s_mul_hi_u32 s12, s3, s5 +; GFX1064-NEXT: s_add_u32 s8, s10, s8 +; GFX1064-NEXT: s_mul_i32 s5, s3, s5 +; GFX1064-NEXT: s_addc_u32 s4, s9, s4 +; GFX1064-NEXT: s_addc_u32 s8, s12, 0 +; GFX1064-NEXT: s_add_u32 s10, s4, s5 ; GFX1064-NEXT: s_addc_u32 s11, 0, s8 -; GFX1064-NEXT: s_mul_hi_u32 s0, s4, s10 -; GFX1064-NEXT: s_mul_i32 s1, s4, s11 -; GFX1064-NEXT: s_mul_i32 s9, s4, s10 -; GFX1064-NEXT: s_add_i32 s12, s0, s1 -; GFX1064-NEXT: v_sub_co_u32 v0, s[0:1], s6, s9 -; GFX1064-NEXT: s_mul_i32 s8, s5, s10 -; GFX1064-NEXT: s_add_i32 s12, s12, s8 -; GFX1064-NEXT: v_sub_co_u32 v1, s[8:9], v0, s4 -; GFX1064-NEXT: s_sub_i32 s13, s7, s12 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: s_subb_u32 s13, s13, s5 +; GFX1064-NEXT: s_mul_hi_u32 s4, s0, s10 +; GFX1064-NEXT: s_mul_i32 s5, s0, s11 +; GFX1064-NEXT: s_mul_i32 s8, s1, s10 +; GFX1064-NEXT: s_add_i32 s4, s4, s5 +; GFX1064-NEXT: s_add_i32 s12, s4, s8 +; GFX1064-NEXT: s_mul_i32 s4, s0, s10 +; GFX1064-NEXT: s_sub_i32 s8, s3, s12 +; GFX1064-NEXT: s_sub_u32 s13, s2, s4 +; GFX1064-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_subb_u32 s14, s8, s1 +; GFX1064-NEXT: s_sub_u32 s15, s13, s0 +; GFX1064-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX1064-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s4, v1 -; GFX1064-NEXT: s_subb_u32 s8, s13, 0 -; GFX1064-NEXT: s_cmp_ge_u32 s8, s5 -; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GFX1064-NEXT: s_subb_u32 s8, s14, 0 +; GFX1064-NEXT: s_cmp_ge_u32 s8, s1 ; GFX1064-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1064-NEXT: s_cmp_eq_u32 s8, s5 -; GFX1064-NEXT: s_cselect_b64 vcc, -1, 0 -; GFX1064-NEXT: s_add_u32 s8, s10, 1 -; GFX1064-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc -; GFX1064-NEXT: s_addc_u32 s9, s11, 0 -; GFX1064-NEXT: s_add_u32 s13, s10, 2 +; GFX1064-NEXT: s_cmp_ge_u32 s15, s0 +; GFX1064-NEXT: s_cselect_b32 s14, -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s8, s1 +; GFX1064-NEXT: s_cselect_b32 s8, s14, 
s9 +; GFX1064-NEXT: s_add_u32 s9, s10, 1 ; GFX1064-NEXT: s_addc_u32 s14, s11, 0 -; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 -; GFX1064-NEXT: s_subb_u32 s0, s7, s12 -; GFX1064-NEXT: v_mov_b32_e32 v2, s13 -; GFX1064-NEXT: s_cmp_ge_u32 s0, s5 -; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GFX1064-NEXT: s_cselect_b32 s7, -1, 0 -; GFX1064-NEXT: s_cmp_eq_u32 s0, s5 -; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v1, s14 -; GFX1064-NEXT: v_cndmask_b32_e64 v0, s7, v0, s[0:1] -; GFX1064-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc -; GFX1064-NEXT: v_cndmask_b32_e32 v1, s9, v1, vcc -; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: v_cndmask_b32_e32 v1, s11, v1, vcc -; GFX1064-NEXT: v_cndmask_b32_e32 v0, s10, v2, vcc +; GFX1064-NEXT: s_add_u32 s15, s10, 2 +; GFX1064-NEXT: s_addc_u32 s16, s11, 0 +; GFX1064-NEXT: s_cmp_lg_u32 s8, 0 +; GFX1064-NEXT: s_cselect_b32 s15, s15, s9 +; GFX1064-NEXT: s_cselect_b32 s14, s16, s14 +; GFX1064-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064-NEXT: s_subb_u32 s3, s3, s12 +; GFX1064-NEXT: s_cmp_ge_u32 s3, s1 +; GFX1064-NEXT: s_cselect_b32 s4, -1, 0 +; GFX1064-NEXT: s_cmp_ge_u32 s13, s0 +; GFX1064-NEXT: s_cselect_b32 s5, -1, 0 +; GFX1064-NEXT: s_cmp_eq_u32 s3, s1 +; GFX1064-NEXT: s_cselect_b32 s1, s5, s4 +; GFX1064-NEXT: s_cmp_lg_u32 s1, 0 +; GFX1064-NEXT: s_cselect_b32 s5, s14, s11 +; GFX1064-NEXT: s_cselect_b32 s4, s15, s10 ; GFX1064-NEXT: s_cbranch_execnz .LBB15_3 ; GFX1064-NEXT: .LBB15_2: -; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX1064-NEXT: s_sub_i32 s1, 0, s4 +; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX1064-NEXT: s_sub_i32 s3, 0, s0 +; GFX1064-NEXT: s_mov_b32 s5, 0 ; GFX1064-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX1064-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1064-NEXT: s_mul_i32 s1, s1, s0 -; GFX1064-NEXT: s_mul_hi_u32 s1, s0, s1 -; GFX1064-NEXT: s_add_i32 s0, s0, s1 -; GFX1064-NEXT: s_mul_hi_u32 s0, s6, s0 -; GFX1064-NEXT: s_mul_i32 s1, s0, s4 -; GFX1064-NEXT: s_add_i32 s5, s0, 1 -; GFX1064-NEXT: s_sub_i32 s1, s6, s1 -; GFX1064-NEXT: s_sub_i32 s6, s1, s4 -; GFX1064-NEXT: s_cmp_ge_u32 s1, s4 -; GFX1064-NEXT: s_cselect_b32 s0, s5, s0 -; GFX1064-NEXT: s_cselect_b32 s1, s6, s1 -; GFX1064-NEXT: s_add_i32 s5, s0, 1 -; GFX1064-NEXT: s_cmp_ge_u32 s1, s4 -; GFX1064-NEXT: s_mov_b32 s1, 0 -; GFX1064-NEXT: s_cselect_b32 s0, s5, s0 -; GFX1064-NEXT: v_mov_b32_e32 v0, s0 -; GFX1064-NEXT: v_mov_b32_e32 v1, s1 +; GFX1064-NEXT: v_readfirstlane_b32 s1, v0 +; GFX1064-NEXT: s_mul_i32 s3, s3, s1 +; GFX1064-NEXT: s_mul_hi_u32 s3, s1, s3 +; GFX1064-NEXT: s_add_i32 s1, s1, s3 +; GFX1064-NEXT: s_mul_hi_u32 s1, s2, s1 +; GFX1064-NEXT: s_mul_i32 s3, s1, s0 +; GFX1064-NEXT: s_sub_i32 s2, s2, s3 +; GFX1064-NEXT: s_add_i32 s3, s1, 1 +; GFX1064-NEXT: s_sub_i32 s4, s2, s0 +; GFX1064-NEXT: s_cmp_ge_u32 s2, s0 +; GFX1064-NEXT: s_cselect_b32 s1, s3, s1 +; GFX1064-NEXT: s_cselect_b32 s2, s4, s2 +; GFX1064-NEXT: s_add_i32 s3, s1, 1 +; GFX1064-NEXT: s_cmp_ge_u32 s2, s0 +; GFX1064-NEXT: s_cselect_b32 s4, s3, s1 ; GFX1064-NEXT: .LBB15_3: +; GFX1064-NEXT: v_mov_b32_e32 v0, s4 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:16 +; GFX1064-NEXT: v_mov_b32_e32 v1, s5 +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] offset:16 ; GFX1064-NEXT: s_endpgm ; GFX1064-NEXT: .LBB15_4: -; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1064-NEXT: 
; implicit-def: $sgpr4_sgpr5 ; GFX1064-NEXT: s_branch .LBB15_2 bb: %tmp = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll index b17050178c306..e3437fded0429 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll @@ -4537,6 +4537,3152 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2 ret <2 x half> %ret } +define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %active, <2 x half> %x, <2 x half> %y) { + ; This should not be turned into a tail call. +; DAGISEL-LABEL: tail_call_gfx_from_whole_wave: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL-NEXT: s_wait_expcnt 0x0 +; DAGISEL-NEXT: s_wait_samplecnt 0x0 +; DAGISEL-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL-NEXT: s_wait_kmcnt 0x0 +; DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; DAGISEL-NEXT: scratch_store_b32 off, v6, s32 offset:24 +; DAGISEL-NEXT: scratch_store_b32 off, v7, s32 offset:28 +; DAGISEL-NEXT: scratch_store_b32 off, v8, s32 offset:32 +; DAGISEL-NEXT: scratch_store_b32 off, v9, s32 offset:36 +; DAGISEL-NEXT: scratch_store_b32 off, v10, s32 offset:40 +; DAGISEL-NEXT: scratch_store_b32 off, v11, s32 offset:44 +; DAGISEL-NEXT: scratch_store_b32 off, v12, s32 offset:48 +; DAGISEL-NEXT: scratch_store_b32 off, v13, s32 offset:52 +; DAGISEL-NEXT: scratch_store_b32 off, v14, s32 offset:56 +; DAGISEL-NEXT: scratch_store_b32 off, v15, s32 offset:60 +; DAGISEL-NEXT: scratch_store_b32 off, v16, s32 offset:64 +; DAGISEL-NEXT: scratch_store_b32 off, v17, s32 offset:68 +; DAGISEL-NEXT: scratch_store_b32 off, v18, s32 offset:72 +; DAGISEL-NEXT: scratch_store_b32 off, v19, s32 offset:76 +; DAGISEL-NEXT: scratch_store_b32 off, v20, s32 offset:80 +; DAGISEL-NEXT: scratch_store_b32 off, v21, s32 offset:84 +; DAGISEL-NEXT: scratch_store_b32 off, v22, s32 offset:88 +; DAGISEL-NEXT: scratch_store_b32 off, v23, s32 offset:92 +; DAGISEL-NEXT: scratch_store_b32 off, v24, s32 offset:96 +; DAGISEL-NEXT: scratch_store_b32 off, v25, s32 offset:100 +; DAGISEL-NEXT: scratch_store_b32 off, v26, s32 offset:104 +; DAGISEL-NEXT: scratch_store_b32 off, v27, s32 offset:108 +; DAGISEL-NEXT: scratch_store_b32 off, v28, s32 offset:112 +; DAGISEL-NEXT: scratch_store_b32 off, v29, s32 offset:116 +; DAGISEL-NEXT: scratch_store_b32 off, v30, s32 offset:120 +; DAGISEL-NEXT: scratch_store_b32 off, v31, s32 offset:124 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v32, s32 offset:128 +; DAGISEL-NEXT: scratch_store_b32 off, v33, s32 offset:132 +; DAGISEL-NEXT: scratch_store_b32 off, v34, s32 offset:136 +; DAGISEL-NEXT: scratch_store_b32 off, v35, s32 offset:140 +; DAGISEL-NEXT: scratch_store_b32 off, v36, s32 offset:144 +; DAGISEL-NEXT: scratch_store_b32 off, v37, s32 offset:148 +; DAGISEL-NEXT: scratch_store_b32 off, v38, s32 offset:152 +; DAGISEL-NEXT: scratch_store_b32 off, v39, s32 offset:156 +; DAGISEL-NEXT: scratch_store_b32 off, v48, s32 offset:160 +; DAGISEL-NEXT: scratch_store_b32 off, v49, s32 offset:164 +; DAGISEL-NEXT: scratch_store_b32 
off, v50, s32 offset:168 +; DAGISEL-NEXT: scratch_store_b32 off, v51, s32 offset:172 +; DAGISEL-NEXT: scratch_store_b32 off, v52, s32 offset:176 +; DAGISEL-NEXT: scratch_store_b32 off, v53, s32 offset:180 +; DAGISEL-NEXT: scratch_store_b32 off, v54, s32 offset:184 +; DAGISEL-NEXT: scratch_store_b32 off, v55, s32 offset:188 +; DAGISEL-NEXT: scratch_store_b32 off, v64, s32 offset:192 +; DAGISEL-NEXT: scratch_store_b32 off, v65, s32 offset:196 +; DAGISEL-NEXT: scratch_store_b32 off, v66, s32 offset:200 +; DAGISEL-NEXT: scratch_store_b32 off, v67, s32 offset:204 +; DAGISEL-NEXT: scratch_store_b32 off, v68, s32 offset:208 +; DAGISEL-NEXT: scratch_store_b32 off, v69, s32 offset:212 +; DAGISEL-NEXT: scratch_store_b32 off, v70, s32 offset:216 +; DAGISEL-NEXT: scratch_store_b32 off, v71, s32 offset:220 +; DAGISEL-NEXT: scratch_store_b32 off, v80, s32 offset:224 +; DAGISEL-NEXT: scratch_store_b32 off, v81, s32 offset:228 +; DAGISEL-NEXT: scratch_store_b32 off, v82, s32 offset:232 +; DAGISEL-NEXT: scratch_store_b32 off, v83, s32 offset:236 +; DAGISEL-NEXT: scratch_store_b32 off, v84, s32 offset:240 +; DAGISEL-NEXT: scratch_store_b32 off, v85, s32 offset:244 +; DAGISEL-NEXT: scratch_store_b32 off, v86, s32 offset:248 +; DAGISEL-NEXT: scratch_store_b32 off, v87, s32 offset:252 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v96, s32 offset:256 +; DAGISEL-NEXT: scratch_store_b32 off, v97, s32 offset:260 +; DAGISEL-NEXT: scratch_store_b32 off, v98, s32 offset:264 +; DAGISEL-NEXT: scratch_store_b32 off, v99, s32 offset:268 +; DAGISEL-NEXT: scratch_store_b32 off, v100, s32 offset:272 +; DAGISEL-NEXT: scratch_store_b32 off, v101, s32 offset:276 +; DAGISEL-NEXT: scratch_store_b32 off, v102, s32 offset:280 +; DAGISEL-NEXT: scratch_store_b32 off, v103, s32 offset:284 +; DAGISEL-NEXT: scratch_store_b32 off, v112, s32 offset:288 +; DAGISEL-NEXT: scratch_store_b32 off, v113, s32 offset:292 +; DAGISEL-NEXT: scratch_store_b32 off, v114, s32 offset:296 +; DAGISEL-NEXT: scratch_store_b32 off, v115, s32 offset:300 +; DAGISEL-NEXT: scratch_store_b32 off, v116, s32 offset:304 +; DAGISEL-NEXT: scratch_store_b32 off, v117, s32 offset:308 +; DAGISEL-NEXT: scratch_store_b32 off, v118, s32 offset:312 +; DAGISEL-NEXT: scratch_store_b32 off, v119, s32 offset:316 +; DAGISEL-NEXT: scratch_store_b32 off, v128, s32 offset:320 +; DAGISEL-NEXT: scratch_store_b32 off, v129, s32 offset:324 +; DAGISEL-NEXT: scratch_store_b32 off, v130, s32 offset:328 +; DAGISEL-NEXT: scratch_store_b32 off, v131, s32 offset:332 +; DAGISEL-NEXT: scratch_store_b32 off, v132, s32 offset:336 +; DAGISEL-NEXT: scratch_store_b32 off, v133, s32 offset:340 +; DAGISEL-NEXT: scratch_store_b32 off, v134, s32 offset:344 +; DAGISEL-NEXT: scratch_store_b32 off, v135, s32 offset:348 +; DAGISEL-NEXT: scratch_store_b32 off, v144, s32 offset:352 +; DAGISEL-NEXT: scratch_store_b32 off, v145, s32 offset:356 +; DAGISEL-NEXT: scratch_store_b32 off, v146, s32 offset:360 +; DAGISEL-NEXT: scratch_store_b32 off, v147, s32 offset:364 +; DAGISEL-NEXT: scratch_store_b32 off, v148, s32 offset:368 +; DAGISEL-NEXT: scratch_store_b32 off, v149, s32 offset:372 +; DAGISEL-NEXT: scratch_store_b32 off, v150, s32 offset:376 +; DAGISEL-NEXT: scratch_store_b32 off, v151, s32 offset:380 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_store_b32 off, v160, s32 offset:384 +; DAGISEL-NEXT: scratch_store_b32 off, v161, s32 offset:388 +; DAGISEL-NEXT: scratch_store_b32 off, v162, s32 offset:392 +; DAGISEL-NEXT: scratch_store_b32 off, v163, s32 offset:396 +; 
DAGISEL-NEXT: scratch_store_b32 off, v164, s32 offset:400 +; DAGISEL-NEXT: scratch_store_b32 off, v165, s32 offset:404 +; DAGISEL-NEXT: scratch_store_b32 off, v166, s32 offset:408 +; DAGISEL-NEXT: scratch_store_b32 off, v167, s32 offset:412 +; DAGISEL-NEXT: scratch_store_b32 off, v176, s32 offset:416 +; DAGISEL-NEXT: scratch_store_b32 off, v177, s32 offset:420 +; DAGISEL-NEXT: scratch_store_b32 off, v178, s32 offset:424 +; DAGISEL-NEXT: scratch_store_b32 off, v179, s32 offset:428 +; DAGISEL-NEXT: scratch_store_b32 off, v180, s32 offset:432 +; DAGISEL-NEXT: scratch_store_b32 off, v181, s32 offset:436 +; DAGISEL-NEXT: scratch_store_b32 off, v182, s32 offset:440 +; DAGISEL-NEXT: scratch_store_b32 off, v183, s32 offset:444 +; DAGISEL-NEXT: scratch_store_b32 off, v192, s32 offset:448 +; DAGISEL-NEXT: scratch_store_b32 off, v193, s32 offset:452 +; DAGISEL-NEXT: scratch_store_b32 off, v194, s32 offset:456 +; DAGISEL-NEXT: scratch_store_b32 off, v195, s32 offset:460 +; DAGISEL-NEXT: scratch_store_b32 off, v196, s32 offset:464 +; DAGISEL-NEXT: scratch_store_b32 off, v197, s32 offset:468 +; DAGISEL-NEXT: scratch_store_b32 off, v198, s32 offset:472 +; DAGISEL-NEXT: scratch_store_b32 off, v199, s32 offset:476 +; DAGISEL-NEXT: scratch_store_b32 off, v208, s32 offset:480 +; DAGISEL-NEXT: scratch_store_b32 off, v209, s32 offset:484 +; DAGISEL-NEXT: scratch_store_b32 off, v210, s32 offset:488 +; DAGISEL-NEXT: scratch_store_b32 off, v211, s32 offset:492 +; DAGISEL-NEXT: scratch_store_b32 off, v212, s32 offset:496 +; DAGISEL-NEXT: scratch_store_b32 off, v213, s32 offset:500 +; DAGISEL-NEXT: scratch_store_b32 off, v214, s32 offset:504 +; DAGISEL-NEXT: scratch_store_b32 off, v215, s32 offset:508 +; DAGISEL-NEXT: s_clause 0xf +; DAGISEL-NEXT: scratch_store_b32 off, v224, s32 offset:512 +; DAGISEL-NEXT: scratch_store_b32 off, v225, s32 offset:516 +; DAGISEL-NEXT: scratch_store_b32 off, v226, s32 offset:520 +; DAGISEL-NEXT: scratch_store_b32 off, v227, s32 offset:524 +; DAGISEL-NEXT: scratch_store_b32 off, v228, s32 offset:528 +; DAGISEL-NEXT: scratch_store_b32 off, v229, s32 offset:532 +; DAGISEL-NEXT: scratch_store_b32 off, v230, s32 offset:536 +; DAGISEL-NEXT: scratch_store_b32 off, v231, s32 offset:540 +; DAGISEL-NEXT: scratch_store_b32 off, v240, s32 offset:544 +; DAGISEL-NEXT: scratch_store_b32 off, v241, s32 offset:548 +; DAGISEL-NEXT: scratch_store_b32 off, v242, s32 offset:552 +; DAGISEL-NEXT: scratch_store_b32 off, v243, s32 offset:556 +; DAGISEL-NEXT: scratch_store_b32 off, v244, s32 offset:560 +; DAGISEL-NEXT: scratch_store_b32 off, v245, s32 offset:564 +; DAGISEL-NEXT: scratch_store_b32 off, v246, s32 offset:568 +; DAGISEL-NEXT: scratch_store_b32 off, v247, s32 offset:572 +; DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; DAGISEL-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL-NEXT: s_mov_b32 s37, gfx_callee@abs32@hi +; DAGISEL-NEXT: s_mov_b32 s36, gfx_callee@abs32@lo +; DAGISEL-NEXT: v_swap_b32 v0, v1 +; DAGISEL-NEXT: s_wait_alu 0xfffe +; DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; DAGISEL-NEXT: scratch_load_b32 v6, off, s32 offset:24 +; DAGISEL-NEXT: scratch_load_b32 v7, off, s32 offset:28 +; DAGISEL-NEXT: scratch_load_b32 v8, off, s32 offset:32 +; DAGISEL-NEXT: 
scratch_load_b32 v9, off, s32 offset:36 +; DAGISEL-NEXT: scratch_load_b32 v10, off, s32 offset:40 +; DAGISEL-NEXT: scratch_load_b32 v11, off, s32 offset:44 +; DAGISEL-NEXT: scratch_load_b32 v12, off, s32 offset:48 +; DAGISEL-NEXT: scratch_load_b32 v13, off, s32 offset:52 +; DAGISEL-NEXT: scratch_load_b32 v14, off, s32 offset:56 +; DAGISEL-NEXT: scratch_load_b32 v15, off, s32 offset:60 +; DAGISEL-NEXT: scratch_load_b32 v16, off, s32 offset:64 +; DAGISEL-NEXT: scratch_load_b32 v17, off, s32 offset:68 +; DAGISEL-NEXT: scratch_load_b32 v18, off, s32 offset:72 +; DAGISEL-NEXT: scratch_load_b32 v19, off, s32 offset:76 +; DAGISEL-NEXT: scratch_load_b32 v20, off, s32 offset:80 +; DAGISEL-NEXT: scratch_load_b32 v21, off, s32 offset:84 +; DAGISEL-NEXT: scratch_load_b32 v22, off, s32 offset:88 +; DAGISEL-NEXT: scratch_load_b32 v23, off, s32 offset:92 +; DAGISEL-NEXT: scratch_load_b32 v24, off, s32 offset:96 +; DAGISEL-NEXT: scratch_load_b32 v25, off, s32 offset:100 +; DAGISEL-NEXT: scratch_load_b32 v26, off, s32 offset:104 +; DAGISEL-NEXT: scratch_load_b32 v27, off, s32 offset:108 +; DAGISEL-NEXT: scratch_load_b32 v28, off, s32 offset:112 +; DAGISEL-NEXT: scratch_load_b32 v29, off, s32 offset:116 +; DAGISEL-NEXT: scratch_load_b32 v30, off, s32 offset:120 +; DAGISEL-NEXT: scratch_load_b32 v31, off, s32 offset:124 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v32, off, s32 offset:128 +; DAGISEL-NEXT: scratch_load_b32 v33, off, s32 offset:132 +; DAGISEL-NEXT: scratch_load_b32 v34, off, s32 offset:136 +; DAGISEL-NEXT: scratch_load_b32 v35, off, s32 offset:140 +; DAGISEL-NEXT: scratch_load_b32 v36, off, s32 offset:144 +; DAGISEL-NEXT: scratch_load_b32 v37, off, s32 offset:148 +; DAGISEL-NEXT: scratch_load_b32 v38, off, s32 offset:152 +; DAGISEL-NEXT: scratch_load_b32 v39, off, s32 offset:156 +; DAGISEL-NEXT: scratch_load_b32 v48, off, s32 offset:160 +; DAGISEL-NEXT: scratch_load_b32 v49, off, s32 offset:164 +; DAGISEL-NEXT: scratch_load_b32 v50, off, s32 offset:168 +; DAGISEL-NEXT: scratch_load_b32 v51, off, s32 offset:172 +; DAGISEL-NEXT: scratch_load_b32 v52, off, s32 offset:176 +; DAGISEL-NEXT: scratch_load_b32 v53, off, s32 offset:180 +; DAGISEL-NEXT: scratch_load_b32 v54, off, s32 offset:184 +; DAGISEL-NEXT: scratch_load_b32 v55, off, s32 offset:188 +; DAGISEL-NEXT: scratch_load_b32 v64, off, s32 offset:192 +; DAGISEL-NEXT: scratch_load_b32 v65, off, s32 offset:196 +; DAGISEL-NEXT: scratch_load_b32 v66, off, s32 offset:200 +; DAGISEL-NEXT: scratch_load_b32 v67, off, s32 offset:204 +; DAGISEL-NEXT: scratch_load_b32 v68, off, s32 offset:208 +; DAGISEL-NEXT: scratch_load_b32 v69, off, s32 offset:212 +; DAGISEL-NEXT: scratch_load_b32 v70, off, s32 offset:216 +; DAGISEL-NEXT: scratch_load_b32 v71, off, s32 offset:220 +; DAGISEL-NEXT: scratch_load_b32 v80, off, s32 offset:224 +; DAGISEL-NEXT: scratch_load_b32 v81, off, s32 offset:228 +; DAGISEL-NEXT: scratch_load_b32 v82, off, s32 offset:232 +; DAGISEL-NEXT: scratch_load_b32 v83, off, s32 offset:236 +; DAGISEL-NEXT: scratch_load_b32 v84, off, s32 offset:240 +; DAGISEL-NEXT: scratch_load_b32 v85, off, s32 offset:244 +; DAGISEL-NEXT: scratch_load_b32 v86, off, s32 offset:248 +; DAGISEL-NEXT: scratch_load_b32 v87, off, s32 offset:252 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v96, off, s32 offset:256 +; DAGISEL-NEXT: scratch_load_b32 v97, off, s32 offset:260 +; DAGISEL-NEXT: scratch_load_b32 v98, off, s32 offset:264 +; DAGISEL-NEXT: scratch_load_b32 v99, off, s32 offset:268 +; DAGISEL-NEXT: scratch_load_b32 v100, off, 
s32 offset:272 +; DAGISEL-NEXT: scratch_load_b32 v101, off, s32 offset:276 +; DAGISEL-NEXT: scratch_load_b32 v102, off, s32 offset:280 +; DAGISEL-NEXT: scratch_load_b32 v103, off, s32 offset:284 +; DAGISEL-NEXT: scratch_load_b32 v112, off, s32 offset:288 +; DAGISEL-NEXT: scratch_load_b32 v113, off, s32 offset:292 +; DAGISEL-NEXT: scratch_load_b32 v114, off, s32 offset:296 +; DAGISEL-NEXT: scratch_load_b32 v115, off, s32 offset:300 +; DAGISEL-NEXT: scratch_load_b32 v116, off, s32 offset:304 +; DAGISEL-NEXT: scratch_load_b32 v117, off, s32 offset:308 +; DAGISEL-NEXT: scratch_load_b32 v118, off, s32 offset:312 +; DAGISEL-NEXT: scratch_load_b32 v119, off, s32 offset:316 +; DAGISEL-NEXT: scratch_load_b32 v128, off, s32 offset:320 +; DAGISEL-NEXT: scratch_load_b32 v129, off, s32 offset:324 +; DAGISEL-NEXT: scratch_load_b32 v130, off, s32 offset:328 +; DAGISEL-NEXT: scratch_load_b32 v131, off, s32 offset:332 +; DAGISEL-NEXT: scratch_load_b32 v132, off, s32 offset:336 +; DAGISEL-NEXT: scratch_load_b32 v133, off, s32 offset:340 +; DAGISEL-NEXT: scratch_load_b32 v134, off, s32 offset:344 +; DAGISEL-NEXT: scratch_load_b32 v135, off, s32 offset:348 +; DAGISEL-NEXT: scratch_load_b32 v144, off, s32 offset:352 +; DAGISEL-NEXT: scratch_load_b32 v145, off, s32 offset:356 +; DAGISEL-NEXT: scratch_load_b32 v146, off, s32 offset:360 +; DAGISEL-NEXT: scratch_load_b32 v147, off, s32 offset:364 +; DAGISEL-NEXT: scratch_load_b32 v148, off, s32 offset:368 +; DAGISEL-NEXT: scratch_load_b32 v149, off, s32 offset:372 +; DAGISEL-NEXT: scratch_load_b32 v150, off, s32 offset:376 +; DAGISEL-NEXT: scratch_load_b32 v151, off, s32 offset:380 +; DAGISEL-NEXT: s_clause 0x1f +; DAGISEL-NEXT: scratch_load_b32 v160, off, s32 offset:384 +; DAGISEL-NEXT: scratch_load_b32 v161, off, s32 offset:388 +; DAGISEL-NEXT: scratch_load_b32 v162, off, s32 offset:392 +; DAGISEL-NEXT: scratch_load_b32 v163, off, s32 offset:396 +; DAGISEL-NEXT: scratch_load_b32 v164, off, s32 offset:400 +; DAGISEL-NEXT: scratch_load_b32 v165, off, s32 offset:404 +; DAGISEL-NEXT: scratch_load_b32 v166, off, s32 offset:408 +; DAGISEL-NEXT: scratch_load_b32 v167, off, s32 offset:412 +; DAGISEL-NEXT: scratch_load_b32 v176, off, s32 offset:416 +; DAGISEL-NEXT: scratch_load_b32 v177, off, s32 offset:420 +; DAGISEL-NEXT: scratch_load_b32 v178, off, s32 offset:424 +; DAGISEL-NEXT: scratch_load_b32 v179, off, s32 offset:428 +; DAGISEL-NEXT: scratch_load_b32 v180, off, s32 offset:432 +; DAGISEL-NEXT: scratch_load_b32 v181, off, s32 offset:436 +; DAGISEL-NEXT: scratch_load_b32 v182, off, s32 offset:440 +; DAGISEL-NEXT: scratch_load_b32 v183, off, s32 offset:444 +; DAGISEL-NEXT: scratch_load_b32 v192, off, s32 offset:448 +; DAGISEL-NEXT: scratch_load_b32 v193, off, s32 offset:452 +; DAGISEL-NEXT: scratch_load_b32 v194, off, s32 offset:456 +; DAGISEL-NEXT: scratch_load_b32 v195, off, s32 offset:460 +; DAGISEL-NEXT: scratch_load_b32 v196, off, s32 offset:464 +; DAGISEL-NEXT: scratch_load_b32 v197, off, s32 offset:468 +; DAGISEL-NEXT: scratch_load_b32 v198, off, s32 offset:472 +; DAGISEL-NEXT: scratch_load_b32 v199, off, s32 offset:476 +; DAGISEL-NEXT: scratch_load_b32 v208, off, s32 offset:480 +; DAGISEL-NEXT: scratch_load_b32 v209, off, s32 offset:484 +; DAGISEL-NEXT: scratch_load_b32 v210, off, s32 offset:488 +; DAGISEL-NEXT: scratch_load_b32 v211, off, s32 offset:492 +; DAGISEL-NEXT: scratch_load_b32 v212, off, s32 offset:496 +; DAGISEL-NEXT: scratch_load_b32 v213, off, s32 offset:500 +; DAGISEL-NEXT: scratch_load_b32 v214, off, s32 offset:504 +; DAGISEL-NEXT: 
scratch_load_b32 v215, off, s32 offset:508 +; DAGISEL-NEXT: s_clause 0xf +; DAGISEL-NEXT: scratch_load_b32 v224, off, s32 offset:512 +; DAGISEL-NEXT: scratch_load_b32 v225, off, s32 offset:516 +; DAGISEL-NEXT: scratch_load_b32 v226, off, s32 offset:520 +; DAGISEL-NEXT: scratch_load_b32 v227, off, s32 offset:524 +; DAGISEL-NEXT: scratch_load_b32 v228, off, s32 offset:528 +; DAGISEL-NEXT: scratch_load_b32 v229, off, s32 offset:532 +; DAGISEL-NEXT: scratch_load_b32 v230, off, s32 offset:536 +; DAGISEL-NEXT: scratch_load_b32 v231, off, s32 offset:540 +; DAGISEL-NEXT: scratch_load_b32 v240, off, s32 offset:544 +; DAGISEL-NEXT: scratch_load_b32 v241, off, s32 offset:548 +; DAGISEL-NEXT: scratch_load_b32 v242, off, s32 offset:552 +; DAGISEL-NEXT: scratch_load_b32 v243, off, s32 offset:556 +; DAGISEL-NEXT: scratch_load_b32 v244, off, s32 offset:560 +; DAGISEL-NEXT: scratch_load_b32 v245, off, s32 offset:564 +; DAGISEL-NEXT: scratch_load_b32 v246, off, s32 offset:568 +; DAGISEL-NEXT: scratch_load_b32 v247, off, s32 offset:572 +; DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; DAGISEL-NEXT: s_setpc_b64 s[36:37] +; +; GISEL-LABEL: tail_call_gfx_from_whole_wave: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-NEXT: s_wait_expcnt 0x0 +; GISEL-NEXT: s_wait_samplecnt 0x0 +; GISEL-NEXT: s_wait_bvhcnt 0x0 +; GISEL-NEXT: s_wait_kmcnt 0x0 +; GISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v0, s32 +; GISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; GISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; GISEL-NEXT: scratch_store_b32 off, v6, s32 offset:24 +; GISEL-NEXT: scratch_store_b32 off, v7, s32 offset:28 +; GISEL-NEXT: scratch_store_b32 off, v8, s32 offset:32 +; GISEL-NEXT: scratch_store_b32 off, v9, s32 offset:36 +; GISEL-NEXT: scratch_store_b32 off, v10, s32 offset:40 +; GISEL-NEXT: scratch_store_b32 off, v11, s32 offset:44 +; GISEL-NEXT: scratch_store_b32 off, v12, s32 offset:48 +; GISEL-NEXT: scratch_store_b32 off, v13, s32 offset:52 +; GISEL-NEXT: scratch_store_b32 off, v14, s32 offset:56 +; GISEL-NEXT: scratch_store_b32 off, v15, s32 offset:60 +; GISEL-NEXT: scratch_store_b32 off, v16, s32 offset:64 +; GISEL-NEXT: scratch_store_b32 off, v17, s32 offset:68 +; GISEL-NEXT: scratch_store_b32 off, v18, s32 offset:72 +; GISEL-NEXT: scratch_store_b32 off, v19, s32 offset:76 +; GISEL-NEXT: scratch_store_b32 off, v20, s32 offset:80 +; GISEL-NEXT: scratch_store_b32 off, v21, s32 offset:84 +; GISEL-NEXT: scratch_store_b32 off, v22, s32 offset:88 +; GISEL-NEXT: scratch_store_b32 off, v23, s32 offset:92 +; GISEL-NEXT: scratch_store_b32 off, v24, s32 offset:96 +; GISEL-NEXT: scratch_store_b32 off, v25, s32 offset:100 +; GISEL-NEXT: scratch_store_b32 off, v26, s32 offset:104 +; GISEL-NEXT: scratch_store_b32 off, v27, s32 offset:108 +; GISEL-NEXT: scratch_store_b32 off, v28, s32 offset:112 +; GISEL-NEXT: scratch_store_b32 off, v29, s32 offset:116 +; GISEL-NEXT: scratch_store_b32 off, v30, s32 offset:120 +; GISEL-NEXT: scratch_store_b32 off, v31, s32 offset:124 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v32, s32 offset:128 +; GISEL-NEXT: scratch_store_b32 off, v33, s32 offset:132 +; GISEL-NEXT: scratch_store_b32 off, v34, s32 offset:136 +; GISEL-NEXT: scratch_store_b32 off, v35, s32 offset:140 +; GISEL-NEXT: scratch_store_b32 off, v36, s32 offset:144 +; 
GISEL-NEXT: scratch_store_b32 off, v37, s32 offset:148 +; GISEL-NEXT: scratch_store_b32 off, v38, s32 offset:152 +; GISEL-NEXT: scratch_store_b32 off, v39, s32 offset:156 +; GISEL-NEXT: scratch_store_b32 off, v48, s32 offset:160 +; GISEL-NEXT: scratch_store_b32 off, v49, s32 offset:164 +; GISEL-NEXT: scratch_store_b32 off, v50, s32 offset:168 +; GISEL-NEXT: scratch_store_b32 off, v51, s32 offset:172 +; GISEL-NEXT: scratch_store_b32 off, v52, s32 offset:176 +; GISEL-NEXT: scratch_store_b32 off, v53, s32 offset:180 +; GISEL-NEXT: scratch_store_b32 off, v54, s32 offset:184 +; GISEL-NEXT: scratch_store_b32 off, v55, s32 offset:188 +; GISEL-NEXT: scratch_store_b32 off, v64, s32 offset:192 +; GISEL-NEXT: scratch_store_b32 off, v65, s32 offset:196 +; GISEL-NEXT: scratch_store_b32 off, v66, s32 offset:200 +; GISEL-NEXT: scratch_store_b32 off, v67, s32 offset:204 +; GISEL-NEXT: scratch_store_b32 off, v68, s32 offset:208 +; GISEL-NEXT: scratch_store_b32 off, v69, s32 offset:212 +; GISEL-NEXT: scratch_store_b32 off, v70, s32 offset:216 +; GISEL-NEXT: scratch_store_b32 off, v71, s32 offset:220 +; GISEL-NEXT: scratch_store_b32 off, v80, s32 offset:224 +; GISEL-NEXT: scratch_store_b32 off, v81, s32 offset:228 +; GISEL-NEXT: scratch_store_b32 off, v82, s32 offset:232 +; GISEL-NEXT: scratch_store_b32 off, v83, s32 offset:236 +; GISEL-NEXT: scratch_store_b32 off, v84, s32 offset:240 +; GISEL-NEXT: scratch_store_b32 off, v85, s32 offset:244 +; GISEL-NEXT: scratch_store_b32 off, v86, s32 offset:248 +; GISEL-NEXT: scratch_store_b32 off, v87, s32 offset:252 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v96, s32 offset:256 +; GISEL-NEXT: scratch_store_b32 off, v97, s32 offset:260 +; GISEL-NEXT: scratch_store_b32 off, v98, s32 offset:264 +; GISEL-NEXT: scratch_store_b32 off, v99, s32 offset:268 +; GISEL-NEXT: scratch_store_b32 off, v100, s32 offset:272 +; GISEL-NEXT: scratch_store_b32 off, v101, s32 offset:276 +; GISEL-NEXT: scratch_store_b32 off, v102, s32 offset:280 +; GISEL-NEXT: scratch_store_b32 off, v103, s32 offset:284 +; GISEL-NEXT: scratch_store_b32 off, v112, s32 offset:288 +; GISEL-NEXT: scratch_store_b32 off, v113, s32 offset:292 +; GISEL-NEXT: scratch_store_b32 off, v114, s32 offset:296 +; GISEL-NEXT: scratch_store_b32 off, v115, s32 offset:300 +; GISEL-NEXT: scratch_store_b32 off, v116, s32 offset:304 +; GISEL-NEXT: scratch_store_b32 off, v117, s32 offset:308 +; GISEL-NEXT: scratch_store_b32 off, v118, s32 offset:312 +; GISEL-NEXT: scratch_store_b32 off, v119, s32 offset:316 +; GISEL-NEXT: scratch_store_b32 off, v128, s32 offset:320 +; GISEL-NEXT: scratch_store_b32 off, v129, s32 offset:324 +; GISEL-NEXT: scratch_store_b32 off, v130, s32 offset:328 +; GISEL-NEXT: scratch_store_b32 off, v131, s32 offset:332 +; GISEL-NEXT: scratch_store_b32 off, v132, s32 offset:336 +; GISEL-NEXT: scratch_store_b32 off, v133, s32 offset:340 +; GISEL-NEXT: scratch_store_b32 off, v134, s32 offset:344 +; GISEL-NEXT: scratch_store_b32 off, v135, s32 offset:348 +; GISEL-NEXT: scratch_store_b32 off, v144, s32 offset:352 +; GISEL-NEXT: scratch_store_b32 off, v145, s32 offset:356 +; GISEL-NEXT: scratch_store_b32 off, v146, s32 offset:360 +; GISEL-NEXT: scratch_store_b32 off, v147, s32 offset:364 +; GISEL-NEXT: scratch_store_b32 off, v148, s32 offset:368 +; GISEL-NEXT: scratch_store_b32 off, v149, s32 offset:372 +; GISEL-NEXT: scratch_store_b32 off, v150, s32 offset:376 +; GISEL-NEXT: scratch_store_b32 off, v151, s32 offset:380 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_store_b32 off, v160, s32 
offset:384 +; GISEL-NEXT: scratch_store_b32 off, v161, s32 offset:388 +; GISEL-NEXT: scratch_store_b32 off, v162, s32 offset:392 +; GISEL-NEXT: scratch_store_b32 off, v163, s32 offset:396 +; GISEL-NEXT: scratch_store_b32 off, v164, s32 offset:400 +; GISEL-NEXT: scratch_store_b32 off, v165, s32 offset:404 +; GISEL-NEXT: scratch_store_b32 off, v166, s32 offset:408 +; GISEL-NEXT: scratch_store_b32 off, v167, s32 offset:412 +; GISEL-NEXT: scratch_store_b32 off, v176, s32 offset:416 +; GISEL-NEXT: scratch_store_b32 off, v177, s32 offset:420 +; GISEL-NEXT: scratch_store_b32 off, v178, s32 offset:424 +; GISEL-NEXT: scratch_store_b32 off, v179, s32 offset:428 +; GISEL-NEXT: scratch_store_b32 off, v180, s32 offset:432 +; GISEL-NEXT: scratch_store_b32 off, v181, s32 offset:436 +; GISEL-NEXT: scratch_store_b32 off, v182, s32 offset:440 +; GISEL-NEXT: scratch_store_b32 off, v183, s32 offset:444 +; GISEL-NEXT: scratch_store_b32 off, v192, s32 offset:448 +; GISEL-NEXT: scratch_store_b32 off, v193, s32 offset:452 +; GISEL-NEXT: scratch_store_b32 off, v194, s32 offset:456 +; GISEL-NEXT: scratch_store_b32 off, v195, s32 offset:460 +; GISEL-NEXT: scratch_store_b32 off, v196, s32 offset:464 +; GISEL-NEXT: scratch_store_b32 off, v197, s32 offset:468 +; GISEL-NEXT: scratch_store_b32 off, v198, s32 offset:472 +; GISEL-NEXT: scratch_store_b32 off, v199, s32 offset:476 +; GISEL-NEXT: scratch_store_b32 off, v208, s32 offset:480 +; GISEL-NEXT: scratch_store_b32 off, v209, s32 offset:484 +; GISEL-NEXT: scratch_store_b32 off, v210, s32 offset:488 +; GISEL-NEXT: scratch_store_b32 off, v211, s32 offset:492 +; GISEL-NEXT: scratch_store_b32 off, v212, s32 offset:496 +; GISEL-NEXT: scratch_store_b32 off, v213, s32 offset:500 +; GISEL-NEXT: scratch_store_b32 off, v214, s32 offset:504 +; GISEL-NEXT: scratch_store_b32 off, v215, s32 offset:508 +; GISEL-NEXT: s_clause 0xf +; GISEL-NEXT: scratch_store_b32 off, v224, s32 offset:512 +; GISEL-NEXT: scratch_store_b32 off, v225, s32 offset:516 +; GISEL-NEXT: scratch_store_b32 off, v226, s32 offset:520 +; GISEL-NEXT: scratch_store_b32 off, v227, s32 offset:524 +; GISEL-NEXT: scratch_store_b32 off, v228, s32 offset:528 +; GISEL-NEXT: scratch_store_b32 off, v229, s32 offset:532 +; GISEL-NEXT: scratch_store_b32 off, v230, s32 offset:536 +; GISEL-NEXT: scratch_store_b32 off, v231, s32 offset:540 +; GISEL-NEXT: scratch_store_b32 off, v240, s32 offset:544 +; GISEL-NEXT: scratch_store_b32 off, v241, s32 offset:548 +; GISEL-NEXT: scratch_store_b32 off, v242, s32 offset:552 +; GISEL-NEXT: scratch_store_b32 off, v243, s32 offset:556 +; GISEL-NEXT: scratch_store_b32 off, v244, s32 offset:560 +; GISEL-NEXT: scratch_store_b32 off, v245, s32 offset:564 +; GISEL-NEXT: scratch_store_b32 off, v246, s32 offset:568 +; GISEL-NEXT: scratch_store_b32 off, v247, s32 offset:572 +; GISEL-NEXT: s_mov_b32 exec_lo, -1 +; GISEL-NEXT: v_mov_b32_e32 v2, v0 +; GISEL-NEXT: v_swap_b32 v0, v1 +; GISEL-NEXT: s_mov_b32 s36, gfx_callee@abs32@lo +; GISEL-NEXT: s_mov_b32 s37, gfx_callee@abs32@hi +; GISEL-NEXT: s_wait_alu 0xfffe +; GISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_load_b32 v0, off, s32 +; GISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; GISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; GISEL-NEXT: scratch_load_b32 v6, off, s32 offset:24 +; GISEL-NEXT: scratch_load_b32 v7, off, s32 offset:28 +; 
GISEL-NEXT: scratch_load_b32 v8, off, s32 offset:32 +; GISEL-NEXT: scratch_load_b32 v9, off, s32 offset:36 +; GISEL-NEXT: scratch_load_b32 v10, off, s32 offset:40 +; GISEL-NEXT: scratch_load_b32 v11, off, s32 offset:44 +; GISEL-NEXT: scratch_load_b32 v12, off, s32 offset:48 +; GISEL-NEXT: scratch_load_b32 v13, off, s32 offset:52 +; GISEL-NEXT: scratch_load_b32 v14, off, s32 offset:56 +; GISEL-NEXT: scratch_load_b32 v15, off, s32 offset:60 +; GISEL-NEXT: scratch_load_b32 v16, off, s32 offset:64 +; GISEL-NEXT: scratch_load_b32 v17, off, s32 offset:68 +; GISEL-NEXT: scratch_load_b32 v18, off, s32 offset:72 +; GISEL-NEXT: scratch_load_b32 v19, off, s32 offset:76 +; GISEL-NEXT: scratch_load_b32 v20, off, s32 offset:80 +; GISEL-NEXT: scratch_load_b32 v21, off, s32 offset:84 +; GISEL-NEXT: scratch_load_b32 v22, off, s32 offset:88 +; GISEL-NEXT: scratch_load_b32 v23, off, s32 offset:92 +; GISEL-NEXT: scratch_load_b32 v24, off, s32 offset:96 +; GISEL-NEXT: scratch_load_b32 v25, off, s32 offset:100 +; GISEL-NEXT: scratch_load_b32 v26, off, s32 offset:104 +; GISEL-NEXT: scratch_load_b32 v27, off, s32 offset:108 +; GISEL-NEXT: scratch_load_b32 v28, off, s32 offset:112 +; GISEL-NEXT: scratch_load_b32 v29, off, s32 offset:116 +; GISEL-NEXT: scratch_load_b32 v30, off, s32 offset:120 +; GISEL-NEXT: scratch_load_b32 v31, off, s32 offset:124 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_load_b32 v32, off, s32 offset:128 +; GISEL-NEXT: scratch_load_b32 v33, off, s32 offset:132 +; GISEL-NEXT: scratch_load_b32 v34, off, s32 offset:136 +; GISEL-NEXT: scratch_load_b32 v35, off, s32 offset:140 +; GISEL-NEXT: scratch_load_b32 v36, off, s32 offset:144 +; GISEL-NEXT: scratch_load_b32 v37, off, s32 offset:148 +; GISEL-NEXT: scratch_load_b32 v38, off, s32 offset:152 +; GISEL-NEXT: scratch_load_b32 v39, off, s32 offset:156 +; GISEL-NEXT: scratch_load_b32 v48, off, s32 offset:160 +; GISEL-NEXT: scratch_load_b32 v49, off, s32 offset:164 +; GISEL-NEXT: scratch_load_b32 v50, off, s32 offset:168 +; GISEL-NEXT: scratch_load_b32 v51, off, s32 offset:172 +; GISEL-NEXT: scratch_load_b32 v52, off, s32 offset:176 +; GISEL-NEXT: scratch_load_b32 v53, off, s32 offset:180 +; GISEL-NEXT: scratch_load_b32 v54, off, s32 offset:184 +; GISEL-NEXT: scratch_load_b32 v55, off, s32 offset:188 +; GISEL-NEXT: scratch_load_b32 v64, off, s32 offset:192 +; GISEL-NEXT: scratch_load_b32 v65, off, s32 offset:196 +; GISEL-NEXT: scratch_load_b32 v66, off, s32 offset:200 +; GISEL-NEXT: scratch_load_b32 v67, off, s32 offset:204 +; GISEL-NEXT: scratch_load_b32 v68, off, s32 offset:208 +; GISEL-NEXT: scratch_load_b32 v69, off, s32 offset:212 +; GISEL-NEXT: scratch_load_b32 v70, off, s32 offset:216 +; GISEL-NEXT: scratch_load_b32 v71, off, s32 offset:220 +; GISEL-NEXT: scratch_load_b32 v80, off, s32 offset:224 +; GISEL-NEXT: scratch_load_b32 v81, off, s32 offset:228 +; GISEL-NEXT: scratch_load_b32 v82, off, s32 offset:232 +; GISEL-NEXT: scratch_load_b32 v83, off, s32 offset:236 +; GISEL-NEXT: scratch_load_b32 v84, off, s32 offset:240 +; GISEL-NEXT: scratch_load_b32 v85, off, s32 offset:244 +; GISEL-NEXT: scratch_load_b32 v86, off, s32 offset:248 +; GISEL-NEXT: scratch_load_b32 v87, off, s32 offset:252 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_load_b32 v96, off, s32 offset:256 +; GISEL-NEXT: scratch_load_b32 v97, off, s32 offset:260 +; GISEL-NEXT: scratch_load_b32 v98, off, s32 offset:264 +; GISEL-NEXT: scratch_load_b32 v99, off, s32 offset:268 +; GISEL-NEXT: scratch_load_b32 v100, off, s32 offset:272 +; GISEL-NEXT: scratch_load_b32 v101, 
off, s32 offset:276 +; GISEL-NEXT: scratch_load_b32 v102, off, s32 offset:280 +; GISEL-NEXT: scratch_load_b32 v103, off, s32 offset:284 +; GISEL-NEXT: scratch_load_b32 v112, off, s32 offset:288 +; GISEL-NEXT: scratch_load_b32 v113, off, s32 offset:292 +; GISEL-NEXT: scratch_load_b32 v114, off, s32 offset:296 +; GISEL-NEXT: scratch_load_b32 v115, off, s32 offset:300 +; GISEL-NEXT: scratch_load_b32 v116, off, s32 offset:304 +; GISEL-NEXT: scratch_load_b32 v117, off, s32 offset:308 +; GISEL-NEXT: scratch_load_b32 v118, off, s32 offset:312 +; GISEL-NEXT: scratch_load_b32 v119, off, s32 offset:316 +; GISEL-NEXT: scratch_load_b32 v128, off, s32 offset:320 +; GISEL-NEXT: scratch_load_b32 v129, off, s32 offset:324 +; GISEL-NEXT: scratch_load_b32 v130, off, s32 offset:328 +; GISEL-NEXT: scratch_load_b32 v131, off, s32 offset:332 +; GISEL-NEXT: scratch_load_b32 v132, off, s32 offset:336 +; GISEL-NEXT: scratch_load_b32 v133, off, s32 offset:340 +; GISEL-NEXT: scratch_load_b32 v134, off, s32 offset:344 +; GISEL-NEXT: scratch_load_b32 v135, off, s32 offset:348 +; GISEL-NEXT: scratch_load_b32 v144, off, s32 offset:352 +; GISEL-NEXT: scratch_load_b32 v145, off, s32 offset:356 +; GISEL-NEXT: scratch_load_b32 v146, off, s32 offset:360 +; GISEL-NEXT: scratch_load_b32 v147, off, s32 offset:364 +; GISEL-NEXT: scratch_load_b32 v148, off, s32 offset:368 +; GISEL-NEXT: scratch_load_b32 v149, off, s32 offset:372 +; GISEL-NEXT: scratch_load_b32 v150, off, s32 offset:376 +; GISEL-NEXT: scratch_load_b32 v151, off, s32 offset:380 +; GISEL-NEXT: s_clause 0x1f +; GISEL-NEXT: scratch_load_b32 v160, off, s32 offset:384 +; GISEL-NEXT: scratch_load_b32 v161, off, s32 offset:388 +; GISEL-NEXT: scratch_load_b32 v162, off, s32 offset:392 +; GISEL-NEXT: scratch_load_b32 v163, off, s32 offset:396 +; GISEL-NEXT: scratch_load_b32 v164, off, s32 offset:400 +; GISEL-NEXT: scratch_load_b32 v165, off, s32 offset:404 +; GISEL-NEXT: scratch_load_b32 v166, off, s32 offset:408 +; GISEL-NEXT: scratch_load_b32 v167, off, s32 offset:412 +; GISEL-NEXT: scratch_load_b32 v176, off, s32 offset:416 +; GISEL-NEXT: scratch_load_b32 v177, off, s32 offset:420 +; GISEL-NEXT: scratch_load_b32 v178, off, s32 offset:424 +; GISEL-NEXT: scratch_load_b32 v179, off, s32 offset:428 +; GISEL-NEXT: scratch_load_b32 v180, off, s32 offset:432 +; GISEL-NEXT: scratch_load_b32 v181, off, s32 offset:436 +; GISEL-NEXT: scratch_load_b32 v182, off, s32 offset:440 +; GISEL-NEXT: scratch_load_b32 v183, off, s32 offset:444 +; GISEL-NEXT: scratch_load_b32 v192, off, s32 offset:448 +; GISEL-NEXT: scratch_load_b32 v193, off, s32 offset:452 +; GISEL-NEXT: scratch_load_b32 v194, off, s32 offset:456 +; GISEL-NEXT: scratch_load_b32 v195, off, s32 offset:460 +; GISEL-NEXT: scratch_load_b32 v196, off, s32 offset:464 +; GISEL-NEXT: scratch_load_b32 v197, off, s32 offset:468 +; GISEL-NEXT: scratch_load_b32 v198, off, s32 offset:472 +; GISEL-NEXT: scratch_load_b32 v199, off, s32 offset:476 +; GISEL-NEXT: scratch_load_b32 v208, off, s32 offset:480 +; GISEL-NEXT: scratch_load_b32 v209, off, s32 offset:484 +; GISEL-NEXT: scratch_load_b32 v210, off, s32 offset:488 +; GISEL-NEXT: scratch_load_b32 v211, off, s32 offset:492 +; GISEL-NEXT: scratch_load_b32 v212, off, s32 offset:496 +; GISEL-NEXT: scratch_load_b32 v213, off, s32 offset:500 +; GISEL-NEXT: scratch_load_b32 v214, off, s32 offset:504 +; GISEL-NEXT: scratch_load_b32 v215, off, s32 offset:508 +; GISEL-NEXT: s_clause 0xf +; GISEL-NEXT: scratch_load_b32 v224, off, s32 offset:512 +; GISEL-NEXT: scratch_load_b32 v225, off, s32 
offset:516 +; GISEL-NEXT: scratch_load_b32 v226, off, s32 offset:520 +; GISEL-NEXT: scratch_load_b32 v227, off, s32 offset:524 +; GISEL-NEXT: scratch_load_b32 v228, off, s32 offset:528 +; GISEL-NEXT: scratch_load_b32 v229, off, s32 offset:532 +; GISEL-NEXT: scratch_load_b32 v230, off, s32 offset:536 +; GISEL-NEXT: scratch_load_b32 v231, off, s32 offset:540 +; GISEL-NEXT: scratch_load_b32 v240, off, s32 offset:544 +; GISEL-NEXT: scratch_load_b32 v241, off, s32 offset:548 +; GISEL-NEXT: scratch_load_b32 v242, off, s32 offset:552 +; GISEL-NEXT: scratch_load_b32 v243, off, s32 offset:556 +; GISEL-NEXT: scratch_load_b32 v244, off, s32 offset:560 +; GISEL-NEXT: scratch_load_b32 v245, off, s32 offset:564 +; GISEL-NEXT: scratch_load_b32 v246, off, s32 offset:568 +; GISEL-NEXT: scratch_load_b32 v247, off, s32 offset:572 +; GISEL-NEXT: s_mov_b32 exec_lo, s0 +; GISEL-NEXT: s_setpc_b64 s[36:37] +; +; DAGISEL64-LABEL: tail_call_gfx_from_whole_wave: +; DAGISEL64: ; %bb.0: +; DAGISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; DAGISEL64-NEXT: s_wait_expcnt 0x0 +; DAGISEL64-NEXT: s_wait_samplecnt 0x0 +; DAGISEL64-NEXT: s_wait_bvhcnt 0x0 +; DAGISEL64-NEXT: s_wait_kmcnt 0x0 +; DAGISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v0, s32 +; DAGISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; DAGISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; DAGISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; DAGISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; DAGISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; DAGISEL64-NEXT: scratch_store_b32 off, v6, s32 offset:24 +; DAGISEL64-NEXT: scratch_store_b32 off, v7, s32 offset:28 +; DAGISEL64-NEXT: scratch_store_b32 off, v8, s32 offset:32 +; DAGISEL64-NEXT: scratch_store_b32 off, v9, s32 offset:36 +; DAGISEL64-NEXT: scratch_store_b32 off, v10, s32 offset:40 +; DAGISEL64-NEXT: scratch_store_b32 off, v11, s32 offset:44 +; DAGISEL64-NEXT: scratch_store_b32 off, v12, s32 offset:48 +; DAGISEL64-NEXT: scratch_store_b32 off, v13, s32 offset:52 +; DAGISEL64-NEXT: scratch_store_b32 off, v14, s32 offset:56 +; DAGISEL64-NEXT: scratch_store_b32 off, v15, s32 offset:60 +; DAGISEL64-NEXT: scratch_store_b32 off, v16, s32 offset:64 +; DAGISEL64-NEXT: scratch_store_b32 off, v17, s32 offset:68 +; DAGISEL64-NEXT: scratch_store_b32 off, v18, s32 offset:72 +; DAGISEL64-NEXT: scratch_store_b32 off, v19, s32 offset:76 +; DAGISEL64-NEXT: scratch_store_b32 off, v20, s32 offset:80 +; DAGISEL64-NEXT: scratch_store_b32 off, v21, s32 offset:84 +; DAGISEL64-NEXT: scratch_store_b32 off, v22, s32 offset:88 +; DAGISEL64-NEXT: scratch_store_b32 off, v23, s32 offset:92 +; DAGISEL64-NEXT: scratch_store_b32 off, v24, s32 offset:96 +; DAGISEL64-NEXT: scratch_store_b32 off, v25, s32 offset:100 +; DAGISEL64-NEXT: scratch_store_b32 off, v26, s32 offset:104 +; DAGISEL64-NEXT: scratch_store_b32 off, v27, s32 offset:108 +; DAGISEL64-NEXT: scratch_store_b32 off, v28, s32 offset:112 +; DAGISEL64-NEXT: scratch_store_b32 off, v29, s32 offset:116 +; DAGISEL64-NEXT: scratch_store_b32 off, v30, s32 offset:120 +; DAGISEL64-NEXT: scratch_store_b32 off, v31, s32 offset:124 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v32, s32 offset:128 +; DAGISEL64-NEXT: scratch_store_b32 off, v33, s32 offset:132 +; DAGISEL64-NEXT: scratch_store_b32 off, v34, s32 offset:136 +; DAGISEL64-NEXT: scratch_store_b32 off, v35, s32 offset:140 +; DAGISEL64-NEXT: scratch_store_b32 off, v36, s32 offset:144 +; DAGISEL64-NEXT: 
scratch_store_b32 off, v37, s32 offset:148 +; DAGISEL64-NEXT: scratch_store_b32 off, v38, s32 offset:152 +; DAGISEL64-NEXT: scratch_store_b32 off, v39, s32 offset:156 +; DAGISEL64-NEXT: scratch_store_b32 off, v48, s32 offset:160 +; DAGISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:164 +; DAGISEL64-NEXT: scratch_store_b32 off, v50, s32 offset:168 +; DAGISEL64-NEXT: scratch_store_b32 off, v51, s32 offset:172 +; DAGISEL64-NEXT: scratch_store_b32 off, v52, s32 offset:176 +; DAGISEL64-NEXT: scratch_store_b32 off, v53, s32 offset:180 +; DAGISEL64-NEXT: scratch_store_b32 off, v54, s32 offset:184 +; DAGISEL64-NEXT: scratch_store_b32 off, v55, s32 offset:188 +; DAGISEL64-NEXT: scratch_store_b32 off, v64, s32 offset:192 +; DAGISEL64-NEXT: scratch_store_b32 off, v65, s32 offset:196 +; DAGISEL64-NEXT: scratch_store_b32 off, v66, s32 offset:200 +; DAGISEL64-NEXT: scratch_store_b32 off, v67, s32 offset:204 +; DAGISEL64-NEXT: scratch_store_b32 off, v68, s32 offset:208 +; DAGISEL64-NEXT: scratch_store_b32 off, v69, s32 offset:212 +; DAGISEL64-NEXT: scratch_store_b32 off, v70, s32 offset:216 +; DAGISEL64-NEXT: scratch_store_b32 off, v71, s32 offset:220 +; DAGISEL64-NEXT: scratch_store_b32 off, v80, s32 offset:224 +; DAGISEL64-NEXT: scratch_store_b32 off, v81, s32 offset:228 +; DAGISEL64-NEXT: scratch_store_b32 off, v82, s32 offset:232 +; DAGISEL64-NEXT: scratch_store_b32 off, v83, s32 offset:236 +; DAGISEL64-NEXT: scratch_store_b32 off, v84, s32 offset:240 +; DAGISEL64-NEXT: scratch_store_b32 off, v85, s32 offset:244 +; DAGISEL64-NEXT: scratch_store_b32 off, v86, s32 offset:248 +; DAGISEL64-NEXT: scratch_store_b32 off, v87, s32 offset:252 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v96, s32 offset:256 +; DAGISEL64-NEXT: scratch_store_b32 off, v97, s32 offset:260 +; DAGISEL64-NEXT: scratch_store_b32 off, v98, s32 offset:264 +; DAGISEL64-NEXT: scratch_store_b32 off, v99, s32 offset:268 +; DAGISEL64-NEXT: scratch_store_b32 off, v100, s32 offset:272 +; DAGISEL64-NEXT: scratch_store_b32 off, v101, s32 offset:276 +; DAGISEL64-NEXT: scratch_store_b32 off, v102, s32 offset:280 +; DAGISEL64-NEXT: scratch_store_b32 off, v103, s32 offset:284 +; DAGISEL64-NEXT: scratch_store_b32 off, v112, s32 offset:288 +; DAGISEL64-NEXT: scratch_store_b32 off, v113, s32 offset:292 +; DAGISEL64-NEXT: scratch_store_b32 off, v114, s32 offset:296 +; DAGISEL64-NEXT: scratch_store_b32 off, v115, s32 offset:300 +; DAGISEL64-NEXT: scratch_store_b32 off, v116, s32 offset:304 +; DAGISEL64-NEXT: scratch_store_b32 off, v117, s32 offset:308 +; DAGISEL64-NEXT: scratch_store_b32 off, v118, s32 offset:312 +; DAGISEL64-NEXT: scratch_store_b32 off, v119, s32 offset:316 +; DAGISEL64-NEXT: scratch_store_b32 off, v128, s32 offset:320 +; DAGISEL64-NEXT: scratch_store_b32 off, v129, s32 offset:324 +; DAGISEL64-NEXT: scratch_store_b32 off, v130, s32 offset:328 +; DAGISEL64-NEXT: scratch_store_b32 off, v131, s32 offset:332 +; DAGISEL64-NEXT: scratch_store_b32 off, v132, s32 offset:336 +; DAGISEL64-NEXT: scratch_store_b32 off, v133, s32 offset:340 +; DAGISEL64-NEXT: scratch_store_b32 off, v134, s32 offset:344 +; DAGISEL64-NEXT: scratch_store_b32 off, v135, s32 offset:348 +; DAGISEL64-NEXT: scratch_store_b32 off, v144, s32 offset:352 +; DAGISEL64-NEXT: scratch_store_b32 off, v145, s32 offset:356 +; DAGISEL64-NEXT: scratch_store_b32 off, v146, s32 offset:360 +; DAGISEL64-NEXT: scratch_store_b32 off, v147, s32 offset:364 +; DAGISEL64-NEXT: scratch_store_b32 off, v148, s32 offset:368 +; DAGISEL64-NEXT: scratch_store_b32 off, 
v149, s32 offset:372 +; DAGISEL64-NEXT: scratch_store_b32 off, v150, s32 offset:376 +; DAGISEL64-NEXT: scratch_store_b32 off, v151, s32 offset:380 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_store_b32 off, v160, s32 offset:384 +; DAGISEL64-NEXT: scratch_store_b32 off, v161, s32 offset:388 +; DAGISEL64-NEXT: scratch_store_b32 off, v162, s32 offset:392 +; DAGISEL64-NEXT: scratch_store_b32 off, v163, s32 offset:396 +; DAGISEL64-NEXT: scratch_store_b32 off, v164, s32 offset:400 +; DAGISEL64-NEXT: scratch_store_b32 off, v165, s32 offset:404 +; DAGISEL64-NEXT: scratch_store_b32 off, v166, s32 offset:408 +; DAGISEL64-NEXT: scratch_store_b32 off, v167, s32 offset:412 +; DAGISEL64-NEXT: scratch_store_b32 off, v176, s32 offset:416 +; DAGISEL64-NEXT: scratch_store_b32 off, v177, s32 offset:420 +; DAGISEL64-NEXT: scratch_store_b32 off, v178, s32 offset:424 +; DAGISEL64-NEXT: scratch_store_b32 off, v179, s32 offset:428 +; DAGISEL64-NEXT: scratch_store_b32 off, v180, s32 offset:432 +; DAGISEL64-NEXT: scratch_store_b32 off, v181, s32 offset:436 +; DAGISEL64-NEXT: scratch_store_b32 off, v182, s32 offset:440 +; DAGISEL64-NEXT: scratch_store_b32 off, v183, s32 offset:444 +; DAGISEL64-NEXT: scratch_store_b32 off, v192, s32 offset:448 +; DAGISEL64-NEXT: scratch_store_b32 off, v193, s32 offset:452 +; DAGISEL64-NEXT: scratch_store_b32 off, v194, s32 offset:456 +; DAGISEL64-NEXT: scratch_store_b32 off, v195, s32 offset:460 +; DAGISEL64-NEXT: scratch_store_b32 off, v196, s32 offset:464 +; DAGISEL64-NEXT: scratch_store_b32 off, v197, s32 offset:468 +; DAGISEL64-NEXT: scratch_store_b32 off, v198, s32 offset:472 +; DAGISEL64-NEXT: scratch_store_b32 off, v199, s32 offset:476 +; DAGISEL64-NEXT: scratch_store_b32 off, v208, s32 offset:480 +; DAGISEL64-NEXT: scratch_store_b32 off, v209, s32 offset:484 +; DAGISEL64-NEXT: scratch_store_b32 off, v210, s32 offset:488 +; DAGISEL64-NEXT: scratch_store_b32 off, v211, s32 offset:492 +; DAGISEL64-NEXT: scratch_store_b32 off, v212, s32 offset:496 +; DAGISEL64-NEXT: scratch_store_b32 off, v213, s32 offset:500 +; DAGISEL64-NEXT: scratch_store_b32 off, v214, s32 offset:504 +; DAGISEL64-NEXT: scratch_store_b32 off, v215, s32 offset:508 +; DAGISEL64-NEXT: s_clause 0xf +; DAGISEL64-NEXT: scratch_store_b32 off, v224, s32 offset:512 +; DAGISEL64-NEXT: scratch_store_b32 off, v225, s32 offset:516 +; DAGISEL64-NEXT: scratch_store_b32 off, v226, s32 offset:520 +; DAGISEL64-NEXT: scratch_store_b32 off, v227, s32 offset:524 +; DAGISEL64-NEXT: scratch_store_b32 off, v228, s32 offset:528 +; DAGISEL64-NEXT: scratch_store_b32 off, v229, s32 offset:532 +; DAGISEL64-NEXT: scratch_store_b32 off, v230, s32 offset:536 +; DAGISEL64-NEXT: scratch_store_b32 off, v231, s32 offset:540 +; DAGISEL64-NEXT: scratch_store_b32 off, v240, s32 offset:544 +; DAGISEL64-NEXT: scratch_store_b32 off, v241, s32 offset:548 +; DAGISEL64-NEXT: scratch_store_b32 off, v242, s32 offset:552 +; DAGISEL64-NEXT: scratch_store_b32 off, v243, s32 offset:556 +; DAGISEL64-NEXT: scratch_store_b32 off, v244, s32 offset:560 +; DAGISEL64-NEXT: scratch_store_b32 off, v245, s32 offset:564 +; DAGISEL64-NEXT: scratch_store_b32 off, v246, s32 offset:568 +; DAGISEL64-NEXT: scratch_store_b32 off, v247, s32 offset:572 +; DAGISEL64-NEXT: s_mov_b64 exec, -1 +; DAGISEL64-NEXT: v_mov_b32_e32 v2, v0 +; DAGISEL64-NEXT: s_mov_b32 s37, gfx_callee@abs32@hi +; DAGISEL64-NEXT: s_mov_b32 s36, gfx_callee@abs32@lo +; DAGISEL64-NEXT: v_swap_b32 v0, v1 +; DAGISEL64-NEXT: s_wait_alu 0xfffe +; DAGISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; 
DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v0, off, s32 +; DAGISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; DAGISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; DAGISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; DAGISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; DAGISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; DAGISEL64-NEXT: scratch_load_b32 v6, off, s32 offset:24 +; DAGISEL64-NEXT: scratch_load_b32 v7, off, s32 offset:28 +; DAGISEL64-NEXT: scratch_load_b32 v8, off, s32 offset:32 +; DAGISEL64-NEXT: scratch_load_b32 v9, off, s32 offset:36 +; DAGISEL64-NEXT: scratch_load_b32 v10, off, s32 offset:40 +; DAGISEL64-NEXT: scratch_load_b32 v11, off, s32 offset:44 +; DAGISEL64-NEXT: scratch_load_b32 v12, off, s32 offset:48 +; DAGISEL64-NEXT: scratch_load_b32 v13, off, s32 offset:52 +; DAGISEL64-NEXT: scratch_load_b32 v14, off, s32 offset:56 +; DAGISEL64-NEXT: scratch_load_b32 v15, off, s32 offset:60 +; DAGISEL64-NEXT: scratch_load_b32 v16, off, s32 offset:64 +; DAGISEL64-NEXT: scratch_load_b32 v17, off, s32 offset:68 +; DAGISEL64-NEXT: scratch_load_b32 v18, off, s32 offset:72 +; DAGISEL64-NEXT: scratch_load_b32 v19, off, s32 offset:76 +; DAGISEL64-NEXT: scratch_load_b32 v20, off, s32 offset:80 +; DAGISEL64-NEXT: scratch_load_b32 v21, off, s32 offset:84 +; DAGISEL64-NEXT: scratch_load_b32 v22, off, s32 offset:88 +; DAGISEL64-NEXT: scratch_load_b32 v23, off, s32 offset:92 +; DAGISEL64-NEXT: scratch_load_b32 v24, off, s32 offset:96 +; DAGISEL64-NEXT: scratch_load_b32 v25, off, s32 offset:100 +; DAGISEL64-NEXT: scratch_load_b32 v26, off, s32 offset:104 +; DAGISEL64-NEXT: scratch_load_b32 v27, off, s32 offset:108 +; DAGISEL64-NEXT: scratch_load_b32 v28, off, s32 offset:112 +; DAGISEL64-NEXT: scratch_load_b32 v29, off, s32 offset:116 +; DAGISEL64-NEXT: scratch_load_b32 v30, off, s32 offset:120 +; DAGISEL64-NEXT: scratch_load_b32 v31, off, s32 offset:124 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v32, off, s32 offset:128 +; DAGISEL64-NEXT: scratch_load_b32 v33, off, s32 offset:132 +; DAGISEL64-NEXT: scratch_load_b32 v34, off, s32 offset:136 +; DAGISEL64-NEXT: scratch_load_b32 v35, off, s32 offset:140 +; DAGISEL64-NEXT: scratch_load_b32 v36, off, s32 offset:144 +; DAGISEL64-NEXT: scratch_load_b32 v37, off, s32 offset:148 +; DAGISEL64-NEXT: scratch_load_b32 v38, off, s32 offset:152 +; DAGISEL64-NEXT: scratch_load_b32 v39, off, s32 offset:156 +; DAGISEL64-NEXT: scratch_load_b32 v48, off, s32 offset:160 +; DAGISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:164 +; DAGISEL64-NEXT: scratch_load_b32 v50, off, s32 offset:168 +; DAGISEL64-NEXT: scratch_load_b32 v51, off, s32 offset:172 +; DAGISEL64-NEXT: scratch_load_b32 v52, off, s32 offset:176 +; DAGISEL64-NEXT: scratch_load_b32 v53, off, s32 offset:180 +; DAGISEL64-NEXT: scratch_load_b32 v54, off, s32 offset:184 +; DAGISEL64-NEXT: scratch_load_b32 v55, off, s32 offset:188 +; DAGISEL64-NEXT: scratch_load_b32 v64, off, s32 offset:192 +; DAGISEL64-NEXT: scratch_load_b32 v65, off, s32 offset:196 +; DAGISEL64-NEXT: scratch_load_b32 v66, off, s32 offset:200 +; DAGISEL64-NEXT: scratch_load_b32 v67, off, s32 offset:204 +; DAGISEL64-NEXT: scratch_load_b32 v68, off, s32 offset:208 +; DAGISEL64-NEXT: scratch_load_b32 v69, off, s32 offset:212 +; DAGISEL64-NEXT: scratch_load_b32 v70, off, s32 offset:216 +; DAGISEL64-NEXT: scratch_load_b32 v71, off, s32 offset:220 +; DAGISEL64-NEXT: scratch_load_b32 v80, off, s32 offset:224 +; DAGISEL64-NEXT: scratch_load_b32 v81, off, s32 offset:228 
+; DAGISEL64-NEXT: scratch_load_b32 v82, off, s32 offset:232 +; DAGISEL64-NEXT: scratch_load_b32 v83, off, s32 offset:236 +; DAGISEL64-NEXT: scratch_load_b32 v84, off, s32 offset:240 +; DAGISEL64-NEXT: scratch_load_b32 v85, off, s32 offset:244 +; DAGISEL64-NEXT: scratch_load_b32 v86, off, s32 offset:248 +; DAGISEL64-NEXT: scratch_load_b32 v87, off, s32 offset:252 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v96, off, s32 offset:256 +; DAGISEL64-NEXT: scratch_load_b32 v97, off, s32 offset:260 +; DAGISEL64-NEXT: scratch_load_b32 v98, off, s32 offset:264 +; DAGISEL64-NEXT: scratch_load_b32 v99, off, s32 offset:268 +; DAGISEL64-NEXT: scratch_load_b32 v100, off, s32 offset:272 +; DAGISEL64-NEXT: scratch_load_b32 v101, off, s32 offset:276 +; DAGISEL64-NEXT: scratch_load_b32 v102, off, s32 offset:280 +; DAGISEL64-NEXT: scratch_load_b32 v103, off, s32 offset:284 +; DAGISEL64-NEXT: scratch_load_b32 v112, off, s32 offset:288 +; DAGISEL64-NEXT: scratch_load_b32 v113, off, s32 offset:292 +; DAGISEL64-NEXT: scratch_load_b32 v114, off, s32 offset:296 +; DAGISEL64-NEXT: scratch_load_b32 v115, off, s32 offset:300 +; DAGISEL64-NEXT: scratch_load_b32 v116, off, s32 offset:304 +; DAGISEL64-NEXT: scratch_load_b32 v117, off, s32 offset:308 +; DAGISEL64-NEXT: scratch_load_b32 v118, off, s32 offset:312 +; DAGISEL64-NEXT: scratch_load_b32 v119, off, s32 offset:316 +; DAGISEL64-NEXT: scratch_load_b32 v128, off, s32 offset:320 +; DAGISEL64-NEXT: scratch_load_b32 v129, off, s32 offset:324 +; DAGISEL64-NEXT: scratch_load_b32 v130, off, s32 offset:328 +; DAGISEL64-NEXT: scratch_load_b32 v131, off, s32 offset:332 +; DAGISEL64-NEXT: scratch_load_b32 v132, off, s32 offset:336 +; DAGISEL64-NEXT: scratch_load_b32 v133, off, s32 offset:340 +; DAGISEL64-NEXT: scratch_load_b32 v134, off, s32 offset:344 +; DAGISEL64-NEXT: scratch_load_b32 v135, off, s32 offset:348 +; DAGISEL64-NEXT: scratch_load_b32 v144, off, s32 offset:352 +; DAGISEL64-NEXT: scratch_load_b32 v145, off, s32 offset:356 +; DAGISEL64-NEXT: scratch_load_b32 v146, off, s32 offset:360 +; DAGISEL64-NEXT: scratch_load_b32 v147, off, s32 offset:364 +; DAGISEL64-NEXT: scratch_load_b32 v148, off, s32 offset:368 +; DAGISEL64-NEXT: scratch_load_b32 v149, off, s32 offset:372 +; DAGISEL64-NEXT: scratch_load_b32 v150, off, s32 offset:376 +; DAGISEL64-NEXT: scratch_load_b32 v151, off, s32 offset:380 +; DAGISEL64-NEXT: s_clause 0x1f +; DAGISEL64-NEXT: scratch_load_b32 v160, off, s32 offset:384 +; DAGISEL64-NEXT: scratch_load_b32 v161, off, s32 offset:388 +; DAGISEL64-NEXT: scratch_load_b32 v162, off, s32 offset:392 +; DAGISEL64-NEXT: scratch_load_b32 v163, off, s32 offset:396 +; DAGISEL64-NEXT: scratch_load_b32 v164, off, s32 offset:400 +; DAGISEL64-NEXT: scratch_load_b32 v165, off, s32 offset:404 +; DAGISEL64-NEXT: scratch_load_b32 v166, off, s32 offset:408 +; DAGISEL64-NEXT: scratch_load_b32 v167, off, s32 offset:412 +; DAGISEL64-NEXT: scratch_load_b32 v176, off, s32 offset:416 +; DAGISEL64-NEXT: scratch_load_b32 v177, off, s32 offset:420 +; DAGISEL64-NEXT: scratch_load_b32 v178, off, s32 offset:424 +; DAGISEL64-NEXT: scratch_load_b32 v179, off, s32 offset:428 +; DAGISEL64-NEXT: scratch_load_b32 v180, off, s32 offset:432 +; DAGISEL64-NEXT: scratch_load_b32 v181, off, s32 offset:436 +; DAGISEL64-NEXT: scratch_load_b32 v182, off, s32 offset:440 +; DAGISEL64-NEXT: scratch_load_b32 v183, off, s32 offset:444 +; DAGISEL64-NEXT: scratch_load_b32 v192, off, s32 offset:448 +; DAGISEL64-NEXT: scratch_load_b32 v193, off, s32 offset:452 +; DAGISEL64-NEXT: 
scratch_load_b32 v194, off, s32 offset:456 +; DAGISEL64-NEXT: scratch_load_b32 v195, off, s32 offset:460 +; DAGISEL64-NEXT: scratch_load_b32 v196, off, s32 offset:464 +; DAGISEL64-NEXT: scratch_load_b32 v197, off, s32 offset:468 +; DAGISEL64-NEXT: scratch_load_b32 v198, off, s32 offset:472 +; DAGISEL64-NEXT: scratch_load_b32 v199, off, s32 offset:476 +; DAGISEL64-NEXT: scratch_load_b32 v208, off, s32 offset:480 +; DAGISEL64-NEXT: scratch_load_b32 v209, off, s32 offset:484 +; DAGISEL64-NEXT: scratch_load_b32 v210, off, s32 offset:488 +; DAGISEL64-NEXT: scratch_load_b32 v211, off, s32 offset:492 +; DAGISEL64-NEXT: scratch_load_b32 v212, off, s32 offset:496 +; DAGISEL64-NEXT: scratch_load_b32 v213, off, s32 offset:500 +; DAGISEL64-NEXT: scratch_load_b32 v214, off, s32 offset:504 +; DAGISEL64-NEXT: scratch_load_b32 v215, off, s32 offset:508 +; DAGISEL64-NEXT: s_clause 0xf +; DAGISEL64-NEXT: scratch_load_b32 v224, off, s32 offset:512 +; DAGISEL64-NEXT: scratch_load_b32 v225, off, s32 offset:516 +; DAGISEL64-NEXT: scratch_load_b32 v226, off, s32 offset:520 +; DAGISEL64-NEXT: scratch_load_b32 v227, off, s32 offset:524 +; DAGISEL64-NEXT: scratch_load_b32 v228, off, s32 offset:528 +; DAGISEL64-NEXT: scratch_load_b32 v229, off, s32 offset:532 +; DAGISEL64-NEXT: scratch_load_b32 v230, off, s32 offset:536 +; DAGISEL64-NEXT: scratch_load_b32 v231, off, s32 offset:540 +; DAGISEL64-NEXT: scratch_load_b32 v240, off, s32 offset:544 +; DAGISEL64-NEXT: scratch_load_b32 v241, off, s32 offset:548 +; DAGISEL64-NEXT: scratch_load_b32 v242, off, s32 offset:552 +; DAGISEL64-NEXT: scratch_load_b32 v243, off, s32 offset:556 +; DAGISEL64-NEXT: scratch_load_b32 v244, off, s32 offset:560 +; DAGISEL64-NEXT: scratch_load_b32 v245, off, s32 offset:564 +; DAGISEL64-NEXT: scratch_load_b32 v246, off, s32 offset:568 +; DAGISEL64-NEXT: scratch_load_b32 v247, off, s32 offset:572 +; DAGISEL64-NEXT: s_mov_b64 exec, s[0:1] +; DAGISEL64-NEXT: s_setpc_b64 s[36:37] +; +; GISEL64-LABEL: tail_call_gfx_from_whole_wave: +; GISEL64: ; %bb.0: +; GISEL64-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL64-NEXT: s_wait_expcnt 0x0 +; GISEL64-NEXT: s_wait_samplecnt 0x0 +; GISEL64-NEXT: s_wait_bvhcnt 0x0 +; GISEL64-NEXT: s_wait_kmcnt 0x0 +; GISEL64-NEXT: s_xor_saveexec_b64 s[0:1], -1 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v0, s32 +; GISEL64-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GISEL64-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GISEL64-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GISEL64-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; GISEL64-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; GISEL64-NEXT: scratch_store_b32 off, v6, s32 offset:24 +; GISEL64-NEXT: scratch_store_b32 off, v7, s32 offset:28 +; GISEL64-NEXT: scratch_store_b32 off, v8, s32 offset:32 +; GISEL64-NEXT: scratch_store_b32 off, v9, s32 offset:36 +; GISEL64-NEXT: scratch_store_b32 off, v10, s32 offset:40 +; GISEL64-NEXT: scratch_store_b32 off, v11, s32 offset:44 +; GISEL64-NEXT: scratch_store_b32 off, v12, s32 offset:48 +; GISEL64-NEXT: scratch_store_b32 off, v13, s32 offset:52 +; GISEL64-NEXT: scratch_store_b32 off, v14, s32 offset:56 +; GISEL64-NEXT: scratch_store_b32 off, v15, s32 offset:60 +; GISEL64-NEXT: scratch_store_b32 off, v16, s32 offset:64 +; GISEL64-NEXT: scratch_store_b32 off, v17, s32 offset:68 +; GISEL64-NEXT: scratch_store_b32 off, v18, s32 offset:72 +; GISEL64-NEXT: scratch_store_b32 off, v19, s32 offset:76 +; GISEL64-NEXT: scratch_store_b32 off, v20, s32 offset:80 +; GISEL64-NEXT: scratch_store_b32 off, v21, 
s32 offset:84 +; GISEL64-NEXT: scratch_store_b32 off, v22, s32 offset:88 +; GISEL64-NEXT: scratch_store_b32 off, v23, s32 offset:92 +; GISEL64-NEXT: scratch_store_b32 off, v24, s32 offset:96 +; GISEL64-NEXT: scratch_store_b32 off, v25, s32 offset:100 +; GISEL64-NEXT: scratch_store_b32 off, v26, s32 offset:104 +; GISEL64-NEXT: scratch_store_b32 off, v27, s32 offset:108 +; GISEL64-NEXT: scratch_store_b32 off, v28, s32 offset:112 +; GISEL64-NEXT: scratch_store_b32 off, v29, s32 offset:116 +; GISEL64-NEXT: scratch_store_b32 off, v30, s32 offset:120 +; GISEL64-NEXT: scratch_store_b32 off, v31, s32 offset:124 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v32, s32 offset:128 +; GISEL64-NEXT: scratch_store_b32 off, v33, s32 offset:132 +; GISEL64-NEXT: scratch_store_b32 off, v34, s32 offset:136 +; GISEL64-NEXT: scratch_store_b32 off, v35, s32 offset:140 +; GISEL64-NEXT: scratch_store_b32 off, v36, s32 offset:144 +; GISEL64-NEXT: scratch_store_b32 off, v37, s32 offset:148 +; GISEL64-NEXT: scratch_store_b32 off, v38, s32 offset:152 +; GISEL64-NEXT: scratch_store_b32 off, v39, s32 offset:156 +; GISEL64-NEXT: scratch_store_b32 off, v48, s32 offset:160 +; GISEL64-NEXT: scratch_store_b32 off, v49, s32 offset:164 +; GISEL64-NEXT: scratch_store_b32 off, v50, s32 offset:168 +; GISEL64-NEXT: scratch_store_b32 off, v51, s32 offset:172 +; GISEL64-NEXT: scratch_store_b32 off, v52, s32 offset:176 +; GISEL64-NEXT: scratch_store_b32 off, v53, s32 offset:180 +; GISEL64-NEXT: scratch_store_b32 off, v54, s32 offset:184 +; GISEL64-NEXT: scratch_store_b32 off, v55, s32 offset:188 +; GISEL64-NEXT: scratch_store_b32 off, v64, s32 offset:192 +; GISEL64-NEXT: scratch_store_b32 off, v65, s32 offset:196 +; GISEL64-NEXT: scratch_store_b32 off, v66, s32 offset:200 +; GISEL64-NEXT: scratch_store_b32 off, v67, s32 offset:204 +; GISEL64-NEXT: scratch_store_b32 off, v68, s32 offset:208 +; GISEL64-NEXT: scratch_store_b32 off, v69, s32 offset:212 +; GISEL64-NEXT: scratch_store_b32 off, v70, s32 offset:216 +; GISEL64-NEXT: scratch_store_b32 off, v71, s32 offset:220 +; GISEL64-NEXT: scratch_store_b32 off, v80, s32 offset:224 +; GISEL64-NEXT: scratch_store_b32 off, v81, s32 offset:228 +; GISEL64-NEXT: scratch_store_b32 off, v82, s32 offset:232 +; GISEL64-NEXT: scratch_store_b32 off, v83, s32 offset:236 +; GISEL64-NEXT: scratch_store_b32 off, v84, s32 offset:240 +; GISEL64-NEXT: scratch_store_b32 off, v85, s32 offset:244 +; GISEL64-NEXT: scratch_store_b32 off, v86, s32 offset:248 +; GISEL64-NEXT: scratch_store_b32 off, v87, s32 offset:252 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v96, s32 offset:256 +; GISEL64-NEXT: scratch_store_b32 off, v97, s32 offset:260 +; GISEL64-NEXT: scratch_store_b32 off, v98, s32 offset:264 +; GISEL64-NEXT: scratch_store_b32 off, v99, s32 offset:268 +; GISEL64-NEXT: scratch_store_b32 off, v100, s32 offset:272 +; GISEL64-NEXT: scratch_store_b32 off, v101, s32 offset:276 +; GISEL64-NEXT: scratch_store_b32 off, v102, s32 offset:280 +; GISEL64-NEXT: scratch_store_b32 off, v103, s32 offset:284 +; GISEL64-NEXT: scratch_store_b32 off, v112, s32 offset:288 +; GISEL64-NEXT: scratch_store_b32 off, v113, s32 offset:292 +; GISEL64-NEXT: scratch_store_b32 off, v114, s32 offset:296 +; GISEL64-NEXT: scratch_store_b32 off, v115, s32 offset:300 +; GISEL64-NEXT: scratch_store_b32 off, v116, s32 offset:304 +; GISEL64-NEXT: scratch_store_b32 off, v117, s32 offset:308 +; GISEL64-NEXT: scratch_store_b32 off, v118, s32 offset:312 +; GISEL64-NEXT: scratch_store_b32 off, v119, s32 
offset:316 +; GISEL64-NEXT: scratch_store_b32 off, v128, s32 offset:320 +; GISEL64-NEXT: scratch_store_b32 off, v129, s32 offset:324 +; GISEL64-NEXT: scratch_store_b32 off, v130, s32 offset:328 +; GISEL64-NEXT: scratch_store_b32 off, v131, s32 offset:332 +; GISEL64-NEXT: scratch_store_b32 off, v132, s32 offset:336 +; GISEL64-NEXT: scratch_store_b32 off, v133, s32 offset:340 +; GISEL64-NEXT: scratch_store_b32 off, v134, s32 offset:344 +; GISEL64-NEXT: scratch_store_b32 off, v135, s32 offset:348 +; GISEL64-NEXT: scratch_store_b32 off, v144, s32 offset:352 +; GISEL64-NEXT: scratch_store_b32 off, v145, s32 offset:356 +; GISEL64-NEXT: scratch_store_b32 off, v146, s32 offset:360 +; GISEL64-NEXT: scratch_store_b32 off, v147, s32 offset:364 +; GISEL64-NEXT: scratch_store_b32 off, v148, s32 offset:368 +; GISEL64-NEXT: scratch_store_b32 off, v149, s32 offset:372 +; GISEL64-NEXT: scratch_store_b32 off, v150, s32 offset:376 +; GISEL64-NEXT: scratch_store_b32 off, v151, s32 offset:380 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_store_b32 off, v160, s32 offset:384 +; GISEL64-NEXT: scratch_store_b32 off, v161, s32 offset:388 +; GISEL64-NEXT: scratch_store_b32 off, v162, s32 offset:392 +; GISEL64-NEXT: scratch_store_b32 off, v163, s32 offset:396 +; GISEL64-NEXT: scratch_store_b32 off, v164, s32 offset:400 +; GISEL64-NEXT: scratch_store_b32 off, v165, s32 offset:404 +; GISEL64-NEXT: scratch_store_b32 off, v166, s32 offset:408 +; GISEL64-NEXT: scratch_store_b32 off, v167, s32 offset:412 +; GISEL64-NEXT: scratch_store_b32 off, v176, s32 offset:416 +; GISEL64-NEXT: scratch_store_b32 off, v177, s32 offset:420 +; GISEL64-NEXT: scratch_store_b32 off, v178, s32 offset:424 +; GISEL64-NEXT: scratch_store_b32 off, v179, s32 offset:428 +; GISEL64-NEXT: scratch_store_b32 off, v180, s32 offset:432 +; GISEL64-NEXT: scratch_store_b32 off, v181, s32 offset:436 +; GISEL64-NEXT: scratch_store_b32 off, v182, s32 offset:440 +; GISEL64-NEXT: scratch_store_b32 off, v183, s32 offset:444 +; GISEL64-NEXT: scratch_store_b32 off, v192, s32 offset:448 +; GISEL64-NEXT: scratch_store_b32 off, v193, s32 offset:452 +; GISEL64-NEXT: scratch_store_b32 off, v194, s32 offset:456 +; GISEL64-NEXT: scratch_store_b32 off, v195, s32 offset:460 +; GISEL64-NEXT: scratch_store_b32 off, v196, s32 offset:464 +; GISEL64-NEXT: scratch_store_b32 off, v197, s32 offset:468 +; GISEL64-NEXT: scratch_store_b32 off, v198, s32 offset:472 +; GISEL64-NEXT: scratch_store_b32 off, v199, s32 offset:476 +; GISEL64-NEXT: scratch_store_b32 off, v208, s32 offset:480 +; GISEL64-NEXT: scratch_store_b32 off, v209, s32 offset:484 +; GISEL64-NEXT: scratch_store_b32 off, v210, s32 offset:488 +; GISEL64-NEXT: scratch_store_b32 off, v211, s32 offset:492 +; GISEL64-NEXT: scratch_store_b32 off, v212, s32 offset:496 +; GISEL64-NEXT: scratch_store_b32 off, v213, s32 offset:500 +; GISEL64-NEXT: scratch_store_b32 off, v214, s32 offset:504 +; GISEL64-NEXT: scratch_store_b32 off, v215, s32 offset:508 +; GISEL64-NEXT: s_clause 0xf +; GISEL64-NEXT: scratch_store_b32 off, v224, s32 offset:512 +; GISEL64-NEXT: scratch_store_b32 off, v225, s32 offset:516 +; GISEL64-NEXT: scratch_store_b32 off, v226, s32 offset:520 +; GISEL64-NEXT: scratch_store_b32 off, v227, s32 offset:524 +; GISEL64-NEXT: scratch_store_b32 off, v228, s32 offset:528 +; GISEL64-NEXT: scratch_store_b32 off, v229, s32 offset:532 +; GISEL64-NEXT: scratch_store_b32 off, v230, s32 offset:536 +; GISEL64-NEXT: scratch_store_b32 off, v231, s32 offset:540 +; GISEL64-NEXT: scratch_store_b32 off, v240, s32 offset:544 +; 
GISEL64-NEXT: scratch_store_b32 off, v241, s32 offset:548 +; GISEL64-NEXT: scratch_store_b32 off, v242, s32 offset:552 +; GISEL64-NEXT: scratch_store_b32 off, v243, s32 offset:556 +; GISEL64-NEXT: scratch_store_b32 off, v244, s32 offset:560 +; GISEL64-NEXT: scratch_store_b32 off, v245, s32 offset:564 +; GISEL64-NEXT: scratch_store_b32 off, v246, s32 offset:568 +; GISEL64-NEXT: scratch_store_b32 off, v247, s32 offset:572 +; GISEL64-NEXT: s_mov_b64 exec, -1 +; GISEL64-NEXT: v_mov_b32_e32 v2, v0 +; GISEL64-NEXT: v_swap_b32 v0, v1 +; GISEL64-NEXT: s_mov_b32 s36, gfx_callee@abs32@lo +; GISEL64-NEXT: s_mov_b32 s37, gfx_callee@abs32@hi +; GISEL64-NEXT: s_wait_alu 0xfffe +; GISEL64-NEXT: s_xor_b64 exec, s[0:1], -1 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v0, off, s32 +; GISEL64-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GISEL64-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GISEL64-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GISEL64-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; GISEL64-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; GISEL64-NEXT: scratch_load_b32 v6, off, s32 offset:24 +; GISEL64-NEXT: scratch_load_b32 v7, off, s32 offset:28 +; GISEL64-NEXT: scratch_load_b32 v8, off, s32 offset:32 +; GISEL64-NEXT: scratch_load_b32 v9, off, s32 offset:36 +; GISEL64-NEXT: scratch_load_b32 v10, off, s32 offset:40 +; GISEL64-NEXT: scratch_load_b32 v11, off, s32 offset:44 +; GISEL64-NEXT: scratch_load_b32 v12, off, s32 offset:48 +; GISEL64-NEXT: scratch_load_b32 v13, off, s32 offset:52 +; GISEL64-NEXT: scratch_load_b32 v14, off, s32 offset:56 +; GISEL64-NEXT: scratch_load_b32 v15, off, s32 offset:60 +; GISEL64-NEXT: scratch_load_b32 v16, off, s32 offset:64 +; GISEL64-NEXT: scratch_load_b32 v17, off, s32 offset:68 +; GISEL64-NEXT: scratch_load_b32 v18, off, s32 offset:72 +; GISEL64-NEXT: scratch_load_b32 v19, off, s32 offset:76 +; GISEL64-NEXT: scratch_load_b32 v20, off, s32 offset:80 +; GISEL64-NEXT: scratch_load_b32 v21, off, s32 offset:84 +; GISEL64-NEXT: scratch_load_b32 v22, off, s32 offset:88 +; GISEL64-NEXT: scratch_load_b32 v23, off, s32 offset:92 +; GISEL64-NEXT: scratch_load_b32 v24, off, s32 offset:96 +; GISEL64-NEXT: scratch_load_b32 v25, off, s32 offset:100 +; GISEL64-NEXT: scratch_load_b32 v26, off, s32 offset:104 +; GISEL64-NEXT: scratch_load_b32 v27, off, s32 offset:108 +; GISEL64-NEXT: scratch_load_b32 v28, off, s32 offset:112 +; GISEL64-NEXT: scratch_load_b32 v29, off, s32 offset:116 +; GISEL64-NEXT: scratch_load_b32 v30, off, s32 offset:120 +; GISEL64-NEXT: scratch_load_b32 v31, off, s32 offset:124 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v32, off, s32 offset:128 +; GISEL64-NEXT: scratch_load_b32 v33, off, s32 offset:132 +; GISEL64-NEXT: scratch_load_b32 v34, off, s32 offset:136 +; GISEL64-NEXT: scratch_load_b32 v35, off, s32 offset:140 +; GISEL64-NEXT: scratch_load_b32 v36, off, s32 offset:144 +; GISEL64-NEXT: scratch_load_b32 v37, off, s32 offset:148 +; GISEL64-NEXT: scratch_load_b32 v38, off, s32 offset:152 +; GISEL64-NEXT: scratch_load_b32 v39, off, s32 offset:156 +; GISEL64-NEXT: scratch_load_b32 v48, off, s32 offset:160 +; GISEL64-NEXT: scratch_load_b32 v49, off, s32 offset:164 +; GISEL64-NEXT: scratch_load_b32 v50, off, s32 offset:168 +; GISEL64-NEXT: scratch_load_b32 v51, off, s32 offset:172 +; GISEL64-NEXT: scratch_load_b32 v52, off, s32 offset:176 +; GISEL64-NEXT: scratch_load_b32 v53, off, s32 offset:180 +; GISEL64-NEXT: scratch_load_b32 v54, off, s32 offset:184 +; GISEL64-NEXT: scratch_load_b32 v55, off, s32 
offset:188 +; GISEL64-NEXT: scratch_load_b32 v64, off, s32 offset:192 +; GISEL64-NEXT: scratch_load_b32 v65, off, s32 offset:196 +; GISEL64-NEXT: scratch_load_b32 v66, off, s32 offset:200 +; GISEL64-NEXT: scratch_load_b32 v67, off, s32 offset:204 +; GISEL64-NEXT: scratch_load_b32 v68, off, s32 offset:208 +; GISEL64-NEXT: scratch_load_b32 v69, off, s32 offset:212 +; GISEL64-NEXT: scratch_load_b32 v70, off, s32 offset:216 +; GISEL64-NEXT: scratch_load_b32 v71, off, s32 offset:220 +; GISEL64-NEXT: scratch_load_b32 v80, off, s32 offset:224 +; GISEL64-NEXT: scratch_load_b32 v81, off, s32 offset:228 +; GISEL64-NEXT: scratch_load_b32 v82, off, s32 offset:232 +; GISEL64-NEXT: scratch_load_b32 v83, off, s32 offset:236 +; GISEL64-NEXT: scratch_load_b32 v84, off, s32 offset:240 +; GISEL64-NEXT: scratch_load_b32 v85, off, s32 offset:244 +; GISEL64-NEXT: scratch_load_b32 v86, off, s32 offset:248 +; GISEL64-NEXT: scratch_load_b32 v87, off, s32 offset:252 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v96, off, s32 offset:256 +; GISEL64-NEXT: scratch_load_b32 v97, off, s32 offset:260 +; GISEL64-NEXT: scratch_load_b32 v98, off, s32 offset:264 +; GISEL64-NEXT: scratch_load_b32 v99, off, s32 offset:268 +; GISEL64-NEXT: scratch_load_b32 v100, off, s32 offset:272 +; GISEL64-NEXT: scratch_load_b32 v101, off, s32 offset:276 +; GISEL64-NEXT: scratch_load_b32 v102, off, s32 offset:280 +; GISEL64-NEXT: scratch_load_b32 v103, off, s32 offset:284 +; GISEL64-NEXT: scratch_load_b32 v112, off, s32 offset:288 +; GISEL64-NEXT: scratch_load_b32 v113, off, s32 offset:292 +; GISEL64-NEXT: scratch_load_b32 v114, off, s32 offset:296 +; GISEL64-NEXT: scratch_load_b32 v115, off, s32 offset:300 +; GISEL64-NEXT: scratch_load_b32 v116, off, s32 offset:304 +; GISEL64-NEXT: scratch_load_b32 v117, off, s32 offset:308 +; GISEL64-NEXT: scratch_load_b32 v118, off, s32 offset:312 +; GISEL64-NEXT: scratch_load_b32 v119, off, s32 offset:316 +; GISEL64-NEXT: scratch_load_b32 v128, off, s32 offset:320 +; GISEL64-NEXT: scratch_load_b32 v129, off, s32 offset:324 +; GISEL64-NEXT: scratch_load_b32 v130, off, s32 offset:328 +; GISEL64-NEXT: scratch_load_b32 v131, off, s32 offset:332 +; GISEL64-NEXT: scratch_load_b32 v132, off, s32 offset:336 +; GISEL64-NEXT: scratch_load_b32 v133, off, s32 offset:340 +; GISEL64-NEXT: scratch_load_b32 v134, off, s32 offset:344 +; GISEL64-NEXT: scratch_load_b32 v135, off, s32 offset:348 +; GISEL64-NEXT: scratch_load_b32 v144, off, s32 offset:352 +; GISEL64-NEXT: scratch_load_b32 v145, off, s32 offset:356 +; GISEL64-NEXT: scratch_load_b32 v146, off, s32 offset:360 +; GISEL64-NEXT: scratch_load_b32 v147, off, s32 offset:364 +; GISEL64-NEXT: scratch_load_b32 v148, off, s32 offset:368 +; GISEL64-NEXT: scratch_load_b32 v149, off, s32 offset:372 +; GISEL64-NEXT: scratch_load_b32 v150, off, s32 offset:376 +; GISEL64-NEXT: scratch_load_b32 v151, off, s32 offset:380 +; GISEL64-NEXT: s_clause 0x1f +; GISEL64-NEXT: scratch_load_b32 v160, off, s32 offset:384 +; GISEL64-NEXT: scratch_load_b32 v161, off, s32 offset:388 +; GISEL64-NEXT: scratch_load_b32 v162, off, s32 offset:392 +; GISEL64-NEXT: scratch_load_b32 v163, off, s32 offset:396 +; GISEL64-NEXT: scratch_load_b32 v164, off, s32 offset:400 +; GISEL64-NEXT: scratch_load_b32 v165, off, s32 offset:404 +; GISEL64-NEXT: scratch_load_b32 v166, off, s32 offset:408 +; GISEL64-NEXT: scratch_load_b32 v167, off, s32 offset:412 +; GISEL64-NEXT: scratch_load_b32 v176, off, s32 offset:416 +; GISEL64-NEXT: scratch_load_b32 v177, off, s32 offset:420 +; GISEL64-NEXT: 
scratch_load_b32 v178, off, s32 offset:424 +; GISEL64-NEXT: scratch_load_b32 v179, off, s32 offset:428 +; GISEL64-NEXT: scratch_load_b32 v180, off, s32 offset:432 +; GISEL64-NEXT: scratch_load_b32 v181, off, s32 offset:436 +; GISEL64-NEXT: scratch_load_b32 v182, off, s32 offset:440 +; GISEL64-NEXT: scratch_load_b32 v183, off, s32 offset:444 +; GISEL64-NEXT: scratch_load_b32 v192, off, s32 offset:448 +; GISEL64-NEXT: scratch_load_b32 v193, off, s32 offset:452 +; GISEL64-NEXT: scratch_load_b32 v194, off, s32 offset:456 +; GISEL64-NEXT: scratch_load_b32 v195, off, s32 offset:460 +; GISEL64-NEXT: scratch_load_b32 v196, off, s32 offset:464 +; GISEL64-NEXT: scratch_load_b32 v197, off, s32 offset:468 +; GISEL64-NEXT: scratch_load_b32 v198, off, s32 offset:472 +; GISEL64-NEXT: scratch_load_b32 v199, off, s32 offset:476 +; GISEL64-NEXT: scratch_load_b32 v208, off, s32 offset:480 +; GISEL64-NEXT: scratch_load_b32 v209, off, s32 offset:484 +; GISEL64-NEXT: scratch_load_b32 v210, off, s32 offset:488 +; GISEL64-NEXT: scratch_load_b32 v211, off, s32 offset:492 +; GISEL64-NEXT: scratch_load_b32 v212, off, s32 offset:496 +; GISEL64-NEXT: scratch_load_b32 v213, off, s32 offset:500 +; GISEL64-NEXT: scratch_load_b32 v214, off, s32 offset:504 +; GISEL64-NEXT: scratch_load_b32 v215, off, s32 offset:508 +; GISEL64-NEXT: s_clause 0xf +; GISEL64-NEXT: scratch_load_b32 v224, off, s32 offset:512 +; GISEL64-NEXT: scratch_load_b32 v225, off, s32 offset:516 +; GISEL64-NEXT: scratch_load_b32 v226, off, s32 offset:520 +; GISEL64-NEXT: scratch_load_b32 v227, off, s32 offset:524 +; GISEL64-NEXT: scratch_load_b32 v228, off, s32 offset:528 +; GISEL64-NEXT: scratch_load_b32 v229, off, s32 offset:532 +; GISEL64-NEXT: scratch_load_b32 v230, off, s32 offset:536 +; GISEL64-NEXT: scratch_load_b32 v231, off, s32 offset:540 +; GISEL64-NEXT: scratch_load_b32 v240, off, s32 offset:544 +; GISEL64-NEXT: scratch_load_b32 v241, off, s32 offset:548 +; GISEL64-NEXT: scratch_load_b32 v242, off, s32 offset:552 +; GISEL64-NEXT: scratch_load_b32 v243, off, s32 offset:556 +; GISEL64-NEXT: scratch_load_b32 v244, off, s32 offset:560 +; GISEL64-NEXT: scratch_load_b32 v245, off, s32 offset:564 +; GISEL64-NEXT: scratch_load_b32 v246, off, s32 offset:568 +; GISEL64-NEXT: scratch_load_b32 v247, off, s32 offset:572 +; GISEL64-NEXT: s_mov_b64 exec, s[0:1] +; GISEL64-NEXT: s_setpc_b64 s[36:37] +; +; GFX1250-DAGISEL-LABEL: tail_call_gfx_from_whole_wave: +; GFX1250-DAGISEL: ; %bb.0: +; GFX1250-DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-DAGISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-DAGISEL-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0, s32 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1, s32 offset:4 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2, s32 offset:8 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v3, s32 offset:12 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v4, s32 offset:16 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v5, s32 offset:20 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v6, s32 offset:24 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v7, s32 offset:28 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v8, s32 offset:32 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v9, s32 offset:36 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v10, s32 offset:40 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v11, s32 offset:44 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v12, s32 offset:48 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v13, s32 
offset:52 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v14, s32 offset:56 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v15, s32 offset:60 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v16, s32 offset:64 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v17, s32 offset:68 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v18, s32 offset:72 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v19, s32 offset:76 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v20, s32 offset:80 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v21, s32 offset:84 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v22, s32 offset:88 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v23, s32 offset:92 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v24, s32 offset:96 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v25, s32 offset:100 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v26, s32 offset:104 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v27, s32 offset:108 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v28, s32 offset:112 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v29, s32 offset:116 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v30, s32 offset:120 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v31, s32 offset:124 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v32, s32 offset:128 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v33, s32 offset:132 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v34, s32 offset:136 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v35, s32 offset:140 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v36, s32 offset:144 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v37, s32 offset:148 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v38, s32 offset:152 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v39, s32 offset:156 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v48, s32 offset:160 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v49, s32 offset:164 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v50, s32 offset:168 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v51, s32 offset:172 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v52, s32 offset:176 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v53, s32 offset:180 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v54, s32 offset:184 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v55, s32 offset:188 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v64, s32 offset:192 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v65, s32 offset:196 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v66, s32 offset:200 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v67, s32 offset:204 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v68, s32 offset:208 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v69, s32 offset:212 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v70, s32 offset:216 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v71, s32 offset:220 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v80, s32 offset:224 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v81, s32 offset:228 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v82, s32 offset:232 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v83, s32 offset:236 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v84, s32 offset:240 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v85, s32 offset:244 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v86, s32 offset:248 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v87, s32 offset:252 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v96, s32 offset:256 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, 
v97, s32 offset:260 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v98, s32 offset:264 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v99, s32 offset:268 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v100, s32 offset:272 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v101, s32 offset:276 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v102, s32 offset:280 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v103, s32 offset:284 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v112, s32 offset:288 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v113, s32 offset:292 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v114, s32 offset:296 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v115, s32 offset:300 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v116, s32 offset:304 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v117, s32 offset:308 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v118, s32 offset:312 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v119, s32 offset:316 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v128, s32 offset:320 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v129, s32 offset:324 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v130, s32 offset:328 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v131, s32 offset:332 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v132, s32 offset:336 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v133, s32 offset:340 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v134, s32 offset:344 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v135, s32 offset:348 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v144, s32 offset:352 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v145, s32 offset:356 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v146, s32 offset:360 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v147, s32 offset:364 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v148, s32 offset:368 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v149, s32 offset:372 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v150, s32 offset:376 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v151, s32 offset:380 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v160, s32 offset:384 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v161, s32 offset:388 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v162, s32 offset:392 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v163, s32 offset:396 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v164, s32 offset:400 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v165, s32 offset:404 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v166, s32 offset:408 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v167, s32 offset:412 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v176, s32 offset:416 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v177, s32 offset:420 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v178, s32 offset:424 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v179, s32 offset:428 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v180, s32 offset:432 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v181, s32 offset:436 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v182, s32 offset:440 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v183, s32 offset:444 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v192, s32 offset:448 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v193, s32 offset:452 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v194, s32 offset:456 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v195, s32 offset:460 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v196, s32 offset:464 +; 
GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v197, s32 offset:468 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v198, s32 offset:472 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v199, s32 offset:476 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v208, s32 offset:480 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v209, s32 offset:484 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v210, s32 offset:488 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v211, s32 offset:492 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v212, s32 offset:496 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v213, s32 offset:500 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v214, s32 offset:504 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v215, s32 offset:508 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v224, s32 offset:512 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v225, s32 offset:516 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v226, s32 offset:520 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v227, s32 offset:524 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v228, s32 offset:528 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v229, s32 offset:532 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v230, s32 offset:536 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v231, s32 offset:540 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v240, s32 offset:544 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v241, s32 offset:548 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v242, s32 offset:552 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v243, s32 offset:556 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v244, s32 offset:560 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v245, s32 offset:564 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v246, s32 offset:568 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v247, s32 offset:572 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 4 ; msbs: dst=0 src0=0 src1=1 src2=0 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v256*/, s32 offset:576 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v257*/, s32 offset:580 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v258*/, s32 offset:584 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v3 /*v259*/, s32 offset:588 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v4 /*v260*/, s32 offset:592 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v5 /*v261*/, s32 offset:596 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v6 /*v262*/, s32 offset:600 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v7 /*v263*/, s32 offset:604 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v8 /*v264*/, s32 offset:608 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v9 /*v265*/, s32 offset:612 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v10 /*v266*/, s32 offset:616 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v11 /*v267*/, s32 offset:620 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v12 /*v268*/, s32 offset:624 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v13 /*v269*/, s32 offset:628 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v14 /*v270*/, s32 offset:632 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v15 /*v271*/, s32 offset:636 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v16 /*v272*/, s32 offset:640 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v17 /*v273*/, s32 offset:644 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v18 /*v274*/, s32 offset:648 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v19 /*v275*/, s32 offset:652 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, 
v20 /*v276*/, s32 offset:656 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v21 /*v277*/, s32 offset:660 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v22 /*v278*/, s32 offset:664 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v23 /*v279*/, s32 offset:668 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v24 /*v280*/, s32 offset:672 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v25 /*v281*/, s32 offset:676 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v26 /*v282*/, s32 offset:680 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v27 /*v283*/, s32 offset:684 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v28 /*v284*/, s32 offset:688 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v29 /*v285*/, s32 offset:692 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v30 /*v286*/, s32 offset:696 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v31 /*v287*/, s32 offset:700 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v32 /*v288*/, s32 offset:704 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v33 /*v289*/, s32 offset:708 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v34 /*v290*/, s32 offset:712 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v35 /*v291*/, s32 offset:716 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v36 /*v292*/, s32 offset:720 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v37 /*v293*/, s32 offset:724 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v38 /*v294*/, s32 offset:728 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v39 /*v295*/, s32 offset:732 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40 /*v296*/, s32 offset:736 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v41 /*v297*/, s32 offset:740 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42 /*v298*/, s32 offset:744 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v43 /*v299*/, s32 offset:748 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v44 /*v300*/, s32 offset:752 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v45 /*v301*/, s32 offset:756 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v46 /*v302*/, s32 offset:760 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v47 /*v303*/, s32 offset:764 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v48 /*v304*/, s32 offset:768 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v49 /*v305*/, s32 offset:772 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v50 /*v306*/, s32 offset:776 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v51 /*v307*/, s32 offset:780 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v52 /*v308*/, s32 offset:784 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v53 /*v309*/, s32 offset:788 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v54 /*v310*/, s32 offset:792 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v55 /*v311*/, s32 offset:796 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v56 /*v312*/, s32 offset:800 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v57 /*v313*/, s32 offset:804 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v58 /*v314*/, s32 offset:808 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v59 /*v315*/, s32 offset:812 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v60 /*v316*/, s32 offset:816 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v61 /*v317*/, s32 offset:820 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v62 /*v318*/, s32 offset:824 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v63 /*v319*/, s32 offset:828 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v64 /*v320*/, s32 offset:832 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v65 /*v321*/, s32 offset:836 +; 
GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v66 /*v322*/, s32 offset:840 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v67 /*v323*/, s32 offset:844 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v68 /*v324*/, s32 offset:848 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v69 /*v325*/, s32 offset:852 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v70 /*v326*/, s32 offset:856 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v71 /*v327*/, s32 offset:860 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v72 /*v328*/, s32 offset:864 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v73 /*v329*/, s32 offset:868 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v74 /*v330*/, s32 offset:872 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v75 /*v331*/, s32 offset:876 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v76 /*v332*/, s32 offset:880 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v77 /*v333*/, s32 offset:884 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v78 /*v334*/, s32 offset:888 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v79 /*v335*/, s32 offset:892 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v80 /*v336*/, s32 offset:896 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v81 /*v337*/, s32 offset:900 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v82 /*v338*/, s32 offset:904 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v83 /*v339*/, s32 offset:908 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v84 /*v340*/, s32 offset:912 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v85 /*v341*/, s32 offset:916 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v86 /*v342*/, s32 offset:920 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v87 /*v343*/, s32 offset:924 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v88 /*v344*/, s32 offset:928 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v89 /*v345*/, s32 offset:932 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v90 /*v346*/, s32 offset:936 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v91 /*v347*/, s32 offset:940 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v92 /*v348*/, s32 offset:944 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v93 /*v349*/, s32 offset:948 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v94 /*v350*/, s32 offset:952 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v95 /*v351*/, s32 offset:956 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v96 /*v352*/, s32 offset:960 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v97 /*v353*/, s32 offset:964 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v98 /*v354*/, s32 offset:968 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v99 /*v355*/, s32 offset:972 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v100 /*v356*/, s32 offset:976 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v101 /*v357*/, s32 offset:980 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v102 /*v358*/, s32 offset:984 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v103 /*v359*/, s32 offset:988 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v104 /*v360*/, s32 offset:992 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v105 /*v361*/, s32 offset:996 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v106 /*v362*/, s32 offset:1000 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v107 /*v363*/, s32 offset:1004 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v108 /*v364*/, s32 offset:1008 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v109 /*v365*/, s32 offset:1012 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v110 /*v366*/, s32 offset:1016 +; GFX1250-DAGISEL-NEXT: 
scratch_store_b32 off, v111 /*v367*/, s32 offset:1020 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v112 /*v368*/, s32 offset:1024 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v113 /*v369*/, s32 offset:1028 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v114 /*v370*/, s32 offset:1032 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v115 /*v371*/, s32 offset:1036 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v116 /*v372*/, s32 offset:1040 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v117 /*v373*/, s32 offset:1044 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v118 /*v374*/, s32 offset:1048 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v119 /*v375*/, s32 offset:1052 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v120 /*v376*/, s32 offset:1056 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v121 /*v377*/, s32 offset:1060 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v122 /*v378*/, s32 offset:1064 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v123 /*v379*/, s32 offset:1068 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v124 /*v380*/, s32 offset:1072 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v125 /*v381*/, s32 offset:1076 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v126 /*v382*/, s32 offset:1080 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v127 /*v383*/, s32 offset:1084 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v128 /*v384*/, s32 offset:1088 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v129 /*v385*/, s32 offset:1092 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v130 /*v386*/, s32 offset:1096 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v131 /*v387*/, s32 offset:1100 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v132 /*v388*/, s32 offset:1104 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v133 /*v389*/, s32 offset:1108 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v134 /*v390*/, s32 offset:1112 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v135 /*v391*/, s32 offset:1116 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v136 /*v392*/, s32 offset:1120 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v137 /*v393*/, s32 offset:1124 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v138 /*v394*/, s32 offset:1128 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v139 /*v395*/, s32 offset:1132 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v140 /*v396*/, s32 offset:1136 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v141 /*v397*/, s32 offset:1140 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v142 /*v398*/, s32 offset:1144 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v143 /*v399*/, s32 offset:1148 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v144 /*v400*/, s32 offset:1152 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v145 /*v401*/, s32 offset:1156 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v146 /*v402*/, s32 offset:1160 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v147 /*v403*/, s32 offset:1164 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v148 /*v404*/, s32 offset:1168 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v149 /*v405*/, s32 offset:1172 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v150 /*v406*/, s32 offset:1176 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v151 /*v407*/, s32 offset:1180 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v152 /*v408*/, s32 offset:1184 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v153 /*v409*/, s32 offset:1188 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v154 /*v410*/, s32 offset:1192 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v155 /*v411*/, s32 offset:1196 +; GFX1250-DAGISEL-NEXT: 
scratch_store_b32 off, v156 /*v412*/, s32 offset:1200 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v157 /*v413*/, s32 offset:1204 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v158 /*v414*/, s32 offset:1208 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v159 /*v415*/, s32 offset:1212 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v160 /*v416*/, s32 offset:1216 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v161 /*v417*/, s32 offset:1220 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v162 /*v418*/, s32 offset:1224 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v163 /*v419*/, s32 offset:1228 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v164 /*v420*/, s32 offset:1232 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v165 /*v421*/, s32 offset:1236 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v166 /*v422*/, s32 offset:1240 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v167 /*v423*/, s32 offset:1244 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v168 /*v424*/, s32 offset:1248 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v169 /*v425*/, s32 offset:1252 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v170 /*v426*/, s32 offset:1256 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v171 /*v427*/, s32 offset:1260 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v172 /*v428*/, s32 offset:1264 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v173 /*v429*/, s32 offset:1268 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v174 /*v430*/, s32 offset:1272 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v175 /*v431*/, s32 offset:1276 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v176 /*v432*/, s32 offset:1280 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v177 /*v433*/, s32 offset:1284 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v178 /*v434*/, s32 offset:1288 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v179 /*v435*/, s32 offset:1292 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v180 /*v436*/, s32 offset:1296 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v181 /*v437*/, s32 offset:1300 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v182 /*v438*/, s32 offset:1304 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v183 /*v439*/, s32 offset:1308 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v184 /*v440*/, s32 offset:1312 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v185 /*v441*/, s32 offset:1316 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v186 /*v442*/, s32 offset:1320 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v187 /*v443*/, s32 offset:1324 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v188 /*v444*/, s32 offset:1328 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v189 /*v445*/, s32 offset:1332 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v190 /*v446*/, s32 offset:1336 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v191 /*v447*/, s32 offset:1340 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v192 /*v448*/, s32 offset:1344 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v193 /*v449*/, s32 offset:1348 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v194 /*v450*/, s32 offset:1352 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v195 /*v451*/, s32 offset:1356 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v196 /*v452*/, s32 offset:1360 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v197 /*v453*/, s32 offset:1364 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v198 /*v454*/, s32 offset:1368 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v199 /*v455*/, s32 offset:1372 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v200 /*v456*/, 
s32 offset:1376 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v201 /*v457*/, s32 offset:1380 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v202 /*v458*/, s32 offset:1384 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v203 /*v459*/, s32 offset:1388 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v204 /*v460*/, s32 offset:1392 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v205 /*v461*/, s32 offset:1396 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v206 /*v462*/, s32 offset:1400 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v207 /*v463*/, s32 offset:1404 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v208 /*v464*/, s32 offset:1408 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v209 /*v465*/, s32 offset:1412 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v210 /*v466*/, s32 offset:1416 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v211 /*v467*/, s32 offset:1420 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v212 /*v468*/, s32 offset:1424 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v213 /*v469*/, s32 offset:1428 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v214 /*v470*/, s32 offset:1432 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v215 /*v471*/, s32 offset:1436 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v216 /*v472*/, s32 offset:1440 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v217 /*v473*/, s32 offset:1444 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v218 /*v474*/, s32 offset:1448 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v219 /*v475*/, s32 offset:1452 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v220 /*v476*/, s32 offset:1456 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v221 /*v477*/, s32 offset:1460 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v222 /*v478*/, s32 offset:1464 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v223 /*v479*/, s32 offset:1468 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v224 /*v480*/, s32 offset:1472 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v225 /*v481*/, s32 offset:1476 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v226 /*v482*/, s32 offset:1480 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v227 /*v483*/, s32 offset:1484 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v228 /*v484*/, s32 offset:1488 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v229 /*v485*/, s32 offset:1492 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v230 /*v486*/, s32 offset:1496 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v231 /*v487*/, s32 offset:1500 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v232 /*v488*/, s32 offset:1504 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v233 /*v489*/, s32 offset:1508 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v234 /*v490*/, s32 offset:1512 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v235 /*v491*/, s32 offset:1516 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v236 /*v492*/, s32 offset:1520 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v237 /*v493*/, s32 offset:1524 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v238 /*v494*/, s32 offset:1528 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v239 /*v495*/, s32 offset:1532 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v240 /*v496*/, s32 offset:1536 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v241 /*v497*/, s32 offset:1540 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v242 /*v498*/, s32 offset:1544 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v243 /*v499*/, s32 offset:1548 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v244 /*v500*/, s32 offset:1552 +; 
GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v245 /*v501*/, s32 offset:1556 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v246 /*v502*/, s32 offset:1560 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v247 /*v503*/, s32 offset:1564 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v248 /*v504*/, s32 offset:1568 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v249 /*v505*/, s32 offset:1572 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v250 /*v506*/, s32 offset:1576 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v251 /*v507*/, s32 offset:1580 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v252 /*v508*/, s32 offset:1584 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v509*/, s32 offset:1588 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v510*/, s32 offset:1592 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v511*/, s32 offset:1596 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 8 ; msbs: dst=0 src0=0 src1=2 src2=0 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v0 /*v512*/, s32 offset:1600 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v513*/, s32 offset:1604 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v514*/, s32 offset:1608 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v3 /*v515*/, s32 offset:1612 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v4 /*v516*/, s32 offset:1616 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v5 /*v517*/, s32 offset:1620 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v6 /*v518*/, s32 offset:1624 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v7 /*v519*/, s32 offset:1628 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v8 /*v520*/, s32 offset:1632 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v9 /*v521*/, s32 offset:1636 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v10 /*v522*/, s32 offset:1640 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v11 /*v523*/, s32 offset:1644 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v12 /*v524*/, s32 offset:1648 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v13 /*v525*/, s32 offset:1652 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v14 /*v526*/, s32 offset:1656 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v15 /*v527*/, s32 offset:1660 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v16 /*v528*/, s32 offset:1664 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v17 /*v529*/, s32 offset:1668 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v18 /*v530*/, s32 offset:1672 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v19 /*v531*/, s32 offset:1676 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v20 /*v532*/, s32 offset:1680 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v21 /*v533*/, s32 offset:1684 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v22 /*v534*/, s32 offset:1688 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v23 /*v535*/, s32 offset:1692 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v24 /*v536*/, s32 offset:1696 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v25 /*v537*/, s32 offset:1700 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v26 /*v538*/, s32 offset:1704 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v27 /*v539*/, s32 offset:1708 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v28 /*v540*/, s32 offset:1712 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v29 /*v541*/, s32 offset:1716 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v30 /*v542*/, s32 offset:1720 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v31 /*v543*/, s32 offset:1724 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v32 /*v544*/, s32 offset:1728 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, 
v33 /*v545*/, s32 offset:1732 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v34 /*v546*/, s32 offset:1736 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v35 /*v547*/, s32 offset:1740 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v36 /*v548*/, s32 offset:1744 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v37 /*v549*/, s32 offset:1748 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v38 /*v550*/, s32 offset:1752 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v39 /*v551*/, s32 offset:1756 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40 /*v552*/, s32 offset:1760 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v41 /*v553*/, s32 offset:1764 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42 /*v554*/, s32 offset:1768 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v43 /*v555*/, s32 offset:1772 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v44 /*v556*/, s32 offset:1776 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v45 /*v557*/, s32 offset:1780 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v46 /*v558*/, s32 offset:1784 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v47 /*v559*/, s32 offset:1788 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v48 /*v560*/, s32 offset:1792 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v49 /*v561*/, s32 offset:1796 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v50 /*v562*/, s32 offset:1800 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v51 /*v563*/, s32 offset:1804 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v52 /*v564*/, s32 offset:1808 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v53 /*v565*/, s32 offset:1812 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v54 /*v566*/, s32 offset:1816 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v55 /*v567*/, s32 offset:1820 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v56 /*v568*/, s32 offset:1824 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v57 /*v569*/, s32 offset:1828 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v58 /*v570*/, s32 offset:1832 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v59 /*v571*/, s32 offset:1836 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v60 /*v572*/, s32 offset:1840 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v61 /*v573*/, s32 offset:1844 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v62 /*v574*/, s32 offset:1848 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v63 /*v575*/, s32 offset:1852 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v64 /*v576*/, s32 offset:1856 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v65 /*v577*/, s32 offset:1860 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v66 /*v578*/, s32 offset:1864 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v67 /*v579*/, s32 offset:1868 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v68 /*v580*/, s32 offset:1872 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v69 /*v581*/, s32 offset:1876 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v70 /*v582*/, s32 offset:1880 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v71 /*v583*/, s32 offset:1884 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v72 /*v584*/, s32 offset:1888 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v73 /*v585*/, s32 offset:1892 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v74 /*v586*/, s32 offset:1896 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v75 /*v587*/, s32 offset:1900 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v76 /*v588*/, s32 offset:1904 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v77 /*v589*/, s32 offset:1908 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v78 
/*v590*/, s32 offset:1912 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v79 /*v591*/, s32 offset:1916 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v80 /*v592*/, s32 offset:1920 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v81 /*v593*/, s32 offset:1924 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v82 /*v594*/, s32 offset:1928 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v83 /*v595*/, s32 offset:1932 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v84 /*v596*/, s32 offset:1936 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v85 /*v597*/, s32 offset:1940 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v86 /*v598*/, s32 offset:1944 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v87 /*v599*/, s32 offset:1948 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v88 /*v600*/, s32 offset:1952 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v89 /*v601*/, s32 offset:1956 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v90 /*v602*/, s32 offset:1960 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v91 /*v603*/, s32 offset:1964 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v92 /*v604*/, s32 offset:1968 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v93 /*v605*/, s32 offset:1972 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v94 /*v606*/, s32 offset:1976 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v95 /*v607*/, s32 offset:1980 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v96 /*v608*/, s32 offset:1984 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v97 /*v609*/, s32 offset:1988 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v98 /*v610*/, s32 offset:1992 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v99 /*v611*/, s32 offset:1996 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v100 /*v612*/, s32 offset:2000 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v101 /*v613*/, s32 offset:2004 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v102 /*v614*/, s32 offset:2008 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v103 /*v615*/, s32 offset:2012 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v104 /*v616*/, s32 offset:2016 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v105 /*v617*/, s32 offset:2020 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v106 /*v618*/, s32 offset:2024 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v107 /*v619*/, s32 offset:2028 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v108 /*v620*/, s32 offset:2032 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v109 /*v621*/, s32 offset:2036 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v110 /*v622*/, s32 offset:2040 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v111 /*v623*/, s32 offset:2044 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v112 /*v624*/, s32 offset:2048 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v113 /*v625*/, s32 offset:2052 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v114 /*v626*/, s32 offset:2056 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v115 /*v627*/, s32 offset:2060 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v116 /*v628*/, s32 offset:2064 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v117 /*v629*/, s32 offset:2068 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v118 /*v630*/, s32 offset:2072 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v119 /*v631*/, s32 offset:2076 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v120 /*v632*/, s32 offset:2080 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v121 /*v633*/, s32 offset:2084 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v122 /*v634*/, s32 offset:2088 +; GFX1250-DAGISEL-NEXT: 
scratch_store_b32 off, v123 /*v635*/, s32 offset:2092 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v124 /*v636*/, s32 offset:2096 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v125 /*v637*/, s32 offset:2100 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v126 /*v638*/, s32 offset:2104 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v127 /*v639*/, s32 offset:2108 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v128 /*v640*/, s32 offset:2112 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v129 /*v641*/, s32 offset:2116 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v130 /*v642*/, s32 offset:2120 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v131 /*v643*/, s32 offset:2124 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v132 /*v644*/, s32 offset:2128 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v133 /*v645*/, s32 offset:2132 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v134 /*v646*/, s32 offset:2136 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v135 /*v647*/, s32 offset:2140 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v136 /*v648*/, s32 offset:2144 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v137 /*v649*/, s32 offset:2148 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v138 /*v650*/, s32 offset:2152 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v139 /*v651*/, s32 offset:2156 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v140 /*v652*/, s32 offset:2160 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v141 /*v653*/, s32 offset:2164 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v142 /*v654*/, s32 offset:2168 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v143 /*v655*/, s32 offset:2172 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v144 /*v656*/, s32 offset:2176 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v145 /*v657*/, s32 offset:2180 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v146 /*v658*/, s32 offset:2184 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v147 /*v659*/, s32 offset:2188 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v148 /*v660*/, s32 offset:2192 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v149 /*v661*/, s32 offset:2196 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v150 /*v662*/, s32 offset:2200 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v151 /*v663*/, s32 offset:2204 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v152 /*v664*/, s32 offset:2208 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v153 /*v665*/, s32 offset:2212 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v154 /*v666*/, s32 offset:2216 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v155 /*v667*/, s32 offset:2220 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v156 /*v668*/, s32 offset:2224 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v157 /*v669*/, s32 offset:2228 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v158 /*v670*/, s32 offset:2232 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v159 /*v671*/, s32 offset:2236 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v160 /*v672*/, s32 offset:2240 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v161 /*v673*/, s32 offset:2244 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v162 /*v674*/, s32 offset:2248 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v163 /*v675*/, s32 offset:2252 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v164 /*v676*/, s32 offset:2256 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v165 /*v677*/, s32 offset:2260 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v166 /*v678*/, s32 offset:2264 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v167 /*v679*/, 
s32 offset:2268 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v168 /*v680*/, s32 offset:2272 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v169 /*v681*/, s32 offset:2276 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v170 /*v682*/, s32 offset:2280 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v171 /*v683*/, s32 offset:2284 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v172 /*v684*/, s32 offset:2288 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v173 /*v685*/, s32 offset:2292 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v174 /*v686*/, s32 offset:2296 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v175 /*v687*/, s32 offset:2300 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v176 /*v688*/, s32 offset:2304 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v177 /*v689*/, s32 offset:2308 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v178 /*v690*/, s32 offset:2312 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v179 /*v691*/, s32 offset:2316 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v180 /*v692*/, s32 offset:2320 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v181 /*v693*/, s32 offset:2324 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v182 /*v694*/, s32 offset:2328 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v183 /*v695*/, s32 offset:2332 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v184 /*v696*/, s32 offset:2336 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v185 /*v697*/, s32 offset:2340 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v186 /*v698*/, s32 offset:2344 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v187 /*v699*/, s32 offset:2348 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v188 /*v700*/, s32 offset:2352 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v189 /*v701*/, s32 offset:2356 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v190 /*v702*/, s32 offset:2360 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v191 /*v703*/, s32 offset:2364 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v192 /*v704*/, s32 offset:2368 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v193 /*v705*/, s32 offset:2372 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v194 /*v706*/, s32 offset:2376 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v195 /*v707*/, s32 offset:2380 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v196 /*v708*/, s32 offset:2384 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v197 /*v709*/, s32 offset:2388 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v198 /*v710*/, s32 offset:2392 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v199 /*v711*/, s32 offset:2396 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v200 /*v712*/, s32 offset:2400 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v201 /*v713*/, s32 offset:2404 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v202 /*v714*/, s32 offset:2408 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v203 /*v715*/, s32 offset:2412 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v204 /*v716*/, s32 offset:2416 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v205 /*v717*/, s32 offset:2420 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v206 /*v718*/, s32 offset:2424 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v207 /*v719*/, s32 offset:2428 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v208 /*v720*/, s32 offset:2432 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v209 /*v721*/, s32 offset:2436 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v210 /*v722*/, s32 offset:2440 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v211 /*v723*/, s32 offset:2444 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v212 /*v724*/, 
s32 offset:2448 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v213 /*v725*/, s32 offset:2452 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v214 /*v726*/, s32 offset:2456 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v215 /*v727*/, s32 offset:2460 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v216 /*v728*/, s32 offset:2464 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v217 /*v729*/, s32 offset:2468 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v218 /*v730*/, s32 offset:2472 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v219 /*v731*/, s32 offset:2476 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v220 /*v732*/, s32 offset:2480 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v221 /*v733*/, s32 offset:2484 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v222 /*v734*/, s32 offset:2488 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v223 /*v735*/, s32 offset:2492 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v224 /*v736*/, s32 offset:2496 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v225 /*v737*/, s32 offset:2500 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v226 /*v738*/, s32 offset:2504 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v227 /*v739*/, s32 offset:2508 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v228 /*v740*/, s32 offset:2512 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v229 /*v741*/, s32 offset:2516 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v230 /*v742*/, s32 offset:2520 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v231 /*v743*/, s32 offset:2524 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v232 /*v744*/, s32 offset:2528 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v233 /*v745*/, s32 offset:2532 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v234 /*v746*/, s32 offset:2536 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v235 /*v747*/, s32 offset:2540 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v236 /*v748*/, s32 offset:2544 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v237 /*v749*/, s32 offset:2548 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v238 /*v750*/, s32 offset:2552 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v239 /*v751*/, s32 offset:2556 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v240 /*v752*/, s32 offset:2560 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v241 /*v753*/, s32 offset:2564 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v242 /*v754*/, s32 offset:2568 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v243 /*v755*/, s32 offset:2572 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v244 /*v756*/, s32 offset:2576 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v245 /*v757*/, s32 offset:2580 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v246 /*v758*/, s32 offset:2584 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v247 /*v759*/, s32 offset:2588 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v248 /*v760*/, s32 offset:2592 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v249 /*v761*/, s32 offset:2596 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v250 /*v762*/, s32 offset:2600 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v251 /*v763*/, s32 offset:2604 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v252 /*v764*/, s32 offset:2608 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v765*/, s32 offset:2612 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v766*/, s32 offset:2616 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v767*/, s32 offset:2620 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 12 ; msbs: dst=0 src0=0 src1=3 src2=0 +; GFX1250-DAGISEL-NEXT: 
scratch_store_b32 off, v0 /*v768*/, s32 offset:2624 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v1 /*v769*/, s32 offset:2628 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v2 /*v770*/, s32 offset:2632 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v3 /*v771*/, s32 offset:2636 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v4 /*v772*/, s32 offset:2640 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v5 /*v773*/, s32 offset:2644 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v6 /*v774*/, s32 offset:2648 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v7 /*v775*/, s32 offset:2652 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v8 /*v776*/, s32 offset:2656 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v9 /*v777*/, s32 offset:2660 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v10 /*v778*/, s32 offset:2664 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v11 /*v779*/, s32 offset:2668 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v12 /*v780*/, s32 offset:2672 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v13 /*v781*/, s32 offset:2676 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v14 /*v782*/, s32 offset:2680 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v15 /*v783*/, s32 offset:2684 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v16 /*v784*/, s32 offset:2688 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v17 /*v785*/, s32 offset:2692 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v18 /*v786*/, s32 offset:2696 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v19 /*v787*/, s32 offset:2700 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v20 /*v788*/, s32 offset:2704 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v21 /*v789*/, s32 offset:2708 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v22 /*v790*/, s32 offset:2712 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v23 /*v791*/, s32 offset:2716 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v24 /*v792*/, s32 offset:2720 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v25 /*v793*/, s32 offset:2724 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v26 /*v794*/, s32 offset:2728 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v27 /*v795*/, s32 offset:2732 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v28 /*v796*/, s32 offset:2736 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v29 /*v797*/, s32 offset:2740 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v30 /*v798*/, s32 offset:2744 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v31 /*v799*/, s32 offset:2748 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v32 /*v800*/, s32 offset:2752 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v33 /*v801*/, s32 offset:2756 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v34 /*v802*/, s32 offset:2760 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v35 /*v803*/, s32 offset:2764 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v36 /*v804*/, s32 offset:2768 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v37 /*v805*/, s32 offset:2772 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v38 /*v806*/, s32 offset:2776 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v39 /*v807*/, s32 offset:2780 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v40 /*v808*/, s32 offset:2784 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v41 /*v809*/, s32 offset:2788 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v42 /*v810*/, s32 offset:2792 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v43 /*v811*/, s32 offset:2796 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v44 /*v812*/, s32 offset:2800 +; GFX1250-DAGISEL-NEXT: 
scratch_store_b32 off, v45 /*v813*/, s32 offset:2804 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v46 /*v814*/, s32 offset:2808 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v47 /*v815*/, s32 offset:2812 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v48 /*v816*/, s32 offset:2816 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v49 /*v817*/, s32 offset:2820 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v50 /*v818*/, s32 offset:2824 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v51 /*v819*/, s32 offset:2828 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v52 /*v820*/, s32 offset:2832 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v53 /*v821*/, s32 offset:2836 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v54 /*v822*/, s32 offset:2840 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v55 /*v823*/, s32 offset:2844 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v56 /*v824*/, s32 offset:2848 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v57 /*v825*/, s32 offset:2852 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v58 /*v826*/, s32 offset:2856 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v59 /*v827*/, s32 offset:2860 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v60 /*v828*/, s32 offset:2864 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v61 /*v829*/, s32 offset:2868 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v62 /*v830*/, s32 offset:2872 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v63 /*v831*/, s32 offset:2876 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v64 /*v832*/, s32 offset:2880 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v65 /*v833*/, s32 offset:2884 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v66 /*v834*/, s32 offset:2888 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v67 /*v835*/, s32 offset:2892 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v68 /*v836*/, s32 offset:2896 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v69 /*v837*/, s32 offset:2900 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v70 /*v838*/, s32 offset:2904 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v71 /*v839*/, s32 offset:2908 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v72 /*v840*/, s32 offset:2912 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v73 /*v841*/, s32 offset:2916 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v74 /*v842*/, s32 offset:2920 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v75 /*v843*/, s32 offset:2924 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v76 /*v844*/, s32 offset:2928 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v77 /*v845*/, s32 offset:2932 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v78 /*v846*/, s32 offset:2936 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v79 /*v847*/, s32 offset:2940 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v80 /*v848*/, s32 offset:2944 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v81 /*v849*/, s32 offset:2948 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v82 /*v850*/, s32 offset:2952 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v83 /*v851*/, s32 offset:2956 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v84 /*v852*/, s32 offset:2960 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v85 /*v853*/, s32 offset:2964 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v86 /*v854*/, s32 offset:2968 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v87 /*v855*/, s32 offset:2972 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v88 /*v856*/, s32 offset:2976 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v89 /*v857*/, s32 offset:2980 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v90 /*v858*/, s32 
offset:2984 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v91 /*v859*/, s32 offset:2988 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v92 /*v860*/, s32 offset:2992 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v93 /*v861*/, s32 offset:2996 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v94 /*v862*/, s32 offset:3000 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v95 /*v863*/, s32 offset:3004 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v96 /*v864*/, s32 offset:3008 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v97 /*v865*/, s32 offset:3012 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v98 /*v866*/, s32 offset:3016 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v99 /*v867*/, s32 offset:3020 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v100 /*v868*/, s32 offset:3024 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v101 /*v869*/, s32 offset:3028 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v102 /*v870*/, s32 offset:3032 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v103 /*v871*/, s32 offset:3036 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v104 /*v872*/, s32 offset:3040 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v105 /*v873*/, s32 offset:3044 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v106 /*v874*/, s32 offset:3048 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v107 /*v875*/, s32 offset:3052 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v108 /*v876*/, s32 offset:3056 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v109 /*v877*/, s32 offset:3060 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v110 /*v878*/, s32 offset:3064 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v111 /*v879*/, s32 offset:3068 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v112 /*v880*/, s32 offset:3072 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v113 /*v881*/, s32 offset:3076 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v114 /*v882*/, s32 offset:3080 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v115 /*v883*/, s32 offset:3084 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v116 /*v884*/, s32 offset:3088 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v117 /*v885*/, s32 offset:3092 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v118 /*v886*/, s32 offset:3096 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v119 /*v887*/, s32 offset:3100 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v120 /*v888*/, s32 offset:3104 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v121 /*v889*/, s32 offset:3108 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v122 /*v890*/, s32 offset:3112 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v123 /*v891*/, s32 offset:3116 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v124 /*v892*/, s32 offset:3120 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v125 /*v893*/, s32 offset:3124 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v126 /*v894*/, s32 offset:3128 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v127 /*v895*/, s32 offset:3132 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v128 /*v896*/, s32 offset:3136 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v129 /*v897*/, s32 offset:3140 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v130 /*v898*/, s32 offset:3144 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v131 /*v899*/, s32 offset:3148 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v132 /*v900*/, s32 offset:3152 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v133 /*v901*/, s32 offset:3156 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v134 /*v902*/, s32 offset:3160 +; GFX1250-DAGISEL-NEXT: 
scratch_store_b32 off, v135 /*v903*/, s32 offset:3164 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v136 /*v904*/, s32 offset:3168 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v137 /*v905*/, s32 offset:3172 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v138 /*v906*/, s32 offset:3176 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v139 /*v907*/, s32 offset:3180 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v140 /*v908*/, s32 offset:3184 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v141 /*v909*/, s32 offset:3188 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v142 /*v910*/, s32 offset:3192 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v143 /*v911*/, s32 offset:3196 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v144 /*v912*/, s32 offset:3200 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v145 /*v913*/, s32 offset:3204 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v146 /*v914*/, s32 offset:3208 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v147 /*v915*/, s32 offset:3212 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v148 /*v916*/, s32 offset:3216 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v149 /*v917*/, s32 offset:3220 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v150 /*v918*/, s32 offset:3224 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v151 /*v919*/, s32 offset:3228 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v152 /*v920*/, s32 offset:3232 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v153 /*v921*/, s32 offset:3236 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v154 /*v922*/, s32 offset:3240 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v155 /*v923*/, s32 offset:3244 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v156 /*v924*/, s32 offset:3248 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v157 /*v925*/, s32 offset:3252 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v158 /*v926*/, s32 offset:3256 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v159 /*v927*/, s32 offset:3260 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v160 /*v928*/, s32 offset:3264 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v161 /*v929*/, s32 offset:3268 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v162 /*v930*/, s32 offset:3272 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v163 /*v931*/, s32 offset:3276 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v164 /*v932*/, s32 offset:3280 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v165 /*v933*/, s32 offset:3284 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v166 /*v934*/, s32 offset:3288 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v167 /*v935*/, s32 offset:3292 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v168 /*v936*/, s32 offset:3296 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v169 /*v937*/, s32 offset:3300 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v170 /*v938*/, s32 offset:3304 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v171 /*v939*/, s32 offset:3308 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v172 /*v940*/, s32 offset:3312 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v173 /*v941*/, s32 offset:3316 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v174 /*v942*/, s32 offset:3320 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v175 /*v943*/, s32 offset:3324 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v176 /*v944*/, s32 offset:3328 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v177 /*v945*/, s32 offset:3332 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v178 /*v946*/, s32 offset:3336 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v179 /*v947*/, 
s32 offset:3340 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v180 /*v948*/, s32 offset:3344 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v181 /*v949*/, s32 offset:3348 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v182 /*v950*/, s32 offset:3352 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v183 /*v951*/, s32 offset:3356 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v184 /*v952*/, s32 offset:3360 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v185 /*v953*/, s32 offset:3364 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v186 /*v954*/, s32 offset:3368 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v187 /*v955*/, s32 offset:3372 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v188 /*v956*/, s32 offset:3376 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v189 /*v957*/, s32 offset:3380 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v190 /*v958*/, s32 offset:3384 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v191 /*v959*/, s32 offset:3388 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v192 /*v960*/, s32 offset:3392 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v193 /*v961*/, s32 offset:3396 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v194 /*v962*/, s32 offset:3400 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v195 /*v963*/, s32 offset:3404 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v196 /*v964*/, s32 offset:3408 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v197 /*v965*/, s32 offset:3412 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v198 /*v966*/, s32 offset:3416 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v199 /*v967*/, s32 offset:3420 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v200 /*v968*/, s32 offset:3424 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v201 /*v969*/, s32 offset:3428 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v202 /*v970*/, s32 offset:3432 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v203 /*v971*/, s32 offset:3436 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v204 /*v972*/, s32 offset:3440 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v205 /*v973*/, s32 offset:3444 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v206 /*v974*/, s32 offset:3448 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v207 /*v975*/, s32 offset:3452 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v208 /*v976*/, s32 offset:3456 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v209 /*v977*/, s32 offset:3460 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v210 /*v978*/, s32 offset:3464 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v211 /*v979*/, s32 offset:3468 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v212 /*v980*/, s32 offset:3472 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v213 /*v981*/, s32 offset:3476 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v214 /*v982*/, s32 offset:3480 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v215 /*v983*/, s32 offset:3484 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v216 /*v984*/, s32 offset:3488 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v217 /*v985*/, s32 offset:3492 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v218 /*v986*/, s32 offset:3496 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v219 /*v987*/, s32 offset:3500 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v220 /*v988*/, s32 offset:3504 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v221 /*v989*/, s32 offset:3508 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v222 /*v990*/, s32 offset:3512 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v223 /*v991*/, s32 offset:3516 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v224 /*v992*/, 
s32 offset:3520 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v225 /*v993*/, s32 offset:3524 +; GFX1250-DAGISEL-NEXT: s_clause 0x1d +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v226 /*v994*/, s32 offset:3528 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v227 /*v995*/, s32 offset:3532 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v228 /*v996*/, s32 offset:3536 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v229 /*v997*/, s32 offset:3540 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v230 /*v998*/, s32 offset:3544 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v231 /*v999*/, s32 offset:3548 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v232 /*v1000*/, s32 offset:3552 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v233 /*v1001*/, s32 offset:3556 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v234 /*v1002*/, s32 offset:3560 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v235 /*v1003*/, s32 offset:3564 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v236 /*v1004*/, s32 offset:3568 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v237 /*v1005*/, s32 offset:3572 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v238 /*v1006*/, s32 offset:3576 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v239 /*v1007*/, s32 offset:3580 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v240 /*v1008*/, s32 offset:3584 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v241 /*v1009*/, s32 offset:3588 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v242 /*v1010*/, s32 offset:3592 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v243 /*v1011*/, s32 offset:3596 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v244 /*v1012*/, s32 offset:3600 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v245 /*v1013*/, s32 offset:3604 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v246 /*v1014*/, s32 offset:3608 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v247 /*v1015*/, s32 offset:3612 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v248 /*v1016*/, s32 offset:3616 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v249 /*v1017*/, s32 offset:3620 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v250 /*v1018*/, s32 offset:3624 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v251 /*v1019*/, s32 offset:3628 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v252 /*v1020*/, s32 offset:3632 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v253 /*v1021*/, s32 offset:3636 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v254 /*v1022*/, s32 offset:3640 +; GFX1250-DAGISEL-NEXT: scratch_store_b32 off, v255 /*v1023*/, s32 offset:3644 +; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, -1 +; GFX1250-DAGISEL-NEXT: v_mov_b32_e32 v2, v0 +; GFX1250-DAGISEL-NEXT: s_mov_b64 s[36:37], gfx_callee@abs64 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: v_swap_b32 v0, v1 +; GFX1250-DAGISEL-NEXT: s_xor_b32 exec_lo, s0, -1 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0, off, s32 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1, off, s32 offset:4 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2, off, s32 offset:8 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v3, off, s32 offset:12 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v4, off, s32 offset:16 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v5, off, s32 offset:20 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v6, off, s32 offset:24 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v7, off, s32 offset:28 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v8, off, s32 offset:32 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v9, 
off, s32 offset:36 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v10, off, s32 offset:40 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v11, off, s32 offset:44 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v12, off, s32 offset:48 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v13, off, s32 offset:52 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v14, off, s32 offset:56 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v15, off, s32 offset:60 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v16, off, s32 offset:64 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v17, off, s32 offset:68 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v18, off, s32 offset:72 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v19, off, s32 offset:76 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v20, off, s32 offset:80 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v21, off, s32 offset:84 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v22, off, s32 offset:88 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v23, off, s32 offset:92 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v24, off, s32 offset:96 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v25, off, s32 offset:100 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v26, off, s32 offset:104 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v27, off, s32 offset:108 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v28, off, s32 offset:112 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v29, off, s32 offset:116 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v30, off, s32 offset:120 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v31, off, s32 offset:124 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v32, off, s32 offset:128 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v33, off, s32 offset:132 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v34, off, s32 offset:136 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v35, off, s32 offset:140 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v36, off, s32 offset:144 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v37, off, s32 offset:148 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v38, off, s32 offset:152 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v39, off, s32 offset:156 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v48, off, s32 offset:160 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v49, off, s32 offset:164 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v50, off, s32 offset:168 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v51, off, s32 offset:172 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v52, off, s32 offset:176 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v53, off, s32 offset:180 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v54, off, s32 offset:184 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v55, off, s32 offset:188 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v64, off, s32 offset:192 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v65, off, s32 offset:196 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v66, off, s32 offset:200 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v67, off, s32 offset:204 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v68, off, s32 offset:208 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v69, off, s32 offset:212 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v70, off, s32 offset:216 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v71, off, s32 offset:220 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v80, off, s32 offset:224 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v81, off, s32 offset:228 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v82, off, s32 offset:232 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v83, off, s32 offset:236 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v84, off, s32 offset:240 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v85, off, s32 offset:244 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v86, off, s32 offset:248 
+; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v87, off, s32 offset:252 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v96, off, s32 offset:256 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v97, off, s32 offset:260 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v98, off, s32 offset:264 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v99, off, s32 offset:268 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v100, off, s32 offset:272 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v101, off, s32 offset:276 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v102, off, s32 offset:280 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v103, off, s32 offset:284 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v112, off, s32 offset:288 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v113, off, s32 offset:292 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v114, off, s32 offset:296 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v115, off, s32 offset:300 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v116, off, s32 offset:304 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v117, off, s32 offset:308 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v118, off, s32 offset:312 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v119, off, s32 offset:316 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v128, off, s32 offset:320 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v129, off, s32 offset:324 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v130, off, s32 offset:328 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v131, off, s32 offset:332 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v132, off, s32 offset:336 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v133, off, s32 offset:340 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v134, off, s32 offset:344 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v135, off, s32 offset:348 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v144, off, s32 offset:352 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v145, off, s32 offset:356 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v146, off, s32 offset:360 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v147, off, s32 offset:364 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v148, off, s32 offset:368 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v149, off, s32 offset:372 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v150, off, s32 offset:376 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v151, off, s32 offset:380 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v160, off, s32 offset:384 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v161, off, s32 offset:388 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v162, off, s32 offset:392 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v163, off, s32 offset:396 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v164, off, s32 offset:400 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v165, off, s32 offset:404 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v166, off, s32 offset:408 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v167, off, s32 offset:412 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v176, off, s32 offset:416 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v177, off, s32 offset:420 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v178, off, s32 offset:424 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v179, off, s32 offset:428 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v180, off, s32 offset:432 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v181, off, s32 offset:436 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v182, off, s32 offset:440 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v183, off, s32 offset:444 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v192, off, s32 offset:448 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v193, off, s32 offset:452 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v194, off, 
s32 offset:456 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v195, off, s32 offset:460 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v196, off, s32 offset:464 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v197, off, s32 offset:468 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v198, off, s32 offset:472 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v199, off, s32 offset:476 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v208, off, s32 offset:480 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v209, off, s32 offset:484 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v210, off, s32 offset:488 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v211, off, s32 offset:492 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v212, off, s32 offset:496 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v213, off, s32 offset:500 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v214, off, s32 offset:504 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v215, off, s32 offset:508 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v224, off, s32 offset:512 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v225, off, s32 offset:516 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v226, off, s32 offset:520 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v227, off, s32 offset:524 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v228, off, s32 offset:528 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v229, off, s32 offset:532 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v230, off, s32 offset:536 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v231, off, s32 offset:540 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v240, off, s32 offset:544 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v241, off, s32 offset:548 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v242, off, s32 offset:552 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v243, off, s32 offset:556 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v244, off, s32 offset:560 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v245, off, s32 offset:564 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v246, off, s32 offset:568 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v247, off, s32 offset:572 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 64 ; msbs: dst=1 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v256*/, off, s32 offset:576 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v257*/, off, s32 offset:580 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v258*/, off, s32 offset:584 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v3 /*v259*/, off, s32 offset:588 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v4 /*v260*/, off, s32 offset:592 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v5 /*v261*/, off, s32 offset:596 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v6 /*v262*/, off, s32 offset:600 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v7 /*v263*/, off, s32 offset:604 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v8 /*v264*/, off, s32 offset:608 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v9 /*v265*/, off, s32 offset:612 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v10 /*v266*/, off, s32 offset:616 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v11 /*v267*/, off, s32 offset:620 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v12 /*v268*/, off, s32 offset:624 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v13 /*v269*/, off, s32 offset:628 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v14 /*v270*/, off, s32 offset:632 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v15 /*v271*/, off, s32 offset:636 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v16 /*v272*/, off, s32 offset:640 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v17 /*v273*/, off, s32 offset:644 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v18 /*v274*/, off, s32 offset:648 +; 
GFX1250-DAGISEL-NEXT: scratch_load_b32 v19 /*v275*/, off, s32 offset:652 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v20 /*v276*/, off, s32 offset:656 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v21 /*v277*/, off, s32 offset:660 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v22 /*v278*/, off, s32 offset:664 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v23 /*v279*/, off, s32 offset:668 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v24 /*v280*/, off, s32 offset:672 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v25 /*v281*/, off, s32 offset:676 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v26 /*v282*/, off, s32 offset:680 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v27 /*v283*/, off, s32 offset:684 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v28 /*v284*/, off, s32 offset:688 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v29 /*v285*/, off, s32 offset:692 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v30 /*v286*/, off, s32 offset:696 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v31 /*v287*/, off, s32 offset:700 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v32 /*v288*/, off, s32 offset:704 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v33 /*v289*/, off, s32 offset:708 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v34 /*v290*/, off, s32 offset:712 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v35 /*v291*/, off, s32 offset:716 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v36 /*v292*/, off, s32 offset:720 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v37 /*v293*/, off, s32 offset:724 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v38 /*v294*/, off, s32 offset:728 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v39 /*v295*/, off, s32 offset:732 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v40 /*v296*/, off, s32 offset:736 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v41 /*v297*/, off, s32 offset:740 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v42 /*v298*/, off, s32 offset:744 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v43 /*v299*/, off, s32 offset:748 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v44 /*v300*/, off, s32 offset:752 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v45 /*v301*/, off, s32 offset:756 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v46 /*v302*/, off, s32 offset:760 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v47 /*v303*/, off, s32 offset:764 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v48 /*v304*/, off, s32 offset:768 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v49 /*v305*/, off, s32 offset:772 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v50 /*v306*/, off, s32 offset:776 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v51 /*v307*/, off, s32 offset:780 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v52 /*v308*/, off, s32 offset:784 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v53 /*v309*/, off, s32 offset:788 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v54 /*v310*/, off, s32 offset:792 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v55 /*v311*/, off, s32 offset:796 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v56 /*v312*/, off, s32 offset:800 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v57 /*v313*/, off, s32 offset:804 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v58 /*v314*/, off, s32 offset:808 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v59 /*v315*/, off, s32 offset:812 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v60 /*v316*/, off, s32 offset:816 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v61 /*v317*/, off, s32 offset:820 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v62 /*v318*/, off, s32 offset:824 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v63 /*v319*/, off, s32 offset:828 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v64 /*v320*/, off, s32 offset:832 +; 
GFX1250-DAGISEL-NEXT: scratch_load_b32 v65 /*v321*/, off, s32 offset:836 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v66 /*v322*/, off, s32 offset:840 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v67 /*v323*/, off, s32 offset:844 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v68 /*v324*/, off, s32 offset:848 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v69 /*v325*/, off, s32 offset:852 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v70 /*v326*/, off, s32 offset:856 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v71 /*v327*/, off, s32 offset:860 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v72 /*v328*/, off, s32 offset:864 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v73 /*v329*/, off, s32 offset:868 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v74 /*v330*/, off, s32 offset:872 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v75 /*v331*/, off, s32 offset:876 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v76 /*v332*/, off, s32 offset:880 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v77 /*v333*/, off, s32 offset:884 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v78 /*v334*/, off, s32 offset:888 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v79 /*v335*/, off, s32 offset:892 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v80 /*v336*/, off, s32 offset:896 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v81 /*v337*/, off, s32 offset:900 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v82 /*v338*/, off, s32 offset:904 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v83 /*v339*/, off, s32 offset:908 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v84 /*v340*/, off, s32 offset:912 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v85 /*v341*/, off, s32 offset:916 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v86 /*v342*/, off, s32 offset:920 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v87 /*v343*/, off, s32 offset:924 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v88 /*v344*/, off, s32 offset:928 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v89 /*v345*/, off, s32 offset:932 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v90 /*v346*/, off, s32 offset:936 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v91 /*v347*/, off, s32 offset:940 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v92 /*v348*/, off, s32 offset:944 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v93 /*v349*/, off, s32 offset:948 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v94 /*v350*/, off, s32 offset:952 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v95 /*v351*/, off, s32 offset:956 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v96 /*v352*/, off, s32 offset:960 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v97 /*v353*/, off, s32 offset:964 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v98 /*v354*/, off, s32 offset:968 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v99 /*v355*/, off, s32 offset:972 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v100 /*v356*/, off, s32 offset:976 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v101 /*v357*/, off, s32 offset:980 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v102 /*v358*/, off, s32 offset:984 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v103 /*v359*/, off, s32 offset:988 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v104 /*v360*/, off, s32 offset:992 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v105 /*v361*/, off, s32 offset:996 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v106 /*v362*/, off, s32 offset:1000 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v107 /*v363*/, off, s32 offset:1004 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v108 /*v364*/, off, s32 offset:1008 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v109 /*v365*/, off, s32 offset:1012 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v110 /*v366*/, off, s32 offset:1016 +; 
GFX1250-DAGISEL-NEXT: scratch_load_b32 v111 /*v367*/, off, s32 offset:1020 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v112 /*v368*/, off, s32 offset:1024 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v113 /*v369*/, off, s32 offset:1028 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v114 /*v370*/, off, s32 offset:1032 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v115 /*v371*/, off, s32 offset:1036 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v116 /*v372*/, off, s32 offset:1040 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v117 /*v373*/, off, s32 offset:1044 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v118 /*v374*/, off, s32 offset:1048 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v119 /*v375*/, off, s32 offset:1052 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v120 /*v376*/, off, s32 offset:1056 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v121 /*v377*/, off, s32 offset:1060 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v122 /*v378*/, off, s32 offset:1064 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v123 /*v379*/, off, s32 offset:1068 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v124 /*v380*/, off, s32 offset:1072 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v125 /*v381*/, off, s32 offset:1076 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v126 /*v382*/, off, s32 offset:1080 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v127 /*v383*/, off, s32 offset:1084 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v128 /*v384*/, off, s32 offset:1088 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v129 /*v385*/, off, s32 offset:1092 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v130 /*v386*/, off, s32 offset:1096 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v131 /*v387*/, off, s32 offset:1100 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v132 /*v388*/, off, s32 offset:1104 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v133 /*v389*/, off, s32 offset:1108 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v134 /*v390*/, off, s32 offset:1112 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v135 /*v391*/, off, s32 offset:1116 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v136 /*v392*/, off, s32 offset:1120 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v137 /*v393*/, off, s32 offset:1124 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v138 /*v394*/, off, s32 offset:1128 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v139 /*v395*/, off, s32 offset:1132 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v140 /*v396*/, off, s32 offset:1136 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v141 /*v397*/, off, s32 offset:1140 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v142 /*v398*/, off, s32 offset:1144 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v143 /*v399*/, off, s32 offset:1148 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v144 /*v400*/, off, s32 offset:1152 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v145 /*v401*/, off, s32 offset:1156 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v146 /*v402*/, off, s32 offset:1160 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v147 /*v403*/, off, s32 offset:1164 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v148 /*v404*/, off, s32 offset:1168 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v149 /*v405*/, off, s32 offset:1172 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v150 /*v406*/, off, s32 offset:1176 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v151 /*v407*/, off, s32 offset:1180 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v152 /*v408*/, off, s32 offset:1184 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v153 /*v409*/, off, s32 offset:1188 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v154 /*v410*/, off, s32 offset:1192 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v155 /*v411*/, off, s32 offset:1196 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v156 
/*v412*/, off, s32 offset:1200 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v157 /*v413*/, off, s32 offset:1204 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v158 /*v414*/, off, s32 offset:1208 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v159 /*v415*/, off, s32 offset:1212 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v160 /*v416*/, off, s32 offset:1216 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v161 /*v417*/, off, s32 offset:1220 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v162 /*v418*/, off, s32 offset:1224 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v163 /*v419*/, off, s32 offset:1228 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v164 /*v420*/, off, s32 offset:1232 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v165 /*v421*/, off, s32 offset:1236 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v166 /*v422*/, off, s32 offset:1240 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v167 /*v423*/, off, s32 offset:1244 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v168 /*v424*/, off, s32 offset:1248 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v169 /*v425*/, off, s32 offset:1252 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v170 /*v426*/, off, s32 offset:1256 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v171 /*v427*/, off, s32 offset:1260 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v172 /*v428*/, off, s32 offset:1264 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v173 /*v429*/, off, s32 offset:1268 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v174 /*v430*/, off, s32 offset:1272 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v175 /*v431*/, off, s32 offset:1276 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v176 /*v432*/, off, s32 offset:1280 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v177 /*v433*/, off, s32 offset:1284 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v178 /*v434*/, off, s32 offset:1288 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v179 /*v435*/, off, s32 offset:1292 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v180 /*v436*/, off, s32 offset:1296 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v181 /*v437*/, off, s32 offset:1300 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v182 /*v438*/, off, s32 offset:1304 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v183 /*v439*/, off, s32 offset:1308 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v184 /*v440*/, off, s32 offset:1312 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v185 /*v441*/, off, s32 offset:1316 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v186 /*v442*/, off, s32 offset:1320 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v187 /*v443*/, off, s32 offset:1324 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v188 /*v444*/, off, s32 offset:1328 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v189 /*v445*/, off, s32 offset:1332 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v190 /*v446*/, off, s32 offset:1336 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v191 /*v447*/, off, s32 offset:1340 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v192 /*v448*/, off, s32 offset:1344 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v193 /*v449*/, off, s32 offset:1348 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v194 /*v450*/, off, s32 offset:1352 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v195 /*v451*/, off, s32 offset:1356 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v196 /*v452*/, off, s32 offset:1360 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v197 /*v453*/, off, s32 offset:1364 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v198 /*v454*/, off, s32 offset:1368 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v199 /*v455*/, off, s32 offset:1372 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v200 /*v456*/, off, s32 offset:1376 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v201 
/*v457*/, off, s32 offset:1380 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v202 /*v458*/, off, s32 offset:1384 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v203 /*v459*/, off, s32 offset:1388 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v204 /*v460*/, off, s32 offset:1392 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v205 /*v461*/, off, s32 offset:1396 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v206 /*v462*/, off, s32 offset:1400 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v207 /*v463*/, off, s32 offset:1404 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v208 /*v464*/, off, s32 offset:1408 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v209 /*v465*/, off, s32 offset:1412 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v210 /*v466*/, off, s32 offset:1416 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v211 /*v467*/, off, s32 offset:1420 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v212 /*v468*/, off, s32 offset:1424 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v213 /*v469*/, off, s32 offset:1428 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v214 /*v470*/, off, s32 offset:1432 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v215 /*v471*/, off, s32 offset:1436 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v216 /*v472*/, off, s32 offset:1440 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v217 /*v473*/, off, s32 offset:1444 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v218 /*v474*/, off, s32 offset:1448 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v219 /*v475*/, off, s32 offset:1452 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v220 /*v476*/, off, s32 offset:1456 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v221 /*v477*/, off, s32 offset:1460 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v222 /*v478*/, off, s32 offset:1464 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v223 /*v479*/, off, s32 offset:1468 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v224 /*v480*/, off, s32 offset:1472 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v225 /*v481*/, off, s32 offset:1476 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v226 /*v482*/, off, s32 offset:1480 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v227 /*v483*/, off, s32 offset:1484 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v228 /*v484*/, off, s32 offset:1488 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v229 /*v485*/, off, s32 offset:1492 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v230 /*v486*/, off, s32 offset:1496 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v231 /*v487*/, off, s32 offset:1500 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v232 /*v488*/, off, s32 offset:1504 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v233 /*v489*/, off, s32 offset:1508 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v234 /*v490*/, off, s32 offset:1512 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v235 /*v491*/, off, s32 offset:1516 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v236 /*v492*/, off, s32 offset:1520 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v237 /*v493*/, off, s32 offset:1524 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v238 /*v494*/, off, s32 offset:1528 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v239 /*v495*/, off, s32 offset:1532 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v240 /*v496*/, off, s32 offset:1536 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v241 /*v497*/, off, s32 offset:1540 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v242 /*v498*/, off, s32 offset:1544 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v243 /*v499*/, off, s32 offset:1548 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v244 /*v500*/, off, s32 offset:1552 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v245 /*v501*/, off, s32 offset:1556 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v246 
/*v502*/, off, s32 offset:1560 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v247 /*v503*/, off, s32 offset:1564 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v248 /*v504*/, off, s32 offset:1568 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v249 /*v505*/, off, s32 offset:1572 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v250 /*v506*/, off, s32 offset:1576 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v251 /*v507*/, off, s32 offset:1580 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v252 /*v508*/, off, s32 offset:1584 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v509*/, off, s32 offset:1588 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v510*/, off, s32 offset:1592 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v511*/, off, s32 offset:1596 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0x80 ; msbs: dst=2 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v512*/, off, s32 offset:1600 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v513*/, off, s32 offset:1604 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v514*/, off, s32 offset:1608 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v3 /*v515*/, off, s32 offset:1612 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v4 /*v516*/, off, s32 offset:1616 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v5 /*v517*/, off, s32 offset:1620 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v6 /*v518*/, off, s32 offset:1624 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v7 /*v519*/, off, s32 offset:1628 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v8 /*v520*/, off, s32 offset:1632 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v9 /*v521*/, off, s32 offset:1636 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v10 /*v522*/, off, s32 offset:1640 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v11 /*v523*/, off, s32 offset:1644 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v12 /*v524*/, off, s32 offset:1648 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v13 /*v525*/, off, s32 offset:1652 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v14 /*v526*/, off, s32 offset:1656 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v15 /*v527*/, off, s32 offset:1660 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v16 /*v528*/, off, s32 offset:1664 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v17 /*v529*/, off, s32 offset:1668 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v18 /*v530*/, off, s32 offset:1672 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v19 /*v531*/, off, s32 offset:1676 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v20 /*v532*/, off, s32 offset:1680 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v21 /*v533*/, off, s32 offset:1684 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v22 /*v534*/, off, s32 offset:1688 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v23 /*v535*/, off, s32 offset:1692 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v24 /*v536*/, off, s32 offset:1696 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v25 /*v537*/, off, s32 offset:1700 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v26 /*v538*/, off, s32 offset:1704 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v27 /*v539*/, off, s32 offset:1708 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v28 /*v540*/, off, s32 offset:1712 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v29 /*v541*/, off, s32 offset:1716 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v30 /*v542*/, off, s32 offset:1720 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v31 /*v543*/, off, s32 offset:1724 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v32 /*v544*/, off, s32 offset:1728 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v33 /*v545*/, off, s32 offset:1732 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v34 /*v546*/, off, s32 offset:1736 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v35 /*v547*/, 
off, s32 offset:1740 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v36 /*v548*/, off, s32 offset:1744 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v37 /*v549*/, off, s32 offset:1748 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v38 /*v550*/, off, s32 offset:1752 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v39 /*v551*/, off, s32 offset:1756 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v40 /*v552*/, off, s32 offset:1760 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v41 /*v553*/, off, s32 offset:1764 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v42 /*v554*/, off, s32 offset:1768 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v43 /*v555*/, off, s32 offset:1772 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v44 /*v556*/, off, s32 offset:1776 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v45 /*v557*/, off, s32 offset:1780 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v46 /*v558*/, off, s32 offset:1784 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v47 /*v559*/, off, s32 offset:1788 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v48 /*v560*/, off, s32 offset:1792 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v49 /*v561*/, off, s32 offset:1796 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v50 /*v562*/, off, s32 offset:1800 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v51 /*v563*/, off, s32 offset:1804 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v52 /*v564*/, off, s32 offset:1808 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v53 /*v565*/, off, s32 offset:1812 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v54 /*v566*/, off, s32 offset:1816 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v55 /*v567*/, off, s32 offset:1820 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v56 /*v568*/, off, s32 offset:1824 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v57 /*v569*/, off, s32 offset:1828 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v58 /*v570*/, off, s32 offset:1832 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v59 /*v571*/, off, s32 offset:1836 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v60 /*v572*/, off, s32 offset:1840 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v61 /*v573*/, off, s32 offset:1844 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v62 /*v574*/, off, s32 offset:1848 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v63 /*v575*/, off, s32 offset:1852 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v64 /*v576*/, off, s32 offset:1856 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v65 /*v577*/, off, s32 offset:1860 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v66 /*v578*/, off, s32 offset:1864 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v67 /*v579*/, off, s32 offset:1868 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v68 /*v580*/, off, s32 offset:1872 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v69 /*v581*/, off, s32 offset:1876 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v70 /*v582*/, off, s32 offset:1880 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v71 /*v583*/, off, s32 offset:1884 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v72 /*v584*/, off, s32 offset:1888 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v73 /*v585*/, off, s32 offset:1892 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v74 /*v586*/, off, s32 offset:1896 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v75 /*v587*/, off, s32 offset:1900 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v76 /*v588*/, off, s32 offset:1904 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v77 /*v589*/, off, s32 offset:1908 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v78 /*v590*/, off, s32 offset:1912 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v79 /*v591*/, off, s32 offset:1916 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v80 /*v592*/, off, s32 offset:1920 +; GFX1250-DAGISEL-NEXT: 
scratch_load_b32 v81 /*v593*/, off, s32 offset:1924 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v82 /*v594*/, off, s32 offset:1928 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v83 /*v595*/, off, s32 offset:1932 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v84 /*v596*/, off, s32 offset:1936 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v85 /*v597*/, off, s32 offset:1940 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v86 /*v598*/, off, s32 offset:1944 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v87 /*v599*/, off, s32 offset:1948 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v88 /*v600*/, off, s32 offset:1952 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v89 /*v601*/, off, s32 offset:1956 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v90 /*v602*/, off, s32 offset:1960 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v91 /*v603*/, off, s32 offset:1964 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v92 /*v604*/, off, s32 offset:1968 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v93 /*v605*/, off, s32 offset:1972 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v94 /*v606*/, off, s32 offset:1976 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v95 /*v607*/, off, s32 offset:1980 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v96 /*v608*/, off, s32 offset:1984 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v97 /*v609*/, off, s32 offset:1988 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v98 /*v610*/, off, s32 offset:1992 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v99 /*v611*/, off, s32 offset:1996 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v100 /*v612*/, off, s32 offset:2000 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v101 /*v613*/, off, s32 offset:2004 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v102 /*v614*/, off, s32 offset:2008 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v103 /*v615*/, off, s32 offset:2012 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v104 /*v616*/, off, s32 offset:2016 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v105 /*v617*/, off, s32 offset:2020 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v106 /*v618*/, off, s32 offset:2024 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v107 /*v619*/, off, s32 offset:2028 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v108 /*v620*/, off, s32 offset:2032 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v109 /*v621*/, off, s32 offset:2036 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v110 /*v622*/, off, s32 offset:2040 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v111 /*v623*/, off, s32 offset:2044 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v112 /*v624*/, off, s32 offset:2048 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v113 /*v625*/, off, s32 offset:2052 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v114 /*v626*/, off, s32 offset:2056 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v115 /*v627*/, off, s32 offset:2060 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v116 /*v628*/, off, s32 offset:2064 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v117 /*v629*/, off, s32 offset:2068 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v118 /*v630*/, off, s32 offset:2072 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v119 /*v631*/, off, s32 offset:2076 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v120 /*v632*/, off, s32 offset:2080 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v121 /*v633*/, off, s32 offset:2084 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v122 /*v634*/, off, s32 offset:2088 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v123 /*v635*/, off, s32 offset:2092 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v124 /*v636*/, off, s32 offset:2096 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v125 /*v637*/, off, s32 offset:2100 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v126 
/*v638*/, off, s32 offset:2104 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v127 /*v639*/, off, s32 offset:2108 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v128 /*v640*/, off, s32 offset:2112 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v129 /*v641*/, off, s32 offset:2116 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v130 /*v642*/, off, s32 offset:2120 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v131 /*v643*/, off, s32 offset:2124 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v132 /*v644*/, off, s32 offset:2128 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v133 /*v645*/, off, s32 offset:2132 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v134 /*v646*/, off, s32 offset:2136 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v135 /*v647*/, off, s32 offset:2140 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v136 /*v648*/, off, s32 offset:2144 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v137 /*v649*/, off, s32 offset:2148 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v138 /*v650*/, off, s32 offset:2152 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v139 /*v651*/, off, s32 offset:2156 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v140 /*v652*/, off, s32 offset:2160 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v141 /*v653*/, off, s32 offset:2164 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v142 /*v654*/, off, s32 offset:2168 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v143 /*v655*/, off, s32 offset:2172 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v144 /*v656*/, off, s32 offset:2176 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v145 /*v657*/, off, s32 offset:2180 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v146 /*v658*/, off, s32 offset:2184 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v147 /*v659*/, off, s32 offset:2188 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v148 /*v660*/, off, s32 offset:2192 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v149 /*v661*/, off, s32 offset:2196 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v150 /*v662*/, off, s32 offset:2200 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v151 /*v663*/, off, s32 offset:2204 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v152 /*v664*/, off, s32 offset:2208 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v153 /*v665*/, off, s32 offset:2212 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v154 /*v666*/, off, s32 offset:2216 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v155 /*v667*/, off, s32 offset:2220 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v156 /*v668*/, off, s32 offset:2224 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v157 /*v669*/, off, s32 offset:2228 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v158 /*v670*/, off, s32 offset:2232 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v159 /*v671*/, off, s32 offset:2236 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v160 /*v672*/, off, s32 offset:2240 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v161 /*v673*/, off, s32 offset:2244 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v162 /*v674*/, off, s32 offset:2248 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v163 /*v675*/, off, s32 offset:2252 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v164 /*v676*/, off, s32 offset:2256 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v165 /*v677*/, off, s32 offset:2260 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v166 /*v678*/, off, s32 offset:2264 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v167 /*v679*/, off, s32 offset:2268 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v168 /*v680*/, off, s32 offset:2272 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v169 /*v681*/, off, s32 offset:2276 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v170 /*v682*/, off, s32 offset:2280 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v171 
/*v683*/, off, s32 offset:2284 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v172 /*v684*/, off, s32 offset:2288 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v173 /*v685*/, off, s32 offset:2292 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v174 /*v686*/, off, s32 offset:2296 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v175 /*v687*/, off, s32 offset:2300 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v176 /*v688*/, off, s32 offset:2304 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v177 /*v689*/, off, s32 offset:2308 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v178 /*v690*/, off, s32 offset:2312 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v179 /*v691*/, off, s32 offset:2316 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v180 /*v692*/, off, s32 offset:2320 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v181 /*v693*/, off, s32 offset:2324 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v182 /*v694*/, off, s32 offset:2328 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v183 /*v695*/, off, s32 offset:2332 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v184 /*v696*/, off, s32 offset:2336 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v185 /*v697*/, off, s32 offset:2340 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v186 /*v698*/, off, s32 offset:2344 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v187 /*v699*/, off, s32 offset:2348 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v188 /*v700*/, off, s32 offset:2352 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v189 /*v701*/, off, s32 offset:2356 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v190 /*v702*/, off, s32 offset:2360 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v191 /*v703*/, off, s32 offset:2364 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v192 /*v704*/, off, s32 offset:2368 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v193 /*v705*/, off, s32 offset:2372 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v194 /*v706*/, off, s32 offset:2376 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v195 /*v707*/, off, s32 offset:2380 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v196 /*v708*/, off, s32 offset:2384 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v197 /*v709*/, off, s32 offset:2388 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v198 /*v710*/, off, s32 offset:2392 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v199 /*v711*/, off, s32 offset:2396 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v200 /*v712*/, off, s32 offset:2400 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v201 /*v713*/, off, s32 offset:2404 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v202 /*v714*/, off, s32 offset:2408 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v203 /*v715*/, off, s32 offset:2412 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v204 /*v716*/, off, s32 offset:2416 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v205 /*v717*/, off, s32 offset:2420 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v206 /*v718*/, off, s32 offset:2424 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v207 /*v719*/, off, s32 offset:2428 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v208 /*v720*/, off, s32 offset:2432 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v209 /*v721*/, off, s32 offset:2436 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v210 /*v722*/, off, s32 offset:2440 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v211 /*v723*/, off, s32 offset:2444 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v212 /*v724*/, off, s32 offset:2448 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v213 /*v725*/, off, s32 offset:2452 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v214 /*v726*/, off, s32 offset:2456 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v215 /*v727*/, off, s32 offset:2460 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v216 /*v728*/, off, s32 offset:2464 +; 
GFX1250-DAGISEL-NEXT: scratch_load_b32 v217 /*v729*/, off, s32 offset:2468 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v218 /*v730*/, off, s32 offset:2472 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v219 /*v731*/, off, s32 offset:2476 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v220 /*v732*/, off, s32 offset:2480 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v221 /*v733*/, off, s32 offset:2484 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v222 /*v734*/, off, s32 offset:2488 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v223 /*v735*/, off, s32 offset:2492 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v224 /*v736*/, off, s32 offset:2496 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v225 /*v737*/, off, s32 offset:2500 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v226 /*v738*/, off, s32 offset:2504 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v227 /*v739*/, off, s32 offset:2508 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v228 /*v740*/, off, s32 offset:2512 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v229 /*v741*/, off, s32 offset:2516 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v230 /*v742*/, off, s32 offset:2520 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v231 /*v743*/, off, s32 offset:2524 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v232 /*v744*/, off, s32 offset:2528 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v233 /*v745*/, off, s32 offset:2532 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v234 /*v746*/, off, s32 offset:2536 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v235 /*v747*/, off, s32 offset:2540 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v236 /*v748*/, off, s32 offset:2544 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v237 /*v749*/, off, s32 offset:2548 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v238 /*v750*/, off, s32 offset:2552 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v239 /*v751*/, off, s32 offset:2556 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v240 /*v752*/, off, s32 offset:2560 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v241 /*v753*/, off, s32 offset:2564 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v242 /*v754*/, off, s32 offset:2568 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v243 /*v755*/, off, s32 offset:2572 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v244 /*v756*/, off, s32 offset:2576 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v245 /*v757*/, off, s32 offset:2580 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v246 /*v758*/, off, s32 offset:2584 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v247 /*v759*/, off, s32 offset:2588 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v248 /*v760*/, off, s32 offset:2592 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v249 /*v761*/, off, s32 offset:2596 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v250 /*v762*/, off, s32 offset:2600 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v251 /*v763*/, off, s32 offset:2604 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v252 /*v764*/, off, s32 offset:2608 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v765*/, off, s32 offset:2612 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v766*/, off, s32 offset:2616 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v767*/, off, s32 offset:2620 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0xc0 ; msbs: dst=3 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v0 /*v768*/, off, s32 offset:2624 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v1 /*v769*/, off, s32 offset:2628 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v2 /*v770*/, off, s32 offset:2632 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v3 /*v771*/, off, s32 offset:2636 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v4 /*v772*/, off, s32 offset:2640 +; 
GFX1250-DAGISEL-NEXT: scratch_load_b32 v5 /*v773*/, off, s32 offset:2644 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v6 /*v774*/, off, s32 offset:2648 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v7 /*v775*/, off, s32 offset:2652 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v8 /*v776*/, off, s32 offset:2656 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v9 /*v777*/, off, s32 offset:2660 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v10 /*v778*/, off, s32 offset:2664 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v11 /*v779*/, off, s32 offset:2668 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v12 /*v780*/, off, s32 offset:2672 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v13 /*v781*/, off, s32 offset:2676 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v14 /*v782*/, off, s32 offset:2680 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v15 /*v783*/, off, s32 offset:2684 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v16 /*v784*/, off, s32 offset:2688 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v17 /*v785*/, off, s32 offset:2692 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v18 /*v786*/, off, s32 offset:2696 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v19 /*v787*/, off, s32 offset:2700 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v20 /*v788*/, off, s32 offset:2704 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v21 /*v789*/, off, s32 offset:2708 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v22 /*v790*/, off, s32 offset:2712 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v23 /*v791*/, off, s32 offset:2716 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v24 /*v792*/, off, s32 offset:2720 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v25 /*v793*/, off, s32 offset:2724 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v26 /*v794*/, off, s32 offset:2728 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v27 /*v795*/, off, s32 offset:2732 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v28 /*v796*/, off, s32 offset:2736 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v29 /*v797*/, off, s32 offset:2740 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v30 /*v798*/, off, s32 offset:2744 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v31 /*v799*/, off, s32 offset:2748 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v32 /*v800*/, off, s32 offset:2752 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v33 /*v801*/, off, s32 offset:2756 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v34 /*v802*/, off, s32 offset:2760 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v35 /*v803*/, off, s32 offset:2764 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v36 /*v804*/, off, s32 offset:2768 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v37 /*v805*/, off, s32 offset:2772 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v38 /*v806*/, off, s32 offset:2776 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v39 /*v807*/, off, s32 offset:2780 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v40 /*v808*/, off, s32 offset:2784 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v41 /*v809*/, off, s32 offset:2788 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v42 /*v810*/, off, s32 offset:2792 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v43 /*v811*/, off, s32 offset:2796 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v44 /*v812*/, off, s32 offset:2800 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v45 /*v813*/, off, s32 offset:2804 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v46 /*v814*/, off, s32 offset:2808 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v47 /*v815*/, off, s32 offset:2812 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v48 /*v816*/, off, s32 offset:2816 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v49 /*v817*/, off, s32 offset:2820 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v50 /*v818*/, 
off, s32 offset:2824 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v51 /*v819*/, off, s32 offset:2828 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v52 /*v820*/, off, s32 offset:2832 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v53 /*v821*/, off, s32 offset:2836 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v54 /*v822*/, off, s32 offset:2840 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v55 /*v823*/, off, s32 offset:2844 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v56 /*v824*/, off, s32 offset:2848 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v57 /*v825*/, off, s32 offset:2852 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v58 /*v826*/, off, s32 offset:2856 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v59 /*v827*/, off, s32 offset:2860 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v60 /*v828*/, off, s32 offset:2864 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v61 /*v829*/, off, s32 offset:2868 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v62 /*v830*/, off, s32 offset:2872 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v63 /*v831*/, off, s32 offset:2876 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v64 /*v832*/, off, s32 offset:2880 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v65 /*v833*/, off, s32 offset:2884 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v66 /*v834*/, off, s32 offset:2888 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v67 /*v835*/, off, s32 offset:2892 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v68 /*v836*/, off, s32 offset:2896 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v69 /*v837*/, off, s32 offset:2900 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v70 /*v838*/, off, s32 offset:2904 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v71 /*v839*/, off, s32 offset:2908 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v72 /*v840*/, off, s32 offset:2912 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v73 /*v841*/, off, s32 offset:2916 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v74 /*v842*/, off, s32 offset:2920 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v75 /*v843*/, off, s32 offset:2924 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v76 /*v844*/, off, s32 offset:2928 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v77 /*v845*/, off, s32 offset:2932 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v78 /*v846*/, off, s32 offset:2936 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v79 /*v847*/, off, s32 offset:2940 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v80 /*v848*/, off, s32 offset:2944 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v81 /*v849*/, off, s32 offset:2948 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v82 /*v850*/, off, s32 offset:2952 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v83 /*v851*/, off, s32 offset:2956 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v84 /*v852*/, off, s32 offset:2960 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v85 /*v853*/, off, s32 offset:2964 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v86 /*v854*/, off, s32 offset:2968 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v87 /*v855*/, off, s32 offset:2972 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v88 /*v856*/, off, s32 offset:2976 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v89 /*v857*/, off, s32 offset:2980 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v90 /*v858*/, off, s32 offset:2984 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v91 /*v859*/, off, s32 offset:2988 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v92 /*v860*/, off, s32 offset:2992 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v93 /*v861*/, off, s32 offset:2996 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v94 /*v862*/, off, s32 offset:3000 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v95 /*v863*/, off, s32 offset:3004 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v96 /*v864*/, off, s32 
offset:3008 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v97 /*v865*/, off, s32 offset:3012 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v98 /*v866*/, off, s32 offset:3016 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v99 /*v867*/, off, s32 offset:3020 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v100 /*v868*/, off, s32 offset:3024 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v101 /*v869*/, off, s32 offset:3028 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v102 /*v870*/, off, s32 offset:3032 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v103 /*v871*/, off, s32 offset:3036 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v104 /*v872*/, off, s32 offset:3040 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v105 /*v873*/, off, s32 offset:3044 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v106 /*v874*/, off, s32 offset:3048 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v107 /*v875*/, off, s32 offset:3052 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v108 /*v876*/, off, s32 offset:3056 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v109 /*v877*/, off, s32 offset:3060 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v110 /*v878*/, off, s32 offset:3064 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v111 /*v879*/, off, s32 offset:3068 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v112 /*v880*/, off, s32 offset:3072 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v113 /*v881*/, off, s32 offset:3076 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v114 /*v882*/, off, s32 offset:3080 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v115 /*v883*/, off, s32 offset:3084 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v116 /*v884*/, off, s32 offset:3088 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v117 /*v885*/, off, s32 offset:3092 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v118 /*v886*/, off, s32 offset:3096 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v119 /*v887*/, off, s32 offset:3100 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v120 /*v888*/, off, s32 offset:3104 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v121 /*v889*/, off, s32 offset:3108 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v122 /*v890*/, off, s32 offset:3112 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v123 /*v891*/, off, s32 offset:3116 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v124 /*v892*/, off, s32 offset:3120 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v125 /*v893*/, off, s32 offset:3124 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v126 /*v894*/, off, s32 offset:3128 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v127 /*v895*/, off, s32 offset:3132 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v128 /*v896*/, off, s32 offset:3136 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v129 /*v897*/, off, s32 offset:3140 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v130 /*v898*/, off, s32 offset:3144 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v131 /*v899*/, off, s32 offset:3148 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v132 /*v900*/, off, s32 offset:3152 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v133 /*v901*/, off, s32 offset:3156 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v134 /*v902*/, off, s32 offset:3160 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v135 /*v903*/, off, s32 offset:3164 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v136 /*v904*/, off, s32 offset:3168 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v137 /*v905*/, off, s32 offset:3172 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v138 /*v906*/, off, s32 offset:3176 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v139 /*v907*/, off, s32 offset:3180 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v140 /*v908*/, off, s32 offset:3184 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v141 /*v909*/, off, s32 
offset:3188 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v142 /*v910*/, off, s32 offset:3192 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v143 /*v911*/, off, s32 offset:3196 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v144 /*v912*/, off, s32 offset:3200 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v145 /*v913*/, off, s32 offset:3204 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v146 /*v914*/, off, s32 offset:3208 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v147 /*v915*/, off, s32 offset:3212 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v148 /*v916*/, off, s32 offset:3216 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v149 /*v917*/, off, s32 offset:3220 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v150 /*v918*/, off, s32 offset:3224 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v151 /*v919*/, off, s32 offset:3228 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v152 /*v920*/, off, s32 offset:3232 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v153 /*v921*/, off, s32 offset:3236 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v154 /*v922*/, off, s32 offset:3240 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v155 /*v923*/, off, s32 offset:3244 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v156 /*v924*/, off, s32 offset:3248 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v157 /*v925*/, off, s32 offset:3252 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v158 /*v926*/, off, s32 offset:3256 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v159 /*v927*/, off, s32 offset:3260 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v160 /*v928*/, off, s32 offset:3264 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v161 /*v929*/, off, s32 offset:3268 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v162 /*v930*/, off, s32 offset:3272 +; GFX1250-DAGISEL-NEXT: s_clause 0x3e +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v163 /*v931*/, off, s32 offset:3276 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v164 /*v932*/, off, s32 offset:3280 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v165 /*v933*/, off, s32 offset:3284 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v166 /*v934*/, off, s32 offset:3288 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v167 /*v935*/, off, s32 offset:3292 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v168 /*v936*/, off, s32 offset:3296 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v169 /*v937*/, off, s32 offset:3300 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v170 /*v938*/, off, s32 offset:3304 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v171 /*v939*/, off, s32 offset:3308 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v172 /*v940*/, off, s32 offset:3312 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v173 /*v941*/, off, s32 offset:3316 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v174 /*v942*/, off, s32 offset:3320 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v175 /*v943*/, off, s32 offset:3324 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v176 /*v944*/, off, s32 offset:3328 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v177 /*v945*/, off, s32 offset:3332 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v178 /*v946*/, off, s32 offset:3336 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v179 /*v947*/, off, s32 offset:3340 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v180 /*v948*/, off, s32 offset:3344 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v181 /*v949*/, off, s32 offset:3348 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v182 /*v950*/, off, s32 offset:3352 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v183 /*v951*/, off, s32 offset:3356 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v184 /*v952*/, off, s32 offset:3360 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v185 /*v953*/, off, s32 offset:3364 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v186 /*v954*/, off, s32 
offset:3368 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v187 /*v955*/, off, s32 offset:3372 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v188 /*v956*/, off, s32 offset:3376 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v189 /*v957*/, off, s32 offset:3380 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v190 /*v958*/, off, s32 offset:3384 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v191 /*v959*/, off, s32 offset:3388 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v192 /*v960*/, off, s32 offset:3392 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v193 /*v961*/, off, s32 offset:3396 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v194 /*v962*/, off, s32 offset:3400 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v195 /*v963*/, off, s32 offset:3404 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v196 /*v964*/, off, s32 offset:3408 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v197 /*v965*/, off, s32 offset:3412 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v198 /*v966*/, off, s32 offset:3416 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v199 /*v967*/, off, s32 offset:3420 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v200 /*v968*/, off, s32 offset:3424 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v201 /*v969*/, off, s32 offset:3428 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v202 /*v970*/, off, s32 offset:3432 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v203 /*v971*/, off, s32 offset:3436 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v204 /*v972*/, off, s32 offset:3440 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v205 /*v973*/, off, s32 offset:3444 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v206 /*v974*/, off, s32 offset:3448 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v207 /*v975*/, off, s32 offset:3452 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v208 /*v976*/, off, s32 offset:3456 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v209 /*v977*/, off, s32 offset:3460 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v210 /*v978*/, off, s32 offset:3464 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v211 /*v979*/, off, s32 offset:3468 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v212 /*v980*/, off, s32 offset:3472 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v213 /*v981*/, off, s32 offset:3476 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v214 /*v982*/, off, s32 offset:3480 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v215 /*v983*/, off, s32 offset:3484 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v216 /*v984*/, off, s32 offset:3488 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v217 /*v985*/, off, s32 offset:3492 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v218 /*v986*/, off, s32 offset:3496 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v219 /*v987*/, off, s32 offset:3500 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v220 /*v988*/, off, s32 offset:3504 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v221 /*v989*/, off, s32 offset:3508 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v222 /*v990*/, off, s32 offset:3512 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v223 /*v991*/, off, s32 offset:3516 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v224 /*v992*/, off, s32 offset:3520 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v225 /*v993*/, off, s32 offset:3524 +; GFX1250-DAGISEL-NEXT: s_clause 0x1d +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v226 /*v994*/, off, s32 offset:3528 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v227 /*v995*/, off, s32 offset:3532 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v228 /*v996*/, off, s32 offset:3536 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v229 /*v997*/, off, s32 offset:3540 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v230 /*v998*/, off, s32 offset:3544 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v231 /*v999*/, off, s32 
offset:3548 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v232 /*v1000*/, off, s32 offset:3552 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v233 /*v1001*/, off, s32 offset:3556 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v234 /*v1002*/, off, s32 offset:3560 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v235 /*v1003*/, off, s32 offset:3564 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v236 /*v1004*/, off, s32 offset:3568 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v237 /*v1005*/, off, s32 offset:3572 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v238 /*v1006*/, off, s32 offset:3576 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v239 /*v1007*/, off, s32 offset:3580 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v240 /*v1008*/, off, s32 offset:3584 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v241 /*v1009*/, off, s32 offset:3588 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v242 /*v1010*/, off, s32 offset:3592 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v243 /*v1011*/, off, s32 offset:3596 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v244 /*v1012*/, off, s32 offset:3600 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v245 /*v1013*/, off, s32 offset:3604 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v246 /*v1014*/, off, s32 offset:3608 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v247 /*v1015*/, off, s32 offset:3612 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v248 /*v1016*/, off, s32 offset:3616 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v249 /*v1017*/, off, s32 offset:3620 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v250 /*v1018*/, off, s32 offset:3624 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v251 /*v1019*/, off, s32 offset:3628 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v252 /*v1020*/, off, s32 offset:3632 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v253 /*v1021*/, off, s32 offset:3636 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v254 /*v1022*/, off, s32 offset:3640 +; GFX1250-DAGISEL-NEXT: scratch_load_b32 v255 /*v1023*/, off, s32 offset:3644 +; GFX1250-DAGISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-DAGISEL-NEXT: s_mov_b32 exec_lo, s0 +; GFX1250-DAGISEL-NEXT: s_set_vgpr_msb 0 ; msbs: dst=0 src0=0 src1=0 src2=0 +; GFX1250-DAGISEL-NEXT: s_set_pc_i64 s[36:37] + %ret = tail call amdgpu_gfx <2 x half>(<2 x half>, <2 x half>) @gfx_callee(<2 x half> %y, <2 x half> %x) convergent + ret <2 x half> %ret +} + declare amdgpu_gfx_whole_wave float @callee(i1 %active, <8 x float> %x) define amdgpu_cs void @call_from_entry(<8 x float> %x, ptr %p) { diff --git a/llvm/test/CodeGen/ARM/2009-10-02-NEONSubregsBug.ll b/llvm/test/CodeGen/ARM/2009-10-02-NEONSubregsBug.ll index 484ad93bebeab..0e8d47347286b 100644 --- a/llvm/test/CodeGen/ARM/2009-10-02-NEONSubregsBug.ll +++ b/llvm/test/CodeGen/ARM/2009-10-02-NEONSubregsBug.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=armv7-eabi -mcpu=cortex-a8 -enable-unsafe-fp-math < %s +; RUN: llc -mtriple=armv7-eabi -mcpu=cortex-a8 < %s ; PR5367 define arm_aapcs_vfpcc void @_Z27Benchmark_SceDualQuaternionPvm(ptr nocapture %pBuffer, i32 %numItems) nounwind { diff --git a/llvm/test/CodeGen/ARM/2012-04-10-DAGCombine.ll b/llvm/test/CodeGen/ARM/2012-04-10-DAGCombine.ll index 80c1968c85743..593fb9348506b 100644 --- a/llvm/test/CodeGen/ARM/2012-04-10-DAGCombine.ll +++ b/llvm/test/CodeGen/ARM/2012-04-10-DAGCombine.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 -enable-unsafe-fp-math %s -o /dev/null +; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o /dev/null ;target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:64-n32-S64" ;target triple = "armv7-none-linux-gnueabi" diff 
--git a/llvm/test/CodeGen/ARM/build-attributes-fn-attr3.ll b/llvm/test/CodeGen/ARM/build-attributes-fn-attr3.ll index 7f70c44c78f9c..27d1dc20bd815 100644 --- a/llvm/test/CodeGen/ARM/build-attributes-fn-attr3.ll +++ b/llvm/test/CodeGen/ARM/build-attributes-fn-attr3.ll @@ -11,7 +11,10 @@ define i32 @foo() local_unnamed_addr #0 { entry: + %a = call float @llvm.fma.f32(float 0.0, float 0.0, float 0.0) ret i32 42 } +declare float @llvm.fma.f32(float, float, float) + attributes #0 = { minsize norecurse nounwind optsize readnone "no-trapping-math"="true" "denormal-fp-math"="ieee"} diff --git a/llvm/test/CodeGen/ARM/build-attributes-fn-attr4.ll b/llvm/test/CodeGen/ARM/build-attributes-fn-attr4.ll index c99cb27adf155..9c8dd8d95c61c 100644 --- a/llvm/test/CodeGen/ARM/build-attributes-fn-attr4.ll +++ b/llvm/test/CodeGen/ARM/build-attributes-fn-attr4.ll @@ -10,7 +10,10 @@ define i32 @foo1() local_unnamed_addr #0 { entry: + %a = call float @llvm.fma.f32(float 0.0, float 0.0, float 0.0) ret i32 42 } +declare float @llvm.fma.f32(float, float, float) + attributes #0 = { minsize norecurse nounwind optsize readnone "denormal-fp-math"="positive-zero,positive-zero" } diff --git a/llvm/test/CodeGen/ARM/build-attributes-fn-attr5.ll b/llvm/test/CodeGen/ARM/build-attributes-fn-attr5.ll index ba1e7d7ce55c1..cda3ea0fc6d18 100644 --- a/llvm/test/CodeGen/ARM/build-attributes-fn-attr5.ll +++ b/llvm/test/CodeGen/ARM/build-attributes-fn-attr5.ll @@ -10,7 +10,10 @@ define i32 @foo1() local_unnamed_addr #0 { entry: + %a = call float @llvm.fma.f32(float 0.0, float 0.0, float 0.0) ret i32 42 } +declare float @llvm.fma.f32(float, float, float) + attributes #0 = { minsize norecurse nounwind optsize readnone "denormal-fp-math"="preserve-sign,preserve-sign"} diff --git a/llvm/test/CodeGen/ARM/build-attributes-fn-attr6.ll b/llvm/test/CodeGen/ARM/build-attributes-fn-attr6.ll index 1cd68aed1e051..59d0a40198392 100644 --- a/llvm/test/CodeGen/ARM/build-attributes-fn-attr6.ll +++ b/llvm/test/CodeGen/ARM/build-attributes-fn-attr6.ll @@ -11,6 +11,7 @@ define i32 @foo1() local_unnamed_addr #0 { entry: + %a = call float @llvm.fma.f32(float 0.0, float 0.0, float 0.0) ret i32 42 } @@ -19,5 +20,7 @@ entry: ret i32 42 } +declare float @llvm.fma.f32(float, float, float) + attributes #0 = { minsize norecurse nounwind optsize readnone "denormal-fp-math"="preserve-sign,preserve-sign"} attributes #1 = { minsize norecurse nounwind optsize readnone "denormal-fp-math"="positive-zero,positive-zero"} diff --git a/llvm/test/CodeGen/ARM/build-attributes.ll b/llvm/test/CodeGen/ARM/build-attributes.ll index 68844aed03630..306a4a31b79fa 100644 --- a/llvm/test/CodeGen/ARM/build-attributes.ll +++ b/llvm/test/CodeGen/ARM/build-attributes.ll @@ -3,23 +3,16 @@ ; RUN: llc < %s -mtriple=thumbv5-linux-gnueabi -mcpu=xscale -mattr=+strict-align | FileCheck %s --check-prefix=XSCALE ; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mattr=+strict-align | FileCheck %s --check-prefix=V6 -; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V6-FAST ; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mattr=+strict-align -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mattr=+strict-align | FileCheck %s --check-prefix=V6M -; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all 
-enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V6M-FAST ; RUN: llc < %s -mtriple=thumbv6sm-linux-gnueabi -mattr=+strict-align | FileCheck %s --check-prefix=V6M -; RUN: llc < %s -mtriple=thumbv6sm-linux-gnueabi -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V6M-FAST ; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mcpu=arm1156t2f-s -mattr=+strict-align | FileCheck %s --check-prefix=ARM1156T2F-S -; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mcpu=arm1156t2f-s -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=ARM1156T2F-S-FAST ; RUN: llc < %s -mtriple=armv6-linux-gnueabi -mcpu=arm1156t2f-s -mattr=+strict-align -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi | FileCheck %s --check-prefix=V7M -; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V7M-FAST ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7-linux-gnueabi | FileCheck %s --check-prefix=V7 ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V7-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi | FileCheck %s --check-prefix=V8 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=V8-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi | FileCheck %s --check-prefix=Vt8 ; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING @@ -31,35 +24,24 @@ ; RUN: llc < %s -mtriple=thumbv8m.main-linux-gnueabi | FileCheck %s --check-prefix=V8MMAINLINE ; RUN: llc < %s -mtriple=thumbv8m.main-linux-gnueabi -mattr=+dsp | FileCheck %s --check-prefix=V8MMAINLINE_DSP ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 | FileCheck %s --check-prefix=CORTEX-A5-DEFAULT -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A5-DEFAULT-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-neon,-d32 | FileCheck %s --check-prefix=CORTEX-A5-NONEON ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-vfp2sp | FileCheck %s --check-prefix=CORTEX-A5-NOFPU -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a5 -mattr=-vfp2sp -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s 
--check-prefix=CORTEX-A5-NOFPU-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A8-SOFT -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=soft -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A8-SOFT-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-A8-HARD -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=hard -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A8-HARD-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A8-SOFT ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A9-SOFT -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A9-SOFT-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-A9-HARD -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A9-HARD-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 | FileCheck %s --check-prefix=CORTEX-A12-DEFAULT ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a9 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-A9-SOFT -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A12-DEFAULT-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -mattr=-vfp2sp | FileCheck %s --check-prefix=CORTEX-A12-NOFPU -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -mattr=-vfp2sp -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A12-NOFPU-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a12 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 | FileCheck %s --check-prefix=CORTEX-A15 -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A15-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 | FileCheck %s --check-prefix=CORTEX-A17-DEFAULT -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math 
-enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A17-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -mattr=-vfp2sp | FileCheck %s --check-prefix=CORTEX-A17-NOFPU -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -mattr=-vfp2sp -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A17-NOFPU-FAST ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 -enable-no-trapping-fp-math | FileCheck %s --check-prefix=NO-TRAPPING-MATH ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a15 -denormal-fp-math=ieee | FileCheck %s --check-prefix=DENORMAL-IEEE @@ -74,37 +56,26 @@ ; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a17 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=CORTEX-M0 -; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M0-FAST ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0plus | FileCheck %s --check-prefix=CORTEX-M0PLUS -; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0plus -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M0PLUS-FAST ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m0plus -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m1 | FileCheck %s --check-prefix=CORTEX-M1 -; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m1 -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M1-FAST ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=cortex-m1 -mattr=+strict-align -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=sc000 -mattr=+strict-align | FileCheck %s --check-prefix=SC000 -; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=sc000 -mattr=+strict-align -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=SC000-FAST ; RUN: llc < %s -mtriple=thumbv6m-linux-gnueabi -mcpu=sc000 -mattr=+strict-align -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m3 | FileCheck %s --check-prefix=CORTEX-M3 -; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m3 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M3-FAST ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m3 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=sc300 | FileCheck %s --check-prefix=SC300 -; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=sc300 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math 
-enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=SC300-FAST ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=sc300 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=soft | FileCheck %s --check-prefix=CORTEX-M4-SOFT -; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=soft -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M4-SOFT-FAST ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard | FileCheck %s --check-prefix=CORTEX-M4-HARD -; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -float-abi=hard -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M4-HARD-FAST ; RUN: llc < %s -mtriple=thumbv7m-linux-gnueabi -mcpu=cortex-m4 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-vfp2sp | FileCheck %s --check-prefix=CORTEX-M7 --check-prefix=CORTEX-M7-SOFT -; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-vfp2sp -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M7-NOFPU-FAST ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-fp64 | FileCheck %s --check-prefix=CORTEX-M7 --check-prefix=CORTEX-M7-SINGLE -; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=-fp64 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M7-FAST ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 | FileCheck %s --check-prefix=CORTEX-M7-DOUBLE ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m23 | FileCheck %s --check-prefix=CORTEX-M23 ; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m33 | FileCheck %s --check-prefix=CORTEX-M33 -; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m33 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M33-FAST ; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m33 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m35p | FileCheck %s --check-prefix=CORTEX-M35P @@ -113,49 +84,34 @@ ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r4 | FileCheck %s --check-prefix=CORTEX-R4 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r4f | FileCheck %s --check-prefix=CORTEX-R4F ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=CORTEX-R5 -; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R5-FAST ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi 
-mcpu=cortex-r7 | FileCheck %s --check-prefix=CORTEX-R7 -; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R7-FAST ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r8 | FileCheck %s --check-prefix=CORTEX-R8 -; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r8 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-R8-FAST ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r8 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a32 | FileCheck %s --check-prefix=CORTEX-A32 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a32 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A32-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a32 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 | FileCheck %s --check-prefix=CORTEX-A35 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A35-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a35 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 | FileCheck %s --check-prefix=CORTEX-A53 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A53-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a53 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 | FileCheck %s --check-prefix=CORTEX-A57 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A57-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a57 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 | FileCheck %s --check-prefix=CORTEX-A72 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A72-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a73 | FileCheck %s --check-prefix=CORTEX-A73 ; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi | FileCheck %s --check-prefix=GENERIC-ARMV8_1-A ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m3 | FileCheck %s --check-prefix=EXYNOS-M3 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi 
-mcpu=exynos-m3 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=EXYNOS-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m3 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m4 | FileCheck %s --check-prefix=EXYNOS-M4 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m4 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=EXYNOS-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m4 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m5 | FileCheck %s --check-prefix=EXYNOS-M5 -; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m5 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=EXYNOS-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=exynos-m5 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING -; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=GENERIC-ARMV8_1-A-FAST ; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 | FileCheck %s --check-prefix=CORTEX-A7-CHECK -; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A7-CHECK-FAST ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=-vfp2sp,-vfp3,-vfp4,-neon,-fp16 | FileCheck %s --check-prefix=CORTEX-A7-NOFPU -; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=-vfp2sp,-vfp3,-vfp4,-neon,-fp16 -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A7-NOFPU-FAST ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,-neon | FileCheck %s --check-prefix=CORTEX-A7-FPUV4 ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING -; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,-neon -enable-unsafe-fp-math -frame-pointer=all -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A7-FPUV4-FAST ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+vfp4,,-d32,-neon | FileCheck %s --check-prefix=CORTEX-A7-FPUV4 ; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -mattr=+strict-align -relocation-model=pic | FileCheck %s --check-prefix=RELOC-PIC ; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -mattr=+strict-align -relocation-model=static | FileCheck %s --check-prefix=RELOC-OTHER @@ -278,15 +234,6 @@ ; V6-NOT: .eabi_attribute 28 ; V6: .eabi_attribute 38, 1 -; V6-FAST-NOT: .eabi_attribute 19 -;; Despite the V6 CPU having no FPU by default, we chose to flush to -;; positive zero here. There's no hardware support doing this, but the -;; fast maths software library might. 
-; V6-FAST-NOT: .eabi_attribute 20 -; V6-FAST-NOT: .eabi_attribute 21 -; V6-FAST-NOT: .eabi_attribute 22 -; V6-FAST: .eabi_attribute 23, 1 - ;; We emit 6, 12 for both v6-M and v6S-M, technically this is incorrect for ;; V6-M, however we don't model the OS extension so this is fine. ; V6M: .eabi_attribute 6, 12 @@ -312,14 +259,6 @@ ; V6M-NOT: .eabi_attribute 28 ; V6M: .eabi_attribute 38, 1 -; V6M-FAST-NOT: .eabi_attribute 19 -;; Despite the V6M CPU having no FPU by default, we chose to flush to -;; positive zero here. There's no hardware support doing this, but the -;; fast maths software library might. -; V6M-FAST-NOT: .eabi_attribute 20 -; V6M-FAST-NOT: .eabi_attribute 21 -; V6M-FAST-NOT: .eabi_attribute 22 -; V6M-FAST: .eabi_attribute 23, 1 ; ARM1156T2F-S: .cpu arm1156t2f-s ; ARM1156T2F-S: .eabi_attribute 6, 8 @@ -342,14 +281,6 @@ ; ARM1156T2F-S-NOT: .eabi_attribute 28 ; ARM1156T2F-S: .eabi_attribute 38, 1 -; ARM1156T2F-S-FAST-NOT: .eabi_attribute 19 -;; V6 cores default to flush to positive zero (value 0). Note that value 2 is also equally -;; valid for this core, it's an implementation defined question as to which of 0 and 2 you -;; select. LLVM historically picks 0. -; ARM1156T2F-S-FAST-NOT: .eabi_attribute 20 -; ARM1156T2F-S-FAST-NOT: .eabi_attribute 21 -; ARM1156T2F-S-FAST-NOT: .eabi_attribute 22 -; ARM1156T2F-S-FAST: .eabi_attribute 23, 1 ; V7M: .eabi_attribute 6, 10 ; V7M: .eabi_attribute 7, 77 @@ -374,15 +305,6 @@ ; V7M-NOT: .eabi_attribute 28 ; V7M: .eabi_attribute 38, 1 -; V7M-FAST-NOT: .eabi_attribute 19 -;; Despite the V7M CPU having no FPU by default, we chose to flush -;; preserving sign. This matches what the hardware would do in the -;; architecture revision were to exist on the current target. -; V7M-FAST: .eabi_attribute 20, 2 -; V7M-FAST-NOT: .eabi_attribute 21 -; V7M-FAST-NOT: .eabi_attribute 22 -; V7M-FAST: .eabi_attribute 23, 1 - ; V7: .syntax unified ; V7: .eabi_attribute 6, 10 ; V7-NOT: .eabi_attribute 27 @@ -401,13 +323,6 @@ ; V7-NOT: .eabi_attribute 28 ; V7: .eabi_attribute 38, 1 -; V7-FAST-NOT: .eabi_attribute 19 -;; The default CPU does have an FPU and it must be VFPv3 or better, so it flushes -;; denormals to zero preserving the sign. -; V7-FAST: .eabi_attribute 20, 2 -; V7-FAST-NOT: .eabi_attribute 21 -; V7-FAST-NOT: .eabi_attribute 22 -; V7-FAST: .eabi_attribute 23, 1 ; V7VE: .syntax unified ; V7VE: .eabi_attribute 6, 10 @ Tag_CPU_arch @@ -435,12 +350,6 @@ ; V8-NOT: .eabi_attribute 22 ; V8: .eabi_attribute 23, 3 -; V8-FAST-NOT: .eabi_attribute 19 -;; The default does have an FPU, and for V8-A, it flushes preserving sign. -; V8-FAST: .eabi_attribute 20, 2 -; V8-FAST-NOT: .eabi_attribute 21 -; V8-FAST-NOT: .eabi_attribute 22 -; V8-FAST: .eabi_attribute 23, 1 ; Vt8: .syntax unified ; Vt8: .eabi_attribute 6, 14 @@ -552,15 +461,11 @@ ;; We default to IEEE 754 compliance ; CORTEX-A7-CHECK: .eabi_attribute 20, 1 ;; The A7 has VFPv3 support by default, so flush preserving sign. -; CORTEX-A7-CHECK-FAST: .eabi_attribute 20, 2 ; CORTEX-A7-NOFPU: .eabi_attribute 20, 1 ;; Despite there being no FPU, we chose to flush to zero preserving ;; sign. This matches what the hardware would do for this architecture ;; revision. -; CORTEX-A7-NOFPU-FAST: .eabi_attribute 20, 2 ; CORTEX-A7-FPUV4: .eabi_attribute 20, 1 -;; The VFPv4 FPU flushes preserving sign. 
-; CORTEX-A7-FPUV4-FAST: .eabi_attribute 20, 2 ; Tag_ABI_FP_exceptions ; CORTEX-A7-CHECK: .eabi_attribute 21, 1 @@ -610,13 +515,6 @@ ; CORTEX-A5-DEFAULT: .eabi_attribute 24, 1 ; CORTEX-A5-DEFAULT: .eabi_attribute 25, 1 -; CORTEX-A5-DEFAULT-FAST-NOT: .eabi_attribute 19 -;; The A5 defaults to a VFPv4 FPU, so it flushed preserving the sign when -ffast-math -;; is given. -; CORTEX-A5-DEFAULT-FAST: .eabi_attribute 20, 2 -; CORTEX-A5-DEFAULT-FAST-NOT: .eabi_attribute 21 -; CORTEX-A5-DEFAULT-FAST-NOT: .eabi_attribute 22 -; CORTEX-A5-DEFAULT-FAST: .eabi_attribute 23, 1 ; CORTEX-A5-NONEON: .cpu cortex-a5 ; CORTEX-A5-NONEON: .eabi_attribute 6, 10 @@ -634,13 +532,6 @@ ; CORTEX-A5-NONEON: .eabi_attribute 24, 1 ; CORTEX-A5-NONEON: .eabi_attribute 25, 1 -; CORTEX-A5-NONEON-FAST-NOT: .eabi_attribute 19 -;; The A5 defaults to a VFPv4 FPU, so it flushed preserving sign when -ffast-math -;; is given. -; CORTEX-A5-NONEON-FAST: .eabi_attribute 20, 2 -; CORTEX-A5-NONEON-FAST-NOT: .eabi_attribute 21 -; CORTEX-A5-NONEON-FAST-NOT: .eabi_attribute 22 -; CORTEX-A5-NONEON-FAST: .eabi_attribute 23, 1 ; CORTEX-A5-NOFPU: .cpu cortex-a5 ; CORTEX-A5-NOFPU: .eabi_attribute 6, 10 @@ -659,14 +550,9 @@ ; CORTEX-A5-NOFPU: .eabi_attribute 24, 1 ; CORTEX-A5-NOFPU: .eabi_attribute 25, 1 -; CORTEX-A5-NOFPU-FAST-NOT: .eabi_attribute 19 ;; Despite there being no FPU, we chose to flush to zero preserving ;; sign. This matches what the hardware would do for this architecture ;; revision. -; CORTEX-A5-NOFPU-FAST: .eabi_attribute 20, 2 -; CORTEX-A5-NOFPU-FAST-NOT: .eabi_attribute 21 -; CORTEX-A5-NOFPU-FAST-NOT: .eabi_attribute 22 -; CORTEX-A5-NOFPU-FAST: .eabi_attribute 23, 1 ; CORTEX-A8-SOFT: .cpu cortex-a8 ; CORTEX-A8-SOFT: .eabi_attribute 6, 10 @@ -712,15 +598,6 @@ ; CORTEX-A9-SOFT-NOT: .eabi_attribute 28 ; CORTEX-A9-SOFT: .eabi_attribute 38, 1 -; CORTEX-A8-SOFT-FAST-NOT: .eabi_attribute 19 -; CORTEX-A9-SOFT-FAST-NOT: .eabi_attribute 19 -;; The A9 defaults to a VFPv3 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. -; CORTEX-A8-SOFT-FAST: .eabi_attribute 20, 2 -; CORTEX-A9-SOFT-FAST: .eabi_attribute 20, 2 -; CORTEX-A5-SOFT-FAST-NOT: .eabi_attribute 21 -; CORTEX-A5-SOFT-FAST-NOT: .eabi_attribute 22 -; CORTEX-A5-SOFT-FAST: .eabi_attribute 23, 1 ; CORTEX-A8-HARD: .cpu cortex-a8 ; CORTEX-A8-HARD: .eabi_attribute 6, 10 @@ -766,21 +643,6 @@ ; CORTEX-A9-HARD: .eabi_attribute 28, 1 ; CORTEX-A9-HARD: .eabi_attribute 38, 1 -; CORTEX-A8-HARD-FAST-NOT: .eabi_attribute 19 -;; The A8 defaults to a VFPv3 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. -; CORTEX-A8-HARD-FAST: .eabi_attribute 20, 2 -; CORTEX-A8-HARD-FAST-NOT: .eabi_attribute 21 -; CORTEX-A8-HARD-FAST-NOT: .eabi_attribute 22 -; CORTEX-A8-HARD-FAST: .eabi_attribute 23, 1 - -; CORTEX-A9-HARD-FAST-NOT: .eabi_attribute 19 -;; The A9 defaults to a VFPv3 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. -; CORTEX-A9-HARD-FAST: .eabi_attribute 20, 2 -; CORTEX-A9-HARD-FAST-NOT: .eabi_attribute 21 -; CORTEX-A9-HARD-FAST-NOT: .eabi_attribute 22 -; CORTEX-A9-HARD-FAST: .eabi_attribute 23, 1 ; CORTEX-A12-DEFAULT: .cpu cortex-a12 ; CORTEX-A12-DEFAULT: .eabi_attribute 6, 10 @@ -800,13 +662,6 @@ ; CORTEX-A12-DEFAULT: .eabi_attribute 24, 1 ; CORTEX-A12-DEFAULT: .eabi_attribute 25, 1 -; CORTEX-A12-DEFAULT-FAST-NOT: .eabi_attribute 19 -;; The A12 defaults to a VFPv3 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. 
-; CORTEX-A12-DEFAULT-FAST: .eabi_attribute 20, 2 -; CORTEX-A12-HARD-FAST-NOT: .eabi_attribute 21 -; CORTEX-A12-HARD-FAST-NOT: .eabi_attribute 22 -; CORTEX-A12-HARD-FAST: .eabi_attribute 23, 1 ; CORTEX-A12-NOFPU: .cpu cortex-a12 ; CORTEX-A12-NOFPU: .eabi_attribute 6, 10 @@ -826,14 +681,6 @@ ; CORTEX-A12-NOFPU: .eabi_attribute 24, 1 ; CORTEX-A12-NOFPU: .eabi_attribute 25, 1 -; CORTEX-A12-NOFPU-FAST-NOT: .eabi_attribute 19 -;; Despite there being no FPU, we chose to flush to zero preserving -;; sign. This matches what the hardware would do for this architecture -;; revision. -; CORTEX-A12-NOFPU-FAST: .eabi_attribute 20, 2 -; CORTEX-A12-NOFPU-FAST-NOT: .eabi_attribute 21 -; CORTEX-A12-NOFPU-FAST-NOT: .eabi_attribute 22 -; CORTEX-A12-NOFPU-FAST: .eabi_attribute 23, 1 ; CORTEX-A15: .cpu cortex-a15 ; CORTEX-A15: .eabi_attribute 6, 10 @@ -857,13 +704,6 @@ ; CORTEX-A15-NOT: .eabi_attribute 28 ; CORTEX-A15: .eabi_attribute 38, 1 -; CORTEX-A15-FAST-NOT: .eabi_attribute 19 -;; The A15 defaults to a VFPv3 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. -; CORTEX-A15-FAST: .eabi_attribute 20, 2 -; CORTEX-A15-FAST-NOT: .eabi_attribute 21 -; CORTEX-A15-FAST-NOT: .eabi_attribute 22 -; CORTEX-A15-FAST: .eabi_attribute 23, 1 ; CORTEX-A17-DEFAULT: .cpu cortex-a17 ; CORTEX-A17-DEFAULT: .eabi_attribute 6, 10 @@ -883,13 +723,6 @@ ; CORTEX-A17-DEFAULT: .eabi_attribute 24, 1 ; CORTEX-A17-DEFAULT: .eabi_attribute 25, 1 -; CORTEX-A17-FAST-NOT: .eabi_attribute 19 -;; The A17 defaults to a VFPv3 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. -; CORTEX-A17-FAST: .eabi_attribute 20, 2 -; CORTEX-A17-FAST-NOT: .eabi_attribute 21 -; CORTEX-A17-FAST-NOT: .eabi_attribute 22 -; CORTEX-A17-FAST: .eabi_attribute 23, 1 ; CORTEX-A17-NOFPU: .cpu cortex-a17 ; CORTEX-A17-NOFPU: .eabi_attribute 6, 10 @@ -910,13 +743,6 @@ ; CORTEX-A17-NOFPU: .eabi_attribute 25, 1 ; CORTEX-A17-NOFPU-NOT: .eabi_attribute 19 -;; Despite there being no FPU, we chose to flush to zero preserving -;; sign. This matches what the hardware would do for this architecture -;; revision. -; CORTEX-A17-NOFPU-FAST: .eabi_attribute 20, 2 -; CORTEX-A17-NOFPU-FAST-NOT: .eabi_attribute 21 -; CORTEX-A17-NOFPU-FAST-NOT: .eabi_attribute 22 -; CORTEX-A17-NOFPU-FAST: .eabi_attribute 23, 1 ; Test flags -enable-no-trapping-fp-math and -denormal-fp-math: ; NO-TRAPPING-MATH: .eabi_attribute 21, 0 @@ -946,16 +772,6 @@ ; CORTEX-M0-NOT: .eabi_attribute 28 ; CORTEX-M0: .eabi_attribute 38, 1 -; CORTEX-M0-FAST-NOT: .eabi_attribute 19 -;; Despite the M0 CPU having no FPU in this scenario, we chose to -;; flush to positive zero here. There's no hardware support doing -;; this, but the fast maths software library might and such behaviour -;; would match hardware support on this architecture revision if it -;; existed. -; CORTEX-M0-FAST-NOT: .eabi_attribute 20 -; CORTEX-M0-FAST-NOT: .eabi_attribute 21 -; CORTEX-M0-FAST-NOT: .eabi_attribute 22 -; CORTEX-M0-FAST: .eabi_attribute 23, 1 ; CORTEX-M0PLUS: .cpu cortex-m0plus ; CORTEX-M0PLUS: .eabi_attribute 6, 12 @@ -978,16 +794,6 @@ ; CORTEX-M0PLUS-NOT: .eabi_attribute 28 ; CORTEX-M0PLUS: .eabi_attribute 38, 1 -; CORTEX-M0PLUS-FAST-NOT: .eabi_attribute 19 -;; Despite the M0+ CPU having no FPU in this scenario, we chose to -;; flush to positive zero here. There's no hardware support doing -;; this, but the fast maths software library might and such behaviour -;; would match hardware support on this architecture revision if it -;; existed. 
-; CORTEX-M0PLUS-FAST-NOT: .eabi_attribute 20 -; CORTEX-M0PLUS-FAST-NOT: .eabi_attribute 21 -; CORTEX-M0PLUS-FAST-NOT: .eabi_attribute 22 -; CORTEX-M0PLUS-FAST: .eabi_attribute 23, 1 ; CORTEX-M1: .cpu cortex-m1 ; CORTEX-M1: .eabi_attribute 6, 12 @@ -1010,16 +816,6 @@ ; CORTEX-M1-NOT: .eabi_attribute 28 ; CORTEX-M1: .eabi_attribute 38, 1 -; CORTEX-M1-FAST-NOT: .eabi_attribute 19 -;; Despite the M1 CPU having no FPU in this scenario, we chose to -;; flush to positive zero here. There's no hardware support doing -;; this, but the fast maths software library might and such behaviour -;; would match hardware support on this architecture revision if it -;; existed. -; CORTEX-M1-FAST-NOT: .eabi_attribute 20 -; CORTEX-M1-FAST-NOT: .eabi_attribute 21 -; CORTEX-M1-FAST-NOT: .eabi_attribute 22 -; CORTEX-M1-FAST: .eabi_attribute 23, 1 ; SC000: .cpu sc000 ; SC000: .eabi_attribute 6, 12 @@ -1041,16 +837,6 @@ ; SC000-NOT: .eabi_attribute 28 ; SC000: .eabi_attribute 38, 1 -; SC000-FAST-NOT: .eabi_attribute 19 -;; Despite the SC000 CPU having no FPU in this scenario, we chose to -;; flush to positive zero here. There's no hardware support doing -;; this, but the fast maths software library might and such behaviour -;; would match hardware support on this architecture revision if it -;; existed. -; SC000-FAST-NOT: .eabi_attribute 20 -; SC000-FAST-NOT: .eabi_attribute 21 -; SC000-FAST-NOT: .eabi_attribute 22 -; SC000-FAST: .eabi_attribute 23, 1 ; CORTEX-M3: .cpu cortex-m3 ; CORTEX-M3: .eabi_attribute 6, 10 @@ -1073,14 +859,6 @@ ; CORTEX-M3-NOT: .eabi_attribute 28 ; CORTEX-M3: .eabi_attribute 38, 1 -; CORTEX-M3-FAST-NOT: .eabi_attribute 19 -;; Despite there being no FPU, we chose to flush to zero preserving -;; sign. This matches what the hardware would do for this architecture -;; revision. -; CORTEX-M3-FAST: .eabi_attribute 20, 2 -; CORTEX-M3-FAST-NOT: .eabi_attribute 21 -; CORTEX-M3-FAST-NOT: .eabi_attribute 22 -; CORTEX-M3-FAST: .eabi_attribute 23, 1 ; SC300: .cpu sc300 ; SC300: .eabi_attribute 6, 10 @@ -1103,14 +881,6 @@ ; SC300-NOT: .eabi_attribute 28 ; SC300: .eabi_attribute 38, 1 -; SC300-FAST-NOT: .eabi_attribute 19 -;; Despite there being no FPU, we chose to flush to zero preserving -;; sign. This matches what the hardware would do for this architecture -;; revision. -; SC300-FAST: .eabi_attribute 20, 2 -; SC300-FAST-NOT: .eabi_attribute 21 -; SC300-FAST-NOT: .eabi_attribute 22 -; SC300-FAST: .eabi_attribute 23, 1 ; CORTEX-M4-SOFT: .cpu cortex-m4 ; CORTEX-M4-SOFT: .eabi_attribute 6, 13 @@ -1134,13 +904,6 @@ ; CORTEX-M4-SOFT-NOT: .eabi_attribute 28 ; CORTEX-M4-SOFT: .eabi_attribute 38, 1 -; CORTEX-M4-SOFT-FAST-NOT: .eabi_attribute 19 -;; The M4 defaults to a VFPv4 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. -; CORTEX-M4-SOFT-FAST: .eabi_attribute 20, 2 -; CORTEX-M4-SOFT-FAST-NOT: .eabi_attribute 21 -; CORTEX-M4-SOFT-FAST-NOT: .eabi_attribute 22 -; CORTEX-M4-SOFT-FAST: .eabi_attribute 23, 1 ; CORTEX-M4-HARD: .cpu cortex-m4 ; CORTEX-M4-HARD: .eabi_attribute 6, 13 @@ -1164,13 +927,6 @@ ; CORTEX-M4-HARD: .eabi_attribute 28, 1 ; CORTEX-M4-HARD: .eabi_attribute 38, 1 -; CORTEX-M4-HARD-FAST-NOT: .eabi_attribute 19 -;; The M4 defaults to a VFPv4 FPU, so it flushes preserving the sign when -;; -ffast-math is specified. 
-; CORTEX-M4-HARD-FAST: .eabi_attribute 20, 2 -; CORTEX-M4-HARD-FAST-NOT: .eabi_attribute 21 -; CORTEX-M4-HARD-FAST-NOT: .eabi_attribute 22 -; CORTEX-M4-HARD-FAST: .eabi_attribute 23, 1 ; CORTEX-M7: .cpu cortex-m7 ; CORTEX-M7: .eabi_attribute 6, 13 @@ -1197,16 +953,6 @@ ; CORTEX-M7: .eabi_attribute 38, 1 ; CORTEX-M7: .eabi_attribute 14, 0 -; CORTEX-M7-NOFPU-FAST-NOT: .eabi_attribute 19 -;; The M7 has the ARMv8 FP unit, which always flushes preserving sign. -; CORTEX-M7-FAST: .eabi_attribute 20, 2 -;; Despite there being no FPU, we chose to flush to zero preserving -;; sign. This matches what the hardware would do for this architecture -;; revision. -; CORTEX-M7-NOFPU-FAST: .eabi_attribute 20, 2 -; CORTEX-M7-NOFPU-FAST-NOT: .eabi_attribute 21 -; CORTEX-M7-NOFPU-FAST-NOT: .eabi_attribute 22 -; CORTEX-M7-NOFPU-FAST: .eabi_attribute 23, 1 ; CORTEX-R4: .cpu cortex-r4 ; CORTEX-R4: .eabi_attribute 6, 10 @@ -1273,12 +1019,6 @@ ; CORTEX-R5-NOT: .eabi_attribute 28 ; CORTEX-R5: .eabi_attribute 38, 1 -; CORTEX-R5-FAST-NOT: .eabi_attribute 19 -;; The R5 has the VFPv3 FP unit, which always flushes preserving sign. -; CORTEX-R5-FAST: .eabi_attribute 20, 2 -; CORTEX-R5-FAST-NOT: .eabi_attribute 21 -; CORTEX-R5-FAST-NOT: .eabi_attribute 22 -; CORTEX-R5-FAST: .eabi_attribute 23, 1 ; CORTEX-R7: .cpu cortex-r7 ; CORTEX-R7: .eabi_attribute 6, 10 @@ -1301,12 +1041,6 @@ ; CORTEX-R7-NOT: .eabi_attribute 28 ; CORTEX-R7: .eabi_attribute 38, 1 -; CORTEX-R7-FAST-NOT: .eabi_attribute 19 -;; The R7 has the VFPv3 FP unit, which always flushes preserving sign. -; CORTEX-R7-FAST: .eabi_attribute 20, 2 -; CORTEX-R7-FAST-NOT: .eabi_attribute 21 -; CORTEX-R7-FAST-NOT: .eabi_attribute 22 -; CORTEX-R7-FAST: .eabi_attribute 23, 1 ; CORTEX-R8: .cpu cortex-r8 ; CORTEX-R8: .eabi_attribute 6, 10 @@ -1329,12 +1063,6 @@ ; CORTEX-R8-NOT: .eabi_attribute 28 ; CORTEX-R8: .eabi_attribute 38, 1 -; CORTEX-R8-FAST-NOT: .eabi_attribute 19 -;; The R8 has the VFPv3 FP unit, which always flushes preserving sign. -; CORTEX-R8-FAST: .eabi_attribute 20, 2 -; CORTEX-R8-FAST-NOT: .eabi_attribute 21 -; CORTEX-R8-FAST-NOT: .eabi_attribute 22 -; CORTEX-R8-FAST: .eabi_attribute 23, 1 ; CORTEX-A32: .cpu cortex-a32 ; CORTEX-A32: .eabi_attribute 6, 14 @@ -1359,12 +1087,6 @@ ; CORTEX-A32-NOT: .eabi_attribute 28 ; CORTEX-A32: .eabi_attribute 38, 1 -; CORTEX-A32-FAST-NOT: .eabi_attribute 19 -;; The A32 has the ARMv8 FP unit, which always flushes preserving sign. -; CORTEX-A32-FAST: .eabi_attribute 20, 2 -; CORTEX-A32-FAST-NOT: .eabi_attribute 21 -; CORTEX-A32-FAST-NOT: .eabi_attribute 22 -; CORTEX-A32-FAST: .eabi_attribute 23, 1 ; CORTEX-M23: .cpu cortex-m23 ; CORTEX-M23: .eabi_attribute 6, 16 @@ -1430,11 +1152,6 @@ ; CORTEX-M35P: .eabi_attribute 38, 1 ; CORTEX-M35P: .eabi_attribute 14, 0 -; CORTEX-M33-FAST-NOT: .eabi_attribute 19 -; CORTEX-M33-FAST: .eabi_attribute 20, 2 -; CORTEX-M33-FAST-NOT: .eabi_attribute 21 -; CORTEX-M33-FAST-NOT: .eabi_attribute 22 -; CORTEX-M33-FAST: .eabi_attribute 23, 1 ; CORTEX-A35: .cpu cortex-a35 ; CORTEX-A35: .eabi_attribute 6, 14 @@ -1459,12 +1176,6 @@ ; CORTEX-A35-NOT: .eabi_attribute 28 ; CORTEX-A35: .eabi_attribute 38, 1 -; CORTEX-A35-FAST-NOT: .eabi_attribute 19 -;; The A35 has the ARMv8 FP unit, which always flushes preserving sign. 
-; CORTEX-A35-FAST: .eabi_attribute 20, 2 -; CORTEX-A35-FAST-NOT: .eabi_attribute 21 -; CORTEX-A35-FAST-NOT: .eabi_attribute 22 -; CORTEX-A35-FAST: .eabi_attribute 23, 1 ; CORTEX-A53: .cpu cortex-a53 ; CORTEX-A53: .eabi_attribute 6, 14 @@ -1489,12 +1200,6 @@ ; CORTEX-A53-NOT: .eabi_attribute 28 ; CORTEX-A53: .eabi_attribute 38, 1 -; CORTEX-A53-FAST-NOT: .eabi_attribute 19 -;; The A53 has the ARMv8 FP unit, which always flushes preserving sign. -; CORTEX-A53-FAST: .eabi_attribute 20, 2 -; CORTEX-A53-FAST-NOT: .eabi_attribute 21 -; CORTEX-A53-FAST-NOT: .eabi_attribute 22 -; CORTEX-A53-FAST: .eabi_attribute 23, 1 ; CORTEX-A57: .cpu cortex-a57 ; CORTEX-A57: .eabi_attribute 6, 14 @@ -1519,12 +1224,6 @@ ; CORTEX-A57-NOT: .eabi_attribute 28 ; CORTEX-A57: .eabi_attribute 38, 1 -; CORTEX-A57-FAST-NOT: .eabi_attribute 19 -;; The A57 has the ARMv8 FP unit, which always flushes preserving sign. -; CORTEX-A57-FAST: .eabi_attribute 20, 2 -; CORTEX-A57-FAST-NOT: .eabi_attribute 21 -; CORTEX-A57-FAST-NOT: .eabi_attribute 22 -; CORTEX-A57-FAST: .eabi_attribute 23, 1 ; CORTEX-A72: .cpu cortex-a72 ; CORTEX-A72: .eabi_attribute 6, 14 @@ -1549,12 +1248,6 @@ ; CORTEX-A72-NOT: .eabi_attribute 28 ; CORTEX-A72: .eabi_attribute 38, 1 -; CORTEX-A72-FAST-NOT: .eabi_attribute 19 -;; The A72 has the ARMv8 FP unit, which always flushes preserving sign. -; CORTEX-A72-FAST: .eabi_attribute 20, 2 -; CORTEX-A72-FAST-NOT: .eabi_attribute 21 -; CORTEX-A72-FAST-NOT: .eabi_attribute 22 -; CORTEX-A72-FAST: .eabi_attribute 23, 1 ; CORTEX-A73: .cpu cortex-a73 ; CORTEX-A73: .eabi_attribute 6, 14 @@ -1580,12 +1273,6 @@ ; CORTEX-A73: .eabi_attribute 38, 1 ; CORTEX-A73: .eabi_attribute 14, 0 -; EXYNOS-FAST-NOT: .eabi_attribute 19 -;; The Exynos processors have the ARMv8 FP unit, which always flushes preserving sign. -; EXYNOS-FAST: .eabi_attribute 20, 2 -; EXYNOS-FAST-NOT: .eabi_attribute 21 -; EXYNOS-FAST-NOT: .eabi_attribute 22 -; EXYNOS-FAST: .eabi_attribute 23, 1 ; EXYNOS-M3: .cpu exynos-m3 ; EXYNOS-M3: .eabi_attribute 6, 14 @@ -1684,12 +1371,6 @@ ; GENERIC-ARMV8_1-A-NOT: .eabi_attribute 28 ; GENERIC-ARMV8_1-A: .eabi_attribute 38, 1 -; GENERIC-ARMV8_1-A-FAST-NOT: .eabi_attribute 19 -;; GENERIC-ARMV8_1-A has the ARMv8 FP unit, which always flushes preserving sign. 
-; GENERIC-ARMV8_1-A-FAST: .eabi_attribute 20, 2 -; GENERIC-ARMV8_1-A-FAST-NOT: .eabi_attribute 21 -; GENERIC-ARMV8_1-A-FAST-NOT: .eabi_attribute 22 -; GENERIC-ARMV8_1-A-FAST: .eabi_attribute 23, 1 ; RELOC-PIC: .eabi_attribute 15, 1 ; RELOC-PIC: .eabi_attribute 16, 1 diff --git a/llvm/test/CodeGen/ARM/fadds.ll b/llvm/test/CodeGen/ARM/fadds.ll index b5d3bdae1f9d3..191d5b3c13d26 100644 --- a/llvm/test/CodeGen/ARM/fadds.ll +++ b/llvm/test/CodeGen/ARM/fadds.ll @@ -7,7 +7,7 @@ ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \ ; RUN: | FileCheck %s -check-prefix=CORTEXA8 -; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math %s -o - \ +; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --denormal-fp-math=preserve-sign %s -o - \ ; RUN: | FileCheck %s -check-prefix=CORTEXA8U ; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \ diff --git a/llvm/test/CodeGen/ARM/fmuls.ll b/llvm/test/CodeGen/ARM/fmuls.ll index b24d867a7e866..a390a242e5918 100644 --- a/llvm/test/CodeGen/ARM/fmuls.ll +++ b/llvm/test/CodeGen/ARM/fmuls.ll @@ -7,7 +7,7 @@ ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \ ; RUN: | FileCheck %s -check-prefix=CORTEXA8 -; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math %s -o - \ +; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --denormal-fp-math=preserve-sign %s -o - \ ; RUN: | FileCheck %s -check-prefix=CORTEXA8U ; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \ diff --git a/llvm/test/CodeGen/ARM/fnegs.ll b/llvm/test/CodeGen/ARM/fnegs.ll index 435a600822e4d..6055b8f6dd93b 100644 --- a/llvm/test/CodeGen/ARM/fnegs.ll +++ b/llvm/test/CodeGen/ARM/fnegs.ll @@ -10,11 +10,11 @@ ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \ ; RUN: | FileCheck %s -check-prefix=CORTEXA8 -; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math %s -o - \ +; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --denormal-fp-math=preserve-sign %s -o - \ ; RUN: | FileCheck %s -check-prefix=CORTEXA8U ; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \ -; RUN: | FileCheck %s -check-prefix=CORTEXA8U +; RUN: | FileCheck %s -check-prefix=CORTEXA8U-DARWIN ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a9 %s -o - \ ; RUN: | FileCheck %s -check-prefix=CORTEXA9 @@ -41,7 +41,10 @@ entry: ; CORTEXA8: vneg.f32 s{{.*}}, s{{.*}} ; CORTEXA8U-LABEL: test1: -; CORTEXA8U: vneg.f32 d{{.*}}, d{{.*}} +; CORTEXA8U: vsub.f32 d{{.*}}, d{{.*}}, d{{.*}} + +; CORTEXA8U-DARWIN-LABEL: test1: +; CORTEXA8U-DARWIN: vneg.f32 d{{.*}}, d{{.*}} ; CORTEXA9-LABEL: test1: ; CORTEXA9: vneg.f32 s{{.*}}, s{{.*}} @@ -110,9 +113,13 @@ define <2 x float> @fneg_bitcast(i64 %i) { ; CORTEXA8-NOT: vneg.f32 ; CORTEXA8U-LABEL: fneg_bitcast: -; CORTEXA8U-DAG: eor r0, r0, #-2147483648 -; CORTEXA8U-DAG: eor r1, r1, #-2147483648 -; CORTEXA8U-NOT: vneg.f32 +; CORTEXA8U-DAG: vmov.i32 d{{.*}}, #0x80000000 +; CORTEXA8U-DAG: vsub.f32 d{{.*}}, d{{.*}}, d{{.*}} + +; CORTEXA8U-DARWIN-LABEL: fneg_bitcast: +; CORTEXA8U-DARWIN-DAG: eor r0, r0, #-2147483648 +; CORTEXA8U-DARWIN-DAG: eor r1, r1, #-2147483648 +; CORTEXA8U-DARWIN-NOT: vneg.f32 ; CORTEXA9-LABEL: fneg_bitcast: ; CORTEXA9-DAG: eor r0, r0, #-2147483648 diff --git a/llvm/test/CodeGen/ARM/fnmscs.ll b/llvm/test/CodeGen/ARM/fnmscs.ll index 0fa878c0c2f49..49f9dcf32f544 100644 --- a/llvm/test/CodeGen/ARM/fnmscs.ll +++ b/llvm/test/CodeGen/ARM/fnmscs.ll @@ -13,11 +13,11 @@ ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 -regalloc=basic %s -o - \ ; RUN: | FileCheck %s -check-prefix=A8 -; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math %s -o - \ +; RUN: llc 
-mtriple=arm-eabi -mcpu=cortex-a8 --denormal-fp-math=preserve-sign %s -o - \ ; RUN: | FileCheck %s -check-prefix=A8U ; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \ -; RUN: | FileCheck %s -check-prefix=A8U +; RUN: | FileCheck %s -check-prefix=A8U-DARWIN define float @t1(float %acc, float %a, float %b) nounwind { entry: @@ -31,15 +31,20 @@ entry: ; NEON: vnmla.f32 ; A8U-LABEL: t1: -; A8U: vnmul.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}} -; A8U: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}} +; A8U: vmov.i32 d{{[0-9]+}}, #0x80000000 +; A8U: vsub.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; A8U: vsub.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + +; A8U-DARWIN-LABEL: t1: +; A8U-DARWIN: vnmul.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}} +; A8U-DARWIN: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}} ; A8-LABEL: t1: ; A8: vnmul.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}} ; A8: vsub.f32 s{{[0-9]}}, s{{[0-9]}}, s{{[0-9]}} %0 = fmul float %a, %b %1 = fsub float -0.0, %0 - %2 = fsub float %1, %acc + %2 = fsub float %1, %acc ret float %2 } @@ -55,8 +60,13 @@ entry: ; NEON: vnmla.f32 ; A8U-LABEL: t2: -; A8U: vnmul.f32 s{{[01234]}}, s{{[01234]}}, s{{[01234]}} -; A8U: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}} +; A8U: vmov.i32 d{{[0-9]+}}, #0x80000000 +; A8U: vsub.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} +; A8U: vsub.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}} + +; A8U-DARWIN-LABEL: t2: +; A8U-DARWIN: vnmul.f32 s{{[01234]}}, s{{[01234]}}, s{{[01234]}} +; A8U-DARWIN: vsub.f32 d{{[0-9]}}, d{{[0-9]}}, d{{[0-9]}} ; A8-LABEL: t2: ; A8: vnmul.f32 s{{[01234]}}, s{{[01234]}}, s{{[01234]}} @@ -79,8 +89,12 @@ entry: ; NEON: vnmla.f64 ; A8U-LABEL: t3: -; A8U: vnmul.f64 d ; A8U: vsub.f64 d +; A8U: vsub.f64 d + +; A8U-DARWIN-LABEL: t3: +; A8U-DARWIN: vnmul.f64 d +; A8U-DARWIN: vsub.f64 d ; A8-LABEL: t3: ; A8: vnmul.f64 d @@ -103,8 +117,12 @@ entry: ; NEON: vnmla.f64 ; A8U-LABEL: t4: -; A8U: vnmul.f64 d ; A8U: vsub.f64 d +; A8U: vsub.f64 d + +; A8U-DARWIN-LABEL: t4: +; A8U-DARWIN: vnmul.f64 d +; A8U-DARWIN: vsub.f64 d ; A8-LABEL: t4: ; A8: vnmul.f64 d diff --git a/llvm/test/CodeGen/ARM/fnmul.ll b/llvm/test/CodeGen/ARM/fnmul.ll index b021de8b7ad00..655c9f8415402 100644 --- a/llvm/test/CodeGen/ARM/fnmul.ll +++ b/llvm/test/CodeGen/ARM/fnmul.ll @@ -1,15 +1,30 @@ -; RUN: llc -mtriple=arm-eabi -mattr=+v6,+vfp2 %s -o - | FileCheck %s -check-prefix STRICT - -; RUN: llc -mtriple=arm-eabi -mattr=+v6,+vfp2 -enable-unsafe-fp-math %s -o - | FileCheck %s -check-prefix UNSAFE +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=arm-eabi -mattr=+v6,+vfp2 %s -o - | FileCheck %s define double @t1(double %a, double %b) { -; STRICT: vnmul.f64 -; -; UNSAFE: vnmul.f64 +; CHECK-LABEL: t1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vmov d1, r0, r1 +; CHECK-NEXT: vnmul.f64 d0, d1, d0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: bx lr entry: - %tmp2 = fsub double -0.000000e+00, %a ; [#uses=1] - %tmp4 = fmul double %tmp2, %b ; [#uses=1] - ret double %tmp4 + %tmp2 = fsub double -0.000000e+00, %a + %tmp4 = fmul double %tmp2, %b + ret double %tmp4 } - +define double @tfast(double %a, double %b) { +; CHECK-LABEL: tfast: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vmov d1, r0, r1 +; CHECK-NEXT: vnmul.f64 d0, d1, d0 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: bx lr +entry: + %tmp2 = fsub fast double -0.000000e+00, %a + %tmp4 = fmul fast double %tmp2, %b + ret double %tmp4 +} diff --git a/llvm/test/CodeGen/ARM/fp16-vminmaxnm.ll 
b/llvm/test/CodeGen/ARM/fp16-vminmaxnm.ll index 33ff71e8c473e..9d0ea0e2d37cf 100644 --- a/llvm/test/CodeGen/ARM/fp16-vminmaxnm.ll +++ b/llvm/test/CodeGen/ARM/fp16-vminmaxnm.ll @@ -1,5 +1,6 @@ -; RUN: llc < %s -mtriple=arm-eabi -mattr=+fullfp16 -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s -; RUN: llc < %s -mtriple thumbv7a -mattr=+fullfp16 -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=armv7-none-eabihf -mattr=+fullfp16 -enable-no-nans-fp-math | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7a-none-eabihf -mattr=+fullfp16 -enable-no-nans-fp-math | FileCheck %s ; TODO: we can't pass half-precision arguments as "half" types yet. We do ; that for the time being by passing "float %f.coerce" and the necessary @@ -9,9 +10,11 @@ define half @fp16_vminnm_o(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: fp16_vminnm_o: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vminnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vminnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr entry: %0 = bitcast i16 %a to half %1 = bitcast i16 %b to half @@ -22,9 +25,11 @@ entry: define half @fp16_vminnm_o_rev(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: fp16_vminnm_o_rev: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vminnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vminnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr entry: %0 = bitcast i16 %a to half %1 = bitcast i16 %b to half @@ -35,9 +40,11 @@ entry: define half @fp16_vminnm_u(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: fp16_vminnm_u: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vminnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vminnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr entry: %0 = bitcast i16 %a to half %1 = bitcast i16 %b to half @@ -48,9 +55,11 @@ entry: define half @fp16_vminnm_ule(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: fp16_vminnm_ule: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vminnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vminnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr entry: %0 = bitcast i16 %a to half %1 = bitcast i16 %b to half @@ -61,9 +70,11 @@ entry: define half @fp16_vminnm_u_rev(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: fp16_vminnm_u_rev: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vminnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vminnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr entry: %0 = bitcast i16 %a to half %1 = bitcast i16 %b to half @@ -74,9 +85,11 @@ entry: define half @fp16_vmaxnm_o(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: fp16_vmaxnm_o: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vmaxnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr entry: %0 = bitcast i16 %a to half 
%1 = bitcast i16 %b to half @@ -87,9 +100,11 @@ entry: define half @fp16_vmaxnm_oge(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: fp16_vmaxnm_oge: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vmaxnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr entry: %0 = bitcast i16 %a to half %1 = bitcast i16 %b to half @@ -100,9 +115,11 @@ entry: define half @fp16_vmaxnm_o_rev(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: fp16_vmaxnm_o_rev: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vmaxnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr entry: %0 = bitcast i16 %a to half %1 = bitcast i16 %b to half @@ -113,9 +130,11 @@ entry: define half @fp16_vmaxnm_ole_rev(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: fp16_vmaxnm_ole_rev: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vmaxnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr entry: %0 = bitcast i16 %a to half %1 = bitcast i16 %b to half @@ -126,9 +145,11 @@ entry: define half @fp16_vmaxnm_u(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: fp16_vmaxnm_u: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vmaxnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr entry: %0 = bitcast i16 %a to half %1 = bitcast i16 %b to half @@ -139,9 +160,11 @@ entry: define half @fp16_vmaxnm_uge(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: fp16_vmaxnm_uge: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vmaxnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr entry: %0 = bitcast i16 %a to half %1 = bitcast i16 %b to half @@ -152,9 +175,11 @@ entry: define half @fp16_vmaxnm_u_rev(i16 signext %a, i16 signext %b) { ; CHECK-LABEL: fp16_vmaxnm_u_rev: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vmaxnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r1 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr entry: %0 = bitcast i16 %a to half %1 = bitcast i16 %b to half @@ -167,11 +192,17 @@ entry: define half @fp16_vminnm_NNNo(i16 signext %a) { ; CHECK-LABEL: fp16_vminnm_NNNo: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], #1.200000e+01 -; CHECK: vminnm.f16 s0, [[S0]], [[S2]] -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vminnm.f16 s0, [[S0]], [[S2]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vmov.f16 s2, #1.200000e+01 +; CHECK-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NEXT: vldr.16 s2, .LCPI12_0 +; CHECK-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI12_0: +; CHECK-NEXT: .short 0x5040 @ half 34 entry: %0 = bitcast i16 %a to half %cmp1 = fcmp fast olt half %0, 12. 
@@ -183,11 +214,19 @@ entry: define half @fp16_vminnm_NNNo_rev(i16 signext %a) { ; CHECK-LABEL: fp16_vminnm_NNNo_rev: -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vminnm.f16 s0, [[S0]], [[S2]] -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vminnm.f16 s0, [[S0]], [[S2]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldr.16 s2, .LCPI13_0 +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NEXT: vldr.16 s2, .LCPI13_1 +; CHECK-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI13_0: +; CHECK-NEXT: .short 0x5300 @ half 56 +; CHECK-NEXT: .LCPI13_1: +; CHECK-NEXT: .short 0x54e0 @ half 78 entry: %0 = bitcast i16 %a to half %cmp1 = fcmp fast ogt half %0, 56. @@ -199,11 +238,17 @@ entry: define half @fp16_vminnm_NNNu(i16 signext %b) { ; CHECK-LABEL: fp16_vminnm_NNNu: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], #1.200000e+01 -; CHECK: vminnm.f16 s0, [[S0]], [[S2]] -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vminnm.f16 s0, [[S0]], [[S2]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vmov.f16 s2, #1.200000e+01 +; CHECK-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NEXT: vldr.16 s2, .LCPI14_0 +; CHECK-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI14_0: +; CHECK-NEXT: .short 0x5040 @ half 34 entry: %0 = bitcast i16 %b to half %cmp1 = fcmp fast ult half 12., %0 @@ -215,11 +260,19 @@ entry: define half @fp16_vminnm_NNNule(i16 signext %b) { ; CHECK-LABEL: fp16_vminnm_NNNule: -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vminnm.f16 s0, [[S0]], [[S2]] -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vminnm.f16 s0, [[S0]], [[S2]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldr.16 s2, .LCPI15_0 +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NEXT: vldr.16 s2, .LCPI15_1 +; CHECK-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI15_0: +; CHECK-NEXT: .short 0x5040 @ half 34 +; CHECK-NEXT: .LCPI15_1: +; CHECK-NEXT: .short 0x5300 @ half 56 entry: %0 = bitcast i16 %b to half %cmp1 = fcmp fast ule half 34., %0 @@ -231,11 +284,19 @@ entry: define half @fp16_vminnm_NNNu_rev(i16 signext %b) { ; CHECK-LABEL: fp16_vminnm_NNNu_rev: -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vminnm.f16 s0, [[S0]], [[S2]] -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vminnm.f16 s0, [[S0]], [[S2]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldr.16 s2, .LCPI16_0 +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NEXT: vldr.16 s2, .LCPI16_1 +; CHECK-NEXT: vminnm.f16 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI16_0: +; CHECK-NEXT: .short 0x5300 @ half 56 +; CHECK-NEXT: .LCPI16_1: +; CHECK-NEXT: .short 0x54e0 @ half 78 entry: %0 = bitcast i16 %b to half %cmp1 = fcmp fast ugt half 56., %0 @@ -247,11 +308,17 @@ entry: define half @fp16_vmaxnm_NNNo(i16 signext %a) { ; CHECK-LABEL: fp16_vmaxnm_NNNo: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], #1.200000e+01 -; CHECK: vmaxnm.f16 s0, [[S0]], [[S2]] -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vmaxnm.f16 s0, [[S0]], [[S2]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vmov.f16 s2, #1.200000e+01 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NEXT: 
vldr.16 s2, .LCPI17_0 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI17_0: +; CHECK-NEXT: .short 0x5040 @ half 34 entry: %0 = bitcast i16 %a to half %cmp1 = fcmp fast ogt half %0, 12. @@ -263,11 +330,19 @@ entry: define half @fp16_vmaxnm_NNNoge(i16 signext %a) { ; CHECK-LABEL: fp16_vmaxnm_NNNoge: -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmaxnm.f16 s0, [[S0]], [[S2]] -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vmaxnm.f16 s0, [[S0]], [[S2]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldr.16 s2, .LCPI18_0 +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NEXT: vldr.16 s2, .LCPI18_1 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI18_0: +; CHECK-NEXT: .short 0x5040 @ half 34 +; CHECK-NEXT: .LCPI18_1: +; CHECK-NEXT: .short 0x5300 @ half 56 entry: %0 = bitcast i16 %a to half %cmp1 = fcmp fast oge half %0, 34. @@ -279,11 +354,19 @@ entry: define half @fp16_vmaxnm_NNNo_rev(i16 signext %a) { ; CHECK-LABEL: fp16_vmaxnm_NNNo_rev: -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmaxnm.f16 s0, [[S0]], [[S2]] -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vmaxnm.f16 s0, [[S0]], [[S2]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldr.16 s2, .LCPI19_0 +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NEXT: vldr.16 s2, .LCPI19_1 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI19_0: +; CHECK-NEXT: .short 0x5300 @ half 56 +; CHECK-NEXT: .LCPI19_1: +; CHECK-NEXT: .short 0x54e0 @ half 78 entry: %0 = bitcast i16 %a to half %cmp1 = fcmp fast olt half %0, 56. @@ -295,11 +378,19 @@ entry: define half @fp16_vmaxnm_NNNole_rev(i16 signext %a) { ; CHECK-LABEL: fp16_vmaxnm_NNNole_rev: -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmaxnm.f16 s0, [[S0]], [[S2]] -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vmaxnm.f16 s0, [[S0]], [[S2]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldr.16 s2, .LCPI20_0 +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NEXT: vldr.16 s2, .LCPI20_1 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI20_0: +; CHECK-NEXT: .short 0x54e0 @ half 78 +; CHECK-NEXT: .LCPI20_1: +; CHECK-NEXT: .short 0x55a0 @ half 90 entry: %0 = bitcast i16 %a to half %cmp1 = fcmp fast ole half %0, 78. 
@@ -311,11 +402,17 @@ entry: define half @fp16_vmaxnm_NNNu(i16 signext %b) { ; CHECK-LABEL: fp16_vmaxnm_NNNu: -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmov.f16 [[S2:s[0-9]]], #1.200000e+01 -; CHECK: vmaxnm.f16 s0, [[S0]], [[S2]] -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vmaxnm.f16 s0, [[S0]], [[S2]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vmov.f16 s2, #1.200000e+01 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NEXT: vldr.16 s2, .LCPI21_0 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI21_0: +; CHECK-NEXT: .short 0x5040 @ half 34 entry: %0 = bitcast i16 %b to half %cmp1 = fcmp fast ugt half 12., %0 @@ -327,11 +424,19 @@ entry: define half @fp16_vmaxnm_NNNuge(i16 signext %b) { ; CHECK-LABEL: fp16_vmaxnm_NNNuge: -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmaxnm.f16 s0, [[S0]], [[S2]] -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vmaxnm.f16 s0, [[S0]], [[S2]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldr.16 s2, .LCPI22_0 +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NEXT: vldr.16 s2, .LCPI22_1 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI22_0: +; CHECK-NEXT: .short 0x5040 @ half 34 +; CHECK-NEXT: .LCPI22_1: +; CHECK-NEXT: .short 0x5300 @ half 56 entry: %0 = bitcast i16 %b to half %cmp1 = fcmp fast uge half 34., %0 @@ -343,11 +448,19 @@ entry: define half @fp16_vmaxnm_NNNu_rev(i16 signext %b) { ; CHECK-LABEL: fp16_vmaxnm_NNNu_rev: -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vmov.f16 [[S0:s[0-9]]], r{{.}} -; CHECK: vmaxnm.f16 s0, [[S0]], [[S2]] -; CHECK: vldr.16 s2, .LCPI{{.*}} -; CHECK: vmaxnm.f16 s0, [[S0]], [[S2]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldr.16 s2, .LCPI23_0 +; CHECK-NEXT: vmov.f16 s0, r0 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NEXT: vldr.16 s2, .LCPI23_1 +; CHECK-NEXT: vmaxnm.f16 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI23_0: +; CHECK-NEXT: .short 0x5300 @ half 56 +; CHECK-NEXT: .LCPI23_1: +; CHECK-NEXT: .short 0x54e0 @ half 78 entry: %0 = bitcast i16 %b to half %cmp1 = fcmp fast ult half 56., %0 @@ -359,10 +472,16 @@ entry: define half @fp16_vminmaxnm_0(i16 signext %a) { ; CHECK-LABEL: fp16_vminmaxnm_0: -; CHECK: vldr.16 s0, .LCPI{{.*}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vminnm.f16 s2, s2, s0 -; CHECK: vmaxnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldr.16 s0, .LCPI24_0 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vminnm.f16 s2, s2, s0 +; CHECK-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI24_0: +; CHECK-NEXT: .short 0x0000 @ half 0 entry: %0 = bitcast i16 %a to half %cmp1 = fcmp fast olt half %0, 0. 
@@ -374,10 +493,16 @@ entry: define half @fp16_vminmaxnm_neg0(i16 signext %a) { ; CHECK-LABEL: fp16_vminmaxnm_neg0: -; CHECK: vldr.16 s0, .LCPI{{.*}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vminnm.f16 s2, s2, s0 -; CHECK: vmaxnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldr.16 s0, .LCPI25_0 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vminnm.f16 s2, s2, s0 +; CHECK-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI25_0: +; CHECK-NEXT: .short 0x8000 @ half -0 entry: %0 = bitcast i16 %a to half %cmp1 = fcmp fast olt half %0, -0. @@ -389,10 +514,16 @@ entry: define half @fp16_vminmaxnm_e_0(i16 signext %a) { ; CHECK-LABEL: fp16_vminmaxnm_e_0: -; CHECK: vldr.16 s0, .LCPI{{.*}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vminnm.f16 s2, s2, s0 -; CHECK: vmaxnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldr.16 s0, .LCPI26_0 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vminnm.f16 s2, s2, s0 +; CHECK-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI26_0: +; CHECK-NEXT: .short 0x0000 @ half 0 entry: %0 = bitcast i16 %a to half %cmp1 = fcmp fast ule half 0., %0 @@ -404,10 +535,16 @@ entry: define half @fp16_vminmaxnm_e_neg0(i16 signext %a) { ; CHECK-LABEL: fp16_vminmaxnm_e_neg0: -; CHECK: vldr.16 s0, .LCPI{{.*}} -; CHECK: vmov.f16 [[S2:s[0-9]]], r{{.}} -; CHECK: vminnm.f16 s2, s2, s0 -; CHECK: vmaxnm.f16 s0, [[S2]], [[S0]] +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldr.16 s0, .LCPI27_0 +; CHECK-NEXT: vmov.f16 s2, r0 +; CHECK-NEXT: vminnm.f16 s2, s2, s0 +; CHECK-NEXT: vmaxnm.f16 s0, s2, s0 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI27_0: +; CHECK-NEXT: .short 0x8000 @ half -0 entry: %0 = bitcast i16 %a to half %cmp1 = fcmp fast ule half -0., %0 diff --git a/llvm/test/CodeGen/ARM/fp16.ll b/llvm/test/CodeGen/ARM/fp16.ll deleted file mode 100644 index 9ff701050ac7e..0000000000000 --- a/llvm/test/CodeGen/ARM/fp16.ll +++ /dev/null @@ -1,105 +0,0 @@ -; RUN: llc -mtriple=armv7a--none-eabi < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-HARDFLOAT-EABI %s -; RUN: llc -mtriple=armv7a--none-gnueabi < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-HARDFLOAT-GNU %s -; RUN: llc -mtriple=armv7a--none-musleabi < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-HARDFLOAT-GNU %s -; RUN: llc -mtriple=armv8-eabihf < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARMV8 %s -; RUN: llc -mtriple=thumbv7m-eabi < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-SOFTFLOAT-EABI %s -; RUN: llc -mtriple=thumbv7m-gnueabi < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-SOFTFLOAT-GNU %s -; RUN: llc -mtriple=thumbv7m-musleabi < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-SOFTFLOAT-GNU %s - -;; +fp16 is special: it has f32->f16 (unlike v7), but not f64->f16 (unlike v8). -;; This exposes unsafe-fp-math optimization opportunities; test that. 
-; RUN: llc -mattr=+vfp3,+fp16 < %s |\ -; RUN: FileCheck --check-prefix=CHECK --check-prefix=CHECK-FP16 --check-prefix=CHECK-FP16-SAFE %s -; RUN: llc -mattr=+vfp3,+fp16 < %s -enable-unsafe-fp-math |\ -; RUN: FileCheck --check-prefix=CHECK --check-prefix=CHECK-FP16 --check-prefix=CHECK-FP16-UNSAFE %s - -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32" -target triple = "armv7---eabihf" - -@x = global i16 12902 -@y = global i16 0 -@z = common global i16 0 - -define void @foo() nounwind { -; CHECK-LABEL: foo: -entry: - %0 = load i16, ptr @x, align 2 - %1 = load i16, ptr @y, align 2 - %2 = tail call float @llvm.convert.from.fp16.f32(i16 %0) -; CHECK-HARDFLOAT-EABI: __aeabi_h2f -; CHECK-HARDFLOAT-GNU: __gnu_h2f_ieee -; CHECK-FP16: vcvtb.f32.f16 -; CHECK-ARMV8: vcvtb.f32.f16 -; CHECK-SOFTFLOAT-EABI: __aeabi_h2f -; CHECK-SOFTFLOAT-GNU: __gnu_h2f_ieee - %3 = tail call float @llvm.convert.from.fp16.f32(i16 %1) -; CHECK-HARDFLOAT-EABI: __aeabi_h2f -; CHECK-HARDFLOAT-GNU: __gnu_h2f_ieee -; CHECK-FP16: vcvtb.f32.f16 -; CHECK-ARMV8: vcvtb.f32.f16 -; CHECK-SOFTFLOAT-EABI: __aeabi_h2f -; CHECK-SOFTFLOAT-GNU: __gnu_h2f_ieee - %4 = fadd float %2, %3 - %5 = tail call i16 @llvm.convert.to.fp16.f32(float %4) -; CHECK-HARDFLOAT-EABI: __aeabi_f2h -; CHECK-HARDFLOAT-GNU: __gnu_f2h_ieee -; CHECK-FP16: vcvtb.f16.f32 -; CHECK-ARMV8: vcvtb.f16.f32 -; CHECK-SOFTFLOAT-EABI: __aeabi_f2h -; CHECK-SOFTFLOAT-GNU: __gnu_f2h_ieee - store i16 %5, ptr @x, align 2 - ret void -} - -define double @test_from_fp16(i16 %in) { -; CHECK-LABEL: test_from_fp16: - %val = call double @llvm.convert.from.fp16.f64(i16 %in) -; CHECK-HARDFLOAT-EABI: bl __aeabi_h2f -; CHECK-HARDFLOAT-EABI: vmov [[TMP:s[0-9]+]], r0 -; CHECK-HARDFLOAT-EABI: vcvt.f64.f32 {{d[0-9]+}}, [[TMP]] - -; CHECK-HARDFLOAT-GNU: bl __gnu_h2f_ieee -; CHECK-HARDFLOAT-GNU: vmov [[TMP:s[0-9]+]], r0 -; CHECK-HARDFLOAT-GNU: vcvt.f64.f32 {{d[0-9]+}}, [[TMP]] - -; CHECK-FP16: vmov [[TMP16:s[0-9]+]], r0 -; CHECK-FP16: vcvtb.f32.f16 [[TMP32:s[0-9]+]], [[TMP16]] -; CHECK-FP16: vcvt.f64.f32 d0, [[TMP32]] - -; CHECK-ARMV8: vmov [[TMP:s[0-9]+]], r0 -; CHECK-ARMV8: vcvtb.f64.f16 d0, [[TMP]] - -; CHECK-SOFTFLOAT-EABI: bl __aeabi_h2f -; CHECK-SOFTFLOAT-EABI: bl __aeabi_f2d - -; CHECK-SOFTFLOAT-GNU: bl __gnu_h2f_ieee -; CHECK-SOFTFLOAT-GNU: bl __aeabi_f2d - ret double %val -} - -define i16 @test_to_fp16(double %in) { -; CHECK-LABEL: test_to_fp16: - %val = call i16 @llvm.convert.to.fp16.f64(double %in) -; CHECK-HARDFLOAT-EABI: bl __aeabi_d2h - -; CHECK-HARDFLOAT-GNU: bl __aeabi_d2h - -; CHECK-FP16-SAFE: bl __aeabi_d2h - -; CHECK-FP16-UNSAFE: vmov r0, r1, d0 -; CHECK-FP16-UNSAFE-NEXT: bl __aeabi_d2h - -; CHECK-ARMV8: vcvtb.f16.f64 [[TMP:s[0-9]+]], d0 -; CHECK-ARMV8: vmov r0, [[TMP]] - -; CHECK-SOFTFLOAT-EABI: bl __aeabi_d2h - -; CHECK-SOFTFLOAT-GNU: bl __aeabi_d2h - ret i16 %val -} - -declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone -declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone - -declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone -declare i16 @llvm.convert.to.fp16.f64(double) nounwind readnone diff --git a/llvm/test/CodeGen/ARM/fp_convert.ll b/llvm/test/CodeGen/ARM/fp_convert.ll index 6f4707573fb50..0b749bf1c7ad4 100644 --- a/llvm/test/CodeGen/ARM/fp_convert.ll +++ b/llvm/test/CodeGen/ARM/fp_convert.ll @@ -7,7 +7,7 @@ ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \ ; RUN: | FileCheck %s -check-prefix=VFP2 -; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 
--enable-unsafe-fp-math %s -o - \ +; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --denormal-fp-math=preserve-sign %s -o - \ ; RUN: | FileCheck %s -check-prefix=NEON ; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \ diff --git a/llvm/test/CodeGen/ARM/fsubs.ll b/llvm/test/CodeGen/ARM/fsubs.ll index baff34ab31fcf..7170f04ea0dd3 100644 --- a/llvm/test/CodeGen/ARM/fsubs.ll +++ b/llvm/test/CodeGen/ARM/fsubs.ll @@ -4,7 +4,7 @@ ; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 %s -o - \ ; RUN: | FileCheck %s -check-prefix=NFP1 -; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --enable-unsafe-fp-math %s -o - \ +; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 --denormal-fp-math=preserve-sign %s -o - \ ; RUN: | FileCheck %s -check-prefix=NFP1U ; RUN: llc -mtriple=arm-darwin -mcpu=cortex-a8 %s -o - \ diff --git a/llvm/test/CodeGen/ARM/neon-spfp.ll b/llvm/test/CodeGen/ARM/neon-spfp.ll index cbf25965a2fac..70a809583ff65 100644 --- a/llvm/test/CodeGen/ARM/neon-spfp.ll +++ b/llvm/test/CodeGen/ARM/neon-spfp.ll @@ -4,11 +4,11 @@ ; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a15 | FileCheck %s -check-prefix=CHECK-LINUXA15 ; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=swift | FileCheck %s -check-prefix=CHECK-LINUXSWIFT -; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a5 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK-UNSAFEA5 -; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a8 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK-UNSAFEA8 -; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a9 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK-UNSAFEA9 -; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a15 --enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK-UNSAFEA15 -; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=swift --enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK-UNSAFESWIFT +; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a5 --denormal-fp-math=preserve-sign | FileCheck %s -check-prefix=CHECK-UNSAFEA5 +; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a8 --denormal-fp-math=preserve-sign | FileCheck %s -check-prefix=CHECK-UNSAFEA8 +; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a9 --denormal-fp-math=preserve-sign | FileCheck %s -check-prefix=CHECK-UNSAFEA9 +; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=cortex-a15 --denormal-fp-math=preserve-sign| FileCheck %s -check-prefix=CHECK-UNSAFEA15 +; RUN: llc < %s -mtriple armv7a-none-linux-gnueabihf -mcpu=swift --denormal-fp-math=preserve-sign | FileCheck %s -check-prefix=CHECK-UNSAFESWIFT ; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a5 | FileCheck %s -check-prefix=CHECK-DARWINA5 ; RUN: llc < %s -mtriple armv7a-none-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK-DARWINA8 diff --git a/llvm/test/CodeGen/ARM/sincos.ll b/llvm/test/CodeGen/ARM/sincos.ll index dc8fdf69ca610..e1b683a8a6657 100644 --- a/llvm/test/CodeGen/ARM/sincos.ll +++ b/llvm/test/CodeGen/ARM/sincos.ll @@ -1,8 +1,7 @@ ; RUN: llc < %s -mtriple=armv7-apple-ios6 -mcpu=cortex-a8 | FileCheck %s --check-prefix=NOOPT ; RUN: llc < %s -mtriple=armv7-apple-ios7 -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS ; RUN: llc < %s -mtriple=armv7-linux-gnu -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS-GNU -; RUN: llc < %s -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 \ -; RUN: --enable-unsafe-fp-math | FileCheck %s --check-prefix=SINCOS-GNU +; RUN: llc < %s 
-mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS-GNU ; RUN: llc < %s -mtriple=armv7-linux-android -mcpu=cortex-a8 | FileCheck %s --check-prefix=NOOPT-ANDROID ; RUN: llc < %s -mtriple=armv7-linux-android9 -mcpu=cortex-a8 | FileCheck %s --check-prefix=SINCOS-GNU @@ -33,6 +32,28 @@ entry: ret float %add } +define float @test1_fast(float %x) nounwind { +entry: +; SINCOS-LABEL: test1_fast: +; SINCOS: bl ___sincosf_stret + +; SINCOS-GNU-LABEL: test1_fast: +; SINCOS-GNU: bl sincosf + +; NOOPT-LABEL: test1_fast: +; NOOPT: bl _sinf +; NOOPT: bl _cosf + +; NOOPT-ANDROID-LABEL: test1_fast: +; NOOPT-ANDROID: bl sinf +; NOOPT-ANDROID: bl cosf + + %call = tail call fast float @sinf(float %x) readnone + %call1 = tail call fast float @cosf(float %x) readnone + %add = fadd float %call, %call1 + ret float %add +} + define float @test1_errno(float %x) nounwind { entry: ; SINCOS-LABEL: test1_errno: @@ -79,6 +100,28 @@ entry: ret double %add } +define double @test2_fast(double %x) nounwind { +entry: +; SINCOS-LABEL: test2_fast: +; SINCOS: bl ___sincos_stret + +; SINCOS-GNU-LABEL: test2_fast: +; SINCOS-GNU: bl sincos + +; NOOPT-LABEL: test2_fast: +; NOOPT: bl _sin +; NOOPT: bl _cos + +; NOOPT-ANDROID-LABEL: test2_fast: +; NOOPT-ANDROID: bl sin +; NOOPT-ANDROID: bl cos + + %call = tail call fast double @sin(double %x) readnone + %call1 = tail call fast double @cos(double %x) readnone + %add = fadd double %call, %call1 + ret double %add +} + define double @test2_errno(double %x) nounwind { entry: ; SINCOS-LABEL: test2_errno: diff --git a/llvm/test/CodeGen/ARM/vminmaxnm.ll b/llvm/test/CodeGen/ARM/vminmaxnm.ll index bb3ea3067541e..be33dbfc61b04 100644 --- a/llvm/test/CodeGen/ARM/vminmaxnm.ll +++ b/llvm/test/CodeGen/ARM/vminmaxnm.ll @@ -1,146 +1,163 @@ -; RUN: llc < %s -mtriple armv8 -mattr=+neon,+fp-armv8 -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple armv8-none-eabihf -mattr=+neon,+fp-armv8 -enable-no-nans-fp-math | FileCheck %s ; scalars -define float @fp-armv8_vminnm_o(float %a, float %b) { -; CHECK-LABEL: "fp-armv8_vminnm_o": -; CHECK-NOT: vcmp -; CHECK: vminnm.f32 +define float @fparmv8_vminnm_o(float %a, float %b) { +; CHECK-LABEL: fparmv8_vminnm_o: +; CHECK: @ %bb.0: +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr %cmp = fcmp fast olt float %a, %b %cond = select nsz i1 %cmp, float %a, float %b ret float %cond } -define double @fp-armv8_vminnm_ole(double %a, double %b) { -; CHECK-LABEL: "fp-armv8_vminnm_ole": -; CHECK-NOT: vcmp -; CHECK: vminnm.f64 +define double @fparmv8_vminnm_ole(double %a, double %b) { +; CHECK-LABEL: fparmv8_vminnm_ole: +; CHECK: @ %bb.0: +; CHECK-NEXT: vminnm.f64 d0, d0, d1 +; CHECK-NEXT: bx lr %cmp = fcmp fast ole double %a, %b %cond = select nsz i1 %cmp, double %a, double %b ret double %cond } -define float @fp-armv8_vminnm_o_rev(float %a, float %b) { -; CHECK-LABEL: "fp-armv8_vminnm_o_rev": -; CHECK-NOT: vcmp -; CHECK: vminnm.f32 +define float @fparmv8_vminnm_o_rev(float %a, float %b) { +; CHECK-LABEL: fparmv8_vminnm_o_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr %cmp = fcmp fast ogt float %a, %b %cond = select nsz i1 %cmp, float %b, float %a ret float %cond } -define double @fp-armv8_vminnm_oge_rev(double %a, double %b) { -; CHECK-LABEL: "fp-armv8_vminnm_oge_rev": -; CHECK-NOT: vcmp -; CHECK: vminnm.f64 +define double @fparmv8_vminnm_oge_rev(double %a, double %b) { 
+; CHECK-LABEL: fparmv8_vminnm_oge_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vminnm.f64 d0, d0, d1 +; CHECK-NEXT: bx lr %cmp = fcmp fast oge double %a, %b %cond = select nsz i1 %cmp, double %b, double %a ret double %cond } -define float @fp-armv8_vminnm_u(float %a, float %b) { -; CHECK-LABEL: "fp-armv8_vminnm_u": -; CHECK-NOT: vcmp -; CHECK: vminnm.f32 +define float @fparmv8_vminnm_u(float %a, float %b) { +; CHECK-LABEL: fparmv8_vminnm_u: +; CHECK: @ %bb.0: +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr %cmp = fcmp fast ult float %a, %b %cond = select nsz i1 %cmp, float %a, float %b ret float %cond } -define float @fp-armv8_vminnm_ule(float %a, float %b) { -; CHECK-LABEL: "fp-armv8_vminnm_ule": -; CHECK-NOT: vcmp -; CHECK: vminnm.f32 +define float @fparmv8_vminnm_ule(float %a, float %b) { +; CHECK-LABEL: fparmv8_vminnm_ule: +; CHECK: @ %bb.0: +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr %cmp = fcmp fast ule float %a, %b %cond = select nsz i1 %cmp, float %a, float %b ret float %cond } -define float @fp-armv8_vminnm_u_rev(float %a, float %b) { -; CHECK-LABEL: "fp-armv8_vminnm_u_rev": -; CHECK-NOT: vcmp -; CHECK: vminnm.f32 +define float @fparmv8_vminnm_u_rev(float %a, float %b) { +; CHECK-LABEL: fparmv8_vminnm_u_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vminnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr %cmp = fcmp fast ugt float %a, %b %cond = select nsz i1 %cmp, float %b, float %a ret float %cond } -define double @fp-armv8_vminnm_uge_rev(double %a, double %b) { -; CHECK-LABEL: "fp-armv8_vminnm_uge_rev": -; CHECK-NOT: vcmp -; CHECK: vminnm.f64 +define double @fparmv8_vminnm_uge_rev(double %a, double %b) { +; CHECK-LABEL: fparmv8_vminnm_uge_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vminnm.f64 d0, d0, d1 +; CHECK-NEXT: bx lr %cmp = fcmp fast uge double %a, %b %cond = select nsz i1 %cmp, double %b, double %a ret double %cond } -define float @fp-armv8_vmaxnm_o(float %a, float %b) { -; CHECK-LABEL: "fp-armv8_vmaxnm_o": -; CHECK-NOT: vcmp -; CHECK: vmaxnm.f32 +define float @fparmv8_vmaxnm_o(float %a, float %b) { +; CHECK-LABEL: fparmv8_vmaxnm_o: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr %cmp = fcmp fast ogt float %a, %b %cond = select nsz i1 %cmp, float %a, float %b ret float %cond } -define float @fp-armv8_vmaxnm_oge(float %a, float %b) { -; CHECK-LABEL: "fp-armv8_vmaxnm_oge": -; CHECK-NOT: vcmp -; CHECK: vmaxnm.f32 +define float @fparmv8_vmaxnm_oge(float %a, float %b) { +; CHECK-LABEL: fparmv8_vmaxnm_oge: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr %cmp = fcmp fast oge float %a, %b %cond = select nsz i1 %cmp, float %a, float %b ret float %cond } -define float @fp-armv8_vmaxnm_o_rev(float %a, float %b) { -; CHECK-LABEL: "fp-armv8_vmaxnm_o_rev": -; CHECK-NOT: vcmp -; CHECK: vmaxnm.f32 +define float @fparmv8_vmaxnm_o_rev(float %a, float %b) { +; CHECK-LABEL: fparmv8_vmaxnm_o_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr %cmp = fcmp fast olt float %a, %b %cond = select nsz i1 %cmp, float %b, float %a ret float %cond } -define float @fp-armv8_vmaxnm_ole_rev(float %a, float %b) { -; CHECK-LABEL: "fp-armv8_vmaxnm_ole_rev": -; CHECK-NOT: vcmp -; CHECK: vmaxnm.f32 +define float @fparmv8_vmaxnm_ole_rev(float %a, float %b) { +; CHECK-LABEL: fparmv8_vmaxnm_ole_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr %cmp = fcmp fast ole float %a, %b %cond = select nsz i1 %cmp, float %b, float %a ret float %cond } -define float @fp-armv8_vmaxnm_u(float %a, float %b) { -; 
CHECK-LABEL: "fp-armv8_vmaxnm_u": -; CHECK-NOT: vcmp -; CHECK: vmaxnm.f32 +define float @fparmv8_vmaxnm_u(float %a, float %b) { +; CHECK-LABEL: fparmv8_vmaxnm_u: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr %cmp = fcmp fast ugt float %a, %b %cond = select nsz i1 %cmp, float %a, float %b ret float %cond } -define float @fp-armv8_vmaxnm_uge(float %a, float %b) { -; CHECK-LABEL: "fp-armv8_vmaxnm_uge": -; CHECK-NOT: vcmp -; CHECK: vmaxnm.f32 +define float @fparmv8_vmaxnm_uge(float %a, float %b) { +; CHECK-LABEL: fparmv8_vmaxnm_uge: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr %cmp = fcmp fast uge float %a, %b %cond = select nsz i1 %cmp, float %a, float %b ret float %cond } -define float @fp-armv8_vmaxnm_u_rev(float %a, float %b) { -; CHECK-LABEL: "fp-armv8_vmaxnm_u_rev": -; CHECK-NOT: vcmp -; CHECK: vmaxnm.f32 +define float @fparmv8_vmaxnm_u_rev(float %a, float %b) { +; CHECK-LABEL: fparmv8_vmaxnm_u_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmaxnm.f32 s0, s0, s1 +; CHECK-NEXT: bx lr %cmp = fcmp fast ult float %a, %b %cond = select nsz i1 %cmp, float %b, float %a ret float %cond } -define double @fp-armv8_vmaxnm_ule_rev(double %a, double %b) { -; CHECK-LABEL: "fp-armv8_vmaxnm_ule_rev": -; CHECK-NOT: vcmp -; CHECK: vmaxnm.f64 +define double @fparmv8_vmaxnm_ule_rev(double %a, double %b) { +; CHECK-LABEL: fparmv8_vmaxnm_ule_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmaxnm.f64 d0, d0, d1 +; CHECK-NEXT: bx lr %cmp = fcmp fast ule double %a, %b %cond = select nsz i1 %cmp, double %b, double %a ret double %cond @@ -148,10 +165,18 @@ define double @fp-armv8_vmaxnm_ule_rev(double %a, double %b) { ; known non-NaNs -define float @fp-armv8_vminnm_NNNo(float %a) { -; CHECK-LABEL: "fp-armv8_vminnm_NNNo": -; CHECK: vminnm.f32 -; CHECK: vminnm.f32 +define float @fparmv8_vminnm_NNNo(float %a) { +; CHECK-LABEL: fparmv8_vminnm_NNNo: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.f32 s2, #1.200000e+01 +; CHECK-NEXT: vldr s4, .LCPI16_0 +; CHECK-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI16_0: +; CHECK-NEXT: .long 0x42080000 @ float 34 %cmp1 = fcmp fast olt float %a, 12. %cond1 = select nsz i1 %cmp1, float %a, float 12. %cmp2 = fcmp fast olt float 34., %cond1 @@ -159,10 +184,22 @@ define float @fp-armv8_vminnm_NNNo(float %a) { ret float %cond2 } -define double @fp-armv8_vminnm_NNNole(double %a) { -; CHECK-LABEL: "fp-armv8_vminnm_NNNole": -; CHECK: vminnm.f64 -; CHECK: vminnm.f64 +define double @fparmv8_vminnm_NNNole(double %a) { +; CHECK-LABEL: fparmv8_vminnm_NNNole: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, .LCPI17_0 +; CHECK-NEXT: vldr d17, .LCPI17_1 +; CHECK-NEXT: vminnm.f64 d16, d0, d16 +; CHECK-NEXT: vminnm.f64 d0, d16, d17 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI17_0: +; CHECK-NEXT: .long 0 @ double 34 +; CHECK-NEXT: .long 1078001664 +; CHECK-NEXT: .LCPI17_1: +; CHECK-NEXT: .long 0 @ double 56 +; CHECK-NEXT: .long 1078722560 %cmp1 = fcmp fast ole double %a, 34. %cond1 = select nsz i1 %cmp1, double %a, double 34. 
%cmp2 = fcmp fast ole double 56., %cond1 @@ -170,10 +207,20 @@ define double @fp-armv8_vminnm_NNNole(double %a) { ret double %cond2 } -define float @fp-armv8_vminnm_NNNo_rev(float %a) { -; CHECK-LABEL: "fp-armv8_vminnm_NNNo_rev": -; CHECK: vminnm.f32 -; CHECK: vminnm.f32 +define float @fparmv8_vminnm_NNNo_rev(float %a) { +; CHECK-LABEL: fparmv8_vminnm_NNNo_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s2, .LCPI18_0 +; CHECK-NEXT: vldr s4, .LCPI18_1 +; CHECK-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI18_0: +; CHECK-NEXT: .long 0x42600000 @ float 56 +; CHECK-NEXT: .LCPI18_1: +; CHECK-NEXT: .long 0x429c0000 @ float 78 %cmp1 = fcmp fast ogt float %a, 56. %cond1 = select nsz i1 %cmp1, float 56., float %a %cmp2 = fcmp fast ogt float 78., %cond1 @@ -181,10 +228,22 @@ define float @fp-armv8_vminnm_NNNo_rev(float %a) { ret float %cond2 } -define double @fp-armv8_vminnm_NNNoge_rev(double %a) { -; CHECK-LABEL: "fp-armv8_vminnm_NNNoge_rev": -; CHECK: vminnm.f64 -; CHECK: vminnm.f64 +define double @fparmv8_vminnm_NNNoge_rev(double %a) { +; CHECK-LABEL: fparmv8_vminnm_NNNoge_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, .LCPI19_0 +; CHECK-NEXT: vldr d17, .LCPI19_1 +; CHECK-NEXT: vminnm.f64 d16, d0, d16 +; CHECK-NEXT: vminnm.f64 d0, d16, d17 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI19_0: +; CHECK-NEXT: .long 0 @ double 78 +; CHECK-NEXT: .long 1079214080 +; CHECK-NEXT: .LCPI19_1: +; CHECK-NEXT: .long 0 @ double 90 +; CHECK-NEXT: .long 1079410688 %cmp1 = fcmp fast oge double %a, 78. %cond1 = select nsz i1 %cmp1, double 78., double %a %cmp2 = fcmp fast oge double 90., %cond1 @@ -192,10 +251,18 @@ define double @fp-armv8_vminnm_NNNoge_rev(double %a) { ret double %cond2 } -define float @fp-armv8_vminnm_NNNu(float %b) { -; CHECK-LABEL: "fp-armv8_vminnm_NNNu": -; CHECK: vminnm.f32 -; CHECK: vminnm.f32 +define float @fparmv8_vminnm_NNNu(float %b) { +; CHECK-LABEL: fparmv8_vminnm_NNNu: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.f32 s2, #1.200000e+01 +; CHECK-NEXT: vldr s4, .LCPI20_0 +; CHECK-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI20_0: +; CHECK-NEXT: .long 0x42080000 @ float 34 %cmp1 = fcmp fast ult float 12., %b %cond1 = select nsz i1 %cmp1, float 12., float %b %cmp2 = fcmp fast ult float %cond1, 34. @@ -203,10 +270,20 @@ define float @fp-armv8_vminnm_NNNu(float %b) { ret float %cond2 } -define float @fp-armv8_vminnm_NNNule(float %b) { -; CHECK-LABEL: "fp-armv8_vminnm_NNNule": -; CHECK: vminnm.f32 -; CHECK: vminnm.f32 +define float @fparmv8_vminnm_NNNule(float %b) { +; CHECK-LABEL: fparmv8_vminnm_NNNule: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s2, .LCPI21_0 +; CHECK-NEXT: vldr s4, .LCPI21_1 +; CHECK-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI21_0: +; CHECK-NEXT: .long 0x42080000 @ float 34 +; CHECK-NEXT: .LCPI21_1: +; CHECK-NEXT: .long 0x42600000 @ float 56 %cmp1 = fcmp fast ule float 34., %b %cond1 = select nsz i1 %cmp1, float 34., float %b %cmp2 = fcmp fast ule float %cond1, 56. 
@@ -214,10 +291,20 @@ define float @fp-armv8_vminnm_NNNule(float %b) { ret float %cond2 } -define float @fp-armv8_vminnm_NNNu_rev(float %b) { -; CHECK-LABEL: "fp-armv8_vminnm_NNNu_rev": -; CHECK: vminnm.f32 -; CHECK: vminnm.f32 +define float @fparmv8_vminnm_NNNu_rev(float %b) { +; CHECK-LABEL: fparmv8_vminnm_NNNu_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s2, .LCPI22_0 +; CHECK-NEXT: vldr s4, .LCPI22_1 +; CHECK-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NEXT: vminnm.f32 s0, s0, s4 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI22_0: +; CHECK-NEXT: .long 0x42600000 @ float 56 +; CHECK-NEXT: .LCPI22_1: +; CHECK-NEXT: .long 0x429c0000 @ float 78 %cmp1 = fcmp fast ugt float 56., %b %cond1 = select nsz i1 %cmp1, float %b, float 56. %cmp2 = fcmp fast ugt float %cond1, 78. @@ -225,10 +312,22 @@ define float @fp-armv8_vminnm_NNNu_rev(float %b) { ret float %cond2 } -define double @fp-armv8_vminnm_NNNuge_rev(double %b) { -; CHECK-LABEL: "fp-armv8_vminnm_NNNuge_rev": -; CHECK: vminnm.f64 -; CHECK: vminnm.f64 +define double @fparmv8_vminnm_NNNuge_rev(double %b) { +; CHECK-LABEL: fparmv8_vminnm_NNNuge_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, .LCPI23_0 +; CHECK-NEXT: vldr d17, .LCPI23_1 +; CHECK-NEXT: vminnm.f64 d16, d0, d16 +; CHECK-NEXT: vminnm.f64 d0, d16, d17 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI23_0: +; CHECK-NEXT: .long 0 @ double 78 +; CHECK-NEXT: .long 1079214080 +; CHECK-NEXT: .LCPI23_1: +; CHECK-NEXT: .long 0 @ double 90 +; CHECK-NEXT: .long 1079410688 %cmp1 = fcmp fast uge double 78., %b %cond1 = select nsz i1 %cmp1, double %b, double 78. %cmp2 = fcmp fast uge double %cond1, 90. @@ -236,10 +335,18 @@ define double @fp-armv8_vminnm_NNNuge_rev(double %b) { ret double %cond2 } -define float @fp-armv8_vmaxnm_NNNo(float %a) { -; CHECK-LABEL: "fp-armv8_vmaxnm_NNNo": -; CHECK: vmaxnm.f32 -; CHECK: vmaxnm.f32 +define float @fparmv8_vmaxnm_NNNo(float %a) { +; CHECK-LABEL: fparmv8_vmaxnm_NNNo: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.f32 s2, #1.200000e+01 +; CHECK-NEXT: vldr s4, .LCPI24_0 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI24_0: +; CHECK-NEXT: .long 0x42080000 @ float 34 %cmp1 = fcmp fast ogt float %a, 12. %cond1 = select nsz i1 %cmp1, float %a, float 12. %cmp2 = fcmp fast ogt float 34., %cond1 @@ -247,10 +354,20 @@ define float @fp-armv8_vmaxnm_NNNo(float %a) { ret float %cond2 } -define float @fp-armv8_vmaxnm_NNNoge(float %a) { -; CHECK-LABEL: "fp-armv8_vmaxnm_NNNoge": -; CHECK: vmaxnm.f32 -; CHECK: vmaxnm.f32 +define float @fparmv8_vmaxnm_NNNoge(float %a) { +; CHECK-LABEL: fparmv8_vmaxnm_NNNoge: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s2, .LCPI25_0 +; CHECK-NEXT: vldr s4, .LCPI25_1 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI25_0: +; CHECK-NEXT: .long 0x42080000 @ float 34 +; CHECK-NEXT: .LCPI25_1: +; CHECK-NEXT: .long 0x42600000 @ float 56 %cmp1 = fcmp fast oge float %a, 34. %cond1 = select nsz i1 %cmp1, float %a, float 34. 
%cmp2 = fcmp fast oge float 56., %cond1 @@ -258,10 +375,20 @@ define float @fp-armv8_vmaxnm_NNNoge(float %a) { ret float %cond2 } -define float @fp-armv8_vmaxnm_NNNo_rev(float %a) { -; CHECK-LABEL: "fp-armv8_vmaxnm_NNNo_rev": -; CHECK: vmaxnm.f32 -; CHECK: vmaxnm.f32 +define float @fparmv8_vmaxnm_NNNo_rev(float %a) { +; CHECK-LABEL: fparmv8_vmaxnm_NNNo_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s2, .LCPI26_0 +; CHECK-NEXT: vldr s4, .LCPI26_1 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI26_0: +; CHECK-NEXT: .long 0x42600000 @ float 56 +; CHECK-NEXT: .LCPI26_1: +; CHECK-NEXT: .long 0x429c0000 @ float 78 %cmp1 = fcmp fast olt float %a, 56. %cond1 = select nsz i1 %cmp1, float 56., float %a %cmp2 = fcmp fast olt float 78., %cond1 @@ -269,10 +396,20 @@ define float @fp-armv8_vmaxnm_NNNo_rev(float %a) { ret float %cond2 } -define float @fp-armv8_vmaxnm_NNNole_rev(float %a) { -; CHECK-LABEL: "fp-armv8_vmaxnm_NNNole_rev": -; CHECK: vmaxnm.f32 -; CHECK: vmaxnm.f32 +define float @fparmv8_vmaxnm_NNNole_rev(float %a) { +; CHECK-LABEL: fparmv8_vmaxnm_NNNole_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s2, .LCPI27_0 +; CHECK-NEXT: vldr s4, .LCPI27_1 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI27_0: +; CHECK-NEXT: .long 0x429c0000 @ float 78 +; CHECK-NEXT: .LCPI27_1: +; CHECK-NEXT: .long 0x42b40000 @ float 90 %cmp1 = fcmp fast ole float %a, 78. %cond1 = select nsz i1 %cmp1, float 78., float %a %cmp2 = fcmp fast ole float 90., %cond1 @@ -280,10 +417,18 @@ define float @fp-armv8_vmaxnm_NNNole_rev(float %a) { ret float %cond2 } -define float @fp-armv8_vmaxnm_NNNu(float %b) { -; CHECK-LABEL: "fp-armv8_vmaxnm_NNNu": -; CHECK: vmaxnm.f32 -; CHECK: vmaxnm.f32 +define float @fparmv8_vmaxnm_NNNu(float %b) { +; CHECK-LABEL: fparmv8_vmaxnm_NNNu: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.f32 s2, #1.200000e+01 +; CHECK-NEXT: vldr s4, .LCPI28_0 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI28_0: +; CHECK-NEXT: .long 0x42080000 @ float 34 %cmp1 = fcmp fast ugt float 12., %b %cond1 = select nsz i1 %cmp1, float 12., float %b %cmp2 = fcmp fast ugt float %cond1, 34. @@ -291,10 +436,20 @@ define float @fp-armv8_vmaxnm_NNNu(float %b) { ret float %cond2 } -define float @fp-armv8_vmaxnm_NNNuge(float %b) { -; CHECK-LABEL: "fp-armv8_vmaxnm_NNNuge": -; CHECK: vmaxnm.f32 -; CHECK: vmaxnm.f32 +define float @fparmv8_vmaxnm_NNNuge(float %b) { +; CHECK-LABEL: fparmv8_vmaxnm_NNNuge: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s2, .LCPI29_0 +; CHECK-NEXT: vldr s4, .LCPI29_1 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI29_0: +; CHECK-NEXT: .long 0x42080000 @ float 34 +; CHECK-NEXT: .LCPI29_1: +; CHECK-NEXT: .long 0x42600000 @ float 56 %cmp1 = fcmp fast uge float 34., %b %cond1 = select nsz i1 %cmp1, float 34., float %b %cmp2 = fcmp fast uge float %cond1, 56. 
@@ -302,10 +457,20 @@ define float @fp-armv8_vmaxnm_NNNuge(float %b) { ret float %cond2 } -define float @fp-armv8_vmaxnm_NNNu_rev(float %b) { -; CHECK-LABEL: "fp-armv8_vmaxnm_NNNu_rev": -; CHECK: vmaxnm.f32 -; CHECK: vmaxnm.f32 +define float @fparmv8_vmaxnm_NNNu_rev(float %b) { +; CHECK-LABEL: fparmv8_vmaxnm_NNNu_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s2, .LCPI30_0 +; CHECK-NEXT: vldr s4, .LCPI30_1 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s4 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI30_0: +; CHECK-NEXT: .long 0x42600000 @ float 56 +; CHECK-NEXT: .LCPI30_1: +; CHECK-NEXT: .long 0x429c0000 @ float 78 %cmp1 = fcmp fast ult float 56., %b %cond1 = select nsz i1 %cmp1, float %b, float 56. %cmp2 = fcmp fast ult float %cond1, 78. @@ -313,10 +478,22 @@ define float @fp-armv8_vmaxnm_NNNu_rev(float %b) { ret float %cond2 } -define double @fp-armv8_vmaxnm_NNNule_rev( double %b) { -; CHECK-LABEL: "fp-armv8_vmaxnm_NNNule_rev": -; CHECK: vmaxnm.f64 -; CHECK: vmaxnm.f64 +define double @fparmv8_vmaxnm_NNNule_rev( double %b) { +; CHECK-LABEL: fparmv8_vmaxnm_NNNule_rev: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, .LCPI31_0 +; CHECK-NEXT: vldr d17, .LCPI31_1 +; CHECK-NEXT: vmaxnm.f64 d16, d0, d16 +; CHECK-NEXT: vmaxnm.f64 d0, d16, d17 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 3 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI31_0: +; CHECK-NEXT: .long 0 @ double 78 +; CHECK-NEXT: .long 1079214080 +; CHECK-NEXT: .LCPI31_1: +; CHECK-NEXT: .long 0 @ double 90 +; CHECK-NEXT: .long 1079410688 %cmp1 = fcmp fast ule double 78., %b %cond1 = select nsz i1 %cmp1, double %b, double 78. %cmp2 = fcmp fast ule double %cond1, 90. @@ -324,11 +501,17 @@ define double @fp-armv8_vmaxnm_NNNule_rev( double %b) { ret double %cond2 } -define float @fp-armv8_vminmaxnm_0(float %a) { -; CHECK-LABEL: "fp-armv8_vminmaxnm_0": -; CHECK-NOT: vcmp -; CHECK: vminnm.f32 -; CHECK: vmaxnm.f32 +define float @fparmv8_vminmaxnm_0(float %a) { +; CHECK-LABEL: fparmv8_vminmaxnm_0: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s2, .LCPI32_0 +; CHECK-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI32_0: +; CHECK-NEXT: .long 0x00000000 @ float 0 %cmp1 = fcmp fast olt float %a, 0. %cond1 = select nsz i1 %cmp1, float %a, float 0. %cmp2 = fcmp fast ogt float %cond1, 0. @@ -336,11 +519,17 @@ define float @fp-armv8_vminmaxnm_0(float %a) { ret float %cond2 } -define float @fp-armv8_vminmaxnm_neg0(float %a) { -; CHECK-LABEL: "fp-armv8_vminmaxnm_neg0": -; CHECK-NOT: vcmp -; CHECK: vminnm.f32 -; CHECK: vmaxnm.f32 +define float @fparmv8_vminmaxnm_neg0(float %a) { +; CHECK-LABEL: fparmv8_vminmaxnm_neg0: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s2, .LCPI33_0 +; CHECK-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI33_0: +; CHECK-NEXT: .long 0x80000000 @ float -0 %cmp1 = fcmp fast olt float %a, -0. %cond1 = select nsz i1 %cmp1, float %a, float -0. %cmp2 = fcmp fast ugt float %cond1, -0. 
@@ -348,11 +537,17 @@ define float @fp-armv8_vminmaxnm_neg0(float %a) { ret float %cond2 } -define float @fp-armv8_vminmaxnm_e_0(float %a) { -; CHECK-LABEL: "fp-armv8_vminmaxnm_e_0": -; CHECK-NOT: vcmp -; CHECK: vminnm.f32 -; CHECK: vmaxnm.f32 +define float @fparmv8_vminmaxnm_e_0(float %a) { +; CHECK-LABEL: fparmv8_vminmaxnm_e_0: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s2, .LCPI34_0 +; CHECK-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI34_0: +; CHECK-NEXT: .long 0x00000000 @ float 0 %cmp1 = fcmp fast ule float 0., %a %cond1 = select nsz i1 %cmp1, float 0., float %a %cmp2 = fcmp fast uge float 0., %cond1 @@ -360,11 +555,17 @@ define float @fp-armv8_vminmaxnm_e_0(float %a) { ret float %cond2 } -define float @fp-armv8_vminmaxnm_e_neg0(float %a) { -; CHECK-LABEL: "fp-armv8_vminmaxnm_e_neg0": -; CHECK-NOT: vcmp -; CHECK: vminnm.f32 -; CHECK: vmaxnm.f32 +define float @fparmv8_vminmaxnm_e_neg0(float %a) { +; CHECK-LABEL: fparmv8_vminmaxnm_e_neg0: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s2, .LCPI35_0 +; CHECK-NEXT: vminnm.f32 s0, s0, s2 +; CHECK-NEXT: vmaxnm.f32 s0, s0, s2 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI35_0: +; CHECK-NEXT: .long 0x80000000 @ float -0 %cmp1 = fcmp fast ule float -0., %a %cond1 = select nsz i1 %cmp1, float -0., float %a %cmp2 = fcmp fast oge float -0., %cond1 diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinations.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinations.ll index d6cb05b5d0dd9..850b9a7f36ff3 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinations.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinations.ll @@ -11,8 +11,9 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 2 } ; function, root signature -!3 = !{ !5 } ; list of root signature elements -!5 = !{ !"DescriptorTable", i32 0, !6, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20 } +!3 = !{ !5, !21 } ; list of root signature elements +!5 = !{ !"DescriptorTable", i32 0, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20 } +!21 = !{ !"DescriptorTable", i32 0, !6, !8, !9 } ; typedef enum D3D12_DESCRIPTOR_RANGE_FLAGS { ; NONE = 0, @@ -53,37 +54,20 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !20 = !{ !"UAV", i32 5, i32 1, i32 15, i32 5, i32 65540 } ;DXC:- Name: RTS0 -;DXC-NEXT: Size: 380 +;DXC-NEXT: Size: 400 ;DXC-NEXT: RootSignature: ;DXC-NEXT: Version: 2 -;DXC-NEXT: NumRootParameters: 1 +;DXC-NEXT: NumRootParameters: 2 ;DXC-NEXT: RootParametersOffset: 24 ;DXC-NEXT: NumStaticSamplers: 0 -;DXC-NEXT: StaticSamplersOffset: 380 +;DXC-NEXT: StaticSamplersOffset: 400 ;DXC-NEXT: Parameters: ;DXC-NEXT: - ParameterType: DescriptorTable ;DXC-NEXT: ShaderVisibility: All ;DXC-NEXT: Table: -;DXC-NEXT: NumRanges: 14 -;DXC-NEXT: RangesOffset: 44 +;DXC-NEXT: NumRanges: 11 +;DXC-NEXT: RangesOffset: 56 ;DXC-NEXT: Ranges: -;DXC-NEXT: - RangeType: Sampler -;DXC-NEXT: NumDescriptors: 1 -;DXC-NEXT: BaseShaderRegister: 0 -;DXC-NEXT: RegisterSpace: 1 -;DXC-NEXT: OffsetInDescriptorsFromTableStart: 4294967295 -;DXC-NEXT: - RangeType: Sampler -;DXC-NEXT: NumDescriptors: 1 -;DXC-NEXT: BaseShaderRegister: 0 -;DXC-NEXT: 
RegisterSpace: 3 -;DXC-NEXT: OffsetInDescriptorsFromTableStart: 4294967295 -;DXC-NEXT: DESCRIPTORS_VOLATILE: true -;DXC-NEXT: - RangeType: Sampler -;DXC-NEXT: NumDescriptors: 1 -;DXC-NEXT: BaseShaderRegister: 0 -;DXC-NEXT: RegisterSpace: 4 -;DXC-NEXT: OffsetInDescriptorsFromTableStart: 4294967295 -;DXC-NEXT: DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS: true ;DXC-NEXT: - RangeType: SRV ;DXC-NEXT: NumDescriptors: 1 ;DXC-NEXT: BaseShaderRegister: 0 @@ -155,3 +139,26 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } ;DXC-NEXT: OffsetInDescriptorsFromTableStart: 5 ;DXC-NEXT: DATA_STATIC_WHILE_SET_AT_EXECUTE: true ;DXC-NEXT: DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS: true +;DXC-NEXT: - ParameterType: DescriptorTable +;DXC-NEXT: ShaderVisibility: All +;DXC-NEXT: Table: +;DXC-NEXT: NumRanges: 3 +;DXC-NEXT: RangesOffset: 328 +;DXC-NEXT: Ranges: +;DXC-NEXT: - RangeType: Sampler +;DXC-NEXT: NumDescriptors: 1 +;DXC-NEXT: BaseShaderRegister: 0 +;DXC-NEXT: RegisterSpace: 1 +;DXC-NEXT: OffsetInDescriptorsFromTableStart: 4294967295 +;DXC-NEXT: - RangeType: Sampler +;DXC-NEXT: NumDescriptors: 1 +;DXC-NEXT: BaseShaderRegister: 0 +;DXC-NEXT: RegisterSpace: 3 +;DXC-NEXT: OffsetInDescriptorsFromTableStart: 4294967295 +;DXC-NEXT: DESCRIPTORS_VOLATILE: true +;DXC-NEXT: - RangeType: Sampler +;DXC-NEXT: NumDescriptors: 1 +;DXC-NEXT: BaseShaderRegister: 0 +;DXC-NEXT: RegisterSpace: 4 +;DXC-NEXT: OffsetInDescriptorsFromTableStart: 4294967295 +;DXC-NEXT: DESCRIPTORS_STATIC_KEEPING_BUFFER_BOUNDS_CHECKS: true diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinationsV1.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinationsV1.ll index c65eab5f4aa5f..098b2d51a0bf4 100644 --- a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinationsV1.ll +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-DescriptorTable-AllValidFlagCombinationsV1.ll @@ -11,33 +11,40 @@ attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } !dx.rootsignatures = !{!2} ; list of function/root signature pairs !2 = !{ ptr @main, !3, i32 1 } ; function, root signature -!3 = !{ !5 } ; list of root signature elements -!5 = !{ !"DescriptorTable", i32 0, !6, !7 } +!3 = !{ !5, !8 } ; list of root signature elements +!5 = !{ !"DescriptorTable", i32 0, !6 } !6 = !{ !"Sampler", i32 1, i32 1, i32 0, i32 -1, i32 1 } +!8 = !{ !"DescriptorTable", i32 0, !7 } !7 = !{ !"UAV", i32 5, i32 1, i32 10, i32 5, i32 3 } ; DXC: - Name: RTS0 -; DXC-NEXT: Size: 84 +; DXC-NEXT: Size: 104 ; DXC-NEXT: RootSignature: ; DXC-NEXT: Version: 1 -; DXC-NEXT: NumRootParameters: 1 +; DXC-NEXT: NumRootParameters: 2 ; DXC-NEXT: RootParametersOffset: 24 ; DXC-NEXT: NumStaticSamplers: 0 -; DXC-NEXT: StaticSamplersOffset: 84 +; DXC-NEXT: StaticSamplersOffset: 104 ; DXC-NEXT: Parameters: ; DXC-NEXT: - ParameterType: DescriptorTable ; DXC-NEXT: ShaderVisibility: All ; DXC-NEXT: Table: -; DXC-NEXT: NumRanges: 2 -; DXC-NEXT: RangesOffset: 44 +; DXC-NEXT: NumRanges: 1 +; DXC-NEXT: RangesOffset: 56 ; DXC-NEXT: Ranges: ; DXC-NEXT: - RangeType: Sampler ; DXC-NEXT: NumDescriptors: 1 ; DXC-NEXT: BaseShaderRegister: 1 ; DXC-NEXT: RegisterSpace: 0 ; DXC-NEXT: OffsetInDescriptorsFromTableStart: 4294967295 -; DXC-NEXT: - RangeType: UAV +; DXC-NEXT: - ParameterType: DescriptorTable +; DXC-NEXT: ShaderVisibility: All +; DXC-NEXT: Table: +; DXC-NEXT: NumRanges: 1 +; DXC-NEXT: RangesOffset: 84 +; DXC-NEXT: Ranges: +; DXC-NEXT: - 
RangeType: UAV ; DXC-NEXT: NumDescriptors: 5 ; DXC-NEXT: BaseShaderRegister: 1 ; DXC-NEXT: RegisterSpace: 10 diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Invalid-Version.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Invalid-Version.ll new file mode 100644 index 0000000000000..26867e6d7ec25 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Invalid-Version.ll @@ -0,0 +1,20 @@ +; RUN: not opt -passes='print<dxil-root-signature>' %s -S -o - 2>&1 | FileCheck %s + +target triple = "dxil-unknown-shadermodel6.0-compute" + + +; CHECK: error: Invalid value for Version: 4 +; CHECK-NOT: Root Signature Definitions +define void @main() #0 { +entry: + ret void +} +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } + + +!dx.rootsignatures = !{!2, !3, !4, !5} ; list of function/root signature pairs +!2 = !{ ptr @main, !6, i32 1 } ; function, root signature +!3 = !{ ptr @main, !6, i32 4 } ; function, root signature +!4 = !{ ptr @main, !6, i32 2 } ; function, root signature +!5 = !{ ptr @main, !6, i32 3 } ; function, root signature +!6 = !{ } ; list of root signature elements diff --git a/llvm/test/CodeGen/DirectX/CreateHandle-NURI.ll b/llvm/test/CodeGen/DirectX/CreateHandle-NURI.ll new file mode 100644 index 0000000000000..cfa6c983df3f4 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/CreateHandle-NURI.ll @@ -0,0 +1,70 @@ +; RUN: opt -S -passes=dxil-op-lower %s | FileCheck %s + +target triple = "dxil-pc-shadermodel6.0-compute" + +@A.str = internal unnamed_addr constant [2 x i8] c"A\00", align 1 +@B.str = internal unnamed_addr constant [2 x i8] c"A\00", align 1 + +declare i32 @some_val(); + +define void @test_buffers_with_nuri() { + + %val = call i32 @some_val() + %foo = alloca i32, align 4 + + ; RWBuffer<float> A[10]; + ; + ; A[NonUniformResourceIndex(val)]; + + %nuri1 = tail call noundef i32 @llvm.dx.resource.nonuniformindex(i32 %val) + %res1 = call target("dx.TypedBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 10, i32 %nuri1, ptr @A.str) + ; CHECK: call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 %val, i1 true) #[[ATTR:.*]] + ; CHECK-NOT: @llvm.dx.cast.handle + ; CHECK-NOT: @llvm.dx.resource.nonuniformindex + + ; A[NonUniformResourceIndex(val + 1) % 10]; + %add1 = add i32 %val, 1 + %nuri2 = tail call noundef i32 @llvm.dx.resource.nonuniformindex(i32 %add1) + %rem1 = urem i32 %nuri2, 10 + %res2 = call target("dx.TypedBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 10, i32 %rem1, ptr @A.str) + ; CHECK: call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 %rem1, i1 true) #[[ATTR]] + + ; A[10 + 3 * NonUniformResourceIndex(GI)]; + %mul1 = mul i32 %nuri1, 3 + %add2 = add i32 %mul1, 10 + %res3 = call target("dx.TypedBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 10, i32 %add2, ptr @A.str) + ; CHECK: call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 %add2, i1 true) #[[ATTR]] + + ; NonUniformResourceIndex value going through store & load - the flag is not going to get picked up: + %a = tail call noundef i32 @llvm.dx.resource.nonuniformindex(i32 %val) + store i32 %a, ptr %foo + %b = load i32, ptr %foo + %res4 = call target("dx.TypedBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 10, i32 %b, ptr @A.str) + ; CHECK: call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 0, i32 %b, i1 false) #[[ATTR]] + + ; NonUniformResourceIndex index value on a single resource (not an array) - the flag
is not going to get picked up: + ; + ; RWBuffer<float> B : register(u20); + ; B[NonUniformResourceIndex(val)]; + %nuri3 = tail call noundef i32 @llvm.dx.resource.nonuniformindex(i32 %val) + %res5 = call target("dx.TypedBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 20, i32 0, i32 1, i32 %nuri1, ptr @B.str) + ; CHECK: call %dx.types.Handle @dx.op.createHandle(i32 57, i8 1, i32 1, i32 %val, i1 false) #[[ATTR]] + + ; NonUniformResourceIndex on unrelated value - the call is removed: + ; foo = NonUniformResourceIndex(val); + %nuri4 = tail call noundef i32 @llvm.dx.resource.nonuniformindex(i32 %val) + store i32 %nuri4, ptr %foo + ; CHECK: store i32 %val, ptr %foo + ; CHECK-NOT: @llvm.dx.resource.nonuniformindex + + ret void +} + +; CHECK: attributes #[[ATTR]] = {{{.*}} memory(read) {{.*}}} + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding-NURI.ll b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding-NURI.ll new file mode 100644 index 0000000000000..80bf5a6a67c91 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding-NURI.ll @@ -0,0 +1,77 @@ +; RUN: opt -S -passes=dxil-op-lower %s | FileCheck %s + +target triple = "dxil-pc-shadermodel6.6-compute" + +@A.str = internal unnamed_addr constant [2 x i8] c"A\00", align 1 +@B.str = internal unnamed_addr constant [2 x i8] c"A\00", align 1 + +declare i32 @some_val(); + +define void @test_buffers_with_nuri() { + + %val = call i32 @some_val() + %foo = alloca i32, align 4 + + ; RWBuffer<float> A[10]; + ; + ; A[NonUniformResourceIndex(val)]; + + %nuri1 = tail call noundef i32 @llvm.dx.resource.nonuniformindex(i32 %val) + %res1 = call target("dx.TypedBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 10, i32 %nuri1, ptr @A.str) + ; CHECK: %[[RES1:.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 9, i32 0, i8 1 }, i32 %val, i1 true) #[[ATTR:.*]] + ; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[RES1]], %dx.types.ResourceProperties { i32 4106, i32 265 }) #[[ATTR]] + ; CHECK-NOT: @llvm.dx.cast.handle + ; CHECK-NOT: @llvm.dx.resource.nonuniformindex + + ; A[NonUniformResourceIndex(val + 1) % 10]; + %add1 = add i32 %val, 1 + %nuri2 = tail call noundef i32 @llvm.dx.resource.nonuniformindex(i32 %add1) + %rem1 = urem i32 %nuri2, 10 + %res2 = call target("dx.TypedBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 10, i32 %rem1, ptr @A.str) + ; CHECK: %[[RES2:.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 9, i32 0, i8 1 }, i32 %rem1, i1 true) #[[ATTR]] + ; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[RES2]], %dx.types.ResourceProperties { i32 4106, i32 265 }) #[[ATTR]] + + ; A[10 + 3 * NonUniformResourceIndex(GI)]; + %mul1 = mul i32 %nuri1, 3 + %add2 = add i32 %mul1, 10 + %res3 = call target("dx.TypedBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 10, i32 %add2, ptr @A.str) + ; CHECK: %[[RES3:.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 9, i32 0, i8 1 }, i32 %add2, i1 true) #[[ATTR]] + ; CHECK: %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[RES3]], %dx.types.ResourceProperties { i32 4106, i32 265 }) #[[ATTR]] ret void + + ; NonUniformResourceIndex value going through store & load: the flag is not going to get picked up %a
= tail call noundef i32 @llvm.dx.resource.nonuniformindex(i32 %val) + store i32 %a, ptr %foo + %b = load i32, ptr %foo + %res4 = call target("dx.TypedBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 10, i32 %b, ptr @A.str) + ; CHECK: %[[RES4:.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 9, i32 0, i8 1 }, i32 %b, i1 false) #[[ATTR]] + ; CHECK: %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[RES4]], %dx.types.ResourceProperties { i32 4106, i32 265 }) #[[ATTR]] + + ; NonUniformResourceIndex index value on a single resource (not an array): the flag is not going to get picked up + ; RWBuffer<float> B : register(u20); + ; + ; B[NonUniformResourceIndex(val)]; + + %nuri3 = tail call noundef i32 @llvm.dx.resource.nonuniformindex(i32 %val) + %res5 = call target("dx.TypedBuffer", float, 1, 0, 0) + @llvm.dx.resource.handlefrombinding(i32 20, i32 0, i32 1, i32 %nuri1, ptr @B.str) + ; CHECK: %[[RES4:.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 20, i8 1 }, i32 %val, i1 false) #[[ATTR]] + ; CHECK: %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[RES4]], %dx.types.ResourceProperties { i32 4106, i32 265 }) #[[ATTR]] + + ; NonUniformResourceIndex on unrelated value - the call is removed: + ; foo = NonUniformResourceIndex(val); + %nuri4 = tail call noundef i32 @llvm.dx.resource.nonuniformindex(i32 %val) + store i32 %nuri4, ptr %foo + ; CHECK: store i32 %val, ptr %foo + ; CHECK-NOT: @llvm.dx.resource.nonuniformindex + + ret void +} + +; CHECK: attributes #[[ATTR]] = {{{.*}} memory(none) {{.*}}} + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/DirectX/isnan.ll b/llvm/test/CodeGen/DirectX/isnan.ll new file mode 100644 index 0000000000000..2becd75209331 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/isnan.ll @@ -0,0 +1,53 @@ +; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.9-library %s | FileCheck %s --check-prefixes=CHECK,SM69CHECK +; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.8-library %s | FileCheck %s --check-prefixes=CHECK,SMOLDCHECK + +; Make sure dxil operation function calls for isnan are generated for float and half.
+ +define noundef i1 @isnan_float(float noundef %a) { +entry: + ; SM69CHECK: call i1 @dx.op.isSpecialFloat.f32(i32 8, float %{{.*}}) #[[#ATTR:]] + ; SMOLDCHECK: call i1 @llvm.dx.isnan.f32(float %{{.*}}) + %dx.isnan = call i1 @llvm.dx.isnan.f32(float %a) + ret i1 %dx.isnan +} + +define noundef i1 @isnan_half(half noundef %a) { +entry: + ; SM69CHECK: call i1 @dx.op.isSpecialFloat.f16(i32 8, half %{{.*}}) #[[#ATTR]] + ; SMOLDCHECK: [[BITCAST:%.*]] = bitcast half %{{.*}} to i16 + ; SMOLDCHECK: [[ANDHIGH:%.*]] = and i16 [[BITCAST]], 31744 + ; SMOLDCHECK: [[CMPHIGH:%.*]] = icmp eq i16 [[ANDHIGH]], 31744 + ; SMOLDCHECK: [[ANDLOW:%.*]] = and i16 [[BITCAST]], 1023 + ; SMOLDCHECK: [[CMPLOW:%.*]] = icmp ne i16 [[ANDLOW]], 0 + ; SMOLDCHECK: [[AND:%.*]] = and i1 [[CMPHIGH]], [[CMPLOW]] + %dx.isnan = call i1 @llvm.dx.isnan.f16(half %a) + ret i1 %dx.isnan +} + +define noundef <4 x i1> @isnan_half4(<4 x half> noundef %p0) { +entry: + ; SM69CHECK: call i1 @dx.op.isSpecialFloat.f16(i32 8, half + ; SM69CHECK: call i1 @dx.op.isSpecialFloat.f16(i32 8, half + ; SM69CHECK: call i1 @dx.op.isSpecialFloat.f16(i32 8, half + ; SM69CHECK: call i1 @dx.op.isSpecialFloat.f16(i32 8, half + ; SMOLDCHECK: [[BITCAST:%.*]] = bitcast <4 x half> %{{.*}} to <4 x i16> + ; SMOLDCHECK: [[ANDHIGH:%.*]] = and <4 x i16> [[BITCAST]], splat (i16 31744) + ; SMOLDCHECK: [[CMPHIGH:%.*]] = icmp eq <4 x i16> [[ANDHIGH]], splat (i16 31744) + ; SMOLDCHECK: [[ANDLOW:%.*]] = and <4 x i16> [[BITCAST]], splat (i16 1023) + ; SMOLDCHECK: [[CMPLOW:%.*]] = icmp ne <4 x i16> [[ANDLOW]], zeroinitializer + ; SMOLDCHECK: [[AND:%.*]] = and <4 x i1> [[CMPHIGH]], [[CMPLOW]] + %hlsl.isnan = call <4 x i1> @llvm.dx.isnan.v4f16(<4 x half> %p0) + ret <4 x i1> %hlsl.isnan +} + +define noundef <3 x i1> @isnan_float3(<3 x float> noundef %p0) { +entry: + ; SM69CHECK: call i1 @dx.op.isSpecialFloat.f32(i32 8, float + ; SM69CHECK: call i1 @dx.op.isSpecialFloat.f32(i32 8, float + ; SM69CHECK: call i1 @dx.op.isSpecialFloat.f32(i32 8, float + ; SMOLDCHECK: = call <3 x i1> @llvm.dx.isnan.v3f32(<3 x float> + %hlsl.isnan = call <3 x i1> @llvm.dx.isnan.v3f32(<3 x float> %p0) + ret <3 x i1> %hlsl.isnan +} + +; CHECK: attributes #{{[0-9]*}} = {{{.*}} memory(none) {{.*}}} diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-appending-limits.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-appending-limits.ll new file mode 100644 index 0000000000000..7fa42e9697898 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-appending-limits.ll @@ -0,0 +1,16 @@ +; RUN: opt -S -passes='dxil-post-optimization-validation' -mtriple=dxil-pc-shadermodel6.6-compute %s 2>&1 | FileCheck %s +; A descriptor range can be placed at UINT_MAX, matching DXC's behaviour +; CHECK-NOT: error: + +define void @CSMain() "hlsl.shader"="compute" { +entry: + ret void +} + +!dx.rootsignatures = !{!0} + +!0 = !{ptr @CSMain, !1, i32 2} +!1 = !{!3} +!3 = !{!"DescriptorTable", i32 0, !4, !5} +!4 = !{!"UAV", i32 1, i32 1, i32 0, i32 4294967294, i32 0} +!5 = !{!"UAV", i32 1, i32 0, i32 0, i32 -1, i32 0} diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-deny-no-binding.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-deny-no-binding.ll new file mode 100644 index 0000000000000..15326d438f021 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-deny-no-binding.ll @@ -0,0 +1,17 @@ +; RUN: opt -S -passes='dxil-post-optimization-validation' %s +; This is a valid case where no resource is being used +target triple = "dxil-pc-shadermodel6.6-pixel" + 
+define void @CSMain() #0 { +entry: + ret void +} +attributes #0 = { noinline nounwind "exp-shader"="cs" "hlsl.numthreads"="1,2,1" "hlsl.shader"="geometry" } + +!dx.rootsignatures = !{!0} + +!0 = !{ptr @CSMain, !1, i32 2} +!1 = !{!2, !3, !4} +!2 = !{!"RootConstants", i32 0, i32 2, i32 0, i32 4} +!3 = !{ !"RootFlags", i32 294 } ; 294 = deny_pixel/hull/vertex/amplification_shader_root_access +!4 = !{ !"RootSRV", i32 0, i32 1, i32 0, i32 0 } diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-appending-limits-multiples.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-appending-limits-multiples.ll new file mode 100644 index 0000000000000..e51f15a1d3fc2 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-appending-limits-multiples.ll @@ -0,0 +1,16 @@ +; RUN: not opt -S -passes='dxil-post-optimization-validation' -mtriple=dxil-pc-shadermodel6.6-compute %s 2>&1 | FileCheck %s +; CHECK: error: Offset overflow for descriptor range: CBV(register=2, space=0). + +define void @CSMain() "hlsl.shader"="compute" { +entry: + ret void +} + +!dx.rootsignatures = !{!0} + +!0 = !{ptr @CSMain, !1, i32 2} +!1 = !{!3} +!3 = !{!"DescriptorTable", i32 0, !4, !5, !6} +!4 = !{!"CBV", i32 1, i32 0, i32 0, i32 4294967294, i32 0} +!5 = !{!"CBV", i32 1, i32 1, i32 0, i32 -1, i32 0} +!6 = !{!"CBV", i32 1, i32 2, i32 0, i32 -1, i32 0} diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-appending-overflow.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-appending-overflow.ll new file mode 100644 index 0000000000000..1bc97d9ae2091 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-appending-overflow.ll @@ -0,0 +1,17 @@ +; RUN: not opt -S -passes='dxil-post-optimization-validation' -mtriple=dxil-pc-shadermodel6.6-compute %s 2>&1 | FileCheck %s +; This test checks if a resource is implicitly overflowing. That means, it is appending a resource after an unbounded range. + +; CHECK: error: Range UAV(register=0, space=0) cannot be appended after an unbounded range + +define void @CSMain() "hlsl.shader"="compute" { +entry: + ret void +} + +!dx.rootsignatures = !{!0} + +!0 = !{ptr @CSMain, !1, i32 2} +!1 = !{!3} +!3 = !{!"DescriptorTable", i32 0, !4, !5} +!4 = !{!"UAV", i32 -1, i32 1, i32 0, i32 2, i32 0} +!5 = !{!"UAV", i32 1, i32 0, i32 0, i32 -1, i32 0} diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-deny-multiple-shader.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-deny-multiple-shader.ll new file mode 100644 index 0000000000000..b11cce694bd25 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-deny-multiple-shader.ll @@ -0,0 +1,20 @@ +; RUN: not opt -S -passes='dxil-post-optimization-validation' %s 2>&1 | FileCheck %s +; CHECK: error: Shader has root bindings but root signature uses a DENY flag to disallow root binding access to the shader stage. 
+target triple = "dxil-pc-shadermodel6.6-pixel" + +%__cblayout_CB = type <{ float }> + +@CB.str = private unnamed_addr constant [3 x i8] c"CB\00", align 1 + +define void @CSMain() "hlsl.shader"="compute" { +entry: + %CB = tail call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 4, 0)) @llvm.dx.resource.handlefrombinding(i32 0, i32 2, i32 1, i32 0, ptr nonnull @CB.str) + ret void +} + +!dx.rootsignatures = !{!0} + +!0 = !{ptr @CSMain, !1, i32 2} +!1 = !{!2, !3} +!2 = !{!"RootConstants", i32 0, i32 2, i32 0, i32 4} +!3 = !{!"RootFlags", i32 294} ; 294 = deny_pixel/hull/vertex/amplification_shader_root_access diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-deny-root-descriptor.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-deny-root-descriptor.ll new file mode 100644 index 0000000000000..6d323757d5897 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-deny-root-descriptor.ll @@ -0,0 +1,20 @@ +; RUN: not opt -S -passes='dxil-post-optimization-validation' %s 2>&1 | FileCheck %s + +; CHECK: error: Shader has root bindings but root signature uses a DENY flag to disallow root binding access to the shader stage. +target triple = "dxil-pc-shadermodel6.6-pixel" + +@SB.str = private unnamed_addr constant [3 x i8] c"SB\00", align 1 + +define void @CSMain() "hlsl.shader"="pixel" { +entry: + %SB = tail call target("dx.RawBuffer", i32, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr nonnull @SB.str) + ret void +} + +!dx.rootsignatures = !{!0} + +!0 = !{ptr @CSMain, !1, i32 2} +!1 = !{!2, !3} +!2 = !{!"DescriptorTable", i32 0, !4} +!4 = !{!"SRV", i32 1, i32 0, i32 0, i32 -1, i32 4} +!3 = !{!"RootFlags", i32 32} ; 32 = deny_pixel_shader_root_access diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-deny-single-shader.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-deny-single-shader.ll new file mode 100644 index 0000000000000..4e50f50049b0e --- /dev/null +++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-deny-single-shader.ll @@ -0,0 +1,19 @@ +; RUN: not opt -S -passes='dxil-post-optimization-validation' %s 2>&1 | FileCheck %s + +; CHECK: error: Shader has root bindings but root signature uses a DENY flag to disallow root binding access to the shader stage. +target triple = "dxil-pc-shadermodel6.6-pixel" + +@SB.str = private unnamed_addr constant [3 x i8] c"SB\00", align 1 + +define void @CSMain() "hlsl.shader"="pixel" { +entry: + %SB = tail call target("dx.RawBuffer", i32, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 1, i32 0, ptr nonnull @SB.str) + ret void +} + +!dx.rootsignatures = !{!0} + +!0 = !{ptr @CSMain, !1, i32 2} +!1 = !{!2, !3} +!2 = !{!"RootSRV", i32 0, i32 0, i32 0, i32 4} +!3 = !{!"RootFlags", i32 32} ; 32 = deny_pixel_shader_root_access diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-offset-overflow.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-offset-overflow.ll new file mode 100644 index 0000000000000..6e56949562740 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-offset-overflow.ll @@ -0,0 +1,15 @@ +; RUN: not opt -S -passes='dxil-post-optimization-validation' -mtriple=dxil-pc-shadermodel6.6-compute %s 2>&1 | FileCheck %s +; CHECK: error: Offset overflow for descriptor range: UAV(register=0, space=0). 
+ +define void @CSMain() "hlsl.shader"="compute" { +entry: + ret void +} + +!dx.rootsignatures = !{!0} + +!0 = !{ptr @CSMain, !1, i32 2} +!1 = !{!3} +!3 = !{!"DescriptorTable", i32 0, !4, !5} +!4 = !{!"UAV", i32 100, i32 0, i32 0, i32 4294967294, i32 0} +!5 = !{!"UAV", i32 1, i32 101, i32 0, i32 10, i32 0} diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-register-overflow.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-register-overflow.ll new file mode 100644 index 0000000000000..bff1727c18924 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-register-overflow.ll @@ -0,0 +1,13 @@ +; RUN: not opt -S -passes='dxil-post-optimization-validation' -mtriple=dxil-pc-shadermodel6.6-compute %s 2>&1 | FileCheck %s +; CHECK: error: Overflow for shader register range: UAV(register=4294967295, space=0) +define void @CSMain() "hlsl.shader"="compute" { +entry: + ret void +} + +!dx.rootsignatures = !{!0} + +!0 = !{ptr @CSMain, !1, i32 2} +!1 = !{!3} +!3 = !{!"DescriptorTable", i32 0, !4} +!4 = !{!"UAV", i32 100, i32 4294967295, i32 0, i32 -1, i32 0} diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler-mix.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler-mix.ll new file mode 100644 index 0000000000000..95d00619b02a0 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-fail-sampler-mix.ll @@ -0,0 +1,15 @@ +; RUN: not opt -S -passes='dxil-post-optimization-validation' -mtriple=dxil-pc-shadermodel6.6-compute %s 2>&1 | FileCheck %s +; CHECK: error: Samplers cannot be mixed with other resource types in a descriptor table, UAV(location=0) + +define void @CSMain() "hlsl.shader"="compute" { +entry: + ret void +} + +!dx.rootsignatures = !{!0} + +!0 = !{ptr @CSMain, !1, i32 2} +!1 = !{!3} +!3 = !{!"DescriptorTable", i32 0, !4, !5} +!4 = !{!"UAV", i32 1, i32 0, i32 0, i32 -1, i32 0} +!5 = !{!"Sampler", i32 2, i32 0, i32 0, i32 -1, i32 0} diff --git a/llvm/test/CodeGen/DirectX/rootsignature-validation-not-dening-shader.ll b/llvm/test/CodeGen/DirectX/rootsignature-validation-not-dening-shader.ll new file mode 100644 index 0000000000000..775fc3512ca84 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/rootsignature-validation-not-dening-shader.ll @@ -0,0 +1,21 @@ +; RUN: opt -S -passes='dxil-post-optimization-validation' %s +; Valid scenario where shader stage is not blocked from accessing root bindings +target triple = "dxil-pc-shadermodel6.6-geometry" + +%__cblayout_CB = type <{ float }> + +@CB.str = private unnamed_addr constant [3 x i8] c"CB\00", align 1 + +define void @CSMain() "hlsl.shader"="geometry" { +entry: + %CB = tail call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 4, 0)) @llvm.dx.resource.handlefrombinding(i32 0, i32 2, i32 1, i32 0, ptr nonnull @CB.str) + ret void +} +attributes #0 = { noinline nounwind "exp-shader"="cs" "hlsl.numthreads"="1,2,1" "hlsl.shader"="geometry" } + +!dx.rootsignatures = !{!0} + +!0 = !{ptr @CSMain, !1, i32 2} +!1 = !{!2, !3} +!2 = !{ !"RootFlags", i32 294 } ; 294 = deny_pixel/hull/vertex/amplification_shader_root_access +!3 = !{ !"RootCBV", i32 0, i32 2, i32 0, i32 0 } diff --git a/llvm/test/CodeGen/Hexagon/autohvx/deal-128b.ll b/llvm/test/CodeGen/Hexagon/autohvx/deal-128b.ll index 30a3b2d7e93a2..138beced0d2ec 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/deal-128b.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/deal-128b.ll @@ -517,7 +517,7 @@ define <256 x i8> @vdeal_3f(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_40: -; CHECK: 
[[REG40:r[0-9]+]] = #64 +; CHECK: [[REG40:r[0-9]+]] = #-64 ; CHECK: vshuff(v1,v0,[[REG40]]) define <256 x i8> @vdeal_40(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -525,7 +525,7 @@ define <256 x i8> @vdeal_40(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_41: -; CHECK: [[REG41:r[0-9]+]] = #65 +; CHECK: [[REG41:r[0-9]+]] = #-63 ; CHECK: vdeal(v1,v0,[[REG41]]) define <256 x i8> @vdeal_41(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126, i32 128, i32 192, i32 130, i32 194, i32 132, i32 196, i32 134, i32 198, i32 136, i32 200, i32 138, i32 202, i32 140, i32 204, i32 142, i32 206, i32 144, i32 208, i32 146, i32 210, i32 148, i32 212, i32 150, i32 214, i32 152, i32 216, i32 154, i32 218, i32 156, i32 220, 
i32 158, i32 222, i32 160, i32 224, i32 162, i32 226, i32 164, i32 228, i32 166, i32 230, i32 168, i32 232, i32 170, i32 234, i32 172, i32 236, i32 174, i32 238, i32 176, i32 240, i32 178, i32 242, i32 180, i32 244, i32 182, i32 246, i32 184, i32 248, i32 186, i32 250, i32 188, i32 252, i32 190, i32 254, i32 1, i32 65, i32 3, i32 67, i32 5, i32 69, i32 7, i32 71, i32 9, i32 73, i32 11, i32 75, i32 13, i32 77, i32 15, i32 79, i32 17, i32 81, i32 19, i32 83, i32 21, i32 85, i32 23, i32 87, i32 25, i32 89, i32 27, i32 91, i32 29, i32 93, i32 31, i32 95, i32 33, i32 97, i32 35, i32 99, i32 37, i32 101, i32 39, i32 103, i32 41, i32 105, i32 43, i32 107, i32 45, i32 109, i32 47, i32 111, i32 49, i32 113, i32 51, i32 115, i32 53, i32 117, i32 55, i32 119, i32 57, i32 121, i32 59, i32 123, i32 61, i32 125, i32 63, i32 127, i32 129, i32 193, i32 131, i32 195, i32 133, i32 197, i32 135, i32 199, i32 137, i32 201, i32 139, i32 203, i32 141, i32 205, i32 143, i32 207, i32 145, i32 209, i32 147, i32 211, i32 149, i32 213, i32 151, i32 215, i32 153, i32 217, i32 155, i32 219, i32 157, i32 221, i32 159, i32 223, i32 161, i32 225, i32 163, i32 227, i32 165, i32 229, i32 167, i32 231, i32 169, i32 233, i32 171, i32 235, i32 173, i32 237, i32 175, i32 239, i32 177, i32 241, i32 179, i32 243, i32 181, i32 245, i32 183, i32 247, i32 185, i32 249, i32 187, i32 251, i32 189, i32 253, i32 191, i32 255> @@ -533,7 +533,7 @@ define <256 x i8> @vdeal_41(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_42: -; CHECK: [[REG42:r[0-9]+]] = #66 +; CHECK: [[REG42:r[0-9]+]] = #-62 ; CHECK: vdeal(v1,v0,[[REG42]]) define <256 x i8> @vdeal_42(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 64, i32 65, i32 4, i32 5, i32 68, i32 69, i32 8, i32 9, i32 72, i32 73, i32 12, i32 13, i32 76, i32 77, i32 16, i32 17, i32 80, i32 81, i32 20, i32 21, i32 84, i32 85, i32 24, i32 25, i32 88, i32 89, i32 28, i32 29, i32 92, i32 93, i32 32, i32 33, i32 96, i32 97, i32 36, i32 37, i32 100, i32 101, i32 40, i32 41, i32 104, i32 105, i32 44, i32 45, i32 108, i32 109, i32 48, i32 49, i32 112, i32 113, i32 52, i32 53, i32 116, i32 117, i32 56, i32 57, i32 120, i32 121, i32 60, i32 61, i32 124, i32 125, i32 128, i32 129, i32 192, i32 193, i32 132, i32 133, i32 196, i32 197, i32 136, i32 137, i32 200, i32 201, i32 140, i32 141, i32 204, i32 205, i32 144, i32 145, i32 208, i32 209, i32 148, i32 149, i32 212, i32 213, i32 152, i32 153, i32 216, i32 217, i32 156, i32 157, i32 220, i32 221, i32 160, i32 161, i32 224, i32 225, i32 164, i32 165, i32 228, i32 229, i32 168, i32 169, i32 232, i32 233, i32 172, i32 173, i32 236, i32 237, i32 176, i32 177, i32 240, i32 241, i32 180, i32 181, i32 244, i32 245, i32 184, i32 185, i32 248, i32 249, i32 188, i32 189, i32 252, i32 253, i32 2, i32 3, i32 66, i32 67, i32 6, i32 7, i32 70, i32 71, i32 10, i32 11, i32 74, i32 75, i32 14, i32 15, i32 78, i32 79, i32 18, i32 19, i32 82, i32 83, i32 22, i32 23, i32 86, i32 87, i32 26, i32 27, i32 90, i32 91, i32 30, i32 31, i32 94, i32 95, i32 34, i32 35, i32 98, i32 99, i32 38, i32 39, i32 102, i32 103, i32 42, i32 43, i32 106, i32 107, i32 46, i32 47, i32 110, i32 111, i32 50, i32 51, i32 114, i32 115, i32 54, i32 55, i32 118, i32 119, i32 58, i32 59, i32 122, i32 123, i32 62, i32 63, i32 126, i32 127, i32 130, i32 131, i32 194, i32 195, i32 134, i32 135, i32 198, i32 199, i32 138, i32 139, i32 202, i32 203, i32 142, i32 143, i32 206, i32 207, i32 146, i32 147, i32 210, i32 211, i32 150, 
i32 151, i32 214, i32 215, i32 154, i32 155, i32 218, i32 219, i32 158, i32 159, i32 222, i32 223, i32 162, i32 163, i32 226, i32 227, i32 166, i32 167, i32 230, i32 231, i32 170, i32 171, i32 234, i32 235, i32 174, i32 175, i32 238, i32 239, i32 178, i32 179, i32 242, i32 243, i32 182, i32 183, i32 246, i32 247, i32 186, i32 187, i32 250, i32 251, i32 190, i32 191, i32 254, i32 255> @@ -541,7 +541,7 @@ define <256 x i8> @vdeal_42(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_43: -; CHECK: [[REG43:r[0-9]+]] = #67 +; CHECK: [[REG43:r[0-9]+]] = #-61 ; CHECK: vdeal(v1,v0,[[REG43]]) define <256 x i8> @vdeal_43(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 64, i32 66, i32 4, i32 6, i32 68, i32 70, i32 8, i32 10, i32 72, i32 74, i32 12, i32 14, i32 76, i32 78, i32 16, i32 18, i32 80, i32 82, i32 20, i32 22, i32 84, i32 86, i32 24, i32 26, i32 88, i32 90, i32 28, i32 30, i32 92, i32 94, i32 32, i32 34, i32 96, i32 98, i32 36, i32 38, i32 100, i32 102, i32 40, i32 42, i32 104, i32 106, i32 44, i32 46, i32 108, i32 110, i32 48, i32 50, i32 112, i32 114, i32 52, i32 54, i32 116, i32 118, i32 56, i32 58, i32 120, i32 122, i32 60, i32 62, i32 124, i32 126, i32 128, i32 130, i32 192, i32 194, i32 132, i32 134, i32 196, i32 198, i32 136, i32 138, i32 200, i32 202, i32 140, i32 142, i32 204, i32 206, i32 144, i32 146, i32 208, i32 210, i32 148, i32 150, i32 212, i32 214, i32 152, i32 154, i32 216, i32 218, i32 156, i32 158, i32 220, i32 222, i32 160, i32 162, i32 224, i32 226, i32 164, i32 166, i32 228, i32 230, i32 168, i32 170, i32 232, i32 234, i32 172, i32 174, i32 236, i32 238, i32 176, i32 178, i32 240, i32 242, i32 180, i32 182, i32 244, i32 246, i32 184, i32 186, i32 248, i32 250, i32 188, i32 190, i32 252, i32 254, i32 1, i32 3, i32 65, i32 67, i32 5, i32 7, i32 69, i32 71, i32 9, i32 11, i32 73, i32 75, i32 13, i32 15, i32 77, i32 79, i32 17, i32 19, i32 81, i32 83, i32 21, i32 23, i32 85, i32 87, i32 25, i32 27, i32 89, i32 91, i32 29, i32 31, i32 93, i32 95, i32 33, i32 35, i32 97, i32 99, i32 37, i32 39, i32 101, i32 103, i32 41, i32 43, i32 105, i32 107, i32 45, i32 47, i32 109, i32 111, i32 49, i32 51, i32 113, i32 115, i32 53, i32 55, i32 117, i32 119, i32 57, i32 59, i32 121, i32 123, i32 61, i32 63, i32 125, i32 127, i32 129, i32 131, i32 193, i32 195, i32 133, i32 135, i32 197, i32 199, i32 137, i32 139, i32 201, i32 203, i32 141, i32 143, i32 205, i32 207, i32 145, i32 147, i32 209, i32 211, i32 149, i32 151, i32 213, i32 215, i32 153, i32 155, i32 217, i32 219, i32 157, i32 159, i32 221, i32 223, i32 161, i32 163, i32 225, i32 227, i32 165, i32 167, i32 229, i32 231, i32 169, i32 171, i32 233, i32 235, i32 173, i32 175, i32 237, i32 239, i32 177, i32 179, i32 241, i32 243, i32 181, i32 183, i32 245, i32 247, i32 185, i32 187, i32 249, i32 251, i32 189, i32 191, i32 253, i32 255> @@ -549,7 +549,7 @@ define <256 x i8> @vdeal_43(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_44: -; CHECK: [[REG44:r[0-9]+]] = #68 +; CHECK: [[REG44:r[0-9]+]] = #-60 ; CHECK: vdeal(v1,v0,[[REG44]]) define <256 x i8> @vdeal_44(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 64, i32 65, i32 66, i32 67, i32 8, i32 9, i32 10, i32 11, i32 72, i32 73, i32 74, i32 75, i32 16, i32 17, i32 18, i32 19, i32 80, i32 81, i32 82, i32 83, i32 24, i32 25, i32 26, i32 27, i32 88, i32 89, i32 90, i32 91, i32 32, i32 33, i32 34, i32 35, 
i32 96, i32 97, i32 98, i32 99, i32 40, i32 41, i32 42, i32 43, i32 104, i32 105, i32 106, i32 107, i32 48, i32 49, i32 50, i32 51, i32 112, i32 113, i32 114, i32 115, i32 56, i32 57, i32 58, i32 59, i32 120, i32 121, i32 122, i32 123, i32 128, i32 129, i32 130, i32 131, i32 192, i32 193, i32 194, i32 195, i32 136, i32 137, i32 138, i32 139, i32 200, i32 201, i32 202, i32 203, i32 144, i32 145, i32 146, i32 147, i32 208, i32 209, i32 210, i32 211, i32 152, i32 153, i32 154, i32 155, i32 216, i32 217, i32 218, i32 219, i32 160, i32 161, i32 162, i32 163, i32 224, i32 225, i32 226, i32 227, i32 168, i32 169, i32 170, i32 171, i32 232, i32 233, i32 234, i32 235, i32 176, i32 177, i32 178, i32 179, i32 240, i32 241, i32 242, i32 243, i32 184, i32 185, i32 186, i32 187, i32 248, i32 249, i32 250, i32 251, i32 4, i32 5, i32 6, i32 7, i32 68, i32 69, i32 70, i32 71, i32 12, i32 13, i32 14, i32 15, i32 76, i32 77, i32 78, i32 79, i32 20, i32 21, i32 22, i32 23, i32 84, i32 85, i32 86, i32 87, i32 28, i32 29, i32 30, i32 31, i32 92, i32 93, i32 94, i32 95, i32 36, i32 37, i32 38, i32 39, i32 100, i32 101, i32 102, i32 103, i32 44, i32 45, i32 46, i32 47, i32 108, i32 109, i32 110, i32 111, i32 52, i32 53, i32 54, i32 55, i32 116, i32 117, i32 118, i32 119, i32 60, i32 61, i32 62, i32 63, i32 124, i32 125, i32 126, i32 127, i32 132, i32 133, i32 134, i32 135, i32 196, i32 197, i32 198, i32 199, i32 140, i32 141, i32 142, i32 143, i32 204, i32 205, i32 206, i32 207, i32 148, i32 149, i32 150, i32 151, i32 212, i32 213, i32 214, i32 215, i32 156, i32 157, i32 158, i32 159, i32 220, i32 221, i32 222, i32 223, i32 164, i32 165, i32 166, i32 167, i32 228, i32 229, i32 230, i32 231, i32 172, i32 173, i32 174, i32 175, i32 236, i32 237, i32 238, i32 239, i32 180, i32 181, i32 182, i32 183, i32 244, i32 245, i32 246, i32 247, i32 188, i32 189, i32 190, i32 191, i32 252, i32 253, i32 254, i32 255> @@ -557,7 +557,7 @@ define <256 x i8> @vdeal_44(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_45: -; CHECK: [[REG45:r[0-9]+]] = #69 +; CHECK: [[REG45:r[0-9]+]] = #-59 ; CHECK: vdeal(v1,v0,[[REG45]]) define <256 x i8> @vdeal_45(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 4, i32 2, i32 6, i32 64, i32 68, i32 66, i32 70, i32 8, i32 12, i32 10, i32 14, i32 72, i32 76, i32 74, i32 78, i32 16, i32 20, i32 18, i32 22, i32 80, i32 84, i32 82, i32 86, i32 24, i32 28, i32 26, i32 30, i32 88, i32 92, i32 90, i32 94, i32 32, i32 36, i32 34, i32 38, i32 96, i32 100, i32 98, i32 102, i32 40, i32 44, i32 42, i32 46, i32 104, i32 108, i32 106, i32 110, i32 48, i32 52, i32 50, i32 54, i32 112, i32 116, i32 114, i32 118, i32 56, i32 60, i32 58, i32 62, i32 120, i32 124, i32 122, i32 126, i32 128, i32 132, i32 130, i32 134, i32 192, i32 196, i32 194, i32 198, i32 136, i32 140, i32 138, i32 142, i32 200, i32 204, i32 202, i32 206, i32 144, i32 148, i32 146, i32 150, i32 208, i32 212, i32 210, i32 214, i32 152, i32 156, i32 154, i32 158, i32 216, i32 220, i32 218, i32 222, i32 160, i32 164, i32 162, i32 166, i32 224, i32 228, i32 226, i32 230, i32 168, i32 172, i32 170, i32 174, i32 232, i32 236, i32 234, i32 238, i32 176, i32 180, i32 178, i32 182, i32 240, i32 244, i32 242, i32 246, i32 184, i32 188, i32 186, i32 190, i32 248, i32 252, i32 250, i32 254, i32 1, i32 5, i32 3, i32 7, i32 65, i32 69, i32 67, i32 71, i32 9, i32 13, i32 11, i32 15, i32 73, i32 77, i32 75, i32 79, i32 17, i32 21, i32 19, i32 23, i32 81, i32 85, i32 83, i32 87, i32 25, i32 
29, i32 27, i32 31, i32 89, i32 93, i32 91, i32 95, i32 33, i32 37, i32 35, i32 39, i32 97, i32 101, i32 99, i32 103, i32 41, i32 45, i32 43, i32 47, i32 105, i32 109, i32 107, i32 111, i32 49, i32 53, i32 51, i32 55, i32 113, i32 117, i32 115, i32 119, i32 57, i32 61, i32 59, i32 63, i32 121, i32 125, i32 123, i32 127, i32 129, i32 133, i32 131, i32 135, i32 193, i32 197, i32 195, i32 199, i32 137, i32 141, i32 139, i32 143, i32 201, i32 205, i32 203, i32 207, i32 145, i32 149, i32 147, i32 151, i32 209, i32 213, i32 211, i32 215, i32 153, i32 157, i32 155, i32 159, i32 217, i32 221, i32 219, i32 223, i32 161, i32 165, i32 163, i32 167, i32 225, i32 229, i32 227, i32 231, i32 169, i32 173, i32 171, i32 175, i32 233, i32 237, i32 235, i32 239, i32 177, i32 181, i32 179, i32 183, i32 241, i32 245, i32 243, i32 247, i32 185, i32 189, i32 187, i32 191, i32 249, i32 253, i32 251, i32 255> @@ -565,7 +565,7 @@ define <256 x i8> @vdeal_45(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_46: -; CHECK: [[REG46:r[0-9]+]] = #70 +; CHECK: [[REG46:r[0-9]+]] = #-58 ; CHECK: vdeal(v1,v0,[[REG46]]) define <256 x i8> @vdeal_46(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 4, i32 5, i32 64, i32 65, i32 68, i32 69, i32 8, i32 9, i32 12, i32 13, i32 72, i32 73, i32 76, i32 77, i32 16, i32 17, i32 20, i32 21, i32 80, i32 81, i32 84, i32 85, i32 24, i32 25, i32 28, i32 29, i32 88, i32 89, i32 92, i32 93, i32 32, i32 33, i32 36, i32 37, i32 96, i32 97, i32 100, i32 101, i32 40, i32 41, i32 44, i32 45, i32 104, i32 105, i32 108, i32 109, i32 48, i32 49, i32 52, i32 53, i32 112, i32 113, i32 116, i32 117, i32 56, i32 57, i32 60, i32 61, i32 120, i32 121, i32 124, i32 125, i32 128, i32 129, i32 132, i32 133, i32 192, i32 193, i32 196, i32 197, i32 136, i32 137, i32 140, i32 141, i32 200, i32 201, i32 204, i32 205, i32 144, i32 145, i32 148, i32 149, i32 208, i32 209, i32 212, i32 213, i32 152, i32 153, i32 156, i32 157, i32 216, i32 217, i32 220, i32 221, i32 160, i32 161, i32 164, i32 165, i32 224, i32 225, i32 228, i32 229, i32 168, i32 169, i32 172, i32 173, i32 232, i32 233, i32 236, i32 237, i32 176, i32 177, i32 180, i32 181, i32 240, i32 241, i32 244, i32 245, i32 184, i32 185, i32 188, i32 189, i32 248, i32 249, i32 252, i32 253, i32 2, i32 3, i32 6, i32 7, i32 66, i32 67, i32 70, i32 71, i32 10, i32 11, i32 14, i32 15, i32 74, i32 75, i32 78, i32 79, i32 18, i32 19, i32 22, i32 23, i32 82, i32 83, i32 86, i32 87, i32 26, i32 27, i32 30, i32 31, i32 90, i32 91, i32 94, i32 95, i32 34, i32 35, i32 38, i32 39, i32 98, i32 99, i32 102, i32 103, i32 42, i32 43, i32 46, i32 47, i32 106, i32 107, i32 110, i32 111, i32 50, i32 51, i32 54, i32 55, i32 114, i32 115, i32 118, i32 119, i32 58, i32 59, i32 62, i32 63, i32 122, i32 123, i32 126, i32 127, i32 130, i32 131, i32 134, i32 135, i32 194, i32 195, i32 198, i32 199, i32 138, i32 139, i32 142, i32 143, i32 202, i32 203, i32 206, i32 207, i32 146, i32 147, i32 150, i32 151, i32 210, i32 211, i32 214, i32 215, i32 154, i32 155, i32 158, i32 159, i32 218, i32 219, i32 222, i32 223, i32 162, i32 163, i32 166, i32 167, i32 226, i32 227, i32 230, i32 231, i32 170, i32 171, i32 174, i32 175, i32 234, i32 235, i32 238, i32 239, i32 178, i32 179, i32 182, i32 183, i32 242, i32 243, i32 246, i32 247, i32 186, i32 187, i32 190, i32 191, i32 250, i32 251, i32 254, i32 255> @@ -573,7 +573,7 @@ define <256 x i8> @vdeal_46(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_47: -; 
CHECK: [[REG47:r[0-9]+]] = #71 +; CHECK: [[REG47:r[0-9]+]] = #-57 ; CHECK: vdeal(v1,v0,[[REG47]]) define <256 x i8> @vdeal_47(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 4, i32 6, i32 64, i32 66, i32 68, i32 70, i32 8, i32 10, i32 12, i32 14, i32 72, i32 74, i32 76, i32 78, i32 16, i32 18, i32 20, i32 22, i32 80, i32 82, i32 84, i32 86, i32 24, i32 26, i32 28, i32 30, i32 88, i32 90, i32 92, i32 94, i32 32, i32 34, i32 36, i32 38, i32 96, i32 98, i32 100, i32 102, i32 40, i32 42, i32 44, i32 46, i32 104, i32 106, i32 108, i32 110, i32 48, i32 50, i32 52, i32 54, i32 112, i32 114, i32 116, i32 118, i32 56, i32 58, i32 60, i32 62, i32 120, i32 122, i32 124, i32 126, i32 128, i32 130, i32 132, i32 134, i32 192, i32 194, i32 196, i32 198, i32 136, i32 138, i32 140, i32 142, i32 200, i32 202, i32 204, i32 206, i32 144, i32 146, i32 148, i32 150, i32 208, i32 210, i32 212, i32 214, i32 152, i32 154, i32 156, i32 158, i32 216, i32 218, i32 220, i32 222, i32 160, i32 162, i32 164, i32 166, i32 224, i32 226, i32 228, i32 230, i32 168, i32 170, i32 172, i32 174, i32 232, i32 234, i32 236, i32 238, i32 176, i32 178, i32 180, i32 182, i32 240, i32 242, i32 244, i32 246, i32 184, i32 186, i32 188, i32 190, i32 248, i32 250, i32 252, i32 254, i32 1, i32 3, i32 5, i32 7, i32 65, i32 67, i32 69, i32 71, i32 9, i32 11, i32 13, i32 15, i32 73, i32 75, i32 77, i32 79, i32 17, i32 19, i32 21, i32 23, i32 81, i32 83, i32 85, i32 87, i32 25, i32 27, i32 29, i32 31, i32 89, i32 91, i32 93, i32 95, i32 33, i32 35, i32 37, i32 39, i32 97, i32 99, i32 101, i32 103, i32 41, i32 43, i32 45, i32 47, i32 105, i32 107, i32 109, i32 111, i32 49, i32 51, i32 53, i32 55, i32 113, i32 115, i32 117, i32 119, i32 57, i32 59, i32 61, i32 63, i32 121, i32 123, i32 125, i32 127, i32 129, i32 131, i32 133, i32 135, i32 193, i32 195, i32 197, i32 199, i32 137, i32 139, i32 141, i32 143, i32 201, i32 203, i32 205, i32 207, i32 145, i32 147, i32 149, i32 151, i32 209, i32 211, i32 213, i32 215, i32 153, i32 155, i32 157, i32 159, i32 217, i32 219, i32 221, i32 223, i32 161, i32 163, i32 165, i32 167, i32 225, i32 227, i32 229, i32 231, i32 169, i32 171, i32 173, i32 175, i32 233, i32 235, i32 237, i32 239, i32 177, i32 179, i32 181, i32 183, i32 241, i32 243, i32 245, i32 247, i32 185, i32 187, i32 189, i32 191, i32 249, i32 251, i32 253, i32 255> @@ -581,7 +581,7 @@ define <256 x i8> @vdeal_47(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_48: -; CHECK: [[REG48:r[0-9]+]] = #72 +; CHECK: [[REG48:r[0-9]+]] = #-56 ; CHECK: vdeal(v1,v0,[[REG48]]) define <256 x i8> @vdeal_48(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, 
i32 214, i32 215, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -589,7 +589,7 @@ define <256 x i8> @vdeal_48(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_49: -; CHECK: [[REG49:r[0-9]+]] = #73 +; CHECK: [[REG49:r[0-9]+]] = #-55 ; CHECK: vdeal(v1,v0,[[REG49]]) define <256 x i8> @vdeal_49(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14, i32 64, i32 72, i32 66, i32 74, i32 68, i32 76, i32 70, i32 78, i32 16, i32 24, i32 18, i32 26, i32 20, i32 28, i32 22, i32 30, i32 80, i32 88, i32 82, i32 90, i32 84, i32 92, i32 86, i32 94, i32 32, i32 40, i32 34, i32 42, i32 36, i32 44, i32 38, i32 46, i32 96, i32 104, i32 98, i32 106, i32 100, i32 108, i32 102, i32 110, i32 48, i32 56, i32 50, i32 58, i32 52, i32 60, i32 54, i32 62, i32 112, i32 120, i32 114, i32 122, i32 116, i32 124, i32 118, i32 126, i32 128, i32 136, i32 130, i32 138, i32 132, i32 140, i32 134, i32 142, i32 192, i32 200, i32 194, i32 202, i32 196, i32 204, i32 198, i32 206, i32 144, i32 152, i32 146, i32 154, i32 148, i32 156, i32 150, i32 158, i32 208, i32 216, i32 210, i32 218, i32 212, i32 220, i32 214, i32 222, i32 160, i32 168, i32 162, i32 170, i32 164, i32 172, i32 166, i32 174, i32 224, i32 232, i32 226, i32 234, i32 228, i32 236, i32 230, i32 238, i32 176, i32 184, i32 178, i32 186, i32 180, i32 188, i32 182, i32 190, i32 240, i32 248, i32 242, i32 250, i32 244, i32 252, i32 246, i32 254, i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, i32 65, i32 73, i32 67, i32 75, i32 69, i32 77, i32 71, i32 79, i32 17, i32 25, i32 19, i32 27, i32 21, i32 29, i32 23, i32 31, i32 81, i32 89, i32 83, i32 91, i32 85, i32 93, i32 87, i32 95, i32 33, i32 41, i32 35, i32 43, i32 37, i32 45, i32 39, i32 47, i32 97, i32 105, i32 99, i32 107, i32 101, i32 109, i32 103, i32 111, i32 49, i32 57, i32 51, i32 59, i32 53, i32 61, i32 55, i32 63, i32 113, i32 121, i32 115, i32 123, i32 117, i32 125, i32 119, i32 127, i32 129, i32 137, i32 131, i32 139, i32 133, i32 141, i32 135, i32 143, i32 193, i32 201, i32 195, i32 203, i32 197, i32 205, i32 199, i32 207, i32 145, i32 153, i32 147, i32 155, i32 
149, i32 157, i32 151, i32 159, i32 209, i32 217, i32 211, i32 219, i32 213, i32 221, i32 215, i32 223, i32 161, i32 169, i32 163, i32 171, i32 165, i32 173, i32 167, i32 175, i32 225, i32 233, i32 227, i32 235, i32 229, i32 237, i32 231, i32 239, i32 177, i32 185, i32 179, i32 187, i32 181, i32 189, i32 183, i32 191, i32 241, i32 249, i32 243, i32 251, i32 245, i32 253, i32 247, i32 255> @@ -597,7 +597,7 @@ define <256 x i8> @vdeal_49(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_4a: -; CHECK: [[REG4a:r[0-9]+]] = #74 +; CHECK: [[REG4a:r[0-9]+]] = #-54 ; CHECK: vdeal(v1,v0,[[REG4a]]) define <256 x i8> @vdeal_4a(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13, i32 64, i32 65, i32 72, i32 73, i32 68, i32 69, i32 76, i32 77, i32 16, i32 17, i32 24, i32 25, i32 20, i32 21, i32 28, i32 29, i32 80, i32 81, i32 88, i32 89, i32 84, i32 85, i32 92, i32 93, i32 32, i32 33, i32 40, i32 41, i32 36, i32 37, i32 44, i32 45, i32 96, i32 97, i32 104, i32 105, i32 100, i32 101, i32 108, i32 109, i32 48, i32 49, i32 56, i32 57, i32 52, i32 53, i32 60, i32 61, i32 112, i32 113, i32 120, i32 121, i32 116, i32 117, i32 124, i32 125, i32 128, i32 129, i32 136, i32 137, i32 132, i32 133, i32 140, i32 141, i32 192, i32 193, i32 200, i32 201, i32 196, i32 197, i32 204, i32 205, i32 144, i32 145, i32 152, i32 153, i32 148, i32 149, i32 156, i32 157, i32 208, i32 209, i32 216, i32 217, i32 212, i32 213, i32 220, i32 221, i32 160, i32 161, i32 168, i32 169, i32 164, i32 165, i32 172, i32 173, i32 224, i32 225, i32 232, i32 233, i32 228, i32 229, i32 236, i32 237, i32 176, i32 177, i32 184, i32 185, i32 180, i32 181, i32 188, i32 189, i32 240, i32 241, i32 248, i32 249, i32 244, i32 245, i32 252, i32 253, i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15, i32 66, i32 67, i32 74, i32 75, i32 70, i32 71, i32 78, i32 79, i32 18, i32 19, i32 26, i32 27, i32 22, i32 23, i32 30, i32 31, i32 82, i32 83, i32 90, i32 91, i32 86, i32 87, i32 94, i32 95, i32 34, i32 35, i32 42, i32 43, i32 38, i32 39, i32 46, i32 47, i32 98, i32 99, i32 106, i32 107, i32 102, i32 103, i32 110, i32 111, i32 50, i32 51, i32 58, i32 59, i32 54, i32 55, i32 62, i32 63, i32 114, i32 115, i32 122, i32 123, i32 118, i32 119, i32 126, i32 127, i32 130, i32 131, i32 138, i32 139, i32 134, i32 135, i32 142, i32 143, i32 194, i32 195, i32 202, i32 203, i32 198, i32 199, i32 206, i32 207, i32 146, i32 147, i32 154, i32 155, i32 150, i32 151, i32 158, i32 159, i32 210, i32 211, i32 218, i32 219, i32 214, i32 215, i32 222, i32 223, i32 162, i32 163, i32 170, i32 171, i32 166, i32 167, i32 174, i32 175, i32 226, i32 227, i32 234, i32 235, i32 230, i32 231, i32 238, i32 239, i32 178, i32 179, i32 186, i32 187, i32 182, i32 183, i32 190, i32 191, i32 242, i32 243, i32 250, i32 251, i32 246, i32 247, i32 254, i32 255> @@ -605,7 +605,7 @@ define <256 x i8> @vdeal_4a(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_4b: -; CHECK: [[REG4b:r[0-9]+]] = #75 +; CHECK: [[REG4b:r[0-9]+]] = #-53 ; CHECK: vdeal(v1,v0,[[REG4b]]) define <256 x i8> @vdeal_4b(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14, i32 64, i32 66, i32 72, i32 74, i32 68, i32 70, i32 76, i32 78, i32 16, i32 18, i32 24, i32 26, i32 20, i32 22, i32 28, i32 30, i32 80, i32 82, i32 88, i32 90, i32 84, i32 86, i32 92, i32 94, i32 32, i32 34, i32 40, i32 
42, i32 36, i32 38, i32 44, i32 46, i32 96, i32 98, i32 104, i32 106, i32 100, i32 102, i32 108, i32 110, i32 48, i32 50, i32 56, i32 58, i32 52, i32 54, i32 60, i32 62, i32 112, i32 114, i32 120, i32 122, i32 116, i32 118, i32 124, i32 126, i32 128, i32 130, i32 136, i32 138, i32 132, i32 134, i32 140, i32 142, i32 192, i32 194, i32 200, i32 202, i32 196, i32 198, i32 204, i32 206, i32 144, i32 146, i32 152, i32 154, i32 148, i32 150, i32 156, i32 158, i32 208, i32 210, i32 216, i32 218, i32 212, i32 214, i32 220, i32 222, i32 160, i32 162, i32 168, i32 170, i32 164, i32 166, i32 172, i32 174, i32 224, i32 226, i32 232, i32 234, i32 228, i32 230, i32 236, i32 238, i32 176, i32 178, i32 184, i32 186, i32 180, i32 182, i32 188, i32 190, i32 240, i32 242, i32 248, i32 250, i32 244, i32 246, i32 252, i32 254, i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15, i32 65, i32 67, i32 73, i32 75, i32 69, i32 71, i32 77, i32 79, i32 17, i32 19, i32 25, i32 27, i32 21, i32 23, i32 29, i32 31, i32 81, i32 83, i32 89, i32 91, i32 85, i32 87, i32 93, i32 95, i32 33, i32 35, i32 41, i32 43, i32 37, i32 39, i32 45, i32 47, i32 97, i32 99, i32 105, i32 107, i32 101, i32 103, i32 109, i32 111, i32 49, i32 51, i32 57, i32 59, i32 53, i32 55, i32 61, i32 63, i32 113, i32 115, i32 121, i32 123, i32 117, i32 119, i32 125, i32 127, i32 129, i32 131, i32 137, i32 139, i32 133, i32 135, i32 141, i32 143, i32 193, i32 195, i32 201, i32 203, i32 197, i32 199, i32 205, i32 207, i32 145, i32 147, i32 153, i32 155, i32 149, i32 151, i32 157, i32 159, i32 209, i32 211, i32 217, i32 219, i32 213, i32 215, i32 221, i32 223, i32 161, i32 163, i32 169, i32 171, i32 165, i32 167, i32 173, i32 175, i32 225, i32 227, i32 233, i32 235, i32 229, i32 231, i32 237, i32 239, i32 177, i32 179, i32 185, i32 187, i32 181, i32 183, i32 189, i32 191, i32 241, i32 243, i32 249, i32 251, i32 245, i32 247, i32 253, i32 255> @@ -613,7 +613,7 @@ define <256 x i8> @vdeal_4b(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_4c: -; CHECK: [[REG4c:r[0-9]+]] = #76 +; CHECK: [[REG4c:r[0-9]+]] = #-52 ; CHECK: vdeal(v1,v0,[[REG4c]]) define <256 x i8> @vdeal_4c(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 64, i32 65, i32 66, i32 67, i32 72, i32 73, i32 74, i32 75, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27, i32 80, i32 81, i32 82, i32 83, i32 88, i32 89, i32 90, i32 91, i32 32, i32 33, i32 34, i32 35, i32 40, i32 41, i32 42, i32 43, i32 96, i32 97, i32 98, i32 99, i32 104, i32 105, i32 106, i32 107, i32 48, i32 49, i32 50, i32 51, i32 56, i32 57, i32 58, i32 59, i32 112, i32 113, i32 114, i32 115, i32 120, i32 121, i32 122, i32 123, i32 128, i32 129, i32 130, i32 131, i32 136, i32 137, i32 138, i32 139, i32 192, i32 193, i32 194, i32 195, i32 200, i32 201, i32 202, i32 203, i32 144, i32 145, i32 146, i32 147, i32 152, i32 153, i32 154, i32 155, i32 208, i32 209, i32 210, i32 211, i32 216, i32 217, i32 218, i32 219, i32 160, i32 161, i32 162, i32 163, i32 168, i32 169, i32 170, i32 171, i32 224, i32 225, i32 226, i32 227, i32 232, i32 233, i32 234, i32 235, i32 176, i32 177, i32 178, i32 179, i32 184, i32 185, i32 186, i32 187, i32 240, i32 241, i32 242, i32 243, i32 248, i32 249, i32 250, i32 251, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 68, i32 69, i32 70, i32 71, i32 76, i32 77, i32 78, i32 79, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31, i32 84, i32 
85, i32 86, i32 87, i32 92, i32 93, i32 94, i32 95, i32 36, i32 37, i32 38, i32 39, i32 44, i32 45, i32 46, i32 47, i32 100, i32 101, i32 102, i32 103, i32 108, i32 109, i32 110, i32 111, i32 52, i32 53, i32 54, i32 55, i32 60, i32 61, i32 62, i32 63, i32 116, i32 117, i32 118, i32 119, i32 124, i32 125, i32 126, i32 127, i32 132, i32 133, i32 134, i32 135, i32 140, i32 141, i32 142, i32 143, i32 196, i32 197, i32 198, i32 199, i32 204, i32 205, i32 206, i32 207, i32 148, i32 149, i32 150, i32 151, i32 156, i32 157, i32 158, i32 159, i32 212, i32 213, i32 214, i32 215, i32 220, i32 221, i32 222, i32 223, i32 164, i32 165, i32 166, i32 167, i32 172, i32 173, i32 174, i32 175, i32 228, i32 229, i32 230, i32 231, i32 236, i32 237, i32 238, i32 239, i32 180, i32 181, i32 182, i32 183, i32 188, i32 189, i32 190, i32 191, i32 244, i32 245, i32 246, i32 247, i32 252, i32 253, i32 254, i32 255> @@ -621,7 +621,7 @@ define <256 x i8> @vdeal_4c(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_4d: -; CHECK: [[REG4d:r[0-9]+]] = #77 +; CHECK: [[REG4d:r[0-9]+]] = #-51 ; CHECK: vdeal(v1,v0,[[REG4d]]) define <256 x i8> @vdeal_4d(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 4, i32 2, i32 6, i32 8, i32 12, i32 10, i32 14, i32 64, i32 68, i32 66, i32 70, i32 72, i32 76, i32 74, i32 78, i32 16, i32 20, i32 18, i32 22, i32 24, i32 28, i32 26, i32 30, i32 80, i32 84, i32 82, i32 86, i32 88, i32 92, i32 90, i32 94, i32 32, i32 36, i32 34, i32 38, i32 40, i32 44, i32 42, i32 46, i32 96, i32 100, i32 98, i32 102, i32 104, i32 108, i32 106, i32 110, i32 48, i32 52, i32 50, i32 54, i32 56, i32 60, i32 58, i32 62, i32 112, i32 116, i32 114, i32 118, i32 120, i32 124, i32 122, i32 126, i32 128, i32 132, i32 130, i32 134, i32 136, i32 140, i32 138, i32 142, i32 192, i32 196, i32 194, i32 198, i32 200, i32 204, i32 202, i32 206, i32 144, i32 148, i32 146, i32 150, i32 152, i32 156, i32 154, i32 158, i32 208, i32 212, i32 210, i32 214, i32 216, i32 220, i32 218, i32 222, i32 160, i32 164, i32 162, i32 166, i32 168, i32 172, i32 170, i32 174, i32 224, i32 228, i32 226, i32 230, i32 232, i32 236, i32 234, i32 238, i32 176, i32 180, i32 178, i32 182, i32 184, i32 188, i32 186, i32 190, i32 240, i32 244, i32 242, i32 246, i32 248, i32 252, i32 250, i32 254, i32 1, i32 5, i32 3, i32 7, i32 9, i32 13, i32 11, i32 15, i32 65, i32 69, i32 67, i32 71, i32 73, i32 77, i32 75, i32 79, i32 17, i32 21, i32 19, i32 23, i32 25, i32 29, i32 27, i32 31, i32 81, i32 85, i32 83, i32 87, i32 89, i32 93, i32 91, i32 95, i32 33, i32 37, i32 35, i32 39, i32 41, i32 45, i32 43, i32 47, i32 97, i32 101, i32 99, i32 103, i32 105, i32 109, i32 107, i32 111, i32 49, i32 53, i32 51, i32 55, i32 57, i32 61, i32 59, i32 63, i32 113, i32 117, i32 115, i32 119, i32 121, i32 125, i32 123, i32 127, i32 129, i32 133, i32 131, i32 135, i32 137, i32 141, i32 139, i32 143, i32 193, i32 197, i32 195, i32 199, i32 201, i32 205, i32 203, i32 207, i32 145, i32 149, i32 147, i32 151, i32 153, i32 157, i32 155, i32 159, i32 209, i32 213, i32 211, i32 215, i32 217, i32 221, i32 219, i32 223, i32 161, i32 165, i32 163, i32 167, i32 169, i32 173, i32 171, i32 175, i32 225, i32 229, i32 227, i32 231, i32 233, i32 237, i32 235, i32 239, i32 177, i32 181, i32 179, i32 183, i32 185, i32 189, i32 187, i32 191, i32 241, i32 245, i32 243, i32 247, i32 249, i32 253, i32 251, i32 255> @@ -629,7 +629,7 @@ define <256 x i8> @vdeal_4d(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_4e: -; 
CHECK: [[REG4e:r[0-9]+]] = #78 +; CHECK: [[REG4e:r[0-9]+]] = #-50 ; CHECK: vdeal(v1,v0,[[REG4e]]) define <256 x i8> @vdeal_4e(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 64, i32 65, i32 68, i32 69, i32 72, i32 73, i32 76, i32 77, i32 16, i32 17, i32 20, i32 21, i32 24, i32 25, i32 28, i32 29, i32 80, i32 81, i32 84, i32 85, i32 88, i32 89, i32 92, i32 93, i32 32, i32 33, i32 36, i32 37, i32 40, i32 41, i32 44, i32 45, i32 96, i32 97, i32 100, i32 101, i32 104, i32 105, i32 108, i32 109, i32 48, i32 49, i32 52, i32 53, i32 56, i32 57, i32 60, i32 61, i32 112, i32 113, i32 116, i32 117, i32 120, i32 121, i32 124, i32 125, i32 128, i32 129, i32 132, i32 133, i32 136, i32 137, i32 140, i32 141, i32 192, i32 193, i32 196, i32 197, i32 200, i32 201, i32 204, i32 205, i32 144, i32 145, i32 148, i32 149, i32 152, i32 153, i32 156, i32 157, i32 208, i32 209, i32 212, i32 213, i32 216, i32 217, i32 220, i32 221, i32 160, i32 161, i32 164, i32 165, i32 168, i32 169, i32 172, i32 173, i32 224, i32 225, i32 228, i32 229, i32 232, i32 233, i32 236, i32 237, i32 176, i32 177, i32 180, i32 181, i32 184, i32 185, i32 188, i32 189, i32 240, i32 241, i32 244, i32 245, i32 248, i32 249, i32 252, i32 253, i32 2, i32 3, i32 6, i32 7, i32 10, i32 11, i32 14, i32 15, i32 66, i32 67, i32 70, i32 71, i32 74, i32 75, i32 78, i32 79, i32 18, i32 19, i32 22, i32 23, i32 26, i32 27, i32 30, i32 31, i32 82, i32 83, i32 86, i32 87, i32 90, i32 91, i32 94, i32 95, i32 34, i32 35, i32 38, i32 39, i32 42, i32 43, i32 46, i32 47, i32 98, i32 99, i32 102, i32 103, i32 106, i32 107, i32 110, i32 111, i32 50, i32 51, i32 54, i32 55, i32 58, i32 59, i32 62, i32 63, i32 114, i32 115, i32 118, i32 119, i32 122, i32 123, i32 126, i32 127, i32 130, i32 131, i32 134, i32 135, i32 138, i32 139, i32 142, i32 143, i32 194, i32 195, i32 198, i32 199, i32 202, i32 203, i32 206, i32 207, i32 146, i32 147, i32 150, i32 151, i32 154, i32 155, i32 158, i32 159, i32 210, i32 211, i32 214, i32 215, i32 218, i32 219, i32 222, i32 223, i32 162, i32 163, i32 166, i32 167, i32 170, i32 171, i32 174, i32 175, i32 226, i32 227, i32 230, i32 231, i32 234, i32 235, i32 238, i32 239, i32 178, i32 179, i32 182, i32 183, i32 186, i32 187, i32 190, i32 191, i32 242, i32 243, i32 246, i32 247, i32 250, i32 251, i32 254, i32 255> @@ -637,7 +637,7 @@ define <256 x i8> @vdeal_4e(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_4f: -; CHECK: [[REG4f:r[0-9]+]] = #79 +; CHECK: [[REG4f:r[0-9]+]] = #-49 ; CHECK: vdeal(v1,v0,[[REG4f]]) define <256 x i8> @vdeal_4f(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126, i32 128, i32 130, i32 132, i32 134, i32 136, i32 138, i32 140, i32 142, i32 192, i32 194, i32 196, i32 198, i32 200, i32 202, i32 204, i32 206, i32 144, i32 146, i32 148, i32 150, i32 152, i32 154, i32 156, i32 158, i32 208, i32 210, i32 212, i32 214, i32 216, i32 
218, i32 220, i32 222, i32 160, i32 162, i32 164, i32 166, i32 168, i32 170, i32 172, i32 174, i32 224, i32 226, i32 228, i32 230, i32 232, i32 234, i32 236, i32 238, i32 176, i32 178, i32 180, i32 182, i32 184, i32 186, i32 188, i32 190, i32 240, i32 242, i32 244, i32 246, i32 248, i32 250, i32 252, i32 254, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127, i32 129, i32 131, i32 133, i32 135, i32 137, i32 139, i32 141, i32 143, i32 193, i32 195, i32 197, i32 199, i32 201, i32 203, i32 205, i32 207, i32 145, i32 147, i32 149, i32 151, i32 153, i32 155, i32 157, i32 159, i32 209, i32 211, i32 213, i32 215, i32 217, i32 219, i32 221, i32 223, i32 161, i32 163, i32 165, i32 167, i32 169, i32 171, i32 173, i32 175, i32 225, i32 227, i32 229, i32 231, i32 233, i32 235, i32 237, i32 239, i32 177, i32 179, i32 181, i32 183, i32 185, i32 187, i32 189, i32 191, i32 241, i32 243, i32 245, i32 247, i32 249, i32 251, i32 253, i32 255> @@ -645,7 +645,7 @@ define <256 x i8> @vdeal_4f(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_50: -; CHECK: [[REG50:r[0-9]+]] = #80 +; CHECK: [[REG50:r[0-9]+]] = #-48 ; CHECK: vdeal(v1,v0,[[REG50]]) define <256 x i8> @vdeal_50(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 208, i32 209, i32 210, i32 211, i32 
212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -653,7 +653,7 @@ define <256 x i8> @vdeal_50(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_51: -; CHECK: [[REG51:r[0-9]+]] = #81 +; CHECK: [[REG51:r[0-9]+]] = #-47 ; CHECK: vdeal(v1,v0,[[REG51]]) define <256 x i8> @vdeal_51(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30, i32 64, i32 80, i32 66, i32 82, i32 68, i32 84, i32 70, i32 86, i32 72, i32 88, i32 74, i32 90, i32 76, i32 92, i32 78, i32 94, i32 32, i32 48, i32 34, i32 50, i32 36, i32 52, i32 38, i32 54, i32 40, i32 56, i32 42, i32 58, i32 44, i32 60, i32 46, i32 62, i32 96, i32 112, i32 98, i32 114, i32 100, i32 116, i32 102, i32 118, i32 104, i32 120, i32 106, i32 122, i32 108, i32 124, i32 110, i32 126, i32 128, i32 144, i32 130, i32 146, i32 132, i32 148, i32 134, i32 150, i32 136, i32 152, i32 138, i32 154, i32 140, i32 156, i32 142, i32 158, i32 192, i32 208, i32 194, i32 210, i32 196, i32 212, i32 198, i32 214, i32 200, i32 216, i32 202, i32 218, i32 204, i32 220, i32 206, i32 222, i32 160, i32 176, i32 162, i32 178, i32 164, i32 180, i32 166, i32 182, i32 168, i32 184, i32 170, i32 186, i32 172, i32 188, i32 174, i32 190, i32 224, i32 240, i32 226, i32 242, i32 228, i32 244, i32 230, i32 246, i32 232, i32 248, i32 234, i32 250, i32 236, i32 252, i32 238, i32 254, i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31, i32 65, i32 81, i32 67, i32 83, i32 69, i32 85, i32 71, i32 87, i32 73, i32 89, i32 75, i32 91, i32 77, i32 93, i32 79, i32 95, i32 33, i32 49, i32 35, i32 51, i32 37, i32 53, i32 39, i32 55, i32 41, i32 57, i32 43, i32 59, i32 45, i32 61, i32 47, i32 63, i32 97, i32 113, i32 99, i32 115, i32 101, i32 117, i32 103, i32 119, i32 105, i32 121, i32 107, i32 123, i32 109, i32 125, i32 111, i32 127, i32 129, i32 145, i32 131, i32 147, i32 133, i32 149, i32 135, i32 151, i32 137, i32 153, i32 139, i32 155, i32 141, i32 157, i32 143, i32 159, i32 193, i32 209, i32 195, i32 211, i32 197, i32 213, i32 199, i32 215, i32 201, i32 217, i32 203, i32 219, i32 205, i32 221, i32 207, i32 223, i32 161, i32 177, i32 163, i32 179, i32 165, i32 181, i32 167, i32 183, i32 169, i32 185, i32 171, i32 187, i32 173, i32 189, i32 175, i32 191, i32 225, i32 241, i32 227, i32 243, i32 229, i32 245, i32 231, i32 247, i32 233, i32 249, i32 235, i32 251, i32 237, i32 253, i32 239, i32 255> @@ -661,7 +661,7 @@ define <256 x i8> @vdeal_51(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_52: -; CHECK: [[REG52:r[0-9]+]] = #82 +; CHECK: [[REG52:r[0-9]+]] = #-46 ; CHECK: vdeal(v1,v0,[[REG52]]) define <256 x i8> @vdeal_52(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 20, i32 21, i32 8, i32 9, i32 24, i32 25, i32 12, i32 13, i32 28, i32 29, i32 64, i32 65, i32 80, i32 81, i32 68, i32 69, i32 84, i32 85, i32 72, i32 73, i32 88, i32 89, i32 76, i32 77, i32 92, i32 93, i32 32, i32 33, i32 48, i32 
49, i32 36, i32 37, i32 52, i32 53, i32 40, i32 41, i32 56, i32 57, i32 44, i32 45, i32 60, i32 61, i32 96, i32 97, i32 112, i32 113, i32 100, i32 101, i32 116, i32 117, i32 104, i32 105, i32 120, i32 121, i32 108, i32 109, i32 124, i32 125, i32 128, i32 129, i32 144, i32 145, i32 132, i32 133, i32 148, i32 149, i32 136, i32 137, i32 152, i32 153, i32 140, i32 141, i32 156, i32 157, i32 192, i32 193, i32 208, i32 209, i32 196, i32 197, i32 212, i32 213, i32 200, i32 201, i32 216, i32 217, i32 204, i32 205, i32 220, i32 221, i32 160, i32 161, i32 176, i32 177, i32 164, i32 165, i32 180, i32 181, i32 168, i32 169, i32 184, i32 185, i32 172, i32 173, i32 188, i32 189, i32 224, i32 225, i32 240, i32 241, i32 228, i32 229, i32 244, i32 245, i32 232, i32 233, i32 248, i32 249, i32 236, i32 237, i32 252, i32 253, i32 2, i32 3, i32 18, i32 19, i32 6, i32 7, i32 22, i32 23, i32 10, i32 11, i32 26, i32 27, i32 14, i32 15, i32 30, i32 31, i32 66, i32 67, i32 82, i32 83, i32 70, i32 71, i32 86, i32 87, i32 74, i32 75, i32 90, i32 91, i32 78, i32 79, i32 94, i32 95, i32 34, i32 35, i32 50, i32 51, i32 38, i32 39, i32 54, i32 55, i32 42, i32 43, i32 58, i32 59, i32 46, i32 47, i32 62, i32 63, i32 98, i32 99, i32 114, i32 115, i32 102, i32 103, i32 118, i32 119, i32 106, i32 107, i32 122, i32 123, i32 110, i32 111, i32 126, i32 127, i32 130, i32 131, i32 146, i32 147, i32 134, i32 135, i32 150, i32 151, i32 138, i32 139, i32 154, i32 155, i32 142, i32 143, i32 158, i32 159, i32 194, i32 195, i32 210, i32 211, i32 198, i32 199, i32 214, i32 215, i32 202, i32 203, i32 218, i32 219, i32 206, i32 207, i32 222, i32 223, i32 162, i32 163, i32 178, i32 179, i32 166, i32 167, i32 182, i32 183, i32 170, i32 171, i32 186, i32 187, i32 174, i32 175, i32 190, i32 191, i32 226, i32 227, i32 242, i32 243, i32 230, i32 231, i32 246, i32 247, i32 234, i32 235, i32 250, i32 251, i32 238, i32 239, i32 254, i32 255> @@ -669,7 +669,7 @@ define <256 x i8> @vdeal_52(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_53: -; CHECK: [[REG53:r[0-9]+]] = #83 +; CHECK: [[REG53:r[0-9]+]] = #-45 ; CHECK: vdeal(v1,v0,[[REG53]]) define <256 x i8> @vdeal_53(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 16, i32 18, i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30, i32 64, i32 66, i32 80, i32 82, i32 68, i32 70, i32 84, i32 86, i32 72, i32 74, i32 88, i32 90, i32 76, i32 78, i32 92, i32 94, i32 32, i32 34, i32 48, i32 50, i32 36, i32 38, i32 52, i32 54, i32 40, i32 42, i32 56, i32 58, i32 44, i32 46, i32 60, i32 62, i32 96, i32 98, i32 112, i32 114, i32 100, i32 102, i32 116, i32 118, i32 104, i32 106, i32 120, i32 122, i32 108, i32 110, i32 124, i32 126, i32 128, i32 130, i32 144, i32 146, i32 132, i32 134, i32 148, i32 150, i32 136, i32 138, i32 152, i32 154, i32 140, i32 142, i32 156, i32 158, i32 192, i32 194, i32 208, i32 210, i32 196, i32 198, i32 212, i32 214, i32 200, i32 202, i32 216, i32 218, i32 204, i32 206, i32 220, i32 222, i32 160, i32 162, i32 176, i32 178, i32 164, i32 166, i32 180, i32 182, i32 168, i32 170, i32 184, i32 186, i32 172, i32 174, i32 188, i32 190, i32 224, i32 226, i32 240, i32 242, i32 228, i32 230, i32 244, i32 246, i32 232, i32 234, i32 248, i32 250, i32 236, i32 238, i32 252, i32 254, i32 1, i32 3, i32 17, i32 19, i32 5, i32 7, i32 21, i32 23, i32 9, i32 11, i32 25, i32 27, i32 13, i32 15, i32 29, i32 31, i32 65, i32 67, i32 81, i32 83, i32 69, i32 71, i32 85, i32 87, i32 73, 
i32 75, i32 89, i32 91, i32 77, i32 79, i32 93, i32 95, i32 33, i32 35, i32 49, i32 51, i32 37, i32 39, i32 53, i32 55, i32 41, i32 43, i32 57, i32 59, i32 45, i32 47, i32 61, i32 63, i32 97, i32 99, i32 113, i32 115, i32 101, i32 103, i32 117, i32 119, i32 105, i32 107, i32 121, i32 123, i32 109, i32 111, i32 125, i32 127, i32 129, i32 131, i32 145, i32 147, i32 133, i32 135, i32 149, i32 151, i32 137, i32 139, i32 153, i32 155, i32 141, i32 143, i32 157, i32 159, i32 193, i32 195, i32 209, i32 211, i32 197, i32 199, i32 213, i32 215, i32 201, i32 203, i32 217, i32 219, i32 205, i32 207, i32 221, i32 223, i32 161, i32 163, i32 177, i32 179, i32 165, i32 167, i32 181, i32 183, i32 169, i32 171, i32 185, i32 187, i32 173, i32 175, i32 189, i32 191, i32 225, i32 227, i32 241, i32 243, i32 229, i32 231, i32 245, i32 247, i32 233, i32 235, i32 249, i32 251, i32 237, i32 239, i32 253, i32 255> @@ -677,7 +677,7 @@ define <256 x i8> @vdeal_53(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_54: -; CHECK: [[REG54:r[0-9]+]] = #84 +; CHECK: [[REG54:r[0-9]+]] = #-44 ; CHECK: vdeal(v1,v0,[[REG54]]) define <256 x i8> @vdeal_54(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 64, i32 65, i32 66, i32 67, i32 80, i32 81, i32 82, i32 83, i32 72, i32 73, i32 74, i32 75, i32 88, i32 89, i32 90, i32 91, i32 32, i32 33, i32 34, i32 35, i32 48, i32 49, i32 50, i32 51, i32 40, i32 41, i32 42, i32 43, i32 56, i32 57, i32 58, i32 59, i32 96, i32 97, i32 98, i32 99, i32 112, i32 113, i32 114, i32 115, i32 104, i32 105, i32 106, i32 107, i32 120, i32 121, i32 122, i32 123, i32 128, i32 129, i32 130, i32 131, i32 144, i32 145, i32 146, i32 147, i32 136, i32 137, i32 138, i32 139, i32 152, i32 153, i32 154, i32 155, i32 192, i32 193, i32 194, i32 195, i32 208, i32 209, i32 210, i32 211, i32 200, i32 201, i32 202, i32 203, i32 216, i32 217, i32 218, i32 219, i32 160, i32 161, i32 162, i32 163, i32 176, i32 177, i32 178, i32 179, i32 168, i32 169, i32 170, i32 171, i32 184, i32 185, i32 186, i32 187, i32 224, i32 225, i32 226, i32 227, i32 240, i32 241, i32 242, i32 243, i32 232, i32 233, i32 234, i32 235, i32 248, i32 249, i32 250, i32 251, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31, i32 68, i32 69, i32 70, i32 71, i32 84, i32 85, i32 86, i32 87, i32 76, i32 77, i32 78, i32 79, i32 92, i32 93, i32 94, i32 95, i32 36, i32 37, i32 38, i32 39, i32 52, i32 53, i32 54, i32 55, i32 44, i32 45, i32 46, i32 47, i32 60, i32 61, i32 62, i32 63, i32 100, i32 101, i32 102, i32 103, i32 116, i32 117, i32 118, i32 119, i32 108, i32 109, i32 110, i32 111, i32 124, i32 125, i32 126, i32 127, i32 132, i32 133, i32 134, i32 135, i32 148, i32 149, i32 150, i32 151, i32 140, i32 141, i32 142, i32 143, i32 156, i32 157, i32 158, i32 159, i32 196, i32 197, i32 198, i32 199, i32 212, i32 213, i32 214, i32 215, i32 204, i32 205, i32 206, i32 207, i32 220, i32 221, i32 222, i32 223, i32 164, i32 165, i32 166, i32 167, i32 180, i32 181, i32 182, i32 183, i32 172, i32 173, i32 174, i32 175, i32 188, i32 189, i32 190, i32 191, i32 228, i32 229, i32 230, i32 231, i32 244, i32 245, i32 246, i32 247, i32 236, i32 237, i32 238, i32 239, i32 252, i32 253, i32 254, i32 255> @@ -685,7 +685,7 @@ define <256 x i8> @vdeal_54(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_55: 
-; CHECK: [[REG55:r[0-9]+]] = #85 +; CHECK: [[REG55:r[0-9]+]] = #-43 ; CHECK: vdeal(v1,v0,[[REG55]]) define <256 x i8> @vdeal_55(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 4, i32 2, i32 6, i32 16, i32 20, i32 18, i32 22, i32 8, i32 12, i32 10, i32 14, i32 24, i32 28, i32 26, i32 30, i32 64, i32 68, i32 66, i32 70, i32 80, i32 84, i32 82, i32 86, i32 72, i32 76, i32 74, i32 78, i32 88, i32 92, i32 90, i32 94, i32 32, i32 36, i32 34, i32 38, i32 48, i32 52, i32 50, i32 54, i32 40, i32 44, i32 42, i32 46, i32 56, i32 60, i32 58, i32 62, i32 96, i32 100, i32 98, i32 102, i32 112, i32 116, i32 114, i32 118, i32 104, i32 108, i32 106, i32 110, i32 120, i32 124, i32 122, i32 126, i32 128, i32 132, i32 130, i32 134, i32 144, i32 148, i32 146, i32 150, i32 136, i32 140, i32 138, i32 142, i32 152, i32 156, i32 154, i32 158, i32 192, i32 196, i32 194, i32 198, i32 208, i32 212, i32 210, i32 214, i32 200, i32 204, i32 202, i32 206, i32 216, i32 220, i32 218, i32 222, i32 160, i32 164, i32 162, i32 166, i32 176, i32 180, i32 178, i32 182, i32 168, i32 172, i32 170, i32 174, i32 184, i32 188, i32 186, i32 190, i32 224, i32 228, i32 226, i32 230, i32 240, i32 244, i32 242, i32 246, i32 232, i32 236, i32 234, i32 238, i32 248, i32 252, i32 250, i32 254, i32 1, i32 5, i32 3, i32 7, i32 17, i32 21, i32 19, i32 23, i32 9, i32 13, i32 11, i32 15, i32 25, i32 29, i32 27, i32 31, i32 65, i32 69, i32 67, i32 71, i32 81, i32 85, i32 83, i32 87, i32 73, i32 77, i32 75, i32 79, i32 89, i32 93, i32 91, i32 95, i32 33, i32 37, i32 35, i32 39, i32 49, i32 53, i32 51, i32 55, i32 41, i32 45, i32 43, i32 47, i32 57, i32 61, i32 59, i32 63, i32 97, i32 101, i32 99, i32 103, i32 113, i32 117, i32 115, i32 119, i32 105, i32 109, i32 107, i32 111, i32 121, i32 125, i32 123, i32 127, i32 129, i32 133, i32 131, i32 135, i32 145, i32 149, i32 147, i32 151, i32 137, i32 141, i32 139, i32 143, i32 153, i32 157, i32 155, i32 159, i32 193, i32 197, i32 195, i32 199, i32 209, i32 213, i32 211, i32 215, i32 201, i32 205, i32 203, i32 207, i32 217, i32 221, i32 219, i32 223, i32 161, i32 165, i32 163, i32 167, i32 177, i32 181, i32 179, i32 183, i32 169, i32 173, i32 171, i32 175, i32 185, i32 189, i32 187, i32 191, i32 225, i32 229, i32 227, i32 231, i32 241, i32 245, i32 243, i32 247, i32 233, i32 237, i32 235, i32 239, i32 249, i32 253, i32 251, i32 255> @@ -693,7 +693,7 @@ define <256 x i8> @vdeal_55(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_56: -; CHECK: [[REG56:r[0-9]+]] = #86 +; CHECK: [[REG56:r[0-9]+]] = #-42 ; CHECK: vdeal(v1,v0,[[REG56]]) define <256 x i8> @vdeal_56(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 4, i32 5, i32 16, i32 17, i32 20, i32 21, i32 8, i32 9, i32 12, i32 13, i32 24, i32 25, i32 28, i32 29, i32 64, i32 65, i32 68, i32 69, i32 80, i32 81, i32 84, i32 85, i32 72, i32 73, i32 76, i32 77, i32 88, i32 89, i32 92, i32 93, i32 32, i32 33, i32 36, i32 37, i32 48, i32 49, i32 52, i32 53, i32 40, i32 41, i32 44, i32 45, i32 56, i32 57, i32 60, i32 61, i32 96, i32 97, i32 100, i32 101, i32 112, i32 113, i32 116, i32 117, i32 104, i32 105, i32 108, i32 109, i32 120, i32 121, i32 124, i32 125, i32 128, i32 129, i32 132, i32 133, i32 144, i32 145, i32 148, i32 149, i32 136, i32 137, i32 140, i32 141, i32 152, i32 153, i32 156, i32 157, i32 192, i32 193, i32 196, i32 197, i32 208, i32 209, i32 212, i32 213, i32 200, i32 201, i32 204, i32 205, i32 216, 
i32 217, i32 220, i32 221, i32 160, i32 161, i32 164, i32 165, i32 176, i32 177, i32 180, i32 181, i32 168, i32 169, i32 172, i32 173, i32 184, i32 185, i32 188, i32 189, i32 224, i32 225, i32 228, i32 229, i32 240, i32 241, i32 244, i32 245, i32 232, i32 233, i32 236, i32 237, i32 248, i32 249, i32 252, i32 253, i32 2, i32 3, i32 6, i32 7, i32 18, i32 19, i32 22, i32 23, i32 10, i32 11, i32 14, i32 15, i32 26, i32 27, i32 30, i32 31, i32 66, i32 67, i32 70, i32 71, i32 82, i32 83, i32 86, i32 87, i32 74, i32 75, i32 78, i32 79, i32 90, i32 91, i32 94, i32 95, i32 34, i32 35, i32 38, i32 39, i32 50, i32 51, i32 54, i32 55, i32 42, i32 43, i32 46, i32 47, i32 58, i32 59, i32 62, i32 63, i32 98, i32 99, i32 102, i32 103, i32 114, i32 115, i32 118, i32 119, i32 106, i32 107, i32 110, i32 111, i32 122, i32 123, i32 126, i32 127, i32 130, i32 131, i32 134, i32 135, i32 146, i32 147, i32 150, i32 151, i32 138, i32 139, i32 142, i32 143, i32 154, i32 155, i32 158, i32 159, i32 194, i32 195, i32 198, i32 199, i32 210, i32 211, i32 214, i32 215, i32 202, i32 203, i32 206, i32 207, i32 218, i32 219, i32 222, i32 223, i32 162, i32 163, i32 166, i32 167, i32 178, i32 179, i32 182, i32 183, i32 170, i32 171, i32 174, i32 175, i32 186, i32 187, i32 190, i32 191, i32 226, i32 227, i32 230, i32 231, i32 242, i32 243, i32 246, i32 247, i32 234, i32 235, i32 238, i32 239, i32 250, i32 251, i32 254, i32 255> @@ -701,7 +701,7 @@ define <256 x i8> @vdeal_56(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_57: -; CHECK: [[REG57:r[0-9]+]] = #87 +; CHECK: [[REG57:r[0-9]+]] = #-41 ; CHECK: vdeal(v1,v0,[[REG57]]) define <256 x i8> @vdeal_57(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30, i32 64, i32 66, i32 68, i32 70, i32 80, i32 82, i32 84, i32 86, i32 72, i32 74, i32 76, i32 78, i32 88, i32 90, i32 92, i32 94, i32 32, i32 34, i32 36, i32 38, i32 48, i32 50, i32 52, i32 54, i32 40, i32 42, i32 44, i32 46, i32 56, i32 58, i32 60, i32 62, i32 96, i32 98, i32 100, i32 102, i32 112, i32 114, i32 116, i32 118, i32 104, i32 106, i32 108, i32 110, i32 120, i32 122, i32 124, i32 126, i32 128, i32 130, i32 132, i32 134, i32 144, i32 146, i32 148, i32 150, i32 136, i32 138, i32 140, i32 142, i32 152, i32 154, i32 156, i32 158, i32 192, i32 194, i32 196, i32 198, i32 208, i32 210, i32 212, i32 214, i32 200, i32 202, i32 204, i32 206, i32 216, i32 218, i32 220, i32 222, i32 160, i32 162, i32 164, i32 166, i32 176, i32 178, i32 180, i32 182, i32 168, i32 170, i32 172, i32 174, i32 184, i32 186, i32 188, i32 190, i32 224, i32 226, i32 228, i32 230, i32 240, i32 242, i32 244, i32 246, i32 232, i32 234, i32 236, i32 238, i32 248, i32 250, i32 252, i32 254, i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31, i32 65, i32 67, i32 69, i32 71, i32 81, i32 83, i32 85, i32 87, i32 73, i32 75, i32 77, i32 79, i32 89, i32 91, i32 93, i32 95, i32 33, i32 35, i32 37, i32 39, i32 49, i32 51, i32 53, i32 55, i32 41, i32 43, i32 45, i32 47, i32 57, i32 59, i32 61, i32 63, i32 97, i32 99, i32 101, i32 103, i32 113, i32 115, i32 117, i32 119, i32 105, i32 107, i32 109, i32 111, i32 121, i32 123, i32 125, i32 127, i32 129, i32 131, i32 133, i32 135, i32 145, i32 147, i32 149, i32 151, i32 137, i32 139, i32 141, i32 143, i32 153, i32 155, i32 157, i32 159, i32 193, i32 195, i32 197, i32 
199, i32 209, i32 211, i32 213, i32 215, i32 201, i32 203, i32 205, i32 207, i32 217, i32 219, i32 221, i32 223, i32 161, i32 163, i32 165, i32 167, i32 177, i32 179, i32 181, i32 183, i32 169, i32 171, i32 173, i32 175, i32 185, i32 187, i32 189, i32 191, i32 225, i32 227, i32 229, i32 231, i32 241, i32 243, i32 245, i32 247, i32 233, i32 235, i32 237, i32 239, i32 249, i32 251, i32 253, i32 255> @@ -709,7 +709,7 @@ define <256 x i8> @vdeal_57(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_58: -; CHECK: [[REG58:r[0-9]+]] = #88 +; CHECK: [[REG58:r[0-9]+]] = #-40 ; CHECK: vdeal(v1,v0,[[REG58]]) define <256 x i8> @vdeal_58(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -717,7 +717,7 @@ define <256 x i8> @vdeal_58(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_59: -; CHECK: [[REG59:r[0-9]+]] = #89 +; CHECK: [[REG59:r[0-9]+]] = #-39 ; CHECK: vdeal(v1,v0,[[REG59]]) define <256 x i8> @vdeal_59(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14, i32 16, i32 24, i32 18, i32 26, i32 20, i32 28, i32 22, i32 30, i32 64, i32 72, i32 66, i32 74, i32 68, i32 76, i32 70, i32 78, i32 80, i32 88, i32 82, i32 90, i32 84, i32 92, i32 86, i32 94, i32 32, i32 40, 
i32 34, i32 42, i32 36, i32 44, i32 38, i32 46, i32 48, i32 56, i32 50, i32 58, i32 52, i32 60, i32 54, i32 62, i32 96, i32 104, i32 98, i32 106, i32 100, i32 108, i32 102, i32 110, i32 112, i32 120, i32 114, i32 122, i32 116, i32 124, i32 118, i32 126, i32 128, i32 136, i32 130, i32 138, i32 132, i32 140, i32 134, i32 142, i32 144, i32 152, i32 146, i32 154, i32 148, i32 156, i32 150, i32 158, i32 192, i32 200, i32 194, i32 202, i32 196, i32 204, i32 198, i32 206, i32 208, i32 216, i32 210, i32 218, i32 212, i32 220, i32 214, i32 222, i32 160, i32 168, i32 162, i32 170, i32 164, i32 172, i32 166, i32 174, i32 176, i32 184, i32 178, i32 186, i32 180, i32 188, i32 182, i32 190, i32 224, i32 232, i32 226, i32 234, i32 228, i32 236, i32 230, i32 238, i32 240, i32 248, i32 242, i32 250, i32 244, i32 252, i32 246, i32 254, i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, i32 17, i32 25, i32 19, i32 27, i32 21, i32 29, i32 23, i32 31, i32 65, i32 73, i32 67, i32 75, i32 69, i32 77, i32 71, i32 79, i32 81, i32 89, i32 83, i32 91, i32 85, i32 93, i32 87, i32 95, i32 33, i32 41, i32 35, i32 43, i32 37, i32 45, i32 39, i32 47, i32 49, i32 57, i32 51, i32 59, i32 53, i32 61, i32 55, i32 63, i32 97, i32 105, i32 99, i32 107, i32 101, i32 109, i32 103, i32 111, i32 113, i32 121, i32 115, i32 123, i32 117, i32 125, i32 119, i32 127, i32 129, i32 137, i32 131, i32 139, i32 133, i32 141, i32 135, i32 143, i32 145, i32 153, i32 147, i32 155, i32 149, i32 157, i32 151, i32 159, i32 193, i32 201, i32 195, i32 203, i32 197, i32 205, i32 199, i32 207, i32 209, i32 217, i32 211, i32 219, i32 213, i32 221, i32 215, i32 223, i32 161, i32 169, i32 163, i32 171, i32 165, i32 173, i32 167, i32 175, i32 177, i32 185, i32 179, i32 187, i32 181, i32 189, i32 183, i32 191, i32 225, i32 233, i32 227, i32 235, i32 229, i32 237, i32 231, i32 239, i32 241, i32 249, i32 243, i32 251, i32 245, i32 253, i32 247, i32 255> @@ -725,7 +725,7 @@ define <256 x i8> @vdeal_59(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_5a: -; CHECK: [[REG5a:r[0-9]+]] = #90 +; CHECK: [[REG5a:r[0-9]+]] = #-38 ; CHECK: vdeal(v1,v0,[[REG5a]]) define <256 x i8> @vdeal_5a(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13, i32 16, i32 17, i32 24, i32 25, i32 20, i32 21, i32 28, i32 29, i32 64, i32 65, i32 72, i32 73, i32 68, i32 69, i32 76, i32 77, i32 80, i32 81, i32 88, i32 89, i32 84, i32 85, i32 92, i32 93, i32 32, i32 33, i32 40, i32 41, i32 36, i32 37, i32 44, i32 45, i32 48, i32 49, i32 56, i32 57, i32 52, i32 53, i32 60, i32 61, i32 96, i32 97, i32 104, i32 105, i32 100, i32 101, i32 108, i32 109, i32 112, i32 113, i32 120, i32 121, i32 116, i32 117, i32 124, i32 125, i32 128, i32 129, i32 136, i32 137, i32 132, i32 133, i32 140, i32 141, i32 144, i32 145, i32 152, i32 153, i32 148, i32 149, i32 156, i32 157, i32 192, i32 193, i32 200, i32 201, i32 196, i32 197, i32 204, i32 205, i32 208, i32 209, i32 216, i32 217, i32 212, i32 213, i32 220, i32 221, i32 160, i32 161, i32 168, i32 169, i32 164, i32 165, i32 172, i32 173, i32 176, i32 177, i32 184, i32 185, i32 180, i32 181, i32 188, i32 189, i32 224, i32 225, i32 232, i32 233, i32 228, i32 229, i32 236, i32 237, i32 240, i32 241, i32 248, i32 249, i32 244, i32 245, i32 252, i32 253, i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15, i32 18, i32 19, i32 26, i32 27, i32 22, i32 23, i32 30, i32 31, i32 66, i32 67, i32 74, i32 75, i32 70, i32 71, i32 78, i32 
79, i32 82, i32 83, i32 90, i32 91, i32 86, i32 87, i32 94, i32 95, i32 34, i32 35, i32 42, i32 43, i32 38, i32 39, i32 46, i32 47, i32 50, i32 51, i32 58, i32 59, i32 54, i32 55, i32 62, i32 63, i32 98, i32 99, i32 106, i32 107, i32 102, i32 103, i32 110, i32 111, i32 114, i32 115, i32 122, i32 123, i32 118, i32 119, i32 126, i32 127, i32 130, i32 131, i32 138, i32 139, i32 134, i32 135, i32 142, i32 143, i32 146, i32 147, i32 154, i32 155, i32 150, i32 151, i32 158, i32 159, i32 194, i32 195, i32 202, i32 203, i32 198, i32 199, i32 206, i32 207, i32 210, i32 211, i32 218, i32 219, i32 214, i32 215, i32 222, i32 223, i32 162, i32 163, i32 170, i32 171, i32 166, i32 167, i32 174, i32 175, i32 178, i32 179, i32 186, i32 187, i32 182, i32 183, i32 190, i32 191, i32 226, i32 227, i32 234, i32 235, i32 230, i32 231, i32 238, i32 239, i32 242, i32 243, i32 250, i32 251, i32 246, i32 247, i32 254, i32 255> @@ -733,7 +733,7 @@ define <256 x i8> @vdeal_5a(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_5b: -; CHECK: [[REG5b:r[0-9]+]] = #91 +; CHECK: [[REG5b:r[0-9]+]] = #-37 ; CHECK: vdeal(v1,v0,[[REG5b]]) define <256 x i8> @vdeal_5b(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14, i32 16, i32 18, i32 24, i32 26, i32 20, i32 22, i32 28, i32 30, i32 64, i32 66, i32 72, i32 74, i32 68, i32 70, i32 76, i32 78, i32 80, i32 82, i32 88, i32 90, i32 84, i32 86, i32 92, i32 94, i32 32, i32 34, i32 40, i32 42, i32 36, i32 38, i32 44, i32 46, i32 48, i32 50, i32 56, i32 58, i32 52, i32 54, i32 60, i32 62, i32 96, i32 98, i32 104, i32 106, i32 100, i32 102, i32 108, i32 110, i32 112, i32 114, i32 120, i32 122, i32 116, i32 118, i32 124, i32 126, i32 128, i32 130, i32 136, i32 138, i32 132, i32 134, i32 140, i32 142, i32 144, i32 146, i32 152, i32 154, i32 148, i32 150, i32 156, i32 158, i32 192, i32 194, i32 200, i32 202, i32 196, i32 198, i32 204, i32 206, i32 208, i32 210, i32 216, i32 218, i32 212, i32 214, i32 220, i32 222, i32 160, i32 162, i32 168, i32 170, i32 164, i32 166, i32 172, i32 174, i32 176, i32 178, i32 184, i32 186, i32 180, i32 182, i32 188, i32 190, i32 224, i32 226, i32 232, i32 234, i32 228, i32 230, i32 236, i32 238, i32 240, i32 242, i32 248, i32 250, i32 244, i32 246, i32 252, i32 254, i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15, i32 17, i32 19, i32 25, i32 27, i32 21, i32 23, i32 29, i32 31, i32 65, i32 67, i32 73, i32 75, i32 69, i32 71, i32 77, i32 79, i32 81, i32 83, i32 89, i32 91, i32 85, i32 87, i32 93, i32 95, i32 33, i32 35, i32 41, i32 43, i32 37, i32 39, i32 45, i32 47, i32 49, i32 51, i32 57, i32 59, i32 53, i32 55, i32 61, i32 63, i32 97, i32 99, i32 105, i32 107, i32 101, i32 103, i32 109, i32 111, i32 113, i32 115, i32 121, i32 123, i32 117, i32 119, i32 125, i32 127, i32 129, i32 131, i32 137, i32 139, i32 133, i32 135, i32 141, i32 143, i32 145, i32 147, i32 153, i32 155, i32 149, i32 151, i32 157, i32 159, i32 193, i32 195, i32 201, i32 203, i32 197, i32 199, i32 205, i32 207, i32 209, i32 211, i32 217, i32 219, i32 213, i32 215, i32 221, i32 223, i32 161, i32 163, i32 169, i32 171, i32 165, i32 167, i32 173, i32 175, i32 177, i32 179, i32 185, i32 187, i32 181, i32 183, i32 189, i32 191, i32 225, i32 227, i32 233, i32 235, i32 229, i32 231, i32 237, i32 239, i32 241, i32 243, i32 249, i32 251, i32 245, i32 247, i32 253, i32 255> @@ -741,7 +741,7 @@ define <256 x i8> @vdeal_5b(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: 
vdeal_5c: -; CHECK: [[REG5c:r[0-9]+]] = #92 +; CHECK: [[REG5c:r[0-9]+]] = #-36 ; CHECK: vdeal(v1,v0,[[REG5c]]) define <256 x i8> @vdeal_5c(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27, i32 64, i32 65, i32 66, i32 67, i32 72, i32 73, i32 74, i32 75, i32 80, i32 81, i32 82, i32 83, i32 88, i32 89, i32 90, i32 91, i32 32, i32 33, i32 34, i32 35, i32 40, i32 41, i32 42, i32 43, i32 48, i32 49, i32 50, i32 51, i32 56, i32 57, i32 58, i32 59, i32 96, i32 97, i32 98, i32 99, i32 104, i32 105, i32 106, i32 107, i32 112, i32 113, i32 114, i32 115, i32 120, i32 121, i32 122, i32 123, i32 128, i32 129, i32 130, i32 131, i32 136, i32 137, i32 138, i32 139, i32 144, i32 145, i32 146, i32 147, i32 152, i32 153, i32 154, i32 155, i32 192, i32 193, i32 194, i32 195, i32 200, i32 201, i32 202, i32 203, i32 208, i32 209, i32 210, i32 211, i32 216, i32 217, i32 218, i32 219, i32 160, i32 161, i32 162, i32 163, i32 168, i32 169, i32 170, i32 171, i32 176, i32 177, i32 178, i32 179, i32 184, i32 185, i32 186, i32 187, i32 224, i32 225, i32 226, i32 227, i32 232, i32 233, i32 234, i32 235, i32 240, i32 241, i32 242, i32 243, i32 248, i32 249, i32 250, i32 251, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31, i32 68, i32 69, i32 70, i32 71, i32 76, i32 77, i32 78, i32 79, i32 84, i32 85, i32 86, i32 87, i32 92, i32 93, i32 94, i32 95, i32 36, i32 37, i32 38, i32 39, i32 44, i32 45, i32 46, i32 47, i32 52, i32 53, i32 54, i32 55, i32 60, i32 61, i32 62, i32 63, i32 100, i32 101, i32 102, i32 103, i32 108, i32 109, i32 110, i32 111, i32 116, i32 117, i32 118, i32 119, i32 124, i32 125, i32 126, i32 127, i32 132, i32 133, i32 134, i32 135, i32 140, i32 141, i32 142, i32 143, i32 148, i32 149, i32 150, i32 151, i32 156, i32 157, i32 158, i32 159, i32 196, i32 197, i32 198, i32 199, i32 204, i32 205, i32 206, i32 207, i32 212, i32 213, i32 214, i32 215, i32 220, i32 221, i32 222, i32 223, i32 164, i32 165, i32 166, i32 167, i32 172, i32 173, i32 174, i32 175, i32 180, i32 181, i32 182, i32 183, i32 188, i32 189, i32 190, i32 191, i32 228, i32 229, i32 230, i32 231, i32 236, i32 237, i32 238, i32 239, i32 244, i32 245, i32 246, i32 247, i32 252, i32 253, i32 254, i32 255> @@ -749,7 +749,7 @@ define <256 x i8> @vdeal_5c(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_5d: -; CHECK: [[REG5d:r[0-9]+]] = #93 +; CHECK: [[REG5d:r[0-9]+]] = #-35 ; CHECK: vdeal(v1,v0,[[REG5d]]) define <256 x i8> @vdeal_5d(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 4, i32 2, i32 6, i32 8, i32 12, i32 10, i32 14, i32 16, i32 20, i32 18, i32 22, i32 24, i32 28, i32 26, i32 30, i32 64, i32 68, i32 66, i32 70, i32 72, i32 76, i32 74, i32 78, i32 80, i32 84, i32 82, i32 86, i32 88, i32 92, i32 90, i32 94, i32 32, i32 36, i32 34, i32 38, i32 40, i32 44, i32 42, i32 46, i32 48, i32 52, i32 50, i32 54, i32 56, i32 60, i32 58, i32 62, i32 96, i32 100, i32 98, i32 102, i32 104, i32 108, i32 106, i32 110, i32 112, i32 116, i32 114, i32 118, i32 120, i32 124, i32 122, i32 126, i32 128, i32 132, i32 130, i32 134, i32 136, i32 140, i32 138, i32 142, i32 144, i32 148, i32 146, i32 150, i32 152, i32 156, i32 154, i32 158, i32 192, i32 196, i32 194, i32 198, i32 200, i32 204, i32 202, i32 206, i32 208, i32 212, i32 210, i32 214, 
i32 216, i32 220, i32 218, i32 222, i32 160, i32 164, i32 162, i32 166, i32 168, i32 172, i32 170, i32 174, i32 176, i32 180, i32 178, i32 182, i32 184, i32 188, i32 186, i32 190, i32 224, i32 228, i32 226, i32 230, i32 232, i32 236, i32 234, i32 238, i32 240, i32 244, i32 242, i32 246, i32 248, i32 252, i32 250, i32 254, i32 1, i32 5, i32 3, i32 7, i32 9, i32 13, i32 11, i32 15, i32 17, i32 21, i32 19, i32 23, i32 25, i32 29, i32 27, i32 31, i32 65, i32 69, i32 67, i32 71, i32 73, i32 77, i32 75, i32 79, i32 81, i32 85, i32 83, i32 87, i32 89, i32 93, i32 91, i32 95, i32 33, i32 37, i32 35, i32 39, i32 41, i32 45, i32 43, i32 47, i32 49, i32 53, i32 51, i32 55, i32 57, i32 61, i32 59, i32 63, i32 97, i32 101, i32 99, i32 103, i32 105, i32 109, i32 107, i32 111, i32 113, i32 117, i32 115, i32 119, i32 121, i32 125, i32 123, i32 127, i32 129, i32 133, i32 131, i32 135, i32 137, i32 141, i32 139, i32 143, i32 145, i32 149, i32 147, i32 151, i32 153, i32 157, i32 155, i32 159, i32 193, i32 197, i32 195, i32 199, i32 201, i32 205, i32 203, i32 207, i32 209, i32 213, i32 211, i32 215, i32 217, i32 221, i32 219, i32 223, i32 161, i32 165, i32 163, i32 167, i32 169, i32 173, i32 171, i32 175, i32 177, i32 181, i32 179, i32 183, i32 185, i32 189, i32 187, i32 191, i32 225, i32 229, i32 227, i32 231, i32 233, i32 237, i32 235, i32 239, i32 241, i32 245, i32 243, i32 247, i32 249, i32 253, i32 251, i32 255> @@ -757,7 +757,7 @@ define <256 x i8> @vdeal_5d(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_5e: -; CHECK: [[REG5e:r[0-9]+]] = #94 +; CHECK: [[REG5e:r[0-9]+]] = #-34 ; CHECK: vdeal(v1,v0,[[REG5e]]) define <256 x i8> @vdeal_5e(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 16, i32 17, i32 20, i32 21, i32 24, i32 25, i32 28, i32 29, i32 64, i32 65, i32 68, i32 69, i32 72, i32 73, i32 76, i32 77, i32 80, i32 81, i32 84, i32 85, i32 88, i32 89, i32 92, i32 93, i32 32, i32 33, i32 36, i32 37, i32 40, i32 41, i32 44, i32 45, i32 48, i32 49, i32 52, i32 53, i32 56, i32 57, i32 60, i32 61, i32 96, i32 97, i32 100, i32 101, i32 104, i32 105, i32 108, i32 109, i32 112, i32 113, i32 116, i32 117, i32 120, i32 121, i32 124, i32 125, i32 128, i32 129, i32 132, i32 133, i32 136, i32 137, i32 140, i32 141, i32 144, i32 145, i32 148, i32 149, i32 152, i32 153, i32 156, i32 157, i32 192, i32 193, i32 196, i32 197, i32 200, i32 201, i32 204, i32 205, i32 208, i32 209, i32 212, i32 213, i32 216, i32 217, i32 220, i32 221, i32 160, i32 161, i32 164, i32 165, i32 168, i32 169, i32 172, i32 173, i32 176, i32 177, i32 180, i32 181, i32 184, i32 185, i32 188, i32 189, i32 224, i32 225, i32 228, i32 229, i32 232, i32 233, i32 236, i32 237, i32 240, i32 241, i32 244, i32 245, i32 248, i32 249, i32 252, i32 253, i32 2, i32 3, i32 6, i32 7, i32 10, i32 11, i32 14, i32 15, i32 18, i32 19, i32 22, i32 23, i32 26, i32 27, i32 30, i32 31, i32 66, i32 67, i32 70, i32 71, i32 74, i32 75, i32 78, i32 79, i32 82, i32 83, i32 86, i32 87, i32 90, i32 91, i32 94, i32 95, i32 34, i32 35, i32 38, i32 39, i32 42, i32 43, i32 46, i32 47, i32 50, i32 51, i32 54, i32 55, i32 58, i32 59, i32 62, i32 63, i32 98, i32 99, i32 102, i32 103, i32 106, i32 107, i32 110, i32 111, i32 114, i32 115, i32 118, i32 119, i32 122, i32 123, i32 126, i32 127, i32 130, i32 131, i32 134, i32 135, i32 138, i32 139, i32 142, i32 143, i32 146, i32 147, i32 150, i32 151, i32 154, i32 155, i32 158, i32 159, i32 194, i32 195, i32 198, 
i32 199, i32 202, i32 203, i32 206, i32 207, i32 210, i32 211, i32 214, i32 215, i32 218, i32 219, i32 222, i32 223, i32 162, i32 163, i32 166, i32 167, i32 170, i32 171, i32 174, i32 175, i32 178, i32 179, i32 182, i32 183, i32 186, i32 187, i32 190, i32 191, i32 226, i32 227, i32 230, i32 231, i32 234, i32 235, i32 238, i32 239, i32 242, i32 243, i32 246, i32 247, i32 250, i32 251, i32 254, i32 255> @@ -765,7 +765,7 @@ define <256 x i8> @vdeal_5e(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_5f: -; CHECK: [[REG5f:r[0-9]+]] = #95 +; CHECK: [[REG5f:r[0-9]+]] = #-33 ; CHECK: vdeal(v1,v0,[[REG5f]]) define <256 x i8> @vdeal_5f(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126, i32 128, i32 130, i32 132, i32 134, i32 136, i32 138, i32 140, i32 142, i32 144, i32 146, i32 148, i32 150, i32 152, i32 154, i32 156, i32 158, i32 192, i32 194, i32 196, i32 198, i32 200, i32 202, i32 204, i32 206, i32 208, i32 210, i32 212, i32 214, i32 216, i32 218, i32 220, i32 222, i32 160, i32 162, i32 164, i32 166, i32 168, i32 170, i32 172, i32 174, i32 176, i32 178, i32 180, i32 182, i32 184, i32 186, i32 188, i32 190, i32 224, i32 226, i32 228, i32 230, i32 232, i32 234, i32 236, i32 238, i32 240, i32 242, i32 244, i32 246, i32 248, i32 250, i32 252, i32 254, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127, i32 129, i32 131, i32 133, i32 135, i32 137, i32 139, i32 141, i32 143, i32 145, i32 147, i32 149, i32 151, i32 153, i32 155, i32 157, i32 159, i32 193, i32 195, i32 197, i32 199, i32 201, i32 203, i32 205, i32 207, i32 209, i32 211, i32 213, i32 215, i32 217, i32 219, i32 221, i32 223, i32 161, i32 163, i32 165, i32 167, i32 169, i32 171, i32 173, i32 175, i32 177, i32 179, i32 181, i32 183, i32 185, i32 187, i32 189, i32 191, i32 225, i32 227, i32 229, i32 231, i32 233, i32 235, i32 237, i32 239, i32 241, i32 243, i32 245, i32 247, i32 249, i32 251, i32 253, i32 255> @@ -773,7 +773,7 @@ define <256 x i8> @vdeal_5f(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_60: -; CHECK: [[REG60:r[0-9]+]] = #96 +; CHECK: [[REG60:r[0-9]+]] = #-32 ; CHECK: vdeal(v1,v0,[[REG60]]) define <256 x i8> @vdeal_60(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 
66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -781,7 +781,7 @@ define <256 x i8> @vdeal_60(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_61: -; CHECK: [[REG61:r[0-9]+]] = #97 +; CHECK: [[REG61:r[0-9]+]] = #-31 ; CHECK: vdeal(v1,v0,[[REG61]]) define <256 x i8> @vdeal_61(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62, i32 64, i32 96, i32 66, i32 98, i32 68, i32 100, i32 70, i32 102, i32 72, i32 104, i32 74, i32 106, i32 76, i32 108, i32 78, i32 110, i32 80, i32 112, i32 82, i32 114, i32 84, i32 116, i32 86, i32 118, i32 88, i32 120, i32 90, i32 122, i32 92, i32 124, i32 94, i32 126, i32 128, i32 160, i32 130, i32 162, i32 132, i32 164, i32 134, i32 166, i32 136, i32 168, i32 138, i32 170, i32 140, i32 172, i32 142, i32 174, i32 144, i32 176, i32 146, i32 178, i32 148, i32 180, i32 150, i32 182, i32 152, i32 184, i32 154, i32 186, i32 156, i32 188, i32 158, i32 190, i32 192, i32 224, i32 194, i32 226, i32 196, i32 228, i32 198, i32 230, i32 200, i32 232, i32 202, i32 234, i32 204, i32 236, i32 206, i32 238, i32 208, i32 240, i32 210, i32 242, i32 212, i32 244, i32 214, i32 246, i32 216, i32 248, i32 218, i32 250, i32 220, i32 252, i32 222, i32 254, i32 1, i32 33, i32 3, i32 35, i32 5, i32 37, i32 7, i32 39, i32 9, i32 41, i32 11, i32 43, i32 13, i32 45, i32 15, i32 47, i32 17, i32 49, i32 19, i32 51, i32 21, i32 53, i32 23, i32 
55, i32 25, i32 57, i32 27, i32 59, i32 29, i32 61, i32 31, i32 63, i32 65, i32 97, i32 67, i32 99, i32 69, i32 101, i32 71, i32 103, i32 73, i32 105, i32 75, i32 107, i32 77, i32 109, i32 79, i32 111, i32 81, i32 113, i32 83, i32 115, i32 85, i32 117, i32 87, i32 119, i32 89, i32 121, i32 91, i32 123, i32 93, i32 125, i32 95, i32 127, i32 129, i32 161, i32 131, i32 163, i32 133, i32 165, i32 135, i32 167, i32 137, i32 169, i32 139, i32 171, i32 141, i32 173, i32 143, i32 175, i32 145, i32 177, i32 147, i32 179, i32 149, i32 181, i32 151, i32 183, i32 153, i32 185, i32 155, i32 187, i32 157, i32 189, i32 159, i32 191, i32 193, i32 225, i32 195, i32 227, i32 197, i32 229, i32 199, i32 231, i32 201, i32 233, i32 203, i32 235, i32 205, i32 237, i32 207, i32 239, i32 209, i32 241, i32 211, i32 243, i32 213, i32 245, i32 215, i32 247, i32 217, i32 249, i32 219, i32 251, i32 221, i32 253, i32 223, i32 255> @@ -789,7 +789,7 @@ define <256 x i8> @vdeal_61(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_62: -; CHECK: [[REG62:r[0-9]+]] = #98 +; CHECK: [[REG62:r[0-9]+]] = #-30 ; CHECK: vdeal(v1,v0,[[REG62]]) define <256 x i8> @vdeal_62(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 32, i32 33, i32 4, i32 5, i32 36, i32 37, i32 8, i32 9, i32 40, i32 41, i32 12, i32 13, i32 44, i32 45, i32 16, i32 17, i32 48, i32 49, i32 20, i32 21, i32 52, i32 53, i32 24, i32 25, i32 56, i32 57, i32 28, i32 29, i32 60, i32 61, i32 64, i32 65, i32 96, i32 97, i32 68, i32 69, i32 100, i32 101, i32 72, i32 73, i32 104, i32 105, i32 76, i32 77, i32 108, i32 109, i32 80, i32 81, i32 112, i32 113, i32 84, i32 85, i32 116, i32 117, i32 88, i32 89, i32 120, i32 121, i32 92, i32 93, i32 124, i32 125, i32 128, i32 129, i32 160, i32 161, i32 132, i32 133, i32 164, i32 165, i32 136, i32 137, i32 168, i32 169, i32 140, i32 141, i32 172, i32 173, i32 144, i32 145, i32 176, i32 177, i32 148, i32 149, i32 180, i32 181, i32 152, i32 153, i32 184, i32 185, i32 156, i32 157, i32 188, i32 189, i32 192, i32 193, i32 224, i32 225, i32 196, i32 197, i32 228, i32 229, i32 200, i32 201, i32 232, i32 233, i32 204, i32 205, i32 236, i32 237, i32 208, i32 209, i32 240, i32 241, i32 212, i32 213, i32 244, i32 245, i32 216, i32 217, i32 248, i32 249, i32 220, i32 221, i32 252, i32 253, i32 2, i32 3, i32 34, i32 35, i32 6, i32 7, i32 38, i32 39, i32 10, i32 11, i32 42, i32 43, i32 14, i32 15, i32 46, i32 47, i32 18, i32 19, i32 50, i32 51, i32 22, i32 23, i32 54, i32 55, i32 26, i32 27, i32 58, i32 59, i32 30, i32 31, i32 62, i32 63, i32 66, i32 67, i32 98, i32 99, i32 70, i32 71, i32 102, i32 103, i32 74, i32 75, i32 106, i32 107, i32 78, i32 79, i32 110, i32 111, i32 82, i32 83, i32 114, i32 115, i32 86, i32 87, i32 118, i32 119, i32 90, i32 91, i32 122, i32 123, i32 94, i32 95, i32 126, i32 127, i32 130, i32 131, i32 162, i32 163, i32 134, i32 135, i32 166, i32 167, i32 138, i32 139, i32 170, i32 171, i32 142, i32 143, i32 174, i32 175, i32 146, i32 147, i32 178, i32 179, i32 150, i32 151, i32 182, i32 183, i32 154, i32 155, i32 186, i32 187, i32 158, i32 159, i32 190, i32 191, i32 194, i32 195, i32 226, i32 227, i32 198, i32 199, i32 230, i32 231, i32 202, i32 203, i32 234, i32 235, i32 206, i32 207, i32 238, i32 239, i32 210, i32 211, i32 242, i32 243, i32 214, i32 215, i32 246, i32 247, i32 218, i32 219, i32 250, i32 251, i32 222, i32 223, i32 254, i32 255> @@ -797,7 +797,7 @@ define <256 x i8> @vdeal_62(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: 
vdeal_63: -; CHECK: [[REG63:r[0-9]+]] = #99 +; CHECK: [[REG63:r[0-9]+]] = #-29 ; CHECK: vdeal(v1,v0,[[REG63]]) define <256 x i8> @vdeal_63(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 32, i32 34, i32 4, i32 6, i32 36, i32 38, i32 8, i32 10, i32 40, i32 42, i32 12, i32 14, i32 44, i32 46, i32 16, i32 18, i32 48, i32 50, i32 20, i32 22, i32 52, i32 54, i32 24, i32 26, i32 56, i32 58, i32 28, i32 30, i32 60, i32 62, i32 64, i32 66, i32 96, i32 98, i32 68, i32 70, i32 100, i32 102, i32 72, i32 74, i32 104, i32 106, i32 76, i32 78, i32 108, i32 110, i32 80, i32 82, i32 112, i32 114, i32 84, i32 86, i32 116, i32 118, i32 88, i32 90, i32 120, i32 122, i32 92, i32 94, i32 124, i32 126, i32 128, i32 130, i32 160, i32 162, i32 132, i32 134, i32 164, i32 166, i32 136, i32 138, i32 168, i32 170, i32 140, i32 142, i32 172, i32 174, i32 144, i32 146, i32 176, i32 178, i32 148, i32 150, i32 180, i32 182, i32 152, i32 154, i32 184, i32 186, i32 156, i32 158, i32 188, i32 190, i32 192, i32 194, i32 224, i32 226, i32 196, i32 198, i32 228, i32 230, i32 200, i32 202, i32 232, i32 234, i32 204, i32 206, i32 236, i32 238, i32 208, i32 210, i32 240, i32 242, i32 212, i32 214, i32 244, i32 246, i32 216, i32 218, i32 248, i32 250, i32 220, i32 222, i32 252, i32 254, i32 1, i32 3, i32 33, i32 35, i32 5, i32 7, i32 37, i32 39, i32 9, i32 11, i32 41, i32 43, i32 13, i32 15, i32 45, i32 47, i32 17, i32 19, i32 49, i32 51, i32 21, i32 23, i32 53, i32 55, i32 25, i32 27, i32 57, i32 59, i32 29, i32 31, i32 61, i32 63, i32 65, i32 67, i32 97, i32 99, i32 69, i32 71, i32 101, i32 103, i32 73, i32 75, i32 105, i32 107, i32 77, i32 79, i32 109, i32 111, i32 81, i32 83, i32 113, i32 115, i32 85, i32 87, i32 117, i32 119, i32 89, i32 91, i32 121, i32 123, i32 93, i32 95, i32 125, i32 127, i32 129, i32 131, i32 161, i32 163, i32 133, i32 135, i32 165, i32 167, i32 137, i32 139, i32 169, i32 171, i32 141, i32 143, i32 173, i32 175, i32 145, i32 147, i32 177, i32 179, i32 149, i32 151, i32 181, i32 183, i32 153, i32 155, i32 185, i32 187, i32 157, i32 159, i32 189, i32 191, i32 193, i32 195, i32 225, i32 227, i32 197, i32 199, i32 229, i32 231, i32 201, i32 203, i32 233, i32 235, i32 205, i32 207, i32 237, i32 239, i32 209, i32 211, i32 241, i32 243, i32 213, i32 215, i32 245, i32 247, i32 217, i32 219, i32 249, i32 251, i32 221, i32 223, i32 253, i32 255> @@ -805,7 +805,7 @@ define <256 x i8> @vdeal_63(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_64: -; CHECK: [[REG64:r[0-9]+]] = #100 +; CHECK: [[REG64:r[0-9]+]] = #-28 ; CHECK: vdeal(v1,v0,[[REG64]]) define <256 x i8> @vdeal_64(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 32, i32 33, i32 34, i32 35, i32 8, i32 9, i32 10, i32 11, i32 40, i32 41, i32 42, i32 43, i32 16, i32 17, i32 18, i32 19, i32 48, i32 49, i32 50, i32 51, i32 24, i32 25, i32 26, i32 27, i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67, i32 96, i32 97, i32 98, i32 99, i32 72, i32 73, i32 74, i32 75, i32 104, i32 105, i32 106, i32 107, i32 80, i32 81, i32 82, i32 83, i32 112, i32 113, i32 114, i32 115, i32 88, i32 89, i32 90, i32 91, i32 120, i32 121, i32 122, i32 123, i32 128, i32 129, i32 130, i32 131, i32 160, i32 161, i32 162, i32 163, i32 136, i32 137, i32 138, i32 139, i32 168, i32 169, i32 170, i32 171, i32 144, i32 145, i32 146, i32 147, i32 176, i32 177, i32 178, i32 179, i32 152, i32 153, i32 154, i32 155, 
i32 184, i32 185, i32 186, i32 187, i32 192, i32 193, i32 194, i32 195, i32 224, i32 225, i32 226, i32 227, i32 200, i32 201, i32 202, i32 203, i32 232, i32 233, i32 234, i32 235, i32 208, i32 209, i32 210, i32 211, i32 240, i32 241, i32 242, i32 243, i32 216, i32 217, i32 218, i32 219, i32 248, i32 249, i32 250, i32 251, i32 4, i32 5, i32 6, i32 7, i32 36, i32 37, i32 38, i32 39, i32 12, i32 13, i32 14, i32 15, i32 44, i32 45, i32 46, i32 47, i32 20, i32 21, i32 22, i32 23, i32 52, i32 53, i32 54, i32 55, i32 28, i32 29, i32 30, i32 31, i32 60, i32 61, i32 62, i32 63, i32 68, i32 69, i32 70, i32 71, i32 100, i32 101, i32 102, i32 103, i32 76, i32 77, i32 78, i32 79, i32 108, i32 109, i32 110, i32 111, i32 84, i32 85, i32 86, i32 87, i32 116, i32 117, i32 118, i32 119, i32 92, i32 93, i32 94, i32 95, i32 124, i32 125, i32 126, i32 127, i32 132, i32 133, i32 134, i32 135, i32 164, i32 165, i32 166, i32 167, i32 140, i32 141, i32 142, i32 143, i32 172, i32 173, i32 174, i32 175, i32 148, i32 149, i32 150, i32 151, i32 180, i32 181, i32 182, i32 183, i32 156, i32 157, i32 158, i32 159, i32 188, i32 189, i32 190, i32 191, i32 196, i32 197, i32 198, i32 199, i32 228, i32 229, i32 230, i32 231, i32 204, i32 205, i32 206, i32 207, i32 236, i32 237, i32 238, i32 239, i32 212, i32 213, i32 214, i32 215, i32 244, i32 245, i32 246, i32 247, i32 220, i32 221, i32 222, i32 223, i32 252, i32 253, i32 254, i32 255> @@ -813,7 +813,7 @@ define <256 x i8> @vdeal_64(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_65: -; CHECK: [[REG65:r[0-9]+]] = #101 +; CHECK: [[REG65:r[0-9]+]] = #-27 ; CHECK: vdeal(v1,v0,[[REG65]]) define <256 x i8> @vdeal_65(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 4, i32 2, i32 6, i32 32, i32 36, i32 34, i32 38, i32 8, i32 12, i32 10, i32 14, i32 40, i32 44, i32 42, i32 46, i32 16, i32 20, i32 18, i32 22, i32 48, i32 52, i32 50, i32 54, i32 24, i32 28, i32 26, i32 30, i32 56, i32 60, i32 58, i32 62, i32 64, i32 68, i32 66, i32 70, i32 96, i32 100, i32 98, i32 102, i32 72, i32 76, i32 74, i32 78, i32 104, i32 108, i32 106, i32 110, i32 80, i32 84, i32 82, i32 86, i32 112, i32 116, i32 114, i32 118, i32 88, i32 92, i32 90, i32 94, i32 120, i32 124, i32 122, i32 126, i32 128, i32 132, i32 130, i32 134, i32 160, i32 164, i32 162, i32 166, i32 136, i32 140, i32 138, i32 142, i32 168, i32 172, i32 170, i32 174, i32 144, i32 148, i32 146, i32 150, i32 176, i32 180, i32 178, i32 182, i32 152, i32 156, i32 154, i32 158, i32 184, i32 188, i32 186, i32 190, i32 192, i32 196, i32 194, i32 198, i32 224, i32 228, i32 226, i32 230, i32 200, i32 204, i32 202, i32 206, i32 232, i32 236, i32 234, i32 238, i32 208, i32 212, i32 210, i32 214, i32 240, i32 244, i32 242, i32 246, i32 216, i32 220, i32 218, i32 222, i32 248, i32 252, i32 250, i32 254, i32 1, i32 5, i32 3, i32 7, i32 33, i32 37, i32 35, i32 39, i32 9, i32 13, i32 11, i32 15, i32 41, i32 45, i32 43, i32 47, i32 17, i32 21, i32 19, i32 23, i32 49, i32 53, i32 51, i32 55, i32 25, i32 29, i32 27, i32 31, i32 57, i32 61, i32 59, i32 63, i32 65, i32 69, i32 67, i32 71, i32 97, i32 101, i32 99, i32 103, i32 73, i32 77, i32 75, i32 79, i32 105, i32 109, i32 107, i32 111, i32 81, i32 85, i32 83, i32 87, i32 113, i32 117, i32 115, i32 119, i32 89, i32 93, i32 91, i32 95, i32 121, i32 125, i32 123, i32 127, i32 129, i32 133, i32 131, i32 135, i32 161, i32 165, i32 163, i32 167, i32 137, i32 141, i32 139, i32 143, i32 169, i32 173, i32 171, i32 175, i32 145, i32 149, i32 
147, i32 151, i32 177, i32 181, i32 179, i32 183, i32 153, i32 157, i32 155, i32 159, i32 185, i32 189, i32 187, i32 191, i32 193, i32 197, i32 195, i32 199, i32 225, i32 229, i32 227, i32 231, i32 201, i32 205, i32 203, i32 207, i32 233, i32 237, i32 235, i32 239, i32 209, i32 213, i32 211, i32 215, i32 241, i32 245, i32 243, i32 247, i32 217, i32 221, i32 219, i32 223, i32 249, i32 253, i32 251, i32 255> @@ -821,7 +821,7 @@ define <256 x i8> @vdeal_65(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_66: -; CHECK: [[REG66:r[0-9]+]] = #102 +; CHECK: [[REG66:r[0-9]+]] = #-26 ; CHECK: vdeal(v1,v0,[[REG66]]) define <256 x i8> @vdeal_66(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 4, i32 5, i32 32, i32 33, i32 36, i32 37, i32 8, i32 9, i32 12, i32 13, i32 40, i32 41, i32 44, i32 45, i32 16, i32 17, i32 20, i32 21, i32 48, i32 49, i32 52, i32 53, i32 24, i32 25, i32 28, i32 29, i32 56, i32 57, i32 60, i32 61, i32 64, i32 65, i32 68, i32 69, i32 96, i32 97, i32 100, i32 101, i32 72, i32 73, i32 76, i32 77, i32 104, i32 105, i32 108, i32 109, i32 80, i32 81, i32 84, i32 85, i32 112, i32 113, i32 116, i32 117, i32 88, i32 89, i32 92, i32 93, i32 120, i32 121, i32 124, i32 125, i32 128, i32 129, i32 132, i32 133, i32 160, i32 161, i32 164, i32 165, i32 136, i32 137, i32 140, i32 141, i32 168, i32 169, i32 172, i32 173, i32 144, i32 145, i32 148, i32 149, i32 176, i32 177, i32 180, i32 181, i32 152, i32 153, i32 156, i32 157, i32 184, i32 185, i32 188, i32 189, i32 192, i32 193, i32 196, i32 197, i32 224, i32 225, i32 228, i32 229, i32 200, i32 201, i32 204, i32 205, i32 232, i32 233, i32 236, i32 237, i32 208, i32 209, i32 212, i32 213, i32 240, i32 241, i32 244, i32 245, i32 216, i32 217, i32 220, i32 221, i32 248, i32 249, i32 252, i32 253, i32 2, i32 3, i32 6, i32 7, i32 34, i32 35, i32 38, i32 39, i32 10, i32 11, i32 14, i32 15, i32 42, i32 43, i32 46, i32 47, i32 18, i32 19, i32 22, i32 23, i32 50, i32 51, i32 54, i32 55, i32 26, i32 27, i32 30, i32 31, i32 58, i32 59, i32 62, i32 63, i32 66, i32 67, i32 70, i32 71, i32 98, i32 99, i32 102, i32 103, i32 74, i32 75, i32 78, i32 79, i32 106, i32 107, i32 110, i32 111, i32 82, i32 83, i32 86, i32 87, i32 114, i32 115, i32 118, i32 119, i32 90, i32 91, i32 94, i32 95, i32 122, i32 123, i32 126, i32 127, i32 130, i32 131, i32 134, i32 135, i32 162, i32 163, i32 166, i32 167, i32 138, i32 139, i32 142, i32 143, i32 170, i32 171, i32 174, i32 175, i32 146, i32 147, i32 150, i32 151, i32 178, i32 179, i32 182, i32 183, i32 154, i32 155, i32 158, i32 159, i32 186, i32 187, i32 190, i32 191, i32 194, i32 195, i32 198, i32 199, i32 226, i32 227, i32 230, i32 231, i32 202, i32 203, i32 206, i32 207, i32 234, i32 235, i32 238, i32 239, i32 210, i32 211, i32 214, i32 215, i32 242, i32 243, i32 246, i32 247, i32 218, i32 219, i32 222, i32 223, i32 250, i32 251, i32 254, i32 255> @@ -829,7 +829,7 @@ define <256 x i8> @vdeal_66(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_67: -; CHECK: [[REG67:r[0-9]+]] = #103 +; CHECK: [[REG67:r[0-9]+]] = #-25 ; CHECK: vdeal(v1,v0,[[REG67]]) define <256 x i8> @vdeal_67(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 4, i32 6, i32 32, i32 34, i32 36, i32 38, i32 8, i32 10, i32 12, i32 14, i32 40, i32 42, i32 44, i32 46, i32 16, i32 18, i32 20, i32 22, i32 48, i32 50, i32 52, i32 54, i32 24, i32 26, i32 28, i32 30, i32 56, i32 58, i32 60, i32 62, i32 64, 
i32 66, i32 68, i32 70, i32 96, i32 98, i32 100, i32 102, i32 72, i32 74, i32 76, i32 78, i32 104, i32 106, i32 108, i32 110, i32 80, i32 82, i32 84, i32 86, i32 112, i32 114, i32 116, i32 118, i32 88, i32 90, i32 92, i32 94, i32 120, i32 122, i32 124, i32 126, i32 128, i32 130, i32 132, i32 134, i32 160, i32 162, i32 164, i32 166, i32 136, i32 138, i32 140, i32 142, i32 168, i32 170, i32 172, i32 174, i32 144, i32 146, i32 148, i32 150, i32 176, i32 178, i32 180, i32 182, i32 152, i32 154, i32 156, i32 158, i32 184, i32 186, i32 188, i32 190, i32 192, i32 194, i32 196, i32 198, i32 224, i32 226, i32 228, i32 230, i32 200, i32 202, i32 204, i32 206, i32 232, i32 234, i32 236, i32 238, i32 208, i32 210, i32 212, i32 214, i32 240, i32 242, i32 244, i32 246, i32 216, i32 218, i32 220, i32 222, i32 248, i32 250, i32 252, i32 254, i32 1, i32 3, i32 5, i32 7, i32 33, i32 35, i32 37, i32 39, i32 9, i32 11, i32 13, i32 15, i32 41, i32 43, i32 45, i32 47, i32 17, i32 19, i32 21, i32 23, i32 49, i32 51, i32 53, i32 55, i32 25, i32 27, i32 29, i32 31, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 97, i32 99, i32 101, i32 103, i32 73, i32 75, i32 77, i32 79, i32 105, i32 107, i32 109, i32 111, i32 81, i32 83, i32 85, i32 87, i32 113, i32 115, i32 117, i32 119, i32 89, i32 91, i32 93, i32 95, i32 121, i32 123, i32 125, i32 127, i32 129, i32 131, i32 133, i32 135, i32 161, i32 163, i32 165, i32 167, i32 137, i32 139, i32 141, i32 143, i32 169, i32 171, i32 173, i32 175, i32 145, i32 147, i32 149, i32 151, i32 177, i32 179, i32 181, i32 183, i32 153, i32 155, i32 157, i32 159, i32 185, i32 187, i32 189, i32 191, i32 193, i32 195, i32 197, i32 199, i32 225, i32 227, i32 229, i32 231, i32 201, i32 203, i32 205, i32 207, i32 233, i32 235, i32 237, i32 239, i32 209, i32 211, i32 213, i32 215, i32 241, i32 243, i32 245, i32 247, i32 217, i32 219, i32 221, i32 223, i32 249, i32 251, i32 253, i32 255> @@ -837,7 +837,7 @@ define <256 x i8> @vdeal_67(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_68: -; CHECK: [[REG68:r[0-9]+]] = #104 +; CHECK: [[REG68:r[0-9]+]] = #-24 ; CHECK: vdeal(v1,v0,[[REG68]]) define <256 x i8> @vdeal_68(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, 
i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -845,7 +845,7 @@ define <256 x i8> @vdeal_68(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_69: -; CHECK: [[REG69:r[0-9]+]] = #105 +; CHECK: [[REG69:r[0-9]+]] = #-23 ; CHECK: vdeal(v1,v0,[[REG69]]) define <256 x i8> @vdeal_69(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14, i32 32, i32 40, i32 34, i32 42, i32 36, i32 44, i32 38, i32 46, i32 16, i32 24, i32 18, i32 26, i32 20, i32 28, i32 22, i32 30, i32 48, i32 56, i32 50, i32 58, i32 52, i32 60, i32 54, i32 62, i32 64, i32 72, i32 66, i32 74, i32 68, i32 76, i32 70, i32 78, i32 96, i32 104, i32 98, i32 106, i32 100, i32 108, i32 102, i32 110, i32 80, i32 88, i32 82, i32 90, i32 84, i32 92, i32 86, i32 94, i32 112, i32 120, i32 114, i32 122, i32 116, i32 124, i32 118, i32 126, i32 128, i32 136, i32 130, i32 138, i32 132, i32 140, i32 134, i32 142, i32 160, i32 168, i32 162, i32 170, i32 164, i32 172, i32 166, i32 174, i32 144, i32 152, i32 146, i32 154, i32 148, i32 156, i32 150, i32 158, i32 176, i32 184, i32 178, i32 186, i32 180, i32 188, i32 182, i32 190, i32 192, i32 200, i32 194, i32 202, i32 196, i32 204, i32 198, i32 206, i32 224, i32 232, i32 226, i32 234, i32 228, i32 236, i32 230, i32 238, i32 208, i32 216, i32 210, i32 218, i32 212, i32 220, i32 214, i32 222, i32 240, i32 248, i32 242, i32 250, i32 244, i32 252, i32 246, i32 254, i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, i32 33, i32 41, i32 35, i32 43, i32 37, i32 45, i32 39, i32 47, i32 17, i32 25, i32 19, i32 27, i32 21, i32 29, i32 23, i32 31, i32 49, i32 57, i32 51, i32 59, i32 53, i32 61, i32 55, i32 63, i32 65, i32 73, i32 67, i32 75, i32 69, i32 77, i32 71, i32 79, i32 97, i32 105, i32 99, i32 107, i32 101, i32 109, i32 103, i32 111, i32 81, i32 89, i32 83, i32 91, i32 85, i32 93, i32 87, i32 95, i32 113, i32 121, i32 115, i32 123, i32 117, i32 125, i32 119, i32 127, i32 129, i32 137, i32 131, i32 139, i32 133, i32 141, i32 135, i32 143, i32 161, i32 169, i32 163, i32 171, i32 165, i32 173, i32 167, i32 175, i32 145, i32 153, i32 147, i32 155, i32 149, i32 157, i32 151, i32 159, i32 177, i32 185, i32 179, i32 187, i32 181, i32 189, i32 183, i32 191, i32 193, i32 201, i32 195, i32 203, i32 197, i32 205, i32 199, i32 207, i32 225, i32 233, i32 227, i32 235, i32 229, i32 237, i32 231, i32 239, i32 209, i32 217, i32 211, i32 219, i32 213, i32 221, i32 215, i32 223, i32 241, i32 249, i32 243, i32 251, i32 245, i32 253, i32 247, i32 255> @@ -853,7 +853,7 @@ define <256 x i8> @vdeal_69(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; 
CHECK-LABEL: vdeal_6a: -; CHECK: [[REG6a:r[0-9]+]] = #106 +; CHECK: [[REG6a:r[0-9]+]] = #-22 ; CHECK: vdeal(v1,v0,[[REG6a]]) define <256 x i8> @vdeal_6a(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13, i32 32, i32 33, i32 40, i32 41, i32 36, i32 37, i32 44, i32 45, i32 16, i32 17, i32 24, i32 25, i32 20, i32 21, i32 28, i32 29, i32 48, i32 49, i32 56, i32 57, i32 52, i32 53, i32 60, i32 61, i32 64, i32 65, i32 72, i32 73, i32 68, i32 69, i32 76, i32 77, i32 96, i32 97, i32 104, i32 105, i32 100, i32 101, i32 108, i32 109, i32 80, i32 81, i32 88, i32 89, i32 84, i32 85, i32 92, i32 93, i32 112, i32 113, i32 120, i32 121, i32 116, i32 117, i32 124, i32 125, i32 128, i32 129, i32 136, i32 137, i32 132, i32 133, i32 140, i32 141, i32 160, i32 161, i32 168, i32 169, i32 164, i32 165, i32 172, i32 173, i32 144, i32 145, i32 152, i32 153, i32 148, i32 149, i32 156, i32 157, i32 176, i32 177, i32 184, i32 185, i32 180, i32 181, i32 188, i32 189, i32 192, i32 193, i32 200, i32 201, i32 196, i32 197, i32 204, i32 205, i32 224, i32 225, i32 232, i32 233, i32 228, i32 229, i32 236, i32 237, i32 208, i32 209, i32 216, i32 217, i32 212, i32 213, i32 220, i32 221, i32 240, i32 241, i32 248, i32 249, i32 244, i32 245, i32 252, i32 253, i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15, i32 34, i32 35, i32 42, i32 43, i32 38, i32 39, i32 46, i32 47, i32 18, i32 19, i32 26, i32 27, i32 22, i32 23, i32 30, i32 31, i32 50, i32 51, i32 58, i32 59, i32 54, i32 55, i32 62, i32 63, i32 66, i32 67, i32 74, i32 75, i32 70, i32 71, i32 78, i32 79, i32 98, i32 99, i32 106, i32 107, i32 102, i32 103, i32 110, i32 111, i32 82, i32 83, i32 90, i32 91, i32 86, i32 87, i32 94, i32 95, i32 114, i32 115, i32 122, i32 123, i32 118, i32 119, i32 126, i32 127, i32 130, i32 131, i32 138, i32 139, i32 134, i32 135, i32 142, i32 143, i32 162, i32 163, i32 170, i32 171, i32 166, i32 167, i32 174, i32 175, i32 146, i32 147, i32 154, i32 155, i32 150, i32 151, i32 158, i32 159, i32 178, i32 179, i32 186, i32 187, i32 182, i32 183, i32 190, i32 191, i32 194, i32 195, i32 202, i32 203, i32 198, i32 199, i32 206, i32 207, i32 226, i32 227, i32 234, i32 235, i32 230, i32 231, i32 238, i32 239, i32 210, i32 211, i32 218, i32 219, i32 214, i32 215, i32 222, i32 223, i32 242, i32 243, i32 250, i32 251, i32 246, i32 247, i32 254, i32 255> @@ -861,7 +861,7 @@ define <256 x i8> @vdeal_6a(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_6b: -; CHECK: [[REG6b:r[0-9]+]] = #107 +; CHECK: [[REG6b:r[0-9]+]] = #-21 ; CHECK: vdeal(v1,v0,[[REG6b]]) define <256 x i8> @vdeal_6b(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14, i32 32, i32 34, i32 40, i32 42, i32 36, i32 38, i32 44, i32 46, i32 16, i32 18, i32 24, i32 26, i32 20, i32 22, i32 28, i32 30, i32 48, i32 50, i32 56, i32 58, i32 52, i32 54, i32 60, i32 62, i32 64, i32 66, i32 72, i32 74, i32 68, i32 70, i32 76, i32 78, i32 96, i32 98, i32 104, i32 106, i32 100, i32 102, i32 108, i32 110, i32 80, i32 82, i32 88, i32 90, i32 84, i32 86, i32 92, i32 94, i32 112, i32 114, i32 120, i32 122, i32 116, i32 118, i32 124, i32 126, i32 128, i32 130, i32 136, i32 138, i32 132, i32 134, i32 140, i32 142, i32 160, i32 162, i32 168, i32 170, i32 164, i32 166, i32 172, i32 174, i32 144, i32 146, i32 152, i32 154, i32 148, i32 150, i32 156, i32 158, i32 176, i32 178, 
i32 184, i32 186, i32 180, i32 182, i32 188, i32 190, i32 192, i32 194, i32 200, i32 202, i32 196, i32 198, i32 204, i32 206, i32 224, i32 226, i32 232, i32 234, i32 228, i32 230, i32 236, i32 238, i32 208, i32 210, i32 216, i32 218, i32 212, i32 214, i32 220, i32 222, i32 240, i32 242, i32 248, i32 250, i32 244, i32 246, i32 252, i32 254, i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15, i32 33, i32 35, i32 41, i32 43, i32 37, i32 39, i32 45, i32 47, i32 17, i32 19, i32 25, i32 27, i32 21, i32 23, i32 29, i32 31, i32 49, i32 51, i32 57, i32 59, i32 53, i32 55, i32 61, i32 63, i32 65, i32 67, i32 73, i32 75, i32 69, i32 71, i32 77, i32 79, i32 97, i32 99, i32 105, i32 107, i32 101, i32 103, i32 109, i32 111, i32 81, i32 83, i32 89, i32 91, i32 85, i32 87, i32 93, i32 95, i32 113, i32 115, i32 121, i32 123, i32 117, i32 119, i32 125, i32 127, i32 129, i32 131, i32 137, i32 139, i32 133, i32 135, i32 141, i32 143, i32 161, i32 163, i32 169, i32 171, i32 165, i32 167, i32 173, i32 175, i32 145, i32 147, i32 153, i32 155, i32 149, i32 151, i32 157, i32 159, i32 177, i32 179, i32 185, i32 187, i32 181, i32 183, i32 189, i32 191, i32 193, i32 195, i32 201, i32 203, i32 197, i32 199, i32 205, i32 207, i32 225, i32 227, i32 233, i32 235, i32 229, i32 231, i32 237, i32 239, i32 209, i32 211, i32 217, i32 219, i32 213, i32 215, i32 221, i32 223, i32 241, i32 243, i32 249, i32 251, i32 245, i32 247, i32 253, i32 255> @@ -869,7 +869,7 @@ define <256 x i8> @vdeal_6b(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_6c: -; CHECK: [[REG6c:r[0-9]+]] = #108 +; CHECK: [[REG6c:r[0-9]+]] = #-20 ; CHECK: vdeal(v1,v0,[[REG6c]]) define <256 x i8> @vdeal_6c(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 32, i32 33, i32 34, i32 35, i32 40, i32 41, i32 42, i32 43, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27, i32 48, i32 49, i32 50, i32 51, i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67, i32 72, i32 73, i32 74, i32 75, i32 96, i32 97, i32 98, i32 99, i32 104, i32 105, i32 106, i32 107, i32 80, i32 81, i32 82, i32 83, i32 88, i32 89, i32 90, i32 91, i32 112, i32 113, i32 114, i32 115, i32 120, i32 121, i32 122, i32 123, i32 128, i32 129, i32 130, i32 131, i32 136, i32 137, i32 138, i32 139, i32 160, i32 161, i32 162, i32 163, i32 168, i32 169, i32 170, i32 171, i32 144, i32 145, i32 146, i32 147, i32 152, i32 153, i32 154, i32 155, i32 176, i32 177, i32 178, i32 179, i32 184, i32 185, i32 186, i32 187, i32 192, i32 193, i32 194, i32 195, i32 200, i32 201, i32 202, i32 203, i32 224, i32 225, i32 226, i32 227, i32 232, i32 233, i32 234, i32 235, i32 208, i32 209, i32 210, i32 211, i32 216, i32 217, i32 218, i32 219, i32 240, i32 241, i32 242, i32 243, i32 248, i32 249, i32 250, i32 251, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 36, i32 37, i32 38, i32 39, i32 44, i32 45, i32 46, i32 47, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31, i32 52, i32 53, i32 54, i32 55, i32 60, i32 61, i32 62, i32 63, i32 68, i32 69, i32 70, i32 71, i32 76, i32 77, i32 78, i32 79, i32 100, i32 101, i32 102, i32 103, i32 108, i32 109, i32 110, i32 111, i32 84, i32 85, i32 86, i32 87, i32 92, i32 93, i32 94, i32 95, i32 116, i32 117, i32 118, i32 119, i32 124, i32 125, i32 126, i32 127, i32 132, i32 133, i32 134, i32 135, i32 140, i32 141, i32 142, i32 143, i32 164, i32 165, i32 166, i32 167, i32 172, i32 173, i32 174, i32 175, i32 
148, i32 149, i32 150, i32 151, i32 156, i32 157, i32 158, i32 159, i32 180, i32 181, i32 182, i32 183, i32 188, i32 189, i32 190, i32 191, i32 196, i32 197, i32 198, i32 199, i32 204, i32 205, i32 206, i32 207, i32 228, i32 229, i32 230, i32 231, i32 236, i32 237, i32 238, i32 239, i32 212, i32 213, i32 214, i32 215, i32 220, i32 221, i32 222, i32 223, i32 244, i32 245, i32 246, i32 247, i32 252, i32 253, i32 254, i32 255> @@ -877,7 +877,7 @@ define <256 x i8> @vdeal_6c(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_6d: -; CHECK: [[REG6d:r[0-9]+]] = #109 +; CHECK: [[REG6d:r[0-9]+]] = #-19 ; CHECK: vdeal(v1,v0,[[REG6d]]) define <256 x i8> @vdeal_6d(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 4, i32 2, i32 6, i32 8, i32 12, i32 10, i32 14, i32 32, i32 36, i32 34, i32 38, i32 40, i32 44, i32 42, i32 46, i32 16, i32 20, i32 18, i32 22, i32 24, i32 28, i32 26, i32 30, i32 48, i32 52, i32 50, i32 54, i32 56, i32 60, i32 58, i32 62, i32 64, i32 68, i32 66, i32 70, i32 72, i32 76, i32 74, i32 78, i32 96, i32 100, i32 98, i32 102, i32 104, i32 108, i32 106, i32 110, i32 80, i32 84, i32 82, i32 86, i32 88, i32 92, i32 90, i32 94, i32 112, i32 116, i32 114, i32 118, i32 120, i32 124, i32 122, i32 126, i32 128, i32 132, i32 130, i32 134, i32 136, i32 140, i32 138, i32 142, i32 160, i32 164, i32 162, i32 166, i32 168, i32 172, i32 170, i32 174, i32 144, i32 148, i32 146, i32 150, i32 152, i32 156, i32 154, i32 158, i32 176, i32 180, i32 178, i32 182, i32 184, i32 188, i32 186, i32 190, i32 192, i32 196, i32 194, i32 198, i32 200, i32 204, i32 202, i32 206, i32 224, i32 228, i32 226, i32 230, i32 232, i32 236, i32 234, i32 238, i32 208, i32 212, i32 210, i32 214, i32 216, i32 220, i32 218, i32 222, i32 240, i32 244, i32 242, i32 246, i32 248, i32 252, i32 250, i32 254, i32 1, i32 5, i32 3, i32 7, i32 9, i32 13, i32 11, i32 15, i32 33, i32 37, i32 35, i32 39, i32 41, i32 45, i32 43, i32 47, i32 17, i32 21, i32 19, i32 23, i32 25, i32 29, i32 27, i32 31, i32 49, i32 53, i32 51, i32 55, i32 57, i32 61, i32 59, i32 63, i32 65, i32 69, i32 67, i32 71, i32 73, i32 77, i32 75, i32 79, i32 97, i32 101, i32 99, i32 103, i32 105, i32 109, i32 107, i32 111, i32 81, i32 85, i32 83, i32 87, i32 89, i32 93, i32 91, i32 95, i32 113, i32 117, i32 115, i32 119, i32 121, i32 125, i32 123, i32 127, i32 129, i32 133, i32 131, i32 135, i32 137, i32 141, i32 139, i32 143, i32 161, i32 165, i32 163, i32 167, i32 169, i32 173, i32 171, i32 175, i32 145, i32 149, i32 147, i32 151, i32 153, i32 157, i32 155, i32 159, i32 177, i32 181, i32 179, i32 183, i32 185, i32 189, i32 187, i32 191, i32 193, i32 197, i32 195, i32 199, i32 201, i32 205, i32 203, i32 207, i32 225, i32 229, i32 227, i32 231, i32 233, i32 237, i32 235, i32 239, i32 209, i32 213, i32 211, i32 215, i32 217, i32 221, i32 219, i32 223, i32 241, i32 245, i32 243, i32 247, i32 249, i32 253, i32 251, i32 255> @@ -885,7 +885,7 @@ define <256 x i8> @vdeal_6d(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_6e: -; CHECK: [[REG6e:r[0-9]+]] = #110 +; CHECK: [[REG6e:r[0-9]+]] = #-18 ; CHECK: vdeal(v1,v0,[[REG6e]]) define <256 x i8> @vdeal_6e(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 32, i32 33, i32 36, i32 37, i32 40, i32 41, i32 44, i32 45, i32 16, i32 17, i32 20, i32 21, i32 24, i32 25, i32 28, i32 29, i32 48, i32 49, i32 52, i32 53, i32 56, i32 57, i32 
60, i32 61, i32 64, i32 65, i32 68, i32 69, i32 72, i32 73, i32 76, i32 77, i32 96, i32 97, i32 100, i32 101, i32 104, i32 105, i32 108, i32 109, i32 80, i32 81, i32 84, i32 85, i32 88, i32 89, i32 92, i32 93, i32 112, i32 113, i32 116, i32 117, i32 120, i32 121, i32 124, i32 125, i32 128, i32 129, i32 132, i32 133, i32 136, i32 137, i32 140, i32 141, i32 160, i32 161, i32 164, i32 165, i32 168, i32 169, i32 172, i32 173, i32 144, i32 145, i32 148, i32 149, i32 152, i32 153, i32 156, i32 157, i32 176, i32 177, i32 180, i32 181, i32 184, i32 185, i32 188, i32 189, i32 192, i32 193, i32 196, i32 197, i32 200, i32 201, i32 204, i32 205, i32 224, i32 225, i32 228, i32 229, i32 232, i32 233, i32 236, i32 237, i32 208, i32 209, i32 212, i32 213, i32 216, i32 217, i32 220, i32 221, i32 240, i32 241, i32 244, i32 245, i32 248, i32 249, i32 252, i32 253, i32 2, i32 3, i32 6, i32 7, i32 10, i32 11, i32 14, i32 15, i32 34, i32 35, i32 38, i32 39, i32 42, i32 43, i32 46, i32 47, i32 18, i32 19, i32 22, i32 23, i32 26, i32 27, i32 30, i32 31, i32 50, i32 51, i32 54, i32 55, i32 58, i32 59, i32 62, i32 63, i32 66, i32 67, i32 70, i32 71, i32 74, i32 75, i32 78, i32 79, i32 98, i32 99, i32 102, i32 103, i32 106, i32 107, i32 110, i32 111, i32 82, i32 83, i32 86, i32 87, i32 90, i32 91, i32 94, i32 95, i32 114, i32 115, i32 118, i32 119, i32 122, i32 123, i32 126, i32 127, i32 130, i32 131, i32 134, i32 135, i32 138, i32 139, i32 142, i32 143, i32 162, i32 163, i32 166, i32 167, i32 170, i32 171, i32 174, i32 175, i32 146, i32 147, i32 150, i32 151, i32 154, i32 155, i32 158, i32 159, i32 178, i32 179, i32 182, i32 183, i32 186, i32 187, i32 190, i32 191, i32 194, i32 195, i32 198, i32 199, i32 202, i32 203, i32 206, i32 207, i32 226, i32 227, i32 230, i32 231, i32 234, i32 235, i32 238, i32 239, i32 210, i32 211, i32 214, i32 215, i32 218, i32 219, i32 222, i32 223, i32 242, i32 243, i32 246, i32 247, i32 250, i32 251, i32 254, i32 255> @@ -893,7 +893,7 @@ define <256 x i8> @vdeal_6e(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_6f: -; CHECK: [[REG6f:r[0-9]+]] = #111 +; CHECK: [[REG6f:r[0-9]+]] = #-17 ; CHECK: vdeal(v1,v0,[[REG6f]]) define <256 x i8> @vdeal_6f(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126, i32 128, i32 130, i32 132, i32 134, i32 136, i32 138, i32 140, i32 142, i32 160, i32 162, i32 164, i32 166, i32 168, i32 170, i32 172, i32 174, i32 144, i32 146, i32 148, i32 150, i32 152, i32 154, i32 156, i32 158, i32 176, i32 178, i32 180, i32 182, i32 184, i32 186, i32 188, i32 190, i32 192, i32 194, i32 196, i32 198, i32 200, i32 202, i32 204, i32 206, i32 224, i32 226, i32 228, i32 230, i32 232, i32 234, i32 236, i32 238, i32 208, i32 210, i32 212, i32 214, i32 216, i32 218, i32 220, i32 222, i32 240, i32 242, i32 244, i32 246, i32 248, i32 250, i32 252, i32 254, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 17, i32 19, i32 21, i32 23, 
i32 25, i32 27, i32 29, i32 31, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127, i32 129, i32 131, i32 133, i32 135, i32 137, i32 139, i32 141, i32 143, i32 161, i32 163, i32 165, i32 167, i32 169, i32 171, i32 173, i32 175, i32 145, i32 147, i32 149, i32 151, i32 153, i32 155, i32 157, i32 159, i32 177, i32 179, i32 181, i32 183, i32 185, i32 187, i32 189, i32 191, i32 193, i32 195, i32 197, i32 199, i32 201, i32 203, i32 205, i32 207, i32 225, i32 227, i32 229, i32 231, i32 233, i32 235, i32 237, i32 239, i32 209, i32 211, i32 213, i32 215, i32 217, i32 219, i32 221, i32 223, i32 241, i32 243, i32 245, i32 247, i32 249, i32 251, i32 253, i32 255> @@ -901,7 +901,7 @@ define <256 x i8> @vdeal_6f(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_70: -; CHECK: [[REG70:r[0-9]+]] = #112 +; CHECK: [[REG70:r[0-9]+]] = #-16 ; CHECK: vdeal(v1,v0,[[REG70]]) define <256 x i8> @vdeal_70(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -909,7 +909,7 @@ define <256 x i8> @vdeal_70(<256 x i8> %v0, <256 x 
i8> %v1) #0 { } ; CHECK-LABEL: vdeal_71: -; CHECK: [[REG71:r[0-9]+]] = #113 +; CHECK: [[REG71:r[0-9]+]] = #-15 ; CHECK: vdeal(v1,v0,[[REG71]]) define <256 x i8> @vdeal_71(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30, i32 32, i32 48, i32 34, i32 50, i32 36, i32 52, i32 38, i32 54, i32 40, i32 56, i32 42, i32 58, i32 44, i32 60, i32 46, i32 62, i32 64, i32 80, i32 66, i32 82, i32 68, i32 84, i32 70, i32 86, i32 72, i32 88, i32 74, i32 90, i32 76, i32 92, i32 78, i32 94, i32 96, i32 112, i32 98, i32 114, i32 100, i32 116, i32 102, i32 118, i32 104, i32 120, i32 106, i32 122, i32 108, i32 124, i32 110, i32 126, i32 128, i32 144, i32 130, i32 146, i32 132, i32 148, i32 134, i32 150, i32 136, i32 152, i32 138, i32 154, i32 140, i32 156, i32 142, i32 158, i32 160, i32 176, i32 162, i32 178, i32 164, i32 180, i32 166, i32 182, i32 168, i32 184, i32 170, i32 186, i32 172, i32 188, i32 174, i32 190, i32 192, i32 208, i32 194, i32 210, i32 196, i32 212, i32 198, i32 214, i32 200, i32 216, i32 202, i32 218, i32 204, i32 220, i32 206, i32 222, i32 224, i32 240, i32 226, i32 242, i32 228, i32 244, i32 230, i32 246, i32 232, i32 248, i32 234, i32 250, i32 236, i32 252, i32 238, i32 254, i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31, i32 33, i32 49, i32 35, i32 51, i32 37, i32 53, i32 39, i32 55, i32 41, i32 57, i32 43, i32 59, i32 45, i32 61, i32 47, i32 63, i32 65, i32 81, i32 67, i32 83, i32 69, i32 85, i32 71, i32 87, i32 73, i32 89, i32 75, i32 91, i32 77, i32 93, i32 79, i32 95, i32 97, i32 113, i32 99, i32 115, i32 101, i32 117, i32 103, i32 119, i32 105, i32 121, i32 107, i32 123, i32 109, i32 125, i32 111, i32 127, i32 129, i32 145, i32 131, i32 147, i32 133, i32 149, i32 135, i32 151, i32 137, i32 153, i32 139, i32 155, i32 141, i32 157, i32 143, i32 159, i32 161, i32 177, i32 163, i32 179, i32 165, i32 181, i32 167, i32 183, i32 169, i32 185, i32 171, i32 187, i32 173, i32 189, i32 175, i32 191, i32 193, i32 209, i32 195, i32 211, i32 197, i32 213, i32 199, i32 215, i32 201, i32 217, i32 203, i32 219, i32 205, i32 221, i32 207, i32 223, i32 225, i32 241, i32 227, i32 243, i32 229, i32 245, i32 231, i32 247, i32 233, i32 249, i32 235, i32 251, i32 237, i32 253, i32 239, i32 255> @@ -917,7 +917,7 @@ define <256 x i8> @vdeal_71(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_72: -; CHECK: [[REG72:r[0-9]+]] = #114 +; CHECK: [[REG72:r[0-9]+]] = #-14 ; CHECK: vdeal(v1,v0,[[REG72]]) define <256 x i8> @vdeal_72(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 20, i32 21, i32 8, i32 9, i32 24, i32 25, i32 12, i32 13, i32 28, i32 29, i32 32, i32 33, i32 48, i32 49, i32 36, i32 37, i32 52, i32 53, i32 40, i32 41, i32 56, i32 57, i32 44, i32 45, i32 60, i32 61, i32 64, i32 65, i32 80, i32 81, i32 68, i32 69, i32 84, i32 85, i32 72, i32 73, i32 88, i32 89, i32 76, i32 77, i32 92, i32 93, i32 96, i32 97, i32 112, i32 113, i32 100, i32 101, i32 116, i32 117, i32 104, i32 105, i32 120, i32 121, i32 108, i32 109, i32 124, i32 125, i32 128, i32 129, i32 144, i32 145, i32 132, i32 133, i32 148, i32 149, i32 136, i32 137, i32 152, i32 153, i32 140, i32 141, i32 156, i32 157, i32 160, i32 161, i32 176, i32 177, i32 164, i32 165, i32 180, i32 181, i32 
168, i32 169, i32 184, i32 185, i32 172, i32 173, i32 188, i32 189, i32 192, i32 193, i32 208, i32 209, i32 196, i32 197, i32 212, i32 213, i32 200, i32 201, i32 216, i32 217, i32 204, i32 205, i32 220, i32 221, i32 224, i32 225, i32 240, i32 241, i32 228, i32 229, i32 244, i32 245, i32 232, i32 233, i32 248, i32 249, i32 236, i32 237, i32 252, i32 253, i32 2, i32 3, i32 18, i32 19, i32 6, i32 7, i32 22, i32 23, i32 10, i32 11, i32 26, i32 27, i32 14, i32 15, i32 30, i32 31, i32 34, i32 35, i32 50, i32 51, i32 38, i32 39, i32 54, i32 55, i32 42, i32 43, i32 58, i32 59, i32 46, i32 47, i32 62, i32 63, i32 66, i32 67, i32 82, i32 83, i32 70, i32 71, i32 86, i32 87, i32 74, i32 75, i32 90, i32 91, i32 78, i32 79, i32 94, i32 95, i32 98, i32 99, i32 114, i32 115, i32 102, i32 103, i32 118, i32 119, i32 106, i32 107, i32 122, i32 123, i32 110, i32 111, i32 126, i32 127, i32 130, i32 131, i32 146, i32 147, i32 134, i32 135, i32 150, i32 151, i32 138, i32 139, i32 154, i32 155, i32 142, i32 143, i32 158, i32 159, i32 162, i32 163, i32 178, i32 179, i32 166, i32 167, i32 182, i32 183, i32 170, i32 171, i32 186, i32 187, i32 174, i32 175, i32 190, i32 191, i32 194, i32 195, i32 210, i32 211, i32 198, i32 199, i32 214, i32 215, i32 202, i32 203, i32 218, i32 219, i32 206, i32 207, i32 222, i32 223, i32 226, i32 227, i32 242, i32 243, i32 230, i32 231, i32 246, i32 247, i32 234, i32 235, i32 250, i32 251, i32 238, i32 239, i32 254, i32 255> @@ -925,7 +925,7 @@ define <256 x i8> @vdeal_72(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_73: -; CHECK: [[REG73:r[0-9]+]] = #115 +; CHECK: [[REG73:r[0-9]+]] = #-13 ; CHECK: vdeal(v1,v0,[[REG73]]) define <256 x i8> @vdeal_73(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 16, i32 18, i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30, i32 32, i32 34, i32 48, i32 50, i32 36, i32 38, i32 52, i32 54, i32 40, i32 42, i32 56, i32 58, i32 44, i32 46, i32 60, i32 62, i32 64, i32 66, i32 80, i32 82, i32 68, i32 70, i32 84, i32 86, i32 72, i32 74, i32 88, i32 90, i32 76, i32 78, i32 92, i32 94, i32 96, i32 98, i32 112, i32 114, i32 100, i32 102, i32 116, i32 118, i32 104, i32 106, i32 120, i32 122, i32 108, i32 110, i32 124, i32 126, i32 128, i32 130, i32 144, i32 146, i32 132, i32 134, i32 148, i32 150, i32 136, i32 138, i32 152, i32 154, i32 140, i32 142, i32 156, i32 158, i32 160, i32 162, i32 176, i32 178, i32 164, i32 166, i32 180, i32 182, i32 168, i32 170, i32 184, i32 186, i32 172, i32 174, i32 188, i32 190, i32 192, i32 194, i32 208, i32 210, i32 196, i32 198, i32 212, i32 214, i32 200, i32 202, i32 216, i32 218, i32 204, i32 206, i32 220, i32 222, i32 224, i32 226, i32 240, i32 242, i32 228, i32 230, i32 244, i32 246, i32 232, i32 234, i32 248, i32 250, i32 236, i32 238, i32 252, i32 254, i32 1, i32 3, i32 17, i32 19, i32 5, i32 7, i32 21, i32 23, i32 9, i32 11, i32 25, i32 27, i32 13, i32 15, i32 29, i32 31, i32 33, i32 35, i32 49, i32 51, i32 37, i32 39, i32 53, i32 55, i32 41, i32 43, i32 57, i32 59, i32 45, i32 47, i32 61, i32 63, i32 65, i32 67, i32 81, i32 83, i32 69, i32 71, i32 85, i32 87, i32 73, i32 75, i32 89, i32 91, i32 77, i32 79, i32 93, i32 95, i32 97, i32 99, i32 113, i32 115, i32 101, i32 103, i32 117, i32 119, i32 105, i32 107, i32 121, i32 123, i32 109, i32 111, i32 125, i32 127, i32 129, i32 131, i32 145, i32 147, i32 133, i32 135, i32 149, i32 151, i32 137, i32 139, i32 153, i32 155, i32 141, i32 143, i32 157, 
i32 159, i32 161, i32 163, i32 177, i32 179, i32 165, i32 167, i32 181, i32 183, i32 169, i32 171, i32 185, i32 187, i32 173, i32 175, i32 189, i32 191, i32 193, i32 195, i32 209, i32 211, i32 197, i32 199, i32 213, i32 215, i32 201, i32 203, i32 217, i32 219, i32 205, i32 207, i32 221, i32 223, i32 225, i32 227, i32 241, i32 243, i32 229, i32 231, i32 245, i32 247, i32 233, i32 235, i32 249, i32 251, i32 237, i32 239, i32 253, i32 255> @@ -933,7 +933,7 @@ define <256 x i8> @vdeal_73(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_74: -; CHECK: [[REG74:r[0-9]+]] = #116 +; CHECK: [[REG74:r[0-9]+]] = #-12 ; CHECK: vdeal(v1,v0,[[REG74]]) define <256 x i8> @vdeal_74(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 32, i32 33, i32 34, i32 35, i32 48, i32 49, i32 50, i32 51, i32 40, i32 41, i32 42, i32 43, i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67, i32 80, i32 81, i32 82, i32 83, i32 72, i32 73, i32 74, i32 75, i32 88, i32 89, i32 90, i32 91, i32 96, i32 97, i32 98, i32 99, i32 112, i32 113, i32 114, i32 115, i32 104, i32 105, i32 106, i32 107, i32 120, i32 121, i32 122, i32 123, i32 128, i32 129, i32 130, i32 131, i32 144, i32 145, i32 146, i32 147, i32 136, i32 137, i32 138, i32 139, i32 152, i32 153, i32 154, i32 155, i32 160, i32 161, i32 162, i32 163, i32 176, i32 177, i32 178, i32 179, i32 168, i32 169, i32 170, i32 171, i32 184, i32 185, i32 186, i32 187, i32 192, i32 193, i32 194, i32 195, i32 208, i32 209, i32 210, i32 211, i32 200, i32 201, i32 202, i32 203, i32 216, i32 217, i32 218, i32 219, i32 224, i32 225, i32 226, i32 227, i32 240, i32 241, i32 242, i32 243, i32 232, i32 233, i32 234, i32 235, i32 248, i32 249, i32 250, i32 251, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31, i32 36, i32 37, i32 38, i32 39, i32 52, i32 53, i32 54, i32 55, i32 44, i32 45, i32 46, i32 47, i32 60, i32 61, i32 62, i32 63, i32 68, i32 69, i32 70, i32 71, i32 84, i32 85, i32 86, i32 87, i32 76, i32 77, i32 78, i32 79, i32 92, i32 93, i32 94, i32 95, i32 100, i32 101, i32 102, i32 103, i32 116, i32 117, i32 118, i32 119, i32 108, i32 109, i32 110, i32 111, i32 124, i32 125, i32 126, i32 127, i32 132, i32 133, i32 134, i32 135, i32 148, i32 149, i32 150, i32 151, i32 140, i32 141, i32 142, i32 143, i32 156, i32 157, i32 158, i32 159, i32 164, i32 165, i32 166, i32 167, i32 180, i32 181, i32 182, i32 183, i32 172, i32 173, i32 174, i32 175, i32 188, i32 189, i32 190, i32 191, i32 196, i32 197, i32 198, i32 199, i32 212, i32 213, i32 214, i32 215, i32 204, i32 205, i32 206, i32 207, i32 220, i32 221, i32 222, i32 223, i32 228, i32 229, i32 230, i32 231, i32 244, i32 245, i32 246, i32 247, i32 236, i32 237, i32 238, i32 239, i32 252, i32 253, i32 254, i32 255> @@ -941,7 +941,7 @@ define <256 x i8> @vdeal_74(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_75: -; CHECK: [[REG75:r[0-9]+]] = #117 +; CHECK: [[REG75:r[0-9]+]] = #-11 ; CHECK: vdeal(v1,v0,[[REG75]]) define <256 x i8> @vdeal_75(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 4, i32 2, i32 6, i32 16, i32 20, i32 18, i32 22, i32 8, i32 12, i32 10, i32 14, i32 24, i32 28, i32 26, i32 30, i32 32, i32 36, i32 34, i32 38, i32 48, i32 52, i32 50, i32 54, i32 40, i32 44, i32 42, i32 46, i32 56, 
i32 60, i32 58, i32 62, i32 64, i32 68, i32 66, i32 70, i32 80, i32 84, i32 82, i32 86, i32 72, i32 76, i32 74, i32 78, i32 88, i32 92, i32 90, i32 94, i32 96, i32 100, i32 98, i32 102, i32 112, i32 116, i32 114, i32 118, i32 104, i32 108, i32 106, i32 110, i32 120, i32 124, i32 122, i32 126, i32 128, i32 132, i32 130, i32 134, i32 144, i32 148, i32 146, i32 150, i32 136, i32 140, i32 138, i32 142, i32 152, i32 156, i32 154, i32 158, i32 160, i32 164, i32 162, i32 166, i32 176, i32 180, i32 178, i32 182, i32 168, i32 172, i32 170, i32 174, i32 184, i32 188, i32 186, i32 190, i32 192, i32 196, i32 194, i32 198, i32 208, i32 212, i32 210, i32 214, i32 200, i32 204, i32 202, i32 206, i32 216, i32 220, i32 218, i32 222, i32 224, i32 228, i32 226, i32 230, i32 240, i32 244, i32 242, i32 246, i32 232, i32 236, i32 234, i32 238, i32 248, i32 252, i32 250, i32 254, i32 1, i32 5, i32 3, i32 7, i32 17, i32 21, i32 19, i32 23, i32 9, i32 13, i32 11, i32 15, i32 25, i32 29, i32 27, i32 31, i32 33, i32 37, i32 35, i32 39, i32 49, i32 53, i32 51, i32 55, i32 41, i32 45, i32 43, i32 47, i32 57, i32 61, i32 59, i32 63, i32 65, i32 69, i32 67, i32 71, i32 81, i32 85, i32 83, i32 87, i32 73, i32 77, i32 75, i32 79, i32 89, i32 93, i32 91, i32 95, i32 97, i32 101, i32 99, i32 103, i32 113, i32 117, i32 115, i32 119, i32 105, i32 109, i32 107, i32 111, i32 121, i32 125, i32 123, i32 127, i32 129, i32 133, i32 131, i32 135, i32 145, i32 149, i32 147, i32 151, i32 137, i32 141, i32 139, i32 143, i32 153, i32 157, i32 155, i32 159, i32 161, i32 165, i32 163, i32 167, i32 177, i32 181, i32 179, i32 183, i32 169, i32 173, i32 171, i32 175, i32 185, i32 189, i32 187, i32 191, i32 193, i32 197, i32 195, i32 199, i32 209, i32 213, i32 211, i32 215, i32 201, i32 205, i32 203, i32 207, i32 217, i32 221, i32 219, i32 223, i32 225, i32 229, i32 227, i32 231, i32 241, i32 245, i32 243, i32 247, i32 233, i32 237, i32 235, i32 239, i32 249, i32 253, i32 251, i32 255> @@ -949,7 +949,7 @@ define <256 x i8> @vdeal_75(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_76: -; CHECK: [[REG76:r[0-9]+]] = #118 +; CHECK: [[REG76:r[0-9]+]] = #-10 ; CHECK: vdeal(v1,v0,[[REG76]]) define <256 x i8> @vdeal_76(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 4, i32 5, i32 16, i32 17, i32 20, i32 21, i32 8, i32 9, i32 12, i32 13, i32 24, i32 25, i32 28, i32 29, i32 32, i32 33, i32 36, i32 37, i32 48, i32 49, i32 52, i32 53, i32 40, i32 41, i32 44, i32 45, i32 56, i32 57, i32 60, i32 61, i32 64, i32 65, i32 68, i32 69, i32 80, i32 81, i32 84, i32 85, i32 72, i32 73, i32 76, i32 77, i32 88, i32 89, i32 92, i32 93, i32 96, i32 97, i32 100, i32 101, i32 112, i32 113, i32 116, i32 117, i32 104, i32 105, i32 108, i32 109, i32 120, i32 121, i32 124, i32 125, i32 128, i32 129, i32 132, i32 133, i32 144, i32 145, i32 148, i32 149, i32 136, i32 137, i32 140, i32 141, i32 152, i32 153, i32 156, i32 157, i32 160, i32 161, i32 164, i32 165, i32 176, i32 177, i32 180, i32 181, i32 168, i32 169, i32 172, i32 173, i32 184, i32 185, i32 188, i32 189, i32 192, i32 193, i32 196, i32 197, i32 208, i32 209, i32 212, i32 213, i32 200, i32 201, i32 204, i32 205, i32 216, i32 217, i32 220, i32 221, i32 224, i32 225, i32 228, i32 229, i32 240, i32 241, i32 244, i32 245, i32 232, i32 233, i32 236, i32 237, i32 248, i32 249, i32 252, i32 253, i32 2, i32 3, i32 6, i32 7, i32 18, i32 19, i32 22, i32 23, i32 10, i32 11, i32 14, i32 15, i32 26, i32 27, i32 30, i32 31, i32 34, i32 35, i32 
38, i32 39, i32 50, i32 51, i32 54, i32 55, i32 42, i32 43, i32 46, i32 47, i32 58, i32 59, i32 62, i32 63, i32 66, i32 67, i32 70, i32 71, i32 82, i32 83, i32 86, i32 87, i32 74, i32 75, i32 78, i32 79, i32 90, i32 91, i32 94, i32 95, i32 98, i32 99, i32 102, i32 103, i32 114, i32 115, i32 118, i32 119, i32 106, i32 107, i32 110, i32 111, i32 122, i32 123, i32 126, i32 127, i32 130, i32 131, i32 134, i32 135, i32 146, i32 147, i32 150, i32 151, i32 138, i32 139, i32 142, i32 143, i32 154, i32 155, i32 158, i32 159, i32 162, i32 163, i32 166, i32 167, i32 178, i32 179, i32 182, i32 183, i32 170, i32 171, i32 174, i32 175, i32 186, i32 187, i32 190, i32 191, i32 194, i32 195, i32 198, i32 199, i32 210, i32 211, i32 214, i32 215, i32 202, i32 203, i32 206, i32 207, i32 218, i32 219, i32 222, i32 223, i32 226, i32 227, i32 230, i32 231, i32 242, i32 243, i32 246, i32 247, i32 234, i32 235, i32 238, i32 239, i32 250, i32 251, i32 254, i32 255> @@ -957,7 +957,7 @@ define <256 x i8> @vdeal_76(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_77: -; CHECK: [[REG77:r[0-9]+]] = #119 +; CHECK: [[REG77:r[0-9]+]] = #-9 ; CHECK: vdeal(v1,v0,[[REG77]]) define <256 x i8> @vdeal_77(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 48, i32 50, i32 52, i32 54, i32 40, i32 42, i32 44, i32 46, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 80, i32 82, i32 84, i32 86, i32 72, i32 74, i32 76, i32 78, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 112, i32 114, i32 116, i32 118, i32 104, i32 106, i32 108, i32 110, i32 120, i32 122, i32 124, i32 126, i32 128, i32 130, i32 132, i32 134, i32 144, i32 146, i32 148, i32 150, i32 136, i32 138, i32 140, i32 142, i32 152, i32 154, i32 156, i32 158, i32 160, i32 162, i32 164, i32 166, i32 176, i32 178, i32 180, i32 182, i32 168, i32 170, i32 172, i32 174, i32 184, i32 186, i32 188, i32 190, i32 192, i32 194, i32 196, i32 198, i32 208, i32 210, i32 212, i32 214, i32 200, i32 202, i32 204, i32 206, i32 216, i32 218, i32 220, i32 222, i32 224, i32 226, i32 228, i32 230, i32 240, i32 242, i32 244, i32 246, i32 232, i32 234, i32 236, i32 238, i32 248, i32 250, i32 252, i32 254, i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 49, i32 51, i32 53, i32 55, i32 41, i32 43, i32 45, i32 47, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 81, i32 83, i32 85, i32 87, i32 73, i32 75, i32 77, i32 79, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 113, i32 115, i32 117, i32 119, i32 105, i32 107, i32 109, i32 111, i32 121, i32 123, i32 125, i32 127, i32 129, i32 131, i32 133, i32 135, i32 145, i32 147, i32 149, i32 151, i32 137, i32 139, i32 141, i32 143, i32 153, i32 155, i32 157, i32 159, i32 161, i32 163, i32 165, i32 167, i32 177, i32 179, i32 181, i32 183, i32 169, i32 171, i32 173, i32 175, i32 185, i32 187, i32 189, i32 191, i32 193, i32 195, i32 197, i32 199, i32 209, i32 211, i32 213, i32 215, i32 201, i32 203, i32 205, i32 207, i32 217, i32 219, i32 221, i32 223, i32 225, i32 227, i32 229, i32 231, i32 241, i32 243, i32 245, i32 247, i32 233, i32 235, i32 237, i32 239, i32 249, i32 251, i32 253, i32 255> @@ -965,7 +965,7 @@ define <256 x i8> @vdeal_77(<256 x i8> 
%v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_78: -; CHECK: [[REG78:r[0-9]+]] = #120 +; CHECK: [[REG78:r[0-9]+]] = #-8 ; CHECK: vdeal(v1,v0,[[REG78]]) define <256 x i8> @vdeal_78(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -973,7 +973,7 @@ define <256 x i8> @vdeal_78(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_79: -; CHECK: [[REG79:r[0-9]+]] = #121 +; CHECK: [[REG79:r[0-9]+]] = #-7 ; CHECK: vdeal(v1,v0,[[REG79]]) define <256 x i8> @vdeal_79(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14, i32 16, i32 24, i32 18, i32 26, i32 20, i32 28, i32 22, i32 30, i32 32, i32 40, i32 34, i32 42, i32 36, i32 44, i32 38, i32 46, i32 48, i32 56, i32 50, i32 58, i32 52, i32 60, i32 54, i32 62, i32 64, i32 72, i32 66, i32 74, i32 68, i32 76, i32 70, i32 78, i32 80, i32 88, i32 82, i32 90, i32 84, i32 92, i32 86, i32 94, i32 96, i32 104, i32 98, i32 106, i32 100, i32 108, i32 102, i32 110, i32 112, i32 120, i32 114, i32 122, i32 116, i32 124, i32 118, i32 126, i32 128, i32 136, i32 130, i32 138, i32 132, i32 140, i32 134, i32 142, i32 144, i32 152, i32 146, i32 154, i32 148, i32 156, i32 150, i32 158, i32 160, i32 168, i32 162, i32 170, i32 164, i32 172, i32 166, 
i32 174, i32 176, i32 184, i32 178, i32 186, i32 180, i32 188, i32 182, i32 190, i32 192, i32 200, i32 194, i32 202, i32 196, i32 204, i32 198, i32 206, i32 208, i32 216, i32 210, i32 218, i32 212, i32 220, i32 214, i32 222, i32 224, i32 232, i32 226, i32 234, i32 228, i32 236, i32 230, i32 238, i32 240, i32 248, i32 242, i32 250, i32 244, i32 252, i32 246, i32 254, i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, i32 17, i32 25, i32 19, i32 27, i32 21, i32 29, i32 23, i32 31, i32 33, i32 41, i32 35, i32 43, i32 37, i32 45, i32 39, i32 47, i32 49, i32 57, i32 51, i32 59, i32 53, i32 61, i32 55, i32 63, i32 65, i32 73, i32 67, i32 75, i32 69, i32 77, i32 71, i32 79, i32 81, i32 89, i32 83, i32 91, i32 85, i32 93, i32 87, i32 95, i32 97, i32 105, i32 99, i32 107, i32 101, i32 109, i32 103, i32 111, i32 113, i32 121, i32 115, i32 123, i32 117, i32 125, i32 119, i32 127, i32 129, i32 137, i32 131, i32 139, i32 133, i32 141, i32 135, i32 143, i32 145, i32 153, i32 147, i32 155, i32 149, i32 157, i32 151, i32 159, i32 161, i32 169, i32 163, i32 171, i32 165, i32 173, i32 167, i32 175, i32 177, i32 185, i32 179, i32 187, i32 181, i32 189, i32 183, i32 191, i32 193, i32 201, i32 195, i32 203, i32 197, i32 205, i32 199, i32 207, i32 209, i32 217, i32 211, i32 219, i32 213, i32 221, i32 215, i32 223, i32 225, i32 233, i32 227, i32 235, i32 229, i32 237, i32 231, i32 239, i32 241, i32 249, i32 243, i32 251, i32 245, i32 253, i32 247, i32 255> @@ -981,7 +981,7 @@ define <256 x i8> @vdeal_79(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_7a: -; CHECK: [[REG7a:r[0-9]+]] = #122 +; CHECK: [[REG7a:r[0-9]+]] = #-6 ; CHECK: vdeal(v1,v0,[[REG7a]]) define <256 x i8> @vdeal_7a(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13, i32 16, i32 17, i32 24, i32 25, i32 20, i32 21, i32 28, i32 29, i32 32, i32 33, i32 40, i32 41, i32 36, i32 37, i32 44, i32 45, i32 48, i32 49, i32 56, i32 57, i32 52, i32 53, i32 60, i32 61, i32 64, i32 65, i32 72, i32 73, i32 68, i32 69, i32 76, i32 77, i32 80, i32 81, i32 88, i32 89, i32 84, i32 85, i32 92, i32 93, i32 96, i32 97, i32 104, i32 105, i32 100, i32 101, i32 108, i32 109, i32 112, i32 113, i32 120, i32 121, i32 116, i32 117, i32 124, i32 125, i32 128, i32 129, i32 136, i32 137, i32 132, i32 133, i32 140, i32 141, i32 144, i32 145, i32 152, i32 153, i32 148, i32 149, i32 156, i32 157, i32 160, i32 161, i32 168, i32 169, i32 164, i32 165, i32 172, i32 173, i32 176, i32 177, i32 184, i32 185, i32 180, i32 181, i32 188, i32 189, i32 192, i32 193, i32 200, i32 201, i32 196, i32 197, i32 204, i32 205, i32 208, i32 209, i32 216, i32 217, i32 212, i32 213, i32 220, i32 221, i32 224, i32 225, i32 232, i32 233, i32 228, i32 229, i32 236, i32 237, i32 240, i32 241, i32 248, i32 249, i32 244, i32 245, i32 252, i32 253, i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15, i32 18, i32 19, i32 26, i32 27, i32 22, i32 23, i32 30, i32 31, i32 34, i32 35, i32 42, i32 43, i32 38, i32 39, i32 46, i32 47, i32 50, i32 51, i32 58, i32 59, i32 54, i32 55, i32 62, i32 63, i32 66, i32 67, i32 74, i32 75, i32 70, i32 71, i32 78, i32 79, i32 82, i32 83, i32 90, i32 91, i32 86, i32 87, i32 94, i32 95, i32 98, i32 99, i32 106, i32 107, i32 102, i32 103, i32 110, i32 111, i32 114, i32 115, i32 122, i32 123, i32 118, i32 119, i32 126, i32 127, i32 130, i32 131, i32 138, i32 139, i32 134, i32 135, i32 142, i32 143, i32 146, i32 147, i32 154, i32 155, i32 150, i32 151, 
i32 158, i32 159, i32 162, i32 163, i32 170, i32 171, i32 166, i32 167, i32 174, i32 175, i32 178, i32 179, i32 186, i32 187, i32 182, i32 183, i32 190, i32 191, i32 194, i32 195, i32 202, i32 203, i32 198, i32 199, i32 206, i32 207, i32 210, i32 211, i32 218, i32 219, i32 214, i32 215, i32 222, i32 223, i32 226, i32 227, i32 234, i32 235, i32 230, i32 231, i32 238, i32 239, i32 242, i32 243, i32 250, i32 251, i32 246, i32 247, i32 254, i32 255> @@ -989,7 +989,7 @@ define <256 x i8> @vdeal_7a(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_7b: -; CHECK: [[REG7b:r[0-9]+]] = #123 +; CHECK: [[REG7b:r[0-9]+]] = #-5 ; CHECK: vdeal(v1,v0,[[REG7b]]) define <256 x i8> @vdeal_7b(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14, i32 16, i32 18, i32 24, i32 26, i32 20, i32 22, i32 28, i32 30, i32 32, i32 34, i32 40, i32 42, i32 36, i32 38, i32 44, i32 46, i32 48, i32 50, i32 56, i32 58, i32 52, i32 54, i32 60, i32 62, i32 64, i32 66, i32 72, i32 74, i32 68, i32 70, i32 76, i32 78, i32 80, i32 82, i32 88, i32 90, i32 84, i32 86, i32 92, i32 94, i32 96, i32 98, i32 104, i32 106, i32 100, i32 102, i32 108, i32 110, i32 112, i32 114, i32 120, i32 122, i32 116, i32 118, i32 124, i32 126, i32 128, i32 130, i32 136, i32 138, i32 132, i32 134, i32 140, i32 142, i32 144, i32 146, i32 152, i32 154, i32 148, i32 150, i32 156, i32 158, i32 160, i32 162, i32 168, i32 170, i32 164, i32 166, i32 172, i32 174, i32 176, i32 178, i32 184, i32 186, i32 180, i32 182, i32 188, i32 190, i32 192, i32 194, i32 200, i32 202, i32 196, i32 198, i32 204, i32 206, i32 208, i32 210, i32 216, i32 218, i32 212, i32 214, i32 220, i32 222, i32 224, i32 226, i32 232, i32 234, i32 228, i32 230, i32 236, i32 238, i32 240, i32 242, i32 248, i32 250, i32 244, i32 246, i32 252, i32 254, i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15, i32 17, i32 19, i32 25, i32 27, i32 21, i32 23, i32 29, i32 31, i32 33, i32 35, i32 41, i32 43, i32 37, i32 39, i32 45, i32 47, i32 49, i32 51, i32 57, i32 59, i32 53, i32 55, i32 61, i32 63, i32 65, i32 67, i32 73, i32 75, i32 69, i32 71, i32 77, i32 79, i32 81, i32 83, i32 89, i32 91, i32 85, i32 87, i32 93, i32 95, i32 97, i32 99, i32 105, i32 107, i32 101, i32 103, i32 109, i32 111, i32 113, i32 115, i32 121, i32 123, i32 117, i32 119, i32 125, i32 127, i32 129, i32 131, i32 137, i32 139, i32 133, i32 135, i32 141, i32 143, i32 145, i32 147, i32 153, i32 155, i32 149, i32 151, i32 157, i32 159, i32 161, i32 163, i32 169, i32 171, i32 165, i32 167, i32 173, i32 175, i32 177, i32 179, i32 185, i32 187, i32 181, i32 183, i32 189, i32 191, i32 193, i32 195, i32 201, i32 203, i32 197, i32 199, i32 205, i32 207, i32 209, i32 211, i32 217, i32 219, i32 213, i32 215, i32 221, i32 223, i32 225, i32 227, i32 233, i32 235, i32 229, i32 231, i32 237, i32 239, i32 241, i32 243, i32 249, i32 251, i32 245, i32 247, i32 253, i32 255> @@ -997,7 +997,7 @@ define <256 x i8> @vdeal_7b(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_7c: -; CHECK: [[REG7c:r[0-9]+]] = #124 +; CHECK: [[REG7c:r[0-9]+]] = #-4 ; CHECK: vdeal(v1,v0,[[REG7c]]) define <256 x i8> @vdeal_7c(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27, i32 32, i32 33, i32 34, i32 35, i32 40, i32 41, i32 42, i32 43, i32 48, i32 49, i32 50, i32 51, 
i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67, i32 72, i32 73, i32 74, i32 75, i32 80, i32 81, i32 82, i32 83, i32 88, i32 89, i32 90, i32 91, i32 96, i32 97, i32 98, i32 99, i32 104, i32 105, i32 106, i32 107, i32 112, i32 113, i32 114, i32 115, i32 120, i32 121, i32 122, i32 123, i32 128, i32 129, i32 130, i32 131, i32 136, i32 137, i32 138, i32 139, i32 144, i32 145, i32 146, i32 147, i32 152, i32 153, i32 154, i32 155, i32 160, i32 161, i32 162, i32 163, i32 168, i32 169, i32 170, i32 171, i32 176, i32 177, i32 178, i32 179, i32 184, i32 185, i32 186, i32 187, i32 192, i32 193, i32 194, i32 195, i32 200, i32 201, i32 202, i32 203, i32 208, i32 209, i32 210, i32 211, i32 216, i32 217, i32 218, i32 219, i32 224, i32 225, i32 226, i32 227, i32 232, i32 233, i32 234, i32 235, i32 240, i32 241, i32 242, i32 243, i32 248, i32 249, i32 250, i32 251, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31, i32 36, i32 37, i32 38, i32 39, i32 44, i32 45, i32 46, i32 47, i32 52, i32 53, i32 54, i32 55, i32 60, i32 61, i32 62, i32 63, i32 68, i32 69, i32 70, i32 71, i32 76, i32 77, i32 78, i32 79, i32 84, i32 85, i32 86, i32 87, i32 92, i32 93, i32 94, i32 95, i32 100, i32 101, i32 102, i32 103, i32 108, i32 109, i32 110, i32 111, i32 116, i32 117, i32 118, i32 119, i32 124, i32 125, i32 126, i32 127, i32 132, i32 133, i32 134, i32 135, i32 140, i32 141, i32 142, i32 143, i32 148, i32 149, i32 150, i32 151, i32 156, i32 157, i32 158, i32 159, i32 164, i32 165, i32 166, i32 167, i32 172, i32 173, i32 174, i32 175, i32 180, i32 181, i32 182, i32 183, i32 188, i32 189, i32 190, i32 191, i32 196, i32 197, i32 198, i32 199, i32 204, i32 205, i32 206, i32 207, i32 212, i32 213, i32 214, i32 215, i32 220, i32 221, i32 222, i32 223, i32 228, i32 229, i32 230, i32 231, i32 236, i32 237, i32 238, i32 239, i32 244, i32 245, i32 246, i32 247, i32 252, i32 253, i32 254, i32 255> @@ -1005,7 +1005,7 @@ define <256 x i8> @vdeal_7c(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_7d: -; CHECK: [[REG7d:r[0-9]+]] = #125 +; CHECK: [[REG7d:r[0-9]+]] = #-3 ; CHECK: vdeal(v1,v0,[[REG7d]]) define <256 x i8> @vdeal_7d(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 4, i32 2, i32 6, i32 8, i32 12, i32 10, i32 14, i32 16, i32 20, i32 18, i32 22, i32 24, i32 28, i32 26, i32 30, i32 32, i32 36, i32 34, i32 38, i32 40, i32 44, i32 42, i32 46, i32 48, i32 52, i32 50, i32 54, i32 56, i32 60, i32 58, i32 62, i32 64, i32 68, i32 66, i32 70, i32 72, i32 76, i32 74, i32 78, i32 80, i32 84, i32 82, i32 86, i32 88, i32 92, i32 90, i32 94, i32 96, i32 100, i32 98, i32 102, i32 104, i32 108, i32 106, i32 110, i32 112, i32 116, i32 114, i32 118, i32 120, i32 124, i32 122, i32 126, i32 128, i32 132, i32 130, i32 134, i32 136, i32 140, i32 138, i32 142, i32 144, i32 148, i32 146, i32 150, i32 152, i32 156, i32 154, i32 158, i32 160, i32 164, i32 162, i32 166, i32 168, i32 172, i32 170, i32 174, i32 176, i32 180, i32 178, i32 182, i32 184, i32 188, i32 186, i32 190, i32 192, i32 196, i32 194, i32 198, i32 200, i32 204, i32 202, i32 206, i32 208, i32 212, i32 210, i32 214, i32 216, i32 220, i32 218, i32 222, i32 224, i32 228, i32 226, i32 230, i32 232, i32 236, i32 234, i32 238, i32 240, i32 244, i32 242, i32 246, i32 248, i32 252, i32 250, i32 254, i32 1, i32 5, i32 3, i32 7, i32 9, i32 13, i32 11, i32 15, i32 17, i32 21, i32 19, i32 23, i32 25, i32 29, i32 27, i32 31, i32 33, i32 
37, i32 35, i32 39, i32 41, i32 45, i32 43, i32 47, i32 49, i32 53, i32 51, i32 55, i32 57, i32 61, i32 59, i32 63, i32 65, i32 69, i32 67, i32 71, i32 73, i32 77, i32 75, i32 79, i32 81, i32 85, i32 83, i32 87, i32 89, i32 93, i32 91, i32 95, i32 97, i32 101, i32 99, i32 103, i32 105, i32 109, i32 107, i32 111, i32 113, i32 117, i32 115, i32 119, i32 121, i32 125, i32 123, i32 127, i32 129, i32 133, i32 131, i32 135, i32 137, i32 141, i32 139, i32 143, i32 145, i32 149, i32 147, i32 151, i32 153, i32 157, i32 155, i32 159, i32 161, i32 165, i32 163, i32 167, i32 169, i32 173, i32 171, i32 175, i32 177, i32 181, i32 179, i32 183, i32 185, i32 189, i32 187, i32 191, i32 193, i32 197, i32 195, i32 199, i32 201, i32 205, i32 203, i32 207, i32 209, i32 213, i32 211, i32 215, i32 217, i32 221, i32 219, i32 223, i32 225, i32 229, i32 227, i32 231, i32 233, i32 237, i32 235, i32 239, i32 241, i32 245, i32 243, i32 247, i32 249, i32 253, i32 251, i32 255> @@ -1013,7 +1013,7 @@ define <256 x i8> @vdeal_7d(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_7e: -; CHECK: [[REG7e:r[0-9]+]] = #126 +; CHECK: [[REG7e:r[0-9]+]] = #-2 ; CHECK: vdeal(v1,v0,[[REG7e]]) define <256 x i8> @vdeal_7e(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 16, i32 17, i32 20, i32 21, i32 24, i32 25, i32 28, i32 29, i32 32, i32 33, i32 36, i32 37, i32 40, i32 41, i32 44, i32 45, i32 48, i32 49, i32 52, i32 53, i32 56, i32 57, i32 60, i32 61, i32 64, i32 65, i32 68, i32 69, i32 72, i32 73, i32 76, i32 77, i32 80, i32 81, i32 84, i32 85, i32 88, i32 89, i32 92, i32 93, i32 96, i32 97, i32 100, i32 101, i32 104, i32 105, i32 108, i32 109, i32 112, i32 113, i32 116, i32 117, i32 120, i32 121, i32 124, i32 125, i32 128, i32 129, i32 132, i32 133, i32 136, i32 137, i32 140, i32 141, i32 144, i32 145, i32 148, i32 149, i32 152, i32 153, i32 156, i32 157, i32 160, i32 161, i32 164, i32 165, i32 168, i32 169, i32 172, i32 173, i32 176, i32 177, i32 180, i32 181, i32 184, i32 185, i32 188, i32 189, i32 192, i32 193, i32 196, i32 197, i32 200, i32 201, i32 204, i32 205, i32 208, i32 209, i32 212, i32 213, i32 216, i32 217, i32 220, i32 221, i32 224, i32 225, i32 228, i32 229, i32 232, i32 233, i32 236, i32 237, i32 240, i32 241, i32 244, i32 245, i32 248, i32 249, i32 252, i32 253, i32 2, i32 3, i32 6, i32 7, i32 10, i32 11, i32 14, i32 15, i32 18, i32 19, i32 22, i32 23, i32 26, i32 27, i32 30, i32 31, i32 34, i32 35, i32 38, i32 39, i32 42, i32 43, i32 46, i32 47, i32 50, i32 51, i32 54, i32 55, i32 58, i32 59, i32 62, i32 63, i32 66, i32 67, i32 70, i32 71, i32 74, i32 75, i32 78, i32 79, i32 82, i32 83, i32 86, i32 87, i32 90, i32 91, i32 94, i32 95, i32 98, i32 99, i32 102, i32 103, i32 106, i32 107, i32 110, i32 111, i32 114, i32 115, i32 118, i32 119, i32 122, i32 123, i32 126, i32 127, i32 130, i32 131, i32 134, i32 135, i32 138, i32 139, i32 142, i32 143, i32 146, i32 147, i32 150, i32 151, i32 154, i32 155, i32 158, i32 159, i32 162, i32 163, i32 166, i32 167, i32 170, i32 171, i32 174, i32 175, i32 178, i32 179, i32 182, i32 183, i32 186, i32 187, i32 190, i32 191, i32 194, i32 195, i32 198, i32 199, i32 202, i32 203, i32 206, i32 207, i32 210, i32 211, i32 214, i32 215, i32 218, i32 219, i32 222, i32 223, i32 226, i32 227, i32 230, i32 231, i32 234, i32 235, i32 238, i32 239, i32 242, i32 243, i32 246, i32 247, i32 250, i32 251, i32 254, i32 255> @@ -1021,7 +1021,7 @@ define <256 x i8> 
@vdeal_7e(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_7f: -; CHECK: [[REG7f:r[0-9]+]] = #127 +; CHECK: [[REG7f:r[0-9]+]] = #-1 ; CHECK: vdeal(v1,v0,[[REG7f]]) define <256 x i8> @vdeal_7f(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126, i32 128, i32 130, i32 132, i32 134, i32 136, i32 138, i32 140, i32 142, i32 144, i32 146, i32 148, i32 150, i32 152, i32 154, i32 156, i32 158, i32 160, i32 162, i32 164, i32 166, i32 168, i32 170, i32 172, i32 174, i32 176, i32 178, i32 180, i32 182, i32 184, i32 186, i32 188, i32 190, i32 192, i32 194, i32 196, i32 198, i32 200, i32 202, i32 204, i32 206, i32 208, i32 210, i32 212, i32 214, i32 216, i32 218, i32 220, i32 222, i32 224, i32 226, i32 228, i32 230, i32 232, i32 234, i32 236, i32 238, i32 240, i32 242, i32 244, i32 246, i32 248, i32 250, i32 252, i32 254, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127, i32 129, i32 131, i32 133, i32 135, i32 137, i32 139, i32 141, i32 143, i32 145, i32 147, i32 149, i32 151, i32 153, i32 155, i32 157, i32 159, i32 161, i32 163, i32 165, i32 167, i32 169, i32 171, i32 173, i32 175, i32 177, i32 179, i32 181, i32 183, i32 185, i32 187, i32 189, i32 191, i32 193, i32 195, i32 197, i32 199, i32 201, i32 203, i32 205, i32 207, i32 209, i32 211, i32 213, i32 215, i32 217, i32 219, i32 221, i32 223, i32 225, i32 227, i32 229, i32 231, i32 233, i32 235, i32 237, i32 239, i32 241, i32 243, i32 245, i32 247, i32 249, i32 251, i32 253, i32 255> diff --git a/llvm/test/CodeGen/Hexagon/autohvx/deal-64b.ll b/llvm/test/CodeGen/Hexagon/autohvx/deal-64b.ll index 525d942d518e8..efd6a327876c4 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/deal-64b.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/deal-64b.ll @@ -261,7 +261,7 @@ define <128 x i8> @vdeal_1f(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_20: -; CHECK: [[REG20:r[0-9]+]] = #32 +; CHECK: [[REG20:r[0-9]+]] = #-32 ; CHECK: vshuff(v1,v0,[[REG20]]) define <128 x i8> @vdeal_20(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, 
i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> @@ -269,7 +269,7 @@ define <128 x i8> @vdeal_20(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_21: -; CHECK: [[REG21:r[0-9]+]] = #33 +; CHECK: [[REG21:r[0-9]+]] = #-31 ; CHECK: vdeal(v1,v0,[[REG21]]) define <128 x i8> @vdeal_21(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 32, i32 2, i32 34, i32 4, i32 36, i32 6, i32 38, i32 8, i32 40, i32 10, i32 42, i32 12, i32 44, i32 14, i32 46, i32 16, i32 48, i32 18, i32 50, i32 20, i32 52, i32 22, i32 54, i32 24, i32 56, i32 26, i32 58, i32 28, i32 60, i32 30, i32 62, i32 64, i32 96, i32 66, i32 98, i32 68, i32 100, i32 70, i32 102, i32 72, i32 104, i32 74, i32 106, i32 76, i32 108, i32 78, i32 110, i32 80, i32 112, i32 82, i32 114, i32 84, i32 116, i32 86, i32 118, i32 88, i32 120, i32 90, i32 122, i32 92, i32 124, i32 94, i32 126, i32 1, i32 33, i32 3, i32 35, i32 5, i32 37, i32 7, i32 39, i32 9, i32 41, i32 11, i32 43, i32 13, i32 45, i32 15, i32 47, i32 17, i32 49, i32 19, i32 51, i32 21, i32 53, i32 23, i32 55, i32 25, i32 57, i32 27, i32 59, i32 29, i32 61, i32 31, i32 63, i32 65, i32 97, i32 67, i32 99, i32 69, i32 101, i32 71, i32 103, i32 73, i32 105, i32 75, i32 107, i32 77, i32 109, i32 79, i32 111, i32 81, i32 113, i32 83, i32 115, i32 85, i32 117, i32 87, i32 119, i32 89, i32 121, i32 91, i32 123, i32 93, i32 125, i32 95, i32 127> @@ -277,7 +277,7 @@ define <128 x i8> @vdeal_21(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_22: -; CHECK: [[REG22:r[0-9]+]] = #34 +; CHECK: [[REG22:r[0-9]+]] = #-30 ; CHECK: vdeal(v1,v0,[[REG22]]) define <128 x i8> @vdeal_22(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 32, i32 33, i32 4, i32 5, i32 36, i32 37, i32 8, i32 9, i32 40, i32 41, i32 12, i32 13, i32 44, i32 45, i32 16, i32 17, i32 48, i32 49, i32 20, i32 21, i32 52, i32 53, i32 24, i32 25, i32 56, i32 57, i32 28, i32 29, i32 60, i32 61, i32 64, i32 65, i32 96, i32 97, i32 68, i32 69, i32 100, i32 101, i32 72, i32 73, i32 104, i32 105, i32 76, i32 77, i32 108, i32 109, i32 80, i32 81, i32 112, i32 113, i32 84, i32 85, i32 116, i32 117, i32 88, i32 89, i32 120, i32 121, i32 92, i32 93, i32 124, i32 125, i32 2, i32 3, i32 34, i32 35, i32 6, i32 7, i32 38, i32 39, i32 10, i32 11, i32 42, i32 43, i32 14, i32 15, i32 46, i32 47, i32 18, i32 19, i32 50, i32 51, i32 22, i32 23, i32 54, i32 55, i32 26, i32 27, i32 58, i32 59, i32 30, i32 31, i32 62, i32 63, i32 66, i32 67, i32 98, i32 99, i32 70, i32 71, i32 102, i32 103, i32 74, i32 75, i32 106, i32 107, i32 78, i32 79, i32 110, i32 111, i32 82, i32 83, i32 114, i32 115, i32 86, i32 87, i32 118, i32 119, i32 90, i32 91, i32 122, i32 123, i32 94, i32 95, i32 126, i32 127> @@ -285,7 +285,7 @@ define <128 x i8> @vdeal_22(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_23: -; CHECK: [[REG23:r[0-9]+]] = #35 +; CHECK: [[REG23:r[0-9]+]] = #-29 ; CHECK: 
vdeal(v1,v0,[[REG23]]) define <128 x i8> @vdeal_23(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 2, i32 32, i32 34, i32 4, i32 6, i32 36, i32 38, i32 8, i32 10, i32 40, i32 42, i32 12, i32 14, i32 44, i32 46, i32 16, i32 18, i32 48, i32 50, i32 20, i32 22, i32 52, i32 54, i32 24, i32 26, i32 56, i32 58, i32 28, i32 30, i32 60, i32 62, i32 64, i32 66, i32 96, i32 98, i32 68, i32 70, i32 100, i32 102, i32 72, i32 74, i32 104, i32 106, i32 76, i32 78, i32 108, i32 110, i32 80, i32 82, i32 112, i32 114, i32 84, i32 86, i32 116, i32 118, i32 88, i32 90, i32 120, i32 122, i32 92, i32 94, i32 124, i32 126, i32 1, i32 3, i32 33, i32 35, i32 5, i32 7, i32 37, i32 39, i32 9, i32 11, i32 41, i32 43, i32 13, i32 15, i32 45, i32 47, i32 17, i32 19, i32 49, i32 51, i32 21, i32 23, i32 53, i32 55, i32 25, i32 27, i32 57, i32 59, i32 29, i32 31, i32 61, i32 63, i32 65, i32 67, i32 97, i32 99, i32 69, i32 71, i32 101, i32 103, i32 73, i32 75, i32 105, i32 107, i32 77, i32 79, i32 109, i32 111, i32 81, i32 83, i32 113, i32 115, i32 85, i32 87, i32 117, i32 119, i32 89, i32 91, i32 121, i32 123, i32 93, i32 95, i32 125, i32 127> @@ -293,7 +293,7 @@ define <128 x i8> @vdeal_23(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_24: -; CHECK: [[REG24:r[0-9]+]] = #36 +; CHECK: [[REG24:r[0-9]+]] = #-28 ; CHECK: vdeal(v1,v0,[[REG24]]) define <128 x i8> @vdeal_24(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 2, i32 3, i32 32, i32 33, i32 34, i32 35, i32 8, i32 9, i32 10, i32 11, i32 40, i32 41, i32 42, i32 43, i32 16, i32 17, i32 18, i32 19, i32 48, i32 49, i32 50, i32 51, i32 24, i32 25, i32 26, i32 27, i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67, i32 96, i32 97, i32 98, i32 99, i32 72, i32 73, i32 74, i32 75, i32 104, i32 105, i32 106, i32 107, i32 80, i32 81, i32 82, i32 83, i32 112, i32 113, i32 114, i32 115, i32 88, i32 89, i32 90, i32 91, i32 120, i32 121, i32 122, i32 123, i32 4, i32 5, i32 6, i32 7, i32 36, i32 37, i32 38, i32 39, i32 12, i32 13, i32 14, i32 15, i32 44, i32 45, i32 46, i32 47, i32 20, i32 21, i32 22, i32 23, i32 52, i32 53, i32 54, i32 55, i32 28, i32 29, i32 30, i32 31, i32 60, i32 61, i32 62, i32 63, i32 68, i32 69, i32 70, i32 71, i32 100, i32 101, i32 102, i32 103, i32 76, i32 77, i32 78, i32 79, i32 108, i32 109, i32 110, i32 111, i32 84, i32 85, i32 86, i32 87, i32 116, i32 117, i32 118, i32 119, i32 92, i32 93, i32 94, i32 95, i32 124, i32 125, i32 126, i32 127> @@ -301,7 +301,7 @@ define <128 x i8> @vdeal_24(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_25: -; CHECK: [[REG25:r[0-9]+]] = #37 +; CHECK: [[REG25:r[0-9]+]] = #-27 ; CHECK: vdeal(v1,v0,[[REG25]]) define <128 x i8> @vdeal_25(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 4, i32 2, i32 6, i32 32, i32 36, i32 34, i32 38, i32 8, i32 12, i32 10, i32 14, i32 40, i32 44, i32 42, i32 46, i32 16, i32 20, i32 18, i32 22, i32 48, i32 52, i32 50, i32 54, i32 24, i32 28, i32 26, i32 30, i32 56, i32 60, i32 58, i32 62, i32 64, i32 68, i32 66, i32 70, i32 96, i32 100, i32 98, i32 102, i32 72, i32 76, i32 74, i32 78, i32 104, i32 108, i32 106, i32 110, i32 80, i32 84, i32 82, i32 86, i32 112, i32 116, i32 114, i32 118, i32 88, i32 92, i32 90, i32 94, i32 120, i32 124, i32 122, i32 126, i32 1, i32 5, i32 3, i32 7, i32 33, i32 37, i32 35, i32 39, i32 9, i32 13, i32 11, i32 15, i32 41, i32 
45, i32 43, i32 47, i32 17, i32 21, i32 19, i32 23, i32 49, i32 53, i32 51, i32 55, i32 25, i32 29, i32 27, i32 31, i32 57, i32 61, i32 59, i32 63, i32 65, i32 69, i32 67, i32 71, i32 97, i32 101, i32 99, i32 103, i32 73, i32 77, i32 75, i32 79, i32 105, i32 109, i32 107, i32 111, i32 81, i32 85, i32 83, i32 87, i32 113, i32 117, i32 115, i32 119, i32 89, i32 93, i32 91, i32 95, i32 121, i32 125, i32 123, i32 127> @@ -309,7 +309,7 @@ define <128 x i8> @vdeal_25(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_26: -; CHECK: [[REG26:r[0-9]+]] = #38 +; CHECK: [[REG26:r[0-9]+]] = #-26 ; CHECK: vdeal(v1,v0,[[REG26]]) define <128 x i8> @vdeal_26(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 4, i32 5, i32 32, i32 33, i32 36, i32 37, i32 8, i32 9, i32 12, i32 13, i32 40, i32 41, i32 44, i32 45, i32 16, i32 17, i32 20, i32 21, i32 48, i32 49, i32 52, i32 53, i32 24, i32 25, i32 28, i32 29, i32 56, i32 57, i32 60, i32 61, i32 64, i32 65, i32 68, i32 69, i32 96, i32 97, i32 100, i32 101, i32 72, i32 73, i32 76, i32 77, i32 104, i32 105, i32 108, i32 109, i32 80, i32 81, i32 84, i32 85, i32 112, i32 113, i32 116, i32 117, i32 88, i32 89, i32 92, i32 93, i32 120, i32 121, i32 124, i32 125, i32 2, i32 3, i32 6, i32 7, i32 34, i32 35, i32 38, i32 39, i32 10, i32 11, i32 14, i32 15, i32 42, i32 43, i32 46, i32 47, i32 18, i32 19, i32 22, i32 23, i32 50, i32 51, i32 54, i32 55, i32 26, i32 27, i32 30, i32 31, i32 58, i32 59, i32 62, i32 63, i32 66, i32 67, i32 70, i32 71, i32 98, i32 99, i32 102, i32 103, i32 74, i32 75, i32 78, i32 79, i32 106, i32 107, i32 110, i32 111, i32 82, i32 83, i32 86, i32 87, i32 114, i32 115, i32 118, i32 119, i32 90, i32 91, i32 94, i32 95, i32 122, i32 123, i32 126, i32 127> @@ -317,7 +317,7 @@ define <128 x i8> @vdeal_26(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_27: -; CHECK: [[REG27:r[0-9]+]] = #39 +; CHECK: [[REG27:r[0-9]+]] = #-25 ; CHECK: vdeal(v1,v0,[[REG27]]) define <128 x i8> @vdeal_27(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 2, i32 4, i32 6, i32 32, i32 34, i32 36, i32 38, i32 8, i32 10, i32 12, i32 14, i32 40, i32 42, i32 44, i32 46, i32 16, i32 18, i32 20, i32 22, i32 48, i32 50, i32 52, i32 54, i32 24, i32 26, i32 28, i32 30, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 96, i32 98, i32 100, i32 102, i32 72, i32 74, i32 76, i32 78, i32 104, i32 106, i32 108, i32 110, i32 80, i32 82, i32 84, i32 86, i32 112, i32 114, i32 116, i32 118, i32 88, i32 90, i32 92, i32 94, i32 120, i32 122, i32 124, i32 126, i32 1, i32 3, i32 5, i32 7, i32 33, i32 35, i32 37, i32 39, i32 9, i32 11, i32 13, i32 15, i32 41, i32 43, i32 45, i32 47, i32 17, i32 19, i32 21, i32 23, i32 49, i32 51, i32 53, i32 55, i32 25, i32 27, i32 29, i32 31, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 97, i32 99, i32 101, i32 103, i32 73, i32 75, i32 77, i32 79, i32 105, i32 107, i32 109, i32 111, i32 81, i32 83, i32 85, i32 87, i32 113, i32 115, i32 117, i32 119, i32 89, i32 91, i32 93, i32 95, i32 121, i32 123, i32 125, i32 127> @@ -325,7 +325,7 @@ define <128 x i8> @vdeal_27(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_28: -; CHECK: [[REG28:r[0-9]+]] = #40 +; CHECK: [[REG28:r[0-9]+]] = #-24 ; CHECK: vdeal(v1,v0,[[REG28]]) define <128 x i8> @vdeal_28(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 
2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> @@ -333,7 +333,7 @@ define <128 x i8> @vdeal_28(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_29: -; CHECK: [[REG29:r[0-9]+]] = #41 +; CHECK: [[REG29:r[0-9]+]] = #-23 ; CHECK: vdeal(v1,v0,[[REG29]]) define <128 x i8> @vdeal_29(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14, i32 32, i32 40, i32 34, i32 42, i32 36, i32 44, i32 38, i32 46, i32 16, i32 24, i32 18, i32 26, i32 20, i32 28, i32 22, i32 30, i32 48, i32 56, i32 50, i32 58, i32 52, i32 60, i32 54, i32 62, i32 64, i32 72, i32 66, i32 74, i32 68, i32 76, i32 70, i32 78, i32 96, i32 104, i32 98, i32 106, i32 100, i32 108, i32 102, i32 110, i32 80, i32 88, i32 82, i32 90, i32 84, i32 92, i32 86, i32 94, i32 112, i32 120, i32 114, i32 122, i32 116, i32 124, i32 118, i32 126, i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, i32 33, i32 41, i32 35, i32 43, i32 37, i32 45, i32 39, i32 47, i32 17, i32 25, i32 19, i32 27, i32 21, i32 29, i32 23, i32 31, i32 49, i32 57, i32 51, i32 59, i32 53, i32 61, i32 55, i32 63, i32 65, i32 73, i32 67, i32 75, i32 69, i32 77, i32 71, i32 79, i32 97, i32 105, i32 99, i32 107, i32 101, i32 109, i32 103, i32 111, i32 81, i32 89, i32 83, i32 91, i32 85, i32 93, i32 87, i32 95, i32 113, i32 121, i32 115, i32 123, i32 117, i32 125, i32 119, i32 127> @@ -341,7 +341,7 @@ define <128 x i8> @vdeal_29(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_2a: -; CHECK: [[REG2a:r[0-9]+]] = #42 +; CHECK: [[REG2a:r[0-9]+]] = #-22 ; CHECK: vdeal(v1,v0,[[REG2a]]) define <128 x i8> @vdeal_2a(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13, i32 32, i32 33, i32 40, i32 41, i32 36, i32 37, i32 44, i32 45, i32 16, i32 17, i32 24, i32 25, i32 20, i32 21, i32 28, i32 29, i32 48, i32 49, i32 56, i32 57, i32 52, i32 53, i32 60, i32 61, i32 64, i32 65, i32 72, i32 73, i32 68, i32 69, i32 76, i32 77, i32 96, i32 97, i32 104, i32 105, i32 100, i32 101, i32 108, i32 109, i32 80, i32 81, i32 88, i32 89, i32 84, i32 85, i32 92, i32 93, i32 112, i32 113, i32 120, i32 121, i32 116, i32 117, i32 124, i32 125, i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15, i32 34, i32 35, i32 42, i32 43, i32 38, i32 39, i32 46, i32 47, i32 18, i32 19, i32 26, i32 27, i32 22, i32 23, i32 30, i32 31, i32 50, i32 51, i32 58, i32 59, i32 54, i32 55, i32 62, i32 63, i32 66, i32 67, i32 74, 
i32 75, i32 70, i32 71, i32 78, i32 79, i32 98, i32 99, i32 106, i32 107, i32 102, i32 103, i32 110, i32 111, i32 82, i32 83, i32 90, i32 91, i32 86, i32 87, i32 94, i32 95, i32 114, i32 115, i32 122, i32 123, i32 118, i32 119, i32 126, i32 127> @@ -349,7 +349,7 @@ define <128 x i8> @vdeal_2a(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_2b: -; CHECK: [[REG2b:r[0-9]+]] = #43 +; CHECK: [[REG2b:r[0-9]+]] = #-21 ; CHECK: vdeal(v1,v0,[[REG2b]]) define <128 x i8> @vdeal_2b(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14, i32 32, i32 34, i32 40, i32 42, i32 36, i32 38, i32 44, i32 46, i32 16, i32 18, i32 24, i32 26, i32 20, i32 22, i32 28, i32 30, i32 48, i32 50, i32 56, i32 58, i32 52, i32 54, i32 60, i32 62, i32 64, i32 66, i32 72, i32 74, i32 68, i32 70, i32 76, i32 78, i32 96, i32 98, i32 104, i32 106, i32 100, i32 102, i32 108, i32 110, i32 80, i32 82, i32 88, i32 90, i32 84, i32 86, i32 92, i32 94, i32 112, i32 114, i32 120, i32 122, i32 116, i32 118, i32 124, i32 126, i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15, i32 33, i32 35, i32 41, i32 43, i32 37, i32 39, i32 45, i32 47, i32 17, i32 19, i32 25, i32 27, i32 21, i32 23, i32 29, i32 31, i32 49, i32 51, i32 57, i32 59, i32 53, i32 55, i32 61, i32 63, i32 65, i32 67, i32 73, i32 75, i32 69, i32 71, i32 77, i32 79, i32 97, i32 99, i32 105, i32 107, i32 101, i32 103, i32 109, i32 111, i32 81, i32 83, i32 89, i32 91, i32 85, i32 87, i32 93, i32 95, i32 113, i32 115, i32 121, i32 123, i32 117, i32 119, i32 125, i32 127> @@ -357,7 +357,7 @@ define <128 x i8> @vdeal_2b(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_2c: -; CHECK: [[REG2c:r[0-9]+]] = #44 +; CHECK: [[REG2c:r[0-9]+]] = #-20 ; CHECK: vdeal(v1,v0,[[REG2c]]) define <128 x i8> @vdeal_2c(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 32, i32 33, i32 34, i32 35, i32 40, i32 41, i32 42, i32 43, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27, i32 48, i32 49, i32 50, i32 51, i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67, i32 72, i32 73, i32 74, i32 75, i32 96, i32 97, i32 98, i32 99, i32 104, i32 105, i32 106, i32 107, i32 80, i32 81, i32 82, i32 83, i32 88, i32 89, i32 90, i32 91, i32 112, i32 113, i32 114, i32 115, i32 120, i32 121, i32 122, i32 123, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 36, i32 37, i32 38, i32 39, i32 44, i32 45, i32 46, i32 47, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31, i32 52, i32 53, i32 54, i32 55, i32 60, i32 61, i32 62, i32 63, i32 68, i32 69, i32 70, i32 71, i32 76, i32 77, i32 78, i32 79, i32 100, i32 101, i32 102, i32 103, i32 108, i32 109, i32 110, i32 111, i32 84, i32 85, i32 86, i32 87, i32 92, i32 93, i32 94, i32 95, i32 116, i32 117, i32 118, i32 119, i32 124, i32 125, i32 126, i32 127> @@ -365,7 +365,7 @@ define <128 x i8> @vdeal_2c(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_2d: -; CHECK: [[REG2d:r[0-9]+]] = #45 +; CHECK: [[REG2d:r[0-9]+]] = #-19 ; CHECK: vdeal(v1,v0,[[REG2d]]) define <128 x i8> @vdeal_2d(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 4, i32 2, i32 6, i32 8, i32 12, i32 10, i32 14, i32 32, i32 36, i32 34, i32 38, i32 40, i32 44, i32 42, i32 46, i32 16, i32 20, i32 18, i32 22, i32 24, i32 28, i32 26, i32 30, 
i32 48, i32 52, i32 50, i32 54, i32 56, i32 60, i32 58, i32 62, i32 64, i32 68, i32 66, i32 70, i32 72, i32 76, i32 74, i32 78, i32 96, i32 100, i32 98, i32 102, i32 104, i32 108, i32 106, i32 110, i32 80, i32 84, i32 82, i32 86, i32 88, i32 92, i32 90, i32 94, i32 112, i32 116, i32 114, i32 118, i32 120, i32 124, i32 122, i32 126, i32 1, i32 5, i32 3, i32 7, i32 9, i32 13, i32 11, i32 15, i32 33, i32 37, i32 35, i32 39, i32 41, i32 45, i32 43, i32 47, i32 17, i32 21, i32 19, i32 23, i32 25, i32 29, i32 27, i32 31, i32 49, i32 53, i32 51, i32 55, i32 57, i32 61, i32 59, i32 63, i32 65, i32 69, i32 67, i32 71, i32 73, i32 77, i32 75, i32 79, i32 97, i32 101, i32 99, i32 103, i32 105, i32 109, i32 107, i32 111, i32 81, i32 85, i32 83, i32 87, i32 89, i32 93, i32 91, i32 95, i32 113, i32 117, i32 115, i32 119, i32 121, i32 125, i32 123, i32 127> @@ -373,7 +373,7 @@ define <128 x i8> @vdeal_2d(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_2e: -; CHECK: [[REG2e:r[0-9]+]] = #46 +; CHECK: [[REG2e:r[0-9]+]] = #-18 ; CHECK: vdeal(v1,v0,[[REG2e]]) define <128 x i8> @vdeal_2e(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 32, i32 33, i32 36, i32 37, i32 40, i32 41, i32 44, i32 45, i32 16, i32 17, i32 20, i32 21, i32 24, i32 25, i32 28, i32 29, i32 48, i32 49, i32 52, i32 53, i32 56, i32 57, i32 60, i32 61, i32 64, i32 65, i32 68, i32 69, i32 72, i32 73, i32 76, i32 77, i32 96, i32 97, i32 100, i32 101, i32 104, i32 105, i32 108, i32 109, i32 80, i32 81, i32 84, i32 85, i32 88, i32 89, i32 92, i32 93, i32 112, i32 113, i32 116, i32 117, i32 120, i32 121, i32 124, i32 125, i32 2, i32 3, i32 6, i32 7, i32 10, i32 11, i32 14, i32 15, i32 34, i32 35, i32 38, i32 39, i32 42, i32 43, i32 46, i32 47, i32 18, i32 19, i32 22, i32 23, i32 26, i32 27, i32 30, i32 31, i32 50, i32 51, i32 54, i32 55, i32 58, i32 59, i32 62, i32 63, i32 66, i32 67, i32 70, i32 71, i32 74, i32 75, i32 78, i32 79, i32 98, i32 99, i32 102, i32 103, i32 106, i32 107, i32 110, i32 111, i32 82, i32 83, i32 86, i32 87, i32 90, i32 91, i32 94, i32 95, i32 114, i32 115, i32 118, i32 119, i32 122, i32 123, i32 126, i32 127> @@ -381,7 +381,7 @@ define <128 x i8> @vdeal_2e(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_2f: -; CHECK: [[REG2f:r[0-9]+]] = #47 +; CHECK: [[REG2f:r[0-9]+]] = #-17 ; CHECK: vdeal(v1,v0,[[REG2f]]) define <128 x i8> @vdeal_2f(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 
95, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127> @@ -389,7 +389,7 @@ define <128 x i8> @vdeal_2f(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_30: -; CHECK: [[REG30:r[0-9]+]] = #48 +; CHECK: [[REG30:r[0-9]+]] = #-16 ; CHECK: vdeal(v1,v0,[[REG30]]) define <128 x i8> @vdeal_30(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> @@ -397,7 +397,7 @@ define <128 x i8> @vdeal_30(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_31: -; CHECK: [[REG31:r[0-9]+]] = #49 +; CHECK: [[REG31:r[0-9]+]] = #-15 ; CHECK: vdeal(v1,v0,[[REG31]]) define <128 x i8> @vdeal_31(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30, i32 32, i32 48, i32 34, i32 50, i32 36, i32 52, i32 38, i32 54, i32 40, i32 56, i32 42, i32 58, i32 44, i32 60, i32 46, i32 62, i32 64, i32 80, i32 66, i32 82, i32 68, i32 84, i32 70, i32 86, i32 72, i32 88, i32 74, i32 90, i32 76, i32 92, i32 78, i32 94, i32 96, i32 112, i32 98, i32 114, i32 100, i32 116, i32 102, i32 118, i32 104, i32 120, i32 106, i32 122, i32 108, i32 124, i32 110, i32 126, i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31, i32 33, i32 49, i32 35, i32 51, i32 37, i32 53, i32 39, i32 55, i32 41, i32 57, i32 43, i32 59, i32 45, i32 61, i32 47, i32 63, i32 65, i32 81, i32 67, i32 83, i32 69, i32 85, i32 71, i32 87, i32 73, i32 89, i32 75, i32 91, i32 77, i32 93, i32 79, i32 95, i32 97, i32 113, i32 99, i32 115, i32 101, i32 117, i32 103, i32 119, i32 105, i32 121, i32 107, i32 123, i32 109, i32 125, i32 111, i32 127> @@ -405,7 +405,7 @@ define <128 x i8> @vdeal_31(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_32: -; CHECK: [[REG32:r[0-9]+]] = #50 +; CHECK: [[REG32:r[0-9]+]] = #-14 ; CHECK: vdeal(v1,v0,[[REG32]]) define <128 x i8> @vdeal_32(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 16, i32 17, i32 4, i32 5, i32 20, i32 21, i32 8, i32 9, i32 24, i32 25, i32 12, i32 13, i32 28, i32 29, i32 32, i32 33, i32 48, i32 49, i32 36, i32 37, i32 52, i32 53, i32 40, i32 41, i32 56, i32 57, i32 44, i32 45, i32 60, i32 61, i32 64, i32 65, i32 80, i32 81, i32 68, i32 69, i32 84, i32 85, i32 72, i32 73, i32 88, i32 89, i32 76, i32 
77, i32 92, i32 93, i32 96, i32 97, i32 112, i32 113, i32 100, i32 101, i32 116, i32 117, i32 104, i32 105, i32 120, i32 121, i32 108, i32 109, i32 124, i32 125, i32 2, i32 3, i32 18, i32 19, i32 6, i32 7, i32 22, i32 23, i32 10, i32 11, i32 26, i32 27, i32 14, i32 15, i32 30, i32 31, i32 34, i32 35, i32 50, i32 51, i32 38, i32 39, i32 54, i32 55, i32 42, i32 43, i32 58, i32 59, i32 46, i32 47, i32 62, i32 63, i32 66, i32 67, i32 82, i32 83, i32 70, i32 71, i32 86, i32 87, i32 74, i32 75, i32 90, i32 91, i32 78, i32 79, i32 94, i32 95, i32 98, i32 99, i32 114, i32 115, i32 102, i32 103, i32 118, i32 119, i32 106, i32 107, i32 122, i32 123, i32 110, i32 111, i32 126, i32 127> @@ -413,7 +413,7 @@ define <128 x i8> @vdeal_32(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_33: -; CHECK: [[REG33:r[0-9]+]] = #51 +; CHECK: [[REG33:r[0-9]+]] = #-13 ; CHECK: vdeal(v1,v0,[[REG33]]) define <128 x i8> @vdeal_33(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 2, i32 16, i32 18, i32 4, i32 6, i32 20, i32 22, i32 8, i32 10, i32 24, i32 26, i32 12, i32 14, i32 28, i32 30, i32 32, i32 34, i32 48, i32 50, i32 36, i32 38, i32 52, i32 54, i32 40, i32 42, i32 56, i32 58, i32 44, i32 46, i32 60, i32 62, i32 64, i32 66, i32 80, i32 82, i32 68, i32 70, i32 84, i32 86, i32 72, i32 74, i32 88, i32 90, i32 76, i32 78, i32 92, i32 94, i32 96, i32 98, i32 112, i32 114, i32 100, i32 102, i32 116, i32 118, i32 104, i32 106, i32 120, i32 122, i32 108, i32 110, i32 124, i32 126, i32 1, i32 3, i32 17, i32 19, i32 5, i32 7, i32 21, i32 23, i32 9, i32 11, i32 25, i32 27, i32 13, i32 15, i32 29, i32 31, i32 33, i32 35, i32 49, i32 51, i32 37, i32 39, i32 53, i32 55, i32 41, i32 43, i32 57, i32 59, i32 45, i32 47, i32 61, i32 63, i32 65, i32 67, i32 81, i32 83, i32 69, i32 71, i32 85, i32 87, i32 73, i32 75, i32 89, i32 91, i32 77, i32 79, i32 93, i32 95, i32 97, i32 99, i32 113, i32 115, i32 101, i32 103, i32 117, i32 119, i32 105, i32 107, i32 121, i32 123, i32 109, i32 111, i32 125, i32 127> @@ -421,7 +421,7 @@ define <128 x i8> @vdeal_33(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_34: -; CHECK: [[REG34:r[0-9]+]] = #52 +; CHECK: [[REG34:r[0-9]+]] = #-12 ; CHECK: vdeal(v1,v0,[[REG34]]) define <128 x i8> @vdeal_34(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 32, i32 33, i32 34, i32 35, i32 48, i32 49, i32 50, i32 51, i32 40, i32 41, i32 42, i32 43, i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67, i32 80, i32 81, i32 82, i32 83, i32 72, i32 73, i32 74, i32 75, i32 88, i32 89, i32 90, i32 91, i32 96, i32 97, i32 98, i32 99, i32 112, i32 113, i32 114, i32 115, i32 104, i32 105, i32 106, i32 107, i32 120, i32 121, i32 122, i32 123, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31, i32 36, i32 37, i32 38, i32 39, i32 52, i32 53, i32 54, i32 55, i32 44, i32 45, i32 46, i32 47, i32 60, i32 61, i32 62, i32 63, i32 68, i32 69, i32 70, i32 71, i32 84, i32 85, i32 86, i32 87, i32 76, i32 77, i32 78, i32 79, i32 92, i32 93, i32 94, i32 95, i32 100, i32 101, i32 102, i32 103, i32 116, i32 117, i32 118, i32 119, i32 108, i32 109, i32 110, i32 111, i32 124, i32 125, i32 126, i32 127> @@ -429,7 +429,7 @@ define <128 x i8> @vdeal_34(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; 
CHECK-LABEL: vdeal_35: -; CHECK: [[REG35:r[0-9]+]] = #53 +; CHECK: [[REG35:r[0-9]+]] = #-11 ; CHECK: vdeal(v1,v0,[[REG35]]) define <128 x i8> @vdeal_35(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 4, i32 2, i32 6, i32 16, i32 20, i32 18, i32 22, i32 8, i32 12, i32 10, i32 14, i32 24, i32 28, i32 26, i32 30, i32 32, i32 36, i32 34, i32 38, i32 48, i32 52, i32 50, i32 54, i32 40, i32 44, i32 42, i32 46, i32 56, i32 60, i32 58, i32 62, i32 64, i32 68, i32 66, i32 70, i32 80, i32 84, i32 82, i32 86, i32 72, i32 76, i32 74, i32 78, i32 88, i32 92, i32 90, i32 94, i32 96, i32 100, i32 98, i32 102, i32 112, i32 116, i32 114, i32 118, i32 104, i32 108, i32 106, i32 110, i32 120, i32 124, i32 122, i32 126, i32 1, i32 5, i32 3, i32 7, i32 17, i32 21, i32 19, i32 23, i32 9, i32 13, i32 11, i32 15, i32 25, i32 29, i32 27, i32 31, i32 33, i32 37, i32 35, i32 39, i32 49, i32 53, i32 51, i32 55, i32 41, i32 45, i32 43, i32 47, i32 57, i32 61, i32 59, i32 63, i32 65, i32 69, i32 67, i32 71, i32 81, i32 85, i32 83, i32 87, i32 73, i32 77, i32 75, i32 79, i32 89, i32 93, i32 91, i32 95, i32 97, i32 101, i32 99, i32 103, i32 113, i32 117, i32 115, i32 119, i32 105, i32 109, i32 107, i32 111, i32 121, i32 125, i32 123, i32 127> @@ -437,7 +437,7 @@ define <128 x i8> @vdeal_35(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_36: -; CHECK: [[REG36:r[0-9]+]] = #54 +; CHECK: [[REG36:r[0-9]+]] = #-10 ; CHECK: vdeal(v1,v0,[[REG36]]) define <128 x i8> @vdeal_36(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 4, i32 5, i32 16, i32 17, i32 20, i32 21, i32 8, i32 9, i32 12, i32 13, i32 24, i32 25, i32 28, i32 29, i32 32, i32 33, i32 36, i32 37, i32 48, i32 49, i32 52, i32 53, i32 40, i32 41, i32 44, i32 45, i32 56, i32 57, i32 60, i32 61, i32 64, i32 65, i32 68, i32 69, i32 80, i32 81, i32 84, i32 85, i32 72, i32 73, i32 76, i32 77, i32 88, i32 89, i32 92, i32 93, i32 96, i32 97, i32 100, i32 101, i32 112, i32 113, i32 116, i32 117, i32 104, i32 105, i32 108, i32 109, i32 120, i32 121, i32 124, i32 125, i32 2, i32 3, i32 6, i32 7, i32 18, i32 19, i32 22, i32 23, i32 10, i32 11, i32 14, i32 15, i32 26, i32 27, i32 30, i32 31, i32 34, i32 35, i32 38, i32 39, i32 50, i32 51, i32 54, i32 55, i32 42, i32 43, i32 46, i32 47, i32 58, i32 59, i32 62, i32 63, i32 66, i32 67, i32 70, i32 71, i32 82, i32 83, i32 86, i32 87, i32 74, i32 75, i32 78, i32 79, i32 90, i32 91, i32 94, i32 95, i32 98, i32 99, i32 102, i32 103, i32 114, i32 115, i32 118, i32 119, i32 106, i32 107, i32 110, i32 111, i32 122, i32 123, i32 126, i32 127> @@ -445,7 +445,7 @@ define <128 x i8> @vdeal_36(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_37: -; CHECK: [[REG37:r[0-9]+]] = #55 +; CHECK: [[REG37:r[0-9]+]] = #-9 ; CHECK: vdeal(v1,v0,[[REG37]]) define <128 x i8> @vdeal_37(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 10, i32 12, i32 14, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 48, i32 50, i32 52, i32 54, i32 40, i32 42, i32 44, i32 46, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 80, i32 82, i32 84, i32 86, i32 72, i32 74, i32 76, i32 78, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 112, i32 114, i32 116, i32 118, i32 104, i32 106, i32 108, i32 110, i32 120, i32 122, i32 124, i32 126, i32 
1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 9, i32 11, i32 13, i32 15, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 49, i32 51, i32 53, i32 55, i32 41, i32 43, i32 45, i32 47, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 81, i32 83, i32 85, i32 87, i32 73, i32 75, i32 77, i32 79, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 113, i32 115, i32 117, i32 119, i32 105, i32 107, i32 109, i32 111, i32 121, i32 123, i32 125, i32 127> @@ -453,7 +453,7 @@ define <128 x i8> @vdeal_37(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_38: -; CHECK: [[REG38:r[0-9]+]] = #56 +; CHECK: [[REG38:r[0-9]+]] = #-8 ; CHECK: vdeal(v1,v0,[[REG38]]) define <128 x i8> @vdeal_38(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> @@ -461,7 +461,7 @@ define <128 x i8> @vdeal_38(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_39: -; CHECK: [[REG39:r[0-9]+]] = #57 +; CHECK: [[REG39:r[0-9]+]] = #-7 ; CHECK: vdeal(v1,v0,[[REG39]]) define <128 x i8> @vdeal_39(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14, i32 16, i32 24, i32 18, i32 26, i32 20, i32 28, i32 22, i32 30, i32 32, i32 40, i32 34, i32 42, i32 36, i32 44, i32 38, i32 46, i32 48, i32 56, i32 50, i32 58, i32 52, i32 60, i32 54, i32 62, i32 64, i32 72, i32 66, i32 74, i32 68, i32 76, i32 70, i32 78, i32 80, i32 88, i32 82, i32 90, i32 84, i32 92, i32 86, i32 94, i32 96, i32 104, i32 98, i32 106, i32 100, i32 108, i32 102, i32 110, i32 112, i32 120, i32 114, i32 122, i32 116, i32 124, i32 118, i32 126, i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, i32 17, i32 25, i32 19, i32 27, i32 21, i32 29, i32 23, i32 31, i32 33, i32 41, i32 35, i32 43, i32 37, i32 45, i32 39, i32 47, i32 49, i32 57, i32 51, i32 59, i32 53, i32 61, i32 55, i32 63, i32 65, i32 73, i32 67, i32 75, i32 69, i32 77, i32 71, i32 79, i32 81, i32 89, i32 83, i32 91, i32 85, i32 93, i32 87, i32 95, i32 97, i32 105, i32 99, i32 107, i32 101, i32 109, i32 103, i32 111, i32 113, i32 121, i32 115, i32 123, i32 117, i32 125, i32 119, i32 127> @@ -469,7 +469,7 @@ define <128 x i8> @vdeal_39(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_3a: -; CHECK: [[REG3a:r[0-9]+]] = #58 +; CHECK: [[REG3a:r[0-9]+]] = #-6 ; CHECK: vdeal(v1,v0,[[REG3a]]) define <128 x i8> @vdeal_3a(<128 x i8> %v0, <128 x 
i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13, i32 16, i32 17, i32 24, i32 25, i32 20, i32 21, i32 28, i32 29, i32 32, i32 33, i32 40, i32 41, i32 36, i32 37, i32 44, i32 45, i32 48, i32 49, i32 56, i32 57, i32 52, i32 53, i32 60, i32 61, i32 64, i32 65, i32 72, i32 73, i32 68, i32 69, i32 76, i32 77, i32 80, i32 81, i32 88, i32 89, i32 84, i32 85, i32 92, i32 93, i32 96, i32 97, i32 104, i32 105, i32 100, i32 101, i32 108, i32 109, i32 112, i32 113, i32 120, i32 121, i32 116, i32 117, i32 124, i32 125, i32 2, i32 3, i32 10, i32 11, i32 6, i32 7, i32 14, i32 15, i32 18, i32 19, i32 26, i32 27, i32 22, i32 23, i32 30, i32 31, i32 34, i32 35, i32 42, i32 43, i32 38, i32 39, i32 46, i32 47, i32 50, i32 51, i32 58, i32 59, i32 54, i32 55, i32 62, i32 63, i32 66, i32 67, i32 74, i32 75, i32 70, i32 71, i32 78, i32 79, i32 82, i32 83, i32 90, i32 91, i32 86, i32 87, i32 94, i32 95, i32 98, i32 99, i32 106, i32 107, i32 102, i32 103, i32 110, i32 111, i32 114, i32 115, i32 122, i32 123, i32 118, i32 119, i32 126, i32 127> @@ -477,7 +477,7 @@ define <128 x i8> @vdeal_3a(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_3b: -; CHECK: [[REG3b:r[0-9]+]] = #59 +; CHECK: [[REG3b:r[0-9]+]] = #-5 ; CHECK: vdeal(v1,v0,[[REG3b]]) define <128 x i8> @vdeal_3b(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14, i32 16, i32 18, i32 24, i32 26, i32 20, i32 22, i32 28, i32 30, i32 32, i32 34, i32 40, i32 42, i32 36, i32 38, i32 44, i32 46, i32 48, i32 50, i32 56, i32 58, i32 52, i32 54, i32 60, i32 62, i32 64, i32 66, i32 72, i32 74, i32 68, i32 70, i32 76, i32 78, i32 80, i32 82, i32 88, i32 90, i32 84, i32 86, i32 92, i32 94, i32 96, i32 98, i32 104, i32 106, i32 100, i32 102, i32 108, i32 110, i32 112, i32 114, i32 120, i32 122, i32 116, i32 118, i32 124, i32 126, i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15, i32 17, i32 19, i32 25, i32 27, i32 21, i32 23, i32 29, i32 31, i32 33, i32 35, i32 41, i32 43, i32 37, i32 39, i32 45, i32 47, i32 49, i32 51, i32 57, i32 59, i32 53, i32 55, i32 61, i32 63, i32 65, i32 67, i32 73, i32 75, i32 69, i32 71, i32 77, i32 79, i32 81, i32 83, i32 89, i32 91, i32 85, i32 87, i32 93, i32 95, i32 97, i32 99, i32 105, i32 107, i32 101, i32 103, i32 109, i32 111, i32 113, i32 115, i32 121, i32 123, i32 117, i32 119, i32 125, i32 127> @@ -485,7 +485,7 @@ define <128 x i8> @vdeal_3b(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_3c: -; CHECK: [[REG3c:r[0-9]+]] = #60 +; CHECK: [[REG3c:r[0-9]+]] = #-4 ; CHECK: vdeal(v1,v0,[[REG3c]]) define <128 x i8> @vdeal_3c(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27, i32 32, i32 33, i32 34, i32 35, i32 40, i32 41, i32 42, i32 43, i32 48, i32 49, i32 50, i32 51, i32 56, i32 57, i32 58, i32 59, i32 64, i32 65, i32 66, i32 67, i32 72, i32 73, i32 74, i32 75, i32 80, i32 81, i32 82, i32 83, i32 88, i32 89, i32 90, i32 91, i32 96, i32 97, i32 98, i32 99, i32 104, i32 105, i32 106, i32 107, i32 112, i32 113, i32 114, i32 115, i32 120, i32 121, i32 122, i32 123, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31, i32 36, i32 37, i32 38, i32 39, i32 44, i32 45, i32 46, i32 
47, i32 52, i32 53, i32 54, i32 55, i32 60, i32 61, i32 62, i32 63, i32 68, i32 69, i32 70, i32 71, i32 76, i32 77, i32 78, i32 79, i32 84, i32 85, i32 86, i32 87, i32 92, i32 93, i32 94, i32 95, i32 100, i32 101, i32 102, i32 103, i32 108, i32 109, i32 110, i32 111, i32 116, i32 117, i32 118, i32 119, i32 124, i32 125, i32 126, i32 127> @@ -493,7 +493,7 @@ define <128 x i8> @vdeal_3c(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_3d: -; CHECK: [[REG3d:r[0-9]+]] = #61 +; CHECK: [[REG3d:r[0-9]+]] = #-3 ; CHECK: vdeal(v1,v0,[[REG3d]]) define <128 x i8> @vdeal_3d(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 4, i32 2, i32 6, i32 8, i32 12, i32 10, i32 14, i32 16, i32 20, i32 18, i32 22, i32 24, i32 28, i32 26, i32 30, i32 32, i32 36, i32 34, i32 38, i32 40, i32 44, i32 42, i32 46, i32 48, i32 52, i32 50, i32 54, i32 56, i32 60, i32 58, i32 62, i32 64, i32 68, i32 66, i32 70, i32 72, i32 76, i32 74, i32 78, i32 80, i32 84, i32 82, i32 86, i32 88, i32 92, i32 90, i32 94, i32 96, i32 100, i32 98, i32 102, i32 104, i32 108, i32 106, i32 110, i32 112, i32 116, i32 114, i32 118, i32 120, i32 124, i32 122, i32 126, i32 1, i32 5, i32 3, i32 7, i32 9, i32 13, i32 11, i32 15, i32 17, i32 21, i32 19, i32 23, i32 25, i32 29, i32 27, i32 31, i32 33, i32 37, i32 35, i32 39, i32 41, i32 45, i32 43, i32 47, i32 49, i32 53, i32 51, i32 55, i32 57, i32 61, i32 59, i32 63, i32 65, i32 69, i32 67, i32 71, i32 73, i32 77, i32 75, i32 79, i32 81, i32 85, i32 83, i32 87, i32 89, i32 93, i32 91, i32 95, i32 97, i32 101, i32 99, i32 103, i32 105, i32 109, i32 107, i32 111, i32 113, i32 117, i32 115, i32 119, i32 121, i32 125, i32 123, i32 127> @@ -501,7 +501,7 @@ define <128 x i8> @vdeal_3d(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_3e: -; CHECK: [[REG3e:r[0-9]+]] = #62 +; CHECK: [[REG3e:r[0-9]+]] = #-2 ; CHECK: vdeal(v1,v0,[[REG3e]]) define <128 x i8> @vdeal_3e(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13, i32 16, i32 17, i32 20, i32 21, i32 24, i32 25, i32 28, i32 29, i32 32, i32 33, i32 36, i32 37, i32 40, i32 41, i32 44, i32 45, i32 48, i32 49, i32 52, i32 53, i32 56, i32 57, i32 60, i32 61, i32 64, i32 65, i32 68, i32 69, i32 72, i32 73, i32 76, i32 77, i32 80, i32 81, i32 84, i32 85, i32 88, i32 89, i32 92, i32 93, i32 96, i32 97, i32 100, i32 101, i32 104, i32 105, i32 108, i32 109, i32 112, i32 113, i32 116, i32 117, i32 120, i32 121, i32 124, i32 125, i32 2, i32 3, i32 6, i32 7, i32 10, i32 11, i32 14, i32 15, i32 18, i32 19, i32 22, i32 23, i32 26, i32 27, i32 30, i32 31, i32 34, i32 35, i32 38, i32 39, i32 42, i32 43, i32 46, i32 47, i32 50, i32 51, i32 54, i32 55, i32 58, i32 59, i32 62, i32 63, i32 66, i32 67, i32 70, i32 71, i32 74, i32 75, i32 78, i32 79, i32 82, i32 83, i32 86, i32 87, i32 90, i32 91, i32 94, i32 95, i32 98, i32 99, i32 102, i32 103, i32 106, i32 107, i32 110, i32 111, i32 114, i32 115, i32 118, i32 119, i32 122, i32 123, i32 126, i32 127> @@ -509,7 +509,7 @@ define <128 x i8> @vdeal_3e(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vdeal_3f: -; CHECK: [[REG3f:r[0-9]+]] = #63 +; CHECK: [[REG3f:r[0-9]+]] = #-1 ; CHECK: vdeal(v1,v0,[[REG3f]]) define <128 x i8> @vdeal_3f(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, 
i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127> diff --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-shuff-single.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-shuff-single.ll index 97dc25931dba5..e4eb7bed6fabf 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/isel-shuff-single.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-shuff-single.ll @@ -6,9 +6,9 @@ ; was missing). ; CHECK-LABEL: f0: -; CHECK-DAG: r[[R0:[0-9]+]] = #66 +; CHECK-DAG: r[[R0:[0-9]+]] = #-62 ; CHECK-DAG: r[[R1:[0-9]+]] = #40 -; CHECK-DAG: r[[R2:[0-9]+]] = #85 +; CHECK-DAG: r[[R2:[0-9]+]] = #-43 ; CHECK: v1:0 = vdeal(v{{[0-9]+}},v0,r[[R0]]) ; CHECK: v1:0 = vshuff(v1,v0,r[[R1]]) ; CHECK: v1:0 = vshuff(v1,v0,r[[R2]]) diff --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-vpackew.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-vpackew.ll index 716e0367efe6f..67d9e19b8975e 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/isel-vpackew.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-vpackew.ll @@ -5,7 +5,7 @@ define void @f0(ptr %a0, ptr %a1, ptr %a2) #0 { ; CHECK-LABEL: f0: ; CHECK: // %bb.0: // %b0 ; CHECK-NEXT: { -; CHECK-NEXT: r7 = #124 +; CHECK-NEXT: r7 = #-4 ; CHECK-NEXT: v0 = vmem(r0+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { diff --git a/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll b/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll index ef0173880f024..b5ad0ab703146 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/mulh.ll @@ -10,7 +10,7 @@ define <64 x i16> @mulhs16(<64 x i16> %a0, <64 x i16> %a1) #0 { ; V60-NEXT: v1:0.w = vmpy(v1.h,v0.h) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: r7 = #124 +; V60-NEXT: r7 = #-4 ; V60-NEXT: } ; V60-NEXT: { ; V60-NEXT: v1:0 = vshuff(v1,v0,r7) @@ -28,7 +28,7 @@ define <64 x i16> @mulhs16(<64 x i16> %a0, <64 x i16> %a1) #0 { ; V65-NEXT: v1:0.w = vmpy(v1.h,v0.h) ; V65-NEXT: } ; V65-NEXT: { -; V65-NEXT: r7 = #124 +; V65-NEXT: r7 = #-4 ; V65-NEXT: } ; V65-NEXT: { ; V65-NEXT: v1:0 = vshuff(v1,v0,r7) @@ -46,7 +46,7 @@ define <64 x i16> @mulhs16(<64 x i16> %a0, <64 x i16> %a1) #0 { ; V69-NEXT: v1:0.w = vmpy(v1.h,v0.h) ; V69-NEXT: } ; V69-NEXT: { -; V69-NEXT: r7 = #124 +; V69-NEXT: r7 = #-4 ; V69-NEXT: } ; V69-NEXT: { ; V69-NEXT: v1:0 = vshuff(v1,v0,r7) @@ -72,7 +72,7 @@ define <64 x i16> @mulhu16(<64 x i16> %a0, <64 x i16> %a1) #0 { ; V60-NEXT: v1:0.uw = vmpy(v1.uh,v0.uh) ; V60-NEXT: } ; V60-NEXT: { -; V60-NEXT: r7 = #124 +; V60-NEXT: r7 = #-4 ; V60-NEXT: } ; V60-NEXT: { ; V60-NEXT: v1:0 = vshuff(v1,v0,r7) @@ -90,7 +90,7 @@ define <64 x i16> @mulhu16(<64 x i16> %a0, <64 x i16> %a1) #0 { ; V65-NEXT: v1:0.uw = vmpy(v1.uh,v0.uh) ; V65-NEXT: } ; V65-NEXT: { -; V65-NEXT: r7 = #124 +; V65-NEXT: r7 = #-4 ; 
V65-NEXT: } ; V65-NEXT: { ; V65-NEXT: v1:0 = vshuff(v1,v0,r7) diff --git a/llvm/test/CodeGen/Hexagon/autohvx/qmul.ll b/llvm/test/CodeGen/Hexagon/autohvx/qmul.ll index 872c93fa7cb23..0021a626b5fcd 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/qmul.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/qmul.ll @@ -75,7 +75,7 @@ define void @f2(ptr %a0, ptr %a1, ptr %a2) #0 { ; CHECK-NEXT: v0 = vmem(r1+#0) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r7 = #124 +; CHECK-NEXT: r7 = #-4 ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: r3 = #15 diff --git a/llvm/test/CodeGen/Hexagon/autohvx/shuff-128b.ll b/llvm/test/CodeGen/Hexagon/autohvx/shuff-128b.ll index 7b815496bcb56..607118d76f043 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/shuff-128b.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/shuff-128b.ll @@ -515,7 +515,7 @@ define <256 x i8> @vshuff_3f(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_40: -; CHECK: [[REG40:r[0-9]+]] = #64 +; CHECK: [[REG40:r[0-9]+]] = #-64 ; CHECK: vshuff(v1,v0,[[REG40]]) define <256 x i8> @vshuff_40(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -523,7 +523,7 @@ define <256 x i8> @vshuff_40(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_41: -; CHECK: [[REG41:r[0-9]+]] = #65 +; CHECK: [[REG41:r[0-9]+]] = #-63 ; CHECK: vshuff(v1,v0,[[REG41]]) 
define <256 x i8> @vshuff_41(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 4, i32 132, i32 6, i32 134, i32 8, i32 136, i32 10, i32 138, i32 12, i32 140, i32 14, i32 142, i32 16, i32 144, i32 18, i32 146, i32 20, i32 148, i32 22, i32 150, i32 24, i32 152, i32 26, i32 154, i32 28, i32 156, i32 30, i32 158, i32 32, i32 160, i32 34, i32 162, i32 36, i32 164, i32 38, i32 166, i32 40, i32 168, i32 42, i32 170, i32 44, i32 172, i32 46, i32 174, i32 48, i32 176, i32 50, i32 178, i32 52, i32 180, i32 54, i32 182, i32 56, i32 184, i32 58, i32 186, i32 60, i32 188, i32 62, i32 190, i32 1, i32 129, i32 3, i32 131, i32 5, i32 133, i32 7, i32 135, i32 9, i32 137, i32 11, i32 139, i32 13, i32 141, i32 15, i32 143, i32 17, i32 145, i32 19, i32 147, i32 21, i32 149, i32 23, i32 151, i32 25, i32 153, i32 27, i32 155, i32 29, i32 157, i32 31, i32 159, i32 33, i32 161, i32 35, i32 163, i32 37, i32 165, i32 39, i32 167, i32 41, i32 169, i32 43, i32 171, i32 45, i32 173, i32 47, i32 175, i32 49, i32 177, i32 51, i32 179, i32 53, i32 181, i32 55, i32 183, i32 57, i32 185, i32 59, i32 187, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 68, i32 196, i32 70, i32 198, i32 72, i32 200, i32 74, i32 202, i32 76, i32 204, i32 78, i32 206, i32 80, i32 208, i32 82, i32 210, i32 84, i32 212, i32 86, i32 214, i32 88, i32 216, i32 90, i32 218, i32 92, i32 220, i32 94, i32 222, i32 96, i32 224, i32 98, i32 226, i32 100, i32 228, i32 102, i32 230, i32 104, i32 232, i32 106, i32 234, i32 108, i32 236, i32 110, i32 238, i32 112, i32 240, i32 114, i32 242, i32 116, i32 244, i32 118, i32 246, i32 120, i32 248, i32 122, i32 250, i32 124, i32 252, i32 126, i32 254, i32 65, i32 193, i32 67, i32 195, i32 69, i32 197, i32 71, i32 199, i32 73, i32 201, i32 75, i32 203, i32 77, i32 205, i32 79, i32 207, i32 81, i32 209, i32 83, i32 211, i32 85, i32 213, i32 87, i32 215, i32 89, i32 217, i32 91, i32 219, i32 93, i32 221, i32 95, i32 223, i32 97, i32 225, i32 99, i32 227, i32 101, i32 229, i32 103, i32 231, i32 105, i32 233, i32 107, i32 235, i32 109, i32 237, i32 111, i32 239, i32 113, i32 241, i32 115, i32 243, i32 117, i32 245, i32 119, i32 247, i32 121, i32 249, i32 123, i32 251, i32 125, i32 253, i32 127, i32 255> @@ -531,7 +531,7 @@ define <256 x i8> @vshuff_41(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_42: -; CHECK: [[REG42:r[0-9]+]] = #66 +; CHECK: [[REG42:r[0-9]+]] = #-62 ; CHECK: vshuff(v1,v0,[[REG42]]) define <256 x i8> @vshuff_42(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 4, i32 5, i32 132, i32 133, i32 8, i32 9, i32 136, i32 137, i32 12, i32 13, i32 140, i32 141, i32 16, i32 17, i32 144, i32 145, i32 20, i32 21, i32 148, i32 149, i32 24, i32 25, i32 152, i32 153, i32 28, i32 29, i32 156, i32 157, i32 32, i32 33, i32 160, i32 161, i32 36, i32 37, i32 164, i32 165, i32 40, i32 41, i32 168, i32 169, i32 44, i32 45, i32 172, i32 173, i32 48, i32 49, i32 176, i32 177, i32 52, i32 53, i32 180, i32 181, i32 56, i32 57, i32 184, i32 185, i32 60, i32 61, i32 188, i32 189, i32 2, i32 3, i32 130, i32 131, i32 6, i32 7, i32 134, i32 135, i32 10, i32 11, i32 138, i32 139, i32 14, i32 15, i32 142, i32 143, i32 18, i32 19, i32 146, i32 147, i32 22, i32 23, i32 150, i32 151, i32 26, i32 27, i32 154, i32 155, i32 30, i32 31, i32 158, i32 159, i32 34, i32 35, i32 162, i32 163, i32 38, i32 39, i32 166, i32 167, i32 
42, i32 43, i32 170, i32 171, i32 46, i32 47, i32 174, i32 175, i32 50, i32 51, i32 178, i32 179, i32 54, i32 55, i32 182, i32 183, i32 58, i32 59, i32 186, i32 187, i32 62, i32 63, i32 190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 68, i32 69, i32 196, i32 197, i32 72, i32 73, i32 200, i32 201, i32 76, i32 77, i32 204, i32 205, i32 80, i32 81, i32 208, i32 209, i32 84, i32 85, i32 212, i32 213, i32 88, i32 89, i32 216, i32 217, i32 92, i32 93, i32 220, i32 221, i32 96, i32 97, i32 224, i32 225, i32 100, i32 101, i32 228, i32 229, i32 104, i32 105, i32 232, i32 233, i32 108, i32 109, i32 236, i32 237, i32 112, i32 113, i32 240, i32 241, i32 116, i32 117, i32 244, i32 245, i32 120, i32 121, i32 248, i32 249, i32 124, i32 125, i32 252, i32 253, i32 66, i32 67, i32 194, i32 195, i32 70, i32 71, i32 198, i32 199, i32 74, i32 75, i32 202, i32 203, i32 78, i32 79, i32 206, i32 207, i32 82, i32 83, i32 210, i32 211, i32 86, i32 87, i32 214, i32 215, i32 90, i32 91, i32 218, i32 219, i32 94, i32 95, i32 222, i32 223, i32 98, i32 99, i32 226, i32 227, i32 102, i32 103, i32 230, i32 231, i32 106, i32 107, i32 234, i32 235, i32 110, i32 111, i32 238, i32 239, i32 114, i32 115, i32 242, i32 243, i32 118, i32 119, i32 246, i32 247, i32 122, i32 123, i32 250, i32 251, i32 126, i32 127, i32 254, i32 255> @@ -539,7 +539,7 @@ define <256 x i8> @vshuff_42(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_43: -; CHECK: [[REG43:r[0-9]+]] = #67 +; CHECK: [[REG43:r[0-9]+]] = #-61 ; CHECK: vshuff(v1,v0,[[REG43]]) define <256 x i8> @vshuff_43(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 4, i32 132, i32 5, i32 133, i32 8, i32 136, i32 9, i32 137, i32 12, i32 140, i32 13, i32 141, i32 16, i32 144, i32 17, i32 145, i32 20, i32 148, i32 21, i32 149, i32 24, i32 152, i32 25, i32 153, i32 28, i32 156, i32 29, i32 157, i32 32, i32 160, i32 33, i32 161, i32 36, i32 164, i32 37, i32 165, i32 40, i32 168, i32 41, i32 169, i32 44, i32 172, i32 45, i32 173, i32 48, i32 176, i32 49, i32 177, i32 52, i32 180, i32 53, i32 181, i32 56, i32 184, i32 57, i32 185, i32 60, i32 188, i32 61, i32 189, i32 2, i32 130, i32 3, i32 131, i32 6, i32 134, i32 7, i32 135, i32 10, i32 138, i32 11, i32 139, i32 14, i32 142, i32 15, i32 143, i32 18, i32 146, i32 19, i32 147, i32 22, i32 150, i32 23, i32 151, i32 26, i32 154, i32 27, i32 155, i32 30, i32 158, i32 31, i32 159, i32 34, i32 162, i32 35, i32 163, i32 38, i32 166, i32 39, i32 167, i32 42, i32 170, i32 43, i32 171, i32 46, i32 174, i32 47, i32 175, i32 50, i32 178, i32 51, i32 179, i32 54, i32 182, i32 55, i32 183, i32 58, i32 186, i32 59, i32 187, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 65, i32 193, i32 68, i32 196, i32 69, i32 197, i32 72, i32 200, i32 73, i32 201, i32 76, i32 204, i32 77, i32 205, i32 80, i32 208, i32 81, i32 209, i32 84, i32 212, i32 85, i32 213, i32 88, i32 216, i32 89, i32 217, i32 92, i32 220, i32 93, i32 221, i32 96, i32 224, i32 97, i32 225, i32 100, i32 228, i32 101, i32 229, i32 104, i32 232, i32 105, i32 233, i32 108, i32 236, i32 109, i32 237, i32 112, i32 240, i32 113, i32 241, i32 116, i32 244, i32 117, i32 245, i32 120, i32 248, i32 121, i32 249, i32 124, i32 252, i32 125, i32 253, i32 66, i32 194, i32 67, i32 195, i32 70, i32 198, i32 71, i32 199, i32 74, i32 202, i32 75, i32 203, i32 78, i32 206, i32 79, i32 207, i32 82, i32 210, i32 83, i32 211, i32 86, i32 214, i32 87, i32 215, i32 90, i32 218, i32 91, i32 219, i32 94, i32 
222, i32 95, i32 223, i32 98, i32 226, i32 99, i32 227, i32 102, i32 230, i32 103, i32 231, i32 106, i32 234, i32 107, i32 235, i32 110, i32 238, i32 111, i32 239, i32 114, i32 242, i32 115, i32 243, i32 118, i32 246, i32 119, i32 247, i32 122, i32 250, i32 123, i32 251, i32 126, i32 254, i32 127, i32 255> @@ -547,7 +547,7 @@ define <256 x i8> @vshuff_43(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_44: -; CHECK: [[REG44:r[0-9]+]] = #68 +; CHECK: [[REG44:r[0-9]+]] = #-60 ; CHECK: vshuff(v1,v0,[[REG44]]) define <256 x i8> @vshuff_44(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 128, i32 129, i32 130, i32 131, i32 8, i32 9, i32 10, i32 11, i32 136, i32 137, i32 138, i32 139, i32 16, i32 17, i32 18, i32 19, i32 144, i32 145, i32 146, i32 147, i32 24, i32 25, i32 26, i32 27, i32 152, i32 153, i32 154, i32 155, i32 32, i32 33, i32 34, i32 35, i32 160, i32 161, i32 162, i32 163, i32 40, i32 41, i32 42, i32 43, i32 168, i32 169, i32 170, i32 171, i32 48, i32 49, i32 50, i32 51, i32 176, i32 177, i32 178, i32 179, i32 56, i32 57, i32 58, i32 59, i32 184, i32 185, i32 186, i32 187, i32 4, i32 5, i32 6, i32 7, i32 132, i32 133, i32 134, i32 135, i32 12, i32 13, i32 14, i32 15, i32 140, i32 141, i32 142, i32 143, i32 20, i32 21, i32 22, i32 23, i32 148, i32 149, i32 150, i32 151, i32 28, i32 29, i32 30, i32 31, i32 156, i32 157, i32 158, i32 159, i32 36, i32 37, i32 38, i32 39, i32 164, i32 165, i32 166, i32 167, i32 44, i32 45, i32 46, i32 47, i32 172, i32 173, i32 174, i32 175, i32 52, i32 53, i32 54, i32 55, i32 180, i32 181, i32 182, i32 183, i32 60, i32 61, i32 62, i32 63, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 192, i32 193, i32 194, i32 195, i32 72, i32 73, i32 74, i32 75, i32 200, i32 201, i32 202, i32 203, i32 80, i32 81, i32 82, i32 83, i32 208, i32 209, i32 210, i32 211, i32 88, i32 89, i32 90, i32 91, i32 216, i32 217, i32 218, i32 219, i32 96, i32 97, i32 98, i32 99, i32 224, i32 225, i32 226, i32 227, i32 104, i32 105, i32 106, i32 107, i32 232, i32 233, i32 234, i32 235, i32 112, i32 113, i32 114, i32 115, i32 240, i32 241, i32 242, i32 243, i32 120, i32 121, i32 122, i32 123, i32 248, i32 249, i32 250, i32 251, i32 68, i32 69, i32 70, i32 71, i32 196, i32 197, i32 198, i32 199, i32 76, i32 77, i32 78, i32 79, i32 204, i32 205, i32 206, i32 207, i32 84, i32 85, i32 86, i32 87, i32 212, i32 213, i32 214, i32 215, i32 92, i32 93, i32 94, i32 95, i32 220, i32 221, i32 222, i32 223, i32 100, i32 101, i32 102, i32 103, i32 228, i32 229, i32 230, i32 231, i32 108, i32 109, i32 110, i32 111, i32 236, i32 237, i32 238, i32 239, i32 116, i32 117, i32 118, i32 119, i32 244, i32 245, i32 246, i32 247, i32 124, i32 125, i32 126, i32 127, i32 252, i32 253, i32 254, i32 255> @@ -555,7 +555,7 @@ define <256 x i8> @vshuff_44(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_45: -; CHECK: [[REG45:r[0-9]+]] = #69 +; CHECK: [[REG45:r[0-9]+]] = #-59 ; CHECK: vshuff(v1,v0,[[REG45]]) define <256 x i8> @vshuff_45(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 1, i32 129, i32 3, i32 131, i32 8, i32 136, i32 10, i32 138, i32 9, i32 137, i32 11, i32 139, i32 16, i32 144, i32 18, i32 146, i32 17, i32 145, i32 19, i32 147, i32 24, i32 152, i32 26, i32 154, i32 25, i32 153, i32 27, i32 155, i32 32, i32 160, i32 34, i32 162, i32 33, i32 161, i32 35, i32 163, i32 40, i32 168, i32 
42, i32 170, i32 41, i32 169, i32 43, i32 171, i32 48, i32 176, i32 50, i32 178, i32 49, i32 177, i32 51, i32 179, i32 56, i32 184, i32 58, i32 186, i32 57, i32 185, i32 59, i32 187, i32 4, i32 132, i32 6, i32 134, i32 5, i32 133, i32 7, i32 135, i32 12, i32 140, i32 14, i32 142, i32 13, i32 141, i32 15, i32 143, i32 20, i32 148, i32 22, i32 150, i32 21, i32 149, i32 23, i32 151, i32 28, i32 156, i32 30, i32 158, i32 29, i32 157, i32 31, i32 159, i32 36, i32 164, i32 38, i32 166, i32 37, i32 165, i32 39, i32 167, i32 44, i32 172, i32 46, i32 174, i32 45, i32 173, i32 47, i32 175, i32 52, i32 180, i32 54, i32 182, i32 53, i32 181, i32 55, i32 183, i32 60, i32 188, i32 62, i32 190, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 65, i32 193, i32 67, i32 195, i32 72, i32 200, i32 74, i32 202, i32 73, i32 201, i32 75, i32 203, i32 80, i32 208, i32 82, i32 210, i32 81, i32 209, i32 83, i32 211, i32 88, i32 216, i32 90, i32 218, i32 89, i32 217, i32 91, i32 219, i32 96, i32 224, i32 98, i32 226, i32 97, i32 225, i32 99, i32 227, i32 104, i32 232, i32 106, i32 234, i32 105, i32 233, i32 107, i32 235, i32 112, i32 240, i32 114, i32 242, i32 113, i32 241, i32 115, i32 243, i32 120, i32 248, i32 122, i32 250, i32 121, i32 249, i32 123, i32 251, i32 68, i32 196, i32 70, i32 198, i32 69, i32 197, i32 71, i32 199, i32 76, i32 204, i32 78, i32 206, i32 77, i32 205, i32 79, i32 207, i32 84, i32 212, i32 86, i32 214, i32 85, i32 213, i32 87, i32 215, i32 92, i32 220, i32 94, i32 222, i32 93, i32 221, i32 95, i32 223, i32 100, i32 228, i32 102, i32 230, i32 101, i32 229, i32 103, i32 231, i32 108, i32 236, i32 110, i32 238, i32 109, i32 237, i32 111, i32 239, i32 116, i32 244, i32 118, i32 246, i32 117, i32 245, i32 119, i32 247, i32 124, i32 252, i32 126, i32 254, i32 125, i32 253, i32 127, i32 255> @@ -563,7 +563,7 @@ define <256 x i8> @vshuff_45(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_46: -; CHECK: [[REG46:r[0-9]+]] = #70 +; CHECK: [[REG46:r[0-9]+]] = #-58 ; CHECK: vshuff(v1,v0,[[REG46]]) define <256 x i8> @vshuff_46(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 2, i32 3, i32 130, i32 131, i32 8, i32 9, i32 136, i32 137, i32 10, i32 11, i32 138, i32 139, i32 16, i32 17, i32 144, i32 145, i32 18, i32 19, i32 146, i32 147, i32 24, i32 25, i32 152, i32 153, i32 26, i32 27, i32 154, i32 155, i32 32, i32 33, i32 160, i32 161, i32 34, i32 35, i32 162, i32 163, i32 40, i32 41, i32 168, i32 169, i32 42, i32 43, i32 170, i32 171, i32 48, i32 49, i32 176, i32 177, i32 50, i32 51, i32 178, i32 179, i32 56, i32 57, i32 184, i32 185, i32 58, i32 59, i32 186, i32 187, i32 4, i32 5, i32 132, i32 133, i32 6, i32 7, i32 134, i32 135, i32 12, i32 13, i32 140, i32 141, i32 14, i32 15, i32 142, i32 143, i32 20, i32 21, i32 148, i32 149, i32 22, i32 23, i32 150, i32 151, i32 28, i32 29, i32 156, i32 157, i32 30, i32 31, i32 158, i32 159, i32 36, i32 37, i32 164, i32 165, i32 38, i32 39, i32 166, i32 167, i32 44, i32 45, i32 172, i32 173, i32 46, i32 47, i32 174, i32 175, i32 52, i32 53, i32 180, i32 181, i32 54, i32 55, i32 182, i32 183, i32 60, i32 61, i32 188, i32 189, i32 62, i32 63, i32 190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 66, i32 67, i32 194, i32 195, i32 72, i32 73, i32 200, i32 201, i32 74, i32 75, i32 202, i32 203, i32 80, i32 81, i32 208, i32 209, i32 82, i32 83, i32 210, i32 211, i32 88, i32 89, i32 216, i32 217, i32 90, i32 91, i32 218, i32 219, i32 96, i32 97, 
i32 224, i32 225, i32 98, i32 99, i32 226, i32 227, i32 104, i32 105, i32 232, i32 233, i32 106, i32 107, i32 234, i32 235, i32 112, i32 113, i32 240, i32 241, i32 114, i32 115, i32 242, i32 243, i32 120, i32 121, i32 248, i32 249, i32 122, i32 123, i32 250, i32 251, i32 68, i32 69, i32 196, i32 197, i32 70, i32 71, i32 198, i32 199, i32 76, i32 77, i32 204, i32 205, i32 78, i32 79, i32 206, i32 207, i32 84, i32 85, i32 212, i32 213, i32 86, i32 87, i32 214, i32 215, i32 92, i32 93, i32 220, i32 221, i32 94, i32 95, i32 222, i32 223, i32 100, i32 101, i32 228, i32 229, i32 102, i32 103, i32 230, i32 231, i32 108, i32 109, i32 236, i32 237, i32 110, i32 111, i32 238, i32 239, i32 116, i32 117, i32 244, i32 245, i32 118, i32 119, i32 246, i32 247, i32 124, i32 125, i32 252, i32 253, i32 126, i32 127, i32 254, i32 255> @@ -571,7 +571,7 @@ define <256 x i8> @vshuff_46(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_47: -; CHECK: [[REG47:r[0-9]+]] = #71 +; CHECK: [[REG47:r[0-9]+]] = #-57 ; CHECK: vshuff(v1,v0,[[REG47]]) define <256 x i8> @vshuff_47(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 2, i32 130, i32 3, i32 131, i32 8, i32 136, i32 9, i32 137, i32 10, i32 138, i32 11, i32 139, i32 16, i32 144, i32 17, i32 145, i32 18, i32 146, i32 19, i32 147, i32 24, i32 152, i32 25, i32 153, i32 26, i32 154, i32 27, i32 155, i32 32, i32 160, i32 33, i32 161, i32 34, i32 162, i32 35, i32 163, i32 40, i32 168, i32 41, i32 169, i32 42, i32 170, i32 43, i32 171, i32 48, i32 176, i32 49, i32 177, i32 50, i32 178, i32 51, i32 179, i32 56, i32 184, i32 57, i32 185, i32 58, i32 186, i32 59, i32 187, i32 4, i32 132, i32 5, i32 133, i32 6, i32 134, i32 7, i32 135, i32 12, i32 140, i32 13, i32 141, i32 14, i32 142, i32 15, i32 143, i32 20, i32 148, i32 21, i32 149, i32 22, i32 150, i32 23, i32 151, i32 28, i32 156, i32 29, i32 157, i32 30, i32 158, i32 31, i32 159, i32 36, i32 164, i32 37, i32 165, i32 38, i32 166, i32 39, i32 167, i32 44, i32 172, i32 45, i32 173, i32 46, i32 174, i32 47, i32 175, i32 52, i32 180, i32 53, i32 181, i32 54, i32 182, i32 55, i32 183, i32 60, i32 188, i32 61, i32 189, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 65, i32 193, i32 66, i32 194, i32 67, i32 195, i32 72, i32 200, i32 73, i32 201, i32 74, i32 202, i32 75, i32 203, i32 80, i32 208, i32 81, i32 209, i32 82, i32 210, i32 83, i32 211, i32 88, i32 216, i32 89, i32 217, i32 90, i32 218, i32 91, i32 219, i32 96, i32 224, i32 97, i32 225, i32 98, i32 226, i32 99, i32 227, i32 104, i32 232, i32 105, i32 233, i32 106, i32 234, i32 107, i32 235, i32 112, i32 240, i32 113, i32 241, i32 114, i32 242, i32 115, i32 243, i32 120, i32 248, i32 121, i32 249, i32 122, i32 250, i32 123, i32 251, i32 68, i32 196, i32 69, i32 197, i32 70, i32 198, i32 71, i32 199, i32 76, i32 204, i32 77, i32 205, i32 78, i32 206, i32 79, i32 207, i32 84, i32 212, i32 85, i32 213, i32 86, i32 214, i32 87, i32 215, i32 92, i32 220, i32 93, i32 221, i32 94, i32 222, i32 95, i32 223, i32 100, i32 228, i32 101, i32 229, i32 102, i32 230, i32 103, i32 231, i32 108, i32 236, i32 109, i32 237, i32 110, i32 238, i32 111, i32 239, i32 116, i32 244, i32 117, i32 245, i32 118, i32 246, i32 119, i32 247, i32 124, i32 252, i32 125, i32 253, i32 126, i32 254, i32 127, i32 255> @@ -579,7 +579,7 @@ define <256 x i8> @vshuff_47(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_48: -; CHECK: [[REG48:r[0-9]+]] = #72 +; CHECK: [[REG48:r[0-9]+]] = #-56 
; CHECK: vshuff(v1,v0,[[REG48]]) define <256 x i8> @vshuff_48(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -587,7 +587,7 @@ define <256 x i8> @vshuff_48(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_49: -; CHECK: [[REG49:r[0-9]+]] = #73 +; CHECK: [[REG49:r[0-9]+]] = #-55 ; CHECK: vshuff(v1,v0,[[REG49]]) define <256 x i8> @vshuff_49(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 4, i32 132, i32 6, i32 134, i32 1, i32 129, i32 3, i32 131, i32 5, i32 133, i32 7, i32 135, i32 16, i32 144, i32 18, i32 146, i32 20, i32 148, i32 22, i32 150, i32 17, i32 145, i32 19, i32 147, i32 21, i32 149, i32 23, i32 151, i32 32, i32 160, i32 34, i32 162, i32 36, i32 164, i32 38, i32 166, i32 33, i32 161, i32 35, i32 163, i32 37, i32 165, i32 39, i32 167, i32 48, i32 176, i32 50, i32 178, i32 52, i32 180, i32 54, i32 182, i32 49, i32 177, i32 51, i32 179, i32 53, i32 181, i32 55, i32 183, i32 8, i32 136, i32 10, i32 138, i32 12, i32 140, i32 14, i32 142, i32 9, i32 137, i32 11, i32 139, i32 13, i32 141, i32 15, i32 143, i32 24, i32 152, i32 26, i32 154, i32 28, i32 156, i32 30, i32 158, i32 25, i32 153, i32 27, i32 155, i32 29, i32 157, i32 31, i32 159, i32 40, i32 168, i32 42, i32 170, i32 
44, i32 172, i32 46, i32 174, i32 41, i32 169, i32 43, i32 171, i32 45, i32 173, i32 47, i32 175, i32 56, i32 184, i32 58, i32 186, i32 60, i32 188, i32 62, i32 190, i32 57, i32 185, i32 59, i32 187, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 68, i32 196, i32 70, i32 198, i32 65, i32 193, i32 67, i32 195, i32 69, i32 197, i32 71, i32 199, i32 80, i32 208, i32 82, i32 210, i32 84, i32 212, i32 86, i32 214, i32 81, i32 209, i32 83, i32 211, i32 85, i32 213, i32 87, i32 215, i32 96, i32 224, i32 98, i32 226, i32 100, i32 228, i32 102, i32 230, i32 97, i32 225, i32 99, i32 227, i32 101, i32 229, i32 103, i32 231, i32 112, i32 240, i32 114, i32 242, i32 116, i32 244, i32 118, i32 246, i32 113, i32 241, i32 115, i32 243, i32 117, i32 245, i32 119, i32 247, i32 72, i32 200, i32 74, i32 202, i32 76, i32 204, i32 78, i32 206, i32 73, i32 201, i32 75, i32 203, i32 77, i32 205, i32 79, i32 207, i32 88, i32 216, i32 90, i32 218, i32 92, i32 220, i32 94, i32 222, i32 89, i32 217, i32 91, i32 219, i32 93, i32 221, i32 95, i32 223, i32 104, i32 232, i32 106, i32 234, i32 108, i32 236, i32 110, i32 238, i32 105, i32 233, i32 107, i32 235, i32 109, i32 237, i32 111, i32 239, i32 120, i32 248, i32 122, i32 250, i32 124, i32 252, i32 126, i32 254, i32 121, i32 249, i32 123, i32 251, i32 125, i32 253, i32 127, i32 255> @@ -595,7 +595,7 @@ define <256 x i8> @vshuff_49(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_4a: -; CHECK: [[REG4a:r[0-9]+]] = #74 +; CHECK: [[REG4a:r[0-9]+]] = #-54 ; CHECK: vshuff(v1,v0,[[REG4a]]) define <256 x i8> @vshuff_4a(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 4, i32 5, i32 132, i32 133, i32 2, i32 3, i32 130, i32 131, i32 6, i32 7, i32 134, i32 135, i32 16, i32 17, i32 144, i32 145, i32 20, i32 21, i32 148, i32 149, i32 18, i32 19, i32 146, i32 147, i32 22, i32 23, i32 150, i32 151, i32 32, i32 33, i32 160, i32 161, i32 36, i32 37, i32 164, i32 165, i32 34, i32 35, i32 162, i32 163, i32 38, i32 39, i32 166, i32 167, i32 48, i32 49, i32 176, i32 177, i32 52, i32 53, i32 180, i32 181, i32 50, i32 51, i32 178, i32 179, i32 54, i32 55, i32 182, i32 183, i32 8, i32 9, i32 136, i32 137, i32 12, i32 13, i32 140, i32 141, i32 10, i32 11, i32 138, i32 139, i32 14, i32 15, i32 142, i32 143, i32 24, i32 25, i32 152, i32 153, i32 28, i32 29, i32 156, i32 157, i32 26, i32 27, i32 154, i32 155, i32 30, i32 31, i32 158, i32 159, i32 40, i32 41, i32 168, i32 169, i32 44, i32 45, i32 172, i32 173, i32 42, i32 43, i32 170, i32 171, i32 46, i32 47, i32 174, i32 175, i32 56, i32 57, i32 184, i32 185, i32 60, i32 61, i32 188, i32 189, i32 58, i32 59, i32 186, i32 187, i32 62, i32 63, i32 190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 68, i32 69, i32 196, i32 197, i32 66, i32 67, i32 194, i32 195, i32 70, i32 71, i32 198, i32 199, i32 80, i32 81, i32 208, i32 209, i32 84, i32 85, i32 212, i32 213, i32 82, i32 83, i32 210, i32 211, i32 86, i32 87, i32 214, i32 215, i32 96, i32 97, i32 224, i32 225, i32 100, i32 101, i32 228, i32 229, i32 98, i32 99, i32 226, i32 227, i32 102, i32 103, i32 230, i32 231, i32 112, i32 113, i32 240, i32 241, i32 116, i32 117, i32 244, i32 245, i32 114, i32 115, i32 242, i32 243, i32 118, i32 119, i32 246, i32 247, i32 72, i32 73, i32 200, i32 201, i32 76, i32 77, i32 204, i32 205, i32 74, i32 75, i32 202, i32 203, i32 78, i32 79, i32 206, i32 207, i32 88, i32 89, i32 216, i32 217, i32 92, i32 93, i32 220, i32 221, i32 90, i32 91, 
i32 218, i32 219, i32 94, i32 95, i32 222, i32 223, i32 104, i32 105, i32 232, i32 233, i32 108, i32 109, i32 236, i32 237, i32 106, i32 107, i32 234, i32 235, i32 110, i32 111, i32 238, i32 239, i32 120, i32 121, i32 248, i32 249, i32 124, i32 125, i32 252, i32 253, i32 122, i32 123, i32 250, i32 251, i32 126, i32 127, i32 254, i32 255> @@ -603,7 +603,7 @@ define <256 x i8> @vshuff_4a(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_4b: -; CHECK: [[REG4b:r[0-9]+]] = #75 +; CHECK: [[REG4b:r[0-9]+]] = #-53 ; CHECK: vshuff(v1,v0,[[REG4b]]) define <256 x i8> @vshuff_4b(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 4, i32 132, i32 5, i32 133, i32 2, i32 130, i32 3, i32 131, i32 6, i32 134, i32 7, i32 135, i32 16, i32 144, i32 17, i32 145, i32 20, i32 148, i32 21, i32 149, i32 18, i32 146, i32 19, i32 147, i32 22, i32 150, i32 23, i32 151, i32 32, i32 160, i32 33, i32 161, i32 36, i32 164, i32 37, i32 165, i32 34, i32 162, i32 35, i32 163, i32 38, i32 166, i32 39, i32 167, i32 48, i32 176, i32 49, i32 177, i32 52, i32 180, i32 53, i32 181, i32 50, i32 178, i32 51, i32 179, i32 54, i32 182, i32 55, i32 183, i32 8, i32 136, i32 9, i32 137, i32 12, i32 140, i32 13, i32 141, i32 10, i32 138, i32 11, i32 139, i32 14, i32 142, i32 15, i32 143, i32 24, i32 152, i32 25, i32 153, i32 28, i32 156, i32 29, i32 157, i32 26, i32 154, i32 27, i32 155, i32 30, i32 158, i32 31, i32 159, i32 40, i32 168, i32 41, i32 169, i32 44, i32 172, i32 45, i32 173, i32 42, i32 170, i32 43, i32 171, i32 46, i32 174, i32 47, i32 175, i32 56, i32 184, i32 57, i32 185, i32 60, i32 188, i32 61, i32 189, i32 58, i32 186, i32 59, i32 187, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 65, i32 193, i32 68, i32 196, i32 69, i32 197, i32 66, i32 194, i32 67, i32 195, i32 70, i32 198, i32 71, i32 199, i32 80, i32 208, i32 81, i32 209, i32 84, i32 212, i32 85, i32 213, i32 82, i32 210, i32 83, i32 211, i32 86, i32 214, i32 87, i32 215, i32 96, i32 224, i32 97, i32 225, i32 100, i32 228, i32 101, i32 229, i32 98, i32 226, i32 99, i32 227, i32 102, i32 230, i32 103, i32 231, i32 112, i32 240, i32 113, i32 241, i32 116, i32 244, i32 117, i32 245, i32 114, i32 242, i32 115, i32 243, i32 118, i32 246, i32 119, i32 247, i32 72, i32 200, i32 73, i32 201, i32 76, i32 204, i32 77, i32 205, i32 74, i32 202, i32 75, i32 203, i32 78, i32 206, i32 79, i32 207, i32 88, i32 216, i32 89, i32 217, i32 92, i32 220, i32 93, i32 221, i32 90, i32 218, i32 91, i32 219, i32 94, i32 222, i32 95, i32 223, i32 104, i32 232, i32 105, i32 233, i32 108, i32 236, i32 109, i32 237, i32 106, i32 234, i32 107, i32 235, i32 110, i32 238, i32 111, i32 239, i32 120, i32 248, i32 121, i32 249, i32 124, i32 252, i32 125, i32 253, i32 122, i32 250, i32 123, i32 251, i32 126, i32 254, i32 127, i32 255> @@ -611,7 +611,7 @@ define <256 x i8> @vshuff_4b(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_4c: -; CHECK: [[REG4c:r[0-9]+]] = #76 +; CHECK: [[REG4c:r[0-9]+]] = #-52 ; CHECK: vshuff(v1,v0,[[REG4c]]) define <256 x i8> @vshuff_4c(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 128, i32 129, i32 130, i32 131, i32 4, i32 5, i32 6, i32 7, i32 132, i32 133, i32 134, i32 135, i32 16, i32 17, i32 18, i32 19, i32 144, i32 145, i32 146, i32 147, i32 20, i32 21, i32 22, i32 23, i32 148, i32 149, i32 150, i32 151, i32 32, i32 33, i32 34, i32 35, i32 160, i32 161, i32 162, 
i32 163, i32 36, i32 37, i32 38, i32 39, i32 164, i32 165, i32 166, i32 167, i32 48, i32 49, i32 50, i32 51, i32 176, i32 177, i32 178, i32 179, i32 52, i32 53, i32 54, i32 55, i32 180, i32 181, i32 182, i32 183, i32 8, i32 9, i32 10, i32 11, i32 136, i32 137, i32 138, i32 139, i32 12, i32 13, i32 14, i32 15, i32 140, i32 141, i32 142, i32 143, i32 24, i32 25, i32 26, i32 27, i32 152, i32 153, i32 154, i32 155, i32 28, i32 29, i32 30, i32 31, i32 156, i32 157, i32 158, i32 159, i32 40, i32 41, i32 42, i32 43, i32 168, i32 169, i32 170, i32 171, i32 44, i32 45, i32 46, i32 47, i32 172, i32 173, i32 174, i32 175, i32 56, i32 57, i32 58, i32 59, i32 184, i32 185, i32 186, i32 187, i32 60, i32 61, i32 62, i32 63, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 192, i32 193, i32 194, i32 195, i32 68, i32 69, i32 70, i32 71, i32 196, i32 197, i32 198, i32 199, i32 80, i32 81, i32 82, i32 83, i32 208, i32 209, i32 210, i32 211, i32 84, i32 85, i32 86, i32 87, i32 212, i32 213, i32 214, i32 215, i32 96, i32 97, i32 98, i32 99, i32 224, i32 225, i32 226, i32 227, i32 100, i32 101, i32 102, i32 103, i32 228, i32 229, i32 230, i32 231, i32 112, i32 113, i32 114, i32 115, i32 240, i32 241, i32 242, i32 243, i32 116, i32 117, i32 118, i32 119, i32 244, i32 245, i32 246, i32 247, i32 72, i32 73, i32 74, i32 75, i32 200, i32 201, i32 202, i32 203, i32 76, i32 77, i32 78, i32 79, i32 204, i32 205, i32 206, i32 207, i32 88, i32 89, i32 90, i32 91, i32 216, i32 217, i32 218, i32 219, i32 92, i32 93, i32 94, i32 95, i32 220, i32 221, i32 222, i32 223, i32 104, i32 105, i32 106, i32 107, i32 232, i32 233, i32 234, i32 235, i32 108, i32 109, i32 110, i32 111, i32 236, i32 237, i32 238, i32 239, i32 120, i32 121, i32 122, i32 123, i32 248, i32 249, i32 250, i32 251, i32 124, i32 125, i32 126, i32 127, i32 252, i32 253, i32 254, i32 255> @@ -619,7 +619,7 @@ define <256 x i8> @vshuff_4c(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_4d: -; CHECK: [[REG4d:r[0-9]+]] = #77 +; CHECK: [[REG4d:r[0-9]+]] = #-51 ; CHECK: vshuff(v1,v0,[[REG4d]]) define <256 x i8> @vshuff_4d(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 1, i32 129, i32 3, i32 131, i32 4, i32 132, i32 6, i32 134, i32 5, i32 133, i32 7, i32 135, i32 16, i32 144, i32 18, i32 146, i32 17, i32 145, i32 19, i32 147, i32 20, i32 148, i32 22, i32 150, i32 21, i32 149, i32 23, i32 151, i32 32, i32 160, i32 34, i32 162, i32 33, i32 161, i32 35, i32 163, i32 36, i32 164, i32 38, i32 166, i32 37, i32 165, i32 39, i32 167, i32 48, i32 176, i32 50, i32 178, i32 49, i32 177, i32 51, i32 179, i32 52, i32 180, i32 54, i32 182, i32 53, i32 181, i32 55, i32 183, i32 8, i32 136, i32 10, i32 138, i32 9, i32 137, i32 11, i32 139, i32 12, i32 140, i32 14, i32 142, i32 13, i32 141, i32 15, i32 143, i32 24, i32 152, i32 26, i32 154, i32 25, i32 153, i32 27, i32 155, i32 28, i32 156, i32 30, i32 158, i32 29, i32 157, i32 31, i32 159, i32 40, i32 168, i32 42, i32 170, i32 41, i32 169, i32 43, i32 171, i32 44, i32 172, i32 46, i32 174, i32 45, i32 173, i32 47, i32 175, i32 56, i32 184, i32 58, i32 186, i32 57, i32 185, i32 59, i32 187, i32 60, i32 188, i32 62, i32 190, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 65, i32 193, i32 67, i32 195, i32 68, i32 196, i32 70, i32 198, i32 69, i32 197, i32 71, i32 199, i32 80, i32 208, i32 82, i32 210, i32 81, i32 209, i32 83, i32 211, i32 84, i32 212, i32 86, i32 214, i32 85, i32 213, 
i32 87, i32 215, i32 96, i32 224, i32 98, i32 226, i32 97, i32 225, i32 99, i32 227, i32 100, i32 228, i32 102, i32 230, i32 101, i32 229, i32 103, i32 231, i32 112, i32 240, i32 114, i32 242, i32 113, i32 241, i32 115, i32 243, i32 116, i32 244, i32 118, i32 246, i32 117, i32 245, i32 119, i32 247, i32 72, i32 200, i32 74, i32 202, i32 73, i32 201, i32 75, i32 203, i32 76, i32 204, i32 78, i32 206, i32 77, i32 205, i32 79, i32 207, i32 88, i32 216, i32 90, i32 218, i32 89, i32 217, i32 91, i32 219, i32 92, i32 220, i32 94, i32 222, i32 93, i32 221, i32 95, i32 223, i32 104, i32 232, i32 106, i32 234, i32 105, i32 233, i32 107, i32 235, i32 108, i32 236, i32 110, i32 238, i32 109, i32 237, i32 111, i32 239, i32 120, i32 248, i32 122, i32 250, i32 121, i32 249, i32 123, i32 251, i32 124, i32 252, i32 126, i32 254, i32 125, i32 253, i32 127, i32 255> @@ -627,7 +627,7 @@ define <256 x i8> @vshuff_4d(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_4e: -; CHECK: [[REG4e:r[0-9]+]] = #78 +; CHECK: [[REG4e:r[0-9]+]] = #-50 ; CHECK: vshuff(v1,v0,[[REG4e]]) define <256 x i8> @vshuff_4e(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 2, i32 3, i32 130, i32 131, i32 4, i32 5, i32 132, i32 133, i32 6, i32 7, i32 134, i32 135, i32 16, i32 17, i32 144, i32 145, i32 18, i32 19, i32 146, i32 147, i32 20, i32 21, i32 148, i32 149, i32 22, i32 23, i32 150, i32 151, i32 32, i32 33, i32 160, i32 161, i32 34, i32 35, i32 162, i32 163, i32 36, i32 37, i32 164, i32 165, i32 38, i32 39, i32 166, i32 167, i32 48, i32 49, i32 176, i32 177, i32 50, i32 51, i32 178, i32 179, i32 52, i32 53, i32 180, i32 181, i32 54, i32 55, i32 182, i32 183, i32 8, i32 9, i32 136, i32 137, i32 10, i32 11, i32 138, i32 139, i32 12, i32 13, i32 140, i32 141, i32 14, i32 15, i32 142, i32 143, i32 24, i32 25, i32 152, i32 153, i32 26, i32 27, i32 154, i32 155, i32 28, i32 29, i32 156, i32 157, i32 30, i32 31, i32 158, i32 159, i32 40, i32 41, i32 168, i32 169, i32 42, i32 43, i32 170, i32 171, i32 44, i32 45, i32 172, i32 173, i32 46, i32 47, i32 174, i32 175, i32 56, i32 57, i32 184, i32 185, i32 58, i32 59, i32 186, i32 187, i32 60, i32 61, i32 188, i32 189, i32 62, i32 63, i32 190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 66, i32 67, i32 194, i32 195, i32 68, i32 69, i32 196, i32 197, i32 70, i32 71, i32 198, i32 199, i32 80, i32 81, i32 208, i32 209, i32 82, i32 83, i32 210, i32 211, i32 84, i32 85, i32 212, i32 213, i32 86, i32 87, i32 214, i32 215, i32 96, i32 97, i32 224, i32 225, i32 98, i32 99, i32 226, i32 227, i32 100, i32 101, i32 228, i32 229, i32 102, i32 103, i32 230, i32 231, i32 112, i32 113, i32 240, i32 241, i32 114, i32 115, i32 242, i32 243, i32 116, i32 117, i32 244, i32 245, i32 118, i32 119, i32 246, i32 247, i32 72, i32 73, i32 200, i32 201, i32 74, i32 75, i32 202, i32 203, i32 76, i32 77, i32 204, i32 205, i32 78, i32 79, i32 206, i32 207, i32 88, i32 89, i32 216, i32 217, i32 90, i32 91, i32 218, i32 219, i32 92, i32 93, i32 220, i32 221, i32 94, i32 95, i32 222, i32 223, i32 104, i32 105, i32 232, i32 233, i32 106, i32 107, i32 234, i32 235, i32 108, i32 109, i32 236, i32 237, i32 110, i32 111, i32 238, i32 239, i32 120, i32 121, i32 248, i32 249, i32 122, i32 123, i32 250, i32 251, i32 124, i32 125, i32 252, i32 253, i32 126, i32 127, i32 254, i32 255> @@ -635,7 +635,7 @@ define <256 x i8> @vshuff_4e(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_4f: -; CHECK: [[REG4f:r[0-9]+]] = #79 +; 
CHECK: [[REG4f:r[0-9]+]] = #-49 ; CHECK: vshuff(v1,v0,[[REG4f]]) define <256 x i8> @vshuff_4f(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 2, i32 130, i32 3, i32 131, i32 4, i32 132, i32 5, i32 133, i32 6, i32 134, i32 7, i32 135, i32 16, i32 144, i32 17, i32 145, i32 18, i32 146, i32 19, i32 147, i32 20, i32 148, i32 21, i32 149, i32 22, i32 150, i32 23, i32 151, i32 32, i32 160, i32 33, i32 161, i32 34, i32 162, i32 35, i32 163, i32 36, i32 164, i32 37, i32 165, i32 38, i32 166, i32 39, i32 167, i32 48, i32 176, i32 49, i32 177, i32 50, i32 178, i32 51, i32 179, i32 52, i32 180, i32 53, i32 181, i32 54, i32 182, i32 55, i32 183, i32 8, i32 136, i32 9, i32 137, i32 10, i32 138, i32 11, i32 139, i32 12, i32 140, i32 13, i32 141, i32 14, i32 142, i32 15, i32 143, i32 24, i32 152, i32 25, i32 153, i32 26, i32 154, i32 27, i32 155, i32 28, i32 156, i32 29, i32 157, i32 30, i32 158, i32 31, i32 159, i32 40, i32 168, i32 41, i32 169, i32 42, i32 170, i32 43, i32 171, i32 44, i32 172, i32 45, i32 173, i32 46, i32 174, i32 47, i32 175, i32 56, i32 184, i32 57, i32 185, i32 58, i32 186, i32 59, i32 187, i32 60, i32 188, i32 61, i32 189, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 65, i32 193, i32 66, i32 194, i32 67, i32 195, i32 68, i32 196, i32 69, i32 197, i32 70, i32 198, i32 71, i32 199, i32 80, i32 208, i32 81, i32 209, i32 82, i32 210, i32 83, i32 211, i32 84, i32 212, i32 85, i32 213, i32 86, i32 214, i32 87, i32 215, i32 96, i32 224, i32 97, i32 225, i32 98, i32 226, i32 99, i32 227, i32 100, i32 228, i32 101, i32 229, i32 102, i32 230, i32 103, i32 231, i32 112, i32 240, i32 113, i32 241, i32 114, i32 242, i32 115, i32 243, i32 116, i32 244, i32 117, i32 245, i32 118, i32 246, i32 119, i32 247, i32 72, i32 200, i32 73, i32 201, i32 74, i32 202, i32 75, i32 203, i32 76, i32 204, i32 77, i32 205, i32 78, i32 206, i32 79, i32 207, i32 88, i32 216, i32 89, i32 217, i32 90, i32 218, i32 91, i32 219, i32 92, i32 220, i32 93, i32 221, i32 94, i32 222, i32 95, i32 223, i32 104, i32 232, i32 105, i32 233, i32 106, i32 234, i32 107, i32 235, i32 108, i32 236, i32 109, i32 237, i32 110, i32 238, i32 111, i32 239, i32 120, i32 248, i32 121, i32 249, i32 122, i32 250, i32 123, i32 251, i32 124, i32 252, i32 125, i32 253, i32 126, i32 254, i32 127, i32 255> @@ -643,7 +643,7 @@ define <256 x i8> @vshuff_4f(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_50: -; CHECK: [[REG50:r[0-9]+]] = #80 +; CHECK: [[REG50:r[0-9]+]] = #-48 ; CHECK: vshuff(v1,v0,[[REG50]]) define <256 x i8> @vshuff_50(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 48, 
i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -651,7 +651,7 @@ define <256 x i8> @vshuff_50(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_51: -; CHECK: [[REG51:r[0-9]+]] = #81 +; CHECK: [[REG51:r[0-9]+]] = #-47 ; CHECK: vshuff(v1,v0,[[REG51]]) define <256 x i8> @vshuff_51(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 4, i32 132, i32 6, i32 134, i32 8, i32 136, i32 10, i32 138, i32 12, i32 140, i32 14, i32 142, i32 1, i32 129, i32 3, i32 131, i32 5, i32 133, i32 7, i32 135, i32 9, i32 137, i32 11, i32 139, i32 13, i32 141, i32 15, i32 143, i32 32, i32 160, i32 34, i32 162, i32 36, i32 164, i32 38, i32 166, i32 40, i32 168, i32 42, i32 170, i32 44, i32 172, i32 46, i32 174, i32 33, i32 161, i32 35, i32 163, i32 37, i32 165, i32 39, i32 167, i32 41, i32 169, i32 43, i32 171, i32 45, i32 173, i32 47, i32 175, i32 16, i32 144, i32 18, i32 146, i32 20, i32 148, i32 22, i32 150, i32 24, i32 152, i32 26, i32 154, i32 28, i32 156, i32 30, i32 158, i32 17, i32 145, i32 19, i32 147, i32 21, i32 149, i32 23, i32 151, i32 25, i32 153, i32 27, i32 155, i32 29, i32 157, i32 31, i32 159, i32 48, i32 176, i32 50, i32 178, i32 52, i32 180, i32 54, i32 182, i32 56, i32 184, i32 58, i32 186, i32 60, i32 188, i32 62, i32 190, i32 49, i32 177, i32 51, i32 179, i32 53, i32 181, i32 55, i32 183, i32 57, i32 185, i32 59, i32 187, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 68, i32 196, i32 70, i32 198, i32 72, i32 200, i32 74, i32 202, i32 76, i32 204, i32 78, i32 206, i32 65, i32 193, i32 67, i32 195, i32 69, i32 197, i32 71, i32 199, i32 73, i32 201, i32 75, i32 203, i32 77, i32 205, i32 79, i32 207, i32 96, i32 224, i32 98, i32 226, i32 100, i32 228, i32 102, i32 230, i32 104, i32 232, i32 106, i32 234, i32 108, i32 236, i32 110, i32 238, i32 97, i32 225, i32 99, i32 227, i32 101, i32 229, i32 103, i32 231, i32 105, i32 233, i32 107, i32 235, i32 109, i32 237, i32 111, i32 239, i32 80, i32 208, i32 82, i32 210, i32 84, i32 212, i32 86, i32 214, i32 88, i32 216, i32 90, i32 218, i32 92, i32 220, i32 94, i32 222, i32 81, i32 209, i32 83, i32 211, i32 85, i32 213, i32 
87, i32 215, i32 89, i32 217, i32 91, i32 219, i32 93, i32 221, i32 95, i32 223, i32 112, i32 240, i32 114, i32 242, i32 116, i32 244, i32 118, i32 246, i32 120, i32 248, i32 122, i32 250, i32 124, i32 252, i32 126, i32 254, i32 113, i32 241, i32 115, i32 243, i32 117, i32 245, i32 119, i32 247, i32 121, i32 249, i32 123, i32 251, i32 125, i32 253, i32 127, i32 255> @@ -659,7 +659,7 @@ define <256 x i8> @vshuff_51(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_52: -; CHECK: [[REG52:r[0-9]+]] = #82 +; CHECK: [[REG52:r[0-9]+]] = #-46 ; CHECK: vshuff(v1,v0,[[REG52]]) define <256 x i8> @vshuff_52(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 4, i32 5, i32 132, i32 133, i32 8, i32 9, i32 136, i32 137, i32 12, i32 13, i32 140, i32 141, i32 2, i32 3, i32 130, i32 131, i32 6, i32 7, i32 134, i32 135, i32 10, i32 11, i32 138, i32 139, i32 14, i32 15, i32 142, i32 143, i32 32, i32 33, i32 160, i32 161, i32 36, i32 37, i32 164, i32 165, i32 40, i32 41, i32 168, i32 169, i32 44, i32 45, i32 172, i32 173, i32 34, i32 35, i32 162, i32 163, i32 38, i32 39, i32 166, i32 167, i32 42, i32 43, i32 170, i32 171, i32 46, i32 47, i32 174, i32 175, i32 16, i32 17, i32 144, i32 145, i32 20, i32 21, i32 148, i32 149, i32 24, i32 25, i32 152, i32 153, i32 28, i32 29, i32 156, i32 157, i32 18, i32 19, i32 146, i32 147, i32 22, i32 23, i32 150, i32 151, i32 26, i32 27, i32 154, i32 155, i32 30, i32 31, i32 158, i32 159, i32 48, i32 49, i32 176, i32 177, i32 52, i32 53, i32 180, i32 181, i32 56, i32 57, i32 184, i32 185, i32 60, i32 61, i32 188, i32 189, i32 50, i32 51, i32 178, i32 179, i32 54, i32 55, i32 182, i32 183, i32 58, i32 59, i32 186, i32 187, i32 62, i32 63, i32 190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 68, i32 69, i32 196, i32 197, i32 72, i32 73, i32 200, i32 201, i32 76, i32 77, i32 204, i32 205, i32 66, i32 67, i32 194, i32 195, i32 70, i32 71, i32 198, i32 199, i32 74, i32 75, i32 202, i32 203, i32 78, i32 79, i32 206, i32 207, i32 96, i32 97, i32 224, i32 225, i32 100, i32 101, i32 228, i32 229, i32 104, i32 105, i32 232, i32 233, i32 108, i32 109, i32 236, i32 237, i32 98, i32 99, i32 226, i32 227, i32 102, i32 103, i32 230, i32 231, i32 106, i32 107, i32 234, i32 235, i32 110, i32 111, i32 238, i32 239, i32 80, i32 81, i32 208, i32 209, i32 84, i32 85, i32 212, i32 213, i32 88, i32 89, i32 216, i32 217, i32 92, i32 93, i32 220, i32 221, i32 82, i32 83, i32 210, i32 211, i32 86, i32 87, i32 214, i32 215, i32 90, i32 91, i32 218, i32 219, i32 94, i32 95, i32 222, i32 223, i32 112, i32 113, i32 240, i32 241, i32 116, i32 117, i32 244, i32 245, i32 120, i32 121, i32 248, i32 249, i32 124, i32 125, i32 252, i32 253, i32 114, i32 115, i32 242, i32 243, i32 118, i32 119, i32 246, i32 247, i32 122, i32 123, i32 250, i32 251, i32 126, i32 127, i32 254, i32 255> @@ -667,7 +667,7 @@ define <256 x i8> @vshuff_52(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_53: -; CHECK: [[REG53:r[0-9]+]] = #83 +; CHECK: [[REG53:r[0-9]+]] = #-45 ; CHECK: vshuff(v1,v0,[[REG53]]) define <256 x i8> @vshuff_53(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 4, i32 132, i32 5, i32 133, i32 8, i32 136, i32 9, i32 137, i32 12, i32 140, i32 13, i32 141, i32 2, i32 130, i32 3, i32 131, i32 6, i32 134, i32 7, i32 135, i32 10, i32 138, i32 11, i32 139, i32 14, i32 142, i32 15, i32 143, i32 32, i32 160, i32 33, i32 
161, i32 36, i32 164, i32 37, i32 165, i32 40, i32 168, i32 41, i32 169, i32 44, i32 172, i32 45, i32 173, i32 34, i32 162, i32 35, i32 163, i32 38, i32 166, i32 39, i32 167, i32 42, i32 170, i32 43, i32 171, i32 46, i32 174, i32 47, i32 175, i32 16, i32 144, i32 17, i32 145, i32 20, i32 148, i32 21, i32 149, i32 24, i32 152, i32 25, i32 153, i32 28, i32 156, i32 29, i32 157, i32 18, i32 146, i32 19, i32 147, i32 22, i32 150, i32 23, i32 151, i32 26, i32 154, i32 27, i32 155, i32 30, i32 158, i32 31, i32 159, i32 48, i32 176, i32 49, i32 177, i32 52, i32 180, i32 53, i32 181, i32 56, i32 184, i32 57, i32 185, i32 60, i32 188, i32 61, i32 189, i32 50, i32 178, i32 51, i32 179, i32 54, i32 182, i32 55, i32 183, i32 58, i32 186, i32 59, i32 187, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 65, i32 193, i32 68, i32 196, i32 69, i32 197, i32 72, i32 200, i32 73, i32 201, i32 76, i32 204, i32 77, i32 205, i32 66, i32 194, i32 67, i32 195, i32 70, i32 198, i32 71, i32 199, i32 74, i32 202, i32 75, i32 203, i32 78, i32 206, i32 79, i32 207, i32 96, i32 224, i32 97, i32 225, i32 100, i32 228, i32 101, i32 229, i32 104, i32 232, i32 105, i32 233, i32 108, i32 236, i32 109, i32 237, i32 98, i32 226, i32 99, i32 227, i32 102, i32 230, i32 103, i32 231, i32 106, i32 234, i32 107, i32 235, i32 110, i32 238, i32 111, i32 239, i32 80, i32 208, i32 81, i32 209, i32 84, i32 212, i32 85, i32 213, i32 88, i32 216, i32 89, i32 217, i32 92, i32 220, i32 93, i32 221, i32 82, i32 210, i32 83, i32 211, i32 86, i32 214, i32 87, i32 215, i32 90, i32 218, i32 91, i32 219, i32 94, i32 222, i32 95, i32 223, i32 112, i32 240, i32 113, i32 241, i32 116, i32 244, i32 117, i32 245, i32 120, i32 248, i32 121, i32 249, i32 124, i32 252, i32 125, i32 253, i32 114, i32 242, i32 115, i32 243, i32 118, i32 246, i32 119, i32 247, i32 122, i32 250, i32 123, i32 251, i32 126, i32 254, i32 127, i32 255> @@ -675,7 +675,7 @@ define <256 x i8> @vshuff_53(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_54: -; CHECK: [[REG54:r[0-9]+]] = #84 +; CHECK: [[REG54:r[0-9]+]] = #-44 ; CHECK: vshuff(v1,v0,[[REG54]]) define <256 x i8> @vshuff_54(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 128, i32 129, i32 130, i32 131, i32 8, i32 9, i32 10, i32 11, i32 136, i32 137, i32 138, i32 139, i32 4, i32 5, i32 6, i32 7, i32 132, i32 133, i32 134, i32 135, i32 12, i32 13, i32 14, i32 15, i32 140, i32 141, i32 142, i32 143, i32 32, i32 33, i32 34, i32 35, i32 160, i32 161, i32 162, i32 163, i32 40, i32 41, i32 42, i32 43, i32 168, i32 169, i32 170, i32 171, i32 36, i32 37, i32 38, i32 39, i32 164, i32 165, i32 166, i32 167, i32 44, i32 45, i32 46, i32 47, i32 172, i32 173, i32 174, i32 175, i32 16, i32 17, i32 18, i32 19, i32 144, i32 145, i32 146, i32 147, i32 24, i32 25, i32 26, i32 27, i32 152, i32 153, i32 154, i32 155, i32 20, i32 21, i32 22, i32 23, i32 148, i32 149, i32 150, i32 151, i32 28, i32 29, i32 30, i32 31, i32 156, i32 157, i32 158, i32 159, i32 48, i32 49, i32 50, i32 51, i32 176, i32 177, i32 178, i32 179, i32 56, i32 57, i32 58, i32 59, i32 184, i32 185, i32 186, i32 187, i32 52, i32 53, i32 54, i32 55, i32 180, i32 181, i32 182, i32 183, i32 60, i32 61, i32 62, i32 63, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 192, i32 193, i32 194, i32 195, i32 72, i32 73, i32 74, i32 75, i32 200, i32 201, i32 202, i32 203, i32 68, i32 69, i32 70, i32 71, i32 196, i32 197, i32 198, i32 199, i32 76, i32 77, i32 
78, i32 79, i32 204, i32 205, i32 206, i32 207, i32 96, i32 97, i32 98, i32 99, i32 224, i32 225, i32 226, i32 227, i32 104, i32 105, i32 106, i32 107, i32 232, i32 233, i32 234, i32 235, i32 100, i32 101, i32 102, i32 103, i32 228, i32 229, i32 230, i32 231, i32 108, i32 109, i32 110, i32 111, i32 236, i32 237, i32 238, i32 239, i32 80, i32 81, i32 82, i32 83, i32 208, i32 209, i32 210, i32 211, i32 88, i32 89, i32 90, i32 91, i32 216, i32 217, i32 218, i32 219, i32 84, i32 85, i32 86, i32 87, i32 212, i32 213, i32 214, i32 215, i32 92, i32 93, i32 94, i32 95, i32 220, i32 221, i32 222, i32 223, i32 112, i32 113, i32 114, i32 115, i32 240, i32 241, i32 242, i32 243, i32 120, i32 121, i32 122, i32 123, i32 248, i32 249, i32 250, i32 251, i32 116, i32 117, i32 118, i32 119, i32 244, i32 245, i32 246, i32 247, i32 124, i32 125, i32 126, i32 127, i32 252, i32 253, i32 254, i32 255> @@ -683,7 +683,7 @@ define <256 x i8> @vshuff_54(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_55: -; CHECK: [[REG55:r[0-9]+]] = #85 +; CHECK: [[REG55:r[0-9]+]] = #-43 ; CHECK: vshuff(v1,v0,[[REG55]]) define <256 x i8> @vshuff_55(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 1, i32 129, i32 3, i32 131, i32 8, i32 136, i32 10, i32 138, i32 9, i32 137, i32 11, i32 139, i32 4, i32 132, i32 6, i32 134, i32 5, i32 133, i32 7, i32 135, i32 12, i32 140, i32 14, i32 142, i32 13, i32 141, i32 15, i32 143, i32 32, i32 160, i32 34, i32 162, i32 33, i32 161, i32 35, i32 163, i32 40, i32 168, i32 42, i32 170, i32 41, i32 169, i32 43, i32 171, i32 36, i32 164, i32 38, i32 166, i32 37, i32 165, i32 39, i32 167, i32 44, i32 172, i32 46, i32 174, i32 45, i32 173, i32 47, i32 175, i32 16, i32 144, i32 18, i32 146, i32 17, i32 145, i32 19, i32 147, i32 24, i32 152, i32 26, i32 154, i32 25, i32 153, i32 27, i32 155, i32 20, i32 148, i32 22, i32 150, i32 21, i32 149, i32 23, i32 151, i32 28, i32 156, i32 30, i32 158, i32 29, i32 157, i32 31, i32 159, i32 48, i32 176, i32 50, i32 178, i32 49, i32 177, i32 51, i32 179, i32 56, i32 184, i32 58, i32 186, i32 57, i32 185, i32 59, i32 187, i32 52, i32 180, i32 54, i32 182, i32 53, i32 181, i32 55, i32 183, i32 60, i32 188, i32 62, i32 190, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 65, i32 193, i32 67, i32 195, i32 72, i32 200, i32 74, i32 202, i32 73, i32 201, i32 75, i32 203, i32 68, i32 196, i32 70, i32 198, i32 69, i32 197, i32 71, i32 199, i32 76, i32 204, i32 78, i32 206, i32 77, i32 205, i32 79, i32 207, i32 96, i32 224, i32 98, i32 226, i32 97, i32 225, i32 99, i32 227, i32 104, i32 232, i32 106, i32 234, i32 105, i32 233, i32 107, i32 235, i32 100, i32 228, i32 102, i32 230, i32 101, i32 229, i32 103, i32 231, i32 108, i32 236, i32 110, i32 238, i32 109, i32 237, i32 111, i32 239, i32 80, i32 208, i32 82, i32 210, i32 81, i32 209, i32 83, i32 211, i32 88, i32 216, i32 90, i32 218, i32 89, i32 217, i32 91, i32 219, i32 84, i32 212, i32 86, i32 214, i32 85, i32 213, i32 87, i32 215, i32 92, i32 220, i32 94, i32 222, i32 93, i32 221, i32 95, i32 223, i32 112, i32 240, i32 114, i32 242, i32 113, i32 241, i32 115, i32 243, i32 120, i32 248, i32 122, i32 250, i32 121, i32 249, i32 123, i32 251, i32 116, i32 244, i32 118, i32 246, i32 117, i32 245, i32 119, i32 247, i32 124, i32 252, i32 126, i32 254, i32 125, i32 253, i32 127, i32 255> @@ -691,7 +691,7 @@ define <256 x i8> @vshuff_55(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_56: -; 
CHECK: [[REG56:r[0-9]+]] = #86 +; CHECK: [[REG56:r[0-9]+]] = #-42 ; CHECK: vshuff(v1,v0,[[REG56]]) define <256 x i8> @vshuff_56(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 2, i32 3, i32 130, i32 131, i32 8, i32 9, i32 136, i32 137, i32 10, i32 11, i32 138, i32 139, i32 4, i32 5, i32 132, i32 133, i32 6, i32 7, i32 134, i32 135, i32 12, i32 13, i32 140, i32 141, i32 14, i32 15, i32 142, i32 143, i32 32, i32 33, i32 160, i32 161, i32 34, i32 35, i32 162, i32 163, i32 40, i32 41, i32 168, i32 169, i32 42, i32 43, i32 170, i32 171, i32 36, i32 37, i32 164, i32 165, i32 38, i32 39, i32 166, i32 167, i32 44, i32 45, i32 172, i32 173, i32 46, i32 47, i32 174, i32 175, i32 16, i32 17, i32 144, i32 145, i32 18, i32 19, i32 146, i32 147, i32 24, i32 25, i32 152, i32 153, i32 26, i32 27, i32 154, i32 155, i32 20, i32 21, i32 148, i32 149, i32 22, i32 23, i32 150, i32 151, i32 28, i32 29, i32 156, i32 157, i32 30, i32 31, i32 158, i32 159, i32 48, i32 49, i32 176, i32 177, i32 50, i32 51, i32 178, i32 179, i32 56, i32 57, i32 184, i32 185, i32 58, i32 59, i32 186, i32 187, i32 52, i32 53, i32 180, i32 181, i32 54, i32 55, i32 182, i32 183, i32 60, i32 61, i32 188, i32 189, i32 62, i32 63, i32 190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 66, i32 67, i32 194, i32 195, i32 72, i32 73, i32 200, i32 201, i32 74, i32 75, i32 202, i32 203, i32 68, i32 69, i32 196, i32 197, i32 70, i32 71, i32 198, i32 199, i32 76, i32 77, i32 204, i32 205, i32 78, i32 79, i32 206, i32 207, i32 96, i32 97, i32 224, i32 225, i32 98, i32 99, i32 226, i32 227, i32 104, i32 105, i32 232, i32 233, i32 106, i32 107, i32 234, i32 235, i32 100, i32 101, i32 228, i32 229, i32 102, i32 103, i32 230, i32 231, i32 108, i32 109, i32 236, i32 237, i32 110, i32 111, i32 238, i32 239, i32 80, i32 81, i32 208, i32 209, i32 82, i32 83, i32 210, i32 211, i32 88, i32 89, i32 216, i32 217, i32 90, i32 91, i32 218, i32 219, i32 84, i32 85, i32 212, i32 213, i32 86, i32 87, i32 214, i32 215, i32 92, i32 93, i32 220, i32 221, i32 94, i32 95, i32 222, i32 223, i32 112, i32 113, i32 240, i32 241, i32 114, i32 115, i32 242, i32 243, i32 120, i32 121, i32 248, i32 249, i32 122, i32 123, i32 250, i32 251, i32 116, i32 117, i32 244, i32 245, i32 118, i32 119, i32 246, i32 247, i32 124, i32 125, i32 252, i32 253, i32 126, i32 127, i32 254, i32 255> @@ -699,7 +699,7 @@ define <256 x i8> @vshuff_56(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_57: -; CHECK: [[REG57:r[0-9]+]] = #87 +; CHECK: [[REG57:r[0-9]+]] = #-41 ; CHECK: vshuff(v1,v0,[[REG57]]) define <256 x i8> @vshuff_57(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 2, i32 130, i32 3, i32 131, i32 8, i32 136, i32 9, i32 137, i32 10, i32 138, i32 11, i32 139, i32 4, i32 132, i32 5, i32 133, i32 6, i32 134, i32 7, i32 135, i32 12, i32 140, i32 13, i32 141, i32 14, i32 142, i32 15, i32 143, i32 32, i32 160, i32 33, i32 161, i32 34, i32 162, i32 35, i32 163, i32 40, i32 168, i32 41, i32 169, i32 42, i32 170, i32 43, i32 171, i32 36, i32 164, i32 37, i32 165, i32 38, i32 166, i32 39, i32 167, i32 44, i32 172, i32 45, i32 173, i32 46, i32 174, i32 47, i32 175, i32 16, i32 144, i32 17, i32 145, i32 18, i32 146, i32 19, i32 147, i32 24, i32 152, i32 25, i32 153, i32 26, i32 154, i32 27, i32 155, i32 20, i32 148, i32 21, i32 149, i32 22, i32 150, i32 23, i32 151, i32 28, i32 156, i32 29, i32 157, i32 30, 
i32 158, i32 31, i32 159, i32 48, i32 176, i32 49, i32 177, i32 50, i32 178, i32 51, i32 179, i32 56, i32 184, i32 57, i32 185, i32 58, i32 186, i32 59, i32 187, i32 52, i32 180, i32 53, i32 181, i32 54, i32 182, i32 55, i32 183, i32 60, i32 188, i32 61, i32 189, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 65, i32 193, i32 66, i32 194, i32 67, i32 195, i32 72, i32 200, i32 73, i32 201, i32 74, i32 202, i32 75, i32 203, i32 68, i32 196, i32 69, i32 197, i32 70, i32 198, i32 71, i32 199, i32 76, i32 204, i32 77, i32 205, i32 78, i32 206, i32 79, i32 207, i32 96, i32 224, i32 97, i32 225, i32 98, i32 226, i32 99, i32 227, i32 104, i32 232, i32 105, i32 233, i32 106, i32 234, i32 107, i32 235, i32 100, i32 228, i32 101, i32 229, i32 102, i32 230, i32 103, i32 231, i32 108, i32 236, i32 109, i32 237, i32 110, i32 238, i32 111, i32 239, i32 80, i32 208, i32 81, i32 209, i32 82, i32 210, i32 83, i32 211, i32 88, i32 216, i32 89, i32 217, i32 90, i32 218, i32 91, i32 219, i32 84, i32 212, i32 85, i32 213, i32 86, i32 214, i32 87, i32 215, i32 92, i32 220, i32 93, i32 221, i32 94, i32 222, i32 95, i32 223, i32 112, i32 240, i32 113, i32 241, i32 114, i32 242, i32 115, i32 243, i32 120, i32 248, i32 121, i32 249, i32 122, i32 250, i32 123, i32 251, i32 116, i32 244, i32 117, i32 245, i32 118, i32 246, i32 119, i32 247, i32 124, i32 252, i32 125, i32 253, i32 126, i32 254, i32 127, i32 255> @@ -707,7 +707,7 @@ define <256 x i8> @vshuff_57(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_58: -; CHECK: [[REG58:r[0-9]+]] = #88 +; CHECK: [[REG58:r[0-9]+]] = #-40 ; CHECK: vshuff(v1,v0,[[REG58]]) define <256 x i8> @vshuff_58(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 88, i32 89, i32 
90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -715,7 +715,7 @@ define <256 x i8> @vshuff_58(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_59: -; CHECK: [[REG59:r[0-9]+]] = #89 +; CHECK: [[REG59:r[0-9]+]] = #-39 ; CHECK: vshuff(v1,v0,[[REG59]]) define <256 x i8> @vshuff_59(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 4, i32 132, i32 6, i32 134, i32 1, i32 129, i32 3, i32 131, i32 5, i32 133, i32 7, i32 135, i32 8, i32 136, i32 10, i32 138, i32 12, i32 140, i32 14, i32 142, i32 9, i32 137, i32 11, i32 139, i32 13, i32 141, i32 15, i32 143, i32 32, i32 160, i32 34, i32 162, i32 36, i32 164, i32 38, i32 166, i32 33, i32 161, i32 35, i32 163, i32 37, i32 165, i32 39, i32 167, i32 40, i32 168, i32 42, i32 170, i32 44, i32 172, i32 46, i32 174, i32 41, i32 169, i32 43, i32 171, i32 45, i32 173, i32 47, i32 175, i32 16, i32 144, i32 18, i32 146, i32 20, i32 148, i32 22, i32 150, i32 17, i32 145, i32 19, i32 147, i32 21, i32 149, i32 23, i32 151, i32 24, i32 152, i32 26, i32 154, i32 28, i32 156, i32 30, i32 158, i32 25, i32 153, i32 27, i32 155, i32 29, i32 157, i32 31, i32 159, i32 48, i32 176, i32 50, i32 178, i32 52, i32 180, i32 54, i32 182, i32 49, i32 177, i32 51, i32 179, i32 53, i32 181, i32 55, i32 183, i32 56, i32 184, i32 58, i32 186, i32 60, i32 188, i32 62, i32 190, i32 57, i32 185, i32 59, i32 187, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 68, i32 196, i32 70, i32 198, i32 65, i32 193, i32 67, i32 195, i32 69, i32 197, i32 71, i32 199, i32 72, i32 200, i32 74, i32 202, i32 76, i32 204, i32 78, i32 206, i32 73, i32 201, i32 75, i32 203, i32 77, i32 205, i32 79, i32 207, i32 96, i32 224, i32 98, i32 226, i32 100, i32 228, i32 102, i32 230, i32 97, i32 225, i32 99, i32 227, i32 101, i32 229, i32 103, i32 231, i32 104, i32 232, i32 106, i32 234, i32 108, i32 236, i32 110, i32 238, i32 105, i32 233, i32 107, i32 235, i32 109, i32 237, i32 111, i32 239, i32 80, i32 208, i32 82, i32 210, i32 84, i32 212, i32 86, i32 214, i32 81, i32 209, i32 83, i32 211, i32 85, i32 213, i32 87, i32 215, i32 88, i32 216, i32 90, i32 218, i32 92, i32 220, i32 94, i32 222, i32 89, i32 217, i32 91, i32 219, i32 93, i32 221, i32 95, i32 223, i32 112, i32 240, i32 114, i32 242, i32 116, i32 244, i32 118, i32 246, i32 113, i32 241, i32 115, i32 243, i32 117, i32 245, i32 119, i32 247, i32 120, i32 248, i32 122, i32 250, i32 124, i32 252, i32 126, i32 254, i32 121, i32 249, i32 123, i32 251, i32 125, i32 253, i32 127, i32 255> @@ -723,7 +723,7 @@ define <256 x i8> @vshuff_59(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_5a: -; CHECK: [[REG5a:r[0-9]+]] = #90 +; CHECK: [[REG5a:r[0-9]+]] = #-38 ; CHECK: vshuff(v1,v0,[[REG5a]]) define <256 x i8> @vshuff_5a(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 4, i32 5, i32 132, i32 133, i32 2, i32 3, i32 130, i32 131, i32 6, i32 7, i32 134, i32 135, i32 8, i32 9, i32 136, i32 137, i32 12, i32 13, i32 140, i32 141, i32 10, i32 11, i32 138, i32 139, i32 14, i32 15, i32 142, i32 
143, i32 32, i32 33, i32 160, i32 161, i32 36, i32 37, i32 164, i32 165, i32 34, i32 35, i32 162, i32 163, i32 38, i32 39, i32 166, i32 167, i32 40, i32 41, i32 168, i32 169, i32 44, i32 45, i32 172, i32 173, i32 42, i32 43, i32 170, i32 171, i32 46, i32 47, i32 174, i32 175, i32 16, i32 17, i32 144, i32 145, i32 20, i32 21, i32 148, i32 149, i32 18, i32 19, i32 146, i32 147, i32 22, i32 23, i32 150, i32 151, i32 24, i32 25, i32 152, i32 153, i32 28, i32 29, i32 156, i32 157, i32 26, i32 27, i32 154, i32 155, i32 30, i32 31, i32 158, i32 159, i32 48, i32 49, i32 176, i32 177, i32 52, i32 53, i32 180, i32 181, i32 50, i32 51, i32 178, i32 179, i32 54, i32 55, i32 182, i32 183, i32 56, i32 57, i32 184, i32 185, i32 60, i32 61, i32 188, i32 189, i32 58, i32 59, i32 186, i32 187, i32 62, i32 63, i32 190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 68, i32 69, i32 196, i32 197, i32 66, i32 67, i32 194, i32 195, i32 70, i32 71, i32 198, i32 199, i32 72, i32 73, i32 200, i32 201, i32 76, i32 77, i32 204, i32 205, i32 74, i32 75, i32 202, i32 203, i32 78, i32 79, i32 206, i32 207, i32 96, i32 97, i32 224, i32 225, i32 100, i32 101, i32 228, i32 229, i32 98, i32 99, i32 226, i32 227, i32 102, i32 103, i32 230, i32 231, i32 104, i32 105, i32 232, i32 233, i32 108, i32 109, i32 236, i32 237, i32 106, i32 107, i32 234, i32 235, i32 110, i32 111, i32 238, i32 239, i32 80, i32 81, i32 208, i32 209, i32 84, i32 85, i32 212, i32 213, i32 82, i32 83, i32 210, i32 211, i32 86, i32 87, i32 214, i32 215, i32 88, i32 89, i32 216, i32 217, i32 92, i32 93, i32 220, i32 221, i32 90, i32 91, i32 218, i32 219, i32 94, i32 95, i32 222, i32 223, i32 112, i32 113, i32 240, i32 241, i32 116, i32 117, i32 244, i32 245, i32 114, i32 115, i32 242, i32 243, i32 118, i32 119, i32 246, i32 247, i32 120, i32 121, i32 248, i32 249, i32 124, i32 125, i32 252, i32 253, i32 122, i32 123, i32 250, i32 251, i32 126, i32 127, i32 254, i32 255> @@ -731,7 +731,7 @@ define <256 x i8> @vshuff_5a(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_5b: -; CHECK: [[REG5b:r[0-9]+]] = #91 +; CHECK: [[REG5b:r[0-9]+]] = #-37 ; CHECK: vshuff(v1,v0,[[REG5b]]) define <256 x i8> @vshuff_5b(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 4, i32 132, i32 5, i32 133, i32 2, i32 130, i32 3, i32 131, i32 6, i32 134, i32 7, i32 135, i32 8, i32 136, i32 9, i32 137, i32 12, i32 140, i32 13, i32 141, i32 10, i32 138, i32 11, i32 139, i32 14, i32 142, i32 15, i32 143, i32 32, i32 160, i32 33, i32 161, i32 36, i32 164, i32 37, i32 165, i32 34, i32 162, i32 35, i32 163, i32 38, i32 166, i32 39, i32 167, i32 40, i32 168, i32 41, i32 169, i32 44, i32 172, i32 45, i32 173, i32 42, i32 170, i32 43, i32 171, i32 46, i32 174, i32 47, i32 175, i32 16, i32 144, i32 17, i32 145, i32 20, i32 148, i32 21, i32 149, i32 18, i32 146, i32 19, i32 147, i32 22, i32 150, i32 23, i32 151, i32 24, i32 152, i32 25, i32 153, i32 28, i32 156, i32 29, i32 157, i32 26, i32 154, i32 27, i32 155, i32 30, i32 158, i32 31, i32 159, i32 48, i32 176, i32 49, i32 177, i32 52, i32 180, i32 53, i32 181, i32 50, i32 178, i32 51, i32 179, i32 54, i32 182, i32 55, i32 183, i32 56, i32 184, i32 57, i32 185, i32 60, i32 188, i32 61, i32 189, i32 58, i32 186, i32 59, i32 187, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 65, i32 193, i32 68, i32 196, i32 69, i32 197, i32 66, i32 194, i32 67, i32 195, i32 70, i32 198, i32 71, i32 199, i32 72, i32 200, i32 73, i32 201, i32 76, i32 204, i32 
77, i32 205, i32 74, i32 202, i32 75, i32 203, i32 78, i32 206, i32 79, i32 207, i32 96, i32 224, i32 97, i32 225, i32 100, i32 228, i32 101, i32 229, i32 98, i32 226, i32 99, i32 227, i32 102, i32 230, i32 103, i32 231, i32 104, i32 232, i32 105, i32 233, i32 108, i32 236, i32 109, i32 237, i32 106, i32 234, i32 107, i32 235, i32 110, i32 238, i32 111, i32 239, i32 80, i32 208, i32 81, i32 209, i32 84, i32 212, i32 85, i32 213, i32 82, i32 210, i32 83, i32 211, i32 86, i32 214, i32 87, i32 215, i32 88, i32 216, i32 89, i32 217, i32 92, i32 220, i32 93, i32 221, i32 90, i32 218, i32 91, i32 219, i32 94, i32 222, i32 95, i32 223, i32 112, i32 240, i32 113, i32 241, i32 116, i32 244, i32 117, i32 245, i32 114, i32 242, i32 115, i32 243, i32 118, i32 246, i32 119, i32 247, i32 120, i32 248, i32 121, i32 249, i32 124, i32 252, i32 125, i32 253, i32 122, i32 250, i32 123, i32 251, i32 126, i32 254, i32 127, i32 255> @@ -739,7 +739,7 @@ define <256 x i8> @vshuff_5b(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_5c: -; CHECK: [[REG5c:r[0-9]+]] = #92 +; CHECK: [[REG5c:r[0-9]+]] = #-36 ; CHECK: vshuff(v1,v0,[[REG5c]]) define <256 x i8> @vshuff_5c(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 128, i32 129, i32 130, i32 131, i32 4, i32 5, i32 6, i32 7, i32 132, i32 133, i32 134, i32 135, i32 8, i32 9, i32 10, i32 11, i32 136, i32 137, i32 138, i32 139, i32 12, i32 13, i32 14, i32 15, i32 140, i32 141, i32 142, i32 143, i32 32, i32 33, i32 34, i32 35, i32 160, i32 161, i32 162, i32 163, i32 36, i32 37, i32 38, i32 39, i32 164, i32 165, i32 166, i32 167, i32 40, i32 41, i32 42, i32 43, i32 168, i32 169, i32 170, i32 171, i32 44, i32 45, i32 46, i32 47, i32 172, i32 173, i32 174, i32 175, i32 16, i32 17, i32 18, i32 19, i32 144, i32 145, i32 146, i32 147, i32 20, i32 21, i32 22, i32 23, i32 148, i32 149, i32 150, i32 151, i32 24, i32 25, i32 26, i32 27, i32 152, i32 153, i32 154, i32 155, i32 28, i32 29, i32 30, i32 31, i32 156, i32 157, i32 158, i32 159, i32 48, i32 49, i32 50, i32 51, i32 176, i32 177, i32 178, i32 179, i32 52, i32 53, i32 54, i32 55, i32 180, i32 181, i32 182, i32 183, i32 56, i32 57, i32 58, i32 59, i32 184, i32 185, i32 186, i32 187, i32 60, i32 61, i32 62, i32 63, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 192, i32 193, i32 194, i32 195, i32 68, i32 69, i32 70, i32 71, i32 196, i32 197, i32 198, i32 199, i32 72, i32 73, i32 74, i32 75, i32 200, i32 201, i32 202, i32 203, i32 76, i32 77, i32 78, i32 79, i32 204, i32 205, i32 206, i32 207, i32 96, i32 97, i32 98, i32 99, i32 224, i32 225, i32 226, i32 227, i32 100, i32 101, i32 102, i32 103, i32 228, i32 229, i32 230, i32 231, i32 104, i32 105, i32 106, i32 107, i32 232, i32 233, i32 234, i32 235, i32 108, i32 109, i32 110, i32 111, i32 236, i32 237, i32 238, i32 239, i32 80, i32 81, i32 82, i32 83, i32 208, i32 209, i32 210, i32 211, i32 84, i32 85, i32 86, i32 87, i32 212, i32 213, i32 214, i32 215, i32 88, i32 89, i32 90, i32 91, i32 216, i32 217, i32 218, i32 219, i32 92, i32 93, i32 94, i32 95, i32 220, i32 221, i32 222, i32 223, i32 112, i32 113, i32 114, i32 115, i32 240, i32 241, i32 242, i32 243, i32 116, i32 117, i32 118, i32 119, i32 244, i32 245, i32 246, i32 247, i32 120, i32 121, i32 122, i32 123, i32 248, i32 249, i32 250, i32 251, i32 124, i32 125, i32 126, i32 127, i32 252, i32 253, i32 254, i32 255> @@ -747,7 +747,7 @@ define <256 x i8> @vshuff_5c(<256 x i8> %v0, <256 x i8> %v1) #0 { 
} ; CHECK-LABEL: vshuff_5d: -; CHECK: [[REG5d:r[0-9]+]] = #93 +; CHECK: [[REG5d:r[0-9]+]] = #-35 ; CHECK: vshuff(v1,v0,[[REG5d]]) define <256 x i8> @vshuff_5d(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 1, i32 129, i32 3, i32 131, i32 4, i32 132, i32 6, i32 134, i32 5, i32 133, i32 7, i32 135, i32 8, i32 136, i32 10, i32 138, i32 9, i32 137, i32 11, i32 139, i32 12, i32 140, i32 14, i32 142, i32 13, i32 141, i32 15, i32 143, i32 32, i32 160, i32 34, i32 162, i32 33, i32 161, i32 35, i32 163, i32 36, i32 164, i32 38, i32 166, i32 37, i32 165, i32 39, i32 167, i32 40, i32 168, i32 42, i32 170, i32 41, i32 169, i32 43, i32 171, i32 44, i32 172, i32 46, i32 174, i32 45, i32 173, i32 47, i32 175, i32 16, i32 144, i32 18, i32 146, i32 17, i32 145, i32 19, i32 147, i32 20, i32 148, i32 22, i32 150, i32 21, i32 149, i32 23, i32 151, i32 24, i32 152, i32 26, i32 154, i32 25, i32 153, i32 27, i32 155, i32 28, i32 156, i32 30, i32 158, i32 29, i32 157, i32 31, i32 159, i32 48, i32 176, i32 50, i32 178, i32 49, i32 177, i32 51, i32 179, i32 52, i32 180, i32 54, i32 182, i32 53, i32 181, i32 55, i32 183, i32 56, i32 184, i32 58, i32 186, i32 57, i32 185, i32 59, i32 187, i32 60, i32 188, i32 62, i32 190, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 65, i32 193, i32 67, i32 195, i32 68, i32 196, i32 70, i32 198, i32 69, i32 197, i32 71, i32 199, i32 72, i32 200, i32 74, i32 202, i32 73, i32 201, i32 75, i32 203, i32 76, i32 204, i32 78, i32 206, i32 77, i32 205, i32 79, i32 207, i32 96, i32 224, i32 98, i32 226, i32 97, i32 225, i32 99, i32 227, i32 100, i32 228, i32 102, i32 230, i32 101, i32 229, i32 103, i32 231, i32 104, i32 232, i32 106, i32 234, i32 105, i32 233, i32 107, i32 235, i32 108, i32 236, i32 110, i32 238, i32 109, i32 237, i32 111, i32 239, i32 80, i32 208, i32 82, i32 210, i32 81, i32 209, i32 83, i32 211, i32 84, i32 212, i32 86, i32 214, i32 85, i32 213, i32 87, i32 215, i32 88, i32 216, i32 90, i32 218, i32 89, i32 217, i32 91, i32 219, i32 92, i32 220, i32 94, i32 222, i32 93, i32 221, i32 95, i32 223, i32 112, i32 240, i32 114, i32 242, i32 113, i32 241, i32 115, i32 243, i32 116, i32 244, i32 118, i32 246, i32 117, i32 245, i32 119, i32 247, i32 120, i32 248, i32 122, i32 250, i32 121, i32 249, i32 123, i32 251, i32 124, i32 252, i32 126, i32 254, i32 125, i32 253, i32 127, i32 255> @@ -755,7 +755,7 @@ define <256 x i8> @vshuff_5d(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_5e: -; CHECK: [[REG5e:r[0-9]+]] = #94 +; CHECK: [[REG5e:r[0-9]+]] = #-34 ; CHECK: vshuff(v1,v0,[[REG5e]]) define <256 x i8> @vshuff_5e(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 2, i32 3, i32 130, i32 131, i32 4, i32 5, i32 132, i32 133, i32 6, i32 7, i32 134, i32 135, i32 8, i32 9, i32 136, i32 137, i32 10, i32 11, i32 138, i32 139, i32 12, i32 13, i32 140, i32 141, i32 14, i32 15, i32 142, i32 143, i32 32, i32 33, i32 160, i32 161, i32 34, i32 35, i32 162, i32 163, i32 36, i32 37, i32 164, i32 165, i32 38, i32 39, i32 166, i32 167, i32 40, i32 41, i32 168, i32 169, i32 42, i32 43, i32 170, i32 171, i32 44, i32 45, i32 172, i32 173, i32 46, i32 47, i32 174, i32 175, i32 16, i32 17, i32 144, i32 145, i32 18, i32 19, i32 146, i32 147, i32 20, i32 21, i32 148, i32 149, i32 22, i32 23, i32 150, i32 151, i32 24, i32 25, i32 152, i32 153, i32 26, i32 27, i32 154, i32 155, i32 28, 
i32 29, i32 156, i32 157, i32 30, i32 31, i32 158, i32 159, i32 48, i32 49, i32 176, i32 177, i32 50, i32 51, i32 178, i32 179, i32 52, i32 53, i32 180, i32 181, i32 54, i32 55, i32 182, i32 183, i32 56, i32 57, i32 184, i32 185, i32 58, i32 59, i32 186, i32 187, i32 60, i32 61, i32 188, i32 189, i32 62, i32 63, i32 190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 66, i32 67, i32 194, i32 195, i32 68, i32 69, i32 196, i32 197, i32 70, i32 71, i32 198, i32 199, i32 72, i32 73, i32 200, i32 201, i32 74, i32 75, i32 202, i32 203, i32 76, i32 77, i32 204, i32 205, i32 78, i32 79, i32 206, i32 207, i32 96, i32 97, i32 224, i32 225, i32 98, i32 99, i32 226, i32 227, i32 100, i32 101, i32 228, i32 229, i32 102, i32 103, i32 230, i32 231, i32 104, i32 105, i32 232, i32 233, i32 106, i32 107, i32 234, i32 235, i32 108, i32 109, i32 236, i32 237, i32 110, i32 111, i32 238, i32 239, i32 80, i32 81, i32 208, i32 209, i32 82, i32 83, i32 210, i32 211, i32 84, i32 85, i32 212, i32 213, i32 86, i32 87, i32 214, i32 215, i32 88, i32 89, i32 216, i32 217, i32 90, i32 91, i32 218, i32 219, i32 92, i32 93, i32 220, i32 221, i32 94, i32 95, i32 222, i32 223, i32 112, i32 113, i32 240, i32 241, i32 114, i32 115, i32 242, i32 243, i32 116, i32 117, i32 244, i32 245, i32 118, i32 119, i32 246, i32 247, i32 120, i32 121, i32 248, i32 249, i32 122, i32 123, i32 250, i32 251, i32 124, i32 125, i32 252, i32 253, i32 126, i32 127, i32 254, i32 255> @@ -763,7 +763,7 @@ define <256 x i8> @vshuff_5e(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_5f: -; CHECK: [[REG5f:r[0-9]+]] = #95 +; CHECK: [[REG5f:r[0-9]+]] = #-33 ; CHECK: vshuff(v1,v0,[[REG5f]]) define <256 x i8> @vshuff_5f(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 2, i32 130, i32 3, i32 131, i32 4, i32 132, i32 5, i32 133, i32 6, i32 134, i32 7, i32 135, i32 8, i32 136, i32 9, i32 137, i32 10, i32 138, i32 11, i32 139, i32 12, i32 140, i32 13, i32 141, i32 14, i32 142, i32 15, i32 143, i32 32, i32 160, i32 33, i32 161, i32 34, i32 162, i32 35, i32 163, i32 36, i32 164, i32 37, i32 165, i32 38, i32 166, i32 39, i32 167, i32 40, i32 168, i32 41, i32 169, i32 42, i32 170, i32 43, i32 171, i32 44, i32 172, i32 45, i32 173, i32 46, i32 174, i32 47, i32 175, i32 16, i32 144, i32 17, i32 145, i32 18, i32 146, i32 19, i32 147, i32 20, i32 148, i32 21, i32 149, i32 22, i32 150, i32 23, i32 151, i32 24, i32 152, i32 25, i32 153, i32 26, i32 154, i32 27, i32 155, i32 28, i32 156, i32 29, i32 157, i32 30, i32 158, i32 31, i32 159, i32 48, i32 176, i32 49, i32 177, i32 50, i32 178, i32 51, i32 179, i32 52, i32 180, i32 53, i32 181, i32 54, i32 182, i32 55, i32 183, i32 56, i32 184, i32 57, i32 185, i32 58, i32 186, i32 59, i32 187, i32 60, i32 188, i32 61, i32 189, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 65, i32 193, i32 66, i32 194, i32 67, i32 195, i32 68, i32 196, i32 69, i32 197, i32 70, i32 198, i32 71, i32 199, i32 72, i32 200, i32 73, i32 201, i32 74, i32 202, i32 75, i32 203, i32 76, i32 204, i32 77, i32 205, i32 78, i32 206, i32 79, i32 207, i32 96, i32 224, i32 97, i32 225, i32 98, i32 226, i32 99, i32 227, i32 100, i32 228, i32 101, i32 229, i32 102, i32 230, i32 103, i32 231, i32 104, i32 232, i32 105, i32 233, i32 106, i32 234, i32 107, i32 235, i32 108, i32 236, i32 109, i32 237, i32 110, i32 238, i32 111, i32 239, i32 80, i32 208, i32 81, i32 209, i32 82, i32 210, i32 83, i32 211, i32 84, i32 212, i32 85, i32 213, i32 86, i32 214, i32 
87, i32 215, i32 88, i32 216, i32 89, i32 217, i32 90, i32 218, i32 91, i32 219, i32 92, i32 220, i32 93, i32 221, i32 94, i32 222, i32 95, i32 223, i32 112, i32 240, i32 113, i32 241, i32 114, i32 242, i32 115, i32 243, i32 116, i32 244, i32 117, i32 245, i32 118, i32 246, i32 119, i32 247, i32 120, i32 248, i32 121, i32 249, i32 122, i32 250, i32 123, i32 251, i32 124, i32 252, i32 125, i32 253, i32 126, i32 254, i32 127, i32 255> @@ -771,7 +771,7 @@ define <256 x i8> @vshuff_5f(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_60: -; CHECK: [[REG60:r[0-9]+]] = #96 +; CHECK: [[REG60:r[0-9]+]] = #-32 ; CHECK: vshuff(v1,v0,[[REG60]]) define <256 x i8> @vshuff_60(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -779,7 +779,7 @@ define <256 x i8> @vshuff_60(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_61: -; CHECK: [[REG61:r[0-9]+]] = #97 +; CHECK: [[REG61:r[0-9]+]] = #-31 ; CHECK: vshuff(v1,v0,[[REG61]]) define <256 x i8> @vshuff_61(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 4, i32 132, i32 6, i32 134, i32 8, i32 136, i32 10, i32 138, i32 12, i32 140, i32 14, i32 142, i32 16, i32 144, i32 18, i32 146, i32 20, i32 148, i32 22, i32 150, i32 24, i32 152, i32 26, 
i32 154, i32 28, i32 156, i32 30, i32 158, i32 1, i32 129, i32 3, i32 131, i32 5, i32 133, i32 7, i32 135, i32 9, i32 137, i32 11, i32 139, i32 13, i32 141, i32 15, i32 143, i32 17, i32 145, i32 19, i32 147, i32 21, i32 149, i32 23, i32 151, i32 25, i32 153, i32 27, i32 155, i32 29, i32 157, i32 31, i32 159, i32 32, i32 160, i32 34, i32 162, i32 36, i32 164, i32 38, i32 166, i32 40, i32 168, i32 42, i32 170, i32 44, i32 172, i32 46, i32 174, i32 48, i32 176, i32 50, i32 178, i32 52, i32 180, i32 54, i32 182, i32 56, i32 184, i32 58, i32 186, i32 60, i32 188, i32 62, i32 190, i32 33, i32 161, i32 35, i32 163, i32 37, i32 165, i32 39, i32 167, i32 41, i32 169, i32 43, i32 171, i32 45, i32 173, i32 47, i32 175, i32 49, i32 177, i32 51, i32 179, i32 53, i32 181, i32 55, i32 183, i32 57, i32 185, i32 59, i32 187, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 68, i32 196, i32 70, i32 198, i32 72, i32 200, i32 74, i32 202, i32 76, i32 204, i32 78, i32 206, i32 80, i32 208, i32 82, i32 210, i32 84, i32 212, i32 86, i32 214, i32 88, i32 216, i32 90, i32 218, i32 92, i32 220, i32 94, i32 222, i32 65, i32 193, i32 67, i32 195, i32 69, i32 197, i32 71, i32 199, i32 73, i32 201, i32 75, i32 203, i32 77, i32 205, i32 79, i32 207, i32 81, i32 209, i32 83, i32 211, i32 85, i32 213, i32 87, i32 215, i32 89, i32 217, i32 91, i32 219, i32 93, i32 221, i32 95, i32 223, i32 96, i32 224, i32 98, i32 226, i32 100, i32 228, i32 102, i32 230, i32 104, i32 232, i32 106, i32 234, i32 108, i32 236, i32 110, i32 238, i32 112, i32 240, i32 114, i32 242, i32 116, i32 244, i32 118, i32 246, i32 120, i32 248, i32 122, i32 250, i32 124, i32 252, i32 126, i32 254, i32 97, i32 225, i32 99, i32 227, i32 101, i32 229, i32 103, i32 231, i32 105, i32 233, i32 107, i32 235, i32 109, i32 237, i32 111, i32 239, i32 113, i32 241, i32 115, i32 243, i32 117, i32 245, i32 119, i32 247, i32 121, i32 249, i32 123, i32 251, i32 125, i32 253, i32 127, i32 255> @@ -787,7 +787,7 @@ define <256 x i8> @vshuff_61(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_62: -; CHECK: [[REG62:r[0-9]+]] = #98 +; CHECK: [[REG62:r[0-9]+]] = #-30 ; CHECK: vshuff(v1,v0,[[REG62]]) define <256 x i8> @vshuff_62(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 4, i32 5, i32 132, i32 133, i32 8, i32 9, i32 136, i32 137, i32 12, i32 13, i32 140, i32 141, i32 16, i32 17, i32 144, i32 145, i32 20, i32 21, i32 148, i32 149, i32 24, i32 25, i32 152, i32 153, i32 28, i32 29, i32 156, i32 157, i32 2, i32 3, i32 130, i32 131, i32 6, i32 7, i32 134, i32 135, i32 10, i32 11, i32 138, i32 139, i32 14, i32 15, i32 142, i32 143, i32 18, i32 19, i32 146, i32 147, i32 22, i32 23, i32 150, i32 151, i32 26, i32 27, i32 154, i32 155, i32 30, i32 31, i32 158, i32 159, i32 32, i32 33, i32 160, i32 161, i32 36, i32 37, i32 164, i32 165, i32 40, i32 41, i32 168, i32 169, i32 44, i32 45, i32 172, i32 173, i32 48, i32 49, i32 176, i32 177, i32 52, i32 53, i32 180, i32 181, i32 56, i32 57, i32 184, i32 185, i32 60, i32 61, i32 188, i32 189, i32 34, i32 35, i32 162, i32 163, i32 38, i32 39, i32 166, i32 167, i32 42, i32 43, i32 170, i32 171, i32 46, i32 47, i32 174, i32 175, i32 50, i32 51, i32 178, i32 179, i32 54, i32 55, i32 182, i32 183, i32 58, i32 59, i32 186, i32 187, i32 62, i32 63, i32 190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 68, i32 69, i32 196, i32 197, i32 72, i32 73, i32 200, i32 201, i32 76, i32 77, i32 204, i32 205, i32 80, i32 81, i32 
208, i32 209, i32 84, i32 85, i32 212, i32 213, i32 88, i32 89, i32 216, i32 217, i32 92, i32 93, i32 220, i32 221, i32 66, i32 67, i32 194, i32 195, i32 70, i32 71, i32 198, i32 199, i32 74, i32 75, i32 202, i32 203, i32 78, i32 79, i32 206, i32 207, i32 82, i32 83, i32 210, i32 211, i32 86, i32 87, i32 214, i32 215, i32 90, i32 91, i32 218, i32 219, i32 94, i32 95, i32 222, i32 223, i32 96, i32 97, i32 224, i32 225, i32 100, i32 101, i32 228, i32 229, i32 104, i32 105, i32 232, i32 233, i32 108, i32 109, i32 236, i32 237, i32 112, i32 113, i32 240, i32 241, i32 116, i32 117, i32 244, i32 245, i32 120, i32 121, i32 248, i32 249, i32 124, i32 125, i32 252, i32 253, i32 98, i32 99, i32 226, i32 227, i32 102, i32 103, i32 230, i32 231, i32 106, i32 107, i32 234, i32 235, i32 110, i32 111, i32 238, i32 239, i32 114, i32 115, i32 242, i32 243, i32 118, i32 119, i32 246, i32 247, i32 122, i32 123, i32 250, i32 251, i32 126, i32 127, i32 254, i32 255> @@ -795,7 +795,7 @@ define <256 x i8> @vshuff_62(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_63: -; CHECK: [[REG63:r[0-9]+]] = #99 +; CHECK: [[REG63:r[0-9]+]] = #-29 ; CHECK: vshuff(v1,v0,[[REG63]]) define <256 x i8> @vshuff_63(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 4, i32 132, i32 5, i32 133, i32 8, i32 136, i32 9, i32 137, i32 12, i32 140, i32 13, i32 141, i32 16, i32 144, i32 17, i32 145, i32 20, i32 148, i32 21, i32 149, i32 24, i32 152, i32 25, i32 153, i32 28, i32 156, i32 29, i32 157, i32 2, i32 130, i32 3, i32 131, i32 6, i32 134, i32 7, i32 135, i32 10, i32 138, i32 11, i32 139, i32 14, i32 142, i32 15, i32 143, i32 18, i32 146, i32 19, i32 147, i32 22, i32 150, i32 23, i32 151, i32 26, i32 154, i32 27, i32 155, i32 30, i32 158, i32 31, i32 159, i32 32, i32 160, i32 33, i32 161, i32 36, i32 164, i32 37, i32 165, i32 40, i32 168, i32 41, i32 169, i32 44, i32 172, i32 45, i32 173, i32 48, i32 176, i32 49, i32 177, i32 52, i32 180, i32 53, i32 181, i32 56, i32 184, i32 57, i32 185, i32 60, i32 188, i32 61, i32 189, i32 34, i32 162, i32 35, i32 163, i32 38, i32 166, i32 39, i32 167, i32 42, i32 170, i32 43, i32 171, i32 46, i32 174, i32 47, i32 175, i32 50, i32 178, i32 51, i32 179, i32 54, i32 182, i32 55, i32 183, i32 58, i32 186, i32 59, i32 187, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 65, i32 193, i32 68, i32 196, i32 69, i32 197, i32 72, i32 200, i32 73, i32 201, i32 76, i32 204, i32 77, i32 205, i32 80, i32 208, i32 81, i32 209, i32 84, i32 212, i32 85, i32 213, i32 88, i32 216, i32 89, i32 217, i32 92, i32 220, i32 93, i32 221, i32 66, i32 194, i32 67, i32 195, i32 70, i32 198, i32 71, i32 199, i32 74, i32 202, i32 75, i32 203, i32 78, i32 206, i32 79, i32 207, i32 82, i32 210, i32 83, i32 211, i32 86, i32 214, i32 87, i32 215, i32 90, i32 218, i32 91, i32 219, i32 94, i32 222, i32 95, i32 223, i32 96, i32 224, i32 97, i32 225, i32 100, i32 228, i32 101, i32 229, i32 104, i32 232, i32 105, i32 233, i32 108, i32 236, i32 109, i32 237, i32 112, i32 240, i32 113, i32 241, i32 116, i32 244, i32 117, i32 245, i32 120, i32 248, i32 121, i32 249, i32 124, i32 252, i32 125, i32 253, i32 98, i32 226, i32 99, i32 227, i32 102, i32 230, i32 103, i32 231, i32 106, i32 234, i32 107, i32 235, i32 110, i32 238, i32 111, i32 239, i32 114, i32 242, i32 115, i32 243, i32 118, i32 246, i32 119, i32 247, i32 122, i32 250, i32 123, i32 251, i32 126, i32 254, i32 127, i32 255> @@ -803,7 +803,7 @@ define <256 x i8> 
@vshuff_63(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_64: -; CHECK: [[REG64:r[0-9]+]] = #100 +; CHECK: [[REG64:r[0-9]+]] = #-28 ; CHECK: vshuff(v1,v0,[[REG64]]) define <256 x i8> @vshuff_64(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 128, i32 129, i32 130, i32 131, i32 8, i32 9, i32 10, i32 11, i32 136, i32 137, i32 138, i32 139, i32 16, i32 17, i32 18, i32 19, i32 144, i32 145, i32 146, i32 147, i32 24, i32 25, i32 26, i32 27, i32 152, i32 153, i32 154, i32 155, i32 4, i32 5, i32 6, i32 7, i32 132, i32 133, i32 134, i32 135, i32 12, i32 13, i32 14, i32 15, i32 140, i32 141, i32 142, i32 143, i32 20, i32 21, i32 22, i32 23, i32 148, i32 149, i32 150, i32 151, i32 28, i32 29, i32 30, i32 31, i32 156, i32 157, i32 158, i32 159, i32 32, i32 33, i32 34, i32 35, i32 160, i32 161, i32 162, i32 163, i32 40, i32 41, i32 42, i32 43, i32 168, i32 169, i32 170, i32 171, i32 48, i32 49, i32 50, i32 51, i32 176, i32 177, i32 178, i32 179, i32 56, i32 57, i32 58, i32 59, i32 184, i32 185, i32 186, i32 187, i32 36, i32 37, i32 38, i32 39, i32 164, i32 165, i32 166, i32 167, i32 44, i32 45, i32 46, i32 47, i32 172, i32 173, i32 174, i32 175, i32 52, i32 53, i32 54, i32 55, i32 180, i32 181, i32 182, i32 183, i32 60, i32 61, i32 62, i32 63, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 192, i32 193, i32 194, i32 195, i32 72, i32 73, i32 74, i32 75, i32 200, i32 201, i32 202, i32 203, i32 80, i32 81, i32 82, i32 83, i32 208, i32 209, i32 210, i32 211, i32 88, i32 89, i32 90, i32 91, i32 216, i32 217, i32 218, i32 219, i32 68, i32 69, i32 70, i32 71, i32 196, i32 197, i32 198, i32 199, i32 76, i32 77, i32 78, i32 79, i32 204, i32 205, i32 206, i32 207, i32 84, i32 85, i32 86, i32 87, i32 212, i32 213, i32 214, i32 215, i32 92, i32 93, i32 94, i32 95, i32 220, i32 221, i32 222, i32 223, i32 96, i32 97, i32 98, i32 99, i32 224, i32 225, i32 226, i32 227, i32 104, i32 105, i32 106, i32 107, i32 232, i32 233, i32 234, i32 235, i32 112, i32 113, i32 114, i32 115, i32 240, i32 241, i32 242, i32 243, i32 120, i32 121, i32 122, i32 123, i32 248, i32 249, i32 250, i32 251, i32 100, i32 101, i32 102, i32 103, i32 228, i32 229, i32 230, i32 231, i32 108, i32 109, i32 110, i32 111, i32 236, i32 237, i32 238, i32 239, i32 116, i32 117, i32 118, i32 119, i32 244, i32 245, i32 246, i32 247, i32 124, i32 125, i32 126, i32 127, i32 252, i32 253, i32 254, i32 255> @@ -811,7 +811,7 @@ define <256 x i8> @vshuff_64(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_65: -; CHECK: [[REG65:r[0-9]+]] = #101 +; CHECK: [[REG65:r[0-9]+]] = #-27 ; CHECK: vshuff(v1,v0,[[REG65]]) define <256 x i8> @vshuff_65(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 1, i32 129, i32 3, i32 131, i32 8, i32 136, i32 10, i32 138, i32 9, i32 137, i32 11, i32 139, i32 16, i32 144, i32 18, i32 146, i32 17, i32 145, i32 19, i32 147, i32 24, i32 152, i32 26, i32 154, i32 25, i32 153, i32 27, i32 155, i32 4, i32 132, i32 6, i32 134, i32 5, i32 133, i32 7, i32 135, i32 12, i32 140, i32 14, i32 142, i32 13, i32 141, i32 15, i32 143, i32 20, i32 148, i32 22, i32 150, i32 21, i32 149, i32 23, i32 151, i32 28, i32 156, i32 30, i32 158, i32 29, i32 157, i32 31, i32 159, i32 32, i32 160, i32 34, i32 162, i32 33, i32 161, i32 35, i32 163, i32 40, i32 168, i32 42, i32 170, i32 41, i32 169, i32 43, i32 171, i32 48, i32 176, i32 50, i32 
178, i32 49, i32 177, i32 51, i32 179, i32 56, i32 184, i32 58, i32 186, i32 57, i32 185, i32 59, i32 187, i32 36, i32 164, i32 38, i32 166, i32 37, i32 165, i32 39, i32 167, i32 44, i32 172, i32 46, i32 174, i32 45, i32 173, i32 47, i32 175, i32 52, i32 180, i32 54, i32 182, i32 53, i32 181, i32 55, i32 183, i32 60, i32 188, i32 62, i32 190, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 65, i32 193, i32 67, i32 195, i32 72, i32 200, i32 74, i32 202, i32 73, i32 201, i32 75, i32 203, i32 80, i32 208, i32 82, i32 210, i32 81, i32 209, i32 83, i32 211, i32 88, i32 216, i32 90, i32 218, i32 89, i32 217, i32 91, i32 219, i32 68, i32 196, i32 70, i32 198, i32 69, i32 197, i32 71, i32 199, i32 76, i32 204, i32 78, i32 206, i32 77, i32 205, i32 79, i32 207, i32 84, i32 212, i32 86, i32 214, i32 85, i32 213, i32 87, i32 215, i32 92, i32 220, i32 94, i32 222, i32 93, i32 221, i32 95, i32 223, i32 96, i32 224, i32 98, i32 226, i32 97, i32 225, i32 99, i32 227, i32 104, i32 232, i32 106, i32 234, i32 105, i32 233, i32 107, i32 235, i32 112, i32 240, i32 114, i32 242, i32 113, i32 241, i32 115, i32 243, i32 120, i32 248, i32 122, i32 250, i32 121, i32 249, i32 123, i32 251, i32 100, i32 228, i32 102, i32 230, i32 101, i32 229, i32 103, i32 231, i32 108, i32 236, i32 110, i32 238, i32 109, i32 237, i32 111, i32 239, i32 116, i32 244, i32 118, i32 246, i32 117, i32 245, i32 119, i32 247, i32 124, i32 252, i32 126, i32 254, i32 125, i32 253, i32 127, i32 255> @@ -819,7 +819,7 @@ define <256 x i8> @vshuff_65(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_66: -; CHECK: [[REG66:r[0-9]+]] = #102 +; CHECK: [[REG66:r[0-9]+]] = #-26 ; CHECK: vshuff(v1,v0,[[REG66]]) define <256 x i8> @vshuff_66(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 2, i32 3, i32 130, i32 131, i32 8, i32 9, i32 136, i32 137, i32 10, i32 11, i32 138, i32 139, i32 16, i32 17, i32 144, i32 145, i32 18, i32 19, i32 146, i32 147, i32 24, i32 25, i32 152, i32 153, i32 26, i32 27, i32 154, i32 155, i32 4, i32 5, i32 132, i32 133, i32 6, i32 7, i32 134, i32 135, i32 12, i32 13, i32 140, i32 141, i32 14, i32 15, i32 142, i32 143, i32 20, i32 21, i32 148, i32 149, i32 22, i32 23, i32 150, i32 151, i32 28, i32 29, i32 156, i32 157, i32 30, i32 31, i32 158, i32 159, i32 32, i32 33, i32 160, i32 161, i32 34, i32 35, i32 162, i32 163, i32 40, i32 41, i32 168, i32 169, i32 42, i32 43, i32 170, i32 171, i32 48, i32 49, i32 176, i32 177, i32 50, i32 51, i32 178, i32 179, i32 56, i32 57, i32 184, i32 185, i32 58, i32 59, i32 186, i32 187, i32 36, i32 37, i32 164, i32 165, i32 38, i32 39, i32 166, i32 167, i32 44, i32 45, i32 172, i32 173, i32 46, i32 47, i32 174, i32 175, i32 52, i32 53, i32 180, i32 181, i32 54, i32 55, i32 182, i32 183, i32 60, i32 61, i32 188, i32 189, i32 62, i32 63, i32 190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 66, i32 67, i32 194, i32 195, i32 72, i32 73, i32 200, i32 201, i32 74, i32 75, i32 202, i32 203, i32 80, i32 81, i32 208, i32 209, i32 82, i32 83, i32 210, i32 211, i32 88, i32 89, i32 216, i32 217, i32 90, i32 91, i32 218, i32 219, i32 68, i32 69, i32 196, i32 197, i32 70, i32 71, i32 198, i32 199, i32 76, i32 77, i32 204, i32 205, i32 78, i32 79, i32 206, i32 207, i32 84, i32 85, i32 212, i32 213, i32 86, i32 87, i32 214, i32 215, i32 92, i32 93, i32 220, i32 221, i32 94, i32 95, i32 222, i32 223, i32 96, i32 97, i32 224, i32 225, i32 98, i32 99, i32 226, i32 227, i32 104, i32 105, 
i32 232, i32 233, i32 106, i32 107, i32 234, i32 235, i32 112, i32 113, i32 240, i32 241, i32 114, i32 115, i32 242, i32 243, i32 120, i32 121, i32 248, i32 249, i32 122, i32 123, i32 250, i32 251, i32 100, i32 101, i32 228, i32 229, i32 102, i32 103, i32 230, i32 231, i32 108, i32 109, i32 236, i32 237, i32 110, i32 111, i32 238, i32 239, i32 116, i32 117, i32 244, i32 245, i32 118, i32 119, i32 246, i32 247, i32 124, i32 125, i32 252, i32 253, i32 126, i32 127, i32 254, i32 255> @@ -827,7 +827,7 @@ define <256 x i8> @vshuff_66(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_67: -; CHECK: [[REG67:r[0-9]+]] = #103 +; CHECK: [[REG67:r[0-9]+]] = #-25 ; CHECK: vshuff(v1,v0,[[REG67]]) define <256 x i8> @vshuff_67(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 2, i32 130, i32 3, i32 131, i32 8, i32 136, i32 9, i32 137, i32 10, i32 138, i32 11, i32 139, i32 16, i32 144, i32 17, i32 145, i32 18, i32 146, i32 19, i32 147, i32 24, i32 152, i32 25, i32 153, i32 26, i32 154, i32 27, i32 155, i32 4, i32 132, i32 5, i32 133, i32 6, i32 134, i32 7, i32 135, i32 12, i32 140, i32 13, i32 141, i32 14, i32 142, i32 15, i32 143, i32 20, i32 148, i32 21, i32 149, i32 22, i32 150, i32 23, i32 151, i32 28, i32 156, i32 29, i32 157, i32 30, i32 158, i32 31, i32 159, i32 32, i32 160, i32 33, i32 161, i32 34, i32 162, i32 35, i32 163, i32 40, i32 168, i32 41, i32 169, i32 42, i32 170, i32 43, i32 171, i32 48, i32 176, i32 49, i32 177, i32 50, i32 178, i32 51, i32 179, i32 56, i32 184, i32 57, i32 185, i32 58, i32 186, i32 59, i32 187, i32 36, i32 164, i32 37, i32 165, i32 38, i32 166, i32 39, i32 167, i32 44, i32 172, i32 45, i32 173, i32 46, i32 174, i32 47, i32 175, i32 52, i32 180, i32 53, i32 181, i32 54, i32 182, i32 55, i32 183, i32 60, i32 188, i32 61, i32 189, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 65, i32 193, i32 66, i32 194, i32 67, i32 195, i32 72, i32 200, i32 73, i32 201, i32 74, i32 202, i32 75, i32 203, i32 80, i32 208, i32 81, i32 209, i32 82, i32 210, i32 83, i32 211, i32 88, i32 216, i32 89, i32 217, i32 90, i32 218, i32 91, i32 219, i32 68, i32 196, i32 69, i32 197, i32 70, i32 198, i32 71, i32 199, i32 76, i32 204, i32 77, i32 205, i32 78, i32 206, i32 79, i32 207, i32 84, i32 212, i32 85, i32 213, i32 86, i32 214, i32 87, i32 215, i32 92, i32 220, i32 93, i32 221, i32 94, i32 222, i32 95, i32 223, i32 96, i32 224, i32 97, i32 225, i32 98, i32 226, i32 99, i32 227, i32 104, i32 232, i32 105, i32 233, i32 106, i32 234, i32 107, i32 235, i32 112, i32 240, i32 113, i32 241, i32 114, i32 242, i32 115, i32 243, i32 120, i32 248, i32 121, i32 249, i32 122, i32 250, i32 123, i32 251, i32 100, i32 228, i32 101, i32 229, i32 102, i32 230, i32 103, i32 231, i32 108, i32 236, i32 109, i32 237, i32 110, i32 238, i32 111, i32 239, i32 116, i32 244, i32 117, i32 245, i32 118, i32 246, i32 119, i32 247, i32 124, i32 252, i32 125, i32 253, i32 126, i32 254, i32 127, i32 255> @@ -835,7 +835,7 @@ define <256 x i8> @vshuff_67(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_68: -; CHECK: [[REG68:r[0-9]+]] = #104 +; CHECK: [[REG68:r[0-9]+]] = #-24 ; CHECK: vshuff(v1,v0,[[REG68]]) define <256 x i8> @vshuff_68(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 16, i32 17, i32 18, i32 19, i32 20, i32 
21, i32 22, i32 23, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -843,7 +843,7 @@ define <256 x i8> @vshuff_68(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_69: -; CHECK: [[REG69:r[0-9]+]] = #105 +; CHECK: [[REG69:r[0-9]+]] = #-23 ; CHECK: vshuff(v1,v0,[[REG69]]) define <256 x i8> @vshuff_69(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 4, i32 132, i32 6, i32 134, i32 1, i32 129, i32 3, i32 131, i32 5, i32 133, i32 7, i32 135, i32 16, i32 144, i32 18, i32 146, i32 20, i32 148, i32 22, i32 150, i32 17, i32 145, i32 19, i32 147, i32 21, i32 149, i32 23, i32 151, i32 8, i32 136, i32 10, i32 138, i32 12, i32 140, i32 14, i32 142, i32 9, i32 137, i32 11, i32 139, i32 13, i32 141, i32 15, i32 143, i32 24, i32 152, i32 26, i32 154, i32 28, i32 156, i32 30, i32 158, i32 25, i32 153, i32 27, i32 155, i32 29, i32 157, i32 31, i32 159, i32 32, i32 160, i32 34, i32 162, i32 36, i32 164, i32 38, i32 166, i32 33, i32 161, i32 35, i32 163, i32 37, i32 165, i32 39, i32 167, i32 48, i32 176, i32 50, i32 178, i32 52, i32 180, i32 54, i32 182, i32 49, i32 177, i32 51, i32 179, i32 53, i32 181, i32 55, i32 183, i32 40, i32 168, i32 42, i32 170, i32 44, i32 172, i32 46, i32 174, i32 41, i32 169, i32 43, i32 171, i32 45, i32 173, i32 47, i32 175, i32 56, i32 184, i32 58, i32 186, i32 60, i32 188, i32 62, i32 190, i32 57, i32 185, i32 59, i32 187, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 68, i32 196, i32 70, i32 198, i32 65, i32 193, i32 67, i32 195, 
i32 69, i32 197, i32 71, i32 199, i32 80, i32 208, i32 82, i32 210, i32 84, i32 212, i32 86, i32 214, i32 81, i32 209, i32 83, i32 211, i32 85, i32 213, i32 87, i32 215, i32 72, i32 200, i32 74, i32 202, i32 76, i32 204, i32 78, i32 206, i32 73, i32 201, i32 75, i32 203, i32 77, i32 205, i32 79, i32 207, i32 88, i32 216, i32 90, i32 218, i32 92, i32 220, i32 94, i32 222, i32 89, i32 217, i32 91, i32 219, i32 93, i32 221, i32 95, i32 223, i32 96, i32 224, i32 98, i32 226, i32 100, i32 228, i32 102, i32 230, i32 97, i32 225, i32 99, i32 227, i32 101, i32 229, i32 103, i32 231, i32 112, i32 240, i32 114, i32 242, i32 116, i32 244, i32 118, i32 246, i32 113, i32 241, i32 115, i32 243, i32 117, i32 245, i32 119, i32 247, i32 104, i32 232, i32 106, i32 234, i32 108, i32 236, i32 110, i32 238, i32 105, i32 233, i32 107, i32 235, i32 109, i32 237, i32 111, i32 239, i32 120, i32 248, i32 122, i32 250, i32 124, i32 252, i32 126, i32 254, i32 121, i32 249, i32 123, i32 251, i32 125, i32 253, i32 127, i32 255> @@ -851,7 +851,7 @@ define <256 x i8> @vshuff_69(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_6a: -; CHECK: [[REG6a:r[0-9]+]] = #106 +; CHECK: [[REG6a:r[0-9]+]] = #-22 ; CHECK: vshuff(v1,v0,[[REG6a]]) define <256 x i8> @vshuff_6a(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 4, i32 5, i32 132, i32 133, i32 2, i32 3, i32 130, i32 131, i32 6, i32 7, i32 134, i32 135, i32 16, i32 17, i32 144, i32 145, i32 20, i32 21, i32 148, i32 149, i32 18, i32 19, i32 146, i32 147, i32 22, i32 23, i32 150, i32 151, i32 8, i32 9, i32 136, i32 137, i32 12, i32 13, i32 140, i32 141, i32 10, i32 11, i32 138, i32 139, i32 14, i32 15, i32 142, i32 143, i32 24, i32 25, i32 152, i32 153, i32 28, i32 29, i32 156, i32 157, i32 26, i32 27, i32 154, i32 155, i32 30, i32 31, i32 158, i32 159, i32 32, i32 33, i32 160, i32 161, i32 36, i32 37, i32 164, i32 165, i32 34, i32 35, i32 162, i32 163, i32 38, i32 39, i32 166, i32 167, i32 48, i32 49, i32 176, i32 177, i32 52, i32 53, i32 180, i32 181, i32 50, i32 51, i32 178, i32 179, i32 54, i32 55, i32 182, i32 183, i32 40, i32 41, i32 168, i32 169, i32 44, i32 45, i32 172, i32 173, i32 42, i32 43, i32 170, i32 171, i32 46, i32 47, i32 174, i32 175, i32 56, i32 57, i32 184, i32 185, i32 60, i32 61, i32 188, i32 189, i32 58, i32 59, i32 186, i32 187, i32 62, i32 63, i32 190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 68, i32 69, i32 196, i32 197, i32 66, i32 67, i32 194, i32 195, i32 70, i32 71, i32 198, i32 199, i32 80, i32 81, i32 208, i32 209, i32 84, i32 85, i32 212, i32 213, i32 82, i32 83, i32 210, i32 211, i32 86, i32 87, i32 214, i32 215, i32 72, i32 73, i32 200, i32 201, i32 76, i32 77, i32 204, i32 205, i32 74, i32 75, i32 202, i32 203, i32 78, i32 79, i32 206, i32 207, i32 88, i32 89, i32 216, i32 217, i32 92, i32 93, i32 220, i32 221, i32 90, i32 91, i32 218, i32 219, i32 94, i32 95, i32 222, i32 223, i32 96, i32 97, i32 224, i32 225, i32 100, i32 101, i32 228, i32 229, i32 98, i32 99, i32 226, i32 227, i32 102, i32 103, i32 230, i32 231, i32 112, i32 113, i32 240, i32 241, i32 116, i32 117, i32 244, i32 245, i32 114, i32 115, i32 242, i32 243, i32 118, i32 119, i32 246, i32 247, i32 104, i32 105, i32 232, i32 233, i32 108, i32 109, i32 236, i32 237, i32 106, i32 107, i32 234, i32 235, i32 110, i32 111, i32 238, i32 239, i32 120, i32 121, i32 248, i32 249, i32 124, i32 125, i32 252, i32 253, i32 122, i32 123, i32 250, i32 251, i32 126, i32 127, i32 254, i32 
255> @@ -859,7 +859,7 @@ define <256 x i8> @vshuff_6a(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_6b: -; CHECK: [[REG6b:r[0-9]+]] = #107 +; CHECK: [[REG6b:r[0-9]+]] = #-21 ; CHECK: vshuff(v1,v0,[[REG6b]]) define <256 x i8> @vshuff_6b(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 4, i32 132, i32 5, i32 133, i32 2, i32 130, i32 3, i32 131, i32 6, i32 134, i32 7, i32 135, i32 16, i32 144, i32 17, i32 145, i32 20, i32 148, i32 21, i32 149, i32 18, i32 146, i32 19, i32 147, i32 22, i32 150, i32 23, i32 151, i32 8, i32 136, i32 9, i32 137, i32 12, i32 140, i32 13, i32 141, i32 10, i32 138, i32 11, i32 139, i32 14, i32 142, i32 15, i32 143, i32 24, i32 152, i32 25, i32 153, i32 28, i32 156, i32 29, i32 157, i32 26, i32 154, i32 27, i32 155, i32 30, i32 158, i32 31, i32 159, i32 32, i32 160, i32 33, i32 161, i32 36, i32 164, i32 37, i32 165, i32 34, i32 162, i32 35, i32 163, i32 38, i32 166, i32 39, i32 167, i32 48, i32 176, i32 49, i32 177, i32 52, i32 180, i32 53, i32 181, i32 50, i32 178, i32 51, i32 179, i32 54, i32 182, i32 55, i32 183, i32 40, i32 168, i32 41, i32 169, i32 44, i32 172, i32 45, i32 173, i32 42, i32 170, i32 43, i32 171, i32 46, i32 174, i32 47, i32 175, i32 56, i32 184, i32 57, i32 185, i32 60, i32 188, i32 61, i32 189, i32 58, i32 186, i32 59, i32 187, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 65, i32 193, i32 68, i32 196, i32 69, i32 197, i32 66, i32 194, i32 67, i32 195, i32 70, i32 198, i32 71, i32 199, i32 80, i32 208, i32 81, i32 209, i32 84, i32 212, i32 85, i32 213, i32 82, i32 210, i32 83, i32 211, i32 86, i32 214, i32 87, i32 215, i32 72, i32 200, i32 73, i32 201, i32 76, i32 204, i32 77, i32 205, i32 74, i32 202, i32 75, i32 203, i32 78, i32 206, i32 79, i32 207, i32 88, i32 216, i32 89, i32 217, i32 92, i32 220, i32 93, i32 221, i32 90, i32 218, i32 91, i32 219, i32 94, i32 222, i32 95, i32 223, i32 96, i32 224, i32 97, i32 225, i32 100, i32 228, i32 101, i32 229, i32 98, i32 226, i32 99, i32 227, i32 102, i32 230, i32 103, i32 231, i32 112, i32 240, i32 113, i32 241, i32 116, i32 244, i32 117, i32 245, i32 114, i32 242, i32 115, i32 243, i32 118, i32 246, i32 119, i32 247, i32 104, i32 232, i32 105, i32 233, i32 108, i32 236, i32 109, i32 237, i32 106, i32 234, i32 107, i32 235, i32 110, i32 238, i32 111, i32 239, i32 120, i32 248, i32 121, i32 249, i32 124, i32 252, i32 125, i32 253, i32 122, i32 250, i32 123, i32 251, i32 126, i32 254, i32 127, i32 255> @@ -867,7 +867,7 @@ define <256 x i8> @vshuff_6b(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_6c: -; CHECK: [[REG6c:r[0-9]+]] = #108 +; CHECK: [[REG6c:r[0-9]+]] = #-20 ; CHECK: vshuff(v1,v0,[[REG6c]]) define <256 x i8> @vshuff_6c(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 128, i32 129, i32 130, i32 131, i32 4, i32 5, i32 6, i32 7, i32 132, i32 133, i32 134, i32 135, i32 16, i32 17, i32 18, i32 19, i32 144, i32 145, i32 146, i32 147, i32 20, i32 21, i32 22, i32 23, i32 148, i32 149, i32 150, i32 151, i32 8, i32 9, i32 10, i32 11, i32 136, i32 137, i32 138, i32 139, i32 12, i32 13, i32 14, i32 15, i32 140, i32 141, i32 142, i32 143, i32 24, i32 25, i32 26, i32 27, i32 152, i32 153, i32 154, i32 155, i32 28, i32 29, i32 30, i32 31, i32 156, i32 157, i32 158, i32 159, i32 32, i32 33, i32 34, i32 35, i32 160, i32 161, i32 162, i32 163, i32 36, i32 37, i32 38, i32 39, i32 164, i32 165, i32 
166, i32 167, i32 48, i32 49, i32 50, i32 51, i32 176, i32 177, i32 178, i32 179, i32 52, i32 53, i32 54, i32 55, i32 180, i32 181, i32 182, i32 183, i32 40, i32 41, i32 42, i32 43, i32 168, i32 169, i32 170, i32 171, i32 44, i32 45, i32 46, i32 47, i32 172, i32 173, i32 174, i32 175, i32 56, i32 57, i32 58, i32 59, i32 184, i32 185, i32 186, i32 187, i32 60, i32 61, i32 62, i32 63, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 192, i32 193, i32 194, i32 195, i32 68, i32 69, i32 70, i32 71, i32 196, i32 197, i32 198, i32 199, i32 80, i32 81, i32 82, i32 83, i32 208, i32 209, i32 210, i32 211, i32 84, i32 85, i32 86, i32 87, i32 212, i32 213, i32 214, i32 215, i32 72, i32 73, i32 74, i32 75, i32 200, i32 201, i32 202, i32 203, i32 76, i32 77, i32 78, i32 79, i32 204, i32 205, i32 206, i32 207, i32 88, i32 89, i32 90, i32 91, i32 216, i32 217, i32 218, i32 219, i32 92, i32 93, i32 94, i32 95, i32 220, i32 221, i32 222, i32 223, i32 96, i32 97, i32 98, i32 99, i32 224, i32 225, i32 226, i32 227, i32 100, i32 101, i32 102, i32 103, i32 228, i32 229, i32 230, i32 231, i32 112, i32 113, i32 114, i32 115, i32 240, i32 241, i32 242, i32 243, i32 116, i32 117, i32 118, i32 119, i32 244, i32 245, i32 246, i32 247, i32 104, i32 105, i32 106, i32 107, i32 232, i32 233, i32 234, i32 235, i32 108, i32 109, i32 110, i32 111, i32 236, i32 237, i32 238, i32 239, i32 120, i32 121, i32 122, i32 123, i32 248, i32 249, i32 250, i32 251, i32 124, i32 125, i32 126, i32 127, i32 252, i32 253, i32 254, i32 255> @@ -875,7 +875,7 @@ define <256 x i8> @vshuff_6c(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_6d: -; CHECK: [[REG6d:r[0-9]+]] = #109 +; CHECK: [[REG6d:r[0-9]+]] = #-19 ; CHECK: vshuff(v1,v0,[[REG6d]]) define <256 x i8> @vshuff_6d(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 1, i32 129, i32 3, i32 131, i32 4, i32 132, i32 6, i32 134, i32 5, i32 133, i32 7, i32 135, i32 16, i32 144, i32 18, i32 146, i32 17, i32 145, i32 19, i32 147, i32 20, i32 148, i32 22, i32 150, i32 21, i32 149, i32 23, i32 151, i32 8, i32 136, i32 10, i32 138, i32 9, i32 137, i32 11, i32 139, i32 12, i32 140, i32 14, i32 142, i32 13, i32 141, i32 15, i32 143, i32 24, i32 152, i32 26, i32 154, i32 25, i32 153, i32 27, i32 155, i32 28, i32 156, i32 30, i32 158, i32 29, i32 157, i32 31, i32 159, i32 32, i32 160, i32 34, i32 162, i32 33, i32 161, i32 35, i32 163, i32 36, i32 164, i32 38, i32 166, i32 37, i32 165, i32 39, i32 167, i32 48, i32 176, i32 50, i32 178, i32 49, i32 177, i32 51, i32 179, i32 52, i32 180, i32 54, i32 182, i32 53, i32 181, i32 55, i32 183, i32 40, i32 168, i32 42, i32 170, i32 41, i32 169, i32 43, i32 171, i32 44, i32 172, i32 46, i32 174, i32 45, i32 173, i32 47, i32 175, i32 56, i32 184, i32 58, i32 186, i32 57, i32 185, i32 59, i32 187, i32 60, i32 188, i32 62, i32 190, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 65, i32 193, i32 67, i32 195, i32 68, i32 196, i32 70, i32 198, i32 69, i32 197, i32 71, i32 199, i32 80, i32 208, i32 82, i32 210, i32 81, i32 209, i32 83, i32 211, i32 84, i32 212, i32 86, i32 214, i32 85, i32 213, i32 87, i32 215, i32 72, i32 200, i32 74, i32 202, i32 73, i32 201, i32 75, i32 203, i32 76, i32 204, i32 78, i32 206, i32 77, i32 205, i32 79, i32 207, i32 88, i32 216, i32 90, i32 218, i32 89, i32 217, i32 91, i32 219, i32 92, i32 220, i32 94, i32 222, i32 93, i32 221, i32 95, i32 223, i32 96, i32 224, i32 98, i32 226, i32 97, 
i32 225, i32 99, i32 227, i32 100, i32 228, i32 102, i32 230, i32 101, i32 229, i32 103, i32 231, i32 112, i32 240, i32 114, i32 242, i32 113, i32 241, i32 115, i32 243, i32 116, i32 244, i32 118, i32 246, i32 117, i32 245, i32 119, i32 247, i32 104, i32 232, i32 106, i32 234, i32 105, i32 233, i32 107, i32 235, i32 108, i32 236, i32 110, i32 238, i32 109, i32 237, i32 111, i32 239, i32 120, i32 248, i32 122, i32 250, i32 121, i32 249, i32 123, i32 251, i32 124, i32 252, i32 126, i32 254, i32 125, i32 253, i32 127, i32 255> @@ -883,7 +883,7 @@ define <256 x i8> @vshuff_6d(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_6e: -; CHECK: [[REG6e:r[0-9]+]] = #110 +; CHECK: [[REG6e:r[0-9]+]] = #-18 ; CHECK: vshuff(v1,v0,[[REG6e]]) define <256 x i8> @vshuff_6e(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 2, i32 3, i32 130, i32 131, i32 4, i32 5, i32 132, i32 133, i32 6, i32 7, i32 134, i32 135, i32 16, i32 17, i32 144, i32 145, i32 18, i32 19, i32 146, i32 147, i32 20, i32 21, i32 148, i32 149, i32 22, i32 23, i32 150, i32 151, i32 8, i32 9, i32 136, i32 137, i32 10, i32 11, i32 138, i32 139, i32 12, i32 13, i32 140, i32 141, i32 14, i32 15, i32 142, i32 143, i32 24, i32 25, i32 152, i32 153, i32 26, i32 27, i32 154, i32 155, i32 28, i32 29, i32 156, i32 157, i32 30, i32 31, i32 158, i32 159, i32 32, i32 33, i32 160, i32 161, i32 34, i32 35, i32 162, i32 163, i32 36, i32 37, i32 164, i32 165, i32 38, i32 39, i32 166, i32 167, i32 48, i32 49, i32 176, i32 177, i32 50, i32 51, i32 178, i32 179, i32 52, i32 53, i32 180, i32 181, i32 54, i32 55, i32 182, i32 183, i32 40, i32 41, i32 168, i32 169, i32 42, i32 43, i32 170, i32 171, i32 44, i32 45, i32 172, i32 173, i32 46, i32 47, i32 174, i32 175, i32 56, i32 57, i32 184, i32 185, i32 58, i32 59, i32 186, i32 187, i32 60, i32 61, i32 188, i32 189, i32 62, i32 63, i32 190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 66, i32 67, i32 194, i32 195, i32 68, i32 69, i32 196, i32 197, i32 70, i32 71, i32 198, i32 199, i32 80, i32 81, i32 208, i32 209, i32 82, i32 83, i32 210, i32 211, i32 84, i32 85, i32 212, i32 213, i32 86, i32 87, i32 214, i32 215, i32 72, i32 73, i32 200, i32 201, i32 74, i32 75, i32 202, i32 203, i32 76, i32 77, i32 204, i32 205, i32 78, i32 79, i32 206, i32 207, i32 88, i32 89, i32 216, i32 217, i32 90, i32 91, i32 218, i32 219, i32 92, i32 93, i32 220, i32 221, i32 94, i32 95, i32 222, i32 223, i32 96, i32 97, i32 224, i32 225, i32 98, i32 99, i32 226, i32 227, i32 100, i32 101, i32 228, i32 229, i32 102, i32 103, i32 230, i32 231, i32 112, i32 113, i32 240, i32 241, i32 114, i32 115, i32 242, i32 243, i32 116, i32 117, i32 244, i32 245, i32 118, i32 119, i32 246, i32 247, i32 104, i32 105, i32 232, i32 233, i32 106, i32 107, i32 234, i32 235, i32 108, i32 109, i32 236, i32 237, i32 110, i32 111, i32 238, i32 239, i32 120, i32 121, i32 248, i32 249, i32 122, i32 123, i32 250, i32 251, i32 124, i32 125, i32 252, i32 253, i32 126, i32 127, i32 254, i32 255> @@ -891,7 +891,7 @@ define <256 x i8> @vshuff_6e(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_6f: -; CHECK: [[REG6f:r[0-9]+]] = #111 +; CHECK: [[REG6f:r[0-9]+]] = #-17 ; CHECK: vshuff(v1,v0,[[REG6f]]) define <256 x i8> @vshuff_6f(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 2, i32 130, i32 3, i32 131, i32 4, i32 132, i32 5, i32 133, i32 6, i32 134, i32 7, i32 135, 
i32 16, i32 144, i32 17, i32 145, i32 18, i32 146, i32 19, i32 147, i32 20, i32 148, i32 21, i32 149, i32 22, i32 150, i32 23, i32 151, i32 8, i32 136, i32 9, i32 137, i32 10, i32 138, i32 11, i32 139, i32 12, i32 140, i32 13, i32 141, i32 14, i32 142, i32 15, i32 143, i32 24, i32 152, i32 25, i32 153, i32 26, i32 154, i32 27, i32 155, i32 28, i32 156, i32 29, i32 157, i32 30, i32 158, i32 31, i32 159, i32 32, i32 160, i32 33, i32 161, i32 34, i32 162, i32 35, i32 163, i32 36, i32 164, i32 37, i32 165, i32 38, i32 166, i32 39, i32 167, i32 48, i32 176, i32 49, i32 177, i32 50, i32 178, i32 51, i32 179, i32 52, i32 180, i32 53, i32 181, i32 54, i32 182, i32 55, i32 183, i32 40, i32 168, i32 41, i32 169, i32 42, i32 170, i32 43, i32 171, i32 44, i32 172, i32 45, i32 173, i32 46, i32 174, i32 47, i32 175, i32 56, i32 184, i32 57, i32 185, i32 58, i32 186, i32 59, i32 187, i32 60, i32 188, i32 61, i32 189, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 65, i32 193, i32 66, i32 194, i32 67, i32 195, i32 68, i32 196, i32 69, i32 197, i32 70, i32 198, i32 71, i32 199, i32 80, i32 208, i32 81, i32 209, i32 82, i32 210, i32 83, i32 211, i32 84, i32 212, i32 85, i32 213, i32 86, i32 214, i32 87, i32 215, i32 72, i32 200, i32 73, i32 201, i32 74, i32 202, i32 75, i32 203, i32 76, i32 204, i32 77, i32 205, i32 78, i32 206, i32 79, i32 207, i32 88, i32 216, i32 89, i32 217, i32 90, i32 218, i32 91, i32 219, i32 92, i32 220, i32 93, i32 221, i32 94, i32 222, i32 95, i32 223, i32 96, i32 224, i32 97, i32 225, i32 98, i32 226, i32 99, i32 227, i32 100, i32 228, i32 101, i32 229, i32 102, i32 230, i32 103, i32 231, i32 112, i32 240, i32 113, i32 241, i32 114, i32 242, i32 115, i32 243, i32 116, i32 244, i32 117, i32 245, i32 118, i32 246, i32 119, i32 247, i32 104, i32 232, i32 105, i32 233, i32 106, i32 234, i32 107, i32 235, i32 108, i32 236, i32 109, i32 237, i32 110, i32 238, i32 111, i32 239, i32 120, i32 248, i32 121, i32 249, i32 122, i32 250, i32 123, i32 251, i32 124, i32 252, i32 125, i32 253, i32 126, i32 254, i32 127, i32 255> @@ -899,7 +899,7 @@ define <256 x i8> @vshuff_6f(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_70: -; CHECK: [[REG70:r[0-9]+]] = #112 +; CHECK: [[REG70:r[0-9]+]] = #-16 ; CHECK: vshuff(v1,v0,[[REG70]]) define <256 x i8> @vshuff_70(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 
71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -907,7 +907,7 @@ define <256 x i8> @vshuff_70(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_71: -; CHECK: [[REG71:r[0-9]+]] = #113 +; CHECK: [[REG71:r[0-9]+]] = #-15 ; CHECK: vshuff(v1,v0,[[REG71]]) define <256 x i8> @vshuff_71(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 4, i32 132, i32 6, i32 134, i32 8, i32 136, i32 10, i32 138, i32 12, i32 140, i32 14, i32 142, i32 1, i32 129, i32 3, i32 131, i32 5, i32 133, i32 7, i32 135, i32 9, i32 137, i32 11, i32 139, i32 13, i32 141, i32 15, i32 143, i32 16, i32 144, i32 18, i32 146, i32 20, i32 148, i32 22, i32 150, i32 24, i32 152, i32 26, i32 154, i32 28, i32 156, i32 30, i32 158, i32 17, i32 145, i32 19, i32 147, i32 21, i32 149, i32 23, i32 151, i32 25, i32 153, i32 27, i32 155, i32 29, i32 157, i32 31, i32 159, i32 32, i32 160, i32 34, i32 162, i32 36, i32 164, i32 38, i32 166, i32 40, i32 168, i32 42, i32 170, i32 44, i32 172, i32 46, i32 174, i32 33, i32 161, i32 35, i32 163, i32 37, i32 165, i32 39, i32 167, i32 41, i32 169, i32 43, i32 171, i32 45, i32 173, i32 47, i32 175, i32 48, i32 176, i32 50, i32 178, i32 52, i32 180, i32 54, i32 182, i32 56, i32 184, i32 58, i32 186, i32 60, i32 188, i32 62, i32 190, i32 49, i32 177, i32 51, i32 179, i32 53, i32 181, i32 55, i32 183, i32 57, i32 185, i32 59, i32 187, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 68, i32 196, i32 70, i32 198, i32 72, i32 200, i32 74, i32 202, i32 76, i32 204, i32 78, i32 206, i32 65, i32 193, i32 67, i32 195, i32 69, i32 197, i32 71, i32 199, i32 73, i32 201, i32 75, i32 203, i32 77, i32 205, i32 79, i32 207, i32 80, i32 208, i32 82, i32 210, i32 84, i32 212, i32 86, i32 214, i32 88, i32 216, i32 90, i32 218, i32 92, i32 220, i32 94, i32 222, i32 81, i32 209, i32 83, i32 211, i32 85, i32 213, i32 87, i32 215, i32 89, i32 217, i32 91, i32 219, i32 93, i32 221, i32 95, i32 223, i32 96, i32 224, i32 98, i32 226, i32 100, i32 228, i32 102, i32 230, i32 104, i32 232, i32 106, i32 234, i32 108, i32 236, i32 110, i32 238, i32 97, i32 225, i32 99, i32 227, i32 101, i32 229, i32 103, i32 231, i32 105, i32 233, i32 107, i32 235, i32 109, i32 237, i32 111, i32 239, i32 112, i32 240, i32 114, i32 242, i32 116, i32 244, i32 118, i32 246, i32 120, i32 248, i32 122, i32 250, i32 124, i32 252, i32 126, i32 254, i32 113, i32 241, i32 115, i32 243, i32 117, i32 245, i32 119, i32 247, i32 121, i32 249, i32 123, 
i32 251, i32 125, i32 253, i32 127, i32 255> @@ -915,7 +915,7 @@ define <256 x i8> @vshuff_71(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_72: -; CHECK: [[REG72:r[0-9]+]] = #114 +; CHECK: [[REG72:r[0-9]+]] = #-14 ; CHECK: vshuff(v1,v0,[[REG72]]) define <256 x i8> @vshuff_72(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 4, i32 5, i32 132, i32 133, i32 8, i32 9, i32 136, i32 137, i32 12, i32 13, i32 140, i32 141, i32 2, i32 3, i32 130, i32 131, i32 6, i32 7, i32 134, i32 135, i32 10, i32 11, i32 138, i32 139, i32 14, i32 15, i32 142, i32 143, i32 16, i32 17, i32 144, i32 145, i32 20, i32 21, i32 148, i32 149, i32 24, i32 25, i32 152, i32 153, i32 28, i32 29, i32 156, i32 157, i32 18, i32 19, i32 146, i32 147, i32 22, i32 23, i32 150, i32 151, i32 26, i32 27, i32 154, i32 155, i32 30, i32 31, i32 158, i32 159, i32 32, i32 33, i32 160, i32 161, i32 36, i32 37, i32 164, i32 165, i32 40, i32 41, i32 168, i32 169, i32 44, i32 45, i32 172, i32 173, i32 34, i32 35, i32 162, i32 163, i32 38, i32 39, i32 166, i32 167, i32 42, i32 43, i32 170, i32 171, i32 46, i32 47, i32 174, i32 175, i32 48, i32 49, i32 176, i32 177, i32 52, i32 53, i32 180, i32 181, i32 56, i32 57, i32 184, i32 185, i32 60, i32 61, i32 188, i32 189, i32 50, i32 51, i32 178, i32 179, i32 54, i32 55, i32 182, i32 183, i32 58, i32 59, i32 186, i32 187, i32 62, i32 63, i32 190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 68, i32 69, i32 196, i32 197, i32 72, i32 73, i32 200, i32 201, i32 76, i32 77, i32 204, i32 205, i32 66, i32 67, i32 194, i32 195, i32 70, i32 71, i32 198, i32 199, i32 74, i32 75, i32 202, i32 203, i32 78, i32 79, i32 206, i32 207, i32 80, i32 81, i32 208, i32 209, i32 84, i32 85, i32 212, i32 213, i32 88, i32 89, i32 216, i32 217, i32 92, i32 93, i32 220, i32 221, i32 82, i32 83, i32 210, i32 211, i32 86, i32 87, i32 214, i32 215, i32 90, i32 91, i32 218, i32 219, i32 94, i32 95, i32 222, i32 223, i32 96, i32 97, i32 224, i32 225, i32 100, i32 101, i32 228, i32 229, i32 104, i32 105, i32 232, i32 233, i32 108, i32 109, i32 236, i32 237, i32 98, i32 99, i32 226, i32 227, i32 102, i32 103, i32 230, i32 231, i32 106, i32 107, i32 234, i32 235, i32 110, i32 111, i32 238, i32 239, i32 112, i32 113, i32 240, i32 241, i32 116, i32 117, i32 244, i32 245, i32 120, i32 121, i32 248, i32 249, i32 124, i32 125, i32 252, i32 253, i32 114, i32 115, i32 242, i32 243, i32 118, i32 119, i32 246, i32 247, i32 122, i32 123, i32 250, i32 251, i32 126, i32 127, i32 254, i32 255> @@ -923,7 +923,7 @@ define <256 x i8> @vshuff_72(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_73: -; CHECK: [[REG73:r[0-9]+]] = #115 +; CHECK: [[REG73:r[0-9]+]] = #-13 ; CHECK: vshuff(v1,v0,[[REG73]]) define <256 x i8> @vshuff_73(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 4, i32 132, i32 5, i32 133, i32 8, i32 136, i32 9, i32 137, i32 12, i32 140, i32 13, i32 141, i32 2, i32 130, i32 3, i32 131, i32 6, i32 134, i32 7, i32 135, i32 10, i32 138, i32 11, i32 139, i32 14, i32 142, i32 15, i32 143, i32 16, i32 144, i32 17, i32 145, i32 20, i32 148, i32 21, i32 149, i32 24, i32 152, i32 25, i32 153, i32 28, i32 156, i32 29, i32 157, i32 18, i32 146, i32 19, i32 147, i32 22, i32 150, i32 23, i32 151, i32 26, i32 154, i32 27, i32 155, i32 30, i32 158, i32 31, i32 159, i32 32, i32 160, i32 33, i32 161, i32 36, i32 164, i32 37, i32 165, i32 40, i32 
168, i32 41, i32 169, i32 44, i32 172, i32 45, i32 173, i32 34, i32 162, i32 35, i32 163, i32 38, i32 166, i32 39, i32 167, i32 42, i32 170, i32 43, i32 171, i32 46, i32 174, i32 47, i32 175, i32 48, i32 176, i32 49, i32 177, i32 52, i32 180, i32 53, i32 181, i32 56, i32 184, i32 57, i32 185, i32 60, i32 188, i32 61, i32 189, i32 50, i32 178, i32 51, i32 179, i32 54, i32 182, i32 55, i32 183, i32 58, i32 186, i32 59, i32 187, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 65, i32 193, i32 68, i32 196, i32 69, i32 197, i32 72, i32 200, i32 73, i32 201, i32 76, i32 204, i32 77, i32 205, i32 66, i32 194, i32 67, i32 195, i32 70, i32 198, i32 71, i32 199, i32 74, i32 202, i32 75, i32 203, i32 78, i32 206, i32 79, i32 207, i32 80, i32 208, i32 81, i32 209, i32 84, i32 212, i32 85, i32 213, i32 88, i32 216, i32 89, i32 217, i32 92, i32 220, i32 93, i32 221, i32 82, i32 210, i32 83, i32 211, i32 86, i32 214, i32 87, i32 215, i32 90, i32 218, i32 91, i32 219, i32 94, i32 222, i32 95, i32 223, i32 96, i32 224, i32 97, i32 225, i32 100, i32 228, i32 101, i32 229, i32 104, i32 232, i32 105, i32 233, i32 108, i32 236, i32 109, i32 237, i32 98, i32 226, i32 99, i32 227, i32 102, i32 230, i32 103, i32 231, i32 106, i32 234, i32 107, i32 235, i32 110, i32 238, i32 111, i32 239, i32 112, i32 240, i32 113, i32 241, i32 116, i32 244, i32 117, i32 245, i32 120, i32 248, i32 121, i32 249, i32 124, i32 252, i32 125, i32 253, i32 114, i32 242, i32 115, i32 243, i32 118, i32 246, i32 119, i32 247, i32 122, i32 250, i32 123, i32 251, i32 126, i32 254, i32 127, i32 255> @@ -931,7 +931,7 @@ define <256 x i8> @vshuff_73(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_74: -; CHECK: [[REG74:r[0-9]+]] = #116 +; CHECK: [[REG74:r[0-9]+]] = #-12 ; CHECK: vshuff(v1,v0,[[REG74]]) define <256 x i8> @vshuff_74(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 128, i32 129, i32 130, i32 131, i32 8, i32 9, i32 10, i32 11, i32 136, i32 137, i32 138, i32 139, i32 4, i32 5, i32 6, i32 7, i32 132, i32 133, i32 134, i32 135, i32 12, i32 13, i32 14, i32 15, i32 140, i32 141, i32 142, i32 143, i32 16, i32 17, i32 18, i32 19, i32 144, i32 145, i32 146, i32 147, i32 24, i32 25, i32 26, i32 27, i32 152, i32 153, i32 154, i32 155, i32 20, i32 21, i32 22, i32 23, i32 148, i32 149, i32 150, i32 151, i32 28, i32 29, i32 30, i32 31, i32 156, i32 157, i32 158, i32 159, i32 32, i32 33, i32 34, i32 35, i32 160, i32 161, i32 162, i32 163, i32 40, i32 41, i32 42, i32 43, i32 168, i32 169, i32 170, i32 171, i32 36, i32 37, i32 38, i32 39, i32 164, i32 165, i32 166, i32 167, i32 44, i32 45, i32 46, i32 47, i32 172, i32 173, i32 174, i32 175, i32 48, i32 49, i32 50, i32 51, i32 176, i32 177, i32 178, i32 179, i32 56, i32 57, i32 58, i32 59, i32 184, i32 185, i32 186, i32 187, i32 52, i32 53, i32 54, i32 55, i32 180, i32 181, i32 182, i32 183, i32 60, i32 61, i32 62, i32 63, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 192, i32 193, i32 194, i32 195, i32 72, i32 73, i32 74, i32 75, i32 200, i32 201, i32 202, i32 203, i32 68, i32 69, i32 70, i32 71, i32 196, i32 197, i32 198, i32 199, i32 76, i32 77, i32 78, i32 79, i32 204, i32 205, i32 206, i32 207, i32 80, i32 81, i32 82, i32 83, i32 208, i32 209, i32 210, i32 211, i32 88, i32 89, i32 90, i32 91, i32 216, i32 217, i32 218, i32 219, i32 84, i32 85, i32 86, i32 87, i32 212, i32 213, i32 214, i32 215, i32 92, i32 93, i32 94, i32 95, i32 220, i32 221, i32 222, i32 223, 
i32 96, i32 97, i32 98, i32 99, i32 224, i32 225, i32 226, i32 227, i32 104, i32 105, i32 106, i32 107, i32 232, i32 233, i32 234, i32 235, i32 100, i32 101, i32 102, i32 103, i32 228, i32 229, i32 230, i32 231, i32 108, i32 109, i32 110, i32 111, i32 236, i32 237, i32 238, i32 239, i32 112, i32 113, i32 114, i32 115, i32 240, i32 241, i32 242, i32 243, i32 120, i32 121, i32 122, i32 123, i32 248, i32 249, i32 250, i32 251, i32 116, i32 117, i32 118, i32 119, i32 244, i32 245, i32 246, i32 247, i32 124, i32 125, i32 126, i32 127, i32 252, i32 253, i32 254, i32 255> @@ -939,7 +939,7 @@ define <256 x i8> @vshuff_74(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_75: -; CHECK: [[REG75:r[0-9]+]] = #117 +; CHECK: [[REG75:r[0-9]+]] = #-11 ; CHECK: vshuff(v1,v0,[[REG75]]) define <256 x i8> @vshuff_75(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 1, i32 129, i32 3, i32 131, i32 8, i32 136, i32 10, i32 138, i32 9, i32 137, i32 11, i32 139, i32 4, i32 132, i32 6, i32 134, i32 5, i32 133, i32 7, i32 135, i32 12, i32 140, i32 14, i32 142, i32 13, i32 141, i32 15, i32 143, i32 16, i32 144, i32 18, i32 146, i32 17, i32 145, i32 19, i32 147, i32 24, i32 152, i32 26, i32 154, i32 25, i32 153, i32 27, i32 155, i32 20, i32 148, i32 22, i32 150, i32 21, i32 149, i32 23, i32 151, i32 28, i32 156, i32 30, i32 158, i32 29, i32 157, i32 31, i32 159, i32 32, i32 160, i32 34, i32 162, i32 33, i32 161, i32 35, i32 163, i32 40, i32 168, i32 42, i32 170, i32 41, i32 169, i32 43, i32 171, i32 36, i32 164, i32 38, i32 166, i32 37, i32 165, i32 39, i32 167, i32 44, i32 172, i32 46, i32 174, i32 45, i32 173, i32 47, i32 175, i32 48, i32 176, i32 50, i32 178, i32 49, i32 177, i32 51, i32 179, i32 56, i32 184, i32 58, i32 186, i32 57, i32 185, i32 59, i32 187, i32 52, i32 180, i32 54, i32 182, i32 53, i32 181, i32 55, i32 183, i32 60, i32 188, i32 62, i32 190, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 65, i32 193, i32 67, i32 195, i32 72, i32 200, i32 74, i32 202, i32 73, i32 201, i32 75, i32 203, i32 68, i32 196, i32 70, i32 198, i32 69, i32 197, i32 71, i32 199, i32 76, i32 204, i32 78, i32 206, i32 77, i32 205, i32 79, i32 207, i32 80, i32 208, i32 82, i32 210, i32 81, i32 209, i32 83, i32 211, i32 88, i32 216, i32 90, i32 218, i32 89, i32 217, i32 91, i32 219, i32 84, i32 212, i32 86, i32 214, i32 85, i32 213, i32 87, i32 215, i32 92, i32 220, i32 94, i32 222, i32 93, i32 221, i32 95, i32 223, i32 96, i32 224, i32 98, i32 226, i32 97, i32 225, i32 99, i32 227, i32 104, i32 232, i32 106, i32 234, i32 105, i32 233, i32 107, i32 235, i32 100, i32 228, i32 102, i32 230, i32 101, i32 229, i32 103, i32 231, i32 108, i32 236, i32 110, i32 238, i32 109, i32 237, i32 111, i32 239, i32 112, i32 240, i32 114, i32 242, i32 113, i32 241, i32 115, i32 243, i32 120, i32 248, i32 122, i32 250, i32 121, i32 249, i32 123, i32 251, i32 116, i32 244, i32 118, i32 246, i32 117, i32 245, i32 119, i32 247, i32 124, i32 252, i32 126, i32 254, i32 125, i32 253, i32 127, i32 255> @@ -947,7 +947,7 @@ define <256 x i8> @vshuff_75(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_76: -; CHECK: [[REG76:r[0-9]+]] = #118 +; CHECK: [[REG76:r[0-9]+]] = #-10 ; CHECK: vshuff(v1,v0,[[REG76]]) define <256 x i8> @vshuff_76(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 2, i32 3, i32 130, i32 131, i32 8, i32 9, i32 136, 
i32 137, i32 10, i32 11, i32 138, i32 139, i32 4, i32 5, i32 132, i32 133, i32 6, i32 7, i32 134, i32 135, i32 12, i32 13, i32 140, i32 141, i32 14, i32 15, i32 142, i32 143, i32 16, i32 17, i32 144, i32 145, i32 18, i32 19, i32 146, i32 147, i32 24, i32 25, i32 152, i32 153, i32 26, i32 27, i32 154, i32 155, i32 20, i32 21, i32 148, i32 149, i32 22, i32 23, i32 150, i32 151, i32 28, i32 29, i32 156, i32 157, i32 30, i32 31, i32 158, i32 159, i32 32, i32 33, i32 160, i32 161, i32 34, i32 35, i32 162, i32 163, i32 40, i32 41, i32 168, i32 169, i32 42, i32 43, i32 170, i32 171, i32 36, i32 37, i32 164, i32 165, i32 38, i32 39, i32 166, i32 167, i32 44, i32 45, i32 172, i32 173, i32 46, i32 47, i32 174, i32 175, i32 48, i32 49, i32 176, i32 177, i32 50, i32 51, i32 178, i32 179, i32 56, i32 57, i32 184, i32 185, i32 58, i32 59, i32 186, i32 187, i32 52, i32 53, i32 180, i32 181, i32 54, i32 55, i32 182, i32 183, i32 60, i32 61, i32 188, i32 189, i32 62, i32 63, i32 190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 66, i32 67, i32 194, i32 195, i32 72, i32 73, i32 200, i32 201, i32 74, i32 75, i32 202, i32 203, i32 68, i32 69, i32 196, i32 197, i32 70, i32 71, i32 198, i32 199, i32 76, i32 77, i32 204, i32 205, i32 78, i32 79, i32 206, i32 207, i32 80, i32 81, i32 208, i32 209, i32 82, i32 83, i32 210, i32 211, i32 88, i32 89, i32 216, i32 217, i32 90, i32 91, i32 218, i32 219, i32 84, i32 85, i32 212, i32 213, i32 86, i32 87, i32 214, i32 215, i32 92, i32 93, i32 220, i32 221, i32 94, i32 95, i32 222, i32 223, i32 96, i32 97, i32 224, i32 225, i32 98, i32 99, i32 226, i32 227, i32 104, i32 105, i32 232, i32 233, i32 106, i32 107, i32 234, i32 235, i32 100, i32 101, i32 228, i32 229, i32 102, i32 103, i32 230, i32 231, i32 108, i32 109, i32 236, i32 237, i32 110, i32 111, i32 238, i32 239, i32 112, i32 113, i32 240, i32 241, i32 114, i32 115, i32 242, i32 243, i32 120, i32 121, i32 248, i32 249, i32 122, i32 123, i32 250, i32 251, i32 116, i32 117, i32 244, i32 245, i32 118, i32 119, i32 246, i32 247, i32 124, i32 125, i32 252, i32 253, i32 126, i32 127, i32 254, i32 255> @@ -955,7 +955,7 @@ define <256 x i8> @vshuff_76(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_77: -; CHECK: [[REG77:r[0-9]+]] = #119 +; CHECK: [[REG77:r[0-9]+]] = #-9 ; CHECK: vshuff(v1,v0,[[REG77]]) define <256 x i8> @vshuff_77(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 2, i32 130, i32 3, i32 131, i32 8, i32 136, i32 9, i32 137, i32 10, i32 138, i32 11, i32 139, i32 4, i32 132, i32 5, i32 133, i32 6, i32 134, i32 7, i32 135, i32 12, i32 140, i32 13, i32 141, i32 14, i32 142, i32 15, i32 143, i32 16, i32 144, i32 17, i32 145, i32 18, i32 146, i32 19, i32 147, i32 24, i32 152, i32 25, i32 153, i32 26, i32 154, i32 27, i32 155, i32 20, i32 148, i32 21, i32 149, i32 22, i32 150, i32 23, i32 151, i32 28, i32 156, i32 29, i32 157, i32 30, i32 158, i32 31, i32 159, i32 32, i32 160, i32 33, i32 161, i32 34, i32 162, i32 35, i32 163, i32 40, i32 168, i32 41, i32 169, i32 42, i32 170, i32 43, i32 171, i32 36, i32 164, i32 37, i32 165, i32 38, i32 166, i32 39, i32 167, i32 44, i32 172, i32 45, i32 173, i32 46, i32 174, i32 47, i32 175, i32 48, i32 176, i32 49, i32 177, i32 50, i32 178, i32 51, i32 179, i32 56, i32 184, i32 57, i32 185, i32 58, i32 186, i32 59, i32 187, i32 52, i32 180, i32 53, i32 181, i32 54, i32 182, i32 55, i32 183, i32 60, i32 188, i32 61, i32 189, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 
65, i32 193, i32 66, i32 194, i32 67, i32 195, i32 72, i32 200, i32 73, i32 201, i32 74, i32 202, i32 75, i32 203, i32 68, i32 196, i32 69, i32 197, i32 70, i32 198, i32 71, i32 199, i32 76, i32 204, i32 77, i32 205, i32 78, i32 206, i32 79, i32 207, i32 80, i32 208, i32 81, i32 209, i32 82, i32 210, i32 83, i32 211, i32 88, i32 216, i32 89, i32 217, i32 90, i32 218, i32 91, i32 219, i32 84, i32 212, i32 85, i32 213, i32 86, i32 214, i32 87, i32 215, i32 92, i32 220, i32 93, i32 221, i32 94, i32 222, i32 95, i32 223, i32 96, i32 224, i32 97, i32 225, i32 98, i32 226, i32 99, i32 227, i32 104, i32 232, i32 105, i32 233, i32 106, i32 234, i32 107, i32 235, i32 100, i32 228, i32 101, i32 229, i32 102, i32 230, i32 103, i32 231, i32 108, i32 236, i32 109, i32 237, i32 110, i32 238, i32 111, i32 239, i32 112, i32 240, i32 113, i32 241, i32 114, i32 242, i32 115, i32 243, i32 120, i32 248, i32 121, i32 249, i32 122, i32 250, i32 123, i32 251, i32 116, i32 244, i32 117, i32 245, i32 118, i32 246, i32 119, i32 247, i32 124, i32 252, i32 125, i32 253, i32 126, i32 254, i32 127, i32 255> @@ -963,7 +963,7 @@ define <256 x i8> @vshuff_77(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_78: -; CHECK: [[REG78:r[0-9]+]] = #120 +; CHECK: [[REG78:r[0-9]+]] = #-8 ; CHECK: vshuff(v1,v0,[[REG78]]) define <256 x i8> @vshuff_78(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 
126, i32 127, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255> @@ -971,7 +971,7 @@ define <256 x i8> @vshuff_78(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_79: -; CHECK: [[REG79:r[0-9]+]] = #121 +; CHECK: [[REG79:r[0-9]+]] = #-7 ; CHECK: vshuff(v1,v0,[[REG79]]) define <256 x i8> @vshuff_79(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 4, i32 132, i32 6, i32 134, i32 1, i32 129, i32 3, i32 131, i32 5, i32 133, i32 7, i32 135, i32 8, i32 136, i32 10, i32 138, i32 12, i32 140, i32 14, i32 142, i32 9, i32 137, i32 11, i32 139, i32 13, i32 141, i32 15, i32 143, i32 16, i32 144, i32 18, i32 146, i32 20, i32 148, i32 22, i32 150, i32 17, i32 145, i32 19, i32 147, i32 21, i32 149, i32 23, i32 151, i32 24, i32 152, i32 26, i32 154, i32 28, i32 156, i32 30, i32 158, i32 25, i32 153, i32 27, i32 155, i32 29, i32 157, i32 31, i32 159, i32 32, i32 160, i32 34, i32 162, i32 36, i32 164, i32 38, i32 166, i32 33, i32 161, i32 35, i32 163, i32 37, i32 165, i32 39, i32 167, i32 40, i32 168, i32 42, i32 170, i32 44, i32 172, i32 46, i32 174, i32 41, i32 169, i32 43, i32 171, i32 45, i32 173, i32 47, i32 175, i32 48, i32 176, i32 50, i32 178, i32 52, i32 180, i32 54, i32 182, i32 49, i32 177, i32 51, i32 179, i32 53, i32 181, i32 55, i32 183, i32 56, i32 184, i32 58, i32 186, i32 60, i32 188, i32 62, i32 190, i32 57, i32 185, i32 59, i32 187, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 68, i32 196, i32 70, i32 198, i32 65, i32 193, i32 67, i32 195, i32 69, i32 197, i32 71, i32 199, i32 72, i32 200, i32 74, i32 202, i32 76, i32 204, i32 78, i32 206, i32 73, i32 201, i32 75, i32 203, i32 77, i32 205, i32 79, i32 207, i32 80, i32 208, i32 82, i32 210, i32 84, i32 212, i32 86, i32 214, i32 81, i32 209, i32 83, i32 211, i32 85, i32 213, i32 87, i32 215, i32 88, i32 216, i32 90, i32 218, i32 92, i32 220, i32 94, i32 222, i32 89, i32 217, i32 91, i32 219, i32 93, i32 221, i32 95, i32 223, i32 96, i32 224, i32 98, i32 226, i32 100, i32 228, i32 102, i32 230, i32 97, i32 225, i32 99, i32 227, i32 101, i32 229, i32 103, i32 231, i32 104, i32 232, i32 106, i32 234, i32 108, i32 236, i32 110, i32 238, i32 105, i32 233, i32 107, i32 235, i32 109, i32 237, i32 111, i32 239, i32 112, i32 240, i32 114, i32 242, i32 116, i32 244, i32 118, i32 246, i32 113, i32 241, i32 115, i32 243, i32 117, i32 245, i32 119, i32 247, i32 120, i32 248, i32 122, i32 250, i32 124, i32 252, i32 126, i32 254, i32 121, i32 249, i32 123, i32 251, i32 125, i32 253, i32 127, i32 255> @@ -979,7 +979,7 @@ define <256 x i8> @vshuff_79(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_7a: -; CHECK: [[REG7a:r[0-9]+]] = #122 +; CHECK: [[REG7a:r[0-9]+]] = #-6 ; CHECK: vshuff(v1,v0,[[REG7a]]) define <256 x i8> @vshuff_7a(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 4, i32 5, i32 132, i32 133, i32 2, i32 3, i32 130, i32 131, i32 6, i32 7, i32 134, i32 135, i32 8, i32 9, i32 136, i32 137, i32 12, i32 13, i32 140, i32 141, i32 10, i32 11, i32 138, i32 139, i32 14, i32 15, i32 142, i32 143, i32 16, i32 17, i32 144, i32 145, i32 20, i32 21, i32 148, i32 149, i32 18, i32 19, i32 146, i32 147, i32 22, i32 23, i32 150, i32 151, i32 24, i32 25, i32 152, i32 153, i32 28, i32 29, i32 156, i32 157, i32 26, i32 27, i32 154, i32 155, i32 30, i32 31, i32 158, i32 159, i32 32, i32 33, i32 160, i32 161, i32 36, 
i32 37, i32 164, i32 165, i32 34, i32 35, i32 162, i32 163, i32 38, i32 39, i32 166, i32 167, i32 40, i32 41, i32 168, i32 169, i32 44, i32 45, i32 172, i32 173, i32 42, i32 43, i32 170, i32 171, i32 46, i32 47, i32 174, i32 175, i32 48, i32 49, i32 176, i32 177, i32 52, i32 53, i32 180, i32 181, i32 50, i32 51, i32 178, i32 179, i32 54, i32 55, i32 182, i32 183, i32 56, i32 57, i32 184, i32 185, i32 60, i32 61, i32 188, i32 189, i32 58, i32 59, i32 186, i32 187, i32 62, i32 63, i32 190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 68, i32 69, i32 196, i32 197, i32 66, i32 67, i32 194, i32 195, i32 70, i32 71, i32 198, i32 199, i32 72, i32 73, i32 200, i32 201, i32 76, i32 77, i32 204, i32 205, i32 74, i32 75, i32 202, i32 203, i32 78, i32 79, i32 206, i32 207, i32 80, i32 81, i32 208, i32 209, i32 84, i32 85, i32 212, i32 213, i32 82, i32 83, i32 210, i32 211, i32 86, i32 87, i32 214, i32 215, i32 88, i32 89, i32 216, i32 217, i32 92, i32 93, i32 220, i32 221, i32 90, i32 91, i32 218, i32 219, i32 94, i32 95, i32 222, i32 223, i32 96, i32 97, i32 224, i32 225, i32 100, i32 101, i32 228, i32 229, i32 98, i32 99, i32 226, i32 227, i32 102, i32 103, i32 230, i32 231, i32 104, i32 105, i32 232, i32 233, i32 108, i32 109, i32 236, i32 237, i32 106, i32 107, i32 234, i32 235, i32 110, i32 111, i32 238, i32 239, i32 112, i32 113, i32 240, i32 241, i32 116, i32 117, i32 244, i32 245, i32 114, i32 115, i32 242, i32 243, i32 118, i32 119, i32 246, i32 247, i32 120, i32 121, i32 248, i32 249, i32 124, i32 125, i32 252, i32 253, i32 122, i32 123, i32 250, i32 251, i32 126, i32 127, i32 254, i32 255> @@ -987,7 +987,7 @@ define <256 x i8> @vshuff_7a(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_7b: -; CHECK: [[REG7b:r[0-9]+]] = #123 +; CHECK: [[REG7b:r[0-9]+]] = #-5 ; CHECK: vshuff(v1,v0,[[REG7b]]) define <256 x i8> @vshuff_7b(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 4, i32 132, i32 5, i32 133, i32 2, i32 130, i32 3, i32 131, i32 6, i32 134, i32 7, i32 135, i32 8, i32 136, i32 9, i32 137, i32 12, i32 140, i32 13, i32 141, i32 10, i32 138, i32 11, i32 139, i32 14, i32 142, i32 15, i32 143, i32 16, i32 144, i32 17, i32 145, i32 20, i32 148, i32 21, i32 149, i32 18, i32 146, i32 19, i32 147, i32 22, i32 150, i32 23, i32 151, i32 24, i32 152, i32 25, i32 153, i32 28, i32 156, i32 29, i32 157, i32 26, i32 154, i32 27, i32 155, i32 30, i32 158, i32 31, i32 159, i32 32, i32 160, i32 33, i32 161, i32 36, i32 164, i32 37, i32 165, i32 34, i32 162, i32 35, i32 163, i32 38, i32 166, i32 39, i32 167, i32 40, i32 168, i32 41, i32 169, i32 44, i32 172, i32 45, i32 173, i32 42, i32 170, i32 43, i32 171, i32 46, i32 174, i32 47, i32 175, i32 48, i32 176, i32 49, i32 177, i32 52, i32 180, i32 53, i32 181, i32 50, i32 178, i32 51, i32 179, i32 54, i32 182, i32 55, i32 183, i32 56, i32 184, i32 57, i32 185, i32 60, i32 188, i32 61, i32 189, i32 58, i32 186, i32 59, i32 187, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 65, i32 193, i32 68, i32 196, i32 69, i32 197, i32 66, i32 194, i32 67, i32 195, i32 70, i32 198, i32 71, i32 199, i32 72, i32 200, i32 73, i32 201, i32 76, i32 204, i32 77, i32 205, i32 74, i32 202, i32 75, i32 203, i32 78, i32 206, i32 79, i32 207, i32 80, i32 208, i32 81, i32 209, i32 84, i32 212, i32 85, i32 213, i32 82, i32 210, i32 83, i32 211, i32 86, i32 214, i32 87, i32 215, i32 88, i32 216, i32 89, i32 217, i32 92, i32 220, i32 93, i32 221, i32 90, i32 218, i32 91, i32 219, 
i32 94, i32 222, i32 95, i32 223, i32 96, i32 224, i32 97, i32 225, i32 100, i32 228, i32 101, i32 229, i32 98, i32 226, i32 99, i32 227, i32 102, i32 230, i32 103, i32 231, i32 104, i32 232, i32 105, i32 233, i32 108, i32 236, i32 109, i32 237, i32 106, i32 234, i32 107, i32 235, i32 110, i32 238, i32 111, i32 239, i32 112, i32 240, i32 113, i32 241, i32 116, i32 244, i32 117, i32 245, i32 114, i32 242, i32 115, i32 243, i32 118, i32 246, i32 119, i32 247, i32 120, i32 248, i32 121, i32 249, i32 124, i32 252, i32 125, i32 253, i32 122, i32 250, i32 123, i32 251, i32 126, i32 254, i32 127, i32 255> @@ -995,7 +995,7 @@ define <256 x i8> @vshuff_7b(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_7c: -; CHECK: [[REG7c:r[0-9]+]] = #124 +; CHECK: [[REG7c:r[0-9]+]] = #-4 ; CHECK: vshuff(v1,v0,[[REG7c]]) define <256 x i8> @vshuff_7c(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 2, i32 3, i32 128, i32 129, i32 130, i32 131, i32 4, i32 5, i32 6, i32 7, i32 132, i32 133, i32 134, i32 135, i32 8, i32 9, i32 10, i32 11, i32 136, i32 137, i32 138, i32 139, i32 12, i32 13, i32 14, i32 15, i32 140, i32 141, i32 142, i32 143, i32 16, i32 17, i32 18, i32 19, i32 144, i32 145, i32 146, i32 147, i32 20, i32 21, i32 22, i32 23, i32 148, i32 149, i32 150, i32 151, i32 24, i32 25, i32 26, i32 27, i32 152, i32 153, i32 154, i32 155, i32 28, i32 29, i32 30, i32 31, i32 156, i32 157, i32 158, i32 159, i32 32, i32 33, i32 34, i32 35, i32 160, i32 161, i32 162, i32 163, i32 36, i32 37, i32 38, i32 39, i32 164, i32 165, i32 166, i32 167, i32 40, i32 41, i32 42, i32 43, i32 168, i32 169, i32 170, i32 171, i32 44, i32 45, i32 46, i32 47, i32 172, i32 173, i32 174, i32 175, i32 48, i32 49, i32 50, i32 51, i32 176, i32 177, i32 178, i32 179, i32 52, i32 53, i32 54, i32 55, i32 180, i32 181, i32 182, i32 183, i32 56, i32 57, i32 58, i32 59, i32 184, i32 185, i32 186, i32 187, i32 60, i32 61, i32 62, i32 63, i32 188, i32 189, i32 190, i32 191, i32 64, i32 65, i32 66, i32 67, i32 192, i32 193, i32 194, i32 195, i32 68, i32 69, i32 70, i32 71, i32 196, i32 197, i32 198, i32 199, i32 72, i32 73, i32 74, i32 75, i32 200, i32 201, i32 202, i32 203, i32 76, i32 77, i32 78, i32 79, i32 204, i32 205, i32 206, i32 207, i32 80, i32 81, i32 82, i32 83, i32 208, i32 209, i32 210, i32 211, i32 84, i32 85, i32 86, i32 87, i32 212, i32 213, i32 214, i32 215, i32 88, i32 89, i32 90, i32 91, i32 216, i32 217, i32 218, i32 219, i32 92, i32 93, i32 94, i32 95, i32 220, i32 221, i32 222, i32 223, i32 96, i32 97, i32 98, i32 99, i32 224, i32 225, i32 226, i32 227, i32 100, i32 101, i32 102, i32 103, i32 228, i32 229, i32 230, i32 231, i32 104, i32 105, i32 106, i32 107, i32 232, i32 233, i32 234, i32 235, i32 108, i32 109, i32 110, i32 111, i32 236, i32 237, i32 238, i32 239, i32 112, i32 113, i32 114, i32 115, i32 240, i32 241, i32 242, i32 243, i32 116, i32 117, i32 118, i32 119, i32 244, i32 245, i32 246, i32 247, i32 120, i32 121, i32 122, i32 123, i32 248, i32 249, i32 250, i32 251, i32 124, i32 125, i32 126, i32 127, i32 252, i32 253, i32 254, i32 255> @@ -1003,7 +1003,7 @@ define <256 x i8> @vshuff_7c(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_7d: -; CHECK: [[REG7d:r[0-9]+]] = #125 +; CHECK: [[REG7d:r[0-9]+]] = #-3 ; CHECK: vshuff(v1,v0,[[REG7d]]) define <256 x i8> @vshuff_7d(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 2, i32 130, i32 1, i32 129, i32 3, 
i32 131, i32 4, i32 132, i32 6, i32 134, i32 5, i32 133, i32 7, i32 135, i32 8, i32 136, i32 10, i32 138, i32 9, i32 137, i32 11, i32 139, i32 12, i32 140, i32 14, i32 142, i32 13, i32 141, i32 15, i32 143, i32 16, i32 144, i32 18, i32 146, i32 17, i32 145, i32 19, i32 147, i32 20, i32 148, i32 22, i32 150, i32 21, i32 149, i32 23, i32 151, i32 24, i32 152, i32 26, i32 154, i32 25, i32 153, i32 27, i32 155, i32 28, i32 156, i32 30, i32 158, i32 29, i32 157, i32 31, i32 159, i32 32, i32 160, i32 34, i32 162, i32 33, i32 161, i32 35, i32 163, i32 36, i32 164, i32 38, i32 166, i32 37, i32 165, i32 39, i32 167, i32 40, i32 168, i32 42, i32 170, i32 41, i32 169, i32 43, i32 171, i32 44, i32 172, i32 46, i32 174, i32 45, i32 173, i32 47, i32 175, i32 48, i32 176, i32 50, i32 178, i32 49, i32 177, i32 51, i32 179, i32 52, i32 180, i32 54, i32 182, i32 53, i32 181, i32 55, i32 183, i32 56, i32 184, i32 58, i32 186, i32 57, i32 185, i32 59, i32 187, i32 60, i32 188, i32 62, i32 190, i32 61, i32 189, i32 63, i32 191, i32 64, i32 192, i32 66, i32 194, i32 65, i32 193, i32 67, i32 195, i32 68, i32 196, i32 70, i32 198, i32 69, i32 197, i32 71, i32 199, i32 72, i32 200, i32 74, i32 202, i32 73, i32 201, i32 75, i32 203, i32 76, i32 204, i32 78, i32 206, i32 77, i32 205, i32 79, i32 207, i32 80, i32 208, i32 82, i32 210, i32 81, i32 209, i32 83, i32 211, i32 84, i32 212, i32 86, i32 214, i32 85, i32 213, i32 87, i32 215, i32 88, i32 216, i32 90, i32 218, i32 89, i32 217, i32 91, i32 219, i32 92, i32 220, i32 94, i32 222, i32 93, i32 221, i32 95, i32 223, i32 96, i32 224, i32 98, i32 226, i32 97, i32 225, i32 99, i32 227, i32 100, i32 228, i32 102, i32 230, i32 101, i32 229, i32 103, i32 231, i32 104, i32 232, i32 106, i32 234, i32 105, i32 233, i32 107, i32 235, i32 108, i32 236, i32 110, i32 238, i32 109, i32 237, i32 111, i32 239, i32 112, i32 240, i32 114, i32 242, i32 113, i32 241, i32 115, i32 243, i32 116, i32 244, i32 118, i32 246, i32 117, i32 245, i32 119, i32 247, i32 120, i32 248, i32 122, i32 250, i32 121, i32 249, i32 123, i32 251, i32 124, i32 252, i32 126, i32 254, i32 125, i32 253, i32 127, i32 255> @@ -1011,7 +1011,7 @@ define <256 x i8> @vshuff_7d(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_7e: -; CHECK: [[REG7e:r[0-9]+]] = #126 +; CHECK: [[REG7e:r[0-9]+]] = #-2 ; CHECK: vshuff(v1,v0,[[REG7e]]) define <256 x i8> @vshuff_7e(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 1, i32 128, i32 129, i32 2, i32 3, i32 130, i32 131, i32 4, i32 5, i32 132, i32 133, i32 6, i32 7, i32 134, i32 135, i32 8, i32 9, i32 136, i32 137, i32 10, i32 11, i32 138, i32 139, i32 12, i32 13, i32 140, i32 141, i32 14, i32 15, i32 142, i32 143, i32 16, i32 17, i32 144, i32 145, i32 18, i32 19, i32 146, i32 147, i32 20, i32 21, i32 148, i32 149, i32 22, i32 23, i32 150, i32 151, i32 24, i32 25, i32 152, i32 153, i32 26, i32 27, i32 154, i32 155, i32 28, i32 29, i32 156, i32 157, i32 30, i32 31, i32 158, i32 159, i32 32, i32 33, i32 160, i32 161, i32 34, i32 35, i32 162, i32 163, i32 36, i32 37, i32 164, i32 165, i32 38, i32 39, i32 166, i32 167, i32 40, i32 41, i32 168, i32 169, i32 42, i32 43, i32 170, i32 171, i32 44, i32 45, i32 172, i32 173, i32 46, i32 47, i32 174, i32 175, i32 48, i32 49, i32 176, i32 177, i32 50, i32 51, i32 178, i32 179, i32 52, i32 53, i32 180, i32 181, i32 54, i32 55, i32 182, i32 183, i32 56, i32 57, i32 184, i32 185, i32 58, i32 59, i32 186, i32 187, i32 60, i32 61, i32 188, i32 189, i32 62, i32 63, i32 
190, i32 191, i32 64, i32 65, i32 192, i32 193, i32 66, i32 67, i32 194, i32 195, i32 68, i32 69, i32 196, i32 197, i32 70, i32 71, i32 198, i32 199, i32 72, i32 73, i32 200, i32 201, i32 74, i32 75, i32 202, i32 203, i32 76, i32 77, i32 204, i32 205, i32 78, i32 79, i32 206, i32 207, i32 80, i32 81, i32 208, i32 209, i32 82, i32 83, i32 210, i32 211, i32 84, i32 85, i32 212, i32 213, i32 86, i32 87, i32 214, i32 215, i32 88, i32 89, i32 216, i32 217, i32 90, i32 91, i32 218, i32 219, i32 92, i32 93, i32 220, i32 221, i32 94, i32 95, i32 222, i32 223, i32 96, i32 97, i32 224, i32 225, i32 98, i32 99, i32 226, i32 227, i32 100, i32 101, i32 228, i32 229, i32 102, i32 103, i32 230, i32 231, i32 104, i32 105, i32 232, i32 233, i32 106, i32 107, i32 234, i32 235, i32 108, i32 109, i32 236, i32 237, i32 110, i32 111, i32 238, i32 239, i32 112, i32 113, i32 240, i32 241, i32 114, i32 115, i32 242, i32 243, i32 116, i32 117, i32 244, i32 245, i32 118, i32 119, i32 246, i32 247, i32 120, i32 121, i32 248, i32 249, i32 122, i32 123, i32 250, i32 251, i32 124, i32 125, i32 252, i32 253, i32 126, i32 127, i32 254, i32 255> @@ -1019,7 +1019,7 @@ define <256 x i8> @vshuff_7e(<256 x i8> %v0, <256 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_7f: -; CHECK: [[REG7f:r[0-9]+]] = #127 +; CHECK: [[REG7f:r[0-9]+]] = #-1 ; CHECK: vshuff(v1,v0,[[REG7f]]) define <256 x i8> @vshuff_7f(<256 x i8> %v0, <256 x i8> %v1) #0 { %p = shufflevector <256 x i8> %v0, <256 x i8> undef, <256 x i32> < i32 0, i32 128, i32 1, i32 129, i32 2, i32 130, i32 3, i32 131, i32 4, i32 132, i32 5, i32 133, i32 6, i32 134, i32 7, i32 135, i32 8, i32 136, i32 9, i32 137, i32 10, i32 138, i32 11, i32 139, i32 12, i32 140, i32 13, i32 141, i32 14, i32 142, i32 15, i32 143, i32 16, i32 144, i32 17, i32 145, i32 18, i32 146, i32 19, i32 147, i32 20, i32 148, i32 21, i32 149, i32 22, i32 150, i32 23, i32 151, i32 24, i32 152, i32 25, i32 153, i32 26, i32 154, i32 27, i32 155, i32 28, i32 156, i32 29, i32 157, i32 30, i32 158, i32 31, i32 159, i32 32, i32 160, i32 33, i32 161, i32 34, i32 162, i32 35, i32 163, i32 36, i32 164, i32 37, i32 165, i32 38, i32 166, i32 39, i32 167, i32 40, i32 168, i32 41, i32 169, i32 42, i32 170, i32 43, i32 171, i32 44, i32 172, i32 45, i32 173, i32 46, i32 174, i32 47, i32 175, i32 48, i32 176, i32 49, i32 177, i32 50, i32 178, i32 51, i32 179, i32 52, i32 180, i32 53, i32 181, i32 54, i32 182, i32 55, i32 183, i32 56, i32 184, i32 57, i32 185, i32 58, i32 186, i32 59, i32 187, i32 60, i32 188, i32 61, i32 189, i32 62, i32 190, i32 63, i32 191, i32 64, i32 192, i32 65, i32 193, i32 66, i32 194, i32 67, i32 195, i32 68, i32 196, i32 69, i32 197, i32 70, i32 198, i32 71, i32 199, i32 72, i32 200, i32 73, i32 201, i32 74, i32 202, i32 75, i32 203, i32 76, i32 204, i32 77, i32 205, i32 78, i32 206, i32 79, i32 207, i32 80, i32 208, i32 81, i32 209, i32 82, i32 210, i32 83, i32 211, i32 84, i32 212, i32 85, i32 213, i32 86, i32 214, i32 87, i32 215, i32 88, i32 216, i32 89, i32 217, i32 90, i32 218, i32 91, i32 219, i32 92, i32 220, i32 93, i32 221, i32 94, i32 222, i32 95, i32 223, i32 96, i32 224, i32 97, i32 225, i32 98, i32 226, i32 99, i32 227, i32 100, i32 228, i32 101, i32 229, i32 102, i32 230, i32 103, i32 231, i32 104, i32 232, i32 105, i32 233, i32 106, i32 234, i32 107, i32 235, i32 108, i32 236, i32 109, i32 237, i32 110, i32 238, i32 111, i32 239, i32 112, i32 240, i32 113, i32 241, i32 114, i32 242, i32 115, i32 243, i32 116, i32 244, i32 117, i32 245, i32 118, i32 246, i32 119, i32 247, i32 120, i32 248, i32 
121, i32 249, i32 122, i32 250, i32 123, i32 251, i32 124, i32 252, i32 125, i32 253, i32 126, i32 254, i32 127, i32 255> diff --git a/llvm/test/CodeGen/Hexagon/autohvx/shuff-64b.ll b/llvm/test/CodeGen/Hexagon/autohvx/shuff-64b.ll index b33b3be8052a4..40d5907a02c0e 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/shuff-64b.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/shuff-64b.ll @@ -259,7 +259,7 @@ define <128 x i8> @vshuff_1f(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_20: -; CHECK: [[REG20:r[0-9]+]] = #32 +; CHECK: [[REG20:r[0-9]+]] = #-32 ; CHECK: vshuff(v1,v0,[[REG20]]) define <128 x i8> @vshuff_20(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> @@ -267,7 +267,7 @@ define <128 x i8> @vshuff_20(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_21: -; CHECK: [[REG21:r[0-9]+]] = #33 +; CHECK: [[REG21:r[0-9]+]] = #-31 ; CHECK: vshuff(v1,v0,[[REG21]]) define <128 x i8> @vshuff_21(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 1, i32 65, i32 3, i32 67, i32 5, i32 69, i32 7, i32 71, i32 9, i32 73, i32 11, i32 75, i32 13, i32 77, i32 15, i32 79, i32 17, i32 81, i32 19, i32 83, i32 21, i32 85, i32 23, i32 87, i32 25, i32 89, i32 27, i32 91, i32 29, i32 93, i32 31, i32 95, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126, i32 33, i32 97, i32 35, i32 99, i32 37, i32 101, i32 39, i32 103, i32 41, i32 105, i32 43, i32 107, i32 45, i32 109, i32 47, i32 111, i32 49, i32 113, i32 51, i32 115, i32 53, i32 117, i32 55, i32 119, i32 57, i32 121, i32 59, i32 123, i32 61, i32 125, i32 63, i32 127> @@ -275,7 +275,7 @@ define <128 x i8> @vshuff_21(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_22: -; CHECK: [[REG22:r[0-9]+]] = #34 +; CHECK: [[REG22:r[0-9]+]] = #-30 ; CHECK: vshuff(v1,v0,[[REG22]]) define <128 x i8> @vshuff_22(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 64, i32 65, i32 4, i32 5, 
i32 68, i32 69, i32 8, i32 9, i32 72, i32 73, i32 12, i32 13, i32 76, i32 77, i32 16, i32 17, i32 80, i32 81, i32 20, i32 21, i32 84, i32 85, i32 24, i32 25, i32 88, i32 89, i32 28, i32 29, i32 92, i32 93, i32 2, i32 3, i32 66, i32 67, i32 6, i32 7, i32 70, i32 71, i32 10, i32 11, i32 74, i32 75, i32 14, i32 15, i32 78, i32 79, i32 18, i32 19, i32 82, i32 83, i32 22, i32 23, i32 86, i32 87, i32 26, i32 27, i32 90, i32 91, i32 30, i32 31, i32 94, i32 95, i32 32, i32 33, i32 96, i32 97, i32 36, i32 37, i32 100, i32 101, i32 40, i32 41, i32 104, i32 105, i32 44, i32 45, i32 108, i32 109, i32 48, i32 49, i32 112, i32 113, i32 52, i32 53, i32 116, i32 117, i32 56, i32 57, i32 120, i32 121, i32 60, i32 61, i32 124, i32 125, i32 34, i32 35, i32 98, i32 99, i32 38, i32 39, i32 102, i32 103, i32 42, i32 43, i32 106, i32 107, i32 46, i32 47, i32 110, i32 111, i32 50, i32 51, i32 114, i32 115, i32 54, i32 55, i32 118, i32 119, i32 58, i32 59, i32 122, i32 123, i32 62, i32 63, i32 126, i32 127> @@ -283,7 +283,7 @@ define <128 x i8> @vshuff_22(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_23: -; CHECK: [[REG23:r[0-9]+]] = #35 +; CHECK: [[REG23:r[0-9]+]] = #-29 ; CHECK: vshuff(v1,v0,[[REG23]]) define <128 x i8> @vshuff_23(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 1, i32 65, i32 4, i32 68, i32 5, i32 69, i32 8, i32 72, i32 9, i32 73, i32 12, i32 76, i32 13, i32 77, i32 16, i32 80, i32 17, i32 81, i32 20, i32 84, i32 21, i32 85, i32 24, i32 88, i32 25, i32 89, i32 28, i32 92, i32 29, i32 93, i32 2, i32 66, i32 3, i32 67, i32 6, i32 70, i32 7, i32 71, i32 10, i32 74, i32 11, i32 75, i32 14, i32 78, i32 15, i32 79, i32 18, i32 82, i32 19, i32 83, i32 22, i32 86, i32 23, i32 87, i32 26, i32 90, i32 27, i32 91, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 36, i32 100, i32 37, i32 101, i32 40, i32 104, i32 41, i32 105, i32 44, i32 108, i32 45, i32 109, i32 48, i32 112, i32 49, i32 113, i32 52, i32 116, i32 53, i32 117, i32 56, i32 120, i32 57, i32 121, i32 60, i32 124, i32 61, i32 125, i32 34, i32 98, i32 35, i32 99, i32 38, i32 102, i32 39, i32 103, i32 42, i32 106, i32 43, i32 107, i32 46, i32 110, i32 47, i32 111, i32 50, i32 114, i32 51, i32 115, i32 54, i32 118, i32 55, i32 119, i32 58, i32 122, i32 59, i32 123, i32 62, i32 126, i32 63, i32 127> @@ -291,7 +291,7 @@ define <128 x i8> @vshuff_23(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_24: -; CHECK: [[REG24:r[0-9]+]] = #36 +; CHECK: [[REG24:r[0-9]+]] = #-28 ; CHECK: vshuff(v1,v0,[[REG24]]) define <128 x i8> @vshuff_24(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 2, i32 3, i32 64, i32 65, i32 66, i32 67, i32 8, i32 9, i32 10, i32 11, i32 72, i32 73, i32 74, i32 75, i32 16, i32 17, i32 18, i32 19, i32 80, i32 81, i32 82, i32 83, i32 24, i32 25, i32 26, i32 27, i32 88, i32 89, i32 90, i32 91, i32 4, i32 5, i32 6, i32 7, i32 68, i32 69, i32 70, i32 71, i32 12, i32 13, i32 14, i32 15, i32 76, i32 77, i32 78, i32 79, i32 20, i32 21, i32 22, i32 23, i32 84, i32 85, i32 86, i32 87, i32 28, i32 29, i32 30, i32 31, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 96, i32 97, i32 98, i32 99, i32 40, i32 41, i32 42, i32 43, i32 104, i32 105, i32 106, i32 107, i32 48, i32 49, i32 50, i32 51, i32 112, i32 113, i32 114, i32 115, i32 56, i32 57, i32 58, i32 59, i32 120, i32 121, i32 122, i32 123, i32 36, i32 37, i32 38, i32 39, i32 100, i32 
101, i32 102, i32 103, i32 44, i32 45, i32 46, i32 47, i32 108, i32 109, i32 110, i32 111, i32 52, i32 53, i32 54, i32 55, i32 116, i32 117, i32 118, i32 119, i32 60, i32 61, i32 62, i32 63, i32 124, i32 125, i32 126, i32 127> @@ -299,7 +299,7 @@ define <128 x i8> @vshuff_24(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_25: -; CHECK: [[REG25:r[0-9]+]] = #37 +; CHECK: [[REG25:r[0-9]+]] = #-27 ; CHECK: vshuff(v1,v0,[[REG25]]) define <128 x i8> @vshuff_25(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 2, i32 66, i32 1, i32 65, i32 3, i32 67, i32 8, i32 72, i32 10, i32 74, i32 9, i32 73, i32 11, i32 75, i32 16, i32 80, i32 18, i32 82, i32 17, i32 81, i32 19, i32 83, i32 24, i32 88, i32 26, i32 90, i32 25, i32 89, i32 27, i32 91, i32 4, i32 68, i32 6, i32 70, i32 5, i32 69, i32 7, i32 71, i32 12, i32 76, i32 14, i32 78, i32 13, i32 77, i32 15, i32 79, i32 20, i32 84, i32 22, i32 86, i32 21, i32 85, i32 23, i32 87, i32 28, i32 92, i32 30, i32 94, i32 29, i32 93, i32 31, i32 95, i32 32, i32 96, i32 34, i32 98, i32 33, i32 97, i32 35, i32 99, i32 40, i32 104, i32 42, i32 106, i32 41, i32 105, i32 43, i32 107, i32 48, i32 112, i32 50, i32 114, i32 49, i32 113, i32 51, i32 115, i32 56, i32 120, i32 58, i32 122, i32 57, i32 121, i32 59, i32 123, i32 36, i32 100, i32 38, i32 102, i32 37, i32 101, i32 39, i32 103, i32 44, i32 108, i32 46, i32 110, i32 45, i32 109, i32 47, i32 111, i32 52, i32 116, i32 54, i32 118, i32 53, i32 117, i32 55, i32 119, i32 60, i32 124, i32 62, i32 126, i32 61, i32 125, i32 63, i32 127> @@ -307,7 +307,7 @@ define <128 x i8> @vshuff_25(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_26: -; CHECK: [[REG26:r[0-9]+]] = #38 +; CHECK: [[REG26:r[0-9]+]] = #-26 ; CHECK: vshuff(v1,v0,[[REG26]]) define <128 x i8> @vshuff_26(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 64, i32 65, i32 2, i32 3, i32 66, i32 67, i32 8, i32 9, i32 72, i32 73, i32 10, i32 11, i32 74, i32 75, i32 16, i32 17, i32 80, i32 81, i32 18, i32 19, i32 82, i32 83, i32 24, i32 25, i32 88, i32 89, i32 26, i32 27, i32 90, i32 91, i32 4, i32 5, i32 68, i32 69, i32 6, i32 7, i32 70, i32 71, i32 12, i32 13, i32 76, i32 77, i32 14, i32 15, i32 78, i32 79, i32 20, i32 21, i32 84, i32 85, i32 22, i32 23, i32 86, i32 87, i32 28, i32 29, i32 92, i32 93, i32 30, i32 31, i32 94, i32 95, i32 32, i32 33, i32 96, i32 97, i32 34, i32 35, i32 98, i32 99, i32 40, i32 41, i32 104, i32 105, i32 42, i32 43, i32 106, i32 107, i32 48, i32 49, i32 112, i32 113, i32 50, i32 51, i32 114, i32 115, i32 56, i32 57, i32 120, i32 121, i32 58, i32 59, i32 122, i32 123, i32 36, i32 37, i32 100, i32 101, i32 38, i32 39, i32 102, i32 103, i32 44, i32 45, i32 108, i32 109, i32 46, i32 47, i32 110, i32 111, i32 52, i32 53, i32 116, i32 117, i32 54, i32 55, i32 118, i32 119, i32 60, i32 61, i32 124, i32 125, i32 62, i32 63, i32 126, i32 127> @@ -315,7 +315,7 @@ define <128 x i8> @vshuff_26(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_27: -; CHECK: [[REG27:r[0-9]+]] = #39 +; CHECK: [[REG27:r[0-9]+]] = #-25 ; CHECK: vshuff(v1,v0,[[REG27]]) define <128 x i8> @vshuff_27(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 24, 
i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127> @@ -323,7 +323,7 @@ define <128 x i8> @vshuff_27(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_28: -; CHECK: [[REG28:r[0-9]+]] = #40 +; CHECK: [[REG28:r[0-9]+]] = #-24 ; CHECK: vshuff(v1,v0,[[REG28]]) define <128 x i8> @vshuff_28(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> @@ -331,7 +331,7 @@ define <128 x i8> @vshuff_28(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_29: -; CHECK: [[REG29:r[0-9]+]] = #41 +; CHECK: [[REG29:r[0-9]+]] = #-23 ; CHECK: vshuff(v1,v0,[[REG29]]) define <128 x i8> @vshuff_29(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 1, i32 65, i32 3, i32 67, i32 5, i32 69, i32 7, i32 71, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 17, i32 81, i32 19, i32 83, i32 21, i32 85, i32 23, i32 87, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 9, i32 73, i32 11, i32 75, i32 13, i32 77, i32 15, i32 79, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 25, i32 89, i32 27, i32 91, i32 29, i32 93, i32 31, i32 95, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 33, i32 97, i32 35, i32 99, i32 37, i32 101, i32 39, i32 103, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 49, i32 113, i32 51, i32 115, i32 53, i32 117, i32 55, i32 119, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 41, i32 105, i32 43, i32 107, i32 45, i32 109, i32 47, i32 111, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, 
i32 126, i32 57, i32 121, i32 59, i32 123, i32 61, i32 125, i32 63, i32 127> @@ -339,7 +339,7 @@ define <128 x i8> @vshuff_29(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_2a: -; CHECK: [[REG2a:r[0-9]+]] = #42 +; CHECK: [[REG2a:r[0-9]+]] = #-22 ; CHECK: vshuff(v1,v0,[[REG2a]]) define <128 x i8> @vshuff_2a(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 64, i32 65, i32 4, i32 5, i32 68, i32 69, i32 2, i32 3, i32 66, i32 67, i32 6, i32 7, i32 70, i32 71, i32 16, i32 17, i32 80, i32 81, i32 20, i32 21, i32 84, i32 85, i32 18, i32 19, i32 82, i32 83, i32 22, i32 23, i32 86, i32 87, i32 8, i32 9, i32 72, i32 73, i32 12, i32 13, i32 76, i32 77, i32 10, i32 11, i32 74, i32 75, i32 14, i32 15, i32 78, i32 79, i32 24, i32 25, i32 88, i32 89, i32 28, i32 29, i32 92, i32 93, i32 26, i32 27, i32 90, i32 91, i32 30, i32 31, i32 94, i32 95, i32 32, i32 33, i32 96, i32 97, i32 36, i32 37, i32 100, i32 101, i32 34, i32 35, i32 98, i32 99, i32 38, i32 39, i32 102, i32 103, i32 48, i32 49, i32 112, i32 113, i32 52, i32 53, i32 116, i32 117, i32 50, i32 51, i32 114, i32 115, i32 54, i32 55, i32 118, i32 119, i32 40, i32 41, i32 104, i32 105, i32 44, i32 45, i32 108, i32 109, i32 42, i32 43, i32 106, i32 107, i32 46, i32 47, i32 110, i32 111, i32 56, i32 57, i32 120, i32 121, i32 60, i32 61, i32 124, i32 125, i32 58, i32 59, i32 122, i32 123, i32 62, i32 63, i32 126, i32 127> @@ -347,7 +347,7 @@ define <128 x i8> @vshuff_2a(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_2b: -; CHECK: [[REG2b:r[0-9]+]] = #43 +; CHECK: [[REG2b:r[0-9]+]] = #-21 ; CHECK: vshuff(v1,v0,[[REG2b]]) define <128 x i8> @vshuff_2b(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 1, i32 65, i32 4, i32 68, i32 5, i32 69, i32 2, i32 66, i32 3, i32 67, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 20, i32 84, i32 21, i32 85, i32 18, i32 82, i32 19, i32 83, i32 22, i32 86, i32 23, i32 87, i32 8, i32 72, i32 9, i32 73, i32 12, i32 76, i32 13, i32 77, i32 10, i32 74, i32 11, i32 75, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 28, i32 92, i32 29, i32 93, i32 26, i32 90, i32 27, i32 91, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 36, i32 100, i32 37, i32 101, i32 34, i32 98, i32 35, i32 99, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 52, i32 116, i32 53, i32 117, i32 50, i32 114, i32 51, i32 115, i32 54, i32 118, i32 55, i32 119, i32 40, i32 104, i32 41, i32 105, i32 44, i32 108, i32 45, i32 109, i32 42, i32 106, i32 43, i32 107, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 60, i32 124, i32 61, i32 125, i32 58, i32 122, i32 59, i32 123, i32 62, i32 126, i32 63, i32 127> @@ -355,7 +355,7 @@ define <128 x i8> @vshuff_2b(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_2c: -; CHECK: [[REG2c:r[0-9]+]] = #44 +; CHECK: [[REG2c:r[0-9]+]] = #-20 ; CHECK: vshuff(v1,v0,[[REG2c]]) define <128 x i8> @vshuff_2c(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 2, i32 3, i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 68, i32 69, i32 70, i32 71, i32 16, i32 17, i32 18, i32 19, i32 80, i32 81, i32 82, i32 83, i32 20, i32 21, i32 22, i32 23, i32 84, i32 85, i32 86, i32 87, i32 8, i32 9, i32 10, i32 11, i32 72, i32 73, i32 74, i32 75, i32 12, i32 13, i32 14, i32 15, i32 
76, i32 77, i32 78, i32 79, i32 24, i32 25, i32 26, i32 27, i32 88, i32 89, i32 90, i32 91, i32 28, i32 29, i32 30, i32 31, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 96, i32 97, i32 98, i32 99, i32 36, i32 37, i32 38, i32 39, i32 100, i32 101, i32 102, i32 103, i32 48, i32 49, i32 50, i32 51, i32 112, i32 113, i32 114, i32 115, i32 52, i32 53, i32 54, i32 55, i32 116, i32 117, i32 118, i32 119, i32 40, i32 41, i32 42, i32 43, i32 104, i32 105, i32 106, i32 107, i32 44, i32 45, i32 46, i32 47, i32 108, i32 109, i32 110, i32 111, i32 56, i32 57, i32 58, i32 59, i32 120, i32 121, i32 122, i32 123, i32 60, i32 61, i32 62, i32 63, i32 124, i32 125, i32 126, i32 127> @@ -363,7 +363,7 @@ define <128 x i8> @vshuff_2c(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_2d: -; CHECK: [[REG2d:r[0-9]+]] = #45 +; CHECK: [[REG2d:r[0-9]+]] = #-19 ; CHECK: vshuff(v1,v0,[[REG2d]]) define <128 x i8> @vshuff_2d(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 2, i32 66, i32 1, i32 65, i32 3, i32 67, i32 4, i32 68, i32 6, i32 70, i32 5, i32 69, i32 7, i32 71, i32 16, i32 80, i32 18, i32 82, i32 17, i32 81, i32 19, i32 83, i32 20, i32 84, i32 22, i32 86, i32 21, i32 85, i32 23, i32 87, i32 8, i32 72, i32 10, i32 74, i32 9, i32 73, i32 11, i32 75, i32 12, i32 76, i32 14, i32 78, i32 13, i32 77, i32 15, i32 79, i32 24, i32 88, i32 26, i32 90, i32 25, i32 89, i32 27, i32 91, i32 28, i32 92, i32 30, i32 94, i32 29, i32 93, i32 31, i32 95, i32 32, i32 96, i32 34, i32 98, i32 33, i32 97, i32 35, i32 99, i32 36, i32 100, i32 38, i32 102, i32 37, i32 101, i32 39, i32 103, i32 48, i32 112, i32 50, i32 114, i32 49, i32 113, i32 51, i32 115, i32 52, i32 116, i32 54, i32 118, i32 53, i32 117, i32 55, i32 119, i32 40, i32 104, i32 42, i32 106, i32 41, i32 105, i32 43, i32 107, i32 44, i32 108, i32 46, i32 110, i32 45, i32 109, i32 47, i32 111, i32 56, i32 120, i32 58, i32 122, i32 57, i32 121, i32 59, i32 123, i32 60, i32 124, i32 62, i32 126, i32 61, i32 125, i32 63, i32 127> @@ -371,7 +371,7 @@ define <128 x i8> @vshuff_2d(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_2e: -; CHECK: [[REG2e:r[0-9]+]] = #46 +; CHECK: [[REG2e:r[0-9]+]] = #-18 ; CHECK: vshuff(v1,v0,[[REG2e]]) define <128 x i8> @vshuff_2e(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 64, i32 65, i32 2, i32 3, i32 66, i32 67, i32 4, i32 5, i32 68, i32 69, i32 6, i32 7, i32 70, i32 71, i32 16, i32 17, i32 80, i32 81, i32 18, i32 19, i32 82, i32 83, i32 20, i32 21, i32 84, i32 85, i32 22, i32 23, i32 86, i32 87, i32 8, i32 9, i32 72, i32 73, i32 10, i32 11, i32 74, i32 75, i32 12, i32 13, i32 76, i32 77, i32 14, i32 15, i32 78, i32 79, i32 24, i32 25, i32 88, i32 89, i32 26, i32 27, i32 90, i32 91, i32 28, i32 29, i32 92, i32 93, i32 30, i32 31, i32 94, i32 95, i32 32, i32 33, i32 96, i32 97, i32 34, i32 35, i32 98, i32 99, i32 36, i32 37, i32 100, i32 101, i32 38, i32 39, i32 102, i32 103, i32 48, i32 49, i32 112, i32 113, i32 50, i32 51, i32 114, i32 115, i32 52, i32 53, i32 116, i32 117, i32 54, i32 55, i32 118, i32 119, i32 40, i32 41, i32 104, i32 105, i32 42, i32 43, i32 106, i32 107, i32 44, i32 45, i32 108, i32 109, i32 46, i32 47, i32 110, i32 111, i32 56, i32 57, i32 120, i32 121, i32 58, i32 59, i32 122, i32 123, i32 60, i32 61, i32 124, i32 125, i32 62, i32 63, i32 126, i32 127> @@ -379,7 +379,7 @@ define <128 x i8> @vshuff_2e(<128 x i8> %v0, <128 x i8> 
%v1) #0 { } ; CHECK-LABEL: vshuff_2f: -; CHECK: [[REG2f:r[0-9]+]] = #47 +; CHECK: [[REG2f:r[0-9]+]] = #-17 ; CHECK: vshuff(v1,v0,[[REG2f]]) define <128 x i8> @vshuff_2f(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127> @@ -387,7 +387,7 @@ define <128 x i8> @vshuff_2f(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_30: -; CHECK: [[REG30:r[0-9]+]] = #48 +; CHECK: [[REG30:r[0-9]+]] = #-16 ; CHECK: vshuff(v1,v0,[[REG30]]) define <128 x i8> @vshuff_30(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> @@ -395,7 +395,7 @@ define <128 x i8> @vshuff_30(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_31: -; CHECK: [[REG31:r[0-9]+]] = #49 +; CHECK: [[REG31:r[0-9]+]] = #-15 ; CHECK: vshuff(v1,v0,[[REG31]]) define <128 x i8> @vshuff_31(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 2, i32 66, i32 4, i32 68, i32 6, i32 70, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 1, i32 65, i32 3, i32 67, i32 5, i32 69, i32 7, i32 71, i32 9, i32 73, i32 11, i32 75, i32 13, i32 77, i32 15, i32 79, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 17, i32 81, i32 19, i32 83, i32 21, i32 85, i32 23, i32 87, i32 25, i32 89, i32 27, i32 91, i32 29, i32 93, i32 31, i32 
95, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 33, i32 97, i32 35, i32 99, i32 37, i32 101, i32 39, i32 103, i32 41, i32 105, i32 43, i32 107, i32 45, i32 109, i32 47, i32 111, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126, i32 49, i32 113, i32 51, i32 115, i32 53, i32 117, i32 55, i32 119, i32 57, i32 121, i32 59, i32 123, i32 61, i32 125, i32 63, i32 127> @@ -403,7 +403,7 @@ define <128 x i8> @vshuff_31(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_32: -; CHECK: [[REG32:r[0-9]+]] = #50 +; CHECK: [[REG32:r[0-9]+]] = #-14 ; CHECK: vshuff(v1,v0,[[REG32]]) define <128 x i8> @vshuff_32(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 64, i32 65, i32 4, i32 5, i32 68, i32 69, i32 8, i32 9, i32 72, i32 73, i32 12, i32 13, i32 76, i32 77, i32 2, i32 3, i32 66, i32 67, i32 6, i32 7, i32 70, i32 71, i32 10, i32 11, i32 74, i32 75, i32 14, i32 15, i32 78, i32 79, i32 16, i32 17, i32 80, i32 81, i32 20, i32 21, i32 84, i32 85, i32 24, i32 25, i32 88, i32 89, i32 28, i32 29, i32 92, i32 93, i32 18, i32 19, i32 82, i32 83, i32 22, i32 23, i32 86, i32 87, i32 26, i32 27, i32 90, i32 91, i32 30, i32 31, i32 94, i32 95, i32 32, i32 33, i32 96, i32 97, i32 36, i32 37, i32 100, i32 101, i32 40, i32 41, i32 104, i32 105, i32 44, i32 45, i32 108, i32 109, i32 34, i32 35, i32 98, i32 99, i32 38, i32 39, i32 102, i32 103, i32 42, i32 43, i32 106, i32 107, i32 46, i32 47, i32 110, i32 111, i32 48, i32 49, i32 112, i32 113, i32 52, i32 53, i32 116, i32 117, i32 56, i32 57, i32 120, i32 121, i32 60, i32 61, i32 124, i32 125, i32 50, i32 51, i32 114, i32 115, i32 54, i32 55, i32 118, i32 119, i32 58, i32 59, i32 122, i32 123, i32 62, i32 63, i32 126, i32 127> @@ -411,7 +411,7 @@ define <128 x i8> @vshuff_32(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_33: -; CHECK: [[REG33:r[0-9]+]] = #51 +; CHECK: [[REG33:r[0-9]+]] = #-13 ; CHECK: vshuff(v1,v0,[[REG33]]) define <128 x i8> @vshuff_33(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 1, i32 65, i32 4, i32 68, i32 5, i32 69, i32 8, i32 72, i32 9, i32 73, i32 12, i32 76, i32 13, i32 77, i32 2, i32 66, i32 3, i32 67, i32 6, i32 70, i32 7, i32 71, i32 10, i32 74, i32 11, i32 75, i32 14, i32 78, i32 15, i32 79, i32 16, i32 80, i32 17, i32 81, i32 20, i32 84, i32 21, i32 85, i32 24, i32 88, i32 25, i32 89, i32 28, i32 92, i32 29, i32 93, i32 18, i32 82, i32 19, i32 83, i32 22, i32 86, i32 23, i32 87, i32 26, i32 90, i32 27, i32 91, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 36, i32 100, i32 37, i32 101, i32 40, i32 104, i32 41, i32 105, i32 44, i32 108, i32 45, i32 109, i32 34, i32 98, i32 35, i32 99, i32 38, i32 102, i32 39, i32 103, i32 42, i32 106, i32 43, i32 107, i32 46, i32 110, i32 47, i32 111, i32 48, i32 112, i32 49, i32 113, i32 52, i32 116, i32 53, i32 117, i32 56, i32 120, i32 57, i32 121, i32 60, i32 124, i32 61, i32 125, i32 50, i32 114, i32 51, i32 115, i32 54, i32 118, i32 55, i32 119, i32 58, i32 122, i32 59, i32 123, i32 62, i32 126, i32 63, i32 127> @@ -419,7 +419,7 @@ define <128 x i8> @vshuff_33(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_34: -; CHECK: [[REG34:r[0-9]+]] = #52 +; CHECK: [[REG34:r[0-9]+]] = #-12 ; CHECK: vshuff(v1,v0,[[REG34]]) define <128 
x i8> @vshuff_34(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 2, i32 3, i32 64, i32 65, i32 66, i32 67, i32 8, i32 9, i32 10, i32 11, i32 72, i32 73, i32 74, i32 75, i32 4, i32 5, i32 6, i32 7, i32 68, i32 69, i32 70, i32 71, i32 12, i32 13, i32 14, i32 15, i32 76, i32 77, i32 78, i32 79, i32 16, i32 17, i32 18, i32 19, i32 80, i32 81, i32 82, i32 83, i32 24, i32 25, i32 26, i32 27, i32 88, i32 89, i32 90, i32 91, i32 20, i32 21, i32 22, i32 23, i32 84, i32 85, i32 86, i32 87, i32 28, i32 29, i32 30, i32 31, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 96, i32 97, i32 98, i32 99, i32 40, i32 41, i32 42, i32 43, i32 104, i32 105, i32 106, i32 107, i32 36, i32 37, i32 38, i32 39, i32 100, i32 101, i32 102, i32 103, i32 44, i32 45, i32 46, i32 47, i32 108, i32 109, i32 110, i32 111, i32 48, i32 49, i32 50, i32 51, i32 112, i32 113, i32 114, i32 115, i32 56, i32 57, i32 58, i32 59, i32 120, i32 121, i32 122, i32 123, i32 52, i32 53, i32 54, i32 55, i32 116, i32 117, i32 118, i32 119, i32 60, i32 61, i32 62, i32 63, i32 124, i32 125, i32 126, i32 127> @@ -427,7 +427,7 @@ define <128 x i8> @vshuff_34(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_35: -; CHECK: [[REG35:r[0-9]+]] = #53 +; CHECK: [[REG35:r[0-9]+]] = #-11 ; CHECK: vshuff(v1,v0,[[REG35]]) define <128 x i8> @vshuff_35(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 2, i32 66, i32 1, i32 65, i32 3, i32 67, i32 8, i32 72, i32 10, i32 74, i32 9, i32 73, i32 11, i32 75, i32 4, i32 68, i32 6, i32 70, i32 5, i32 69, i32 7, i32 71, i32 12, i32 76, i32 14, i32 78, i32 13, i32 77, i32 15, i32 79, i32 16, i32 80, i32 18, i32 82, i32 17, i32 81, i32 19, i32 83, i32 24, i32 88, i32 26, i32 90, i32 25, i32 89, i32 27, i32 91, i32 20, i32 84, i32 22, i32 86, i32 21, i32 85, i32 23, i32 87, i32 28, i32 92, i32 30, i32 94, i32 29, i32 93, i32 31, i32 95, i32 32, i32 96, i32 34, i32 98, i32 33, i32 97, i32 35, i32 99, i32 40, i32 104, i32 42, i32 106, i32 41, i32 105, i32 43, i32 107, i32 36, i32 100, i32 38, i32 102, i32 37, i32 101, i32 39, i32 103, i32 44, i32 108, i32 46, i32 110, i32 45, i32 109, i32 47, i32 111, i32 48, i32 112, i32 50, i32 114, i32 49, i32 113, i32 51, i32 115, i32 56, i32 120, i32 58, i32 122, i32 57, i32 121, i32 59, i32 123, i32 52, i32 116, i32 54, i32 118, i32 53, i32 117, i32 55, i32 119, i32 60, i32 124, i32 62, i32 126, i32 61, i32 125, i32 63, i32 127> @@ -435,7 +435,7 @@ define <128 x i8> @vshuff_35(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_36: -; CHECK: [[REG36:r[0-9]+]] = #54 +; CHECK: [[REG36:r[0-9]+]] = #-10 ; CHECK: vshuff(v1,v0,[[REG36]]) define <128 x i8> @vshuff_36(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 64, i32 65, i32 2, i32 3, i32 66, i32 67, i32 8, i32 9, i32 72, i32 73, i32 10, i32 11, i32 74, i32 75, i32 4, i32 5, i32 68, i32 69, i32 6, i32 7, i32 70, i32 71, i32 12, i32 13, i32 76, i32 77, i32 14, i32 15, i32 78, i32 79, i32 16, i32 17, i32 80, i32 81, i32 18, i32 19, i32 82, i32 83, i32 24, i32 25, i32 88, i32 89, i32 26, i32 27, i32 90, i32 91, i32 20, i32 21, i32 84, i32 85, i32 22, i32 23, i32 86, i32 87, i32 28, i32 29, i32 92, i32 93, i32 30, i32 31, i32 94, i32 95, i32 32, i32 33, i32 96, i32 97, i32 34, i32 35, i32 98, i32 99, i32 40, i32 41, i32 104, i32 105, i32 42, i32 43, i32 106, i32 107, i32 36, i32 37, 
i32 100, i32 101, i32 38, i32 39, i32 102, i32 103, i32 44, i32 45, i32 108, i32 109, i32 46, i32 47, i32 110, i32 111, i32 48, i32 49, i32 112, i32 113, i32 50, i32 51, i32 114, i32 115, i32 56, i32 57, i32 120, i32 121, i32 58, i32 59, i32 122, i32 123, i32 52, i32 53, i32 116, i32 117, i32 54, i32 55, i32 118, i32 119, i32 60, i32 61, i32 124, i32 125, i32 62, i32 63, i32 126, i32 127> @@ -443,7 +443,7 @@ define <128 x i8> @vshuff_36(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_37: -; CHECK: [[REG37:r[0-9]+]] = #55 +; CHECK: [[REG37:r[0-9]+]] = #-9 ; CHECK: vshuff(v1,v0,[[REG37]]) define <128 x i8> @vshuff_37(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127> @@ -451,7 +451,7 @@ define <128 x i8> @vshuff_37(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_38: -; CHECK: [[REG38:r[0-9]+]] = #56 +; CHECK: [[REG38:r[0-9]+]] = #-8 ; CHECK: vshuff(v1,v0,[[REG38]]) define <128 x i8> @vshuff_38(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> @@ -459,7 +459,7 @@ define <128 x i8> @vshuff_38(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_39: -; CHECK: [[REG39:r[0-9]+]] = #57 +; CHECK: [[REG39:r[0-9]+]] = #-7 ; CHECK: vshuff(v1,v0,[[REG39]]) define <128 x i8> @vshuff_39(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 2, i32 66, i32 
4, i32 68, i32 6, i32 70, i32 1, i32 65, i32 3, i32 67, i32 5, i32 69, i32 7, i32 71, i32 8, i32 72, i32 10, i32 74, i32 12, i32 76, i32 14, i32 78, i32 9, i32 73, i32 11, i32 75, i32 13, i32 77, i32 15, i32 79, i32 16, i32 80, i32 18, i32 82, i32 20, i32 84, i32 22, i32 86, i32 17, i32 81, i32 19, i32 83, i32 21, i32 85, i32 23, i32 87, i32 24, i32 88, i32 26, i32 90, i32 28, i32 92, i32 30, i32 94, i32 25, i32 89, i32 27, i32 91, i32 29, i32 93, i32 31, i32 95, i32 32, i32 96, i32 34, i32 98, i32 36, i32 100, i32 38, i32 102, i32 33, i32 97, i32 35, i32 99, i32 37, i32 101, i32 39, i32 103, i32 40, i32 104, i32 42, i32 106, i32 44, i32 108, i32 46, i32 110, i32 41, i32 105, i32 43, i32 107, i32 45, i32 109, i32 47, i32 111, i32 48, i32 112, i32 50, i32 114, i32 52, i32 116, i32 54, i32 118, i32 49, i32 113, i32 51, i32 115, i32 53, i32 117, i32 55, i32 119, i32 56, i32 120, i32 58, i32 122, i32 60, i32 124, i32 62, i32 126, i32 57, i32 121, i32 59, i32 123, i32 61, i32 125, i32 63, i32 127> @@ -467,7 +467,7 @@ define <128 x i8> @vshuff_39(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_3a: -; CHECK: [[REG3a:r[0-9]+]] = #58 +; CHECK: [[REG3a:r[0-9]+]] = #-6 ; CHECK: vshuff(v1,v0,[[REG3a]]) define <128 x i8> @vshuff_3a(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 64, i32 65, i32 4, i32 5, i32 68, i32 69, i32 2, i32 3, i32 66, i32 67, i32 6, i32 7, i32 70, i32 71, i32 8, i32 9, i32 72, i32 73, i32 12, i32 13, i32 76, i32 77, i32 10, i32 11, i32 74, i32 75, i32 14, i32 15, i32 78, i32 79, i32 16, i32 17, i32 80, i32 81, i32 20, i32 21, i32 84, i32 85, i32 18, i32 19, i32 82, i32 83, i32 22, i32 23, i32 86, i32 87, i32 24, i32 25, i32 88, i32 89, i32 28, i32 29, i32 92, i32 93, i32 26, i32 27, i32 90, i32 91, i32 30, i32 31, i32 94, i32 95, i32 32, i32 33, i32 96, i32 97, i32 36, i32 37, i32 100, i32 101, i32 34, i32 35, i32 98, i32 99, i32 38, i32 39, i32 102, i32 103, i32 40, i32 41, i32 104, i32 105, i32 44, i32 45, i32 108, i32 109, i32 42, i32 43, i32 106, i32 107, i32 46, i32 47, i32 110, i32 111, i32 48, i32 49, i32 112, i32 113, i32 52, i32 53, i32 116, i32 117, i32 50, i32 51, i32 114, i32 115, i32 54, i32 55, i32 118, i32 119, i32 56, i32 57, i32 120, i32 121, i32 60, i32 61, i32 124, i32 125, i32 58, i32 59, i32 122, i32 123, i32 62, i32 63, i32 126, i32 127> @@ -475,7 +475,7 @@ define <128 x i8> @vshuff_3a(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_3b: -; CHECK: [[REG3b:r[0-9]+]] = #59 +; CHECK: [[REG3b:r[0-9]+]] = #-5 ; CHECK: vshuff(v1,v0,[[REG3b]]) define <128 x i8> @vshuff_3b(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 1, i32 65, i32 4, i32 68, i32 5, i32 69, i32 2, i32 66, i32 3, i32 67, i32 6, i32 70, i32 7, i32 71, i32 8, i32 72, i32 9, i32 73, i32 12, i32 76, i32 13, i32 77, i32 10, i32 74, i32 11, i32 75, i32 14, i32 78, i32 15, i32 79, i32 16, i32 80, i32 17, i32 81, i32 20, i32 84, i32 21, i32 85, i32 18, i32 82, i32 19, i32 83, i32 22, i32 86, i32 23, i32 87, i32 24, i32 88, i32 25, i32 89, i32 28, i32 92, i32 29, i32 93, i32 26, i32 90, i32 27, i32 91, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 36, i32 100, i32 37, i32 101, i32 34, i32 98, i32 35, i32 99, i32 38, i32 102, i32 39, i32 103, i32 40, i32 104, i32 41, i32 105, i32 44, i32 108, i32 45, i32 109, i32 42, i32 106, i32 43, i32 107, i32 46, i32 110, i32 47, i32 111, i32 48, i32 112, i32 49, i32 113, i32 
52, i32 116, i32 53, i32 117, i32 50, i32 114, i32 51, i32 115, i32 54, i32 118, i32 55, i32 119, i32 56, i32 120, i32 57, i32 121, i32 60, i32 124, i32 61, i32 125, i32 58, i32 122, i32 59, i32 123, i32 62, i32 126, i32 63, i32 127> @@ -483,7 +483,7 @@ define <128 x i8> @vshuff_3b(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_3c: -; CHECK: [[REG3c:r[0-9]+]] = #60 +; CHECK: [[REG3c:r[0-9]+]] = #-4 ; CHECK: vshuff(v1,v0,[[REG3c]]) define <128 x i8> @vshuff_3c(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 2, i32 3, i32 64, i32 65, i32 66, i32 67, i32 4, i32 5, i32 6, i32 7, i32 68, i32 69, i32 70, i32 71, i32 8, i32 9, i32 10, i32 11, i32 72, i32 73, i32 74, i32 75, i32 12, i32 13, i32 14, i32 15, i32 76, i32 77, i32 78, i32 79, i32 16, i32 17, i32 18, i32 19, i32 80, i32 81, i32 82, i32 83, i32 20, i32 21, i32 22, i32 23, i32 84, i32 85, i32 86, i32 87, i32 24, i32 25, i32 26, i32 27, i32 88, i32 89, i32 90, i32 91, i32 28, i32 29, i32 30, i32 31, i32 92, i32 93, i32 94, i32 95, i32 32, i32 33, i32 34, i32 35, i32 96, i32 97, i32 98, i32 99, i32 36, i32 37, i32 38, i32 39, i32 100, i32 101, i32 102, i32 103, i32 40, i32 41, i32 42, i32 43, i32 104, i32 105, i32 106, i32 107, i32 44, i32 45, i32 46, i32 47, i32 108, i32 109, i32 110, i32 111, i32 48, i32 49, i32 50, i32 51, i32 112, i32 113, i32 114, i32 115, i32 52, i32 53, i32 54, i32 55, i32 116, i32 117, i32 118, i32 119, i32 56, i32 57, i32 58, i32 59, i32 120, i32 121, i32 122, i32 123, i32 60, i32 61, i32 62, i32 63, i32 124, i32 125, i32 126, i32 127> @@ -491,7 +491,7 @@ define <128 x i8> @vshuff_3c(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_3d: -; CHECK: [[REG3d:r[0-9]+]] = #61 +; CHECK: [[REG3d:r[0-9]+]] = #-3 ; CHECK: vshuff(v1,v0,[[REG3d]]) define <128 x i8> @vshuff_3d(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 2, i32 66, i32 1, i32 65, i32 3, i32 67, i32 4, i32 68, i32 6, i32 70, i32 5, i32 69, i32 7, i32 71, i32 8, i32 72, i32 10, i32 74, i32 9, i32 73, i32 11, i32 75, i32 12, i32 76, i32 14, i32 78, i32 13, i32 77, i32 15, i32 79, i32 16, i32 80, i32 18, i32 82, i32 17, i32 81, i32 19, i32 83, i32 20, i32 84, i32 22, i32 86, i32 21, i32 85, i32 23, i32 87, i32 24, i32 88, i32 26, i32 90, i32 25, i32 89, i32 27, i32 91, i32 28, i32 92, i32 30, i32 94, i32 29, i32 93, i32 31, i32 95, i32 32, i32 96, i32 34, i32 98, i32 33, i32 97, i32 35, i32 99, i32 36, i32 100, i32 38, i32 102, i32 37, i32 101, i32 39, i32 103, i32 40, i32 104, i32 42, i32 106, i32 41, i32 105, i32 43, i32 107, i32 44, i32 108, i32 46, i32 110, i32 45, i32 109, i32 47, i32 111, i32 48, i32 112, i32 50, i32 114, i32 49, i32 113, i32 51, i32 115, i32 52, i32 116, i32 54, i32 118, i32 53, i32 117, i32 55, i32 119, i32 56, i32 120, i32 58, i32 122, i32 57, i32 121, i32 59, i32 123, i32 60, i32 124, i32 62, i32 126, i32 61, i32 125, i32 63, i32 127> @@ -499,7 +499,7 @@ define <128 x i8> @vshuff_3d(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_3e: -; CHECK: [[REG3e:r[0-9]+]] = #62 +; CHECK: [[REG3e:r[0-9]+]] = #-2 ; CHECK: vshuff(v1,v0,[[REG3e]]) define <128 x i8> @vshuff_3e(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 1, i32 64, i32 65, i32 2, i32 3, i32 66, i32 67, i32 4, i32 5, i32 68, i32 69, i32 6, i32 7, i32 70, i32 71, i32 8, i32 9, i32 72, i32 73, i32 10, i32 11, i32 74, i32 75, i32 12, 
i32 13, i32 76, i32 77, i32 14, i32 15, i32 78, i32 79, i32 16, i32 17, i32 80, i32 81, i32 18, i32 19, i32 82, i32 83, i32 20, i32 21, i32 84, i32 85, i32 22, i32 23, i32 86, i32 87, i32 24, i32 25, i32 88, i32 89, i32 26, i32 27, i32 90, i32 91, i32 28, i32 29, i32 92, i32 93, i32 30, i32 31, i32 94, i32 95, i32 32, i32 33, i32 96, i32 97, i32 34, i32 35, i32 98, i32 99, i32 36, i32 37, i32 100, i32 101, i32 38, i32 39, i32 102, i32 103, i32 40, i32 41, i32 104, i32 105, i32 42, i32 43, i32 106, i32 107, i32 44, i32 45, i32 108, i32 109, i32 46, i32 47, i32 110, i32 111, i32 48, i32 49, i32 112, i32 113, i32 50, i32 51, i32 114, i32 115, i32 52, i32 53, i32 116, i32 117, i32 54, i32 55, i32 118, i32 119, i32 56, i32 57, i32 120, i32 121, i32 58, i32 59, i32 122, i32 123, i32 60, i32 61, i32 124, i32 125, i32 62, i32 63, i32 126, i32 127> @@ -507,7 +507,7 @@ define <128 x i8> @vshuff_3e(<128 x i8> %v0, <128 x i8> %v1) #0 { } ; CHECK-LABEL: vshuff_3f: -; CHECK: [[REG3f:r[0-9]+]] = #63 +; CHECK: [[REG3f:r[0-9]+]] = #-1 ; CHECK: vshuff(v1,v0,[[REG3f]]) define <128 x i8> @vshuff_3f(<128 x i8> %v0, <128 x i8> %v1) #0 { %p = shufflevector <128 x i8> %v0, <128 x i8> undef, <128 x i32> < i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127> diff --git a/llvm/test/CodeGen/Hexagon/autohvx/shuff-combos-128b.ll b/llvm/test/CodeGen/Hexagon/autohvx/shuff-combos-128b.ll index 8691243351325..f359244d78f00 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/shuff-combos-128b.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/shuff-combos-128b.ll @@ -2,7 +2,7 @@ ; Generator: vdeal(0x37), vdeal(0x53), vshuff(0x2f), vdeal(0x4b), vdeal(0x27), vdeal(0x43), vshuff(0x1f), vdeal(0x5b), vshuff(0x7e), vshuff(0x6c), vdeal(0x5a), vdeal(0x38), vshuff(0x16), vshuff(0x44), vdeal(0x72) ; CHECK-LABEL: test_0000: -; CHECK-DAG: [[R00:r[0-9]+]] = #66 +; CHECK-DAG: [[R00:r[0-9]+]] = #-62 ; CHECK-DAG: [[R01:r[0-9]+]] = #46 ; CHECK-DAG: [[R02:r[0-9]+]] = #1 ; CHECK: v[[H00:[0-9]+]]:[[L00:[0-9]+]] = vshuff(v1,v0,[[R00]]) @@ -18,7 +18,7 @@ define <256 x i8> @test_0000(<256 x i8> %v0) #0 { ; CHECK-LABEL: test_0001: ; CHECK-DAG: [[R10:r[0-9]+]] = #24 ; CHECK-DAG: [[R11:r[0-9]+]] = #9 -; CHECK-DAG: [[R12:r[0-9]+]] = #68 +; CHECK-DAG: [[R12:r[0-9]+]] = #-60 ; CHECK-DAG: [[R13:r[0-9]+]] = #34 ; CHECK: v[[H10:[0-9]+]]:[[L10:[0-9]+]] = vshuff(v1,v0,[[R10]]) ; CHECK: v[[H11:[0-9]+]]:[[L11:[0-9]+]] = vdeal(v[[H10]],v[[L10]],[[R11]]) @@ -34,7 +34,7 @@ define <256 x i8> @test_0001(<256 x i8> %v0) #0 { ; CHECK-LABEL: test_0002: ; CHECK-DAG: [[R20:r[0-9]+]] = #18 ; CHECK-DAG: 
[[R21:r[0-9]+]] = #10 -; CHECK-DAG: [[R22:r[0-9]+]] = #68 +; CHECK-DAG: [[R22:r[0-9]+]] = #-60 ; CHECK-DAG: [[R23:r[0-9]+]] = #5 ; CHECK: v[[H20:[0-9]+]]:[[L20:[0-9]+]] = vshuff(v1,v0,[[R20]]) ; CHECK: v[[H21:[0-9]+]]:[[L21:[0-9]+]] = vdeal(v[[H20]],v[[L20]],[[R21]]) @@ -51,7 +51,7 @@ define <256 x i8> @test_0002(<256 x i8> %v0) #0 { ; CHECK-DAG: [[R30:r[0-9]+]] = #21 ; CHECK-DAG: [[R31:r[0-9]+]] = #9 ; CHECK-DAG: [[R32:r[0-9]+]] = #34 -; CHECK-DAG: [[R33:r[0-9]+]] = #66 +; CHECK-DAG: [[R33:r[0-9]+]] = #-62 ; CHECK: v[[H30:[0-9]+]]:[[L30:[0-9]+]] = vshuff(v1,v0,[[R30]]) ; CHECK: v[[H31:[0-9]+]]:[[L31:[0-9]+]] = vdeal(v[[H30]],v[[L30]],[[R31]]) ; CHECK: v[[H32:[0-9]+]]:[[L32:[0-9]+]] = vshuff(v[[H31]],v[[L31]],[[R32]]) @@ -65,7 +65,7 @@ define <256 x i8> @test_0003(<256 x i8> %v0) #0 { ; Generator: vdeal(0x63), vshuff(0x6f), vdeal(0x77), vshuff(0x75), vdeal(0x3d), vshuff(0x2d), vshuff(0x00), vshuff(0x5c), vdeal(0x04), vshuff(0x79), vshuff(0x21), vdeal(0x7b), vdeal(0x66), vshuff(0x59), vdeal(0x54) ; CHECK-LABEL: test_0004: ; CHECK-DAG: [[R40:r[0-9]+]] = #38 -; CHECK-DAG: [[R41:r[0-9]+]] = #72 +; CHECK-DAG: [[R41:r[0-9]+]] = #-56 ; CHECK-DAG: [[R42:r[0-9]+]] = #18 ; CHECK: v[[H40:[0-9]+]]:[[L40:[0-9]+]] = vshuff(v1,v0,[[R40]]) ; CHECK: v[[H41:[0-9]+]]:[[L41:[0-9]+]] = vshuff(v[[H40]],v[[L40]],[[R41]]) @@ -81,7 +81,7 @@ define <256 x i8> @test_0004(<256 x i8> %v0) #0 { ; CHECK-DAG: [[R50:r[0-9]+]] = #9 ; CHECK-DAG: [[R51:r[0-9]+]] = #3 ; CHECK-DAG: [[R52:r[0-9]+]] = #48 -; CHECK-DAG: [[R53:r[0-9]+]] = #68 +; CHECK-DAG: [[R53:r[0-9]+]] = #-60 ; CHECK: v[[H50:[0-9]+]]:[[L50:[0-9]+]] = vshuff(v1,v0,[[R50]]) ; CHECK: v[[H51:[0-9]+]]:[[L51:[0-9]+]] = vdeal(v[[H50]],v[[L50]],[[R51]]) ; CHECK: v[[H52:[0-9]+]]:[[L52:[0-9]+]] = vdeal(v[[H51]],v[[L51]],[[R52]]) @@ -94,7 +94,7 @@ define <256 x i8> @test_0005(<256 x i8> %v0) #0 { ; Generator: vshuff(0x34), vshuff(0x07), vdeal(0x5d), vshuff(0x05), vshuff(0x50), vshuff(0x13), vdeal(0x31), vdeal(0x6e), vdeal(0x0f), vdeal(0x2c), vdeal(0x28), vdeal(0x76), vdeal(0x22), vdeal(0x3a), vdeal(0x51) ; CHECK-LABEL: test_0006: -; CHECK-DAG: [[R60:r[0-9]+]] = #85 +; CHECK-DAG: [[R60:r[0-9]+]] = #-43 ; CHECK-DAG: [[R61:r[0-9]+]] = #2 ; CHECK: v[[H60:[0-9]+]]:[[L60:[0-9]+]] = vdeal(v1,v0,[[R60]]) ; CHECK: v[[H61:[0-9]+]]:[[L61:[0-9]+]] = vshuff(v[[H60]],v[[L60]],[[R61]]) @@ -106,7 +106,7 @@ define <256 x i8> @test_0006(<256 x i8> %v0) #0 { ; Generator: vshuff(0x25), vshuff(0x4c), vshuff(0x72), vdeal(0x70), vshuff(0x3b), vshuff(0x26), vshuff(0x4d), vdeal(0x20), vshuff(0x7f), vdeal(0x6a), vdeal(0x78), vshuff(0x5f), vdeal(0x10), vdeal(0x71), vshuff(0x6d) ; CHECK-LABEL: test_0007: -; CHECK-DAG: [[R70:r[0-9]+]] = #74 +; CHECK-DAG: [[R70:r[0-9]+]] = #-54 ; CHECK-DAG: [[R71:r[0-9]+]] = #20 ; CHECK-DAG: [[R72:r[0-9]+]] = #34 ; CHECK: v[[H70:[0-9]+]]:[[L70:[0-9]+]] = vshuff(v1,v0,[[R70]]) @@ -120,7 +120,7 @@ define <256 x i8> @test_0007(<256 x i8> %v0) #0 { ; Generator: vshuff(0x2e), vshuff(0x40), vdeal(0x35), vdeal(0x3e), vdeal(0x06), vshuff(0x4b), vshuff(0x24), vshuff(0x09), vdeal(0x18), vshuff(0x42), vshuff(0x43), vshuff(0x41), vshuff(0x23), vdeal(0x3f), vdeal(0x39) ; CHECK-LABEL: test_0008: -; CHECK-DAG: [[R80:r[0-9]+]] = #73 +; CHECK-DAG: [[R80:r[0-9]+]] = #-55 ; CHECK-DAG: [[R81:r[0-9]+]] = #5 ; CHECK-DAG: [[R82:r[0-9]+]] = #48 ; CHECK-DAG: [[R83:r[0-9]+]] = #2 @@ -136,7 +136,7 @@ define <256 x i8> @test_0008(<256 x i8> %v0) #0 { ; Generator: vshuff(0x33), vshuff(0x5e), vshuff(0x2a), vdeal(0x2f), vdeal(0x1f), vshuff(0x14), vshuff(0x17), vshuff(0x1b), vdeal(0x1c), 
vdeal(0x15), vshuff(0x37), vshuff(0x3c), vdeal(0x4e), vdeal(0x7d), vshuff(0x61) ; CHECK-LABEL: test_0009: -; CHECK-DAG: [[R90:r[0-9]+]] = #96 +; CHECK-DAG: [[R90:r[0-9]+]] = #-32 ; CHECK-DAG: [[R91:r[0-9]+]] = #18 ; CHECK-DAG: [[R92:r[0-9]+]] = #5 ; CHECK: v[[H90:[0-9]+]]:[[L90:[0-9]+]] = vshuff(v1,v0,[[R90]]) @@ -152,7 +152,7 @@ define <256 x i8> @test_0009(<256 x i8> %v0) #0 { ; CHECK-LABEL: test_000a: ; CHECK-DAG: [[Ra0:r[0-9]+]] = #44 ; CHECK-DAG: [[Ra1:r[0-9]+]] = #6 -; CHECK-DAG: [[Ra2:r[0-9]+]] = #80 +; CHECK-DAG: [[Ra2:r[0-9]+]] = #-48 ; CHECK: v[[Ha0:[0-9]+]]:[[La0:[0-9]+]] = vshuff(v1,v0,[[Ra0]]) ; CHECK: v[[Ha1:[0-9]+]]:[[La1:[0-9]+]] = vdeal(v[[Ha0]],v[[La0]],[[Ra1]]) ; CHECK: v[[Ha2:[0-9]+]]:[[La2:[0-9]+]] = vshuff(v[[Ha1]],v[[La1]],[[Ra2]]) @@ -164,7 +164,7 @@ define <256 x i8> @test_000a(<256 x i8> %v0) #0 { ; Generator: vshuff(0x74), vshuff(0x11), vshuff(0x53), vshuff(0x66), vshuff(0x1d), vdeal(0x59), vshuff(0x63), vshuff(0x49), vdeal(0x00), vshuff(0x38), vshuff(0x45), vdeal(0x68), vshuff(0x65), vshuff(0x6e), vdeal(0x62) ; CHECK-LABEL: test_000b: -; CHECK-DAG: [[Rb0:r[0-9]+]] = #68 +; CHECK-DAG: [[Rb0:r[0-9]+]] = #-60 ; CHECK-DAG: [[Rb1:r[0-9]+]] = #5 ; CHECK-DAG: [[Rb2:r[0-9]+]] = #18 ; CHECK-DAG: [[Rb3:r[0-9]+]] = #40 @@ -182,7 +182,7 @@ define <256 x i8> @test_000b(<256 x i8> %v0) #0 { ; CHECK-LABEL: test_000c: ; CHECK-DAG: [[Rc0:r[0-9]+]] = #10 ; CHECK-DAG: [[Rc1:r[0-9]+]] = #3 -; CHECK-DAG: [[Rc2:r[0-9]+]] = #84 +; CHECK-DAG: [[Rc2:r[0-9]+]] = #-44 ; CHECK: v[[Hc0:[0-9]+]]:[[Lc0:[0-9]+]] = vshuff(v1,v0,[[Rc0]]) ; CHECK: v[[Hc1:[0-9]+]]:[[Lc1:[0-9]+]] = vdeal(v[[Hc0]],v[[Lc0]],[[Rc1]]) ; CHECK: v[[Hc2:[0-9]+]]:[[Lc2:[0-9]+]] = vshuff(v[[Hc1]],v[[Lc1]],[[Rc2]]) @@ -195,7 +195,7 @@ define <256 x i8> @test_000c(<256 x i8> %v0) #0 { ; Generator: vdeal(0x58), vdeal(0x0b), vdeal(0x21), vdeal(0x7f), vshuff(0x6a), vshuff(0x78), vshuff(0x52), vshuff(0x73), vshuff(0x06), vdeal(0x2d), vdeal(0x32), vdeal(0x48), vdeal(0x75), vdeal(0x55), vshuff(0x0e) ; CHECK-LABEL: test_000d: ; CHECK-DAG: [[Rd0:r[0-9]+]] = #36 -; CHECK-DAG: [[Rd1:r[0-9]+]] = #80 +; CHECK-DAG: [[Rd1:r[0-9]+]] = #-48 ; CHECK-DAG: [[Rd2:r[0-9]+]] = #9 ; CHECK: v[[Hd0:[0-9]+]]:[[Ld0:[0-9]+]] = vshuff(v1,v0,[[Rd0]]) ; CHECK: v[[Hd1:[0-9]+]]:[[Ld1:[0-9]+]] = vshuff(v[[Hd0]],v[[Ld0]],[[Rd1]]) @@ -208,7 +208,7 @@ define <256 x i8> @test_000d(<256 x i8> %v0) #0 { ; Generator: vdeal(0x6f), vdeal(0x13), vdeal(0x07), vdeal(0x56), vshuff(0x2c), vdeal(0x0c), vdeal(0x33), vshuff(0x22), vdeal(0x02), vshuff(0x18), vdeal(0x4d), vshuff(0x51), vshuff(0x3e), vshuff(0x77), vshuff(0x30) ; CHECK-LABEL: test_000e: -; CHECK-DAG: [[Re0:r[0-9]+]] = #65 +; CHECK-DAG: [[Re0:r[0-9]+]] = #-63 ; CHECK-DAG: [[Re1:r[0-9]+]] = #24 ; CHECK-DAG: [[Re2:r[0-9]+]] = #36 ; CHECK: v[[He0:[0-9]+]]:[[Le0:[0-9]+]] = vshuff(v1,v0,[[Re0]]) diff --git a/llvm/test/CodeGen/Hexagon/autohvx/shuff-combos-64b.ll b/llvm/test/CodeGen/Hexagon/autohvx/shuff-combos-64b.ll index c81b3534e2eb4..7298cebe08481 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/shuff-combos-64b.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/shuff-combos-64b.ll @@ -2,7 +2,7 @@ ; Generator: vdeal(0x1f), vshuff(0x32), vshuff(0x24), vshuff(0x26), vshuff(0x08), vdeal(0x3a), vshuff(0x0c), vdeal(0x0e), vdeal(0x30), vdeal(0x22), vdeal(0x14), vdeal(0x36), vdeal(0x18), vdeal(0x0a), vdeal(0x3c) ; CHECK-LABEL: test_0000: -; CHECK-DAG: [[R00:r[0-9]+]] = #49 +; CHECK-DAG: [[R00:r[0-9]+]] = #-15 ; CHECK-DAG: [[R01:r[0-9]+]] = #3 ; CHECK: v[[H00:[0-9]+]]:[[L00:[0-9]+]] = vshuff(v1,v0,[[R00]]) ; CHECK: 
v[[H01:[0-9]+]]:[[L01:[0-9]+]] = vdeal(v[[H00]],v[[L00]],[[R01]]) @@ -15,7 +15,7 @@ define <128 x i8> @test_0000(<128 x i8> %v0) #0 { ; Generator: vshuff(0x1e), vshuff(0x00), vdeal(0x12), vshuff(0x34), vshuff(0x0b), vshuff(0x2b), vdeal(0x16), vshuff(0x2e), vshuff(0x1a), vdeal(0x28), vshuff(0x2d), vdeal(0x15), vdeal(0x1d), vshuff(0x25), vshuff(0x0d) ; CHECK-LABEL: test_0001: ; CHECK-DAG: [[R10:r[0-9]+]] = #10 -; CHECK-DAG: [[R11:r[0-9]+]] = #34 +; CHECK-DAG: [[R11:r[0-9]+]] = #-30 ; CHECK-DAG: [[R12:r[0-9]+]] = #16 ; CHECK: v[[H10:[0-9]+]]:[[L10:[0-9]+]] = vshuff(v1,v0,[[R10]]) ; CHECK: v[[H11:[0-9]+]]:[[L11:[0-9]+]] = vshuff(v[[H10]],v[[L10]],[[R11]]) @@ -41,7 +41,7 @@ define <128 x i8> @test_0002(<128 x i8> %v0) #0 { ; Generator: vshuff(0x11), vshuff(0x2b), vdeal(0x3d), vdeal(0x3e), vshuff(0x02), vdeal(0x1c), vdeal(0x2f), vdeal(0x0f), vshuff(0x36), vshuff(0x38), vdeal(0x35), vshuff(0x1b), vshuff(0x3b), vdeal(0x21), vdeal(0x15) ; CHECK-LABEL: test_0003: -; CHECK-DAG: [[R30:r[0-9]+]] = #34 +; CHECK-DAG: [[R30:r[0-9]+]] = #-30 ; CHECK-DAG: [[R31:r[0-9]+]] = #10 ; CHECK-DAG: [[R32:r[0-9]+]] = #5 ; CHECK: v[[H30:[0-9]+]]:[[L30:[0-9]+]] = vshuff(v1,v0,[[R30]]) @@ -55,7 +55,7 @@ define <128 x i8> @test_0003(<128 x i8> %v0) #0 { ; Generator: vdeal(0x0a), vdeal(0x10), vdeal(0x31), vshuff(0x30), vdeal(0x00), vdeal(0x39), vdeal(0x0e), vshuff(0x37), vshuff(0x17), vshuff(0x06), vshuff(0x07), vshuff(0x09), vshuff(0x3c), vshuff(0x33), vshuff(0x33) ; CHECK-LABEL: test_0004: -; CHECK-DAG: [[R40:r[0-9]+]] = #57 +; CHECK-DAG: [[R40:r[0-9]+]] = #-7 ; CHECK-DAG: [[R41:r[0-9]+]] = #6 ; CHECK-DAG: [[R42:r[0-9]+]] = #1 ; CHECK: v[[H40:[0-9]+]]:[[L40:[0-9]+]] = vshuff(v1,v0,[[R40]]) @@ -69,7 +69,7 @@ define <128 x i8> @test_0004(<128 x i8> %v0) #0 { ; Generator: vdeal(0x1c), vshuff(0x31), vdeal(0x1f), vshuff(0x29), vdeal(0x1a), vshuff(0x2a), vshuff(0x25), vshuff(0x05), vshuff(0x04), vshuff(0x23), vdeal(0x0d), vdeal(0x20), vshuff(0x29), vdeal(0x2f), vshuff(0x1d) ; CHECK-LABEL: test_0005: -; CHECK-DAG: [[R50:r[0-9]+]] = #33 +; CHECK-DAG: [[R50:r[0-9]+]] = #-31 ; CHECK-DAG: [[R51:r[0-9]+]] = #12 ; CHECK-DAG: [[R52:r[0-9]+]] = #1{{$}} ; CHECK: v[[H50:[0-9]+]]:[[L50:[0-9]+]] = vshuff(v1,v0,[[R50]]) @@ -84,7 +84,7 @@ define <128 x i8> @test_0005(<128 x i8> %v0) #0 { ; Generator: vdeal(0x22), vshuff(0x24), vdeal(0x16), vdeal(0x18), vshuff(0x17), vdeal(0x2d), vshuff(0x38), vshuff(0x20), vshuff(0x37), vdeal(0x3f), vdeal(0x10), vdeal(0x32), vshuff(0x14), vshuff(0x13), vdeal(0x0b) ; CHECK-LABEL: test_0006: ; CHECK-DAG: [[R60:r[0-9]+]] = #3{{$}} -; CHECK-DAG: [[R61:r[0-9]+]] = #36 +; CHECK-DAG: [[R61:r[0-9]+]] = #-28 ; CHECK: v[[H60:[0-9]+]]:[[L60:[0-9]+]] = vdeal(v1,v0,[[R60]]) ; CHECK: v[[H61:[0-9]+]]:[[L61:[0-9]+]] = vshuff(v[[H60]],v[[L60]],[[R61]]) ; CHECK-NOT: v{{[0-9:]+}} = @@ -95,7 +95,7 @@ define <128 x i8> @test_0006(<128 x i8> %v0) #0 { ; Generator: vdeal(0x0f), vdeal(0x01), vshuff(0x3b), vdeal(0x0c), vdeal(0x3f), vdeal(0x26), vshuff(0x28), vdeal(0x3a), vdeal(0x02), vdeal(0x1b), vshuff(0x0e), vdeal(0x03), vshuff(0x3d), vshuff(0x2c), vshuff(0x15) ; CHECK-LABEL: test_0007: -; CHECK-DAG: [[R70:r[0-9]+]] = #50 +; CHECK-DAG: [[R70:r[0-9]+]] = #-14 ; CHECK-DAG: [[R71:r[0-9]+]] = #5{{$}} ; CHECK-DAG: [[R72:r[0-9]+]] = #8 ; CHECK: v[[H70:[0-9]+]]:[[L70:[0-9]+]] = vshuff(v1,v0,[[R70]]) @@ -124,7 +124,7 @@ define <128 x i8> @test_0008(<128 x i8> %v0) #0 { ; Generator: vshuff(0x1d), vshuff(0x18), vdeal(0x09), vshuff(0x2a), vdeal(0x03), vdeal(0x27), vdeal(0x25), vdeal(0x13), vshuff(0x3a), vshuff(0x19), vshuff(0x06), 
vshuff(0x0f), vshuff(0x3c), vshuff(0x2e), vshuff(0x36) ; CHECK-LABEL: test_0009: ; CHECK-DAG: [[R90:r[0-9]+]] = #17 -; CHECK-DAG: [[R91:r[0-9]+]] = #40 +; CHECK-DAG: [[R91:r[0-9]+]] = #-24 ; CHECK-DAG: [[R92:r[0-9]+]] = #6 ; CHECK: v[[H90:[0-9]+]]:[[L90:[0-9]+]] = vdeal(v1,v0,[[R90]]) ; CHECK: v[[H91:[0-9]+]]:[[L91:[0-9]+]] = vshuff(v[[H90]],v[[L90]],[[R91]]) @@ -137,7 +137,7 @@ define <128 x i8> @test_0009(<128 x i8> %v0) #0 { ; Generator: vdeal(0x05), vshuff(0x10), vdeal(0x0d), vshuff(0x12), vdeal(0x08), vshuff(0x22), vdeal(0x24), vshuff(0x3e), vdeal(0x00), vshuff(0x14), vdeal(0x3b), vdeal(0x33), vshuff(0x2f), vdeal(0x13), vdeal(0x14) ; CHECK-LABEL: test_000a: -; CHECK-DAG: [[Ra0:r[0-9]+]] = #56 +; CHECK-DAG: [[Ra0:r[0-9]+]] = #-8 ; CHECK-DAG: [[Ra1:r[0-9]+]] = #13 ; CHECK-DAG: [[Ra2:r[0-9]+]] = #2 ; CHECK: v[[Ha0:[0-9]+]]:[[La0:[0-9]+]] = vshuff(v1,v0,[[Ra0]]) @@ -152,7 +152,7 @@ define <128 x i8> @test_000a(<128 x i8> %v0) #0 { ; Generator: vdeal(0x12), vshuff(0x2c), vdeal(0x2d), vshuff(0x01), vshuff(0x1f), vshuff(0x30), vdeal(0x2a), vdeal(0x0b), vdeal(0x32), vshuff(0x08), vdeal(0x1b), vdeal(0x09), vshuff(0x1c), vshuff(0x16), vdeal(0x38) ; CHECK-LABEL: test_000b: ; CHECK-DAG: [[Rb0:r[0-9]+]] = #12 -; CHECK-DAG: [[Rb1:r[0-9]+]] = #33 +; CHECK-DAG: [[Rb1:r[0-9]+]] = #-31 ; CHECK-DAG: [[Rb2:r[0-9]+]] = #18 ; CHECK: v[[Hb0:[0-9]+]]:[[Lb0:[0-9]+]] = vdeal(v1,v0,[[Rb0]]) ; CHECK: v[[Hb1:[0-9]+]]:[[Lb1:[0-9]+]] = vdeal(v[[Hb0]],v[[Lb0]],[[Rb1]]) @@ -168,7 +168,7 @@ define <128 x i8> @test_000b(<128 x i8> %v0) #0 { ; CHECK-DAG: [[Rc0:r[0-9]+]] = #12 ; CHECK-DAG: [[Rc1:r[0-9]+]] = #6 ; CHECK-DAG: [[Rc2:r[0-9]+]] = #17 -; CHECK-DAG: [[Rc3:r[0-9]+]] = #32 +; CHECK-DAG: [[Rc3:r[0-9]+]] = #-32 ; CHECK: v[[Hc0:[0-9]+]]:[[Lc0:[0-9]+]] = vshuff(v1,v0,[[Rc0]]) ; CHECK: v[[Hc1:[0-9]+]]:[[Lc1:[0-9]+]] = vdeal(v[[Hc0]],v[[Lc0]],[[Rc1]]) ; CHECK: v[[Hc2:[0-9]+]]:[[Lc2:[0-9]+]] = vdeal(v[[Hc1]],v[[Lc1]],[[Rc2]]) @@ -181,7 +181,7 @@ define <128 x i8> @test_000c(<128 x i8> %v0) #0 { ; Generator: vdeal(0x3c), vdeal(0x24), vdeal(0x05), vdeal(0x37), vshuff(0x21), vdeal(0x11), vdeal(0x1d), vshuff(0x00), vshuff(0x34), vshuff(0x0d), vshuff(0x3a), vshuff(0x1f), vshuff(0x03), vshuff(0x1e), vdeal(0x29) ; CHECK-LABEL: test_000d: -; CHECK-DAG: [[Rd0:r[0-9]+]] = #40 +; CHECK-DAG: [[Rd0:r[0-9]+]] = #-24 ; CHECK-DAG: [[Rd1:r[0-9]+]] = #28 ; CHECK: v[[Hd0:[0-9]+]]:[[Ld0:[0-9]+]] = vshuff(v1,v0,[[Rd0]]) ; CHECK: v[[Hd1:[0-9]+]]:[[Ld1:[0-9]+]] = vdeal(v[[Hd0]],v[[Ld0]],[[Rd1]]) @@ -193,7 +193,7 @@ define <128 x i8> @test_000d(<128 x i8> %v0) #0 { ; Generator: vshuff(0x18), vdeal(0x36), vdeal(0x33), vdeal(0x26), vshuff(0x04), vshuff(0x2d), vshuff(0x35), vdeal(0x34), vdeal(0x2e), vdeal(0x25), vdeal(0x28), vshuff(0x0c), vdeal(0x07), vshuff(0x35), vshuff(0x01) ; CHECK-LABEL: test_000e: -; CHECK-DAG: [[Re0:r[0-9]+]] = #58 +; CHECK-DAG: [[Re0:r[0-9]+]] = #-6 ; CHECK: v[[He0:[0-9]+]]:[[Le0:[0-9]+]] = vshuff(v1,v0,[[Re0]]) ; CHECK-NOT: v{{[0-9:]+}} = define <128 x i8> @test_000e(<128 x i8> %v0) #0 { @@ -203,7 +203,7 @@ define <128 x i8> @test_000e(<128 x i8> %v0) #0 { ; Generator: vshuff(0x1a), vshuff(0x10), vdeal(0x2b), vshuff(0x15), vdeal(0x12), vdeal(0x30), vshuff(0x23), vshuff(0x02), vshuff(0x32), vshuff(0x08), vshuff(0x05), vdeal(0x3e), vshuff(0x39), vshuff(0x0a), vshuff(0x0e) ; CHECK-LABEL: test_000f: -; CHECK-DAG: [[Rf0:r[0-9]+]] = #44 +; CHECK-DAG: [[Rf0:r[0-9]+]] = #-20 ; CHECK-DAG: [[Rf1:r[0-9]+]] = #18 ; CHECK: v[[Hf0:[0-9]+]]:[[Lf0:[0-9]+]] = vshuff(v1,v0,[[Rf0]]) ; CHECK: v[[Hf1:[0-9]+]]:[[Lf1:[0-9]+]] = 
vshuff(v[[Hf0]],v[[Lf0]],[[Rf1]]) diff --git a/llvm/test/CodeGen/Hexagon/autohvx/shuff-perfect-inverted-pair.ll b/llvm/test/CodeGen/Hexagon/autohvx/shuff-perfect-inverted-pair.ll index 9ce849e464d9a..946658429bc25 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/shuff-perfect-inverted-pair.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/shuff-perfect-inverted-pair.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=hexagon < %s | FileCheck %s ; CHECK-LABEL: f0: -; CHECK: r[[R0:[0-9]+]] = #60 +; CHECK: r[[R0:[0-9]+]] = #-4 ; CHECK: v1:0 = vshuff(v0,v2,r[[R0]]) define <128 x i8> @f0(<128 x i8> %a0, <128 x i8> %a1) #0 { %v0 = shufflevector <128 x i8> %a0, <128 x i8> %a1, <128 x i32> @@ -9,7 +9,7 @@ define <128 x i8> @f0(<128 x i8> %a0, <128 x i8> %a1) #0 { } ; CHECK-LABEL: f1: -; CHECK: r[[R0:[0-9]+]] = #124 +; CHECK: r[[R0:[0-9]+]] = #-4 ; CHECK: v1:0 = vshuff(v0,v2,r[[R0]]) define <256 x i8> @f1(<256 x i8> %a0, <256 x i8> %a1) #1 { %v0 = shufflevector <256 x i8> %a0, <256 x i8> %a1, <256 x i32> diff --git a/llvm/test/CodeGen/LoongArch/lasx/abs.ll b/llvm/test/CodeGen/LoongArch/lasx/abs.ll new file mode 100644 index 0000000000000..e3b0d04d92d75 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/abs.ll @@ -0,0 +1,128 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @vabs_b(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.b $xr1, $xr0 +; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <32 x i8>, ptr %src + %b = tail call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a, i1 true) + store <32 x i8> %b, ptr %dst + ret void +} + +define void @vabs_b_1(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_b_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.b $xr1, $xr0 +; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <32 x i8>, ptr %src + %b = tail call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a, i1 false) + store <32 x i8> %b, ptr %dst + ret void +} + +define void @vabs_h(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.h $xr1, $xr0 +; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <16 x i16>, ptr %src + %b = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a, i1 true) + store <16 x i16> %b, ptr %dst + ret void +} + +define void @vabs_h_1(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_h_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.h $xr1, $xr0 +; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <16 x i16>, ptr %src + %b = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a, i1 false) + store <16 x i16> %b, ptr %dst + ret void +} + +define void @vabs_w(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.w $xr1, $xr0 +; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x i32>, ptr %src + %b = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a, i1 true) + store <8 x i32> %b, ptr %dst + ret void +} + +define void @vabs_w_1(ptr %dst, ptr 
%src) { +; CHECK-LABEL: vabs_w_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.w $xr1, $xr0 +; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x i32>, ptr %src + %b = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a, i1 false) + store <8 x i32> %b, ptr %dst + ret void +} + +define void @vabs_d(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.d $xr1, $xr0 +; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %src + %b = tail call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a, i1 true) + store <4 x i64> %b, ptr %dst + ret void +} + +define void @vabs_d_1(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_d_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvneg.d $xr1, $xr0 +; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %src + %b = tail call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a, i1 false) + store <4 x i64> %b, ptr %dst + ret void +} + +declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) +declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) +declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) diff --git a/llvm/test/CodeGen/LoongArch/lasx/and-not-combine.ll b/llvm/test/CodeGen/LoongArch/lasx/and-not-combine.ll new file mode 100644 index 0000000000000..67549599db2f3 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/and-not-combine.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @and_not_combine_v32i8(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { +; CHECK-LABEL: and_not_combine_v32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a3, 0 +; CHECK-NEXT: xvld $xr2, $a1, 0 +; CHECK-NEXT: xvsub.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <32 x i8>, ptr %a0 + %v1 = load <32 x i8>, ptr %a1 + %v2 = load <32 x i8>, ptr %a2 + %not = xor <32 x i8> %v1, + %add = add <32 x i8> %not, %v2 + %and = and <32 x i8> %v0, %add + store <32 x i8> %and, ptr %res + ret void +} + +define void @and_not_combine_v16i16(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { +; CHECK-LABEL: and_not_combine_v16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a3, 0 +; CHECK-NEXT: xvld $xr2, $a1, 0 +; CHECK-NEXT: xvsub.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <16 x i16>, ptr %a0 + %v1 = load <16 x i16>, ptr %a1 + %v2 = load <16 x i16>, ptr %a2 + %not = xor <16 x i16> %v1, + %add = add <16 x i16> %not, %v2 + %and = and <16 x i16> %v0, %add + store <16 x i16> %and, ptr %res + ret void +} + +define void @and_not_combine_v8i32(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { +; CHECK-LABEL: and_not_combine_v8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a3, 0 +; CHECK-NEXT: xvld $xr2, $a1, 0 +; CHECK-NEXT: xvsub.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: 
ret +entry: + %v0 = load <8 x i32>, ptr %a0 + %v1 = load <8 x i32>, ptr %a1 + %v2 = load <8 x i32>, ptr %a2 + %not = xor <8 x i32> %v1, + %add = add <8 x i32> %not, %v2 + %and = and <8 x i32> %v0, %add + store <8 x i32> %and, ptr %res + ret void +} + +define void @and_not_combine_v4i64(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { +; CHECK-LABEL: and_not_combine_v4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a3, 0 +; CHECK-NEXT: xvld $xr2, $a1, 0 +; CHECK-NEXT: xvsub.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvandn.v $xr0, $xr0, $xr2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x i64>, ptr %a0 + %v1 = load <4 x i64>, ptr %a1 + %v2 = load <4 x i64>, ptr %a2 + %not = xor <4 x i64> %v1, + %add = add <4 x i64> %not, %v2 + %and = and <4 x i64> %v0, %add + store <4 x i64> %and, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll b/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll index 87ee4ad025395..8b12216d0f856 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll @@ -1,27 +1,46 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 -mattr=+32s,+lasx --verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefix=LA32 ; RUN: llc --mtriple=loongarch64 -mattr=+lasx --verify-machineinstrs < %s \ -; RUN: | FileCheck %s +; RUN: | FileCheck %s --check-prefix=LA64 declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>) define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind { -; CHECK-LABEL: test_bitreverse_v32i8: -; CHECK: # %bb.0: -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2 -; CHECK-NEXT: bitrev.8b $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: bitrev.8b $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 -; CHECK-NEXT: bitrev.8b $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 -; CHECK-NEXT: bitrev.8b $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; CHECK-NEXT: xvpermi.q $xr1, $xr2, 2 -; CHECK-NEXT: xvori.b $xr0, $xr1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: test_bitreverse_v32i8: +; LA32: # %bb.0: +; LA32-NEXT: xvslli.b $xr1, $xr0, 4 +; LA32-NEXT: xvsrli.b $xr0, $xr0, 4 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvandi.b $xr1, $xr0, 51 +; LA32-NEXT: xvslli.b $xr1, $xr1, 2 +; LA32-NEXT: xvsrli.b $xr0, $xr0, 2 +; LA32-NEXT: xvandi.b $xr0, $xr0, 51 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvandi.b $xr1, $xr0, 85 +; LA32-NEXT: xvslli.b $xr1, $xr1, 1 +; LA32-NEXT: xvsrli.b $xr0, $xr0, 1 +; LA32-NEXT: xvandi.b $xr0, $xr0, 85 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr1 +; LA32-NEXT: ret +; +; LA64-LABEL: test_bitreverse_v32i8: +; LA64: # %bb.0: +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 2 +; LA64-NEXT: bitrev.8b $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 3 +; LA64-NEXT: bitrev.8b $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 1 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; LA64-NEXT: bitrev.8b $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 1 +; LA64-NEXT: bitrev.8b $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; LA64-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA64-NEXT: xvori.b $xr0, $xr1, 0 +; LA64-NEXT: ret %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ret <32 x i8> %b } @@ -29,23 +48,53 @@ define <32 x i8> 
@test_bitreverse_v32i8(<32 x i8> %a) nounwind { declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { -; CHECK-LABEL: test_bitreverse_v16i16: -; CHECK: # %bb.0: -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1 -; CHECK-NEXT: xvpermi.q $xr2, $xr1, 2 -; CHECK-NEXT: xvshuf4i.h $xr0, $xr2, 27 -; CHECK-NEXT: ret +; LA32-LABEL: test_bitreverse_v16i16: +; LA32: # %bb.0: +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 5 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 4 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 7 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 2 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 6 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 3 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 1 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 0 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 1 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 3 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 2 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 2 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 3 +; LA32-NEXT: xvpermi.q $xr2, $xr1, 2 +; LA32-NEXT: xvshuf4i.h $xr0, $xr2, 27 +; LA32-NEXT: ret +; +; LA64-LABEL: test_bitreverse_v16i16: +; LA64: # %bb.0: +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 2 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 3 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 1 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 1 +; LA64-NEXT: xvpermi.q $xr2, $xr1, 2 +; LA64-NEXT: xvshuf4i.h $xr0, $xr2, 27 +; LA64-NEXT: ret %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ret <16 x i16> %b } @@ -53,23 +102,53 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { -; CHECK-LABEL: test_bitreverse_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1 -; CHECK-NEXT: xvpermi.q $xr2, $xr1, 2 -; CHECK-NEXT: xvshuf4i.w $xr0, $xr2, 177 -; CHECK-NEXT: ret +; LA32-LABEL: test_bitreverse_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 4 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 5 
+; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 1 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 6 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 2 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 7 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr2, $a0, 3 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 0 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 1 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 2 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 2 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 3 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 3 +; LA32-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA32-NEXT: xvori.b $xr0, $xr1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: test_bitreverse_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 2 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 3 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 1 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 1 +; LA64-NEXT: xvpermi.q $xr2, $xr1, 2 +; LA64-NEXT: xvshuf4i.w $xr0, $xr2, 177 +; LA64-NEXT: ret %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ret <8 x i32> %b } @@ -77,23 +156,43 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { -; CHECK-LABEL: test_bitreverse_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; CHECK-NEXT: xvpermi.q $xr1, $xr2, 2 -; CHECK-NEXT: xvori.b $xr0, $xr1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: test_bitreverse_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) +; LA32-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0) +; LA32-NEXT: xvshuf.b $xr0, $xr0, $xr0, $xr1 +; LA32-NEXT: xvslli.b $xr1, $xr0, 4 +; LA32-NEXT: xvsrli.b $xr0, $xr0, 4 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvandi.b $xr1, $xr0, 51 +; LA32-NEXT: xvslli.b $xr1, $xr1, 2 +; LA32-NEXT: xvsrli.b $xr0, $xr0, 2 +; LA32-NEXT: xvandi.b $xr0, $xr0, 51 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvandi.b $xr1, $xr0, 85 +; LA32-NEXT: xvslli.b $xr1, $xr1, 1 +; LA32-NEXT: xvsrli.b $xr0, $xr0, 1 +; LA32-NEXT: xvandi.b $xr0, $xr0, 85 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr1 +; LA32-NEXT: ret +; +; LA64-LABEL: test_bitreverse_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 2 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 3 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr2, $a0, 1 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 1 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; 
LA64-NEXT: xvpermi.q $xr1, $xr2, 2 +; LA64-NEXT: xvori.b $xr0, $xr1, 0 +; LA64-NEXT: ret %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ret <4 x i64> %b } diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll index 7575bc1a9d3d2..d09ef0e2c6ac0 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll @@ -192,11 +192,11 @@ entry: ret void } -define void @buildvector_v2f32_const_splat(ptr %dst) nounwind { -; CHECK-LABEL: buildvector_v2f32_const_splat: +;; Also check buildvector_const_splat_xvldi_1010. +define void @buildvector_v8f32_const_splat(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_v8f32_const_splat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lu12i.w $a1, 260096 -; CHECK-NEXT: xvreplgr2vr.w $xr0, $a1 +; CHECK-NEXT: xvldi $xr0, -1424 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -204,30 +204,112 @@ entry: ret void } +;; Also check buildvector_const_splat_xvldi_1100. define void @buildvector_v4f64_const_splat(ptr %dst) nounwind { -; LA32-LABEL: buildvector_v4f64_const_splat: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI14_0) -; LA32-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI14_0) -; LA32-NEXT: xvst $xr0, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: buildvector_v4f64_const_splat: -; LA64: # %bb.0: # %entry -; LA64-NEXT: lu52i.d $a1, $zero, 1023 -; LA64-NEXT: xvreplgr2vr.d $xr0, $a1 -; LA64-NEXT: xvst $xr0, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: buildvector_v4f64_const_splat: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvldi $xr0, -912 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret entry: store <4 x double> , ptr %dst ret void } +;; imm[11:8] == 4'b0000/4'b0100/4'b1000 can be represented using xvrepli.[whb]. 
+define void @buildvector_const_splat_xvldi_0001(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_const_splat_xvldi_0001: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvldi $xr0, -3837 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <8 x i32> , ptr %dst + ret void +} + +define void @buildvector_const_splat_xvldi_0010(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_const_splat_xvldi_0010: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvldi $xr0, -3583 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <8 x i32> , ptr %dst + ret void +} + +define void @buildvector_const_splat_xvldi_0011(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_const_splat_xvldi_0011: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvldi $xr0, -3327 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <8 x i32> , ptr %dst + ret void +} + +define void @buildvector_const_splat_xvldi_0101(ptr %dst) { +; CHECK-LABEL: buildvector_const_splat_xvldi_0101: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvldi $xr0, -2813 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <16 x i16> , ptr %dst + ret void +} + +define void @buildvector_const_splat_xvldi_0110(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_const_splat_xvldi_0110: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvldi $xr0, -2557 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <8 x i32> , ptr %dst + ret void +} + +define void @buildvector_const_splat_xvldi_0111(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_const_splat_xvldi_0111: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvldi $xr0, -2305 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <8 x i32> , ptr %dst + ret void +} + +define void @buildvector_const_splat_xvldi_1001(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_const_splat_xvldi_1001: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvldi $xr0, -1789 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <8 x i32> , ptr %dst + ret void +} + +define void @buildvector_const_splat_xvldi_1011(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_const_splat_xvldi_1011: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvldi $xr0, -1280 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <8 x float> , ptr %dst + ret void +} + define void @buildvector_v32i8_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v32i8_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI15_0) -; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI15_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI23_0) +; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI23_0) ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -238,8 +320,8 @@ entry: define void @buildvector_v16i16_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v16i16_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI16_0) -; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI16_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI24_0) +; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI24_0) ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -250,8 +332,8 @@ entry: define void @buildvector_v8i32_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v8i32_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) -; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI17_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI25_0) +; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI25_0) ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -262,8 +344,8 @@ entry: define void 
@buildvector_v4i64_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v4i64_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI18_0) -; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI18_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI26_0) +; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI26_0) ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -274,8 +356,8 @@ entry: define void @buildvector_v2f32_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v2f32_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI19_0) -; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI19_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI27_0) +; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI27_0) ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -286,8 +368,8 @@ entry: define void @buildvector_v4f64_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v4f64_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI20_0) -; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI20_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI28_0) +; CHECK-NEXT: xvld $xr0, $a1, %pc_lo12(.LCPI28_0) ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -1511,8 +1593,7 @@ define void @buildvector_v8f32_with_constant(ptr %dst, float %a1, float %a2, flo ; CHECK-NEXT: # kill: def $f2 killed $f2 def $xr2 ; CHECK-NEXT: # kill: def $f1 killed $f1 def $xr1 ; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 -; CHECK-NEXT: lu12i.w $a1, 262144 -; CHECK-NEXT: xvreplgr2vr.w $xr4, $a1 +; CHECK-NEXT: xvldi $xr4, -3264 ; CHECK-NEXT: xvinsve0.w $xr4, $xr0, 1 ; CHECK-NEXT: xvinsve0.w $xr4, $xr1, 2 ; CHECK-NEXT: xvinsve0.w $xr4, $xr2, 5 diff --git a/llvm/test/CodeGen/LoongArch/lasx/extract-binop.ll b/llvm/test/CodeGen/LoongArch/lasx/extract-binop.ll new file mode 100644 index 0000000000000..4986b12199c31 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/extract-binop.ll @@ -0,0 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 + +define i8 @extractelt_add_v32i8(ptr %p) { +; CHECK-LABEL: extractelt_add_v32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvaddi.bu $xr0, $xr0, 13 +; CHECK-NEXT: vpickve2gr.b $a0, $vr0, 2 +; CHECK-NEXT: ret +entry: + %x = load <32 x i8>, ptr %p + %add = add <32 x i8> %x, + %ext = extractelement <32 x i8> %add, i32 2 + ret i8 %ext +} + +define i16 @extractelt_add_v16i16(ptr %p) { +; CHECK-LABEL: extractelt_add_v16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvaddi.hu $xr0, $xr0, 13 +; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 2 +; CHECK-NEXT: ret +entry: + %x = load <16 x i16>, ptr %p + %add = add <16 x i16> %x, + %ext = extractelement <16 x i16> %add, i32 2 + ret i16 %ext +} + +define i32 @extractelt_add_v8i32(ptr %p) { +; LA32-LABEL: extractelt_add_v8i32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: ld.w $a0, $a0, 8 +; LA32-NEXT: addi.w $a0, $a0, 13 +; LA32-NEXT: ret +; +; LA64-LABEL: extractelt_add_v8i32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvaddi.wu $xr0, $xr0, 13 +; LA64-NEXT: xvpickve2gr.w $a0, $xr0, 2 +; LA64-NEXT: ret +entry: + %x = load <8 x i32>, ptr %p + %add = add <8 x i32> %x, + %ext = extractelement <8 x i32> %add, i32 2 + ret i32 %ext +} + +define i64 @extractelt_add_v4i64(ptr %p) { +; 
LA32-LABEL: extractelt_add_v4i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvaddi.du $xr0, $xr0, 12 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 2 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 3 +; LA32-NEXT: ret +; +; LA64-LABEL: extractelt_add_v4i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a0, $a0, 8 +; LA64-NEXT: addi.d $a0, $a0, 12 +; LA64-NEXT: ret +entry: + %x = load <4 x i64>, ptr %p + %add = add <4 x i64> %x, + %ext = extractelement <4 x i64> %add, i32 1 + ret i64 %ext +} + +define float @extractelt_fadd_v8f32(ptr %p) { +; CHECK-LABEL: extractelt_fadd_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fld.s $fa0, $a0, 8 +; CHECK-NEXT: vldi $vr1, -1238 +; CHECK-NEXT: fadd.s $fa0, $fa0, $fa1 +; CHECK-NEXT: ret +entry: + %x = load <8 x float>, ptr %p + %add = fadd <8 x float> %x, + %ext = extractelement <8 x float> %add, i32 2 + ret float %ext +} + +define double @extractelt_fadd_v4f64(ptr %p) { +; CHECK-LABEL: extractelt_fadd_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fld.d $fa0, $a0, 8 +; CHECK-NEXT: vldi $vr1, -984 +; CHECK-NEXT: fadd.d $fa0, $fa0, $fa1 +; CHECK-NEXT: ret +entry: + %x = load <4 x double>, ptr %p + %add = fadd <4 x double> %x, + %ext = extractelement <4 x double> %add, i32 1 + ret double %ext +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll index 7514dafa8000b..d75985b1ac215 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx,-frecipe < %s | FileCheck %s --check-prefixes=FAULT,FAULT-LA32 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx,+frecipe < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx,+frecipe < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx,-frecipe < %s | FileCheck %s --check-prefixes=FAULT,FAULT-LA64 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s --check-prefixes=CHECK,LA64 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s define void @fdiv_v8f32(ptr %res, ptr %a0, ptr %a1) nounwind { ; FAULT-LABEL: fdiv_v8f32: @@ -40,35 +40,19 @@ define void @fdiv_v4f64(ptr %res, ptr %a0, ptr %a1) nounwind { ; FAULT-NEXT: xvst $xr0, $a0, 0 ; FAULT-NEXT: ret ; -; LA32-LABEL: fdiv_v4f64: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a3, %pc_hi20(.LCPI1_0) -; LA32-NEXT: xvld $xr0, $a2, 0 -; LA32-NEXT: xvld $xr1, $a3, %pc_lo12(.LCPI1_0) -; LA32-NEXT: xvld $xr2, $a1, 0 -; LA32-NEXT: xvfrecipe.d $xr3, $xr0 -; LA32-NEXT: xvfmadd.d $xr1, $xr0, $xr3, $xr1 -; LA32-NEXT: xvfnmsub.d $xr1, $xr1, $xr3, $xr3 -; LA32-NEXT: xvfmul.d $xr3, $xr2, $xr1 -; LA32-NEXT: xvfnmsub.d $xr0, $xr0, $xr3, $xr2 -; LA32-NEXT: xvfmadd.d $xr0, $xr1, $xr0, $xr3 -; LA32-NEXT: xvst $xr0, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: fdiv_v4f64: -; LA64: # %bb.0: # %entry -; LA64-NEXT: xvld $xr0, $a2, 0 -; LA64-NEXT: xvld $xr1, $a1, 0 -; LA64-NEXT: lu52i.d $a1, $zero, -1025 -; LA64-NEXT: xvreplgr2vr.d $xr2, $a1 -; LA64-NEXT: xvfrecipe.d $xr3, $xr0 -; LA64-NEXT: xvfmadd.d $xr2, $xr0, $xr3, $xr2 -; LA64-NEXT: xvfnmsub.d $xr2, $xr2, $xr3, $xr3 -; LA64-NEXT: xvfmul.d $xr3, $xr1, $xr2 -; LA64-NEXT: xvfnmsub.d $xr0, $xr0, $xr3, $xr1 -; LA64-NEXT: xvfmadd.d $xr0, $xr2, $xr0, $xr3 -; LA64-NEXT: xvst $xr0, $a0, 0 
-; LA64-NEXT: ret +; CHECK-LABEL: fdiv_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvfrecipe.d $xr2, $xr0 +; CHECK-NEXT: xvldi $xr3, -784 +; CHECK-NEXT: xvfmadd.d $xr3, $xr0, $xr2, $xr3 +; CHECK-NEXT: xvfnmsub.d $xr2, $xr3, $xr2, $xr2 +; CHECK-NEXT: xvfmul.d $xr3, $xr1, $xr2 +; CHECK-NEXT: xvfnmsub.d $xr0, $xr0, $xr3, $xr1 +; CHECK-NEXT: xvfmadd.d $xr0, $xr2, $xr0, $xr3 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret entry: %v0 = load <4 x double>, ptr %a0 %v1 = load <4 x double>, ptr %a1 @@ -90,8 +74,7 @@ define void @one_fdiv_v8f32(ptr %res, ptr %a0) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvfrecipe.s $xr1, $xr0 -; CHECK-NEXT: lu12i.w $a1, -264192 -; CHECK-NEXT: xvreplgr2vr.w $xr2, $a1 +; CHECK-NEXT: xvldi $xr2, -1296 ; CHECK-NEXT: xvfmadd.s $xr0, $xr0, $xr1, $xr2 ; CHECK-NEXT: xvfnmsub.s $xr0, $xr0, $xr1, $xr1 ; CHECK-NEXT: xvst $xr0, $a0, 0 @@ -107,24 +90,22 @@ define void @one_fdiv_v4f64(ptr %res, ptr %a0) nounwind { ; FAULT-LA32-LABEL: one_fdiv_v4f64: ; FAULT-LA32: # %bb.0: # %entry ; FAULT-LA32-NEXT: xvld $xr0, $a1, 0 -; FAULT-LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0) -; FAULT-LA32-NEXT: xvld $xr1, $a1, %pc_lo12(.LCPI3_0) +; FAULT-LA32-NEXT: xvldi $xr1, -912 ; FAULT-LA32-NEXT: xvfdiv.d $xr0, $xr1, $xr0 ; FAULT-LA32-NEXT: xvst $xr0, $a0, 0 ; FAULT-LA32-NEXT: ret ; -; LA32-LABEL: one_fdiv_v4f64: -; LA32: # %bb.0: # %entry -; LA32-NEXT: xvld $xr0, $a1, 0 -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0) -; LA32-NEXT: xvld $xr1, $a1, %pc_lo12(.LCPI3_0) -; LA32-NEXT: xvfrecipe.d $xr2, $xr0 -; LA32-NEXT: xvfnmsub.d $xr3, $xr0, $xr2, $xr1 -; LA32-NEXT: xvfmadd.d $xr2, $xr2, $xr3, $xr2 -; LA32-NEXT: xvfnmsub.d $xr0, $xr0, $xr2, $xr1 -; LA32-NEXT: xvfmadd.d $xr0, $xr2, $xr0, $xr2 -; LA32-NEXT: xvst $xr0, $a0, 0 -; LA32-NEXT: ret +; CHECK-LABEL: one_fdiv_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrecipe.d $xr1, $xr0 +; CHECK-NEXT: xvldi $xr2, -912 +; CHECK-NEXT: xvfnmsub.d $xr3, $xr0, $xr1, $xr2 +; CHECK-NEXT: xvfmadd.d $xr1, $xr1, $xr3, $xr1 +; CHECK-NEXT: xvfnmsub.d $xr0, $xr0, $xr1, $xr2 +; CHECK-NEXT: xvfmadd.d $xr0, $xr1, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret ; ; FAULT-LA64-LABEL: one_fdiv_v4f64: ; FAULT-LA64: # %bb.0: # %entry @@ -132,19 +113,6 @@ define void @one_fdiv_v4f64(ptr %res, ptr %a0) nounwind { ; FAULT-LA64-NEXT: xvfrecip.d $xr0, $xr0 ; FAULT-LA64-NEXT: xvst $xr0, $a0, 0 ; FAULT-LA64-NEXT: ret -; -; LA64-LABEL: one_fdiv_v4f64: -; LA64: # %bb.0: # %entry -; LA64-NEXT: xvld $xr0, $a1, 0 -; LA64-NEXT: xvfrecipe.d $xr1, $xr0 -; LA64-NEXT: lu52i.d $a1, $zero, 1023 -; LA64-NEXT: xvreplgr2vr.d $xr2, $a1 -; LA64-NEXT: xvfnmsub.d $xr3, $xr0, $xr1, $xr2 -; LA64-NEXT: xvfmadd.d $xr1, $xr1, $xr3, $xr1 -; LA64-NEXT: xvfnmsub.d $xr0, $xr0, $xr1, $xr2 -; LA64-NEXT: xvfmadd.d $xr0, $xr1, $xr0, $xr1 -; LA64-NEXT: xvst $xr0, $a0, 0 -; LA64-NEXT: ret entry: %v0 = load <4 x double>, ptr %a0 %div = fdiv fast <4 x double> , %v0 diff --git a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll index 156c829c2dfb6..45b25013c9173 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll @@ -1,97 +1,178 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+lasx < %s | FileCheck %s 
--check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefix=LA64 declare <8 x float> @llvm.powi.v8f32.i32(<8 x float>, i32) define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind { -; CHECK-LABEL: powi_v8f32: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -128 -; CHECK-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill -; CHECK-NEXT: xvst $xr0, $sp, 80 # 32-byte Folded Spill -; CHECK-NEXT: addi.w $fp, $a0, 0 -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 5 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 4 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 -; CHECK-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload -; CHECK-NEXT: vextrins.w $vr0, $vr1, 16 -; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 6 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload -; CHECK-NEXT: vextrins.w $vr1, $vr0, 32 -; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 7 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload -; CHECK-NEXT: vextrins.w $vr1, $vr0, 48 -; CHECK-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 1 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 -; CHECK-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; CHECK-NEXT: vextrins.w $vr0, $vr1, 16 -; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 2 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; CHECK-NEXT: vextrins.w $vr1, $vr0, 32 -; CHECK-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill 
-; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT: # kill: def $f0 killed $f0 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload -; CHECK-NEXT: vextrins.w $vr1, $vr0, 48 -; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 2 -; CHECK-NEXT: xvori.b $xr0, $xr1, 0 -; CHECK-NEXT: ld.d $fp, $sp, 112 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 128 -; CHECK-NEXT: ret +; LA32-LABEL: powi_v8f32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -128 +; LA32-NEXT: st.w $ra, $sp, 124 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 120 # 4-byte Folded Spill +; LA32-NEXT: move $fp, $a0 +; LA32-NEXT: xvst $xr0, $sp, 80 # 32-byte Folded Spill +; LA32-NEXT: xvpickve.w $xr0, $xr0, 5 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA32-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.w $xr0, $xr0, 4 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $xr0 +; LA32-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload +; LA32-NEXT: vextrins.w $vr0, $vr1, 16 +; LA32-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.w $xr0, $xr0, 6 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA32-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload +; LA32-NEXT: vextrins.w $vr1, $vr0, 32 +; LA32-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.w $xr0, $xr0, 7 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA32-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload +; LA32-NEXT: vextrins.w $vr1, $vr0, 48 +; LA32-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.w $xr0, $xr0, 1 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.w $xr0, $xr0, 0 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $xr0 +; LA32-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload +; LA32-NEXT: vextrins.w $vr0, $vr1, 16 +; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.w $xr0, $xr0, 2 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: vextrins.w $vr1, $vr0, 32 +; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte 
Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.w $xr0, $xr0, 3 +; LA32-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powisf2 +; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA32-NEXT: vextrins.w $vr1, $vr0, 48 +; LA32-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload +; LA32-NEXT: xvpermi.q $xr1, $xr0, 2 +; LA32-NEXT: xvori.b $xr0, $xr1, 0 +; LA32-NEXT: ld.w $fp, $sp, 120 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 124 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 128 +; LA32-NEXT: ret +; +; LA64-LABEL: powi_v8f32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -128 +; LA64-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill +; LA64-NEXT: xvst $xr0, $sp, 80 # 32-byte Folded Spill +; LA64-NEXT: addi.w $fp, $a0, 0 +; LA64-NEXT: xvpickve.w $xr0, $xr0, 5 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.w $xr0, $xr0, 4 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $xr0 +; LA64-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload +; LA64-NEXT: vextrins.w $vr0, $vr1, 16 +; LA64-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.w $xr0, $xr0, 6 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload +; LA64-NEXT: vextrins.w $vr1, $vr0, 32 +; LA64-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.w $xr0, $xr0, 7 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: xvld $xr1, $sp, 48 # 32-byte Folded Reload +; LA64-NEXT: vextrins.w $vr1, $vr0, 48 +; LA64-NEXT: xvst $xr1, $sp, 48 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.w $xr0, $xr0, 1 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.w $xr0, $xr0, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $xr0 +; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: vextrins.w $vr0, $vr1, 16 +; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.w $xr0, $xr0, 2 +; 
LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: vextrins.w $vr1, $vr0, 32 +; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.w $xr0, $xr0, 3 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powisf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload +; LA64-NEXT: vextrins.w $vr1, $vr0, 48 +; LA64-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload +; LA64-NEXT: xvpermi.q $xr1, $xr0, 2 +; LA64-NEXT: xvori.b $xr0, $xr1, 0 +; LA64-NEXT: ld.d $fp, $sp, 112 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 128 +; LA64-NEXT: ret entry: %res = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> %va, i32 %b) ret <8 x float> %res @@ -100,53 +181,96 @@ entry: declare <4 x double> @llvm.powi.v4f64.i32(<4 x double>, i32) define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind { -; CHECK-LABEL: powi_v4f64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -112 -; CHECK-NEXT: st.d $ra, $sp, 104 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 96 # 8-byte Folded Spill -; CHECK-NEXT: xvst $xr0, $sp, 64 # 32-byte Folded Spill -; CHECK-NEXT: addi.w $fp, $a0, 0 -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 3 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; CHECK-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 2 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; CHECK-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 1 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; CHECK-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 -; CHECK-NEXT: move $a0, $fp -; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; CHECK-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload -; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 -; CHECK-NEXT: xvld $xr1, $sp, 32 # 32-byte Folded Reload -; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.d $fp, $sp, 96 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 104 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 112 -; CHECK-NEXT: ret +; LA32-LABEL: powi_v4f64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: 
addi.w $sp, $sp, -112 +; LA32-NEXT: st.w $ra, $sp, 108 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 104 # 4-byte Folded Spill +; LA32-NEXT: move $fp, $a0 +; LA32-NEXT: xvst $xr0, $sp, 64 # 32-byte Folded Spill +; LA32-NEXT: xvpickve.d $xr0, $xr0, 3 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA32-NEXT: bl __powidf2 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 +; LA32-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.d $xr0, $xr0, 2 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powidf2 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; LA32-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload +; LA32-NEXT: vextrins.d $vr0, $vr1, 16 +; LA32-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.d $xr0, $xr0, 1 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powidf2 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 +; LA32-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA32-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload +; LA32-NEXT: xvpickve.d $xr0, $xr0, 0 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: bl __powidf2 +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; LA32-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload +; LA32-NEXT: vextrins.d $vr0, $vr1, 16 +; LA32-NEXT: xvld $xr1, $sp, 32 # 32-byte Folded Reload +; LA32-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA32-NEXT: ld.w $fp, $sp, 104 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 108 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 112 +; LA32-NEXT: ret +; +; LA64-LABEL: powi_v4f64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -112 +; LA64-NEXT: st.d $ra, $sp, 104 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 96 # 8-byte Folded Spill +; LA64-NEXT: xvst $xr0, $sp, 64 # 32-byte Folded Spill +; LA64-NEXT: addi.w $fp, $a0, 0 +; LA64-NEXT: xvpickve.d $xr0, $xr0, 3 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powidf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 +; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.d $xr0, $xr0, 2 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powidf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: vextrins.d $vr0, $vr1, 16 +; LA64-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.d $xr0, $xr0, 1 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powidf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA64-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload +; LA64-NEXT: xvpickve.d $xr0, $xr0, 0 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $xr0 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__powidf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def 
$f0_64 killed $f0_64 def $xr0 +; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: vextrins.d $vr0, $vr1, 16 +; LA64-NEXT: xvld $xr1, $sp, 32 # 32-byte Folded Reload +; LA64-NEXT: xvpermi.q $xr0, $xr1, 2 +; LA64-NEXT: ld.d $fp, $sp, 96 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 104 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 112 +; LA64-NEXT: ret entry: %res = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> %va, i32 %b) ret <4 x double> %res diff --git a/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll index 4e475daa8ced3..e696129acb862 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll @@ -63,11 +63,9 @@ define void @one_div_sqrt_v8f32(ptr %res, ptr %a0) nounwind { ; LA32-NEXT: xvfrsqrte.s $xr1, $xr0 ; LA32-NEXT: xvfmul.s $xr1, $xr0, $xr1 ; LA32-NEXT: xvfmul.s $xr0, $xr0, $xr1 -; LA32-NEXT: lu12i.w $a1, -261120 -; LA32-NEXT: xvreplgr2vr.w $xr2, $a1 +; LA32-NEXT: xvldi $xr2, -1400 ; LA32-NEXT: xvfmadd.s $xr0, $xr0, $xr1, $xr2 -; LA32-NEXT: lu12i.w $a1, -266240 -; LA32-NEXT: xvreplgr2vr.w $xr2, $a1 +; LA32-NEXT: xvldi $xr2, -3137 ; LA32-NEXT: xvfmul.s $xr1, $xr1, $xr2 ; LA32-NEXT: xvfmul.s $xr0, $xr1, $xr0 ; LA32-NEXT: xvst $xr0, $sp, 64 @@ -100,11 +98,9 @@ define void @one_div_sqrt_v8f32(ptr %res, ptr %a0) nounwind { ; LA64-NEXT: xvfrsqrte.s $xr1, $xr0 ; LA64-NEXT: xvfmul.s $xr1, $xr0, $xr1 ; LA64-NEXT: xvfmul.s $xr0, $xr0, $xr1 -; LA64-NEXT: lu12i.w $a1, -261120 -; LA64-NEXT: xvreplgr2vr.w $xr2, $a1 +; LA64-NEXT: xvldi $xr2, -1400 ; LA64-NEXT: xvfmadd.s $xr0, $xr0, $xr1, $xr2 -; LA64-NEXT: lu12i.w $a1, -266240 -; LA64-NEXT: xvreplgr2vr.w $xr2, $a1 +; LA64-NEXT: xvldi $xr2, -3137 ; LA64-NEXT: xvfmul.s $xr1, $xr1, $xr2 ; LA64-NEXT: xvfmul.s $xr0, $xr1, $xr0 ; LA64-NEXT: xvst $xr0, $a0, 0 @@ -136,9 +132,8 @@ define void @one_div_sqrt_v4f64(ptr %res, ptr %a0) nounwind { ; FAULT-LA32-NEXT: ld.w $a1, $a1, 0 ; FAULT-LA32-NEXT: st.w $a1, $sp, 32 ; FAULT-LA32-NEXT: xvld $xr0, $sp, 32 -; FAULT-LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0) -; FAULT-LA32-NEXT: xvld $xr1, $a1, %pc_lo12(.LCPI1_0) ; FAULT-LA32-NEXT: xvfsqrt.d $xr0, $xr0 +; FAULT-LA32-NEXT: xvldi $xr1, -912 ; FAULT-LA32-NEXT: xvfdiv.d $xr0, $xr1, $xr0 ; FAULT-LA32-NEXT: xvst $xr0, $sp, 64 ; FAULT-LA32-NEXT: vld $vr0, $sp, 80 @@ -176,18 +171,16 @@ define void @one_div_sqrt_v4f64(ptr %res, ptr %a0) nounwind { ; LA32-NEXT: st.w $a1, $sp, 32 ; LA32-NEXT: xvld $xr0, $sp, 32 ; LA32-NEXT: xvfrsqrte.d $xr1, $xr0 -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0) -; LA32-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI1_0) -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_1) -; LA32-NEXT: xvld $xr3, $a1, %pc_lo12(.LCPI1_1) ; LA32-NEXT: xvfmul.d $xr1, $xr0, $xr1 -; LA32-NEXT: xvfmul.d $xr4, $xr0, $xr1 -; LA32-NEXT: xvfmadd.d $xr4, $xr4, $xr1, $xr2 -; LA32-NEXT: xvfmul.d $xr1, $xr1, $xr3 +; LA32-NEXT: xvfmul.d $xr2, $xr0, $xr1 +; LA32-NEXT: xvldi $xr3, -888 +; LA32-NEXT: xvfmadd.d $xr2, $xr2, $xr1, $xr3 +; LA32-NEXT: xvldi $xr4, -800 ; LA32-NEXT: xvfmul.d $xr1, $xr1, $xr4 +; LA32-NEXT: xvfmul.d $xr1, $xr1, $xr2 ; LA32-NEXT: xvfmul.d $xr0, $xr0, $xr1 -; LA32-NEXT: xvfmadd.d $xr0, $xr0, $xr1, $xr2 -; LA32-NEXT: xvfmul.d $xr1, $xr1, $xr3 +; LA32-NEXT: xvfmadd.d $xr0, $xr0, $xr1, $xr3 +; LA32-NEXT: xvfmul.d $xr1, $xr1, $xr4 ; LA32-NEXT: xvfmul.d $xr0, $xr1, $xr0 ; LA32-NEXT: xvst $xr0, $sp, 64 ; LA32-NEXT: vld $vr0, $sp, 80 @@ -219,13 +212,9 @@ define void 
@one_div_sqrt_v4f64(ptr %res, ptr %a0) nounwind { ; LA64-NEXT: xvfrsqrte.d $xr1, $xr0 ; LA64-NEXT: xvfmul.d $xr1, $xr0, $xr1 ; LA64-NEXT: xvfmul.d $xr2, $xr0, $xr1 -; LA64-NEXT: ori $a1, $zero, 0 -; LA64-NEXT: lu32i.d $a1, -524288 -; LA64-NEXT: lu52i.d $a1, $a1, -1024 -; LA64-NEXT: xvreplgr2vr.d $xr3, $a1 +; LA64-NEXT: xvldi $xr3, -888 ; LA64-NEXT: xvfmadd.d $xr2, $xr2, $xr1, $xr3 -; LA64-NEXT: lu52i.d $a1, $zero, -1026 -; LA64-NEXT: xvreplgr2vr.d $xr4, $a1 +; LA64-NEXT: xvldi $xr4, -800 ; LA64-NEXT: xvfmul.d $xr1, $xr1, $xr4 ; LA64-NEXT: xvfmul.d $xr1, $xr1, $xr2 ; LA64-NEXT: xvfmul.d $xr0, $xr0, $xr1 diff --git a/llvm/test/CodeGen/LoongArch/lasx/fsqrt.ll b/llvm/test/CodeGen/LoongArch/lasx/fsqrt.ll index f8a3284f04dc8..9ae651d612f18 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/fsqrt.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/fsqrt.ll @@ -174,9 +174,8 @@ define void @one_div_sqrt_v4f64(ptr %res, ptr %a0) nounwind { ; LA32-NEXT: ld.w $a1, $a1, 0 ; LA32-NEXT: st.w $a1, $sp, 32 ; LA32-NEXT: xvld $xr0, $sp, 32 -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0) -; LA32-NEXT: xvld $xr1, $a1, %pc_lo12(.LCPI3_0) ; LA32-NEXT: xvfsqrt.d $xr0, $xr0 +; LA32-NEXT: xvldi $xr1, -912 ; LA32-NEXT: xvfdiv.d $xr0, $xr1, $xr0 ; LA32-NEXT: xvst $xr0, $sp, 64 ; LA32-NEXT: vld $vr0, $sp, 80 diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll index 215436823af83..623a6de1bc402 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx,+frecipe < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s declare <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll index ad36c3aa5c29d..743ab10cc9b00 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx,+frecipe < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s declare <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-invalid-imm.ll index a671e9979b2fe..e6688bacd3bf9 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-invalid-imm.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-max-invalid-imm.ll @@ -1,3 +1,4 @@ +; RUN: not llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s 2>&1 | FileCheck %s ; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s declare <32 x i8> @llvm.loongarch.lasx.xvmaxi.b(<32 x i8>, i32) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-invalid-imm.ll index 5ed4104c295fa..cfe9ec575222a 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-invalid-imm.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-min-invalid-imm.ll @@ -1,3 +1,4 @@ +; RUN: not llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s 2>&1 | FileCheck %s ; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s declare <32 x 
i8> @llvm.loongarch.lasx.xvmini.b(<32 x i8>, i32) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-d-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-d-invalid-imm.ll new file mode 100644 index 0000000000000..5a5af4356f714 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-d-invalid-imm.ll @@ -0,0 +1,33 @@ +; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s + +declare i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64>, i32) + +define i64 @lasx_xvpickve2gr_d_lo(<4 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lasx.xvpickve2gr.d: argument out of range +entry: + %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 -1) + ret i64 %res +} + +define i64 @lasx_xvpickve2gr_d_hi(<4 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lasx.xvpickve2gr.d: argument out of range +entry: + %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 4) + ret i64 %res +} + +declare i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64>, i32) + +define i64 @lasx_xvpickve2gr_du_lo(<4 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lasx.xvpickve2gr.du: argument out of range +entry: + %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 -1) + ret i64 %res +} + +define i64 @lasx_xvpickve2gr_du_hi(<4 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lasx.xvpickve2gr.du: argument out of range +entry: + %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 4) + ret i64 %res +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-d.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-d.ll new file mode 100644 index 0000000000000..178dd92cbdb80 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-d.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +declare i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64>, i32) + +define i64 @lasx_xvpickve2gr_d(<4 x i64> %va) nounwind { +; CHECK-LABEL: lasx_xvpickve2gr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 +; CHECK-NEXT: ret +entry: + %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 1) + ret i64 %res +} + +declare i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64>, i32) + +define i64 @lasx_xvpickve2gr_du(<4 x i64> %va) nounwind { +; CHECK-LABEL: lasx_xvpickve2gr_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpickve2gr.du $a0, $xr0, 1 +; CHECK-NEXT: ret +entry: + %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 1) + ret i64 %res +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-invalid-imm.ll index 93056b272dfc5..0c91b56387f79 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-invalid-imm.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr-invalid-imm.ll @@ -1,3 +1,4 @@ +; RUN: not llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s 2>&1 | FileCheck %s ; RUN: not llc --mtriple=loongarch64 --mattr=+lasx < %s 2>&1 | FileCheck %s declare i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32>, i32) @@ -16,22 +17,6 @@ entry: ret i32 %res } -declare i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64>, i32) - -define i64 @lasx_xvpickve2gr_d_lo(<4 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lasx.xvpickve2gr.d: argument out of range -entry: - %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 -1) - ret i64 
%res -} - -define i64 @lasx_xvpickve2gr_d_hi(<4 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lasx.xvpickve2gr.d: argument out of range -entry: - %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 4) - ret i64 %res -} - declare i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32>, i32) define i32 @lasx_xvpickve2gr_wu_lo(<8 x i32> %va) nounwind { @@ -47,19 +32,3 @@ entry: %res = call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> %va, i32 8) ret i32 %res } - -declare i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64>, i32) - -define i64 @lasx_xvpickve2gr_du_lo(<4 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lasx.xvpickve2gr.du: argument out of range -entry: - %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 -1) - ret i64 %res -} - -define i64 @lasx_xvpickve2gr_du_hi(<4 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lasx.xvpickve2gr.du: argument out of range -entry: - %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 4) - ret i64 %res -} diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr.ll index 0617e7424321b..a6f19ce0c0140 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-pickve2gr.ll @@ -1,9 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s - - - declare i32 @llvm.loongarch.lasx.xvpickve2gr.w(<8 x i32>, i32) define i32 @lasx_xvpickve2gr_w(<8 x i32> %va) nounwind { @@ -16,18 +14,6 @@ entry: ret i32 %res } -declare i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64>, i32) - -define i64 @lasx_xvpickve2gr_d(<4 x i64> %va) nounwind { -; CHECK-LABEL: lasx_xvpickve2gr_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 -; CHECK-NEXT: ret -entry: - %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.d(<4 x i64> %va, i32 1) - ret i64 %res -} - declare i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32>, i32) define i32 @lasx_xvpickve2gr_wu(<8 x i32> %va) nounwind { @@ -39,15 +25,3 @@ entry: %res = call i32 @llvm.loongarch.lasx.xvpickve2gr.wu(<8 x i32> %va, i32 1) ret i32 %res } - -declare i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64>, i32) - -define i64 @lasx_xvpickve2gr_du(<4 x i64> %va) nounwind { -; CHECK-LABEL: lasx_xvpickve2gr_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvpickve2gr.du $a0, $xr0, 1 -; CHECK-NEXT: ret -entry: - %res = call i64 @llvm.loongarch.lasx.xvpickve2gr.du(<4 x i64> %va, i32 1) - ret i64 %res -} diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl-ins-gr2vr-d.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl-ins-gr2vr-d.ll new file mode 100644 index 0000000000000..79ec7b51f6278 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl-ins-gr2vr-d.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define <4 x i64> @xvrepl_ins_d(i64 %a, i64 %b) { +; CHECK-LABEL: xvrepl_ins_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvreplgr2vr.d $xr0, $a0 +; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 1 +; CHECK-NEXT: ret +entry: + %0 = call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 %a) + %1 = call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> %0, i64 %b, i32 1) + ret <4 x i64> %1 +} + +declare <4 x i64> 
@llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64>, i64, i32 immarg) +declare <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl-ins-gr2vr.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl-ins-gr2vr.ll index 2e538ed66b250..31b809e016564 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl-ins-gr2vr.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-repl-ins-gr2vr.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s define <8 x i32> @xvrepl_ins_w(i32 %a, i32 %b) { @@ -13,19 +14,5 @@ entry: ret <8 x i32> %1 } -define <4 x i64> @xvrepl_ins_d(i64 %a, i64 %b) { -; CHECK-LABEL: xvrepl_ins_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvreplgr2vr.d $xr0, $a0 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 1 -; CHECK-NEXT: ret -entry: - %0 = call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 %a) - %1 = call <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64> %0, i64 %b, i32 1) - ret <4 x i64> %1 -} - declare <8 x i32> @llvm.loongarch.lasx.xvinsgr2vr.w(<8 x i32>, i32, i32 immarg) declare <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32) -declare <4 x i64> @llvm.loongarch.lasx.xvinsgr2vr.d(<4 x i64>, i64, i32 immarg) -declare <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr-d.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr-d.ll new file mode 100644 index 0000000000000..61bc89249d97e --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr-d.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +declare <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64) + +define <4 x i64> @lasx_xvreplgr2vr_d(i64 %a) nounwind { +; CHECK-LABEL: lasx_xvreplgr2vr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvreplgr2vr.d $xr0, $a0 +; CHECK-NEXT: ret +entry: + %res = call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 %a) + ret <4 x i64> %res +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr.ll index c71abd2205c67..a3c0e261e7122 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-replgr2vr.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s declare <32 x i8> @llvm.loongarch.lasx.xvreplgr2vr.b(i32) @@ -36,15 +37,3 @@ entry: %res = call <8 x i32> @llvm.loongarch.lasx.xvreplgr2vr.w(i32 %a) ret <8 x i32> %res } - -declare <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64) - -define <4 x i64> @lasx_xvreplgr2vr_d(i64 %a) nounwind { -; CHECK-LABEL: lasx_xvreplgr2vr_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvreplgr2vr.d $xr0, $a0 -; CHECK-NEXT: ret -entry: - %res = call <4 x i64> @llvm.loongarch.lasx.xvreplgr2vr.d(i64 %a) - ret <4 x i64> %res -} diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-set.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-set.ll index 6e3e2e0330f52..5e234e4bd8210 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-set.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-set.ll @@ -1,4 +1,5 @@ ; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s declare i32 @llvm.loongarch.lasx.xbz.v(<32 x i8>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setallnez.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setallnez.ll index a466b78bf8d2d..38e3289ef4cba 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setallnez.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setallnez.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s declare i32 @llvm.loongarch.lasx.xbnz.b(<32 x i8>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setanyeqz.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setanyeqz.ll index 36e65fc5b3281..f6917cffb36b5 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setanyeqz.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-setanyeqz.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s declare i32 @llvm.loongarch.lasx.xbz.b(<32 x i8>) diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/adda.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/adda.ll new file mode 100644 index 0000000000000..98687755fcfb4 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/adda.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @vadda_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vadda_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadda.b $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <32 x i8>, ptr %a + %vb = load <32 x i8>, ptr %b + %conda = icmp slt <32 x i8> %va, zeroinitializer + %nega = sub <32 x i8> zeroinitializer, %va + %absa = select <32 x i1> %conda, <32 x i8> %nega, <32 x i8> %va + %condb = icmp slt <32 x i8> %vb, zeroinitializer + %negb = sub <32 x i8> zeroinitializer, %vb + %absb = select <32 x i1> %condb, <32 x i8> %negb, <32 x i8> %vb + %add = add <32 x i8> %absa, %absb + store <32 x i8> %add, ptr %res + ret void +} + +define void @vadda_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vadda_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadda.h $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i16>, ptr %a + %vb = load <16 x i16>, ptr %b + %conda = icmp slt <16 x i16> %va, zeroinitializer + %nega = sub <16 x i16> zeroinitializer, %va + %absa = select <16 x i1> %conda, <16 x i16> %nega, <16 x i16> %va + %condb = icmp slt <16 x i16> %vb, zeroinitializer + %negb = sub <16 x i16> zeroinitializer, %vb + %absb = select <16 x i1> %condb, <16 x i16> %negb, <16 x i16> %vb + %add = add <16 x i16> %absa, %absb + store <16 x i16> %add, ptr %res + ret void +} + +define void @vadda_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vadda_w: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadda.w $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i32>, ptr %a + %vb = load <8 x i32>, ptr %b + %conda = icmp slt <8 x i32> %va, zeroinitializer + %nega = sub <8 x i32> zeroinitializer, %va + %absa = select <8 x i1> %conda, <8 x i32> %nega, <8 x i32> %va + %condb = icmp slt <8 x i32> %vb, zeroinitializer + %negb = sub <8 x i32> zeroinitializer, %vb + %absb = select <8 x i1> %condb, <8 x i32> %negb, <8 x i32> %vb + %add = add <8 x i32> %absa, %absb + store <8 x i32> %add, ptr %res + ret void +} + +define void @vadda_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vadda_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvadda.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i64>, ptr %a + %vb = load <4 x i64>, ptr %b + %conda = icmp slt <4 x i64> %va, zeroinitializer + %nega = sub <4 x i64> zeroinitializer, %va + %absa = select <4 x i1> %conda, <4 x i64> %nega, <4 x i64> %va + %condb = icmp slt <4 x i64> %vb, zeroinitializer + %negb = sub <4 x i64> zeroinitializer, %vb + %absb = select <4 x i1> %condb, <4 x i64> %negb, <4 x i64> %vb + %add = add <4 x i64> %absa, %absb + store <4 x i64> %add, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll index cf0496fb8fb89..60b51755681a4 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll @@ -3,18 +3,11 @@ ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @extract_32xi8(ptr %src, ptr %dst) nounwind { -; LA32-LABEL: extract_32xi8: -; LA32: # %bb.0: -; LA32-NEXT: xvld $xr0, $a0, 0 -; LA32-NEXT: vpickve2gr.b $a0, $vr0, 1 -; LA32-NEXT: st.b $a0, $a1, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: extract_32xi8: -; LA64: # %bb.0: -; LA64-NEXT: xvld $xr0, $a0, 0 -; LA64-NEXT: xvstelm.b $xr0, $a1, 0, 1 -; LA64-NEXT: ret +; CHECK-LABEL: extract_32xi8: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 1 +; CHECK-NEXT: ret %v = load volatile <32 x i8>, ptr %src %e = extractelement <32 x i8> %v, i32 1 store i8 %e, ptr %dst @@ -22,18 +15,11 @@ define void @extract_32xi8(ptr %src, ptr %dst) nounwind { } define void @extract_16xi16(ptr %src, ptr %dst) nounwind { -; LA32-LABEL: extract_16xi16: -; LA32: # %bb.0: -; LA32-NEXT: xvld $xr0, $a0, 0 -; LA32-NEXT: vpickve2gr.h $a0, $vr0, 1 -; LA32-NEXT: st.h $a0, $a1, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: extract_16xi16: -; LA64: # %bb.0: -; LA64-NEXT: xvld $xr0, $a0, 0 -; LA64-NEXT: xvstelm.h $xr0, $a1, 0, 1 -; LA64-NEXT: ret +; CHECK-LABEL: extract_16xi16: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a0, 0 +; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 1 +; CHECK-NEXT: ret %v = load volatile <16 x i16>, ptr %src %e = extractelement <16 x i16> %v, i32 1 store i16 %e, ptr %dst @@ -111,8 +97,7 @@ define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; LA32-NEXT: movgr2fr.w $fa1, $a2 ; LA32-NEXT: xvpermi.q $xr2, $xr0, 1 ; LA32-NEXT: xvshuf.b $xr0, $xr2, $xr0, $xr1 -; LA32-NEXT: vpickve2gr.b $a0, $vr0, 0 -; LA32-NEXT: st.b $a0, $a1, 0 +; LA32-NEXT: xvstelm.b $xr0, $a1, 0, 0 ; LA32-NEXT: ret ; ; LA64-LABEL: extract_32xi8_idx: @@ -136,8 +121,7 @@ define void 
@extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; LA32-NEXT: movgr2fr.w $fa1, $a2 ; LA32-NEXT: xvpermi.q $xr2, $xr0, 1 ; LA32-NEXT: xvshuf.h $xr1, $xr2, $xr0 -; LA32-NEXT: vpickve2gr.h $a0, $vr1, 0 -; LA32-NEXT: st.h $a0, $a1, 0 +; LA32-NEXT: xvstelm.h $xr1, $a1, 0, 0 ; LA32-NEXT: ret ; ; LA64-LABEL: extract_16xi16_idx: diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll index ae6f091ddb498..aefaa0efb079c 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll @@ -53,8 +53,7 @@ define void @one_fdiv_v4f64(ptr %res, ptr %a0) nounwind { ; LA32-LABEL: one_fdiv_v4f64: ; LA32: # %bb.0: # %entry ; LA32-NEXT: xvld $xr0, $a1, 0 -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0) -; LA32-NEXT: xvld $xr1, $a1, %pc_lo12(.LCPI3_0) +; LA32-NEXT: xvldi $xr1, -912 ; LA32-NEXT: xvfdiv.d $xr0, $xr1, $xr0 ; LA32-NEXT: xvst $xr0, $a0, 0 ; LA32-NEXT: ret diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll index 765473ce166df..0b8015ddbdd4a 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll @@ -7,13 +7,12 @@ define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: shufflevector_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvpickve.d $xr2, $xr1, 3 -; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78 -; CHECK-NEXT: xvrepl128vei.d $xr3, $xr3, 1 -; CHECK-NEXT: vextrins.d $vr3, $vr2, 16 +; CHECK-NEXT: xvpermi.d $xr2, $xr0, 3 +; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3 +; CHECK-NEXT: vextrins.d $vr2, $vr3, 16 ; CHECK-NEXT: xvpickve.d $xr1, $xr1, 2 ; CHECK-NEXT: vextrins.d $vr0, $vr1, 16 -; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2 +; CHECK-NEXT: xvpermi.q $xr0, $xr2, 2 ; CHECK-NEXT: ret entry: %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll index ca405314686e6..af1598f69569e 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind { ; CHECK-LABEL: insert_extract_v32i8: @@ -68,11 +69,19 @@ entry: } define <4 x i64> @insert_extract_v4i64(<4 x i64> %a) nounwind { -; CHECK-LABEL: insert_extract_v4i64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvpickve.d $xr1, $xr0, 3 -; CHECK-NEXT: xvinsve0.d $xr0, $xr1, 1 -; CHECK-NEXT: ret +; LA32-LABEL: insert_extract_v4i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvpickve.w $xr1, $xr0, 6 +; LA32-NEXT: xvpickve.w $xr2, $xr0, 7 +; LA32-NEXT: xvinsve0.w $xr0, $xr1, 2 +; LA32-NEXT: xvinsve0.w $xr0, $xr2, 3 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_extract_v4i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvpickve.d $xr1, $xr0, 3 +; LA64-NEXT: xvinsve0.d $xr0, $xr1, 1 +; LA64-NEXT: ret entry: %b = extractelement <4 x i64> 
%a, i32 3 %c = insertelement <4 x i64> %a, i64 %b, i32 1 @@ -80,10 +89,17 @@ entry: } define <4 x i64> @insert_extract0_v4i64(<4 x i64> %a) nounwind { -; CHECK-LABEL: insert_extract0_v4i64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvinsve0.d $xr0, $xr0, 1 -; CHECK-NEXT: ret +; LA32-LABEL: insert_extract0_v4i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvpickve.w $xr1, $xr0, 1 +; LA32-NEXT: xvinsve0.w $xr0, $xr0, 2 +; LA32-NEXT: xvinsve0.w $xr0, $xr1, 3 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_extract0_v4i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvinsve0.d $xr0, $xr0, 1 +; LA64-NEXT: ret entry: %b = extractelement <4 x i64> %a, i32 0 %c = insertelement <4 x i64> %a, i64 %b, i32 1 diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll index 4e173c4feadba..c5d20003742e5 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind { ; CHECK-LABEL: insert_extract_v32i8: @@ -54,10 +55,22 @@ entry: } define <4 x i64> @insert_extract_v4i64(<4 x i64> %a) nounwind { -; CHECK-LABEL: insert_extract_v4i64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvextrins.d $xr0, $xr0, 1 -; CHECK-NEXT: ret +; LA32-LABEL: insert_extract_v4i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvpickve.w $xr1, $xr0, 2 +; LA32-NEXT: xvpickve.w $xr2, $xr0, 3 +; LA32-NEXT: xvpickve.w $xr3, $xr0, 6 +; LA32-NEXT: xvpickve.w $xr4, $xr0, 7 +; LA32-NEXT: xvinsve0.w $xr0, $xr1, 0 +; LA32-NEXT: xvinsve0.w $xr0, $xr2, 1 +; LA32-NEXT: xvinsve0.w $xr0, $xr3, 4 +; LA32-NEXT: xvinsve0.w $xr0, $xr4, 5 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_extract_v4i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvextrins.d $xr0, $xr0, 1 +; LA64-NEXT: ret entry: %b_lo = extractelement <4 x i64> %a, i32 1 %b_hi = extractelement <4 x i64> %a, i32 3 diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll index aa29264924df9..2f1db43e68fef 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @insert_32xi8(ptr %src, ptr %dst, i8 %in) nounwind { ; CHECK-LABEL: insert_32xi8: @@ -121,12 +122,20 @@ define void @insert_8xi32(ptr %src, ptr %dst, i32 %in) nounwind { } define void @insert_4xi64(ptr %src, ptr %dst, i64 %in) nounwind { -; CHECK-LABEL: insert_4xi64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a2, 1 -; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: 
insert_4xi64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvinsgr2vr.w $xr0, $a2, 2 +; LA32-NEXT: xvinsgr2vr.w $xr0, $a3, 3 +; LA32-NEXT: xvst $xr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_4xi64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvinsgr2vr.d $xr0, $a2, 1 +; LA64-NEXT: xvst $xr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <4 x i64>, ptr %src %v_new = insertelement <4 x i64> %v, i64 %in, i32 1 store <4 x i64> %v_new, ptr %dst @@ -162,18 +171,30 @@ define void @insert_4xdouble(ptr %src, ptr %dst, double %in) nounwind { } define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind { -; CHECK-LABEL: insert_32xi8_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, %pc_hi20(.LCPI12_0) -; CHECK-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI12_0) -; CHECK-NEXT: xvld $xr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; CHECK-NEXT: xvreplgr2vr.b $xr2, $a0 -; CHECK-NEXT: xvseq.b $xr0, $xr2, $xr0 -; CHECK-NEXT: xvreplgr2vr.b $xr2, $a2 -; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 -; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_32xi8_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a4, %pc_hi20(.LCPI12_0) +; LA32-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI12_0) +; LA32-NEXT: xvld $xr1, $a0, 0 +; LA32-NEXT: xvreplgr2vr.b $xr2, $a3 +; LA32-NEXT: xvseq.b $xr0, $xr2, $xr0 +; LA32-NEXT: xvreplgr2vr.b $xr2, $a2 +; LA32-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA32-NEXT: xvst $xr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_32xi8_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI12_0) +; LA64-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI12_0) +; LA64-NEXT: xvld $xr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: xvreplgr2vr.b $xr2, $a0 +; LA64-NEXT: xvseq.b $xr0, $xr2, $xr0 +; LA64-NEXT: xvreplgr2vr.b $xr2, $a2 +; LA64-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA64-NEXT: xvst $xr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <32 x i8>, ptr %src %v_new = insertelement <32 x i8> %v, i8 %in, i32 %idx store <32 x i8> %v_new, ptr %dst @@ -181,18 +202,30 @@ define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind { } define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind { -; CHECK-LABEL: insert_16xi16_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, %pc_hi20(.LCPI13_0) -; CHECK-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI13_0) -; CHECK-NEXT: xvld $xr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; CHECK-NEXT: xvreplgr2vr.h $xr2, $a0 -; CHECK-NEXT: xvseq.h $xr0, $xr2, $xr0 -; CHECK-NEXT: xvreplgr2vr.h $xr2, $a2 -; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 -; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_16xi16_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a4, %pc_hi20(.LCPI13_0) +; LA32-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI13_0) +; LA32-NEXT: xvld $xr1, $a0, 0 +; LA32-NEXT: xvreplgr2vr.h $xr2, $a3 +; LA32-NEXT: xvseq.h $xr0, $xr2, $xr0 +; LA32-NEXT: xvreplgr2vr.h $xr2, $a2 +; LA32-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA32-NEXT: xvst $xr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_16xi16_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI13_0) +; LA64-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI13_0) +; LA64-NEXT: xvld $xr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: xvreplgr2vr.h $xr2, $a0 +; LA64-NEXT: xvseq.h $xr0, $xr2, $xr0 +; LA64-NEXT: xvreplgr2vr.h $xr2, $a2 +; LA64-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA64-NEXT: xvst $xr0, $a1, 0 +; LA64-NEXT: ret %v = load 
volatile <16 x i16>, ptr %src %v_new = insertelement <16 x i16> %v, i16 %in, i32 %idx store <16 x i16> %v_new, ptr %dst @@ -200,18 +233,30 @@ define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind { } define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind { -; CHECK-LABEL: insert_8xi32_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, %pc_hi20(.LCPI14_0) -; CHECK-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI14_0) -; CHECK-NEXT: xvld $xr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; CHECK-NEXT: xvreplgr2vr.w $xr2, $a0 -; CHECK-NEXT: xvseq.w $xr0, $xr2, $xr0 -; CHECK-NEXT: xvreplgr2vr.w $xr2, $a2 -; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 -; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_8xi32_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a4, %pc_hi20(.LCPI14_0) +; LA32-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI14_0) +; LA32-NEXT: xvld $xr1, $a0, 0 +; LA32-NEXT: xvreplgr2vr.w $xr2, $a3 +; LA32-NEXT: xvseq.w $xr0, $xr2, $xr0 +; LA32-NEXT: xvreplgr2vr.w $xr2, $a2 +; LA32-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA32-NEXT: xvst $xr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_8xi32_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI14_0) +; LA64-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI14_0) +; LA64-NEXT: xvld $xr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: xvreplgr2vr.w $xr2, $a0 +; LA64-NEXT: xvseq.w $xr0, $xr2, $xr0 +; LA64-NEXT: xvreplgr2vr.w $xr2, $a2 +; LA64-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA64-NEXT: xvst $xr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <8 x i32>, ptr %src %v_new = insertelement <8 x i32> %v, i32 %in, i32 %idx store <8 x i32> %v_new, ptr %dst @@ -219,18 +264,36 @@ define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind { } define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind { -; CHECK-LABEL: insert_4xi64_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, %pc_hi20(.LCPI15_0) -; CHECK-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI15_0) -; CHECK-NEXT: xvld $xr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; CHECK-NEXT: xvreplgr2vr.d $xr2, $a0 -; CHECK-NEXT: xvseq.d $xr0, $xr2, $xr0 -; CHECK-NEXT: xvreplgr2vr.d $xr2, $a2 -; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 -; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_4xi64_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a5, %pc_hi20(.LCPI15_0) +; LA32-NEXT: xvld $xr0, $a5, %pc_lo12(.LCPI15_0) +; LA32-NEXT: add.w $a4, $a4, $a4 +; LA32-NEXT: xvld $xr1, $a0, 0 +; LA32-NEXT: xvreplgr2vr.w $xr2, $a4 +; LA32-NEXT: xvseq.w $xr2, $xr2, $xr0 +; LA32-NEXT: xvreplgr2vr.w $xr3, $a2 +; LA32-NEXT: xvbitsel.v $xr1, $xr1, $xr3, $xr2 +; LA32-NEXT: addi.w $a0, $a4, 1 +; LA32-NEXT: xvreplgr2vr.w $xr2, $a0 +; LA32-NEXT: xvseq.w $xr0, $xr2, $xr0 +; LA32-NEXT: xvreplgr2vr.w $xr2, $a3 +; LA32-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA32-NEXT: xvst $xr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_4xi64_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI15_0) +; LA64-NEXT: xvld $xr0, $a4, %pc_lo12(.LCPI15_0) +; LA64-NEXT: xvld $xr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: xvreplgr2vr.d $xr2, $a0 +; LA64-NEXT: xvseq.d $xr0, $xr2, $xr0 +; LA64-NEXT: xvreplgr2vr.d $xr2, $a2 +; LA64-NEXT: xvbitsel.v $xr0, $xr1, $xr2, $xr0 +; LA64-NEXT: xvst $xr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <4 x i64>, ptr %src %v_new = insertelement <4 x i64> %v, i64 %in, i32 %idx store <4 x i64> %v_new, ptr %dst @@ -238,19 
+301,32 @@ define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind { } define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwind { -; CHECK-LABEL: insert_8xfloat_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $f0 killed $f0 def $xr0 -; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI16_0) -; CHECK-NEXT: xvld $xr1, $a3, %pc_lo12(.LCPI16_0) -; CHECK-NEXT: xvld $xr2, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0 -; CHECK-NEXT: xvreplgr2vr.w $xr3, $a0 -; CHECK-NEXT: xvseq.w $xr1, $xr3, $xr1 -; CHECK-NEXT: xvreplve0.w $xr0, $xr0 -; CHECK-NEXT: xvbitsel.v $xr0, $xr2, $xr0, $xr1 -; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_8xfloat_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a3, %pc_hi20(.LCPI16_0) +; LA32-NEXT: xvld $xr1, $a3, %pc_lo12(.LCPI16_0) +; LA32-NEXT: # kill: def $f0 killed $f0 def $xr0 +; LA32-NEXT: xvld $xr2, $a0, 0 +; LA32-NEXT: xvreplgr2vr.w $xr3, $a2 +; LA32-NEXT: xvseq.w $xr1, $xr3, $xr1 +; LA32-NEXT: xvreplve0.w $xr0, $xr0 +; LA32-NEXT: xvbitsel.v $xr0, $xr2, $xr0, $xr1 +; LA32-NEXT: xvst $xr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_8xfloat_idx: +; LA64: # %bb.0: +; LA64-NEXT: # kill: def $f0 killed $f0 def $xr0 +; LA64-NEXT: pcalau12i $a3, %pc_hi20(.LCPI16_0) +; LA64-NEXT: xvld $xr1, $a3, %pc_lo12(.LCPI16_0) +; LA64-NEXT: xvld $xr2, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a2, 31, 0 +; LA64-NEXT: xvreplgr2vr.w $xr3, $a0 +; LA64-NEXT: xvseq.w $xr1, $xr3, $xr1 +; LA64-NEXT: xvreplve0.w $xr0, $xr0 +; LA64-NEXT: xvbitsel.v $xr0, $xr2, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <8 x float>, ptr %src %v_new = insertelement <8 x float> %v, float %in, i32 %idx store <8 x float> %v_new, ptr %dst @@ -258,19 +334,36 @@ define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwin } define void @insert_4xdouble_idx(ptr %src, ptr %dst, double %in, i32 %idx) nounwind { -; CHECK-LABEL: insert_4xdouble_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 -; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI17_0) -; CHECK-NEXT: xvld $xr1, $a3, %pc_lo12(.LCPI17_0) -; CHECK-NEXT: xvld $xr2, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0 -; CHECK-NEXT: xvreplgr2vr.d $xr3, $a0 -; CHECK-NEXT: xvseq.d $xr1, $xr3, $xr1 -; CHECK-NEXT: xvreplve0.d $xr0, $xr0 -; CHECK-NEXT: xvbitsel.v $xr0, $xr2, $xr0, $xr1 -; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_4xdouble_idx: +; LA32: # %bb.0: +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; LA32-NEXT: xvld $xr1, $a0, 0 +; LA32-NEXT: xvrepli.b $xr2, 0 +; LA32-NEXT: xvinsgr2vr.w $xr2, $a2, 0 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI17_0) +; LA32-NEXT: xvld $xr3, $a0, %pc_lo12(.LCPI17_0) +; LA32-NEXT: xvinsgr2vr.w $xr2, $a2, 2 +; LA32-NEXT: xvinsgr2vr.w $xr2, $a2, 4 +; LA32-NEXT: xvinsgr2vr.w $xr2, $a2, 6 +; LA32-NEXT: xvseq.d $xr2, $xr2, $xr3 +; LA32-NEXT: xvreplve0.d $xr0, $xr0 +; LA32-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2 +; LA32-NEXT: xvst $xr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_4xdouble_idx: +; LA64: # %bb.0: +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $xr0 +; LA64-NEXT: pcalau12i $a3, %pc_hi20(.LCPI17_0) +; LA64-NEXT: xvld $xr1, $a3, %pc_lo12(.LCPI17_0) +; LA64-NEXT: xvld $xr2, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a2, 31, 0 +; LA64-NEXT: xvreplgr2vr.d $xr3, $a0 +; LA64-NEXT: xvseq.d $xr1, $xr3, $xr1 +; LA64-NEXT: xvreplve0.d $xr0, $xr0 +; LA64-NEXT: xvbitsel.v $xr0, $xr2, $xr0, $xr1 +; LA64-NEXT: xvst $xr0, $a1, 0 +; LA64-NEXT: ret %v = 
load volatile <4 x double>, ptr %src %v_new = insertelement <4 x double> %v, double %in, i32 %idx store <4 x double> %v_new, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll index dce1e4b777e29..9afe16d029fb2 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvrepl128vei.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ;; xvrepl128vei.b @@ -12,6 +13,17 @@ define <32 x i8> @shufflevector_v32i8(<32 x i8> %a, <32 x i8> %b) { ret <32 x i8> %c } +;; xvrepl128vei.b +define <32 x i8> @shufflevector_v32i8_undef(<32 x i8> %a) { +; CHECK-LABEL: shufflevector_v32i8_undef: +; CHECK: # %bb.0: +; CHECK-NEXT: xvrepl128vei.b $xr0, $xr0, 1 +; CHECK-NEXT: ret + %c = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> + ret <32 x i8> %c +} + ;; xvrepl128vei.h define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK-LABEL: shufflevector_v16i16: @@ -23,6 +35,17 @@ define <16 x i16> @shufflevector_v16i16(<16 x i16> %a, <16 x i16> %b) { ret <16 x i16> %c } +;; xvrepl128vei.h +define <16 x i16> @shufflevector_v16i16_undef(<16 x i16> %a) { +; CHECK-LABEL: shufflevector_v16i16_undef: +; CHECK: # %bb.0: +; CHECK-NEXT: xvrepl128vei.h $xr0, $xr0, 3 +; CHECK-NEXT: ret + %c = shufflevector <16 x i16> %a, <16 x i16> poison, <16 x i32> + ret <16 x i16> %c +} + ;; xvrepl128vei.w define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK-LABEL: shufflevector_v8i32: @@ -34,6 +57,16 @@ define <8 x i32> @shufflevector_v8i32(<8 x i32> %a, <8 x i32> %b) { ret <8 x i32> %c } +;; xvrepl128vei.w +define <8 x i32> @shufflevector_v8i32_undef(<8 x i32> %a) { +; CHECK-LABEL: shufflevector_v8i32_undef: +; CHECK: # %bb.0: +; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 2 +; CHECK-NEXT: ret + %c = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> + ret <8 x i32> %c +} + ;; xvrepl128vei.d define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) { ; CHECK-LABEL: shufflevector_v4i64: @@ -44,6 +77,16 @@ define <4 x i64> @shufflevector_v4i64(<4 x i64> %a, <4 x i64> %b) { ret <4 x i64> %c } +;; xvrepl128vei.d +define <4 x i64> @shufflevector_v4i64_undef(<4 x i64> %a) { +; CHECK-LABEL: shufflevector_v4i64_undef: +; CHECK: # %bb.0: +; CHECK-NEXT: xvrepl128vei.d $xr0, $xr0, 1 +; CHECK-NEXT: ret + %c = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> + ret <4 x i64> %c +} + ;; xvrepl128vei.w define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-LABEL: shufflevector_v8f32: @@ -54,6 +97,17 @@ define <8 x float> @shufflevector_v8f32(<8 x float> %a, <8 x float> %b) { ret <8 x float> %c } +;; xvrepl128vei.w +define <8 x float> @shufflevector_v8f32_undef(<8 x float> %a) { +; CHECK-LABEL: shufflevector_v8f32_undef: +; CHECK: # %bb.0: +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 238 +; CHECK-NEXT: xvrepl128vei.w $xr0, $xr0, 1 +; CHECK-NEXT: ret + %c = shufflevector <8 x float> %a, <8 x float> poison, <8 x i32> + ret <8 x float> %c +} + ;; xvrepl128vei.d define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: shufflevector_v4f64: @@ -63,3 +117,13 @@ define <4 x double> @shufflevector_v4f64(<4 x double> %a, 
<4 x double> %b) { %c = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> ret <4 x double> %c } + +;; xvrepl128vei.d +define <4 x double> @shufflevector_v4f64_undef(<4 x double> %a) { +; CHECK-LABEL: shufflevector_v4f64_undef: +; CHECK: # %bb.0: +; CHECK-NEXT: xvrepl128vei.d $xr0, $xr0, 0 +; CHECK-NEXT: ret + %c = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> + ret <4 x double> %c +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll index 6a88805148715..4900146b69a25 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ;; xvshuf.b diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll index 02186d23e31e5..37b62ca989edb 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shuffle-as-xvshuf4i.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s ;; xxvshuf4i.b @@ -40,4 +41,4 @@ define <8 x float> @shufflevector_xvshuf4i_v8f32(<8 x float> %a, <8 x float> %b) ; CHECK-NEXT: ret %c = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %c -} \ No newline at end of file +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll b/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll new file mode 100644 index 0000000000000..39ac647d6875c --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/scalarize-fp.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 -mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 -mattr=+lasx < %s | FileCheck %s + +define <8 x float> @fadd_elt0_v8f32(float %a) nounwind { +; CHECK-LABEL: fadd_elt0_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr1, -1168 +; CHECK-NEXT: fadd.s $fa0, $fa0, $fa1 +; CHECK-NEXT: ret +entry: + %b = insertelement <8 x float> poison, float %a, i32 0 + %c = fadd <8 x float> %b, + ret <8 x float> %c +} + +define <4 x double> @fadd_elt0_v4f64(double %a) nounwind { +; CHECK-LABEL: fadd_elt0_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr1, -912 +; CHECK-NEXT: fadd.d $fa0, $fa0, $fa1 +; CHECK-NEXT: ret +entry: + %b = insertelement <4 x double> poison, double %a, i32 0 + %c = fadd <4 x double> %b, + ret <4 x double> %c +} + +define <8 x float> @fsub_splat_v8f32(float %a, float %b) nounwind { +; CHECK-LABEL: fsub_splat_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsub.s $fa0, $fa0, $fa1 +; CHECK-NEXT: xvreplve0.w $xr0, $xr0 +; CHECK-NEXT: ret +entry: + %insa = insertelement <8 x float> poison, float %a, i32 0 + %insb = insertelement <8 x float> poison, float %b, i32 0 + %va = shufflevector <8 x float> %insa, <8 x float> poison, <8 x i32> zeroinitializer + %vb = shufflevector <8 x float> %insb, <8 x float> poison, <8 
x i32> zeroinitializer + %c = fsub <8 x float> %va, %vb + ret <8 x float> %c +} + +define <4 x double> @fsub_splat_v4f64(double %a) nounwind { +; CHECK-LABEL: fsub_splat_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr1, -784 +; CHECK-NEXT: fadd.d $fa0, $fa0, $fa1 +; CHECK-NEXT: xvreplve0.d $xr0, $xr0 +; CHECK-NEXT: ret +entry: + %insa = insertelement <4 x double> poison, double %a, i32 0 + %insb = insertelement <4 x double> poison, double 1.0, i32 0 + %va = shufflevector <4 x double> %insa, <4 x double> poison, <4 x i32> zeroinitializer + %vb = shufflevector <4 x double> %insb, <4 x double> poison, <4 x i32> zeroinitializer + %c = fsub <4 x double> %va, %vb + ret <4 x double> %c +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll index 5f76d9951df9c..245f76472b844 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/shuffle-as-permute-and-shuffle.ll @@ -1,15 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s define <32 x i8> @shuffle_v32i8(<32 x i8> %a) { ; CHECK-LABEL: shuffle_v32i8: ; CHECK: # %bb.0: ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) -; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI0_0) -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) -; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI0_1) -; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78 -; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3 +; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI0_0) +; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78 ; CHECK-NEXT: xvshuf.h $xr1, $xr2, $xr0 ; CHECK-NEXT: xvori.b $xr0, $xr1, 0 ; CHECK-NEXT: ret @@ -33,11 +31,8 @@ define <16 x i16> @shuffle_v16i16(<16 x i16> %a) { ; CHECK-LABEL: shuffle_v16i16: ; CHECK: # %bb.0: ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI2_0) -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1) -; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI2_1) -; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78 -; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3 +; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI2_0) +; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78 ; CHECK-NEXT: xvshuf.w $xr1, $xr2, $xr0 ; CHECK-NEXT: xvori.b $xr0, $xr1, 0 ; CHECK-NEXT: ret @@ -71,10 +66,7 @@ define <8 x i32> @shuffle_v8i32(<8 x i32> %a) { define <8 x i32> @shuffle_v8i32_same_lane(<8 x i32> %a) { ; CHECK-LABEL: shuffle_v8i32_same_lane: ; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) -; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI5_0) -; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr0 -; CHECK-NEXT: xvori.b $xr0, $xr1, 0 +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 225 ; CHECK-NEXT: ret %shuffle = shufflevector <8 x i32> %a, <8 x i32> poison, <8 x i32> ret <8 x i32> %shuffle @@ -83,14 +75,7 @@ define <8 x i32> @shuffle_v8i32_same_lane(<8 x i32> %a) { define <4 x i64> @shuffle_v4i64(<4 x i64> %a) { ; CHECK-LABEL: shuffle_v4i64: ; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0) -; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI6_0) -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_1) -; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI6_1) -; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78 -; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3 -; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0 -; CHECK-NEXT: xvori.b $xr0, $xr1, 0 +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 39 ; CHECK-NEXT: ret %shuffle = 
shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> ret <4 x i64> %shuffle @@ -99,10 +84,7 @@ define <4 x i64> @shuffle_v4i64(<4 x i64> %a) { define <4 x i64> @shuffle_v4i64_same_lane(<4 x i64> %a) { ; CHECK-LABEL: shuffle_v4i64_same_lane: ; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0) -; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI7_0) -; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr0 -; CHECK-NEXT: xvori.b $xr0, $xr1, 0 +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 225 ; CHECK-NEXT: ret %shuffle = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> ret <4 x i64> %shuffle @@ -135,14 +117,7 @@ define <8 x float> @shuffle_v8f32_same_lane(<8 x float> %a) { define <4 x double> @shuffle_v4f64(<4 x double> %a) { ; CHECK-LABEL: shuffle_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0) -; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI10_0) -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_1) -; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI10_1) -; CHECK-NEXT: xvpermi.d $xr3, $xr0, 78 -; CHECK-NEXT: xvshuf.d $xr2, $xr0, $xr3 -; CHECK-NEXT: xvshuf.d $xr1, $xr2, $xr0 -; CHECK-NEXT: xvori.b $xr0, $xr1, 0 +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 39 ; CHECK-NEXT: ret %shuffle = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> ret <4 x double> %shuffle @@ -151,11 +126,7 @@ define <4 x double> @shuffle_v4f64(<4 x double> %a) { define <4 x double> @shuffle_v4f64_same_lane(<4 x double> %a) { ; CHECK-LABEL: shuffle_v4f64_same_lane: ; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0) -; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI11_0) -; CHECK-NEXT: xvpermi.d $xr0, $xr0, 78 -; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr0 -; CHECK-NEXT: xvori.b $xr0, $xr1, 0 +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 75 ; CHECK-NEXT: ret %shuffle = shufflevector <4 x double> %a, <4 x double> poison, <4 x i32> ret <4 x double> %shuffle diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-add.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-add.ll index 7268eb24ee51c..3e815a174d232 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-add.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-add.ll @@ -1,19 +1,33 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefix=LA64 define void @vec_reduce_add_v32i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v32i8: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvhaddw.h.b $xr0, $xr0, $xr0 -; CHECK-NEXT: xvhaddw.w.h $xr0, $xr0, $xr0 -; CHECK-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 -; CHECK-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 2 -; CHECK-NEXT: xvadd.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 -; CHECK-NEXT: st.b $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v32i8: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvhaddw.h.b $xr0, $xr0, $xr0 +; LA32-NEXT: xvhaddw.w.h $xr0, $xr0, $xr0 +; LA32-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 +; LA32-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; LA32-NEXT: xvpermi.d $xr1, $xr0, 2 +; LA32-NEXT: xvadd.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 0 +; LA32-NEXT: st.b $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v32i8: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: 
xvhaddw.h.b $xr0, $xr0, $xr0 +; LA64-NEXT: xvhaddw.w.h $xr0, $xr0, $xr0 +; LA64-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 +; LA64-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; LA64-NEXT: xvpermi.d $xr1, $xr0, 2 +; LA64-NEXT: xvadd.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; LA64-NEXT: st.b $a0, $a1, 0 +; LA64-NEXT: ret %v = load <32 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %v) store i8 %res, ptr %dst @@ -21,17 +35,29 @@ define void @vec_reduce_add_v32i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v16i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v16i16: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvhaddw.w.h $xr0, $xr0, $xr0 -; CHECK-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 -; CHECK-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 2 -; CHECK-NEXT: xvadd.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 -; CHECK-NEXT: st.h $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v16i16: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvhaddw.w.h $xr0, $xr0, $xr0 +; LA32-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 +; LA32-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; LA32-NEXT: xvpermi.d $xr1, $xr0, 2 +; LA32-NEXT: xvadd.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 0 +; LA32-NEXT: st.h $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v16i16: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvhaddw.w.h $xr0, $xr0, $xr0 +; LA64-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 +; LA64-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; LA64-NEXT: xvpermi.d $xr1, $xr0, 2 +; LA64-NEXT: xvadd.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; LA64-NEXT: st.h $a0, $a1, 0 +; LA64-NEXT: ret %v = load <16 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %v) store i16 %res, ptr %dst @@ -39,16 +65,27 @@ define void @vec_reduce_add_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 -; CHECK-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 2 -; CHECK-NEXT: xvadd.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 -; CHECK-NEXT: st.w $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 +; LA32-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; LA32-NEXT: xvpermi.d $xr1, $xr0, 2 +; LA32-NEXT: xvadd.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvhaddw.d.w $xr0, $xr0, $xr0 +; LA64-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; LA64-NEXT: xvpermi.d $xr1, $xr0, 2 +; LA64-NEXT: xvadd.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 0 +; LA64-NEXT: st.w $a0, $a1, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -56,14 +93,31 @@ define void @vec_reduce_add_v8i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 -; CHECK-NEXT: xvpermi.d $xr1, $xr0, 2 -; CHECK-NEXT: xvadd.d $xr0, $xr1, $xr0 -; 
CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vadd.d $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 1 +; LA32-NEXT: add.w $a0, $a2, $a0 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 0 +; LA32-NEXT: add.w $a2, $a3, $a2 +; LA32-NEXT: sltu $a3, $a2, $a3 +; LA32-NEXT: add.w $a0, $a0, $a3 +; LA32-NEXT: st.w $a2, $a1, 0 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvhaddw.q.d $xr0, $xr0, $xr0 +; LA64-NEXT: xvpermi.d $xr1, $xr0, 2 +; LA64-NEXT: xvadd.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvstelm.d $xr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-and.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-and.ll index fd64beab57bf0..23cc230f04503 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-and.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-and.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_and_v32i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_and_v32i8: @@ -44,17 +45,30 @@ define void @vec_reduce_and_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_and_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_and_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_and_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vand.v $vr0, $vr0, $vr1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_and_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vand.v $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -62,15 +76,30 @@ define void @vec_reduce_and_v8i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_and_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_and_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vand.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vand.v $vr0, $vr1, 
$vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_and_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vand.v $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 0 +; LA32-NEXT: and $a0, $a2, $a0 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 1 +; LA32-NEXT: and $a2, $a3, $a2 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_and_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vand.v $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-or.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-or.ll index cdb08d9de3821..d7d3afc6dd1da 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-or.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-or.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_or_v32i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_or_v32i8: @@ -44,17 +45,30 @@ define void @vec_reduce_or_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_or_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_or_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vor.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_or_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_or_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vor.v $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -62,15 +76,30 @@ define void @vec_reduce_or_v8i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_or_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_or_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vor.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret 
+; LA32-LABEL: vec_reduce_or_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 0 +; LA32-NEXT: or $a0, $a2, $a0 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 1 +; LA32-NEXT: or $a2, $a3, $a2 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_or_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vor.v $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smax.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smax.ll index 1d182731c93be..8cbbb52884865 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smax.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smax.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_smax_v32i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_smax_v32i8: @@ -44,17 +45,30 @@ define void @vec_reduce_smax_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smax_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smax_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smax_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmax.w $vr0, $vr0, $vr1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smax_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmax.w $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -62,15 +76,41 @@ define void @vec_reduce_smax_v8i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smax_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smax_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.d $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: 
vec_reduce_smax_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmax.d $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 3 +; LA32-NEXT: slt $a3, $a2, $a0 +; LA32-NEXT: xor $a4, $a0, $a2 +; LA32-NEXT: sltui $a4, $a4, 1 +; LA32-NEXT: masknez $a3, $a3, $a4 +; LA32-NEXT: vpickve2gr.w $a5, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a6, $vr0, 2 +; LA32-NEXT: sltu $a7, $a6, $a5 +; LA32-NEXT: maskeqz $a4, $a7, $a4 +; LA32-NEXT: or $a3, $a4, $a3 +; LA32-NEXT: masknez $a4, $a6, $a3 +; LA32-NEXT: maskeqz $a5, $a5, $a3 +; LA32-NEXT: or $a4, $a5, $a4 +; LA32-NEXT: masknez $a2, $a2, $a3 +; LA32-NEXT: maskeqz $a0, $a0, $a3 +; LA32-NEXT: or $a0, $a0, $a2 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: st.w $a4, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smax_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmax.d $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmax.d $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smin.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smin.ll index 369afdd1fc7bc..c34852aa8a28f 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smin.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-smin.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_smin_v32i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_smin_v32i8: @@ -44,17 +45,30 @@ define void @vec_reduce_smin_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smin_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smin_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmin.w $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smin_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmin.w $vr0, $vr0, $vr1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smin_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmin.w $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -62,15 +76,41 @@ define void @vec_reduce_smin_v8i32(ptr %src, ptr %dst) nounwind { } define void 
@vec_reduce_smin_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smin_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmin.d $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.d $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smin_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmin.d $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 1 +; LA32-NEXT: slt $a3, $a2, $a0 +; LA32-NEXT: xor $a4, $a2, $a0 +; LA32-NEXT: sltui $a4, $a4, 1 +; LA32-NEXT: masknez $a3, $a3, $a4 +; LA32-NEXT: vpickve2gr.w $a5, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a6, $vr0, 0 +; LA32-NEXT: sltu $a7, $a6, $a5 +; LA32-NEXT: maskeqz $a4, $a7, $a4 +; LA32-NEXT: or $a3, $a4, $a3 +; LA32-NEXT: masknez $a4, $a5, $a3 +; LA32-NEXT: maskeqz $a5, $a6, $a3 +; LA32-NEXT: or $a4, $a5, $a4 +; LA32-NEXT: masknez $a0, $a0, $a3 +; LA32-NEXT: maskeqz $a2, $a2, $a3 +; LA32-NEXT: or $a0, $a2, $a0 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: st.w $a4, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smin_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmin.d $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.d $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umax.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umax.ll index 5256a72ad7d97..c44f83a909a68 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umax.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umax.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_umax_v32i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_umax_v32i8: @@ -44,17 +45,30 @@ define void @vec_reduce_umax_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umax_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umax_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umax_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmax.wu $vr0, $vr0, $vr1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umax_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmax.wu $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmax.wu 
$vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -62,15 +76,41 @@ define void @vec_reduce_umax_v8i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umax_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umax_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmax.du $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.du $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umax_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmax.du $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 3 +; LA32-NEXT: sltu $a3, $a2, $a0 +; LA32-NEXT: xor $a4, $a0, $a2 +; LA32-NEXT: sltui $a4, $a4, 1 +; LA32-NEXT: masknez $a3, $a3, $a4 +; LA32-NEXT: vpickve2gr.w $a5, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a6, $vr0, 2 +; LA32-NEXT: sltu $a7, $a6, $a5 +; LA32-NEXT: maskeqz $a4, $a7, $a4 +; LA32-NEXT: or $a3, $a4, $a3 +; LA32-NEXT: masknez $a4, $a6, $a3 +; LA32-NEXT: maskeqz $a5, $a5, $a3 +; LA32-NEXT: or $a4, $a5, $a4 +; LA32-NEXT: masknez $a2, $a2, $a3 +; LA32-NEXT: maskeqz $a0, $a0, $a3 +; LA32-NEXT: or $a0, $a0, $a2 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: st.w $a4, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umax_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmax.du $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmax.du $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umin.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umin.ll index a82c886d8eed1..f91a1b34dffe9 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umin.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-umin.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_umin_v32i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_umin_v32i8: @@ -44,17 +45,30 @@ define void @vec_reduce_umin_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umin_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umin_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmin.wu $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umin_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmin.wu $vr0, $vr0, $vr1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA32-NEXT: 
vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umin_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmin.wu $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -62,15 +76,41 @@ define void @vec_reduce_umin_v8i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umin_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umin_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vmin.du $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.du $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umin_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vmin.du $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 1 +; LA32-NEXT: sltu $a3, $a2, $a0 +; LA32-NEXT: xor $a4, $a2, $a0 +; LA32-NEXT: sltui $a4, $a4, 1 +; LA32-NEXT: masknez $a3, $a3, $a4 +; LA32-NEXT: vpickve2gr.w $a5, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a6, $vr0, 0 +; LA32-NEXT: sltu $a7, $a6, $a5 +; LA32-NEXT: maskeqz $a4, $a7, $a4 +; LA32-NEXT: or $a3, $a4, $a3 +; LA32-NEXT: masknez $a4, $a5, $a3 +; LA32-NEXT: maskeqz $a5, $a6, $a3 +; LA32-NEXT: or $a4, $a5, $a4 +; LA32-NEXT: masknez $a0, $a0, $a3 +; LA32-NEXT: maskeqz $a2, $a2, $a3 +; LA32-NEXT: or $a0, $a2, $a0 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: st.w $a4, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umin_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vmin.du $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.du $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-xor.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-xor.ll index 429fadcdd156e..af1a66b574c03 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-xor.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-reduce-xor.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_xor_v32i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_xor_v32i8: @@ -44,17 +45,30 @@ define void @vec_reduce_xor_v16i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_xor_v8i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_xor_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, 
$vr0, 4 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_xor_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vxor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_xor_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vxor.v $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %v) store i32 %res, ptr %dst @@ -62,15 +76,30 @@ define void @vec_reduce_xor_v8i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_xor_v4i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_xor_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_xor_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvld $xr0, $a0, 0 +; LA32-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-NEXT: vxor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 0 +; LA32-NEXT: xor $a0, $a2, $a0 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 1 +; LA32-NEXT: xor $a2, $a3, $a2 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_xor_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvld $xr0, $a0, 0 +; LA64-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-NEXT: vxor.v $vr0, $vr0, $vr1 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll b/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll index b697a2fd07435..2007f851129e8 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vec-shuffle-byte-rotate.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s ;; TODO For these special shuffle mask, we can lower it to xvbsll + xvbsrl + xvor. 
@@ -126,9 +127,7 @@ define <4 x i64> @byte_rotate_v4i64_2(<4 x i64> %a, <4 x i64> %b) nounwind { define <4 x i64> @byte_rotate_v4i64_3(<4 x i64> %a) nounwind { ; CHECK-LABEL: byte_rotate_v4i64_3: ; CHECK: # %bb.0: -; CHECK-NEXT: xvbsrl.v $xr1, $xr0, 8 -; CHECK-NEXT: xvbsll.v $xr0, $xr0, 8 -; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1 +; CHECK-NEXT: xvpermi.d $xr0, $xr0, 177 ; CHECK-NEXT: ret %shuffle = shufflevector <4 x i64> %a, <4 x i64> poison, <4 x i32> ret <4 x i64> %shuffle diff --git a/llvm/test/CodeGen/LoongArch/lasx/vselect.ll b/llvm/test/CodeGen/LoongArch/lasx/vselect.ll index 44e4f71c8d08d..bf31ccb1d0104 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/vselect.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/vselect.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA32 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s --check-prefixes=CHECK,LA64 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s define void @select_v32i8_imm(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: select_v32i8_imm: @@ -50,26 +50,14 @@ define void @select_v16i16(ptr %res, ptr %a0, ptr %a1) nounwind { } define void @select_v8i32(ptr %res, ptr %a0, ptr %a1) nounwind { -; LA32-LABEL: select_v8i32: -; LA32: # %bb.0: -; LA32-NEXT: xvld $xr0, $a1, 0 -; LA32-NEXT: xvld $xr1, $a2, 0 -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0) -; LA32-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI3_0) -; LA32-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2 -; LA32-NEXT: xvst $xr0, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: select_v8i32: -; LA64: # %bb.0: -; LA64-NEXT: xvld $xr0, $a1, 0 -; LA64-NEXT: xvld $xr1, $a2, 0 -; LA64-NEXT: ori $a1, $zero, 0 -; LA64-NEXT: lu32i.d $a1, -1 -; LA64-NEXT: xvreplgr2vr.d $xr2, $a1 -; LA64-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2 -; LA64-NEXT: xvst $xr0, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: select_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvld $xr1, $a2, 0 +; CHECK-NEXT: xvldi $xr2, -1552 +; CHECK-NEXT: xvbitsel.v $xr0, $xr1, $xr0, $xr2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret %v0 = load <8 x i32>, ptr %a0 %v1 = load <8 x i32>, ptr %a1 %sel = select <8 x i1> , <8 x i32> %v0, <8 x i32> %v1 diff --git a/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll b/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll index 06d4a5d03f276..09908f619fa1f 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/xvmskcond.ll @@ -1,15 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx --verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LA64 define i32 @xmsk_eq_allzeros_i8(<32 x i8 > %a) { -; CHECK-LABEL: xmsk_eq_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmsknz.b $xr0, $xr0 -; CHECK-NEXT: xvnor.v $xr0, $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_eq_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmsknz.b $xr0, $xr0 +; LA32-NEXT: xvnor.v $xr0, 
$xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_eq_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmsknz.b $xr0, $xr0 +; LA64-NEXT: xvnor.v $xr0, $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp eq <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -17,15 +27,25 @@ entry: } define i32 @xmsk_sgt_allzeros_i8(<32 x i8 > %a) { -; CHECK-LABEL: xmsk_sgt_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvrepli.b $xr1, 0 -; CHECK-NEXT: xvslt.b $xr0, $xr1, $xr0 -; CHECK-NEXT: xvmskltz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sgt_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvrepli.b $xr1, 0 +; LA32-NEXT: xvslt.b $xr0, $xr1, $xr0 +; LA32-NEXT: xvmskltz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sgt_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvrepli.b $xr1, 0 +; LA64-NEXT: xvslt.b $xr0, $xr1, $xr0 +; LA64-NEXT: xvmskltz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp sgt <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -33,13 +53,21 @@ entry: } define i32 @xmsk_sgt_allones_i8(<32 x i8 > %a) { -; CHECK-LABEL: xmsk_sgt_allones_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskgez.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sgt_allones_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskgez.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sgt_allones_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskgez.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp sgt <32 x i8> %a, splat (i8 -1) %2 = bitcast <32 x i1> %1 to i32 @@ -47,13 +75,21 @@ entry: } define i32 @xmsk_sge_allzeros_i8(<32 x i8 > %a) { -; CHECK-LABEL: xmsk_sge_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskgez.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sge_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskgez.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sge_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskgez.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp sge <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -61,13 +97,21 @@ entry: } define i32 @xmsk_slt_allzeros_i8(<32 x i8 > %a) { -; CHECK-LABEL: xmsk_slt_allzeros_i8: -; CHECK: # 
%bb.0: # %entry -; CHECK-NEXT: xvmskltz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_slt_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_slt_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp slt <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -75,13 +119,21 @@ entry: } define i16 @xmsk_slt_allzeros_i16(<16 x i16 > %a) { -; CHECK-LABEL: xmsk_slt_allzeros_i16: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskltz.h $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 15, 8 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_slt_allzeros_i16: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.h $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 15, 8 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_slt_allzeros_i16: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.h $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 15, 8 +; LA64-NEXT: ret entry: %1 = icmp slt <16 x i16> %a, splat (i16 0) %2 = bitcast <16 x i1> %1 to i16 @@ -89,13 +141,21 @@ entry: } define i8 @xmsk_slt_allzeros_i32(<8 x i32 > %a) { -; CHECK-LABEL: xmsk_slt_allzeros_i32: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_slt_allzeros_i32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_slt_allzeros_i32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret entry: %1 = icmp slt <8 x i32> %a, splat (i32 0) %2 = bitcast <8 x i1> %1 to i8 @@ -103,13 +163,21 @@ entry: } define i4 @xmsk_slt_allzeros_i64(<4 x i64 > %a) { -; CHECK-LABEL: xmsk_slt_allzeros_i64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskltz.d $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 3, 2 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_slt_allzeros_i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.d $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 3, 2 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_slt_allzeros_i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.d $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 3, 2 +; LA64-NEXT: ret entry: %1 = icmp slt <4 x i64> %a, splat (i64 0) %2 = bitcast <4 x i1> %1 to i4 @@ -117,14 +185,23 @@ entry: } define i32 @xmsk_sle_allzeros_i8(<32 x i8 > %a) { -; 
CHECK-LABEL: xmsk_sle_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvslei.b $xr0, $xr0, 0 -; CHECK-NEXT: xvmskltz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sle_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvslei.b $xr0, $xr0, 0 +; LA32-NEXT: xvmskltz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sle_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvslei.b $xr0, $xr0, 0 +; LA64-NEXT: xvmskltz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp sle <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -132,13 +209,21 @@ entry: } define i32 @xmsk_sle_allones_i8(<32 x i8 > %a) { -; CHECK-LABEL: xmsk_sle_allones_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskltz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sle_allones_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sle_allones_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp sle <32 x i8> %a, splat (i8 -1) %2 = bitcast <32 x i1> %1 to i32 @@ -146,13 +231,21 @@ entry: } define i16 @xmsk_sle_allones_i32(<16 x i16 > %a) { -; CHECK-LABEL: xmsk_sle_allones_i32: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskltz.h $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 15, 8 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sle_allones_i32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.h $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 15, 8 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sle_allones_i32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.h $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 15, 8 +; LA64-NEXT: ret entry: %1 = icmp sle <16 x i16> %a, splat (i16 -1) %2 = bitcast <16 x i1> %1 to i16 @@ -160,13 +253,21 @@ entry: } define i8 @xmsk_sle_allones_i16(<8 x i32 > %a) { -; CHECK-LABEL: xmsk_sle_allones_i16: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sle_allones_i16: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sle_allones_i16: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret entry: %1 = 
icmp sle <8 x i32> %a, splat (i32 -1) %2 = bitcast <8 x i1> %1 to i8 @@ -174,13 +275,21 @@ entry: } define i4 @xmsk_sle_allones_i64(<4 x i64 > %a) { -; CHECK-LABEL: xmsk_sle_allones_i64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmskltz.d $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 3, 2 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_sle_allones_i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmskltz.d $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 3, 2 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_sle_allones_i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmskltz.d $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 3, 2 +; LA64-NEXT: ret entry: %1 = icmp sle <4 x i64> %a, splat (i64 -1) %2 = bitcast <4 x i1> %1 to i4 @@ -188,13 +297,21 @@ entry: } define i32 @xmsk_ne_allzeros_i8(<32 x i8 > %a) { -; CHECK-LABEL: xmsk_ne_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvmsknz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xmsk_ne_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: xvmsknz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xmsk_ne_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: xvmsknz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret entry: %1 = icmp ne <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -202,100 +319,165 @@ entry: } define i4 @xvmsk_sgt_v4i64(<4 x i64> %a, <4 x i64> %b) { -; CHECK-LABEL: xvmsk_sgt_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvmskltz.d $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 3, 2 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvmskltz.d $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 3, 2 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvmskltz.d $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 3, 2 +; LA64-NEXT: ret %x = icmp sgt <4 x i64> %a, %b %res = bitcast <4 x i1> %x to i4 ret i4 %res } define i4 @xvmsk_ogt_v4f64(<4 x double> %a, <4 x double> %b) { -; CHECK-LABEL: xvmsk_ogt_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvmskltz.d $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 3, 2 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_ogt_v4f64: +; LA32: # %bb.0: +; LA32-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvmskltz.d $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 3, 2 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_ogt_v4f64: +; LA64: # %bb.0: +; LA64-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 +; 
LA64-NEXT: xvmskltz.d $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 3, 2 +; LA64-NEXT: ret %x = fcmp ogt <4 x double> %a, %b %res = bitcast <4 x i1> %x to i4 ret i4 %res } define i8 @xvmsk_sgt_v8i32(<8 x i32> %a, <8 x i32> %b) { -; CHECK-LABEL: xvmsk_sgt_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.w $xr0, $xr1, $xr0 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x = icmp sgt <8 x i32> %a, %b %res = bitcast <8 x i1> %x to i8 ret i8 %res } define i8 @xvmsk_ogt_v8f32(<8 x float> %a, <8 x float> %b) { -; CHECK-LABEL: xvmsk_ogt_v8f32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_ogt_v8f32: +; LA32: # %bb.0: +; LA32-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_ogt_v8f32: +; LA64: # %bb.0: +; LA64-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x = fcmp ogt <8 x float> %a, %b %res = bitcast <8 x i1> %x to i8 ret i8 %res } define i16 @xvmsk_sgt_v16i16(<16 x i16> %a, <16 x i16> %b) { -; CHECK-LABEL: xvmsk_sgt_v16i16: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.h $xr0, $xr1, $xr0 -; CHECK-NEXT: xvmskltz.h $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 15, 8 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_v16i16: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.h $xr0, $xr1, $xr0 +; LA32-NEXT: xvmskltz.h $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 15, 8 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_v16i16: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.h $xr0, $xr1, $xr0 +; LA64-NEXT: xvmskltz.h $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 15, 8 +; LA64-NEXT: ret %x = icmp sgt <16 x i16> %a, %b %res = bitcast <16 x i1> %x to i16 ret i16 %res } define i32 @xvmsk_sgt_v32i8(<32 x i8> %a, <32 x i8> %b) { -; CHECK-LABEL: xvmsk_sgt_v32i8: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.b $xr0, $xr1, $xr0 -; CHECK-NEXT: xvmskltz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_v32i8: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.b $xr0, $xr1, $xr0 +; 
LA32-NEXT: xvmskltz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_v32i8: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.b $xr0, $xr1, $xr0 +; LA64-NEXT: xvmskltz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret %x = icmp sgt <32 x i8> %a, %b %res = bitcast <32 x i1> %x to i32 ret i32 %res } define i4 @xvmsk_sgt_and_sgt_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) { -; CHECK-LABEL: xvmsk_sgt_and_sgt_v4i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.d $xr2, $xr3, $xr2 -; CHECK-NEXT: xvslt.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.d $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 3, 2 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_and_sgt_v4i64: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.d $xr2, $xr3, $xr2 +; LA32-NEXT: xvslt.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.d $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 3, 2 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_and_sgt_v4i64: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.d $xr2, $xr3, $xr2 +; LA64-NEXT: xvslt.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.d $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 3, 2 +; LA64-NEXT: ret %x0 = icmp sgt <4 x i64> %a, %b %x1 = icmp sgt <4 x i64> %c, %d %y = and <4 x i1> %x0, %x1 @@ -304,16 +486,27 @@ define i4 @xvmsk_sgt_and_sgt_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 } define i4 @xvmsk_ogt_and_ogt_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) { -; CHECK-LABEL: xvmsk_ogt_and_ogt_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvfcmp.clt.d $xr2, $xr3, $xr2 -; CHECK-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.d $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 3, 2 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_ogt_and_ogt_v4f64: +; LA32: # %bb.0: +; LA32-NEXT: xvfcmp.clt.d $xr2, $xr3, $xr2 +; LA32-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.d $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 3, 2 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_ogt_and_ogt_v4f64: +; LA64: # %bb.0: +; LA64-NEXT: xvfcmp.clt.d $xr2, $xr3, $xr2 +; LA64-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.d $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 3, 2 +; LA64-NEXT: ret %x0 = fcmp ogt <4 x double> %a, %b %x1 = fcmp ogt <4 x double> %c, %d %y = and <4 x i1> %x0, %x1 @@ -322,16 +515,27 @@ define i4 @xvmsk_ogt_and_ogt_v4f64(<4 x double> %a, <4 x double> %b, <4 x double } define i8 @xvmsk_sgt_and_sgt_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { -; CHECK-LABEL: xvmsk_sgt_and_sgt_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.w $xr2, $xr3, $xr2 -; CHECK-NEXT: xvslt.w $xr0, $xr1, $xr0 -; CHECK-NEXT: xvand.v $xr0, 
$xr0, $xr2 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_and_sgt_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.w $xr2, $xr3, $xr2 +; LA32-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_and_sgt_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.w $xr2, $xr3, $xr2 +; LA64-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x0 = icmp sgt <8 x i32> %a, %b %x1 = icmp sgt <8 x i32> %c, %d %y = and <8 x i1> %x0, %x1 @@ -340,16 +544,27 @@ define i8 @xvmsk_sgt_and_sgt_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 } define i8 @xvmsk_sgt_or_sgt_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { -; CHECK-LABEL: xvmsk_sgt_or_sgt_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.w $xr2, $xr3, $xr2 -; CHECK-NEXT: xvslt.w $xr0, $xr1, $xr0 -; CHECK-NEXT: xvor.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_or_sgt_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.w $xr2, $xr3, $xr2 +; LA32-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_or_sgt_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.w $xr2, $xr3, $xr2 +; LA64-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA64-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x0 = icmp sgt <8 x i32> %a, %b %x1 = icmp sgt <8 x i32> %c, %d %y = or <8 x i1> %x0, %x1 @@ -358,18 +573,31 @@ define i8 @xvmsk_sgt_or_sgt_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x } define i8 @xvmsk_sgt_or_slt_and_eq_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d, <8 x i32> %e, <8 x i32> %f) { -; CHECK-LABEL: xvmsk_sgt_or_slt_and_eq_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.w $xr2, $xr2, $xr3 -; CHECK-NEXT: xvslt.w $xr0, $xr1, $xr0 -; CHECK-NEXT: xvor.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvseq.w $xr1, $xr4, $xr5 -; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_or_slt_and_eq_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.w $xr2, $xr2, $xr3 +; LA32-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvseq.w $xr1, $xr4, $xr5 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_or_slt_and_eq_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.w $xr2, $xr2, 
$xr3 +; LA64-NEXT: xvslt.w $xr0, $xr1, $xr0 +; LA64-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvseq.w $xr1, $xr4, $xr5 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr1 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x0 = icmp sgt <8 x i32> %a, %b %x1 = icmp slt <8 x i32> %c, %d %x2 = icmp eq <8 x i32> %e, %f @@ -380,15 +608,25 @@ define i8 @xvmsk_sgt_or_slt_and_eq_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> % } define i8 @xvmsk_eq_vsel_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { -; CHECK-LABEL: xvmsk_eq_vsel_slt_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvseq.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvor.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_eq_vsel_slt_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: xvseq.w $xr0, $xr0, $xr1 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_eq_vsel_slt_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: xvseq.w $xr0, $xr0, $xr1 +; LA64-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %cmp = icmp eq <8 x i32> %a0, %a1 %slt = icmp slt <8 x i32> %a2, zeroinitializer %sel = select <8 x i1> %cmp, <8 x i1> , <8 x i1> %slt @@ -397,22 +635,39 @@ define i8 @xvmsk_eq_vsel_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) } define i8 @xvmsk_sel_eq_or_eq_or_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2, <8 x i32> %a3, i1 %a4) { -; CHECK-LABEL: xvmsk_sel_eq_or_eq_or_slt_v8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: andi $a0, $a0, 1 -; CHECK-NEXT: xvseq.w $xr2, $xr0, $xr2 -; CHECK-NEXT: addi.d $a1, $zero, -1 -; CHECK-NEXT: maskeqz $a0, $a1, $a0 -; CHECK-NEXT: xvreplgr2vr.w $xr4, $a0 -; CHECK-NEXT: xvand.v $xr2, $xr2, $xr4 -; CHECK-NEXT: xvseq.w $xr0, $xr0, $xr1 -; CHECK-NEXT: xvor.v $xr0, $xr3, $xr0 -; CHECK-NEXT: xvor.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sel_eq_or_eq_or_slt_v8i32: +; LA32: # %bb.0: +; LA32-NEXT: andi $a0, $a0, 1 +; LA32-NEXT: xvseq.w $xr2, $xr0, $xr2 +; LA32-NEXT: addi.w $a1, $zero, -1 +; LA32-NEXT: maskeqz $a0, $a1, $a0 +; LA32-NEXT: xvreplgr2vr.w $xr4, $a0 +; LA32-NEXT: xvand.v $xr2, $xr2, $xr4 +; LA32-NEXT: xvseq.w $xr0, $xr0, $xr1 +; LA32-NEXT: xvor.v $xr0, $xr3, $xr0 +; LA32-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sel_eq_or_eq_or_slt_v8i32: +; LA64: # %bb.0: +; LA64-NEXT: andi $a0, $a0, 1 +; LA64-NEXT: xvseq.w $xr2, $xr0, $xr2 +; LA64-NEXT: addi.d $a1, $zero, -1 +; LA64-NEXT: maskeqz $a0, $a1, $a0 +; LA64-NEXT: xvreplgr2vr.w $xr4, $a0 +; LA64-NEXT: xvand.v $xr2, $xr2, $xr4 +; LA64-NEXT: xvseq.w $xr0, $xr0, $xr1 +; LA64-NEXT: xvor.v $xr0, $xr3, $xr0 +; LA64-NEXT: xvor.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: 
xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %cmp0 = icmp eq <8 x i32> %a0, %a1 %cmp1 = icmp eq <8 x i32> %a0, %a2 %cmp2 = icmp slt <8 x i32> %a3, zeroinitializer @@ -424,16 +679,27 @@ define i8 @xvmsk_sel_eq_or_eq_or_slt_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i3 } define i8 @xvmsk_ogt_and_ogt_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) { -; CHECK-LABEL: xvmsk_ogt_and_ogt_v8f32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvfcmp.clt.s $xr2, $xr3, $xr2 -; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 -; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_ogt_and_ogt_v8f32: +; LA32: # %bb.0: +; LA32-NEXT: xvfcmp.clt.s $xr2, $xr3, $xr2 +; LA32-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_ogt_and_ogt_v8f32: +; LA64: # %bb.0: +; LA64-NEXT: xvfcmp.clt.s $xr2, $xr3, $xr2 +; LA64-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x0 = fcmp ogt <8 x float> %a, %b %x1 = fcmp ogt <8 x float> %c, %d %y = and <8 x i1> %x0, %x1 @@ -442,16 +708,27 @@ define i8 @xvmsk_ogt_and_ogt_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> % } define i8 @xvmsk_sgt_xor_sgt_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) { -; CHECK-LABEL: xvmsk_sgt_xor_sgt_v8f32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvfcmp.clt.s $xr2, $xr3, $xr2 -; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 -; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_xor_sgt_v8f32: +; LA32: # %bb.0: +; LA32-NEXT: xvfcmp.clt.s $xr2, $xr3, $xr2 +; LA32-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 +; LA32-NEXT: xvxor.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_xor_sgt_v8f32: +; LA64: # %bb.0: +; LA64-NEXT: xvfcmp.clt.s $xr2, $xr3, $xr2 +; LA64-NEXT: xvfcmp.clt.s $xr0, $xr1, $xr0 +; LA64-NEXT: xvxor.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x0 = fcmp ogt <8 x float> %a, %b %x1 = fcmp ogt <8 x float> %c, %d %y = xor <8 x i1> %x0, %x1 @@ -460,18 +737,31 @@ define i8 @xvmsk_sgt_xor_sgt_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> % } define i8 @xvmsk_ugt_xor_ueq_and_ogt_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d, <8 x float> %e, <8 x float> %f) { -; CHECK-LABEL: xvmsk_ugt_xor_ueq_and_ogt_v8f32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvfcmp.cueq.s $xr2, $xr2, $xr3 -; CHECK-NEXT: xvfcmp.cult.s $xr0, $xr1, $xr0 -; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvfcmp.clt.s $xr1, $xr5, $xr4 -; CHECK-NEXT: 
xvand.v $xr0, $xr0, $xr1 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_ugt_xor_ueq_and_ogt_v8f32: +; LA32: # %bb.0: +; LA32-NEXT: xvfcmp.cueq.s $xr2, $xr2, $xr3 +; LA32-NEXT: xvfcmp.cult.s $xr0, $xr1, $xr0 +; LA32-NEXT: xvxor.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvfcmp.clt.s $xr1, $xr5, $xr4 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_ugt_xor_ueq_and_ogt_v8f32: +; LA64: # %bb.0: +; LA64-NEXT: xvfcmp.cueq.s $xr2, $xr2, $xr3 +; LA64-NEXT: xvfcmp.cult.s $xr0, $xr1, $xr0 +; LA64-NEXT: xvxor.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvfcmp.clt.s $xr1, $xr5, $xr4 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr1 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %x0 = fcmp ugt <8 x float> %a, %b %x1 = fcmp ueq <8 x float> %c, %d %x2 = fcmp ogt <8 x float> %e, %f @@ -482,16 +772,27 @@ define i8 @xvmsk_ugt_xor_ueq_and_ogt_v8f32(<8 x float> %a, <8 x float> %b, <8 x } define i16 @xvmsk_sgt_and_sgt_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) { -; CHECK-LABEL: xvmsk_sgt_and_sgt_v16i16: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.h $xr2, $xr3, $xr2 -; CHECK-NEXT: xvslt.h $xr0, $xr1, $xr0 -; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2 -; CHECK-NEXT: xvmskltz.h $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 15, 8 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_and_sgt_v16i16: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.h $xr2, $xr3, $xr2 +; LA32-NEXT: xvslt.h $xr0, $xr1, $xr0 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA32-NEXT: xvmskltz.h $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 15, 8 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_and_sgt_v16i16: +; LA64: # %bb.0: +; LA64-NEXT: xvslt.h $xr2, $xr3, $xr2 +; LA64-NEXT: xvslt.h $xr0, $xr1, $xr0 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr2 +; LA64-NEXT: xvmskltz.h $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 15, 8 +; LA64-NEXT: ret %x0 = icmp sgt <16 x i16> %a, %b %x1 = icmp sgt <16 x i16> %c, %d %y = and <16 x i1> %x0, %x1 @@ -500,16 +801,27 @@ define i16 @xvmsk_sgt_and_sgt_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c } define i32 @xvmsk_sgt_and_sgt_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) { -; CHECK-LABEL: xvmsk_sgt_and_sgt_v32i8: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslt.b $xr0, $xr1, $xr0 -; CHECK-NEXT: xvslt.b $xr1, $xr3, $xr2 -; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1 -; CHECK-NEXT: xvmskltz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_sgt_and_sgt_v32i8: +; LA32: # %bb.0: +; LA32-NEXT: xvslt.b $xr0, $xr1, $xr0 +; LA32-NEXT: xvslt.b $xr1, $xr3, $xr2 +; LA32-NEXT: xvand.v $xr0, $xr0, $xr1 +; LA32-NEXT: xvmskltz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_sgt_and_sgt_v32i8: 
+; LA64: # %bb.0: +; LA64-NEXT: xvslt.b $xr0, $xr1, $xr0 +; LA64-NEXT: xvslt.b $xr1, $xr3, $xr2 +; LA64-NEXT: xvand.v $xr0, $xr0, $xr1 +; LA64-NEXT: xvmskltz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret %x0 = icmp sgt <32 x i8> %a, %b %x1 = icmp sgt <32 x i8> %c, %d %y = and <32 x i1> %x0, %x1 @@ -518,17 +830,29 @@ define i32 @xvmsk_sgt_and_sgt_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3 } define i8 @xvmsk_eq_v2i64_concat_poison(<2 x i64> %vec) { -; CHECK-LABEL: xvmsk_eq_v2i64_concat_poison: -; CHECK: # %bb.0: -; CHECK-NEXT: vseqi.d $vr0, $vr0, 0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a0, 1 -; CHECK-NEXT: vslli.h $vr0, $vr1, 15 -; CHECK-NEXT: vmskltz.h $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_eq_v2i64_concat_poison: +; LA32: # %bb.0: +; LA32-NEXT: vseqi.d $vr0, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: vinsgr2vr.h $vr1, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: vinsgr2vr.h $vr1, $a0, 1 +; LA32-NEXT: vslli.h $vr0, $vr1, 15 +; LA32-NEXT: vmskltz.h $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_eq_v2i64_concat_poison: +; LA64: # %bb.0: +; LA64-NEXT: vseqi.d $vr0, $vr0, 0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: vinsgr2vr.h $vr1, $a0, 0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 1 +; LA64-NEXT: vinsgr2vr.h $vr1, $a0, 1 +; LA64-NEXT: vslli.h $vr0, $vr1, 15 +; LA64-NEXT: vmskltz.h $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: ret %tobool = icmp eq <2 x i64> %vec, zeroinitializer %insertvec = shufflevector <2 x i1> %tobool, <2 x i1> poison, <8 x i32> %res = bitcast <8 x i1> %insertvec to i8 @@ -560,22 +884,39 @@ define i8 @xvmsk_ne_v4i32_concat_poison(<4 x i32> %vec) { } define i8 @xvmsk_ogt_v4f64_concat_poison(<4 x double> %vec) { -; CHECK-LABEL: xvmsk_ogt_v4f64_concat_poison: -; CHECK: # %bb.0: -; CHECK-NEXT: xvrepli.b $xr1, 0 -; CHECK-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 2 -; CHECK-NEXT: xvpickve2gr.d $a2, $xr0, 1 -; CHECK-NEXT: xvpickve2gr.d $a3, $xr0, 0 -; CHECK-NEXT: vinsgr2vr.h $vr0, $a3, 0 -; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 1 -; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 2 -; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 3 -; CHECK-NEXT: vslli.h $vr0, $vr0, 15 -; CHECK-NEXT: vmskltz.h $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_ogt_v4f64_concat_poison: +; LA32: # %bb.0: +; LA32-NEXT: xvrepli.b $xr1, 0 +; LA32-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 +; LA32-NEXT: xvpickve2gr.w $a0, $xr0, 6 +; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 4 +; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 2 +; LA32-NEXT: xvpickve2gr.w $a3, $xr0, 0 +; LA32-NEXT: vinsgr2vr.h $vr0, $a3, 0 +; LA32-NEXT: vinsgr2vr.h $vr0, $a2, 1 +; LA32-NEXT: vinsgr2vr.h $vr0, $a1, 2 +; LA32-NEXT: vinsgr2vr.h $vr0, $a0, 3 +; LA32-NEXT: vslli.h $vr0, $vr0, 15 +; LA32-NEXT: vmskltz.h $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_ogt_v4f64_concat_poison: +; LA64: # %bb.0: +; LA64-NEXT: xvrepli.b $xr1, 0 +; LA64-NEXT: xvfcmp.clt.d $xr0, $xr1, $xr0 +; LA64-NEXT: xvpickve2gr.d $a0, $xr0, 3 +; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 2 +; LA64-NEXT: xvpickve2gr.d $a2, $xr0, 1 +; LA64-NEXT: xvpickve2gr.d $a3, 
$xr0, 0 +; LA64-NEXT: vinsgr2vr.h $vr0, $a3, 0 +; LA64-NEXT: vinsgr2vr.h $vr0, $a2, 1 +; LA64-NEXT: vinsgr2vr.h $vr0, $a1, 2 +; LA64-NEXT: vinsgr2vr.h $vr0, $a0, 3 +; LA64-NEXT: vslli.h $vr0, $vr0, 15 +; LA64-NEXT: vmskltz.h $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: ret %tobool = fcmp ogt <4 x double> %vec, zeroinitializer %insertvec = shufflevector <4 x i1> %tobool, <4 x i1> poison, <8 x i32> %res = bitcast <8 x i1> %insertvec to i8 @@ -583,56 +924,92 @@ define i8 @xvmsk_ogt_v4f64_concat_poison(<4 x double> %vec) { } define i32 @xvmsk_trunc_i8(<32 x i8> %a) { -; CHECK-LABEL: xvmsk_trunc_i8: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslli.b $xr0, $xr0, 7 -; CHECK-NEXT: xvmskltz.b $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 31, 16 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_trunc_i8: +; LA32: # %bb.0: +; LA32-NEXT: xvslli.b $xr0, $xr0, 7 +; LA32-NEXT: xvmskltz.b $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 31, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_trunc_i8: +; LA64: # %bb.0: +; LA64-NEXT: xvslli.b $xr0, $xr0, 7 +; LA64-NEXT: xvmskltz.b $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 31, 16 +; LA64-NEXT: ret %y = trunc <32 x i8> %a to <32 x i1> %res = bitcast <32 x i1> %y to i32 ret i32 %res } define i16 @xvmsk_trunc_i16(<16 x i16> %a) { -; CHECK-LABEL: xvmsk_trunc_i16: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslli.h $xr0, $xr0, 15 -; CHECK-NEXT: xvmskltz.h $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 15, 8 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_trunc_i16: +; LA32: # %bb.0: +; LA32-NEXT: xvslli.h $xr0, $xr0, 15 +; LA32-NEXT: xvmskltz.h $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 15, 8 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_trunc_i16: +; LA64: # %bb.0: +; LA64-NEXT: xvslli.h $xr0, $xr0, 15 +; LA64-NEXT: xvmskltz.h $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 15, 8 +; LA64-NEXT: ret %y = trunc <16 x i16> %a to <16 x i1> %res = bitcast <16 x i1> %y to i16 ret i16 %res } define i8 @xvmsk_trunc_i32(<8 x i32> %a) { -; CHECK-LABEL: xvmsk_trunc_i32: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslli.w $xr0, $xr0, 31 -; CHECK-NEXT: xvmskltz.w $xr0, $xr0 -; CHECK-NEXT: xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 7, 4 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_trunc_i32: +; LA32: # %bb.0: +; LA32-NEXT: xvslli.w $xr0, $xr0, 31 +; LA32-NEXT: xvmskltz.w $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 7, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_trunc_i32: +; LA64: # %bb.0: +; LA64-NEXT: xvslli.w $xr0, $xr0, 31 +; LA64-NEXT: xvmskltz.w $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 7, 4 +; LA64-NEXT: ret %y = trunc <8 x i32> %a to <8 x i1> %res = bitcast <8 x i1> %y to i8 ret i8 %res } define i4 @xvmsk_trunc_i64(<4 x i64> %a) { -; CHECK-LABEL: xvmsk_trunc_i64: -; CHECK: # %bb.0: -; CHECK-NEXT: xvslli.d $xr0, $xr0, 63 -; CHECK-NEXT: xvmskltz.d $xr0, $xr0 -; CHECK-NEXT: 
xvpickve2gr.wu $a0, $xr0, 0 -; CHECK-NEXT: xvpickve2gr.wu $a1, $xr0, 4 -; CHECK-NEXT: bstrins.d $a0, $a1, 3, 2 -; CHECK-NEXT: ret +; LA32-LABEL: xvmsk_trunc_i64: +; LA32: # %bb.0: +; LA32-NEXT: xvslli.d $xr0, $xr0, 63 +; LA32-NEXT: xvmskltz.d $xr0, $xr0 +; LA32-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA32-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA32-NEXT: bstrins.w $a0, $a1, 3, 2 +; LA32-NEXT: ret +; +; LA64-LABEL: xvmsk_trunc_i64: +; LA64: # %bb.0: +; LA64-NEXT: xvslli.d $xr0, $xr0, 63 +; LA64-NEXT: xvmskltz.d $xr0, $xr0 +; LA64-NEXT: xvpickve2gr.wu $a0, $xr0, 0 +; LA64-NEXT: xvpickve2gr.wu $a1, $xr0, 4 +; LA64-NEXT: bstrins.d $a0, $a1, 3, 2 +; LA64-NEXT: ret %y = trunc <4 x i64> %a to <4 x i1> %res = bitcast <4 x i1> %y to i4 ret i4 %res diff --git a/llvm/test/CodeGen/LoongArch/lsx/abs.ll b/llvm/test/CodeGen/LoongArch/lsx/abs.ll new file mode 100644 index 0000000000000..85fe1fe5c0da7 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/abs.ll @@ -0,0 +1,128 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @vabs_b(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.b $vr1, $vr0 +; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <16 x i8>, ptr %src + %b = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a, i1 true) + store <16 x i8> %b, ptr %dst + ret void +} + +define void @vabs_b_1(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_b_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.b $vr1, $vr0 +; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <16 x i8>, ptr %src + %b = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a, i1 false) + store <16 x i8> %b, ptr %dst + ret void +} + +define void @vabs_h(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.h $vr1, $vr0 +; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x i16>, ptr %src + %b = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a, i1 true) + store <8 x i16> %b, ptr %dst + ret void +} + +define void @vabs_h_1(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_h_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.h $vr1, $vr0 +; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x i16>, ptr %src + %b = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a, i1 false) + store <8 x i16> %b, ptr %dst + ret void +} + +define void @vabs_w(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.w $vr1, $vr0 +; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i32>, ptr %src + %b = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a, i1 true) + store <4 x i32> %b, ptr %dst + ret void +} + +define void @vabs_w_1(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_w_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.w $vr1, $vr0 +; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i32>, ptr %src + %b = tail call <4 x 
i32> @llvm.abs.v4i32(<4 x i32> %a, i1 false) + store <4 x i32> %b, ptr %dst + ret void +} + +define void @vabs_d(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.d $vr1, $vr0 +; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %src + %b = tail call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a, i1 true) + store <2 x i64> %b, ptr %dst + ret void +} + +define void @vabs_d_1(ptr %dst, ptr %src) { +; CHECK-LABEL: vabs_d_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vneg.d $vr1, $vr0 +; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %src + %b = tail call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a, i1 false) + store <2 x i64> %b, ptr %dst + ret void +} + +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) +declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) +declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) +declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1) diff --git a/llvm/test/CodeGen/LoongArch/lsx/and-not-combine.ll b/llvm/test/CodeGen/LoongArch/lsx/and-not-combine.ll new file mode 100644 index 0000000000000..3c6d34505e114 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/and-not-combine.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @and_not_combine_v16i8(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { +; CHECK-LABEL: and_not_combine_v16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a3, 0 +; CHECK-NEXT: vld $vr2, $a1, 0 +; CHECK-NEXT: vsub.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr2 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <16 x i8>, ptr %a0 + %v1 = load <16 x i8>, ptr %a1 + %v2 = load <16 x i8>, ptr %a2 + %not = xor <16 x i8> %v1, + %add = add <16 x i8> %not, %v2 + %and = and <16 x i8> %v0, %add + store <16 x i8> %and, ptr %res + ret void +} + +define void @and_not_combine_v8i16(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { +; CHECK-LABEL: and_not_combine_v8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a3, 0 +; CHECK-NEXT: vld $vr2, $a1, 0 +; CHECK-NEXT: vsub.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr2 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x i16>, ptr %a0 + %v1 = load <8 x i16>, ptr %a1 + %v2 = load <8 x i16>, ptr %a2 + %not = xor <8 x i16> %v1, + %add = add <8 x i16> %not, %v2 + %and = and <8 x i16> %v0, %add + store <8 x i16> %and, ptr %res + ret void +} + +define void @and_not_combine_v4i32(ptr %res, ptr %a0, ptr %a1, ptr %a2) nounwind { +; CHECK-LABEL: and_not_combine_v4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a3, 0 +; CHECK-NEXT: vld $vr2, $a1, 0 +; CHECK-NEXT: vsub.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr2 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x i32>, ptr %a0 + %v1 = load <4 x i32>, ptr %a1 + %v2 = load <4 x i32>, ptr %a2 + %not = xor <4 x i32> %v1, + %add = add <4 x i32> %not, %v2 + %and = and <4 x i32> %v0, %add + store <4 x i32> %and, ptr %res + ret void +} + +define void @and_not_combine_v2i64(ptr %res, ptr %a0, ptr %a1, ptr %a2) 
nounwind { +; CHECK-LABEL: and_not_combine_v2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a3, 0 +; CHECK-NEXT: vld $vr2, $a1, 0 +; CHECK-NEXT: vsub.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vandn.v $vr0, $vr0, $vr2 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x i64>, ptr %a0 + %v1 = load <2 x i64>, ptr %a1 + %v2 = load <2 x i64>, ptr %a2 + %not = xor <2 x i64> %v1, + %add = add <2 x i64> %not, %v2 + %and = and <2 x i64> %v0, %add + store <2 x i64> %and, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll b/llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll index 4c17d3fd8d7b2..b0d36a8143fa1 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll @@ -1,20 +1,39 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 -mattr=+32s,+lsx --verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefix=LA32 ; RUN: llc --mtriple=loongarch64 -mattr=+lsx --verify-machineinstrs < %s \ -; RUN: | FileCheck %s +; RUN: | FileCheck %s --check-prefix=LA64 declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind { -; CHECK-LABEL: test_bitreverse_v16i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: bitrev.8b $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 -; CHECK-NEXT: bitrev.8b $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; CHECK-NEXT: vori.b $vr0, $vr1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: test_bitreverse_v16i8: +; LA32: # %bb.0: +; LA32-NEXT: vslli.b $vr1, $vr0, 4 +; LA32-NEXT: vsrli.b $vr0, $vr0, 4 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vandi.b $vr1, $vr0, 51 +; LA32-NEXT: vslli.b $vr1, $vr1, 2 +; LA32-NEXT: vsrli.b $vr0, $vr0, 2 +; LA32-NEXT: vandi.b $vr0, $vr0, 51 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vandi.b $vr1, $vr0, 85 +; LA32-NEXT: vslli.b $vr1, $vr1, 1 +; LA32-NEXT: vsrli.b $vr0, $vr0, 1 +; LA32-NEXT: vandi.b $vr0, $vr0, 85 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: ret +; +; LA64-LABEL: test_bitreverse_v16i8: +; LA64: # %bb.0: +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: bitrev.8b $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 1 +; LA64-NEXT: bitrev.8b $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; LA64-NEXT: vori.b $vr0, $vr1, 0 +; LA64-NEXT: ret %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ret <16 x i8> %b } @@ -22,16 +41,33 @@ define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind { declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind { -; CHECK-LABEL: test_bitreverse_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; CHECK-NEXT: vshuf4i.h $vr0, $vr1, 27 -; CHECK-NEXT: ret +; LA32-LABEL: test_bitreverse_v8i16: +; LA32: # %bb.0: +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 1 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 2 +; 
LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 3 +; LA32-NEXT: vshuf4i.h $vr0, $vr1, 27 +; LA32-NEXT: ret +; +; LA64-LABEL: test_bitreverse_v8i16: +; LA64: # %bb.0: +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 1 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; LA64-NEXT: vshuf4i.h $vr0, $vr1, 27 +; LA64-NEXT: ret %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ret <8 x i16> %b } @@ -39,16 +75,33 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind { declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind { -; CHECK-LABEL: test_bitreverse_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; CHECK-NEXT: vshuf4i.w $vr0, $vr1, 177 -; CHECK-NEXT: ret +; LA32-LABEL: test_bitreverse_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 1 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 2 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: bitrev.w $a0, $a0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 3 +; LA32-NEXT: vori.b $vr0, $vr1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: test_bitreverse_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 1 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; LA64-NEXT: vshuf4i.w $vr0, $vr1, 177 +; LA64-NEXT: ret %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ret <4 x i32> %b } @@ -56,16 +109,36 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind { declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind { -; CHECK-LABEL: test_bitreverse_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 -; CHECK-NEXT: bitrev.d $a0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; CHECK-NEXT: vori.b $vr0, $vr1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: test_bitreverse_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) +; LA32-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI3_0) +; LA32-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 +; LA32-NEXT: vslli.b $vr1, $vr0, 4 +; LA32-NEXT: vsrli.b $vr0, $vr0, 4 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vandi.b $vr1, $vr0, 51 +; LA32-NEXT: vslli.b $vr1, $vr1, 2 +; LA32-NEXT: vsrli.b $vr0, $vr0, 2 +; LA32-NEXT: vandi.b $vr0, $vr0, 51 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: vandi.b $vr1, $vr0, 85 +; LA32-NEXT: vslli.b $vr1, $vr1, 1 +; LA32-NEXT: vsrli.b $vr0, $vr0, 1 +; LA32-NEXT: vandi.b $vr0, $vr0, 85 +; LA32-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-NEXT: ret +; +; LA64-LABEL: test_bitreverse_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 1 +; LA64-NEXT: 
bitrev.d $a0, $a0 +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 +; LA64-NEXT: vori.b $vr0, $vr1, 0 +; LA64-NEXT: ret %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ret <2 x i64> %b } diff --git a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll index cae7c08f2d685..fe45e73b36f51 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/build-vector.ll @@ -192,11 +192,11 @@ entry: ret void } -define void @buildvector_v2f32_const_splat(ptr %dst) nounwind { -; CHECK-LABEL: buildvector_v2f32_const_splat: +;; Also check buildvector_const_splat_vldi_1010. +define void @buildvector_v4f32_const_splat(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_v4f32_const_splat: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lu12i.w $a1, 260096 -; CHECK-NEXT: vreplgr2vr.w $vr0, $a1 +; CHECK-NEXT: vldi $vr0, -1424 ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -204,30 +204,112 @@ entry: ret void } +;; Also check buildvector_const_splat_vldi_1100. define void @buildvector_v2f64_const_splat(ptr %dst) nounwind { -; LA32-LABEL: buildvector_v2f64_const_splat: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI14_0) -; LA32-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI14_0) -; LA32-NEXT: vst $vr0, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: buildvector_v2f64_const_splat: -; LA64: # %bb.0: # %entry -; LA64-NEXT: lu52i.d $a1, $zero, 1023 -; LA64-NEXT: vreplgr2vr.d $vr0, $a1 -; LA64-NEXT: vst $vr0, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: buildvector_v2f64_const_splat: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -912 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret entry: store <2 x double> , ptr %dst ret void } +;; imm[11:8] == 4'b0000/4'b0100/4'b1000 can be represented using vrepli.[whb]. 
+define void @buildvector_const_splat_vldi_0001(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_const_splat_vldi_0001: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -3837 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <4 x i32> , ptr %dst + ret void +} + +define void @buildvector_const_splat_vldi_0010(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_const_splat_vldi_0010: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -3583 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <4 x i32> , ptr %dst + ret void +} + +define void @buildvector_const_splat_vldi_0011(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_const_splat_vldi_0011: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -3327 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <4 x i32> , ptr %dst + ret void +} + +define void @buildvector_const_splat_vldi_0101(ptr %dst) { +; CHECK-LABEL: buildvector_const_splat_vldi_0101: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -2813 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <8 x i16> , ptr %dst + ret void +} + +define void @buildvector_const_splat_vldi_0110(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_const_splat_vldi_0110: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -2557 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <4 x i32> , ptr %dst + ret void +} + +define void @buildvector_const_splat_vldi_0111(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_const_splat_vldi_0111: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -2305 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <4 x i32> , ptr %dst + ret void +} + +define void @buildvector_const_splat_vldi_1001(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_const_splat_vldi_1001: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1789 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <4 x i32> , ptr %dst + ret void +} + +define void @buildvector_const_splat_vldi_1011(ptr %dst) nounwind { +; CHECK-LABEL: buildvector_const_splat_vldi_1011: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1280 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + store <4 x float> , ptr %dst + ret void +} + define void @buildvector_v16i8_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v16i8_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI15_0) -; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI15_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI23_0) +; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI23_0) ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -238,8 +320,8 @@ entry: define void @buildvector_v8i16_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v8i16_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI16_0) -; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI16_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI24_0) +; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI24_0) ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -250,8 +332,8 @@ entry: define void @buildvector_v4i32_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v4i32_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) -; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI17_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI25_0) +; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI25_0) ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -262,8 +344,8 @@ entry: define void @buildvector_v2i64_const(ptr %dst) nounwind { ; 
CHECK-LABEL: buildvector_v2i64_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI18_0) -; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI18_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI26_0) +; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI26_0) ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -274,8 +356,8 @@ entry: define void @buildvector_v2f32_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v2f32_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI19_0) -; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI19_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI27_0) +; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI27_0) ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: @@ -286,8 +368,8 @@ entry: define void @buildvector_v2f64_const(ptr %dst) nounwind { ; CHECK-LABEL: buildvector_v2f64_const: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI20_0) -; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI20_0) +; CHECK-NEXT: pcalau12i $a1, %pc_hi20(.LCPI28_0) +; CHECK-NEXT: vld $vr0, $a1, %pc_lo12(.LCPI28_0) ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lsx/extract-binop.ll b/llvm/test/CodeGen/LoongArch/lsx/extract-binop.ll new file mode 100644 index 0000000000000..e8ddf84de6dff --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/extract-binop.ll @@ -0,0 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64 + +define i8 @extractelt_add_v16i8(ptr %p) { +; CHECK-LABEL: extractelt_add_v16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vaddi.bu $vr0, $vr0, 13 +; CHECK-NEXT: vpickve2gr.b $a0, $vr0, 2 +; CHECK-NEXT: ret +entry: + %x = load <16 x i8>, ptr %p + %add = add <16 x i8> %x, + %ext = extractelement <16 x i8> %add, i32 2 + ret i8 %ext +} + +define i16 @extractelt_add_v8i16(ptr %p) { +; CHECK-LABEL: extractelt_add_v8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vaddi.hu $vr0, $vr0, 13 +; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 2 +; CHECK-NEXT: ret +entry: + %x = load <8 x i16>, ptr %p + %add = add <8 x i16> %x, + %ext = extractelement <8 x i16> %add, i32 2 + ret i16 %ext +} + +define i32 @extractelt_add_v4i32(ptr %p) { +; LA32-LABEL: extractelt_add_v4i32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: ld.w $a0, $a0, 8 +; LA32-NEXT: addi.w $a0, $a0, 13 +; LA32-NEXT: ret +; +; LA64-LABEL: extractelt_add_v4i32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vaddi.wu $vr0, $vr0, 13 +; LA64-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA64-NEXT: ret +entry: + %x = load <4 x i32>, ptr %p + %add = add <4 x i32> %x, + %ext = extractelement <4 x i32> %add, i32 2 + ret i32 %ext +} + +define i64 @extractelt_add_v2i64(ptr %p) { +; LA32-LABEL: extractelt_add_v2i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vaddi.du $vr0, $vr0, 12 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a1, $vr0, 3 +; LA32-NEXT: ret +; +; LA64-LABEL: extractelt_add_v2i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a0, $a0, 8 +; LA64-NEXT: addi.d $a0, $a0, 12 +; LA64-NEXT: ret +entry: + %x = load <2 x i64>, ptr %p + %add = add <2 x i64> %x, + %ext = extractelement <2 x i64> %add, i32 1 + ret i64 %ext +} + +define float @extractelt_fadd_v4f32(ptr %p) { +; 
CHECK-LABEL: extractelt_fadd_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fld.s $fa0, $a0, 8 +; CHECK-NEXT: vldi $vr1, -1238 +; CHECK-NEXT: fadd.s $fa0, $fa0, $fa1 +; CHECK-NEXT: ret +entry: + %x = load <4 x float>, ptr %p + %add = fadd <4 x float> %x, + %ext = extractelement <4 x float> %add, i32 2 + ret float %ext +} + +define double @extractelt_fadd_v2f64(ptr %p) { +; CHECK-LABEL: extractelt_fadd_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fld.d $fa0, $a0, 8 +; CHECK-NEXT: vldi $vr1, -984 +; CHECK-NEXT: fadd.d $fa0, $fa0, $fa1 +; CHECK-NEXT: ret +entry: + %x = load <2 x double>, ptr %p + %add = fadd <2 x double> %x, + %ext = extractelement <2 x double> %add, i32 1 + ret double %ext +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll index 58e16d37ae278..46eb91e4079bf 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx,-frecipe < %s | FileCheck %s --check-prefixes=FAULT,FAULT-LA32 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx,+frecipe < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx,+frecipe < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx,-frecipe < %s | FileCheck %s --check-prefixes=FAULT,FAULT-LA64 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s --check-prefixes=CHECK,LA64 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s define void @fdiv_v4f32(ptr %res, ptr %a0, ptr %a1) nounwind { ; FAULT-LABEL: fdiv_v4f32: @@ -40,35 +40,19 @@ define void @fdiv_v2f64(ptr %res, ptr %a0, ptr %a1) nounwind { ; FAULT-NEXT: vst $vr0, $a0, 0 ; FAULT-NEXT: ret ; -; LA32-LABEL: fdiv_v2f64: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a3, %pc_hi20(.LCPI1_0) -; LA32-NEXT: vld $vr0, $a2, 0 -; LA32-NEXT: vld $vr1, $a3, %pc_lo12(.LCPI1_0) -; LA32-NEXT: vld $vr2, $a1, 0 -; LA32-NEXT: vfrecipe.d $vr3, $vr0 -; LA32-NEXT: vfmadd.d $vr1, $vr0, $vr3, $vr1 -; LA32-NEXT: vfnmsub.d $vr1, $vr1, $vr3, $vr3 -; LA32-NEXT: vfmul.d $vr3, $vr2, $vr1 -; LA32-NEXT: vfnmsub.d $vr0, $vr0, $vr3, $vr2 -; LA32-NEXT: vfmadd.d $vr0, $vr1, $vr0, $vr3 -; LA32-NEXT: vst $vr0, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: fdiv_v2f64: -; LA64: # %bb.0: # %entry -; LA64-NEXT: vld $vr0, $a2, 0 -; LA64-NEXT: vld $vr1, $a1, 0 -; LA64-NEXT: lu52i.d $a1, $zero, -1025 -; LA64-NEXT: vreplgr2vr.d $vr2, $a1 -; LA64-NEXT: vfrecipe.d $vr3, $vr0 -; LA64-NEXT: vfmadd.d $vr2, $vr0, $vr3, $vr2 -; LA64-NEXT: vfnmsub.d $vr2, $vr2, $vr3, $vr3 -; LA64-NEXT: vfmul.d $vr3, $vr1, $vr2 -; LA64-NEXT: vfnmsub.d $vr0, $vr0, $vr3, $vr1 -; LA64-NEXT: vfmadd.d $vr0, $vr2, $vr0, $vr3 -; LA64-NEXT: vst $vr0, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: fdiv_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vfrecipe.d $vr2, $vr0 +; CHECK-NEXT: vldi $vr3, -784 +; CHECK-NEXT: vfmadd.d $vr3, $vr0, $vr2, $vr3 +; CHECK-NEXT: vfnmsub.d $vr2, $vr3, $vr2, $vr2 +; CHECK-NEXT: vfmul.d $vr3, $vr1, $vr2 +; CHECK-NEXT: vfnmsub.d $vr0, $vr0, $vr3, $vr1 +; CHECK-NEXT: vfmadd.d $vr0, $vr2, $vr0, $vr3 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret entry: %v0 = load <2 x double>, ptr %a0 %v1 = load <2 x double>, ptr %a1 @@ -90,8 +74,7 @@ define void 
@one_fdiv_v4f32(ptr %res, ptr %a0) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vld $vr0, $a1, 0 ; CHECK-NEXT: vfrecipe.s $vr1, $vr0 -; CHECK-NEXT: lu12i.w $a1, -264192 -; CHECK-NEXT: vreplgr2vr.w $vr2, $a1 +; CHECK-NEXT: vldi $vr2, -1296 ; CHECK-NEXT: vfmadd.s $vr0, $vr0, $vr1, $vr2 ; CHECK-NEXT: vfnmsub.s $vr0, $vr0, $vr1, $vr1 ; CHECK-NEXT: vst $vr0, $a0, 0 @@ -107,24 +90,22 @@ define void @one_fdiv_v2f64(ptr %res, ptr %a0) nounwind { ; FAULT-LA32-LABEL: one_fdiv_v2f64: ; FAULT-LA32: # %bb.0: # %entry ; FAULT-LA32-NEXT: vld $vr0, $a1, 0 -; FAULT-LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0) -; FAULT-LA32-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI3_0) +; FAULT-LA32-NEXT: vldi $vr1, -912 ; FAULT-LA32-NEXT: vfdiv.d $vr0, $vr1, $vr0 ; FAULT-LA32-NEXT: vst $vr0, $a0, 0 ; FAULT-LA32-NEXT: ret ; -; LA32-LABEL: one_fdiv_v2f64: -; LA32: # %bb.0: # %entry -; LA32-NEXT: vld $vr0, $a1, 0 -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0) -; LA32-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI3_0) -; LA32-NEXT: vfrecipe.d $vr2, $vr0 -; LA32-NEXT: vfnmsub.d $vr3, $vr0, $vr2, $vr1 -; LA32-NEXT: vfmadd.d $vr2, $vr2, $vr3, $vr2 -; LA32-NEXT: vfnmsub.d $vr0, $vr0, $vr2, $vr1 -; LA32-NEXT: vfmadd.d $vr0, $vr2, $vr0, $vr2 -; LA32-NEXT: vst $vr0, $a0, 0 -; LA32-NEXT: ret +; CHECK-LABEL: one_fdiv_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrecipe.d $vr1, $vr0 +; CHECK-NEXT: vldi $vr2, -912 +; CHECK-NEXT: vfnmsub.d $vr3, $vr0, $vr1, $vr2 +; CHECK-NEXT: vfmadd.d $vr1, $vr1, $vr3, $vr1 +; CHECK-NEXT: vfnmsub.d $vr0, $vr0, $vr1, $vr2 +; CHECK-NEXT: vfmadd.d $vr0, $vr1, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret ; ; FAULT-LA64-LABEL: one_fdiv_v2f64: ; FAULT-LA64: # %bb.0: # %entry @@ -132,19 +113,6 @@ define void @one_fdiv_v2f64(ptr %res, ptr %a0) nounwind { ; FAULT-LA64-NEXT: vfrecip.d $vr0, $vr0 ; FAULT-LA64-NEXT: vst $vr0, $a0, 0 ; FAULT-LA64-NEXT: ret -; -; LA64-LABEL: one_fdiv_v2f64: -; LA64: # %bb.0: # %entry -; LA64-NEXT: vld $vr0, $a1, 0 -; LA64-NEXT: vfrecipe.d $vr1, $vr0 -; LA64-NEXT: lu52i.d $a1, $zero, 1023 -; LA64-NEXT: vreplgr2vr.d $vr2, $a1 -; LA64-NEXT: vfnmsub.d $vr3, $vr0, $vr1, $vr2 -; LA64-NEXT: vfmadd.d $vr1, $vr1, $vr3, $vr1 -; LA64-NEXT: vfnmsub.d $vr0, $vr0, $vr1, $vr2 -; LA64-NEXT: vfmadd.d $vr0, $vr1, $vr0, $vr1 -; LA64-NEXT: vst $vr0, $a0, 0 -; LA64-NEXT: ret entry: %v0 = load <2 x double>, ptr %a0 %div = fdiv fast <2 x double> , %v0 diff --git a/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll index 1f744830bd56b..4951696e05a94 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx,-frecipe < %s | FileCheck %s --check-prefixes=FAULT,FAULT-LA32 -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx,+frecipe < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx,+frecipe < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx,-frecipe < %s | FileCheck %s --check-prefixes=FAULT,FAULT-LA64 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s --check-prefixes=CHECK,LA64 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s ;; 1.0 / (fsqrt vec) define void @one_div_sqrt_v4f32(ptr %res, ptr %a0) nounwind { @@ -19,11 +19,9 @@ define void 
@one_div_sqrt_v4f32(ptr %res, ptr %a0) nounwind { ; CHECK-NEXT: vfrsqrte.s $vr1, $vr0 ; CHECK-NEXT: vfmul.s $vr1, $vr0, $vr1 ; CHECK-NEXT: vfmul.s $vr0, $vr0, $vr1 -; CHECK-NEXT: lu12i.w $a1, -261120 -; CHECK-NEXT: vreplgr2vr.w $vr2, $a1 +; CHECK-NEXT: vldi $vr2, -1400 ; CHECK-NEXT: vfmadd.s $vr0, $vr0, $vr1, $vr2 -; CHECK-NEXT: lu12i.w $a1, -266240 -; CHECK-NEXT: vreplgr2vr.w $vr2, $a1 +; CHECK-NEXT: vldi $vr2, -3137 ; CHECK-NEXT: vfmul.s $vr1, $vr1, $vr2 ; CHECK-NEXT: vfmul.s $vr0, $vr1, $vr0 ; CHECK-NEXT: vst $vr0, $a0, 0 @@ -40,32 +38,29 @@ define void @one_div_sqrt_v2f64(ptr %res, ptr %a0) nounwind { ; FAULT-LA32-LABEL: one_div_sqrt_v2f64: ; FAULT-LA32: # %bb.0: # %entry ; FAULT-LA32-NEXT: vld $vr0, $a1, 0 -; FAULT-LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0) -; FAULT-LA32-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI1_0) ; FAULT-LA32-NEXT: vfsqrt.d $vr0, $vr0 +; FAULT-LA32-NEXT: vldi $vr1, -912 ; FAULT-LA32-NEXT: vfdiv.d $vr0, $vr1, $vr0 ; FAULT-LA32-NEXT: vst $vr0, $a0, 0 ; FAULT-LA32-NEXT: ret ; -; LA32-LABEL: one_div_sqrt_v2f64: -; LA32: # %bb.0: # %entry -; LA32-NEXT: vld $vr0, $a1, 0 -; LA32-NEXT: vfrsqrte.d $vr1, $vr0 -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0) -; LA32-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI1_0) -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_1) -; LA32-NEXT: vld $vr3, $a1, %pc_lo12(.LCPI1_1) -; LA32-NEXT: vfmul.d $vr1, $vr0, $vr1 -; LA32-NEXT: vfmul.d $vr4, $vr0, $vr1 -; LA32-NEXT: vfmadd.d $vr4, $vr4, $vr1, $vr2 -; LA32-NEXT: vfmul.d $vr1, $vr1, $vr3 -; LA32-NEXT: vfmul.d $vr1, $vr1, $vr4 -; LA32-NEXT: vfmul.d $vr0, $vr0, $vr1 -; LA32-NEXT: vfmadd.d $vr0, $vr0, $vr1, $vr2 -; LA32-NEXT: vfmul.d $vr1, $vr1, $vr3 -; LA32-NEXT: vfmul.d $vr0, $vr1, $vr0 -; LA32-NEXT: vst $vr0, $a0, 0 -; LA32-NEXT: ret +; CHECK-LABEL: one_div_sqrt_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrsqrte.d $vr1, $vr0 +; CHECK-NEXT: vfmul.d $vr1, $vr0, $vr1 +; CHECK-NEXT: vfmul.d $vr2, $vr0, $vr1 +; CHECK-NEXT: vldi $vr3, -888 +; CHECK-NEXT: vfmadd.d $vr2, $vr2, $vr1, $vr3 +; CHECK-NEXT: vldi $vr4, -800 +; CHECK-NEXT: vfmul.d $vr1, $vr1, $vr4 +; CHECK-NEXT: vfmul.d $vr1, $vr1, $vr2 +; CHECK-NEXT: vfmul.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vfmadd.d $vr0, $vr0, $vr1, $vr3 +; CHECK-NEXT: vfmul.d $vr1, $vr1, $vr4 +; CHECK-NEXT: vfmul.d $vr0, $vr1, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret ; ; FAULT-LA64-LABEL: one_div_sqrt_v2f64: ; FAULT-LA64: # %bb.0: # %entry @@ -73,28 +68,6 @@ define void @one_div_sqrt_v2f64(ptr %res, ptr %a0) nounwind { ; FAULT-LA64-NEXT: vfrsqrt.d $vr0, $vr0 ; FAULT-LA64-NEXT: vst $vr0, $a0, 0 ; FAULT-LA64-NEXT: ret -; -; LA64-LABEL: one_div_sqrt_v2f64: -; LA64: # %bb.0: # %entry -; LA64-NEXT: vld $vr0, $a1, 0 -; LA64-NEXT: vfrsqrte.d $vr1, $vr0 -; LA64-NEXT: vfmul.d $vr1, $vr0, $vr1 -; LA64-NEXT: vfmul.d $vr2, $vr0, $vr1 -; LA64-NEXT: ori $a1, $zero, 0 -; LA64-NEXT: lu32i.d $a1, -524288 -; LA64-NEXT: lu52i.d $a1, $a1, -1024 -; LA64-NEXT: vreplgr2vr.d $vr3, $a1 -; LA64-NEXT: vfmadd.d $vr2, $vr2, $vr1, $vr3 -; LA64-NEXT: lu52i.d $a1, $zero, -1026 -; LA64-NEXT: vreplgr2vr.d $vr4, $a1 -; LA64-NEXT: vfmul.d $vr1, $vr1, $vr4 -; LA64-NEXT: vfmul.d $vr1, $vr1, $vr2 -; LA64-NEXT: vfmul.d $vr0, $vr0, $vr1 -; LA64-NEXT: vfmadd.d $vr0, $vr0, $vr1, $vr3 -; LA64-NEXT: vfmul.d $vr1, $vr1, $vr4 -; LA64-NEXT: vfmul.d $vr0, $vr1, $vr0 -; LA64-NEXT: vst $vr0, $a0, 0 -; LA64-NEXT: ret entry: %v0 = load <2 x double>, ptr %a0, align 16 %sqrt = call fast <2 x double> @llvm.sqrt.v2f64 (<2 x double> %v0) diff --git a/llvm/test/CodeGen/LoongArch/lsx/fsqrt.ll 
b/llvm/test/CodeGen/LoongArch/lsx/fsqrt.ll index d88e0d1ea7c2d..9664808681bb8 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/fsqrt.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/fsqrt.ll @@ -51,9 +51,8 @@ define void @one_div_sqrt_v2f64(ptr %res, ptr %a0) nounwind { ; LA32-LABEL: one_div_sqrt_v2f64: ; LA32: # %bb.0: # %entry ; LA32-NEXT: vld $vr0, $a1, 0 -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0) -; LA32-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI3_0) ; LA32-NEXT: vfsqrt.d $vr0, $vr0 +; LA32-NEXT: vldi $vr1, -912 ; LA32-NEXT: vfdiv.d $vr0, $vr1, $vr0 ; LA32-NEXT: vst $vr0, $a0, 0 ; LA32-NEXT: ret diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcmp.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcmp.ll index 669c53b73b16f..92981211adeb8 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcmp.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-fcmp.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s declare <4 x i32> @llvm.loongarch.lsx.vfcmp.caf.s(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll index 1b7a97d9f9720..324098b918890 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx,+frecipe < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s declare <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll index 3cd6c78e87d78..ad46b47c82c86 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx,+frecipe < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s declare <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-invalid-imm.ll index 667ba32723fc4..2ecbe685ff20b 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-invalid-imm.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-max-invalid-imm.ll @@ -1,3 +1,4 @@ +; RUN: not llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s 2>&1 | FileCheck %s ; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s declare <16 x i8> @llvm.loongarch.lsx.vmaxi.b(<16 x i8>, i32) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-invalid-imm.ll index b73bada4f06fb..f4348f57442e6 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-invalid-imm.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-min-invalid-imm.ll @@ -1,3 +1,4 @@ +; RUN: not llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s 2>&1 | FileCheck %s ; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s declare <16 x i8> @llvm.loongarch.lsx.vmini.b(<16 x i8>, i32) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-d-invalid-imm.ll 
b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-d-invalid-imm.ll new file mode 100644 index 0000000000000..4dc5163e721ce --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-d-invalid-imm.ll @@ -0,0 +1,33 @@ +; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s + +declare i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64>, i32) + +define i64 @lsx_vpickve2gr_d_lo(<2 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lsx.vpickve2gr.d: argument out of range +entry: + %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> %va, i32 -1) + ret i64 %res +} + +define i64 @lsx_vpickve2gr_d_hi(<2 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lsx.vpickve2gr.d: argument out of range +entry: + %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> %va, i32 2) + ret i64 %res +} + +declare i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64>, i32) + +define i64 @lsx_vpickve2gr_du_lo(<2 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lsx.vpickve2gr.du: argument out of range +entry: + %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 -1) + ret i64 %res +} + +define i64 @lsx_vpickve2gr_du_hi(<2 x i64> %va) nounwind { +; CHECK: llvm.loongarch.lsx.vpickve2gr.du: argument out of range +entry: + %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 2) + ret i64 %res +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-d.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-d.ll new file mode 100644 index 0000000000000..78f4e3c1bc18b --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-d.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +declare i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64>, i32) + +define i64 @lsx_vpickve2gr_d(<2 x i64> %va) nounwind { +; CHECK-LABEL: lsx_vpickve2gr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 +; CHECK-NEXT: ret +entry: + %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> %va, i32 1) + ret i64 %res +} + +declare i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64>, i32) + +define i64 @lsx_vpickve2gr_du(<2 x i64> %va) nounwind { +; CHECK-LABEL: lsx_vpickve2gr_du: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpickve2gr.du $a0, $vr0, 1 +; CHECK-NEXT: ret +entry: + %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 1) + ret i64 %res +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-invalid-imm.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-invalid-imm.ll index 3430c54d21941..492b97c8316c1 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-invalid-imm.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr-invalid-imm.ll @@ -1,3 +1,4 @@ +; RUN: not llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s 2>&1 | FileCheck %s ; RUN: not llc --mtriple=loongarch64 --mattr=+lsx < %s 2>&1 | FileCheck %s declare i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8>, i32) @@ -48,22 +49,6 @@ entry: ret i32 %res } -declare i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64>, i32) - -define i64 @lsx_vpickve2gr_d_lo(<2 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lsx.vpickve2gr.d: argument out of range -entry: - %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> %va, i32 -1) - ret i64 %res -} - -define i64 @lsx_vpickve2gr_d_hi(<2 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lsx.vpickve2gr.d: argument out of range -entry: - %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x 
i64> %va, i32 2) - ret i64 %res -} - declare i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8>, i32) define i32 @lsx_vpickve2gr_bu_lo(<16 x i8> %va) nounwind { @@ -111,19 +96,3 @@ entry: %res = call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> %va, i32 4) ret i32 %res } - -declare i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64>, i32) - -define i64 @lsx_vpickve2gr_du_lo(<2 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lsx.vpickve2gr.du: argument out of range -entry: - %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 -1) - ret i64 %res -} - -define i64 @lsx_vpickve2gr_du_hi(<2 x i64> %va) nounwind { -; CHECK: llvm.loongarch.lsx.vpickve2gr.du: argument out of range -entry: - %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 2) - ret i64 %res -} diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr.ll index ed56d30ce3c46..4e77f6b72fed9 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-pickve2gr.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s declare i32 @llvm.loongarch.lsx.vpickve2gr.b(<16 x i8>, i32) @@ -37,18 +38,6 @@ entry: ret i32 %res } -declare i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64>, i32) - -define i64 @lsx_vpickve2gr_d(<2 x i64> %va) nounwind { -; CHECK-LABEL: lsx_vpickve2gr_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 -; CHECK-NEXT: ret -entry: - %res = call i64 @llvm.loongarch.lsx.vpickve2gr.d(<2 x i64> %va, i32 1) - ret i64 %res -} - declare i32 @llvm.loongarch.lsx.vpickve2gr.bu(<16 x i8>, i32) define i32 @lsx_vpickve2gr_bu(<16 x i8> %va) nounwind { @@ -84,15 +73,3 @@ entry: %res = call i32 @llvm.loongarch.lsx.vpickve2gr.wu(<4 x i32> %va, i32 3) ret i32 %res } - -declare i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64>, i32) - -define i64 @lsx_vpickve2gr_du(<2 x i64> %va) nounwind { -; CHECK-LABEL: lsx_vpickve2gr_du: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpickve2gr.du $a0, $vr0, 1 -; CHECK-NEXT: ret -entry: - %res = call i64 @llvm.loongarch.lsx.vpickve2gr.du(<2 x i64> %va, i32 1) - ret i64 %res -} diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-repl-ins-gr2vr-d.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-repl-ins-gr2vr-d.ll new file mode 100644 index 0000000000000..51533e4b2474c --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-repl-ins-gr2vr-d.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define <2 x i64> @vrepl_ins_d(i64 %a, i64 %b) { +; CHECK-LABEL: vrepl_ins_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vreplgr2vr.d $vr0, $a0 +; CHECK-NEXT: vinsgr2vr.d $vr0, $a1, 1 +; CHECK-NEXT: ret +entry: + %0 = call <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64 %a) + %1 = call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> %0, i64 %b, i32 1) + ret <2 x i64> %1 +} + +declare <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64>, i64, i32 immarg) +declare <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-repl-ins-gr2vr.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-repl-ins-gr2vr.ll index aee7492946829..9d7ab6e1ab5ef 100644 --- 
a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-repl-ins-gr2vr.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-repl-ins-gr2vr.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s define <16 x i8> @vrepl_ins_b(i32 %a, i32 %b) { @@ -37,23 +38,9 @@ entry: ret <4 x i32> %1 } -define <2 x i64> @vrepl_ins_d(i64 %a, i64 %b) { -; CHECK-LABEL: vrepl_ins_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vreplgr2vr.d $vr0, $a0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a1, 1 -; CHECK-NEXT: ret -entry: - %0 = call <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64 %a) - %1 = call <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64> %0, i64 %b, i32 1) - ret <2 x i64> %1 -} - declare <16 x i8> @llvm.loongarch.lsx.vinsgr2vr.b(<16 x i8>, i32, i32 immarg) declare <16 x i8> @llvm.loongarch.lsx.vreplgr2vr.b(i32) declare <8 x i16> @llvm.loongarch.lsx.vinsgr2vr.h(<8 x i16>, i32, i32 immarg) declare <8 x i16> @llvm.loongarch.lsx.vreplgr2vr.h(i32) declare <4 x i32> @llvm.loongarch.lsx.vinsgr2vr.w(<4 x i32>, i32, i32 immarg) declare <4 x i32> @llvm.loongarch.lsx.vreplgr2vr.w(i32) -declare <2 x i64> @llvm.loongarch.lsx.vinsgr2vr.d(<2 x i64>, i64, i32 immarg) -declare <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr-d.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr-d.ll new file mode 100644 index 0000000000000..c8d0fce6ed5a2 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr-d.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +declare <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64) + +define <2 x i64> @lsx_vreplgr2vr_d(i64 %a) nounwind { +; CHECK-LABEL: lsx_vreplgr2vr_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vreplgr2vr.d $vr0, $a0 +; CHECK-NEXT: ret +entry: + %res = call <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64 %a) + ret <2 x i64> %res +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr.ll index 091f1c98c2289..edaa20792012d 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-replgr2vr.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s declare <16 x i8> @llvm.loongarch.lsx.vreplgr2vr.b(i32) @@ -36,15 +37,3 @@ entry: %res = call <4 x i32> @llvm.loongarch.lsx.vreplgr2vr.w(i32 %a) ret <4 x i32> %res } - -declare <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64) - -define <2 x i64> @lsx_vreplgr2vr_d(i64 %a) nounwind { -; CHECK-LABEL: lsx_vreplgr2vr_d: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vreplgr2vr.d $vr0, $a0 -; CHECK-NEXT: ret -entry: - %res = call <2 x i64> @llvm.loongarch.lsx.vreplgr2vr.d(i64 %a) - ret <2 x i64> %res -} diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-set.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-set.ll index 3188fb4e2c2ef..004bcde90907a 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-set.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-set.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc 
--mtriple=loongarch32 --mattr=+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s declare i32 @llvm.loongarch.lsx.bz.v(<16 x i8>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setallnez.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setallnez.ll index 22e01922e87bb..6544f91f045a7 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setallnez.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setallnez.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s declare i32 @llvm.loongarch.lsx.bnz.b(<16 x i8>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setanyeqz.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setanyeqz.ll index 96c79c10e4688..5ba3eb788c1d7 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setanyeqz.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-setanyeqz.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s declare i32 @llvm.loongarch.lsx.bz.b(<16 x i8>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/adda.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/adda.ll new file mode 100644 index 0000000000000..34f22e1f6bf45 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/adda.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @vadda_b(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vadda_b: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadda.b $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <16 x i8>, ptr %a + %vb = load <16 x i8>, ptr %b + %conda = icmp slt <16 x i8> %va, zeroinitializer + %nega = sub <16 x i8> zeroinitializer, %va + %absa = select <16 x i1> %conda, <16 x i8> %nega, <16 x i8> %va + %condb = icmp slt <16 x i8> %vb, zeroinitializer + %negb = sub <16 x i8> zeroinitializer, %vb + %absb = select <16 x i1> %condb, <16 x i8> %negb, <16 x i8> %vb + %add = add <16 x i8> %absa, %absb + store <16 x i8> %add, ptr %res + ret void +} + +define void @vadda_h(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vadda_h: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadda.h $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <8 x i16>, ptr %a + %vb = load <8 x i16>, ptr %b + %conda = icmp slt <8 x i16> %va, zeroinitializer + %nega = sub <8 x i16> zeroinitializer, %va + %absa = select <8 x i1> %conda, <8 x i16> %nega, <8 x i16> %va + %condb = icmp slt <8 x i16> %vb, zeroinitializer + %negb = sub <8 x i16> zeroinitializer, %vb + %absb = select <8 x i1> %condb, <8 x i16> %negb, <8 x i16> %vb + %add = add <8 x i16> %absa, %absb + store <8 x i16> %add, ptr %res + ret void +} + +define void @vadda_w(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vadda_w: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadda.w $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, 
$a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <4 x i32>, ptr %a + %vb = load <4 x i32>, ptr %b + %conda = icmp slt <4 x i32> %va, zeroinitializer + %nega = sub <4 x i32> zeroinitializer, %va + %absa = select <4 x i1> %conda, <4 x i32> %nega, <4 x i32> %va + %condb = icmp slt <4 x i32> %vb, zeroinitializer + %negb = sub <4 x i32> zeroinitializer, %vb + %absb = select <4 x i1> %condb, <4 x i32> %negb, <4 x i32> %vb + %add = add <4 x i32> %absa, %absb + store <4 x i32> %add, ptr %res + ret void +} + +define void @vadda_d(ptr %res, ptr %a, ptr %b) nounwind { +; CHECK-LABEL: vadda_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vadda.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %va = load <2 x i64>, ptr %a + %vb = load <2 x i64>, ptr %b + %conda = icmp slt <2 x i64> %va, zeroinitializer + %nega = sub <2 x i64> zeroinitializer, %va + %absa = select <2 x i1> %conda, <2 x i64> %nega, <2 x i64> %va + %condb = icmp slt <2 x i64> %vb, zeroinitializer + %negb = sub <2 x i64> zeroinitializer, %vb + %absb = select <2 x i1> %condb, <2 x i64> %negb, <2 x i64> %vb + %add = add <2 x i64> %absa, %absb + store <2 x i64> %add, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/extractelement.ll index 3fb55d4806160..b17a90e71e85a 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/extractelement.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/extractelement.ll @@ -3,18 +3,11 @@ ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @extract_16xi8(ptr %src, ptr %dst) nounwind { -; LA32-LABEL: extract_16xi8: -; LA32: # %bb.0: -; LA32-NEXT: vld $vr0, $a0, 0 -; LA32-NEXT: vpickve2gr.b $a0, $vr0, 1 -; LA32-NEXT: st.b $a0, $a1, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: extract_16xi8: -; LA64: # %bb.0: -; LA64-NEXT: vld $vr0, $a0, 0 -; LA64-NEXT: vstelm.b $vr0, $a1, 0, 1 -; LA64-NEXT: ret +; CHECK-LABEL: extract_16xi8: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 1 +; CHECK-NEXT: ret %v = load volatile <16 x i8>, ptr %src %e = extractelement <16 x i8> %v, i32 1 store i8 %e, ptr %dst @@ -22,18 +15,11 @@ define void @extract_16xi8(ptr %src, ptr %dst) nounwind { } define void @extract_8xi16(ptr %src, ptr %dst) nounwind { -; LA32-LABEL: extract_8xi16: -; LA32: # %bb.0: -; LA32-NEXT: vld $vr0, $a0, 0 -; LA32-NEXT: vpickve2gr.h $a0, $vr0, 1 -; LA32-NEXT: st.h $a0, $a1, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: extract_8xi16: -; LA64: # %bb.0: -; LA64-NEXT: vld $vr0, $a0, 0 -; LA64-NEXT: vstelm.h $vr0, $a1, 0, 1 -; LA64-NEXT: ret +; CHECK-LABEL: extract_8xi16: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a0, 0 +; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 1 +; CHECK-NEXT: ret %v = load volatile <8 x i16>, ptr %src %e = extractelement <8 x i16> %v, i32 1 store i16 %e, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll index 603bd21ab9af9..fb0b9cee67df5 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll @@ -53,8 +53,7 @@ define void @one_fdiv_v2f64(ptr %res, ptr %a0) nounwind { ; LA32-LABEL: one_fdiv_v2f64: ; LA32: # %bb.0: # %entry ; LA32-NEXT: vld $vr0, $a1, 0 -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0) -; LA32-NEXT: vld $vr1, $a1, %pc_lo12(.LCPI3_0) +; LA32-NEXT: vldi $vr1, 
-912 ; LA32-NEXT: vfdiv.d $vr0, $vr1, $vr0 ; LA32-NEXT: vst $vr0, $a0, 0 ; LA32-NEXT: ret diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll index 4bb1941724dc6..496a1aed39fb5 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @insert_16xi8(ptr %src, ptr %dst, i8 %ins) nounwind { ; CHECK-LABEL: insert_16xi8: @@ -41,12 +42,20 @@ define void @insert_4xi32(ptr %src, ptr %dst, i32 %ins) nounwind { } define void @insert_2xi64(ptr %src, ptr %dst, i64 %ins) nounwind { -; CHECK-LABEL: insert_2xi64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a2, 1 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_2xi64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 2 +; LA32-NEXT: vinsgr2vr.w $vr0, $a3, 3 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_2xi64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a2, 1 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <2 x i64>, ptr %src %v_new = insertelement <2 x i64> %v, i64 %ins, i32 1 store <2 x i64> %v_new, ptr %dst @@ -82,18 +91,30 @@ define void @insert_2xdouble(ptr %src, ptr %dst, double %ins) nounwind { } define void @insert_16xi8_idx(ptr %src, ptr %dst, i8 %ins, i32 %idx) nounwind { -; CHECK-LABEL: insert_16xi8_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, %pc_hi20(.LCPI6_0) -; CHECK-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI6_0) -; CHECK-NEXT: vld $vr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; CHECK-NEXT: vreplgr2vr.b $vr2, $a0 -; CHECK-NEXT: vseq.b $vr0, $vr2, $vr0 -; CHECK-NEXT: vreplgr2vr.b $vr2, $a2 -; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_16xi8_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a4, %pc_hi20(.LCPI6_0) +; LA32-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI6_0) +; LA32-NEXT: vld $vr1, $a0, 0 +; LA32-NEXT: vreplgr2vr.b $vr2, $a3 +; LA32-NEXT: vseq.b $vr0, $vr2, $vr0 +; LA32-NEXT: vreplgr2vr.b $vr2, $a2 +; LA32-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_16xi8_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI6_0) +; LA64-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI6_0) +; LA64-NEXT: vld $vr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: vreplgr2vr.b $vr2, $a0 +; LA64-NEXT: vseq.b $vr0, $vr2, $vr0 +; LA64-NEXT: vreplgr2vr.b $vr2, $a2 +; LA64-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <16 x i8>, ptr %src %v_new = insertelement <16 x i8> %v, i8 %ins, i32 %idx store <16 x i8> %v_new, ptr %dst @@ -101,18 +122,30 @@ define void @insert_16xi8_idx(ptr %src, ptr %dst, i8 %ins, i32 %idx) nounwind { } define void @insert_8xi16_idx(ptr %src, ptr %dst, i16 %ins, i32 %idx) nounwind { -; CHECK-LABEL: insert_8xi16_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, 
%pc_hi20(.LCPI7_0) -; CHECK-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI7_0) -; CHECK-NEXT: vld $vr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; CHECK-NEXT: vreplgr2vr.h $vr2, $a0 -; CHECK-NEXT: vseq.h $vr0, $vr2, $vr0 -; CHECK-NEXT: vreplgr2vr.h $vr2, $a2 -; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_8xi16_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a4, %pc_hi20(.LCPI7_0) +; LA32-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI7_0) +; LA32-NEXT: vld $vr1, $a0, 0 +; LA32-NEXT: vreplgr2vr.h $vr2, $a3 +; LA32-NEXT: vseq.h $vr0, $vr2, $vr0 +; LA32-NEXT: vreplgr2vr.h $vr2, $a2 +; LA32-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_8xi16_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI7_0) +; LA64-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI7_0) +; LA64-NEXT: vld $vr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: vreplgr2vr.h $vr2, $a0 +; LA64-NEXT: vseq.h $vr0, $vr2, $vr0 +; LA64-NEXT: vreplgr2vr.h $vr2, $a2 +; LA64-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <8 x i16>, ptr %src %v_new = insertelement <8 x i16> %v, i16 %ins, i32 %idx store <8 x i16> %v_new, ptr %dst @@ -120,18 +153,30 @@ define void @insert_8xi16_idx(ptr %src, ptr %dst, i16 %ins, i32 %idx) nounwind { } define void @insert_4xi32_idx(ptr %src, ptr %dst, i32 %ins, i32 %idx) nounwind { -; CHECK-LABEL: insert_4xi32_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, %pc_hi20(.LCPI8_0) -; CHECK-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI8_0) -; CHECK-NEXT: vld $vr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; CHECK-NEXT: vreplgr2vr.w $vr2, $a0 -; CHECK-NEXT: vseq.w $vr0, $vr2, $vr0 -; CHECK-NEXT: vreplgr2vr.w $vr2, $a2 -; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_4xi32_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a4, %pc_hi20(.LCPI8_0) +; LA32-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI8_0) +; LA32-NEXT: vld $vr1, $a0, 0 +; LA32-NEXT: vreplgr2vr.w $vr2, $a3 +; LA32-NEXT: vseq.w $vr0, $vr2, $vr0 +; LA32-NEXT: vreplgr2vr.w $vr2, $a2 +; LA32-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_4xi32_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI8_0) +; LA64-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI8_0) +; LA64-NEXT: vld $vr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: vreplgr2vr.w $vr2, $a0 +; LA64-NEXT: vseq.w $vr0, $vr2, $vr0 +; LA64-NEXT: vreplgr2vr.w $vr2, $a2 +; LA64-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <4 x i32>, ptr %src %v_new = insertelement <4 x i32> %v, i32 %ins, i32 %idx store <4 x i32> %v_new, ptr %dst @@ -139,18 +184,36 @@ define void @insert_4xi32_idx(ptr %src, ptr %dst, i32 %ins, i32 %idx) nounwind { } define void @insert_2xi64_idx(ptr %src, ptr %dst, i64 %ins, i32 %idx) nounwind { -; CHECK-LABEL: insert_2xi64_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a4, %pc_hi20(.LCPI9_0) -; CHECK-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI9_0) -; CHECK-NEXT: vld $vr1, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a3, 31, 0 -; CHECK-NEXT: vreplgr2vr.d $vr2, $a0 -; CHECK-NEXT: vseq.d $vr0, $vr2, $vr0 -; CHECK-NEXT: vreplgr2vr.d $vr2, $a2 -; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_2xi64_idx: +; LA32: # %bb.0: +; 
LA32-NEXT: pcalau12i $a5, %pc_hi20(.LCPI9_0) +; LA32-NEXT: vld $vr0, $a5, %pc_lo12(.LCPI9_0) +; LA32-NEXT: add.w $a4, $a4, $a4 +; LA32-NEXT: vld $vr1, $a0, 0 +; LA32-NEXT: vreplgr2vr.w $vr2, $a4 +; LA32-NEXT: vseq.w $vr2, $vr2, $vr0 +; LA32-NEXT: vreplgr2vr.w $vr3, $a2 +; LA32-NEXT: vbitsel.v $vr1, $vr1, $vr3, $vr2 +; LA32-NEXT: addi.w $a0, $a4, 1 +; LA32-NEXT: vreplgr2vr.w $vr2, $a0 +; LA32-NEXT: vseq.w $vr0, $vr2, $vr0 +; LA32-NEXT: vreplgr2vr.w $vr2, $a3 +; LA32-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_2xi64_idx: +; LA64: # %bb.0: +; LA64-NEXT: pcalau12i $a4, %pc_hi20(.LCPI9_0) +; LA64-NEXT: vld $vr0, $a4, %pc_lo12(.LCPI9_0) +; LA64-NEXT: vld $vr1, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a3, 31, 0 +; LA64-NEXT: vreplgr2vr.d $vr2, $a0 +; LA64-NEXT: vseq.d $vr0, $vr2, $vr0 +; LA64-NEXT: vreplgr2vr.d $vr2, $a2 +; LA64-NEXT: vbitsel.v $vr0, $vr1, $vr2, $vr0 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <2 x i64>, ptr %src %v_new = insertelement <2 x i64> %v, i64 %ins, i32 %idx store <2 x i64> %v_new, ptr %dst @@ -158,19 +221,32 @@ define void @insert_2xi64_idx(ptr %src, ptr %dst, i64 %ins, i32 %idx) nounwind { } define void @insert_4xfloat_idx(ptr %src, ptr %dst, float %ins, i32 %idx) nounwind { -; CHECK-LABEL: insert_4xfloat_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $f0 killed $f0 def $vr0 -; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI10_0) -; CHECK-NEXT: vld $vr1, $a3, %pc_lo12(.LCPI10_0) -; CHECK-NEXT: vld $vr2, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0 -; CHECK-NEXT: vreplgr2vr.w $vr3, $a0 -; CHECK-NEXT: vseq.w $vr1, $vr3, $vr1 -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 -; CHECK-NEXT: vbitsel.v $vr0, $vr2, $vr0, $vr1 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: insert_4xfloat_idx: +; LA32: # %bb.0: +; LA32-NEXT: pcalau12i $a3, %pc_hi20(.LCPI10_0) +; LA32-NEXT: vld $vr1, $a3, %pc_lo12(.LCPI10_0) +; LA32-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA32-NEXT: vld $vr2, $a0, 0 +; LA32-NEXT: vreplgr2vr.w $vr3, $a2 +; LA32-NEXT: vseq.w $vr1, $vr3, $vr1 +; LA32-NEXT: vreplvei.w $vr0, $vr0, 0 +; LA32-NEXT: vbitsel.v $vr0, $vr2, $vr0, $vr1 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_4xfloat_idx: +; LA64: # %bb.0: +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: pcalau12i $a3, %pc_hi20(.LCPI10_0) +; LA64-NEXT: vld $vr1, $a3, %pc_lo12(.LCPI10_0) +; LA64-NEXT: vld $vr2, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a2, 31, 0 +; LA64-NEXT: vreplgr2vr.w $vr3, $a0 +; LA64-NEXT: vseq.w $vr1, $vr3, $vr1 +; LA64-NEXT: vreplvei.w $vr0, $vr0, 0 +; LA64-NEXT: vbitsel.v $vr0, $vr2, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <4 x float>, ptr %src %v_new = insertelement <4 x float> %v, float %ins, i32 %idx store <4 x float> %v_new, ptr %dst @@ -178,19 +254,34 @@ define void @insert_4xfloat_idx(ptr %src, ptr %dst, float %ins, i32 %idx) nounwi } define void @insert_2xdouble_idx(ptr %src, ptr %dst, double %ins, i32 %idx) nounwind { -; CHECK-LABEL: insert_2xdouble_idx: -; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 -; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI11_0) -; CHECK-NEXT: vld $vr1, $a3, %pc_lo12(.LCPI11_0) -; CHECK-NEXT: vld $vr2, $a0, 0 -; CHECK-NEXT: bstrpick.d $a0, $a2, 31, 0 -; CHECK-NEXT: vreplgr2vr.d $vr3, $a0 -; CHECK-NEXT: vseq.d $vr1, $vr3, $vr1 -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT: vbitsel.v $vr0, $vr2, $vr0, $vr1 -; CHECK-NEXT: vst $vr0, $a1, 0 -; 
CHECK-NEXT: ret +; LA32-LABEL: insert_2xdouble_idx: +; LA32: # %bb.0: +; LA32-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 +; LA32-NEXT: vld $vr1, $a0, 0 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0) +; LA32-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI11_0) +; LA32-NEXT: vrepli.b $vr3, 0 +; LA32-NEXT: vinsgr2vr.w $vr3, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr3, $a2, 2 +; LA32-NEXT: vseq.d $vr2, $vr3, $vr2 +; LA32-NEXT: vreplvei.d $vr0, $vr0, 0 +; LA32-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: insert_2xdouble_idx: +; LA64: # %bb.0: +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 def $vr0 +; LA64-NEXT: pcalau12i $a3, %pc_hi20(.LCPI11_0) +; LA64-NEXT: vld $vr1, $a3, %pc_lo12(.LCPI11_0) +; LA64-NEXT: vld $vr2, $a0, 0 +; LA64-NEXT: bstrpick.d $a0, $a2, 31, 0 +; LA64-NEXT: vreplgr2vr.d $vr3, $a0 +; LA64-NEXT: vseq.d $vr1, $vr3, $vr1 +; LA64-NEXT: vreplvei.d $vr0, $vr0, 0 +; LA64-NEXT: vbitsel.v $vr0, $vr2, $vr0, $vr1 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %v = load volatile <2 x double>, ptr %src %v_new = insertelement <2 x double> %v, double %ins, i32 %idx store <2 x double> %v_new, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll index 10510786f3216..40961bc9a08b9 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vreplvei.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s ;; vreplvei.b diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll index d1c071b45ddff..b13433ee5d159 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s define <16 x i8> @shufflevector_v16i8(<16 x i8> %a, <16 x i8> %b) { diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll index cd80dcb44e433..bee4ba6a84334 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shuffle-as-vshuf4i.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s ;; vshuf4i.b diff --git a/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll b/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll new file mode 100644 index 0000000000000..b651f11596c82 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/scalarize-fp.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 -mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 -mattr=+lsx < %s | 
FileCheck %s + +define <4 x float> @fadd_elt0_v4f32(float %a) nounwind { +; CHECK-LABEL: fadd_elt0_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr1, -1168 +; CHECK-NEXT: fadd.s $fa0, $fa0, $fa1 +; CHECK-NEXT: ret +entry: + %b = insertelement <4 x float> poison, float %a, i32 0 + %c = fadd <4 x float> %b, + ret <4 x float> %c +} + +define <2 x double> @fadd_elt0_v2f64(double %a) nounwind { +; CHECK-LABEL: fadd_elt0_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr1, -912 +; CHECK-NEXT: fadd.d $fa0, $fa0, $fa1 +; CHECK-NEXT: ret +entry: + %b = insertelement <2 x double> poison, double %a, i32 0 + %c = fadd <2 x double> %b, + ret <2 x double> %c +} + +define <4 x float> @fsub_splat_v4f32(float %b) nounwind { +; CHECK-LABEL: fsub_splat_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr1, -1168 +; CHECK-NEXT: fsub.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 0 +; CHECK-NEXT: ret +entry: + %insa = insertelement <4 x float> poison, float 1.0, i32 0 + %insb = insertelement <4 x float> poison, float %b, i32 0 + %va = shufflevector <4 x float> %insa, <4 x float> poison, <4 x i32> zeroinitializer + %vb = shufflevector <4 x float> %insb, <4 x float> poison, <4 x i32> zeroinitializer + %c = fsub <4 x float> %va, %vb + ret <4 x float> %c +} + +define <2 x double> @fsub_splat_v2f64(double %a, double %b) nounwind { +; CHECK-LABEL: fsub_splat_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fsub.d $fa0, $fa0, $fa1 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: ret +entry: + %insa = insertelement <2 x double> poison, double %a, i32 0 + %insb = insertelement <2 x double> poison, double %b, i32 0 + %va = shufflevector <2 x double> %insa, <2 x double> poison, <2 x i32> zeroinitializer + %vb = shufflevector <2 x double> %insb, <2 x double> poison, <2 x i32> zeroinitializer + %c = fsub <2 x double> %va, %vb + ret <2 x double> %c +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-add.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-add.ll index 57fd09ed2e09b..9c3a6f7be0542 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-add.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-add.ll @@ -1,17 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefix=LA64 define void @vec_reduce_add_v16i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v16i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: st.b $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v16i8: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.b $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v16i8: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; 
LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.b $a0, $a1, 0 +; LA64-NEXT: ret %v = load <16 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v) store i8 %res, ptr %dst @@ -19,16 +31,29 @@ define void @vec_reduce_add_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 0 -; CHECK-NEXT: st.b $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.b $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.b $a0, $a1, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -36,15 +61,25 @@ define void @vec_reduce_add_v8i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v4i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.w $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 0 -; CHECK-NEXT: st.b $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v4i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; LA32-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.b $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v4i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.w $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; LA64-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.b $a0, $a1, 0 +; LA64-NEXT: ret %v = load <4 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %v) store i8 %res, ptr %dst @@ -52,13 +87,23 @@ define void @vec_reduce_add_v4i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v2i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v2i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.h $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v2i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.h $a0, $a0, 0 +; LA32-NEXT: vinsgr2vr.h $vr0, $a0, 0 +; LA32-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.b $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v2i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.h $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.h $vr0, $a0, 0 +; LA64-NEXT: vhaddw.h.b $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d 
$a0, $vr0, 0 +; LA64-NEXT: st.b $a0, $a1, 0 +; LA64-NEXT: ret %v = load <2 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %v) store i8 %res, ptr %dst @@ -66,15 +111,25 @@ define void @vec_reduce_add_v2i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v8i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: st.h $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v8i16: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.h $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v8i16: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.h $a0, $a1, 0 +; LA64-NEXT: ret %v = load <8 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v) store i16 %res, ptr %dst @@ -82,15 +137,27 @@ define void @vec_reduce_add_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v4i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 0 -; CHECK-NEXT: st.h $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.h $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.h $a0, $a1, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -98,13 +165,23 @@ define void @vec_reduce_add_v4i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v2i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.w $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v2i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; LA32-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.h $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v2i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.w $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; LA64-NEXT: vhaddw.w.h $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.h $a0, $a1, 0 +; LA64-NEXT: ret %v = load <2 x i16>, ptr %src %res = call i16 
@llvm.vector.reduce.add.v2i16(<2 x i16> %v) store i16 %res, ptr %dst @@ -112,14 +189,23 @@ define void @vec_reduce_add_v2i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 -; CHECK-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0 -; CHECK-NEXT: st.w $a0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA32-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA64-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.w $a0, $a1, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -127,13 +213,25 @@ define void @vec_reduce_add_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vhaddw.d.w $vr0, $vr0, $vr0 +; LA64-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-NEXT: st.w $a0, $a1, 0 +; LA64-NEXT: ret %v = load <2 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ -141,12 +239,27 @@ define void @vec_reduce_add_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_add_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_add_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_add_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 1 +; LA32-NEXT: add.w $a3, $a4, $a3 +; LA32-NEXT: add.w $a0, $a2, $a0 +; LA32-NEXT: sltu $a2, $a0, $a2 +; LA32-NEXT: add.w $a2, $a3, $a2 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_add_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vhaddw.q.d $vr0, $vr0, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-and.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-and.ll index cca4ce30758f1..734ecba843a4e 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-and.ll +++ 
b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-and.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_and_v16i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_and_v16i8: @@ -22,18 +23,33 @@ define void @vec_reduce_and_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_and_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_and_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_and_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_and_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -91,16 +107,29 @@ define void @vec_reduce_and_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_and_v4i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_and_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_and_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_and_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -123,15 +152,26 @@ define void @vec_reduce_and_v2i16(ptr %src, ptr %dst) nounwind { } define void 
@vec_reduce_and_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_and_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_and_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_and_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -139,14 +179,27 @@ define void @vec_reduce_and_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_and_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_and_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_and_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vori.b $vr1, $vr0, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr1, 4 +; LA32-NEXT: vand.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_and_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ -154,13 +207,26 @@ define void @vec_reduce_and_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_and_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_and_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vand.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_and_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 0 +; LA32-NEXT: and $a3, $a4, $a3 +; LA32-NEXT: and $a0, $a2, $a0 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: st.w $a3, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_and_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vand.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-or.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-or.ll index ce431f0cf6a74..e833930830c3f 100644 --- 
a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-or.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-or.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_or_v16i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_or_v16i8: @@ -22,18 +23,33 @@ define void @vec_reduce_or_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_or_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_or_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_or_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_or_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -91,16 +107,29 @@ define void @vec_reduce_or_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_or_v4i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_or_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_or_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_or_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -123,15 +152,26 @@ define void @vec_reduce_or_v2i16(ptr %src, ptr %dst) nounwind { } 
define void @vec_reduce_or_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_or_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_or_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_or_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -139,14 +179,27 @@ define void @vec_reduce_or_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_or_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_or_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_or_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vori.b $vr1, $vr0, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr1, 4 +; LA32-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_or_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ -154,13 +207,26 @@ define void @vec_reduce_or_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_or_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_or_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_or_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 0 +; LA32-NEXT: or $a3, $a4, $a3 +; LA32-NEXT: or $a0, $a2, $a0 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: st.w $a3, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_or_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smax.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smax.ll index bdf153ad7794f..2220df68cddfd 100644 --- 
a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smax.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smax.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_smax_v16i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_smax_v16i8: @@ -22,18 +23,33 @@ define void @vec_reduce_smax_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smax_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smax_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.b $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmax.b $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1 -; CHECK-NEXT: vmax.b $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smax_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.b $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmax.b $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA32-NEXT: vmax.b $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smax_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.b $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmax.b $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA64-NEXT: vmax.b $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -91,16 +107,29 @@ define void @vec_reduce_smax_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smax_v4i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smax_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.h $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmax.h $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smax_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.h $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmax.h $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smax_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.h $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmax.h $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -123,15 +152,26 @@ define void 
@vec_reduce_smax_v2i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smax_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smax_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smax_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smax_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -139,14 +179,26 @@ define void @vec_reduce_smax_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smax_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smax_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smax_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smax_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.w $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ -154,13 +206,37 @@ define void @vec_reduce_smax_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smax_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smax_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.d $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smax_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 3 +; LA32-NEXT: slt $a5, $a4, $a3 +; LA32-NEXT: xor $a6, $a3, $a4 +; LA32-NEXT: sltui $a6, $a6, 1 +; LA32-NEXT: masknez $a5, $a5, $a6 +; LA32-NEXT: sltu $a7, $a2, $a0 +; LA32-NEXT: maskeqz $a6, $a7, $a6 +; LA32-NEXT: or $a5, $a6, $a5 +; LA32-NEXT: masknez $a2, $a2, $a5 +; LA32-NEXT: maskeqz $a0, $a0, $a5 +; LA32-NEXT: or $a0, $a0, $a2 +; LA32-NEXT: masknez $a2, $a4, $a5 +; LA32-NEXT: maskeqz $a3, $a3, $a5 +; LA32-NEXT: or $a2, $a3, $a2 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smax_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; 
LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmax.d $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smin.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smin.ll index e3b3c5e6f2410..50d76a3872e1e 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smin.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-smin.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_smin_v16i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_smin_v16i8: @@ -22,18 +23,33 @@ define void @vec_reduce_smin_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smin_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smin_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.b $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmin.b $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1 -; CHECK-NEXT: vmin.b $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smin_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.b $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmin.b $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA32-NEXT: vmin.b $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smin_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.b $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmin.b $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA64-NEXT: vmin.b $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -91,16 +107,29 @@ define void @vec_reduce_smin_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smin_v4i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smin_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.h $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmin.h $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smin_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.h $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmin.h $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smin_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 
0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.h $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmin.h $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -123,15 +152,26 @@ define void @vec_reduce_smin_v2i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smin_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smin_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smin_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smin_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -139,14 +179,26 @@ define void @vec_reduce_smin_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smin_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smin_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.w $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smin_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smin_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.w $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ -154,13 +206,37 @@ define void @vec_reduce_smin_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_smin_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_smin_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.d $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_smin_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 1 +; LA32-NEXT: slt $a5, $a4, $a3 +; LA32-NEXT: xor $a6, $a4, $a3 +; LA32-NEXT: sltui $a6, $a6, 1 +; LA32-NEXT: masknez $a5, $a5, $a6 +; LA32-NEXT: sltu $a7, $a2, $a0 +; LA32-NEXT: maskeqz $a6, $a7, $a6 +; LA32-NEXT: 
or $a5, $a6, $a5 +; LA32-NEXT: masknez $a0, $a0, $a5 +; LA32-NEXT: maskeqz $a2, $a2, $a5 +; LA32-NEXT: or $a0, $a2, $a0 +; LA32-NEXT: masknez $a2, $a3, $a5 +; LA32-NEXT: maskeqz $a3, $a4, $a5 +; LA32-NEXT: or $a2, $a3, $a2 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_smin_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.d $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umax.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umax.ll index fff2304befd68..88146c78a969d 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umax.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umax.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_umax_v16i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_umax_v16i8: @@ -22,18 +23,33 @@ define void @vec_reduce_umax_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umax_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umax_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.bu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmax.bu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1 -; CHECK-NEXT: vmax.bu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umax_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.bu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmax.bu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA32-NEXT: vmax.bu $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umax_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.bu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmax.bu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA64-NEXT: vmax.bu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -91,16 +107,29 @@ define void @vec_reduce_umax_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umax_v4i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umax_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.hu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmax.hu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umax_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, 
$a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.hu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmax.hu $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umax_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.hu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmax.hu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -123,15 +152,26 @@ define void @vec_reduce_umax_v2i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umax_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umax_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umax_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umax_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -139,14 +179,26 @@ define void @vec_reduce_umax_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umax_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umax_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmax.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umax_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umax_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmax.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ -154,13 +206,37 @@ define void @vec_reduce_umax_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umax_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umax_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmax.du $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umax_v2i64: +; 
LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 3 +; LA32-NEXT: sltu $a5, $a4, $a3 +; LA32-NEXT: xor $a6, $a3, $a4 +; LA32-NEXT: sltui $a6, $a6, 1 +; LA32-NEXT: masknez $a5, $a5, $a6 +; LA32-NEXT: sltu $a7, $a2, $a0 +; LA32-NEXT: maskeqz $a6, $a7, $a6 +; LA32-NEXT: or $a5, $a6, $a5 +; LA32-NEXT: masknez $a2, $a2, $a5 +; LA32-NEXT: maskeqz $a0, $a0, $a5 +; LA32-NEXT: or $a0, $a0, $a2 +; LA32-NEXT: masknez $a2, $a4, $a5 +; LA32-NEXT: maskeqz $a3, $a3, $a5 +; LA32-NEXT: or $a2, $a3, $a2 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umax_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmax.du $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umin.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umin.ll index e14a294cbcfb6..e9d4b4aab6f91 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umin.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-umin.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_umin_v16i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_umin_v16i8: @@ -22,18 +23,33 @@ define void @vec_reduce_umin_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umin_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umin_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.bu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmin.bu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1 -; CHECK-NEXT: vmin.bu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umin_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.bu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmin.bu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA32-NEXT: vmin.bu $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umin_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.bu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmin.bu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA64-NEXT: vmin.bu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -91,16 +107,29 @@ define void @vec_reduce_umin_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umin_v4i16(ptr %src, ptr %dst) nounwind { -; 
CHECK-LABEL: vec_reduce_umin_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.hu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vmin.hu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umin_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.hu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vmin.hu $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umin_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.hu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vmin.hu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -123,15 +152,26 @@ define void @vec_reduce_umin_v2i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umin_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umin_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umin_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umin_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -139,14 +179,26 @@ define void @vec_reduce_umin_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umin_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umin_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vmin.wu $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umin_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umin_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vmin.wu $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ 
-154,13 +206,37 @@ define void @vec_reduce_umin_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_umin_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_umin_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vmin.du $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_umin_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 0 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 1 +; LA32-NEXT: sltu $a5, $a4, $a3 +; LA32-NEXT: xor $a6, $a4, $a3 +; LA32-NEXT: sltui $a6, $a6, 1 +; LA32-NEXT: masknez $a5, $a5, $a6 +; LA32-NEXT: sltu $a7, $a2, $a0 +; LA32-NEXT: maskeqz $a6, $a7, $a6 +; LA32-NEXT: or $a5, $a6, $a5 +; LA32-NEXT: masknez $a0, $a0, $a5 +; LA32-NEXT: maskeqz $a2, $a2, $a5 +; LA32-NEXT: or $a0, $a2, $a0 +; LA32-NEXT: masknez $a2, $a3, $a5 +; LA32-NEXT: maskeqz $a3, $a4, $a5 +; LA32-NEXT: or $a2, $a3, $a2 +; LA32-NEXT: st.w $a2, $a1, 4 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_umin_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vmin.du $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-xor.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-xor.ll index ae2bb8f91de05..ed965e9e10ee7 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-xor.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-reduce-xor.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @vec_reduce_xor_v16i8(ptr %src, ptr %dst) nounwind { ; CHECK-LABEL: vec_reduce_xor_v16i8: @@ -22,18 +23,33 @@ define void @vec_reduce_xor_v16i8(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_xor_v8i8(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_xor_v8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_xor_v8i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_xor_v8i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 
+; LA64-NEXT: vbsrl.v $vr1, $vr0, 1 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.b $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <8 x i8>, ptr %src %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %v) store i8 %res, ptr %dst @@ -91,16 +107,29 @@ define void @vec_reduce_xor_v8i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_xor_v4i16(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_xor_v4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_xor_v4i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_xor_v4i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 2 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i16>, ptr %src %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %v) store i16 %res, ptr %dst @@ -123,15 +152,26 @@ define void @vec_reduce_xor_v2i16(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_xor_v4i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_xor_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_xor_v4i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_xor_v4i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <4 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %v) store i32 %res, ptr %dst @@ -139,14 +179,27 @@ define void @vec_reduce_xor_v4i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_xor_v2i32(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_xor_v2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_xor_v2i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vori.b $vr1, $vr0, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: vbsrl.v $vr1, $vr1, 4 +; LA32-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, 
$a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_xor_v2i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i32>, ptr %src %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %v) store i32 %res, ptr %dst @@ -154,13 +207,26 @@ define void @vec_reduce_xor_v2i32(ptr %src, ptr %dst) nounwind { } define void @vec_reduce_xor_v2i64(ptr %src, ptr %dst) nounwind { -; CHECK-LABEL: vec_reduce_xor_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8 -; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: vec_reduce_xor_v2i64: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 3 +; LA32-NEXT: vpickve2gr.w $a2, $vr0, 1 +; LA32-NEXT: vpickve2gr.w $a3, $vr0, 2 +; LA32-NEXT: vpickve2gr.w $a4, $vr0, 0 +; LA32-NEXT: xor $a3, $a4, $a3 +; LA32-NEXT: xor $a0, $a2, $a0 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: st.w $a3, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: vec_reduce_xor_v2i64: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-NEXT: vxor.v $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %v = load <2 x i64>, ptr %src %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %v) store i64 %res, ptr %dst diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll index 9485df746ff1c..dce6dc9f2aa37 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-sext.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s - +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefixes=CHECK,LA64 define void @load_sext_2i8_to_2i64(ptr %ptr, ptr %dst) { ; CHECK-LABEL: load_sext_2i8_to_2i64: @@ -40,15 +40,27 @@ entry: } define void @load_sext_8i8_to_8i16(ptr %ptr, ptr %dst) { -; CHECK-LABEL: load_sext_8i8_to_8i16: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vslli.h $vr0, $vr0, 8 -; CHECK-NEXT: vsrai.h $vr0, $vr0, 8 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_sext_8i8_to_8i16: +; LA32: # %bb.0: # %entry +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vilvl.b $vr0, $vr0, $vr0 +; LA32-NEXT: vslli.h $vr0, $vr0, 8 +; LA32-NEXT: vsrai.h $vr0, $vr0, 8 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_sext_8i8_to_8i16: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vilvl.b $vr0, $vr0, $vr0 +; LA64-NEXT: vslli.h $vr0, $vr0, 8 +; LA64-NEXT: vsrai.h $vr0, $vr0, 8 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret entry: %A = load <8 x i8>, ptr %ptr %B = sext <8 x i8> %A to <8 x i16> @@ -75,15 +87,27 @@ entry: } define void @load_sext_4i16_to_4i32(ptr %ptr, ptr %dst) { -; CHECK-LABEL: load_sext_4i16_to_4i32: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: ld.d $a0, $a0, 0 -; 
CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vslli.w $vr0, $vr0, 16 -; CHECK-NEXT: vsrai.w $vr0, $vr0, 16 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_sext_4i16_to_4i32: +; LA32: # %bb.0: # %entry +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vilvl.h $vr0, $vr0, $vr0 +; LA32-NEXT: vslli.w $vr0, $vr0, 16 +; LA32-NEXT: vsrai.w $vr0, $vr0, 16 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_sext_4i16_to_4i32: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vilvl.h $vr0, $vr0, $vr0 +; LA64-NEXT: vslli.w $vr0, $vr0, 16 +; LA64-NEXT: vsrai.w $vr0, $vr0, 16 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret entry: %A = load <4 x i16>, ptr %ptr %B = sext <4 x i16> %A to <4 x i32> @@ -92,15 +116,26 @@ entry: } define void @load_sext_2i32_to_2i64(ptr %ptr, ptr %dst) { -; CHECK-LABEL: load_sext_2i32_to_2i64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 16 -; CHECK-NEXT: vslli.d $vr0, $vr0, 32 -; CHECK-NEXT: vsrai.d $vr0, $vr0, 32 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_sext_2i32_to_2i64: +; LA32: # %bb.0: # %entry +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 2 +; LA32-NEXT: vslli.d $vr0, $vr0, 32 +; LA32-NEXT: vsrai.d $vr0, $vr0, 32 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_sext_2i32_to_2i64: +; LA64: # %bb.0: # %entry +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vshuf4i.w $vr0, $vr0, 16 +; LA64-NEXT: vslli.d $vr0, $vr0, 32 +; LA64-NEXT: vsrai.d $vr0, $vr0, 32 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret entry: %A = load <2 x i32>, ptr %ptr %B = sext <2 x i32> %A to <2 x i64> diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll index 9b1b584bd9c76..bb008ee5eb903 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-any-ext.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64 define void @shuffle_any_ext_2i8_to_2i64(ptr %ptr, ptr %dst) nounwind { ; CHECK-LABEL: shuffle_any_ext_2i8_to_2i64: @@ -35,13 +36,22 @@ define void @shuffle_any_ext_2i16_to_2i64(ptr %ptr, ptr %dst) nounwind { } define void @shuffle_any_ext_2i32_to_2i64(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: shuffle_any_ext_2i32_to_2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 16 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: shuffle_any_ext_2i32_to_2i64: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 2 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: shuffle_any_ext_2i32_to_2i64: +; LA64: # %bb.0: +; LA64-NEXT: 
ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vshuf4i.w $vr0, $vr0, 16 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %x = load <2 x i32>, ptr %ptr %y = shufflevector <2 x i32> %x, <2 x i32> poison, <4 x i32> %r = bitcast <4 x i32> %y to <2 x i64> @@ -66,13 +76,23 @@ define void @shuffle_any_ext_4i8_to_4i32(ptr %ptr, ptr %dst) nounwind { } define void @shuffle_any_ext_4i16_to_4i32(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: shuffle_any_ext_4i16_to_4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vilvl.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: shuffle_any_ext_4i16_to_4i32: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vilvl.h $vr0, $vr0, $vr0 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: shuffle_any_ext_4i16_to_4i32: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vilvl.h $vr0, $vr0, $vr0 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %x = load <4 x i16>, ptr %ptr %y = shufflevector <4 x i16> %x, <4 x i16> poison, <8 x i32> %r = bitcast <8 x i16> %y to <4 x i32> @@ -81,13 +101,23 @@ define void @shuffle_any_ext_4i16_to_4i32(ptr %ptr, ptr %dst) nounwind { } define void @shuffle_any_ext_8i8_to_8i16(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: shuffle_any_ext_8i8_to_8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vilvl.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vst $vr0, $a1, 0 -; CHECK-NEXT: ret +; LA32-LABEL: shuffle_any_ext_8i8_to_8i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vilvl.b $vr0, $vr0, $vr0 +; LA32-NEXT: vst $vr0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: shuffle_any_ext_8i8_to_8i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vilvl.b $vr0, $vr0, $vr0 +; LA64-NEXT: vst $vr0, $a1, 0 +; LA64-NEXT: ret %x = load <8 x i8>, ptr %ptr %y = shufflevector <8 x i8> %x, <8 x i8> poison, <16 x i32> %r = bitcast <16 x i8> %y to <8 x i16> diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-rotate.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-rotate.ll index b1e3f74cd1739..be241925a2788 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-rotate.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-rotate.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s ;; TODO For these special shuffle mask, we can lower it to vbsll + vbsrl + vor. 
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-shift.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-shift.ll index ff0f252ba2bdf..5275d5326f73a 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-shift.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-shuffle-byte-shift.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s define <16 x i8> @shuffle_16i8_vbsll_v_1(<16 x i8> %a) nounwind { diff --git a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll index e056e7c38ddcd..314350acd23d6 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vec-trunc.ll @@ -1,13 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx %s -o - | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s --check-prefix=LA64 define void @load_trunc_2i64_to_2i32(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_2i64_to_2i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vshuf4i.w $vr0, $vr0, 8 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_2i64_to_2i32: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 2 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_2i64_to_2i32: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vshuf4i.w $vr0, $vr0, 8 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <2 x i64>, ptr %ptr %trunc = trunc <2 x i64> %a to <2 x i32> store <2 x i32> %trunc, ptr %dst @@ -15,14 +25,24 @@ define void @load_trunc_2i64_to_2i32(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_2i64_to_2i16(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_2i64_to_2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) -; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI1_0) -; CHECK-NEXT: vshuf.h $vr1, $vr0, $vr0 -; CHECK-NEXT: vstelm.w $vr1, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_2i64_to_2i16: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; LA32-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI1_0) +; LA32-NEXT: vshuf.h $vr1, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr1, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_2i64_to_2i16: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; LA64-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI1_0) +; LA64-NEXT: vshuf.h $vr1, $vr0, $vr0 +; LA64-NEXT: vstelm.w $vr1, $a1, 0, 0 +; LA64-NEXT: ret %a = load <2 x i64>, ptr %ptr %trunc = trunc <2 x i64> %a to <2 x i16> store <2 x i16> %trunc, ptr %dst @@ -30,14 +50,23 @@ define void @load_trunc_2i64_to_2i16(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_2i64_to_2i8(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_2i64_to_2i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI2_0) -; CHECK-NEXT: 
vshuf.b $vr0, $vr0, $vr0, $vr1 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_2i64_to_2i8: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) +; LA32-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI2_0) +; LA32-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_2i64_to_2i8: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) +; LA64-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI2_0) +; LA64-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <2 x i64>, ptr %ptr %trunc = trunc <2 x i64> %a to <2 x i8> store <2 x i8> %trunc, ptr %dst @@ -45,12 +74,22 @@ define void @load_trunc_2i64_to_2i8(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_4i32_to_4i16(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_4i32_to_4i16: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vpickev.h $vr0, $vr0, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_4i32_to_4i16: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickev.h $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 1 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_4i32_to_4i16: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vpickev.h $vr0, $vr0, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <4 x i32>, ptr %ptr %trunc = trunc <4 x i32> %a to <4 x i16> store <4 x i16> %trunc, ptr %dst @@ -58,14 +97,24 @@ define void @load_trunc_4i32_to_4i16(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_4i32_to_4i8(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_4i32_to_4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) -; CHECK-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI4_0) -; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_4i32_to_4i8: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) +; LA32-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI4_0) +; LA32-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_4i32_to_4i8: +; LA64: # %bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) +; LA64-NEXT: vld $vr1, $a0, %pc_lo12(.LCPI4_0) +; LA64-NEXT: vshuf.b $vr0, $vr0, $vr0, $vr1 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <4 x i32>, ptr %ptr %trunc = trunc <4 x i32> %a to <4 x i8> store <4 x i8> %trunc, ptr %dst @@ -73,12 +122,22 @@ define void @load_trunc_4i32_to_4i8(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_8i16_to_8i8(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_8i16_to_8i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vld $vr0, $a0, 0 -; CHECK-NEXT: vpickev.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_8i16_to_8i8: +; LA32: # %bb.0: +; LA32-NEXT: vld $vr0, $a0, 0 +; LA32-NEXT: vpickev.b $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 1 +; LA32-NEXT: st.w $a0, $a1, 4 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_8i16_to_8i8: +; LA64: # 
%bb.0: +; LA64-NEXT: vld $vr0, $a0, 0 +; LA64-NEXT: vpickev.b $vr0, $vr0, $vr0 +; LA64-NEXT: vstelm.d $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <8 x i16>, ptr %ptr %trunc = trunc <8 x i16> %a to <8 x i8> store <8 x i8> %trunc, ptr %dst @@ -86,13 +145,24 @@ define void @load_trunc_8i16_to_8i8(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_2i32_to_2i16(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_2i32_to_2i16: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vshuf4i.h $vr0, $vr0, 8 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_2i32_to_2i16: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vshuf4i.h $vr0, $vr0, 8 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_2i32_to_2i16: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vshuf4i.h $vr0, $vr0, 8 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <2 x i32>, ptr %ptr %trunc = trunc <2 x i32> %a to <2 x i16> store <2 x i16> %trunc, ptr %dst @@ -100,15 +170,27 @@ define void @load_trunc_2i32_to_2i16(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_2i32_to_2i8(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_2i32_to_2i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) -; CHECK-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI7_0) -; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0 -; CHECK-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_2i32_to_2i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: pcalau12i $a3, %pc_hi20(.LCPI7_0) +; LA32-NEXT: vld $vr0, $a3, %pc_lo12(.LCPI7_0) +; LA32-NEXT: vinsgr2vr.w $vr1, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr1, $a0, 1 +; LA32-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_2i32_to_2i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) +; LA64-NEXT: vld $vr0, $a2, %pc_lo12(.LCPI7_0) +; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 0 +; LA64-NEXT: vshuf.b $vr0, $vr0, $vr1, $vr0 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <2 x i32>, ptr %ptr %trunc = trunc <2 x i32> %a to <2 x i8> store <2 x i8> %trunc, ptr %dst @@ -116,13 +198,24 @@ define void @load_trunc_2i32_to_2i8(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_4i16_to_4i8(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_4i16_to_4i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.d $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; CHECK-NEXT: vpickev.b $vr0, $vr0, $vr0 -; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_4i16_to_4i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a2, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, 4 +; LA32-NEXT: vinsgr2vr.w $vr0, $a2, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; LA32-NEXT: vpickev.b $vr0, $vr0, $vr0 +; LA32-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-NEXT: st.w $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_4i16_to_4i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vpickev.b $vr0, $vr0, $vr0 +; LA64-NEXT: vstelm.w $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load 
<4 x i16>, ptr %ptr %trunc = trunc <4 x i16> %a to <4 x i8> store <4 x i8> %trunc, ptr %dst @@ -130,17 +223,23 @@ define void @load_trunc_4i16_to_4i8(ptr %ptr, ptr %dst) nounwind { } define void @load_trunc_2i16_to_2i8(ptr %ptr, ptr %dst) nounwind { -; CHECK-LABEL: load_trunc_2i16_to_2i8: -; CHECK: # %bb.0: -; CHECK-NEXT: ld.w $a0, $a0, 0 -; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0 -; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 8 -; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0 -; CHECK-NEXT: ret +; LA32-LABEL: load_trunc_2i16_to_2i8: +; LA32: # %bb.0: +; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; LA32-NEXT: vshuf4i.b $vr0, $vr0, 8 +; LA32-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: load_trunc_2i16_to_2i8: +; LA64: # %bb.0: +; LA64-NEXT: ld.w $a0, $a0, 0 +; LA64-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; LA64-NEXT: vshuf4i.b $vr0, $vr0, 8 +; LA64-NEXT: vstelm.h $vr0, $a1, 0, 0 +; LA64-NEXT: ret %a = load <2 x i16>, ptr %ptr %trunc = trunc <2 x i16> %a to <2 x i8> store <2 x i8> %trunc, ptr %dst ret void } - - diff --git a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll index 7fa591db5d1fa..8bdeebef13dd2 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx --verify-machineinstrs < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx --verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx --verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LA64 define i16 @vmsk_eq_allzeros_i8(<16 x i8 > %a) { ; CHECK-LABEL: vmsk_eq_allzeros_i8: @@ -605,17 +606,29 @@ define i4 @vmsk_eq_allzeros_v4i8(<4 x i8> %a) { } define i32 @vmsk2_eq_allzeros_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_eq_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vseqi.b $vr0, $vr0, 0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vseqi.b $vr0, $vr1, 0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_eq_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vseqi.b $vr0, $vr0, 0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vseqi.b $vr0, $vr1, 0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_eq_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vseqi.b $vr0, $vr0, 0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vseqi.b $vr0, $vr1, 0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp eq <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -623,18 +636,31 @@ entry: } define i32 @vmsk2_sgt_allzeros_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_sgt_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vrepli.b $vr2, 0 -; CHECK-NEXT: vslt.b $vr0, $vr2, $vr0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vslt.b $vr0, $vr2, $vr1 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, 
$vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_sgt_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vrepli.b $vr2, 0 +; LA32-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vslt.b $vr0, $vr2, $vr1 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_sgt_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vrepli.b $vr2, 0 +; LA64-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vslt.b $vr0, $vr2, $vr1 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp sgt <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -642,18 +668,31 @@ entry: } define i32 @vmsk2_sgt_allones_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_sgt_allones_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vrepli.b $vr2, -1 -; CHECK-NEXT: vslt.b $vr0, $vr2, $vr0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vslt.b $vr0, $vr2, $vr1 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_sgt_allones_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vrepli.b $vr2, -1 +; LA32-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vslt.b $vr0, $vr2, $vr1 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_sgt_allones_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vrepli.b $vr2, -1 +; LA64-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vslt.b $vr0, $vr2, $vr1 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp sgt <32 x i8> %a, splat (i8 -1) %2 = bitcast <32 x i1> %1 to i32 @@ -661,18 +700,31 @@ entry: } define i32 @vmsk2_sge_allzeros_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_sge_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vrepli.b $vr2, 0 -; CHECK-NEXT: vsle.b $vr0, $vr2, $vr0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vsle.b $vr0, $vr2, $vr1 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_sge_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vrepli.b $vr2, 0 +; LA32-NEXT: vsle.b $vr0, $vr2, $vr0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vsle.b $vr0, $vr2, $vr1 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_sge_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vrepli.b $vr2, 0 +; LA64-NEXT: vsle.b $vr0, $vr2, $vr0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vsle.b $vr0, $vr2, $vr1 +; 
LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp sge <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -680,15 +732,25 @@ entry: } define i32 @vmsk2_slt_allzeros_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_slt_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vmskltz.b $vr0, $vr1 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_slt_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vmskltz.b $vr0, $vr1 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_slt_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vmskltz.b $vr0, $vr1 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp slt <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -696,17 +758,29 @@ entry: } define i32 @vmsk2_sle_allzeros_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_sle_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vslei.b $vr0, $vr0, 0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vslei.b $vr0, $vr1, 0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_sle_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vslei.b $vr0, $vr0, 0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vslei.b $vr0, $vr1, 0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_sle_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vslei.b $vr0, $vr0, 0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vslei.b $vr0, $vr1, 0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp sle <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -714,17 +788,29 @@ entry: } define i32 @vmsk2_sle_allones_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_sle_allones_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vslei.b $vr0, $vr0, -1 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vslei.b $vr0, $vr1, -1 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_sle_allones_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vslei.b $vr0, $vr0, -1 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vslei.b $vr0, $vr1, -1 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_sle_allones_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vslei.b $vr0, $vr0, -1 +; 
LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vslei.b $vr0, $vr1, -1 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp sle <32 x i8> %a, splat (i8 -1) %2 = bitcast <32 x i1> %1 to i32 @@ -732,19 +818,33 @@ entry: } define i32 @vmsk2_ne_allzeros_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_ne_allzeros_i8: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vseqi.b $vr0, $vr0, 0 -; CHECK-NEXT: vxori.b $vr0, $vr0, 255 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vseqi.b $vr0, $vr1, 0 -; CHECK-NEXT: vxori.b $vr0, $vr0, 255 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_ne_allzeros_i8: +; LA32: # %bb.0: # %entry +; LA32-NEXT: vseqi.b $vr0, $vr0, 0 +; LA32-NEXT: vxori.b $vr0, $vr0, 255 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vseqi.b $vr0, $vr1, 0 +; LA32-NEXT: vxori.b $vr0, $vr0, 255 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_ne_allzeros_i8: +; LA64: # %bb.0: # %entry +; LA64-NEXT: vseqi.b $vr0, $vr0, 0 +; LA64-NEXT: vxori.b $vr0, $vr0, 255 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vseqi.b $vr0, $vr1, 0 +; LA64-NEXT: vxori.b $vr0, $vr0, 255 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret entry: %1 = icmp ne <32 x i8> %a, splat (i8 0) %2 = bitcast <32 x i1> %1 to i32 @@ -752,38 +852,66 @@ entry: } define i32 @vmsk2_sgt_v32i8(<32 x i8> %a, <32 x i8> %b) { -; CHECK-LABEL: vmsk2_sgt_v32i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vslt.b $vr0, $vr2, $vr0 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vslt.b $vr0, $vr3, $vr1 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_sgt_v32i8: +; LA32: # %bb.0: +; LA32-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vslt.b $vr0, $vr3, $vr1 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_sgt_v32i8: +; LA64: # %bb.0: +; LA64-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vslt.b $vr0, $vr3, $vr1 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret %x = icmp sgt <32 x i8> %a, %b %res = bitcast <32 x i1> %x to i32 ret i32 %res } define i32 @vmsk2_sgt_and_sgt_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) { -; CHECK-LABEL: vmsk2_sgt_and_sgt_v32i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vslt.b $vr0, $vr2, $vr0 -; CHECK-NEXT: vslt.b $vr1, $vr3, $vr1 -; CHECK-NEXT: vslt.b $vr2, $vr6, $vr4 -; CHECK-NEXT: vslt.b $vr3, $vr7, $vr5 -; CHECK-NEXT: vand.v $vr1, $vr1, $vr3 -; CHECK-NEXT: vand.v $vr0, $vr0, $vr2 -; CHECK-NEXT: vmskltz.b $vr0, 
$vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vmskltz.b $vr0, $vr1 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_sgt_and_sgt_v32i8: +; LA32: # %bb.0: +; LA32-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA32-NEXT: vslt.b $vr1, $vr3, $vr1 +; LA32-NEXT: vslt.b $vr2, $vr6, $vr4 +; LA32-NEXT: vslt.b $vr3, $vr7, $vr5 +; LA32-NEXT: vand.v $vr1, $vr1, $vr3 +; LA32-NEXT: vand.v $vr0, $vr0, $vr2 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vmskltz.b $vr0, $vr1 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_sgt_and_sgt_v32i8: +; LA64: # %bb.0: +; LA64-NEXT: vslt.b $vr0, $vr2, $vr0 +; LA64-NEXT: vslt.b $vr1, $vr3, $vr1 +; LA64-NEXT: vslt.b $vr2, $vr6, $vr4 +; LA64-NEXT: vslt.b $vr3, $vr7, $vr5 +; LA64-NEXT: vand.v $vr1, $vr1, $vr3 +; LA64-NEXT: vand.v $vr0, $vr0, $vr2 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vmskltz.b $vr0, $vr1 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret %x0 = icmp sgt <32 x i8> %a, %b %x1 = icmp sgt <32 x i8> %c, %d %y = and <32 x i1> %x0, %x1 @@ -792,17 +920,29 @@ define i32 @vmsk2_sgt_and_sgt_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3 } define i32 @vmsk2_trunc_i8(<32 x i8> %a) { -; CHECK-LABEL: vmsk2_trunc_i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vslli.b $vr0, $vr0, 7 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a0, $vr0, 0 -; CHECK-NEXT: vslli.b $vr0, $vr1, 7 -; CHECK-NEXT: vmskltz.b $vr0, $vr0 -; CHECK-NEXT: vpickve2gr.hu $a1, $vr0, 0 -; CHECK-NEXT: slli.d $a1, $a1, 16 -; CHECK-NEXT: or $a0, $a0, $a1 -; CHECK-NEXT: ret +; LA32-LABEL: vmsk2_trunc_i8: +; LA32: # %bb.0: +; LA32-NEXT: vslli.b $vr0, $vr0, 7 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA32-NEXT: vslli.b $vr0, $vr1, 7 +; LA32-NEXT: vmskltz.b $vr0, $vr0 +; LA32-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: ret +; +; LA64-LABEL: vmsk2_trunc_i8: +; LA64: # %bb.0: +; LA64-NEXT: vslli.b $vr0, $vr0, 7 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a0, $vr0, 0 +; LA64-NEXT: vslli.b $vr0, $vr1, 7 +; LA64-NEXT: vmskltz.b $vr0, $vr0 +; LA64-NEXT: vpickve2gr.hu $a1, $vr0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: ret %y = trunc <32 x i8> %a to <32 x i1> %res = bitcast <32 x i1> %y to i32 ret i32 %res diff --git a/llvm/test/CodeGen/LoongArch/lsx/vselect.ll b/llvm/test/CodeGen/LoongArch/lsx/vselect.ll index 5dbff4a402b3d..8f25a6ba62f9f 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/vselect.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/vselect.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA32 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefixes=CHECK,LA64 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s define void @select_v16i8_imm(ptr %res, ptr %a0) nounwind { ; CHECK-LABEL: select_v16i8_imm: @@ -50,26 +50,14 @@ define void @select_v8i16(ptr %res, ptr %a0, ptr %a1) nounwind { } define void @select_v4i32(ptr %res, ptr 
%a0, ptr %a1) nounwind { -; LA32-LABEL: select_v4i32: -; LA32: # %bb.0: -; LA32-NEXT: vld $vr0, $a1, 0 -; LA32-NEXT: vld $vr1, $a2, 0 -; LA32-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0) -; LA32-NEXT: vld $vr2, $a1, %pc_lo12(.LCPI3_0) -; LA32-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2 -; LA32-NEXT: vst $vr0, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: select_v4i32: -; LA64: # %bb.0: -; LA64-NEXT: vld $vr0, $a1, 0 -; LA64-NEXT: vld $vr1, $a2, 0 -; LA64-NEXT: ori $a1, $zero, 0 -; LA64-NEXT: lu32i.d $a1, -1 -; LA64-NEXT: vreplgr2vr.d $vr2, $a1 -; LA64-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2 -; LA64-NEXT: vst $vr0, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: select_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: vldi $vr2, -1552 +; CHECK-NEXT: vbitsel.v $vr0, $vr1, $vr0, $vr2 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret %v0 = load <4 x i32>, ptr %a0 %v1 = load <4 x i32>, ptr %a1 %sel = select <4 x i1> , <4 x i32> %v0, <4 x i32> %v1 diff --git a/llvm/test/CodeGen/LoongArch/lsx/widen-shuffle-mask.ll b/llvm/test/CodeGen/LoongArch/lsx/widen-shuffle-mask.ll index 54328260d9d14..42ef9133bf04d 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/widen-shuffle-mask.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/widen-shuffle-mask.ll @@ -1,4 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s ; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s define <16 x i8> @widen_shuffle_mask_v16i8_to_v8i16(<16 x i8> %a, <16 x i8> %b) { diff --git a/llvm/test/CodeGen/LoongArch/merge-offset-option.ll b/llvm/test/CodeGen/LoongArch/merge-offset-option.ll new file mode 100644 index 0000000000000..e5351a6589cf7 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/merge-offset-option.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 -mattr=+d --relocation-model=static -O1 \ +; RUN: < %s | FileCheck %s --check-prefix=MERGE +; RUN: llc --mtriple=loongarch64 -mattr=+d --relocation-model=static -O1 \ +; RUN: --loongarch-enable-merge-offset=false < %s | FileCheck %s --check-prefix=NO_MERGE + +@g = dso_local global i32 zeroinitializer, align 4 + +define void @foo() nounwind { +; MERGE-LABEL: foo: +; MERGE: # %bb.0: +; MERGE-NEXT: pcalau12i $a0, %pc_hi20(g) +; MERGE-NEXT: ld.w $zero, $a0, %pc_lo12(g) +; MERGE-NEXT: ret +; +; NO_MERGE-LABEL: foo: +; NO_MERGE: # %bb.0: +; NO_MERGE-NEXT: pcalau12i $a0, %pc_hi20(g) +; NO_MERGE-NEXT: addi.d $a0, $a0, %pc_lo12(g) +; NO_MERGE-NEXT: ld.w $zero, $a0, 0 +; NO_MERGE-NEXT: ret + %v = load volatile i32, ptr @g + ret void +} diff --git a/llvm/test/CodeGen/MIR/X86/frame-info-multiple-save-restore-points-parse.mir b/llvm/test/CodeGen/MIR/X86/frame-info-multiple-save-restore-points-parse.mir index 4c60ccd573595..2cdd6a0bce471 100644 --- a/llvm/test/CodeGen/MIR/X86/frame-info-multiple-save-restore-points-parse.mir +++ b/llvm/test/CodeGen/MIR/X86/frame-info-multiple-save-restore-points-parse.mir @@ -32,10 +32,14 @@ liveins: # CHECK: frameInfo: # CHECK: savePoint: # CHECK-NEXT: - point: '%bb.1' +# CHECK-NEXT: registers: [] # CHECK-NEXT: - point: '%bb.2' +# CHECK-NEXT: registers: [] # CHECK: restorePoint: # CHECK-NEXT: - point: '%bb.2' +# CHECK-NEXT: registers: [] # CHECK-NEXT: - point: '%bb.3' +# CHECK-NEXT: registers: [] # CHECK: stack frameInfo: maxAlignment: 4 diff --git 
a/llvm/test/CodeGen/MIR/X86/frame-info-save-restore-points-with-regs-parse.mir b/llvm/test/CodeGen/MIR/X86/frame-info-save-restore-points-with-regs-parse.mir new file mode 100644 index 0000000000000..e01114726385c --- /dev/null +++ b/llvm/test/CodeGen/MIR/X86/frame-info-save-restore-points-with-regs-parse.mir @@ -0,0 +1,156 @@ +# RUN: llc -run-pass none -o - %s | FileCheck %s + +--- | + target triple = "x86_64-unknown-linux-gnu" + + define ptr @foo(ptr %ptr, i64 %p2, i64 %p3, i64 %p4, i64 %p5, i64 %p6) { + entry: + %tobool.not = icmp eq ptr %ptr, null + br i1 %tobool.not, label %if.then, label %if.end + + if.then: ; preds = %entry + %call = tail call ptr @bar(ptr %ptr, i64 %p2, i64 %p3, i64 %p4, i64 %p5, i64 %p6) + br label %if.end + + if.end: ; preds = %if.then, %entry + %ptr.addr.0 = phi ptr [ %call, %if.then ], [ %ptr, %entry ] + %incdec.ptr = getelementptr inbounds i8, ptr %ptr.addr.0, i64 1 + %call2 = tail call ptr @qux(ptr %incdec.ptr, i64 %p2, i64 %p3, i64 %p4, i64 %p5, i64 %p6) + ret ptr %call2 + } + + declare ptr @bar(ptr, i64, i64, i64, i64, i64) + + declare ptr @qux(ptr, i64, i64, i64, i64, i64) +... +--- +name: foo +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHContTarget: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: true +failsVerification: false +tracksDebugUserValues: true +registers: [] +liveins: + - { reg: '$rdi', virtual-reg: '' } + - { reg: '$rsi', virtual-reg: '' } + - { reg: '$rdx', virtual-reg: '' } + - { reg: '$rcx', virtual-reg: '' } + - { reg: '$r8', virtual-reg: '' } + - { reg: '$r9', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: true + hasCalls: true + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: true + isCalleeSavedInfoValid: false + localFrameSize: 0 +# CHECK: savePoint: +# CHECK-NEXT: - point: '%bb.2' +# CHECK-NEXT: registers: +# CHECK-NEXT: - '$r12' +# CHECK-NEXT: - '$r13' +# CHECK-NEXT: - '$r14' +# CHECK-NEXT: - '$r15' +# CHECK-NEXT: - '$rbx' +# CHECK: restorePoint: +# CHECK-NEXT: - point: '%bb.2' +# CHECK-NEXT: registers: +# CHECK-NEXT: - '$r12' +# CHECK-NEXT: - '$r13' +# CHECK-NEXT: - '$r14' +# CHECK-NEXT: - '$r15' +# CHECK-NEXT: - '$rbx' + savePoint: + - point: '%bb.1' + registers: + - '$rbx' + - '$r12' + - '$r13' + - '$r14' + - '$r15' + restorePoint: + - point: '%bb.1' + registers: + - '$rbx' + - '$r12' + - '$r13' + - '$r14' + - '$r15' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: + amxProgModel: None +body: | + bb.0.entry: + successors: %bb.1(0x30000000), %bb.3(0x50000000) + liveins: $rcx, $rdi, $rdx, $rsi, $r8, $r9 + + TEST64rr renamable $rdi, renamable $rdi, implicit-def $eflags + JCC_1 %bb.1, 4, implicit killed $eflags + + bb.3: + successors: %bb.2(0x80000000) + liveins: $rcx, $rdi, $rdx, $rsi, $r8, $r9 + + JMP_1 %bb.2 + + bb.1.if.then: + successors: %bb.2(0x80000000) + liveins: $rcx, $rdi, $rdx, $rsi, $r8, $r9 + + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, 
implicit-def dead $ssp, implicit $rsp, implicit $ssp + renamable $rbx = COPY renamable $rsi + renamable $r14 = COPY renamable $rdx + renamable $r15 = COPY renamable $rcx + renamable $r12 = COPY renamable $r8 + renamable $r13 = COPY renamable $r9 + CALL64pcrel32 target-flags(x86-plt) @bar, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $rsi, implicit $rdx, implicit $rcx, implicit $r8, implicit $r9, implicit-def $rsp, implicit-def $ssp, implicit-def $rax + renamable $rsi = COPY killed renamable $rbx + renamable $rdx = COPY killed renamable $r14 + renamable $rcx = COPY killed renamable $r15 + renamable $r8 = COPY killed renamable $r12 + renamable $r9 = COPY killed renamable $r13 + ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + renamable $rdi = COPY killed $rax + + bb.2.if.end: + liveins: $rcx, $rdi, $rdx, $rsi, $r8, $r9 + + renamable $rdi = nuw INC64r killed renamable $rdi, implicit-def dead $eflags + TCRETURNdi64 target-flags(x86-plt) @qux, 0, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit $rsi, implicit $rdx, implicit $rcx, implicit $r8, implicit $r9 +... + diff --git a/llvm/test/CodeGen/Mips/atomic-min-max.ll b/llvm/test/CodeGen/Mips/atomic-min-max.ll index 85bf6d02c7d8f..02ae8d2b7480e 100644 --- a/llvm/test/CodeGen/Mips/atomic-min-max.ll +++ b/llvm/test/CodeGen/Mips/atomic-min-max.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=mips-elf -O0 -mcpu=mips32r6 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=MIPSR6 ; RUN: llc -mtriple=mips-elf -O0 -mcpu=mips32r2 -mattr=+micromips -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=MM ; RUN: llc -mtriple=mips-elf -O0 -mcpu=mips32r6 -mattr=+micromips -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=MMR6 +; RUN: llc -mtriple=mipsel-elf -O0 -mcpu=mips2 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=MIPS2 ; RUN: llc -mtriple=mipsel-elf -O0 -mcpu=mips32 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=MIPS32 ; RUN: llc -mtriple=mipsel-elf -O0 -mcpu=mips32r2 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=MIPSEL ; RUN: llc -mtriple=mipsel-elf -O0 -mcpu=mips32r6 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=MIPSELR6 @@ -31,6 +32,33 @@ define i32 @test_max_32(ptr nocapture %ptr, i32 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_max_32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: $BB0_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($4) +; MIPS2-NEXT: slt $3, $2, $5 +; MIPS2-NEXT: move $1, $5 +; MIPS2-NEXT: beqz $3, $BB0_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB0_1 Depth=1 +; MIPS2-NEXT: j $BB0_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB0_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB0_1 Depth=1 +; MIPS2-NEXT: move $1, $2 +; MIPS2-NEXT: $BB0_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB0_1 Depth=1 +; MIPS2-NEXT: sc $1, 0($4) +; MIPS2-NEXT: beqz $1, $BB0_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_max_32: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: sync @@ -251,6 +279,33 @@ define i32 @test_min_32(ptr nocapture %ptr, i32 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_min_32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: $BB1_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll 
$2, 0($4) +; MIPS2-NEXT: slt $3, $2, $5 +; MIPS2-NEXT: move $1, $2 +; MIPS2-NEXT: beqz $3, $BB1_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB1_1 Depth=1 +; MIPS2-NEXT: j $BB1_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB1_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB1_1 Depth=1 +; MIPS2-NEXT: move $1, $5 +; MIPS2-NEXT: $BB1_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB1_1 Depth=1 +; MIPS2-NEXT: sc $1, 0($4) +; MIPS2-NEXT: beqz $1, $BB1_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_min_32: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: sync @@ -471,6 +526,33 @@ define i32 @test_umax_32(ptr nocapture %ptr, i32 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_umax_32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: $BB2_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($4) +; MIPS2-NEXT: sltu $3, $2, $5 +; MIPS2-NEXT: move $1, $5 +; MIPS2-NEXT: beqz $3, $BB2_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB2_1 Depth=1 +; MIPS2-NEXT: j $BB2_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB2_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB2_1 Depth=1 +; MIPS2-NEXT: move $1, $2 +; MIPS2-NEXT: $BB2_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB2_1 Depth=1 +; MIPS2-NEXT: sc $1, 0($4) +; MIPS2-NEXT: beqz $1, $BB2_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_umax_32: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: sync @@ -691,6 +773,33 @@ define i32 @test_umin_32(ptr nocapture %ptr, i32 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_umin_32: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: $BB3_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($4) +; MIPS2-NEXT: sltu $3, $2, $5 +; MIPS2-NEXT: move $1, $2 +; MIPS2-NEXT: beqz $3, $BB3_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB3_1 Depth=1 +; MIPS2-NEXT: j $BB3_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB3_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB3_1 Depth=1 +; MIPS2-NEXT: move $1, $5 +; MIPS2-NEXT: $BB3_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB3_1 Depth=1 +; MIPS2-NEXT: sc $1, 0($4) +; MIPS2-NEXT: beqz $1, $BB3_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: sync +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_umin_32: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: sync @@ -936,6 +1045,58 @@ define i16 @test_max_16(ptr nocapture %ptr, i16 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_max_16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 65535 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor $9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB4_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, $10 +; MIPS2-NEXT: sll $4, $4, 16 +; MIPS2-NEXT: sra $4, $4, 16 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: sllv $4, $4, $10 +; MIPS2-NEXT: slt $5, $4, $7 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: 
beqz $5, $BB4_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB4_1 Depth=1 +; MIPS2-NEXT: j $BB4_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB4_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB4_1 Depth=1 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: $BB4_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB4_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB4_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: # %bb.6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_max_16: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 @@ -1476,6 +1637,58 @@ define i16 @test_min_16(ptr nocapture %ptr, i16 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_min_16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 65535 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor $9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB5_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, $10 +; MIPS2-NEXT: sll $4, $4, 16 +; MIPS2-NEXT: sra $4, $4, 16 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: sllv $4, $4, $10 +; MIPS2-NEXT: slt $5, $4, $7 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: beqz $5, $BB5_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB5_1 Depth=1 +; MIPS2-NEXT: j $BB5_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB5_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB5_1 Depth=1 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: $BB5_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB5_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB5_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: # %bb.6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_min_16: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 @@ -2015,6 +2228,57 @@ define i16 @test_umax_16(ptr nocapture %ptr, i16 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_umax_16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 65535 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor $9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB6_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, $10 +; MIPS2-NEXT: andi $4, $4, 65535 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: 
sllv $4, $4, $10 +; MIPS2-NEXT: sltu $5, $4, $7 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: beqz $5, $BB6_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB6_1 Depth=1 +; MIPS2-NEXT: j $BB6_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB6_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB6_1 Depth=1 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: $BB6_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB6_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB6_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: # %bb.6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_umax_16: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 @@ -2553,6 +2817,57 @@ define i16 @test_umin_16(ptr nocapture %ptr, i16 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_umin_16: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 65535 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor $9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB7_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, $10 +; MIPS2-NEXT: andi $4, $4, 65535 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: sllv $4, $4, $10 +; MIPS2-NEXT: sltu $5, $4, $7 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: beqz $5, $BB7_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB7_1 Depth=1 +; MIPS2-NEXT: j $BB7_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB7_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB7_1 Depth=1 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: $BB7_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB7_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB7_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: # %bb.6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_umin_16: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 @@ -3092,6 +3407,58 @@ define i8 @test_max_8(ptr nocapture %ptr, i8 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_max_8: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 255 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor $9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB8_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, $10 +; MIPS2-NEXT: sll 
$4, $4, 24 +; MIPS2-NEXT: sra $4, $4, 24 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: sllv $4, $4, $10 +; MIPS2-NEXT: slt $5, $4, $7 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: beqz $5, $BB8_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB8_1 Depth=1 +; MIPS2-NEXT: j $BB8_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB8_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB8_1 Depth=1 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: $BB8_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB8_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB8_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: # %bb.6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_max_8: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 @@ -3631,6 +3998,58 @@ define i8 @test_min_8(ptr nocapture %ptr, i8 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_min_8: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 255 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor $9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB9_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, $10 +; MIPS2-NEXT: sll $4, $4, 24 +; MIPS2-NEXT: sra $4, $4, 24 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: sllv $4, $4, $10 +; MIPS2-NEXT: slt $5, $4, $7 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: beqz $5, $BB9_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB9_1 Depth=1 +; MIPS2-NEXT: j $BB9_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB9_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB9_1 Depth=1 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: $BB9_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB9_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB9_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: # %bb.6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_min_8: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 @@ -4170,6 +4589,57 @@ define i8 @test_umax_8(ptr nocapture %ptr, i8 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_umax_8: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 255 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor $9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB10_1: # %entry +; MIPS2-NEXT: # =>This 
Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, $10 +; MIPS2-NEXT: andi $4, $4, 255 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: sllv $4, $4, $10 +; MIPS2-NEXT: sltu $5, $4, $7 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: beqz $5, $BB10_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB10_1 Depth=1 +; MIPS2-NEXT: j $BB10_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB10_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB10_1 Depth=1 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: $BB10_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB10_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB10_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: # %bb.6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_umax_8: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 @@ -4708,6 +5178,57 @@ define i8 @test_umin_8(ptr nocapture %ptr, i8 signext %val) { ; MIPS-NEXT: jr $ra ; MIPS-NEXT: nop ; +; MIPS2-LABEL: test_umin_8: +; MIPS2: # %bb.0: # %entry +; MIPS2-NEXT: addiu $sp, $sp, -8 +; MIPS2-NEXT: .cfi_def_cfa_offset 8 +; MIPS2-NEXT: # kill: def $at killed $a1 +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $1, $zero, -4 +; MIPS2-NEXT: and $6, $4, $1 +; MIPS2-NEXT: andi $1, $4, 3 +; MIPS2-NEXT: sll $10, $1, 3 +; MIPS2-NEXT: ori $1, $zero, 255 +; MIPS2-NEXT: sllv $8, $1, $10 +; MIPS2-NEXT: nor $9, $zero, $8 +; MIPS2-NEXT: sllv $7, $5, $10 +; MIPS2-NEXT: $BB11_1: # %entry +; MIPS2-NEXT: # =>This Inner Loop Header: Depth=1 +; MIPS2-NEXT: ll $2, 0($6) +; MIPS2-NEXT: srav $4, $2, $10 +; MIPS2-NEXT: andi $4, $4, 255 +; MIPS2-NEXT: or $1, $zero, $4 +; MIPS2-NEXT: sllv $4, $4, $10 +; MIPS2-NEXT: sltu $5, $4, $7 +; MIPS2-NEXT: move $3, $4 +; MIPS2-NEXT: beqz $5, $BB11_3 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.2: # %entry +; MIPS2-NEXT: # in Loop: Header=BB11_1 Depth=1 +; MIPS2-NEXT: j $BB11_4 +; MIPS2-NEXT: nop +; MIPS2-NEXT: $BB11_3: # %entry +; MIPS2-NEXT: # in Loop: Header=BB11_1 Depth=1 +; MIPS2-NEXT: move $3, $7 +; MIPS2-NEXT: $BB11_4: # %entry +; MIPS2-NEXT: # in Loop: Header=BB11_1 Depth=1 +; MIPS2-NEXT: and $3, $3, $8 +; MIPS2-NEXT: and $4, $2, $9 +; MIPS2-NEXT: or $4, $4, $3 +; MIPS2-NEXT: sc $4, 0($6) +; MIPS2-NEXT: beqz $4, $BB11_1 +; MIPS2-NEXT: nop +; MIPS2-NEXT: # %bb.5: # %entry +; MIPS2-NEXT: .insn +; MIPS2-NEXT: # %bb.6: # %entry +; MIPS2-NEXT: sw $1, 4($sp) # 4-byte Folded Spill +; MIPS2-NEXT: # %bb.7: # %entry +; MIPS2-NEXT: lw $2, 4($sp) # 4-byte Folded Reload +; MIPS2-NEXT: sync +; MIPS2-NEXT: addiu $sp, $sp, 8 +; MIPS2-NEXT: jr $ra +; MIPS2-NEXT: nop +; ; MIPSR6-LABEL: test_umin_8: ; MIPSR6: # %bb.0: # %entry ; MIPSR6-NEXT: addiu $sp, $sp, -8 diff --git a/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-float-varargs.ll b/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-float-varargs.ll new file mode 100644 index 0000000000000..8cbc879310f61 --- /dev/null +++ b/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-float-varargs.ll @@ -0,0 +1,148 @@ +; RUN: llc -mtriple=mips -relocation-model=static -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32,O32 %s +; RUN: llc -mtriple=mipsel -relocation-model=static -mattr=single-float < %s \ +; RUN: | FileCheck 
--check-prefixes=ALL,SYM32,O32 %s + +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32,N32,NEW,NEWBE %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32,N32,NEW,NEWLE %s + +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64,N64,NEW,NEWBE %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64,N64,NEW,NEWLE %s + +@floats = global [11 x float] zeroinitializer +@doubles = global [11 x double] zeroinitializer + +define void @double_args(double %a, ...) + nounwind { +entry: + %0 = getelementptr [11 x double], ptr @doubles, i32 0, i32 1 + store volatile double %a, ptr %0 + + %ap = alloca ptr + call void @llvm.va_start(ptr %ap) + %b = va_arg ptr %ap, double + %1 = getelementptr [11 x double], ptr @doubles, i32 0, i32 2 + store volatile double %b, ptr %1 + call void @llvm.va_end(ptr %ap) + ret void +} + +; ALL-LABEL: double_args: +; We won't test the way the global address is calculated in this test. This is +; just to get the register number for the other checks. +; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles) +; SYM64-DAG: daddiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles) + +; O32 forbids using floating point registers for the non-variable portion. +; N32/N64 allow it. +; O32-DAG: sw $4, 8([[R2]]) +; O32-DAG: sw $5, 12([[R2]]) +; NEW-DAG: sd $4, 8([[R2]]) + +; The varargs portion is dumped to stack +; O32-DAG: sw $6, 16($sp) +; O32-DAG: sw $7, 20($sp) +; NEW-DAG: sd $5, 8($sp) +; NEW-DAG: sd $6, 16($sp) +; NEW-DAG: sd $7, 24($sp) +; NEW-DAG: sd $8, 32($sp) +; NEW-DAG: sd $9, 40($sp) +; NEW-DAG: sd $10, 48($sp) +; NEW-DAG: sd $11, 56($sp) + +; Get the varargs pointer +; O32 has 4 bytes padding, 4 bytes for the varargs pointer, and 8 bytes reserved +; for arguments 1 and 2. +; N32/N64 has 8 bytes for the varargs pointer, and no reserved area. +; O32-DAG: addiu [[VAPTR:\$[0-9]+]], $sp, 16 +; O32-DAG: sw [[VAPTR]], 4($sp) +; N32-DAG: addiu [[VAPTR:\$[0-9]+]], $sp, 8 +; N32-DAG: sw [[VAPTR]], 4($sp) +; N64-DAG: daddiu [[VAPTR:\$[0-9]+]], $sp, 8 +; N64-DAG: sd [[VAPTR]], 0($sp) + +; Increment the pointer then get the varargs arg +; LLVM will rebind the load to the stack pointer instead of the varargs pointer +; during lowering. This is fine and doesn't change the behaviour. +; O32-DAG: addiu [[VAPTR]], [[VAPTR]], 8 +; N32-DAG: addiu [[VAPTR]], [[VAPTR]], 8 +; N64-DAG: daddiu [[VAPTR]], [[VAPTR]], 8 +; O32-DAG: lw [[R3:\$[0-9]+]], 16($sp) +; O32-DAG: lw [[R4:\$[0-9]+]], 20($sp) +; O32-DAG: sw [[R3]], 16([[R2]]) +; O32-DAG: sw [[R4]], 20([[R2]]) +; NEW-DAG: ld [[R3:\$[0-9]+]], 8($sp) +; NEW-DAG: sd [[R3]], 16([[R2]]) + +define void @float_args(float %a, ...) nounwind { +entry: + %0 = getelementptr [11 x float], ptr @floats, i32 0, i32 1 + store volatile float %a, ptr %0 + + %ap = alloca ptr + call void @llvm.va_start(ptr %ap) + %b = va_arg ptr %ap, float + %1 = getelementptr [11 x float], ptr @floats, i32 0, i32 2 + store volatile float %b, ptr %1 + call void @llvm.va_end(ptr %ap) + ret void +} + +; ALL-LABEL: float_args: +; We won't test the way the global address is calculated in this test. This is +; just to get the register number for the other checks. 
+; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(floats) +; SYM64-DAG: daddiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(floats) + +; The first four arguments are the same in O32/N32/N64. +; The non-variable portion should be unaffected. +; O32-DAG: mtc1 $4, $f0 +; O32-DAG: swc1 $f0, 4([[R2]]) +; NEW-DAG: swc1 $f12, 4([[R2]]) + +; The varargs portion is dumped to stack +; O32-DAG: sw $5, 12($sp) +; O32-DAG: sw $6, 16($sp) +; O32-DAG: sw $7, 20($sp) +; NEW-DAG: sd $5, 8($sp) +; NEW-DAG: sd $6, 16($sp) +; NEW-DAG: sd $7, 24($sp) +; NEW-DAG: sd $8, 32($sp) +; NEW-DAG: sd $9, 40($sp) +; NEW-DAG: sd $10, 48($sp) +; NEW-DAG: sd $11, 56($sp) + +; Get the varargs pointer +; O32 has 4 bytes padding, 4 bytes for the varargs pointer, and should have 8 +; bytes reserved for arguments 1 and 2 (the first float arg) but as discussed in +; arguments-float.ll, GCC doesn't agree with MD00305 and treats floats as 4 +; bytes so we only have 12 bytes total. +; N32/N64 has 8 bytes for the varargs pointer, and no reserved area. +; O32-DAG: addiu [[VAPTR:\$[0-9]+]], $sp, 12 +; O32-DAG: sw [[VAPTR]], 4($sp) +; N32-DAG: addiu [[VAPTR:\$[0-9]+]], $sp, 8 +; N32-DAG: sw [[VAPTR]], 4($sp) +; N64-DAG: daddiu [[VAPTR:\$[0-9]+]], $sp, 8 +; N64-DAG: sd [[VAPTR]], 0($sp) + +; Increment the pointer then get the varargs arg +; LLVM will rebind the load to the stack pointer instead of the varargs pointer +; during lowering. This is fine and doesn't change the behaviour. +; Also, in big-endian mode the offset must be increased by 4 to retrieve the +; correct half of the argument slot. +; +; O32-DAG: addiu [[VAPTR]], [[VAPTR]], 4 +; N32-DAG: addiu [[VAPTR]], [[VAPTR]], 8 +; N64-DAG: daddiu [[VAPTR]], [[VAPTR]], 8 +; O32-DAG: lwc1 [[FTMP1:\$f[0-9]+]], 12($sp) +; NEWLE-DAG: lwc1 [[FTMP1:\$f[0-9]+]], 8($sp) +; NEWBE-DAG: lwc1 [[FTMP1:\$f[0-9]+]], 12($sp) +; ALL-DAG: swc1 [[FTMP1]], 8([[R2]]) + +declare void @llvm.va_start(ptr) +declare void @llvm.va_copy(ptr, ptr) +declare void @llvm.va_end(ptr) diff --git a/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-float.ll b/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-float.ll new file mode 100644 index 0000000000000..6b7ad03c8e1c2 --- /dev/null +++ b/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-float.ll @@ -0,0 +1,224 @@ +; RUN: llc -mtriple=mips -relocation-model=static -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32,O32 %s +; RUN: llc -mtriple=mipsel -relocation-model=static -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32,O32 %s + +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32,NEW %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32,NEW %s + +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64,NEW %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64,NEW %s + +@bytes = global [11 x i8] zeroinitializer +@dwords = global [11 x i64] zeroinitializer +@floats = global [11 x float] zeroinitializer +@doubles = global [11 x double] zeroinitializer + +define void @double_args(double %a, double %b, double %c, double %d, double %e, + double %f, double %g, double %h, double %i) nounwind { +entry: + %0 = getelementptr [11 x double], ptr @doubles, i32 0, i32 1 + store 
volatile double %a, ptr %0 + %1 = getelementptr [11 x double], ptr @doubles, i32 0, i32 2 + store volatile double %b, ptr %1 + %2 = getelementptr [11 x double], ptr @doubles, i32 0, i32 3 + store volatile double %c, ptr %2 + %3 = getelementptr [11 x double], ptr @doubles, i32 0, i32 4 + store volatile double %d, ptr %3 + %4 = getelementptr [11 x double], ptr @doubles, i32 0, i32 5 + store volatile double %e, ptr %4 + %5 = getelementptr [11 x double], ptr @doubles, i32 0, i32 6 + store volatile double %f, ptr %5 + %6 = getelementptr [11 x double], ptr @doubles, i32 0, i32 7 + store volatile double %g, ptr %6 + %7 = getelementptr [11 x double], ptr @doubles, i32 0, i32 8 + store volatile double %h, ptr %7 + %8 = getelementptr [11 x double], ptr @doubles, i32 0, i32 9 + store volatile double %i, ptr %8 + ret void +} + +; ALL-LABEL: double_args: +; We won't test the way the global address is calculated in this test. This is +; just to get the register number for the other checks. +; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles) +; SYM64-DAG: daddiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles) + +; The first four arguments are the same in O32/N32/N64. +; The first argument is floating point but single-float is enabled so floating +; point registers are not used. +; O32-DAG: sw $4, 8([[R2]]) +; O32-DAG: sw $5, 12([[R2]]) +; NEW-DAG: sd $4, 8([[R2]]) + +; O32-DAG: sw $6, 16([[R2]]) +; O32-DAG: sw $7, 20([[R2]]) +; NEW-DAG: sd $5, 16([[R2]]) + +; O32 has run out of argument registers and starts using the stack +; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 16($sp) +; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 20($sp) +; O32-DAG: sw [[R3]], 24([[R2]]) +; O32-DAG: sw [[R4]], 28([[R2]]) +; NEW-DAG: sd $6, 24([[R2]]) + +; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 24($sp) +; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 28($sp) +; O32-DAG: sw [[R3]], 32([[R2]]) +; O32-DAG: sw [[R4]], 36([[R2]]) +; NEW-DAG: sd $7, 32([[R2]]) + +; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 32($sp) +; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 36($sp) +; O32-DAG: sw [[R3]], 40([[R2]]) +; O32-DAG: sw [[R4]], 44([[R2]]) +; NEW-DAG: sd $8, 40([[R2]]) + +; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 40($sp) +; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 44($sp) +; O32-DAG: sw [[R3]], 48([[R2]]) +; O32-DAG: sw [[R4]], 52([[R2]]) +; NEW-DAG: sd $9, 48([[R2]]) + +; O32-DAG: lw [[R3:\$([0-9]+|gp)]], 48($sp) +; O32-DAG: lw [[R4:\$([0-9]+|gp)]], 52($sp) +; O32-DAG: sw [[R3]], 56([[R2]]) +; O32-DAG: sw [[R4]], 60([[R2]]) +; NEW-DAG: sd $10, 56([[R2]]) + +; N32/N64 have run out of registers and start using the stack too +; O32-DAG: lw [[R3:\$[0-9]+]], 56($sp) +; O32-DAG: lw [[R4:\$[0-9]+]], 60($sp) +; O32-DAG: sw [[R3]], 64([[R2]]) +; O32-DAG: sw [[R4]], 68([[R2]]) +; NEW-DAG: ld [[R3:\$[0-9]+]], 0($sp) +; NEW-DAG: sd $11, 64([[R2]]) + +define void @float_args(float %a, float %b, float %c, float %d, float %e, + float %f, float %g, float %h, float %i) nounwind { +entry: + %0 = getelementptr [11 x float], ptr @floats, i32 0, i32 1 + store volatile float %a, ptr %0 + %1 = getelementptr [11 x float], ptr @floats, i32 0, i32 2 + store volatile float %b, ptr %1 + %2 = getelementptr [11 x float], ptr @floats, i32 0, i32 3 + store volatile float %c, ptr %2 + %3 = getelementptr [11 x float], ptr @floats, i32 0, i32 4 + store volatile float %d, ptr %3 + %4 = getelementptr [11 x float], ptr @floats, i32 0, i32 5 + store volatile float %e, ptr %4 + %5 = getelementptr [11 x float], ptr @floats, i32 0, i32 6 + store volatile float %f, ptr %5 + %6 = getelementptr [11 x float], ptr @floats, i32 0, i32 7 + store
volatile float %g, ptr %6 + %7 = getelementptr [11 x float], ptr @floats, i32 0, i32 8 + store volatile float %h, ptr %7 + %8 = getelementptr [11 x float], ptr @floats, i32 0, i32 9 + store volatile float %i, ptr %8 + ret void +} + +; ALL-LABEL: float_args: +; We won't test the way the global address is calculated in this test. This is +; just to get the register number for the other checks. +; SYM32-DAG: addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(floats) +; SYM64-DAG: daddiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(floats) + +; The first argument is floating point so floating point registers are used. +; The first argument is the same for O32/N32/N64 but the second argument differs +; by register. +; ALL-DAG: swc1 $f12, 4([[R1]]) +; O32-DAG: swc1 $f14, 8([[R1]]) +; NEW-DAG: swc1 $f13, 8([[R1]]) + +; O32 has run out of argument registers and (in theory) starts using the stack +; I've yet to find a reference in the documentation about this but GCC uses up +; the remaining two argument slots in the GPRs first. We'll do the same for +; compatibility. +; O32-DAG: mtc1 $6, $f0 +; O32-DAG: swc1 $f0, 12([[R1]]) +; NEW-DAG: swc1 $f14, 12([[R1]]) +; O32-DAG: mtc1 $7, $f0 +; O32-DAG: swc1 $f0, 16([[R1]]) +; NEW-DAG: swc1 $f15, 16([[R1]]) + +; O32 is definitely out of registers now and switches to the stack. +; O32-DAG: lwc1 [[F1:\$f[0-9]+]], 16($sp) +; O32-DAG: swc1 [[F1]], 20([[R1]]) +; NEW-DAG: swc1 $f16, 20([[R1]]) +; O32-DAG: lwc1 [[F1:\$f[0-9]+]], 20($sp) +; O32-DAG: swc1 [[F1]], 24([[R1]]) +; NEW-DAG: swc1 $f17, 24([[R1]]) +; O32-DAG: lwc1 [[F1:\$f[0-9]+]], 24($sp) +; O32-DAG: swc1 [[F1]], 28([[R1]]) +; NEW-DAG: swc1 $f18, 28([[R1]]) +; O32-DAG: lwc1 [[F1:\$f[0-9]+]], 28($sp) +; O32-DAG: swc1 [[F1]], 32([[R1]]) +; NEW-DAG: swc1 $f19, 32([[R1]]) + +; N32/N64 have run out of registers and start using the stack too +; O32-DAG: lwc1 [[F1:\$f[0-9]+]], 32($sp) +; O32-DAG: swc1 [[F1]], 36([[R1]]) +; NEW-DAG: lwc1 [[F1:\$f[0-9]+]], 0($sp) +; NEW-DAG: swc1 [[F1]], 36([[R1]]) + + +define void @double_arg2(i8 %a, double %b) nounwind { +entry: + %0 = getelementptr [11 x i8], ptr @bytes, i32 0, i32 1 + store volatile i8 %a, ptr %0 + %1 = getelementptr [11 x double], ptr @doubles, i32 0, i32 1 + store volatile double %b, ptr %1 + ret void +} + +; ALL-LABEL: double_arg2: +; We won't test the way the global address is calculated in this test. This is +; just to get the register number for the other checks. +; SYM32-DAG: addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes) +; SYM64-DAG: daddiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes) +; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles) +; SYM64-DAG: daddiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(doubles) + +; The first four arguments are the same in O32/N32/N64. +; The first argument isn't floating point so floating point registers are not +; used. +; The second slot is insufficiently aligned for double on O32 so it is skipped. +; Also, double occupies two slots on O32 and only one for N32/N64. +; ALL-DAG: sb $4, 1([[R1]]) +; O32-DAG: sw $6, 8([[R2]]) +; O32-DAG: sw $7, 12([[R2]]) +; NEW-DAG: sd $5, 8([[R2]]) + +define void @float_arg2(i8 %a, float %b) nounwind { +entry: + %0 = getelementptr [11 x i8], ptr @bytes, i32 0, i32 1 + store volatile i8 %a, ptr %0 + %1 = getelementptr [11 x float], ptr @floats, i32 0, i32 1 + store volatile float %b, ptr %1 + ret void +} + +; ALL-LABEL: float_arg2: +; We won't test the way the global address is calculated in this test. This is +; just to get the register number for the other checks.
+; SYM32-DAG: addiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes) +; SYM64-DAG: daddiu [[R1:\$[0-9]+]], ${{[0-9]+}}, %lo(bytes) +; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(floats) +; SYM64-DAG: daddiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(floats) + +; The first argument is the same in O32/N32/N64. +; ALL-DAG: sb $4, 1([[R1]]) + +; The first argument isn't floating point so floating point registers are not +; used in O32, but N32/N64 will still use them. +; MD00305 and GCC disagree on this one. MD00305 says that floats are treated +; as 8-byte aligned and occupy two slots on O32. GCC is treating them as 4-byte +; aligned and occupying one slot. We'll use GCC's definition. +; O32-DAG: mtc1 $5, $f0 +; O32-DAG: swc1 $f0, 4([[R2]]) +; NEW-DAG: swc1 $f13, 4([[R2]]) diff --git a/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-fp128.ll b/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-fp128.ll new file mode 100644 index 0000000000000..9268e37b02fb5 --- /dev/null +++ b/llvm/test/CodeGen/Mips/cconv/arguments-hard-single-fp128.ll @@ -0,0 +1,42 @@ +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32 %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32 %s + +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64 %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64 %s + +@ldoubles = global [11 x fp128] zeroinitializer + +define void @ldouble_args(fp128 %a, fp128 %b, fp128 %c, fp128 %d, fp128 %e) nounwind { +entry: + %0 = getelementptr [11 x fp128], ptr @ldoubles, i32 0, i32 1 + store volatile fp128 %a, ptr %0 + %1 = getelementptr [11 x fp128], ptr @ldoubles, i32 0, i32 2 + store volatile fp128 %b, ptr %1 + %2 = getelementptr [11 x fp128], ptr @ldoubles, i32 0, i32 3 + store volatile fp128 %c, ptr %2 + %3 = getelementptr [11 x fp128], ptr @ldoubles, i32 0, i32 4 + store volatile fp128 %d, ptr %3 + %4 = getelementptr [11 x fp128], ptr @ldoubles, i32 0, i32 5 + store volatile fp128 %e, ptr %4 + ret void +} + +; ALL-LABEL: ldouble_args: +; We won't test the way the global address is calculated in this test. This is +; just to get the register number for the other checks. +; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(ldoubles) +; SYM64-DAG: daddiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(ldoubles) + +; The first four arguments are the same in N32/N64. 
+; ALL-DAG: sd $5, 24([[R2]]) +; ALL-DAG: sd $4, 16([[R2]]) +; ALL-DAG: sd $7, 40([[R2]]) +; ALL-DAG: sd $6, 32([[R2]]) +; ALL-DAG: sd $9, 56([[R2]]) +; ALL-DAG: sd $8, 48([[R2]]) +; ALL-DAG: sd $11, 72([[R2]]) +; ALL-DAG: sd $10, 64([[R2]]) diff --git a/llvm/test/CodeGen/Mips/cconv/callee-saved-singlefloat.ll b/llvm/test/CodeGen/Mips/cconv/callee-saved-singlefloat.ll new file mode 100644 index 0000000000000..5bf1f2c2d60da --- /dev/null +++ b/llvm/test/CodeGen/Mips/cconv/callee-saved-singlefloat.ll @@ -0,0 +1,111 @@ +; RUN: llc -mtriple=mips -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,O32 %s +; RUN: llc -mtriple=mipsel -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,O32 %s + +; RUN: llc -mtriple=mips64 -target-abi n32 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,N32 %s +; RUN: llc -mtriple=mips64el -target-abi n32 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,N32 %s +; RUN: llc -mtriple=mips64 -target-abi n32 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,ALL-INV,N32-INV %s +; RUN: llc -mtriple=mips64el -target-abi n32 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,ALL-INV,N32-INV %s + +; RUN: llc -mtriple=mips64 -target-abi n64 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,N64 %s +; RUN: llc -mtriple=mips64el -target-abi n64 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,N64 %s +; RUN: llc -mtriple=mips64 -target-abi n64 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,ALL-INV,N64-INV %s +; RUN: llc -mtriple=mips64el -target-abi n64 -mattr=+single-float < %s | FileCheck --check-prefixes=ALL,ALL-INV,N64-INV %s + +define void @fpu_clobber() nounwind { +entry: + call void asm "# Clobber", "~{$f0},~{$f1},~{$f2},~{$f3},~{$f4},~{$f5},~{$f6},~{$f7},~{$f8},~{$f9},~{$f10},~{$f11},~{$f12},~{$f13},~{$f14},~{$f15},~{$f16},~{$f17},~{$f18},~{$f19},~{$f20},~{$f21},~{$f22},~{$f23},~{$f24},~{$f25},~{$f26},~{$f27},~{$f28},~{$f29},~{$f30},~{$f31}"() + ret void +} + +; ALL-LABEL: fpu_clobber: +; ALL-INV-NOT: swc1 $f0, +; ALL-INV-NOT: swc1 $f1, +; ALL-INV-NOT: swc1 $f2, +; ALL-INV-NOT: swc1 $f3, +; ALL-INV-NOT: swc1 $f4, +; ALL-INV-NOT: swc1 $f5, +; ALL-INV-NOT: swc1 $f6, +; ALL-INV-NOT: swc1 $f7, +; ALL-INV-NOT: swc1 $f8, +; ALL-INV-NOT: swc1 $f9, +; ALL-INV-NOT: swc1 $f10, +; ALL-INV-NOT: swc1 $f11, +; ALL-INV-NOT: swc1 $f12, +; ALL-INV-NOT: swc1 $f13, +; ALL-INV-NOT: swc1 $f14, +; ALL-INV-NOT: swc1 $f15, +; ALL-INV-NOT: swc1 $f16, +; ALL-INV-NOT: swc1 $f17, +; ALL-INV-NOT: swc1 $f18, +; ALL-INV-NOT: swc1 $f19, + +; O32: addiu $sp, $sp, -48 +; O32-DAG: swc1 [[F20:\$f20]], [[OFF20:[0-9]+]]($sp) +; O32-DAG: swc1 [[F21:\$f21]], [[OFF21:[0-9]+]]($sp) +; O32-DAG: swc1 [[F22:\$f22]], [[OFF22:[0-9]+]]($sp) +; O32-DAG: swc1 [[F23:\$f23]], [[OFF23:[0-9]+]]($sp) +; O32-DAG: swc1 [[F24:\$f24]], [[OFF24:[0-9]+]]($sp) +; O32-DAG: swc1 [[F25:\$f25]], [[OFF25:[0-9]+]]($sp) +; O32-DAG: swc1 [[F26:\$f26]], [[OFF26:[0-9]+]]($sp) +; O32-DAG: swc1 [[F27:\$f27]], [[OFF27:[0-9]+]]($sp) +; O32-DAG: swc1 [[F28:\$f28]], [[OFF28:[0-9]+]]($sp) +; O32-DAG: swc1 [[F29:\$f29]], [[OFF29:[0-9]+]]($sp) +; O32-DAG: swc1 [[F30:\$f30]], [[OFF30:[0-9]+]]($sp) +; O32-DAG: swc1 [[F31:\$f31]], [[OFF31:[0-9]+]]($sp) +; O32-DAG: lwc1 [[F20]], [[OFF20]]($sp) +; O32-DAG: lwc1 [[F21]], [[OFF21]]($sp) +; O32-DAG: lwc1 [[F22]], [[OFF22]]($sp) +; O32-DAG: lwc1 [[F23]], [[OFF23]]($sp) +; O32-DAG: lwc1 [[F24]], [[OFF24]]($sp) +; O32-DAG: lwc1 [[F25]], [[OFF25]]($sp) +; O32-DAG: lwc1 [[F26]], [[OFF26]]($sp) +; O32-DAG: lwc1 [[F27]], [[OFF27]]($sp) 
+; O32-DAG: lwc1 [[F28]], [[OFF28]]($sp) +; O32-DAG: lwc1 [[F29]], [[OFF29]]($sp) +; O32-DAG: lwc1 [[F30]], [[OFF30]]($sp) +; O32-DAG: lwc1 [[F31]], [[OFF31]]($sp) +; O32: addiu $sp, $sp, 48 + +; N32: addiu $sp, $sp, -32 +; N32-DAG: swc1 [[F20:\$f20]], [[OFF20:[0-9]+]]($sp) +; N32-INV-NOT: swc1 $f21, +; N32-DAG: swc1 [[F22:\$f22]], [[OFF22:[0-9]+]]($sp) +; N32-INV-NOT: swc1 $f23, +; N32-DAG: swc1 [[F24:\$f24]], [[OFF24:[0-9]+]]($sp) +; N32-INV-NOT: swc1 $f25, +; N32-DAG: swc1 [[F26:\$f26]], [[OFF26:[0-9]+]]($sp) +; N32-INV-NOT: swc1 $f27, +; N32-DAG: swc1 [[F28:\$f28]], [[OFF28:[0-9]+]]($sp) +; N32-INV-NOT: swc1 $f29, +; N32-DAG: swc1 [[F30:\$f30]], [[OFF30:[0-9]+]]($sp) +; N32-INV-NOT: swc1 $f31, +; N32-DAG: lwc1 [[F20]], [[OFF20]]($sp) +; N32-DAG: lwc1 [[F22]], [[OFF22]]($sp) +; N32-DAG: lwc1 [[F24]], [[OFF24]]($sp) +; N32-DAG: lwc1 [[F26]], [[OFF26]]($sp) +; N32-DAG: lwc1 [[F28]], [[OFF28]]($sp) +; N32-DAG: lwc1 [[F30]], [[OFF30]]($sp) +; N32: addiu $sp, $sp, 32 + +; N64: addiu $sp, $sp, -32 +; N64-INV-NOT: swc1 $f20, +; N64-INV-NOT: swc1 $f21, +; N64-INV-NOT: swc1 $f22, +; N64-INV-NOT: swc1 $f23, +; N64-DAG: swc1 [[F24:\$f24]], [[OFF24:[0-9]+]]($sp) +; N64-DAG: swc1 [[F25:\$f25]], [[OFF25:[0-9]+]]($sp) +; N64-DAG: swc1 [[F26:\$f26]], [[OFF26:[0-9]+]]($sp) +; N64-DAG: swc1 [[F27:\$f27]], [[OFF27:[0-9]+]]($sp) +; N64-DAG: swc1 [[F28:\$f28]], [[OFF28:[0-9]+]]($sp) +; N64-DAG: swc1 [[F29:\$f29]], [[OFF29:[0-9]+]]($sp) +; N64-DAG: swc1 [[F30:\$f30]], [[OFF30:[0-9]+]]($sp) +; N64-DAG: swc1 [[F31:\$f31]], [[OFF31:[0-9]+]]($sp) +; N64-DAG: lwc1 [[F24]], [[OFF24]]($sp) +; N64-DAG: lwc1 [[F25]], [[OFF25]]($sp) +; N64-DAG: lwc1 [[F26]], [[OFF26]]($sp) +; N64-DAG: lwc1 [[F27]], [[OFF27]]($sp) +; N64-DAG: lwc1 [[F28]], [[OFF28]]($sp) +; N64-DAG: lwc1 [[F29]], [[OFF29]]($sp) +; N64-DAG: lwc1 [[F30]], [[OFF30]]($sp) +; N64-DAG: lwc1 [[F31]], [[OFF31]]($sp) +; N64: addiu $sp, $sp, 32 \ No newline at end of file diff --git a/llvm/test/CodeGen/Mips/cconv/return-hard-single-float.ll b/llvm/test/CodeGen/Mips/cconv/return-hard-single-float.ll new file mode 100644 index 0000000000000..1abf08d8200fb --- /dev/null +++ b/llvm/test/CodeGen/Mips/cconv/return-hard-single-float.ll @@ -0,0 +1,43 @@ +; RUN: llc -mtriple=mips-linux-gnu -relocation-model=static -mattr=+single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,O32 %s +; RUN: llc -mtriple=mipsel-linux-gnu -relocation-model=static -mattr=+single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,O32 %s + +; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n32 -mattr=+single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,N32 %s +; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n32 -mattr=+single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,N32 %s + +; RUN: llc -mtriple=mips64-linux-gnu -relocation-model=static -target-abi n64 -mattr=+single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,N64 %s +; RUN: llc -mtriple=mips64el-linux-gnu -relocation-model=static -target-abi n64 -mattr=+single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,N64 %s + +@float = global float zeroinitializer +@double = global double zeroinitializer + +define float @retfloat() nounwind { +entry: + %0 = load volatile float, ptr @float + ret float %0 +} + +; ALL-LABEL: retfloat: +; O32-DAG: lui [[R1:\$[0-9]+]], %hi(float) +; O32-DAG: lwc1 $f0, %lo(float)([[R1]]) +; N32-DAG: lui [[R1:\$[0-9]+]], %hi(float) +; N32-DAG: lwc1 $f0, %lo(float)([[R1]]) +; N64-DAG: lwc1 $f0, %lo(float)([[R1:\$[0-9+]]]) + 
+define double @retdouble() nounwind { +entry: + %0 = load volatile double, ptr @double + ret double %0 +} + +; ALL-LABEL: retdouble: +; O32-DAG: lw $2, %lo(double)([[R1:\$[0-9]+]]) +; O32-DAG: addiu [[R2:\$[0-9]+]], [[R1]], %lo(double) +; O32-DAG: lw $3, 4([[R2]]) +; N32-DAG: ld $2, %lo(double)([[R1:\$[0-9]+]]) +; N64-DAG: ld $2, %lo(double)([[R1:\$[0-9]+]]) diff --git a/llvm/test/CodeGen/Mips/cconv/return-hard-single-fp128.ll b/llvm/test/CodeGen/Mips/cconv/return-hard-single-fp128.ll new file mode 100644 index 0000000000000..e4d04146ecc2f --- /dev/null +++ b/llvm/test/CodeGen/Mips/cconv/return-hard-single-fp128.ll @@ -0,0 +1,24 @@ +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32 %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n32 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM32 %s + +; RUN: llc -mtriple=mips64 -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64 %s +; RUN: llc -mtriple=mips64el -relocation-model=static -target-abi n64 -mattr=single-float < %s \ +; RUN: | FileCheck --check-prefixes=ALL,SYM64 %s + +@fp128 = global fp128 zeroinitializer + +define fp128 @retldouble() nounwind { +entry: + %0 = load volatile fp128, ptr @fp128 + ret fp128 %0 +} + +; ALL-LABEL: retldouble: +; SYM32-DAG: addiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(fp128) +; SYM64-DAG: daddiu [[R2:\$[0-9]+]], ${{[0-9]+}}, %lo(fp128) + +; ALL-DAG: ld $2, %lo(fp128)([[R2]]) +; ALL-DAG: ld $3, 8([[R2]]) diff --git a/llvm/test/CodeGen/Mips/inlineasm-constraints-singlefloat.ll b/llvm/test/CodeGen/Mips/inlineasm-constraints-singlefloat.ll new file mode 100644 index 0000000000000..ddebddcdab260 --- /dev/null +++ b/llvm/test/CodeGen/Mips/inlineasm-constraints-singlefloat.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=mips -mattr=+single-float < %s | FileCheck %s --check-prefix=MIPS32 +; RUN: llc -mtriple=mips64 -mattr=+single-float < %s | FileCheck %s --check-prefix=MIPS64 + +define void @read_double(ptr %0) { +; MIPS32-LABEL: read_double: +; MIPS32: # %bb.0: +; MIPS32-NEXT: lw $2, 4($4) +; MIPS32-NEXT: lw $3, 0($4) +; MIPS32-NEXT: #APP +; MIPS32-NEXT: #NO_APP +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: nop +; +; MIPS64-LABEL: read_double: +; MIPS64: # %bb.0: +; MIPS64-NEXT: ld $2, 0($4) +; MIPS64-NEXT: #APP +; MIPS64-NEXT: #NO_APP +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: nop + %2 = load double, ptr %0, align 8 + tail call void asm sideeffect "", "r,~{$1}"(double %2) + ret void +} + +define void @read_float(ptr %0) { +; MIPS32-LABEL: read_float: +; MIPS32: # %bb.0: +; MIPS32-NEXT: lwc1 $f0, 0($4) +; MIPS32-NEXT: #APP +; MIPS32-NEXT: #NO_APP +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: nop +; +; MIPS64-LABEL: read_float: +; MIPS64: # %bb.0: +; MIPS64-NEXT: lwc1 $f0, 0($4) +; MIPS64-NEXT: #APP +; MIPS64-NEXT: #NO_APP +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: nop + %2 = load float, ptr %0, align 8 + tail call void asm sideeffect "", "f"(float %2) + ret void +} + +; Test that a proper register class is assigned to clobbers in single-float mode +define float @explicit_float_register_clobber(ptr %0) { +; MIPS32-LABEL: explicit_float_register_clobber: +; MIPS32: # %bb.0: +; MIPS32-NEXT: lwc1 $f1, 0($4) +; MIPS32-NEXT: #APP +; MIPS32-NEXT: #NO_APP +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: mov.s $f0, $f1 +; +; MIPS64-LABEL: explicit_float_register_clobber: +; 
MIPS64: # %bb.0: +; MIPS64-NEXT: lwc1 $f1, 0($4) +; MIPS64-NEXT: #APP +; MIPS64-NEXT: #NO_APP +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: mov.s $f0, $f1 + %2 = load float, ptr %0, align 8 + tail call void asm sideeffect "", "~{$f0}"() + ret float %2 +} diff --git a/llvm/test/CodeGen/Mips/int-to-float-conversion.ll b/llvm/test/CodeGen/Mips/int-to-float-conversion.ll index 84bc6a253595a..1c8ad9ad07e15 100644 --- a/llvm/test/CodeGen/Mips/int-to-float-conversion.ll +++ b/llvm/test/CodeGen/Mips/int-to-float-conversion.ll @@ -1,13 +1,24 @@ -; RUN: llc -mtriple=mipsel < %s | FileCheck %s -check-prefix=32 -; RUN: llc -mtriple=mips64el -mcpu=mips4 < %s | FileCheck %s -check-prefix=64 -; RUN: llc -mtriple=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefix=64 +; RUN: llc -mtriple=mipsel < %s | FileCheck %s -check-prefixes=ALL,32,32DF +; RUN: llc -mtriple=mipsel -mattr=+single-float < %s | FileCheck %s -check-prefixes=ALL,32,32SF + +; RUN: llc -mtriple=mips64el -mcpu=mips4 < %s | FileCheck %s -check-prefixes=ALL,64,64DF +; RUN: llc -mtriple=mips64el -mcpu=mips4 -mattr=+single-float < %s \ +; RUN: | FileCheck %s -check-prefixes=ALL,64,64SF + +; RUN: llc -mtriple=mips64el -mcpu=mips64 < %s | FileCheck %s -check-prefixes=ALL,64,64DF +; RUN: llc -mtriple=mips64el -mcpu=mips64 -mattr=+single-float < %s \ +; RUN: | FileCheck %s -check-prefixes=ALL,64,64SF + +; Test various combinations of 32/64bit GP registers and single/double floating point support. @i1 = global [3 x i32] [i32 1, i32 2, i32 3], align 4 @i3 = common global ptr null, align 4 -; 32-LABEL: test_float_int_: -; 32: mtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] -; 32: cvt.s.w $f{{[0-9]+}}, $f[[R0]] +; ALL-LABEL: test_float_int_: +; 32: mtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] +; 32: cvt.s.w $f{{[0-9]+}}, $f[[R0]] +; 64: mtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] +; 64: cvt.s.w $f{{[0-9]+}}, $f[[R0]] define float @test_float_int_(i32 %a) { entry: @@ -15,12 +26,13 @@ entry: ret float %conv } -; 32-LABEL: test_double_int_: -; 32: mtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] -; 32: cvt.d.w $f{{[0-9]+}}, $f[[R0]] -; 64-LABEL: test_double_int_: -; 64: mtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] -; 64: cvt.d.w $f{{[0-9]+}}, $f[[R0]] +; ALL-LABEL: test_double_int_: +; 32DF: mtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] +; 32DF: cvt.d.w $f{{[0-9]+}}, $f[[R0]] +; 32SF: jal __floatsidf +; 64DF: mtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] +; 64DF: cvt.d.w $f{{[0-9]+}}, $f[[R0]] +; 64SF: jal __floatsidf define double @test_double_int_(i32 %a) { entry: @@ -28,9 +40,11 @@ entry: ret double %conv } -; 64-LABEL: test_float_LL_: -; 64: dmtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] -; 64: cvt.s.l $f{{[0-9]+}}, $f[[R0]] +; ALL-LABEL: test_float_LL_: +; 32: jal __floatdisf +; 64DF: dmtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] +; 64DF: cvt.s.l $f{{[0-9]+}}, $f[[R0]] +; 64SF: jal __floatdisf define float @test_float_LL_(i64 %a) { entry: @@ -38,9 +52,11 @@ entry: ret float %conv } -; 64-LABEL: test_double_LL_: -; 64: dmtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] -; 64: cvt.d.l $f{{[0-9]+}}, $f[[R0]] +; ALL-LABEL: test_double_LL_: +; 32: jal __floatdidf +; 64DF: dmtc1 ${{[0-9]+}}, $f[[R0:[0-9]+]] +; 64DF: cvt.d.l $f{{[0-9]+}}, $f[[R0]] +; 64SF: jal __floatdidf define double @test_double_LL_(i64 %a) { entry: diff --git a/llvm/test/CodeGen/NVPTX/combine-wide.ll b/llvm/test/CodeGen/NVPTX/combine-wide.ll index b5948d37c3505..63e0f3789f49f 100644 --- a/llvm/test/CodeGen/NVPTX/combine-wide.ll +++ b/llvm/test/CodeGen/NVPTX/combine-wide.ll @@ -1,24 +1,37 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -O1 | 
FileCheck %s --check-prefixes=CHECK,O1 +; RUN: llc < %s -O1 | FileCheck %s --check-prefixes=CHECK,O1,O1-NO-MAD +; RUN: llc < %s -O1 -nvptx-mad-wide-opt | FileCheck %s --check-prefixes=CHECK,O1,O1-MAD ; RUN: llc < %s -O0 | FileCheck %s --check-prefixes=CHECK,O0 target triple = "nvptx64-nvidia-cuda" define i64 @t1(i32 %a, i32 %b, i64 %c) { -; -; O1-LABEL: t1( -; O1: { -; O1-NEXT: .reg .b32 %r<3>; -; O1-NEXT: .reg .b64 %rd<4>; -; O1-EMPTY: -; O1-NEXT: // %bb.0: -; O1-NEXT: ld.param.b32 %r1, [t1_param_0]; -; O1-NEXT: ld.param.b32 %r2, [t1_param_1]; -; O1-NEXT: mul.wide.s32 %rd1, %r1, %r2; -; O1-NEXT: ld.param.b64 %rd2, [t1_param_2]; -; O1-NEXT: add.s64 %rd3, %rd2, %rd1; -; O1-NEXT: st.param.b64 [func_retval0], %rd3; -; O1-NEXT: ret; +; O1-NO-MAD-LABEL: t1( +; O1-NO-MAD: { +; O1-NO-MAD-NEXT: .reg .b32 %r<3>; +; O1-NO-MAD-NEXT: .reg .b64 %rd<4>; +; O1-NO-MAD-EMPTY: +; O1-NO-MAD-NEXT: // %bb.0: +; O1-NO-MAD-NEXT: ld.param.b32 %r1, [t1_param_0]; +; O1-NO-MAD-NEXT: ld.param.b32 %r2, [t1_param_1]; +; O1-NO-MAD-NEXT: mul.wide.s32 %rd1, %r1, %r2; +; O1-NO-MAD-NEXT: ld.param.b64 %rd2, [t1_param_2]; +; O1-NO-MAD-NEXT: add.s64 %rd3, %rd2, %rd1; +; O1-NO-MAD-NEXT: st.param.b64 [func_retval0], %rd3; +; O1-NO-MAD-NEXT: ret; +; +; O1-MAD-LABEL: t1( +; O1-MAD: { +; O1-MAD-NEXT: .reg .b32 %r<3>; +; O1-MAD-NEXT: .reg .b64 %rd<3>; +; O1-MAD-EMPTY: +; O1-MAD-NEXT: // %bb.0: +; O1-MAD-NEXT: ld.param.b32 %r1, [t1_param_0]; +; O1-MAD-NEXT: ld.param.b32 %r2, [t1_param_1]; +; O1-MAD-NEXT: ld.param.b64 %rd1, [t1_param_2]; +; O1-MAD-NEXT: mad.wide.s32 %rd2, %r1, %r2, %rd1; +; O1-MAD-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-MAD-NEXT: ret; ; ; O0-LABEL: t1( ; O0: { @@ -41,20 +54,32 @@ define i64 @t1(i32 %a, i32 %b, i64 %c) { } define i64 @t2(i32 %a, i32 %b, i64 %c) { -; -; O1-LABEL: t2( -; O1: { -; O1-NEXT: .reg .b32 %r<3>; -; O1-NEXT: .reg .b64 %rd<4>; -; O1-EMPTY: -; O1-NEXT: // %bb.0: -; O1-NEXT: ld.param.b32 %r1, [t2_param_0]; -; O1-NEXT: ld.param.b32 %r2, [t2_param_1]; -; O1-NEXT: mul.wide.s32 %rd1, %r1, %r2; -; O1-NEXT: ld.param.b64 %rd2, [t2_param_2]; -; O1-NEXT: add.s64 %rd3, %rd1, %rd2; -; O1-NEXT: st.param.b64 [func_retval0], %rd3; -; O1-NEXT: ret; +; O1-NO-MAD-LABEL: t2( +; O1-NO-MAD: { +; O1-NO-MAD-NEXT: .reg .b32 %r<3>; +; O1-NO-MAD-NEXT: .reg .b64 %rd<4>; +; O1-NO-MAD-EMPTY: +; O1-NO-MAD-NEXT: // %bb.0: +; O1-NO-MAD-NEXT: ld.param.b32 %r1, [t2_param_0]; +; O1-NO-MAD-NEXT: ld.param.b32 %r2, [t2_param_1]; +; O1-NO-MAD-NEXT: mul.wide.s32 %rd1, %r1, %r2; +; O1-NO-MAD-NEXT: ld.param.b64 %rd2, [t2_param_2]; +; O1-NO-MAD-NEXT: add.s64 %rd3, %rd1, %rd2; +; O1-NO-MAD-NEXT: st.param.b64 [func_retval0], %rd3; +; O1-NO-MAD-NEXT: ret; +; +; O1-MAD-LABEL: t2( +; O1-MAD: { +; O1-MAD-NEXT: .reg .b32 %r<3>; +; O1-MAD-NEXT: .reg .b64 %rd<3>; +; O1-MAD-EMPTY: +; O1-MAD-NEXT: // %bb.0: +; O1-MAD-NEXT: ld.param.b32 %r1, [t2_param_0]; +; O1-MAD-NEXT: ld.param.b32 %r2, [t2_param_1]; +; O1-MAD-NEXT: ld.param.b64 %rd1, [t2_param_2]; +; O1-MAD-NEXT: mad.wide.s32 %rd2, %r1, %r2, %rd1; +; O1-MAD-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-MAD-NEXT: ret; ; ; O0-LABEL: t2( ; O0: { @@ -77,19 +102,30 @@ define i64 @t2(i32 %a, i32 %b, i64 %c) { } define i64 @t3(i32 %a, i32 %b) { -; -; O1-LABEL: t3( -; O1: { -; O1-NEXT: .reg .b32 %r<3>; -; O1-NEXT: .reg .b64 %rd<3>; -; O1-EMPTY: -; O1-NEXT: // %bb.0: -; O1-NEXT: ld.param.b32 %r1, [t3_param_0]; -; O1-NEXT: ld.param.b32 %r2, [t3_param_1]; -; O1-NEXT: mul.wide.s32 %rd1, %r1, %r2; -; O1-NEXT: add.s64 %rd2, %rd1, 1; -; O1-NEXT: st.param.b64 [func_retval0], %rd2; -; O1-NEXT: ret; +; 
O1-NO-MAD-LABEL: t3( +; O1-NO-MAD: { +; O1-NO-MAD-NEXT: .reg .b32 %r<3>; +; O1-NO-MAD-NEXT: .reg .b64 %rd<3>; +; O1-NO-MAD-EMPTY: +; O1-NO-MAD-NEXT: // %bb.0: +; O1-NO-MAD-NEXT: ld.param.b32 %r1, [t3_param_0]; +; O1-NO-MAD-NEXT: ld.param.b32 %r2, [t3_param_1]; +; O1-NO-MAD-NEXT: mul.wide.s32 %rd1, %r1, %r2; +; O1-NO-MAD-NEXT: add.s64 %rd2, %rd1, 1; +; O1-NO-MAD-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NO-MAD-NEXT: ret; +; +; O1-MAD-LABEL: t3( +; O1-MAD: { +; O1-MAD-NEXT: .reg .b32 %r<3>; +; O1-MAD-NEXT: .reg .b64 %rd<2>; +; O1-MAD-EMPTY: +; O1-MAD-NEXT: // %bb.0: +; O1-MAD-NEXT: ld.param.b32 %r1, [t3_param_0]; +; O1-MAD-NEXT: ld.param.b32 %r2, [t3_param_1]; +; O1-MAD-NEXT: mad.wide.s32 %rd1, %r1, %r2, 1; +; O1-MAD-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-MAD-NEXT: ret; ; ; O0-LABEL: t3( ; O0: { @@ -111,19 +147,30 @@ define i64 @t3(i32 %a, i32 %b) { } define i64 @t4(i32 %a, i64 %c) { -; -; O1-LABEL: t4( -; O1: { -; O1-NEXT: .reg .b32 %r<2>; -; O1-NEXT: .reg .b64 %rd<4>; -; O1-EMPTY: -; O1-NEXT: // %bb.0: -; O1-NEXT: ld.param.b32 %r1, [t4_param_0]; -; O1-NEXT: ld.param.b64 %rd1, [t4_param_1]; -; O1-NEXT: mul.wide.s32 %rd2, %r1, 3; -; O1-NEXT: add.s64 %rd3, %rd1, %rd2; -; O1-NEXT: st.param.b64 [func_retval0], %rd3; -; O1-NEXT: ret; +; O1-NO-MAD-LABEL: t4( +; O1-NO-MAD: { +; O1-NO-MAD-NEXT: .reg .b32 %r<2>; +; O1-NO-MAD-NEXT: .reg .b64 %rd<4>; +; O1-NO-MAD-EMPTY: +; O1-NO-MAD-NEXT: // %bb.0: +; O1-NO-MAD-NEXT: ld.param.b32 %r1, [t4_param_0]; +; O1-NO-MAD-NEXT: ld.param.b64 %rd1, [t4_param_1]; +; O1-NO-MAD-NEXT: mul.wide.s32 %rd2, %r1, 3; +; O1-NO-MAD-NEXT: add.s64 %rd3, %rd1, %rd2; +; O1-NO-MAD-NEXT: st.param.b64 [func_retval0], %rd3; +; O1-NO-MAD-NEXT: ret; +; +; O1-MAD-LABEL: t4( +; O1-MAD: { +; O1-MAD-NEXT: .reg .b32 %r<2>; +; O1-MAD-NEXT: .reg .b64 %rd<3>; +; O1-MAD-EMPTY: +; O1-MAD-NEXT: // %bb.0: +; O1-MAD-NEXT: ld.param.b32 %r1, [t4_param_0]; +; O1-MAD-NEXT: ld.param.b64 %rd1, [t4_param_1]; +; O1-MAD-NEXT: mad.wide.s32 %rd2, %r1, 3, %rd1; +; O1-MAD-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-MAD-NEXT: ret; ; ; O0-LABEL: t4( ; O0: { @@ -145,18 +192,28 @@ define i64 @t4(i32 %a, i64 %c) { } define i64 @t4_1(i32 %a, i64 %c) { -; -; O1-LABEL: t4_1( -; O1: { -; O1-NEXT: .reg .b32 %r<2>; -; O1-NEXT: .reg .b64 %rd<3>; -; O1-EMPTY: -; O1-NEXT: // %bb.0: -; O1-NEXT: ld.param.b32 %r1, [t4_1_param_0]; -; O1-NEXT: mul.wide.s32 %rd1, %r1, 3; -; O1-NEXT: add.s64 %rd2, %rd1, 5; -; O1-NEXT: st.param.b64 [func_retval0], %rd2; -; O1-NEXT: ret; +; O1-NO-MAD-LABEL: t4_1( +; O1-NO-MAD: { +; O1-NO-MAD-NEXT: .reg .b32 %r<2>; +; O1-NO-MAD-NEXT: .reg .b64 %rd<3>; +; O1-NO-MAD-EMPTY: +; O1-NO-MAD-NEXT: // %bb.0: +; O1-NO-MAD-NEXT: ld.param.b32 %r1, [t4_1_param_0]; +; O1-NO-MAD-NEXT: mul.wide.s32 %rd1, %r1, 3; +; O1-NO-MAD-NEXT: add.s64 %rd2, %rd1, 5; +; O1-NO-MAD-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-NO-MAD-NEXT: ret; +; +; O1-MAD-LABEL: t4_1( +; O1-MAD: { +; O1-MAD-NEXT: .reg .b32 %r<2>; +; O1-MAD-NEXT: .reg .b64 %rd<2>; +; O1-MAD-EMPTY: +; O1-MAD-NEXT: // %bb.0: +; O1-MAD-NEXT: ld.param.b32 %r1, [t4_1_param_0]; +; O1-MAD-NEXT: mad.wide.s32 %rd1, %r1, 3, 5; +; O1-MAD-NEXT: st.param.b64 [func_retval0], %rd1; +; O1-MAD-NEXT: ret; ; ; O0-LABEL: t4_1( ; O0: { @@ -177,20 +234,32 @@ define i64 @t4_1(i32 %a, i64 %c) { } define i64 @t5(i32 %a, i32 %b, i64 %c) { -; -; O1-LABEL: t5( -; O1: { -; O1-NEXT: .reg .b32 %r<3>; -; O1-NEXT: .reg .b64 %rd<4>; -; O1-EMPTY: -; O1-NEXT: // %bb.0: -; O1-NEXT: ld.param.b32 %r1, [t5_param_0]; -; O1-NEXT: ld.param.b32 %r2, [t5_param_1]; -; O1-NEXT: mul.wide.u32 
%rd1, %r1, %r2; -; O1-NEXT: ld.param.b64 %rd2, [t5_param_2]; -; O1-NEXT: add.s64 %rd3, %rd2, %rd1; -; O1-NEXT: st.param.b64 [func_retval0], %rd3; -; O1-NEXT: ret; +; O1-NO-MAD-LABEL: t5( +; O1-NO-MAD: { +; O1-NO-MAD-NEXT: .reg .b32 %r<3>; +; O1-NO-MAD-NEXT: .reg .b64 %rd<4>; +; O1-NO-MAD-EMPTY: +; O1-NO-MAD-NEXT: // %bb.0: +; O1-NO-MAD-NEXT: ld.param.b32 %r1, [t5_param_0]; +; O1-NO-MAD-NEXT: ld.param.b32 %r2, [t5_param_1]; +; O1-NO-MAD-NEXT: mul.wide.u32 %rd1, %r1, %r2; +; O1-NO-MAD-NEXT: ld.param.b64 %rd2, [t5_param_2]; +; O1-NO-MAD-NEXT: add.s64 %rd3, %rd2, %rd1; +; O1-NO-MAD-NEXT: st.param.b64 [func_retval0], %rd3; +; O1-NO-MAD-NEXT: ret; +; +; O1-MAD-LABEL: t5( +; O1-MAD: { +; O1-MAD-NEXT: .reg .b32 %r<3>; +; O1-MAD-NEXT: .reg .b64 %rd<3>; +; O1-MAD-EMPTY: +; O1-MAD-NEXT: // %bb.0: +; O1-MAD-NEXT: ld.param.b32 %r1, [t5_param_0]; +; O1-MAD-NEXT: ld.param.b32 %r2, [t5_param_1]; +; O1-MAD-NEXT: ld.param.b64 %rd1, [t5_param_2]; +; O1-MAD-NEXT: mad.wide.u32 %rd2, %r1, %r2, %rd1; +; O1-MAD-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-MAD-NEXT: ret; ; ; O0-LABEL: t5( ; O0: { @@ -213,20 +282,32 @@ define i64 @t5(i32 %a, i32 %b, i64 %c) { } define i64 @t6(i32 %a, i32 %b, i64 %c) { -; -; O1-LABEL: t6( -; O1: { -; O1-NEXT: .reg .b32 %r<3>; -; O1-NEXT: .reg .b64 %rd<4>; -; O1-EMPTY: -; O1-NEXT: // %bb.0: -; O1-NEXT: ld.param.b32 %r1, [t6_param_0]; -; O1-NEXT: ld.param.b32 %r2, [t6_param_1]; -; O1-NEXT: mul.wide.u32 %rd1, %r1, %r2; -; O1-NEXT: ld.param.b64 %rd2, [t6_param_2]; -; O1-NEXT: add.s64 %rd3, %rd1, %rd2; -; O1-NEXT: st.param.b64 [func_retval0], %rd3; -; O1-NEXT: ret; +; O1-NO-MAD-LABEL: t6( +; O1-NO-MAD: { +; O1-NO-MAD-NEXT: .reg .b32 %r<3>; +; O1-NO-MAD-NEXT: .reg .b64 %rd<4>; +; O1-NO-MAD-EMPTY: +; O1-NO-MAD-NEXT: // %bb.0: +; O1-NO-MAD-NEXT: ld.param.b32 %r1, [t6_param_0]; +; O1-NO-MAD-NEXT: ld.param.b32 %r2, [t6_param_1]; +; O1-NO-MAD-NEXT: mul.wide.u32 %rd1, %r1, %r2; +; O1-NO-MAD-NEXT: ld.param.b64 %rd2, [t6_param_2]; +; O1-NO-MAD-NEXT: add.s64 %rd3, %rd1, %rd2; +; O1-NO-MAD-NEXT: st.param.b64 [func_retval0], %rd3; +; O1-NO-MAD-NEXT: ret; +; +; O1-MAD-LABEL: t6( +; O1-MAD: { +; O1-MAD-NEXT: .reg .b32 %r<3>; +; O1-MAD-NEXT: .reg .b64 %rd<3>; +; O1-MAD-EMPTY: +; O1-MAD-NEXT: // %bb.0: +; O1-MAD-NEXT: ld.param.b32 %r1, [t6_param_0]; +; O1-MAD-NEXT: ld.param.b32 %r2, [t6_param_1]; +; O1-MAD-NEXT: ld.param.b64 %rd1, [t6_param_2]; +; O1-MAD-NEXT: mad.wide.u32 %rd2, %r1, %r2, %rd1; +; O1-MAD-NEXT: st.param.b64 [func_retval0], %rd2; +; O1-MAD-NEXT: ret; ; ; O0-LABEL: t6( ; O0: { @@ -249,7 +330,6 @@ define i64 @t6(i32 %a, i32 %b, i64 %c) { } define i32 @t7(i16 %a, i16 %b) { -; ; O1-LABEL: t7( ; O1: { ; O1-NEXT: .reg .b16 %rs<4>; @@ -281,7 +361,6 @@ define i32 @t7(i16 %a, i16 %b) { } define i32 @t8(i16 %a, i16 %b) { -; ; O1-LABEL: t8( ; O1: { ; O1-NEXT: .reg .b16 %rs<4>; @@ -313,7 +392,6 @@ define i32 @t8(i16 %a, i16 %b) { } define i64 @t9(i32 %a, i32 %b) { -; ; O1-LABEL: t9( ; O1: { ; O1-NEXT: .reg .b32 %r<4>; @@ -345,7 +423,6 @@ define i64 @t9(i32 %a, i32 %b) { } define i64 @t10(i32 %a, i32 %b) { -; ; O1-LABEL: t10( ; O1: { ; O1-NEXT: .reg .b32 %r<4>; @@ -377,7 +454,6 @@ define i64 @t10(i32 %a, i32 %b) { } define i32 @t11(i16 %a, i16 %b) { -; ; O1-LABEL: t11( ; O1: { ; O1-NEXT: .reg .b16 %rs<4>; @@ -409,7 +485,6 @@ define i32 @t11(i16 %a, i16 %b) { } define i32 @t12(i16 %a, i16 %b) { -; ; O1-LABEL: t12( ; O1: { ; O1-NEXT: .reg .b16 %rs<3>; @@ -440,7 +515,6 @@ define i32 @t12(i16 %a, i16 %b) { } define i64 @t13(i32 %a, i32 %b) { -; ; O1-LABEL: t13( ; O1: { ; O1-NEXT: .reg .b32 %r<4>; @@ 
-472,7 +546,6 @@ define i64 @t13(i32 %a, i32 %b) { } define i64 @t14(i32 %a, i32 %b) { -; ; O1-LABEL: t14( ; O1: { ; O1-NEXT: .reg .b32 %r<3>; @@ -503,7 +576,6 @@ define i64 @t14(i32 %a, i32 %b) { } define i32 @t15(i16 %a, i16 %b) { -; ; O1-LABEL: t15( ; O1: { ; O1-NEXT: .reg .b16 %rs<3>; @@ -534,7 +606,6 @@ define i32 @t15(i16 %a, i16 %b) { } define i32 @t16(i16 %a, i16 %b) { -; ; O1-LABEL: t16( ; O1: { ; O1-NEXT: .reg .b16 %rs<4>; @@ -566,7 +637,6 @@ define i32 @t16(i16 %a, i16 %b) { } define i64 @t17(i32 %a, i32 %b) { -; ; O1-LABEL: t17( ; O1: { ; O1-NEXT: .reg .b32 %r<3>; @@ -597,7 +667,6 @@ define i64 @t17(i32 %a, i32 %b) { } define i64 @t18(i32 %a, i32 %b) { -; ; O1-LABEL: t18( ; O1: { ; O1-NEXT: .reg .b32 %r<4>; @@ -629,7 +698,6 @@ define i64 @t18(i32 %a, i32 %b) { } define i32 @t19(i16 %a, i16 %b) { -; ; O1-LABEL: t19( ; O1: { ; O1-NEXT: .reg .b16 %rs<4>; @@ -661,7 +729,6 @@ define i32 @t19(i16 %a, i16 %b) { } define i32 @t20(i16 %a) { -; ; CHECK-LABEL: t20( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<3>; @@ -679,7 +746,6 @@ define i32 @t20(i16 %a) { } define i64 @t21(i32 %a) { -; ; CHECK-LABEL: t21( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; @@ -697,7 +763,6 @@ define i64 @t21(i32 %a) { } define i64 @t22(i32 %a) { -; ; CHECK-LABEL: t22( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; @@ -715,7 +780,6 @@ define i64 @t22(i32 %a) { } define i32 @t23(i16 %a, i16 %b) { -; ; CHECK-LABEL: t23( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<3>; @@ -733,7 +797,6 @@ define i32 @t23(i16 %a, i16 %b) { } define i32 @t24(i16 %a, i16 %b) { -; ; O1-LABEL: t24( ; O1: { ; O1-NEXT: .reg .b16 %rs<2>; @@ -762,7 +825,6 @@ define i32 @t24(i16 %a, i16 %b) { } define i64 @t25(i32 %a) { -; ; CHECK-LABEL: t25( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; @@ -780,7 +842,6 @@ define i64 @t25(i32 %a) { } define i64 @t26(i32 %a) { -; ; O1-LABEL: t26( ; O1: { ; O1-NEXT: .reg .b32 %r<2>; @@ -809,7 +870,6 @@ define i64 @t26(i32 %a) { } define i32 @t27(i16 %a, i16 %b) { -; ; O1-LABEL: t27( ; O1: { ; O1-NEXT: .reg .b16 %rs<2>; @@ -838,7 +898,6 @@ define i32 @t27(i16 %a, i16 %b) { } define i32 @t28(i16 %a, i16 %b) { -; ; CHECK-LABEL: t28( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<3>; @@ -856,7 +915,6 @@ define i32 @t28(i16 %a, i16 %b) { } define i64 @t29(i32 %a) { -; ; O1-LABEL: t29( ; O1: { ; O1-NEXT: .reg .b32 %r<2>; @@ -885,7 +943,6 @@ define i64 @t29(i32 %a) { } define i64 @t30(i32 %a) { -; ; CHECK-LABEL: t30( ; CHECK: { ; CHECK-NEXT: .reg .b32 %r<3>; @@ -903,7 +960,6 @@ define i64 @t30(i32 %a) { } define i64 @t31(i32 %a, i32 %b) { -; ; O1-LABEL: t31( ; O1: { ; O1-NEXT: .reg .b32 %r<4>; @@ -935,20 +991,32 @@ define i64 @t31(i32 %a, i32 %b) { } define i32 @t32(i16 %a, i16 %b, i32 %c) { -; -; O1-LABEL: t32( -; O1: { -; O1-NEXT: .reg .b16 %rs<3>; -; O1-NEXT: .reg .b32 %r<4>; -; O1-EMPTY: -; O1-NEXT: // %bb.0: -; O1-NEXT: ld.param.b16 %rs1, [t32_param_0]; -; O1-NEXT: ld.param.b16 %rs2, [t32_param_1]; -; O1-NEXT: mul.wide.s16 %r1, %rs1, %rs2; -; O1-NEXT: ld.param.b32 %r2, [t32_param_2]; -; O1-NEXT: add.s32 %r3, %r2, %r1; -; O1-NEXT: st.param.b32 [func_retval0], %r3; -; O1-NEXT: ret; +; O1-NO-MAD-LABEL: t32( +; O1-NO-MAD: { +; O1-NO-MAD-NEXT: .reg .b16 %rs<3>; +; O1-NO-MAD-NEXT: .reg .b32 %r<4>; +; O1-NO-MAD-EMPTY: +; O1-NO-MAD-NEXT: // %bb.0: +; O1-NO-MAD-NEXT: ld.param.b16 %rs1, [t32_param_0]; +; O1-NO-MAD-NEXT: ld.param.b16 %rs2, [t32_param_1]; +; O1-NO-MAD-NEXT: mul.wide.s16 %r1, %rs1, %rs2; +; O1-NO-MAD-NEXT: ld.param.b32 %r2, [t32_param_2]; +; O1-NO-MAD-NEXT: add.s32 %r3, %r2, %r1; +; O1-NO-MAD-NEXT: st.param.b32 [func_retval0], %r3; +; 
O1-NO-MAD-NEXT: ret; +; +; O1-MAD-LABEL: t32( +; O1-MAD: { +; O1-MAD-NEXT: .reg .b16 %rs<3>; +; O1-MAD-NEXT: .reg .b32 %r<3>; +; O1-MAD-EMPTY: +; O1-MAD-NEXT: // %bb.0: +; O1-MAD-NEXT: ld.param.b16 %rs1, [t32_param_0]; +; O1-MAD-NEXT: ld.param.b16 %rs2, [t32_param_1]; +; O1-MAD-NEXT: ld.param.b32 %r1, [t32_param_2]; +; O1-MAD-NEXT: mad.wide.s16 %r2, %rs1, %rs2, %r1; +; O1-MAD-NEXT: st.param.b32 [func_retval0], %r2; +; O1-MAD-NEXT: ret; ; ; O0-LABEL: t32( ; O0: { @@ -971,20 +1039,32 @@ define i32 @t32(i16 %a, i16 %b, i32 %c) { } define i32 @t33(i16 %a, i16 %b, i32 %c) { -; -; O1-LABEL: t33( -; O1: { -; O1-NEXT: .reg .b16 %rs<3>; -; O1-NEXT: .reg .b32 %r<4>; -; O1-EMPTY: -; O1-NEXT: // %bb.0: -; O1-NEXT: ld.param.b16 %rs1, [t33_param_0]; -; O1-NEXT: ld.param.b16 %rs2, [t33_param_1]; -; O1-NEXT: mul.wide.s16 %r1, %rs1, %rs2; -; O1-NEXT: ld.param.b32 %r2, [t33_param_2]; -; O1-NEXT: add.s32 %r3, %r2, %r1; -; O1-NEXT: st.param.b32 [func_retval0], %r3; -; O1-NEXT: ret; +; O1-NO-MAD-LABEL: t33( +; O1-NO-MAD: { +; O1-NO-MAD-NEXT: .reg .b16 %rs<3>; +; O1-NO-MAD-NEXT: .reg .b32 %r<4>; +; O1-NO-MAD-EMPTY: +; O1-NO-MAD-NEXT: // %bb.0: +; O1-NO-MAD-NEXT: ld.param.b16 %rs1, [t33_param_0]; +; O1-NO-MAD-NEXT: ld.param.b16 %rs2, [t33_param_1]; +; O1-NO-MAD-NEXT: mul.wide.s16 %r1, %rs1, %rs2; +; O1-NO-MAD-NEXT: ld.param.b32 %r2, [t33_param_2]; +; O1-NO-MAD-NEXT: add.s32 %r3, %r2, %r1; +; O1-NO-MAD-NEXT: st.param.b32 [func_retval0], %r3; +; O1-NO-MAD-NEXT: ret; +; +; O1-MAD-LABEL: t33( +; O1-MAD: { +; O1-MAD-NEXT: .reg .b16 %rs<3>; +; O1-MAD-NEXT: .reg .b32 %r<3>; +; O1-MAD-EMPTY: +; O1-MAD-NEXT: // %bb.0: +; O1-MAD-NEXT: ld.param.b16 %rs1, [t33_param_0]; +; O1-MAD-NEXT: ld.param.b16 %rs2, [t33_param_1]; +; O1-MAD-NEXT: ld.param.b32 %r1, [t33_param_2]; +; O1-MAD-NEXT: mad.wide.s16 %r2, %rs1, %rs2, %r1; +; O1-MAD-NEXT: st.param.b32 [func_retval0], %r2; +; O1-MAD-NEXT: ret; ; ; O0-LABEL: t33( ; O0: { @@ -1007,19 +1087,30 @@ define i32 @t33(i16 %a, i16 %b, i32 %c) { } define i32 @t34(i16 %a, i16 %b) { -; -; O1-LABEL: t34( -; O1: { -; O1-NEXT: .reg .b16 %rs<3>; -; O1-NEXT: .reg .b32 %r<3>; -; O1-EMPTY: -; O1-NEXT: // %bb.0: -; O1-NEXT: ld.param.b16 %rs1, [t34_param_0]; -; O1-NEXT: ld.param.b16 %rs2, [t34_param_1]; -; O1-NEXT: mul.wide.s16 %r1, %rs1, %rs2; -; O1-NEXT: add.s32 %r2, %r1, 1; -; O1-NEXT: st.param.b32 [func_retval0], %r2; -; O1-NEXT: ret; +; O1-NO-MAD-LABEL: t34( +; O1-NO-MAD: { +; O1-NO-MAD-NEXT: .reg .b16 %rs<3>; +; O1-NO-MAD-NEXT: .reg .b32 %r<3>; +; O1-NO-MAD-EMPTY: +; O1-NO-MAD-NEXT: // %bb.0: +; O1-NO-MAD-NEXT: ld.param.b16 %rs1, [t34_param_0]; +; O1-NO-MAD-NEXT: ld.param.b16 %rs2, [t34_param_1]; +; O1-NO-MAD-NEXT: mul.wide.s16 %r1, %rs1, %rs2; +; O1-NO-MAD-NEXT: add.s32 %r2, %r1, 1; +; O1-NO-MAD-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NO-MAD-NEXT: ret; +; +; O1-MAD-LABEL: t34( +; O1-MAD: { +; O1-MAD-NEXT: .reg .b16 %rs<3>; +; O1-MAD-NEXT: .reg .b32 %r<2>; +; O1-MAD-EMPTY: +; O1-MAD-NEXT: // %bb.0: +; O1-MAD-NEXT: ld.param.b16 %rs1, [t34_param_0]; +; O1-MAD-NEXT: ld.param.b16 %rs2, [t34_param_1]; +; O1-MAD-NEXT: mad.wide.s16 %r1, %rs1, %rs2, 1; +; O1-MAD-NEXT: st.param.b32 [func_retval0], %r1; +; O1-MAD-NEXT: ret; ; ; O0-LABEL: t34( ; O0: { @@ -1041,19 +1132,30 @@ define i32 @t34(i16 %a, i16 %b) { } define i32 @t35(i16 %a, i32 %c) { -; -; O1-LABEL: t35( -; O1: { -; O1-NEXT: .reg .b16 %rs<2>; -; O1-NEXT: .reg .b32 %r<4>; -; O1-EMPTY: -; O1-NEXT: // %bb.0: -; O1-NEXT: ld.param.b16 %rs1, [t35_param_0]; -; O1-NEXT: ld.param.b32 %r1, [t35_param_1]; -; O1-NEXT: mul.wide.s16 %r2, %rs1, 3; -; 
O1-NEXT: add.s32 %r3, %r1, %r2; -; O1-NEXT: st.param.b32 [func_retval0], %r3; -; O1-NEXT: ret; +; O1-NO-MAD-LABEL: t35( +; O1-NO-MAD: { +; O1-NO-MAD-NEXT: .reg .b16 %rs<2>; +; O1-NO-MAD-NEXT: .reg .b32 %r<4>; +; O1-NO-MAD-EMPTY: +; O1-NO-MAD-NEXT: // %bb.0: +; O1-NO-MAD-NEXT: ld.param.b16 %rs1, [t35_param_0]; +; O1-NO-MAD-NEXT: ld.param.b32 %r1, [t35_param_1]; +; O1-NO-MAD-NEXT: mul.wide.s16 %r2, %rs1, 3; +; O1-NO-MAD-NEXT: add.s32 %r3, %r1, %r2; +; O1-NO-MAD-NEXT: st.param.b32 [func_retval0], %r3; +; O1-NO-MAD-NEXT: ret; +; +; O1-MAD-LABEL: t35( +; O1-MAD: { +; O1-MAD-NEXT: .reg .b16 %rs<2>; +; O1-MAD-NEXT: .reg .b32 %r<3>; +; O1-MAD-EMPTY: +; O1-MAD-NEXT: // %bb.0: +; O1-MAD-NEXT: ld.param.b16 %rs1, [t35_param_0]; +; O1-MAD-NEXT: ld.param.b32 %r1, [t35_param_1]; +; O1-MAD-NEXT: mad.wide.s16 %r2, %rs1, 3, %r1; +; O1-MAD-NEXT: st.param.b32 [func_retval0], %r2; +; O1-MAD-NEXT: ret; ; ; O0-LABEL: t35( ; O0: { @@ -1075,18 +1177,28 @@ define i32 @t35(i16 %a, i32 %c) { } define i32 @t36(i16 %a, i32 %c) { -; -; O1-LABEL: t36( -; O1: { -; O1-NEXT: .reg .b16 %rs<2>; -; O1-NEXT: .reg .b32 %r<3>; -; O1-EMPTY: -; O1-NEXT: // %bb.0: -; O1-NEXT: ld.param.b16 %rs1, [t36_param_0]; -; O1-NEXT: mul.wide.s16 %r1, %rs1, 3; -; O1-NEXT: add.s32 %r2, %r1, 5; -; O1-NEXT: st.param.b32 [func_retval0], %r2; -; O1-NEXT: ret; +; O1-NO-MAD-LABEL: t36( +; O1-NO-MAD: { +; O1-NO-MAD-NEXT: .reg .b16 %rs<2>; +; O1-NO-MAD-NEXT: .reg .b32 %r<3>; +; O1-NO-MAD-EMPTY: +; O1-NO-MAD-NEXT: // %bb.0: +; O1-NO-MAD-NEXT: ld.param.b16 %rs1, [t36_param_0]; +; O1-NO-MAD-NEXT: mul.wide.s16 %r1, %rs1, 3; +; O1-NO-MAD-NEXT: add.s32 %r2, %r1, 5; +; O1-NO-MAD-NEXT: st.param.b32 [func_retval0], %r2; +; O1-NO-MAD-NEXT: ret; +; +; O1-MAD-LABEL: t36( +; O1-MAD: { +; O1-MAD-NEXT: .reg .b16 %rs<2>; +; O1-MAD-NEXT: .reg .b32 %r<2>; +; O1-MAD-EMPTY: +; O1-MAD-NEXT: // %bb.0: +; O1-MAD-NEXT: ld.param.b16 %rs1, [t36_param_0]; +; O1-MAD-NEXT: mad.wide.s16 %r1, %rs1, 3, 5; +; O1-MAD-NEXT: st.param.b32 [func_retval0], %r1; +; O1-MAD-NEXT: ret; ; ; O0-LABEL: t36( ; O0: { @@ -1107,20 +1219,32 @@ define i32 @t36(i16 %a, i32 %c) { } define i32 @t37(i16 %a, i16 %b, i32 %c) { -; -; O1-LABEL: t37( -; O1: { -; O1-NEXT: .reg .b16 %rs<3>; -; O1-NEXT: .reg .b32 %r<4>; -; O1-EMPTY: -; O1-NEXT: // %bb.0: -; O1-NEXT: ld.param.b16 %rs1, [t37_param_0]; -; O1-NEXT: ld.param.b16 %rs2, [t37_param_1]; -; O1-NEXT: mul.wide.u16 %r1, %rs1, %rs2; -; O1-NEXT: ld.param.b32 %r2, [t37_param_2]; -; O1-NEXT: add.s32 %r3, %r2, %r1; -; O1-NEXT: st.param.b32 [func_retval0], %r3; -; O1-NEXT: ret; +; O1-NO-MAD-LABEL: t37( +; O1-NO-MAD: { +; O1-NO-MAD-NEXT: .reg .b16 %rs<3>; +; O1-NO-MAD-NEXT: .reg .b32 %r<4>; +; O1-NO-MAD-EMPTY: +; O1-NO-MAD-NEXT: // %bb.0: +; O1-NO-MAD-NEXT: ld.param.b16 %rs1, [t37_param_0]; +; O1-NO-MAD-NEXT: ld.param.b16 %rs2, [t37_param_1]; +; O1-NO-MAD-NEXT: mul.wide.u16 %r1, %rs1, %rs2; +; O1-NO-MAD-NEXT: ld.param.b32 %r2, [t37_param_2]; +; O1-NO-MAD-NEXT: add.s32 %r3, %r2, %r1; +; O1-NO-MAD-NEXT: st.param.b32 [func_retval0], %r3; +; O1-NO-MAD-NEXT: ret; +; +; O1-MAD-LABEL: t37( +; O1-MAD: { +; O1-MAD-NEXT: .reg .b16 %rs<3>; +; O1-MAD-NEXT: .reg .b32 %r<3>; +; O1-MAD-EMPTY: +; O1-MAD-NEXT: // %bb.0: +; O1-MAD-NEXT: ld.param.b16 %rs1, [t37_param_0]; +; O1-MAD-NEXT: ld.param.b16 %rs2, [t37_param_1]; +; O1-MAD-NEXT: ld.param.b32 %r1, [t37_param_2]; +; O1-MAD-NEXT: mad.wide.u16 %r2, %rs1, %rs2, %r1; +; O1-MAD-NEXT: st.param.b32 [func_retval0], %r2; +; O1-MAD-NEXT: ret; ; ; O0-LABEL: t37( ; O0: { @@ -1143,20 +1267,32 @@ define i32 @t37(i16 %a, i16 %b, i32 %c) { } define 
i32 @t38(i16 %a, i16 %b, i32 %c) { -; -; O1-LABEL: t38( -; O1: { -; O1-NEXT: .reg .b16 %rs<3>; -; O1-NEXT: .reg .b32 %r<4>; -; O1-EMPTY: -; O1-NEXT: // %bb.0: -; O1-NEXT: ld.param.b16 %rs1, [t38_param_0]; -; O1-NEXT: ld.param.b16 %rs2, [t38_param_1]; -; O1-NEXT: mul.wide.u16 %r1, %rs1, %rs2; -; O1-NEXT: ld.param.b32 %r2, [t38_param_2]; -; O1-NEXT: add.s32 %r3, %r1, %r2; -; O1-NEXT: st.param.b32 [func_retval0], %r3; -; O1-NEXT: ret; +; O1-NO-MAD-LABEL: t38( +; O1-NO-MAD: { +; O1-NO-MAD-NEXT: .reg .b16 %rs<3>; +; O1-NO-MAD-NEXT: .reg .b32 %r<4>; +; O1-NO-MAD-EMPTY: +; O1-NO-MAD-NEXT: // %bb.0: +; O1-NO-MAD-NEXT: ld.param.b16 %rs1, [t38_param_0]; +; O1-NO-MAD-NEXT: ld.param.b16 %rs2, [t38_param_1]; +; O1-NO-MAD-NEXT: mul.wide.u16 %r1, %rs1, %rs2; +; O1-NO-MAD-NEXT: ld.param.b32 %r2, [t38_param_2]; +; O1-NO-MAD-NEXT: add.s32 %r3, %r1, %r2; +; O1-NO-MAD-NEXT: st.param.b32 [func_retval0], %r3; +; O1-NO-MAD-NEXT: ret; +; +; O1-MAD-LABEL: t38( +; O1-MAD: { +; O1-MAD-NEXT: .reg .b16 %rs<3>; +; O1-MAD-NEXT: .reg .b32 %r<3>; +; O1-MAD-EMPTY: +; O1-MAD-NEXT: // %bb.0: +; O1-MAD-NEXT: ld.param.b16 %rs1, [t38_param_0]; +; O1-MAD-NEXT: ld.param.b16 %rs2, [t38_param_1]; +; O1-MAD-NEXT: ld.param.b32 %r1, [t38_param_2]; +; O1-MAD-NEXT: mad.wide.u16 %r2, %rs1, %rs2, %r1; +; O1-MAD-NEXT: st.param.b32 [func_retval0], %r2; +; O1-MAD-NEXT: ret; ; ; O0-LABEL: t38( ; O0: { diff --git a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll new file mode 100644 index 0000000000000..18fb87935d17d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_90a -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-SM90A %s +; RUN: %if ptxas-12.7 %{ \ +; RUN: llc < %s -mcpu=sm_90a -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_90a \ +; RUN: %} +; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | FileCheck --check-prefixes=CHECK,CHECK-SM100 %s +; RUN: %if ptxas-12.7 %{ \ +; RUN: llc < %s -mcpu=sm_100 -O0 -disable-post-ra -frame-pointer=all \ +; RUN: -verify-machineinstrs | %ptxas-verify -arch=sm_100 \ +; RUN: %} + +; Test that v2i32 -> v2f32 conversions don't emit bitwise operations on i64. + +target triple = "nvptx64-nvidia-cuda" + +declare <2 x i32> @return_i32x2(i32 %0) + +; Test with v2i32. 
+define ptx_kernel void @store_i32x2(i32 %0, ptr %p) { +; CHECK-SM90A-LABEL: store_i32x2( +; CHECK-SM90A: { +; CHECK-SM90A-NEXT: .reg .b32 %r<6>; +; CHECK-SM90A-NEXT: .reg .b64 %rd<2>; +; CHECK-SM90A-EMPTY: +; CHECK-SM90A-NEXT: // %bb.0: +; CHECK-SM90A-NEXT: ld.param.b64 %rd1, [store_i32x2_param_1]; +; CHECK-SM90A-NEXT: ld.param.b32 %r1, [store_i32x2_param_0]; +; CHECK-SM90A-NEXT: { // callseq 0, 0 +; CHECK-SM90A-NEXT: .param .b32 param0; +; CHECK-SM90A-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-SM90A-NEXT: st.param.b32 [param0], %r1; +; CHECK-SM90A-NEXT: call.uni (retval0), return_i32x2, (param0); +; CHECK-SM90A-NEXT: ld.param.v2.b32 {%r2, %r3}, [retval0]; +; CHECK-SM90A-NEXT: } // callseq 0 +; CHECK-SM90A-NEXT: add.rn.f32 %r4, %r3, %r3; +; CHECK-SM90A-NEXT: add.rn.f32 %r5, %r2, %r2; +; CHECK-SM90A-NEXT: st.v2.b32 [%rd1], {%r5, %r4}; +; CHECK-SM90A-NEXT: ret; +; +; CHECK-SM100-LABEL: store_i32x2( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<2>; +; CHECK-SM100-NEXT: .reg .b64 %rd<4>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b64 %rd1, [store_i32x2_param_1]; +; CHECK-SM100-NEXT: ld.param.b32 %r1, [store_i32x2_param_0]; +; CHECK-SM100-NEXT: { // callseq 0, 0 +; CHECK-SM100-NEXT: .param .b32 param0; +; CHECK-SM100-NEXT: .param .align 8 .b8 retval0[8]; +; CHECK-SM100-NEXT: st.param.b32 [param0], %r1; +; CHECK-SM100-NEXT: call.uni (retval0), return_i32x2, (param0); +; CHECK-SM100-NEXT: ld.param.b64 %rd2, [retval0]; +; CHECK-SM100-NEXT: } // callseq 0 +; CHECK-SM100-NEXT: add.rn.f32x2 %rd3, %rd2, %rd2; +; CHECK-SM100-NEXT: st.b64 [%rd1], %rd3; +; CHECK-SM100-NEXT: ret; + %v = call <2 x i32> @return_i32x2(i32 %0) + %v.f32x2 = bitcast <2 x i32> %v to <2 x float> + %res = fadd <2 x float> %v.f32x2, %v.f32x2 + store <2 x float> %res, ptr %p, align 8 + ret void +} + +; Test with inline ASM returning { <1 x float>, <1 x float> }, which decays to +; v2i32. 
+define ptx_kernel void @inlineasm(ptr %p) { +; CHECK-SM90A-LABEL: inlineasm( +; CHECK-SM90A: { +; CHECK-SM90A-NEXT: .reg .b32 %r<7>; +; CHECK-SM90A-NEXT: .reg .b64 %rd<2>; +; CHECK-SM90A-EMPTY: +; CHECK-SM90A-NEXT: // %bb.0: +; CHECK-SM90A-NEXT: ld.param.b64 %rd1, [inlineasm_param_0]; +; CHECK-SM90A-NEXT: mov.b32 %r3, 0; +; CHECK-SM90A-NEXT: mov.b32 %r4, %r3; +; CHECK-SM90A-NEXT: mov.b32 %r2, %r4; +; CHECK-SM90A-NEXT: mov.b32 %r1, %r3; +; CHECK-SM90A-NEXT: // begin inline asm +; CHECK-SM90A-NEXT: // nop +; CHECK-SM90A-NEXT: // end inline asm +; CHECK-SM90A-NEXT: mul.rn.f32 %r5, %r2, 0f00000000; +; CHECK-SM90A-NEXT: mul.rn.f32 %r6, %r1, 0f00000000; +; CHECK-SM90A-NEXT: st.v2.b32 [%rd1], {%r6, %r5}; +; CHECK-SM90A-NEXT: ret; +; +; CHECK-SM100-LABEL: inlineasm( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<6>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b64 %rd1, [inlineasm_param_0]; +; CHECK-SM100-NEXT: mov.b32 %r3, 0; +; CHECK-SM100-NEXT: mov.b32 %r4, %r3; +; CHECK-SM100-NEXT: mov.b32 %r2, %r4; +; CHECK-SM100-NEXT: mov.b32 %r1, %r3; +; CHECK-SM100-NEXT: // begin inline asm +; CHECK-SM100-NEXT: // nop +; CHECK-SM100-NEXT: // end inline asm +; CHECK-SM100-NEXT: mov.b64 %rd2, {%r1, %r2}; +; CHECK-SM100-NEXT: mov.b32 %r5, 0f00000000; +; CHECK-SM100-NEXT: mov.b64 %rd3, {%r5, %r5}; +; CHECK-SM100-NEXT: mul.rn.f32x2 %rd4, %rd2, %rd3; +; CHECK-SM100-NEXT: st.b64 [%rd1], %rd4; +; CHECK-SM100-NEXT: ret; + %r = call { <1 x float>, <1 x float> } asm sideeffect "// nop", "=f,=f,0,1"(<1 x float> zeroinitializer, <1 x float> zeroinitializer) + %i0 = extractvalue { <1 x float>, <1 x float> } %r, 0 + %i1 = extractvalue { <1 x float>, <1 x float> } %r, 1 + %i4 = shufflevector <1 x float> %i0, <1 x float> %i1, <2 x i32> + %mul = fmul < 2 x float> %i4, zeroinitializer + store <2 x float> %mul, ptr %p, align 8 + ret void +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll index 217bb483682ff..a90cfff51e2c6 100644 --- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll @@ -1938,16 +1938,29 @@ define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 { } define <2 x float> @test_uitofp_2xi32(<2 x i32> %a) #0 { -; CHECK-LABEL: test_uitofp_2xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_param_0]; -; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2; -; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_uitofp_2xi32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_uitofp_2xi32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = uitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -1969,16 +1982,29 @@ define <2 x float> @test_uitofp_2xi64(<2 x i64> %a) #0 { } define <2 x float> @test_sitofp_2xi32(<2 x i32> %a) #0 { -; CHECK-LABEL: test_sitofp_2xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_param_0]; -; CHECK-NEXT: cvt.rn.f32.s32 %r3, %r2; -; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_sitofp_2xi32( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_sitofp_2xi32( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_sitofp_2xi32_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = sitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -2017,16 +2043,17 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 { ; CHECK-F32X2-LABEL: test_uitofp_2xi32_fadd( ; CHECK-F32X2: { ; CHECK-F32X2-NEXT: .reg .b32 %r<5>; -; CHECK-F32X2-NEXT: .reg .b64 %rd<4>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<5>; ; CHECK-F32X2-EMPTY: ; 
CHECK-F32X2-NEXT: // %bb.0: -; CHECK-F32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; -; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd2, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r3, %r2; ; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; -; CHECK-F32X2-NEXT: mov.b64 %rd2, {%r4, %r3}; -; CHECK-F32X2-NEXT: add.rn.f32x2 %rd3, %rd1, %rd2; -; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd3; +; CHECK-F32X2-NEXT: mov.b64 %rd3, {%r4, %r3}; +; CHECK-F32X2-NEXT: add.rn.f32x2 %rd4, %rd2, %rd3; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-F32X2-NEXT: ret; %c = uitofp <2 x i32> %a to <2 x float> %r = fadd <2 x float> %b, %c @@ -2114,14 +2141,23 @@ define <2 x i32> @test_bitcast_2xfloat_to_2xi32(<2 x float> %a) #0 { } define <2 x float> @test_bitcast_2xi32_to_2xfloat(<2 x i32> %a) #0 { -; CHECK-LABEL: test_bitcast_2xi32_to_2xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xi32_to_2xfloat_param_0]; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_bitcast_2xi32_to_2xfloat( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_bitcast_2xi32_to_2xfloat_param_0]; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r1, %r2}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_bitcast_2xi32_to_2xfloat( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_bitcast_2xi32_to_2xfloat_param_0]; +; CHECK-F32X2-NEXT: st.param.b64 [func_retval0], %rd1; +; CHECK-F32X2-NEXT: ret; %r = bitcast <2 x i32> %a to <2 x float> ret <2 x float> %r } @@ -2851,31 +2887,57 @@ define <2 x float> @test_insertelement(<2 x float> %a, float %x) #0 { } define <2 x float> @test_sitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { -; CHECK-LABEL: test_sitofp_2xi32_to_2xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_to_2xfloat_param_0]; -; CHECK-NEXT: cvt.rn.f32.s32 %r3, %r2; -; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_sitofp_2xi32_to_2xfloat( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_to_2xfloat_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_sitofp_2xi32_to_2xfloat( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_sitofp_2xi32_to_2xfloat_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rn.f32.s32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, 
%r3}; +; CHECK-F32X2-NEXT: ret; %r = sitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } define <2 x float> @test_uitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { -; CHECK-LABEL: test_uitofp_2xi32_to_2xfloat( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_to_2xfloat_param_0]; -; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2; -; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r1; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_uitofp_2xi32_to_2xfloat( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<5>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_to_2xfloat_param_0]; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_uitofp_2xi32_to_2xfloat( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<5>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_to_2xfloat_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-F32X2-NEXT: cvt.rn.f32.u32 %r4, %r1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; +; CHECK-F32X2-NEXT: ret; %r = uitofp <2 x i32> %a to <2 x float> ret <2 x float> %r } diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll index d219493d2b31b..3fac29f74125b 100644 --- a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll +++ b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll @@ -346,15 +346,19 @@ define i32 @ld_global_v8i32(ptr addrspace(1) %ptr) { ; SM100-LABEL: ld_global_v8i32( ; SM100: { ; SM100-NEXT: .reg .b32 %r<16>; -; SM100-NEXT: .reg .b64 %rd<2>; +; SM100-NEXT: .reg .b64 %rd<6>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v8i32_param_0]; -; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: add.s32 %r9, %r1, %r2; -; SM100-NEXT: add.s32 %r10, %r3, %r4; -; SM100-NEXT: add.s32 %r11, %r5, %r6; -; SM100-NEXT: add.s32 %r12, %r7, %r8; +; SM100-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: mov.b64 {%r1, %r2}, %rd5; +; SM100-NEXT: mov.b64 {%r3, %r4}, %rd4; +; SM100-NEXT: mov.b64 {%r5, %r6}, %rd3; +; SM100-NEXT: mov.b64 {%r7, %r8}, %rd2; +; SM100-NEXT: add.s32 %r9, %r7, %r8; +; SM100-NEXT: add.s32 %r10, %r5, %r6; +; SM100-NEXT: add.s32 %r11, %r3, %r4; +; SM100-NEXT: add.s32 %r12, %r1, %r2; ; SM100-NEXT: add.s32 %r13, %r9, %r10; ; SM100-NEXT: add.s32 %r14, %r11, %r12; ; SM100-NEXT: add.s32 %r15, %r13, %r14; diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll index 12e3287e73f0f..57852451c0c72 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll @@ -82,11 +82,11 @@ define void @avar_bfloat() { define void @avar_i32() { ; PTX-LABEL: avar_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; -; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, 
%r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; +; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %load = load <8 x i32>, ptr addrspace(1) @globalin, !invariant.load !0 store <8 x i32> %load, ptr addrspace(1) @globalout @@ -202,11 +202,11 @@ define void @asi_bfloat() { define void @asi_i32() { ; PTX-LABEL: asi_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; -; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32]; +; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0 @@ -331,14 +331,13 @@ define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: areg_64_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i32_param_0]; -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i32_param_1]; -; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; PTX-NEXT: ld.param.b64 %rd6, [areg_64_i32_param_1]; +; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; PTX-NEXT: ret; %load = load <8 x i32>, ptr addrspace(1) %in, !invariant.load !0 store <8 x i32> %load, ptr addrspace(1) %out @@ -472,14 +471,13 @@ define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: ari_64_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i32_param_0]; ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i32_param_1]; -; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; -; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32]; +; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 %load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0 diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll index b7fa1dd5f2c4d..21604dfbf0013 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll @@ -78,11 +78,11 @@ define void @avar_bfloat() { define void @avar_i32() { ; PTX-LABEL: avar_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; -; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; +; PTX-NEXT: st.global.v4.b64 
[globalout], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %load = load <8 x i32>, ptr addrspace(1) @globalin store <8 x i32> %load, ptr addrspace(1) @globalout @@ -198,11 +198,11 @@ define void @asi_bfloat() { define void @asi_i32() { ; PTX-LABEL: asi_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; -; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32]; +; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <8 x i32>, ptr addrspace(1) %in.offset @@ -327,14 +327,13 @@ define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: areg_64_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i32_param_0]; -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i32_param_1]; -; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; PTX-NEXT: ld.param.b64 %rd6, [areg_64_i32_param_1]; +; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; PTX-NEXT: ret; %load = load <8 x i32>, ptr addrspace(1) %in store <8 x i32> %load, ptr addrspace(1) %out @@ -468,14 +467,13 @@ define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-LABEL: ari_64_i32( ; PTX: { -; PTX-NEXT: .reg .b32 %r<9>; -; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i32_param_0]; ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i32_param_1]; -; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; -; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32]; +; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 %load = load <8 x i32>, ptr addrspace(1) %in.offset diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll index e8b43ad28ad27..b5319935f0f9d 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll @@ -100,19 +100,32 @@ define void @generic_16xbfloat(ptr %a, ptr %b) { } define void @generic_8xi32(ptr %a, ptr %b) { -; CHECK-LABEL: generic_8xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xi32_param_0]; -; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [generic_8xi32_param_1]; -; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; -; CHECK-NEXT: ret; +; SM90-LABEL: generic_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 
%r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [generic_8xi32_param_0]; +; SM90-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [generic_8xi32_param_1]; +; SM90-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: generic_8xi32( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [generic_8xi32_param_0]; +; SM100-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [generic_8xi32_param_1]; +; SM100-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load <8 x i32>, ptr %a store <8 x i32> %a.load, ptr %b ret void @@ -265,19 +278,32 @@ define void @generic_volatile_16xbfloat(ptr %a, ptr %b) { } define void @generic_volatile_8xi32(ptr %a, ptr %b) { -; CHECK-LABEL: generic_volatile_8xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_8xi32_param_0]; -; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_8xi32_param_1]; -; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; -; CHECK-NEXT: ret; +; SM90-LABEL: generic_volatile_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [generic_volatile_8xi32_param_0]; +; SM90-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [generic_volatile_8xi32_param_1]; +; SM90-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: generic_volatile_8xi32( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [generic_volatile_8xi32_param_0]; +; SM100-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [generic_volatile_8xi32_param_1]; +; SM100-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load volatile <8 x i32>, ptr %a store volatile <8 x i32> %a.load, ptr %b ret void @@ -496,14 +522,13 @@ define void @global_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; ; SM100-LABEL: global_8xi32( ; SM100: { -; SM100-NEXT: .reg .b32 %r<9>; -; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-NEXT: .reg .b64 %rd<7>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0]; -; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.b64 %rd2, [global_8xi32_param_1]; -; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd6, [global_8xi32_param_1]; +; SM100-NEXT: st.global.v4.b64 
[%rd6], {%rd2, %rd3, %rd4, %rd5}; ; SM100-NEXT: ret; %a.load = load <8 x i32>, ptr addrspace(1) %a store <8 x i32> %a.load, ptr addrspace(1) %b @@ -741,14 +766,13 @@ define void @global_volatile_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; ; SM100-LABEL: global_volatile_8xi32( ; SM100: { -; SM100-NEXT: .reg .b32 %r<9>; -; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-NEXT: .reg .b64 %rd<7>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_8xi32_param_0]; -; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_8xi32_param_1]; -; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd6, [global_volatile_8xi32_param_1]; +; SM100-NEXT: st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; SM100-NEXT: ret; %a.load = load volatile <8 x i32>, ptr addrspace(1) %a store volatile <8 x i32> %a.load, ptr addrspace(1) %b @@ -924,19 +948,32 @@ define void @shared_16xbfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { } define void @shared_8xi32(ptr addrspace(3) %a, ptr addrspace(3) %b) { -; CHECK-LABEL: shared_8xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [shared_8xi32_param_0]; -; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [shared_8xi32_param_1]; -; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; -; CHECK-NEXT: ret; +; SM90-LABEL: shared_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [shared_8xi32_param_0]; +; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [shared_8xi32_param_1]; +; SM90-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: shared_8xi32( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [shared_8xi32_param_0]; +; SM100-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [shared_8xi32_param_1]; +; SM100-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load <8 x i32>, ptr addrspace(3) %a store <8 x i32> %a.load, ptr addrspace(3) %b ret void @@ -1089,19 +1126,32 @@ define void @shared_volatile_16xbfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) } define void @shared_volatile_8xi32(ptr addrspace(3) %a, ptr addrspace(3) %b) { -; CHECK-LABEL: shared_volatile_8xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_8xi32_param_0]; -; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_8xi32_param_1]; -; CHECK-NEXT: 
st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; -; CHECK-NEXT: ret; +; SM90-LABEL: shared_volatile_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [shared_volatile_8xi32_param_0]; +; SM90-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [shared_volatile_8xi32_param_1]; +; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: shared_volatile_8xi32( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [shared_volatile_8xi32_param_0]; +; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [shared_volatile_8xi32_param_1]; +; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load volatile <8 x i32>, ptr addrspace(3) %a store volatile <8 x i32> %a.load, ptr addrspace(3) %b ret void @@ -1256,19 +1306,32 @@ define void @local_16xbfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { } define void @local_8xi32(ptr addrspace(5) %a, ptr addrspace(5) %b) { -; CHECK-LABEL: local_8xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [local_8xi32_param_0]; -; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [local_8xi32_param_1]; -; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; -; CHECK-NEXT: ret; +; SM90-LABEL: local_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [local_8xi32_param_0]; +; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [local_8xi32_param_1]; +; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: local_8xi32( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [local_8xi32_param_0]; +; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [local_8xi32_param_1]; +; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load <8 x i32>, ptr addrspace(5) %a store <8 x i32> %a.load, ptr addrspace(5) %b ret void @@ -1421,19 +1484,32 @@ define void @local_volatile_16xbfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) } define void @local_volatile_8xi32(ptr addrspace(5) %a, ptr addrspace(5) %b) { -; CHECK-LABEL: local_volatile_8xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 
%rd1, [local_volatile_8xi32_param_0]; -; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_8xi32_param_1]; -; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; -; CHECK-NEXT: ret; +; SM90-LABEL: local_volatile_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.b64 %rd1, [local_volatile_8xi32_param_0]; +; SM90-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [local_volatile_8xi32_param_1]; +; SM90-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: local_volatile_8xi32( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.b64 %rd1, [local_volatile_8xi32_param_0]; +; SM100-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM100-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM100-NEXT: ld.param.b64 %rd6, [local_volatile_8xi32_param_1]; +; SM100-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM100-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3}; +; SM100-NEXT: ret; %a.load = load volatile <8 x i32>, ptr addrspace(5) %a store volatile <8 x i32> %a.load, ptr addrspace(5) %b ret void diff --git a/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir new file mode 100644 index 0000000000000..0b2d85600a2ef --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/machinelicm-no-preheader.mir @@ -0,0 +1,80 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=nvptx64 -mcpu=sm_20 -run-pass=early-machinelicm %s -o - | FileCheck %s + +# This test checks that the early-machineLICM pass successfully creates a new +# loop preheader by splitting the critical edge and hoisting the loop invariant +# value `%8` to the preheader. +# Since the critical edge successor is a loop header, the splitting does not +# break the structured CFG, which is a requirement for the NVPTX target. 
+ +--- +name: test_hoist +tracksRegLiveness: true +registers: + - { id: 0, class: b64, preferred-register: '', flags: [ ] } + - { id: 1, class: b32, preferred-register: '', flags: [ ] } + - { id: 2, class: b32, preferred-register: '', flags: [ ] } + - { id: 3, class: b32, preferred-register: '', flags: [ ] } + - { id: 4, class: b32, preferred-register: '', flags: [ ] } + - { id: 5, class: b32, preferred-register: '', flags: [ ] } + - { id: 6, class: b64, preferred-register: '', flags: [ ] } + - { id: 7, class: b1, preferred-register: '', flags: [ ] } + - { id: 8, class: b32, preferred-register: '', flags: [ ] } + - { id: 9, class: b1, preferred-register: '', flags: [ ] } +body: | + ; CHECK-LABEL: name: test_hoist + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.2(0x30000000), %bb.3(0x50000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[LD_i32_:%[0-9]+]]:b32 = LD_i32 0, 0, 101, 3, 32, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101) + ; CHECK-NEXT: [[LD_i64_:%[0-9]+]]:b64 = LD_i64 0, 0, 101, 3, 64, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101) + ; CHECK-NEXT: [[ADD64ri:%[0-9]+]]:b64 = nuw ADD64ri killed [[LD_i64_]], 2 + ; CHECK-NEXT: [[LD_i32_1:%[0-9]+]]:b32 = LD_i32 0, 0, 1, 3, 32, [[ADD64ri]], 0 + ; CHECK-NEXT: [[SETP_i32ri:%[0-9]+]]:b1 = SETP_i32ri [[LD_i32_]], 0, 0 + ; CHECK-NEXT: CBranch killed [[SETP_i32ri]], %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[ADD32ri:%[0-9]+]]:b32 = ADD32ri [[LD_i32_]], -1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:b32 = PHI [[LD_i32_1]], %bb.3, %3, %bb.1 + ; CHECK-NEXT: [[SREM32rr:%[0-9]+]]:b32 = SREM32rr [[PHI]], [[ADD32ri]] + ; CHECK-NEXT: [[SETP_i32ri1:%[0-9]+]]:b1 = SETP_i32ri [[SREM32rr]], 0, 1 + ; CHECK-NEXT: CBranch killed [[SETP_i32ri1]], %bb.1 + ; CHECK-NEXT: GOTO %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: [[PHI1:%[0-9]+]]:b32 = PHI [[LD_i32_1]], %bb.0, [[SREM32rr]], %bb.1 + ; CHECK-NEXT: ST_i32 [[PHI1]], 0, 0, 1, 32, [[ADD64ri]], 0 + ; CHECK-NEXT: Return + bb.0.entry: + successors: %bb.2(0x30000000), %bb.1(0x50000000) + + %5:b32 = LD_i32 0, 0, 101, 3, 32, &test_hoist_param_1, 0 :: (dereferenceable invariant load (s32), addrspace 101) + %6:b64 = LD_i64 0, 0, 101, 3, 64, &test_hoist_param_0, 0 :: (dereferenceable invariant load (s64), addrspace 101) + %0:b64 = nuw ADD64ri killed %6, 2 + %1:b32 = LD_i32 0, 0, 1, 3, 32, %0, 0 + %7:b1 = SETP_i32ri %5, 0, 0 + CBranch killed %7, %bb.2 + GOTO %bb.1 + + + bb.1: + successors: %bb.2(0x04000000), %bb.1(0x7c000000) + + %2:b32 = PHI %1, %bb.0, %3, %bb.1 + %8:b32 = ADD32ri %5, -1 + %3:b32 = SREM32rr %2, %8 + %9:b1 = SETP_i32ri %3, 0, 1 + CBranch killed %9, %bb.1 + GOTO %bb.2 + + bb.2: + %4:b32 = PHI %1, %bb.0, %3, %bb.1 + ST_i32 %4, 0, 0, 1, 32, %0, 0 + Return +... diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll index 5a55fa97033b7..625c93c3f0a53 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll @@ -1586,54 +1586,25 @@ define double @minimumnum_double(double %a, double %b) { ret double %x } -; TODO Improve the "Expand" path for minimumnum vectors on targets where -; f16 is not supported. 
Ideally it should use two f32 minimumnums first instead of -; fully expanding the minimumnum instruction into compare/select instructions. define <2 x half> @minimumnum_v2half(<2 x half> %a, <2 x half> %b) { ; CHECK-NOF16-LABEL: minimumnum_v2half( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<13>; -; CHECK-NOF16-NEXT: .reg .b16 %rs<17>; -; CHECK-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; -; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; ; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1]; -; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; -; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; -; CHECK-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4; -; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; -; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; -; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; -; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; -; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; -; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; -; CHECK-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9; -; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; -; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; -; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; -; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; -; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; +; CHECK-NOF16-NEXT: min.f32 %r3, %r2, %r1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: min.f32 %r6, %r5, %r4; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6; +; CHECK-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: minimumnum_v2half( @@ -1649,48 +1620,22 @@ define <2 x half> @minimumnum_v2half(<2 x half> %a, <2 x half> %b) { ; ; CHECK-SM80-NOF16-LABEL: minimumnum_v2half( ; CHECK-SM80-NOF16: { -; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>; -; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>; -; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: ; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimumnum_v2half_param_0]; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, 
%rs2; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; ; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimumnum_v2half_param_1]; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; -; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p3, %r2, %r4; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; -; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; -; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p9, %r7, %r9; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; -; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; -; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs4; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; +; CHECK-SM80-NOF16-NEXT: min.f32 %r3, %r2, %r1; +; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs3; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-SM80-NOF16-NEXT: min.f32 %r6, %r5, %r4; +; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6; +; CHECK-SM80-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5}; +; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-SM80-NOF16-NEXT: ret; %x = call <2 x half> @llvm.minimumnum.v2f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %x @@ -1788,54 +1733,25 @@ define double @maximumnum_double(double %a, double %b) { ret double %x } -; TODO Improve the "Expand" path for maximumnum vectors on targets where -; f16 is not supported. Ideally it should use two f32 maximumnums first instead of -; fully expanding the maximumnum instruction into compare/select instructions. 
define <2 x half> @maximumnum_v2half(<2 x half> %a, <2 x half> %b) { ; CHECK-NOF16-LABEL: maximumnum_v2half( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<13>; -; CHECK-NOF16-NEXT: .reg .b16 %rs<17>; -; CHECK-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; -; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; ; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1]; -; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; -; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; -; CHECK-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4; -; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; -; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; -; CHECK-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; -; CHECK-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; -; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; -; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; -; CHECK-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9; -; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; -; CHECK-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; -; CHECK-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; -; CHECK-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; -; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; +; CHECK-NOF16-NEXT: max.f32 %r3, %r2, %r1; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: max.f32 %r6, %r5, %r4; +; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6; +; CHECK-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5}; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: maximumnum_v2half( @@ -1851,48 +1767,22 @@ define <2 x half> @maximumnum_v2half(<2 x half> %a, <2 x half> %b) { ; ; CHECK-SM80-NOF16-LABEL: maximumnum_v2half( ; CHECK-SM80-NOF16: { -; CHECK-SM80-NOF16-NEXT: .reg .pred %p<13>; -; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<17>; -; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<11>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<7>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: ; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximumnum_v2half_param_0]; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; ; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximumnum_v2half_param_1]; -; 
CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r3, %r3; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs6; -; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p3, %r2, %r4; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs5, %rs6, %p3; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs5, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs5, %rs7, %p4; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p5, %rs6, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs6, %rs8, %p5; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs7; -; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p6, %r5, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs9, %rs7, %p6; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r6, %r6; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs3, %rs1, %p7; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs11; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p8, %r8, %r8; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs11, %rs3, %p8; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r9, %rs12; -; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p9, %r7, %r9; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs11, %rs12, %p9; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p10, %rs11, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs11, %rs13, %p10; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p11, %rs12, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs15, %rs12, %rs14, %p11; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r10, %rs13; -; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p12, %r10, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs16, %rs15, %rs13, %p12; -; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs16, %rs10}; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs4; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; +; CHECK-SM80-NOF16-NEXT: max.f32 %r3, %r2, %r1; +; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r3; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs3; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-SM80-NOF16-NEXT: max.f32 %r6, %r5, %r4; +; CHECK-SM80-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r6; +; CHECK-SM80-NOF16-NEXT: mov.b32 %r7, {%rs6, %rs5}; +; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-SM80-NOF16-NEXT: ret; %x = call <2 x half> @llvm.maximumnum.v2f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %x diff --git a/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll b/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll index bc67471209bf8..32b55a38e55ef 100644 --- a/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll +++ b/llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll @@ -11,7 +11,6 @@ target triple = "nvptx64-unknown-unknown" define void @test_infer_const_from_cast() { ; INFER-LABEL: @test_infer_const_from_cast ; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap) -; BOTH: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap) ; PTX-LABEL: .visible .func test_infer_const_from_cast( ; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap; ; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}; @@ -69,12 +68,40 @@ entry: %cast1 = addrspacecast ptr addrspace(4) @constant_tensormap to ptr %cast2 = addrspacecast ptr %cast1 to ptr addrspace(4) %cast3 = addrspacecast ptr addrspace(4) %cast2 to ptr - call void @llvm.nvvm.prefetch.tensormap(ptr %cast3) + call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast3) + ret void +} + +; Kernel 
Function Test +; Cast from Param space to Generic +define ptx_kernel void @test_param_to_generic_cast_kernel(ptr addrspace(101) %param_ptr) { +; INFER-LABEL: @test_param_to_generic_cast_kernel +; INFER: call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr) +; PTX-LABEL: .visible .entry test_param_to_generic_cast_kernel( +; PTX: prefetch.param.tensormap [%rd{{[0-9]+}}]; +entry: + %cast = addrspacecast ptr addrspace(101) %param_ptr to ptr + call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast) + ret void +} + +; Kernel Function Test +; Multiple casts in sequence +define ptx_kernel void @test_infer_through_multiple_casts_kernel() { +; INFER-LABEL: @test_infer_through_multiple_casts_kernel +; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap) +; PTX-LABEL: .visible .entry test_infer_through_multiple_casts_kernel( +; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap; +; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}}; +; PTX: prefetch.tensormap [%rd{{[0-9]+}}]; +entry: + %cast1 = addrspacecast ptr addrspace(4) @constant_tensormap to ptr + %cast2 = addrspacecast ptr %cast1 to ptr addrspace(4) + %cast3 = addrspacecast ptr addrspace(4) %cast2 to ptr + call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast3) ret void } declare void @llvm.nvvm.prefetch.tensormap.p0(ptr) declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4)) declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101)) - - diff --git a/llvm/test/CodeGen/NVPTX/prefetch.ll b/llvm/test/CodeGen/NVPTX/prefetch.ll index a1c5ec8f50a6b..c0489cc6fd73a 100644 --- a/llvm/test/CodeGen/NVPTX/prefetch.ll +++ b/llvm/test/CodeGen/NVPTX/prefetch.ll @@ -121,4 +121,40 @@ define void @prefetch_param_tensormap(ptr addrspace(101) %param_ptr) { ; CHECK-PTX64-NEXT: ret; tail call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr) ret void -} \ No newline at end of file +} + +define ptx_kernel void @prefetch_generic_tensormap_kernel(ptr %ptr) { +; CHECK-PTX64-LABEL: prefetch_generic_tensormap_kernel( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_generic_tensormap_kernel_param_0]; +; CHECK-PTX64-NEXT: prefetch.tensormap [%rd1]; +; CHECK-PTX64-NEXT: ret; + tail call void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr) + ret void +} + +define ptx_kernel void @prefetch_param_tensormap_kernel(ptr addrspace(101) %param_ptr) { +; CHECK-PTX64-LABEL: prefetch_param_tensormap_kernel( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_param_tensormap_kernel_param_0]; +; CHECK-PTX64-NEXT: prefetch.param.tensormap [%rd1]; +; CHECK-PTX64-NEXT: ret; + tail call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr) + ret void +} + +define ptx_kernel void @prefetch_grid_const_tensormap(ptr byval([64 x i8]) align 64 "nvvm.grid_constant" %ptr) { +; CHECK-PTX64-LABEL: .visible .entry prefetch_grid_const_tensormap( +; CHECK-PTX64: prefetch.tensormap [%{{(SP|rd[0-9]+).*}}]; +; CHECK-PTX64: ret; + +entry: + call void @llvm.nvvm.prefetch.tensormap.p0(ptr addrspace(0) %ptr) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index f871e4039a558..87787ba2bf81c 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -1452,22 +1452,44 
@@ define i16 @reduce_add_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_add_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_add_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0]; -; CHECK-NEXT: add.s32 %r9, %r4, %r8; -; CHECK-NEXT: add.s32 %r10, %r2, %r6; -; CHECK-NEXT: add.s32 %r11, %r10, %r9; -; CHECK-NEXT: add.s32 %r12, %r3, %r7; -; CHECK-NEXT: add.s32 %r13, %r1, %r5; -; CHECK-NEXT: add.s32 %r14, %r13, %r12; -; CHECK-NEXT: add.s32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_add_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0]; +; CHECK-SM80-NEXT: add.s32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: add.s32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: add.s32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: add.s32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: add.s32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: add.s32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: add.s32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_add_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_add_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_add_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: add.s32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: add.s32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: add.s32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: add.s32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: add.s32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: add.s32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: add.s32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.add(<8 x i32> %in) ret i32 %res } @@ -1543,22 +1565,44 @@ define i16 @reduce_mul_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_mul_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_mul_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0]; -; CHECK-NEXT: mul.lo.s32 %r9, %r4, %r8; -; CHECK-NEXT: mul.lo.s32 %r10, %r2, %r6; -; CHECK-NEXT: mul.lo.s32 %r11, %r10, %r9; -; CHECK-NEXT: mul.lo.s32 %r12, %r3, %r7; -; CHECK-NEXT: mul.lo.s32 %r13, %r1, %r5; -; CHECK-NEXT: mul.lo.s32 %r14, %r13, %r12; -; CHECK-NEXT: mul.lo.s32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_mul_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0]; +; CHECK-SM80-NEXT: mul.lo.s32 
%r9, %r4, %r8; +; CHECK-SM80-NEXT: mul.lo.s32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: mul.lo.s32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: mul.lo.s32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: mul.lo.s32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: mul.lo.s32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: mul.lo.s32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_mul_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_mul_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_mul_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: mul.lo.s32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: mul.lo.s32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: mul.lo.s32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: mul.lo.s32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: mul.lo.s32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: mul.lo.s32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: mul.lo.s32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.mul(<8 x i32> %in) ret i32 %res } @@ -1673,22 +1717,44 @@ define i16 @reduce_umax_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_umax_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_umax_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0]; -; CHECK-NEXT: max.u32 %r9, %r4, %r8; -; CHECK-NEXT: max.u32 %r10, %r2, %r6; -; CHECK-NEXT: max.u32 %r11, %r10, %r9; -; CHECK-NEXT: max.u32 %r12, %r3, %r7; -; CHECK-NEXT: max.u32 %r13, %r1, %r5; -; CHECK-NEXT: max.u32 %r14, %r13, %r12; -; CHECK-NEXT: max.u32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_umax_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0]; +; CHECK-SM80-NEXT: max.u32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: max.u32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: max.u32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: max.u32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: max.u32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: max.u32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: max.u32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_umax_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_umax_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_umax_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: max.u32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: max.u32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: max.u32 %r11, %r10, %r5; +; 
CHECK-SM100-NEXT: max.u32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: max.u32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: max.u32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: max.u32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.umax(<8 x i32> %in) ret i32 %res } @@ -1803,22 +1869,44 @@ define i16 @reduce_umin_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_umin_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_umin_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0]; -; CHECK-NEXT: min.u32 %r9, %r4, %r8; -; CHECK-NEXT: min.u32 %r10, %r2, %r6; -; CHECK-NEXT: min.u32 %r11, %r10, %r9; -; CHECK-NEXT: min.u32 %r12, %r3, %r7; -; CHECK-NEXT: min.u32 %r13, %r1, %r5; -; CHECK-NEXT: min.u32 %r14, %r13, %r12; -; CHECK-NEXT: min.u32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_umin_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0]; +; CHECK-SM80-NEXT: min.u32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: min.u32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: min.u32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: min.u32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: min.u32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: min.u32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: min.u32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_umin_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_umin_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_umin_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: min.u32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: min.u32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: min.u32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: min.u32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: min.u32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: min.u32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: min.u32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.umin(<8 x i32> %in) ret i32 %res } @@ -1933,22 +2021,44 @@ define i16 @reduce_smax_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_smax_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_smax_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0]; -; CHECK-NEXT: max.s32 %r9, %r4, %r8; -; CHECK-NEXT: max.s32 %r10, %r2, %r6; -; CHECK-NEXT: max.s32 %r11, %r10, %r9; -; CHECK-NEXT: max.s32 %r12, %r3, %r7; -; CHECK-NEXT: max.s32 %r13, %r1, %r5; -; CHECK-NEXT: max.s32 %r14, %r13, %r12; -; CHECK-NEXT: max.s32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: 
ret; +; CHECK-SM80-LABEL: reduce_smax_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0]; +; CHECK-SM80-NEXT: max.s32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: max.s32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: max.s32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: max.s32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: max.s32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: max.s32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: max.s32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_smax_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_smax_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_smax_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: max.s32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: max.s32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: max.s32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: max.s32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: max.s32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: max.s32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: max.s32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.smax(<8 x i32> %in) ret i32 %res } @@ -2063,22 +2173,44 @@ define i16 @reduce_smin_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_smin_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_smin_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0]; -; CHECK-NEXT: min.s32 %r9, %r4, %r8; -; CHECK-NEXT: min.s32 %r10, %r2, %r6; -; CHECK-NEXT: min.s32 %r11, %r10, %r9; -; CHECK-NEXT: min.s32 %r12, %r3, %r7; -; CHECK-NEXT: min.s32 %r13, %r1, %r5; -; CHECK-NEXT: min.s32 %r14, %r13, %r12; -; CHECK-NEXT: min.s32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_smin_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0]; +; CHECK-SM80-NEXT: min.s32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: min.s32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: min.s32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: min.s32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: min.s32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: min.s32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: min.s32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_smin_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_smin_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_smin_i32_param_0]; +; 
CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: min.s32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: min.s32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: min.s32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: min.s32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: min.s32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: min.s32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: min.s32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.smin(<8 x i32> %in) ret i32 %res } @@ -2152,22 +2284,44 @@ define i16 @reduce_and_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_and_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_and_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0]; -; CHECK-NEXT: and.b32 %r9, %r4, %r8; -; CHECK-NEXT: and.b32 %r10, %r2, %r6; -; CHECK-NEXT: and.b32 %r11, %r10, %r9; -; CHECK-NEXT: and.b32 %r12, %r3, %r7; -; CHECK-NEXT: and.b32 %r13, %r1, %r5; -; CHECK-NEXT: and.b32 %r14, %r13, %r12; -; CHECK-NEXT: and.b32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_and_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0]; +; CHECK-SM80-NEXT: and.b32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: and.b32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: and.b32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: and.b32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: and.b32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: and.b32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: and.b32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_and_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_and_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_and_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: and.b32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: and.b32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: and.b32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: and.b32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: and.b32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: and.b32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: and.b32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.and(<8 x i32> %in) ret i32 %res } @@ -2241,22 +2395,44 @@ define i16 @reduce_or_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_or_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_or_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0]; -; CHECK-NEXT: or.b32 %r9, %r4, %r8; -; 
CHECK-NEXT: or.b32 %r10, %r2, %r6; -; CHECK-NEXT: or.b32 %r11, %r10, %r9; -; CHECK-NEXT: or.b32 %r12, %r3, %r7; -; CHECK-NEXT: or.b32 %r13, %r1, %r5; -; CHECK-NEXT: or.b32 %r14, %r13, %r12; -; CHECK-NEXT: or.b32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_or_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0]; +; CHECK-SM80-NEXT: or.b32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: or.b32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: or.b32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: or.b32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: or.b32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: or.b32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: or.b32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_or_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_or_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_or_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: or.b32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: or.b32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: or.b32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: or.b32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: or.b32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: or.b32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: or.b32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.or(<8 x i32> %in) ret i32 %res } @@ -2330,22 +2506,44 @@ define i16 @reduce_xor_i16_nonpow2(<7 x i16> %in) { } define i32 @reduce_xor_i32(<8 x i32> %in) { -; CHECK-LABEL: reduce_xor_i32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<16>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_param_0]; -; CHECK-NEXT: xor.b32 %r9, %r4, %r8; -; CHECK-NEXT: xor.b32 %r10, %r2, %r6; -; CHECK-NEXT: xor.b32 %r11, %r10, %r9; -; CHECK-NEXT: xor.b32 %r12, %r3, %r7; -; CHECK-NEXT: xor.b32 %r13, %r1, %r5; -; CHECK-NEXT: xor.b32 %r14, %r13, %r12; -; CHECK-NEXT: xor.b32 %r15, %r14, %r11; -; CHECK-NEXT: st.param.b32 [func_retval0], %r15; -; CHECK-NEXT: ret; +; CHECK-SM80-LABEL: reduce_xor_i32( +; CHECK-SM80: { +; CHECK-SM80-NEXT: .reg .b32 %r<16>; +; CHECK-SM80-EMPTY: +; CHECK-SM80-NEXT: // %bb.0: +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_param_0]; +; CHECK-SM80-NEXT: xor.b32 %r9, %r4, %r8; +; CHECK-SM80-NEXT: xor.b32 %r10, %r2, %r6; +; CHECK-SM80-NEXT: xor.b32 %r11, %r10, %r9; +; CHECK-SM80-NEXT: xor.b32 %r12, %r3, %r7; +; CHECK-SM80-NEXT: xor.b32 %r13, %r1, %r5; +; CHECK-SM80-NEXT: xor.b32 %r14, %r13, %r12; +; CHECK-SM80-NEXT: xor.b32 %r15, %r14, %r11; +; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM80-NEXT: ret; +; +; CHECK-SM100-LABEL: reduce_xor_i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<16>; +; 
CHECK-SM100-NEXT: .reg .b64 %rd<5>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_xor_i32_param_0+16]; +; CHECK-SM100-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_xor_i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM100-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM100-NEXT: xor.b32 %r5, %r4, %r2; +; CHECK-SM100-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM100-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM100-NEXT: xor.b32 %r10, %r9, %r7; +; CHECK-SM100-NEXT: xor.b32 %r11, %r10, %r5; +; CHECK-SM100-NEXT: xor.b32 %r12, %r3, %r1; +; CHECK-SM100-NEXT: xor.b32 %r13, %r8, %r6; +; CHECK-SM100-NEXT: xor.b32 %r14, %r13, %r12; +; CHECK-SM100-NEXT: xor.b32 %r15, %r14, %r11; +; CHECK-SM100-NEXT: st.param.b32 [func_retval0], %r15; +; CHECK-SM100-NEXT: ret; %res = call i32 @llvm.vector.reduce.xor(<8 x i32> %in) ret i32 %res } diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale-invalid.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale-invalid.ll new file mode 100644 index 0000000000000..c0f6f4c7c46bd --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale-invalid.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: not llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx88 -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx88 -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc < %s -o - -mcpu=sm_110a -march=nvptx64 -mattr=+ptx90 -o /dev/null 2>&1 | FileCheck %s + +define void @tcgen05_mma_block_scale_invalid_flags(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { + ; CHECK: immarg value 0 out of range [1, 3) + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 0, i32 0) + ; CHECK: immarg value 5 out of range [0, 4) + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 5) + ; CHECK: immarg value 0 out of range [1, 3) + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 0, i32 0) + ; CHECK: immarg value 5 out of range [0, 4) + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 5) + ; CHECK: immarg value 0 out of range [1, 3) + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 0, i32 0) + ; CHECK: immarg value 5 out of range [0, 4) + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 5) + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr 
addrspace(6) %scale_b, i32 0, i32 0) + ; CHECK: immarg value 5 out of range [0, 4) + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 5) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale-ptx88.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale-ptx88.ll new file mode 100644 index 0000000000000..f6c219107a677 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale-ptx88.ll @@ -0,0 +1,670 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_110a -march=nvptx64 -mattr=+ptx90 | FileCheck %s +; RUN: %if ptxas-sm_100a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_101a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-sm_110a && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mattr=+ptx90 -mcpu=sm_110a | %ptxas-verify -arch=sm_110a %} + +define void @tcgen05_mma_mxf8f6f4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { +; CHECK-LABEL: tcgen05_mma_mxf8f6f4_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf8f6f4_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf8f6f4_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf8f6f4_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf8f6f4_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf8f6f4_cta1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf8f6f4_cta1_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf8f6f4_cta1_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf8f6f4_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void 
@llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_mxf8f6f4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { +; CHECK-LABEL: tcgen05_mma_mxf8f6f4_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf8f6f4_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf8f6f4_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf8f6f4_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf8f6f4_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf8f6f4_cta2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf8f6f4_cta2_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf8f6f4_cta2_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf8f6f4_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], 
[%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_mxf8f6f4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_mxf8f6f4_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf8f6f4_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf8f6f4_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf8f6f4_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf8f6f4_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf8f6f4_cta1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf8f6f4_cta1_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf8f6f4_cta1_param_7]; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf8f6f4_cta1_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: 
ld.param.b32 %r6, [tcgen05_mma_sp_mxf8f6f4_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_mxf8f6f4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_mxf8f6f4_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; 
CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf8f6f4_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf8f6f4_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf8f6f4_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf8f6f4_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf8f6f4_cta2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf8f6f4_cta2_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf8f6f4_cta2_param_7]; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf8f6f4_cta2_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf8f6f4_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.block32.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, 
ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_mxf4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { +; CHECK-LABEL: tcgen05_mma_mxf4_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf4_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf4_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf4_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf4_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf4_cta1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf4_cta1_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf4_cta1_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf4_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr 
addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_mxf4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { +; CHECK-LABEL: tcgen05_mma_mxf4_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf4_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf4_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf4_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf4_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf4_cta2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf4_cta2_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf4_cta2_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf4_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void 
@llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_mxf4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_mxf4_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf4_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf4_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf4_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf4_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf4_cta1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf4_cta1_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf4_cta1_param_7]; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf4_cta1_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf4_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_sp_mxf4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_mxf4_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf4_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf4_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf4_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf4_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf4_cta2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf4_cta2_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf4_cta2_param_7]; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf4_cta2_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf4_cta2_param_1]; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_mxf4nvf4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { +; CHECK-LABEL: tcgen05_mma_mxf4nvf4_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, 
[tcgen05_mma_mxf4nvf4_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf4nvf4_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf4nvf4_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf4nvf4_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf4nvf4_cta1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf4nvf4_cta1_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf4nvf4_cta1_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf4nvf4_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void 
@llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_mxf4nvf4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { +; CHECK-LABEL: tcgen05_mma_mxf4nvf4_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf4nvf4_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, 
[tcgen05_mma_mxf4nvf4_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf4nvf4_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf4nvf4_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf4nvf4_cta2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf4nvf4_cta2_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf4nvf4_cta2_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf4nvf4_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, 
ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_mxf4nvf4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_mxf4nvf4_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf4nvf4_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf4nvf4_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf4nvf4_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, 
[tcgen05_mma_sp_mxf4nvf4_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf4nvf4_cta1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf4nvf4_cta1_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf4nvf4_cta1_param_7]; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf4nvf4_cta1_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf4nvf4_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, 
i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_mxf4nvf4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) 
%scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_mxf4nvf4_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf4nvf4_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf4nvf4_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf4nvf4_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf4nvf4_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf4nvf4_cta2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf4nvf4_cta2_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf4nvf4_cta2_param_7]; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf4nvf4_cta2_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf4nvf4_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block32.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 
%b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void 
@llvm.nvvm.tcgen05.mma.sp.shared.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4nvf4.block_scale.block32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale.ll new file mode 100644 index 0000000000000..e071eaaf107fc --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-block-scale.ll @@ -0,0 +1,387 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx88 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_110a -march=nvptx64 -mattr=+ptx90 | FileCheck %s +; RUN: %if ptxas-sm_100a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_101a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-sm_110a && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mattr=+ptx90 -mcpu=sm_110a | %ptxas-verify -arch=sm_110a %} + +define void @tcgen05_mma_mxf8f6f4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { +; CHECK-LABEL: tcgen05_mma_mxf8f6f4_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf8f6f4_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf8f6f4_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf8f6f4_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf8f6f4_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf8f6f4_cta1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf8f6f4_cta1_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf8f6f4_cta1_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf8f6f4_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: 
tcgen05.mma.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_mxf8f6f4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { +; CHECK-LABEL: tcgen05_mma_mxf8f6f4_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf8f6f4_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf8f6f4_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf8f6f4_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf8f6f4_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf8f6f4_cta2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf8f6f4_cta2_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf8f6f4_cta2_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf8f6f4_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: 
tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_mxf8f6f4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_mxf8f6f4_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf8f6f4_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf8f6f4_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf8f6f4_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf8f6f4_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf8f6f4_cta1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf8f6f4_cta1_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf8f6f4_cta1_param_7]; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf8f6f4_cta1_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: 
ld.param.b32 %r6, [tcgen05_mma_sp_mxf8f6f4_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf8f6f4.block_scale.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_mxf8f6f4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_mxf8f6f4_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.b8 %rs1, [tcgen05_mma_sp_mxf8f6f4_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf8f6f4_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf8f6f4_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf8f6f4_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf8f6f4_cta2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf8f6f4_cta2_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf8f6f4_cta2_param_7]; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf8f6f4_cta2_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf8f6f4_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf8f6f4.block_scale.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr 
addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf8f6f4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_mxf4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { +; CHECK-LABEL: tcgen05_mma_mxf4_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf4_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf4_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf4_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf4_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf4_cta1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf4_cta1_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf4_cta1_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf4_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::mxf4.block_scale.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr 
addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_mxf4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b) { +; CHECK-LABEL: tcgen05_mma_mxf4_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<6>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_mxf4_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_mxf4_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_mxf4_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_mxf4_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_mxf4_cta2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_mxf4_cta2_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_mxf4_cta2_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::discard [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_mxf4_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::discard [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::lastuse [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::fill [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::fill [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::use [%r1], %rd1, %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::mxf4.block_scale.collector::a::use [%r1], [%r5], %rd2, %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr 
addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_mxf4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_mxf4_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf4_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf4_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf4_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf4_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf4_cta1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf4_cta1_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf4_cta1_param_7]; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf4_cta1_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf4_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 
1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_mxf4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_mxf4_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_mxf4_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_mxf4_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_mxf4_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_mxf4_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_mxf4_cta2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_mxf4_cta2_param_6]; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_mxf4_cta2_param_7]; +; CHECK-NEXT: ld.param.b32 %r5, [tcgen05_mma_sp_mxf4_cta2_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::discard [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ld.param.b32 %r6, [tcgen05_mma_sp_mxf4_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::discard [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::lastuse [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::lastuse [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::fill [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::fill [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::use [%r1], %rd1, %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.collector::a::use [%r1], [%r6], %rd2, [%r5], %r2, [%r3], [%r4], %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.mxf4.block_scale(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.mxf4.block_scale(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, ptr addrspace(6) %scale_a, ptr addrspace(6) %scale_b, i32 2, i32 3) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-disable-output-lane.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-disable-output-lane.ll new file mode 100644 index 0000000000000..f2d6c02b2cd7e --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-disable-output-lane.ll @@ -0,0 +1,855 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_110a -march=nvptx64 -mattr=+ptx90 | FileCheck %s +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-sm_110a && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mattr=+ptx90 -mcpu=sm_110a | %ptxas-verify -arch=sm_110a %} + +define void @tcgen05_mma_fp16_shared_disable_output_lane_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: 
tcgen05_mma_fp16_shared_disable_output_lane_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_fp16_shared_disable_output_lane_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_fp16_shared_disable_output_lane_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_fp16_shared_disable_output_lane_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_fp16_shared_disable_output_lane_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_fp16_shared_disable_output_lane_cta1_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_fp16_shared_disable_output_lane_cta1_param_6]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_fp16_shared_disable_output_lane_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.ashift.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::fill [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::use [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::use [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) 
%dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 3) + + ret void +} + +define void @tcgen05_mma_fp16_shared_disable_output_lane_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_fp16_shared_disable_output_lane_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<12>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_fp16_shared_disable_output_lane_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_fp16_shared_disable_output_lane_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_fp16_shared_disable_output_lane_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_fp16_shared_disable_output_lane_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_fp16_shared_disable_output_lane_cta2_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_fp16_shared_disable_output_lane_cta2_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [tcgen05_mma_fp16_shared_disable_output_lane_cta2_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r11, [tcgen05_mma_fp16_shared_disable_output_lane_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.ashift.collector::a::discard [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.ashift.collector::a::lastuse [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::fill [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::use [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::use [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; 
CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 0, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_fp16_shared_disable_output_lane_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_fp16_shared_disable_output_lane_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta1_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta1_param_6]; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta1_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r8, 
[tcgen05_mma_sp_fp16_shared_disable_output_lane_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.ashift.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.ashift.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::fill [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::use [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::use [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 
%enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 0, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_fp16_shared_disable_output_lane_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_fp16_shared_disable_output_lane_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta2_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta2_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta2_param_7]; +; CHECK-NEXT: ld.param.b32 %r11, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta2_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r12, [tcgen05_mma_sp_fp16_shared_disable_output_lane_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.ashift.collector::a::discard [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.ashift.collector::a::lastuse [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::fill [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::use [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::use [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, 
i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 0, i32 3) + + ret void +} + +define void @tcgen05_mma_tf32_shared_disable_output_lane_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_tf32_shared_disable_output_lane_cg1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_tf32_shared_disable_output_lane_cg1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_tf32_shared_disable_output_lane_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_tf32_shared_disable_output_lane_cg1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_tf32_shared_disable_output_lane_cg1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_tf32_shared_disable_output_lane_cg1_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_tf32_shared_disable_output_lane_cg1_param_6]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_tf32_shared_disable_output_lane_cg1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: 
tcgen05.mma.cta_group::1.kind::tf32.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.ashift.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::fill [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::use [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::use [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_tf32_shared_disable_output_lane_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_tf32_shared_disable_output_lane_cg2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<12>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 
%rs1, [tcgen05_mma_tf32_shared_disable_output_lane_cg2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_tf32_shared_disable_output_lane_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_tf32_shared_disable_output_lane_cg2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_tf32_shared_disable_output_lane_cg2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_tf32_shared_disable_output_lane_cg2_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_tf32_shared_disable_output_lane_cg2_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [tcgen05_mma_tf32_shared_disable_output_lane_cg2_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r11, [tcgen05_mma_tf32_shared_disable_output_lane_cg2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.ashift.collector::a::discard [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.ashift.collector::a::lastuse [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::fill [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::use [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::use [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 1) + + call void 
@llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_sp_tf32_shared_disable_output_lane_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_tf32_shared_disable_output_lane_cg1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg1_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg1_param_6]; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg1_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r8, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg1_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.ashift.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.ashift.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::fill [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::use [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::use [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ret; + call 
void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_tf32_shared_disable_output_lane_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_tf32_shared_disable_output_lane_cg2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg2_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, 
[tcgen05_mma_sp_tf32_shared_disable_output_lane_cg2_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg2_param_7]; +; CHECK-NEXT: ld.param.b32 %r11, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg2_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r12, [tcgen05_mma_sp_tf32_shared_disable_output_lane_cg2_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.ashift.collector::a::discard [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.ashift.collector::a::lastuse [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::fill [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::use [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::use [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) 
%spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_f8f6f4_shared_disable_output_lane_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_f8f6f4_shared_disable_output_lane_cg1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_f8f6f4_shared_disable_output_lane_cg1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_f8f6f4_shared_disable_output_lane_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_f8f6f4_shared_disable_output_lane_cg1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_f8f6f4_shared_disable_output_lane_cg1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_f8f6f4_shared_disable_output_lane_cg1_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_f8f6f4_shared_disable_output_lane_cg1_param_6]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_f8f6f4_shared_disable_output_lane_cg1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.ashift.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::use [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 0) + + call void 
@llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_f8f6f4_shared_disable_output_lane_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_f8f6f4_shared_disable_output_lane_cg2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<12>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_f8f6f4_shared_disable_output_lane_cg2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_f8f6f4_shared_disable_output_lane_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_f8f6f4_shared_disable_output_lane_cg2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_f8f6f4_shared_disable_output_lane_cg2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_f8f6f4_shared_disable_output_lane_cg2_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_f8f6f4_shared_disable_output_lane_cg2_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [tcgen05_mma_f8f6f4_shared_disable_output_lane_cg2_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r11, [tcgen05_mma_f8f6f4_shared_disable_output_lane_cg2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: 
tcgen05.mma.cta_group::2.kind::f8f6f4.ashift.collector::a::discard [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.ashift.collector::a::lastuse [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::use [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: 
tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg1_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg1_param_6]; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg1_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r8, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg1_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.ashift.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.ashift.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::use [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) 
%atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg2_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg2_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg2_param_7]; +; CHECK-NEXT: ld.param.b32 %r11, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg2_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r12, [tcgen05_mma_sp_f8f6f4_shared_disable_output_lane_cg2_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.ashift.collector::a::discard [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.ashift.collector::a::lastuse [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::use [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 2, i32 3) + + ret void + +} + +define void @tcgen05_mma_i8_shared_disable_output_lane_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> 
%disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_i8_shared_disable_output_lane_cg1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_i8_shared_disable_output_lane_cg1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_i8_shared_disable_output_lane_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_i8_shared_disable_output_lane_cg1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_i8_shared_disable_output_lane_cg1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_i8_shared_disable_output_lane_cg1_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_i8_shared_disable_output_lane_cg1_param_6]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_i8_shared_disable_output_lane_cg1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.ashift.collector::a::lastuse [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::fill [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::use [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::use [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr 
addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 3, i32 3) + + ret void +} + +define void @tcgen05_mma_i8_shared_disable_output_lane_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_i8_shared_disable_output_lane_cg2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<12>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_i8_shared_disable_output_lane_cg2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_i8_shared_disable_output_lane_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_i8_shared_disable_output_lane_cg2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_i8_shared_disable_output_lane_cg2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_i8_shared_disable_output_lane_cg2_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_i8_shared_disable_output_lane_cg2_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [tcgen05_mma_i8_shared_disable_output_lane_cg2_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r11, [tcgen05_mma_i8_shared_disable_output_lane_cg2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::discard [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.ashift.collector::a::discard [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::lastuse [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.ashift.collector::a::lastuse [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::fill [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::fill [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::use [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::use [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ret; + call void 
@llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <8 x i32> %disable_output_lanev8, i32 3, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_i8_shared_disable_output_lane_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_i8_shared_disable_output_lane_cg1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg1_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg1_param_6]; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg1_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::discard [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r8, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg1_param_1]; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.ashift.collector::a::discard [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.ashift.collector::a::lastuse [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::fill [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::fill [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::use [%r1], %rd1, %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::use [%r1], [%r8], %rd2, [%r7], %r2, {%r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <4 x i32> %disable_output_lanev4, i32 3, i32 3) + + 
ret void +} + +define void @tcgen05_mma_sp_i8_shared_disable_output_lane_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_i8_shared_disable_output_lane_cg2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg2_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg2_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg2_param_7]; +; CHECK-NEXT: ld.param.b32 %r11, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg2_param_8]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::discard [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ld.param.b32 %r12, [tcgen05_mma_sp_i8_shared_disable_output_lane_cg2_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::discard [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.ashift.collector::a::discard [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::lastuse [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.ashift.collector::a::lastuse [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::fill [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::fill [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::use [%r1], %rd1, %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::use [%r1], [%r12], %rd2, [%r11], %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 0) + + call void 
@llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, <8 x i32> %disable_output_lanev8, i32 3, i32 3) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-invalid.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-invalid.ll new file mode 100644 index 0000000000000..dff829ecf5321 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-invalid.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_101a -mattr=+ptx86 -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_110a -mattr=+ptx90 -o /dev/null 2>&1 | FileCheck %s +target triple = "nvptx64-nvidia-cuda" + +define void @tcgen05_mma_invalid_flag_values(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { + ; CHECK: immarg value 5 out of range [0, 4) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 5, i32 1, i32 3) + ; CHECK: immarg value 0 out of range [1, 3) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 3) + ; CHECK: immarg value 3 out of range [1, 3) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 3, i32 3) + ; CHECK: immarg value 5 out of range [0, 4) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 5) + ; CHECK: immarg value 2 out of range [0, 2) + call void 
@llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 2) + ret void +} + +define void @tcgen05_mma_disable_output_lane_invalid_flag_values(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4) { + ; CHECK: immarg value 5 out of range [0, 4) + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 5, i32 0) + ; CHECK: immarg value 5 out of range [0, 4) + call void @llvm.nvvm.tcgen05.mma.tensor.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, i32 0, i32 5) + ret void +} + +define void @tcgen05_mma_ws_invalid_flag_values(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { + ; CHECK: immarg value 5 out of range [0, 4) + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 5, i32 0, i32 0) + ; CHECK: immarg value 5 out of range [0, 4) + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 5, i32 0) + ; CHECK: immarg value 5 out of range [0, 4) + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 5) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-scale-d-invalid.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-scale-d-invalid.ll new file mode 100644 index 0000000000000..7c884a70b7530 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-scale-d-invalid.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 -o /dev/null 2>&1 | FileCheck %s +target triple = "nvptx64-nvidia-cuda" + +define void @tcgen05_mma_scale_d_invalid_flag_values(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { + ; CHECK: immarg value 16 out of range [0, 16) + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 16, i32 0, i32 1, i32 0) + ; CHECK: immarg value 3 out of range [0, 2) + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 3, i32 1, i32 0) + ; CHECK: immarg value 0 out of range [1, 3) + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 0, i32 0) + ; CHECK: immarg value 5 out of range [0, 4) + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 5) + ret void +} + +define void @tcgen05_mma_scale_d_disable_output_lane_invalid_flag_values(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4) { + ; CHECK: immarg value 16 out of range [0, 16) + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 16, 
<4 x i32> %disable_output_lanev4, i32 0, i32 0) + ; CHECK: immarg value 3 out of range [0, 2) + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <4 x i32> %disable_output_lanev4, i32 3, i32 0) + ; CHECK: immarg value 5 out of range [0, 4) + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <4 x i32> %disable_output_lanev4, i32 0, i32 5) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-scale-d.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-scale-d.ll new file mode 100644 index 0000000000000..ffe88616af10d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-scale-d.ll @@ -0,0 +1,537 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -o - -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} + +define void @tcgen05_mma_fp16_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_fp16_cg1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_fp16_cg1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_fp16_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_fp16_cg1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_fp16_cg1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_fp16_cg1_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_fp16_cg1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse.ashift [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::use [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::use [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, 
i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_fp16_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_fp16_cg2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_fp16_cg2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_fp16_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_fp16_cg2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_fp16_cg2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_fp16_cg2_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_fp16_cg2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse.ashift [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::use [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::use [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, 
i64 0, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 0, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_fp16_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_fp16_cg1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_fp16_cg1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_fp16_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_fp16_cg1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_fp16_cg1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_fp16_cg1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_fp16_cg1_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_fp16_cg1_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 0) + + call void 
@llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_sp_fp16_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_fp16_cg2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_fp16_cg2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_fp16_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_fp16_cg2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_fp16_cg2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_fp16_cg2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_fp16_cg2_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_fp16_cg2_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, %rd2, 
[%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 0, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_tf32_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_tf32_cg1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_tf32_cg1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_tf32_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_tf32_cg1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_tf32_cg1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_tf32_cg1_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_tf32_cg1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: 
tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse.ashift [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::use [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::use [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_tf32_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_tf32_cg2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_tf32_cg2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_tf32_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_tf32_cg2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_tf32_cg2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_tf32_cg2_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_tf32_cg2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard.ashift [%r1], [%r3], 
%rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse.ashift [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::use [%r1], %rd1, %rd2, %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::use [%r1], [%r3], %rd2, %r2, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, i32 1, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_sp_tf32_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_tf32_cg1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_tf32_cg1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_tf32_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_tf32_cg1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_tf32_cg1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_tf32_cg1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_tf32_cg1_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_tf32_cg1_param_1]; +; CHECK-NEXT: 
tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_sp_tf32_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_tf32_cg2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_tf32_cg2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 
1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_tf32_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_tf32_cg2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_tf32_cg2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_tf32_cg2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_tf32_cg2_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_tf32_cg2_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 0) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 1) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 2) + + call void @llvm.nvvm.tcgen05.mma.sp.shared.scale_d(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 3) + + call void @llvm.nvvm.tcgen05.mma.sp.tensor.scale_d(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 
%enable_inp_d, ptr addrspace(6) %spmetadata, i64 0, i32 1, i32 2, i32 3) + + ret void +} + +define void @tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg1_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg1_param_6]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <4 x i32> %disable_output_lanev4, i32 0, i32 0) + + ret void +} + +define void @tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<12>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, 
[tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg2_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg2_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg2_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r11, [tcgen05_mma_fp16_shared_scale_d_disable_output_lane_cg2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.ashift.collector::a::discard [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <8 x i32> %disable_output_lanev8, i32 0, i32 0) + + ret void +} + +define void @tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg1_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg1_param_6]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r7, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.ashift.collector::a::discard [%r1], [%r7], %rd2, %r2, {%r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg1(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1(ptr 
addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg1.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <4 x i32> %disable_output_lanev4, i32 1, i32 0) + + ret void +} + +define void @tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, <4 x i32> %disable_output_lanev4, <8 x i32> %disable_output_lanev8) { +; CHECK-LABEL: tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<12>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg2_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg2_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg2_param_7]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: ld.param.b32 %r11, [tcgen05_mma_tf32_shared_scale_d_disable_output_lane_cg2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.ashift.collector::a::discard [%r1], [%r11], %rd2, %r2, {%r7, %r8, %r9, %r10, %r3, %r4, %r5, %r6}, %p1, 0; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared.scale_d.disable_output_lane.cg2(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + call void @llvm.nvvm.tcgen05.mma.tensor.scale_d.disable_output_lane.cg2.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 0, <8 x i32> %disable_output_lanev8, i32 1, i32 0) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-ws.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-ws.ll new file mode 100644 index 0000000000000..7e60b3a3ece6e --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-ws.ll @@ -0,0 +1,571 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_110a -march=nvptx64 -mattr=+ptx90 | FileCheck %s +; RUN: %if 
ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-sm_110a && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mattr=+ptx90 -mcpu=sm_110a | %ptxas-verify -arch=sm_110a %} + +define void @tcgen05_mma_ws_fp16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_ws_fp16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_ws_fp16_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_ws_fp16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_ws_fp16_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_ws_fp16_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_ws_fp16_param_4]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_ws_fp16_param_1]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 
%enable_inp_d, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 0, i32 3) + ret void +} + +define void @tcgen05_mma_ws_fp16_zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask) { +; CHECK-LABEL: tcgen05_mma_ws_fp16_zero_col_mask( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_ws_fp16_zero_col_mask_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_ws_fp16_zero_col_mask_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_ws_fp16_zero_col_mask_param_2]; +; 
CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_ws_fp16_zero_col_mask_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_ws_fp16_zero_col_mask_param_4]; +; CHECK-NEXT: ld.param.b64 %rd3, [tcgen05_mma_ws_fp16_zero_col_mask_param_6]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_ws_fp16_zero_col_mask_param_1]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::lastuse [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f16.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1, %rd3; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 
%ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, i32 0, i32 0, i32 3) + ret void +} + +define void @tcgen05_mma_ws_sp_fp16(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta) { +; CHECK-LABEL: tcgen05_mma_ws_sp_fp16( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_ws_sp_fp16_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_ws_sp_fp16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_ws_sp_fp16_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_ws_sp_fp16_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_ws_sp_fp16_param_4]; +; CHECK-NEXT: 
ld.param.b32 %r3, [tcgen05_mma_ws_sp_fp16_param_6]; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_ws_sp_fp16_param_1]; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr 
addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i32 0, i32 0, i32 3) + ret void +} + +define void @tcgen05_mma_ws_sp_fp16_zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i64 %zero_col_mask, ptr addrspace(6) %spmeta) { +; CHECK-LABEL: tcgen05_mma_ws_sp_fp16_zero_col_mask( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_4]; +; CHECK-NEXT: ld.param.b64 %rd3, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_6]; +; CHECK-NEXT: ld.param.b32 
%r3, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_7]; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_ws_sp_fp16_zero_col_mask_param_1]; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], %rd1, %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: tcgen05.mma.ws.sp.cta_group::1.kind::f16.collector::b0::use [%r1], [%r4], %rd2, [%r3], %r2, %p1, %rd3; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 
%enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.shared.zero_col_mask(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.sp.tensor.zero_col_mask(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmeta, i64 %zero_col_mask, i32 0, i32 0, i32 3) + ret void +} + +define void @tcgen05_mma_ws_tf32(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_ws_tf32( +; CHECK: { +; CHECK-NEXT: .reg .pred 
%p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_ws_tf32_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_ws_tf32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_ws_tf32_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_ws_tf32_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_ws_tf32_param_4]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_ws_tf32_param_1]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::tf32.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) 
%dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 0, i32 3) + ret void +} + +define void @tcgen05_mma_ws_f8f6f4(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_ws_f8f6f4( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_ws_f8f6f4_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_ws_f8f6f4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_ws_f8f6f4_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_ws_f8f6f4_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_ws_f8f6f4_param_4]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_ws_f8f6f4_param_1]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: 
tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::f8f6f4.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 2) + + call void 
@llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 0, i32 3) + ret void +} + +define void @tcgen05_mma_ws_i8(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_ws_i8( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_ws_i8_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_ws_i8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_ws_i8_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_ws_i8_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_ws_i8_param_4]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_ws_i8_param_1]; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: 
tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.ws.cta_group::1.kind::i8.collector::b0::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 0) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 1) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 2) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 3) + + call void @llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 3) + + call void 
@llvm.nvvm.tcgen05.mma.ws.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 0, i32 3) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma.ll new file mode 100644 index 0000000000000..711e566df5034 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma.ll @@ -0,0 +1,639 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s +; RUN: llc < %s -o - -mcpu=sm_110a -march=nvptx64 -mattr=+ptx90 | FileCheck %s +; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %} +; RUN: %if ptxas-sm_110a && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mattr=+ptx90 -mcpu=sm_110a | %ptxas-verify -arch=sm_110a %} + +define void @tcgen05_mma_fp16_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_fp16_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_fp16_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_fp16_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_fp16_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_fp16_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_fp16_cta1_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_fp16_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr 
addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_sp_fp16_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_fp16_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_fp16_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_fp16_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_fp16_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_fp16_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_fp16_cta1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_fp16_cta1_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_fp16_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) 
%spmetadata, i32 0, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 3) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_tf32_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_tf32_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_tf32_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_tf32_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_tf32_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_tf32_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_tf32_cta1_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_tf32_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 
%enable_inp_d, i32 1, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_sp_tf32_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_tf32_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_tf32_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_tf32_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_tf32_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_tf32_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_tf32_cta1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_tf32_cta1_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_tf32_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, 
ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 3) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_f8f6f4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_f8f6f4_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_f8f6f4_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_f8f6f4_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_f8f6f4_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_f8f6f4_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_f8f6f4_cta1_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_f8f6f4_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 1, i32 3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 
2, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_sp_f8f6fr_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_f8f6fr_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_f8f6fr_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_f8f6fr_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_f8f6fr_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_f8f6fr_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_f8f6fr_cta1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_f8f6fr_cta1_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_f8f6fr_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 3) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr 
addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_i8_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_i8_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_i8_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_i8_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_i8_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_i8_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_i8_cta1_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_i8_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_sp_i8_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_i8_cta1( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: 
.reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_i8_cta1_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_i8_cta1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_i8_cta1_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_i8_cta1_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_i8_cta1_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_i8_cta1_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_i8_cta1_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 3) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 1, i32 3) + ret void +} + +define void @tcgen05_mma_fp16_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_fp16_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; 
+; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_fp16_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_fp16_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_fp16_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_fp16_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_fp16_cta2_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_fp16_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_sp_fp16_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_fp16_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_fp16_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_fp16_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_fp16_cta2_param_2]; +; 
CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_fp16_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_fp16_cta2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_fp16_cta2_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_fp16_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 3) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 0, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_tf32_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_tf32_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_tf32_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_tf32_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, 
[tcgen05_mma_tf32_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_tf32_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_tf32_cta2_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_tf32_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 1, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_sp_tf32_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_tf32_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_tf32_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_tf32_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_tf32_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_tf32_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_tf32_cta2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_tf32_cta2_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ld.param.b32 
%r4, [tcgen05_mma_sp_tf32_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 3) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 1, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_f8f6f4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_f8f6f4_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_f8f6f4_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_f8f6f4_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_f8f6f4_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_f8f6f4_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_f8f6f4_cta2_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, 
[tcgen05_mma_f8f6f4_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_sp_f8f6fr_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_f8f6fr_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_f8f6fr_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_f8f6fr_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_f8f6fr_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_f8f6fr_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_f8f6fr_cta2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_f8f6fr_cta2_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_f8f6fr_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::discard.ashift [%r1], [%r4], %rd2, 
[%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 3) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 2, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_i8_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d) { +; CHECK-LABEL: tcgen05_mma_i8_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_i8_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_i8_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_i8_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_i8_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_i8_cta2_param_4]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_i8_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: 
tcgen05.mma.cta_group::2.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::fill [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::use [%r1], %rd1, %rd2, %r2, %p1; +; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::use [%r1], [%r3], %rd2, %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 3, i32 2, i32 3) + ret void +} + +define void @tcgen05_mma_sp_i8_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata) { +; CHECK-LABEL: tcgen05_mma_sp_i8_cta2( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b8 %rs1, [tcgen05_mma_sp_i8_cta2_param_5]; +; CHECK-NEXT: and.b16 %rs2, %rs1, 1; +; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; +; CHECK-NEXT: ld.param.b32 %r1, [tcgen05_mma_sp_i8_cta2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [tcgen05_mma_sp_i8_cta2_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [tcgen05_mma_sp_i8_cta2_param_3]; +; CHECK-NEXT: ld.param.b32 %r2, [tcgen05_mma_sp_i8_cta2_param_4]; +; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_sp_i8_cta2_param_6]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_i8_cta2_param_1]; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, 
%p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::fill [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::use [%r1], %rd1, %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::use [%r1], [%r4], %rd2, [%r3], %r2, %p1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.tensor.ashift(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 0) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 1) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 2) + call void @llvm.nvvm.tcgen05.mma.sp.shared(ptr addrspace(6) %dtmem, i64 %ashared, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 3) + call void @llvm.nvvm.tcgen05.mma.sp.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, ptr addrspace(6) %spmetadata, i32 3, i32 2, i32 3) + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/pr160040.ll b/llvm/test/CodeGen/PowerPC/pr160040.ll new file mode 100644 index 0000000000000..865239b37112c --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/pr160040.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s + +; Make sure this does not crash. 
+define i32 @test(i32 %arg) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: +; CHECK-NEXT: rlwinm 4, 3, 13, 19, 19 +; CHECK-NEXT: rlwinm 3, 3, 2, 30, 30 +; CHECK-NEXT: xori 4, 4, 4096 +; CHECK-NEXT: xori 3, 3, 2 +; CHECK-NEXT: rlwimi 3, 4, 0, 31, 29 +; CHECK-NEXT: blr + %icmp = icmp sgt i32 %arg, -1 + %select = select i1 %icmp, i16 1, i16 0 + %select1 = select i1 %icmp, i16 16384, i16 0 + %lshr = lshr i16 %select1, 1 + %zext = zext i16 %lshr to i32 + %lshr2 = lshr i32 %zext, 1 + %shl = shl i16 %select, 1 + %zext3 = zext i16 %shl to i32 + %or = or i32 %lshr2, %zext3 + ret i32 %or +} diff --git a/llvm/test/CodeGen/PowerPC/scalar_cmp.ll b/llvm/test/CodeGen/PowerPC/scalar_cmp.ll index aaabd76e163bb..fd0b494d57677 100644 --- a/llvm/test/CodeGen/PowerPC/scalar_cmp.ll +++ b/llvm/test/CodeGen/PowerPC/scalar_cmp.ll @@ -20,18 +20,18 @@ define float @select_oeq_float(float %a, float %b, float %c, float %d) { ; FAST-P8-LABEL: select_oeq_float: ; FAST-P8: # %bb.0: # %entry -; FAST-P8-NEXT: xssubsp f0, f2, f1 -; FAST-P8-NEXT: xssubsp f1, f1, f2 -; FAST-P8-NEXT: fsel f1, f1, f3, f4 -; FAST-P8-NEXT: fsel f1, f0, f1, f4 +; FAST-P8-NEXT: xssubsp f0, f1, f2 +; FAST-P8-NEXT: xsnegdp f1, f0 +; FAST-P8-NEXT: fsel f0, f0, f3, f4 +; FAST-P8-NEXT: fsel f1, f1, f0, f4 ; FAST-P8-NEXT: blr ; ; FAST-P9-LABEL: select_oeq_float: ; FAST-P9: # %bb.0: # %entry -; FAST-P9-NEXT: xssubsp f0, f2, f1 -; FAST-P9-NEXT: xssubsp f1, f1, f2 -; FAST-P9-NEXT: fsel f1, f1, f3, f4 -; FAST-P9-NEXT: fsel f1, f0, f1, f4 +; FAST-P9-NEXT: xssubsp f0, f1, f2 +; FAST-P9-NEXT: xsnegdp f1, f0 +; FAST-P9-NEXT: fsel f0, f0, f3, f4 +; FAST-P9-NEXT: fsel f1, f1, f0, f4 ; FAST-P9-NEXT: blr ; ; NO-FAST-P8-LABEL: select_oeq_float: @@ -59,6 +59,48 @@ entry: ret float %cond } +define float @select_oeq_float_nsz(float %a, float %b, float %c, float %d) { +; FAST-P8-LABEL: select_oeq_float_nsz: +; FAST-P8: # %bb.0: # %entry +; FAST-P8-NEXT: xssubsp f0, f2, f1 +; FAST-P8-NEXT: xssubsp f1, f1, f2 +; FAST-P8-NEXT: fsel f1, f1, f3, f4 +; FAST-P8-NEXT: fsel f1, f0, f1, f4 +; FAST-P8-NEXT: blr +; +; FAST-P9-LABEL: select_oeq_float_nsz: +; FAST-P9: # %bb.0: # %entry +; FAST-P9-NEXT: xssubsp f0, f2, f1 +; FAST-P9-NEXT: xssubsp f1, f1, f2 +; FAST-P9-NEXT: fsel f1, f1, f3, f4 +; FAST-P9-NEXT: fsel f1, f0, f1, f4 +; FAST-P9-NEXT: blr +; +; NO-FAST-P8-LABEL: select_oeq_float_nsz: +; NO-FAST-P8: # %bb.0: # %entry +; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 +; NO-FAST-P8-NEXT: beq cr0, .LBB1_2 +; NO-FAST-P8-NEXT: # %bb.1: # %entry +; NO-FAST-P8-NEXT: fmr f3, f4 +; NO-FAST-P8-NEXT: .LBB1_2: # %entry +; NO-FAST-P8-NEXT: fmr f1, f3 +; NO-FAST-P8-NEXT: blr +; +; NO-FAST-P9-LABEL: select_oeq_float_nsz: +; NO-FAST-P9: # %bb.0: # %entry +; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 +; NO-FAST-P9-NEXT: beq cr0, .LBB1_2 +; NO-FAST-P9-NEXT: # %bb.1: # %entry +; NO-FAST-P9-NEXT: fmr f3, f4 +; NO-FAST-P9-NEXT: .LBB1_2: # %entry +; NO-FAST-P9-NEXT: fmr f1, f3 +; NO-FAST-P9-NEXT: blr +entry: + %cmp = fcmp nsz oeq float %a, %b + %cond = select i1 %cmp, float %c, float %d + ret float %cond +} + define double @select_oeq_double(double %a, double %b, double %c, double %d) { ; FAST-P8-LABEL: select_oeq_double: ; FAST-P8: # %bb.0: # %entry @@ -79,20 +121,20 @@ define double @select_oeq_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8-LABEL: select_oeq_double: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P8-NEXT: beq cr0, .LBB1_2 +; NO-FAST-P8-NEXT: beq cr0, .LBB2_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: 
.LBB1_2: # %entry +; NO-FAST-P8-NEXT: .LBB2_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; ; NO-FAST-P9-LABEL: select_oeq_double: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P9-NEXT: beq cr0, .LBB1_2 +; NO-FAST-P9-NEXT: beq cr0, .LBB2_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB1_2: # %entry +; NO-FAST-P9-NEXT: .LBB2_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -182,13 +224,57 @@ entry: define float @select_one_float(float %a, float %b, float %c, float %d) { ; FAST-P8-LABEL: select_one_float: ; FAST-P8: # %bb.0: # %entry +; FAST-P8-NEXT: xssubsp f0, f1, f2 +; FAST-P8-NEXT: xsnegdp f1, f0 +; FAST-P8-NEXT: fsel f0, f0, f4, f3 +; FAST-P8-NEXT: fsel f1, f1, f0, f3 +; FAST-P8-NEXT: blr +; +; FAST-P9-LABEL: select_one_float: +; FAST-P9: # %bb.0: # %entry +; FAST-P9-NEXT: xssubsp f0, f1, f2 +; FAST-P9-NEXT: xsnegdp f1, f0 +; FAST-P9-NEXT: fsel f0, f0, f4, f3 +; FAST-P9-NEXT: fsel f1, f1, f0, f3 +; FAST-P9-NEXT: blr +; +; NO-FAST-P8-LABEL: select_one_float: +; NO-FAST-P8: # %bb.0: # %entry +; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 +; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, eq +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB5_2 +; NO-FAST-P8-NEXT: # %bb.1: # %entry +; NO-FAST-P8-NEXT: fmr f3, f4 +; NO-FAST-P8-NEXT: .LBB5_2: # %entry +; NO-FAST-P8-NEXT: fmr f1, f3 +; NO-FAST-P8-NEXT: blr +; +; NO-FAST-P9-LABEL: select_one_float: +; NO-FAST-P9: # %bb.0: # %entry +; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 +; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, eq +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB5_2 +; NO-FAST-P9-NEXT: # %bb.1: # %entry +; NO-FAST-P9-NEXT: fmr f3, f4 +; NO-FAST-P9-NEXT: .LBB5_2: # %entry +; NO-FAST-P9-NEXT: fmr f1, f3 +; NO-FAST-P9-NEXT: blr +entry: + %cmp = fcmp one float %a, %b + %cond = select i1 %cmp, float %c, float %d + ret float %cond +} + +define float @select_one_float_nsz(float %a, float %b, float %c, float %d) { +; FAST-P8-LABEL: select_one_float_nsz: +; FAST-P8: # %bb.0: # %entry ; FAST-P8-NEXT: xssubsp f0, f2, f1 ; FAST-P8-NEXT: xssubsp f1, f1, f2 ; FAST-P8-NEXT: fsel f1, f1, f4, f3 ; FAST-P8-NEXT: fsel f1, f0, f1, f3 ; FAST-P8-NEXT: blr ; -; FAST-P9-LABEL: select_one_float: +; FAST-P9-LABEL: select_one_float_nsz: ; FAST-P9: # %bb.0: # %entry ; FAST-P9-NEXT: xssubsp f0, f2, f1 ; FAST-P9-NEXT: xssubsp f1, f1, f2 @@ -196,29 +282,29 @@ define float @select_one_float(float %a, float %b, float %c, float %d) { ; FAST-P9-NEXT: fsel f1, f0, f1, f3 ; FAST-P9-NEXT: blr ; -; NO-FAST-P8-LABEL: select_one_float: +; NO-FAST-P8-LABEL: select_one_float_nsz: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, eq -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB4_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB6_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB4_2: # %entry +; NO-FAST-P8-NEXT: .LBB6_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; -; NO-FAST-P9-LABEL: select_one_float: +; NO-FAST-P9-LABEL: select_one_float_nsz: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, eq -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB4_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB6_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB4_2: # %entry +; NO-FAST-P9-NEXT: .LBB6_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: - %cmp = fcmp one float %a, %b + %cmp = fcmp nsz one 
float %a, %b %cond = select i1 %cmp, float %c, float %d ret float %cond } @@ -244,10 +330,10 @@ define double @select_one_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, eq -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB5_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB7_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB5_2: # %entry +; NO-FAST-P8-NEXT: .LBB7_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -255,10 +341,10 @@ define double @select_one_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, eq -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB5_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB7_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB5_2: # %entry +; NO-FAST-P9-NEXT: .LBB7_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -362,10 +448,10 @@ define float @select_oge_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, lt -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB8_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB10_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB8_2: # %entry +; NO-FAST-P8-NEXT: .LBB10_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -373,10 +459,10 @@ define float @select_oge_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, lt -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB8_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB10_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB8_2: # %entry +; NO-FAST-P9-NEXT: .LBB10_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -402,10 +488,10 @@ define double @select_oge_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, lt -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB9_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB11_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB9_2: # %entry +; NO-FAST-P8-NEXT: .LBB11_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -413,10 +499,10 @@ define double @select_oge_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, lt -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB9_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB11_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB9_2: # %entry +; NO-FAST-P9-NEXT: .LBB11_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -503,20 +589,20 @@ define float @select_olt_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P8-LABEL: select_olt_float: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P8-NEXT: blt cr0, .LBB12_2 +; NO-FAST-P8-NEXT: blt cr0, .LBB14_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB12_2: # %entry +; NO-FAST-P8-NEXT: .LBB14_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; 
NO-FAST-P8-NEXT: blr ; ; NO-FAST-P9-LABEL: select_olt_float: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P9-NEXT: blt cr0, .LBB12_2 +; NO-FAST-P9-NEXT: blt cr0, .LBB14_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB12_2: # %entry +; NO-FAST-P9-NEXT: .LBB14_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -541,20 +627,20 @@ define double @select_olt_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8-LABEL: select_olt_double: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P8-NEXT: blt cr0, .LBB13_2 +; NO-FAST-P8-NEXT: blt cr0, .LBB15_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB13_2: # %entry +; NO-FAST-P8-NEXT: .LBB15_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; ; NO-FAST-P9-LABEL: select_olt_double: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P9-NEXT: blt cr0, .LBB13_2 +; NO-FAST-P9-NEXT: blt cr0, .LBB15_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB13_2: # %entry +; NO-FAST-P9-NEXT: .LBB15_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -641,20 +727,20 @@ define float @select_ogt_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P8-LABEL: select_ogt_float: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P8-NEXT: bgt cr0, .LBB16_2 +; NO-FAST-P8-NEXT: bgt cr0, .LBB18_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB16_2: # %entry +; NO-FAST-P8-NEXT: .LBB18_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; ; NO-FAST-P9-LABEL: select_ogt_float: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 -; NO-FAST-P9-NEXT: bgt cr0, .LBB16_2 +; NO-FAST-P9-NEXT: bgt cr0, .LBB18_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB16_2: # %entry +; NO-FAST-P9-NEXT: .LBB18_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -679,20 +765,20 @@ define double @select_ogt_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8-LABEL: select_ogt_double: ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P8-NEXT: bgt cr0, .LBB17_2 +; NO-FAST-P8-NEXT: bgt cr0, .LBB19_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB17_2: # %entry +; NO-FAST-P8-NEXT: .LBB19_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; ; NO-FAST-P9-LABEL: select_ogt_double: ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f2 -; NO-FAST-P9-NEXT: bgt cr0, .LBB17_2 +; NO-FAST-P9-NEXT: bgt cr0, .LBB19_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB17_2: # %entry +; NO-FAST-P9-NEXT: .LBB19_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -780,10 +866,10 @@ define float @select_ole_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, gt -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB20_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB22_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB20_2: # %entry +; NO-FAST-P8-NEXT: .LBB22_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -791,10 +877,10 
@@ define float @select_ole_float(float %a, float %b, float %c, float %d) { ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, gt -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB20_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB22_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB20_2: # %entry +; NO-FAST-P9-NEXT: .LBB22_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -820,10 +906,10 @@ define double @select_ole_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P8: # %bb.0: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P8-NEXT: crnor 4*cr5+lt, un, gt -; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB21_2 +; NO-FAST-P8-NEXT: bc 12, 4*cr5+lt, .LBB23_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f3, f4 -; NO-FAST-P8-NEXT: .LBB21_2: # %entry +; NO-FAST-P8-NEXT: .LBB23_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -831,10 +917,10 @@ define double @select_ole_double(double %a, double %b, double %c, double %d) { ; NO-FAST-P9: # %bb.0: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f2 ; NO-FAST-P9-NEXT: crnor 4*cr5+lt, un, gt -; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB21_2 +; NO-FAST-P9-NEXT: bc 12, 4*cr5+lt, .LBB23_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f3, f4 -; NO-FAST-P9-NEXT: .LBB21_2: # %entry +; NO-FAST-P9-NEXT: .LBB23_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -926,13 +1012,13 @@ define double @onecmp1(double %a, double %y, double %z) { ; NO-FAST-P8-NEXT: vspltisw v2, 1 ; NO-FAST-P8-NEXT: xvcvsxwdp vs0, vs34 ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f0 -; NO-FAST-P8-NEXT: bc 12, lt, .LBB24_3 +; NO-FAST-P8-NEXT: bc 12, lt, .LBB26_3 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fcmpu cr0, f1, f1 -; NO-FAST-P8-NEXT: bc 12, un, .LBB24_3 +; NO-FAST-P8-NEXT: bc 12, un, .LBB26_3 ; NO-FAST-P8-NEXT: # %bb.2: # %entry ; NO-FAST-P8-NEXT: fmr f3, f2 -; NO-FAST-P8-NEXT: .LBB24_3: # %entry +; NO-FAST-P8-NEXT: .LBB26_3: # %entry ; NO-FAST-P8-NEXT: fmr f1, f3 ; NO-FAST-P8-NEXT: blr ; @@ -941,13 +1027,13 @@ define double @onecmp1(double %a, double %y, double %z) { ; NO-FAST-P9-NEXT: vspltisw v2, 1 ; NO-FAST-P9-NEXT: xvcvsxwdp vs0, vs34 ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f0 -; NO-FAST-P9-NEXT: bc 12, lt, .LBB24_3 +; NO-FAST-P9-NEXT: bc 12, lt, .LBB26_3 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fcmpu cr0, f1, f1 -; NO-FAST-P9-NEXT: bc 12, un, .LBB24_3 +; NO-FAST-P9-NEXT: bc 12, un, .LBB26_3 ; NO-FAST-P9-NEXT: # %bb.2: # %entry ; NO-FAST-P9-NEXT: fmr f3, f2 -; NO-FAST-P9-NEXT: .LBB24_3: # %entry +; NO-FAST-P9-NEXT: .LBB26_3: # %entry ; NO-FAST-P9-NEXT: fmr f1, f3 ; NO-FAST-P9-NEXT: blr entry: @@ -978,10 +1064,10 @@ define double @onecmp2(double %a, double %y, double %z) { ; NO-FAST-P8-NEXT: vspltisw v2, 1 ; NO-FAST-P8-NEXT: xvcvsxwdp vs0, vs34 ; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f0 -; NO-FAST-P8-NEXT: bgt cr0, .LBB25_2 +; NO-FAST-P8-NEXT: bgt cr0, .LBB27_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f2, f3 -; NO-FAST-P8-NEXT: .LBB25_2: # %entry +; NO-FAST-P8-NEXT: .LBB27_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f2 ; NO-FAST-P8-NEXT: blr ; @@ -990,10 +1076,10 @@ define double @onecmp2(double %a, double %y, double %z) { ; NO-FAST-P9-NEXT: vspltisw v2, 1 ; NO-FAST-P9-NEXT: xvcvsxwdp vs0, vs34 ; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f0 -; NO-FAST-P9-NEXT: bgt cr0, .LBB25_2 +; NO-FAST-P9-NEXT: bgt cr0, .LBB27_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr 
f2, f3 -; NO-FAST-P9-NEXT: .LBB25_2: # %entry +; NO-FAST-P9-NEXT: .LBB27_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f2 ; NO-FAST-P9-NEXT: blr entry: @@ -1028,10 +1114,10 @@ define double @onecmp3(double %a, double %y, double %z) { ; NO-FAST-P8-NEXT: vspltisw v2, 1 ; NO-FAST-P8-NEXT: xvcvsxwdp vs0, vs34 ; NO-FAST-P8-NEXT: xscmpudp cr0, f1, f0 -; NO-FAST-P8-NEXT: beq cr0, .LBB26_2 +; NO-FAST-P8-NEXT: beq cr0, .LBB28_2 ; NO-FAST-P8-NEXT: # %bb.1: # %entry ; NO-FAST-P8-NEXT: fmr f2, f3 -; NO-FAST-P8-NEXT: .LBB26_2: # %entry +; NO-FAST-P8-NEXT: .LBB28_2: # %entry ; NO-FAST-P8-NEXT: fmr f1, f2 ; NO-FAST-P8-NEXT: blr ; @@ -1040,10 +1126,10 @@ define double @onecmp3(double %a, double %y, double %z) { ; NO-FAST-P9-NEXT: vspltisw v2, 1 ; NO-FAST-P9-NEXT: xvcvsxwdp vs0, vs34 ; NO-FAST-P9-NEXT: xscmpudp cr0, f1, f0 -; NO-FAST-P9-NEXT: beq cr0, .LBB26_2 +; NO-FAST-P9-NEXT: beq cr0, .LBB28_2 ; NO-FAST-P9-NEXT: # %bb.1: # %entry ; NO-FAST-P9-NEXT: fmr f2, f3 -; NO-FAST-P9-NEXT: .LBB26_2: # %entry +; NO-FAST-P9-NEXT: .LBB28_2: # %entry ; NO-FAST-P9-NEXT: fmr f1, f2 ; NO-FAST-P9-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/vector-all-ones.ll b/llvm/test/CodeGen/PowerPC/vector-all-ones.ll new file mode 100644 index 0000000000000..e4c93adcf50a6 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vector-all-ones.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc64-ibm-aix \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr9 -mtriple=powerpc-ibm-aix \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; Currently the generated code uses `vspltisw` to generate a vector of 1s followed by an add operation. +; This pattern is expected to be optimized in a future patch by using `xxleqv` to generate a vector of -1s +; followed by a subtraction operation. 
+define dso_local noundef <4 x i32> @test1(<4 x i32> %a) { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vspltisw v3, 1 +; CHECK-NEXT: vadduwm v2, v2, v3 +; CHECK-NEXT: blr +entry: + %add = add <4 x i32> %a, splat (i32 1) + ret <4 x i32> %add +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/float-intrinsics.ll b/llvm/test/CodeGen/RISCV/GlobalISel/float-intrinsics.ll index 7f387a763b6da..23f660bb026a7 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/float-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/float-intrinsics.ll @@ -10,7 +10,7 @@ ; RUN: | FileCheck -check-prefix=RV64IF %s ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -global-isel -mattr=+d \ ; RUN: -target-abi=lp64d \ -; RUN: | FileCheck -check-prefix=RV64IF %s +; RUN: | FileCheck -check-prefix=RV64IFD %s ; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -global-isel \ ; RUN: | FileCheck -check-prefix=RV32I %s ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -global-isel \ @@ -27,6 +27,11 @@ define float @sqrt_f32(float %a) nounwind { ; RV64IF-NEXT: fsqrt.s fa0, fa0 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: sqrt_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fsqrt.s fa0, fa0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: sqrt_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -68,6 +73,16 @@ define float @powi_f32(float %a, i32 %b) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: powi_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: sext.w a0, a0 +; RV64IFD-NEXT: call __powisf2 +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: powi_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -109,6 +124,15 @@ define float @sin_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: sin_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call sinf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: sin_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -149,6 +173,15 @@ define float @cos_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: cos_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call cosf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: cos_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -190,6 +223,42 @@ define float @sincos_f32(float %a) nounwind { ; RV32IF-NEXT: addi sp, sp, 16 ; RV32IF-NEXT: ret ; +; RV64IF-LABEL: sincos_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: addi sp, sp, -16 +; RV64IF-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill +; RV64IF-NEXT: fsw fs1, 0(sp) # 4-byte Folded Spill +; RV64IF-NEXT: fmv.s fs0, fa0 +; RV64IF-NEXT: call sinf +; RV64IF-NEXT: fmv.s fs1, fa0 +; RV64IF-NEXT: fmv.s fa0, fs0 +; RV64IF-NEXT: call cosf +; RV64IF-NEXT: fadd.s fa0, fs1, fa0 +; RV64IF-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload +; RV64IF-NEXT: flw fs1, 0(sp) # 4-byte Folded Reload +; RV64IF-NEXT: addi sp, sp, 16 +; RV64IF-NEXT: ret +; +; RV64IFD-LABEL: sincos_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -32 +; 
RV64IFD-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: fsd fs1, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: fmv.s fs0, fa0 +; RV64IFD-NEXT: call sinf +; RV64IFD-NEXT: fmv.s fs1, fa0 +; RV64IFD-NEXT: fmv.s fa0, fs0 +; RV64IFD-NEXT: call cosf +; RV64IFD-NEXT: fadd.s fa0, fs1, fa0 +; RV64IFD-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: fld fs1, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 32 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: sincos_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -254,6 +323,15 @@ define float @pow_f32(float %a, float %b) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: pow_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call powf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: pow_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -294,6 +372,15 @@ define float @exp_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: exp_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call expf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: exp_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -334,6 +421,15 @@ define float @exp2_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: exp2_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call exp2f +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: exp2_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -374,6 +470,15 @@ define float @exp10_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: exp10_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call exp10f +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: exp10_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -414,6 +519,15 @@ define float @log_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: log_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call logf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: log_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -454,6 +568,15 @@ define float @log10_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: log10_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call log10f +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: log10_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -494,6 +617,15 @@ define float @log2_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; 
RV64IFD-LABEL: log2_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call log2f +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: log2_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -526,6 +658,11 @@ define float @fma_f32(float %a, float %b, float %c) nounwind { ; RV64IF-NEXT: fmadd.s fa0, fa0, fa1, fa2 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: fma_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fmadd.s fa0, fa0, fa1, fa2 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: fma_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -558,6 +695,11 @@ define float @fmuladd_f32(float %a, float %b, float %c) nounwind { ; RV64IF-NEXT: fmadd.s fa0, fa0, fa1, fa2 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: fmuladd_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fmadd.s fa0, fa0, fa1, fa2 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: fmuladd_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -600,6 +742,11 @@ define float @fabs_f32(float %a) nounwind { ; RV64IF-NEXT: fabs.s fa0, fa0 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: fabs_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fabs.s fa0, fa0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: fabs_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 1 @@ -626,6 +773,11 @@ define float @minnum_f32(float %a, float %b) nounwind { ; RV64IF-NEXT: fmin.s fa0, fa0, fa1 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: minnum_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fmin.s fa0, fa0, fa1 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: minnum_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -658,6 +810,11 @@ define float @maxnum_f32(float %a, float %b) nounwind { ; RV64IF-NEXT: fmax.s fa0, fa0, fa1 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: maxnum_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fmax.s fa0, fa0, fa1 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: maxnum_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -690,6 +847,11 @@ define float @copysign_f32(float %a, float %b) nounwind { ; RV64IF-NEXT: fsgnj.s fa0, fa0, fa1 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: copysign_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fsgnj.s fa0, fa0, fa1 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: copysign_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 524288 @@ -730,6 +892,15 @@ define float @ceil_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: ceil_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call ceilf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: ceil_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -770,6 +941,15 @@ define float @trunc_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: trunc_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call truncf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: trunc_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -810,6 +990,15 @@ define float @rint_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: rint_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call rintf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte 
Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: rint_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -850,6 +1039,15 @@ define float @nearbyint_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: nearbyint_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call nearbyintf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: nearbyint_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -890,6 +1088,15 @@ define float @round_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: round_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call roundf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: round_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -930,6 +1137,15 @@ define float @roundeven_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: roundeven_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call roundevenf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: roundeven_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -966,6 +1182,13 @@ define i1 @fpclass(float %x) { ; RV64IF-NEXT: snez a0, a0 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 927 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a1, 522240 @@ -1039,6 +1262,13 @@ define i1 @isnan_fpclass(float %x) { ; RV64IF-NEXT: snez a0, a0 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: isnan_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 768 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isnan_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a1, 522240 @@ -1073,6 +1303,13 @@ define i1 @isqnan_fpclass(float %x) { ; RV64IF-NEXT: snez a0, a0 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: isqnan_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 512 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isqnan_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 1 @@ -1109,6 +1346,13 @@ define i1 @issnan_fpclass(float %x) { ; RV64IF-NEXT: snez a0, a0 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: issnan_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 256 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: issnan_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a1, 522240 @@ -1149,6 +1393,13 @@ define i1 @isinf_fpclass(float %x) { ; RV64IF-NEXT: snez a0, a0 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: isinf_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 129 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isinf_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a1, 522240 @@ -1185,6 +1436,13 @@ define i1 @isposinf_fpclass(float %x) { ; RV64IF-NEXT: snez a0, a0 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: isposinf_fpclass: +; RV64IFD: # %bb.0: +; 
RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 128 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isposinf_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a1, 522240 @@ -1218,6 +1476,13 @@ define i1 @isneginf_fpclass(float %x) { ; RV64IF-NEXT: snez a0, a0 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: isneginf_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 1 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isneginf_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a1, 1046528 @@ -1251,6 +1516,13 @@ define i1 @isfinite_fpclass(float %x) { ; RV64IF-NEXT: snez a0, a0 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: isfinite_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 126 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isfinite_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a1, 522240 @@ -1285,6 +1557,13 @@ define i1 @isposfinite_fpclass(float %x) { ; RV64IF-NEXT: snez a0, a0 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: isposfinite_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 112 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isposfinite_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a1, 522240 @@ -1316,6 +1595,13 @@ define i1 @isnegfinite_fpclass(float %x) { ; RV64IF-NEXT: snez a0, a0 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: isnegfinite_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 14 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isnegfinite_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a1, 522240 @@ -1357,6 +1643,13 @@ define i1 @isnotfinite_fpclass(float %x) { ; RV64IF-NEXT: snez a0, a0 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: isnotfinite_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 897 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isnotfinite_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a1, 522240 @@ -1401,6 +1694,15 @@ define float @tan_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: tan_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call tanf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: tan_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1442,6 +1744,16 @@ define float @ldexp_float(float %x, i32 %y) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: ldexp_float: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: sext.w a0, a0 +; RV64IFD-NEXT: call ldexpf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: ldexp_float: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1483,6 +1795,15 @@ define float @asin_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: asin_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call asinf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: asin_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1523,6 +1844,15 @@ define float 
@acos_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: acos_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call acosf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: acos_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1563,6 +1893,15 @@ define float @atan_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: atan_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call atanf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: atan_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1603,6 +1942,15 @@ define float @atan2_f32(float %a, float %b) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: atan2_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call atan2f +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: atan2_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1643,6 +1991,15 @@ define float @sinh_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: sinh_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call sinhf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: sinh_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1683,6 +2040,15 @@ define float @cosh_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: cosh_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call coshf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: cosh_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1723,6 +2089,15 @@ define float @tanh_f32(float %a) nounwind { ; RV64IF-NEXT: addi sp, sp, 16 ; RV64IF-NEXT: ret ; +; RV64IFD-LABEL: tanh_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call tanhf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: tanh_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/fpr-gpr-copy-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/fpr-gpr-copy-rv64.ll index 287bbbad6d52d..2a2abbdf9fa35 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/fpr-gpr-copy-rv64.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/fpr-gpr-copy-rv64.ll @@ -30,6 +30,13 @@ define float @fadd_f32(float %x, float %y) { ; RV32I-NEXT: fadd.d fa5, fa5, fa4 ; RV32I-NEXT: fmv.x.d a0, fa5 ; RV32I-NEXT: ret +; RV64I-LABEL: fadd_f32: +; RV64I: # %bb.0: +; RV64I-NEXT: fmv.w.x fa5, a0 +; RV64I-NEXT: fmv.w.x fa4, a1 +; RV64I-NEXT: fadd.s fa5, fa5, fa4 +; RV64I-NEXT: fmv.x.w a0, fa5 +; RV64I-NEXT: ret %a = fadd float %x, %y ret float %a } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir 
b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir index 86084ae531cdb..7204064a07f40 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir @@ -635,6 +635,9 @@ # DEBUG-NEXT: G_GET_ROUNDING (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT:.. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SET_ROUNDING (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT:.. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_PTR_ADD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 2, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK diff --git a/llvm/test/CodeGen/RISCV/attributes-andes.ll b/llvm/test/CodeGen/RISCV/attributes-andes.ll index ed27a9255a86f..fc2b1b123af51 100644 --- a/llvm/test/CodeGen/RISCV/attributes-andes.ll +++ b/llvm/test/CodeGen/RISCV/attributes-andes.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+xandesperf %s -o - | FileCheck --check-prefix=RV32XANDESPERF %s ; RUN: llc -mtriple=riscv32 -mattr=+xandesbfhcvt %s -o - | FileCheck --check-prefix=RV32XANDESBFHCVT %s ; RUN: llc -mtriple=riscv32 -mattr=+xandesvbfhcvt %s -o - | FileCheck --check-prefix=RV32XANDESVBFHCVT %s +; RUN: llc -mtriple=riscv32 -mattr=+xandesvsinth %s -o - | FileCheck --check-prefix=RV32XANDESVSINTH %s ; RUN: llc -mtriple=riscv32 -mattr=+xandesvsintload %s -o - | FileCheck --check-prefix=RV32XANDESVSINTLOAD %s ; RUN: llc -mtriple=riscv32 -mattr=+xandesvdot %s -o - | FileCheck --check-prefix=RV32XANDESVDOT %s ; RUN: llc -mtriple=riscv32 -mattr=+xandesvpackfph %s -o - | FileCheck --check-prefix=RV32XANDESVPACKFPH %s @@ -10,6 +11,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+xandesperf %s -o - | FileCheck --check-prefix=RV64XANDESPERF %s ; RUN: llc -mtriple=riscv64 -mattr=+xandesbfhcvt %s -o - | FileCheck --check-prefix=RV64XANDESBFHCVT %s ; RUN: llc -mtriple=riscv64 -mattr=+xandesvbfhcvt %s -o - | FileCheck --check-prefix=RV64XANDESVBFHCVT %s +; RUN: llc -mtriple=riscv64 -mattr=+xandesvsinth %s -o - | FileCheck --check-prefix=RV64XANDESVSINTH %s ; RUN: llc -mtriple=riscv64 -mattr=+xandesvsintload %s -o - | FileCheck --check-prefix=RV64XANDESVSINTLOAD %s ; RUN: llc -mtriple=riscv64 -mattr=+xandesvdot %s -o - | FileCheck --check-prefix=RV64XANDESVDOT %s ; RUN: llc -mtriple=riscv64 -mattr=+xandesvpackfph %s -o - | FileCheck --check-prefix=RV64XANDESVPACKFPH %s @@ -17,6 +19,7 @@ ; RV32XANDESPERF: .attribute 5, "rv32i2p1_xandesperf5p0" ; RV32XANDESBFHCVT: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_xandesbfhcvt5p0" ; RV32XANDESVBFHCVT: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvl32b1p0_xandesvbfhcvt5p0" +; RV32XANDESVSINTH: .attribute 5, "rv32i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvsinth5p0" ; RV32XANDESVSINTLOAD: .attribute 5, "rv32i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvsintload5p0" ; RV32XANDESVDOT: .attribute 5, "rv32i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvdot5p0" ; RV32XANDESVPACKFPH: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_xandesvpackfph5p0" @@ -24,6 +27,7 @@ ; RV64XANDESPERF: .attribute 5, "rv64i2p1_xandesperf5p0" ; RV64XANDESBFHCVT: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_xandesbfhcvt5p0" ; RV64XANDESVBFHCVT: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvl32b1p0_xandesvbfhcvt5p0" +; RV64XANDESVSINTH: .attribute 5, 
"rv64i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvsinth5p0" ; RV64XANDESVSINTLOAD: .attribute 5, "rv64i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvsintload5p0" ; RV64XANDESVDOT: .attribute 5, "rv64i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xandesvdot5p0" ; RV64XANDESVPACKFPH: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_xandesvpackfph5p0" diff --git a/llvm/test/CodeGen/RISCV/bfloat-convert.ll b/llvm/test/CodeGen/RISCV/bfloat-convert.ll index 6207a17734d62..73ff888e44b3b 100644 --- a/llvm/test/CodeGen/RISCV/bfloat-convert.ll +++ b/llvm/test/CodeGen/RISCV/bfloat-convert.ll @@ -51,13 +51,14 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind { ; CHECK32ZFBFMIN-LABEL: fcvt_si_bf16_sat: ; CHECK32ZFBFMIN: # %bb.0: # %start ; CHECK32ZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK32ZFBFMIN-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK32ZFBFMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK32ZFBFMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; CHECK32ZFBFMIN-NEXT: lui a0, 815104 -; CHECK32ZFBFMIN-NEXT: fmv.w.x fa3, a0 -; CHECK32ZFBFMIN-NEXT: fmax.s fa5, fa5, fa3 -; CHECK32ZFBFMIN-NEXT: neg a0, a1 +; CHECK32ZFBFMIN-NEXT: lui a1, 290816 +; CHECK32ZFBFMIN-NEXT: fmv.w.x fa4, a0 +; CHECK32ZFBFMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK32ZFBFMIN-NEXT: addi a1, a1, -512 +; CHECK32ZFBFMIN-NEXT: neg a0, a0 +; CHECK32ZFBFMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK32ZFBFMIN-NEXT: fmv.w.x fa4, a1 ; CHECK32ZFBFMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32ZFBFMIN-NEXT: fcvt.w.s a1, fa5, rtz ; CHECK32ZFBFMIN-NEXT: and a0, a0, a1 @@ -68,12 +69,13 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind { ; RV32ID-NEXT: fmv.x.w a0, fa0 ; RV32ID-NEXT: lui a1, 815104 ; RV32ID-NEXT: fmv.w.x fa5, a1 -; RV32ID-NEXT: lui a1, %hi(.LCPI1_0) +; RV32ID-NEXT: lui a1, 290816 ; RV32ID-NEXT: slli a0, a0, 16 -; RV32ID-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; RV32ID-NEXT: fmv.w.x fa3, a0 -; RV32ID-NEXT: feq.s a0, fa3, fa3 -; RV32ID-NEXT: fmax.s fa5, fa3, fa5 +; RV32ID-NEXT: addi a1, a1, -512 +; RV32ID-NEXT: fmv.w.x fa4, a0 +; RV32ID-NEXT: feq.s a0, fa4, fa4 +; RV32ID-NEXT: fmax.s fa5, fa4, fa5 +; RV32ID-NEXT: fmv.w.x fa4, a1 ; RV32ID-NEXT: neg a0, a0 ; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.w.s a1, fa5, rtz @@ -83,13 +85,14 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind { ; CHECK64ZFBFMIN-LABEL: fcvt_si_bf16_sat: ; CHECK64ZFBFMIN: # %bb.0: # %start ; CHECK64ZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 -; CHECK64ZFBFMIN-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK64ZFBFMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK64ZFBFMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; CHECK64ZFBFMIN-NEXT: lui a0, 815104 -; CHECK64ZFBFMIN-NEXT: fmv.w.x fa3, a0 -; CHECK64ZFBFMIN-NEXT: fmax.s fa5, fa5, fa3 -; CHECK64ZFBFMIN-NEXT: neg a0, a1 +; CHECK64ZFBFMIN-NEXT: lui a1, 290816 +; CHECK64ZFBFMIN-NEXT: fmv.w.x fa4, a0 +; CHECK64ZFBFMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK64ZFBFMIN-NEXT: addi a1, a1, -512 +; CHECK64ZFBFMIN-NEXT: neg a0, a0 +; CHECK64ZFBFMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK64ZFBFMIN-NEXT: fmv.w.x fa4, a1 ; CHECK64ZFBFMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64ZFBFMIN-NEXT: fcvt.l.s a1, fa5, rtz ; CHECK64ZFBFMIN-NEXT: and a0, a0, a1 @@ -100,12 +103,13 @@ define i16 @fcvt_si_bf16_sat(bfloat %a) nounwind { ; RV64ID-NEXT: fmv.x.w a0, fa0 ; RV64ID-NEXT: lui a1, 815104 ; RV64ID-NEXT: fmv.w.x fa5, a1 -; RV64ID-NEXT: lui a1, %hi(.LCPI1_0) +; RV64ID-NEXT: lui a1, 290816 ; RV64ID-NEXT: slli a0, a0, 16 -; RV64ID-NEXT: flw fa4, %lo(.LCPI1_0)(a1) -; RV64ID-NEXT: fmv.w.x fa3, a0 -; RV64ID-NEXT: feq.s a0, fa3, fa3 -; RV64ID-NEXT: fmax.s fa5, fa3, fa5 +; RV64ID-NEXT: addi a1, a1, -512 +; RV64ID-NEXT: fmv.w.x fa4, a0 +; RV64ID-NEXT: feq.s a0, fa4, fa4 +; 
RV64ID-NEXT: fmax.s fa5, fa4, fa5 +; RV64ID-NEXT: fmv.w.x fa4, a1 ; RV64ID-NEXT: neg a0, a0 ; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.l.s a1, fa5, rtz @@ -152,49 +156,53 @@ define i16 @fcvt_ui_bf16(bfloat %a) nounwind { define i16 @fcvt_ui_bf16_sat(bfloat %a) nounwind { ; CHECK32ZFBFMIN-LABEL: fcvt_ui_bf16_sat: ; CHECK32ZFBFMIN: # %bb.0: # %start -; CHECK32ZFBFMIN-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK32ZFBFMIN-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; CHECK32ZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 -; CHECK32ZFBFMIN-NEXT: fmv.w.x fa3, zero -; CHECK32ZFBFMIN-NEXT: fmax.s fa4, fa4, fa3 -; CHECK32ZFBFMIN-NEXT: fmin.s fa5, fa4, fa5 +; CHECK32ZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; CHECK32ZFBFMIN-NEXT: fmv.w.x fa4, zero +; CHECK32ZFBFMIN-NEXT: lui a0, 292864 +; CHECK32ZFBFMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK32ZFBFMIN-NEXT: addi a0, a0, -256 +; CHECK32ZFBFMIN-NEXT: fmv.w.x fa4, a0 +; CHECK32ZFBFMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32ZFBFMIN-NEXT: fcvt.wu.s a0, fa5, rtz ; CHECK32ZFBFMIN-NEXT: ret ; ; RV32ID-LABEL: fcvt_ui_bf16_sat: ; RV32ID: # %bb.0: # %start -; RV32ID-NEXT: lui a0, %hi(.LCPI3_0) -; RV32ID-NEXT: flw fa5, %lo(.LCPI3_0)(a0) ; RV32ID-NEXT: fmv.x.w a0, fa0 +; RV32ID-NEXT: fmv.w.x fa5, zero ; RV32ID-NEXT: slli a0, a0, 16 ; RV32ID-NEXT: fmv.w.x fa4, a0 -; RV32ID-NEXT: fmv.w.x fa3, zero -; RV32ID-NEXT: fmax.s fa4, fa4, fa3 -; RV32ID-NEXT: fmin.s fa5, fa4, fa5 +; RV32ID-NEXT: lui a0, 292864 +; RV32ID-NEXT: addi a0, a0, -256 +; RV32ID-NEXT: fmax.s fa5, fa4, fa5 +; RV32ID-NEXT: fmv.w.x fa4, a0 +; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32ID-NEXT: ret ; ; CHECK64ZFBFMIN-LABEL: fcvt_ui_bf16_sat: ; CHECK64ZFBFMIN: # %bb.0: # %start -; CHECK64ZFBFMIN-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK64ZFBFMIN-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; CHECK64ZFBFMIN-NEXT: fcvt.s.bf16 fa4, fa0 -; CHECK64ZFBFMIN-NEXT: fmv.w.x fa3, zero -; CHECK64ZFBFMIN-NEXT: fmax.s fa4, fa4, fa3 -; CHECK64ZFBFMIN-NEXT: fmin.s fa5, fa4, fa5 +; CHECK64ZFBFMIN-NEXT: fcvt.s.bf16 fa5, fa0 +; CHECK64ZFBFMIN-NEXT: fmv.w.x fa4, zero +; CHECK64ZFBFMIN-NEXT: lui a0, 292864 +; CHECK64ZFBFMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK64ZFBFMIN-NEXT: addi a0, a0, -256 +; CHECK64ZFBFMIN-NEXT: fmv.w.x fa4, a0 +; CHECK64ZFBFMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64ZFBFMIN-NEXT: fcvt.lu.s a0, fa5, rtz ; CHECK64ZFBFMIN-NEXT: ret ; ; RV64ID-LABEL: fcvt_ui_bf16_sat: ; RV64ID: # %bb.0: # %start -; RV64ID-NEXT: lui a0, %hi(.LCPI3_0) -; RV64ID-NEXT: flw fa5, %lo(.LCPI3_0)(a0) ; RV64ID-NEXT: fmv.x.w a0, fa0 +; RV64ID-NEXT: fmv.w.x fa5, zero ; RV64ID-NEXT: slli a0, a0, 16 ; RV64ID-NEXT: fmv.w.x fa4, a0 -; RV64ID-NEXT: fmv.w.x fa3, zero -; RV64ID-NEXT: fmax.s fa4, fa4, fa3 -; RV64ID-NEXT: fmin.s fa5, fa4, fa5 +; RV64ID-NEXT: lui a0, 292864 +; RV64ID-NEXT: addi a0, a0, -256 +; RV64ID-NEXT: fmax.s fa5, fa4, fa5 +; RV64ID-NEXT: fmv.w.x fa4, a0 +; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64ID-NEXT: ret start: @@ -472,20 +480,21 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; RV32IZFBFMIN-NEXT: # %bb.1: # %start ; RV32IZFBFMIN-NEXT: mv a2, a1 ; RV32IZFBFMIN-NEXT: .LBB10_2: # %start -; RV32IZFBFMIN-NEXT: lui a1, %hi(.LCPI10_0) -; RV32IZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1) +; RV32IZFBFMIN-NEXT: lui a1, 389120 +; RV32IZFBFMIN-NEXT: addi a1, a1, -1 +; RV32IZFBFMIN-NEXT: fmv.w.x fa5, a1 ; RV32IZFBFMIN-NEXT: flt.s a1, fa5, fs0 ; RV32IZFBFMIN-NEXT: beqz a1, .LBB10_4 ; RV32IZFBFMIN-NEXT: # %bb.3: ; RV32IZFBFMIN-NEXT: addi a2, a3, -1 ; RV32IZFBFMIN-NEXT: .LBB10_4: # %start ; RV32IZFBFMIN-NEXT: 
feq.s a3, fs0, fs0 -; RV32IZFBFMIN-NEXT: neg a4, a1 -; RV32IZFBFMIN-NEXT: neg a1, s0 +; RV32IZFBFMIN-NEXT: neg a4, s0 +; RV32IZFBFMIN-NEXT: neg a5, a1 ; RV32IZFBFMIN-NEXT: neg a3, a3 -; RV32IZFBFMIN-NEXT: and a0, a1, a0 +; RV32IZFBFMIN-NEXT: and a0, a4, a0 ; RV32IZFBFMIN-NEXT: and a1, a3, a2 -; RV32IZFBFMIN-NEXT: or a0, a4, a0 +; RV32IZFBFMIN-NEXT: or a0, a5, a0 ; RV32IZFBFMIN-NEXT: and a0, a3, a0 ; RV32IZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -511,20 +520,21 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; R32IDZFBFMIN-NEXT: # %bb.1: # %start ; R32IDZFBFMIN-NEXT: mv a2, a1 ; R32IDZFBFMIN-NEXT: .LBB10_2: # %start -; R32IDZFBFMIN-NEXT: lui a1, %hi(.LCPI10_0) -; R32IDZFBFMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1) +; R32IDZFBFMIN-NEXT: lui a1, 389120 +; R32IDZFBFMIN-NEXT: addi a1, a1, -1 +; R32IDZFBFMIN-NEXT: fmv.w.x fa5, a1 ; R32IDZFBFMIN-NEXT: flt.s a1, fa5, fs0 ; R32IDZFBFMIN-NEXT: beqz a1, .LBB10_4 ; R32IDZFBFMIN-NEXT: # %bb.3: ; R32IDZFBFMIN-NEXT: addi a2, a3, -1 ; R32IDZFBFMIN-NEXT: .LBB10_4: # %start ; R32IDZFBFMIN-NEXT: feq.s a3, fs0, fs0 -; R32IDZFBFMIN-NEXT: neg a4, a1 -; R32IDZFBFMIN-NEXT: neg a1, s0 +; R32IDZFBFMIN-NEXT: neg a4, s0 +; R32IDZFBFMIN-NEXT: neg a5, a1 ; R32IDZFBFMIN-NEXT: neg a3, a3 -; R32IDZFBFMIN-NEXT: and a0, a1, a0 +; R32IDZFBFMIN-NEXT: and a0, a4, a0 ; R32IDZFBFMIN-NEXT: and a1, a3, a2 -; R32IDZFBFMIN-NEXT: or a0, a4, a0 +; R32IDZFBFMIN-NEXT: or a0, a5, a0 ; R32IDZFBFMIN-NEXT: and a0, a3, a0 ; R32IDZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; R32IDZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -552,8 +562,9 @@ define i64 @fcvt_l_bf16_sat(bfloat %a) nounwind { ; RV32ID-NEXT: # %bb.1: # %start ; RV32ID-NEXT: mv a2, a1 ; RV32ID-NEXT: .LBB10_2: # %start -; RV32ID-NEXT: lui a1, %hi(.LCPI10_0) -; RV32ID-NEXT: flw fa5, %lo(.LCPI10_0)(a1) +; RV32ID-NEXT: lui a1, 389120 +; RV32ID-NEXT: addi a1, a1, -1 +; RV32ID-NEXT: fmv.w.x fa5, a1 ; RV32ID-NEXT: flt.s a1, fa5, fs0 ; RV32ID-NEXT: beqz a1, .LBB10_4 ; RV32ID-NEXT: # %bb.3: @@ -641,30 +652,59 @@ define i64 @fcvt_lu_bf16(bfloat %a) nounwind { } define i64 @fcvt_lu_bf16_sat(bfloat %a) nounwind { -; CHECK32ZFBFMIN-LABEL: fcvt_lu_bf16_sat: -; CHECK32ZFBFMIN: # %bb.0: # %start -; CHECK32ZFBFMIN-NEXT: addi sp, sp, -16 -; CHECK32ZFBFMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; CHECK32ZFBFMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; CHECK32ZFBFMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; CHECK32ZFBFMIN-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK32ZFBFMIN-NEXT: flw fa5, %lo(.LCPI12_0)(a0) -; CHECK32ZFBFMIN-NEXT: fcvt.s.bf16 fa0, fa0 -; CHECK32ZFBFMIN-NEXT: fmv.w.x fa4, zero -; CHECK32ZFBFMIN-NEXT: fle.s a0, fa4, fa0 -; CHECK32ZFBFMIN-NEXT: flt.s a1, fa5, fa0 -; CHECK32ZFBFMIN-NEXT: neg s0, a1 -; CHECK32ZFBFMIN-NEXT: neg s1, a0 -; CHECK32ZFBFMIN-NEXT: call __fixunssfdi -; CHECK32ZFBFMIN-NEXT: and a0, s1, a0 -; CHECK32ZFBFMIN-NEXT: and a1, s1, a1 -; CHECK32ZFBFMIN-NEXT: or a0, s0, a0 -; CHECK32ZFBFMIN-NEXT: or a1, s0, a1 -; CHECK32ZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; CHECK32ZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; CHECK32ZFBFMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload -; CHECK32ZFBFMIN-NEXT: addi sp, sp, 16 -; CHECK32ZFBFMIN-NEXT: ret +; RV32IZFBFMIN-LABEL: fcvt_lu_bf16_sat: +; RV32IZFBFMIN: # %bb.0: # %start +; RV32IZFBFMIN-NEXT: addi sp, sp, -16 +; RV32IZFBFMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IZFBFMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32IZFBFMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded 
Spill +; RV32IZFBFMIN-NEXT: fcvt.s.bf16 fs0, fa0 +; RV32IZFBFMIN-NEXT: fmv.w.x fa5, zero +; RV32IZFBFMIN-NEXT: fle.s a0, fa5, fs0 +; RV32IZFBFMIN-NEXT: neg s0, a0 +; RV32IZFBFMIN-NEXT: fmv.s fa0, fs0 +; RV32IZFBFMIN-NEXT: call __fixunssfdi +; RV32IZFBFMIN-NEXT: and a0, s0, a0 +; RV32IZFBFMIN-NEXT: lui a2, 391168 +; RV32IZFBFMIN-NEXT: and a1, s0, a1 +; RV32IZFBFMIN-NEXT: addi a2, a2, -1 +; RV32IZFBFMIN-NEXT: fmv.w.x fa5, a2 +; RV32IZFBFMIN-NEXT: flt.s a2, fa5, fs0 +; RV32IZFBFMIN-NEXT: neg a2, a2 +; RV32IZFBFMIN-NEXT: or a0, a2, a0 +; RV32IZFBFMIN-NEXT: or a1, a2, a1 +; RV32IZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32IZFBFMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload +; RV32IZFBFMIN-NEXT: addi sp, sp, 16 +; RV32IZFBFMIN-NEXT: ret +; +; R32IDZFBFMIN-LABEL: fcvt_lu_bf16_sat: +; R32IDZFBFMIN: # %bb.0: # %start +; R32IDZFBFMIN-NEXT: addi sp, sp, -16 +; R32IDZFBFMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; R32IDZFBFMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; R32IDZFBFMIN-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; R32IDZFBFMIN-NEXT: fcvt.s.bf16 fs0, fa0 +; R32IDZFBFMIN-NEXT: fmv.w.x fa5, zero +; R32IDZFBFMIN-NEXT: fle.s a0, fa5, fs0 +; R32IDZFBFMIN-NEXT: neg s0, a0 +; R32IDZFBFMIN-NEXT: fmv.s fa0, fs0 +; R32IDZFBFMIN-NEXT: call __fixunssfdi +; R32IDZFBFMIN-NEXT: and a0, s0, a0 +; R32IDZFBFMIN-NEXT: lui a2, 391168 +; R32IDZFBFMIN-NEXT: and a1, s0, a1 +; R32IDZFBFMIN-NEXT: addi a2, a2, -1 +; R32IDZFBFMIN-NEXT: fmv.w.x fa5, a2 +; R32IDZFBFMIN-NEXT: flt.s a2, fa5, fs0 +; R32IDZFBFMIN-NEXT: neg a2, a2 +; R32IDZFBFMIN-NEXT: or a0, a2, a0 +; R32IDZFBFMIN-NEXT: or a1, a2, a1 +; R32IDZFBFMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; R32IDZFBFMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; R32IDZFBFMIN-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload +; R32IDZFBFMIN-NEXT: addi sp, sp, 16 +; R32IDZFBFMIN-NEXT: ret ; ; RV32ID-LABEL: fcvt_lu_bf16_sat: ; RV32ID: # %bb.0: # %start @@ -673,15 +713,16 @@ define i64 @fcvt_lu_bf16_sat(bfloat %a) nounwind { ; RV32ID-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32ID-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32ID-NEXT: fmv.x.w a0, fa0 -; RV32ID-NEXT: lui a1, %hi(.LCPI12_0) -; RV32ID-NEXT: fmv.w.x fa5, zero -; RV32ID-NEXT: flw fa4, %lo(.LCPI12_0)(a1) +; RV32ID-NEXT: lui a1, 391168 ; RV32ID-NEXT: slli a0, a0, 16 +; RV32ID-NEXT: addi a1, a1, -1 ; RV32ID-NEXT: fmv.w.x fa0, a0 -; RV32ID-NEXT: fle.s a0, fa5, fa0 -; RV32ID-NEXT: flt.s a1, fa4, fa0 -; RV32ID-NEXT: neg s0, a1 -; RV32ID-NEXT: neg s1, a0 +; RV32ID-NEXT: fmv.w.x fa5, a1 +; RV32ID-NEXT: flt.s a0, fa5, fa0 +; RV32ID-NEXT: fmv.w.x fa5, zero +; RV32ID-NEXT: fle.s a1, fa5, fa0 +; RV32ID-NEXT: neg s0, a0 +; RV32ID-NEXT: neg s1, a1 ; RV32ID-NEXT: call __fixunssfdi ; RV32ID-NEXT: and a0, s1, a0 ; RV32ID-NEXT: and a1, s1, a1 diff --git a/llvm/test/CodeGen/RISCV/bfloat-imm.ll b/llvm/test/CodeGen/RISCV/bfloat-imm.ll index 76ff720b1c268..61014891414d8 100644 --- a/llvm/test/CodeGen/RISCV/bfloat-imm.ll +++ b/llvm/test/CodeGen/RISCV/bfloat-imm.ll @@ -7,8 +7,9 @@ define bfloat @bfloat_imm() nounwind { ; CHECK-LABEL: bfloat_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa0, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: lui a0, 4 +; CHECK-NEXT: addi a0, a0, 64 +; CHECK-NEXT: fmv.h.x fa0, a0 ; CHECK-NEXT: ret ret bfloat 3.0 } diff --git a/llvm/test/CodeGen/RISCV/calling-conv-half.ll b/llvm/test/CodeGen/RISCV/calling-conv-half.ll index d7957540d1b29..d8e6b7f3ede9a 100644 --- a/llvm/test/CodeGen/RISCV/calling-conv-half.ll +++ 
b/llvm/test/CodeGen/RISCV/calling-conv-half.ll @@ -519,15 +519,16 @@ define i32 @caller_half_on_stack() nounwind { ; RV32-ILP32F: # %bb.0: ; RV32-ILP32F-NEXT: addi sp, sp, -16 ; RV32-ILP32F-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-ILP32F-NEXT: lui a4, %hi(.LCPI3_0) +; RV32-ILP32F-NEXT: lui a7, 1048565 ; RV32-ILP32F-NEXT: li a0, 1 ; RV32-ILP32F-NEXT: li a1, 2 ; RV32-ILP32F-NEXT: li a2, 3 ; RV32-ILP32F-NEXT: li a3, 4 -; RV32-ILP32F-NEXT: flw fa0, %lo(.LCPI3_0)(a4) ; RV32-ILP32F-NEXT: li a4, 5 ; RV32-ILP32F-NEXT: li a5, 6 ; RV32-ILP32F-NEXT: li a6, 7 +; RV32-ILP32F-NEXT: addi a7, a7, -1792 +; RV32-ILP32F-NEXT: fmv.w.x fa0, a7 ; RV32-ILP32F-NEXT: li a7, 8 ; RV32-ILP32F-NEXT: call callee_half_on_stack ; RV32-ILP32F-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -538,15 +539,16 @@ define i32 @caller_half_on_stack() nounwind { ; RV64-LP64F: # %bb.0: ; RV64-LP64F-NEXT: addi sp, sp, -16 ; RV64-LP64F-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-LP64F-NEXT: lui a4, %hi(.LCPI3_0) +; RV64-LP64F-NEXT: lui a7, 1048565 ; RV64-LP64F-NEXT: li a0, 1 ; RV64-LP64F-NEXT: li a1, 2 ; RV64-LP64F-NEXT: li a2, 3 ; RV64-LP64F-NEXT: li a3, 4 -; RV64-LP64F-NEXT: flw fa0, %lo(.LCPI3_0)(a4) ; RV64-LP64F-NEXT: li a4, 5 ; RV64-LP64F-NEXT: li a5, 6 ; RV64-LP64F-NEXT: li a6, 7 +; RV64-LP64F-NEXT: addi a7, a7, -1792 +; RV64-LP64F-NEXT: fmv.w.x fa0, a7 ; RV64-LP64F-NEXT: li a7, 8 ; RV64-LP64F-NEXT: call callee_half_on_stack ; RV64-LP64F-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -557,15 +559,16 @@ define i32 @caller_half_on_stack() nounwind { ; RV32-ILP32ZFHMIN: # %bb.0: ; RV32-ILP32ZFHMIN-NEXT: addi sp, sp, -16 ; RV32-ILP32ZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-ILP32ZFHMIN-NEXT: lui a4, %hi(.LCPI3_0) +; RV32-ILP32ZFHMIN-NEXT: lui a7, 5 ; RV32-ILP32ZFHMIN-NEXT: li a0, 1 ; RV32-ILP32ZFHMIN-NEXT: li a1, 2 ; RV32-ILP32ZFHMIN-NEXT: li a2, 3 ; RV32-ILP32ZFHMIN-NEXT: li a3, 4 -; RV32-ILP32ZFHMIN-NEXT: flh fa0, %lo(.LCPI3_0)(a4) ; RV32-ILP32ZFHMIN-NEXT: li a4, 5 ; RV32-ILP32ZFHMIN-NEXT: li a5, 6 ; RV32-ILP32ZFHMIN-NEXT: li a6, 7 +; RV32-ILP32ZFHMIN-NEXT: addi a7, a7, -1792 +; RV32-ILP32ZFHMIN-NEXT: fmv.h.x fa0, a7 ; RV32-ILP32ZFHMIN-NEXT: li a7, 8 ; RV32-ILP32ZFHMIN-NEXT: call callee_half_on_stack ; RV32-ILP32ZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -576,15 +579,16 @@ define i32 @caller_half_on_stack() nounwind { ; RV64-LP64ZFHMIN: # %bb.0: ; RV64-LP64ZFHMIN-NEXT: addi sp, sp, -16 ; RV64-LP64ZFHMIN-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-LP64ZFHMIN-NEXT: lui a4, %hi(.LCPI3_0) +; RV64-LP64ZFHMIN-NEXT: lui a7, 5 ; RV64-LP64ZFHMIN-NEXT: li a0, 1 ; RV64-LP64ZFHMIN-NEXT: li a1, 2 ; RV64-LP64ZFHMIN-NEXT: li a2, 3 ; RV64-LP64ZFHMIN-NEXT: li a3, 4 -; RV64-LP64ZFHMIN-NEXT: flh fa0, %lo(.LCPI3_0)(a4) ; RV64-LP64ZFHMIN-NEXT: li a4, 5 ; RV64-LP64ZFHMIN-NEXT: li a5, 6 ; RV64-LP64ZFHMIN-NEXT: li a6, 7 +; RV64-LP64ZFHMIN-NEXT: addi a7, a7, -1792 +; RV64-LP64ZFHMIN-NEXT: fmv.h.x fa0, a7 ; RV64-LP64ZFHMIN-NEXT: li a7, 8 ; RV64-LP64ZFHMIN-NEXT: call callee_half_on_stack ; RV64-LP64ZFHMIN-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -595,15 +599,16 @@ define i32 @caller_half_on_stack() nounwind { ; RV32-ZFH-ILP32: # %bb.0: ; RV32-ZFH-ILP32-NEXT: addi sp, sp, -16 ; RV32-ZFH-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-ZFH-ILP32-NEXT: lui a4, %hi(.LCPI3_0) +; RV32-ZFH-ILP32-NEXT: lui a7, 5 ; RV32-ZFH-ILP32-NEXT: li a0, 1 ; RV32-ZFH-ILP32-NEXT: li a1, 2 ; RV32-ZFH-ILP32-NEXT: li a2, 3 ; RV32-ZFH-ILP32-NEXT: li a3, 4 -; RV32-ZFH-ILP32-NEXT: flh fa5, %lo(.LCPI3_0)(a4) ; RV32-ZFH-ILP32-NEXT: li 
a4, 5 ; RV32-ZFH-ILP32-NEXT: li a5, 6 ; RV32-ZFH-ILP32-NEXT: li a6, 7 +; RV32-ZFH-ILP32-NEXT: addi a7, a7, -1792 +; RV32-ZFH-ILP32-NEXT: fmv.h.x fa5, a7 ; RV32-ZFH-ILP32-NEXT: li a7, 8 ; RV32-ZFH-ILP32-NEXT: fsh fa5, 0(sp) ; RV32-ZFH-ILP32-NEXT: call callee_half_on_stack @@ -615,15 +620,16 @@ define i32 @caller_half_on_stack() nounwind { ; RV32-ZFH-ILP32F: # %bb.0: ; RV32-ZFH-ILP32F-NEXT: addi sp, sp, -16 ; RV32-ZFH-ILP32F-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-ZFH-ILP32F-NEXT: lui a4, %hi(.LCPI3_0) +; RV32-ZFH-ILP32F-NEXT: lui a7, 5 ; RV32-ZFH-ILP32F-NEXT: li a0, 1 ; RV32-ZFH-ILP32F-NEXT: li a1, 2 ; RV32-ZFH-ILP32F-NEXT: li a2, 3 ; RV32-ZFH-ILP32F-NEXT: li a3, 4 -; RV32-ZFH-ILP32F-NEXT: flh fa0, %lo(.LCPI3_0)(a4) ; RV32-ZFH-ILP32F-NEXT: li a4, 5 ; RV32-ZFH-ILP32F-NEXT: li a5, 6 ; RV32-ZFH-ILP32F-NEXT: li a6, 7 +; RV32-ZFH-ILP32F-NEXT: addi a7, a7, -1792 +; RV32-ZFH-ILP32F-NEXT: fmv.h.x fa0, a7 ; RV32-ZFH-ILP32F-NEXT: li a7, 8 ; RV32-ZFH-ILP32F-NEXT: call callee_half_on_stack ; RV32-ZFH-ILP32F-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -634,15 +640,16 @@ define i32 @caller_half_on_stack() nounwind { ; RV64-ZFH-LP64: # %bb.0: ; RV64-ZFH-LP64-NEXT: addi sp, sp, -16 ; RV64-ZFH-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-ZFH-LP64-NEXT: lui a4, %hi(.LCPI3_0) +; RV64-ZFH-LP64-NEXT: lui a7, 5 ; RV64-ZFH-LP64-NEXT: li a0, 1 ; RV64-ZFH-LP64-NEXT: li a1, 2 ; RV64-ZFH-LP64-NEXT: li a2, 3 ; RV64-ZFH-LP64-NEXT: li a3, 4 -; RV64-ZFH-LP64-NEXT: flh fa5, %lo(.LCPI3_0)(a4) ; RV64-ZFH-LP64-NEXT: li a4, 5 ; RV64-ZFH-LP64-NEXT: li a5, 6 ; RV64-ZFH-LP64-NEXT: li a6, 7 +; RV64-ZFH-LP64-NEXT: addi a7, a7, -1792 +; RV64-ZFH-LP64-NEXT: fmv.h.x fa5, a7 ; RV64-ZFH-LP64-NEXT: li a7, 8 ; RV64-ZFH-LP64-NEXT: fsh fa5, 0(sp) ; RV64-ZFH-LP64-NEXT: call callee_half_on_stack @@ -654,15 +661,16 @@ define i32 @caller_half_on_stack() nounwind { ; RV64-ZFH-LP64F: # %bb.0: ; RV64-ZFH-LP64F-NEXT: addi sp, sp, -16 ; RV64-ZFH-LP64F-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-ZFH-LP64F-NEXT: lui a4, %hi(.LCPI3_0) +; RV64-ZFH-LP64F-NEXT: lui a7, 5 ; RV64-ZFH-LP64F-NEXT: li a0, 1 ; RV64-ZFH-LP64F-NEXT: li a1, 2 ; RV64-ZFH-LP64F-NEXT: li a2, 3 ; RV64-ZFH-LP64F-NEXT: li a3, 4 -; RV64-ZFH-LP64F-NEXT: flh fa0, %lo(.LCPI3_0)(a4) ; RV64-ZFH-LP64F-NEXT: li a4, 5 ; RV64-ZFH-LP64F-NEXT: li a5, 6 ; RV64-ZFH-LP64F-NEXT: li a6, 7 +; RV64-ZFH-LP64F-NEXT: addi a7, a7, -1792 +; RV64-ZFH-LP64F-NEXT: fmv.h.x fa0, a7 ; RV64-ZFH-LP64F-NEXT: li a7, 8 ; RV64-ZFH-LP64F-NEXT: call callee_half_on_stack ; RV64-ZFH-LP64F-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -1038,31 +1046,32 @@ define i32 @caller_half_on_stack_exhausted_gprs_fprs() nounwind { ; RV32-ILP32ZFHMIN: # %bb.0: ; RV32-ILP32ZFHMIN-NEXT: addi sp, sp, -16 ; RV32-ILP32ZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-ILP32ZFHMIN-NEXT: lui a0, %hi(.LCPI5_0) -; RV32-ILP32ZFHMIN-NEXT: lui a1, 260096 -; RV32-ILP32ZFHMIN-NEXT: lui a2, 262144 -; RV32-ILP32ZFHMIN-NEXT: lui a3, 263168 -; RV32-ILP32ZFHMIN-NEXT: lui a4, 264192 -; RV32-ILP32ZFHMIN-NEXT: lui a5, 264704 -; RV32-ILP32ZFHMIN-NEXT: lui a6, 265216 -; RV32-ILP32ZFHMIN-NEXT: lui a7, 265728 -; RV32-ILP32ZFHMIN-NEXT: flh ft0, %lo(.LCPI5_0)(a0) +; RV32-ILP32ZFHMIN-NEXT: lui a7, 5 +; RV32-ILP32ZFHMIN-NEXT: lui a0, 260096 +; RV32-ILP32ZFHMIN-NEXT: lui a1, 262144 +; RV32-ILP32ZFHMIN-NEXT: lui a2, 263168 +; RV32-ILP32ZFHMIN-NEXT: lui a3, 264192 +; RV32-ILP32ZFHMIN-NEXT: lui a4, 264704 +; RV32-ILP32ZFHMIN-NEXT: lui a5, 265216 +; RV32-ILP32ZFHMIN-NEXT: lui a6, 265728 +; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa0, a0 ; 
RV32-ILP32ZFHMIN-NEXT: lui t0, 266240 -; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa0, a1 +; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa1, a1 ; RV32-ILP32ZFHMIN-NEXT: li a0, 1 -; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa1, a2 +; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa2, a2 ; RV32-ILP32ZFHMIN-NEXT: li a1, 2 -; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa2, a3 +; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa3, a3 ; RV32-ILP32ZFHMIN-NEXT: li a2, 3 -; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa3, a4 +; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa4, a4 ; RV32-ILP32ZFHMIN-NEXT: li a3, 4 -; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa4, a5 -; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa5, a6 -; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa6, a7 -; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa7, t0 +; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa5, a5 ; RV32-ILP32ZFHMIN-NEXT: li a4, 5 +; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa6, a6 ; RV32-ILP32ZFHMIN-NEXT: li a5, 6 +; RV32-ILP32ZFHMIN-NEXT: fmv.w.x fa7, t0 ; RV32-ILP32ZFHMIN-NEXT: li a6, 7 +; RV32-ILP32ZFHMIN-NEXT: addi a7, a7, -1792 +; RV32-ILP32ZFHMIN-NEXT: fmv.h.x ft0, a7 ; RV32-ILP32ZFHMIN-NEXT: li a7, 8 ; RV32-ILP32ZFHMIN-NEXT: fsh ft0, 0(sp) ; RV32-ILP32ZFHMIN-NEXT: call callee_half_on_stack @@ -1074,31 +1083,32 @@ define i32 @caller_half_on_stack_exhausted_gprs_fprs() nounwind { ; RV64-LP64ZFHMIN: # %bb.0: ; RV64-LP64ZFHMIN-NEXT: addi sp, sp, -16 ; RV64-LP64ZFHMIN-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-LP64ZFHMIN-NEXT: lui a0, %hi(.LCPI5_0) -; RV64-LP64ZFHMIN-NEXT: lui a1, 260096 -; RV64-LP64ZFHMIN-NEXT: lui a2, 262144 -; RV64-LP64ZFHMIN-NEXT: lui a3, 263168 -; RV64-LP64ZFHMIN-NEXT: lui a4, 264192 -; RV64-LP64ZFHMIN-NEXT: lui a5, 264704 -; RV64-LP64ZFHMIN-NEXT: lui a6, 265216 -; RV64-LP64ZFHMIN-NEXT: lui a7, 265728 -; RV64-LP64ZFHMIN-NEXT: flh ft0, %lo(.LCPI5_0)(a0) +; RV64-LP64ZFHMIN-NEXT: lui a7, 5 +; RV64-LP64ZFHMIN-NEXT: lui a0, 260096 +; RV64-LP64ZFHMIN-NEXT: lui a1, 262144 +; RV64-LP64ZFHMIN-NEXT: lui a2, 263168 +; RV64-LP64ZFHMIN-NEXT: lui a3, 264192 +; RV64-LP64ZFHMIN-NEXT: lui a4, 264704 +; RV64-LP64ZFHMIN-NEXT: lui a5, 265216 +; RV64-LP64ZFHMIN-NEXT: lui a6, 265728 +; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa0, a0 ; RV64-LP64ZFHMIN-NEXT: lui t0, 266240 -; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa0, a1 +; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa1, a1 ; RV64-LP64ZFHMIN-NEXT: li a0, 1 -; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa1, a2 +; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa2, a2 ; RV64-LP64ZFHMIN-NEXT: li a1, 2 -; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa2, a3 +; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa3, a3 ; RV64-LP64ZFHMIN-NEXT: li a2, 3 -; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa3, a4 +; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa4, a4 ; RV64-LP64ZFHMIN-NEXT: li a3, 4 -; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa4, a5 -; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa5, a6 -; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa6, a7 -; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa7, t0 +; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa5, a5 ; RV64-LP64ZFHMIN-NEXT: li a4, 5 +; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa6, a6 ; RV64-LP64ZFHMIN-NEXT: li a5, 6 +; RV64-LP64ZFHMIN-NEXT: fmv.w.x fa7, t0 ; RV64-LP64ZFHMIN-NEXT: li a6, 7 +; RV64-LP64ZFHMIN-NEXT: addi a7, a7, -1792 +; RV64-LP64ZFHMIN-NEXT: fmv.h.x ft0, a7 ; RV64-LP64ZFHMIN-NEXT: li a7, 8 ; RV64-LP64ZFHMIN-NEXT: fsh ft0, 0(sp) ; RV64-LP64ZFHMIN-NEXT: call callee_half_on_stack @@ -1110,31 +1120,32 @@ define i32 @caller_half_on_stack_exhausted_gprs_fprs() nounwind { ; RV32-ZFH-ILP32: # %bb.0: ; RV32-ZFH-ILP32-NEXT: addi sp, sp, -48 ; RV32-ZFH-ILP32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; RV32-ZFH-ILP32-NEXT: lui a2, %hi(.LCPI5_0) -; RV32-ZFH-ILP32-NEXT: lui a3, 266240 -; RV32-ZFH-ILP32-NEXT: li a4, 8 -; RV32-ZFH-ILP32-NEXT: lui a5, 265728 -; 
RV32-ZFH-ILP32-NEXT: li a6, 7 -; RV32-ZFH-ILP32-NEXT: lui a7, 265216 -; RV32-ZFH-ILP32-NEXT: li t0, 6 -; RV32-ZFH-ILP32-NEXT: lui t1, 264704 -; RV32-ZFH-ILP32-NEXT: li t2, 5 +; RV32-ZFH-ILP32-NEXT: lui a5, 266240 +; RV32-ZFH-ILP32-NEXT: li a6, 8 +; RV32-ZFH-ILP32-NEXT: lui a7, 265728 +; RV32-ZFH-ILP32-NEXT: li t0, 7 +; RV32-ZFH-ILP32-NEXT: lui t1, 265216 +; RV32-ZFH-ILP32-NEXT: li t2, 6 +; RV32-ZFH-ILP32-NEXT: lui t3, 264704 +; RV32-ZFH-ILP32-NEXT: li t4, 5 +; RV32-ZFH-ILP32-NEXT: lui t5, 5 ; RV32-ZFH-ILP32-NEXT: li a0, 1 ; RV32-ZFH-ILP32-NEXT: lui a1, 260096 -; RV32-ZFH-ILP32-NEXT: flh fa5, %lo(.LCPI5_0)(a2) ; RV32-ZFH-ILP32-NEXT: li a2, 2 -; RV32-ZFH-ILP32-NEXT: sw a6, 16(sp) -; RV32-ZFH-ILP32-NEXT: sw a5, 20(sp) -; RV32-ZFH-ILP32-NEXT: sw a4, 24(sp) -; RV32-ZFH-ILP32-NEXT: sw a3, 28(sp) ; RV32-ZFH-ILP32-NEXT: lui a3, 262144 -; RV32-ZFH-ILP32-NEXT: sw t2, 0(sp) -; RV32-ZFH-ILP32-NEXT: sw t1, 4(sp) -; RV32-ZFH-ILP32-NEXT: sw t0, 8(sp) -; RV32-ZFH-ILP32-NEXT: sw a7, 12(sp) ; RV32-ZFH-ILP32-NEXT: li a4, 3 +; RV32-ZFH-ILP32-NEXT: sw t0, 16(sp) +; RV32-ZFH-ILP32-NEXT: sw a7, 20(sp) +; RV32-ZFH-ILP32-NEXT: sw a6, 24(sp) +; RV32-ZFH-ILP32-NEXT: sw a5, 28(sp) ; RV32-ZFH-ILP32-NEXT: lui a5, 263168 +; RV32-ZFH-ILP32-NEXT: sw t4, 0(sp) +; RV32-ZFH-ILP32-NEXT: sw t3, 4(sp) +; RV32-ZFH-ILP32-NEXT: sw t2, 8(sp) +; RV32-ZFH-ILP32-NEXT: sw t1, 12(sp) ; RV32-ZFH-ILP32-NEXT: li a6, 4 +; RV32-ZFH-ILP32-NEXT: addi a7, t5, -1792 +; RV32-ZFH-ILP32-NEXT: fmv.h.x fa5, a7 ; RV32-ZFH-ILP32-NEXT: lui a7, 264192 ; RV32-ZFH-ILP32-NEXT: fsh fa5, 32(sp) ; RV32-ZFH-ILP32-NEXT: call callee_half_on_stack @@ -1146,31 +1157,32 @@ define i32 @caller_half_on_stack_exhausted_gprs_fprs() nounwind { ; RV32-ZFH-ILP32F: # %bb.0: ; RV32-ZFH-ILP32F-NEXT: addi sp, sp, -16 ; RV32-ZFH-ILP32F-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-ZFH-ILP32F-NEXT: lui a0, %hi(.LCPI5_0) -; RV32-ZFH-ILP32F-NEXT: lui a1, 260096 -; RV32-ZFH-ILP32F-NEXT: lui a2, 262144 -; RV32-ZFH-ILP32F-NEXT: lui a3, 263168 -; RV32-ZFH-ILP32F-NEXT: lui a4, 264192 -; RV32-ZFH-ILP32F-NEXT: lui a5, 264704 -; RV32-ZFH-ILP32F-NEXT: lui a6, 265216 -; RV32-ZFH-ILP32F-NEXT: lui a7, 265728 -; RV32-ZFH-ILP32F-NEXT: flh ft0, %lo(.LCPI5_0)(a0) +; RV32-ZFH-ILP32F-NEXT: lui a7, 5 +; RV32-ZFH-ILP32F-NEXT: lui a0, 260096 +; RV32-ZFH-ILP32F-NEXT: lui a1, 262144 +; RV32-ZFH-ILP32F-NEXT: lui a2, 263168 +; RV32-ZFH-ILP32F-NEXT: lui a3, 264192 +; RV32-ZFH-ILP32F-NEXT: lui a4, 264704 +; RV32-ZFH-ILP32F-NEXT: lui a5, 265216 +; RV32-ZFH-ILP32F-NEXT: lui a6, 265728 +; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa0, a0 ; RV32-ZFH-ILP32F-NEXT: lui t0, 266240 -; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa0, a1 +; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa1, a1 ; RV32-ZFH-ILP32F-NEXT: li a0, 1 -; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa1, a2 +; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa2, a2 ; RV32-ZFH-ILP32F-NEXT: li a1, 2 -; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa2, a3 +; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa3, a3 ; RV32-ZFH-ILP32F-NEXT: li a2, 3 -; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa3, a4 +; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa4, a4 ; RV32-ZFH-ILP32F-NEXT: li a3, 4 -; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa4, a5 -; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa5, a6 -; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa6, a7 -; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa7, t0 +; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa5, a5 ; RV32-ZFH-ILP32F-NEXT: li a4, 5 +; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa6, a6 ; RV32-ZFH-ILP32F-NEXT: li a5, 6 +; RV32-ZFH-ILP32F-NEXT: fmv.w.x fa7, t0 ; RV32-ZFH-ILP32F-NEXT: li a6, 7 +; RV32-ZFH-ILP32F-NEXT: addi a7, a7, -1792 +; RV32-ZFH-ILP32F-NEXT: fmv.h.x ft0, a7 ; RV32-ZFH-ILP32F-NEXT: li a7, 
8 ; RV32-ZFH-ILP32F-NEXT: fsh ft0, 0(sp) ; RV32-ZFH-ILP32F-NEXT: call callee_half_on_stack @@ -1182,31 +1194,32 @@ define i32 @caller_half_on_stack_exhausted_gprs_fprs() nounwind { ; RV64-ZFH-LP64: # %bb.0: ; RV64-ZFH-LP64-NEXT: addi sp, sp, -80 ; RV64-ZFH-LP64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill -; RV64-ZFH-LP64-NEXT: lui a2, %hi(.LCPI5_0) -; RV64-ZFH-LP64-NEXT: lui a3, 266240 -; RV64-ZFH-LP64-NEXT: li a4, 8 -; RV64-ZFH-LP64-NEXT: lui a5, 265728 -; RV64-ZFH-LP64-NEXT: li a6, 7 -; RV64-ZFH-LP64-NEXT: lui a7, 265216 -; RV64-ZFH-LP64-NEXT: li t0, 6 -; RV64-ZFH-LP64-NEXT: lui t1, 264704 -; RV64-ZFH-LP64-NEXT: li t2, 5 +; RV64-ZFH-LP64-NEXT: lui a5, 266240 +; RV64-ZFH-LP64-NEXT: li a6, 8 +; RV64-ZFH-LP64-NEXT: lui a7, 265728 +; RV64-ZFH-LP64-NEXT: li t0, 7 +; RV64-ZFH-LP64-NEXT: lui t1, 265216 +; RV64-ZFH-LP64-NEXT: li t2, 6 +; RV64-ZFH-LP64-NEXT: lui t3, 264704 +; RV64-ZFH-LP64-NEXT: li t4, 5 +; RV64-ZFH-LP64-NEXT: lui t5, 5 ; RV64-ZFH-LP64-NEXT: li a0, 1 ; RV64-ZFH-LP64-NEXT: lui a1, 260096 -; RV64-ZFH-LP64-NEXT: flh fa5, %lo(.LCPI5_0)(a2) ; RV64-ZFH-LP64-NEXT: li a2, 2 -; RV64-ZFH-LP64-NEXT: sd a6, 32(sp) -; RV64-ZFH-LP64-NEXT: sw a5, 40(sp) -; RV64-ZFH-LP64-NEXT: sd a4, 48(sp) -; RV64-ZFH-LP64-NEXT: sw a3, 56(sp) ; RV64-ZFH-LP64-NEXT: lui a3, 262144 -; RV64-ZFH-LP64-NEXT: sd t2, 0(sp) -; RV64-ZFH-LP64-NEXT: sw t1, 8(sp) -; RV64-ZFH-LP64-NEXT: sd t0, 16(sp) -; RV64-ZFH-LP64-NEXT: sw a7, 24(sp) ; RV64-ZFH-LP64-NEXT: li a4, 3 +; RV64-ZFH-LP64-NEXT: sd t0, 32(sp) +; RV64-ZFH-LP64-NEXT: sw a7, 40(sp) +; RV64-ZFH-LP64-NEXT: sd a6, 48(sp) +; RV64-ZFH-LP64-NEXT: sw a5, 56(sp) ; RV64-ZFH-LP64-NEXT: lui a5, 263168 +; RV64-ZFH-LP64-NEXT: sd t4, 0(sp) +; RV64-ZFH-LP64-NEXT: sw t3, 8(sp) +; RV64-ZFH-LP64-NEXT: sd t2, 16(sp) +; RV64-ZFH-LP64-NEXT: sw t1, 24(sp) ; RV64-ZFH-LP64-NEXT: li a6, 4 +; RV64-ZFH-LP64-NEXT: addi a7, t5, -1792 +; RV64-ZFH-LP64-NEXT: fmv.h.x fa5, a7 ; RV64-ZFH-LP64-NEXT: lui a7, 264192 ; RV64-ZFH-LP64-NEXT: fsh fa5, 64(sp) ; RV64-ZFH-LP64-NEXT: call callee_half_on_stack @@ -1218,31 +1231,32 @@ define i32 @caller_half_on_stack_exhausted_gprs_fprs() nounwind { ; RV64-ZFH-LP64F: # %bb.0: ; RV64-ZFH-LP64F-NEXT: addi sp, sp, -16 ; RV64-ZFH-LP64F-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-ZFH-LP64F-NEXT: lui a0, %hi(.LCPI5_0) -; RV64-ZFH-LP64F-NEXT: lui a1, 260096 -; RV64-ZFH-LP64F-NEXT: lui a2, 262144 -; RV64-ZFH-LP64F-NEXT: lui a3, 263168 -; RV64-ZFH-LP64F-NEXT: lui a4, 264192 -; RV64-ZFH-LP64F-NEXT: lui a5, 264704 -; RV64-ZFH-LP64F-NEXT: lui a6, 265216 -; RV64-ZFH-LP64F-NEXT: lui a7, 265728 -; RV64-ZFH-LP64F-NEXT: flh ft0, %lo(.LCPI5_0)(a0) +; RV64-ZFH-LP64F-NEXT: lui a7, 5 +; RV64-ZFH-LP64F-NEXT: lui a0, 260096 +; RV64-ZFH-LP64F-NEXT: lui a1, 262144 +; RV64-ZFH-LP64F-NEXT: lui a2, 263168 +; RV64-ZFH-LP64F-NEXT: lui a3, 264192 +; RV64-ZFH-LP64F-NEXT: lui a4, 264704 +; RV64-ZFH-LP64F-NEXT: lui a5, 265216 +; RV64-ZFH-LP64F-NEXT: lui a6, 265728 +; RV64-ZFH-LP64F-NEXT: fmv.w.x fa0, a0 ; RV64-ZFH-LP64F-NEXT: lui t0, 266240 -; RV64-ZFH-LP64F-NEXT: fmv.w.x fa0, a1 +; RV64-ZFH-LP64F-NEXT: fmv.w.x fa1, a1 ; RV64-ZFH-LP64F-NEXT: li a0, 1 -; RV64-ZFH-LP64F-NEXT: fmv.w.x fa1, a2 +; RV64-ZFH-LP64F-NEXT: fmv.w.x fa2, a2 ; RV64-ZFH-LP64F-NEXT: li a1, 2 -; RV64-ZFH-LP64F-NEXT: fmv.w.x fa2, a3 +; RV64-ZFH-LP64F-NEXT: fmv.w.x fa3, a3 ; RV64-ZFH-LP64F-NEXT: li a2, 3 -; RV64-ZFH-LP64F-NEXT: fmv.w.x fa3, a4 +; RV64-ZFH-LP64F-NEXT: fmv.w.x fa4, a4 ; RV64-ZFH-LP64F-NEXT: li a3, 4 -; RV64-ZFH-LP64F-NEXT: fmv.w.x fa4, a5 -; RV64-ZFH-LP64F-NEXT: fmv.w.x fa5, a6 -; RV64-ZFH-LP64F-NEXT: fmv.w.x fa6, 
a7 -; RV64-ZFH-LP64F-NEXT: fmv.w.x fa7, t0 +; RV64-ZFH-LP64F-NEXT: fmv.w.x fa5, a5 ; RV64-ZFH-LP64F-NEXT: li a4, 5 +; RV64-ZFH-LP64F-NEXT: fmv.w.x fa6, a6 ; RV64-ZFH-LP64F-NEXT: li a5, 6 +; RV64-ZFH-LP64F-NEXT: fmv.w.x fa7, t0 ; RV64-ZFH-LP64F-NEXT: li a6, 7 +; RV64-ZFH-LP64F-NEXT: addi a7, a7, -1792 +; RV64-ZFH-LP64F-NEXT: fmv.h.x ft0, a7 ; RV64-ZFH-LP64F-NEXT: li a7, 8 ; RV64-ZFH-LP64F-NEXT: fsh ft0, 0(sp) ; RV64-ZFH-LP64F-NEXT: call callee_half_on_stack @@ -1280,26 +1294,30 @@ define half @callee_half_ret() nounwind { ; ; RV32-ILP32F-LABEL: callee_half_ret: ; RV32-ILP32F: # %bb.0: -; RV32-ILP32F-NEXT: lui a0, %hi(.LCPI6_0) -; RV32-ILP32F-NEXT: flw fa0, %lo(.LCPI6_0)(a0) +; RV32-ILP32F-NEXT: lui a0, 1048564 +; RV32-ILP32F-NEXT: addi a0, a0, -1024 +; RV32-ILP32F-NEXT: fmv.w.x fa0, a0 ; RV32-ILP32F-NEXT: ret ; ; RV64-LP64F-LABEL: callee_half_ret: ; RV64-LP64F: # %bb.0: -; RV64-LP64F-NEXT: lui a0, %hi(.LCPI6_0) -; RV64-LP64F-NEXT: flw fa0, %lo(.LCPI6_0)(a0) +; RV64-LP64F-NEXT: lui a0, 1048564 +; RV64-LP64F-NEXT: addi a0, a0, -1024 +; RV64-LP64F-NEXT: fmv.w.x fa0, a0 ; RV64-LP64F-NEXT: ret ; ; RV32-ILP32ZFHMIN-LABEL: callee_half_ret: ; RV32-ILP32ZFHMIN: # %bb.0: -; RV32-ILP32ZFHMIN-NEXT: lui a0, %hi(.LCPI6_0) -; RV32-ILP32ZFHMIN-NEXT: flh fa0, %lo(.LCPI6_0)(a0) +; RV32-ILP32ZFHMIN-NEXT: li a0, 15 +; RV32-ILP32ZFHMIN-NEXT: slli a0, a0, 10 +; RV32-ILP32ZFHMIN-NEXT: fmv.h.x fa0, a0 ; RV32-ILP32ZFHMIN-NEXT: ret ; ; RV64-LP64ZFHMIN-LABEL: callee_half_ret: ; RV64-LP64ZFHMIN: # %bb.0: -; RV64-LP64ZFHMIN-NEXT: lui a0, %hi(.LCPI6_0) -; RV64-LP64ZFHMIN-NEXT: flh fa0, %lo(.LCPI6_0)(a0) +; RV64-LP64ZFHMIN-NEXT: li a0, 15 +; RV64-LP64ZFHMIN-NEXT: slli a0, a0, 10 +; RV64-LP64ZFHMIN-NEXT: fmv.h.x fa0, a0 ; RV64-LP64ZFHMIN-NEXT: ret ; ; RV32-ZFH-ILP32-LABEL: callee_half_ret: @@ -1310,8 +1328,9 @@ define half @callee_half_ret() nounwind { ; ; RV32-ZFH-ILP32F-LABEL: callee_half_ret: ; RV32-ZFH-ILP32F: # %bb.0: -; RV32-ZFH-ILP32F-NEXT: lui a0, %hi(.LCPI6_0) -; RV32-ZFH-ILP32F-NEXT: flh fa0, %lo(.LCPI6_0)(a0) +; RV32-ZFH-ILP32F-NEXT: li a0, 15 +; RV32-ZFH-ILP32F-NEXT: slli a0, a0, 10 +; RV32-ZFH-ILP32F-NEXT: fmv.h.x fa0, a0 ; RV32-ZFH-ILP32F-NEXT: ret ; ; RV64-ZFH-LP64-LABEL: callee_half_ret: @@ -1322,8 +1341,9 @@ define half @callee_half_ret() nounwind { ; ; RV64-ZFH-LP64F-LABEL: callee_half_ret: ; RV64-ZFH-LP64F: # %bb.0: -; RV64-ZFH-LP64F-NEXT: lui a0, %hi(.LCPI6_0) -; RV64-ZFH-LP64F-NEXT: flh fa0, %lo(.LCPI6_0)(a0) +; RV64-ZFH-LP64F-NEXT: li a0, 15 +; RV64-ZFH-LP64F-NEXT: slli a0, a0, 10 +; RV64-ZFH-LP64F-NEXT: fmv.h.x fa0, a0 ; RV64-ZFH-LP64F-NEXT: ret ret half 1.0 } diff --git a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll index 6608874286e34..f8b1d505f4e81 100644 --- a/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll +++ b/llvm/test/CodeGen/RISCV/cmov-branch-opt.ll @@ -6,9 +6,9 @@ ; RUN: llc -mtriple=riscv64 -mattr=+conditional-cmv-fusion,+c,+zicond -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=CMOV,CMOV-ZICOND %s ; RUN: llc -mtriple=riscv64 -mattr=+short-forward-branch-opt -verify-machineinstrs < %s \ -; RUN: | FileCheck -check-prefixes=SHORT_FORWARD,SFB-NOZICOND %s +; RUN: | FileCheck -check-prefixes=SHORT_FORWARD,SFB-NOZICOND,SFB-NOZICOND-NOC %s ; RUN: llc -mtriple=riscv64 -mattr=+short-forward-branch-opt,+c -verify-machineinstrs < %s \ -; RUN: | FileCheck -check-prefixes=SHORT_FORWARD,SFB-NOZICOND %s +; RUN: | FileCheck -check-prefixes=SHORT_FORWARD,SFB-NOZICOND,SFB-NOZICOND-C %s ; RUN: llc -mtriple=riscv64 
-mattr=+short-forward-branch-opt,+zicond -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=SHORT_FORWARD,SFB-ZICOND %s @@ -263,6 +263,24 @@ define i32 @select_xor_2(i32 %A, i32 %B, i8 %cond) { ; CMOV-NEXT: .LBB6_2: # %entry ; CMOV-NEXT: ret ; +; SFB-NOZICOND-NOC-LABEL: select_xor_2: +; SFB-NOZICOND-NOC: # %bb.0: # %entry +; SFB-NOZICOND-NOC-NEXT: andi a2, a2, 1 +; SFB-NOZICOND-NOC-NEXT: beqz a2, .LBB6_2 +; SFB-NOZICOND-NOC-NEXT: # %bb.1: # %entry +; SFB-NOZICOND-NOC-NEXT: xor a0, a1, a0 +; SFB-NOZICOND-NOC-NEXT: .LBB6_2: # %entry +; SFB-NOZICOND-NOC-NEXT: ret +; +; SFB-NOZICOND-C-LABEL: select_xor_2: +; SFB-NOZICOND-C: # %bb.0: # %entry +; SFB-NOZICOND-C-NEXT: andi a2, a2, 1 +; SFB-NOZICOND-C-NEXT: beqz a2, .LBB6_2 +; SFB-NOZICOND-C-NEXT: # %bb.1: # %entry +; SFB-NOZICOND-C-NEXT: xor a0, a0, a1 +; SFB-NOZICOND-C-NEXT: .LBB6_2: # %entry +; SFB-NOZICOND-C-NEXT: ret +; ; SFB-ZICOND-LABEL: select_xor_2: ; SFB-ZICOND: # %bb.0: # %entry ; SFB-ZICOND-NEXT: andi a2, a2, 1 @@ -300,6 +318,24 @@ define i32 @select_xor_2b(i32 %A, i32 %B, i8 %cond) { ; CMOV-NEXT: .LBB7_2: # %entry ; CMOV-NEXT: ret ; +; SFB-NOZICOND-NOC-LABEL: select_xor_2b: +; SFB-NOZICOND-NOC: # %bb.0: # %entry +; SFB-NOZICOND-NOC-NEXT: andi a2, a2, 1 +; SFB-NOZICOND-NOC-NEXT: beqz a2, .LBB7_2 +; SFB-NOZICOND-NOC-NEXT: # %bb.1: # %entry +; SFB-NOZICOND-NOC-NEXT: xor a0, a1, a0 +; SFB-NOZICOND-NOC-NEXT: .LBB7_2: # %entry +; SFB-NOZICOND-NOC-NEXT: ret +; +; SFB-NOZICOND-C-LABEL: select_xor_2b: +; SFB-NOZICOND-C: # %bb.0: # %entry +; SFB-NOZICOND-C-NEXT: andi a2, a2, 1 +; SFB-NOZICOND-C-NEXT: beqz a2, .LBB7_2 +; SFB-NOZICOND-C-NEXT: # %bb.1: # %entry +; SFB-NOZICOND-C-NEXT: xor a0, a0, a1 +; SFB-NOZICOND-C-NEXT: .LBB7_2: # %entry +; SFB-NOZICOND-C-NEXT: ret +; ; SFB-ZICOND-LABEL: select_xor_2b: ; SFB-ZICOND: # %bb.0: # %entry ; SFB-ZICOND-NEXT: andi a2, a2, 1 @@ -335,6 +371,24 @@ define i32 @select_or(i32 %A, i32 %B, i8 %cond) { ; CMOV-NEXT: .LBB8_2: # %entry ; CMOV-NEXT: ret ; +; SFB-NOZICOND-NOC-LABEL: select_or: +; SFB-NOZICOND-NOC: # %bb.0: # %entry +; SFB-NOZICOND-NOC-NEXT: andi a2, a2, 1 +; SFB-NOZICOND-NOC-NEXT: beqz a2, .LBB8_2 +; SFB-NOZICOND-NOC-NEXT: # %bb.1: # %entry +; SFB-NOZICOND-NOC-NEXT: or a0, a1, a0 +; SFB-NOZICOND-NOC-NEXT: .LBB8_2: # %entry +; SFB-NOZICOND-NOC-NEXT: ret +; +; SFB-NOZICOND-C-LABEL: select_or: +; SFB-NOZICOND-C: # %bb.0: # %entry +; SFB-NOZICOND-C-NEXT: andi a2, a2, 1 +; SFB-NOZICOND-C-NEXT: beqz a2, .LBB8_2 +; SFB-NOZICOND-C-NEXT: # %bb.1: # %entry +; SFB-NOZICOND-C-NEXT: or a0, a0, a1 +; SFB-NOZICOND-C-NEXT: .LBB8_2: # %entry +; SFB-NOZICOND-C-NEXT: ret +; ; SFB-ZICOND-LABEL: select_or: ; SFB-ZICOND: # %bb.0: # %entry ; SFB-ZICOND-NEXT: andi a2, a2, 1 @@ -372,6 +426,24 @@ define i32 @select_or_b(i32 %A, i32 %B, i8 %cond) { ; CMOV-NEXT: .LBB9_2: # %entry ; CMOV-NEXT: ret ; +; SFB-NOZICOND-NOC-LABEL: select_or_b: +; SFB-NOZICOND-NOC: # %bb.0: # %entry +; SFB-NOZICOND-NOC-NEXT: andi a2, a2, 1 +; SFB-NOZICOND-NOC-NEXT: beqz a2, .LBB9_2 +; SFB-NOZICOND-NOC-NEXT: # %bb.1: # %entry +; SFB-NOZICOND-NOC-NEXT: or a0, a1, a0 +; SFB-NOZICOND-NOC-NEXT: .LBB9_2: # %entry +; SFB-NOZICOND-NOC-NEXT: ret +; +; SFB-NOZICOND-C-LABEL: select_or_b: +; SFB-NOZICOND-C: # %bb.0: # %entry +; SFB-NOZICOND-C-NEXT: andi a2, a2, 1 +; SFB-NOZICOND-C-NEXT: beqz a2, .LBB9_2 +; SFB-NOZICOND-C-NEXT: # %bb.1: # %entry +; SFB-NOZICOND-C-NEXT: or a0, a0, a1 +; SFB-NOZICOND-C-NEXT: .LBB9_2: # %entry +; SFB-NOZICOND-C-NEXT: ret +; ; SFB-ZICOND-LABEL: select_or_b: ; SFB-ZICOND: # %bb.0: # %entry ; SFB-ZICOND-NEXT: andi a2, a2, 
1 @@ -407,6 +479,24 @@ define i32 @select_or_1(i32 %A, i32 %B, i32 %cond) { ; CMOV-NEXT: .LBB10_2: # %entry ; CMOV-NEXT: ret ; +; SFB-NOZICOND-NOC-LABEL: select_or_1: +; SFB-NOZICOND-NOC: # %bb.0: # %entry +; SFB-NOZICOND-NOC-NEXT: andi a2, a2, 1 +; SFB-NOZICOND-NOC-NEXT: beqz a2, .LBB10_2 +; SFB-NOZICOND-NOC-NEXT: # %bb.1: # %entry +; SFB-NOZICOND-NOC-NEXT: or a0, a1, a0 +; SFB-NOZICOND-NOC-NEXT: .LBB10_2: # %entry +; SFB-NOZICOND-NOC-NEXT: ret +; +; SFB-NOZICOND-C-LABEL: select_or_1: +; SFB-NOZICOND-C: # %bb.0: # %entry +; SFB-NOZICOND-C-NEXT: andi a2, a2, 1 +; SFB-NOZICOND-C-NEXT: beqz a2, .LBB10_2 +; SFB-NOZICOND-C-NEXT: # %bb.1: # %entry +; SFB-NOZICOND-C-NEXT: or a0, a0, a1 +; SFB-NOZICOND-C-NEXT: .LBB10_2: # %entry +; SFB-NOZICOND-C-NEXT: ret +; ; SFB-ZICOND-LABEL: select_or_1: ; SFB-ZICOND: # %bb.0: # %entry ; SFB-ZICOND-NEXT: andi a2, a2, 1 @@ -444,6 +534,24 @@ define i32 @select_or_1b(i32 %A, i32 %B, i32 %cond) { ; CMOV-NEXT: .LBB11_2: # %entry ; CMOV-NEXT: ret ; +; SFB-NOZICOND-NOC-LABEL: select_or_1b: +; SFB-NOZICOND-NOC: # %bb.0: # %entry +; SFB-NOZICOND-NOC-NEXT: andi a2, a2, 1 +; SFB-NOZICOND-NOC-NEXT: beqz a2, .LBB11_2 +; SFB-NOZICOND-NOC-NEXT: # %bb.1: # %entry +; SFB-NOZICOND-NOC-NEXT: or a0, a1, a0 +; SFB-NOZICOND-NOC-NEXT: .LBB11_2: # %entry +; SFB-NOZICOND-NOC-NEXT: ret +; +; SFB-NOZICOND-C-LABEL: select_or_1b: +; SFB-NOZICOND-C: # %bb.0: # %entry +; SFB-NOZICOND-C-NEXT: andi a2, a2, 1 +; SFB-NOZICOND-C-NEXT: beqz a2, .LBB11_2 +; SFB-NOZICOND-C-NEXT: # %bb.1: # %entry +; SFB-NOZICOND-C-NEXT: or a0, a0, a1 +; SFB-NOZICOND-C-NEXT: .LBB11_2: # %entry +; SFB-NOZICOND-C-NEXT: ret +; ; SFB-ZICOND-LABEL: select_or_1b: ; SFB-ZICOND: # %bb.0: # %entry ; SFB-ZICOND-NEXT: andi a2, a2, 1 diff --git a/llvm/test/CodeGen/RISCV/codemodel-lowering.ll b/llvm/test/CodeGen/RISCV/codemodel-lowering.ll index 94f8d7cab9b95..220494a4c4ff8 100644 --- a/llvm/test/CodeGen/RISCV/codemodel-lowering.ll +++ b/llvm/test/CodeGen/RISCV/codemodel-lowering.ll @@ -287,8 +287,9 @@ indirectgoto: define float @lower_constantpool(float %a) nounwind { ; RV32F-SMALL-LABEL: lower_constantpool: ; RV32F-SMALL: # %bb.0: -; RV32F-SMALL-NEXT: lui a0, %hi(.LCPI3_0) -; RV32F-SMALL-NEXT: flw fa5, %lo(.LCPI3_0)(a0) +; RV32F-SMALL-NEXT: lui a0, 260097 +; RV32F-SMALL-NEXT: addi a0, a0, -2048 +; RV32F-SMALL-NEXT: fmv.w.x fa5, a0 ; RV32F-SMALL-NEXT: fadd.s fa0, fa0, fa5 ; RV32F-SMALL-NEXT: ret ; @@ -301,32 +302,33 @@ define float @lower_constantpool(float %a) nounwind { ; ; RV32F-MEDIUM-LABEL: lower_constantpool: ; RV32F-MEDIUM: # %bb.0: -; RV32F-MEDIUM-NEXT: .Lpcrel_hi3: -; RV32F-MEDIUM-NEXT: auipc a0, %pcrel_hi(.LCPI3_0) -; RV32F-MEDIUM-NEXT: flw fa5, %pcrel_lo(.Lpcrel_hi3)(a0) +; RV32F-MEDIUM-NEXT: lui a0, 260097 +; RV32F-MEDIUM-NEXT: addi a0, a0, -2048 +; RV32F-MEDIUM-NEXT: fmv.w.x fa5, a0 ; RV32F-MEDIUM-NEXT: fadd.s fa0, fa0, fa5 ; RV32F-MEDIUM-NEXT: ret ; ; RV64F-SMALL-LABEL: lower_constantpool: ; RV64F-SMALL: # %bb.0: -; RV64F-SMALL-NEXT: lui a0, %hi(.LCPI3_0) -; RV64F-SMALL-NEXT: flw fa5, %lo(.LCPI3_0)(a0) +; RV64F-SMALL-NEXT: lui a0, 260097 +; RV64F-SMALL-NEXT: addi a0, a0, -2048 +; RV64F-SMALL-NEXT: fmv.w.x fa5, a0 ; RV64F-SMALL-NEXT: fadd.s fa0, fa0, fa5 ; RV64F-SMALL-NEXT: ret ; ; RV64F-MEDIUM-LABEL: lower_constantpool: ; RV64F-MEDIUM: # %bb.0: -; RV64F-MEDIUM-NEXT: .Lpcrel_hi3: -; RV64F-MEDIUM-NEXT: auipc a0, %pcrel_hi(.LCPI3_0) -; RV64F-MEDIUM-NEXT: flw fa5, %pcrel_lo(.Lpcrel_hi3)(a0) +; RV64F-MEDIUM-NEXT: lui a0, 260097 +; RV64F-MEDIUM-NEXT: addi a0, a0, -2048 +; RV64F-MEDIUM-NEXT: fmv.w.x fa5, a0 ; 
RV64F-MEDIUM-NEXT: fadd.s fa0, fa0, fa5 ; RV64F-MEDIUM-NEXT: ret ; ; RV64F-LARGE-LABEL: lower_constantpool: ; RV64F-LARGE: # %bb.0: -; RV64F-LARGE-NEXT: .Lpcrel_hi3: -; RV64F-LARGE-NEXT: auipc a0, %pcrel_hi(.LCPI3_0) -; RV64F-LARGE-NEXT: flw fa5, %pcrel_lo(.Lpcrel_hi3)(a0) +; RV64F-LARGE-NEXT: lui a0, 260097 +; RV64F-LARGE-NEXT: addi a0, a0, -2048 +; RV64F-LARGE-NEXT: fmv.w.x fa5, a0 ; RV64F-LARGE-NEXT: fadd.s fa0, fa0, fa5 ; RV64F-LARGE-NEXT: ret ; @@ -390,13 +392,13 @@ define i32 @lower_extern_weak(i32 %a) nounwind { ; RV32IXQCILI-SMALL-NEXT: lw a0, 0(a0) ; RV32IXQCILI-SMALL-NEXT: ret ; -; RV32F-MEDIUM-LABEL: lower_extern_weak: -; RV32F-MEDIUM: # %bb.0: -; RV32F-MEDIUM-NEXT: .Lpcrel_hi4: -; RV32F-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(W) -; RV32F-MEDIUM-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi4)(a0) -; RV32F-MEDIUM-NEXT: lw a0, 0(a0) -; RV32F-MEDIUM-NEXT: ret +; RV32I-MEDIUM-LABEL: lower_extern_weak: +; RV32I-MEDIUM: # %bb.0: +; RV32I-MEDIUM-NEXT: .Lpcrel_hi3: +; RV32I-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(W) +; RV32I-MEDIUM-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi3)(a0) +; RV32I-MEDIUM-NEXT: lw a0, 0(a0) +; RV32I-MEDIUM-NEXT: ret ; ; RV64I-SMALL-LABEL: lower_extern_weak: ; RV64I-SMALL: # %bb.0: @@ -404,45 +406,21 @@ define i32 @lower_extern_weak(i32 %a) nounwind { ; RV64I-SMALL-NEXT: lw a0, %lo(W)(a0) ; RV64I-SMALL-NEXT: ret ; -; RV64F-MEDIUM-LABEL: lower_extern_weak: -; RV64F-MEDIUM: # %bb.0: -; RV64F-MEDIUM-NEXT: .Lpcrel_hi4: -; RV64F-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(W) -; RV64F-MEDIUM-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi4)(a0) -; RV64F-MEDIUM-NEXT: lw a0, 0(a0) -; RV64F-MEDIUM-NEXT: ret -; -; RV64F-LARGE-LABEL: lower_extern_weak: -; RV64F-LARGE: # %bb.0: -; RV64F-LARGE-NEXT: .Lpcrel_hi4: -; RV64F-LARGE-NEXT: auipc a0, %pcrel_hi(.LCPI4_0) -; RV64F-LARGE-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi4)(a0) -; RV64F-LARGE-NEXT: lw a0, 0(a0) -; RV64F-LARGE-NEXT: ret -; -; RV32FINX-MEDIUM-LABEL: lower_extern_weak: -; RV32FINX-MEDIUM: # %bb.0: -; RV32FINX-MEDIUM-NEXT: .Lpcrel_hi3: -; RV32FINX-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(W) -; RV32FINX-MEDIUM-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi3)(a0) -; RV32FINX-MEDIUM-NEXT: lw a0, 0(a0) -; RV32FINX-MEDIUM-NEXT: ret -; -; RV64FINX-MEDIUM-LABEL: lower_extern_weak: -; RV64FINX-MEDIUM: # %bb.0: -; RV64FINX-MEDIUM-NEXT: .Lpcrel_hi3: -; RV64FINX-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(W) -; RV64FINX-MEDIUM-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi3)(a0) -; RV64FINX-MEDIUM-NEXT: lw a0, 0(a0) -; RV64FINX-MEDIUM-NEXT: ret +; RV64I-MEDIUM-LABEL: lower_extern_weak: +; RV64I-MEDIUM: # %bb.0: +; RV64I-MEDIUM-NEXT: .Lpcrel_hi3: +; RV64I-MEDIUM-NEXT: auipc a0, %got_pcrel_hi(W) +; RV64I-MEDIUM-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi3)(a0) +; RV64I-MEDIUM-NEXT: lw a0, 0(a0) +; RV64I-MEDIUM-NEXT: ret ; -; RV64FINX-LARGE-LABEL: lower_extern_weak: -; RV64FINX-LARGE: # %bb.0: -; RV64FINX-LARGE-NEXT: .Lpcrel_hi3: -; RV64FINX-LARGE-NEXT: auipc a0, %pcrel_hi(.LCPI4_0) -; RV64FINX-LARGE-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi3)(a0) -; RV64FINX-LARGE-NEXT: lw a0, 0(a0) -; RV64FINX-LARGE-NEXT: ret +; RV64I-LARGE-LABEL: lower_extern_weak: +; RV64I-LARGE: # %bb.0: +; RV64I-LARGE-NEXT: .Lpcrel_hi3: +; RV64I-LARGE-NEXT: auipc a0, %pcrel_hi(.LCPI4_0) +; RV64I-LARGE-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi3)(a0) +; RV64I-LARGE-NEXT: lw a0, 0(a0) +; RV64I-LARGE-NEXT: ret %1 = load volatile i32, ptr @W ret i32 %1 } @@ -466,9 +444,9 @@ define half @lower_global_half(half %a) nounwind { ; ; RV32F-MEDIUM-LABEL: lower_global_half: ; RV32F-MEDIUM: # %bb.0: -; RV32F-MEDIUM-NEXT: .Lpcrel_hi5: +; RV32F-MEDIUM-NEXT: .Lpcrel_hi4: ; RV32F-MEDIUM-NEXT: 
auipc a0, %pcrel_hi(X) -; RV32F-MEDIUM-NEXT: flh fa5, %pcrel_lo(.Lpcrel_hi5)(a0) +; RV32F-MEDIUM-NEXT: flh fa5, %pcrel_lo(.Lpcrel_hi4)(a0) ; RV32F-MEDIUM-NEXT: fadd.h fa0, fa0, fa5 ; RV32F-MEDIUM-NEXT: ret ; @@ -481,17 +459,17 @@ define half @lower_global_half(half %a) nounwind { ; ; RV64F-MEDIUM-LABEL: lower_global_half: ; RV64F-MEDIUM: # %bb.0: -; RV64F-MEDIUM-NEXT: .Lpcrel_hi5: +; RV64F-MEDIUM-NEXT: .Lpcrel_hi4: ; RV64F-MEDIUM-NEXT: auipc a0, %pcrel_hi(X) -; RV64F-MEDIUM-NEXT: flh fa5, %pcrel_lo(.Lpcrel_hi5)(a0) +; RV64F-MEDIUM-NEXT: flh fa5, %pcrel_lo(.Lpcrel_hi4)(a0) ; RV64F-MEDIUM-NEXT: fadd.h fa0, fa0, fa5 ; RV64F-MEDIUM-NEXT: ret ; ; RV64F-LARGE-LABEL: lower_global_half: ; RV64F-LARGE: # %bb.0: -; RV64F-LARGE-NEXT: .Lpcrel_hi5: +; RV64F-LARGE-NEXT: .Lpcrel_hi4: ; RV64F-LARGE-NEXT: auipc a0, %pcrel_hi(.LCPI5_0) -; RV64F-LARGE-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi5)(a0) +; RV64F-LARGE-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi4)(a0) ; RV64F-LARGE-NEXT: flh fa5, 0(a0) ; RV64F-LARGE-NEXT: fadd.h fa0, fa0, fa5 ; RV64F-LARGE-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll index 8124d00e63fa7..c3e729800616d 100644 --- a/llvm/test/CodeGen/RISCV/double-convert.ll +++ b/llvm/test/CodeGen/RISCV/double-convert.ll @@ -1636,14 +1636,15 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind { ; ; RV64IFD-LABEL: fcvt_w_s_sat_i16: ; RV64IFD: # %bb.0: # %start -; RV64IFD-NEXT: lui a0, %hi(.LCPI26_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI26_0)(a0) -; RV64IFD-NEXT: lui a0, %hi(.LCPI26_1) -; RV64IFD-NEXT: fld fa4, %lo(.LCPI26_1)(a0) ; RV64IFD-NEXT: feq.d a0, fa0, fa0 -; RV64IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV64IFD-NEXT: lui a1, %hi(.LCPI26_0) +; RV64IFD-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; RV64IFD-NEXT: li a1, -505 +; RV64IFD-NEXT: slli a1, a1, 53 +; RV64IFD-NEXT: fmv.d.x fa4, a1 +; RV64IFD-NEXT: fmax.d fa4, fa0, fa4 ; RV64IFD-NEXT: neg a0, a0 -; RV64IFD-NEXT: fmin.d fa5, fa5, fa4 +; RV64IFD-NEXT: fmin.d fa5, fa4, fa5 ; RV64IFD-NEXT: fcvt.l.d a1, fa5, rtz ; RV64IFD-NEXT: and a0, a0, a1 ; RV64IFD-NEXT: ret @@ -1668,16 +1669,17 @@ define signext i16 @fcvt_w_s_sat_i16(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: fcvt_w_s_sat_i16: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: li a1, -505 -; RV64IZFINXZDINX-NEXT: lui a2, %hi(.LCPI26_0) -; RV64IZFINXZDINX-NEXT: slli a1, a1, 53 -; RV64IZFINXZDINX-NEXT: ld a2, %lo(.LCPI26_0)(a2) -; RV64IZFINXZDINX-NEXT: fmax.d a1, a0, a1 -; RV64IZFINXZDINX-NEXT: feq.d a0, a0, a0 -; RV64IZFINXZDINX-NEXT: neg a0, a0 -; RV64IZFINXZDINX-NEXT: fmin.d a1, a1, a2 -; RV64IZFINXZDINX-NEXT: fcvt.l.d a1, a1, rtz -; RV64IZFINXZDINX-NEXT: and a0, a0, a1 +; RV64IZFINXZDINX-NEXT: feq.d a1, a0, a0 +; RV64IZFINXZDINX-NEXT: li a2, -505 +; RV64IZFINXZDINX-NEXT: slli a2, a2, 53 +; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, a2 +; RV64IZFINXZDINX-NEXT: lui a2, 4152 +; RV64IZFINXZDINX-NEXT: neg a1, a1 +; RV64IZFINXZDINX-NEXT: addi a2, a2, -1 +; RV64IZFINXZDINX-NEXT: slli a2, a2, 38 +; RV64IZFINXZDINX-NEXT: fmin.d a0, a0, a2 +; RV64IZFINXZDINX-NEXT: fcvt.l.d a0, a0, rtz +; RV64IZFINXZDINX-NEXT: and a0, a1, a0 ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcvt_w_s_sat_i16: @@ -1859,9 +1861,10 @@ define zeroext i16 @fcvt_wu_s_sat_i16(double %a) nounwind { ; ; RV64IZFINXZDINX-LABEL: fcvt_wu_s_sat_i16: ; RV64IZFINXZDINX: # %bb.0: # %start -; RV64IZFINXZDINX-NEXT: lui a1, %hi(.LCPI28_0) -; RV64IZFINXZDINX-NEXT: ld a1, %lo(.LCPI28_0)(a1) ; RV64IZFINXZDINX-NEXT: fmax.d a0, a0, zero +; RV64IZFINXZDINX-NEXT: lui a1, 8312 +; RV64IZFINXZDINX-NEXT: 
addi a1, a1, -1 +; RV64IZFINXZDINX-NEXT: slli a1, a1, 37 ; RV64IZFINXZDINX-NEXT: fmin.d a0, a0, a1 ; RV64IZFINXZDINX-NEXT: fcvt.lu.d a0, a0, rtz ; RV64IZFINXZDINX-NEXT: ret @@ -2012,13 +2015,15 @@ define signext i8 @fcvt_w_s_sat_i8(double %a) nounwind { ; ; RV64IFD-LABEL: fcvt_w_s_sat_i8: ; RV64IFD: # %bb.0: # %start -; RV64IFD-NEXT: lui a0, %hi(.LCPI30_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI30_0)(a0) -; RV64IFD-NEXT: lui a0, %hi(.LCPI30_1) -; RV64IFD-NEXT: fld fa4, %lo(.LCPI30_1)(a0) ; RV64IFD-NEXT: feq.d a0, fa0, fa0 -; RV64IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV64IFD-NEXT: li a1, -509 +; RV64IFD-NEXT: slli a1, a1, 53 +; RV64IFD-NEXT: fmv.d.x fa5, a1 +; RV64IFD-NEXT: lui a1, 65919 ; RV64IFD-NEXT: neg a0, a0 +; RV64IFD-NEXT: slli a1, a1, 34 +; RV64IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV64IFD-NEXT: fmv.d.x fa4, a1 ; RV64IFD-NEXT: fmin.d fa5, fa5, fa4 ; RV64IFD-NEXT: fcvt.l.d a1, fa5, rtz ; RV64IFD-NEXT: and a0, a0, a1 @@ -2214,11 +2219,12 @@ define zeroext i8 @fcvt_wu_s_sat_i8(double %a) nounwind { ; ; RV64IFD-LABEL: fcvt_wu_s_sat_i8: ; RV64IFD: # %bb.0: # %start -; RV64IFD-NEXT: lui a0, %hi(.LCPI32_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI32_0)(a0) -; RV64IFD-NEXT: fmv.d.x fa4, zero -; RV64IFD-NEXT: fmax.d fa4, fa0, fa4 -; RV64IFD-NEXT: fmin.d fa5, fa4, fa5 +; RV64IFD-NEXT: fmv.d.x fa5, zero +; RV64IFD-NEXT: lui a0, 131967 +; RV64IFD-NEXT: fmax.d fa5, fa0, fa5 +; RV64IFD-NEXT: slli a0, a0, 33 +; RV64IFD-NEXT: fmv.d.x fa4, a0 +; RV64IFD-NEXT: fmin.d fa5, fa5, fa4 ; RV64IFD-NEXT: fcvt.lu.d a0, fa5, rtz ; RV64IFD-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/double-imm.ll b/llvm/test/CodeGen/RISCV/double-imm.ll index 1119fd6d74a25..6f7c30edba3ea 100644 --- a/llvm/test/CodeGen/RISCV/double-imm.ll +++ b/llvm/test/CodeGen/RISCV/double-imm.ll @@ -47,8 +47,9 @@ define double @double_imm_op(double %a) nounwind { ; ; CHECK64D-LABEL: double_imm_op: ; CHECK64D: # %bb.0: -; CHECK64D-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK64D-NEXT: fld fa5, %lo(.LCPI1_0)(a0) +; CHECK64D-NEXT: li a0, 1023 +; CHECK64D-NEXT: slli a0, a0, 52 +; CHECK64D-NEXT: fmv.d.x fa5, a0 ; CHECK64D-NEXT: fadd.d fa0, fa0, fa5 ; CHECK64D-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/double-intrinsics.ll b/llvm/test/CodeGen/RISCV/double-intrinsics.ll index bb57665fa1801..caeb6e6ce70af 100644 --- a/llvm/test/CodeGen/RISCV/double-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/double-intrinsics.ll @@ -866,8 +866,9 @@ define double @floor_f64(double %a) nounwind { ; ; RV64IFD-LABEL: floor_f64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lui a0, %hi(.LCPI18_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV64IFD-NEXT: li a0, 1075 +; RV64IFD-NEXT: slli a0, a0, 52 +; RV64IFD-NEXT: fmv.d.x fa5, a0 ; RV64IFD-NEXT: fabs.d fa4, fa0 ; RV64IFD-NEXT: flt.d a0, fa4, fa5 ; RV64IFD-NEXT: beqz a0, .LBB18_2 @@ -931,8 +932,9 @@ define double @ceil_f64(double %a) nounwind { ; ; RV64IFD-LABEL: ceil_f64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lui a0, %hi(.LCPI19_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI19_0)(a0) +; RV64IFD-NEXT: li a0, 1075 +; RV64IFD-NEXT: slli a0, a0, 52 +; RV64IFD-NEXT: fmv.d.x fa5, a0 ; RV64IFD-NEXT: fabs.d fa4, fa0 ; RV64IFD-NEXT: flt.d a0, fa4, fa5 ; RV64IFD-NEXT: beqz a0, .LBB19_2 @@ -996,8 +998,9 @@ define double @trunc_f64(double %a) nounwind { ; ; RV64IFD-LABEL: trunc_f64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lui a0, %hi(.LCPI20_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV64IFD-NEXT: li a0, 1075 +; RV64IFD-NEXT: slli a0, a0, 52 +; RV64IFD-NEXT: fmv.d.x fa5, a0 ; RV64IFD-NEXT: fabs.d fa4, fa0 ; RV64IFD-NEXT: flt.d a0, fa4, fa5 ; RV64IFD-NEXT: 
beqz a0, .LBB20_2 @@ -1061,8 +1064,9 @@ define double @rint_f64(double %a) nounwind { ; ; RV64IFD-LABEL: rint_f64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lui a0, %hi(.LCPI21_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI21_0)(a0) +; RV64IFD-NEXT: li a0, 1075 +; RV64IFD-NEXT: slli a0, a0, 52 +; RV64IFD-NEXT: fmv.d.x fa5, a0 ; RV64IFD-NEXT: fabs.d fa4, fa0 ; RV64IFD-NEXT: flt.d a0, fa4, fa5 ; RV64IFD-NEXT: beqz a0, .LBB21_2 @@ -1167,8 +1171,9 @@ define double @round_f64(double %a) nounwind { ; ; RV64IFD-LABEL: round_f64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lui a0, %hi(.LCPI23_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI23_0)(a0) +; RV64IFD-NEXT: li a0, 1075 +; RV64IFD-NEXT: slli a0, a0, 52 +; RV64IFD-NEXT: fmv.d.x fa5, a0 ; RV64IFD-NEXT: fabs.d fa4, fa0 ; RV64IFD-NEXT: flt.d a0, fa4, fa5 ; RV64IFD-NEXT: beqz a0, .LBB23_2 @@ -1232,8 +1237,9 @@ define double @roundeven_f64(double %a) nounwind { ; ; RV64IFD-LABEL: roundeven_f64: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lui a0, %hi(.LCPI24_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI24_0)(a0) +; RV64IFD-NEXT: li a0, 1075 +; RV64IFD-NEXT: slli a0, a0, 52 +; RV64IFD-NEXT: fmv.d.x fa5, a0 ; RV64IFD-NEXT: fabs.d fa4, fa0 ; RV64IFD-NEXT: flt.d a0, fa4, fa5 ; RV64IFD-NEXT: beqz a0, .LBB24_2 diff --git a/llvm/test/CodeGen/RISCV/double-round-conv.ll b/llvm/test/CodeGen/RISCV/double-round-conv.ll index 3edbda3a4bf6b..6dd24c056e386 100644 --- a/llvm/test/CodeGen/RISCV/double-round-conv.ll +++ b/llvm/test/CodeGen/RISCV/double-round-conv.ll @@ -1145,8 +1145,9 @@ define double @test_floor_double(double %x) { ; ; RV64IFD-LABEL: test_floor_double: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lui a0, %hi(.LCPI40_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI40_0)(a0) +; RV64IFD-NEXT: li a0, 1075 +; RV64IFD-NEXT: slli a0, a0, 52 +; RV64IFD-NEXT: fmv.d.x fa5, a0 ; RV64IFD-NEXT: fabs.d fa4, fa0 ; RV64IFD-NEXT: flt.d a0, fa4, fa5 ; RV64IFD-NEXT: beqz a0, .LBB40_2 @@ -1194,8 +1195,9 @@ define double @test_ceil_double(double %x) { ; ; RV64IFD-LABEL: test_ceil_double: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lui a0, %hi(.LCPI41_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI41_0)(a0) +; RV64IFD-NEXT: li a0, 1075 +; RV64IFD-NEXT: slli a0, a0, 52 +; RV64IFD-NEXT: fmv.d.x fa5, a0 ; RV64IFD-NEXT: fabs.d fa4, fa0 ; RV64IFD-NEXT: flt.d a0, fa4, fa5 ; RV64IFD-NEXT: beqz a0, .LBB41_2 @@ -1243,8 +1245,9 @@ define double @test_trunc_double(double %x) { ; ; RV64IFD-LABEL: test_trunc_double: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lui a0, %hi(.LCPI42_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI42_0)(a0) +; RV64IFD-NEXT: li a0, 1075 +; RV64IFD-NEXT: slli a0, a0, 52 +; RV64IFD-NEXT: fmv.d.x fa5, a0 ; RV64IFD-NEXT: fabs.d fa4, fa0 ; RV64IFD-NEXT: flt.d a0, fa4, fa5 ; RV64IFD-NEXT: beqz a0, .LBB42_2 @@ -1292,8 +1295,9 @@ define double @test_round_double(double %x) { ; ; RV64IFD-LABEL: test_round_double: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lui a0, %hi(.LCPI43_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI43_0)(a0) +; RV64IFD-NEXT: li a0, 1075 +; RV64IFD-NEXT: slli a0, a0, 52 +; RV64IFD-NEXT: fmv.d.x fa5, a0 ; RV64IFD-NEXT: fabs.d fa4, fa0 ; RV64IFD-NEXT: flt.d a0, fa4, fa5 ; RV64IFD-NEXT: beqz a0, .LBB43_2 @@ -1341,8 +1345,9 @@ define double @test_roundeven_double(double %x) { ; ; RV64IFD-LABEL: test_roundeven_double: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lui a0, %hi(.LCPI44_0) -; RV64IFD-NEXT: fld fa5, %lo(.LCPI44_0)(a0) +; RV64IFD-NEXT: li a0, 1075 +; RV64IFD-NEXT: slli a0, a0, 52 +; RV64IFD-NEXT: fmv.d.x fa5, a0 ; RV64IFD-NEXT: fabs.d fa4, fa0 ; RV64IFD-NEXT: flt.d a0, fa4, fa5 ; RV64IFD-NEXT: beqz a0, .LBB44_2 diff --git 
a/llvm/test/CodeGen/RISCV/double-select-fcmp.ll b/llvm/test/CodeGen/RISCV/double-select-fcmp.ll index 1deea55b083ce..cd3ff779d8cd3 100644 --- a/llvm/test/CodeGen/RISCV/double-select-fcmp.ll +++ b/llvm/test/CodeGen/RISCV/double-select-fcmp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d -verify-machineinstrs < %s \ -; RUN: -target-abi=ilp32d | FileCheck %s +; RUN: -target-abi=ilp32d | FileCheck --check-prefixes=CHECK,RV32D %s ; RUN: llc -mtriple=riscv64 -mattr=+d -verify-machineinstrs < %s \ -; RUN: -target-abi=lp64d | FileCheck %s +; RUN: -target-abi=lp64d | FileCheck --check-prefixes=CHECK,RV64D %s ; RUN: llc -mtriple=riscv32 -mattr=+zdinx -verify-machineinstrs < %s \ ; RUN: -target-abi=ilp32 | FileCheck --check-prefix=CHECKRV32ZDINX %s ; RUN: llc -mtriple=riscv64 -mattr=+zdinx -verify-machineinstrs < %s \ @@ -640,6 +640,39 @@ define signext i32 @select_fcmp_uge_1_2(double %a, double %b) nounwind { } define double @CascadedSelect(double noundef %a) { +; RV32D-LABEL: CascadedSelect: +; RV32D: # %bb.0: # %entry +; RV32D-NEXT: lui a0, %hi(.LCPI20_0) +; RV32D-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32D-NEXT: flt.d a0, fa5, fa0 +; RV32D-NEXT: bnez a0, .LBB20_3 +; RV32D-NEXT: # %bb.1: # %entry +; RV32D-NEXT: fcvt.d.w fa5, zero +; RV32D-NEXT: flt.d a0, fa0, fa5 +; RV32D-NEXT: bnez a0, .LBB20_3 +; RV32D-NEXT: # %bb.2: # %entry +; RV32D-NEXT: fmv.d fa5, fa0 +; RV32D-NEXT: .LBB20_3: # %entry +; RV32D-NEXT: fmv.d fa0, fa5 +; RV32D-NEXT: ret +; +; RV64D-LABEL: CascadedSelect: +; RV64D: # %bb.0: # %entry +; RV64D-NEXT: li a0, 1023 +; RV64D-NEXT: slli a0, a0, 52 +; RV64D-NEXT: fmv.d.x fa5, a0 +; RV64D-NEXT: flt.d a0, fa5, fa0 +; RV64D-NEXT: bnez a0, .LBB20_3 +; RV64D-NEXT: # %bb.1: # %entry +; RV64D-NEXT: fmv.d.x fa5, zero +; RV64D-NEXT: flt.d a0, fa0, fa5 +; RV64D-NEXT: bnez a0, .LBB20_3 +; RV64D-NEXT: # %bb.2: # %entry +; RV64D-NEXT: fmv.d fa5, fa0 +; RV64D-NEXT: .LBB20_3: # %entry +; RV64D-NEXT: fmv.d fa0, fa5 +; RV64D-NEXT: ret +; ; CHECKRV32ZDINX-LABEL: CascadedSelect: ; CHECKRV32ZDINX: # %bb.0: # %entry ; CHECKRV32ZDINX-NEXT: lui a3, %hi(.LCPI20_0) diff --git a/llvm/test/CodeGen/RISCV/double-zfa.ll b/llvm/test/CodeGen/RISCV/double-zfa.ll index 2f35496b9b32c..f17c63ddb6cae 100644 --- a/llvm/test/CodeGen/RISCV/double-zfa.ll +++ b/llvm/test/CodeGen/RISCV/double-zfa.ll @@ -69,21 +69,35 @@ define double @loadfpimm8() { } define double @loadfpimm9() { -; CHECK-LABEL: loadfpimm9: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI8_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI8_0)(a0) -; CHECK-NEXT: ret +; RV32IDZFA-LABEL: loadfpimm9: +; RV32IDZFA: # %bb.0: +; RV32IDZFA-NEXT: lui a0, %hi(.LCPI8_0) +; RV32IDZFA-NEXT: fld fa0, %lo(.LCPI8_0)(a0) +; RV32IDZFA-NEXT: ret +; +; RV64DZFA-LABEL: loadfpimm9: +; RV64DZFA: # %bb.0: +; RV64DZFA-NEXT: lui a0, 131967 +; RV64DZFA-NEXT: slli a0, a0, 33 +; RV64DZFA-NEXT: fmv.d.x fa0, a0 +; RV64DZFA-NEXT: ret ret double 255.0 } ; Negative test. This is 1 * 2^256. 
define double @loadfpimm10() { -; CHECK-LABEL: loadfpimm10: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI9_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI9_0)(a0) -; CHECK-NEXT: ret +; RV32IDZFA-LABEL: loadfpimm10: +; RV32IDZFA: # %bb.0: +; RV32IDZFA-NEXT: lui a0, %hi(.LCPI9_0) +; RV32IDZFA-NEXT: fld fa0, %lo(.LCPI9_0)(a0) +; RV32IDZFA-NEXT: ret +; +; RV64DZFA-LABEL: loadfpimm10: +; RV64DZFA: # %bb.0: +; RV64DZFA-NEXT: li a0, 1 +; RV64DZFA-NEXT: slli a0, a0, 60 +; RV64DZFA-NEXT: fmv.d.x fa0, a0 +; RV64DZFA-NEXT: ret ret double 0x1000000000000000 } @@ -125,11 +139,18 @@ define double @loadfpimm13() { ; Negative test. This is 2^-1023, a denormal. define double @loadfpimm15() { -; CHECK-LABEL: loadfpimm15: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI13_0)(a0) -; CHECK-NEXT: ret +; RV32IDZFA-LABEL: loadfpimm15: +; RV32IDZFA: # %bb.0: +; RV32IDZFA-NEXT: lui a0, %hi(.LCPI13_0) +; RV32IDZFA-NEXT: fld fa0, %lo(.LCPI13_0)(a0) +; RV32IDZFA-NEXT: ret +; +; RV64DZFA-LABEL: loadfpimm15: +; RV64DZFA: # %bb.0: +; RV64DZFA-NEXT: li a0, 1 +; RV64DZFA-NEXT: slli a0, a0, 51 +; RV64DZFA-NEXT: fmv.d.x fa0, a0 +; RV64DZFA-NEXT: ret ret double 0x0008000000000000 } diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index fc77d6cb7c7be..1a7a72d3e072b 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -191,6 +191,7 @@ ; CHECK-NEXT: xandesvbfhcvt - 'XAndesVBFHCvt' (Andes Vector BFLOAT16 Conversion Extension). ; CHECK-NEXT: xandesvdot - 'XAndesVDot' (Andes Vector Dot Product Extension). ; CHECK-NEXT: xandesvpackfph - 'XAndesVPackFPH' (Andes Vector Packed FP16 Extension). +; CHECK-NEXT: xandesvsinth - 'XAndesVSIntH' (Andes Vector Small INT Handling Extension). ; CHECK-NEXT: xandesvsintload - 'XAndesVSIntLoad' (Andes Vector INT4 Load Extension). ; CHECK-NEXT: xcvalu - 'XCValu' (CORE-V ALU Operations). ; CHECK-NEXT: xcvbi - 'XCVbi' (CORE-V Immediate Branching). 
diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll index 72578193ee4bf..e6e4f6642f685 100644 --- a/llvm/test/CodeGen/RISCV/float-convert.ll +++ b/llvm/test/CodeGen/RISCV/float-convert.ll @@ -623,20 +623,21 @@ define i64 @fcvt_l_s_sat(float %a) nounwind { ; RV32IF-NEXT: # %bb.1: # %start ; RV32IF-NEXT: mv a2, a1 ; RV32IF-NEXT: .LBB12_2: # %start -; RV32IF-NEXT: lui a1, %hi(.LCPI12_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI12_0)(a1) +; RV32IF-NEXT: lui a1, 389120 +; RV32IF-NEXT: addi a1, a1, -1 +; RV32IF-NEXT: fmv.w.x fa5, a1 ; RV32IF-NEXT: flt.s a1, fa5, fs0 ; RV32IF-NEXT: beqz a1, .LBB12_4 ; RV32IF-NEXT: # %bb.3: ; RV32IF-NEXT: addi a2, a3, -1 ; RV32IF-NEXT: .LBB12_4: # %start ; RV32IF-NEXT: feq.s a3, fs0, fs0 -; RV32IF-NEXT: neg a4, a1 -; RV32IF-NEXT: neg a1, s0 +; RV32IF-NEXT: neg a4, s0 +; RV32IF-NEXT: neg a5, a1 ; RV32IF-NEXT: neg a3, a3 -; RV32IF-NEXT: and a0, a1, a0 +; RV32IF-NEXT: and a0, a4, a0 ; RV32IF-NEXT: and a1, a3, a2 -; RV32IF-NEXT: or a0, a4, a0 +; RV32IF-NEXT: or a0, a5, a0 ; RV32IF-NEXT: and a0, a3, a0 ; RV32IF-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IF-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -864,10 +865,11 @@ define i64 @fcvt_lu_s_sat(float %a) nounwind { ; RV32IF-NEXT: fle.s a0, fa5, fa0 ; RV32IF-NEXT: neg s0, a0 ; RV32IF-NEXT: call __fixunssfdi -; RV32IF-NEXT: lui a2, %hi(.LCPI14_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI14_0)(a2) ; RV32IF-NEXT: and a0, s0, a0 +; RV32IF-NEXT: lui a2, 391168 ; RV32IF-NEXT: and a1, s0, a1 +; RV32IF-NEXT: addi a2, a2, -1 +; RV32IF-NEXT: fmv.w.x fa5, a2 ; RV32IF-NEXT: flt.s a2, fa5, fs0 ; RV32IF-NEXT: neg a2, a2 ; RV32IF-NEXT: or a0, a2, a0 @@ -1405,13 +1407,14 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind { ; RV32IF-LABEL: fcvt_w_s_sat_i16: ; RV32IF: # %bb.0: # %start ; RV32IF-NEXT: feq.s a0, fa0, fa0 -; RV32IF-NEXT: lui a1, %hi(.LCPI24_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI24_0)(a1) ; RV32IF-NEXT: lui a1, 815104 -; RV32IF-NEXT: fmv.w.x fa4, a1 -; RV32IF-NEXT: fmax.s fa4, fa0, fa4 +; RV32IF-NEXT: fmv.w.x fa5, a1 +; RV32IF-NEXT: lui a1, 290816 ; RV32IF-NEXT: neg a0, a0 -; RV32IF-NEXT: fmin.s fa5, fa4, fa5 +; RV32IF-NEXT: addi a1, a1, -512 +; RV32IF-NEXT: fmax.s fa5, fa0, fa5 +; RV32IF-NEXT: fmv.w.x fa4, a1 +; RV32IF-NEXT: fmin.s fa5, fa5, fa4 ; RV32IF-NEXT: fcvt.w.s a1, fa5, rtz ; RV32IF-NEXT: and a0, a0, a1 ; RV32IF-NEXT: ret @@ -1419,13 +1422,14 @@ define signext i16 @fcvt_w_s_sat_i16(float %a) nounwind { ; RV64IF-LABEL: fcvt_w_s_sat_i16: ; RV64IF: # %bb.0: # %start ; RV64IF-NEXT: feq.s a0, fa0, fa0 -; RV64IF-NEXT: lui a1, %hi(.LCPI24_0) -; RV64IF-NEXT: flw fa5, %lo(.LCPI24_0)(a1) ; RV64IF-NEXT: lui a1, 815104 -; RV64IF-NEXT: fmv.w.x fa4, a1 -; RV64IF-NEXT: fmax.s fa4, fa0, fa4 +; RV64IF-NEXT: fmv.w.x fa5, a1 +; RV64IF-NEXT: lui a1, 290816 ; RV64IF-NEXT: neg a0, a0 -; RV64IF-NEXT: fmin.s fa5, fa4, fa5 +; RV64IF-NEXT: addi a1, a1, -512 +; RV64IF-NEXT: fmax.s fa5, fa0, fa5 +; RV64IF-NEXT: fmv.w.x fa4, a1 +; RV64IF-NEXT: fmin.s fa5, fa5, fa4 ; RV64IF-NEXT: fcvt.l.s a1, fa5, rtz ; RV64IF-NEXT: and a0, a0, a1 ; RV64IF-NEXT: ret @@ -1590,21 +1594,23 @@ define zeroext i16 @fcvt_wu_s_i16(float %a) nounwind { define zeroext i16 @fcvt_wu_s_sat_i16(float %a) nounwind { ; RV32IF-LABEL: fcvt_wu_s_sat_i16: ; RV32IF: # %bb.0: # %start -; RV32IF-NEXT: lui a0, %hi(.LCPI26_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI26_0)(a0) -; RV32IF-NEXT: fmv.w.x fa4, zero -; RV32IF-NEXT: fmax.s fa4, fa0, fa4 -; RV32IF-NEXT: fmin.s fa5, fa4, fa5 +; RV32IF-NEXT: fmv.w.x fa5, zero +; RV32IF-NEXT: lui a0, 292864 +; 
RV32IF-NEXT: fmax.s fa5, fa0, fa5 +; RV32IF-NEXT: addi a0, a0, -256 +; RV32IF-NEXT: fmv.w.x fa4, a0 +; RV32IF-NEXT: fmin.s fa5, fa5, fa4 ; RV32IF-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32IF-NEXT: ret ; ; RV64IF-LABEL: fcvt_wu_s_sat_i16: ; RV64IF: # %bb.0: # %start -; RV64IF-NEXT: lui a0, %hi(.LCPI26_0) -; RV64IF-NEXT: flw fa5, %lo(.LCPI26_0)(a0) -; RV64IF-NEXT: fmv.w.x fa4, zero -; RV64IF-NEXT: fmax.s fa4, fa0, fa4 -; RV64IF-NEXT: fmin.s fa5, fa4, fa5 +; RV64IF-NEXT: fmv.w.x fa5, zero +; RV64IF-NEXT: lui a0, 292864 +; RV64IF-NEXT: fmax.s fa5, fa0, fa5 +; RV64IF-NEXT: addi a0, a0, -256 +; RV64IF-NEXT: fmv.w.x fa4, a0 +; RV64IF-NEXT: fmin.s fa5, fa5, fa4 ; RV64IF-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64IF-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/float-imm.ll b/llvm/test/CodeGen/RISCV/float-imm.ll index a010ab49b2827..e4e34543d6314 100644 --- a/llvm/test/CodeGen/RISCV/float-imm.ll +++ b/llvm/test/CodeGen/RISCV/float-imm.ll @@ -12,8 +12,9 @@ define float @float_imm() nounwind { ; CHECK-LABEL: float_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flw fa0, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: lui a0, 263313 +; CHECK-NEXT: addi a0, a0, -37 +; CHECK-NEXT: fmv.w.x fa0, a0 ; CHECK-NEXT: ret ; ; CHECKZFINX-LABEL: float_imm: diff --git a/llvm/test/CodeGen/RISCV/float-intrinsics.ll b/llvm/test/CodeGen/RISCV/float-intrinsics.ll index 8b8a3257a0027..b1230ae9dd6bf 100644 --- a/llvm/test/CodeGen/RISCV/float-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/float-intrinsics.ll @@ -16,7 +16,7 @@ ; RUN: | FileCheck -check-prefix=RV64IZFINX %s ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+d \ ; RUN: -verify-machineinstrs -target-abi=lp64d \ -; RUN: | FileCheck -check-prefix=RV64IF %s +; RUN: | FileCheck -check-prefixes=RV64IFD %s ; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 \ ; RUN: -verify-machineinstrs | FileCheck -check-prefix=RV32I %s ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 \ @@ -45,6 +45,11 @@ define float @sqrt_f32(float %a) nounwind { ; RV64IZFINX-NEXT: fsqrt.s a0, a0 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: sqrt_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fsqrt.s fa0, fa0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: sqrt_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -97,6 +102,16 @@ define float @powi_f32(float %a, i32 %b) nounwind { ; RV64IZFINX-NEXT: addi sp, sp, 16 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: powi_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: sext.w a0, a0 +; RV64IFD-NEXT: call __powisf2 +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: powi_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -138,6 +153,10 @@ define float @sin_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail sinf ; +; RV64IFD-LABEL: sin_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail sinf +; ; RV32I-LABEL: sin_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -178,6 +197,10 @@ define float @cos_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail cosf ; +; RV64IFD-LABEL: cos_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail cosf +; ; RV32I-LABEL: cos_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -237,6 +260,24 @@ define float @sincos_f32(float %a) nounwind { ; RV32IZFINX-NEXT: addi sp, sp, 16 ; RV32IZFINX-NEXT: ret ; +; RV64IF-LABEL: sincos_f32: +; RV64IF: # %bb.0: +; RV64IF-NEXT: addi sp, sp, -16 +; RV64IF-NEXT: sd ra, 8(sp) # 8-byte Folded 
Spill +; RV64IF-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill +; RV64IF-NEXT: fsw fs1, 0(sp) # 4-byte Folded Spill +; RV64IF-NEXT: fmv.s fs0, fa0 +; RV64IF-NEXT: call sinf +; RV64IF-NEXT: fmv.s fs1, fa0 +; RV64IF-NEXT: fmv.s fa0, fs0 +; RV64IF-NEXT: call cosf +; RV64IF-NEXT: fadd.s fa0, fs1, fa0 +; RV64IF-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IF-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload +; RV64IF-NEXT: flw fs1, 0(sp) # 4-byte Folded Reload +; RV64IF-NEXT: addi sp, sp, 16 +; RV64IF-NEXT: ret +; ; RV64IZFINX-LABEL: sincos_f32: ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: addi sp, sp, -32 @@ -255,6 +296,24 @@ define float @sincos_f32(float %a) nounwind { ; RV64IZFINX-NEXT: addi sp, sp, 32 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: sincos_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -32 +; RV64IFD-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: fsd fs1, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: fmv.s fs0, fa0 +; RV64IFD-NEXT: call sinf +; RV64IFD-NEXT: fmv.s fs1, fa0 +; RV64IFD-NEXT: fmv.s fa0, fs0 +; RV64IFD-NEXT: call cosf +; RV64IFD-NEXT: fadd.s fa0, fs1, fa0 +; RV64IFD-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: fld fs1, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 32 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: sincos_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -319,6 +378,10 @@ define float @pow_f32(float %a, float %b) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail powf ; +; RV64IFD-LABEL: pow_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail powf +; ; RV32I-LABEL: pow_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -359,6 +422,10 @@ define float @exp_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail expf ; +; RV64IFD-LABEL: exp_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail expf +; ; RV32I-LABEL: exp_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -399,6 +466,10 @@ define float @exp2_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail exp2f ; +; RV64IFD-LABEL: exp2_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail exp2f +; ; RV32I-LABEL: exp2_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -437,6 +508,10 @@ define float @exp10_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail exp10f ; +; RV64IFD-LABEL: exp10_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail exp10f +; ; RV32I-LABEL: exp10_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -477,6 +552,10 @@ define float @log_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail logf ; +; RV64IFD-LABEL: log_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail logf +; ; RV32I-LABEL: log_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -517,6 +596,10 @@ define float @log10_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail log10f ; +; RV64IFD-LABEL: log10_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail log10f +; ; RV32I-LABEL: log10_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -557,6 +640,10 @@ define float @log2_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail log2f ; +; RV64IFD-LABEL: log2_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail log2f +; ; RV32I-LABEL: log2_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -601,6 +688,11 @@ define float @fma_f32(float %a, float %b, float %c) nounwind { ; RV64IZFINX-NEXT: fmadd.s a0, a0, a1, a2 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: 
fma_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fmadd.s fa0, fa0, fa1, fa2 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: fma_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -645,6 +737,11 @@ define float @fmuladd_f32(float %a, float %b, float %c) nounwind { ; RV64IZFINX-NEXT: fmadd.s a0, a0, a1, a2 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: fmuladd_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fmadd.s fa0, fa0, fa1, fa2 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: fmuladd_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -699,6 +796,11 @@ define float @fabs_f32(float %a) nounwind { ; RV64IZFINX-NEXT: fabs.s a0, a0 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: fabs_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fabs.s fa0, fa0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: fabs_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 1 @@ -737,6 +839,11 @@ define float @minnum_f32(float %a, float %b) nounwind { ; RV64IZFINX-NEXT: fmin.s a0, a0, a1 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: minnum_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fmin.s fa0, fa0, fa1 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: minnum_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -781,6 +888,11 @@ define float @maxnum_f32(float %a, float %b) nounwind { ; RV64IZFINX-NEXT: fmax.s a0, a0, a1 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: maxnum_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fmax.s fa0, fa0, fa1 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: maxnum_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -825,6 +937,11 @@ define float @copysign_f32(float %a, float %b) nounwind { ; RV64IZFINX-NEXT: fsgnj.s a0, a0, a1 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: copysign_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fsgnj.s fa0, fa0, fa1 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: copysign_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a2, 524288 @@ -903,6 +1020,20 @@ define float @floor_f32(float %a) nounwind { ; RV64IZFINX-NEXT: .LBB18_2: ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: floor_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: lui a0, 307200 +; RV64IFD-NEXT: fmv.w.x fa5, a0 +; RV64IFD-NEXT: fabs.s fa4, fa0 +; RV64IFD-NEXT: flt.s a0, fa4, fa5 +; RV64IFD-NEXT: beqz a0, .LBB18_2 +; RV64IFD-NEXT: # %bb.1: +; RV64IFD-NEXT: fcvt.w.s a0, fa0, rdn +; RV64IFD-NEXT: fcvt.s.w fa5, a0, rdn +; RV64IFD-NEXT: fsgnj.s fa0, fa5, fa0 +; RV64IFD-NEXT: .LBB18_2: +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: floor_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -981,6 +1112,20 @@ define float @ceil_f32(float %a) nounwind { ; RV64IZFINX-NEXT: .LBB19_2: ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: ceil_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: lui a0, 307200 +; RV64IFD-NEXT: fmv.w.x fa5, a0 +; RV64IFD-NEXT: fabs.s fa4, fa0 +; RV64IFD-NEXT: flt.s a0, fa4, fa5 +; RV64IFD-NEXT: beqz a0, .LBB19_2 +; RV64IFD-NEXT: # %bb.1: +; RV64IFD-NEXT: fcvt.w.s a0, fa0, rup +; RV64IFD-NEXT: fcvt.s.w fa5, a0, rup +; RV64IFD-NEXT: fsgnj.s fa0, fa5, fa0 +; RV64IFD-NEXT: .LBB19_2: +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: ceil_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1059,6 +1204,20 @@ define float @trunc_f32(float %a) nounwind { ; RV64IZFINX-NEXT: .LBB20_2: ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: trunc_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: lui a0, 307200 +; RV64IFD-NEXT: fmv.w.x fa5, a0 +; RV64IFD-NEXT: fabs.s fa4, fa0 +; RV64IFD-NEXT: flt.s a0, fa4, fa5 +; RV64IFD-NEXT: beqz a0, .LBB20_2 +; RV64IFD-NEXT: # %bb.1: +; RV64IFD-NEXT: fcvt.w.s a0, fa0, rtz +; RV64IFD-NEXT: fcvt.s.w fa5, a0, rtz +; RV64IFD-NEXT: fsgnj.s fa0, fa5, fa0 +; RV64IFD-NEXT: .LBB20_2: +; 
RV64IFD-NEXT: ret +; ; RV32I-LABEL: trunc_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1137,6 +1296,20 @@ define float @rint_f32(float %a) nounwind { ; RV64IZFINX-NEXT: .LBB21_2: ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: rint_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: lui a0, 307200 +; RV64IFD-NEXT: fmv.w.x fa5, a0 +; RV64IFD-NEXT: fabs.s fa4, fa0 +; RV64IFD-NEXT: flt.s a0, fa4, fa5 +; RV64IFD-NEXT: beqz a0, .LBB21_2 +; RV64IFD-NEXT: # %bb.1: +; RV64IFD-NEXT: fcvt.w.s a0, fa0 +; RV64IFD-NEXT: fcvt.s.w fa5, a0 +; RV64IFD-NEXT: fsgnj.s fa0, fa5, fa0 +; RV64IFD-NEXT: .LBB21_2: +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: rint_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1177,6 +1350,10 @@ define float @nearbyint_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail nearbyintf ; +; RV64IFD-LABEL: nearbyint_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail nearbyintf +; ; RV32I-LABEL: nearbyint_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1255,6 +1432,20 @@ define float @round_f32(float %a) nounwind { ; RV64IZFINX-NEXT: .LBB23_2: ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: round_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: lui a0, 307200 +; RV64IFD-NEXT: fmv.w.x fa5, a0 +; RV64IFD-NEXT: fabs.s fa4, fa0 +; RV64IFD-NEXT: flt.s a0, fa4, fa5 +; RV64IFD-NEXT: beqz a0, .LBB23_2 +; RV64IFD-NEXT: # %bb.1: +; RV64IFD-NEXT: fcvt.w.s a0, fa0, rmm +; RV64IFD-NEXT: fcvt.s.w fa5, a0, rmm +; RV64IFD-NEXT: fsgnj.s fa0, fa5, fa0 +; RV64IFD-NEXT: .LBB23_2: +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: round_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1333,6 +1524,20 @@ define float @roundeven_f32(float %a) nounwind { ; RV64IZFINX-NEXT: .LBB24_2: ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: roundeven_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: lui a0, 307200 +; RV64IFD-NEXT: fmv.w.x fa5, a0 +; RV64IFD-NEXT: fabs.s fa4, fa0 +; RV64IFD-NEXT: flt.s a0, fa4, fa5 +; RV64IFD-NEXT: beqz a0, .LBB24_2 +; RV64IFD-NEXT: # %bb.1: +; RV64IFD-NEXT: fcvt.w.s a0, fa0, rne +; RV64IFD-NEXT: fcvt.s.w fa5, a0, rne +; RV64IFD-NEXT: fsgnj.s fa0, fa5, fa0 +; RV64IFD-NEXT: .LBB24_2: +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: roundeven_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1377,6 +1582,11 @@ define iXLen @lrint_f32(float %a) nounwind { ; RV64IZFINX-NEXT: fcvt.l.s a0, a0 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: lrint_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fcvt.l.s a0, fa0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: lrint_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1422,6 +1632,11 @@ define iXLen @lround_f32(float %a) nounwind { ; RV64IZFINX-NEXT: fcvt.l.s a0, a0, rmm ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: lround_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fcvt.l.s a0, fa0, rmm +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: lround_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1466,6 +1681,11 @@ define i32 @lround_i32_f32(float %a) nounwind { ; RV64IZFINX-NEXT: fcvt.w.s a0, a0, rmm ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: lround_i32_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fcvt.w.s a0, fa0, rmm +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: lround_i32_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1518,6 +1738,11 @@ define i64 @llrint_f32(float %a) nounwind { ; RV64IZFINX-NEXT: fcvt.l.s a0, a0 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: llrint_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fcvt.l.s a0, fa0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: llrint_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1570,6 +1795,11 @@ define i64 
@llround_f32(float %a) nounwind { ; RV64IZFINX-NEXT: fcvt.l.s a0, a0, rmm ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: llround_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fcvt.l.s a0, fa0, rmm +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: llround_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -1621,6 +1851,13 @@ define i1 @fpclass(float %x) { ; RV64IZFINX-NEXT: snez a0, a0 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 927 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 1 @@ -1705,6 +1942,13 @@ define i1 @isnan_fpclass(float %x) { ; RV64IZFINX-NEXT: snez a0, a0 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: isnan_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 768 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isnan_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 1 @@ -1749,6 +1993,12 @@ define i1 @isqnan_fpclass(float %x) { ; RV64IZFINX-NEXT: srli a0, a0, 9 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: isqnan_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: srli a0, a0, 9 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isqnan_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 1 @@ -1799,6 +2049,13 @@ define i1 @issnan_fpclass(float %x) { ; RV64IZFINX-NEXT: srli a0, a0, 63 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: issnan_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: slli a0, a0, 55 +; RV64IFD-NEXT: srli a0, a0, 63 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: issnan_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 1 @@ -1853,6 +2110,13 @@ define i1 @isinf_fpclass(float %x) { ; RV64IZFINX-NEXT: snez a0, a0 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: isinf_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 129 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isinf_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 1 @@ -1903,6 +2167,13 @@ define i1 @isposinf_fpclass(float %x) { ; RV64IZFINX-NEXT: srli a0, a0, 63 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: isposinf_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: slli a0, a0, 56 +; RV64IFD-NEXT: srli a0, a0, 63 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isposinf_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a1, 522240 @@ -1946,6 +2217,12 @@ define i1 @isneginf_fpclass(float %x) { ; RV64IZFINX-NEXT: andi a0, a0, 1 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: isneginf_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 1 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isneginf_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a1, 1046528 @@ -1993,6 +2270,13 @@ define i1 @isfinite_fpclass(float %x) { ; RV64IZFINX-NEXT: snez a0, a0 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: isfinite_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 126 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isfinite_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 1 @@ -2041,6 +2325,13 @@ define i1 @isposfinite_fpclass(float %x) { ; RV64IZFINX-NEXT: snez a0, a0 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: isposfinite_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 112 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isposfinite_fpclass: ; 
RV32I: # %bb.0: ; RV32I-NEXT: srli a0, a0, 23 @@ -2085,6 +2376,13 @@ define i1 @isnegfinite_fpclass(float %x) { ; RV64IZFINX-NEXT: snez a0, a0 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: isnegfinite_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 14 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isnegfinite_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 1 @@ -2137,6 +2435,13 @@ define i1 @isnotfinite_fpclass(float %x) { ; RV64IZFINX-NEXT: snez a0, a0 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: isnotfinite_fpclass: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fclass.s a0, fa0 +; RV64IFD-NEXT: andi a0, a0, 897 +; RV64IFD-NEXT: snez a0, a0 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: isnotfinite_fpclass: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 1 @@ -2175,6 +2480,10 @@ define float @tan_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail tanf ; +; RV64IFD-LABEL: tan_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail tanf +; ; RV32I-LABEL: tan_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -2219,6 +2528,11 @@ define float @maximumnum_float(float %x, float %y) { ; RV64IZFINX-NEXT: fmax.s a0, a0, a1 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: maximumnum_float: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fmax.s fa0, fa0, fa1 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: maximumnum_float: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -2271,6 +2585,11 @@ define float @minimumnum_float(float %x, float %y) { ; RV64IZFINX-NEXT: fmin.s a0, a0, a1 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: minimumnum_float: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fmin.s fa0, fa0, fa1 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: minimumnum_float: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -2327,6 +2646,15 @@ define float @ldexp_float(float %x, i32 signext %y) nounwind { ; RV64IZFINX-NEXT: addi sp, sp, 16 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: ldexp_float: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: call ldexpf +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: ldexp_float: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -2393,6 +2721,17 @@ define {float, i32} @frexp_float(float %x) nounwind { ; RV64IZFINX-NEXT: addi sp, sp, 16 ; RV64IZFINX-NEXT: ret ; +; RV64IFD-LABEL: frexp_float: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: addi sp, sp, -16 +; RV64IFD-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64IFD-NEXT: mv a0, sp +; RV64IFD-NEXT: call frexpf +; RV64IFD-NEXT: ld a0, 0(sp) +; RV64IFD-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64IFD-NEXT: addi sp, sp, 16 +; RV64IFD-NEXT: ret +; ; RV32I-LABEL: frexp_float: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -2435,6 +2774,10 @@ define float @asin_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail asinf ; +; RV64IFD-LABEL: asin_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail asinf +; ; RV32I-LABEL: asin_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -2473,6 +2816,10 @@ define float @acos_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail acosf ; +; RV64IFD-LABEL: acos_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail acosf +; ; RV32I-LABEL: acos_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -2511,6 +2858,10 @@ define float @atan_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail atanf ; +; RV64IFD-LABEL: atan_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: 
tail atanf +; ; RV32I-LABEL: atan_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -2549,6 +2900,10 @@ define float @atan2_f32(float %a, float %b) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail atan2f ; +; RV64IFD-LABEL: atan2_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail atan2f +; ; RV32I-LABEL: atan2_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -2587,6 +2942,10 @@ define float @sinh_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail sinhf ; +; RV64IFD-LABEL: sinh_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail sinhf +; ; RV32I-LABEL: sinh_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -2625,6 +2984,10 @@ define float @cosh_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail coshf ; +; RV64IFD-LABEL: cosh_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail coshf +; ; RV32I-LABEL: cosh_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 @@ -2663,6 +3026,10 @@ define float @tanh_f32(float %a) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: tail tanhf ; +; RV64IFD-LABEL: tanh_f32: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: tail tanhf +; ; RV32I-LABEL: tanh_f32: ; RV32I: # %bb.0: ; RV32I-NEXT: addi sp, sp, -16 diff --git a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll index 809cc31abe612..6871f29cb8b05 100644 --- a/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/float-round-conv-sat.ll @@ -60,8 +60,9 @@ define i64 @test_floor_si64(float %x) nounwind { ; RV32IF-NEXT: # %bb.3: ; RV32IF-NEXT: mv a2, a1 ; RV32IF-NEXT: .LBB1_4: -; RV32IF-NEXT: lui a1, %hi(.LCPI1_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI1_0)(a1) +; RV32IF-NEXT: lui a1, 389120 +; RV32IF-NEXT: addi a1, a1, -1 +; RV32IF-NEXT: fmv.w.x fa5, a1 ; RV32IF-NEXT: flt.s a1, fa5, fs0 ; RV32IF-NEXT: beqz a1, .LBB1_6 ; RV32IF-NEXT: # %bb.5: @@ -196,10 +197,11 @@ define i64 @test_floor_ui64(float %x) nounwind { ; RV32IF-NEXT: neg s0, a0 ; RV32IF-NEXT: fmv.s fa0, fs0 ; RV32IF-NEXT: call __fixunssfdi -; RV32IF-NEXT: lui a2, %hi(.LCPI3_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI3_0)(a2) ; RV32IF-NEXT: and a0, s0, a0 +; RV32IF-NEXT: lui a2, 391168 ; RV32IF-NEXT: and a1, s0, a1 +; RV32IF-NEXT: addi a2, a2, -1 +; RV32IF-NEXT: fmv.w.x fa5, a2 ; RV32IF-NEXT: flt.s a2, fa5, fs0 ; RV32IF-NEXT: neg a2, a2 ; RV32IF-NEXT: or a0, a2, a0 @@ -318,8 +320,9 @@ define i64 @test_ceil_si64(float %x) nounwind { ; RV32IF-NEXT: # %bb.3: ; RV32IF-NEXT: mv a2, a1 ; RV32IF-NEXT: .LBB5_4: -; RV32IF-NEXT: lui a1, %hi(.LCPI5_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI5_0)(a1) +; RV32IF-NEXT: lui a1, 389120 +; RV32IF-NEXT: addi a1, a1, -1 +; RV32IF-NEXT: fmv.w.x fa5, a1 ; RV32IF-NEXT: flt.s a1, fa5, fs0 ; RV32IF-NEXT: beqz a1, .LBB5_6 ; RV32IF-NEXT: # %bb.5: @@ -454,10 +457,11 @@ define i64 @test_ceil_ui64(float %x) nounwind { ; RV32IF-NEXT: neg s0, a0 ; RV32IF-NEXT: fmv.s fa0, fs0 ; RV32IF-NEXT: call __fixunssfdi -; RV32IF-NEXT: lui a2, %hi(.LCPI7_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI7_0)(a2) ; RV32IF-NEXT: and a0, s0, a0 +; RV32IF-NEXT: lui a2, 391168 ; RV32IF-NEXT: and a1, s0, a1 +; RV32IF-NEXT: addi a2, a2, -1 +; RV32IF-NEXT: fmv.w.x fa5, a2 ; RV32IF-NEXT: flt.s a2, fa5, fs0 ; RV32IF-NEXT: neg a2, a2 ; RV32IF-NEXT: or a0, a2, a0 @@ -576,8 +580,9 @@ define i64 @test_trunc_si64(float %x) nounwind { ; RV32IF-NEXT: # %bb.3: ; RV32IF-NEXT: mv a2, a1 ; RV32IF-NEXT: .LBB9_4: -; RV32IF-NEXT: lui a1, %hi(.LCPI9_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI9_0)(a1) +; RV32IF-NEXT: lui a1, 389120 +; RV32IF-NEXT: addi a1, a1, -1 +; RV32IF-NEXT: 
fmv.w.x fa5, a1 ; RV32IF-NEXT: flt.s a1, fa5, fs0 ; RV32IF-NEXT: beqz a1, .LBB9_6 ; RV32IF-NEXT: # %bb.5: @@ -712,10 +717,11 @@ define i64 @test_trunc_ui64(float %x) nounwind { ; RV32IF-NEXT: neg s0, a0 ; RV32IF-NEXT: fmv.s fa0, fs0 ; RV32IF-NEXT: call __fixunssfdi -; RV32IF-NEXT: lui a2, %hi(.LCPI11_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI11_0)(a2) ; RV32IF-NEXT: and a0, s0, a0 +; RV32IF-NEXT: lui a2, 391168 ; RV32IF-NEXT: and a1, s0, a1 +; RV32IF-NEXT: addi a2, a2, -1 +; RV32IF-NEXT: fmv.w.x fa5, a2 ; RV32IF-NEXT: flt.s a2, fa5, fs0 ; RV32IF-NEXT: neg a2, a2 ; RV32IF-NEXT: or a0, a2, a0 @@ -834,8 +840,9 @@ define i64 @test_round_si64(float %x) nounwind { ; RV32IF-NEXT: # %bb.3: ; RV32IF-NEXT: mv a2, a1 ; RV32IF-NEXT: .LBB13_4: -; RV32IF-NEXT: lui a1, %hi(.LCPI13_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI13_0)(a1) +; RV32IF-NEXT: lui a1, 389120 +; RV32IF-NEXT: addi a1, a1, -1 +; RV32IF-NEXT: fmv.w.x fa5, a1 ; RV32IF-NEXT: flt.s a1, fa5, fs0 ; RV32IF-NEXT: beqz a1, .LBB13_6 ; RV32IF-NEXT: # %bb.5: @@ -970,10 +977,11 @@ define i64 @test_round_ui64(float %x) nounwind { ; RV32IF-NEXT: neg s0, a0 ; RV32IF-NEXT: fmv.s fa0, fs0 ; RV32IF-NEXT: call __fixunssfdi -; RV32IF-NEXT: lui a2, %hi(.LCPI15_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI15_0)(a2) ; RV32IF-NEXT: and a0, s0, a0 +; RV32IF-NEXT: lui a2, 391168 ; RV32IF-NEXT: and a1, s0, a1 +; RV32IF-NEXT: addi a2, a2, -1 +; RV32IF-NEXT: fmv.w.x fa5, a2 ; RV32IF-NEXT: flt.s a2, fa5, fs0 ; RV32IF-NEXT: neg a2, a2 ; RV32IF-NEXT: or a0, a2, a0 @@ -1092,8 +1100,9 @@ define i64 @test_roundeven_si64(float %x) nounwind { ; RV32IF-NEXT: # %bb.3: ; RV32IF-NEXT: mv a2, a1 ; RV32IF-NEXT: .LBB17_4: -; RV32IF-NEXT: lui a1, %hi(.LCPI17_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI17_0)(a1) +; RV32IF-NEXT: lui a1, 389120 +; RV32IF-NEXT: addi a1, a1, -1 +; RV32IF-NEXT: fmv.w.x fa5, a1 ; RV32IF-NEXT: flt.s a1, fa5, fs0 ; RV32IF-NEXT: beqz a1, .LBB17_6 ; RV32IF-NEXT: # %bb.5: @@ -1228,10 +1237,11 @@ define i64 @test_roundeven_ui64(float %x) nounwind { ; RV32IF-NEXT: neg s0, a0 ; RV32IF-NEXT: fmv.s fa0, fs0 ; RV32IF-NEXT: call __fixunssfdi -; RV32IF-NEXT: lui a2, %hi(.LCPI19_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI19_0)(a2) ; RV32IF-NEXT: and a0, s0, a0 +; RV32IF-NEXT: lui a2, 391168 ; RV32IF-NEXT: and a1, s0, a1 +; RV32IF-NEXT: addi a2, a2, -1 +; RV32IF-NEXT: fmv.w.x fa5, a2 ; RV32IF-NEXT: flt.s a2, fa5, fs0 ; RV32IF-NEXT: neg a2, a2 ; RV32IF-NEXT: or a0, a2, a0 @@ -1350,8 +1360,9 @@ define i64 @test_rint_si64(float %x) nounwind { ; RV32IF-NEXT: # %bb.3: ; RV32IF-NEXT: mv a2, a1 ; RV32IF-NEXT: .LBB21_4: -; RV32IF-NEXT: lui a1, %hi(.LCPI21_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI21_0)(a1) +; RV32IF-NEXT: lui a1, 389120 +; RV32IF-NEXT: addi a1, a1, -1 +; RV32IF-NEXT: fmv.w.x fa5, a1 ; RV32IF-NEXT: flt.s a1, fa5, fs0 ; RV32IF-NEXT: beqz a1, .LBB21_6 ; RV32IF-NEXT: # %bb.5: @@ -1486,10 +1497,11 @@ define i64 @test_rint_ui64(float %x) nounwind { ; RV32IF-NEXT: neg s0, a0 ; RV32IF-NEXT: fmv.s fa0, fs0 ; RV32IF-NEXT: call __fixunssfdi -; RV32IF-NEXT: lui a2, %hi(.LCPI23_0) -; RV32IF-NEXT: flw fa5, %lo(.LCPI23_0)(a2) ; RV32IF-NEXT: and a0, s0, a0 +; RV32IF-NEXT: lui a2, 391168 ; RV32IF-NEXT: and a1, s0, a1 +; RV32IF-NEXT: addi a2, a2, -1 +; RV32IF-NEXT: fmv.w.x fa5, a2 ; RV32IF-NEXT: flt.s a2, fa5, fs0 ; RV32IF-NEXT: neg a2, a2 ; RV32IF-NEXT: or a0, a2, a0 diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll index 84163b52bb98d..2ebb6e9b97a4d 100644 --- a/llvm/test/CodeGen/RISCV/half-arith.ll +++ b/llvm/test/CodeGen/RISCV/half-arith.ll @@ -2883,39 +2883,20 @@ 
define half @fsgnjx_f16(half %x, half %y) nounwind { ; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; -; RV32IZFHMIN-LABEL: fsgnjx_f16: -; RV32IZFHMIN: # %bb.0: -; RV32IZFHMIN-NEXT: lui a0, %hi(.LCPI23_0) -; RV32IZFHMIN-NEXT: lhu a0, %lo(.LCPI23_0)(a0) -; RV32IZFHMIN-NEXT: fmv.x.h a1, fa0 -; RV32IZFHMIN-NEXT: lui a2, 1048568 -; RV32IZFHMIN-NEXT: and a1, a1, a2 -; RV32IZFHMIN-NEXT: slli a0, a0, 17 -; RV32IZFHMIN-NEXT: srli a0, a0, 17 -; RV32IZFHMIN-NEXT: or a0, a0, a1 -; RV32IZFHMIN-NEXT: fmv.h.x fa5, a0 -; RV32IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV32IZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; RV32IZFHMIN-NEXT: fmul.s fa5, fa5, fa4 -; RV32IZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; RV32IZFHMIN-NEXT: ret -; -; RV64IZFHMIN-LABEL: fsgnjx_f16: -; RV64IZFHMIN: # %bb.0: -; RV64IZFHMIN-NEXT: lui a0, %hi(.LCPI23_0) -; RV64IZFHMIN-NEXT: lhu a0, %lo(.LCPI23_0)(a0) -; RV64IZFHMIN-NEXT: fmv.x.h a1, fa0 -; RV64IZFHMIN-NEXT: lui a2, 1048568 -; RV64IZFHMIN-NEXT: and a1, a1, a2 -; RV64IZFHMIN-NEXT: slli a0, a0, 49 -; RV64IZFHMIN-NEXT: srli a0, a0, 49 -; RV64IZFHMIN-NEXT: or a0, a0, a1 -; RV64IZFHMIN-NEXT: fmv.h.x fa5, a0 -; RV64IZFHMIN-NEXT: fcvt.s.h fa5, fa5 -; RV64IZFHMIN-NEXT: fcvt.s.h fa4, fa1 -; RV64IZFHMIN-NEXT: fmul.s fa5, fa5, fa4 -; RV64IZFHMIN-NEXT: fcvt.h.s fa0, fa5 -; RV64IZFHMIN-NEXT: ret +; CHECKIZFHMIN-LABEL: fsgnjx_f16: +; CHECKIZFHMIN: # %bb.0: +; CHECKIZFHMIN-NEXT: fmv.x.h a0, fa0 +; CHECKIZFHMIN-NEXT: lui a1, 1048568 +; CHECKIZFHMIN-NEXT: and a0, a0, a1 +; CHECKIZFHMIN-NEXT: li a1, 15 +; CHECKIZFHMIN-NEXT: slli a1, a1, 10 +; CHECKIZFHMIN-NEXT: or a0, a0, a1 +; CHECKIZFHMIN-NEXT: fmv.h.x fa5, a0 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa5 +; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa1 +; CHECKIZFHMIN-NEXT: fmul.s fa5, fa5, fa4 +; CHECKIZFHMIN-NEXT: fcvt.h.s fa0, fa5 +; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: fsgnjx_f16: ; CHECKIZHINXMIN: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll index 6cebf8b2828bf..c3c06e192f76f 100644 --- a/llvm/test/CodeGen/RISCV/half-convert.ll +++ b/llvm/test/CodeGen/RISCV/half-convert.ll @@ -194,13 +194,14 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32IZFH-LABEL: fcvt_si_h_sat: ; RV32IZFH: # %bb.0: # %start ; RV32IZFH-NEXT: fcvt.s.h fa5, fa0 -; RV32IZFH-NEXT: lui a0, %hi(.LCPI1_0) -; RV32IZFH-NEXT: feq.s a1, fa5, fa5 -; RV32IZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; RV32IZFH-NEXT: lui a0, 815104 -; RV32IZFH-NEXT: fmv.w.x fa3, a0 -; RV32IZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV32IZFH-NEXT: neg a0, a1 +; RV32IZFH-NEXT: lui a1, 290816 +; RV32IZFH-NEXT: fmv.w.x fa4, a0 +; RV32IZFH-NEXT: feq.s a0, fa5, fa5 +; RV32IZFH-NEXT: addi a1, a1, -512 +; RV32IZFH-NEXT: neg a0, a0 +; RV32IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IZFH-NEXT: fmv.w.x fa4, a1 ; RV32IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IZFH-NEXT: fcvt.w.s a1, fa5, rtz ; RV32IZFH-NEXT: and a0, a0, a1 @@ -209,13 +210,14 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64IZFH-LABEL: fcvt_si_h_sat: ; RV64IZFH: # %bb.0: # %start ; RV64IZFH-NEXT: fcvt.s.h fa5, fa0 -; RV64IZFH-NEXT: lui a0, %hi(.LCPI1_0) -; RV64IZFH-NEXT: feq.s a1, fa5, fa5 -; RV64IZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; RV64IZFH-NEXT: lui a0, 815104 -; RV64IZFH-NEXT: fmv.w.x fa3, a0 -; RV64IZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV64IZFH-NEXT: neg a0, a1 +; RV64IZFH-NEXT: lui a1, 290816 +; RV64IZFH-NEXT: fmv.w.x fa4, a0 +; RV64IZFH-NEXT: feq.s a0, fa5, fa5 +; RV64IZFH-NEXT: addi a1, a1, -512 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IZFH-NEXT: fmv.w.x fa4, a1 ; RV64IZFH-NEXT: fmin.s fa5, 
fa5, fa4 ; RV64IZFH-NEXT: fcvt.l.s a1, fa5, rtz ; RV64IZFH-NEXT: and a0, a0, a1 @@ -224,13 +226,14 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32IDZFH-LABEL: fcvt_si_h_sat: ; RV32IDZFH: # %bb.0: # %start ; RV32IDZFH-NEXT: fcvt.s.h fa5, fa0 -; RV32IDZFH-NEXT: lui a0, %hi(.LCPI1_0) -; RV32IDZFH-NEXT: feq.s a1, fa5, fa5 -; RV32IDZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; RV32IDZFH-NEXT: lui a0, 815104 -; RV32IDZFH-NEXT: fmv.w.x fa3, a0 -; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV32IDZFH-NEXT: neg a0, a1 +; RV32IDZFH-NEXT: lui a1, 290816 +; RV32IDZFH-NEXT: fmv.w.x fa4, a0 +; RV32IDZFH-NEXT: feq.s a0, fa5, fa5 +; RV32IDZFH-NEXT: addi a1, a1, -512 +; RV32IDZFH-NEXT: neg a0, a0 +; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IDZFH-NEXT: fmv.w.x fa4, a1 ; RV32IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IDZFH-NEXT: fcvt.w.s a1, fa5, rtz ; RV32IDZFH-NEXT: and a0, a0, a1 @@ -239,13 +242,14 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64IDZFH-LABEL: fcvt_si_h_sat: ; RV64IDZFH: # %bb.0: # %start ; RV64IDZFH-NEXT: fcvt.s.h fa5, fa0 -; RV64IDZFH-NEXT: lui a0, %hi(.LCPI1_0) -; RV64IDZFH-NEXT: feq.s a1, fa5, fa5 -; RV64IDZFH-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; RV64IDZFH-NEXT: lui a0, 815104 -; RV64IDZFH-NEXT: fmv.w.x fa3, a0 -; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV64IDZFH-NEXT: neg a0, a1 +; RV64IDZFH-NEXT: lui a1, 290816 +; RV64IDZFH-NEXT: fmv.w.x fa4, a0 +; RV64IDZFH-NEXT: feq.s a0, fa5, fa5 +; RV64IDZFH-NEXT: addi a1, a1, -512 +; RV64IDZFH-NEXT: neg a0, a0 +; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IDZFH-NEXT: fmv.w.x fa4, a1 ; RV64IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IDZFH-NEXT: fcvt.l.s a1, fa5, rtz ; RV64IDZFH-NEXT: and a0, a0, a1 @@ -399,13 +403,14 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32ID-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: call __extendhfsf2 ; RV32ID-ILP32-NEXT: fmv.w.x fa5, a0 -; RV32ID-ILP32-NEXT: lui a0, %hi(.LCPI1_0) -; RV32ID-ILP32-NEXT: feq.s a1, fa5, fa5 -; RV32ID-ILP32-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; RV32ID-ILP32-NEXT: lui a0, 815104 -; RV32ID-ILP32-NEXT: fmv.w.x fa3, a0 -; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa3 -; RV32ID-ILP32-NEXT: neg a0, a1 +; RV32ID-ILP32-NEXT: lui a1, 290816 +; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 +; RV32ID-ILP32-NEXT: feq.s a0, fa5, fa5 +; RV32ID-ILP32-NEXT: addi a1, a1, -512 +; RV32ID-ILP32-NEXT: neg a0, a0 +; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa4 +; RV32ID-ILP32-NEXT: fmv.w.x fa4, a1 ; RV32ID-ILP32-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-ILP32-NEXT: fcvt.w.s a1, fa5, rtz ; RV32ID-ILP32-NEXT: and a0, a0, a1 @@ -419,13 +424,14 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-LP64-NEXT: call __extendhfsf2 ; RV64ID-LP64-NEXT: fmv.w.x fa5, a0 -; RV64ID-LP64-NEXT: lui a0, %hi(.LCPI1_0) -; RV64ID-LP64-NEXT: feq.s a1, fa5, fa5 -; RV64ID-LP64-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; RV64ID-LP64-NEXT: lui a0, 815104 -; RV64ID-LP64-NEXT: fmv.w.x fa3, a0 -; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa3 -; RV64ID-LP64-NEXT: neg a0, a1 +; RV64ID-LP64-NEXT: lui a1, 290816 +; RV64ID-LP64-NEXT: fmv.w.x fa4, a0 +; RV64ID-LP64-NEXT: feq.s a0, fa5, fa5 +; RV64ID-LP64-NEXT: addi a1, a1, -512 +; RV64ID-LP64-NEXT: neg a0, a0 +; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa4 +; RV64ID-LP64-NEXT: fmv.w.x fa4, a1 ; RV64ID-LP64-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-LP64-NEXT: fcvt.l.s a1, fa5, rtz ; RV64ID-LP64-NEXT: and a0, a0, a1 @@ -439,13 +445,14 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-NEXT: call __extendhfsf2 ; 
RV32ID-NEXT: feq.s a0, fa0, fa0 -; RV32ID-NEXT: lui a1, %hi(.LCPI1_0) -; RV32ID-NEXT: flw fa5, %lo(.LCPI1_0)(a1) ; RV32ID-NEXT: lui a1, 815104 -; RV32ID-NEXT: fmv.w.x fa4, a1 -; RV32ID-NEXT: fmax.s fa4, fa0, fa4 +; RV32ID-NEXT: fmv.w.x fa5, a1 +; RV32ID-NEXT: lui a1, 290816 ; RV32ID-NEXT: neg a0, a0 -; RV32ID-NEXT: fmin.s fa5, fa4, fa5 +; RV32ID-NEXT: addi a1, a1, -512 +; RV32ID-NEXT: fmax.s fa5, fa0, fa5 +; RV32ID-NEXT: fmv.w.x fa4, a1 +; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.w.s a1, fa5, rtz ; RV32ID-NEXT: and a0, a0, a1 ; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -458,13 +465,14 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-NEXT: call __extendhfsf2 ; RV64ID-NEXT: feq.s a0, fa0, fa0 -; RV64ID-NEXT: lui a1, %hi(.LCPI1_0) -; RV64ID-NEXT: flw fa5, %lo(.LCPI1_0)(a1) ; RV64ID-NEXT: lui a1, 815104 -; RV64ID-NEXT: fmv.w.x fa4, a1 -; RV64ID-NEXT: fmax.s fa4, fa0, fa4 +; RV64ID-NEXT: fmv.w.x fa5, a1 +; RV64ID-NEXT: lui a1, 290816 ; RV64ID-NEXT: neg a0, a0 -; RV64ID-NEXT: fmin.s fa5, fa4, fa5 +; RV64ID-NEXT: addi a1, a1, -512 +; RV64ID-NEXT: fmax.s fa5, fa0, fa5 +; RV64ID-NEXT: fmv.w.x fa4, a1 +; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.l.s a1, fa5, rtz ; RV64ID-NEXT: and a0, a0, a1 ; RV64ID-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -474,13 +482,14 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; CHECK32-IZFHMIN-LABEL: fcvt_si_h_sat: ; CHECK32-IZFHMIN: # %bb.0: # %start ; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK32-IZFHMIN-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK32-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK32-IZFHMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; CHECK32-IZFHMIN-NEXT: lui a0, 815104 -; CHECK32-IZFHMIN-NEXT: fmv.w.x fa3, a0 -; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa3 -; CHECK32-IZFHMIN-NEXT: neg a0, a1 +; CHECK32-IZFHMIN-NEXT: lui a1, 290816 +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, a0 +; CHECK32-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK32-IZFHMIN-NEXT: addi a1, a1, -512 +; CHECK32-IZFHMIN-NEXT: neg a0, a0 +; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, a1 ; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32-IZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz ; CHECK32-IZFHMIN-NEXT: and a0, a0, a1 @@ -489,13 +498,14 @@ define i16 @fcvt_si_h_sat(half %a) nounwind { ; CHECK64-IZFHMIN-LABEL: fcvt_si_h_sat: ; CHECK64-IZFHMIN: # %bb.0: # %start ; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK64-IZFHMIN-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK64-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK64-IZFHMIN-NEXT: flw fa4, %lo(.LCPI1_0)(a0) ; CHECK64-IZFHMIN-NEXT: lui a0, 815104 -; CHECK64-IZFHMIN-NEXT: fmv.w.x fa3, a0 -; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa3 -; CHECK64-IZFHMIN-NEXT: neg a0, a1 +; CHECK64-IZFHMIN-NEXT: lui a1, 290816 +; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, a0 +; CHECK64-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK64-IZFHMIN-NEXT: addi a1, a1, -512 +; CHECK64-IZFHMIN-NEXT: neg a0, a0 +; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, a1 ; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64-IZFHMIN-NEXT: fcvt.l.s a1, fa5, rtz ; CHECK64-IZFHMIN-NEXT: and a0, a0, a1 @@ -711,45 +721,49 @@ define i16 @fcvt_ui_h(half %a) nounwind { define i16 @fcvt_ui_h_sat(half %a) nounwind { ; RV32IZFH-LABEL: fcvt_ui_h_sat: ; RV32IZFH: # %bb.0: # %start -; RV32IZFH-NEXT: lui a0, %hi(.LCPI3_0) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; RV32IZFH-NEXT: fcvt.s.h fa4, fa0 -; RV32IZFH-NEXT: fmv.w.x fa3, zero -; RV32IZFH-NEXT: fmax.s fa4, fa4, fa3 -; 
RV32IZFH-NEXT: fmin.s fa5, fa4, fa5 +; RV32IZFH-NEXT: fcvt.s.h fa5, fa0 +; RV32IZFH-NEXT: fmv.w.x fa4, zero +; RV32IZFH-NEXT: lui a0, 292864 +; RV32IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IZFH-NEXT: addi a0, a0, -256 +; RV32IZFH-NEXT: fmv.w.x fa4, a0 +; RV32IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IZFH-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32IZFH-NEXT: ret ; ; RV64IZFH-LABEL: fcvt_ui_h_sat: ; RV64IZFH: # %bb.0: # %start -; RV64IZFH-NEXT: lui a0, %hi(.LCPI3_0) -; RV64IZFH-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; RV64IZFH-NEXT: fcvt.s.h fa4, fa0 -; RV64IZFH-NEXT: fmv.w.x fa3, zero -; RV64IZFH-NEXT: fmax.s fa4, fa4, fa3 -; RV64IZFH-NEXT: fmin.s fa5, fa4, fa5 +; RV64IZFH-NEXT: fcvt.s.h fa5, fa0 +; RV64IZFH-NEXT: fmv.w.x fa4, zero +; RV64IZFH-NEXT: lui a0, 292864 +; RV64IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IZFH-NEXT: addi a0, a0, -256 +; RV64IZFH-NEXT: fmv.w.x fa4, a0 +; RV64IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IZFH-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64IZFH-NEXT: ret ; ; RV32IDZFH-LABEL: fcvt_ui_h_sat: ; RV32IDZFH: # %bb.0: # %start -; RV32IDZFH-NEXT: lui a0, %hi(.LCPI3_0) -; RV32IDZFH-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; RV32IDZFH-NEXT: fcvt.s.h fa4, fa0 -; RV32IDZFH-NEXT: fmv.w.x fa3, zero -; RV32IDZFH-NEXT: fmax.s fa4, fa4, fa3 -; RV32IDZFH-NEXT: fmin.s fa5, fa4, fa5 +; RV32IDZFH-NEXT: fcvt.s.h fa5, fa0 +; RV32IDZFH-NEXT: fmv.w.x fa4, zero +; RV32IDZFH-NEXT: lui a0, 292864 +; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IDZFH-NEXT: addi a0, a0, -256 +; RV32IDZFH-NEXT: fmv.w.x fa4, a0 +; RV32IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IDZFH-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32IDZFH-NEXT: ret ; ; RV64IDZFH-LABEL: fcvt_ui_h_sat: ; RV64IDZFH: # %bb.0: # %start -; RV64IDZFH-NEXT: lui a0, %hi(.LCPI3_0) -; RV64IDZFH-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; RV64IDZFH-NEXT: fcvt.s.h fa4, fa0 -; RV64IDZFH-NEXT: fmv.w.x fa3, zero -; RV64IDZFH-NEXT: fmax.s fa4, fa4, fa3 -; RV64IDZFH-NEXT: fmin.s fa5, fa4, fa5 +; RV64IDZFH-NEXT: fcvt.s.h fa5, fa0 +; RV64IDZFH-NEXT: fmv.w.x fa4, zero +; RV64IDZFH-NEXT: lui a0, 292864 +; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IDZFH-NEXT: addi a0, a0, -256 +; RV64IDZFH-NEXT: fmv.w.x fa4, a0 +; RV64IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IDZFH-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64IDZFH-NEXT: ret ; @@ -874,12 +888,13 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; RV32ID-ILP32-NEXT: addi sp, sp, -16 ; RV32ID-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: call __extendhfsf2 -; RV32ID-ILP32-NEXT: lui a1, %hi(.LCPI3_0) -; RV32ID-ILP32-NEXT: flw fa5, %lo(.LCPI3_0)(a1) +; RV32ID-ILP32-NEXT: fmv.w.x fa5, a0 +; RV32ID-ILP32-NEXT: fmv.w.x fa4, zero +; RV32ID-ILP32-NEXT: lui a0, 292864 +; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa4 +; RV32ID-ILP32-NEXT: addi a0, a0, -256 ; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 -; RV32ID-ILP32-NEXT: fmv.w.x fa3, zero -; RV32ID-ILP32-NEXT: fmax.s fa4, fa4, fa3 -; RV32ID-ILP32-NEXT: fmin.s fa5, fa4, fa5 +; RV32ID-ILP32-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-ILP32-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32ID-ILP32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-ILP32-NEXT: addi sp, sp, 16 @@ -890,12 +905,13 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; RV64ID-LP64-NEXT: addi sp, sp, -16 ; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-LP64-NEXT: call __extendhfsf2 -; RV64ID-LP64-NEXT: lui a1, %hi(.LCPI3_0) -; RV64ID-LP64-NEXT: flw fa5, %lo(.LCPI3_0)(a1) +; RV64ID-LP64-NEXT: fmv.w.x fa5, a0 +; RV64ID-LP64-NEXT: fmv.w.x fa4, zero +; RV64ID-LP64-NEXT: lui a0, 292864 +; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa4 +; RV64ID-LP64-NEXT: addi a0, a0, -256 ; 
RV64ID-LP64-NEXT: fmv.w.x fa4, a0 -; RV64ID-LP64-NEXT: fmv.w.x fa3, zero -; RV64ID-LP64-NEXT: fmax.s fa4, fa4, fa3 -; RV64ID-LP64-NEXT: fmin.s fa5, fa4, fa5 +; RV64ID-LP64-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-LP64-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64ID-LP64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64ID-LP64-NEXT: addi sp, sp, 16 @@ -906,11 +922,12 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; RV32ID-NEXT: addi sp, sp, -16 ; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-NEXT: call __extendhfsf2 -; RV32ID-NEXT: lui a0, %hi(.LCPI3_0) -; RV32ID-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; RV32ID-NEXT: fmv.w.x fa4, zero -; RV32ID-NEXT: fmax.s fa4, fa0, fa4 -; RV32ID-NEXT: fmin.s fa5, fa4, fa5 +; RV32ID-NEXT: fmv.w.x fa5, zero +; RV32ID-NEXT: lui a0, 292864 +; RV32ID-NEXT: fmax.s fa5, fa0, fa5 +; RV32ID-NEXT: addi a0, a0, -256 +; RV32ID-NEXT: fmv.w.x fa4, a0 +; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-NEXT: addi sp, sp, 16 @@ -921,11 +938,12 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; RV64ID-NEXT: addi sp, sp, -16 ; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-NEXT: call __extendhfsf2 -; RV64ID-NEXT: lui a0, %hi(.LCPI3_0) -; RV64ID-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; RV64ID-NEXT: fmv.w.x fa4, zero -; RV64ID-NEXT: fmax.s fa4, fa0, fa4 -; RV64ID-NEXT: fmin.s fa5, fa4, fa5 +; RV64ID-NEXT: fmv.w.x fa5, zero +; RV64ID-NEXT: lui a0, 292864 +; RV64ID-NEXT: fmax.s fa5, fa0, fa5 +; RV64ID-NEXT: addi a0, a0, -256 +; RV64ID-NEXT: fmv.w.x fa4, a0 +; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64ID-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64ID-NEXT: addi sp, sp, 16 @@ -933,23 +951,25 @@ define i16 @fcvt_ui_h_sat(half %a) nounwind { ; ; CHECK32-IZFHMIN-LABEL: fcvt_ui_h_sat: ; CHECK32-IZFHMIN: # %bb.0: # %start -; CHECK32-IZFHMIN-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK32-IZFHMIN-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa4, fa0 -; CHECK32-IZFHMIN-NEXT: fmv.w.x fa3, zero -; CHECK32-IZFHMIN-NEXT: fmax.s fa4, fa4, fa3 -; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa4, fa5 +; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, zero +; CHECK32-IZFHMIN-NEXT: lui a0, 292864 +; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK32-IZFHMIN-NEXT: addi a0, a0, -256 +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, a0 +; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32-IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz ; CHECK32-IZFHMIN-NEXT: ret ; ; CHECK64-IZFHMIN-LABEL: fcvt_ui_h_sat: ; CHECK64-IZFHMIN: # %bb.0: # %start -; CHECK64-IZFHMIN-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK64-IZFHMIN-NEXT: flw fa5, %lo(.LCPI3_0)(a0) -; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa4, fa0 -; CHECK64-IZFHMIN-NEXT: fmv.w.x fa3, zero -; CHECK64-IZFHMIN-NEXT: fmax.s fa4, fa4, fa3 -; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa4, fa5 +; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, zero +; CHECK64-IZFHMIN-NEXT: lui a0, 292864 +; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK64-IZFHMIN-NEXT: addi a0, a0, -256 +; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, a0 +; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64-IZFHMIN-NEXT: fcvt.lu.s a0, fa5, rtz ; CHECK64-IZFHMIN-NEXT: ret ; @@ -2159,20 +2179,21 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IZFH-NEXT: # %bb.1: # %start ; RV32IZFH-NEXT: mv a2, a1 ; RV32IZFH-NEXT: .LBB10_2: # %start -; RV32IZFH-NEXT: lui a1, %hi(.LCPI10_0) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a1) +; RV32IZFH-NEXT: lui a1, 389120 
+; RV32IZFH-NEXT: addi a1, a1, -1 +; RV32IZFH-NEXT: fmv.w.x fa5, a1 ; RV32IZFH-NEXT: flt.s a1, fa5, fs0 ; RV32IZFH-NEXT: beqz a1, .LBB10_4 ; RV32IZFH-NEXT: # %bb.3: ; RV32IZFH-NEXT: addi a2, a3, -1 ; RV32IZFH-NEXT: .LBB10_4: # %start ; RV32IZFH-NEXT: feq.s a3, fs0, fs0 -; RV32IZFH-NEXT: neg a4, a1 -; RV32IZFH-NEXT: neg a1, s0 +; RV32IZFH-NEXT: neg a4, s0 +; RV32IZFH-NEXT: neg a5, a1 ; RV32IZFH-NEXT: neg a3, a3 -; RV32IZFH-NEXT: and a0, a1, a0 +; RV32IZFH-NEXT: and a0, a4, a0 ; RV32IZFH-NEXT: and a1, a3, a2 -; RV32IZFH-NEXT: or a0, a4, a0 +; RV32IZFH-NEXT: or a0, a5, a0 ; RV32IZFH-NEXT: and a0, a3, a0 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -2207,20 +2228,21 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IDZFH-NEXT: # %bb.1: # %start ; RV32IDZFH-NEXT: mv a2, a1 ; RV32IDZFH-NEXT: .LBB10_2: # %start -; RV32IDZFH-NEXT: lui a1, %hi(.LCPI10_0) -; RV32IDZFH-NEXT: flw fa5, %lo(.LCPI10_0)(a1) +; RV32IDZFH-NEXT: lui a1, 389120 +; RV32IDZFH-NEXT: addi a1, a1, -1 +; RV32IDZFH-NEXT: fmv.w.x fa5, a1 ; RV32IDZFH-NEXT: flt.s a1, fa5, fs0 ; RV32IDZFH-NEXT: beqz a1, .LBB10_4 ; RV32IDZFH-NEXT: # %bb.3: ; RV32IDZFH-NEXT: addi a2, a3, -1 ; RV32IDZFH-NEXT: .LBB10_4: # %start ; RV32IDZFH-NEXT: feq.s a3, fs0, fs0 -; RV32IDZFH-NEXT: neg a4, a1 -; RV32IDZFH-NEXT: neg a1, s0 +; RV32IDZFH-NEXT: neg a4, s0 +; RV32IDZFH-NEXT: neg a5, a1 ; RV32IDZFH-NEXT: neg a3, a3 -; RV32IDZFH-NEXT: and a0, a1, a0 +; RV32IDZFH-NEXT: and a0, a4, a0 ; RV32IDZFH-NEXT: and a1, a3, a2 -; RV32IDZFH-NEXT: or a0, a4, a0 +; RV32IDZFH-NEXT: or a0, a5, a0 ; RV32IDZFH-NEXT: and a0, a3, a0 ; RV32IDZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IDZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -2450,8 +2472,9 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32ID-ILP32-NEXT: # %bb.1: # %start ; RV32ID-ILP32-NEXT: mv a2, a1 ; RV32ID-ILP32-NEXT: .LBB10_2: # %start -; RV32ID-ILP32-NEXT: lui a1, %hi(.LCPI10_0) -; RV32ID-ILP32-NEXT: flw fa5, %lo(.LCPI10_0)(a1) +; RV32ID-ILP32-NEXT: lui a1, 389120 +; RV32ID-ILP32-NEXT: addi a1, a1, -1 +; RV32ID-ILP32-NEXT: fmv.w.x fa5, a1 ; RV32ID-ILP32-NEXT: flw fa4, 4(sp) # 4-byte Folded Reload ; RV32ID-ILP32-NEXT: flt.s a1, fa5, fa4 ; RV32ID-ILP32-NEXT: fmv.s fa5, fa4 @@ -2505,8 +2528,9 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32ID-NEXT: # %bb.1: # %start ; RV32ID-NEXT: mv a2, a1 ; RV32ID-NEXT: .LBB10_2: # %start -; RV32ID-NEXT: lui a1, %hi(.LCPI10_0) -; RV32ID-NEXT: flw fa5, %lo(.LCPI10_0)(a1) +; RV32ID-NEXT: lui a1, 389120 +; RV32ID-NEXT: addi a1, a1, -1 +; RV32ID-NEXT: fmv.w.x fa5, a1 ; RV32ID-NEXT: flt.s a1, fa5, fs0 ; RV32ID-NEXT: beqz a1, .LBB10_4 ; RV32ID-NEXT: # %bb.3: @@ -2558,20 +2582,21 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IFZFHMIN-NEXT: # %bb.1: # %start ; RV32IFZFHMIN-NEXT: mv a2, a1 ; RV32IFZFHMIN-NEXT: .LBB10_2: # %start -; RV32IFZFHMIN-NEXT: lui a1, %hi(.LCPI10_0) -; RV32IFZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1) +; RV32IFZFHMIN-NEXT: lui a1, 389120 +; RV32IFZFHMIN-NEXT: addi a1, a1, -1 +; RV32IFZFHMIN-NEXT: fmv.w.x fa5, a1 ; RV32IFZFHMIN-NEXT: flt.s a1, fa5, fs0 ; RV32IFZFHMIN-NEXT: beqz a1, .LBB10_4 ; RV32IFZFHMIN-NEXT: # %bb.3: ; RV32IFZFHMIN-NEXT: addi a2, a3, -1 ; RV32IFZFHMIN-NEXT: .LBB10_4: # %start ; RV32IFZFHMIN-NEXT: feq.s a3, fs0, fs0 -; RV32IFZFHMIN-NEXT: neg a4, a1 -; RV32IFZFHMIN-NEXT: neg a1, s0 +; RV32IFZFHMIN-NEXT: neg a4, s0 +; RV32IFZFHMIN-NEXT: neg a5, a1 ; RV32IFZFHMIN-NEXT: neg a3, a3 -; RV32IFZFHMIN-NEXT: and a0, a1, a0 +; RV32IFZFHMIN-NEXT: and a0, a4, a0 ; 
RV32IFZFHMIN-NEXT: and a1, a3, a2 -; RV32IFZFHMIN-NEXT: or a0, a4, a0 +; RV32IFZFHMIN-NEXT: or a0, a5, a0 ; RV32IFZFHMIN-NEXT: and a0, a3, a0 ; RV32IFZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IFZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -2607,20 +2632,21 @@ define i64 @fcvt_l_h_sat(half %a) nounwind { ; RV32IDZFHMIN-NEXT: # %bb.1: # %start ; RV32IDZFHMIN-NEXT: mv a2, a1 ; RV32IDZFHMIN-NEXT: .LBB10_2: # %start -; RV32IDZFHMIN-NEXT: lui a1, %hi(.LCPI10_0) -; RV32IDZFHMIN-NEXT: flw fa5, %lo(.LCPI10_0)(a1) +; RV32IDZFHMIN-NEXT: lui a1, 389120 +; RV32IDZFHMIN-NEXT: addi a1, a1, -1 +; RV32IDZFHMIN-NEXT: fmv.w.x fa5, a1 ; RV32IDZFHMIN-NEXT: flt.s a1, fa5, fs0 ; RV32IDZFHMIN-NEXT: beqz a1, .LBB10_4 ; RV32IDZFHMIN-NEXT: # %bb.3: ; RV32IDZFHMIN-NEXT: addi a2, a3, -1 ; RV32IDZFHMIN-NEXT: .LBB10_4: # %start ; RV32IDZFHMIN-NEXT: feq.s a3, fs0, fs0 -; RV32IDZFHMIN-NEXT: neg a4, a1 -; RV32IDZFHMIN-NEXT: neg a1, s0 +; RV32IDZFHMIN-NEXT: neg a4, s0 +; RV32IDZFHMIN-NEXT: neg a5, a1 ; RV32IDZFHMIN-NEXT: neg a3, a3 -; RV32IDZFHMIN-NEXT: and a0, a1, a0 +; RV32IDZFHMIN-NEXT: and a0, a4, a0 ; RV32IDZFHMIN-NEXT: and a1, a3, a2 -; RV32IDZFHMIN-NEXT: or a0, a4, a0 +; RV32IDZFHMIN-NEXT: or a0, a5, a0 ; RV32IDZFHMIN-NEXT: and a0, a3, a0 ; RV32IDZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IDZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload @@ -2903,23 +2929,25 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32IZFH-NEXT: addi sp, sp, -16 ; RV32IZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; RV32IZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32IZFH-NEXT: lui a0, %hi(.LCPI12_0) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI12_0)(a0) -; RV32IZFH-NEXT: fcvt.s.h fa0, fa0 -; RV32IZFH-NEXT: fmv.w.x fa4, zero -; RV32IZFH-NEXT: fle.s a0, fa4, fa0 -; RV32IZFH-NEXT: flt.s a1, fa5, fa0 -; RV32IZFH-NEXT: neg s0, a1 -; RV32IZFH-NEXT: neg s1, a0 +; RV32IZFH-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill +; RV32IZFH-NEXT: fcvt.s.h fs0, fa0 +; RV32IZFH-NEXT: fmv.w.x fa5, zero +; RV32IZFH-NEXT: fle.s a0, fa5, fs0 +; RV32IZFH-NEXT: neg s0, a0 +; RV32IZFH-NEXT: fmv.s fa0, fs0 ; RV32IZFH-NEXT: call __fixunssfdi -; RV32IZFH-NEXT: and a0, s1, a0 -; RV32IZFH-NEXT: and a1, s1, a1 -; RV32IZFH-NEXT: or a0, s0, a0 -; RV32IZFH-NEXT: or a1, s0, a1 +; RV32IZFH-NEXT: and a0, s0, a0 +; RV32IZFH-NEXT: lui a2, 391168 +; RV32IZFH-NEXT: and a1, s0, a1 +; RV32IZFH-NEXT: addi a2, a2, -1 +; RV32IZFH-NEXT: fmv.w.x fa5, a2 +; RV32IZFH-NEXT: flt.s a2, fa5, fs0 +; RV32IZFH-NEXT: neg a2, a2 +; RV32IZFH-NEXT: or a0, a2, a0 +; RV32IZFH-NEXT: or a1, a2, a1 ; RV32IZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; RV32IZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32IZFH-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload ; RV32IZFH-NEXT: addi sp, sp, 16 ; RV32IZFH-NEXT: ret ; @@ -2937,23 +2965,25 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32IDZFH-NEXT: addi sp, sp, -16 ; RV32IDZFH-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32IDZFH-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; RV32IDZFH-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; RV32IDZFH-NEXT: lui a0, %hi(.LCPI12_0) -; RV32IDZFH-NEXT: flw fa5, %lo(.LCPI12_0)(a0) -; RV32IDZFH-NEXT: fcvt.s.h fa0, fa0 -; RV32IDZFH-NEXT: fmv.w.x fa4, zero -; RV32IDZFH-NEXT: fle.s a0, fa4, fa0 -; RV32IDZFH-NEXT: flt.s a1, fa5, fa0 -; RV32IDZFH-NEXT: neg s0, a1 -; RV32IDZFH-NEXT: neg s1, a0 +; RV32IDZFH-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; RV32IDZFH-NEXT: fcvt.s.h fs0, fa0 +; 
RV32IDZFH-NEXT: fmv.w.x fa5, zero +; RV32IDZFH-NEXT: fle.s a0, fa5, fs0 +; RV32IDZFH-NEXT: neg s0, a0 +; RV32IDZFH-NEXT: fmv.s fa0, fs0 ; RV32IDZFH-NEXT: call __fixunssfdi -; RV32IDZFH-NEXT: and a0, s1, a0 -; RV32IDZFH-NEXT: and a1, s1, a1 -; RV32IDZFH-NEXT: or a0, s0, a0 -; RV32IDZFH-NEXT: or a1, s0, a1 +; RV32IDZFH-NEXT: and a0, s0, a0 +; RV32IDZFH-NEXT: lui a2, 391168 +; RV32IDZFH-NEXT: and a1, s0, a1 +; RV32IDZFH-NEXT: addi a2, a2, -1 +; RV32IDZFH-NEXT: fmv.w.x fa5, a2 +; RV32IDZFH-NEXT: flt.s a2, fa5, fs0 +; RV32IDZFH-NEXT: neg a2, a2 +; RV32IDZFH-NEXT: or a0, a2, a0 +; RV32IDZFH-NEXT: or a1, a2, a1 ; RV32IDZFH-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32IDZFH-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; RV32IDZFH-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32IDZFH-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload ; RV32IDZFH-NEXT: addi sp, sp, 16 ; RV32IDZFH-NEXT: ret ; @@ -3105,14 +3135,15 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32ID-ILP32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: call __extendhfsf2 -; RV32ID-ILP32-NEXT: lui a1, %hi(.LCPI12_0) -; RV32ID-ILP32-NEXT: flw fa5, %lo(.LCPI12_0)(a1) -; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 -; RV32ID-ILP32-NEXT: fmv.w.x fa3, zero -; RV32ID-ILP32-NEXT: fle.s a1, fa3, fa4 -; RV32ID-ILP32-NEXT: flt.s a2, fa5, fa4 -; RV32ID-ILP32-NEXT: neg s0, a2 -; RV32ID-ILP32-NEXT: neg s1, a1 +; RV32ID-ILP32-NEXT: fmv.w.x fa5, a0 +; RV32ID-ILP32-NEXT: lui a1, 391168 +; RV32ID-ILP32-NEXT: fmv.w.x fa4, zero +; RV32ID-ILP32-NEXT: addi a1, a1, -1 +; RV32ID-ILP32-NEXT: fle.s a2, fa4, fa5 +; RV32ID-ILP32-NEXT: fmv.w.x fa4, a1 +; RV32ID-ILP32-NEXT: flt.s a1, fa4, fa5 +; RV32ID-ILP32-NEXT: neg s0, a1 +; RV32ID-ILP32-NEXT: neg s1, a2 ; RV32ID-ILP32-NEXT: call __fixunssfdi ; RV32ID-ILP32-NEXT: and a0, s1, a0 ; RV32ID-ILP32-NEXT: and a1, s1, a1 @@ -3144,23 +3175,25 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV32ID-NEXT: addi sp, sp, -16 ; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; RV32ID-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32ID-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill ; RV32ID-NEXT: call __extendhfsf2 -; RV32ID-NEXT: lui a0, %hi(.LCPI12_0) -; RV32ID-NEXT: flw fa5, %lo(.LCPI12_0)(a0) -; RV32ID-NEXT: fmv.w.x fa4, zero -; RV32ID-NEXT: fle.s a0, fa4, fa0 -; RV32ID-NEXT: flt.s a1, fa5, fa0 -; RV32ID-NEXT: neg s0, a1 -; RV32ID-NEXT: neg s1, a0 +; RV32ID-NEXT: fmv.s fs0, fa0 +; RV32ID-NEXT: fmv.w.x fa5, zero +; RV32ID-NEXT: fle.s a0, fa5, fa0 +; RV32ID-NEXT: neg s0, a0 ; RV32ID-NEXT: call __fixunssfdi -; RV32ID-NEXT: and a0, s1, a0 -; RV32ID-NEXT: and a1, s1, a1 -; RV32ID-NEXT: or a0, s0, a0 -; RV32ID-NEXT: or a1, s0, a1 +; RV32ID-NEXT: and a0, s0, a0 +; RV32ID-NEXT: lui a2, 391168 +; RV32ID-NEXT: and a1, s0, a1 +; RV32ID-NEXT: addi a2, a2, -1 +; RV32ID-NEXT: fmv.w.x fa5, a2 +; RV32ID-NEXT: flt.s a2, fa5, fs0 +; RV32ID-NEXT: neg a2, a2 +; RV32ID-NEXT: or a0, a2, a0 +; RV32ID-NEXT: or a1, a2, a1 ; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; RV32ID-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32ID-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload ; RV32ID-NEXT: addi sp, sp, 16 ; RV32ID-NEXT: ret ; @@ -3178,30 +3211,32 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; RV64ID-NEXT: addi sp, sp, 16 ; RV64ID-NEXT: ret ; -; CHECK32-IZFHMIN-LABEL: fcvt_lu_h_sat: -; CHECK32-IZFHMIN: # %bb.0: # %start -; CHECK32-IZFHMIN-NEXT: addi sp, sp, -16 -; 
CHECK32-IZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; CHECK32-IZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; CHECK32-IZFHMIN-NEXT: sw s1, 4(sp) # 4-byte Folded Spill -; CHECK32-IZFHMIN-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK32-IZFHMIN-NEXT: flw fa5, %lo(.LCPI12_0)(a0) -; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa0, fa0 -; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, zero -; CHECK32-IZFHMIN-NEXT: fle.s a0, fa4, fa0 -; CHECK32-IZFHMIN-NEXT: flt.s a1, fa5, fa0 -; CHECK32-IZFHMIN-NEXT: neg s0, a1 -; CHECK32-IZFHMIN-NEXT: neg s1, a0 -; CHECK32-IZFHMIN-NEXT: call __fixunssfdi -; CHECK32-IZFHMIN-NEXT: and a0, s1, a0 -; CHECK32-IZFHMIN-NEXT: and a1, s1, a1 -; CHECK32-IZFHMIN-NEXT: or a0, s0, a0 -; CHECK32-IZFHMIN-NEXT: or a1, s0, a1 -; CHECK32-IZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; CHECK32-IZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload -; CHECK32-IZFHMIN-NEXT: lw s1, 4(sp) # 4-byte Folded Reload -; CHECK32-IZFHMIN-NEXT: addi sp, sp, 16 -; CHECK32-IZFHMIN-NEXT: ret +; RV32IFZFHMIN-LABEL: fcvt_lu_h_sat: +; RV32IFZFHMIN: # %bb.0: # %start +; RV32IFZFHMIN-NEXT: addi sp, sp, -16 +; RV32IFZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IFZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32IFZFHMIN-NEXT: fsw fs0, 4(sp) # 4-byte Folded Spill +; RV32IFZFHMIN-NEXT: fcvt.s.h fs0, fa0 +; RV32IFZFHMIN-NEXT: fmv.w.x fa5, zero +; RV32IFZFHMIN-NEXT: fle.s a0, fa5, fs0 +; RV32IFZFHMIN-NEXT: neg s0, a0 +; RV32IFZFHMIN-NEXT: fmv.s fa0, fs0 +; RV32IFZFHMIN-NEXT: call __fixunssfdi +; RV32IFZFHMIN-NEXT: and a0, s0, a0 +; RV32IFZFHMIN-NEXT: lui a2, 391168 +; RV32IFZFHMIN-NEXT: and a1, s0, a1 +; RV32IFZFHMIN-NEXT: addi a2, a2, -1 +; RV32IFZFHMIN-NEXT: fmv.w.x fa5, a2 +; RV32IFZFHMIN-NEXT: flt.s a2, fa5, fs0 +; RV32IFZFHMIN-NEXT: neg a2, a2 +; RV32IFZFHMIN-NEXT: or a0, a2, a0 +; RV32IFZFHMIN-NEXT: or a1, a2, a1 +; RV32IFZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IFZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32IFZFHMIN-NEXT: flw fs0, 4(sp) # 4-byte Folded Reload +; RV32IFZFHMIN-NEXT: addi sp, sp, 16 +; RV32IFZFHMIN-NEXT: ret ; ; CHECK64-IZFHMIN-LABEL: fcvt_lu_h_sat: ; CHECK64-IZFHMIN: # %bb.0: # %start @@ -3213,6 +3248,33 @@ define i64 @fcvt_lu_h_sat(half %a) nounwind { ; CHECK64-IZFHMIN-NEXT: and a0, a1, a0 ; CHECK64-IZFHMIN-NEXT: ret ; +; RV32IDZFHMIN-LABEL: fcvt_lu_h_sat: +; RV32IDZFHMIN: # %bb.0: # %start +; RV32IDZFHMIN-NEXT: addi sp, sp, -16 +; RV32IDZFHMIN-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32IDZFHMIN-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32IDZFHMIN-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; RV32IDZFHMIN-NEXT: fcvt.s.h fs0, fa0 +; RV32IDZFHMIN-NEXT: fmv.w.x fa5, zero +; RV32IDZFHMIN-NEXT: fle.s a0, fa5, fs0 +; RV32IDZFHMIN-NEXT: neg s0, a0 +; RV32IDZFHMIN-NEXT: fmv.s fa0, fs0 +; RV32IDZFHMIN-NEXT: call __fixunssfdi +; RV32IDZFHMIN-NEXT: and a0, s0, a0 +; RV32IDZFHMIN-NEXT: lui a2, 391168 +; RV32IDZFHMIN-NEXT: and a1, s0, a1 +; RV32IDZFHMIN-NEXT: addi a2, a2, -1 +; RV32IDZFHMIN-NEXT: fmv.w.x fa5, a2 +; RV32IDZFHMIN-NEXT: flt.s a2, fa5, fs0 +; RV32IDZFHMIN-NEXT: neg a2, a2 +; RV32IDZFHMIN-NEXT: or a0, a2, a0 +; RV32IDZFHMIN-NEXT: or a1, a2, a1 +; RV32IDZFHMIN-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32IDZFHMIN-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32IDZFHMIN-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload +; RV32IDZFHMIN-NEXT: addi sp, sp, 16 +; RV32IDZFHMIN-NEXT: ret +; ; CHECK32-IZHINXMIN-LABEL: fcvt_lu_h_sat: ; CHECK32-IZHINXMIN: # %bb.0: # %start ; CHECK32-IZHINXMIN-NEXT: addi sp, sp, -16 @@ -6282,13 +6344,14 @@ define signext i16 
@fcvt_w_s_sat_i16(half %a) nounwind { ; RV32IZFH-LABEL: fcvt_w_s_sat_i16: ; RV32IZFH: # %bb.0: # %start ; RV32IZFH-NEXT: fcvt.s.h fa5, fa0 -; RV32IZFH-NEXT: lui a0, %hi(.LCPI32_0) -; RV32IZFH-NEXT: feq.s a1, fa5, fa5 -; RV32IZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; RV32IZFH-NEXT: lui a0, 815104 -; RV32IZFH-NEXT: fmv.w.x fa3, a0 -; RV32IZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV32IZFH-NEXT: neg a0, a1 +; RV32IZFH-NEXT: lui a1, 290816 +; RV32IZFH-NEXT: fmv.w.x fa4, a0 +; RV32IZFH-NEXT: feq.s a0, fa5, fa5 +; RV32IZFH-NEXT: addi a1, a1, -512 +; RV32IZFH-NEXT: neg a0, a0 +; RV32IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IZFH-NEXT: fmv.w.x fa4, a1 ; RV32IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IZFH-NEXT: fcvt.w.s a1, fa5, rtz ; RV32IZFH-NEXT: and a0, a0, a1 @@ -6297,13 +6360,14 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64IZFH-LABEL: fcvt_w_s_sat_i16: ; RV64IZFH: # %bb.0: # %start ; RV64IZFH-NEXT: fcvt.s.h fa5, fa0 -; RV64IZFH-NEXT: lui a0, %hi(.LCPI32_0) -; RV64IZFH-NEXT: feq.s a1, fa5, fa5 -; RV64IZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; RV64IZFH-NEXT: lui a0, 815104 -; RV64IZFH-NEXT: fmv.w.x fa3, a0 -; RV64IZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV64IZFH-NEXT: neg a0, a1 +; RV64IZFH-NEXT: lui a1, 290816 +; RV64IZFH-NEXT: fmv.w.x fa4, a0 +; RV64IZFH-NEXT: feq.s a0, fa5, fa5 +; RV64IZFH-NEXT: addi a1, a1, -512 +; RV64IZFH-NEXT: neg a0, a0 +; RV64IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IZFH-NEXT: fmv.w.x fa4, a1 ; RV64IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IZFH-NEXT: fcvt.l.s a1, fa5, rtz ; RV64IZFH-NEXT: and a0, a0, a1 @@ -6312,13 +6376,14 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32IDZFH-LABEL: fcvt_w_s_sat_i16: ; RV32IDZFH: # %bb.0: # %start ; RV32IDZFH-NEXT: fcvt.s.h fa5, fa0 -; RV32IDZFH-NEXT: lui a0, %hi(.LCPI32_0) -; RV32IDZFH-NEXT: feq.s a1, fa5, fa5 -; RV32IDZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; RV32IDZFH-NEXT: lui a0, 815104 -; RV32IDZFH-NEXT: fmv.w.x fa3, a0 -; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV32IDZFH-NEXT: neg a0, a1 +; RV32IDZFH-NEXT: lui a1, 290816 +; RV32IDZFH-NEXT: fmv.w.x fa4, a0 +; RV32IDZFH-NEXT: feq.s a0, fa5, fa5 +; RV32IDZFH-NEXT: addi a1, a1, -512 +; RV32IDZFH-NEXT: neg a0, a0 +; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IDZFH-NEXT: fmv.w.x fa4, a1 ; RV32IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IDZFH-NEXT: fcvt.w.s a1, fa5, rtz ; RV32IDZFH-NEXT: and a0, a0, a1 @@ -6327,13 +6392,14 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64IDZFH-LABEL: fcvt_w_s_sat_i16: ; RV64IDZFH: # %bb.0: # %start ; RV64IDZFH-NEXT: fcvt.s.h fa5, fa0 -; RV64IDZFH-NEXT: lui a0, %hi(.LCPI32_0) -; RV64IDZFH-NEXT: feq.s a1, fa5, fa5 -; RV64IDZFH-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; RV64IDZFH-NEXT: lui a0, 815104 -; RV64IDZFH-NEXT: fmv.w.x fa3, a0 -; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa3 -; RV64IDZFH-NEXT: neg a0, a1 +; RV64IDZFH-NEXT: lui a1, 290816 +; RV64IDZFH-NEXT: fmv.w.x fa4, a0 +; RV64IDZFH-NEXT: feq.s a0, fa5, fa5 +; RV64IDZFH-NEXT: addi a1, a1, -512 +; RV64IDZFH-NEXT: neg a0, a0 +; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IDZFH-NEXT: fmv.w.x fa4, a1 ; RV64IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IDZFH-NEXT: fcvt.l.s a1, fa5, rtz ; RV64IDZFH-NEXT: and a0, a0, a1 @@ -6491,13 +6557,14 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32ID-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: call __extendhfsf2 ; RV32ID-ILP32-NEXT: fmv.w.x fa5, a0 -; RV32ID-ILP32-NEXT: lui a0, %hi(.LCPI32_0) -; RV32ID-ILP32-NEXT: feq.s a1, fa5, fa5 -; RV32ID-ILP32-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; RV32ID-ILP32-NEXT: lui a0, 815104 -; 
RV32ID-ILP32-NEXT: fmv.w.x fa3, a0 -; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa3 -; RV32ID-ILP32-NEXT: neg a0, a1 +; RV32ID-ILP32-NEXT: lui a1, 290816 +; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 +; RV32ID-ILP32-NEXT: feq.s a0, fa5, fa5 +; RV32ID-ILP32-NEXT: addi a1, a1, -512 +; RV32ID-ILP32-NEXT: neg a0, a0 +; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa4 +; RV32ID-ILP32-NEXT: fmv.w.x fa4, a1 ; RV32ID-ILP32-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-ILP32-NEXT: fcvt.w.s a1, fa5, rtz ; RV32ID-ILP32-NEXT: and a0, a0, a1 @@ -6511,13 +6578,14 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-LP64-NEXT: call __extendhfsf2 ; RV64ID-LP64-NEXT: fmv.w.x fa5, a0 -; RV64ID-LP64-NEXT: lui a0, %hi(.LCPI32_0) -; RV64ID-LP64-NEXT: feq.s a1, fa5, fa5 -; RV64ID-LP64-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; RV64ID-LP64-NEXT: lui a0, 815104 -; RV64ID-LP64-NEXT: fmv.w.x fa3, a0 -; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa3 -; RV64ID-LP64-NEXT: neg a0, a1 +; RV64ID-LP64-NEXT: lui a1, 290816 +; RV64ID-LP64-NEXT: fmv.w.x fa4, a0 +; RV64ID-LP64-NEXT: feq.s a0, fa5, fa5 +; RV64ID-LP64-NEXT: addi a1, a1, -512 +; RV64ID-LP64-NEXT: neg a0, a0 +; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa4 +; RV64ID-LP64-NEXT: fmv.w.x fa4, a1 ; RV64ID-LP64-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-LP64-NEXT: fcvt.l.s a1, fa5, rtz ; RV64ID-LP64-NEXT: and a0, a0, a1 @@ -6531,13 +6599,14 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-NEXT: call __extendhfsf2 ; RV32ID-NEXT: feq.s a0, fa0, fa0 -; RV32ID-NEXT: lui a1, %hi(.LCPI32_0) -; RV32ID-NEXT: flw fa5, %lo(.LCPI32_0)(a1) ; RV32ID-NEXT: lui a1, 815104 -; RV32ID-NEXT: fmv.w.x fa4, a1 -; RV32ID-NEXT: fmax.s fa4, fa0, fa4 +; RV32ID-NEXT: fmv.w.x fa5, a1 +; RV32ID-NEXT: lui a1, 290816 ; RV32ID-NEXT: neg a0, a0 -; RV32ID-NEXT: fmin.s fa5, fa4, fa5 +; RV32ID-NEXT: addi a1, a1, -512 +; RV32ID-NEXT: fmax.s fa5, fa0, fa5 +; RV32ID-NEXT: fmv.w.x fa4, a1 +; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.w.s a1, fa5, rtz ; RV32ID-NEXT: and a0, a0, a1 ; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload @@ -6550,13 +6619,14 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-NEXT: call __extendhfsf2 ; RV64ID-NEXT: feq.s a0, fa0, fa0 -; RV64ID-NEXT: lui a1, %hi(.LCPI32_0) -; RV64ID-NEXT: flw fa5, %lo(.LCPI32_0)(a1) ; RV64ID-NEXT: lui a1, 815104 -; RV64ID-NEXT: fmv.w.x fa4, a1 -; RV64ID-NEXT: fmax.s fa4, fa0, fa4 +; RV64ID-NEXT: fmv.w.x fa5, a1 +; RV64ID-NEXT: lui a1, 290816 ; RV64ID-NEXT: neg a0, a0 -; RV64ID-NEXT: fmin.s fa5, fa4, fa5 +; RV64ID-NEXT: addi a1, a1, -512 +; RV64ID-NEXT: fmax.s fa5, fa0, fa5 +; RV64ID-NEXT: fmv.w.x fa4, a1 +; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.l.s a1, fa5, rtz ; RV64ID-NEXT: and a0, a0, a1 ; RV64ID-NEXT: ld ra, 8(sp) # 8-byte Folded Reload @@ -6566,13 +6636,14 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; CHECK32-IZFHMIN-LABEL: fcvt_w_s_sat_i16: ; CHECK32-IZFHMIN: # %bb.0: # %start ; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK32-IZFHMIN-NEXT: lui a0, %hi(.LCPI32_0) -; CHECK32-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK32-IZFHMIN-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; CHECK32-IZFHMIN-NEXT: lui a0, 815104 -; CHECK32-IZFHMIN-NEXT: fmv.w.x fa3, a0 -; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa3 -; CHECK32-IZFHMIN-NEXT: neg a0, a1 +; CHECK32-IZFHMIN-NEXT: lui a1, 290816 +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, a0 +; CHECK32-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; 
CHECK32-IZFHMIN-NEXT: addi a1, a1, -512 +; CHECK32-IZFHMIN-NEXT: neg a0, a0 +; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, a1 ; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32-IZFHMIN-NEXT: fcvt.w.s a1, fa5, rtz ; CHECK32-IZFHMIN-NEXT: and a0, a0, a1 @@ -6581,13 +6652,14 @@ define signext i16 @fcvt_w_s_sat_i16(half %a) nounwind { ; CHECK64-IZFHMIN-LABEL: fcvt_w_s_sat_i16: ; CHECK64-IZFHMIN: # %bb.0: # %start ; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 -; CHECK64-IZFHMIN-NEXT: lui a0, %hi(.LCPI32_0) -; CHECK64-IZFHMIN-NEXT: feq.s a1, fa5, fa5 -; CHECK64-IZFHMIN-NEXT: flw fa4, %lo(.LCPI32_0)(a0) ; CHECK64-IZFHMIN-NEXT: lui a0, 815104 -; CHECK64-IZFHMIN-NEXT: fmv.w.x fa3, a0 -; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa3 -; CHECK64-IZFHMIN-NEXT: neg a0, a1 +; CHECK64-IZFHMIN-NEXT: lui a1, 290816 +; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, a0 +; CHECK64-IZFHMIN-NEXT: feq.s a0, fa5, fa5 +; CHECK64-IZFHMIN-NEXT: addi a1, a1, -512 +; CHECK64-IZFHMIN-NEXT: neg a0, a0 +; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, a1 ; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64-IZFHMIN-NEXT: fcvt.l.s a1, fa5, rtz ; CHECK64-IZFHMIN-NEXT: and a0, a0, a1 @@ -6802,45 +6874,49 @@ define zeroext i16 @fcvt_wu_s_i16(half %a) nounwind { define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; RV32IZFH-LABEL: fcvt_wu_s_sat_i16: ; RV32IZFH: # %bb.0: # %start -; RV32IZFH-NEXT: lui a0, %hi(.LCPI34_0) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI34_0)(a0) -; RV32IZFH-NEXT: fcvt.s.h fa4, fa0 -; RV32IZFH-NEXT: fmv.w.x fa3, zero -; RV32IZFH-NEXT: fmax.s fa4, fa4, fa3 -; RV32IZFH-NEXT: fmin.s fa5, fa4, fa5 +; RV32IZFH-NEXT: fcvt.s.h fa5, fa0 +; RV32IZFH-NEXT: fmv.w.x fa4, zero +; RV32IZFH-NEXT: lui a0, 292864 +; RV32IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IZFH-NEXT: addi a0, a0, -256 +; RV32IZFH-NEXT: fmv.w.x fa4, a0 +; RV32IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IZFH-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32IZFH-NEXT: ret ; ; RV64IZFH-LABEL: fcvt_wu_s_sat_i16: ; RV64IZFH: # %bb.0: # %start -; RV64IZFH-NEXT: lui a0, %hi(.LCPI34_0) -; RV64IZFH-NEXT: flw fa5, %lo(.LCPI34_0)(a0) -; RV64IZFH-NEXT: fcvt.s.h fa4, fa0 -; RV64IZFH-NEXT: fmv.w.x fa3, zero -; RV64IZFH-NEXT: fmax.s fa4, fa4, fa3 -; RV64IZFH-NEXT: fmin.s fa5, fa4, fa5 +; RV64IZFH-NEXT: fcvt.s.h fa5, fa0 +; RV64IZFH-NEXT: fmv.w.x fa4, zero +; RV64IZFH-NEXT: lui a0, 292864 +; RV64IZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IZFH-NEXT: addi a0, a0, -256 +; RV64IZFH-NEXT: fmv.w.x fa4, a0 +; RV64IZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IZFH-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64IZFH-NEXT: ret ; ; RV32IDZFH-LABEL: fcvt_wu_s_sat_i16: ; RV32IDZFH: # %bb.0: # %start -; RV32IDZFH-NEXT: lui a0, %hi(.LCPI34_0) -; RV32IDZFH-NEXT: flw fa5, %lo(.LCPI34_0)(a0) -; RV32IDZFH-NEXT: fcvt.s.h fa4, fa0 -; RV32IDZFH-NEXT: fmv.w.x fa3, zero -; RV32IDZFH-NEXT: fmax.s fa4, fa4, fa3 -; RV32IDZFH-NEXT: fmin.s fa5, fa4, fa5 +; RV32IDZFH-NEXT: fcvt.s.h fa5, fa0 +; RV32IDZFH-NEXT: fmv.w.x fa4, zero +; RV32IDZFH-NEXT: lui a0, 292864 +; RV32IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV32IDZFH-NEXT: addi a0, a0, -256 +; RV32IDZFH-NEXT: fmv.w.x fa4, a0 +; RV32IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV32IDZFH-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32IDZFH-NEXT: ret ; ; RV64IDZFH-LABEL: fcvt_wu_s_sat_i16: ; RV64IDZFH: # %bb.0: # %start -; RV64IDZFH-NEXT: lui a0, %hi(.LCPI34_0) -; RV64IDZFH-NEXT: flw fa5, %lo(.LCPI34_0)(a0) -; RV64IDZFH-NEXT: fcvt.s.h fa4, fa0 -; RV64IDZFH-NEXT: fmv.w.x fa3, zero -; RV64IDZFH-NEXT: fmax.s fa4, fa4, fa3 -; RV64IDZFH-NEXT: fmin.s fa5, fa4, fa5 +; 
RV64IDZFH-NEXT: fcvt.s.h fa5, fa0 +; RV64IDZFH-NEXT: fmv.w.x fa4, zero +; RV64IDZFH-NEXT: lui a0, 292864 +; RV64IDZFH-NEXT: fmax.s fa5, fa5, fa4 +; RV64IDZFH-NEXT: addi a0, a0, -256 +; RV64IDZFH-NEXT: fmv.w.x fa4, a0 +; RV64IDZFH-NEXT: fmin.s fa5, fa5, fa4 ; RV64IDZFH-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64IDZFH-NEXT: ret ; @@ -6971,12 +7047,13 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; RV32ID-ILP32-NEXT: addi sp, sp, -16 ; RV32ID-ILP32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-ILP32-NEXT: call __extendhfsf2 -; RV32ID-ILP32-NEXT: lui a1, %hi(.LCPI34_0) -; RV32ID-ILP32-NEXT: flw fa5, %lo(.LCPI34_0)(a1) +; RV32ID-ILP32-NEXT: fmv.w.x fa5, a0 +; RV32ID-ILP32-NEXT: fmv.w.x fa4, zero +; RV32ID-ILP32-NEXT: lui a0, 292864 +; RV32ID-ILP32-NEXT: fmax.s fa5, fa5, fa4 +; RV32ID-ILP32-NEXT: addi a0, a0, -256 ; RV32ID-ILP32-NEXT: fmv.w.x fa4, a0 -; RV32ID-ILP32-NEXT: fmv.w.x fa3, zero -; RV32ID-ILP32-NEXT: fmax.s fa4, fa4, fa3 -; RV32ID-ILP32-NEXT: fmin.s fa5, fa4, fa5 +; RV32ID-ILP32-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-ILP32-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32ID-ILP32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-ILP32-NEXT: addi sp, sp, 16 @@ -6987,12 +7064,13 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; RV64ID-LP64-NEXT: addi sp, sp, -16 ; RV64ID-LP64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-LP64-NEXT: call __extendhfsf2 -; RV64ID-LP64-NEXT: lui a1, %hi(.LCPI34_0) -; RV64ID-LP64-NEXT: flw fa5, %lo(.LCPI34_0)(a1) +; RV64ID-LP64-NEXT: fmv.w.x fa5, a0 +; RV64ID-LP64-NEXT: fmv.w.x fa4, zero +; RV64ID-LP64-NEXT: lui a0, 292864 +; RV64ID-LP64-NEXT: fmax.s fa5, fa5, fa4 +; RV64ID-LP64-NEXT: addi a0, a0, -256 ; RV64ID-LP64-NEXT: fmv.w.x fa4, a0 -; RV64ID-LP64-NEXT: fmv.w.x fa3, zero -; RV64ID-LP64-NEXT: fmax.s fa4, fa4, fa3 -; RV64ID-LP64-NEXT: fmin.s fa5, fa4, fa5 +; RV64ID-LP64-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-LP64-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64ID-LP64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64ID-LP64-NEXT: addi sp, sp, 16 @@ -7003,11 +7081,12 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; RV32ID-NEXT: addi sp, sp, -16 ; RV32ID-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32ID-NEXT: call __extendhfsf2 -; RV32ID-NEXT: lui a0, %hi(.LCPI34_0) -; RV32ID-NEXT: flw fa5, %lo(.LCPI34_0)(a0) -; RV32ID-NEXT: fmv.w.x fa4, zero -; RV32ID-NEXT: fmax.s fa4, fa0, fa4 -; RV32ID-NEXT: fmin.s fa5, fa4, fa5 +; RV32ID-NEXT: fmv.w.x fa5, zero +; RV32ID-NEXT: lui a0, 292864 +; RV32ID-NEXT: fmax.s fa5, fa0, fa5 +; RV32ID-NEXT: addi a0, a0, -256 +; RV32ID-NEXT: fmv.w.x fa4, a0 +; RV32ID-NEXT: fmin.s fa5, fa5, fa4 ; RV32ID-NEXT: fcvt.wu.s a0, fa5, rtz ; RV32ID-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ID-NEXT: addi sp, sp, 16 @@ -7018,11 +7097,12 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; RV64ID-NEXT: addi sp, sp, -16 ; RV64ID-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64ID-NEXT: call __extendhfsf2 -; RV64ID-NEXT: lui a0, %hi(.LCPI34_0) -; RV64ID-NEXT: flw fa5, %lo(.LCPI34_0)(a0) -; RV64ID-NEXT: fmv.w.x fa4, zero -; RV64ID-NEXT: fmax.s fa4, fa0, fa4 -; RV64ID-NEXT: fmin.s fa5, fa4, fa5 +; RV64ID-NEXT: fmv.w.x fa5, zero +; RV64ID-NEXT: lui a0, 292864 +; RV64ID-NEXT: fmax.s fa5, fa0, fa5 +; RV64ID-NEXT: addi a0, a0, -256 +; RV64ID-NEXT: fmv.w.x fa4, a0 +; RV64ID-NEXT: fmin.s fa5, fa5, fa4 ; RV64ID-NEXT: fcvt.lu.s a0, fa5, rtz ; RV64ID-NEXT: ld ra, 8(sp) # 8-byte Folded Reload ; RV64ID-NEXT: addi sp, sp, 16 @@ -7030,23 +7110,25 @@ define zeroext i16 @fcvt_wu_s_sat_i16(half %a) nounwind { ; ; CHECK32-IZFHMIN-LABEL: 
fcvt_wu_s_sat_i16: ; CHECK32-IZFHMIN: # %bb.0: # %start -; CHECK32-IZFHMIN-NEXT: lui a0, %hi(.LCPI34_0) -; CHECK32-IZFHMIN-NEXT: flw fa5, %lo(.LCPI34_0)(a0) -; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa4, fa0 -; CHECK32-IZFHMIN-NEXT: fmv.w.x fa3, zero -; CHECK32-IZFHMIN-NEXT: fmax.s fa4, fa4, fa3 -; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa4, fa5 +; CHECK32-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, zero +; CHECK32-IZFHMIN-NEXT: lui a0, 292864 +; CHECK32-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK32-IZFHMIN-NEXT: addi a0, a0, -256 +; CHECK32-IZFHMIN-NEXT: fmv.w.x fa4, a0 +; CHECK32-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK32-IZFHMIN-NEXT: fcvt.wu.s a0, fa5, rtz ; CHECK32-IZFHMIN-NEXT: ret ; ; CHECK64-IZFHMIN-LABEL: fcvt_wu_s_sat_i16: ; CHECK64-IZFHMIN: # %bb.0: # %start -; CHECK64-IZFHMIN-NEXT: lui a0, %hi(.LCPI34_0) -; CHECK64-IZFHMIN-NEXT: flw fa5, %lo(.LCPI34_0)(a0) -; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa4, fa0 -; CHECK64-IZFHMIN-NEXT: fmv.w.x fa3, zero -; CHECK64-IZFHMIN-NEXT: fmax.s fa4, fa4, fa3 -; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa4, fa5 +; CHECK64-IZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, zero +; CHECK64-IZFHMIN-NEXT: lui a0, 292864 +; CHECK64-IZFHMIN-NEXT: fmax.s fa5, fa5, fa4 +; CHECK64-IZFHMIN-NEXT: addi a0, a0, -256 +; CHECK64-IZFHMIN-NEXT: fmv.w.x fa4, a0 +; CHECK64-IZFHMIN-NEXT: fmin.s fa5, fa5, fa4 ; CHECK64-IZFHMIN-NEXT: fcvt.lu.s a0, fa5, rtz ; CHECK64-IZFHMIN-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/half-imm.ll b/llvm/test/CodeGen/RISCV/half-imm.ll index d68e19d15b4bb..1dc0da8c04dba 100644 --- a/llvm/test/CodeGen/RISCV/half-imm.ll +++ b/llvm/test/CodeGen/RISCV/half-imm.ll @@ -24,8 +24,9 @@ define half @half_imm() nounwind { ; CHECK-LABEL: half_imm: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa0, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: lui a0, 4 +; CHECK-NEXT: addi a0, a0, 512 +; CHECK-NEXT: fmv.h.x fa0, a0 ; CHECK-NEXT: ret ; ; RV32IZHINX-LABEL: half_imm: @@ -44,8 +45,9 @@ define half @half_imm() nounwind { ; ; CHECKIZFHMIN-LABEL: half_imm: ; CHECKIZFHMIN: # %bb.0: -; CHECKIZFHMIN-NEXT: lui a0, %hi(.LCPI0_0) -; CHECKIZFHMIN-NEXT: flh fa0, %lo(.LCPI0_0)(a0) +; CHECKIZFHMIN-NEXT: lui a0, 4 +; CHECKIZFHMIN-NEXT: addi a0, a0, 512 +; CHECKIZFHMIN-NEXT: fmv.h.x fa0, a0 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: half_imm: @@ -60,8 +62,9 @@ define half @half_imm() nounwind { define half @half_imm_op(half %a) nounwind { ; CHECK-LABEL: half_imm_op: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: li a0, 15 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: fadd.h fa0, fa0, fa5 ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/half-intrinsics.ll b/llvm/test/CodeGen/RISCV/half-intrinsics.ll index 4f0026175e7c7..e16d788f66ede 100644 --- a/llvm/test/CodeGen/RISCV/half-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/half-intrinsics.ll @@ -2222,8 +2222,9 @@ declare half @llvm.floor.f16(half) define half @floor_f16(half %a) nounwind { ; CHECKIZFH-LABEL: floor_f16: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: lui a0, %hi(.LCPI18_0) -; CHECKIZFH-NEXT: flh fa5, %lo(.LCPI18_0)(a0) +; CHECKIZFH-NEXT: li a0, 25 +; CHECKIZFH-NEXT: slli a0, a0, 10 +; CHECKIZFH-NEXT: fmv.h.x fa5, a0 ; CHECKIZFH-NEXT: fabs.h fa4, fa0 ; CHECKIZFH-NEXT: flt.h a0, fa4, fa5 ; CHECKIZFH-NEXT: beqz a0, .LBB18_2 @@ -2313,8 +2314,9 @@ declare half @llvm.ceil.f16(half) define half @ceil_f16(half %a) nounwind { ; CHECKIZFH-LABEL: ceil_f16: ; CHECKIZFH: # 
%bb.0: -; CHECKIZFH-NEXT: lui a0, %hi(.LCPI19_0) -; CHECKIZFH-NEXT: flh fa5, %lo(.LCPI19_0)(a0) +; CHECKIZFH-NEXT: li a0, 25 +; CHECKIZFH-NEXT: slli a0, a0, 10 +; CHECKIZFH-NEXT: fmv.h.x fa5, a0 ; CHECKIZFH-NEXT: fabs.h fa4, fa0 ; CHECKIZFH-NEXT: flt.h a0, fa4, fa5 ; CHECKIZFH-NEXT: beqz a0, .LBB19_2 @@ -2404,8 +2406,9 @@ declare half @llvm.trunc.f16(half) define half @trunc_f16(half %a) nounwind { ; CHECKIZFH-LABEL: trunc_f16: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: lui a0, %hi(.LCPI20_0) -; CHECKIZFH-NEXT: flh fa5, %lo(.LCPI20_0)(a0) +; CHECKIZFH-NEXT: li a0, 25 +; CHECKIZFH-NEXT: slli a0, a0, 10 +; CHECKIZFH-NEXT: fmv.h.x fa5, a0 ; CHECKIZFH-NEXT: fabs.h fa4, fa0 ; CHECKIZFH-NEXT: flt.h a0, fa4, fa5 ; CHECKIZFH-NEXT: beqz a0, .LBB20_2 @@ -2495,8 +2498,9 @@ declare half @llvm.rint.f16(half) define half @rint_f16(half %a) nounwind { ; CHECKIZFH-LABEL: rint_f16: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: lui a0, %hi(.LCPI21_0) -; CHECKIZFH-NEXT: flh fa5, %lo(.LCPI21_0)(a0) +; CHECKIZFH-NEXT: li a0, 25 +; CHECKIZFH-NEXT: slli a0, a0, 10 +; CHECKIZFH-NEXT: fmv.h.x fa5, a0 ; CHECKIZFH-NEXT: fabs.h fa4, fa0 ; CHECKIZFH-NEXT: flt.h a0, fa4, fa5 ; CHECKIZFH-NEXT: beqz a0, .LBB21_2 @@ -2706,8 +2710,9 @@ declare half @llvm.round.f16(half) define half @round_f16(half %a) nounwind { ; CHECKIZFH-LABEL: round_f16: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: lui a0, %hi(.LCPI23_0) -; CHECKIZFH-NEXT: flh fa5, %lo(.LCPI23_0)(a0) +; CHECKIZFH-NEXT: li a0, 25 +; CHECKIZFH-NEXT: slli a0, a0, 10 +; CHECKIZFH-NEXT: fmv.h.x fa5, a0 ; CHECKIZFH-NEXT: fabs.h fa4, fa0 ; CHECKIZFH-NEXT: flt.h a0, fa4, fa5 ; CHECKIZFH-NEXT: beqz a0, .LBB23_2 @@ -2797,8 +2802,9 @@ declare half @llvm.roundeven.f16(half) define half @roundeven_f16(half %a) nounwind { ; CHECKIZFH-LABEL: roundeven_f16: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: lui a0, %hi(.LCPI24_0) -; CHECKIZFH-NEXT: flh fa5, %lo(.LCPI24_0)(a0) +; CHECKIZFH-NEXT: li a0, 25 +; CHECKIZFH-NEXT: slli a0, a0, 10 +; CHECKIZFH-NEXT: fmv.h.x fa5, a0 ; CHECKIZFH-NEXT: fabs.h fa4, fa0 ; CHECKIZFH-NEXT: flt.h a0, fa4, fa5 ; CHECKIZFH-NEXT: beqz a0, .LBB24_2 diff --git a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll index 3b645bf8aef91..c815bc19e280c 100644 --- a/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll +++ b/llvm/test/CodeGen/RISCV/half-round-conv-sat.ll @@ -95,8 +95,9 @@ define signext i32 @test_floor_si32(half %x) { define i64 @test_floor_si64(half %x) nounwind { ; RV32IZFH-LABEL: test_floor_si64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI1_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB1_2 @@ -121,8 +122,9 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZFH-NEXT: # %bb.3: ; RV32IZFH-NEXT: mv a2, a1 ; RV32IZFH-NEXT: .LBB1_4: -; RV32IZFH-NEXT: lui a1, %hi(.LCPI1_1) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI1_1)(a1) +; RV32IZFH-NEXT: lui a1, 389120 +; RV32IZFH-NEXT: addi a1, a1, -1 +; RV32IZFH-NEXT: fmv.w.x fa5, a1 ; RV32IZFH-NEXT: flt.s a1, fa5, fs0 ; RV32IZFH-NEXT: beqz a1, .LBB1_6 ; RV32IZFH-NEXT: # %bb.5: @@ -248,8 +250,9 @@ define i64 @test_floor_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: # %bb.3: ; RV32IZFHMIN-NEXT: mv a2, a1 ; RV32IZFHMIN-NEXT: .LBB1_4: -; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI1_0) -; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI1_0)(a1) +; RV32IZFHMIN-NEXT: lui a1, 389120 +; RV32IZFHMIN-NEXT: addi a1, a1, 
-1 +; RV32IZFHMIN-NEXT: fmv.w.x fa5, a1 ; RV32IZFHMIN-NEXT: flt.s a1, fa5, fs0 ; RV32IZFHMIN-NEXT: beqz a1, .LBB1_6 ; RV32IZFHMIN-NEXT: # %bb.5: @@ -506,8 +509,9 @@ define signext i32 @test_floor_ui32(half %x) { define i64 @test_floor_ui64(half %x) nounwind { ; RV32IZFH-LABEL: test_floor_ui64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI3_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB3_2 @@ -526,10 +530,11 @@ define i64 @test_floor_ui64(half %x) nounwind { ; RV32IZFH-NEXT: neg s0, a0 ; RV32IZFH-NEXT: fmv.s fa0, fs0 ; RV32IZFH-NEXT: call __fixunssfdi -; RV32IZFH-NEXT: lui a2, %hi(.LCPI3_1) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI3_1)(a2) ; RV32IZFH-NEXT: and a0, s0, a0 +; RV32IZFH-NEXT: lui a2, 391168 ; RV32IZFH-NEXT: and a1, s0, a1 +; RV32IZFH-NEXT: addi a2, a2, -1 +; RV32IZFH-NEXT: fmv.w.x fa5, a2 ; RV32IZFH-NEXT: flt.s a2, fa5, fs0 ; RV32IZFH-NEXT: neg a2, a2 ; RV32IZFH-NEXT: or a0, a2, a0 @@ -627,10 +632,11 @@ define i64 @test_floor_ui64(half %x) nounwind { ; RV32IZFHMIN-NEXT: neg s0, a0 ; RV32IZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IZFHMIN-NEXT: call __fixunssfdi -; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI3_0) -; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI3_0)(a2) ; RV32IZFHMIN-NEXT: and a0, s0, a0 +; RV32IZFHMIN-NEXT: lui a2, 391168 ; RV32IZFHMIN-NEXT: and a1, s0, a1 +; RV32IZFHMIN-NEXT: addi a2, a2, -1 +; RV32IZFHMIN-NEXT: fmv.w.x fa5, a2 ; RV32IZFHMIN-NEXT: flt.s a2, fa5, fs0 ; RV32IZFHMIN-NEXT: neg a2, a2 ; RV32IZFHMIN-NEXT: or a0, a2, a0 @@ -803,8 +809,9 @@ define signext i32 @test_ceil_si32(half %x) { define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZFH-LABEL: test_ceil_si64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI5_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB5_2 @@ -829,8 +836,9 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZFH-NEXT: # %bb.3: ; RV32IZFH-NEXT: mv a2, a1 ; RV32IZFH-NEXT: .LBB5_4: -; RV32IZFH-NEXT: lui a1, %hi(.LCPI5_1) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI5_1)(a1) +; RV32IZFH-NEXT: lui a1, 389120 +; RV32IZFH-NEXT: addi a1, a1, -1 +; RV32IZFH-NEXT: fmv.w.x fa5, a1 ; RV32IZFH-NEXT: flt.s a1, fa5, fs0 ; RV32IZFH-NEXT: beqz a1, .LBB5_6 ; RV32IZFH-NEXT: # %bb.5: @@ -956,8 +964,9 @@ define i64 @test_ceil_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: # %bb.3: ; RV32IZFHMIN-NEXT: mv a2, a1 ; RV32IZFHMIN-NEXT: .LBB5_4: -; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI5_0) -; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI5_0)(a1) +; RV32IZFHMIN-NEXT: lui a1, 389120 +; RV32IZFHMIN-NEXT: addi a1, a1, -1 +; RV32IZFHMIN-NEXT: fmv.w.x fa5, a1 ; RV32IZFHMIN-NEXT: flt.s a1, fa5, fs0 ; RV32IZFHMIN-NEXT: beqz a1, .LBB5_6 ; RV32IZFHMIN-NEXT: # %bb.5: @@ -1214,8 +1223,9 @@ define signext i32 @test_ceil_ui32(half %x) { define i64 @test_ceil_ui64(half %x) nounwind { ; RV32IZFH-LABEL: test_ceil_ui64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI7_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB7_2 @@ -1234,10 +1244,11 @@ define i64 @test_ceil_ui64(half %x) nounwind { ; RV32IZFH-NEXT: neg s0, a0 ; 
RV32IZFH-NEXT: fmv.s fa0, fs0 ; RV32IZFH-NEXT: call __fixunssfdi -; RV32IZFH-NEXT: lui a2, %hi(.LCPI7_1) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI7_1)(a2) ; RV32IZFH-NEXT: and a0, s0, a0 +; RV32IZFH-NEXT: lui a2, 391168 ; RV32IZFH-NEXT: and a1, s0, a1 +; RV32IZFH-NEXT: addi a2, a2, -1 +; RV32IZFH-NEXT: fmv.w.x fa5, a2 ; RV32IZFH-NEXT: flt.s a2, fa5, fs0 ; RV32IZFH-NEXT: neg a2, a2 ; RV32IZFH-NEXT: or a0, a2, a0 @@ -1335,10 +1346,11 @@ define i64 @test_ceil_ui64(half %x) nounwind { ; RV32IZFHMIN-NEXT: neg s0, a0 ; RV32IZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IZFHMIN-NEXT: call __fixunssfdi -; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI7_0) -; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI7_0)(a2) ; RV32IZFHMIN-NEXT: and a0, s0, a0 +; RV32IZFHMIN-NEXT: lui a2, 391168 ; RV32IZFHMIN-NEXT: and a1, s0, a1 +; RV32IZFHMIN-NEXT: addi a2, a2, -1 +; RV32IZFHMIN-NEXT: fmv.w.x fa5, a2 ; RV32IZFHMIN-NEXT: flt.s a2, fa5, fs0 ; RV32IZFHMIN-NEXT: neg a2, a2 ; RV32IZFHMIN-NEXT: or a0, a2, a0 @@ -1511,8 +1523,9 @@ define signext i32 @test_trunc_si32(half %x) { define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZFH-LABEL: test_trunc_si64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI9_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB9_2 @@ -1537,8 +1550,9 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZFH-NEXT: # %bb.3: ; RV32IZFH-NEXT: mv a2, a1 ; RV32IZFH-NEXT: .LBB9_4: -; RV32IZFH-NEXT: lui a1, %hi(.LCPI9_1) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI9_1)(a1) +; RV32IZFH-NEXT: lui a1, 389120 +; RV32IZFH-NEXT: addi a1, a1, -1 +; RV32IZFH-NEXT: fmv.w.x fa5, a1 ; RV32IZFH-NEXT: flt.s a1, fa5, fs0 ; RV32IZFH-NEXT: beqz a1, .LBB9_6 ; RV32IZFH-NEXT: # %bb.5: @@ -1664,8 +1678,9 @@ define i64 @test_trunc_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: # %bb.3: ; RV32IZFHMIN-NEXT: mv a2, a1 ; RV32IZFHMIN-NEXT: .LBB9_4: -; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI9_0) -; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI9_0)(a1) +; RV32IZFHMIN-NEXT: lui a1, 389120 +; RV32IZFHMIN-NEXT: addi a1, a1, -1 +; RV32IZFHMIN-NEXT: fmv.w.x fa5, a1 ; RV32IZFHMIN-NEXT: flt.s a1, fa5, fs0 ; RV32IZFHMIN-NEXT: beqz a1, .LBB9_6 ; RV32IZFHMIN-NEXT: # %bb.5: @@ -1922,8 +1937,9 @@ define signext i32 @test_trunc_ui32(half %x) { define i64 @test_trunc_ui64(half %x) nounwind { ; RV32IZFH-LABEL: test_trunc_ui64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI11_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB11_2 @@ -1942,10 +1958,11 @@ define i64 @test_trunc_ui64(half %x) nounwind { ; RV32IZFH-NEXT: neg s0, a0 ; RV32IZFH-NEXT: fmv.s fa0, fs0 ; RV32IZFH-NEXT: call __fixunssfdi -; RV32IZFH-NEXT: lui a2, %hi(.LCPI11_1) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI11_1)(a2) ; RV32IZFH-NEXT: and a0, s0, a0 +; RV32IZFH-NEXT: lui a2, 391168 ; RV32IZFH-NEXT: and a1, s0, a1 +; RV32IZFH-NEXT: addi a2, a2, -1 +; RV32IZFH-NEXT: fmv.w.x fa5, a2 ; RV32IZFH-NEXT: flt.s a2, fa5, fs0 ; RV32IZFH-NEXT: neg a2, a2 ; RV32IZFH-NEXT: or a0, a2, a0 @@ -2043,10 +2060,11 @@ define i64 @test_trunc_ui64(half %x) nounwind { ; RV32IZFHMIN-NEXT: neg s0, a0 ; RV32IZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IZFHMIN-NEXT: call __fixunssfdi -; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI11_0) -; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI11_0)(a2) 
; RV32IZFHMIN-NEXT: and a0, s0, a0 +; RV32IZFHMIN-NEXT: lui a2, 391168 ; RV32IZFHMIN-NEXT: and a1, s0, a1 +; RV32IZFHMIN-NEXT: addi a2, a2, -1 +; RV32IZFHMIN-NEXT: fmv.w.x fa5, a2 ; RV32IZFHMIN-NEXT: flt.s a2, fa5, fs0 ; RV32IZFHMIN-NEXT: neg a2, a2 ; RV32IZFHMIN-NEXT: or a0, a2, a0 @@ -2219,8 +2237,9 @@ define signext i32 @test_round_si32(half %x) { define i64 @test_round_si64(half %x) nounwind { ; RV32IZFH-LABEL: test_round_si64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI13_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI13_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB13_2 @@ -2245,8 +2264,9 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZFH-NEXT: # %bb.3: ; RV32IZFH-NEXT: mv a2, a1 ; RV32IZFH-NEXT: .LBB13_4: -; RV32IZFH-NEXT: lui a1, %hi(.LCPI13_1) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI13_1)(a1) +; RV32IZFH-NEXT: lui a1, 389120 +; RV32IZFH-NEXT: addi a1, a1, -1 +; RV32IZFH-NEXT: fmv.w.x fa5, a1 ; RV32IZFH-NEXT: flt.s a1, fa5, fs0 ; RV32IZFH-NEXT: beqz a1, .LBB13_6 ; RV32IZFH-NEXT: # %bb.5: @@ -2372,8 +2392,9 @@ define i64 @test_round_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: # %bb.3: ; RV32IZFHMIN-NEXT: mv a2, a1 ; RV32IZFHMIN-NEXT: .LBB13_4: -; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI13_0) -; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI13_0)(a1) +; RV32IZFHMIN-NEXT: lui a1, 389120 +; RV32IZFHMIN-NEXT: addi a1, a1, -1 +; RV32IZFHMIN-NEXT: fmv.w.x fa5, a1 ; RV32IZFHMIN-NEXT: flt.s a1, fa5, fs0 ; RV32IZFHMIN-NEXT: beqz a1, .LBB13_6 ; RV32IZFHMIN-NEXT: # %bb.5: @@ -2630,8 +2651,9 @@ define signext i32 @test_round_ui32(half %x) { define i64 @test_round_ui64(half %x) nounwind { ; RV32IZFH-LABEL: test_round_ui64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI15_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI15_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB15_2 @@ -2650,10 +2672,11 @@ define i64 @test_round_ui64(half %x) nounwind { ; RV32IZFH-NEXT: neg s0, a0 ; RV32IZFH-NEXT: fmv.s fa0, fs0 ; RV32IZFH-NEXT: call __fixunssfdi -; RV32IZFH-NEXT: lui a2, %hi(.LCPI15_1) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI15_1)(a2) ; RV32IZFH-NEXT: and a0, s0, a0 +; RV32IZFH-NEXT: lui a2, 391168 ; RV32IZFH-NEXT: and a1, s0, a1 +; RV32IZFH-NEXT: addi a2, a2, -1 +; RV32IZFH-NEXT: fmv.w.x fa5, a2 ; RV32IZFH-NEXT: flt.s a2, fa5, fs0 ; RV32IZFH-NEXT: neg a2, a2 ; RV32IZFH-NEXT: or a0, a2, a0 @@ -2751,10 +2774,11 @@ define i64 @test_round_ui64(half %x) nounwind { ; RV32IZFHMIN-NEXT: neg s0, a0 ; RV32IZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IZFHMIN-NEXT: call __fixunssfdi -; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI15_0) -; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI15_0)(a2) ; RV32IZFHMIN-NEXT: and a0, s0, a0 +; RV32IZFHMIN-NEXT: lui a2, 391168 ; RV32IZFHMIN-NEXT: and a1, s0, a1 +; RV32IZFHMIN-NEXT: addi a2, a2, -1 +; RV32IZFHMIN-NEXT: fmv.w.x fa5, a2 ; RV32IZFHMIN-NEXT: flt.s a2, fa5, fs0 ; RV32IZFHMIN-NEXT: neg a2, a2 ; RV32IZFHMIN-NEXT: or a0, a2, a0 @@ -2927,8 +2951,9 @@ define signext i32 @test_roundeven_si32(half %x) { define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZFH-LABEL: test_roundeven_si64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI17_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI17_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; 
RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB17_2 @@ -2953,8 +2978,9 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZFH-NEXT: # %bb.3: ; RV32IZFH-NEXT: mv a2, a1 ; RV32IZFH-NEXT: .LBB17_4: -; RV32IZFH-NEXT: lui a1, %hi(.LCPI17_1) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI17_1)(a1) +; RV32IZFH-NEXT: lui a1, 389120 +; RV32IZFH-NEXT: addi a1, a1, -1 +; RV32IZFH-NEXT: fmv.w.x fa5, a1 ; RV32IZFH-NEXT: flt.s a1, fa5, fs0 ; RV32IZFH-NEXT: beqz a1, .LBB17_6 ; RV32IZFH-NEXT: # %bb.5: @@ -3080,8 +3106,9 @@ define i64 @test_roundeven_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: # %bb.3: ; RV32IZFHMIN-NEXT: mv a2, a1 ; RV32IZFHMIN-NEXT: .LBB17_4: -; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI17_0) -; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI17_0)(a1) +; RV32IZFHMIN-NEXT: lui a1, 389120 +; RV32IZFHMIN-NEXT: addi a1, a1, -1 +; RV32IZFHMIN-NEXT: fmv.w.x fa5, a1 ; RV32IZFHMIN-NEXT: flt.s a1, fa5, fs0 ; RV32IZFHMIN-NEXT: beqz a1, .LBB17_6 ; RV32IZFHMIN-NEXT: # %bb.5: @@ -3338,8 +3365,9 @@ define signext i32 @test_roundeven_ui32(half %x) { define i64 @test_roundeven_ui64(half %x) nounwind { ; RV32IZFH-LABEL: test_roundeven_ui64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI19_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI19_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB19_2 @@ -3358,10 +3386,11 @@ define i64 @test_roundeven_ui64(half %x) nounwind { ; RV32IZFH-NEXT: neg s0, a0 ; RV32IZFH-NEXT: fmv.s fa0, fs0 ; RV32IZFH-NEXT: call __fixunssfdi -; RV32IZFH-NEXT: lui a2, %hi(.LCPI19_1) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI19_1)(a2) ; RV32IZFH-NEXT: and a0, s0, a0 +; RV32IZFH-NEXT: lui a2, 391168 ; RV32IZFH-NEXT: and a1, s0, a1 +; RV32IZFH-NEXT: addi a2, a2, -1 +; RV32IZFH-NEXT: fmv.w.x fa5, a2 ; RV32IZFH-NEXT: flt.s a2, fa5, fs0 ; RV32IZFH-NEXT: neg a2, a2 ; RV32IZFH-NEXT: or a0, a2, a0 @@ -3459,10 +3488,11 @@ define i64 @test_roundeven_ui64(half %x) nounwind { ; RV32IZFHMIN-NEXT: neg s0, a0 ; RV32IZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IZFHMIN-NEXT: call __fixunssfdi -; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI19_0) -; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI19_0)(a2) ; RV32IZFHMIN-NEXT: and a0, s0, a0 +; RV32IZFHMIN-NEXT: lui a2, 391168 ; RV32IZFHMIN-NEXT: and a1, s0, a1 +; RV32IZFHMIN-NEXT: addi a2, a2, -1 +; RV32IZFHMIN-NEXT: fmv.w.x fa5, a2 ; RV32IZFHMIN-NEXT: flt.s a2, fa5, fs0 ; RV32IZFHMIN-NEXT: neg a2, a2 ; RV32IZFHMIN-NEXT: or a0, a2, a0 @@ -3635,8 +3665,9 @@ define signext i32 @test_rint_si32(half %x) { define i64 @test_rint_si64(half %x) nounwind { ; RV32IZFH-LABEL: test_rint_si64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI21_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI21_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB21_2 @@ -3661,8 +3692,9 @@ define i64 @test_rint_si64(half %x) nounwind { ; RV32IZFH-NEXT: # %bb.3: ; RV32IZFH-NEXT: mv a2, a1 ; RV32IZFH-NEXT: .LBB21_4: -; RV32IZFH-NEXT: lui a1, %hi(.LCPI21_1) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI21_1)(a1) +; RV32IZFH-NEXT: lui a1, 389120 +; RV32IZFH-NEXT: addi a1, a1, -1 +; RV32IZFH-NEXT: fmv.w.x fa5, a1 ; RV32IZFH-NEXT: flt.s a1, fa5, fs0 ; RV32IZFH-NEXT: beqz a1, .LBB21_6 ; RV32IZFH-NEXT: # %bb.5: @@ -3788,8 +3820,9 @@ define i64 @test_rint_si64(half %x) nounwind { ; RV32IZFHMIN-NEXT: # %bb.3: ; 
RV32IZFHMIN-NEXT: mv a2, a1 ; RV32IZFHMIN-NEXT: .LBB21_4: -; RV32IZFHMIN-NEXT: lui a1, %hi(.LCPI21_0) -; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI21_0)(a1) +; RV32IZFHMIN-NEXT: lui a1, 389120 +; RV32IZFHMIN-NEXT: addi a1, a1, -1 +; RV32IZFHMIN-NEXT: fmv.w.x fa5, a1 ; RV32IZFHMIN-NEXT: flt.s a1, fa5, fs0 ; RV32IZFHMIN-NEXT: beqz a1, .LBB21_6 ; RV32IZFHMIN-NEXT: # %bb.5: @@ -4046,8 +4079,9 @@ define signext i32 @test_rint_ui32(half %x) { define i64 @test_rint_ui64(half %x) nounwind { ; RV32IZFH-LABEL: test_rint_ui64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI23_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI23_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB23_2 @@ -4066,10 +4100,11 @@ define i64 @test_rint_ui64(half %x) nounwind { ; RV32IZFH-NEXT: neg s0, a0 ; RV32IZFH-NEXT: fmv.s fa0, fs0 ; RV32IZFH-NEXT: call __fixunssfdi -; RV32IZFH-NEXT: lui a2, %hi(.LCPI23_1) -; RV32IZFH-NEXT: flw fa5, %lo(.LCPI23_1)(a2) ; RV32IZFH-NEXT: and a0, s0, a0 +; RV32IZFH-NEXT: lui a2, 391168 ; RV32IZFH-NEXT: and a1, s0, a1 +; RV32IZFH-NEXT: addi a2, a2, -1 +; RV32IZFH-NEXT: fmv.w.x fa5, a2 ; RV32IZFH-NEXT: flt.s a2, fa5, fs0 ; RV32IZFH-NEXT: neg a2, a2 ; RV32IZFH-NEXT: or a0, a2, a0 @@ -4167,10 +4202,11 @@ define i64 @test_rint_ui64(half %x) nounwind { ; RV32IZFHMIN-NEXT: neg s0, a0 ; RV32IZFHMIN-NEXT: fmv.s fa0, fs0 ; RV32IZFHMIN-NEXT: call __fixunssfdi -; RV32IZFHMIN-NEXT: lui a2, %hi(.LCPI23_0) -; RV32IZFHMIN-NEXT: flw fa5, %lo(.LCPI23_0)(a2) ; RV32IZFHMIN-NEXT: and a0, s0, a0 +; RV32IZFHMIN-NEXT: lui a2, 391168 ; RV32IZFHMIN-NEXT: and a1, s0, a1 +; RV32IZFHMIN-NEXT: addi a2, a2, -1 +; RV32IZFHMIN-NEXT: fmv.w.x fa5, a2 ; RV32IZFHMIN-NEXT: flt.s a2, fa5, fs0 ; RV32IZFHMIN-NEXT: neg a2, a2 ; RV32IZFHMIN-NEXT: or a0, a2, a0 diff --git a/llvm/test/CodeGen/RISCV/half-round-conv.ll b/llvm/test/CodeGen/RISCV/half-round-conv.ll index 8a787ee578990..cfc997d66ec56 100644 --- a/llvm/test/CodeGen/RISCV/half-round-conv.ll +++ b/llvm/test/CodeGen/RISCV/half-round-conv.ll @@ -309,8 +309,9 @@ define signext i32 @test_floor_si32(half %x) { define i64 @test_floor_si64(half %x) { ; RV32IZFH-LABEL: test_floor_si64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI3_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB3_2 @@ -754,8 +755,9 @@ define signext i32 @test_floor_ui32(half %x) { define i64 @test_floor_ui64(half %x) { ; RV32IZFH-LABEL: test_floor_ui64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI7_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB7_2 @@ -1199,8 +1201,9 @@ define signext i32 @test_ceil_si32(half %x) { define i64 @test_ceil_si64(half %x) { ; RV32IZFH-LABEL: test_ceil_si64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI11_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB11_2 @@ -1644,8 +1647,9 @@ define signext i32 @test_ceil_ui32(half %x) { define i64 
@test_ceil_ui64(half %x) { ; RV32IZFH-LABEL: test_ceil_ui64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI15_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI15_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB15_2 @@ -2089,8 +2093,9 @@ define signext i32 @test_trunc_si32(half %x) { define i64 @test_trunc_si64(half %x) { ; RV32IZFH-LABEL: test_trunc_si64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI19_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI19_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB19_2 @@ -2534,8 +2539,9 @@ define signext i32 @test_trunc_ui32(half %x) { define i64 @test_trunc_ui64(half %x) { ; RV32IZFH-LABEL: test_trunc_ui64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI23_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI23_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB23_2 @@ -2979,8 +2985,9 @@ define signext i32 @test_round_si32(half %x) { define i64 @test_round_si64(half %x) { ; RV32IZFH-LABEL: test_round_si64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI27_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI27_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB27_2 @@ -3424,8 +3431,9 @@ define signext i32 @test_round_ui32(half %x) { define i64 @test_round_ui64(half %x) { ; RV32IZFH-LABEL: test_round_ui64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI31_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI31_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB31_2 @@ -3869,8 +3877,9 @@ define signext i32 @test_roundeven_si32(half %x) { define i64 @test_roundeven_si64(half %x) { ; RV32IZFH-LABEL: test_roundeven_si64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI35_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI35_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB35_2 @@ -4314,8 +4323,9 @@ define signext i32 @test_roundeven_ui32(half %x) { define i64 @test_roundeven_ui64(half %x) { ; RV32IZFH-LABEL: test_roundeven_ui64: ; RV32IZFH: # %bb.0: -; RV32IZFH-NEXT: lui a0, %hi(.LCPI39_0) -; RV32IZFH-NEXT: flh fa5, %lo(.LCPI39_0)(a0) +; RV32IZFH-NEXT: li a0, 25 +; RV32IZFH-NEXT: slli a0, a0, 10 +; RV32IZFH-NEXT: fmv.h.x fa5, a0 ; RV32IZFH-NEXT: fabs.h fa4, fa0 ; RV32IZFH-NEXT: flt.h a0, fa4, fa5 ; RV32IZFH-NEXT: beqz a0, .LBB39_2 @@ -4490,8 +4500,9 @@ define half @test_floor_half(half %x) { ; RV64IFD-NEXT: ret ; CHECKIZFH-LABEL: test_floor_half: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: lui a0, %hi(.LCPI40_0) -; CHECKIZFH-NEXT: flh fa5, %lo(.LCPI40_0)(a0) +; CHECKIZFH-NEXT: li a0, 25 +; CHECKIZFH-NEXT: slli a0, a0, 10 +; CHECKIZFH-NEXT: fmv.h.x fa5, a0 ; CHECKIZFH-NEXT: fabs.h fa4, fa0 ; CHECKIZFH-NEXT: flt.h a0, fa4, fa5 ; CHECKIZFH-NEXT: beqz a0, .LBB40_2 @@ -4574,8 +4585,9 
@@ define half @test_ceil_half(half %x) { ; RV64IFD-NEXT: ret ; CHECKIZFH-LABEL: test_ceil_half: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: lui a0, %hi(.LCPI41_0) -; CHECKIZFH-NEXT: flh fa5, %lo(.LCPI41_0)(a0) +; CHECKIZFH-NEXT: li a0, 25 +; CHECKIZFH-NEXT: slli a0, a0, 10 +; CHECKIZFH-NEXT: fmv.h.x fa5, a0 ; CHECKIZFH-NEXT: fabs.h fa4, fa0 ; CHECKIZFH-NEXT: flt.h a0, fa4, fa5 ; CHECKIZFH-NEXT: beqz a0, .LBB41_2 @@ -4658,8 +4670,9 @@ define half @test_trunc_half(half %x) { ; RV64IFD-NEXT: ret ; CHECKIZFH-LABEL: test_trunc_half: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: lui a0, %hi(.LCPI42_0) -; CHECKIZFH-NEXT: flh fa5, %lo(.LCPI42_0)(a0) +; CHECKIZFH-NEXT: li a0, 25 +; CHECKIZFH-NEXT: slli a0, a0, 10 +; CHECKIZFH-NEXT: fmv.h.x fa5, a0 ; CHECKIZFH-NEXT: fabs.h fa4, fa0 ; CHECKIZFH-NEXT: flt.h a0, fa4, fa5 ; CHECKIZFH-NEXT: beqz a0, .LBB42_2 @@ -4742,8 +4755,9 @@ define half @test_round_half(half %x) { ; RV64IFD-NEXT: ret ; CHECKIZFH-LABEL: test_round_half: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: lui a0, %hi(.LCPI43_0) -; CHECKIZFH-NEXT: flh fa5, %lo(.LCPI43_0)(a0) +; CHECKIZFH-NEXT: li a0, 25 +; CHECKIZFH-NEXT: slli a0, a0, 10 +; CHECKIZFH-NEXT: fmv.h.x fa5, a0 ; CHECKIZFH-NEXT: fabs.h fa4, fa0 ; CHECKIZFH-NEXT: flt.h a0, fa4, fa5 ; CHECKIZFH-NEXT: beqz a0, .LBB43_2 @@ -4826,8 +4840,9 @@ define half @test_roundeven_half(half %x) { ; RV64IFD-NEXT: ret ; CHECKIZFH-LABEL: test_roundeven_half: ; CHECKIZFH: # %bb.0: -; CHECKIZFH-NEXT: lui a0, %hi(.LCPI44_0) -; CHECKIZFH-NEXT: flh fa5, %lo(.LCPI44_0)(a0) +; CHECKIZFH-NEXT: li a0, 25 +; CHECKIZFH-NEXT: slli a0, a0, 10 +; CHECKIZFH-NEXT: fmv.h.x fa5, a0 ; CHECKIZFH-NEXT: fabs.h fa4, fa0 ; CHECKIZFH-NEXT: flt.h a0, fa4, fa5 ; CHECKIZFH-NEXT: beqz a0, .LBB44_2 diff --git a/llvm/test/CodeGen/RISCV/half-select-fcmp.ll b/llvm/test/CodeGen/RISCV/half-select-fcmp.ll index bf535b1cbd084..e9699502ed3a9 100644 --- a/llvm/test/CodeGen/RISCV/half-select-fcmp.ll +++ b/llvm/test/CodeGen/RISCV/half-select-fcmp.ll @@ -878,8 +878,9 @@ define signext i32 @select_fcmp_uge_1_2(half %a, half %b) nounwind { define half @CascadedSelect(half noundef %a) { ; CHECK-LABEL: CascadedSelect: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI20_0)(a0) +; CHECK-NEXT: li a0, 15 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: flt.h a0, fa5, fa0 ; CHECK-NEXT: bnez a0, .LBB20_3 ; CHECK-NEXT: # %bb.1: # %entry @@ -910,23 +911,24 @@ define half @CascadedSelect(half noundef %a) { ; ; CHECKIZFHMIN-LABEL: CascadedSelect: ; CHECKIZFHMIN: # %bb.0: # %entry -; CHECKIZFHMIN-NEXT: lui a0, %hi(.LCPI20_0) -; CHECKIZFHMIN-NEXT: flh fa5, %lo(.LCPI20_0)(a0) -; CHECKIZFHMIN-NEXT: fcvt.s.h fa3, fa5 -; CHECKIZFHMIN-NEXT: fcvt.s.h fa4, fa0 -; CHECKIZFHMIN-NEXT: flt.s a0, fa3, fa4 -; CHECKIZFHMIN-NEXT: bnez a0, .LBB20_3 -; CHECKIZFHMIN-NEXT: # %bb.1: # %entry -; CHECKIZFHMIN-NEXT: fmv.w.x fa5, zero +; CHECKIZFHMIN-NEXT: fcvt.s.h fa5, fa0 +; CHECKIZFHMIN-NEXT: lui a0, 260096 +; CHECKIZFHMIN-NEXT: fmv.w.x fa4, zero +; CHECKIZFHMIN-NEXT: flt.s a1, fa5, fa4 +; CHECKIZFHMIN-NEXT: fmv.w.x fa4, a0 ; CHECKIZFHMIN-NEXT: flt.s a0, fa4, fa5 +; CHECKIZFHMIN-NEXT: bnez a1, .LBB20_3 +; CHECKIZFHMIN-NEXT: # %bb.1: # %entry ; CHECKIZFHMIN-NEXT: bnez a0, .LBB20_4 -; CHECKIZFHMIN-NEXT: # %bb.2: # %entry -; CHECKIZFHMIN-NEXT: fmv.s fa5, fa0 -; CHECKIZFHMIN-NEXT: .LBB20_3: # %entry -; CHECKIZFHMIN-NEXT: fmv.s fa0, fa5 +; CHECKIZFHMIN-NEXT: .LBB20_2: # %entry ; CHECKIZFHMIN-NEXT: ret -; CHECKIZFHMIN-NEXT: .LBB20_4: +; CHECKIZFHMIN-NEXT: .LBB20_3: ; 
CHECKIZFHMIN-NEXT: fmv.h.x fa0, zero +; CHECKIZFHMIN-NEXT: beqz a0, .LBB20_2 +; CHECKIZFHMIN-NEXT: .LBB20_4: +; CHECKIZFHMIN-NEXT: li a0, 15 +; CHECKIZFHMIN-NEXT: slli a0, a0, 10 +; CHECKIZFHMIN-NEXT: fmv.h.x fa0, a0 ; CHECKIZFHMIN-NEXT: ret ; ; CHECKIZHINXMIN-LABEL: CascadedSelect: diff --git a/llvm/test/CodeGen/RISCV/half-zfa-fli.ll b/llvm/test/CodeGen/RISCV/half-zfa-fli.ll index 281a873235623..928535d79f02c 100644 --- a/llvm/test/CodeGen/RISCV/half-zfa-fli.ll +++ b/llvm/test/CodeGen/RISCV/half-zfa-fli.ll @@ -16,8 +16,9 @@ define half @loadfpimm1() { ; ; ZFHMIN-LABEL: loadfpimm1: ; ZFHMIN: # %bb.0: -; ZFHMIN-NEXT: lui a0, %hi(.LCPI0_0) -; ZFHMIN-NEXT: flh fa0, %lo(.LCPI0_0)(a0) +; ZFHMIN-NEXT: li a0, 11 +; ZFHMIN-NEXT: slli a0, a0, 10 +; ZFHMIN-NEXT: fmv.h.x fa0, a0 ; ZFHMIN-NEXT: ret ret half 0.0625 } @@ -30,8 +31,9 @@ define half @loadfpimm2() { ; ; ZFHMIN-LABEL: loadfpimm2: ; ZFHMIN: # %bb.0: -; ZFHMIN-NEXT: lui a0, %hi(.LCPI1_0) -; ZFHMIN-NEXT: flh fa0, %lo(.LCPI1_0)(a0) +; ZFHMIN-NEXT: li a0, 29 +; ZFHMIN-NEXT: slli a0, a0, 9 +; ZFHMIN-NEXT: fmv.h.x fa0, a0 ; ZFHMIN-NEXT: ret ret half 0.75 } @@ -44,8 +46,9 @@ define half @loadfpimm3() { ; ; ZFHMIN-LABEL: loadfpimm3: ; ZFHMIN: # %bb.0: -; ZFHMIN-NEXT: lui a0, %hi(.LCPI2_0) -; ZFHMIN-NEXT: flh fa0, %lo(.LCPI2_0)(a0) +; ZFHMIN-NEXT: lui a0, 4 +; ZFHMIN-NEXT: addi a0, a0, -768 +; ZFHMIN-NEXT: fmv.h.x fa0, a0 ; ZFHMIN-NEXT: ret ret half 1.25 } @@ -58,8 +61,9 @@ define half @loadfpimm4() { ; ; ZFHMIN-LABEL: loadfpimm4: ; ZFHMIN: # %bb.0: -; ZFHMIN-NEXT: lui a0, %hi(.LCPI3_0) -; ZFHMIN-NEXT: flh fa0, %lo(.LCPI3_0)(a0) +; ZFHMIN-NEXT: lui a0, 4 +; ZFHMIN-NEXT: addi a0, a0, 512 +; ZFHMIN-NEXT: fmv.h.x fa0, a0 ; ZFHMIN-NEXT: ret ret half 3.0 } @@ -72,8 +76,9 @@ define half @loadfpimm5() { ; ; ZFHMIN-LABEL: loadfpimm5: ; ZFHMIN: # %bb.0: -; ZFHMIN-NEXT: lui a0, %hi(.LCPI4_0) -; ZFHMIN-NEXT: flh fa0, %lo(.LCPI4_0)(a0) +; ZFHMIN-NEXT: li a0, 23 +; ZFHMIN-NEXT: slli a0, a0, 10 +; ZFHMIN-NEXT: fmv.h.x fa0, a0 ; ZFHMIN-NEXT: ret ret half 256.0 } @@ -86,8 +91,9 @@ define half @loadfpimm6() { ; ; ZFHMIN-LABEL: loadfpimm6: ; ZFHMIN: # %bb.0: -; ZFHMIN-NEXT: lui a0, %hi(.LCPI5_0) -; ZFHMIN-NEXT: flh fa0, %lo(.LCPI5_0)(a0) +; ZFHMIN-NEXT: li a0, 31 +; ZFHMIN-NEXT: slli a0, a0, 10 +; ZFHMIN-NEXT: fmv.h.x fa0, a0 ; ZFHMIN-NEXT: ret ret half 0xH7C00 } @@ -100,8 +106,9 @@ define half @loadfpimm7() { ; ; ZFHMIN-LABEL: loadfpimm7: ; ZFHMIN: # %bb.0: -; ZFHMIN-NEXT: lui a0, %hi(.LCPI6_0) -; ZFHMIN-NEXT: flh fa0, %lo(.LCPI6_0)(a0) +; ZFHMIN-NEXT: lui a0, 8 +; ZFHMIN-NEXT: addi a0, a0, -512 +; ZFHMIN-NEXT: fmv.h.x fa0, a0 ; ZFHMIN-NEXT: ret ret half 0xH7E00 } @@ -123,14 +130,16 @@ define half @loadfpimm8() { define half @loadfpimm9() { ; CHECK-LABEL: loadfpimm9: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI8_0) -; CHECK-NEXT: flh fa0, %lo(.LCPI8_0)(a0) +; CHECK-NEXT: lui a0, 6 +; CHECK-NEXT: addi a0, a0, -1032 +; CHECK-NEXT: fmv.h.x fa0, a0 ; CHECK-NEXT: ret ; ; ZFHMIN-LABEL: loadfpimm9: ; ZFHMIN: # %bb.0: -; ZFHMIN-NEXT: lui a0, %hi(.LCPI8_0) -; ZFHMIN-NEXT: flh fa0, %lo(.LCPI8_0)(a0) +; ZFHMIN-NEXT: lui a0, 6 +; ZFHMIN-NEXT: addi a0, a0, -1032 +; ZFHMIN-NEXT: fmv.h.x fa0, a0 ; ZFHMIN-NEXT: ret ret half 255.0 } @@ -169,14 +178,16 @@ define half @loadfpimm11() { define half @loadfpimm12() { ; CHECK-LABEL: loadfpimm12: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: flh fa0, %lo(.LCPI11_0)(a0) +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addi a0, a0, -1023 +; CHECK-NEXT: fmv.h.x fa0, a0 ; CHECK-NEXT: ret ; ; ZFHMIN-LABEL: loadfpimm12: ; 
ZFHMIN: # %bb.0: -; ZFHMIN-NEXT: lui a0, %hi(.LCPI11_0) -; ZFHMIN-NEXT: flh fa0, %lo(.LCPI11_0)(a0) +; ZFHMIN-NEXT: lui a0, 8 +; ZFHMIN-NEXT: addi a0, a0, -1023 +; ZFHMIN-NEXT: fmv.h.x fa0, a0 ; ZFHMIN-NEXT: ret ret half 0xH7c01 } @@ -189,8 +200,9 @@ define half @loadfpimm13() { ; ; ZFHMIN-LABEL: loadfpimm13: ; ZFHMIN: # %bb.0: -; ZFHMIN-NEXT: lui a0, %hi(.LCPI12_0) -; ZFHMIN-NEXT: flh fa0, %lo(.LCPI12_0)(a0) +; ZFHMIN-NEXT: li a0, -17 +; ZFHMIN-NEXT: slli a0, a0, 10 +; ZFHMIN-NEXT: fmv.h.x fa0, a0 ; ZFHMIN-NEXT: ret ret half -1.0 } @@ -222,8 +234,9 @@ define half @loadfpimm15() { ; ; ZFHMIN-LABEL: loadfpimm15: ; ZFHMIN: # %bb.0: -; ZFHMIN-NEXT: lui a0, %hi(.LCPI14_0) -; ZFHMIN-NEXT: flh fa0, %lo(.LCPI14_0)(a0) +; ZFHMIN-NEXT: li a0, -31 +; ZFHMIN-NEXT: slli a0, a0, 10 +; ZFHMIN-NEXT: fmv.h.x fa0, a0 ; ZFHMIN-NEXT: ret ret half 0xH8400 } diff --git a/llvm/test/CodeGen/RISCV/half-zfa.ll b/llvm/test/CodeGen/RISCV/half-zfa.ll index 960c7c4a73e4f..90c66e7fe2ca4 100644 --- a/llvm/test/CodeGen/RISCV/half-zfa.ll +++ b/llvm/test/CodeGen/RISCV/half-zfa.ll @@ -350,12 +350,15 @@ define half @select_loadfpimm(half %x) nounwind { ; ZFHMIN-NEXT: fcvt.s.h fa5, fa0 ; ZFHMIN-NEXT: fmv.w.x fa4, zero ; ZFHMIN-NEXT: fle.s a0, fa4, fa5 -; ZFHMIN-NEXT: xori a0, a0, 1 -; ZFHMIN-NEXT: slli a0, a0, 1 -; ZFHMIN-NEXT: lui a1, %hi(.LCPI17_0) -; ZFHMIN-NEXT: addi a1, a1, %lo(.LCPI17_0) -; ZFHMIN-NEXT: add a0, a1, a0 -; ZFHMIN-NEXT: flh fa0, 0(a0) +; ZFHMIN-NEXT: beqz a0, .LBB17_2 +; ZFHMIN-NEXT: # %bb.1: # %entry +; ZFHMIN-NEXT: li a0, 7 +; ZFHMIN-NEXT: j .LBB17_3 +; ZFHMIN-NEXT: .LBB17_2: +; ZFHMIN-NEXT: li a0, -9 +; ZFHMIN-NEXT: .LBB17_3: # %entry +; ZFHMIN-NEXT: slli a0, a0, 11 +; ZFHMIN-NEXT: fmv.h.x fa0, a0 ; ZFHMIN-NEXT: ret entry: %cmp = fcmp ult half %x, 0.000000e+00 diff --git a/llvm/test/CodeGen/RISCV/idiv_large.ll b/llvm/test/CodeGen/RISCV/idiv_large.ll index fb7e4a4d103d0..9937627962208 100644 --- a/llvm/test/CodeGen/RISCV/idiv_large.ll +++ b/llvm/test/CodeGen/RISCV/idiv_large.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 < %s | FileCheck %s ; RUN: llc -mtriple=riscv64 < %s | FileCheck %s diff --git a/llvm/test/CodeGen/RISCV/repeated-fp-divisors.ll b/llvm/test/CodeGen/RISCV/repeated-fp-divisors.ll index f183c936fc672..f3b4319ccc4fa 100644 --- a/llvm/test/CodeGen/RISCV/repeated-fp-divisors.ll +++ b/llvm/test/CodeGen/RISCV/repeated-fp-divisors.ll @@ -17,8 +17,9 @@ entry: define void @two_fdivs(double %a0, double %a1, double %a2, ptr %res) { ; CHECK-LABEL: two_fdivs: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: lui a1, %hi(.LCPI1_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI1_0)(a1) +; CHECK-NEXT: li a1, 1023 +; CHECK-NEXT: slli a1, a1, 52 +; CHECK-NEXT: fmv.d.x fa5, a1 ; CHECK-NEXT: fdiv.d fa5, fa5, fa0 ; CHECK-NEXT: fmul.d fa4, fa1, fa5 ; CHECK-NEXT: fmul.d fa5, fa2, fa5 diff --git a/llvm/test/CodeGen/RISCV/rv64-double-convert.ll b/llvm/test/CodeGen/RISCV/rv64-double-convert.ll index caa6c2f8ff96f..a919452389c43 100644 --- a/llvm/test/CodeGen/RISCV/rv64-double-convert.ll +++ b/llvm/test/CodeGen/RISCV/rv64-double-convert.ll @@ -122,9 +122,10 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind { ; RV64ID-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64ID-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64ID-NEXT: fsd fs0, 8(sp) # 8-byte Folded Spill -; RV64ID-NEXT: lui a0, %hi(.LCPI4_0) -; RV64ID-NEXT: fld fa5, %lo(.LCPI4_0)(a0) ; RV64ID-NEXT: fmv.d fs0, fa0 +; RV64ID-NEXT: li a0, -449 +; RV64ID-NEXT: slli a0, a0, 53 +; RV64ID-NEXT: 
fmv.d.x fa5, a0 ; RV64ID-NEXT: fle.d s0, fa5, fa0 ; RV64ID-NEXT: call __fixdfti ; RV64ID-NEXT: li a2, -1 @@ -132,8 +133,8 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind { ; RV64ID-NEXT: # %bb.1: ; RV64ID-NEXT: slli a1, a2, 63 ; RV64ID-NEXT: .LBB4_2: -; RV64ID-NEXT: lui a3, %hi(.LCPI4_1) -; RV64ID-NEXT: fld fa5, %lo(.LCPI4_1)(a3) +; RV64ID-NEXT: lui a3, %hi(.LCPI4_0) +; RV64ID-NEXT: fld fa5, %lo(.LCPI4_0)(a3) ; RV64ID-NEXT: flt.d a3, fa5, fs0 ; RV64ID-NEXT: beqz a3, .LBB4_4 ; RV64ID-NEXT: # %bb.3: @@ -170,16 +171,17 @@ define i128 @fptosi_sat_f64_to_i128(double %a) nounwind { ; RV64IDINX-NEXT: # %bb.1: ; RV64IDINX-NEXT: slli a1, a2, 63 ; RV64IDINX-NEXT: .LBB4_2: -; RV64IDINX-NEXT: lui a3, %hi(.LCPI4_0) -; RV64IDINX-NEXT: ld a3, %lo(.LCPI4_0)(a3) +; RV64IDINX-NEXT: li a3, 575 +; RV64IDINX-NEXT: slli a3, a3, 53 +; RV64IDINX-NEXT: addi a3, a3, -1 ; RV64IDINX-NEXT: flt.d a3, a3, s0 ; RV64IDINX-NEXT: beqz a3, .LBB4_4 ; RV64IDINX-NEXT: # %bb.3: ; RV64IDINX-NEXT: srli a1, a2, 1 ; RV64IDINX-NEXT: .LBB4_4: ; RV64IDINX-NEXT: feq.d a2, s0, s0 -; RV64IDINX-NEXT: neg a3, a3 ; RV64IDINX-NEXT: neg a4, s1 +; RV64IDINX-NEXT: neg a3, a3 ; RV64IDINX-NEXT: neg a2, a2 ; RV64IDINX-NEXT: and a0, a4, a0 ; RV64IDINX-NEXT: and a1, a2, a1 @@ -267,10 +269,11 @@ define i128 @fptoui_sat_f64_to_i128(double %a) nounwind { ; RV64IDINX-NEXT: neg s1, a0 ; RV64IDINX-NEXT: mv a0, s0 ; RV64IDINX-NEXT: call __fixunsdfti -; RV64IDINX-NEXT: lui a2, %hi(.LCPI5_0) -; RV64IDINX-NEXT: ld a2, %lo(.LCPI5_0)(a2) ; RV64IDINX-NEXT: and a0, s1, a0 +; RV64IDINX-NEXT: li a2, 1151 ; RV64IDINX-NEXT: and a1, s1, a1 +; RV64IDINX-NEXT: slli a2, a2, 52 +; RV64IDINX-NEXT: addi a2, a2, -1 ; RV64IDINX-NEXT: flt.d a2, a2, s0 ; RV64IDINX-NEXT: neg a2, a2 ; RV64IDINX-NEXT: or a0, a2, a0 diff --git a/llvm/test/CodeGen/RISCV/rv64-float-convert.ll b/llvm/test/CodeGen/RISCV/rv64-float-convert.ll index ebda78528810f..0af75a789f7a2 100644 --- a/llvm/test/CodeGen/RISCV/rv64-float-convert.ll +++ b/llvm/test/CodeGen/RISCV/rv64-float-convert.ll @@ -130,16 +130,17 @@ define i128 @fptosi_sat_f32_to_i128(float %a) nounwind { ; RV64IF-NEXT: # %bb.1: ; RV64IF-NEXT: slli a1, a2, 63 ; RV64IF-NEXT: .LBB4_2: -; RV64IF-NEXT: lui a3, %hi(.LCPI4_0) -; RV64IF-NEXT: flw fa5, %lo(.LCPI4_0)(a3) +; RV64IF-NEXT: lui a3, 520192 +; RV64IF-NEXT: addi a3, a3, -1 +; RV64IF-NEXT: fmv.w.x fa5, a3 ; RV64IF-NEXT: flt.s a3, fa5, fs0 ; RV64IF-NEXT: beqz a3, .LBB4_4 ; RV64IF-NEXT: # %bb.3: ; RV64IF-NEXT: srli a1, a2, 1 ; RV64IF-NEXT: .LBB4_4: ; RV64IF-NEXT: feq.s a2, fs0, fs0 -; RV64IF-NEXT: neg a3, a3 ; RV64IF-NEXT: neg a4, s0 +; RV64IF-NEXT: neg a3, a3 ; RV64IF-NEXT: neg a2, a2 ; RV64IF-NEXT: and a0, a4, a0 ; RV64IF-NEXT: and a1, a2, a1 @@ -235,10 +236,11 @@ define i128 @fptoui_sat_f32_to_i128(float %a) nounwind { ; RV64IF-NEXT: fle.s a0, fa5, fa0 ; RV64IF-NEXT: neg s0, a0 ; RV64IF-NEXT: call __fixunssfti -; RV64IF-NEXT: lui a2, %hi(.LCPI5_0) -; RV64IF-NEXT: flw fa5, %lo(.LCPI5_0)(a2) ; RV64IF-NEXT: and a0, s0, a0 +; RV64IF-NEXT: lui a2, 522240 ; RV64IF-NEXT: and a1, s0, a1 +; RV64IF-NEXT: addi a2, a2, -1 +; RV64IF-NEXT: fmv.w.x fa5, a2 ; RV64IF-NEXT: flt.s a2, fa5, fs0 ; RV64IF-NEXT: neg a2, a2 ; RV64IF-NEXT: or a0, a2, a0 diff --git a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll index 648f3789953aa..d8f3816b85485 100644 --- a/llvm/test/CodeGen/RISCV/rv64-half-convert.ll +++ b/llvm/test/CodeGen/RISCV/rv64-half-convert.ll @@ -208,16 +208,17 @@ define i128 @fptosi_sat_f16_to_i128(half %a) nounwind { ; RV64IZFH-NEXT: # %bb.1: ; 
RV64IZFH-NEXT: slli a1, a2, 63 ; RV64IZFH-NEXT: .LBB4_2: -; RV64IZFH-NEXT: lui a3, %hi(.LCPI4_0) -; RV64IZFH-NEXT: flw fa5, %lo(.LCPI4_0)(a3) +; RV64IZFH-NEXT: lui a3, 520192 +; RV64IZFH-NEXT: addi a3, a3, -1 +; RV64IZFH-NEXT: fmv.w.x fa5, a3 ; RV64IZFH-NEXT: flt.s a3, fa5, fs0 ; RV64IZFH-NEXT: beqz a3, .LBB4_4 ; RV64IZFH-NEXT: # %bb.3: ; RV64IZFH-NEXT: srli a1, a2, 1 ; RV64IZFH-NEXT: .LBB4_4: ; RV64IZFH-NEXT: feq.s a2, fs0, fs0 -; RV64IZFH-NEXT: neg a3, a3 ; RV64IZFH-NEXT: neg a4, s0 +; RV64IZFH-NEXT: neg a3, a3 ; RV64IZFH-NEXT: neg a2, a2 ; RV64IZFH-NEXT: and a0, a4, a0 ; RV64IZFH-NEXT: and a1, a2, a1 @@ -308,23 +309,25 @@ define i128 @fptoui_sat_f16_to_i128(half %a) nounwind { ; RV64IZFH-NEXT: addi sp, sp, -32 ; RV64IZFH-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64IZFH-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; RV64IZFH-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; RV64IZFH-NEXT: lui a0, %hi(.LCPI5_0) -; RV64IZFH-NEXT: flw fa5, %lo(.LCPI5_0)(a0) -; RV64IZFH-NEXT: fcvt.s.h fa0, fa0 -; RV64IZFH-NEXT: fmv.w.x fa4, zero -; RV64IZFH-NEXT: fle.s a0, fa4, fa0 -; RV64IZFH-NEXT: flt.s a1, fa5, fa0 -; RV64IZFH-NEXT: neg s0, a1 -; RV64IZFH-NEXT: neg s1, a0 +; RV64IZFH-NEXT: fsw fs0, 12(sp) # 4-byte Folded Spill +; RV64IZFH-NEXT: fcvt.s.h fs0, fa0 +; RV64IZFH-NEXT: fmv.w.x fa5, zero +; RV64IZFH-NEXT: fle.s a0, fa5, fs0 +; RV64IZFH-NEXT: neg s0, a0 +; RV64IZFH-NEXT: fmv.s fa0, fs0 ; RV64IZFH-NEXT: call __fixunssfti -; RV64IZFH-NEXT: and a0, s1, a0 -; RV64IZFH-NEXT: and a1, s1, a1 -; RV64IZFH-NEXT: or a0, s0, a0 -; RV64IZFH-NEXT: or a1, s0, a1 +; RV64IZFH-NEXT: and a0, s0, a0 +; RV64IZFH-NEXT: lui a2, 522240 +; RV64IZFH-NEXT: and a1, s0, a1 +; RV64IZFH-NEXT: addi a2, a2, -1 +; RV64IZFH-NEXT: fmv.w.x fa5, a2 +; RV64IZFH-NEXT: flt.s a2, fa5, fs0 +; RV64IZFH-NEXT: neg a2, a2 +; RV64IZFH-NEXT: or a0, a2, a0 +; RV64IZFH-NEXT: or a1, a2, a1 ; RV64IZFH-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64IZFH-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; RV64IZFH-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64IZFH-NEXT: flw fs0, 12(sp) # 4-byte Folded Reload ; RV64IZFH-NEXT: addi sp, sp, 32 ; RV64IZFH-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll index 2fe8c8ce7975a..6507349f45a2f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ceil-vp.ll @@ -1,16 +1,16 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV32ZFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV64ZFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV32ZFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV64ZFH declare @llvm.vp.ceil.nxv1bf16(, , i32) @@ -407,10 +407,11 @@ declare @llvm.vp.ceil.nxv1f16(, 
@vp_ceil_vv_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 3 @@ -453,10 +454,11 @@ define @vp_ceil_vv_nxv1f16( %va, @vp_ceil_vv_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -493,10 +495,11 @@ declare @llvm.vp.ceil.nxv2f16(, @vp_ceil_vv_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 3 @@ -539,10 +542,11 @@ define @vp_ceil_vv_nxv2f16( %va, @vp_ceil_vv_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -579,10 +583,11 @@ declare @llvm.vp.ceil.nxv4f16(, @vp_ceil_vv_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 3 @@ -625,10 +630,11 @@ define @vp_ceil_vv_nxv4f16( %va, @vp_ceil_vv_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -667,9 +673,10 @@ define @vp_ceil_vv_nxv8f16( %va, @vp_ceil_vv_nxv8f16( %va, @vp_ceil_vv_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -755,9 
+763,10 @@ define @vp_ceil_vv_nxv16f16( %va, @vp_ceil_vv_nxv16f16( %va, @vp_ceil_vv_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -843,9 +853,10 @@ define @vp_ceil_vv_nxv32f16( %va, @vp_ceil_vv_nxv32f16( %va, @vp_ceil_vv_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_vv_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -1210,41 +1222,75 @@ define @vp_ceil_vv_nxv16f32_unmasked( declare @llvm.vp.ceil.nxv1f64(, , i32) define @vp_ceil_vv_nxv1f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_vv_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: vp_ceil_vv_nxv1f64: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a1, %hi(.LCPI34_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI34_0)(a1) +; RV32ZFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZFH-NEXT: vfabs.v v9, v8, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: vp_ceil_vv_nxv1f64: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZFH-NEXT: vfabs.v v9, v8, v0.t +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZFH-NEXT: ret %v = call @llvm.vp.ceil.nxv1f64( %va, %m, i32 %evl) ret %v } define @vp_ceil_vv_nxv1f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_vv_nxv1f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI35_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: 
vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: vp_ceil_vv_nxv1f64_unmasked: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a1, %hi(.LCPI35_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI35_0)(a1) +; RV32ZFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZFH-NEXT: vfabs.v v9, v8 +; RV32ZFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: vp_ceil_vv_nxv1f64_unmasked: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZFH-NEXT: vfabs.v v9, v8 +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZFH-NEXT: ret %v = call @llvm.vp.ceil.nxv1f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1252,43 +1298,79 @@ define @vp_ceil_vv_nxv1f64_unmasked( declare @llvm.vp.ceil.nxv2f64(, , i32) define @vp_ceil_vv_nxv2f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_vv_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI36_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: vp_ceil_vv_nxv2f64: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZFH-NEXT: vmv1r.v v10, v0 +; RV32ZFH-NEXT: lui a0, %hi(.LCPI36_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI36_0)(a0) +; RV32ZFH-NEXT: vfabs.v v12, v8, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vmv1r.v v0, v10 +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: vp_ceil_vv_nxv2f64: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZFH-NEXT: vmv1r.v v10, v0 +; RV64ZFH-NEXT: vfabs.v v12, v8, v0.t +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vmv1r.v v0, v10 +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; 
RV64ZFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZFH-NEXT: ret %v = call @llvm.vp.ceil.nxv2f64( %va, %m, i32 %evl) ret %v } define @vp_ceil_vv_nxv2f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_vv_nxv2f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI37_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: vp_ceil_vv_nxv2f64_unmasked: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a1, %hi(.LCPI37_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI37_0)(a1) +; RV32ZFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZFH-NEXT: vfabs.v v10, v8 +; RV32ZFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: vp_ceil_vv_nxv2f64_unmasked: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZFH-NEXT: vfabs.v v10, v8 +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZFH-NEXT: ret %v = call @llvm.vp.ceil.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1296,43 +1378,79 @@ define @vp_ceil_vv_nxv2f64_unmasked( declare @llvm.vp.ceil.nxv4f64(, , i32) define @vp_ceil_vv_nxv4f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_vv_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI38_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: vp_ceil_vv_nxv4f64: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZFH-NEXT: vmv1r.v v12, v0 +; RV32ZFH-NEXT: lui a0, %hi(.LCPI38_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI38_0)(a0) +; RV32ZFH-NEXT: vfabs.v v16, v8, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vmv1r.v v0, v12 +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: vp_ceil_vv_nxv4f64: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZFH-NEXT: vmv1r.v v12, v0 +; 
RV64ZFH-NEXT: vfabs.v v16, v8, v0.t +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vmv1r.v v0, v12 +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZFH-NEXT: ret %v = call @llvm.vp.ceil.nxv4f64( %va, %m, i32 %evl) ret %v } define @vp_ceil_vv_nxv4f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_vv_nxv4f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI39_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: vp_ceil_vv_nxv4f64_unmasked: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a1, %hi(.LCPI39_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI39_0)(a1) +; RV32ZFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZFH-NEXT: vfabs.v v12, v8 +; RV32ZFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: vp_ceil_vv_nxv4f64_unmasked: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZFH-NEXT: vfabs.v v12, v8 +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZFH-NEXT: ret %v = call @llvm.vp.ceil.nxv4f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1340,43 +1458,79 @@ define @vp_ceil_vv_nxv4f64_unmasked( declare @llvm.vp.ceil.nxv7f64(, , i32) define @vp_ceil_vv_nxv7f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_vv_nxv7f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI40_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: vp_ceil_vv_nxv7f64: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZFH-NEXT: vmv1r.v v16, v0 +; RV32ZFH-NEXT: lui a0, %hi(.LCPI40_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI40_0)(a0) +; RV32ZFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFH-NEXT: vmflt.vf v16, v24, 
fa5, v0.t +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vmv1r.v v0, v16 +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: vp_ceil_vv_nxv7f64: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZFH-NEXT: vmv1r.v v16, v0 +; RV64ZFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vmv1r.v v0, v16 +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZFH-NEXT: ret %v = call @llvm.vp.ceil.nxv7f64( %va, %m, i32 %evl) ret %v } define @vp_ceil_vv_nxv7f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_vv_nxv7f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI41_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: vp_ceil_vv_nxv7f64_unmasked: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a1, %hi(.LCPI41_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI41_0)(a1) +; RV32ZFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZFH-NEXT: vfabs.v v16, v8 +; RV32ZFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: vp_ceil_vv_nxv7f64_unmasked: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZFH-NEXT: vfabs.v v16, v8 +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZFH-NEXT: ret %v = call @llvm.vp.ceil.nxv7f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1384,43 +1538,79 @@ define @vp_ceil_vv_nxv7f64_unmasked( declare @llvm.vp.ceil.nxv8f64(, , i32) define @vp_ceil_vv_nxv8f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_vv_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI42_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; 
CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: vp_ceil_vv_nxv8f64: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZFH-NEXT: vmv1r.v v16, v0 +; RV32ZFH-NEXT: lui a0, %hi(.LCPI42_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI42_0)(a0) +; RV32ZFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vmv1r.v v0, v16 +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: vp_ceil_vv_nxv8f64: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZFH-NEXT: vmv1r.v v16, v0 +; RV64ZFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vmv1r.v v0, v16 +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZFH-NEXT: ret %v = call @llvm.vp.ceil.nxv8f64( %va, %m, i32 %evl) ret %v } define @vp_ceil_vv_nxv8f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_vv_nxv8f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI43_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: vp_ceil_vv_nxv8f64_unmasked: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a1, %hi(.LCPI43_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI43_0)(a1) +; RV32ZFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZFH-NEXT: vfabs.v v16, v8 +; RV32ZFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: vp_ceil_vv_nxv8f64_unmasked: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZFH-NEXT: vfabs.v v16, v8 +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZFH-NEXT: ret %v = call @llvm.vp.ceil.nxv8f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1429,87 +1619,167 @@ define @vp_ceil_vv_nxv8f64_unmasked( declare @llvm.vp.ceil.nxv16f64(, , i32) define @vp_ceil_vv_nxv16f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: 
vp_ceil_vv_nxv16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v7, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI44_0) -; CHECK-NEXT: srli a3, a1, 3 -; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a2) -; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: vslidedown.vx v6, v0, a3 -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a2, 3 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB44_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB44_2: -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: vp_ceil_vv_nxv16f64: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32ZFH-NEXT: vmv1r.v v7, v0 +; RV32ZFH-NEXT: csrr a1, vlenb +; RV32ZFH-NEXT: lui a2, %hi(.LCPI44_0) +; RV32ZFH-NEXT: srli a3, a1, 3 +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2) +; RV32ZFH-NEXT: sub a2, a0, a1 +; RV32ZFH-NEXT: vslidedown.vx v6, v0, a3 +; RV32ZFH-NEXT: sltu a3, a0, a2 +; RV32ZFH-NEXT: addi a3, a3, -1 +; RV32ZFH-NEXT: and a2, a3, a2 +; RV32ZFH-NEXT: vmv1r.v v0, v6 +; RV32ZFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZFH-NEXT: vfabs.v v24, v16, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZFH-NEXT: fsrmi a2, 3 +; RV32ZFH-NEXT: vmv1r.v v0, v6 +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZFH-NEXT: fsrm a2 +; RV32ZFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZFH-NEXT: bltu a0, a1, .LBB44_2 +; RV32ZFH-NEXT: # %bb.1: +; RV32ZFH-NEXT: mv a0, a1 +; RV32ZFH-NEXT: .LBB44_2: +; RV32ZFH-NEXT: vmv1r.v v0, v7 +; RV32ZFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vmv1r.v v0, v7 +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: vp_ceil_vv_nxv16f64: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64ZFH-NEXT: vmv1r.v v7, v0 +; RV64ZFH-NEXT: csrr a1, vlenb +; RV64ZFH-NEXT: li a2, 1075 +; RV64ZFH-NEXT: srli a3, a1, 3 +; RV64ZFH-NEXT: 
vslidedown.vx v6, v0, a3 +; RV64ZFH-NEXT: sub a3, a0, a1 +; RV64ZFH-NEXT: slli a2, a2, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a2 +; RV64ZFH-NEXT: sltu a2, a0, a3 +; RV64ZFH-NEXT: addi a2, a2, -1 +; RV64ZFH-NEXT: and a2, a2, a3 +; RV64ZFH-NEXT: vmv1r.v v0, v6 +; RV64ZFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZFH-NEXT: vfabs.v v24, v16, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZFH-NEXT: fsrmi a2, 3 +; RV64ZFH-NEXT: vmv1r.v v0, v6 +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZFH-NEXT: fsrm a2 +; RV64ZFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZFH-NEXT: bltu a0, a1, .LBB44_2 +; RV64ZFH-NEXT: # %bb.1: +; RV64ZFH-NEXT: mv a0, a1 +; RV64ZFH-NEXT: .LBB44_2: +; RV64ZFH-NEXT: vmv1r.v v0, v7 +; RV64ZFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vmv1r.v v0, v7 +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZFH-NEXT: ret %v = call @llvm.vp.ceil.nxv16f64( %va, %m, i32 %evl) ret %v } define @vp_ceil_vv_nxv16f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_vv_nxv16f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI45_0) -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a2) -; CHECK-NEXT: sltu a2, a0, a3 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a2, 3 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB45_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB45_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: vp_ceil_vv_nxv16f64_unmasked: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: csrr a1, vlenb +; RV32ZFH-NEXT: lui a2, %hi(.LCPI45_0) +; RV32ZFH-NEXT: sub a3, a0, a1 +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; RV32ZFH-NEXT: sltu a2, a0, a3 +; RV32ZFH-NEXT: addi a2, a2, -1 +; RV32ZFH-NEXT: and a2, a2, a3 +; RV32ZFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZFH-NEXT: vfabs.v v24, v16 +; RV32ZFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZFH-NEXT: fsrmi a2, 3 +; RV32ZFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZFH-NEXT: fsrm a2 +; RV32ZFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZFH-NEXT: bltu a0, a1, .LBB45_2 +; RV32ZFH-NEXT: # %bb.1: +; RV32ZFH-NEXT: mv a0, a1 +; RV32ZFH-NEXT: .LBB45_2: +; RV32ZFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; 
RV32ZFH-NEXT: vfabs.v v24, v8 +; RV32ZFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: vp_ceil_vv_nxv16f64_unmasked: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: csrr a1, vlenb +; RV64ZFH-NEXT: li a2, 1075 +; RV64ZFH-NEXT: sub a3, a0, a1 +; RV64ZFH-NEXT: slli a2, a2, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a2 +; RV64ZFH-NEXT: sltu a2, a0, a3 +; RV64ZFH-NEXT: addi a2, a2, -1 +; RV64ZFH-NEXT: and a2, a2, a3 +; RV64ZFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZFH-NEXT: vfabs.v v24, v16 +; RV64ZFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZFH-NEXT: fsrmi a2, 3 +; RV64ZFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZFH-NEXT: fsrm a2 +; RV64ZFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZFH-NEXT: bltu a0, a1, .LBB45_2 +; RV64ZFH-NEXT: # %bb.1: +; RV64ZFH-NEXT: mv a0, a1 +; RV64ZFH-NEXT: .LBB45_2: +; RV64ZFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZFH-NEXT: vfabs.v v24, v8 +; RV64ZFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZFH-NEXT: ret %v = call @llvm.vp.ceil.nxv16f64( %va, splat (i1 true), i32 %evl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll index fba27e3d548cf..ee18a426c1b12 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ctpop-vp.ll @@ -2025,7 +2025,8 @@ define @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64( %va, @vp_ctpop_nxv16i64_unmasked( %va, ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v0, (a4) # vscale x 64-byte Folded Spill ; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vsub.vv v16, v16, v24 +; RV32-NEXT: vsub.vv v24, v16, v24 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v0, a3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v16, v0 -; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v16, v24, v0 +; RV32-NEXT: vsrl.vi v24, v24, 2 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v0, (a3) # vscale x 64-byte Folded Spill -; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vadd.vv v24, v16, v24 +; RV32-NEXT: vsrl.vi v16, v24, 4 ; RV32-NEXT: vadd.vv v16, v24, v16 -; RV32-NEXT: vsrl.vi v24, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v24 ; RV32-NEXT: lui a3, 61681 ; RV32-NEXT: lui a4, 4112 ; RV32-NEXT: addi a3, a3, -241 @@ -2312,16 +2312,16 @@ define @vp_ctpop_nxv16i64_unmasked( %va, ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vsub.vv v8, v8, v24 +; RV32-NEXT: vsub.vv v24, v8, v24 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vand.vv v24, v8, v0 -; RV32-NEXT: vsrl.vi v8, v8, 2 -; RV32-NEXT: vand.vv v8, v8, v0 -; RV32-NEXT: vadd.vv v8, v24, v8 +; RV32-NEXT: vand.vv v8, v24, v0 +; RV32-NEXT: vsrl.vi v24, 
v24, 2 +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsrl.vi v24, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: csrr a0, vlenb diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll index 6bf882fe47fef..52eaa51051631 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-vp.ll @@ -2193,7 +2193,8 @@ define @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64( %va, @vp_cttz_nxv16i64_unmasked( %va, i ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vadd.vi v24, v16, -1 ; RV32-NEXT: vnot.v v16, v16 -; RV32-NEXT: vand.vv v16, v16, v24 -; RV32-NEXT: vsrl.vi v24, v16, 1 +; RV32-NEXT: vand.vv v24, v16, v24 +; RV32-NEXT: vsrl.vi v16, v24, 1 ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: li a5, 24 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs8r.v v0, (a4) # vscale x 64-byte Folded Spill -; RV32-NEXT: vand.vv v24, v24, v0 -; RV32-NEXT: vsub.vv v16, v16, v24 +; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vsub.vv v24, v24, v16 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v0, a3 ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v16, v0 -; RV32-NEXT: vsrl.vi v16, v16, 2 +; RV32-NEXT: vand.vv v16, v24, v0 +; RV32-NEXT: vsrl.vi v24, v24, 2 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 ; RV32-NEXT: vs8r.v v0, (a3) # vscale x 64-byte Folded Spill -; RV32-NEXT: vand.vv v16, v16, v0 +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: vadd.vv v24, v16, v24 +; RV32-NEXT: vsrl.vi v16, v24, 4 ; RV32-NEXT: vadd.vv v16, v24, v16 -; RV32-NEXT: vsrl.vi v24, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v24 ; RV32-NEXT: lui a3, 61681 ; RV32-NEXT: lui a4, 4112 ; RV32-NEXT: addi a3, a3, -241 diff --git a/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll index 8c63c2d4be8c1..51dc7b0714d7f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/double-round-conv.ll @@ -32,10 +32,11 @@ define @trunc_nxv1f64_to_si8( %x) { ; ; RV64-LABEL: trunc_nxv1f64_to_si8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI0_0) -; RV64-NEXT: fld fa5, %lo(.LCPI0_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -75,10 +76,11 @@ define @trunc_nxv1f64_to_ui8( %x) { ; ; RV64-LABEL: trunc_nxv1f64_to_ui8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI1_0) -; RV64-NEXT: fld fa5, %lo(.LCPI1_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -116,10 +118,11 @@ define @trunc_nxv1f64_to_si16( %x) { ; ; RV64-LABEL: trunc_nxv1f64_to_si16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI2_0) -; RV64-NEXT: fld fa5, %lo(.LCPI2_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -155,10 
+158,11 @@ define @trunc_nxv1f64_to_ui16( %x) { ; ; RV64-LABEL: trunc_nxv1f64_to_ui16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI3_0) -; RV64-NEXT: fld fa5, %lo(.LCPI3_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -274,10 +278,11 @@ define @trunc_nxv4f64_to_si8( %x) { ; ; RV64-LABEL: trunc_nxv4f64_to_si8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI8_0) -; RV64-NEXT: fld fa5, %lo(.LCPI8_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -317,10 +322,11 @@ define @trunc_nxv4f64_to_ui8( %x) { ; ; RV64-LABEL: trunc_nxv4f64_to_ui8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI9_0) -; RV64-NEXT: fld fa5, %lo(.LCPI9_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -358,10 +364,11 @@ define @trunc_nxv4f64_to_si16( %x) { ; ; RV64-LABEL: trunc_nxv4f64_to_si16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI10_0) -; RV64-NEXT: fld fa5, %lo(.LCPI10_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -397,10 +404,11 @@ define @trunc_nxv4f64_to_ui16( %x) { ; ; RV64-LABEL: trunc_nxv4f64_to_ui16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI11_0) -; RV64-NEXT: fld fa5, %lo(.LCPI11_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -518,10 +526,11 @@ define @ceil_nxv1f64_to_si8( %x) { ; ; RV64-LABEL: ceil_nxv1f64_to_si8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI16_0) -; RV64-NEXT: fld fa5, %lo(.LCPI16_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: fsrmi a0, 3 ; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -565,10 +574,11 @@ define @ceil_nxv1f64_to_ui8( %x) { ; ; RV64-LABEL: ceil_nxv1f64_to_ui8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI17_0) -; RV64-NEXT: fld fa5, %lo(.LCPI17_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: fsrmi a0, 3 ; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -610,10 +620,11 @@ define @ceil_nxv1f64_to_si16( %x) { ; ; RV64-LABEL: ceil_nxv1f64_to_si16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI18_0) -; RV64-NEXT: fld fa5, %lo(.LCPI18_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli 
a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: fsrmi a0, 3 ; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -653,10 +664,11 @@ define @ceil_nxv1f64_to_ui16( %x) { ; ; RV64-LABEL: ceil_nxv1f64_to_ui16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI19_0) -; RV64-NEXT: fld fa5, %lo(.LCPI19_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: fsrmi a0, 3 ; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -792,10 +804,11 @@ define @ceil_nxv4f64_to_si8( %x) { ; ; RV64-LABEL: ceil_nxv4f64_to_si8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI24_0) -; RV64-NEXT: fld fa5, %lo(.LCPI24_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: fsrmi a0, 3 ; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -839,10 +852,11 @@ define @ceil_nxv4f64_to_ui8( %x) { ; ; RV64-LABEL: ceil_nxv4f64_to_ui8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI25_0) -; RV64-NEXT: fld fa5, %lo(.LCPI25_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: fsrmi a0, 3 ; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -884,10 +898,11 @@ define @ceil_nxv4f64_to_si16( %x) { ; ; RV64-LABEL: ceil_nxv4f64_to_si16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI26_0) -; RV64-NEXT: fld fa5, %lo(.LCPI26_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: fsrmi a0, 3 ; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -927,10 +942,11 @@ define @ceil_nxv4f64_to_ui16( %x) { ; ; RV64-LABEL: ceil_nxv4f64_to_ui16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI27_0) -; RV64-NEXT: fld fa5, %lo(.LCPI27_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: fsrmi a0, 3 ; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -1064,10 +1080,11 @@ define @rint_nxv1f64_to_si8( %x) { ; ; RV64-LABEL: rint_nxv1f64_to_si8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI32_0) -; RV64-NEXT: fld fa5, %lo(.LCPI32_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1107,10 +1124,11 @@ define @rint_nxv1f64_to_ui8( %x) { ; ; RV64-LABEL: rint_nxv1f64_to_ui8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI33_0) -; RV64-NEXT: fld fa5, %lo(.LCPI33_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1148,10 +1166,11 @@ define @rint_nxv1f64_to_si16( %x) { ; ; RV64-LABEL: rint_nxv1f64_to_si16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI34_0) -; RV64-NEXT: fld fa5, %lo(.LCPI34_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, 
m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1187,10 +1206,11 @@ define @rint_nxv1f64_to_ui16( %x) { ; ; RV64-LABEL: rint_nxv1f64_to_ui16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI35_0) -; RV64-NEXT: fld fa5, %lo(.LCPI35_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v9, fa5 ; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -1306,10 +1326,11 @@ define @rint_nxv4f64_to_si8( %x) { ; ; RV64-LABEL: rint_nxv4f64_to_si8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI40_0) -; RV64-NEXT: fld fa5, %lo(.LCPI40_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -1349,10 +1370,11 @@ define @rint_nxv4f64_to_ui8( %x) { ; ; RV64-LABEL: rint_nxv4f64_to_ui8: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI41_0) -; RV64-NEXT: fld fa5, %lo(.LCPI41_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -1390,10 +1412,11 @@ define @rint_nxv4f64_to_si16( %x) { ; ; RV64-LABEL: rint_nxv4f64_to_si16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI42_0) -; RV64-NEXT: fld fa5, %lo(.LCPI42_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -1429,10 +1452,11 @@ define @rint_nxv4f64_to_ui16( %x) { ; ; RV64-LABEL: rint_nxv4f64_to_ui16: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, %hi(.LCPI43_0) -; RV64-NEXT: fld fa5, %lo(.LCPI43_0)(a0) ; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 ; RV64-NEXT: vmflt.vf v0, v12, fa5 ; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t ; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll index 1626b362fed15..316a84f98be2b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fceil-constrained-sdnode.ll @@ -1,18 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s define @ceil_nxv1f16( %x) strictfp { ; CHECK-LABEL: ceil_nxv1f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, 
%hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -32,10 +33,11 @@ define @ceil_nxv2f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -55,10 +57,11 @@ define @ceil_nxv4f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -78,10 +81,11 @@ define @ceil_nxv8f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -101,10 +105,11 @@ define @ceil_nxv16f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v12, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma @@ -124,10 +129,11 @@ define @ceil_nxv32f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma @@ -258,92 +264,168 @@ define @ceil_nxv16f32( %x) strictfp { declare @llvm.experimental.constrained.ceil.nxv16f32(, metadata) define @ceil_nxv1f64( %x) strictfp { -; CHECK-LABEL: ceil_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, 
e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: ceil_nxv1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI11_0) +; RV32-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: ceil_nxv1f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.ceil.nxv1f64( %x, metadata !"fpexcept.strict") ret %a } declare @llvm.experimental.constrained.ceil.nxv1f64(, metadata) define @ceil_nxv2f64( %x) strictfp { -; CHECK-LABEL: ceil_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: ceil_nxv2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI12_0) +; RV32-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v10, v8 +; RV32-NEXT: vmflt.vf v0, v10, fa5 +; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: ceil_nxv2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v10, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v10, fa5 +; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.ceil.nxv2f64( %x, metadata !"fpexcept.strict") ret %a } declare @llvm.experimental.constrained.ceil.nxv2f64(, metadata) define @ceil_nxv4f64( %x) strictfp { -; CHECK-LABEL: ceil_nxv4f64: -; CHECK: # 
%bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: ceil_nxv4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v12, v8 +; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: ceil_nxv4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.ceil.nxv4f64( %x, metadata !"fpexcept.strict") ret %a } declare @llvm.experimental.constrained.ceil.nxv4f64(, metadata) define @ceil_nxv8f64( %x) strictfp { -; CHECK-LABEL: ceil_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: ceil_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI14_0) +; RV32-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v16, v8 +; RV32-NEXT: vmflt.vf v0, v16, fa5 +; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: ceil_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v16, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v16, fa5 +; RV64-NEXT: 
fsrmi a0, 3 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.ceil.nxv8f64( %x, metadata !"fpexcept.strict") ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll index 4aca2d694dfbb..56edec1cc7a68 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fceil-sdnode.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV32ZFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV64ZFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV32ZFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV64ZFHMIN define @ceil_nxv1bf16( %x) { ; CHECK-LABEL: ceil_nxv1bf16: @@ -167,10 +167,11 @@ define @ceil_nxv32bf16( %x) { define @ceil_nxv1f16( %x) { ; ZVFH-LABEL: ceil_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -206,10 +207,11 @@ declare @llvm.ceil.nxv1f16() define @ceil_nxv2f16( %x) { ; ZVFH-LABEL: ceil_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -245,10 +247,11 @@ declare @llvm.ceil.nxv2f16() define @ceil_nxv4f16( %x) { ; ZVFH-LABEL: ceil_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -284,10 +287,11 @@ declare @llvm.ceil.nxv4f16() define @ceil_nxv8f16( %x) { ; ZVFH-LABEL: ceil_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -323,10 +327,11 @@ declare 
@llvm.ceil.nxv8f16() define @ceil_nxv16f16( %x) { ; ZVFH-LABEL: ceil_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -362,10 +367,11 @@ declare @llvm.ceil.nxv16f16() define @ceil_nxv32f16( %x) { ; ZVFH-LABEL: ceil_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -513,80 +519,268 @@ define @ceil_nxv16f32( %x) { declare @llvm.ceil.nxv16f32() define @ceil_nxv1f64( %x) { -; CHECK-LABEL: ceil_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: ceil_nxv1f64: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a0, %hi(.LCPI17_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI17_0)(a0) +; RV32ZFH-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32ZFH-NEXT: vfabs.v v9, v8 +; RV32ZFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: ceil_nxv1f64: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64ZFH-NEXT: vfabs.v v9, v8 +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZFH-NEXT: ret +; +; RV32ZFHMIN-LABEL: ceil_nxv1f64: +; RV32ZFHMIN: # %bb.0: +; RV32ZFHMIN-NEXT: lui a0, %hi(.LCPI17_0) +; RV32ZFHMIN-NEXT: fld fa5, %lo(.LCPI17_0)(a0) +; RV32ZFHMIN-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32ZFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZFHMIN-NEXT: fsrmi a0, 3 +; RV32ZFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZFHMIN-NEXT: fsrm a0 +; RV32ZFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZFHMIN-NEXT: ret +; +; RV64ZFHMIN-LABEL: ceil_nxv1f64: +; RV64ZFHMIN: # %bb.0: +; RV64ZFHMIN-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64ZFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZFHMIN-NEXT: li a0, 1075 +; RV64ZFHMIN-NEXT: slli a0, a0, 52 +; RV64ZFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZFHMIN-NEXT: fsrmi a0, 3 +; RV64ZFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZFHMIN-NEXT: fsrm a0 +; RV64ZFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; 
RV64ZFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZFHMIN-NEXT: ret %a = call @llvm.ceil.nxv1f64( %x) ret %a } declare @llvm.ceil.nxv1f64() define @ceil_nxv2f64( %x) { -; CHECK-LABEL: ceil_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: ceil_nxv2f64: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZFH-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32ZFH-NEXT: vfabs.v v10, v8 +; RV32ZFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: ceil_nxv2f64: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64ZFH-NEXT: vfabs.v v10, v8 +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZFH-NEXT: ret +; +; RV32ZFHMIN-LABEL: ceil_nxv2f64: +; RV32ZFHMIN: # %bb.0: +; RV32ZFHMIN-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZFHMIN-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZFHMIN-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32ZFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZFHMIN-NEXT: fsrmi a0, 3 +; RV32ZFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZFHMIN-NEXT: fsrm a0 +; RV32ZFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZFHMIN-NEXT: ret +; +; RV64ZFHMIN-LABEL: ceil_nxv2f64: +; RV64ZFHMIN: # %bb.0: +; RV64ZFHMIN-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64ZFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZFHMIN-NEXT: li a0, 1075 +; RV64ZFHMIN-NEXT: slli a0, a0, 52 +; RV64ZFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZFHMIN-NEXT: fsrmi a0, 3 +; RV64ZFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZFHMIN-NEXT: fsrm a0 +; RV64ZFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZFHMIN-NEXT: ret %a = call @llvm.ceil.nxv2f64( %x) ret %a } declare @llvm.ceil.nxv2f64() define @ceil_nxv4f64( %x) { -; CHECK-LABEL: ceil_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: ceil_nxv4f64: +; RV32ZFH: 
# %bb.0: +; RV32ZFH-NEXT: lui a0, %hi(.LCPI19_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI19_0)(a0) +; RV32ZFH-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32ZFH-NEXT: vfabs.v v12, v8 +; RV32ZFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: ceil_nxv4f64: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64ZFH-NEXT: vfabs.v v12, v8 +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZFH-NEXT: ret +; +; RV32ZFHMIN-LABEL: ceil_nxv4f64: +; RV32ZFHMIN: # %bb.0: +; RV32ZFHMIN-NEXT: lui a0, %hi(.LCPI19_0) +; RV32ZFHMIN-NEXT: fld fa5, %lo(.LCPI19_0)(a0) +; RV32ZFHMIN-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32ZFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZFHMIN-NEXT: fsrmi a0, 3 +; RV32ZFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZFHMIN-NEXT: fsrm a0 +; RV32ZFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZFHMIN-NEXT: ret +; +; RV64ZFHMIN-LABEL: ceil_nxv4f64: +; RV64ZFHMIN: # %bb.0: +; RV64ZFHMIN-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64ZFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZFHMIN-NEXT: li a0, 1075 +; RV64ZFHMIN-NEXT: slli a0, a0, 52 +; RV64ZFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZFHMIN-NEXT: fsrmi a0, 3 +; RV64ZFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZFHMIN-NEXT: fsrm a0 +; RV64ZFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZFHMIN-NEXT: ret %a = call @llvm.ceil.nxv4f64( %x) ret %a } declare @llvm.ceil.nxv4f64() define @ceil_nxv8f64( %x) { -; CHECK-LABEL: ceil_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: ceil_nxv8f64: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZFH-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32ZFH-NEXT: vfabs.v v16, v8 +; RV32ZFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZFH-NEXT: fsrmi a0, 3 +; RV32ZFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: ceil_nxv8f64: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64ZFH-NEXT: vfabs.v v16, v8 +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vmflt.vf v0, v16, fa5 +; 
RV64ZFH-NEXT: fsrmi a0, 3 +; RV64ZFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZFH-NEXT: ret +; +; RV32ZFHMIN-LABEL: ceil_nxv8f64: +; RV32ZFHMIN: # %bb.0: +; RV32ZFHMIN-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZFHMIN-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZFHMIN-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32ZFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZFHMIN-NEXT: fsrmi a0, 3 +; RV32ZFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZFHMIN-NEXT: fsrm a0 +; RV32ZFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZFHMIN-NEXT: ret +; +; RV64ZFHMIN-LABEL: ceil_nxv8f64: +; RV64ZFHMIN: # %bb.0: +; RV64ZFHMIN-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64ZFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZFHMIN-NEXT: li a0, 1075 +; RV64ZFHMIN-NEXT: slli a0, a0, 52 +; RV64ZFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZFHMIN-NEXT: fsrmi a0, 3 +; RV64ZFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZFHMIN-NEXT: fsrm a0 +; RV64ZFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZFHMIN-NEXT: ret %a = call @llvm.ceil.nxv8f64( %x) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll index d93f15ec44053..7045fc7c50847 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-constrained-sdnode.ll @@ -1,18 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s define @floor_nxv1f16( %x) strictfp { ; CHECK-LABEL: floor_nxv1f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -32,10 +33,11 @@ define @floor_nxv2f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -55,10 +57,11 @@ define @floor_nxv4f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t 
+; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -78,10 +81,11 @@ define @floor_nxv8f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -101,10 +105,11 @@ define @floor_nxv16f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v12, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma @@ -124,10 +129,11 @@ define @floor_nxv32f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma @@ -258,92 +264,168 @@ define @floor_nxv16f32( %x) strictfp declare @llvm.experimental.constrained.floor.nxv16f32(, metadata) define @floor_nxv1f64( %x) strictfp { -; CHECK-LABEL: floor_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: floor_nxv1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI11_0) +; RV32-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: fsrmi a0, 2 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: floor_nxv1f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: fsrmi a0, 2 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: 
fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.floor.nxv1f64( %x, metadata !"fpexcept.strict") ret %a } declare @llvm.experimental.constrained.floor.nxv1f64(, metadata) define @floor_nxv2f64( %x) strictfp { -; CHECK-LABEL: floor_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: floor_nxv2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI12_0) +; RV32-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v10, v8 +; RV32-NEXT: vmflt.vf v0, v10, fa5 +; RV32-NEXT: fsrmi a0, 2 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: floor_nxv2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v10, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v10, fa5 +; RV64-NEXT: fsrmi a0, 2 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.floor.nxv2f64( %x, metadata !"fpexcept.strict") ret %a } declare @llvm.experimental.constrained.floor.nxv2f64(, metadata) define @floor_nxv4f64( %x) strictfp { -; CHECK-LABEL: floor_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: floor_nxv4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v12, v8 +; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: fsrmi a0, 2 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, 
ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: floor_nxv4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: fsrmi a0, 2 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.floor.nxv4f64( %x, metadata !"fpexcept.strict") ret %a } declare @llvm.experimental.constrained.floor.nxv4f64(, metadata) define @floor_nxv8f64( %x) strictfp { -; CHECK-LABEL: floor_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: floor_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI14_0) +; RV32-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v16, v8 +; RV32-NEXT: vmflt.vf v0, v16, fa5 +; RV32-NEXT: fsrmi a0, 2 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: floor_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v16, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v16, fa5 +; RV64-NEXT: fsrmi a0, 2 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.floor.nxv8f64( %x, metadata !"fpexcept.strict") ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll index 010d7786c8891..9adbca55bcd01 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ffloor-sdnode.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV32ZFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH 
+; RUN: --check-prefixes=CHECK,ZVFH,RV64ZFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV32ZFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV64ZFHMIN define @floor_nxv1bf16( %x) { ; CHECK-LABEL: floor_nxv1bf16: @@ -173,10 +173,11 @@ declare @llvm.floor.nxv32bf16() define @floor_nxv1f16( %x) { ; ZVFH-LABEL: floor_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -212,10 +213,11 @@ declare @llvm.floor.nxv1f16() define @floor_nxv2f16( %x) { ; ZVFH-LABEL: floor_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -251,10 +253,11 @@ declare @llvm.floor.nxv2f16() define @floor_nxv4f16( %x) { ; ZVFH-LABEL: floor_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -290,10 +293,11 @@ declare @llvm.floor.nxv4f16() define @floor_nxv8f16( %x) { ; ZVFH-LABEL: floor_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -329,10 +333,11 @@ declare @llvm.floor.nxv8f16() define @floor_nxv16f16( %x) { ; ZVFH-LABEL: floor_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -368,10 +373,11 @@ declare @llvm.floor.nxv16f16() define @floor_nxv32f16( %x) { ; ZVFH-LABEL: floor_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -519,80 +525,268 @@ define @floor_nxv16f32( %x) { declare @llvm.floor.nxv16f32() define @floor_nxv1f64( %x) { -; CHECK-LABEL: floor_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 
%hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: floor_nxv1f64: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a0, %hi(.LCPI17_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI17_0)(a0) +; RV32ZFH-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32ZFH-NEXT: vfabs.v v9, v8 +; RV32ZFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZFH-NEXT: fsrmi a0, 2 +; RV32ZFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: floor_nxv1f64: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64ZFH-NEXT: vfabs.v v9, v8 +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZFH-NEXT: fsrmi a0, 2 +; RV64ZFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZFH-NEXT: ret +; +; RV32ZFHMIN-LABEL: floor_nxv1f64: +; RV32ZFHMIN: # %bb.0: +; RV32ZFHMIN-NEXT: lui a0, %hi(.LCPI17_0) +; RV32ZFHMIN-NEXT: fld fa5, %lo(.LCPI17_0)(a0) +; RV32ZFHMIN-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32ZFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZFHMIN-NEXT: fsrmi a0, 2 +; RV32ZFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZFHMIN-NEXT: fsrm a0 +; RV32ZFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZFHMIN-NEXT: ret +; +; RV64ZFHMIN-LABEL: floor_nxv1f64: +; RV64ZFHMIN: # %bb.0: +; RV64ZFHMIN-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64ZFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZFHMIN-NEXT: li a0, 1075 +; RV64ZFHMIN-NEXT: slli a0, a0, 52 +; RV64ZFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZFHMIN-NEXT: fsrmi a0, 2 +; RV64ZFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZFHMIN-NEXT: fsrm a0 +; RV64ZFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZFHMIN-NEXT: ret %a = call @llvm.floor.nxv1f64( %x) ret %a } declare @llvm.floor.nxv1f64() define @floor_nxv2f64( %x) { -; CHECK-LABEL: floor_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: floor_nxv2f64: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZFH-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32ZFH-NEXT: vfabs.v v10, v8 +; RV32ZFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZFH-NEXT: fsrmi a0, 2 +; RV32ZFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; 
RV32ZFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: floor_nxv2f64: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64ZFH-NEXT: vfabs.v v10, v8 +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZFH-NEXT: fsrmi a0, 2 +; RV64ZFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZFH-NEXT: ret +; +; RV32ZFHMIN-LABEL: floor_nxv2f64: +; RV32ZFHMIN: # %bb.0: +; RV32ZFHMIN-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZFHMIN-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZFHMIN-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32ZFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZFHMIN-NEXT: fsrmi a0, 2 +; RV32ZFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZFHMIN-NEXT: fsrm a0 +; RV32ZFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZFHMIN-NEXT: ret +; +; RV64ZFHMIN-LABEL: floor_nxv2f64: +; RV64ZFHMIN: # %bb.0: +; RV64ZFHMIN-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64ZFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZFHMIN-NEXT: li a0, 1075 +; RV64ZFHMIN-NEXT: slli a0, a0, 52 +; RV64ZFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZFHMIN-NEXT: fsrmi a0, 2 +; RV64ZFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZFHMIN-NEXT: fsrm a0 +; RV64ZFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZFHMIN-NEXT: ret %a = call @llvm.floor.nxv2f64( %x) ret %a } declare @llvm.floor.nxv2f64() define @floor_nxv4f64( %x) { -; CHECK-LABEL: floor_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: floor_nxv4f64: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a0, %hi(.LCPI19_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI19_0)(a0) +; RV32ZFH-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32ZFH-NEXT: vfabs.v v12, v8 +; RV32ZFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZFH-NEXT: fsrmi a0, 2 +; RV32ZFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: floor_nxv4f64: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64ZFH-NEXT: vfabs.v v12, v8 +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZFH-NEXT: fsrmi a0, 2 +; RV64ZFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZFH-NEXT: ret +; +; RV32ZFHMIN-LABEL: floor_nxv4f64: +; RV32ZFHMIN: # 
%bb.0: +; RV32ZFHMIN-NEXT: lui a0, %hi(.LCPI19_0) +; RV32ZFHMIN-NEXT: fld fa5, %lo(.LCPI19_0)(a0) +; RV32ZFHMIN-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32ZFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZFHMIN-NEXT: fsrmi a0, 2 +; RV32ZFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZFHMIN-NEXT: fsrm a0 +; RV32ZFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZFHMIN-NEXT: ret +; +; RV64ZFHMIN-LABEL: floor_nxv4f64: +; RV64ZFHMIN: # %bb.0: +; RV64ZFHMIN-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64ZFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZFHMIN-NEXT: li a0, 1075 +; RV64ZFHMIN-NEXT: slli a0, a0, 52 +; RV64ZFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZFHMIN-NEXT: fsrmi a0, 2 +; RV64ZFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZFHMIN-NEXT: fsrm a0 +; RV64ZFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZFHMIN-NEXT: ret %a = call @llvm.floor.nxv4f64( %x) ret %a } declare @llvm.floor.nxv4f64() define @floor_nxv8f64( %x) { -; CHECK-LABEL: floor_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZFH-LABEL: floor_nxv8f64: +; RV32ZFH: # %bb.0: +; RV32ZFH-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZFH-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZFH-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32ZFH-NEXT: vfabs.v v16, v8 +; RV32ZFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZFH-NEXT: fsrmi a0, 2 +; RV32ZFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZFH-NEXT: fsrm a0 +; RV32ZFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZFH-NEXT: ret +; +; RV64ZFH-LABEL: floor_nxv8f64: +; RV64ZFH: # %bb.0: +; RV64ZFH-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64ZFH-NEXT: vfabs.v v16, v8 +; RV64ZFH-NEXT: li a0, 1075 +; RV64ZFH-NEXT: slli a0, a0, 52 +; RV64ZFH-NEXT: fmv.d.x fa5, a0 +; RV64ZFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZFH-NEXT: fsrmi a0, 2 +; RV64ZFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZFH-NEXT: fsrm a0 +; RV64ZFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZFH-NEXT: ret +; +; RV32ZFHMIN-LABEL: floor_nxv8f64: +; RV32ZFHMIN: # %bb.0: +; RV32ZFHMIN-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZFHMIN-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZFHMIN-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32ZFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZFHMIN-NEXT: fsrmi a0, 2 +; RV32ZFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZFHMIN-NEXT: fsrm a0 +; RV32ZFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZFHMIN-NEXT: ret +; +; RV64ZFHMIN-LABEL: floor_nxv8f64: +; RV64ZFHMIN: # %bb.0: +; RV64ZFHMIN-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64ZFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZFHMIN-NEXT: li a0, 1075 +; RV64ZFHMIN-NEXT: slli a0, a0, 52 +; RV64ZFHMIN-NEXT: 
fmv.d.x fa5, a0 +; RV64ZFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZFHMIN-NEXT: fsrmi a0, 2 +; RV64ZFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZFHMIN-NEXT: fsrm a0 +; RV64ZFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZFHMIN-NEXT: ret %a = call @llvm.floor.nxv8f64( %x) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll index c6ff39ad10d6b..4b42c517379ad 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ceil-vp.ll @@ -1,22 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN declare <2 x half> @llvm.vp.ceil.v2f16(<2 x half>, <2 x i1>, i32) define <2 x half> @vp_ceil_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_v2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 3 @@ -59,10 +60,11 @@ define <2 x half> @vp_ceil_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) define <2 x half> @vp_ceil_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_v2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -99,10 +101,11 @@ declare <4 x half> @llvm.vp.ceil.v4f16(<4 x half>, <4 x i1>, i32) define <4 x half> @vp_ceil_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_v4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 3 @@ -145,10 +148,11 @@ define <4 x half> @vp_ceil_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) define <4 x half> @vp_ceil_v4f16_unmasked(<4 x 
half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_v4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -185,10 +189,11 @@ declare <8 x half> @llvm.vp.ceil.v8f16(<8 x half>, <8 x i1>, i32) define <8 x half> @vp_ceil_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_v8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 3 @@ -231,10 +236,11 @@ define <8 x half> @vp_ceil_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) define <8 x half> @vp_ceil_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_v8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -273,9 +279,10 @@ define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vmv1r.v v10, v0 -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vfabs.v v12, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 3 @@ -319,10 +326,11 @@ define <16 x half> @vp_ceil_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e define <16 x half> @vp_ceil_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_ceil_v16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 3 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -529,41 +537,141 @@ define <16 x float> @vp_ceil_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl) declare <2 x double> @llvm.vp.ceil.v2f64(<2 x double>, <2 x i1>, i32) define <2 x double> @vp_ceil_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI16_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_ceil_v2f64: +; RV32ZVFH: # 
%bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI16_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI16_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 3 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_ceil_v2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 3 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_ceil_v2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI16_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI16_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 3 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_ceil_v2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 3 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <2 x double> @llvm.vp.ceil.v2f64(<2 x double> %va, <2 x i1> %m, i32 %evl) ret <2 x double> %v } define <2 x double> @vp_ceil_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_v2f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_ceil_v2f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI17_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI17_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, 
e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 3 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_ceil_v2f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 3 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_ceil_v2f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI17_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI17_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 3 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_ceil_v2f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 3 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <2 x double> @llvm.vp.ceil.v2f64(<2 x double> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x double> %v } @@ -571,43 +679,149 @@ define <2 x double> @vp_ceil_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) declare <4 x double> @llvm.vp.ceil.v4f64(<4 x double>, <4 x i1>, i32) define <4 x double> @vp_ceil_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_ceil_v4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v10, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 3 +; RV32ZVFH-NEXT: 
vmv1r.v v0, v10 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_ceil_v4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v10, v0 +; RV64ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 3 +; RV64ZVFH-NEXT: vmv1r.v v0, v10 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_ceil_v4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 3 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_ceil_v4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 3 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <4 x double> @llvm.vp.ceil.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl) ret <4 x double> %v } define <4 x double> @vp_ceil_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_v4f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_ceil_v4f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI19_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI19_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 
+; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 3 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_ceil_v4f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 3 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_ceil_v4f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI19_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI19_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 3 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_ceil_v4f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 3 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <4 x double> @llvm.vp.ceil.v4f64(<4 x double> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x double> %v } @@ -615,43 +829,149 @@ define <4 x double> @vp_ceil_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) declare <8 x double> @llvm.vp.ceil.v8f64(<8 x double>, <8 x i1>, i32) define <8 x double> @vp_ceil_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_ceil_v8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v12, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 3 +; RV32ZVFH-NEXT: vmv1r.v v0, v12 +; 
RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_ceil_v8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v12, v0 +; RV64ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 3 +; RV64ZVFH-NEXT: vmv1r.v v0, v12 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_ceil_v8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 3 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_ceil_v8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 3 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <8 x double> @llvm.vp.ceil.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl) ret <8 x double> %v } define <8 x double> @vp_ceil_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_v8f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI21_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_ceil_v8f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI21_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI21_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: 
vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 3 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_ceil_v8f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 3 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_ceil_v8f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI21_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI21_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 3 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_ceil_v8f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 3 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <8 x double> @llvm.vp.ceil.v8f64(<8 x double> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x double> %v } @@ -659,43 +979,149 @@ define <8 x double> @vp_ceil_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) declare <15 x double> @llvm.vp.ceil.v15f64(<15 x double>, <15 x i1>, i32) define <15 x double> @vp_ceil_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_v15f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI22_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_ceil_v15f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI22_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI22_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 3 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: 
vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_ceil_v15f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 3 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_ceil_v15f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI22_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI22_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 3 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_ceil_v15f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 3 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <15 x double> @llvm.vp.ceil.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl) ret <15 x double> %v } define <15 x double> @vp_ceil_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_v15f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_ceil_v15f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI23_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: 
vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 3 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_ceil_v15f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 3 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_ceil_v15f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI23_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 3 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_ceil_v15f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 3 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <15 x double> @llvm.vp.ceil.v15f64(<15 x double> %va, <15 x i1> splat (i1 true), i32 %evl) ret <15 x double> %v } @@ -703,43 +1129,149 @@ define <15 x double> @vp_ceil_v15f64_unmasked(<15 x double> %va, i32 zeroext %ev declare <16 x double> @llvm.vp.ceil.v16f64(<16 x double>, <16 x i1>, i32) define <16 x double> @vp_ceil_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_v16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI24_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_ceil_v16f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI24_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI24_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 3 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; 
RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_ceil_v16f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 3 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_ceil_v16f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI24_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI24_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 3 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_ceil_v16f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 3 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <16 x double> @llvm.vp.ceil.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl) ret <16 x double> %v } define <16 x double> @vp_ceil_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_v16f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_ceil_v16f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI25_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; 
RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 3 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_ceil_v16f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 3 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_ceil_v16f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI25_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 3 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_ceil_v16f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 3 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <16 x double> @llvm.vp.ceil.v16f64(<16 x double> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x double> %v } @@ -747,91 +1279,341 @@ define <16 x double> @vp_ceil_v16f64_unmasked(<16 x double> %va, i32 zeroext %ev declare <32 x double> @llvm.vp.ceil.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_ceil_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_v32f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v6, v0 -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB26_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: lui a1, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a1, 3 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: 
vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_ceil_v32f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v6, v0 +; RV32ZVFH-NEXT: li a2, 16 +; RV32ZVFH-NEXT: vslidedown.vi v7, v0, 2 +; RV32ZVFH-NEXT: mv a1, a0 +; RV32ZVFH-NEXT: bltu a0, a2, .LBB26_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: li a1, 16 +; RV32ZVFH-NEXT: .LBB26_2: +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI26_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; RV32ZVFH-NEXT: addi a1, a0, -16 +; RV32ZVFH-NEXT: sltu a0, a0, a1 +; RV32ZVFH-NEXT: addi a0, a0, -1 +; RV32ZVFH-NEXT: and a0, a0, a1 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a1, 3 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a1 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 3 +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_ceil_v32f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v6, v0 +; RV64ZVFH-NEXT: li a2, 16 +; RV64ZVFH-NEXT: vslidedown.vi v7, v0, 2 +; RV64ZVFH-NEXT: mv a1, a0 +; RV64ZVFH-NEXT: bltu a0, a2, .LBB26_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: li a1, 16 +; RV64ZVFH-NEXT: .LBB26_2: +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a1, 1075 +; RV64ZVFH-NEXT: slli a1, a1, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a1 +; RV64ZVFH-NEXT: addi a1, a0, -16 +; RV64ZVFH-NEXT: sltu a0, a0, a1 +; RV64ZVFH-NEXT: addi a0, a0, -1 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFH-NEXT: and a0, a0, a1 +; RV64ZVFH-NEXT: fsrmi a1, 3 +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a1 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, 
ta, mu +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 3 +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_ceil_v32f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v6, v0 +; RV32ZVFHMIN-NEXT: li a2, 16 +; RV32ZVFHMIN-NEXT: vslidedown.vi v7, v0, 2 +; RV32ZVFHMIN-NEXT: mv a1, a0 +; RV32ZVFHMIN-NEXT: bltu a0, a2, .LBB26_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: li a1, 16 +; RV32ZVFHMIN-NEXT: .LBB26_2: +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI26_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; RV32ZVFHMIN-NEXT: addi a1, a0, -16 +; RV32ZVFHMIN-NEXT: sltu a0, a0, a1 +; RV32ZVFHMIN-NEXT: addi a0, a0, -1 +; RV32ZVFHMIN-NEXT: and a0, a0, a1 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a1, 3 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a1 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 3 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_ceil_v32f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v6, v0 +; RV64ZVFHMIN-NEXT: li a2, 16 +; RV64ZVFHMIN-NEXT: vslidedown.vi v7, v0, 2 +; RV64ZVFHMIN-NEXT: mv a1, a0 +; RV64ZVFHMIN-NEXT: bltu a0, a2, .LBB26_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: li a1, 16 +; RV64ZVFHMIN-NEXT: .LBB26_2: +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a1, 1075 +; RV64ZVFHMIN-NEXT: slli a1, a1, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1 +; RV64ZVFHMIN-NEXT: addi a1, a0, -16 +; RV64ZVFHMIN-NEXT: sltu a0, a0, a1 +; RV64ZVFHMIN-NEXT: addi a0, a0, -1 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: and a0, a0, a1 +; RV64ZVFHMIN-NEXT: fsrmi a1, 3 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a1 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, 
v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 3 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <32 x double> @llvm.vp.ceil.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl) ret <32 x double> %v } define <32 x double> @vp_ceil_v32f64_unmasked(<32 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_ceil_v32f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB27_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: lui a2, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) -; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: sltu a0, a0, a2 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a2 -; CHECK-NEXT: fsrmi a2, 3 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v7, v24, fa5 -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsrmi a1, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_ceil_v32f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: li a2, 16 +; RV32ZVFH-NEXT: mv a1, a0 +; RV32ZVFH-NEXT: bltu a0, a2, .LBB27_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: li a1, 16 +; RV32ZVFH-NEXT: .LBB27_2: +; RV32ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8 +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI27_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; RV32ZVFH-NEXT: addi a2, a0, -16 +; RV32ZVFH-NEXT: sltu a0, a0, a2 +; RV32ZVFH-NEXT: addi a0, a0, -1 +; RV32ZVFH-NEXT: and a0, a0, a2 +; RV32ZVFH-NEXT: fsrmi a2, 3 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16 +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5 +; RV32ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a2 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: fsrmi a1, 3 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a1 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_ceil_v32f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: li a2, 16 +; RV64ZVFH-NEXT: mv a1, a0 +; 
RV64ZVFH-NEXT: bltu a0, a2, .LBB27_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: li a1, 16 +; RV64ZVFH-NEXT: .LBB27_2: +; RV64ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8 +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: addi a2, a0, -16 +; RV64ZVFH-NEXT: sltu a0, a0, a2 +; RV64ZVFH-NEXT: addi a0, a0, -1 +; RV64ZVFH-NEXT: and a0, a0, a2 +; RV64ZVFH-NEXT: fsrmi a2, 3 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16 +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5 +; RV64ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a2 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: fsrmi a1, 3 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a1 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_ceil_v32f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: li a2, 16 +; RV32ZVFHMIN-NEXT: mv a1, a0 +; RV32ZVFHMIN-NEXT: bltu a0, a2, .LBB27_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: li a1, 16 +; RV32ZVFHMIN-NEXT: .LBB27_2: +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI27_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; RV32ZVFHMIN-NEXT: addi a2, a0, -16 +; RV32ZVFHMIN-NEXT: sltu a0, a0, a2 +; RV32ZVFHMIN-NEXT: addi a0, a0, -1 +; RV32ZVFHMIN-NEXT: and a0, a0, a2 +; RV32ZVFHMIN-NEXT: fsrmi a2, 3 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV32ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5 +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a2 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a1, 3 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a1 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_ceil_v32f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: li a2, 16 +; RV64ZVFHMIN-NEXT: mv a1, a0 +; RV64ZVFHMIN-NEXT: bltu a0, a2, .LBB27_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: li a1, 16 +; RV64ZVFHMIN-NEXT: .LBB27_2: +; RV64ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: addi a2, a0, -16 +; RV64ZVFHMIN-NEXT: sltu a0, a0, a2 +; RV64ZVFHMIN-NEXT: addi a0, a0, -1 +; RV64ZVFHMIN-NEXT: and a0, a0, a2 +; RV64ZVFHMIN-NEXT: fsrmi a2, 3 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV64ZVFHMIN-NEXT: 
vmflt.vf v7, v24, fa5 +; RV64ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a2 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a1, 3 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a1 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <32 x double> @llvm.vp.ceil.v32f64(<32 x double> %va, <32 x i1> splat (i1 true), i32 %evl) ret <32 x double> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll index 2250ab5bd0bbe..b1af4e685c58f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz-vp.ll @@ -1945,49 +1945,49 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB34_2: ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: li a1, 32 ; RV32-NEXT: lui a3, 349525 -; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, a3, 1365 -; RV32-NEXT: vsrl.vi v24, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a3 +; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t -; RV32-NEXT: vand.vv v24, v24, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: lui a3, 209715 ; RV32-NEXT: addi a3, a3, 819 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: vmv.v.x v16, a3 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; 
RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t ; RV32-NEXT: lui a3, 61681 ; RV32-NEXT: addi a3, a3, -241 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma @@ -2025,12 +2025,12 @@ define <32 x i64> @vp_ctlz_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl ; RV32-NEXT: mul a3, a3, a4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v16, (a3) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: vor.vv v16, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 2, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t @@ -2247,14 +2247,14 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vsrl.vx v8, v16, a2 ; RV32-NEXT: vor.vv v24, v16, v8 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a3 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv.v.x v8, a3 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v0, v16 +; RV32-NEXT: vand.vv v16, v0, v8 ; RV32-NEXT: vsrl.vi v0, v0, 2 -; RV32-NEXT: vand.vv v0, v0, v16 -; RV32-NEXT: vadd.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: vadd.vv v16, v16, v0 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vnot.v v24, v24 ; RV32-NEXT: vsrl.vi v0, v24, 1 @@ -2265,39 +2265,39 @@ define <32 x i64> @vp_ctlz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV32-NEXT: vand.vv v0, v0, v16 ; RV32-NEXT: vsub.vv v24, v24, v0 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 ; RV32-NEXT: addi a2, sp, 16 ; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v24, v16 +; RV32-NEXT: vand.vv v0, v24, v8 ; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vand.vv v8, v24, v8 ; RV32-NEXT: lui a2, 61681 ; RV32-NEXT: lui a3, 4112 ; RV32-NEXT: addi a2, a2, -241 ; RV32-NEXT: addi a3, a3, 257 -; RV32-NEXT: vadd.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v24, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v24 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, 
ma -; RV32-NEXT: vmul.vv v8, v8, v24 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v16, v16, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmul.vv v24, v8, v24 ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vsrl.vx v8, v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v16, v16, a2 +; RV32-NEXT: vsrl.vx v16, v24, a2 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 @@ -4320,49 +4320,49 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: li a2, 16 ; RV32-NEXT: .LBB70_2: ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: li a1, 32 ; RV32-NEXT: lui a3, 349525 -; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: addi a3, a3, 1365 -; RV32-NEXT: vsrl.vi v24, v8, 2, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 8, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 16, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t -; RV32-NEXT: vsrl.vx v24, v8, a1, v0.t -; RV32-NEXT: vor.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 16, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t +; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a3 +; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma ; RV32-NEXT: vnot.v v8, v8, v0.t -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t -; RV32-NEXT: vand.vv v24, v24, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: lui a3, 209715 ; RV32-NEXT: addi a3, a3, 819 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v24, a3 +; RV32-NEXT: vmv.v.x v16, a3 ; RV32-NEXT: csrr a3, vlenb ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v24, (a3) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vand.vv v24, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t -; RV32-NEXT: vand.vv v8, v8, v24, v0.t -; RV32-NEXT: vadd.vv v8, v16, v8, v0.t -; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t -; RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v8, v8, v16, v0.t +; RV32-NEXT: vadd.vv v8, v24, v8, v0.t +; RV32-NEXT: vsrl.vi v24, v8, 4, v0.t +; RV32-NEXT: vadd.vv v8, v8, v24, v0.t ; RV32-NEXT: lui a3, 61681 ; RV32-NEXT: addi a3, a3, -241 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma @@ -4400,12 +4400,12 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64(<32 x i64> %va, <32 x i1> %m, i32 z ; RV32-NEXT: mul a3, a3, a4 
; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vl8r.v v8, (a3) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v16, (a3) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t -; RV32-NEXT: vor.vv v16, v8, v16, v0.t -; RV32-NEXT: vsrl.vi v8, v16, 2, v0.t +; RV32-NEXT: vsrl.vi v8, v16, 1, v0.t ; RV32-NEXT: vor.vv v8, v16, v8, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 2, v0.t +; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vor.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 8, v0.t @@ -4622,14 +4622,14 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: vsrl.vx v8, v16, a2 ; RV32-NEXT: vor.vv v24, v16, v8 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a3 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv.v.x v8, a3 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v0, v16 +; RV32-NEXT: vand.vv v16, v0, v8 ; RV32-NEXT: vsrl.vi v0, v0, 2 -; RV32-NEXT: vand.vv v0, v0, v16 -; RV32-NEXT: vadd.vv v8, v8, v0 +; RV32-NEXT: vand.vv v0, v0, v8 +; RV32-NEXT: vadd.vv v16, v16, v0 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vnot.v v24, v24 ; RV32-NEXT: vsrl.vi v0, v24, 1 @@ -4640,39 +4640,39 @@ define <32 x i64> @vp_ctlz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV32-NEXT: vand.vv v0, v0, v16 ; RV32-NEXT: vsub.vv v24, v24, v0 -; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v0, v8, 4 -; RV32-NEXT: vadd.vv v8, v8, v0 ; RV32-NEXT: addi a2, sp, 16 ; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vsrl.vi v0, v16, 4 +; RV32-NEXT: vadd.vv v16, v16, v0 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vand.vv v0, v24, v16 +; RV32-NEXT: vand.vv v0, v24, v8 ; RV32-NEXT: vsrl.vi v24, v24, 2 -; RV32-NEXT: vand.vv v16, v24, v16 +; RV32-NEXT: vand.vv v8, v24, v8 ; RV32-NEXT: lui a2, 61681 ; RV32-NEXT: lui a3, 4112 ; RV32-NEXT: addi a2, a2, -241 ; RV32-NEXT: addi a3, a3, 257 -; RV32-NEXT: vadd.vv v16, v0, v16 -; RV32-NEXT: vsrl.vi v24, v16, 4 -; RV32-NEXT: vadd.vv v16, v16, v24 +; RV32-NEXT: vadd.vv v8, v0, v8 +; RV32-NEXT: vsrl.vi v24, v8, 4 +; RV32-NEXT: vadd.vv v8, v8, v24 ; RV32-NEXT: vsetvli a4, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a2 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v8, v8, v24 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v16, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vand.vv v8, v8, v24 ; RV32-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a3 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vmul.vv v8, v8, v24 -; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v16, v16, v24 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmul.vv v24, v8, v24 ; RV32-NEXT: li a2, 56 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a2 +; RV32-NEXT: vsrl.vx v8, v16, a2 ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vx v16, v16, a2 +; RV32-NEXT: vsrl.vx v16, v24, a2 ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add sp, sp, a0 diff 
--git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll index 94fecbdfde18e..a993ed909d940 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctpop-vp.ll @@ -1449,27 +1449,24 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: li a1, 16 ; RV32-NEXT: .LBB34_2: ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v24, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t ; RV32-NEXT: lui a2, 349525 ; RV32-NEXT: addi a2, a2, 1365 ; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v16, a2 +; RV32-NEXT: vmv.v.x v24, a2 ; RV32-NEXT: csrr a2, vlenb ; RV32-NEXT: slli a2, a2, 5 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; RV32-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; RV32-NEXT: vand.vv v24, v24, v16, v0.t -; RV32-NEXT: vsub.vv v8, v8, v24, v0.t +; RV32-NEXT: vand.vv v16, v16, v24, v0.t +; RV32-NEXT: vsub.vv v8, v8, v16, v0.t ; RV32-NEXT: lui a2, 209715 ; RV32-NEXT: addi a2, a2, 819 ; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v24, a2 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 4 -; RV32-NEXT: add a2, sp, a2 -; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: addi a2, sp, 16 ; RV32-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v16, v8, v24, v0.t @@ -1494,14 +1491,17 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi a2, a2, 257 ; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 -; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: li a1, 56 ; RV32-NEXT: vsrl.vx v8, v8, a1, v0.t ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: slli a2, a2, 4 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill @@ -1515,25 +1515,22 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload +; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; RV32-NEXT: vsrl.vi v16, v8, 1, v0.t +; RV32-NEXT: vsrl.vi v24, v16, 1, v0.t ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 5 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vand.vv v8, v24, v8, v0.t +; RV32-NEXT: vsub.vv v8, v16, v8, v0.t +; RV32-NEXT: addi a0, sp, 16 ; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; RV32-NEXT: vsub.vv v16, v8, v16, v0.t -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 4 -; RV32-NEXT: add a0, sp, a0 -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; RV32-NEXT: vand.vv v8, v16, v24, v0.t -; RV32-NEXT: vsrl.vi v16, v16, 2, v0.t -; RV32-NEXT: vand.vv v16, v16, v24, v0.t -; 
RV32-NEXT: vadd.vv v8, v8, v16, v0.t +; RV32-NEXT: vand.vv v16, v8, v24, v0.t +; RV32-NEXT: vsrl.vi v8, v8, 2, v0.t +; RV32-NEXT: vand.vv v8, v8, v24, v0.t +; RV32-NEXT: vadd.vv v8, v16, v8, v0.t ; RV32-NEXT: vsrl.vi v16, v8, 4, v0.t ; RV32-NEXT: vadd.vv v8, v8, v16, v0.t ; RV32-NEXT: csrr a0, vlenb @@ -1543,12 +1540,15 @@ define <32 x i64> @vp_ctpop_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %ev ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; RV32-NEXT: vand.vv v8, v8, v16, v0.t -; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; RV32-NEXT: vmul.vv v8, v8, v16, v0.t ; RV32-NEXT: vsrl.vx v16, v8, a1, v0.t ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: slli a0, a0, 4 ; RV32-NEXT: add a0, sp, a0 ; RV32-NEXT: addi a0, a0, 16 ; RV32-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll index bdce00b10e5a7..1922006b8a581 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-vp.ll @@ -1810,10 +1810,10 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: vadd.vi v0, v16, -1 ; RV32-NEXT: vnot.v v16, v16 ; RV32-NEXT: vand.vv v0, v16, v0 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v0, (a3) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v24, v16 ; RV32-NEXT: vsrl.vi v24, v24, 2 @@ -1825,14 +1825,14 @@ define <32 x i64> @vp_cttz_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload -; RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsub.vv v24, v0, v24 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v0, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v0, v24, v16 ; RV32-NEXT: vsrl.vi v24, v24, 2 @@ -3715,10 +3715,10 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: vadd.vi v0, v16, -1 ; RV32-NEXT: vnot.v v16, v16 ; RV32-NEXT: vand.vv v0, v16, v0 +; RV32-NEXT: addi a3, sp, 16 +; RV32-NEXT: vs8r.v v0, (a3) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli a3, zero, e32, m8, ta, ma ; RV32-NEXT: vmv.v.x v16, a2 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vand.vv v8, v24, v16 ; RV32-NEXT: vsrl.vi v24, v24, 2 @@ -3730,14 +3730,14 @@ define <32 x i64> @vp_cttz_zero_undef_v32i64_unmasked(<32 x i64> %va, i32 zeroex ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 -; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload -; 
RV32-NEXT: vand.vv v24, v24, v16 +; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload +; RV32-NEXT: vand.vv v24, v24, v0 +; RV32-NEXT: addi a2, sp, 16 +; RV32-NEXT: vl8r.v v0, (a2) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsub.vv v24, v0, v24 ; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma ; RV32-NEXT: vsrl.vi v0, v8, 4 ; RV32-NEXT: vadd.vv v8, v8, v0 -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma ; RV32-NEXT: vand.vv v0, v24, v16 ; RV32-NEXT: vsrl.vi v24, v24, 2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll index ab2d00b9b9137..71b0624d91f22 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fceil-constrained-sdnode.ll @@ -1,18 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s define <1 x half> @ceil_v1f16(<1 x half> %x) strictfp { ; CHECK-LABEL: ceil_v1f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -32,10 +33,11 @@ define <2 x half> @ceil_v2f16(<2 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -55,10 +57,11 @@ define <4 x half> @ceil_v4f16(<4 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -78,10 +81,11 @@ define <8 x half> @ceil_v8f16(<8 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -101,10 +105,11 @@ define <16 x half> 
@ceil_v16f16(<16 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -123,11 +128,12 @@ define <32 x half> @ceil_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: ceil_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: lui a1, %hi(.LCPI5_0) +; CHECK-NEXT: li a1, 25 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) +; CHECK-NEXT: slli a1, a1, 10 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fmv.h.x fa5, a1 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 3 @@ -259,92 +265,168 @@ define <16 x float> @ceil_v16f32(<16 x float> %x) strictfp { declare <16 x float> @llvm.experimental.constrained.ceil.v16f32(<16 x float>, metadata) define <1 x double> @ceil_v1f64(<1 x double> %x) strictfp { -; CHECK-LABEL: ceil_v1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: ceil_v1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI11_0) +; RV32-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: ceil_v1f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %a = call <1 x double> @llvm.experimental.constrained.ceil.v1f64(<1 x double> %x, metadata !"fpexcept.strict") ret <1 x double> %a } declare <1 x double> @llvm.experimental.constrained.ceil.v1f64(<1 x double>, metadata) define <2 x double> @ceil_v2f64(<2 x double> %x) strictfp { -; CHECK-LABEL: ceil_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, 
%lo(.LCPI12_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: ceil_v2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI12_0) +; RV32-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: ceil_v2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %a = call <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double> %x, metadata !"fpexcept.strict") ret <2 x double> %a } declare <2 x double> @llvm.experimental.constrained.ceil.v2f64(<2 x double>, metadata) define <4 x double> @ceil_v4f64(<4 x double> %x) strictfp { -; CHECK-LABEL: ceil_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: ceil_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v10, v8 +; RV32-NEXT: vmflt.vf v0, v10, fa5 +; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: ceil_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v10, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v10, fa5 +; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vfcvt.x.f.v v10, v8, 
v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %a = call <4 x double> @llvm.experimental.constrained.ceil.v4f64(<4 x double> %x, metadata !"fpexcept.strict") ret <4 x double> %a } declare <4 x double> @llvm.experimental.constrained.ceil.v4f64(<4 x double>, metadata) define <8 x double> @ceil_v8f64(<8 x double> %x) strictfp { -; CHECK-LABEL: ceil_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 3 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: ceil_v8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI14_0) +; RV32-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v12, v8 +; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: fsrmi a0, 3 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: ceil_v8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: fsrmi a0, 3 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: ret %a = call <8 x double> @llvm.experimental.constrained.ceil.v8f64(<8 x double> %x, metadata !"fpexcept.strict") ret <8 x double> %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll index c6ce7c1bbe8b4..9eca66eea865c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ffloor-constrained-sdnode.ll @@ -1,18 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s define <1 x half> @floor_v1f16(<1 x half> %x) strictfp { ; CHECK-LABEL: floor_v1f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: li a0, 25 ; 
CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -32,10 +33,11 @@ define <2 x half> @floor_v2f16(<2 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -55,10 +57,11 @@ define <4 x half> @floor_v4f16(<4 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -78,10 +81,11 @@ define <8 x half> @floor_v8f16(<8 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -101,10 +105,11 @@ define <16 x half> @floor_v16f16(<16 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 2 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -123,11 +128,12 @@ define <32 x half> @floor_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: floor_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: lui a1, %hi(.LCPI5_0) +; CHECK-NEXT: li a1, 25 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) +; CHECK-NEXT: slli a1, a1, 10 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fmv.h.x fa5, a1 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 2 @@ -259,92 +265,168 @@ define <16 x float> @floor_v16f32(<16 x float> %x) strictfp { declare <16 x float> @llvm.experimental.constrained.floor.v16f32(<16 x float>, metadata) define <1 x double> @floor_v1f64(<1 x double> %x) strictfp { -; CHECK-LABEL: floor_v1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; 
CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: floor_v1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI11_0) +; RV32-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: fsrmi a0, 2 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: floor_v1f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: fsrmi a0, 2 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %a = call <1 x double> @llvm.experimental.constrained.floor.v1f64(<1 x double> %x, metadata !"fpexcept.strict") ret <1 x double> %a } declare <1 x double> @llvm.experimental.constrained.floor.v1f64(<1 x double>, metadata) define <2 x double> @floor_v2f64(<2 x double> %x) strictfp { -; CHECK-LABEL: floor_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: floor_v2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI12_0) +; RV32-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: fsrmi a0, 2 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: floor_v2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: fsrmi a0, 2 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %a = call <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double> %x, 
metadata !"fpexcept.strict") ret <2 x double> %a } declare <2 x double> @llvm.experimental.constrained.floor.v2f64(<2 x double>, metadata) define <4 x double> @floor_v4f64(<4 x double> %x) strictfp { -; CHECK-LABEL: floor_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: floor_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v10, v8 +; RV32-NEXT: vmflt.vf v0, v10, fa5 +; RV32-NEXT: fsrmi a0, 2 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: floor_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v10, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v10, fa5 +; RV64-NEXT: fsrmi a0, 2 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %a = call <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double> %x, metadata !"fpexcept.strict") ret <4 x double> %a } declare <4 x double> @llvm.experimental.constrained.floor.v4f64(<4 x double>, metadata) define <8 x double> @floor_v8f64(<8 x double> %x) strictfp { -; CHECK-LABEL: floor_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: floor_v8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI14_0) +; RV32-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v12, v8 +; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: fsrmi a0, 2 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: 
floor_v8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: fsrmi a0, 2 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: ret %a = call <8 x double> @llvm.experimental.constrained.floor.v8f64(<8 x double> %x, metadata !"fpexcept.strict") ret <8 x double> %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll index 6fc0165d7e77f..4494b97119403 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-floor-vp.ll @@ -1,22 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN declare <2 x half> @llvm.vp.floor.v2f16(<2 x half>, <2 x i1>, i32) define <2 x half> @vp_floor_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_v2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 2 @@ -59,10 +60,11 @@ define <2 x half> @vp_floor_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) define <2 x half> @vp_floor_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_v2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -99,10 +101,11 @@ declare <4 x half> @llvm.vp.floor.v4f16(<4 x half>, <4 x i1>, i32) define <4 x half> @vp_floor_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_v4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v 
v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 2 @@ -145,10 +148,11 @@ define <4 x half> @vp_floor_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) define <4 x half> @vp_floor_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_v4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -185,10 +189,11 @@ declare <8 x half> @llvm.vp.floor.v8f16(<8 x half>, <8 x i1>, i32) define <8 x half> @vp_floor_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_v8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 2 @@ -231,10 +236,11 @@ define <8 x half> @vp_floor_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) define <8 x half> @vp_floor_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_v8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -273,9 +279,10 @@ define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vmv1r.v v10, v0 -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vfabs.v v12, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 2 @@ -319,10 +326,11 @@ define <16 x half> @vp_floor_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % define <16 x half> @vp_floor_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_v16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -529,41 +537,141 @@ define <16 x float> @vp_floor_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl declare <2 x double> @llvm.vp.floor.v2f64(<2 x double>, <2 x i1>, i32) define <2 x double> @vp_floor_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI16_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; 
CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_v2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI16_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI16_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_v2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_v2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI16_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI16_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_v2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <2 x double> @llvm.vp.floor.v2f64(<2 x double> %va, <2 x i1> %m, i32 %evl) ret <2 x double> %v } define <2 x double> @vp_floor_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_v2f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: 
vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_v2f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI17_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI17_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_v2f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_v2f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI17_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI17_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_v2f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <2 x double> @llvm.vp.floor.v2f64(<2 x double> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x double> %v } @@ -571,43 +679,149 @@ define <2 x double> @vp_floor_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) declare <4 x double> @llvm.vp.floor.v4f64(<4 x double>, <4 x i1>, i32) define <4 x double> @vp_floor_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: 
vp_floor_v4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v10, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vmv1r.v v0, v10 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_v4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v10, v0 +; RV64ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vmv1r.v v0, v10 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_v4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_v4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <4 x double> @llvm.vp.floor.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl) ret <4 x double> %v } define <4 x double> @vp_floor_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_v4f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; 
CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_v4f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI19_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI19_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_v4f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_v4f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI19_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI19_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_v4f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <4 x double> @llvm.vp.floor.v4f64(<4 x double> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x double> %v } @@ -615,43 +829,149 @@ define <4 x double> @vp_floor_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) declare <8 x double> @llvm.vp.floor.v8f64(<8 x double>, <8 x i1>, i32) define <8 x double> @vp_floor_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: 
vp_floor_v8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v12, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vmv1r.v v0, v12 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_v8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v12, v0 +; RV64ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vmv1r.v v0, v12 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_v8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_v8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <8 x double> @llvm.vp.floor.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl) ret <8 x double> %v } define <8 x double> @vp_floor_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_v8f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI21_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; 
CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_v8f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI21_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI21_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_v8f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_v8f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI21_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI21_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_v8f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <8 x double> @llvm.vp.floor.v8f64(<8 x double> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x double> %v } @@ -659,43 +979,149 @@ define <8 x double> @vp_floor_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) declare <15 x double> @llvm.vp.floor.v15f64(<15 x double>, <15 x i1>, i32) define <15 x double> @vp_floor_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_v15f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI22_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; 
RV32ZVFH-LABEL: vp_floor_v15f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI22_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI22_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_v15f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_v15f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI22_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI22_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_v15f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <15 x double> @llvm.vp.floor.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl) ret <15 x double> %v } define <15 x double> @vp_floor_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_v15f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: 
vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_v15f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI23_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_v15f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_v15f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI23_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_v15f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <15 x double> @llvm.vp.floor.v15f64(<15 x double> %va, <15 x i1> splat (i1 true), i32 %evl) ret <15 x double> %v } @@ -703,43 +1129,149 @@ define <15 x double> @vp_floor_v15f64_unmasked(<15 x double> %va, i32 zeroext %e declare <16 x double> @llvm.vp.floor.v16f64(<16 x double>, <16 x i1>, i32) define <16 x double> @vp_floor_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_v16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI24_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, 
v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_v16f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI24_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI24_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_v16f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_v16f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI24_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI24_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_v16f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <16 x double> @llvm.vp.floor.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl) ret <16 x double> %v } define <16 x double> @vp_floor_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_v16f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi 
a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_v16f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI25_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_v16f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_v16f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI25_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_v16f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <16 x double> @llvm.vp.floor.v16f64(<16 x double> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x double> %v } @@ -747,91 +1279,341 @@ define <16 x double> @vp_floor_v16f64_unmasked(<16 x double> %va, i32 zeroext %e declare <32 x double> @llvm.vp.floor.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_floor_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_v32f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v6, v0 -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB26_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: lui a1, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; 
CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a1, 2 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_v32f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v6, v0 +; RV32ZVFH-NEXT: li a2, 16 +; RV32ZVFH-NEXT: vslidedown.vi v7, v0, 2 +; RV32ZVFH-NEXT: mv a1, a0 +; RV32ZVFH-NEXT: bltu a0, a2, .LBB26_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: li a1, 16 +; RV32ZVFH-NEXT: .LBB26_2: +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI26_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; RV32ZVFH-NEXT: addi a1, a0, -16 +; RV32ZVFH-NEXT: sltu a0, a0, a1 +; RV32ZVFH-NEXT: addi a0, a0, -1 +; RV32ZVFH-NEXT: and a0, a0, a1 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a1, 2 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a1 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_v32f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v6, v0 +; RV64ZVFH-NEXT: li a2, 16 +; RV64ZVFH-NEXT: vslidedown.vi v7, v0, 2 +; RV64ZVFH-NEXT: mv a1, a0 +; RV64ZVFH-NEXT: bltu a0, a2, .LBB26_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: li a1, 16 +; RV64ZVFH-NEXT: .LBB26_2: +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a1, 1075 +; RV64ZVFH-NEXT: slli a1, a1, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a1 +; RV64ZVFH-NEXT: addi a1, a0, -16 +; RV64ZVFH-NEXT: sltu a0, a0, a1 +; RV64ZVFH-NEXT: addi a0, a0, -1 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFH-NEXT: and a0, a0, a1 +; RV64ZVFH-NEXT: fsrmi a1, 2 
+; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a1 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_v32f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v6, v0 +; RV32ZVFHMIN-NEXT: li a2, 16 +; RV32ZVFHMIN-NEXT: vslidedown.vi v7, v0, 2 +; RV32ZVFHMIN-NEXT: mv a1, a0 +; RV32ZVFHMIN-NEXT: bltu a0, a2, .LBB26_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: li a1, 16 +; RV32ZVFHMIN-NEXT: .LBB26_2: +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI26_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; RV32ZVFHMIN-NEXT: addi a1, a0, -16 +; RV32ZVFHMIN-NEXT: sltu a0, a0, a1 +; RV32ZVFHMIN-NEXT: addi a0, a0, -1 +; RV32ZVFHMIN-NEXT: and a0, a0, a1 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a1, 2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a1 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_v32f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v6, v0 +; RV64ZVFHMIN-NEXT: li a2, 16 +; RV64ZVFHMIN-NEXT: vslidedown.vi v7, v0, 2 +; RV64ZVFHMIN-NEXT: mv a1, a0 +; RV64ZVFHMIN-NEXT: bltu a0, a2, .LBB26_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: li a1, 16 +; RV64ZVFHMIN-NEXT: .LBB26_2: +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a1, 1075 +; RV64ZVFHMIN-NEXT: slli a1, a1, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1 +; RV64ZVFHMIN-NEXT: addi a1, a0, -16 +; RV64ZVFHMIN-NEXT: sltu a0, a0, a1 +; RV64ZVFHMIN-NEXT: addi a0, a0, -1 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, 
mu +; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: and a0, a0, a1 +; RV64ZVFHMIN-NEXT: fsrmi a1, 2 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a1 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <32 x double> @llvm.vp.floor.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl) ret <32 x double> %v } define <32 x double> @vp_floor_v32f64_unmasked(<32 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_v32f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB27_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: lui a2, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) -; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: sltu a0, a0, a2 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a2 -; CHECK-NEXT: fsrmi a2, 2 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v7, v24, fa5 -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsrmi a1, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_v32f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: li a2, 16 +; RV32ZVFH-NEXT: mv a1, a0 +; RV32ZVFH-NEXT: bltu a0, a2, .LBB27_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: li a1, 16 +; RV32ZVFH-NEXT: .LBB27_2: +; RV32ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8 +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI27_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; RV32ZVFH-NEXT: addi a2, a0, -16 +; RV32ZVFH-NEXT: sltu a0, a0, a2 +; RV32ZVFH-NEXT: addi a0, a0, -1 +; RV32ZVFH-NEXT: and a0, a0, a2 +; RV32ZVFH-NEXT: fsrmi a2, 2 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16 +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5 +; RV32ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a2 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: fsrmi a1, 2 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: 
vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a1 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_v32f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: li a2, 16 +; RV64ZVFH-NEXT: mv a1, a0 +; RV64ZVFH-NEXT: bltu a0, a2, .LBB27_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: li a1, 16 +; RV64ZVFH-NEXT: .LBB27_2: +; RV64ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8 +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: addi a2, a0, -16 +; RV64ZVFH-NEXT: sltu a0, a0, a2 +; RV64ZVFH-NEXT: addi a0, a0, -1 +; RV64ZVFH-NEXT: and a0, a0, a2 +; RV64ZVFH-NEXT: fsrmi a2, 2 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16 +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5 +; RV64ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a2 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: fsrmi a1, 2 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a1 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_v32f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: li a2, 16 +; RV32ZVFHMIN-NEXT: mv a1, a0 +; RV32ZVFHMIN-NEXT: bltu a0, a2, .LBB27_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: li a1, 16 +; RV32ZVFHMIN-NEXT: .LBB27_2: +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI27_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; RV32ZVFHMIN-NEXT: addi a2, a0, -16 +; RV32ZVFHMIN-NEXT: sltu a0, a0, a2 +; RV32ZVFHMIN-NEXT: addi a0, a0, -1 +; RV32ZVFHMIN-NEXT: and a0, a0, a2 +; RV32ZVFHMIN-NEXT: fsrmi a2, 2 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV32ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5 +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a2 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a1, 2 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a1 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_v32f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: li a2, 16 +; RV64ZVFHMIN-NEXT: mv a1, a0 +; RV64ZVFHMIN-NEXT: bltu a0, a2, .LBB27_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: li a1, 16 +; RV64ZVFHMIN-NEXT: .LBB27_2: +; RV64ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma 
+; RV64ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: addi a2, a0, -16 +; RV64ZVFHMIN-NEXT: sltu a0, a0, a2 +; RV64ZVFHMIN-NEXT: addi a0, a0, -1 +; RV64ZVFHMIN-NEXT: and a0, a0, a2 +; RV64ZVFHMIN-NEXT: fsrmi a2, 2 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV64ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5 +; RV64ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a2 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a1, 2 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a1 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <32 x double> @llvm.vp.floor.v32f64(<32 x double> %va, <32 x i1> splat (i1 true), i32 %evl) ret <32 x double> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll index 3a7ded1537ef6..dd1b99bee6d55 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fnearbyint-constrained-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s declare <2 x half> @llvm.experimental.constrained.nearbyint.v2f16(<2 x half>, metadata, metadata) @@ -11,10 +11,11 @@ define <2 x half> @nearbyint_v2f16(<2 x half> %v) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -35,10 +36,11 @@ define <4 x half> @nearbyint_v4f16(<4 x half> %v) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -59,10 +61,11 @@ define <8 x half> @nearbyint_v8f16(<8 x half> %v) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, 
%lo(.LCPI2_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -83,10 +86,11 @@ define <16 x half> @nearbyint_v16f16(<16 x half> %v) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -106,11 +110,12 @@ define <32 x half> @nearbyint_v32f16(<32 x half> %v) strictfp { ; CHECK-LABEL: nearbyint_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: lui a1, %hi(.LCPI4_0) +; CHECK-NEXT: li a1, 25 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a1) +; CHECK-NEXT: slli a1, a1, 10 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fmv.h.x fa5, a1 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 @@ -224,23 +229,42 @@ define <16 x float> @nearbyint_v16f32(<16 x float> %v) strictfp { declare <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double>, metadata, metadata) define <2 x double> @nearbyint_v2f64(<2 x double> %v) strictfp { -; CHECK-LABEL: nearbyint_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI9_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI9_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: nearbyint_v2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI9_0) +; RV32-NEXT: fld fa5, %lo(.LCPI9_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: frflags a0 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: nearbyint_v2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: frflags a0 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %r = call <2 x double> @llvm.experimental.constrained.nearbyint.v2f64(<2 x double> %v, metadata !"round.dynamic", 
metadata !"fpexcept.strict") ret <2 x double> %r } @@ -248,23 +272,42 @@ define <2 x double> @nearbyint_v2f64(<2 x double> %v) strictfp { declare <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double>, metadata, metadata) define <4 x double> @nearbyint_v4f64(<4 x double> %v) strictfp { -; CHECK-LABEL: nearbyint_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI10_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI10_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: nearbyint_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI10_0) +; RV32-NEXT: fld fa5, %lo(.LCPI10_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v10, v8 +; RV32-NEXT: vmflt.vf v0, v10, fa5 +; RV32-NEXT: frflags a0 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: nearbyint_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v10, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v10, fa5 +; RV64-NEXT: frflags a0 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %r = call <4 x double> @llvm.experimental.constrained.nearbyint.v4f64(<4 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <4 x double> %r } @@ -272,23 +315,42 @@ define <4 x double> @nearbyint_v4f64(<4 x double> %v) strictfp { declare <8 x double> @llvm.experimental.constrained.nearbyint.v8f64(<8 x double>, metadata, metadata) define <8 x double> @nearbyint_v8f64(<8 x double> %v) strictfp { -; CHECK-LABEL: nearbyint_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: nearbyint_v8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI11_0) +; RV32-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v12, v8 +; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: frflags a0 +; 
RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: nearbyint_v8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: frflags a0 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %r = call <8 x double> @llvm.experimental.constrained.nearbyint.v8f64(<8 x double> %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <8 x double> %r } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll index abb929eaaf6e6..e256ba9dd5997 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH ; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfhmin,+f,+d -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN define void @fcmp_oeq_vv_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fcmp_oeq_vv_v8f16: @@ -437,6 +437,1036 @@ define void @fcmp_ugt_vv_v64f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-NEXT: vmnot.m v8, v24 ; ZVFH-NEXT: vsm.v v8, (a2) ; ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: fcmp_ugt_vv_v64f16: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: addi sp, sp, -512 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa_offset 512 +; RV32ZVFHMIN-NEXT: sw ra, 508(sp) # 4-byte Folded Spill +; RV32ZVFHMIN-NEXT: sw s0, 504(sp) # 4-byte Folded Spill +; RV32ZVFHMIN-NEXT: .cfi_offset ra, -4 +; RV32ZVFHMIN-NEXT: .cfi_offset s0, -8 +; RV32ZVFHMIN-NEXT: addi s0, sp, 512 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVFHMIN-NEXT: andi sp, sp, -128 +; RV32ZVFHMIN-NEXT: li a3, 64 +; RV32ZVFHMIN-NEXT: vsetvli zero, a3, e16, m8, ta, ma +; RV32ZVFHMIN-NEXT: vle16.v v16, (a1) +; RV32ZVFHMIN-NEXT: vle16.v v8, (a0) +; RV32ZVFHMIN-NEXT: addi a0, sp, 128 +; RV32ZVFHMIN-NEXT: addi a1, sp, 256 +; RV32ZVFHMIN-NEXT: vse16.v v16, (a0) +; RV32ZVFHMIN-NEXT: vse16.v v8, (a1) +; RV32ZVFHMIN-NEXT: lh a0, 192(sp) +; RV32ZVFHMIN-NEXT: lh a1, 320(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; 
RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 96(sp) +; RV32ZVFHMIN-NEXT: lh a0, 190(sp) +; RV32ZVFHMIN-NEXT: lh a1, 318(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 95(sp) +; RV32ZVFHMIN-NEXT: lh a0, 188(sp) +; RV32ZVFHMIN-NEXT: lh a1, 316(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 94(sp) +; RV32ZVFHMIN-NEXT: lh a0, 186(sp) +; RV32ZVFHMIN-NEXT: lh a1, 314(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 93(sp) +; RV32ZVFHMIN-NEXT: lh a0, 184(sp) +; RV32ZVFHMIN-NEXT: lh a1, 312(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 92(sp) +; RV32ZVFHMIN-NEXT: lh a0, 182(sp) +; RV32ZVFHMIN-NEXT: lh a1, 310(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 91(sp) +; RV32ZVFHMIN-NEXT: lh a0, 180(sp) +; RV32ZVFHMIN-NEXT: lh a1, 308(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 90(sp) +; RV32ZVFHMIN-NEXT: lh a0, 178(sp) +; RV32ZVFHMIN-NEXT: lh a1, 306(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 89(sp) +; RV32ZVFHMIN-NEXT: lh a1, 176(sp) +; RV32ZVFHMIN-NEXT: lh a4, 304(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a0, v16 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a4 +; RV32ZVFHMIN-NEXT: fle.h a1, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a1, a1, 1 +; RV32ZVFHMIN-NEXT: sb a1, 88(sp) +; RV32ZVFHMIN-NEXT: lh a4, 174(sp) +; RV32ZVFHMIN-NEXT: lh a5, 302(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a1, v8 +; RV32ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32ZVFHMIN-NEXT: vslidedown.vi v12, v16, 7 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: sb a4, 87(sp) +; RV32ZVFHMIN-NEXT: lh a4, 172(sp) +; RV32ZVFHMIN-NEXT: lh a5, 300(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v13, v8, 7 +; RV32ZVFHMIN-NEXT: vslidedown.vi v14, v16, 6 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: sb a4, 86(sp) +; RV32ZVFHMIN-NEXT: lh a4, 170(sp) +; RV32ZVFHMIN-NEXT: lh a5, 298(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v15, v8, 6 +; RV32ZVFHMIN-NEXT: vslidedown.vi v18, v16, 5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: sb a4, 85(sp) +; RV32ZVFHMIN-NEXT: lh a4, 168(sp) +; RV32ZVFHMIN-NEXT: lh a5, 296(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v19, v8, 5 +; RV32ZVFHMIN-NEXT: vslidedown.vi v20, v16, 4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; 
RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: sb a4, 84(sp) +; RV32ZVFHMIN-NEXT: lh a4, 166(sp) +; RV32ZVFHMIN-NEXT: lh a5, 294(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v21, v8, 4 +; RV32ZVFHMIN-NEXT: vslidedown.vi v23, v16, 3 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: sb a4, 83(sp) +; RV32ZVFHMIN-NEXT: lh a4, 164(sp) +; RV32ZVFHMIN-NEXT: lh a5, 292(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v22, v8, 3 +; RV32ZVFHMIN-NEXT: vslidedown.vi v25, v16, 2 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: sb a4, 82(sp) +; RV32ZVFHMIN-NEXT: lh a4, 162(sp) +; RV32ZVFHMIN-NEXT: lh a5, 290(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v24, v8, 2 +; RV32ZVFHMIN-NEXT: vslidedown.vi v26, v16, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: xori a0, a4, 1 +; RV32ZVFHMIN-NEXT: sb a0, 81(sp) +; RV32ZVFHMIN-NEXT: lh a0, 160(sp) +; RV32ZVFHMIN-NEXT: lh a1, 288(sp) +; RV32ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a4, 64(sp) +; RV32ZVFHMIN-NEXT: sb a0, 80(sp) +; RV32ZVFHMIN-NEXT: lh a0, 226(sp) +; RV32ZVFHMIN-NEXT: lh a1, 354(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v27, v8, 1 +; RV32ZVFHMIN-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; RV32ZVFHMIN-NEXT: vslidedown.vi v10, v16, 15 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 113(sp) +; RV32ZVFHMIN-NEXT: lh a4, 224(sp) +; RV32ZVFHMIN-NEXT: lh a5, 352(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a1, v12 +; RV32ZVFHMIN-NEXT: vmv.x.s a0, v13 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: sb a4, 112(sp) +; RV32ZVFHMIN-NEXT: lh a4, 222(sp) +; RV32ZVFHMIN-NEXT: lh a6, 350(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v12, v8, 15 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v14 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV32ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: sb a4, 111(sp) +; RV32ZVFHMIN-NEXT: lh a4, 220(sp) +; RV32ZVFHMIN-NEXT: lh a6, 348(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v15 +; RV32ZVFHMIN-NEXT: vslidedown.vi v14, v16, 14 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV32ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: sb a4, 110(sp) +; RV32ZVFHMIN-NEXT: lh t0, 218(sp) +; RV32ZVFHMIN-NEXT: lh t1, 346(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v18 +; RV32ZVFHMIN-NEXT: vmv.x.s a4, v19 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV32ZVFHMIN-NEXT: fle.h t0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori t0, t0, 1 +; RV32ZVFHMIN-NEXT: sb t0, 109(sp) +; RV32ZVFHMIN-NEXT: lh t0, 216(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v18, v16, 13 +; RV32ZVFHMIN-NEXT: vmv.x.s t1, v20 +; RV32ZVFHMIN-NEXT: lh t2, 344(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v21 +; 
RV32ZVFHMIN-NEXT: vslidedown.vi v20, v16, 12 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t2 +; RV32ZVFHMIN-NEXT: fle.h t2, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV32ZVFHMIN-NEXT: xori a1, t2, 1 +; RV32ZVFHMIN-NEXT: sb a1, 108(sp) +; RV32ZVFHMIN-NEXT: lh a1, 214(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a0 +; RV32ZVFHMIN-NEXT: lh t3, 342(sp) +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV32ZVFHMIN-NEXT: vmv.x.s t2, v23 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t3 +; RV32ZVFHMIN-NEXT: fle.h a1, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: xori a1, a1, 1 +; RV32ZVFHMIN-NEXT: sb a1, 107(sp) +; RV32ZVFHMIN-NEXT: lh a5, 212(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV32ZVFHMIN-NEXT: fle.h a1, fa4, fa5 +; RV32ZVFHMIN-NEXT: lh a7, 340(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: vmv.x.s t3, v22 +; RV32ZVFHMIN-NEXT: vslidedown.vi v22, v16, 11 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV32ZVFHMIN-NEXT: fle.h a5, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: xori a5, a5, 1 +; RV32ZVFHMIN-NEXT: sb a5, 106(sp) +; RV32ZVFHMIN-NEXT: lh a5, 210(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a4 +; RV32ZVFHMIN-NEXT: lh a6, 338(sp) +; RV32ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v25 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV32ZVFHMIN-NEXT: fle.h a5, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV32ZVFHMIN-NEXT: xori a5, a5, 1 +; RV32ZVFHMIN-NEXT: sb a5, 105(sp) +; RV32ZVFHMIN-NEXT: lh a6, 208(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV32ZVFHMIN-NEXT: fle.h a5, fa4, fa5 +; RV32ZVFHMIN-NEXT: lh t0, 336(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v24 +; RV32ZVFHMIN-NEXT: vslidedown.vi v24, v16, 10 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV32ZVFHMIN-NEXT: fle.h t0, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t2 +; RV32ZVFHMIN-NEXT: xori t0, t0, 1 +; RV32ZVFHMIN-NEXT: sb t0, 104(sp) +; RV32ZVFHMIN-NEXT: lh t0, 206(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t3 +; RV32ZVFHMIN-NEXT: lh t1, 334(sp) +; RV32ZVFHMIN-NEXT: fle.h t2, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v26 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV32ZVFHMIN-NEXT: fle.h t1, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: xori a7, t1, 1 +; RV32ZVFHMIN-NEXT: sb a7, 103(sp) +; RV32ZVFHMIN-NEXT: lh a7, 204(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV32ZVFHMIN-NEXT: fle.h a6, fa4, fa5 +; RV32ZVFHMIN-NEXT: lh t1, 332(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v27 +; RV32ZVFHMIN-NEXT: vslidedown.vi v26, v16, 9 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV32ZVFHMIN-NEXT: fle.h t1, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV32ZVFHMIN-NEXT: xori a7, t1, 1 +; RV32ZVFHMIN-NEXT: sb a7, 102(sp) +; RV32ZVFHMIN-NEXT: lh a7, 202(sp) +; RV32ZVFHMIN-NEXT: lh t0, 330(sp) +; RV32ZVFHMIN-NEXT: fle.h t1, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV32ZVFHMIN-NEXT: fle.h a7, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a7, a7, 1 +; RV32ZVFHMIN-NEXT: sb a7, 101(sp) +; RV32ZVFHMIN-NEXT: lh a7, 200(sp) +; RV32ZVFHMIN-NEXT: lh t0, 328(sp) +; RV32ZVFHMIN-NEXT: xori a1, a1, 1 +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV32ZVFHMIN-NEXT: fle.h a7, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a7, a7, 1 +; RV32ZVFHMIN-NEXT: sb a7, 100(sp) +; RV32ZVFHMIN-NEXT: lh 
a7, 198(sp) +; RV32ZVFHMIN-NEXT: lh t0, 326(sp) +; RV32ZVFHMIN-NEXT: xori a5, a5, 1 +; RV32ZVFHMIN-NEXT: xori t2, t2, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV32ZVFHMIN-NEXT: fle.h a7, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a7, a7, 1 +; RV32ZVFHMIN-NEXT: sb a7, 99(sp) +; RV32ZVFHMIN-NEXT: lh a7, 196(sp) +; RV32ZVFHMIN-NEXT: lh t0, 324(sp) +; RV32ZVFHMIN-NEXT: xori a6, a6, 1 +; RV32ZVFHMIN-NEXT: xori t1, t1, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV32ZVFHMIN-NEXT: fle.h a7, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a7, a7, 1 +; RV32ZVFHMIN-NEXT: sb a7, 98(sp) +; RV32ZVFHMIN-NEXT: lh a7, 194(sp) +; RV32ZVFHMIN-NEXT: lh t0, 322(sp) +; RV32ZVFHMIN-NEXT: sb t1, 65(sp) +; RV32ZVFHMIN-NEXT: sb a6, 66(sp) +; RV32ZVFHMIN-NEXT: sb t2, 67(sp) +; RV32ZVFHMIN-NEXT: sb a5, 68(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV32ZVFHMIN-NEXT: fle.h a5, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a5, a5, 1 +; RV32ZVFHMIN-NEXT: sb a4, 69(sp) +; RV32ZVFHMIN-NEXT: sb a1, 70(sp) +; RV32ZVFHMIN-NEXT: sb a0, 71(sp) +; RV32ZVFHMIN-NEXT: sb a5, 97(sp) +; RV32ZVFHMIN-NEXT: lh a0, 254(sp) +; RV32ZVFHMIN-NEXT: lh a1, 382(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v16, v16, 8 +; RV32ZVFHMIN-NEXT: vslidedown.vi v2, v8, 14 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 127(sp) +; RV32ZVFHMIN-NEXT: lh a0, 252(sp) +; RV32ZVFHMIN-NEXT: lh a1, 380(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v0, v8, 13 +; RV32ZVFHMIN-NEXT: vslidedown.vi v4, v8, 12 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 126(sp) +; RV32ZVFHMIN-NEXT: lh a0, 250(sp) +; RV32ZVFHMIN-NEXT: lh a1, 378(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v6, v8, 11 +; RV32ZVFHMIN-NEXT: vslidedown.vi v30, v8, 10 +; RV32ZVFHMIN-NEXT: vslidedown.vi v28, v8, 9 +; RV32ZVFHMIN-NEXT: vslidedown.vi v8, v8, 8 +; RV32ZVFHMIN-NEXT: vmv.x.s a4, v10 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 125(sp) +; RV32ZVFHMIN-NEXT: lh a0, 248(sp) +; RV32ZVFHMIN-NEXT: lh a1, 376(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v12 +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v14 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 124(sp) +; RV32ZVFHMIN-NEXT: lh a0, 246(sp) +; RV32ZVFHMIN-NEXT: lh a1, 374(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v2 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v18 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 123(sp) +; RV32ZVFHMIN-NEXT: lh a0, 244(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s t1, v0 +; RV32ZVFHMIN-NEXT: lh a1, 372(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s t2, v20 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: vmv.x.s t3, v4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 122(sp) +; RV32ZVFHMIN-NEXT: lh a1, 242(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: lh a4, 370(sp) +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: 
fmv.h.x fa5, a1 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v22 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a4 +; RV32ZVFHMIN-NEXT: fle.h a1, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: xori a1, a1, 1 +; RV32ZVFHMIN-NEXT: sb a1, 121(sp) +; RV32ZVFHMIN-NEXT: lh a4, 240(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV32ZVFHMIN-NEXT: lh a6, 368(sp) +; RV32ZVFHMIN-NEXT: fle.h a1, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v6 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV32ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: sb a4, 120(sp) +; RV32ZVFHMIN-NEXT: lh a6, 238(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV32ZVFHMIN-NEXT: lh t0, 366(sp) +; RV32ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v24 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV32ZVFHMIN-NEXT: fle.h t0, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t2 +; RV32ZVFHMIN-NEXT: xori t0, t0, 1 +; RV32ZVFHMIN-NEXT: sb t0, 119(sp) +; RV32ZVFHMIN-NEXT: lh t0, 236(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t3 +; RV32ZVFHMIN-NEXT: lh t1, 364(sp) +; RV32ZVFHMIN-NEXT: fle.h t2, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v30 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV32ZVFHMIN-NEXT: fle.h t1, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: xori a5, t1, 1 +; RV32ZVFHMIN-NEXT: sb a5, 118(sp) +; RV32ZVFHMIN-NEXT: lh a5, 234(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV32ZVFHMIN-NEXT: lh a7, 362(sp) +; RV32ZVFHMIN-NEXT: fle.h t1, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v26 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV32ZVFHMIN-NEXT: fle.h a7, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: xori a6, a7, 1 +; RV32ZVFHMIN-NEXT: sb a6, 117(sp) +; RV32ZVFHMIN-NEXT: lh a6, 232(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV32ZVFHMIN-NEXT: lh a7, 360(sp) +; RV32ZVFHMIN-NEXT: fle.h t0, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v28 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV32ZVFHMIN-NEXT: fle.h a7, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: xori a5, a7, 1 +; RV32ZVFHMIN-NEXT: sb a5, 116(sp) +; RV32ZVFHMIN-NEXT: lh a5, 230(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV32ZVFHMIN-NEXT: lh a6, 358(sp) +; RV32ZVFHMIN-NEXT: fle.h a7, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v16 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV32ZVFHMIN-NEXT: fle.h a6, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v8 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: xori a1, a1, 1 +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: xori a5, t2, 1 +; RV32ZVFHMIN-NEXT: xori a6, a6, 1 +; RV32ZVFHMIN-NEXT: sb a6, 115(sp) +; RV32ZVFHMIN-NEXT: lh a6, 228(sp) +; RV32ZVFHMIN-NEXT: lh t2, 356(sp) +; RV32ZVFHMIN-NEXT: sb a5, 76(sp) +; RV32ZVFHMIN-NEXT: sb a4, 77(sp) +; RV32ZVFHMIN-NEXT: sb a1, 78(sp) +; RV32ZVFHMIN-NEXT: sb a0, 79(sp) +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: xori a1, t1, 1 +; RV32ZVFHMIN-NEXT: xori a4, t0, 1 +; RV32ZVFHMIN-NEXT: xori a5, a7, 1 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 72(sp) +; RV32ZVFHMIN-NEXT: sb a5, 73(sp) +; RV32ZVFHMIN-NEXT: sb a4, 74(sp) +; RV32ZVFHMIN-NEXT: sb a1, 75(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t2 +; RV32ZVFHMIN-NEXT: fle.h a0, fa4, 
fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 114(sp) +; RV32ZVFHMIN-NEXT: addi a0, sp, 64 +; RV32ZVFHMIN-NEXT: vsetvli zero, a3, e8, m4, ta, ma +; RV32ZVFHMIN-NEXT: vle8.v v8, (a0) +; RV32ZVFHMIN-NEXT: vand.vi v8, v8, 1 +; RV32ZVFHMIN-NEXT: vmsne.vi v12, v8, 0 +; RV32ZVFHMIN-NEXT: vsm.v v12, (a2) +; RV32ZVFHMIN-NEXT: addi sp, s0, -512 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa sp, 512 +; RV32ZVFHMIN-NEXT: lw ra, 508(sp) # 4-byte Folded Reload +; RV32ZVFHMIN-NEXT: lw s0, 504(sp) # 4-byte Folded Reload +; RV32ZVFHMIN-NEXT: .cfi_restore ra +; RV32ZVFHMIN-NEXT: .cfi_restore s0 +; RV32ZVFHMIN-NEXT: addi sp, sp, 512 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: fcmp_ugt_vv_v64f16: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: addi sp, sp, -512 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa_offset 512 +; RV64ZVFHMIN-NEXT: sd ra, 504(sp) # 8-byte Folded Spill +; RV64ZVFHMIN-NEXT: sd s0, 496(sp) # 8-byte Folded Spill +; RV64ZVFHMIN-NEXT: .cfi_offset ra, -8 +; RV64ZVFHMIN-NEXT: .cfi_offset s0, -16 +; RV64ZVFHMIN-NEXT: addi s0, sp, 512 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa s0, 0 +; RV64ZVFHMIN-NEXT: andi sp, sp, -128 +; RV64ZVFHMIN-NEXT: li a3, 64 +; RV64ZVFHMIN-NEXT: vsetvli zero, a3, e16, m8, ta, ma +; RV64ZVFHMIN-NEXT: vle16.v v16, (a1) +; RV64ZVFHMIN-NEXT: vle16.v v8, (a0) +; RV64ZVFHMIN-NEXT: addi a0, sp, 128 +; RV64ZVFHMIN-NEXT: addi a1, sp, 256 +; RV64ZVFHMIN-NEXT: vse16.v v16, (a0) +; RV64ZVFHMIN-NEXT: vse16.v v8, (a1) +; RV64ZVFHMIN-NEXT: lh a0, 192(sp) +; RV64ZVFHMIN-NEXT: lh a1, 320(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 96(sp) +; RV64ZVFHMIN-NEXT: lh a0, 190(sp) +; RV64ZVFHMIN-NEXT: lh a1, 318(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 95(sp) +; RV64ZVFHMIN-NEXT: lh a0, 188(sp) +; RV64ZVFHMIN-NEXT: lh a1, 316(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 94(sp) +; RV64ZVFHMIN-NEXT: lh a0, 186(sp) +; RV64ZVFHMIN-NEXT: lh a1, 314(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 93(sp) +; RV64ZVFHMIN-NEXT: lh a0, 184(sp) +; RV64ZVFHMIN-NEXT: lh a1, 312(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 92(sp) +; RV64ZVFHMIN-NEXT: lh a0, 182(sp) +; RV64ZVFHMIN-NEXT: lh a1, 310(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 91(sp) +; RV64ZVFHMIN-NEXT: lh a0, 180(sp) +; RV64ZVFHMIN-NEXT: lh a1, 308(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 90(sp) +; RV64ZVFHMIN-NEXT: lh a0, 178(sp) +; RV64ZVFHMIN-NEXT: lh a1, 306(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 89(sp) +; 
RV64ZVFHMIN-NEXT: lh a1, 176(sp) +; RV64ZVFHMIN-NEXT: lh a4, 304(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a0, v16 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a4 +; RV64ZVFHMIN-NEXT: fle.h a1, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a1, a1, 1 +; RV64ZVFHMIN-NEXT: sb a1, 88(sp) +; RV64ZVFHMIN-NEXT: lh a4, 174(sp) +; RV64ZVFHMIN-NEXT: lh a5, 302(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a1, v8 +; RV64ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVFHMIN-NEXT: vslidedown.vi v12, v16, 7 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: sb a4, 87(sp) +; RV64ZVFHMIN-NEXT: lh a4, 172(sp) +; RV64ZVFHMIN-NEXT: lh a5, 300(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v13, v8, 7 +; RV64ZVFHMIN-NEXT: vslidedown.vi v14, v16, 6 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: sb a4, 86(sp) +; RV64ZVFHMIN-NEXT: lh a4, 170(sp) +; RV64ZVFHMIN-NEXT: lh a5, 298(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v15, v8, 6 +; RV64ZVFHMIN-NEXT: vslidedown.vi v18, v16, 5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: sb a4, 85(sp) +; RV64ZVFHMIN-NEXT: lh a4, 168(sp) +; RV64ZVFHMIN-NEXT: lh a5, 296(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v19, v8, 5 +; RV64ZVFHMIN-NEXT: vslidedown.vi v20, v16, 4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: sb a4, 84(sp) +; RV64ZVFHMIN-NEXT: lh a4, 166(sp) +; RV64ZVFHMIN-NEXT: lh a5, 294(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v21, v8, 4 +; RV64ZVFHMIN-NEXT: vslidedown.vi v23, v16, 3 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: sb a4, 83(sp) +; RV64ZVFHMIN-NEXT: lh a4, 164(sp) +; RV64ZVFHMIN-NEXT: lh a5, 292(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v22, v8, 3 +; RV64ZVFHMIN-NEXT: vslidedown.vi v25, v16, 2 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: sb a4, 82(sp) +; RV64ZVFHMIN-NEXT: lh a4, 162(sp) +; RV64ZVFHMIN-NEXT: lh a5, 290(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v24, v8, 2 +; RV64ZVFHMIN-NEXT: vslidedown.vi v26, v16, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: xori a0, a4, 1 +; RV64ZVFHMIN-NEXT: sb a0, 81(sp) +; RV64ZVFHMIN-NEXT: lh a0, 160(sp) +; RV64ZVFHMIN-NEXT: lh a1, 288(sp) +; RV64ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a4, 64(sp) +; RV64ZVFHMIN-NEXT: sb a0, 80(sp) +; RV64ZVFHMIN-NEXT: lh a0, 226(sp) +; RV64ZVFHMIN-NEXT: lh a1, 354(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v27, v8, 1 +; RV64ZVFHMIN-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; RV64ZVFHMIN-NEXT: vslidedown.vi v10, v16, 15 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, 
fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 113(sp) +; RV64ZVFHMIN-NEXT: lh a4, 224(sp) +; RV64ZVFHMIN-NEXT: lh a5, 352(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a1, v12 +; RV64ZVFHMIN-NEXT: vmv.x.s a0, v13 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: sb a4, 112(sp) +; RV64ZVFHMIN-NEXT: lh a4, 222(sp) +; RV64ZVFHMIN-NEXT: lh a6, 350(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v12, v8, 15 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v14 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV64ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: sb a4, 111(sp) +; RV64ZVFHMIN-NEXT: lh a4, 220(sp) +; RV64ZVFHMIN-NEXT: lh a6, 348(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v15 +; RV64ZVFHMIN-NEXT: vslidedown.vi v14, v16, 14 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV64ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: sb a4, 110(sp) +; RV64ZVFHMIN-NEXT: lh t0, 218(sp) +; RV64ZVFHMIN-NEXT: lh t1, 346(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v18 +; RV64ZVFHMIN-NEXT: vmv.x.s a4, v19 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV64ZVFHMIN-NEXT: fle.h t0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori t0, t0, 1 +; RV64ZVFHMIN-NEXT: sb t0, 109(sp) +; RV64ZVFHMIN-NEXT: lh t0, 216(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v18, v16, 13 +; RV64ZVFHMIN-NEXT: vmv.x.s t1, v20 +; RV64ZVFHMIN-NEXT: lh t2, 344(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v21 +; RV64ZVFHMIN-NEXT: vslidedown.vi v20, v16, 12 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t2 +; RV64ZVFHMIN-NEXT: fle.h t2, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV64ZVFHMIN-NEXT: xori a1, t2, 1 +; RV64ZVFHMIN-NEXT: sb a1, 108(sp) +; RV64ZVFHMIN-NEXT: lh a1, 214(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a0 +; RV64ZVFHMIN-NEXT: lh t3, 342(sp) +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV64ZVFHMIN-NEXT: vmv.x.s t2, v23 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t3 +; RV64ZVFHMIN-NEXT: fle.h a1, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: xori a1, a1, 1 +; RV64ZVFHMIN-NEXT: sb a1, 107(sp) +; RV64ZVFHMIN-NEXT: lh a5, 212(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV64ZVFHMIN-NEXT: fle.h a1, fa4, fa5 +; RV64ZVFHMIN-NEXT: lh a7, 340(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: vmv.x.s t3, v22 +; RV64ZVFHMIN-NEXT: vslidedown.vi v22, v16, 11 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV64ZVFHMIN-NEXT: fle.h a5, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: xori a5, a5, 1 +; RV64ZVFHMIN-NEXT: sb a5, 106(sp) +; RV64ZVFHMIN-NEXT: lh a5, 210(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a4 +; RV64ZVFHMIN-NEXT: lh a6, 338(sp) +; RV64ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v25 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV64ZVFHMIN-NEXT: fle.h a5, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV64ZVFHMIN-NEXT: xori a5, a5, 1 +; RV64ZVFHMIN-NEXT: sb a5, 105(sp) +; RV64ZVFHMIN-NEXT: lh a6, 208(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV64ZVFHMIN-NEXT: fle.h a5, fa4, fa5 +; RV64ZVFHMIN-NEXT: lh t0, 336(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v24 +; RV64ZVFHMIN-NEXT: vslidedown.vi v24, v16, 10 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV64ZVFHMIN-NEXT: fle.h t0, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t2 
+; RV64ZVFHMIN-NEXT: xori t0, t0, 1 +; RV64ZVFHMIN-NEXT: sb t0, 104(sp) +; RV64ZVFHMIN-NEXT: lh t0, 206(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t3 +; RV64ZVFHMIN-NEXT: lh t1, 334(sp) +; RV64ZVFHMIN-NEXT: fle.h t2, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v26 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV64ZVFHMIN-NEXT: fle.h t1, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: xori a7, t1, 1 +; RV64ZVFHMIN-NEXT: sb a7, 103(sp) +; RV64ZVFHMIN-NEXT: lh a7, 204(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV64ZVFHMIN-NEXT: fle.h a6, fa4, fa5 +; RV64ZVFHMIN-NEXT: lh t1, 332(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v27 +; RV64ZVFHMIN-NEXT: vslidedown.vi v26, v16, 9 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV64ZVFHMIN-NEXT: fle.h t1, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV64ZVFHMIN-NEXT: xori a7, t1, 1 +; RV64ZVFHMIN-NEXT: sb a7, 102(sp) +; RV64ZVFHMIN-NEXT: lh a7, 202(sp) +; RV64ZVFHMIN-NEXT: lh t0, 330(sp) +; RV64ZVFHMIN-NEXT: fle.h t1, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV64ZVFHMIN-NEXT: fle.h a7, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a7, a7, 1 +; RV64ZVFHMIN-NEXT: sb a7, 101(sp) +; RV64ZVFHMIN-NEXT: lh a7, 200(sp) +; RV64ZVFHMIN-NEXT: lh t0, 328(sp) +; RV64ZVFHMIN-NEXT: xori a1, a1, 1 +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV64ZVFHMIN-NEXT: fle.h a7, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a7, a7, 1 +; RV64ZVFHMIN-NEXT: sb a7, 100(sp) +; RV64ZVFHMIN-NEXT: lh a7, 198(sp) +; RV64ZVFHMIN-NEXT: lh t0, 326(sp) +; RV64ZVFHMIN-NEXT: xori a5, a5, 1 +; RV64ZVFHMIN-NEXT: xori t2, t2, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV64ZVFHMIN-NEXT: fle.h a7, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a7, a7, 1 +; RV64ZVFHMIN-NEXT: sb a7, 99(sp) +; RV64ZVFHMIN-NEXT: lh a7, 196(sp) +; RV64ZVFHMIN-NEXT: lh t0, 324(sp) +; RV64ZVFHMIN-NEXT: xori a6, a6, 1 +; RV64ZVFHMIN-NEXT: xori t1, t1, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV64ZVFHMIN-NEXT: fle.h a7, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a7, a7, 1 +; RV64ZVFHMIN-NEXT: sb a7, 98(sp) +; RV64ZVFHMIN-NEXT: lh a7, 194(sp) +; RV64ZVFHMIN-NEXT: lh t0, 322(sp) +; RV64ZVFHMIN-NEXT: sb t1, 65(sp) +; RV64ZVFHMIN-NEXT: sb a6, 66(sp) +; RV64ZVFHMIN-NEXT: sb t2, 67(sp) +; RV64ZVFHMIN-NEXT: sb a5, 68(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV64ZVFHMIN-NEXT: fle.h a5, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a5, a5, 1 +; RV64ZVFHMIN-NEXT: sb a4, 69(sp) +; RV64ZVFHMIN-NEXT: sb a1, 70(sp) +; RV64ZVFHMIN-NEXT: sb a0, 71(sp) +; RV64ZVFHMIN-NEXT: sb a5, 97(sp) +; RV64ZVFHMIN-NEXT: lh a0, 254(sp) +; RV64ZVFHMIN-NEXT: lh a1, 382(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v16, v16, 8 +; RV64ZVFHMIN-NEXT: vslidedown.vi v2, v8, 14 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 127(sp) +; RV64ZVFHMIN-NEXT: lh a0, 252(sp) +; RV64ZVFHMIN-NEXT: lh a1, 380(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v0, v8, 13 +; RV64ZVFHMIN-NEXT: vslidedown.vi v4, v8, 12 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 126(sp) +; RV64ZVFHMIN-NEXT: lh a0, 250(sp) +; 
RV64ZVFHMIN-NEXT: lh a1, 378(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v6, v8, 11 +; RV64ZVFHMIN-NEXT: vslidedown.vi v30, v8, 10 +; RV64ZVFHMIN-NEXT: vslidedown.vi v28, v8, 9 +; RV64ZVFHMIN-NEXT: vslidedown.vi v8, v8, 8 +; RV64ZVFHMIN-NEXT: vmv.x.s a4, v10 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 125(sp) +; RV64ZVFHMIN-NEXT: lh a0, 248(sp) +; RV64ZVFHMIN-NEXT: lh a1, 376(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v12 +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v14 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 124(sp) +; RV64ZVFHMIN-NEXT: lh a0, 246(sp) +; RV64ZVFHMIN-NEXT: lh a1, 374(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v2 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v18 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 123(sp) +; RV64ZVFHMIN-NEXT: lh a0, 244(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s t1, v0 +; RV64ZVFHMIN-NEXT: lh a1, 372(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s t2, v20 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmv.x.s t3, v4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 122(sp) +; RV64ZVFHMIN-NEXT: lh a1, 242(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: lh a4, 370(sp) +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v22 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a4 +; RV64ZVFHMIN-NEXT: fle.h a1, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: xori a1, a1, 1 +; RV64ZVFHMIN-NEXT: sb a1, 121(sp) +; RV64ZVFHMIN-NEXT: lh a4, 240(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV64ZVFHMIN-NEXT: lh a6, 368(sp) +; RV64ZVFHMIN-NEXT: fle.h a1, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v6 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV64ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: sb a4, 120(sp) +; RV64ZVFHMIN-NEXT: lh a6, 238(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV64ZVFHMIN-NEXT: lh t0, 366(sp) +; RV64ZVFHMIN-NEXT: fle.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v24 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV64ZVFHMIN-NEXT: fle.h t0, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t2 +; RV64ZVFHMIN-NEXT: xori t0, t0, 1 +; RV64ZVFHMIN-NEXT: sb t0, 119(sp) +; RV64ZVFHMIN-NEXT: lh t0, 236(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t3 +; RV64ZVFHMIN-NEXT: lh t1, 364(sp) +; RV64ZVFHMIN-NEXT: fle.h t2, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v30 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV64ZVFHMIN-NEXT: fle.h t1, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: xori a5, t1, 1 +; RV64ZVFHMIN-NEXT: sb a5, 118(sp) +; RV64ZVFHMIN-NEXT: lh a5, 234(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV64ZVFHMIN-NEXT: lh a7, 362(sp) +; RV64ZVFHMIN-NEXT: fle.h t1, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v26 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV64ZVFHMIN-NEXT: fle.h a7, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: xori a6, a7, 1 +; RV64ZVFHMIN-NEXT: sb a6, 117(sp) +; 
RV64ZVFHMIN-NEXT: lh a6, 232(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV64ZVFHMIN-NEXT: lh a7, 360(sp) +; RV64ZVFHMIN-NEXT: fle.h t0, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v28 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV64ZVFHMIN-NEXT: fle.h a7, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: xori a5, a7, 1 +; RV64ZVFHMIN-NEXT: sb a5, 116(sp) +; RV64ZVFHMIN-NEXT: lh a5, 230(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV64ZVFHMIN-NEXT: lh a6, 358(sp) +; RV64ZVFHMIN-NEXT: fle.h a7, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v16 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV64ZVFHMIN-NEXT: fle.h a6, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v8 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: xori a1, a1, 1 +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: xori a5, t2, 1 +; RV64ZVFHMIN-NEXT: xori a6, a6, 1 +; RV64ZVFHMIN-NEXT: sb a6, 115(sp) +; RV64ZVFHMIN-NEXT: lh a6, 228(sp) +; RV64ZVFHMIN-NEXT: lh t2, 356(sp) +; RV64ZVFHMIN-NEXT: sb a5, 76(sp) +; RV64ZVFHMIN-NEXT: sb a4, 77(sp) +; RV64ZVFHMIN-NEXT: sb a1, 78(sp) +; RV64ZVFHMIN-NEXT: sb a0, 79(sp) +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a1, t1, 1 +; RV64ZVFHMIN-NEXT: xori a4, t0, 1 +; RV64ZVFHMIN-NEXT: xori a5, a7, 1 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 72(sp) +; RV64ZVFHMIN-NEXT: sb a5, 73(sp) +; RV64ZVFHMIN-NEXT: sb a4, 74(sp) +; RV64ZVFHMIN-NEXT: sb a1, 75(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t2 +; RV64ZVFHMIN-NEXT: fle.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 114(sp) +; RV64ZVFHMIN-NEXT: addi a0, sp, 64 +; RV64ZVFHMIN-NEXT: vsetvli zero, a3, e8, m4, ta, ma +; RV64ZVFHMIN-NEXT: vle8.v v8, (a0) +; RV64ZVFHMIN-NEXT: vand.vi v8, v8, 1 +; RV64ZVFHMIN-NEXT: vmsne.vi v12, v8, 0 +; RV64ZVFHMIN-NEXT: vsm.v v12, (a2) +; RV64ZVFHMIN-NEXT: addi sp, s0, -512 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa sp, 512 +; RV64ZVFHMIN-NEXT: ld ra, 504(sp) # 8-byte Folded Reload +; RV64ZVFHMIN-NEXT: ld s0, 496(sp) # 8-byte Folded Reload +; RV64ZVFHMIN-NEXT: .cfi_restore ra +; RV64ZVFHMIN-NEXT: .cfi_restore s0 +; RV64ZVFHMIN-NEXT: addi sp, sp, 512 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; RV64ZVFHMIN-NEXT: ret %a = load <64 x half>, ptr %x %b = load <64 x half>, ptr %y %c = fcmp ugt <64 x half> %a, %b @@ -454,6 +1484,908 @@ define void @fcmp_ugt_vv_v64f16_nonans(ptr %x, ptr %y, ptr %z) { ; ZVFH-NEXT: vmflt.vv v24, v16, v8 ; ZVFH-NEXT: vsm.v v24, (a2) ; ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: fcmp_ugt_vv_v64f16_nonans: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: addi sp, sp, -512 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa_offset 512 +; RV32ZVFHMIN-NEXT: sw ra, 508(sp) # 4-byte Folded Spill +; RV32ZVFHMIN-NEXT: sw s0, 504(sp) # 4-byte Folded Spill +; RV32ZVFHMIN-NEXT: .cfi_offset ra, -4 +; RV32ZVFHMIN-NEXT: .cfi_offset s0, -8 +; RV32ZVFHMIN-NEXT: addi s0, sp, 512 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVFHMIN-NEXT: andi sp, sp, -128 +; RV32ZVFHMIN-NEXT: li a3, 64 +; RV32ZVFHMIN-NEXT: vsetvli zero, a3, e16, m8, ta, ma +; RV32ZVFHMIN-NEXT: vle16.v v16, (a0) +; RV32ZVFHMIN-NEXT: vle16.v v8, (a1) +; RV32ZVFHMIN-NEXT: addi a0, sp, 256 +; RV32ZVFHMIN-NEXT: addi a1, sp, 128 +; RV32ZVFHMIN-NEXT: vse16.v v16, (a0) +; RV32ZVFHMIN-NEXT: vse16.v v8, (a1) +; RV32ZVFHMIN-NEXT: lh a0, 320(sp) +; RV32ZVFHMIN-NEXT: lh a1, 192(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; 
RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 96(sp) +; RV32ZVFHMIN-NEXT: lh a0, 318(sp) +; RV32ZVFHMIN-NEXT: lh a1, 190(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 95(sp) +; RV32ZVFHMIN-NEXT: lh a0, 316(sp) +; RV32ZVFHMIN-NEXT: lh a1, 188(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 94(sp) +; RV32ZVFHMIN-NEXT: lh a0, 314(sp) +; RV32ZVFHMIN-NEXT: lh a1, 186(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 93(sp) +; RV32ZVFHMIN-NEXT: lh a0, 312(sp) +; RV32ZVFHMIN-NEXT: lh a1, 184(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 92(sp) +; RV32ZVFHMIN-NEXT: lh a0, 310(sp) +; RV32ZVFHMIN-NEXT: lh a1, 182(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 91(sp) +; RV32ZVFHMIN-NEXT: lh a0, 308(sp) +; RV32ZVFHMIN-NEXT: lh a1, 180(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 90(sp) +; RV32ZVFHMIN-NEXT: lh a0, 306(sp) +; RV32ZVFHMIN-NEXT: lh a1, 178(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 89(sp) +; RV32ZVFHMIN-NEXT: lh a0, 304(sp) +; RV32ZVFHMIN-NEXT: lh a1, 176(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 88(sp) +; RV32ZVFHMIN-NEXT: lh a0, 302(sp) +; RV32ZVFHMIN-NEXT: lh a1, 174(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 87(sp) +; RV32ZVFHMIN-NEXT: lh a0, 300(sp) +; RV32ZVFHMIN-NEXT: lh a1, 172(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 86(sp) +; RV32ZVFHMIN-NEXT: lh a1, 298(sp) +; RV32ZVFHMIN-NEXT: lh a4, 170(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a0, v16 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a4 +; RV32ZVFHMIN-NEXT: flt.h a1, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a1, 85(sp) +; RV32ZVFHMIN-NEXT: lh a4, 296(sp) +; RV32ZVFHMIN-NEXT: lh a5, 168(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a1, v8 +; RV32ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32ZVFHMIN-NEXT: vslidedown.vi v12, v16, 7 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a4, 84(sp) +; RV32ZVFHMIN-NEXT: lh a4, 294(sp) +; RV32ZVFHMIN-NEXT: lh a5, 166(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v13, v8, 7 +; RV32ZVFHMIN-NEXT: vslidedown.vi v14, v16, 6 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a4, 83(sp) +; RV32ZVFHMIN-NEXT: lh a4, 292(sp) +; RV32ZVFHMIN-NEXT: lh a5, 164(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v15, v8, 6 +; RV32ZVFHMIN-NEXT: vslidedown.vi v18, v16, 5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb 
a4, 82(sp) +; RV32ZVFHMIN-NEXT: lh a4, 290(sp) +; RV32ZVFHMIN-NEXT: lh a5, 162(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v19, v8, 5 +; RV32ZVFHMIN-NEXT: vslidedown.vi v20, v16, 4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: sb a4, 81(sp) +; RV32ZVFHMIN-NEXT: lh a0, 288(sp) +; RV32ZVFHMIN-NEXT: lh a4, 160(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a1, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a4 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a1, 64(sp) +; RV32ZVFHMIN-NEXT: sb a0, 80(sp) +; RV32ZVFHMIN-NEXT: lh a0, 354(sp) +; RV32ZVFHMIN-NEXT: lh a1, 226(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v21, v8, 4 +; RV32ZVFHMIN-NEXT: vslidedown.vi v23, v16, 3 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 113(sp) +; RV32ZVFHMIN-NEXT: lh a0, 352(sp) +; RV32ZVFHMIN-NEXT: lh a1, 224(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v22, v8, 3 +; RV32ZVFHMIN-NEXT: vslidedown.vi v25, v16, 2 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 112(sp) +; RV32ZVFHMIN-NEXT: lh a0, 350(sp) +; RV32ZVFHMIN-NEXT: lh a1, 222(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v24, v8, 2 +; RV32ZVFHMIN-NEXT: vslidedown.vi v27, v16, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 111(sp) +; RV32ZVFHMIN-NEXT: lh a0, 348(sp) +; RV32ZVFHMIN-NEXT: lh a1, 220(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v26, v8, 1 +; RV32ZVFHMIN-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; RV32ZVFHMIN-NEXT: vslidedown.vi v10, v16, 15 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 110(sp) +; RV32ZVFHMIN-NEXT: lh a4, 346(sp) +; RV32ZVFHMIN-NEXT: lh a5, 218(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a1, v12 +; RV32ZVFHMIN-NEXT: vmv.x.s a0, v13 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a4, 109(sp) +; RV32ZVFHMIN-NEXT: lh a4, 344(sp) +; RV32ZVFHMIN-NEXT: lh a6, 216(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v12, v8, 15 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v14 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV32ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a4, 108(sp) +; RV32ZVFHMIN-NEXT: lh a4, 342(sp) +; RV32ZVFHMIN-NEXT: lh a6, 214(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v15 +; RV32ZVFHMIN-NEXT: vslidedown.vi v14, v16, 14 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV32ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a4, 107(sp) +; RV32ZVFHMIN-NEXT: lh t0, 340(sp) +; RV32ZVFHMIN-NEXT: lh t1, 212(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v18 +; RV32ZVFHMIN-NEXT: vmv.x.s a4, v19 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV32ZVFHMIN-NEXT: flt.h t0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb t0, 106(sp) +; RV32ZVFHMIN-NEXT: lh t1, 338(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v18, v16, 13 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v20 +; RV32ZVFHMIN-NEXT: lh t2, 210(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV32ZVFHMIN-NEXT: vmv.x.s t1, v21 +; RV32ZVFHMIN-NEXT: vslidedown.vi v20, v16, 12 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t2 +; RV32ZVFHMIN-NEXT: 
flt.h t2, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV32ZVFHMIN-NEXT: sb t2, 105(sp) +; RV32ZVFHMIN-NEXT: lh a1, 336(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a0 +; RV32ZVFHMIN-NEXT: lh t3, 208(sp) +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV32ZVFHMIN-NEXT: vmv.x.s t2, v23 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t3 +; RV32ZVFHMIN-NEXT: flt.h a1, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: sb a1, 104(sp) +; RV32ZVFHMIN-NEXT: lh a5, 334(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV32ZVFHMIN-NEXT: flt.h a1, fa4, fa5 +; RV32ZVFHMIN-NEXT: lh a7, 206(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: vmv.x.s t3, v22 +; RV32ZVFHMIN-NEXT: vslidedown.vi v22, v16, 11 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV32ZVFHMIN-NEXT: flt.h a5, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: sb a5, 103(sp) +; RV32ZVFHMIN-NEXT: lh a5, 332(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a4 +; RV32ZVFHMIN-NEXT: lh a6, 204(sp) +; RV32ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v25 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV32ZVFHMIN-NEXT: flt.h a5, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: sb a5, 102(sp) +; RV32ZVFHMIN-NEXT: lh a6, 330(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV32ZVFHMIN-NEXT: flt.h a5, fa4, fa5 +; RV32ZVFHMIN-NEXT: lh t0, 202(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v24 +; RV32ZVFHMIN-NEXT: vslidedown.vi v24, v16, 10 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV32ZVFHMIN-NEXT: flt.h t0, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t2 +; RV32ZVFHMIN-NEXT: sb t0, 101(sp) +; RV32ZVFHMIN-NEXT: lh t0, 328(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t3 +; RV32ZVFHMIN-NEXT: lh t1, 200(sp) +; RV32ZVFHMIN-NEXT: flt.h t2, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v27 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV32ZVFHMIN-NEXT: flt.h t1, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: sb t1, 100(sp) +; RV32ZVFHMIN-NEXT: lh a7, 326(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV32ZVFHMIN-NEXT: flt.h a6, fa4, fa5 +; RV32ZVFHMIN-NEXT: lh t1, 198(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v26 +; RV32ZVFHMIN-NEXT: vslidedown.vi v26, v16, 9 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV32ZVFHMIN-NEXT: flt.h t1, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: sb t1, 99(sp) +; RV32ZVFHMIN-NEXT: lh t0, 324(sp) +; RV32ZVFHMIN-NEXT: lh t1, 196(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV32ZVFHMIN-NEXT: flt.h a7, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV32ZVFHMIN-NEXT: flt.h t0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb t0, 98(sp) +; RV32ZVFHMIN-NEXT: lh t0, 322(sp) +; RV32ZVFHMIN-NEXT: lh t1, 194(sp) +; RV32ZVFHMIN-NEXT: sb a7, 65(sp) +; RV32ZVFHMIN-NEXT: sb a6, 66(sp) +; RV32ZVFHMIN-NEXT: sb t2, 67(sp) +; RV32ZVFHMIN-NEXT: sb a5, 68(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV32ZVFHMIN-NEXT: flt.h a5, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a4, 69(sp) +; RV32ZVFHMIN-NEXT: sb a1, 70(sp) +; RV32ZVFHMIN-NEXT: sb a0, 71(sp) +; RV32ZVFHMIN-NEXT: sb a5, 97(sp) +; RV32ZVFHMIN-NEXT: lh a0, 382(sp) +; RV32ZVFHMIN-NEXT: lh a1, 254(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v16, v16, 8 +; RV32ZVFHMIN-NEXT: vslidedown.vi v2, v8, 14 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 127(sp) +; 
RV32ZVFHMIN-NEXT: lh a0, 380(sp) +; RV32ZVFHMIN-NEXT: lh a1, 252(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v0, v8, 13 +; RV32ZVFHMIN-NEXT: vslidedown.vi v4, v8, 12 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 126(sp) +; RV32ZVFHMIN-NEXT: lh a0, 378(sp) +; RV32ZVFHMIN-NEXT: lh a1, 250(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v6, v8, 11 +; RV32ZVFHMIN-NEXT: vslidedown.vi v30, v8, 10 +; RV32ZVFHMIN-NEXT: vslidedown.vi v28, v8, 9 +; RV32ZVFHMIN-NEXT: vslidedown.vi v8, v8, 8 +; RV32ZVFHMIN-NEXT: vmv.x.s a4, v10 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 125(sp) +; RV32ZVFHMIN-NEXT: lh a0, 376(sp) +; RV32ZVFHMIN-NEXT: lh a1, 248(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v12 +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v14 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 124(sp) +; RV32ZVFHMIN-NEXT: lh a0, 374(sp) +; RV32ZVFHMIN-NEXT: lh a1, 246(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v2 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v18 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 123(sp) +; RV32ZVFHMIN-NEXT: lh a0, 372(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s t1, v0 +; RV32ZVFHMIN-NEXT: lh a1, 244(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s t2, v20 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: vmv.x.s t3, v4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: sb a0, 122(sp) +; RV32ZVFHMIN-NEXT: lh a1, 370(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: lh a4, 242(sp) +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v22 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a4 +; RV32ZVFHMIN-NEXT: flt.h a1, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: sb a1, 121(sp) +; RV32ZVFHMIN-NEXT: lh a4, 368(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV32ZVFHMIN-NEXT: lh a6, 240(sp) +; RV32ZVFHMIN-NEXT: flt.h a1, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v6 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV32ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: sb a4, 120(sp) +; RV32ZVFHMIN-NEXT: lh a6, 366(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV32ZVFHMIN-NEXT: lh t0, 238(sp) +; RV32ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v24 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV32ZVFHMIN-NEXT: flt.h t0, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t2 +; RV32ZVFHMIN-NEXT: sb t0, 119(sp) +; RV32ZVFHMIN-NEXT: lh t0, 364(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t3 +; RV32ZVFHMIN-NEXT: lh t1, 236(sp) +; RV32ZVFHMIN-NEXT: flt.h t2, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v30 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV32ZVFHMIN-NEXT: flt.h t1, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: sb t1, 118(sp) +; RV32ZVFHMIN-NEXT: lh a5, 362(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV32ZVFHMIN-NEXT: lh a7, 234(sp) +; RV32ZVFHMIN-NEXT: flt.h t1, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v26 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV32ZVFHMIN-NEXT: flt.h a7, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; 
RV32ZVFHMIN-NEXT: sb a7, 117(sp) +; RV32ZVFHMIN-NEXT: lh a6, 360(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV32ZVFHMIN-NEXT: lh a7, 232(sp) +; RV32ZVFHMIN-NEXT: flt.h t0, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v28 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV32ZVFHMIN-NEXT: flt.h a7, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: sb a7, 116(sp) +; RV32ZVFHMIN-NEXT: lh a5, 358(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV32ZVFHMIN-NEXT: lh a6, 230(sp) +; RV32ZVFHMIN-NEXT: flt.h a7, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v16 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV32ZVFHMIN-NEXT: flt.h a6, fa4, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v8 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV32ZVFHMIN-NEXT: sb a6, 115(sp) +; RV32ZVFHMIN-NEXT: lh a5, 356(sp) +; RV32ZVFHMIN-NEXT: lh a6, 228(sp) +; RV32ZVFHMIN-NEXT: sb t2, 76(sp) +; RV32ZVFHMIN-NEXT: sb a4, 77(sp) +; RV32ZVFHMIN-NEXT: sb a1, 78(sp) +; RV32ZVFHMIN-NEXT: sb a0, 79(sp) +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 72(sp) +; RV32ZVFHMIN-NEXT: sb a7, 73(sp) +; RV32ZVFHMIN-NEXT: sb t0, 74(sp) +; RV32ZVFHMIN-NEXT: sb t1, 75(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV32ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 114(sp) +; RV32ZVFHMIN-NEXT: addi a0, sp, 64 +; RV32ZVFHMIN-NEXT: vsetvli zero, a3, e8, m4, ta, ma +; RV32ZVFHMIN-NEXT: vle8.v v8, (a0) +; RV32ZVFHMIN-NEXT: vand.vi v8, v8, 1 +; RV32ZVFHMIN-NEXT: vmsne.vi v12, v8, 0 +; RV32ZVFHMIN-NEXT: vsm.v v12, (a2) +; RV32ZVFHMIN-NEXT: addi sp, s0, -512 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa sp, 512 +; RV32ZVFHMIN-NEXT: lw ra, 508(sp) # 4-byte Folded Reload +; RV32ZVFHMIN-NEXT: lw s0, 504(sp) # 4-byte Folded Reload +; RV32ZVFHMIN-NEXT: .cfi_restore ra +; RV32ZVFHMIN-NEXT: .cfi_restore s0 +; RV32ZVFHMIN-NEXT: addi sp, sp, 512 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: fcmp_ugt_vv_v64f16_nonans: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: addi sp, sp, -512 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa_offset 512 +; RV64ZVFHMIN-NEXT: sd ra, 504(sp) # 8-byte Folded Spill +; RV64ZVFHMIN-NEXT: sd s0, 496(sp) # 8-byte Folded Spill +; RV64ZVFHMIN-NEXT: .cfi_offset ra, -8 +; RV64ZVFHMIN-NEXT: .cfi_offset s0, -16 +; RV64ZVFHMIN-NEXT: addi s0, sp, 512 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa s0, 0 +; RV64ZVFHMIN-NEXT: andi sp, sp, -128 +; RV64ZVFHMIN-NEXT: li a3, 64 +; RV64ZVFHMIN-NEXT: vsetvli zero, a3, e16, m8, ta, ma +; RV64ZVFHMIN-NEXT: vle16.v v16, (a0) +; RV64ZVFHMIN-NEXT: vle16.v v8, (a1) +; RV64ZVFHMIN-NEXT: addi a0, sp, 256 +; RV64ZVFHMIN-NEXT: addi a1, sp, 128 +; RV64ZVFHMIN-NEXT: vse16.v v16, (a0) +; RV64ZVFHMIN-NEXT: vse16.v v8, (a1) +; RV64ZVFHMIN-NEXT: lh a0, 320(sp) +; RV64ZVFHMIN-NEXT: lh a1, 192(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 96(sp) +; RV64ZVFHMIN-NEXT: lh a0, 318(sp) +; RV64ZVFHMIN-NEXT: lh a1, 190(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 95(sp) +; RV64ZVFHMIN-NEXT: lh a0, 316(sp) +; RV64ZVFHMIN-NEXT: lh a1, 188(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 94(sp) +; RV64ZVFHMIN-NEXT: lh a0, 314(sp) +; RV64ZVFHMIN-NEXT: lh a1, 186(sp) +; 
RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 93(sp) +; RV64ZVFHMIN-NEXT: lh a0, 312(sp) +; RV64ZVFHMIN-NEXT: lh a1, 184(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 92(sp) +; RV64ZVFHMIN-NEXT: lh a0, 310(sp) +; RV64ZVFHMIN-NEXT: lh a1, 182(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 91(sp) +; RV64ZVFHMIN-NEXT: lh a0, 308(sp) +; RV64ZVFHMIN-NEXT: lh a1, 180(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 90(sp) +; RV64ZVFHMIN-NEXT: lh a0, 306(sp) +; RV64ZVFHMIN-NEXT: lh a1, 178(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 89(sp) +; RV64ZVFHMIN-NEXT: lh a0, 304(sp) +; RV64ZVFHMIN-NEXT: lh a1, 176(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 88(sp) +; RV64ZVFHMIN-NEXT: lh a0, 302(sp) +; RV64ZVFHMIN-NEXT: lh a1, 174(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 87(sp) +; RV64ZVFHMIN-NEXT: lh a0, 300(sp) +; RV64ZVFHMIN-NEXT: lh a1, 172(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 86(sp) +; RV64ZVFHMIN-NEXT: lh a1, 298(sp) +; RV64ZVFHMIN-NEXT: lh a4, 170(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a0, v16 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a4 +; RV64ZVFHMIN-NEXT: flt.h a1, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a1, 85(sp) +; RV64ZVFHMIN-NEXT: lh a4, 296(sp) +; RV64ZVFHMIN-NEXT: lh a5, 168(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a1, v8 +; RV64ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVFHMIN-NEXT: vslidedown.vi v12, v16, 7 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a4, 84(sp) +; RV64ZVFHMIN-NEXT: lh a4, 294(sp) +; RV64ZVFHMIN-NEXT: lh a5, 166(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v13, v8, 7 +; RV64ZVFHMIN-NEXT: vslidedown.vi v14, v16, 6 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a4, 83(sp) +; RV64ZVFHMIN-NEXT: lh a4, 292(sp) +; RV64ZVFHMIN-NEXT: lh a5, 164(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v15, v8, 6 +; RV64ZVFHMIN-NEXT: vslidedown.vi v18, v16, 5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a4, 82(sp) +; RV64ZVFHMIN-NEXT: lh a4, 290(sp) +; RV64ZVFHMIN-NEXT: lh a5, 162(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v19, v8, 5 +; RV64ZVFHMIN-NEXT: vslidedown.vi v20, v16, 4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: sb a4, 81(sp) +; RV64ZVFHMIN-NEXT: lh a0, 288(sp) +; RV64ZVFHMIN-NEXT: lh a4, 160(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a1, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a4 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; 
RV64ZVFHMIN-NEXT: sb a1, 64(sp) +; RV64ZVFHMIN-NEXT: sb a0, 80(sp) +; RV64ZVFHMIN-NEXT: lh a0, 354(sp) +; RV64ZVFHMIN-NEXT: lh a1, 226(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v21, v8, 4 +; RV64ZVFHMIN-NEXT: vslidedown.vi v23, v16, 3 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 113(sp) +; RV64ZVFHMIN-NEXT: lh a0, 352(sp) +; RV64ZVFHMIN-NEXT: lh a1, 224(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v22, v8, 3 +; RV64ZVFHMIN-NEXT: vslidedown.vi v25, v16, 2 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 112(sp) +; RV64ZVFHMIN-NEXT: lh a0, 350(sp) +; RV64ZVFHMIN-NEXT: lh a1, 222(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v24, v8, 2 +; RV64ZVFHMIN-NEXT: vslidedown.vi v27, v16, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 111(sp) +; RV64ZVFHMIN-NEXT: lh a0, 348(sp) +; RV64ZVFHMIN-NEXT: lh a1, 220(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v26, v8, 1 +; RV64ZVFHMIN-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; RV64ZVFHMIN-NEXT: vslidedown.vi v10, v16, 15 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 110(sp) +; RV64ZVFHMIN-NEXT: lh a4, 346(sp) +; RV64ZVFHMIN-NEXT: lh a5, 218(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a1, v12 +; RV64ZVFHMIN-NEXT: vmv.x.s a0, v13 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a4, 109(sp) +; RV64ZVFHMIN-NEXT: lh a4, 344(sp) +; RV64ZVFHMIN-NEXT: lh a6, 216(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v12, v8, 15 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v14 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV64ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a4, 108(sp) +; RV64ZVFHMIN-NEXT: lh a4, 342(sp) +; RV64ZVFHMIN-NEXT: lh a6, 214(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v15 +; RV64ZVFHMIN-NEXT: vslidedown.vi v14, v16, 14 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV64ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a4, 107(sp) +; RV64ZVFHMIN-NEXT: lh t0, 340(sp) +; RV64ZVFHMIN-NEXT: lh t1, 212(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v18 +; RV64ZVFHMIN-NEXT: vmv.x.s a4, v19 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV64ZVFHMIN-NEXT: flt.h t0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb t0, 106(sp) +; RV64ZVFHMIN-NEXT: lh t1, 338(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v18, v16, 13 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v20 +; RV64ZVFHMIN-NEXT: lh t2, 210(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV64ZVFHMIN-NEXT: vmv.x.s t1, v21 +; RV64ZVFHMIN-NEXT: vslidedown.vi v20, v16, 12 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t2 +; RV64ZVFHMIN-NEXT: flt.h t2, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV64ZVFHMIN-NEXT: sb t2, 105(sp) +; RV64ZVFHMIN-NEXT: lh a1, 336(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a0 +; RV64ZVFHMIN-NEXT: lh t3, 208(sp) +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV64ZVFHMIN-NEXT: vmv.x.s t2, v23 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t3 +; RV64ZVFHMIN-NEXT: flt.h a1, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: sb a1, 104(sp) +; RV64ZVFHMIN-NEXT: lh a5, 334(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV64ZVFHMIN-NEXT: flt.h a1, fa4, fa5 +; RV64ZVFHMIN-NEXT: lh a7, 206(sp) +; 
RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: vmv.x.s t3, v22 +; RV64ZVFHMIN-NEXT: vslidedown.vi v22, v16, 11 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV64ZVFHMIN-NEXT: flt.h a5, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: sb a5, 103(sp) +; RV64ZVFHMIN-NEXT: lh a5, 332(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a4 +; RV64ZVFHMIN-NEXT: lh a6, 204(sp) +; RV64ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v25 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV64ZVFHMIN-NEXT: flt.h a5, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: sb a5, 102(sp) +; RV64ZVFHMIN-NEXT: lh a6, 330(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV64ZVFHMIN-NEXT: flt.h a5, fa4, fa5 +; RV64ZVFHMIN-NEXT: lh t0, 202(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v24 +; RV64ZVFHMIN-NEXT: vslidedown.vi v24, v16, 10 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV64ZVFHMIN-NEXT: flt.h t0, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t2 +; RV64ZVFHMIN-NEXT: sb t0, 101(sp) +; RV64ZVFHMIN-NEXT: lh t0, 328(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t3 +; RV64ZVFHMIN-NEXT: lh t1, 200(sp) +; RV64ZVFHMIN-NEXT: flt.h t2, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v27 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV64ZVFHMIN-NEXT: flt.h t1, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: sb t1, 100(sp) +; RV64ZVFHMIN-NEXT: lh a7, 326(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV64ZVFHMIN-NEXT: flt.h a6, fa4, fa5 +; RV64ZVFHMIN-NEXT: lh t1, 198(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v26 +; RV64ZVFHMIN-NEXT: vslidedown.vi v26, v16, 9 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV64ZVFHMIN-NEXT: flt.h t1, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: sb t1, 99(sp) +; RV64ZVFHMIN-NEXT: lh t0, 324(sp) +; RV64ZVFHMIN-NEXT: lh t1, 196(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV64ZVFHMIN-NEXT: flt.h a7, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV64ZVFHMIN-NEXT: flt.h t0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb t0, 98(sp) +; RV64ZVFHMIN-NEXT: lh t0, 322(sp) +; RV64ZVFHMIN-NEXT: lh t1, 194(sp) +; RV64ZVFHMIN-NEXT: sb a7, 65(sp) +; RV64ZVFHMIN-NEXT: sb a6, 66(sp) +; RV64ZVFHMIN-NEXT: sb t2, 67(sp) +; RV64ZVFHMIN-NEXT: sb a5, 68(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV64ZVFHMIN-NEXT: flt.h a5, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a4, 69(sp) +; RV64ZVFHMIN-NEXT: sb a1, 70(sp) +; RV64ZVFHMIN-NEXT: sb a0, 71(sp) +; RV64ZVFHMIN-NEXT: sb a5, 97(sp) +; RV64ZVFHMIN-NEXT: lh a0, 382(sp) +; RV64ZVFHMIN-NEXT: lh a1, 254(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v16, v16, 8 +; RV64ZVFHMIN-NEXT: vslidedown.vi v2, v8, 14 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 127(sp) +; RV64ZVFHMIN-NEXT: lh a0, 380(sp) +; RV64ZVFHMIN-NEXT: lh a1, 252(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v0, v8, 13 +; RV64ZVFHMIN-NEXT: vslidedown.vi v4, v8, 12 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 126(sp) +; RV64ZVFHMIN-NEXT: lh a0, 378(sp) +; RV64ZVFHMIN-NEXT: lh a1, 250(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v6, v8, 11 +; RV64ZVFHMIN-NEXT: vslidedown.vi v30, v8, 10 +; RV64ZVFHMIN-NEXT: vslidedown.vi v28, v8, 9 +; RV64ZVFHMIN-NEXT: vslidedown.vi v8, v8, 8 +; RV64ZVFHMIN-NEXT: vmv.x.s a4, v10 +; 
RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 125(sp) +; RV64ZVFHMIN-NEXT: lh a0, 376(sp) +; RV64ZVFHMIN-NEXT: lh a1, 248(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v12 +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v14 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 124(sp) +; RV64ZVFHMIN-NEXT: lh a0, 374(sp) +; RV64ZVFHMIN-NEXT: lh a1, 246(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v2 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v18 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 123(sp) +; RV64ZVFHMIN-NEXT: lh a0, 372(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s t1, v0 +; RV64ZVFHMIN-NEXT: lh a1, 244(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s t2, v20 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmv.x.s t3, v4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a1 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: sb a0, 122(sp) +; RV64ZVFHMIN-NEXT: lh a1, 370(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: lh a4, 242(sp) +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a1 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v22 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a4 +; RV64ZVFHMIN-NEXT: flt.h a1, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: sb a1, 121(sp) +; RV64ZVFHMIN-NEXT: lh a4, 368(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV64ZVFHMIN-NEXT: lh a6, 240(sp) +; RV64ZVFHMIN-NEXT: flt.h a1, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v6 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV64ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: sb a4, 120(sp) +; RV64ZVFHMIN-NEXT: lh a6, 366(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV64ZVFHMIN-NEXT: lh t0, 238(sp) +; RV64ZVFHMIN-NEXT: flt.h a4, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v24 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV64ZVFHMIN-NEXT: flt.h t0, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t2 +; RV64ZVFHMIN-NEXT: sb t0, 119(sp) +; RV64ZVFHMIN-NEXT: lh t0, 364(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t3 +; RV64ZVFHMIN-NEXT: lh t1, 236(sp) +; RV64ZVFHMIN-NEXT: flt.h t2, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v30 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t1 +; RV64ZVFHMIN-NEXT: flt.h t1, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: sb t1, 118(sp) +; RV64ZVFHMIN-NEXT: lh a5, 362(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV64ZVFHMIN-NEXT: lh a7, 234(sp) +; RV64ZVFHMIN-NEXT: flt.h t1, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v26 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV64ZVFHMIN-NEXT: flt.h a7, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: sb a7, 117(sp) +; RV64ZVFHMIN-NEXT: lh a6, 360(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, t0 +; RV64ZVFHMIN-NEXT: lh a7, 232(sp) +; RV64ZVFHMIN-NEXT: flt.h t0, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v28 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a7 +; RV64ZVFHMIN-NEXT: flt.h a7, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: sb a7, 116(sp) +; RV64ZVFHMIN-NEXT: lh a5, 358(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV64ZVFHMIN-NEXT: lh a6, 230(sp) +; RV64ZVFHMIN-NEXT: flt.h a7, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: 
vmv.x.s a5, v16 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV64ZVFHMIN-NEXT: flt.h a6, fa4, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v8 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a5 +; RV64ZVFHMIN-NEXT: sb a6, 115(sp) +; RV64ZVFHMIN-NEXT: lh a5, 356(sp) +; RV64ZVFHMIN-NEXT: lh a6, 228(sp) +; RV64ZVFHMIN-NEXT: sb t2, 76(sp) +; RV64ZVFHMIN-NEXT: sb a4, 77(sp) +; RV64ZVFHMIN-NEXT: sb a1, 78(sp) +; RV64ZVFHMIN-NEXT: sb a0, 79(sp) +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 72(sp) +; RV64ZVFHMIN-NEXT: sb a7, 73(sp) +; RV64ZVFHMIN-NEXT: sb t0, 74(sp) +; RV64ZVFHMIN-NEXT: sb t1, 75(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa4, a6 +; RV64ZVFHMIN-NEXT: flt.h a0, fa4, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 114(sp) +; RV64ZVFHMIN-NEXT: addi a0, sp, 64 +; RV64ZVFHMIN-NEXT: vsetvli zero, a3, e8, m4, ta, ma +; RV64ZVFHMIN-NEXT: vle8.v v8, (a0) +; RV64ZVFHMIN-NEXT: vand.vi v8, v8, 1 +; RV64ZVFHMIN-NEXT: vmsne.vi v12, v8, 0 +; RV64ZVFHMIN-NEXT: vsm.v v12, (a2) +; RV64ZVFHMIN-NEXT: addi sp, s0, -512 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa sp, 512 +; RV64ZVFHMIN-NEXT: ld ra, 504(sp) # 8-byte Folded Reload +; RV64ZVFHMIN-NEXT: ld s0, 496(sp) # 8-byte Folded Reload +; RV64ZVFHMIN-NEXT: .cfi_restore ra +; RV64ZVFHMIN-NEXT: .cfi_restore s0 +; RV64ZVFHMIN-NEXT: addi sp, sp, 512 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; RV64ZVFHMIN-NEXT: ret %a = load <64 x half>, ptr %x %b = load <64 x half>, ptr %y %c = fcmp nnan ugt <64 x half> %a, %b @@ -1069,6 +3001,744 @@ define void @fcmp_ugt_vf_v64f16(ptr %x, half %y, ptr %z) { ; ZVFH-NEXT: vmnot.m v8, v16 ; ZVFH-NEXT: vsm.v v8, (a1) ; ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: fcmp_ugt_vf_v64f16: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: addi sp, sp, -384 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa_offset 384 +; RV32ZVFHMIN-NEXT: sw ra, 380(sp) # 4-byte Folded Spill +; RV32ZVFHMIN-NEXT: sw s0, 376(sp) # 4-byte Folded Spill +; RV32ZVFHMIN-NEXT: .cfi_offset ra, -4 +; RV32ZVFHMIN-NEXT: .cfi_offset s0, -8 +; RV32ZVFHMIN-NEXT: addi s0, sp, 384 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVFHMIN-NEXT: andi sp, sp, -128 +; RV32ZVFHMIN-NEXT: li a2, 64 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV32ZVFHMIN-NEXT: vle16.v v8, (a0) +; RV32ZVFHMIN-NEXT: addi a0, sp, 128 +; RV32ZVFHMIN-NEXT: vse16.v v8, (a0) +; RV32ZVFHMIN-NEXT: lh a0, 192(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 96(sp) +; RV32ZVFHMIN-NEXT: lh a0, 190(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 95(sp) +; RV32ZVFHMIN-NEXT: lh a0, 188(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 94(sp) +; RV32ZVFHMIN-NEXT: lh a0, 186(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 93(sp) +; RV32ZVFHMIN-NEXT: lh a0, 184(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 92(sp) +; RV32ZVFHMIN-NEXT: lh a0, 182(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 91(sp) +; RV32ZVFHMIN-NEXT: lh a0, 180(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; 
RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 90(sp) +; RV32ZVFHMIN-NEXT: lh a0, 178(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 89(sp) +; RV32ZVFHMIN-NEXT: lh a0, 176(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 88(sp) +; RV32ZVFHMIN-NEXT: lh a0, 174(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 87(sp) +; RV32ZVFHMIN-NEXT: lh a0, 172(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 86(sp) +; RV32ZVFHMIN-NEXT: lh a0, 170(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 85(sp) +; RV32ZVFHMIN-NEXT: lh a0, 168(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 84(sp) +; RV32ZVFHMIN-NEXT: lh a0, 166(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 83(sp) +; RV32ZVFHMIN-NEXT: lh a0, 164(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 82(sp) +; RV32ZVFHMIN-NEXT: lh a0, 162(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a3, v8 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 81(sp) +; RV32ZVFHMIN-NEXT: lh a0, 160(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV32ZVFHMIN-NEXT: fle.h a3, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a3, a3, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a3, 64(sp) +; RV32ZVFHMIN-NEXT: sb a0, 80(sp) +; RV32ZVFHMIN-NEXT: lh a0, 226(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 113(sp) +; RV32ZVFHMIN-NEXT: lh a0, 224(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 112(sp) +; RV32ZVFHMIN-NEXT: lh a0, 222(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 111(sp) +; RV32ZVFHMIN-NEXT: lh a0, 220(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 110(sp) +; RV32ZVFHMIN-NEXT: lh a0, 218(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 109(sp) +; RV32ZVFHMIN-NEXT: lh a0, 216(sp) +; RV32ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32ZVFHMIN-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVFHMIN-NEXT: vslidedown.vi v11, v8, 6 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 108(sp) +; RV32ZVFHMIN-NEXT: lh a0, 214(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v12, v8, 5 +; RV32ZVFHMIN-NEXT: vslidedown.vi v13, v8, 4 +; RV32ZVFHMIN-NEXT: vslidedown.vi v14, v8, 3 +; RV32ZVFHMIN-NEXT: fmv.h.x 
fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 107(sp) +; RV32ZVFHMIN-NEXT: lh a0, 212(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v15, v8, 2 +; RV32ZVFHMIN-NEXT: vslidedown.vi v16, v8, 1 +; RV32ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 106(sp) +; RV32ZVFHMIN-NEXT: lh a0, 210(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a4, v11 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v12 +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v13 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 105(sp) +; RV32ZVFHMIN-NEXT: lh a0, 208(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v14 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v15 +; RV32ZVFHMIN-NEXT: vmv.x.s t1, v16 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 104(sp) +; RV32ZVFHMIN-NEXT: lh a0, 206(sp) +; RV32ZVFHMIN-NEXT: fle.h a3, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fle.h a4, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 103(sp) +; RV32ZVFHMIN-NEXT: lh a0, 204(sp) +; RV32ZVFHMIN-NEXT: fle.h a5, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: fle.h a6, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 102(sp) +; RV32ZVFHMIN-NEXT: lh a0, 202(sp) +; RV32ZVFHMIN-NEXT: fle.h a7, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: fle.h t0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 101(sp) +; RV32ZVFHMIN-NEXT: lh a0, 200(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV32ZVFHMIN-NEXT: fle.h t1, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a3, a3, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 100(sp) +; RV32ZVFHMIN-NEXT: lh a0, 198(sp) +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: xori a5, a5, 1 +; RV32ZVFHMIN-NEXT: xori a6, a6, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 99(sp) +; RV32ZVFHMIN-NEXT: lh a0, 196(sp) +; RV32ZVFHMIN-NEXT: xori a7, a7, 1 +; RV32ZVFHMIN-NEXT: xori t0, t0, 1 +; RV32ZVFHMIN-NEXT: xori t1, t1, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 98(sp) +; RV32ZVFHMIN-NEXT: lh a0, 194(sp) +; RV32ZVFHMIN-NEXT: sb t1, 65(sp) +; RV32ZVFHMIN-NEXT: sb t0, 66(sp) +; RV32ZVFHMIN-NEXT: sb a7, 67(sp) +; RV32ZVFHMIN-NEXT: sb a6, 68(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a5, 69(sp) +; RV32ZVFHMIN-NEXT: sb a4, 70(sp) +; RV32ZVFHMIN-NEXT: sb a3, 71(sp) +; RV32ZVFHMIN-NEXT: sb a0, 97(sp) +; RV32ZVFHMIN-NEXT: lh a0, 254(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 127(sp) +; 
RV32ZVFHMIN-NEXT: lh a0, 252(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 126(sp) +; RV32ZVFHMIN-NEXT: lh a0, 250(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 125(sp) +; RV32ZVFHMIN-NEXT: lh a0, 248(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 124(sp) +; RV32ZVFHMIN-NEXT: lh a0, 246(sp) +; RV32ZVFHMIN-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; RV32ZVFHMIN-NEXT: vslidedown.vi v10, v8, 15 +; RV32ZVFHMIN-NEXT: vslidedown.vi v12, v8, 14 +; RV32ZVFHMIN-NEXT: vslidedown.vi v14, v8, 13 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 123(sp) +; RV32ZVFHMIN-NEXT: lh a0, 244(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v16, v8, 12 +; RV32ZVFHMIN-NEXT: vslidedown.vi v18, v8, 11 +; RV32ZVFHMIN-NEXT: vslidedown.vi v20, v8, 10 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 122(sp) +; RV32ZVFHMIN-NEXT: lh a0, 242(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v22, v8, 9 +; RV32ZVFHMIN-NEXT: vslidedown.vi v8, v8, 8 +; RV32ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 121(sp) +; RV32ZVFHMIN-NEXT: lh a0, 240(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a4, v12 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v14 +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v16 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 120(sp) +; RV32ZVFHMIN-NEXT: lh a0, 238(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v18 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v20 +; RV32ZVFHMIN-NEXT: vmv.x.s t1, v22 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 119(sp) +; RV32ZVFHMIN-NEXT: lh a0, 236(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s t2, v8 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV32ZVFHMIN-NEXT: fle.h a3, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 118(sp) +; RV32ZVFHMIN-NEXT: lh a0, 234(sp) +; RV32ZVFHMIN-NEXT: fle.h a4, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: fle.h a5, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 117(sp) +; RV32ZVFHMIN-NEXT: lh a0, 232(sp) +; RV32ZVFHMIN-NEXT: fle.h a6, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: fle.h a7, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 116(sp) +; RV32ZVFHMIN-NEXT: lh a0, 230(sp) +; RV32ZVFHMIN-NEXT: fle.h t0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV32ZVFHMIN-NEXT: fle.h t1, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t2 +; RV32ZVFHMIN-NEXT: xori a3, a3, 1 +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: xori a5, a5, 1 +; 
RV32ZVFHMIN-NEXT: xori a6, a6, 1 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 115(sp) +; RV32ZVFHMIN-NEXT: lh a0, 228(sp) +; RV32ZVFHMIN-NEXT: sb a6, 76(sp) +; RV32ZVFHMIN-NEXT: sb a5, 77(sp) +; RV32ZVFHMIN-NEXT: sb a4, 78(sp) +; RV32ZVFHMIN-NEXT: sb a3, 79(sp) +; RV32ZVFHMIN-NEXT: fle.h a3, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a4, a7, 1 +; RV32ZVFHMIN-NEXT: xori a5, t0, 1 +; RV32ZVFHMIN-NEXT: xori a6, t1, 1 +; RV32ZVFHMIN-NEXT: xori a3, a3, 1 +; RV32ZVFHMIN-NEXT: sb a3, 72(sp) +; RV32ZVFHMIN-NEXT: sb a6, 73(sp) +; RV32ZVFHMIN-NEXT: sb a5, 74(sp) +; RV32ZVFHMIN-NEXT: sb a4, 75(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 114(sp) +; RV32ZVFHMIN-NEXT: addi a0, sp, 64 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; RV32ZVFHMIN-NEXT: vle8.v v8, (a0) +; RV32ZVFHMIN-NEXT: vand.vi v8, v8, 1 +; RV32ZVFHMIN-NEXT: vmsne.vi v12, v8, 0 +; RV32ZVFHMIN-NEXT: vsm.v v12, (a1) +; RV32ZVFHMIN-NEXT: addi sp, s0, -384 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa sp, 384 +; RV32ZVFHMIN-NEXT: lw ra, 380(sp) # 4-byte Folded Reload +; RV32ZVFHMIN-NEXT: lw s0, 376(sp) # 4-byte Folded Reload +; RV32ZVFHMIN-NEXT: .cfi_restore ra +; RV32ZVFHMIN-NEXT: .cfi_restore s0 +; RV32ZVFHMIN-NEXT: addi sp, sp, 384 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: fcmp_ugt_vf_v64f16: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: addi sp, sp, -384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa_offset 384 +; RV64ZVFHMIN-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; RV64ZVFHMIN-NEXT: sd s0, 368(sp) # 8-byte Folded Spill +; RV64ZVFHMIN-NEXT: .cfi_offset ra, -8 +; RV64ZVFHMIN-NEXT: .cfi_offset s0, -16 +; RV64ZVFHMIN-NEXT: addi s0, sp, 384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa s0, 0 +; RV64ZVFHMIN-NEXT: andi sp, sp, -128 +; RV64ZVFHMIN-NEXT: li a2, 64 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV64ZVFHMIN-NEXT: vle16.v v8, (a0) +; RV64ZVFHMIN-NEXT: addi a0, sp, 128 +; RV64ZVFHMIN-NEXT: vse16.v v8, (a0) +; RV64ZVFHMIN-NEXT: lh a0, 192(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 96(sp) +; RV64ZVFHMIN-NEXT: lh a0, 190(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 95(sp) +; RV64ZVFHMIN-NEXT: lh a0, 188(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 94(sp) +; RV64ZVFHMIN-NEXT: lh a0, 186(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 93(sp) +; RV64ZVFHMIN-NEXT: lh a0, 184(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 92(sp) +; RV64ZVFHMIN-NEXT: lh a0, 182(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 91(sp) +; RV64ZVFHMIN-NEXT: lh a0, 180(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 90(sp) +; RV64ZVFHMIN-NEXT: lh a0, 178(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 89(sp) +; RV64ZVFHMIN-NEXT: lh a0, 176(sp) 
+; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 88(sp) +; RV64ZVFHMIN-NEXT: lh a0, 174(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 87(sp) +; RV64ZVFHMIN-NEXT: lh a0, 172(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 86(sp) +; RV64ZVFHMIN-NEXT: lh a0, 170(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 85(sp) +; RV64ZVFHMIN-NEXT: lh a0, 168(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 84(sp) +; RV64ZVFHMIN-NEXT: lh a0, 166(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 83(sp) +; RV64ZVFHMIN-NEXT: lh a0, 164(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 82(sp) +; RV64ZVFHMIN-NEXT: lh a0, 162(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a3, v8 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 81(sp) +; RV64ZVFHMIN-NEXT: lh a0, 160(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV64ZVFHMIN-NEXT: fle.h a3, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a3, a3, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a3, 64(sp) +; RV64ZVFHMIN-NEXT: sb a0, 80(sp) +; RV64ZVFHMIN-NEXT: lh a0, 226(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 113(sp) +; RV64ZVFHMIN-NEXT: lh a0, 224(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 112(sp) +; RV64ZVFHMIN-NEXT: lh a0, 222(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 111(sp) +; RV64ZVFHMIN-NEXT: lh a0, 220(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 110(sp) +; RV64ZVFHMIN-NEXT: lh a0, 218(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 109(sp) +; RV64ZVFHMIN-NEXT: lh a0, 216(sp) +; RV64ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVFHMIN-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVFHMIN-NEXT: vslidedown.vi v11, v8, 6 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 108(sp) +; RV64ZVFHMIN-NEXT: lh a0, 214(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVFHMIN-NEXT: vslidedown.vi v13, v8, 4 +; RV64ZVFHMIN-NEXT: vslidedown.vi v14, v8, 3 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 107(sp) +; RV64ZVFHMIN-NEXT: lh a0, 212(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v15, v8, 2 +; RV64ZVFHMIN-NEXT: vslidedown.vi v16, v8, 1 +; RV64ZVFHMIN-NEXT: vmv.x.s a3, v10 +; 
RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 106(sp) +; RV64ZVFHMIN-NEXT: lh a0, 210(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a4, v11 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v12 +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v13 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 105(sp) +; RV64ZVFHMIN-NEXT: lh a0, 208(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v14 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v15 +; RV64ZVFHMIN-NEXT: vmv.x.s t1, v16 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 104(sp) +; RV64ZVFHMIN-NEXT: lh a0, 206(sp) +; RV64ZVFHMIN-NEXT: fle.h a3, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fle.h a4, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 103(sp) +; RV64ZVFHMIN-NEXT: lh a0, 204(sp) +; RV64ZVFHMIN-NEXT: fle.h a5, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: fle.h a6, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 102(sp) +; RV64ZVFHMIN-NEXT: lh a0, 202(sp) +; RV64ZVFHMIN-NEXT: fle.h a7, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: fle.h t0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 101(sp) +; RV64ZVFHMIN-NEXT: lh a0, 200(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV64ZVFHMIN-NEXT: fle.h t1, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a3, a3, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 100(sp) +; RV64ZVFHMIN-NEXT: lh a0, 198(sp) +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: xori a5, a5, 1 +; RV64ZVFHMIN-NEXT: xori a6, a6, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 99(sp) +; RV64ZVFHMIN-NEXT: lh a0, 196(sp) +; RV64ZVFHMIN-NEXT: xori a7, a7, 1 +; RV64ZVFHMIN-NEXT: xori t0, t0, 1 +; RV64ZVFHMIN-NEXT: xori t1, t1, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 98(sp) +; RV64ZVFHMIN-NEXT: lh a0, 194(sp) +; RV64ZVFHMIN-NEXT: sb t1, 65(sp) +; RV64ZVFHMIN-NEXT: sb t0, 66(sp) +; RV64ZVFHMIN-NEXT: sb a7, 67(sp) +; RV64ZVFHMIN-NEXT: sb a6, 68(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a5, 69(sp) +; RV64ZVFHMIN-NEXT: sb a4, 70(sp) +; RV64ZVFHMIN-NEXT: sb a3, 71(sp) +; RV64ZVFHMIN-NEXT: sb a0, 97(sp) +; RV64ZVFHMIN-NEXT: lh a0, 254(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 127(sp) +; RV64ZVFHMIN-NEXT: lh a0, 252(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 126(sp) +; RV64ZVFHMIN-NEXT: lh a0, 250(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; 
RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 125(sp) +; RV64ZVFHMIN-NEXT: lh a0, 248(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 124(sp) +; RV64ZVFHMIN-NEXT: lh a0, 246(sp) +; RV64ZVFHMIN-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; RV64ZVFHMIN-NEXT: vslidedown.vi v10, v8, 15 +; RV64ZVFHMIN-NEXT: vslidedown.vi v12, v8, 14 +; RV64ZVFHMIN-NEXT: vslidedown.vi v14, v8, 13 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 123(sp) +; RV64ZVFHMIN-NEXT: lh a0, 244(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v16, v8, 12 +; RV64ZVFHMIN-NEXT: vslidedown.vi v18, v8, 11 +; RV64ZVFHMIN-NEXT: vslidedown.vi v20, v8, 10 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 122(sp) +; RV64ZVFHMIN-NEXT: lh a0, 242(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v22, v8, 9 +; RV64ZVFHMIN-NEXT: vslidedown.vi v8, v8, 8 +; RV64ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 121(sp) +; RV64ZVFHMIN-NEXT: lh a0, 240(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a4, v12 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v14 +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v16 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 120(sp) +; RV64ZVFHMIN-NEXT: lh a0, 238(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v18 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v20 +; RV64ZVFHMIN-NEXT: vmv.x.s t1, v22 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 119(sp) +; RV64ZVFHMIN-NEXT: lh a0, 236(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s t2, v8 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV64ZVFHMIN-NEXT: fle.h a3, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 118(sp) +; RV64ZVFHMIN-NEXT: lh a0, 234(sp) +; RV64ZVFHMIN-NEXT: fle.h a4, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: fle.h a5, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 117(sp) +; RV64ZVFHMIN-NEXT: lh a0, 232(sp) +; RV64ZVFHMIN-NEXT: fle.h a6, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: fle.h a7, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 116(sp) +; RV64ZVFHMIN-NEXT: lh a0, 230(sp) +; RV64ZVFHMIN-NEXT: fle.h t0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV64ZVFHMIN-NEXT: fle.h t1, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t2 +; RV64ZVFHMIN-NEXT: xori a3, a3, 1 +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: xori a5, a5, 1 +; RV64ZVFHMIN-NEXT: xori a6, a6, 1 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 115(sp) +; RV64ZVFHMIN-NEXT: lh a0, 228(sp) +; RV64ZVFHMIN-NEXT: sb a6, 76(sp) +; RV64ZVFHMIN-NEXT: sb a5, 77(sp) +; RV64ZVFHMIN-NEXT: sb a4, 78(sp) +; RV64ZVFHMIN-NEXT: sb a3, 79(sp) +; 
RV64ZVFHMIN-NEXT: fle.h a3, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a4, a7, 1 +; RV64ZVFHMIN-NEXT: xori a5, t0, 1 +; RV64ZVFHMIN-NEXT: xori a6, t1, 1 +; RV64ZVFHMIN-NEXT: xori a3, a3, 1 +; RV64ZVFHMIN-NEXT: sb a3, 72(sp) +; RV64ZVFHMIN-NEXT: sb a6, 73(sp) +; RV64ZVFHMIN-NEXT: sb a5, 74(sp) +; RV64ZVFHMIN-NEXT: sb a4, 75(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 114(sp) +; RV64ZVFHMIN-NEXT: addi a0, sp, 64 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; RV64ZVFHMIN-NEXT: vle8.v v8, (a0) +; RV64ZVFHMIN-NEXT: vand.vi v8, v8, 1 +; RV64ZVFHMIN-NEXT: vmsne.vi v12, v8, 0 +; RV64ZVFHMIN-NEXT: vsm.v v12, (a1) +; RV64ZVFHMIN-NEXT: addi sp, s0, -384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa sp, 384 +; RV64ZVFHMIN-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; RV64ZVFHMIN-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; RV64ZVFHMIN-NEXT: .cfi_restore ra +; RV64ZVFHMIN-NEXT: .cfi_restore s0 +; RV64ZVFHMIN-NEXT: addi sp, sp, 384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; RV64ZVFHMIN-NEXT: ret %a = load <64 x half>, ptr %x %b = insertelement <64 x half> poison, half %y, i32 0 %c = shufflevector <64 x half> %b, <64 x half> poison, <64 x i32> zeroinitializer @@ -1086,6 +3756,616 @@ define void @fcmp_ugt_vf_v64f16_nonans(ptr %x, half %y, ptr %z) { ; ZVFH-NEXT: vmfgt.vf v16, v8, fa0 ; ZVFH-NEXT: vsm.v v16, (a1) ; ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: fcmp_ugt_vf_v64f16_nonans: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: addi sp, sp, -384 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa_offset 384 +; RV32ZVFHMIN-NEXT: sw ra, 380(sp) # 4-byte Folded Spill +; RV32ZVFHMIN-NEXT: sw s0, 376(sp) # 4-byte Folded Spill +; RV32ZVFHMIN-NEXT: .cfi_offset ra, -4 +; RV32ZVFHMIN-NEXT: .cfi_offset s0, -8 +; RV32ZVFHMIN-NEXT: addi s0, sp, 384 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVFHMIN-NEXT: andi sp, sp, -128 +; RV32ZVFHMIN-NEXT: li a2, 64 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV32ZVFHMIN-NEXT: vle16.v v8, (a0) +; RV32ZVFHMIN-NEXT: addi a0, sp, 128 +; RV32ZVFHMIN-NEXT: vse16.v v8, (a0) +; RV32ZVFHMIN-NEXT: lh a0, 192(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 96(sp) +; RV32ZVFHMIN-NEXT: lh a0, 190(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 95(sp) +; RV32ZVFHMIN-NEXT: lh a0, 188(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 94(sp) +; RV32ZVFHMIN-NEXT: lh a0, 186(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 93(sp) +; RV32ZVFHMIN-NEXT: lh a0, 184(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 92(sp) +; RV32ZVFHMIN-NEXT: lh a0, 182(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 91(sp) +; RV32ZVFHMIN-NEXT: lh a0, 180(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 90(sp) +; RV32ZVFHMIN-NEXT: lh a0, 178(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 89(sp) +; RV32ZVFHMIN-NEXT: lh a0, 176(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 88(sp) +; RV32ZVFHMIN-NEXT: lh a0, 174(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; 
RV32ZVFHMIN-NEXT: sb a0, 87(sp) +; RV32ZVFHMIN-NEXT: lh a0, 172(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 86(sp) +; RV32ZVFHMIN-NEXT: lh a0, 170(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 85(sp) +; RV32ZVFHMIN-NEXT: lh a0, 168(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 84(sp) +; RV32ZVFHMIN-NEXT: lh a0, 166(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 83(sp) +; RV32ZVFHMIN-NEXT: lh a0, 164(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 82(sp) +; RV32ZVFHMIN-NEXT: lh a0, 162(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 81(sp) +; RV32ZVFHMIN-NEXT: lh a0, 160(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a3, v8 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV32ZVFHMIN-NEXT: flt.h a3, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a3, 64(sp) +; RV32ZVFHMIN-NEXT: sb a0, 80(sp) +; RV32ZVFHMIN-NEXT: lh a0, 226(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 113(sp) +; RV32ZVFHMIN-NEXT: lh a0, 224(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 112(sp) +; RV32ZVFHMIN-NEXT: lh a0, 222(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 111(sp) +; RV32ZVFHMIN-NEXT: lh a0, 220(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 110(sp) +; RV32ZVFHMIN-NEXT: lh a0, 218(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 109(sp) +; RV32ZVFHMIN-NEXT: lh a0, 216(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 108(sp) +; RV32ZVFHMIN-NEXT: lh a0, 214(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 107(sp) +; RV32ZVFHMIN-NEXT: lh a0, 212(sp) +; RV32ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32ZVFHMIN-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 106(sp) +; RV32ZVFHMIN-NEXT: lh a0, 210(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v11, v8, 6 +; RV32ZVFHMIN-NEXT: vslidedown.vi v12, v8, 5 +; RV32ZVFHMIN-NEXT: vslidedown.vi v13, v8, 4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 105(sp) +; RV32ZVFHMIN-NEXT: lh a0, 208(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v14, v8, 3 +; RV32ZVFHMIN-NEXT: vslidedown.vi v15, v8, 2 +; RV32ZVFHMIN-NEXT: vslidedown.vi v16, v8, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 104(sp) +; RV32ZVFHMIN-NEXT: lh a0, 206(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV32ZVFHMIN-NEXT: vmv.x.s a4, v11 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v12 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 103(sp) +; RV32ZVFHMIN-NEXT: lh a0, 204(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v13 +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v14 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v15 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h 
a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 102(sp) +; RV32ZVFHMIN-NEXT: lh a0, 202(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s t1, v16 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV32ZVFHMIN-NEXT: flt.h a3, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: sb a0, 101(sp) +; RV32ZVFHMIN-NEXT: lh a0, 200(sp) +; RV32ZVFHMIN-NEXT: flt.h a4, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: flt.h a5, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: sb a0, 100(sp) +; RV32ZVFHMIN-NEXT: lh a0, 198(sp) +; RV32ZVFHMIN-NEXT: flt.h a6, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: flt.h a7, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: sb a0, 99(sp) +; RV32ZVFHMIN-NEXT: lh a0, 196(sp) +; RV32ZVFHMIN-NEXT: flt.h t0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV32ZVFHMIN-NEXT: flt.h t1, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 98(sp) +; RV32ZVFHMIN-NEXT: lh a0, 194(sp) +; RV32ZVFHMIN-NEXT: sb t1, 65(sp) +; RV32ZVFHMIN-NEXT: sb t0, 66(sp) +; RV32ZVFHMIN-NEXT: sb a7, 67(sp) +; RV32ZVFHMIN-NEXT: sb a6, 68(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a5, 69(sp) +; RV32ZVFHMIN-NEXT: sb a4, 70(sp) +; RV32ZVFHMIN-NEXT: sb a3, 71(sp) +; RV32ZVFHMIN-NEXT: sb a0, 97(sp) +; RV32ZVFHMIN-NEXT: lh a0, 254(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 127(sp) +; RV32ZVFHMIN-NEXT: lh a0, 252(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 126(sp) +; RV32ZVFHMIN-NEXT: lh a0, 250(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 125(sp) +; RV32ZVFHMIN-NEXT: lh a0, 248(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 124(sp) +; RV32ZVFHMIN-NEXT: lh a0, 246(sp) +; RV32ZVFHMIN-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; RV32ZVFHMIN-NEXT: vslidedown.vi v10, v8, 15 +; RV32ZVFHMIN-NEXT: vslidedown.vi v12, v8, 14 +; RV32ZVFHMIN-NEXT: vslidedown.vi v14, v8, 13 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 123(sp) +; RV32ZVFHMIN-NEXT: lh a0, 244(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v16, v8, 12 +; RV32ZVFHMIN-NEXT: vslidedown.vi v18, v8, 11 +; RV32ZVFHMIN-NEXT: vslidedown.vi v20, v8, 10 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 122(sp) +; RV32ZVFHMIN-NEXT: lh a0, 242(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v22, v8, 9 +; RV32ZVFHMIN-NEXT: vslidedown.vi v8, v8, 8 +; RV32ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 121(sp) +; RV32ZVFHMIN-NEXT: lh a0, 240(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a4, v12 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v14 +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v16 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 120(sp) +; RV32ZVFHMIN-NEXT: lh a0, 238(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v18 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v20 +; RV32ZVFHMIN-NEXT: vmv.x.s t1, v22 +; RV32ZVFHMIN-NEXT: 
fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 119(sp) +; RV32ZVFHMIN-NEXT: lh a0, 236(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s t2, v8 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV32ZVFHMIN-NEXT: flt.h a3, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: sb a0, 118(sp) +; RV32ZVFHMIN-NEXT: lh a0, 234(sp) +; RV32ZVFHMIN-NEXT: flt.h a4, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: flt.h a5, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: sb a0, 117(sp) +; RV32ZVFHMIN-NEXT: lh a0, 232(sp) +; RV32ZVFHMIN-NEXT: flt.h a6, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: flt.h a7, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: sb a0, 116(sp) +; RV32ZVFHMIN-NEXT: lh a0, 230(sp) +; RV32ZVFHMIN-NEXT: flt.h t0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV32ZVFHMIN-NEXT: flt.h t1, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t2 +; RV32ZVFHMIN-NEXT: sb a0, 115(sp) +; RV32ZVFHMIN-NEXT: lh a0, 228(sp) +; RV32ZVFHMIN-NEXT: sb a6, 76(sp) +; RV32ZVFHMIN-NEXT: sb a5, 77(sp) +; RV32ZVFHMIN-NEXT: sb a4, 78(sp) +; RV32ZVFHMIN-NEXT: sb a3, 79(sp) +; RV32ZVFHMIN-NEXT: flt.h a3, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a3, 72(sp) +; RV32ZVFHMIN-NEXT: sb t1, 73(sp) +; RV32ZVFHMIN-NEXT: sb t0, 74(sp) +; RV32ZVFHMIN-NEXT: sb a7, 75(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: sb a0, 114(sp) +; RV32ZVFHMIN-NEXT: addi a0, sp, 64 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; RV32ZVFHMIN-NEXT: vle8.v v8, (a0) +; RV32ZVFHMIN-NEXT: vand.vi v8, v8, 1 +; RV32ZVFHMIN-NEXT: vmsne.vi v12, v8, 0 +; RV32ZVFHMIN-NEXT: vsm.v v12, (a1) +; RV32ZVFHMIN-NEXT: addi sp, s0, -384 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa sp, 384 +; RV32ZVFHMIN-NEXT: lw ra, 380(sp) # 4-byte Folded Reload +; RV32ZVFHMIN-NEXT: lw s0, 376(sp) # 4-byte Folded Reload +; RV32ZVFHMIN-NEXT: .cfi_restore ra +; RV32ZVFHMIN-NEXT: .cfi_restore s0 +; RV32ZVFHMIN-NEXT: addi sp, sp, 384 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: fcmp_ugt_vf_v64f16_nonans: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: addi sp, sp, -384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa_offset 384 +; RV64ZVFHMIN-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; RV64ZVFHMIN-NEXT: sd s0, 368(sp) # 8-byte Folded Spill +; RV64ZVFHMIN-NEXT: .cfi_offset ra, -8 +; RV64ZVFHMIN-NEXT: .cfi_offset s0, -16 +; RV64ZVFHMIN-NEXT: addi s0, sp, 384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa s0, 0 +; RV64ZVFHMIN-NEXT: andi sp, sp, -128 +; RV64ZVFHMIN-NEXT: li a2, 64 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV64ZVFHMIN-NEXT: vle16.v v8, (a0) +; RV64ZVFHMIN-NEXT: addi a0, sp, 128 +; RV64ZVFHMIN-NEXT: vse16.v v8, (a0) +; RV64ZVFHMIN-NEXT: lh a0, 192(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 96(sp) +; RV64ZVFHMIN-NEXT: lh a0, 190(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 95(sp) +; RV64ZVFHMIN-NEXT: lh a0, 188(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 94(sp) +; RV64ZVFHMIN-NEXT: lh a0, 186(sp) +; 
RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 93(sp) +; RV64ZVFHMIN-NEXT: lh a0, 184(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 92(sp) +; RV64ZVFHMIN-NEXT: lh a0, 182(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 91(sp) +; RV64ZVFHMIN-NEXT: lh a0, 180(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 90(sp) +; RV64ZVFHMIN-NEXT: lh a0, 178(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 89(sp) +; RV64ZVFHMIN-NEXT: lh a0, 176(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 88(sp) +; RV64ZVFHMIN-NEXT: lh a0, 174(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 87(sp) +; RV64ZVFHMIN-NEXT: lh a0, 172(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 86(sp) +; RV64ZVFHMIN-NEXT: lh a0, 170(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 85(sp) +; RV64ZVFHMIN-NEXT: lh a0, 168(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 84(sp) +; RV64ZVFHMIN-NEXT: lh a0, 166(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 83(sp) +; RV64ZVFHMIN-NEXT: lh a0, 164(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 82(sp) +; RV64ZVFHMIN-NEXT: lh a0, 162(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 81(sp) +; RV64ZVFHMIN-NEXT: lh a0, 160(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a3, v8 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV64ZVFHMIN-NEXT: flt.h a3, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a3, 64(sp) +; RV64ZVFHMIN-NEXT: sb a0, 80(sp) +; RV64ZVFHMIN-NEXT: lh a0, 226(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 113(sp) +; RV64ZVFHMIN-NEXT: lh a0, 224(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 112(sp) +; RV64ZVFHMIN-NEXT: lh a0, 222(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 111(sp) +; RV64ZVFHMIN-NEXT: lh a0, 220(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 110(sp) +; RV64ZVFHMIN-NEXT: lh a0, 218(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 109(sp) +; RV64ZVFHMIN-NEXT: lh a0, 216(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 108(sp) +; RV64ZVFHMIN-NEXT: lh a0, 214(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 107(sp) +; RV64ZVFHMIN-NEXT: lh a0, 212(sp) +; RV64ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVFHMIN-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 106(sp) +; RV64ZVFHMIN-NEXT: lh a0, 210(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v11, v8, 
6 +; RV64ZVFHMIN-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVFHMIN-NEXT: vslidedown.vi v13, v8, 4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 105(sp) +; RV64ZVFHMIN-NEXT: lh a0, 208(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v14, v8, 3 +; RV64ZVFHMIN-NEXT: vslidedown.vi v15, v8, 2 +; RV64ZVFHMIN-NEXT: vslidedown.vi v16, v8, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 104(sp) +; RV64ZVFHMIN-NEXT: lh a0, 206(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVFHMIN-NEXT: vmv.x.s a4, v11 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v12 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 103(sp) +; RV64ZVFHMIN-NEXT: lh a0, 204(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v13 +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v14 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v15 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 102(sp) +; RV64ZVFHMIN-NEXT: lh a0, 202(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s t1, v16 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV64ZVFHMIN-NEXT: flt.h a3, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: sb a0, 101(sp) +; RV64ZVFHMIN-NEXT: lh a0, 200(sp) +; RV64ZVFHMIN-NEXT: flt.h a4, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: flt.h a5, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: sb a0, 100(sp) +; RV64ZVFHMIN-NEXT: lh a0, 198(sp) +; RV64ZVFHMIN-NEXT: flt.h a6, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: flt.h a7, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: sb a0, 99(sp) +; RV64ZVFHMIN-NEXT: lh a0, 196(sp) +; RV64ZVFHMIN-NEXT: flt.h t0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV64ZVFHMIN-NEXT: flt.h t1, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 98(sp) +; RV64ZVFHMIN-NEXT: lh a0, 194(sp) +; RV64ZVFHMIN-NEXT: sb t1, 65(sp) +; RV64ZVFHMIN-NEXT: sb t0, 66(sp) +; RV64ZVFHMIN-NEXT: sb a7, 67(sp) +; RV64ZVFHMIN-NEXT: sb a6, 68(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a5, 69(sp) +; RV64ZVFHMIN-NEXT: sb a4, 70(sp) +; RV64ZVFHMIN-NEXT: sb a3, 71(sp) +; RV64ZVFHMIN-NEXT: sb a0, 97(sp) +; RV64ZVFHMIN-NEXT: lh a0, 254(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 127(sp) +; RV64ZVFHMIN-NEXT: lh a0, 252(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 126(sp) +; RV64ZVFHMIN-NEXT: lh a0, 250(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 125(sp) +; RV64ZVFHMIN-NEXT: lh a0, 248(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 124(sp) +; RV64ZVFHMIN-NEXT: lh a0, 246(sp) +; RV64ZVFHMIN-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; RV64ZVFHMIN-NEXT: vslidedown.vi v10, v8, 15 +; RV64ZVFHMIN-NEXT: vslidedown.vi v12, v8, 14 +; RV64ZVFHMIN-NEXT: vslidedown.vi v14, v8, 13 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 123(sp) +; RV64ZVFHMIN-NEXT: lh a0, 244(sp) 
+; RV64ZVFHMIN-NEXT: vslidedown.vi v16, v8, 12 +; RV64ZVFHMIN-NEXT: vslidedown.vi v18, v8, 11 +; RV64ZVFHMIN-NEXT: vslidedown.vi v20, v8, 10 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 122(sp) +; RV64ZVFHMIN-NEXT: lh a0, 242(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v22, v8, 9 +; RV64ZVFHMIN-NEXT: vslidedown.vi v8, v8, 8 +; RV64ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 121(sp) +; RV64ZVFHMIN-NEXT: lh a0, 240(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a4, v12 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v14 +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v16 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 120(sp) +; RV64ZVFHMIN-NEXT: lh a0, 238(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v18 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v20 +; RV64ZVFHMIN-NEXT: vmv.x.s t1, v22 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 119(sp) +; RV64ZVFHMIN-NEXT: lh a0, 236(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s t2, v8 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV64ZVFHMIN-NEXT: flt.h a3, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: sb a0, 118(sp) +; RV64ZVFHMIN-NEXT: lh a0, 234(sp) +; RV64ZVFHMIN-NEXT: flt.h a4, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: flt.h a5, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: sb a0, 117(sp) +; RV64ZVFHMIN-NEXT: lh a0, 232(sp) +; RV64ZVFHMIN-NEXT: flt.h a6, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: flt.h a7, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: sb a0, 116(sp) +; RV64ZVFHMIN-NEXT: lh a0, 230(sp) +; RV64ZVFHMIN-NEXT: flt.h t0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV64ZVFHMIN-NEXT: flt.h t1, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t2 +; RV64ZVFHMIN-NEXT: sb a0, 115(sp) +; RV64ZVFHMIN-NEXT: lh a0, 228(sp) +; RV64ZVFHMIN-NEXT: sb a6, 76(sp) +; RV64ZVFHMIN-NEXT: sb a5, 77(sp) +; RV64ZVFHMIN-NEXT: sb a4, 78(sp) +; RV64ZVFHMIN-NEXT: sb a3, 79(sp) +; RV64ZVFHMIN-NEXT: flt.h a3, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a3, 72(sp) +; RV64ZVFHMIN-NEXT: sb t1, 73(sp) +; RV64ZVFHMIN-NEXT: sb t0, 74(sp) +; RV64ZVFHMIN-NEXT: sb a7, 75(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: sb a0, 114(sp) +; RV64ZVFHMIN-NEXT: addi a0, sp, 64 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; RV64ZVFHMIN-NEXT: vle8.v v8, (a0) +; RV64ZVFHMIN-NEXT: vand.vi v8, v8, 1 +; RV64ZVFHMIN-NEXT: vmsne.vi v12, v8, 0 +; RV64ZVFHMIN-NEXT: vsm.v v12, (a1) +; RV64ZVFHMIN-NEXT: addi sp, s0, -384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa sp, 384 +; RV64ZVFHMIN-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; RV64ZVFHMIN-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; RV64ZVFHMIN-NEXT: .cfi_restore ra +; RV64ZVFHMIN-NEXT: .cfi_restore s0 +; RV64ZVFHMIN-NEXT: addi sp, sp, 384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; RV64ZVFHMIN-NEXT: ret %a = load <64 x half>, ptr %x %b = insertelement <64 x half> poison, half %y, i32 0 %c = shufflevector <64 x half> %b, <64 x half> poison, <64 x i32> zeroinitializer @@ -1710,6 +4990,744 @@ define void 
@fcmp_ugt_fv_v64f16(ptr %x, half %y, ptr %z) { ; ZVFH-NEXT: vmnot.m v8, v16 ; ZVFH-NEXT: vsm.v v8, (a1) ; ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: fcmp_ugt_fv_v64f16: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: addi sp, sp, -384 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa_offset 384 +; RV32ZVFHMIN-NEXT: sw ra, 380(sp) # 4-byte Folded Spill +; RV32ZVFHMIN-NEXT: sw s0, 376(sp) # 4-byte Folded Spill +; RV32ZVFHMIN-NEXT: .cfi_offset ra, -4 +; RV32ZVFHMIN-NEXT: .cfi_offset s0, -8 +; RV32ZVFHMIN-NEXT: addi s0, sp, 384 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVFHMIN-NEXT: andi sp, sp, -128 +; RV32ZVFHMIN-NEXT: li a2, 64 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV32ZVFHMIN-NEXT: vle16.v v8, (a0) +; RV32ZVFHMIN-NEXT: addi a0, sp, 128 +; RV32ZVFHMIN-NEXT: vse16.v v8, (a0) +; RV32ZVFHMIN-NEXT: lh a0, 192(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 96(sp) +; RV32ZVFHMIN-NEXT: lh a0, 190(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 95(sp) +; RV32ZVFHMIN-NEXT: lh a0, 188(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 94(sp) +; RV32ZVFHMIN-NEXT: lh a0, 186(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 93(sp) +; RV32ZVFHMIN-NEXT: lh a0, 184(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 92(sp) +; RV32ZVFHMIN-NEXT: lh a0, 182(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 91(sp) +; RV32ZVFHMIN-NEXT: lh a0, 180(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 90(sp) +; RV32ZVFHMIN-NEXT: lh a0, 178(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 89(sp) +; RV32ZVFHMIN-NEXT: lh a0, 176(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 88(sp) +; RV32ZVFHMIN-NEXT: lh a0, 174(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 87(sp) +; RV32ZVFHMIN-NEXT: lh a0, 172(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 86(sp) +; RV32ZVFHMIN-NEXT: lh a0, 170(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 85(sp) +; RV32ZVFHMIN-NEXT: lh a0, 168(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 84(sp) +; RV32ZVFHMIN-NEXT: lh a0, 166(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 83(sp) +; RV32ZVFHMIN-NEXT: lh a0, 164(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 82(sp) +; 
RV32ZVFHMIN-NEXT: lh a0, 162(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a3, v8 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 81(sp) +; RV32ZVFHMIN-NEXT: lh a0, 160(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV32ZVFHMIN-NEXT: fle.h a3, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a3, a3, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a3, 64(sp) +; RV32ZVFHMIN-NEXT: sb a0, 80(sp) +; RV32ZVFHMIN-NEXT: lh a0, 226(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 113(sp) +; RV32ZVFHMIN-NEXT: lh a0, 224(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 112(sp) +; RV32ZVFHMIN-NEXT: lh a0, 222(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 111(sp) +; RV32ZVFHMIN-NEXT: lh a0, 220(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 110(sp) +; RV32ZVFHMIN-NEXT: lh a0, 218(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 109(sp) +; RV32ZVFHMIN-NEXT: lh a0, 216(sp) +; RV32ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32ZVFHMIN-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVFHMIN-NEXT: vslidedown.vi v11, v8, 6 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 108(sp) +; RV32ZVFHMIN-NEXT: lh a0, 214(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v12, v8, 5 +; RV32ZVFHMIN-NEXT: vslidedown.vi v13, v8, 4 +; RV32ZVFHMIN-NEXT: vslidedown.vi v14, v8, 3 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 107(sp) +; RV32ZVFHMIN-NEXT: lh a0, 212(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v15, v8, 2 +; RV32ZVFHMIN-NEXT: vslidedown.vi v16, v8, 1 +; RV32ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 106(sp) +; RV32ZVFHMIN-NEXT: lh a0, 210(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a4, v11 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v12 +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v13 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 105(sp) +; RV32ZVFHMIN-NEXT: lh a0, 208(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v14 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v15 +; RV32ZVFHMIN-NEXT: vmv.x.s t1, v16 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 104(sp) +; RV32ZVFHMIN-NEXT: lh a0, 206(sp) +; RV32ZVFHMIN-NEXT: fle.h a3, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: fle.h a4, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 103(sp) +; RV32ZVFHMIN-NEXT: lh a0, 204(sp) +; RV32ZVFHMIN-NEXT: fle.h a5, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: fle.h a6, 
fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 102(sp) +; RV32ZVFHMIN-NEXT: lh a0, 202(sp) +; RV32ZVFHMIN-NEXT: fle.h a7, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: fle.h t0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 101(sp) +; RV32ZVFHMIN-NEXT: lh a0, 200(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV32ZVFHMIN-NEXT: fle.h t1, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a3, a3, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 100(sp) +; RV32ZVFHMIN-NEXT: lh a0, 198(sp) +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: xori a5, a5, 1 +; RV32ZVFHMIN-NEXT: xori a6, a6, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 99(sp) +; RV32ZVFHMIN-NEXT: lh a0, 196(sp) +; RV32ZVFHMIN-NEXT: xori a7, a7, 1 +; RV32ZVFHMIN-NEXT: xori t0, t0, 1 +; RV32ZVFHMIN-NEXT: xori t1, t1, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 98(sp) +; RV32ZVFHMIN-NEXT: lh a0, 194(sp) +; RV32ZVFHMIN-NEXT: sb t1, 65(sp) +; RV32ZVFHMIN-NEXT: sb t0, 66(sp) +; RV32ZVFHMIN-NEXT: sb a7, 67(sp) +; RV32ZVFHMIN-NEXT: sb a6, 68(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a5, 69(sp) +; RV32ZVFHMIN-NEXT: sb a4, 70(sp) +; RV32ZVFHMIN-NEXT: sb a3, 71(sp) +; RV32ZVFHMIN-NEXT: sb a0, 97(sp) +; RV32ZVFHMIN-NEXT: lh a0, 254(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 127(sp) +; RV32ZVFHMIN-NEXT: lh a0, 252(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 126(sp) +; RV32ZVFHMIN-NEXT: lh a0, 250(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 125(sp) +; RV32ZVFHMIN-NEXT: lh a0, 248(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 124(sp) +; RV32ZVFHMIN-NEXT: lh a0, 246(sp) +; RV32ZVFHMIN-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; RV32ZVFHMIN-NEXT: vslidedown.vi v10, v8, 15 +; RV32ZVFHMIN-NEXT: vslidedown.vi v12, v8, 14 +; RV32ZVFHMIN-NEXT: vslidedown.vi v14, v8, 13 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 123(sp) +; RV32ZVFHMIN-NEXT: lh a0, 244(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v16, v8, 12 +; RV32ZVFHMIN-NEXT: vslidedown.vi v18, v8, 11 +; RV32ZVFHMIN-NEXT: vslidedown.vi v20, v8, 10 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 122(sp) +; RV32ZVFHMIN-NEXT: lh a0, 242(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v22, v8, 9 +; RV32ZVFHMIN-NEXT: vslidedown.vi v8, v8, 8 +; RV32ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: 
sb a0, 121(sp) +; RV32ZVFHMIN-NEXT: lh a0, 240(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a4, v12 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v14 +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v16 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 120(sp) +; RV32ZVFHMIN-NEXT: lh a0, 238(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v18 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v20 +; RV32ZVFHMIN-NEXT: vmv.x.s t1, v22 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 119(sp) +; RV32ZVFHMIN-NEXT: lh a0, 236(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s t2, v8 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV32ZVFHMIN-NEXT: fle.h a3, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 118(sp) +; RV32ZVFHMIN-NEXT: lh a0, 234(sp) +; RV32ZVFHMIN-NEXT: fle.h a4, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: fle.h a5, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 117(sp) +; RV32ZVFHMIN-NEXT: lh a0, 232(sp) +; RV32ZVFHMIN-NEXT: fle.h a6, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: fle.h a7, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 116(sp) +; RV32ZVFHMIN-NEXT: lh a0, 230(sp) +; RV32ZVFHMIN-NEXT: fle.h t0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV32ZVFHMIN-NEXT: fle.h t1, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t2 +; RV32ZVFHMIN-NEXT: xori a3, a3, 1 +; RV32ZVFHMIN-NEXT: xori a4, a4, 1 +; RV32ZVFHMIN-NEXT: xori a5, a5, 1 +; RV32ZVFHMIN-NEXT: xori a6, a6, 1 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 115(sp) +; RV32ZVFHMIN-NEXT: lh a0, 228(sp) +; RV32ZVFHMIN-NEXT: sb a6, 76(sp) +; RV32ZVFHMIN-NEXT: sb a5, 77(sp) +; RV32ZVFHMIN-NEXT: sb a4, 78(sp) +; RV32ZVFHMIN-NEXT: sb a3, 79(sp) +; RV32ZVFHMIN-NEXT: fle.h a3, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a4, a7, 1 +; RV32ZVFHMIN-NEXT: xori a5, t0, 1 +; RV32ZVFHMIN-NEXT: xori a6, t1, 1 +; RV32ZVFHMIN-NEXT: xori a3, a3, 1 +; RV32ZVFHMIN-NEXT: sb a3, 72(sp) +; RV32ZVFHMIN-NEXT: sb a6, 73(sp) +; RV32ZVFHMIN-NEXT: sb a5, 74(sp) +; RV32ZVFHMIN-NEXT: sb a4, 75(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV32ZVFHMIN-NEXT: xori a0, a0, 1 +; RV32ZVFHMIN-NEXT: sb a0, 114(sp) +; RV32ZVFHMIN-NEXT: addi a0, sp, 64 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; RV32ZVFHMIN-NEXT: vle8.v v8, (a0) +; RV32ZVFHMIN-NEXT: vand.vi v8, v8, 1 +; RV32ZVFHMIN-NEXT: vmsne.vi v12, v8, 0 +; RV32ZVFHMIN-NEXT: vsm.v v12, (a1) +; RV32ZVFHMIN-NEXT: addi sp, s0, -384 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa sp, 384 +; RV32ZVFHMIN-NEXT: lw ra, 380(sp) # 4-byte Folded Reload +; RV32ZVFHMIN-NEXT: lw s0, 376(sp) # 4-byte Folded Reload +; RV32ZVFHMIN-NEXT: .cfi_restore ra +; RV32ZVFHMIN-NEXT: .cfi_restore s0 +; RV32ZVFHMIN-NEXT: addi sp, sp, 384 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: fcmp_ugt_fv_v64f16: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: addi sp, sp, -384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa_offset 384 +; RV64ZVFHMIN-NEXT: sd 
ra, 376(sp) # 8-byte Folded Spill +; RV64ZVFHMIN-NEXT: sd s0, 368(sp) # 8-byte Folded Spill +; RV64ZVFHMIN-NEXT: .cfi_offset ra, -8 +; RV64ZVFHMIN-NEXT: .cfi_offset s0, -16 +; RV64ZVFHMIN-NEXT: addi s0, sp, 384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa s0, 0 +; RV64ZVFHMIN-NEXT: andi sp, sp, -128 +; RV64ZVFHMIN-NEXT: li a2, 64 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV64ZVFHMIN-NEXT: vle16.v v8, (a0) +; RV64ZVFHMIN-NEXT: addi a0, sp, 128 +; RV64ZVFHMIN-NEXT: vse16.v v8, (a0) +; RV64ZVFHMIN-NEXT: lh a0, 192(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 96(sp) +; RV64ZVFHMIN-NEXT: lh a0, 190(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 95(sp) +; RV64ZVFHMIN-NEXT: lh a0, 188(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 94(sp) +; RV64ZVFHMIN-NEXT: lh a0, 186(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 93(sp) +; RV64ZVFHMIN-NEXT: lh a0, 184(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 92(sp) +; RV64ZVFHMIN-NEXT: lh a0, 182(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 91(sp) +; RV64ZVFHMIN-NEXT: lh a0, 180(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 90(sp) +; RV64ZVFHMIN-NEXT: lh a0, 178(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 89(sp) +; RV64ZVFHMIN-NEXT: lh a0, 176(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 88(sp) +; RV64ZVFHMIN-NEXT: lh a0, 174(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 87(sp) +; RV64ZVFHMIN-NEXT: lh a0, 172(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 86(sp) +; RV64ZVFHMIN-NEXT: lh a0, 170(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 85(sp) +; RV64ZVFHMIN-NEXT: lh a0, 168(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 84(sp) +; RV64ZVFHMIN-NEXT: lh a0, 166(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 83(sp) +; RV64ZVFHMIN-NEXT: lh a0, 164(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 82(sp) +; RV64ZVFHMIN-NEXT: lh a0, 162(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a3, v8 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 81(sp) +; RV64ZVFHMIN-NEXT: lh a0, 160(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; 
RV64ZVFHMIN-NEXT: fle.h a3, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a3, a3, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a3, 64(sp) +; RV64ZVFHMIN-NEXT: sb a0, 80(sp) +; RV64ZVFHMIN-NEXT: lh a0, 226(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 113(sp) +; RV64ZVFHMIN-NEXT: lh a0, 224(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 112(sp) +; RV64ZVFHMIN-NEXT: lh a0, 222(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 111(sp) +; RV64ZVFHMIN-NEXT: lh a0, 220(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 110(sp) +; RV64ZVFHMIN-NEXT: lh a0, 218(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 109(sp) +; RV64ZVFHMIN-NEXT: lh a0, 216(sp) +; RV64ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVFHMIN-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVFHMIN-NEXT: vslidedown.vi v11, v8, 6 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 108(sp) +; RV64ZVFHMIN-NEXT: lh a0, 214(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVFHMIN-NEXT: vslidedown.vi v13, v8, 4 +; RV64ZVFHMIN-NEXT: vslidedown.vi v14, v8, 3 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 107(sp) +; RV64ZVFHMIN-NEXT: lh a0, 212(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v15, v8, 2 +; RV64ZVFHMIN-NEXT: vslidedown.vi v16, v8, 1 +; RV64ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 106(sp) +; RV64ZVFHMIN-NEXT: lh a0, 210(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a4, v11 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v12 +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v13 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 105(sp) +; RV64ZVFHMIN-NEXT: lh a0, 208(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v14 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v15 +; RV64ZVFHMIN-NEXT: vmv.x.s t1, v16 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 104(sp) +; RV64ZVFHMIN-NEXT: lh a0, 206(sp) +; RV64ZVFHMIN-NEXT: fle.h a3, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: fle.h a4, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 103(sp) +; RV64ZVFHMIN-NEXT: lh a0, 204(sp) +; RV64ZVFHMIN-NEXT: fle.h a5, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: fle.h a6, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 102(sp) +; RV64ZVFHMIN-NEXT: lh a0, 202(sp) +; RV64ZVFHMIN-NEXT: fle.h a7, fa0, fa5 +; RV64ZVFHMIN-NEXT: 
fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: fle.h t0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 101(sp) +; RV64ZVFHMIN-NEXT: lh a0, 200(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV64ZVFHMIN-NEXT: fle.h t1, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a3, a3, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 100(sp) +; RV64ZVFHMIN-NEXT: lh a0, 198(sp) +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: xori a5, a5, 1 +; RV64ZVFHMIN-NEXT: xori a6, a6, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 99(sp) +; RV64ZVFHMIN-NEXT: lh a0, 196(sp) +; RV64ZVFHMIN-NEXT: xori a7, a7, 1 +; RV64ZVFHMIN-NEXT: xori t0, t0, 1 +; RV64ZVFHMIN-NEXT: xori t1, t1, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 98(sp) +; RV64ZVFHMIN-NEXT: lh a0, 194(sp) +; RV64ZVFHMIN-NEXT: sb t1, 65(sp) +; RV64ZVFHMIN-NEXT: sb t0, 66(sp) +; RV64ZVFHMIN-NEXT: sb a7, 67(sp) +; RV64ZVFHMIN-NEXT: sb a6, 68(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a5, 69(sp) +; RV64ZVFHMIN-NEXT: sb a4, 70(sp) +; RV64ZVFHMIN-NEXT: sb a3, 71(sp) +; RV64ZVFHMIN-NEXT: sb a0, 97(sp) +; RV64ZVFHMIN-NEXT: lh a0, 254(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 127(sp) +; RV64ZVFHMIN-NEXT: lh a0, 252(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 126(sp) +; RV64ZVFHMIN-NEXT: lh a0, 250(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 125(sp) +; RV64ZVFHMIN-NEXT: lh a0, 248(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 124(sp) +; RV64ZVFHMIN-NEXT: lh a0, 246(sp) +; RV64ZVFHMIN-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; RV64ZVFHMIN-NEXT: vslidedown.vi v10, v8, 15 +; RV64ZVFHMIN-NEXT: vslidedown.vi v12, v8, 14 +; RV64ZVFHMIN-NEXT: vslidedown.vi v14, v8, 13 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 123(sp) +; RV64ZVFHMIN-NEXT: lh a0, 244(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v16, v8, 12 +; RV64ZVFHMIN-NEXT: vslidedown.vi v18, v8, 11 +; RV64ZVFHMIN-NEXT: vslidedown.vi v20, v8, 10 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 122(sp) +; RV64ZVFHMIN-NEXT: lh a0, 242(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v22, v8, 9 +; RV64ZVFHMIN-NEXT: vslidedown.vi v8, v8, 8 +; RV64ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 121(sp) +; RV64ZVFHMIN-NEXT: lh a0, 240(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a4, v12 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v14 +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v16 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; 
RV64ZVFHMIN-NEXT: sb a0, 120(sp) +; RV64ZVFHMIN-NEXT: lh a0, 238(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v18 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v20 +; RV64ZVFHMIN-NEXT: vmv.x.s t1, v22 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 119(sp) +; RV64ZVFHMIN-NEXT: lh a0, 236(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s t2, v8 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV64ZVFHMIN-NEXT: fle.h a3, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 118(sp) +; RV64ZVFHMIN-NEXT: lh a0, 234(sp) +; RV64ZVFHMIN-NEXT: fle.h a4, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: fle.h a5, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 117(sp) +; RV64ZVFHMIN-NEXT: lh a0, 232(sp) +; RV64ZVFHMIN-NEXT: fle.h a6, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: fle.h a7, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 116(sp) +; RV64ZVFHMIN-NEXT: lh a0, 230(sp) +; RV64ZVFHMIN-NEXT: fle.h t0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV64ZVFHMIN-NEXT: fle.h t1, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t2 +; RV64ZVFHMIN-NEXT: xori a3, a3, 1 +; RV64ZVFHMIN-NEXT: xori a4, a4, 1 +; RV64ZVFHMIN-NEXT: xori a5, a5, 1 +; RV64ZVFHMIN-NEXT: xori a6, a6, 1 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 115(sp) +; RV64ZVFHMIN-NEXT: lh a0, 228(sp) +; RV64ZVFHMIN-NEXT: sb a6, 76(sp) +; RV64ZVFHMIN-NEXT: sb a5, 77(sp) +; RV64ZVFHMIN-NEXT: sb a4, 78(sp) +; RV64ZVFHMIN-NEXT: sb a3, 79(sp) +; RV64ZVFHMIN-NEXT: fle.h a3, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a4, a7, 1 +; RV64ZVFHMIN-NEXT: xori a5, t0, 1 +; RV64ZVFHMIN-NEXT: xori a6, t1, 1 +; RV64ZVFHMIN-NEXT: xori a3, a3, 1 +; RV64ZVFHMIN-NEXT: sb a3, 72(sp) +; RV64ZVFHMIN-NEXT: sb a6, 73(sp) +; RV64ZVFHMIN-NEXT: sb a5, 74(sp) +; RV64ZVFHMIN-NEXT: sb a4, 75(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: fle.h a0, fa0, fa5 +; RV64ZVFHMIN-NEXT: xori a0, a0, 1 +; RV64ZVFHMIN-NEXT: sb a0, 114(sp) +; RV64ZVFHMIN-NEXT: addi a0, sp, 64 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; RV64ZVFHMIN-NEXT: vle8.v v8, (a0) +; RV64ZVFHMIN-NEXT: vand.vi v8, v8, 1 +; RV64ZVFHMIN-NEXT: vmsne.vi v12, v8, 0 +; RV64ZVFHMIN-NEXT: vsm.v v12, (a1) +; RV64ZVFHMIN-NEXT: addi sp, s0, -384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa sp, 384 +; RV64ZVFHMIN-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; RV64ZVFHMIN-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; RV64ZVFHMIN-NEXT: .cfi_restore ra +; RV64ZVFHMIN-NEXT: .cfi_restore s0 +; RV64ZVFHMIN-NEXT: addi sp, sp, 384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; RV64ZVFHMIN-NEXT: ret %a = load <64 x half>, ptr %x %b = insertelement <64 x half> poison, half %y, i32 0 %c = shufflevector <64 x half> %b, <64 x half> poison, <64 x i32> zeroinitializer @@ -1727,6 +5745,616 @@ define void @fcmp_ugt_fv_v64f16_nonans(ptr %x, half %y, ptr %z) { ; ZVFH-NEXT: vmflt.vf v16, v8, fa0 ; ZVFH-NEXT: vsm.v v16, (a1) ; ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: fcmp_ugt_fv_v64f16_nonans: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: addi sp, sp, -384 +; 
RV32ZVFHMIN-NEXT: .cfi_def_cfa_offset 384 +; RV32ZVFHMIN-NEXT: sw ra, 380(sp) # 4-byte Folded Spill +; RV32ZVFHMIN-NEXT: sw s0, 376(sp) # 4-byte Folded Spill +; RV32ZVFHMIN-NEXT: .cfi_offset ra, -4 +; RV32ZVFHMIN-NEXT: .cfi_offset s0, -8 +; RV32ZVFHMIN-NEXT: addi s0, sp, 384 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVFHMIN-NEXT: andi sp, sp, -128 +; RV32ZVFHMIN-NEXT: li a2, 64 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV32ZVFHMIN-NEXT: vle16.v v8, (a0) +; RV32ZVFHMIN-NEXT: addi a0, sp, 128 +; RV32ZVFHMIN-NEXT: vse16.v v8, (a0) +; RV32ZVFHMIN-NEXT: lh a0, 192(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 96(sp) +; RV32ZVFHMIN-NEXT: lh a0, 190(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 95(sp) +; RV32ZVFHMIN-NEXT: lh a0, 188(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 94(sp) +; RV32ZVFHMIN-NEXT: lh a0, 186(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 93(sp) +; RV32ZVFHMIN-NEXT: lh a0, 184(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 92(sp) +; RV32ZVFHMIN-NEXT: lh a0, 182(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 91(sp) +; RV32ZVFHMIN-NEXT: lh a0, 180(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 90(sp) +; RV32ZVFHMIN-NEXT: lh a0, 178(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 89(sp) +; RV32ZVFHMIN-NEXT: lh a0, 176(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 88(sp) +; RV32ZVFHMIN-NEXT: lh a0, 174(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 87(sp) +; RV32ZVFHMIN-NEXT: lh a0, 172(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 86(sp) +; RV32ZVFHMIN-NEXT: lh a0, 170(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 85(sp) +; RV32ZVFHMIN-NEXT: lh a0, 168(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 84(sp) +; RV32ZVFHMIN-NEXT: lh a0, 166(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 83(sp) +; RV32ZVFHMIN-NEXT: lh a0, 164(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 82(sp) +; RV32ZVFHMIN-NEXT: lh a0, 162(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 81(sp) +; RV32ZVFHMIN-NEXT: lh a0, 160(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a3, v8 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV32ZVFHMIN-NEXT: flt.h a3, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a3, 64(sp) +; RV32ZVFHMIN-NEXT: sb a0, 80(sp) +; RV32ZVFHMIN-NEXT: lh a0, 226(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 113(sp) +; RV32ZVFHMIN-NEXT: lh a0, 224(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 112(sp) +; RV32ZVFHMIN-NEXT: lh a0, 222(sp) 
+; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 111(sp) +; RV32ZVFHMIN-NEXT: lh a0, 220(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 110(sp) +; RV32ZVFHMIN-NEXT: lh a0, 218(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 109(sp) +; RV32ZVFHMIN-NEXT: lh a0, 216(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 108(sp) +; RV32ZVFHMIN-NEXT: lh a0, 214(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 107(sp) +; RV32ZVFHMIN-NEXT: lh a0, 212(sp) +; RV32ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32ZVFHMIN-NEXT: vslidedown.vi v10, v8, 7 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 106(sp) +; RV32ZVFHMIN-NEXT: lh a0, 210(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v11, v8, 6 +; RV32ZVFHMIN-NEXT: vslidedown.vi v12, v8, 5 +; RV32ZVFHMIN-NEXT: vslidedown.vi v13, v8, 4 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 105(sp) +; RV32ZVFHMIN-NEXT: lh a0, 208(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v14, v8, 3 +; RV32ZVFHMIN-NEXT: vslidedown.vi v15, v8, 2 +; RV32ZVFHMIN-NEXT: vslidedown.vi v16, v8, 1 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 104(sp) +; RV32ZVFHMIN-NEXT: lh a0, 206(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV32ZVFHMIN-NEXT: vmv.x.s a4, v11 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v12 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 103(sp) +; RV32ZVFHMIN-NEXT: lh a0, 204(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v13 +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v14 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v15 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 102(sp) +; RV32ZVFHMIN-NEXT: lh a0, 202(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s t1, v16 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV32ZVFHMIN-NEXT: flt.h a3, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: sb a0, 101(sp) +; RV32ZVFHMIN-NEXT: lh a0, 200(sp) +; RV32ZVFHMIN-NEXT: flt.h a4, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: flt.h a5, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: sb a0, 100(sp) +; RV32ZVFHMIN-NEXT: lh a0, 198(sp) +; RV32ZVFHMIN-NEXT: flt.h a6, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: flt.h a7, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: sb a0, 99(sp) +; RV32ZVFHMIN-NEXT: lh a0, 196(sp) +; RV32ZVFHMIN-NEXT: flt.h t0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV32ZVFHMIN-NEXT: flt.h t1, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 98(sp) +; RV32ZVFHMIN-NEXT: lh a0, 194(sp) +; RV32ZVFHMIN-NEXT: sb t1, 65(sp) +; RV32ZVFHMIN-NEXT: sb t0, 66(sp) +; RV32ZVFHMIN-NEXT: sb a7, 67(sp) +; RV32ZVFHMIN-NEXT: sb a6, 68(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a5, 69(sp) +; RV32ZVFHMIN-NEXT: sb a4, 70(sp) +; 
RV32ZVFHMIN-NEXT: sb a3, 71(sp) +; RV32ZVFHMIN-NEXT: sb a0, 97(sp) +; RV32ZVFHMIN-NEXT: lh a0, 254(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 127(sp) +; RV32ZVFHMIN-NEXT: lh a0, 252(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 126(sp) +; RV32ZVFHMIN-NEXT: lh a0, 250(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 125(sp) +; RV32ZVFHMIN-NEXT: lh a0, 248(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 124(sp) +; RV32ZVFHMIN-NEXT: lh a0, 246(sp) +; RV32ZVFHMIN-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; RV32ZVFHMIN-NEXT: vslidedown.vi v10, v8, 15 +; RV32ZVFHMIN-NEXT: vslidedown.vi v12, v8, 14 +; RV32ZVFHMIN-NEXT: vslidedown.vi v14, v8, 13 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 123(sp) +; RV32ZVFHMIN-NEXT: lh a0, 244(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v16, v8, 12 +; RV32ZVFHMIN-NEXT: vslidedown.vi v18, v8, 11 +; RV32ZVFHMIN-NEXT: vslidedown.vi v20, v8, 10 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 122(sp) +; RV32ZVFHMIN-NEXT: lh a0, 242(sp) +; RV32ZVFHMIN-NEXT: vslidedown.vi v22, v8, 9 +; RV32ZVFHMIN-NEXT: vslidedown.vi v8, v8, 8 +; RV32ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 121(sp) +; RV32ZVFHMIN-NEXT: lh a0, 240(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a4, v12 +; RV32ZVFHMIN-NEXT: vmv.x.s a5, v14 +; RV32ZVFHMIN-NEXT: vmv.x.s a6, v16 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 120(sp) +; RV32ZVFHMIN-NEXT: lh a0, 238(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s a7, v18 +; RV32ZVFHMIN-NEXT: vmv.x.s t0, v20 +; RV32ZVFHMIN-NEXT: vmv.x.s t1, v22 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 119(sp) +; RV32ZVFHMIN-NEXT: lh a0, 236(sp) +; RV32ZVFHMIN-NEXT: vmv.x.s t2, v8 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV32ZVFHMIN-NEXT: flt.h a3, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV32ZVFHMIN-NEXT: sb a0, 118(sp) +; RV32ZVFHMIN-NEXT: lh a0, 234(sp) +; RV32ZVFHMIN-NEXT: flt.h a4, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV32ZVFHMIN-NEXT: flt.h a5, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV32ZVFHMIN-NEXT: sb a0, 117(sp) +; RV32ZVFHMIN-NEXT: lh a0, 232(sp) +; RV32ZVFHMIN-NEXT: flt.h a6, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV32ZVFHMIN-NEXT: flt.h a7, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV32ZVFHMIN-NEXT: sb a0, 116(sp) +; RV32ZVFHMIN-NEXT: lh a0, 230(sp) +; RV32ZVFHMIN-NEXT: flt.h t0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV32ZVFHMIN-NEXT: flt.h t1, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, t2 +; RV32ZVFHMIN-NEXT: sb a0, 115(sp) +; RV32ZVFHMIN-NEXT: lh a0, 228(sp) +; RV32ZVFHMIN-NEXT: sb a6, 76(sp) +; RV32ZVFHMIN-NEXT: sb a5, 77(sp) +; RV32ZVFHMIN-NEXT: sb a4, 78(sp) +; RV32ZVFHMIN-NEXT: sb a3, 79(sp) +; RV32ZVFHMIN-NEXT: flt.h a3, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a3, 72(sp) 
+; RV32ZVFHMIN-NEXT: sb t1, 73(sp) +; RV32ZVFHMIN-NEXT: sb t0, 74(sp) +; RV32ZVFHMIN-NEXT: sb a7, 75(sp) +; RV32ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV32ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV32ZVFHMIN-NEXT: sb a0, 114(sp) +; RV32ZVFHMIN-NEXT: addi a0, sp, 64 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; RV32ZVFHMIN-NEXT: vle8.v v8, (a0) +; RV32ZVFHMIN-NEXT: vand.vi v8, v8, 1 +; RV32ZVFHMIN-NEXT: vmsne.vi v12, v8, 0 +; RV32ZVFHMIN-NEXT: vsm.v v12, (a1) +; RV32ZVFHMIN-NEXT: addi sp, s0, -384 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa sp, 384 +; RV32ZVFHMIN-NEXT: lw ra, 380(sp) # 4-byte Folded Reload +; RV32ZVFHMIN-NEXT: lw s0, 376(sp) # 4-byte Folded Reload +; RV32ZVFHMIN-NEXT: .cfi_restore ra +; RV32ZVFHMIN-NEXT: .cfi_restore s0 +; RV32ZVFHMIN-NEXT: addi sp, sp, 384 +; RV32ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: fcmp_ugt_fv_v64f16_nonans: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: addi sp, sp, -384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa_offset 384 +; RV64ZVFHMIN-NEXT: sd ra, 376(sp) # 8-byte Folded Spill +; RV64ZVFHMIN-NEXT: sd s0, 368(sp) # 8-byte Folded Spill +; RV64ZVFHMIN-NEXT: .cfi_offset ra, -8 +; RV64ZVFHMIN-NEXT: .cfi_offset s0, -16 +; RV64ZVFHMIN-NEXT: addi s0, sp, 384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa s0, 0 +; RV64ZVFHMIN-NEXT: andi sp, sp, -128 +; RV64ZVFHMIN-NEXT: li a2, 64 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e16, m8, ta, ma +; RV64ZVFHMIN-NEXT: vle16.v v8, (a0) +; RV64ZVFHMIN-NEXT: addi a0, sp, 128 +; RV64ZVFHMIN-NEXT: vse16.v v8, (a0) +; RV64ZVFHMIN-NEXT: lh a0, 192(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 96(sp) +; RV64ZVFHMIN-NEXT: lh a0, 190(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 95(sp) +; RV64ZVFHMIN-NEXT: lh a0, 188(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 94(sp) +; RV64ZVFHMIN-NEXT: lh a0, 186(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 93(sp) +; RV64ZVFHMIN-NEXT: lh a0, 184(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 92(sp) +; RV64ZVFHMIN-NEXT: lh a0, 182(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 91(sp) +; RV64ZVFHMIN-NEXT: lh a0, 180(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 90(sp) +; RV64ZVFHMIN-NEXT: lh a0, 178(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 89(sp) +; RV64ZVFHMIN-NEXT: lh a0, 176(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 88(sp) +; RV64ZVFHMIN-NEXT: lh a0, 174(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 87(sp) +; RV64ZVFHMIN-NEXT: lh a0, 172(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 86(sp) +; RV64ZVFHMIN-NEXT: lh a0, 170(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 85(sp) +; RV64ZVFHMIN-NEXT: lh a0, 168(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 84(sp) +; RV64ZVFHMIN-NEXT: lh a0, 166(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 
+; RV64ZVFHMIN-NEXT: sb a0, 83(sp) +; RV64ZVFHMIN-NEXT: lh a0, 164(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 82(sp) +; RV64ZVFHMIN-NEXT: lh a0, 162(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 81(sp) +; RV64ZVFHMIN-NEXT: lh a0, 160(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a3, v8 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV64ZVFHMIN-NEXT: flt.h a3, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a3, 64(sp) +; RV64ZVFHMIN-NEXT: sb a0, 80(sp) +; RV64ZVFHMIN-NEXT: lh a0, 226(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 113(sp) +; RV64ZVFHMIN-NEXT: lh a0, 224(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 112(sp) +; RV64ZVFHMIN-NEXT: lh a0, 222(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 111(sp) +; RV64ZVFHMIN-NEXT: lh a0, 220(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 110(sp) +; RV64ZVFHMIN-NEXT: lh a0, 218(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 109(sp) +; RV64ZVFHMIN-NEXT: lh a0, 216(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 108(sp) +; RV64ZVFHMIN-NEXT: lh a0, 214(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 107(sp) +; RV64ZVFHMIN-NEXT: lh a0, 212(sp) +; RV64ZVFHMIN-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVFHMIN-NEXT: vslidedown.vi v10, v8, 7 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 106(sp) +; RV64ZVFHMIN-NEXT: lh a0, 210(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v11, v8, 6 +; RV64ZVFHMIN-NEXT: vslidedown.vi v12, v8, 5 +; RV64ZVFHMIN-NEXT: vslidedown.vi v13, v8, 4 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 105(sp) +; RV64ZVFHMIN-NEXT: lh a0, 208(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v14, v8, 3 +; RV64ZVFHMIN-NEXT: vslidedown.vi v15, v8, 2 +; RV64ZVFHMIN-NEXT: vslidedown.vi v16, v8, 1 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 104(sp) +; RV64ZVFHMIN-NEXT: lh a0, 206(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVFHMIN-NEXT: vmv.x.s a4, v11 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v12 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 103(sp) +; RV64ZVFHMIN-NEXT: lh a0, 204(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v13 +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v14 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v15 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 102(sp) +; RV64ZVFHMIN-NEXT: lh a0, 202(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s t1, v16 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV64ZVFHMIN-NEXT: flt.h a3, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: sb a0, 101(sp) +; RV64ZVFHMIN-NEXT: lh a0, 200(sp) +; RV64ZVFHMIN-NEXT: flt.h a4, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: flt.h a5, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; 
RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: sb a0, 100(sp) +; RV64ZVFHMIN-NEXT: lh a0, 198(sp) +; RV64ZVFHMIN-NEXT: flt.h a6, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: flt.h a7, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: sb a0, 99(sp) +; RV64ZVFHMIN-NEXT: lh a0, 196(sp) +; RV64ZVFHMIN-NEXT: flt.h t0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV64ZVFHMIN-NEXT: flt.h t1, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 98(sp) +; RV64ZVFHMIN-NEXT: lh a0, 194(sp) +; RV64ZVFHMIN-NEXT: sb t1, 65(sp) +; RV64ZVFHMIN-NEXT: sb t0, 66(sp) +; RV64ZVFHMIN-NEXT: sb a7, 67(sp) +; RV64ZVFHMIN-NEXT: sb a6, 68(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a5, 69(sp) +; RV64ZVFHMIN-NEXT: sb a4, 70(sp) +; RV64ZVFHMIN-NEXT: sb a3, 71(sp) +; RV64ZVFHMIN-NEXT: sb a0, 97(sp) +; RV64ZVFHMIN-NEXT: lh a0, 254(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 127(sp) +; RV64ZVFHMIN-NEXT: lh a0, 252(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 126(sp) +; RV64ZVFHMIN-NEXT: lh a0, 250(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 125(sp) +; RV64ZVFHMIN-NEXT: lh a0, 248(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 124(sp) +; RV64ZVFHMIN-NEXT: lh a0, 246(sp) +; RV64ZVFHMIN-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; RV64ZVFHMIN-NEXT: vslidedown.vi v10, v8, 15 +; RV64ZVFHMIN-NEXT: vslidedown.vi v12, v8, 14 +; RV64ZVFHMIN-NEXT: vslidedown.vi v14, v8, 13 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 123(sp) +; RV64ZVFHMIN-NEXT: lh a0, 244(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v16, v8, 12 +; RV64ZVFHMIN-NEXT: vslidedown.vi v18, v8, 11 +; RV64ZVFHMIN-NEXT: vslidedown.vi v20, v8, 10 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 122(sp) +; RV64ZVFHMIN-NEXT: lh a0, 242(sp) +; RV64ZVFHMIN-NEXT: vslidedown.vi v22, v8, 9 +; RV64ZVFHMIN-NEXT: vslidedown.vi v8, v8, 8 +; RV64ZVFHMIN-NEXT: vmv.x.s a3, v10 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 121(sp) +; RV64ZVFHMIN-NEXT: lh a0, 240(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a4, v12 +; RV64ZVFHMIN-NEXT: vmv.x.s a5, v14 +; RV64ZVFHMIN-NEXT: vmv.x.s a6, v16 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 120(sp) +; RV64ZVFHMIN-NEXT: lh a0, 238(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s a7, v18 +; RV64ZVFHMIN-NEXT: vmv.x.s t0, v20 +; RV64ZVFHMIN-NEXT: vmv.x.s t1, v22 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 119(sp) +; RV64ZVFHMIN-NEXT: lh a0, 236(sp) +; RV64ZVFHMIN-NEXT: vmv.x.s t2, v8 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a3 +; RV64ZVFHMIN-NEXT: flt.h a3, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a4 +; RV64ZVFHMIN-NEXT: sb a0, 118(sp) +; RV64ZVFHMIN-NEXT: lh a0, 234(sp) +; RV64ZVFHMIN-NEXT: flt.h a4, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a5 +; RV64ZVFHMIN-NEXT: flt.h a5, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; 
RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a6 +; RV64ZVFHMIN-NEXT: sb a0, 117(sp) +; RV64ZVFHMIN-NEXT: lh a0, 232(sp) +; RV64ZVFHMIN-NEXT: flt.h a6, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a7 +; RV64ZVFHMIN-NEXT: flt.h a7, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t0 +; RV64ZVFHMIN-NEXT: sb a0, 116(sp) +; RV64ZVFHMIN-NEXT: lh a0, 230(sp) +; RV64ZVFHMIN-NEXT: flt.h t0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t1 +; RV64ZVFHMIN-NEXT: flt.h t1, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, t2 +; RV64ZVFHMIN-NEXT: sb a0, 115(sp) +; RV64ZVFHMIN-NEXT: lh a0, 228(sp) +; RV64ZVFHMIN-NEXT: sb a6, 76(sp) +; RV64ZVFHMIN-NEXT: sb a5, 77(sp) +; RV64ZVFHMIN-NEXT: sb a4, 78(sp) +; RV64ZVFHMIN-NEXT: sb a3, 79(sp) +; RV64ZVFHMIN-NEXT: flt.h a3, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a3, 72(sp) +; RV64ZVFHMIN-NEXT: sb t1, 73(sp) +; RV64ZVFHMIN-NEXT: sb t0, 74(sp) +; RV64ZVFHMIN-NEXT: sb a7, 75(sp) +; RV64ZVFHMIN-NEXT: fmv.h.x fa5, a0 +; RV64ZVFHMIN-NEXT: flt.h a0, fa5, fa0 +; RV64ZVFHMIN-NEXT: sb a0, 114(sp) +; RV64ZVFHMIN-NEXT: addi a0, sp, 64 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; RV64ZVFHMIN-NEXT: vle8.v v8, (a0) +; RV64ZVFHMIN-NEXT: vand.vi v8, v8, 1 +; RV64ZVFHMIN-NEXT: vmsne.vi v12, v8, 0 +; RV64ZVFHMIN-NEXT: vsm.v v12, (a1) +; RV64ZVFHMIN-NEXT: addi sp, s0, -384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa sp, 384 +; RV64ZVFHMIN-NEXT: ld ra, 376(sp) # 8-byte Folded Reload +; RV64ZVFHMIN-NEXT: ld s0, 368(sp) # 8-byte Folded Reload +; RV64ZVFHMIN-NEXT: .cfi_restore ra +; RV64ZVFHMIN-NEXT: .cfi_restore s0 +; RV64ZVFHMIN-NEXT: addi sp, sp, 384 +; RV64ZVFHMIN-NEXT: .cfi_def_cfa_offset 0 +; RV64ZVFHMIN-NEXT: ret %a = load <64 x half>, ptr %x %b = insertelement <64 x half> poison, half %y, i32 0 %c = shufflevector <64 x half> %b, <64 x half> poison, <64 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index 38df622998bf9..dd415116c2327 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32ZVFH +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64ZVFH +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN define 
void @fadd_v8bf16(ptr %x, ptr %y) { @@ -3925,8 +3925,9 @@ define void @trunc_v8f16(ptr %x) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI171_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI171_0)(a1) +; ZVFH-NEXT: li a1, 25 +; ZVFH-NEXT: slli a1, a1, 10 +; ZVFH-NEXT: fmv.h.x fa5, a1 ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -3965,8 +3966,9 @@ define void @trunc_v6f16(ptr %x) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI172_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI172_0)(a1) +; ZVFH-NEXT: li a1, 25 +; ZVFH-NEXT: slli a1, a1, 10 +; ZVFH-NEXT: fmv.h.x fa5, a1 ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -4022,20 +4024,67 @@ define void @trunc_v4f32(ptr %x) { } define void @trunc_v2f64(ptr %x) { -; CHECK-LABEL: trunc_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI174_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI174_0)(a1) -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: trunc_v2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFH-NEXT: vle64.v v8, (a0) +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI174_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI174_0)(a1) +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: vse64.v v8, (a0) +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: trunc_v2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFH-NEXT: vle64.v v8, (a0) +; RV64ZVFH-NEXT: li a1, 1075 +; RV64ZVFH-NEXT: slli a1, a1, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a1 +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: vse64.v v8, (a0) +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: trunc_v2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vle64.v v8, (a0) +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI174_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI174_0)(a1) +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vse64.v v8, (a0) +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: trunc_v2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vle64.v v8, (a0) +; RV64ZVFHMIN-NEXT: li a1, 1075 +; RV64ZVFHMIN-NEXT: slli a1, a1, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1 +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: 
vfcvt.rtz.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vse64.v v8, (a0) +; RV64ZVFHMIN-NEXT: ret %a = load <2 x double>, ptr %x %b = call <2 x double> @llvm.trunc.v2f64(<2 x double> %a) store <2 x double> %b, ptr %x @@ -4101,8 +4150,9 @@ define void @ceil_v8f16(ptr %x) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI177_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI177_0)(a1) +; ZVFH-NEXT: li a1, 25 +; ZVFH-NEXT: slli a1, a1, 10 +; ZVFH-NEXT: fmv.h.x fa5, a1 ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a1, 3 @@ -4145,8 +4195,9 @@ define void @ceil_v6f16(ptr %x) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI178_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI178_0)(a1) +; ZVFH-NEXT: li a1, 25 +; ZVFH-NEXT: slli a1, a1, 10 +; ZVFH-NEXT: fmv.h.x fa5, a1 ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a1, 3 @@ -4208,22 +4259,75 @@ define void @ceil_v4f32(ptr %x) { } define void @ceil_v2f64(ptr %x) { -; CHECK-LABEL: ceil_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI180_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI180_0)(a1) -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a1, 3 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: ceil_v2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFH-NEXT: vle64.v v8, (a0) +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI180_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI180_0)(a1) +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a1, 3 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a1 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: vse64.v v8, (a0) +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: ceil_v2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFH-NEXT: vle64.v v8, (a0) +; RV64ZVFH-NEXT: li a1, 1075 +; RV64ZVFH-NEXT: slli a1, a1, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a1 +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a1, 3 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a1 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: vse64.v v8, (a0) +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: ceil_v2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vle64.v v8, (a0) +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI180_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI180_0)(a1) +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a1, 3 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a1 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, 
ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vse64.v v8, (a0) +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: ceil_v2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vle64.v v8, (a0) +; RV64ZVFHMIN-NEXT: li a1, 1075 +; RV64ZVFHMIN-NEXT: slli a1, a1, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1 +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a1, 3 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a1 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vse64.v v8, (a0) +; RV64ZVFHMIN-NEXT: ret %a = load <2 x double>, ptr %x %b = call <2 x double> @llvm.ceil.v2f64(<2 x double> %a) store <2 x double> %b, ptr %x @@ -4289,8 +4393,9 @@ define void @floor_v8f16(ptr %x) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI183_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI183_0)(a1) +; ZVFH-NEXT: li a1, 25 +; ZVFH-NEXT: slli a1, a1, 10 +; ZVFH-NEXT: fmv.h.x fa5, a1 ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a1, 2 @@ -4333,8 +4438,9 @@ define void @floor_v6f16(ptr %x) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI184_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI184_0)(a1) +; ZVFH-NEXT: li a1, 25 +; ZVFH-NEXT: slli a1, a1, 10 +; ZVFH-NEXT: fmv.h.x fa5, a1 ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a1, 2 @@ -4396,22 +4502,75 @@ define void @floor_v4f32(ptr %x) { } define void @floor_v2f64(ptr %x) { -; CHECK-LABEL: floor_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI186_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI186_0)(a1) -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a1, 2 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: floor_v2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFH-NEXT: vle64.v v8, (a0) +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI186_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI186_0)(a1) +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a1, 2 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a1 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: vse64.v v8, (a0) +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: floor_v2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFH-NEXT: vle64.v v8, (a0) +; RV64ZVFH-NEXT: li a1, 1075 +; RV64ZVFH-NEXT: slli a1, a1, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a1 +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a1, 2 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a1 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: vse64.v v8, (a0) +; 
RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: floor_v2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vle64.v v8, (a0) +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI186_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI186_0)(a1) +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a1, 2 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a1 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vse64.v v8, (a0) +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: floor_v2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vle64.v v8, (a0) +; RV64ZVFHMIN-NEXT: li a1, 1075 +; RV64ZVFHMIN-NEXT: slli a1, a1, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1 +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a1, 2 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a1 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vse64.v v8, (a0) +; RV64ZVFHMIN-NEXT: ret %a = load <2 x double>, ptr %x %b = call <2 x double> @llvm.floor.v2f64(<2 x double> %a) store <2 x double> %b, ptr %x @@ -4477,8 +4636,9 @@ define void @round_v8f16(ptr %x) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI189_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI189_0)(a1) +; ZVFH-NEXT: li a1, 25 +; ZVFH-NEXT: slli a1, a1, 10 +; ZVFH-NEXT: fmv.h.x fa5, a1 ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a1, 4 @@ -4521,8 +4681,9 @@ define void @round_v6f16(ptr %x) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI190_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI190_0)(a1) +; ZVFH-NEXT: li a1, 25 +; ZVFH-NEXT: slli a1, a1, 10 +; ZVFH-NEXT: fmv.h.x fa5, a1 ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a1, 4 @@ -4584,22 +4745,75 @@ define void @round_v4f32(ptr %x) { } define void @round_v2f64(ptr %x) { -; CHECK-LABEL: round_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI192_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI192_0)(a1) -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a1, 4 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: round_v2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFH-NEXT: vle64.v v8, (a0) +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI192_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI192_0)(a1) +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a1, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a1 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: vse64.v v8, (a0) +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: round_v2f64: +; RV64ZVFH: 
# %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFH-NEXT: vle64.v v8, (a0) +; RV64ZVFH-NEXT: li a1, 1075 +; RV64ZVFH-NEXT: slli a1, a1, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a1 +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a1, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a1 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: vse64.v v8, (a0) +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: round_v2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vle64.v v8, (a0) +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI192_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI192_0)(a1) +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a1, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a1 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vse64.v v8, (a0) +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: round_v2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vle64.v v8, (a0) +; RV64ZVFHMIN-NEXT: li a1, 1075 +; RV64ZVFHMIN-NEXT: slli a1, a1, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1 +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a1, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a1 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vse64.v v8, (a0) +; RV64ZVFHMIN-NEXT: ret %a = load <2 x double>, ptr %x %b = call <2 x double> @llvm.round.v2f64(<2 x double> %a) store <2 x double> %b, ptr %x @@ -4636,8 +4850,9 @@ define void @rint_v8f16(ptr %x) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI194_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI194_0)(a1) +; ZVFH-NEXT: li a1, 25 +; ZVFH-NEXT: slli a1, a1, 10 +; ZVFH-NEXT: fmv.h.x fa5, a1 ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -4693,20 +4908,67 @@ define void @rint_v4f32(ptr %x) { } define void @rint_v2f64(ptr %x) { -; CHECK-LABEL: rint_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI196_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI196_0)(a1) -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: rint_v2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFH-NEXT: vle64.v v8, (a0) +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI196_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI196_0)(a1) +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: vse64.v v8, (a0) +; RV32ZVFH-NEXT: ret +; 
+; RV64ZVFH-LABEL: rint_v2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFH-NEXT: vle64.v v8, (a0) +; RV64ZVFH-NEXT: li a1, 1075 +; RV64ZVFH-NEXT: slli a1, a1, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a1 +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: vse64.v v8, (a0) +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: rint_v2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vle64.v v8, (a0) +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI196_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI196_0)(a1) +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vse64.v v8, (a0) +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: rint_v2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vle64.v v8, (a0) +; RV64ZVFHMIN-NEXT: li a1, 1075 +; RV64ZVFHMIN-NEXT: slli a1, a1, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1 +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vse64.v v8, (a0) +; RV64ZVFHMIN-NEXT: ret %a = load <2 x double>, ptr %x %b = call <2 x double> @llvm.rint.v2f64(<2 x double> %a) store <2 x double> %b, ptr %x @@ -4745,8 +5007,9 @@ define void @nearbyint_v8f16(ptr %x) { ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI198_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI198_0)(a1) +; ZVFH-NEXT: li a1, 25 +; ZVFH-NEXT: slli a1, a1, 10 +; ZVFH-NEXT: fmv.h.x fa5, a1 ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: frflags a1 @@ -4808,22 +5071,75 @@ define void @nearbyint_v4f32(ptr %x) { } define void @nearbyint_v2f64(ptr %x) { -; CHECK-LABEL: nearbyint_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI200_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI200_0)(a1) -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a1 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: fsflags a1 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: nearbyint_v2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFH-NEXT: vle64.v v8, (a0) +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI200_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI200_0)(a1) +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: frflags a1 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: fsflags a1 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: vse64.v v8, (a0) +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: 
nearbyint_v2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFH-NEXT: vle64.v v8, (a0) +; RV64ZVFH-NEXT: li a1, 1075 +; RV64ZVFH-NEXT: slli a1, a1, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a1 +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: frflags a1 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: fsflags a1 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: vse64.v v8, (a0) +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: nearbyint_v2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vle64.v v8, (a0) +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI200_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI200_0)(a1) +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: frflags a1 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: fsflags a1 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vse64.v v8, (a0) +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: nearbyint_v2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vle64.v v8, (a0) +; RV64ZVFHMIN-NEXT: li a1, 1075 +; RV64ZVFHMIN-NEXT: slli a1, a1, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1 +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: frflags a1 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: fsflags a1 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vse64.v v8, (a0) +; RV64ZVFHMIN-NEXT: ret %a = load <2 x double>, ptr %x %b = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %a) store <2 x double> %b, ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll index be32c033fe373..c0b67dd603ebb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround-constrained-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s ; This file tests the code generation for `llvm.experimental.constrained.round.*` on scalable vector type. 
@@ -11,10 +11,11 @@ define <1 x half> @round_v1f16(<1 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -34,10 +35,11 @@ define <2 x half> @round_v2f16(<2 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -57,10 +59,11 @@ define <4 x half> @round_v4f16(<4 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -80,10 +83,11 @@ define <8 x half> @round_v8f16(<8 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -103,10 +107,11 @@ define <16 x half> @round_v16f16(<16 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -125,11 +130,12 @@ define <32 x half> @round_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: round_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: lui a1, %hi(.LCPI5_0) +; CHECK-NEXT: li a1, 25 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) +; CHECK-NEXT: slli a1, a1, 10 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fmv.h.x fa5, a1 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 @@ -261,92 +267,168 @@ define <16 x float> @round_v16f32(<16 x float> %x) strictfp { declare <16 x float> @llvm.experimental.constrained.round.v16f32(<16 x float>, metadata) define <1 x double> @round_v1f64(<1 x double> %x) strictfp { -; CHECK-LABEL: round_v1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, 
%hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: round_v1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI11_0) +; RV32-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: fsrmi a0, 4 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: round_v1f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: fsrmi a0, 4 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %a = call <1 x double> @llvm.experimental.constrained.round.v1f64(<1 x double> %x, metadata !"fpexcept.strict") ret <1 x double> %a } declare <1 x double> @llvm.experimental.constrained.round.v1f64(<1 x double>, metadata) define <2 x double> @round_v2f64(<2 x double> %x) strictfp { -; CHECK-LABEL: round_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: round_v2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI12_0) +; RV32-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: fsrmi a0, 4 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: round_v2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: fsrmi a0, 4 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; 
RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %a = call <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double> %x, metadata !"fpexcept.strict") ret <2 x double> %a } declare <2 x double> @llvm.experimental.constrained.round.v2f64(<2 x double>, metadata) define <4 x double> @round_v4f64(<4 x double> %x) strictfp { -; CHECK-LABEL: round_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: round_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v10, v8 +; RV32-NEXT: vmflt.vf v0, v10, fa5 +; RV32-NEXT: fsrmi a0, 4 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: round_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v10, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v10, fa5 +; RV64-NEXT: fsrmi a0, 4 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %a = call <4 x double> @llvm.experimental.constrained.round.v4f64(<4 x double> %x, metadata !"fpexcept.strict") ret <4 x double> %a } declare <4 x double> @llvm.experimental.constrained.round.v4f64(<4 x double>, metadata) define <8 x double> @round_v8f64(<8 x double> %x) strictfp { -; CHECK-LABEL: round_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: round_v8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI14_0) +; RV32-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v12, v8 +; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: 
fsrmi a0, 4 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: round_v8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: fsrmi a0, 4 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: ret %a = call <8 x double> @llvm.experimental.constrained.round.v8f64(<8 x double> %x, metadata !"fpexcept.strict") ret <8 x double> %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll index 774ce5c7859c9..455dc0b83c03d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fround.ll @@ -1,22 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN ; This file tests the code generation for `llvm.round.*` on fixed vector type. 
define <1 x half> @round_v1f16(<1 x half> %x) { ; ZVFH-LABEL: round_v1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; ZVFH-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -52,10 +53,11 @@ declare <1 x half> @llvm.round.v1f16(<1 x half>) define <2 x half> @round_v2f16(<2 x half> %x) { ; ZVFH-LABEL: round_v2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -91,10 +93,11 @@ declare <2 x half> @llvm.round.v2f16(<2 x half>) define <4 x half> @round_v4f16(<4 x half> %x) { ; ZVFH-LABEL: round_v4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -130,10 +133,11 @@ declare <4 x half> @llvm.round.v4f16(<4 x half>) define <8 x half> @round_v8f16(<8 x half> %x) { ; ZVFH-LABEL: round_v8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -169,10 +173,11 @@ declare <8 x half> @llvm.round.v8f16(<8 x half>) define <16 x half> @round_v16f16(<16 x half> %x) { ; ZVFH-LABEL: round_v16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; ZVFH-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -208,11 +213,12 @@ declare <16 x half> @llvm.round.v16f16(<16 x half>) define <32 x half> @round_v32f16(<32 x half> %x) { ; ZVFH-LABEL: round_v32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; ZVFH-NEXT: li a0, 32 +; ZVFH-NEXT: li a1, 25 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: slli a1, a1, 10 +; ZVFH-NEXT: fmv.h.x fa5, a1 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -347,80 +353,268 @@ define <16 x float> @round_v16f32(<16 x float> %x) { declare <16 x float> @llvm.round.v16f32(<16 x float>) define <1 x double> @round_v1f64(<1 x double> %x) { -; CHECK-LABEL: round_v1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; 
CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: round_v1f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI11_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32ZVFH-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: round_v1f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: round_v1f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI11_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32ZVFHMIN-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: round_v1f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call <1 x double> @llvm.round.v1f64(<1 x double> %x) ret <1 x double> %a } declare <1 x double> @llvm.round.v1f64(<1 x double>) define <2 x double> @round_v2f64(<2 x double> %x) { -; CHECK-LABEL: round_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: round_v2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI12_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: round_v2f64: +; RV64ZVFH: # %bb.0: +; 
RV64ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: round_v2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI12_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: round_v2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call <2 x double> @llvm.round.v2f64(<2 x double> %x) ret <2 x double> %a } declare <2 x double> @llvm.round.v2f64(<2 x double>) define <4 x double> @round_v4f64(<4 x double> %x) { -; CHECK-LABEL: round_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: round_v4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI13_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; RV32ZVFH-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: round_v4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: round_v4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI13_0) +; RV32ZVFHMIN-NEXT: fld fa5, 
%lo(.LCPI13_0)(a0) +; RV32ZVFHMIN-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: round_v4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call <4 x double> @llvm.round.v4f64(<4 x double> %x) ret <4 x double> %a } declare <4 x double> @llvm.round.v4f64(<4 x double>) define <8 x double> @round_v8f64(<8 x double> %x) { -; CHECK-LABEL: round_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: round_v8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI14_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; RV32ZVFH-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: round_v8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: round_v8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI14_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; RV32ZVFHMIN-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: round_v8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li 
a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call <8 x double> @llvm.round.v8f64(<8 x double> %x) ret <8 x double> %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll index 5c0279e133dfa..b1d35d3bcdc1d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven-constrained-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s ; This file tests the code generation for `llvm.experimental.constrained.roundeven.*` on scalable vector type. @@ -11,10 +11,11 @@ define <1 x half> @roundeven_v1f16(<1 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -34,10 +35,11 @@ define <2 x half> @roundeven_v2f16(<2 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -57,10 +59,11 @@ define <4 x half> @roundeven_v4f16(<4 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -80,10 +83,11 @@ define <8 x half> @roundeven_v8f16(<8 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, 
zero, e16, m1, ta, ma @@ -103,10 +107,11 @@ define <16 x half> @roundeven_v16f16(<16 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -125,11 +130,12 @@ define <32 x half> @roundeven_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: roundeven_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: lui a1, %hi(.LCPI5_0) +; CHECK-NEXT: li a1, 25 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) +; CHECK-NEXT: slli a1, a1, 10 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fmv.h.x fa5, a1 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 @@ -261,92 +267,168 @@ define <16 x float> @roundeven_v16f32(<16 x float> %x) strictfp { declare <16 x float> @llvm.experimental.constrained.roundeven.v16f32(<16 x float>, metadata) define <1 x double> @roundeven_v1f64(<1 x double> %x) strictfp { -; CHECK-LABEL: roundeven_v1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: roundeven_v1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI11_0) +; RV32-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: fsrmi a0, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: roundeven_v1f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: fsrmi a0, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %a = call <1 x double> @llvm.experimental.constrained.roundeven.v1f64(<1 x double> %x, metadata !"fpexcept.strict") ret <1 x double> %a } declare <1 x double> @llvm.experimental.constrained.roundeven.v1f64(<1 x double>, metadata) define <2 x double> @roundeven_v2f64(<2 x double> %x) strictfp { -; CHECK-LABEL: roundeven_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, 
e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: roundeven_v2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI12_0) +; RV32-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: fsrmi a0, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: roundeven_v2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: fsrmi a0, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %a = call <2 x double> @llvm.experimental.constrained.roundeven.v2f64(<2 x double> %x, metadata !"fpexcept.strict") ret <2 x double> %a } declare <2 x double> @llvm.experimental.constrained.roundeven.v2f64(<2 x double>, metadata) define <4 x double> @roundeven_v4f64(<4 x double> %x) strictfp { -; CHECK-LABEL: roundeven_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: roundeven_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v10, v8 +; RV32-NEXT: vmflt.vf v0, v10, fa5 +; RV32-NEXT: fsrmi a0, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: roundeven_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v10, v8 +; RV64-NEXT: fmv.d.x fa5, 
a0 +; RV64-NEXT: vmflt.vf v0, v10, fa5 +; RV64-NEXT: fsrmi a0, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %a = call <4 x double> @llvm.experimental.constrained.roundeven.v4f64(<4 x double> %x, metadata !"fpexcept.strict") ret <4 x double> %a } declare <4 x double> @llvm.experimental.constrained.roundeven.v4f64(<4 x double>, metadata) define <8 x double> @roundeven_v8f64(<8 x double> %x) strictfp { -; CHECK-LABEL: roundeven_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: roundeven_v8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI14_0) +; RV32-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v12, v8 +; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: fsrmi a0, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: roundeven_v8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: fsrmi a0, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: ret %a = call <8 x double> @llvm.experimental.constrained.roundeven.v8f64(<8 x double> %x, metadata !"fpexcept.strict") ret <8 x double> %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll index 0b6baad127643..f8b3cb5897dfa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-froundeven.ll @@ -1,22 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: 
-verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN ; This file tests the code generation for `llvm.roundeven.*` on fixed vector type. define <1 x half> @roundeven_v1f16(<1 x half> %x) { ; ZVFH-LABEL: roundeven_v1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a0) ; ZVFH-NEXT: vsetivli zero, 1, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -52,10 +53,11 @@ declare <1 x half> @llvm.roundeven.v1f16(<1 x half>) define <2 x half> @roundeven_v2f16(<2 x half> %x) { ; ZVFH-LABEL: roundeven_v2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a0) ; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -91,10 +93,11 @@ declare <2 x half> @llvm.roundeven.v2f16(<2 x half>) define <4 x half> @roundeven_v4f16(<4 x half> %x) { ; ZVFH-LABEL: roundeven_v4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a0) ; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -130,10 +133,11 @@ declare <4 x half> @llvm.roundeven.v4f16(<4 x half>) define <8 x half> @roundeven_v8f16(<8 x half> %x) { ; ZVFH-LABEL: roundeven_v8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a0) ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -169,10 +173,11 @@ declare <8 x half> @llvm.roundeven.v8f16(<8 x half>) define <16 x half> @roundeven_v16f16(<16 x half> %x) { ; ZVFH-LABEL: roundeven_v16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a0) ; ZVFH-NEXT: vsetivli zero, 16, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -208,11 +213,12 @@ declare <16 x half> @llvm.roundeven.v16f16(<16 x half>) define <32 x half> @roundeven_v32f16(<32 x half> %x) { ; ZVFH-LABEL: roundeven_v32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a0) ; ZVFH-NEXT: li a0, 32 +; ZVFH-NEXT: li a1, 25 ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: slli a1, a1, 10 +; ZVFH-NEXT: fmv.h.x fa5, a1 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -347,80 +353,268 @@ define <16 x float> 
@roundeven_v16f32(<16 x float> %x) { declare <16 x float> @llvm.roundeven.v16f32(<16 x float>) define <1 x double> @roundeven_v1f64(<1 x double> %x) { -; CHECK-LABEL: roundeven_v1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: roundeven_v1f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI11_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32ZVFH-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: roundeven_v1f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: roundeven_v1f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI11_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32ZVFHMIN-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: roundeven_v1f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call <1 x double> @llvm.roundeven.v1f64(<1 x double> %x) ret <1 x double> %a } declare <1 x double> @llvm.roundeven.v1f64(<1 x double>) define <2 x double> @roundeven_v2f64(<2 x double> %x) { -; CHECK-LABEL: roundeven_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t 
-; CHECK-NEXT: ret +; RV32ZVFH-LABEL: roundeven_v2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI12_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: roundeven_v2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: roundeven_v2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI12_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: roundeven_v2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %x) ret <2 x double> %a } declare <2 x double> @llvm.roundeven.v2f64(<2 x double>) define <4 x double> @roundeven_v4f64(<4 x double> %x) { -; CHECK-LABEL: roundeven_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: roundeven_v4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI13_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; RV32ZVFH-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: roundeven_v4f64: +; RV64ZVFH: # 
%bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: roundeven_v4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI13_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; RV32ZVFHMIN-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: roundeven_v4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %x) ret <4 x double> %a } declare <4 x double> @llvm.roundeven.v4f64(<4 x double>) define <8 x double> @roundeven_v8f64(<8 x double> %x) { -; CHECK-LABEL: roundeven_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: roundeven_v8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI14_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; RV32ZVFH-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: roundeven_v8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: roundeven_v8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui 
a0, %hi(.LCPI14_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; RV32ZVFHMIN-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: roundeven_v8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call <8 x double> @llvm.roundeven.v8f64(<8 x double> %x) ret <8 x double> %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll index 2173887e85417..b7cf84fba4210 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ftrunc-constrained-sdnode.ll @@ -1,18 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s define <1 x half> @trunc_v1f16(<1 x half> %x) strictfp { ; CHECK-LABEL: trunc_v1f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -30,10 +31,11 @@ define <2 x half> @trunc_v2f16(<2 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -51,10 +53,11 @@ define <4 x half> @trunc_v4f16(<4 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; 
CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -72,10 +75,11 @@ define <8 x half> @trunc_v8f16(<8 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -93,10 +97,11 @@ define <16 x half> @trunc_v16f16(<16 x half> %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t @@ -113,11 +118,12 @@ define <32 x half> @trunc_v32f16(<32 x half> %x) strictfp { ; CHECK-LABEL: trunc_v32f16: ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: lui a1, %hi(.LCPI5_0) +; CHECK-NEXT: li a1, 25 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) +; CHECK-NEXT: slli a1, a1, 10 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: fmv.h.x fa5, a1 ; CHECK-NEXT: vfabs.v v12, v8 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma @@ -237,84 +243,152 @@ define <16 x float> @trunc_v16f32(<16 x float> %x) strictfp { declare <16 x float> @llvm.experimental.constrained.trunc.v16f32(<16 x float>, metadata) define <1 x double> @trunc_v1f64(<1 x double> %x) strictfp { -; CHECK-LABEL: trunc_v1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: trunc_v1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI11_0) +; RV32-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: trunc_v1f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; 
RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %a = call <1 x double> @llvm.experimental.constrained.trunc.v1f64(<1 x double> %x, metadata !"fpexcept.strict") ret <1 x double> %a } declare <1 x double> @llvm.experimental.constrained.trunc.v1f64(<1 x double>, metadata) define <2 x double> @trunc_v2f64(<2 x double> %x) strictfp { -; CHECK-LABEL: trunc_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: trunc_v2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI12_0) +; RV32-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: trunc_v2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %a = call <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double> %x, metadata !"fpexcept.strict") ret <2 x double> %a } declare <2 x double> @llvm.experimental.constrained.trunc.v2f64(<2 x double>, metadata) define <4 x double> @trunc_v4f64(<4 x double> %x) strictfp { -; CHECK-LABEL: trunc_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: trunc_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v10, v8 +; RV32-NEXT: vmflt.vf v0, v10, fa5 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: trunc_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: 
vsetivli zero, 4, e64, m2, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v10, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v10, fa5 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %a = call <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double> %x, metadata !"fpexcept.strict") ret <4 x double> %a } declare <4 x double> @llvm.experimental.constrained.trunc.v4f64(<4 x double>, metadata) define <8 x double> @trunc_v8f64(<8 x double> %x) strictfp { -; CHECK-LABEL: trunc_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: trunc_v8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI14_0) +; RV32-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v12, v8 +; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: trunc_v8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: ret %a = call <8 x double> @llvm.experimental.constrained.trunc.v8f64(<8 x double> %x, metadata !"fpexcept.strict") ret <8 x double> %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index 4bec67d91847d..ca72905a0f39b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -3597,5 +3597,322 @@ define <4 x i32> @buildvec_vredmax_slideup(<8 x i32> %arg0, <8 x i32> %arg1, <8 ret <4 x i32> %255 } +define <16 x i16> @PR159294(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) { +; RV32-ONLY-LABEL: PR159294: +; RV32-ONLY: # %bb.0: # %entry +; RV32-ONLY-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV32-ONLY-NEXT: vmv.x.s a0, v8 +; RV32-ONLY-NEXT: vmv.x.s a1, v9 +; RV32-ONLY-NEXT: vmv.x.s a2, v10 +; RV32-ONLY-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV32-ONLY-NEXT: vmv.v.x v8, a2 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV32-ONLY-NEXT: vslide1down.vx v8, v8, a1 +; RV32-ONLY-NEXT: vslidedown.vi v8, 
v8, 13 +; RV32-ONLY-NEXT: ret +; +; RV32VB-LABEL: PR159294: +; RV32VB: # %bb.0: # %entry +; RV32VB-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32VB-NEXT: vmv.x.s a0, v8 +; RV32VB-NEXT: vmv.x.s a1, v10 +; RV32VB-NEXT: slli a0, a0, 16 +; RV32VB-NEXT: zext.h a1, a1 +; RV32VB-NEXT: or a0, a1, a0 +; RV32VB-NEXT: vmv.x.s a1, v9 +; RV32VB-NEXT: vmv.v.i v8, 0 +; RV32VB-NEXT: zext.h a1, a1 +; RV32VB-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; RV32VB-NEXT: vmv.s.x v8, a0 +; RV32VB-NEXT: vmv.s.x v10, a1 +; RV32VB-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV32VB-NEXT: vslideup.vi v8, v10, 1 +; RV32VB-NEXT: ret +; +; RV32VB-PACK-LABEL: PR159294: +; RV32VB-PACK: # %bb.0: # %entry +; RV32VB-PACK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32VB-PACK-NEXT: vmv.x.s a0, v8 +; RV32VB-PACK-NEXT: vmv.x.s a1, v10 +; RV32VB-PACK-NEXT: vmv.x.s a2, v9 +; RV32VB-PACK-NEXT: pack a0, a1, a0 +; RV32VB-PACK-NEXT: pack a1, a0, a0 +; RV32VB-PACK-NEXT: vmv.v.x v8, a1 +; RV32VB-PACK-NEXT: pack a1, a2, a0 +; RV32VB-PACK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; RV32VB-PACK-NEXT: vmv.s.x v8, a0 +; RV32VB-PACK-NEXT: vmv.s.x v10, a1 +; RV32VB-PACK-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RV32VB-PACK-NEXT: vslideup.vi v8, v10, 1 +; RV32VB-PACK-NEXT: ret +; +; RV64V-ONLY-LABEL: PR159294: +; RV64V-ONLY: # %bb.0: # %entry +; RV64V-ONLY-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64V-ONLY-NEXT: vmv.x.s a0, v8 +; RV64V-ONLY-NEXT: vmv.x.s a1, v9 +; RV64V-ONLY-NEXT: vmv.x.s a2, v10 +; RV64V-ONLY-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64V-ONLY-NEXT: vmv.v.x v8, a2 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a1 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 13 +; RV64V-ONLY-NEXT: ret +; +; RVA22U64-LABEL: PR159294: +; RVA22U64: # %bb.0: # %entry +; RVA22U64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RVA22U64-NEXT: vmv.x.s a0, v8 +; RVA22U64-NEXT: vmv.x.s a1, v10 +; RVA22U64-NEXT: slli a0, a0, 16 +; RVA22U64-NEXT: zext.h a1, a1 +; RVA22U64-NEXT: or a0, a0, a1 +; RVA22U64-NEXT: vmv.x.s a1, v9 +; RVA22U64-NEXT: vmv.v.i v8, 0 +; RVA22U64-NEXT: zext.h a1, a1 +; RVA22U64-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; RVA22U64-NEXT: vmv.s.x v8, a0 +; RVA22U64-NEXT: vmv.s.x v10, a1 +; RVA22U64-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RVA22U64-NEXT: vslideup.vi v8, v10, 1 +; RVA22U64-NEXT: ret +; +; RVA22U64-PACK-LABEL: PR159294: +; RVA22U64-PACK: # %bb.0: # %entry +; RVA22U64-PACK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RVA22U64-PACK-NEXT: vmv.x.s a0, v8 +; RVA22U64-PACK-NEXT: vmv.x.s a1, v10 +; RVA22U64-PACK-NEXT: vmv.x.s a2, v9 +; RVA22U64-PACK-NEXT: packw a0, a1, a0 +; RVA22U64-PACK-NEXT: packw a1, a0, a0 +; RVA22U64-PACK-NEXT: vmv.v.x v8, a1 +; RVA22U64-PACK-NEXT: packw a1, a2, a0 +; RVA22U64-PACK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; RVA22U64-PACK-NEXT: vmv.s.x v8, a0 +; RVA22U64-PACK-NEXT: vmv.s.x v10, a1 +; RVA22U64-PACK-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; RVA22U64-PACK-NEXT: vslideup.vi v8, v10, 1 +; RVA22U64-PACK-NEXT: ret +; +; RV64ZVE32-LABEL: PR159294: +; RV64ZVE32: # %bb.0: # %entry +; RV64ZVE32-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; RV64ZVE32-NEXT: vmv.x.s a0, v8 +; RV64ZVE32-NEXT: vmv.x.s a1, v9 +; RV64ZVE32-NEXT: vmv.x.s a2, v10 +; RV64ZVE32-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; RV64ZVE32-NEXT: vmv.v.x v8, a2 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 13 +; RV64ZVE32-NEXT: ret +entry: + %vecext3 = extractelement <2 x i32> %a, i32 0 + %conv4 = trunc i32 %vecext3 
to i16 + %vecinit5 = insertelement <16 x i16> , i16 %conv4, i32 1 + %vecext7 = extractelement <2 x i32> %b, i32 0 + %conv8 = trunc i32 %vecext7 to i16 + %vecinit9 = insertelement <16 x i16> %vecinit5, i16 %conv8, i32 2 + %vecext59 = extractelement <2 x i32> %c, i32 0 + %conv60 = trunc i32 %vecext59 to i16 + %vecinit61 = insertelement <16 x i16> %vecinit9, i16 %conv60, i32 0 + ret <16 x i16> %vecinit61 +} + +define <16 x i32> @PR159294_zext(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) { +; RV32-LABEL: PR159294_zext: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a1, v9 +; RV32-NEXT: vmv.x.s a2, v10 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v8, a2 +; RV32-NEXT: lui a2, 16 +; RV32-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NEXT: vslide1down.vx v8, v8, a1 +; RV32-NEXT: vslidedown.vi v8, v8, 13 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: vand.vx v8, v8, a2 +; RV32-NEXT: ret +; +; RV64V-ONLY-LABEL: PR159294_zext: +; RV64V-ONLY: # %bb.0: # %entry +; RV64V-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64V-ONLY-NEXT: vmv.x.s a0, v8 +; RV64V-ONLY-NEXT: lui a1, 16 +; RV64V-ONLY-NEXT: vmv.x.s a2, v9 +; RV64V-ONLY-NEXT: vmv.x.s a3, v10 +; RV64V-ONLY-NEXT: addi a1, a1, -1 +; RV64V-ONLY-NEXT: and a0, a0, a1 +; RV64V-ONLY-NEXT: and a2, a2, a1 +; RV64V-ONLY-NEXT: and a1, a3, a1 +; RV64V-ONLY-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV64V-ONLY-NEXT: vmv.v.x v8, a1 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a2 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 13 +; RV64V-ONLY-NEXT: ret +; +; RVA22U64-LABEL: PR159294_zext: +; RVA22U64: # %bb.0: # %entry +; RVA22U64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RVA22U64-NEXT: vmv.x.s a0, v8 +; RVA22U64-NEXT: vmv.x.s a1, v10 +; RVA22U64-NEXT: slli a0, a0, 48 +; RVA22U64-NEXT: zext.h a1, a1 +; RVA22U64-NEXT: srli a0, a0, 16 +; RVA22U64-NEXT: or a0, a0, a1 +; RVA22U64-NEXT: vmv.x.s a1, v9 +; RVA22U64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RVA22U64-NEXT: vmv.v.i v8, 0 +; RVA22U64-NEXT: zext.h a1, a1 +; RVA22U64-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RVA22U64-NEXT: vmv.s.x v8, a0 +; RVA22U64-NEXT: vmv.s.x v12, a1 +; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, tu, ma +; RVA22U64-NEXT: vslideup.vi v8, v12, 1 +; RVA22U64-NEXT: ret +; +; RVA22U64-PACK-LABEL: PR159294_zext: +; RVA22U64-PACK: # %bb.0: # %entry +; RVA22U64-PACK-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; RVA22U64-PACK-NEXT: vmv1r.v v12, v9 +; RVA22U64-PACK-NEXT: vmv.x.s a0, v8 +; RVA22U64-PACK-NEXT: vmv.x.s a1, v10 +; RVA22U64-PACK-NEXT: pack a2, a0, a0 +; RVA22U64-PACK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RVA22U64-PACK-NEXT: vmv.v.x v8, a2 +; RVA22U64-PACK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; RVA22U64-PACK-NEXT: vmv.x.s a2, v12 +; RVA22U64-PACK-NEXT: zext.h a0, a0 +; RVA22U64-PACK-NEXT: zext.h a1, a1 +; RVA22U64-PACK-NEXT: zext.h a2, a2 +; RVA22U64-PACK-NEXT: pack a0, a1, a0 +; RVA22U64-PACK-NEXT: pack a1, a2, a0 +; RVA22U64-PACK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RVA22U64-PACK-NEXT: vmv.s.x v8, a0 +; RVA22U64-PACK-NEXT: vmv.s.x v12, a1 +; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, tu, ma +; RVA22U64-PACK-NEXT: vslideup.vi v8, v12, 1 +; RVA22U64-PACK-NEXT: ret +; +; RV64ZVE32-LABEL: PR159294_zext: +; RV64ZVE32: # %bb.0: # %entry +; RV64ZVE32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32-NEXT: vmv.x.s a0, v8 +; RV64ZVE32-NEXT: lui a1, 16 +; RV64ZVE32-NEXT: vmv.x.s a2, v9 +; RV64ZVE32-NEXT: vmv.x.s a3, v10 +; 
RV64ZVE32-NEXT: addi a1, a1, -1 +; RV64ZVE32-NEXT: and a0, a0, a1 +; RV64ZVE32-NEXT: and a2, a2, a1 +; RV64ZVE32-NEXT: and a1, a3, a1 +; RV64ZVE32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV64ZVE32-NEXT: vmv.v.x v8, a1 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a0 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 13 +; RV64ZVE32-NEXT: ret +entry: + %vecext3 = extractelement <2 x i16> %a, i32 0 + %conv4 = zext i16 %vecext3 to i32 + %vecinit5 = insertelement <16 x i32> , i32 %conv4, i32 1 + %vecext7 = extractelement <2 x i16> %b, i32 0 + %conv8 = zext i16 %vecext7 to i32 + %vecinit9 = insertelement <16 x i32> %vecinit5, i32 %conv8, i32 2 + %vecext59 = extractelement <2 x i16> %c, i32 0 + %conv60 = zext i16 %vecext59 to i32 + %vecinit61 = insertelement <16 x i32> %vecinit9, i32 %conv60, i32 0 + ret <16 x i32> %vecinit61 +} + +define <16 x i32> @PR159294_sext(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) { +; RV32-LABEL: PR159294_sext: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV32-NEXT: vmv.x.s a0, v8 +; RV32-NEXT: vmv.x.s a1, v9 +; RV32-NEXT: vmv.x.s a2, v10 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vmv.v.x v8, a2 +; RV32-NEXT: vslide1down.vx v8, v8, a0 +; RV32-NEXT: vslide1down.vx v8, v8, a1 +; RV32-NEXT: vslidedown.vi v8, v8, 13 +; RV32-NEXT: ret +; +; RV64V-ONLY-LABEL: PR159294_sext: +; RV64V-ONLY: # %bb.0: # %entry +; RV64V-ONLY-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64V-ONLY-NEXT: vmv.x.s a0, v8 +; RV64V-ONLY-NEXT: vmv.x.s a1, v9 +; RV64V-ONLY-NEXT: vmv.x.s a2, v10 +; RV64V-ONLY-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV64V-ONLY-NEXT: vmv.v.x v8, a2 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a0 +; RV64V-ONLY-NEXT: vslide1down.vx v8, v8, a1 +; RV64V-ONLY-NEXT: vslidedown.vi v8, v8, 13 +; RV64V-ONLY-NEXT: ret +; +; RVA22U64-LABEL: PR159294_sext: +; RVA22U64: # %bb.0: # %entry +; RVA22U64-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RVA22U64-NEXT: vmv.x.s a0, v8 +; RVA22U64-NEXT: vmv.x.s a1, v10 +; RVA22U64-NEXT: slli a0, a0, 32 +; RVA22U64-NEXT: add.uw a0, a1, a0 +; RVA22U64-NEXT: vmv.x.s a1, v9 +; RVA22U64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RVA22U64-NEXT: vmv.v.i v8, 0 +; RVA22U64-NEXT: zext.w a1, a1 +; RVA22U64-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RVA22U64-NEXT: vmv.s.x v8, a0 +; RVA22U64-NEXT: vmv.s.x v12, a1 +; RVA22U64-NEXT: vsetivli zero, 2, e64, m1, tu, ma +; RVA22U64-NEXT: vslideup.vi v8, v12, 1 +; RVA22U64-NEXT: ret +; +; RVA22U64-PACK-LABEL: PR159294_sext: +; RVA22U64-PACK: # %bb.0: # %entry +; RVA22U64-PACK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RVA22U64-PACK-NEXT: vmv.x.s a0, v8 +; RVA22U64-PACK-NEXT: vmv.x.s a1, v10 +; RVA22U64-PACK-NEXT: vmv.x.s a2, v9 +; RVA22U64-PACK-NEXT: pack a0, a1, a0 +; RVA22U64-PACK-NEXT: pack a1, a0, a0 +; RVA22U64-PACK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RVA22U64-PACK-NEXT: vmv.v.x v8, a1 +; RVA22U64-PACK-NEXT: pack a1, a2, a0 +; RVA22U64-PACK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RVA22U64-PACK-NEXT: vmv.s.x v8, a0 +; RVA22U64-PACK-NEXT: vmv.s.x v12, a1 +; RVA22U64-PACK-NEXT: vsetivli zero, 2, e64, m1, tu, ma +; RVA22U64-PACK-NEXT: vslideup.vi v8, v12, 1 +; RVA22U64-PACK-NEXT: ret +; +; RV64ZVE32-LABEL: PR159294_sext: +; RV64ZVE32: # %bb.0: # %entry +; RV64ZVE32-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; RV64ZVE32-NEXT: vmv.x.s a0, v8 +; RV64ZVE32-NEXT: vmv.x.s a1, v9 +; RV64ZVE32-NEXT: vmv.x.s a2, v10 +; RV64ZVE32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV64ZVE32-NEXT: vmv.v.x v8, a2 +; RV64ZVE32-NEXT: 
vslide1down.vx v8, v8, a0 +; RV64ZVE32-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVE32-NEXT: vslidedown.vi v8, v8, 13 +; RV64ZVE32-NEXT: ret +entry: + %vecext3 = extractelement <2 x i16> %a, i32 0 + %conv4 = sext i16 %vecext3 to i32 + %vecinit5 = insertelement <16 x i32> , i32 %conv4, i32 1 + %vecext7 = extractelement <2 x i16> %b, i32 0 + %conv8 = sext i16 %vecext7 to i32 + %vecinit9 = insertelement <16 x i32> %vecinit5, i32 %conv8, i32 2 + %vecext59 = extractelement <2 x i16> %c, i32 0 + %conv60 = sext i16 %vecext59 to i32 + %vecinit61 = insertelement <16 x i32> %vecinit9, i32 %conv60, i32 0 + ret <16 x i32> %vecinit61 +} ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll index b6c441290ee45..08da7d6bc50f7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-nearbyint-vp.ll @@ -1,18 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s declare <2 x half> @llvm.vp.nearbyint.v2f16(<2 x half>, <2 x i1>, i32) define <2 x half> @vp_nearbyint_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t ; CHECK-NEXT: frflags a0 @@ -30,10 +31,11 @@ define <2 x half> @vp_nearbyint_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext % define <2 x half> @vp_nearbyint_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v2f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -51,10 +53,11 @@ declare <4 x half> @llvm.vp.nearbyint.v4f16(<4 x half>, <4 x i1>, i32) define <4 x half> @vp_nearbyint_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t ; CHECK-NEXT: frflags a0 @@ -72,10 +75,11 @@ define <4 x half> @vp_nearbyint_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext % define <4 x half> @vp_nearbyint_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v4f16_unmasked: ; CHECK: # %bb.0: -; 
CHECK-NEXT: lui a1, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -93,10 +97,11 @@ declare <8 x half> @llvm.vp.nearbyint.v8f16(<8 x half>, <8 x i1>, i32) define <8 x half> @vp_nearbyint_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t ; CHECK-NEXT: frflags a0 @@ -114,10 +119,11 @@ define <8 x half> @vp_nearbyint_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext % define <8 x half> @vp_nearbyint_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v8f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -137,9 +143,10 @@ define <16 x half> @vp_nearbyint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI6_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t ; CHECK-NEXT: frflags a0 @@ -158,10 +165,11 @@ define <16 x half> @vp_nearbyint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe define <16 x half> @vp_nearbyint_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_nearbyint_v16f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI7_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI7_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -349,41 +357,75 @@ define <16 x float> @vp_nearbyint_v16f32_unmasked(<16 x float> %va, i32 zeroext declare <2 x double> @llvm.vp.nearbyint.v2f64(<2 x double>, <2 x i1>, i32) define <2 x double> @vp_nearbyint_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI16_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: vp_nearbyint_v2f64: 
+; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI16_0) +; RV32-NEXT: fld fa5, %lo(.LCPI16_0)(a1) +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vfabs.v v9, v8, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32-NEXT: frflags a0 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_nearbyint_v2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vfabs.v v9, v8, v0.t +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64-NEXT: frflags a0 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %v = call <2 x double> @llvm.vp.nearbyint.v2f64(<2 x double> %va, <2 x i1> %m, i32 %evl) ret <2 x double> %v } define <2 x double> @vp_nearbyint_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_v2f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: vp_nearbyint_v2f64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI17_0) +; RV32-NEXT: fld fa5, %lo(.LCPI17_0)(a1) +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: frflags a0 +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_nearbyint_v2f64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: frflags a0 +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %v = call <2 x double> @llvm.vp.nearbyint.v2f64(<2 x double> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x double> %v } @@ -391,43 +433,79 @@ define <2 x double> @vp_nearbyint_v2f64_unmasked(<2 x double> %va, i32 zeroext % declare <4 x double> @llvm.vp.nearbyint.v4f64(<4 x double>, <4 x i1>, i32) define <4 x double> @vp_nearbyint_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; 
CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: vp_nearbyint_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vmv1r.v v10, v0 +; RV32-NEXT: lui a0, %hi(.LCPI18_0) +; RV32-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32-NEXT: vfabs.v v12, v8, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32-NEXT: frflags a0 +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_nearbyint_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64-NEXT: vmv1r.v v10, v0 +; RV64-NEXT: vfabs.v v12, v8, v0.t +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64-NEXT: frflags a0 +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %v = call <4 x double> @llvm.vp.nearbyint.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl) ret <4 x double> %v } define <4 x double> @vp_nearbyint_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_v4f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: vp_nearbyint_v4f64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI19_0) +; RV32-NEXT: fld fa5, %lo(.LCPI19_0)(a1) +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vfabs.v v10, v8 +; RV32-NEXT: vmflt.vf v0, v10, fa5 +; RV32-NEXT: frflags a0 +; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_nearbyint_v4f64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64-NEXT: vfabs.v v10, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v10, fa5 +; RV64-NEXT: frflags a0 +; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %v = call <4 x double> @llvm.vp.nearbyint.v4f64(<4 x double> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x double> %v } @@ -435,43 +513,79 
@@ define <4 x double> @vp_nearbyint_v4f64_unmasked(<4 x double> %va, i32 zeroext % declare <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double>, <8 x i1>, i32) define <8 x double> @vp_nearbyint_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: vp_nearbyint_v8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vmv1r.v v12, v0 +; RV32-NEXT: lui a0, %hi(.LCPI20_0) +; RV32-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32-NEXT: vfabs.v v16, v8, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32-NEXT: frflags a0 +; RV32-NEXT: vmv1r.v v0, v12 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_nearbyint_v8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64-NEXT: vmv1r.v v12, v0 +; RV64-NEXT: vfabs.v v16, v8, v0.t +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64-NEXT: frflags a0 +; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %v = call <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl) ret <8 x double> %v } define <8 x double> @vp_nearbyint_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_v8f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI21_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: vp_nearbyint_v8f64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI21_0) +; RV32-NEXT: fld fa5, %lo(.LCPI21_0)(a1) +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vfabs.v v12, v8 +; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: frflags a0 +; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_nearbyint_v8f64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, 
a0, e64, m4, ta, ma +; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: frflags a0 +; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %v = call <8 x double> @llvm.vp.nearbyint.v8f64(<8 x double> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x double> %v } @@ -479,43 +593,79 @@ define <8 x double> @vp_nearbyint_v8f64_unmasked(<8 x double> %va, i32 zeroext % declare <15 x double> @llvm.vp.nearbyint.v15f64(<15 x double>, <15 x i1>, i32) define <15 x double> @vp_nearbyint_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_v15f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI22_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: vp_nearbyint_v15f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v16, v0 +; RV32-NEXT: lui a0, %hi(.LCPI22_0) +; RV32-NEXT: fld fa5, %lo(.LCPI22_0)(a0) +; RV32-NEXT: vfabs.v v24, v8, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32-NEXT: frflags a0 +; RV32-NEXT: vmv1r.v v0, v16 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_nearbyint_v15f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v16, v0 +; RV64-NEXT: vfabs.v v24, v8, v0.t +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64-NEXT: frflags a0 +; RV64-NEXT: vmv1r.v v0, v16 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %v = call <15 x double> @llvm.vp.nearbyint.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl) ret <15 x double> %v } define <15 x double> @vp_nearbyint_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_v15f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; 
RV32-LABEL: vp_nearbyint_v15f64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI23_0) +; RV32-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vfabs.v v16, v8 +; RV32-NEXT: vmflt.vf v0, v16, fa5 +; RV32-NEXT: frflags a0 +; RV32-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_nearbyint_v15f64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vfabs.v v16, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v16, fa5 +; RV64-NEXT: frflags a0 +; RV64-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %v = call <15 x double> @llvm.vp.nearbyint.v15f64(<15 x double> %va, <15 x i1> splat (i1 true), i32 %evl) ret <15 x double> %v } @@ -523,43 +673,79 @@ define <15 x double> @vp_nearbyint_v15f64_unmasked(<15 x double> %va, i32 zeroex declare <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double>, <16 x i1>, i32) define <16 x double> @vp_nearbyint_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_v16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI24_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: vp_nearbyint_v16f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v16, v0 +; RV32-NEXT: lui a0, %hi(.LCPI24_0) +; RV32-NEXT: fld fa5, %lo(.LCPI24_0)(a0) +; RV32-NEXT: vfabs.v v24, v8, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32-NEXT: frflags a0 +; RV32-NEXT: vmv1r.v v0, v16 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_nearbyint_v16f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v16, v0 +; RV64-NEXT: vfabs.v v24, v8, v0.t +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64-NEXT: frflags a0 +; RV64-NEXT: vmv1r.v v0, v16 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %v = call <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl) ret <16 x double> %v } define <16 x double> 
@vp_nearbyint_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_v16f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: vp_nearbyint_v16f64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI25_0) +; RV32-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vfabs.v v16, v8 +; RV32-NEXT: vmflt.vf v0, v16, fa5 +; RV32-NEXT: frflags a0 +; RV32-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_nearbyint_v16f64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vfabs.v v16, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v16, fa5 +; RV64-NEXT: frflags a0 +; RV64-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %v = call <16 x double> @llvm.vp.nearbyint.v16f64(<16 x double> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x double> %v } @@ -567,91 +753,175 @@ define <16 x double> @vp_nearbyint_v16f64_unmasked(<16 x double> %va, i32 zeroex declare <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_nearbyint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_v32f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v6, v0 -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB26_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: lui a1, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: frflags a1 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsflags a1 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; 
CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: vp_nearbyint_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vmv1r.v v6, v0 +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vslidedown.vi v7, v0, 2 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB26_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB26_2: +; RV32-NEXT: vmv1r.v v0, v6 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vfabs.v v24, v8, v0.t +; RV32-NEXT: lui a1, %hi(.LCPI26_0) +; RV32-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32-NEXT: frflags a1 +; RV32-NEXT: vmv1r.v v0, v6 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32-NEXT: fsflags a1 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vfabs.v v24, v16, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32-NEXT: frflags a0 +; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_nearbyint_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vmv1r.v v6, v0 +; RV64-NEXT: li a2, 16 +; RV64-NEXT: vslidedown.vi v7, v0, 2 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB26_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB26_2: +; RV64-NEXT: vmv1r.v v0, v6 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vfabs.v v24, v8, v0.t +; RV64-NEXT: li a1, 1075 +; RV64-NEXT: slli a1, a1, 52 +; RV64-NEXT: fmv.d.x fa5, a1 +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: frflags a1 +; RV64-NEXT: vmv1r.v v0, v6 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64-NEXT: fsflags a1 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vfabs.v v24, v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64-NEXT: frflags a0 +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl) ret <32 x double> %v } define <32 x double> @vp_nearbyint_v32f64_unmasked(<32 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_v32f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB27_2 -; CHECK-NEXT: # %bb.1: -; 
CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: lui a2, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) -; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: sltu a0, a0, a2 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a2 -; CHECK-NEXT: frflags a2 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v7, v24, fa5 -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsflags a2 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: frflags a1 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: fsflags a1 -; CHECK-NEXT: ret +; RV32-LABEL: vp_nearbyint_v32f64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: li a2, 16 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB27_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB27_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vfabs.v v24, v8 +; RV32-NEXT: lui a2, %hi(.LCPI27_0) +; RV32-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; RV32-NEXT: addi a2, a0, -16 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: frflags a2 +; RV32-NEXT: vmflt.vf v0, v24, fa5 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vfabs.v v24, v16 +; RV32-NEXT: vmflt.vf v7, v24, fa5 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32-NEXT: fsflags a2 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32-NEXT: frflags a1 +; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32-NEXT: fsflags a1 +; RV32-NEXT: ret +; +; RV64-LABEL: vp_nearbyint_v32f64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: li a2, 16 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB27_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB27_2: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vfabs.v v24, v8 +; RV64-NEXT: li a2, 1075 +; RV64-NEXT: slli a2, a2, 52 +; RV64-NEXT: fmv.d.x fa5, a2 +; RV64-NEXT: addi a2, a0, -16 +; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: frflags a2 +; RV64-NEXT: vmflt.vf v0, v24, fa5 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vfabs.v v24, v16 +; RV64-NEXT: vmflt.vf v7, v24, fa5 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64-NEXT: fsflags a2 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64-NEXT: frflags a1 +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64-NEXT: 
fsflags a1 +; RV64-NEXT: ret %v = call <32 x double> @llvm.vp.nearbyint.v32f64(<32 x double> %va, <32 x i1> splat (i1 true), i32 %evl) ret <32 x double> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll index a426f8c619e99..eec12212d0d37 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zfh,+zvfh,+f,+d -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s declare half @llvm.vector.reduce.fadd.v1f16(half, <1 x half>) @@ -2083,21 +2083,38 @@ define float @vreduce_fminimum_v128f32_nonans(ptr %x) { declare double @llvm.vector.reduce.fminimum.v2f64(<2 x double>) define double @vreduce_fminimum_v2f64(ptr %x) { -; CHECK-LABEL: vreduce_fminimum_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmfne.vv v9, v8, v8 -; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB123_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI123_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI123_0)(a0) -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB123_2: -; CHECK-NEXT: vfredmin.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret +; RV32-LABEL: vreduce_fminimum_v2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vmfne.vv v9, v8, v8 +; RV32-NEXT: vcpop.m a0, v9 +; RV32-NEXT: beqz a0, .LBB123_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: lui a0, %hi(.LCPI123_0) +; RV32-NEXT: fld fa0, %lo(.LCPI123_0)(a0) +; RV32-NEXT: ret +; RV32-NEXT: .LBB123_2: +; RV32-NEXT: vfredmin.vs v8, v8, v8 +; RV32-NEXT: vfmv.f.s fa0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fminimum_v2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vmfne.vv v9, v8, v8 +; RV64-NEXT: vcpop.m a0, v9 +; RV64-NEXT: beqz a0, .LBB123_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: lui a0, 4095 +; RV64-NEXT: slli a0, a0, 39 +; RV64-NEXT: fmv.d.x fa0, a0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB123_2: +; RV64-NEXT: vfredmin.vs v8, v8, v8 +; RV64-NEXT: vfmv.f.s fa0, v8 +; RV64-NEXT: ret %v = load <2 x double>, ptr %x %red = call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %v) ret double %red @@ -2119,21 +2136,38 @@ define double @vreduce_fminimum_v2f64_nonans(ptr %x) { declare double @llvm.vector.reduce.fminimum.v4f64(<4 x double>) define double @vreduce_fminimum_v4f64(ptr %x) { -; CHECK-LABEL: vreduce_fminimum_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmfne.vv v10, v8, v8 -; CHECK-NEXT: vcpop.m a0, v10 -; CHECK-NEXT: beqz a0, .LBB125_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI125_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI125_0)(a0) -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB125_2: -; CHECK-NEXT: vfredmin.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret +; RV32-LABEL: 
vreduce_fminimum_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vmfne.vv v10, v8, v8 +; RV32-NEXT: vcpop.m a0, v10 +; RV32-NEXT: beqz a0, .LBB125_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: lui a0, %hi(.LCPI125_0) +; RV32-NEXT: fld fa0, %lo(.LCPI125_0)(a0) +; RV32-NEXT: ret +; RV32-NEXT: .LBB125_2: +; RV32-NEXT: vfredmin.vs v8, v8, v8 +; RV32-NEXT: vfmv.f.s fa0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fminimum_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vmfne.vv v10, v8, v8 +; RV64-NEXT: vcpop.m a0, v10 +; RV64-NEXT: beqz a0, .LBB125_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: lui a0, 4095 +; RV64-NEXT: slli a0, a0, 39 +; RV64-NEXT: fmv.d.x fa0, a0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB125_2: +; RV64-NEXT: vfredmin.vs v8, v8, v8 +; RV64-NEXT: vfmv.f.s fa0, v8 +; RV64-NEXT: ret %v = load <4 x double>, ptr %x %red = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %v) ret double %red @@ -2155,21 +2189,38 @@ define double @vreduce_fminimum_v4f64_nonans(ptr %x) { declare double @llvm.vector.reduce.fminimum.v8f64(<8 x double>) define double @vreduce_fminimum_v8f64(ptr %x) { -; CHECK-LABEL: vreduce_fminimum_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmfne.vv v12, v8, v8 -; CHECK-NEXT: vcpop.m a0, v12 -; CHECK-NEXT: beqz a0, .LBB127_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI127_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI127_0)(a0) -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB127_2: -; CHECK-NEXT: vfredmin.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret +; RV32-LABEL: vreduce_fminimum_v8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vmfne.vv v12, v8, v8 +; RV32-NEXT: vcpop.m a0, v12 +; RV32-NEXT: beqz a0, .LBB127_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: lui a0, %hi(.LCPI127_0) +; RV32-NEXT: fld fa0, %lo(.LCPI127_0)(a0) +; RV32-NEXT: ret +; RV32-NEXT: .LBB127_2: +; RV32-NEXT: vfredmin.vs v8, v8, v8 +; RV32-NEXT: vfmv.f.s fa0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fminimum_v8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vmfne.vv v12, v8, v8 +; RV64-NEXT: vcpop.m a0, v12 +; RV64-NEXT: beqz a0, .LBB127_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: lui a0, 4095 +; RV64-NEXT: slli a0, a0, 39 +; RV64-NEXT: fmv.d.x fa0, a0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB127_2: +; RV64-NEXT: vfredmin.vs v8, v8, v8 +; RV64-NEXT: vfmv.f.s fa0, v8 +; RV64-NEXT: ret %v = load <8 x double>, ptr %x %red = call double @llvm.vector.reduce.fminimum.v8f64(<8 x double> %v) ret double %red @@ -2191,21 +2242,38 @@ define double @vreduce_fminimum_v8f64_nonans(ptr %x) { declare double @llvm.vector.reduce.fminimum.v16f64(<16 x double>) define double @vreduce_fminimum_v16f64(ptr %x) { -; CHECK-LABEL: vreduce_fminimum_v16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmfne.vv v16, v8, v8 -; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB129_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI129_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI129_0)(a0) -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB129_2: -; CHECK-NEXT: vfredmin.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret +; RV32-LABEL: vreduce_fminimum_v16f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 16, e64, 
m8, ta, ma +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vmfne.vv v16, v8, v8 +; RV32-NEXT: vcpop.m a0, v16 +; RV32-NEXT: beqz a0, .LBB129_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: lui a0, %hi(.LCPI129_0) +; RV32-NEXT: fld fa0, %lo(.LCPI129_0)(a0) +; RV32-NEXT: ret +; RV32-NEXT: .LBB129_2: +; RV32-NEXT: vfredmin.vs v8, v8, v8 +; RV32-NEXT: vfmv.f.s fa0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fminimum_v16f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vmfne.vv v16, v8, v8 +; RV64-NEXT: vcpop.m a0, v16 +; RV64-NEXT: beqz a0, .LBB129_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: lui a0, 4095 +; RV64-NEXT: slli a0, a0, 39 +; RV64-NEXT: fmv.d.x fa0, a0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB129_2: +; RV64-NEXT: vfredmin.vs v8, v8, v8 +; RV64-NEXT: vfmv.f.s fa0, v8 +; RV64-NEXT: ret %v = load <16 x double>, ptr %x %red = call double @llvm.vector.reduce.fminimum.v16f64(<16 x double> %v) ret double %red @@ -2227,29 +2295,54 @@ define double @vreduce_fminimum_v16f64_nonans(ptr %x) { declare double @llvm.vector.reduce.fminimum.v32f64(<32 x double>) define double @vreduce_fminimum_v32f64(ptr %x) { -; CHECK-LABEL: vreduce_fminimum_v32f64: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: vle64.v v24, (a1) -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 -; CHECK-NEXT: vfmin.vv v8, v16, v8 -; CHECK-NEXT: vmfne.vv v16, v8, v8 -; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB131_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI131_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI131_0)(a0) -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB131_2: -; CHECK-NEXT: vfredmin.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret +; RV32-LABEL: vreduce_fminimum_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: addi a1, a0, 128 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: vmfeq.vv v0, v16, v16 +; RV32-NEXT: vmfeq.vv v7, v24, v24 +; RV32-NEXT: vmerge.vvm v8, v16, v24, v0 +; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 +; RV32-NEXT: vfmin.vv v8, v16, v8 +; RV32-NEXT: vmfne.vv v16, v8, v8 +; RV32-NEXT: vcpop.m a0, v16 +; RV32-NEXT: beqz a0, .LBB131_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: lui a0, %hi(.LCPI131_0) +; RV32-NEXT: fld fa0, %lo(.LCPI131_0)(a0) +; RV32-NEXT: ret +; RV32-NEXT: .LBB131_2: +; RV32-NEXT: vfredmin.vs v8, v8, v8 +; RV32-NEXT: vfmv.f.s fa0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fminimum_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle64.v v16, (a0) +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vmfeq.vv v0, v16, v16 +; RV64-NEXT: vmfeq.vv v7, v24, v24 +; RV64-NEXT: vmerge.vvm v8, v16, v24, v0 +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vmerge.vvm v16, v24, v16, v0 +; RV64-NEXT: vfmin.vv v8, v16, v8 +; RV64-NEXT: vmfne.vv v16, v8, v8 +; RV64-NEXT: vcpop.m a0, v16 +; RV64-NEXT: beqz a0, .LBB131_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: lui a0, 4095 +; RV64-NEXT: slli a0, a0, 39 +; RV64-NEXT: fmv.d.x fa0, a0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB131_2: +; RV64-NEXT: vfredmin.vs v8, v8, v8 +; RV64-NEXT: vfmv.f.s fa0, v8 +; RV64-NEXT: ret %v = load <32 x double>, ptr %x %red = call double @llvm.vector.reduce.fminimum.v32f64(<32 x 
double> %v) ret double %red @@ -2274,85 +2367,166 @@ define double @vreduce_fminimum_v32f64_nonans(ptr %x) { declare double @llvm.vector.reduce.fminimum.v64f64(<64 x double>) define double @vreduce_fminimum_v64f64(ptr %x) { -; CHECK-LABEL: vreduce_fminimum_v64f64: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle64.v v8, (a1) -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfmin.vv v24, v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v8, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfmin.vv v16, v8, v16 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 -; CHECK-NEXT: vfmin.vv v8, v16, v8 -; CHECK-NEXT: vmfne.vv v16, v8, v8 -; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB133_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI133_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI133_0)(a0) -; CHECK-NEXT: j .LBB133_3 -; CHECK-NEXT: .LBB133_2: -; CHECK-NEXT: vfredmin.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB133_3: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: ret +; RV32-LABEL: vreduce_fminimum_v64f64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; 
RV32-NEXT: addi a1, a0, 128 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: addi a1, a0, 384 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmfeq.vv v0, v24, v24 +; RV32-NEXT: vmfeq.vv v7, v16, v16 +; RV32-NEXT: vmerge.vvm v8, v24, v16, v0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: vle64.v v8, (a1) +; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vmerge.vvm v16, v16, v24, v0 +; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vfmin.vv v24, v16, v24 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmfeq.vv v0, v16, v16 +; RV32-NEXT: vmfeq.vv v7, v8, v8 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmerge.vvm v16, v16, v8, v0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vfmin.vv v16, v8, v16 +; RV32-NEXT: vmfeq.vv v0, v16, v16 +; RV32-NEXT: vmfeq.vv v7, v24, v24 +; RV32-NEXT: vmerge.vvm v8, v16, v24, v0 +; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 +; RV32-NEXT: vfmin.vv v8, v16, v8 +; RV32-NEXT: vmfne.vv v16, v8, v8 +; RV32-NEXT: vcpop.m a0, v16 +; RV32-NEXT: beqz a0, .LBB133_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: lui a0, %hi(.LCPI133_0) +; RV32-NEXT: fld fa0, %lo(.LCPI133_0)(a0) +; RV32-NEXT: j .LBB133_3 +; RV32-NEXT: .LBB133_2: +; RV32-NEXT: vfredmin.vs v8, v8, v8 +; RV32-NEXT: vfmv.f.s fa0, v8 +; RV32-NEXT: .LBB133_3: +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fminimum_v64f64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: addi a1, a0, 384 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vmfeq.vv v0, v24, v24 +; RV64-NEXT: vmfeq.vv v7, v16, v16 +; RV64-NEXT: vmerge.vvm v8, v24, v16, v0 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vle64.v v8, (a1) +; 
RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vmerge.vvm v16, v16, v24, v0 +; RV64-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vfmin.vv v24, v16, v24 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmfeq.vv v0, v16, v16 +; RV64-NEXT: vmfeq.vv v7, v8, v8 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmerge.vvm v16, v16, v8, v0 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vfmin.vv v16, v8, v16 +; RV64-NEXT: vmfeq.vv v0, v16, v16 +; RV64-NEXT: vmfeq.vv v7, v24, v24 +; RV64-NEXT: vmerge.vvm v8, v16, v24, v0 +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vmerge.vvm v16, v24, v16, v0 +; RV64-NEXT: vfmin.vv v8, v16, v8 +; RV64-NEXT: vmfne.vv v16, v8, v8 +; RV64-NEXT: vcpop.m a0, v16 +; RV64-NEXT: beqz a0, .LBB133_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: lui a0, 4095 +; RV64-NEXT: slli a0, a0, 39 +; RV64-NEXT: fmv.d.x fa0, a0 +; RV64-NEXT: j .LBB133_3 +; RV64-NEXT: .LBB133_2: +; RV64-NEXT: vfredmin.vs v8, v8, v8 +; RV64-NEXT: vfmv.f.s fa0, v8 +; RV64-NEXT: .LBB133_3: +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret %v = load <64 x double>, ptr %x %red = call double @llvm.vector.reduce.fminimum.v64f64(<64 x double> %v) ret double %red @@ -2765,21 +2939,38 @@ define float @vreduce_fmaximum_v128f32_nonans(ptr %x) { declare double @llvm.vector.reduce.fmaximum.v2f64(<2 x double>) define double @vreduce_fmaximum_v2f64(ptr %x) { -; CHECK-LABEL: vreduce_fmaximum_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmfne.vv v9, v8, v8 -; CHECK-NEXT: vcpop.m a0, v9 -; CHECK-NEXT: beqz a0, .LBB151_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI151_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI151_0)(a0) -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB151_2: -; CHECK-NEXT: vfredmax.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret +; RV32-LABEL: vreduce_fmaximum_v2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vmfne.vv v9, v8, v8 +; RV32-NEXT: vcpop.m a0, v9 +; RV32-NEXT: beqz a0, .LBB151_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: lui a0, %hi(.LCPI151_0) +; RV32-NEXT: fld fa0, %lo(.LCPI151_0)(a0) +; RV32-NEXT: ret +; RV32-NEXT: .LBB151_2: +; RV32-NEXT: vfredmax.vs v8, v8, v8 +; RV32-NEXT: vfmv.f.s fa0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fmaximum_v2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vmfne.vv v9, v8, v8 +; RV64-NEXT: vcpop.m a0, v9 +; RV64-NEXT: beqz a0, .LBB151_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: lui a0, 4095 +; RV64-NEXT: slli a0, a0, 39 +; RV64-NEXT: fmv.d.x fa0, a0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB151_2: +; 
RV64-NEXT: vfredmax.vs v8, v8, v8 +; RV64-NEXT: vfmv.f.s fa0, v8 +; RV64-NEXT: ret %v = load <2 x double>, ptr %x %red = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %v) ret double %red @@ -2801,21 +2992,38 @@ define double @vreduce_fmaximum_v2f64_nonans(ptr %x) { declare double @llvm.vector.reduce.fmaximum.v4f64(<4 x double>) define double @vreduce_fmaximum_v4f64(ptr %x) { -; CHECK-LABEL: vreduce_fmaximum_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmfne.vv v10, v8, v8 -; CHECK-NEXT: vcpop.m a0, v10 -; CHECK-NEXT: beqz a0, .LBB153_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI153_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI153_0)(a0) -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB153_2: -; CHECK-NEXT: vfredmax.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret +; RV32-LABEL: vreduce_fmaximum_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vmfne.vv v10, v8, v8 +; RV32-NEXT: vcpop.m a0, v10 +; RV32-NEXT: beqz a0, .LBB153_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: lui a0, %hi(.LCPI153_0) +; RV32-NEXT: fld fa0, %lo(.LCPI153_0)(a0) +; RV32-NEXT: ret +; RV32-NEXT: .LBB153_2: +; RV32-NEXT: vfredmax.vs v8, v8, v8 +; RV32-NEXT: vfmv.f.s fa0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fmaximum_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vmfne.vv v10, v8, v8 +; RV64-NEXT: vcpop.m a0, v10 +; RV64-NEXT: beqz a0, .LBB153_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: lui a0, 4095 +; RV64-NEXT: slli a0, a0, 39 +; RV64-NEXT: fmv.d.x fa0, a0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB153_2: +; RV64-NEXT: vfredmax.vs v8, v8, v8 +; RV64-NEXT: vfmv.f.s fa0, v8 +; RV64-NEXT: ret %v = load <4 x double>, ptr %x %red = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %v) ret double %red @@ -2837,21 +3045,38 @@ define double @vreduce_fmaximum_v4f64_nonans(ptr %x) { declare double @llvm.vector.reduce.fmaximum.v8f64(<8 x double>) define double @vreduce_fmaximum_v8f64(ptr %x) { -; CHECK-LABEL: vreduce_fmaximum_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmfne.vv v12, v8, v8 -; CHECK-NEXT: vcpop.m a0, v12 -; CHECK-NEXT: beqz a0, .LBB155_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI155_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI155_0)(a0) -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB155_2: -; CHECK-NEXT: vfredmax.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret +; RV32-LABEL: vreduce_fmaximum_v8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vmfne.vv v12, v8, v8 +; RV32-NEXT: vcpop.m a0, v12 +; RV32-NEXT: beqz a0, .LBB155_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: lui a0, %hi(.LCPI155_0) +; RV32-NEXT: fld fa0, %lo(.LCPI155_0)(a0) +; RV32-NEXT: ret +; RV32-NEXT: .LBB155_2: +; RV32-NEXT: vfredmax.vs v8, v8, v8 +; RV32-NEXT: vfmv.f.s fa0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fmaximum_v8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vmfne.vv v12, v8, v8 +; RV64-NEXT: vcpop.m a0, v12 +; RV64-NEXT: beqz a0, .LBB155_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: lui a0, 4095 +; RV64-NEXT: slli a0, a0, 39 +; RV64-NEXT: fmv.d.x fa0, a0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB155_2: +; RV64-NEXT: vfredmax.vs v8, v8, v8 +; RV64-NEXT: vfmv.f.s fa0, v8 +; RV64-NEXT: ret %v 
= load <8 x double>, ptr %x %red = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> %v) ret double %red @@ -2873,21 +3098,38 @@ define double @vreduce_fmaximum_v8f64_nonans(ptr %x) { declare double @llvm.vector.reduce.fmaximum.v16f64(<16 x double>) define double @vreduce_fmaximum_v16f64(ptr %x) { -; CHECK-LABEL: vreduce_fmaximum_v16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmfne.vv v16, v8, v8 -; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB157_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI157_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI157_0)(a0) -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB157_2: -; CHECK-NEXT: vfredmax.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret +; RV32-LABEL: vreduce_fmaximum_v16f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: vmfne.vv v16, v8, v8 +; RV32-NEXT: vcpop.m a0, v16 +; RV32-NEXT: beqz a0, .LBB157_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: lui a0, %hi(.LCPI157_0) +; RV32-NEXT: fld fa0, %lo(.LCPI157_0)(a0) +; RV32-NEXT: ret +; RV32-NEXT: .LBB157_2: +; RV32-NEXT: vfredmax.vs v8, v8, v8 +; RV32-NEXT: vfmv.f.s fa0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fmaximum_v16f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: vmfne.vv v16, v8, v8 +; RV64-NEXT: vcpop.m a0, v16 +; RV64-NEXT: beqz a0, .LBB157_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: lui a0, 4095 +; RV64-NEXT: slli a0, a0, 39 +; RV64-NEXT: fmv.d.x fa0, a0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB157_2: +; RV64-NEXT: vfredmax.vs v8, v8, v8 +; RV64-NEXT: vfmv.f.s fa0, v8 +; RV64-NEXT: ret %v = load <16 x double>, ptr %x %red = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> %v) ret double %red @@ -2909,29 +3151,54 @@ define double @vreduce_fmaximum_v16f64_nonans(ptr %x) { declare double @llvm.vector.reduce.fmaximum.v32f64(<32 x double>) define double @vreduce_fmaximum_v32f64(ptr %x) { -; CHECK-LABEL: vreduce_fmaximum_v32f64: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: vle64.v v24, (a1) -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 -; CHECK-NEXT: vfmax.vv v8, v16, v8 -; CHECK-NEXT: vmfne.vv v16, v8, v8 -; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB159_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI159_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI159_0)(a0) -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB159_2: -; CHECK-NEXT: vfredmax.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret +; RV32-LABEL: vreduce_fmaximum_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: addi a1, a0, 128 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vle64.v v16, (a0) +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: vmfeq.vv v0, v16, v16 +; RV32-NEXT: vmfeq.vv v7, v24, v24 +; RV32-NEXT: vmerge.vvm v8, v16, v24, v0 +; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 +; RV32-NEXT: vfmax.vv v8, v16, v8 +; RV32-NEXT: vmfne.vv v16, v8, v8 +; RV32-NEXT: vcpop.m a0, v16 +; RV32-NEXT: beqz a0, .LBB159_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: lui a0, %hi(.LCPI159_0) +; RV32-NEXT: fld fa0, %lo(.LCPI159_0)(a0) +; RV32-NEXT: ret +; RV32-NEXT: .LBB159_2: +; RV32-NEXT: vfredmax.vs v8, v8, v8 +; RV32-NEXT: 
vfmv.f.s fa0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fmaximum_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle64.v v16, (a0) +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: vmfeq.vv v0, v16, v16 +; RV64-NEXT: vmfeq.vv v7, v24, v24 +; RV64-NEXT: vmerge.vvm v8, v16, v24, v0 +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vmerge.vvm v16, v24, v16, v0 +; RV64-NEXT: vfmax.vv v8, v16, v8 +; RV64-NEXT: vmfne.vv v16, v8, v8 +; RV64-NEXT: vcpop.m a0, v16 +; RV64-NEXT: beqz a0, .LBB159_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: lui a0, 4095 +; RV64-NEXT: slli a0, a0, 39 +; RV64-NEXT: fmv.d.x fa0, a0 +; RV64-NEXT: ret +; RV64-NEXT: .LBB159_2: +; RV64-NEXT: vfredmax.vs v8, v8, v8 +; RV64-NEXT: vfmv.f.s fa0, v8 +; RV64-NEXT: ret %v = load <32 x double>, ptr %x %red = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> %v) ret double %red @@ -2956,85 +3223,166 @@ define double @vreduce_fmaximum_v32f64_nonans(ptr %x) { declare double @llvm.vector.reduce.fmaximum.v64f64(<64 x double>) define double @vreduce_fmaximum_v64f64(ptr %x) { -; CHECK-LABEL: vreduce_fmaximum_v64f64: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb -; CHECK-NEXT: addi a1, a0, 128 -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v24, (a1) -; CHECK-NEXT: addi a1, a0, 384 -; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: addi a1, a0, 256 -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vmfeq.vv v0, v24, v24 -; CHECK-NEXT: vmfeq.vv v7, v16, v16 -; CHECK-NEXT: vmerge.vvm v8, v24, v16, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vle64.v v8, (a1) -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 -; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfmax.vv v24, v16, v24 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v8, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vmerge.vvm v16, v16, v8, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfmax.vv v16, v8, v16 -; CHECK-NEXT: vmfeq.vv v0, v16, v16 -; CHECK-NEXT: vmfeq.vv v7, v24, v24 -; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 -; CHECK-NEXT: vfmax.vv v8, v16, v8 -; CHECK-NEXT: vmfne.vv v16, v8, 
v8 -; CHECK-NEXT: vcpop.m a0, v16 -; CHECK-NEXT: beqz a0, .LBB161_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: lui a0, %hi(.LCPI161_0) -; CHECK-NEXT: fld fa0, %lo(.LCPI161_0)(a0) -; CHECK-NEXT: j .LBB161_3 -; CHECK-NEXT: .LBB161_2: -; CHECK-NEXT: vfredmax.vs v8, v8, v8 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: .LBB161_3: -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: .cfi_def_cfa sp, 16 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: ret +; RV32-LABEL: vreduce_fmaximum_v64f64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: sub sp, sp, a1 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV32-NEXT: addi a1, a0, 128 +; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-NEXT: vle64.v v24, (a1) +; RV32-NEXT: addi a1, a0, 384 +; RV32-NEXT: vle64.v v16, (a1) +; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: vle64.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmfeq.vv v0, v24, v24 +; RV32-NEXT: vmfeq.vv v7, v16, v16 +; RV32-NEXT: vmerge.vvm v8, v24, v16, v0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: vle64.v v8, (a1) +; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vmerge.vvm v16, v16, v24, v0 +; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vfmax.vv v24, v16, v24 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmfeq.vv v0, v16, v16 +; RV32-NEXT: vmfeq.vv v7, v8, v8 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmerge.vvm v16, v16, v8, v0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 3 +; RV32-NEXT: add a0, sp, a0 +; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: addi a0, sp, 16 +; RV32-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV32-NEXT: vfmax.vv v16, v8, v16 +; RV32-NEXT: vmfeq.vv v0, v16, v16 +; RV32-NEXT: vmfeq.vv v7, v24, v24 +; RV32-NEXT: vmerge.vvm v8, v16, v24, v0 +; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vmerge.vvm v16, v24, v16, v0 +; RV32-NEXT: vfmax.vv v8, v16, v8 +; RV32-NEXT: vmfne.vv v16, v8, v8 +; RV32-NEXT: vcpop.m a0, v16 +; RV32-NEXT: beqz a0, .LBB161_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: lui a0, %hi(.LCPI161_0) +; RV32-NEXT: fld fa0, %lo(.LCPI161_0)(a0) +; RV32-NEXT: j .LBB161_3 +; RV32-NEXT: .LBB161_2: +; RV32-NEXT: vfredmax.vs v8, v8, v8 +; RV32-NEXT: vfmv.f.s fa0, v8 +; RV32-NEXT: .LBB161_3: +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: slli a0, a0, 4 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: .cfi_def_cfa sp, 16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: vreduce_fmaximum_v64f64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: 
.cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a1, a1, 4 +; RV64-NEXT: sub sp, sp, a1 +; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vle64.v v24, (a1) +; RV64-NEXT: addi a1, a0, 384 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: addi a1, a0, 256 +; RV64-NEXT: vle64.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vmfeq.vv v0, v24, v24 +; RV64-NEXT: vmfeq.vv v7, v16, v16 +; RV64-NEXT: vmerge.vvm v8, v24, v16, v0 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vle64.v v8, (a1) +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vmerge.vvm v16, v16, v24, v0 +; RV64-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vfmax.vv v24, v16, v24 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmfeq.vv v0, v16, v16 +; RV64-NEXT: vmfeq.vv v7, v8, v8 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmerge.vvm v16, v16, v8, v0 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add a0, sp, a0 +; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; RV64-NEXT: vfmax.vv v16, v8, v16 +; RV64-NEXT: vmfeq.vv v0, v16, v16 +; RV64-NEXT: vmfeq.vv v7, v24, v24 +; RV64-NEXT: vmerge.vvm v8, v16, v24, v0 +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vmerge.vvm v16, v24, v16, v0 +; RV64-NEXT: vfmax.vv v8, v16, v8 +; RV64-NEXT: vmfne.vv v16, v8, v8 +; RV64-NEXT: vcpop.m a0, v16 +; RV64-NEXT: beqz a0, .LBB161_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: lui a0, 4095 +; RV64-NEXT: slli a0, a0, 39 +; RV64-NEXT: fmv.d.x fa0, a0 +; RV64-NEXT: j .LBB161_3 +; RV64-NEXT: .LBB161_2: +; RV64-NEXT: vfredmax.vs v8, v8, v8 +; RV64-NEXT: vfmv.f.s fa0, v8 +; RV64-NEXT: .LBB161_3: +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 4 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: .cfi_def_cfa sp, 16 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: .cfi_def_cfa_offset 0 +; RV64-NEXT: ret %v = load <64 x double>, ptr %x %red = call double @llvm.vector.reduce.fmaximum.v64f64(<64 x double> %v) ret double %red diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll index 35cd789acfcc8..97cf7e6902e32 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-rint-vp.ll @@ -1,18 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v 
-target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s declare <2 x half> @llvm.vp.rint.v2f16(<2 x half>, <2 x i1>, i32) define <2 x half> @vp_rint_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v2f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -28,10 +29,11 @@ define <2 x half> @vp_rint_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) define <2 x half> @vp_rint_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v2f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -47,10 +49,11 @@ declare <4 x half> @llvm.vp.rint.v4f16(<4 x half>, <4 x i1>, i32) define <4 x half> @vp_rint_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -66,10 +69,11 @@ define <4 x half> @vp_rint_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) define <4 x half> @vp_rint_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v4f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -85,10 +89,11 @@ declare <8 x half> @llvm.vp.rint.v8f16(<8 x half>, <8 x i1>, i32) define <8 x half> @vp_rint_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8, v0.t +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -104,10 +109,11 @@ define <8 x half> @vp_rint_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) define <8 x half> @vp_rint_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v8f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 
+; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -125,9 +131,10 @@ define <16 x half> @vp_rint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI6_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; CHECK-NEXT: vfabs.v v12, v8, v0.t +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 @@ -144,10 +151,11 @@ define <16 x half> @vp_rint_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext %e define <16 x half> @vp_rint_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; CHECK-LABEL: vp_rint_v16f16_unmasked: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI7_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI7_0)(a1) ; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -317,37 +325,67 @@ define <16 x float> @vp_rint_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl) declare <2 x double> @llvm.vp.rint.v2f64(<2 x double>, <2 x i1>, i32) define <2 x double> @vp_rint_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI16_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vp_rint_v2f64: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI16_0) +; RV32-NEXT: fld fa5, %lo(.LCPI16_0)(a1) +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vfabs.v v9, v8, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_rint_v2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vfabs.v v9, v8, v0.t +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %v = call <2 x double> @llvm.vp.rint.v2f64(<2 x double> %va, <2 x i1> %m, i32 %evl) ret <2 x double> %v } define <2 x double> @vp_rint_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_v2f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) -; CHECK-NEXT: vsetvli zero, 
a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vp_rint_v2f64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI17_0) +; RV32-NEXT: fld fa5, %lo(.LCPI17_0)(a1) +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_rint_v2f64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %v = call <2 x double> @llvm.vp.rint.v2f64(<2 x double> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x double> %v } @@ -355,39 +393,71 @@ define <2 x double> @vp_rint_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) declare <4 x double> @llvm.vp.rint.v4f64(<4 x double>, <4 x i1>, i32) define <4 x double> @vp_rint_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vp_rint_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vmv1r.v v10, v0 +; RV32-NEXT: lui a0, %hi(.LCPI18_0) +; RV32-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32-NEXT: vfabs.v v12, v8, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32-NEXT: vmv1r.v v0, v10 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_rint_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64-NEXT: vmv1r.v v10, v0 +; RV64-NEXT: vfabs.v v12, v8, v0.t +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64-NEXT: vmv1r.v v0, v10 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: ret %v = call <4 x double> @llvm.vp.rint.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl) ret <4 x double> %v } define <4 x double> @vp_rint_v4f64_unmasked(<4 x double> %va, i32 
zeroext %evl) { -; CHECK-LABEL: vp_rint_v4f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vp_rint_v4f64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI19_0) +; RV32-NEXT: fld fa5, %lo(.LCPI19_0)(a1) +; RV32-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32-NEXT: vfabs.v v10, v8 +; RV32-NEXT: vmflt.vf v0, v10, fa5 +; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_rint_v4f64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64-NEXT: vfabs.v v10, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v10, fa5 +; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %v = call <4 x double> @llvm.vp.rint.v4f64(<4 x double> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x double> %v } @@ -395,39 +465,71 @@ define <4 x double> @vp_rint_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) declare <8 x double> @llvm.vp.rint.v8f64(<8 x double>, <8 x i1>, i32) define <8 x double> @vp_rint_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vp_rint_v8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vmv1r.v v12, v0 +; RV32-NEXT: lui a0, %hi(.LCPI20_0) +; RV32-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32-NEXT: vfabs.v v16, v8, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32-NEXT: vmv1r.v v0, v12 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_rint_v8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64-NEXT: vmv1r.v v12, v0 +; RV64-NEXT: vfabs.v v16, v8, v0.t +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64-NEXT: vmv1r.v v0, v12 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v16, 
v8, v0.t +; RV64-NEXT: ret %v = call <8 x double> @llvm.vp.rint.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl) ret <8 x double> %v } define <8 x double> @vp_rint_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_v8f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI21_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vp_rint_v8f64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI21_0) +; RV32-NEXT: fld fa5, %lo(.LCPI21_0)(a1) +; RV32-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32-NEXT: vfabs.v v12, v8 +; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_rint_v8f64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: ret %v = call <8 x double> @llvm.vp.rint.v8f64(<8 x double> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x double> %v } @@ -435,39 +537,71 @@ define <8 x double> @vp_rint_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) declare <15 x double> @llvm.vp.rint.v15f64(<15 x double>, <15 x i1>, i32) define <15 x double> @vp_rint_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_v15f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI22_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vp_rint_v15f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v16, v0 +; RV32-NEXT: lui a0, %hi(.LCPI22_0) +; RV32-NEXT: fld fa5, %lo(.LCPI22_0)(a0) +; RV32-NEXT: vfabs.v v24, v8, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32-NEXT: vmv1r.v v0, v16 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_rint_v15f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v16, v0 +; RV64-NEXT: vfabs.v v24, v8, v0.t +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64-NEXT: vmv1r.v v0, v16 +; 
RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64-NEXT: ret %v = call <15 x double> @llvm.vp.rint.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl) ret <15 x double> %v } define <15 x double> @vp_rint_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_v15f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vp_rint_v15f64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI23_0) +; RV32-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vfabs.v v16, v8 +; RV32-NEXT: vmflt.vf v0, v16, fa5 +; RV32-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_rint_v15f64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vfabs.v v16, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v16, fa5 +; RV64-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64-NEXT: ret %v = call <15 x double> @llvm.vp.rint.v15f64(<15 x double> %va, <15 x i1> splat (i1 true), i32 %evl) ret <15 x double> %v } @@ -475,39 +609,71 @@ define <15 x double> @vp_rint_v15f64_unmasked(<15 x double> %va, i32 zeroext %ev declare <16 x double> @llvm.vp.rint.v16f64(<16 x double>, <16 x i1>, i32) define <16 x double> @vp_rint_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_v16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI24_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vp_rint_v16f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vmv1r.v v16, v0 +; RV32-NEXT: lui a0, %hi(.LCPI24_0) +; RV32-NEXT: fld fa5, %lo(.LCPI24_0)(a0) +; RV32-NEXT: vfabs.v v24, v8, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32-NEXT: vmv1r.v v0, v16 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_rint_v16f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vmv1r.v v16, v0 +; RV64-NEXT: vfabs.v 
v24, v8, v0.t +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64-NEXT: vmv1r.v v0, v16 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64-NEXT: ret %v = call <16 x double> @llvm.vp.rint.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl) ret <16 x double> %v } define <16 x double> @vp_rint_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_v16f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vp_rint_v16f64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI25_0) +; RV32-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vfabs.v v16, v8 +; RV32-NEXT: vmflt.vf v0, v16, fa5 +; RV32-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_rint_v16f64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vfabs.v v16, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v16, fa5 +; RV64-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64-NEXT: ret %v = call <16 x double> @llvm.vp.rint.v16f64(<16 x double> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x double> %v } @@ -515,83 +681,159 @@ define <16 x double> @vp_rint_v16f64_unmasked(<16 x double> %va, i32 zeroext %ev declare <32 x double> @llvm.vp.rint.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_rint_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_v32f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v6, v0 -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB26_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: lui a1, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, 
e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vp_rint_v32f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32-NEXT: vmv1r.v v6, v0 +; RV32-NEXT: li a2, 16 +; RV32-NEXT: vslidedown.vi v7, v0, 2 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB26_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB26_2: +; RV32-NEXT: vmv1r.v v0, v6 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vfabs.v v24, v8, v0.t +; RV32-NEXT: lui a1, %hi(.LCPI26_0) +; RV32-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; RV32-NEXT: addi a1, a0, -16 +; RV32-NEXT: sltu a0, a0, a1 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32-NEXT: and a0, a0, a1 +; RV32-NEXT: vmv1r.v v0, v6 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vfabs.v v24, v16, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_rint_v32f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64-NEXT: vmv1r.v v6, v0 +; RV64-NEXT: li a2, 16 +; RV64-NEXT: vslidedown.vi v7, v0, 2 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB26_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB26_2: +; RV64-NEXT: vmv1r.v v0, v6 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vfabs.v v24, v8, v0.t +; RV64-NEXT: li a1, 1075 +; RV64-NEXT: slli a1, a1, 52 +; RV64-NEXT: fmv.d.x fa5, a1 +; RV64-NEXT: addi a1, a0, -16 +; RV64-NEXT: sltu a0, a0, a1 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64-NEXT: and a0, a0, a1 +; RV64-NEXT: vmv1r.v v0, v6 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vfabs.v v24, v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64-NEXT: ret %v = call <32 x double> @llvm.vp.rint.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl) ret <32 x double> %v } define <32 x double> @vp_rint_v32f64_unmasked(<32 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_v32f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: 
mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB27_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: lui a2, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) -; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: sltu a0, a0, a2 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a2 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v7, v24, fa5 -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: vp_rint_v32f64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: li a2, 16 +; RV32-NEXT: mv a1, a0 +; RV32-NEXT: bltu a0, a2, .LBB27_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a1, 16 +; RV32-NEXT: .LBB27_2: +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vfabs.v v24, v8 +; RV32-NEXT: lui a2, %hi(.LCPI27_0) +; RV32-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; RV32-NEXT: addi a2, a0, -16 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: addi a0, a0, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: vmflt.vf v0, v24, fa5 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vfabs.v v24, v16 +; RV32-NEXT: vmflt.vf v7, v24, fa5 +; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32-NEXT: vmv1r.v v0, v7 +; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: vp_rint_v32f64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: li a2, 16 +; RV64-NEXT: mv a1, a0 +; RV64-NEXT: bltu a0, a2, .LBB27_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a1, 16 +; RV64-NEXT: .LBB27_2: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vfabs.v v24, v8 +; RV64-NEXT: li a2, 1075 +; RV64-NEXT: slli a2, a2, 52 +; RV64-NEXT: fmv.d.x fa5, a2 +; RV64-NEXT: addi a2, a0, -16 +; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: addi a0, a0, -1 +; RV64-NEXT: vmflt.vf v0, v24, fa5 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vfabs.v v24, v16 +; RV64-NEXT: vmflt.vf v7, v24, fa5 +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64-NEXT: vmv1r.v v0, v7 +; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64-NEXT: ret %v = call <32 x double> @llvm.vp.rint.v32f64(<32 x double> %va, <32 x i1> splat (i1 true), i32 %evl) ret <32 x double> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll index d8ff7062f033e..16c8b2b9da682 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-round-vp.ll @@ -1,22 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN declare <2 x half> @llvm.vp.round.v2f16(<2 x half>, <2 x i1>, i32) define <2 x half> @vp_round_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_v2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 4 @@ -59,10 +60,11 @@ define <2 x half> @vp_round_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) define <2 x half> @vp_round_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_v2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -99,10 +101,11 @@ declare <4 x half> @llvm.vp.round.v4f16(<4 x half>, <4 x i1>, i32) define <4 x half> @vp_round_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_v4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 4 @@ -145,10 +148,11 @@ define <4 x half> @vp_round_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) define <4 x half> @vp_round_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_v4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -185,10 
+189,11 @@ declare <8 x half> @llvm.vp.round.v8f16(<8 x half>, <8 x i1>, i32) define <8 x half> @vp_round_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_v8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 4 @@ -231,10 +236,11 @@ define <8 x half> @vp_round_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) define <8 x half> @vp_round_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_v8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -273,9 +279,10 @@ define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vmv1r.v v10, v0 -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vfabs.v v12, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 4 @@ -319,10 +326,11 @@ define <16 x half> @vp_round_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroext % define <16 x half> @vp_round_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_v16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -529,41 +537,141 @@ define <16 x float> @vp_round_v16f32_unmasked(<16 x float> %va, i32 zeroext %evl declare <2 x double> @llvm.vp.round.v2f64(<2 x double>, <2 x i1>, i32) define <2 x double> @vp_round_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI16_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_v2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI16_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI16_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; 
RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_v2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_v2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI16_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI16_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_v2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <2 x double> @llvm.vp.round.v2f64(<2 x double> %va, <2 x i1> %m, i32 %evl) ret <2 x double> %v } define <2 x double> @vp_round_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_v2f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_v2f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI17_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI17_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_v2f64_unmasked: +; 
RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_v2f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI17_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI17_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_v2f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <2 x double> @llvm.vp.round.v2f64(<2 x double> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x double> %v } @@ -571,43 +679,149 @@ define <2 x double> @vp_round_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) declare <4 x double> @llvm.vp.round.v4f64(<4 x double>, <4 x i1>, i32) define <4 x double> @vp_round_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_v4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v10, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vmv1r.v v0, v10 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_v4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli 
zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v10, v0 +; RV64ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vmv1r.v v0, v10 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_v4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_v4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <4 x double> @llvm.vp.round.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl) ret <4 x double> %v } define <4 x double> @vp_round_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_v4f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_v4f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI19_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI19_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_v4f64_unmasked: +; RV64ZVFH: # %bb.0: +; 
RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_v4f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI19_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI19_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_v4f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <4 x double> @llvm.vp.round.v4f64(<4 x double> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x double> %v } @@ -615,43 +829,149 @@ define <4 x double> @vp_round_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) declare <8 x double> @llvm.vp.round.v8f64(<8 x double>, <8 x i1>, i32) define <8 x double> @vp_round_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_v8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v12, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vmv1r.v v0, v12 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_v8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli 
zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v12, v0 +; RV64ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vmv1r.v v0, v12 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_v8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_v8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <8 x double> @llvm.vp.round.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl) ret <8 x double> %v } define <8 x double> @vp_round_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_v8f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI21_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_v8f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI21_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI21_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_v8f64_unmasked: +; RV64ZVFH: # %bb.0: +; 
RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_v8f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI21_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI21_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_v8f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <8 x double> @llvm.vp.round.v8f64(<8 x double> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x double> %v } @@ -659,43 +979,149 @@ define <8 x double> @vp_round_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) declare <15 x double> @llvm.vp.round.v15f64(<15 x double>, <15 x i1>, i32) define <15 x double> @vp_round_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_v15f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI22_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_v15f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI22_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI22_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_v15f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: 
vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_v15f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI22_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI22_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_v15f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <15 x double> @llvm.vp.round.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl) ret <15 x double> %v } define <15 x double> @vp_round_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_v15f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_v15f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI23_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_v15f64_unmasked: +; RV64ZVFH: 
# %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_v15f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI23_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_v15f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <15 x double> @llvm.vp.round.v15f64(<15 x double> %va, <15 x i1> splat (i1 true), i32 %evl) ret <15 x double> %v } @@ -703,43 +1129,149 @@ define <15 x double> @vp_round_v15f64_unmasked(<15 x double> %va, i32 zeroext %e declare <16 x double> @llvm.vp.round.v16f64(<16 x double>, <16 x i1>, i32) define <16 x double> @vp_round_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_v16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI24_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_v16f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI24_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI24_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_v16f64: +; RV64ZVFH: # %bb.0: 
+; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_v16f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI24_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI24_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_v16f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <16 x double> @llvm.vp.round.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl) ret <16 x double> %v } define <16 x double> @vp_round_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_v16f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_v16f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI25_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: 
vp_round_v16f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_v16f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI25_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_v16f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <16 x double> @llvm.vp.round.v16f64(<16 x double> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x double> %v } @@ -747,91 +1279,341 @@ define <16 x double> @vp_round_v16f64_unmasked(<16 x double> %va, i32 zeroext %e declare <32 x double> @llvm.vp.round.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_round_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_v32f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v6, v0 -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB26_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: lui a1, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a1, 4 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; 
CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_v32f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v6, v0 +; RV32ZVFH-NEXT: li a2, 16 +; RV32ZVFH-NEXT: vslidedown.vi v7, v0, 2 +; RV32ZVFH-NEXT: mv a1, a0 +; RV32ZVFH-NEXT: bltu a0, a2, .LBB26_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: li a1, 16 +; RV32ZVFH-NEXT: .LBB26_2: +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI26_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; RV32ZVFH-NEXT: addi a1, a0, -16 +; RV32ZVFH-NEXT: sltu a0, a0, a1 +; RV32ZVFH-NEXT: addi a0, a0, -1 +; RV32ZVFH-NEXT: and a0, a0, a1 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a1, 4 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a1 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_v32f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v6, v0 +; RV64ZVFH-NEXT: li a2, 16 +; RV64ZVFH-NEXT: vslidedown.vi v7, v0, 2 +; RV64ZVFH-NEXT: mv a1, a0 +; RV64ZVFH-NEXT: bltu a0, a2, .LBB26_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: li a1, 16 +; RV64ZVFH-NEXT: .LBB26_2: +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a1, 1075 +; RV64ZVFH-NEXT: slli a1, a1, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a1 +; RV64ZVFH-NEXT: addi a1, a0, -16 +; RV64ZVFH-NEXT: sltu a0, a0, a1 +; RV64ZVFH-NEXT: addi a0, a0, -1 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFH-NEXT: and a0, a0, a1 +; RV64ZVFH-NEXT: fsrmi a1, 4 +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a1 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, 
zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_v32f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v6, v0 +; RV32ZVFHMIN-NEXT: li a2, 16 +; RV32ZVFHMIN-NEXT: vslidedown.vi v7, v0, 2 +; RV32ZVFHMIN-NEXT: mv a1, a0 +; RV32ZVFHMIN-NEXT: bltu a0, a2, .LBB26_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: li a1, 16 +; RV32ZVFHMIN-NEXT: .LBB26_2: +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI26_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; RV32ZVFHMIN-NEXT: addi a1, a0, -16 +; RV32ZVFHMIN-NEXT: sltu a0, a0, a1 +; RV32ZVFHMIN-NEXT: addi a0, a0, -1 +; RV32ZVFHMIN-NEXT: and a0, a0, a1 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a1, 4 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a1 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_v32f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v6, v0 +; RV64ZVFHMIN-NEXT: li a2, 16 +; RV64ZVFHMIN-NEXT: vslidedown.vi v7, v0, 2 +; RV64ZVFHMIN-NEXT: mv a1, a0 +; RV64ZVFHMIN-NEXT: bltu a0, a2, .LBB26_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: li a1, 16 +; RV64ZVFHMIN-NEXT: .LBB26_2: +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a1, 1075 +; RV64ZVFHMIN-NEXT: slli a1, a1, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1 +; RV64ZVFHMIN-NEXT: addi a1, a0, -16 +; RV64ZVFHMIN-NEXT: sltu a0, a0, a1 +; RV64ZVFHMIN-NEXT: addi a0, a0, -1 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: and a0, a0, a1 +; RV64ZVFHMIN-NEXT: fsrmi a1, 4 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a1 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, 
zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <32 x double> @llvm.vp.round.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl) ret <32 x double> %v } define <32 x double> @vp_round_v32f64_unmasked(<32 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_v32f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB27_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: lui a2, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) -; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: sltu a0, a0, a2 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a2 -; CHECK-NEXT: fsrmi a2, 4 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v7, v24, fa5 -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsrmi a1, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_v32f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: li a2, 16 +; RV32ZVFH-NEXT: mv a1, a0 +; RV32ZVFH-NEXT: bltu a0, a2, .LBB27_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: li a1, 16 +; RV32ZVFH-NEXT: .LBB27_2: +; RV32ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8 +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI27_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; RV32ZVFH-NEXT: addi a2, a0, -16 +; RV32ZVFH-NEXT: sltu a0, a0, a2 +; RV32ZVFH-NEXT: addi a0, a0, -1 +; RV32ZVFH-NEXT: and a0, a0, a2 +; RV32ZVFH-NEXT: fsrmi a2, 4 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16 +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5 +; RV32ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a2 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: fsrmi a1, 4 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a1 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_v32f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: li a2, 16 +; RV64ZVFH-NEXT: mv a1, a0 +; RV64ZVFH-NEXT: bltu a0, a2, .LBB27_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: li a1, 16 +; RV64ZVFH-NEXT: .LBB27_2: +; RV64ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8 +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; 
RV64ZVFH-NEXT: addi a2, a0, -16 +; RV64ZVFH-NEXT: sltu a0, a0, a2 +; RV64ZVFH-NEXT: addi a0, a0, -1 +; RV64ZVFH-NEXT: and a0, a0, a2 +; RV64ZVFH-NEXT: fsrmi a2, 4 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16 +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5 +; RV64ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a2 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: fsrmi a1, 4 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a1 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_v32f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: li a2, 16 +; RV32ZVFHMIN-NEXT: mv a1, a0 +; RV32ZVFHMIN-NEXT: bltu a0, a2, .LBB27_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: li a1, 16 +; RV32ZVFHMIN-NEXT: .LBB27_2: +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI27_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; RV32ZVFHMIN-NEXT: addi a2, a0, -16 +; RV32ZVFHMIN-NEXT: sltu a0, a0, a2 +; RV32ZVFHMIN-NEXT: addi a0, a0, -1 +; RV32ZVFHMIN-NEXT: and a0, a0, a2 +; RV32ZVFHMIN-NEXT: fsrmi a2, 4 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV32ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5 +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a2 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a1, 4 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a1 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_v32f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: li a2, 16 +; RV64ZVFHMIN-NEXT: mv a1, a0 +; RV64ZVFHMIN-NEXT: bltu a0, a2, .LBB27_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: li a1, 16 +; RV64ZVFHMIN-NEXT: .LBB27_2: +; RV64ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: addi a2, a0, -16 +; RV64ZVFHMIN-NEXT: sltu a0, a0, a2 +; RV64ZVFHMIN-NEXT: addi a0, a0, -1 +; RV64ZVFHMIN-NEXT: and a0, a0, a2 +; RV64ZVFHMIN-NEXT: fsrmi a2, 4 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV64ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5 +; RV64ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a2 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a1, 4 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; 
RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a1 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <32 x double> @llvm.vp.round.v32f64(<32 x double> %va, <32 x i1> splat (i1 true), i32 %evl) ret <32 x double> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll index 2649f234375d2..14c550d555cf7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundeven-vp.ll @@ -1,22 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN declare <2 x half> @llvm.vp.roundeven.v2f16(<2 x half>, <2 x i1>, i32) define <2 x half> @vp_roundeven_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_v2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 0 @@ -59,10 +60,11 @@ define <2 x half> @vp_roundeven_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext % define <2 x half> @vp_roundeven_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_v2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -99,10 +101,11 @@ declare <4 x half> @llvm.vp.roundeven.v4f16(<4 x half>, <4 x i1>, i32) define <4 x half> @vp_roundeven_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_v4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: 
vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 0 @@ -145,10 +148,11 @@ define <4 x half> @vp_roundeven_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext % define <4 x half> @vp_roundeven_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_v4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -185,10 +189,11 @@ declare <8 x half> @llvm.vp.roundeven.v8f16(<8 x half>, <8 x i1>, i32) define <8 x half> @vp_roundeven_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_v8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 0 @@ -231,10 +236,11 @@ define <8 x half> @vp_roundeven_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext % define <8 x half> @vp_roundeven_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_v8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -273,9 +279,10 @@ define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vmv1r.v v10, v0 -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vfabs.v v12, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 0 @@ -319,10 +326,11 @@ define <16 x half> @vp_roundeven_v16f16(<16 x half> %va, <16 x i1> %m, i32 zeroe define <16 x half> @vp_roundeven_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_v16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -529,41 +537,141 @@ define <16 x float> @vp_roundeven_v16f32_unmasked(<16 x float> %va, i32 zeroext declare <2 x double> @llvm.vp.roundeven.v2f64(<2 x double>, <2 x i1>, i32) define <2 x double> @vp_roundeven_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI16_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; 
CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_v2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI16_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI16_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_v2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_v2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI16_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI16_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_v2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <2 x double> @llvm.vp.roundeven.v2f64(<2 x double> %va, <2 x i1> %m, i32 %evl) ret <2 x double> %v } define <2 x double> @vp_roundeven_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_v2f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; 
CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_v2f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI17_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI17_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_v2f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_v2f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI17_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI17_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_v2f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <2 x double> @llvm.vp.roundeven.v2f64(<2 x double> %va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x double> %v } @@ -571,43 +679,149 @@ define <2 x double> @vp_roundeven_v2f64_unmasked(<2 x double> %va, i32 zeroext % declare <4 x double> @llvm.vp.roundeven.v4f64(<4 x double>, <4 x i1>, i32) define <4 x double> @vp_roundeven_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_v4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: 
vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v10, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vmv1r.v v0, v10 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_v4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v10, v0 +; RV64ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vmv1r.v v0, v10 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_v4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_v4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <4 x double> @llvm.vp.roundeven.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl) ret <4 x double> %v } define <4 x double> @vp_roundeven_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_v4f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: 
vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_v4f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI19_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI19_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_v4f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_v4f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI19_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI19_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_v4f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <4 x double> @llvm.vp.roundeven.v4f64(<4 x double> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x double> %v } @@ -615,43 +829,149 @@ define <4 x double> @vp_roundeven_v4f64_unmasked(<4 x double> %va, i32 zeroext % declare <8 x double> @llvm.vp.roundeven.v8f64(<8 x double>, <8 x i1>, i32) define <8 x double> @vp_roundeven_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: 
vp_roundeven_v8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v12, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vmv1r.v v0, v12 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_v8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v12, v0 +; RV64ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vmv1r.v v0, v12 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_v8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_v8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <8 x double> @llvm.vp.roundeven.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl) ret <8 x double> %v } define <8 x double> @vp_roundeven_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_v8f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI21_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: 
vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_v8f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI21_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI21_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_v8f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_v8f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI21_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI21_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_v8f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <8 x double> @llvm.vp.roundeven.v8f64(<8 x double> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x double> %v } @@ -659,43 +979,149 @@ define <8 x double> @vp_roundeven_v8f64_unmasked(<8 x double> %va, i32 zeroext % declare <15 x double> @llvm.vp.roundeven.v15f64(<15 x double>, <15 x i1>, i32) define <15 x double> @vp_roundeven_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_v15f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI22_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: 
vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_v15f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI22_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI22_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_v15f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_v15f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI22_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI22_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_v15f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <15 x double> @llvm.vp.roundeven.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl) ret <15 x double> %v } define <15 x double> @vp_roundeven_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_v15f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; 
CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_v15f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI23_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_v15f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_v15f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI23_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_v15f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <15 x double> @llvm.vp.roundeven.v15f64(<15 x double> %va, <15 x i1> splat (i1 true), i32 %evl) ret <15 x double> %v } @@ -703,43 +1129,149 @@ define <15 x double> @vp_roundeven_v15f64_unmasked(<15 x double> %va, i32 zeroex declare <16 x double> @llvm.vp.roundeven.v16f64(<16 x double>, <16 x i1>, i32) define <16 x double> @vp_roundeven_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_v16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI24_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: 
vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_v16f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI24_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI24_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_v16f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_v16f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI24_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI24_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_v16f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <16 x double> @llvm.vp.roundeven.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl) ret <16 x double> %v } define <16 x double> @vp_roundeven_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_v16f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, 
%lo(.LCPI25_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_v16f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI25_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_v16f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_v16f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI25_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_v16f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <16 x double> @llvm.vp.roundeven.v16f64(<16 x double> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x double> %v } @@ -747,91 +1279,341 @@ define <16 x double> @vp_roundeven_v16f64_unmasked(<16 x double> %va, i32 zeroex declare <32 x double> @llvm.vp.roundeven.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_roundeven_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_v32f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v6, v0 -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB26_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, 
v8, v0.t -; CHECK-NEXT: lui a1, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a1, 0 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_v32f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v6, v0 +; RV32ZVFH-NEXT: li a2, 16 +; RV32ZVFH-NEXT: vslidedown.vi v7, v0, 2 +; RV32ZVFH-NEXT: mv a1, a0 +; RV32ZVFH-NEXT: bltu a0, a2, .LBB26_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: li a1, 16 +; RV32ZVFH-NEXT: .LBB26_2: +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI26_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; RV32ZVFH-NEXT: addi a1, a0, -16 +; RV32ZVFH-NEXT: sltu a0, a0, a1 +; RV32ZVFH-NEXT: addi a0, a0, -1 +; RV32ZVFH-NEXT: and a0, a0, a1 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a1, 0 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a1 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_v32f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v6, v0 +; RV64ZVFH-NEXT: li a2, 16 +; RV64ZVFH-NEXT: vslidedown.vi v7, v0, 2 +; RV64ZVFH-NEXT: mv a1, a0 +; RV64ZVFH-NEXT: bltu a0, a2, .LBB26_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: li a1, 16 +; RV64ZVFH-NEXT: .LBB26_2: +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a1, 1075 +; RV64ZVFH-NEXT: slli a1, a1, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a1 +; RV64ZVFH-NEXT: addi a1, a0, -16 +; RV64ZVFH-NEXT: sltu a0, a0, a1 +; 
RV64ZVFH-NEXT: addi a0, a0, -1 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFH-NEXT: and a0, a0, a1 +; RV64ZVFH-NEXT: fsrmi a1, 0 +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a1 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_v32f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v6, v0 +; RV32ZVFHMIN-NEXT: li a2, 16 +; RV32ZVFHMIN-NEXT: vslidedown.vi v7, v0, 2 +; RV32ZVFHMIN-NEXT: mv a1, a0 +; RV32ZVFHMIN-NEXT: bltu a0, a2, .LBB26_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: li a1, 16 +; RV32ZVFHMIN-NEXT: .LBB26_2: +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI26_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; RV32ZVFHMIN-NEXT: addi a1, a0, -16 +; RV32ZVFHMIN-NEXT: sltu a0, a0, a1 +; RV32ZVFHMIN-NEXT: addi a0, a0, -1 +; RV32ZVFHMIN-NEXT: and a0, a0, a1 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a1, 0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a1 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_v32f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v6, v0 +; RV64ZVFHMIN-NEXT: li a2, 16 +; RV64ZVFHMIN-NEXT: vslidedown.vi v7, v0, 2 +; RV64ZVFHMIN-NEXT: mv a1, a0 +; RV64ZVFHMIN-NEXT: bltu a0, a2, .LBB26_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: li a1, 16 +; RV64ZVFHMIN-NEXT: .LBB26_2: +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a1, 1075 +; RV64ZVFHMIN-NEXT: slli a1, a1, 52 
+; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1 +; RV64ZVFHMIN-NEXT: addi a1, a0, -16 +; RV64ZVFHMIN-NEXT: sltu a0, a0, a1 +; RV64ZVFHMIN-NEXT: addi a0, a0, -1 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: and a0, a0, a1 +; RV64ZVFHMIN-NEXT: fsrmi a1, 0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a1 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <32 x double> @llvm.vp.roundeven.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl) ret <32 x double> %v } define <32 x double> @vp_roundeven_v32f64_unmasked(<32 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_v32f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB27_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: lui a2, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) -; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: sltu a0, a0, a2 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a2 -; CHECK-NEXT: fsrmi a2, 0 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v7, v24, fa5 -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsrmi a1, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_v32f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: li a2, 16 +; RV32ZVFH-NEXT: mv a1, a0 +; RV32ZVFH-NEXT: bltu a0, a2, .LBB27_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: li a1, 16 +; RV32ZVFH-NEXT: .LBB27_2: +; RV32ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8 +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI27_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; RV32ZVFH-NEXT: addi a2, a0, -16 +; RV32ZVFH-NEXT: sltu a0, a0, a2 +; RV32ZVFH-NEXT: addi a0, a0, -1 +; RV32ZVFH-NEXT: and a0, a0, a2 +; RV32ZVFH-NEXT: fsrmi a2, 0 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16 +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5 +; RV32ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, 
ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a2 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: fsrmi a1, 0 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a1 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_v32f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: li a2, 16 +; RV64ZVFH-NEXT: mv a1, a0 +; RV64ZVFH-NEXT: bltu a0, a2, .LBB27_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: li a1, 16 +; RV64ZVFH-NEXT: .LBB27_2: +; RV64ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8 +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: addi a2, a0, -16 +; RV64ZVFH-NEXT: sltu a0, a0, a2 +; RV64ZVFH-NEXT: addi a0, a0, -1 +; RV64ZVFH-NEXT: and a0, a0, a2 +; RV64ZVFH-NEXT: fsrmi a2, 0 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16 +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5 +; RV64ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a2 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: fsrmi a1, 0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a1 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_v32f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: li a2, 16 +; RV32ZVFHMIN-NEXT: mv a1, a0 +; RV32ZVFHMIN-NEXT: bltu a0, a2, .LBB27_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: li a1, 16 +; RV32ZVFHMIN-NEXT: .LBB27_2: +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI27_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; RV32ZVFHMIN-NEXT: addi a2, a0, -16 +; RV32ZVFHMIN-NEXT: sltu a0, a0, a2 +; RV32ZVFHMIN-NEXT: addi a0, a0, -1 +; RV32ZVFHMIN-NEXT: and a0, a0, a2 +; RV32ZVFHMIN-NEXT: fsrmi a2, 0 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV32ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5 +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a2 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a1, 0 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a1 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_v32f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: 
li a2, 16 +; RV64ZVFHMIN-NEXT: mv a1, a0 +; RV64ZVFHMIN-NEXT: bltu a0, a2, .LBB27_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: li a1, 16 +; RV64ZVFHMIN-NEXT: .LBB27_2: +; RV64ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: addi a2, a0, -16 +; RV64ZVFHMIN-NEXT: sltu a0, a0, a2 +; RV64ZVFHMIN-NEXT: addi a0, a0, -1 +; RV64ZVFHMIN-NEXT: and a0, a0, a2 +; RV64ZVFHMIN-NEXT: fsrmi a2, 0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV64ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5 +; RV64ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a2 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a1, 0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a1 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <32 x double> @llvm.vp.roundeven.v32f64(<32 x double> %va, <32 x i1> splat (i1 true), i32 %evl) ret <32 x double> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll index 50e65b62e7848..16f04f14721d0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-roundtozero-vp.ll @@ -1,22 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN declare <2 x half> @llvm.vp.roundtozero.v2f16(<2 x half>, <2 x i1>, i32) define <2 x half> @vp_roundtozero_v2f16(<2 x half> %va, <2 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_v2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI0_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI0_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 1 @@ -59,10 +60,11 @@ define <2 x half> @vp_roundtozero_v2f16(<2 x half> %va, <2 x i1> %m, 
i32 zeroext define <2 x half> @vp_roundtozero_v2f16_unmasked(<2 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_v2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI1_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI1_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -99,10 +101,11 @@ declare <4 x half> @llvm.vp.roundtozero.v4f16(<4 x half>, <4 x i1>, i32) define <4 x half> @vp_roundtozero_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_v4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI2_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI2_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 1 @@ -145,10 +148,11 @@ define <4 x half> @vp_roundtozero_v4f16(<4 x half> %va, <4 x i1> %m, i32 zeroext define <4 x half> @vp_roundtozero_v4f16_unmasked(<4 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_v4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI3_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI3_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -185,10 +189,11 @@ declare <8 x half> @llvm.vp.roundtozero.v8f16(<8 x half>, <8 x i1>, i32) define <8 x half> @vp_roundtozero_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_v8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI4_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 1 @@ -231,10 +236,11 @@ define <8 x half> @vp_roundtozero_v8f16(<8 x half> %va, <8 x i1> %m, i32 zeroext define <8 x half> @vp_roundtozero_v8f16_unmasked(<8 x half> %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_v8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI5_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -273,9 +279,10 @@ define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zer ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vmv1r.v v10, v0 -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vfabs.v v12, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m2, ta, mu ; ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 1 @@ -319,10 +326,11 @@ define <16 x half> @vp_roundtozero_v16f16(<16 x half> %va, <16 x i1> %m, i32 zer define <16 x half> @vp_roundtozero_v16f16_unmasked(<16 x half> %va, i32 zeroext %evl) { ; 
ZVFH-LABEL: vp_roundtozero_v16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -529,41 +537,141 @@ define <16 x float> @vp_roundtozero_v16f32_unmasked(<16 x float> %va, i32 zeroex declare <2 x double> @llvm.vp.roundtozero.v2f64(<2 x double>, <2 x i1>, i32) define <2 x double> @vp_roundtozero_v2f64(<2 x double> %va, <2 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI16_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI16_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_v2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI16_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI16_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_v2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_v2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI16_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI16_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_v2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 
+; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <2 x double> @llvm.vp.roundtozero.v2f64(<2 x double> %va, <2 x i1> %m, i32 %evl) ret <2 x double> %v } define <2 x double> @vp_roundtozero_v2f64_unmasked(<2 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_v2f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_v2f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI17_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI17_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_v2f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_v2f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI17_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI17_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_v2f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <2 x double> @llvm.vp.roundtozero.v2f64(<2 x double> 
%va, <2 x i1> splat (i1 true), i32 %evl) ret <2 x double> %v } @@ -571,43 +679,149 @@ define <2 x double> @vp_roundtozero_v2f64_unmasked(<2 x double> %va, i32 zeroext declare <4 x double> @llvm.vp.roundtozero.v4f64(<4 x double>, <4 x i1>, i32) define <4 x double> @vp_roundtozero_v4f64(<4 x double> %va, <4 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_v4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v10, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vmv1r.v v0, v10 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_v4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v10, v0 +; RV64ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vmv1r.v v0, v10 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_v4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_v4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, 
a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <4 x double> @llvm.vp.roundtozero.v4f64(<4 x double> %va, <4 x i1> %m, i32 %evl) ret <4 x double> %v } define <4 x double> @vp_roundtozero_v4f64_unmasked(<4 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_v4f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_v4f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI19_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI19_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_v4f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_v4f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI19_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI19_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_v4f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; 
RV64ZVFHMIN-NEXT: ret %v = call <4 x double> @llvm.vp.roundtozero.v4f64(<4 x double> %va, <4 x i1> splat (i1 true), i32 %evl) ret <4 x double> %v } @@ -615,43 +829,149 @@ define <4 x double> @vp_roundtozero_v4f64_unmasked(<4 x double> %va, i32 zeroext declare <8 x double> @llvm.vp.roundtozero.v8f64(<8 x double>, <8 x i1>, i32) define <8 x double> @vp_roundtozero_v8f64(<8 x double> %va, <8 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_v8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_v8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v12, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vmv1r.v v0, v12 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_v8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v12, v0 +; RV64ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vmv1r.v v0, v12 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_v8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_v8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV64ZVFHMIN-NEXT: 
vfabs.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <8 x double> @llvm.vp.roundtozero.v8f64(<8 x double> %va, <8 x i1> %m, i32 %evl) ret <8 x double> %v } define <8 x double> @vp_roundtozero_v8f64_unmasked(<8 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_v8f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI21_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI21_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_v8f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI21_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI21_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_v8f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_v8f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI21_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI21_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_v8f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli 
zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <8 x double> @llvm.vp.roundtozero.v8f64(<8 x double> %va, <8 x i1> splat (i1 true), i32 %evl) ret <8 x double> %v } @@ -659,43 +979,149 @@ define <8 x double> @vp_roundtozero_v8f64_unmasked(<8 x double> %va, i32 zeroext declare <15 x double> @llvm.vp.roundtozero.v15f64(<15 x double>, <15 x i1>, i32) define <15 x double> @vp_roundtozero_v15f64(<15 x double> %va, <15 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_v15f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI22_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI22_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_v15f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI22_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI22_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_v15f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_v15f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI22_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI22_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_v15f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: 
vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <15 x double> @llvm.vp.roundtozero.v15f64(<15 x double> %va, <15 x i1> %m, i32 %evl) ret <15 x double> %v } define <15 x double> @vp_roundtozero_v15f64_unmasked(<15 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_v15f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI23_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI23_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_v15f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI23_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_v15f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_v15f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI23_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI23_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_v15f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t 
+; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <15 x double> @llvm.vp.roundtozero.v15f64(<15 x double> %va, <15 x i1> splat (i1 true), i32 %evl) ret <15 x double> %v } @@ -703,43 +1129,149 @@ define <15 x double> @vp_roundtozero_v15f64_unmasked(<15 x double> %va, i32 zero declare <16 x double> @llvm.vp.roundtozero.v16f64(<16 x double>, <16 x i1>, i32) define <16 x double> @vp_roundtozero_v16f64(<16 x double> %va, <16 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_v16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI24_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI24_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_v16f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI24_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI24_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_v16f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_v16f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI24_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI24_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; 
RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_v16f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <16 x double> @llvm.vp.roundtozero.v16f64(<16 x double> %va, <16 x i1> %m, i32 %evl) ret <16 x double> %v } define <16 x double> @vp_roundtozero_v16f64_unmasked(<16 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_v16f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI25_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI25_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_v16f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI25_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_v16f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_v16f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI25_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI25_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_v16f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; 
RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <16 x double> @llvm.vp.roundtozero.v16f64(<16 x double> %va, <16 x i1> splat (i1 true), i32 %evl) ret <16 x double> %v } @@ -747,91 +1279,341 @@ define <16 x double> @vp_roundtozero_v16f64_unmasked(<16 x double> %va, i32 zero declare <32 x double> @llvm.vp.roundtozero.v32f64(<32 x double>, <32 x i1>, i32) define <32 x double> @vp_roundtozero_v32f64(<32 x double> %va, <32 x i1> %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_v32f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v6, v0 -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: vslidedown.vi v7, v0, 2 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB26_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB26_2: -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: lui a1, %hi(.LCPI26_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI26_0)(a1) -; CHECK-NEXT: addi a1, a0, -16 -; CHECK-NEXT: sltu a0, a0, a1 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a1, 1 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_v32f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v6, v0 +; RV32ZVFH-NEXT: li a2, 16 +; RV32ZVFH-NEXT: vslidedown.vi v7, v0, 2 +; RV32ZVFH-NEXT: mv a1, a0 +; RV32ZVFH-NEXT: bltu a0, a2, .LBB26_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: li a1, 16 +; RV32ZVFH-NEXT: .LBB26_2: +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI26_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; RV32ZVFH-NEXT: addi a1, a0, -16 +; RV32ZVFH-NEXT: sltu a0, a0, a1 +; RV32ZVFH-NEXT: addi a0, a0, -1 +; RV32ZVFH-NEXT: and a0, a0, a1 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a1, 1 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a1 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli 
zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_v32f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v6, v0 +; RV64ZVFH-NEXT: li a2, 16 +; RV64ZVFH-NEXT: vslidedown.vi v7, v0, 2 +; RV64ZVFH-NEXT: mv a1, a0 +; RV64ZVFH-NEXT: bltu a0, a2, .LBB26_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: li a1, 16 +; RV64ZVFH-NEXT: .LBB26_2: +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a1, 1075 +; RV64ZVFH-NEXT: slli a1, a1, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a1 +; RV64ZVFH-NEXT: addi a1, a0, -16 +; RV64ZVFH-NEXT: sltu a0, a0, a1 +; RV64ZVFH-NEXT: addi a0, a0, -1 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFH-NEXT: and a0, a0, a1 +; RV64ZVFH-NEXT: fsrmi a1, 1 +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a1 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_v32f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v6, v0 +; RV32ZVFHMIN-NEXT: li a2, 16 +; RV32ZVFHMIN-NEXT: vslidedown.vi v7, v0, 2 +; RV32ZVFHMIN-NEXT: mv a1, a0 +; RV32ZVFHMIN-NEXT: bltu a0, a2, .LBB26_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: li a1, 16 +; RV32ZVFHMIN-NEXT: .LBB26_2: +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI26_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI26_0)(a1) +; RV32ZVFHMIN-NEXT: addi a1, a0, -16 +; RV32ZVFHMIN-NEXT: sltu a0, a0, a1 +; RV32ZVFHMIN-NEXT: addi a0, a0, -1 +; RV32ZVFHMIN-NEXT: and a0, a0, a1 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a1, 1 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a1 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; 
RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_v32f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v6, v0 +; RV64ZVFHMIN-NEXT: li a2, 16 +; RV64ZVFHMIN-NEXT: vslidedown.vi v7, v0, 2 +; RV64ZVFHMIN-NEXT: mv a1, a0 +; RV64ZVFHMIN-NEXT: bltu a0, a2, .LBB26_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: li a1, 16 +; RV64ZVFHMIN-NEXT: .LBB26_2: +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a1, 1075 +; RV64ZVFHMIN-NEXT: slli a1, a1, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a1 +; RV64ZVFHMIN-NEXT: addi a1, a0, -16 +; RV64ZVFHMIN-NEXT: sltu a0, a0, a1 +; RV64ZVFHMIN-NEXT: addi a0, a0, -1 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: and a0, a0, a1 +; RV64ZVFHMIN-NEXT: fsrmi a1, 1 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a1 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <32 x double> @llvm.vp.roundtozero.v32f64(<32 x double> %va, <32 x i1> %m, i32 %evl) ret <32 x double> %v } define <32 x double> @vp_roundtozero_v32f64_unmasked(<32 x double> %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_v32f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 16 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB27_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB27_2: -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: lui a2, %hi(.LCPI27_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI27_0)(a2) -; CHECK-NEXT: addi a2, a0, -16 -; CHECK-NEXT: sltu a0, a0, a2 -; CHECK-NEXT: addi a0, a0, -1 -; CHECK-NEXT: and a0, a0, a2 -; CHECK-NEXT: fsrmi a2, 1 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v7, v24, fa5 -; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsrmi a1, 1 
-; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a1 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_v32f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: li a2, 16 +; RV32ZVFH-NEXT: mv a1, a0 +; RV32ZVFH-NEXT: bltu a0, a2, .LBB27_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: li a1, 16 +; RV32ZVFH-NEXT: .LBB27_2: +; RV32ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8 +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI27_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; RV32ZVFH-NEXT: addi a2, a0, -16 +; RV32ZVFH-NEXT: sltu a0, a0, a2 +; RV32ZVFH-NEXT: addi a0, a0, -1 +; RV32ZVFH-NEXT: and a0, a0, a2 +; RV32ZVFH-NEXT: fsrmi a2, 1 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16 +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5 +; RV32ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a2 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: fsrmi a1, 1 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a1 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_v32f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: li a2, 16 +; RV64ZVFH-NEXT: mv a1, a0 +; RV64ZVFH-NEXT: bltu a0, a2, .LBB27_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: li a1, 16 +; RV64ZVFH-NEXT: .LBB27_2: +; RV64ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8 +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: addi a2, a0, -16 +; RV64ZVFH-NEXT: sltu a0, a0, a2 +; RV64ZVFH-NEXT: addi a0, a0, -1 +; RV64ZVFH-NEXT: and a0, a0, a2 +; RV64ZVFH-NEXT: fsrmi a2, 1 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16 +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5 +; RV64ZVFH-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a2 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: fsrmi a1, 1 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a1 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_v32f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: li a2, 16 +; RV32ZVFHMIN-NEXT: mv a1, a0 +; RV32ZVFHMIN-NEXT: bltu a0, a2, .LBB27_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: li a1, 16 +; RV32ZVFHMIN-NEXT: .LBB27_2: +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8 +; 
RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI27_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI27_0)(a2) +; RV32ZVFHMIN-NEXT: addi a2, a0, -16 +; RV32ZVFHMIN-NEXT: sltu a0, a0, a2 +; RV32ZVFHMIN-NEXT: addi a0, a0, -1 +; RV32ZVFHMIN-NEXT: and a0, a0, a2 +; RV32ZVFHMIN-NEXT: fsrmi a2, 1 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV32ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5 +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a2 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a1, 1 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a1 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_v32f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: li a2, 16 +; RV64ZVFHMIN-NEXT: mv a1, a0 +; RV64ZVFHMIN-NEXT: bltu a0, a2, .LBB27_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: li a1, 16 +; RV64ZVFHMIN-NEXT: .LBB27_2: +; RV64ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: addi a2, a0, -16 +; RV64ZVFHMIN-NEXT: sltu a0, a0, a2 +; RV64ZVFHMIN-NEXT: addi a0, a0, -1 +; RV64ZVFHMIN-NEXT: and a0, a0, a2 +; RV64ZVFHMIN-NEXT: fsrmi a2, 1 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV64ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5 +; RV64ZVFHMIN-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a2 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a1, 1 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a1 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call <32 x double> @llvm.vp.roundtozero.v32f64(<32 x double> %va, <32 x i1> splat (i1 true), i32 %evl) ret <32 x double> %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-fp.ll index af79ace04cf54..965d0b0fe0f9b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-fp.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck %s +; RUN: 
llc -mtriple=riscv32 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s +; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s +; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zvfbfmin,+v -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s define <4 x bfloat> @shuffle_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; CHECK-LABEL: shuffle_v4bf16: @@ -39,29 +39,49 @@ define <8 x float> @shuffle_v8f32(<8 x float> %x, <8 x float> %y) { } define <4 x double> @shuffle_fv_v4f64(<4 x double> %x) { -; CHECK-LABEL: shuffle_fv_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI3_0)(a0) -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 9 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfmerge.vfm v8, v8, fa5, v0 -; CHECK-NEXT: ret +; RV32-LABEL: shuffle_fv_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: lui a0, %hi(.LCPI3_0) +; RV32-NEXT: fld fa5, %lo(.LCPI3_0)(a0) +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 9 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vfmerge.vfm v8, v8, fa5, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: shuffle_fv_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 9 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: slli a0, a0, 62 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64-NEXT: ret %s = shufflevector <4 x double> , <4 x double> %x, <4 x i32> ret <4 x double> %s } define <4 x double> @shuffle_vf_v4f64(<4 x double> %x) { -; CHECK-LABEL: shuffle_vf_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI4_0)(a0) -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 6 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfmerge.vfm v8, v8, fa5, v0 -; CHECK-NEXT: ret +; RV32-LABEL: shuffle_vf_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: lui a0, %hi(.LCPI4_0) +; RV32-NEXT: fld fa5, %lo(.LCPI4_0)(a0) +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 6 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vfmerge.vfm v8, v8, fa5, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: shuffle_vf_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 6 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: slli a0, a0, 62 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64-NEXT: ret %s = shufflevector <4 x double> %x, <4 x double> , <4 x i32> ret <4 x double> %s } @@ -79,15 +99,25 @@ define <4 x float> @vfmerge_constant_v4f32(<4 x float> %x) { } define <4 x double> @vfmerge_constant_v4f64(<4 x double> %x) { -; CHECK-LABEL: vfmerge_constant_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI6_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI6_0)(a0) -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 6 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfmerge.vfm v8, v8, fa5, v0 -; CHECK-NEXT: ret +; RV32-LABEL: vfmerge_constant_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: lui a0, %hi(.LCPI6_0) +; RV32-NEXT: fld fa5, %lo(.LCPI6_0)(a0) +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 6 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, 
ma +; RV32-NEXT: vfmerge.vfm v8, v8, fa5, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: vfmerge_constant_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 6 +; RV64-NEXT: lui a0, 4101 +; RV64-NEXT: slli a0, a0, 38 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64-NEXT: ret %s = shufflevector <4 x double> %x, <4 x double> , <4 x i32> ret <4 x double> %s } @@ -161,40 +191,71 @@ define <4 x double> @vrgather_shuffle_vv_v4f64(<4 x double> %x, <4 x double> %y) } define <4 x double> @vrgather_shuffle_xv_v4f64(<4 x double> %x) { -; CHECK-LABEL: vrgather_shuffle_xv_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 8 -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) -; CHECK-NEXT: vmv2r.v v10, v8 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vslideup.vi v10, v8, 2, v0.t -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfmv.v.f v8, fa5 -; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 -; CHECK-NEXT: ret +; RV32-LABEL: vrgather_shuffle_xv_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 8 +; RV32-NEXT: lui a0, %hi(.LCPI12_0) +; RV32-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32-NEXT: vmv2r.v v10, v8 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32-NEXT: vslideup.vi v10, v8, 2, v0.t +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 12 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vfmv.v.f v8, fa5 +; RV32-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: vrgather_shuffle_xv_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 8 +; RV64-NEXT: vmv2r.v v10, v8 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64-NEXT: vslideup.vi v10, v8, 2, v0.t +; RV64-NEXT: slli a0, a0, 62 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 12 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v8, a0 +; RV64-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV64-NEXT: ret %s = shufflevector <4 x double> , <4 x double> %x, <4 x i32> ret <4 x double> %s } define <4 x double> @vrgather_shuffle_vx_v4f64(<4 x double> %x) { -; CHECK-LABEL: vrgather_shuffle_vx_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 2 -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 2, v0.t -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 3 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfmv.v.f v10, fa5 -; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 -; CHECK-NEXT: ret +; RV32-LABEL: vrgather_shuffle_vx_v4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 2 +; RV32-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV32-NEXT: vslidedown.vi v8, v8, 2, v0.t +; RV32-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV32-NEXT: vmv.v.i v0, 3 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vfmv.v.f v10, fa5 +; RV32-NEXT: vmerge.vvm v8, v10, v8, v0 +; RV32-NEXT: ret +; +; RV64-LABEL: vrgather_shuffle_vx_v4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli 
zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 2 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64-NEXT: vslidedown.vi v8, v8, 2, v0.t +; RV64-NEXT: slli a0, a0, 62 +; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; RV64-NEXT: vmv.v.i v0, 3 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vmv.v.x v10, a0 +; RV64-NEXT: vmerge.vvm v8, v10, v8, v0 +; RV64-NEXT: ret %s = shufflevector <4 x double> %x, <4 x double> , <4 x i32> ret <4 x double> %s } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll index c76aa7c4d317d..5c17283cacd1b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll @@ -3,8 +3,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64 ; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh,+zvkb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-V ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+zvkb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-V -; RUN: llc -mtriple=riscv32 -mattr=+zve32x,+zvfh,+zvkb,+zvl64b -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-ZVE32X -; RUN: llc -mtriple=riscv64 -mattr=+zve32x,+zvfh,+zvkb,+zvl64b -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-ZVE32X +; RUN: llc -mtriple=riscv32 -mattr=+zve32x,+zvfh,+zvkb,+zvl64b -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-ZVE32X,RV32ZVKB-ZVE32X +; RUN: llc -mtriple=riscv64 -mattr=+zve32x,+zvfh,+zvkb,+zvl64b -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-ZVE32X,RV64ZVKB-ZVE32X define <8 x i1> @shuffle_v8i1_as_i8_1(<8 x i1> %v) { ; CHECK-LABEL: shuffle_v8i1_as_i8_1: @@ -926,6 +926,136 @@ define <8 x i64> @shuffle_v8i64_as_i128(<8 x i64> %v) { ; ZVKB-V-NEXT: vslideup.vi v12, v8, 1, v0.t ; ZVKB-V-NEXT: vmv.v.v v8, v12 ; ZVKB-V-NEXT: ret +; +; RV32ZVKB-ZVE32X-LABEL: shuffle_v8i64_as_i128: +; RV32ZVKB-ZVE32X: # %bb.0: +; RV32ZVKB-ZVE32X-NEXT: addi sp, sp, -128 +; RV32ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 128 +; RV32ZVKB-ZVE32X-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32ZVKB-ZVE32X-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32ZVKB-ZVE32X-NEXT: sw s2, 116(sp) # 4-byte Folded Spill +; RV32ZVKB-ZVE32X-NEXT: sw s3, 112(sp) # 4-byte Folded Spill +; RV32ZVKB-ZVE32X-NEXT: .cfi_offset ra, -4 +; RV32ZVKB-ZVE32X-NEXT: .cfi_offset s0, -8 +; RV32ZVKB-ZVE32X-NEXT: .cfi_offset s2, -12 +; RV32ZVKB-ZVE32X-NEXT: .cfi_offset s3, -16 +; RV32ZVKB-ZVE32X-NEXT: addi s0, sp, 128 +; RV32ZVKB-ZVE32X-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVKB-ZVE32X-NEXT: andi sp, sp, -64 +; RV32ZVKB-ZVE32X-NEXT: lw a2, 0(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a3, 4(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a4, 8(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a5, 12(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a6, 16(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a7, 20(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t0, 24(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t1, 28(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t2, 48(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t3, 52(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t4, 56(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t5, 60(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t6, 32(a1) +; RV32ZVKB-ZVE32X-NEXT: lw s2, 36(a1) +; RV32ZVKB-ZVE32X-NEXT: lw s3, 40(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a1, 44(a1) +; RV32ZVKB-ZVE32X-NEXT: sw t4, 48(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t5, 52(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t2, 56(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t3, 60(sp) +; RV32ZVKB-ZVE32X-NEXT: sw s3, 32(sp) +; 
RV32ZVKB-ZVE32X-NEXT: sw a1, 36(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t6, 40(sp) +; RV32ZVKB-ZVE32X-NEXT: sw s2, 44(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t0, 16(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t1, 20(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a6, 24(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a7, 28(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a4, 0(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a5, 4(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a2, 8(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a3, 12(sp) +; RV32ZVKB-ZVE32X-NEXT: mv a1, sp +; RV32ZVKB-ZVE32X-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32ZVKB-ZVE32X-NEXT: vle32.v v8, (a1) +; RV32ZVKB-ZVE32X-NEXT: vse32.v v8, (a0) +; RV32ZVKB-ZVE32X-NEXT: addi sp, s0, -128 +; RV32ZVKB-ZVE32X-NEXT: .cfi_def_cfa sp, 128 +; RV32ZVKB-ZVE32X-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32ZVKB-ZVE32X-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32ZVKB-ZVE32X-NEXT: lw s2, 116(sp) # 4-byte Folded Reload +; RV32ZVKB-ZVE32X-NEXT: lw s3, 112(sp) # 4-byte Folded Reload +; RV32ZVKB-ZVE32X-NEXT: .cfi_restore ra +; RV32ZVKB-ZVE32X-NEXT: .cfi_restore s0 +; RV32ZVKB-ZVE32X-NEXT: .cfi_restore s2 +; RV32ZVKB-ZVE32X-NEXT: .cfi_restore s3 +; RV32ZVKB-ZVE32X-NEXT: addi sp, sp, 128 +; RV32ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 0 +; RV32ZVKB-ZVE32X-NEXT: ret +; +; RV64ZVKB-ZVE32X-LABEL: shuffle_v8i64_as_i128: +; RV64ZVKB-ZVE32X: # %bb.0: +; RV64ZVKB-ZVE32X-NEXT: addi sp, sp, -128 +; RV64ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 128 +; RV64ZVKB-ZVE32X-NEXT: sd ra, 120(sp) # 8-byte Folded Spill +; RV64ZVKB-ZVE32X-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; RV64ZVKB-ZVE32X-NEXT: sd s2, 104(sp) # 8-byte Folded Spill +; RV64ZVKB-ZVE32X-NEXT: sd s3, 96(sp) # 8-byte Folded Spill +; RV64ZVKB-ZVE32X-NEXT: .cfi_offset ra, -8 +; RV64ZVKB-ZVE32X-NEXT: .cfi_offset s0, -16 +; RV64ZVKB-ZVE32X-NEXT: .cfi_offset s2, -24 +; RV64ZVKB-ZVE32X-NEXT: .cfi_offset s3, -32 +; RV64ZVKB-ZVE32X-NEXT: addi s0, sp, 128 +; RV64ZVKB-ZVE32X-NEXT: .cfi_def_cfa s0, 0 +; RV64ZVKB-ZVE32X-NEXT: andi sp, sp, -64 +; RV64ZVKB-ZVE32X-NEXT: ld a2, 0(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a3, 8(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a4, 16(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a5, 24(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a6, 32(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a7, 40(a1) +; RV64ZVKB-ZVE32X-NEXT: ld t0, 48(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a1, 56(a1) +; RV64ZVKB-ZVE32X-NEXT: srli t1, a3, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t2, a2, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t3, a5, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t4, a4, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t5, a7, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t6, a6, 32 +; RV64ZVKB-ZVE32X-NEXT: srli s2, a1, 32 +; RV64ZVKB-ZVE32X-NEXT: srli s3, t0, 32 +; RV64ZVKB-ZVE32X-NEXT: sw a1, 48(sp) +; RV64ZVKB-ZVE32X-NEXT: sw s2, 52(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t0, 56(sp) +; RV64ZVKB-ZVE32X-NEXT: sw s3, 60(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a7, 32(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t5, 36(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a6, 40(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t6, 44(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a5, 16(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t3, 20(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a4, 24(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t4, 28(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a3, 0(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t1, 4(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a2, 8(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t2, 12(sp) +; RV64ZVKB-ZVE32X-NEXT: mv a1, sp +; RV64ZVKB-ZVE32X-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV64ZVKB-ZVE32X-NEXT: vle32.v v8, (a1) +; RV64ZVKB-ZVE32X-NEXT: vse32.v v8, (a0) +; RV64ZVKB-ZVE32X-NEXT: addi sp, s0, -128 +; RV64ZVKB-ZVE32X-NEXT: .cfi_def_cfa sp, 128 +; RV64ZVKB-ZVE32X-NEXT: ld ra, 120(sp) # 8-byte Folded Reload +; 
RV64ZVKB-ZVE32X-NEXT: ld s0, 112(sp) # 8-byte Folded Reload +; RV64ZVKB-ZVE32X-NEXT: ld s2, 104(sp) # 8-byte Folded Reload +; RV64ZVKB-ZVE32X-NEXT: ld s3, 96(sp) # 8-byte Folded Reload +; RV64ZVKB-ZVE32X-NEXT: .cfi_restore ra +; RV64ZVKB-ZVE32X-NEXT: .cfi_restore s0 +; RV64ZVKB-ZVE32X-NEXT: .cfi_restore s2 +; RV64ZVKB-ZVE32X-NEXT: .cfi_restore s3 +; RV64ZVKB-ZVE32X-NEXT: addi sp, sp, 128 +; RV64ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 0 +; RV64ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i64> %v, <8 x i64> poison, <8 x i32> ret <8 x i64> %shuffle } @@ -951,6 +1081,104 @@ define <8 x i64> @shuffle_v8i64_as_i128_2(<8 x i64> %v) { ; ZVKB-V-NEXT: vslideup.vi v12, v8, 1, v0.t ; ZVKB-V-NEXT: vmv.v.v v8, v12 ; ZVKB-V-NEXT: ret +; +; RV32ZVKB-ZVE32X-LABEL: shuffle_v8i64_as_i128_2: +; RV32ZVKB-ZVE32X: # %bb.0: +; RV32ZVKB-ZVE32X-NEXT: addi sp, sp, -128 +; RV32ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 128 +; RV32ZVKB-ZVE32X-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32ZVKB-ZVE32X-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32ZVKB-ZVE32X-NEXT: .cfi_offset ra, -4 +; RV32ZVKB-ZVE32X-NEXT: .cfi_offset s0, -8 +; RV32ZVKB-ZVE32X-NEXT: addi s0, sp, 128 +; RV32ZVKB-ZVE32X-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVKB-ZVE32X-NEXT: andi sp, sp, -64 +; RV32ZVKB-ZVE32X-NEXT: lw a2, 16(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a3, 20(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a4, 24(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a5, 28(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a6, 48(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a7, 52(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t0, 56(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t1, 60(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t2, 32(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t3, 36(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t4, 40(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a1, 44(a1) +; RV32ZVKB-ZVE32X-NEXT: sw t0, 48(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t1, 52(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a6, 56(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a7, 60(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t4, 32(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a1, 36(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t2, 40(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t3, 44(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a4, 16(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a5, 20(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a2, 24(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a3, 28(sp) +; RV32ZVKB-ZVE32X-NEXT: mv a1, sp +; RV32ZVKB-ZVE32X-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32ZVKB-ZVE32X-NEXT: vle32.v v8, (a1) +; RV32ZVKB-ZVE32X-NEXT: vse32.v v8, (a0) +; RV32ZVKB-ZVE32X-NEXT: addi sp, s0, -128 +; RV32ZVKB-ZVE32X-NEXT: .cfi_def_cfa sp, 128 +; RV32ZVKB-ZVE32X-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32ZVKB-ZVE32X-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32ZVKB-ZVE32X-NEXT: .cfi_restore ra +; RV32ZVKB-ZVE32X-NEXT: .cfi_restore s0 +; RV32ZVKB-ZVE32X-NEXT: addi sp, sp, 128 +; RV32ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 0 +; RV32ZVKB-ZVE32X-NEXT: ret +; +; RV64ZVKB-ZVE32X-LABEL: shuffle_v8i64_as_i128_2: +; RV64ZVKB-ZVE32X: # %bb.0: +; RV64ZVKB-ZVE32X-NEXT: addi sp, sp, -128 +; RV64ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 128 +; RV64ZVKB-ZVE32X-NEXT: sd ra, 120(sp) # 8-byte Folded Spill +; RV64ZVKB-ZVE32X-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; RV64ZVKB-ZVE32X-NEXT: .cfi_offset ra, -8 +; RV64ZVKB-ZVE32X-NEXT: .cfi_offset s0, -16 +; RV64ZVKB-ZVE32X-NEXT: addi s0, sp, 128 +; RV64ZVKB-ZVE32X-NEXT: .cfi_def_cfa s0, 0 +; RV64ZVKB-ZVE32X-NEXT: andi sp, sp, -64 +; RV64ZVKB-ZVE32X-NEXT: ld a2, 16(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a3, 24(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a4, 32(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a5, 40(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a6, 48(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a1, 56(a1) +; RV64ZVKB-ZVE32X-NEXT: 
srli a7, a3, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t0, a2, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t1, a5, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t2, a4, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t3, a1, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t4, a6, 32 +; RV64ZVKB-ZVE32X-NEXT: sw a1, 48(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t3, 52(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a6, 56(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t4, 60(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a5, 32(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t1, 36(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a4, 40(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t2, 44(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a3, 16(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a7, 20(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a2, 24(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t0, 28(sp) +; RV64ZVKB-ZVE32X-NEXT: mv a1, sp +; RV64ZVKB-ZVE32X-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV64ZVKB-ZVE32X-NEXT: vle32.v v8, (a1) +; RV64ZVKB-ZVE32X-NEXT: vse32.v v8, (a0) +; RV64ZVKB-ZVE32X-NEXT: addi sp, s0, -128 +; RV64ZVKB-ZVE32X-NEXT: .cfi_def_cfa sp, 128 +; RV64ZVKB-ZVE32X-NEXT: ld ra, 120(sp) # 8-byte Folded Reload +; RV64ZVKB-ZVE32X-NEXT: ld s0, 112(sp) # 8-byte Folded Reload +; RV64ZVKB-ZVE32X-NEXT: .cfi_restore ra +; RV64ZVKB-ZVE32X-NEXT: .cfi_restore s0 +; RV64ZVKB-ZVE32X-NEXT: addi sp, sp, 128 +; RV64ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 0 +; RV64ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i64> %v, <8 x i64> poison, <8 x i32> ret <8 x i64> %shuffle } @@ -975,6 +1203,136 @@ define <8 x i64> @shuffle_v8i64_as_i256(<8 x i64> %v) { ; ZVKB-V-NEXT: vrgatherei16.vv v12, v8, v16 ; ZVKB-V-NEXT: vmv.v.v v8, v12 ; ZVKB-V-NEXT: ret +; +; RV32ZVKB-ZVE32X-LABEL: shuffle_v8i64_as_i256: +; RV32ZVKB-ZVE32X: # %bb.0: +; RV32ZVKB-ZVE32X-NEXT: addi sp, sp, -128 +; RV32ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 128 +; RV32ZVKB-ZVE32X-NEXT: sw ra, 124(sp) # 4-byte Folded Spill +; RV32ZVKB-ZVE32X-NEXT: sw s0, 120(sp) # 4-byte Folded Spill +; RV32ZVKB-ZVE32X-NEXT: sw s2, 116(sp) # 4-byte Folded Spill +; RV32ZVKB-ZVE32X-NEXT: sw s3, 112(sp) # 4-byte Folded Spill +; RV32ZVKB-ZVE32X-NEXT: .cfi_offset ra, -4 +; RV32ZVKB-ZVE32X-NEXT: .cfi_offset s0, -8 +; RV32ZVKB-ZVE32X-NEXT: .cfi_offset s2, -12 +; RV32ZVKB-ZVE32X-NEXT: .cfi_offset s3, -16 +; RV32ZVKB-ZVE32X-NEXT: addi s0, sp, 128 +; RV32ZVKB-ZVE32X-NEXT: .cfi_def_cfa s0, 0 +; RV32ZVKB-ZVE32X-NEXT: andi sp, sp, -64 +; RV32ZVKB-ZVE32X-NEXT: lw a2, 0(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a3, 4(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a4, 8(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a5, 12(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a6, 16(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a7, 20(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t0, 24(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t1, 28(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t2, 32(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t3, 36(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t4, 40(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t5, 44(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t6, 48(a1) +; RV32ZVKB-ZVE32X-NEXT: lw s2, 52(a1) +; RV32ZVKB-ZVE32X-NEXT: lw s3, 56(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a1, 60(a1) +; RV32ZVKB-ZVE32X-NEXT: sw t2, 48(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t3, 52(sp) +; RV32ZVKB-ZVE32X-NEXT: sw s3, 56(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a1, 60(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t6, 32(sp) +; RV32ZVKB-ZVE32X-NEXT: sw s2, 36(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t4, 40(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t5, 44(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a2, 16(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a3, 20(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t0, 24(sp) +; RV32ZVKB-ZVE32X-NEXT: sw t1, 28(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a6, 0(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a7, 4(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a4, 8(sp) +; RV32ZVKB-ZVE32X-NEXT: sw a5, 12(sp) +; RV32ZVKB-ZVE32X-NEXT: mv a1, sp +; 
RV32ZVKB-ZVE32X-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32ZVKB-ZVE32X-NEXT: vle32.v v8, (a1) +; RV32ZVKB-ZVE32X-NEXT: vse32.v v8, (a0) +; RV32ZVKB-ZVE32X-NEXT: addi sp, s0, -128 +; RV32ZVKB-ZVE32X-NEXT: .cfi_def_cfa sp, 128 +; RV32ZVKB-ZVE32X-NEXT: lw ra, 124(sp) # 4-byte Folded Reload +; RV32ZVKB-ZVE32X-NEXT: lw s0, 120(sp) # 4-byte Folded Reload +; RV32ZVKB-ZVE32X-NEXT: lw s2, 116(sp) # 4-byte Folded Reload +; RV32ZVKB-ZVE32X-NEXT: lw s3, 112(sp) # 4-byte Folded Reload +; RV32ZVKB-ZVE32X-NEXT: .cfi_restore ra +; RV32ZVKB-ZVE32X-NEXT: .cfi_restore s0 +; RV32ZVKB-ZVE32X-NEXT: .cfi_restore s2 +; RV32ZVKB-ZVE32X-NEXT: .cfi_restore s3 +; RV32ZVKB-ZVE32X-NEXT: addi sp, sp, 128 +; RV32ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 0 +; RV32ZVKB-ZVE32X-NEXT: ret +; +; RV64ZVKB-ZVE32X-LABEL: shuffle_v8i64_as_i256: +; RV64ZVKB-ZVE32X: # %bb.0: +; RV64ZVKB-ZVE32X-NEXT: addi sp, sp, -128 +; RV64ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 128 +; RV64ZVKB-ZVE32X-NEXT: sd ra, 120(sp) # 8-byte Folded Spill +; RV64ZVKB-ZVE32X-NEXT: sd s0, 112(sp) # 8-byte Folded Spill +; RV64ZVKB-ZVE32X-NEXT: sd s2, 104(sp) # 8-byte Folded Spill +; RV64ZVKB-ZVE32X-NEXT: sd s3, 96(sp) # 8-byte Folded Spill +; RV64ZVKB-ZVE32X-NEXT: .cfi_offset ra, -8 +; RV64ZVKB-ZVE32X-NEXT: .cfi_offset s0, -16 +; RV64ZVKB-ZVE32X-NEXT: .cfi_offset s2, -24 +; RV64ZVKB-ZVE32X-NEXT: .cfi_offset s3, -32 +; RV64ZVKB-ZVE32X-NEXT: addi s0, sp, 128 +; RV64ZVKB-ZVE32X-NEXT: .cfi_def_cfa s0, 0 +; RV64ZVKB-ZVE32X-NEXT: andi sp, sp, -64 +; RV64ZVKB-ZVE32X-NEXT: ld a2, 0(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a3, 8(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a4, 16(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a5, 24(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a6, 32(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a7, 40(a1) +; RV64ZVKB-ZVE32X-NEXT: ld t0, 48(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a1, 56(a1) +; RV64ZVKB-ZVE32X-NEXT: srli t1, a4, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t2, a3, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t3, a2, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t4, a5, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t5, t0, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t6, a7, 32 +; RV64ZVKB-ZVE32X-NEXT: srli s2, a6, 32 +; RV64ZVKB-ZVE32X-NEXT: srli s3, a1, 32 +; RV64ZVKB-ZVE32X-NEXT: sw a6, 48(sp) +; RV64ZVKB-ZVE32X-NEXT: sw s2, 52(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a1, 56(sp) +; RV64ZVKB-ZVE32X-NEXT: sw s3, 60(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t0, 32(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t5, 36(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a7, 40(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t6, 44(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a2, 16(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t3, 20(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a5, 24(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t4, 28(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a4, 0(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t1, 4(sp) +; RV64ZVKB-ZVE32X-NEXT: sw a3, 8(sp) +; RV64ZVKB-ZVE32X-NEXT: sw t2, 12(sp) +; RV64ZVKB-ZVE32X-NEXT: mv a1, sp +; RV64ZVKB-ZVE32X-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV64ZVKB-ZVE32X-NEXT: vle32.v v8, (a1) +; RV64ZVKB-ZVE32X-NEXT: vse32.v v8, (a0) +; RV64ZVKB-ZVE32X-NEXT: addi sp, s0, -128 +; RV64ZVKB-ZVE32X-NEXT: .cfi_def_cfa sp, 128 +; RV64ZVKB-ZVE32X-NEXT: ld ra, 120(sp) # 8-byte Folded Reload +; RV64ZVKB-ZVE32X-NEXT: ld s0, 112(sp) # 8-byte Folded Reload +; RV64ZVKB-ZVE32X-NEXT: ld s2, 104(sp) # 8-byte Folded Reload +; RV64ZVKB-ZVE32X-NEXT: ld s3, 96(sp) # 8-byte Folded Reload +; RV64ZVKB-ZVE32X-NEXT: .cfi_restore ra +; RV64ZVKB-ZVE32X-NEXT: .cfi_restore s0 +; RV64ZVKB-ZVE32X-NEXT: .cfi_restore s2 +; RV64ZVKB-ZVE32X-NEXT: .cfi_restore s3 +; RV64ZVKB-ZVE32X-NEXT: addi sp, sp, 128 +; RV64ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 0 +; RV64ZVKB-ZVE32X-NEXT: ret 
%shuffle = shufflevector <8 x i64> %v, <8 x i64> poison, <8 x i32> ret <8 x i64> %shuffle } @@ -1003,6 +1361,106 @@ define <8 x i64> @shuffle_v8i64_as_i256_zvl256b(<8 x i64> %v) vscale_range(4,0) ; ZVKB-V-NEXT: vrgatherei16.vv v10, v8, v12 ; ZVKB-V-NEXT: vmv2r.v v8, v10 ; ZVKB-V-NEXT: ret +; +; RV32ZVKB-ZVE32X-LABEL: shuffle_v8i64_as_i256_zvl256b: +; RV32ZVKB-ZVE32X: # %bb.0: +; RV32ZVKB-ZVE32X-NEXT: addi sp, sp, -16 +; RV32ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 16 +; RV32ZVKB-ZVE32X-NEXT: sw s0, 12(sp) # 4-byte Folded Spill +; RV32ZVKB-ZVE32X-NEXT: sw s1, 8(sp) # 4-byte Folded Spill +; RV32ZVKB-ZVE32X-NEXT: .cfi_offset s0, -4 +; RV32ZVKB-ZVE32X-NEXT: .cfi_offset s1, -8 +; RV32ZVKB-ZVE32X-NEXT: lw a2, 48(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a3, 52(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a4, 56(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a5, 60(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a6, 32(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a7, 36(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t0, 40(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t1, 44(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t2, 16(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t3, 20(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t4, 24(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t5, 28(a1) +; RV32ZVKB-ZVE32X-NEXT: lw t6, 0(a1) +; RV32ZVKB-ZVE32X-NEXT: lw s0, 4(a1) +; RV32ZVKB-ZVE32X-NEXT: lw s1, 8(a1) +; RV32ZVKB-ZVE32X-NEXT: lw a1, 12(a1) +; RV32ZVKB-ZVE32X-NEXT: vsetivli zero, 16, e32, m2, ta, ma +; RV32ZVKB-ZVE32X-NEXT: vmv.v.x v8, t2 +; RV32ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, t3 +; RV32ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, s1 +; RV32ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, a1 +; RV32ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, t6 +; RV32ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, s0 +; RV32ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, t4 +; RV32ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, t5 +; RV32ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, a2 +; RV32ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, a3 +; RV32ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, t0 +; RV32ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, t1 +; RV32ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, a6 +; RV32ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, a7 +; RV32ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, a4 +; RV32ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, a5 +; RV32ZVKB-ZVE32X-NEXT: vse32.v v8, (a0) +; RV32ZVKB-ZVE32X-NEXT: lw s0, 12(sp) # 4-byte Folded Reload +; RV32ZVKB-ZVE32X-NEXT: lw s1, 8(sp) # 4-byte Folded Reload +; RV32ZVKB-ZVE32X-NEXT: .cfi_restore s0 +; RV32ZVKB-ZVE32X-NEXT: .cfi_restore s1 +; RV32ZVKB-ZVE32X-NEXT: addi sp, sp, 16 +; RV32ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 0 +; RV32ZVKB-ZVE32X-NEXT: ret +; +; RV64ZVKB-ZVE32X-LABEL: shuffle_v8i64_as_i256_zvl256b: +; RV64ZVKB-ZVE32X: # %bb.0: +; RV64ZVKB-ZVE32X-NEXT: addi sp, sp, -16 +; RV64ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 16 +; RV64ZVKB-ZVE32X-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; RV64ZVKB-ZVE32X-NEXT: sd s1, 0(sp) # 8-byte Folded Spill +; RV64ZVKB-ZVE32X-NEXT: .cfi_offset s0, -8 +; RV64ZVKB-ZVE32X-NEXT: .cfi_offset s1, -16 +; RV64ZVKB-ZVE32X-NEXT: ld a2, 32(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a3, 40(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a4, 48(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a5, 56(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a6, 0(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a7, 8(a1) +; RV64ZVKB-ZVE32X-NEXT: ld t0, 16(a1) +; RV64ZVKB-ZVE32X-NEXT: ld a1, 24(a1) +; RV64ZVKB-ZVE32X-NEXT: srli t1, a5, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t2, a2, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t3, a3, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t4, a4, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t5, a1, 32 +; RV64ZVKB-ZVE32X-NEXT: srli t6, a6, 32 +; RV64ZVKB-ZVE32X-NEXT: srli s0, a7, 32 +; RV64ZVKB-ZVE32X-NEXT: srli s1, t0, 32 
+; RV64ZVKB-ZVE32X-NEXT: vsetivli zero, 16, e32, m2, ta, ma +; RV64ZVKB-ZVE32X-NEXT: vmv.v.x v8, t0 +; RV64ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, s1 +; RV64ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, a7 +; RV64ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, s0 +; RV64ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, a6 +; RV64ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, t6 +; RV64ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, a1 +; RV64ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, t5 +; RV64ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, a4 +; RV64ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, t4 +; RV64ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, a3 +; RV64ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, t3 +; RV64ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, a2 +; RV64ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, t2 +; RV64ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, a5 +; RV64ZVKB-ZVE32X-NEXT: vslide1down.vx v8, v8, t1 +; RV64ZVKB-ZVE32X-NEXT: vse32.v v8, (a0) +; RV64ZVKB-ZVE32X-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; RV64ZVKB-ZVE32X-NEXT: ld s1, 0(sp) # 8-byte Folded Reload +; RV64ZVKB-ZVE32X-NEXT: .cfi_restore s0 +; RV64ZVKB-ZVE32X-NEXT: .cfi_restore s1 +; RV64ZVKB-ZVE32X-NEXT: addi sp, sp, 16 +; RV64ZVKB-ZVE32X-NEXT: .cfi_def_cfa_offset 0 +; RV64ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i64> %v, <8 x i64> poison, <8 x i32> ret <8 x i64> %shuffle } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll index 5aa3a246d7616..0561ee9addc7b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfw-web-simplification.ll @@ -5,7 +5,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+f,+d -verify-machineinstrs %s -o - --riscv-lower-ext-max-web-size=3 | FileCheck %s --check-prefixes=FOLDING,ZVFHMIN ; Check that the default value enables the web folding and ; that it is bigger than 3. 
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+f,+d -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=FOLDING,ZVFH define void @vfwmul_v2f116_multiple_users(ptr %x, ptr %y, ptr %z, <2 x half> %a, <2 x half> %b, <2 x half> %b2) { ; NO_FOLDING1-LABEL: vfwmul_v2f116_multiple_users: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll index 90e9ffdcb320a..eeb232ec1555c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV32 %s +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,RV64 %s define <2 x i16> @vwmulu_v2i16(ptr %x, ptr %y) { ; CHECK-LABEL: vwmulu_v2i16: @@ -750,28 +750,25 @@ define <2 x i64> @vwmulu_vx_v2i64_i8(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV32-NEXT: lb a1, 0(a1) -; RV32-NEXT: vle32.v v25, (a0) -; RV32-NEXT: srai a0, a1, 31 +; RV32-NEXT: lbu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v26, (a0), zero -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vzext.vf2 v27, v25 -; RV32-NEXT: vmul.vv v8, v26, v27 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vmul.vv v8, v9, v10 ; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vwmulu_vx_v2i64_i8: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV64-NEXT: vle32.v v25, (a0) -; RV64-NEXT: lb a0, 0(a1) -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vzext.vf2 v26, v25 -; RV64-NEXT: vmul.vx v8, v26, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: lbu a0, 0(a1) +; RV64-NEXT: vwmulu.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i8, ptr %y @@ -788,28 +785,25 @@ define <2 x i64> @vwmulu_vx_v2i64_i16(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV32-NEXT: lh a1, 0(a1) -; RV32-NEXT: vle32.v v25, (a0) -; RV32-NEXT: srai a0, a1, 31 +; RV32-NEXT: lhu a1, 0(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v26, (a0), zero -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vzext.vf2 v27, v25 -; RV32-NEXT: vmul.vv v8, v26, v27 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vmul.vv v8, v9, v10 ; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vwmulu_vx_v2i64_i16: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; 
RV64-NEXT: vle32.v v25, (a0) -; RV64-NEXT: lh a0, 0(a1) -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vzext.vf2 v26, v25 -; RV64-NEXT: vmul.vx v8, v26, a0 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vle32.v v9, (a0) +; RV64-NEXT: lhu a0, 0(a1) +; RV64-NEXT: vwmulu.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i16, ptr %y @@ -826,28 +820,25 @@ define <2 x i64> @vwmulu_vx_v2i64_i32(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu ; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: vle32.v v25, (a0) -; RV32-NEXT: srai a0, a1, 31 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a0) ; RV32-NEXT: sw a1, 8(sp) -; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: sw zero, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v26, (a0), zero -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vzext.vf2 v27, v25 -; RV32-NEXT: vmul.vv v8, v26, v27 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vmul.vv v8, v9, v10 ; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vwmulu_vx_v2i64_i32: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV64-NEXT: vle32.v v25, (a0) +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vle32.v v9, (a0) ; RV64-NEXT: lw a0, 0(a1) -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vzext.vf2 v26, v25 -; RV64-NEXT: vmul.vx v8, v26, a0 +; RV64-NEXT: vwmulu.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i32, ptr %y @@ -864,28 +855,27 @@ define <2 x i64> @vwmulu_vx_v2i64_i64(ptr %x, ptr %y) { ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV32-NEXT: lw a2, 4(a1) -; RV32-NEXT: lw a1, 0(a1) -; RV32-NEXT: vle32.v v25, (a0) -; RV32-NEXT: sw a2, 12(sp) -; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: lw a2, 0(a1) +; RV32-NEXT: lw a1, 4(a1) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: sw a2, 8(sp) +; RV32-NEXT: sw a1, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v26, (a0), zero -; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV32-NEXT: vzext.vf2 v27, v25 -; RV32-NEXT: vmul.vv v8, v26, v27 +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vzext.vf2 v10, v8 +; RV32-NEXT: vmul.vv v8, v9, v10 ; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: vwmulu_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; RV64-NEXT: vle32.v v25, (a0) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vle32.v v8, (a0) ; RV64-NEXT: ld a0, 0(a1) -; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; RV64-NEXT: vzext.vf2 v26, v25 -; RV64-NEXT: vmul.vx v8, v26, a0 +; RV64-NEXT: vzext.vf2 v9, v8 +; RV64-NEXT: vmul.vx v8, v9, a0 ; RV64-NEXT: ret %a = load <2 x i32>, ptr %x %b = load i64, ptr %y diff --git a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll index 6ebb03ff0297e..8f2aec3140e9d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/floor-vp.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck 
%s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN declare @llvm.vp.floor.nxv1bf16(, , i32) @@ -407,10 +407,11 @@ declare @llvm.vp.floor.nxv1f16(, @vp_floor_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 2 @@ -453,10 +454,11 @@ define @vp_floor_nxv1f16( %va, @vp_floor_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -493,10 +495,11 @@ declare @llvm.vp.floor.nxv2f16(, @vp_floor_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 2 @@ -539,10 +542,11 @@ define @vp_floor_nxv2f16( %va, @vp_floor_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -579,10 +583,11 @@ declare @llvm.vp.floor.nxv4f16(, @vp_floor_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 2 @@ -625,10 +630,11 @@ define @vp_floor_nxv4f16( %va, @vp_floor_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) -; ZVFH-NEXT: 
flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -667,9 +673,10 @@ define @vp_floor_nxv8f16( %va, @vp_floor_nxv8f16( %va, @vp_floor_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -755,9 +763,10 @@ define @vp_floor_nxv16f16( %va, @vp_floor_nxv16f16( %va, @vp_floor_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -843,9 +853,10 @@ define @vp_floor_nxv32f16( %va, @vp_floor_nxv32f16( %va, @vp_floor_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_floor_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: fsrmi a0, 2 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -1210,41 +1222,141 @@ define @vp_floor_nxv16f32_unmasked( % declare @llvm.vp.floor.nxv1f64(, , i32) define @vp_floor_nxv1f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_nxv1f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI34_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI34_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_nxv1f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: 
vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_nxv1f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI34_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI34_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_nxv1f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv1f64( %va, %m, i32 %evl) ret %v } define @vp_floor_nxv1f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_nxv1f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI35_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_nxv1f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI35_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI35_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_nxv1f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; 
RV32ZVFHMIN-LABEL: vp_floor_nxv1f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI35_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI35_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_nxv1f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv1f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1252,43 +1364,149 @@ define @vp_floor_nxv1f64_unmasked( %v declare @llvm.vp.floor.nxv2f64(, , i32) define @vp_floor_nxv2f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI36_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_nxv2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v10, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI36_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI36_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vmv1r.v v0, v10 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_nxv2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v10, v0 +; RV64ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vmv1r.v v0, v10 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: 
ret +; +; RV32ZVFHMIN-LABEL: vp_floor_nxv2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI36_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI36_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_nxv2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv2f64( %va, %m, i32 %evl) ret %v } define @vp_floor_nxv2f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_nxv2f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI37_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_nxv2f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI37_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI37_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_nxv2f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_nxv2f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI37_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI37_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; 
RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_nxv2f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1296,43 +1514,149 @@ define @vp_floor_nxv2f64_unmasked( %v declare @llvm.vp.floor.nxv4f64(, , i32) define @vp_floor_nxv4f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI38_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_nxv4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v12, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI38_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI38_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vmv1r.v v0, v12 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_nxv4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v12, v0 +; RV64ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vmv1r.v v0, v12 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_nxv4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI38_0) 
+; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI38_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_nxv4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv4f64( %va, %m, i32 %evl) ret %v } define @vp_floor_nxv4f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_nxv4f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI39_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_nxv4f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI39_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI39_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_nxv4f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_nxv4f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI39_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI39_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: 
vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_nxv4f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv4f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1340,43 +1664,149 @@ define @vp_floor_nxv4f64_unmasked( %v declare @llvm.vp.floor.nxv7f64(, , i32) define @vp_floor_nxv7f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_nxv7f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI40_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_nxv7f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI40_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI40_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_nxv7f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_nxv7f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI40_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI40_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; 
RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_nxv7f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv7f64( %va, %m, i32 %evl) ret %v } define @vp_floor_nxv7f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_nxv7f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI41_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_nxv7f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI41_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI41_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_nxv7f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_nxv7f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI41_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI41_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: 
vp_floor_nxv7f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv7f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1384,43 +1814,149 @@ define @vp_floor_nxv7f64_unmasked( %v declare @llvm.vp.floor.nxv8f64(, , i32) define @vp_floor_nxv8f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI42_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_nxv8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI42_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI42_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_nxv8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_nxv8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI42_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI42_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; 
RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_nxv8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv8f64( %va, %m, i32 %evl) ret %v } define @vp_floor_nxv8f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_nxv8f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI43_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_nxv8f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI43_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI43_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_nxv8f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_nxv8f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI43_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI43_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_nxv8f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; 
RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv8f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1429,87 +1965,325 @@ define @vp_floor_nxv8f64_unmasked( %v declare @llvm.vp.floor.nxv16f64(, , i32) define @vp_floor_nxv16f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_nxv16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v7, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI44_0) -; CHECK-NEXT: srli a3, a1, 3 -; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a2) -; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: vslidedown.vx v6, v0, a3 -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a2, 2 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB44_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB44_2: -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_nxv16f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v7, v0 +; RV32ZVFH-NEXT: csrr a1, vlenb +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI44_0) +; RV32ZVFH-NEXT: srli a3, a1, 3 +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2) +; RV32ZVFH-NEXT: sub a2, a0, a1 +; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3 +; RV32ZVFH-NEXT: sltu a3, a0, a2 +; RV32ZVFH-NEXT: addi a3, a3, -1 +; RV32ZVFH-NEXT: and a2, a3, a2 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a2, 2 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a2 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: bltu a0, a1, .LBB44_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: mv a0, a1 +; RV32ZVFH-NEXT: .LBB44_2: +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli 
zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_nxv16f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v7, v0 +; RV64ZVFH-NEXT: csrr a1, vlenb +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: srli a3, a1, 3 +; RV64ZVFH-NEXT: vslidedown.vx v6, v0, a3 +; RV64ZVFH-NEXT: sub a3, a0, a1 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: sltu a2, a0, a3 +; RV64ZVFH-NEXT: addi a2, a2, -1 +; RV64ZVFH-NEXT: and a2, a2, a3 +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a2, 2 +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a2 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: bltu a0, a1, .LBB44_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: mv a0, a1 +; RV64ZVFH-NEXT: .LBB44_2: +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_nxv16f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v7, v0 +; RV32ZVFHMIN-NEXT: csrr a1, vlenb +; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI44_0) +; RV32ZVFHMIN-NEXT: srli a3, a1, 3 +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2) +; RV32ZVFHMIN-NEXT: sub a2, a0, a1 +; RV32ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 +; RV32ZVFHMIN-NEXT: sltu a3, a0, a2 +; RV32ZVFHMIN-NEXT: addi a3, a3, -1 +; RV32ZVFHMIN-NEXT: and a2, a3, a2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a2, 2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a2 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: bltu a0, a1, .LBB44_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: mv a0, a1 +; RV32ZVFHMIN-NEXT: .LBB44_2: +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v 
v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_nxv16f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v7, v0 +; RV64ZVFHMIN-NEXT: csrr a1, vlenb +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: srli a3, a1, 3 +; RV64ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 +; RV64ZVFHMIN-NEXT: sub a3, a0, a1 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV64ZVFHMIN-NEXT: addi a2, a2, -1 +; RV64ZVFHMIN-NEXT: and a2, a2, a3 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a2, 2 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a2 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: bltu a0, a1, .LBB44_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: mv a0, a1 +; RV64ZVFHMIN-NEXT: .LBB44_2: +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv16f64( %va, %m, i32 %evl) ret %v } define @vp_floor_nxv16f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_floor_nxv16f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI45_0) -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a2) -; CHECK-NEXT: sltu a2, a0, a3 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a2, 2 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB45_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB45_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a0, 2 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, 
e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_floor_nxv16f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: csrr a1, vlenb +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0) +; RV32ZVFH-NEXT: sub a3, a0, a1 +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; RV32ZVFH-NEXT: sltu a2, a0, a3 +; RV32ZVFH-NEXT: addi a2, a2, -1 +; RV32ZVFH-NEXT: and a2, a2, a3 +; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: fsrmi a2, 2 +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a2 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: bltu a0, a1, .LBB45_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: mv a0, a1 +; RV32ZVFH-NEXT: .LBB45_2: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 2 +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_floor_nxv16f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: csrr a1, vlenb +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: sub a3, a0, a1 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: sltu a2, a0, a3 +; RV64ZVFH-NEXT: addi a2, a2, -1 +; RV64ZVFH-NEXT: and a2, a2, a3 +; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: fsrmi a2, 2 +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a2 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: bltu a0, a1, .LBB45_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: mv a0, a1 +; RV64ZVFH-NEXT: .LBB45_2: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 2 +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_floor_nxv16f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: csrr a1, vlenb +; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI45_0) +; RV32ZVFHMIN-NEXT: sub a3, a0, a1 +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; RV32ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV32ZVFHMIN-NEXT: addi a2, a2, -1 +; RV32ZVFHMIN-NEXT: and a2, a2, a3 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a2, 2 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a2 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: mv a0, a1 +; RV32ZVFHMIN-NEXT: .LBB45_2: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; 
RV32ZVFHMIN-NEXT: fsrmi a0, 2 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_floor_nxv16f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: csrr a1, vlenb +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: sub a3, a0, a1 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV64ZVFHMIN-NEXT: addi a2, a2, -1 +; RV64ZVFHMIN-NEXT: and a2, a2, a3 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a2, 2 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a2 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: mv a0, a1 +; RV64ZVFHMIN-NEXT: .LBB45_2: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 2 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.floor.nxv16f64( %va, splat (i1 true), i32 %evl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll index 7a4695d1c25c1..409235f7e1b2c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-constrained-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s declare @llvm.experimental.constrained.nearbyint.nxv1f16(, metadata, metadata) @@ -11,10 +11,11 @@ define @nearbyint_nxv1f16( %v) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -35,10 +36,11 @@ define @nearbyint_nxv2f16( %v) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli 
zero, zero, e16, mf2, ta, ma @@ -59,10 +61,11 @@ define @nearbyint_nxv4f16( %v) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -83,10 +86,11 @@ define @nearbyint_nxv8f16( %v) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -107,10 +111,11 @@ define @nearbyint_nxv16f16( %v) strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v12, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma @@ -131,10 +136,11 @@ define @nearbyint_nxv32f16( %v) strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: frflags a0 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma @@ -271,23 +277,42 @@ define @nearbyint_nxv16f32( %v) stric declare @llvm.experimental.constrained.nearbyint.nxv1f64(, metadata, metadata) define @nearbyint_nxv1f64( %v) strictfp { -; CHECK-LABEL: nearbyint_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: nearbyint_nxv1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI11_0) +; RV32-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: frflags a0 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: nearbyint_nxv1f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, 
zero, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: frflags a0 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv1f64( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r } @@ -295,23 +320,42 @@ define @nearbyint_nxv1f64( %v) strict declare @llvm.experimental.constrained.nearbyint.nxv2f64(, metadata, metadata) define @nearbyint_nxv2f64( %v) strictfp { -; CHECK-LABEL: nearbyint_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: nearbyint_nxv2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI12_0) +; RV32-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v10, v8 +; RV32-NEXT: vmflt.vf v0, v10, fa5 +; RV32-NEXT: frflags a0 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: nearbyint_nxv2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v10, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v10, fa5 +; RV64-NEXT: frflags a0 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv2f64( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r } @@ -319,23 +363,42 @@ define @nearbyint_nxv2f64( %v) strict declare @llvm.experimental.constrained.nearbyint.nxv4f64(, metadata, metadata) define @nearbyint_nxv4f64( %v) strictfp { -; CHECK-LABEL: nearbyint_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; 
CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: nearbyint_nxv4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v12, v8 +; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: frflags a0 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: nearbyint_nxv4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: frflags a0 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv4f64( %v, metadata !"round.dynamic", metadata !"fpexcept.strict") ret %r } @@ -343,23 +406,42 @@ define @nearbyint_nxv4f64( %v) strict declare @llvm.experimental.constrained.nearbyint.nxv8f64(, metadata, metadata) define @nearbyint_nxv8f64( %v) strictfp { -; CHECK-LABEL: nearbyint_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32-LABEL: nearbyint_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI14_0) +; RV32-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v16, v8 +; RV32-NEXT: vmflt.vf v0, v16, fa5 +; RV32-NEXT: frflags a0 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32-NEXT: fsflags a0 +; RV32-NEXT: ret +; +; RV64-LABEL: nearbyint_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v16, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v16, fa5 +; RV64-NEXT: frflags a0 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64-NEXT: fsflags a0 +; RV64-NEXT: ret %r = call @llvm.experimental.constrained.nearbyint.nxv8f64( %v, metadata 
!"round.dynamic", metadata !"fpexcept.strict") ret %r } diff --git a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll index 4ea3269cec0b1..97e65f4e4b53a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fnearbyint-sdnode.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN define @nearbyint_nxv1bf16( %x) { ; CHECK-LABEL: nearbyint_nxv1bf16: @@ -167,10 +167,11 @@ define @nearbyint_nxv32bf16( %x) { define @nearbyint_nxv1f16( %x) { ; ZVFH-LABEL: nearbyint_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -206,10 +207,11 @@ declare @llvm.nearbyint.nxv1f16() define @nearbyint_nxv2f16( %x) { ; ZVFH-LABEL: nearbyint_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -245,10 +247,11 @@ declare @llvm.nearbyint.nxv2f16() define @nearbyint_nxv4f16( %x) { ; ZVFH-LABEL: nearbyint_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -284,10 +287,11 @@ declare @llvm.nearbyint.nxv4f16() define @nearbyint_nxv8f16( %x) { ; ZVFH-LABEL: nearbyint_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -323,10 +327,11 @@ declare @llvm.nearbyint.nxv8f16() define @nearbyint_nxv16f16( %x) { ; ZVFH-LABEL: nearbyint_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, 
ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -362,10 +367,11 @@ declare @llvm.nearbyint.nxv16f16() define @nearbyint_nxv32f16( %x) { ; ZVFH-LABEL: nearbyint_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -513,80 +519,268 @@ define @nearbyint_nxv16f32( %x) { declare @llvm.nearbyint.nxv16f32() define @nearbyint_nxv1f64( %x) { -; CHECK-LABEL: nearbyint_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: nearbyint_nxv1f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI17_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI17_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: nearbyint_nxv1f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: nearbyint_nxv1f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI17_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI17_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: nearbyint_nxv1f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; 
RV64ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv1f64( %x) ret %a } declare @llvm.nearbyint.nxv1f64() define @nearbyint_nxv2f64( %x) { -; CHECK-LABEL: nearbyint_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: nearbyint_nxv2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: nearbyint_nxv2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: nearbyint_nxv2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: nearbyint_nxv2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv2f64( %x) ret %a } declare @llvm.nearbyint.nxv2f64() define @nearbyint_nxv4f64( %x) { -; CHECK-LABEL: nearbyint_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; 
RV32ZVFH-LABEL: nearbyint_nxv4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI19_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI19_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: nearbyint_nxv4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: nearbyint_nxv4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI19_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI19_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: nearbyint_nxv4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv4f64( %x) ret %a } declare @llvm.nearbyint.nxv4f64() define @nearbyint_nxv8f64( %x) { -; CHECK-LABEL: nearbyint_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: nearbyint_nxv8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: nearbyint_nxv8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m8, ta, ma 
+; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: nearbyint_nxv8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: nearbyint_nxv8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.nearbyint.nxv8f64( %x) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll index 5fe59f3b3933d..5ed921d39590d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/frint-sdnode.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN define @rint_nxv1bf16( %x) { ; CHECK-LABEL: rint_nxv1bf16: @@ -153,10 +153,11 @@ define @rint_nxv32bf16( %x) { define @rint_nxv1f16( %x) { ; ZVFH-LABEL: rint_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -188,10 +189,11 @@ declare @llvm.rint.nxv1f16() define @rint_nxv2f16( %x) { ; 
ZVFH-LABEL: rint_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -223,10 +225,11 @@ declare @llvm.rint.nxv2f16() define @rint_nxv4f16( %x) { ; ZVFH-LABEL: rint_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -258,10 +261,11 @@ declare @llvm.rint.nxv4f16() define @rint_nxv8f16( %x) { ; ZVFH-LABEL: rint_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -293,10 +297,11 @@ declare @llvm.rint.nxv8f16() define @rint_nxv16f16( %x) { ; ZVFH-LABEL: rint_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -328,10 +333,11 @@ declare @llvm.rint.nxv16f16() define @rint_nxv32f16( %x) { ; ZVFH-LABEL: rint_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -463,72 +469,236 @@ define @rint_nxv16f32( %x) { declare @llvm.rint.nxv16f32() define @rint_nxv1f64( %x) { -; CHECK-LABEL: rint_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: rint_nxv1f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI17_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI17_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: rint_nxv1f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 
+; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: rint_nxv1f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI17_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI17_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: rint_nxv1f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.rint.nxv1f64( %x) ret %a } declare @llvm.rint.nxv1f64() define @rint_nxv2f64( %x) { -; CHECK-LABEL: rint_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: rint_nxv2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: rint_nxv2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: rint_nxv2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: rint_nxv2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; 
RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.rint.nxv2f64( %x) ret %a } declare @llvm.rint.nxv2f64() define @rint_nxv4f64( %x) { -; CHECK-LABEL: rint_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: rint_nxv4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI19_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI19_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: rint_nxv4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: rint_nxv4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI19_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI19_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: rint_nxv4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.rint.nxv4f64( %x) ret %a } declare @llvm.rint.nxv4f64() define @rint_nxv8f64( %x) { -; CHECK-LABEL: rint_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: rint_nxv8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI20_0)(a0) 
+; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: rint_nxv8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: rint_nxv8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: rint_nxv8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.rint.nxv8f64( %x) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll index 3d992aa13e379..295c264e7d924 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fround-constrained-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s ; This file tests the code generation for `llvm.experimental.constrained.round.*` on scalable vector type. 
@@ -11,10 +11,11 @@ define @round_nxv1f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -34,10 +35,11 @@ define @round_nxv2f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -57,10 +59,11 @@ define @round_nxv4f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -80,10 +83,11 @@ define @round_nxv8f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -103,10 +107,11 @@ define @round_nxv16f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v12, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma @@ -126,10 +131,11 @@ define @round_nxv32f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 4 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma @@ -260,92 +266,168 @@ define @round_nxv16f32( %x) strictfp declare @llvm.experimental.constrained.round.nxv16f32(, metadata) define @round_nxv1f64( %x) strictfp { -; CHECK-LABEL: round_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 
-; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: round_nxv1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI11_0) +; RV32-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: fsrmi a0, 4 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: round_nxv1f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: fsrmi a0, 4 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.round.nxv1f64( %x, metadata !"fpexcept.strict") ret %a } declare @llvm.experimental.constrained.round.nxv1f64(, metadata) define @round_nxv2f64( %x) strictfp { -; CHECK-LABEL: round_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: round_nxv2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI12_0) +; RV32-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v10, v8 +; RV32-NEXT: vmflt.vf v0, v10, fa5 +; RV32-NEXT: fsrmi a0, 4 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: round_nxv2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v10, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v10, fa5 +; RV64-NEXT: fsrmi a0, 4 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %a = call 
@llvm.experimental.constrained.round.nxv2f64( %x, metadata !"fpexcept.strict") ret %a } declare @llvm.experimental.constrained.round.nxv2f64(, metadata) define @round_nxv4f64( %x) strictfp { -; CHECK-LABEL: round_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: round_nxv4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v12, v8 +; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: fsrmi a0, 4 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: round_nxv4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: fsrmi a0, 4 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.round.nxv4f64( %x, metadata !"fpexcept.strict") ret %a } declare @llvm.experimental.constrained.round.nxv4f64(, metadata) define @round_nxv8f64( %x) strictfp { -; CHECK-LABEL: round_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: round_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI14_0) +; RV32-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v16, v8 +; RV32-NEXT: vmflt.vf v0, v16, fa5 +; RV32-NEXT: fsrmi a0, 4 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: round_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu 
+; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v16, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v16, fa5 +; RV64-NEXT: fsrmi a0, 4 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.round.nxv8f64( %x, metadata !"fpexcept.strict") ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll index f7422b279149f..d420636a573fe 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fround-sdnode.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN ; This file tests the code generation for `llvm.round.*` on scalable vector type. 
@@ -169,10 +169,11 @@ define @round_nxv32bf16( %x) { define @round_nxv1f16( %x) { ; ZVFH-LABEL: round_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -208,10 +209,11 @@ declare @llvm.round.nxv1f16() define @round_nxv2f16( %x) { ; ZVFH-LABEL: round_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -247,10 +249,11 @@ declare @llvm.round.nxv2f16() define @round_nxv4f16( %x) { ; ZVFH-LABEL: round_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -286,10 +289,11 @@ declare @llvm.round.nxv4f16() define @round_nxv8f16( %x) { ; ZVFH-LABEL: round_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -325,10 +329,11 @@ declare @llvm.round.nxv8f16() define @round_nxv16f16( %x) { ; ZVFH-LABEL: round_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -364,10 +369,11 @@ declare @llvm.round.nxv16f16() define @round_nxv32f16( %x) { ; ZVFH-LABEL: round_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -515,80 +521,268 @@ define @round_nxv16f32( %x) { declare @llvm.round.nxv16f32() define @round_nxv1f64( %x) { -; CHECK-LABEL: round_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: round_nxv1f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI17_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI17_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, 
zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: round_nxv1f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: round_nxv1f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI17_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI17_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: round_nxv1f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.round.nxv1f64( %x) ret %a } declare @llvm.round.nxv1f64() define @round_nxv2f64( %x) { -; CHECK-LABEL: round_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: round_nxv2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: round_nxv2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t 
+; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: round_nxv2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: round_nxv2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.round.nxv2f64( %x) ret %a } declare @llvm.round.nxv2f64() define @round_nxv4f64( %x) { -; CHECK-LABEL: round_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: round_nxv4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI19_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI19_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: round_nxv4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: round_nxv4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI19_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI19_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, 
ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: round_nxv4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.round.nxv4f64( %x) ret %a } declare @llvm.round.nxv4f64() define @round_nxv8f64( %x) { -; CHECK-LABEL: round_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: round_nxv8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: round_nxv8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: round_nxv8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: round_nxv8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call 
@llvm.round.nxv8f64( %x) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll index c293ac91b63bf..de766895c734f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-constrained-sdnode.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s ; This file tests the code generation for `llvm.experimental.constrained.roundeven.*` on scalable vector type. @@ -11,10 +11,11 @@ define @roundeven_nxv1f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -34,10 +35,11 @@ define @roundeven_nxv2f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -57,10 +59,11 @@ define @roundeven_nxv4f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -80,10 +83,11 @@ define @roundeven_nxv8f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma @@ -103,10 +107,11 @@ define @roundeven_nxv16f16( %x) strictf ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v12, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma @@ -126,10 +131,11 @@ define @roundeven_nxv32f16( %x) strictf ; 
CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: fsrmi a0, 0 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma @@ -260,92 +266,168 @@ define @roundeven_nxv16f32( %x) stric declare @llvm.experimental.constrained.roundeven.nxv16f32(, metadata) define @roundeven_nxv1f64( %x) strictfp { -; CHECK-LABEL: roundeven_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: roundeven_nxv1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI11_0) +; RV32-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: fsrmi a0, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: roundeven_nxv1f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: fsrmi a0, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.roundeven.nxv1f64( %x, metadata !"fpexcept.strict") ret %a } declare @llvm.experimental.constrained.roundeven.nxv1f64(, metadata) define @roundeven_nxv2f64( %x) strictfp { -; CHECK-LABEL: roundeven_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: roundeven_nxv2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI12_0) +; RV32-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; 
RV32-NEXT: vfabs.v v10, v8 +; RV32-NEXT: vmflt.vf v0, v10, fa5 +; RV32-NEXT: fsrmi a0, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: roundeven_nxv2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v10, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v10, fa5 +; RV64-NEXT: fsrmi a0, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.roundeven.nxv2f64( %x, metadata !"fpexcept.strict") ret %a } declare @llvm.experimental.constrained.roundeven.nxv2f64(, metadata) define @roundeven_nxv4f64( %x) strictfp { -; CHECK-LABEL: roundeven_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: roundeven_nxv4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v12, v8 +; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: fsrmi a0, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: roundeven_nxv4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: fsrmi a0, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.roundeven.nxv4f64( %x, metadata !"fpexcept.strict") ret %a } declare @llvm.experimental.constrained.roundeven.nxv4f64(, metadata) define @roundeven_nxv8f64( %x) strictfp { -; CHECK-LABEL: roundeven_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: 
vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: roundeven_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI14_0) +; RV32-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v16, v8 +; RV32-NEXT: vmflt.vf v0, v16, fa5 +; RV32-NEXT: fsrmi a0, 0 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32-NEXT: fsrm a0 +; RV32-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: roundeven_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v16, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v16, fa5 +; RV64-NEXT: fsrmi a0, 0 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64-NEXT: fsrm a0 +; RV64-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.roundeven.nxv8f64( %x, metadata !"fpexcept.strict") ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll index 865531b77eb29..b9121c55684ee 100644 --- a/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/froundeven-sdnode.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN ; This file tests the code generation for `llvm.roundeven.*` on scalable vector type. 
define @roundeven_nxv1bf16( %x) { @@ -168,10 +168,11 @@ define @roundeven_nxv32bf16( %x) { define @roundeven_nxv1f16( %x) { ; ZVFH-LABEL: roundeven_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -207,10 +208,11 @@ declare @llvm.roundeven.nxv1f16() define @roundeven_nxv2f16( %x) { ; ZVFH-LABEL: roundeven_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -246,10 +248,11 @@ declare @llvm.roundeven.nxv2f16() define @roundeven_nxv4f16( %x) { ; ZVFH-LABEL: roundeven_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -285,10 +288,11 @@ declare @llvm.roundeven.nxv4f16() define @roundeven_nxv8f16( %x) { ; ZVFH-LABEL: roundeven_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -324,10 +328,11 @@ declare @llvm.roundeven.nxv8f16() define @roundeven_nxv16f16( %x) { ; ZVFH-LABEL: roundeven_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -363,10 +368,11 @@ declare @llvm.roundeven.nxv16f16() define @roundeven_nxv32f16( %x) { ; ZVFH-LABEL: roundeven_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -514,80 +520,268 @@ define @roundeven_nxv16f32( %x) { declare @llvm.roundeven.nxv16f32() define @roundeven_nxv1f64( %x) { -; CHECK-LABEL: roundeven_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: roundeven_nxv1f64: +; RV32ZVFH: 
# %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI17_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI17_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: roundeven_nxv1f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: roundeven_nxv1f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI17_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI17_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: roundeven_nxv1f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.roundeven.nxv1f64( %x) ret %a } declare @llvm.roundeven.nxv1f64() define @roundeven_nxv2f64( %x) { -; CHECK-LABEL: roundeven_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: roundeven_nxv2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: roundeven_nxv2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: 
slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: roundeven_nxv2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: roundeven_nxv2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.roundeven.nxv2f64( %x) ret %a } declare @llvm.roundeven.nxv2f64() define @roundeven_nxv4f64( %x) { -; CHECK-LABEL: roundeven_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: roundeven_nxv4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI19_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI19_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: roundeven_nxv4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: roundeven_nxv4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI19_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI19_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; 
RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: roundeven_nxv4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.roundeven.nxv4f64( %x) ret %a } declare @llvm.roundeven.nxv4f64() define @roundeven_nxv8f64( %x) { -; CHECK-LABEL: roundeven_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: roundeven_nxv8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: roundeven_nxv8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: roundeven_nxv8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: roundeven_nxv8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v 
v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.roundeven.nxv8f64( %x) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll index 8a5f118d8f6ac..63cb72e8795e1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-constrained-sdnode.ll @@ -1,18 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s define @trunc_nxv1f16( %x) strictfp { ; CHECK-LABEL: trunc_nxv1f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI0_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI0_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -30,10 +31,11 @@ define @trunc_nxv2f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI1_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI1_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -51,10 +53,11 @@ define @trunc_nxv4f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI2_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -72,10 +75,11 @@ define @trunc_nxv8f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI3_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI3_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v10, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v10, fa5 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t @@ -93,10 +97,11 @@ define @trunc_nxv16f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI4_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI4_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v12, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; 
CHECK-NEXT: vmflt.vf v0, v12, fa5 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t @@ -114,10 +119,11 @@ define @trunc_nxv32f16( %x) strictfp { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu ; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI5_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI5_0)(a0) +; CHECK-NEXT: li a0, 25 ; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t +; CHECK-NEXT: slli a0, a0, 10 ; CHECK-NEXT: vfabs.v v16, v8 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v16, fa5 ; CHECK-NEXT: vsetvli zero, zero, e16, m8, ta, ma ; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t @@ -236,84 +242,152 @@ define @trunc_nxv16f32( %x) strictfp declare @llvm.experimental.constrained.trunc.nxv16f32(, metadata) define @trunc_nxv1f64( %x) strictfp { -; CHECK-LABEL: trunc_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI11_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI11_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: trunc_nxv1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI11_0) +; RV32-NEXT: fld fa5, %lo(.LCPI11_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v9, v8 +; RV32-NEXT: vmflt.vf v0, v9, fa5 +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: trunc_nxv1f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v9, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v9, fa5 +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.trunc.nxv1f64( %x, metadata !"fpexcept.strict") ret %a } declare @llvm.experimental.constrained.trunc.nxv1f64(, metadata) define @trunc_nxv2f64( %x) strictfp { -; CHECK-LABEL: trunc_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI12_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI12_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: trunc_nxv2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI12_0) +; RV32-NEXT: fld fa5, %lo(.LCPI12_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v 
v10, v8 +; RV32-NEXT: vmflt.vf v0, v10, fa5 +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: trunc_nxv2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v10, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v10, fa5 +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.trunc.nxv2f64( %x, metadata !"fpexcept.strict") ret %a } declare @llvm.experimental.constrained.trunc.nxv2f64(, metadata) define @trunc_nxv4f64( %x) strictfp { -; CHECK-LABEL: trunc_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI13_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI13_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: trunc_nxv4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI13_0) +; RV32-NEXT: fld fa5, %lo(.LCPI13_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v12, v8 +; RV32-NEXT: vmflt.vf v0, v12, fa5 +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: trunc_nxv4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v12, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v12, fa5 +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.trunc.nxv4f64( %x, metadata !"fpexcept.strict") ret %a } declare @llvm.experimental.constrained.trunc.nxv4f64(, metadata) define @trunc_nxv8f64( %x) strictfp { -; CHECK-LABEL: trunc_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, mu -; CHECK-NEXT: vmfne.vv v0, v8, v8 -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI14_0)(a0) -; CHECK-NEXT: vfadd.vv v8, v8, v8, v0.t -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; 
RV32-LABEL: trunc_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV32-NEXT: vmfne.vv v0, v8, v8 +; RV32-NEXT: lui a0, %hi(.LCPI14_0) +; RV32-NEXT: fld fa5, %lo(.LCPI14_0)(a0) +; RV32-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV32-NEXT: vfabs.v v16, v8 +; RV32-NEXT: vmflt.vf v0, v16, fa5 +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t +; RV32-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: trunc_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, mu +; RV64-NEXT: vmfne.vv v0, v8, v8 +; RV64-NEXT: li a0, 1075 +; RV64-NEXT: vfadd.vv v8, v8, v8, v0.t +; RV64-NEXT: slli a0, a0, 52 +; RV64-NEXT: vfabs.v v16, v8 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vmflt.vf v0, v16, fa5 +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t +; RV64-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64-NEXT: ret %a = call @llvm.experimental.constrained.trunc.nxv8f64( %x, metadata !"fpexcept.strict") ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll index d597e166be4ee..34b3e8d2849b7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/ftrunc-sdnode.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN define @trunc_nxv1bf16( %x) { ; CHECK-LABEL: trunc_nxv1bf16: @@ -153,10 +153,11 @@ define @trunc_nxv32bf16( %x) { define @trunc_nxv1f16( %x) { ; ZVFH-LABEL: trunc_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI6_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -188,10 +189,11 @@ declare @llvm.trunc.nxv1f16() define @trunc_nxv2f16( %x) { ; ZVFH-LABEL: trunc_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI7_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -223,10 +225,11 @@ declare @llvm.trunc.nxv2f16() 
define @trunc_nxv4f16( %x) { ; ZVFH-LABEL: trunc_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI8_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI8_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -258,10 +261,11 @@ declare @llvm.trunc.nxv4f16() define @trunc_nxv8f16( %x) { ; ZVFH-LABEL: trunc_nxv8f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI9_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI9_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -293,10 +297,11 @@ declare @llvm.trunc.nxv8f16() define @trunc_nxv16f16( %x) { ; ZVFH-LABEL: trunc_nxv16f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI10_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI10_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -328,10 +333,11 @@ declare @llvm.trunc.nxv16f16() define @trunc_nxv32f16( %x) { ; ZVFH-LABEL: trunc_nxv32f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a0, %hi(.LCPI11_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI11_0)(a0) ; ZVFH-NEXT: vsetvli a0, zero, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -463,72 +469,236 @@ define @trunc_nxv16f32( %x) { declare @llvm.trunc.nxv16f32() define @trunc_nxv1f64( %x) { -; CHECK-LABEL: trunc_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI17_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI17_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: trunc_nxv1f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI17_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI17_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: trunc_nxv1f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: trunc_nxv1f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI17_0) +; RV32ZVFHMIN-NEXT: fld fa5, 
%lo(.LCPI17_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: trunc_nxv1f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.trunc.nxv1f64( %x) ret %a } declare @llvm.trunc.nxv1f64() define @trunc_nxv2f64( %x) { -; CHECK-LABEL: trunc_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI18_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: trunc_nxv2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: trunc_nxv2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: trunc_nxv2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI18_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI18_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: trunc_nxv2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.trunc.nxv2f64( %x) ret %a } declare 
@llvm.trunc.nxv2f64() define @trunc_nxv4f64( %x) { -; CHECK-LABEL: trunc_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI19_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: trunc_nxv4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI19_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI19_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: trunc_nxv4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: trunc_nxv4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI19_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI19_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: trunc_nxv4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.trunc.nxv4f64( %x) ret %a } declare @llvm.trunc.nxv4f64() define @trunc_nxv8f64( %x) { -; CHECK-LABEL: trunc_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI20_0)(a0) -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: trunc_nxv8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFH-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t 
+; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: trunc_nxv8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: trunc_nxv8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI20_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI20_0)(a0) +; RV32ZVFHMIN-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: trunc_nxv8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.rtz.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %a = call @llvm.trunc.nxv8f64( %x) ret %a } diff --git a/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll b/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll index ae0542fb5b74f..d7bf566b9b5f4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/half-round-conv.ll @@ -83,10 +83,11 @@ define @trunc_nxv1f16_to_ui32( %x) { define @trunc_nxv1f16_to_si64( %x) { ; CHECK-LABEL: trunc_nxv1f16_to_si64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI6_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI6_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -104,10 +105,11 @@ define @trunc_nxv1f16_to_si64( %x) { define @trunc_nxv1f16_to_ui64( %x) { ; CHECK-LABEL: trunc_nxv1f16_to_ui64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI7_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI7_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -201,10 +203,11 @@ define @trunc_nxv4f16_to_ui32( %x) { define @trunc_nxv4f16_to_si64( %x) { ; CHECK-LABEL: trunc_nxv4f16_to_si64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI14_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI14_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -222,10 +225,11 @@ define @trunc_nxv4f16_to_si64( %x) { define @trunc_nxv4f16_to_ui64( %x) { ; CHECK-LABEL: trunc_nxv4f16_to_ui64: ; 
CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI15_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI15_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -331,10 +335,11 @@ define @ceil_nxv1f16_to_ui32( %x) { define @ceil_nxv1f16_to_si64( %x) { ; CHECK-LABEL: ceil_nxv1f16_to_si64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI22_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI22_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -388,10 +393,11 @@ define @ceil_nxv1f16_to_si64( %x) { define @ceil_nxv1f16_to_ui64( %x) { ; CHECK-LABEL: ceil_nxv1f16_to_ui64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI23_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI23_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -533,10 +539,11 @@ define @ceil_nxv4f16_to_ui32( %x) { define @ceil_nxv4f16_to_si64( %x) { ; CHECK-LABEL: ceil_nxv4f16_to_si64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI30_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI30_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -590,10 +597,11 @@ define @ceil_nxv4f16_to_si64( %x) { define @ceil_nxv4f16_to_ui64( %x) { ; CHECK-LABEL: ceil_nxv4f16_to_ui64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI31_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI31_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a0, 3 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -723,10 +731,11 @@ define @rint_nxv1f16_to_ui32( %x) { define @rint_nxv1f16_to_si64( %x) { ; CHECK-LABEL: rint_nxv1f16_to_si64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI38_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI38_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -778,10 +787,11 @@ define @rint_nxv1f16_to_si64( %x) { define @rint_nxv1f16_to_ui64( %x) { ; CHECK-LABEL: rint_nxv1f16_to_ui64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI39_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI39_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -909,10 +919,11 @@ define @rint_nxv4f16_to_ui32( %x) { define @rint_nxv4f16_to_si64( %x) { ; CHECK-LABEL: rint_nxv4f16_to_si64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 
%hi(.LCPI46_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI46_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -964,10 +975,11 @@ define @rint_nxv4f16_to_si64( %x) { define @rint_nxv4f16_to_ui64( %x) { ; CHECK-LABEL: rint_nxv4f16_to_ui64: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI47_0) -; CHECK-NEXT: flh fa5, %lo(.LCPI47_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfabs.v v9, v8 +; CHECK-NEXT: li a0, 25 +; CHECK-NEXT: slli a0, a0, 10 +; CHECK-NEXT: fmv.h.x fa5, a0 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t ; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t diff --git a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll index 9bb5717d6fc25..64e305f130dd7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/nearbyint-vp.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN declare @llvm.vp.nearbyint.nxv1bf16(, , i32) @@ -407,10 +407,11 @@ declare @llvm.vp.nearbyint.nxv1f16(, @vp_nearbyint_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: frflags a0 @@ -453,10 +454,11 @@ define @vp_nearbyint_nxv1f16( %va, @vp_nearbyint_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -493,10 +495,11 @@ declare @llvm.vp.nearbyint.nxv2f16(, @vp_nearbyint_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; 
ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: frflags a0 @@ -539,10 +542,11 @@ define @vp_nearbyint_nxv2f16( %va, @vp_nearbyint_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -579,10 +583,11 @@ declare @llvm.vp.nearbyint.nxv4f16(, @vp_nearbyint_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: frflags a0 @@ -625,10 +630,11 @@ define @vp_nearbyint_nxv4f16( %va, @vp_nearbyint_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -667,9 +673,10 @@ define @vp_nearbyint_nxv8f16( %va, @vp_nearbyint_nxv8f16( %va, @vp_nearbyint_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -755,9 +763,10 @@ define @vp_nearbyint_nxv16f16( %va, @vp_nearbyint_nxv16f16( %va, @vp_nearbyint_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -843,9 +853,10 @@ define @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16( %va, @vp_nearbyint_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_nearbyint_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: frflags a0 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -1210,41 +1222,141 @@ define @vp_nearbyint_nxv16f32_unmasked( @llvm.vp.nearbyint.nxv1f64(, , i32) define @vp_nearbyint_nxv1f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_nxv1f64: -; CHECK: # 
%bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_nearbyint_nxv1f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI34_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI34_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_nearbyint_nxv1f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_nearbyint_nxv1f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI34_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI34_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_nearbyint_nxv1f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv1f64( %va, %m, i32 %evl) ret %v } define @vp_nearbyint_nxv1f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_nxv1f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, 
%hi(.LCPI35_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_nearbyint_nxv1f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI35_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI35_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_nearbyint_nxv1f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_nearbyint_nxv1f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI35_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI35_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_nearbyint_nxv1f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv1f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1252,43 +1364,149 @@ define @vp_nearbyint_nxv1f64_unmasked( @llvm.vp.nearbyint.nxv2f64(, , i32) define @vp_nearbyint_nxv2f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI36_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, 
mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_nearbyint_nxv2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v10, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI36_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI36_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vmv1r.v v0, v10 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_nearbyint_nxv2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v10, v0 +; RV64ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vmv1r.v v0, v10 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_nearbyint_nxv2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI36_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI36_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_nearbyint_nxv2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv2f64( %va, %m, i32 %evl) ret %v } define @vp_nearbyint_nxv2f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_nxv2f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI37_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, 
v10, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_nearbyint_nxv2f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI37_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI37_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_nearbyint_nxv2f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_nearbyint_nxv2f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI37_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI37_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_nearbyint_nxv2f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1296,43 +1514,149 @@ define @vp_nearbyint_nxv2f64_unmasked( @llvm.vp.nearbyint.nxv4f64(, , i32) define @vp_nearbyint_nxv4f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI38_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_nearbyint_nxv4f64: +; 
RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v12, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI38_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI38_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vmv1r.v v0, v12 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_nearbyint_nxv4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v12, v0 +; RV64ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vmv1r.v v0, v12 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_nearbyint_nxv4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI38_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI38_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_nearbyint_nxv4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv4f64( %va, %m, i32 %evl) ret %v } define @vp_nearbyint_nxv4f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_nxv4f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI39_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: 
vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_nearbyint_nxv4f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI39_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI39_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_nearbyint_nxv4f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_nearbyint_nxv4f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI39_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI39_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_nearbyint_nxv4f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv4f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1340,43 +1664,149 @@ define @vp_nearbyint_nxv4f64_unmasked( @llvm.vp.nearbyint.nxv7f64(, , i32) define @vp_nearbyint_nxv7f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_nxv7f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI40_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_nearbyint_nxv7f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, 
%hi(.LCPI40_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI40_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_nearbyint_nxv7f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_nearbyint_nxv7f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI40_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI40_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_nearbyint_nxv7f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv7f64( %va, %m, i32 %evl) ret %v } define @vp_nearbyint_nxv7f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_nxv7f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI41_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; 
RV32ZVFH-LABEL: vp_nearbyint_nxv7f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI41_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI41_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_nearbyint_nxv7f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_nearbyint_nxv7f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI41_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI41_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_nearbyint_nxv7f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv7f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1384,43 +1814,149 @@ define @vp_nearbyint_nxv7f64_unmasked( @llvm.vp.nearbyint.nxv8f64(, , i32) define @vp_nearbyint_nxv8f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI42_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_nearbyint_nxv8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI42_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI42_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli 
zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_nearbyint_nxv8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_nearbyint_nxv8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI42_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI42_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_nearbyint_nxv8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv8f64( %va, %m, i32 %evl) ret %v } define @vp_nearbyint_nxv8f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_nxv8f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI43_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_nearbyint_nxv8f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI43_0) +; RV32ZVFH-NEXT: fld 
fa5, %lo(.LCPI43_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_nearbyint_nxv8f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_nearbyint_nxv8f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI43_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI43_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_nearbyint_nxv8f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv8f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1429,87 +1965,325 @@ define @vp_nearbyint_nxv8f64_unmasked( @llvm.vp.nearbyint.nxv16f64(, , i32) define @vp_nearbyint_nxv16f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_nxv16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v7, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI44_0) -; CHECK-NEXT: srli a3, a1, 3 -; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a2) -; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: vslidedown.vx v6, v0, a3 -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: frflags a2 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsflags a2 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB44_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB44_2: -; CHECK-NEXT: vmv1r.v v0, v7 -; 
CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_nearbyint_nxv16f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v7, v0 +; RV32ZVFH-NEXT: csrr a1, vlenb +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI44_0) +; RV32ZVFH-NEXT: srli a3, a1, 3 +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2) +; RV32ZVFH-NEXT: sub a2, a0, a1 +; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3 +; RV32ZVFH-NEXT: sltu a3, a0, a2 +; RV32ZVFH-NEXT: addi a3, a3, -1 +; RV32ZVFH-NEXT: and a2, a3, a2 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFH-NEXT: frflags a2 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: fsflags a2 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: bltu a0, a1, .LBB44_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: mv a0, a1 +; RV32ZVFH-NEXT: .LBB44_2: +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_nearbyint_nxv16f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v7, v0 +; RV64ZVFH-NEXT: csrr a1, vlenb +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: srli a3, a1, 3 +; RV64ZVFH-NEXT: vslidedown.vx v6, v0, a3 +; RV64ZVFH-NEXT: sub a3, a0, a1 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: sltu a2, a0, a3 +; RV64ZVFH-NEXT: addi a2, a2, -1 +; RV64ZVFH-NEXT: and a2, a2, a3 +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFH-NEXT: frflags a2 +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: fsflags a2 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: bltu a0, a1, .LBB44_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: mv a0, a1 +; RV64ZVFH-NEXT: .LBB44_2: +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8, 
v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_nearbyint_nxv16f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v7, v0 +; RV32ZVFHMIN-NEXT: csrr a1, vlenb +; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI44_0) +; RV32ZVFHMIN-NEXT: srli a3, a1, 3 +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2) +; RV32ZVFHMIN-NEXT: sub a2, a0, a1 +; RV32ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 +; RV32ZVFHMIN-NEXT: sltu a3, a0, a2 +; RV32ZVFHMIN-NEXT: addi a3, a3, -1 +; RV32ZVFHMIN-NEXT: and a2, a3, a2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: frflags a2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: fsflags a2 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: bltu a0, a1, .LBB44_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: mv a0, a1 +; RV32ZVFHMIN-NEXT: .LBB44_2: +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_nearbyint_nxv16f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v7, v0 +; RV64ZVFHMIN-NEXT: csrr a1, vlenb +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: srli a3, a1, 3 +; RV64ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 +; RV64ZVFHMIN-NEXT: sub a3, a0, a1 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV64ZVFHMIN-NEXT: addi a2, a2, -1 +; RV64ZVFHMIN-NEXT: and a2, a2, a3 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: frflags a2 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: fsflags a2 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: bltu a0, a1, .LBB44_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; 
RV64ZVFHMIN-NEXT: mv a0, a1 +; RV64ZVFHMIN-NEXT: .LBB44_2: +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv16f64( %va, %m, i32 %evl) ret %v } define @vp_nearbyint_nxv16f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_nearbyint_nxv16f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI45_0) -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a2) -; CHECK-NEXT: sltu a2, a0, a3 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: frflags a2 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: fsflags a2 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB45_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB45_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: frflags a0 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: fsflags a0 -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_nearbyint_nxv16f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: csrr a1, vlenb +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0) +; RV32ZVFH-NEXT: sub a3, a0, a1 +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; RV32ZVFH-NEXT: sltu a2, a0, a3 +; RV32ZVFH-NEXT: addi a2, a2, -1 +; RV32ZVFH-NEXT: and a2, a2, a3 +; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: frflags a2 +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: fsflags a2 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: bltu a0, a1, .LBB45_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: mv a0, a1 +; RV32ZVFH-NEXT: .LBB45_2: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: frflags a0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: fsflags a0 +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_nearbyint_nxv16f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: csrr a1, vlenb +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: sub a3, a0, a1 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: sltu a2, a0, a3 +; RV64ZVFH-NEXT: addi a2, a2, -1 +; RV64ZVFH-NEXT: and a2, a2, a3 +; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma 
+; RV64ZVFH-NEXT: vfabs.v v24, v16 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: frflags a2 +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: fsflags a2 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: bltu a0, a1, .LBB45_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: mv a0, a1 +; RV64ZVFH-NEXT: .LBB45_2: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: frflags a0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: fsflags a0 +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_nearbyint_nxv16f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: csrr a1, vlenb +; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI45_0) +; RV32ZVFHMIN-NEXT: sub a3, a0, a1 +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; RV32ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV32ZVFHMIN-NEXT: addi a2, a2, -1 +; RV32ZVFHMIN-NEXT: and a2, a2, a3 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFHMIN-NEXT: frflags a2 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: fsflags a2 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: mv a0, a1 +; RV32ZVFHMIN-NEXT: .LBB45_2: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFHMIN-NEXT: frflags a0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsflags a0 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_nearbyint_nxv16f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: csrr a1, vlenb +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: sub a3, a0, a1 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV64ZVFHMIN-NEXT: addi a2, a2, -1 +; RV64ZVFHMIN-NEXT: and a2, a2, a3 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: frflags a2 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: fsflags a2 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: mv a0, a1 +; RV64ZVFHMIN-NEXT: .LBB45_2: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: frflags a0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsflags a0 +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.nearbyint.nxv16f64( %va, splat (i1 
true), i32 %evl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll index 306cacb31bdef..06d54fadaeffd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/remat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll @@ -126,18 +126,18 @@ define void @vmv.v.x_needs_extended(ptr %p, i64 %x) { ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a1 -; CHECK-NEXT: addi a1, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vs8r.v v8, (a0) ; CHECK-NEXT: vl8re64.v v16, (a0) +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vl8re64.v v24, (a0) ; CHECK-NEXT: vl8re64.v v0, (a0) -; CHECK-NEXT: vl8re64.v v8, (a0) -; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: vl8re64.v v16, (a0) +; CHECK-NEXT: vs8r.v v16, (a0) ; CHECK-NEXT: vs8r.v v0, (a0) ; CHECK-NEXT: vs8r.v v24, (a0) +; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vs8r.v v16, (a0) -; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vs8r.v v8, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 diff --git a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll index a9505dca97529..091caa6c65fd2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rint-vp.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV32ZVFMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN declare @llvm.vp.rint.nxv1bf16(, , i32) @@ -379,10 +379,11 @@ declare @llvm.vp.rint.nxv1f16(, @vp_rint_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, ma @@ -421,10 +422,11 @@ define @vp_rint_nxv1f16( %va, @vp_rint_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, 
fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -457,10 +459,11 @@ declare @llvm.vp.rint.nxv2f16(, @vp_rint_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, ma @@ -499,10 +502,11 @@ define @vp_rint_nxv2f16( %va, @vp_rint_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -535,10 +539,11 @@ declare @llvm.vp.rint.nxv4f16(, @vp_rint_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, ma @@ -577,10 +582,11 @@ define @vp_rint_nxv4f16( %va, @vp_rint_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t @@ -615,9 +621,10 @@ define @vp_rint_nxv8f16( %va, @vp_rint_nxv8f16( %va, @vp_rint_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t @@ -695,9 +703,10 @@ define @vp_rint_nxv16f16( %va, @vp_rint_nxv16f16( %va, @vp_rint_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t @@ -775,9 +785,10 @@ define @vp_rint_nxv32f16( %va, @vp_rint_nxv32f16( %va, @vp_rint_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_rint_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; 
ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t ; ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t @@ -1110,37 +1122,125 @@ define @vp_rint_nxv16f32_unmasked( %v declare @llvm.vp.rint.nxv1f64(, , i32) define @vp_rint_nxv1f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_rint_nxv1f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI34_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI34_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_rint_nxv1f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFMIN-LABEL: vp_rint_nxv1f64: +; RV32ZVFMIN: # %bb.0: +; RV32ZVFMIN-NEXT: lui a1, %hi(.LCPI34_0) +; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI34_0)(a1) +; RV32ZVFMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFMIN-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_rint_nxv1f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv1f64( %va, %m, i32 %evl) ret %v } define @vp_rint_nxv1f64_unmasked( %va, i32 zeroext %evl) { -; 
CHECK-LABEL: vp_rint_nxv1f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI35_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_rint_nxv1f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI35_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI35_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_rint_nxv1f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFMIN-LABEL: vp_rint_nxv1f64_unmasked: +; RV32ZVFMIN: # %bb.0: +; RV32ZVFMIN-NEXT: lui a1, %hi(.LCPI35_0) +; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI35_0)(a1) +; RV32ZVFMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_rint_nxv1f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv1f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1148,39 +1248,133 @@ define @vp_rint_nxv1f64_unmasked( %va declare @llvm.vp.rint.nxv2f64(, , i32) define @vp_rint_nxv2f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI36_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_rint_nxv2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v10, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI36_0) +; RV32ZVFH-NEXT: fld fa5, 
%lo(.LCPI36_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v10 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_rint_nxv2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v10, v0 +; RV64ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v10 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFMIN-LABEL: vp_rint_nxv2f64: +; RV32ZVFMIN: # %bb.0: +; RV32ZVFMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFMIN-NEXT: vmv1r.v v10, v0 +; RV32ZVFMIN-NEXT: lui a0, %hi(.LCPI36_0) +; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI36_0)(a0) +; RV32ZVFMIN-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFMIN-NEXT: vmv1r.v v0, v10 +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_rint_nxv2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv2f64( %va, %m, i32 %evl) ret %v } define @vp_rint_nxv2f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_nxv2f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI37_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_rint_nxv2f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI37_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI37_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: 
vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_rint_nxv2f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFMIN-LABEL: vp_rint_nxv2f64_unmasked: +; RV32ZVFMIN: # %bb.0: +; RV32ZVFMIN-NEXT: lui a1, %hi(.LCPI37_0) +; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI37_0)(a1) +; RV32ZVFMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_rint_nxv2f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1188,39 +1382,133 @@ define @vp_rint_nxv2f64_unmasked( %va declare @llvm.vp.rint.nxv4f64(, , i32) define @vp_rint_nxv4f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI38_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_rint_nxv4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v12, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI38_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI38_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v12 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_rint_nxv4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v12, v0 +; RV64ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: 
vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v12 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFMIN-LABEL: vp_rint_nxv4f64: +; RV32ZVFMIN: # %bb.0: +; RV32ZVFMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFMIN-NEXT: vmv1r.v v12, v0 +; RV32ZVFMIN-NEXT: lui a0, %hi(.LCPI38_0) +; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI38_0)(a0) +; RV32ZVFMIN-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFMIN-NEXT: vmv1r.v v0, v12 +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_rint_nxv4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv4f64( %va, %m, i32 %evl) ret %v } define @vp_rint_nxv4f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_nxv4f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI39_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_rint_nxv4f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI39_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI39_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_rint_nxv4f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFMIN-LABEL: vp_rint_nxv4f64_unmasked: +; RV32ZVFMIN: # %bb.0: +; RV32ZVFMIN-NEXT: lui a1, %hi(.LCPI39_0) +; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI39_0)(a1) +; RV32ZVFMIN-NEXT: vsetvli zero, a0, e64, m4, 
ta, ma +; RV32ZVFMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_rint_nxv4f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv4f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1228,39 +1516,133 @@ define @vp_rint_nxv4f64_unmasked( %va declare @llvm.vp.rint.nxv7f64(, , i32) define @vp_rint_nxv7f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_nxv7f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI40_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_rint_nxv7f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI40_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI40_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_rint_nxv7f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFMIN-LABEL: vp_rint_nxv7f64: +; RV32ZVFMIN: # %bb.0: +; RV32ZVFMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFMIN-NEXT: lui a0, %hi(.LCPI40_0) +; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI40_0)(a0) +; RV32ZVFMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; 
RV32ZVFMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_rint_nxv7f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv7f64( %va, %m, i32 %evl) ret %v } define @vp_rint_nxv7f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_nxv7f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI41_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_rint_nxv7f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI41_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI41_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_rint_nxv7f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFMIN-LABEL: vp_rint_nxv7f64_unmasked: +; RV32ZVFMIN: # %bb.0: +; RV32ZVFMIN-NEXT: lui a1, %hi(.LCPI41_0) +; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI41_0)(a1) +; RV32ZVFMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_rint_nxv7f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, 
v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv7f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1268,39 +1650,133 @@ define @vp_rint_nxv7f64_unmasked( %va declare @llvm.vp.rint.nxv8f64(, , i32) define @vp_rint_nxv8f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI42_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_rint_nxv8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI42_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI42_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_rint_nxv8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFMIN-LABEL: vp_rint_nxv8f64: +; RV32ZVFMIN: # %bb.0: +; RV32ZVFMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFMIN-NEXT: lui a0, %hi(.LCPI42_0) +; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI42_0)(a0) +; RV32ZVFMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_rint_nxv8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t 
+; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv8f64( %va, %m, i32 %evl) ret %v } define @vp_rint_nxv8f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_nxv8f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI43_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_rint_nxv8f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI43_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI43_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_rint_nxv8f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFMIN-LABEL: vp_rint_nxv8f64_unmasked: +; RV32ZVFMIN: # %bb.0: +; RV32ZVFMIN-NEXT: lui a1, %hi(.LCPI43_0) +; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI43_0)(a1) +; RV32ZVFMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_rint_nxv8f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv8f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1309,79 +1785,293 @@ define @vp_rint_nxv8f64_unmasked( %va declare @llvm.vp.rint.nxv16f64(, , i32) define @vp_rint_nxv16f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_nxv16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v7, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI44_0) -; CHECK-NEXT: srli a3, a1, 3 -; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a2) -; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: vslidedown.vx v6, v0, a3 -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; 
CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB44_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB44_2: -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_rint_nxv16f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v7, v0 +; RV32ZVFH-NEXT: csrr a1, vlenb +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI44_0) +; RV32ZVFH-NEXT: srli a3, a1, 3 +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2) +; RV32ZVFH-NEXT: sub a2, a0, a1 +; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3 +; RV32ZVFH-NEXT: sltu a3, a0, a2 +; RV32ZVFH-NEXT: addi a3, a3, -1 +; RV32ZVFH-NEXT: and a2, a3, a2 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: bltu a0, a1, .LBB44_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: mv a0, a1 +; RV32ZVFH-NEXT: .LBB44_2: +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_rint_nxv16f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v7, v0 +; RV64ZVFH-NEXT: csrr a1, vlenb +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: srli a3, a1, 3 +; RV64ZVFH-NEXT: vslidedown.vx v6, v0, a3 +; RV64ZVFH-NEXT: sub a3, a0, a1 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: sltu a2, a0, a3 +; RV64ZVFH-NEXT: addi a2, a2, -1 +; RV64ZVFH-NEXT: and a2, a2, a3 +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, 
v24, v16, v0.t +; RV64ZVFH-NEXT: bltu a0, a1, .LBB44_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: mv a0, a1 +; RV64ZVFH-NEXT: .LBB44_2: +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFMIN-LABEL: vp_rint_nxv16f64: +; RV32ZVFMIN: # %bb.0: +; RV32ZVFMIN-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32ZVFMIN-NEXT: vmv1r.v v7, v0 +; RV32ZVFMIN-NEXT: csrr a1, vlenb +; RV32ZVFMIN-NEXT: lui a2, %hi(.LCPI44_0) +; RV32ZVFMIN-NEXT: srli a3, a1, 3 +; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2) +; RV32ZVFMIN-NEXT: sub a2, a0, a1 +; RV32ZVFMIN-NEXT: vslidedown.vx v6, v0, a3 +; RV32ZVFMIN-NEXT: sltu a3, a0, a2 +; RV32ZVFMIN-NEXT: addi a3, a3, -1 +; RV32ZVFMIN-NEXT: and a2, a3, a2 +; RV32ZVFMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFMIN-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFMIN-NEXT: bltu a0, a1, .LBB44_2 +; RV32ZVFMIN-NEXT: # %bb.1: +; RV32ZVFMIN-NEXT: mv a0, a1 +; RV32ZVFMIN-NEXT: .LBB44_2: +; RV32ZVFMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_rint_nxv16f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v7, v0 +; RV64ZVFHMIN-NEXT: csrr a1, vlenb +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: srli a3, a1, 3 +; RV64ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 +; RV64ZVFHMIN-NEXT: sub a3, a0, a1 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV64ZVFHMIN-NEXT: addi a2, a2, -1 +; RV64ZVFHMIN-NEXT: and a2, a2, a3 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: bltu a0, a1, .LBB44_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: mv a0, a1 +; 
RV64ZVFHMIN-NEXT: .LBB44_2: +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv16f64( %va, %m, i32 %evl) ret %v } define @vp_rint_nxv16f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_rint_nxv16f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI45_0) -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a2) -; CHECK-NEXT: sltu a2, a0, a3 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB45_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB45_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_rint_nxv16f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: csrr a1, vlenb +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0) +; RV32ZVFH-NEXT: sub a3, a0, a1 +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; RV32ZVFH-NEXT: sltu a2, a0, a3 +; RV32ZVFH-NEXT: addi a2, a2, -1 +; RV32ZVFH-NEXT: and a2, a2, a3 +; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: bltu a0, a1, .LBB45_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: mv a0, a1 +; RV32ZVFH-NEXT: .LBB45_2: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_rint_nxv16f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: csrr a1, vlenb +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: sub a3, a0, a1 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: sltu a2, a0, a3 +; RV64ZVFH-NEXT: addi a2, a2, -1 +; RV64ZVFH-NEXT: and a2, a2, a3 +; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: bltu a0, a1, .LBB45_2 +; RV64ZVFH-NEXT: # %bb.1: +; 
RV64ZVFH-NEXT: mv a0, a1 +; RV64ZVFH-NEXT: .LBB45_2: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFMIN-LABEL: vp_rint_nxv16f64_unmasked: +; RV32ZVFMIN: # %bb.0: +; RV32ZVFMIN-NEXT: csrr a1, vlenb +; RV32ZVFMIN-NEXT: lui a2, %hi(.LCPI45_0) +; RV32ZVFMIN-NEXT: sub a3, a0, a1 +; RV32ZVFMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; RV32ZVFMIN-NEXT: sltu a2, a0, a3 +; RV32ZVFMIN-NEXT: addi a2, a2, -1 +; RV32ZVFMIN-NEXT: and a2, a2, a3 +; RV32ZVFMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFMIN-NEXT: vfabs.v v24, v16 +; RV32ZVFMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFMIN-NEXT: bltu a0, a1, .LBB45_2 +; RV32ZVFMIN-NEXT: # %bb.1: +; RV32ZVFMIN-NEXT: mv a0, a1 +; RV32ZVFMIN-NEXT: .LBB45_2: +; RV32ZVFMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFMIN-NEXT: vfabs.v v24, v8 +; RV32ZVFMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_rint_nxv16f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: csrr a1, vlenb +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: sub a3, a0, a1 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV64ZVFHMIN-NEXT: addi a2, a2, -1 +; RV64ZVFHMIN-NEXT: and a2, a2, a3 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: mv a0, a1 +; RV64ZVFHMIN-NEXT: .LBB45_2: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.rint.nxv16f64( %va, splat (i1 true), i32 %evl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll index ccbc0ebb3b73e..d1ea5aa76268a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/round-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/round-vp.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: 
--check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN declare @llvm.vp.round.nxv1bf16(, , i32) @@ -407,10 +407,11 @@ declare @llvm.vp.round.nxv1f16(, @vp_round_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 4 @@ -453,10 +454,11 @@ define @vp_round_nxv1f16( %va, @vp_round_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -493,10 +495,11 @@ declare @llvm.vp.round.nxv2f16(, @vp_round_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 4 @@ -539,10 +542,11 @@ define @vp_round_nxv2f16( %va, @vp_round_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -579,10 +583,11 @@ declare @llvm.vp.round.nxv4f16(, @vp_round_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 4 @@ -625,10 +630,11 @@ define @vp_round_nxv4f16( %va, @vp_round_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, 
v9, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -667,9 +673,10 @@ define @vp_round_nxv8f16( %va, @vp_round_nxv8f16( %va, @vp_round_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -755,9 +763,10 @@ define @vp_round_nxv16f16( %va, @vp_round_nxv16f16( %va, @vp_round_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -843,9 +853,10 @@ define @vp_round_nxv32f16( %va, @vp_round_nxv32f16( %va, @vp_round_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_round_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: fsrmi a0, 4 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -1210,41 +1222,141 @@ define @vp_round_nxv16f32_unmasked( % declare @llvm.vp.round.nxv1f64(, , i32) define @vp_round_nxv1f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_nxv1f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI34_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI34_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_nxv1f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; 
RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_nxv1f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI34_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI34_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_nxv1f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.round.nxv1f64( %va, %m, i32 %evl) ret %v } define @vp_round_nxv1f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_nxv1f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI35_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_nxv1f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI35_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI35_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_nxv1f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_nxv1f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI35_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI35_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma 
+; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_nxv1f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.round.nxv1f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1252,43 +1364,149 @@ define @vp_round_nxv1f64_unmasked( %v declare @llvm.vp.round.nxv2f64(, , i32) define @vp_round_nxv2f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI36_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_nxv2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v10, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI36_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI36_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vmv1r.v v0, v10 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_nxv2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v10, v0 +; RV64ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vmv1r.v v0, v10 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_nxv2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI36_0) +; 
RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI36_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_nxv2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.round.nxv2f64( %va, %m, i32 %evl) ret %v } define @vp_round_nxv2f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_nxv2f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI37_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_nxv2f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI37_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI37_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_nxv2f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_nxv2f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI37_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI37_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: 
vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_nxv2f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.round.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1296,43 +1514,149 @@ define @vp_round_nxv2f64_unmasked( %v declare @llvm.vp.round.nxv4f64(, , i32) define @vp_round_nxv4f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI38_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_nxv4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v12, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI38_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI38_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vmv1r.v v0, v12 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_nxv4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v12, v0 +; RV64ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vmv1r.v v0, v12 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_nxv4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI38_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI38_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; 
RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_nxv4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.round.nxv4f64( %va, %m, i32 %evl) ret %v } define @vp_round_nxv4f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_nxv4f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI39_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_nxv4f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI39_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI39_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_nxv4f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_nxv4f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI39_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI39_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: 
vp_round_nxv4f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.round.nxv4f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1340,43 +1664,149 @@ define @vp_round_nxv4f64_unmasked( %v declare @llvm.vp.round.nxv7f64(, , i32) define @vp_round_nxv7f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_nxv7f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI40_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_nxv7f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI40_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI40_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_nxv7f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_nxv7f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI40_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI40_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; 
RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_nxv7f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.round.nxv7f64( %va, %m, i32 %evl) ret %v } define @vp_round_nxv7f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_nxv7f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI41_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_nxv7f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI41_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI41_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_nxv7f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_nxv7f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI41_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI41_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_nxv7f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; 
RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.round.nxv7f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1384,43 +1814,149 @@ define @vp_round_nxv7f64_unmasked( %v declare @llvm.vp.round.nxv8f64(, , i32) define @vp_round_nxv8f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI42_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_nxv8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI42_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI42_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_nxv8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_nxv8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI42_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI42_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; 
RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_nxv8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.round.nxv8f64( %va, %m, i32 %evl) ret %v } define @vp_round_nxv8f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_nxv8f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI43_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_nxv8f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI43_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI43_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_nxv8f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_nxv8f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI43_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI43_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_nxv8f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; 
RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.round.nxv8f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1429,87 +1965,325 @@ define @vp_round_nxv8f64_unmasked( %v declare @llvm.vp.round.nxv16f64(, , i32) define @vp_round_nxv16f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_nxv16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v7, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI44_0) -; CHECK-NEXT: srli a3, a1, 3 -; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a2) -; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: vslidedown.vx v6, v0, a3 -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a2, 4 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB44_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB44_2: -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_nxv16f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v7, v0 +; RV32ZVFH-NEXT: csrr a1, vlenb +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI44_0) +; RV32ZVFH-NEXT: srli a3, a1, 3 +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2) +; RV32ZVFH-NEXT: sub a2, a0, a1 +; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3 +; RV32ZVFH-NEXT: sltu a3, a0, a2 +; RV32ZVFH-NEXT: addi a3, a3, -1 +; RV32ZVFH-NEXT: and a2, a3, a2 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a2, 4 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a2 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: bltu a0, a1, .LBB44_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: mv a0, a1 +; RV32ZVFH-NEXT: .LBB44_2: +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; 
RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_nxv16f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v7, v0 +; RV64ZVFH-NEXT: csrr a1, vlenb +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: srli a3, a1, 3 +; RV64ZVFH-NEXT: vslidedown.vx v6, v0, a3 +; RV64ZVFH-NEXT: sub a3, a0, a1 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: sltu a2, a0, a3 +; RV64ZVFH-NEXT: addi a2, a2, -1 +; RV64ZVFH-NEXT: and a2, a2, a3 +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a2, 4 +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a2 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: bltu a0, a1, .LBB44_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: mv a0, a1 +; RV64ZVFH-NEXT: .LBB44_2: +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_nxv16f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v7, v0 +; RV32ZVFHMIN-NEXT: csrr a1, vlenb +; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI44_0) +; RV32ZVFHMIN-NEXT: srli a3, a1, 3 +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2) +; RV32ZVFHMIN-NEXT: sub a2, a0, a1 +; RV32ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 +; RV32ZVFHMIN-NEXT: sltu a3, a0, a2 +; RV32ZVFHMIN-NEXT: addi a3, a3, -1 +; RV32ZVFHMIN-NEXT: and a2, a3, a2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a2, 4 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a2 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: bltu a0, a1, .LBB44_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: mv a0, a1 +; RV32ZVFHMIN-NEXT: .LBB44_2: +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; 
RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_nxv16f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v7, v0 +; RV64ZVFHMIN-NEXT: csrr a1, vlenb +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: srli a3, a1, 3 +; RV64ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 +; RV64ZVFHMIN-NEXT: sub a3, a0, a1 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV64ZVFHMIN-NEXT: addi a2, a2, -1 +; RV64ZVFHMIN-NEXT: and a2, a2, a3 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a2, 4 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a2 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: bltu a0, a1, .LBB44_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: mv a0, a1 +; RV64ZVFHMIN-NEXT: .LBB44_2: +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.round.nxv16f64( %va, %m, i32 %evl) ret %v } define @vp_round_nxv16f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_round_nxv16f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI45_0) -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a2) -; CHECK-NEXT: sltu a2, a0, a3 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a2, 4 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB45_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB45_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a0, 4 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_round_nxv16f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: csrr a1, vlenb +; 
RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0) +; RV32ZVFH-NEXT: sub a3, a0, a1 +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; RV32ZVFH-NEXT: sltu a2, a0, a3 +; RV32ZVFH-NEXT: addi a2, a2, -1 +; RV32ZVFH-NEXT: and a2, a2, a3 +; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: fsrmi a2, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a2 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: bltu a0, a1, .LBB45_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: mv a0, a1 +; RV32ZVFH-NEXT: .LBB45_2: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 4 +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_round_nxv16f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: csrr a1, vlenb +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: sub a3, a0, a1 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: sltu a2, a0, a3 +; RV64ZVFH-NEXT: addi a2, a2, -1 +; RV64ZVFH-NEXT: and a2, a2, a3 +; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: fsrmi a2, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a2 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: bltu a0, a1, .LBB45_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: mv a0, a1 +; RV64ZVFH-NEXT: .LBB45_2: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 4 +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_round_nxv16f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: csrr a1, vlenb +; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI45_0) +; RV32ZVFHMIN-NEXT: sub a3, a0, a1 +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; RV32ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV32ZVFHMIN-NEXT: addi a2, a2, -1 +; RV32ZVFHMIN-NEXT: and a2, a2, a3 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a2, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a2 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: mv a0, a1 +; RV32ZVFHMIN-NEXT: .LBB45_2: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 4 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, 
zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_round_nxv16f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: csrr a1, vlenb +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: sub a3, a0, a1 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV64ZVFHMIN-NEXT: addi a2, a2, -1 +; RV64ZVFHMIN-NEXT: and a2, a2, a3 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a2, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a2 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: mv a0, a1 +; RV64ZVFHMIN-NEXT: .LBB45_2: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 4 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.round.nxv16f64( %va, splat (i1 true), i32 %evl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll index 3975423e6f985..23d0e97c1c82b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/roundeven-vp.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN declare @llvm.vp.roundeven.nxv1bf16(, , i32) @@ -407,10 +407,11 @@ declare @llvm.vp.roundeven.nxv1f16(, @vp_roundeven_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 0 @@ -453,10 +454,11 @@ define @vp_roundeven_nxv1f16( %va, @vp_roundeven_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, 
%hi(.LCPI13_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -493,10 +495,11 @@ declare @llvm.vp.roundeven.nxv2f16(, @vp_roundeven_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 0 @@ -539,10 +542,11 @@ define @vp_roundeven_nxv2f16( %va, @vp_roundeven_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -579,10 +583,11 @@ declare @llvm.vp.roundeven.nxv4f16(, @vp_roundeven_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 0 @@ -625,10 +630,11 @@ define @vp_roundeven_nxv4f16( %va, @vp_roundeven_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -667,9 +673,10 @@ define @vp_roundeven_nxv8f16( %va, @vp_roundeven_nxv8f16( %va, @vp_roundeven_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -755,9 +763,10 @@ define @vp_roundeven_nxv16f16( %va, @vp_roundeven_nxv16f16( %va, @vp_roundeven_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -843,9 +853,10 @@ define @vp_roundeven_nxv32f16( %va, @vp_roundeven_nxv32f16( %va, 
@vp_roundeven_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundeven_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: fsrmi a0, 0 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -1210,41 +1222,141 @@ define @vp_roundeven_nxv16f32_unmasked( @llvm.vp.roundeven.nxv1f64(, , i32) define @vp_roundeven_nxv1f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_nxv1f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI34_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI34_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_nxv1f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_nxv1f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI34_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI34_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_nxv1f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; 
RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundeven.nxv1f64( %va, %m, i32 %evl) ret %v } define @vp_roundeven_nxv1f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_nxv1f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI35_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_nxv1f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI35_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI35_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_nxv1f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_nxv1f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI35_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI35_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_nxv1f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundeven.nxv1f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1252,43 +1364,149 @@ define @vp_roundeven_nxv1f64_unmasked( @llvm.vp.roundeven.nxv2f64(, , 
i32) define @vp_roundeven_nxv2f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI36_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_nxv2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v10, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI36_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI36_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vmv1r.v v0, v10 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_nxv2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v10, v0 +; RV64ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vmv1r.v v0, v10 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_nxv2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI36_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI36_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_nxv2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, 
ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundeven.nxv2f64( %va, %m, i32 %evl) ret %v } define @vp_roundeven_nxv2f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_nxv2f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI37_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_nxv2f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI37_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI37_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_nxv2f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_nxv2f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI37_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI37_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_nxv2f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundeven.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1296,43 +1514,149 @@ define @vp_roundeven_nxv2f64_unmasked( @llvm.vp.roundeven.nxv4f64(, , i32) define @vp_roundeven_nxv4f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, 
e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI38_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_nxv4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v12, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI38_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI38_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vmv1r.v v0, v12 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_nxv4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v12, v0 +; RV64ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vmv1r.v v0, v12 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_nxv4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI38_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI38_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_nxv4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: 
vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundeven.nxv4f64( %va, %m, i32 %evl) ret %v } define @vp_roundeven_nxv4f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_nxv4f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI39_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_nxv4f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI39_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI39_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_nxv4f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_nxv4f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI39_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI39_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_nxv4f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundeven.nxv4f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1340,43 +1664,149 @@ define @vp_roundeven_nxv4f64_unmasked( @llvm.vp.roundeven.nxv7f64(, , i32) define @vp_roundeven_nxv7f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_nxv7f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI40_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a0) -; CHECK-NEXT: vfabs.v 
v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_nxv7f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI40_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI40_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_nxv7f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_nxv7f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI40_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI40_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_nxv7f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundeven.nxv7f64( %va, %m, 
i32 %evl) ret %v } define @vp_roundeven_nxv7f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_nxv7f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI41_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_nxv7f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI41_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI41_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_nxv7f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_nxv7f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI41_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI41_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_nxv7f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundeven.nxv7f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1384,43 +1814,149 @@ define @vp_roundeven_nxv7f64_unmasked( @llvm.vp.roundeven.nxv8f64(, , i32) define @vp_roundeven_nxv8f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI42_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI42_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: 
vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_nxv8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI42_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI42_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_nxv8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_nxv8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI42_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI42_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_nxv8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundeven.nxv8f64( %va, %m, i32 %evl) ret %v } define @vp_roundeven_nxv8f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_nxv8f64_unmasked: -; CHECK: # %bb.0: -; 
CHECK-NEXT: lui a1, %hi(.LCPI43_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_nxv8f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI43_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI43_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_nxv8f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_nxv8f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI43_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI43_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_nxv8f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundeven.nxv8f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1429,87 +1965,325 @@ define @vp_roundeven_nxv8f64_unmasked( @llvm.vp.roundeven.nxv16f64(, , i32) define @vp_roundeven_nxv16f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_nxv16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v7, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI44_0) -; CHECK-NEXT: srli a3, a1, 3 -; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a2) -; CHECK-NEXT: sub a2, a0, a1 -; CHECK-NEXT: vslidedown.vx v6, v0, a3 -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, 
v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a2, 0 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB44_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB44_2: -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_nxv16f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v7, v0 +; RV32ZVFH-NEXT: csrr a1, vlenb +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI44_0) +; RV32ZVFH-NEXT: srli a3, a1, 3 +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2) +; RV32ZVFH-NEXT: sub a2, a0, a1 +; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3 +; RV32ZVFH-NEXT: sltu a3, a0, a2 +; RV32ZVFH-NEXT: addi a3, a3, -1 +; RV32ZVFH-NEXT: and a2, a3, a2 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a2, 0 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a2 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: bltu a0, a1, .LBB44_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: mv a0, a1 +; RV32ZVFH-NEXT: .LBB44_2: +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_nxv16f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v7, v0 +; RV64ZVFH-NEXT: csrr a1, vlenb +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: srli a3, a1, 3 +; RV64ZVFH-NEXT: vslidedown.vx v6, v0, a3 +; RV64ZVFH-NEXT: sub a3, a0, a1 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: sltu a2, a0, a3 +; RV64ZVFH-NEXT: addi a2, a2, -1 +; RV64ZVFH-NEXT: and a2, a2, a3 +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a2, 0 
+; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a2 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: bltu a0, a1, .LBB44_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: mv a0, a1 +; RV64ZVFH-NEXT: .LBB44_2: +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_nxv16f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v7, v0 +; RV32ZVFHMIN-NEXT: csrr a1, vlenb +; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI44_0) +; RV32ZVFHMIN-NEXT: srli a3, a1, 3 +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2) +; RV32ZVFHMIN-NEXT: sub a2, a0, a1 +; RV32ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 +; RV32ZVFHMIN-NEXT: sltu a3, a0, a2 +; RV32ZVFHMIN-NEXT: addi a3, a3, -1 +; RV32ZVFHMIN-NEXT: and a2, a3, a2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a2, 0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a2 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: bltu a0, a1, .LBB44_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: mv a0, a1 +; RV32ZVFHMIN-NEXT: .LBB44_2: +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_nxv16f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v7, v0 +; RV64ZVFHMIN-NEXT: csrr a1, vlenb +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: srli a3, a1, 3 +; RV64ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 +; RV64ZVFHMIN-NEXT: sub a3, a0, a1 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV64ZVFHMIN-NEXT: addi a2, a2, -1 +; RV64ZVFHMIN-NEXT: and a2, a2, a3 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t 
+; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a2, 0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a2 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: bltu a0, a1, .LBB44_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: mv a0, a1 +; RV64ZVFHMIN-NEXT: .LBB44_2: +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundeven.nxv16f64( %va, %m, i32 %evl) ret %v } define @vp_roundeven_nxv16f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundeven_nxv16f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI45_0) -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a2) -; CHECK-NEXT: sltu a2, a0, a3 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a2, 0 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB45_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB45_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a0, 0 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundeven_nxv16f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: csrr a1, vlenb +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0) +; RV32ZVFH-NEXT: sub a3, a0, a1 +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; RV32ZVFH-NEXT: sltu a2, a0, a3 +; RV32ZVFH-NEXT: addi a2, a2, -1 +; RV32ZVFH-NEXT: and a2, a2, a3 +; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: fsrmi a2, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a2 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: bltu a0, a1, .LBB45_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: mv a0, a1 +; RV32ZVFH-NEXT: .LBB45_2: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 0 +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: 
vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundeven_nxv16f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: csrr a1, vlenb +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: sub a3, a0, a1 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: sltu a2, a0, a3 +; RV64ZVFH-NEXT: addi a2, a2, -1 +; RV64ZVFH-NEXT: and a2, a2, a3 +; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: fsrmi a2, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a2 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: bltu a0, a1, .LBB45_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: mv a0, a1 +; RV64ZVFH-NEXT: .LBB45_2: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 0 +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundeven_nxv16f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: csrr a1, vlenb +; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI45_0) +; RV32ZVFHMIN-NEXT: sub a3, a0, a1 +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; RV32ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV32ZVFHMIN-NEXT: addi a2, a2, -1 +; RV32ZVFHMIN-NEXT: and a2, a2, a3 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a2, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a2 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: mv a0, a1 +; RV32ZVFHMIN-NEXT: .LBB45_2: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 0 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundeven_nxv16f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: csrr a1, vlenb +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: sub a3, a0, a1 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV64ZVFHMIN-NEXT: addi a2, a2, -1 +; RV64ZVFHMIN-NEXT: and a2, a2, a3 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a2, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a2 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; 
RV64ZVFHMIN-NEXT: mv a0, a1 +; RV64ZVFHMIN-NEXT: .LBB45_2: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 0 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundeven.nxv16f64( %va, splat (i1 true), i32 %evl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll index 7f617f48862c4..4d8066d12c9ad 100644 --- a/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/roundtozero-vp.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV32ZVFH ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFH +; RUN: --check-prefixes=CHECK,ZVFH,RV64ZVFH ; RUN: llc -mtriple=riscv32 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV32ZVFHMIN ; RUN: llc -mtriple=riscv64 -mattr=+d,+zfhmin,+zvfhmin,+zfbfmin,+zvfbfmin,+v \ ; RUN: -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s \ -; RUN: --check-prefixes=CHECK,ZVFHMIN +; RUN: --check-prefixes=CHECK,ZVFHMIN,RV64ZVFHMIN declare @llvm.vp.roundtozero.nxv1bf16(, , i32) @@ -407,10 +407,11 @@ declare @llvm.vp.roundtozero.nxv1f16(, @vp_roundtozero_nxv1f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv1f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI12_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI12_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf4, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 1 @@ -453,10 +454,11 @@ define @vp_roundtozero_nxv1f16( %va, @vp_roundtozero_nxv1f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv1f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI13_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI13_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -493,10 +495,11 @@ declare @llvm.vp.roundtozero.nxv2f16(, @vp_roundtozero_nxv2f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv2f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI14_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI14_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, mf2, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 1 @@ -539,10 +542,11 @@ define @vp_roundtozero_nxv2f16( %va, 
@vp_roundtozero_nxv2f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv2f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI15_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI15_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -579,10 +583,11 @@ declare @llvm.vp.roundtozero.nxv4f16(, @vp_roundtozero_nxv4f16( %va, %m, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv4f16: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI16_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI16_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m1, ta, mu ; ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 1 @@ -625,10 +630,11 @@ define @vp_roundtozero_nxv4f16( %va, @vp_roundtozero_nxv4f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv4f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI17_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI17_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -667,9 +673,10 @@ define @vp_roundtozero_nxv8f16( %va, @vp_roundtozero_nxv8f16( %va, @vp_roundtozero_nxv8f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv8f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI19_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI19_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma ; ZVFH-NEXT: vfabs.v v10, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v10, fa5 ; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t @@ -755,9 +763,10 @@ define @vp_roundtozero_nxv16f16( %va, < ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vmv1r.v v12, v0 -; ZVFH-NEXT: lui a0, %hi(.LCPI20_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI20_0)(a0) ; ZVFH-NEXT: vfabs.v v16, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, mu ; ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 1 @@ -801,10 +810,11 @@ define @vp_roundtozero_nxv16f16( %va, < define @vp_roundtozero_nxv16f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv16f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI21_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI21_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFH-NEXT: vfabs.v v12, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v12, fa5 ; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t @@ -843,9 +853,10 @@ define @vp_roundtozero_nxv32f16( %va, < ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vmv1r.v v16, v0 -; ZVFH-NEXT: lui a0, %hi(.LCPI22_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI22_0)(a0) ; ZVFH-NEXT: vfabs.v v24, v8, v0.t +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vsetvli zero, zero, e16, m8, ta, mu ; ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t ; ZVFH-NEXT: fsrmi a0, 1 @@ -922,10 +933,11 @@ define 
@vp_roundtozero_nxv32f16( %va, < define @vp_roundtozero_nxv32f16_unmasked( %va, i32 zeroext %evl) { ; ZVFH-LABEL: vp_roundtozero_nxv32f16_unmasked: ; ZVFH: # %bb.0: -; ZVFH-NEXT: lui a1, %hi(.LCPI23_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI23_0)(a1) ; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFH-NEXT: vfabs.v v16, v8 +; ZVFH-NEXT: li a0, 25 +; ZVFH-NEXT: slli a0, a0, 10 +; ZVFH-NEXT: fmv.h.x fa5, a0 ; ZVFH-NEXT: vmflt.vf v0, v16, fa5 ; ZVFH-NEXT: fsrmi a0, 1 ; ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t @@ -1210,41 +1222,141 @@ define @vp_roundtozero_nxv16f32_unmasked( @llvm.vp.roundtozero.nxv1f64(, , i32) define @vp_roundtozero_nxv1f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_nxv1f64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI34_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI34_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vmflt.vf v0, v9, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_nxv1f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI34_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI34_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_nxv1f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_nxv1f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI34_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI34_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_nxv1f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli 
a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundtozero.nxv1f64( %va, %m, i32 %evl) ret %v } define @vp_roundtozero_nxv1f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_nxv1f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI35_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI35_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfabs.v v9, v8 -; CHECK-NEXT: vmflt.vf v0, v9, fa5 -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v9, v9, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v9, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_nxv1f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI35_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI35_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFH-NEXT: vfabs.v v9, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_nxv1f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFH-NEXT: vfabs.v v9, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_nxv1f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI35_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI35_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_nxv1f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v9, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v9, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v9, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v9, v9, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m1, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v9, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundtozero.nxv1f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1252,43 +1364,149 @@ 
define @vp_roundtozero_nxv1f64_unmasked( @llvm.vp.roundtozero.nxv2f64(, , i32) define @vp_roundtozero_nxv2f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_nxv2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vmv1r.v v10, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI36_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI36_0)(a0) -; CHECK-NEXT: vfabs.v v12, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vmflt.vf v10, v12, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vmv1r.v v0, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_nxv2f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v10, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI36_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI36_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vmv1r.v v0, v10 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_nxv2f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v10, v0 +; RV64ZVFH-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vmv1r.v v0, v10 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_nxv2f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI36_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI36_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_nxv2f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v10, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v10, v12, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; 
RV64ZVFHMIN-NEXT: vmv1r.v v0, v10 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundtozero.nxv2f64( %va, %m, i32 %evl) ret %v } define @vp_roundtozero_nxv2f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_nxv2f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI37_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI37_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma -; CHECK-NEXT: vfabs.v v10, v8 -; CHECK-NEXT: vmflt.vf v0, v10, fa5 -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vfcvt.x.f.v v10, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v10, v10, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v10, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_nxv2f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI37_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI37_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFH-NEXT: vfabs.v v10, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_nxv2f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFH-NEXT: vfabs.v v10, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_nxv2f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI37_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI37_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_nxv2f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v10, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v10, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v10, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v10, v10, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v10, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundtozero.nxv2f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1296,43 +1514,149 @@ define @vp_roundtozero_nxv2f64_unmasked( @llvm.vp.roundtozero.nxv4f64(, , i32) define @vp_roundtozero_nxv4f64( %va, %m, 
i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_nxv4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vmv1r.v v12, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI38_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI38_0)(a0) -; CHECK-NEXT: vfabs.v v16, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vmflt.vf v12, v16, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vmv1r.v v0, v12 -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_nxv4f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v12, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI38_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI38_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vmv1r.v v0, v12 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_nxv4f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v12, v0 +; RV64ZVFH-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vmv1r.v v0, v12 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_nxv4f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI38_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI38_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_nxv4f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v12, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v12, v16, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v12 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: 
vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundtozero.nxv4f64( %va, %m, i32 %evl) ret %v } define @vp_roundtozero_nxv4f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_nxv4f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI39_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI39_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma -; CHECK-NEXT: vfabs.v v12, v8 -; CHECK-NEXT: vmflt.vf v0, v12, fa5 -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vfcvt.x.f.v v12, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v12, v12, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m4, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v12, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_nxv4f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI39_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI39_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFH-NEXT: vfabs.v v12, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_nxv4f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFH-NEXT: vfabs.v v12, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_nxv4f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI39_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI39_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_nxv4f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v12, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v12, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v12, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v12, v12, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v12, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundtozero.nxv4f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1340,43 +1664,149 @@ define @vp_roundtozero_nxv4f64_unmasked( @llvm.vp.roundtozero.nxv7f64(, , i32) define @vp_roundtozero_nxv7f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_nxv7f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, 
e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI40_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI40_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_nxv7f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI40_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI40_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_nxv7f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_nxv7f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI40_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI40_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_nxv7f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; 
RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundtozero.nxv7f64( %va, %m, i32 %evl) ret %v } define @vp_roundtozero_nxv7f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_nxv7f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI41_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI41_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_nxv7f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI41_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI41_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_nxv7f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_nxv7f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI41_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI41_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_nxv7f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundtozero.nxv7f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1384,43 +1814,149 @@ define @vp_roundtozero_nxv7f64_unmasked( @llvm.vp.roundtozero.nxv8f64(, , i32) define @vp_roundtozero_nxv8f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_nxv8f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vmv1r.v v16, v0 -; CHECK-NEXT: lui a0, %hi(.LCPI42_0) -; CHECK-NEXT: fld fa5, 
%lo(.LCPI42_0)(a0) -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v16, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vmv1r.v v0, v16 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_nxv8f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v16, v0 +; RV32ZVFH-NEXT: lui a0, %hi(.LCPI42_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI42_0)(a0) +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vmv1r.v v0, v16 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_nxv8f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v16, v0 +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vmv1r.v v0, v16 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_nxv8f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV32ZVFHMIN-NEXT: lui a0, %hi(.LCPI42_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI42_0)(a0) +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_nxv8f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v16, v0 +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v16, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v16 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: 
ret %v = call @llvm.vp.roundtozero.nxv8f64( %va, %m, i32 %evl) ret %v } define @vp_roundtozero_nxv8f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_nxv8f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI43_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI43_0)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v16, v8 -; CHECK-NEXT: vmflt.vf v0, v16, fa5 -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vfcvt.x.f.v v16, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v16, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v16, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_nxv8f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: lui a1, %hi(.LCPI43_0) +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI43_0)(a1) +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v16, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_nxv8f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v16, v8 +; RV64ZVFH-NEXT: li a0, 1075 +; RV64ZVFH-NEXT: slli a0, a0, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a0 +; RV64ZVFH-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_nxv8f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: lui a1, %hi(.LCPI43_0) +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI43_0)(a1) +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_nxv8f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v16, v8 +; RV64ZVFHMIN-NEXT: li a0, 1075 +; RV64ZVFHMIN-NEXT: slli a0, a0, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a0 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v16, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v16, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v16, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v16, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundtozero.nxv8f64( %va, splat (i1 true), i32 %evl) ret %v } @@ -1429,87 +1965,325 @@ define @vp_roundtozero_nxv8f64_unmasked( @llvm.vp.roundtozero.nxv16f64(, , i32) define @vp_roundtozero_nxv16f64( %va, %m, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_nxv16f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv1r.v v7, v0 -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI44_0) -; CHECK-NEXT: srli a3, a1, 3 -; CHECK-NEXT: fld fa5, %lo(.LCPI44_0)(a2) -; CHECK-NEXT: sub a2, a0, a1 -; 
CHECK-NEXT: vslidedown.vx v6, v0, a3 -; CHECK-NEXT: sltu a3, a0, a2 -; CHECK-NEXT: addi a3, a3, -1 -; CHECK-NEXT: and a2, a3, a2 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v6, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a2, 1 -; CHECK-NEXT: vmv1r.v v0, v6 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB44_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB44_2: -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vmflt.vf v7, v24, fa5, v0.t -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vmv1r.v v0, v7 -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_nxv16f64: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32ZVFH-NEXT: vmv1r.v v7, v0 +; RV32ZVFH-NEXT: csrr a1, vlenb +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI44_0) +; RV32ZVFH-NEXT: srli a3, a1, 3 +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI44_0)(a2) +; RV32ZVFH-NEXT: sub a2, a0, a1 +; RV32ZVFH-NEXT: vslidedown.vx v6, v0, a3 +; RV32ZVFH-NEXT: sltu a3, a0, a2 +; RV32ZVFH-NEXT: addi a3, a3, -1 +; RV32ZVFH-NEXT: and a2, a3, a2 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a2, 1 +; RV32ZVFH-NEXT: vmv1r.v v0, v6 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a2 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: bltu a0, a1, .LBB44_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: mv a0, a1 +; RV32ZVFH-NEXT: .LBB44_2: +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vmv1r.v v0, v7 +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_nxv16f64: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64ZVFH-NEXT: vmv1r.v v7, v0 +; RV64ZVFH-NEXT: csrr a1, vlenb +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: srli a3, a1, 3 +; RV64ZVFH-NEXT: vslidedown.vx v6, v0, a3 +; RV64ZVFH-NEXT: sub a3, a0, a1 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: sltu a2, a0, a3 +; RV64ZVFH-NEXT: addi a2, a2, -1 +; RV64ZVFH-NEXT: and a2, a2, a3 +; 
RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a2, 1 +; RV64ZVFH-NEXT: vmv1r.v v0, v6 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a2 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: bltu a0, a1, .LBB44_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: mv a0, a1 +; RV64ZVFH-NEXT: .LBB44_2: +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vmv1r.v v0, v7 +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_nxv16f64: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV32ZVFHMIN-NEXT: vmv1r.v v7, v0 +; RV32ZVFHMIN-NEXT: csrr a1, vlenb +; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI44_0) +; RV32ZVFHMIN-NEXT: srli a3, a1, 3 +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI44_0)(a2) +; RV32ZVFHMIN-NEXT: sub a2, a0, a1 +; RV32ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 +; RV32ZVFHMIN-NEXT: sltu a3, a0, a2 +; RV32ZVFHMIN-NEXT: addi a3, a3, -1 +; RV32ZVFHMIN-NEXT: and a2, a3, a2 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a2, 1 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a2 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: bltu a0, a1, .LBB44_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: mv a0, a1 +; RV32ZVFHMIN-NEXT: .LBB44_2: +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_nxv16f64: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; RV64ZVFHMIN-NEXT: vmv1r.v v7, v0 +; RV64ZVFHMIN-NEXT: csrr a1, vlenb +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: srli a3, a1, 3 +; RV64ZVFHMIN-NEXT: vslidedown.vx v6, v0, a3 +; RV64ZVFHMIN-NEXT: sub a3, a0, a1 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: 
fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV64ZVFHMIN-NEXT: addi a2, a2, -1 +; RV64ZVFHMIN-NEXT: and a2, a2, a3 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v6, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a2, 1 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v6 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a2 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: bltu a0, a1, .LBB44_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: mv a0, a1 +; RV64ZVFHMIN-NEXT: .LBB44_2: +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vmflt.vf v7, v24, fa5, v0.t +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v7 +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundtozero.nxv16f64( %va, %m, i32 %evl) ret %v } define @vp_roundtozero_nxv16f64_unmasked( %va, i32 zeroext %evl) { -; CHECK-LABEL: vp_roundtozero_nxv16f64_unmasked: -; CHECK: # %bb.0: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: lui a2, %hi(.LCPI45_0) -; CHECK-NEXT: sub a3, a0, a1 -; CHECK-NEXT: fld fa5, %lo(.LCPI45_0)(a2) -; CHECK-NEXT: sltu a2, a0, a3 -; CHECK-NEXT: addi a2, a2, -1 -; CHECK-NEXT: and a2, a2, a3 -; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v16 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a2, 1 -; CHECK-NEXT: vfcvt.x.f.v v24, v16, v0.t -; CHECK-NEXT: fsrm a2 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v16, v24, v16, v0.t -; CHECK-NEXT: bltu a0, a1, .LBB45_2 -; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB45_2: -; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma -; CHECK-NEXT: vfabs.v v24, v8 -; CHECK-NEXT: vmflt.vf v0, v24, fa5 -; CHECK-NEXT: fsrmi a0, 1 -; CHECK-NEXT: vfcvt.x.f.v v24, v8, v0.t -; CHECK-NEXT: fsrm a0 -; CHECK-NEXT: vfcvt.f.x.v v24, v24, v0.t -; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, mu -; CHECK-NEXT: vfsgnj.vv v8, v24, v8, v0.t -; CHECK-NEXT: ret +; RV32ZVFH-LABEL: vp_roundtozero_nxv16f64_unmasked: +; RV32ZVFH: # %bb.0: +; RV32ZVFH-NEXT: csrr a1, vlenb +; RV32ZVFH-NEXT: lui a2, %hi(.LCPI45_0) +; RV32ZVFH-NEXT: sub a3, a0, a1 +; RV32ZVFH-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; RV32ZVFH-NEXT: sltu a2, a0, a3 +; RV32ZVFH-NEXT: addi a2, a2, -1 +; RV32ZVFH-NEXT: and a2, a2, a3 +; RV32ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v16 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: fsrmi a2, 1 +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFH-NEXT: fsrm a2 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFH-NEXT: bltu a0, a1, .LBB45_2 +; RV32ZVFH-NEXT: # %bb.1: +; RV32ZVFH-NEXT: mv a0, a1 +; 
RV32ZVFH-NEXT: .LBB45_2: +; RV32ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFH-NEXT: vfabs.v v24, v8 +; RV32ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFH-NEXT: fsrmi a0, 1 +; RV32ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFH-NEXT: fsrm a0 +; RV32ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFH-NEXT: ret +; +; RV64ZVFH-LABEL: vp_roundtozero_nxv16f64_unmasked: +; RV64ZVFH: # %bb.0: +; RV64ZVFH-NEXT: csrr a1, vlenb +; RV64ZVFH-NEXT: li a2, 1075 +; RV64ZVFH-NEXT: sub a3, a0, a1 +; RV64ZVFH-NEXT: slli a2, a2, 52 +; RV64ZVFH-NEXT: fmv.d.x fa5, a2 +; RV64ZVFH-NEXT: sltu a2, a0, a3 +; RV64ZVFH-NEXT: addi a2, a2, -1 +; RV64ZVFH-NEXT: and a2, a2, a3 +; RV64ZVFH-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v16 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: fsrmi a2, 1 +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV64ZVFH-NEXT: fsrm a2 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFH-NEXT: bltu a0, a1, .LBB45_2 +; RV64ZVFH-NEXT: # %bb.1: +; RV64ZVFH-NEXT: mv a0, a1 +; RV64ZVFH-NEXT: .LBB45_2: +; RV64ZVFH-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFH-NEXT: vfabs.v v24, v8 +; RV64ZVFH-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFH-NEXT: fsrmi a0, 1 +; RV64ZVFH-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFH-NEXT: fsrm a0 +; RV64ZVFH-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFH-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFH-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFH-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vp_roundtozero_nxv16f64_unmasked: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: csrr a1, vlenb +; RV32ZVFHMIN-NEXT: lui a2, %hi(.LCPI45_0) +; RV32ZVFHMIN-NEXT: sub a3, a0, a1 +; RV32ZVFHMIN-NEXT: fld fa5, %lo(.LCPI45_0)(a2) +; RV32ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV32ZVFHMIN-NEXT: addi a2, a2, -1 +; RV32ZVFHMIN-NEXT: and a2, a2, a3 +; RV32ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a2, 1 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v16, v0.t +; RV32ZVFHMIN-NEXT: fsrm a2 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV32ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2 +; RV32ZVFHMIN-NEXT: # %bb.1: +; RV32ZVFHMIN-NEXT: mv a0, a1 +; RV32ZVFHMIN-NEXT: .LBB45_2: +; RV32ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV32ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV32ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV32ZVFHMIN-NEXT: fsrmi a0, 1 +; RV32ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV32ZVFHMIN-NEXT: fsrm a0 +; RV32ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV32ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV32ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vp_roundtozero_nxv16f64_unmasked: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: csrr a1, vlenb +; RV64ZVFHMIN-NEXT: li a2, 1075 +; RV64ZVFHMIN-NEXT: sub a3, a0, a1 +; RV64ZVFHMIN-NEXT: slli a2, a2, 52 +; RV64ZVFHMIN-NEXT: fmv.d.x fa5, a2 +; RV64ZVFHMIN-NEXT: sltu a2, a0, a3 +; RV64ZVFHMIN-NEXT: addi a2, a2, -1 +; RV64ZVFHMIN-NEXT: and a2, a2, a3 +; RV64ZVFHMIN-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v16 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a2, 1 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, 
v16, v0.t +; RV64ZVFHMIN-NEXT: fsrm a2 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v16, v24, v16, v0.t +; RV64ZVFHMIN-NEXT: bltu a0, a1, .LBB45_2 +; RV64ZVFHMIN-NEXT: # %bb.1: +; RV64ZVFHMIN-NEXT: mv a0, a1 +; RV64ZVFHMIN-NEXT: .LBB45_2: +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vfabs.v v24, v8 +; RV64ZVFHMIN-NEXT: vmflt.vf v0, v24, fa5 +; RV64ZVFHMIN-NEXT: fsrmi a0, 1 +; RV64ZVFHMIN-NEXT: vfcvt.x.f.v v24, v8, v0.t +; RV64ZVFHMIN-NEXT: fsrm a0 +; RV64ZVFHMIN-NEXT: vfcvt.f.x.v v24, v24, v0.t +; RV64ZVFHMIN-NEXT: vsetvli zero, zero, e64, m8, ta, mu +; RV64ZVFHMIN-NEXT: vfsgnj.vv v8, v24, v8, v0.t +; RV64ZVFHMIN-NEXT: ret %v = call @llvm.vp.roundtozero.nxv16f64( %va, splat (i1 true), i32 %evl) ret %v } diff --git a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll index 818b882a402ac..bb121416ddec3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfdiv-constrained-sdnode.ll @@ -249,42 +249,32 @@ define @vfdiv_vf_nxv32bf16( %va, bf ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 ; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vmv.v.x v24, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v16 -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v24 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v24, v16, v0 +; CHECK-NEXT: vfdiv.vv v24, v24, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v12 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfdiv.vv v16, v0, v8 +; CHECK-NEXT: vfdiv.vv v16, v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -617,42 +607,32 @@ define @vfdiv_vf_nxv32f16( %va, half %b ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 
+; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: sub sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 ; ZVFHMIN-NEXT: addi a1, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a0 +; ZVFHMIN-NEXT: vmv.v.x v24, a0 ; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28 ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v24, v16, v0 +; ZVFHMIN-NEXT: vfdiv.vv v24, v24, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v12 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfdiv.vv v16, v0, v8 +; ZVFHMIN-NEXT: vfdiv.vv v16, v0, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll index 03e6e6b7a624d..7e580d1057525 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp-combine.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -mattr=+d,+zvfh,+v,+m -target-abi=ilp32d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV32 %s ; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+m -target-abi=lp64d \ -; RUN: -verify-machineinstrs < %s | FileCheck %s +; RUN: -verify-machineinstrs < %s | FileCheck --check-prefixes=CHECK,RV64 %s declare @llvm.vp.fma.nxv1f64(, , , , i32) declare @llvm.vp.fneg.nxv1f64(, , i32) @@ -24,17 +24,30 @@ define @test1( %a, (fmul x, c1+c2) define @test2( %a, %m, i32 zeroext %evl) { -; CHECK-LABEL: test2: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI1_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI1_0)(a1) -; CHECK-NEXT: lui a1, %hi(.LCPI1_1) -; CHECK-NEXT: fld fa4, %lo(.LCPI1_1)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfmv.v.f v9, fa5 -; CHECK-NEXT: vfadd.vf v9, v9, fa4, v0.t -; CHECK-NEXT: vfmul.vv v8, v8, v9, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: test2: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI1_0) +; RV32-NEXT: fld fa5, %lo(.LCPI1_0)(a1) +; RV32-NEXT: lui a1, 
%hi(.LCPI1_1) +; RV32-NEXT: fld fa4, %lo(.LCPI1_1)(a1) +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vfmv.v.f v9, fa5 +; RV32-NEXT: vfadd.vf v9, v9, fa4, v0.t +; RV32-NEXT: vfmul.vv v8, v8, v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: test2: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 1025 +; RV64-NEXT: slli a1, a1, 52 +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v9, a1 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: slli a0, a0, 62 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vfadd.vf v9, v9, fa5, v0.t +; RV64-NEXT: vfmul.vv v8, v8, v9, v0.t +; RV64-NEXT: ret %t = call @llvm.vp.fmul.nxv1f64( %a, splat (double 2.0), %m, i32 %evl) %v = call fast @llvm.vp.fma.nxv1f64( %a, splat (double 4.0), %t, %m, i32 %evl) ret %v @@ -42,18 +55,32 @@ define @test2( %a, ; (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) define @test3( %a, %b, %m, i32 zeroext %evl) { -; CHECK-LABEL: test3: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a1, %hi(.LCPI2_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI2_0)(a1) -; CHECK-NEXT: lui a1, %hi(.LCPI2_1) -; CHECK-NEXT: fld fa4, %lo(.LCPI2_1)(a1) -; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma -; CHECK-NEXT: vfmv.v.f v10, fa5 -; CHECK-NEXT: vfmul.vf v10, v10, fa4, v0.t -; CHECK-NEXT: vfmadd.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 -; CHECK-NEXT: ret +; RV32-LABEL: test3: +; RV32: # %bb.0: +; RV32-NEXT: lui a1, %hi(.LCPI2_0) +; RV32-NEXT: fld fa5, %lo(.LCPI2_0)(a1) +; RV32-NEXT: lui a1, %hi(.LCPI2_1) +; RV32-NEXT: fld fa4, %lo(.LCPI2_1)(a1) +; RV32-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV32-NEXT: vfmv.v.f v10, fa5 +; RV32-NEXT: vfmul.vf v10, v10, fa4, v0.t +; RV32-NEXT: vfmadd.vv v10, v8, v9, v0.t +; RV32-NEXT: vmv.v.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: test3: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 1025 +; RV64-NEXT: slli a1, a1, 52 +; RV64-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; RV64-NEXT: vmv.v.x v10, a1 +; RV64-NEXT: li a0, 1 +; RV64-NEXT: slli a0, a0, 62 +; RV64-NEXT: fmv.d.x fa5, a0 +; RV64-NEXT: vfmul.vf v10, v10, fa5, v0.t +; RV64-NEXT: vfmadd.vv v10, v8, v9, v0.t +; RV64-NEXT: vmv.v.v v8, v10 +; RV64-NEXT: ret %t = call @llvm.vp.fmul.nxv1f64( %a, splat (double 2.0), %m, i32 %evl) %v = call fast @llvm.vp.fma.nxv1f64( %t, splat (double 4.0), %b, %m, i32 %evl) ret %v diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll index 0a11501905b81..728fa07a7d4e5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll @@ -820,6 +820,7 @@ define @vfma_vf_nxv32bf16( %va, bfl ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; CHECK-NEXT: vmv1r.v v3, v0 +; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: fmv.x.h a2, fa0 ; CHECK-NEXT: csrr a3, vlenb ; CHECK-NEXT: slli a1, a3, 1 @@ -835,41 +836,33 @@ define @vfma_vf_nxv32bf16( %va, bfl ; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20, v0.t ; CHECK-NEXT: addi a4, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 
16 -; CHECK-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t +; CHECK-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v28, v0.t ; CHECK-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a2 ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: slli a2, a2, 1 -; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: slli a2, a2, 1 -; CHECK-NEXT: add a2, a2, a4 -; CHECK-NEXT: add a2, sp, a2 -; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t ; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: vfmadd.vv v24, v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v4, v8, v0.t +; CHECK-NEXT: vfncvtbf16.f.f.w v4, v24, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB32_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 @@ -879,27 +872,27 @@ define @vfma_vf_nxv32bf16( %va, bfl ; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 -; CHECK-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v8, v0.t +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8, v0.t +; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16, v0.t ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload @@ -1061,81 +1054,87 @@ define @vfma_vf_nxv32bf16_unmasked( ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 5 +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 
0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vmv8r.v v24, v8 ; CHECK-NEXT: fmv.x.h a2, fa0 ; CHECK-NEXT: csrr a3, vlenb -; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-NEXT: vmset.m v24 +; CHECK-NEXT: vmset.m v8 ; CHECK-NEXT: slli a1, a3, 1 ; CHECK-NEXT: srli a3, a3, 2 ; CHECK-NEXT: sub a4, a0, a1 ; CHECK-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; CHECK-NEXT: vslidedown.vx v0, v24, a3 +; CHECK-NEXT: vslidedown.vx v0, v8, a3 ; CHECK-NEXT: sltu a3, a0, a4 ; CHECK-NEXT: addi a3, a3, -1 ; CHECK-NEXT: and a3, a3, a4 ; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: slli a4, a4, 5 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v20, v0.t -; CHECK-NEXT: addi a4, sp, 16 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v20, v0.t +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28, v0.t ; CHECK-NEXT: csrr a4, vlenb ; CHECK-NEXT: slli a4, a4, 3 ; CHECK-NEXT: add a4, sp, a4 ; CHECK-NEXT: addi a4, a4, 16 ; CHECK-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12, v0.t ; CHECK-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; CHECK-NEXT: vmv.v.x v8, a2 +; CHECK-NEXT: addi a2, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v12, v0.t ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: slli a2, a2, 1 -; CHECK-NEXT: add a2, a2, a4 +; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 3 -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: slli a2, a2, 1 -; CHECK-NEXT: add a2, a2, a4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v28, v0.t -; CHECK-NEXT: addi a2, sp, 16 -; CHECK-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v8, v16, v24, v0.t +; CHECK-NEXT: vfmadd.vv v24, v16, v8, v0.t ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v20, v8, v0.t +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v24, v0.t ; CHECK-NEXT: bltu a0, a1, .LBB34_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a0, a1 ; CHECK-NEXT: .LBB34_2: ; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: slli a1, a1, 5 ; CHECK-NEXT: add a1, sp, a1 ; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; 
CHECK-NEXT: vfwcvtbf16.f.f.v v8, v24 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 +; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a1, a0 @@ -1143,17 +1142,25 @@ define @vfma_vf_nxv32bf16_unmasked( ; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v8 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v0, v24 ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v0, v24, v8 +; CHECK-NEXT: vfmadd.vv v24, v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; CHECK-NEXT: vfncvtbf16.f.f.w v16, v0 -; CHECK-NEXT: vmv8r.v v8, v16 +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -2287,6 +2294,7 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; ZVFHMIN-NEXT: vsetvli a1, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v3, v0 +; ZVFHMIN-NEXT: vmv8r.v v24, v8 ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: slli a1, a3, 1 @@ -2302,41 +2310,33 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: 
vs8r.v v8, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8, v0.t +; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB68_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 @@ -2346,27 +2346,27 @@ define @vfma_vf_nxv32f16( %va, half %b, ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload @@ -2540,81 +2540,87 @@ define @vfma_vf_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmv8r.v v24, v8 ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vmset.m v8 ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, 
a0, a1 ; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3 +; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3 ; ZVFHMIN-NEXT: sltu a3, a0, a4 ; ZVFHMIN-NEXT: addi a3, a3, -1 ; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: slli a4, a4, 5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t -; ZVFHMIN-NEXT: addi a4, sp, 16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB70_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB70_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 5 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add 
a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 @@ -2622,17 +2628,25 @@ define @vfma_vf_nxv32f16_unmasked( %va, ; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24 ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v8 +; ZVFHMIN-NEXT: vfmadd.vv v24, v0, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v0 -; ZVFHMIN-NEXT: vmv8r.v v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -8266,7 +8280,8 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vxor.vx v16, v16, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v24, v16, a1, v0.t +; ZVFHMIN-NEXT: vmv8r.v v16, v8 ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a0, a1 @@ -8277,45 +8292,37 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t ; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v8, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vmv.v.x v16, a2 ; ZVFHMIN-NEXT: csrr 
a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v8, v0.t +; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB282_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 @@ -8323,24 +8330,22 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: vmv1r.v v0, v3 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: mv a1, a0 @@ -8348,12 +8353,20 @@ define @vfmsub_vf_nxv32f16( %va, half % ; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload @@ -8524,13 +8537,17 @@ define @vfmsub_vf_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; 
ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmv8r.v v24, v8 ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: lui a1, 8 -; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vmset.m v8 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFHMIN-NEXT: vxor.vx v16, v16, a1 @@ -8538,88 +8555,98 @@ define @vfmsub_vf_nxv32f16_unmasked( %v ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a0, a1 ; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3 +; ZVFHMIN-NEXT: vslidedown.vx v0, v8, a3 ; ZVFHMIN-NEXT: sltu a3, a0, a4 ; ZVFHMIN-NEXT: addi a3, a3, -1 ; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t -; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; 
ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v20, v8, v0.t +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB284_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB284_2: ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v8 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24 ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v8 +; ZVFHMIN-NEXT: vfmadd.vv v24, v0, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v16, v0 -; ZVFHMIN-NEXT: vmv8r.v v8, v16 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -9534,100 +9561,116 @@ define @vfnmadd_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma -; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: vmset.m v7 ; ZVFHMIN-NEXT: csrr a3, vlenb ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1 -; ZVFHMIN-NEXT: vxor.vx v16, v16, a1 +; ZVFHMIN-NEXT: vxor.vx v24, v8, a1 +; ZVFHMIN-NEXT: vxor.vx 
v8, v16, a1 ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a0, a1 ; ZVFHMIN-NEXT: vsetvli a5, zero, e8, mf2, ta, ma -; ZVFHMIN-NEXT: vslidedown.vx v0, v24, a3 +; ZVFHMIN-NEXT: vslidedown.vx v0, v7, a3 ; ZVFHMIN-NEXT: sltu a3, a0, a4 ; ZVFHMIN-NEXT: addi a3, a3, -1 ; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: slli a4, a4, 5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v28, v0.t ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 -; ZVFHMIN-NEXT: mv a5, a4 -; ZVFHMIN-NEXT: slli a4, a4, 1 -; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-NEXT: addi a2, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB292_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB292_2: ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; 
ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v24, v16 +; ZVFHMIN-NEXT: vfmadd.vv v24, v0, v16 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -9673,44 +9716,39 @@ define @vfnmadd_vf_nxv32f16_unmasked_commute( @vfnmadd_vf_nxv32f16_unmasked_commute( @vfnmadd_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v3, v0 +; ZVFHMIN-NEXT: vmv8r.v v24, v16 ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v16, v8, a1, v0.t ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a0, a1 @@ -10804,43 +10846,35 @@ define @vfnmsub_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a2 -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vmv.v.x v8, a2 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli 
a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t +; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB302_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 @@ -10850,40 +10884,40 @@ define @vfnmsub_vf_nxv32f16( %va, half ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a1) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8, v0.t +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16, v0.t ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16, v0.t +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v24, v0.t ; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v8, v16, v24, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v8, v24, v16, v0.t ; ZVFHMIN-NEXT: vmv.v.v v16, v8 ; ZVFHMIN-NEXT: vmv4r.v v12, v4 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma @@ -10919,10 +10953,18 @@ define @vfnmsub_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 
16 + 32 * vlenb ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v3, v0 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vxor.vx v8, v8, a1, v0.t +; ZVFHMIN-NEXT: vxor.vx v16, v8, a1, v0.t ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 ; ZVFHMIN-NEXT: sub a4, a0, a1 @@ -10935,41 +10977,33 @@ define @vfnmsub_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: slli a4, a4, 3 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t ; ZVFHMIN-NEXT: csrr a4, vlenb -; ZVFHMIN-NEXT: slli a4, a4, 4 +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t +; ZVFHMIN-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv.v.x v16, a2 ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 -; ZVFHMIN-NEXT: add a2, sp, a2 -; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB303_2 @@ -10987,7 +11021,10 @@ define @vfnmsub_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 1 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload @@ -10998,10 +11035,7 @@ define @vfnmsub_vf_nxv32f16_commute( %v ; ZVFHMIN-NEXT: addi a0, a0, 16 ; 
ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload @@ -11044,9 +11078,12 @@ define @vfnmsub_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 2 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: sub sp, sp, a1 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb ; ZVFHMIN-NEXT: fmv.x.h a2, fa0 ; ZVFHMIN-NEXT: lui a1, 8 ; ZVFHMIN-NEXT: vsetvli a3, zero, e8, m4, ta, ma @@ -11064,81 +11101,92 @@ define @vfnmsub_vf_nxv32f16_unmasked( % ; ZVFHMIN-NEXT: and a3, a3, a4 ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: mv a5, a4 +; ZVFHMIN-NEXT: slli a4, a4, 1 +; ZVFHMIN-NEXT: add a4, a4, a5 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t -; ZVFHMIN-NEXT: addi a4, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a4, vlenb ; ZVFHMIN-NEXT: slli a4, a4, 4 ; ZVFHMIN-NEXT: add a4, sp, a4 ; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v20, v0.t +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: slli a4, a4, 3 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a4, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v16, a2 +; ZVFHMIN-NEXT: vmv.v.x v8, a2 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t ; ZVFHMIN-NEXT: csrr a2, vlenb -; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 +; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 3 -; ZVFHMIN-NEXT: mv a4, a2 -; ZVFHMIN-NEXT: slli a2, a2, 1 -; ZVFHMIN-NEXT: add a2, a2, a4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t -; ZVFHMIN-NEXT: addi a2, sp, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a2) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # vscale x 64-byte Folded Reload ; 
ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v8, v0.t +; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t ; ZVFHMIN-NEXT: bltu a0, a1, .LBB304_2 ; ZVFHMIN-NEXT: # %bb.1: ; ZVFHMIN-NEXT: mv a0, a1 ; ZVFHMIN-NEXT: .LBB304_2: ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: mv a2, a1 +; ZVFHMIN-NEXT: slli a1, a1, 1 +; ZVFHMIN-NEXT: add a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 ; ZVFHMIN-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 -; ZVFHMIN-NEXT: addi a0, sp, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 4 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v0 +; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 3 -; ZVFHMIN-NEXT: mv a1, a0 -; ZVFHMIN-NEXT: slli a0, a0, 1 -; ZVFHMIN-NEXT: add a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v0, v24 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v24 +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v0 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v0 +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v24 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: mv a1, a0 +; ZVFHMIN-NEXT: slli a0, a0, 2 +; ZVFHMIN-NEXT: add a0, a0, a1 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -11182,42 +11230,37 @@ define @vfnmsub_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16_unmasked_commute( @vfnmsub_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: vsetvli zero, a0, e16, m8, ta, ma ; ZVFHMIN-NEXT: vmv1r.v v3, v0 ; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: mv a2, a1 ; ZVFHMIN-NEXT: slli a1, a1, 1 @@ -11298,10 +11349,10 @@ define @vfnmsub_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: fmv.x.h a1, fa0 ; ZVFHMIN-NEXT: lui a2, 8 ; ZVFHMIN-NEXT: csrr a3, vlenb -; ZVFHMIN-NEXT: vmv.v.x v8, a1 +; ZVFHMIN-NEXT: vmv.v.x v16, a1 ; ZVFHMIN-NEXT: slli a1, a3, 1 ; ZVFHMIN-NEXT: srli a3, a3, 2 -; ZVFHMIN-NEXT: vxor.vx v8, v8, a2, v0.t +; ZVFHMIN-NEXT: vxor.vx v8, v16, a2, v0.t ; ZVFHMIN-NEXT: sub a2, a0, a1 ; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslidedown.vx 
v0, v0, a3 @@ -11315,12 +11366,11 @@ define @vfnmsub_vf_nxv32f16_neg_splat( ; ZVFHMIN-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12, v0.t -; ZVFHMIN-NEXT: vmv8r.v v8, v16 ; ZVFHMIN-NEXT: csrr a2, vlenb ; ZVFHMIN-NEXT: slli a2, a2, 4 ; ZVFHMIN-NEXT: add a2, sp, a2 ; ZVFHMIN-NEXT: addi a2, a2, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t ; ZVFHMIN-NEXT: addi a2, sp, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill @@ -11563,21 +11613,22 @@ define @vfnmsub_vf_nxv32f16_neg_splat_unmasked( @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute( @vfnmsub_vf_nxv32f16_neg_splat_unmasked_commute( @vfmadd_vf_nxv32bf16( %va, < ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vmv8r.v v24, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: fmv.x.h a0, fa0 ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 -; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v24 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.x v24, a0 +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; CHECK-NEXT: vfwcvtbf16.f.f.v v24, v0 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmadd.vv v24, v16, v0 +; CHECK-NEXT: vfmadd.vv v24, v0, v16 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v16, (a0) # vscale x 64-byte 
Folded Spill ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v12 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v4 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 4 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v4 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma ; CHECK-NEXT: vfmadd.vv v16, v8, v0 ; CHECK-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v8, v24 ; CHECK-NEXT: vfncvtbf16.f.f.w v12, v16 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 24 -; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -759,65 +771,77 @@ define @vfmadd_vf_nxv32f16( %va, @vfmadd_vf_nxv32bf16( %va, < ; ZVFH-NEXT: addi sp, sp, -16 ; ZVFH-NEXT: .cfi_def_cfa_offset 16 ; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: mv a1, a0 -; ZVFH-NEXT: slli a0, a0, 2 -; ZVFH-NEXT: add a0, a0, a1 +; ZVFH-NEXT: slli a0, a0, 5 ; ZVFH-NEXT: sub sp, sp, a0 -; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH-NEXT: vmv8r.v v24, v16 +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 ; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFH-NEXT: vmv8r.v v16, v8 +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; ZVFH-NEXT: fmv.x.h a0, fa0 -; ZVFH-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v16 ; ZVFH-NEXT: csrr a1, vlenb -; ZVFH-NEXT: slli a1, a1, 4 +; ZVFH-NEXT: slli a1, a1, 3 +; ZVFH-NEXT: mv a2, a1 +; ZVFH-NEXT: slli a1, a1, 1 +; ZVFH-NEXT: add a1, a1, a2 ; ZVFH-NEXT: add a1, sp, a1 ; ZVFH-NEXT: addi a1, a1, 16 -; ZVFH-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill -; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v16 +; ZVFH-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v24 ; ZVFH-NEXT: csrr a1, vlenb ; ZVFH-NEXT: slli a1, a1, 3 ; ZVFH-NEXT: add a1, sp, a1 ; ZVFH-NEXT: addi a1, a1, 16 -; ZVFH-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill +; ZVFH-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; ZVFH-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFH-NEXT: vmv.v.x v24, a0 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: mv a1, a0 -; ZVFH-NEXT: slli a0, a0, 1 -; ZVFH-NEXT: add a0, a0, a1 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; ZVFH-NEXT: vmv.v.x v8, a0 +; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v8 ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 3 ; ZVFH-NEXT: mv a1, a0 @@ 
-433,40 +434,17 @@ define @vfmadd_vf_nxv32bf16( %va, < ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 ; ZVFH-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFH-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v0 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 5 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 4 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 3 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 5 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 ; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFH-NEXT: vfmadd.vv v16, v24, v0 -; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 5 -; ZVFH-NEXT: add a0, sp, a0 -; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFH-NEXT: vfwcvtbf16.f.f.v v24, v12 +; ZVFH-NEXT: vfmadd.vv v24, v16, v0 ; ZVFH-NEXT: addi a0, sp, 16 ; ZVFH-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFH-NEXT: vfwcvtbf16.f.f.v v8, v20 +; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v20 ; ZVFH-NEXT: csrr a0, vlenb ; ZVFH-NEXT: slli a0, a0, 3 ; ZVFH-NEXT: mv a1, a0 @@ -474,23 +452,29 @@ define @vfmadd_vf_nxv32bf16( %va, < ; ZVFH-NEXT: add a0, a0, a1 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 +; ZVFH-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 4 +; ZVFH-NEXT: add a0, sp, a0 +; ZVFH-NEXT: addi a0, a0, 16 ; ZVFH-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; ZVFH-NEXT: vfwcvtbf16.f.f.v v16, v4 -; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFH-NEXT: vfmadd.vv v16, v8, v24 +; ZVFH-NEXT: vfwcvtbf16.f.f.v v0, v12 ; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 5 +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: mv a1, a0 +; ZVFH-NEXT: slli a0, a0, 1 +; ZVFH-NEXT: add a0, a0, a1 ; ZVFH-NEXT: add a0, sp, a0 ; ZVFH-NEXT: addi a0, a0, 16 -; ZVFH-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFH-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; ZVFH-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFH-NEXT: vfmadd.vv v0, v16, v8 ; ZVFH-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFH-NEXT: vfncvtbf16.f.f.w v8, v24 -; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v16 +; ZVFH-NEXT: vfncvtbf16.f.f.w v12, v0 ; ZVFH-NEXT: csrr a0, vlenb -; ZVFH-NEXT: slli a0, a0, 3 -; ZVFH-NEXT: mv a1, a0 -; ZVFH-NEXT: slli a0, a0, 2 -; ZVFH-NEXT: add a0, a0, a1 +; ZVFH-NEXT: slli a0, a0, 5 ; ZVFH-NEXT: add sp, sp, a0 ; ZVFH-NEXT: .cfi_def_cfa sp, 16 ; ZVFH-NEXT: addi sp, sp, 16 @@ -502,94 +486,80 @@ define @vfmadd_vf_nxv32bf16( %va, < ; ZVFHMIN-NEXT: addi sp, sp, -16 ; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 40 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: sub sp, sp, a0 -; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb -; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: 
.cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vmv8r.v v24, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vmv8r.v v16, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: fmv.x.h a0, fa0 -; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v16 ; ZVFHMIN-NEXT: csrr a1, vlenb -; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: li a2, 24 +; ZVFHMIN-NEXT: mul a1, a1, a2 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v16 +; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v24 ; ZVFHMIN-NEXT: csrr a1, vlenb ; ZVFHMIN-NEXT: slli a1, a1, 3 ; ZVFHMIN-NEXT: add a1, sp, a1 ; ZVFHMIN-NEXT: addi a1, a1, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill ; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; ZVFHMIN-NEXT: vmv.v.x v24, a0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 24 -; ZVFHMIN-NEXT: mul a0, a0, a1 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v24, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: vmv.v.x v8, a0 +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v8 ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: li a1, 24 ; ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 4 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: csrr a0, vlenb ; ZVFHMIN-NEXT: slli a0, a0, 3 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v24, v0 -; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 -; ZVFHMIN-NEXT: add a0, sp, a0 -; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vs8r.v v16, (a0) # vscale x 64-byte Folded Spill -; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v24, v12 +; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v0 ; ZVFHMIN-NEXT: addi a0, sp, 16 ; ZVFHMIN-NEXT: vl8r.v v16, (a0) # vscale x 64-byte Folded Reload -; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v8, v20 +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v20 ; ZVFHMIN-NEXT: csrr 
a0, vlenb ; ZVFHMIN-NEXT: li a1, 24 ; ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v0, (a0) # vscale x 64-byte Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 ; ZVFHMIN-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload ; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v16, v4 -; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24 +; ZVFHMIN-NEXT: vfwcvtbf16.f.f.v v0, v12 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 ; ZVFHMIN-NEXT: add a0, sp, a0 ; ZVFHMIN-NEXT: addi a0, a0, 16 -; ZVFHMIN-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload +; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFHMIN-NEXT: vfmadd.vv v0, v16, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma ; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v8, v24 -; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v16 +; ZVFHMIN-NEXT: vfncvtbf16.f.f.w v12, v0 ; ZVFHMIN-NEXT: csrr a0, vlenb -; ZVFHMIN-NEXT: li a1, 40 -; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: slli a0, a0, 5 ; ZVFHMIN-NEXT: add sp, sp, a0 ; ZVFHMIN-NEXT: .cfi_def_cfa sp, 16 ; ZVFHMIN-NEXT: addi sp, sp, 16 @@ -973,94 +943,80 @@ define @vfmadd_vf_nxv32f16( %va, @vfmsub_vf_nxv32f16( %va, @test_vloxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -22,8 +22,8 @@ define @test_vloxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -38,8 +38,8 @@ define @test_vloxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -51,8 +51,8 @@ define @test_vloxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -67,8 +67,8 @@ define @test_vloxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -80,8 +80,8 @@ define @test_vloxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -96,8 +96,8 @@ define @test_vloxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -109,8 +109,8 @@ define @test_vloxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -125,8 +125,8 @@ define @test_vloxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -138,8 +138,8 @@ define @test_vloxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, 
a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -154,8 +154,8 @@ define @test_vloxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -167,8 +167,8 @@ define @test_vloxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -183,8 +183,8 @@ define @test_vloxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -196,8 +196,8 @@ define @test_vloxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -212,8 +212,8 @@ define @test_vloxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -225,8 +225,8 @@ define @test_vloxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2 ; CHECK-LABEL: 
test_vloxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -270,8 +270,8 @@ define @test_vloxseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv8i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -283,8 +283,8 @@ define @test_vloxseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -502,8 +502,8 @@ define @test_vloxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv1i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -515,8 +515,8 @@ define @test_vloxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -531,8 +531,8 @@ define @test_vloxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv1i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, 
i32 3) @@ -544,8 +544,8 @@ define @test_vloxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -560,8 +560,8 @@ define @test_vloxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv1i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -573,8 +573,8 @@ define @test_vloxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -589,8 +589,8 @@ define @test_vloxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -602,8 +602,8 @@ define @test_vloxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -618,8 +618,8 @@ define @test_vloxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -631,8 +631,8 @@ define @test_vloxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -647,8 +647,8 @@ define @test_vloxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -660,8 +660,8 @@ define @test_vloxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -676,8 +676,8 @@ define @test_vloxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -689,8 +689,8 @@ define @test_vloxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -705,8 +705,8 @@ define @test_vloxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 
+; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -718,8 +718,8 @@ define @test_vloxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -763,8 +763,8 @@ define @test_vloxseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -776,8 +776,8 @@ define @test_vloxseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -850,8 +850,8 @@ define @test_vloxseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei8.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv16i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -863,8 +863,8 @@ define @test_vloxseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vloxseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei8.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv16i1.nxv16i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -937,8 +937,8 @@ define @test_vloxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, 
mf8, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv1i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -950,8 +950,8 @@ define @test_vloxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -966,8 +966,8 @@ define @test_vloxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv1i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -979,8 +979,8 @@ define @test_vloxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -995,8 +995,8 @@ define @test_vloxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv1i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1008,8 +1008,8 @@ define @test_vloxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1024,8 +1024,8 @@ define @test_vloxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv ; CHECK-LABEL: 
test_vloxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1037,8 +1037,8 @@ define @test_vloxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1053,8 +1053,8 @@ define @test_vloxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1066,8 +1066,8 @@ define @test_vloxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1082,8 +1082,8 @@ define @test_vloxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1095,8 +1095,8 @@ define @test_vloxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, 
%mask, i32 %vl, i32 1, i32 3) @@ -1111,8 +1111,8 @@ define @test_vloxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1124,8 +1124,8 @@ define @test_vloxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1140,8 +1140,8 @@ define @test_vloxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1153,8 +1153,8 @@ define @test_vloxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1198,8 +1198,8 @@ define @test_vloxseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1211,8 +1211,8 @@ define @test_vloxseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1285,8 +1285,8 @@ define @test_vloxseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei8.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv16i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1298,8 +1298,8 @@ define @test_vloxseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vloxseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei8.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv16i1.nxv16i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1372,8 +1372,8 @@ define @test_vloxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv1i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1385,8 +1385,8 @@ define @test_vloxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1401,8 +1401,8 @@ define @test_vloxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv1i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1414,8 +1414,8 @@ define @test_vloxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1430,8 +1430,8 @@ define @test_vloxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv1i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1443,8 +1443,8 @@ define @test_vloxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1459,8 +1459,8 @@ define @test_vloxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1472,8 +1472,8 @@ define @test_vloxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1488,8 +1488,8 @@ define @test_vloxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1501,8 +1501,8 @@ define @test_vloxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1517,8 +1517,8 @@ define @test_vloxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1530,8 +1530,8 @@ define @test_vloxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1546,8 +1546,8 @@ define @test_vloxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1559,8 +1559,8 @@ define @test_vloxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1575,8 +1575,8 @@ define @test_vloxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1588,8 +1588,8 @@ define 
@test_vloxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1604,8 +1604,8 @@ define @test_vloxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1617,8 +1617,8 @@ define @test_vloxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1633,8 +1633,8 @@ define @test_vloxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1646,8 +1646,8 @@ define @test_vloxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1662,8 +1662,8 @@ define @test_vloxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv8i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1675,8 +1675,8 @@ define @test_vloxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1720,8 +1720,8 @@ define @test_vloxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv1i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1733,8 +1733,8 @@ define @test_vloxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1749,8 +1749,8 @@ define @test_vloxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv1i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1762,8 +1762,8 @@ define @test_vloxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1778,8 +1778,8 @@ define @test_vloxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv1i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1791,8 +1791,8 @@ define @test_vloxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1807,8 +1807,8 @@ define @test_vloxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1820,8 +1820,8 @@ define @test_vloxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1836,8 +1836,8 @@ define @test_vloxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1849,8 +1849,8 @@ define @test_vloxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1865,8 +1865,8 @@ define @test_vloxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1878,8 +1878,8 @@ define @test_vloxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1894,8 +1894,8 @@ define @test_vloxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1907,8 +1907,8 @@ define @test_vloxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1923,8 +1923,8 @@ define @test_vloxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1936,8 +1936,8 @@ define @test_vloxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1952,8 +1952,8 @@ define 
@test_vloxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1965,8 +1965,8 @@ define @test_vloxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1981,8 +1981,8 @@ define @test_vloxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1994,8 +1994,8 @@ define @test_vloxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2010,8 +2010,8 @@ define @test_vloxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv8i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2023,8 +2023,8 @@ define @test_vloxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2068,8 +2068,8 @@ define @test_vloxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv1i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2081,8 +2081,8 @@ define @test_vloxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2097,8 +2097,8 @@ define @test_vloxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv1i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2110,8 +2110,8 @@ define @test_vloxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2126,8 +2126,8 @@ define @test_vloxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv1i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2139,8 +2139,8 @@ define @test_vloxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2155,8 +2155,8 @@ define @test_vloxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2168,8 +2168,8 @@ define @test_vloxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2184,8 +2184,8 @@ define @test_vloxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2197,8 +2197,8 @@ define @test_vloxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2213,8 +2213,8 @@ define @test_vloxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2226,8 +2226,8 @@ define @test_vloxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2242,8 +2242,8 @@ define @test_vloxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2255,8 +2255,8 @@ define @test_vloxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2271,8 +2271,8 @@ define @test_vloxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2284,8 +2284,8 @@ define @test_vloxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2300,8 +2300,8 @@ define @test_vloxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2313,8 +2313,8 @@ define 
@test_vloxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2329,8 +2329,8 @@ define @test_vloxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2342,8 +2342,8 @@ define @test_vloxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2358,8 +2358,8 @@ define @test_vloxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv8i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2371,8 +2371,8 @@ define @test_vloxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2416,8 +2416,8 @@ define @test_vloxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vloxseg8.triscv.vector.tuple_nxv1i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2429,8 +2429,8 @@ define @test_vloxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2445,8 +2445,8 @@ define @test_vloxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv1i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2458,8 +2458,8 @@ define @test_vloxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2474,8 +2474,8 @@ define @test_vloxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv1i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2487,8 +2487,8 @@ define @test_vloxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2503,8 +2503,8 @@ define @test_vloxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2516,8 +2516,8 @@ define @test_vloxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2532,8 +2532,8 @@ define @test_vloxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2545,8 +2545,8 @@ define @test_vloxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2561,8 +2561,8 @@ define @test_vloxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2574,8 +2574,8 @@ define @test_vloxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2590,8 +2590,8 @@ define @test_vloxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2603,8 +2603,8 @@ define @test_vloxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2619,8 +2619,8 @@ define @test_vloxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2632,8 +2632,8 @@ define @test_vloxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2648,8 +2648,8 @@ define @test_vloxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2661,8 +2661,8 @@ define @test_vloxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2677,8 +2677,8 @@ define 
@test_vloxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2690,8 +2690,8 @@ define @test_vloxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2706,8 +2706,8 @@ define @test_vloxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv8i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2719,8 +2719,8 @@ define @test_vloxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2764,8 +2764,8 @@ define @test_vloxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2777,8 +2777,8 @@ define @test_vloxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -2793,8 +2793,8 @@ define @test_vloxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2806,8 +2806,8 @@ define @test_vloxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -2822,8 +2822,8 @@ define @test_vloxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2835,8 +2835,8 @@ define @test_vloxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -2851,8 +2851,8 @@ define @test_vloxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2864,8 +2864,8 @@ define @test_vloxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, 
v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -2880,8 +2880,8 @@ define @test_vloxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2893,8 +2893,8 @@ define @test_vloxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -2909,8 +2909,8 @@ define @test_vloxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2922,8 +2922,8 @@ define @test_vloxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -2938,8 +2938,8 @@ define @test_vloxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2951,8 +2951,8 @@ define @test_vloxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, 
e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -2967,8 +2967,8 @@ define @test_vloxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2980,8 +2980,8 @@ define @test_vloxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3199,8 +3199,8 @@ define @test_vloxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3212,8 +3212,8 @@ define @test_vloxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3228,8 +3228,8 @@ define @test_vloxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3241,8 +3241,8 @@ define @test_vloxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: 
test_vloxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3257,8 +3257,8 @@ define @test_vloxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3270,8 +3270,8 @@ define @test_vloxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3286,8 +3286,8 @@ define @test_vloxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3299,8 +3299,8 @@ define @test_vloxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3315,8 +3315,8 @@ define @test_vloxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) 
poison, ptr %base, %index, i32 %vl, i32 4) @@ -3328,8 +3328,8 @@ define @test_vloxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3344,8 +3344,8 @@ define @test_vloxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3357,8 +3357,8 @@ define @test_vloxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3373,8 +3373,8 @@ define @test_vloxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3386,8 +3386,8 @@ define @test_vloxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3402,8 +3402,8 @@ define @test_vloxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = 
tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3415,8 +3415,8 @@ define @test_vloxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3489,8 +3489,8 @@ define @test_vloxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei16.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3502,8 +3502,8 @@ define @test_vloxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vloxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei16.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3547,8 +3547,8 @@ define @test_vloxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3560,8 +3560,8 @@ define @test_vloxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3576,8 +3576,8 @@ define @test_vloxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), 
v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3589,8 +3589,8 @@ define @test_vloxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3605,8 +3605,8 @@ define @test_vloxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3618,8 +3618,8 @@ define @test_vloxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3634,8 +3634,8 @@ define @test_vloxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3647,8 +3647,8 @@ define @test_vloxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3663,8 +3663,8 @@ define @test_vloxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: 
test_vloxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3676,8 +3676,8 @@ define @test_vloxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3692,8 +3692,8 @@ define @test_vloxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3705,8 +3705,8 @@ define @test_vloxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3721,8 +3721,8 @@ define @test_vloxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3734,8 +3734,8 @@ define @test_vloxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, 
%index, %mask, i32 %vl, i32 1, i32 4) @@ -3750,8 +3750,8 @@ define @test_vloxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3763,8 +3763,8 @@ define @test_vloxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3837,8 +3837,8 @@ define @test_vloxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei16.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3850,8 +3850,8 @@ define @test_vloxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vloxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei16.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3895,8 +3895,8 @@ define @test_vloxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3908,8 +3908,8 @@ define @test_vloxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3924,8 +3924,8 @@ define @test_vloxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3937,8 +3937,8 @@ define @test_vloxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3953,8 +3953,8 @@ define @test_vloxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3966,8 +3966,8 @@ define @test_vloxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3982,8 +3982,8 @@ define @test_vloxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3995,8 +3995,8 @@ define @test_vloxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: 
vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4011,8 +4011,8 @@ define @test_vloxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4024,8 +4024,8 @@ define @test_vloxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4040,8 +4040,8 @@ define @test_vloxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4053,8 +4053,8 @@ define @test_vloxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4069,8 +4069,8 @@ define @test_vloxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4082,8 +4082,8 @@ define @test_vloxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; 
CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4098,8 +4098,8 @@ define @test_vloxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4111,8 +4111,8 @@ define @test_vloxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4127,8 +4127,8 @@ define @test_vloxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4140,8 +4140,8 @@ define @test_vloxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4156,8 +4156,8 @@ define @test_vloxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4169,8 +4169,8 @@ define 
@test_vloxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4185,8 +4185,8 @@ define @test_vloxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4198,8 +4198,8 @@ define @test_vloxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4214,8 +4214,8 @@ define @test_vloxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4227,8 +4227,8 @@ define @test_vloxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4243,8 +4243,8 @@ define @test_vloxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4256,8 +4256,8 @@ define @test_vloxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4272,8 +4272,8 @@ define @test_vloxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4285,8 +4285,8 @@ define @test_vloxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4301,8 +4301,8 @@ define @test_vloxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4314,8 +4314,8 @@ define @test_vloxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4330,8 +4330,8 @@ define @test_vloxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4343,8 +4343,8 @@ define @test_vloxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4359,8 +4359,8 @@ define @test_vloxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4372,8 +4372,8 @@ define @test_vloxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4388,8 +4388,8 @@ define @test_vloxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4401,8 +4401,8 @@ define @test_vloxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4417,8 +4417,8 @@ define @test_vloxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: 
# %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4430,8 +4430,8 @@ define @test_vloxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4446,8 +4446,8 @@ define @test_vloxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4459,8 +4459,8 @@ define @test_vloxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4475,8 +4475,8 @@ define @test_vloxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4488,8 +4488,8 @@ define @test_vloxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4504,8 +4504,8 @@ define 
@test_vloxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4517,8 +4517,8 @@ define @test_vloxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4533,8 +4533,8 @@ define @test_vloxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4546,8 +4546,8 @@ define @test_vloxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4562,8 +4562,8 @@ define @test_vloxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4575,8 +4575,8 @@ define @test_vloxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4591,8 +4591,8 @@ define @test_vloxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4604,8 +4604,8 @@ define @test_vloxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4620,8 +4620,8 @@ define @test_vloxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4633,8 +4633,8 @@ define @test_vloxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4649,8 +4649,8 @@ define @test_vloxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4662,8 +4662,8 @@ define @test_vloxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v 
v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4678,8 +4678,8 @@ define @test_vloxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4691,8 +4691,8 @@ define @test_vloxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4707,8 +4707,8 @@ define @test_vloxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4720,8 +4720,8 @@ define @test_vloxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4736,8 +4736,8 @@ define @test_vloxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4749,8 +4749,8 @@ define @test_vloxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4765,8 +4765,8 @@ define @test_vloxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4778,8 +4778,8 @@ define @test_vloxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4794,8 +4794,8 @@ define @test_vloxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4807,8 +4807,8 @@ define @test_vloxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4823,8 +4823,8 @@ define @test_vloxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4836,8 +4836,8 @@ define 
@test_vloxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4852,8 +4852,8 @@ define @test_vloxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4865,8 +4865,8 @@ define @test_vloxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4881,8 +4881,8 @@ define @test_vloxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4894,8 +4894,8 @@ define @test_vloxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4910,8 +4910,8 @@ define @test_vloxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4923,8 +4923,8 @@ define @test_vloxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4939,8 +4939,8 @@ define @test_vloxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -4952,8 +4952,8 @@ define @test_vloxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -4968,8 +4968,8 @@ define @test_vloxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -4981,8 +4981,8 @@ define @test_vloxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -4997,8 +4997,8 @@ define @test_vloxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5010,8 +5010,8 @@ define @test_vloxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5026,8 +5026,8 @@ define @test_vloxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5039,8 +5039,8 @@ define @test_vloxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5055,8 +5055,8 @@ define @test_vloxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5068,8 +5068,8 @@ define @test_vloxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5084,8 +5084,8 @@ define @test_vloxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5097,8 +5097,8 @@ define @test_vloxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5287,8 +5287,8 @@ define @test_vloxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5300,8 +5300,8 @@ define @test_vloxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5316,8 +5316,8 @@ define @test_vloxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5329,8 +5329,8 @@ define @test_vloxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5345,8 +5345,8 @@ define 
@test_vloxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5358,8 +5358,8 @@ define @test_vloxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5374,8 +5374,8 @@ define @test_vloxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5387,8 +5387,8 @@ define @test_vloxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5403,8 +5403,8 @@ define @test_vloxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5416,8 +5416,8 @@ define @test_vloxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5432,8 +5432,8 @@ define @test_vloxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5445,8 +5445,8 @@ define @test_vloxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5519,8 +5519,8 @@ define @test_vloxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei32.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5532,8 +5532,8 @@ define @test_vloxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vloxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei32.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5548,8 +5548,8 @@ define @test_vloxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5561,8 +5561,8 @@ define @test_vloxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5577,8 +5577,8 @@ define @test_vloxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5590,8 +5590,8 @@ define @test_vloxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5606,8 +5606,8 @@ define @test_vloxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5619,8 +5619,8 @@ define @test_vloxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5635,8 +5635,8 @@ define @test_vloxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5648,8 +5648,8 @@ define @test_vloxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5664,8 +5664,8 @@ define @test_vloxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5677,8 +5677,8 @@ define @test_vloxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5693,8 +5693,8 @@ define @test_vloxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5706,8 +5706,8 @@ define @test_vloxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5780,8 +5780,8 @@ define @test_vloxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei32.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5793,8 +5793,8 @@ define 
@test_vloxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vloxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei32.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5809,8 +5809,8 @@ define @test_vloxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5822,8 +5822,8 @@ define @test_vloxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5838,8 +5838,8 @@ define @test_vloxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5851,8 +5851,8 @@ define @test_vloxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5867,8 +5867,8 @@ define @test_vloxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5880,8 +5880,8 @@ define @test_vloxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5896,8 +5896,8 @@ define @test_vloxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5909,8 +5909,8 @@ define @test_vloxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5925,8 +5925,8 @@ define @test_vloxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5938,8 +5938,8 @@ define @test_vloxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5954,8 +5954,8 @@ define @test_vloxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5967,8 +5967,8 @@ define @test_vloxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5983,8 +5983,8 @@ define @test_vloxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5996,8 +5996,8 @@ define @test_vloxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6012,8 +6012,8 @@ define @test_vloxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6025,8 +6025,8 @@ define @test_vloxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6041,8 +6041,8 @@ define @test_vloxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6054,8 +6054,8 @@ define @test_vloxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6070,8 +6070,8 @@ define @test_vloxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6083,8 +6083,8 @@ define @test_vloxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6099,8 +6099,8 @@ define @test_vloxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6112,8 +6112,8 @@ define @test_vloxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6128,8 +6128,8 @@ define 
@test_vloxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6141,8 +6141,8 @@ define @test_vloxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6157,8 +6157,8 @@ define @test_vloxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6170,8 +6170,8 @@ define @test_vloxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6186,8 +6186,8 @@ define @test_vloxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6199,8 +6199,8 @@ define @test_vloxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6215,8 +6215,8 @@ define @test_vloxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6228,8 +6228,8 @@ define @test_vloxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6244,8 +6244,8 @@ define @test_vloxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6257,8 +6257,8 @@ define @test_vloxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6273,8 +6273,8 @@ define @test_vloxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6286,8 +6286,8 @@ define @test_vloxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6302,8 +6302,8 @@ define @test_vloxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6315,8 +6315,8 @@ define @test_vloxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6331,8 +6331,8 @@ define @test_vloxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6344,8 +6344,8 @@ define @test_vloxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6360,8 +6360,8 @@ define @test_vloxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6373,8 +6373,8 @@ define @test_vloxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6389,8 +6389,8 @@ define @test_vloxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6402,8 +6402,8 @@ define @test_vloxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6418,8 +6418,8 @@ define @test_vloxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6431,8 +6431,8 @@ define @test_vloxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6447,8 +6447,8 @@ define @test_vloxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6460,8 +6460,8 @@ define 
@test_vloxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6476,8 +6476,8 @@ define @test_vloxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6489,8 +6489,8 @@ define @test_vloxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6505,8 +6505,8 @@ define @test_vloxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6518,8 +6518,8 @@ define @test_vloxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6534,8 +6534,8 @@ define @test_vloxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6547,8 +6547,8 @@ define @test_vloxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6563,8 +6563,8 @@ define @test_vloxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6576,8 +6576,8 @@ define @test_vloxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6766,8 +6766,8 @@ define @test_vloxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6779,8 +6779,8 @@ define @test_vloxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6795,8 +6795,8 @@ define @test_vloxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6808,8 +6808,8 @@ define @test_vloxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6824,8 +6824,8 @@ define @test_vloxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6837,8 +6837,8 @@ define @test_vloxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6940,8 +6940,8 @@ define @test_vloxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6953,8 +6953,8 @@ define @test_vloxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6969,8 +6969,8 @@ define @test_vloxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6982,8 +6982,8 @@ define @test_vloxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6998,8 +6998,8 @@ define @test_vloxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7011,8 +7011,8 @@ define @test_vloxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7114,8 +7114,8 @@ define @test_vloxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7127,8 +7127,8 @@ define @test_vloxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7143,8 +7143,8 @@ define 
@test_vloxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7156,8 +7156,8 @@ define @test_vloxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7172,8 +7172,8 @@ define @test_vloxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7185,8 +7185,8 @@ define @test_vloxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7201,8 +7201,8 @@ define @test_vloxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7214,8 +7214,8 @@ define @test_vloxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7230,8 +7230,8 @@ define @test_vloxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7243,8 +7243,8 @@ define @test_vloxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7259,8 +7259,8 @@ define @test_vloxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7272,8 +7272,8 @@ define @test_vloxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7288,8 +7288,8 @@ define @test_vloxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7301,8 +7301,8 @@ define @test_vloxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7317,8 +7317,8 @@ define @test_vloxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7330,8 +7330,8 @@ define @test_vloxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7346,8 +7346,8 @@ define @test_vloxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7359,8 +7359,8 @@ define @test_vloxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7375,8 +7375,8 @@ define @test_vloxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7388,8 +7388,8 @@ define @test_vloxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7404,8 +7404,8 @@ define @test_vloxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7417,8 +7417,8 @@ define @test_vloxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7433,8 +7433,8 @@ define @test_vloxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7446,8 +7446,8 @@ define @test_vloxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7460,8 +7460,8 @@ define @test_vloxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7473,8 +7473,8 @@ define 
@test_vloxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7487,8 +7487,8 @@ define @test_vloxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7500,8 +7500,8 @@ define @test_vloxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7514,8 +7514,8 @@ define @test_vloxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7527,8 +7527,8 @@ define @test_vloxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7541,8 +7541,8 @@ define @test_vloxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7554,8 +7554,8 @@ define @test_vloxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7568,8 +7568,8 @@ define @test_vloxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7581,8 +7581,8 @@ define @test_vloxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7595,8 +7595,8 @@ define @test_vloxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7608,8 +7608,8 @@ define @test_vloxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7622,8 +7622,8 @@ define @test_vloxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, 
v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7635,8 +7635,8 @@ define @test_vloxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7649,8 +7649,8 @@ define @test_vloxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7662,8 +7662,8 @@ define @test_vloxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7865,8 +7865,8 @@ define @test_vloxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7878,8 +7878,8 @@ define @test_vloxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7892,8 +7892,8 @@ define @test_vloxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, 
mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7905,8 +7905,8 @@ define @test_vloxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7919,8 +7919,8 @@ define @test_vloxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7932,8 +7932,8 @@ define @test_vloxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7946,8 +7946,8 @@ define @test_vloxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7959,8 +7959,8 @@ define @test_vloxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7973,8 +7973,8 @@ define @test_vloxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_ ; 
CHECK-LABEL: test_vloxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7986,8 +7986,8 @@ define @test_vloxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8000,8 +8000,8 @@ define @test_vloxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8013,8 +8013,8 @@ define @test_vloxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8027,8 +8027,8 @@ define @test_vloxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8040,8 +8040,8 @@ define @test_vloxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr 
%base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8054,8 +8054,8 @@ define @test_vloxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8067,8 +8067,8 @@ define @test_vloxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8135,8 +8135,8 @@ define @test_vloxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t ; CHECK-LABEL: test_vloxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei16.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8148,8 +8148,8 @@ define @test_vloxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16 ; CHECK-LABEL: test_vloxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei16.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8189,8 +8189,8 @@ define @test_vloxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8202,8 +8202,8 @@ define @test_vloxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8216,8 +8216,8 @@ define @test_vloxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8229,8 +8229,8 @@ define @test_vloxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8243,8 +8243,8 @@ define @test_vloxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8256,8 +8256,8 @@ define @test_vloxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8270,8 +8270,8 @@ define @test_vloxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8283,8 +8283,8 @@ define @test_vloxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: 
vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8297,8 +8297,8 @@ define @test_vloxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8310,8 +8310,8 @@ define @test_vloxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8324,8 +8324,8 @@ define @test_vloxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8337,8 +8337,8 @@ define @test_vloxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8351,8 +8351,8 @@ define @test_vloxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8364,8 +8364,8 @@ define @test_vloxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: 
# %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8378,8 +8378,8 @@ define @test_vloxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8391,8 +8391,8 @@ define @test_vloxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8459,8 +8459,8 @@ define @test_vloxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t ; CHECK-LABEL: test_vloxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei16.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8472,8 +8472,8 @@ define @test_vloxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16 ; CHECK-LABEL: test_vloxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei16.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8513,8 +8513,8 @@ define @test_vloxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8526,8 +8526,8 @@ define 
@test_vloxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8540,8 +8540,8 @@ define @test_vloxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8553,8 +8553,8 @@ define @test_vloxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8567,8 +8567,8 @@ define @test_vloxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8580,8 +8580,8 @@ define @test_vloxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8594,8 +8594,8 @@ define @test_vloxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8607,8 +8607,8 @@ define @test_vloxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8621,8 +8621,8 @@ define @test_vloxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8634,8 +8634,8 @@ define @test_vloxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8648,8 +8648,8 @@ define @test_vloxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8661,8 +8661,8 @@ define @test_vloxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8675,8 +8675,8 @@ define @test_vloxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8688,8 +8688,8 @@ define @test_vloxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8702,8 +8702,8 @@ define @test_vloxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8715,8 +8715,8 @@ define @test_vloxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8729,8 +8729,8 @@ define @test_vloxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8742,8 +8742,8 @@ define @test_vloxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8756,8 +8756,8 @@ define @test_vloxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8769,8 +8769,8 @@ define @test_vloxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8783,8 +8783,8 @@ define @test_vloxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8796,8 +8796,8 @@ define @test_vloxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8810,8 +8810,8 @@ define @test_vloxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8823,8 +8823,8 @@ define @test_vloxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8837,8 +8837,8 @@ define 
@test_vloxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8850,8 +8850,8 @@ define @test_vloxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8864,8 +8864,8 @@ define @test_vloxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8877,8 +8877,8 @@ define @test_vloxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8891,8 +8891,8 @@ define @test_vloxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8904,8 +8904,8 @@ define @test_vloxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8918,8 +8918,8 @@ define @test_vloxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8931,8 +8931,8 @@ define @test_vloxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8945,8 +8945,8 @@ define @test_vloxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8958,8 +8958,8 @@ define @test_vloxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8972,8 +8972,8 @@ define @test_vloxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8985,8 +8985,8 @@ define @test_vloxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, 
v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8999,8 +8999,8 @@ define @test_vloxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9012,8 +9012,8 @@ define @test_vloxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9026,8 +9026,8 @@ define @test_vloxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9039,8 +9039,8 @@ define @test_vloxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9053,8 +9053,8 @@ define @test_vloxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9066,8 +9066,8 @@ define @test_vloxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli 
zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9080,8 +9080,8 @@ define @test_vloxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9093,8 +9093,8 @@ define @test_vloxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9107,8 +9107,8 @@ define @test_vloxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9120,8 +9120,8 @@ define @test_vloxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9134,8 +9134,8 @@ define @test_vloxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9147,8 +9147,8 @@ define @test_vloxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i 
; CHECK-LABEL: test_vloxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9161,8 +9161,8 @@ define @test_vloxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9174,8 +9174,8 @@ define @test_vloxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9188,8 +9188,8 @@ define @test_vloxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9201,8 +9201,8 @@ define @test_vloxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9215,8 +9215,8 @@ define @test_vloxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) 
poison, ptr %base, %index, i32 %vl, i32 4) @@ -9228,8 +9228,8 @@ define @test_vloxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9242,8 +9242,8 @@ define @test_vloxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9255,8 +9255,8 @@ define @test_vloxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9269,8 +9269,8 @@ define @test_vloxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9282,8 +9282,8 @@ define @test_vloxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9296,8 +9296,8 @@ define @test_vloxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail 
call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9309,8 +9309,8 @@ define @test_vloxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9323,8 +9323,8 @@ define @test_vloxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9336,8 +9336,8 @@ define @test_vloxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9350,8 +9350,8 @@ define @test_vloxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9363,8 +9363,8 @@ define @test_vloxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9377,8 +9377,8 @@ define @test_vloxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; 
CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9390,8 +9390,8 @@ define @test_vloxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9404,8 +9404,8 @@ define @test_vloxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9417,8 +9417,8 @@ define @test_vloxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9431,8 +9431,8 @@ define @test_vloxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9444,8 +9444,8 @@ define @test_vloxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9458,8 +9458,8 @@ define @test_vloxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_ ; CHECK-LABEL: 
test_vloxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9471,8 +9471,8 @@ define @test_vloxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9485,8 +9485,8 @@ define @test_vloxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t ; CHECK-LABEL: test_vloxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9498,8 +9498,8 @@ define @test_vloxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9512,8 +9512,8 @@ define @test_vloxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t ; CHECK-LABEL: test_vloxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9525,8 +9525,8 @@ define @test_vloxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, 
i32 %vl, i32 1, i32 5) @@ -9539,8 +9539,8 @@ define @test_vloxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t ; CHECK-LABEL: test_vloxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9552,8 +9552,8 @@ define @test_vloxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9566,8 +9566,8 @@ define @test_vloxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t ; CHECK-LABEL: test_vloxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9579,8 +9579,8 @@ define @test_vloxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9593,8 +9593,8 @@ define @test_vloxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t ; CHECK-LABEL: test_vloxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9606,8 +9606,8 @@ define @test_vloxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9620,8 +9620,8 @@ define @test_vloxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t ; CHECK-LABEL: test_vloxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9633,8 +9633,8 @@ define @test_vloxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9809,8 +9809,8 @@ define @test_vloxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t ; CHECK-LABEL: test_vloxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9822,8 +9822,8 @@ define @test_vloxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9836,8 +9836,8 @@ define @test_vloxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t ; CHECK-LABEL: test_vloxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9849,8 +9849,8 @@ define @test_vloxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; 
CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9863,8 +9863,8 @@ define @test_vloxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t ; CHECK-LABEL: test_vloxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9876,8 +9876,8 @@ define @test_vloxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9890,8 +9890,8 @@ define @test_vloxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t ; CHECK-LABEL: test_vloxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9903,8 +9903,8 @@ define @test_vloxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9917,8 +9917,8 @@ define @test_vloxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t ; CHECK-LABEL: test_vloxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9930,8 +9930,8 @@ define @test_vloxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, 
ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9944,8 +9944,8 @@ define @test_vloxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t ; CHECK-LABEL: test_vloxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9957,8 +9957,8 @@ define @test_vloxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10025,8 +10025,8 @@ define @test_vloxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3 ; CHECK-LABEL: test_vloxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei32.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10038,8 +10038,8 @@ define @test_vloxseg3_mask_nxv4f32_triscv.vector.tuple_nxv1 ; CHECK-LABEL: test_vloxseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei32.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10052,8 +10052,8 @@ define @test_vloxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t ; CHECK-LABEL: test_vloxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10065,8 +10065,8 @@ define @test_vloxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4 ; 
CHECK-LABEL: test_vloxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10079,8 +10079,8 @@ define @test_vloxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t ; CHECK-LABEL: test_vloxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10092,8 +10092,8 @@ define @test_vloxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10106,8 +10106,8 @@ define @test_vloxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t ; CHECK-LABEL: test_vloxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10119,8 +10119,8 @@ define @test_vloxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10133,8 +10133,8 @@ define @test_vloxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t ; CHECK-LABEL: test_vloxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10146,8 +10146,8 @@ define @test_vloxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10160,8 +10160,8 @@ define @test_vloxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t ; CHECK-LABEL: test_vloxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10173,8 +10173,8 @@ define @test_vloxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10187,8 +10187,8 @@ define @test_vloxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t ; CHECK-LABEL: test_vloxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10200,8 +10200,8 @@ define @test_vloxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10268,8 +10268,8 @@ define @test_vloxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4 ; CHECK-LABEL: test_vloxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; 
CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei32.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10281,8 +10281,8 @@ define @test_vloxseg4_mask_nxv4f32_triscv.vector.tuple_nxv1 ; CHECK-LABEL: test_vloxseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei32.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10295,8 +10295,8 @@ define @test_vloxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t ; CHECK-LABEL: test_vloxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10308,8 +10308,8 @@ define @test_vloxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10322,8 +10322,8 @@ define @test_vloxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t ; CHECK-LABEL: test_vloxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10335,8 +10335,8 @@ define @test_vloxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10349,8 +10349,8 @@ define @test_vloxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t ; CHECK-LABEL: test_vloxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10362,8 +10362,8 @@ define @test_vloxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10376,8 +10376,8 @@ define @test_vloxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t ; CHECK-LABEL: test_vloxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10389,8 +10389,8 @@ define @test_vloxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10403,8 +10403,8 @@ define @test_vloxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t ; CHECK-LABEL: test_vloxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10416,8 +10416,8 @@ define @test_vloxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10430,8 +10430,8 @@ define 
@test_vloxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t ; CHECK-LABEL: test_vloxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10443,8 +10443,8 @@ define @test_vloxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10457,8 +10457,8 @@ define @test_vloxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t ; CHECK-LABEL: test_vloxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10470,8 +10470,8 @@ define @test_vloxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10484,8 +10484,8 @@ define @test_vloxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t ; CHECK-LABEL: test_vloxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10497,8 +10497,8 @@ define @test_vloxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10511,8 +10511,8 @@ define @test_vloxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t ; CHECK-LABEL: test_vloxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10524,8 +10524,8 @@ define @test_vloxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10538,8 +10538,8 @@ define @test_vloxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t ; CHECK-LABEL: test_vloxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10551,8 +10551,8 @@ define @test_vloxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10565,8 +10565,8 @@ define @test_vloxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t ; CHECK-LABEL: test_vloxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10578,8 +10578,8 @@ define @test_vloxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10592,8 +10592,8 @@ define @test_vloxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t ; CHECK-LABEL: test_vloxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10605,8 +10605,8 @@ define @test_vloxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10619,8 +10619,8 @@ define @test_vloxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t ; CHECK-LABEL: test_vloxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10632,8 +10632,8 @@ define @test_vloxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10646,8 +10646,8 @@ define @test_vloxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t ; CHECK-LABEL: test_vloxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10659,8 +10659,8 @@ define @test_vloxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10673,8 +10673,8 @@ define @test_vloxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t ; CHECK-LABEL: test_vloxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10686,8 +10686,8 @@ define @test_vloxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10700,8 +10700,8 @@ define @test_vloxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t ; CHECK-LABEL: test_vloxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10713,8 +10713,8 @@ define @test_vloxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10727,8 +10727,8 @@ define @test_vloxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t ; CHECK-LABEL: test_vloxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10740,8 +10740,8 @@ define 
@test_vloxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10754,8 +10754,8 @@ define @test_vloxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t ; CHECK-LABEL: test_vloxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10767,8 +10767,8 @@ define @test_vloxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10781,8 +10781,8 @@ define @test_vloxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t ; CHECK-LABEL: test_vloxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10794,8 +10794,8 @@ define @test_vloxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10808,8 +10808,8 @@ define @test_vloxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t ; CHECK-LABEL: test_vloxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10821,8 +10821,8 @@ define @test_vloxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10835,8 +10835,8 @@ define @test_vloxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t ; CHECK-LABEL: test_vloxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10848,8 +10848,8 @@ define @test_vloxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10862,8 +10862,8 @@ define @test_vloxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t ; CHECK-LABEL: test_vloxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10875,8 +10875,8 @@ define @test_vloxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10889,8 +10889,8 @@ define @test_vloxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t ; CHECK-LABEL: test_vloxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10902,8 +10902,8 @@ define @test_vloxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10916,8 +10916,8 @@ define @test_vloxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t ; CHECK-LABEL: test_vloxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10929,8 +10929,8 @@ define @test_vloxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10943,8 +10943,8 @@ define @test_vloxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vloxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 6) @@ -10956,8 +10956,8 @@ define @test_vloxseg2_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -10970,8 +10970,8 @@ define @test_vloxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vloxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 6) @@ -10983,8 +10983,8 @@ define @test_vloxseg2_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -10997,8 +10997,8 @@ define @test_vloxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vloxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11010,8 +11010,8 @@ define @test_vloxseg2_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11186,8 +11186,8 @@ define @test_vloxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vloxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11199,8 +11199,8 @@ define @test_vloxseg3_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11213,8 +11213,8 @@ define @test_vloxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3 
; CHECK-LABEL: test_vloxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11226,8 +11226,8 @@ define @test_vloxseg3_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11240,8 +11240,8 @@ define @test_vloxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vloxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11253,8 +11253,8 @@ define @test_vloxseg3_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11348,8 +11348,8 @@ define @test_vloxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vloxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11361,8 +11361,8 @@ define @test_vloxseg4_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr 
%base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11375,8 +11375,8 @@ define @test_vloxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vloxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11388,8 +11388,8 @@ define @test_vloxseg4_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11402,8 +11402,8 @@ define @test_vloxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vloxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11415,8 +11415,8 @@ define @test_vloxseg4_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11510,8 +11510,8 @@ define @test_vloxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vloxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11523,8 +11523,8 @@ define @test_vloxseg5_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11537,8 +11537,8 @@ define @test_vloxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vloxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11550,8 +11550,8 @@ define @test_vloxseg5_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11564,8 +11564,8 @@ define @test_vloxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vloxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11577,8 +11577,8 @@ define @test_vloxseg5_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11591,8 +11591,8 @@ define @test_vloxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vloxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11604,8 +11604,8 @@ define @test_vloxseg6_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, 
v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11618,8 +11618,8 @@ define @test_vloxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vloxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11631,8 +11631,8 @@ define @test_vloxseg6_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11645,8 +11645,8 @@ define @test_vloxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vloxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11658,8 +11658,8 @@ define @test_vloxseg6_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11672,8 +11672,8 @@ define @test_vloxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vloxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11685,8 +11685,8 @@ define @test_vloxseg7_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11699,8 +11699,8 @@ define @test_vloxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vloxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11712,8 +11712,8 @@ define @test_vloxseg7_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11726,8 +11726,8 @@ define @test_vloxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vloxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11739,8 +11739,8 @@ define @test_vloxseg7_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11753,8 +11753,8 @@ define @test_vloxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vloxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11766,8 +11766,8 @@ define 
@test_vloxseg8_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11780,8 +11780,8 @@ define @test_vloxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vloxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11793,8 +11793,8 @@ define @test_vloxseg8_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11807,8 +11807,8 @@ define @test_vloxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vloxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11820,8 +11820,8 @@ define @test_vloxseg8_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11834,8 +11834,8 @@ define @test_vloxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -11847,8 +11847,8 @@ define @test_vloxseg2_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -11861,8 +11861,8 @@ define @test_vloxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -11874,8 +11874,8 @@ define @test_vloxseg2_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -11888,8 +11888,8 @@ define @test_vloxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -11901,8 +11901,8 @@ define @test_vloxseg2_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -11915,8 +11915,8 @@ define @test_vloxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -11928,8 +11928,8 @@ define @test_vloxseg2_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -11942,8 +11942,8 @@ define @test_vloxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -11955,8 +11955,8 @@ define @test_vloxseg2_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -11969,8 +11969,8 @@ define @test_vloxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -11982,8 +11982,8 @@ define @test_vloxseg2_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -11996,8 +11996,8 @@ define @test_vloxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12009,8 +12009,8 @@ define @test_vloxseg2_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12023,8 +12023,8 @@ define @test_vloxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12036,8 +12036,8 @@ define @test_vloxseg2_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12239,8 +12239,8 @@ define @test_vloxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12252,8 +12252,8 @@ define @test_vloxseg3_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12266,8 +12266,8 @@ define 
@test_vloxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12279,8 +12279,8 @@ define @test_vloxseg3_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12293,8 +12293,8 @@ define @test_vloxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12306,8 +12306,8 @@ define @test_vloxseg3_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12320,8 +12320,8 @@ define @test_vloxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12333,8 +12333,8 @@ define @test_vloxseg3_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12347,8 +12347,8 @@ define @test_vloxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12360,8 +12360,8 @@ define @test_vloxseg3_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12374,8 +12374,8 @@ define @test_vloxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12387,8 +12387,8 @@ define @test_vloxseg3_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12401,8 +12401,8 @@ define @test_vloxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12414,8 +12414,8 @@ define @test_vloxseg3_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12428,8 +12428,8 @@ define @test_vloxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12441,8 +12441,8 @@ define @test_vloxseg3_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12509,8 +12509,8 @@ define @test_vloxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8 ; CHECK-LABEL: test_vloxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei16.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12522,8 +12522,8 @@ define @test_vloxseg3_mask_nxv8bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei16.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12563,8 +12563,8 @@ define @test_vloxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12576,8 +12576,8 @@ define @test_vloxseg4_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12590,8 +12590,8 @@ define @test_vloxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12603,8 +12603,8 @@ define @test_vloxseg4_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12617,8 +12617,8 @@ define @test_vloxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12630,8 +12630,8 @@ define @test_vloxseg4_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12644,8 +12644,8 @@ define @test_vloxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12657,8 +12657,8 @@ define 
@test_vloxseg4_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12671,8 +12671,8 @@ define @test_vloxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12684,8 +12684,8 @@ define @test_vloxseg4_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12698,8 +12698,8 @@ define @test_vloxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12711,8 +12711,8 @@ define @test_vloxseg4_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12725,8 +12725,8 @@ define @test_vloxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12738,8 +12738,8 @@ define @test_vloxseg4_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12752,8 +12752,8 @@ define @test_vloxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12765,8 +12765,8 @@ define @test_vloxseg4_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12833,8 +12833,8 @@ define @test_vloxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8 ; CHECK-LABEL: test_vloxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei16.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12846,8 +12846,8 @@ define @test_vloxseg4_mask_nxv8bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei16.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12887,8 +12887,8 @@ define @test_vloxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12900,8 +12900,8 @@ define @test_vloxseg5_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12914,8 +12914,8 @@ define @test_vloxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12927,8 +12927,8 @@ define @test_vloxseg5_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12941,8 +12941,8 @@ define @test_vloxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12954,8 +12954,8 @@ define @test_vloxseg5_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12968,8 +12968,8 @@ define @test_vloxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12981,8 +12981,8 @@ define @test_vloxseg5_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12995,8 +12995,8 @@ define @test_vloxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13008,8 +13008,8 @@ define @test_vloxseg5_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13022,8 +13022,8 @@ define @test_vloxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13035,8 +13035,8 @@ define @test_vloxseg5_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13049,8 +13049,8 @@ define 
@test_vloxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13062,8 +13062,8 @@ define @test_vloxseg5_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13076,8 +13076,8 @@ define @test_vloxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13089,8 +13089,8 @@ define @test_vloxseg5_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13103,8 +13103,8 @@ define @test_vloxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13116,8 +13116,8 @@ define @test_vloxseg5_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13130,8 +13130,8 @@ define @test_vloxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13143,8 +13143,8 @@ define @test_vloxseg6_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13157,8 +13157,8 @@ define @test_vloxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13170,8 +13170,8 @@ define @test_vloxseg6_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13184,8 +13184,8 @@ define @test_vloxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13197,8 +13197,8 @@ define @test_vloxseg6_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13211,8 +13211,8 @@ define @test_vloxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13224,8 +13224,8 @@ define @test_vloxseg6_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13238,8 +13238,8 @@ define @test_vloxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13251,8 +13251,8 @@ define @test_vloxseg6_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13265,8 +13265,8 @@ define @test_vloxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13278,8 +13278,8 @@ define @test_vloxseg6_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13292,8 +13292,8 @@ define @test_vloxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13305,8 +13305,8 @@ define @test_vloxseg6_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13319,8 +13319,8 @@ define @test_vloxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13332,8 +13332,8 @@ define @test_vloxseg6_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13346,8 +13346,8 @@ define @test_vloxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13359,8 +13359,8 @@ define 
@test_vloxseg6_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13373,8 +13373,8 @@ define @test_vloxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13386,8 +13386,8 @@ define @test_vloxseg7_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13400,8 +13400,8 @@ define @test_vloxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13413,8 +13413,8 @@ define @test_vloxseg7_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13427,8 +13427,8 @@ define @test_vloxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13440,8 +13440,8 @@ define @test_vloxseg7_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13454,8 +13454,8 @@ define @test_vloxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13467,8 +13467,8 @@ define @test_vloxseg7_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13481,8 +13481,8 @@ define @test_vloxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13494,8 +13494,8 @@ define @test_vloxseg7_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13508,8 +13508,8 @@ define @test_vloxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13521,8 +13521,8 @@ define @test_vloxseg7_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13535,8 +13535,8 @@ define @test_vloxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13548,8 +13548,8 @@ define @test_vloxseg7_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13562,8 +13562,8 @@ define @test_vloxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13575,8 +13575,8 @@ define @test_vloxseg7_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13589,8 +13589,8 @@ define @test_vloxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13602,8 +13602,8 @@ define @test_vloxseg7_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13616,8 +13616,8 @@ define @test_vloxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13629,8 +13629,8 @@ define @test_vloxseg8_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13643,8 +13643,8 @@ define @test_vloxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13656,8 +13656,8 @@ define @test_vloxseg8_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13670,8 +13670,8 @@ define 
@test_vloxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13683,8 +13683,8 @@ define @test_vloxseg8_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13697,8 +13697,8 @@ define @test_vloxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13710,8 +13710,8 @@ define @test_vloxseg8_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13724,8 +13724,8 @@ define @test_vloxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13737,8 +13737,8 @@ define @test_vloxseg8_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13751,8 +13751,8 @@ define @test_vloxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13764,8 +13764,8 @@ define @test_vloxseg8_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13778,8 +13778,8 @@ define @test_vloxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13791,8 +13791,8 @@ define @test_vloxseg8_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13805,8 +13805,8 @@ define @test_vloxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13818,8 +13818,8 @@ define @test_vloxseg8_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13832,8 +13832,8 @@ define @test_vloxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13845,8 +13845,8 @@ define @test_vloxseg8_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) diff --git a/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv64.ll index a45571e30e853..8f85eb5638255 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vloxseg-rv64.ll @@ -9,8 +9,8 @@ define @test_vloxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -22,8 +22,8 @@ define @test_vloxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -38,8 +38,8 @@ define @test_vloxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, 
i64 %vl, i64 3) @@ -51,8 +51,8 @@ define @test_vloxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -67,8 +67,8 @@ define @test_vloxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -80,8 +80,8 @@ define @test_vloxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -96,8 +96,8 @@ define @test_vloxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -109,8 +109,8 @@ define @test_vloxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -125,8 +125,8 @@ define @test_vloxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -138,8 +138,8 @@ define @test_vloxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -154,8 +154,8 @@ define @test_vloxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -167,8 +167,8 @@ define @test_vloxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -183,8 +183,8 @@ define @test_vloxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -196,8 +196,8 @@ define @test_vloxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -241,8 +241,8 @@ define @test_vloxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; 
CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -254,8 +254,8 @@ define @test_vloxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -270,8 +270,8 @@ define @test_vloxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -283,8 +283,8 @@ define @test_vloxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -357,8 +357,8 @@ define @test_vloxseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t_nxv ; CHECK-LABEL: test_vloxseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv8i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -370,8 +370,8 @@ define @test_vloxseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vloxseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -618,8 +618,8 @@ define @test_vloxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; 
CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv1i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -631,8 +631,8 @@ define @test_vloxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -647,8 +647,8 @@ define @test_vloxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv1i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -660,8 +660,8 @@ define @test_vloxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -676,8 +676,8 @@ define @test_vloxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv1i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -689,8 +689,8 @@ define @test_vloxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -705,8 +705,8 @@ define @test_vloxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv ; CHECK-LABEL: 
test_vloxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv1i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -718,8 +718,8 @@ define @test_vloxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -734,8 +734,8 @@ define @test_vloxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -747,8 +747,8 @@ define @test_vloxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -763,8 +763,8 @@ define @test_vloxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -776,8 +776,8 @@ define @test_vloxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, 
i64 %vl, i64 1, i64 3) @@ -792,8 +792,8 @@ define @test_vloxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -805,8 +805,8 @@ define @test_vloxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -850,8 +850,8 @@ define @test_vloxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -863,8 +863,8 @@ define @test_vloxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -879,8 +879,8 @@ define @test_vloxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -892,8 +892,8 @@ define @test_vloxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -966,8 +966,8 @@ define @test_vloxseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t_nxv ; CHECK-LABEL: test_vloxseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -979,8 +979,8 @@ define @test_vloxseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vloxseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1082,8 +1082,8 @@ define @test_vloxseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei8.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv16i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1095,8 +1095,8 @@ define @test_vloxseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vloxseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei8.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv16i1.nxv16i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1169,8 +1169,8 @@ define @test_vloxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv1i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1182,8 +1182,8 @@ define @test_vloxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; 
CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1198,8 +1198,8 @@ define @test_vloxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv1i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1211,8 +1211,8 @@ define @test_vloxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1227,8 +1227,8 @@ define @test_vloxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv1i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1240,8 +1240,8 @@ define @test_vloxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1256,8 +1256,8 @@ define @test_vloxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv1i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1269,8 +1269,8 @@ define @test_vloxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli 
zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1285,8 +1285,8 @@ define @test_vloxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1298,8 +1298,8 @@ define @test_vloxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1314,8 +1314,8 @@ define @test_vloxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1327,8 +1327,8 @@ define @test_vloxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1343,8 +1343,8 @@ define @test_vloxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1356,8 +1356,8 @@ define 
@test_vloxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1401,8 +1401,8 @@ define @test_vloxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1414,8 +1414,8 @@ define @test_vloxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1430,8 +1430,8 @@ define @test_vloxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1443,8 +1443,8 @@ define @test_vloxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1517,8 +1517,8 @@ define @test_vloxseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t_nxv ; CHECK-LABEL: test_vloxseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1530,8 +1530,8 @@ define @test_vloxseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vloxseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1633,8 +1633,8 @@ define @test_vloxseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei8.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv16i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1646,8 +1646,8 @@ define @test_vloxseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vloxseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei8.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv16i1.nxv16i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1720,8 +1720,8 @@ define @test_vloxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv1i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1733,8 +1733,8 @@ define @test_vloxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1749,8 +1749,8 @@ define @test_vloxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv1i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1762,8 +1762,8 @@ define @test_vloxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1778,8 +1778,8 @@ define @test_vloxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv1i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1791,8 +1791,8 @@ define @test_vloxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1807,8 +1807,8 @@ define @test_vloxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv1i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1820,8 +1820,8 @@ define @test_vloxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1836,8 +1836,8 @@ define @test_vloxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1849,8 +1849,8 @@ define @test_vloxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1865,8 +1865,8 @@ define @test_vloxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1878,8 +1878,8 @@ define @test_vloxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1894,8 +1894,8 @@ define @test_vloxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1907,8 +1907,8 @@ define @test_vloxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1923,8 +1923,8 @@ define 
@test_vloxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1936,8 +1936,8 @@ define @test_vloxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1952,8 +1952,8 @@ define @test_vloxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1965,8 +1965,8 @@ define @test_vloxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1981,8 +1981,8 @@ define @test_vloxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1994,8 +1994,8 @@ define @test_vloxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2010,8 +2010,8 @@ define @test_vloxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2023,8 +2023,8 @@ define @test_vloxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2068,8 +2068,8 @@ define @test_vloxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2081,8 +2081,8 @@ define @test_vloxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2097,8 +2097,8 @@ define @test_vloxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv ; CHECK-LABEL: test_vloxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv8i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2110,8 +2110,8 @@ define @test_vloxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vloxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v 
v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2184,8 +2184,8 @@ define @test_vloxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv1i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2197,8 +2197,8 @@ define @test_vloxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2213,8 +2213,8 @@ define @test_vloxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv1i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2226,8 +2226,8 @@ define @test_vloxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2242,8 +2242,8 @@ define @test_vloxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv1i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2255,8 +2255,8 @@ define @test_vloxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2271,8 +2271,8 @@ define @test_vloxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv1i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2284,8 +2284,8 @@ define @test_vloxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2300,8 +2300,8 @@ define @test_vloxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2313,8 +2313,8 @@ define @test_vloxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2329,8 +2329,8 @@ define @test_vloxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2342,8 +2342,8 @@ define 
@test_vloxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2358,8 +2358,8 @@ define @test_vloxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2371,8 +2371,8 @@ define @test_vloxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2387,8 +2387,8 @@ define @test_vloxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2400,8 +2400,8 @@ define @test_vloxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2416,8 +2416,8 @@ define @test_vloxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2429,8 +2429,8 @@ define @test_vloxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2445,8 +2445,8 @@ define @test_vloxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2458,8 +2458,8 @@ define @test_vloxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2474,8 +2474,8 @@ define @test_vloxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2487,8 +2487,8 @@ define @test_vloxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2532,8 +2532,8 @@ define @test_vloxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2545,8 +2545,8 @@ define @test_vloxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2561,8 +2561,8 @@ define @test_vloxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv ; CHECK-LABEL: test_vloxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv8i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2574,8 +2574,8 @@ define @test_vloxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vloxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2648,8 +2648,8 @@ define @test_vloxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv1i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2661,8 +2661,8 @@ define @test_vloxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2677,8 +2677,8 @@ define @test_vloxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv1i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2690,8 +2690,8 @@ define @test_vloxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2706,8 +2706,8 @@ define @test_vloxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv1i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2719,8 +2719,8 @@ define @test_vloxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2735,8 +2735,8 @@ define @test_vloxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv1i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2748,8 +2748,8 @@ define @test_vloxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2764,8 +2764,8 @@ define 
@test_vloxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2777,8 +2777,8 @@ define @test_vloxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2793,8 +2793,8 @@ define @test_vloxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2806,8 +2806,8 @@ define @test_vloxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2822,8 +2822,8 @@ define @test_vloxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2835,8 +2835,8 @@ define @test_vloxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2851,8 +2851,8 @@ define @test_vloxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2864,8 +2864,8 @@ define @test_vloxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2880,8 +2880,8 @@ define @test_vloxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2893,8 +2893,8 @@ define @test_vloxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2909,8 +2909,8 @@ define @test_vloxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2922,8 +2922,8 @@ define @test_vloxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2938,8 +2938,8 @@ define @test_vloxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2951,8 +2951,8 @@ define @test_vloxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2996,8 +2996,8 @@ define @test_vloxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3009,8 +3009,8 @@ define @test_vloxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3025,8 +3025,8 @@ define @test_vloxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv ; CHECK-LABEL: test_vloxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv8i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3038,8 +3038,8 @@ define @test_vloxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vloxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3112,8 +3112,8 @@ define @test_vloxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv1i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3125,8 +3125,8 @@ define @test_vloxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3141,8 +3141,8 @@ define @test_vloxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv1i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3154,8 +3154,8 @@ define @test_vloxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3170,8 +3170,8 @@ define @test_vloxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv1i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3183,8 +3183,8 @@ define 
@test_vloxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3199,8 +3199,8 @@ define @test_vloxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv1i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3212,8 +3212,8 @@ define @test_vloxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3228,8 +3228,8 @@ define @test_vloxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3241,8 +3241,8 @@ define @test_vloxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3257,8 +3257,8 @@ define @test_vloxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3270,8 +3270,8 @@ define @test_vloxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3286,8 +3286,8 @@ define @test_vloxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3299,8 +3299,8 @@ define @test_vloxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3315,8 +3315,8 @@ define @test_vloxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3328,8 +3328,8 @@ define @test_vloxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3344,8 +3344,8 @@ define @test_vloxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3357,8 +3357,8 @@ define @test_vloxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3373,8 +3373,8 @@ define @test_vloxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3386,8 +3386,8 @@ define @test_vloxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3402,8 +3402,8 @@ define @test_vloxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3415,8 +3415,8 @@ define @test_vloxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3460,8 +3460,8 @@ define @test_vloxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i8: ; CHECK: # %bb.0: 
# %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3473,8 +3473,8 @@ define @test_vloxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3489,8 +3489,8 @@ define @test_vloxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv ; CHECK-LABEL: test_vloxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv8i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3502,8 +3502,8 @@ define @test_vloxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vloxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3576,8 +3576,8 @@ define @test_vloxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3589,8 +3589,8 @@ define @test_vloxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3605,8 +3605,8 @@ define 
@test_vloxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3618,8 +3618,8 @@ define @test_vloxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3634,8 +3634,8 @@ define @test_vloxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3647,8 +3647,8 @@ define @test_vloxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3663,8 +3663,8 @@ define @test_vloxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3676,8 +3676,8 @@ define @test_vloxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3692,8 +3692,8 @@ define @test_vloxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3705,8 +3705,8 @@ define @test_vloxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3721,8 +3721,8 @@ define @test_vloxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3734,8 +3734,8 @@ define @test_vloxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3750,8 +3750,8 @@ define @test_vloxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3763,8 +3763,8 @@ define @test_vloxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, 
v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3808,8 +3808,8 @@ define @test_vloxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3821,8 +3821,8 @@ define @test_vloxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3837,8 +3837,8 @@ define @test_vloxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3850,8 +3850,8 @@ define @test_vloxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4127,8 +4127,8 @@ define @test_vloxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4140,8 +4140,8 @@ define @test_vloxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, 
mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4156,8 +4156,8 @@ define @test_vloxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4169,8 +4169,8 @@ define @test_vloxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4185,8 +4185,8 @@ define @test_vloxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4198,8 +4198,8 @@ define @test_vloxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4214,8 +4214,8 @@ define @test_vloxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4227,8 +4227,8 @@ define @test_vloxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; 
CHECK-LABEL: test_vloxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4243,8 +4243,8 @@ define @test_vloxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4256,8 +4256,8 @@ define @test_vloxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4272,8 +4272,8 @@ define @test_vloxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4285,8 +4285,8 @@ define @test_vloxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4301,8 +4301,8 @@ define @test_vloxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4314,8 +4314,8 @@ define @test_vloxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4359,8 +4359,8 @@ define @test_vloxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4372,8 +4372,8 @@ define @test_vloxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4388,8 +4388,8 @@ define @test_vloxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4401,8 +4401,8 @@ define @test_vloxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4504,8 +4504,8 @@ define @test_vloxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: 
vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei16.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4517,8 +4517,8 @@ define @test_vloxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vloxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei16.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4591,8 +4591,8 @@ define @test_vloxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4604,8 +4604,8 @@ define @test_vloxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4620,8 +4620,8 @@ define @test_vloxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4633,8 +4633,8 @@ define @test_vloxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4649,8 +4649,8 @@ define @test_vloxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4662,8 +4662,8 @@ define @test_vloxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4678,8 +4678,8 @@ define @test_vloxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4691,8 +4691,8 @@ define @test_vloxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4707,8 +4707,8 @@ define @test_vloxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4720,8 +4720,8 @@ define @test_vloxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4736,8 +4736,8 @@ define 
@test_vloxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4749,8 +4749,8 @@ define @test_vloxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4765,8 +4765,8 @@ define @test_vloxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4778,8 +4778,8 @@ define @test_vloxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4823,8 +4823,8 @@ define @test_vloxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4836,8 +4836,8 @@ define @test_vloxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4852,8 +4852,8 @@ define @test_vloxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4865,8 +4865,8 @@ define @test_vloxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4968,8 +4968,8 @@ define @test_vloxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei16.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4981,8 +4981,8 @@ define @test_vloxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vloxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei16.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5055,8 +5055,8 @@ define @test_vloxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5068,8 +5068,8 @@ define @test_vloxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5084,8 +5084,8 @@ define @test_vloxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5097,8 +5097,8 @@ define @test_vloxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5113,8 +5113,8 @@ define @test_vloxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5126,8 +5126,8 @@ define @test_vloxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5142,8 +5142,8 @@ define @test_vloxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5155,8 +5155,8 @@ define @test_vloxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5171,8 +5171,8 @@ define @test_vloxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5184,8 +5184,8 @@ define @test_vloxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5200,8 +5200,8 @@ define @test_vloxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5213,8 +5213,8 @@ define @test_vloxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5229,8 +5229,8 @@ define @test_vloxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5242,8 +5242,8 @@ define 
@test_vloxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5258,8 +5258,8 @@ define @test_vloxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5271,8 +5271,8 @@ define @test_vloxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5287,8 +5287,8 @@ define @test_vloxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5300,8 +5300,8 @@ define @test_vloxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5316,8 +5316,8 @@ define @test_vloxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5329,8 +5329,8 @@ define @test_vloxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5345,8 +5345,8 @@ define @test_vloxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5358,8 +5358,8 @@ define @test_vloxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5403,8 +5403,8 @@ define @test_vloxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5416,8 +5416,8 @@ define @test_vloxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5432,8 +5432,8 @@ define @test_vloxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5445,8 +5445,8 @@ define @test_vloxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5461,8 +5461,8 @@ define @test_vloxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5474,8 +5474,8 @@ define @test_vloxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5490,8 +5490,8 @@ define @test_vloxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5503,8 +5503,8 @@ define @test_vloxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5519,8 +5519,8 @@ define @test_vloxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: 
# %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5532,8 +5532,8 @@ define @test_vloxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5548,8 +5548,8 @@ define @test_vloxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5561,8 +5561,8 @@ define @test_vloxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5577,8 +5577,8 @@ define @test_vloxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5590,8 +5590,8 @@ define @test_vloxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5606,8 +5606,8 @@ define 
@test_vloxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5619,8 +5619,8 @@ define @test_vloxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5635,8 +5635,8 @@ define @test_vloxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5648,8 +5648,8 @@ define @test_vloxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5664,8 +5664,8 @@ define @test_vloxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5677,8 +5677,8 @@ define @test_vloxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5693,8 +5693,8 @@ define @test_vloxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5706,8 +5706,8 @@ define @test_vloxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5751,8 +5751,8 @@ define @test_vloxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5764,8 +5764,8 @@ define @test_vloxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5780,8 +5780,8 @@ define @test_vloxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5793,8 +5793,8 @@ define @test_vloxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5809,8 +5809,8 @@ define @test_vloxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5822,8 +5822,8 @@ define @test_vloxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5838,8 +5838,8 @@ define @test_vloxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5851,8 +5851,8 @@ define @test_vloxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5867,8 +5867,8 @@ define @test_vloxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5880,8 +5880,8 @@ define @test_vloxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5896,8 +5896,8 @@ define @test_vloxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5909,8 +5909,8 @@ define @test_vloxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5925,8 +5925,8 @@ define @test_vloxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5938,8 +5938,8 @@ define @test_vloxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5954,8 +5954,8 @@ define @test_vloxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5967,8 +5967,8 @@ define 
@test_vloxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5983,8 +5983,8 @@ define @test_vloxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5996,8 +5996,8 @@ define @test_vloxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6012,8 +6012,8 @@ define @test_vloxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6025,8 +6025,8 @@ define @test_vloxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6041,8 +6041,8 @@ define @test_vloxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6054,8 +6054,8 @@ define @test_vloxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6099,8 +6099,8 @@ define @test_vloxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6112,8 +6112,8 @@ define @test_vloxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6128,8 +6128,8 @@ define @test_vloxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6141,8 +6141,8 @@ define @test_vloxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6157,8 +6157,8 @@ define @test_vloxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6170,8 +6170,8 @@ define @test_vloxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6186,8 +6186,8 @@ define @test_vloxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6199,8 +6199,8 @@ define @test_vloxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6215,8 +6215,8 @@ define @test_vloxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6228,8 +6228,8 @@ define @test_vloxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6244,8 +6244,8 @@ define @test_vloxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6257,8 +6257,8 @@ define @test_vloxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6273,8 +6273,8 @@ define @test_vloxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6286,8 +6286,8 @@ define @test_vloxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6302,8 +6302,8 @@ define @test_vloxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6315,8 +6315,8 @@ define @test_vloxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6331,8 +6331,8 @@ 
define @test_vloxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6344,8 +6344,8 @@ define @test_vloxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6360,8 +6360,8 @@ define @test_vloxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6373,8 +6373,8 @@ define @test_vloxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6389,8 +6389,8 @@ define @test_vloxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6402,8 +6402,8 @@ define @test_vloxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6447,8 +6447,8 @@ define @test_vloxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6460,8 +6460,8 @@ define @test_vloxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6476,8 +6476,8 @@ define @test_vloxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6489,8 +6489,8 @@ define @test_vloxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6505,8 +6505,8 @@ define @test_vloxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6518,8 +6518,8 @@ define @test_vloxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, 
v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6534,8 +6534,8 @@ define @test_vloxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6547,8 +6547,8 @@ define @test_vloxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6563,8 +6563,8 @@ define @test_vloxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6576,8 +6576,8 @@ define @test_vloxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6592,8 +6592,8 @@ define @test_vloxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6605,8 +6605,8 @@ define @test_vloxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, 
e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6621,8 +6621,8 @@ define @test_vloxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6634,8 +6634,8 @@ define @test_vloxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6911,8 +6911,8 @@ define @test_vloxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6924,8 +6924,8 @@ define @test_vloxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6940,8 +6940,8 @@ define @test_vloxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6953,8 +6953,8 @@ define @test_vloxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; 
CHECK-LABEL: test_vloxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6969,8 +6969,8 @@ define @test_vloxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6982,8 +6982,8 @@ define @test_vloxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6998,8 +6998,8 @@ define @test_vloxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7011,8 +7011,8 @@ define @test_vloxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7027,8 +7027,8 @@ define @test_vloxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7040,8 +7040,8 @@ define @test_vloxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7056,8 +7056,8 @@ define @test_vloxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7069,8 +7069,8 @@ define @test_vloxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7085,8 +7085,8 @@ define @test_vloxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7098,8 +7098,8 @@ define @test_vloxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7201,8 +7201,8 @@ define @test_vloxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; 
CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei32.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7214,8 +7214,8 @@ define @test_vloxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vloxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei32.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7259,8 +7259,8 @@ define @test_vloxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7272,8 +7272,8 @@ define @test_vloxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7288,8 +7288,8 @@ define @test_vloxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7301,8 +7301,8 @@ define @test_vloxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7317,8 +7317,8 @@ define @test_vloxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7330,8 +7330,8 @@ define @test_vloxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7346,8 +7346,8 @@ define @test_vloxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7359,8 +7359,8 @@ define @test_vloxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7375,8 +7375,8 @@ define @test_vloxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7388,8 +7388,8 @@ define @test_vloxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7404,8 +7404,8 @@ define 
@test_vloxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7417,8 +7417,8 @@ define @test_vloxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7433,8 +7433,8 @@ define @test_vloxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7446,8 +7446,8 @@ define @test_vloxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7549,8 +7549,8 @@ define @test_vloxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei32.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7562,8 +7562,8 @@ define @test_vloxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vloxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei32.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7607,8 +7607,8 @@ define @test_vloxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7620,8 +7620,8 @@ define @test_vloxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7636,8 +7636,8 @@ define @test_vloxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7649,8 +7649,8 @@ define @test_vloxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7665,8 +7665,8 @@ define @test_vloxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7678,8 +7678,8 @@ define @test_vloxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7694,8 +7694,8 @@ define @test_vloxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7707,8 +7707,8 @@ define @test_vloxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7723,8 +7723,8 @@ define @test_vloxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7736,8 +7736,8 @@ define @test_vloxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7752,8 +7752,8 @@ define @test_vloxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7765,8 +7765,8 @@ define @test_vloxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7781,8 +7781,8 @@ define @test_vloxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7794,8 +7794,8 @@ define @test_vloxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7810,8 +7810,8 @@ define @test_vloxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7823,8 +7823,8 @@ define @test_vloxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7839,8 +7839,8 @@ define @test_vloxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7852,8 +7852,8 @@ define 
@test_vloxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7868,8 +7868,8 @@ define @test_vloxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7881,8 +7881,8 @@ define @test_vloxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7897,8 +7897,8 @@ define @test_vloxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7910,8 +7910,8 @@ define @test_vloxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7926,8 +7926,8 @@ define @test_vloxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7939,8 +7939,8 @@ define @test_vloxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7955,8 +7955,8 @@ define @test_vloxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7968,8 +7968,8 @@ define @test_vloxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7984,8 +7984,8 @@ define @test_vloxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7997,8 +7997,8 @@ define @test_vloxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8013,8 +8013,8 @@ define @test_vloxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8026,8 +8026,8 @@ define @test_vloxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8042,8 +8042,8 @@ define @test_vloxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8055,8 +8055,8 @@ define @test_vloxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8071,8 +8071,8 @@ define @test_vloxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8084,8 +8084,8 @@ define @test_vloxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8100,8 +8100,8 @@ define @test_vloxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8113,8 +8113,8 @@ define @test_vloxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8129,8 +8129,8 @@ define @test_vloxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8142,8 +8142,8 @@ define @test_vloxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8158,8 +8158,8 @@ define @test_vloxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8171,8 +8171,8 @@ define @test_vloxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8187,8 +8187,8 @@ define 
@test_vloxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8200,8 +8200,8 @@ define @test_vloxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8216,8 +8216,8 @@ define @test_vloxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8229,8 +8229,8 @@ define @test_vloxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8245,8 +8245,8 @@ define @test_vloxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8258,8 +8258,8 @@ define @test_vloxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8274,8 +8274,8 @@ define @test_vloxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8287,8 +8287,8 @@ define @test_vloxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8303,8 +8303,8 @@ define @test_vloxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8316,8 +8316,8 @@ define @test_vloxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8332,8 +8332,8 @@ define @test_vloxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8345,8 +8345,8 @@ define @test_vloxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8361,8 +8361,8 @@ define @test_vloxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8374,8 +8374,8 @@ define @test_vloxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8390,8 +8390,8 @@ define @test_vloxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8403,8 +8403,8 @@ define @test_vloxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8419,8 +8419,8 @@ define @test_vloxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8432,8 +8432,8 @@ define @test_vloxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8448,8 +8448,8 @@ define @test_vloxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8461,8 +8461,8 @@ define @test_vloxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8477,8 +8477,8 @@ define @test_vloxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8490,8 +8490,8 @@ define @test_vloxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8506,8 +8506,8 @@ define @test_vloxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8519,8 +8519,8 @@ define 
@test_vloxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8535,8 +8535,8 @@ define @test_vloxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -8548,8 +8548,8 @@ define @test_vloxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -8564,8 +8564,8 @@ define @test_vloxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -8577,8 +8577,8 @@ define @test_vloxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -8593,8 +8593,8 @@ define @test_vloxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -8606,8 +8606,8 @@ define @test_vloxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -8622,8 +8622,8 @@ define @test_vloxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vloxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -8635,8 +8635,8 @@ define @test_vloxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -8883,8 +8883,8 @@ define @test_vloxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -8896,8 +8896,8 @@ define @test_vloxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -8912,8 +8912,8 @@ define @test_vloxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -8925,8 +8925,8 @@ define @test_vloxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -8941,8 +8941,8 @@ define @test_vloxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -8954,8 +8954,8 @@ define @test_vloxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -8970,8 +8970,8 @@ define @test_vloxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vloxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -8983,8 +8983,8 @@ define @test_vloxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9086,8 +9086,8 @@ define @test_vloxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i64: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei64.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9099,8 +9099,8 @@ define @test_vloxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vloxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei64.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9115,8 +9115,8 @@ define @test_vloxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9128,8 +9128,8 @@ define @test_vloxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9144,8 +9144,8 @@ define @test_vloxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9157,8 +9157,8 @@ define @test_vloxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9173,8 +9173,8 @@ define 
@test_vloxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9186,8 +9186,8 @@ define @test_vloxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9202,8 +9202,8 @@ define @test_vloxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vloxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9215,8 +9215,8 @@ define @test_vloxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9318,8 +9318,8 @@ define @test_vloxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei64.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9331,8 +9331,8 @@ define @test_vloxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vloxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei64.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9347,8 +9347,8 @@ define @test_vloxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9360,8 +9360,8 @@ define @test_vloxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9376,8 +9376,8 @@ define @test_vloxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9389,8 +9389,8 @@ define @test_vloxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9405,8 +9405,8 @@ define @test_vloxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9418,8 +9418,8 @@ define @test_vloxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9434,8 +9434,8 @@ define @test_vloxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vloxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9447,8 +9447,8 @@ define @test_vloxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9463,8 +9463,8 @@ define @test_vloxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9476,8 +9476,8 @@ define @test_vloxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9492,8 +9492,8 @@ define @test_vloxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9505,8 +9505,8 @@ define @test_vloxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9521,8 +9521,8 @@ define @test_vloxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9534,8 +9534,8 @@ define @test_vloxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9550,8 +9550,8 @@ define @test_vloxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vloxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9563,8 +9563,8 @@ define @test_vloxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9579,8 +9579,8 @@ define @test_vloxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9592,8 +9592,8 @@ define 
@test_vloxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9608,8 +9608,8 @@ define @test_vloxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9621,8 +9621,8 @@ define @test_vloxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9637,8 +9637,8 @@ define @test_vloxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9650,8 +9650,8 @@ define @test_vloxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9666,8 +9666,8 @@ define @test_vloxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vloxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9679,8 +9679,8 @@ define @test_vloxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9695,8 +9695,8 @@ define @test_vloxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9708,8 +9708,8 @@ define @test_vloxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9724,8 +9724,8 @@ define @test_vloxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9737,8 +9737,8 @@ define @test_vloxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9753,8 +9753,8 @@ define @test_vloxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9766,8 +9766,8 @@ define @test_vloxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9782,8 +9782,8 @@ define @test_vloxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vloxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9795,8 +9795,8 @@ define @test_vloxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vloxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9809,8 +9809,8 @@ define @test_vloxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -9822,8 +9822,8 @@ define @test_vloxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -9836,8 +9836,8 @@ define @test_vloxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -9849,8 +9849,8 @@ define @test_vloxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -9863,8 +9863,8 @@ define @test_vloxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -9876,8 +9876,8 @@ define @test_vloxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -9890,8 +9890,8 @@ define @test_vloxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -9903,8 +9903,8 @@ define @test_vloxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -9917,8 +9917,8 @@ define 
@test_vloxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -9930,8 +9930,8 @@ define @test_vloxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -9944,8 +9944,8 @@ define @test_vloxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -9957,8 +9957,8 @@ define @test_vloxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -9971,8 +9971,8 @@ define @test_vloxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -9984,8 +9984,8 @@ define @test_vloxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10025,8 +10025,8 @@ define @test_vloxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10038,8 +10038,8 @@ define @test_vloxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10052,8 +10052,8 @@ define @test_vloxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_ ; CHECK-LABEL: test_vloxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10065,8 +10065,8 @@ define @test_vloxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10322,8 +10322,8 @@ define @test_vloxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10335,8 +10335,8 @@ define @test_vloxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 
+; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10349,8 +10349,8 @@ define @test_vloxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10362,8 +10362,8 @@ define @test_vloxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10376,8 +10376,8 @@ define @test_vloxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10389,8 +10389,8 @@ define @test_vloxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10403,8 +10403,8 @@ define @test_vloxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10416,8 +10416,8 @@ define @test_vloxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10430,8 +10430,8 @@ define @test_vloxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10443,8 +10443,8 @@ define @test_vloxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10457,8 +10457,8 @@ define @test_vloxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10470,8 +10470,8 @@ define @test_vloxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10484,8 +10484,8 @@ define @test_vloxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10497,8 +10497,8 @@ define 
@test_vloxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10538,8 +10538,8 @@ define @test_vloxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10551,8 +10551,8 @@ define @test_vloxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10565,8 +10565,8 @@ define @test_vloxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_ ; CHECK-LABEL: test_vloxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10578,8 +10578,8 @@ define @test_vloxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10673,8 +10673,8 @@ define @test_vloxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t ; CHECK-LABEL: test_vloxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei16.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10686,8 +10686,8 @@ define @test_vloxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16 ; CHECK-LABEL: test_vloxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei16.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10754,8 +10754,8 @@ define @test_vloxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10767,8 +10767,8 @@ define @test_vloxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10781,8 +10781,8 @@ define @test_vloxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10794,8 +10794,8 @@ define @test_vloxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10808,8 +10808,8 @@ define @test_vloxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10821,8 +10821,8 @@ define @test_vloxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10835,8 +10835,8 @@ define @test_vloxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10848,8 +10848,8 @@ define @test_vloxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10862,8 +10862,8 @@ define @test_vloxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10875,8 +10875,8 @@ define @test_vloxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10889,8 +10889,8 @@ define @test_vloxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10902,8 +10902,8 @@ define @test_vloxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10916,8 +10916,8 @@ define @test_vloxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10929,8 +10929,8 @@ define @test_vloxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10970,8 +10970,8 @@ define @test_vloxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10983,8 +10983,8 @@ define @test_vloxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10997,8 +10997,8 @@ define 
@test_vloxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_ ; CHECK-LABEL: test_vloxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11010,8 +11010,8 @@ define @test_vloxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11105,8 +11105,8 @@ define @test_vloxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t ; CHECK-LABEL: test_vloxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei16.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11118,8 +11118,8 @@ define @test_vloxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16 ; CHECK-LABEL: test_vloxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei16.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11186,8 +11186,8 @@ define @test_vloxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11199,8 +11199,8 @@ define @test_vloxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11213,8 +11213,8 @@ define @test_vloxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11226,8 +11226,8 @@ define @test_vloxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11240,8 +11240,8 @@ define @test_vloxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11253,8 +11253,8 @@ define @test_vloxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11267,8 +11267,8 @@ define @test_vloxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11280,8 +11280,8 @@ define @test_vloxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11294,8 +11294,8 @@ define @test_vloxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11307,8 +11307,8 @@ define @test_vloxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11321,8 +11321,8 @@ define @test_vloxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11334,8 +11334,8 @@ define @test_vloxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11348,8 +11348,8 @@ define @test_vloxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11361,8 +11361,8 @@ define @test_vloxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11375,8 +11375,8 @@ define @test_vloxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11388,8 +11388,8 @@ define @test_vloxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11402,8 +11402,8 @@ define @test_vloxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11415,8 +11415,8 @@ define @test_vloxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11429,8 +11429,8 @@ define @test_vloxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11442,8 +11442,8 @@ define 
@test_vloxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11456,8 +11456,8 @@ define @test_vloxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_ ; CHECK-LABEL: test_vloxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11469,8 +11469,8 @@ define @test_vloxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11510,8 +11510,8 @@ define @test_vloxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11523,8 +11523,8 @@ define @test_vloxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11537,8 +11537,8 @@ define @test_vloxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11550,8 +11550,8 @@ define @test_vloxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11564,8 +11564,8 @@ define @test_vloxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11577,8 +11577,8 @@ define @test_vloxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11591,8 +11591,8 @@ define @test_vloxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11604,8 +11604,8 @@ define @test_vloxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11618,8 +11618,8 @@ define @test_vloxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 
+; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11631,8 +11631,8 @@ define @test_vloxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11645,8 +11645,8 @@ define @test_vloxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11658,8 +11658,8 @@ define @test_vloxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11672,8 +11672,8 @@ define @test_vloxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11685,8 +11685,8 @@ define @test_vloxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11699,8 +11699,8 @@ define @test_vloxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i64: ; 
CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11712,8 +11712,8 @@ define @test_vloxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11726,8 +11726,8 @@ define @test_vloxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11739,8 +11739,8 @@ define @test_vloxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11753,8 +11753,8 @@ define @test_vloxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11766,8 +11766,8 @@ define @test_vloxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11780,8 +11780,8 
@@ define @test_vloxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_ ; CHECK-LABEL: test_vloxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11793,8 +11793,8 @@ define @test_vloxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11834,8 +11834,8 @@ define @test_vloxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11847,8 +11847,8 @@ define @test_vloxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11861,8 +11861,8 @@ define @test_vloxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11874,8 +11874,8 @@ define @test_vloxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11888,8 +11888,8 @@ define @test_vloxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11901,8 +11901,8 @@ define @test_vloxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11915,8 +11915,8 @@ define @test_vloxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11928,8 +11928,8 @@ define @test_vloxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11942,8 +11942,8 @@ define @test_vloxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11955,8 +11955,8 @@ define @test_vloxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11969,8 +11969,8 @@ define @test_vloxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11982,8 +11982,8 @@ define @test_vloxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11996,8 +11996,8 @@ define @test_vloxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12009,8 +12009,8 @@ define @test_vloxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12023,8 +12023,8 @@ define @test_vloxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12036,8 +12036,8 @@ define @test_vloxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i64: ; CHECK: 
# %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12050,8 +12050,8 @@ define @test_vloxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12063,8 +12063,8 @@ define @test_vloxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12077,8 +12077,8 @@ define @test_vloxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12090,8 +12090,8 @@ define @test_vloxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12104,8 +12104,8 @@ define @test_vloxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_ ; CHECK-LABEL: test_vloxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12117,8 +12117,8 @@ define 
@test_vloxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12158,8 +12158,8 @@ define @test_vloxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12171,8 +12171,8 @@ define @test_vloxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12185,8 +12185,8 @@ define @test_vloxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12198,8 +12198,8 @@ define @test_vloxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12212,8 +12212,8 @@ define @test_vloxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12225,8 +12225,8 @@ define @test_vloxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12239,8 +12239,8 @@ define @test_vloxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12252,8 +12252,8 @@ define @test_vloxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vloxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12266,8 +12266,8 @@ define @test_vloxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12279,8 +12279,8 @@ define @test_vloxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12293,8 +12293,8 @@ define @test_vloxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12306,8 +12306,8 @@ define @test_vloxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12320,8 +12320,8 @@ define @test_vloxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12333,8 +12333,8 @@ define @test_vloxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12347,8 +12347,8 @@ define @test_vloxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12360,8 +12360,8 @@ define @test_vloxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vloxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12374,8 +12374,8 @@ define @test_vloxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; 
CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12387,8 +12387,8 @@ define @test_vloxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12401,8 +12401,8 @@ define @test_vloxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12414,8 +12414,8 @@ define @test_vloxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12428,8 +12428,8 @@ define @test_vloxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_ ; CHECK-LABEL: test_vloxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12441,8 +12441,8 @@ define @test_vloxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vloxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12482,8 +12482,8 @@ 
define @test_vloxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t ; CHECK-LABEL: test_vloxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12495,8 +12495,8 @@ define @test_vloxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12509,8 +12509,8 @@ define @test_vloxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t ; CHECK-LABEL: test_vloxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12522,8 +12522,8 @@ define @test_vloxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12536,8 +12536,8 @@ define @test_vloxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t ; CHECK-LABEL: test_vloxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12549,8 +12549,8 @@ define @test_vloxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12563,8 +12563,8 @@ define @test_vloxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t ; CHECK-LABEL: test_vloxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12576,8 +12576,8 @@ define @test_vloxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12590,8 +12590,8 @@ define @test_vloxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t ; CHECK-LABEL: test_vloxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12603,8 +12603,8 @@ define @test_vloxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12617,8 +12617,8 @@ define @test_vloxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t ; CHECK-LABEL: test_vloxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12630,8 +12630,8 @@ define @test_vloxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; 
CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12644,8 +12644,8 @@ define @test_vloxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t ; CHECK-LABEL: test_vloxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12657,8 +12657,8 @@ define @test_vloxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12914,8 +12914,8 @@ define @test_vloxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t ; CHECK-LABEL: test_vloxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12927,8 +12927,8 @@ define @test_vloxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12941,8 +12941,8 @@ define @test_vloxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t ; CHECK-LABEL: test_vloxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12954,8 +12954,8 @@ define @test_vloxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, 
e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12968,8 +12968,8 @@ define @test_vloxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t ; CHECK-LABEL: test_vloxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12981,8 +12981,8 @@ define @test_vloxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12995,8 +12995,8 @@ define @test_vloxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t ; CHECK-LABEL: test_vloxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13008,8 +13008,8 @@ define @test_vloxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13022,8 +13022,8 @@ define @test_vloxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t ; CHECK-LABEL: test_vloxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13035,8 +13035,8 @@ define @test_vloxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8 
; CHECK-LABEL: test_vloxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13049,8 +13049,8 @@ define @test_vloxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t ; CHECK-LABEL: test_vloxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13062,8 +13062,8 @@ define @test_vloxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13076,8 +13076,8 @@ define @test_vloxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t ; CHECK-LABEL: test_vloxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13089,8 +13089,8 @@ define @test_vloxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13184,8 +13184,8 @@ define @test_vloxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3 ; CHECK-LABEL: test_vloxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei32.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13197,8 +13197,8 @@ define @test_vloxseg3_mask_nxv4f32_triscv.vector.tuple_nxv1 ; CHECK-LABEL: test_vloxseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei32.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13238,8 +13238,8 @@ define @test_vloxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t ; CHECK-LABEL: test_vloxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13251,8 +13251,8 @@ define @test_vloxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13265,8 +13265,8 @@ define @test_vloxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t ; CHECK-LABEL: test_vloxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13278,8 +13278,8 @@ define @test_vloxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13292,8 +13292,8 @@ define @test_vloxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t ; CHECK-LABEL: test_vloxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13305,8 +13305,8 @@ define @test_vloxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13319,8 +13319,8 @@ define @test_vloxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t ; CHECK-LABEL: test_vloxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13332,8 +13332,8 @@ define @test_vloxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13346,8 +13346,8 @@ define @test_vloxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t ; CHECK-LABEL: test_vloxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13359,8 +13359,8 @@ define @test_vloxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13373,8 +13373,8 @@ define @test_vloxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t ; CHECK-LABEL: test_vloxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: 
# %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13386,8 +13386,8 @@ define @test_vloxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13400,8 +13400,8 @@ define @test_vloxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t ; CHECK-LABEL: test_vloxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13413,8 +13413,8 @@ define @test_vloxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13508,8 +13508,8 @@ define @test_vloxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4 ; CHECK-LABEL: test_vloxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei32.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13521,8 +13521,8 @@ define @test_vloxseg4_mask_nxv4f32_triscv.vector.tuple_nxv1 ; CHECK-LABEL: test_vloxseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei32.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13562,8 +13562,8 @@ define 
@test_vloxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t ; CHECK-LABEL: test_vloxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13575,8 +13575,8 @@ define @test_vloxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13589,8 +13589,8 @@ define @test_vloxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t ; CHECK-LABEL: test_vloxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13602,8 +13602,8 @@ define @test_vloxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13616,8 +13616,8 @@ define @test_vloxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t ; CHECK-LABEL: test_vloxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13629,8 +13629,8 @@ define @test_vloxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13643,8 +13643,8 @@ define @test_vloxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t ; CHECK-LABEL: test_vloxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13656,8 +13656,8 @@ define @test_vloxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13670,8 +13670,8 @@ define @test_vloxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t ; CHECK-LABEL: test_vloxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13683,8 +13683,8 @@ define @test_vloxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13697,8 +13697,8 @@ define @test_vloxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t ; CHECK-LABEL: test_vloxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13710,8 +13710,8 @@ define @test_vloxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13724,8 +13724,8 @@ define @test_vloxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t ; CHECK-LABEL: test_vloxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13737,8 +13737,8 @@ define @test_vloxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13751,8 +13751,8 @@ define @test_vloxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t ; CHECK-LABEL: test_vloxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13764,8 +13764,8 @@ define @test_vloxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13778,8 +13778,8 @@ define @test_vloxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t ; CHECK-LABEL: test_vloxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13791,8 +13791,8 @@ define @test_vloxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13805,8 +13805,8 @@ define @test_vloxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t ; CHECK-LABEL: test_vloxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13818,8 +13818,8 @@ define @test_vloxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13832,8 +13832,8 @@ define @test_vloxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t ; CHECK-LABEL: test_vloxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13845,8 +13845,8 @@ define @test_vloxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13859,8 +13859,8 @@ define @test_vloxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t ; CHECK-LABEL: test_vloxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13872,8 +13872,8 @@ define 
@test_vloxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13886,8 +13886,8 @@ define @test_vloxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t ; CHECK-LABEL: test_vloxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13899,8 +13899,8 @@ define @test_vloxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13913,8 +13913,8 @@ define @test_vloxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t ; CHECK-LABEL: test_vloxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13926,8 +13926,8 @@ define @test_vloxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13940,8 +13940,8 @@ define @test_vloxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t ; CHECK-LABEL: test_vloxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13953,8 +13953,8 @@ define @test_vloxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13967,8 +13967,8 @@ define @test_vloxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t ; CHECK-LABEL: test_vloxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13980,8 +13980,8 @@ define @test_vloxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13994,8 +13994,8 @@ define @test_vloxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t ; CHECK-LABEL: test_vloxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14007,8 +14007,8 @@ define @test_vloxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14021,8 +14021,8 @@ define @test_vloxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t ; CHECK-LABEL: test_vloxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14034,8 +14034,8 @@ define @test_vloxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14048,8 +14048,8 @@ define @test_vloxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t ; CHECK-LABEL: test_vloxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14061,8 +14061,8 @@ define @test_vloxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14075,8 +14075,8 @@ define @test_vloxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t ; CHECK-LABEL: test_vloxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14088,8 +14088,8 @@ define @test_vloxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14102,8 +14102,8 @@ define @test_vloxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t ; CHECK-LABEL: test_vloxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: 
# %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14115,8 +14115,8 @@ define @test_vloxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14129,8 +14129,8 @@ define @test_vloxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t ; CHECK-LABEL: test_vloxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14142,8 +14142,8 @@ define @test_vloxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14156,8 +14156,8 @@ define @test_vloxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t ; CHECK-LABEL: test_vloxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14169,8 +14169,8 @@ define @test_vloxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14183,8 +14183,8 @@ define 
@test_vloxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t ; CHECK-LABEL: test_vloxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14196,8 +14196,8 @@ define @test_vloxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14210,8 +14210,8 @@ define @test_vloxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t ; CHECK-LABEL: test_vloxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14223,8 +14223,8 @@ define @test_vloxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14237,8 +14237,8 @@ define @test_vloxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t ; CHECK-LABEL: test_vloxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14250,8 +14250,8 @@ define @test_vloxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14264,8 +14264,8 @@ define @test_vloxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t ; CHECK-LABEL: test_vloxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14277,8 +14277,8 @@ define @test_vloxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14291,8 +14291,8 @@ define @test_vloxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t ; CHECK-LABEL: test_vloxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14304,8 +14304,8 @@ define @test_vloxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vloxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14318,8 +14318,8 @@ define @test_vloxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t ; CHECK-LABEL: test_vloxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14331,8 +14331,8 @@ define @test_vloxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14345,8 +14345,8 @@ define @test_vloxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t ; CHECK-LABEL: test_vloxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14358,8 +14358,8 @@ define @test_vloxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14372,8 +14372,8 @@ define @test_vloxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t ; CHECK-LABEL: test_vloxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14385,8 +14385,8 @@ define @test_vloxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14399,8 +14399,8 @@ define @test_vloxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t ; CHECK-LABEL: test_vloxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14412,8 +14412,8 @@ define @test_vloxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vloxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14426,8 +14426,8 @@ define @test_vloxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vloxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14439,8 +14439,8 @@ define @test_vloxseg2_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14453,8 +14453,8 @@ define @test_vloxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vloxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14466,8 +14466,8 @@ define @test_vloxseg2_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14480,8 +14480,8 @@ define @test_vloxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vloxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14493,8 +14493,8 @@ define 
@test_vloxseg2_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14507,8 +14507,8 @@ define @test_vloxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vloxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14520,8 +14520,8 @@ define @test_vloxseg2_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14750,8 +14750,8 @@ define @test_vloxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vloxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14763,8 +14763,8 @@ define @test_vloxseg3_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14777,8 +14777,8 @@ define @test_vloxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vloxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14790,8 +14790,8 @@ define @test_vloxseg3_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14804,8 +14804,8 @@ define @test_vloxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vloxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14817,8 +14817,8 @@ define @test_vloxseg3_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14831,8 +14831,8 @@ define @test_vloxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vloxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14844,8 +14844,8 @@ define @test_vloxseg3_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14939,8 +14939,8 @@ define @test_vloxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_ ; CHECK-LABEL: test_vloxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: 
vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei64.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14952,8 +14952,8 @@ define @test_vloxseg3_mask_nxv2f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei64.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14966,8 +14966,8 @@ define @test_vloxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vloxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14979,8 +14979,8 @@ define @test_vloxseg4_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14993,8 +14993,8 @@ define @test_vloxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vloxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15006,8 +15006,8 @@ define @test_vloxseg4_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15020,8 +15020,8 @@ define @test_vloxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vloxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15033,8 +15033,8 @@ define @test_vloxseg4_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15047,8 +15047,8 @@ define @test_vloxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vloxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15060,8 +15060,8 @@ define @test_vloxseg4_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15155,8 +15155,8 @@ define @test_vloxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_ ; CHECK-LABEL: test_vloxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei64.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15168,8 +15168,8 @@ define @test_vloxseg4_mask_nxv2f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei64.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15182,8 +15182,8 @@ define 
@test_vloxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vloxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15195,8 +15195,8 @@ define @test_vloxseg5_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15209,8 +15209,8 @@ define @test_vloxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vloxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15222,8 +15222,8 @@ define @test_vloxseg5_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15236,8 +15236,8 @@ define @test_vloxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vloxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15249,8 +15249,8 @@ define @test_vloxseg5_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15263,8 +15263,8 @@ define @test_vloxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vloxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15276,8 +15276,8 @@ define @test_vloxseg5_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15290,8 +15290,8 @@ define @test_vloxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vloxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15303,8 +15303,8 @@ define @test_vloxseg6_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15317,8 +15317,8 @@ define @test_vloxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vloxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15330,8 +15330,8 @@ define @test_vloxseg6_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 
+; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15344,8 +15344,8 @@ define @test_vloxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vloxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15357,8 +15357,8 @@ define @test_vloxseg6_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15371,8 +15371,8 @@ define @test_vloxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vloxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15384,8 +15384,8 @@ define @test_vloxseg6_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15398,8 +15398,8 @@ define @test_vloxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vloxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15411,8 +15411,8 @@ define @test_vloxseg7_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, 
e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15425,8 +15425,8 @@ define @test_vloxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vloxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15438,8 +15438,8 @@ define @test_vloxseg7_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15452,8 +15452,8 @@ define @test_vloxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vloxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15465,8 +15465,8 @@ define @test_vloxseg7_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15479,8 +15479,8 @@ define @test_vloxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vloxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15492,8 +15492,8 @@ define @test_vloxseg7_mask_nxv1f64_triscv.vector.tuple_nxv ; 
CHECK-LABEL: test_vloxseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15506,8 +15506,8 @@ define @test_vloxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vloxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15519,8 +15519,8 @@ define @test_vloxseg8_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15533,8 +15533,8 @@ define @test_vloxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vloxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15546,8 +15546,8 @@ define @test_vloxseg8_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15560,8 +15560,8 @@ define @test_vloxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vloxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) 
poison, ptr %base, %index, i64 %vl, i64 6) @@ -15573,8 +15573,8 @@ define @test_vloxseg8_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15587,8 +15587,8 @@ define @test_vloxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vloxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15600,8 +15600,8 @@ define @test_vloxseg8_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vloxseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15614,8 +15614,8 @@ define @test_vloxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15627,8 +15627,8 @@ define @test_vloxseg2_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15641,8 +15641,8 @@ define @test_vloxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15654,8 +15654,8 @@ define @test_vloxseg2_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15668,8 +15668,8 @@ define @test_vloxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15681,8 +15681,8 @@ define @test_vloxseg2_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15695,8 +15695,8 @@ define @test_vloxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15708,8 +15708,8 @@ define @test_vloxseg2_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15722,8 +15722,8 @@ define @test_vloxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; 
CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15735,8 +15735,8 @@ define @test_vloxseg2_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15749,8 +15749,8 @@ define @test_vloxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15762,8 +15762,8 @@ define @test_vloxseg2_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15776,8 +15776,8 @@ define @test_vloxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15789,8 +15789,8 @@ define @test_vloxseg2_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15830,8 +15830,8 @@ define @test_vloxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: 
test_vloxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15843,8 +15843,8 @@ define @test_vloxseg2_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15857,8 +15857,8 @@ define @test_vloxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15870,8 +15870,8 @@ define @test_vloxseg2_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vloxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16127,8 +16127,8 @@ define @test_vloxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16140,8 +16140,8 @@ define @test_vloxseg3_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 
%vl, i64 1, i64 4) @@ -16154,8 +16154,8 @@ define @test_vloxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16167,8 +16167,8 @@ define @test_vloxseg3_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16181,8 +16181,8 @@ define @test_vloxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16194,8 +16194,8 @@ define @test_vloxseg3_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16208,8 +16208,8 @@ define @test_vloxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16221,8 +16221,8 @@ define @test_vloxseg3_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 
3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16235,8 +16235,8 @@ define @test_vloxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16248,8 +16248,8 @@ define @test_vloxseg3_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16262,8 +16262,8 @@ define @test_vloxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16275,8 +16275,8 @@ define @test_vloxseg3_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16289,8 +16289,8 @@ define @test_vloxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16302,8 +16302,8 @@ define @test_vloxseg3_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16343,8 +16343,8 @@ define @test_vloxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16356,8 +16356,8 @@ define @test_vloxseg3_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16370,8 +16370,8 @@ define @test_vloxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16383,8 +16383,8 @@ define @test_vloxseg3_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vloxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16478,8 +16478,8 @@ define @test_vloxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8 ; CHECK-LABEL: test_vloxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei16.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16491,8 +16491,8 @@ define @test_vloxseg3_mask_nxv8bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg3ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg3ei16.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16559,8 +16559,8 @@ define @test_vloxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16572,8 +16572,8 @@ define @test_vloxseg4_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16586,8 +16586,8 @@ define @test_vloxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16599,8 +16599,8 @@ define @test_vloxseg4_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16613,8 +16613,8 @@ define @test_vloxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16626,8 +16626,8 @@ define 
@test_vloxseg4_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16640,8 +16640,8 @@ define @test_vloxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16653,8 +16653,8 @@ define @test_vloxseg4_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16667,8 +16667,8 @@ define @test_vloxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16680,8 +16680,8 @@ define @test_vloxseg4_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16694,8 +16694,8 @@ define @test_vloxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16707,8 +16707,8 @@ define @test_vloxseg4_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16721,8 +16721,8 @@ define @test_vloxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16734,8 +16734,8 @@ define @test_vloxseg4_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16775,8 +16775,8 @@ define @test_vloxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16788,8 +16788,8 @@ define @test_vloxseg4_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16802,8 +16802,8 @@ define @test_vloxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16815,8 +16815,8 @@ define @test_vloxseg4_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vloxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16910,8 +16910,8 @@ define @test_vloxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8 ; CHECK-LABEL: test_vloxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei16.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16923,8 +16923,8 @@ define @test_vloxseg4_mask_nxv8bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vloxseg4ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg4ei16.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16991,8 +16991,8 @@ define @test_vloxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17004,8 +17004,8 @@ define @test_vloxseg5_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17018,8 +17018,8 @@ define @test_vloxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17031,8 +17031,8 @@ define @test_vloxseg5_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17045,8 +17045,8 @@ define @test_vloxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17058,8 +17058,8 @@ define @test_vloxseg5_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17072,8 +17072,8 @@ define @test_vloxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17085,8 +17085,8 @@ define @test_vloxseg5_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17099,8 +17099,8 @@ define 
@test_vloxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17112,8 +17112,8 @@ define @test_vloxseg5_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17126,8 +17126,8 @@ define @test_vloxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17139,8 +17139,8 @@ define @test_vloxseg5_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17153,8 +17153,8 @@ define @test_vloxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17166,8 +17166,8 @@ define @test_vloxseg5_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17180,8 +17180,8 @@ define @test_vloxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17193,8 +17193,8 @@ define @test_vloxseg5_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg5ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17207,8 +17207,8 @@ define @test_vloxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17220,8 +17220,8 @@ define @test_vloxseg5_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17234,8 +17234,8 @@ define @test_vloxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17247,8 +17247,8 @@ define @test_vloxseg5_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v12, v8 +; CHECK-NEXT: vloxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17261,8 +17261,8 @@ define @test_vloxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17274,8 +17274,8 @@ define @test_vloxseg5_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vloxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17315,8 +17315,8 @@ define @test_vloxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17328,8 +17328,8 @@ define @test_vloxseg6_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17342,8 +17342,8 @@ define @test_vloxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17355,8 +17355,8 @@ define @test_vloxseg6_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17369,8 +17369,8 @@ define @test_vloxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17382,8 +17382,8 @@ define @test_vloxseg6_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17396,8 +17396,8 @@ define @test_vloxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17409,8 +17409,8 @@ define @test_vloxseg6_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17423,8 +17423,8 @@ define @test_vloxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17436,8 +17436,8 @@ define 
@test_vloxseg6_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17450,8 +17450,8 @@ define @test_vloxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17463,8 +17463,8 @@ define @test_vloxseg6_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17477,8 +17477,8 @@ define @test_vloxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17490,8 +17490,8 @@ define @test_vloxseg6_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17504,8 +17504,8 @@ define @test_vloxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17517,8 +17517,8 @@ define @test_vloxseg6_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg6ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17531,8 +17531,8 @@ define @test_vloxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17544,8 +17544,8 @@ define @test_vloxseg6_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17558,8 +17558,8 @@ define @test_vloxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17571,8 +17571,8 @@ define @test_vloxseg6_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vloxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17585,8 +17585,8 @@ define @test_vloxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: 
vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17598,8 +17598,8 @@ define @test_vloxseg6_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17639,8 +17639,8 @@ define @test_vloxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17652,8 +17652,8 @@ define @test_vloxseg7_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17666,8 +17666,8 @@ define @test_vloxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17679,8 +17679,8 @@ define @test_vloxseg7_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17693,8 +17693,8 @@ define @test_vloxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17706,8 +17706,8 @@ define @test_vloxseg7_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17720,8 +17720,8 @@ define @test_vloxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17733,8 +17733,8 @@ define @test_vloxseg7_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17747,8 +17747,8 @@ define @test_vloxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17760,8 +17760,8 @@ define @test_vloxseg7_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17774,8 +17774,8 @@ define 
@test_vloxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17787,8 +17787,8 @@ define @test_vloxseg7_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17801,8 +17801,8 @@ define @test_vloxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17814,8 +17814,8 @@ define @test_vloxseg7_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17828,8 +17828,8 @@ define @test_vloxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17841,8 +17841,8 @@ define @test_vloxseg7_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg7ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17855,8 +17855,8 @@ define @test_vloxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17868,8 +17868,8 @@ define @test_vloxseg7_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17882,8 +17882,8 @@ define @test_vloxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17895,8 +17895,8 @@ define @test_vloxseg7_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vloxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17909,8 +17909,8 @@ define @test_vloxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17922,8 +17922,8 @@ define @test_vloxseg7_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v 
v14, v8 +; CHECK-NEXT: vloxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17963,8 +17963,8 @@ define @test_vloxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17976,8 +17976,8 @@ define @test_vloxseg8_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17990,8 +17990,8 @@ define @test_vloxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18003,8 +18003,8 @@ define @test_vloxseg8_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18017,8 +18017,8 @@ define @test_vloxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18030,8 +18030,8 @@ define @test_vloxseg8_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18044,8 +18044,8 @@ define @test_vloxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vloxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18057,8 +18057,8 @@ define @test_vloxseg8_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18071,8 +18071,8 @@ define @test_vloxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18084,8 +18084,8 @@ define @test_vloxseg8_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18098,8 +18098,8 @@ define @test_vloxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18111,8 +18111,8 @@ define 
@test_vloxseg8_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18125,8 +18125,8 @@ define @test_vloxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18138,8 +18138,8 @@ define @test_vloxseg8_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18152,8 +18152,8 @@ define @test_vloxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vloxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18165,8 +18165,8 @@ define @test_vloxseg8_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vloxseg8ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei64.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18179,8 +18179,8 @@ define @test_vloxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18192,8 +18192,8 @@ define @test_vloxseg8_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18206,8 +18206,8 @@ define @test_vloxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18219,8 +18219,8 @@ define @test_vloxseg8_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vloxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18233,8 +18233,8 @@ define @test_vloxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vloxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18246,8 +18246,8 @@ define @test_vloxseg8_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vloxseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vloxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vloxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) diff --git a/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv32.ll b/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv32.ll index 782b9d50dcebd..0c9aa28d3b137 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv32.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv32.ll @@ -9,8 +9,8 @@ define 
@test_vluxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -22,8 +22,8 @@ define @test_vluxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -38,8 +38,8 @@ define @test_vluxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -51,8 +51,8 @@ define @test_vluxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -67,8 +67,8 @@ define @test_vluxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -80,8 +80,8 @@ define @test_vluxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -96,8 +96,8 @@ define @test_vluxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -109,8 +109,8 @@ define @test_vluxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -125,8 +125,8 @@ define @test_vluxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -138,8 +138,8 @@ define @test_vluxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -154,8 +154,8 @@ define @test_vluxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -167,8 +167,8 @@ define @test_vluxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: 
vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -183,8 +183,8 @@ define @test_vluxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -196,8 +196,8 @@ define @test_vluxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -212,8 +212,8 @@ define @test_vluxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -225,8 +225,8 @@ define @test_vluxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -270,8 +270,8 @@ define @test_vluxseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv8i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 3) @@ -283,8 +283,8 @@ define @test_vluxseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: 
vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -502,8 +502,8 @@ define @test_vluxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv1i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -515,8 +515,8 @@ define @test_vluxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -531,8 +531,8 @@ define @test_vluxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv1i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -544,8 +544,8 @@ define @test_vluxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -560,8 +560,8 @@ define @test_vluxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv1i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -573,8 +573,8 @@ define @test_vluxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3 ; CHECK-LABEL: 
test_vluxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -589,8 +589,8 @@ define @test_vluxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -602,8 +602,8 @@ define @test_vluxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -618,8 +618,8 @@ define @test_vluxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -631,8 +631,8 @@ define @test_vluxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -647,8 +647,8 @@ define @test_vluxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, 
%index, i32 %vl, i32 3) @@ -660,8 +660,8 @@ define @test_vluxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -676,8 +676,8 @@ define @test_vluxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -689,8 +689,8 @@ define @test_vluxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -705,8 +705,8 @@ define @test_vluxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -718,8 +718,8 @@ define @test_vluxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -763,8 +763,8 @@ define @test_vluxseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 
3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -776,8 +776,8 @@ define @test_vluxseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -850,8 +850,8 @@ define @test_vluxseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei8.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv16i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 3) @@ -863,8 +863,8 @@ define @test_vluxseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vluxseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei8.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv16i1.nxv16i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -937,8 +937,8 @@ define @test_vluxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv1i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -950,8 +950,8 @@ define @test_vluxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -966,8 +966,8 @@ define @test_vluxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 
+; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv1i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -979,8 +979,8 @@ define @test_vluxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -995,8 +995,8 @@ define @test_vluxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv1i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1008,8 +1008,8 @@ define @test_vluxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1024,8 +1024,8 @@ define @test_vluxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1037,8 +1037,8 @@ define @test_vluxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1053,8 +1053,8 @@ define @test_vluxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli 
zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1066,8 +1066,8 @@ define @test_vluxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1082,8 +1082,8 @@ define @test_vluxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1095,8 +1095,8 @@ define @test_vluxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1111,8 +1111,8 @@ define @test_vluxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1124,8 +1124,8 @@ define @test_vluxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1140,8 +1140,8 @@ define 
@test_vluxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1153,8 +1153,8 @@ define @test_vluxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1198,8 +1198,8 @@ define @test_vluxseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1211,8 +1211,8 @@ define @test_vluxseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1285,8 +1285,8 @@ define @test_vluxseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei8.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv16i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1298,8 +1298,8 @@ define @test_vluxseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vluxseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei8.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv16i1.nxv16i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1372,8 +1372,8 @@ define @test_vluxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv1i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1385,8 +1385,8 @@ define @test_vluxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1401,8 +1401,8 @@ define @test_vluxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv1i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1414,8 +1414,8 @@ define @test_vluxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1430,8 +1430,8 @@ define @test_vluxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv1i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1443,8 +1443,8 @@ define @test_vluxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1459,8 +1459,8 @@ define @test_vluxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1472,8 +1472,8 @@ define @test_vluxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1488,8 +1488,8 @@ define @test_vluxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1501,8 +1501,8 @@ define @test_vluxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1517,8 +1517,8 @@ define @test_vluxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1530,8 +1530,8 @@ define @test_vluxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1546,8 +1546,8 @@ define @test_vluxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1559,8 +1559,8 @@ define @test_vluxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1575,8 +1575,8 @@ define @test_vluxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1588,8 +1588,8 @@ define @test_vluxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1604,8 +1604,8 @@ define @test_vluxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1617,8 +1617,8 @@ define 
@test_vluxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1633,8 +1633,8 @@ define @test_vluxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1646,8 +1646,8 @@ define @test_vluxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1662,8 +1662,8 @@ define @test_vluxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv8i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1675,8 +1675,8 @@ define @test_vluxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1720,8 +1720,8 @@ define @test_vluxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.triscv.vector.tuple_nxv1i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1733,8 +1733,8 @@ define @test_vluxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1749,8 +1749,8 @@ define @test_vluxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv1i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1762,8 +1762,8 @@ define @test_vluxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1778,8 +1778,8 @@ define @test_vluxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv1i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1791,8 +1791,8 @@ define @test_vluxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1807,8 +1807,8 @@ define @test_vluxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1820,8 +1820,8 @@ define @test_vluxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1836,8 +1836,8 @@ define @test_vluxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1849,8 +1849,8 @@ define @test_vluxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1865,8 +1865,8 @@ define @test_vluxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1878,8 +1878,8 @@ define @test_vluxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1894,8 +1894,8 @@ define @test_vluxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1907,8 +1907,8 @@ define @test_vluxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1923,8 +1923,8 @@ define @test_vluxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1936,8 +1936,8 @@ define @test_vluxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1952,8 +1952,8 @@ define @test_vluxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1965,8 +1965,8 @@ define @test_vluxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -1981,8 +1981,8 @@ define 
@test_vluxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -1994,8 +1994,8 @@ define @test_vluxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2010,8 +2010,8 @@ define @test_vluxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv8i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2023,8 +2023,8 @@ define @test_vluxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2068,8 +2068,8 @@ define @test_vluxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv1i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2081,8 +2081,8 @@ define @test_vluxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2097,8 +2097,8 @@ define @test_vluxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv1i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2110,8 +2110,8 @@ define @test_vluxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2126,8 +2126,8 @@ define @test_vluxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv1i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2139,8 +2139,8 @@ define @test_vluxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2155,8 +2155,8 @@ define @test_vluxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2168,8 +2168,8 @@ define @test_vluxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2184,8 +2184,8 @@ define @test_vluxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2197,8 +2197,8 @@ define @test_vluxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2213,8 +2213,8 @@ define @test_vluxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2226,8 +2226,8 @@ define @test_vluxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2242,8 +2242,8 @@ define @test_vluxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2255,8 +2255,8 @@ define @test_vluxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2271,8 +2271,8 @@ define @test_vluxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2284,8 +2284,8 @@ define @test_vluxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2300,8 +2300,8 @@ define @test_vluxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2313,8 +2313,8 @@ define @test_vluxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2329,8 +2329,8 @@ define @test_vluxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2342,8 +2342,8 @@ define 
@test_vluxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2358,8 +2358,8 @@ define @test_vluxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv8i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2371,8 +2371,8 @@ define @test_vluxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2416,8 +2416,8 @@ define @test_vluxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv1i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2429,8 +2429,8 @@ define @test_vluxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2445,8 +2445,8 @@ define @test_vluxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.triscv.vector.tuple_nxv1i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2458,8 +2458,8 @@ define @test_vluxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2474,8 +2474,8 @@ define @test_vluxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv1i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2487,8 +2487,8 @@ define @test_vluxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2503,8 +2503,8 @@ define @test_vluxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2516,8 +2516,8 @@ define @test_vluxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2532,8 +2532,8 @@ define @test_vluxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2545,8 +2545,8 @@ define @test_vluxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2561,8 +2561,8 @@ define @test_vluxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2574,8 +2574,8 @@ define @test_vluxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2590,8 +2590,8 @@ define @test_vluxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2603,8 +2603,8 @@ define @test_vluxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2619,8 +2619,8 @@ define @test_vluxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2632,8 +2632,8 @@ define @test_vluxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2648,8 +2648,8 @@ define @test_vluxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2661,8 +2661,8 @@ define @test_vluxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2677,8 +2677,8 @@ define @test_vluxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2690,8 +2690,8 @@ define @test_vluxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2706,8 +2706,8 @@ define 
@test_vluxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv8i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 3) @@ -2719,8 +2719,8 @@ define @test_vluxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 3) @@ -2764,8 +2764,8 @@ define @test_vluxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2777,8 +2777,8 @@ define @test_vluxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -2793,8 +2793,8 @@ define @test_vluxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2806,8 +2806,8 @@ define @test_vluxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -2822,8 +2822,8 @@ define @test_vluxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2835,8 +2835,8 @@ define @test_vluxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -2851,8 +2851,8 @@ define @test_vluxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2864,8 +2864,8 @@ define @test_vluxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -2880,8 +2880,8 @@ define @test_vluxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2893,8 +2893,8 @@ define @test_vluxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, 
v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -2909,8 +2909,8 @@ define @test_vluxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2922,8 +2922,8 @@ define @test_vluxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -2938,8 +2938,8 @@ define @test_vluxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2951,8 +2951,8 @@ define @test_vluxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -2967,8 +2967,8 @@ define @test_vluxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -2980,8 +2980,8 @@ define @test_vluxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, 
e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3199,8 +3199,8 @@ define @test_vluxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3212,8 +3212,8 @@ define @test_vluxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3228,8 +3228,8 @@ define @test_vluxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3241,8 +3241,8 @@ define @test_vluxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3257,8 +3257,8 @@ define @test_vluxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3270,8 +3270,8 @@ define @test_vluxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; 
CHECK-LABEL: test_vluxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3286,8 +3286,8 @@ define @test_vluxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3299,8 +3299,8 @@ define @test_vluxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3315,8 +3315,8 @@ define @test_vluxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3328,8 +3328,8 @@ define @test_vluxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3344,8 +3344,8 @@ define @test_vluxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3357,8 +3357,8 @@ define @test_vluxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3373,8 +3373,8 @@ define @test_vluxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3386,8 +3386,8 @@ define @test_vluxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3402,8 +3402,8 @@ define @test_vluxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3415,8 +3415,8 @@ define @test_vluxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3489,8 +3489,8 @@ define @test_vluxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: 
vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei16.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3502,8 +3502,8 @@ define @test_vluxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vluxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei16.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3547,8 +3547,8 @@ define @test_vluxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3560,8 +3560,8 @@ define @test_vluxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3576,8 +3576,8 @@ define @test_vluxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3589,8 +3589,8 @@ define @test_vluxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3605,8 +3605,8 @@ define @test_vluxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3618,8 +3618,8 @@ define @test_vluxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3634,8 +3634,8 @@ define @test_vluxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3647,8 +3647,8 @@ define @test_vluxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3663,8 +3663,8 @@ define @test_vluxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3676,8 +3676,8 @@ define @test_vluxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3692,8 +3692,8 @@ define 
@test_vluxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3705,8 +3705,8 @@ define @test_vluxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3721,8 +3721,8 @@ define @test_vluxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3734,8 +3734,8 @@ define @test_vluxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3750,8 +3750,8 @@ define @test_vluxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3763,8 +3763,8 @@ define @test_vluxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3837,8 +3837,8 @@ define @test_vluxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei16.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3850,8 +3850,8 @@ define @test_vluxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vluxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei16.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3895,8 +3895,8 @@ define @test_vluxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3908,8 +3908,8 @@ define @test_vluxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3924,8 +3924,8 @@ define @test_vluxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3937,8 +3937,8 @@ define @test_vluxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3953,8 +3953,8 @@ define @test_vluxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3966,8 +3966,8 @@ define @test_vluxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -3982,8 +3982,8 @@ define @test_vluxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -3995,8 +3995,8 @@ define @test_vluxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4011,8 +4011,8 @@ define @test_vluxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4024,8 +4024,8 @@ define @test_vluxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4040,8 +4040,8 @@ define @test_vluxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4053,8 +4053,8 @@ define @test_vluxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4069,8 +4069,8 @@ define @test_vluxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4082,8 +4082,8 @@ define @test_vluxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4098,8 +4098,8 @@ define @test_vluxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4111,8 +4111,8 @@ define 
@test_vluxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4127,8 +4127,8 @@ define @test_vluxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4140,8 +4140,8 @@ define @test_vluxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4156,8 +4156,8 @@ define @test_vluxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4169,8 +4169,8 @@ define @test_vluxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4185,8 +4185,8 @@ define @test_vluxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4198,8 +4198,8 @@ define @test_vluxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4214,8 +4214,8 @@ define @test_vluxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4227,8 +4227,8 @@ define @test_vluxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4243,8 +4243,8 @@ define @test_vluxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4256,8 +4256,8 @@ define @test_vluxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4272,8 +4272,8 @@ define @test_vluxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4285,8 +4285,8 @@ define @test_vluxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4301,8 +4301,8 @@ define @test_vluxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4314,8 +4314,8 @@ define @test_vluxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4330,8 +4330,8 @@ define @test_vluxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4343,8 +4343,8 @@ define @test_vluxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4359,8 +4359,8 @@ define @test_vluxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: 
# %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4372,8 +4372,8 @@ define @test_vluxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4388,8 +4388,8 @@ define @test_vluxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4401,8 +4401,8 @@ define @test_vluxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4417,8 +4417,8 @@ define @test_vluxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4430,8 +4430,8 @@ define @test_vluxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4446,8 +4446,8 @@ define 
@test_vluxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4459,8 +4459,8 @@ define @test_vluxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4475,8 +4475,8 @@ define @test_vluxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4488,8 +4488,8 @@ define @test_vluxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4504,8 +4504,8 @@ define @test_vluxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4517,8 +4517,8 @@ define @test_vluxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4533,8 +4533,8 @@ define @test_vluxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4546,8 +4546,8 @@ define @test_vluxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4562,8 +4562,8 @@ define @test_vluxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4575,8 +4575,8 @@ define @test_vluxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4591,8 +4591,8 @@ define @test_vluxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4604,8 +4604,8 @@ define @test_vluxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4620,8 +4620,8 @@ define @test_vluxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4633,8 +4633,8 @@ define @test_vluxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4649,8 +4649,8 @@ define @test_vluxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4662,8 +4662,8 @@ define @test_vluxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4678,8 +4678,8 @@ define @test_vluxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4691,8 +4691,8 @@ define @test_vluxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4707,8 +4707,8 @@ define @test_vluxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4720,8 +4720,8 @@ define @test_vluxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4736,8 +4736,8 @@ define @test_vluxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4749,8 +4749,8 @@ define @test_vluxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4765,8 +4765,8 @@ define @test_vluxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4778,8 +4778,8 @@ define 
@test_vluxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4794,8 +4794,8 @@ define @test_vluxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4807,8 +4807,8 @@ define @test_vluxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4823,8 +4823,8 @@ define @test_vluxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4836,8 +4836,8 @@ define @test_vluxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4852,8 +4852,8 @@ define @test_vluxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4865,8 +4865,8 @@ define @test_vluxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4881,8 +4881,8 @@ define @test_vluxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4894,8 +4894,8 @@ define @test_vluxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4910,8 +4910,8 @@ define @test_vluxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -4923,8 +4923,8 @@ define @test_vluxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -4939,8 +4939,8 @@ define @test_vluxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -4952,8 +4952,8 @@ define @test_vluxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -4968,8 +4968,8 @@ define @test_vluxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -4981,8 +4981,8 @@ define @test_vluxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -4997,8 +4997,8 @@ define @test_vluxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5010,8 +5010,8 @@ define @test_vluxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5026,8 +5026,8 @@ define @test_vluxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5039,8 +5039,8 @@ define @test_vluxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5055,8 +5055,8 @@ define @test_vluxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5068,8 +5068,8 @@ define @test_vluxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5084,8 +5084,8 @@ define @test_vluxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5097,8 +5097,8 @@ define @test_vluxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5287,8 +5287,8 @@ define 
@test_vluxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5300,8 +5300,8 @@ define @test_vluxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5316,8 +5316,8 @@ define @test_vluxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5329,8 +5329,8 @@ define @test_vluxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5345,8 +5345,8 @@ define @test_vluxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5358,8 +5358,8 @@ define @test_vluxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5374,8 +5374,8 @@ define @test_vluxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5387,8 +5387,8 @@ define @test_vluxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5403,8 +5403,8 @@ define @test_vluxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5416,8 +5416,8 @@ define @test_vluxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5432,8 +5432,8 @@ define @test_vluxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5445,8 +5445,8 @@ define @test_vluxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5519,8 +5519,8 @@ define @test_vluxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei32.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5532,8 +5532,8 @@ define @test_vluxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vluxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei32.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5548,8 +5548,8 @@ define @test_vluxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5561,8 +5561,8 @@ define @test_vluxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5577,8 +5577,8 @@ define @test_vluxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5590,8 +5590,8 @@ define @test_vluxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5606,8 +5606,8 @@ define @test_vluxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5619,8 +5619,8 @@ define @test_vluxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5635,8 +5635,8 @@ define @test_vluxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5648,8 +5648,8 @@ define @test_vluxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5664,8 +5664,8 @@ define @test_vluxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5677,8 +5677,8 @@ define 
@test_vluxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5693,8 +5693,8 @@ define @test_vluxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5706,8 +5706,8 @@ define @test_vluxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5780,8 +5780,8 @@ define @test_vluxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei32.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5793,8 +5793,8 @@ define @test_vluxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vluxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei32.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5809,8 +5809,8 @@ define @test_vluxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5822,8 +5822,8 @@ define @test_vluxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5838,8 +5838,8 @@ define @test_vluxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5851,8 +5851,8 @@ define @test_vluxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5867,8 +5867,8 @@ define @test_vluxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5880,8 +5880,8 @@ define @test_vluxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5896,8 +5896,8 @@ define @test_vluxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5909,8 +5909,8 @@ define @test_vluxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5925,8 +5925,8 @@ define @test_vluxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5938,8 +5938,8 @@ define @test_vluxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5954,8 +5954,8 @@ define @test_vluxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5967,8 +5967,8 @@ define @test_vluxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -5983,8 +5983,8 @@ define @test_vluxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -5996,8 +5996,8 @@ define @test_vluxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6012,8 +6012,8 @@ define @test_vluxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6025,8 +6025,8 @@ define @test_vluxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6041,8 +6041,8 @@ define @test_vluxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6054,8 +6054,8 @@ define @test_vluxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6070,8 +6070,8 @@ define 
@test_vluxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6083,8 +6083,8 @@ define @test_vluxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6099,8 +6099,8 @@ define @test_vluxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6112,8 +6112,8 @@ define @test_vluxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6128,8 +6128,8 @@ define @test_vluxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6141,8 +6141,8 @@ define @test_vluxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6157,8 +6157,8 @@ define @test_vluxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6170,8 +6170,8 @@ define @test_vluxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6186,8 +6186,8 @@ define @test_vluxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6199,8 +6199,8 @@ define @test_vluxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6215,8 +6215,8 @@ define @test_vluxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6228,8 +6228,8 @@ define @test_vluxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6244,8 +6244,8 @@ define @test_vluxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6257,8 +6257,8 @@ define @test_vluxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6273,8 +6273,8 @@ define @test_vluxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6286,8 +6286,8 @@ define @test_vluxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6302,8 +6302,8 @@ define @test_vluxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6315,8 +6315,8 @@ define @test_vluxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6331,8 +6331,8 @@ define @test_vluxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6344,8 +6344,8 @@ define @test_vluxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6360,8 +6360,8 @@ define @test_vluxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6373,8 +6373,8 @@ define @test_vluxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6389,8 +6389,8 @@ define @test_vluxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6402,8 +6402,8 @@ define 
@test_vluxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6418,8 +6418,8 @@ define @test_vluxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6431,8 +6431,8 @@ define @test_vluxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6447,8 +6447,8 @@ define @test_vluxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6460,8 +6460,8 @@ define @test_vluxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6476,8 +6476,8 @@ define @test_vluxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -6489,8 +6489,8 @@ define @test_vluxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -6505,8 +6505,8 @@ define @test_vluxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6518,8 +6518,8 @@ define @test_vluxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6534,8 +6534,8 @@ define @test_vluxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6547,8 +6547,8 @@ define @test_vluxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6563,8 +6563,8 @@ define @test_vluxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, 
v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6576,8 +6576,8 @@ define @test_vluxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6766,8 +6766,8 @@ define @test_vluxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6779,8 +6779,8 @@ define @test_vluxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6795,8 +6795,8 @@ define @test_vluxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6808,8 +6808,8 @@ define @test_vluxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6824,8 +6824,8 @@ define @test_vluxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli 
zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6837,8 +6837,8 @@ define @test_vluxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6940,8 +6940,8 @@ define @test_vluxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6953,8 +6953,8 @@ define @test_vluxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6969,8 +6969,8 @@ define @test_vluxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 6) @@ -6982,8 +6982,8 @@ define @test_vluxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -6998,8 +6998,8 @@ define 
@test_vluxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7011,8 +7011,8 @@ define @test_vluxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7114,8 +7114,8 @@ define @test_vluxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7127,8 +7127,8 @@ define @test_vluxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7143,8 +7143,8 @@ define @test_vluxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7156,8 +7156,8 @@ define @test_vluxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7172,8 +7172,8 @@ define @test_vluxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7185,8 +7185,8 @@ define @test_vluxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7201,8 +7201,8 @@ define @test_vluxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7214,8 +7214,8 @@ define @test_vluxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7230,8 +7230,8 @@ define @test_vluxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7243,8 +7243,8 @@ define @test_vluxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7259,8 +7259,8 @@ define @test_vluxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7272,8 +7272,8 @@ define @test_vluxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7288,8 +7288,8 @@ define @test_vluxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7301,8 +7301,8 @@ define @test_vluxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7317,8 +7317,8 @@ define @test_vluxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7330,8 +7330,8 @@ define @test_vluxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7346,8 +7346,8 @@ define @test_vluxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7359,8 +7359,8 @@ define @test_vluxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7375,8 +7375,8 @@ define @test_vluxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7388,8 +7388,8 @@ define @test_vluxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7404,8 +7404,8 @@ define @test_vluxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7417,8 +7417,8 @@ define 
@test_vluxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7433,8 +7433,8 @@ define @test_vluxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 6) @@ -7446,8 +7446,8 @@ define @test_vluxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -7460,8 +7460,8 @@ define @test_vluxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7473,8 +7473,8 @@ define @test_vluxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7487,8 +7487,8 @@ define @test_vluxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7500,8 +7500,8 @@ define @test_vluxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7514,8 +7514,8 @@ define @test_vluxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7527,8 +7527,8 @@ define @test_vluxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7541,8 +7541,8 @@ define @test_vluxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7554,8 +7554,8 @@ define @test_vluxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7568,8 +7568,8 @@ define @test_vluxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, 
v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7581,8 +7581,8 @@ define @test_vluxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7595,8 +7595,8 @@ define @test_vluxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7608,8 +7608,8 @@ define @test_vluxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7622,8 +7622,8 @@ define @test_vluxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7635,8 +7635,8 @@ define @test_vluxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7649,8 +7649,8 @@ define @test_vluxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, 
e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7662,8 +7662,8 @@ define @test_vluxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7865,8 +7865,8 @@ define @test_vluxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7878,8 +7878,8 @@ define @test_vluxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7892,8 +7892,8 @@ define @test_vluxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7905,8 +7905,8 @@ define @test_vluxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7919,8 +7919,8 @@ define @test_vluxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_ ; 
CHECK-LABEL: test_vluxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7932,8 +7932,8 @@ define @test_vluxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7946,8 +7946,8 @@ define @test_vluxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7959,8 +7959,8 @@ define @test_vluxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -7973,8 +7973,8 @@ define @test_vluxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -7986,8 +7986,8 @@ define @test_vluxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr 
%base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8000,8 +8000,8 @@ define @test_vluxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8013,8 +8013,8 @@ define @test_vluxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8027,8 +8027,8 @@ define @test_vluxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8040,8 +8040,8 @@ define @test_vluxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8054,8 +8054,8 @@ define @test_vluxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8067,8 +8067,8 @@ define @test_vluxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8135,8 +8135,8 @@ define @test_vluxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t ; CHECK-LABEL: test_vluxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei16.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8148,8 +8148,8 @@ define @test_vluxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16 ; CHECK-LABEL: test_vluxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei16.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8189,8 +8189,8 @@ define @test_vluxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8202,8 +8202,8 @@ define @test_vluxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8216,8 +8216,8 @@ define @test_vluxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8229,8 +8229,8 @@ define @test_vluxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: 
vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8243,8 +8243,8 @@ define @test_vluxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8256,8 +8256,8 @@ define @test_vluxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8270,8 +8270,8 @@ define @test_vluxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8283,8 +8283,8 @@ define @test_vluxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8297,8 +8297,8 @@ define @test_vluxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8310,8 +8310,8 @@ define @test_vluxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: 
# %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8324,8 +8324,8 @@ define @test_vluxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8337,8 +8337,8 @@ define @test_vluxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8351,8 +8351,8 @@ define @test_vluxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8364,8 +8364,8 @@ define @test_vluxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8378,8 +8378,8 @@ define @test_vluxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8391,8 +8391,8 @@ define 
@test_vluxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8459,8 +8459,8 @@ define @test_vluxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t ; CHECK-LABEL: test_vluxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei16.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8472,8 +8472,8 @@ define @test_vluxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16 ; CHECK-LABEL: test_vluxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei16.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8513,8 +8513,8 @@ define @test_vluxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8526,8 +8526,8 @@ define @test_vluxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8540,8 +8540,8 @@ define @test_vluxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8553,8 +8553,8 @@ define @test_vluxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8567,8 +8567,8 @@ define @test_vluxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8580,8 +8580,8 @@ define @test_vluxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8594,8 +8594,8 @@ define @test_vluxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8607,8 +8607,8 @@ define @test_vluxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8621,8 +8621,8 @@ define @test_vluxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8634,8 +8634,8 @@ define @test_vluxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8648,8 +8648,8 @@ define @test_vluxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8661,8 +8661,8 @@ define @test_vluxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8675,8 +8675,8 @@ define @test_vluxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8688,8 +8688,8 @@ define @test_vluxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8702,8 +8702,8 @@ define @test_vluxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8715,8 +8715,8 @@ define @test_vluxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8729,8 +8729,8 @@ define @test_vluxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8742,8 +8742,8 @@ define @test_vluxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8756,8 +8756,8 @@ define @test_vluxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8769,8 +8769,8 @@ define @test_vluxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8783,8 +8783,8 @@ define 
@test_vluxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8796,8 +8796,8 @@ define @test_vluxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8810,8 +8810,8 @@ define @test_vluxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8823,8 +8823,8 @@ define @test_vluxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8837,8 +8837,8 @@ define @test_vluxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8850,8 +8850,8 @@ define @test_vluxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8864,8 +8864,8 @@ define @test_vluxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8877,8 +8877,8 @@ define @test_vluxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8891,8 +8891,8 @@ define @test_vluxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8904,8 +8904,8 @@ define @test_vluxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8918,8 +8918,8 @@ define @test_vluxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8931,8 +8931,8 @@ define @test_vluxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8945,8 +8945,8 @@ define @test_vluxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8958,8 +8958,8 @@ define @test_vluxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8972,8 +8972,8 @@ define @test_vluxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -8985,8 +8985,8 @@ define @test_vluxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -8999,8 +8999,8 @@ define @test_vluxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9012,8 +9012,8 @@ define @test_vluxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli 
zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9026,8 +9026,8 @@ define @test_vluxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9039,8 +9039,8 @@ define @test_vluxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9053,8 +9053,8 @@ define @test_vluxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9066,8 +9066,8 @@ define @test_vluxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9080,8 +9080,8 @@ define @test_vluxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9093,8 +9093,8 @@ define 
@test_vluxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9107,8 +9107,8 @@ define @test_vluxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9120,8 +9120,8 @@ define @test_vluxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9134,8 +9134,8 @@ define @test_vluxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9147,8 +9147,8 @@ define @test_vluxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9161,8 +9161,8 @@ define @test_vluxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9174,8 +9174,8 @@ define @test_vluxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9188,8 +9188,8 @@ define @test_vluxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9201,8 +9201,8 @@ define @test_vluxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9215,8 +9215,8 @@ define @test_vluxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9228,8 +9228,8 @@ define @test_vluxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9242,8 +9242,8 @@ define @test_vluxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9255,8 +9255,8 @@ define @test_vluxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9269,8 +9269,8 @@ define @test_vluxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9282,8 +9282,8 @@ define @test_vluxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9296,8 +9296,8 @@ define @test_vluxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9309,8 +9309,8 @@ define @test_vluxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9323,8 +9323,8 @@ define @test_vluxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9336,8 +9336,8 @@ define @test_vluxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9350,8 +9350,8 @@ define @test_vluxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9363,8 +9363,8 @@ define @test_vluxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9377,8 +9377,8 @@ define @test_vluxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9390,8 +9390,8 @@ define @test_vluxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9404,8 +9404,8 @@ define 
@test_vluxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9417,8 +9417,8 @@ define @test_vluxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9431,8 +9431,8 @@ define @test_vluxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9444,8 +9444,8 @@ define @test_vluxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9458,8 +9458,8 @@ define @test_vluxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -9471,8 +9471,8 @@ define @test_vluxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -9485,8 +9485,8 @@ define @test_vluxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t ; CHECK-LABEL: test_vluxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9498,8 +9498,8 @@ define @test_vluxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9512,8 +9512,8 @@ define @test_vluxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t ; CHECK-LABEL: test_vluxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9525,8 +9525,8 @@ define @test_vluxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9539,8 +9539,8 @@ define @test_vluxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t ; CHECK-LABEL: test_vluxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9552,8 +9552,8 @@ define @test_vluxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; 
CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9566,8 +9566,8 @@ define @test_vluxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t ; CHECK-LABEL: test_vluxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9579,8 +9579,8 @@ define @test_vluxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9593,8 +9593,8 @@ define @test_vluxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t ; CHECK-LABEL: test_vluxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9606,8 +9606,8 @@ define @test_vluxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9620,8 +9620,8 @@ define @test_vluxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t ; CHECK-LABEL: test_vluxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9633,8 +9633,8 @@ define @test_vluxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; 
CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9809,8 +9809,8 @@ define @test_vluxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t ; CHECK-LABEL: test_vluxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9822,8 +9822,8 @@ define @test_vluxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9836,8 +9836,8 @@ define @test_vluxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t ; CHECK-LABEL: test_vluxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9849,8 +9849,8 @@ define @test_vluxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9863,8 +9863,8 @@ define @test_vluxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t ; CHECK-LABEL: test_vluxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9876,8 +9876,8 @@ define @test_vluxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: 
test_vluxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9890,8 +9890,8 @@ define @test_vluxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t ; CHECK-LABEL: test_vluxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9903,8 +9903,8 @@ define @test_vluxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9917,8 +9917,8 @@ define @test_vluxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t ; CHECK-LABEL: test_vluxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -9930,8 +9930,8 @@ define @test_vluxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -9944,8 +9944,8 @@ define @test_vluxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t ; CHECK-LABEL: test_vluxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, 
%index, i32 %vl, i32 5) @@ -9957,8 +9957,8 @@ define @test_vluxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10025,8 +10025,8 @@ define @test_vluxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3 ; CHECK-LABEL: test_vluxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei32.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10038,8 +10038,8 @@ define @test_vluxseg3_mask_nxv4f32_triscv.vector.tuple_nxv1 ; CHECK-LABEL: test_vluxseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei32.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10052,8 +10052,8 @@ define @test_vluxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t ; CHECK-LABEL: test_vluxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10065,8 +10065,8 @@ define @test_vluxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10079,8 +10079,8 @@ define @test_vluxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t ; CHECK-LABEL: test_vluxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10092,8 +10092,8 @@ define @test_vluxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10106,8 +10106,8 @@ define @test_vluxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t ; CHECK-LABEL: test_vluxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10119,8 +10119,8 @@ define @test_vluxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10133,8 +10133,8 @@ define @test_vluxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t ; CHECK-LABEL: test_vluxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10146,8 +10146,8 @@ define @test_vluxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10160,8 +10160,8 @@ define @test_vluxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t ; CHECK-LABEL: test_vluxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; 
CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10173,8 +10173,8 @@ define @test_vluxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10187,8 +10187,8 @@ define @test_vluxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t ; CHECK-LABEL: test_vluxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10200,8 +10200,8 @@ define @test_vluxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10268,8 +10268,8 @@ define @test_vluxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4 ; CHECK-LABEL: test_vluxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei32.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10281,8 +10281,8 @@ define @test_vluxseg4_mask_nxv4f32_triscv.vector.tuple_nxv1 ; CHECK-LABEL: test_vluxseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei32.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10295,8 +10295,8 @@ define @test_vluxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t ; CHECK-LABEL: 
test_vluxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10308,8 +10308,8 @@ define @test_vluxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10322,8 +10322,8 @@ define @test_vluxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t ; CHECK-LABEL: test_vluxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10335,8 +10335,8 @@ define @test_vluxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10349,8 +10349,8 @@ define @test_vluxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t ; CHECK-LABEL: test_vluxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10362,8 +10362,8 @@ define @test_vluxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, 
%index, %mask, i32 %vl, i32 1, i32 5) @@ -10376,8 +10376,8 @@ define @test_vluxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t ; CHECK-LABEL: test_vluxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10389,8 +10389,8 @@ define @test_vluxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10403,8 +10403,8 @@ define @test_vluxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t ; CHECK-LABEL: test_vluxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10416,8 +10416,8 @@ define @test_vluxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10430,8 +10430,8 @@ define @test_vluxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t ; CHECK-LABEL: test_vluxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10443,8 +10443,8 @@ define @test_vluxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10457,8 +10457,8 @@ define @test_vluxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t ; CHECK-LABEL: test_vluxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10470,8 +10470,8 @@ define @test_vluxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10484,8 +10484,8 @@ define @test_vluxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t ; CHECK-LABEL: test_vluxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10497,8 +10497,8 @@ define @test_vluxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10511,8 +10511,8 @@ define @test_vluxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t ; CHECK-LABEL: test_vluxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10524,8 +10524,8 @@ define @test_vluxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: 
vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10538,8 +10538,8 @@ define @test_vluxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t ; CHECK-LABEL: test_vluxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10551,8 +10551,8 @@ define @test_vluxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10565,8 +10565,8 @@ define @test_vluxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t ; CHECK-LABEL: test_vluxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10578,8 +10578,8 @@ define @test_vluxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10592,8 +10592,8 @@ define @test_vluxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t ; CHECK-LABEL: test_vluxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10605,8 +10605,8 @@ define @test_vluxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: 
# %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10619,8 +10619,8 @@ define @test_vluxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t ; CHECK-LABEL: test_vluxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10632,8 +10632,8 @@ define @test_vluxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10646,8 +10646,8 @@ define @test_vluxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t ; CHECK-LABEL: test_vluxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10659,8 +10659,8 @@ define @test_vluxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10673,8 +10673,8 @@ define @test_vluxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t ; CHECK-LABEL: test_vluxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10686,8 +10686,8 @@ define 
@test_vluxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10700,8 +10700,8 @@ define @test_vluxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t ; CHECK-LABEL: test_vluxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10713,8 +10713,8 @@ define @test_vluxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10727,8 +10727,8 @@ define @test_vluxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t ; CHECK-LABEL: test_vluxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10740,8 +10740,8 @@ define @test_vluxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10754,8 +10754,8 @@ define @test_vluxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t ; CHECK-LABEL: test_vluxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10767,8 +10767,8 @@ define @test_vluxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10781,8 +10781,8 @@ define @test_vluxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t ; CHECK-LABEL: test_vluxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10794,8 +10794,8 @@ define @test_vluxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10808,8 +10808,8 @@ define @test_vluxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t ; CHECK-LABEL: test_vluxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10821,8 +10821,8 @@ define @test_vluxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10835,8 +10835,8 @@ define @test_vluxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t ; CHECK-LABEL: test_vluxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10848,8 +10848,8 @@ define @test_vluxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10862,8 +10862,8 @@ define @test_vluxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t ; CHECK-LABEL: test_vluxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10875,8 +10875,8 @@ define @test_vluxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10889,8 +10889,8 @@ define @test_vluxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t ; CHECK-LABEL: test_vluxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10902,8 +10902,8 @@ define @test_vluxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10916,8 +10916,8 @@ define @test_vluxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t ; CHECK-LABEL: test_vluxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 5) @@ -10929,8 +10929,8 @@ define @test_vluxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 5) @@ -10943,8 +10943,8 @@ define @test_vluxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vluxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 6) @@ -10956,8 +10956,8 @@ define @test_vluxseg2_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -10970,8 +10970,8 @@ define @test_vluxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vluxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 6) @@ -10983,8 +10983,8 @@ define @test_vluxseg2_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -10997,8 +10997,8 @@ define 
@test_vluxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vluxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11010,8 +11010,8 @@ define @test_vluxseg2_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11186,8 +11186,8 @@ define @test_vluxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vluxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11199,8 +11199,8 @@ define @test_vluxseg3_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11213,8 +11213,8 @@ define @test_vluxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vluxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11226,8 +11226,8 @@ define @test_vluxseg3_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11240,8 +11240,8 @@ define @test_vluxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vluxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11253,8 +11253,8 @@ define @test_vluxseg3_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11348,8 +11348,8 @@ define @test_vluxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vluxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11361,8 +11361,8 @@ define @test_vluxseg4_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11375,8 +11375,8 @@ define @test_vluxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vluxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11388,8 +11388,8 @@ define @test_vluxseg4_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 
+; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11402,8 +11402,8 @@ define @test_vluxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vluxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11415,8 +11415,8 @@ define @test_vluxseg4_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11510,8 +11510,8 @@ define @test_vluxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vluxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11523,8 +11523,8 @@ define @test_vluxseg5_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11537,8 +11537,8 @@ define @test_vluxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vluxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11550,8 +11550,8 @@ define @test_vluxseg5_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, 
e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11564,8 +11564,8 @@ define @test_vluxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vluxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11577,8 +11577,8 @@ define @test_vluxseg5_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11591,8 +11591,8 @@ define @test_vluxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vluxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11604,8 +11604,8 @@ define @test_vluxseg6_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11618,8 +11618,8 @@ define @test_vluxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vluxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11631,8 +11631,8 @@ define @test_vluxseg6_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: 
test_vluxseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11645,8 +11645,8 @@ define @test_vluxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vluxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11658,8 +11658,8 @@ define @test_vluxseg6_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11672,8 +11672,8 @@ define @test_vluxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vluxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11685,8 +11685,8 @@ define @test_vluxseg7_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11699,8 +11699,8 @@ define @test_vluxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vluxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr 
%base, %index, i32 %vl, i32 6) @@ -11712,8 +11712,8 @@ define @test_vluxseg7_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11726,8 +11726,8 @@ define @test_vluxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vluxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11739,8 +11739,8 @@ define @test_vluxseg7_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11753,8 +11753,8 @@ define @test_vluxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vluxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11766,8 +11766,8 @@ define @test_vluxseg8_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11780,8 +11780,8 @@ define @test_vluxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vluxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11793,8 +11793,8 @@ define @test_vluxseg8_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11807,8 +11807,8 @@ define @test_vluxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vluxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 6) @@ -11820,8 +11820,8 @@ define @test_vluxseg8_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 6) @@ -11834,8 +11834,8 @@ define @test_vluxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -11847,8 +11847,8 @@ define @test_vluxseg2_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -11861,8 +11861,8 @@ define @test_vluxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: 
vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -11874,8 +11874,8 @@ define @test_vluxseg2_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -11888,8 +11888,8 @@ define @test_vluxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -11901,8 +11901,8 @@ define @test_vluxseg2_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -11915,8 +11915,8 @@ define @test_vluxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -11928,8 +11928,8 @@ define @test_vluxseg2_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -11942,8 +11942,8 @@ define @test_vluxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: 
# %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -11955,8 +11955,8 @@ define @test_vluxseg2_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -11969,8 +11969,8 @@ define @test_vluxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -11982,8 +11982,8 @@ define @test_vluxseg2_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -11996,8 +11996,8 @@ define @test_vluxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12009,8 +12009,8 @@ define @test_vluxseg2_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12023,8 +12023,8 @@ define 
@test_vluxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12036,8 +12036,8 @@ define @test_vluxseg2_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12239,8 +12239,8 @@ define @test_vluxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12252,8 +12252,8 @@ define @test_vluxseg3_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12266,8 +12266,8 @@ define @test_vluxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12279,8 +12279,8 @@ define @test_vluxseg3_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12293,8 +12293,8 @@ define @test_vluxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12306,8 +12306,8 @@ define @test_vluxseg3_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12320,8 +12320,8 @@ define @test_vluxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12333,8 +12333,8 @@ define @test_vluxseg3_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12347,8 +12347,8 @@ define @test_vluxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12360,8 +12360,8 @@ define @test_vluxseg3_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12374,8 +12374,8 @@ define @test_vluxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12387,8 +12387,8 @@ define @test_vluxseg3_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12401,8 +12401,8 @@ define @test_vluxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12414,8 +12414,8 @@ define @test_vluxseg3_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12428,8 +12428,8 @@ define @test_vluxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12441,8 +12441,8 @@ define @test_vluxseg3_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12509,8 +12509,8 @@ define @test_vluxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8 ; CHECK-LABEL: test_vluxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei16.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12522,8 +12522,8 @@ define @test_vluxseg3_mask_nxv8bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei16.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12563,8 +12563,8 @@ define @test_vluxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12576,8 +12576,8 @@ define @test_vluxseg4_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12590,8 +12590,8 @@ define @test_vluxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12603,8 +12603,8 @@ define 
@test_vluxseg4_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12617,8 +12617,8 @@ define @test_vluxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12630,8 +12630,8 @@ define @test_vluxseg4_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12644,8 +12644,8 @@ define @test_vluxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12657,8 +12657,8 @@ define @test_vluxseg4_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12671,8 +12671,8 @@ define @test_vluxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12684,8 +12684,8 @@ define @test_vluxseg4_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12698,8 +12698,8 @@ define @test_vluxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12711,8 +12711,8 @@ define @test_vluxseg4_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12725,8 +12725,8 @@ define @test_vluxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12738,8 +12738,8 @@ define @test_vluxseg4_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12752,8 +12752,8 @@ define @test_vluxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12765,8 +12765,8 @@ define @test_vluxseg4_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12833,8 +12833,8 @@ define @test_vluxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8 ; CHECK-LABEL: test_vluxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei16.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12846,8 +12846,8 @@ define @test_vluxseg4_mask_nxv8bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei16.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12887,8 +12887,8 @@ define @test_vluxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12900,8 +12900,8 @@ define @test_vluxseg5_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12914,8 +12914,8 @@ define @test_vluxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12927,8 +12927,8 @@ define @test_vluxseg5_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12941,8 +12941,8 @@ define @test_vluxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12954,8 +12954,8 @@ define @test_vluxseg5_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12968,8 +12968,8 @@ define @test_vluxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -12981,8 +12981,8 @@ define @test_vluxseg5_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -12995,8 +12995,8 @@ define 
@test_vluxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13008,8 +13008,8 @@ define @test_vluxseg5_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13022,8 +13022,8 @@ define @test_vluxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13035,8 +13035,8 @@ define @test_vluxseg5_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13049,8 +13049,8 @@ define @test_vluxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13062,8 +13062,8 @@ define @test_vluxseg5_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13076,8 +13076,8 @@ define @test_vluxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13089,8 +13089,8 @@ define @test_vluxseg5_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13103,8 +13103,8 @@ define @test_vluxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13116,8 +13116,8 @@ define @test_vluxseg5_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13130,8 +13130,8 @@ define @test_vluxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13143,8 +13143,8 @@ define @test_vluxseg6_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13157,8 +13157,8 @@ define @test_vluxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13170,8 +13170,8 @@ define @test_vluxseg6_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13184,8 +13184,8 @@ define @test_vluxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13197,8 +13197,8 @@ define @test_vluxseg6_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13211,8 +13211,8 @@ define @test_vluxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13224,8 +13224,8 @@ define @test_vluxseg6_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13238,8 +13238,8 @@ define @test_vluxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13251,8 +13251,8 @@ define @test_vluxseg6_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13265,8 +13265,8 @@ define @test_vluxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13278,8 +13278,8 @@ define @test_vluxseg6_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13292,8 +13292,8 @@ define @test_vluxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13305,8 +13305,8 @@ define 
@test_vluxseg6_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13319,8 +13319,8 @@ define @test_vluxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13332,8 +13332,8 @@ define @test_vluxseg6_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13346,8 +13346,8 @@ define @test_vluxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13359,8 +13359,8 @@ define @test_vluxseg6_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13373,8 +13373,8 @@ define @test_vluxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13386,8 +13386,8 @@ define @test_vluxseg7_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13400,8 +13400,8 @@ define @test_vluxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13413,8 +13413,8 @@ define @test_vluxseg7_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13427,8 +13427,8 @@ define @test_vluxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13440,8 +13440,8 @@ define @test_vluxseg7_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13454,8 +13454,8 @@ define @test_vluxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13467,8 +13467,8 @@ define @test_vluxseg7_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13481,8 +13481,8 @@ define @test_vluxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13494,8 +13494,8 @@ define @test_vluxseg7_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13508,8 +13508,8 @@ define @test_vluxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13521,8 +13521,8 @@ define @test_vluxseg7_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13535,8 +13535,8 @@ define @test_vluxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13548,8 +13548,8 @@ define @test_vluxseg7_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13562,8 +13562,8 @@ define @test_vluxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13575,8 +13575,8 @@ define @test_vluxseg7_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13589,8 +13589,8 @@ define @test_vluxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13602,8 +13602,8 @@ define @test_vluxseg7_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13616,8 +13616,8 @@ define 
@test_vluxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13629,8 +13629,8 @@ define @test_vluxseg8_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13643,8 +13643,8 @@ define @test_vluxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13656,8 +13656,8 @@ define @test_vluxseg8_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13670,8 +13670,8 @@ define @test_vluxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13683,8 +13683,8 @@ define @test_vluxseg8_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13697,8 +13697,8 @@ define @test_vluxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13710,8 +13710,8 @@ define @test_vluxseg8_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13724,8 +13724,8 @@ define @test_vluxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13737,8 +13737,8 @@ define @test_vluxseg8_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13751,8 +13751,8 @@ define @test_vluxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13764,8 +13764,8 @@ define @test_vluxseg8_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13778,8 +13778,8 @@ define @test_vluxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13791,8 +13791,8 @@ define @test_vluxseg8_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13805,8 +13805,8 @@ define @test_vluxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13818,8 +13818,8 @@ define @test_vluxseg8_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) @@ -13832,8 +13832,8 @@ define @test_vluxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i32 %vl, i32 4) @@ -13845,8 +13845,8 @@ define @test_vluxseg8_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i32 %vl, i32 1, i32 4) diff --git a/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv64.ll b/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv64.ll index 49565c422e92c..cfe5ab2b07e64 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv64.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vluxseg-rv64.ll @@ -9,8 +9,8 @@ define @test_vluxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -22,8 +22,8 @@ define @test_vluxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -35,8 +35,8 @@ define @test_vluxseg2_allonesmask_nxv1i8_triscv.vector.tuple_n ; CHECK-LABEL: test_vluxseg2_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, splat (i1 true), i64 %vl, i64 1, i64 3) @@ -51,8 +51,8 @@ define @test_vluxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -64,8 +64,8 @@ define @test_vluxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -80,8 +80,8 @@ define @test_vluxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -93,8 +93,8 @@ define @test_vluxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -109,8 +109,8 @@ define @test_vluxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv1i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -122,8 +122,8 @@ define @test_vluxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i8_triscv.vector.tuple_nxv1i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv1i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -138,8 +138,8 @@ define @test_vluxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -151,8 +151,8 @@ define @test_vluxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -167,8 +167,8 @@ define @test_vluxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -180,8 +180,8 @@ define @test_vluxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -196,8 +196,8 @@ define @test_vluxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -209,8 +209,8 @@ define @test_vluxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i8_triscv.vector.tuple_nxv2i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -254,8 +254,8 @@ define @test_vluxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -267,8 +267,8 @@ define @test_vluxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, 
mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -283,8 +283,8 @@ define @test_vluxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -296,8 +296,8 @@ define @test_vluxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i8_triscv.vector.tuple_nxv4i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -370,8 +370,8 @@ define @test_vluxseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t_nxv ; CHECK-LABEL: test_vluxseg2_nxv8i8_triscv.vector.tuple_nxv8i8_2t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv8i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 3) @@ -383,8 +383,8 @@ define @test_vluxseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vluxseg2_mask_nxv8i8_triscv.vector.tuple_nxv8i8_2t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -631,8 +631,8 @@ define @test_vluxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv1i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -644,8 +644,8 @@ define @test_vluxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3 ; CHECK-LABEL: 
test_vluxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -657,8 +657,8 @@ define @test_vluxseg3_allonesmask_nxv1i8_triscv.vector.tuple_n ; CHECK-LABEL: test_vluxseg3_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, splat (i1 true), i64 %vl, i64 1, i64 3) @@ -673,8 +673,8 @@ define @test_vluxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv1i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -686,8 +686,8 @@ define @test_vluxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -702,8 +702,8 @@ define @test_vluxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv1i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -715,8 +715,8 @@ define @test_vluxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -731,8 +731,8 @@ define @test_vluxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv1i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -744,8 +744,8 @@ define @test_vluxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i8_triscv.vector.tuple_nxv1i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv1i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -760,8 +760,8 @@ define @test_vluxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -773,8 +773,8 @@ define @test_vluxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -789,8 +789,8 @@ define @test_vluxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -802,8 +802,8 @@ define @test_vluxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; 
CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -818,8 +818,8 @@ define @test_vluxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -831,8 +831,8 @@ define @test_vluxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i8_triscv.vector.tuple_nxv2i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -876,8 +876,8 @@ define @test_vluxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -889,8 +889,8 @@ define @test_vluxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -905,8 +905,8 @@ define @test_vluxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -918,8 +918,8 @@ define @test_vluxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv4i8_triscv.vector.tuple_nxv4i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, 
ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -992,8 +992,8 @@ define @test_vluxseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t_nxv ; CHECK-LABEL: test_vluxseg3_nxv8i8_triscv.vector.tuple_nxv8i8_3t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv8i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1005,8 +1005,8 @@ define @test_vluxseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vluxseg3_mask_nxv8i8_triscv.vector.tuple_nxv8i8_3t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1108,8 +1108,8 @@ define @test_vluxseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv16i8_triscv.vector.tuple_nxv16i8_3t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei8.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv16i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1121,8 +1121,8 @@ define @test_vluxseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vluxseg3_mask_nxv16i8_triscv.vector.tuple_nxv16i8_3t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei8.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv16i1.nxv16i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1195,8 +1195,8 @@ define @test_vluxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv1i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1208,8 +1208,8 @@ define @test_vluxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4 ; CHECK-LABEL: 
test_vluxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1221,8 +1221,8 @@ define @test_vluxseg4_allonesmask_nxv1i8_triscv.vector.tuple_n ; CHECK-LABEL: test_vluxseg4_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, splat (i1 true), i64 %vl, i64 1, i64 3) @@ -1237,8 +1237,8 @@ define @test_vluxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv1i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1250,8 +1250,8 @@ define @test_vluxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1266,8 +1266,8 @@ define @test_vluxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv1i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1279,8 +1279,8 @@ define @test_vluxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1295,8 +1295,8 @@ define @test_vluxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv1i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1308,8 +1308,8 @@ define @test_vluxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i8_triscv.vector.tuple_nxv1i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv1i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1324,8 +1324,8 @@ define @test_vluxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1337,8 +1337,8 @@ define @test_vluxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1353,8 +1353,8 @@ define @test_vluxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1366,8 +1366,8 @@ define @test_vluxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1382,8 +1382,8 @@ define @test_vluxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1395,8 +1395,8 @@ define @test_vluxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i8_triscv.vector.tuple_nxv2i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1440,8 +1440,8 @@ define @test_vluxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1453,8 +1453,8 @@ define @test_vluxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1469,8 +1469,8 @@ define @test_vluxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1482,8 +1482,8 @@ define @test_vluxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv4i8_triscv.vector.tuple_nxv4i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1556,8 +1556,8 @@ define @test_vluxseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t_nxv ; CHECK-LABEL: test_vluxseg4_nxv8i8_triscv.vector.tuple_nxv8i8_4t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv8i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1569,8 +1569,8 @@ define @test_vluxseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vluxseg4_mask_nxv8i8_triscv.vector.tuple_nxv8i8_4t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1672,8 +1672,8 @@ define @test_vluxseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv16i8_triscv.vector.tuple_nxv16i8_4t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei8.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv16i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1685,8 +1685,8 @@ define @test_vluxseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vluxseg4_mask_nxv16i8_triscv.vector.tuple_nxv16i8_4t_nxv16i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei8.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv16i1.nxv16i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1759,8 +1759,8 @@ define @test_vluxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv1i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1772,8 +1772,8 @@ define 
@test_vluxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1785,8 +1785,8 @@ define @test_vluxseg5_allonesmask_nxv1i8_triscv.vector.tuple_n ; CHECK-LABEL: test_vluxseg5_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, splat (i1 true), i64 %vl, i64 1, i64 3) @@ -1801,8 +1801,8 @@ define @test_vluxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv1i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1814,8 +1814,8 @@ define @test_vluxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1830,8 +1830,8 @@ define @test_vluxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv1i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1843,8 +1843,8 @@ define @test_vluxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1859,8 +1859,8 @@ define @test_vluxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv1i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1872,8 +1872,8 @@ define @test_vluxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i8_triscv.vector.tuple_nxv1i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv1i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1888,8 +1888,8 @@ define @test_vluxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1901,8 +1901,8 @@ define @test_vluxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1917,8 +1917,8 @@ define @test_vluxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1930,8 +1930,8 @@ define @test_vluxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1946,8 +1946,8 @@ define @test_vluxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1959,8 +1959,8 @@ define @test_vluxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -1975,8 +1975,8 @@ define @test_vluxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -1988,8 +1988,8 @@ define @test_vluxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i8_triscv.vector.tuple_nxv2i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2004,8 +2004,8 @@ define @test_vluxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2017,8 +2017,8 @@ define @test_vluxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2033,8 +2033,8 @@ define @test_vluxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2046,8 +2046,8 @@ define @test_vluxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2062,8 +2062,8 @@ define @test_vluxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2075,8 +2075,8 @@ define @test_vluxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv4i8_triscv.vector.tuple_nxv4i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2120,8 +2120,8 @@ define @test_vluxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv8i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2133,8 +2133,8 @@ define 
@test_vluxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2149,8 +2149,8 @@ define @test_vluxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv ; CHECK-LABEL: test_vluxseg5_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv8i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2162,8 +2162,8 @@ define @test_vluxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vluxseg5_mask_nxv8i8_triscv.vector.tuple_nxv8i8_5t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2236,8 +2236,8 @@ define @test_vluxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv1i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2249,8 +2249,8 @@ define @test_vluxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2262,8 +2262,8 @@ define @test_vluxseg6_allonesmask_nxv1i8_triscv.vector.tuple_n ; CHECK-LABEL: test_vluxseg6_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, splat (i1 true), i64 %vl, i64 1, i64 3) @@ -2278,8 +2278,8 @@ define @test_vluxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv1i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2291,8 +2291,8 @@ define @test_vluxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2307,8 +2307,8 @@ define @test_vluxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv1i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2320,8 +2320,8 @@ define @test_vluxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2336,8 +2336,8 @@ define @test_vluxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv1i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2349,8 +2349,8 @@ define @test_vluxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i8_triscv.vector.tuple_nxv1i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv1i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2365,8 +2365,8 @@ define @test_vluxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2378,8 +2378,8 @@ define @test_vluxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2394,8 +2394,8 @@ define @test_vluxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2407,8 +2407,8 @@ define @test_vluxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2423,8 +2423,8 @@ define @test_vluxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2436,8 +2436,8 @@ define @test_vluxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry 
; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2452,8 +2452,8 @@ define @test_vluxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2465,8 +2465,8 @@ define @test_vluxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i8_triscv.vector.tuple_nxv2i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2481,8 +2481,8 @@ define @test_vluxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2494,8 +2494,8 @@ define @test_vluxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2510,8 +2510,8 @@ define @test_vluxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2523,8 +2523,8 @@ define 
@test_vluxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2539,8 +2539,8 @@ define @test_vluxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2552,8 +2552,8 @@ define @test_vluxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv4i8_triscv.vector.tuple_nxv4i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2597,8 +2597,8 @@ define @test_vluxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv8i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2610,8 +2610,8 @@ define @test_vluxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2626,8 +2626,8 @@ define @test_vluxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv ; CHECK-LABEL: test_vluxseg6_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv8i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2639,8 +2639,8 @@ define @test_vluxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vluxseg6_mask_nxv8i8_triscv.vector.tuple_nxv8i8_6t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2713,8 +2713,8 @@ define @test_vluxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv1i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2726,8 +2726,8 @@ define @test_vluxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2739,8 +2739,8 @@ define @test_vluxseg7_allonesmask_nxv1i8_triscv.vector.tuple_n ; CHECK-LABEL: test_vluxseg7_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, splat (i1 true), i64 %vl, i64 1, i64 3) @@ -2755,8 +2755,8 @@ define @test_vluxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv1i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2768,8 +2768,8 @@ define @test_vluxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2784,8 +2784,8 @@ define @test_vluxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv1i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2797,8 +2797,8 @@ define @test_vluxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2813,8 +2813,8 @@ define @test_vluxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv1i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2826,8 +2826,8 @@ define @test_vluxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i8_triscv.vector.tuple_nxv1i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv1i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2842,8 +2842,8 @@ define @test_vluxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2855,8 +2855,8 @@ define @test_vluxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i8: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2871,8 +2871,8 @@ define @test_vluxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2884,8 +2884,8 @@ define @test_vluxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2900,8 +2900,8 @@ define @test_vluxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2913,8 +2913,8 @@ define @test_vluxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2929,8 +2929,8 @@ define @test_vluxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2942,8 +2942,8 @@ define 
@test_vluxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i8_triscv.vector.tuple_nxv2i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2958,8 +2958,8 @@ define @test_vluxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -2971,8 +2971,8 @@ define @test_vluxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -2987,8 +2987,8 @@ define @test_vluxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3000,8 +3000,8 @@ define @test_vluxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3016,8 +3016,8 @@ define @test_vluxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3029,8 +3029,8 @@ define @test_vluxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv4i8_triscv.vector.tuple_nxv4i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3074,8 +3074,8 @@ define @test_vluxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv8i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3087,8 +3087,8 @@ define @test_vluxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3103,8 +3103,8 @@ define @test_vluxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv ; CHECK-LABEL: test_vluxseg7_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv8i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3116,8 +3116,8 @@ define @test_vluxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vluxseg7_mask_nxv8i8_triscv.vector.tuple_nxv8i8_7t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3190,8 +3190,8 @@ define @test_vluxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv1i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3203,8 +3203,8 @@ define @test_vluxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3216,8 +3216,8 @@ define @test_vluxseg8_allonesmask_nxv1i8_triscv.vector.tuple_n ; CHECK-LABEL: test_vluxseg8_allonesmask_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, splat (i1 true), i64 %vl, i64 1, i64 3) @@ -3232,8 +3232,8 @@ define @test_vluxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv1i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3245,8 +3245,8 @@ define @test_vluxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3261,8 +3261,8 @@ define @test_vluxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv1i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3274,8 +3274,8 @@ define @test_vluxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i32: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3290,8 +3290,8 @@ define @test_vluxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv1i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3303,8 +3303,8 @@ define @test_vluxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i8_triscv.vector.tuple_nxv1i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv1i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3319,8 +3319,8 @@ define @test_vluxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3332,8 +3332,8 @@ define @test_vluxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3348,8 +3348,8 @@ define @test_vluxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3361,8 +3361,8 @@ define 
@test_vluxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3377,8 +3377,8 @@ define @test_vluxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3390,8 +3390,8 @@ define @test_vluxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3406,8 +3406,8 @@ define @test_vluxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3419,8 +3419,8 @@ define @test_vluxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i8_triscv.vector.tuple_nxv2i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3435,8 +3435,8 @@ define @test_vluxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3448,8 +3448,8 @@ define @test_vluxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3464,8 +3464,8 @@ define @test_vluxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3477,8 +3477,8 @@ define @test_vluxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3493,8 +3493,8 @@ define @test_vluxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3506,8 +3506,8 @@ define @test_vluxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv4i8_triscv.vector.tuple_nxv4i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3551,8 +3551,8 @@ define @test_vluxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv8i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3564,8 +3564,8 @@ define @test_vluxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1.nxv8i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3580,8 +3580,8 @@ define @test_vluxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv ; CHECK-LABEL: test_vluxseg8_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv8i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 3) @@ -3593,8 +3593,8 @@ define @test_vluxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vluxseg8_mask_nxv8i8_triscv.vector.tuple_nxv8i8_8t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 3) @@ -3667,8 +3667,8 @@ define @test_vluxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3680,8 +3680,8 @@ define @test_vluxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3696,8 +3696,8 @@ define @test_vluxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3709,8 +3709,8 @@ define @test_vluxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3725,8 +3725,8 @@ define @test_vluxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3738,8 +3738,8 @@ define @test_vluxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3754,8 +3754,8 @@ define @test_vluxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3767,8 +3767,8 @@ define @test_vluxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i16_triscv.vector.tuple_nxv2i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3783,8 +3783,8 @@ define 
@test_vluxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3796,8 +3796,8 @@ define @test_vluxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3812,8 +3812,8 @@ define @test_vluxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3825,8 +3825,8 @@ define @test_vluxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3841,8 +3841,8 @@ define @test_vluxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3854,8 +3854,8 @@ define @test_vluxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3899,8 +3899,8 @@ define @test_vluxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3912,8 +3912,8 @@ define @test_vluxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -3928,8 +3928,8 @@ define @test_vluxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -3941,8 +3941,8 @@ define @test_vluxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv4i16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4218,8 +4218,8 @@ define @test_vluxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4231,8 +4231,8 @@ define @test_vluxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; 
CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4247,8 +4247,8 @@ define @test_vluxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4260,8 +4260,8 @@ define @test_vluxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4276,8 +4276,8 @@ define @test_vluxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4289,8 +4289,8 @@ define @test_vluxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4305,8 +4305,8 @@ define @test_vluxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4318,8 +4318,8 @@ define @test_vluxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i16_triscv.vector.tuple_nxv2i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4334,8 +4334,8 @@ define @test_vluxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4347,8 +4347,8 @@ define @test_vluxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4363,8 +4363,8 @@ define @test_vluxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4376,8 +4376,8 @@ define @test_vluxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4392,8 +4392,8 @@ define @test_vluxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4405,8 +4405,8 @@ define 
@test_vluxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4450,8 +4450,8 @@ define @test_vluxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4463,8 +4463,8 @@ define @test_vluxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4479,8 +4479,8 @@ define @test_vluxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4492,8 +4492,8 @@ define @test_vluxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv4i16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4595,8 +4595,8 @@ define @test_vluxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei16.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4608,8 +4608,8 @@ define @test_vluxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vluxseg3_mask_nxv8i16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei16.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4682,8 +4682,8 @@ define @test_vluxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4695,8 +4695,8 @@ define @test_vluxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4711,8 +4711,8 @@ define @test_vluxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4724,8 +4724,8 @@ define @test_vluxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4740,8 +4740,8 @@ define @test_vluxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4753,8 +4753,8 @@ define @test_vluxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4769,8 +4769,8 @@ define @test_vluxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4782,8 +4782,8 @@ define @test_vluxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i16_triscv.vector.tuple_nxv2i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4798,8 +4798,8 @@ define @test_vluxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4811,8 +4811,8 @@ define @test_vluxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4827,8 +4827,8 @@ define @test_vluxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4840,8 +4840,8 @@ define @test_vluxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4856,8 +4856,8 @@ define @test_vluxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4869,8 +4869,8 @@ define @test_vluxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4914,8 +4914,8 @@ define @test_vluxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4927,8 +4927,8 @@ define @test_vluxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -4943,8 +4943,8 @@ define 
@test_vluxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -4956,8 +4956,8 @@ define @test_vluxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv4i16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5059,8 +5059,8 @@ define @test_vluxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei16.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5072,8 +5072,8 @@ define @test_vluxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vluxseg4_mask_nxv8i16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei16.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5146,8 +5146,8 @@ define @test_vluxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5159,8 +5159,8 @@ define @test_vluxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5175,8 +5175,8 @@ define @test_vluxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5188,8 +5188,8 @@ define @test_vluxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5204,8 +5204,8 @@ define @test_vluxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5217,8 +5217,8 @@ define @test_vluxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5233,8 +5233,8 @@ define @test_vluxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5246,8 +5246,8 @@ define @test_vluxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i16_triscv.vector.tuple_nxv2i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5262,8 +5262,8 @@ define @test_vluxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5275,8 +5275,8 @@ define @test_vluxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5291,8 +5291,8 @@ define @test_vluxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5304,8 +5304,8 @@ define @test_vluxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5320,8 +5320,8 @@ define @test_vluxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5333,8 +5333,8 @@ define @test_vluxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5349,8 +5349,8 @@ define @test_vluxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5362,8 +5362,8 @@ define @test_vluxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i16_triscv.vector.tuple_nxv4i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5378,8 +5378,8 @@ define @test_vluxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5391,8 +5391,8 @@ define @test_vluxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5407,8 +5407,8 @@ define @test_vluxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5420,8 +5420,8 @@ define 
@test_vluxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5436,8 +5436,8 @@ define @test_vluxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5449,8 +5449,8 @@ define @test_vluxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv4i16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5494,8 +5494,8 @@ define @test_vluxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5507,8 +5507,8 @@ define @test_vluxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5523,8 +5523,8 @@ define @test_vluxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5536,8 +5536,8 @@ define @test_vluxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5552,8 +5552,8 @@ define @test_vluxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5565,8 +5565,8 @@ define @test_vluxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5581,8 +5581,8 @@ define @test_vluxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5594,8 +5594,8 @@ define @test_vluxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i16_triscv.vector.tuple_nxv2i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5610,8 +5610,8 @@ define @test_vluxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5623,8 +5623,8 @@ define @test_vluxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5639,8 +5639,8 @@ define @test_vluxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5652,8 +5652,8 @@ define @test_vluxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5668,8 +5668,8 @@ define @test_vluxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5681,8 +5681,8 @@ define @test_vluxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5697,8 +5697,8 @@ define @test_vluxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i64: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5710,8 +5710,8 @@ define @test_vluxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i16_triscv.vector.tuple_nxv4i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5726,8 +5726,8 @@ define @test_vluxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5739,8 +5739,8 @@ define @test_vluxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5755,8 +5755,8 @@ define @test_vluxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5768,8 +5768,8 @@ define @test_vluxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5784,8 +5784,8 @@ define 
@test_vluxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5797,8 +5797,8 @@ define @test_vluxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv4i16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5842,8 +5842,8 @@ define @test_vluxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5855,8 +5855,8 @@ define @test_vluxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5871,8 +5871,8 @@ define @test_vluxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5884,8 +5884,8 @@ define @test_vluxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5900,8 +5900,8 @@ define @test_vluxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5913,8 +5913,8 @@ define @test_vluxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5929,8 +5929,8 @@ define @test_vluxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5942,8 +5942,8 @@ define @test_vluxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i16_triscv.vector.tuple_nxv2i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5958,8 +5958,8 @@ define @test_vluxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -5971,8 +5971,8 @@ define @test_vluxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -5987,8 +5987,8 @@ define @test_vluxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6000,8 +6000,8 @@ define @test_vluxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6016,8 +6016,8 @@ define @test_vluxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6029,8 +6029,8 @@ define @test_vluxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6045,8 +6045,8 @@ define @test_vluxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6058,8 +6058,8 @@ define @test_vluxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i16_triscv.vector.tuple_nxv4i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry 
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6074,8 +6074,8 @@ define @test_vluxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6087,8 +6087,8 @@ define @test_vluxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6103,8 +6103,8 @@ define @test_vluxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6116,8 +6116,8 @@ define @test_vluxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6132,8 +6132,8 @@ define @test_vluxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6145,8 +6145,8 @@ define 
@test_vluxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv4i16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6190,8 +6190,8 @@ define @test_vluxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6203,8 +6203,8 @@ define @test_vluxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6219,8 +6219,8 @@ define @test_vluxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6232,8 +6232,8 @@ define @test_vluxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6248,8 +6248,8 @@ define @test_vluxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6261,8 +6261,8 @@ define @test_vluxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6277,8 +6277,8 @@ define @test_vluxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6290,8 +6290,8 @@ define @test_vluxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i16_triscv.vector.tuple_nxv2i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6306,8 +6306,8 @@ define @test_vluxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6319,8 +6319,8 @@ define @test_vluxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6335,8 +6335,8 @@ define @test_vluxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6348,8 +6348,8 @@ define @test_vluxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6364,8 +6364,8 @@ define @test_vluxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6377,8 +6377,8 @@ define @test_vluxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6393,8 +6393,8 @@ define @test_vluxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6406,8 +6406,8 @@ define @test_vluxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i16_triscv.vector.tuple_nxv4i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6422,8 +6422,8 @@ define @test_vluxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; 
CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6435,8 +6435,8 @@ define @test_vluxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6451,8 +6451,8 @@ define @test_vluxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6464,8 +6464,8 @@ define @test_vluxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6480,8 +6480,8 @@ define @test_vluxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -6493,8 +6493,8 @@ define @test_vluxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv4i16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -6538,8 +6538,8 @@ define 
@test_vluxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6551,8 +6551,8 @@ define @test_vluxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6567,8 +6567,8 @@ define @test_vluxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6580,8 +6580,8 @@ define @test_vluxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6596,8 +6596,8 @@ define @test_vluxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6609,8 +6609,8 @@ define @test_vluxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6625,8 +6625,8 @@ define @test_vluxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6638,8 +6638,8 @@ define @test_vluxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i32_triscv.vector.tuple_nxv4i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6654,8 +6654,8 @@ define @test_vluxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6667,8 +6667,8 @@ define @test_vluxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6683,8 +6683,8 @@ define @test_vluxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6696,8 +6696,8 @@ define @test_vluxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; 
CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -6712,8 +6712,8 @@ define @test_vluxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -6725,8 +6725,8 @@ define @test_vluxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2i32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7002,8 +7002,8 @@ define @test_vluxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7015,8 +7015,8 @@ define @test_vluxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7031,8 +7031,8 @@ define @test_vluxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7044,8 +7044,8 @@ define @test_vluxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, 
e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7060,8 +7060,8 @@ define @test_vluxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7073,8 +7073,8 @@ define @test_vluxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7089,8 +7089,8 @@ define @test_vluxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7102,8 +7102,8 @@ define @test_vluxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i32_triscv.vector.tuple_nxv4i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7118,8 +7118,8 @@ define @test_vluxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7131,8 +7131,8 @@ define @test_vluxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8 
; CHECK-LABEL: test_vluxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7147,8 +7147,8 @@ define @test_vluxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7160,8 +7160,8 @@ define @test_vluxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7176,8 +7176,8 @@ define @test_vluxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7189,8 +7189,8 @@ define @test_vluxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2i32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7292,8 +7292,8 @@ define @test_vluxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei32.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7305,8 +7305,8 @@ define @test_vluxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vluxseg3_mask_nxv4i32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei32.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7350,8 +7350,8 @@ define @test_vluxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7363,8 +7363,8 @@ define @test_vluxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7379,8 +7379,8 @@ define @test_vluxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7392,8 +7392,8 @@ define @test_vluxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7408,8 +7408,8 @@ define @test_vluxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7421,8 +7421,8 @@ define @test_vluxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7437,8 +7437,8 @@ define @test_vluxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7450,8 +7450,8 @@ define @test_vluxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i32_triscv.vector.tuple_nxv4i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7466,8 +7466,8 @@ define @test_vluxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7479,8 +7479,8 @@ define @test_vluxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7495,8 +7495,8 @@ define @test_vluxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: 
# %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7508,8 +7508,8 @@ define @test_vluxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7524,8 +7524,8 @@ define @test_vluxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7537,8 +7537,8 @@ define @test_vluxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2i32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7640,8 +7640,8 @@ define @test_vluxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei32.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7653,8 +7653,8 @@ define @test_vluxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vluxseg4_mask_nxv4i32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei32.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7698,8 +7698,8 @@ define 
@test_vluxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7711,8 +7711,8 @@ define @test_vluxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7727,8 +7727,8 @@ define @test_vluxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7740,8 +7740,8 @@ define @test_vluxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7756,8 +7756,8 @@ define @test_vluxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7769,8 +7769,8 @@ define @test_vluxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7785,8 +7785,8 @@ define @test_vluxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7798,8 +7798,8 @@ define @test_vluxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i32_triscv.vector.tuple_nxv4i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7814,8 +7814,8 @@ define @test_vluxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7827,8 +7827,8 @@ define @test_vluxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7843,8 +7843,8 @@ define @test_vluxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7856,8 +7856,8 @@ define @test_vluxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7872,8 +7872,8 @@ define @test_vluxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7885,8 +7885,8 @@ define @test_vluxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7901,8 +7901,8 @@ define @test_vluxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7914,8 +7914,8 @@ define @test_vluxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2i32_triscv.vector.tuple_nxv8i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7930,8 +7930,8 @@ define @test_vluxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7943,8 +7943,8 @@ define @test_vluxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7959,8 +7959,8 @@ define @test_vluxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -7972,8 +7972,8 @@ define @test_vluxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -7988,8 +7988,8 @@ define @test_vluxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8001,8 +8001,8 @@ define @test_vluxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8017,8 +8017,8 @@ define @test_vluxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8030,8 +8030,8 @@ define 
@test_vluxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i32_triscv.vector.tuple_nxv4i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8046,8 +8046,8 @@ define @test_vluxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8059,8 +8059,8 @@ define @test_vluxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8075,8 +8075,8 @@ define @test_vluxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8088,8 +8088,8 @@ define @test_vluxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8104,8 +8104,8 @@ define @test_vluxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8117,8 +8117,8 @@ define @test_vluxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8133,8 +8133,8 @@ define @test_vluxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8146,8 +8146,8 @@ define @test_vluxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2i32_triscv.vector.tuple_nxv8i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8162,8 +8162,8 @@ define @test_vluxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8175,8 +8175,8 @@ define @test_vluxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8191,8 +8191,8 @@ define @test_vluxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8204,8 +8204,8 @@ define @test_vluxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8220,8 +8220,8 @@ define @test_vluxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8233,8 +8233,8 @@ define @test_vluxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8249,8 +8249,8 @@ define @test_vluxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8262,8 +8262,8 @@ define @test_vluxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i32_triscv.vector.tuple_nxv4i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8278,8 +8278,8 @@ define @test_vluxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: 
# %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8291,8 +8291,8 @@ define @test_vluxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8307,8 +8307,8 @@ define @test_vluxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8320,8 +8320,8 @@ define @test_vluxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8336,8 +8336,8 @@ define @test_vluxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8349,8 +8349,8 @@ define @test_vluxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8365,8 +8365,8 @@ define 
@test_vluxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8378,8 +8378,8 @@ define @test_vluxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2i32_triscv.vector.tuple_nxv8i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8394,8 +8394,8 @@ define @test_vluxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8407,8 +8407,8 @@ define @test_vluxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8423,8 +8423,8 @@ define @test_vluxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8436,8 +8436,8 @@ define @test_vluxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8452,8 +8452,8 @@ define @test_vluxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8465,8 +8465,8 @@ define @test_vluxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8481,8 +8481,8 @@ define @test_vluxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8494,8 +8494,8 @@ define @test_vluxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i32_triscv.vector.tuple_nxv4i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8510,8 +8510,8 @@ define @test_vluxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8523,8 +8523,8 @@ define @test_vluxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8539,8 +8539,8 @@ define @test_vluxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8552,8 +8552,8 @@ define @test_vluxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8568,8 +8568,8 @@ define @test_vluxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8581,8 +8581,8 @@ define @test_vluxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8597,8 +8597,8 @@ define @test_vluxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -8610,8 +8610,8 @@ define @test_vluxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2i32_triscv.vector.tuple_nxv8i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -8626,8 +8626,8 @@ define @test_vluxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -8639,8 +8639,8 @@ define @test_vluxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -8655,8 +8655,8 @@ define @test_vluxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -8668,8 +8668,8 @@ define @test_vluxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -8684,8 +8684,8 @@ define @test_vluxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -8697,8 +8697,8 @@ define 
@test_vluxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -8713,8 +8713,8 @@ define @test_vluxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_n ; CHECK-LABEL: test_vluxseg2_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -8726,8 +8726,8 @@ define @test_vluxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg2_mask_nxv1i64_triscv.vector.tuple_nxv8i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -8974,8 +8974,8 @@ define @test_vluxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -8987,8 +8987,8 @@ define @test_vluxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9003,8 +9003,8 @@ define @test_vluxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9016,8 +9016,8 @@ define @test_vluxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9032,8 +9032,8 @@ define @test_vluxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9045,8 +9045,8 @@ define @test_vluxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9061,8 +9061,8 @@ define @test_vluxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_n ; CHECK-LABEL: test_vluxseg3_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9074,8 +9074,8 @@ define @test_vluxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg3_mask_nxv1i64_triscv.vector.tuple_nxv8i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9177,8 +9177,8 @@ define @test_vluxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; 
CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei64.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9190,8 +9190,8 @@ define @test_vluxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vluxseg3_mask_nxv2i64_triscv.vector.tuple_nxv16i8_3t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei64.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9206,8 +9206,8 @@ define @test_vluxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9219,8 +9219,8 @@ define @test_vluxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9235,8 +9235,8 @@ define @test_vluxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9248,8 +9248,8 @@ define @test_vluxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9264,8 +9264,8 @@ define @test_vluxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: 
# %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9277,8 +9277,8 @@ define @test_vluxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9293,8 +9293,8 @@ define @test_vluxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_n ; CHECK-LABEL: test_vluxseg4_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9306,8 +9306,8 @@ define @test_vluxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg4_mask_nxv1i64_triscv.vector.tuple_nxv8i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9409,8 +9409,8 @@ define @test_vluxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei64.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9422,8 +9422,8 @@ define @test_vluxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i ; CHECK-LABEL: test_vluxseg4_mask_nxv2i64_triscv.vector.tuple_nxv16i8_4t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei64.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9438,8 +9438,8 @@ define 
@test_vluxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9451,8 +9451,8 @@ define @test_vluxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9467,8 +9467,8 @@ define @test_vluxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9480,8 +9480,8 @@ define @test_vluxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9496,8 +9496,8 @@ define @test_vluxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9509,8 +9509,8 @@ define @test_vluxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9525,8 +9525,8 @@ define @test_vluxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_n ; CHECK-LABEL: test_vluxseg5_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9538,8 +9538,8 @@ define @test_vluxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg5_mask_nxv1i64_triscv.vector.tuple_nxv8i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9554,8 +9554,8 @@ define @test_vluxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9567,8 +9567,8 @@ define @test_vluxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9583,8 +9583,8 @@ define @test_vluxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9596,8 +9596,8 @@ define @test_vluxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9612,8 +9612,8 @@ define @test_vluxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9625,8 +9625,8 @@ define @test_vluxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9641,8 +9641,8 @@ define @test_vluxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_n ; CHECK-LABEL: test_vluxseg6_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9654,8 +9654,8 @@ define @test_vluxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg6_mask_nxv1i64_triscv.vector.tuple_nxv8i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9670,8 +9670,8 @@ define @test_vluxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9683,8 +9683,8 @@ define @test_vluxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9699,8 +9699,8 @@ define @test_vluxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9712,8 +9712,8 @@ define @test_vluxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9728,8 +9728,8 @@ define @test_vluxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9741,8 +9741,8 @@ define @test_vluxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9757,8 +9757,8 @@ define @test_vluxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_n ; CHECK-LABEL: test_vluxseg7_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9770,8 +9770,8 @@ define 
@test_vluxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg7_mask_nxv1i64_triscv.vector.tuple_nxv8i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9786,8 +9786,8 @@ define @test_vluxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9799,8 +9799,8 @@ define @test_vluxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9815,8 +9815,8 @@ define @test_vluxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9828,8 +9828,8 @@ define @test_vluxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9844,8 +9844,8 @@ define @test_vluxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9857,8 +9857,8 @@ define @test_vluxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9873,8 +9873,8 @@ define @test_vluxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_n ; CHECK-LABEL: test_vluxseg8_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 6) @@ -9886,8 +9886,8 @@ define @test_vluxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8 ; CHECK-LABEL: test_vluxseg8_mask_nxv1i64_triscv.vector.tuple_nxv8i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -9900,8 +9900,8 @@ define @test_vluxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -9913,8 +9913,8 @@ define @test_vluxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -9927,8 +9927,8 @@ define @test_vluxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -9940,8 +9940,8 @@ define @test_vluxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -9954,8 +9954,8 @@ define @test_vluxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -9967,8 +9967,8 @@ define @test_vluxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -9981,8 +9981,8 @@ define @test_vluxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -9994,8 +9994,8 @@ define @test_vluxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg2_mask_nxv1f16_triscv.vector.tuple_nxv2i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10008,8 +10008,8 @@ define @test_vluxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10021,8 +10021,8 @@ define @test_vluxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10035,8 +10035,8 @@ define @test_vluxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10048,8 +10048,8 @@ define @test_vluxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10062,8 +10062,8 @@ define @test_vluxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10075,8 +10075,8 @@ define @test_vluxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg2_mask_nxv2f16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10116,8 +10116,8 @@ define 
@test_vluxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10129,8 +10129,8 @@ define @test_vluxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10143,8 +10143,8 @@ define @test_vluxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_ ; CHECK-LABEL: test_vluxseg2_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10156,8 +10156,8 @@ define @test_vluxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg2_mask_nxv4f16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10413,8 +10413,8 @@ define @test_vluxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10426,8 +10426,8 @@ define @test_vluxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10440,8 +10440,8 @@ define @test_vluxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10453,8 +10453,8 @@ define @test_vluxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10467,8 +10467,8 @@ define @test_vluxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10480,8 +10480,8 @@ define @test_vluxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10494,8 +10494,8 @@ define @test_vluxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10507,8 +10507,8 @@ define @test_vluxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg3_mask_nxv1f16_triscv.vector.tuple_nxv2i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10521,8 +10521,8 @@ define @test_vluxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10534,8 +10534,8 @@ define @test_vluxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10548,8 +10548,8 @@ define @test_vluxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10561,8 +10561,8 @@ define @test_vluxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10575,8 +10575,8 @@ define @test_vluxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10588,8 +10588,8 @@ define @test_vluxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg3_mask_nxv2f16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10629,8 +10629,8 @@ define @test_vluxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10642,8 +10642,8 @@ define @test_vluxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10656,8 +10656,8 @@ define @test_vluxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_ ; CHECK-LABEL: test_vluxseg3_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10669,8 +10669,8 @@ define @test_vluxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg3_mask_nxv4f16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10764,8 +10764,8 @@ define @test_vluxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t ; CHECK-LABEL: test_vluxseg3_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei16.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10777,8 +10777,8 @@ define 
@test_vluxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16 ; CHECK-LABEL: test_vluxseg3_mask_nxv8f16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei16.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10845,8 +10845,8 @@ define @test_vluxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10858,8 +10858,8 @@ define @test_vluxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10872,8 +10872,8 @@ define @test_vluxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10885,8 +10885,8 @@ define @test_vluxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10899,8 +10899,8 @@ define @test_vluxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10912,8 +10912,8 @@ define @test_vluxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10926,8 +10926,8 @@ define @test_vluxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10939,8 +10939,8 @@ define @test_vluxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg4_mask_nxv1f16_triscv.vector.tuple_nxv2i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10953,8 +10953,8 @@ define @test_vluxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10966,8 +10966,8 @@ define @test_vluxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -10980,8 +10980,8 @@ define @test_vluxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -10993,8 +10993,8 @@ define @test_vluxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11007,8 +11007,8 @@ define @test_vluxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11020,8 +11020,8 @@ define @test_vluxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg4_mask_nxv2f16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11061,8 +11061,8 @@ define @test_vluxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11074,8 +11074,8 @@ define @test_vluxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11088,8 +11088,8 @@ define @test_vluxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_ ; CHECK-LABEL: test_vluxseg4_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11101,8 +11101,8 @@ define @test_vluxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg4_mask_nxv4f16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11196,8 +11196,8 @@ define @test_vluxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t ; CHECK-LABEL: test_vluxseg4_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei16.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11209,8 +11209,8 @@ define @test_vluxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16 ; CHECK-LABEL: test_vluxseg4_mask_nxv8f16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei16.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11277,8 +11277,8 @@ define @test_vluxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11290,8 +11290,8 @@ define @test_vluxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11304,8 +11304,8 @@ 
define @test_vluxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11317,8 +11317,8 @@ define @test_vluxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11331,8 +11331,8 @@ define @test_vluxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11344,8 +11344,8 @@ define @test_vluxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11358,8 +11358,8 @@ define @test_vluxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11371,8 +11371,8 @@ define @test_vluxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg5_mask_nxv1f16_triscv.vector.tuple_nxv2i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11385,8 +11385,8 @@ define @test_vluxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11398,8 +11398,8 @@ define @test_vluxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11412,8 +11412,8 @@ define @test_vluxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11425,8 +11425,8 @@ define @test_vluxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11439,8 +11439,8 @@ define @test_vluxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11452,8 +11452,8 @@ define @test_vluxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11466,8 +11466,8 @@ define @test_vluxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11479,8 +11479,8 @@ define @test_vluxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg5_mask_nxv2f16_triscv.vector.tuple_nxv4i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11493,8 +11493,8 @@ define @test_vluxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11506,8 +11506,8 @@ define @test_vluxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11520,8 +11520,8 @@ define @test_vluxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11533,8 +11533,8 @@ define @test_vluxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: 
# %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11547,8 +11547,8 @@ define @test_vluxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_ ; CHECK-LABEL: test_vluxseg5_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11560,8 +11560,8 @@ define @test_vluxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg5_mask_nxv4f16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11601,8 +11601,8 @@ define @test_vluxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11614,8 +11614,8 @@ define @test_vluxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11628,8 +11628,8 @@ define @test_vluxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11641,8 +11641,8 @@ define 
@test_vluxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11655,8 +11655,8 @@ define @test_vluxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11668,8 +11668,8 @@ define @test_vluxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11682,8 +11682,8 @@ define @test_vluxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11695,8 +11695,8 @@ define @test_vluxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg6_mask_nxv1f16_triscv.vector.tuple_nxv2i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11709,8 +11709,8 @@ define @test_vluxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11722,8 +11722,8 @@ define @test_vluxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11736,8 +11736,8 @@ define @test_vluxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11749,8 +11749,8 @@ define @test_vluxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11763,8 +11763,8 @@ define @test_vluxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11776,8 +11776,8 @@ define @test_vluxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11790,8 +11790,8 @@ define @test_vluxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; 
CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11803,8 +11803,8 @@ define @test_vluxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg6_mask_nxv2f16_triscv.vector.tuple_nxv4i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11817,8 +11817,8 @@ define @test_vluxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11830,8 +11830,8 @@ define @test_vluxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11844,8 +11844,8 @@ define @test_vluxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11857,8 +11857,8 @@ define @test_vluxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11871,8 +11871,8 @@ define @test_vluxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_ ; CHECK-LABEL: test_vluxseg6_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11884,8 +11884,8 @@ define @test_vluxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg6_mask_nxv4f16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11925,8 +11925,8 @@ define @test_vluxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11938,8 +11938,8 @@ define @test_vluxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11952,8 +11952,8 @@ define @test_vluxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11965,8 +11965,8 @@ define @test_vluxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -11979,8 +11979,8 @@ 
define @test_vluxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -11992,8 +11992,8 @@ define @test_vluxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12006,8 +12006,8 @@ define @test_vluxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12019,8 +12019,8 @@ define @test_vluxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg7_mask_nxv1f16_triscv.vector.tuple_nxv2i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12033,8 +12033,8 @@ define @test_vluxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12046,8 +12046,8 @@ define @test_vluxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12060,8 +12060,8 @@ define @test_vluxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12073,8 +12073,8 @@ define @test_vluxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12087,8 +12087,8 @@ define @test_vluxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12100,8 +12100,8 @@ define @test_vluxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12114,8 +12114,8 @@ define @test_vluxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12127,8 +12127,8 @@ define @test_vluxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg7_mask_nxv2f16_triscv.vector.tuple_nxv4i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; 
CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12141,8 +12141,8 @@ define @test_vluxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12154,8 +12154,8 @@ define @test_vluxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12168,8 +12168,8 @@ define @test_vluxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12181,8 +12181,8 @@ define @test_vluxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12195,8 +12195,8 @@ define @test_vluxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_ ; CHECK-LABEL: test_vluxseg7_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12208,8 +12208,8 @@ define @test_vluxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg7_mask_nxv4f16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12249,8 +12249,8 @@ define @test_vluxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12262,8 +12262,8 @@ define @test_vluxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12276,8 +12276,8 @@ define @test_vluxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12289,8 +12289,8 @@ define @test_vluxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12303,8 +12303,8 @@ define @test_vluxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12316,8 +12316,8 @@ define 
@test_vluxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12330,8 +12330,8 @@ define @test_vluxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12343,8 +12343,8 @@ define @test_vluxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i ; CHECK-LABEL: test_vluxseg8_mask_nxv1f16_triscv.vector.tuple_nxv2i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12357,8 +12357,8 @@ define @test_vluxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12370,8 +12370,8 @@ define @test_vluxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12384,8 +12384,8 @@ define @test_vluxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12397,8 +12397,8 @@ define @test_vluxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12411,8 +12411,8 @@ define @test_vluxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12424,8 +12424,8 @@ define @test_vluxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12438,8 +12438,8 @@ define @test_vluxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12451,8 +12451,8 @@ define @test_vluxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i ; CHECK-LABEL: test_vluxseg8_mask_nxv2f16_triscv.vector.tuple_nxv4i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12465,8 +12465,8 @@ define @test_vluxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 
+; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12478,8 +12478,8 @@ define @test_vluxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12492,8 +12492,8 @@ define @test_vluxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12505,8 +12505,8 @@ define @test_vluxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12519,8 +12519,8 @@ define @test_vluxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_ ; CHECK-LABEL: test_vluxseg8_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -12532,8 +12532,8 @@ define @test_vluxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i ; CHECK-LABEL: test_vluxseg8_mask_nxv4f16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -12573,8 +12573,8 @@ define @test_vluxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t ; CHECK-LABEL: test_vluxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12586,8 +12586,8 @@ define @test_vluxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12600,8 +12600,8 @@ define @test_vluxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t ; CHECK-LABEL: test_vluxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12613,8 +12613,8 @@ define @test_vluxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12627,8 +12627,8 @@ define @test_vluxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t ; CHECK-LABEL: test_vluxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12640,8 +12640,8 @@ define @test_vluxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12654,8 +12654,8 @@ define 
@test_vluxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t ; CHECK-LABEL: test_vluxseg2_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12667,8 +12667,8 @@ define @test_vluxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg2_mask_nxv1f32_triscv.vector.tuple_nxv4i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12681,8 +12681,8 @@ define @test_vluxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t ; CHECK-LABEL: test_vluxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12694,8 +12694,8 @@ define @test_vluxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12708,8 +12708,8 @@ define @test_vluxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t ; CHECK-LABEL: test_vluxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12721,8 +12721,8 @@ define @test_vluxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) 
@llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -12735,8 +12735,8 @@ define @test_vluxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t ; CHECK-LABEL: test_vluxseg2_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) @@ -12748,8 +12748,8 @@ define @test_vluxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg2_mask_nxv2f32_triscv.vector.tuple_nxv8i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13005,8 +13005,8 @@ define @test_vluxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t ; CHECK-LABEL: test_vluxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13018,8 +13018,8 @@ define @test_vluxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13032,8 +13032,8 @@ define @test_vluxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t ; CHECK-LABEL: test_vluxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13045,8 +13045,8 @@ define @test_vluxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13059,8 +13059,8 @@ define @test_vluxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t ; CHECK-LABEL: test_vluxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13072,8 +13072,8 @@ define @test_vluxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13086,8 +13086,8 @@ define @test_vluxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t ; CHECK-LABEL: test_vluxseg3_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13099,8 +13099,8 @@ define @test_vluxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg3_mask_nxv1f32_triscv.vector.tuple_nxv4i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13113,8 +13113,8 @@ define @test_vluxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t ; CHECK-LABEL: test_vluxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13126,8 +13126,8 @@ define @test_vluxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: 
vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13140,8 +13140,8 @@ define @test_vluxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t ; CHECK-LABEL: test_vluxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13153,8 +13153,8 @@ define @test_vluxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13167,8 +13167,8 @@ define @test_vluxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t ; CHECK-LABEL: test_vluxseg3_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13180,8 +13180,8 @@ define @test_vluxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg3_mask_nxv2f32_triscv.vector.tuple_nxv8i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13275,8 +13275,8 @@ define @test_vluxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3 ; CHECK-LABEL: test_vluxseg3_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei32.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13288,8 +13288,8 @@ define 
@test_vluxseg3_mask_nxv4f32_triscv.vector.tuple_nxv1 ; CHECK-LABEL: test_vluxseg3_mask_nxv4f32_triscv.vector.tuple_nxv16i8_3t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei32.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13329,8 +13329,8 @@ define @test_vluxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t ; CHECK-LABEL: test_vluxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13342,8 +13342,8 @@ define @test_vluxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13356,8 +13356,8 @@ define @test_vluxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t ; CHECK-LABEL: test_vluxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13369,8 +13369,8 @@ define @test_vluxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13383,8 +13383,8 @@ define @test_vluxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t ; CHECK-LABEL: test_vluxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) 
@llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13396,8 +13396,8 @@ define @test_vluxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13410,8 +13410,8 @@ define @test_vluxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t ; CHECK-LABEL: test_vluxseg4_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13423,8 +13423,8 @@ define @test_vluxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg4_mask_nxv1f32_triscv.vector.tuple_nxv4i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13437,8 +13437,8 @@ define @test_vluxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t ; CHECK-LABEL: test_vluxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13450,8 +13450,8 @@ define @test_vluxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13464,8 +13464,8 @@ define @test_vluxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t ; CHECK-LABEL: test_vluxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13477,8 +13477,8 @@ define @test_vluxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13491,8 +13491,8 @@ define @test_vluxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t ; CHECK-LABEL: test_vluxseg4_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13504,8 +13504,8 @@ define @test_vluxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg4_mask_nxv2f32_triscv.vector.tuple_nxv8i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13599,8 +13599,8 @@ define @test_vluxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4 ; CHECK-LABEL: test_vluxseg4_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei32.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13612,8 +13612,8 @@ define @test_vluxseg4_mask_nxv4f32_triscv.vector.tuple_nxv1 ; CHECK-LABEL: test_vluxseg4_mask_nxv4f32_triscv.vector.tuple_nxv16i8_4t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei32.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13653,8 +13653,8 @@ define @test_vluxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t ; CHECK-LABEL: test_vluxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13666,8 +13666,8 @@ define @test_vluxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13680,8 +13680,8 @@ define @test_vluxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t ; CHECK-LABEL: test_vluxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13693,8 +13693,8 @@ define @test_vluxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13707,8 +13707,8 @@ define @test_vluxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t ; CHECK-LABEL: test_vluxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13720,8 +13720,8 @@ define @test_vluxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13734,8 +13734,8 @@ define 
@test_vluxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t ; CHECK-LABEL: test_vluxseg5_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13747,8 +13747,8 @@ define @test_vluxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg5_mask_nxv1f32_triscv.vector.tuple_nxv4i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13761,8 +13761,8 @@ define @test_vluxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t ; CHECK-LABEL: test_vluxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13774,8 +13774,8 @@ define @test_vluxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13788,8 +13788,8 @@ define @test_vluxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t ; CHECK-LABEL: test_vluxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13801,8 +13801,8 @@ define @test_vluxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13815,8 +13815,8 @@ define @test_vluxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t ; CHECK-LABEL: test_vluxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13828,8 +13828,8 @@ define @test_vluxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13842,8 +13842,8 @@ define @test_vluxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t ; CHECK-LABEL: test_vluxseg5_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13855,8 +13855,8 @@ define @test_vluxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg5_mask_nxv2f32_triscv.vector.tuple_nxv8i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13869,8 +13869,8 @@ define @test_vluxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t ; CHECK-LABEL: test_vluxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13882,8 +13882,8 @@ define @test_vluxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13896,8 +13896,8 @@ define @test_vluxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t ; CHECK-LABEL: test_vluxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13909,8 +13909,8 @@ define @test_vluxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13923,8 +13923,8 @@ define @test_vluxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t ; CHECK-LABEL: test_vluxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13936,8 +13936,8 @@ define @test_vluxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13950,8 +13950,8 @@ define @test_vluxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t ; CHECK-LABEL: test_vluxseg6_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13963,8 +13963,8 @@ define @test_vluxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg6_mask_nxv1f32_triscv.vector.tuple_nxv4i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -13977,8 +13977,8 @@ define @test_vluxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t ; CHECK-LABEL: test_vluxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -13990,8 +13990,8 @@ define @test_vluxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14004,8 +14004,8 @@ define @test_vluxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t ; CHECK-LABEL: test_vluxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14017,8 +14017,8 @@ define @test_vluxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14031,8 +14031,8 @@ define @test_vluxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t ; CHECK-LABEL: test_vluxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14044,8 +14044,8 @@ define 
@test_vluxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14058,8 +14058,8 @@ define @test_vluxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t ; CHECK-LABEL: test_vluxseg6_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14071,8 +14071,8 @@ define @test_vluxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg6_mask_nxv2f32_triscv.vector.tuple_nxv8i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14085,8 +14085,8 @@ define @test_vluxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t ; CHECK-LABEL: test_vluxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14098,8 +14098,8 @@ define @test_vluxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14112,8 +14112,8 @@ define @test_vluxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t ; CHECK-LABEL: test_vluxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14125,8 +14125,8 @@ define @test_vluxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14139,8 +14139,8 @@ define @test_vluxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t ; CHECK-LABEL: test_vluxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14152,8 +14152,8 @@ define @test_vluxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14166,8 +14166,8 @@ define @test_vluxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t ; CHECK-LABEL: test_vluxseg7_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14179,8 +14179,8 @@ define @test_vluxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg7_mask_nxv1f32_triscv.vector.tuple_nxv4i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14193,8 +14193,8 @@ define @test_vluxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t ; CHECK-LABEL: test_vluxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14206,8 +14206,8 @@ define @test_vluxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14220,8 +14220,8 @@ define @test_vluxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t ; CHECK-LABEL: test_vluxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14233,8 +14233,8 @@ define @test_vluxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14247,8 +14247,8 @@ define @test_vluxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t ; CHECK-LABEL: test_vluxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14260,8 +14260,8 @@ define @test_vluxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14274,8 +14274,8 @@ define @test_vluxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t ; CHECK-LABEL: test_vluxseg7_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i64: ; CHECK: # %bb.0: # 
%entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14287,8 +14287,8 @@ define @test_vluxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg7_mask_nxv2f32_triscv.vector.tuple_nxv8i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14301,8 +14301,8 @@ define @test_vluxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t ; CHECK-LABEL: test_vluxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14314,8 +14314,8 @@ define @test_vluxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14328,8 +14328,8 @@ define @test_vluxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t ; CHECK-LABEL: test_vluxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14341,8 +14341,8 @@ define @test_vluxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14355,8 +14355,8 @@ define 
@test_vluxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t ; CHECK-LABEL: test_vluxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14368,8 +14368,8 @@ define @test_vluxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14382,8 +14382,8 @@ define @test_vluxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t ; CHECK-LABEL: test_vluxseg8_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14395,8 +14395,8 @@ define @test_vluxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4 ; CHECK-LABEL: test_vluxseg8_mask_nxv1f32_triscv.vector.tuple_nxv4i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14409,8 +14409,8 @@ define @test_vluxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t ; CHECK-LABEL: test_vluxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14422,8 +14422,8 @@ define @test_vluxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14436,8 +14436,8 @@ define @test_vluxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t ; CHECK-LABEL: test_vluxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14449,8 +14449,8 @@ define @test_vluxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14463,8 +14463,8 @@ define @test_vluxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t ; CHECK-LABEL: test_vluxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14476,8 +14476,8 @@ define @test_vluxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14490,8 +14490,8 @@ define @test_vluxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t ; CHECK-LABEL: test_vluxseg8_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) @@ -14503,8 +14503,8 @@ define @test_vluxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8 ; CHECK-LABEL: test_vluxseg8_mask_nxv2f32_triscv.vector.tuple_nxv8i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: 
vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) @@ -14517,8 +14517,8 @@ define @test_vluxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vluxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14530,8 +14530,8 @@ define @test_vluxseg2_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14544,8 +14544,8 @@ define @test_vluxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vluxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14557,8 +14557,8 @@ define @test_vluxseg2_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14571,8 +14571,8 @@ define @test_vluxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vluxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14584,8 +14584,8 @@ define @test_vluxseg2_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, 
a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14598,8 +14598,8 @@ define @test_vluxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2 ; CHECK-LABEL: test_vluxseg2_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14611,8 +14611,8 @@ define @test_vluxseg2_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg2_mask_nxv1f64_triscv.vector.tuple_nxv8i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14841,8 +14841,8 @@ define @test_vluxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vluxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14854,8 +14854,8 @@ define @test_vluxseg3_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14868,8 +14868,8 @@ define @test_vluxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vluxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14881,8 +14881,8 @@ define @test_vluxseg3_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: 
test_vluxseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14895,8 +14895,8 @@ define @test_vluxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vluxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14908,8 +14908,8 @@ define @test_vluxseg3_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -14922,8 +14922,8 @@ define @test_vluxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3 ; CHECK-LABEL: test_vluxseg3_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 6) @@ -14935,8 +14935,8 @@ define @test_vluxseg3_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg3_mask_nxv1f64_triscv.vector.tuple_nxv8i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15030,8 +15030,8 @@ define @test_vluxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_ ; CHECK-LABEL: test_vluxseg3_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei64.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv2i64(target("riscv.vector.tuple", , 3) 
poison, ptr %base, %index, i64 %vl, i64 6) @@ -15043,8 +15043,8 @@ define @test_vluxseg3_mask_nxv2f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg3_mask_nxv2f64_triscv.vector.tuple_nxv16i8_3t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei64.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15057,8 +15057,8 @@ define @test_vluxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vluxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15070,8 +15070,8 @@ define @test_vluxseg4_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15084,8 +15084,8 @@ define @test_vluxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vluxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15097,8 +15097,8 @@ define @test_vluxseg4_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15111,8 +15111,8 @@ define @test_vluxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vluxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15124,8 +15124,8 @@ define @test_vluxseg4_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15138,8 +15138,8 @@ define @test_vluxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4 ; CHECK-LABEL: test_vluxseg4_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15151,8 +15151,8 @@ define @test_vluxseg4_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg4_mask_nxv1f64_triscv.vector.tuple_nxv8i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15246,8 +15246,8 @@ define @test_vluxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_ ; CHECK-LABEL: test_vluxseg4_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei64.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv2i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15259,8 +15259,8 @@ define @test_vluxseg4_mask_nxv2f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg4_mask_nxv2f64_triscv.vector.tuple_nxv16i8_4t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei64.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15273,8 +15273,8 @@ define @test_vluxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vluxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; 
CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15286,8 +15286,8 @@ define @test_vluxseg5_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15300,8 +15300,8 @@ define @test_vluxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vluxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15313,8 +15313,8 @@ define @test_vluxseg5_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15327,8 +15327,8 @@ define @test_vluxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: test_vluxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15340,8 +15340,8 @@ define @test_vluxseg5_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15354,8 +15354,8 @@ define @test_vluxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5 ; CHECK-LABEL: 
test_vluxseg5_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15367,8 +15367,8 @@ define @test_vluxseg5_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg5_mask_nxv1f64_triscv.vector.tuple_nxv8i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15381,8 +15381,8 @@ define @test_vluxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vluxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15394,8 +15394,8 @@ define @test_vluxseg6_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15408,8 +15408,8 @@ define @test_vluxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vluxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15421,8 +15421,8 @@ define @test_vluxseg6_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, 
%mask, i64 %vl, i64 1, i64 6) @@ -15435,8 +15435,8 @@ define @test_vluxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vluxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15448,8 +15448,8 @@ define @test_vluxseg6_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15462,8 +15462,8 @@ define @test_vluxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6 ; CHECK-LABEL: test_vluxseg6_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15475,8 +15475,8 @@ define @test_vluxseg6_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg6_mask_nxv1f64_triscv.vector.tuple_nxv8i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15489,8 +15489,8 @@ define @test_vluxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vluxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15502,8 +15502,8 @@ define @test_vluxseg7_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15516,8 +15516,8 @@ define @test_vluxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vluxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15529,8 +15529,8 @@ define @test_vluxseg7_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15543,8 +15543,8 @@ define @test_vluxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vluxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15556,8 +15556,8 @@ define @test_vluxseg7_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15570,8 +15570,8 @@ define @test_vluxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7 ; CHECK-LABEL: test_vluxseg7_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15583,8 +15583,8 @@ define @test_vluxseg7_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg7_mask_nxv1f64_triscv.vector.tuple_nxv8i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v 
v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15597,8 +15597,8 @@ define @test_vluxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vluxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15610,8 +15610,8 @@ define @test_vluxseg8_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15624,8 +15624,8 @@ define @test_vluxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vluxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15637,8 +15637,8 @@ define @test_vluxseg8_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15651,8 +15651,8 @@ define @test_vluxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vluxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15664,8 +15664,8 @@ define @test_vluxseg8_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli 
zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15678,8 +15678,8 @@ define @test_vluxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8 ; CHECK-LABEL: test_vluxseg8_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 6) @@ -15691,8 +15691,8 @@ define @test_vluxseg8_mask_nxv1f64_triscv.vector.tuple_nxv ; CHECK-LABEL: test_vluxseg8_mask_nxv1f64_triscv.vector.tuple_nxv8i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 6) @@ -15705,8 +15705,8 @@ define @test_vluxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15718,8 +15718,8 @@ define @test_vluxseg2_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15732,8 +15732,8 @@ define @test_vluxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15745,8 +15745,8 @@ define @test_vluxseg2_mask_nxv1bf16_triscv.vector.tuple_nx ; 
CHECK-LABEL: test_vluxseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15759,8 +15759,8 @@ define @test_vluxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15772,8 +15772,8 @@ define @test_vluxseg2_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15786,8 +15786,8 @@ define @test_vluxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg2_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv2i8_2t.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15799,8 +15799,8 @@ define @test_vluxseg2_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_2t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg2ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei64.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv2i8_2t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15813,8 +15813,8 @@ define @test_vluxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i8(target("riscv.vector.tuple", , 2) 
poison, ptr %base, %index, i64 %vl, i64 4) @@ -15826,8 +15826,8 @@ define @test_vluxseg2_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15840,8 +15840,8 @@ define @test_vluxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15853,8 +15853,8 @@ define @test_vluxseg2_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15867,8 +15867,8 @@ define @test_vluxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg2_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15880,8 +15880,8 @@ define @test_vluxseg2_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_2t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg2ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei32.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15921,8 +15921,8 @@ define @test_vluxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15934,8 +15934,8 @@ define @test_vluxseg2_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei8.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -15948,8 +15948,8 @@ define @test_vluxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg2_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv8i8_2t.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 4) @@ -15961,8 +15961,8 @@ define @test_vluxseg2_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg2_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_2t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg2ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vluxseg2ei16.v v7, (a0), v9, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv8i8_2t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16218,8 +16218,8 @@ define @test_vluxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16231,8 +16231,8 @@ define @test_vluxseg3_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16245,8 +16245,8 @@ define @test_vluxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v 
v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16258,8 +16258,8 @@ define @test_vluxseg3_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16272,8 +16272,8 @@ define @test_vluxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16285,8 +16285,8 @@ define @test_vluxseg3_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16299,8 +16299,8 @@ define @test_vluxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg3_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv2i8_3t.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16312,8 +16312,8 @@ define @test_vluxseg3_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_3t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg3ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei64.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv2i8_3t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16326,8 +16326,8 @@ define @test_vluxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: 
test_vluxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16339,8 +16339,8 @@ define @test_vluxseg3_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16353,8 +16353,8 @@ define @test_vluxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16366,8 +16366,8 @@ define @test_vluxseg3_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16380,8 +16380,8 @@ define @test_vluxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg3_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16393,8 +16393,8 @@ define @test_vluxseg3_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_3t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg3ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei32.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 3) poison, ptr %base, 
%index, %mask, i64 %vl, i64 1, i64 4) @@ -16434,8 +16434,8 @@ define @test_vluxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16447,8 +16447,8 @@ define @test_vluxseg3_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei8.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16461,8 +16461,8 @@ define @test_vluxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg3_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv8i8_3t.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16474,8 +16474,8 @@ define @test_vluxseg3_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_3t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vluxseg3ei16.v v7, (a0), v10, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv8i8_3t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16569,8 +16569,8 @@ define @test_vluxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8 ; CHECK-LABEL: test_vluxseg3_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei16.v v6, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv16i8_3t.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16582,8 +16582,8 @@ define @test_vluxseg3_mask_nxv8bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg3_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_3t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg3ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg3ei16.v v6, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call 
target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv16i8_3t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16650,8 +16650,8 @@ define @test_vluxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16663,8 +16663,8 @@ define @test_vluxseg4_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16677,8 +16677,8 @@ define @test_vluxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16690,8 +16690,8 @@ define @test_vluxseg4_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16704,8 +16704,8 @@ define @test_vluxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16717,8 +16717,8 @@ define @test_vluxseg4_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: 
vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16731,8 +16731,8 @@ define @test_vluxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg4_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv2i8_4t.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16744,8 +16744,8 @@ define @test_vluxseg4_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_4t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg4ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei64.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv2i8_4t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16758,8 +16758,8 @@ define @test_vluxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16771,8 +16771,8 @@ define @test_vluxseg4_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16785,8 +16785,8 @@ define @test_vluxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16798,8 +16798,8 @@ define @test_vluxseg4_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i16: ; 
CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16812,8 +16812,8 @@ define @test_vluxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg4_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16825,8 +16825,8 @@ define @test_vluxseg4_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_4t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg4ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei32.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16866,8 +16866,8 @@ define @test_vluxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16879,8 +16879,8 @@ define @test_vluxseg4_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei8.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -16893,8 +16893,8 @@ define @test_vluxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg4_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv8i8_4t.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -16906,8 +16906,8 @@ define 
@test_vluxseg4_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_4t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v11, v8 +; CHECK-NEXT: vluxseg4ei16.v v7, (a0), v11, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv8i8_4t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17001,8 +17001,8 @@ define @test_vluxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8 ; CHECK-LABEL: test_vluxseg4_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v10, (a0), v8 -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei16.v v6, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv16i8_4t.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17014,8 +17014,8 @@ define @test_vluxseg4_mask_nxv8bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg4_mask_nxv8bf16_triscv.vector.tuple_nxv16i8_4t_nxv8i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma -; CHECK-NEXT: vluxseg4ei16.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg4ei16.v v6, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv16i8_4t.nxv8i1.nxv8i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17082,8 +17082,8 @@ define @test_vluxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17095,8 +17095,8 @@ define @test_vluxseg5_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17109,8 +17109,8 @@ define @test_vluxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17122,8 +17122,8 @@ define @test_vluxseg5_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17136,8 +17136,8 @@ define @test_vluxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17149,8 +17149,8 @@ define @test_vluxseg5_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17163,8 +17163,8 @@ define @test_vluxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg5_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv2i8_5t.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17176,8 +17176,8 @@ define @test_vluxseg5_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_5t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv2i8_5t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17190,8 +17190,8 @@ define @test_vluxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17203,8 +17203,8 @@ define @test_vluxseg5_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17217,8 +17217,8 @@ define @test_vluxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17230,8 +17230,8 @@ define @test_vluxseg5_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17244,8 +17244,8 @@ define @test_vluxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17257,8 +17257,8 @@ define @test_vluxseg5_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17271,8 +17271,8 @@ define @test_vluxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg5_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i64: ; CHECK: # 
%bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17284,8 +17284,8 @@ define @test_vluxseg5_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_5t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg5ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei64.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17298,8 +17298,8 @@ define @test_vluxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17311,8 +17311,8 @@ define @test_vluxseg5_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei8.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17325,8 +17325,8 @@ define @test_vluxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17338,8 +17338,8 @@ define @test_vluxseg5_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vluxseg5ei16.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17352,8 +17352,8 @@ define 
@test_vluxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg5_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv8i8_5t.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17365,8 +17365,8 @@ define @test_vluxseg5_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg5_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_5t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg5ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vluxseg5ei32.v v7, (a0), v12, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv8i8_5t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17406,8 +17406,8 @@ define @test_vluxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17419,8 +17419,8 @@ define @test_vluxseg6_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17433,8 +17433,8 @@ define @test_vluxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17446,8 +17446,8 @@ define @test_vluxseg6_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17460,8 +17460,8 @@ define @test_vluxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17473,8 +17473,8 @@ define @test_vluxseg6_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17487,8 +17487,8 @@ define @test_vluxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg6_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv2i8_6t.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17500,8 +17500,8 @@ define @test_vluxseg6_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_6t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv2i8_6t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17514,8 +17514,8 @@ define @test_vluxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17527,8 +17527,8 @@ define @test_vluxseg6_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17541,8 +17541,8 @@ define @test_vluxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17554,8 +17554,8 @@ define @test_vluxseg6_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17568,8 +17568,8 @@ define @test_vluxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17581,8 +17581,8 @@ define @test_vluxseg6_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17595,8 +17595,8 @@ define @test_vluxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg6_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17608,8 +17608,8 @@ define @test_vluxseg6_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_6t_nxv2i64: ; CHECK: # %bb.0: # %entry 
; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg6ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17622,8 +17622,8 @@ define @test_vluxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17635,8 +17635,8 @@ define @test_vluxseg6_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei8.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17649,8 +17649,8 @@ define @test_vluxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17662,8 +17662,8 @@ define @test_vluxseg6_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v13, v8 +; CHECK-NEXT: vluxseg6ei16.v v7, (a0), v13, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17676,8 +17676,8 @@ define @test_vluxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg6_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv8i8_6t.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17689,8 +17689,8 @@ define 
@test_vluxseg6_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg6_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_6t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg6ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg6ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv8i8_6t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17730,8 +17730,8 @@ define @test_vluxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17743,8 +17743,8 @@ define @test_vluxseg7_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17757,8 +17757,8 @@ define @test_vluxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17770,8 +17770,8 @@ define @test_vluxseg7_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17784,8 +17784,8 @@ define @test_vluxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) 
@llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17797,8 +17797,8 @@ define @test_vluxseg7_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17811,8 +17811,8 @@ define @test_vluxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg7_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv2i8_7t.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17824,8 +17824,8 @@ define @test_vluxseg7_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_7t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv2i8_7t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17838,8 +17838,8 @@ define @test_vluxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17851,8 +17851,8 @@ define @test_vluxseg7_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17865,8 +17865,8 @@ define @test_vluxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; 
CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17878,8 +17878,8 @@ define @test_vluxseg7_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17892,8 +17892,8 @@ define @test_vluxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17905,8 +17905,8 @@ define @test_vluxseg7_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17919,8 +17919,8 @@ define @test_vluxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg7_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17932,8 +17932,8 @@ define @test_vluxseg7_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_7t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg7ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei64.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17946,8 +17946,8 @@ define @test_vluxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; 
CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17959,8 +17959,8 @@ define @test_vluxseg7_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei8.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -17973,8 +17973,8 @@ define @test_vluxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -17986,8 +17986,8 @@ define @test_vluxseg7_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v14, v8 +; CHECK-NEXT: vluxseg7ei16.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18000,8 +18000,8 @@ define @test_vluxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg7_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv8i8_7t.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18013,8 +18013,8 @@ define @test_vluxseg7_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg7_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_7t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg7ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: vluxseg7ei32.v v7, (a0), v14, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv8i8_7t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18054,8 +18054,8 @@ 
define @test_vluxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18067,8 +18067,8 @@ define @test_vluxseg8_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18081,8 +18081,8 @@ define @test_vluxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18094,8 +18094,8 @@ define @test_vluxseg8_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18108,8 +18108,8 @@ define @test_vluxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18121,8 +18121,8 @@ define @test_vluxseg8_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) 
@llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18135,8 +18135,8 @@ define @test_vluxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_ ; CHECK-LABEL: test_vluxseg8_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv2i8_8t.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18148,8 +18148,8 @@ define @test_vluxseg8_mask_nxv1bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv1bf16_triscv.vector.tuple_nxv2i8_8t_nxv1i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv2i8_8t.nxv1i1.nxv1i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18162,8 +18162,8 @@ define @test_vluxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18175,8 +18175,8 @@ define @test_vluxseg8_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18189,8 +18189,8 @@ define @test_vluxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18202,8 +18202,8 @@ define @test_vluxseg8_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: 
vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18216,8 +18216,8 @@ define @test_vluxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18229,8 +18229,8 @@ define @test_vluxseg8_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18243,8 +18243,8 @@ define @test_vluxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_ ; CHECK-LABEL: test_vluxseg8_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18256,8 +18256,8 @@ define @test_vluxseg8_mask_nxv2bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv2bf16_triscv.vector.tuple_nxv4i8_8t_nxv2i64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma -; CHECK-NEXT: vluxseg8ei64.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei64.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv2i1.nxv2i64(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18270,8 +18270,8 @@ define @test_vluxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18283,8 +18283,8 @@ define @test_vluxseg8_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i8: ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei8.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei8.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i8(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18297,8 +18297,8 @@ define @test_vluxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18310,8 +18310,8 @@ define @test_vluxseg8_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei16.v v9, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v15, v8 +; CHECK-NEXT: vluxseg8ei16.v v7, (a0), v15, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) @@ -18324,8 +18324,8 @@ define @test_vluxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_ ; CHECK-LABEL: test_vluxseg8_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8 -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16 ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv8i8_8t.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 4) @@ -18337,8 +18337,8 @@ define @test_vluxseg8_mask_nxv4bf16_triscv.vector.tuple_nx ; CHECK-LABEL: test_vluxseg8_mask_nxv4bf16_triscv.vector.tuple_nxv8i8_8t_nxv4i32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma -; CHECK-NEXT: vluxseg8ei32.v v10, (a0), v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v11 +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vluxseg8ei32.v v7, (a0), v16, v0.t ; CHECK-NEXT: ret entry: %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv8i8_8t.nxv4i1.nxv4i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 4) diff --git a/llvm/test/CodeGen/RISCV/rvv/vmerge.ll b/llvm/test/CodeGen/RISCV/rvv/vmerge.ll index ef1de87b3b8b7..3fb5aa02230b4 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmerge.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmerge.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ -; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: -verify-machineinstrs -target-abi=ilp32d | FileCheck -check-prefixes=CHECK,RV32 %s ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ -; RUN: 
-verify-machineinstrs -target-abi=lp64d | FileCheck %s +; RUN: -verify-machineinstrs -target-abi=lp64d | FileCheck -check-prefixes=CHECK,RV64 %s declare @llvm.riscv.vmerge.nxv1i8.nxv1i8( , @@ -972,6 +972,22 @@ declare @llvm.riscv.vmerge.nxv1i64.i64( iXLen); define @intrinsic_vmerge_vxm_nxv1i64_nxv1i64_i64( %0, i64 %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vmerge_vxm_nxv1i64_nxv1i64_i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, mu +; RV32-NEXT: vlse64.v v8, (a0), zero, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vmerge_vxm_nxv1i64_nxv1i64_i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64-NEXT: ret entry: %a = call @llvm.riscv.vmerge.nxv1i64.i64( poison, @@ -991,6 +1007,22 @@ declare @llvm.riscv.vmerge.nxv2i64.i64( iXLen); define @intrinsic_vmerge_vxm_nxv2i64_nxv2i64_i64( %0, i64 %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vmerge_vxm_nxv2i64_nxv2i64_i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, mu +; RV32-NEXT: vlse64.v v8, (a0), zero, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vmerge_vxm_nxv2i64_nxv2i64_i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64-NEXT: ret entry: %a = call @llvm.riscv.vmerge.nxv2i64.i64( poison, @@ -1010,6 +1042,22 @@ declare @llvm.riscv.vmerge.nxv4i64.i64( iXLen); define @intrinsic_vmerge_vxm_nxv4i64_nxv4i64_i64( %0, i64 %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vmerge_vxm_nxv4i64_nxv4i64_i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, mu +; RV32-NEXT: vlse64.v v8, (a0), zero, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vmerge_vxm_nxv4i64_nxv4i64_i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64-NEXT: ret entry: %a = call @llvm.riscv.vmerge.nxv4i64.i64( poison, @@ -1029,6 +1077,22 @@ declare @llvm.riscv.vmerge.nxv8i64.i64( iXLen); define @intrinsic_vmerge_vxm_nxv8i64_nxv8i64_i64( %0, i64 %1, %2, iXLen %3) nounwind { +; RV32-LABEL: intrinsic_vmerge_vxm_nxv8i64_nxv8i64_i64: +; RV32: # %bb.0: # %entry +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, mu +; RV32-NEXT: vlse64.v v8, (a0), zero, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: intrinsic_vmerge_vxm_nxv8i64_nxv8i64_i64: +; RV64: # %bb.0: # %entry +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vmerge.vxm v8, v8, a0, v0 +; RV64-NEXT: ret entry: %a = call @llvm.riscv.vmerge.nxv8i64.i64( poison, diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode-f16.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode-f16.ll index e269b13137d44..93b12ad14d7e1 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode-f16.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode-f16.ll @@ -100,8 +100,9 @@ define half @vreduce_fminimum_nxv4f16( %val) { ; ZVFH-NEXT: 
vcpop.m a0, v9 ; ZVFH-NEXT: beqz a0, .LBB4_2 ; ZVFH-NEXT: # %bb.1: -; ZVFH-NEXT: lui a0, %hi(.LCPI4_0) -; ZVFH-NEXT: flh fa0, %lo(.LCPI4_0)(a0) +; ZVFH-NEXT: lui a0, 8 +; ZVFH-NEXT: addi a0, a0, -512 +; ZVFH-NEXT: fmv.h.x fa0, a0 ; ZVFH-NEXT: ret ; ZVFH-NEXT: .LBB4_2: ; ZVFH-NEXT: vfredmin.vs v8, v8, v8 @@ -138,8 +139,9 @@ define half @vreduce_fmaximum_nxv4f16( %val) { ; ZVFH-NEXT: vcpop.m a0, v9 ; ZVFH-NEXT: beqz a0, .LBB5_2 ; ZVFH-NEXT: # %bb.1: -; ZVFH-NEXT: lui a0, %hi(.LCPI5_0) -; ZVFH-NEXT: flh fa0, %lo(.LCPI5_0)(a0) +; ZVFH-NEXT: lui a0, 8 +; ZVFH-NEXT: addi a0, a0, -512 +; ZVFH-NEXT: fmv.h.x fa0, a0 ; ZVFH-NEXT: ret ; ZVFH-NEXT: .LBB5_2: ; ZVFH-NEXT: vfredmax.vs v8, v8, v8 diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll index 78aae96242fd3..861998a2ba51a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll @@ -998,13 +998,13 @@ declare half @llvm.vector.reduce.fmin.nxv10f16() define half @vreduce_fmin_nxv10f16( %v) { ; CHECK-LABEL: vreduce_fmin_nxv10f16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI73_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI73_0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v12, (a0) ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a1, a0, 2 ; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: addi a1, a1, -512 +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma +; CHECK-NEXT: vmv.s.x v12, a1 ; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma ; CHECK-NEXT: vfredmin.vs v12, v8, v12 ; CHECK-NEXT: vfmv.f.s fa0, v12 diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp-f16.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp-f16.ll index 8993bf8a767d8..7fb26fb6f6258 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp-f16.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-vp-f16.ll @@ -124,8 +124,9 @@ define half @vpreduce_fminimum_nxv4f16(half %start, %val, %val, @test4(i64 %avl, i8 zeroext %cond, @test6(i64 %avl, i8 zeroext %cond, This Inner Loop Header: Depth=1 +; CHECK-NEXT: addi s0, s0, 4 +; CHECK-NEXT: bltu a0, s0, .LBB0_7 +; CHECK-NEXT: # %bb.8: # %exit +; CHECK-NEXT: mv a0, s0 +; CHECK-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; CHECK-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; CHECK-NEXT: .cfi_restore ra +; CHECK-NEXT: .cfi_restore s0 +; CHECK-NEXT: .cfi_restore s1 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret +entry: + %sel_1 = select i1 %arg_1, i32 %arg_2, i32 1 + %div = udiv i32 %arg_2, 7 + %cond_1 = icmp ugt i32 %div, %sel_1 + %sel_2 = select i1 %arg_1, i32 %div, i32 3 + %sel = select i1 %arg_1, i32 %sel_1, i32 %sel_2 + br label %body + +body: + %res = phi i32 [ %sel, %entry ], [ %add_loop, %body ] + %add_loop = add i32 4, %res + %cond_2 = icmp ugt i32 %add_loop, 3 + br i1 %cond_2, label %body, label %exit + +exit: + ret i32 %add_loop +} diff --git a/llvm/test/CodeGen/RISCV/srodata.ll b/llvm/test/CodeGen/RISCV/srodata.ll index 1d5bd904f233f..71ced1743efcd 100644 --- a/llvm/test/CodeGen/RISCV/srodata.ll +++ b/llvm/test/CodeGen/RISCV/srodata.ll @@ -4,12 +4,6 @@ ; RUN: sed 's/SMALL_DATA_LIMIT/0/g' %s | \ ; RUN: llc -mtriple=riscv64 -mattr=+d | \ ; RUN: FileCheck -check-prefix=CHECK-SDL-0 %s -; RUN: sed 's/SMALL_DATA_LIMIT/4/g' %s | \ -; RUN: llc -mtriple=riscv32 -mattr=+d | \ -; RUN: FileCheck -check-prefix=CHECK-SDL-4 %s -; RUN: sed 's/SMALL_DATA_LIMIT/4/g' %s | \ -; RUN: 
llc -mtriple=riscv64 -mattr=+d | \ -; RUN: FileCheck -check-prefix=CHECK-SDL-4 %s ; RUN: sed 's/SMALL_DATA_LIMIT/8/g' %s | \ ; RUN: llc -mtriple=riscv32 -mattr=+d | \ ; RUN: FileCheck -check-prefix=CHECK-SDL-8 %s @@ -23,11 +17,6 @@ ; RUN: llc -mtriple=riscv64 -mattr=+d | \ ; RUN: FileCheck -check-prefix=CHECK-SDL-16 %s -define dso_local float @foof() { -entry: - ret float 0x400A08ACA0000000 -} - define dso_local double @foo() { entry: ret double 0x400A08AC91C3E242 @@ -39,9 +28,5 @@ entry: ; CHECK-SDL-0-NOT: .section .srodata.cst4 ; CHECK-SDL-0-NOT: .section .srodata.cst8 -; CHECK-SDL-4: .section .srodata.cst4 -; CHECK-SDL-4-NOT: .section .srodata.cst8 -; CHECK-SDL-8: .section .srodata.cst4 ; CHECK-SDL-8: .section .srodata.cst8 -; CHECK-SDL-16: .section .srodata.cst4 ; CHECK-SDL-16: .section .srodata.cst8 diff --git a/llvm/test/CodeGen/RISCV/zilsd.ll b/llvm/test/CodeGen/RISCV/zilsd.ll index 048ce964f9e18..7f2d2dd120306 100644 --- a/llvm/test/CodeGen/RISCV/zilsd.ll +++ b/llvm/test/CodeGen/RISCV/zilsd.ll @@ -7,10 +7,9 @@ define i64 @load(ptr %a) nounwind { ; CHECK-LABEL: load: ; CHECK: # %bb.0: -; CHECK-NEXT: ld a2, 80(a0) -; CHECK-NEXT: ld zero, 0(a0) -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: mv a1, a3 +; CHECK-NEXT: mv a2, a0 +; CHECK-NEXT: ld a0, 80(a0) +; CHECK-NEXT: ld zero, 0(a2) ; CHECK-NEXT: ret %1 = getelementptr i64, ptr %a, i32 10 %2 = load i64, ptr %1 diff --git a/llvm/test/CodeGen/SPARC/64abi.ll b/llvm/test/CodeGen/SPARC/64abi.ll index 6485a7f13e8d5..dc8c9af4a5185 100644 --- a/llvm/test/CodeGen/SPARC/64abi.ll +++ b/llvm/test/CodeGen/SPARC/64abi.ll @@ -473,8 +473,8 @@ declare i64 @receive_fp128(i64 %a, ...) ; HARD-DAG: ldx [%sp+[[Offset0]]], %o2 ; HARD-DAG: ldx [%sp+[[Offset1]]], %o3 ; SOFT-DAG: mov %i0, %o0 -; SOFT-DAG: mov %i1, %o1 ; SOFT-DAG: mov %i2, %o2 +; SOFT-DAG: mov %i3, %o3 ; CHECK: call receive_fp128 define i64 @test_fp128_variable_args(i64 %a, fp128 %b) { entry: @@ -482,6 +482,19 @@ entry: ret i64 %0 } +declare i64 @receive_i128(i64 %a, i128 %b) + +; CHECK-LABEL: test_i128_args: +; CHECK: mov %i3, %o3 +; CHECK: mov %i2, %o2 +; CHECK: mov %i0, %o0 +; CHECK: call receive_i128 +define i64 @test_i128_args(i64 %a, i128 %b) { +entry: + %0 = call i64 @receive_i128(i64 %a, i128 %b) + ret i64 %0 +} + ; CHECK-LABEL: test_call_libfunc: ; HARD: st %f1, [%fp+[[Offset0:[0-9]+]]] ; HARD: fmovs %f3, %f1 diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/builtin_printf.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/builtin_printf.ll new file mode 100644 index 0000000000000..093d172c5c1b1 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/builtin_printf.ll @@ -0,0 +1,24 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_EXT_relaxed_printf_string_address_space %s -o - | FileCheck %s +; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + +; CHECK: OpExtension "SPV_EXT_relaxed_printf_string_address_space" +; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] printf + +; CHECK-ERROR: LLVM ERROR: SPV_EXT_relaxed_printf_string_address_space is required because printf uses a format string not in constant address space. + +@.str = private unnamed_addr addrspace(1) constant [4 x i8] c"%d\0A\00", align 1 + +declare spir_func i32 @printf(ptr addrspace(4), ...) 
+ +define spir_kernel void @test_kernel() { +entry: + ; Format string in addrspace(1) → cast to addrspace(4) + %format = addrspacecast ptr addrspace(1) @.str to ptr addrspace(4) + %val = alloca i32, align 4 + store i32 123, ptr %val, align 4 + %loaded = load i32, ptr %val, align 4 + + ; Call printf with non-constant format string + %call = call spir_func i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) %format, i32 %loaded) + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/non-constant-printf.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/non-constant-printf.ll new file mode 100644 index 0000000000000..b54d59b30309f --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_relaxed_printf_string_address_space/non-constant-printf.ll @@ -0,0 +1,48 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_EXT_relaxed_printf_string_address_space %s -o - | FileCheck %s +; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + +; CHECK: OpExtension "SPV_EXT_relaxed_printf_string_address_space" +; CHECK: %[[#ExtInstSetId:]] = OpExtInstImport "OpenCL.std" +; CHECK-DAG: %[[#TypeInt32Id:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#TypeInt8Id:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#TypeInt64Id:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#TypeArrayId:]] = OpTypeArray %[[#TypeInt8Id]] %[[#]] +; CHECK-DAG: %[[#ConstantStorClassGlobalPtrTy:]] = OpTypePointer UniformConstant %[[#TypeArrayId]] +; CHECK-DAG: %[[#WGStorClassGlobalPtrTy:]] = OpTypePointer Workgroup %[[#TypeArrayId]] +; CHECK-DAG: %[[#CrossWFStorClassGlobalPtrTy:]] = OpTypePointer CrossWorkgroup %[[#TypeArrayId]] +; CHECK-DAG: %[[#FunctionStorClassPtrTy:]] = OpTypePointer Function %[[#TypeInt8Id]] +; CHECK-DAG: %[[#WGStorClassPtrTy:]] = OpTypePointer Workgroup %[[#TypeInt8Id]] +; CHECK-DAG: %[[#CrossWFStorClassPtrTy:]] = OpTypePointer CrossWorkgroup %[[#TypeInt8Id]] +; CHECK: %[[#ConstantCompositeId:]] = OpConstantComposite %[[#TypeArrayId]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#]] = OpVariable %[[#ConstantStorClassGlobalPtrTy]] UniformConstant %[[#ConstantCompositeId]] +; CHECK: %[[#]] = OpVariable %[[#CrossWFStorClassGlobalPtrTy]] CrossWorkgroup %[[#ConstantCompositeId]] +; CHECK: %[[#]] = OpVariable %[[#WGStorClassGlobalPtrTy]] Workgroup %[[#ConstantCompositeId]] +; CHECK: %[[#GEP1:]] = OpInBoundsPtrAccessChain %[[#FunctionStorClassPtrTy]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#]] = OpExtInst %[[#TypeInt32Id]] %[[#ExtInstSetId:]] printf %[[#GEP1]] +; CHECK: %[[#GEP2:]] = OpInBoundsPtrAccessChain %[[#CrossWFStorClassPtrTy]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#]] = OpExtInst %[[#TypeInt32Id]] %[[#ExtInstSetId:]] printf %[[#GEP2]] +; CHECK: %[[#GEP3:]] = OpInBoundsPtrAccessChain %[[#WGStorClassPtrTy]] %[[#]] %[[#]] %[[#]] +; CHECK: %[[#]] = OpExtInst %[[#TypeInt32Id]] %[[#ExtInstSetId:]] printf %[[#GEP3]] + +; CHECK-ERROR: LLVM ERROR: SPV_EXT_relaxed_printf_string_address_space is required because printf uses a format string not in constant address space. 
+ +@0 = internal unnamed_addr addrspace(2) constant [6 x i8] c"Test\0A\00", align 1 +@1 = internal unnamed_addr addrspace(1) constant [6 x i8] c"Test\0A\00", align 1 +@2 = internal unnamed_addr addrspace(3) constant [6 x i8] c"Test\0A\00", align 1 + +define spir_kernel void @test() { + %tmp1 = alloca [6 x i8], align 1 + call void @llvm.memcpy.p0.p2.i64(ptr align 1 %tmp1, ptr addrspace(2) align 1 @0, i64 6, i1 false) + %1 = getelementptr inbounds [6 x i8], ptr %tmp1, i32 0, i32 0 + %2 = call spir_func i32 @_Z18__spirv_ocl_printfPc(ptr %1) + %3 = getelementptr inbounds [6 x i8], ptr addrspace(1) @1, i32 0, i32 0 + %4 = call spir_func i32 @_Z18__spirv_ocl_printfPU3AS1c(ptr addrspace(1) %3) + %5 = getelementptr inbounds [6 x i8], ptr addrspace(3) @2, i32 0, i32 0 + %6 = call spir_func i32 @_Z18__spirv_ocl_printfPU3AS3c(ptr addrspace(3) %5) + ret void +} + +declare spir_func i32 @_Z18__spirv_ocl_printfPc(ptr) +declare spir_func i32 @_Z18__spirv_ocl_printfPU3AS1c(ptr addrspace(1)) +declare spir_func i32 @_Z18__spirv_ocl_printfPU3AS3c(ptr addrspace(3)) +declare void @llvm.memcpy.p0.p2.i64(ptr captures(none), ptr addrspace(2) captures(none) readonly, i64, i1) diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bindless_images/i32-in-physical64.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bindless_images/i32-in-physical64.ll new file mode 100644 index 0000000000000..3624f149cb491 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bindless_images/i32-in-physical64.ll @@ -0,0 +1,19 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bindless_images %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + +; CHECK-ERROR: LLVM ERROR: Parameter value must be a 32-bit scalar in case of Physical32 addressing model or a 64-bit scalar in case of Physical64 addressing model + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define spir_func void @foo(i32 %in) { + %img = call spir_func target("spirv.Image", i32, 2, 0, 0, 0, 0, 0, 0) @_Z33__spirv_ConvertHandleToImageINTELi(i32 %in) + %samp = call spir_func target("spirv.Sampler") @_Z35__spirv_ConvertHandleToSamplerINTELl(i64 42) + %sampImage = call spir_func target("spirv.SampledImage", i64, 1, 0, 0, 0, 0, 0, 0) @_Z40__spirv_ConvertHandleToSampledImageINTELl(i64 43) + ret void +} + +declare spir_func target("spirv.Image", i32, 2, 0, 0, 0, 0, 0, 0) @_Z33__spirv_ConvertHandleToImageINTELi(i32) + +declare spir_func target("spirv.Sampler") @_Z35__spirv_ConvertHandleToSamplerINTELl(i64) + +declare spir_func target("spirv.SampledImage", i64, 1, 0, 0, 0, 0, 0, 0) @_Z40__spirv_ConvertHandleToSampledImageINTELl(i64) diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_integer_dot_product/SPV_KHR_integer_dot_product_OCLtoSPIRV_int.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_integer_dot_product/SPV_KHR_integer_dot_product_OCLtoSPIRV_int.ll index 284f5c34671b7..52ddc39265442 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_integer_dot_product/SPV_KHR_integer_dot_product_OCLtoSPIRV_int.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_integer_dot_product/SPV_KHR_integer_dot_product_OCLtoSPIRV_int.ll @@ -13,15 +13,15 @@ ; CHECK: Name %[[#SignedB:]] "ib" ; CHECK: Name %[[#UnsignedB:]] "ub" -; CHECK: SDot %[[#]] %[[#SignedA]] %[[#SignedB]] 0 -; CHECK: SUDot %[[#]] %[[#SignedA]] %[[#UnsignedB]] 0 -; CHECK: SUDot %[[#]] %[[#SignedB]] %[[#UnsignedA]] 0 -; CHECK: UDot %[[#]] %[[#UnsignedA]] 
%[[#UnsignedB]] 0 - -; CHECK: SDotAccSat %[[#]] %[[#SignedA]] %[[#SignedB]] %[[#]] 0 -; CHECK: SUDotAccSat %[[#]] %[[#SignedA]] %[[#UnsignedB]] %[[#]] 0 -; CHECK: SUDotAccSat %[[#]] %[[#SignedB]] %[[#UnsignedA]] %[[#]] 0 -; CHECK: UDotAccSat %[[#]] %[[#UnsignedA]] %[[#UnsignedB]] %[[#]] 0 +; CHECK: SDot %[[#]] %[[#SignedA]] %[[#SignedB]] PackedVectorFormat4x8Bit +; CHECK: SUDot %[[#]] %[[#SignedA]] %[[#UnsignedB]] PackedVectorFormat4x8Bit +; CHECK: SUDot %[[#]] %[[#SignedB]] %[[#UnsignedA]] PackedVectorFormat4x8Bit +; CHECK: UDot %[[#]] %[[#UnsignedA]] %[[#UnsignedB]] PackedVectorFormat4x8Bit + +; CHECK: SDotAccSat %[[#]] %[[#SignedA]] %[[#SignedB]] %[[#]] PackedVectorFormat4x8Bit +; CHECK: SUDotAccSat %[[#]] %[[#SignedA]] %[[#UnsignedB]] %[[#]] PackedVectorFormat4x8Bit +; CHECK: SUDotAccSat %[[#]] %[[#SignedB]] %[[#UnsignedA]] %[[#]] PackedVectorFormat4x8Bit +; CHECK: UDotAccSat %[[#]] %[[#UnsignedA]] %[[#UnsignedB]] %[[#]] PackedVectorFormat4x8Bit define spir_kernel void @test(i32 %ia, i32 %ua, i32 %ib, i32 %ub, i32 %ires, i32 %ures) { entry: diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/isnan.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/isnan.ll new file mode 100644 index 0000000000000..67bb0cd8240f3 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/isnan.ll @@ -0,0 +1,45 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val --target-env spv1.4 %} + +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 +; CHECK-DAG: %[[#bool:]] = OpTypeBool +; CHECK-DAG: %[[#vec4_bool:]] = OpTypeVector %[[#bool]] 4 + +define noundef i1 @isnan_half(half noundef %a) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#bool]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#float_16]] + ; CHECK: %[[#]] = OpIsNan %[[#bool]] %[[#arg0]] + %hlsl.isnan = call i1 @llvm.spv.isnan.f16(half %a) + ret i1 %hlsl.isnan +} + +define noundef i1 @isnan_float(float noundef %a) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#bool]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#float_32]] + ; CHECK: %[[#]] = OpIsNan %[[#bool]] %[[#arg0]] + %hlsl.isnan = call i1 @llvm.spv.isnan.f32(float %a) + ret i1 %hlsl.isnan +} + +define noundef <4 x i1> @isnan_half4(<4 x half> noundef %a) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_bool]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#]] = OpIsNan %[[#vec4_bool]] %[[#arg0]] + %hlsl.isnan = call <4 x i1> @llvm.spv.isnan.v4f16(<4 x half> %a) + ret <4 x i1> %hlsl.isnan +} + +define noundef <4 x i1> @isnan_float4(<4 x float> noundef %a) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_bool]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#]] = OpIsNan %[[#vec4_bool]] %[[#arg0]] + %hlsl.isnan = call <4 x i1> @llvm.spv.isnan.v4f32(<4 x float> %a) + ret <4 x i1> %hlsl.isnan +} diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll index 08b2756fbab00..5e15aab7ddee0 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/StorageImageNonUniformIdx.ll @@ -1,8 +1,8 @@ ; RUN: llc -O0 
-verify-machineinstrs -mtriple=spirv1.5-vulkan-library %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.5-vulkan-library %s -o - -filetype=obj | spirv-val %} -; This test depends on NonUniform resource analysis -; https://github.com/llvm/llvm-project/issues/155701 +; This test depends on llvm.spv.resource.nonuniformindex support (not yet implemented) +; https://github.com/llvm/llvm-project/issues/160231 ; XFAIL: * @.str.b0 = private unnamed_addr constant [3 x i8] c"B0\00", align 1 diff --git a/llvm/test/CodeGen/SPIRV/image_store.ll b/llvm/test/CodeGen/SPIRV/image_store.ll new file mode 100644 index 0000000000000..a70651c974f36 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/image_store.ll @@ -0,0 +1,22 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Image types may be represented in two ways while translating to SPIR-V: +; - OpenCL form, for example, '%opencl.image2d_ro_t', +; - SPIR-V form, for example, '%spirv.Image._void_1_0_0_0_0_0_0', +; but it is still one type which should be translated to one SPIR-V type. +; +; The test checks that the code below is successfully translated and only one +; SPIR-V type for images is generated (no duplicate OpTypeImage instructions). + +; CHECK: %[[#]] = OpTypeImage %[[#]] 2D +; CHECK-NOT: %[[#]] = OpTypeImage %[[#]] 2D + +declare spir_func <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_ff(ptr addrspace(1), ptr addrspace(2), <2 x float>, float) + +define spir_kernel void @read_image(ptr addrspace(1) %srcimg, ptr addrspace(2) %sampler){ +entry: + %spirvimg.addr = alloca target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0), align 8 + %val = call <4 x float> @_Z11read_imagef14ocl_image2d_ro11ocl_samplerDv2_ff(ptr addrspace(1) %srcimg, ptr addrspace(2) %sampler, <2 x float> zeroinitializer, float 0.0) + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/instructions/insertvalue-undef-ptr.ll b/llvm/test/CodeGen/SPIRV/instructions/insertvalue-undef-ptr.ll new file mode 100644 index 0000000000000..b788f34bf7238 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/instructions/insertvalue-undef-ptr.ll @@ -0,0 +1,28 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-LABEL: Begin function original_testcase +define fastcc void @original_testcase() { +top: + ; CHECK: OpCompositeInsert + %0 = insertvalue [1 x ptr] zeroinitializer, ptr poison, 0 + ret void +} + +; CHECK-LABEL: Begin function additional_testcases +define fastcc void @additional_testcases() { +top: + ; Test with different pointer types + ; CHECK: OpCompositeInsert + %1 = insertvalue [1 x ptr] zeroinitializer, ptr undef, 0 + ; CHECK-NEXT: OpCompositeInsert + %2 = insertvalue {ptr, i32} zeroinitializer, ptr poison, 0 + ; CHECK-NEXT: OpCompositeInsert + %3 = insertvalue {ptr, ptr} undef, ptr null, 0 + + ; Test with undef aggregate + ; CHECK-NEXT: OpCompositeInsert + %4 = insertvalue [1 x ptr] undef, ptr undef, 0 + + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/constrained-comparison.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/constrained-comparison.ll new file mode 100644 index 0000000000000..49bb8eac10be8 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/constrained-comparison.ll @@ -0,0 +1,56 @@ +; RUN: llc -verify-machineinstrs -O0
-mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: OpFOrdEqual +; CHECK-DAG: OpFOrdGreaterThan +; CHECK-DAG: OpFOrdGreaterThanEqual +; CHECK-DAG: OpFOrdLessThan +; CHECK-DAG: OpFOrdLessThanEqual +; CHECK-DAG: OpFOrdNotEqual +; CHECK-DAG: OpOrdered +; CHECK-DAG: OpFUnordEqual +; CHECK-DAG: OpFUnordGreaterThan +; CHECK-DAG: OpFUnordGreaterThanEqual +; CHECK-DAG: OpFUnordLessThan +; CHECK-DAG: OpFUnordLessThanEqual +; CHECK-DAG: OpFUnordNotEqual +; CHECK-DAG: OpUnordered + +define dso_local spir_kernel void @test(float %a){ +entry: + %cmp = tail call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %a, metadata !"oeq", metadata !"fpexcept.strict") + %cmp1 = tail call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %a, metadata !"ogt", metadata !"fpexcept.strict") + %cmp2 = tail call i1 @llvm.experimental.constrained.fcmps.f32(float %a, float %a, metadata !"oge", metadata !"fpexcept.strict") + %cmp3 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"olt", metadata !"fpexcept.strict") + %cmp4 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ole", metadata !"fpexcept.strict") + %cmp5 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"one", metadata !"fpexcept.strict") + %cmp6 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ord", metadata !"fpexcept.strict") + %cmp7 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ueq", metadata !"fpexcept.strict") + %cmp8 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ugt", metadata !"fpexcept.strict") + %cmp9 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"uge", metadata !"fpexcept.strict") + %cmp10 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ult", metadata !"fpexcept.strict") + %cmp11 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"ule", metadata !"fpexcept.strict") + %cmp12 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"une", metadata !"fpexcept.strict") + %cmp13 = tail call i1 @llvm.experimental.constrained.fcmp.f32(float %a, float %a, metadata !"uno", metadata !"fpexcept.strict") + + %or1 = or i1 %cmp, %cmp1 + %or2 = or i1 %or1, %cmp2 + %or3 = or i1 %or2, %cmp3 + %or4 = or i1 %or3, %cmp4 + %or5 = or i1 %or4, %cmp5 + %or6 = or i1 %or5, %cmp6 + %or7 = or i1 %or6, %cmp7 + %or8 = or i1 %or7, %cmp8 + %or9 = or i1 %or8, %cmp9 + %or10 = or i1 %or9, %cmp10 + %or11 = or i1 %or10, %cmp11 + %or12 = or i1 %or11, %cmp12 + %or13 = or i1 %or12, %cmp13 + br i1 %or13, label %true_block, label %false_block +true_block: + ret void +false_block: + ret void +} +declare i1 @llvm.experimental.constrained.fcmps.f32(float, float, metadata, metadata) +declare i1 @llvm.experimental.constrained.fcmp.f32(float, float, metadata, metadata) diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/debugtrap.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/debugtrap.ll new file mode 100644 index 0000000000000..fd8cb9d7ff6f0 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/debugtrap.ll @@ -0,0 +1,14 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s + +; CHECK: OpNop +; CHECK-NEXT: OpReturn + +declare void @llvm.debugtrap() + 
+define spir_kernel void @foo(ptr addrspace(1) %a){ +entry: + %a.addr = alloca ptr addrspace(1), align 4 + store ptr addrspace(1) %a, ptr %a.addr, align 4 + call void @llvm.debugtrap() + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/frexp.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/frexp.ll new file mode 100644 index 0000000000000..f6434e94a9d79 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/frexp.ll @@ -0,0 +1,114 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#extinst_id:]] = OpExtInstImport "OpenCL.std" +; CHECK-DAG: %[[#float_32_type:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#int_32_type:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#fn_ptr_type_i32:]] = OpTypePointer Function %[[#int_32_type]] +; CHECK-DAG: %[[#const_negzero:]] = OpConstant %[[#float_32_type]] -0 +; CHECK-DAG: %[[#vec2_float_type:]] = OpTypeVector %[[#float_32_type]] 2 +; CHECK-DAG: %[[#vec2_int_type:]] = OpTypeVector %[[#int_32_type]] 2 +; CHECK-DAG: %[[#fn_ptr_type_vec2_i32:]] = OpTypePointer Function %[[#vec2_int_type]] +; CHECK-DAG: %[[#vec2_null:]] = OpConstantNull %[[#vec2_float_type]] +; CHECK-DAG: %[[#scalar_null:]] = OpConstantNull %[[#float_32_type]] +; CHECK-DAG: %[[#const_composite1:]] = OpConstantComposite %[[#vec2_float_type]] %[[#scalar_null]] %[[#const_negzero]] +; CHECK-DAG: %[[#vec4_float_type:]] = OpTypeVector %[[#float_32_type]] 4 +; CHECK-DAG: %[[#vec4_int_type:]] = OpTypeVector %[[#int_32_type]] 4 +; CHECK-DAG: %[[#fn_ptr_type_vec4_i32:]] = OpTypePointer Function %[[#vec4_int_type]] +; CHECK-DAG: %[[#const_composite2:]] = OpConstantComposite %[[#vec4_float_type]] %[[#const_16:]] %[[#const_neg32:]] %[[#const_0:]] %[[#const_9999:]] +; CHECK-DAG: %[[#float_64_type:]] = OpTypeFloat 64 +; CHECK-DAG: %[[#vec2_double_type:]] = OpTypeVector %[[#float_64_type]] 2 + +; CHECK: %[[#]] = OpFunctionParameter %[[#float_32_type]] +; CHECK: %[[#var1:]] = OpVariable %[[#fn_ptr_type_i32]] Function +; CHECK: %[[#extinst1:]] = OpExtInst %[[#float_32_type]] %[[#extinst_id]] frexp %[[#const_negzero]] %[[#var1]] +; CHECK: %[[#exp_part_var:]] = OpLoad %[[#int_32_type]] %[[#var1]] +; CHECK: OpReturnValue %[[#exp_part_var]] +define i32 @frexp_negzero(float %x) { + %ret = call { float, i32 } @llvm.frexp.f32.i32(float -0.0) + %f_part = extractvalue { float, i32 } %ret, 0 + %exp_part = extractvalue { float, i32 } %ret, 1 + ret i32 %exp_part +} + +; CHECK: %[[#x_var4:]] = OpFunctionParameter %[[#float_32_type]] +; CHECK: %[[#var10:]] = OpVariable %[[#fn_ptr_type_i32]] Function +; CHECK: %[[#extinst10:]] = OpExtInst %[[#float_32_type]] %[[#extinst_id]] frexp %[[#x_var4]] %[[#var10]] +; CHECK: %[[#exp_part_var2:]] = OpLoad %[[#int_32_type]] %[[#var10]] +; CHECK: OpReturnValue %[[#exp_part_var2]] +define i32 @frexp_frexp_get_int(float %x) { + %frexp0 = call { float, i32 } @llvm.frexp.f32.i32(float %x) + %f_part = extractvalue { float, i32 } %frexp0, 0 + %exp_part = extractvalue { float, i32 } %frexp0, 1 + ret i32 %exp_part +} + +; CHECK: %[[#var3:]] = OpVariable %[[#fn_ptr_type_vec2_i32]] Function +; CHECK: %[[#extinst3:]] = OpExtInst %[[#vec2_float_type]] %[[#extinst_id]] frexp %[[#vec2_null]] %[[#var3]] +; CHECK: %[[#f_part_var2:]] = OpLoad %[[#vec2_int_type]] %[[#var3]] +; CHECK: OpReturnValue %[[#extinst3]] +define <2 x float> @frexp_zero_vector() { + %ret = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> 
zeroinitializer) + %f_part = extractvalue { <2 x float>, <2 x i32> } %ret, 0 + %exp_part = extractvalue { <2 x float>, <2 x i32> } %ret, 1 + ret <2 x float> %f_part +} + +; CHECK: %[[#var4:]] = OpVariable %[[#fn_ptr_type_vec2_i32]] Function +; CHECK: %[[#extinst4:]] = OpExtInst %[[#vec2_float_type]] %[[#extinst_id]] frexp %[[#const_composite1]] %[[#var4]] +; CHECK: %[[#f_part_var3:]] = OpLoad %[[#vec2_int_type]] %[[#var4]] +; CHECK: OpReturnValue %[[#extinst4]] +define <2 x float> @frexp_zero_negzero_vector() { + %ret = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> ) + %f_part = extractvalue { <2 x float>, <2 x i32> } %ret, 0 + %exp_part = extractvalue { <2 x float>, <2 x i32> } %ret, 1 + ret <2 x float> %f_part +} + +; CHECK: %[[#var5:]] = OpVariable %[[#fn_ptr_type_vec4_i32]] Function +; CHECK: %[[#extinst5:]] = OpExtInst %[[#vec4_float_type]] %[[#extinst_id]] frexp %[[#const_composite2]] %[[#var5]] +; CHECK: %[[#f_part_var4:]] = OpLoad %[[#vec4_int_type]] %[[#var5]] +; CHECK: OpReturnValue %[[#extinst5]] +define <4 x float> @frexp_nonsplat_vector() { + %ret = call { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float> ) + %f_part = extractvalue { <4 x float>, <4 x i32> } %ret, 0 + %exp_part = extractvalue { <4 x float>, <4 x i32> } %ret, 1 + ret <4 x float> %f_part +} + +; CHECK: %[[#x_var2:]] = OpFunctionParameter %[[#float_32_type]] +; CHECK: %[[#var6:]] = OpVariable %[[#fn_ptr_type_i32]] Function +; CHECK: %[[#var7:]] = OpVariable %[[#fn_ptr_type_i32]] Function +; CHECK: %[[#extinst6:]] = OpExtInst %[[#float_32_type]] %[[#extinst_id]] frexp %[[#x_var2]] %[[#var6]] +; CHECK: %[[#load1:]] = OpLoad %[[#int_32_type]] %[[#var6]] +; CHECK: %[[#extinst7:]] = OpExtInst %[[#float_32_type]] %[[#extinst_id]] frexp %[[#extinst6]] %[[#var7]] +; CHECK: %[[#f_part_var5:]] = OpLoad %[[#int_32_type]] %[[#var7]] +; CHECK: OpReturnValue %[[#extinst7]] +define float @frexp_frexp(float %x) { + %frexp0 = call { float, i32 } @llvm.frexp.f32.i32(float %x) + %frexp0_f_part = extractvalue { float, i32 } %frexp0, 0 + %frexp0_exp_part = extractvalue { float, i32 } %frexp0, 1 + %frexp1 = call { float, i32 } @llvm.frexp.f32.i32(float %frexp0_f_part) + %frexp1_f_part = extractvalue { float, i32 } %frexp1, 0 + %frexp1_exp_part = extractvalue { float, i32 } %frexp1, 1 + ret float %frexp1_f_part +} + +; CHECK: %[[#x_var3:]] = OpFunctionParameter %[[#vec2_double_type]] +; CHECK: %[[#var9:]] = OpVariable %[[#fn_ptr_type_vec2_i32]] Function +; CHECK: %[[#extinst9:]] = OpExtInst %[[#vec2_double_type]] %[[#extinst_id]] frexp %[[#x_var3]] %[[#var9]] +; CHECK: %[[#f_part_var6:]] = OpLoad %[[#vec2_int_type]] %[[#var9]] +; CHECK: OpReturnValue %[[#extinst9]] +define <2 x double> @frexp_frexp_vector(<2 x double> %x) { + %frexp0 = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %x) + %f_part = extractvalue { <2 x double>, <2 x i32> } %frexp0, 0 + %exp_part = extractvalue { <2 x double>, <2 x i32> } %frexp0, 1 + ret <2 x double> %f_part +} + +declare { float, i32 } @llvm.frexp.f32.i32(float) +declare { double, i32 } @llvm.frexp.f64.i32(double) +declare { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float>) +declare { <4 x float>, <4 x i32> } @llvm.frexp.v4f32.v4i32(<4 x float>) +declare { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double>) +declare { float, i8 } @llvm.frexp.f32.i8(float) diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ignore-llvm-intrinsic.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ignore-llvm-intrinsic.ll index 
a15a80754cd60..b3ef6d6bbced9 100644 --- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ignore-llvm-intrinsic.ll +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/ignore-llvm-intrinsic.ll @@ -11,7 +11,6 @@ define spir_kernel void @foo(ptr %p) { entry: call void @llvm.trap() - call void @llvm.debugtrap() call void @llvm.ubsantrap(i8 100) %r1 = call ptr @llvm.invariant.start.p0(i64 1024, ptr %p) diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llround.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llround.ll new file mode 100644 index 0000000000000..2695237508af0 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/llround.ll @@ -0,0 +1,87 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: [[opencl:%[0-9]+]] = OpExtInstImport "OpenCL.std" +; CHECK-DAG: [[f32:%[0-9]+]] = OpTypeFloat 32 +; CHECK-DAG: [[i32:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[f64:%[0-9]+]] = OpTypeFloat 64 +; CHECK-DAG: [[i64:%[0-9]+]] = OpTypeInt 64 0 +; CHECK-DAG: [[vecf32:%[0-9]+]] = OpTypeVector [[f32]] +; CHECK-DAG: [[veci32:%[0-9]+]] = OpTypeVector [[i32]] +; CHECK-DAG: [[vecf64:%[0-9]+]] = OpTypeVector [[f64]] +; CHECK-DAG: [[veci64:%[0-9]+]] = OpTypeVector [[i64]] + +; CHECK: [[rounded_i32_f32:%[0-9]+]] = OpExtInst [[f32]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[i32]] [[rounded_i32_f32]] +; CHECK: [[rounded_i32_f64:%[0-9]+]] = OpExtInst [[f64]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[i32]] [[rounded_i32_f64]] +; CHECK: [[rounded_i64_f32:%[0-9]+]] = OpExtInst [[f32]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[i64]] [[rounded_i64_f32]] +; CHECK: [[rounded_i64_f64:%[0-9]+]] = OpExtInst [[f64]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[i64]] [[rounded_i64_f64]] +; CHECK: [[rounded_v4i32_f32:%[0-9]+]] = OpExtInst [[vecf32]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[veci32]] [[rounded_v4i32_f32]] +; CHECK: [[rounded_v4i32_f64:%[0-9]+]] = OpExtInst [[vecf64]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[veci32]] [[rounded_v4i32_f64]] +; CHECK: [[rounded_v4i64_f32:%[0-9]+]] = OpExtInst [[vecf32]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[veci64]] [[rounded_v4i64_f32]] +; CHECK: [[rounded_v4i64_f64:%[0-9]+]] = OpExtInst [[vecf64]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[veci64]] [[rounded_v4i64_f64]] + +define spir_func i32 @test_llround_i32_f32(float %arg0) { +entry: + %0 = call i32 @llvm.llround.i32.f32(float %arg0) + ret i32 %0 +} + +define spir_func i32 @test_llround_i32_f64(double %arg0) { +entry: + %0 = call i32 @llvm.llround.i32.f64(double %arg0) + ret i32 %0 +} + +define spir_func i64 @test_llround_i64_f32(float %arg0) { +entry: + %0 = call i64 @llvm.llround.i64.f32(float %arg0) + ret i64 %0 +} + +define spir_func i64 @test_llround_i64_f64(double %arg0) { +entry: + %0 = call i64 @llvm.llround.i64.f64(double %arg0) + ret i64 %0 +} + +define spir_func <4 x i32> @test_llround_v4i32_f32(<4 x float> %arg0) { +entry: + %0 = call <4 x i32> @llvm.llround.v4i32.f32(<4 x float> %arg0) + ret <4 x i32> %0 +} + +define spir_func <4 x i32> @test_llround_v4i32_f64(<4 x double> %arg0) { +entry: + %0 = call <4 x i32> @llvm.llround.v4i32.f64(<4 x double> %arg0) + ret <4 x i32> %0 +} + +define spir_func <4 x i64> @test_llround_v4i64_f32(<4 x float> %arg0) { +entry: + %0 = call <4 x i64> 
@llvm.llround.v4i64.f32(<4 x float> %arg0) + ret <4 x i64> %0 +} + +define spir_func <4 x i64> @test_llround_v4i64_f64(<4 x double> %arg0) { +entry: + %0 = call <4 x i64> @llvm.llround.v4i64.f64(<4 x double> %arg0) + ret <4 x i64> %0 +} + +declare i32 @llvm.llround.i32.f32(float) +declare i32 @llvm.llround.i32.f64(double) +declare i64 @llvm.llround.i64.f32(float) +declare i64 @llvm.llround.i64.f64(double) + +declare <4 x i32> @llvm.llround.v4i32.f32(<4 x float>) +declare <4 x i32> @llvm.llround.v4i32.f64(<4 x double>) +declare <4 x i64> @llvm.llround.v4i64.f32(<4 x float>) +declare <4 x i64> @llvm.llround.v4i64.f64(<4 x double>) diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/lround.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/lround.ll new file mode 100644 index 0000000000000..891f1ceb5b238 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/lround.ll @@ -0,0 +1,87 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: [[opencl:%[0-9]+]] = OpExtInstImport "OpenCL.std" +; CHECK-DAG: [[f32:%[0-9]+]] = OpTypeFloat 32 +; CHECK-DAG: [[i32:%[0-9]+]] = OpTypeInt 32 0 +; CHECK-DAG: [[f64:%[0-9]+]] = OpTypeFloat 64 +; CHECK-DAG: [[i64:%[0-9]+]] = OpTypeInt 64 0 +; CHECK-DAG: [[vecf32:%[0-9]+]] = OpTypeVector [[f32]] +; CHECK-DAG: [[veci32:%[0-9]+]] = OpTypeVector [[i32]] +; CHECK-DAG: [[vecf64:%[0-9]+]] = OpTypeVector [[f64]] +; CHECK-DAG: [[veci64:%[0-9]+]] = OpTypeVector [[i64]] + +; CHECK: [[rounded_i32_f32:%[0-9]+]] = OpExtInst [[f32]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[i32]] [[rounded_i32_f32]] +; CHECK: [[rounded_i32_f64:%[0-9]+]] = OpExtInst [[f64]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[i32]] [[rounded_i32_f64]] +; CHECK: [[rounded_i64_f32:%[0-9]+]] = OpExtInst [[f32]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[i64]] [[rounded_i64_f32]] +; CHECK: [[rounded_i64_f64:%[0-9]+]] = OpExtInst [[f64]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[i64]] [[rounded_i64_f64]] +; CHECK: [[rounded_v4i32_f32:%[0-9]+]] = OpExtInst [[vecf32]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[veci32]] [[rounded_v4i32_f32]] +; CHECK: [[rounded_v4i32_f64:%[0-9]+]] = OpExtInst [[vecf64]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[veci32]] [[rounded_v4i32_f64]] +; CHECK: [[rounded_v4i64_f32:%[0-9]+]] = OpExtInst [[vecf32]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[veci64]] [[rounded_v4i64_f32]] +; CHECK: [[rounded_v4i64_f64:%[0-9]+]] = OpExtInst [[vecf64]] [[opencl]] round %[[#]] +; CHECK-NEXT: %[[#]] = OpConvertFToS [[veci64]] [[rounded_v4i64_f64]] + +define spir_func i32 @test_lround_i32_f32(float %arg0) { +entry: + %0 = call i32 @llvm.lround.i32.f32(float %arg0) + ret i32 %0 +} + +define spir_func i32 @test_lround_i32_f64(double %arg0) { +entry: + %0 = call i32 @llvm.lround.i32.f64(double %arg0) + ret i32 %0 +} + +define spir_func i64 @test_lround_i64_f32(float %arg0) { +entry: + %0 = call i64 @llvm.lround.i64.f32(float %arg0) + ret i64 %0 +} + +define spir_func i64 @test_lround_i64_f64(double %arg0) { +entry: + %0 = call i64 @llvm.lround.i64.f64(double %arg0) + ret i64 %0 +} + +define spir_func <4 x i32> @test_lround_v4i32_f32(<4 x float> %arg0) { +entry: + %0 = call <4 x i32> @llvm.lround.v4i32.f32(<4 x float> %arg0) + ret <4 x i32> %0 +} + +define spir_func <4 x i32> 
@test_lround_v4i32_f64(<4 x double> %arg0) { +entry: + %0 = call <4 x i32> @llvm.lround.v4i32.f64(<4 x double> %arg0) + ret <4 x i32> %0 +} + +define spir_func <4 x i64> @test_lround_v4i64_f32(<4 x float> %arg0) { +entry: + %0 = call <4 x i64> @llvm.lround.v4i64.f32(<4 x float> %arg0) + ret <4 x i64> %0 +} + +define spir_func <4 x i64> @test_lround_v4i64_f64(<4 x double> %arg0) { +entry: + %0 = call <4 x i64> @llvm.lround.v4i64.f64(<4 x double> %arg0) + ret <4 x i64> %0 +} + +declare i32 @llvm.lround.i32.f32(float) +declare i32 @llvm.lround.i32.f64(double) +declare i64 @llvm.lround.i64.f32(float) +declare i64 @llvm.lround.i64.f64(double) + +declare <4 x i32> @llvm.lround.v4i32.f32(<4 x float>) +declare <4 x i32> @llvm.lround.v4i32.f64(<4 x double>) +declare <4 x i64> @llvm.lround.v4i64.f32(<4 x float>) +declare <4 x i64> @llvm.lround.v4i64.f64(<4 x double>) diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memmove.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memmove.ll new file mode 100644 index 0000000000000..51b76640cc056 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memmove.ll @@ -0,0 +1,86 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV-NOT: llvm.memmove + +; CHECK-DAG: %[[#Int8:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#Int32:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#Int64:]] = OpTypeInt 64 0 +; CHECK-DAG: %[[#Ptr_CrossWG_8:]] = OpTypePointer CrossWorkgroup %[[#Int8]] +; CHECK-DAG: %[[#Ptr_Generic_32:]] = OpTypePointer Generic %[[#Int32]] +; CHECK-DAG: %[[#Const_64:]] = OpConstant %[[#Int32]] 64 +; CHECK-DAG: %[[#Const_36:]] = OpConstant %[[#Int32]] 36 +; CHECK-DAG: %[[#Const_30:]] = OpConstant %[[#Int32]] 30 +; CHECK-DAG: %[[#Const_32_64:]] = OpConstant %[[#Int64]] 32 + +; CHECK: %[[#Param1:]] = OpFunctionParameter %[[#Ptr_CrossWG_8]] +; CHECK: %[[#Param2:]] = OpFunctionParameter %[[#Ptr_CrossWG_8]] +; CHECK: %[[#Size1:]] = OpUConvert %[[#Int64]] %[[#Const_64]] +; CHECK: OpCopyMemorySized %[[#Param2]] %[[#Param1]] %[[#Size1]] Aligned 64 + +; CHECK: %[[#Src:]] = OpFunctionParameter %[[#Ptr_CrossWG_8]] +; CHECK: %[[#CastDst2:]] = OpGenericCastToPtr %[[#Ptr_CrossWG_8]] %[[#GenPtr:]] +; CHECK: %[[#Size2:]] = OpUConvert %[[#Int64]] %[[#Const_36]] +; CHECK: OpCopyMemorySized %[[#CastDst2]] %[[#Src]] %[[#Size2]] Aligned 64 + +; CHECK: %[[#Param1:]] = OpFunctionParameter %[[#Ptr_CrossWG_8]] +; CHECK: %[[#Param2:]] = OpFunctionParameter %[[#Ptr_CrossWG_8]] +; CHECK: %[[#Size3:]] = OpUConvert %[[#Int64]] %[[#Const_30]] +; CHECK: OpCopyMemorySized %[[#Param2]] %[[#Param1]] %[[#Size3]] Aligned 1 + +; CHECK: %[[#Phi:]] = OpPhi %[[#Ptr_Generic_32]] %[[#Op1:]] %[[#Lbl1:]] %[[#Op2:]] %[[#Lbl2:]] +; CHECK: %[[#Cast:]] = OpPtrCastToGeneric %[[#]] %[[#]] +; CHECK: OpCopyMemorySized %[[#Cast]] %[[#Phi]] %[[#Const_32_64]] Aligned 8 + +%struct.SomeStruct = type { <16 x float>, i32, [60 x i8] } +%class.kfunc = type <{ i32, i32, i32, [4 x i8] }> + +@InvocIndex = external local_unnamed_addr addrspace(1) constant i64, align 8 +@"func_object1" = internal addrspace(3) global %class.kfunc zeroinitializer, align 8 + +define spir_kernel void @test_full_move(%struct.SomeStruct addrspace(1)* captures(none) readonly %in, %struct.SomeStruct addrspace(1)* captures(none) %out) { + %1 = bitcast %struct.SomeStruct addrspace(1)* %in to i8 addrspace(1)* + %2 = bitcast %struct.SomeStruct addrspace(1)* %out to i8 addrspace(1)* + call void 
@llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)* align 64 %2, i8 addrspace(1)* align 64 %1, i32 64, i1 false) + ret void +} + +define spir_kernel void @test_partial_move(%struct.SomeStruct addrspace(1)* captures(none) readonly %in, %struct.SomeStruct addrspace(4)* captures(none) %out) { + %1 = bitcast %struct.SomeStruct addrspace(1)* %in to i8 addrspace(1)* + %2 = bitcast %struct.SomeStruct addrspace(4)* %out to i8 addrspace(4)* + %3 = addrspacecast i8 addrspace(4)* %2 to i8 addrspace(1)* + call void @llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)* align 64 %3, i8 addrspace(1)* align 64 %1, i32 36, i1 false) + ret void +} + +define spir_kernel void @test_array(i8 addrspace(1)* %in, i8 addrspace(1)* %out) { + call void @llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 30, i1 false) + ret void +} + +define weak_odr dso_local spir_kernel void @test_phi() local_unnamed_addr { +entry: + %0 = alloca i32, align 8 + %1 = addrspacecast i32* %0 to i32 addrspace(4)* + %2 = load i64, i64 addrspace(1)* @InvocIndex, align 8 + %cmp = icmp eq i64 %2, 0 + br i1 %cmp, label %leader, label %entry.merge_crit_edge + +entry.merge_crit_edge: ; preds = %entry + %3 = bitcast i32 addrspace(4)* %1 to i8 addrspace(4)* + br label %merge + +leader: ; preds = %entry + %4 = bitcast i32 addrspace(4)* %1 to i8 addrspace(4)* + br label %merge + +merge: ; preds = %entry.merge_crit_edge, %leader + %phi = phi i8 addrspace(4)* [ %3, %entry.merge_crit_edge ], [ %4, %leader ] + %5 = addrspacecast i8 addrspace(3)* bitcast (%class.kfunc addrspace(3)* @"func_object1" to i8 addrspace(3)*) to i8 addrspace(4)* + call void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* align 8 dereferenceable(32) %5, i8 addrspace(4)* align 8 dereferenceable(32) %phi, i64 32, i1 false) + ret void +} + +declare void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* captures(none) writeonly, i8 addrspace(4)* captures(none) readonly, i64, i1 immarg) + +declare void @llvm.memmove.p1i8.p1i8.i32(i8 addrspace(1)* captures(none), i8 addrspace(1)* captures(none) readonly, i32, i1) diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/signed_arithmetic_overflow.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/signed_arithmetic_overflow.ll new file mode 100644 index 0000000000000..52f939faf0a9f --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/signed_arithmetic_overflow.ll @@ -0,0 +1,30 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -filetype=obj -o - | spirv-val %} +; XFAIL: * +; @llvm.sadd.with.overflow and @llvm.ssub.with.overflow have not been implemented.
+ +define spir_func void @test_sadd_overflow(ptr %out_result, ptr %out_overflow, i32 %a, i32 %b) { +entry: + %res = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) + %val = extractvalue { i32, i1 } %res, 0 + %ofl = extractvalue { i32, i1 } %res, 1 + store i32 %val, ptr %out_result + %zext_ofl = zext i1 %ofl to i8 + store i8 %zext_ofl, ptr %out_overflow + ret void +} + +declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) + +define spir_func void @test_ssub_overflow(ptr %out_result, ptr %out_overflow, i32 %a, i32 %b) { +entry: + %res = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) + %val = extractvalue { i32, i1 } %res, 0 + %ofl = extractvalue { i32, i1 } %res, 1 + store i32 %val, ptr %out_result + %zext_ofl = zext i1 %ofl to i8 + store i8 %zext_ofl, ptr %out_overflow + ret void +} + +declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/NoSignedUnsignedWrap.ll b/llvm/test/CodeGen/SPIRV/transcoding/NoSignedUnsignedWrap.ll index e405ef0ed58a5..5e66b8b639f17 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/NoSignedUnsignedWrap.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/NoSignedUnsignedWrap.ll @@ -7,10 +7,11 @@ ;; ;; Positive tests: ;; -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_KHR_no_integer_wrap_decoration %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-NEGATIVE +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=+SPV_KHR_no_integer_wrap_decoration %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV ;; ;; Negative tests: ;; +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV-NEGATIVE ;; Check that backend is able to skip nsw/nuw attributes if extension is ;; disabled implicitly or explicitly and if max SPIR-V version is lower then 1.4 diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpVariable_Initializer.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpVariable_Initializer.ll new file mode 100644 index 0000000000000..c8953c701d47d --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpVariable_Initializer.ll @@ -0,0 +1,11 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV: [[#PtrT:]] = OpTypePointer Workgroup %[[#]] +; CHECK-SPIRV: %[[#]] = OpVariable %[[#PtrT]] Workgroup + +@test_atomic_fn.L = internal addrspace(3) global [64 x i32] zeroinitializer, align 4 + +define spir_kernel void @test_atomic_fn() { + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_pipe.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_pipe.ll new file mode 100644 index 0000000000000..607997d034f09 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_pipe.ll @@ -0,0 +1,140 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability Kernel +; CHECK: OpCapability Addresses +; CHECK: OpCapability Pipes +; CHECK: OpCapability Int8 +; CHECK: OpCapability GenericPointer + +; CHECK-DAG: %[[#PipeWriteTy:]] = OpTypePipe WriteOnly +; CHECK-DAG: %[[#PipeReadTy:]] = OpTypePipe ReadOnly +; CHECK-DAG: %[[#ReserveIdTy:]] = OpTypeReserveId +; CHECK-DAG: %[[#BoolTy:]] = OpTypeBool +; CHECK-DAG: %[[#Int32Ty:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#Uint1:]] = OpConstant 
%[[#Int32Ty]] 1 +; CHECK-DAG: %[[#Uint2:]] = OpConstant %[[#Int32Ty]] 2 +; CHECK-DAG: %[[#Uint3:]] = OpConstant %[[#Int32Ty]] 3 +; CHECK-DAG: %[[#Uint4:]] = OpConstant %[[#Int32Ty]] 4 +; CHECK-DAG: %[[#NullUint:]] = OpConstantNull %[[#Int32Ty]] + +; CHECK: OpFunction +; CHECK: %[[#FuncParam1:]] = OpFunctionParameter %[[#PipeWriteTy]] +; CHECK: %[[#FuncParam2:]] = OpFunctionParameter %[[#PipeReadTy]] + +; CHECK: %[[#BasicWriteReserve:]] = OpReserveWritePipePackets %[[#ReserveIdTy]] %[[#FuncParam1]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpWritePipe %[[#Int32Ty]] %[[#FuncParam1]] %[[#]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpCommitWritePipe %[[#FuncParam1]] %[[#BasicWriteReserve]] %[[#Uint4]] %[[#Uint4]] +; CHECK: %[[#BasicReadReserve:]] = OpReserveReadPipePackets %[[#ReserveIdTy]] %[[#FuncParam2]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpReadPipe %[[#Int32Ty]] %[[#FuncParam2]] %[[#]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpCommitReadPipe %[[#FuncParam2]] %[[#BasicReadReserve]] %[[#Uint4]] %[[#Uint4]] + +; --- Reserved pipe operations --- +; CHECK: %[[#ReservedWriteReserve:]] = OpReserveWritePipePackets %[[#ReserveIdTy]] %[[#FuncParam1]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: %[[#ReservedWrite:]] = OpReservedWritePipe %[[#Int32Ty]] %[[#FuncParam1]] %[[#ReservedWriteReserve]] %[[#NullUint]] %[[#]] %[[#Uint4]] %[[#Uint4]] +; CHECK: %[[#IsValidWrite:]] = OpIsValidReserveId %[[#BoolTy]] %[[#ReservedWriteReserve]] +; CHECK: OpCommitWritePipe %[[#FuncParam1]] %[[#ReservedWriteReserve]] %[[#Uint4]] %[[#Uint4]] +; CHECK: %[[#ReservedReadReserve:]] = OpReserveReadPipePackets %[[#ReserveIdTy]] %[[#FuncParam2]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: %[[#ReservedRead:]] = OpReservedReadPipe %[[#Int32Ty]] %[[#FuncParam2]] %[[#ReservedReadReserve]] %[[#NullUint]] %[[#]] %[[#Uint4]] %[[#Uint4]] +; CHECK: %[[#IsValidRead:]] = OpIsValidReserveId %[[#BoolTy]] %[[#ReservedReadReserve]] +; CHECK: OpCommitReadPipe %[[#FuncParam2]] %[[#ReservedReadReserve]] %[[#Uint4]] %[[#Uint4]] + +; --- Pipe packet queries --- +; CHECK: %[[#MaxPacketsWO:]] = OpGetMaxPipePackets %[[#Int32Ty]] %[[#FuncParam1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpStore %[[#]] %[[#MaxPacketsWO]] Aligned 4 +; CHECK: %[[#NumPacketsWO:]] = OpGetNumPipePackets %[[#Int32Ty]] %[[#FuncParam1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpStore %[[#]] %[[#NumPacketsWO]] Aligned 4 +; CHECK: %[[#MaxPacketsRO:]] = OpGetMaxPipePackets %[[#Int32Ty]] %[[#FuncParam2]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpStore %[[#]] %[[#MaxPacketsRO]] Aligned 4 +; CHECK: %[[#NumPacketsRO:]] = OpGetNumPipePackets %[[#Int32Ty]] %[[#FuncParam2]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpStore %[[#]] %[[#NumPacketsRO]] Aligned 4 + +; --- Workgroup operations --- +; CHECK: %[[#WorkgroupWriteReserve:]] = OpGroupReserveWritePipePackets %[[#ReserveIdTy]] %[[#Uint2]] %[[#FuncParam1]] %[[#Uint1]] %[[#Uint1]] %[[#Uint1]] +; CHECK: OpGroupCommitWritePipe %[[#Uint2]] %[[#FuncParam1]] %[[#WorkgroupWriteReserve]] %[[#Uint1]] %[[#Uint1]] +; CHECK: %[[#WorkgroupReadReserve:]] = OpGroupReserveReadPipePackets %[[#ReserveIdTy]] %[[#Uint2]] %[[#FuncParam2]] %[[#Uint1]] %[[#Uint1]] %[[#Uint1]] +; CHECK: OpGroupCommitReadPipe %[[#Uint2]] %[[#FuncParam2]] %[[#WorkgroupReadReserve]] %[[#Uint1]] %[[#Uint1]] + +; --- Subgroup operations --- +; CHECK: %[[#SubgroupWriteReserve:]] = OpGroupReserveWritePipePackets %[[#ReserveIdTy]] %[[#Uint3]] %[[#FuncParam1]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpGroupCommitWritePipe %[[#Uint3]] %[[#FuncParam1]] %[[#SubgroupWriteReserve]] %[[#Uint4]] 
%[[#Uint4]] +; CHECK: %[[#SubgroupReadReserve:]] = OpGroupReserveReadPipePackets %[[#ReserveIdTy]] %[[#Uint3]] %[[#FuncParam2]] %[[#Uint1]] %[[#Uint4]] %[[#Uint4]] +; CHECK: OpGroupCommitReadPipe %[[#Uint3]] %[[#FuncParam2]] %[[#SubgroupReadReserve]] %[[#Uint4]] %[[#Uint4]] + +define spir_kernel void @test_pipe_builtins( + target("spirv.Pipe", 1) %out_pipe, + target("spirv.Pipe", 0) %in_pipe, + ptr addrspace(4) %src, + ptr addrspace(4) %dst, + ptr addrspace(1) %max_packets_wo, + ptr addrspace(1) %num_packets_wo, + ptr addrspace(1) %max_packets_ro, + ptr addrspace(1) %num_packets_ro +) { +entry: + ; Basic pipe operations + %0 = call spir_func target("spirv.ReserveId") @__reserve_write_pipe(target("spirv.Pipe", 1) %out_pipe, i32 1, i32 4, i32 4) + %1 = call spir_func i32 @__write_pipe_2(target("spirv.Pipe", 1) %out_pipe, ptr addrspace(4) %src, i32 4, i32 4) + call spir_func void @__commit_write_pipe(target("spirv.Pipe", 1) %out_pipe, target("spirv.ReserveId") %0, i32 4, i32 4) + + %2 = call spir_func target("spirv.ReserveId") @__reserve_read_pipe(target("spirv.Pipe", 0) %in_pipe, i32 1, i32 4, i32 4) + %3 = call spir_func i32 @__read_pipe_2(target("spirv.Pipe", 0) %in_pipe, ptr addrspace(4) %dst, i32 4, i32 4) + call spir_func void @__commit_read_pipe(target("spirv.Pipe", 0) %in_pipe, target("spirv.ReserveId") %2, i32 4, i32 4) + + ; Reserved pipe operations + %4 = call spir_func target("spirv.ReserveId") @__reserve_write_pipe(target("spirv.Pipe", 1) %out_pipe, i32 1, i32 4, i32 4) + %5 = call spir_func i32 @__write_pipe_4(target("spirv.Pipe", 1) %out_pipe, target("spirv.ReserveId") %4, i32 0, ptr addrspace(4) %src, i32 4, i32 4) + %6 = call spir_func i1 @_Z19is_valid_reserve_id13ocl_reserveid(target("spirv.ReserveId") %4) + call spir_func void @__commit_write_pipe(target("spirv.Pipe", 1) %out_pipe, target("spirv.ReserveId") %4, i32 4, i32 4) + + %7 = call spir_func target("spirv.ReserveId") @__reserve_read_pipe(target("spirv.Pipe", 0) %in_pipe, i32 1, i32 4, i32 4) + %8 = call spir_func i32 @__read_pipe_4(target("spirv.Pipe", 0) %in_pipe, target("spirv.ReserveId") %7, i32 0, ptr addrspace(4) %dst, i32 4, i32 4) + %9 = call spir_func i1 @_Z19is_valid_reserve_id13ocl_reserveid(target("spirv.ReserveId") %7) + call spir_func void @__commit_read_pipe(target("spirv.Pipe", 0) %in_pipe, target("spirv.ReserveId") %7, i32 4, i32 4) + + ; Pipe packet queries + %10 = call spir_func i32 @__get_pipe_max_packets_wo(target("spirv.Pipe", 1) %out_pipe, i32 4, i32 4) + store i32 %10, ptr addrspace(1) %max_packets_wo, align 4 + %11 = call spir_func i32 @__get_pipe_num_packets_wo(target("spirv.Pipe", 1) %out_pipe, i32 4, i32 4) + store i32 %11, ptr addrspace(1) %num_packets_wo, align 4 + %12 = call spir_func i32 @__get_pipe_max_packets_ro(target("spirv.Pipe", 0) %in_pipe, i32 4, i32 4) + store i32 %12, ptr addrspace(1) %max_packets_ro, align 4 + %13 = call spir_func i32 @__get_pipe_num_packets_ro(target("spirv.Pipe", 0) %in_pipe, i32 4, i32 4) + store i32 %13, ptr addrspace(1) %num_packets_ro, align 4 + + ; Workgroup operations + %14 = call spir_func target("spirv.ReserveId") @__work_group_reserve_write_pipe(target("spirv.Pipe", 1) %out_pipe, i32 1, i32 1, i32 1) + call spir_func void @__work_group_commit_write_pipe(target("spirv.Pipe", 1) %out_pipe, target("spirv.ReserveId") %14, i32 1, i32 1) + %15 = call spir_func target("spirv.ReserveId") @__work_group_reserve_read_pipe(target("spirv.Pipe", 0) %in_pipe, i32 1, i32 1, i32 1) + call spir_func void @__work_group_commit_read_pipe(target("spirv.Pipe", 0) 
%in_pipe, target("spirv.ReserveId") %15, i32 1, i32 1) + + ; Subgroup operations + %16 = call spir_func target("spirv.ReserveId") @__sub_group_reserve_write_pipe(target("spirv.Pipe", 1) %out_pipe, i32 1, i32 4, i32 4) + call spir_func void @__sub_group_commit_write_pipe(target("spirv.Pipe", 1) %out_pipe, target("spirv.ReserveId") %16, i32 4, i32 4) + %17 = call spir_func target("spirv.ReserveId") @__sub_group_reserve_read_pipe(target("spirv.Pipe", 0) %in_pipe, i32 1, i32 4, i32 4) + call spir_func void @__sub_group_commit_read_pipe(target("spirv.Pipe", 0) %in_pipe, target("spirv.ReserveId") %17, i32 4, i32 4) + + ret void +} + +declare spir_func target("spirv.ReserveId") @__reserve_write_pipe(target("spirv.Pipe", 1), i32, i32, i32) +declare spir_func target("spirv.ReserveId") @__reserve_read_pipe(target("spirv.Pipe", 0), i32, i32, i32) +declare spir_func i32 @__write_pipe_2(target("spirv.Pipe", 1), ptr addrspace(4), i32, i32) +declare spir_func i32 @__read_pipe_2(target("spirv.Pipe", 0), ptr addrspace(4), i32, i32) +declare spir_func i32 @__write_pipe_4(target("spirv.Pipe", 1), target("spirv.ReserveId"), i32, ptr addrspace(4), i32, i32) +declare spir_func i32 @__read_pipe_4(target("spirv.Pipe", 0), target("spirv.ReserveId"), i32, ptr addrspace(4), i32, i32) +declare spir_func void @__commit_write_pipe(target("spirv.Pipe", 1), target("spirv.ReserveId"), i32, i32) +declare spir_func void @__commit_read_pipe(target("spirv.Pipe", 0), target("spirv.ReserveId"), i32, i32) +declare spir_func i1 @_Z19is_valid_reserve_id13ocl_reserveid(target("spirv.ReserveId")) +declare spir_func i32 @__get_pipe_max_packets_wo(target("spirv.Pipe", 1), i32, i32) +declare spir_func i32 @__get_pipe_num_packets_wo(target("spirv.Pipe", 1), i32, i32) +declare spir_func i32 @__get_pipe_max_packets_ro(target("spirv.Pipe", 0), i32, i32) +declare spir_func i32 @__get_pipe_num_packets_ro(target("spirv.Pipe", 0), i32, i32) +declare spir_func target("spirv.ReserveId") @__work_group_reserve_write_pipe(target("spirv.Pipe", 1), i32, i32, i32) +declare spir_func void @__work_group_commit_write_pipe(target("spirv.Pipe", 1), target("spirv.ReserveId"), i32, i32) +declare spir_func target("spirv.ReserveId") @__work_group_reserve_read_pipe(target("spirv.Pipe", 0), i32, i32, i32) +declare spir_func void @__work_group_commit_read_pipe(target("spirv.Pipe", 0), target("spirv.ReserveId"), i32, i32) +declare spir_func target("spirv.ReserveId") @__sub_group_reserve_write_pipe(target("spirv.Pipe", 1), i32, i32, i32) +declare spir_func void @__sub_group_commit_write_pipe(target("spirv.Pipe", 1), target("spirv.ReserveId"), i32, i32) +declare spir_func target("spirv.ReserveId") @__sub_group_reserve_read_pipe(target("spirv.Pipe", 0), i32, i32, i32) +declare spir_func void @__sub_group_commit_read_pipe(target("spirv.Pipe", 0), target("spirv.ReserveId"), i32, i32) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_gep.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_gep.ll new file mode 100644 index 0000000000000..4c64a127a7019 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_gep.ll @@ -0,0 +1,16 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpDecorate 
%[[#Id:]] BuiltIn GlobalInvocationId +; CHECK: %[[#Id]] = OpVariable %[[#]] CrossWorkgroup + +@__spirv_BuiltInGlobalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32 + +define spir_kernel void @f() { +entry: + %0 = load i64, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId, align 32 + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/transcoding/decoration-forward-decl.ll b/llvm/test/CodeGen/SPIRV/transcoding/decoration-forward-decl.ll new file mode 100644 index 0000000000000..74ce26bee9cf3 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/decoration-forward-decl.ll @@ -0,0 +1,30 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Check saturation conversion is translated when there is forward declaration +; of SPIRV entry. + +; CHECK: OpDecorate %[[#SAT:]] SaturatedConversion +; CHECK: %[[#SAT]] = OpConvertFToU %[[#]] %[[#]] + +declare spir_func zeroext i8 @_Z30__spirv_ConvertFToU_Ruchar_satf(float) + +define spir_func void @forward(float %val, i8 %initval, ptr addrspace(1) %dst) { +entry: + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %new_val.0 = phi i8 [ %initval, %entry ], [ %call1, %for.body ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp ult i32 %i.0, 1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %call1 = call spir_func zeroext i8 @_Z30__spirv_ConvertFToU_Ruchar_satf(float noundef %val) + %inc = add i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + store i8 %new_val.0, ptr addrspace(1) %dst, align 1 + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/transcoding/float16.ll b/llvm/test/CodeGen/SPIRV/transcoding/float16.ll new file mode 100644 index 0000000000000..0018dba68d4ea --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/float16.ll @@ -0,0 +1,25 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV: %[[#HALF:]] = OpTypeFloat 16 +; CHECK-SPIRV: %[[#HALFPTR:]] = OpTypePointer Function %[[#HALF]] +; CHECK-SPIRV: %[[#HALFV2:]] = OpTypeVector %[[#HALF]] 2 +; CHECK-SPIRV: %[[#HALFV2PTR:]] = OpTypePointer Function %[[#HALFV2]] +; CHECK-SPIRV: %[[#CONST:]] = OpConstant %[[#HALF]] 14788 +; CHECK-SPIRV: %[[#ADDR:]] = OpVariable %[[#HALFPTR]] Function +; CHECK-SPIRV: %[[#ADDR2:]] = OpVariable %[[#HALFV2PTR]] Function +; CHECK-SPIRV: %[[#]] = OpExtInst %[[#HALF]] %[[#]] fract %[[#CONST]] %[[#ADDR]] +; CHECK-SPIRV: %[[#]] = OpExtInst %[[#HALFV2]] %[[#]] fract %[[#]] %[[#ADDR2]] + +define spir_kernel void @test() { +entry: + %addr = alloca half + %addr2 = alloca <2 x half> + %res = call spir_func noundef half @_Z17__spirv_ocl_fractDF16_PU3AS0DF16_(half noundef 0xH39C4, ptr noundef %addr) + %res2 = call spir_func noundef <2 x half> @_Z17__spirv_ocl_fractDv2_DF16_PU3AS0S_(<2 x half> noundef , ptr noundef %addr2) + ret void +} + +declare spir_func noundef half @_Z17__spirv_ocl_fractDF16_PU3AS0DF16_(half noundef, ptr noundef) local_unnamed_addr + +declare spir_func noundef <2 x half> @_Z17__spirv_ocl_fractDv2_DF16_PU3AS0S_(<2 x half> noundef, ptr noundef) local_unnamed_addr diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll index 
9346098f0371b..696938c27b0f5 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -173,8 +173,8 @@ define dso_local i32 @and_mul_reduce_add(ptr noalias nocapture readonly %a, ptr ; CHECK-LABEL: and_mul_reduce_add: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: ldr.w r12, [sp, #12] +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: ldr.w r12, [sp, #16] ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: beq .LBB2_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph @@ -195,9 +195,14 @@ define dso_local i32 @and_mul_reduce_add(ptr noalias nocapture readonly %a, ptr ; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vsub.i32 q1, q2, q1 -; CHECK-NEXT: vpsttt +; CHECK-NEXT: vpst ; CHECK-NEXT: vcmpt.i32 eq, q1, zr +; CHECK-NEXT: vstr p0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 +; CHECK-NEXT: vldr p0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vpst ; CHECK-NEXT: vldrwt.u32 q2, [r2], #16 ; CHECK-NEXT: vmul.i32 q1, q2, q1 ; CHECK-NEXT: vadd.i32 q1, q1, q0 @@ -206,11 +211,11 @@ define dso_local i32 @and_mul_reduce_add(ptr noalias nocapture readonly %a, ptr ; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 -; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: pop {r4, pc} ; CHECK-NEXT: .LBB2_4: ; CHECK-NEXT: movs r0, #0 -; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: add sp, #8 ; CHECK-NEXT: pop {r4, pc} ptr noalias nocapture readonly %c, ptr noalias nocapture readonly %d, i32 %N) { entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll index eb1527ff3dc4a..32648b6b449a8 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll @@ -790,15 +790,250 @@ entry: ret i16 %result } -declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) - - -declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) -declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>) -declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>) -declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) -declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>) -declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>) -declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) -declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>) -declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>) +define arm_aapcs_vfpcc <4 x i32> @vmulhs_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) { +; CHECK-LABEL: vmulhs_kb_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: smmul r0, r0, r1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: smmul r1, r1, r2 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: smmul r0, r0, r1 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: smmul r1, r1, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: bx lr +entry: + %s0s = sext <4 x i32> %s0 to <4 x i64> + %s1s = ashr <4 x i64> %s1, + %m = mul <4 x i64> %s0s, %s1s + %s = ashr <4 
x i64> %m, + %s2 = trunc <4 x i64> %s to <4 x i32> + ret <4 x i32> %s2 +} + +define arm_aapcs_vfpcc <4 x i32> @vmulhu_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) { +; CHECK-LABEL: vmulhu_kb_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: umull r0, r1, r0, r1 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: umull r0, r2, r0, r2 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 +; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: umull r0, r1, r0, r1 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: umull r0, r2, r0, r2 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 +; CHECK-NEXT: bx lr +entry: + %s0s = zext <4 x i32> %s0 to <4 x i64> + %s1s = lshr <4 x i64> %s1, + %m = mul <4 x i64> %s0s, %s1s + %s = lshr <4 x i64> %m, + %s2 = trunc <4 x i64> %s to <4 x i32> + ret <4 x i32> %s2 +} + +define arm_aapcs_vfpcc <4 x i32> @vmulhs_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) { +; CHECK-LABEL: vmulhs_kbc_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: smmul r0, r1, r0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: smmul r1, r2, r1 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov q0[2], q0[0], r1, r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: smmul r0, r1, r0 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: smmul r1, r2, r1 +; CHECK-NEXT: vmov q0[3], q0[1], r1, r0 +; CHECK-NEXT: bx lr +entry: + %s0s = sext <4 x i32> %s0 to <4 x i64> + %s1s = ashr <4 x i64> %s1, + %m = mul <4 x i64> %s1s, %s0s + %s = ashr <4 x i64> %m, + %s2 = trunc <4 x i64> %s to <4 x i32> + ret <4 x i32> %s2 +} + +define arm_aapcs_vfpcc <4 x i32> @vmulhu_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) { +; CHECK-LABEL: vmulhu_kbc_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s4, s2 +; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vmov.f32 s6, s3 +; CHECK-NEXT: vmov.f32 s10, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: umull r0, r1, r1, r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: umull r0, r2, r2, r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov q0[2], q0[0], r2, r1 +; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: umull r0, r1, r1, r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: umull r0, r2, r2, r0 +; CHECK-NEXT: vmov q0[3], q0[1], r2, r1 +; CHECK-NEXT: bx lr +entry: + %s0s = zext <4 x i32> %s0 to <4 x i64> + %s1s = lshr <4 x i64> %s1, + %m = mul <4 x i64> %s1s, %s0s + %s = lshr <4 x i64> %m, + %s2 = trunc <4 x i64> %s to <4 x i32> + ret <4 x i32> %s2 +} + +define arm_aapcs_vfpcc <8 x i16> @vmulhs_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) { +; CHECK-LABEL: vmulhs_kb_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vmovlt.s16 q4, q0 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s14, s9 +; CHECK-NEXT: vmov.f32 s15, s11 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vshr.s32 q3, q3, #16 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vmul.i32 q3, q4, q3 +; CHECK-NEXT: vshr.s32 q1, q1, #16 +; CHECK-NEXT: vshr.u32 q3, q3, #16 +; CHECK-NEXT: vmul.i32 q0, q0, q1 +; CHECK-NEXT: vshr.u32 q0, q0, #16 +; CHECK-NEXT: vmovnt.i32 q0, q3 +; CHECK-NEXT: vpop {d8, d9} +; 
CHECK-NEXT: bx lr +entry: + %s0s = sext <8 x i16> %s0 to <8 x i32> + %s1s = ashr <8 x i32> %s1, + %m = mul <8 x i32> %s0s, %s1s + %s = ashr <8 x i32> %m, + %s2 = trunc <8 x i32> %s to <8 x i16> + ret <8 x i16> %s2 +} + +define arm_aapcs_vfpcc <8 x i16> @vmulhu_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) { +; CHECK-LABEL: vmulhu_kb_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vmovlt.u16 q4, q0 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmovlb.u16 q0, q0 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s14, s9 +; CHECK-NEXT: vmov.f32 s15, s11 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vshr.u32 q3, q3, #16 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vmul.i32 q3, q4, q3 +; CHECK-NEXT: vshr.u32 q1, q1, #16 +; CHECK-NEXT: vshr.u32 q3, q3, #16 +; CHECK-NEXT: vmul.i32 q0, q0, q1 +; CHECK-NEXT: vshr.u32 q0, q0, #16 +; CHECK-NEXT: vmovnt.i32 q0, q3 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %s0s = zext <8 x i16> %s0 to <8 x i32> + %s1s = lshr <8 x i32> %s1, + %m = mul <8 x i32> %s0s, %s1s + %s = lshr <8 x i32> %m, + %s2 = trunc <8 x i32> %s to <8 x i16> + ret <8 x i16> %s2 +} + +define arm_aapcs_vfpcc <8 x i16> @vmulhs_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) { +; CHECK-LABEL: vmulhs_kbc_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vmovlt.s16 q4, q0 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s14, s9 +; CHECK-NEXT: vmov.f32 s15, s11 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vshr.s32 q3, q3, #16 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vmul.i32 q3, q3, q4 +; CHECK-NEXT: vshr.s32 q1, q1, #16 +; CHECK-NEXT: vshr.u32 q3, q3, #16 +; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vshr.u32 q0, q0, #16 +; CHECK-NEXT: vmovnt.i32 q0, q3 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %s0s = sext <8 x i16> %s0 to <8 x i32> + %s1s = ashr <8 x i32> %s1, + %m = mul <8 x i32> %s1s, %s0s + %s = ashr <8 x i32> %m, + %s2 = trunc <8 x i32> %s to <8 x i16> + ret <8 x i16> %s2 +} + +define arm_aapcs_vfpcc <8 x i16> @vmulhu_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) { +; CHECK-LABEL: vmulhu_kbc_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.f32 s12, s5 +; CHECK-NEXT: vmovlt.u16 q4, q0 +; CHECK-NEXT: vmov.f32 s13, s7 +; CHECK-NEXT: vmovlb.u16 q0, q0 +; CHECK-NEXT: vmov.f32 s5, s6 +; CHECK-NEXT: vmov.f32 s14, s9 +; CHECK-NEXT: vmov.f32 s15, s11 +; CHECK-NEXT: vmov.f32 s6, s8 +; CHECK-NEXT: vshr.u32 q3, q3, #16 +; CHECK-NEXT: vmov.f32 s7, s10 +; CHECK-NEXT: vmul.i32 q3, q3, q4 +; CHECK-NEXT: vshr.u32 q1, q1, #16 +; CHECK-NEXT: vshr.u32 q3, q3, #16 +; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: vshr.u32 q0, q0, #16 +; CHECK-NEXT: vmovnt.i32 q0, q3 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %s0s = zext <8 x i16> %s0 to <8 x i32> + %s1s = lshr <8 x i32> %s1, + %m = mul <8 x i32> %s1s, %s0s + %s = lshr <8 x i32> %m, + %s2 = trunc <8 x i32> %s to <8 x i16> + ret <8 x i16> %s2 +} diff --git a/llvm/test/CodeGen/WebAssembly/fake-use.ll b/llvm/test/CodeGen/WebAssembly/fake-use.ll new file mode 100644 index 0000000000000..a18ce33566df0 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/fake-use.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s | llvm-mc -triple=wasm32-unknown-unknown + +target triple = "wasm32-unknown-unknown" + +define void 
@fake_use() { + %t = call i32 @foo() + tail call void (...) @llvm.fake.use(i32 %t) + ret void +} + +; %t shouldn't be converted to TEE in RegStackify, because the FAKE_USE will be +; deleted in the beginning of ExplicitLocals. +define void @fake_use_no_tee() { + %t = call i32 @foo() + tail call void (...) @llvm.fake.use(i32 %t) + call void @use(i32 %t) + ret void +} + +declare i32 @foo() +declare void @use(i32 %t) +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) +declare void @llvm.fake.use(...) #0 + +attributes #0 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } diff --git a/llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll b/llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll index 6679b5f58e8c1..41fa34667af86 100644 --- a/llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll +++ b/llvm/test/CodeGen/X86/apx/ndd-neg-addr-index.ll @@ -8,7 +8,7 @@ define void @neg_8bit_1(i1 %cmp) { ; NDD-NEXT: andb $1, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xe7,0x01] ; NDD-NEXT: movzbl 0, %ecx # encoding: [0x0f,0xb6,0x0c,0x25,0x00,0x00,0x00,0x00] ; NDD-NEXT: negb %al, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd8] -; NDD-NEXT: leab 2(%rcx,%rax), %al # encoding: [0x66,0x8d,0x44,0x01,0x02] +; NDD-NEXT: leal 2(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0x02] ; NDD-NEXT: movb %al, 0 # encoding: [0x88,0x04,0x25,0x00,0x00,0x00,0x00] ; NDD-NEXT: retq # encoding: [0xc3] entry: @@ -25,7 +25,8 @@ define void @neg_8bit_2(i8 %int8) { ; NDD-NEXT: # kill: def $edi killed $edi def $rdi ; NDD-NEXT: addb %dil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xff] ; NDD-NEXT: negb %al, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd8] -; NDD-NEXT: leab 1(%rdi,%rax), %al # encoding: [0x66,0x8d,0x44,0x07,0x01] +; NDD-NEXT: leal 1(%rdi,%rax), %eax # encoding: [0x8d,0x44,0x07,0x01] +; NDD-NEXT: # kill: def $al killed $al killed $eax ; NDD-NEXT: mulb %dil # encoding: [0x40,0xf6,0xe7] ; NDD-NEXT: testb %al, %al # encoding: [0x84,0xc0] ; NDD-NEXT: retq # encoding: [0xc3] @@ -55,7 +56,7 @@ define i32 @neg_16bit(i16 %0) { ; NDD-NEXT: cmovsl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x48,0xc1] ; NDD-NEXT: andw $-256, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x25,0x00,0xff] ; NDD-NEXT: negw %ax, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xf7,0xd8] -; NDD-NEXT: leaw 1(%rdi,%rax), %ax # encoding: [0x66,0x8d,0x44,0x07,0x01] +; NDD-NEXT: leal 1(%rdi,%rax), %eax # encoding: [0x8d,0x44,0x07,0x01] ; NDD-NEXT: movzwl %ax, %eax # encoding: [0x0f,0xb7,0xc0] ; NDD-NEXT: movq %rax, 0 # encoding: [0x48,0x89,0x04,0x25,0x00,0x00,0x00,0x00] ; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] diff --git a/llvm/test/CodeGen/X86/atomic-bit-test.ll b/llvm/test/CodeGen/X86/atomic-bit-test.ll index 8f91f4120842b..b06bef44a5e9e 100644 --- a/llvm/test/CodeGen/X86/atomic-bit-test.ll +++ b/llvm/test/CodeGen/X86/atomic-bit-test.ll @@ -469,52 +469,56 @@ entry: define i16 @use_in_diff_bb() nounwind { ; X86-LABEL: use_in_diff_bb: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %esi -; X86-NEXT: movzwl v16, %esi +; X86-NEXT: movzwl v16, %eax ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB17_1: # %atomicrmw.start ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: orl $1, %ecx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: lock cmpxchgw %cx, v16 -; X86-NEXT: movl %eax, %esi +; X86-NEXT: # kill: def $ax killed $ax def $eax ; X86-NEXT: jne 
.LBB17_1 ; X86-NEXT: # %bb.2: # %atomicrmw.end -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testb %al, %al +; X86-NEXT: xorl %ecx, %ecx +; X86-NEXT: testb %cl, %cl ; X86-NEXT: jne .LBB17_4 ; X86-NEXT: # %bb.3: +; X86-NEXT: pushl %esi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: calll foo@PLT -; X86-NEXT: .LBB17_4: -; X86-NEXT: andl $1, %esi ; X86-NEXT: movl %esi, %eax ; X86-NEXT: popl %esi +; X86-NEXT: .LBB17_4: +; X86-NEXT: andl $1, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: use_in_diff_bb: ; X64: # %bb.0: # %entry -; X64-NEXT: pushq %rbx -; X64-NEXT: movzwl v16(%rip), %ebx +; X64-NEXT: movzwl v16(%rip), %eax ; X64-NEXT: .p2align 4 ; X64-NEXT: .LBB17_1: # %atomicrmw.start ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: movl %ebx, %ecx +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: orl $1, %ecx -; X64-NEXT: movl %ebx, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: lock cmpxchgw %cx, v16(%rip) -; X64-NEXT: movl %eax, %ebx +; X64-NEXT: # kill: def $ax killed $ax def $eax ; X64-NEXT: jne .LBB17_1 ; X64-NEXT: # %bb.2: # %atomicrmw.end -; X64-NEXT: xorl %eax, %eax -; X64-NEXT: testb %al, %al +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: testb %cl, %cl ; X64-NEXT: jne .LBB17_4 ; X64-NEXT: # %bb.3: +; X64-NEXT: pushq %rbx +; X64-NEXT: movl %eax, %ebx ; X64-NEXT: callq foo@PLT -; X64-NEXT: .LBB17_4: -; X64-NEXT: andl $1, %ebx ; X64-NEXT: movl %ebx, %eax ; X64-NEXT: popq %rbx +; X64-NEXT: .LBB17_4: +; X64-NEXT: andl $1, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq entry: %0 = atomicrmw or ptr @v16, i16 1 monotonic, align 2 diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index 8aa898f3ec576..da0cef0e4e99b 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -2119,8 +2119,7 @@ define void @ktest_1(<8 x double> %in, ptr %base) { ; KNL-LABEL: ktest_1: ; KNL: ## %bb.0: ; KNL-NEXT: vcmpgtpd (%rdi), %zmm0, %k1 -; KNL-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} -; KNL-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} +; KNL-NEXT: vcmpltpd 8(%rdi), %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al ; KNL-NEXT: je LBB44_2 @@ -2152,8 +2151,7 @@ define void @ktest_1(<8 x double> %in, ptr %base) { ; AVX512BW-LABEL: ktest_1: ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: vcmpgtpd (%rdi), %zmm0, %k1 -; AVX512BW-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} -; AVX512BW-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} +; AVX512BW-NEXT: vcmpltpd 8(%rdi), %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al ; AVX512BW-NEXT: je LBB44_2 diff --git a/llvm/test/CodeGen/X86/basic-block-sections-cfg.ll b/llvm/test/CodeGen/X86/basic-block-sections-cfg.ll new file mode 100644 index 0000000000000..b8eadc3cac36e --- /dev/null +++ b/llvm/test/CodeGen/X86/basic-block-sections-cfg.ll @@ -0,0 +1,40 @@ +; BB section test with CFG. 
+; +;; Profile for version 1: +; RUN: echo 'v1' > %t +; RUN: echo 'f foo' >> %t +; RUN: echo 'g 0:10,1:9,2:1 1:8,3:8 2:2,3:2 3:11' >> %t +; RUN: echo 'c 0 2 3' >> %t +; +; RUN: llc < %s -O0 -mtriple=x86_64-pc-linux -function-sections -basic-block-sections=%t | FileCheck %s +; +define void @foo(i1 zeroext) nounwind { + %2 = alloca i8, align 1 + %3 = zext i1 %0 to i8 + store i8 %3, ptr %2, align 1 + %4 = load i8, ptr %2, align 1 + %5 = trunc i8 %4 to i1 + br i1 %5, label %6, label %8 + +6: ; preds = %1 + %7 = call i32 @bar() + br label %10 + +8: ; preds = %1 + %9 = call i32 @baz() + br label %10 + +10: ; preds = %8, %6 + ret void +} + +declare i32 @bar() #1 + +declare i32 @baz() #1 + +; CHECK: .section .text.foo,"ax",@progbits +; CHECK: callq baz +; CHECK: retq +; CHECK: .section .text.split.foo,"ax",@progbits +; CHECK: callq bar + diff --git a/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll index d6f3d5010b556..751ab76722c07 100644 --- a/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll +++ b/llvm/test/CodeGen/X86/basic-block-sections-clusters-error.ll @@ -57,6 +57,19 @@ ; RUN: echo 'p 1 2 3 2' >> %t13 ; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t13 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR13 ; CHECK-ERROR13: LLVM ERROR: invalid profile {{.*}} at line 4: duplicate cloned block in path: '2' +; RUN: echo 'v1' > %t14 +; RUN: echo 'f dummy1' >> %t14 +; RUN: echo 'c 0 1' >> %t14 +; RUN: echo 'g 0,1:2' >> %t14 +; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t14 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR14 +; CHECK-ERROR14: LLVM ERROR: invalid profile {{.*}} at line 4: unsigned integer expected: '' +; RUN: echo 'v1' > %t15 +; RUN: echo 'f dummy1' >> %t15 +; RUN: echo 'c 0 1' >> %t15 +; RUN: echo 'g 0:4,1:2:3' >> %t15 +; RUN: not --crash llc < %s -O0 -mtriple=x86_64 -function-sections -basic-block-sections=%t15 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR15 +; CHECK-ERROR15: LLVM ERROR: invalid profile {{.*}} at line 4: unsigned integer expected: '2:3' + define i32 @dummy1(i32 %x, i32 %y, i32 %z) { entry: diff --git a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll index 11362873fb151..f0dbc31222c89 100644 --- a/llvm/test/CodeGen/X86/call-graph-section-assembly.ll +++ b/llvm/test/CodeGen/X86/call-graph-section-assembly.ll @@ -1,22 +1,34 @@ -;; Test if temporary labels are generated for each indirect callsite with a callee_type metadata. -;; Test if the .callgraph section contains the MD5 hash of callee type ids generated from -;; generalized type id strings. +;; Test if temporary labels are generated for each indirect callsite. +;; Test if the MD5 hash of the callee's type id in the .callgraph section +;; is correctly paired with the temporary label generated for the corresponding indirect +;; call site annotated with !callee_type metadata. +;; Test if the .callgraph section contains unique direct callees. ; RUN: llc -mtriple=x86_64-unknown-linux --call-graph-section -o - < %s | FileCheck %s +declare !type !0 void @direct_foo() +declare !type !1 i32 @direct_bar(i8) +declare !type !2 ptr @direct_baz(ptr) + ; CHECK: ball: ; CHECK-NEXT: [[LABEL_FUNC:\.Lfunc_begin[0-9]+]]: define ptr @ball() { entry: + call void @direct_foo() %fp_foo_val = load ptr, ptr null, align 8 ; CHECK: [[LABEL_TMP0:\.L.*]]: - call void (...)
%fp_foo_val(), !callee_type !0 + call void (...) %fp_foo_val(), !callee_type !0 + call void @direct_foo() %fp_bar_val = load ptr, ptr null, align 8 ; CHECK: [[LABEL_TMP1:\.L.*]]: - %call_fp_bar = call i32 %fp_bar_val(i8 0), !callee_type !2 + %call_fp_bar = call i32 %fp_bar_val(i8 0), !callee_type !2 + %call_fp_bar_direct = call i32 @direct_bar(i8 1) %fp_baz_val = load ptr, ptr null, align 8 ; CHECK: [[LABEL_TMP2:\.L.*]]: %call_fp_baz = call ptr %fp_baz_val(ptr null), !callee_type !4 + call void @direct_foo() + %call_fp_baz_direct = call ptr @direct_baz(ptr null) + call void @direct_foo() ret ptr %call_fp_baz } @@ -41,3 +53,8 @@ entry: ;; Test for MD5 hash of _ZTSFPvS_E.generalized and the generated temporary callsite label. ; CHECK-NEXT: .quad 8646233951371320954 ; CHECK-NEXT: .quad [[LABEL_TMP2]] +;; Test for number of direct calls and {callsite_label, callee} pairs. +; CHECK-NEXT: .quad 3 +; CHECK-NEXT: .quad direct_foo +; CHECK-NEXT: .quad direct_bar +; CHECK-NEXT: .quad direct_baz diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll index ff9f995c4765b..51a8bf5b48415 100644 --- a/llvm/test/CodeGen/X86/combine-add.ll +++ b/llvm/test/CodeGen/X86/combine-add.ll @@ -235,10 +235,10 @@ define void @PR52039(ptr %pa, ptr %pb) { ; SSE-NEXT: psubd %xmm1, %xmm3 ; SSE-NEXT: psubd %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: paddd %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: paddd %xmm1, %xmm1 ; SSE-NEXT: paddd %xmm3, %xmm1 ; SSE-NEXT: movdqu %xmm3, 16(%rsi) ; SSE-NEXT: movdqu %xmm2, (%rsi) diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll index 8e4a50ea266c3..ae4d24f91ffc0 100644 --- a/llvm/test/CodeGen/X86/combine-mul.ll +++ b/llvm/test/CodeGen/X86/combine-mul.ll @@ -81,7 +81,7 @@ define <4 x i64> @combine_vec_mul_pow2c(<4 x i64> %x) { ; SSE-LABEL: combine_vec_mul_pow2c: ; SSE: # %bb.0: ; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: paddq %xmm0, %xmm2 +; SSE-NEXT: paddq %xmm2, %xmm2 ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psllq $4, %xmm2 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 98187d61c1f84..6bcbfe1808933 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -2187,13 +2187,13 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] ; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,2,2,2,128,2,128] ; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: paddw %xmm4, %xmm4 -; SSE41-NEXT: pmovsxbw %xmm1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4,5],xmm4[6],xmm2[7] +; SSE41-NEXT: pmovsxbw %xmm1, %xmm0 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: paddw %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2],xmm0[3,4,5],xmm2[6],xmm0[7] ; SSE41-NEXT: psrlw $8, 
%xmm2 ; SSE41-NEXT: packuswb %xmm3, %xmm2 ; SSE41-NEXT: paddb %xmm1, %xmm2 @@ -2201,15 +2201,14 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE41-NEXT: psraw $8, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: paddw %xmm0, %xmm3 -; SSE41-NEXT: psllw $7, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7] -; SSE41-NEXT: psrlw $8, %xmm0 +; SSE41-NEXT: psllw $7, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7] +; SSE41-NEXT: psrlw $8, %xmm3 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE41-NEXT: psraw $8, %xmm2 ; SSE41-NEXT: psllw $7, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: packuswb %xmm0, %xmm2 +; SSE41-NEXT: packuswb %xmm3, %xmm2 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] @@ -2225,18 +2224,17 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,2,2,2,2,128,2,128] ; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpmovsxbw %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5],xmm2[6],xmm3[7] +; AVX1-NEXT: vpmovsxbw %xmm0, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5],xmm3[6],xmm2[7] ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 -; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7] +; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7] ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/debug-loclists.ll b/llvm/test/CodeGen/X86/debug-loclists.ll index 406bce587b904..62388a4c91e0c 100644 --- a/llvm/test/CodeGen/X86/debug-loclists.ll +++ b/llvm/test/CodeGen/X86/debug-loclists.ll @@ -2,6 +2,10 @@ ; RUN: llvm-dwarfdump -v -debug-info -debug-loclists %t | \ ; RUN: FileCheck %s --check-prefixes=CHECK,DWARF32 +; RUN: llc -mtriple=x86_64-pc-mingw -filetype=obj -function-sections -o %t -experimental-debug-variable-locations=true < %s +; RUN: llvm-dwarfdump -v -debug-info -debug-loclists %t | \ +; RUN: FileCheck %s --check-prefixes=CHECK,DWARF32 + ; RUN: llc -dwarf64 -mtriple=x86_64-pc-linux -filetype=obj -function-sections -o %t 
-experimental-debug-variable-locations=true < %s ; RUN: llvm-dwarfdump -v -debug-info -debug-loclists %t | \ ; RUN: FileCheck %s --check-prefixes=CHECK,DWARF64 @@ -10,6 +14,10 @@ ; RUN: llvm-dwarfdump -v -debug-info -debug-loclists %t | \ ; RUN: FileCheck %s --check-prefixes=DWO,DWO32 +; RUN: llc -dwarf-version=5 -split-dwarf-file=foo.dwo -mtriple=x86_64-pc-mingw -filetype=obj -function-sections -o %t -experimental-debug-variable-locations=true < %s +; RUN: llvm-dwarfdump -v -debug-info -debug-loclists %t | \ +; RUN: FileCheck %s --check-prefixes=DWO,DWO32 + ; RUN: llc -dwarf64 -dwarf-version=5 -split-dwarf-file=foo.dwo -mtriple=x86_64-pc-linux -filetype=obj -function-sections -o %t -experimental-debug-variable-locations=true < %s ; RUN: llvm-dwarfdump -v -debug-info -debug-loclists %t | \ ; RUN: FileCheck %s --check-prefixes=DWO,DWO64 @@ -17,20 +25,20 @@ ; CHECK: DW_TAG_variable ; DWARF32-NEXT: DW_AT_location [DW_FORM_loclistx] (indexed (0x0) loclist = 0x00000018: ; DWARF64-NEXT: DW_AT_location [DW_FORM_loclistx] (indexed (0x0) loclist = 0x0000002c: -; CHECK-NEXT: [0x0000000000000000, 0x0000000000000003) ".text._Z2f1ii": DW_OP_consts +5, DW_OP_stack_value) +; CHECK-NEXT: [0x0000000000000000, 0x0000000000000003) ".text{{[.$]}}_Z2f1ii": DW_OP_consts +5, DW_OP_stack_value) ; CHECK-NEXT: DW_AT_name {{.*}} "x" ; CHECK: DW_TAG_variable ; DWARF32-NEXT: DW_AT_location [DW_FORM_loclistx] (indexed (0x1) loclist = 0x00000020: ; DWARF64-NEXT: DW_AT_location [DW_FORM_loclistx] (indexed (0x1) loclist = 0x00000034: -; CHECK-NEXT: [0x0000000000000000, 0x0000000000000003) ".text._Z2f1ii": DW_OP_consts +3, DW_OP_stack_value -; CHECK-NEXT: [0x0000000000000003, 0x0000000000000004) ".text._Z2f1ii": DW_OP_consts +4, DW_OP_stack_value) +; CHECK-NEXT: [0x0000000000000000, 0x0000000000000003) ".text{{[.$]}}_Z2f1ii": DW_OP_consts +3, DW_OP_stack_value +; CHECK-NEXT: [0x0000000000000003, 0x0000000000000004) ".text{{[.$]}}_Z2f1ii": DW_OP_consts +4, DW_OP_stack_value) ; CHECK-NEXT: DW_AT_name {{.*}} "y" ; CHECK: DW_TAG_variable ; DWARF32-NEXT: DW_AT_location [DW_FORM_loclistx] (indexed (0x2) loclist = 0x00000031: ; DWARF64-NEXT: DW_AT_location [DW_FORM_loclistx] (indexed (0x2) loclist = 0x00000045: -; CHECK-NEXT: [0x0000000000000003, 0x0000000000000004) ".text._Z2f1ii": DW_OP_reg0 RAX) +; CHECK-NEXT: [0x0000000000000003, 0x0000000000000004) ".text{{[.$]}}_Z2f1ii": DW_OP_reg0 RAX) ; CHECK-NEXT: DW_AT_name {{.*}} "r" ; CHECK: .debug_loclists contents: diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll index 3aa77c3955c63..7bd22d57347b3 100644 --- a/llvm/test/CodeGen/X86/dpbusd.ll +++ b/llvm/test/CodeGen/X86/dpbusd.ll @@ -1,40 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=AVXVNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=AVX512,AVX512VNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-mattr=+avx512vnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VLVNNI define i32 @no_dpbusd(ptr%a, ptr%b, i32 %c, i32 %n) { -; AVXVNNI-LABEL: no_dpbusd: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq -; -; AVX512-LABEL: no_dpbusd: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: addl %edx, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; CHECK-LABEL: no_dpbusd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; CHECK-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: addl %edx, %eax +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq entry: %0 = load <16 x i8>, ptr %a, align 16 %1 = zext <16 x i8> %0 to <16 x i32> @@ -99,25 +84,44 @@ entry: } define i32 @mul_zext(ptr%a, ptr%b, i32 %c, i32 %n) { -; AVXVNNI-LABEL: mul_zext: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; 
AVXVNNI-NEXT: vpmovsxbw (%rsi), %ymm1 -; AVXVNNI-NEXT: vpmullw %ymm0, %ymm1, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVXVNNI-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_zext: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVXVNNI-AVX-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; AVXVNNI-AVX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVXVNNI-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX-NEXT: addl %edx, %eax +; AVXVNNI-AVX-NEXT: vzeroupper +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_zext: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVXVNNI-AVX512-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX512-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVXVNNI-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVXVNNI-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVXVNNI-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX512-NEXT: addl %edx, %eax +; AVXVNNI-AVX512-NEXT: vzeroupper +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512-LABEL: mul_zext: ; AVX512: # %bb.0: # %entry @@ -153,25 +157,44 @@ entry: } define i32 @mul_sext(ptr%a, ptr%b, i32 %c, i32 %n) { -; 
AVXVNNI-LABEL: mul_sext: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVXVNNI-NEXT: vpmovsxbw (%rsi), %ymm1 -; AVXVNNI-NEXT: vpmullw %ymm0, %ymm1, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpmovsxwd %xmm1, %ymm1 -; AVXVNNI-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVXVNNI-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_sext: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVXVNNI-AVX-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVXVNNI-AVX-NEXT: vpmovsxwd %xmm0, %ymm0 +; AVXVNNI-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX-NEXT: addl %edx, %eax +; AVXVNNI-AVX-NEXT: vzeroupper +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_sext: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVXVNNI-AVX512-NEXT: vpmovsxbw (%rsi), %ymm1 +; AVXVNNI-AVX512-NEXT: vpmullw %ymm0, %ymm1, %ymm0 +; AVXVNNI-AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 +; AVXVNNI-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVXVNNI-AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVXVNNI-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX512-NEXT: addl %edx, %eax +; AVXVNNI-AVX512-NEXT: vzeroupper +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512-LABEL: mul_sext: ; AVX512: # %bb.0: # %entry @@ -312,17 +335,30 @@ entry: declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) define i32 @vpdpbusd_128(ptr%a, ptr%b, i32 %c, i32 %n) { -; AVXVNNI-LABEL: vpdpbusd_128: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVXVNNI-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVXVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] -; AVXVNNI-NEXT: vpblendd {{.*#+}} 
xmm0 = xmm0[0],xmm2[1,2,3] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 -; AVXVNNI-NEXT: vmovd %xmm2, %eax -; AVXVNNI-NEXT: addl %edx, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: vpdpbusd_128: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVXVNNI-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVXVNNI-AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 +; AVXVNNI-AVX-NEXT: vmovd %xmm2, %eax +; AVXVNNI-AVX-NEXT: addl %edx, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: vpdpbusd_128: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVXVNNI-AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVXVNNI-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; AVXVNNI-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %xmm1, %xmm0, %xmm2 +; AVXVNNI-AVX512-NEXT: vmovd %xmm2, %eax +; AVXVNNI-AVX512-NEXT: addl %edx, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: vpdpbusd_128: ; AVX512VNNI: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll index 456e6e8f263aa..bb47df59eefad 100644 --- a/llvm/test/CodeGen/X86/dpbusd_const.ll +++ b/llvm/test/CodeGen/X86/dpbusd_const.ll @@ -1,20 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=ALL,AVXVNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VNNI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni -mattr=+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VLVNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVXVNNI,AVXVNNI-AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VNNI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512VLVNNI define i32 @mul_4xi8_zc_exceed(<4 x i8> %a, i32 %c) { -; ALL-LABEL: mul_4xi8_zc_exceed: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,128,0] -; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: addl %edi, %eax -; ALL-NEXT: retq +; CHECK-LABEL: mul_4xi8_zc_exceed: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,128,0] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; 
CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: retq entry: %0 = zext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> %0, @@ -24,14 +25,24 @@ entry: } define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_zc: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_4xi8_zc: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_4xi8_zc: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi8_zc: ; AVX512VNNI: # %bb.0: # %entry @@ -62,16 +73,26 @@ entry: } define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi4_cz: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVXVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_4xi4_cz: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVXVNNI-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_4xi4_cz: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpmovdb %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi4_cz: ; AVX512VNNI: # %bb.0: # %entry @@ -104,15 +125,26 @@ entry: } define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_cs: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl 
%edi, %eax -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_4xi8_cs: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX-NEXT: vmovd {{.*#+}} xmm2 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 +; AVXVNNI-AVX-NEXT: vmovd %xmm1, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_4xi8_cs: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVXVNNI-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; AVXVNNI-AVX512-NEXT: vmovd {{.*#+}} xmm1 = [16,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVXVNNI-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %xmm0, %xmm1, %xmm2 +; AVXVNNI-AVX512-NEXT: vmovd %xmm2, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi8_cs: ; AVX512VNNI: # %bb.0: # %entry @@ -145,17 +177,17 @@ entry: } define i32 @mul_4xi8_cs_exceed(<4 x i8> %a, i32 %c) { -; ALL-LABEL: mul_4xi8_cs_exceed: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vpmovsxbd %xmm0, %xmm0 -; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,256,0] -; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: addl %edi, %eax -; ALL-NEXT: retq +; CHECK-LABEL: mul_4xi8_cs_exceed: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 +; CHECK-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,256,0] +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vmovd %xmm0, %eax +; CHECK-NEXT: addl %edi, %eax +; CHECK-NEXT: retq entry: %0 = sext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> , %0 @@ -265,24 +297,44 @@ entry: } define i32 @mul_64xi8_zc(<64 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_64xi8_zc: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64] -; AVXVNNI-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVXVNNI-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4 -; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3 -; AVXVNNI-NEXT: vpaddd %ymm4, %ymm3, %ymm0 -; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVXVNNI-NEXT: vmovd %xmm0, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: vzeroupper -; AVXVNNI-NEXT: retq +; AVXVNNI-AVX-LABEL: mul_64xi8_zc: +; AVXVNNI-AVX: # %bb.0: # %entry +; AVXVNNI-AVX-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64] +; AVXVNNI-AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVXVNNI-AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4 +; AVXVNNI-AVX-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3 +; AVXVNNI-AVX-NEXT: vpaddd %ymm4, %ymm3, %ymm0 +; AVXVNNI-AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; 
AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX-NEXT: addl %edi, %eax +; AVXVNNI-AVX-NEXT: vzeroupper +; AVXVNNI-AVX-NEXT: retq +; +; AVXVNNI-AVX512-LABEL: mul_64xi8_zc: +; AVXVNNI-AVX512: # %bb.0: # %entry +; AVXVNNI-AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVXVNNI-AVX512-NEXT: vpbroadcastd {{.*#+}} ymm2 = [0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64,0,1,2,64] +; AVXVNNI-AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVXVNNI-AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm4 +; AVXVNNI-AVX512-NEXT: {vex} vpdpbusd %ymm2, %ymm0, %ymm3 +; AVXVNNI-AVX512-NEXT: vpaddd %ymm4, %ymm3, %ymm0 +; AVXVNNI-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVXVNNI-AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-AVX512-NEXT: vmovd %xmm0, %eax +; AVXVNNI-AVX512-NEXT: addl %edi, %eax +; AVXVNNI-AVX512-NEXT: vzeroupper +; AVXVNNI-AVX512-NEXT: retq ; ; AVX512-LABEL: mul_64xi8_zc: ; AVX512: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/fadd-combines.ll b/llvm/test/CodeGen/X86/fadd-combines.ll index 1082177e3da19..2c06c538ae10d 100644 --- a/llvm/test/CodeGen/X86/fadd-combines.ll +++ b/llvm/test/CodeGen/X86/fadd-combines.ll @@ -5,7 +5,7 @@ define float @fadd_zero_f32(float %x) #0 { ; CHECK-LABEL: fadd_zero_f32: ; CHECK: # %bb.0: ; CHECK-NEXT: retq - %y = fadd float %x, 0.0 + %y = fadd nsz float %x, 0.0 ret float %y } @@ -13,7 +13,7 @@ define <4 x float> @fadd_zero_4f32(<4 x float> %x) #0 { ; CHECK-LABEL: fadd_zero_4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: retq - %y = fadd <4 x float> %x, zeroinitializer + %y = fadd nsz <4 x float> %x, zeroinitializer ret <4 x float> %y } @@ -31,8 +31,8 @@ define float @fadd_2const_f32(float %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fadd float %x, 1.0 - %z = fadd float %y, 2.0 + %y = fadd reassoc nsz float %x, 1.0 + %z = fadd reassoc nsz float %y, 2.0 ret float %z } @@ -45,8 +45,8 @@ define <4 x float> @fadd_2const_4f32(<4 x float> %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fadd <4 x float> %x, - %z = fadd <4 x float> %y, + %y = fadd reassoc nsz <4 x float> %x, + %z = fadd reassoc nsz <4 x float> %y, ret <4 x float> %z } @@ -56,8 +56,8 @@ define float @fadd_x_fmul_x_c_f32(float %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fmul float %x, 2.0 - %z = fadd float %x, %y + %y = fmul reassoc nsz float %x, 2.0 + %z = fadd reassoc nsz float %x, %y ret float %z } @@ -70,8 +70,8 @@ define <4 x float> @fadd_x_fmul_x_c_4f32(<4 x float> %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fmul <4 x float> %x, - %z = fadd <4 x float> %x, %y + %y = fmul reassoc nsz <4 x float> %x, + %z = fadd reassoc nsz <4 x float> %x, %y ret <4 x float> %z } @@ -81,8 +81,8 @@ define float @fadd_fmul_x_c_x_f32(float %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fmul float %x, 2.0 - %z = 
fadd float %y, %x + %y = fmul reassoc nsz float %x, 2.0 + %z = fadd reassoc nsz float %y, %x ret float %z } @@ -95,8 +95,8 @@ define <4 x float> @fadd_fmul_x_c_x_4f32(<4 x float> %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fmul <4 x float> %x, - %z = fadd <4 x float> %y, %x + %y = fmul reassoc nsz <4 x float> %x, + %z = fadd reassoc nsz <4 x float> %y, %x ret <4 x float> %z } @@ -106,9 +106,9 @@ define float @fadd_fadd_x_x_fmul_x_c_f32(float %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fadd float %x, %x - %z = fmul float %x, 2.0 - %w = fadd float %y, %z + %y = fadd reassoc nsz float %x, %x + %z = fmul reassoc nsz float %x, 2.0 + %w = fadd reassoc nsz float %y, %z ret float %w } @@ -121,9 +121,9 @@ define <4 x float> @fadd_fadd_x_x_fmul_x_c_4f32(<4 x float> %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fadd <4 x float> %x, %x - %z = fmul <4 x float> %x, - %w = fadd <4 x float> %y, %z + %y = fadd reassoc nsz <4 x float> %x, %x + %z = fmul reassoc nsz <4 x float> %x, + %w = fadd reassoc nsz <4 x float> %y, %z ret <4 x float> %w } @@ -133,9 +133,9 @@ define float @fadd_fmul_x_c_fadd_x_x_f32(float %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fadd float %x, %x - %z = fmul float %x, 2.0 - %w = fadd float %z, %y + %y = fadd reassoc nsz float %x, %x + %z = fmul reassoc nsz float %x, 2.0 + %w = fadd reassoc nsz float %z, %y ret float %w } @@ -148,9 +148,9 @@ define <4 x float> @fadd_fmul_x_c_fadd_x_x_4f32(<4 x float> %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fadd <4 x float> %x, %x - %z = fmul <4 x float> %x, - %w = fadd <4 x float> %z, %y + %y = fadd reassoc nsz <4 x float> %x, %x + %z = fmul reassoc nsz <4 x float> %x, + %w = fadd reassoc nsz <4 x float> %z, %y ret <4 x float> %w } @@ -160,8 +160,8 @@ define float @fadd_x_fadd_x_x_f32(float %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fadd float %x, %x - %z = fadd float %x, %y + %y = fadd reassoc nsz float %x, %x + %z = fadd reassoc nsz float %x, %y ret float %z } @@ -174,8 +174,8 @@ define <4 x float> @fadd_x_fadd_x_x_4f32(<4 x float> %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fadd <4 x float> %x, %x - %z = fadd <4 x float> %x, %y + %y = fadd reassoc nsz <4 x float> %x, %x + %z = fadd reassoc nsz <4 x float> %x, %y ret <4 x float> %z } @@ -185,8 +185,8 @@ define float @fadd_fadd_x_x_x_f32(float %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fadd float %x, %x - %z = fadd float %y, %x + %y = fadd reassoc nsz float %x, %x + %z = fadd reassoc nsz float %y, %x ret float %z } @@ -199,8 +199,8 @@ define <4 x float> @fadd_fadd_x_x_x_4f32(<4 x float> %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fadd <4 x float> %x, %x - %z = fadd <4 x float> %y, %x + %y = fadd reassoc nsz <4 x float> %x, %x + %z = fadd reassoc nsz <4 x float> %y, %x ret <4 x float> %z } @@ -210,8 +210,8 @@ define float @fadd_fadd_x_x_fadd_x_x_f32(float %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fadd float %x, %x - %z = fadd float %y, %y 
+ %y = fadd reassoc nsz float %x, %x + %z = fadd reassoc nsz float %y, %y ret float %z } @@ -224,8 +224,8 @@ define <4 x float> @fadd_fadd_x_x_fadd_x_x_4f32(<4 x float> %x) #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %y = fadd <4 x float> %x, %x - %z = fadd <4 x float> %y, %y + %y = fadd reassoc nsz <4 x float> %x, %x + %z = fadd reassoc nsz <4 x float> %y, %y ret <4 x float> %z } @@ -241,9 +241,9 @@ define float @fadd_const_multiuse_attr(float %x) #0 { ; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: retq - %a1 = fadd float %x, 42.0 - %a2 = fadd float %a1, 17.0 - %a3 = fadd float %a1, %a2 + %a1 = fadd reassoc nsz float %x, 42.0 + %a2 = fadd reassoc nsz float %a1, 17.0 + %a3 = fadd reassoc nsz float %a1, %a2 ret float %a3 } @@ -275,4 +275,4 @@ define <2 x double> @fmul2_negated_vec(<2 x double> %a, <2 x double> %b, <2 x do ret <2 x double> %sub } -attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "no-signed-zeros-fp-math"="true" } +attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" } diff --git a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll index 54d82b0c1c929..c66473e9edd19 100644 --- a/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll +++ b/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll @@ -1756,263 +1756,131 @@ define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind ; ; AVX512-LABEL: test_fmaximumnum_v4f16: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $56, %rsp -; AVX512-NEXT: vmovdqa %xmm1, %xmm5 -; AVX512-NEXT: vmovdqa %xmm0, %xmm6 -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm0 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm1 -; AVX512-NEXT: seta %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm2 -; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm0 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,3,3,3] -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; 
AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm1 -; AVX512-NEXT: seta %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX512-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm2 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 -; AVX512-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm2 -; AVX512-NEXT: seta %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm3 -; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm5[1,0] -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm6[1,0] +; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm2 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm13 -; AVX512-NEXT: vcvtph2ps %xmm13, %xmm2 -; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm12 -; AVX512-NEXT: vcvtph2ps %xmm12, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm2 -; AVX512-NEXT: seta %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpsrlq $48, %xmm5, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm0 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpsrlq $48, %xmm6, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm11 -; AVX512-NEXT: vcvtph2ps %xmm11, %xmm1 -; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm15 
-; AVX512-NEXT: vcvtph2ps %xmm15, %xmm7 -; AVX512-NEXT: vucomiss %xmm7, %xmm1 -; AVX512-NEXT: seta %al +; AVX512-NEXT: vmovd %xmm2, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm1, %xmm7, %xmm7 {%k1} -; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm5[1,1,3,3] -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm0 -; AVX512-NEXT: setp %al +; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vmovdqa %xmm2, %xmm4 +; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} +; AVX512-NEXT: vmaxss %xmm4, %xmm3, %xmm2 +; AVX512-NEXT: vcmpordss %xmm3, %xmm3, %k1 +; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm3, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovshdup {{.*#+}} xmm1 = xmm6[1,1,3,3] -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm9 -; AVX512-NEXT: vcvtph2ps %xmm9, %xmm4 -; AVX512-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm10 -; AVX512-NEXT: vcvtph2ps %xmm10, %xmm3 -; AVX512-NEXT: vucomiss %xmm3, %xmm4 -; AVX512-NEXT: seta %al +; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[3,3,3,3] +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512-NEXT: vmovdqa %xmm3, %xmm5 +; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} +; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vmaxss %xmm5, %xmm4, %xmm3 +; AVX512-NEXT: vcmpordss %xmm4, %xmm4, %k1 +; AVX512-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm3, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al ; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512-NEXT: vmovdqa %xmm3, %xmm5 +; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} +; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vmaxss %xmm5, %xmm4, %xmm3 +; AVX512-NEXT: vcmpordss %xmm4, %xmm4, %k1 ; AVX512-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1} -; AVX512-NEXT: vcvtph2ps %xmm5, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm0 -; AVX512-NEXT: setp %al +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0] +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512-NEXT: vmovd %xmm4, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vcvtph2ps %xmm6, %xmm4 -; AVX512-NEXT: vucomiss %xmm4, %xmm4 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm0, %xmm4, %xmm4 {%k2} +; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] +; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX512-NEXT: vmovdqa %xmm4, %xmm6 +; AVX512-NEXT: vmovss %xmm5, 
%xmm6, %xmm6 {%k1} +; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} +; AVX512-NEXT: vmaxss %xmm6, %xmm5, %xmm4 +; AVX512-NEXT: vcmpordss %xmm5, %xmm5, %k1 +; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1} ; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 -; AVX512-NEXT: vcvtph2ps %xmm4, %xmm1 -; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm8 -; AVX512-NEXT: vcvtph2ps %xmm8, %xmm2 -; AVX512-NEXT: vucomiss %xmm2, %xmm1 -; AVX512-NEXT: seta %al +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm3 +; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512-NEXT: vmovd %xmm3, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} -; AVX512-NEXT: vpsrld $16, %xmm5, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vucomiss %xmm1, %xmm1 -; AVX512-NEXT: setp %al +; AVX512-NEXT: vpsrlq $48, %xmm1, %xmm4 +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512-NEXT: vmovdqa %xmm3, %xmm5 +; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} +; AVX512-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vmaxss %xmm5, %xmm4, %xmm3 +; AVX512-NEXT: vcmpordss %xmm4, %xmm4, %k1 +; AVX512-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] +; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512-NEXT: vmovd %xmm4, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vpsrld $16, %xmm6, %xmm5 +; AVX512-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3] ; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512-NEXT: vucomiss %xmm5, %xmm5 -; AVX512-NEXT: setp %al -; AVX512-NEXT: kmovw %eax, %k2 -; AVX512-NEXT: vmovss %xmm1, %xmm5, %xmm5 {%k2} -; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm6 -; AVX512-NEXT: vcvtph2ps %xmm6, %xmm5 -; AVX512-NEXT: vmovss %xmm5, %xmm1, %xmm1 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm0 -; AVX512-NEXT: vucomiss %xmm0, %xmm5 -; AVX512-NEXT: seta %al +; AVX512-NEXT: vmovdqa %xmm4, %xmm6 +; AVX512-NEXT: vmovss %xmm5, %xmm6, %xmm6 {%k1} +; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} +; AVX512-NEXT: vmaxss %xmm6, %xmm5, %xmm4 +; AVX512-NEXT: vcmpordss %xmm5, %xmm5, %k1 +; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm4 +; AVX512-NEXT: vmovd %xmm4, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al ; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} -; AVX512-NEXT: vcvtps2ph $4, %xmm7, %xmm7 -; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 -; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm5 -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-NEXT: # xmm0 = xmm0[0],mem[0] -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX512-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1],xmm14[2],mem[2],xmm14[3],mem[3] -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload -; AVX512-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1] -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm13[0] -; AVX512-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX512-NEXT: vpcmpeqw %xmm6, %xmm4, %xmm9 -; AVX512-NEXT: vpblendvb %xmm9, %xmm4, %xmm0, %xmm4 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX512-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3] -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm11 # 16-byte Folded Reload -; AVX512-NEXT: # xmm11 = xmm12[0],mem[0],xmm12[1],mem[1],xmm12[2],mem[2],xmm12[3],mem[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm9[0] -; AVX512-NEXT: vpcmpeqw %xmm6, %xmm1, %xmm6 -; AVX512-NEXT: vpblendvb %xmm6, %xmm1, %xmm4, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: movl $65535, %ecx # imm = 0xFFFF -; AVX512-NEXT: movl $0, %edx -; AVX512-NEXT: cmovel %ecx, %edx -; AVX512-NEXT: vcvtph2ps %xmm5, %xmm2 -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: cmovel %ecx, %esi -; AVX512-NEXT: vcvtph2ps %xmm3, %xmm2 -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: movl $0, %edi -; AVX512-NEXT: cmovel %ecx, %edi -; AVX512-NEXT: vcvtph2ps %xmm7, %xmm2 -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: movl $0, %r8d -; AVX512-NEXT: cmovel %ecx, %r8d -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: movl $0, %r9d -; AVX512-NEXT: cmovel %ecx, %r9d -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: movl $0, %r10d -; AVX512-NEXT: cmovel %ecx, %r10d -; AVX512-NEXT: vcvtph2ps (%rsp), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: movl $0, %r11d -; AVX512-NEXT: cmovel %ecx, %r11d -; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX512-NEXT: vucomiss %xmm4, %xmm2 -; AVX512-NEXT: vmovd %esi, %xmm2 -; AVX512-NEXT: vpinsrw $1, %edx, %xmm2, %xmm2 -; AVX512-NEXT: vpinsrw $2, %edi, %xmm2, %xmm2 -; AVX512-NEXT: vpinsrw $3, %r8d, %xmm2, %xmm2 -; AVX512-NEXT: vpinsrw $4, %r9d, %xmm2, %xmm2 -; AVX512-NEXT: vpinsrw $5, %r10d, %xmm2, %xmm2 -; AVX512-NEXT: vpinsrw $6, %r11d, %xmm2, %xmm2 -; AVX512-NEXT: 
cmovel %ecx, %eax -; AVX512-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2 -; AVX512-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm5 +; AVX512-NEXT: vmovdqa %xmm4, %xmm6 +; AVX512-NEXT: vmovss %xmm5, %xmm6, %xmm6 {%k1} +; AVX512-NEXT: vmovss %xmm4, %xmm5, %xmm5 {%k1} +; AVX512-NEXT: vmaxss %xmm6, %xmm5, %xmm4 +; AVX512-NEXT: vcmpordss %xmm5, %xmm5, %k1 +; AVX512-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: testl %eax, %eax +; AVX512-NEXT: sets %al +; AVX512-NEXT: kmovw %eax, %k1 +; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa %xmm0, %xmm5 +; AVX512-NEXT: vmovss %xmm1, %xmm5, %xmm5 {%k1} +; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512-NEXT: vmaxss %xmm5, %xmm1, %xmm0 +; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 +; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512-NEXT: retq ; ; AVX10_2-LABEL: test_fmaximumnum_v4f16: diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll index 3ed98589767fb..9095fb1550e70 100644 --- a/llvm/test/CodeGen/X86/ftrunc.ll +++ b/llvm/test/CodeGen/X86/ftrunc.ll @@ -243,7 +243,7 @@ define <4 x double> @trunc_unsigned_v4f64(<4 x double> %x) #0 { ret <4 x double> %r } -define float @trunc_signed_f32_no_fast_math(float %x) { +define float @trunc_signed_f32_no_fast_math(float %x) nounwind { ; SSE-LABEL: trunc_signed_f32_no_fast_math: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 @@ -259,14 +259,12 @@ define float @trunc_signed_f32_no_fast_math(float %x) { ; X86-AVX1-LABEL: trunc_signed_f32_no_fast_math: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: pushl %eax -; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 ; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 ; X86-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-AVX1-NEXT: vmovss %xmm0, (%esp) ; X86-AVX1-NEXT: flds (%esp) ; X86-AVX1-NEXT: popl %eax -; X86-AVX1-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX1-NEXT: retl %i = fptosi float %x to i32 %r = sitofp i32 %i to float @@ -306,7 +304,7 @@ define float @trunc_signed_f32_nsz(float %x) #0 { ret float %r } -define double @trunc_signed32_f64_no_fast_math(double %x) { +define double @trunc_signed32_f64_no_fast_math(double %x) nounwind { ; SSE-LABEL: trunc_signed32_f64_no_fast_math: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 @@ -322,10 +320,7 @@ define double @trunc_signed32_f64_no_fast_math(double %x) { ; X86-AVX1-LABEL: trunc_signed32_f64_no_fast_math: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: pushl %ebp -; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX1-NEXT: .cfi_offset %ebp, -8 ; X86-AVX1-NEXT: movl %esp, %ebp -; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp ; X86-AVX1-NEXT: andl $-8, %esp ; X86-AVX1-NEXT: subl $8, %esp ; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero @@ -335,7 +330,6 @@ define double @trunc_signed32_f64_no_fast_math(double %x) { ; X86-AVX1-NEXT: fldl (%esp) ; X86-AVX1-NEXT: movl %ebp, %esp ; X86-AVX1-NEXT: popl %ebp -; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4 ; X86-AVX1-NEXT: retl %i = fptosi double %x to i32 %r = sitofp i32 %i to double @@ 
-377,7 +371,7 @@ define double @trunc_signed32_f64_nsz(double %x) #0 { ret double %r } -define double @trunc_f32_signed32_f64_no_fast_math(float %x) { +define double @trunc_f32_signed32_f64_no_fast_math(float %x) nounwind { ; SSE-LABEL: trunc_f32_signed32_f64_no_fast_math: ; SSE: # %bb.0: ; SSE-NEXT: cvttps2dq %xmm0, %xmm0 @@ -393,10 +387,7 @@ define double @trunc_f32_signed32_f64_no_fast_math(float %x) { ; X86-AVX1-LABEL: trunc_f32_signed32_f64_no_fast_math: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: pushl %ebp -; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX1-NEXT: .cfi_offset %ebp, -8 ; X86-AVX1-NEXT: movl %esp, %ebp -; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp ; X86-AVX1-NEXT: andl $-8, %esp ; X86-AVX1-NEXT: subl $8, %esp ; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -406,7 +397,6 @@ define double @trunc_f32_signed32_f64_no_fast_math(float %x) { ; X86-AVX1-NEXT: fldl (%esp) ; X86-AVX1-NEXT: movl %ebp, %esp ; X86-AVX1-NEXT: popl %ebp -; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4 ; X86-AVX1-NEXT: retl %i = fptosi float %x to i32 %r = sitofp i32 %i to double @@ -445,7 +435,7 @@ define double @trunc_f32_signed32_f64_nsz(float %x) #0 { ret double %r } -define float @trunc_f64_signed32_f32_no_fast_math(double %x) { +define float @trunc_f64_signed32_f32_no_fast_math(double %x) nounwind { ; SSE-LABEL: trunc_f64_signed32_f32_no_fast_math: ; SSE: # %bb.0: ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 @@ -461,14 +451,12 @@ define float @trunc_f64_signed32_f32_no_fast_math(double %x) { ; X86-AVX1-LABEL: trunc_f64_signed32_f32_no_fast_math: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: pushl %eax -; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 ; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X86-AVX1-NEXT: vcvttpd2dq %xmm0, %xmm0 ; X86-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-AVX1-NEXT: vmovss %xmm0, (%esp) ; X86-AVX1-NEXT: flds (%esp) ; X86-AVX1-NEXT: popl %eax -; X86-AVX1-NEXT: .cfi_def_cfa_offset 4 ; X86-AVX1-NEXT: retl %i = fptosi double %x to i32 %r = sitofp i32 %i to float @@ -503,7 +491,7 @@ define float @trunc_f64_signed32_f32_nsz(double %x) #0 { ret float %r } -define double @trunc_signed_f64_no_fast_math(double %x) { +define double @trunc_signed_f64_no_fast_math(double %x) nounwind { ; SSE-LABEL: trunc_signed_f64_no_fast_math: ; SSE: # %bb.0: ; SSE-NEXT: cvttsd2si %xmm0, %rax @@ -520,10 +508,7 @@ define double @trunc_signed_f64_no_fast_math(double %x) { ; X86-AVX1-LABEL: trunc_signed_f64_no_fast_math: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: pushl %ebp -; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX1-NEXT: .cfi_offset %ebp, -8 ; X86-AVX1-NEXT: movl %esp, %ebp -; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp ; X86-AVX1-NEXT: andl $-8, %esp ; X86-AVX1-NEXT: subl $24, %esp ; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero @@ -537,7 +522,6 @@ define double @trunc_signed_f64_no_fast_math(double %x) { ; X86-AVX1-NEXT: fldl {{[0-9]+}}(%esp) ; X86-AVX1-NEXT: movl %ebp, %esp ; X86-AVX1-NEXT: popl %ebp -; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4 ; X86-AVX1-NEXT: retl %i = fptosi double %x to i64 %r = sitofp i64 %i to double diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll index aebfc7d483d6f..3ece4beb9c22e 100644 --- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll +++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll @@ -1,25 +1,26 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma | FileCheck %s --check-prefixes=X64,AVX +; RUN: llc < %s -mtriple=x86_64-- 
-mattr=+avxifma | FileCheck %s --check-prefixes=X64,AVX,AVXIFMA ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma | FileCheck %s --check-prefixes=X64,AVX512,AVX512-NOVL ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefixes=X64,AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avxifma,+avx512vl | FileCheck %s --check-prefixes=X64,AVX,AVX512-NOIFMA ; 67108863 == (1 << 26) - 1 ; 4503599627370496 == (1 << 52) ; 4503599627370495 == (1 << 52) - 1 define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { -; AVX-LABEL: test_512_combine: -; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm6 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4 -; AVX-NEXT: vpand %ymm6, %ymm3, %ymm0 -; AVX-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5 -; AVX-NEXT: vmovdqa %ymm4, %ymm0 -; AVX-NEXT: vmovdqa %ymm5, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_512_combine: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVXIFMA-NEXT: vpand %ymm6, %ymm0, %ymm0 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4 +; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm0 +; AVXIFMA-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5 +; AVXIFMA-NEXT: vmovdqa %ymm4, %ymm0 +; AVXIFMA-NEXT: vmovdqa %ymm5, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_512_combine: ; AVX512: # %bb.0: @@ -29,6 +30,19 @@ define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { ; AVX512-NEXT: vpmadd52luq %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_512_combine: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpbroadcastq {{.*#+}} zmm3 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863] +; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm1, %zmm1 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm4 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm2, %ymm5 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm3, %ymm4, %ymm5 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm2 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm5, %zmm2, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <8 x i64> %x, splat (i64 67108863) %y_masked = and <8 x i64> %y, splat (i64 67108863) %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked @@ -37,19 +51,19 @@ define <8 x i64> @test_512_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { } define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { -; AVX-LABEL: test_512_combine_v2: -; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm6 = [3,3,3,3] -; AVX-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1125899906842623,1125899906842623,1125899906842623,1125899906842623] -; AVX-NEXT: vpand %ymm7, %ymm0, %ymm0 -; AVX-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4 -; AVX-NEXT: vpand %ymm6, %ymm3, %ymm0 -; AVX-NEXT: vpand %ymm7, %ymm1, %ymm1 -; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5 -; AVX-NEXT: vmovdqa %ymm4, %ymm0 -; AVX-NEXT: vmovdqa %ymm5, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_512_combine_v2: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [3,3,3,3] +; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm2 
+; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1125899906842623,1125899906842623,1125899906842623,1125899906842623] +; AVXIFMA-NEXT: vpand %ymm7, %ymm0, %ymm0 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4 +; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm0 +; AVXIFMA-NEXT: vpand %ymm7, %ymm1, %ymm1 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm1, %ymm5 +; AVXIFMA-NEXT: vmovdqa %ymm4, %ymm0 +; AVXIFMA-NEXT: vmovdqa %ymm5, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_512_combine_v2: ; AVX512: # %bb.0: @@ -58,6 +72,18 @@ define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) ; AVX512-NEXT: vpmadd52luq %zmm1, %zmm0, %zmm2 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_512_combine_v2: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm4, %ymm5, %ymm3 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm2 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <8 x i64> %x, splat (i64 1125899906842623) ; (1 << 50) - 1 %y_masked = and <8 x i64> %y, splat (i64 3) %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked @@ -66,32 +92,32 @@ define <8 x i64> @test_512_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) } define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { -; AVX-LABEL: test_512_no_combine: -; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm6 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495] -; AVX-NEXT: vpand %ymm6, %ymm0, %ymm7 -; AVX-NEXT: vpand %ymm6, %ymm1, %ymm8 -; AVX-NEXT: vpand %ymm6, %ymm2, %ymm9 -; AVX-NEXT: vpand %ymm6, %ymm3, %ymm6 -; AVX-NEXT: vpsrlq $32, %ymm8, %ymm8 -; AVX-NEXT: vpmuludq %ymm3, %ymm8, %ymm8 -; AVX-NEXT: vpsrlq $32, %ymm6, %ymm6 -; AVX-NEXT: vpmuludq %ymm6, %ymm1, %ymm6 -; AVX-NEXT: vpaddq %ymm6, %ymm8, %ymm6 -; AVX-NEXT: vpsllq $32, %ymm6, %ymm6 -; AVX-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vpsrlq $32, %ymm7, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX-NEXT: vpsrlq $32, %ymm9, %ymm7 -; AVX-NEXT: vpmuludq %ymm7, %ymm0, %ymm7 -; AVX-NEXT: vpaddq %ymm3, %ymm7, %ymm3 -; AVX-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm5, %ymm1, %ymm1 -; AVX-NEXT: vpaddq %ymm6, %ymm1, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_512_no_combine: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495] +; AVXIFMA-NEXT: vpand %ymm6, %ymm0, %ymm7 +; AVXIFMA-NEXT: vpand %ymm6, %ymm1, %ymm8 +; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm9 +; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm6 +; AVXIFMA-NEXT: vpsrlq $32, %ymm8, %ymm8 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm8, %ymm8 +; AVXIFMA-NEXT: vpsrlq $32, %ymm6, %ymm6 +; AVXIFMA-NEXT: vpmuludq %ymm6, %ymm1, %ymm6 +; AVXIFMA-NEXT: vpaddq %ymm6, %ymm8, %ymm6 +; AVXIFMA-NEXT: vpsllq $32, %ymm6, %ymm6 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVXIFMA-NEXT: vpsrlq $32, %ymm7, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpsrlq $32, %ymm9, %ymm7 +; AVXIFMA-NEXT: vpmuludq %ymm7, 
%ymm0, %ymm7 +; AVXIFMA-NEXT: vpaddq %ymm3, %ymm7, %ymm3 +; AVXIFMA-NEXT: vpsllq $32, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm5, %ymm1, %ymm1 +; AVXIFMA-NEXT: vpaddq %ymm6, %ymm1, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_512_no_combine: ; AVX512: # %bb.0: @@ -108,6 +134,22 @@ define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm3, %zmm0, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_512_no_combine: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpbroadcastq {{.*#+}} zmm3 = [4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495,4503599627370495] +; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm0, %zmm4 +; AVX512-NOIFMA-NEXT: vpandq %zmm3, %zmm1, %zmm3 +; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm4, %zmm4 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm4, %zmm4 +; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 +; AVX512-NOIFMA-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpsllq $32, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <8 x i64> %x, splat (i64 4503599627370495) %y_masked = and <8 x i64> %y, splat (i64 4503599627370495) %mul = mul nuw nsw <8 x i64> %x_masked, %y_masked @@ -116,27 +158,27 @@ define <8 x i64> @test_512_no_combine(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) } define <8 x i64> @test_512_no_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { -; AVX-LABEL: test_512_no_combine_v2: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlq $32, %ymm1, %ymm6 -; AVX-NEXT: vpmuludq %ymm3, %ymm6, %ymm6 -; AVX-NEXT: vpsrlq $32, %ymm3, %ymm7 -; AVX-NEXT: vpmuludq %ymm7, %ymm1, %ymm7 -; AVX-NEXT: vpaddq %ymm6, %ymm7, %ymm6 -; AVX-NEXT: vpsllq $32, %ymm6, %ymm6 -; AVX-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX-NEXT: vpsrlq $32, %ymm2, %ymm7 -; AVX-NEXT: vpmuludq %ymm7, %ymm0, %ymm7 -; AVX-NEXT: vpaddq %ymm3, %ymm7, %ymm3 -; AVX-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm4, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm5, %ymm1, %ymm1 -; AVX-NEXT: vpaddq %ymm6, %ymm1, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_512_no_combine_v2: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpsrlq $32, %ymm1, %ymm6 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm6, %ymm6 +; AVXIFMA-NEXT: vpsrlq $32, %ymm3, %ymm7 +; AVXIFMA-NEXT: vpmuludq %ymm7, %ymm1, %ymm7 +; AVXIFMA-NEXT: vpaddq %ymm6, %ymm7, %ymm6 +; AVXIFMA-NEXT: vpsllq $32, %ymm6, %ymm6 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVXIFMA-NEXT: vpsrlq $32, %ymm0, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpsrlq $32, %ymm2, %ymm7 +; AVXIFMA-NEXT: vpmuludq %ymm7, %ymm0, %ymm7 +; AVXIFMA-NEXT: vpaddq %ymm3, %ymm7, %ymm3 +; AVXIFMA-NEXT: vpsllq $32, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm5, %ymm1, %ymm1 +; AVXIFMA-NEXT: vpaddq %ymm6, %ymm1, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_512_no_combine_v2: ; AVX512: # %bb.0: @@ -150,6 +192,19 
@@ define <8 x i64> @test_512_no_combine_v2(<8 x i64> %x, <8 x i64> %y, <8 x i64> % ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm3, %zmm0, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_512_no_combine_v2: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm0, %zmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm1, %zmm4 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm4, %zmm0, %zmm4 +; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm4, %zmm3 +; AVX512-NOIFMA-NEXT: vpsllq $32, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: retq %mul = mul <8 x i64> %x, %y %res = add <8 x i64> %mul, %z ret <8 x i64> %res @@ -255,25 +310,25 @@ define <1 x i64> @test_scalar_no_ifma(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) ; 40-bit and 13-bit, too wide define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) { -; AVX-LABEL: test_mixed_width_too_wide: -; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8191,8191,8191,8191] -; AVX-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX-NEXT: vpand %ymm6, %ymm3, %ymm3 -; AVX-NEXT: vpmovzxdq {{.*#+}} ymm6 = [2155905028,2155905036,2155905044,2155905052] -; AVX-NEXT: vpshufb %ymm6, %ymm1, %ymm7 -; AVX-NEXT: vpmuludq %ymm3, %ymm7, %ymm7 -; AVX-NEXT: vpsllq $32, %ymm7, %ymm7 -; AVX-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX-NEXT: vpshufb %ymm6, %ymm0, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm0, %ymm4, %ymm0 -; AVX-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm1, %ymm5, %ymm1 -; AVX-NEXT: vpaddq %ymm7, %ymm1, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_mixed_width_too_wide: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8191,8191,8191,8191] +; AVXIFMA-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVXIFMA-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm6 = [2155905028,2155905036,2155905044,2155905052] +; AVXIFMA-NEXT: vpshufb %ymm6, %ymm1, %ymm7 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm7, %ymm7 +; AVXIFMA-NEXT: vpsllq $32, %ymm7, %ymm7 +; AVXIFMA-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 +; AVXIFMA-NEXT: vpshufb %ymm6, %ymm0, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpsllq $32, %ymm3, %ymm3 +; AVXIFMA-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm3, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm1, %ymm5, %ymm1 +; AVXIFMA-NEXT: vpaddq %ymm7, %ymm1, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_mixed_width_too_wide: ; AVX512: # %bb.0: @@ -286,6 +341,18 @@ define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64 ; AVX512-NEXT: vpaddq %zmm3, %zmm2, %zmm1 ; AVX512-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_mixed_width_too_wide: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm3 +; AVX512-NOIFMA-NEXT: vpsrlq $32, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpsllq $32, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm3, %zmm2, %zmm1 +; AVX512-NOIFMA-NEXT: vpaddq %zmm0, %zmm1, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x40 = and <8 x 
i64> %x, splat (i64 1099511627775) %y13 = and <8 x i64> %y, splat (i64 8191) %mul = mul <8 x i64> %x40, %y13 @@ -294,19 +361,19 @@ define <8 x i64> @test_mixed_width_too_wide(<8 x i64> %x, <8 x i64> %y, <8 x i64 } define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, <8 x i64> %z) { -; AVX-LABEL: test_zext32_inputs_not_safe: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX-NEXT: vpmuludq %ymm5, %ymm4, %ymm4 -; AVX-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 -; AVX-NEXT: vpaddq %ymm4, %ymm2, %ymm0 -; AVX-NEXT: vpaddq %ymm1, %ymm3, %ymm1 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_zext32_inputs_not_safe: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVXIFMA-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVXIFMA-NEXT: vpmuludq %ymm5, %ymm4, %ymm4 +; AVXIFMA-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVXIFMA-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVXIFMA-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 +; AVXIFMA-NEXT: vpaddq %ymm4, %ymm2, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm1, %ymm3, %ymm1 +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_zext32_inputs_not_safe: ; AVX512: # %bb.0: @@ -315,6 +382,14 @@ define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, ; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_zext32_inputs_not_safe: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; AVX512-NOIFMA-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero +; AVX512-NOIFMA-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x = zext <8 x i32> %xi32 to <8 x i64> %y = zext <8 x i32> %yi32 to <8 x i64> %mul = mul <8 x i64> %x, %y @@ -323,36 +398,36 @@ define <8 x i64> @test_zext32_inputs_not_safe(<8 x i32> %xi32, <8 x i32> %yi32, } define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i64> %z) nounwind { -; AVX-LABEL: test_1024_combine_split: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: movq %rsp, %rbp -; AVX-NEXT: andq $-32, %rsp -; AVX-NEXT: subq $32, %rsp -; AVX-NEXT: vmovdqa 112(%rbp), %ymm8 -; AVX-NEXT: vmovdqa 80(%rbp), %ymm9 -; AVX-NEXT: vmovdqa 48(%rbp), %ymm10 -; AVX-NEXT: vmovdqa 16(%rbp), %ymm11 -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm12 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm3, %ymm12, %ymm3 -; AVX-NEXT: vpand %ymm2, %ymm12, %ymm2 -; AVX-NEXT: vpand %ymm1, %ymm12, %ymm1 -; AVX-NEXT: vpand %ymm0, %ymm12, %ymm0 -; AVX-NEXT: vpand %ymm7, %ymm12, %ymm7 -; AVX-NEXT: {vex} vpmadd52luq %ymm7, %ymm3, %ymm8 -; AVX-NEXT: vpand %ymm6, %ymm12, %ymm3 -; AVX-NEXT: {vex} 
vpmadd52luq %ymm3, %ymm2, %ymm9 -; AVX-NEXT: vpand %ymm5, %ymm12, %ymm2 -; AVX-NEXT: {vex} vpmadd52luq %ymm2, %ymm1, %ymm10 -; AVX-NEXT: vpand %ymm4, %ymm12, %ymm1 -; AVX-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm11 -; AVX-NEXT: vmovdqa %ymm11, %ymm0 -; AVX-NEXT: vmovdqa %ymm10, %ymm1 -; AVX-NEXT: vmovdqa %ymm9, %ymm2 -; AVX-NEXT: vmovdqa %ymm8, %ymm3 -; AVX-NEXT: movq %rbp, %rsp -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_1024_combine_split: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: pushq %rbp +; AVXIFMA-NEXT: movq %rsp, %rbp +; AVXIFMA-NEXT: andq $-32, %rsp +; AVXIFMA-NEXT: subq $32, %rsp +; AVXIFMA-NEXT: vmovdqa 112(%rbp), %ymm8 +; AVXIFMA-NEXT: vmovdqa 80(%rbp), %ymm9 +; AVXIFMA-NEXT: vmovdqa 48(%rbp), %ymm10 +; AVXIFMA-NEXT: vmovdqa 16(%rbp), %ymm11 +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm12 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm3, %ymm12, %ymm3 +; AVXIFMA-NEXT: vpand %ymm2, %ymm12, %ymm2 +; AVXIFMA-NEXT: vpand %ymm1, %ymm12, %ymm1 +; AVXIFMA-NEXT: vpand %ymm0, %ymm12, %ymm0 +; AVXIFMA-NEXT: vpand %ymm7, %ymm12, %ymm7 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm7, %ymm3, %ymm8 +; AVXIFMA-NEXT: vpand %ymm6, %ymm12, %ymm3 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm3, %ymm2, %ymm9 +; AVXIFMA-NEXT: vpand %ymm5, %ymm12, %ymm2 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm1, %ymm10 +; AVXIFMA-NEXT: vpand %ymm4, %ymm12, %ymm1 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm0, %ymm11 +; AVXIFMA-NEXT: vmovdqa %ymm11, %ymm0 +; AVXIFMA-NEXT: vmovdqa %ymm10, %ymm1 +; AVXIFMA-NEXT: vmovdqa %ymm9, %ymm2 +; AVXIFMA-NEXT: vmovdqa %ymm8, %ymm3 +; AVXIFMA-NEXT: movq %rbp, %rsp +; AVXIFMA-NEXT: popq %rbp +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_1024_combine_split: ; AVX512: # %bb.0: @@ -366,6 +441,27 @@ define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_1024_combine_split: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpbroadcastq {{.*#+}} zmm6 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863] +; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm1, %zmm1 +; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm3, %zmm3 +; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm2, %zmm2 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm2, %ymm6 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm7 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm4, %ymm8 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm6, %ymm7, %ymm8 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm0 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm3, %ymm2 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm5, %ymm6 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm4, %ymm6 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm3, %ymm1, %ymm5 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm1 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <16 x i64> %x, splat (i64 67108863) %y_masked = and <16 x i64> %y, splat (i64 67108863) %mul = mul <16 x i64> %x_masked, %y_masked @@ -388,13 +484,13 @@ define <1 x i64> @test_not_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %z) { } define <3 x i64> @test_v3i64(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) { -; AVX-LABEL: test_v3i64: -; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX-NEXT: 
vpmuludq %ymm0, %ymm0, %ymm0 -; AVX-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_v3i64: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm1 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpmuludq %ymm0, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVXIFMA-NEXT: retq ; ; AVX512-NOVL-LABEL: test_v3i64: ; AVX512-NOVL: # %bb.0: @@ -410,6 +506,13 @@ define <3 x i64> @test_v3i64(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) { ; AVX512VL-NEXT: vpmuludq %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_v3i64: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512-NOIFMA-NEXT: vpmuludq %ymm0, %ymm0, %ymm0 +; AVX512-NOIFMA-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <3 x i64> %x, splat (i64 67108863) %y_masked = and <3 x i64> %x, splat (i64 67108863) %mul = mul <3 x i64> %x_masked, %y_masked @@ -418,35 +521,35 @@ define <3 x i64> @test_v3i64(<3 x i64> %x, <3 x i64> %y, <3 x i64> %z) { } define <5 x i64> @test_v5i64(<5 x i64> %x, <5 x i64> %y, <5 x i64> %z) { -; AVX-LABEL: test_v5i64: -; AVX: # %bb.0: -; AVX-NEXT: movq %rdi, %rax -; AVX-NEXT: vmovq %r8, %xmm0 -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vmovq %rsi, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm2 -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF -; AVX-NEXT: vmovq %rcx, %xmm3 -; AVX-NEXT: vmovq %r9, %xmm4 -; AVX-NEXT: vpand %xmm3, %xmm4, %xmm3 -; AVX-NEXT: vpsrlq $32, %xmm3, %xmm4 -; AVX-NEXT: vpmuludq %xmm4, %xmm3, %xmm4 -; AVX-NEXT: vpsllq $33, %xmm4, %xmm4 -; AVX-NEXT: vpmuludq %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX-NEXT: vpaddq %xmm4, %xmm1, %xmm1 -; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm2 -; AVX-NEXT: vmovdqa %ymm2, (%rdi) -; AVX-NEXT: vmovq %xmm1, 32(%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_v5i64: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: movq %rdi, %rax +; AVXIFMA-NEXT: vmovq %r8, %xmm0 +; AVXIFMA-NEXT: vmovq %rcx, %xmm1 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVXIFMA-NEXT: vmovq %rdx, %xmm1 +; AVXIFMA-NEXT: vmovq %rsi, %xmm2 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVXIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVXIFMA-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm2 +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm3 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVXIFMA-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF +; AVXIFMA-NEXT: vmovq %rcx, %xmm3 +; AVXIFMA-NEXT: vmovq %r9, %xmm4 +; AVXIFMA-NEXT: vpand %xmm3, %xmm4, %xmm3 +; AVXIFMA-NEXT: vpsrlq $32, %xmm3, %xmm4 +; AVXIFMA-NEXT: vpmuludq %xmm4, %xmm3, %xmm4 +; AVXIFMA-NEXT: vpsllq $33, %xmm4, %xmm4 +; AVXIFMA-NEXT: vpmuludq %xmm3, %xmm3, %xmm3 +; AVXIFMA-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVXIFMA-NEXT: vpaddq %xmm4, %xmm1, %xmm1 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm2 +; AVXIFMA-NEXT: vmovdqa %ymm2, (%rdi) +; AVXIFMA-NEXT: vmovq %xmm1, 32(%rdi) +; 
AVXIFMA-NEXT: vzeroupper +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_v5i64: ; AVX512: # %bb.0: @@ -454,6 +557,13 @@ define <5 x i64> @test_v5i64(<5 x i64> %x, <5 x i64> %y, <5 x i64> %z) { ; AVX512-NEXT: vpmuludq %zmm0, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_v5i64: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm0, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <5 x i64> %x, splat (i64 67108863) %y_masked = and <5 x i64> %x, splat (i64 67108863) %mul = mul <5 x i64> %x_masked, %y_masked @@ -462,30 +572,30 @@ define <5 x i64> @test_v5i64(<5 x i64> %x, <5 x i64> %y, <5 x i64> %z) { } define <6 x i64> @test_v6i64(<6 x i64> %x, <6 x i64> %y, <6 x i64> %z) { -; AVX-LABEL: test_v6i64: -; AVX: # %bb.0: -; AVX-NEXT: movq %rdi, %rax -; AVX-NEXT: vmovq %r8, %xmm0 -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vmovq %rsi, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm1 -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm1 -; AVX-NEXT: vmovq %r9, %xmm0 -; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpmuldq %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpaddq {{[0-9]+}}(%rsp), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 32(%rdi) -; AVX-NEXT: vmovdqa %ymm1, (%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_v6i64: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: movq %rdi, %rax +; AVXIFMA-NEXT: vmovq %r8, %xmm0 +; AVXIFMA-NEXT: vmovq %rcx, %xmm1 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVXIFMA-NEXT: vmovq %rdx, %xmm1 +; AVXIFMA-NEXT: vmovq %rsi, %xmm2 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVXIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm1 +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm2 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm1 +; AVXIFMA-NEXT: vmovq %r9, %xmm0 +; AVXIFMA-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; AVXIFMA-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVXIFMA-NEXT: vpmuldq %xmm0, %xmm0, %xmm0 +; AVXIFMA-NEXT: vpaddq {{[0-9]+}}(%rsp), %xmm0, %xmm0 +; AVXIFMA-NEXT: vmovdqa %xmm0, 32(%rdi) +; AVXIFMA-NEXT: vmovdqa %ymm1, (%rdi) +; AVXIFMA-NEXT: vzeroupper +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_v6i64: ; AVX512: # %bb.0: @@ -493,6 +603,13 @@ define <6 x i64> @test_v6i64(<6 x i64> %x, <6 x i64> %y, <6 x i64> %z) { ; AVX512-NEXT: vpmuludq %zmm0, %zmm0, %zmm0 ; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_v6i64: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpmuludq %zmm0, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vpaddq %zmm2, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: retq %x_masked = and <6 x i64> %x, splat (i64 67108863) %y_masked = and <6 x i64> %x, splat (i64 67108863) %mul = mul <6 x i64> %x_masked, 
%y_masked @@ -501,43 +618,43 @@ define <6 x i64> @test_v6i64(<6 x i64> %x, <6 x i64> %y, <6 x i64> %z) { } define <9 x i64> @test_v9i64(<9 x i64> %x, <9 x i64> %y, <9 x i64> %z) { -; AVX-LABEL: test_v9i64: -; AVX: # %bb.0: -; AVX-NEXT: movq %rdi, %rax -; AVX-NEXT: vmovq %r8, %xmm0 -; AVX-NEXT: vmovq %rcx, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovq %rdx, %xmm1 -; AVX-NEXT: vmovq %rsi, %xmm2 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX-NEXT: vmovq %r9, %xmm1 -; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 -; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero -; AVX-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm3 -; AVX-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm4 -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm5 = [67108863,67108863,67108863,67108863] -; AVX-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF -; AVX-NEXT: vmovq %rcx, %xmm5 -; AVX-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero -; AVX-NEXT: vpand %xmm5, %xmm6, %xmm5 -; AVX-NEXT: vpsrlq $32, %xmm5, %xmm6 -; AVX-NEXT: vpmuludq %xmm6, %xmm5, %xmm6 -; AVX-NEXT: vpsllq $33, %xmm6, %xmm6 -; AVX-NEXT: vpmuludq %xmm5, %xmm5, %xmm5 -; AVX-NEXT: vpaddq %xmm2, %xmm5, %xmm2 -; AVX-NEXT: vpaddq %xmm6, %xmm2, %xmm2 -; AVX-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm4 -; AVX-NEXT: {vex} vpmadd52luq %ymm1, %ymm1, %ymm3 -; AVX-NEXT: vmovdqa %ymm3, 32(%rdi) -; AVX-NEXT: vmovdqa %ymm4, (%rdi) -; AVX-NEXT: vmovq %xmm2, 64(%rdi) -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVXIFMA-LABEL: test_v9i64: +; AVXIFMA: # %bb.0: +; AVXIFMA-NEXT: movq %rdi, %rax +; AVXIFMA-NEXT: vmovq %r8, %xmm0 +; AVXIFMA-NEXT: vmovq %rcx, %xmm1 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVXIFMA-NEXT: vmovq %rdx, %xmm1 +; AVXIFMA-NEXT: vmovq %rsi, %xmm2 +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVXIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVXIFMA-NEXT: vmovq %r9, %xmm1 +; AVXIFMA-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVXIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVXIFMA-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVXIFMA-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm3 +; AVXIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm4 +; AVXIFMA-NEXT: vpbroadcastq {{.*#+}} ymm5 = [67108863,67108863,67108863,67108863] +; AVXIFMA-NEXT: vpand %ymm5, %ymm0, %ymm0 +; AVXIFMA-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVXIFMA-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF +; AVXIFMA-NEXT: vmovq %rcx, %xmm5 +; AVXIFMA-NEXT: vmovq {{.*#+}} xmm6 = mem[0],zero +; AVXIFMA-NEXT: vpand %xmm5, %xmm6, %xmm5 +; AVXIFMA-NEXT: vpsrlq $32, %xmm5, %xmm6 +; AVXIFMA-NEXT: vpmuludq %xmm6, %xmm5, %xmm6 +; AVXIFMA-NEXT: vpsllq $33, %xmm6, %xmm6 +; AVXIFMA-NEXT: vpmuludq %xmm5, %xmm5, %xmm5 +; AVXIFMA-NEXT: vpaddq %xmm2, %xmm5, %xmm2 +; AVXIFMA-NEXT: vpaddq %xmm6, %xmm2, %xmm2 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm4 +; AVXIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm1, %ymm3 +; AVXIFMA-NEXT: vmovdqa %ymm3, 32(%rdi) +; AVXIFMA-NEXT: vmovdqa %ymm4, (%rdi) +; AVXIFMA-NEXT: vmovq %xmm2, 64(%rdi) +; AVXIFMA-NEXT: vzeroupper +; AVXIFMA-NEXT: retq ; ; AVX512-LABEL: test_v9i64: ; AVX512: # %bb.0: @@ -572,6 +689,44 @@ define <9 x i64> @test_v9i64(<9 x i64> %x, <9 x i64> %y, <9 x i64> %z) { ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdi) ; AVX512-NEXT: vzeroupper ; 
AVX512-NEXT: retq +; +; AVX512-NOIFMA-LABEL: test_v9i64: +; AVX512-NOIFMA: # %bb.0: +; AVX512-NOIFMA-NEXT: movq %rdi, %rax +; AVX512-NOIFMA-NEXT: vmovq %r8, %xmm0 +; AVX512-NOIFMA-NEXT: vmovq %rcx, %xmm1 +; AVX512-NOIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512-NOIFMA-NEXT: vmovq %rdx, %xmm1 +; AVX512-NOIFMA-NEXT: vmovq %rsi, %xmm2 +; AVX512-NOIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512-NOIFMA-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NOIFMA-NEXT: vmovq %r9, %xmm1 +; AVX512-NOIFMA-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX512-NOIFMA-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512-NOIFMA-NEXT: vinserti128 $1, {{[0-9]+}}(%rsp), %ymm1, %ymm1 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX512-NOIFMA-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NOIFMA-NEXT: movl $67108863, %ecx # imm = 0x3FFFFFF +; AVX512-NOIFMA-NEXT: vmovq %rcx, %xmm2 +; AVX512-NOIFMA-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero +; AVX512-NOIFMA-NEXT: vpand %xmm2, %xmm3, %xmm2 +; AVX512-NOIFMA-NEXT: vpsrlq $32, %xmm2, %xmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %xmm3, %xmm2, %xmm3 +; AVX512-NOIFMA-NEXT: vpsllq $33, %xmm3, %xmm3 +; AVX512-NOIFMA-NEXT: vpmuludq %xmm2, %xmm2, %xmm2 +; AVX512-NOIFMA-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX512-NOIFMA-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512-NOIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm3 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm2, %ymm3 +; AVX512-NOIFMA-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm2 +; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm0, %ymm0, %ymm2 +; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 +; AVX512-NOIFMA-NEXT: vmovq %xmm1, 64(%rdi) +; AVX512-NOIFMA-NEXT: vmovdqa64 %zmm0, (%rdi) +; AVX512-NOIFMA-NEXT: vzeroupper +; AVX512-NOIFMA-NEXT: retq %x_masked = and <9 x i64> %x, splat (i64 67108863) %y_masked = and <9 x i64> %x, splat (i64 67108863) %mul = mul <9 x i64> %x_masked, %y_masked diff --git a/llvm/test/CodeGen/X86/isel-llvm.set.rounding.ll b/llvm/test/CodeGen/X86/isel-llvm.set.rounding.ll index 688add1e92ab1..d271e97d8832a 100644 --- a/llvm/test/CodeGen/X86/isel-llvm.set.rounding.ll +++ b/llvm/test/CodeGen/X86/isel-llvm.set.rounding.ll @@ -1,16 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-- -mattr=-sse | FileCheck %s --check-prefixes=X86-NOSSE,SDAG-X86-NOSSE ; RUN: llc < %s -mtriple=i686-- -fast-isel -fast-isel-abort=1 -mattr=-sse | FileCheck %s --check-prefixes=X86-NOSSE,FASTISEL-X86-NOSSE -; RUN: llc < %s -mtriple=i686-- -global-isel -global-isel-abort=2 -mattr=-sse | FileCheck %s --check-prefixes=X86-NOSSE,GISEL-X86-NOSSE +; RUN: llc < %s -mtriple=i686-- -global-isel -global-isel-abort=1 -mattr=-sse | FileCheck %s --check-prefixes=GISEL-X86-NOSSE ; RUN: llc < %s -mtriple=x86_64-- -mattr=-sse | FileCheck %s --check-prefixes=X64-NOSSE,SDAG-X64-NOSSE ; RUN: llc < %s -mtriple=x86_64-- -fast-isel -fast-isel-abort=1 -mattr=-sse | FileCheck %s --check-prefixes=X64-NOSSE,FASTISEL-X64-NOSSE -; RUN: llc < %s -mtriple=x86_64-- -global-isel -global-isel-abort=2 -mattr=-sse | FileCheck %s --check-prefixes=X64-NOSSE,GISEL-X64-NOSSE +; RUN: llc < %s -mtriple=x86_64-- -global-isel -global-isel-abort=1 -mattr=-sse | FileCheck %s --check-prefixes=GISEL-X64-NOSSE ; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86,SDAG-X86 ; RUN: llc < %s -mtriple=i686-- -fast-isel 
-fast-isel-abort=1 | FileCheck %s --check-prefixes=X86,FASTISEL-X86 -; RUN: llc < %s -mtriple=i686-- -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X86,GISEL-X86 +; RUN: llc < %s -mtriple=i686-- -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X86 ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefixes=X64,SDAG-X64 ; RUN: llc < %s -mtriple=x86_64-- -fast-isel -fast-isel-abort=1 | FileCheck %s --check-prefixes=X64,FASTISEL-X64 -; RUN: llc < %s -mtriple=x86_64-- -global-isel -global-isel-abort=2 | FileCheck %s --check-prefixes=X64,GISEL-X64 +; RUN: llc < %s -mtriple=x86_64-- -global-isel -global-isel-abort=1 | FileCheck %s --check-prefixes=GISEL-X64 declare void @llvm.set.rounding(i32 %x) @@ -24,6 +24,18 @@ define void @func_01() nounwind { ; X86-NOSSE-NEXT: popl %eax ; X86-NOSSE-NEXT: retl ; +; GISEL-X86-NOSSE-LABEL: func_01: +; GISEL-X86-NOSSE: # %bb.0: +; GISEL-X86-NOSSE-NEXT: pushl %eax +; GISEL-X86-NOSSE-NEXT: fnstcw (%esp) +; GISEL-X86-NOSSE-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X86-NOSSE-NEXT: andw (%esp), %ax +; GISEL-X86-NOSSE-NEXT: orw $24576, %ax # imm = 0x6000 +; GISEL-X86-NOSSE-NEXT: movw %ax, (%esp) +; GISEL-X86-NOSSE-NEXT: fldcw (%esp) +; GISEL-X86-NOSSE-NEXT: popl %eax +; GISEL-X86-NOSSE-NEXT: retl +; ; X64-NOSSE-LABEL: func_01: ; X64-NOSSE: # %bb.0: ; X64-NOSSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) @@ -31,6 +43,16 @@ define void @func_01() nounwind { ; X64-NOSSE-NEXT: fldcw -{{[0-9]+}}(%rsp) ; X64-NOSSE-NEXT: retq ; +; GISEL-X64-NOSSE-LABEL: func_01: +; GISEL-X64-NOSSE: # %bb.0: +; GISEL-X64-NOSSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NOSSE-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X64-NOSSE-NEXT: andw -{{[0-9]+}}(%rsp), %ax +; GISEL-X64-NOSSE-NEXT: orw $24576, %ax # imm = 0x6000 +; GISEL-X64-NOSSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; GISEL-X64-NOSSE-NEXT: fldcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NOSSE-NEXT: retq +; ; X86-LABEL: func_01: ; X86: # %bb.0: ; X86-NEXT: pushl %eax @@ -40,6 +62,18 @@ define void @func_01() nounwind { ; X86-NEXT: popl %eax ; X86-NEXT: retl ; +; GISEL-X86-LABEL: func_01: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: pushl %eax +; GISEL-X86-NEXT: fnstcw (%esp) +; GISEL-X86-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X86-NEXT: andw (%esp), %ax +; GISEL-X86-NEXT: orw $24576, %ax # imm = 0x6000 +; GISEL-X86-NEXT: movw %ax, (%esp) +; GISEL-X86-NEXT: fldcw (%esp) +; GISEL-X86-NEXT: popl %eax +; GISEL-X86-NEXT: retl +; ; X64-LABEL: func_01: ; X64: # %bb.0: ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) @@ -49,6 +83,22 @@ define void @func_01() nounwind { ; X64-NEXT: orb $96, -{{[0-9]+}}(%rsp) ; X64-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) ; X64-NEXT: retq +; +; GISEL-X64-LABEL: func_01: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X64-NEXT: andw -{{[0-9]+}}(%rsp), %ax +; GISEL-X64-NEXT: orw $24576, %ax # imm = 0x6000 +; GISEL-X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: stmxcsr -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: movl $-24577, %eax # imm = 0x9FFF +; GISEL-X64-NEXT: andl -{{[0-9]+}}(%rsp), %eax +; GISEL-X64-NEXT: orl $24576, %eax # imm = 0x6000 +; GISEL-X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: retq call void @llvm.set.rounding(i32 0) ; TowardZero (CW[11-10] = 11) ret void } @@ -63,6 +113,18 @@ define void @func_02() nounwind { ; X86-NOSSE-NEXT: popl %eax ; X86-NOSSE-NEXT: retl ; +; GISEL-X86-NOSSE-LABEL: func_02: +; 
GISEL-X86-NOSSE: # %bb.0: +; GISEL-X86-NOSSE-NEXT: pushl %eax +; GISEL-X86-NOSSE-NEXT: fnstcw (%esp) +; GISEL-X86-NOSSE-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X86-NOSSE-NEXT: andw (%esp), %ax +; GISEL-X86-NOSSE-NEXT: orw $0, %ax +; GISEL-X86-NOSSE-NEXT: movw %ax, (%esp) +; GISEL-X86-NOSSE-NEXT: fldcw (%esp) +; GISEL-X86-NOSSE-NEXT: popl %eax +; GISEL-X86-NOSSE-NEXT: retl +; ; X64-NOSSE-LABEL: func_02: ; X64-NOSSE: # %bb.0: ; X64-NOSSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) @@ -70,6 +132,16 @@ define void @func_02() nounwind { ; X64-NOSSE-NEXT: fldcw -{{[0-9]+}}(%rsp) ; X64-NOSSE-NEXT: retq ; +; GISEL-X64-NOSSE-LABEL: func_02: +; GISEL-X64-NOSSE: # %bb.0: +; GISEL-X64-NOSSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NOSSE-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X64-NOSSE-NEXT: andw -{{[0-9]+}}(%rsp), %ax +; GISEL-X64-NOSSE-NEXT: orw $0, %ax +; GISEL-X64-NOSSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; GISEL-X64-NOSSE-NEXT: fldcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NOSSE-NEXT: retq +; ; X86-LABEL: func_02: ; X86: # %bb.0: ; X86-NEXT: pushl %eax @@ -79,6 +151,18 @@ define void @func_02() nounwind { ; X86-NEXT: popl %eax ; X86-NEXT: retl ; +; GISEL-X86-LABEL: func_02: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: pushl %eax +; GISEL-X86-NEXT: fnstcw (%esp) +; GISEL-X86-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X86-NEXT: andw (%esp), %ax +; GISEL-X86-NEXT: orw $0, %ax +; GISEL-X86-NEXT: movw %ax, (%esp) +; GISEL-X86-NEXT: fldcw (%esp) +; GISEL-X86-NEXT: popl %eax +; GISEL-X86-NEXT: retl +; ; X64-LABEL: func_02: ; X64: # %bb.0: ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) @@ -88,6 +172,22 @@ define void @func_02() nounwind { ; X64-NEXT: andb $-97, -{{[0-9]+}}(%rsp) ; X64-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) ; X64-NEXT: retq +; +; GISEL-X64-LABEL: func_02: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X64-NEXT: andw -{{[0-9]+}}(%rsp), %ax +; GISEL-X64-NEXT: orw $0, %ax +; GISEL-X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: stmxcsr -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: movl $-24577, %eax # imm = 0x9FFF +; GISEL-X64-NEXT: andl -{{[0-9]+}}(%rsp), %eax +; GISEL-X64-NEXT: orl $0, %eax +; GISEL-X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: retq call void @llvm.set.rounding(i32 1) ; ToNearestTiesToEven (CW[11-10] = 00) ret void } @@ -105,6 +205,18 @@ define void @func_03() nounwind { ; X86-NOSSE-NEXT: popl %eax ; X86-NOSSE-NEXT: retl ; +; GISEL-X86-NOSSE-LABEL: func_03: +; GISEL-X86-NOSSE: # %bb.0: +; GISEL-X86-NOSSE-NEXT: pushl %eax +; GISEL-X86-NOSSE-NEXT: fnstcw (%esp) +; GISEL-X86-NOSSE-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X86-NOSSE-NEXT: andw (%esp), %ax +; GISEL-X86-NOSSE-NEXT: orw $16384, %ax # imm = 0x4000 +; GISEL-X86-NOSSE-NEXT: movw %ax, (%esp) +; GISEL-X86-NOSSE-NEXT: fldcw (%esp) +; GISEL-X86-NOSSE-NEXT: popl %eax +; GISEL-X86-NOSSE-NEXT: retl +; ; X64-NOSSE-LABEL: func_03: ; X64-NOSSE: # %bb.0: ; X64-NOSSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) @@ -115,6 +227,16 @@ define void @func_03() nounwind { ; X64-NOSSE-NEXT: fldcw -{{[0-9]+}}(%rsp) ; X64-NOSSE-NEXT: retq ; +; GISEL-X64-NOSSE-LABEL: func_03: +; GISEL-X64-NOSSE: # %bb.0: +; GISEL-X64-NOSSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NOSSE-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X64-NOSSE-NEXT: andw -{{[0-9]+}}(%rsp), %ax +; GISEL-X64-NOSSE-NEXT: orw $16384, %ax # imm = 0x4000 +; GISEL-X64-NOSSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; GISEL-X64-NOSSE-NEXT: 
fldcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NOSSE-NEXT: retq +; ; X86-LABEL: func_03: ; X86: # %bb.0: ; X86-NEXT: pushl %eax @@ -127,6 +249,18 @@ define void @func_03() nounwind { ; X86-NEXT: popl %eax ; X86-NEXT: retl ; +; GISEL-X86-LABEL: func_03: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: pushl %eax +; GISEL-X86-NEXT: fnstcw (%esp) +; GISEL-X86-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X86-NEXT: andw (%esp), %ax +; GISEL-X86-NEXT: orw $16384, %ax # imm = 0x4000 +; GISEL-X86-NEXT: movw %ax, (%esp) +; GISEL-X86-NEXT: fldcw (%esp) +; GISEL-X86-NEXT: popl %eax +; GISEL-X86-NEXT: retl +; ; X64-LABEL: func_03: ; X64: # %bb.0: ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) @@ -142,6 +276,22 @@ define void @func_03() nounwind { ; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ; X64-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) ; X64-NEXT: retq +; +; GISEL-X64-LABEL: func_03: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X64-NEXT: andw -{{[0-9]+}}(%rsp), %ax +; GISEL-X64-NEXT: orw $16384, %ax # imm = 0x4000 +; GISEL-X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: stmxcsr -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: movl $-24577, %eax # imm = 0x9FFF +; GISEL-X64-NEXT: andl -{{[0-9]+}}(%rsp), %eax +; GISEL-X64-NEXT: orl $16384, %eax # imm = 0x4000 +; GISEL-X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: retq call void @llvm.set.rounding(i32 2) ; Upward (CW[11-10] = 10) ret void } @@ -159,6 +309,18 @@ define void @func_04() nounwind { ; X86-NOSSE-NEXT: popl %eax ; X86-NOSSE-NEXT: retl ; +; GISEL-X86-NOSSE-LABEL: func_04: +; GISEL-X86-NOSSE: # %bb.0: +; GISEL-X86-NOSSE-NEXT: pushl %eax +; GISEL-X86-NOSSE-NEXT: fnstcw (%esp) +; GISEL-X86-NOSSE-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X86-NOSSE-NEXT: andw (%esp), %ax +; GISEL-X86-NOSSE-NEXT: orw $8192, %ax # imm = 0x2000 +; GISEL-X86-NOSSE-NEXT: movw %ax, (%esp) +; GISEL-X86-NOSSE-NEXT: fldcw (%esp) +; GISEL-X86-NOSSE-NEXT: popl %eax +; GISEL-X86-NOSSE-NEXT: retl +; ; X64-NOSSE-LABEL: func_04: ; X64-NOSSE: # %bb.0: ; X64-NOSSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) @@ -169,6 +331,16 @@ define void @func_04() nounwind { ; X64-NOSSE-NEXT: fldcw -{{[0-9]+}}(%rsp) ; X64-NOSSE-NEXT: retq ; +; GISEL-X64-NOSSE-LABEL: func_04: +; GISEL-X64-NOSSE: # %bb.0: +; GISEL-X64-NOSSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NOSSE-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X64-NOSSE-NEXT: andw -{{[0-9]+}}(%rsp), %ax +; GISEL-X64-NOSSE-NEXT: orw $8192, %ax # imm = 0x2000 +; GISEL-X64-NOSSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; GISEL-X64-NOSSE-NEXT: fldcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NOSSE-NEXT: retq +; ; X86-LABEL: func_04: ; X86: # %bb.0: ; X86-NEXT: pushl %eax @@ -181,6 +353,18 @@ define void @func_04() nounwind { ; X86-NEXT: popl %eax ; X86-NEXT: retl ; +; GISEL-X86-LABEL: func_04: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: pushl %eax +; GISEL-X86-NEXT: fnstcw (%esp) +; GISEL-X86-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X86-NEXT: andw (%esp), %ax +; GISEL-X86-NEXT: orw $8192, %ax # imm = 0x2000 +; GISEL-X86-NEXT: movw %ax, (%esp) +; GISEL-X86-NEXT: fldcw (%esp) +; GISEL-X86-NEXT: popl %eax +; GISEL-X86-NEXT: retl +; ; X64-LABEL: func_04: ; X64: # %bb.0: ; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) @@ -196,6 +380,22 @@ define void @func_04() nounwind { ; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ; X64-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) ; X64-NEXT: retq +; +; GISEL-X64-LABEL: func_04: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: 
fnstcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X64-NEXT: andw -{{[0-9]+}}(%rsp), %ax +; GISEL-X64-NEXT: orw $8192, %ax # imm = 0x2000 +; GISEL-X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: stmxcsr -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: movl $-24577, %eax # imm = 0x9FFF +; GISEL-X64-NEXT: andl -{{[0-9]+}}(%rsp), %eax +; GISEL-X64-NEXT: orl $8192, %eax # imm = 0x2000 +; GISEL-X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: retq call void @llvm.set.rounding(i32 3) ; Downward (CW[11-10] = 01) ret void } @@ -219,6 +419,25 @@ define void @func_05(i32 %x) nounwind { ; X86-NOSSE-NEXT: popl %eax ; X86-NOSSE-NEXT: retl ; +; GISEL-X86-NOSSE-LABEL: func_05: +; GISEL-X86-NOSSE: # %bb.0: +; GISEL-X86-NOSSE-NEXT: pushl %eax +; GISEL-X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NOSSE-NEXT: fnstcw (%esp) +; GISEL-X86-NOSSE-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X86-NOSSE-NEXT: andw (%esp), %ax +; GISEL-X86-NOSSE-NEXT: addl %ecx, %ecx +; GISEL-X86-NOSSE-NEXT: addl $4, %ecx +; GISEL-X86-NOSSE-NEXT: movw $201, %dx +; GISEL-X86-NOSSE-NEXT: # kill: def $cl killed $cl killed $ecx +; GISEL-X86-NOSSE-NEXT: shlw %cl, %dx +; GISEL-X86-NOSSE-NEXT: andw $3072, %dx # imm = 0xC00 +; GISEL-X86-NOSSE-NEXT: orw %ax, %dx +; GISEL-X86-NOSSE-NEXT: movw %dx, (%esp) +; GISEL-X86-NOSSE-NEXT: fldcw (%esp) +; GISEL-X86-NOSSE-NEXT: popl %eax +; GISEL-X86-NOSSE-NEXT: retl +; ; X64-NOSSE-LABEL: func_05: ; X64-NOSSE: # %bb.0: ; X64-NOSSE-NEXT: # kill: def $edi killed $edi def $rdi @@ -235,6 +454,23 @@ define void @func_05(i32 %x) nounwind { ; X64-NOSSE-NEXT: fldcw -{{[0-9]+}}(%rsp) ; X64-NOSSE-NEXT: retq ; +; GISEL-X64-NOSSE-LABEL: func_05: +; GISEL-X64-NOSSE: # %bb.0: +; GISEL-X64-NOSSE-NEXT: # kill: def $edi killed $edi def $rdi +; GISEL-X64-NOSSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NOSSE-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X64-NOSSE-NEXT: andw -{{[0-9]+}}(%rsp), %ax +; GISEL-X64-NOSSE-NEXT: leal (%rdi,%rdi), %ecx +; GISEL-X64-NOSSE-NEXT: addl $4, %ecx +; GISEL-X64-NOSSE-NEXT: movw $201, %dx +; GISEL-X64-NOSSE-NEXT: # kill: def $cl killed $cl killed $ecx +; GISEL-X64-NOSSE-NEXT: shlw %cl, %dx +; GISEL-X64-NOSSE-NEXT: andw $3072, %dx # imm = 0xC00 +; GISEL-X64-NOSSE-NEXT: orw %ax, %dx +; GISEL-X64-NOSSE-NEXT: movw %dx, -{{[0-9]+}}(%rsp) +; GISEL-X64-NOSSE-NEXT: fldcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NOSSE-NEXT: retq +; ; X86-LABEL: func_05: ; X86: # %bb.0: ; X86-NEXT: pushl %eax @@ -253,6 +489,25 @@ define void @func_05(i32 %x) nounwind { ; X86-NEXT: popl %eax ; X86-NEXT: retl ; +; GISEL-X86-LABEL: func_05: +; GISEL-X86: # %bb.0: +; GISEL-X86-NEXT: pushl %eax +; GISEL-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; GISEL-X86-NEXT: fnstcw (%esp) +; GISEL-X86-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X86-NEXT: andw (%esp), %ax +; GISEL-X86-NEXT: addl %ecx, %ecx +; GISEL-X86-NEXT: addl $4, %ecx +; GISEL-X86-NEXT: movw $201, %dx +; GISEL-X86-NEXT: # kill: def $cl killed $cl killed $ecx +; GISEL-X86-NEXT: shlw %cl, %dx +; GISEL-X86-NEXT: andw $3072, %dx # imm = 0xC00 +; GISEL-X86-NEXT: orw %ax, %dx +; GISEL-X86-NEXT: movw %dx, (%esp) +; GISEL-X86-NEXT: fldcw (%esp) +; GISEL-X86-NEXT: popl %eax +; GISEL-X86-NEXT: retl +; ; X64-LABEL: func_05: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi @@ -274,6 +529,31 @@ define void @func_05(i32 %x) nounwind { ; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ; X64-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) ; X64-NEXT: retq 
+; +; GISEL-X64-LABEL: func_05: +; GISEL-X64: # %bb.0: +; GISEL-X64-NEXT: # kill: def $edi killed $edi def $rdi +; GISEL-X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: movw $-3073, %ax # imm = 0xF3FF +; GISEL-X64-NEXT: andw -{{[0-9]+}}(%rsp), %ax +; GISEL-X64-NEXT: leal (%rdi,%rdi), %ecx +; GISEL-X64-NEXT: addl $4, %ecx +; GISEL-X64-NEXT: movw $201, %dx +; GISEL-X64-NEXT: # kill: def $cl killed $cl killed $ecx +; GISEL-X64-NEXT: shlw %cl, %dx +; GISEL-X64-NEXT: andw $3072, %dx # imm = 0xC00 +; GISEL-X64-NEXT: movzwl %dx, %ecx +; GISEL-X64-NEXT: leal (,%rcx,8), %edx +; GISEL-X64-NEXT: orw %ax, %cx +; GISEL-X64-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: stmxcsr -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: movl $-24577, %eax # imm = 0x9FFF +; GISEL-X64-NEXT: andl -{{[0-9]+}}(%rsp), %eax +; GISEL-X64-NEXT: orl %edx, %eax +; GISEL-X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: ldmxcsr -{{[0-9]+}}(%rsp) +; GISEL-X64-NEXT: retq call void @llvm.set.rounding(i32 %x) ; Downward ret void } @@ -284,10 +564,6 @@ attributes #0 = { nounwind "use-soft-float"="true" } ; FASTISEL-X64-NOSSE: {{.*}} ; FASTISEL-X86: {{.*}} ; FASTISEL-X86-NOSSE: {{.*}} -; GISEL-X64: {{.*}} -; GISEL-X64-NOSSE: {{.*}} -; GISEL-X86: {{.*}} -; GISEL-X86-NOSSE: {{.*}} ; SDAG-X64: {{.*}} ; SDAG-X64-NOSSE: {{.*}} ; SDAG-X86: {{.*}} diff --git a/llvm/test/CodeGen/X86/isel-set-invalid-rounding.ll b/llvm/test/CodeGen/X86/isel-set-invalid-rounding.ll new file mode 100644 index 0000000000000..9fed9945532a0 --- /dev/null +++ b/llvm/test/CodeGen/X86/isel-set-invalid-rounding.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: not llc < %s -mtriple=i686-- -fast-isel -filetype=null 2>&1 | FileCheck %s --check-prefixes=ERROR +; RUN: not llc < %s -mtriple=i686-- -global-isel=0 -fast-isel=0 -filetype=null 2>&1 | FileCheck %s --check-prefixes=ERROR +; RUN: not llc < %s -mtriple=i686-- -global-isel -global-isel-abort=1 -filetype=null 2>&1 | FileCheck %s --check-prefixes=ERROR +; RUN: not llc < %s -mtriple=x86_64-- -fast-isel -filetype=null 2>&1 | FileCheck %s --check-prefixes=ERROR +; RUN: not llc < %s -mtriple=x86_64-- -global-isel=0 -fast-isel=0 -filetype=null 2>&1 | FileCheck %s --check-prefixes=ERROR +; RUN: not llc < %s -mtriple=x86_64-- -global-isel -global-isel-abort=1 -filetype=null 2>&1 | FileCheck %s --check-prefixes=ERROR + +; ERROR: error: isel-set-invalid-rounding:3:3: in function foo void (): rounding mode is not supported by X86 hardware + +define void @foo() !dbg !9 { +entry: + tail call void @llvm.set.rounding(i32 99), !dbg !12 + ret void, !dbg !13 +} + +declare void @llvm.set.rounding(i32) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7} +!llvm.ident = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "isel-set-invalid-rounding", directory: "/tmp") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{!"clang"} +!9 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 2, type: !10, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, keyInstructions: true) 
+!10 = !DISubroutineType(types: !11) +!11 = !{null} +!12 = !DILocation(line: 3, column: 3, scope: !9) +!13 = !DILocation(line: 4, column: 1, scope: !9, atomGroup: 1, atomRank: 1) diff --git a/llvm/test/CodeGen/X86/known-signbits-shl.ll b/llvm/test/CodeGen/X86/known-signbits-shl.ll index 473fecc307ed4..57d557dec11b9 100644 --- a/llvm/test/CodeGen/X86/known-signbits-shl.ll +++ b/llvm/test/CodeGen/X86/known-signbits-shl.ll @@ -137,7 +137,7 @@ define void @computeNumSignBits_shl_zext_vec_3(<2 x i8> %x, ptr %p) nounwind { ; X64-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-NEXT: por %xmm2, %xmm1 ; X64-NEXT: movdqa %xmm0, %xmm2 -; X64-NEXT: paddw %xmm0, %xmm2 +; X64-NEXT: paddw %xmm2, %xmm2 ; X64-NEXT: movdqa %xmm2, %xmm3 ; X64-NEXT: psraw $1, %xmm3 ; X64-NEXT: pcmpeqw %xmm0, %xmm3 diff --git a/llvm/test/CodeGen/X86/lea-16bit.ll b/llvm/test/CodeGen/X86/lea-16bit.ll index cec29ab1da6ab..40da01d9ab8f3 100644 --- a/llvm/test/CodeGen/X86/lea-16bit.ll +++ b/llvm/test/CodeGen/X86/lea-16bit.ll @@ -13,7 +13,8 @@ define i16 @lea16bit(i16 %in) { ; NDD-LABEL: lea16bit: ; NDD: # %bb.0: ; NDD-NEXT: # kill: def $edi killed $edi def $rdi -; NDD-NEXT: leaw 1(%rdi,%rdi), %ax +; NDD-NEXT: leal 1(%rdi,%rdi), %eax +; NDD-NEXT: # kill: def $ax killed $ax killed $eax ; NDD-NEXT: retq %shl = shl i16 %in, 1 %or = or i16 %shl, 1 diff --git a/llvm/test/CodeGen/X86/lea-8bit.ll b/llvm/test/CodeGen/X86/lea-8bit.ll index 98222dfc0407c..fc295f75e23c7 100644 --- a/llvm/test/CodeGen/X86/lea-8bit.ll +++ b/llvm/test/CodeGen/X86/lea-8bit.ll @@ -14,7 +14,8 @@ define i8 @lea8bit(i8 %in) { ; NDD-LABEL: lea8bit: ; NDD: # %bb.0: ; NDD-NEXT: # kill: def $edi killed $edi def $rdi -; NDD-NEXT: leab 1(%rdi,%rdi), %al +; NDD-NEXT: leal 1(%rdi,%rdi), %eax +; NDD-NEXT: # kill: def $al killed $al killed $eax ; NDD-NEXT: retq %shl = shl i8 %in, 1 %or = or i8 %shl, 1 diff --git a/llvm/test/CodeGen/X86/llvm.frexp.ll b/llvm/test/CodeGen/X86/llvm.frexp.ll index 83840dd85c533..e3a1b1b83b2e3 100644 --- a/llvm/test/CodeGen/X86/llvm.frexp.ll +++ b/llvm/test/CodeGen/X86/llvm.frexp.ll @@ -582,6 +582,22 @@ define i32 @test_frexp_f64_i32_only_use_exp(double %a) nounwind { ret i32 %result.0 } +define { float, i32 } @pr160981() { +; X64-LABEL: pr160981: +; X64: # %bb.0: +; X64-NEXT: movss {{.*#+}} xmm0 = [9.9999988E-1,0.0E+0,0.0E+0,0.0E+0] +; X64-NEXT: movl $-126, %eax +; X64-NEXT: retq +; +; WIN32-LABEL: pr160981: +; WIN32: # %bb.0: +; WIN32-NEXT: flds __real@3f7ffffe +; WIN32-NEXT: movl $-126, %eax +; WIN32-NEXT: retl + %ret = call { float, i32 } @llvm.frexp.f32.i32(float bitcast (i32 8388607 to float)) + ret { float, i32 } %ret +} + ; FIXME: Widen vector result ; define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) nounwind { ; %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a) diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 4e6f666fa05de..4cde581c10508 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -4806,9 +4806,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-KNL-NEXT: vgatherdps 
(%rdi,%zmm0,8), %zmm1 {%k1} ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X64-KNL-NEXT: retq @@ -4830,9 +4829,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 ; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-SMALL-NEXT: retq @@ -4842,10 +4840,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index(ptr %x, ptr %arr, <16 ; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax ; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-LARGE-NEXT: retq @@ -4875,9 +4872,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-KNL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1} ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X64-KNL-NEXT: retq @@ -4899,9 +4895,8 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a ; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-SMALL-NEXT: retq @@ -4911,10 +4906,9 @@ define <16 x float> @test_gather_structpt_16f32_mask_index_offset(ptr %x, ptr %a ; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax ; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vgatherdps 4(%rdi,%zmm0,8), %zmm1 {%k1} ; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-LARGE-NEXT: retq @@ -4944,9 +4938,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair( ; X64-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; X64-KNL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-KNL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-KNL-NEXT: vpaddd %zmm0, %zmm0, %zmm2 +; X64-KNL-NEXT: vpslld $1, (%rsi), %zmm0 +; X64-KNL-NEXT: vpandd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2 ; X64-KNL-NEXT: kmovw %k1, %k2 ; X64-KNL-NEXT: vmovaps %zmm1, %zmm0 ; X64-KNL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2} @@ -4972,9 +4965,8 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair( ; X64-SKX-SMALL-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-SMALL-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-SMALL-NEXT: vmovdqu64 (%rsi), %zmm0 -; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 -; X64-SKX-SMALL-NEXT: vpaddd %zmm0, %zmm0, %zmm2 +; X64-SKX-SMALL-NEXT: vpslld $1, (%rsi), %zmm0 +; X64-SKX-SMALL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm2 ; X64-SKX-SMALL-NEXT: kmovw %k1, %k2 ; X64-SKX-SMALL-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-SMALL-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2} @@ -4986,10 +4978,9 @@ define {<16 x float>, <16 x float>} @test_gather_structpt_16f32_mask_index_pair( ; X64-SKX-LARGE-NEXT: vpmovsxbd %xmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpslld $31, %zmm0, %zmm0 ; X64-SKX-LARGE-NEXT: vpmovd2m %zmm0, %k1 -; X64-SKX-LARGE-NEXT: vmovdqu64 (%rsi), %zmm0 +; X64-SKX-LARGE-NEXT: vpslld $1, (%rsi), %zmm0 ; X64-SKX-LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax -; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm0 -; X64-SKX-LARGE-NEXT: vpaddd %zmm0, %zmm0, %zmm2 +; X64-SKX-LARGE-NEXT: vpandd (%rax){1to16}, %zmm0, %zmm2 ; X64-SKX-LARGE-NEXT: kmovw %k1, %k2 ; X64-SKX-LARGE-NEXT: vmovaps %zmm1, %zmm0 ; X64-SKX-LARGE-NEXT: vgatherdps (%rdi,%zmm2,8), %zmm0 {%k2} diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index 2f0d419132492..ecf4fbb603a8f 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 | FileCheck %s --check-prefix=SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512FVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { @@ -178,13 +178,13 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: truncstore_v8i64_v8i32: -; AVX512F: # 
%bb.0: -; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vpmovqd %zmm0, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512-LABEL: truncstore_v8i64_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v8i64_v8i32: ; AVX512VL: # %bb.0: @@ -192,14 +192,6 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: truncstore_v8i64_v8i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpmovqd %zmm0, (%rdi) {%k1} -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer %b = trunc <8 x i64> %x to <8 x i32> call void @llvm.masked.store.v8i32.p0(<8 x i32> %b, ptr %p, i32 1, <8 x i1> %a) @@ -573,6 +565,70 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v8i64_v8i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512FVL-NEXT: vpmovqw %zmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB1_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB1_3 +; AVX512FVL-NEXT: .LBB1_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB1_5 +; AVX512FVL-NEXT: .LBB1_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB1_7 +; AVX512FVL-NEXT: .LBB1_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB1_9 +; AVX512FVL-NEXT: .LBB1_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB1_11 +; AVX512FVL-NEXT: .LBB1_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB1_13 +; AVX512FVL-NEXT: .LBB1_14: # %else12 +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: jne .LBB1_15 +; AVX512FVL-NEXT: .LBB1_16: # %else14 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB1_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB1_4 +; AVX512FVL-NEXT: .LBB1_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB1_6 +; AVX512FVL-NEXT: .LBB1_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB1_8 +; AVX512FVL-NEXT: .LBB1_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB1_10 +; AVX512FVL-NEXT: .LBB1_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrw $4, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB1_12 +; AVX512FVL-NEXT: .LBB1_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrw $5, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB1_14 +; AVX512FVL-NEXT: .LBB1_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrw $6, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: je .LBB1_16 +; AVX512FVL-NEXT: .LBB1_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: 
vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v8i64_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 @@ -960,6 +1016,70 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v8i64_v8i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512FVL-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB2_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB2_3 +; AVX512FVL-NEXT: .LBB2_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB2_5 +; AVX512FVL-NEXT: .LBB2_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB2_7 +; AVX512FVL-NEXT: .LBB2_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB2_9 +; AVX512FVL-NEXT: .LBB2_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB2_11 +; AVX512FVL-NEXT: .LBB2_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB2_13 +; AVX512FVL-NEXT: .LBB2_14: # %else12 +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: jne .LBB2_15 +; AVX512FVL-NEXT: .LBB2_16: # %else14 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB2_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB2_4 +; AVX512FVL-NEXT: .LBB2_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB2_6 +; AVX512FVL-NEXT: .LBB2_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB2_8 +; AVX512FVL-NEXT: .LBB2_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB2_10 +; AVX512FVL-NEXT: .LBB2_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB2_12 +; AVX512FVL-NEXT: .LBB2_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB2_14 +; AVX512FVL-NEXT: .LBB2_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: je .LBB2_16 +; AVX512FVL-NEXT: .LBB2_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v8i64_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 @@ -1080,17 +1200,17 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: truncstore_v4i64_v4i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftlw $12, %k0, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512-LABEL: truncstore_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512-NEXT: kshiftlw $12, %k0, %k0 
+; AVX512-NEXT: kshiftrw $12, %k0, %k1 +; AVX512-NEXT: vpmovqd %zmm0, %ymm0 +; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v4i64_v4i32: ; AVX512VL: # %bb.0: @@ -1098,18 +1218,6 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: truncstore_v4i64_v4i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: kshiftlw $12, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k1 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = trunc <4 x i64> %x to <4 x i32> call void @llvm.masked.store.v4i32.p0(<4 x i32> %b, ptr %p, i32 1, <4 x i1> %a) @@ -1321,6 +1429,42 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v4i64_v4i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovqw %ymm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB4_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB4_3 +; AVX512FVL-NEXT: .LBB4_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB4_5 +; AVX512FVL-NEXT: .LBB4_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB4_7 +; AVX512FVL-NEXT: .LBB4_8: # %else6 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB4_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB4_4 +; AVX512FVL-NEXT: .LBB4_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB4_6 +; AVX512FVL-NEXT: .LBB4_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB4_8 +; AVX512FVL-NEXT: .LBB4_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v4i64_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -1552,6 +1696,42 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v4i64_v4i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovqb %ymm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB5_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB5_3 +; AVX512FVL-NEXT: .LBB5_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB5_5 +; AVX512FVL-NEXT: .LBB5_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB5_7 +; AVX512FVL-NEXT: .LBB5_8: # %else6 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB5_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB5_4 +; AVX512FVL-NEXT: .LBB5_3: # %cond.store1 +; 
AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB5_6 +; AVX512FVL-NEXT: .LBB5_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB5_8 +; AVX512FVL-NEXT: .LBB5_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v4i64_v4i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -1646,33 +1826,22 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) ; AVX2-NEXT: retq ; -; AVX512F-LABEL: truncstore_v2i64_v2i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftlw $14, %k0, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512-LABEL: truncstore_v2i64_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512-NEXT: kshiftlw $14, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k1 +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v2i64_v2i32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 ; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: truncstore_v2i64_v2i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = trunc <2 x i64> %x to <2 x i32> call void @llvm.masked.store.v2i32.p0(<2 x i32> %b, ptr %p, i32 1, <2 x i1> %a) @@ -1777,6 +1946,26 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v2i64_v2i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmq %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovqw %xmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB7_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB7_3 +; AVX512FVL-NEXT: .LBB7_4: # %else2 +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB7_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB7_4 +; AVX512FVL-NEXT: .LBB7_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v2i64_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -1896,6 +2085,26 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v2i64_v2i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmq %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovqb %xmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; 
AVX512FVL-NEXT: jne .LBB8_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB8_3 +; AVX512FVL-NEXT: .LBB8_4: # %else2 +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB8_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB8_4 +; AVX512FVL-NEXT: .LBB8_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v2i64_v2i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -2581,6 +2790,126 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v16i32_v16i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512FVL-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB9_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB9_3 +; AVX512FVL-NEXT: .LBB9_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB9_5 +; AVX512FVL-NEXT: .LBB9_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB9_7 +; AVX512FVL-NEXT: .LBB9_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB9_9 +; AVX512FVL-NEXT: .LBB9_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB9_11 +; AVX512FVL-NEXT: .LBB9_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB9_13 +; AVX512FVL-NEXT: .LBB9_14: # %else12 +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: jns .LBB9_16 +; AVX512FVL-NEXT: .LBB9_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: .LBB9_16: # %else14 +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512FVL-NEXT: jne .LBB9_17 +; AVX512FVL-NEXT: # %bb.18: # %else16 +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: jne .LBB9_19 +; AVX512FVL-NEXT: .LBB9_20: # %else18 +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: jne .LBB9_21 +; AVX512FVL-NEXT: .LBB9_22: # %else20 +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: jne .LBB9_23 +; AVX512FVL-NEXT: .LBB9_24: # %else22 +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: jne .LBB9_25 +; AVX512FVL-NEXT: .LBB9_26: # %else24 +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: jne .LBB9_27 +; AVX512FVL-NEXT: .LBB9_28: # %else26 +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: jne .LBB9_29 +; AVX512FVL-NEXT: .LBB9_30: # %else28 +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: jne .LBB9_31 +; AVX512FVL-NEXT: .LBB9_32: # %else30 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB9_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB9_4 +; AVX512FVL-NEXT: .LBB9_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB9_6 +; AVX512FVL-NEXT: .LBB9_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB9_8 +; AVX512FVL-NEXT: .LBB9_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB9_10 +; AVX512FVL-NEXT: .LBB9_9: # 
%cond.store7 +; AVX512FVL-NEXT: vpextrw $4, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB9_12 +; AVX512FVL-NEXT: .LBB9_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrw $5, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB9_14 +; AVX512FVL-NEXT: .LBB9_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrw $6, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: js .LBB9_15 +; AVX512FVL-NEXT: jmp .LBB9_16 +; AVX512FVL-NEXT: .LBB9_17: # %cond.store15 +; AVX512FVL-NEXT: vpextrw $0, %xmm0, 16(%rdi) +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: je .LBB9_20 +; AVX512FVL-NEXT: .LBB9_19: # %cond.store17 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 18(%rdi) +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: je .LBB9_22 +; AVX512FVL-NEXT: .LBB9_21: # %cond.store19 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 20(%rdi) +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: je .LBB9_24 +; AVX512FVL-NEXT: .LBB9_23: # %cond.store21 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 22(%rdi) +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: je .LBB9_26 +; AVX512FVL-NEXT: .LBB9_25: # %cond.store23 +; AVX512FVL-NEXT: vpextrw $4, %xmm0, 24(%rdi) +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: je .LBB9_28 +; AVX512FVL-NEXT: .LBB9_27: # %cond.store25 +; AVX512FVL-NEXT: vpextrw $5, %xmm0, 26(%rdi) +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: je .LBB9_30 +; AVX512FVL-NEXT: .LBB9_29: # %cond.store27 +; AVX512FVL-NEXT: vpextrw $6, %xmm0, 28(%rdi) +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: je .LBB9_32 +; AVX512FVL-NEXT: .LBB9_31: # %cond.store29 +; AVX512FVL-NEXT: vpextrw $7, %xmm0, 30(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v16i32_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 @@ -3247,6 +3576,126 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v16i32_v16i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512FVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB10_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB10_3 +; AVX512FVL-NEXT: .LBB10_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB10_5 +; AVX512FVL-NEXT: .LBB10_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB10_7 +; AVX512FVL-NEXT: .LBB10_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB10_9 +; AVX512FVL-NEXT: .LBB10_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB10_11 +; AVX512FVL-NEXT: .LBB10_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB10_13 +; AVX512FVL-NEXT: .LBB10_14: # %else12 +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: js .LBB10_15 +; AVX512FVL-NEXT: .LBB10_16: # %else14 +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: jne .LBB10_17 +; AVX512FVL-NEXT: .LBB10_18: # %else16 +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: jne .LBB10_19 +; AVX512FVL-NEXT: .LBB10_20: # %else18 +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: jne .LBB10_21 +; AVX512FVL-NEXT: .LBB10_22: # %else20 +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; 
AVX512FVL-NEXT: jne .LBB10_23 +; AVX512FVL-NEXT: .LBB10_24: # %else22 +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: jne .LBB10_25 +; AVX512FVL-NEXT: .LBB10_26: # %else24 +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: jne .LBB10_27 +; AVX512FVL-NEXT: .LBB10_28: # %else26 +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: jne .LBB10_29 +; AVX512FVL-NEXT: .LBB10_30: # %else28 +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: jne .LBB10_31 +; AVX512FVL-NEXT: .LBB10_32: # %else30 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB10_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB10_4 +; AVX512FVL-NEXT: .LBB10_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB10_6 +; AVX512FVL-NEXT: .LBB10_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB10_8 +; AVX512FVL-NEXT: .LBB10_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB10_10 +; AVX512FVL-NEXT: .LBB10_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB10_12 +; AVX512FVL-NEXT: .LBB10_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB10_14 +; AVX512FVL-NEXT: .LBB10_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: jns .LBB10_16 +; AVX512FVL-NEXT: .LBB10_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: je .LBB10_18 +; AVX512FVL-NEXT: .LBB10_17: # %cond.store15 +; AVX512FVL-NEXT: vpextrb $8, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: je .LBB10_20 +; AVX512FVL-NEXT: .LBB10_19: # %cond.store17 +; AVX512FVL-NEXT: vpextrb $9, %xmm0, 9(%rdi) +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: je .LBB10_22 +; AVX512FVL-NEXT: .LBB10_21: # %cond.store19 +; AVX512FVL-NEXT: vpextrb $10, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: je .LBB10_24 +; AVX512FVL-NEXT: .LBB10_23: # %cond.store21 +; AVX512FVL-NEXT: vpextrb $11, %xmm0, 11(%rdi) +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: je .LBB10_26 +; AVX512FVL-NEXT: .LBB10_25: # %cond.store23 +; AVX512FVL-NEXT: vpextrb $12, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: je .LBB10_28 +; AVX512FVL-NEXT: .LBB10_27: # %cond.store25 +; AVX512FVL-NEXT: vpextrb $13, %xmm0, 13(%rdi) +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: je .LBB10_30 +; AVX512FVL-NEXT: .LBB10_29: # %cond.store27 +; AVX512FVL-NEXT: vpextrb $14, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: je .LBB10_32 +; AVX512FVL-NEXT: .LBB10_31: # %cond.store29 +; AVX512FVL-NEXT: vpextrb $15, %xmm0, 15(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v16i32_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 @@ -3619,6 +4068,70 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; 
AVX512FVL-LABEL: truncstore_v8i32_v8i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512FVL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB11_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB11_3 +; AVX512FVL-NEXT: .LBB11_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB11_5 +; AVX512FVL-NEXT: .LBB11_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB11_7 +; AVX512FVL-NEXT: .LBB11_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB11_9 +; AVX512FVL-NEXT: .LBB11_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB11_11 +; AVX512FVL-NEXT: .LBB11_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB11_13 +; AVX512FVL-NEXT: .LBB11_14: # %else12 +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: jne .LBB11_15 +; AVX512FVL-NEXT: .LBB11_16: # %else14 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB11_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB11_4 +; AVX512FVL-NEXT: .LBB11_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB11_6 +; AVX512FVL-NEXT: .LBB11_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB11_8 +; AVX512FVL-NEXT: .LBB11_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB11_10 +; AVX512FVL-NEXT: .LBB11_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrw $4, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB11_12 +; AVX512FVL-NEXT: .LBB11_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrw $5, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB11_14 +; AVX512FVL-NEXT: .LBB11_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrw $6, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: je .LBB11_16 +; AVX512FVL-NEXT: .LBB11_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v8i32_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 @@ -3996,6 +4509,70 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v8i32_v8i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512FVL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB12_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB12_3 +; AVX512FVL-NEXT: .LBB12_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB12_5 +; AVX512FVL-NEXT: .LBB12_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB12_7 +; AVX512FVL-NEXT: .LBB12_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB12_9 +; AVX512FVL-NEXT: .LBB12_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB12_11 +; AVX512FVL-NEXT: .LBB12_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB12_13 +; AVX512FVL-NEXT: .LBB12_14: # %else12 +; AVX512FVL-NEXT: testb $-128, %al +; 
AVX512FVL-NEXT: jne .LBB12_15 +; AVX512FVL-NEXT: .LBB12_16: # %else14 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB12_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB12_4 +; AVX512FVL-NEXT: .LBB12_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB12_6 +; AVX512FVL-NEXT: .LBB12_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB12_8 +; AVX512FVL-NEXT: .LBB12_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB12_10 +; AVX512FVL-NEXT: .LBB12_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB12_12 +; AVX512FVL-NEXT: .LBB12_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB12_14 +; AVX512FVL-NEXT: .LBB12_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: je .LBB12_16 +; AVX512FVL-NEXT: .LBB12_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v8i32_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 @@ -4172,6 +4749,40 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v4i32_v4i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovdw %xmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB13_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB13_3 +; AVX512FVL-NEXT: .LBB13_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB13_5 +; AVX512FVL-NEXT: .LBB13_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB13_7 +; AVX512FVL-NEXT: .LBB13_8: # %else6 +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB13_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB13_4 +; AVX512FVL-NEXT: .LBB13_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB13_6 +; AVX512FVL-NEXT: .LBB13_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB13_8 +; AVX512FVL-NEXT: .LBB13_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v4i32_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -4346,6 +4957,40 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v4i32_v4i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB14_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB14_3 +; AVX512FVL-NEXT: .LBB14_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne 
.LBB14_5 +; AVX512FVL-NEXT: .LBB14_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB14_7 +; AVX512FVL-NEXT: .LBB14_8: # %else6 +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB14_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB14_4 +; AVX512FVL-NEXT: .LBB14_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB14_6 +; AVX512FVL-NEXT: .LBB14_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB14_8 +; AVX512FVL-NEXT: .LBB14_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v4i32_v4i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -5550,6 +6195,245 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v32i16_v32i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512FVL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 +; AVX512FVL-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512FVL-NEXT: vpmovdb %zmm2, %xmm2 +; AVX512FVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512FVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512FVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512FVL-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512FVL-NEXT: vpmovmskb %ymm1, %eax +; AVX512FVL-NEXT: notl %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB15_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB15_3 +; AVX512FVL-NEXT: .LBB15_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB15_5 +; AVX512FVL-NEXT: .LBB15_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB15_7 +; AVX512FVL-NEXT: .LBB15_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB15_9 +; AVX512FVL-NEXT: .LBB15_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB15_11 +; AVX512FVL-NEXT: .LBB15_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB15_13 +; AVX512FVL-NEXT: .LBB15_14: # %else12 +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: js .LBB15_15 +; AVX512FVL-NEXT: .LBB15_16: # %else14 +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: jne .LBB15_17 +; AVX512FVL-NEXT: .LBB15_18: # %else16 +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: jne .LBB15_19 +; AVX512FVL-NEXT: .LBB15_20: # %else18 +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: jne .LBB15_21 +; AVX512FVL-NEXT: .LBB15_22: # %else20 +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: jne .LBB15_23 +; AVX512FVL-NEXT: .LBB15_24: # %else22 +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: jne .LBB15_25 +; AVX512FVL-NEXT: .LBB15_26: # %else24 +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: jne .LBB15_27 +; AVX512FVL-NEXT: .LBB15_28: # %else26 +; 
AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: jne .LBB15_29 +; AVX512FVL-NEXT: .LBB15_30: # %else28 +; AVX512FVL-NEXT: testw %ax, %ax +; AVX512FVL-NEXT: jns .LBB15_32 +; AVX512FVL-NEXT: .LBB15_31: # %cond.store29 +; AVX512FVL-NEXT: vpextrb $15, %xmm0, 15(%rdi) +; AVX512FVL-NEXT: .LBB15_32: # %else30 +; AVX512FVL-NEXT: testl $65536, %eax # imm = 0x10000 +; AVX512FVL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512FVL-NEXT: jne .LBB15_33 +; AVX512FVL-NEXT: # %bb.34: # %else32 +; AVX512FVL-NEXT: testl $131072, %eax # imm = 0x20000 +; AVX512FVL-NEXT: jne .LBB15_35 +; AVX512FVL-NEXT: .LBB15_36: # %else34 +; AVX512FVL-NEXT: testl $262144, %eax # imm = 0x40000 +; AVX512FVL-NEXT: jne .LBB15_37 +; AVX512FVL-NEXT: .LBB15_38: # %else36 +; AVX512FVL-NEXT: testl $524288, %eax # imm = 0x80000 +; AVX512FVL-NEXT: jne .LBB15_39 +; AVX512FVL-NEXT: .LBB15_40: # %else38 +; AVX512FVL-NEXT: testl $1048576, %eax # imm = 0x100000 +; AVX512FVL-NEXT: jne .LBB15_41 +; AVX512FVL-NEXT: .LBB15_42: # %else40 +; AVX512FVL-NEXT: testl $2097152, %eax # imm = 0x200000 +; AVX512FVL-NEXT: jne .LBB15_43 +; AVX512FVL-NEXT: .LBB15_44: # %else42 +; AVX512FVL-NEXT: testl $4194304, %eax # imm = 0x400000 +; AVX512FVL-NEXT: jne .LBB15_45 +; AVX512FVL-NEXT: .LBB15_46: # %else44 +; AVX512FVL-NEXT: testl $8388608, %eax # imm = 0x800000 +; AVX512FVL-NEXT: jne .LBB15_47 +; AVX512FVL-NEXT: .LBB15_48: # %else46 +; AVX512FVL-NEXT: testl $16777216, %eax # imm = 0x1000000 +; AVX512FVL-NEXT: jne .LBB15_49 +; AVX512FVL-NEXT: .LBB15_50: # %else48 +; AVX512FVL-NEXT: testl $33554432, %eax # imm = 0x2000000 +; AVX512FVL-NEXT: jne .LBB15_51 +; AVX512FVL-NEXT: .LBB15_52: # %else50 +; AVX512FVL-NEXT: testl $67108864, %eax # imm = 0x4000000 +; AVX512FVL-NEXT: jne .LBB15_53 +; AVX512FVL-NEXT: .LBB15_54: # %else52 +; AVX512FVL-NEXT: testl $134217728, %eax # imm = 0x8000000 +; AVX512FVL-NEXT: jne .LBB15_55 +; AVX512FVL-NEXT: .LBB15_56: # %else54 +; AVX512FVL-NEXT: testl $268435456, %eax # imm = 0x10000000 +; AVX512FVL-NEXT: jne .LBB15_57 +; AVX512FVL-NEXT: .LBB15_58: # %else56 +; AVX512FVL-NEXT: testl $536870912, %eax # imm = 0x20000000 +; AVX512FVL-NEXT: jne .LBB15_59 +; AVX512FVL-NEXT: .LBB15_60: # %else58 +; AVX512FVL-NEXT: testl $1073741824, %eax # imm = 0x40000000 +; AVX512FVL-NEXT: jne .LBB15_61 +; AVX512FVL-NEXT: .LBB15_62: # %else60 +; AVX512FVL-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; AVX512FVL-NEXT: jne .LBB15_63 +; AVX512FVL-NEXT: .LBB15_64: # %else62 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB15_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB15_4 +; AVX512FVL-NEXT: .LBB15_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB15_6 +; AVX512FVL-NEXT: .LBB15_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB15_8 +; AVX512FVL-NEXT: .LBB15_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB15_10 +; AVX512FVL-NEXT: .LBB15_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB15_12 +; AVX512FVL-NEXT: .LBB15_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB15_14 +; AVX512FVL-NEXT: .LBB15_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; 
AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: jns .LBB15_16 +; AVX512FVL-NEXT: .LBB15_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: je .LBB15_18 +; AVX512FVL-NEXT: .LBB15_17: # %cond.store15 +; AVX512FVL-NEXT: vpextrb $8, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: je .LBB15_20 +; AVX512FVL-NEXT: .LBB15_19: # %cond.store17 +; AVX512FVL-NEXT: vpextrb $9, %xmm0, 9(%rdi) +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: je .LBB15_22 +; AVX512FVL-NEXT: .LBB15_21: # %cond.store19 +; AVX512FVL-NEXT: vpextrb $10, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: je .LBB15_24 +; AVX512FVL-NEXT: .LBB15_23: # %cond.store21 +; AVX512FVL-NEXT: vpextrb $11, %xmm0, 11(%rdi) +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: je .LBB15_26 +; AVX512FVL-NEXT: .LBB15_25: # %cond.store23 +; AVX512FVL-NEXT: vpextrb $12, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: je .LBB15_28 +; AVX512FVL-NEXT: .LBB15_27: # %cond.store25 +; AVX512FVL-NEXT: vpextrb $13, %xmm0, 13(%rdi) +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: je .LBB15_30 +; AVX512FVL-NEXT: .LBB15_29: # %cond.store27 +; AVX512FVL-NEXT: vpextrb $14, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: testw %ax, %ax +; AVX512FVL-NEXT: js .LBB15_31 +; AVX512FVL-NEXT: jmp .LBB15_32 +; AVX512FVL-NEXT: .LBB15_33: # %cond.store31 +; AVX512FVL-NEXT: vpextrb $0, %xmm0, 16(%rdi) +; AVX512FVL-NEXT: testl $131072, %eax # imm = 0x20000 +; AVX512FVL-NEXT: je .LBB15_36 +; AVX512FVL-NEXT: .LBB15_35: # %cond.store33 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 17(%rdi) +; AVX512FVL-NEXT: testl $262144, %eax # imm = 0x40000 +; AVX512FVL-NEXT: je .LBB15_38 +; AVX512FVL-NEXT: .LBB15_37: # %cond.store35 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 18(%rdi) +; AVX512FVL-NEXT: testl $524288, %eax # imm = 0x80000 +; AVX512FVL-NEXT: je .LBB15_40 +; AVX512FVL-NEXT: .LBB15_39: # %cond.store37 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 19(%rdi) +; AVX512FVL-NEXT: testl $1048576, %eax # imm = 0x100000 +; AVX512FVL-NEXT: je .LBB15_42 +; AVX512FVL-NEXT: .LBB15_41: # %cond.store39 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 20(%rdi) +; AVX512FVL-NEXT: testl $2097152, %eax # imm = 0x200000 +; AVX512FVL-NEXT: je .LBB15_44 +; AVX512FVL-NEXT: .LBB15_43: # %cond.store41 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 21(%rdi) +; AVX512FVL-NEXT: testl $4194304, %eax # imm = 0x400000 +; AVX512FVL-NEXT: je .LBB15_46 +; AVX512FVL-NEXT: .LBB15_45: # %cond.store43 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 22(%rdi) +; AVX512FVL-NEXT: testl $8388608, %eax # imm = 0x800000 +; AVX512FVL-NEXT: je .LBB15_48 +; AVX512FVL-NEXT: .LBB15_47: # %cond.store45 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 23(%rdi) +; AVX512FVL-NEXT: testl $16777216, %eax # imm = 0x1000000 +; AVX512FVL-NEXT: je .LBB15_50 +; AVX512FVL-NEXT: .LBB15_49: # %cond.store47 +; AVX512FVL-NEXT: vpextrb $8, %xmm0, 24(%rdi) +; AVX512FVL-NEXT: testl $33554432, %eax # imm = 0x2000000 +; AVX512FVL-NEXT: je .LBB15_52 +; AVX512FVL-NEXT: .LBB15_51: # %cond.store49 +; AVX512FVL-NEXT: vpextrb $9, %xmm0, 25(%rdi) +; AVX512FVL-NEXT: testl $67108864, %eax # imm = 0x4000000 +; AVX512FVL-NEXT: je .LBB15_54 +; AVX512FVL-NEXT: .LBB15_53: # %cond.store51 +; AVX512FVL-NEXT: vpextrb $10, %xmm0, 26(%rdi) +; AVX512FVL-NEXT: testl $134217728, %eax # imm = 0x8000000 +; AVX512FVL-NEXT: je .LBB15_56 +; AVX512FVL-NEXT: .LBB15_55: # %cond.store53 +; 
AVX512FVL-NEXT: vpextrb $11, %xmm0, 27(%rdi) +; AVX512FVL-NEXT: testl $268435456, %eax # imm = 0x10000000 +; AVX512FVL-NEXT: je .LBB15_58 +; AVX512FVL-NEXT: .LBB15_57: # %cond.store55 +; AVX512FVL-NEXT: vpextrb $12, %xmm0, 28(%rdi) +; AVX512FVL-NEXT: testl $536870912, %eax # imm = 0x20000000 +; AVX512FVL-NEXT: je .LBB15_60 +; AVX512FVL-NEXT: .LBB15_59: # %cond.store57 +; AVX512FVL-NEXT: vpextrb $13, %xmm0, 29(%rdi) +; AVX512FVL-NEXT: testl $1073741824, %eax # imm = 0x40000000 +; AVX512FVL-NEXT: je .LBB15_62 +; AVX512FVL-NEXT: .LBB15_61: # %cond.store59 +; AVX512FVL-NEXT: vpextrb $14, %xmm0, 30(%rdi) +; AVX512FVL-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; AVX512FVL-NEXT: je .LBB15_64 +; AVX512FVL-NEXT: .LBB15_63: # %cond.store61 +; AVX512FVL-NEXT: vpextrb $15, %xmm0, 31(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v32i16_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 @@ -6177,6 +7061,129 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v16i16_v16i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512FVL-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; AVX512FVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512FVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512FVL-NEXT: vpmovmskb %xmm1, %eax +; AVX512FVL-NEXT: xorl $65535, %eax # imm = 0xFFFF +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB16_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB16_3 +; AVX512FVL-NEXT: .LBB16_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB16_5 +; AVX512FVL-NEXT: .LBB16_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB16_7 +; AVX512FVL-NEXT: .LBB16_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB16_9 +; AVX512FVL-NEXT: .LBB16_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB16_11 +; AVX512FVL-NEXT: .LBB16_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB16_13 +; AVX512FVL-NEXT: .LBB16_14: # %else12 +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: js .LBB16_15 +; AVX512FVL-NEXT: .LBB16_16: # %else14 +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: jne .LBB16_17 +; AVX512FVL-NEXT: .LBB16_18: # %else16 +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: jne .LBB16_19 +; AVX512FVL-NEXT: .LBB16_20: # %else18 +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: jne .LBB16_21 +; AVX512FVL-NEXT: .LBB16_22: # %else20 +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: jne .LBB16_23 +; AVX512FVL-NEXT: .LBB16_24: # %else22 +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: jne .LBB16_25 +; AVX512FVL-NEXT: .LBB16_26: # %else24 +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: jne .LBB16_27 +; AVX512FVL-NEXT: .LBB16_28: # %else26 +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: jne .LBB16_29 +; AVX512FVL-NEXT: .LBB16_30: # %else28 +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: jne .LBB16_31 +; AVX512FVL-NEXT: .LBB16_32: # %else30 +; AVX512FVL-NEXT: vzeroupper +; 
AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB16_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB16_4 +; AVX512FVL-NEXT: .LBB16_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB16_6 +; AVX512FVL-NEXT: .LBB16_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB16_8 +; AVX512FVL-NEXT: .LBB16_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB16_10 +; AVX512FVL-NEXT: .LBB16_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB16_12 +; AVX512FVL-NEXT: .LBB16_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB16_14 +; AVX512FVL-NEXT: .LBB16_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: jns .LBB16_16 +; AVX512FVL-NEXT: .LBB16_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: je .LBB16_18 +; AVX512FVL-NEXT: .LBB16_17: # %cond.store15 +; AVX512FVL-NEXT: vpextrb $8, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: je .LBB16_20 +; AVX512FVL-NEXT: .LBB16_19: # %cond.store17 +; AVX512FVL-NEXT: vpextrb $9, %xmm0, 9(%rdi) +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: je .LBB16_22 +; AVX512FVL-NEXT: .LBB16_21: # %cond.store19 +; AVX512FVL-NEXT: vpextrb $10, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: je .LBB16_24 +; AVX512FVL-NEXT: .LBB16_23: # %cond.store21 +; AVX512FVL-NEXT: vpextrb $11, %xmm0, 11(%rdi) +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: je .LBB16_26 +; AVX512FVL-NEXT: .LBB16_25: # %cond.store23 +; AVX512FVL-NEXT: vpextrb $12, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: je .LBB16_28 +; AVX512FVL-NEXT: .LBB16_27: # %cond.store25 +; AVX512FVL-NEXT: vpextrb $13, %xmm0, 13(%rdi) +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: je .LBB16_30 +; AVX512FVL-NEXT: .LBB16_29: # %cond.store27 +; AVX512FVL-NEXT: vpextrb $14, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: je .LBB16_32 +; AVX512FVL-NEXT: .LBB16_31: # %cond.store29 +; AVX512FVL-NEXT: vpextrb $15, %xmm0, 15(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v16i16_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -6466,6 +7473,74 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v8i16_v8i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512FVL-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX512FVL-NEXT: vpternlogq {{.*#+}} xmm1 = ~xmm1 +; AVX512FVL-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512FVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB17_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB17_3 +; AVX512FVL-NEXT: 
.LBB17_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB17_5 +; AVX512FVL-NEXT: .LBB17_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB17_7 +; AVX512FVL-NEXT: .LBB17_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB17_9 +; AVX512FVL-NEXT: .LBB17_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB17_11 +; AVX512FVL-NEXT: .LBB17_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB17_13 +; AVX512FVL-NEXT: .LBB17_14: # %else12 +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: jne .LBB17_15 +; AVX512FVL-NEXT: .LBB17_16: # %else14 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB17_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB17_4 +; AVX512FVL-NEXT: .LBB17_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB17_6 +; AVX512FVL-NEXT: .LBB17_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB17_8 +; AVX512FVL-NEXT: .LBB17_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB17_10 +; AVX512FVL-NEXT: .LBB17_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB17_12 +; AVX512FVL-NEXT: .LBB17_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB17_14 +; AVX512FVL-NEXT: .LBB17_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: je .LBB17_16 +; AVX512FVL-NEXT: .LBB17_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v8i16_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index c950ce64e8883..18d394e1281b4 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 | FileCheck %s --check-prefix=SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s 
--check-prefixes=AVX512VL,AVX512FVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { @@ -340,15 +340,15 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: truncstore_v8i64_v8i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512-LABEL: truncstore_v8i64_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v8i64_v8i32: ; AVX512VL: # %bb.0: @@ -358,16 +358,6 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: truncstore_v8i64_v8i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, (%rdi) {%k1} -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer %b = icmp slt <8 x i64> %x, %c = select <8 x i1> %b, <8 x i64> %x, <8 x i64> @@ -897,6 +887,70 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v8i64_v8i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512FVL-NEXT: vpmovsqw %zmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB1_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB1_3 +; AVX512FVL-NEXT: .LBB1_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB1_5 +; AVX512FVL-NEXT: .LBB1_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB1_7 +; AVX512FVL-NEXT: .LBB1_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB1_9 +; AVX512FVL-NEXT: .LBB1_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB1_11 +; AVX512FVL-NEXT: .LBB1_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB1_13 +; AVX512FVL-NEXT: .LBB1_14: # %else12 +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: jne .LBB1_15 +; AVX512FVL-NEXT: .LBB1_16: # %else14 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB1_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB1_4 +; AVX512FVL-NEXT: .LBB1_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, 
%xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB1_6 +; AVX512FVL-NEXT: .LBB1_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB1_8 +; AVX512FVL-NEXT: .LBB1_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB1_10 +; AVX512FVL-NEXT: .LBB1_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrw $4, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB1_12 +; AVX512FVL-NEXT: .LBB1_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrw $5, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB1_14 +; AVX512FVL-NEXT: .LBB1_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrw $6, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: je .LBB1_16 +; AVX512FVL-NEXT: .LBB1_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v8i64_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 @@ -1441,6 +1495,70 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v8i64_v8i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512FVL-NEXT: vpmovsqb %zmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB2_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB2_3 +; AVX512FVL-NEXT: .LBB2_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB2_5 +; AVX512FVL-NEXT: .LBB2_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB2_7 +; AVX512FVL-NEXT: .LBB2_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB2_9 +; AVX512FVL-NEXT: .LBB2_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB2_11 +; AVX512FVL-NEXT: .LBB2_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB2_13 +; AVX512FVL-NEXT: .LBB2_14: # %else12 +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: jne .LBB2_15 +; AVX512FVL-NEXT: .LBB2_16: # %else14 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB2_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB2_4 +; AVX512FVL-NEXT: .LBB2_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB2_6 +; AVX512FVL-NEXT: .LBB2_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB2_8 +; AVX512FVL-NEXT: .LBB2_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB2_10 +; AVX512FVL-NEXT: .LBB2_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB2_12 +; AVX512FVL-NEXT: .LBB2_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB2_14 +; AVX512FVL-NEXT: .LBB2_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: je .LBB2_16 +; AVX512FVL-NEXT: .LBB2_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: 
retq +; ; AVX512BW-LABEL: truncstore_v8i64_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 @@ -1658,17 +1776,17 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: truncstore_v4i64_v4i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftlw $12, %k0, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: vpmovsqd %zmm0, %ymm0 -; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512-LABEL: truncstore_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512-NEXT: kshiftlw $12, %k0, %k0 +; AVX512-NEXT: kshiftrw $12, %k0, %k1 +; AVX512-NEXT: vpmovsqd %zmm0, %ymm0 +; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v4i64_v4i32: ; AVX512VL: # %bb.0: @@ -1678,18 +1796,6 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: truncstore_v4i64_v4i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: kshiftlw $12, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k1 -; AVX512BW-NEXT: vpmovsqd %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp slt <4 x i64> %x, %c = select <4 x i1> %b, <4 x i64> %x, <4 x i64> @@ -1984,6 +2090,42 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v4i64_v4i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovsqw %ymm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB4_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB4_3 +; AVX512FVL-NEXT: .LBB4_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB4_5 +; AVX512FVL-NEXT: .LBB4_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB4_7 +; AVX512FVL-NEXT: .LBB4_8: # %else6 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB4_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB4_4 +; AVX512FVL-NEXT: .LBB4_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB4_6 +; AVX512FVL-NEXT: .LBB4_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB4_8 +; AVX512FVL-NEXT: .LBB4_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v4i64_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -2302,6 +2444,42 @@ define void @truncstore_v4i64_v4i8(<4 
x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v4i64_v4i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovsqb %ymm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB5_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB5_3 +; AVX512FVL-NEXT: .LBB5_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB5_5 +; AVX512FVL-NEXT: .LBB5_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB5_7 +; AVX512FVL-NEXT: .LBB5_8: # %else6 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB5_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB5_4 +; AVX512FVL-NEXT: .LBB5_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB5_6 +; AVX512FVL-NEXT: .LBB5_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB5_8 +; AVX512FVL-NEXT: .LBB5_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v4i64_v4i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -2451,17 +2629,17 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) ; AVX2-NEXT: retq ; -; AVX512F-LABEL: truncstore_v2i64_v2i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftlw $14, %k0, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: vpmovsqd %zmm0, %ymm0 -; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512-LABEL: truncstore_v2i64_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512-NEXT: kshiftlw $14, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k1 +; AVX512-NEXT: vpmovsqd %zmm0, %ymm0 +; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v2i64_v2i32: ; AVX512VL: # %bb.0: @@ -2470,18 +2648,6 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX512VL-NEXT: vpmaxsq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: truncstore_v2i64_v2i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k1 -; AVX512BW-NEXT: vpmovsqd %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp slt <2 x i64> %x, %c = select <2 x i1> %b, <2 x i64> %x, <2 x i64> @@ -2631,6 +2797,26 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: 
truncstore_v2i64_v2i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmq %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovsqw %xmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB7_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB7_3 +; AVX512FVL-NEXT: .LBB7_4: # %else2 +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB7_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB7_4 +; AVX512FVL-NEXT: .LBB7_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v2i64_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -2797,6 +2983,26 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v2i64_v2i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmq %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovsqb %xmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB8_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB8_3 +; AVX512FVL-NEXT: .LBB8_4: # %else2 +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB8_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB8_4 +; AVX512FVL-NEXT: .LBB8_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v2i64_v2i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -3478,6 +3684,126 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v16i32_v16i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512FVL-NEXT: vpmovsdw %zmm0, %ymm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB9_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB9_3 +; AVX512FVL-NEXT: .LBB9_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB9_5 +; AVX512FVL-NEXT: .LBB9_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB9_7 +; AVX512FVL-NEXT: .LBB9_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB9_9 +; AVX512FVL-NEXT: .LBB9_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB9_11 +; AVX512FVL-NEXT: .LBB9_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB9_13 +; AVX512FVL-NEXT: .LBB9_14: # %else12 +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: jns .LBB9_16 +; AVX512FVL-NEXT: .LBB9_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: .LBB9_16: # %else14 +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512FVL-NEXT: jne .LBB9_17 +; AVX512FVL-NEXT: # %bb.18: # %else16 +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: jne .LBB9_19 +; AVX512FVL-NEXT: .LBB9_20: # %else18 +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: jne .LBB9_21 +; AVX512FVL-NEXT: .LBB9_22: # %else20 +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: jne .LBB9_23 +; AVX512FVL-NEXT: .LBB9_24: # %else22 +; AVX512FVL-NEXT: testl $4096, 
%eax # imm = 0x1000 +; AVX512FVL-NEXT: jne .LBB9_25 +; AVX512FVL-NEXT: .LBB9_26: # %else24 +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: jne .LBB9_27 +; AVX512FVL-NEXT: .LBB9_28: # %else26 +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: jne .LBB9_29 +; AVX512FVL-NEXT: .LBB9_30: # %else28 +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: jne .LBB9_31 +; AVX512FVL-NEXT: .LBB9_32: # %else30 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB9_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB9_4 +; AVX512FVL-NEXT: .LBB9_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB9_6 +; AVX512FVL-NEXT: .LBB9_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB9_8 +; AVX512FVL-NEXT: .LBB9_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB9_10 +; AVX512FVL-NEXT: .LBB9_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrw $4, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB9_12 +; AVX512FVL-NEXT: .LBB9_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrw $5, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB9_14 +; AVX512FVL-NEXT: .LBB9_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrw $6, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: js .LBB9_15 +; AVX512FVL-NEXT: jmp .LBB9_16 +; AVX512FVL-NEXT: .LBB9_17: # %cond.store15 +; AVX512FVL-NEXT: vpextrw $0, %xmm0, 16(%rdi) +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: je .LBB9_20 +; AVX512FVL-NEXT: .LBB9_19: # %cond.store17 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 18(%rdi) +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: je .LBB9_22 +; AVX512FVL-NEXT: .LBB9_21: # %cond.store19 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 20(%rdi) +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: je .LBB9_24 +; AVX512FVL-NEXT: .LBB9_23: # %cond.store21 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 22(%rdi) +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: je .LBB9_26 +; AVX512FVL-NEXT: .LBB9_25: # %cond.store23 +; AVX512FVL-NEXT: vpextrw $4, %xmm0, 24(%rdi) +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: je .LBB9_28 +; AVX512FVL-NEXT: .LBB9_27: # %cond.store25 +; AVX512FVL-NEXT: vpextrw $5, %xmm0, 26(%rdi) +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: je .LBB9_30 +; AVX512FVL-NEXT: .LBB9_29: # %cond.store27 +; AVX512FVL-NEXT: vpextrw $6, %xmm0, 28(%rdi) +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: je .LBB9_32 +; AVX512FVL-NEXT: .LBB9_31: # %cond.store29 +; AVX512FVL-NEXT: vpextrw $7, %xmm0, 30(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v16i32_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 @@ -4136,6 +4462,126 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v16i32_v16i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512FVL-NEXT: vpmovsdb %zmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB10_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; 
AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB10_3 +; AVX512FVL-NEXT: .LBB10_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB10_5 +; AVX512FVL-NEXT: .LBB10_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB10_7 +; AVX512FVL-NEXT: .LBB10_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB10_9 +; AVX512FVL-NEXT: .LBB10_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB10_11 +; AVX512FVL-NEXT: .LBB10_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB10_13 +; AVX512FVL-NEXT: .LBB10_14: # %else12 +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: js .LBB10_15 +; AVX512FVL-NEXT: .LBB10_16: # %else14 +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: jne .LBB10_17 +; AVX512FVL-NEXT: .LBB10_18: # %else16 +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: jne .LBB10_19 +; AVX512FVL-NEXT: .LBB10_20: # %else18 +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: jne .LBB10_21 +; AVX512FVL-NEXT: .LBB10_22: # %else20 +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: jne .LBB10_23 +; AVX512FVL-NEXT: .LBB10_24: # %else22 +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: jne .LBB10_25 +; AVX512FVL-NEXT: .LBB10_26: # %else24 +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: jne .LBB10_27 +; AVX512FVL-NEXT: .LBB10_28: # %else26 +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: jne .LBB10_29 +; AVX512FVL-NEXT: .LBB10_30: # %else28 +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: jne .LBB10_31 +; AVX512FVL-NEXT: .LBB10_32: # %else30 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB10_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB10_4 +; AVX512FVL-NEXT: .LBB10_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB10_6 +; AVX512FVL-NEXT: .LBB10_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB10_8 +; AVX512FVL-NEXT: .LBB10_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB10_10 +; AVX512FVL-NEXT: .LBB10_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB10_12 +; AVX512FVL-NEXT: .LBB10_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB10_14 +; AVX512FVL-NEXT: .LBB10_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: jns .LBB10_16 +; AVX512FVL-NEXT: .LBB10_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: je .LBB10_18 +; AVX512FVL-NEXT: .LBB10_17: # %cond.store15 +; AVX512FVL-NEXT: vpextrb $8, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: je .LBB10_20 +; AVX512FVL-NEXT: .LBB10_19: # %cond.store17 +; AVX512FVL-NEXT: vpextrb $9, %xmm0, 9(%rdi) +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: je .LBB10_22 +; AVX512FVL-NEXT: .LBB10_21: # %cond.store19 +; AVX512FVL-NEXT: vpextrb $10, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; 
AVX512FVL-NEXT: je .LBB10_24 +; AVX512FVL-NEXT: .LBB10_23: # %cond.store21 +; AVX512FVL-NEXT: vpextrb $11, %xmm0, 11(%rdi) +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: je .LBB10_26 +; AVX512FVL-NEXT: .LBB10_25: # %cond.store23 +; AVX512FVL-NEXT: vpextrb $12, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: je .LBB10_28 +; AVX512FVL-NEXT: .LBB10_27: # %cond.store25 +; AVX512FVL-NEXT: vpextrb $13, %xmm0, 13(%rdi) +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: je .LBB10_30 +; AVX512FVL-NEXT: .LBB10_29: # %cond.store27 +; AVX512FVL-NEXT: vpextrb $14, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: je .LBB10_32 +; AVX512FVL-NEXT: .LBB10_31: # %cond.store29 +; AVX512FVL-NEXT: vpextrb $15, %xmm0, 15(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v16i32_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 @@ -4509,6 +4955,70 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v8i32_v8i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512FVL-NEXT: vpmovsdw %ymm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB11_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB11_3 +; AVX512FVL-NEXT: .LBB11_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB11_5 +; AVX512FVL-NEXT: .LBB11_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB11_7 +; AVX512FVL-NEXT: .LBB11_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB11_9 +; AVX512FVL-NEXT: .LBB11_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB11_11 +; AVX512FVL-NEXT: .LBB11_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB11_13 +; AVX512FVL-NEXT: .LBB11_14: # %else12 +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: jne .LBB11_15 +; AVX512FVL-NEXT: .LBB11_16: # %else14 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB11_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB11_4 +; AVX512FVL-NEXT: .LBB11_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB11_6 +; AVX512FVL-NEXT: .LBB11_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB11_8 +; AVX512FVL-NEXT: .LBB11_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB11_10 +; AVX512FVL-NEXT: .LBB11_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrw $4, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB11_12 +; AVX512FVL-NEXT: .LBB11_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrw $5, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB11_14 +; AVX512FVL-NEXT: .LBB11_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrw $6, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: je .LBB11_16 +; AVX512FVL-NEXT: .LBB11_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v8i32_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: 
# kill: def $ymm1 killed $ymm1 def $zmm1 @@ -4883,6 +5393,70 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v8i32_v8i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512FVL-NEXT: vpmovsdb %ymm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB12_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB12_3 +; AVX512FVL-NEXT: .LBB12_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB12_5 +; AVX512FVL-NEXT: .LBB12_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB12_7 +; AVX512FVL-NEXT: .LBB12_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB12_9 +; AVX512FVL-NEXT: .LBB12_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB12_11 +; AVX512FVL-NEXT: .LBB12_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB12_13 +; AVX512FVL-NEXT: .LBB12_14: # %else12 +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: jne .LBB12_15 +; AVX512FVL-NEXT: .LBB12_16: # %else14 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB12_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB12_4 +; AVX512FVL-NEXT: .LBB12_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB12_6 +; AVX512FVL-NEXT: .LBB12_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB12_8 +; AVX512FVL-NEXT: .LBB12_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB12_10 +; AVX512FVL-NEXT: .LBB12_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB12_12 +; AVX512FVL-NEXT: .LBB12_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB12_14 +; AVX512FVL-NEXT: .LBB12_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: je .LBB12_16 +; AVX512FVL-NEXT: .LBB12_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v8i32_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 @@ -5064,6 +5638,40 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v4i32_v4i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB13_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB13_3 +; AVX512FVL-NEXT: .LBB13_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB13_5 +; AVX512FVL-NEXT: .LBB13_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB13_7 +; AVX512FVL-NEXT: .LBB13_8: # %else6 +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB13_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je 
.LBB13_4 +; AVX512FVL-NEXT: .LBB13_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB13_6 +; AVX512FVL-NEXT: .LBB13_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB13_8 +; AVX512FVL-NEXT: .LBB13_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v4i32_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -5246,6 +5854,41 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v4i32_v4i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512FVL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB14_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB14_3 +; AVX512FVL-NEXT: .LBB14_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB14_5 +; AVX512FVL-NEXT: .LBB14_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB14_7 +; AVX512FVL-NEXT: .LBB14_8: # %else6 +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB14_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB14_4 +; AVX512FVL-NEXT: .LBB14_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB14_6 +; AVX512FVL-NEXT: .LBB14_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB14_8 +; AVX512FVL-NEXT: .LBB14_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v4i32_v4i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -6440,6 +7083,242 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v32i16_v32i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512FVL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 +; AVX512FVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512FVL-NEXT: vpacksswb %ymm2, %ymm0, %ymm0 +; AVX512FVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512FVL-NEXT: vpmovmskb %ymm1, %eax +; AVX512FVL-NEXT: notl %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB15_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB15_3 +; AVX512FVL-NEXT: .LBB15_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB15_5 +; AVX512FVL-NEXT: .LBB15_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB15_7 +; AVX512FVL-NEXT: .LBB15_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB15_9 +; AVX512FVL-NEXT: .LBB15_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB15_11 +; AVX512FVL-NEXT: .LBB15_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB15_13 +; AVX512FVL-NEXT: .LBB15_14: # %else12 +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: js .LBB15_15 +; AVX512FVL-NEXT: .LBB15_16: # %else14 +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: jne .LBB15_17 +; 
AVX512FVL-NEXT: .LBB15_18: # %else16 +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: jne .LBB15_19 +; AVX512FVL-NEXT: .LBB15_20: # %else18 +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: jne .LBB15_21 +; AVX512FVL-NEXT: .LBB15_22: # %else20 +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: jne .LBB15_23 +; AVX512FVL-NEXT: .LBB15_24: # %else22 +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: jne .LBB15_25 +; AVX512FVL-NEXT: .LBB15_26: # %else24 +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: jne .LBB15_27 +; AVX512FVL-NEXT: .LBB15_28: # %else26 +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: jne .LBB15_29 +; AVX512FVL-NEXT: .LBB15_30: # %else28 +; AVX512FVL-NEXT: testw %ax, %ax +; AVX512FVL-NEXT: jns .LBB15_32 +; AVX512FVL-NEXT: .LBB15_31: # %cond.store29 +; AVX512FVL-NEXT: vpextrb $15, %xmm0, 15(%rdi) +; AVX512FVL-NEXT: .LBB15_32: # %else30 +; AVX512FVL-NEXT: testl $65536, %eax # imm = 0x10000 +; AVX512FVL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512FVL-NEXT: jne .LBB15_33 +; AVX512FVL-NEXT: # %bb.34: # %else32 +; AVX512FVL-NEXT: testl $131072, %eax # imm = 0x20000 +; AVX512FVL-NEXT: jne .LBB15_35 +; AVX512FVL-NEXT: .LBB15_36: # %else34 +; AVX512FVL-NEXT: testl $262144, %eax # imm = 0x40000 +; AVX512FVL-NEXT: jne .LBB15_37 +; AVX512FVL-NEXT: .LBB15_38: # %else36 +; AVX512FVL-NEXT: testl $524288, %eax # imm = 0x80000 +; AVX512FVL-NEXT: jne .LBB15_39 +; AVX512FVL-NEXT: .LBB15_40: # %else38 +; AVX512FVL-NEXT: testl $1048576, %eax # imm = 0x100000 +; AVX512FVL-NEXT: jne .LBB15_41 +; AVX512FVL-NEXT: .LBB15_42: # %else40 +; AVX512FVL-NEXT: testl $2097152, %eax # imm = 0x200000 +; AVX512FVL-NEXT: jne .LBB15_43 +; AVX512FVL-NEXT: .LBB15_44: # %else42 +; AVX512FVL-NEXT: testl $4194304, %eax # imm = 0x400000 +; AVX512FVL-NEXT: jne .LBB15_45 +; AVX512FVL-NEXT: .LBB15_46: # %else44 +; AVX512FVL-NEXT: testl $8388608, %eax # imm = 0x800000 +; AVX512FVL-NEXT: jne .LBB15_47 +; AVX512FVL-NEXT: .LBB15_48: # %else46 +; AVX512FVL-NEXT: testl $16777216, %eax # imm = 0x1000000 +; AVX512FVL-NEXT: jne .LBB15_49 +; AVX512FVL-NEXT: .LBB15_50: # %else48 +; AVX512FVL-NEXT: testl $33554432, %eax # imm = 0x2000000 +; AVX512FVL-NEXT: jne .LBB15_51 +; AVX512FVL-NEXT: .LBB15_52: # %else50 +; AVX512FVL-NEXT: testl $67108864, %eax # imm = 0x4000000 +; AVX512FVL-NEXT: jne .LBB15_53 +; AVX512FVL-NEXT: .LBB15_54: # %else52 +; AVX512FVL-NEXT: testl $134217728, %eax # imm = 0x8000000 +; AVX512FVL-NEXT: jne .LBB15_55 +; AVX512FVL-NEXT: .LBB15_56: # %else54 +; AVX512FVL-NEXT: testl $268435456, %eax # imm = 0x10000000 +; AVX512FVL-NEXT: jne .LBB15_57 +; AVX512FVL-NEXT: .LBB15_58: # %else56 +; AVX512FVL-NEXT: testl $536870912, %eax # imm = 0x20000000 +; AVX512FVL-NEXT: jne .LBB15_59 +; AVX512FVL-NEXT: .LBB15_60: # %else58 +; AVX512FVL-NEXT: testl $1073741824, %eax # imm = 0x40000000 +; AVX512FVL-NEXT: jne .LBB15_61 +; AVX512FVL-NEXT: .LBB15_62: # %else60 +; AVX512FVL-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; AVX512FVL-NEXT: jne .LBB15_63 +; AVX512FVL-NEXT: .LBB15_64: # %else62 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB15_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB15_4 +; AVX512FVL-NEXT: .LBB15_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB15_6 +; AVX512FVL-NEXT: .LBB15_5: # %cond.store3 +; AVX512FVL-NEXT: 
vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB15_8 +; AVX512FVL-NEXT: .LBB15_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB15_10 +; AVX512FVL-NEXT: .LBB15_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB15_12 +; AVX512FVL-NEXT: .LBB15_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB15_14 +; AVX512FVL-NEXT: .LBB15_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: jns .LBB15_16 +; AVX512FVL-NEXT: .LBB15_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: je .LBB15_18 +; AVX512FVL-NEXT: .LBB15_17: # %cond.store15 +; AVX512FVL-NEXT: vpextrb $8, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: je .LBB15_20 +; AVX512FVL-NEXT: .LBB15_19: # %cond.store17 +; AVX512FVL-NEXT: vpextrb $9, %xmm0, 9(%rdi) +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: je .LBB15_22 +; AVX512FVL-NEXT: .LBB15_21: # %cond.store19 +; AVX512FVL-NEXT: vpextrb $10, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: je .LBB15_24 +; AVX512FVL-NEXT: .LBB15_23: # %cond.store21 +; AVX512FVL-NEXT: vpextrb $11, %xmm0, 11(%rdi) +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: je .LBB15_26 +; AVX512FVL-NEXT: .LBB15_25: # %cond.store23 +; AVX512FVL-NEXT: vpextrb $12, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: je .LBB15_28 +; AVX512FVL-NEXT: .LBB15_27: # %cond.store25 +; AVX512FVL-NEXT: vpextrb $13, %xmm0, 13(%rdi) +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: je .LBB15_30 +; AVX512FVL-NEXT: .LBB15_29: # %cond.store27 +; AVX512FVL-NEXT: vpextrb $14, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: testw %ax, %ax +; AVX512FVL-NEXT: js .LBB15_31 +; AVX512FVL-NEXT: jmp .LBB15_32 +; AVX512FVL-NEXT: .LBB15_33: # %cond.store31 +; AVX512FVL-NEXT: vpextrb $0, %xmm0, 16(%rdi) +; AVX512FVL-NEXT: testl $131072, %eax # imm = 0x20000 +; AVX512FVL-NEXT: je .LBB15_36 +; AVX512FVL-NEXT: .LBB15_35: # %cond.store33 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 17(%rdi) +; AVX512FVL-NEXT: testl $262144, %eax # imm = 0x40000 +; AVX512FVL-NEXT: je .LBB15_38 +; AVX512FVL-NEXT: .LBB15_37: # %cond.store35 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 18(%rdi) +; AVX512FVL-NEXT: testl $524288, %eax # imm = 0x80000 +; AVX512FVL-NEXT: je .LBB15_40 +; AVX512FVL-NEXT: .LBB15_39: # %cond.store37 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 19(%rdi) +; AVX512FVL-NEXT: testl $1048576, %eax # imm = 0x100000 +; AVX512FVL-NEXT: je .LBB15_42 +; AVX512FVL-NEXT: .LBB15_41: # %cond.store39 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 20(%rdi) +; AVX512FVL-NEXT: testl $2097152, %eax # imm = 0x200000 +; AVX512FVL-NEXT: je .LBB15_44 +; AVX512FVL-NEXT: .LBB15_43: # %cond.store41 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 21(%rdi) +; AVX512FVL-NEXT: testl $4194304, %eax # imm = 0x400000 +; AVX512FVL-NEXT: je .LBB15_46 +; AVX512FVL-NEXT: .LBB15_45: # %cond.store43 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 22(%rdi) +; AVX512FVL-NEXT: testl $8388608, %eax # imm = 0x800000 +; AVX512FVL-NEXT: je .LBB15_48 +; AVX512FVL-NEXT: .LBB15_47: # %cond.store45 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 23(%rdi) +; AVX512FVL-NEXT: testl $16777216, %eax # imm 
= 0x1000000 +; AVX512FVL-NEXT: je .LBB15_50 +; AVX512FVL-NEXT: .LBB15_49: # %cond.store47 +; AVX512FVL-NEXT: vpextrb $8, %xmm0, 24(%rdi) +; AVX512FVL-NEXT: testl $33554432, %eax # imm = 0x2000000 +; AVX512FVL-NEXT: je .LBB15_52 +; AVX512FVL-NEXT: .LBB15_51: # %cond.store49 +; AVX512FVL-NEXT: vpextrb $9, %xmm0, 25(%rdi) +; AVX512FVL-NEXT: testl $67108864, %eax # imm = 0x4000000 +; AVX512FVL-NEXT: je .LBB15_54 +; AVX512FVL-NEXT: .LBB15_53: # %cond.store51 +; AVX512FVL-NEXT: vpextrb $10, %xmm0, 26(%rdi) +; AVX512FVL-NEXT: testl $134217728, %eax # imm = 0x8000000 +; AVX512FVL-NEXT: je .LBB15_56 +; AVX512FVL-NEXT: .LBB15_55: # %cond.store53 +; AVX512FVL-NEXT: vpextrb $11, %xmm0, 27(%rdi) +; AVX512FVL-NEXT: testl $268435456, %eax # imm = 0x10000000 +; AVX512FVL-NEXT: je .LBB15_58 +; AVX512FVL-NEXT: .LBB15_57: # %cond.store55 +; AVX512FVL-NEXT: vpextrb $12, %xmm0, 28(%rdi) +; AVX512FVL-NEXT: testl $536870912, %eax # imm = 0x20000000 +; AVX512FVL-NEXT: je .LBB15_60 +; AVX512FVL-NEXT: .LBB15_59: # %cond.store57 +; AVX512FVL-NEXT: vpextrb $13, %xmm0, 29(%rdi) +; AVX512FVL-NEXT: testl $1073741824, %eax # imm = 0x40000000 +; AVX512FVL-NEXT: je .LBB15_62 +; AVX512FVL-NEXT: .LBB15_61: # %cond.store59 +; AVX512FVL-NEXT: vpextrb $14, %xmm0, 30(%rdi) +; AVX512FVL-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; AVX512FVL-NEXT: je .LBB15_64 +; AVX512FVL-NEXT: .LBB15_63: # %cond.store61 +; AVX512FVL-NEXT: vpextrb $15, %xmm0, 31(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v32i16_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 @@ -7067,6 +7946,129 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v16i16_v16i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512FVL-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; AVX512FVL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX512FVL-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 +; AVX512FVL-NEXT: vpmovmskb %xmm1, %eax +; AVX512FVL-NEXT: xorl $65535, %eax # imm = 0xFFFF +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB16_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB16_3 +; AVX512FVL-NEXT: .LBB16_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB16_5 +; AVX512FVL-NEXT: .LBB16_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB16_7 +; AVX512FVL-NEXT: .LBB16_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB16_9 +; AVX512FVL-NEXT: .LBB16_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB16_11 +; AVX512FVL-NEXT: .LBB16_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB16_13 +; AVX512FVL-NEXT: .LBB16_14: # %else12 +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: js .LBB16_15 +; AVX512FVL-NEXT: .LBB16_16: # %else14 +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: jne .LBB16_17 +; AVX512FVL-NEXT: .LBB16_18: # %else16 +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: jne .LBB16_19 +; AVX512FVL-NEXT: .LBB16_20: # %else18 +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: jne .LBB16_21 +; AVX512FVL-NEXT: .LBB16_22: # %else20 +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: jne .LBB16_23 +; AVX512FVL-NEXT: .LBB16_24: # %else22 +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: jne .LBB16_25 +; 
AVX512FVL-NEXT: .LBB16_26: # %else24 +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: jne .LBB16_27 +; AVX512FVL-NEXT: .LBB16_28: # %else26 +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: jne .LBB16_29 +; AVX512FVL-NEXT: .LBB16_30: # %else28 +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: jne .LBB16_31 +; AVX512FVL-NEXT: .LBB16_32: # %else30 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB16_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB16_4 +; AVX512FVL-NEXT: .LBB16_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB16_6 +; AVX512FVL-NEXT: .LBB16_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB16_8 +; AVX512FVL-NEXT: .LBB16_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB16_10 +; AVX512FVL-NEXT: .LBB16_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB16_12 +; AVX512FVL-NEXT: .LBB16_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB16_14 +; AVX512FVL-NEXT: .LBB16_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: jns .LBB16_16 +; AVX512FVL-NEXT: .LBB16_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: je .LBB16_18 +; AVX512FVL-NEXT: .LBB16_17: # %cond.store15 +; AVX512FVL-NEXT: vpextrb $8, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: je .LBB16_20 +; AVX512FVL-NEXT: .LBB16_19: # %cond.store17 +; AVX512FVL-NEXT: vpextrb $9, %xmm0, 9(%rdi) +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: je .LBB16_22 +; AVX512FVL-NEXT: .LBB16_21: # %cond.store19 +; AVX512FVL-NEXT: vpextrb $10, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: je .LBB16_24 +; AVX512FVL-NEXT: .LBB16_23: # %cond.store21 +; AVX512FVL-NEXT: vpextrb $11, %xmm0, 11(%rdi) +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: je .LBB16_26 +; AVX512FVL-NEXT: .LBB16_25: # %cond.store23 +; AVX512FVL-NEXT: vpextrb $12, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: je .LBB16_28 +; AVX512FVL-NEXT: .LBB16_27: # %cond.store25 +; AVX512FVL-NEXT: vpextrb $13, %xmm0, 13(%rdi) +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: je .LBB16_30 +; AVX512FVL-NEXT: .LBB16_29: # %cond.store27 +; AVX512FVL-NEXT: vpextrb $14, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: je .LBB16_32 +; AVX512FVL-NEXT: .LBB16_31: # %cond.store29 +; AVX512FVL-NEXT: vpextrb $15, %xmm0, 15(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v16i16_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -7361,6 +8363,74 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v8i16_v8i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512FVL-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; 
AVX512FVL-NEXT: vpternlogq {{.*#+}} xmm1 = ~xmm1 +; AVX512FVL-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512FVL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB17_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB17_3 +; AVX512FVL-NEXT: .LBB17_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB17_5 +; AVX512FVL-NEXT: .LBB17_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB17_7 +; AVX512FVL-NEXT: .LBB17_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB17_9 +; AVX512FVL-NEXT: .LBB17_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB17_11 +; AVX512FVL-NEXT: .LBB17_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB17_13 +; AVX512FVL-NEXT: .LBB17_14: # %else12 +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: jne .LBB17_15 +; AVX512FVL-NEXT: .LBB17_16: # %else14 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB17_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB17_4 +; AVX512FVL-NEXT: .LBB17_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB17_6 +; AVX512FVL-NEXT: .LBB17_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB17_8 +; AVX512FVL-NEXT: .LBB17_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB17_10 +; AVX512FVL-NEXT: .LBB17_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB17_12 +; AVX512FVL-NEXT: .LBB17_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB17_14 +; AVX512FVL-NEXT: .LBB17_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: je .LBB17_16 +; AVX512FVL-NEXT: .LBB17_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v8i16_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index da057dd084b36..4c4b6e78d1f8c 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 | FileCheck %s --check-prefix=SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 | FileCheck %s --check-prefixes=SSE4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefix=AVX512F -; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512FVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { @@ -272,14 +272,14 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: truncstore_v8i64_v8i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512-LABEL: truncstore_v8i64_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k1 +; AVX512-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512-NEXT: vpmovqd %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v8i64_v8i32: ; AVX512VL: # %bb.0: @@ -288,15 +288,6 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX512VL-NEXT: vpmovqd %zmm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: truncstore_v8i64_v8i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512BW-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, (%rdi) {%k1} -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %a = icmp ne <8 x i32> %mask, zeroinitializer %b = icmp ult <8 x i64> %x, %c = select <8 x i1> %b, <8 x i64> %x, <8 x i64> @@ -762,6 +753,70 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v8i64_v8i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512FVL-NEXT: vpmovusqw %zmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB1_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB1_3 +; AVX512FVL-NEXT: .LBB1_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB1_5 +; AVX512FVL-NEXT: .LBB1_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB1_7 +; AVX512FVL-NEXT: .LBB1_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB1_9 +; AVX512FVL-NEXT: .LBB1_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB1_11 +; AVX512FVL-NEXT: .LBB1_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB1_13 +; AVX512FVL-NEXT: .LBB1_14: # %else12 +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: jne .LBB1_15 +; AVX512FVL-NEXT: .LBB1_16: # %else14 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB1_1: # %cond.store +; AVX512FVL-NEXT: vpextrw 
$0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB1_4 +; AVX512FVL-NEXT: .LBB1_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB1_6 +; AVX512FVL-NEXT: .LBB1_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB1_8 +; AVX512FVL-NEXT: .LBB1_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB1_10 +; AVX512FVL-NEXT: .LBB1_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrw $4, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB1_12 +; AVX512FVL-NEXT: .LBB1_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrw $5, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB1_14 +; AVX512FVL-NEXT: .LBB1_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrw $6, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: je .LBB1_16 +; AVX512FVL-NEXT: .LBB1_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v8i64_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 @@ -1236,6 +1291,70 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v8i64_v8i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512FVL-NEXT: vpmovusqb %zmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB2_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB2_3 +; AVX512FVL-NEXT: .LBB2_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB2_5 +; AVX512FVL-NEXT: .LBB2_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB2_7 +; AVX512FVL-NEXT: .LBB2_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB2_9 +; AVX512FVL-NEXT: .LBB2_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB2_11 +; AVX512FVL-NEXT: .LBB2_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB2_13 +; AVX512FVL-NEXT: .LBB2_14: # %else12 +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: jne .LBB2_15 +; AVX512FVL-NEXT: .LBB2_16: # %else14 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB2_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB2_4 +; AVX512FVL-NEXT: .LBB2_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB2_6 +; AVX512FVL-NEXT: .LBB2_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB2_8 +; AVX512FVL-NEXT: .LBB2_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB2_10 +; AVX512FVL-NEXT: .LBB2_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB2_12 +; AVX512FVL-NEXT: .LBB2_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB2_14 +; AVX512FVL-NEXT: .LBB2_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $-128, %al +; 
AVX512FVL-NEXT: je .LBB2_16 +; AVX512FVL-NEXT: .LBB2_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v8i64_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 @@ -1416,17 +1535,17 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; -; AVX512F-LABEL: truncstore_v4i64_v4i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftlw $12, %k0, %k0 -; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: vpmovusqd %zmm0, %ymm0 -; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512-LABEL: truncstore_v4i64_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512-NEXT: kshiftlw $12, %k0, %k0 +; AVX512-NEXT: kshiftrw $12, %k0, %k1 +; AVX512-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v4i64_v4i32: ; AVX512VL: # %bb.0: @@ -1435,18 +1554,6 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX512VL-NEXT: vpmovqd %ymm0, (%rdi) {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: truncstore_v4i64_v4i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: kshiftlw $12, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $12, %k0, %k1 -; AVX512BW-NEXT: vpmovusqd %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %a = icmp ne <4 x i32> %mask, zeroinitializer %b = icmp ult <4 x i64> %x, %c = select <4 x i1> %b, <4 x i64> %x, <4 x i64> @@ -1710,6 +1817,42 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v4i64_v4i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovusqw %ymm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB4_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB4_3 +; AVX512FVL-NEXT: .LBB4_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB4_5 +; AVX512FVL-NEXT: .LBB4_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB4_7 +; AVX512FVL-NEXT: .LBB4_8: # %else6 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB4_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB4_4 +; AVX512FVL-NEXT: .LBB4_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB4_6 +; AVX512FVL-NEXT: .LBB4_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB4_8 +; AVX512FVL-NEXT: .LBB4_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; 
AVX512BW-LABEL: truncstore_v4i64_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -1994,6 +2137,42 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v4i64_v4i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovusqb %ymm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB5_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB5_3 +; AVX512FVL-NEXT: .LBB5_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB5_5 +; AVX512FVL-NEXT: .LBB5_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB5_7 +; AVX512FVL-NEXT: .LBB5_8: # %else6 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB5_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB5_4 +; AVX512FVL-NEXT: .LBB5_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB5_6 +; AVX512FVL-NEXT: .LBB5_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB5_8 +; AVX512FVL-NEXT: .LBB5_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v4i64_v4i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -2113,17 +2292,17 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) ; AVX2-NEXT: retq ; -; AVX512F-LABEL: truncstore_v2i64_v2i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512F-NEXT: kshiftlw $14, %k0, %k0 -; AVX512F-NEXT: kshiftrw $14, %k0, %k1 -; AVX512F-NEXT: vpmovusqd %zmm0, %ymm0 -; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq +; AVX512-LABEL: truncstore_v2i64_v2i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: vptestmq %zmm1, %zmm1, %k0 +; AVX512-NEXT: kshiftlw $14, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k1 +; AVX512-NEXT: vpmovusqd %zmm0, %ymm0 +; AVX512-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq ; ; AVX512VL-LABEL: truncstore_v2i64_v2i32: ; AVX512VL: # %bb.0: @@ -2131,18 +2310,6 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX512VL-NEXT: vpminuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 ; AVX512VL-NEXT: vpmovqd %xmm0, (%rdi) {%k1} ; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: truncstore_v2i64_v2i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vptestmq %zmm1, %zmm1, %k0 -; AVX512BW-NEXT: kshiftlw $14, %k0, %k0 -; AVX512BW-NEXT: kshiftrw $14, %k0, %k1 -; AVX512BW-NEXT: vpmovusqd %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq %a = icmp ne <2 x i64> %mask, zeroinitializer %b = icmp ult <2 x i64> %x, %c = select <2 x i1> %b, <2 x i64> %x, <2 x i64> 
@@ -2268,6 +2435,26 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v2i64_v2i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmq %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovusqw %xmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB7_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB7_3 +; AVX512FVL-NEXT: .LBB7_4: # %else2 +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB7_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB7_4 +; AVX512FVL-NEXT: .LBB7_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v2i64_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -2408,6 +2595,26 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v2i64_v2i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmq %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovusqb %xmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB8_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB8_3 +; AVX512FVL-NEXT: .LBB8_4: # %else2 +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB8_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB8_4 +; AVX512FVL-NEXT: .LBB8_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v2i64_v2i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -3119,6 +3326,126 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v16i32_v16i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; AVX512FVL-NEXT: vpmovusdw %zmm0, %ymm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB9_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB9_3 +; AVX512FVL-NEXT: .LBB9_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB9_5 +; AVX512FVL-NEXT: .LBB9_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB9_7 +; AVX512FVL-NEXT: .LBB9_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB9_9 +; AVX512FVL-NEXT: .LBB9_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB9_11 +; AVX512FVL-NEXT: .LBB9_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB9_13 +; AVX512FVL-NEXT: .LBB9_14: # %else12 +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: jns .LBB9_16 +; AVX512FVL-NEXT: .LBB9_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: .LBB9_16: # %else14 +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512FVL-NEXT: jne .LBB9_17 +; AVX512FVL-NEXT: # %bb.18: # %else16 +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: jne .LBB9_19 +; AVX512FVL-NEXT: .LBB9_20: # %else18 +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: jne .LBB9_21 +; AVX512FVL-NEXT: 
.LBB9_22: # %else20 +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: jne .LBB9_23 +; AVX512FVL-NEXT: .LBB9_24: # %else22 +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: jne .LBB9_25 +; AVX512FVL-NEXT: .LBB9_26: # %else24 +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: jne .LBB9_27 +; AVX512FVL-NEXT: .LBB9_28: # %else26 +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: jne .LBB9_29 +; AVX512FVL-NEXT: .LBB9_30: # %else28 +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: jne .LBB9_31 +; AVX512FVL-NEXT: .LBB9_32: # %else30 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB9_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB9_4 +; AVX512FVL-NEXT: .LBB9_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB9_6 +; AVX512FVL-NEXT: .LBB9_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB9_8 +; AVX512FVL-NEXT: .LBB9_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB9_10 +; AVX512FVL-NEXT: .LBB9_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrw $4, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB9_12 +; AVX512FVL-NEXT: .LBB9_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrw $5, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB9_14 +; AVX512FVL-NEXT: .LBB9_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrw $6, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: js .LBB9_15 +; AVX512FVL-NEXT: jmp .LBB9_16 +; AVX512FVL-NEXT: .LBB9_17: # %cond.store15 +; AVX512FVL-NEXT: vpextrw $0, %xmm0, 16(%rdi) +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: je .LBB9_20 +; AVX512FVL-NEXT: .LBB9_19: # %cond.store17 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 18(%rdi) +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: je .LBB9_22 +; AVX512FVL-NEXT: .LBB9_21: # %cond.store19 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 20(%rdi) +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: je .LBB9_24 +; AVX512FVL-NEXT: .LBB9_23: # %cond.store21 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 22(%rdi) +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: je .LBB9_26 +; AVX512FVL-NEXT: .LBB9_25: # %cond.store23 +; AVX512FVL-NEXT: vpextrw $4, %xmm0, 24(%rdi) +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: je .LBB9_28 +; AVX512FVL-NEXT: .LBB9_27: # %cond.store25 +; AVX512FVL-NEXT: vpextrw $5, %xmm0, 26(%rdi) +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: je .LBB9_30 +; AVX512FVL-NEXT: .LBB9_29: # %cond.store27 +; AVX512FVL-NEXT: vpextrw $6, %xmm0, 28(%rdi) +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: je .LBB9_32 +; AVX512FVL-NEXT: .LBB9_31: # %cond.store29 +; AVX512FVL-NEXT: vpextrw $7, %xmm0, 30(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v16i32_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 @@ -3815,6 +4142,126 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v16i32_v16i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %zmm1, %zmm1, %k0 +; 
AVX512FVL-NEXT: vpmovusdb %zmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB10_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB10_3 +; AVX512FVL-NEXT: .LBB10_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB10_5 +; AVX512FVL-NEXT: .LBB10_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB10_7 +; AVX512FVL-NEXT: .LBB10_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB10_9 +; AVX512FVL-NEXT: .LBB10_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB10_11 +; AVX512FVL-NEXT: .LBB10_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB10_13 +; AVX512FVL-NEXT: .LBB10_14: # %else12 +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: js .LBB10_15 +; AVX512FVL-NEXT: .LBB10_16: # %else14 +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: jne .LBB10_17 +; AVX512FVL-NEXT: .LBB10_18: # %else16 +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: jne .LBB10_19 +; AVX512FVL-NEXT: .LBB10_20: # %else18 +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: jne .LBB10_21 +; AVX512FVL-NEXT: .LBB10_22: # %else20 +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: jne .LBB10_23 +; AVX512FVL-NEXT: .LBB10_24: # %else22 +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: jne .LBB10_25 +; AVX512FVL-NEXT: .LBB10_26: # %else24 +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: jne .LBB10_27 +; AVX512FVL-NEXT: .LBB10_28: # %else26 +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: jne .LBB10_29 +; AVX512FVL-NEXT: .LBB10_30: # %else28 +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: jne .LBB10_31 +; AVX512FVL-NEXT: .LBB10_32: # %else30 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB10_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB10_4 +; AVX512FVL-NEXT: .LBB10_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB10_6 +; AVX512FVL-NEXT: .LBB10_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB10_8 +; AVX512FVL-NEXT: .LBB10_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB10_10 +; AVX512FVL-NEXT: .LBB10_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB10_12 +; AVX512FVL-NEXT: .LBB10_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB10_14 +; AVX512FVL-NEXT: .LBB10_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: jns .LBB10_16 +; AVX512FVL-NEXT: .LBB10_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: je .LBB10_18 +; AVX512FVL-NEXT: .LBB10_17: # %cond.store15 +; AVX512FVL-NEXT: vpextrb $8, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: je .LBB10_20 +; AVX512FVL-NEXT: .LBB10_19: # %cond.store17 +; AVX512FVL-NEXT: vpextrb $9, %xmm0, 9(%rdi) +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; 
AVX512FVL-NEXT: je .LBB10_22 +; AVX512FVL-NEXT: .LBB10_21: # %cond.store19 +; AVX512FVL-NEXT: vpextrb $10, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: je .LBB10_24 +; AVX512FVL-NEXT: .LBB10_23: # %cond.store21 +; AVX512FVL-NEXT: vpextrb $11, %xmm0, 11(%rdi) +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: je .LBB10_26 +; AVX512FVL-NEXT: .LBB10_25: # %cond.store23 +; AVX512FVL-NEXT: vpextrb $12, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: je .LBB10_28 +; AVX512FVL-NEXT: .LBB10_27: # %cond.store25 +; AVX512FVL-NEXT: vpextrb $13, %xmm0, 13(%rdi) +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: je .LBB10_30 +; AVX512FVL-NEXT: .LBB10_29: # %cond.store27 +; AVX512FVL-NEXT: vpextrb $14, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: je .LBB10_32 +; AVX512FVL-NEXT: .LBB10_31: # %cond.store29 +; AVX512FVL-NEXT: vpextrb $15, %xmm0, 15(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v16i32_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k1 @@ -4211,6 +4658,70 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v8i32_v8i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512FVL-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB11_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB11_3 +; AVX512FVL-NEXT: .LBB11_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB11_5 +; AVX512FVL-NEXT: .LBB11_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB11_7 +; AVX512FVL-NEXT: .LBB11_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB11_9 +; AVX512FVL-NEXT: .LBB11_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB11_11 +; AVX512FVL-NEXT: .LBB11_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB11_13 +; AVX512FVL-NEXT: .LBB11_14: # %else12 +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: jne .LBB11_15 +; AVX512FVL-NEXT: .LBB11_16: # %else14 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB11_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB11_4 +; AVX512FVL-NEXT: .LBB11_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB11_6 +; AVX512FVL-NEXT: .LBB11_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB11_8 +; AVX512FVL-NEXT: .LBB11_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB11_10 +; AVX512FVL-NEXT: .LBB11_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrw $4, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB11_12 +; AVX512FVL-NEXT: .LBB11_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrw $5, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB11_14 +; AVX512FVL-NEXT: .LBB11_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrw $6, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: je .LBB11_16 +; AVX512FVL-NEXT: .LBB11_15: # %cond.store13 +; 
AVX512FVL-NEXT: vpextrw $7, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v8i32_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 @@ -4604,6 +5115,70 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v8i32_v8i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512FVL-NEXT: vpmovusdb %ymm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB12_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB12_3 +; AVX512FVL-NEXT: .LBB12_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB12_5 +; AVX512FVL-NEXT: .LBB12_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB12_7 +; AVX512FVL-NEXT: .LBB12_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB12_9 +; AVX512FVL-NEXT: .LBB12_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB12_11 +; AVX512FVL-NEXT: .LBB12_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB12_13 +; AVX512FVL-NEXT: .LBB12_14: # %else12 +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: jne .LBB12_15 +; AVX512FVL-NEXT: .LBB12_16: # %else14 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB12_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB12_4 +; AVX512FVL-NEXT: .LBB12_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB12_6 +; AVX512FVL-NEXT: .LBB12_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB12_8 +; AVX512FVL-NEXT: .LBB12_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB12_10 +; AVX512FVL-NEXT: .LBB12_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB12_12 +; AVX512FVL-NEXT: .LBB12_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB12_14 +; AVX512FVL-NEXT: .LBB12_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: je .LBB12_16 +; AVX512FVL-NEXT: .LBB12_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v8i32_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 @@ -4831,6 +5406,40 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v4i32_v4i16: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovusdw %xmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB13_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB13_3 +; AVX512FVL-NEXT: .LBB13_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB13_5 +; AVX512FVL-NEXT: .LBB13_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB13_7 +; AVX512FVL-NEXT: .LBB13_8: # 
%else6 +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB13_1: # %cond.store +; AVX512FVL-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB13_4 +; AVX512FVL-NEXT: .LBB13_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB13_6 +; AVX512FVL-NEXT: .LBB13_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB13_8 +; AVX512FVL-NEXT: .LBB13_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v4i32_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -5059,6 +5668,40 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v4i32_v4i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vptestmd %xmm1, %xmm1, %k0 +; AVX512FVL-NEXT: vpmovusdb %xmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB14_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB14_3 +; AVX512FVL-NEXT: .LBB14_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB14_5 +; AVX512FVL-NEXT: .LBB14_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB14_7 +; AVX512FVL-NEXT: .LBB14_8: # %else6 +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB14_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB14_4 +; AVX512FVL-NEXT: .LBB14_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB14_6 +; AVX512FVL-NEXT: .LBB14_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB14_8 +; AVX512FVL-NEXT: .LBB14_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v4i32_v4i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -6277,6 +6920,245 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, ptr %p, <32 x i8> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v32i16_v32i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512FVL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 +; AVX512FVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512FVL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512FVL-NEXT: vpminuw %ymm3, %ymm2, %ymm2 +; AVX512FVL-NEXT: vpminuw %ymm3, %ymm0, %ymm0 +; AVX512FVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; AVX512FVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512FVL-NEXT: vpmovmskb %ymm1, %eax +; AVX512FVL-NEXT: notl %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB15_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB15_3 +; AVX512FVL-NEXT: .LBB15_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB15_5 +; AVX512FVL-NEXT: .LBB15_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB15_7 +; AVX512FVL-NEXT: .LBB15_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB15_9 +; AVX512FVL-NEXT: .LBB15_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB15_11 +; AVX512FVL-NEXT: 
.LBB15_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB15_13 +; AVX512FVL-NEXT: .LBB15_14: # %else12 +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: js .LBB15_15 +; AVX512FVL-NEXT: .LBB15_16: # %else14 +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: jne .LBB15_17 +; AVX512FVL-NEXT: .LBB15_18: # %else16 +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: jne .LBB15_19 +; AVX512FVL-NEXT: .LBB15_20: # %else18 +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: jne .LBB15_21 +; AVX512FVL-NEXT: .LBB15_22: # %else20 +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: jne .LBB15_23 +; AVX512FVL-NEXT: .LBB15_24: # %else22 +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: jne .LBB15_25 +; AVX512FVL-NEXT: .LBB15_26: # %else24 +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: jne .LBB15_27 +; AVX512FVL-NEXT: .LBB15_28: # %else26 +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: jne .LBB15_29 +; AVX512FVL-NEXT: .LBB15_30: # %else28 +; AVX512FVL-NEXT: testw %ax, %ax +; AVX512FVL-NEXT: jns .LBB15_32 +; AVX512FVL-NEXT: .LBB15_31: # %cond.store29 +; AVX512FVL-NEXT: vpextrb $15, %xmm0, 15(%rdi) +; AVX512FVL-NEXT: .LBB15_32: # %else30 +; AVX512FVL-NEXT: testl $65536, %eax # imm = 0x10000 +; AVX512FVL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512FVL-NEXT: jne .LBB15_33 +; AVX512FVL-NEXT: # %bb.34: # %else32 +; AVX512FVL-NEXT: testl $131072, %eax # imm = 0x20000 +; AVX512FVL-NEXT: jne .LBB15_35 +; AVX512FVL-NEXT: .LBB15_36: # %else34 +; AVX512FVL-NEXT: testl $262144, %eax # imm = 0x40000 +; AVX512FVL-NEXT: jne .LBB15_37 +; AVX512FVL-NEXT: .LBB15_38: # %else36 +; AVX512FVL-NEXT: testl $524288, %eax # imm = 0x80000 +; AVX512FVL-NEXT: jne .LBB15_39 +; AVX512FVL-NEXT: .LBB15_40: # %else38 +; AVX512FVL-NEXT: testl $1048576, %eax # imm = 0x100000 +; AVX512FVL-NEXT: jne .LBB15_41 +; AVX512FVL-NEXT: .LBB15_42: # %else40 +; AVX512FVL-NEXT: testl $2097152, %eax # imm = 0x200000 +; AVX512FVL-NEXT: jne .LBB15_43 +; AVX512FVL-NEXT: .LBB15_44: # %else42 +; AVX512FVL-NEXT: testl $4194304, %eax # imm = 0x400000 +; AVX512FVL-NEXT: jne .LBB15_45 +; AVX512FVL-NEXT: .LBB15_46: # %else44 +; AVX512FVL-NEXT: testl $8388608, %eax # imm = 0x800000 +; AVX512FVL-NEXT: jne .LBB15_47 +; AVX512FVL-NEXT: .LBB15_48: # %else46 +; AVX512FVL-NEXT: testl $16777216, %eax # imm = 0x1000000 +; AVX512FVL-NEXT: jne .LBB15_49 +; AVX512FVL-NEXT: .LBB15_50: # %else48 +; AVX512FVL-NEXT: testl $33554432, %eax # imm = 0x2000000 +; AVX512FVL-NEXT: jne .LBB15_51 +; AVX512FVL-NEXT: .LBB15_52: # %else50 +; AVX512FVL-NEXT: testl $67108864, %eax # imm = 0x4000000 +; AVX512FVL-NEXT: jne .LBB15_53 +; AVX512FVL-NEXT: .LBB15_54: # %else52 +; AVX512FVL-NEXT: testl $134217728, %eax # imm = 0x8000000 +; AVX512FVL-NEXT: jne .LBB15_55 +; AVX512FVL-NEXT: .LBB15_56: # %else54 +; AVX512FVL-NEXT: testl $268435456, %eax # imm = 0x10000000 +; AVX512FVL-NEXT: jne .LBB15_57 +; AVX512FVL-NEXT: .LBB15_58: # %else56 +; AVX512FVL-NEXT: testl $536870912, %eax # imm = 0x20000000 +; AVX512FVL-NEXT: jne .LBB15_59 +; AVX512FVL-NEXT: .LBB15_60: # %else58 +; AVX512FVL-NEXT: testl $1073741824, %eax # imm = 0x40000000 +; AVX512FVL-NEXT: jne .LBB15_61 +; AVX512FVL-NEXT: .LBB15_62: # %else60 +; AVX512FVL-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; AVX512FVL-NEXT: jne .LBB15_63 +; AVX512FVL-NEXT: .LBB15_64: # %else62 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB15_1: # %cond.store +; 
AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB15_4 +; AVX512FVL-NEXT: .LBB15_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB15_6 +; AVX512FVL-NEXT: .LBB15_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB15_8 +; AVX512FVL-NEXT: .LBB15_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB15_10 +; AVX512FVL-NEXT: .LBB15_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB15_12 +; AVX512FVL-NEXT: .LBB15_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB15_14 +; AVX512FVL-NEXT: .LBB15_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: jns .LBB15_16 +; AVX512FVL-NEXT: .LBB15_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: je .LBB15_18 +; AVX512FVL-NEXT: .LBB15_17: # %cond.store15 +; AVX512FVL-NEXT: vpextrb $8, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: je .LBB15_20 +; AVX512FVL-NEXT: .LBB15_19: # %cond.store17 +; AVX512FVL-NEXT: vpextrb $9, %xmm0, 9(%rdi) +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: je .LBB15_22 +; AVX512FVL-NEXT: .LBB15_21: # %cond.store19 +; AVX512FVL-NEXT: vpextrb $10, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: je .LBB15_24 +; AVX512FVL-NEXT: .LBB15_23: # %cond.store21 +; AVX512FVL-NEXT: vpextrb $11, %xmm0, 11(%rdi) +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: je .LBB15_26 +; AVX512FVL-NEXT: .LBB15_25: # %cond.store23 +; AVX512FVL-NEXT: vpextrb $12, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: je .LBB15_28 +; AVX512FVL-NEXT: .LBB15_27: # %cond.store25 +; AVX512FVL-NEXT: vpextrb $13, %xmm0, 13(%rdi) +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: je .LBB15_30 +; AVX512FVL-NEXT: .LBB15_29: # %cond.store27 +; AVX512FVL-NEXT: vpextrb $14, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: testw %ax, %ax +; AVX512FVL-NEXT: js .LBB15_31 +; AVX512FVL-NEXT: jmp .LBB15_32 +; AVX512FVL-NEXT: .LBB15_33: # %cond.store31 +; AVX512FVL-NEXT: vpextrb $0, %xmm0, 16(%rdi) +; AVX512FVL-NEXT: testl $131072, %eax # imm = 0x20000 +; AVX512FVL-NEXT: je .LBB15_36 +; AVX512FVL-NEXT: .LBB15_35: # %cond.store33 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 17(%rdi) +; AVX512FVL-NEXT: testl $262144, %eax # imm = 0x40000 +; AVX512FVL-NEXT: je .LBB15_38 +; AVX512FVL-NEXT: .LBB15_37: # %cond.store35 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 18(%rdi) +; AVX512FVL-NEXT: testl $524288, %eax # imm = 0x80000 +; AVX512FVL-NEXT: je .LBB15_40 +; AVX512FVL-NEXT: .LBB15_39: # %cond.store37 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 19(%rdi) +; AVX512FVL-NEXT: testl $1048576, %eax # imm = 0x100000 +; AVX512FVL-NEXT: je .LBB15_42 +; AVX512FVL-NEXT: .LBB15_41: # %cond.store39 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 20(%rdi) +; AVX512FVL-NEXT: testl $2097152, %eax # imm = 0x200000 +; AVX512FVL-NEXT: je .LBB15_44 +; AVX512FVL-NEXT: .LBB15_43: # %cond.store41 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 21(%rdi) +; AVX512FVL-NEXT: testl $4194304, %eax # imm = 0x400000 +; AVX512FVL-NEXT: je 
.LBB15_46 +; AVX512FVL-NEXT: .LBB15_45: # %cond.store43 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 22(%rdi) +; AVX512FVL-NEXT: testl $8388608, %eax # imm = 0x800000 +; AVX512FVL-NEXT: je .LBB15_48 +; AVX512FVL-NEXT: .LBB15_47: # %cond.store45 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 23(%rdi) +; AVX512FVL-NEXT: testl $16777216, %eax # imm = 0x1000000 +; AVX512FVL-NEXT: je .LBB15_50 +; AVX512FVL-NEXT: .LBB15_49: # %cond.store47 +; AVX512FVL-NEXT: vpextrb $8, %xmm0, 24(%rdi) +; AVX512FVL-NEXT: testl $33554432, %eax # imm = 0x2000000 +; AVX512FVL-NEXT: je .LBB15_52 +; AVX512FVL-NEXT: .LBB15_51: # %cond.store49 +; AVX512FVL-NEXT: vpextrb $9, %xmm0, 25(%rdi) +; AVX512FVL-NEXT: testl $67108864, %eax # imm = 0x4000000 +; AVX512FVL-NEXT: je .LBB15_54 +; AVX512FVL-NEXT: .LBB15_53: # %cond.store51 +; AVX512FVL-NEXT: vpextrb $10, %xmm0, 26(%rdi) +; AVX512FVL-NEXT: testl $134217728, %eax # imm = 0x8000000 +; AVX512FVL-NEXT: je .LBB15_56 +; AVX512FVL-NEXT: .LBB15_55: # %cond.store53 +; AVX512FVL-NEXT: vpextrb $11, %xmm0, 27(%rdi) +; AVX512FVL-NEXT: testl $268435456, %eax # imm = 0x10000000 +; AVX512FVL-NEXT: je .LBB15_58 +; AVX512FVL-NEXT: .LBB15_57: # %cond.store55 +; AVX512FVL-NEXT: vpextrb $12, %xmm0, 28(%rdi) +; AVX512FVL-NEXT: testl $536870912, %eax # imm = 0x20000000 +; AVX512FVL-NEXT: je .LBB15_60 +; AVX512FVL-NEXT: .LBB15_59: # %cond.store57 +; AVX512FVL-NEXT: vpextrb $13, %xmm0, 29(%rdi) +; AVX512FVL-NEXT: testl $1073741824, %eax # imm = 0x40000000 +; AVX512FVL-NEXT: je .LBB15_62 +; AVX512FVL-NEXT: .LBB15_61: # %cond.store59 +; AVX512FVL-NEXT: vpextrb $14, %xmm0, 30(%rdi) +; AVX512FVL-NEXT: testl $-2147483648, %eax # imm = 0x80000000 +; AVX512FVL-NEXT: je .LBB15_64 +; AVX512FVL-NEXT: .LBB15_63: # %cond.store61 +; AVX512FVL-NEXT: vpextrb $15, %xmm0, 31(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v32i16_v32i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 @@ -6915,6 +7797,130 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v16i16_v16i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512FVL-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; AVX512FVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512FVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512FVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512FVL-NEXT: vpmovmskb %xmm1, %eax +; AVX512FVL-NEXT: xorl $65535, %eax # imm = 0xFFFF +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB16_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB16_3 +; AVX512FVL-NEXT: .LBB16_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB16_5 +; AVX512FVL-NEXT: .LBB16_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB16_7 +; AVX512FVL-NEXT: .LBB16_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB16_9 +; AVX512FVL-NEXT: .LBB16_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB16_11 +; AVX512FVL-NEXT: .LBB16_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB16_13 +; AVX512FVL-NEXT: .LBB16_14: # %else12 +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: js .LBB16_15 +; AVX512FVL-NEXT: 
.LBB16_16: # %else14 +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: jne .LBB16_17 +; AVX512FVL-NEXT: .LBB16_18: # %else16 +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: jne .LBB16_19 +; AVX512FVL-NEXT: .LBB16_20: # %else18 +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: jne .LBB16_21 +; AVX512FVL-NEXT: .LBB16_22: # %else20 +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: jne .LBB16_23 +; AVX512FVL-NEXT: .LBB16_24: # %else22 +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: jne .LBB16_25 +; AVX512FVL-NEXT: .LBB16_26: # %else24 +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: jne .LBB16_27 +; AVX512FVL-NEXT: .LBB16_28: # %else26 +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: jne .LBB16_29 +; AVX512FVL-NEXT: .LBB16_30: # %else28 +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; AVX512FVL-NEXT: jne .LBB16_31 +; AVX512FVL-NEXT: .LBB16_32: # %else30 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB16_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB16_4 +; AVX512FVL-NEXT: .LBB16_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB16_6 +; AVX512FVL-NEXT: .LBB16_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB16_8 +; AVX512FVL-NEXT: .LBB16_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB16_10 +; AVX512FVL-NEXT: .LBB16_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB16_12 +; AVX512FVL-NEXT: .LBB16_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB16_14 +; AVX512FVL-NEXT: .LBB16_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb %al, %al +; AVX512FVL-NEXT: jns .LBB16_16 +; AVX512FVL-NEXT: .LBB16_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: testl $256, %eax # imm = 0x100 +; AVX512FVL-NEXT: je .LBB16_18 +; AVX512FVL-NEXT: .LBB16_17: # %cond.store15 +; AVX512FVL-NEXT: vpextrb $8, %xmm0, 8(%rdi) +; AVX512FVL-NEXT: testl $512, %eax # imm = 0x200 +; AVX512FVL-NEXT: je .LBB16_20 +; AVX512FVL-NEXT: .LBB16_19: # %cond.store17 +; AVX512FVL-NEXT: vpextrb $9, %xmm0, 9(%rdi) +; AVX512FVL-NEXT: testl $1024, %eax # imm = 0x400 +; AVX512FVL-NEXT: je .LBB16_22 +; AVX512FVL-NEXT: .LBB16_21: # %cond.store19 +; AVX512FVL-NEXT: vpextrb $10, %xmm0, 10(%rdi) +; AVX512FVL-NEXT: testl $2048, %eax # imm = 0x800 +; AVX512FVL-NEXT: je .LBB16_24 +; AVX512FVL-NEXT: .LBB16_23: # %cond.store21 +; AVX512FVL-NEXT: vpextrb $11, %xmm0, 11(%rdi) +; AVX512FVL-NEXT: testl $4096, %eax # imm = 0x1000 +; AVX512FVL-NEXT: je .LBB16_26 +; AVX512FVL-NEXT: .LBB16_25: # %cond.store23 +; AVX512FVL-NEXT: vpextrb $12, %xmm0, 12(%rdi) +; AVX512FVL-NEXT: testl $8192, %eax # imm = 0x2000 +; AVX512FVL-NEXT: je .LBB16_28 +; AVX512FVL-NEXT: .LBB16_27: # %cond.store25 +; AVX512FVL-NEXT: vpextrb $13, %xmm0, 13(%rdi) +; AVX512FVL-NEXT: testl $16384, %eax # imm = 0x4000 +; AVX512FVL-NEXT: je .LBB16_30 +; AVX512FVL-NEXT: .LBB16_29: # %cond.store27 +; AVX512FVL-NEXT: vpextrb $14, %xmm0, 14(%rdi) +; AVX512FVL-NEXT: testl $32768, %eax # imm = 0x8000 +; 
AVX512FVL-NEXT: je .LBB16_32 +; AVX512FVL-NEXT: .LBB16_31: # %cond.store29 +; AVX512FVL-NEXT: vpextrb $15, %xmm0, 15(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v16i16_v16i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 @@ -7212,6 +8218,75 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512FVL-LABEL: truncstore_v8i16_v8i8: +; AVX512FVL: # %bb.0: +; AVX512FVL-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512FVL-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 +; AVX512FVL-NEXT: vpternlogq {{.*#+}} xmm1 = ~xmm1 +; AVX512FVL-NEXT: vpmovsxwd %xmm1, %ymm1 +; AVX512FVL-NEXT: vptestmd %ymm1, %ymm1, %k0 +; AVX512FVL-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512FVL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512FVL-NEXT: kmovw %k0, %eax +; AVX512FVL-NEXT: testb $1, %al +; AVX512FVL-NEXT: jne .LBB17_1 +; AVX512FVL-NEXT: # %bb.2: # %else +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: jne .LBB17_3 +; AVX512FVL-NEXT: .LBB17_4: # %else2 +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: jne .LBB17_5 +; AVX512FVL-NEXT: .LBB17_6: # %else4 +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: jne .LBB17_7 +; AVX512FVL-NEXT: .LBB17_8: # %else6 +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: jne .LBB17_9 +; AVX512FVL-NEXT: .LBB17_10: # %else8 +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: jne .LBB17_11 +; AVX512FVL-NEXT: .LBB17_12: # %else10 +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: jne .LBB17_13 +; AVX512FVL-NEXT: .LBB17_14: # %else12 +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: jne .LBB17_15 +; AVX512FVL-NEXT: .LBB17_16: # %else14 +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; AVX512FVL-NEXT: .LBB17_1: # %cond.store +; AVX512FVL-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX512FVL-NEXT: testb $2, %al +; AVX512FVL-NEXT: je .LBB17_4 +; AVX512FVL-NEXT: .LBB17_3: # %cond.store1 +; AVX512FVL-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX512FVL-NEXT: testb $4, %al +; AVX512FVL-NEXT: je .LBB17_6 +; AVX512FVL-NEXT: .LBB17_5: # %cond.store3 +; AVX512FVL-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX512FVL-NEXT: testb $8, %al +; AVX512FVL-NEXT: je .LBB17_8 +; AVX512FVL-NEXT: .LBB17_7: # %cond.store5 +; AVX512FVL-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX512FVL-NEXT: testb $16, %al +; AVX512FVL-NEXT: je .LBB17_10 +; AVX512FVL-NEXT: .LBB17_9: # %cond.store7 +; AVX512FVL-NEXT: vpextrb $4, %xmm0, 4(%rdi) +; AVX512FVL-NEXT: testb $32, %al +; AVX512FVL-NEXT: je .LBB17_12 +; AVX512FVL-NEXT: .LBB17_11: # %cond.store9 +; AVX512FVL-NEXT: vpextrb $5, %xmm0, 5(%rdi) +; AVX512FVL-NEXT: testb $64, %al +; AVX512FVL-NEXT: je .LBB17_14 +; AVX512FVL-NEXT: .LBB17_13: # %cond.store11 +; AVX512FVL-NEXT: vpextrb $6, %xmm0, 6(%rdi) +; AVX512FVL-NEXT: testb $-128, %al +; AVX512FVL-NEXT: je .LBB17_16 +; AVX512FVL-NEXT: .LBB17_15: # %cond.store13 +; AVX512FVL-NEXT: vpextrb $7, %xmm0, 7(%rdi) +; AVX512FVL-NEXT: vzeroupper +; AVX512FVL-NEXT: retq +; ; AVX512BW-LABEL: truncstore_v8i16_v8i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 diff --git a/llvm/test/CodeGen/X86/negative-sin.ll b/llvm/test/CodeGen/X86/negative-sin.ll index f24507d3a4f38..4836da2ad7797 100644 --- a/llvm/test/CodeGen/X86/negative-sin.ll +++ b/llvm/test/CodeGen/X86/negative-sin.ll @@ -82,18 +82,13 @@ define double @semi_strict2(double %e) nounwind { ret double %h } -; FIXME: -; Auto-upgrade function attribute to IR-level fast-math-flags. 
- -define double @fn_attr(double %e) nounwind #0 { -; CHECK-LABEL: fn_attr: +define double @nsz_flag(double %e) nounwind { +; CHECK-LABEL: nsz_flag: ; CHECK: # %bb.0: ; CHECK-NEXT: jmp sin@PLT # TAILCALL - %f = fsub double 0.0, %e - %g = call double @sin(double %f) readonly - %h = fsub double 0.0, %g + %f = fsub nsz double 0.0, %e + %g = call nsz double @sin(double %f) readonly + %h = fsub nsz double 0.0, %g ret double %h } -attributes #0 = { "unsafe-fp-math"="true" "no-signed-zeros-fp-math"="true" } - diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll index f53983036a016..5df1867f73c8e 100644 --- a/llvm/test/CodeGen/X86/oddsubvector.ll +++ b/llvm/test/CodeGen/X86/oddsubvector.ll @@ -155,10 +155,10 @@ define <16 x i32> @PR42819(ptr %a0) { define void @PR42833() { ; SSE2-LABEL: PR42833: ; SSE2: # %bb.0: +; SSE2-NEXT: movl b(%rip), %eax ; SSE2-NEXT: movdqa c+144(%rip), %xmm2 ; SSE2-NEXT: movdqa c+128(%rip), %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: addl b(%rip), %eax +; SSE2-NEXT: addl c+128(%rip), %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: paddd %xmm0, %xmm3 @@ -166,7 +166,7 @@ define void @PR42833() { ; SSE2-NEXT: psubd %xmm2, %xmm4 ; SSE2-NEXT: paddd %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm5 ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] ; SSE2-NEXT: movdqa %xmm2, c+144(%rip) ; SSE2-NEXT: movaps %xmm5, c+128(%rip) @@ -191,17 +191,17 @@ define void @PR42833() { ; ; SSE42-LABEL: PR42833: ; SSE42: # %bb.0: +; SSE42-NEXT: movl b(%rip), %eax ; SSE42-NEXT: movdqa c+144(%rip), %xmm1 ; SSE42-NEXT: movdqa c+128(%rip), %xmm0 -; SSE42-NEXT: movd %xmm0, %eax -; SSE42-NEXT: addl b(%rip), %eax +; SSE42-NEXT: addl c+128(%rip), %eax ; SSE42-NEXT: movd %eax, %xmm2 ; SSE42-NEXT: paddd %xmm0, %xmm2 ; SSE42-NEXT: movdqa d+144(%rip), %xmm3 ; SSE42-NEXT: psubd %xmm1, %xmm3 ; SSE42-NEXT: paddd %xmm1, %xmm1 ; SSE42-NEXT: movdqa %xmm0, %xmm4 -; SSE42-NEXT: paddd %xmm0, %xmm4 +; SSE42-NEXT: paddd %xmm4, %xmm4 ; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7] ; SSE42-NEXT: movdqa %xmm1, c+144(%rip) ; SSE42-NEXT: movdqa %xmm4, c+128(%rip) diff --git a/llvm/test/CodeGen/X86/pr159723.ll b/llvm/test/CodeGen/X86/pr159723.ll index cab4abb043639..c66b101fff990 100644 --- a/llvm/test/CodeGen/X86/pr159723.ll +++ b/llvm/test/CodeGen/X86/pr159723.ll @@ -17,7 +17,7 @@ define <8 x i1> @test_cmp_v8half_ogt(<8 x half> %rhs, <8 x i1> %mask) nounwind { ; CHECK-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; CHECK-NEXT: callq test_call_8@PLT ; CHECK-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; CHECK-NEXT: vcmpltph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 {%k1} # 16-byte Folded Reload +; CHECK-NEXT: vcmpgtph {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %k0 {%k1} # 16-byte Folded Reload ; CHECK-NEXT: vpmovm2w %k0, %xmm0 ; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: retq @@ -79,7 +79,7 @@ define <16 x i1> @test_cmp_v16half_olt_commute(<16 x half> %rhs, <16 x i1> %mask ; CHECK-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; CHECK-NEXT: callq test_call_16@PLT ; CHECK-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; CHECK-NEXT: vcmpltph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 {%k1} # 32-byte Folded Reload +; CHECK-NEXT: vcmpgtph {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %k0 {%k1} # 32-byte Folded Reload ; CHECK-NEXT: vpmovm2b %k0, %xmm0 ; CHECK-NEXT: addq $56, %rsp ; CHECK-NEXT: vzeroupper @@ -100,7 +100,7 @@ define <32 x i1> 
@test_cmp_v32half_oge(<32 x half> %rhs, <32 x i1> %mask) nounwi ; CHECK-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-NEXT: callq test_call_32@PLT ; CHECK-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload -; CHECK-NEXT: vcmpleph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 {%k1} # 64-byte Folded Reload +; CHECK-NEXT: vcmpgeph {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %k0 {%k1} # 64-byte Folded Reload ; CHECK-NEXT: vpmovm2b %k0, %ymm0 ; CHECK-NEXT: addq $88, %rsp ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll index ce03f8fad4a19..161e9651a9cf2 100644 --- a/llvm/test/CodeGen/X86/pr62286.ll +++ b/llvm/test/CodeGen/X86/pr62286.ll @@ -26,27 +26,33 @@ define i64 @PR62286(i32 %a) { ; AVX1-LABEL: PR62286: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] ; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR62286: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm1 -; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 @@ -59,12 +65,12 @@ define i64 @PR62286(i32 %a) { ; AVX512-LABEL: PR62286: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovd %edi, %xmm0 -; AVX512-NEXT: movb $8, %al +; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] +; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm1 +; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512-NEXT: movw $4369, %ax # imm = 0x1111 ; AVX512-NEXT: kmovd %eax, %k1 -; AVX512-NEXT: vpexpandd %ymm0, %ymm1 {%k1} {z} -; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/pr74736.ll b/llvm/test/CodeGen/X86/pr74736.ll index ceccee00c9457..58955265580bd 100644 --- a/llvm/test/CodeGen/X86/pr74736.ll +++ b/llvm/test/CodeGen/X86/pr74736.ll @@ -6,8 +6,8 @@ define void @main(<16 x i32> %0, i32 %1) { ; SSE-LABEL: main: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movd %edi, %xmm4 -; SSE-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,0] +; SSE-NEXT: movsd {{.*#+}} xmm0 = [0,1,0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] ; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: paddd %xmm1, %xmm1 ; SSE-NEXT: paddd %xmm3, %xmm3 @@ -32,20 +32,20 @@ define void @main(<16 x i32> %0, i32 %1) { ; AVX-LABEL: main: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3] ; AVX-NEXT: movl $1, %eax ; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 ; AVX-NEXT: vpinsrd $3, %edi, %xmm2, %xmm2 -; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX-NEXT: vpaddd %ymm0, %ymm0, %ymm0 -; AVX-NEXT: vpaddd %ymm1, %ymm1, %ymm1 -; AVX-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,1,3,3,5,5,7] -; AVX-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; AVX-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] +; AVX-NEXT: vpaddd %ymm2, %ymm2, %ymm2 +; AVX-NEXT: vpaddd %ymm1, %ymm1, %ymm3 ; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] ; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7] +; AVX-NEXT: vpaddd %ymm0, %ymm0, %ymm0 +; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7] ; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX-NEXT: vpxor %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,1,3,3,5,5,7] +; AVX-NEXT: vpermd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vpxor %ymm0, %ymm1, %ymm0 ; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] diff --git a/llvm/test/CodeGen/X86/rematerialize-sub-super-reg.mir b/llvm/test/CodeGen/X86/rematerialize-sub-super-reg.mir index b99c5fc8df0cb..44a2aecdc3672 100644 --- a/llvm/test/CodeGen/X86/rematerialize-sub-super-reg.mir +++ b/llvm/test/CodeGen/X86/rematerialize-sub-super-reg.mir @@ -165,5 +165,25 @@ body: | bb.3: $rax = COPY %t3 RET 0, $rax - ... 
+--- +name: rematerialize_superregister_into_subregister_def_with_impdef_physreg +body: | + bb.0.entry: + ; CHECK-LABEL: name: rematerialize_superregister_into_subregister_def_with_impdef_physreg + ; CHECK: dead $esi = MOV32r0 implicit-def dead $eflags, implicit-def $rsi + ; CHECK-NEXT: dead $edx = MOV32r0 implicit-def dead $eflags, implicit-def $rdx + ; CHECK-NEXT: FAKE_USE implicit killed $rsi, implicit killed $rdx + ; CHECK-NEXT: dead $eax = MOV32r0 implicit-def dead $eflags, implicit-def dead $rax, implicit-def $al + ; CHECK-NEXT: FAKE_USE implicit killed $al + ; CHECK-NEXT: $eax = MOV32r0 implicit-def dead $eflags + ; CHECK-NEXT: RET 0, $eax + undef %1.sub_32bit:gr64_with_sub_8bit = MOV32r0 implicit-def dead $eflags, implicit-def %1 + $rsi = COPY %1 + $rdx = COPY %1 + FAKE_USE implicit killed $rsi, implicit killed $rdx + %4:gr8 = COPY killed %1.sub_8bit + $al = COPY killed %4 + FAKE_USE implicit killed $al + $eax = MOV32r0 implicit-def dead $eflags + RET 0, killed $eax diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll index 756019d0e98a0..03b61d9235254 100644 --- a/llvm/test/CodeGen/X86/shift-i512.ll +++ b/llvm/test/CodeGen/X86/shift-i512.ll @@ -10,7 +10,7 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm0[3,4,5,6,7,0,1,2] ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm3 +; AVX512VL-NEXT: vpaddq %xmm0, %xmm0, %xmm3 ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX512VL-NEXT: vpsrlq $63, %xmm4, %xmm4 ; AVX512VL-NEXT: vpaddq %xmm2, %xmm2, %xmm2 @@ -34,7 +34,7 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) { ; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; AVX512VBMI-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3 -; AVX512VBMI-NEXT: vpsllq $1, %xmm0, %xmm4 +; AVX512VBMI-NEXT: vpaddq %xmm0, %xmm0, %xmm4 ; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 ; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] @@ -51,7 +51,7 @@ define <8 x i64> @shl_i512_1(<8 x i64> %a) { ; ZNVER4-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; ZNVER4-NEXT: vextracti128 $1, %ymm0, %xmm2 ; ZNVER4-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; ZNVER4-NEXT: vpsllq $1, %xmm0, %xmm4 +; ZNVER4-NEXT: vpaddq %xmm0, %xmm0, %xmm4 ; ZNVER4-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 ; ZNVER4-NEXT: vpshldq $1, %xmm3, %xmm2, %xmm3 ; ZNVER4-NEXT: vextracti64x4 $1, %zmm0, %ymm2 diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll index 82c460fc55938..571915b47d297 100644 --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: 
llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-PERLANE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-FAST-ALL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW,AVX512BW-FAST-PERLANE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512BWVL-FAST-ALL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512BWVL-FAST-PERLANE @@ -21,6 +21,31 @@ define void @shuffle_v64i8_to_v32i8_1(ptr %L, ptr %S) nounwind { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VL-FAST-ALL-LABEL: shuffle_v64i8_to_v32i8_1: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] +; AVX512VL-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,2,5,7] +; AVX512VL-FAST-ALL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512VL-FAST-ALL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512VL-FAST-ALL-NEXT: vzeroupper +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v64i8_to_v32i8_1: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512VL-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512VL-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-FAST-PERLANE-NEXT: vzeroupper +; AVX512VL-FAST-PERLANE-NEXT: retq +; ; AVX512BW-LABEL: shuffle_v64i8_to_v32i8_1: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsrlw $8, (%rdi), %zmm0 @@ -63,6 +88,40 @@ define void @shuffle_v16i32_to_v8i32_1(ptr %L, ptr %S) nounwind { ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; +; AVX512VL-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32_1: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15] +; AVX512VL-FAST-ALL-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; AVX512VL-FAST-ALL-NEXT: vmovaps %ymm0, (%rsi) +; AVX512VL-FAST-ALL-NEXT: vzeroupper +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v16i32_to_v8i32_1: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: vmovaps (%rdi), 
%ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7] +; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VL-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rsi) +; AVX512VL-FAST-PERLANE-NEXT: vzeroupper +; AVX512VL-FAST-PERLANE-NEXT: retq +; +; AVX512BW-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32_1: +; AVX512BW-FAST-ALL: # %bb.0: +; AVX512BW-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15] +; AVX512BW-FAST-ALL-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; AVX512BW-FAST-ALL-NEXT: vmovaps %ymm0, (%rsi) +; AVX512BW-FAST-ALL-NEXT: vzeroupper +; AVX512BW-FAST-ALL-NEXT: retq +; +; AVX512BW-FAST-PERLANE-LABEL: shuffle_v16i32_to_v8i32_1: +; AVX512BW-FAST-PERLANE: # %bb.0: +; AVX512BW-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 +; AVX512BW-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],mem[1,3],ymm0[5,7],mem[5,7] +; AVX512BW-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BW-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rsi) +; AVX512BW-FAST-PERLANE-NEXT: vzeroupper +; AVX512BW-FAST-PERLANE-NEXT: retq +; ; AVX512BWVL-FAST-ALL-LABEL: shuffle_v16i32_to_v8i32_1: ; AVX512BWVL-FAST-ALL: # %bb.0: ; AVX512BWVL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15] diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index 3f48b22e2b9ff..a48be037ebebc 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -5791,20 +5791,20 @@ declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone define <2 x i64> @test_mm_slli_epi16(<2 x i64> %a0) { ; SSE-LABEL: test_mm_slli_epi16: ; SSE: # %bb.0: -; SSE-NEXT: psllw $1, %xmm0 # encoding: [0x66,0x0f,0x71,0xf0,0x01] +; SSE-NEXT: psllw $2, %xmm0 # encoding: [0x66,0x0f,0x71,0xf0,0x02] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_slli_epi16: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xf0,0x01] +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x71,0xf0,0x02] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_slli_epi16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x01] +; AVX512-NEXT: vpsllw $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x02] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %arg0, i32 1) + %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %arg0, i32 2) %bc = bitcast <8 x i16> %res to <2 x i64> ret <2 x i64> %bc } @@ -5813,20 +5813,20 @@ declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone define <2 x i64> @test_mm_slli_epi32(<2 x i64> %a0) { ; SSE-LABEL: test_mm_slli_epi32: ; SSE: # %bb.0: -; SSE-NEXT: pslld $1, %xmm0 # encoding: [0x66,0x0f,0x72,0xf0,0x01] +; SSE-NEXT: pslld $2, %xmm0 # encoding: [0x66,0x0f,0x72,0xf0,0x02] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_slli_epi32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpslld $1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xf0,0x01] +; AVX1-NEXT: vpslld $2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x72,0xf0,0x02] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_slli_epi32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x01] +; AVX512-NEXT: vpslld $2, %xmm0, %xmm0 # 
EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x02] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <4 x i32> - %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %arg0, i32 1) + %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %arg0, i32 2) %bc = bitcast <4 x i32> %res to <2 x i64> ret <2 x i64> %bc } @@ -5835,19 +5835,19 @@ declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone define <2 x i64> @test_mm_slli_epi64(<2 x i64> %a0) { ; SSE-LABEL: test_mm_slli_epi64: ; SSE: # %bb.0: -; SSE-NEXT: psllq $1, %xmm0 # encoding: [0x66,0x0f,0x73,0xf0,0x01] +; SSE-NEXT: psllq $2, %xmm0 # encoding: [0x66,0x0f,0x73,0xf0,0x02] ; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_slli_epi64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x73,0xf0,0x01] +; AVX1-NEXT: vpsllq $2, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x73,0xf0,0x02] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512-LABEL: test_mm_slli_epi64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpsllq $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x73,0xf0,0x01] +; AVX512-NEXT: vpsllq $2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x73,0xf0,0x02] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 1) + %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 2) ret <2 x i64> %res } declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone diff --git a/llvm/test/CodeGen/X86/usub_inc_iv.ll b/llvm/test/CodeGen/X86/usub_inc_iv.ll index 88bfddb51f2d4..ff06aaabd1b0c 100644 --- a/llvm/test/CodeGen/X86/usub_inc_iv.ll +++ b/llvm/test/CodeGen/X86/usub_inc_iv.ll @@ -303,14 +303,14 @@ define i32 @test_06(ptr %p, i64 %len, i32 %x) { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[MATH:%.*]], [[BACKEDGE:%.*]] ], [ [[LEN:%.*]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 [[IV]], i64 1) -; CHECK-NEXT: [[MATH]] = extractvalue { i64, i1 } [[TMP0]], 0 -; CHECK-NEXT: [[OV:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1 +; CHECK-NEXT: [[OV:%.*]] = icmp eq i64 [[IV]], 0 ; CHECK-NEXT: br i1 [[OV]], label [[EXIT:%.*]], label [[BACKEDGE]] ; CHECK: backedge: -; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[MATH]], 4 +; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[IV]], 4 ; CHECK-NEXT: [[SUNKADDR1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[SUNKADDR]] -; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, ptr [[SUNKADDR1]] unordered, align 4 +; CHECK-NEXT: [[SUNKADDR2:%.*]] = getelementptr i8, ptr [[SUNKADDR1]], i64 -4 +; CHECK-NEXT: [[LOADED:%.*]] = load atomic i32, ptr [[SUNKADDR2]] unordered, align 4 +; CHECK-NEXT: [[MATH]] = add i64 [[IV]], -1 ; CHECK-NEXT: [[COND_2:%.*]] = icmp eq i32 [[LOADED]], [[X:%.*]] ; CHECK-NEXT: br i1 [[COND_2]], label [[FAILURE:%.*]], label [[LOOP]] ; CHECK: exit: diff --git a/llvm/test/CodeGen/X86/vec_shift6.ll b/llvm/test/CodeGen/X86/vec_shift6.ll index 71e659c681d17..219e32c86c848 100644 --- a/llvm/test/CodeGen/X86/vec_shift6.ll +++ b/llvm/test/CodeGen/X86/vec_shift6.ll @@ -28,14 +28,14 @@ define <8 x i16> @test2(<8 x i16> %a) { ; SSE2-LABEL: test2: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: paddw %xmm1, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test2: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: paddw 
%xmm0, %xmm1 +; SSE41-NEXT: paddw %xmm1, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; @@ -56,7 +56,7 @@ define <4 x i32> @test3(<4 x i32> %a) { ; SSE2-LABEL: test3: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm1 ; SSE2-NEXT: pslld $2, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq @@ -81,14 +81,14 @@ define <4 x i32> @test4(<4 x i32> %a) { ; SSE2-LABEL: test4: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm1 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test4: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: paddd %xmm0, %xmm1 +; SSE41-NEXT: paddd %xmm1, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; SSE41-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec_unsafe-fp-math.ll b/llvm/test/CodeGen/X86/vec_unsafe-fp-math.ll index 23d22e75d1e9d..3f92d2b79c85d 100644 --- a/llvm/test/CodeGen/X86/vec_unsafe-fp-math.ll +++ b/llvm/test/CodeGen/X86/vec_unsafe-fp-math.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -enable-unsafe-fp-math -enable-no-signed-zeros-fp-math -mtriple=x86_64-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s ; Make sure that vectors get the same benefits as scalars when using unsafe-fp-math. @@ -18,7 +18,7 @@ define <4 x float> @vec_fneg(<4 x float> %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq - %sub = fsub <4 x float> zeroinitializer, %x + %sub = fsub nsz <4 x float> zeroinitializer, %x ret <4 x float> %sub } diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index 0b98a9388adc1..445e572aff403 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -679,6 +679,19 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VLVBMI2-LABEL: var_funnnel_v32i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95] +; AVX512VLVBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm3 +; AVX512VLVBMI2-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm0 +; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512VLVBMI2-NEXT: vpsllvw %zmm0, %zmm3, %zmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: var_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 @@ -1918,6 
+1931,17 @@ define <32 x i8> @constant_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y) nounwind { ; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512VLBW-NEXT: retq ; +; AVX512VLVBMI2-LABEL: constant_funnnel_v32i8: +; AVX512VLVBMI2: # %bb.0: +; AVX512VLVBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95] +; AVX512VLVBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm2 +; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VLVBMI2-NEXT: retq +; ; XOPAVX1-LABEL: constant_funnnel_v32i8: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-gep.ll b/llvm/test/CodeGen/X86/vector-gep.ll index 5c485592295d3..b4cffcd171b33 100644 --- a/llvm/test/CodeGen/X86/vector-gep.ll +++ b/llvm/test/CodeGen/X86/vector-gep.ll @@ -122,91 +122,87 @@ define <64 x ptr> @AGEP9(ptr %param, <64 x i32> %off) nounwind { ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: andl $-32, %esp ; CHECK-NEXT: subl $160, %esp -; CHECK-NEXT: vmovdqa %ymm2, %ymm5 -; CHECK-NEXT: vmovdqa %ymm1, %ymm3 -; CHECK-NEXT: vmovdqa %ymm0, %ymm1 -; CHECK-NEXT: vmovdqa 72(%ebp), %ymm0 -; CHECK-NEXT: vmovdqa 40(%ebp), %ymm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm4 -; CHECK-NEXT: vbroadcastss 12(%ebp), %xmm7 -; CHECK-NEXT: vpaddd %xmm4, %xmm7, %xmm4 -; CHECK-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2 -; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2 -; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm3 +; CHECK-NEXT: vbroadcastss 12(%ebp), %xmm5 +; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 +; CHECK-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa 104(%ebp), %ymm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2 -; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa 136(%ebp), %ymm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2 -; CHECK-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm2, %xmm2, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; 
CHECK-NEXT: vextractf128 $1, %ymm2, %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 ; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovdqa 168(%ebp), %ymm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm2 -; CHECK-NEXT: vpaddd %xmm2, %xmm7, %xmm2 -; CHECK-NEXT: vmovdqa %xmm2, (%esp) # 16-byte Spill -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovdqa 40(%ebp), %xmm0 ; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm2 -; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm0 -; CHECK-NEXT: vpaddd %xmm0, %xmm7, %xmm0 -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm1, %xmm7, %xmm1 -; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm6 -; CHECK-NEXT: vpaddd %xmm6, %xmm7, %xmm6 -; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm3 -; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpaddd %xmm3, %xmm7, %xmm3 -; CHECK-NEXT: vmovdqa %ymm5, %ymm4 -; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm5 -; CHECK-NEXT: vpaddd %xmm5, %xmm7, %xmm5 -; CHECK-NEXT: vextractf128 $1, %ymm4, %xmm4 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa 56(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovdqa 72(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa %xmm0, (%esp) # 16-byte Spill +; CHECK-NEXT: vmovdqa 88(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm2 +; CHECK-NEXT: vmovdqa 104(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm1 +; CHECK-NEXT: vmovdqa 120(%ebp), %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm5, %xmm0 +; CHECK-NEXT: vmovdqa 136(%ebp), %xmm6 +; CHECK-NEXT: vpaddd %xmm6, %xmm6, %xmm6 +; CHECK-NEXT: vpaddd %xmm6, %xmm5, %xmm6 +; CHECK-NEXT: vmovdqa 152(%ebp), %xmm7 +; CHECK-NEXT: vpaddd %xmm7, %xmm7, %xmm7 +; CHECK-NEXT: vpaddd %xmm7, %xmm5, %xmm7 +; CHECK-NEXT: vmovdqa 168(%ebp), %xmm4 ; CHECK-NEXT: vpaddd %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vpaddd %xmm4, %xmm7, %xmm4 +; CHECK-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; CHECK-NEXT: vmovdqa 184(%ebp), %xmm3 +; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpaddd %xmm3, %xmm5, %xmm3 ; CHECK-NEXT: movl 8(%ebp), %eax -; CHECK-NEXT: vmovdqa %xmm4, 80(%eax) -; CHECK-NEXT: vmovdqa %xmm5, 64(%eax) -; CHECK-NEXT: vmovdqa %xmm3, 48(%eax) -; CHECK-NEXT: vmovdqa %xmm6, 32(%eax) -; CHECK-NEXT: vmovdqa %xmm1, 16(%eax) -; CHECK-NEXT: vmovdqa %xmm0, (%eax) -; CHECK-NEXT: vmovdqa %xmm2, 240(%eax) +; CHECK-NEXT: vmovdqa %xmm3, 240(%eax) +; CHECK-NEXT: vmovdqa %xmm4, 224(%eax) +; CHECK-NEXT: vmovdqa %xmm7, 208(%eax) +; CHECK-NEXT: vmovdqa %xmm6, 192(%eax) +; CHECK-NEXT: vmovdqa %xmm0, 176(%eax) +; CHECK-NEXT: vmovdqa %xmm1, 160(%eax) +; CHECK-NEXT: vmovdqa %xmm2, 144(%eax) ; CHECK-NEXT: vmovaps (%esp), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 224(%eax) +; CHECK-NEXT: vmovaps %xmm0, 128(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 208(%eax) +; CHECK-NEXT: vmovaps %xmm0, 112(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 192(%eax) +; 
CHECK-NEXT: vmovaps %xmm0, 96(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 176(%eax) +; CHECK-NEXT: vmovaps %xmm0, 80(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 160(%eax) +; CHECK-NEXT: vmovaps %xmm0, 64(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 144(%eax) +; CHECK-NEXT: vmovaps %xmm0, 48(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 128(%eax) +; CHECK-NEXT: vmovaps %xmm0, 32(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 112(%eax) +; CHECK-NEXT: vmovaps %xmm0, 16(%eax) ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovaps %xmm0, 96(%eax) +; CHECK-NEXT: vmovaps %xmm0, (%eax) ; CHECK-NEXT: movl %ebp, %esp ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll index 13f7d68ccb893..33d80f63dbcc8 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -652,7 +652,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: paddb %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psllw $1, %xmm2 +; SSE2-NEXT: paddw %xmm2, %xmm2 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE2-NEXT: psrlw $2, %xmm1 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -678,7 +678,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE41-NEXT: paddb %xmm3, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psllw $1, %xmm2 +; SSE41-NEXT: paddw %xmm2, %xmm2 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: psrlw $2, %xmm1 ; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -701,7 +701,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsllw $1, %xmm1, %xmm2 +; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm1 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -720,7 +720,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX2NOBW-NEXT: vpsllw $1, %xmm1, %xmm2 +; AVX2NOBW-NEXT: vpaddw %xmm1, %xmm1, %xmm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -739,7 +739,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpsllw $1, %xmm1, %xmm2 +; AVX512BW-NEXT: vpaddw %xmm1, %xmm1, %xmm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 diff --git 
a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll index 1a5c3730c1839..e43108fe7d784 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -590,7 +590,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX1-NEXT: vpand %xmm6, %xmm5, %xmm5 ; AVX1-NEXT: vpaddb %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpsllw $1, %xmm3, %xmm5 +; AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm5 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm7 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 ; AVX1-NEXT: vpsrlw $2, %xmm3, %xmm3 @@ -609,7 +609,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3 ; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllw $1, %xmm2, %xmm3 +; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 ; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2 ; AVX1-NEXT: vpand %xmm2, %xmm8, %xmm2 @@ -633,7 +633,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX2NOBW-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX2NOBW-NEXT: vpsllw $1, %ymm1, %ymm2 +; AVX2NOBW-NEXT: vpaddw %ymm1, %ymm1, %ymm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -651,7 +651,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind { ; AVX512BW-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512BW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX512BW-NEXT: vpsllw $1, %ymm1, %ymm2 +; AVX512BW-NEXT: vpaddw %ymm1, %ymm1, %ymm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll index 9c56894f0c59c..bf98bcca59c04 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -485,7 +485,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3 -; AVX512F-NEXT: vpsllw $1, %ymm3, %ymm5 +; AVX512F-NEXT: vpaddw %ymm3, %ymm3, %ymm5 ; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3 @@ -504,7 +504,7 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vpsllw $1, %ymm2, %ymm3 +; AVX512F-NEXT: vpaddw %ymm2, %ymm2, %ymm3 ; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2 ; AVX512F-NEXT: vpand %ymm2, %ymm8, %ymm2 @@ -528,7 +528,7 @@ define <64 x i8> @test_rem7_64i8(<64 x 
i8> %a) nounwind { ; AVX512BW-NEXT: vpsrlw $1, %zmm2, %zmm2 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: vpsllw $1, %zmm1, %zmm2 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm2 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll index 13b21a747878b..6e1bf25908302 100644 --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -821,10 +821,10 @@ define <16 x i16> @madd_v16i16_3(<16 x i16> %a0, <16 x i16> %a1) nounwind { ; X86-SSE-NEXT: andl $-16, %esp ; X86-SSE-NEXT: subl $16, %esp ; X86-SSE-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE-NEXT: paddw %xmm1, %xmm3 +; X86-SSE-NEXT: paddw %xmm3, %xmm3 ; X86-SSE-NEXT: paddw %xmm3, %xmm1 ; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: paddw %xmm0, %xmm3 +; X86-SSE-NEXT: paddw %xmm3, %xmm3 ; X86-SSE-NEXT: paddw %xmm2, %xmm0 ; X86-SSE-NEXT: paddw %xmm3, %xmm0 ; X86-SSE-NEXT: paddw 8(%ebp), %xmm1 @@ -835,9 +835,9 @@ define <16 x i16> @madd_v16i16_3(<16 x i16> %a0, <16 x i16> %a1) nounwind { ; X64-SSE-LABEL: madd_v16i16_3: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movdqa %xmm1, %xmm4 -; X64-SSE-NEXT: paddw %xmm1, %xmm4 +; X64-SSE-NEXT: paddw %xmm4, %xmm4 ; X64-SSE-NEXT: movdqa %xmm0, %xmm5 -; X64-SSE-NEXT: paddw %xmm0, %xmm5 +; X64-SSE-NEXT: paddw %xmm5, %xmm5 ; X64-SSE-NEXT: paddw %xmm2, %xmm0 ; X64-SSE-NEXT: paddw %xmm5, %xmm0 ; X64-SSE-NEXT: paddw %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index 227e000c6be7f..ab1feba98b008 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -907,7 +907,7 @@ define i1 @mask_v8i32_2(<8 x i32> %a0) { ; SSE2-LABEL: mask_v8i32_2: ; SSE2: # %bb.0: ; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pslld $1, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm0 ; SSE2-NEXT: movmskps %xmm0, %eax ; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: sete %al diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index 2b1cf5b671e53..99dac74d8127b 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -927,7 +927,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; SSE2-LABEL: constant_shift_v2i64: ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: paddq %xmm1, %xmm1 ; SSE2-NEXT: psllq $7, %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; SSE2-NEXT: retq @@ -975,7 +975,7 @@ define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind { ; X86-SSE-LABEL: constant_shift_v2i64: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE-NEXT: paddq %xmm0, %xmm1 +; X86-SSE-NEXT: paddq %xmm1, %xmm1 ; X86-SSE-NEXT: psllq $7, %xmm0 ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X86-SSE-NEXT: retl diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index 4378ee604459e..89cc7a638fa01 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1051,28 +1051,11 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; PR159670 define <16 x i8> 
@shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31(<16 x i8> %a, <16 x i8> %b) { -; SSE2-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE41-NEXT: retq +; SSE-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_00_24_01_25_02_26_03_27_04_28_05_29_06_30_07_31: ; AVX: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 2df013d0ff3e3..3279a50a1265b 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -370,6 +370,16 @@ define <8 x float> @constant_fold_vpermilvar_ps_256() { ret <8 x float> %1 } +define <8 x float> @freeze_vpermilvar_ps_256(<8 x float> %a0) { +; CHECK-LABEL: freeze_vpermilvar_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: ret{{[l|q]}} + %s0 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> ) + %f0 = freeze <8 x float> %s0 + %s1 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %f0, <8 x i32> ) + ret <8 x float> %s1 +} + define void @PR39483() { ; X86-AVX1-LABEL: PR39483: ; X86-AVX1: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll index 298858a8fcc73..56c0b164b63d6 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -933,6 +933,16 @@ entry: ret i32 %tmp6 } +define <8 x float> @freeze_permps(<8 x float> %a0) { +; CHECK-LABEL: freeze_permps: +; CHECK: # %bb.0: +; CHECK-NEXT: ret{{[l|q]}} + %s0 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> ) + %f0 = freeze <8 x float> %s0 + %s1 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %f0, <8 x i32> ) + ret <8 x float> %s1 +} + define <32 x i8> @PR27320(<8 x i32> %a0) { ; CHECK-LABEL: 
PR27320: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll index d3e4906450e43..bec33492bbf1e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll @@ -7,6 +7,7 @@ ; Combine tests involving SSE41 target shuffles (BLEND,INSERTPS,MOVZX) declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) +declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) define <16 x i8> @combine_vpshufb_as_movzx(<16 x i8> %a0) { ; SSE-LABEL: combine_vpshufb_as_movzx: @@ -58,6 +59,25 @@ define <4 x i32> @combine_blend_of_permutes_v4i32(<2 x i64> %a0, <2 x i64> %a1) ret <4 x i32> %r } +define <4 x float> @freeze_insertps(<4 x float> %a0, <4 x float> %a1) { +; SSE-LABEL: freeze_insertps: +; SSE: # %bb.0: +; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; SSE-NEXT: insertps {{.*#+}} xmm1 = xmm0[1],xmm1[1,2,3] +; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: freeze_insertps: +; AVX: # %bb.0: +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[1],xmm1[1,2,3] +; AVX-NEXT: retq + %s0 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 16) + %f0 = freeze <4 x float> %s0 + %s1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a1, <4 x float> %f0, i8 64) + ret <4 x float> %s1 +} + define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; SSE-LABEL: PR50049: ; SSE: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll index bd2710139d584..0e20b1813040a 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -896,6 +896,16 @@ define i32 @mask_z1z3_v16i8(<16 x i8> %a0) { ret i32 %4 } +define <16 x i8> @freeze_pshufb_v16i8(<16 x i8> %a0) { +; CHECK-LABEL: freeze_pshufb_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: retq + %s0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> ) + %f0 = freeze <16 x i8> %s0 + %s1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %f0, <16 x i8> ) + ret <16 x i8> %s1 +} + define i32 @PR22415(double %a0) { ; SSE-LABEL: PR22415: ; SSE: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 5b61de5a3b772..ee9d8a55aeb3e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3550,14 +3550,14 @@ define <8 x i16> @PR141475(i32 %in) { ; SSE-LABEL: PR141475: ; SSE: # %bb.0: ; SSE-NEXT: movd %edi, %xmm0 -; SSE-NEXT: pslld $1, %xmm0 +; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE-NEXT: retq ; ; AVX-LABEL: PR141475: ; AVX: # %bb.0: ; AVX-NEXT: vmovd %edi, %xmm0 -; AVX-NEXT: vpslld $1, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX-NEXT: retq %mul = shl i32 %in, 1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll index b8db14c026bf8..3592ed8a84cb2 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-sse4a.ll @@ -362,11 +362,9 @@ define <8 x i16> @shuf_089uuuuu(<8 x i16> %a0, <8 x i16> %a1) { define <16 x i8> @shuffle_8_18_uuuuuuuuuuuuuu(<16 x i8> %a, <16 x i8> 
%b) { ; AMD10H-LABEL: shuffle_8_18_uuuuuuuuuuuuuu: ; AMD10H: # %bb.0: -; AMD10H-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; AMD10H-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AMD10H-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AMD10H-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; AMD10H-NEXT: packuswb %xmm0, %xmm0 +; AMD10H-NEXT: psrld $16, %xmm1 +; AMD10H-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AMD10H-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AMD10H-NEXT: retq ; ; BTVER1-LABEL: shuffle_8_18_uuuuuuuuuuuuuu: diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index a5d83a86f295e..0806e4960e48a 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -10,7 +10,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=SKX diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll index 54dc107fd0c10..3b93734c24deb 100644 --- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll @@ -1438,26 +1438,26 @@ define <8 x i16> @test_128_i16_x_8_65024_mask_ashr_10(<8 x i16> %a0) { define <8 x i16> @test_128_i16_x_8_127_mask_shl_1(<8 x i16> %a0) { ; X86-SSE2-LABEL: test_128_i16_x_8_127_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: test_128_i16_x_8_127_mask_shl_1: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_127_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddw %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: test_128_i16_x_8_127_mask_shl_1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpaddw %xmm0, 
%xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = shl <8 x i16> %t0, @@ -1656,26 +1656,26 @@ define <8 x i16> @test_128_i16_x_8_2032_mask_shl_6(<8 x i16> %a0) { define <8 x i16> @test_128_i16_x_8_65024_mask_shl_1(<8 x i16> %a0) { ; X86-SSE2-LABEL: test_128_i16_x_8_65024_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddw %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: test_128_i16_x_8_65024_mask_shl_1: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i16_x_8_65024_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddw %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: test_128_i16_x_8_65024_mask_shl_1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: retq %t0 = and <8 x i16> %a0, %t1 = shl <8 x i16> %t0, @@ -2373,40 +2373,40 @@ define <4 x i32> @test_128_i32_x_4_4294836224_mask_ashr_18(<4 x i32> %a0) { define <4 x i32> @test_128_i32_x_4_32767_mask_shl_1(<4 x i32> %a0) { ; X86-SSE2-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddd %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX1-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65534,65534,65534,65534] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddd %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65534,65534,65534,65534] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = shl <4 x i32> %t0, @@ -2675,40 +2675,40 @@ define <4 x i32> @test_128_i32_x_4_8388352_mask_shl_10(<4 x i32> %a0) { define <4 x i32> 
@test_128_i32_x_4_4294836224_mask_shl_1(<4 x i32> %a0) { ; X86-SSE2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddd %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294836224,4294836224,4294836224,4294836224] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294705152,4294705152,4294705152,4294705152] +; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddd %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294836224,4294836224,4294836224,4294836224] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294705152,4294705152,4294705152,4294705152] +; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = shl <4 x i32> %t0, @@ -3325,26 +3325,26 @@ define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_ashr_34(<2 x i64> % define <2 x i64> @test_128_i64_x_2_2147483647_mask_shl_1(<2 x i64> %a0) { ; X86-SSE2-LABEL: test_128_i64_x_2_2147483647_mask_shl_1: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddq %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: test_128_i64_x_2_2147483647_mask_shl_1: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i64_x_2_2147483647_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddq %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: test_128_i64_x_2_2147483647_mask_shl_1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = shl <2 x i64> %t0, @@ -3543,26 +3543,26 @@ define <2 x i64> @test_128_i64_x_2_140737488289792_mask_shl_18(<2 x i64> %a0) { define <2 x i64> @test_128_i64_x_2_18446744065119617024_mask_shl_1(<2 x i64> %a0) { ; X86-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1: 
; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: paddq %xmm0, %xmm0 +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: paddq %xmm0, %xmm0 +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; ; X64-AVX-LABEL: test_128_i64_x_2_18446744065119617024_mask_shl_1: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: retq %t0 = and <2 x i64> %a0, %t1 = shl <2 x i64> %t0, diff --git a/llvm/test/DebugInfo/AArch64/abstract-sp-unit.ll b/llvm/test/DebugInfo/AArch64/abstract-sp-unit.ll new file mode 100644 index 0000000000000..559f20122cc47 --- /dev/null +++ b/llvm/test/DebugInfo/AArch64/abstract-sp-unit.ll @@ -0,0 +1,43 @@ +; RUN: llc --filetype=obj -O0 -o - %s | llvm-dwarfdump --verify - + +; Check that abstract DIE for a subprogram referenced from another compile unit +; is emitted in the correct CU. + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +define void @a() !dbg !10 { + br label %for.b.c.c, !dbg !13 + for.b.c.c: + br label %for.b.c.c +} + +!llvm.dbg.cu = !{!0, !6} +!llvm.module.flags = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_20, file: !1, emissionKind: FullDebug, globals: !2) +!1 = !DIFile(filename: "foo.cpp", directory: "") +!2 = !{!3} +!3 = !DIGlobalVariableExpression(var: !4, expr: !DIExpression()) +!4 = !DIGlobalVariable(type: !5) +!5 = !DICompositeType(tag: DW_TAG_class_type) +!6 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_20, file: !7, emissionKind: FullDebug) +!7 = !DIFile(filename: "bar.cpp", directory: "") +!8 = !{i32 2, !"Debug Info Version", i32 3} +!10 = distinct !DISubprogram(type: !11, unit: !6) +!11 = !DISubroutineType(types: !12) +!12 = !{} +!13 = !DILocation(scope: !14, inlinedAt: !15) +!14 = distinct !DISubprogram(unit: !6) +!15 = !DILocation(scope: !16, inlinedAt: !25) +!16 = distinct !DISubprogram(type: !11, unit: !6, declaration: !17) +!17 = !DISubprogram(scope: !5, type: !11, spFlags: DISPFlagOptimized, templateParams: !18) +!18 = !{!19} +!19 = !DITemplateTypeParameter(type: !20) +!20 = !DICompositeType(tag: DW_TAG_class_type, scope: !21) +!21 = distinct !DISubprogram(unit: !6, retainedNodes: !22) +!22 = !{!23} +!23 = !DILocalVariable(scope: !21, type: !24) +!24 = !DIBasicType() +!25 = !DILocation(scope: !21, inlinedAt: !26) +!26 = !DILocation(scope: !10) diff --git a/llvm/test/DebugInfo/X86/convert-loclist.ll b/llvm/test/DebugInfo/X86/convert-loclist.ll index 720bc46896ced..0fb15d56da703 100644 --- a/llvm/test/DebugInfo/X86/convert-loclist.ll +++ b/llvm/test/DebugInfo/X86/convert-loclist.ll @@ -5,6 +5,13 @@ ; RUN: llc -mtriple=x86_64 -split-dwarf-file=foo.dwo -filetype=asm -dwarf-op-convert=Enable < %s \ ; RUN: | FileCheck --check-prefix=ASM %s +; RUN: llc -mtriple=x86_64-mingw -filetype=obj < %s \ +; RUN: | llvm-dwarfdump -debug-info -debug-loclists - | FileCheck %s +; RUN: 
llc -mtriple=x86_64-mingw -split-dwarf-file=foo.dwo -filetype=obj -dwarf-op-convert=Enable < %s \ +; RUN: | llvm-dwarfdump -debug-info -debug-loclists - | FileCheck --check-prefix=SPLIT --check-prefix=CHECK %s +; RUN: llc -mtriple=x86_64-mingw -split-dwarf-file=foo.dwo -filetype=asm -dwarf-op-convert=Enable < %s \ +; RUN: | FileCheck --check-prefix=ASM %s + ; A bit of a brittle test - this is testing the specific DWO_id. The ; alternative would be to test two files with different DW_OP_convert values & ; ensuring the DWO IDs differ when the DW_OP_convert parameter differs. diff --git a/llvm/test/DebugInfo/X86/ranges_always_default.ll b/llvm/test/DebugInfo/X86/ranges_always_default.ll index 0cb2004a57d9f..0759327f3a741 100644 --- a/llvm/test/DebugInfo/X86/ranges_always_default.ll +++ b/llvm/test/DebugInfo/X86/ranges_always_default.ll @@ -3,11 +3,21 @@ ; RUN: | llvm-dwarfdump -debug-info -debug-addr -debug-rnglists -v - \ ; RUN: | FileCheck --check-prefix=RANGE %s +; RUN: llc -O0 %s -mtriple=x86_64-unknown-win32-gnu -filetype=obj -o - -minimize-addr-in-v5=Default \ +; RUN: -split-dwarf-file=test.dwo \ +; RUN: | llvm-dwarfdump -debug-info -debug-addr -debug-rnglists -v - \ +; RUN: | FileCheck --check-prefix=RANGE %s + ; RUN: llc -O0 %s -mtriple=x86_64-unknown-linux-gnu -filetype=obj -o - -minimize-addr-in-v5=Disabled \ ; RUN: -split-dwarf-file=test.dwo \ ; RUN: | llvm-dwarfdump -debug-info -debug-addr -debug-rnglists -v - \ ; RUN: | FileCheck --check-prefix=NORANGE %s +; RUN: llc -O0 %s -mtriple=x86_64-unknown-win32-gnu -filetype=obj -o - -minimize-addr-in-v5=Disabled \ +; RUN: -split-dwarf-file=test.dwo \ +; RUN: | llvm-dwarfdump -debug-info -debug-addr -debug-rnglists -v - \ +; RUN: | FileCheck --check-prefix=NORANGE %s + ; A simpler example than used in ranges_always.ll, since this doesn't test all ; the nuances of where minimizing ranges are useful. 
This is only testing the ; defaulting behavior - specifically that the "ranges" version of the diff --git a/llvm/test/DebugInfo/X86/split-dwarf-v5-ranges.ll b/llvm/test/DebugInfo/X86/split-dwarf-v5-ranges.ll index 0174efea1e0d9..01b1d8fb65a09 100644 --- a/llvm/test/DebugInfo/X86/split-dwarf-v5-ranges.ll +++ b/llvm/test/DebugInfo/X86/split-dwarf-v5-ranges.ll @@ -1,16 +1,22 @@ ; RUN: llc -split-dwarf-file=foo.dwo -mtriple=x86_64-unknown-linux-gnu -filetype=obj %s -o %t32 ; RUN: llvm-dwarfdump -v -debug-info -debug-rnglists %t32 | \ -; RUN: FileCheck %s --check-prefixes=CHECK,DWARF32 +; RUN: FileCheck %s --check-prefixes=CHECK,DWARF32,CHECK-ELF + +; RUN: llc -split-dwarf-file=foo.dwo -mtriple=x86_64-unknown-win32-gnu -filetype=obj %s -o %t32 +; RUN: llvm-dwarfdump -v -debug-info -debug-rnglists %t32 | \ +; RUN: FileCheck %s --check-prefixes=CHECK,DWARF32,CHECK-COFF ; RUN: llc -dwarf64 -split-dwarf-file=foo.dwo -mtriple=x86_64-unknown-linux-gnu -filetype=obj %s -o %t64 ; RUN: llvm-dwarfdump -v -debug-info -debug-rnglists %t64 | \ -; RUN: FileCheck %s --check-prefixes=CHECK,DWARF64 +; RUN: FileCheck %s --check-prefixes=CHECK,DWARF64,CHECK-ELF ; CHECK: .debug_info contents: ; CHECK: .debug_info.dwo contents: ; CHECK: DW_AT_ranges [DW_FORM_rnglistx] (indexed (0x0) rangelist = 0x[[#%.8x,RNG_OFF:]] -; CHECK: [0x0000000000000001, 0x000000000000000c) ".text" -; CHECK: [0x000000000000000e, 0x0000000000000013) ".text") +; CHECK-ELF: [0x[[#%.16x,BEGIN1:0x01]], 0x[[#%.16x,END1:0x0c]]) ".text" +; CHECK-ELF: [0x[[#%.16x,BEGIN2:0x0e]], 0x[[#%.16x,END2:0x13]]) ".text") +; CHECK-COFF: [0x[[#%.16x,BEGIN1:0x04]], 0x[[#%.16x,END1:0x0f]]) ".text" +; CHECK-COFF: [0x[[#%.16x,BEGIN2:0x11]], 0x[[#%.16x,END2:0x17]]) ".text") ; CHECK: .debug_rnglists.dwo contents: ; DWARF32: 0x00000000: range list header: length = 0x00000015, format = DWARF32, version = 0x0005, addr_size = 0x08, seg_size = 0x00, offset_entry_count = 0x00000001 @@ -21,8 +27,8 @@ ; CHECK: ] ; CHECK: ranges: ; CHECK: 0x[[#RNG_OFF]]: [DW_RLE_base_addressx]: 0x0000000000000000 -; CHECK: 0x[[#RNG_OFF+2]]: [DW_RLE_offset_pair ]: 0x0000000000000001, 0x000000000000000c => [0x0000000000000001, 0x000000000000000c) -; CHECK: 0x[[#RNG_OFF+5]]: [DW_RLE_offset_pair ]: 0x000000000000000e, 0x0000000000000013 => [0x000000000000000e, 0x0000000000000013) +; CHECK: 0x[[#RNG_OFF+2]]: [DW_RLE_offset_pair ]: 0x[[#%.16x,BEGIN1]], 0x[[#%.16x,END1]] => [0x[[#%.16x,BEGIN1]], 0x[[#%.16x,END1]]) +; CHECK: 0x[[#RNG_OFF+5]]: [DW_RLE_offset_pair ]: 0x[[#%.16x,BEGIN2]], 0x[[#%.16x,END2]] => [0x[[#%.16x,BEGIN2]], 0x[[#%.16x,END2]]) ; CHECK: 0x[[#RNG_OFF+8]]: [DW_RLE_end_of_list ] ; Function Attrs: noinline optnone uwtable diff --git a/llvm/test/Instrumentation/AddressSanitizer/RISCV/asan-rvv-intrinsics.ll b/llvm/test/Instrumentation/AddressSanitizer/RISCV/asan-rvv-intrinsics.ll new file mode 100644 index 0000000000000..919f16b103090 --- /dev/null +++ b/llvm/test/Instrumentation/AddressSanitizer/RISCV/asan-rvv-intrinsics.ll @@ -0,0 +1,2521 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=riscv64 -mattr=+v -passes=asan \ +; RUN: -asan-instrumentation-with-call-threshold=0 -S | FileCheck %s +; REQUIRES: riscv-registered-target + +declare @llvm.riscv.vle.nxv1i32( + , + *, + i64) +define @intrinsic_vle_v_nxv1i32_nxv1i32(* align 4 %0, i64 %1) sanitize_address { +; CHECK-LABEL: @intrinsic_vle_v_nxv1i32_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: 
[[TMP3:%.*]] = icmp ne i64 [[TMP1:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP12:%.*]] +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[TMP5]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP4]] ], [ [[IV_NEXT:%.*]], [[TMP11:%.*]] ] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement splat (i1 true), i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP11]] +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr , ptr [[TMP0:%.*]], i64 0, i64 [[IV]] +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP9]] to i64 +; CHECK-NEXT: call void @__asan_load4(i64 [[TMP10]]) +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP6]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP12]] +; CHECK: 12: +; CHECK-NEXT: [[A:%.*]] = call @llvm.riscv.vle.nxv1i32.p0.i64( poison, ptr [[TMP0]], i64 [[TMP1]]) +; CHECK-NEXT: ret [[A]] +; +entry: + %a = call @llvm.riscv.vle.nxv1i32( + poison, + * %0, + i64 %1) + ret %a +} + +declare @llvm.riscv.vle.mask.nxv1i32( + , + *, + , + i64, + i64) +define @intrinsic_vle_mask_v_nxv1i32_nxv1i32( %0, * align 4 %1, %2, i64 %3) sanitize_address { +; CHECK-LABEL: @intrinsic_vle_mask_v_nxv1i32_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP14:%.*]] +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[TMP7]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP6]] ], [ [[IV_NEXT:%.*]], [[TMP13:%.*]] ] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement [[TMP2:%.*]], i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP13]] +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr , ptr [[TMP1:%.*]], i64 0, i64 [[IV]] +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP11]] to i64 +; CHECK-NEXT: call void @__asan_load4(i64 [[TMP12]]) +; CHECK-NEXT: br label [[TMP13]] +; CHECK: 13: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP8]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP14]] +; CHECK: 14: +; CHECK-NEXT: [[A:%.*]] = call @llvm.riscv.vle.mask.nxv1i32.p0.i64( [[TMP0:%.*]], ptr [[TMP1]], [[TMP2]], i64 [[TMP3]], i64 1) +; CHECK-NEXT: ret [[A]] +; +entry: + %a = call @llvm.riscv.vle.mask.nxv1i32( + %0, + * %1, + %2, + i64 %3, i64 1) + ret %a +} + +declare void @llvm.riscv.vse.nxv1i32( + , + *, + i64) +define void @intrinsic_vse_v_nxv1i32_nxv1i32( %0, * align 4 %1, i64 %2) sanitize_address { +; CHECK-LABEL: @intrinsic_vse_v_nxv1i32_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP13:%.*]] +; CHECK: 5: +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = call i64 
@llvm.umin.i64(i64 [[TMP2]], i64 [[TMP6]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP5]] ], [ [[IV_NEXT:%.*]], [[TMP12:%.*]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement splat (i1 true), i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP12]] +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr , ptr [[TMP1:%.*]], i64 0, i64 [[IV]] +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: call void @__asan_store4(i64 [[TMP11]]) +; CHECK-NEXT: br label [[TMP12]] +; CHECK: 12: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP7]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP13]] +; CHECK: 13: +; CHECK-NEXT: call void @llvm.riscv.vse.nxv1i32.p0.i64( [[TMP0:%.*]], ptr [[TMP1]], i64 [[TMP2]]) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.riscv.vse.nxv1i32( + %0, + * %1, + i64 %2) + ret void +} + +declare void @llvm.riscv.vse.mask.nxv1i32( + , + *, + , + i64) +define void @intrinsic_vse_mask_v_nxv1i32_nxv1i32( %0, * align 4 %1, %2, i64 %3) sanitize_address { +; CHECK-LABEL: @intrinsic_vse_mask_v_nxv1i32_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP14:%.*]] +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[TMP7]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP6]] ], [ [[IV_NEXT:%.*]], [[TMP13:%.*]] ] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement [[TMP2:%.*]], i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP13]] +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr , ptr [[TMP1:%.*]], i64 0, i64 [[IV]] +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP11]] to i64 +; CHECK-NEXT: call void @__asan_store4(i64 [[TMP12]]) +; CHECK-NEXT: br label [[TMP13]] +; CHECK: 13: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP8]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP14]] +; CHECK: 14: +; CHECK-NEXT: call void @llvm.riscv.vse.mask.nxv1i32.p0.i64( [[TMP0:%.*]], ptr [[TMP1]], [[TMP2]], i64 [[TMP3]]) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.riscv.vse.mask.nxv1i32( + %0, + * %1, + %2, + i64 %3) + ret void +} + + +declare target("riscv.vector.tuple", , 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2), ptr, i64, i64) +declare target("riscv.vector.tuple", , 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", , 2), ptr, , i64, i64, i64) + +define @test_vlseg2_nxv1i32(ptr %base, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vlseg2_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP24:%.*]] = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t.p0.i64(target("riscv.vector.tuple", , 2) poison, ptr [[BASE:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP25:%.*]] = call 
@llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) [[TMP24]], i32 1) +; CHECK-NEXT: ret [[TMP25]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vlseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) poison, ptr %base, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) %0, i32 1) + ret %1 +} + +define @test_vlseg2_mask_nxv1i32(ptr %base, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vlseg2_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP24:%.*]] = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv1i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[BASE:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) [[TMP24]], i32 1) +; CHECK-NEXT: ret [[TMP25]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vlseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", , 2) poison, ptr %base, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3), ptr, i64, i64) +declare target("riscv.vector.tuple", , 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", , 3), ptr, , i64, i64, i64) + +define @test_vlseg3_nxv1i32(ptr %base, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vlseg3_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP36:%.*]] = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t.p0.i64(target("riscv.vector.tuple", , 3) poison, ptr [[BASE:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP37:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) [[TMP36]], i32 1) +; CHECK-NEXT: ret [[TMP37]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vlseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) poison, ptr %base, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) %0, i32 1) + ret %1 +} + +define @test_vlseg3_mask_nxv1i32(ptr %base, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vlseg3_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP36:%.*]] = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv1i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[BASE:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP37:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) [[TMP36]], i32 1) +; CHECK-NEXT: ret [[TMP37]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vlseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", , 3) poison, ptr %base, %mask, i64 
%vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4), ptr, i64, i64) +declare target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", , 4), ptr, , i64, i64, i64) + +define @test_vlseg4_nxv1i32(ptr %base, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vlseg4_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP48:%.*]] = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t.p0.i64(target("riscv.vector.tuple", , 4) poison, ptr [[BASE:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP49:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) [[TMP48]], i32 1) +; CHECK-NEXT: ret [[TMP49]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) poison, ptr %base, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) %0, i32 1) + ret %1 +} + +define @test_vlseg4_mask_nxv1i32(ptr %base, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vlseg4_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP48:%.*]] = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv1i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[BASE:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP49:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) [[TMP48]], i32 1) +; CHECK-NEXT: ret [[TMP49]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vlseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", , 4) poison, ptr %base, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5), ptr, i64, i64) +declare target("riscv.vector.tuple", , 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", , 5), ptr, , i64, i64, i64) + +define @test_vlseg5_nxv1i32(ptr %base, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vlseg5_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP60:%.*]] = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t.p0.i64(target("riscv.vector.tuple", , 5) poison, ptr [[BASE:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP61:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) [[TMP60]], i32 1) +; CHECK-NEXT: ret [[TMP61]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vlseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) poison, ptr %base, i64 %vl, i64 5) + %1 = call 
@llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) %0, i32 1) + ret %1 +} + +define @test_vlseg5_mask_nxv1i32(ptr %base, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vlseg5_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP60:%.*]] = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv1i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[BASE:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP61:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) [[TMP60]], i32 1) +; CHECK-NEXT: ret [[TMP61]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vlseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", , 5) poison, ptr %base, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6), ptr, i64, i64) +declare target("riscv.vector.tuple", , 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", , 6), ptr, , i64, i64, i64) + +define @test_vlseg6_nxv1i32(ptr %base, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vlseg6_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP72:%.*]] = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t.p0.i64(target("riscv.vector.tuple", , 6) poison, ptr [[BASE:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP73:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) [[TMP72]], i32 1) +; CHECK-NEXT: ret [[TMP73]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vlseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) poison, ptr %base, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) %0, i32 1) + ret %1 +} + +define @test_vlseg6_mask_nxv1i32(ptr %base, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vlseg6_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP72:%.*]] = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv1i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[BASE:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP73:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) [[TMP72]], i32 1) +; CHECK-NEXT: ret [[TMP73]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vlseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", , 6) poison, ptr %base, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7), ptr, i64, i64) +declare target("riscv.vector.tuple", , 
7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", , 7), ptr, , i64, i64, i64) + +define @test_vlseg7_nxv1i32(ptr %base, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vlseg7_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP84:%.*]] = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t.p0.i64(target("riscv.vector.tuple", , 7) poison, ptr [[BASE:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP85:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) [[TMP84]], i32 1) +; CHECK-NEXT: ret [[TMP85]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vlseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) poison, ptr %base, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) %0, i32 1) + ret %1 +} + +define @test_vlseg7_mask_nxv1i32(ptr %base, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vlseg7_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP84:%.*]] = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv1i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[BASE:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP85:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) [[TMP84]], i32 1) +; CHECK-NEXT: ret [[TMP85]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vlseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", , 7) poison, ptr %base, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8), ptr, i64, i64) +declare target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", , 8), ptr, , i64, i64, i64) + +define @test_vlseg8_nxv1i32(ptr %base, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vlseg8_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP96:%.*]] = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t.p0.i64(target("riscv.vector.tuple", , 8) poison, ptr [[BASE:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP97:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) [[TMP96]], i32 1) +; CHECK-NEXT: ret [[TMP97]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) poison, ptr %base, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) %0, i32 1) + ret %1 +} + +define @test_vlseg8_mask_nxv1i32(ptr %base, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vlseg8_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: 
[[TMP96:%.*]] = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv1i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[BASE:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP97:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) [[TMP96]], i32 1) +; CHECK-NEXT: ret [[TMP97]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vlseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", , 8) poison, ptr %base, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) %0, i32 1) + ret %1 +} + + +declare void @llvm.riscv.vsseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2), ptr, i64, i64) +declare void @llvm.riscv.vsseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", , 2), ptr, , i64, i64) + +define void @test_vsseg2_nxv1i32(target("riscv.vector.tuple", , 2) %val, ptr %base, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsseg2_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsseg2.triscv.vector.tuple_nxv4i8_2t.p0.i64(target("riscv.vector.tuple", , 2) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) %val, ptr %base, i64 %vl, i64 5) + ret void +} + +define void @test_vsseg2_mask_nxv1i32(target("riscv.vector.tuple", , 2) %val, ptr %base, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsseg2_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VAL:%.*]], ptr [[BASE:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", , 2) %val, ptr %base, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3), ptr, i64, i64) +declare void @llvm.riscv.vsseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", , 3), ptr, , i64, i64) + +define void @test_vsseg3_nxv1i32(target("riscv.vector.tuple", , 3) %val, ptr %base, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsseg3_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsseg3.triscv.vector.tuple_nxv4i8_3t.p0.i64(target("riscv.vector.tuple", , 3) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) %val, ptr %base, i64 %vl, i64 5) + ret void +} + +define void @test_vsseg3_mask_nxv1i32(target("riscv.vector.tuple", , 3) %val, ptr %base, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsseg3_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void 
@llvm.riscv.vsseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VAL:%.*]], ptr [[BASE:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", , 3) %val, ptr %base, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4), ptr, i64, i64) +declare void @llvm.riscv.vsseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", , 4), ptr, , i64, i64) + +define void @test_vsseg4_nxv1i32(target("riscv.vector.tuple", , 4) %val, ptr %base, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsseg4_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv4i8_4t.p0.i64(target("riscv.vector.tuple", , 4) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) %val, ptr %base, i64 %vl, i64 5) + ret void +} + +define void @test_vsseg4_mask_nxv1i32(target("riscv.vector.tuple", , 4) %val, ptr %base, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsseg4_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VAL:%.*]], ptr [[BASE:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", , 4) %val, ptr %base, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5), ptr, i64, i64) +declare void @llvm.riscv.vsseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", , 5), ptr, , i64, i64) + +define void @test_vsseg5_nxv1i32(target("riscv.vector.tuple", , 5) %val, ptr %base, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsseg5_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsseg5.triscv.vector.tuple_nxv4i8_5t.p0.i64(target("riscv.vector.tuple", , 5) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) %val, ptr %base, i64 %vl, i64 5) + ret void +} + +define void @test_vsseg5_mask_nxv1i32(target("riscv.vector.tuple", , 5) %val, ptr %base, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsseg5_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VAL:%.*]], ptr [[BASE:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", , 5) %val, ptr %base, %mask, i64 %vl, i64 5) + ret void +} + + +declare void 
@llvm.riscv.vsseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6), ptr, i64, i64) +declare void @llvm.riscv.vsseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", , 6), ptr, , i64, i64) + +define void @test_vsseg6_nxv1i32(target("riscv.vector.tuple", , 6) %val, ptr %base, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsseg6_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsseg6.triscv.vector.tuple_nxv4i8_6t.p0.i64(target("riscv.vector.tuple", , 6) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) %val, ptr %base, i64 %vl, i64 5) + ret void +} + +define void @test_vsseg6_mask_nxv1i32(target("riscv.vector.tuple", , 6) %val, ptr %base, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsseg6_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VAL:%.*]], ptr [[BASE:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", , 6) %val, ptr %base, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7), ptr, i64, i64) +declare void @llvm.riscv.vsseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", , 7), ptr, , i64, i64) + +define void @test_vsseg7_nxv1i32(target("riscv.vector.tuple", , 7) %val, ptr %base, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsseg7_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsseg7.triscv.vector.tuple_nxv4i8_7t.p0.i64(target("riscv.vector.tuple", , 7) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) %val, ptr %base, i64 %vl, i64 5) + ret void +} + +define void @test_vsseg7_mask_nxv1i32(target("riscv.vector.tuple", , 7) %val, ptr %base, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsseg7_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VAL:%.*]], ptr [[BASE:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", , 7) %val, ptr %base, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8), ptr, i64, i64) +declare void @llvm.riscv.vsseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", , 8), ptr, , i64, i64) + +define void @test_vsseg8_nxv1i32(target("riscv.vector.tuple", , 8) %val, ptr %base, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsseg8_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr 
@__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv4i8_8t.p0.i64(target("riscv.vector.tuple", , 8) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) %val, ptr %base, i64 %vl, i64 5) + ret void +} + +define void @test_vsseg8_mask_nxv1i32(target("riscv.vector.tuple", , 8) %val, ptr %base, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsseg8_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VAL:%.*]], ptr [[BASE:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", , 8) %val, ptr %base, %mask, i64 %vl, i64 5) + ret void +} + + +; Test stride load +declare @llvm.riscv.vlse.nxv1i32( + , + *, + i64, + i64); + +define @intrinsic_vlse_v_nxv1i32_nxv1i32(* align 4 %0, i64 %1, i64 %2) sanitize_address { +; CHECK-LABEL: @intrinsic_vlse_v_nxv1i32_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP14:%.*]] +; CHECK: 5: +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP2]], i64 [[TMP6]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP5]] ], [ [[IV_NEXT:%.*]], [[TMP13:%.*]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement splat (i1 true), i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP13]] +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[IV]], [[TMP1:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP11]] to i64 +; CHECK-NEXT: call void @__asan_loadN(i64 [[TMP12]], i64 4) +; CHECK-NEXT: br label [[TMP13]] +; CHECK: 13: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP7]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP14]] +; CHECK: 14: +; CHECK-NEXT: [[A:%.*]] = call @llvm.riscv.vlse.nxv1i32.p0.i64( poison, ptr [[TMP0]], i64 [[TMP1]], i64 [[TMP2]]) +; CHECK-NEXT: ret [[A]] +; +entry: + %a = call @llvm.riscv.vlse.nxv1i32( + poison, + * %0, + i64 %1, + i64 %2) + + ret %a +} + +declare @llvm.riscv.vlse.mask.nxv1i32( + , + *, + i64, + , + i64, + i64); + +define @intrinsic_vlse_mask_v_nxv1i32_nxv1i32( %0, * %1, i64 %2, %3, i64 %4) sanitize_address { +; CHECK-LABEL: @intrinsic_vlse_mask_v_nxv1i32_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP4:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP16:%.*]] +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[TMP8]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = 
phi i64 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[TMP15:%.*]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement [[TMP3:%.*]], i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP15]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[IV]], [[TMP2:%.*]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP1:%.*]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; CHECK-NEXT: call void @__asan_loadN(i64 [[TMP14]], i64 4) +; CHECK-NEXT: br label [[TMP15]] +; CHECK: 15: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP9]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP16]] +; CHECK: 16: +; CHECK-NEXT: [[A:%.*]] = call @llvm.riscv.vlse.mask.nxv1i32.p0.i64( [[TMP0:%.*]], ptr [[TMP1]], i64 [[TMP2]], [[TMP3]], i64 [[TMP4]], i64 1) +; CHECK-NEXT: ret [[A]] +; +entry: + %a = call @llvm.riscv.vlse.mask.nxv1i32( + %0, + * %1, + i64 %2, + %3, + i64 %4, i64 1) + + ret %a +} + +; Test stride store +declare void @llvm.riscv.vsse.nxv1i32( + , + *, + i64, + i64); + +define void @intrinsic_vsse_v_nxv1i32_nxv1i32( %0, * %1, i64 %2, i64 %3) sanitize_address { +; CHECK-LABEL: @intrinsic_vsse_v_nxv1i32_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP3:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[TMP6:%.*]], label [[TMP15:%.*]] +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[TMP7]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP6]] ], [ [[IV_NEXT:%.*]], [[TMP14:%.*]] ] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement splat (i1 true), i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP14]] +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[IV]], [[TMP2:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP1:%.*]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[TMP12]] to i64 +; CHECK-NEXT: call void @__asan_storeN(i64 [[TMP13]], i64 4) +; CHECK-NEXT: br label [[TMP14]] +; CHECK: 14: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP8]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP15]] +; CHECK: 15: +; CHECK-NEXT: call void @llvm.riscv.vsse.nxv1i32.p0.i64( [[TMP0:%.*]], ptr [[TMP1]], i64 [[TMP2]], i64 [[TMP3]]) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.riscv.vsse.nxv1i32( + %0, + * %1, + i64 %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsse.mask.nxv1i32( + , + *, + i64, + , + i64); + +define void @intrinsic_vsse_mask_v_nxv1i32_nxv1i32( %0, * %1, i64 %2, %3, i64 %4) sanitize_address { +; CHECK-LABEL: @intrinsic_vsse_mask_v_nxv1i32_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP4:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP16:%.*]] +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[TMP8]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: 
[[IV:%.*]] = phi i64 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[TMP15:%.*]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement [[TMP3:%.*]], i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP15]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[IV]], [[TMP2:%.*]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP1:%.*]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; CHECK-NEXT: call void @__asan_storeN(i64 [[TMP14]], i64 4) +; CHECK-NEXT: br label [[TMP15]] +; CHECK: 15: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP9]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP16]] +; CHECK: 16: +; CHECK-NEXT: call void @llvm.riscv.vsse.mask.nxv1i32.p0.i64( [[TMP0:%.*]], ptr [[TMP1]], i64 [[TMP2]], [[TMP3]], i64 [[TMP4]]) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.riscv.vsse.mask.nxv1i32( + %0, + * %1, + i64 %2, + %3, + i64 %4) + + ret void +} + + +declare target("riscv.vector.tuple", , 2) @llvm.riscv.vlsseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2), ptr, i64, i64, i64) +declare target("riscv.vector.tuple", , 2) @llvm.riscv.vlsseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", , 2), ptr, i64, , i64, i64, i64) + +define @test_vlsseg2_nxv1i32(ptr %base, i64 %offset, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vlsseg2_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP24:%.*]] = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vlsseg2.triscv.vector.tuple_nxv4i8_2t.p0.i64(target("riscv.vector.tuple", , 2) poison, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) [[TMP24]], i32 1) +; CHECK-NEXT: ret [[TMP25]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vlsseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) poison, ptr %base, i64 %offset, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) %0, i32 1) + ret %1 +} + +define @test_vlsseg2_mask_nxv1i32(ptr %base, i64 %offset, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vlsseg2_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP24:%.*]] = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vlsseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.i64.nxv1i1(target("riscv.vector.tuple", , 2) poison, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) [[TMP24]], i32 1) +; CHECK-NEXT: ret [[TMP25]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vlsseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", , 2) poison, ptr %base, i64 %offset, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 3) 
@llvm.riscv.vlsseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3), ptr, i64, i64, i64) +declare target("riscv.vector.tuple", , 3) @llvm.riscv.vlsseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", , 3), ptr, i64, , i64, i64, i64) + +define @test_vlsseg3_nxv1i32(ptr %base, i64 %offset, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vlsseg3_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP36:%.*]] = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vlsseg3.triscv.vector.tuple_nxv4i8_3t.p0.i64(target("riscv.vector.tuple", , 3) poison, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP37:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) [[TMP36]], i32 1) +; CHECK-NEXT: ret [[TMP37]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vlsseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) poison, ptr %base, i64 %offset, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) %0, i32 1) + ret %1 +} + +define @test_vlsseg3_mask_nxv1i32(ptr %base, i64 %offset, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vlsseg3_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP36:%.*]] = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vlsseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.i64.nxv1i1(target("riscv.vector.tuple", , 3) poison, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP37:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) [[TMP36]], i32 1) +; CHECK-NEXT: ret [[TMP37]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vlsseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", , 3) poison, ptr %base, i64 %offset, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 4) @llvm.riscv.vlsseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4), ptr, i64, i64, i64) +declare target("riscv.vector.tuple", , 4) @llvm.riscv.vlsseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", , 4), ptr, i64, , i64, i64, i64) + +define @test_vlsseg4_nxv1i32(ptr %base, i64 %offset, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vlsseg4_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP48:%.*]] = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vlsseg4.triscv.vector.tuple_nxv4i8_4t.p0.i64(target("riscv.vector.tuple", , 4) poison, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP49:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) [[TMP48]], i32 1) +; CHECK-NEXT: ret [[TMP49]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vlsseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) poison, ptr %base, i64 %offset, i64 %vl, i64 5) + %1 = call 
@llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) %0, i32 1) + ret %1 +} + +define @test_vlsseg4_mask_nxv1i32(ptr %base, i64 %offset, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vlsseg4_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP48:%.*]] = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vlsseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.i64.nxv1i1(target("riscv.vector.tuple", , 4) poison, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP49:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) [[TMP48]], i32 1) +; CHECK-NEXT: ret [[TMP49]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vlsseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", , 4) poison, ptr %base, i64 %offset, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 5) @llvm.riscv.vlsseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5), ptr, i64, i64, i64) +declare target("riscv.vector.tuple", , 5) @llvm.riscv.vlsseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", , 5), ptr, i64, , i64, i64, i64) + +define @test_vlsseg5_nxv1i32(ptr %base, i64 %offset, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vlsseg5_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP60:%.*]] = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vlsseg5.triscv.vector.tuple_nxv4i8_5t.p0.i64(target("riscv.vector.tuple", , 5) poison, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP61:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) [[TMP60]], i32 1) +; CHECK-NEXT: ret [[TMP61]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vlsseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) poison, ptr %base, i64 %offset, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) %0, i32 1) + ret %1 +} + +define @test_vlsseg5_mask_nxv1i32(ptr %base, i64 %offset, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vlsseg5_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP60:%.*]] = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vlsseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.i64.nxv1i1(target("riscv.vector.tuple", , 5) poison, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP61:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) [[TMP60]], i32 1) +; CHECK-NEXT: ret [[TMP61]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vlsseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", , 5) poison, ptr %base, i64 %offset, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) %0, i32 1) + ret %1 +} + + +declare 
target("riscv.vector.tuple", , 6) @llvm.riscv.vlsseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6), ptr, i64, i64, i64) +declare target("riscv.vector.tuple", , 6) @llvm.riscv.vlsseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", , 6), ptr, i64, , i64, i64, i64) + +define @test_vlsseg6_nxv1i32(ptr %base, i64 %offset, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vlsseg6_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP72:%.*]] = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vlsseg6.triscv.vector.tuple_nxv4i8_6t.p0.i64(target("riscv.vector.tuple", , 6) poison, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP73:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) [[TMP72]], i32 1) +; CHECK-NEXT: ret [[TMP73]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vlsseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) poison, ptr %base, i64 %offset, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) %0, i32 1) + ret %1 +} + +define @test_vlsseg6_mask_nxv1i32(ptr %base, i64 %offset, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vlsseg6_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP72:%.*]] = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vlsseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.i64.nxv1i1(target("riscv.vector.tuple", , 6) poison, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP73:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) [[TMP72]], i32 1) +; CHECK-NEXT: ret [[TMP73]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vlsseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", , 6) poison, ptr %base, i64 %offset, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 7) @llvm.riscv.vlsseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7), ptr, i64, i64, i64) +declare target("riscv.vector.tuple", , 7) @llvm.riscv.vlsseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", , 7), ptr, i64, , i64, i64, i64) + +define @test_vlsseg7_nxv1i32(ptr %base, i64 %offset, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vlsseg7_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP84:%.*]] = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vlsseg7.triscv.vector.tuple_nxv4i8_7t.p0.i64(target("riscv.vector.tuple", , 7) poison, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP85:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) [[TMP84]], i32 1) +; CHECK-NEXT: ret [[TMP85]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vlsseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) poison, ptr %base, i64 %offset, i64 %vl, i64 5) + %1 = call 
@llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) %0, i32 1) + ret %1 +} + +define @test_vlsseg7_mask_nxv1i32(ptr %base, i64 %offset, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vlsseg7_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP84:%.*]] = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vlsseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.i64.nxv1i1(target("riscv.vector.tuple", , 7) poison, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP85:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) [[TMP84]], i32 1) +; CHECK-NEXT: ret [[TMP85]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vlsseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", , 7) poison, ptr %base, i64 %offset, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 8) @llvm.riscv.vlsseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8), ptr, i64, i64, i64) +declare target("riscv.vector.tuple", , 8) @llvm.riscv.vlsseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", , 8), ptr, i64, , i64, i64, i64) + +define @test_vlsseg8_nxv1i32(ptr %base, i64 %offset, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vlsseg8_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP96:%.*]] = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vlsseg8.triscv.vector.tuple_nxv4i8_8t.p0.i64(target("riscv.vector.tuple", , 8) poison, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP97:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) [[TMP96]], i32 1) +; CHECK-NEXT: ret [[TMP97]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vlsseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) poison, ptr %base, i64 %offset, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) %0, i32 1) + ret %1 +} + +define @test_vlsseg8_mask_nxv1i32(ptr %base, i64 %offset, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vlsseg8_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP96:%.*]] = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vlsseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.i64.nxv1i1(target("riscv.vector.tuple", , 8) poison, ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP97:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) [[TMP96]], i32 1) +; CHECK-NEXT: ret [[TMP97]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vlsseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", , 8) poison, ptr %base, i64 %offset, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) %0, i32 1) + ret %1 +} + + +declare void 
@llvm.riscv.vssseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2), ptr, i64, i64, i64) +declare void @llvm.riscv.vssseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", , 2), ptr, i64, , i64, i64) + +define void @test_vssseg2_nxv1i32(target("riscv.vector.tuple", , 2) %val, ptr %base, i64 %offset, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vssseg2_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vssseg2.triscv.vector.tuple_nxv4i8_2t.p0.i64(target("riscv.vector.tuple", , 2) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vssseg2.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) %val, ptr %base, i64 %offset, i64 %vl, i64 5) + ret void +} + +define void @test_vssseg2_mask_nxv1i32(target("riscv.vector.tuple", , 2) %val, ptr %base, i64 %offset, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vssseg2_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vssseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.i64.nxv1i1(target("riscv.vector.tuple", , 2) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vssseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i1(target("riscv.vector.tuple", , 2) %val, ptr %base, i64 %offset, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vssseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3), ptr, i64, i64, i64) +declare void @llvm.riscv.vssseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", , 3), ptr, i64, , i64, i64) + +define void @test_vssseg3_nxv1i32(target("riscv.vector.tuple", , 3) %val, ptr %base, i64 %offset, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vssseg3_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vssseg3.triscv.vector.tuple_nxv4i8_3t.p0.i64(target("riscv.vector.tuple", , 3) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vssseg3.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) %val, ptr %base, i64 %offset, i64 %vl, i64 5) + ret void +} + +define void @test_vssseg3_mask_nxv1i32(target("riscv.vector.tuple", , 3) %val, ptr %base, i64 %offset, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vssseg3_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vssseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.i64.nxv1i1(target("riscv.vector.tuple", , 3) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vssseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i1(target("riscv.vector.tuple", , 3) %val, ptr %base, i64 %offset, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vssseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4), ptr, i64, i64, i64) +declare void @llvm.riscv.vssseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", , 4), ptr, i64, , 
i64, i64) + +define void @test_vssseg4_nxv1i32(target("riscv.vector.tuple", , 4) %val, ptr %base, i64 %offset, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vssseg4_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vssseg4.triscv.vector.tuple_nxv4i8_4t.p0.i64(target("riscv.vector.tuple", , 4) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vssseg4.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) %val, ptr %base, i64 %offset, i64 %vl, i64 5) + ret void +} + +define void @test_vssseg4_mask_nxv1i32(target("riscv.vector.tuple", , 4) %val, ptr %base, i64 %offset, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vssseg4_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vssseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.i64.nxv1i1(target("riscv.vector.tuple", , 4) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vssseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i1(target("riscv.vector.tuple", , 4) %val, ptr %base, i64 %offset, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vssseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5), ptr, i64, i64, i64) +declare void @llvm.riscv.vssseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", , 5), ptr, i64, , i64, i64) + +define void @test_vssseg5_nxv1i32(target("riscv.vector.tuple", , 5) %val, ptr %base, i64 %offset, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vssseg5_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vssseg5.triscv.vector.tuple_nxv4i8_5t.p0.i64(target("riscv.vector.tuple", , 5) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vssseg5.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) %val, ptr %base, i64 %offset, i64 %vl, i64 5) + ret void +} + +define void @test_vssseg5_mask_nxv1i32(target("riscv.vector.tuple", , 5) %val, ptr %base, i64 %offset, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vssseg5_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vssseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.i64.nxv1i1(target("riscv.vector.tuple", , 5) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vssseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i1(target("riscv.vector.tuple", , 5) %val, ptr %base, i64 %offset, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vssseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6), ptr, i64, i64, i64) +declare void @llvm.riscv.vssseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", , 6), ptr, i64, , i64, i64) + +define void @test_vssseg6_nxv1i32(target("riscv.vector.tuple", , 6) %val, ptr %base, i64 %offset, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vssseg6_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = 
load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vssseg6.triscv.vector.tuple_nxv4i8_6t.p0.i64(target("riscv.vector.tuple", , 6) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vssseg6.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) %val, ptr %base, i64 %offset, i64 %vl, i64 5) + ret void +} + +define void @test_vssseg6_mask_nxv1i32(target("riscv.vector.tuple", , 6) %val, ptr %base, i64 %offset, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vssseg6_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vssseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.i64.nxv1i1(target("riscv.vector.tuple", , 6) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vssseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i1(target("riscv.vector.tuple", , 6) %val, ptr %base, i64 %offset, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vssseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7), ptr, i64, i64, i64) +declare void @llvm.riscv.vssseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", , 7), ptr, i64, , i64, i64) + +define void @test_vssseg7_nxv1i32(target("riscv.vector.tuple", , 7) %val, ptr %base, i64 %offset, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vssseg7_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vssseg7.triscv.vector.tuple_nxv4i8_7t.p0.i64(target("riscv.vector.tuple", , 7) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vssseg7.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) %val, ptr %base, i64 %offset, i64 %vl, i64 5) + ret void +} + +define void @test_vssseg7_mask_nxv1i32(target("riscv.vector.tuple", , 7) %val, ptr %base, i64 %offset, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vssseg7_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vssseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.i64.nxv1i1(target("riscv.vector.tuple", , 7) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vssseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i1(target("riscv.vector.tuple", , 7) %val, ptr %base, i64 %offset, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vssseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8), ptr, i64, i64, i64) +declare void @llvm.riscv.vssseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", , 8), ptr, i64, , i64, i64) + +define void @test_vssseg8_nxv1i32(target("riscv.vector.tuple", , 8) %val, ptr %base, i64 %offset, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vssseg8_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vssseg8.triscv.vector.tuple_nxv4i8_8t.p0.i64(target("riscv.vector.tuple", , 8) [[VAL:%.*]], ptr [[BASE:%.*]], i64 
[[OFFSET:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vssseg8.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) %val, ptr %base, i64 %offset, i64 %vl, i64 5) + ret void +} + +define void @test_vssseg8_mask_nxv1i32(target("riscv.vector.tuple", , 8) %val, ptr %base, i64 %offset, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vssseg8_mask_nxv1i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vssseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.i64.nxv1i1(target("riscv.vector.tuple", , 8) [[VAL:%.*]], ptr [[BASE:%.*]], i64 [[OFFSET:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vssseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i1(target("riscv.vector.tuple", , 8) %val, ptr %base, i64 %offset, %mask, i64 %vl, i64 5) + ret void +} + + +; Test stride value is a multiple of pointer alignment. +define @intrinsic_vlse_v_nxv1i32_nxv1i32_align(* align 4 %0, i64 %1, i64 %2) sanitize_address { +; CHECK-LABEL: @intrinsic_vlse_v_nxv1i32_nxv1i32_align( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[TMP2:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP14:%.*]] +; CHECK: 5: +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP2]], i64 [[TMP6]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP5]] ], [ [[IV_NEXT:%.*]], [[TMP13:%.*]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement splat (i1 true), i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP13]] +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[IV]], 4 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP11]] to i64 +; CHECK-NEXT: call void @__asan_load4(i64 [[TMP12]]) +; CHECK-NEXT: br label [[TMP13]] +; CHECK: 13: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP7]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP14]] +; CHECK: 14: +; CHECK-NEXT: [[A:%.*]] = call @llvm.riscv.vlse.nxv1i32.p0.i64( poison, ptr [[TMP0]], i64 4, i64 [[TMP2]]) +; CHECK-NEXT: ret [[A]] +; +entry: + %a = call @llvm.riscv.vlse.nxv1i32( + poison, + * %0, + i64 4, + i64 %2) + + ret %a +} + +declare @llvm.riscv.vloxei.nxv1i32.nxv1i16( + , + *, + , + i64); + +define @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16(* %0, %1, i64 %2) sanitize_address { +; CHECK-LABEL: @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = zext [[TMP1:%.*]] to +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP2:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP15:%.*]] +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP2]], i64 [[TMP8]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP7]] ], [ 
[[IV_NEXT:%.*]], [[TMP14:%.*]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement splat (i1 true), i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP14]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = extractelement [[TMP5]], i64 [[IV]] +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[TMP12]] to i64 +; CHECK-NEXT: call void @__asan_loadN(i64 [[TMP13]], i64 4) +; CHECK-NEXT: br label [[TMP14]] +; CHECK: 14: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP9]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP15]] +; CHECK: 15: +; CHECK-NEXT: [[A:%.*]] = call @llvm.riscv.vloxei.nxv1i32.p0.nxv1i16.i64( poison, ptr [[TMP0]], [[TMP1]], i64 [[TMP2]]) +; CHECK-NEXT: ret [[A]] +; +entry: + %a = call @llvm.riscv.vloxei.nxv1i32.nxv1i16( + poison, + * %0, + %1, + i64 %2) + + ret %a +} + +declare @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16( + , + *, + , + , + i64, + i64); + +define @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16( %0, * %1, %2, %3, i64 %4) sanitize_address { +; CHECK-LABEL: @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP6:%.*]] = zext [[TMP2:%.*]] to +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP1:%.*]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i64 [[TMP4:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP17:%.*]] +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[TMP10]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP9]] ], [ [[IV_NEXT:%.*]], [[TMP16:%.*]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement [[TMP3:%.*]], i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP12]], label [[TMP13:%.*]], label [[TMP16]] +; CHECK: 13: +; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[TMP7]], i64 [[IV]] +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_loadN(i64 [[TMP15]], i64 4) +; CHECK-NEXT: br label [[TMP16]] +; CHECK: 16: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP11]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP17]] +; CHECK: 17: +; CHECK-NEXT: [[A:%.*]] = call @llvm.riscv.vloxei.mask.nxv1i32.p0.nxv1i16.i64( [[TMP0:%.*]], ptr [[TMP1]], [[TMP2]], [[TMP3]], i64 [[TMP4]], i64 1) +; CHECK-NEXT: ret [[A]] +; +entry: + %a = call @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16( + %0, + * %1, + %2, + %3, + i64 %4, i64 1) + + ret %a +} + +declare @llvm.riscv.vloxei.nxv1f32.nxv1i16( + , + *, + , + i64); + +define @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16(* %0, %1, i64 %2) sanitize_address { +; CHECK-LABEL: @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = zext [[TMP1:%.*]] to +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP2:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP15:%.*]] +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = call i64 
@llvm.umin.i64(i64 [[TMP2]], i64 [[TMP8]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[TMP14:%.*]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement splat (i1 true), i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP14]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = extractelement [[TMP5]], i64 [[IV]] +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[TMP12]] to i64 +; CHECK-NEXT: call void @__asan_loadN(i64 [[TMP13]], i64 4) +; CHECK-NEXT: br label [[TMP14]] +; CHECK: 14: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP9]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP15]] +; CHECK: 15: +; CHECK-NEXT: [[A:%.*]] = call @llvm.riscv.vloxei.nxv1f32.p0.nxv1i16.i64( poison, ptr [[TMP0]], [[TMP1]], i64 [[TMP2]]) +; CHECK-NEXT: ret [[A]] +; +entry: + %a = call @llvm.riscv.vloxei.nxv1f32.nxv1i16( + poison, + * %0, + %1, + i64 %2) + + ret %a +} + +declare @llvm.riscv.vluxei.nxv1i32.nxv1i16( + , + *, + , + i64); + +define @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16(* %0, %1, i64 %2) sanitize_address { +; CHECK-LABEL: @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = zext [[TMP1:%.*]] to +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP2:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP15:%.*]] +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP2]], i64 [[TMP8]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP7]] ], [ [[IV_NEXT:%.*]], [[TMP14:%.*]] ] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement splat (i1 true), i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP10]], label [[TMP11:%.*]], label [[TMP14]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = extractelement [[TMP5]], i64 [[IV]] +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr [[TMP12]] to i64 +; CHECK-NEXT: call void @__asan_loadN(i64 [[TMP13]], i64 4) +; CHECK-NEXT: br label [[TMP14]] +; CHECK: 14: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP9]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP15]] +; CHECK: 15: +; CHECK-NEXT: [[A:%.*]] = call @llvm.riscv.vluxei.nxv1i32.p0.nxv1i16.i64( poison, ptr [[TMP0]], [[TMP1]], i64 [[TMP2]]) +; CHECK-NEXT: ret [[A]] +; +entry: + %a = call @llvm.riscv.vluxei.nxv1i32.nxv1i16( + poison, + * %0, + %1, + i64 %2) + + ret %a +} + +declare @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( + , + *, + , + , + i64, + i64); + +define @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16( %0, * %1, %2, %3, i64 %4) sanitize_address { +; CHECK-LABEL: @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP6:%.*]] = zext [[TMP2:%.*]] to +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP1:%.*]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i64 [[TMP4:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP17:%.*]] 
+; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[TMP10]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP9]] ], [ [[IV_NEXT:%.*]], [[TMP16:%.*]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement [[TMP3:%.*]], i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP12]], label [[TMP13:%.*]], label [[TMP16]] +; CHECK: 13: +; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[TMP7]], i64 [[IV]] +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_loadN(i64 [[TMP15]], i64 4) +; CHECK-NEXT: br label [[TMP16]] +; CHECK: 16: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP11]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP17]] +; CHECK: 17: +; CHECK-NEXT: [[A:%.*]] = call @llvm.riscv.vluxei.mask.nxv1i32.p0.nxv1i16.i64( [[TMP0:%.*]], ptr [[TMP1]], [[TMP2]], [[TMP3]], i64 [[TMP4]], i64 1) +; CHECK-NEXT: ret [[A]] +; +entry: + %a = call @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( + %0, + * %1, + %2, + %3, + i64 %4, i64 1) + + ret %a +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i16( + , + *, + , + i64); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16( %0, * %1, %2, i64 %3) sanitize_address { +; CHECK-LABEL: @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP2:%.*]] to +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP1:%.*]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP3:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP16:%.*]] +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[TMP9]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP8]] ], [ [[IV_NEXT:%.*]], [[TMP15:%.*]] ] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement splat (i1 true), i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP15]] +; CHECK: 12: +; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[TMP6]], i64 [[IV]] +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; CHECK-NEXT: call void @__asan_storeN(i64 [[TMP14]], i64 4) +; CHECK-NEXT: br label [[TMP15]] +; CHECK: 15: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP10]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP16]] +; CHECK: 16: +; CHECK-NEXT: call void @llvm.riscv.vsoxei.nxv1i32.p0.nxv1i16.i64( [[TMP0:%.*]], ptr [[TMP1]], [[TMP2]], i64 [[TMP3]]) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i16( + %0, + * %1, + %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16( + , + *, + , + , + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16( %0, * %1, %2, %3, i64 %4) sanitize_address { +; CHECK-LABEL: @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP6:%.*]] = zext [[TMP2:%.*]] to +; CHECK-NEXT: [[TMP7:%.*]] = 
getelementptr i8, ptr [[TMP1:%.*]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i64 [[TMP4:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP17:%.*]] +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[TMP10]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP9]] ], [ [[IV_NEXT:%.*]], [[TMP16:%.*]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement [[TMP3:%.*]], i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP12]], label [[TMP13:%.*]], label [[TMP16]] +; CHECK: 13: +; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[TMP7]], i64 [[IV]] +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_storeN(i64 [[TMP15]], i64 4) +; CHECK-NEXT: br label [[TMP16]] +; CHECK: 16: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP11]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP17]] +; CHECK: 17: +; CHECK-NEXT: call void @llvm.riscv.vsoxei.mask.nxv1i32.p0.nxv1i16.i64( [[TMP0:%.*]], ptr [[TMP1]], [[TMP2]], [[TMP3]], i64 [[TMP4]]) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16( + %0, + * %1, + %2, + %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i16( + , + *, + , + i64); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16( %0, * %1, %2, i64 %3) sanitize_address { +; CHECK-LABEL: @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = zext [[TMP2:%.*]] to +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP1:%.*]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP3:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP16:%.*]] +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[TMP9]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP8]] ], [ [[IV_NEXT:%.*]], [[TMP15:%.*]] ] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement splat (i1 true), i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP15]] +; CHECK: 12: +; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[TMP6]], i64 [[IV]] +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 +; CHECK-NEXT: call void @__asan_storeN(i64 [[TMP14]], i64 4) +; CHECK-NEXT: br label [[TMP15]] +; CHECK: 15: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP10]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP16]] +; CHECK: 16: +; CHECK-NEXT: call void @llvm.riscv.vsuxei.nxv1i32.p0.nxv1i16.i64( [[TMP0:%.*]], ptr [[TMP1]], [[TMP2]], i64 [[TMP3]]) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i16( + %0, + * %1, + %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16( + , + *, + , + , + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16( %0, * %1, %2, %3, i64 %4) sanitize_address { +; CHECK-LABEL: @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: 
[[TMP5:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP6:%.*]] = zext [[TMP2:%.*]] to +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP1:%.*]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne i64 [[TMP4:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[TMP9:%.*]], label [[TMP17:%.*]] +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP4]], i64 [[TMP10]]) +; CHECK-NEXT: br label [[DOTSPLIT:%.*]] +; CHECK: .split: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[TMP9]] ], [ [[IV_NEXT:%.*]], [[TMP16:%.*]] ] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement [[TMP3:%.*]], i64 [[IV]] +; CHECK-NEXT: br i1 [[TMP12]], label [[TMP13:%.*]], label [[TMP16]] +; CHECK: 13: +; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[TMP7]], i64 [[IV]] +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_storeN(i64 [[TMP15]], i64 4) +; CHECK-NEXT: br label [[TMP16]] +; CHECK: 16: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_CHECK:%.*]] = icmp eq i64 [[IV_NEXT]], [[TMP11]] +; CHECK-NEXT: br i1 [[IV_CHECK]], label [[DOTSPLIT_SPLIT:%.*]], label [[DOTSPLIT]] +; CHECK: .split.split: +; CHECK-NEXT: br label [[TMP17]] +; CHECK: 17: +; CHECK-NEXT: call void @llvm.riscv.vsuxei.mask.nxv1i32.p0.nxv1i16.i64( [[TMP0:%.*]], ptr [[TMP1]], [[TMP2]], [[TMP3]], i64 [[TMP4]]) +; CHECK-NEXT: ret void +; +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16( + %0, + * %1, + %2, + %3, + i64 %4) + + ret void +} + + +declare target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv4i8(target("riscv.vector.tuple", , 2), ptr, , i64, i64) +declare target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 2), ptr, , , i64, i64, i64) + +define @test_vloxseg2_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vloxseg2_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP25:%.*]] = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 2) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP26:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) [[TMP25]], i32 1) +; CHECK-NEXT: ret [[TMP26]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) %0, i32 1) + ret %1 +} + +define @test_vloxseg2_mask_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vloxseg2_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP25:%.*]] = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP26:%.*]] = call 
@llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) [[TMP25]], i32 1) +; CHECK-NEXT: ret [[TMP26]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vloxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv4i8(target("riscv.vector.tuple", , 3), ptr, , i64, i64) +declare target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 3), ptr, , , i64, i64, i64) + +define @test_vloxseg3_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vloxseg3_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP37:%.*]] = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 3) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP38:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) [[TMP37]], i32 1) +; CHECK-NEXT: ret [[TMP38]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) %0, i32 1) + ret %1 +} + +define @test_vloxseg3_mask_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vloxseg3_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP37:%.*]] = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP38:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) [[TMP37]], i32 1) +; CHECK-NEXT: ret [[TMP38]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vloxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv4i8(target("riscv.vector.tuple", , 4), ptr, , i64, i64) +declare target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 4), ptr, , , i64, i64, i64) + +define @test_vloxseg4_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vloxseg4_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP49:%.*]] = tail call 
target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 4) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP50:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) [[TMP49]], i32 1) +; CHECK-NEXT: ret [[TMP50]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) %0, i32 1) + ret %1 +} + +define @test_vloxseg4_mask_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vloxseg4_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP49:%.*]] = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP50:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) [[TMP49]], i32 1) +; CHECK-NEXT: ret [[TMP50]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vloxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i8(target("riscv.vector.tuple", , 5), ptr, , i64, i64) +declare target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 5), ptr, , , i64, i64, i64) + +define @test_vloxseg5_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vloxseg5_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP61:%.*]] = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 5) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP62:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) [[TMP61]], i32 1) +; CHECK-NEXT: ret [[TMP62]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) %0, i32 1) + ret %1 +} + +define @test_vloxseg5_mask_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vloxseg5_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP61:%.*]] = tail call target("riscv.vector.tuple", , 5) 
@llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP62:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) [[TMP61]], i32 1) +; CHECK-NEXT: ret [[TMP62]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vloxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i8(target("riscv.vector.tuple", , 6), ptr, , i64, i64) +declare target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 6), ptr, , , i64, i64, i64) + +define @test_vloxseg6_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vloxseg6_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP73:%.*]] = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 6) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP74:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) [[TMP73]], i32 1) +; CHECK-NEXT: ret [[TMP74]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) %0, i32 1) + ret %1 +} + +define @test_vloxseg6_mask_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vloxseg6_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP73:%.*]] = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP74:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) [[TMP73]], i32 1) +; CHECK-NEXT: ret [[TMP74]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vloxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i8(target("riscv.vector.tuple", , 7), ptr, , i64, i64) +declare target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 7), ptr, , , i64, i64, i64) + +define @test_vloxseg7_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl) 
sanitize_address { +; CHECK-LABEL: @test_vloxseg7_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP85:%.*]] = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 7) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP86:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) [[TMP85]], i32 1) +; CHECK-NEXT: ret [[TMP86]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) %0, i32 1) + ret %1 +} + +define @test_vloxseg7_mask_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vloxseg7_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP85:%.*]] = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP86:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) [[TMP85]], i32 1) +; CHECK-NEXT: ret [[TMP86]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vloxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i8(target("riscv.vector.tuple", , 8), ptr, , i64, i64) +declare target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 8), ptr, , , i64, i64, i64) + +define @test_vloxseg8_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vloxseg8_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP97:%.*]] = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 8) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP98:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) [[TMP97]], i32 1) +; CHECK-NEXT: ret [[TMP98]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) %0, i32 1) + ret %1 +} + +define @test_vloxseg8_mask_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vloxseg8_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, 
ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP97:%.*]] = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP98:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) [[TMP97]], i32 1) +; CHECK-NEXT: ret [[TMP98]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vloxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv4i8(target("riscv.vector.tuple", , 2), ptr, , i64, i64) +declare target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 2), ptr, , , i64, i64, i64) + +define @test_vluxseg2_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vluxseg2_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP25:%.*]] = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 2) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP26:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) [[TMP25]], i32 1) +; CHECK-NEXT: ret [[TMP26]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) %0, i32 1) + ret %1 +} + +define @test_vluxseg2_mask_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vluxseg2_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP25:%.*]] = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 2) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP26:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) [[TMP25]], i32 1) +; CHECK-NEXT: ret [[TMP26]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 2) @llvm.riscv.vluxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i1.nxv1i32(target("riscv.vector.tuple", , 2) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_2t(target("riscv.vector.tuple", , 2) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv4i8(target("riscv.vector.tuple", , 3), ptr, , i64, i64) +declare target("riscv.vector.tuple", , 3) 
@llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 3), ptr, , , i64, i64, i64) + +define @test_vluxseg3_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vluxseg3_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP37:%.*]] = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 3) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP38:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) [[TMP37]], i32 1) +; CHECK-NEXT: ret [[TMP38]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) %0, i32 1) + ret %1 +} + +define @test_vluxseg3_mask_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vluxseg3_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP37:%.*]] = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 3) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP38:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) [[TMP37]], i32 1) +; CHECK-NEXT: ret [[TMP38]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 3) @llvm.riscv.vluxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i1.nxv1i32(target("riscv.vector.tuple", , 3) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_3t(target("riscv.vector.tuple", , 3) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv4i8(target("riscv.vector.tuple", , 4), ptr, , i64, i64) +declare target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 4), ptr, , , i64, i64, i64) + +define @test_vluxseg4_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vluxseg4_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP49:%.*]] = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 4) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP50:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) [[TMP49]], i32 1) +; CHECK-NEXT: ret [[TMP50]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) %0, i32 1) + ret %1 +} + +define 
@test_vluxseg4_mask_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vluxseg4_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP49:%.*]] = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 4) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP50:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) [[TMP49]], i32 1) +; CHECK-NEXT: ret [[TMP50]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 4) @llvm.riscv.vluxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i1.nxv1i32(target("riscv.vector.tuple", , 4) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_4t(target("riscv.vector.tuple", , 4) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i8(target("riscv.vector.tuple", , 5), ptr, , i64, i64) +declare target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 5), ptr, , , i64, i64, i64) + +define @test_vluxseg5_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vluxseg5_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP61:%.*]] = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 5) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP62:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) [[TMP61]], i32 1) +; CHECK-NEXT: ret [[TMP62]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) %0, i32 1) + ret %1 +} + +define @test_vluxseg5_mask_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vluxseg5_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP61:%.*]] = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 5) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP62:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) [[TMP61]], i32 1) +; CHECK-NEXT: ret [[TMP62]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 5) @llvm.riscv.vluxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i1.nxv1i32(target("riscv.vector.tuple", , 5) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_5t(target("riscv.vector.tuple", , 5) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 6) 
@llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i8(target("riscv.vector.tuple", , 6), ptr, , i64, i64) +declare target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 6), ptr, , , i64, i64, i64) + +define @test_vluxseg6_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vluxseg6_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP73:%.*]] = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 6) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP74:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) [[TMP73]], i32 1) +; CHECK-NEXT: ret [[TMP74]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) %0, i32 1) + ret %1 +} + +define @test_vluxseg6_mask_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vluxseg6_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP73:%.*]] = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 6) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP74:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) [[TMP73]], i32 1) +; CHECK-NEXT: ret [[TMP74]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 6) @llvm.riscv.vluxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i1.nxv1i32(target("riscv.vector.tuple", , 6) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_6t(target("riscv.vector.tuple", , 6) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i8(target("riscv.vector.tuple", , 7), ptr, , i64, i64) +declare target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 7), ptr, , , i64, i64, i64) + +define @test_vluxseg7_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vluxseg7_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP85:%.*]] = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 7) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP86:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) [[TMP85]], i32 1) +; CHECK-NEXT: ret [[TMP86]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, i64 %vl, i64 
5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) %0, i32 1) + ret %1 +} + +define @test_vluxseg7_mask_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vluxseg7_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP85:%.*]] = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 7) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP86:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) [[TMP85]], i32 1) +; CHECK-NEXT: ret [[TMP86]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 7) @llvm.riscv.vluxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i1.nxv1i32(target("riscv.vector.tuple", , 7) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_7t(target("riscv.vector.tuple", , 7) %0, i32 1) + ret %1 +} + + +declare target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i8(target("riscv.vector.tuple", , 8), ptr, , i64, i64) +declare target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 8), ptr, , , i64, i64, i64) + +define @test_vluxseg8_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vluxseg8_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP97:%.*]] = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 8) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: [[TMP98:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) [[TMP97]], i32 1) +; CHECK-NEXT: ret [[TMP98]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, i64 %vl, i64 5) + %1 = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) %0, i32 1) + ret %1 +} + +define @test_vluxseg8_mask_nxv1i32_nxv1i16(ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vluxseg8_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: [[TMP97:%.*]] = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 8) poison, ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 1, i64 5) +; CHECK-NEXT: [[TMP98:%.*]] = call @llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) [[TMP97]], i32 1) +; CHECK-NEXT: ret [[TMP98]] +; +entry: + %0 = tail call target("riscv.vector.tuple", , 8) @llvm.riscv.vluxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i1.nxv1i32(target("riscv.vector.tuple", , 8) poison, ptr %base, %index, %mask, i64 %vl, i64 1, i64 5) + %1 = call 
@llvm.riscv.tuple.extract.nxv1i32.triscv.vector.tuple_nxv4i8_8t(target("riscv.vector.tuple", , 8) %0, i32 1) + ret %1 +} + + +declare void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv4i8_2t.nxv4i8(target("riscv.vector.tuple", , 2), ptr, , i64, i64) +declare void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i8.nxv4i1(target("riscv.vector.tuple", , 2), ptr, , , i64, i64) + +define void @test_vsoxseg2_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 2) %val, ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsoxseg2_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 2) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) %val, ptr %base, %index, i64 %vl, i64 5) + ret void +} + +define void @test_vsoxseg2_mask_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 2) %val, ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsoxseg2_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 2) %val, ptr %base, %index, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv4i8_3t.nxv4i8(target("riscv.vector.tuple", , 3), ptr, , i64, i64) +declare void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i8.nxv4i1(target("riscv.vector.tuple", , 3), ptr, , , i64, i64) + +define void @test_vsoxseg3_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 3) %val, ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsoxseg3_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 3) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) %val, ptr %base, %index, i64 %vl, i64 5) + ret void +} + +define void @test_vsoxseg3_mask_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 3) %val, ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsoxseg3_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 3) %val, ptr %base, %index, %mask, i64 %vl, i64 5) + ret void +} + + 
+declare void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv4i8_4t.nxv4i8(target("riscv.vector.tuple", , 4), ptr, , i64, i64) +declare void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i8.nxv4i1(target("riscv.vector.tuple", , 4), ptr, , , i64, i64) + +define void @test_vsoxseg4_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 4) %val, ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsoxseg4_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 4) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) %val, ptr %base, %index, i64 %vl, i64 5) + ret void +} + +define void @test_vsoxseg4_mask_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 4) %val, ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsoxseg4_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 4) %val, ptr %base, %index, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i8(target("riscv.vector.tuple", , 5), ptr, , i64, i64) +declare void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i8.nxv4i1(target("riscv.vector.tuple", , 5), ptr, , , i64, i64) + +define void @test_vsoxseg5_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 5) %val, ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsoxseg5_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 5) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) %val, ptr %base, %index, i64 %vl, i64 5) + ret void +} + +define void @test_vsoxseg5_mask_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 5) %val, ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsoxseg5_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 5) %val, ptr %base, %index, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i8(target("riscv.vector.tuple", , 6), ptr, , i64, i64) 
+declare void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i8.nxv4i1(target("riscv.vector.tuple", , 6), ptr, , , i64, i64) + +define void @test_vsoxseg6_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 6) %val, ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsoxseg6_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 6) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) %val, ptr %base, %index, i64 %vl, i64 5) + ret void +} + +define void @test_vsoxseg6_mask_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 6) %val, ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsoxseg6_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 6) %val, ptr %base, %index, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i8(target("riscv.vector.tuple", , 7), ptr, , i64, i64) +declare void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i8.nxv4i1(target("riscv.vector.tuple", , 7), ptr, , , i64, i64) + +define void @test_vsoxseg7_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 7) %val, ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsoxseg7_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 7) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) %val, ptr %base, %index, i64 %vl, i64 5) + ret void +} + +define void @test_vsoxseg7_mask_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 7) %val, ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsoxseg7_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 7) %val, ptr %base, %index, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i8(target("riscv.vector.tuple", , 8), ptr, , i64, i64) +declare void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i8.nxv4i1(target("riscv.vector.tuple", , 8), ptr, , 
, i64, i64) + +define void @test_vsoxseg8_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 8) %val, ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsoxseg8_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 8) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) %val, ptr %base, %index, i64 %vl, i64 5) + ret void +} + +define void @test_vsoxseg8_mask_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 8) %val, ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsoxseg8_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 8) %val, ptr %base, %index, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv4i8_2t.nxv4i8(target("riscv.vector.tuple", , 2), ptr, , i64, i64) +declare void @llvm.riscv.vsuxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv4i8.nxv4i1(target("riscv.vector.tuple", , 2), ptr, , , i64, i64) + +define void @test_vsuxseg2_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 2) %val, ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsuxseg2_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv4i8_2t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 2) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsuxseg2.triscv.vector.tuple_nxv4i8_2t.nxv1i16(target("riscv.vector.tuple", , 2) %val, ptr %base, %index, i64 %vl, i64 5) + ret void +} + +define void @test_vsuxseg2_mask_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 2) %val, ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsuxseg2_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv4i8_2t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 2) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg2.mask.triscv.vector.tuple_nxv4i8_2t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 2) %val, ptr %base, %index, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv4i8_3t.nxv4i8(target("riscv.vector.tuple", , 3), ptr, , i64, i64) +declare void @llvm.riscv.vsuxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv4i8.nxv4i1(target("riscv.vector.tuple", , 3), ptr, , , i64, i64) + +define void @test_vsuxseg3_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 3) %val, ptr %base, %index, i64 %vl) 
sanitize_address { +; CHECK-LABEL: @test_vsuxseg3_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv4i8_3t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 3) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsuxseg3.triscv.vector.tuple_nxv4i8_3t.nxv1i16(target("riscv.vector.tuple", , 3) %val, ptr %base, %index, i64 %vl, i64 5) + ret void +} + +define void @test_vsuxseg3_mask_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 3) %val, ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsuxseg3_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv4i8_3t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 3) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg3.mask.triscv.vector.tuple_nxv4i8_3t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 3) %val, ptr %base, %index, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv4i8_4t.nxv4i8(target("riscv.vector.tuple", , 4), ptr, , i64, i64) +declare void @llvm.riscv.vsuxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv4i8.nxv4i1(target("riscv.vector.tuple", , 4), ptr, , , i64, i64) + +define void @test_vsuxseg4_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 4) %val, ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsuxseg4_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv4i8_4t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 4) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsuxseg4.triscv.vector.tuple_nxv4i8_4t.nxv1i16(target("riscv.vector.tuple", , 4) %val, ptr %base, %index, i64 %vl, i64 5) + ret void +} + +define void @test_vsuxseg4_mask_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 4) %val, ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsuxseg4_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv4i8_4t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 4) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg4.mask.triscv.vector.tuple_nxv4i8_4t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 4) %val, ptr %base, %index, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv4i8_5t.nxv4i8(target("riscv.vector.tuple", , 5), ptr, , i64, i64) +declare void @llvm.riscv.vsuxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv4i8.nxv4i1(target("riscv.vector.tuple", , 5), ptr, , , i64, i64) + +define void @test_vsuxseg5_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 5) %val, ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsuxseg5_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load 
i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv4i8_5t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 5) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsuxseg5.triscv.vector.tuple_nxv4i8_5t.nxv1i16(target("riscv.vector.tuple", , 5) %val, ptr %base, %index, i64 %vl, i64 5) + ret void +} + +define void @test_vsuxseg5_mask_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 5) %val, ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsuxseg5_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv4i8_5t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 5) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg5.mask.triscv.vector.tuple_nxv4i8_5t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 5) %val, ptr %base, %index, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv4i8_6t.nxv4i8(target("riscv.vector.tuple", , 6), ptr, , i64, i64) +declare void @llvm.riscv.vsuxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv4i8.nxv4i1(target("riscv.vector.tuple", , 6), ptr, , , i64, i64) + +define void @test_vsuxseg6_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 6) %val, ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsuxseg6_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv4i8_6t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 6) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsuxseg6.triscv.vector.tuple_nxv4i8_6t.nxv1i16(target("riscv.vector.tuple", , 6) %val, ptr %base, %index, i64 %vl, i64 5) + ret void +} + +define void @test_vsuxseg6_mask_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 6) %val, ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsuxseg6_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv4i8_6t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 6) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg6.mask.triscv.vector.tuple_nxv4i8_6t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 6) %val, ptr %base, %index, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv4i8_7t.nxv4i8(target("riscv.vector.tuple", , 7), ptr, , i64, i64) +declare void @llvm.riscv.vsuxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv4i8.nxv4i1(target("riscv.vector.tuple", , 7), ptr, , , i64, i64) + +define void @test_vsuxseg7_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 7) %val, ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsuxseg7_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void 
@llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv4i8_7t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 7) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsuxseg7.triscv.vector.tuple_nxv4i8_7t.nxv1i16(target("riscv.vector.tuple", , 7) %val, ptr %base, %index, i64 %vl, i64 5) + ret void +} + +define void @test_vsuxseg7_mask_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 7) %val, ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsuxseg7_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv4i8_7t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 7) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg7.mask.triscv.vector.tuple_nxv4i8_7t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 7) %val, ptr %base, %index, %mask, i64 %vl, i64 5) + ret void +} + + +declare void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv4i8_8t.nxv4i8(target("riscv.vector.tuple", , 8), ptr, , i64, i64) +declare void @llvm.riscv.vsuxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv4i8.nxv4i1(target("riscv.vector.tuple", , 8), ptr, , , i64, i64) + +define void @test_vsuxseg8_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 8) %val, ptr %base, %index, i64 %vl) sanitize_address { +; CHECK-LABEL: @test_vsuxseg8_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv4i8_8t.p0.nxv1i16.i64(target("riscv.vector.tuple", , 8) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsuxseg8.triscv.vector.tuple_nxv4i8_8t.nxv1i16(target("riscv.vector.tuple", , 8) %val, ptr %base, %index, i64 %vl, i64 5) + ret void +} + +define void @test_vsuxseg8_mask_nxv1i32_nxv1i16(target("riscv.vector.tuple", , 8) %val, ptr %base, %index, i64 %vl, %mask) sanitize_address { +; CHECK-LABEL: @test_vsuxseg8_mask_nxv1i32_nxv1i16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__asan_shadow_memory_dynamic_address, align 8 +; CHECK-NEXT: tail call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv4i8_8t.p0.nxv1i16.nxv1i1.i64(target("riscv.vector.tuple", , 8) [[VAL:%.*]], ptr [[BASE:%.*]], [[INDEX:%.*]], [[MASK:%.*]], i64 [[VL:%.*]], i64 5) +; CHECK-NEXT: ret void +; +entry: + tail call void @llvm.riscv.vsoxseg8.mask.triscv.vector.tuple_nxv4i8_8t.nxv1i16.nxv1i1(target("riscv.vector.tuple", , 8) %val, ptr %base, %index, %mask, i64 %vl, i64 5) + ret void +} + diff --git a/llvm/test/Instrumentation/AddressSanitizer/coro-byval-param.ll b/llvm/test/Instrumentation/AddressSanitizer/coro-byval-param.ll index 290f1cbd38cdf..b0aec4e0426e6 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/coro-byval-param.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/coro-byval-param.ll @@ -58,7 +58,7 @@ coro.free: ; preds = %cleanup33 br label %coro.ret coro.ret: ; preds = %coro.free, %cleanup33, %init.ready, %coro.init - %10 = call i1 @llvm.coro.end(ptr null, i1 false, token none) #10 + call void @llvm.coro.end(ptr null, i1 false, token none) #10 ret ptr %call2 } @@ -105,7 +105,7 @@ declare i8 @llvm.coro.suspend(token, i1) #2 declare void 
@_ZN4task12promise_type13final_suspendEv(ptr nonnull dereferenceable(1)) local_unnamed_addr #7 align 2 ; Function Attrs: nounwind -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 ; Function Attrs: nobuiltin nounwind declare void @_ZdlPv(ptr) local_unnamed_addr #8 diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/coro-byval-param.ll b/llvm/test/Instrumentation/HWAddressSanitizer/coro-byval-param.ll index 0289b33a45882..064565ca2f3b2 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/coro-byval-param.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/coro-byval-param.ll @@ -58,7 +58,7 @@ coro.free: ; preds = %cleanup33 br label %coro.ret coro.ret: ; preds = %coro.free, %cleanup33, %init.ready, %coro.init - %10 = call i1 @llvm.coro.end(ptr null, i1 false, token none) #10 + call void @llvm.coro.end(ptr null, i1 false, token none) #10 ret ptr %call2 } @@ -105,7 +105,7 @@ declare i8 @llvm.coro.suspend(token, i1) #2 declare void @_ZN4task12promise_type13final_suspendEv(ptr nonnull dereferenceable(1)) local_unnamed_addr #7 align 2 ; Function Attrs: nounwind -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 ; Function Attrs: nobuiltin nounwind declare void @_ZdlPv(ptr) local_unnamed_addr #8 diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll index b2a4f0e582f9e..d8f204f32cfd1 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll @@ -21,7 +21,7 @@ ; - llvm.x86.avx512.mask.pmov.db.mem.512, llvm.x86.avx512.mask.pmov.dw.mem.512, llvm.x86.avx512.mask.pmov.qb.mem.512, llvm.x86.avx512.mask.pmov.qd.mem.512llvm.x86.avx512.mask.pmov.qw.mem.512 ; - llvm.x86.avx512.mask.pmovs.db.mem.512, llvm.x86.avx512.mask.pmovs.dw.mem.512, llvm.x86.avx512.mask.pmovs.qb.mem.512, llvm.x86.avx512.mask.pmovs.qd.mem.512, llvm.x86.avx512.mask.pmovs.qw.mem.512 ; - llvm.x86.avx512.mask.pmovus.db.mem.512, llvm.x86.avx512.mask.pmovus.dw.mem.512, llvm.x86.avx512.mask.pmovus.qb.mem.512, llvm.x86.avx512.mask.pmovus.qd.mem.512, llvm.x86.avx512.mask.pmovus.qw.mem.512 -; - llvm.x86.avx512.mask.rndscale.pd.512, llvm.x86.avx512.mask.rndscale.ps.512, llvm.x86.avx512.mask.rndscale.sd, llvm.x86.avx512.mask.rndscale.ss +; - llvm.x86.avx512.mask.rndscale.sd, llvm.x86.avx512.mask.rndscale.ss ; - llvm.x86.avx512.mask.scalef.pd.512, llvm.x86.avx512.mask.scalef.ps.512 ; - llvm.x86.avx512.mask.sqrt.sd, llvm.x86.avx512.mask.sqrt.ss ; - llvm.x86.avx512.maskz.fixupimm.pd.512, llvm.x86.avx512.maskz.fixupimm.ps.512, llvm.x86.avx512.maskz.fixupimm.sd, llvm.x86.avx512.maskz.fixupimm.ss @@ -965,18 +965,11 @@ define <8 x double> @test7(<8 x double> %a) #0 { ; CHECK-LABEL: @test7( ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <8 x i64> [[TMP1]], 
zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = select <8 x i1> splat (i1 true), <8 x i64> [[TMP3]], <8 x i64> [[TMP1]] ; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> [[A:%.*]], i32 11, <8 x double> [[A]], i8 -1, i32 4) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x double> [[RES]] ; %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4) @@ -989,18 +982,11 @@ define <16 x float> @test8(<16 x float> %a) #0 { ; CHECK-LABEL: @test8( ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i1> [[TMP2]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = select <16 x i1> splat (i1 true), <16 x i32> [[TMP3]], <16 x i32> [[TMP1]] ; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> [[A:%.*]], i32 11, <16 x float> [[A]], i16 -1, i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> [[TMP4]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[RES]] ; %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4) diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll index e5cbe8c132238..8723b1005f8fc 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512fp16-intrinsics.ll @@ -17,7 +17,6 @@ ; - llvm.x86.avx512fp16.mask.rcp.sh ; - llvm.x86.avx512fp16.mask.reduce.ph.512 ; - llvm.x86.avx512fp16.mask.reduce.sh -; - llvm.x86.avx512fp16.mask.rndscale.ph.512 ; - llvm.x86.avx512fp16.mask.rndscale.sh ; - llvm.x86.avx512fp16.mask.rsqrt.sh ; - llvm.x86.avx512fp16.mask.scalef.ph.512 @@ -868,36 +867,28 @@ declare <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half>, i32, define <32 x half>@test_int_x86_avx512_mask_rndscale_ph_512(<32 x half> %x0, <32 x half> %x2, i32 %x3) #0 { ; CHECK-LABEL: define <32 x half> @test_int_x86_avx512_mask_rndscale_ph_512( ; CHECK-SAME: <32 x half> [[X0:%.*]], <32 x half> [[X2:%.*]], i32 [[X3:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[X3]] to <32 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <32 x i1> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = select <32 x i1> [[TMP4]], <32 x i16> [[TMP6]], <32 x i16> [[TMP2]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i32 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] ; CHECK-NEXT: unreachable -; CHECK: [[BB7]]: +; CHECK: [[BB9]]: ; CHECK-NEXT: [[RES:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> [[X0]], i32 8, <32 x half> [[X2]], i32 [[X3]], i32 4) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] -; CHECK: [[BB10]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: [[BB11]]: +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <32 x i1> [[TMP10]] to <32 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> splat (i1 true), <32 x i16> [[TMP11]], <32 x i16> [[TMP2]] ; CHECK-NEXT: [[RES1:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> [[X0]], i32 4, <32 x half> [[X2]], i32 -1, i32 8) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i16> [[TMP7]], [[TMP12]] ; CHECK-NEXT: [[RES2:%.*]] = fadd <32 x half> [[RES]], [[RES1]] -; CHECK-NEXT: store <32 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <32 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x half> [[RES2]] ; %res = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> %x0, i32 8, <32 x half> %x2, i32 %x3, i32 4) diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll index 20114fe7d3151..d598142fe8dbf 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll @@ -54,8 +54,6 @@ ; - llvm.x86.avx512.mask.pmovus.qd.mem.128, llvm.x86.avx512.mask.pmovus.qd.mem.256 ; - llvm.x86.avx512.mask.pmovus.qw.128, llvm.x86.avx512.mask.pmovus.qw.256 ; - llvm.x86.avx512.mask.pmovus.qw.mem.128, llvm.x86.avx512.mask.pmovus.qw.mem.256 -; - llvm.x86.avx512.mask.rndscale.pd.128, llvm.x86.avx512.mask.rndscale.pd.256 -; - llvm.x86.avx512.mask.rndscale.ps.128, llvm.x86.avx512.mask.rndscale.ps.256 ; - llvm.x86.avx512.mask.scalef.pd.128, llvm.x86.avx512.mask.scalef.pd.256 ; - 
llvm.x86.avx512.mask.scalef.ps.128, llvm.x86.avx512.mask.scalef.ps.256 ; - llvm.x86.avx512.maskz.fixupimm.pd.128, llvm.x86.avx512.maskz.fixupimm.pd.256 @@ -7127,36 +7125,29 @@ define <2 x double>@test_int_x86_avx512_mask_rndscale_pd_128(<2 x double> %x0, < ; ; CHECK-LABEL: define <2 x double> @test_int_x86_avx512_mask_rndscale_pd_128( ; CHECK-SAME: <2 x double> [[X0:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[X3]] to i2 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i2 [[TMP4]] to <2 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <2 x i1> [[TMP6]] to <2 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP5]], <2 x i64> [[TMP7]], <2 x i64> [[TMP2]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB7]]: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> [[X0]], i32 4, <2 x double> [[X2]], i8 [[X3]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] ; CHECK: [[BB10]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> [[X0]], i32 4, <2 x double> [[X2]], i8 [[X3]]) +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = sext <2 x i1> [[TMP11]] to <2 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = select <2 x i1> splat (i1 true), <2 x i64> [[TMP12]], <2 x i64> [[TMP2]] ; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> [[X0]], i32 88, <2 x double> [[X2]], i8 -1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP8]], [[TMP13]] ; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[RES]], [[RES1]] -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 
x double> [[RES2]] ; %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %x0, i32 4, <2 x double> %x2, i8 %x3) @@ -7171,36 +7162,29 @@ define <4 x double>@test_int_x86_avx512_mask_rndscale_pd_256(<4 x double> %x0, < ; ; CHECK-LABEL: define <4 x double> @test_int_x86_avx512_mask_rndscale_pd_256( ; CHECK-SAME: <4 x double> [[X0:%.*]], <4 x double> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[X3]] to i4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP7]], <4 x i64> [[TMP2]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB7]]: -; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> [[X0]], i32 4, <4 x double> [[X2]], i8 [[X3]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] ; CHECK: [[BB10]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> [[X0]], i32 4, <4 x double> [[X2]], i8 [[X3]]) +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <4 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i1> [[TMP11]] to <4 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> splat (i1 true), <4 x i64> [[TMP12]], <4 x i64> [[TMP2]] ; CHECK-NEXT: [[RES1:%.*]] = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> [[X0]], i32 88, <4 x double> [[X2]], i8 -1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i64> [[TMP8]], [[TMP13]] ; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x double> [[RES]], [[RES1]] -; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> 
[[RES2]] ; %res = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %x0, i32 4, <4 x double> %x2, i8 %x3) @@ -7215,36 +7199,29 @@ define <4 x float>@test_int_x86_avx512_mask_rndscale_ps_128(<4 x float> %x0, <4 ; ; CHECK-LABEL: define <4 x float> @test_int_x86_avx512_mask_rndscale_ps_128( ; CHECK-SAME: <4 x float> [[X0:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i8 [[X3]] to i4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i1> [[TMP6]] to <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP7]], <4 x i32> [[TMP2]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB9:.*]], label %[[BB10:.*]], !prof [[PROF1]] +; CHECK: [[BB9]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB7]]: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> [[X0]], i32 88, <4 x float> [[X2]], i8 [[X3]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] ; CHECK: [[BB10]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: [[BB11]]: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> [[X0]], i32 88, <4 x float> [[X2]], i8 [[X3]]) +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <4 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i1> [[TMP11]] to <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> splat (i1 true), <4 x i32> [[TMP12]], <4 x i32> [[TMP2]] ; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> [[X0]], i32 4, <4 x float> [[X2]], i8 -1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP8]], [[TMP13]] ; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[RES]], [[RES1]] -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES2]] ; %res = call <4 x 
float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %x0, i32 88, <4 x float> %x2, i8 %x3) @@ -7259,36 +7236,28 @@ define <8 x float>@test_int_x86_avx512_mask_rndscale_ps_256(<8 x float> %x0, <8 ; ; CHECK-LABEL: define <8 x float> @test_int_x86_avx512_mask_rndscale_ps_256( ; CHECK-SAME: <8 x float> [[X0:%.*]], <8 x float> [[X2:%.*]], i8 [[X3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP6]], <8 x i32> [[TMP2]] ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] -; CHECK: [[BB6]]: +; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: [[BB7]]: +; CHECK: [[BB9]]: ; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> [[X0]], i32 5, <8 x float> [[X2]], i8 [[X3]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label %[[BB10:.*]], label %[[BB11:.*]], !prof [[PROF1]] -; CHECK: [[BB10]]: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: [[BB11]]: +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <8 x i1> [[TMP10]] to <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> splat (i1 true), <8 x i32> [[TMP11]], <8 x i32> [[TMP2]] ; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> [[X0]], i32 66, <8 x float> [[X2]], i8 -1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP7]], [[TMP12]] ; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x float> [[RES]], [[RES1]] -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES2]] ; %res = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %x0, i32 5, <8 x float> %x2, i8 %x3) diff --git a/llvm/test/Linker/Inputs/errno-tbaa-cxx-metadata.ll b/llvm/test/Linker/Inputs/errno-tbaa-cxx-metadata.ll 
new file mode 100644 index 0000000000000..eefb6d833f636 --- /dev/null +++ b/llvm/test/Linker/Inputs/errno-tbaa-cxx-metadata.ll @@ -0,0 +1,5 @@ +!llvm.errno.tbaa = !{!0} +!0 = !{!1, !1, i64 0} +!1 = !{!"int", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C++ TBAA"} diff --git a/llvm/test/Linker/Inputs/errno-tbaa-metadata.ll b/llvm/test/Linker/Inputs/errno-tbaa-metadata.ll new file mode 100644 index 0000000000000..5dd468776bdee --- /dev/null +++ b/llvm/test/Linker/Inputs/errno-tbaa-metadata.ll @@ -0,0 +1,5 @@ +!llvm.errno.tbaa = !{!0} +!0 = !{!1, !1, i64 0} +!1 = !{!"int", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} diff --git a/llvm/test/Linker/link-errno-tbaa-metadata.ll b/llvm/test/Linker/link-errno-tbaa-metadata.ll new file mode 100644 index 0000000000000..b58373d3acbef --- /dev/null +++ b/llvm/test/Linker/link-errno-tbaa-metadata.ll @@ -0,0 +1,8 @@ +; RUN: llvm-link %S/Inputs/errno-tbaa-metadata.ll %S/Inputs/errno-tbaa-cxx-metadata.ll -S -o - | FileCheck %s --check-prefix=CHECK-MERGE +; RUN: llvm-link %S/Inputs/errno-tbaa-metadata.ll %S/Inputs/errno-tbaa-metadata.ll -S -o - | FileCheck %s --check-prefix=CHECK-DEDUP + +; Ensure merging when linking modules w/ different errno TBAA hierarchies. +; CHECK-MERGE: !llvm.errno.tbaa = !{![[NODE0:[0-9]+]], ![[NODE1:[0-9]+]]} + +; Ensure deduplication when linking modules w/ identical errno TBAA nodes. +; CHECK-DEDUP: !llvm.errno.tbaa = !{![[NODE:[0-9]+]]} diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s index cce8e1ef24f5f..4f7df62659f68 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s @@ -1,4 +1,5 @@ // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX125X-ERR,GFX1250-ERR --implicit-check-not=error: --strict-whitespace %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1251 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX125X-ERR,GFX1251-ERR --implicit-check-not=error: --strict-whitespace %s v_lshl_add_u64 v[2:3], v[4:5], v7, v[8:9] dpp8:[7,6,5,4,3,2,1,0] // GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopk.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopk.s index 4e3e725a00556..819ecb866c5ae 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sopk.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopk.s @@ -193,19 +193,19 @@ s_call_b64 vcc, 0x1234 s_call_b64 null, 0x1234 // GFX12: encoding: [0x34,0x12,0x7c,0xba] -s_getreg_b32 s0, hwreg(HW_REG_MODE) +s_getreg_b32 s0, hwreg(HW_REG_WAVE_MODE) // GFX12: encoding: [0x01,0xf8,0x80,0xb8] -s_getreg_b32 s0, hwreg(HW_REG_STATUS) +s_getreg_b32 s0, hwreg(HW_REG_WAVE_STATUS) // GFX12: encoding: [0x02,0xf8,0x80,0xb8] -s_getreg_b32 s0, hwreg(HW_REG_STATE_PRIV) +s_getreg_b32 s0, hwreg(HW_REG_WAVE_STATE_PRIV) // GFX12: encoding: [0x04,0xf8,0x80,0xb8] -s_getreg_b32 s0, hwreg(HW_REG_GPR_ALLOC) +s_getreg_b32 s0, hwreg(HW_REG_WAVE_GPR_ALLOC) // GFX12: encoding: [0x05,0xf8,0x80,0xb8] -s_getreg_b32 s0, hwreg(HW_REG_LDS_ALLOC) +s_getreg_b32 s0, hwreg(HW_REG_WAVE_LDS_ALLOC) // GFX12: encoding: [0x06,0xf8,0x80,0xb8] s_getreg_b32 s0, hwreg(HW_REG_IB_STS) @@ -226,31 +226,31 @@ s_getreg_b32 s0, hwreg(HW_REG_PERF_SNAPSHOT_DATA1) s_getreg_b32 s0, hwreg(HW_REG_PERF_SNAPSHOT_DATA2) // GFX12: encoding: [0x10,0xf8,0x80,0xb8] -s_getreg_b32 s0, hwreg(HW_REG_EXCP_FLAG_PRIV) +s_getreg_b32 s0, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) // GFX12: encoding: [0x11,0xf8,0x80,0xb8] -s_getreg_b32 s0, hwreg(HW_REG_EXCP_FLAG_USER) +s_getreg_b32 s0, hwreg(HW_REG_WAVE_EXCP_FLAG_USER) // GFX12: encoding: [0x12,0xf8,0x80,0xb8] -s_getreg_b32 s0, hwreg(HW_REG_TRAP_CTRL) +s_getreg_b32 s0, hwreg(HW_REG_WAVE_TRAP_CTRL) // GFX12: encoding: [0x13,0xf8,0x80,0xb8] -s_getreg_b32 s0, hwreg(HW_REG_SCRATCH_BASE_LO) +s_getreg_b32 s0, hwreg(HW_REG_WAVE_SCRATCH_BASE_LO) // GFX12: encoding: [0x14,0xf8,0x80,0xb8] -s_getreg_b32 s0, hwreg(HW_REG_SCRATCH_BASE_HI) +s_getreg_b32 s0, hwreg(HW_REG_WAVE_SCRATCH_BASE_HI) // GFX12: encoding: [0x15,0xf8,0x80,0xb8] -s_getreg_b32 s0, hwreg(HW_REG_HW_ID1) +s_getreg_b32 s0, hwreg(HW_REG_WAVE_HW_ID1) // GFX12: encoding: [0x17,0xf8,0x80,0xb8] -s_getreg_b32 s0, hwreg(HW_REG_HW_ID2) +s_getreg_b32 s0, hwreg(HW_REG_WAVE_HW_ID2) // GFX12: encoding: [0x18,0xf8,0x80,0xb8] -s_getreg_b32 s0, hwreg(HW_REG_DVGPR_ALLOC_LO) +s_getreg_b32 s0, hwreg(HW_REG_WAVE_DVGPR_ALLOC_LO) // GFX12: encoding: [0x1f,0xf8,0x80,0xb8] -s_getreg_b32 s0, hwreg(HW_REG_DVGPR_ALLOC_HI) +s_getreg_b32 s0, hwreg(HW_REG_WAVE_DVGPR_ALLOC_HI) // GFX12: encoding: [0x20,0xf8,0x80,0xb8] s_getreg_b32 s0, hwreg(HW_REG_SHADER_CYCLES_LO) diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopk_alias.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopk_alias.s index 4a25922f956d3..bd265938170f1 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sopk_alias.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopk_alias.s @@ -1,4 +1,46 @@ // RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1200 %s | FileCheck --check-prefix=GFX12 %s s_addk_i32 s0, 0x1234 -// GFX12: s_addk_co_i32 s0, 0x1234 ; encoding: [0x34,0x12,0x80,0xb7] +// GFX12: s_addk_co_i32 s0, 0x1234 ; encoding: [0x34,0x12,0x80,0xb7] + +s_getreg_b32 s0, hwreg(HW_REG_MODE) +// GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_MODE) ; encoding: [0x01,0xf8,0x80,0xb8] + +s_getreg_b32 s0, hwreg(HW_REG_STATUS) +// GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_STATUS) ; encoding: [0x02,0xf8,0x80,0xb8] + +s_getreg_b32 s0, hwreg(HW_REG_STATE_PRIV) +// GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_STATE_PRIV) ; encoding: [0x04,0xf8,0x80,0xb8] + +s_getreg_b32 s0, hwreg(HW_REG_GPR_ALLOC) +// GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_GPR_ALLOC) ; encoding: [0x05,0xf8,0x80,0xb8] + +s_getreg_b32 s0, hwreg(HW_REG_LDS_ALLOC) +// GFX12: 
s_getreg_b32 s0, hwreg(HW_REG_WAVE_LDS_ALLOC) ; encoding: [0x06,0xf8,0x80,0xb8] + +s_getreg_b32 s0, hwreg(HW_REG_EXCP_FLAG_PRIV) +// GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) ; encoding: [0x11,0xf8,0x80,0xb8] + +s_getreg_b32 s0, hwreg(HW_REG_EXCP_FLAG_USER) +// GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_EXCP_FLAG_USER) ; encoding: [0x12,0xf8,0x80,0xb8] + +s_getreg_b32 s0, hwreg(HW_REG_TRAP_CTRL) +// GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_TRAP_CTRL) ; encoding: [0x13,0xf8,0x80,0xb8] + +s_getreg_b32 s0, hwreg(HW_REG_SCRATCH_BASE_LO) +// GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_SCRATCH_BASE_LO) ; encoding: [0x14,0xf8,0x80,0xb8] + +s_getreg_b32 s0, hwreg(HW_REG_SCRATCH_BASE_HI) +// GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_SCRATCH_BASE_HI) ; encoding: [0x15,0xf8,0x80,0xb8] + +s_getreg_b32 s0, hwreg(HW_REG_HW_ID1) +// GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_HW_ID1) ; encoding: [0x17,0xf8,0x80,0xb8] + +s_getreg_b32 s0, hwreg(HW_REG_HW_ID2) +// GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_HW_ID2) ; encoding: [0x18,0xf8,0x80,0xb8] + +s_getreg_b32 s0, hwreg(HW_REG_DVGPR_ALLOC_LO) +// GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_DVGPR_ALLOC_LO) ; encoding: [0x1f,0xf8,0x80,0xb8] + +s_getreg_b32 s0, hwreg(HW_REG_DVGPR_ALLOC_HI) +// GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_DVGPR_ALLOC_HI) ; encoding: [0x20,0xf8,0x80,0xb8] diff --git a/llvm/test/MC/AMDGPU/pal-registers.s b/llvm/test/MC/AMDGPU/pal-registers.s index 3080518b7eb0a..839b3126a131b 100644 --- a/llvm/test/MC/AMDGPU/pal-registers.s +++ b/llvm/test/MC/AMDGPU/pal-registers.s @@ -258,6 +258,22 @@ amdpal.pipelines: 0x2e4d: 0 0x2e4e: 0 0x2e4f: 0 + 0x2e50: 0 + 0x2e51: 0 + 0x2e52: 0 + 0x2e53: 0 + 0x2e54: 0 + 0x2e55: 0 + 0x2e56: 0 + 0x2e57: 0 + 0x2e58: 0 + 0x2e59: 0 + 0x2e5a: 0 + 0x2e5b: 0 + 0x2e5c: 0 + 0x2e5d: 0 + 0x2e5e: 0 + 0x2e5f: 0 0xa08f: 0 0xa191: 0 0xa192: 0 @@ -596,6 +612,22 @@ amdpal.pipelines: // CHECK: 0x2e4d (COMPUTE_USER_DATA_13) // CHECK: 0x2e4e (COMPUTE_USER_DATA_14) // CHECK: 0x2e4f (COMPUTE_USER_DATA_15) +// CHECK: 0x2e50 (COMPUTE_USER_DATA_16) +// CHECK: 0x2e51 (COMPUTE_USER_DATA_17) +// CHECK: 0x2e52 (COMPUTE_USER_DATA_18) +// CHECK: 0x2e53 (COMPUTE_USER_DATA_19) +// CHECK: 0x2e54 (COMPUTE_USER_DATA_20) +// CHECK: 0x2e55 (COMPUTE_USER_DATA_21) +// CHECK: 0x2e56 (COMPUTE_USER_DATA_22) +// CHECK: 0x2e57 (COMPUTE_USER_DATA_23) +// CHECK: 0x2e58 (COMPUTE_USER_DATA_24) +// CHECK: 0x2e59 (COMPUTE_USER_DATA_25) +// CHECK: 0x2e5a (COMPUTE_USER_DATA_26) +// CHECK: 0x2e5b (COMPUTE_USER_DATA_27) +// CHECK: 0x2e5c (COMPUTE_USER_DATA_28) +// CHECK: 0x2e5d (COMPUTE_USER_DATA_29) +// CHECK: 0x2e5e (COMPUTE_USER_DATA_30) +// CHECK: 0x2e5f (COMPUTE_USER_DATA_31) // CHECK: 0xa08f (CB_SHADER_MASK) // CHECK: 0xa191 (SPI_PS_INPUT_CNTL_0) // CHECK: 0xa192 (SPI_PS_INPUT_CNTL_1) diff --git a/llvm/test/MC/AMDGPU/wave_any.s b/llvm/test/MC/AMDGPU/wave_any.s index 27502eff89bfc..3c265db30a324 100644 --- a/llvm/test/MC/AMDGPU/wave_any.s +++ b/llvm/test/MC/AMDGPU/wave_any.s @@ -1,13 +1,14 @@ +// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 6 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX10 %s v_cmp_ge_i32_e32 s0, v0 -// GFX10: v_cmp_ge_i32_e32 vcc_lo, s0, v0 ; encoding: [0x00,0x00,0x0c,0x7d] +// GFX10: v_cmp_ge_i32_e32 vcc_lo, s0, v0 ; encoding: [0x00,0x00,0x0c,0x7d] v_cmp_ge_i32_e32 vcc_lo, s0, v1 -// GFX10: v_cmp_ge_i32_e32 vcc_lo, s0, v1 ; encoding: [0x00,0x02,0x0c,0x7d] +// GFX10: v_cmp_ge_i32_e32 vcc_lo, 
s0, v1 ; encoding: [0x00,0x02,0x0c,0x7d] v_cmp_ge_i32_e32 vcc, s0, v2 -// GFX10: v_cmp_ge_i32_e32 vcc_lo, s0, v2 ; encoding: [0x00,0x04,0x0c,0x7d] +// GFX10: v_cmp_ge_i32_e32 vcc_lo, s0, v2 ; encoding: [0x00,0x04,0x0c,0x7d] v_cmp_le_f16_sdwa s0, v3, v4 src0_sel:WORD_1 src1_sel:DWORD // GFX10: v_cmp_le_f16_sdwa s0, v3, v4 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x08,0x96,0x7d,0x03,0x80,0x05,0x06] @@ -16,10 +17,10 @@ v_cmp_le_f16_sdwa s[0:1], v3, v4 src0_sel:WORD_1 src1_sel:DWORD // GFX10: v_cmp_le_f16_sdwa s[0:1], v3, v4 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x08,0x96,0x7d,0x03,0x80,0x05,0x06] v_cmp_class_f32_e32 vcc_lo, s0, v0 -// GFX10: v_cmp_class_f32_e32 vcc_lo, s0, v0 ; encoding: [0x00,0x00,0x10,0x7d] +// GFX10: v_cmp_class_f32_e32 vcc_lo, s0, v0 ; encoding: [0x00,0x00,0x10,0x7d] v_cmp_class_f32_e32 vcc, s0, v0 -// GFX10: v_cmp_class_f32_e32 vcc_lo, s0, v0 ; encoding: [0x00,0x00,0x10,0x7d] +// GFX10: v_cmp_class_f32_e32 vcc_lo, s0, v0 ; encoding: [0x00,0x00,0x10,0x7d] v_cmp_class_f16_sdwa vcc_lo, v1, v2 src0_sel:DWORD src1_sel:DWORD // GFX10: v_cmp_class_f16_sdwa vcc_lo, v1, v2 src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0x1e,0x7d,0x01,0x00,0x06,0x06] @@ -34,13 +35,13 @@ v_cmp_class_f16_sdwa s[0:1], v1, v2 src0_sel:DWORD src1_sel:DWORD // GFX10: v_cmp_class_f16_sdwa s[0:1], v1, v2 src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x04,0x1e,0x7d,0x01,0x80,0x06,0x06] v_cndmask_b32_e32 v1, v2, v3, -// GFX10: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; encoding: [0x02,0x07,0x02,0x02] +// GFX10: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; encoding: [0x02,0x07,0x02,0x02] v_cndmask_b32_e32 v1, v2, v3, vcc_lo -// GFX10: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; encoding: [0x02,0x07,0x02,0x02] +// GFX10: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; encoding: [0x02,0x07,0x02,0x02] v_cndmask_b32_e32 v1, v2, v3, vcc -// GFX10: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; encoding: [0x02,0x07,0x02,0x02] +// GFX10: v_cndmask_b32_e32 v1, v2, v3, vcc_lo ; encoding: [0x02,0x07,0x02,0x02] v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo // GFX10: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo ; encoding: [0x03,0x09,0x06,0x50] @@ -127,61 +128,61 @@ v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 ban // GFX10: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x00] v_add_co_u32 v0, s0, v0, v2 -// GFX10: v_add_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00] +// GFX10: v_add_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00] v_add_co_u32_e64 v0, s0, v0, v2 -// GFX10: v_add_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00] +// GFX10: v_add_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00] v_add_co_ci_u32_e64 v4, s0, v1, v5, s2 -// GFX10: v_add_co_ci_u32_e64 v4, s0, v1, v5, s2 ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0x0a,0x00] +// GFX10: v_add_co_ci_u32_e64 v4, s0, v1, v5, s2 ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0x0a,0x00] v_sub_co_u32 v0, s0, v0, v2 -// GFX10: v_sub_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00] +// GFX10: v_sub_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00] v_sub_co_u32_e64 v0, s0, v0, v2 -// GFX10: v_sub_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00] +// GFX10: v_sub_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00] v_sub_co_ci_u32_e64 v4, s0, v1, v5, 
s2 -// GFX10: v_sub_co_ci_u32_e64 v4, s0, v1, v5, s2 ; encoding: [0x04,0x00,0x29,0xd5,0x01,0x0b,0x0a,0x00] +// GFX10: v_sub_co_ci_u32_e64 v4, s0, v1, v5, s2 ; encoding: [0x04,0x00,0x29,0xd5,0x01,0x0b,0x0a,0x00] v_subrev_co_u32 v0, s0, v0, v2 -// GFX10: v_subrev_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00] +// GFX10: v_subrev_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00] v_subrev_co_u32_e64 v0, s0, v0, v2 -// GFX10: v_subrev_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00] +// GFX10: v_subrev_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00] v_subrev_co_ci_u32_e64 v4, s0, v1, v5, s2 // GFX10: v_subrev_co_ci_u32_e64 v4, s0, v1, v5, s2 ; encoding: [0x04,0x00,0x2a,0xd5,0x01,0x0b,0x0a,0x00] v_add_co_u32 v0, s[0:1], v0, v2 -// GFX10: v_add_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00] +// GFX10: v_add_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00] v_add_co_u32 v0, exec, v0, v2 -// GFX10: v_add_co_u32 v0, exec, v0, v2 ; encoding: [0x00,0x7e,0x0f,0xd7,0x00,0x05,0x02,0x00] +// GFX10: v_add_co_u32 v0, exec, v0, v2 ; encoding: [0x00,0x7e,0x0f,0xd7,0x00,0x05,0x02,0x00] v_add_co_u32 v0, exec_lo, v0, v2 -// GFX10: v_add_co_u32 v0, exec_lo, v0, v2 ; encoding: [0x00,0x7e,0x0f,0xd7,0x00,0x05,0x02,0x00] +// GFX10: v_add_co_u32 v0, exec_lo, v0, v2 ; encoding: [0x00,0x7e,0x0f,0xd7,0x00,0x05,0x02,0x00] v_add_co_u32_e64 v0, s[0:1], v0, v2 -// GFX10: v_add_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00] +// GFX10: v_add_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00] v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3] // GFX10: v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3] ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0x0a,0x00] v_sub_co_u32 v0, s[0:1], v0, v2 -// GFX10: v_sub_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00] +// GFX10: v_sub_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00] v_sub_co_u32_e64 v0, s[0:1], v0, v2 -// GFX10: v_sub_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00] +// GFX10: v_sub_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00] v_sub_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3] // GFX10: v_sub_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3] ; encoding: [0x04,0x00,0x29,0xd5,0x01,0x0b,0x0a,0x00] v_subrev_co_u32 v0, s[0:1], v0, v2 -// GFX10: v_subrev_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00] +// GFX10: v_subrev_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00] v_subrev_co_u32_e64 v0, s[0:1], v0, v2 -// GFX10: v_subrev_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00] +// GFX10: v_subrev_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00] v_subrev_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3] // GFX10: v_subrev_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3] ; encoding: [0x04,0x00,0x2a,0xd5,0x01,0x0b,0x0a,0x00] @@ -199,10 +200,10 @@ v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, vcc // GFX10: v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, vcc ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0xaa,0x01] v_div_scale_f32 v2, s2, v0, v0, v2 -// GFX10: v_div_scale_f32 v2, s2, v0, v0, v2 ; encoding: [0x02,0x02,0x6d,0xd5,0x00,0x01,0x0a,0x04] +// GFX10: v_div_scale_f32 v2, s2, v0, v0, v2 ; encoding: [0x02,0x02,0x6d,0xd5,0x00,0x01,0x0a,0x04] v_div_scale_f32 v2, 
s[2:3], v0, v0, v2 -// GFX10: v_div_scale_f32 v2, s[2:3], v0, v0, v2 ; encoding: [0x02,0x02,0x6d,0xd5,0x00,0x01,0x0a,0x04] +// GFX10: v_div_scale_f32 v2, s[2:3], v0, v0, v2 ; encoding: [0x02,0x02,0x6d,0xd5,0x00,0x01,0x0a,0x04] v_div_scale_f64 v[2:3], s2, v[0:1], v[0:1], v[2:3] // GFX10: v_div_scale_f64 v[2:3], s2, v[0:1], v[0:1], v[2:3] ; encoding: [0x02,0x02,0x6e,0xd5,0x00,0x01,0x0a,0x04] @@ -223,7 +224,7 @@ v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3] // GFX10: v_mad_u64_u32 v[0:1], s[6:7], v0, v1, v[2:3] ; encoding: [0x00,0x06,0x76,0xd5,0x00,0x03,0x0a,0x04] v_cmpx_neq_f32_e32 v0, v1 -// GFX10: v_cmpx_neq_f32_e32 v0, v1 ; encoding: [0x00,0x03,0x3a,0x7c] +// GFX10: v_cmpx_neq_f32_e32 v0, v1 ; encoding: [0x00,0x03,0x3a,0x7c] v_cmpx_neq_f32_sdwa v0, v1 src0_sel:WORD_1 src1_sel:DWORD // GFX10: v_cmpx_neq_f32_sdwa v0, v1 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0x3a,0x7c,0x00,0x00,0x05,0x06] @@ -232,7 +233,7 @@ v_cmpx_eq_u32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD // GFX10: v_cmpx_eq_u32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0xa5,0x7d,0x00,0x00,0x05,0x86] v_cmpx_class_f32_e64 v0, 1 -// GFX10: v_cmpx_class_f32_e64 v0, 1 ; encoding: [0x7e,0x00,0x98,0xd4,0x00,0x03,0x01,0x00] +// GFX10: v_cmpx_class_f32_e64 v0, 1 ; encoding: [0x7e,0x00,0x98,0xd4,0x00,0x03,0x01,0x00] v_cmpx_class_f32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD // GFX10: v_cmpx_class_f32_sdwa v0, 1 src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0x31,0x7d,0x00,0x00,0x05,0x86] diff --git a/llvm/test/MC/AMDGPU/wavesize-feature-unsupported-target.s b/llvm/test/MC/AMDGPU/wavesize-feature-unsupported-target.s new file mode 100644 index 0000000000000..3a8656c392ff5 --- /dev/null +++ b/llvm/test/MC/AMDGPU/wavesize-feature-unsupported-target.s @@ -0,0 +1,23 @@ +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize64 -o - %s | FileCheck -check-prefix=GFX1250 %s +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx900 -mattr=+wavefrontsize32 -o - %s | FileCheck -check-prefix=GFX900 %s + +// Make sure setting both modes is supported at the same time. +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,+wavefrontsize64 %s | FileCheck -check-prefixes=GFX10 %s + +// Test that there is no assertion when using an explicit +// wavefrontsize attribute on a target which does not support it. 
+ +// GFX1250: v_add_f64_e32 v[0:1], 1.0, v[0:1] +// GFX900: v_add_f64 v[0:1], 1.0, v[0:1] +// GFX10: v_add_f64 v[0:1], 1.0, v[0:1] +v_add_f64 v[0:1], 1.0, v[0:1] + +// GFX1250: v_cmp_eq_u32_e64 s[0:1], 1.0, s1 +// GFX900: v_cmp_eq_u32_e64 s[0:1], 1.0, s1 +// GFX10: v_cmp_eq_u32_e64 s[0:1], 1.0, s1 +v_cmp_eq_u32_e64 s[0:1], 1.0, s1 + +// GFX1250: v_cndmask_b32_e64 v1, v2, v3, s[0:1] +// GFX900: v_cndmask_b32_e64 v1, v2, v3, s[0:1] +// GFX10: v_cndmask_b32_e64 v1, v2, v3, s[0:1] +v_cndmask_b32 v1, v2, v3, s[0:1] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopc.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopc.txt index 2156a682337e8..336f4b2e88f47 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopc.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vopc.txt @@ -1,6 +1,6 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=W32 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=W64 %s - +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,+wavefrontsize64 -disassemble -show-encoding < %s | FileCheck -strict-whitespace -check-prefix=W32 %s # W32: v_cmp_class_f32_e32 vcc_lo, -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d] # W64: v_cmp_class_f32_e32 vcc, -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_salu_lit64.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_salu_lit64.txt index d2ec2133b1b88..7064479082b7a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_salu_lit64.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_salu_lit64.txt @@ -1,55 +1,56 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s -# GFX1250: s_mov_b64 s[2:3], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x01,0x82,0xbe,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x01,0x82,0xbe,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_mov_b64 s[2:3], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x01,0x82,0xbe,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_add_nc_u64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0xa9,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0x04,0xfe,0x82,0xa9,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_add_nc_u64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0xa9,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_and_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x8b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x04,0x82,0x8b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_and_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x8b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_and_not1_b64 s[2:3], lit64(0x10abcdef12345678), lit64(0x10abcdef12345678) ; encoding: [0xfe,0xfe,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfe,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_and_not1_b64 s[2:3], lit64(0x10abcdef12345678), lit64(0x10abcdef12345678) ; encoding: [0xfe,0xfe,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_and_not1_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x04,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 
+# GFX1250: s_and_not1_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x91,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_ashr_i64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x86,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x04,0x82,0x86,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_ashr_i64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x86,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_bfe_i64 s[2:3], lit64(0x80abcdef12345678), 5 ; encoding: [0xfe,0x85,0x82,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x80] 0xfe,0x85,0x82,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x80 +# GFX1250: s_bfe_i64 s[2:3], lit64(0x80abcdef12345678), 5 ; encoding: [0xfe,0x85,0x82,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x80] -# GFX1250: s_bfe_u64 s[2:3], lit64(0x10abcdef12345678), 5 ; encoding: [0xfe,0x85,0x02,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x85,0x02,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_bfe_u64 s[2:3], lit64(0x10abcdef12345678), 5 ; encoding: [0xfe,0x85,0x02,0x94,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_cselect_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x98,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0x04,0xfe,0x82,0x98,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_cselect_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x98,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_lshl_b64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x84,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x04,0x82,0x84,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_lshl_b64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x84,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_lshr_b64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x85,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x04,0x82,0x85,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_lshr_b64 s[2:3], lit64(0x10abcdef12345678), s4 ; encoding: [0xfe,0x04,0x82,0x85,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_mul_u64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0xaa,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x04,0x82,0xaa,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_mul_u64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0xaa,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_nand_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8e,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0x04,0xfe,0x82,0x8e,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_nand_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8e,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_nor_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0x04,0xfe,0x82,0x8f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_nor_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_or_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0x04,0xfe,0x82,0x8c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_or_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x8c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_or_not1_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: 
[0x04,0xfe,0x82,0x92,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0x04,0xfe,0x82,0x92,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_or_not1_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x92,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_xnor_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0x04,0xfe,0x82,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_xnor_b64 s[2:3], s[4:5], lit64(0x10abcdef12345678) ; encoding: [0x04,0xfe,0x82,0x90,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: s_xor_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x8d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x04,0x82,0x8d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: s_xor_b64 s[2:3], lit64(0x10abcdef12345678), s[4:5] ; encoding: [0xfe,0x04,0x82,0x8d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt index 963e69370a3ba..227e1c47b3d05 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sop1.txt @@ -1,34 +1,35 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s -# GFX1250: s_add_pc_i64 lit64(0x12345678abcd0) ; encoding: [0xfe,0x4b,0x80,0xbe,0xd0,0xbc,0x8a,0x67,0x45,0x23,0x01,0x00] 0xfe,0x4b,0x80,0xbe,0xd0,0xbc,0x8a,0x67,0x45,0x23,0x01,0x00 +# GFX1250: s_add_pc_i64 lit64(0x12345678abcd0) ; encoding: [0xfe,0x4b,0x80,0xbe,0xd0,0xbc,0x8a,0x67,0x45,0x23,0x01,0x00] -# GFX1250: s_add_pc_i64 0x64 ; encoding: [0xff,0x4b,0x80,0xbe,0x64,0x00,0x00,0x00] 0xff,0x4b,0x80,0xbe,0x64,0x00,0x00,0x00 +# GFX1250: s_add_pc_i64 0x64 ; encoding: [0xff,0x4b,0x80,0xbe,0x64,0x00,0x00,0x00] -# GFX1250: s_add_pc_i64 4 ; encoding: [0x84,0x4b,0x80,0xbe] 0x84,0x4b,0x80,0xbe +# GFX1250: s_add_pc_i64 4 ; encoding: [0x84,0x4b,0x80,0xbe] -# GFX1250: s_add_pc_i64 s[2:3] ; encoding: [0x02,0x4b,0x80,0xbe] 0x02,0x4b,0x80,0xbe +# GFX1250: s_add_pc_i64 s[2:3] ; encoding: [0x02,0x4b,0x80,0xbe] -# GFX1250: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE) ; encoding: [0x88,0x4c,0x82,0xbe] 0x88,0x4c,0x82,0xbe +# GFX1250: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE) ; encoding: [0x88,0x4c,0x82,0xbe] -# GFX1250: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE) ; encoding: [0x88,0x4d,0x82,0xbe] 0x88,0x4d,0x82,0xbe +# GFX1250: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE) ; encoding: [0x88,0x4d,0x82,0xbe] -# GFX1250: s_get_shader_cycles_u64 s[2:3] ; encoding: [0x00,0x06,0x82,0xbe] 0x00,0x06,0x82,0xbe +# GFX1250: s_get_shader_cycles_u64 s[2:3] ; encoding: [0x00,0x06,0x82,0xbe] -# GFX1250: s_barrier_signal -3 ; encoding: [0xc3,0x4e,0x80,0xbe] 0xc3,0x4e,0x80,0xbe +# GFX1250: s_barrier_signal -3 ; encoding: [0xc3,0x4e,0x80,0xbe] -# GFX1250: s_get_barrier_state s3, -3 ; encoding: [0xc3,0x50,0x83,0xbe] 0xc3,0x50,0x83,0xbe +# GFX1250: s_get_barrier_state s3, -3 ; encoding: [0xc3,0x50,0x83,0xbe] -# GFX1250: s_get_barrier_state s3, -4 ; encoding: [0xc4,0x50,0x83,0xbe] 0xc4,0x50,0x83,0xbe +# GFX1250: s_get_barrier_state s3, -4 ; encoding: [0xc4,0x50,0x83,0xbe] -# GFX1250: s_get_barrier_state s3, m0 ; encoding: [0x7d,0x50,0x83,0xbe] 0x7d,0x50,0x83,0xbe +# GFX1250: s_get_barrier_state s3, m0 ; 
encoding: [0x7d,0x50,0x83,0xbe] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_valu_lit64.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_valu_lit64.txt index 30650b4fa227f..1571fb96dcf49 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_valu_lit64.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_valu_lit64.txt @@ -1,232 +1,233 @@ +# NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s -# GFX1250: v_add_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x05,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xfd,0x05,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_add_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x05,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x30,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x30,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x30,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_class_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v255 ; encoding: [0xfe,0xfe,0xff,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfe,0xff,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_class_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v255 ; encoding: [0xfe,0xfe,0xff,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_eq_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x45,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x45,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_eq_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x45,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_ge_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x4d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_ge_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_gt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x49,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x49,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_gt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x49,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_gt_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xa9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_gt_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_gt_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xb9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_gt_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb9,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_le_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: 
[0xfe,0xfc,0x47,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x47,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_le_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x47,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_le_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xa7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_le_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_le_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xb7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_le_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb7,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_lg_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x4b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_lg_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_lt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x43,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x43,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_lt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x43,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_lt_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xa3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_lt_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_lt_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xb3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_lt_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb3,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_ne_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xab,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xab,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_ne_i64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xab,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_ne_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbb,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xbb,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_ne_u64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbb,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_neq_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x5b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_neq_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5b,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_nge_f64_e32 vcc_lo, 
lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x53,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x53,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_nge_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x53,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_ngt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x57,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x57,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_ngt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x57,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_nle_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x59,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x59,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_nle_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x59,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_nlg_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x55,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x55,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_nlg_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x55,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_nlt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x5d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_nlt_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5d,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_o_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4f,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x4f,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_o_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4f,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmp_u_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x51,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x51,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmp_u_f64_e32 vcc_lo, lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x51,0x7c,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_class_f64_e32 lit64(0x10abcdef12345678), v255 ; encoding: [0xfe,0xfe,0xff,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfe,0xff,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_class_f64_e32 lit64(0x10abcdef12345678), v255 ; encoding: [0xfe,0xfe,0xff,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_eq_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x45,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x45,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_eq_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x45,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_eq_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xa5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_eq_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_eq_u64_e32 
lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xb5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_eq_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb5,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_ge_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x4d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_ge_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_ge_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xad,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xad,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_ge_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xad,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_ge_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbd,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xbd,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_ge_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbd,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_gt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x49,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x49,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_gt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x49,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_gt_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xa9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_gt_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_gt_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xb9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_gt_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb9,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_le_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x47,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x47,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_le_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x47,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_le_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xa7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_le_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_le_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xb7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_le_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb7,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_lg_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 
0xfe,0xfc,0x4b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_lg_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_lt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x43,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x43,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_lt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x43,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_lt_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xa3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_lt_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xa3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_lt_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xb3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_lt_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xb3,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_ne_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xab,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xab,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_ne_i64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xab,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_ne_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbb,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xbb,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_ne_u64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xbb,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_neq_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x5b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_neq_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5b,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_nge_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x53,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x53,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_nge_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x53,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_ngt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x57,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x57,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_ngt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x57,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_nle_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x59,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x59,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_nle_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x59,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_nlg_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x55,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x55,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_nlg_f64_e32 
lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x55,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_nlt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x5d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_nlt_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x5d,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_o_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4f,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x4f,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_o_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x4f,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cmpx_u_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x51,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0x51,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cmpx_u_f64_e32 lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0x51,0x7d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cvt_f32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x1e,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x1e,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cvt_f32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x1e,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cvt_i32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x06,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x06,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cvt_i32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x06,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_cvt_u32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x2a,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x2a,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_cvt_u32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x2a,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_floor_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x34,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x34,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_floor_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x34,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_fract_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x7c,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x7c,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_fract_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x7c,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_frexp_exp_i32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x78,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x78,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_frexp_exp_i32_f64_e32 v255, lit64(0x10abcdef12345678) ; encoding: [0xfe,0x78,0xfe,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_frexp_mant_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x7a,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x7a,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_frexp_mant_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x7a,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_max_num_f64_e32 v[254:255], 
lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x1d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xfd,0x1d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_max_num_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x1d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_min_num_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x1b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xfd,0x1b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_min_num_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x1b,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_mul_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x0d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0xfc,0xfd,0x0d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_mul_f64_e32 v[254:255], lit64(0x10abcdef12345678), v[254:255] ; encoding: [0xfe,0xfc,0xfd,0x0d,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_rcp_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x5e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x5e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_rcp_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x5e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_rndne_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x32,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x32,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_rndne_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x32,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_rsq_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x62,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x62,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_rsq_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x62,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_sqrt_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x68,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x68,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_sqrt_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x68,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_trunc_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x2e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] 0xfe,0x2e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10 +# GFX1250: v_trunc_f64_e32 v[254:255], lit64(0x10abcdef12345678) ; encoding: [0xfe,0x2e,0xfc,0x7f,0x78,0x56,0x34,0x12,0xef,0xcd,0xab,0x10] -# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x4063233333333333) ; encoding: [0xfe,0x30,0xfc,0x7f,0x33,0x33,0x33,0x33,0x33,0x23,0x63,0x40] 0xfe,0x30,0xfc,0x7f,0x33,0x33,0x33,0x33,0x33,0x23,0x63,0x40 +# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x4063233333333333) ; encoding: [0xfe,0x30,0xfc,0x7f,0x33,0x33,0x33,0x33,0x33,0x23,0x63,0x40] -# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x448969368974c05b) ; encoding: [0xfe,0x30,0xfc,0x7f,0x5b,0xc0,0x74,0x89,0x36,0x69,0x89,0x44] 0xfe,0x30,0xfc,0x7f,0x5b,0xc0,0x74,0x89,0x36,0x69,0x89,0x44 +# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x448969368974c05b) ; encoding: [0xfe,0x30,0xfc,0x7f,0x5b,0xc0,0x74,0x89,0x36,0x69,0x89,0x44] -# GFX1250: v_ceil_f64_e32 v[254:255], 0x40632000 ; encoding: [0xff,0x30,0xfc,0x7f,0x00,0x20,0x63,0x40] 
0xff,0x30,0xfc,0x7f,0x00,0x20,0x63,0x40 +# GFX1250: v_ceil_f64_e32 v[254:255], 0x40632000 ; encoding: [0xff,0x30,0xfc,0x7f,0x00,0x20,0x63,0x40] -# GFX1250: v_mov_b64_e32 v[0:1], 0x12345678 ; encoding: [0xff,0x3a,0x00,0x7e,0x78,0x56,0x34,0x12] 0xff,0x3a,0x00,0x7e,0x78,0x56,0x34,0x12 +# GFX1250: v_mov_b64_e32 v[0:1], 0x12345678 ; encoding: [0xff,0x3a,0x00,0x7e,0x78,0x56,0x34,0x12] -# GFX1250: v_ceil_f64_e32 v[254:255], 0.15915494309189532 ; encoding: [0xf8,0x30,0xfc,0x7f] 0xf8,0x30,0xfc,0x7f +# GFX1250: v_ceil_f64_e32 v[254:255], 0.15915494309189532 ; encoding: [0xf8,0x30,0xfc,0x7f] -# GFX1250: v_ceil_f64_e32 v[254:255], -4.0 ; encoding: [0xf7,0x30,0xfc,0x7f] 0xf7,0x30,0xfc,0x7f +# GFX1250: v_ceil_f64_e32 v[254:255], -4.0 ; encoding: [0xf7,0x30,0xfc,0x7f] -# GFX1250: v_ceil_f64_e32 v[254:255], 2.0 ; encoding: [0xf4,0x30,0xfc,0x7f] 0xf4,0x30,0xfc,0x7f +# GFX1250: v_ceil_f64_e32 v[254:255], 2.0 ; encoding: [0xf4,0x30,0xfc,0x7f] -# GFX1250: v_ceil_f64_e32 v[254:255], 0 ; encoding: [0x80,0x30,0xfc,0x7f] 0x80,0x30,0xfc,0x7f +# GFX1250: v_ceil_f64_e32 v[254:255], 0 ; encoding: [0x80,0x30,0xfc,0x7f] -# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x7b) ; encoding: [0xfe,0x30,0xfc,0x7f,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00] 0xfe,0x30,0xfc,0x7f,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x7b) ; encoding: [0xfe,0x30,0xfc,0x7f,0x7b,0x00,0x00,0x00,0x00,0x00,0x00,0x00] -# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x109a) ; encoding: [0xfe,0x30,0xfc,0x7f,0x9a,0x10,0x00,0x00,0x00,0x00,0x00,0x00] 0xfe,0x30,0xfc,0x7f,0x9a,0x10,0x00,0x00,0x00,0x00,0x00,0x00 +# GFX1250: v_ceil_f64_e32 v[254:255], lit64(0x109a) ; encoding: [0xfe,0x30,0xfc,0x7f,0x9a,0x10,0x00,0x00,0x00,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_wave64_feature.s b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_wave64_feature.s new file mode 100644 index 0000000000000..bdea636a9efe3 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_wave64_feature.s @@ -0,0 +1,13 @@ +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize64 -disassemble -o - %s | FileCheck %s + +# Make sure there's no assertion when trying to use an unsupported +# wave64 on a wave32-only target + +# CHECK: v_add_f64_e32 v[0:1], 1.0, v[0:1] +0xf2,0x00,0x00,0x04 + +# CHECK: v_cmp_eq_u32_e64 s[0:1], 1.0, s1 +0x00,0x00,0x4a,0xd4,0xf2,0x02,0x00,0x00 + +# CHECK: v_cndmask_b32_e64 v1, v2, v3, s[0:1] +0x01,0x00,0x01,0xd5,0x02,0x07,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt index 49fa263f6bbf8..41c5724a596f9 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt @@ -82,7 +82,7 @@ # GFX12: s_getreg_b32 s0, hwreg(52, 8, 3) ; encoding: [0x34,0x12,0x80,0xb8] 0x34,0x12,0x80,0xb8 -# GFX12: s_getreg_b32 s0, hwreg(HW_REG_EXCP_FLAG_PRIV, 7, 25) ; encoding: [0xd1,0xc1,0x80,0xb8] +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, 7, 25) ; encoding: [0xd1,0xc1,0x80,0xb8] 0xd1,0xc1,0x80,0xb8 # GFX12: s_getreg_b32 s105, hwreg(52, 8, 3) ; encoding: [0x34,0x12,0xe9,0xb8] @@ -163,7 +163,7 @@ # GFX12: s_setreg_b32 hwreg(52, 8, 3), vcc_lo ; encoding: [0x34,0x12,0x6a,0xb9] 0x34,0x12,0x6a,0xb9 -# GFX12: s_setreg_b32 hwreg(HW_REG_EXCP_FLAG_PRIV, 7, 25), s0 ; encoding: [0xd1,0xc1,0x00,0xb9] +# GFX12: s_setreg_b32 hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV, 7, 25), s0 ; encoding: [0xd1,0xc1,0x00,0xb9] 0xd1,0xc1,0x00,0xb9 # GFX12: s_version 0x1234 ; encoding: 
[0x34,0x12,0x80,0xb0] @@ -187,43 +187,43 @@ # GFX12: s_version ((128|UC_VERSION_W64_BIT)|UC_VERSION_W32_BIT)|UC_VERSION_MDP_BIT ; encoding: [0x80,0xe0,0x80,0xb0] 0x80,0xe0,0x80,0xb0 -# GFX12: s_setreg_imm32_b32 hwreg(HW_REG_MODE), 0xaf123456 ; encoding: [0x01,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf] +# GFX12: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE), 0xaf123456 ; encoding: [0x01,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf] 0x01,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf -# GFX12: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 31, 1), 0xaf123456 ; encoding: [0xc1,0x07,0x80,0xb9,0x56,0x34,0x12,0xaf] +# GFX12: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 31, 1), 0xaf123456 ; encoding: [0xc1,0x07,0x80,0xb9,0x56,0x34,0x12,0xaf] 0xc1,0x07,0x80,0xb9,0x56,0x34,0x12,0xaf -# GFX12: s_setreg_imm32_b32 hwreg(HW_REG_STATUS), 0xaf123456 ; encoding: [0x02,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf] +# GFX12: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_STATUS), 0xaf123456 ; encoding: [0x02,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf] 0x02,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf -# GFX12: s_setreg_imm32_b32 hwreg(HW_REG_GPR_ALLOC), 0xaf123456 ; encoding: [0x05,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf] +# GFX12: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_GPR_ALLOC), 0xaf123456 ; encoding: [0x05,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf] 0x05,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf -# GFX12: s_setreg_imm32_b32 hwreg(HW_REG_LDS_ALLOC), 0xaf123456 ; encoding: [0x06,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf] +# GFX12: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_LDS_ALLOC), 0xaf123456 ; encoding: [0x06,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf] 0x06,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf # GFX12: s_setreg_imm32_b32 hwreg(HW_REG_IB_STS), 0xaf123456 ; encoding: [0x07,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf] 0x07,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf -# GFX12: s_setreg_imm32_b32 hwreg(HW_REG_HW_ID1), 0xaf123456 ; encoding: [0x17,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf] +# GFX12: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_HW_ID1), 0xaf123456 ; encoding: [0x17,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf] 0x17,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf -# GFX12: s_setreg_imm32_b32 hwreg(HW_REG_HW_ID2), 0xaf123456 ; encoding: [0x18,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf] +# GFX12: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_HW_ID2), 0xaf123456 ; encoding: [0x18,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf] 0x18,0xf8,0x80,0xb9,0x56,0x34,0x12,0xaf -# GFX12: s_getreg_b32 s0, hwreg(HW_REG_MODE) ; encoding: [0x01,0xf8,0x80,0xb8] +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_MODE) ; encoding: [0x01,0xf8,0x80,0xb8] 0x01,0xf8,0x80,0xb8 -# GFX12: s_getreg_b32 s0, hwreg(HW_REG_STATUS) ; encoding: [0x02,0xf8,0x80,0xb8] +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_STATUS) ; encoding: [0x02,0xf8,0x80,0xb8] 0x02,0xf8,0x80,0xb8 -# GFX12: s_getreg_b32 s0, hwreg(HW_REG_STATE_PRIV) ; encoding: [0x04,0xf8,0x80,0xb8] +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_STATE_PRIV) ; encoding: [0x04,0xf8,0x80,0xb8] 0x04,0xf8,0x80,0xb8 -# GFX12: s_getreg_b32 s0, hwreg(HW_REG_GPR_ALLOC) ; encoding: [0x05,0xf8,0x80,0xb8] +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_GPR_ALLOC) ; encoding: [0x05,0xf8,0x80,0xb8] 0x05,0xf8,0x80,0xb8 -# GFX12: s_getreg_b32 s0, hwreg(HW_REG_LDS_ALLOC) ; encoding: [0x06,0xf8,0x80,0xb8] +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_LDS_ALLOC) ; encoding: [0x06,0xf8,0x80,0xb8] 0x06,0xf8,0x80,0xb8 # GFX12: s_getreg_b32 s0, hwreg(HW_REG_IB_STS) ; encoding: [0x07,0xf8,0x80,0xb8] @@ -244,31 +244,31 @@ # GFX12: s_getreg_b32 s0, hwreg(HW_REG_PERF_SNAPSHOT_DATA2) ; encoding: [0x10,0xf8,0x80,0xb8] 0x10,0xf8,0x80,0xb8 -# GFX12: s_getreg_b32 s0, hwreg(HW_REG_EXCP_FLAG_PRIV) ; encoding: 
[0x11,0xf8,0x80,0xb8] +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_EXCP_FLAG_PRIV) ; encoding: [0x11,0xf8,0x80,0xb8] 0x11,0xf8,0x80,0xb8 -# GFX12: s_getreg_b32 s0, hwreg(HW_REG_EXCP_FLAG_USER) ; encoding: [0x12,0xf8,0x80,0xb8] +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_EXCP_FLAG_USER) ; encoding: [0x12,0xf8,0x80,0xb8] 0x12,0xf8,0x80,0xb8 -# GFX12: s_getreg_b32 s0, hwreg(HW_REG_TRAP_CTRL) ; encoding: [0x13,0xf8,0x80,0xb8] +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_TRAP_CTRL) ; encoding: [0x13,0xf8,0x80,0xb8] 0x13,0xf8,0x80,0xb8 -# GFX12: s_getreg_b32 s0, hwreg(HW_REG_SCRATCH_BASE_LO) ; encoding: [0x14,0xf8,0x80,0xb8] +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_SCRATCH_BASE_LO) ; encoding: [0x14,0xf8,0x80,0xb8] 0x14,0xf8,0x80,0xb8 -# GFX12: s_getreg_b32 s0, hwreg(HW_REG_SCRATCH_BASE_HI) ; encoding: [0x15,0xf8,0x80,0xb8] +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_SCRATCH_BASE_HI) ; encoding: [0x15,0xf8,0x80,0xb8] 0x15,0xf8,0x80,0xb8 -# GFX12: s_getreg_b32 s0, hwreg(HW_REG_HW_ID1) ; encoding: [0x17,0xf8,0x80,0xb8] +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_HW_ID1) ; encoding: [0x17,0xf8,0x80,0xb8] 0x17,0xf8,0x80,0xb8 -# GFX12: s_getreg_b32 s0, hwreg(HW_REG_HW_ID2) ; encoding: [0x18,0xf8,0x80,0xb8] +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_HW_ID2) ; encoding: [0x18,0xf8,0x80,0xb8] 0x18,0xf8,0x80,0xb8 -# GFX12: s_getreg_b32 s0, hwreg(HW_REG_DVGPR_ALLOC_LO) ; encoding: [0x1f,0xf8,0x80,0xb8] +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_DVGPR_ALLOC_LO) ; encoding: [0x1f,0xf8,0x80,0xb8] 0x1f,0xf8,0x80,0xb8 -# GFX12: s_getreg_b32 s0, hwreg(HW_REG_DVGPR_ALLOC_HI) ; encoding: [0x20,0xf8,0x80,0xb8] +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_DVGPR_ALLOC_HI) ; encoding: [0x20,0xf8,0x80,0xb8] 0x20,0xf8,0x80,0xb8 # GFX12: s_getreg_b32 s0, hwreg(HW_REG_SHADER_CYCLES_LO) ; encoding: [0x1d,0xf8,0x80,0xb8] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_wave32_feature.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_wave32_feature.txt new file mode 100644 index 0000000000000..40494b3dfa1ea --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_wave32_feature.txt @@ -0,0 +1,13 @@ +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx900 -mattr=+wavefrontsize32 -disassemble -o - %s | FileCheck %s + +# Make sure there's no assertion when trying to use an unsupported +# wave32 on a wave64-only target + +# CHECK: v_add_f64 v[0:1], 1.0, v[0:1] +0x00,0x00,0x80,0xd2,0xf2,0x00,0x02,0x00 + +# CHECK: v_cmp_eq_u32_e64 s[0:1], 1.0, s1 +0x00,0x00,0xca,0xd0,0xf2,0x02,0x00,0x00 + +# CHECK: v_cndmask_b32_e64 v1, v2, v3, s[0:1] +0x01,0x00,0x00,0xd1,0x02,0x07,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt index da3601b00b199..a34e7f54c2234 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt @@ -231,3 +231,45 @@ #CHECK: vucmprhh 1, 3, 6 0x10,0x23,0x31,0x03 + +#CHECK: xxaes192encp 8, 10, 14 +0xf1,0x0b,0x76,0x10 + +#CHECK: xxaes256decp 14, 10, 6 +0xf1,0xca,0x3e,0x50 + +#CHECK: xxaes128genlkp 4, 8 +0xf0,0x80,0x46,0x90 + +#CHECK: xxgfmul128gcm 7, 5, 4 +0xf0,0xe5,0x26,0xd0 + +#CHECK: xvadduwm 4, 5, 7 +0xf0,0x85,0x3c,0x18 + +#CHECK: xvadduhm 4, 5, 7 +0xf0,0x85,0x3c,0x58 + +#CHECK: xvsubuwm 4, 5, 7 +0xf0,0x85,0x3c,0x98 + +#CHECK: xvsubuhm 4, 5, 7 +0xf0,0x85,0x3c,0xd8 + +#CHECK: xvmuluwm 4, 5, 7 +0xf0,0x85,0x3d,0x18 + +#CHECK: xvmuluhm 4, 5, 7 +0xf0,0x85,0x3d,0x58 + +#CHECK: xvmulhsw 4, 5, 7 +0xf0,0x85,0x3d,0x98 + +#CHECK: xvmulhsh 4, 5, 7 
+0xf0,0x85,0x3d,0xd8 + +#CHECK: xvmulhuw 4, 5, 7 +0xf0,0x85,0x3b,0x90 + +#CHECK: xvmulhuh 4, 5, 7 +0xf0,0x85,0x3b,0xd0 diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt index 66d05043301b6..9cefe2451b0e3 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt @@ -225,3 +225,45 @@ #CHECK: vucmprhh 1, 3, 6 0x03,0x31,0x23,0x10 + +#CHECK: xxaes192encp 8, 10, 14 +0x10,0x76,0x0b,0xf1 + +#CHECK: xxaes256decp 14, 10, 6 +0x50,0x3e,0xca,0xf1 + +#CHECK: xxaes128genlkp 4, 8 +0x90,0x46,0x80,0xf0 + +#CHECK: xxgfmul128gcm 7, 5, 4 +0xd0,0x26,0xe5,0xf0 + +#CHECK: xvadduwm 4, 5, 7 +0x18,0x3c,0x85,0xf0 + +#CHECK: xvadduhm 4, 5, 7 +0x58,0x3c,0x85,0xf0 + +#CHECK: xvsubuwm 4, 5, 7 +0x98,0x3c,0x85,0xf0 + +#CHECK: xvsubuhm 4, 5, 7 +0xd8,0x3c,0x85,0xf0 + +#CHECK: xvmuluwm 4, 5, 7 +0x18,0x3d,0x85,0xf0 + +#CHECK: xvmuluhm 4, 5, 7 +0x58,0x3d,0x85,0xf0 + +#CHECK: xvmulhsw 4, 5, 7 +0x98,0x3d,0x85,0xf0 + +#CHECK: xvmulhsh 4, 5, 7 +0xd8,0x3d,0x85,0xf0 + +#CHECK: xvmulhuw 4, 5, 7 +0x90,0x3b,0x85,0xf0 + +#CHECK: xvmulhuh 4, 5, 7 +0xd0,0x3b,0x85,0xf0 diff --git a/llvm/test/MC/ELF/cfi-sframe-fre-cases.s b/llvm/test/MC/ELF/cfi-sframe-fre-cases.s index 6d9e8c1b6480f..eeaa4021ceefd 100644 --- a/llvm/test/MC/ELF/cfi-sframe-fre-cases.s +++ b/llvm/test/MC/ELF/cfi-sframe-fre-cases.s @@ -17,7 +17,7 @@ fde4_fre_offset_sizes: # CHECK: FuncDescEntry [0] { # CHECK: Start FRE Offset: 0 # CHECK: FRE Type: Addr1 (0x0) - .cfi_startproc + .cfi_startproc # CHECK: Frame Row Entry { # CHECK-NEXT: Start Address: 0x0 # CHECK-NEXT: Return Address Signed: No @@ -27,9 +27,9 @@ fde4_fre_offset_sizes: # CHECK-NEXT: RA Offset: -8 .long 0 # Uninteresting register no new fre, no effect on cfa - .cfi_offset 0, 8 + .cfi_offset 0, 8 .long 0 - .cfi_def_cfa_offset 0x78 + .cfi_def_cfa_offset 0x78 # CHECK: Frame Row Entry { # CHECK-NEXT: Start Address: 0x8 # CHECK-NEXT: Return Address Signed: No @@ -37,11 +37,11 @@ fde4_fre_offset_sizes: # CHECK-NEXT: Base Register: SP (0x1) # CHECK-NEXT: CFA Offset: 120 # CHECK-NEXT: RA Offset: -8 - .long 0 + .long 0 # Uninteresting register no new fre, no effect on cfa .cfi_rel_offset 1, 8 .long 0 - .cfi_def_cfa_offset 0x80 + .cfi_def_cfa_offset 0x80 # CHECK: Frame Row Entry { # CHECK-NEXT: Start Address: 0x10 # CHECK-NEXT: Return Address Signed: No @@ -49,11 +49,11 @@ fde4_fre_offset_sizes: # CHECK-NEXT: Base Register: SP (0x1) # CHECK-NEXT: CFA Offset: 128 # CHECK-NEXT: RA Offset: -8 - .long 0 + .long 0 # Uninteresting register no new fre, no effect on cfa .cfi_val_offset 1, 8 .long 0 - .cfi_def_cfa_offset 0x7FFF + .cfi_def_cfa_offset 0x7FFF # CHECK: Frame Row Entry { # CHECK-NEXT: Start Address: 0x18 # CHECK-NEXT: Return Address Signed: No @@ -61,8 +61,8 @@ fde4_fre_offset_sizes: # CHECK-NEXT: Base Register: SP (0x1) # CHECK-NEXT: CFA Offset: 32767 # CHECK-NEXT: RA Offset: -8 - .long 0 - .cfi_def_cfa_offset 0x8000 + .long 0 + .cfi_def_cfa_offset 0x8000 # CHECK: Frame Row Entry { # CHECK-NEXT: Start Address: 0x1C # CHECK-NEXT: Return Address Signed: No @@ -70,8 +70,8 @@ fde4_fre_offset_sizes: # CHECK-NEXT: Base Register: SP (0x1) # CHECK-NEXT: CFA Offset: 32768 # CHECK-NEXT: RA Offset: -8 - .long 0 - .cfi_def_cfa_offset 0x8 + .long 0 + .cfi_def_cfa_offset 0x8 # CHECK: Frame Row Entry { # CHECK-NEXT: Start Address: 0x20 # CHECK-NEXT: Return Address Signed: No @@ -79,8 +79,8 @@ fde4_fre_offset_sizes: # CHECK-NEXT: Base Register: SP (0x1) # 
CHECK-NEXT: CFA Offset: 8 # CHECK-NEXT: RA Offset: -8 - .long 0 - .cfi_adjust_cfa_offset 0x8 + .long 0 + .cfi_adjust_cfa_offset 0x8 # CHECK: Frame Row Entry { # CHECK-NEXT: Start Address: 0x24 # CHECK-NEXT: Return Address Signed: No @@ -88,8 +88,8 @@ fde4_fre_offset_sizes: # CHECK-NEXT: Base Register: SP (0x1) # CHECK-NEXT: CFA Offset: 16 # CHECK-NEXT: RA Offset: -8 - .long 0 - .cfi_def_cfa_register 6 # switch to fp + .long 0 + .cfi_def_cfa_register 6 # switch to fp # CHECK: Frame Row Entry { # CHECK-NEXT: Start Address: 0x28 # CHECK-NEXT: Return Address Signed: No @@ -97,10 +97,10 @@ fde4_fre_offset_sizes: # CHECK-NEXT: Base Register: FP (0x0) # CHECK-NEXT: CFA Offset: 16 # CHECK-NEXT: RA Offset: -8 - .long 0 - .cfi_offset 7, 32 - # sp not the cfa but with large offset still changes encoding. - .cfi_offset 6, 0x7FF8 + .long 0 + .cfi_offset 7, 32 + # sp not the cfa but with large offset still changes encoding. + .cfi_offset 6, 0x7FF8 # CHECK: Frame Row Entry { # CHECK-NEXT: Start Address: 0x2C # CHECK-NEXT: Return Address Signed: No @@ -109,5 +109,75 @@ fde4_fre_offset_sizes: # CHECK-NEXT: CFA Offset: 16 # CHECK-NEXT: RA Offset: -8 # CHECK-NEXT: FP Offset: 32760 - .long 0 + .long 0 + .cfi_endproc + + .align 1024 +restore_reg: +# CHECK: FuncDescEntry [1] { +# CHECK: Start FRE Offset: 0x23 +# CHECK-NEXT: Num FREs: 3 + .cfi_startproc +# CHECK: Frame Row Entry { +# CHECK-NEXT: Start Address: 0x400 +# CHECK-NOT FP Offset{{.*}} +# CHECK: } + .long 0 + .cfi_offset 6, 32 +# CHECK Frame Row Entry { +# CHECK-NEXT Start Address: 0x404 +# CHECK: FP Offset: 32 + .long 0 + .cfi_restore 6 +# CHECK: Frame Row Entry { +# CHECK-NEXT: Start Address: 0x408 +# CHECK-NOT FP Offset{{.*}} +# CHECK: } + .long 0 + .cfi_endproc + + .align 1024 +remember_restore_state: +# CHECK: FuncDescEntry [2] { +# CHECK: Start FRE Offset: 0x2D +# CHECK-NEXT: Num FREs: 4 + .cfi_startproc +# CHECK: Frame Row Entry { +# CHECK-NEXT: Start Address: 0x800 +# CHECK-NOT FP Offset{{.*}} +# CHECK: } + .long 0 + .cfi_offset 6, 8 + .cfi_offset 7, 16 + .cfi_offset 8, 24 +# CHECK: Frame Row Entry { +# CHECK-NEXT: Start Address: 0x804 +# CHECK: Base Register: SP (0x1) +# CHECK-NEXT: CFA Offset: 8 +# CHECK-NEXT: RA Offset: -8 +# CHECK-NEXT: FP Offset: 8 +# CHECK-NEXT: } + .long 0 + .cfi_remember_state +# CHECK: Frame Row Entry { +# CHECK-NEXT: Start Address: 0x808 +# CHECK: Base Register: SP (0x1) +# CHECK-NEXT: CFA Offset: 8 +# CHECK-NEXT: RA Offset: -8 +# CHECK-NEXT: FP Offset: 32 +# CHECK-NEXT: } + .cfi_offset 6, 32 + .cfi_offset 7, 40 + .cfi_offset 8, 48 + .long 0 +# CHECK: Frame Row Entry { +# CHECK-NEXT: Start Address: 0x80C +# CHECK: Base Register: SP (0x1) +# CHECK-NEXT: CFA Offset: 8 +# CHECK-NEXT: RA Offset: -8 +# CHECK-NEXT: FP Offset: 8 +# CHECK-NEXT: } + .cfi_restore_state + .long 0 + .cfi_endproc diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s index 6ae7bd785773d..f01d6fa697d89 100644 --- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s +++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s @@ -330,3 +330,59 @@ vucmprhh 1, 3, 6 #CHECK-BE: vucmprhh 1, 3, 6 # encoding: [0x10,0x23,0x31,0x03] #CHECK-LE: vucmprhh 1, 3, 6 # encoding: [0x03,0x31,0x23,0x10] + + xxaes192encp 8, 10, 14 +#CHECK-BE: xxaes192encp 8, 10, 14 # encoding: [0xf1,0x0b,0x76,0x10] +#CHECK-LE: xxaes192encp 8, 10, 14 # encoding: [0x10,0x76,0x0b,0xf1] + + xxaes256decp 14, 10, 6 +#CHECK-BE: xxaes256decp 14, 10, 6 # encoding: [0xf1,0xca,0x3e,0x50] +#CHECK-LE: xxaes256decp 14, 10, 6 # encoding: [0x50,0x3e,0xca,0xf1] + + 
xxaes128genlkp 4, 8 +#CHECK-BE: xxaes128genlkp 4, 8 # encoding: [0xf0,0x80,0x46,0x90] +#CHECK-LE: xxaes128genlkp 4, 8 # encoding: [0x90,0x46,0x80,0xf0] + + xxgfmul128gcm 7, 5, 4 +#CHECK-BE: xxgfmul128gcm 7, 5, 4 # encoding: [0xf0,0xe5,0x26,0xd0] +#CHECK-LE: xxgfmul128gcm 7, 5, 4 # encoding: [0xd0,0x26,0xe5,0xf0] + + xvadduwm 4, 5, 7 +#CHECK-BE: xvadduwm 4, 5, 7 # encoding: [0xf0,0x85,0x3c,0x18] +#CHECK-LE: xvadduwm 4, 5, 7 # encoding: [0x18,0x3c,0x85,0xf0] + + xvadduhm 4, 5, 7 +#CHECK-BE: xvadduhm 4, 5, 7 # encoding: [0xf0,0x85,0x3c,0x58] +#CHECK-LE: xvadduhm 4, 5, 7 # encoding: [0x58,0x3c,0x85,0xf0] + + xvsubuwm 4, 5, 7 +#CHECK-BE: xvsubuwm 4, 5, 7 # encoding: [0xf0,0x85,0x3c,0x98] +#CHECK-LE: xvsubuwm 4, 5, 7 # encoding: [0x98,0x3c,0x85,0xf0] + + xvsubuhm 4, 5, 7 +#CHECK-BE: xvsubuhm 4, 5, 7 # encoding: [0xf0,0x85,0x3c,0xd8] +#CHECK-LE: xvsubuhm 4, 5, 7 # encoding: [0xd8,0x3c,0x85,0xf0] + + xvmuluwm 4, 5, 7 +#CHECK-BE: xvmuluwm 4, 5, 7 # encoding: [0xf0,0x85,0x3d,0x18] +#CHECK-LE: xvmuluwm 4, 5, 7 # encoding: [0x18,0x3d,0x85,0xf0] + + xvmuluhm 4, 5, 7 +#CHECK-BE: xvmuluhm 4, 5, 7 # encoding: [0xf0,0x85,0x3d,0x58] +#CHECK-LE: xvmuluhm 4, 5, 7 # encoding: [0x58,0x3d,0x85,0xf0] + + xvmulhsw 4, 5, 7 +#CHECK-BE: xvmulhsw 4, 5, 7 # encoding: [0xf0,0x85,0x3d,0x98] +#CHECK-LE: xvmulhsw 4, 5, 7 # encoding: [0x98,0x3d,0x85,0xf0] + + xvmulhsh 4, 5, 7 +#CHECK-BE: xvmulhsh 4, 5, 7 # encoding: [0xf0,0x85,0x3d,0xd8] +#CHECK-LE: xvmulhsh 4, 5, 7 # encoding: [0xd8,0x3d,0x85,0xf0] + + xvmulhuw 4, 5, 7 +#CHECK-BE: xvmulhuw 4, 5, 7 # encoding: [0xf0,0x85,0x3b,0x90] +#CHECK-LE: xvmulhuw 4, 5, 7 # encoding: [0x90,0x3b,0x85,0xf0] + + xvmulhuh 4, 5, 7 +#CHECK-BE: xvmulhuh 4, 5, 7 # encoding: [0xf0,0x85,0x3b,0xd0] +#CHECK-LE: xvmulhuh 4, 5, 7 # encoding: [0xd0,0x3b,0x85,0xf0] diff --git a/llvm/test/MC/RISCV/xandesvsinth-valid.s b/llvm/test/MC/RISCV/xandesvsinth-valid.s new file mode 100644 index 0000000000000..387bb116fe86f --- /dev/null +++ b/llvm/test/MC/RISCV/xandesvsinth-valid.s @@ -0,0 +1,60 @@ +# XAndesVSIntLoad - Andes Vector INT4 Load Extension +# RUN: llvm-mc %s -triple=riscv32 -mattr=+xandesvsinth -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM %s +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+xandesvsinth < %s \ +# RUN: | llvm-objdump --mattr=+xandesvsinth -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-OBJ %s +# RUN: not llvm-mc -triple=riscv32 -show-encoding %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR +# RUN: llvm-mc %s -triple=riscv64 -mattr=+xandesvsinth -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM %s +# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+xandesvsinth < %s \ +# RUN: | llvm-objdump --mattr=+xandesvsinth -M no-aliases -d -r - \ +# RUN: | FileCheck -check-prefixes=CHECK-OBJ %s +# RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR +# CHECK-OBJ: nds.vfwcvt.f.n.v v8, v10 +# CHECK-ASM: nds.vfwcvt.f.n.v v8, v10 +# CHECK-ASM: encoding: [0x5b,0x44,0xa2,0x02] +# CHECK-ERROR: instruction requires the following: 'XAndesVSIntH' (Andes Vector Small INT Handling Extension){{$}} +nds.vfwcvt.f.n.v v8, v10 +# CHECK-OBJ: nds.vfwcvt.f.n.v v8, v10, v0.t +# CHECK-ASM: nds.vfwcvt.f.n.v v8, v10, v0.t +# CHECK-ASM: encoding: [0x5b,0x44,0xa2,0x00] +# CHECK-ERROR: instruction requires the following: 'XAndesVSIntH' (Andes Vector Small INT Handling Extension){{$}} +nds.vfwcvt.f.n.v v8, v10, v0.t +# CHECK-OBJ: nds.vfwcvt.f.nu.v v8, v10 +# CHECK-ASM: nds.vfwcvt.f.nu.v v8, v10 +# 
CHECK-ASM: encoding: [0x5b,0xc4,0xa2,0x02] +# CHECK-ERROR: instruction requires the following: 'XAndesVSIntH' (Andes Vector Small INT Handling Extension){{$}} +nds.vfwcvt.f.nu.v v8, v10 +# CHECK-OBJ: nds.vfwcvt.f.nu.v v8, v10, v0.t +# CHECK-ASM: nds.vfwcvt.f.nu.v v8, v10, v0.t +# CHECK-ASM: encoding: [0x5b,0xc4,0xa2,0x00] +# CHECK-ERROR: instruction requires the following: 'XAndesVSIntH' (Andes Vector Small INT Handling Extension){{$}} +nds.vfwcvt.f.nu.v v8, v10, v0.t +# CHECK-OBJ: nds.vfwcvt.f.b.v v8, v10 +# CHECK-ASM: nds.vfwcvt.f.b.v v8, v10 +# CHECK-ASM: encoding: [0x5b,0x44,0xa3,0x02] +# CHECK-ERROR: instruction requires the following: 'XAndesVSIntH' (Andes Vector Small INT Handling Extension){{$}} +nds.vfwcvt.f.b.v v8, v10 +# CHECK-OBJ: nds.vfwcvt.f.b.v v8, v10, v0.t +# CHECK-ASM: nds.vfwcvt.f.b.v v8, v10, v0.t +# CHECK-ASM: encoding: [0x5b,0x44,0xa3,0x00] +# CHECK-ERROR: instruction requires the following: 'XAndesVSIntH' (Andes Vector Small INT Handling Extension){{$}} +nds.vfwcvt.f.b.v v8, v10, v0.t +# CHECK-OBJ: nds.vfwcvt.f.bu.v v8, v10 +# CHECK-ASM: nds.vfwcvt.f.bu.v v8, v10 +# CHECK-ASM: encoding: [0x5b,0xc4,0xa3,0x02] +# CHECK-ERROR: instruction requires the following: 'XAndesVSIntH' (Andes Vector Small INT Handling Extension){{$}} +nds.vfwcvt.f.bu.v v8, v10 +# CHECK-OBJ: nds.vfwcvt.f.bu.v v8, v10, v0.t +# CHECK-ASM: nds.vfwcvt.f.bu.v v8, v10, v0.t +# CHECK-ASM: encoding: [0x5b,0xc4,0xa3,0x00] +# CHECK-ERROR: instruction requires the following: 'XAndesVSIntH' (Andes Vector Small INT Handling Extension){{$}} +nds.vfwcvt.f.bu.v v8, v10, v0.t +# CHECK-OBJ: nds.vle4.v v8, (a0) +# CHECK-ASM: nds.vle4.v v8, (a0) +# CHECK-ASM: encoding: [0x5b,0x44,0x05,0x06] +# CHECK-ERROR: instruction requires the following: 'XAndesVSIntH' (Andes Vector Small INT Handling Extension){{$}} +nds.vle4.v v8, (a0) diff --git a/llvm/test/MC/X86/encoder-fail.s b/llvm/test/MC/X86/encoder-fail.s index a8b9f48c8fb70..f5718e14d138f 100644 --- a/llvm/test/MC/X86/encoder-fail.s +++ b/llvm/test/MC/X86/encoder-fail.s @@ -1,16 +1,38 @@ // RUN: not llvm-mc -triple x86_64-unknown-unknown --show-encoding %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -triple x86_64-unknown-unknown --show-encoding -x86-asm-syntax=intel %s 2>&1 | FileCheck %s --check-prefix=CHECK-INTEL -// CHECK: error: can't encode 'dh' in an instruction requiring REX prefix +// CHECK: error: can't encode 'dh' in an instruction requiring EVEX/REX2/REX prefix movzx %dh, %rsi -// CHECK: error: can't encode 'ah' in an instruction requiring REX prefix +// CHECK: error: can't encode 'ah' in an instruction requiring EVEX/REX2/REX prefix movzx %ah, %r8d -// CHECK: error: can't encode 'bh' in an instruction requiring REX prefix +// CHECK: error: can't encode 'bh' in an instruction requiring EVEX/REX2/REX prefix add %bh, %sil -// CHECK: error: can't encode 'ch' in an instruction requiring REX prefix +// CHECK: error: can't encode 'ch' in an instruction requiring EVEX/REX2/REX prefix mov %ch, (%r8) -// CHECK: error: can't encode 'dh' in an instruction requiring REX prefix +// CHECK: error: can't encode 'dh' in an instruction requiring EVEX/REX2/REX prefix mov %dh, (%rax,%r8) + +// CHECK-INTEL: error: can't encode 'ah' in an instruction requiring EVEX/REX2/REX prefix +add ah, ah, ah + +// CHECK-INTEL: error: can't encode 'ah' in an instruction requiring EVEX/REX2/REX prefix +and ah, byte ptr [-13426159], ah + +// CHECK-INTEL: error: can't encode 'ah' in an instruction requiring EVEX/REX2/REX prefix +ccmpa {dfv=of,cf} byte ptr [r8 + 4*rax + 291], ah + +// CHECK-INTEL: 
error: can't encode 'ah' in an instruction requiring EVEX/REX2/REX prefix +ccmpae {dfv=of,cf} byte ptr [r8 + 4*rax + 291], ah + +// CHECK-INTEL: error: can't encode 'ah' in an instruction requiring EVEX/REX2/REX prefix +sar ah, byte ptr [-13426159] + +// CHECK-INTEL: error: can't encode 'ah' in an instruction requiring EVEX/REX2/REX prefix +{rex2} add ah, al + +// CHECK-INTEL: error: can't encode 'ah' in an instruction requiring EVEX/REX2/REX prefix +{rex} add ah, al diff --git a/llvm/test/MachineVerifier/test_g_build_vector.mir b/llvm/test/MachineVerifier/test_g_build_vector.mir index 50b98017a49a7..9857306737108 100644 --- a/llvm/test/MachineVerifier/test_g_build_vector.mir +++ b/llvm/test/MachineVerifier/test_g_build_vector.mir @@ -16,17 +16,17 @@ body: | ; CHECK: Bad machine code: G_BUILD_VECTOR must produce a vector from scalar operands %3:_(<2 x s32>) = G_BUILD_VECTOR %2 - ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each elemement + ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each element %4:_(<2 x s32>) = G_BUILD_VECTOR %0, %0, %0, %0 ; CHECK: Bad machine code: G_BUILD_VECTOR result element type must match source type - ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each elemement + ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each element %5:_(<4 x s16>) = G_BUILD_VECTOR %0, %0 %6:_(s16) = IMPLICIT_DEF ; CHECK: Bad machine code: G_BUILD_VECTOR result element type must match source type - ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each elemement + ; CHECK: Bad machine code: G_BUILD_VECTOR must have an operand for each element %7:_(<2 x s32>) = G_BUILD_VECTOR %6, %6, %6, %6 %8:_(p0) = IMPLICIT_DEF diff --git a/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplers1.3.yaml b/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplers1.3.yaml new file mode 100644 index 0000000000000..1623b05def009 --- /dev/null +++ b/llvm/test/ObjectYAML/DXContainer/RootSignature-StaticSamplers1.3.yaml @@ -0,0 +1,65 @@ +# RUN: yaml2obj %s | obj2yaml | FileCheck %s + +--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + PartCount: 1 + PartOffsets: [ 60 ] +Parts: + - Name: RTS0 + Size: 80 + RootSignature: + Version: 3 + NumRootParameters: 0 + RootParametersOffset: 24 + NumStaticSamplers: 1 + StaticSamplersOffset: 24 + Parameters: [] + Samplers: + - Filter: MinLinearMagMipPoint + AddressU: Wrap + AddressV: Mirror + AddressW: MirrorOnce + MipLODBias: 1.23 + MaxAnisotropy: 20 + ComparisonFunc: LessEqual + BorderColor: TransparentBlack + MinLOD: 4.56 + MaxLOD: 8.90 + ShaderRegister: 31 + RegisterSpace: 32 + ShaderVisibility: Mesh + SAMPLER_FLAG_UINT_BORDER_COLOR: true + AllowInputAssemblerInputLayout: true + DenyGeometryShaderRootAccess: true + +#CHECK: - Name: RTS0 +#CHECK-NEXT: Size: 80 +#CHECK-NEXT: RootSignature: +#CHECK-NEXT: Version: 3 +#CHECK-NEXT: NumRootParameters: 0 +#CHECK-NEXT: RootParametersOffset: 24 +#CHECK-NEXT: NumStaticSamplers: 1 +#CHECK-NEXT: StaticSamplersOffset: 24 +#CHECK-NEXT: Parameters: [] +#CHECK-NEXT: Samplers: +#CHECK-NEXT: - Filter: MinLinearMagMipPoint +#CHECK-NEXT: AddressU: Wrap +#CHECK-NEXT: AddressV: Mirror +#CHECK-NEXT: AddressW: MirrorOnce +#CHECK-NEXT: MipLODBias: 1.23 +#CHECK-NEXT: MaxAnisotropy: 20 +#CHECK-NEXT: ComparisonFunc: LessEqual +#CHECK-NEXT: BorderColor: TransparentBlack +#CHECK-NEXT: MinLOD: 4.56 +#CHECK-NEXT: MaxLOD: 8.9 +#CHECK-NEXT: 
ShaderRegister: 31 +#CHECK-NEXT: RegisterSpace: 32 +#CHECK-NEXT: ShaderVisibility: Mesh +#CHECK-NEXT: SAMPLER_FLAG_UINT_BORDER_COLOR: true +#CHECK-NEXT: AllowInputAssemblerInputLayout: true +#CHECK-NEXT: DenyGeometryShaderRootAccess: true diff --git a/llvm/test/Other/new-pm-O0-defaults.ll b/llvm/test/Other/new-pm-O0-defaults.ll index 81d1ee0df2c5b..278a89261691a 100644 --- a/llvm/test/Other/new-pm-O0-defaults.ll +++ b/llvm/test/Other/new-pm-O0-defaults.ll @@ -44,6 +44,7 @@ ; CHECK-PRE-LINK: Running pass: CanonicalizeAliasesPass ; CHECK-PRE-LINK-NEXT: Running pass: NameAnonGlobalPass ; CHECK-THINLTO: Running pass: LowerTypeTestsPass +; CHECK-THINLTO-NEXT: Running pass: CoroConditionalWrapper ; CHECK-THINLTO-NEXT: Running pass: EliminateAvailableExternallyPass ; CHECK-THINLTO-NEXT: Running pass: GlobalDCEPass ; CHECK-LTO: Running pass: CrossDSOCFIPass on [module] diff --git a/llvm/test/TableGen/CPtrWildcard.td b/llvm/test/TableGen/CPtrWildcard.td new file mode 100644 index 0000000000000..96b51ae1044a3 --- /dev/null +++ b/llvm/test/TableGen/CPtrWildcard.td @@ -0,0 +1,74 @@ +// RUN: llvm-tblgen -gen-dag-isel -I %p/../../include %s -o - | FileCheck %s + +// Create an intrinsic that uses cPTR to overload on capability pointer types, +// and verify that we can match it correct in SelectionDAG. + +// CHECK: static const unsigned char MatcherTable[] = { +// CHECK-NEXT: /* 0*/ OPC_CheckOpcode, TARGET_VAL(ISD::INTRINSIC_WO_CHAIN), +// CHECK-NEXT:/* 3*/ OPC_CheckChild0Integer, 42, +// CHECK-NEXT:/* 5*/ OPC_RecordChild1, // #0 = $src +// CHECK-NEXT:/* 6*/ OPC_Scope, 9, /*->17*/ // 2 children in Scope +// CHECK-NEXT:/* 8*/ OPC_CheckChild1Type, /*MVT::c64*/126|128,1/*254*/, +// CHECK-NEXT:/* 11*/ OPC_MorphNodeTo1None, TARGET_VAL(MyTarget::C64_TO_I64), +// CHECK-NEXT: /*MVT::i64*/8, 1/*#Ops*/, 0, +// CHECK-NEXT: // Src: (intrinsic_wo_chain:{ *:[i64] } 21:{ *:[iPTR] }, c64:{ *:[c64] }:$src) - Complexity = 8 +// CHECK-NEXT: // Dst: (C64_TO_I64:{ *:[i64] } ?:{ *:[c64] }:$src) +// CHECK-NEXT:/* 17*/ /*Scope*/ 9, /*->27*/ +// CHECK-NEXT:/* 18*/ OPC_CheckChild1Type, /*MVT::c128*/127|128,1/*255*/, +// CHECK-NEXT:/* 21*/ OPC_MorphNodeTo1None, TARGET_VAL(MyTarget::C128_TO_I64), +// CHECK-NEXT: /*MVT::i64*/8, 1/*#Ops*/, 0, +// CHECK-NEXT: // Src: (intrinsic_wo_chain:{ *:[i64] } 21:{ *:[iPTR] }, c128:{ *:[c128] }:$src) - Complexity = 8 +// CHECK-NEXT: // Dst: (C128_TO_I64:{ *:[i64] } ?:{ *:[c128] }:$src) +// CHECK-NEXT:/* 27*/ 0, /*End of Scope*/ +// CHECK-NEXT: 0 +// CHECK-NEXT: }; // Total Array size is 29 bytes + +include "llvm/Target/Target.td" + +def my_cap_ty : LLVMQualPointerType<200> { + let VT = cPTR; +} + +def int_cap_get_length : + Intrinsic<[llvm_i64_ty], + [my_cap_ty], + [IntrNoMem, IntrWillReturn]>; + +class CapReg : Register { + let Namespace = "MyTarget"; +} + +def C64 : CapReg<"c0">; +def C64s + : RegisterClass<"MyTarget", [i64, c64], 64, + (add C64)>; + +def C128 : CapReg<"c0">; +def C128s + : RegisterClass<"MyTarget", [c128], 64, + (add C128)>; + +def C64_TO_I64 : Instruction { + let Namespace = "MyTarget"; + let OutOperandList = (outs C64s:$dst); + let InOperandList = (ins C64s:$src); +} + +def C128_TO_I64 : Instruction { + let Namespace = "MyTarget"; + let OutOperandList = (outs C64s:$dst); + let InOperandList = (ins C128s:$src); +} + +def : Pat< + (int_cap_get_length c64:$src), + (C64_TO_I64 $src) +>; + +def : Pat< + (int_cap_get_length c128:$src), + (C128_TO_I64 $src) +>; + +def MyTargetISA : InstrInfo; +def MyTarget : Target { let InstructionSet = MyTargetISA; } diff --git 
a/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td index 7a86b5b726a82..6be1720a6da23 100644 --- a/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td +++ b/llvm/test/TableGen/GlobalISelEmitter/GlobalISelEmitter.td @@ -535,7 +535,7 @@ def : Pat<(frag GPR32:$src1, complex:$src2, complex:$src3), // R00O-NEXT: GIM_Reject, // R00O: // Label [[DEFAULT_NUM]]: @[[DEFAULT]] // R00O-NEXT: GIM_Reject, -// R00O-NEXT: }; // Size: 1894 bytes +// R00O-NEXT: }; // Size: 1898 bytes def INSNBOB : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3, GPR32:$src4), [(set GPR32:$dst, diff --git a/llvm/test/TableGen/RegClassByHwMode.td b/llvm/test/TableGen/RegClassByHwMode.td index 5d813d2bfc83a..ca72cfbd403bf 100644 --- a/llvm/test/TableGen/RegClassByHwMode.td +++ b/llvm/test/TableGen/RegClassByHwMode.td @@ -50,7 +50,7 @@ include "llvm/Target/Target.td" // INSTRINFO-NEXT: }; // INSTRINFO: static inline void InitMyTargetMCInstrInfo( -// INSTRINFO-NEXT: II->InitMCInstrInfo(MyTargetDescs.Insts, MyTargetInstrNameIndices, MyTargetInstrNameData, nullptr, nullptr, 321, &MyTargetRegClassByHwModeTables[0][0], 3); +// INSTRINFO-NEXT: II->InitMCInstrInfo(MyTargetDescs.Insts, MyTargetInstrNameIndices, MyTargetInstrNameData, nullptr, nullptr, {{[0-9]+}}, &MyTargetRegClassByHwModeTables[0][0], 3); diff --git a/llvm/test/TableGen/intrinsic-struct.td b/llvm/test/TableGen/intrinsic-struct.td index 467fd9057c183..032cdc10e74ed 100644 --- a/llvm/test/TableGen/intrinsic-struct.td +++ b/llvm/test/TableGen/intrinsic-struct.td @@ -1,22 +1,58 @@ // RUN: llvm-tblgen -gen-intrinsic-enums -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS | FileCheck %s --check-prefix=CHECK-ENUM -// RUN: llvm-tblgen -gen-intrinsic-impl -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS > /dev/null 2>&1 +// RUN: llvm-tblgen -gen-intrinsic-impl -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS | FileCheck %s --check-prefix=CHECK-IMPL // RUN: not llvm-tblgen -gen-intrinsic-impl -I %p/../../include %s -DTEST_INTRINSICS_SUPPRESS_DEFS -DENABLE_ERROR 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR // XFAIL: vg_leak include "llvm/IR/Intrinsics.td" -// Make sure we can return up to 9 values. -// CHECK-ENUM: returns_9_results = {{[0-9]+}}, // llvm.returns.9.results -def int_returns_9_results : Intrinsic< - !listsplat(llvm_anyint_ty, 9), - [], [], "llvm.returns.9.results">; +// Make sure we can return up to 257 values. Intrinsics are in alphabetical order. +// CHECK-ENUM: returns_a0_results = {{[0-9]+}}, // llvm.returns.a0.results +// CHECK-ENUM: returns_b1_results, // llvm.returns.b1.results +// CHECK-ENUM: returns_c2_results, // llvm.returns.c2.results +// CHECK-ENUM: returns_d9_results, // llvm.returns.d9.results +// CHECK-ENUM: returns_e10_results, // llvm.returns.e10.results +// CHECK-ENUM: returns_f257_results, // llvm.returns.f257.results -#ifdef ENABLE_ERROR -// CHECK-ERROR: error: intrinsics can only return upto 9 values, 'int_returns_10_results' returns 10 values -// CHECK-ERROR-NEXT: def int_returns_10_results : Intrinsic< -def int_returns_10_results : Intrinsic< +// Make sure the encoding table is correctly generated. 
+// CHECK-IMPL: IIT_LongEncodingTable +// CHECK-IMPL-NEXT: 21, 255 +// CHECK-IMPL-SAME: 15, 1, 15, 9, 15, 17, 15, 25, 15, 33, 15, 41, 15, 49, 15, 57, 15, 65, 15, 73, 15, 81, +// CHECK-IMPL-NEXT: 21, 0 +// CHECK-IMPL-SAME: 15, 1, 15, 9, 0 +// CHECK-IMPL-NEXT: 21, 7 +// CHECK-IMPL-SAME: 15, 1, 15, 9, 15, 17, 15, 25, 15, 33, 15, 41, 15, 49, 15, 57, 15, 65, 0 +// CHECK-IMPL-NEXT: 21, 8 +// CHECK-IMPL-SAME: 15, 1, 15, 9, 15, 17, 15, 25, 15, 33, 15, 41, 15, 49, 15, 57, 15, 65, 15, 73, 0 +def int_returns_a0_results : Intrinsic< + [], + [], [], "llvm.returns.a0.results">; + +def int_returns_b1_results : Intrinsic< + [llvm_anyint_ty], + [], [], "llvm.returns.b1.results">; + +def int_returns_c2_results : Intrinsic< + !listsplat(llvm_anyint_ty, 2), + [], [], "llvm.returns.c2.results">; + +def int_returns_d9_results : Intrinsic< + !listsplat(llvm_anyint_ty, 9), + [], [], "llvm.returns.d9.results">; + +def int_returns_e10_results : Intrinsic< !listsplat(llvm_anyint_ty, 10), - [], [], "llvm.returns.10.results">; + [], [], "llvm.returns.e10.results">; + +def int_returns_f257_results : Intrinsic< + !listsplat(llvm_anyint_ty, 257), + [], [], "llvm.returns.f257.results">; + +#ifdef ENABLE_ERROR +// CHECK-ERROR: error: intrinsics can only return upto 257 values, 'int_returns_g258_results' returns 258 values +// CHECK-ERROR-NEXT: def int_returns_g258_results : Intrinsic< +def int_returns_g258_results : Intrinsic< + !listsplat(llvm_anyint_ty, 258), + [], [], "llvm.returns.g258.results">; #endif diff --git a/llvm/test/TableGen/intrinsic-varargs.td b/llvm/test/TableGen/intrinsic-varargs.td index 3634e16e20565..f94e1d0d6750e 100644 --- a/llvm/test/TableGen/intrinsic-varargs.td +++ b/llvm/test/TableGen/intrinsic-varargs.td @@ -3,5 +3,5 @@ include "llvm/IR/Intrinsics.td" -// CHECK: /* 0 */ 0, 29, 0, +// CHECK: /* 0 */ 0, 26, 0, def int_foo : Intrinsic<[], [llvm_vararg_ty]>; diff --git a/llvm/test/ThinLTO/X86/memprof-funcassigncloning2.ll b/llvm/test/ThinLTO/X86/memprof-funcassigncloning2.ll new file mode 100644 index 0000000000000..bcd3cea5b7ff1 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-funcassigncloning2.ll @@ -0,0 +1,142 @@ +;; Similar to funcassigncloning.ll but hand modified to add another allocation +;; whose pruned cold context only includes an immediate caller node that itself +;; doesn't need cloning, but calls a cloned allocating function, and is in a +;; function that gets cloned multiple times for a different callsite. This test +;; makes sure the non-cloned callsite is correctly updated in all function +;; clones. This case was missed because, due to context pruning, we don't have +;; any caller edges for the first callsite, so the handling that kicks in to +;; "reclone" other callsites in cloned functions was being missed. 
+ +; RUN: opt -thinlto-bc %s >%t.o +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -pass-remarks=memprof-context-disambiguation -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=REMARKS + +; RUN: llvm-dis %t.out.1.4.opt.bc -o - | FileCheck %s --check-prefix=IR + + +;; Try again but with distributed ThinLTO +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -pass-remarks=memprof-context-disambiguation \ +; RUN: -o %t2.out + +;; Run ThinLTO backend +; RUN: opt -passes=memprof-context-disambiguation \ +; RUN: -memprof-import-summary=%t.o.thinlto.bc \ +; RUN: -pass-remarks=memprof-context-disambiguation \ +; RUN: %t.o -S 2>&1 | FileCheck %s --check-prefix=IR \ +; RUN: --check-prefix=REMARKS + + +source_filename = "funcassigncloning.ll" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +;; Eventually this function will be cloned several times (for the calls to new +;; for the various callers). However, function blah() includes an allocation +;; whose cold context was trimmed above here. We therefore should assume that +;; every caller of this function should call the same version of blah (which +;; will be the cloned ".memprof.1" version. +; Function Attrs: noinline optnone +define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 { +entry: + call void @blah(), !callsite !19 + %call = call ptr @_Znam(i64 noundef 10), !memprof !0, !callsite !7 + %call1 = call ptr @_Znam(i64 noundef 10), !memprof !8, !callsite !15 + ret void +} + +; REMARKS: call in clone _Z1EPPcS0_ assigned to call function clone blah.memprof.1 +; REMARKS: call in clone _Z1EPPcS0_.memprof.1 assigned to call function clone blah.memprof.1 +; REMARKS: call in clone _Z1EPPcS0_.memprof.2 assigned to call function clone blah.memprof.1 +; REMARKS: call in clone _Z1EPPcS0_.memprof.3 assigned to call function clone blah.memprof.1 + +; IR: define {{.*}} @_Z1EPPcS0_ +; IR: call {{.*}} @blah.memprof.1() +; IR: define {{.*}} @_Z1EPPcS0_.memprof.2 +; IR: call {{.*}} @blah.memprof.1() +; IR: define {{.*}} @_Z1EPPcS0_.memprof.3 +; IR: call {{.*}} @blah.memprof.1() + +declare ptr @_Znam(i64) + +define internal void @_Z1BPPcS0_() { +entry: + call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !16 + ret void +} + +define internal void @_Z1CPPcS0_() { +entry: + call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !17 + ret void +} + +define internal void @_Z1DPPcS0_() { +entry: + call void @_Z1EPPcS0_(ptr null, ptr null), !callsite !18 + ret void +} + +; Function Attrs: noinline optnone +define i32 @main() #0 { +entry: + call void @_Z1BPPcS0_() + call void @_Z1CPPcS0_() + call void @_Z1DPPcS0_() + ret i32 0 +} + +define internal void @blah() #0 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !22, !callsite !21 + ret void +} + +define internal void @foo() #0 { +entry: + call void @blah(), !callsite !20 + ret void +} + +; uselistorder directives +uselistorder ptr @_Znam, { 1, 0, 2 } + +attributes #0 = { noinline optnone } + +!0 = !{!1, !3, !5} +!1 = !{!2, !"cold"} +!2 = !{i64 -3461278137325233666, i64 -7799663586031895603} +!3 = !{!4, 
!"notcold"} +!4 = !{i64 -3461278137325233666, i64 -3483158674395044949} +!5 = !{!6, !"notcold"} +!6 = !{i64 -3461278137325233666, i64 -2441057035866683071} +!7 = !{i64 -3461278137325233666} +!8 = !{!9, !11, !13} +!9 = !{!10, !"notcold"} +!10 = !{i64 -1415475215210681400, i64 -2441057035866683071} +!11 = !{!12, !"cold"} +!12 = !{i64 -1415475215210681400, i64 -3483158674395044949} +!13 = !{!14, !"notcold"} +!14 = !{i64 -1415475215210681400, i64 -7799663586031895603} +!15 = !{i64 -1415475215210681400} +!16 = !{i64 -2441057035866683071} +!17 = !{i64 -3483158674395044949} +!18 = !{i64 -7799663586031895603} +!19 = !{i64 123} +!20 = !{i64 234} +!21 = !{i64 345} +!22 = !{!23, !25} +!23 = !{!24, !"cold"} +!24 = !{i64 345, i64 123} +!25 = !{!26, !"notcold"} +!26 = !{i64 345, i64 234} diff --git a/llvm/test/Transforms/AggressiveInstCombine/inline-strcmp-debugloc.ll b/llvm/test/Transforms/AggressiveInstCombine/inline-strcmp-debugloc.ll index 94c912876d7b9..6015607c05df4 100644 --- a/llvm/test/Transforms/AggressiveInstCombine/inline-strcmp-debugloc.ll +++ b/llvm/test/Transforms/AggressiveInstCombine/inline-strcmp-debugloc.ll @@ -5,8 +5,9 @@ @.str = constant [3 x i8] c"-h\00" -define i32 @main() { -; CHECK-LABEL: define i32 @main() { +define i32 @main() !prof !8 { +; CHECK-LABEL: define i32 @main() +; CHECK: !prof [[PROF_0:![0-9]+]] ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[SUB_0:.*]], !dbg [[DBG4:![0-9]+]] ; CHECK: [[SUB_0]]: @@ -14,13 +15,13 @@ define i32 @main() { ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32, !dbg [[DBG4]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], 45, !dbg [[DBG4]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0, !dbg [[DBG4]] -; CHECK-NEXT: br i1 [[TMP3]], label %[[NE:.*]], label %[[SUB_1:.*]], !dbg [[DBG4]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[NE:.*]], label %[[SUB_1:.*]], !dbg [[DBG4]], !prof [[PROF_1:![0-9]+]] ; CHECK: [[SUB_1]]: ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr inbounds (i8, ptr null, i64 1), align 1, !dbg [[DBG4]] ; CHECK-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i32, !dbg [[DBG4]] ; CHECK-NEXT: [[TMP6:%.*]] = sub i32 [[TMP5]], 104, !dbg [[DBG4]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP6]], 0, !dbg [[DBG4]] -; CHECK-NEXT: br i1 [[TMP7]], label %[[NE]], label %[[SUB_2:.*]], !dbg [[DBG4]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[NE]], label %[[SUB_2:.*]], !dbg [[DBG4]], !prof [[PROF_1]] ; CHECK: [[SUB_2]]: ; CHECK-NEXT: br label %[[NE]], !dbg [[DBG4]] ; CHECK: [[NE]]: @@ -46,11 +47,14 @@ declare i32 @strcmp(ptr, ptr) !4 = !DILocation(line: 258, column: 10, scope: !5) !5 = distinct !DISubprogram(name: "streq", scope: !1, file: !1, line: 257, type: !7, scopeLine: 257, unit: !0, retainedNodes: !2) !7 = !DISubroutineType(types: !2) +!8 = !{!"function_entry_count", i64 1000} ;. 
; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C11, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], retainedTypes: [[META2]], globals: [[META2]]) ; CHECK: [[META1]] = !DIFile(filename: "test.c", directory: {{.*}}) ; CHECK: [[META2]] = !{} +; CHECK: [[PROF_0]] = !{!"function_entry_count", i64 1000} ; CHECK: [[DBG4]] = !DILocation(line: 258, column: 10, scope: [[META5:![0-9]+]]) ; CHECK: [[META5]] = distinct !DISubprogram(name: "streq", scope: [[META1]], file: [[META1]], line: 257, type: [[META6:![0-9]+]], scopeLine: 257, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META2]]) ; CHECK: [[META6]] = !DISubroutineType(types: [[META2]]) +; CHECK: [[PROF_1]] = !{!"unknown", !"aggressive-instcombine"} ;. diff --git a/llvm/test/Transforms/AlignmentFromAssumptions/domtree-crash.ll b/llvm/test/Transforms/AlignmentFromAssumptions/domtree-crash.ll index c7fc1dc699671..f9b9dd13b0d0c 100644 --- a/llvm/test/Transforms/AlignmentFromAssumptions/domtree-crash.ll +++ b/llvm/test/Transforms/AlignmentFromAssumptions/domtree-crash.ll @@ -9,10 +9,10 @@ define void @fn1() { ; CHECK-LABEL: define void @fn1() { -; CHECK-NEXT: call void @llvm.assume(i1 false) [ "align"(ptr @global, i64 1) ] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr @global, i64 1) ] ; CHECK-NEXT: ret void ; - call void @llvm.assume(i1 false) [ "align"(ptr @global, i64 1) ] + call void @llvm.assume(i1 true) [ "align"(ptr @global, i64 1) ] ret void } diff --git a/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll b/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll index d5aa6b10b5add..1ab607465dbbb 100644 --- a/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll +++ b/llvm/test/Transforms/Attributor/AMDGPU/tag-invariant-loads.ll @@ -306,12 +306,12 @@ define amdgpu_kernel void @test_call_untouched_ptr() { define amdgpu_kernel void @test_make_buffer(ptr addrspace(1) %ptr) { ; AMDGCN-LABEL: define amdgpu_kernel void @test_make_buffer( ; AMDGCN-SAME: ptr addrspace(1) nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] { -; AMDGCN-NEXT: [[RSRC:%.*]] = call align 4 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i32 noundef 0, i32 noundef 0) #[[ATTR11:[0-9]+]] +; AMDGCN-NEXT: [[RSRC:%.*]] = call align 4 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i64 noundef 0, i32 noundef 0) #[[ATTR11:[0-9]+]] ; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(7) [[RSRC]], align 4 ; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR7]] ; AMDGCN-NEXT: ret void ; - %rsrc = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %ptr, i16 0, i32 0, i32 0) + %rsrc = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %ptr, i16 0, i64 0, i32 0) %val = load i32, ptr addrspace(7) %rsrc, align 4 ;; original %ptr may alias call void @clobber(i32 %val) @@ -321,12 +321,12 @@ define amdgpu_kernel void @test_make_buffer(ptr addrspace(1) %ptr) { define amdgpu_kernel void @test_make_buffer_noalias(ptr addrspace(1) noalias %ptr) { ; AMDGCN-LABEL: define amdgpu_kernel void @test_make_buffer_noalias( ; AMDGCN-SAME: ptr addrspace(1) noalias nofree readonly captures(none) [[PTR:%.*]]) #[[ATTR2]] { -; AMDGCN-NEXT: [[RSRC:%.*]] = call align 4 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i32 noundef 0, i32 
noundef 0) #[[ATTR11]] +; AMDGCN-NEXT: [[RSRC:%.*]] = call align 4 ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[PTR]], i16 noundef 0, i64 noundef 0, i32 noundef 0) #[[ATTR11]] ; AMDGCN-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(7) [[RSRC]], align 4, !invariant.load [[META0]] ; AMDGCN-NEXT: call void @clobber(i32 [[VAL]]) #[[ATTR7]] ; AMDGCN-NEXT: ret void ; - %rsrc = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %ptr, i16 0, i32 0, i32 0) + %rsrc = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %ptr, i16 0, i64 0, i32 0) %val = load i32, ptr addrspace(7) %rsrc, align 4 call void @clobber(i32 %val) ret void diff --git a/llvm/test/Transforms/Coroutines/ArgAddr.ll b/llvm/test/Transforms/Coroutines/ArgAddr.ll index ab70836508101..9328c67459077 100644 --- a/llvm/test/Transforms/Coroutines/ArgAddr.ll +++ b/llvm/test/Transforms/Coroutines/ArgAddr.ll @@ -45,7 +45,7 @@ coro_Cleanup: br label %coro_Suspend coro_Suspend: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret ptr %1 } @@ -69,7 +69,7 @@ declare i32 @llvm.coro.size.i32() declare ptr @llvm.coro.begin(token, ptr) declare i8 @llvm.coro.suspend(token, i1) declare ptr @llvm.coro.free(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @llvm.coro.resume(ptr) declare void @llvm.coro.destroy(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-align16.ll b/llvm/test/Transforms/Coroutines/coro-align16.ll index 39902be9149e8..afdca77e8af3e 100644 --- a/llvm/test/Transforms/Coroutines/coro-align16.ll +++ b/llvm/test/Transforms/Coroutines/coro-align16.ll @@ -24,7 +24,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -44,7 +44,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @capture_call(ptr) declare void @nocapture_call(ptr nocapture) diff --git a/llvm/test/Transforms/Coroutines/coro-align32.ll b/llvm/test/Transforms/Coroutines/coro-align32.ll index 3d910e951259b..9e82ec83011f5 100644 --- a/llvm/test/Transforms/Coroutines/coro-align32.ll +++ b/llvm/test/Transforms/Coroutines/coro-align32.ll @@ -28,7 +28,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -48,7 +48,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @capture_call(ptr) declare void @nocapture_call(ptr nocapture) diff --git a/llvm/test/Transforms/Coroutines/coro-align64-02.ll b/llvm/test/Transforms/Coroutines/coro-align64-02.ll index 3e2e33d2da260..13c0cbe0e24da 100644 --- a/llvm/test/Transforms/Coroutines/coro-align64-02.ll +++ b/llvm/test/Transforms/Coroutines/coro-align64-02.ll @@ -24,7 +24,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -44,7 +44,7 @@ declare void @llvm.coro.destroy(ptr) declare token 
@llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @capture_call(ptr) declare void @nocapture_call(ptr nocapture) diff --git a/llvm/test/Transforms/Coroutines/coro-align64.ll b/llvm/test/Transforms/Coroutines/coro-align64.ll index 9623a99a8b27e..f6cf8f40b2b41 100644 --- a/llvm/test/Transforms/Coroutines/coro-align64.ll +++ b/llvm/test/Transforms/Coroutines/coro-align64.ll @@ -24,7 +24,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -44,7 +44,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @capture_call(ptr) declare void @nocapture_call(ptr nocapture) diff --git a/llvm/test/Transforms/Coroutines/coro-align8-02.ll b/llvm/test/Transforms/Coroutines/coro-align8-02.ll index 758d4ce3e21b2..0a6723a41256e 100644 --- a/llvm/test/Transforms/Coroutines/coro-align8-02.ll +++ b/llvm/test/Transforms/Coroutines/coro-align8-02.ll @@ -20,7 +20,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -40,7 +40,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @aligned_alloc(i32, i32) declare void @free(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-align8.ll b/llvm/test/Transforms/Coroutines/coro-align8.ll index 48a2687cc4799..ac083378803ec 100644 --- a/llvm/test/Transforms/Coroutines/coro-align8.ll +++ b/llvm/test/Transforms/Coroutines/coro-align8.ll @@ -24,7 +24,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -44,7 +44,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @capture_call(ptr) declare void @nocapture_call(ptr nocapture) diff --git a/llvm/test/Transforms/Coroutines/coro-alloc-with-param-O0.ll b/llvm/test/Transforms/Coroutines/coro-alloc-with-param-O0.ll index bdd49413cf15b..851f8a7e4e293 100644 --- a/llvm/test/Transforms/Coroutines/coro-alloc-with-param-O0.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloc-with-param-O0.ll @@ -24,7 +24,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -54,7 +54,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @myAlloc(i64, i32) declare double @print(double) diff --git a/llvm/test/Transforms/Coroutines/coro-alloc-with-param-O2.ll 
b/llvm/test/Transforms/Coroutines/coro-alloc-with-param-O2.ll index a0ab5b733fdf0..ee2215efd1cd9 100644 --- a/llvm/test/Transforms/Coroutines/coro-alloc-with-param-O2.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloc-with-param-O2.ll @@ -21,7 +21,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -49,7 +49,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @myAlloc(i64, i32) declare double @print(double) diff --git a/llvm/test/Transforms/Coroutines/coro-alloca-01.ll b/llvm/test/Transforms/Coroutines/coro-alloca-01.ll index 5208c055c4fdf..f0c0bb31d40f5 100644 --- a/llvm/test/Transforms/Coroutines/coro-alloca-01.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloca-01.ll @@ -33,7 +33,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -55,7 +55,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @print(ptr) declare noalias ptr @malloc(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-alloca-02.ll b/llvm/test/Transforms/Coroutines/coro-alloca-02.ll index 83f56009f00e3..832132d451776 100644 --- a/llvm/test/Transforms/Coroutines/coro-alloca-02.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloca-02.ll @@ -25,7 +25,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -44,7 +44,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @print(ptr) declare noalias ptr @malloc(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-alloca-03.ll b/llvm/test/Transforms/Coroutines/coro-alloca-03.ll index 7740ed440a0d5..5148d87bbc2b2 100644 --- a/llvm/test/Transforms/Coroutines/coro-alloca-03.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloca-03.ll @@ -23,7 +23,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -44,7 +44,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @capture_call(ptr) declare void @nocapture_call(ptr nocapture) diff --git a/llvm/test/Transforms/Coroutines/coro-alloca-04.ll b/llvm/test/Transforms/Coroutines/coro-alloca-04.ll index c19cd253a9179..9df1fd4326899 100644 --- a/llvm/test/Transforms/Coroutines/coro-alloca-04.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloca-04.ll @@ -32,7 +32,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ 
-53,7 +53,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @print(ptr) declare noalias ptr @malloc(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-alloca-05.ll b/llvm/test/Transforms/Coroutines/coro-alloca-05.ll index 96769e51fb80f..a096bb1beea21 100644 --- a/llvm/test/Transforms/Coroutines/coro-alloca-05.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloca-05.ll @@ -23,7 +23,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -44,7 +44,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @print(i32) declare noalias ptr @malloc(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-alloca-06.ll b/llvm/test/Transforms/Coroutines/coro-alloca-06.ll index bf75196047aff..22997fbbcdfd7 100644 --- a/llvm/test/Transforms/Coroutines/coro-alloca-06.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloca-06.ll @@ -37,7 +37,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -63,7 +63,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @llvm.lifetime.start.p0(ptr nocapture) declare void @llvm.lifetime.end.p0(ptr nocapture) diff --git a/llvm/test/Transforms/Coroutines/coro-alloca-07.ll b/llvm/test/Transforms/Coroutines/coro-alloca-07.ll index 8bfb8cfabbd27..ac07dc33707c7 100644 --- a/llvm/test/Transforms/Coroutines/coro-alloca-07.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloca-07.ll @@ -36,7 +36,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -49,7 +49,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @llvm.lifetime.start.p0(ptr nocapture) diff --git a/llvm/test/Transforms/Coroutines/coro-alloca-08.ll b/llvm/test/Transforms/Coroutines/coro-alloca-08.ll index 80be62ac64c8c..dab55c5f0cd41 100644 --- a/llvm/test/Transforms/Coroutines/coro-alloca-08.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloca-08.ll @@ -32,7 +32,7 @@ await.ready: %StrayCoroSave = call token @llvm.coro.save(ptr null) br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -59,7 +59,7 @@ await.ready: br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -75,6 +75,6 @@ declare token @llvm.coro.save(ptr) #3 declare ptr @llvm.coro.frame() #5 declare i8 @llvm.coro.suspend(token, i1) #3 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2 
-declare i1 @llvm.coro.end(ptr, i1, token) #3 +declare void @llvm.coro.end(ptr, i1, token) #3 declare void @llvm.lifetime.start.p0(ptr nocapture) #4 declare void @llvm.lifetime.end.p0(ptr nocapture) #4 diff --git a/llvm/test/Transforms/Coroutines/coro-alloca-09.ll b/llvm/test/Transforms/Coroutines/coro-alloca-09.ll index 2539811f46b7c..4736790dfe324 100644 --- a/llvm/test/Transforms/Coroutines/coro-alloca-09.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloca-09.ll @@ -52,7 +52,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } diff --git a/llvm/test/Transforms/Coroutines/coro-alloca-loop-carried-address.ll b/llvm/test/Transforms/Coroutines/coro-alloca-loop-carried-address.ll index 8b8dbacbfc5c7..baec3f1a0c869 100644 --- a/llvm/test/Transforms/Coroutines/coro-alloca-loop-carried-address.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloca-loop-carried-address.ll @@ -68,7 +68,7 @@ loop: ] exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -80,6 +80,6 @@ declare i64 @llvm.coro.size.i64() declare ptr @llvm.coro.begin(token, ptr writeonly) declare token @llvm.coro.save(ptr) declare i8 @llvm.coro.suspend(token, i1) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @llvm.lifetime.start(ptr nocapture) declare void @llvm.lifetime.end(ptr nocapture) diff --git a/llvm/test/Transforms/Coroutines/coro-alloca-outside-frame.ll b/llvm/test/Transforms/Coroutines/coro-alloca-outside-frame.ll index ac6a5752438ce..e93e97fb06643 100644 --- a/llvm/test/Transforms/Coroutines/coro-alloca-outside-frame.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloca-outside-frame.ll @@ -33,7 +33,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -54,7 +54,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @print(ptr) declare noalias ptr @malloc(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-alloca-with-addrspace.ll b/llvm/test/Transforms/Coroutines/coro-alloca-with-addrspace.ll index 410d3e35e1c93..12057a953701c 100644 --- a/llvm/test/Transforms/Coroutines/coro-alloca-with-addrspace.ll +++ b/llvm/test/Transforms/Coroutines/coro-alloca-with-addrspace.ll @@ -31,7 +31,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0) + call void @llvm.coro.end(ptr %hdl, i1 0) ret ptr %hdl } @@ -50,7 +50,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1) +declare void @llvm.coro.end(ptr, i1) declare void @print(ptr) declare noalias ptr @malloc(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-infinite-loop-bug.ll b/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-infinite-loop-bug.ll index d662638d2dd9a..6562ac2e9e430 100644 --- a/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-infinite-loop-bug.ll +++ b/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-infinite-loop-bug.ll @@ -72,8 +72,8 @@ declare { ptr, ptr, ptr, ptr } 
@llvm.coro.suspend.async.sl_p0i8p0i8p0i8p0i8s(i32 declare ptr @llvm.coro.prepare.async(ptr) declare token @llvm.coro.id.async(i32, i32, i32, ptr) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end.async(ptr, i1, ...) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end.async(ptr, i1, ...) +declare void @llvm.coro.end(ptr, i1, token) declare {ptr, ptr, ptr} @llvm.coro.suspend.async(i32, ptr, ptr, ...) declare ptr @context_alloc() declare void @llvm.coro.async.context.dealloc(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll b/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll index 49c4207887340..efe6403941463 100644 --- a/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll +++ b/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll @@ -82,7 +82,7 @@ loop: loop_exit: call void @llvm.lifetime.end.p0(ptr %escaped_addr) - call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 false) + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 false) unreachable } @@ -96,8 +96,8 @@ declare { ptr, ptr, ptr, ptr } @llvm.coro.suspend.async.sl_p0i8p0i8p0i8p0i8s(i32 declare ptr @llvm.coro.prepare.async(ptr) declare token @llvm.coro.id.async(i32, i32, i32, ptr) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end.async(ptr, i1, ...) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end.async(ptr, i1, ...) +declare void @llvm.coro.end(ptr, i1, token) declare {ptr, ptr, ptr} @llvm.coro.suspend.async(i32, ptr, ptr, ...) declare ptr @context_alloc() declare void @llvm.coro.async.context.dealloc(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-async-coro-id-async-bug.ll b/llvm/test/Transforms/Coroutines/coro-async-coro-id-async-bug.ll index 3a2201f4d30c0..2405b40326eea 100644 --- a/llvm/test/Transforms/Coroutines/coro-async-coro-id-async-bug.ll +++ b/llvm/test/Transforms/Coroutines/coro-async-coro-id-async-bug.ll @@ -16,7 +16,7 @@ entry: %5 = getelementptr inbounds <{ ptr, ptr }>, ptr %4, i32 0, i32 1 %6 = load ptr, ptr %5, align 8 %7 = load ptr, ptr %1, align 8 - %8 = call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %3, i1 false, ptr @repo.0, ptr %6, ptr %7) + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %3, i1 false, ptr @repo.0, ptr %6, ptr %7) unreachable } @@ -35,6 +35,6 @@ declare token @llvm.coro.id.async(i32, i32, i32, ptr) #1 declare ptr @llvm.coro.begin(token, ptr writeonly) #1 ; Function Attrs: nounwind -declare i1 @llvm.coro.end.async(ptr, i1, ...) #1 +declare void @llvm.coro.end.async(ptr, i1, ...) #1 attributes #1 = { nounwind } diff --git a/llvm/test/Transforms/Coroutines/coro-async-declaration.ll b/llvm/test/Transforms/Coroutines/coro-async-declaration.ll index aee6aa4f78a83..2cbe5135d7c47 100644 --- a/llvm/test/Transforms/Coroutines/coro-async-declaration.ll +++ b/llvm/test/Transforms/Coroutines/coro-async-declaration.ll @@ -43,7 +43,7 @@ entry: %15 = getelementptr inbounds <{ ptr, ptr }>, ptr %14, i32 0, i32 1, !dbg !11 %16 = load ptr, ptr %15, align 8, !dbg !11 %17 = load ptr, ptr %1, align 8, !dbg !11 - %18 = call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %3, i1 false, ptr @"$s3foo3FooO3baryyYaF.0.1", ptr %16, ptr %17), !dbg !11 + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %3, i1 false, ptr @"$s3foo3FooO3baryyYaF.0.1", ptr %16, ptr %17), !dbg !11 unreachable, !dbg !11 } @@ -89,7 +89,7 @@ entry: } ; Function Attrs: nounwind -declare i1 @llvm.coro.end.async(ptr, i1, ...) 
#0 +declare void @llvm.coro.end.async(ptr, i1, ...) #0 attributes #0 = { nounwind } attributes #1 = { nomerge nounwind } diff --git a/llvm/test/Transforms/Coroutines/coro-async-dyn-align.ll b/llvm/test/Transforms/Coroutines/coro-async-dyn-align.ll index 040c9881c1ab3..ffcafca891199 100644 --- a/llvm/test/Transforms/Coroutines/coro-async-dyn-align.ll +++ b/llvm/test/Transforms/Coroutines/coro-async-dyn-align.ll @@ -27,8 +27,8 @@ declare void @llvm.coro.async.context.dealloc(ptr) declare ptr @llvm.coro.async.resume() declare token @llvm.coro.id.async(i32, i32, i32, ptr) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end.async(ptr, i1, ...) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end.async(ptr, i1, ...) +declare void @llvm.coro.end(ptr, i1, token) declare swiftcc void @asyncReturn(ptr) declare swiftcc void @asyncSuspend(ptr) declare {ptr} @llvm.coro.suspend.async(i32, ptr, ptr, ...) @@ -91,6 +91,6 @@ entry: call void @opaque(ptr %tmp4) call void @llvm.coro.async.context.dealloc(ptr %callee_context) tail call swiftcc void @asyncReturn(ptr %async.ctxt) - call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 0) + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 0) unreachable } diff --git a/llvm/test/Transforms/Coroutines/coro-async-end-bug.ll b/llvm/test/Transforms/Coroutines/coro-async-end-bug.ll index 0daa4b0c3da64..c5ce27c1328f2 100644 --- a/llvm/test/Transforms/Coroutines/coro-async-end-bug.ll +++ b/llvm/test/Transforms/Coroutines/coro-async-end-bug.ll @@ -14,7 +14,7 @@ declare token @llvm.coro.id.async(i32, i32, i32, ptr) #0 declare ptr @llvm.coro.begin(token, ptr writeonly) #0 -declare i1 @llvm.coro.end.async(ptr, i1, ...) #0 +declare void @llvm.coro.end.async(ptr, i1, ...) #0 define swifttailcc void @repo(ptr swiftasync %0, ptr noalias nocapture %1, ptr noalias nocapture %2, ptr %3, ptr %4, ptr %Self, ptr %Self.AsyncSequence, ptr %Self.Element.Comparable) #1 { entry: @@ -27,7 +27,7 @@ entry: %10 = getelementptr inbounds <{ ptr, ptr }>, ptr %9, i32 0, i32 1 %11 = load ptr, ptr %10, align 8 %12 = load ptr, ptr %5, align 8 - %13 = call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %7, i1 false, ptr @repo.0, ptr %11, ptr %12, i1 %8, ptr null) + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %7, i1 false, ptr @repo.0, ptr %11, ptr %12, i1 %8, ptr null) unreachable } diff --git a/llvm/test/Transforms/Coroutines/coro-async-no-cse-swift-async-context-addr.ll b/llvm/test/Transforms/Coroutines/coro-async-no-cse-swift-async-context-addr.ll index c898a1b0c2983..e745177e9cb28 100644 --- a/llvm/test/Transforms/Coroutines/coro-async-no-cse-swift-async-context-addr.ll +++ b/llvm/test/Transforms/Coroutines/coro-async-no-cse-swift-async-context-addr.ll @@ -41,7 +41,7 @@ entry: %11 = call ptr @llvm.swift.async.context.addr() store ptr %9, ptr %11, align 8 - %12 = call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %3, i1 false, ptr @repo.0, ptr %9, ptr %10) + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %3, i1 false, ptr @repo.0, ptr %9, ptr %10) unreachable } @@ -66,7 +66,7 @@ entry: declare { ptr } @llvm.coro.suspend.async.sl_p0i8s(i32, ptr, ptr, ...) #1 declare token @llvm.coro.id.async(i32, i32, i32, ptr) #1 declare ptr @llvm.coro.begin(token, ptr writeonly) #1 -declare i1 @llvm.coro.end.async(ptr, i1, ...) #1 +declare void @llvm.coro.end.async(ptr, i1, ...) 
#1 declare ptr @llvm.coro.async.resume() #1 declare ptr @llvm.swift.async.context.addr() #1 diff --git a/llvm/test/Transforms/Coroutines/coro-async-nomerge.ll b/llvm/test/Transforms/Coroutines/coro-async-nomerge.ll index ac39704b93da5..42652bd88bc58 100644 --- a/llvm/test/Transforms/Coroutines/coro-async-nomerge.ll +++ b/llvm/test/Transforms/Coroutines/coro-async-nomerge.ll @@ -11,7 +11,7 @@ declare { ptr } @llvm.coro.suspend.async.sl_p0i8s(i32, ptr, ptr, ...) declare ptr @llvm.coro.begin(token, ptr writeonly) declare token @llvm.coro.id.async(i32, i32, i32, ptr) -declare i1 @llvm.coro.end.async(ptr, i1, ...) +declare void @llvm.coro.end.async(ptr, i1, ...) define linkonce_odr hidden ptr @__swift_async_resume_get_context(ptr %0) { entry: @@ -53,7 +53,7 @@ bb2: br label %tailblock tailblock: - %t = call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %id, i1 false, ptr @repo.0, ptr @return, ptr %0) + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %id, i1 false, ptr @repo.0, ptr @return, ptr %0) unreachable } @@ -115,6 +115,6 @@ bb2: br label %tailblock tailblock: - %t = call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %id, i1 false, ptr @repo.0, ptr @return, ptr %0) + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %id, i1 false, ptr @repo.0, ptr @return, ptr %0) unreachable } diff --git a/llvm/test/Transforms/Coroutines/coro-async-phi.ll b/llvm/test/Transforms/Coroutines/coro-async-phi.ll index 25be1eaa059eb..7aa6857aa04eb 100644 --- a/llvm/test/Transforms/Coroutines/coro-async-phi.ll +++ b/llvm/test/Transforms/Coroutines/coro-async-phi.ll @@ -83,7 +83,7 @@ bb68: ; preds = %bb30 br label %bb126 bb126: - %i162 = call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %i12, i1 false, ptr @__swift_suspend_dispatch_2, ptr @doIt, ptr null, ptr null) + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %i12, i1 false, ptr @__swift_suspend_dispatch_2, ptr @doIt, ptr null, ptr null) unreachable } @@ -119,7 +119,7 @@ entry: declare { ptr } @llvm.coro.suspend.async.sl_p0i8s(i32, ptr, ptr, ...) #1 ; Function Attrs: nounwind -declare i1 @llvm.coro.end.async(ptr, i1, ...) #1 +declare void @llvm.coro.end.async(ptr, i1, ...) #1 ; Function Attrs: argmemonly nounwind declare extern_weak swiftcc ptr @swift_task_alloc(i64) #5 diff --git a/llvm/test/Transforms/Coroutines/coro-async.ll b/llvm/test/Transforms/Coroutines/coro-async.ll index 331d6a60bed6b..f94c6c11aa8b1 100644 --- a/llvm/test/Transforms/Coroutines/coro-async.ll +++ b/llvm/test/Transforms/Coroutines/coro-async.ll @@ -101,7 +101,7 @@ entry: call void @some_user(i64 %val.2) store <4 x double> %vector_spill, ptr %vector, align 16 tail call swiftcc void @asyncReturn(ptr %async.ctxt, ptr %continuation_task_arg, ptr %actor) - call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 0) + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 0) unreachable } @@ -211,7 +211,7 @@ entry: %continuation_actor_arg = extractvalue {ptr, ptr, ptr} %res.2, 1 tail call swiftcc void @asyncReturn(ptr %async.ctxt, ptr %continuation_task_arg, ptr %continuation_actor_arg) - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -297,7 +297,7 @@ entry: call void @llvm.coro.async.context.dealloc(ptr %callee_context) %continuation_task_arg = extractvalue {ptr, ptr, ptr} %res, 1 tail call swiftcc void @asyncReturn(ptr %async.ctxt, ptr %continuation_task_arg, ptr %actor) - call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 0) + call void (ptr, i1, ...) 
@llvm.coro.end.async(ptr %hdl, i1 0) unreachable } @@ -339,11 +339,11 @@ entry: is_equal: tail call swiftcc void @asyncReturn(ptr %async.ctxt, ptr %continuation_task_arg, ptr %actor) - call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 0) + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 0) unreachable is_not_equal: - call i1 (ptr, i1, ...) @llvm.coro.end.async( + call void (ptr, i1, ...) @llvm.coro.end.async( ptr %hdl, i1 0, ptr @must_tail_call_return, ptr %async.ctxt, ptr %continuation_task_arg, ptr null) @@ -406,7 +406,7 @@ entry: call void @some_user(i64 %val.2) tail call swiftcc void @asyncReturn(ptr %async.ctxt, ptr %continuation_task_arg, ptr %actor) - call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 0) + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 0) unreachable } @@ -431,7 +431,7 @@ entry: ptr @no_coro_suspend_fp) %hdl = call ptr @llvm.coro.begin(token %id, ptr null) call void @some_may_write(ptr %some_alloca) - call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 0) + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 0) unreachable } @@ -459,7 +459,7 @@ entry: %hdl = call ptr @llvm.coro.begin(token %id, ptr null) store ptr null, ptr %some_alloca, align 8 call void @do_with_swifterror(ptr swifterror %some_alloca) - call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 0) + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 0) unreachable } @@ -488,7 +488,7 @@ entry: %undefined_resume_pointer = call ptr @llvm.coro.async.resume() call void @use(ptr %undefined_resume_pointer) call void @crash() - %unused = call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 false) + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %hdl, i1 false) unreachable } ; CHECK-LABEL: define swiftcc void @undefined_coro_async_resume @@ -510,7 +510,7 @@ entry: %5 = getelementptr inbounds <{ ptr, ptr }>, ptr %4, i32 0, i32 1 %6 = load ptr, ptr %5, align 8 %7 = load ptr, ptr %1, align 8 - %8 = call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %3, i1 false, ptr @simpleFunc.0, ptr %6, ptr %7) + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %3, i1 false, ptr @simpleFunc.0, ptr %6, ptr %7) unreachable } @@ -529,8 +529,8 @@ declare { ptr, ptr, ptr, ptr } @llvm.coro.suspend.async.sl_p0i8p0i8p0i8p0i8s(i32 declare ptr @llvm.coro.prepare.async(ptr) declare token @llvm.coro.id.async(i32, i32, i32, ptr) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end.async(ptr, i1, ...) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end.async(ptr, i1, ...) +declare void @llvm.coro.end(ptr, i1, token) declare {ptr, ptr, ptr} @llvm.coro.suspend.async(i32, ptr, ptr, ...) 
declare ptr @llvm.coro.async.context.alloc(ptr, ptr) declare void @llvm.coro.async.context.dealloc(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll b/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll index ee64ce6e4482b..4aef572f47a35 100644 --- a/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll +++ b/llvm/test/Transforms/Coroutines/coro-await-suspend-handle-in-ramp.ll @@ -32,7 +32,7 @@ cleanup: br label %ret ret: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret void } @@ -53,7 +53,7 @@ declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) declare void @llvm.coro.await.suspend.handle(ptr, ptr, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @free(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll b/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll index fd3b7bd815300..67d179a8f9b04 100644 --- a/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll +++ b/llvm/test/Transforms/Coroutines/coro-await-suspend-lower-invoke.ll @@ -88,7 +88,7 @@ cleanup: br label %ret ret: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret void } @@ -112,7 +112,7 @@ declare ptr @llvm.coro.begin(token, ptr) declare void @llvm.coro.await.suspend.void(ptr, ptr, ptr) declare i1 @llvm.coro.await.suspend.bool(ptr, ptr, ptr) declare void @llvm.coro.await.suspend.handle(ptr, ptr, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @__cxa_begin_catch(ptr) declare void @use_val(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-await-suspend-lower.ll b/llvm/test/Transforms/Coroutines/coro-await-suspend-lower.ll index 8d019e6954628..72a158abffc6b 100644 --- a/llvm/test/Transforms/Coroutines/coro-await-suspend-lower.ll +++ b/llvm/test/Transforms/Coroutines/coro-await-suspend-lower.ll @@ -65,7 +65,7 @@ cleanup: br label %ret ret: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret void } @@ -89,7 +89,7 @@ declare ptr @llvm.coro.begin(token, ptr) declare void @llvm.coro.await.suspend.void(ptr, ptr, ptr) declare i1 @llvm.coro.await.suspend.bool(ptr, ptr, ptr) declare void @llvm.coro.await.suspend.handle(ptr, ptr, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @free(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-byval-param.ll b/llvm/test/Transforms/Coroutines/coro-byval-param.ll index 864b7cae9ca5e..95bb7be73b04f 100644 --- a/llvm/test/Transforms/Coroutines/coro-byval-param.ll +++ b/llvm/test/Transforms/Coroutines/coro-byval-param.ll @@ -52,7 +52,7 @@ coro.free: ; preds = %cleanup33 br label %coro.ret coro.ret: ; preds = %coro.free, %cleanup33, %init.ready, %coro.init - %10 = call i1 @llvm.coro.end(ptr null, i1 false, token none) #10 + call void @llvm.coro.end(ptr null, i1 false, token none) #10 ret ptr %call2 } @@ -103,7 +103,7 @@ declare i8 @llvm.coro.suspend(token, i1) #2 declare void @_ZN4task12promise_type13final_suspendEv(ptr nonnull dereferenceable(1)) local_unnamed_addr #7 align 2 ; Function Attrs: nounwind -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, 
token) #2 ; Function Attrs: nobuiltin nounwind declare void @_ZdlPv(ptr) local_unnamed_addr #8 diff --git a/llvm/test/Transforms/Coroutines/coro-catchswitch-cleanuppad.ll b/llvm/test/Transforms/Coroutines/coro-catchswitch-cleanuppad.ll index 2f6d23da82692..d0e7c1c29eb32 100644 --- a/llvm/test/Transforms/Coroutines/coro-catchswitch-cleanuppad.ll +++ b/llvm/test/Transforms/Coroutines/coro-catchswitch-cleanuppad.ll @@ -37,7 +37,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl catch.dispatch.1: @@ -106,7 +106,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @print(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-catchswitch.ll b/llvm/test/Transforms/Coroutines/coro-catchswitch.ll index 3cf6dc86f2c6d..4332f2df03d15 100644 --- a/llvm/test/Transforms/Coroutines/coro-catchswitch.ll +++ b/llvm/test/Transforms/Coroutines/coro-catchswitch.ll @@ -54,7 +54,7 @@ resume: ; preds = %await2.suspend br label %coro.ret coro.ret: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret void cleanuppad: @@ -80,7 +80,7 @@ declare void @print(i32) declare noalias ptr @malloc(i32) declare void @free(ptr) -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 ; Function Attrs: nobuiltin nounwind diff --git a/llvm/test/Transforms/Coroutines/coro-debug-O2.ll b/llvm/test/Transforms/Coroutines/coro-debug-O2.ll index cc1dbcd1c80fd..4daaa5960f0ae 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug-O2.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-O2.ll @@ -112,7 +112,7 @@ cleanup.cont: ; preds = %after.coro.free br label %coro.ret coro.ret: ; preds = %cleanup.cont, %after.coro.free, %final.suspend, %await.suspend, %init.suspend - %end = call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void unreachable: ; preds = %after.coro.free @@ -128,7 +128,7 @@ declare token @llvm.coro.save(ptr) declare ptr @llvm.coro.begin(token, ptr writeonly) declare i8 @llvm.coro.suspend(token, i1) declare ptr @llvm.coro.free(token, ptr nocapture readonly) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @new(i64) declare void @delete(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-debug-coro-frame.ll b/llvm/test/Transforms/Coroutines/coro-debug-coro-frame.ll index dff064ec084c9..f2aedefcfd381 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug-coro-frame.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-coro-frame.ll @@ -205,7 +205,7 @@ cleanup.cont: ; preds = %after.coro.free br label %coro.ret coro.ret: ; preds = %cleanup.cont, %after.coro.free, %final.suspend, %await.suspend, %init.suspend - %end = call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void unreachable: ; preds = %after.coro.free @@ -334,7 +334,7 @@ cleanup.cont: ; preds = %after.coro.free br label %coro.ret coro.ret: ; preds = %cleanup.cont, %after.coro.free, %final.suspend, %await.suspend, %init.suspend - %end = call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, 
token none) ret void unreachable: ; preds = %after.coro.free @@ -350,7 +350,7 @@ declare token @llvm.coro.save(ptr) declare ptr @llvm.coro.begin(token, ptr writeonly) declare i8 @llvm.coro.suspend(token, i1) declare ptr @llvm.coro.free(token, ptr nocapture readonly) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @new(i64) declare void @delete(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll index deaec7b8d7f89..483c1a8e8608a 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll @@ -123,7 +123,7 @@ cleanup.cont: ; preds = %after.coro.free br label %coro.ret coro.ret: ; preds = %cleanup.cont, %after.coro.free, %final.suspend, %await.suspend, %init.suspend - %end = call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void unreachable: ; preds = %after.coro.free @@ -155,7 +155,7 @@ declare i8 @llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 ; Function Attrs: nounwind -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @new(i64) diff --git a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll index 0934393a667ee..c524f38432ed1 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll @@ -162,7 +162,7 @@ cleanup.cont: ; preds = %after.coro.free br label %coro.ret coro.ret: ; preds = %cleanup.cont, %after.coro.free, %final.suspend, %await.suspend, %init.suspend - %end = call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void unreachable: ; preds = %after.coro.free @@ -194,7 +194,7 @@ declare i8 @llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 ; Function Attrs: nounwind -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @new(i64) diff --git a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll index 125ec752c8345..0ef24a6b1e2a6 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll @@ -186,7 +186,7 @@ cleanup.cont: ; preds = %after.coro.free br label %coro.ret coro.ret: ; preds = %cleanup.cont, %after.coro.free, %final.suspend, %await.suspend, %init.suspend - %end = call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void unreachable: ; preds = %after.coro.free @@ -201,7 +201,7 @@ declare token @llvm.coro.save(ptr) declare ptr @llvm.coro.begin(token, ptr writeonly) declare i8 @llvm.coro.suspend(token, i1) declare ptr @llvm.coro.free(token, ptr nocapture readonly) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @new(i64) declare void @delete(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-debug-spill-dbg.declare.ll b/llvm/test/Transforms/Coroutines/coro-debug-spill-dbg.declare.ll index 59a4b5b2dfbc8..bd9eb2036e6fb 100644 --- 
a/llvm/test/Transforms/Coroutines/coro-debug-spill-dbg.declare.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-spill-dbg.declare.ll @@ -74,7 +74,7 @@ cleanup: ; preds = %resume, %coro.begin br label %suspend suspend: ; preds = %cleanup, %coro.begin - %2 = call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret ptr %hdl } @@ -104,7 +104,7 @@ declare i1 @llvm.coro.alloc(token) #4 declare ptr @llvm.coro.begin(token, ptr writeonly) #4 ; Function Attrs: nounwind -declare i1 @llvm.coro.end(ptr, i1, token) #4 +declare void @llvm.coro.end(ptr, i1, token) #4 declare noalias ptr @malloc(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-debug.ll b/llvm/test/Transforms/Coroutines/coro-debug.ll index 5f8e9c9c1d16d..d1f1922c2a92f 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug.ll @@ -69,10 +69,10 @@ coro_Cleanup: ; preds = %sw.epilog, %sw.bb1 br label %coro_Suspend, !dbg !24 coro_Suspend: ; preds = %coro_Cleanup, %sw.default - %7 = call i1 @llvm.coro.end(ptr null, i1 false, token none) #7, !dbg !24 - %8 = load ptr, ptr %coro_hdl, align 8, !dbg !24 + call void @llvm.coro.end(ptr null, i1 false, token none) #7, !dbg !24 + %7 = load ptr, ptr %coro_hdl, align 8, !dbg !24 store i32 0, ptr %late_local, !dbg !24 - ret ptr %8, !dbg !24 + ret ptr %7, !dbg !24 ehcleanup: %ex = landingpad { ptr, i32 } @@ -110,7 +110,7 @@ declare void @free(ptr) #3 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2 ; Function Attrs: nounwind -declare i1 @llvm.coro.end(ptr, i1, token) #5 +declare void @llvm.coro.end(ptr, i1, token) #5 ; Function Attrs: argmemonly nounwind readonly declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #2 diff --git a/llvm/test/Transforms/Coroutines/coro-early-twice.ll b/llvm/test/Transforms/Coroutines/coro-early-twice.ll index 39ec0ccc6fdb8..e4df0071bcc93 100644 --- a/llvm/test/Transforms/Coroutines/coro-early-twice.ll +++ b/llvm/test/Transforms/Coroutines/coro-early-twice.ll @@ -22,7 +22,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -33,4 +33,4 @@ declare ptr @llvm.coro.begin(token, ptr) declare i8 @llvm.coro.suspend(token, i1) declare ptr @llvm.coro.free(token, ptr) declare void @free(ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) diff --git a/llvm/test/Transforms/Coroutines/coro-eh-aware-edge-split-00.ll b/llvm/test/Transforms/Coroutines/coro-eh-aware-edge-split-00.ll index 2f5b989a620e0..ad84f7b33dc65 100644 --- a/llvm/test/Transforms/Coroutines/coro-eh-aware-edge-split-00.ll +++ b/llvm/test/Transforms/Coroutines/coro-eh-aware-edge-split-00.ll @@ -65,7 +65,7 @@ cleanup: ; preds = %invoke.cont15, %if.el br label %coro.ret coro.ret: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void unreach: @@ -92,6 +92,6 @@ declare void @use_val(i32) declare void @__cxa_end_catch() ; Function Attrs: nounwind -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @free(ptr) declare ptr @llvm.coro.free(token, ptr nocapture readonly) diff --git a/llvm/test/Transforms/Coroutines/coro-eh-aware-edge-split-01.ll b/llvm/test/Transforms/Coroutines/coro-eh-aware-edge-split-01.ll index d896c6a18b233..0b9bce5f9ad77 100644 --- 
a/llvm/test/Transforms/Coroutines/coro-eh-aware-edge-split-01.ll +++ b/llvm/test/Transforms/Coroutines/coro-eh-aware-edge-split-01.ll @@ -59,7 +59,7 @@ cleanup: ; preds = %invoke.cont15, %if.el br label %coro.ret coro.ret: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void unreach: @@ -86,6 +86,6 @@ declare void @use_val(i32) declare void @__cxa_end_catch() ; Function Attrs: nounwind -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @free(ptr) declare ptr @llvm.coro.free(token, ptr nocapture readonly) diff --git a/llvm/test/Transforms/Coroutines/coro-eh-aware-edge-split-02.ll b/llvm/test/Transforms/Coroutines/coro-eh-aware-edge-split-02.ll index 79aa58b85eda8..6202df1fe00e6 100644 --- a/llvm/test/Transforms/Coroutines/coro-eh-aware-edge-split-02.ll +++ b/llvm/test/Transforms/Coroutines/coro-eh-aware-edge-split-02.ll @@ -59,7 +59,7 @@ cleanup: ; preds = %invoke.cont15, %if.el br label %coro.ret coro.ret: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -83,6 +83,6 @@ declare void @use_val(i32) declare void @__cxa_end_catch() ; Function Attrs: nounwind -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @free(ptr) declare ptr @llvm.coro.free(token, ptr nocapture readonly) diff --git a/llvm/test/Transforms/Coroutines/coro-frame-arrayalloca.ll b/llvm/test/Transforms/Coroutines/coro-frame-arrayalloca.ll index 7d5ddabf7ea8e..722ff81bc0cad 100644 --- a/llvm/test/Transforms/Coroutines/coro-frame-arrayalloca.ll +++ b/llvm/test/Transforms/Coroutines/coro-frame-arrayalloca.ll @@ -30,7 +30,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -65,7 +65,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare double @print(double) diff --git a/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-00.ll b/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-00.ll index bf08d6ff0b205..a2c9d58469427 100644 --- a/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-00.ll +++ b/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-00.ll @@ -50,7 +50,7 @@ cleanup: call void @free(ptr %mem) br label %coro.ret coro.ret: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -66,7 +66,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare double @print(double) diff --git a/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-01.ll b/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-01.ll index 78c6f0cacd695..e7c28d92674ba 100644 --- a/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-01.ll +++ b/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-01.ll @@ -53,7 +53,7 @@ cleanup: call ptr @llvm.coro.free(token %0, ptr %1) br label %coro.ret coro.ret: - call 
i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -68,6 +68,6 @@ declare token @llvm.coro.save(ptr) #3 declare ptr @llvm.coro.frame() #5 declare i8 @llvm.coro.suspend(token, i1) #3 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2 -declare i1 @llvm.coro.end(ptr, i1, token) #3 +declare void @llvm.coro.end(ptr, i1, token) #3 declare void @llvm.lifetime.start.p0(ptr nocapture) #4 declare void @llvm.lifetime.end.p0(ptr nocapture) #4 diff --git a/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-02.ll b/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-02.ll index 82657318d7785..b75995f3eaa8a 100644 --- a/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-02.ll +++ b/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-02.ll @@ -55,7 +55,7 @@ cleanup: call ptr @llvm.coro.free(token %0, ptr %1) br label %coro.ret coro.ret: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } ; CHECK: %a.Frame = type { ptr, ptr, %"struct.task::promise_type", %struct.big_structure, i1 } @@ -69,6 +69,6 @@ declare token @llvm.coro.save(ptr) #3 declare ptr @llvm.coro.frame() #5 declare i8 @llvm.coro.suspend(token, i1) #3 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2 -declare i1 @llvm.coro.end(ptr, i1, token) #3 +declare void @llvm.coro.end(ptr, i1, token) #3 declare void @llvm.lifetime.start.p0(ptr nocapture) #4 declare void @llvm.lifetime.end.p0(ptr nocapture) #4 diff --git a/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-03.ll b/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-03.ll index 66d41372cd9e7..427d8984e126c 100644 --- a/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-03.ll +++ b/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-03.ll @@ -50,7 +50,7 @@ cleanup: call void @free(ptr %mem) br label %coro.ret coro.ret: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -66,7 +66,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare double @print(double) diff --git a/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-04.ll b/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-04.ll index 6ff31e566283b..81a5dcc1d3858 100644 --- a/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-04.ll +++ b/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-04.ll @@ -55,7 +55,7 @@ cleanup: call ptr @llvm.coro.free(token %0, ptr %1) br label %coro.ret coro.ret: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } ; CHECK: %a.Frame = type { ptr, ptr, %"struct.task::promise_type", %struct.big_structure, i1, [26 x i8], %struct.big_structure.2 } @@ -69,6 +69,6 @@ declare token @llvm.coro.save(ptr) #3 declare ptr @llvm.coro.frame() #5 declare i8 @llvm.coro.suspend(token, i1) #3 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2 -declare i1 @llvm.coro.end(ptr, i1, token) #3 +declare void @llvm.coro.end(ptr, i1, token) #3 declare void @llvm.lifetime.start.p0(ptr nocapture) #4 declare void @llvm.lifetime.end.p0(ptr nocapture) #4 diff --git a/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-05.ll 
b/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-05.ll index c3da8e872dc07..6caa41f32f26e 100644 --- a/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-05.ll +++ b/llvm/test/Transforms/Coroutines/coro-frame-reuse-alloca-05.ll @@ -55,7 +55,7 @@ cleanup: call ptr @llvm.coro.free(token %0, ptr %1) br label %coro.ret coro.ret: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } ; CHECK: %a.Frame = type { ptr, ptr, %"struct.task::promise_type", i1, [14 x i8], %struct.big_structure } @@ -69,6 +69,6 @@ declare token @llvm.coro.save(ptr) #3 declare ptr @llvm.coro.frame() #5 declare i8 @llvm.coro.suspend(token, i1) #3 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2 -declare i1 @llvm.coro.end(ptr, i1, token) #3 +declare void @llvm.coro.end(ptr, i1, token) #3 declare void @llvm.lifetime.start.p0(ptr nocapture) #4 declare void @llvm.lifetime.end.p0(ptr nocapture) #4 diff --git a/llvm/test/Transforms/Coroutines/coro-frame-unreachable.ll b/llvm/test/Transforms/Coroutines/coro-frame-unreachable.ll index b81f7d0ed7eac..3d290554e22c9 100644 --- a/llvm/test/Transforms/Coroutines/coro-frame-unreachable.ll +++ b/llvm/test/Transforms/Coroutines/coro-frame-unreachable.ll @@ -24,7 +24,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl no.predecessors: @@ -43,7 +43,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @print(i1) diff --git a/llvm/test/Transforms/Coroutines/coro-frame.ll b/llvm/test/Transforms/Coroutines/coro-frame.ll index c20be8ce2ff68..d25d335fe63c6 100644 --- a/llvm/test/Transforms/Coroutines/coro-frame.ll +++ b/llvm/test/Transforms/Coroutines/coro-frame.ll @@ -26,7 +26,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl pad: %tok = cleanuppad within none [] @@ -58,7 +58,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare double @print(double) diff --git a/llvm/test/Transforms/Coroutines/coro-lifetime-end.ll b/llvm/test/Transforms/Coroutines/coro-lifetime-end.ll index df2ed7e4bcead..503b93ea76a02 100644 --- a/llvm/test/Transforms/Coroutines/coro-lifetime-end.ll +++ b/llvm/test/Transforms/Coroutines/coro-lifetime-end.ll @@ -43,7 +43,7 @@ entry: await.ready: br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -80,7 +80,7 @@ entry: await.ready: br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) call void @llvm.lifetime.end.p0(ptr %testval) ret void } @@ -128,7 +128,7 @@ if.end: await.ready: br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -137,6 +137,6 @@ declare token @llvm.coro.id(i32, 
ptr readnone, ptr nocapture readonly, ptr) declare ptr @llvm.coro.begin(token, ptr writeonly) #3 declare ptr @llvm.coro.frame() #5 declare i8 @llvm.coro.suspend(token, i1) #3 -declare i1 @llvm.coro.end(ptr, i1, token) #3 +declare void @llvm.coro.end(ptr, i1, token) #3 declare void @llvm.lifetime.start.p0(ptr nocapture) #4 declare void @llvm.lifetime.end.p0(ptr nocapture) #4 diff --git a/llvm/test/Transforms/Coroutines/coro-materialize.ll b/llvm/test/Transforms/Coroutines/coro-materialize.ll index 3bf1360001dc7..f55db35edb3ea 100644 --- a/llvm/test/Transforms/Coroutines/coro-materialize.ll +++ b/llvm/test/Transforms/Coroutines/coro-materialize.ll @@ -41,7 +41,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -73,7 +73,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -109,7 +109,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -145,7 +145,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -186,7 +186,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -200,7 +200,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @print(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-noalias-param.ll b/llvm/test/Transforms/Coroutines/coro-noalias-param.ll index e48ad8fddd5fe..77068b8d75367 100644 --- a/llvm/test/Transforms/Coroutines/coro-noalias-param.ll +++ b/llvm/test/Transforms/Coroutines/coro-noalias-param.ll @@ -19,7 +19,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret void } @@ -33,7 +33,7 @@ declare i32 @llvm.coro.size.i32() declare i8 @llvm.coro.suspend(token, i1) declare void @llvm.coro.resume(ptr) declare void @llvm.coro.destroy(ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @print(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-only-destroy-when-complete.ll b/llvm/test/Transforms/Coroutines/coro-only-destroy-when-complete.ll index c3d0fb1a18dd2..e40ac4e0ec162 100644 --- a/llvm/test/Transforms/Coroutines/coro-only-destroy-when-complete.ll +++ b/llvm/test/Transforms/Coroutines/coro-only-destroy-when-complete.ll @@ -90,7 +90,7 @@ coro.free: ; preds = %cleanup62 br label %coro.ret coro.ret: ; preds = %coro.free, %cleanup62, %final.suspend, %await2.suspend, %await.suspend, %init.suspend - %19 = call i1 @llvm.coro.end(ptr null, i1 false, token none) #12 + call void @llvm.coro.end(ptr null, i1 false, token none) #12 ret ptr %__promise } @@ -106,7 +106,7 @@ declare i8 @llvm.coro.suspend(token, i1) #3 declare ptr @_Z5Innerv() 
local_unnamed_addr declare dso_local void @_ZdlPv(ptr noundef) local_unnamed_addr #8 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2 -declare i1 @llvm.coro.end(ptr, i1, token) #3 +declare void @llvm.coro.end(ptr, i1, token) #3 declare void @exit(i32 noundef) declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #10 declare void @dtor1() diff --git a/llvm/test/Transforms/Coroutines/coro-padding.ll b/llvm/test/Transforms/Coroutines/coro-padding.ll index 452b83bad388a..1de12a4f44a57 100644 --- a/llvm/test/Transforms/Coroutines/coro-padding.ll +++ b/llvm/test/Transforms/Coroutines/coro-padding.ll @@ -26,7 +26,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -54,7 +54,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare double @print(double) diff --git a/llvm/test/Transforms/Coroutines/coro-param-copy.ll b/llvm/test/Transforms/Coroutines/coro-param-copy.ll index 717ca46651414..f0e816bba9584 100644 --- a/llvm/test/Transforms/Coroutines/coro-param-copy.ll +++ b/llvm/test/Transforms/Coroutines/coro-param-copy.ll @@ -41,7 +41,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -83,7 +83,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @llvm.memset.p0.i32(ptr, i8, i32, i1) diff --git a/llvm/test/Transforms/Coroutines/coro-readnone-02.ll b/llvm/test/Transforms/Coroutines/coro-readnone-02.ll index 4ed962816154b..3ada99070967c 100644 --- a/llvm/test/Transforms/Coroutines/coro-readnone-02.ll +++ b/llvm/test/Transforms/Coroutines/coro-readnone-02.ll @@ -39,7 +39,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -75,7 +75,7 @@ declare i8 @llvm.coro.suspend(token, i1) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @free(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-readnone.ll b/llvm/test/Transforms/Coroutines/coro-readnone.ll index 1fc91cefaf975..321ddab3ae4d7 100644 --- a/llvm/test/Transforms/Coroutines/coro-readnone.ll +++ b/llvm/test/Transforms/Coroutines/coro-readnone.ll @@ -33,7 +33,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -83,7 +83,7 @@ declare i8 @llvm.coro.suspend(token, i1) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @free(ptr) diff --git 
a/llvm/test/Transforms/Coroutines/coro-retcon-alloca-opaque-ptr.ll b/llvm/test/Transforms/Coroutines/coro-retcon-alloca-opaque-ptr.ll index b23c5222a3deb..94ed43ee43f2e 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-alloca-opaque-ptr.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-alloca-opaque-ptr.ll @@ -34,7 +34,7 @@ resume: br label %loop cleanup: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -69,7 +69,7 @@ resume: br label %loop cleanup: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -77,7 +77,7 @@ declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) declare ptr @llvm.coro.begin(token, ptr) declare i1 @llvm.coro.suspend.retcon.i1(...) declare void @llvm.coro.suspend.retcon.isVoid(...) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @llvm.coro.prepare.retcon(ptr) declare token @llvm.coro.alloca.alloc.i32(i32, i32) declare ptr @llvm.coro.alloca.get(token) diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-alloca.ll b/llvm/test/Transforms/Coroutines/coro-retcon-alloca.ll index aeb959e5ce711..6caa571cf1bbc 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-alloca.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-alloca.ll @@ -33,7 +33,7 @@ resume: br label %loop cleanup: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -69,7 +69,7 @@ resume: br label %loop cleanup: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -102,7 +102,7 @@ resume: br label %loop cleanup: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -170,7 +170,7 @@ forward: br label %back end: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -217,7 +217,7 @@ non_alloca_block: br label %suspend cleanup: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -225,7 +225,7 @@ declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) declare ptr @llvm.coro.begin(token, ptr) declare i1 @llvm.coro.suspend.retcon.i1(...) declare void @llvm.coro.suspend.retcon.isVoid(...) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @llvm.coro.prepare.retcon(ptr) declare token @llvm.coro.alloca.alloc.i32(i32, i32) declare ptr @llvm.coro.alloca.get(token) diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-frame.ll b/llvm/test/Transforms/Coroutines/coro-retcon-frame.ll index a81cdf475ae31..780f24e124a51 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-frame.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-frame.ll @@ -32,7 +32,7 @@ resume: br label %end end: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } ; Make sure we don't lose writes to the frame. @@ -52,5 +52,5 @@ end: declare token @llvm.coro.id.retcon.once(i32, i32, ptr, ptr, ptr, ptr) declare ptr @llvm.coro.begin(token, ptr) declare i1 @llvm.coro.suspend.retcon.i1(...) 
-declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-once-private.ll b/llvm/test/Transforms/Coroutines/coro-retcon-once-private.ll index e7593cc8c6f81..35eb2e4df705b 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-once-private.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-once-private.ll @@ -30,14 +30,14 @@ neg.cont: br label %cleanup cleanup: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } declare token @llvm.coro.id.retcon.once(i32, i32, ptr, ptr, ptr, ptr) declare ptr @llvm.coro.begin(token, ptr) declare i1 @llvm.coro.suspend.retcon.i1(...) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @llvm.coro.prepare.retcon(ptr) declare void @prototype(ptr, i1 zeroext) diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll b/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll index fa10ddefee00e..026e23913d647 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll @@ -29,7 +29,7 @@ neg.cont: br label %cleanup cleanup: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -72,7 +72,7 @@ neg.cont: cleanup: %new.val = add i32 %val, 123 %tok = call token (...) @llvm.coro.end.results(ptr null, i32 %new.val, ptr @deallocate) - call i1 @llvm.coro.end(ptr %hdl, i1 0, token %tok) + call void @llvm.coro.end(ptr %hdl, i1 0, token %tok) unreachable } @@ -96,7 +96,7 @@ entry: declare token @llvm.coro.id.retcon.once(i32, i32, ptr, ptr, ptr, ptr) declare ptr @llvm.coro.begin(token, ptr) declare i1 @llvm.coro.suspend.retcon.i1(...) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare token @llvm.coro.end.results(...) declare ptr @llvm.coro.prepare.retcon(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-once-value2.ll b/llvm/test/Transforms/Coroutines/coro-retcon-once-value2.ll index c33e60e98cd8b..aad762e2c9335 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-once-value2.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-once-value2.ll @@ -19,7 +19,7 @@ cont: br label %cleanup cleanup: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -41,7 +41,7 @@ cont: cleanup: %tok = call token (...) @llvm.coro.end.results(i8 %val) - call i1 @llvm.coro.end(ptr %hdl, i1 0, token %tok) + call void @llvm.coro.end(ptr %hdl, i1 0, token %tok) unreachable } @@ -63,7 +63,7 @@ cont: cleanup: %tok = call token (...) @llvm.coro.end.results(ptr null, i32 123, ptr @deallocate) - call i1 @llvm.coro.end(ptr %hdl, i1 0, token %tok) + call void @llvm.coro.end(ptr %hdl, i1 0, token %tok) unreachable } @@ -71,7 +71,7 @@ cleanup: declare token @llvm.coro.id.retcon.once(i32, i32, ptr, ptr, ptr, ptr) declare ptr @llvm.coro.begin(token, ptr) declare i1 @llvm.coro.suspend.retcon.i1(...) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare token @llvm.coro.end.results(...) 
declare void @prototype(ptr, i1 zeroext) diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-opaque-ptr.ll b/llvm/test/Transforms/Coroutines/coro-retcon-opaque-ptr.ll index 1908b31f52db3..5484fec1b3ce4 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-opaque-ptr.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-opaque-ptr.ll @@ -26,7 +26,7 @@ resume: ; preds = %loop br label %loop cleanup: ; preds = %loop - %0 = call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) unreachable } @@ -72,14 +72,14 @@ resume: ; preds = %loop br label %loop cleanup: ; preds = %loop - %0 = call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) unreachable } declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) declare ptr @llvm.coro.begin(token, ptr) declare i1 @llvm.coro.suspend.retcon.i1(...) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @llvm.coro.prepare.retcon(ptr) declare ptr @prototype(ptr, i1 zeroext) diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-remat.ll b/llvm/test/Transforms/Coroutines/coro-retcon-remat.ll index fd16ba96181b9..160754e7d11c3 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-remat.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-remat.ll @@ -31,14 +31,14 @@ resume1: br label %loop cleanup: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) declare ptr @llvm.coro.begin(token, ptr) declare i1 @llvm.coro.suspend.retcon.i1(...) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @llvm.coro.prepare.retcon(ptr) declare { ptr, i32 } @f_prototype(ptr, i1 zeroext) diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll index 907d7e588ffe0..2f04453d69c4b 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll @@ -27,7 +27,7 @@ resume: cleanup: call void @print(i32 %n.val) - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -66,7 +66,7 @@ entry: declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) declare ptr @llvm.coro.begin(token, ptr) declare { i32, i1 } @llvm.coro.suspend.retcon.sl_i32i1s(...) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @llvm.coro.prepare.retcon(ptr) declare ptr @prototype(ptr, i32, i1 zeroext) diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values2.ll b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values2.ll index 2caa6430ca012..a19c1ca0e7f3a 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values2.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values2.ll @@ -15,14 +15,14 @@ entry: %sum4 = call i32 @add(i32 %sum3, i32 %value1) %sum5 = call i32 @add(i32 %sum4, i32 %value2) call void @print(i32 %sum5) - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) declare ptr @llvm.coro.begin(token, ptr) declare i32 @llvm.coro.suspend.retcon.i32(...) 
-declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @llvm.coro.prepare.retcon(ptr) declare ptr @prototype(ptr, i32) diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-unreachable.ll b/llvm/test/Transforms/Coroutines/coro-retcon-unreachable.ll index 31839aa2a2b5f..6e4a287e53b0a 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-unreachable.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-unreachable.ll @@ -23,7 +23,7 @@ define hidden swiftcc { ptr, ptr } @no_suspends(ptr %buffer, i64 %arg) #1 { bb1: call void @print(i64 %arg) - call i1 @llvm.coro.end(ptr %begin, i1 false, token none) + call void @llvm.coro.end(ptr %begin, i1 false, token none) unreachable } @@ -41,7 +41,7 @@ declare void @llvm.lifetime.start.p0(ptr nocapture) #6 declare i1 @llvm.coro.suspend.retcon.i1(...) #5 declare void @llvm.lifetime.end.p0(ptr nocapture) #6 declare void @llvm.coro.alloca.free(token) #5 -declare i1 @llvm.coro.end(ptr, i1, token) #5 +declare void @llvm.coro.end(ptr, i1, token) #5 declare void @llvm.trap() diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-value.ll b/llvm/test/Transforms/Coroutines/coro-retcon-value.ll index 6a150c6a79807..d456c3b1cb2a3 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-value.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-value.ll @@ -25,7 +25,7 @@ resume: br label %loop cleanup: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -64,7 +64,7 @@ entry: declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) declare ptr @llvm.coro.begin(token, ptr) declare i8 @llvm.coro.suspend.retcon.i8(...) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @llvm.coro.prepare.retcon(ptr) declare {ptr, i32} @prototype(ptr, i8 zeroext) diff --git a/llvm/test/Transforms/Coroutines/coro-retcon.ll b/llvm/test/Transforms/Coroutines/coro-retcon.ll index e0484c6d66941..86eba3b5d134f 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon.ll @@ -33,7 +33,7 @@ resume: br label %loop cleanup: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -105,7 +105,7 @@ resume: br label %loop cleanup: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -139,14 +139,14 @@ cleanup: call void @use_var_ptr(ptr %a) %al = load i32, ptr %a call void @use_var(i32 %al) - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) declare ptr @llvm.coro.begin(token, ptr) declare i1 @llvm.coro.suspend.retcon.i1(...) 
-declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @llvm.coro.prepare.retcon(ptr) declare void @use_var(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-spill-after-phi.ll b/llvm/test/Transforms/Coroutines/coro-spill-after-phi.ll index bb43d8f4b3399..273ac7c5293d9 100644 --- a/llvm/test/Transforms/Coroutines/coro-spill-after-phi.ll +++ b/llvm/test/Transforms/Coroutines/coro-spill-after-phi.ll @@ -50,7 +50,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -63,7 +63,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare i32 @print(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-spill-corobegin.ll b/llvm/test/Transforms/Coroutines/coro-spill-corobegin.ll index f238955d1c3e9..bfc48adfed22a 100644 --- a/llvm/test/Transforms/Coroutines/coro-spill-corobegin.ll +++ b/llvm/test/Transforms/Coroutines/coro-spill-corobegin.ll @@ -33,7 +33,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -62,7 +62,7 @@ declare i8 @llvm.coro.suspend(token, i1) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @print.i32(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-spill-defs-before-corobegin.ll b/llvm/test/Transforms/Coroutines/coro-spill-defs-before-corobegin.ll index 801c4a1776135..16df22b19fd11 100644 --- a/llvm/test/Transforms/Coroutines/coro-spill-defs-before-corobegin.ll +++ b/llvm/test/Transforms/Coroutines/coro-spill-defs-before-corobegin.ll @@ -36,7 +36,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl lpad: @@ -70,7 +70,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare i32 @print(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-spill-promise-02.ll b/llvm/test/Transforms/Coroutines/coro-spill-promise-02.ll index 3293e5c84b987..a3888f0eff562 100644 --- a/llvm/test/Transforms/Coroutines/coro-spill-promise-02.ll +++ b/llvm/test/Transforms/Coroutines/coro-spill-promise-02.ll @@ -29,7 +29,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -57,7 +57,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare double @print(double) diff --git 
a/llvm/test/Transforms/Coroutines/coro-spill-promise.ll b/llvm/test/Transforms/Coroutines/coro-spill-promise.ll index 47e891a57d222..344d77bd54930 100644 --- a/llvm/test/Transforms/Coroutines/coro-spill-promise.ll +++ b/llvm/test/Transforms/Coroutines/coro-spill-promise.ll @@ -28,7 +28,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -48,7 +48,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare double @print(double) diff --git a/llvm/test/Transforms/Coroutines/coro-spill-suspend.ll b/llvm/test/Transforms/Coroutines/coro-spill-suspend.ll index 8de02c8b7de23..6a80f3637fd10 100644 --- a/llvm/test/Transforms/Coroutines/coro-spill-suspend.ll +++ b/llvm/test/Transforms/Coroutines/coro-spill-suspend.ll @@ -49,7 +49,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } diff --git a/llvm/test/Transforms/Coroutines/coro-split-00.ll b/llvm/test/Transforms/Coroutines/coro-split-00.ll index 9909627e60597..06f71c848e250 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-00.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-00.ll @@ -28,7 +28,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -86,7 +86,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) allockind("alloc,uninitialized") "alloc-family"="malloc" declare void @print(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-split-01.ll b/llvm/test/Transforms/Coroutines/coro-split-01.ll index 7a03495e75d8d..e74e927839dff 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-01.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-01.ll @@ -26,7 +26,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } define i32 @main() { @@ -49,7 +49,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @print(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-split-02.ll b/llvm/test/Transforms/Coroutines/coro-split-02.ll index c487ab1e42ff6..a11ea466af29c 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-02.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-02.ll @@ -34,7 +34,7 @@ await.ready: call void @print(i32 %val) br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -60,6 +60,6 @@ declare ptr @llvm.coro.frame() #5 declare i8 @llvm.coro.suspend(token, i1) #3 declare void @"\01??3@YAXPEAX@Z"(ptr) 
local_unnamed_addr #10 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2 -declare i1 @llvm.coro.end(ptr, i1, token) #3 +declare void @llvm.coro.end(ptr, i1, token) #3 declare void @llvm.lifetime.start.p0(ptr nocapture) #4 declare void @llvm.lifetime.end.p0(ptr nocapture) #4 diff --git a/llvm/test/Transforms/Coroutines/coro-split-alloc.ll b/llvm/test/Transforms/Coroutines/coro-split-alloc.ll index f6f50e2f3c76c..9a5e97fedd97d 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-alloc.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-alloc.ll @@ -33,7 +33,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -57,7 +57,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @custom_alloctor(i32, i32) declare void @print(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-split-dbg-labels-inlined.ll b/llvm/test/Transforms/Coroutines/coro-split-dbg-labels-inlined.ll index e9737b62b0b8f..995795b8de1fa 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-dbg-labels-inlined.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-dbg-labels-inlined.ll @@ -28,7 +28,7 @@ coro_Cleanup: br label %coro_Suspend, !dbg !37 coro_Suspend: - tail call i1 @llvm.coro.end(ptr null, i1 false, token none) #3, !dbg !40 + tail call void @llvm.coro.end(ptr null, i1 false, token none) #3, !dbg !40 ret ptr %2, !dbg !41 } @@ -60,7 +60,7 @@ declare token @llvm.coro.save(ptr) #0 declare i8 @llvm.coro.suspend(token, i1) #0 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #4 declare void @free(ptr nocapture) local_unnamed_addr #0 -declare i1 @llvm.coro.end(ptr, i1, token) #0 +declare void @llvm.coro.end(ptr, i1, token) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/Transforms/Coroutines/coro-split-dbg-labels.ll b/llvm/test/Transforms/Coroutines/coro-split-dbg-labels.ll index 490e4fc102349..4c5d9fb81c272 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-dbg-labels.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-dbg-labels.ll @@ -64,7 +64,7 @@ coro_Cleanup: ; preds = %for.cond br label %coro_Suspend, !dbg !37 coro_Suspend: ; preds = %for.cond, %if.then, %coro_Cleanup - tail call i1 @llvm.coro.end(ptr null, i1 false, token none) #3, !dbg !40 + tail call void @llvm.coro.end(ptr null, i1 false, token none) #3, !dbg !40 ret ptr %2, !dbg !41 } @@ -115,7 +115,7 @@ declare token @llvm.coro.save(ptr) #0 declare i8 @llvm.coro.suspend(token, i1) #0 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #4 declare void @free(ptr nocapture) local_unnamed_addr #0 -declare i1 @llvm.coro.end(ptr, i1, token) #0 +declare void @llvm.coro.end(ptr, i1, token) #0 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } diff --git a/llvm/test/Transforms/Coroutines/coro-split-dbg.ll b/llvm/test/Transforms/Coroutines/coro-split-dbg.ll index 02bd2b2d0d65f..c53bea899ee51 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-dbg.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-dbg.ll @@ -38,7 +38,7 @@ coro_Cleanup: ; preds = %for.cond br label %coro_Suspend, !dbg !36 coro_Suspend: ; preds = %for.cond, %if.then, %coro_Cleanup - tail call i1 @llvm.coro.end(ptr null, i1 false, token none) #9, !dbg !38 + 
tail call void @llvm.coro.end(ptr null, i1 false, token none) #9, !dbg !38 ret ptr %2, !dbg !39 } @@ -57,7 +57,7 @@ declare i8 @llvm.coro.suspend(token, i1) #7 declare void @llvm.lifetime.end.p0(ptr nocapture) #4 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #5 declare void @free(ptr nocapture) local_unnamed_addr #6 -declare i1 @llvm.coro.end(ptr, i1, token) #7 +declare void @llvm.coro.end(ptr, i1, token) #7 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #5 declare void @llvm.dbg.value(metadata, metadata, metadata) #1 diff --git a/llvm/test/Transforms/Coroutines/coro-split-eh-00.ll b/llvm/test/Transforms/Coroutines/coro-split-eh-00.ll index d7d60bb2bfa22..0695071306d8d 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-eh-00.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-eh-00.ll @@ -17,7 +17,7 @@ resume: invoke void @print(i32 1) to label %suspend unwind label %lpad suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) call void @print(i32 0) ; should not be present in f.resume ret ptr %hdl @@ -26,8 +26,9 @@ lpad: cleanup call void @print(i32 2) - %need.resume = call i1 @llvm.coro.end(ptr null, i1 true, token none) - br i1 %need.resume, label %eh.resume, label %cleanup.cont + call void @llvm.coro.end(ptr null, i1 true, token none) + %in.ramp = call i1 @llvm.coro.is_in_ramp() + br i1 %in.ramp, label %cleanup.cont, label %eh.resume cleanup.cont: call void @print(i32 3) ; should not be present in f.resume @@ -80,7 +81,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare ptr @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @print(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-split-eh-01.ll b/llvm/test/Transforms/Coroutines/coro-split-eh-01.ll index b25c4b9f5a700..093fd85b80cdd 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-eh-01.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-eh-01.ll @@ -17,14 +17,14 @@ resume: invoke void @print(i32 1) to label %suspend unwind label %lpad suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) call void @print(i32 0) ; should not be present in f.resume ret ptr %hdl lpad: %tok = cleanuppad within none [] call void @print(i32 2) - %unused = call i1 @llvm.coro.end(ptr null, i1 true, token none) [ "funclet"(token %tok) ] + call void @llvm.coro.end(ptr null, i1 true, token none) [ "funclet"(token %tok) ] cleanupret from %tok unwind label %cleanup.cont cleanup.cont: @@ -74,7 +74,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare ptr @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @print(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-split-final-suspend.ll b/llvm/test/Transforms/Coroutines/coro-split-final-suspend.ll index fbefd43f73c36..b620b2d7fa4be 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-final-suspend.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-final-suspend.ll @@ -28,7 +28,7 @@ resume: invoke void @print(i32 1) to label %suspend unwind label %lpad suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, 
token none) call void @print(i32 0) ret ptr %hdl @@ -37,8 +37,9 @@ lpad: cleanup call void @print(i32 2) - %need.resume = call i1 @llvm.coro.end(ptr null, i1 true, token none) - br i1 %need.resume, label %eh.resume, label %cleanup.cont + call void @llvm.coro.end(ptr null, i1 true, token none) + %in.ramp = call i1 @llvm.coro.is_in_ramp() + br i1 %in.ramp, label %cleanup.cont, label %eh.resume cleanup.cont: call void @print(i32 3) @@ -97,7 +98,7 @@ resume: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) call void @print(i32 0) ret ptr %hdl } @@ -122,7 +123,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare ptr @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @print(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-split-hidden.ll b/llvm/test/Transforms/Coroutines/coro-split-hidden.ll index fa4f0ab13bebc..2c1bf35c2fafc 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-hidden.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-hidden.ll @@ -30,7 +30,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -74,7 +74,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) allockind("alloc,uninitialized") declare void @print(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail-chain-pgo-counter-promo.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail-chain-pgo-counter-promo.ll index e661932bf020e..70f15f6129d8e 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail-chain-pgo-counter-promo.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail-chain-pgo-counter-promo.ll @@ -24,7 +24,7 @@ declare void @llvm.assume(i1 noundef) declare i64 @llvm.coro.align.i64() declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr writeonly) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @llvm.coro.free(token, ptr nocapture readonly) declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) declare token @llvm.coro.save(ptr) @@ -162,7 +162,7 @@ define ptr @f(i32 %0) presplitcoroutine align 32 { 61: ; preds = %60, %57, %54, %47, %12 %62 = getelementptr inbounds i8, ptr %3, i64 -16 - %63 = call i1 @llvm.coro.end(ptr null, i1 false, token none) #28 + call void @llvm.coro.end(ptr null, i1 false, token none) #28 ret ptr %62 } diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail-ppc64le.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail-ppc64le.ll index e8596b78460a5..cb3a12952d7e5 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail-ppc64le.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail-ppc64le.ll @@ -36,7 +36,7 @@ await.ready: i8 1, label %exit ] exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -58,7 +58,7 @@ declare token @llvm.coro.save(ptr) #2 declare ptr @llvm.coro.frame() #3 declare i8 
@llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail.ll index 70f29f4a9a4dc..d224d17fb52ea 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail.ll @@ -27,7 +27,7 @@ await.ready: i8 1, label %exit ] exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -50,7 +50,7 @@ declare token @llvm.coro.save(ptr) #2 declare ptr @llvm.coro.frame() #3 declare i8 @llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll index 3edb8728d8550..4228a9db64866 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail1.ll @@ -45,7 +45,7 @@ final.suspend: pre.exit: br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void unreach: unreachable @@ -83,7 +83,7 @@ declare token @llvm.coro.save(ptr) #2 declare ptr @llvm.coro.frame() #3 declare i8 @llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) declare i8 @switch_result() diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll index a55b3d16e2ded..7bf0d72facb28 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail10.ll @@ -33,7 +33,7 @@ await.ready: i8 1, label %exit ] exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -47,7 +47,7 @@ declare token @llvm.coro.save(ptr) #2 declare ptr @llvm.coro.frame() #3 declare i8 @llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail12.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail12.ll index 5baec378876bb..c818d1c2b144c 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail12.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail12.ll @@ -56,7 +56,7 @@ coro.free: br label %coro.end coro.end: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -73,7 +73,7 @@ declare token 
@llvm.coro.save(ptr) #2 declare ptr @llvm.coro.frame() #3 declare i8 @llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) declare void @delete(ptr nonnull) #2 diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail13.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail13.ll index 0290e42339e2a..c726810e25d13 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail13.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail13.ll @@ -39,7 +39,7 @@ lpad: %lpval = landingpad { ptr, i32 } cleanup - %need.resume = call i1 @llvm.coro.end(ptr null, i1 true, token none) + call void @llvm.coro.end(ptr null, i1 true, token none) resume { ptr, i32 } %lpval coro.free: @@ -47,7 +47,7 @@ coro.free: br label %coro.end coro.end: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -64,7 +64,7 @@ declare token @llvm.coro.save(ptr) #2 declare ptr @llvm.coro.frame() #3 declare i8 @llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) declare void @delete(ptr nonnull) #2 diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll index ca1611e19b9f9..04d2352107041 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail2.ll @@ -31,7 +31,7 @@ await.ready: i8 1, label %exit ] exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -54,7 +54,7 @@ declare token @llvm.coro.save(ptr) #2 declare ptr @llvm.coro.frame() #3 declare i8 @llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) declare ptr @await_suspend_function(ptr %awaiter, ptr %hdl) diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll index 84cdac17beebb..558e38b3919fb 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail3.ll @@ -41,7 +41,7 @@ final.suspend: pre.exit: br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void unreach: unreachable @@ -78,7 +78,7 @@ declare token @llvm.coro.save(ptr) #2 declare ptr @llvm.coro.frame() #3 declare i8 @llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) declare i8 @switch_result() diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll index b647bd2e4a207..97cd6fb4375f1 100644 --- 
a/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail4.ll @@ -38,7 +38,7 @@ coro.free: br label %coro.end coro.end: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -54,7 +54,7 @@ declare token @llvm.coro.save(ptr) #2 declare ptr @llvm.coro.frame() #3 declare i8 @llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) declare void @delete(ptr nonnull) #2 diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll index b2561751e6377..9a2697efa1f2b 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail5.ll @@ -32,7 +32,7 @@ await.ready: call void @llvm.lifetime.end.p0(ptr %alloc.var) br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -49,7 +49,7 @@ declare token @llvm.coro.save(ptr) #2 declare ptr @llvm.coro.frame() #3 declare i8 @llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) declare void @consume(ptr) diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll index 99174ff283120..36ae55e9e69e3 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail6.ll @@ -36,7 +36,7 @@ await.ready: call void @llvm.lifetime.end.p0(ptr %alloc.var) br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -84,7 +84,7 @@ coro.free: br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -101,7 +101,7 @@ declare token @llvm.coro.save(ptr) #2 declare ptr @llvm.coro.frame() #3 declare i8 @llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) declare void @delete(ptr nonnull) #2 diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll index 91f8543dffe93..8b67ccb8b8718 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail7.ll @@ -40,7 +40,7 @@ await.ready: br label %exit exit: %result = phi i64 [0, %entry], [0, %entry], [%foo, %await.suspend], [%foo, %await.suspend], [%foo, %await.ready] - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret i64 %result } @@ -90,7 +90,7 @@ coro.free: br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -109,7 +109,7 @@ 
declare token @llvm.coro.save(ptr) #2 declare ptr @llvm.coro.frame() #3 declare i8 @llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) declare void @delete(ptr nonnull) #2 diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail8.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail8.ll index 31b18d746be5f..5eeaf9db83118 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail8.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail8.ll @@ -32,7 +32,7 @@ await.ready: i8 1, label %exit ] exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -46,7 +46,7 @@ declare token @llvm.coro.save(ptr) #2 declare ptr @llvm.coro.frame() #3 declare i8 @llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) declare void @print() diff --git a/llvm/test/Transforms/Coroutines/coro-split-musttail9.ll b/llvm/test/Transforms/Coroutines/coro-split-musttail9.ll index 76376dbbbe3d8..2906877df924b 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-musttail9.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-musttail9.ll @@ -32,7 +32,7 @@ await.ready: i8 1, label %exit ] exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -46,7 +46,7 @@ declare token @llvm.coro.save(ptr) #2 declare ptr @llvm.coro.frame() #3 declare i8 @llvm.coro.suspend(token, i1) #2 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #1 -declare i1 @llvm.coro.end(ptr, i1, token) #2 +declare void @llvm.coro.end(ptr, i1, token) #2 declare ptr @llvm.coro.subfn.addr(ptr nocapture readonly, i8) #1 declare ptr @malloc(i64) declare void @print() diff --git a/llvm/test/Transforms/Coroutines/coro-split-no-lifetime.ll b/llvm/test/Transforms/Coroutines/coro-split-no-lifetime.ll index 12d65647c8b01..8c081f324e0cc 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-no-lifetime.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-no-lifetime.ll @@ -37,7 +37,7 @@ cleanup: br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -52,7 +52,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @llvm.lifetime.start.p0(ptr nocapture) declare void @llvm.lifetime.end.p0(ptr nocapture) diff --git a/llvm/test/Transforms/Coroutines/coro-split-noinline.ll b/llvm/test/Transforms/Coroutines/coro-split-noinline.ll index c53771570a079..498bb4745d43c 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-noinline.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-noinline.ll @@ -29,7 +29,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -53,7 +53,7 @@ declare void @llvm.coro.destroy(ptr) 
declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) allockind("alloc,uninitialized") "alloc-family"="malloc" declare void @print(i32) diff --git a/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-01.ll b/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-01.ll index a5a2bcf2ecb81..848cf8b3e461f 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-01.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-01.ll @@ -37,7 +37,7 @@ await.ready: call void @print(i32 %val) br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -76,7 +76,7 @@ await.ready: call void @print(i32 %val) br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -91,6 +91,6 @@ declare ptr @llvm.coro.frame() #5 declare i8 @llvm.coro.suspend(token, i1) #3 declare void @"\01??3@YAXPEAX@Z"(ptr) local_unnamed_addr #10 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2 -declare i1 @llvm.coro.end(ptr, i1, token) #3 +declare void @llvm.coro.end(ptr, i1, token) #3 declare void @llvm.lifetime.start.p0(ptr nocapture) #4 declare void @llvm.lifetime.end.p0(ptr nocapture) #4 diff --git a/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-02.ll b/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-02.ll index abc91c3b11c6b..26037043a26ed 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-02.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-02.ll @@ -48,7 +48,7 @@ after.await: br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -68,6 +68,6 @@ declare ptr @llvm.coro.frame() #5 declare i8 @llvm.coro.suspend(token, i1) #3 declare void @"\01??3@YAXPEAX@Z"(ptr) local_unnamed_addr #10 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2 -declare i1 @llvm.coro.end(ptr, i1, token) #3 +declare void @llvm.coro.end(ptr, i1, token) #3 declare void @llvm.lifetime.start.p0(ptr nocapture) #4 declare void @llvm.lifetime.end.p0(ptr nocapture) #4 diff --git a/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-03.ll b/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-03.ll index efd1adfc54b53..26c4c72ef0726 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-03.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-03.ll @@ -36,7 +36,7 @@ await.ready: call void @print(i32 %val) br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } ; CHECK-LABEL: @a.gep.resume( @@ -59,6 +59,6 @@ declare ptr @llvm.coro.frame() #5 declare i8 @llvm.coro.suspend(token, i1) #3 declare void @"\01??3@YAXPEAX@Z"(ptr) local_unnamed_addr #10 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2 -declare i1 @llvm.coro.end(ptr, i1, token) #3 +declare void @llvm.coro.end(ptr, i1, token) #3 declare void @llvm.lifetime.start.p0(ptr nocapture) #4 declare void @llvm.lifetime.end.p0(ptr nocapture) #4 diff --git a/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-04.ll b/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-04.ll index 
af5aa8ade0b65..be4bf4c14737c 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-04.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-sink-lifetime-04.ll @@ -35,7 +35,7 @@ await.ready: call void @print(i32 %val) br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -61,6 +61,6 @@ declare ptr @llvm.coro.frame() #5 declare i8 @llvm.coro.suspend(token, i1) #3 declare void @"\01??3@YAXPEAX@Z"(ptr) local_unnamed_addr #10 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2 -declare i1 @llvm.coro.end(ptr, i1, token) #3 +declare void @llvm.coro.end(ptr, i1, token) #3 declare void @llvm.lifetime.start.p0(ptr nocapture) #4 declare void @llvm.lifetime.end.p0(ptr nocapture) #4 diff --git a/llvm/test/Transforms/Coroutines/coro-swifterror.ll b/llvm/test/Transforms/Coroutines/coro-swifterror.ll index 899be4a010326..76a4816219ffd 100644 --- a/llvm/test/Transforms/Coroutines/coro-swifterror.ll +++ b/llvm/test/Transforms/Coroutines/coro-swifterror.ll @@ -34,7 +34,7 @@ resume: br label %loop cleanup: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -76,7 +76,7 @@ resume: br label %loop cleanup: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) unreachable } @@ -86,7 +86,7 @@ declare token @llvm.coro.id.retcon(i32, i32, ptr, ptr, ptr, ptr) declare ptr @llvm.coro.begin(token, ptr) declare { i1, ptr } @llvm.coro.suspend.retcon.i1p0p0i8(...) declare i1 @llvm.coro.suspend.retcon.i1(...) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @llvm.coro.prepare.retcon(ptr) declare ptr @f_prototype(ptr, i1 zeroext, ptr swifterror) diff --git a/llvm/test/Transforms/Coroutines/coro-zero-alloca.ll b/llvm/test/Transforms/Coroutines/coro-zero-alloca.ll index e3f09ba29cbf7..d1d826c7f1009 100644 --- a/llvm/test/Transforms/Coroutines/coro-zero-alloca.ll +++ b/llvm/test/Transforms/Coroutines/coro-zero-alloca.ll @@ -9,7 +9,7 @@ declare token @llvm.coro.id(i32, ptr readnone, ptr nocapture readonly, ptr) declare i64 @llvm.coro.size.i64() declare ptr @llvm.coro.begin(token, ptr writeonly) declare i8 @llvm.coro.suspend(token, i1) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @llvm.coro.free(token, ptr nocapture readonly) declare token @llvm.coro.save(ptr) @@ -42,7 +42,7 @@ wakeup: ; preds = %entry br label %cleanup suspend: ; preds = %cleanup, %entry - %unused = call i1 @llvm.coro.end(ptr %coro.state, i1 false, token none) + call void @llvm.coro.end(ptr %coro.state, i1 false, token none) ret void cleanup: ; preds = %wakeup, %entry diff --git a/llvm/test/Transforms/Coroutines/ex0.ll b/llvm/test/Transforms/Coroutines/ex0.ll index 9809488c85b37..420379ed5620a 100644 --- a/llvm/test/Transforms/Coroutines/ex0.ll +++ b/llvm/test/Transforms/Coroutines/ex0.ll @@ -24,7 +24,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -52,7 +52,7 @@ declare void @llvm.coro.resume(ptr) declare void @llvm.coro.destroy(ptr) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @print(i32) diff --git a/llvm/test/Transforms/Coroutines/ex1.ll 
b/llvm/test/Transforms/Coroutines/ex1.ll index 2db5ef6067cad..0a9b15641f2e6 100644 --- a/llvm/test/Transforms/Coroutines/ex1.ll +++ b/llvm/test/Transforms/Coroutines/ex1.ll @@ -20,7 +20,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret ptr %hdl } @@ -48,7 +48,7 @@ declare i32 @llvm.coro.size.i32() declare ptr @llvm.coro.begin(token, ptr) declare i8 @llvm.coro.suspend(token, i1) declare ptr @llvm.coro.free(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @llvm.coro.resume(ptr) declare void @llvm.coro.destroy(ptr) diff --git a/llvm/test/Transforms/Coroutines/ex2.ll b/llvm/test/Transforms/Coroutines/ex2.ll index d9999d46b38cf..fb4eeb5ee8bbc 100644 --- a/llvm/test/Transforms/Coroutines/ex2.ll +++ b/llvm/test/Transforms/Coroutines/ex2.ll @@ -29,7 +29,7 @@ dyn.free: call void @CustomFree(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret ptr %hdl } @@ -63,7 +63,7 @@ declare i32 @llvm.coro.size.i32() declare ptr @llvm.coro.begin(token, ptr) declare i8 @llvm.coro.suspend(token, i1) declare ptr @llvm.coro.free(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @llvm.coro.resume(ptr) declare void @llvm.coro.destroy(ptr) diff --git a/llvm/test/Transforms/Coroutines/ex3.ll b/llvm/test/Transforms/Coroutines/ex3.ll index e7fbc97d8f14f..3b3c579625df8 100644 --- a/llvm/test/Transforms/Coroutines/ex3.ll +++ b/llvm/test/Transforms/Coroutines/ex3.ll @@ -32,7 +32,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret ptr %hdl } @@ -67,7 +67,7 @@ declare i32 @llvm.coro.size.i32() declare ptr @llvm.coro.begin(token, ptr) declare i8 @llvm.coro.suspend(token, i1) declare ptr @llvm.coro.free(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @llvm.coro.resume(ptr) declare void @llvm.coro.destroy(ptr) diff --git a/llvm/test/Transforms/Coroutines/ex4.ll b/llvm/test/Transforms/Coroutines/ex4.ll index 7c7a869e4a500..fa7b64b1379f6 100644 --- a/llvm/test/Transforms/Coroutines/ex4.ll +++ b/llvm/test/Transforms/Coroutines/ex4.ll @@ -27,7 +27,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret ptr %hdl } @@ -63,7 +63,7 @@ declare i32 @llvm.coro.size.i32() declare ptr @llvm.coro.begin(token, ptr) declare i8 @llvm.coro.suspend(token, i1) declare ptr @llvm.coro.free(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare void @llvm.coro.resume(ptr) declare void @llvm.coro.destroy(ptr) diff --git a/llvm/test/Transforms/Coroutines/ex5.ll b/llvm/test/Transforms/Coroutines/ex5.ll index bf5cbec266c91..3640b83ceb28b 100644 --- a/llvm/test/Transforms/Coroutines/ex5.ll +++ b/llvm/test/Transforms/Coroutines/ex5.ll @@ -31,7 +31,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret ptr %hdl } @@ -46,7 +46,7 @@ declare ptr @llvm.coro.begin(token, ptr) declare token 
@llvm.coro.save(ptr) declare i8 @llvm.coro.suspend(token, i1) declare ptr @llvm.coro.free(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) ; CHECK-LABEL: @main define i32 @main() { diff --git a/llvm/test/Transforms/Coroutines/no-suspend.ll b/llvm/test/Transforms/Coroutines/no-suspend.ll index fd8c5ac990958..c08423d6053fc 100644 --- a/llvm/test/Transforms/Coroutines/no-suspend.ll +++ b/llvm/test/Transforms/Coroutines/no-suspend.ll @@ -32,7 +32,7 @@ dyn.free: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret void } @@ -81,7 +81,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret void } @@ -129,7 +129,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret void lpad: %lpval = landingpad { ptr, i32 } @@ -190,7 +190,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret void } @@ -244,7 +244,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret void } @@ -291,7 +291,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret void lpad: %lpval = landingpad { ptr, i32 } @@ -343,7 +343,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret void } @@ -388,7 +388,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) ret void lpad: %lpval = landingpad { ptr, i32 } @@ -410,7 +410,7 @@ declare ptr @llvm.coro.begin(token, ptr) declare token @llvm.coro.save(ptr %hdl) declare i8 @llvm.coro.suspend(token, i1) declare ptr @llvm.coro.free(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @llvm.coro.subfn.addr(ptr, i8) diff --git a/llvm/test/Transforms/Coroutines/phi-coro-end.ll b/llvm/test/Transforms/Coroutines/phi-coro-end.ll index aab76faed3f1a..adfcba01d6119 100644 --- a/llvm/test/Transforms/Coroutines/phi-coro-end.ll +++ b/llvm/test/Transforms/Coroutines/phi-coro-end.ll @@ -17,7 +17,7 @@ cleanup: suspend: %r = phi i32 [%n, %entry], [1, %cleanup] - call i1 @llvm.coro.end(ptr %hdl, i1 false, token none) + call void @llvm.coro.end(ptr %hdl, i1 false, token none) call void @print(i32 %r) ret ptr %hdl } @@ -41,7 +41,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @print(i32) diff --git a/llvm/test/Transforms/Coroutines/remarks.ll b/llvm/test/Transforms/Coroutines/remarks.ll index 5eaddbfc3d107..418a75cde49c9 100644 --- a/llvm/test/Transforms/Coroutines/remarks.ll 
+++ b/llvm/test/Transforms/Coroutines/remarks.ll @@ -33,7 +33,7 @@ cleanup: call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0, token none) + call void @llvm.coro.end(ptr %hdl, i1 0, token none) ret ptr %hdl } @@ -60,7 +60,7 @@ declare void @llvm.coro.destroy(ptr) declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare noalias ptr @malloc(i32) declare void @print(i32) diff --git a/llvm/test/Transforms/Coroutines/swift-async-dbg.ll b/llvm/test/Transforms/Coroutines/swift-async-dbg.ll index 5090274ea3ad4..00138c091890f 100644 --- a/llvm/test/Transforms/Coroutines/swift-async-dbg.ll +++ b/llvm/test/Transforms/Coroutines/swift-async-dbg.ll @@ -73,7 +73,7 @@ define swifttailcc void @coroutineA(ptr swiftasync %arg) !dbg !48 { %i33 = call { ptr } (i32, ptr, ptr, ...) @llvm.coro.suspend.async.sl_p0s(i32 0, ptr %i31, ptr nonnull @__swift_async_resume_get_context, ptr nonnull @coroutineA.1, ptr %i31, i64 0, i64 0, ptr %i29), !dbg !54 %i34 = extractvalue { ptr } %i33, 0, !dbg !54 %i35 = call ptr @__swift_async_resume_get_context(ptr %i34), !dbg !54 - %i45 = call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %i3, i1 false, ptr nonnull @coroutineA.0.1, ptr undef, ptr undef), !dbg !54 + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %i3, i1 false, ptr nonnull @coroutineA.0.1, ptr undef, ptr undef), !dbg !54 unreachable, !dbg !54 ; CHECK-NOT: define ; CHECK-LABEL: define {{.*}} @coroutineATY2_( @@ -116,7 +116,7 @@ define swifttailcc void @coroutineB(ptr swiftasync %arg) !dbg !37 { %i3 = call ptr @llvm.coro.begin(token %i2, ptr null) %i6 = getelementptr inbounds <{ ptr, ptr }>, ptr %arg, i64 0, i32 1, !dbg !42 %i712 = load ptr, ptr %i6, align 8, !dbg !42 - %i10 = call i1 (ptr, i1, ...) @llvm.coro.end.async(ptr %i3, i1 false, ptr nonnull @coroutineB.0, ptr %i712, ptr %arg), !dbg !42 + call void (ptr, i1, ...) @llvm.coro.end.async(ptr %i3, i1 false, ptr nonnull @coroutineB.0, ptr %i712, ptr %arg), !dbg !42 unreachable, !dbg !42 } define hidden swifttailcc void @coroutineB.0(ptr %arg, ptr %arg1) !dbg !44 { @@ -124,7 +124,7 @@ define hidden swifttailcc void @coroutineB.0(ptr %arg, ptr %arg1) !dbg !44 { ret void, !dbg !47 } -declare i1 @llvm.coro.end.async(ptr, i1, ...) +declare void @llvm.coro.end.async(ptr, i1, ...) 
declare ptr @llvm.coro.async.resume() declare ptr @llvm.coro.begin(token, ptr writeonly) declare ptr @llvm.swift.async.context.addr() diff --git a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll index ea0d5d3fca8ff..8a6f60ba7a204 100644 --- a/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll +++ b/llvm/test/Transforms/DropUnnecessaryAssumes/basic.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -passes=drop-unnecessary-assumes < %s | FileCheck %s +declare void @use(i32 %x) +declare i32 @get() + define void @basic_dead(i32 %x) { ; CHECK-LABEL: define void @basic_dead( ; CHECK-SAME: i32 [[X:%.*]]) { @@ -63,18 +66,17 @@ define i32 @multiple_live2(i32 %x, i32 %y) { ret i32 %y } -define void @operand_bundle_dead(ptr %x) { -; CHECK-LABEL: define void @operand_bundle_dead( +define void @operand_bundle_one_dead(ptr %x) { +; CHECK-LABEL: define void @operand_bundle_one_dead( ; CHECK-SAME: ptr [[X:%.*]]) { -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[X]], i64 8) ] ; CHECK-NEXT: ret void ; call void @llvm.assume(i1 true) ["align"(ptr %x, i64 8)] ret void } -define ptr @operand_bundle_live(ptr %x) { -; CHECK-LABEL: define ptr @operand_bundle_live( +define ptr @operand_bundle_one_live(ptr %x) { +; CHECK-LABEL: define ptr @operand_bundle_one_live( ; CHECK-SAME: ptr [[X:%.*]]) { ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[X]], i64 8) ] ; CHECK-NEXT: ret ptr [[X]] @@ -83,6 +85,93 @@ define ptr @operand_bundle_live(ptr %x) { ret ptr %x } +define void @operand_bundle_multiple_dead(ptr %x, ptr %y) { +; CHECK-LABEL: define void @operand_bundle_multiple_dead( +; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) { +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 true) ["align"(ptr %x, i64 8), "align"(ptr %y, i64 8)] + ret void +} + +define ptr @operand_bundle_one_live_one_dead(ptr %x, ptr %y) { +; CHECK-LABEL: define ptr @operand_bundle_one_live_one_dead( +; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[Y]], i64 8) ] +; CHECK-NEXT: ret ptr [[Y]] +; + call void @llvm.assume(i1 true) ["align"(ptr %x, i64 8), "align"(ptr %y, i64 8)] + ret ptr %y +} + +define i64 @operand_bundle_ignore_unaffected_operands(ptr %x, i64 %align) { +; CHECK-LABEL: define i64 @operand_bundle_ignore_unaffected_operands( +; CHECK-SAME: ptr [[X:%.*]], i64 [[ALIGN:%.*]]) { +; CHECK-NEXT: ret i64 [[ALIGN]] +; + call void @llvm.assume(i1 true) ["align"(ptr %x, i64 %align)] + ret i64 %align +} + +define void @operand_bundle_remove_dead_insts(ptr %x) { +; CHECK-LABEL: define void @operand_bundle_remove_dead_insts( +; CHECK-SAME: ptr [[X:%.*]]) { +; CHECK-NEXT: ret void +; + %gep = getelementptr i8, ptr %x, i64 8 + call void @llvm.assume(i1 true) ["align"(ptr %gep, i64 8)] + ret void +} + +define void @operand_bundle_no_args() { +; CHECK-LABEL: define void @operand_bundle_no_args() { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "cold"() ] +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 true) ["cold"()] + ret void +} + +; Can always drop ignore bundles, regardless of uses. 
+define ptr @operand_bundle_ignore(ptr %x) { +; CHECK-LABEL: define ptr @operand_bundle_ignore( +; CHECK-SAME: ptr [[X:%.*]]) { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(ptr [[X]]) ] +; CHECK-NEXT: ret ptr [[X]] +; + call void @llvm.assume(i1 true) ["ignore"(), "ignore"(ptr %x), "nonnull"(ptr %x)] + ret ptr %x +} + +define void @operand_bundle_separate_storage_both_dead(ptr %x, ptr %y) { +; CHECK-LABEL: define void @operand_bundle_separate_storage_both_dead( +; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) { +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 true) ["separate_storage"(ptr %x, ptr %y)] + ret void +} + +define ptr @operand_bundle_separate_storage_one_live1(ptr %x, ptr %y) { +; CHECK-LABEL: define ptr @operand_bundle_separate_storage_one_live1( +; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "separate_storage"(ptr [[X]], ptr [[Y]]) ] +; CHECK-NEXT: ret ptr [[Y]] +; + call void @llvm.assume(i1 true) ["separate_storage"(ptr %x, ptr %y)] + ret ptr %y +} + +define ptr @operand_bundle_separate_storage_one_live2(ptr %x, ptr %y) { +; CHECK-LABEL: define ptr @operand_bundle_separate_storage_one_live2( +; CHECK-SAME: ptr [[X:%.*]], ptr [[Y:%.*]]) { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "separate_storage"(ptr [[X]], ptr [[Y]]) ] +; CHECK-NEXT: ret ptr [[X]] +; + call void @llvm.assume(i1 true) ["separate_storage"(ptr %x, ptr %y)] + ret ptr %x +} + define void @type_test(ptr %x) { ; CHECK-LABEL: define void @type_test( ; CHECK-SAME: ptr [[X:%.*]]) { @@ -94,3 +183,136 @@ define void @type_test(ptr %x) { call void @llvm.assume(i1 %test) ret void } + +define void @multiple_dead_conds(i32 %x) { +; CHECK-LABEL: define void @multiple_dead_conds( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: ret void +; + %cond1 = icmp sge i32 %x, 0 + call void @llvm.assume(i1 %cond1) + %cond2 = icmp ne i32 %x, 64 + call void @llvm.assume(i1 %cond2) + ret void +} + +define void @multiple_dead_bundles(ptr %x) { +; CHECK-LABEL: define void @multiple_dead_bundles( +; CHECK-SAME: ptr [[X:%.*]]) { +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 true) ["align"(ptr %x, i64 8), "nonnull"(ptr %x)] + ret void +} + +; The assume is eliminated, but currently leaves behind a dead cycle. 
+define void @dead_cycle(i1 %loop.cond) { +; CHECK-LABEL: define void @dead_cycle( +; CHECK-SAME: i1 [[LOOP_COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %cond = icmp ne i32 %iv, 64 + call void @llvm.assume(i1 %cond) + %iv.next = add i32 %iv, 1 + br i1 %loop.cond, label %loop, label %exit + +exit: + ret void +} + +define void @use_in_side_effect(i32 %x) { +; CHECK-LABEL: define void @use_in_side_effect( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[COND:%.*]] = icmp sge i32 [[X]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: call void @use(i32 [[X]]) +; CHECK-NEXT: ret void +; + %cond = icmp sge i32 %x, 0 + call void @llvm.assume(i1 %cond) + call void @use(i32 %x) + ret void +} + +define void @indirect_use_in_side_effect(i32 %x) { +; CHECK-LABEL: define void @indirect_use_in_side_effect( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[COND:%.*]] = icmp sge i32 [[X]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[X]], 1 +; CHECK-NEXT: call void @use(i32 [[ADD]]) +; CHECK-NEXT: ret void +; + %cond = icmp sge i32 %x, 0 + call void @llvm.assume(i1 %cond) + %add = add i32 %x, 1 + call void @use(i32 %add) + ret void +} + +; The affected value itself has a side effect, but we can still drop the +; assume. +define void @affected_value_has_side_effect() { +; CHECK-LABEL: define void @affected_value_has_side_effect() { +; CHECK-NEXT: [[X:%.*]] = call i32 @get() +; CHECK-NEXT: ret void +; + %x = call i32 @get() + %cond = icmp sge i32 %x, 0 + call void @llvm.assume(i1 %cond) + ret void +} + +define i32 @affected_value_has_side_effect_and_is_used() { +; CHECK-LABEL: define i32 @affected_value_has_side_effect_and_is_used() { +; CHECK-NEXT: [[X:%.*]] = call i32 @get() +; CHECK-NEXT: [[COND:%.*]] = icmp sge i32 [[X]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) +; CHECK-NEXT: ret i32 [[X]] +; + %x = call i32 @get() + %cond = icmp sge i32 %x, 0 + call void @llvm.assume(i1 %cond) + ret i32 %x +} + +@g = external global i8 +@g2 = external global i8 + +; Assumes on globals are currently not supported. 
+define void @assume_on_global() { +; CHECK-LABEL: define void @assume_on_global() { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr @g, i64 8) ] +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 true) ["align"(ptr @g, i64 8)] + ret void +} + +define void @assume_on_global_used_in_other_func() { +; CHECK-LABEL: define void @assume_on_global_used_in_other_func() { +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr @g2, i64 8) ] +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 true) ["align"(ptr @g2, i64 8)] + ret void +} + +define ptr @other_func() { +; CHECK-LABEL: define ptr @other_func() { +; CHECK-NEXT: ret ptr @g2 +; + ret ptr @g2 +} diff --git a/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll b/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll index f09a51c48a52f..922413a13cdf8 100644 --- a/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll +++ b/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @test_make_buffer_rsrc(ptr %p, ptr %q) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) ; FNATTRS-LABEL: define {{[^@]+}}@test_make_buffer_rsrc ; FNATTRS-SAME: (ptr readonly captures(none) [[P:%.*]], ptr writeonly captures(none) [[Q:%.*]]) #[[ATTR0:[0-9]+]] { -; FNATTRS-NEXT: [[P_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P]], i16 0, i32 4, i32 822243328) -; FNATTRS-NEXT: [[Q_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[Q]], i16 0, i32 4, i32 822243328) +; FNATTRS-NEXT: [[P_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P]], i16 0, i64 4, i32 822243328) +; FNATTRS-NEXT: [[Q_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[Q]], i16 0, i64 4, i32 822243328) ; FNATTRS-NEXT: [[V:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) [[P_RSRC]], i32 0, i32 0, i32 0) ; FNATTRS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[V]], ptr addrspace(8) [[Q_RSRC]], i32 0, i32 0, i32 0) ; FNATTRS-NEXT: ret void @@ -18,21 +18,21 @@ define amdgpu_kernel void @test_make_buffer_rsrc(ptr %p, ptr %q) { ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) ; ATTRIBUTOR-LABEL: define {{[^@]+}}@test_make_buffer_rsrc ; ATTRIBUTOR-SAME: (ptr nofree readonly captures(none) [[P:%.*]], ptr nofree writeonly captures(none) [[Q:%.*]]) #[[ATTR0:[0-9]+]] { -; ATTRIBUTOR-NEXT: [[P_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P]], i16 0, i32 4, i32 822243328) #[[ATTR4:[0-9]+]] -; ATTRIBUTOR-NEXT: [[Q_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[Q]], i16 0, i32 4, i32 822243328) #[[ATTR4]] +; ATTRIBUTOR-NEXT: [[P_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P]], i16 0, i64 4, i32 822243328) #[[ATTR4:[0-9]+]] +; ATTRIBUTOR-NEXT: [[Q_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[Q]], i16 0, i64 4, i32 822243328) #[[ATTR4]] ; ATTRIBUTOR-NEXT: [[V:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) readonly captures(none) [[P_RSRC]], i32 0, i32 0, i32 0) #[[ATTR5:[0-9]+]] ; ATTRIBUTOR-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[V]], ptr addrspace(8) writeonly captures(none) [[Q_RSRC]], i32 0, i32 0, i32 0) #[[ATTR6:[0-9]+]] ; ATTRIBUTOR-NEXT: ret void ; - %p.rsrc = call ptr addrspace(8) 
@llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 0, i32 4, i32 822243328) - %q.rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %q, i16 0, i32 4, i32 822243328) + %p.rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 0, i64 4, i32 822243328) + %q.rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %q, i16 0, i64 4, i32 822243328) %v = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) %p.rsrc, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 %v, ptr addrspace(8) %q.rsrc, i32 0, i32 0, i32 0) ret void } ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr readnone, i16, i32, i32) #0 +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr readnone, i16, i64, i32) #0 ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) declare i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #1 diff --git a/llvm/test/Transforms/FunctionAttrs/noreturn.ll b/llvm/test/Transforms/FunctionAttrs/noreturn.ll index fa80f6c2eced4..ae2ccb02733f3 100644 --- a/llvm/test/Transforms/FunctionAttrs/noreturn.ll +++ b/llvm/test/Transforms/FunctionAttrs/noreturn.ll @@ -81,9 +81,9 @@ define void @unreachable() { ; CHECK: @coro define void @coro() presplitcoroutine { call token @llvm.coro.id.retcon.once(i32 0, i32 0, ptr null, ptr @coro, ptr null, ptr null) - call i1 (ptr, i1, ...) @llvm.coro.end(ptr null, i1 false) + call void (ptr, i1, ...) @llvm.coro.end(ptr null, i1 false) unreachable } declare token @llvm.coro.id.retcon.once(i32 %size, i32 %align, ptr %buffer, ptr %prototype, ptr %alloc, ptr %free) -declare i1 @llvm.coro.end(ptr, i1, ...) +declare void @llvm.coro.end(ptr, i1, ...) diff --git a/llvm/test/Transforms/FunctionSpecialization/profile-counts.ll b/llvm/test/Transforms/FunctionSpecialization/profile-counts.ll index bdf7690a71b69..4d26247ad09cb 100644 --- a/llvm/test/Transforms/FunctionSpecialization/profile-counts.ll +++ b/llvm/test/Transforms/FunctionSpecialization/profile-counts.ll @@ -1,20 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 ; RUN: opt -passes="ipsccp" -force-specialization -S < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @A = external dso_local constant i32, align 4 @B = external dso_local constant i32, align 4 -; CHECK: define dso_local i32 @bar(i32 %x, i32 %y, ptr %z) !prof ![[BAR_PROF:[0-9]]] { +;. +; CHECK: @A = external dso_local constant i32, align 4 +; CHECK: @B = external dso_local constant i32, align 4 +;. 
define dso_local i32 @bar(i32 %x, i32 %y, ptr %z) !prof !0 { +; CHECK-LABEL: define dso_local i32 @bar( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr [[Z:%.*]]) !prof [[PROF0:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 +; CHECK-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @foo.specialized.1(i32 [[X]], ptr @A) +; CHECK-NEXT: [[CALL1:%.*]] = call i32 @foo.specialized.2(i32 [[Y]], ptr @B) +; CHECK-NEXT: [[CALL2:%.*]] = call i32 @foo.specialized.2(i32 [[Y]], ptr @B) +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[CALL]], [[CALL1]] +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[ADD]], [[CALL2]] +; CHECK-NEXT: br label %[[RETURN:.*]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: [[CALL3:%.*]] = call i32 @foo.specialized.2(i32 [[Y]], ptr @B) +; CHECK-NEXT: br label %[[RETURN]] +; CHECK: [[RETURN]]: +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ [[ADD1]], %[[IF_THEN]] ], [ [[CALL3]], %[[IF_ELSE]] ] +; CHECK-NEXT: [[CALL4:%.*]] = call i32 @foo(i32 [[X]], ptr [[Z]]) +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[RETVAL_0]], [[CALL4]] +; CHECK-NEXT: ret i32 [[ADD2]] +; entry: %tobool = icmp ne i32 %x, 0 -; CHECK: br i1 %tobool, label %if.then, label %if.else, !prof ![[BRANCH_PROF:[0-9]]] br i1 %tobool, label %if.then, label %if.else, !prof !1 -; CHECK: if.then: -; CHECK: call i32 @foo.specialized.1(i32 %x, ptr @A) -; CHECK: call i32 @foo.specialized.2(i32 %y, ptr @B) -; CHECK: call i32 @foo.specialized.2(i32 %y, ptr @B) if.then: %call = call i32 @foo(i32 %x, ptr @A) %call1 = call i32 @foo(i32 %y, ptr @B) @@ -23,14 +43,10 @@ if.then: %add1 = add i32 %add, %call2 br label %return -; CHECK: if.else: -; CHECK: call i32 @foo.specialized.2(i32 %y, ptr @B) if.else: %call3 = call i32 @foo(i32 %y, ptr @B) br label %return -; CHECK: return: -; CHECK: call i32 @foo(i32 %x, ptr %z) return: %retval.0 = phi i32 [ %add1, %if.then ], [ %call3, %if.else ] %call4 = call i32 @foo(i32 %x, ptr %z); @@ -38,21 +54,62 @@ return: ret i32 %add2 } -; CHECK: define internal i32 @foo(i32 %x, ptr %b) !prof ![[FOO_UNSPEC_PROF:[0-9]]] -; CHECK: define internal i32 @foo.specialized.1(i32 %x, ptr %b) !prof ![[FOO_SPEC_1_PROF:[0-9]]] -; CHECK: define internal i32 @foo.specialized.2(i32 %x, ptr %b) !prof ![[FOO_SPEC_2_PROF:[0-9]]] -define internal i32 @foo(i32 %x, ptr %b) !prof !2 { +define dso_local i32 @baz(i32 %x, i32 %y) !prof !2 { +; CHECK-LABEL: define dso_local i32 @baz( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) !prof [[PROF2:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[X]], 0 +; CHECK-NEXT: br i1 [[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_ELSE:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[CALL:%.*]] = call i32 @foo.specialized.1(i32 [[X]], ptr @A) +; CHECK-NEXT: br label %[[RETURN:.*]] +; CHECK: [[IF_ELSE]]: +; CHECK-NEXT: [[CALL1:%.*]] = call i32 @foo.specialized.2(i32 [[Y]], ptr @B) +; CHECK-NEXT: br label %[[RETURN]] +; CHECK: [[RETURN]]: +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ [[CALL]], %[[IF_THEN]] ], [ [[CALL1]], %[[IF_ELSE]] ] +; CHECK-NEXT: ret i32 [[RETVAL_0]] +; +entry: + %tobool = icmp ne i32 %x, 0 + br i1 %tobool, label %if.then, label %if.else + +if.then: + %call = call i32 @foo(i32 %x, ptr @A) + br label %return + +if.else: + %call1 = call i32 @foo(i32 %y, ptr @B) + br label %return + +return: + %retval.0 = phi i32 [ %call, %if.then ], [ %call1, %if.else ] + ret i32 %retval.0 +} + +define internal i32 @foo(i32 %x, ptr %b) !prof !3 { +; 
CHECK-LABEL: define internal i32 @foo( +; CHECK-SAME: i32 [[X:%.*]], ptr [[B:%.*]]) !prof [[PROF3:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[X]], [[TMP0]] +; CHECK-NEXT: ret i32 [[ADD]] +; entry: %0 = load i32, ptr %b, align 4 %add = add nsw i32 %x, %0 ret i32 %add } -; CHECK: ![[BAR_PROF]] = !{!"function_entry_count", i64 1000} -; CHECK: ![[BRANCH_PROF]] = !{!"branch_weights", i32 1, i32 3} -; CHECK: ![[FOO_UNSPEC_PROF]] = !{!"function_entry_count", i64 500} -; CHECK: ![[FOO_SPEC_1_PROF]] = !{!"function_entry_count", i64 250} -; CHECK: ![[FOO_SPEC_2_PROF]] = !{!"function_entry_count", i64 1250} !0 = !{!"function_entry_count", i64 1000} !1 = !{!"branch_weights", i32 1, i32 3} -!2 = !{!"function_entry_count", i64 2000} +!2 = !{!"function_entry_count", i64 0} +!3 = !{!"function_entry_count", i64 2000} +;. +; CHECK: [[PROF0]] = !{!"function_entry_count", i64 1000} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 3} +; CHECK: [[PROF2]] = !{!"function_entry_count", i64 0} +; CHECK: [[PROF3]] = !{!"function_entry_count", i64 500} +; CHECK: [[META4:![0-9]+]] = !{!"function_entry_count", i64 250} +; CHECK: [[META5:![0-9]+]] = !{!"function_entry_count", i64 1250} +;. diff --git a/llvm/test/Transforms/IROutliner/outlining-special-state.ll b/llvm/test/Transforms/IROutliner/outlining-special-state.ll new file mode 100644 index 0000000000000..9ceec51895351 --- /dev/null +++ b/llvm/test/Transforms/IROutliner/outlining-special-state.ll @@ -0,0 +1,163 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: opt -S -passes=verify,iroutliner -ir-outlining-no-cost < %s | FileCheck %s + +declare void @foo(); + +define void @atomicrmw_base(ptr %p) { +entry: + %1 = atomicrmw add ptr %p, i32 1 acquire, align 8 + call void @foo() + ret void +} + +define void @atomicrmw_copy(ptr %p) { +entry: + %1 = atomicrmw add ptr %p, i32 1 acquire, align 8 + call void @foo() + ret void +} + +define void @atomicrmw_wrong_type(ptr %p) { +entry: + %1 = atomicrmw add ptr %p, i64 1 acquire, align 8 + call void @foo() + ret void +} + +define void @atomicrmw_wrong_align(ptr %p) { +entry: + %1 = atomicrmw add ptr %p, i32 1 acquire, align 4 + call void @foo() + ret void +} + +define void @atomicrmw_wrong_volatile(ptr %p) { +entry: + %1 = atomicrmw volatile add ptr %p, i32 1 acquire, align 8 + call void @foo() + ret void +} + +define void @cmpxchg_base(ptr %p) { +entry: + %1 = cmpxchg ptr %p, i32 0, i32 1 monotonic monotonic, align 8 + call void @foo() + ret void +} + +define void @cmpxchg_copy(ptr %p) { +entry: + %1 = cmpxchg ptr %p, i32 0, i32 1 monotonic monotonic, align 8 + call void @foo() + ret void +} + +define void @cmpxchg_wrong_type(ptr %p) { +entry: + %1 = cmpxchg ptr %p, i64 0, i64 1 monotonic monotonic, align 8 + call void @foo() + ret void +} + +define void @cmpxchg_wrong_align(ptr %p) { +entry: + %1 = cmpxchg ptr %p, i32 0, i32 1 monotonic monotonic, align 4 + call void @foo() + ret void +} + +define void @cmpxchg_wrong_volatile(ptr %p) { +entry: + %1 = cmpxchg volatile ptr %p, i32 0, i32 1 monotonic monotonic, align 8 + call void @foo() + ret void +} + + +; CHECK-LABEL: @atomicrmw_base( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @outlined_ir_func_1(ptr [[P:%.*]]) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @atomicrmw_copy( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @outlined_ir_func_1(ptr [[P:%.*]]) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: 
@atomicrmw_wrong_type( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = atomicrmw add ptr [[P:%.*]], i64 1 acquire, align 8 +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @atomicrmw_wrong_align( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = atomicrmw add ptr [[P:%.*]], i32 1 acquire, align 4 +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @atomicrmw_wrong_volatile( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = atomicrmw volatile add ptr [[P:%.*]], i32 1 acquire, align 8 +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @cmpxchg_base( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @outlined_ir_func_0(ptr [[P:%.*]]) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @cmpxchg_copy( +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @outlined_ir_func_0(ptr [[P:%.*]]) +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @cmpxchg_wrong_type( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[P:%.*]], i64 0, i64 1 monotonic monotonic, align 8 +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @cmpxchg_wrong_align( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg ptr [[P:%.*]], i32 0, i32 1 monotonic monotonic, align 4 +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @cmpxchg_wrong_volatile( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = cmpxchg volatile ptr [[P:%.*]], i32 0, i32 1 monotonic monotonic, align 8 +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @outlined_ir_func_0( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[TMP0:%.*]], i32 0, i32 1 monotonic monotonic, align 8 +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label [[ENTRY_AFTER_OUTLINE_EXITSTUB:%.*]] +; CHECK: entry_after_outline.exitStub: +; CHECK-NEXT: ret void +; +; +; CHECK-LABEL: @outlined_ir_func_1( +; CHECK-NEXT: newFuncRoot: +; CHECK-NEXT: br label [[ENTRY_TO_OUTLINE:%.*]] +; CHECK: entry_to_outline: +; CHECK-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[TMP0:%.*]], i32 1 acquire, align 8 +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label [[ENTRY_AFTER_OUTLINE_EXITSTUB:%.*]] +; CHECK: entry_after_outline.exitStub: +; CHECK-NEXT: ret void +; diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll index ebc5c0d717c6d..678d462b0c1b7 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll @@ -200,17 +200,17 @@ define amdgpu_kernel void @load_to_lds_fat_pointer_as_flat(ptr addrspace(7) %buf ret void } -define amdgpu_kernel void @make_buffer_rsrc_global_as_flat(ptr addrspace(1) %global, i32 %extent) { +define amdgpu_kernel void @make_buffer_rsrc_global_as_flat(ptr addrspace(1) %global, i64 %extent) { ;; NOTE: flags value not representative of real input ; CHECK-LABEL: define amdgpu_kernel void @make_buffer_rsrc_global_as_flat( -; CHECK-SAME: ptr addrspace(1) [[GLOBAL:%.*]], i32 [[EXTENT:%.*]]) { -; CHECK-NEXT: [[BUFFER_FAT_PTR:%.*]] = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[GLOBAL]], i16 0, i32 [[EXTENT]], i32 0) -; CHECK-NEXT: store i32 [[EXTENT]], ptr addrspace(7) [[BUFFER_FAT_PTR]], align 4 +; CHECK-SAME: ptr addrspace(1) [[GLOBAL:%.*]], i64 [[EXTENT:%.*]]) { +; CHECK-NEXT: [[BUFFER_FAT_PTR:%.*]] = call ptr 
addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) [[GLOBAL]], i16 0, i64 [[EXTENT]], i32 0) +; CHECK-NEXT: store i64 [[EXTENT]], ptr addrspace(7) [[BUFFER_FAT_PTR]], align 8 ; CHECK-NEXT: ret void ; %cast = addrspacecast ptr addrspace(1) %global to ptr - %buffer.fat.ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr %cast, i16 0, i32 %extent, i32 0) - store i32 %extent, ptr addrspace(7) %buffer.fat.ptr + %buffer.fat.ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr %cast, i16 0, i64 %extent, i32 0) + store i64 %extent, ptr addrspace(7) %buffer.fat.ptr ret void } @@ -221,7 +221,7 @@ declare void @llvm.memcpy.p0.p3.i32(ptr nocapture writeonly, ptr addrspace(3) no declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1) #1 declare void @llvm.amdgcn.load.to.lds.p0(ptr nocapture readonly, ptr addrspace(3) nocapture writeonly, i32 immarg, i32 immarg, i32 immarg) #1 -declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr readnone, i16, i32, i32) #0 +declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr readnone, i16, i64, i32) #0 attributes #0 = { nounwind } attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/InferAlignment/masked.ll b/llvm/test/Transforms/InferAlignment/masked.ll index 1b8d26417d75e..13acf9b50e7e8 100644 --- a/llvm/test/Transforms/InferAlignment/masked.ll +++ b/llvm/test/Transforms/InferAlignment/masked.ll @@ -29,6 +29,18 @@ entry: ret void } +define <2 x i32> @null(<2 x i1> %mask, <2 x i32> %val) { +; CHECK-LABEL: define <2 x i32> @null( +; CHECK-SAME: <2 x i1> [[MASK:%.*]], <2 x i32> [[VAL:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MASKED_LOAD:%.*]] = tail call <2 x i32> @llvm.masked.load.v2i32.p0(ptr null, i32 1, <2 x i1> [[MASK]], <2 x i32> [[VAL]]) +; CHECK-NEXT: ret <2 x i32> [[MASKED_LOAD]] +; +entry: + %masked_load = tail call <2 x i32> @llvm.masked.load.v2f64.p0(ptr null, i32 1, <2 x i1> %mask, <2 x i32> %val) + ret <2 x i32> %masked_load +} + declare void @llvm.assume(i1) declare <2 x i32> @llvm.masked.load.v2i32.p0(ptr, i32, <2 x i1>, <2 x i32>) declare void @llvm.masked.store.v2i32.p0(<2 x i32>, ptr, i32, <2 x i1>) diff --git a/llvm/test/Transforms/Inline/ML/state-tracking-coro.ll b/llvm/test/Transforms/Inline/ML/state-tracking-coro.ll index c3f6dd700b451..466499a07cb1b 100644 --- a/llvm/test/Transforms/Inline/ML/state-tracking-coro.ll +++ b/llvm/test/Transforms/Inline/ML/state-tracking-coro.ll @@ -38,7 +38,7 @@ await.ready: call void @print(i32 %val) br label %exit exit: - call i1 @llvm.coro.end(ptr null, i1 false) + call void @llvm.coro.end(ptr null, i1 false) ret void } @@ -53,6 +53,6 @@ declare ptr @llvm.coro.frame() #5 declare i8 @llvm.coro.suspend(token, i1) #3 declare void @"\01??3@YAXPEAX@Z"(ptr) local_unnamed_addr #10 declare ptr @llvm.coro.free(token, ptr nocapture readonly) #2 -declare i1 @llvm.coro.end(ptr, i1) #3 +declare void @llvm.coro.end(ptr, i1) #3 declare void @llvm.lifetime.start.p0(ptr nocapture) #4 declare void @llvm.lifetime.end.p0(ptr nocapture) #4 diff --git a/llvm/test/Transforms/Inline/dilocation-loop-metadata-update.ll b/llvm/test/Transforms/Inline/dilocation-loop-metadata-update.ll new file mode 100644 index 0000000000000..1bc132663331b --- /dev/null +++ b/llvm/test/Transforms/Inline/dilocation-loop-metadata-update.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=inline -S | FileCheck %s + +; When inlining 
we need to update DILocation recursively for the followup +; metadata when updating llvm.loop metadata. + +define void @a() !dbg !3 { +; CHECK-LABEL: define void @a( +; CHECK-SAME: ) !dbg [[DBG3:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: br label %[[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + br label %for.body, !llvm.loop !6 +} + +define void @f() !dbg !17 { +; CHECK-LABEL: define void @f( +; CHECK-SAME: ) !dbg [[DBG17:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[A_EXIT:.*]] +; CHECK: [[A_EXIT]]: +; CHECK-NEXT: br label %[[A_EXIT]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK: [[A_EXIT1:.*:]] +; CHECK-NEXT: ret void +; +entry: + call void @a(), !dbg !18 + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "foo.c", directory: "/") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = distinct !DISubprogram(name: "a", scope: !1, file: !1, line: 3, type: !4, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!4 = !DISubroutineType(types: !5) +!5 = !{} +!6 = distinct !{!6, !7, !8, !9, !10, !11} +!7 = !DILocation(line: 6, column: 3, scope: !3) +!8 = !DILocation(line: 7, column: 22, scope: !3) +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.distribute.enable", i1 true} +!11 = !{!"llvm.loop.distribute.followup_all", !7, !8, !9, !12, !13, !14} +!12 = !{!"llvm.loop.vectorize.width", i32 8} +!13 = !{!"llvm.loop.vectorize.enable", i1 true} +!14 = !{!"llvm.loop.vectorize.followup_all", !7, !8, !9, !15, !16} +!15 = !{!"llvm.loop.isvectorized"} +!16 = !{!"llvm.loop.unroll.count", i32 1} +!17 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 9, type: !4, scopeLine: 9, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!18 = !DILocation(line: 9, column: 12, scope: !17) +;. 
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C11, file: [[META1:![0-9]+]], producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, splitDebugInlining: false, nameTableKind: None) +; CHECK: [[META1]] = !DIFile(filename: "{{.*}}foo.c", directory: {{.*}}) +; CHECK: [[DBG3]] = distinct !DISubprogram(name: "a", scope: [[META1]], file: [[META1]], line: 3, type: [[META4:![0-9]+]], scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]]) +; CHECK: [[META4]] = !DISubroutineType(types: [[META5:![0-9]+]]) +; CHECK: [[META5]] = !{} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META7:![0-9]+]], [[META8:![0-9]+]], [[META9:![0-9]+]], [[META10:![0-9]+]], [[META11:![0-9]+]]} +; CHECK: [[META7]] = !DILocation(line: 6, column: 3, scope: [[DBG3]]) +; CHECK: [[META8]] = !DILocation(line: 7, column: 22, scope: [[DBG3]]) +; CHECK: [[META9]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META10]] = !{!"llvm.loop.distribute.enable", i1 true} +; CHECK: [[META11]] = !{!"llvm.loop.distribute.followup_all", [[META7]], [[META8]], [[META9]], [[META12:![0-9]+]], [[META13:![0-9]+]], [[META14:![0-9]+]]} +; CHECK: [[META12]] = !{!"llvm.loop.vectorize.width", i32 8} +; CHECK: [[META13]] = !{!"llvm.loop.vectorize.enable", i1 true} +; CHECK: [[META14]] = !{!"llvm.loop.vectorize.followup_all", [[META7]], [[META8]], [[META9]], [[META15:![0-9]+]], [[META16:![0-9]+]]} +; CHECK: [[META15]] = !{!"llvm.loop.isvectorized"} +; CHECK: [[META16]] = !{!"llvm.loop.unroll.count", i32 1} +; CHECK: [[DBG17]] = distinct !DISubprogram(name: "f", scope: [[META1]], file: [[META1]], line: 9, type: [[META4]], scopeLine: 9, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]]) +; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META19:![0-9]+]], [[META21:![0-9]+]], [[META9]], [[META10]], [[META22:![0-9]+]]} +; CHECK: [[META19]] = !DILocation(line: 6, column: 3, scope: [[DBG3]], inlinedAt: [[META20:![0-9]+]]) +; CHECK: [[META20]] = distinct !DILocation(line: 9, column: 12, scope: [[DBG17]]) +; CHECK: [[META21]] = !DILocation(line: 7, column: 22, scope: [[DBG3]], inlinedAt: [[META20]]) +; CHECK: [[META22]] = !{!"llvm.loop.distribute.followup_all", [[META19]], [[META21]], [[META9]], [[META12]], [[META13]], [[META23:![0-9]+]]} +; CHECK: [[META23]] = !{!"llvm.loop.vectorize.followup_all", [[META19]], [[META21]], [[META9]], [[META15]], [[META16]]} +;. 
diff --git a/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll b/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll index b65ed66fcce65..b0a238ff8efee 100644 --- a/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll +++ b/llvm/test/Transforms/Inline/optimization-remarks-passed-yaml.ll @@ -8,6 +8,11 @@ ; RUN: -pass-remarks-with-hotness 2>&1 | FileCheck %s ; RUN: cat %t | FileCheck -check-prefixes=YAML,YAML-NO-ANNOTATE %s +; RUN: opt < %s -S -passes=inline -pass-remarks-output=%t.bitstream -pass-remarks=inline \ +; RUN: -pass-remarks-missed=inline -pass-remarks-analysis=inline \ +; RUN: -pass-remarks-with-hotness -pass-remarks-format=bitstream 2>&1 | FileCheck %s +; RUN: llvm-remarkutil bitstream2yaml %t.bitstream | FileCheck -check-prefixes=YAML,YAML-NO-ANNOTATE %s + ; RUN: opt < %s -S -passes=inliner-wrapper -pass-remarks-output=%t -pass-remarks=inline \ ; RUN: -pass-remarks-missed=inline -pass-remarks-analysis=inline \ ; RUN: -annotate-inline-phase=false \ diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll index 077da9cda6523..3ff9439040438 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll @@ -6527,15 +6527,15 @@ define ptr addrspace(8) @make_buffer_rsrc_poison() { ; CHECK-LABEL: @make_buffer_rsrc_poison( ; CHECK-NEXT: ret ptr addrspace(8) poison ; - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) poison, i16 0, i32 1234, i32 5678) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) poison, i16 0, i64 1234, i32 5678) ret ptr addrspace(8) %rsrc } define ptr addrspace(8) @make_buffer_rsrc_undef() { ; CHECK-LABEL: @make_buffer_rsrc_undef( -; CHECK-NEXT: [[RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) undef, i16 0, i32 1234, i32 5678) +; CHECK-NEXT: [[RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) undef, i16 0, i64 1234, i32 5678) ; CHECK-NEXT: ret ptr addrspace(8) [[RSRC]] ; - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) undef, i16 0, i32 1234, i32 5678) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) undef, i16 0, i64 1234, i32 5678) ret ptr addrspace(8) %rsrc } diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll index beb84362b7f92..90877be255e0f 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/ptr-replace-alloca.ll @@ -109,4 +109,23 @@ bb: ret void } +@global = external addrspace(1) constant [16 x float], align 64 + +define float @issue160302(i1 %cond, ptr addrspace(5) %arg) { +; CHECK-LABEL: define float @issue160302( +; CHECK-SAME: i1 [[COND:%.*]], ptr addrspace(5) [[ARG:%.*]]) { +; CHECK-NEXT: [[AGG_TMP2_I4:%.*]] = alloca [16 x float], align 64, addrspace(5) +; CHECK-NEXT: [[SELECT_PTR:%.*]] = select i1 [[COND]], ptr addrspace(5) [[AGG_TMP2_I4]], ptr addrspace(5) [[ARG]] +; CHECK-NEXT: [[COND_I:%.*]] = load float, ptr addrspace(5) [[SELECT_PTR]], align 4 +; CHECK-NEXT: ret float [[COND_I]] +; + %agg.tmp2.i4 = alloca [16 x float], align 64, addrspace(5) + call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) %agg.tmp2.i4, ptr addrspace(1) @global, i64 0, i1 false) + %m_Data.i14.i = getelementptr [16 x 
float], ptr addrspace(5) %agg.tmp2.i4, i32 0, i32 0 + %gep = getelementptr [16 x float], ptr addrspace(5) %arg, i32 0, i32 0 + %select.ptr = select i1 %cond, ptr addrspace(5) %m_Data.i14.i, ptr addrspace(5) %gep + %cond.i = load float, ptr addrspace(5) %select.ptr, align 4 + ret float %cond.i +} + declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias writeonly captures(none), ptr addrspace(4) noalias readonly captures(none), i64, i1 immarg) #0 diff --git a/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll b/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll index 1819f4ed181c0..4d856699b2d24 100644 --- a/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll +++ b/llvm/test/Transforms/InstCombine/NVPTX/nvvm-intrins.ll @@ -185,52 +185,63 @@ define float @trunc_float_ftz(float %a) #0 { } ; Check NVVM intrinsics that correspond to LLVM cast operations. +; fp -> integer casts should not be converted, as the semantics +; for NaN/Inf/Overflow inputs are different. +; Only integer -> fp casts should be converted. ; CHECK-LABEL: @test_d2i define i32 @test_d2i(double %a) #0 { -; CHECK: fptosi double %a to i32 +; CHECK: call i32 @llvm.nvvm.d2i.rz(double %a) +; CHECK-NOT: fptosi double %a to i32 %ret = call i32 @llvm.nvvm.d2i.rz(double %a) ret i32 %ret } ; CHECK-LABEL: @test_f2i define i32 @test_f2i(float %a) #0 { -; CHECK: fptosi float %a to i32 +; CHECK: call i32 @llvm.nvvm.f2i.rz(float %a) +; CHECK-NOT: fptosi float %a to i32 %ret = call i32 @llvm.nvvm.f2i.rz(float %a) ret i32 %ret } ; CHECK-LABEL: @test_d2ll define i64 @test_d2ll(double %a) #0 { -; CHECK: fptosi double %a to i64 +; CHECK: call i64 @llvm.nvvm.d2ll.rz(double %a) +; CHECK-NOT: fptosi double %a to i64 %ret = call i64 @llvm.nvvm.d2ll.rz(double %a) ret i64 %ret } ; CHECK-LABEL: @test_f2ll define i64 @test_f2ll(float %a) #0 { -; CHECK: fptosi float %a to i64 +; CHECK: call i64 @llvm.nvvm.f2ll.rz(float %a) +; CHECK-NOT: fptosi float %a to i64 %ret = call i64 @llvm.nvvm.f2ll.rz(float %a) ret i64 %ret } ; CHECK-LABEL: @test_d2ui define i32 @test_d2ui(double %a) #0 { -; CHECK: fptoui double %a to i32 +; CHECK: call i32 @llvm.nvvm.d2ui.rz(double %a) +; CHECK-NOT: fptoui double %a to i32 %ret = call i32 @llvm.nvvm.d2ui.rz(double %a) ret i32 %ret } ; CHECK-LABEL: @test_f2ui define i32 @test_f2ui(float %a) #0 { -; CHECK: fptoui float %a to i32 +; CHECK: call i32 @llvm.nvvm.f2ui.rz(float %a) +; CHECK-NOT: fptoui float %a to i32 %ret = call i32 @llvm.nvvm.f2ui.rz(float %a) ret i32 %ret } ; CHECK-LABEL: @test_d2ull define i64 @test_d2ull(double %a) #0 { -; CHECK: fptoui double %a to i64 +; CHECK: call i64 @llvm.nvvm.d2ull.rz(double %a) +; CHECK-NOT: fptoui double %a to i64 %ret = call i64 @llvm.nvvm.d2ull.rz(double %a) ret i64 %ret } ; CHECK-LABEL: @test_f2ull define i64 @test_f2ull(float %a) #0 { -; CHECK: fptoui float %a to i64 +; CHECK: call i64 @llvm.nvvm.f2ull.rz(float %a) +; CHECK-NOT: fptoui float %a to i64 %ret = call i64 @llvm.nvvm.f2ull.rz(float %a) ret i64 %ret } @@ -497,4 +508,4 @@ declare float @llvm.nvvm.ui2f.rn(i32) declare double @llvm.nvvm.ull2d.rn(i64) declare float @llvm.nvvm.ull2f.rn(i64) declare i32 @llvm.nvvm.fshr.clamp.i32(i32, i32, i32) -declare i32 @llvm.nvvm.fshl.clamp.i32(i32, i32, i32) \ No newline at end of file +declare i32 @llvm.nvvm.fshl.clamp.i32(i32, i32, i32) diff --git a/llvm/test/Transforms/InstCombine/assume-align.ll b/llvm/test/Transforms/InstCombine/assume-align.ll index 274632658496b..4185b10eeca95 100644 --- a/llvm/test/Transforms/InstCombine/assume-align.ll +++ 
b/llvm/test/Transforms/InstCombine/assume-align.ll @@ -247,6 +247,16 @@ define ptr @redundant_assume_align_8_via_asume(ptr %p) { ret ptr %p } +define ptr @assume_align_1(ptr %p) { +; CHECK-LABEL: @assume_align_1( +; CHECK-NEXT: call void @foo(ptr [[P:%.*]]) +; CHECK-NEXT: ret ptr [[P]] +; + call void @llvm.assume(i1 true) [ "align"(ptr %p, i32 1) ] + call void @foo(ptr %p) + ret ptr %p +} + declare void @foo(ptr) ; !align must have a constant integer alignment. diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index e87a61a57ea47..7b0b871513513 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ b/llvm/test/Transforms/InstCombine/assume.ll @@ -498,13 +498,13 @@ not_taken: define i1 @nonnull3B(ptr %a, i1 %control) { ; CHECK-LABEL: @nonnull3B( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[A:%.*]], align 8 ; CHECK-NEXT: br i1 [[CONTROL:%.*]], label [[TAKEN:%.*]], label [[NOT_TAKEN:%.*]] ; CHECK: taken: -; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[CMP:%.*]] = icmp ne ptr [[LOAD]], null -; CHECK-NEXT: call void @llvm.assume(i1 [[CMP]]) [ "nonnull"(ptr [[LOAD]]) ] -; CHECK-NEXT: ret i1 [[CMP]] +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(ptr [[LOAD]]) ] +; CHECK-NEXT: ret i1 true ; CHECK: not_taken: +; CHECK-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(ptr [[LOAD]]) ] ; CHECK-NEXT: ret i1 false ; entry: @@ -512,10 +512,10 @@ entry: %cmp = icmp ne ptr %load, null br i1 %control, label %taken, label %not_taken taken: - call void @llvm.assume(i1 %cmp) ["nonnull"(ptr %load)] + call void @llvm.assume(i1 true) ["nonnull"(ptr %load)] ret i1 %cmp not_taken: - call void @llvm.assume(i1 %cmp) ["nonnull"(ptr %load)] + call void @llvm.assume(i1 true) ["nonnull"(ptr %load)] ret i1 %control } @@ -544,7 +544,7 @@ taken: br label %exit exit: ; FIXME: this shouldn't be dropped because it is still dominated by the new position of %load - call void @llvm.assume(i1 %cmp) ["nonnull"(ptr %load)] + call void @llvm.assume(i1 %cmp) ret i1 %cmp2 not_taken: call void @llvm.assume(i1 %cmp) @@ -575,7 +575,7 @@ taken: exit: ret i1 %cmp2 not_taken: - call void @llvm.assume(i1 %cmp) ["nonnull"(ptr %load)] + call void @llvm.assume(i1 %cmp) ret i1 %control } diff --git a/llvm/test/Transforms/InstCombine/fsh.ll b/llvm/test/Transforms/InstCombine/fsh.ll index 0325c60997dfd..28c541e1a9eb2 100644 --- a/llvm/test/Transforms/InstCombine/fsh.ll +++ b/llvm/test/Transforms/InstCombine/fsh.ll @@ -1214,3 +1214,75 @@ define i31 @fshr_neg_amount_non_power_two(i31 %x, i31 %y) { %r = call i31 @llvm.fshr.i31(i31 %x, i31 %x, i31 %n) ret i31 %r } + +define i32 @rot_const_consecutive(i32 %x) { +; CHECK-LABEL: @rot_const_consecutive( +; CHECK-NEXT: [[R2:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[X]], i32 8) +; CHECK-NEXT: ret i32 [[R2]] +; + %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 13) + %r2 = call i32 @llvm.fshl.i32(i32 %r, i32 %r, i32 27) + ret i32 %r2 +} + +define i32 @rot_const_consecutive_multi_use(i32 %x) { +; CHECK-LABEL: @rot_const_consecutive_multi_use( +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[X]], i32 7) +; CHECK-NEXT: [[R3:%.*]] = call i32 @llvm.fshl.i32(i32 [[X]], i32 [[X]], i32 11) +; CHECK-NEXT: [[R2:%.*]] = and i32 [[R]], [[R3]] +; CHECK-NEXT: ret i32 [[R2]] +; + %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 7) + %r2 = call i32 @llvm.fshl.i32(i32 %r, i32 %r, i32 4) + %and = and i32 %r, %r2 + ret i32 %and +} + +define i32 
@rot_const_consecutive_cancel_out(i32 %x) { +; CHECK-LABEL: @rot_const_consecutive_cancel_out( +; CHECK-NEXT: ret i32 [[X:%.*]] +; + %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 7) + %r2 = call i32 @llvm.fshl.i32(i32 %r, i32 %r, i32 25) + ret i32 %r2 +} + +;; negative test, consecutive rotates only fold if shift amounts are const + +define i32 @rot_nonconst_shift(i32 %x, i32 %amt) { +; CHECK-LABEL: @rot_nonconst_shift( +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[X]], i32 7) +; CHECK-NEXT: [[R2:%.*]] = call i32 @llvm.fshl.i32(i32 [[R]], i32 [[R]], i32 [[AMT:%.*]]) +; CHECK-NEXT: ret i32 [[R2]] +; + %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 7) + %r2 = call i32 @llvm.fshl.i32(i32 %r, i32 %r, i32 %amt) + ret i32 %r2 +} + +;; negative test, 1st funnel shift isn't a rotate. + +define i32 @fsh_rot(i32 %x, i32 %y) { +; CHECK-LABEL: @fsh_rot( +; CHECK-NEXT: [[FSH:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[Y:%.*]], i32 7) +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.fshl.i32(i32 [[FSH]], i32 [[FSH]], i32 4) +; CHECK-NEXT: ret i32 [[R]] +; + %fsh = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7) + %r = call i32 @llvm.fshl.i32(i32 %fsh, i32 %fsh, i32 4) + ret i32 %r +} + +;; negative test, 2nd funnel shift isn't a rotate. + +define i32 @rot_fsh(i32 %x, i32 %y) { +; CHECK-LABEL: @rot_fsh( +; CHECK-NEXT: [[Y:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[X]], i32 7) +; CHECK-NEXT: [[R2:%.*]] = call i32 @llvm.fshl.i32(i32 [[Y]], i32 [[R:%.*]], i32 4) +; CHECK-NEXT: ret i32 [[R2]] +; + %r = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 7) + %r2 = call i32 @llvm.fshl.i32(i32 %r, i32 %y, i32 4) + ret i32 %r2 +} + diff --git a/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll b/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll index f51e444a815c8..470d6be88672b 100644 --- a/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll +++ b/llvm/test/Transforms/InstCombine/insert-extract-shuffle.ll @@ -804,3 +804,49 @@ define <4 x i32> @infloop_D151807(<4 x float> %arg) { %i4 = insertelement <4 x i32> zeroinitializer, i32 %i3, i64 0 ret <4 x i32> %i4 } + +; Make sure we don't crash in this case. 
+ +define i64 @pr160507(ptr %arg, i32 %arg1, i1 %arg2, i8 %arg3, i64 %arg4) { +; CHECK-LABEL: @pr160507( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB5:%.*]] +; CHECK: bb5: +; CHECK-NEXT: br i1 [[ARG2:%.*]], label [[BB6:%.*]], label [[BB8:%.*]] +; CHECK: bb6: +; CHECK-NEXT: br label [[BB5]] +; CHECK: bb8: +; CHECK-NEXT: br label [[BB10:%.*]] +; CHECK: bb10: +; CHECK-NEXT: br label [[BB12:%.*]] +; CHECK: bb12: +; CHECK-NEXT: store i64 0, ptr [[ARG:%.*]], align 8 +; CHECK-NEXT: br label [[BB5]] +; +bb: + br label %bb5 + +bb5: + %phi = phi i8 [ 0, %bb ], [ %extractelement, %bb6 ], [ 0, %bb12 ] + br i1 %arg2, label %bb6, label %bb8 + +bb6: + %extractelement = extractelement <2 x i8> zeroinitializer, i64 %arg4 + br label %bb5 + +bb8: + %insertelement9 = insertelement <2 x i8> , i8 %phi, i64 0 + %zext = zext <2 x i8> %insertelement9 to <2 x i64> + %shufflevector = shufflevector <2 x i64> %zext, <2 x i64> poison, <4 x i32> + br label %bb10 + +bb10: + br label %bb12 + +bb12: + %extractelement11 = extractelement <2 x i64> %zext, i64 1 + %insertelement13 = insertelement <4 x i64> %shufflevector, i64 %extractelement11, i64 0 + %extractelement14 = extractelement <4 x i64> %insertelement13, i32 %arg1 + store i64 %extractelement14, ptr %arg, align 8 + br label %bb5 +} diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll new file mode 100644 index 0000000000000..61b13312521d2 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s +target datalayout = "p1:64:64:64:32" + +define i32 @ptrtoaddr_inttoptr_arg(i32 %a) { +; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg( +; CHECK-SAME: i32 [[A:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[A]] to i64 +; CHECK-NEXT: [[TOPTR:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(1) +; CHECK-NEXT: [[TOADDR:%.*]] = ptrtoaddr ptr addrspace(1) [[TOPTR]] to i32 +; CHECK-NEXT: ret i32 [[TOADDR]] +; + %toptr = inttoptr i32 %a to ptr addrspace(1) + %toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32 + ret i32 %toaddr +} + +define i32 @ptrtoaddr_inttoptr() { +; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr() { +; CHECK-NEXT: ret i32 -1 +; + ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i32 -1 to ptr addrspace(1)) to i32) +} + +define i32 @ptrtoaddr_inttoptr_diff_size1() { +; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size1() { +; CHECK-NEXT: ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i64 -1 to ptr addrspace(1)) to i32) +; + ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i64 -1 to ptr addrspace(1)) to i32) +} + +define i32 @ptrtoaddr_inttoptr_diff_size2() { +; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_diff_size2() { +; CHECK-NEXT: ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i16 -1 to ptr addrspace(1)) to i32) +; + ret i32 ptrtoaddr (ptr addrspace(1) inttoptr (i16 -1 to ptr addrspace(1)) to i32) +} + +define i64 @ptrtoaddr_inttoptr_noas1() { +; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas1() { +; CHECK-NEXT: ret i64 1 +; + ret i64 ptrtoaddr (ptr getelementptr (i8, ptr null, i64 1) to i64) +} + +define i64 @ptr2addr2_inttoptr_noas2() { +; CHECK-LABEL: define i64 @ptr2addr2_inttoptr_noas2() { +; CHECK-NEXT: ret i64 123 +; + ret i64 ptrtoaddr (ptr inttoptr (i64 123 to ptr) to i64) +} + +define i64 @ptrtoaddr_inttoptr_noas_diff_size1() { +; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size1() { +; CHECK-NEXT: ret i64 ptrtoaddr 
(ptr inttoptr (i32 -1 to ptr) to i64) +; + ret i64 ptrtoaddr (ptr inttoptr (i32 -1 to ptr) to i64) +} + +define i64 @ptrtoaddr_inttoptr_noas_diff_size2() { +; CHECK-LABEL: define i64 @ptrtoaddr_inttoptr_noas_diff_size2() { +; CHECK-NEXT: ret i64 ptrtoaddr (ptr inttoptr (i128 -1 to ptr) to i64) +; + ret i64 ptrtoaddr (ptr inttoptr (i128 -1 to ptr) to i64) +} diff --git a/llvm/test/Transforms/InstCombine/select-masked_load.ll b/llvm/test/Transforms/InstCombine/select-masked_load.ll index b6bac612d6f9b..22e30ac019a5d 100644 --- a/llvm/test/Transforms/InstCombine/select-masked_load.ll +++ b/llvm/test/Transforms/InstCombine/select-masked_load.ll @@ -26,8 +26,7 @@ define <4 x i32> @masked_load_and_zero_inactive_2(ptr %ptr, <4 x i1> %mask) { ; No transform when the load's passthrough cannot be reused or altered. define <4 x i32> @masked_load_and_zero_inactive_3(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthrough) { ; CHECK-LABEL: @masked_load_and_zero_inactive_3( -; CHECK-NEXT: [[LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK:%.*]], <4 x i32> [[PASSTHROUGH:%.*]]) -; CHECK-NEXT: [[MASKED:%.*]] = select <4 x i1> [[MASK]], <4 x i32> [[LOAD]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[MASKED:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[PTR:%.*]], i32 4, <4 x i1> [[MASK:%.*]], <4 x i32> zeroinitializer) ; CHECK-NEXT: ret <4 x i32> [[MASKED]] ; %load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %ptr, i32 4, <4 x i1> %mask, <4 x i32> %passthrough) @@ -116,6 +115,40 @@ entry: ret <8 x float> %1 } +define @fold_sel_into_masked_load_scalable(ptr %loc, %mask, %passthrough) { +; CHECK-LABEL: @fold_sel_into_masked_load_scalable( +; CHECK-NEXT: [[SEL:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[LOC:%.*]], i32 1, [[MASK:%.*]], [[PASSTHROUGH:%.*]]) +; CHECK-NEXT: ret [[SEL]] +; + %load = call @llvm.masked.load.nxv4f32.p0(ptr %loc, i32 1, %mask, zeroinitializer) + %sel = select %mask, %load, %passthrough + ret %sel +} + +define @neg_fold_sel_into_masked_load_mask_mismatch(ptr %loc, %mask, %mask2, %passthrough) { +; CHECK-LABEL: @neg_fold_sel_into_masked_load_mask_mismatch( +; CHECK-NEXT: [[LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[LOC:%.*]], i32 1, [[MASK:%.*]], [[PASSTHROUGH:%.*]]) +; CHECK-NEXT: [[SEL:%.*]] = select [[MASK2:%.*]], [[LOAD]], [[PASSTHROUGH]] +; CHECK-NEXT: ret [[SEL]] +; + %load = call @llvm.masked.load.nxv4f32.p0(ptr %loc, i32 1, %mask, %passthrough) + %sel = select %mask2, %load, %passthrough + ret %sel +} + +define @fold_sel_into_masked_load_scalable_one_use_check(ptr %loc1, %mask, %passthrough, ptr %loc2) { +; CHECK-LABEL: @fold_sel_into_masked_load_scalable_one_use_check( +; CHECK-NEXT: [[LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[LOC:%.*]], i32 1, [[MASK:%.*]], zeroinitializer) +; CHECK-NEXT: [[SEL:%.*]] = select [[MASK]], [[LOAD]], [[PASSTHROUGH:%.*]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[LOAD]], ptr [[LOC2:%.*]], i32 1, [[MASK]]) +; CHECK-NEXT: ret [[SEL]] +; + %load = call @llvm.masked.load.nxv4f32.p0(ptr %loc1, i32 1, %mask, zeroinitializer) + %sel = select %mask, %load, %passthrough + call void @llvm.masked.store.nxv4f32.p0( %load, ptr %loc2, i32 1, %mask) + ret %sel +} + declare <8 x float> @llvm.masked.load.v8f32.p0(ptr, i32 immarg, <8 x i1>, <8 x float>) declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>) declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>) diff --git 
a/llvm/test/Transforms/InstCombine/vector-reductions.ll b/llvm/test/Transforms/InstCombine/vector-reductions.ll index 10f4aca72dbc7..f1e0dd9bd06d7 100644 --- a/llvm/test/Transforms/InstCombine/vector-reductions.ll +++ b/llvm/test/Transforms/InstCombine/vector-reductions.ll @@ -308,3 +308,174 @@ define i32 @diff_of_sums_type_mismatch2(<8 x i32> %v0, <4 x i32> %v1) { %r = sub i32 %r0, %r1 ret i32 %r } + +define i32 @constant_multiplied_4xi32(i32 %0) { +; CHECK-LABEL: @constant_multiplied_4xi32( +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 2 +; CHECK-NEXT: ret i32 [[TMP2]] +; + %2 = insertelement <4 x i32> poison, i32 %0, i64 0 + %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> zeroinitializer + %4 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %3) + ret i32 %4 +} + +define i32 @constant_multiplied_3xi32(i32 %0) { +; CHECK-LABEL: @constant_multiplied_3xi32( +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP0:%.*]], 3 +; CHECK-NEXT: ret i32 [[TMP2]] +; + %2 = insertelement <3 x i32> poison, i32 %0, i64 0 + %3 = shufflevector <3 x i32> %2, <3 x i32> poison, <3 x i32> zeroinitializer + %4 = tail call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> %3) + ret i32 %4 +} + +define i64 @constant_multiplied_4xi64(i64 %0) { +; CHECK-LABEL: @constant_multiplied_4xi64( +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0:%.*]], 2 +; CHECK-NEXT: ret i64 [[TMP2]] +; + %2 = insertelement <4 x i64> poison, i64 %0, i64 0 + %3 = shufflevector <4 x i64> %2, <4 x i64> poison, <4 x i32> zeroinitializer + %4 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3) + ret i64 %4 +} + +define i32 @constant_multiplied_8xi32(i32 %0) { +; CHECK-LABEL: @constant_multiplied_8xi32( +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 3 +; CHECK-NEXT: ret i32 [[TMP2]] +; + %2 = insertelement <4 x i32> poison, i32 %0, i64 0 + %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <8 x i32> zeroinitializer + %4 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3) + ret i32 %4 +} + + +define i32 @constant_multiplied_16xi32(i32 %0) { +; CHECK-LABEL: @constant_multiplied_16xi32( +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 4 +; CHECK-NEXT: ret i32 [[TMP2]] +; + %2 = insertelement <4 x i32> poison, i32 %0, i64 0 + %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <16 x i32> zeroinitializer + %4 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3) + ret i32 %4 +} + + +define i32 @constant_multiplied_4xi32_at_idx1(i32 %0) { +; CHECK-LABEL: @constant_multiplied_4xi32_at_idx1( +; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 2 +; CHECK-NEXT: ret i32 [[TMP2]] +; + %2 = insertelement <4 x i32> poison, i32 %0, i64 1 + %3 = shufflevector <4 x i32> %2, <4 x i32> poison, + <4 x i32> + %4 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %3) + ret i32 %4 +} + +define i32 @negative_constant_multiplied_4xi32(i32 %0) { +; CHECK-LABEL: @negative_constant_multiplied_4xi32( +; CHECK-NEXT: ret i32 poison +; + %2 = insertelement <4 x i32> poison, i32 %0, i64 1 + %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> zeroinitializer + %4 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %3) + ret i32 %4 +} + +define i32 @constant_multiplied_6xi32(i32 %0) { +; CHECK-LABEL: @constant_multiplied_6xi32( +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP0:%.*]], 6 +; CHECK-NEXT: ret i32 [[TMP2]] +; + %2 = insertelement <4 x i32> poison, i32 %0, i64 0 + %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <6 x i32> zeroinitializer + %4 = tail call i32 @llvm.vector.reduce.add.v6i32(<6 x i32> %3) + ret i32 %4 +} + 
+define i64 @constant_multiplied_6xi64(i64 %0) { +; CHECK-LABEL: @constant_multiplied_6xi64( +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0:%.*]], 6 +; CHECK-NEXT: ret i64 [[TMP2]] +; + %2 = insertelement <4 x i64> poison, i64 %0, i64 0 + %3 = shufflevector <4 x i64> %2, <4 x i64> poison, <6 x i32> zeroinitializer + %4 = tail call i64 @llvm.vector.reduce.add.v6i64(<6 x i64> %3) + ret i64 %4 +} + +define i1 @constant_multiplied_8xi1(i1 %0) { +; CHECK-LABEL: @constant_multiplied_8xi1( +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i1> poison, i1 [[TMP0:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[TMP5:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[TMP4]]) +; CHECK-NEXT: [[TMP6:%.*]] = trunc i8 [[TMP5]] to i1 +; CHECK-NEXT: ret i1 [[TMP6]] +; + %2 = insertelement <8 x i1> poison, i1 %0, i32 0 + %3 = shufflevector <8 x i1> %2, <8 x i1> poison, <8 x i32> zeroinitializer + %4 = tail call i1 @llvm.vector.reduce.add.v8i1(<8 x i1> %3) + ret i1 %4 +} + +define i2 @constant_multiplied_4xi2(i2 %0) { +; CHECK-LABEL: @constant_multiplied_4xi2( +; CHECK-NEXT: ret i2 0 +; + %2 = insertelement <4 x i2> poison, i2 %0, i32 0 + %3 = shufflevector <4 x i2> %2, <4 x i2> poison, <4 x i32> zeroinitializer + %4 = tail call i2 @llvm.vector.reduce.add.v4i2(<4 x i2> %3) + ret i2 %4 +} + +define i2 @constant_multiplied_5xi2(i2 %0) { +; CHECK-LABEL: @constant_multiplied_5xi2( +; CHECK-NEXT: ret i2 [[TMP0:%.*]] +; + %2 = insertelement <5 x i2> poison, i2 %0, i64 0 + %3 = shufflevector <5 x i2> %2, <5 x i2> poison, <5 x i32> zeroinitializer + %4 = tail call i2 @llvm.vector.reduce.add.v5i2(<5 x i2> %3) + ret i2 %4 +} + +define i2 @constant_multiplied_6xi2(i2 %0) { +; CHECK-LABEL: @constant_multiplied_6xi2( +; CHECK-NEXT: [[TMP2:%.*]] = shl i2 [[TMP0:%.*]], 1 +; CHECK-NEXT: ret i2 [[TMP2]] +; + %2 = insertelement <6 x i2> poison, i2 %0, i64 0 + %3 = shufflevector <6 x i2> %2, <6 x i2> poison, <6 x i32> zeroinitializer + %4 = tail call i2 @llvm.vector.reduce.add.v6i2(<6 x i2> %3) + ret i2 %4 +} + +define i2 @constant_multiplied_7xi2(i2 %0) { +; CHECK-LABEL: @constant_multiplied_7xi2( +; CHECK-NEXT: [[TMP2:%.*]] = sub i2 0, [[TMP0:%.*]] +; CHECK-NEXT: ret i2 [[TMP2]] +; + %2 = insertelement <7 x i2> poison, i2 %0, i64 0 + %3 = shufflevector <7 x i2> %2, <7 x i2> poison, <7 x i32> zeroinitializer + %4 = tail call i2 @llvm.vector.reduce.add.v7i2(<7 x i2> %3) + ret i2 %4 +} + +define i32 @negative_scalable_vector(i32 %0) { +; CHECK-LABEL: @negative_scalable_vector( +; CHECK-NEXT: [[TMP2:%.*]] = insertelement poison, i32 [[TMP0:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector [[TMP2]], poison, zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP3]]) +; CHECK-NEXT: ret i32 [[TMP4]] +; + %2 = insertelement poison, i32 %0, i64 0 + %3 = shufflevector %2, poison, zeroinitializer + %4 = tail call i32 @llvm.vector.reduce.add.nxv4i32( %3) + ret i32 %4 +} diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/inttoptr-gep-index-width.ll b/llvm/test/Transforms/InstSimplify/ConstProp/inttoptr-gep-index-width.ll deleted file mode 100644 index 03056e8361e21..0000000000000 --- a/llvm/test/Transforms/InstSimplify/ConstProp/inttoptr-gep-index-width.ll +++ /dev/null @@ -1,14 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -passes=instsimplify < %s | FileCheck %s - 
-target datalayout = "p:16:16:16:8" - -; The GEP should only modify the low 8 bits of the pointer. -define ptr @test() { -; CHECK-LABEL: define ptr @test() { -; CHECK-NEXT: ret ptr inttoptr (i16 -256 to ptr) -; - %base = inttoptr i16 -1 to ptr - %gep = getelementptr i8, ptr %base, i8 1 - ret ptr %gep -} diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/inttoptr-gep-nonintegral.ll b/llvm/test/Transforms/InstSimplify/ConstProp/inttoptr-gep-nonintegral.ll new file mode 100644 index 0000000000000..f66825767bd0b --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/ConstProp/inttoptr-gep-nonintegral.ll @@ -0,0 +1,145 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=instsimplify < %s | FileCheck %s +;; Check that we do not create new inttoptr instructions for unstable pointers +;; or pointers with external state (even if the values are all constants). +;; NOTE: for all but the zero address space, the GEP should only modify the +;; low 8 bits of the pointer. +target datalayout = "p:16:16:16:16-p1:16:16:16:8-pu2:16:16:16:8-pe3:16:16:16:8" + +define ptr @test_null_base_normal() { +; CHECK-LABEL: define ptr @test_null_base_normal() { +; CHECK-NEXT: ret ptr inttoptr (i16 -2 to ptr) +; + %gep = getelementptr i8, ptr null, i8 -2 + ret ptr %gep +} +define ptr @test_inttoptr_base_normal() { +; CHECK-LABEL: define ptr @test_inttoptr_base_normal() { +; CHECK-NEXT: ret ptr null +; + %base = inttoptr i16 -1 to ptr + %gep = getelementptr i8, ptr %base, i8 1 + ret ptr %gep +} + +;; Transformation is fine for non-integral address space, but we can only change +;; the index bits: (i8 -2 == i16 254) +define ptr addrspace(1) @test_null_base_nonintegral() { +; CHECK-LABEL: define ptr addrspace(1) @test_null_base_nonintegral() { +; CHECK-NEXT: ret ptr addrspace(1) inttoptr (i16 254 to ptr addrspace(1)) +; + %gep = getelementptr i8, ptr addrspace(1) null, i8 -2 + ret ptr addrspace(1) %gep +} +define ptr addrspace(1) @test_inttoptr_base_nonintegral() { +; CHECK-LABEL: define ptr addrspace(1) @test_inttoptr_base_nonintegral() { +; CHECK-NEXT: ret ptr addrspace(1) inttoptr (i16 -256 to ptr addrspace(1)) +; + %base = inttoptr i16 -1 to ptr addrspace(1) + %gep = getelementptr i8, ptr addrspace(1) %base, i8 1 + ret ptr addrspace(1) %gep +} + +;; For unstable pointers we should avoid any introduction of inttoptr +define ptr addrspace(2) @test_null_base_unstable() { +; CHECK-LABEL: define ptr addrspace(2) @test_null_base_unstable() { +; CHECK-NEXT: ret ptr addrspace(2) getelementptr (i8, ptr addrspace(2) null, i8 -2) +; + %gep = getelementptr i8, ptr addrspace(2) null, i8 -2 + ret ptr addrspace(2) %gep +} +define ptr addrspace(2) @test_inttoptr_base_unstable() { +; CHECK-LABEL: define ptr addrspace(2) @test_inttoptr_base_unstable() { +; CHECK-NEXT: ret ptr addrspace(2) getelementptr (i8, ptr addrspace(2) inttoptr (i16 -1 to ptr addrspace(2)), i8 1) +; + %base = inttoptr i16 -1 to ptr addrspace(2) + %gep = getelementptr i8, ptr addrspace(2) %base, i8 1 + ret ptr addrspace(2) %gep +} + +;; The same is true for pointers with external state: no new inttoptr +define ptr addrspace(3) @test_null_base_external() { +; CHECK-LABEL: define ptr addrspace(3) @test_null_base_external() { +; CHECK-NEXT: ret ptr addrspace(3) getelementptr (i8, ptr addrspace(3) null, i8 -2) +; + %gep = getelementptr i8, ptr addrspace(3) null, i8 -2 + ret ptr addrspace(3) %gep +} + +define ptr addrspace(3) @test_inttoptr_base_external() { +; CHECK-LABEL: define ptr addrspace(3) 
@test_inttoptr_base_external() { +; CHECK-NEXT: ret ptr addrspace(3) getelementptr (i8, ptr addrspace(3) inttoptr (i16 -1 to ptr addrspace(3)), i8 1) +; + %base = inttoptr i16 -1 to ptr addrspace(3) + %gep = getelementptr i8, ptr addrspace(3) %base, i8 1 + ret ptr addrspace(3) %gep +} + +define <2 x ptr> @test_vec_null_base_normal() { +; CHECK-LABEL: define <2 x ptr> @test_vec_null_base_normal() { +; CHECK-NEXT: ret <2 x ptr> getelementptr (i8, <2 x ptr> zeroinitializer, <2 x i16> ) +; + %gep = getelementptr i8, <2 x ptr> , <2 x i8> + ret <2 x ptr> %gep +} +define <2 x ptr> @test_vec_inttoptr_base_normal() { +; CHECK-LABEL: define <2 x ptr> @test_vec_inttoptr_base_normal() { +; CHECK-NEXT: ret <2 x ptr> getelementptr (i8, <2 x ptr> , <2 x i16> ) +; + %base = inttoptr <2 x i16> to <2 x ptr> + %gep = getelementptr i8, <2 x ptr> %base, <2 x i8> + ret <2 x ptr> %gep +} + +;; Transformation is fine for non-integral address space, but we can only change +;; the index bits: (i8 -2 == i16 254) +define <2 x ptr addrspace(1)> @test_vec_null_base_nonintegral() { +; CHECK-LABEL: define <2 x ptr addrspace(1)> @test_vec_null_base_nonintegral() { +; CHECK-NEXT: ret <2 x ptr addrspace(1)> getelementptr (i8, <2 x ptr addrspace(1)> zeroinitializer, <2 x i8> ) +; + %gep = getelementptr i8, <2 x ptr addrspace(1)> , <2 x i8> + ret <2 x ptr addrspace(1)> %gep +} +define <2 x ptr addrspace(1)> @test_vec_inttoptr_base_nonintegral() { +; CHECK-LABEL: define <2 x ptr addrspace(1)> @test_vec_inttoptr_base_nonintegral() { +; CHECK-NEXT: ret <2 x ptr addrspace(1)> getelementptr (i8, <2 x ptr addrspace(1)> , <2 x i8> ) +; + %base = inttoptr <2 x i16> to <2 x ptr addrspace(1)> + %gep = getelementptr i8, <2 x ptr addrspace(1)> %base, <2 x i8> + ret <2 x ptr addrspace(1)> %gep +} + +;; For unstable pointers we should avoid any introduction of inttoptr +define <2 x ptr addrspace(2)> @test_vec_null_base_unstable() { +; CHECK-LABEL: define <2 x ptr addrspace(2)> @test_vec_null_base_unstable() { +; CHECK-NEXT: ret <2 x ptr addrspace(2)> getelementptr (i8, <2 x ptr addrspace(2)> zeroinitializer, <2 x i8> ) +; + %gep = getelementptr i8, <2 x ptr addrspace(2)> , <2 x i8> + ret <2 x ptr addrspace(2)> %gep +} +define <2 x ptr addrspace(2)> @test_vec_inttoptr_base_unstable() { +; CHECK-LABEL: define <2 x ptr addrspace(2)> @test_vec_inttoptr_base_unstable() { +; CHECK-NEXT: ret <2 x ptr addrspace(2)> getelementptr (i8, <2 x ptr addrspace(2)> , <2 x i8> ) +; + %base = inttoptr <2 x i16> to <2 x ptr addrspace(2)> + %gep = getelementptr i8, <2 x ptr addrspace(2)> %base, <2 x i8> + ret <2 x ptr addrspace(2)> %gep +} + +;; The same is true for pointers with external state: no new inttoptr +define <2 x ptr addrspace(3)> @test_vec_null_base_external() { +; CHECK-LABEL: define <2 x ptr addrspace(3)> @test_vec_null_base_external() { +; CHECK-NEXT: ret <2 x ptr addrspace(3)> getelementptr (i8, <2 x ptr addrspace(3)> zeroinitializer, <2 x i8> ) +; + %gep = getelementptr i8, <2 x ptr addrspace(3)> , <2 x i8> + ret <2 x ptr addrspace(3)> %gep +} + +define <2 x ptr addrspace(3)> @test_vec_inttoptr_base_external() { +; CHECK-LABEL: define <2 x ptr addrspace(3)> @test_vec_inttoptr_base_external() { +; CHECK-NEXT: ret <2 x ptr addrspace(3)> getelementptr (i8, <2 x ptr addrspace(3)> , <2 x i8> ) +; + %base = inttoptr <2 x i16> to <2 x ptr addrspace(3)> + %gep = getelementptr i8, <2 x ptr addrspace(3)> %base, <2 x i8> + ret <2 x ptr addrspace(3)> %gep +} diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll 
b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll index 543c73137c1b6..b1a1e6b86c293 100644 --- a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll +++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2i-d2i.ll @@ -334,8 +334,7 @@ define i32 @test_neg_1_5_d2i_rz() { ;+-------------------------------------------------------------+ define i32 @test_neg_1_5_f2ui_rm() { ; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rm() { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm(float -1.500000e+00) -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %res = call i32 @llvm.nvvm.f2ui.rm(float -1.5) ret i32 %res @@ -343,8 +342,7 @@ define i32 @test_neg_1_5_f2ui_rm() { define i32 @test_neg_1_5_f2ui_rn() { ; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rn() { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rn(float -1.500000e+00) -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %res = call i32 @llvm.nvvm.f2ui.rn(float -1.5) ret i32 %res @@ -353,8 +351,7 @@ define i32 @test_neg_1_5_f2ui_rn() { define i32 @test_neg_1_5_f2ui_rp() { ; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rp() { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rp(float -1.500000e+00) -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %res = call i32 @llvm.nvvm.f2ui.rp(float -1.5) ret i32 %res @@ -362,8 +359,7 @@ define i32 @test_neg_1_5_f2ui_rp() { define i32 @test_neg_1_5_f2ui_rz() { ; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rz() { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rz(float -1.500000e+00) -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %res = call i32 @llvm.nvvm.f2ui.rz(float -1.5) ret i32 %res @@ -374,8 +370,7 @@ define i32 @test_neg_1_5_f2ui_rz() { ;+-------------------------------------------------------------+ define i32 @test_neg_1_5_f2ui_rm_ftz() { ; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rm_ftz() { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm.ftz(float -1.500000e+00) -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %res = call i32 @llvm.nvvm.f2ui.rm.ftz(float -1.5) ret i32 %res @@ -383,8 +378,7 @@ define i32 @test_neg_1_5_f2ui_rm_ftz() { define i32 @test_neg_1_5_f2ui_rn_ftz() { ; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rn_ftz() { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rn.ftz(float -1.500000e+00) -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %res = call i32 @llvm.nvvm.f2ui.rn.ftz(float -1.5) ret i32 %res @@ -392,8 +386,7 @@ define i32 @test_neg_1_5_f2ui_rn_ftz() { define i32 @test_neg_1_5_f2ui_rp_ftz() { ; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rp_ftz() { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rp.ftz(float -1.500000e+00) -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %res = call i32 @llvm.nvvm.f2ui.rp.ftz(float -1.5) ret i32 %res @@ -401,8 +394,7 @@ define i32 @test_neg_1_5_f2ui_rp_ftz() { define i32 @test_neg_1_5_f2ui_rz_ftz() { ; CHECK-LABEL: define i32 @test_neg_1_5_f2ui_rz_ftz() { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rz.ftz(float -1.500000e+00) -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %res = call i32 @llvm.nvvm.f2ui.rz.ftz(float -1.5) ret i32 %res @@ -412,8 +404,7 @@ define i32 @test_neg_1_5_f2ui_rz_ftz() { ;+-------------------------------------------------------------+ define i32 @test_neg_1_5_d2ui_rm() { ; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rm() { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rm(double -1.500000e+00) -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %res = call i32 
@llvm.nvvm.d2ui.rm(double -1.5) ret i32 %res @@ -421,8 +412,7 @@ define i32 @test_neg_1_5_d2ui_rm() { define i32 @test_neg_1_5_d2ui_rn() { ; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rn() { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rn(double -1.500000e+00) -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %res = call i32 @llvm.nvvm.d2ui.rn(double -1.5) ret i32 %res @@ -431,8 +421,7 @@ define i32 @test_neg_1_5_d2ui_rn() { define i32 @test_neg_1_5_d2ui_rp() { ; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rp() { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rp(double -1.500000e+00) -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %res = call i32 @llvm.nvvm.d2ui.rp(double -1.5) ret i32 %res @@ -440,8 +429,7 @@ define i32 @test_neg_1_5_d2ui_rp() { define i32 @test_neg_1_5_d2ui_rz() { ; CHECK-LABEL: define i32 @test_neg_1_5_d2ui_rz() { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rz(double -1.500000e+00) -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %res = call i32 @llvm.nvvm.d2ui.rz(double -1.5) ret i32 %res @@ -526,7 +514,7 @@ define i32 @test_nan_f2i_rz_ftz() { ;+-------------------------------------------------------------+ define i32 @test_nan_d2i_rm() { ; CHECK-LABEL: define i32 @test_nan_d2i_rm() { -; CHECK-NEXT: ret i32 0 +; CHECK-NEXT: ret i32 -2147483648 ; %res = call i32 @llvm.nvvm.d2i.rm(double 0xFFF8000000000000) ret i32 %res @@ -534,7 +522,7 @@ define i32 @test_nan_d2i_rm() { define i32 @test_nan_d2i_rn() { ; CHECK-LABEL: define i32 @test_nan_d2i_rn() { -; CHECK-NEXT: ret i32 0 +; CHECK-NEXT: ret i32 -2147483648 ; %res = call i32 @llvm.nvvm.d2i.rn(double 0xFFF8000000000000) ret i32 %res @@ -543,7 +531,7 @@ define i32 @test_nan_d2i_rn() { define i32 @test_nan_d2i_rp() { ; CHECK-LABEL: define i32 @test_nan_d2i_rp() { -; CHECK-NEXT: ret i32 0 +; CHECK-NEXT: ret i32 -2147483648 ; %res = call i32 @llvm.nvvm.d2i.rp(double 0xFFF8000000000000) ret i32 %res @@ -551,7 +539,7 @@ define i32 @test_nan_d2i_rp() { define i32 @test_nan_d2i_rz() { ; CHECK-LABEL: define i32 @test_nan_d2i_rz() { -; CHECK-NEXT: ret i32 0 +; CHECK-NEXT: ret i32 -2147483648 ; %res = call i32 @llvm.nvvm.d2i.rz(double 0xFFF8000000000000) ret i32 %res @@ -632,7 +620,7 @@ define i32 @test_nan_f2ui_rz_ftz() { ;+-------------------------------------------------------------+ define i32 @test_nan_d2ui_rm() { ; CHECK-LABEL: define i32 @test_nan_d2ui_rm() { -; CHECK-NEXT: ret i32 0 +; CHECK-NEXT: ret i32 -2147483648 ; %res = call i32 @llvm.nvvm.d2ui.rm(double 0xFFF8000000000000) ret i32 %res @@ -640,7 +628,7 @@ define i32 @test_nan_d2ui_rm() { define i32 @test_nan_d2ui_rn() { ; CHECK-LABEL: define i32 @test_nan_d2ui_rn() { -; CHECK-NEXT: ret i32 0 +; CHECK-NEXT: ret i32 -2147483648 ; %res = call i32 @llvm.nvvm.d2ui.rn(double 0xFFF8000000000000) ret i32 %res @@ -649,7 +637,7 @@ define i32 @test_nan_d2ui_rn() { define i32 @test_nan_d2ui_rp() { ; CHECK-LABEL: define i32 @test_nan_d2ui_rp() { -; CHECK-NEXT: ret i32 0 +; CHECK-NEXT: ret i32 -2147483648 ; %res = call i32 @llvm.nvvm.d2ui.rp(double 0xFFF8000000000000) ret i32 %res @@ -657,7 +645,7 @@ define i32 @test_nan_d2ui_rp() { define i32 @test_nan_d2ui_rz() { ; CHECK-LABEL: define i32 @test_nan_d2ui_rz() { -; CHECK-NEXT: ret i32 0 +; CHECK-NEXT: ret i32 -2147483648 ; %res = call i32 @llvm.nvvm.d2ui.rz(double 0xFFF8000000000000) ret i32 %res @@ -994,8 +982,7 @@ define i32 @test_neg_subnormal_d2i_rz() { ;+-------------------------------------------------------------+ define i32 @test_neg_subnormal_f2ui_rm() { ; CHECK-LABEL: 
define i32 @test_neg_subnormal_f2ui_rm() { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.f2ui.rm(float 0xB80FFFFFC0000000) -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %res = call i32 @llvm.nvvm.f2ui.rm(float 0xB80FFFFFC0000000) ret i32 %res @@ -1065,8 +1052,7 @@ define i32 @test_neg_subnormal_f2ui_rz_ftz() { ;+-------------------------------------------------------------+ define i32 @test_neg_subnormal_d2ui_rm() { ; CHECK-LABEL: define i32 @test_neg_subnormal_d2ui_rm() { -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.nvvm.d2ui.rm(double 0x800FFFFFFFFFFFFF) -; CHECK-NEXT: ret i32 [[RES]] +; CHECK-NEXT: ret i32 0 ; %res = call i32 @llvm.nvvm.d2ui.rm(double 0x800fffffffffffff) ret i32 %res diff --git a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll index be38177dce2c3..ffadf26f3c5b5 100644 --- a/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll +++ b/llvm/test/Transforms/InstSimplify/const-fold-nvvm-f2ll-d2ll.ll @@ -334,8 +334,7 @@ define i64 @test_neg_1_5_d2ll_rz() { ;+-------------------------------------------------------------+ define i64 @test_neg_1_5_f2ull_rm() { ; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rm() { -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm(float -1.500000e+00) -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 0 ; %res = call i64 @llvm.nvvm.f2ull.rm(float -1.5) ret i64 %res @@ -343,8 +342,7 @@ define i64 @test_neg_1_5_f2ull_rm() { define i64 @test_neg_1_5_f2ull_rn() { ; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rn() { -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rn(float -1.500000e+00) -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 0 ; %res = call i64 @llvm.nvvm.f2ull.rn(float -1.5) ret i64 %res @@ -353,8 +351,7 @@ define i64 @test_neg_1_5_f2ull_rn() { define i64 @test_neg_1_5_f2ull_rp() { ; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rp() { -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rp(float -1.500000e+00) -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 0 ; %res = call i64 @llvm.nvvm.f2ull.rp(float -1.5) ret i64 %res @@ -362,8 +359,7 @@ define i64 @test_neg_1_5_f2ull_rp() { define i64 @test_neg_1_5_f2ull_rz() { ; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rz() { -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rz(float -1.500000e+00) -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 0 ; %res = call i64 @llvm.nvvm.f2ull.rz(float -1.5) ret i64 %res @@ -374,8 +370,7 @@ define i64 @test_neg_1_5_f2ull_rz() { ;+-------------------------------------------------------------+ define i64 @test_neg_1_5_f2ull_rm_ftz() { ; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rm_ftz() { -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm.ftz(float -1.500000e+00) -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 0 ; %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float -1.5) ret i64 %res @@ -383,8 +378,7 @@ define i64 @test_neg_1_5_f2ull_rm_ftz() { define i64 @test_neg_1_5_f2ull_rn_ftz() { ; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rn_ftz() { -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rn.ftz(float -1.500000e+00) -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 0 ; %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float -1.5) ret i64 %res @@ -392,8 +386,7 @@ define i64 @test_neg_1_5_f2ull_rn_ftz() { define i64 @test_neg_1_5_f2ull_rp_ftz() { ; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rp_ftz() { -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rp.ftz(float -1.500000e+00) -; CHECK-NEXT: ret i64 [[RES]] 
+; CHECK-NEXT: ret i64 0 ; %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float -1.5) ret i64 %res @@ -401,8 +394,7 @@ define i64 @test_neg_1_5_f2ull_rp_ftz() { define i64 @test_neg_1_5_f2ull_rz_ftz() { ; CHECK-LABEL: define i64 @test_neg_1_5_f2ull_rz_ftz() { -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rz.ftz(float -1.500000e+00) -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 0 ; %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float -1.5) ret i64 %res @@ -412,8 +404,7 @@ define i64 @test_neg_1_5_f2ull_rz_ftz() { ;+-------------------------------------------------------------+ define i64 @test_neg_1_5_d2ull_rm() { ; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rm() { -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rm(double -1.500000e+00) -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 0 ; %res = call i64 @llvm.nvvm.d2ull.rm(double -1.5) ret i64 %res @@ -421,8 +412,7 @@ define i64 @test_neg_1_5_d2ull_rm() { define i64 @test_neg_1_5_d2ull_rn() { ; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rn() { -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rn(double -1.500000e+00) -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 0 ; %res = call i64 @llvm.nvvm.d2ull.rn(double -1.5) ret i64 %res @@ -431,8 +421,7 @@ define i64 @test_neg_1_5_d2ull_rn() { define i64 @test_neg_1_5_d2ull_rp() { ; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rp() { -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rp(double -1.500000e+00) -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 0 ; %res = call i64 @llvm.nvvm.d2ull.rp(double -1.5) ret i64 %res @@ -440,8 +429,7 @@ define i64 @test_neg_1_5_d2ull_rp() { define i64 @test_neg_1_5_d2ull_rz() { ; CHECK-LABEL: define i64 @test_neg_1_5_d2ull_rz() { -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rz(double -1.500000e+00) -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 0 ; %res = call i64 @llvm.nvvm.d2ull.rz(double -1.5) ret i64 %res @@ -456,7 +444,7 @@ define i64 @test_neg_1_5_d2ull_rz() { ;+-------------------------------------------------------------+ define i64 @test_nan_f2ll_rm() { ; CHECK-LABEL: define i64 @test_nan_f2ll_rm() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ll.rm(float 0x7FFFFF0000000000) ret i64 %res @@ -464,7 +452,7 @@ define i64 @test_nan_f2ll_rm() { define i64 @test_nan_f2ll_rn() { ; CHECK-LABEL: define i64 @test_nan_f2ll_rn() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ll.rn(float 0x7FFFFF0000000000) ret i64 %res @@ -473,7 +461,7 @@ define i64 @test_nan_f2ll_rn() { define i64 @test_nan_f2ll_rp() { ; CHECK-LABEL: define i64 @test_nan_f2ll_rp() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ll.rp(float 0x7FFFFF0000000000) ret i64 %res @@ -481,7 +469,7 @@ define i64 @test_nan_f2ll_rp() { define i64 @test_nan_f2ll_rz() { ; CHECK-LABEL: define i64 @test_nan_f2ll_rz() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ll.rz(float 0x7FFFFF0000000000) ret i64 %res @@ -492,7 +480,7 @@ define i64 @test_nan_f2ll_rz() { ;+-------------------------------------------------------------+ define i64 @test_nan_f2ll_rm_ftz() { ; CHECK-LABEL: define i64 @test_nan_f2ll_rm_ftz() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ll.rm.ftz(float 0x7FFFFF0000000000) ret i64 %res @@ -500,7 +488,7 @@ define i64 @test_nan_f2ll_rm_ftz() { define i64 
@test_nan_f2ll_rn_ftz() { ; CHECK-LABEL: define i64 @test_nan_f2ll_rn_ftz() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ll.rn.ftz(float 0x7FFFFF0000000000) ret i64 %res @@ -508,7 +496,7 @@ define i64 @test_nan_f2ll_rn_ftz() { define i64 @test_nan_f2ll_rp_ftz() { ; CHECK-LABEL: define i64 @test_nan_f2ll_rp_ftz() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ll.rp.ftz(float 0x7FFFFF0000000000) ret i64 %res @@ -516,7 +504,7 @@ define i64 @test_nan_f2ll_rp_ftz() { define i64 @test_nan_f2ll_rz_ftz() { ; CHECK-LABEL: define i64 @test_nan_f2ll_rz_ftz() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ll.rz.ftz(float 0x7FFFFF0000000000) ret i64 %res @@ -526,7 +514,7 @@ define i64 @test_nan_f2ll_rz_ftz() { ;+-------------------------------------------------------------+ define i64 @test_nan_d2ll_rm() { ; CHECK-LABEL: define i64 @test_nan_d2ll_rm() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.d2ll.rm(double 0xFFF8000000000000) ret i64 %res @@ -534,7 +522,7 @@ define i64 @test_nan_d2ll_rm() { define i64 @test_nan_d2ll_rn() { ; CHECK-LABEL: define i64 @test_nan_d2ll_rn() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.d2ll.rn(double 0xFFF8000000000000) ret i64 %res @@ -543,7 +531,7 @@ define i64 @test_nan_d2ll_rn() { define i64 @test_nan_d2ll_rp() { ; CHECK-LABEL: define i64 @test_nan_d2ll_rp() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.d2ll.rp(double 0xFFF8000000000000) ret i64 %res @@ -551,7 +539,7 @@ define i64 @test_nan_d2ll_rp() { define i64 @test_nan_d2ll_rz() { ; CHECK-LABEL: define i64 @test_nan_d2ll_rz() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.d2ll.rz(double 0xFFF8000000000000) ret i64 %res @@ -562,7 +550,7 @@ define i64 @test_nan_d2ll_rz() { ;+-------------------------------------------------------------+ define i64 @test_nan_f2ull_rm() { ; CHECK-LABEL: define i64 @test_nan_f2ull_rm() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ull.rm(float 0x7FFFFF0000000000) ret i64 %res @@ -570,7 +558,7 @@ define i64 @test_nan_f2ull_rm() { define i64 @test_nan_f2ull_rn() { ; CHECK-LABEL: define i64 @test_nan_f2ull_rn() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ull.rn(float 0x7FFFFF0000000000) ret i64 %res @@ -579,7 +567,7 @@ define i64 @test_nan_f2ull_rn() { define i64 @test_nan_f2ull_rp() { ; CHECK-LABEL: define i64 @test_nan_f2ull_rp() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ull.rp(float 0x7FFFFF0000000000) ret i64 %res @@ -587,7 +575,7 @@ define i64 @test_nan_f2ull_rp() { define i64 @test_nan_f2ull_rz() { ; CHECK-LABEL: define i64 @test_nan_f2ull_rz() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ull.rz(float 0x7FFFFF0000000000) ret i64 %res @@ -598,7 +586,7 @@ define i64 @test_nan_f2ull_rz() { ;+-------------------------------------------------------------+ define i64 @test_nan_f2ull_rm_ftz() { ; CHECK-LABEL: define i64 @test_nan_f2ull_rm_ftz() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ull.rm.ftz(float 0x7FFFFF0000000000) ret 
i64 %res @@ -606,7 +594,7 @@ define i64 @test_nan_f2ull_rm_ftz() { define i64 @test_nan_f2ull_rn_ftz() { ; CHECK-LABEL: define i64 @test_nan_f2ull_rn_ftz() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ull.rn.ftz(float 0x7FFFFF0000000000) ret i64 %res @@ -614,7 +602,7 @@ define i64 @test_nan_f2ull_rn_ftz() { define i64 @test_nan_f2ull_rp_ftz() { ; CHECK-LABEL: define i64 @test_nan_f2ull_rp_ftz() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ull.rp.ftz(float 0x7FFFFF0000000000) ret i64 %res @@ -622,7 +610,7 @@ define i64 @test_nan_f2ull_rp_ftz() { define i64 @test_nan_f2ull_rz_ftz() { ; CHECK-LABEL: define i64 @test_nan_f2ull_rz_ftz() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.f2ull.rz.ftz(float 0x7FFFFF0000000000) ret i64 %res @@ -632,7 +620,7 @@ define i64 @test_nan_f2ull_rz_ftz() { ;+-------------------------------------------------------------+ define i64 @test_nan_d2ull_rm() { ; CHECK-LABEL: define i64 @test_nan_d2ull_rm() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.d2ull.rm(double 0xFFF8000000000000) ret i64 %res @@ -640,7 +628,7 @@ define i64 @test_nan_d2ull_rm() { define i64 @test_nan_d2ull_rn() { ; CHECK-LABEL: define i64 @test_nan_d2ull_rn() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.d2ull.rn(double 0xFFF8000000000000) ret i64 %res @@ -649,7 +637,7 @@ define i64 @test_nan_d2ull_rn() { define i64 @test_nan_d2ull_rp() { ; CHECK-LABEL: define i64 @test_nan_d2ull_rp() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.d2ull.rp(double 0xFFF8000000000000) ret i64 %res @@ -657,7 +645,7 @@ define i64 @test_nan_d2ull_rp() { define i64 @test_nan_d2ull_rz() { ; CHECK-LABEL: define i64 @test_nan_d2ull_rz() { -; CHECK-NEXT: ret i64 0 +; CHECK-NEXT: ret i64 -9223372036854775808 ; %res = call i64 @llvm.nvvm.d2ull.rz(double 0xFFF8000000000000) ret i64 %res @@ -994,8 +982,7 @@ define i64 @test_neg_subnormal_d2ll_rz() { ;+-------------------------------------------------------------+ define i64 @test_neg_subnormal_f2ull_rm() { ; CHECK-LABEL: define i64 @test_neg_subnormal_f2ull_rm() { -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.f2ull.rm(float 0xB80FFFFFC0000000) -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 0 ; %res = call i64 @llvm.nvvm.f2ull.rm(float 0xB80FFFFFC0000000) ret i64 %res @@ -1065,8 +1052,7 @@ define i64 @test_neg_subnormal_f2ull_rz_ftz() { ;+-------------------------------------------------------------+ define i64 @test_neg_subnormal_d2ull_rm() { ; CHECK-LABEL: define i64 @test_neg_subnormal_d2ull_rm() { -; CHECK-NEXT: [[RES:%.*]] = call i64 @llvm.nvvm.d2ull.rm(double 0x800FFFFFFFFFFFFF) -; CHECK-NEXT: ret i64 [[RES]] +; CHECK-NEXT: ret i64 0 ; %res = call i64 @llvm.nvvm.d2ull.rm(double 0x800fffffffffffff) ret i64 %res diff --git a/llvm/test/Transforms/InstSimplify/fminmax-folds.ll b/llvm/test/Transforms/InstSimplify/fminmax-folds.ll index fff6cfd8a3b4b..26b51146057e9 100644 --- a/llvm/test/Transforms/InstSimplify/fminmax-folds.ll +++ b/llvm/test/Transforms/InstSimplify/fminmax-folds.ll @@ -1,1388 +1,854 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instsimplify -S | FileCheck %s -declare half @llvm.minimum.f16(half, half) -declare half @llvm.maximum.f16(half, half) - -declare float 
@llvm.minnum.f32(float, float) -declare float @llvm.maxnum.f32(float, float) -declare float @llvm.minimum.f32(float, float) -declare float @llvm.maximum.f32(float, float) -declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) -declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) -declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>) -declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>) - -declare double @llvm.minnum.f64(double, double) -declare double @llvm.maxnum.f64(double, double) -declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) -declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) -declare double @llvm.minimum.f64(double, double) -declare double @llvm.maximum.f64(double, double) -declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>) -declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>) - -define float @test_minnum_const_nan(float %x) { -; CHECK-LABEL: @test_minnum_const_nan( -; CHECK-NEXT: ret float [[X:%.*]] -; - %r = call float @llvm.minnum.f32(float %x, float 0x7fff000000000000) - ret float %r -} - -define float @test_maxnum_const_nan(float %x) { -; CHECK-LABEL: @test_maxnum_const_nan( -; CHECK-NEXT: ret float [[X:%.*]] -; - %r = call float @llvm.maxnum.f32(float %x, float 0x7fff000000000000) - ret float %r -} - -define float @test_maximum_const_nan(float %x) { -; CHECK-LABEL: @test_maximum_const_nan( -; CHECK-NEXT: ret float 0x7FFF000000000000 -; - %r = call float @llvm.maximum.f32(float %x, float 0x7fff000000000000) - ret float %r -} - -define float @test_minimum_const_nan(float %x) { -; CHECK-LABEL: @test_minimum_const_nan( -; CHECK-NEXT: ret float 0x7FFF000000000000 -; - %r = call float @llvm.minimum.f32(float %x, float 0x7fff000000000000) - ret float %r -} - -define float @test_minnum_const_inf(float %x) { -; CHECK-LABEL: @test_minnum_const_inf( -; CHECK-NEXT: [[R:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float 0x7FF0000000000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call float @llvm.minnum.f32(float %x, float 0x7ff0000000000000) - ret float %r -} - -define float @test_maxnum_const_inf(float %x) { -; CHECK-LABEL: @test_maxnum_const_inf( -; CHECK-NEXT: ret float 0x7FF0000000000000 -; - %r = call float @llvm.maxnum.f32(float %x, float 0x7ff0000000000000) - ret float %r -} - -define float @test_maximum_const_inf(float %x) { -; CHECK-LABEL: @test_maximum_const_inf( -; CHECK-NEXT: [[R:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float 0x7FF0000000000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call float @llvm.maximum.f32(float %x, float 0x7ff0000000000000) - ret float %r -} - -define float @test_minimum_const_inf(float %x) { -; CHECK-LABEL: @test_minimum_const_inf( -; CHECK-NEXT: ret float [[X:%.*]] -; - %r = call float @llvm.minimum.f32(float %x, float 0x7ff0000000000000) - ret float %r -} - -define float @test_minnum_const_neg_inf(float %x) { -; CHECK-LABEL: @test_minnum_const_neg_inf( -; CHECK-NEXT: ret float 0xFFF0000000000000 -; - %r = call float @llvm.minnum.f32(float %x, float 0xfff0000000000000) - ret float %r -} - -define float @test_maxnum_const_neg_inf(float %x) { -; CHECK-LABEL: @test_maxnum_const_neg_inf( -; CHECK-NEXT: [[R:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float 0xFFF0000000000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call float @llvm.maxnum.f32(float %x, float 0xfff0000000000000) - ret float %r -} - -define float @test_maximum_const_neg_inf(float %x) { -; CHECK-LABEL: @test_maximum_const_neg_inf( -; 
CHECK-NEXT: ret float [[X:%.*]] -; - %r = call float @llvm.maximum.f32(float %x, float 0xfff0000000000000) - ret float %r -} - -define float @test_minimum_const_neg_inf(float %x) { -; CHECK-LABEL: @test_minimum_const_neg_inf( -; CHECK-NEXT: [[R:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float 0xFFF0000000000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call float @llvm.minimum.f32(float %x, float 0xfff0000000000000) - ret float %r -} - -define float @test_minnum_const_inf_nnan(float %x) { -; CHECK-LABEL: @test_minnum_const_inf_nnan( -; CHECK-NEXT: ret float [[X:%.*]] -; - %r = call nnan float @llvm.minnum.f32(float %x, float 0x7ff0000000000000) - ret float %r -} - -define float @test_maxnum_const_inf_nnan(float %x) { -; CHECK-LABEL: @test_maxnum_const_inf_nnan( -; CHECK-NEXT: ret float 0x7FF0000000000000 -; - %r = call nnan float @llvm.maxnum.f32(float %x, float 0x7ff0000000000000) - ret float %r -} - -define float @test_maximum_const_inf_nnan(float %x) { -; CHECK-LABEL: @test_maximum_const_inf_nnan( -; CHECK-NEXT: ret float 0x7FF0000000000000 -; - %r = call nnan float @llvm.maximum.f32(float %x, float 0x7ff0000000000000) - ret float %r -} - -define float @test_minimum_const_inf_nnan(float %x) { -; CHECK-LABEL: @test_minimum_const_inf_nnan( -; CHECK-NEXT: ret float [[X:%.*]] -; - %r = call nnan float @llvm.minimum.f32(float %x, float 0x7ff0000000000000) - ret float %r -} - -define float @test_minnum_const_inf_nnan_comm(float %x) { -; CHECK-LABEL: @test_minnum_const_inf_nnan_comm( -; CHECK-NEXT: ret float [[X:%.*]] -; - %r = call nnan float @llvm.minnum.f32(float 0x7ff0000000000000, float %x) - ret float %r -} - -define float @test_maxnum_const_inf_nnan_comm(float %x) { -; CHECK-LABEL: @test_maxnum_const_inf_nnan_comm( -; CHECK-NEXT: ret float 0x7FF0000000000000 -; - %r = call nnan float @llvm.maxnum.f32(float 0x7ff0000000000000, float %x) - ret float %r -} - -define float @test_maximum_const_inf_nnan_comm(float %x) { -; CHECK-LABEL: @test_maximum_const_inf_nnan_comm( -; CHECK-NEXT: ret float 0x7FF0000000000000 -; - %r = call nnan float @llvm.maximum.f32(float 0x7ff0000000000000, float %x) - ret float %r -} - -define float @test_minimum_const_inf_nnan_comm(float %x) { -; CHECK-LABEL: @test_minimum_const_inf_nnan_comm( -; CHECK-NEXT: ret float [[X:%.*]] -; - %r = call nnan float @llvm.minimum.f32(float 0x7ff0000000000000, float %x) - ret float %r -} - -define <2 x float> @test_minnum_const_inf_nnan_comm_vec(<2 x float> %x) { -; CHECK-LABEL: @test_minnum_const_inf_nnan_comm_vec( -; CHECK-NEXT: ret <2 x float> [[X:%.*]] -; - %r = call nnan <2 x float> @llvm.minnum.v2f32(<2 x float> , <2 x float> %x) - ret <2 x float> %r -} - -define <2 x float> @test_maxnum_const_inf_nnan_comm_vec(<2 x float> %x) { -; CHECK-LABEL: @test_maxnum_const_inf_nnan_comm_vec( -; CHECK-NEXT: ret <2 x float> splat (float 0x7FF0000000000000) -; - %r = call nnan <2 x float> @llvm.maxnum.v2f32(<2 x float> , <2 x float> %x) - ret <2 x float> %r -} - -define <2 x float> @test_maximum_const_inf_nnan_comm_vec(<2 x float> %x) { -; CHECK-LABEL: @test_maximum_const_inf_nnan_comm_vec( -; CHECK-NEXT: ret <2 x float> splat (float 0x7FF0000000000000) -; - %r = call nnan <2 x float> @llvm.maximum.v2f32(<2 x float> , <2 x float> %x) - ret <2 x float> %r -} - -define <2 x float> @test_minimum_const_inf_nnan_comm_vec(<2 x float> %x) { -; CHECK-LABEL: @test_minimum_const_inf_nnan_comm_vec( -; CHECK-NEXT: ret <2 x float> [[X:%.*]] -; - %r = call nnan <2 x float> @llvm.minimum.v2f32(<2 x float> , <2 x float> %x) - ret <2 x float> 
%r -} - -define float @test_minnum_const_neg_inf_nnan(float %x) { -; CHECK-LABEL: @test_minnum_const_neg_inf_nnan( -; CHECK-NEXT: ret float 0xFFF0000000000000 -; - %r = call nnan float @llvm.minnum.f32(float %x, float 0xfff0000000000000) - ret float %r -} - -define float @test_maxnum_const_neg_inf_nnan(float %x) { -; CHECK-LABEL: @test_maxnum_const_neg_inf_nnan( -; CHECK-NEXT: ret float [[X:%.*]] -; - %r = call nnan float @llvm.maxnum.f32(float %x, float 0xfff0000000000000) - ret float %r -} - -define float @test_maximum_const_neg_inf_nnan(float %x) { -; CHECK-LABEL: @test_maximum_const_neg_inf_nnan( -; CHECK-NEXT: ret float [[X:%.*]] -; - %r = call nnan float @llvm.maximum.f32(float %x, float 0xfff0000000000000) - ret float %r -} - -define float @test_minimum_const_neg_inf_nnan(float %x) { -; CHECK-LABEL: @test_minimum_const_neg_inf_nnan( -; CHECK-NEXT: ret float 0xFFF0000000000000 -; - %r = call nnan float @llvm.minimum.f32(float %x, float 0xfff0000000000000) - ret float %r -} - -define float @test_minnum_const_max(float %x) { -; CHECK-LABEL: @test_minnum_const_max( -; CHECK-NEXT: [[R:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float 0x47EFFFFFE0000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call float @llvm.minnum.f32(float %x, float 0x47efffffe0000000) - ret float %r -} - -define float @test_maxnum_const_max(float %x) { -; CHECK-LABEL: @test_maxnum_const_max( -; CHECK-NEXT: [[R:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float 0x47EFFFFFE0000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call float @llvm.maxnum.f32(float %x, float 0x47efffffe0000000) - ret float %r -} - -define float @test_maximum_const_max(float %x) { -; CHECK-LABEL: @test_maximum_const_max( -; CHECK-NEXT: [[R:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float 0x47EFFFFFE0000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call float @llvm.maximum.f32(float %x, float 0x47efffffe0000000) - ret float %r -} - -define float @test_minimum_const_max(float %x) { -; CHECK-LABEL: @test_minimum_const_max( -; CHECK-NEXT: [[R:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float 0x47EFFFFFE0000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call float @llvm.minimum.f32(float %x, float 0x47efffffe0000000) - ret float %r -} - -define float @test_minnum_const_neg_max(float %x) { -; CHECK-LABEL: @test_minnum_const_neg_max( -; CHECK-NEXT: [[R:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float 0xC7EFFFFFE0000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call float @llvm.minnum.f32(float %x, float 0xc7efffffe0000000) - ret float %r -} - -define float @test_maxnum_const_neg_max(float %x) { -; CHECK-LABEL: @test_maxnum_const_neg_max( -; CHECK-NEXT: [[R:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float 0xC7EFFFFFE0000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call float @llvm.maxnum.f32(float %x, float 0xc7efffffe0000000) - ret float %r -} - -define float @test_maximum_const_neg_max(float %x) { -; CHECK-LABEL: @test_maximum_const_neg_max( -; CHECK-NEXT: [[R:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float 0xC7EFFFFFE0000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call float @llvm.maximum.f32(float %x, float 0xc7efffffe0000000) - ret float %r -} - -define float @test_minimum_const_neg_max(float %x) { -; CHECK-LABEL: @test_minimum_const_neg_max( -; CHECK-NEXT: [[R:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float 0xC7EFFFFFE0000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call float @llvm.minimum.f32(float %x, float 0xc7efffffe0000000) - ret float %r -} - -define float 
@test_minnum_const_max_ninf(float %x) { -; CHECK-LABEL: @test_minnum_const_max_ninf( -; CHECK-NEXT: [[R:%.*]] = call ninf float @llvm.minnum.f32(float [[X:%.*]], float 0x47EFFFFFE0000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call ninf float @llvm.minnum.f32(float %x, float 0x47efffffe0000000) - ret float %r -} - -define float @test_maxnum_const_max_ninf(float %x) { -; CHECK-LABEL: @test_maxnum_const_max_ninf( -; CHECK-NEXT: ret float 0x47EFFFFFE0000000 -; - %r = call ninf float @llvm.maxnum.f32(float %x, float 0x47efffffe0000000) - ret float %r -} - -define float @test_maximum_const_max_ninf(float %x) { -; CHECK-LABEL: @test_maximum_const_max_ninf( -; CHECK-NEXT: [[R:%.*]] = call ninf float @llvm.maximum.f32(float [[X:%.*]], float 0x47EFFFFFE0000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call ninf float @llvm.maximum.f32(float %x, float 0x47efffffe0000000) - ret float %r -} - -define float @test_minimum_const_max_ninf(float %x) { -; CHECK-LABEL: @test_minimum_const_max_ninf( -; CHECK-NEXT: ret float [[X:%.*]] -; - %r = call ninf float @llvm.minimum.f32(float %x, float 0x47efffffe0000000) - ret float %r -} - -define float @test_minnum_const_neg_max_ninf(float %x) { -; CHECK-LABEL: @test_minnum_const_neg_max_ninf( -; CHECK-NEXT: ret float 0xC7EFFFFFE0000000 -; - %r = call ninf float @llvm.minnum.f32(float %x, float 0xc7efffffe0000000) - ret float %r -} - -define float @test_maxnum_const_neg_max_ninf(float %x) { -; CHECK-LABEL: @test_maxnum_const_neg_max_ninf( -; CHECK-NEXT: [[R:%.*]] = call ninf float @llvm.maxnum.f32(float [[X:%.*]], float 0xC7EFFFFFE0000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call ninf float @llvm.maxnum.f32(float %x, float 0xc7efffffe0000000) - ret float %r -} - -define float @test_maximum_const_neg_max_ninf(float %x) { -; CHECK-LABEL: @test_maximum_const_neg_max_ninf( -; CHECK-NEXT: ret float [[X:%.*]] -; - %r = call ninf float @llvm.maximum.f32(float %x, float 0xc7efffffe0000000) - ret float %r -} - -define float @test_minimum_const_neg_max_ninf(float %x) { -; CHECK-LABEL: @test_minimum_const_neg_max_ninf( -; CHECK-NEXT: [[R:%.*]] = call ninf float @llvm.minimum.f32(float [[X:%.*]], float 0xC7EFFFFFE0000000) -; CHECK-NEXT: ret float [[R]] -; - %r = call ninf float @llvm.minimum.f32(float %x, float 0xc7efffffe0000000) - ret float %r -} - -define float @test_minnum_const_max_nnan_ninf(float %x) { -; CHECK-LABEL: @test_minnum_const_max_nnan_ninf( -; CHECK-NEXT: ret float [[X:%.*]] -; - %r = call nnan ninf float @llvm.minnum.f32(float %x, float 0x47efffffe0000000) - ret float %r -} - -define float @test_maxnum_const_max_nnan_ninf(float %x) { -; CHECK-LABEL: @test_maxnum_const_max_nnan_ninf( -; CHECK-NEXT: ret float 0x47EFFFFFE0000000 -; - %r = call nnan ninf float @llvm.maxnum.f32(float %x, float 0x47efffffe0000000) - ret float %r -} - -define float @test_maximum_const_max_nnan_ninf(float %x) { -; CHECK-LABEL: @test_maximum_const_max_nnan_ninf( -; CHECK-NEXT: ret float 0x47EFFFFFE0000000 -; - %r = call nnan ninf float @llvm.maximum.f32(float %x, float 0x47efffffe0000000) - ret float %r -} - -define float @test_minimum_const_max_nnan_ninf(float %x) { -; CHECK-LABEL: @test_minimum_const_max_nnan_ninf( -; CHECK-NEXT: ret float [[X:%.*]] -; - %r = call nnan ninf float @llvm.minimum.f32(float %x, float 0x47efffffe0000000) - ret float %r -} - -define float @test_minnum_const_neg_max_nnan_ninf(float %x) { -; CHECK-LABEL: @test_minnum_const_neg_max_nnan_ninf( -; CHECK-NEXT: ret float 0xC7EFFFFFE0000000 -; - %r = call nnan ninf float @llvm.minnum.f32(float %x, float 
0xc7efffffe0000000) - ret float %r -} - -define float @test_maxnum_const_neg_max_nnan_ninf(float %x) { -; CHECK-LABEL: @test_maxnum_const_neg_max_nnan_ninf( -; CHECK-NEXT: ret float [[X:%.*]] -; - %r = call nnan ninf float @llvm.maxnum.f32(float %x, float 0xc7efffffe0000000) - ret float %r -} - -define float @test_maximum_const_neg_max_nnan_ninf(float %x) { -; CHECK-LABEL: @test_maximum_const_neg_max_nnan_ninf( -; CHECK-NEXT: ret float [[X:%.*]] -; - %r = call nnan ninf float @llvm.maximum.f32(float %x, float 0xc7efffffe0000000) - ret float %r -} - -define float @test_minimum_const_neg_max_nnan_ninf(float %x) { -; CHECK-LABEL: @test_minimum_const_neg_max_nnan_ninf( -; CHECK-NEXT: ret float 0xC7EFFFFFE0000000 -; - %r = call nnan ninf float @llvm.minimum.f32(float %x, float 0xc7efffffe0000000) - ret float %r -} - -; From the LangRef for minnum/maxnum: -; "If either operand is a NaN, returns the other non-NaN operand." - -define double @maxnum_nan_op0(double %x) { -; CHECK-LABEL: @maxnum_nan_op0( -; CHECK-NEXT: ret double [[X:%.*]] -; - %r = call double @llvm.maxnum.f64(double 0x7ff8000000000000, double %x) - ret double %r -} - -define double @maxnum_nan_op1(double %x) { -; CHECK-LABEL: @maxnum_nan_op1( -; CHECK-NEXT: ret double [[X:%.*]] -; - %r = call double @llvm.maxnum.f64(double %x, double 0x7ff800000000dead) - ret double %r -} - -define double @minnum_nan_op0(double %x) { -; CHECK-LABEL: @minnum_nan_op0( -; CHECK-NEXT: ret double [[X:%.*]] -; - %r = call double @llvm.minnum.f64(double 0x7ff8000dead00000, double %x) - ret double %r -} - -define double @minnum_nan_op1(double %x) { -; CHECK-LABEL: @minnum_nan_op1( -; CHECK-NEXT: ret double [[X:%.*]] -; - %r = call double @llvm.minnum.f64(double %x, double 0x7ff800dead00dead) - ret double %r -} - -define <2 x double> @maxnum_nan_op0_vec(<2 x double> %x) { -; CHECK-LABEL: @maxnum_nan_op0_vec( -; CHECK-NEXT: ret <2 x double> [[X:%.*]] -; - %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> , <2 x double> %x) - ret <2 x double> %r -} - -define <2 x double> @maxnum_nan_op1_vec(<2 x double> %x) { -; CHECK-LABEL: @maxnum_nan_op1_vec( -; CHECK-NEXT: ret <2 x double> [[X:%.*]] -; - %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double> ) - ret <2 x double> %r -} - -define <2 x double> @minnum_nan_op0_vec(<2 x double> %x) { -; CHECK-LABEL: @minnum_nan_op0_vec( -; CHECK-NEXT: ret <2 x double> [[X:%.*]] -; - %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> , <2 x double> %x) - ret <2 x double> %r -} - -define <2 x double> @minnum_nan_op1_vec(<2 x double> %x) { -; CHECK-LABEL: @minnum_nan_op1_vec( -; CHECK-NEXT: ret <2 x double> [[X:%.*]] -; - %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> ) - ret <2 x double> %r -} - -define float @maxnum_undef_op1(float %x) { -; CHECK-LABEL: @maxnum_undef_op1( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.maxnum.f32(float %x, float undef) - ret float %val -} - -define float @maxnum_poison_op1(float %x) { -; CHECK-LABEL: @maxnum_poison_op1( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.maxnum.f32(float %x, float poison) - ret float %val -} - -define float @maxnum_undef_op0(float %x) { -; CHECK-LABEL: @maxnum_undef_op0( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.maxnum.f32(float undef, float %x) - ret float %val -} - -define float @maxnum_poison_op0(float %x) { -; CHECK-LABEL: @maxnum_poison_op0( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.maxnum.f32(float poison, float %x) - ret float %val 
-} - -define float @minnum_undef_op1(float %x) { -; CHECK-LABEL: @minnum_undef_op1( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.minnum.f32(float %x, float undef) - ret float %val -} - -define float @minnum_poison_op1(float %x) { -; CHECK-LABEL: @minnum_poison_op1( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.minnum.f32(float %x, float poison) - ret float %val -} - -define float @minnum_undef_op0(float %x) { -; CHECK-LABEL: @minnum_undef_op0( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.minnum.f32(float undef, float %x) - ret float %val -} - -define float @minnum_poison_op0(float %x) { -; CHECK-LABEL: @minnum_poison_op0( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.minnum.f32(float poison, float %x) - ret float %val -} - -define float @minnum_undef_undef(float %x) { -; CHECK-LABEL: @minnum_undef_undef( -; CHECK-NEXT: ret float undef -; - %val = call float @llvm.minnum.f32(float undef, float undef) - ret float %val -} - -define float @minnum_poison_undef(float %x) { -; CHECK-LABEL: @minnum_poison_undef( -; CHECK-NEXT: ret float undef -; - %val = call float @llvm.minnum.f32(float poison, float undef) - ret float %val -} - -define float @minnum_undef_poison(float %x) { -; CHECK-LABEL: @minnum_undef_poison( -; CHECK-NEXT: ret float poison -; - %val = call float @llvm.minnum.f32(float undef, float poison) - ret float %val -} - -define float @maxnum_undef_undef(float %x) { -; CHECK-LABEL: @maxnum_undef_undef( -; CHECK-NEXT: ret float undef -; - %val = call float @llvm.maxnum.f32(float undef, float undef) - ret float %val -} - -define float @maxnum_poison_undef(float %x) { -; CHECK-LABEL: @maxnum_poison_undef( -; CHECK-NEXT: ret float undef -; - %val = call float @llvm.maxnum.f32(float poison, float undef) - ret float %val -} - -define float @maxnum_undef_poison(float %x) { -; CHECK-LABEL: @maxnum_undef_poison( -; CHECK-NEXT: ret float poison -; - %val = call float @llvm.maxnum.f32(float undef, float poison) - ret float %val -} - -define float @minnum_same_args(float %x) { -; CHECK-LABEL: @minnum_same_args( -; CHECK-NEXT: ret float [[X:%.*]] -; - %y = call float @llvm.minnum.f32(float %x, float %x) - ret float %y -} - -define float @maxnum_same_args(float %x) { -; CHECK-LABEL: @maxnum_same_args( -; CHECK-NEXT: ret float [[X:%.*]] -; - %y = call float @llvm.maxnum.f32(float %x, float %x) - ret float %y -} - -define float @minnum_x_minnum_x_y(float %x, float %y) { -; CHECK-LABEL: @minnum_x_minnum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minnum.f32(float %x, float %y) - %b = call float @llvm.minnum.f32(float %x, float %a) - ret float %b -} - -define float @minnum_y_minnum_x_y(float %x, float %y) { -; CHECK-LABEL: @minnum_y_minnum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minnum.f32(float %x, float %y) - %b = call float @llvm.minnum.f32(float %y, float %a) - ret float %b -} - -define float @minnum_x_y_minnum_x(float %x, float %y) { -; CHECK-LABEL: @minnum_x_y_minnum_x( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minnum.f32(float %x, float %y) - %b = call float @llvm.minnum.f32(float %a, float %x) - ret float %b -} - -define float @minnum_x_y_minnum_y(float %x, float %y) { -; CHECK-LABEL: 
@minnum_x_y_minnum_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minnum.f32(float %x, float %y) - %b = call float @llvm.minnum.f32(float %a, float %y) - ret float %b -} - -; negative test - -define float @minnum_z_minnum_x_y(float %x, float %y, float %z) { -; CHECK-LABEL: @minnum_z_minnum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[B:%.*]] = call float @llvm.minnum.f32(float [[Z:%.*]], float [[A]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.minnum.f32(float %x, float %y) - %b = call float @llvm.minnum.f32(float %z, float %a) - ret float %b -} - -; negative test - -define float @minnum_x_y_minnum_z(float %x, float %y, float %z) { -; CHECK-LABEL: @minnum_x_y_minnum_z( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[B:%.*]] = call float @llvm.minnum.f32(float [[A]], float [[Z:%.*]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.minnum.f32(float %x, float %y) - %b = call float @llvm.minnum.f32(float %a, float %z) - ret float %b -} - -; minnum(X, -INF) --> -INF - -define float @minnum_neginf(float %x) { -; CHECK-LABEL: @minnum_neginf( -; CHECK-NEXT: ret float 0xFFF0000000000000 -; - %val = call float @llvm.minnum.f32(float %x, float 0xFFF0000000000000) - ret float %val -} - -define <2 x double> @minnum_neginf_commute_vec(<2 x double> %x) { -; CHECK-LABEL: @minnum_neginf_commute_vec( -; CHECK-NEXT: ret <2 x double> splat (double 0xFFF0000000000000) -; - %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> , <2 x double> %x) - ret <2 x double> %r -} - -; negative test - -define float @minnum_inf(float %x) { -; CHECK-LABEL: @minnum_inf( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.minnum.f32(float 0x7FF0000000000000, float [[X:%.*]]) -; CHECK-NEXT: ret float [[VAL]] -; - %val = call float @llvm.minnum.f32(float 0x7FF0000000000000, float %x) - ret float %val -} -define float @maxnum_x_maxnum_x_y(float %x, float %y) { -; CHECK-LABEL: @maxnum_x_maxnum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.maxnum.f32(float %x, float %y) - %b = call float @llvm.maxnum.f32(float %x, float %a) - ret float %b -} - -define float @maxnum_y_maxnum_x_y(float %x, float %y) { -; CHECK-LABEL: @maxnum_y_maxnum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.maxnum.f32(float %x, float %y) - %b = call float @llvm.maxnum.f32(float %y, float %a) - ret float %b -} - -define float @maxnum_x_y_maxnum_x(float %x, float %y) { -; CHECK-LABEL: @maxnum_x_y_maxnum_x( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.maxnum.f32(float %x, float %y) - %b = call float @llvm.maxnum.f32(float %a, float %x) - ret float %b -} - -define float @maxnum_x_y_maxnum_y(float %x, float %y) { -; CHECK-LABEL: @maxnum_x_y_maxnum_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.maxnum.f32(float %x, float %y) - %b = call float @llvm.maxnum.f32(float %a, float %y) - ret float %b -} - -; negative test - -define float @maxnum_z_maxnum_x_y(float %x, float %y, float %z) { -; CHECK-LABEL: 
@maxnum_z_maxnum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[B:%.*]] = call float @llvm.maxnum.f32(float [[Z:%.*]], float [[A]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.maxnum.f32(float %x, float %y) - %b = call float @llvm.maxnum.f32(float %z, float %a) - ret float %b -} - -; negative test - -define float @maxnum_x_y_maxnum_z(float %x, float %y, float %z) { -; CHECK-LABEL: @maxnum_x_y_maxnum_z( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[B:%.*]] = call float @llvm.maxnum.f32(float [[A]], float [[Z:%.*]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.maxnum.f32(float %x, float %y) - %b = call float @llvm.maxnum.f32(float %a, float %z) - ret float %b -} - -; maxnum(X, INF) --> INF - -define <2 x double> @maxnum_inf(<2 x double> %x) { -; CHECK-LABEL: @maxnum_inf( -; CHECK-NEXT: ret <2 x double> splat (double 0x7FF0000000000000) -; - %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double>) - ret <2 x double> %val -} - -define float @maxnum_inf_commute(float %x) { -; CHECK-LABEL: @maxnum_inf_commute( -; CHECK-NEXT: ret float 0x7FF0000000000000 -; - %val = call float @llvm.maxnum.f32(float 0x7FF0000000000000, float %x) - ret float %val -} - -; negative test - -define float @maxnum_neginf(float %x) { -; CHECK-LABEL: @maxnum_neginf( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.maxnum.f32(float 0xFFF0000000000000, float [[X:%.*]]) -; CHECK-NEXT: ret float [[VAL]] -; - %val = call float @llvm.maxnum.f32(float 0xFFF0000000000000, float %x) - ret float %val -} - -; From the LangRef for minimum/maximum: -; "If either operand is a NaN, returns NaN." - -define double @maximum_nan_op0(double %x) { -; CHECK-LABEL: @maximum_nan_op0( -; CHECK-NEXT: ret double 0x7FF8000000000000 -; - %r = call double @llvm.maximum.f64(double 0x7ff8000000000000, double %x) - ret double %r -} - -define double @maximum_nan_op1(double %x) { -; CHECK-LABEL: @maximum_nan_op1( -; CHECK-NEXT: ret double 0x7FF800000000DEAD -; - %r = call double @llvm.maximum.f64(double %x, double 0x7ff800000000dead) - ret double %r -} - -define double @minimum_nan_op0(double %x) { -; CHECK-LABEL: @minimum_nan_op0( -; CHECK-NEXT: ret double 0x7FF8000DEAD00000 -; - %r = call double @llvm.minimum.f64(double 0x7ff8000dead00000, double %x) - ret double %r -} - -define double @minimum_nan_op1(double %x) { -; CHECK-LABEL: @minimum_nan_op1( -; CHECK-NEXT: ret double 0x7FF800DEAD00DEAD -; - %r = call double @llvm.minimum.f64(double %x, double 0x7ff800dead00dead) - ret double %r -} - -define <2 x double> @maximum_nan_op0_vec_partial_poison(<2 x double> %x) { -; CHECK-LABEL: @maximum_nan_op0_vec_partial_poison( -; CHECK-NEXT: ret <2 x double> -; - %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> , <2 x double> %x) - ret <2 x double> %r -} - -define <2 x double> @maximum_nan_op1_vec_partial_poison(<2 x double> %x) { -; CHECK-LABEL: @maximum_nan_op1_vec_partial_poison( -; CHECK-NEXT: ret <2 x double> -; - %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> ) - ret <2 x double> %r -} - -define <2 x double> @maximum_nan_op1_vec(<2 x double> %x) { -; CHECK-LABEL: @maximum_nan_op1_vec( -; CHECK-NEXT: ret <2 x double> -; - %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> ) - ret <2 x double> %r -} - -define <2 x double> @minimum_nan_op0_vec_partial_poison(<2 x double> %x) { -; CHECK-LABEL: @minimum_nan_op0_vec_partial_poison( -; CHECK-NEXT: 
ret <2 x double> -; - %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> , <2 x double> %x) - ret <2 x double> %r -} - -define <2 x double> @minimum_nan_op1_vec_partial_poison(<2 x double> %x) { -; CHECK-LABEL: @minimum_nan_op1_vec_partial_poison( -; CHECK-NEXT: ret <2 x double> -; - %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> ) - ret <2 x double> %r -} - -define <2 x double> @minimum_nan_op1_vec(<2 x double> %x) { -; CHECK-LABEL: @minimum_nan_op1_vec( -; CHECK-NEXT: ret <2 x double> splat (double 0x7FF800DEAD00DEAD) -; - %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> ) - ret <2 x double> %r -} - -define float @maximum_undef_op1(float %x) { -; CHECK-LABEL: @maximum_undef_op1( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.maximum.f32(float %x, float undef) - ret float %val -} - -define float @maximum_poison_op1(float %x) { -; CHECK-LABEL: @maximum_poison_op1( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.maximum.f32(float %x, float poison) - ret float %val -} - -define float @maximum_undef_op0(float %x) { -; CHECK-LABEL: @maximum_undef_op0( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.maximum.f32(float undef, float %x) - ret float %val -} - -define float @maximum_poison_op0(float %x) { -; CHECK-LABEL: @maximum_poison_op0( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.maximum.f32(float poison, float %x) - ret float %val -} - -define float @minimum_undef_op1(float %x) { -; CHECK-LABEL: @minimum_undef_op1( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.minimum.f32(float %x, float undef) - ret float %val -} - -define float @minimum_poison_op1(float %x) { -; CHECK-LABEL: @minimum_poison_op1( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.minimum.f32(float %x, float poison) - ret float %val -} - -define float @minimum_undef_op0(float %x) { -; CHECK-LABEL: @minimum_undef_op0( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.minimum.f32(float undef, float %x) - ret float %val -} - -define float @minimum_poison_op0(float %x) { -; CHECK-LABEL: @minimum_poison_op0( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.minimum.f32(float poison, float %x) - ret float %val -} - -define float @minimum_undef_undef(float %x) { -; CHECK-LABEL: @minimum_undef_undef( -; CHECK-NEXT: ret float undef -; - %val = call float @llvm.minimum.f32(float undef, float undef) - ret float %val -} - -define float @maximum_undef_undef(float %x) { -; CHECK-LABEL: @maximum_undef_undef( -; CHECK-NEXT: ret float undef -; - %val = call float @llvm.maximum.f32(float undef, float undef) - ret float %val -} - -define float @minimum_same_args(float %x) { -; CHECK-LABEL: @minimum_same_args( -; CHECK-NEXT: ret float [[X:%.*]] -; - %y = call float @llvm.minimum.f32(float %x, float %x) - ret float %y -} - -define float @maximum_same_args(float %x) { -; CHECK-LABEL: @maximum_same_args( -; CHECK-NEXT: ret float [[X:%.*]] -; - %y = call float @llvm.maximum.f32(float %x, float %x) - ret float %y -} - -define float @minimum_x_minimum_x_y(float %x, float %y) { -; CHECK-LABEL: @minimum_x_minimum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minimum.f32(float %x, float %y) - %b = call float @llvm.minimum.f32(float %x, float %a) - ret float %b -} - -define float @minimum_y_minimum_x_y(float %x, float %y) { -; CHECK-LABEL: @minimum_y_minimum_x_y( -; 
CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minimum.f32(float %x, float %y) - %b = call float @llvm.minimum.f32(float %y, float %a) - ret float %b -} - -define float @minimum_x_y_minimum_x(float %x, float %y) { -; CHECK-LABEL: @minimum_x_y_minimum_x( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minimum.f32(float %x, float %y) - %b = call float @llvm.minimum.f32(float %a, float %x) - ret float %b -} - -define float @minimum_x_y_minimum_y(float %x, float %y) { -; CHECK-LABEL: @minimum_x_y_minimum_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.minimum.f32(float %x, float %y) - %b = call float @llvm.minimum.f32(float %a, float %y) - ret float %b -} - -; negative test - -define float @minimum_z_minimum_x_y(float %x, float %y, float %z) { -; CHECK-LABEL: @minimum_z_minimum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[B:%.*]] = call float @llvm.minimum.f32(float [[Z:%.*]], float [[A]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.minimum.f32(float %x, float %y) - %b = call float @llvm.minimum.f32(float %z, float %a) - ret float %b -} - -; negative test - -define float @minimum_x_y_minimum_z(float %x, float %y, float %z) { -; CHECK-LABEL: @minimum_x_y_minimum_z( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[B:%.*]] = call float @llvm.minimum.f32(float [[A]], float [[Z:%.*]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.minimum.f32(float %x, float %y) - %b = call float @llvm.minimum.f32(float %a, float %z) - ret float %b -} - -define float @maximum_x_maximum_x_y(float %x, float %y) { -; CHECK-LABEL: @maximum_x_maximum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.maximum.f32(float %x, float %y) - %b = call float @llvm.maximum.f32(float %x, float %a) - ret float %b -} - -define float @maximum_y_maximum_x_y(float %x, float %y) { -; CHECK-LABEL: @maximum_y_maximum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.maximum.f32(float %x, float %y) - %b = call float @llvm.maximum.f32(float %y, float %a) - ret float %b -} - -define float @maximum_x_y_maximum_x(float %x, float %y) { -; CHECK-LABEL: @maximum_x_y_maximum_x( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.maximum.f32(float %x, float %y) - %b = call float @llvm.maximum.f32(float %a, float %x) - ret float %b -} - -define float @maximum_x_y_maximum_y(float %x, float %y) { -; CHECK-LABEL: @maximum_x_y_maximum_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[A]] -; - %a = call float @llvm.maximum.f32(float %x, float %y) - %b = call float @llvm.maximum.f32(float %a, float %y) - ret float %b -} - -; negative test - -define float @maximum_z_maximum_x_y(float %x, float %y, float %z) { -; CHECK-LABEL: @maximum_z_maximum_x_y( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) -; 
CHECK-NEXT: [[B:%.*]] = call float @llvm.maximum.f32(float [[Z:%.*]], float [[A]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.maximum.f32(float %x, float %y) - %b = call float @llvm.maximum.f32(float %z, float %a) - ret float %b -} - -; negative test - -define float @maximum_x_y_maximum_z(float %x, float %y, float %z) { -; CHECK-LABEL: @maximum_x_y_maximum_z( -; CHECK-NEXT: [[A:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[B:%.*]] = call float @llvm.maximum.f32(float [[A]], float [[Z:%.*]]) -; CHECK-NEXT: ret float [[B]] -; - %a = call float @llvm.maximum.f32(float %x, float %y) - %b = call float @llvm.maximum.f32(float %a, float %z) - ret float %b -} - -; negative test - minimum(X, -INF) != -INF because X could be NaN - -define float @minimum_neginf(float %x) { -; CHECK-LABEL: @minimum_neginf( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float 0xFFF0000000000000) -; CHECK-NEXT: ret float [[VAL]] -; - %val = call float @llvm.minimum.f32(float %x, float 0xFFF0000000000000) - ret float %val -} - -; negative test - minimum(-INF, X) != -INF because X could be NaN - -define <2 x double> @minimum_neginf_commute_vec(<2 x double> %x) { -; CHECK-LABEL: @minimum_neginf_commute_vec( -; CHECK-NEXT: [[R:%.*]] = call <2 x double> @llvm.minimum.v2f64(<2 x double> splat (double 0xFFF0000000000000), <2 x double> [[X:%.*]]) -; CHECK-NEXT: ret <2 x double> [[R]] -; - %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> , <2 x double> %x) - ret <2 x double> %r -} - -; TODO: minimum(INF, X) --> X - -define float @minimum_inf(float %x) { -; CHECK-LABEL: @minimum_inf( -; CHECK-NEXT: ret float [[X:%.*]] -; - %val = call float @llvm.minimum.f32(float 0x7FF0000000000000, float %x) - ret float %val -} - -; negative test - maximum(X, INF) != INF because X could be NaN - -define <2 x double> @maximum_inf(<2 x double> %x) { -; CHECK-LABEL: @maximum_inf( -; CHECK-NEXT: [[VAL:%.*]] = call <2 x double> @llvm.maximum.v2f64(<2 x double> [[X:%.*]], <2 x double> splat (double 0x7FF0000000000000)) -; CHECK-NEXT: ret <2 x double> [[VAL]] -; - %val = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double>) - ret <2 x double> %val -} - -; negative test - maximum(INF, X) != INF because X could be NaN - -define float @maximum_inf_commute(float %x) { -; CHECK-LABEL: @maximum_inf_commute( -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.maximum.f32(float 0x7FF0000000000000, float [[X:%.*]]) -; CHECK-NEXT: ret float [[VAL]] -; - %val = call float @llvm.maximum.f32(float 0x7FF0000000000000, float %x) - ret float %val -} - -define float @maximum_maximum_minimum(float %x, float %y) { -; CHECK-LABEL: @maximum_maximum_minimum( -; CHECK-NEXT: [[MAX:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[MAX]] -; - %max = call float @llvm.maximum.f32(float %x, float %y) - %min = call float @llvm.minimum.f32(float %x, float %y) - %val = call float @llvm.maximum.f32(float %max, float %min) - ret float %val -} - -define double @maximum_minimum_maximum(double %x, double %y) { -; CHECK-LABEL: @maximum_minimum_maximum( -; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.maximum.f64(double [[X:%.*]], double [[Y:%.*]]) -; CHECK-NEXT: ret double [[MAX]] -; - %max = call double @llvm.maximum.f64(double %x, double %y) - %min = call double @llvm.minimum.f64(double %x, double %y) - %val = call double @llvm.maximum.f64(double %min, double %max) - ret double %val -} - -define float @maximum_minimum_minimum(float %x, float 
%y) { -; CHECK-LABEL: @maximum_minimum_minimum( -; CHECK-NEXT: [[MIN1:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[MIN2:%.*]] = call float @llvm.minimum.f32(float [[X]], float [[Y]]) -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.maximum.f32(float [[MIN1]], float [[MIN2]]) -; CHECK-NEXT: ret float [[VAL]] -; - %min1 = call float @llvm.minimum.f32(float %x, float %y) - %min2 = call float @llvm.minimum.f32(float %x, float %y) - %val = call float @llvm.maximum.f32(float %min1, float %min2) - ret float %val -} - -define half @maximum_maximum_maximum(half %x, half %y) { -; CHECK-LABEL: @maximum_maximum_maximum( -; CHECK-NEXT: [[MAX1:%.*]] = call half @llvm.maximum.f16(half [[X:%.*]], half [[Y:%.*]]) -; CHECK-NEXT: ret half [[MAX1]] -; - %max1 = call half @llvm.maximum.f16(half %x, half %y) - %max2 = call half @llvm.maximum.f16(half %x, half %y) - %val = call half @llvm.maximum.f16(half %max1, half %max2) - ret half %val -} - -define <2 x float> @minimum_maximum_minimum(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @minimum_maximum_minimum( -; CHECK-NEXT: [[MIN:%.*]] = call <2 x float> @llvm.minimum.v2f32(<2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) -; CHECK-NEXT: ret <2 x float> [[MIN]] -; - %max = call <2 x float> @llvm.maximum.v2f32(<2 x float> %x, <2 x float> %y) - %min = call <2 x float> @llvm.minimum.v2f32(<2 x float> %x, <2 x float> %y) - %val = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max, <2 x float> %min) - ret <2 x float> %val -} - -define <2 x double> @minimum_minimum_maximum(<2 x double> %x, <2 x double> %y) { -; CHECK-LABEL: @minimum_minimum_maximum( -; CHECK-NEXT: [[MIN:%.*]] = call <2 x double> @llvm.minimum.v2f64(<2 x double> [[X:%.*]], <2 x double> [[Y:%.*]]) -; CHECK-NEXT: ret <2 x double> [[MIN]] -; - %max = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> %y) - %min = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> %y) - %val = call <2 x double> @llvm.minimum.v2f64(<2 x double> %min, <2 x double> %max) - ret <2 x double> %val -} - -define float @minimum_maximum_maximum(float %x, float %y) { -; CHECK-LABEL: @minimum_maximum_maximum( -; CHECK-NEXT: [[MAX1:%.*]] = call float @llvm.maximum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: [[MAX2:%.*]] = call float @llvm.maximum.f32(float [[X]], float [[Y]]) -; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.minimum.f32(float [[MAX1]], float [[MAX2]]) -; CHECK-NEXT: ret float [[VAL]] -; - %max1 = call float @llvm.maximum.f32(float %x, float %y) - %max2 = call float @llvm.maximum.f32(float %x, float %y) - %val = call float @llvm.minimum.f32(float %max1, float %max2) - ret float %val -} - -define float @minimum_minimum_minimum(float %x, float %y) { -; CHECK-LABEL: @minimum_minimum_minimum( -; CHECK-NEXT: [[MIN1:%.*]] = call float @llvm.minimum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[MIN1]] -; - %min1 = call float @llvm.minimum.f32(float %x, float %y) - %min2 = call float @llvm.minimum.f32(float %x, float %y) - %val = call float @llvm.minimum.f32(float %min1, float %min2) - ret float %val -} - -define double @maxnum_maxnum_minnum(double %x, double %y) { -; CHECK-LABEL: @maxnum_maxnum_minnum( -; CHECK-NEXT: [[MAX:%.*]] = call double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) -; CHECK-NEXT: ret double [[MAX]] -; - %max = call double @llvm.maxnum.f64(double %x, double %y) - %min = call double @llvm.minnum.f64(double %x, double %y) - %val = call double @llvm.maxnum.f64(double %max, double %min) - ret double 
%val -} - -define <2 x float> @maxnum_minnum_maxnum(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @maxnum_minnum_maxnum( -; CHECK-NEXT: [[MAX:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) -; CHECK-NEXT: ret <2 x float> [[MAX]] -; - %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> %y) - %min = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> %y) - %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %min, <2 x float> %max) - ret <2 x float> %val -} - -define <2 x double> @maxnum_minnum_minmum(<2 x double> %x, <2 x double> %y) { -; CHECK-LABEL: @maxnum_minnum_minmum( -; CHECK-NEXT: [[MIN1:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[X:%.*]], <2 x double> [[Y:%.*]]) -; CHECK-NEXT: [[MIN2:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[X]], <2 x double> [[Y]]) -; CHECK-NEXT: [[VAL:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[MIN1]], <2 x double> [[MIN2]]) -; CHECK-NEXT: ret <2 x double> [[VAL]] -; - %min1 = call <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> %y) - %min2 = call <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> %y) - %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %min1, <2 x double> %min2) - ret <2 x double> %val -} - -define float @maxnum_maxnum_maxnum(float %x, float %y) { -; CHECK-LABEL: @maxnum_maxnum_maxnum( -; CHECK-NEXT: [[MAX1:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[MAX1]] -; - %max1 = call float @llvm.maxnum.f32(float %x, float %y) - %max2 = call float @llvm.maxnum.f32(float %x, float %y) - %val = call float @llvm.maxnum.f32(float %max1, float %max2) - ret float %val -} - -define double @minnum_maxnum_minnum(double %x, double %y) { -; CHECK-LABEL: @minnum_maxnum_minnum( -; CHECK-NEXT: [[MIN:%.*]] = call double @llvm.minnum.f64(double [[X:%.*]], double [[Y:%.*]]) -; CHECK-NEXT: ret double [[MIN]] -; - %max = call double @llvm.maxnum.f64(double %x, double %y) - %min = call double @llvm.minnum.f64(double %x, double %y) - %val = call double @llvm.minnum.f64(double %max, double %min) - ret double %val -} - -define float @minnum_minnum_maxnum(float %x, float %y) { -; CHECK-LABEL: @minnum_minnum_maxnum( -; CHECK-NEXT: [[MIN:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) -; CHECK-NEXT: ret float [[MIN]] -; - %max = call float @llvm.maxnum.f32(float %x, float %y) - %min = call float @llvm.minnum.f32(float %x, float %y) - %val = call float @llvm.minnum.f32(float %min, float %max) - ret float %val -} - -define <2 x float> @minnum_maxnum_maxnum(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @minnum_maxnum_maxnum( -; CHECK-NEXT: [[MAX1:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) -; CHECK-NEXT: [[MAX2:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[X]], <2 x float> [[Y]]) -; CHECK-NEXT: [[VAL:%.*]] = call <2 x float> @llvm.minnum.v2f32(<2 x float> [[MAX1]], <2 x float> [[MAX2]]) -; CHECK-NEXT: ret <2 x float> [[VAL]] -; - %max1 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> %y) - %max2 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> %y) - %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max1, <2 x float> %max2) - ret <2 x float> %val -} - -define <2 x double> @minnum_minnum_minmum(<2 x double> %x, <2 x double> %y) { -; CHECK-LABEL: @minnum_minnum_minmum( -; CHECK-NEXT: [[MIN1:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> 
[[X:%.*]], <2 x double> [[Y:%.*]]) -; CHECK-NEXT: ret <2 x double> [[MIN1]] -; - %min1 = call <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> %y) - %min2 = call <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> %y) - %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %min1, <2 x double> %min2) - ret <2 x double> %val +;############################################################### +;# NaN Tests # +;############################################################### +; minnum(X, qnan) -> X +; maxnum(X, qnan) -> X +; TODO: minnum(X, snan) -> qnan (currently we treat SNaN the same as QNaN) +; TODO: maxnum(X, snan) -> qnan (currently we treat SNaN the same as QNaN) +; minimum(X, nan) -> qnan +; maximum(X, nan) -> qnan +; TODO: minimumnum(X, nan) -> X +; TODO: maximumnum(X, nan) -> X + +define void @minmax_qnan_f32(float %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_qnan_f32( +; CHECK-NEXT: store float [[X:%.*]], ptr [[MINNUM_RES:%.*]], align 4 +; CHECK-NEXT: store float [[X]], ptr [[MAXNUM_RES:%.*]], align 4 +; CHECK-NEXT: store float 0x7FFF000000000000, ptr [[MINIMUM_RES:%.*]], align 4 +; CHECK-NEXT: store float 0x7FFF000000000000, ptr [[MAXIMUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call float @llvm.minimumnum.f32(float [[X]], float 0x7FFF000000000000) +; CHECK-NEXT: store float [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call float @llvm.maximumnum.f32(float [[X]], float 0x7FFF000000000000) +; CHECK-NEXT: store float [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: ret void +; + %minnum = call float @llvm.minnum.f32(float %x, float 0x7FFF000000000000) + store float %minnum, ptr %minnum_res + %maxnum = call float @llvm.maxnum.f32(float %x, float 0x7FFF000000000000) + store float %maxnum, ptr %maxnum_res + + %minimum = call float @llvm.minimum.f32(float %x, float 0x7FFF000000000000) + store float %minimum, ptr %minimum_res + %maximum = call float @llvm.maximum.f32(float %x, float 0x7FFF000000000000) + store float %maximum, ptr %maximum_res + + %minimumnum = call float @llvm.minimumnum.f32(float %x, float 0x7FFF000000000000) + store float %minimumnum, ptr %minimumnum_res + %maximumnum = call float @llvm.maximumnum.f32(float %x, float 0x7FFF000000000000) + store float %maximumnum, ptr %maximumnum_res + ret void +} + +; TODO currently snan is treated the same as qnan, but maxnum/minnum should really return qnan for these cases, not X +define void @minmax_snan_f32(float %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_snan_f32( +; CHECK-NEXT: store float [[X:%.*]], ptr [[MINNUM_RES:%.*]], align 4 +; CHECK-NEXT: store float [[X]], ptr [[MAXNUM_RES:%.*]], align 4 +; CHECK-NEXT: store float 0x7FFC000000000000, ptr [[MINIMUM_RES:%.*]], align 4 +; CHECK-NEXT: store float 0x7FFC000000000000, ptr [[MAXIMUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call float @llvm.minimumnum.f32(float [[X]], float 0x7FF4000000000000) +; CHECK-NEXT: store float [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call float @llvm.maximumnum.f32(float [[X]], float 0x7FF4000000000000) +; CHECK-NEXT: store float [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: ret void +; + %minnum = call float @llvm.minnum.f32(float %x, float 0x7FF4000000000000) + store float %minnum, 
ptr %minnum_res + %maxnum = call float @llvm.maxnum.f32(float %x, float 0x7FF4000000000000) + store float %maxnum, ptr %maxnum_res + + %minimum = call float @llvm.minimum.f32(float %x, float 0x7FF4000000000000) + store float %minimum, ptr %minimum_res + %maximum = call float @llvm.maximum.f32(float %x, float 0x7FF4000000000000) + store float %maximum, ptr %maximum_res + + %minimumnum = call float @llvm.minimumnum.f32(float %x, float 0x7FF4000000000000) + store float %minimumnum, ptr %minimumnum_res + %maximumnum = call float @llvm.maximumnum.f32(float %x, float 0x7FF4000000000000) + store float %maximumnum, ptr %maximumnum_res + ret void +} + +define void @minmax_qnan_nxv2f64_op0( %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_qnan_nxv2f64_op0( +; CHECK-NEXT: store [[X:%.*]], ptr [[MINNUM_RES:%.*]], align 16 +; CHECK-NEXT: store [[X]], ptr [[MAXNUM_RES:%.*]], align 16 +; CHECK-NEXT: store splat (double 0x7FF8000DEAD00000), ptr [[MINIMUM_RES:%.*]], align 16 +; CHECK-NEXT: store splat (double 0x7FF8000DEAD00000), ptr [[MAXIMUM_RES:%.*]], align 16 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call @llvm.minimumnum.nxv2f64( splat (double 0x7FF8000DEAD00000), [[X]]) +; CHECK-NEXT: store [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 16 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call @llvm.maximumnum.nxv2f64( splat (double 0x7FF8000DEAD00000), [[X]]) +; CHECK-NEXT: store [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 16 +; CHECK-NEXT: ret void +; + %minnum = call @llvm.minnum.nxv2f64( splat (double 0x7FF8000DEAD00000), %x) + store %minnum, ptr %minnum_res + %maxnum = call @llvm.maxnum.nxv2f64( splat (double 0x7FF8000DEAD00000), %x) + store %maxnum, ptr %maxnum_res + + %minimum = call @llvm.minimum.nxv2f64( splat (double 0x7FF8000DEAD00000), %x) + store %minimum, ptr %minimum_res + %maximum = call @llvm.maximum.nxv2f64( splat (double 0x7FF8000DEAD00000), %x) + store %maximum, ptr %maximum_res + + %minimumnum = call @llvm.minimumnum.nxv2f64( splat (double 0x7FF8000DEAD00000), %x) + store %minimumnum, ptr %minimumnum_res + %maximumnum = call @llvm.maximumnum.nxv2f64( splat (double 0x7FF8000DEAD00000), %x) + store %maximumnum, ptr %maximumnum_res + ret void +} + +; TODO currently snan is treated the same as qnan, but maxnum/minnum should really return qnan for these cases, not X +define void @minmax_snan_nxv2f64_op1( %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_snan_nxv2f64_op1( +; CHECK-NEXT: store [[X:%.*]], ptr [[MINNUM_RES:%.*]], align 16 +; CHECK-NEXT: store [[X]], ptr [[MAXNUM_RES:%.*]], align 16 +; CHECK-NEXT: store splat (double 0x7FFC00DEAD00DEAD), ptr [[MINIMUM_RES:%.*]], align 16 +; CHECK-NEXT: store splat (double 0x7FFC00DEAD00DEAD), ptr [[MAXIMUM_RES:%.*]], align 16 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call @llvm.minimumnum.nxv2f64( splat (double 0x7FF400DEAD00DEAD), [[X]]) +; CHECK-NEXT: store [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 16 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call @llvm.maximumnum.nxv2f64( splat (double 0x7FF400DEAD00DEAD), [[X]]) +; CHECK-NEXT: store [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 16 +; CHECK-NEXT: ret void +; + %minnum = call @llvm.minnum.nxv2f64( splat (double 0x7FF400DEAD00DEAD), %x) + store %minnum, ptr %minnum_res + %maxnum = call @llvm.maxnum.nxv2f64( splat (double 0x7FF400DEAD00DEAD), %x) + store %maxnum, ptr %maxnum_res + + %minimum = call 
@llvm.minimum.nxv2f64( splat (double 0x7FF400DEAD00DEAD), %x) + store %minimum, ptr %minimum_res + %maximum = call @llvm.maximum.nxv2f64( splat (double 0x7FF400DEAD00DEAD), %x) + store %maximum, ptr %maximum_res + + %minimumnum = call @llvm.minimumnum.nxv2f64( splat (double 0x7FF400DEAD00DEAD), %x) + store %minimumnum, ptr %minimumnum_res + %maximumnum = call @llvm.maximumnum.nxv2f64( splat (double 0x7FF400DEAD00DEAD), %x) + store %maximumnum, ptr %maximumnum_res + ret void +} + +; TODO Currently, we treat SNaN and QNaN the same. However, for maxnum and minnum, we should not optimize this, as we should return <%x0, QNaN> instead of <%x0, %x1> +define void @minmax_mixed_snan_qnan_v2f64(<2 x double> %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_mixed_snan_qnan_v2f64( +; CHECK-NEXT: store <2 x double> [[X:%.*]], ptr [[MINNUM_RES:%.*]], align 16 +; CHECK-NEXT: store <2 x double> [[X]], ptr [[MAXNUM_RES:%.*]], align 16 +; CHECK-NEXT: store <2 x double> , ptr [[MINIMUM_RES:%.*]], align 16 +; CHECK-NEXT: store <2 x double> , ptr [[MAXIMUM_RES:%.*]], align 16 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> , <2 x double> [[X]]) +; CHECK-NEXT: store <2 x double> [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 16 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> , <2 x double> [[X]]) +; CHECK-NEXT: store <2 x double> [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 16 +; CHECK-NEXT: ret void +; + %minnum = call <2 x double> @llvm.minnum.v2f64(<2 x double> , <2 x double> %x) + store <2 x double> %minnum, ptr %minnum_res + %maxnum = call <2 x double> @llvm.maxnum.v2f64(<2 x double> , <2 x double> %x) + store <2 x double> %maxnum, ptr %maxnum_res + + %minimum = call <2 x double> @llvm.minimum.v2f64(<2 x double> , <2 x double> %x) + store <2 x double> %minimum, ptr %minimum_res + %maximum = call <2 x double> @llvm.maximum.v2f64(<2 x double> , <2 x double> %x) + store <2 x double> %maximum, ptr %maximum_res + + %minimumnum = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> , <2 x double> %x) + store <2 x double> %minimumnum, ptr %minimumnum_res + %maximumnum = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> , <2 x double> %x) + store <2 x double> %maximumnum, ptr %maximumnum_res + ret void +} + +; Test with vector variants (v2f64) with NaN and poison +; Use the poison element for flexibility to choose to return either the constant arg or the other arg X +define void @minmax_mixed_qnan_poison_v2f64(<2 x double> %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_mixed_qnan_poison_v2f64( +; CHECK-NEXT: store <2 x double> [[X:%.*]], ptr [[MINNUM_RES:%.*]], align 16 +; CHECK-NEXT: store <2 x double> [[X]], ptr [[MAXNUM_RES:%.*]], align 16 +; CHECK-NEXT: store <2 x double> , ptr [[MINIMUM_RES:%.*]], align 16 +; CHECK-NEXT: store <2 x double> , ptr [[MAXIMUM_RES:%.*]], align 16 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> , <2 x double> [[X]]) +; CHECK-NEXT: store <2 x double> [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 16 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> , <2 x double> [[X]]) +; CHECK-NEXT: store <2 x double> [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 16 +; CHECK-NEXT: ret void +; + %minnum = call <2 x double> 
@llvm.minnum.v2f64(<2 x double> , <2 x double> %x) + store <2 x double> %minnum, ptr %minnum_res + %maxnum = call <2 x double> @llvm.maxnum.v2f64(<2 x double> , <2 x double> %x) + store <2 x double> %maxnum, ptr %maxnum_res + + %minimum = call <2 x double> @llvm.minimum.v2f64(<2 x double> , <2 x double> %x) + store <2 x double> %minimum, ptr %minimum_res + %maximum = call <2 x double> @llvm.maximum.v2f64(<2 x double> , <2 x double> %x) + store <2 x double> %maximum, ptr %maximum_res + + %minimumnum = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> , <2 x double> %x) + store <2 x double> %minimumnum, ptr %minimumnum_res + %maximumnum = call <2 x double> @llvm.maximumnum.v2f64(<2 x double> , <2 x double> %x) + store <2 x double> %maximumnum, ptr %maximumnum_res + ret void +} + +;############################################################### +;# Poison Tests # +;############################################################### +define void @minmax_poison_op0_f16(half %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_poison_op0_f16( +; CHECK-NEXT: store half [[X:%.*]], ptr [[MINNUM_RES:%.*]], align 2 +; CHECK-NEXT: store half [[X]], ptr [[MAXNUM_RES:%.*]], align 2 +; CHECK-NEXT: store half [[X]], ptr [[MINIMUM_RES:%.*]], align 2 +; CHECK-NEXT: store half [[X]], ptr [[MAXIMUM_RES:%.*]], align 2 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call half @llvm.minimumnum.f16(half poison, half [[X]]) +; CHECK-NEXT: store half [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 2 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call half @llvm.maximumnum.f16(half poison, half [[X]]) +; CHECK-NEXT: store half [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 2 +; CHECK-NEXT: ret void +; + %minnum = call half @llvm.minnum.f16(half poison, half %x) + store half %minnum, ptr %minnum_res + %maxnum = call half @llvm.maxnum.f16(half poison, half %x) + store half %maxnum, ptr %maxnum_res + + %minimum = call half @llvm.minimum.f16(half poison, half %x) + store half %minimum, ptr %minimum_res + %maximum = call half @llvm.maximum.f16(half poison, half %x) + store half %maximum, ptr %maximum_res + + %minimumnum = call half @llvm.minimumnum.f16(half poison, half %x) + store half %minimumnum, ptr %minimumnum_res + %maximumnum = call half @llvm.maximumnum.f16(half poison, half %x) + store half %maximumnum, ptr %maximumnum_res + ret void +} + +define void @minmax_poison_op1_nxv2f64( %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_poison_op1_nxv2f64( +; CHECK-NEXT: store [[X:%.*]], ptr [[MINNUM_RES:%.*]], align 16 +; CHECK-NEXT: store [[X]], ptr [[MAXNUM_RES:%.*]], align 16 +; CHECK-NEXT: store [[X]], ptr [[MINIMUM_RES:%.*]], align 16 +; CHECK-NEXT: store [[X]], ptr [[MAXIMUM_RES:%.*]], align 16 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call nnan @llvm.minimumnum.nxv2f64( [[X]], poison) +; CHECK-NEXT: store [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 16 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call nnan @llvm.maximumnum.nxv2f64( [[X]], poison) +; CHECK-NEXT: store [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 16 +; CHECK-NEXT: ret void +; + %minnum = call nnan @llvm.minnum.nxv2f64( %x, poison) + store %minnum, ptr %minnum_res + %maxnum = call nnan @llvm.maxnum.nxv2f64( %x, poison) + store %maxnum, ptr %maxnum_res + + %minimum = call nnan @llvm.minimum.nxv2f64( %x, poison) + store %minimum, ptr %minimum_res + %maximum = call nnan @llvm.maximum.nxv2f64( %x, 
poison) + store %maximum, ptr %maximum_res + + %minimumnum = call nnan @llvm.minimumnum.nxv2f64( %x, poison) + store %minimumnum, ptr %minimumnum_res + %maximumnum = call nnan @llvm.maximumnum.nxv2f64( %x, poison) + store %maximumnum, ptr %maximumnum_res + ret void +} + +;############################################################### +;# Positive Infinity Tests # +;############################################################### +; maxnum(X, +inf) -> +inf (ignoring SNaN -> QNaN propagation) +; minnum(X, +inf) -> X if nnan (ignoring NaN quieting) +; maximum(X, +inf) -> +inf if nnan +; minimum(X, +inf) -> X (ignoring NaN quieting) +; TODO: maximumnum(X, +inf) -> +inf +; TODO: minimumnum(X, +inf) -> X if nnan (ignoring NaN quieting) + +; Can only optimize maxnum and minimum without the nnan flag +define void @minmax_pos_inf_f32(float %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_pos_inf_f32( +; CHECK-NEXT: [[MINNUM:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float 0x7FF0000000000000) +; CHECK-NEXT: store float [[MINNUM]], ptr [[MINNUM_RES:%.*]], align 4 +; CHECK-NEXT: store float 0x7FF0000000000000, ptr [[MAXNUM_RES:%.*]], align 4 +; CHECK-NEXT: store float [[X]], ptr [[MINIMUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXIMUM:%.*]] = call float @llvm.maximum.f32(float [[X]], float 0x7FF0000000000000) +; CHECK-NEXT: store float [[MAXIMUM]], ptr [[MAXIMUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call float @llvm.minimumnum.f32(float [[X]], float 0x7FF0000000000000) +; CHECK-NEXT: store float [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call float @llvm.maximumnum.f32(float [[X]], float 0x7FF0000000000000) +; CHECK-NEXT: store float [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: ret void +; + %minnum = call float @llvm.minnum.f32(float %x, float 0x7FF0000000000000) + store float %minnum, ptr %minnum_res + %maxnum = call float @llvm.maxnum.f32(float %x, float 0x7FF0000000000000) + store float %maxnum, ptr %maxnum_res + + %minimum = call float @llvm.minimum.f32(float %x, float 0x7FF0000000000000) + store float %minimum, ptr %minimum_res + %maximum = call float @llvm.maximum.f32(float %x, float 0x7FF0000000000000) + store float %maximum, ptr %maximum_res + + %minimumnum = call float @llvm.minimumnum.f32(float %x, float 0x7FF0000000000000) + store float %minimumnum, ptr %minimumnum_res + %maximumnum = call float @llvm.maximumnum.f32(float %x, float 0x7FF0000000000000) + store float %maximumnum, ptr %maximumnum_res + ret void +} + +; Can optimize all minmax variants if the nnan flag is set +; TODO maximumnum/minimumnum +define void @minmax_pos_inf_nnan_v2f32(<2 x float> %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_pos_inf_nnan_v2f32( +; CHECK-NEXT: store <2 x float> [[X:%.*]], ptr [[MINNUM_RES:%.*]], align 8 +; CHECK-NEXT: store <2 x float> splat (float 0x7FF0000000000000), ptr [[MAXNUM_RES:%.*]], align 8 +; CHECK-NEXT: store <2 x float> [[X]], ptr [[MINIMUM_RES:%.*]], align 8 +; CHECK-NEXT: store <2 x float> splat (float 0x7FF0000000000000), ptr [[MAXIMUM_RES:%.*]], align 8 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call nnan <2 x float> @llvm.minimumnum.v2f32(<2 x float> splat (float 0x7FF0000000000000), <2 x float> [[X]]) +; CHECK-NEXT: store <2 x float> [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 8 +; CHECK-NEXT: 
[[MAXIMUMNUM:%.*]] = call nnan <2 x float> @llvm.maximumnum.v2f32(<2 x float> splat (float 0x7FF0000000000000), <2 x float> [[X]]) +; CHECK-NEXT: store <2 x float> [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 8 +; CHECK-NEXT: ret void +; + %minnum = call nnan <2 x float> @llvm.minnum.v2f32(<2 x float> splat (float 0x7FF0000000000000), <2 x float> %x) + store <2 x float> %minnum, ptr %minnum_res + %maxnum = call nnan <2 x float> @llvm.maxnum.v2f32(<2 x float> splat (float 0x7FF0000000000000), <2 x float> %x) + store <2 x float> %maxnum, ptr %maxnum_res + + %minimum = call nnan <2 x float> @llvm.minimum.v2f32(<2 x float> splat (float 0x7FF0000000000000), <2 x float> %x) + store <2 x float> %minimum, ptr %minimum_res + %maximum = call nnan <2 x float> @llvm.maximum.v2f32(<2 x float> splat (float 0x7FF0000000000000), <2 x float> %x) + store <2 x float> %maximum, ptr %maximum_res + + %minimumnum = call nnan <2 x float> @llvm.minimumnum.v2f32(<2 x float> splat (float 0x7FF0000000000000), <2 x float> %x) + store <2 x float> %minimumnum, ptr %minimumnum_res + %maximumnum = call nnan <2 x float> @llvm.maximumnum.v2f32(<2 x float> splat (float 0x7FF0000000000000), <2 x float> %x) + store <2 x float> %maximumnum, ptr %maximumnum_res + ret void +} + +;############################################################### +;# Negative Infinity Tests # +;############################################################### +; minnum(X, -inf) -> -inf (Ignoring SNaN -> QNaN propagation) +; maxnum(X, -inf) -> X if nnan +; minimum(X, -inf) -> -inf if nnan +; maximum(X, -inf) -> X (Ignoring NaN quieting) +; TODO: minimumnum(X, -inf) -> -inf +; TODO: maximumnum(X, -inf) -> X if nnan + +; Can only optimize minnum and maximum without the nnan flag +define void @minmax_neg_inf_f32(float %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_neg_inf_f32( +; CHECK-NEXT: store float 0xFFF0000000000000, ptr [[MINNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXNUM:%.*]] = call float @llvm.maxnum.f32(float [[X:%.*]], float 0xFFF0000000000000) +; CHECK-NEXT: store float [[MAXNUM]], ptr [[MAXNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MINIMUM:%.*]] = call float @llvm.minimum.f32(float [[X]], float 0xFFF0000000000000) +; CHECK-NEXT: store float [[MINIMUM]], ptr [[MINIMUM_RES:%.*]], align 4 +; CHECK-NEXT: store float [[X]], ptr [[MAXIMUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call float @llvm.minimumnum.f32(float [[X]], float 0xFFF0000000000000) +; CHECK-NEXT: store float [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call float @llvm.maximumnum.f32(float [[X]], float 0xFFF0000000000000) +; CHECK-NEXT: store float [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: ret void +; + %minnum = call float @llvm.minnum.f32(float %x, float 0xFFF0000000000000) + store float %minnum, ptr %minnum_res + %maxnum = call float @llvm.maxnum.f32(float %x, float 0xFFF0000000000000) + store float %maxnum, ptr %maxnum_res + + %minimum = call float @llvm.minimum.f32(float %x, float 0xFFF0000000000000) + store float %minimum, ptr %minimum_res + %maximum = call float @llvm.maximum.f32(float %x, float 0xFFF0000000000000) + store float %maximum, ptr %maximum_res + + %minimumnum = call float @llvm.minimumnum.f32(float %x, float 0xFFF0000000000000) + store float %minimumnum, ptr %minimumnum_res + %maximumnum = call float @llvm.maximumnum.f32(float %x, float 0xFFF0000000000000) + store float %maximumnum, ptr 
%maximumnum_res + ret void +} + +; Can optimize all minmax variants if the nnan flag is set +; TODO maximumnum/minimumnum +define void @minmax_neg_inf_nnan_v2f64(<2 x double> %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_neg_inf_nnan_v2f64( +; CHECK-NEXT: store <2 x double> splat (double 0xFFF0000000000000), ptr [[MINNUM_RES:%.*]], align 16 +; CHECK-NEXT: store <2 x double> [[X:%.*]], ptr [[MAXNUM_RES:%.*]], align 16 +; CHECK-NEXT: store <2 x double> splat (double 0xFFF0000000000000), ptr [[MINIMUM_RES:%.*]], align 16 +; CHECK-NEXT: store <2 x double> [[X]], ptr [[MAXIMUM_RES:%.*]], align 16 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call nnan <2 x double> @llvm.minimumnum.v2f64(<2 x double> [[X]], <2 x double> splat (double 0xFFF0000000000000)) +; CHECK-NEXT: store <2 x double> [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 16 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call nnan <2 x double> @llvm.maximumnum.v2f64(<2 x double> [[X]], <2 x double> splat (double 0xFFF0000000000000)) +; CHECK-NEXT: store <2 x double> [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 16 +; CHECK-NEXT: ret void +; + %minnum = call nnan <2 x double> @llvm.minnum.v2f64(<2 x double> %x, <2 x double> splat (double 0xFFF0000000000000)) + store <2 x double> %minnum, ptr %minnum_res + %maxnum = call nnan <2 x double> @llvm.maxnum.v2f64(<2 x double> %x, <2 x double> splat (double 0xFFF0000000000000)) + store <2 x double> %maxnum, ptr %maxnum_res + + %minimum = call nnan <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> splat (double 0xFFF0000000000000)) + store <2 x double> %minimum, ptr %minimum_res + %maximum = call nnan <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> splat (double 0xFFF0000000000000)) + store <2 x double> %maximum, ptr %maximum_res + + %minimumnum = call nnan <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> splat (double 0xFFF0000000000000)) + store <2 x double> %minimumnum, ptr %minimumnum_res + %maximumnum = call nnan <2 x double> @llvm.maximumnum.v2f64(<2 x double> %x, <2 x double> splat (double 0xFFF0000000000000)) + store <2 x double> %maximumnum, ptr %maximumnum_res + ret void +} + +;############################################################### +;# Largest Positive Float Constant Tests # +;############################################################### +; maxnum(X, +largest) -> +largest if ninf (ignoring SNaN -> QNaN propagation) +; minnum(X, +largest) -> X if ninf && nnan +; maximum(X, +largest) -> +largest if ninf && nnan +; minimum(X, +largest) -> X if ninf (ignoring quieting of sNaNs) +; TODO: maximumnum(X, +largest) -> +largest if ninf && nnan +; TODO: minimumnum(X, +largest) -> X if ninf && nnan + +; None of these should be optimized away without the nnan/ninf flags +define void @minmax_largest_f32(float %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_largest_f32( +; CHECK-NEXT: [[MINNUM:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: store float [[MINNUM]], ptr [[MINNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXNUM:%.*]] = call float @llvm.maxnum.f32(float [[X]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: store float [[MAXNUM]], ptr [[MAXNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MINIMUM:%.*]] = call float @llvm.minimum.f32(float [[X]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: store float [[MINIMUM]], ptr [[MINIMUM_RES:%.*]], 
align 4 +; CHECK-NEXT: [[MAXIMUM:%.*]] = call float @llvm.maximum.f32(float [[X]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: store float [[MAXIMUM]], ptr [[MAXIMUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call float @llvm.minimumnum.f32(float [[X]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: store float [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call float @llvm.maximumnum.f32(float [[X]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: store float [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: ret void +; + %minnum = call float @llvm.minnum.f32(float %x, float 0x47EFFFFFE0000000) + store float %minnum, ptr %minnum_res + %maxnum = call float @llvm.maxnum.f32(float %x, float 0x47EFFFFFE0000000) + store float %maxnum, ptr %maxnum_res + + %minimum = call float @llvm.minimum.f32(float %x, float 0x47EFFFFFE0000000) + store float %minimum, ptr %minimum_res + %maximum = call float @llvm.maximum.f32(float %x, float 0x47EFFFFFE0000000) + store float %maximum, ptr %maximum_res + + %minimumnum = call float @llvm.minimumnum.f32(float %x, float 0x47EFFFFFE0000000) + store float %minimumnum, ptr %minimumnum_res + %maximumnum = call float @llvm.maximumnum.f32(float %x, float 0x47EFFFFFE0000000) + store float %maximumnum, ptr %maximumnum_res + ret void +} + +; We can optimize maxnum and minimum if we know ninf is set +define void @minmax_largest_f32_ninf(float %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_largest_f32_ninf( +; CHECK-NEXT: [[MINNUM:%.*]] = call ninf float @llvm.minnum.f32(float [[X:%.*]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: store float [[MINNUM]], ptr [[MINNUM_RES:%.*]], align 4 +; CHECK-NEXT: store float 0x47EFFFFFE0000000, ptr [[MAXNUM_RES:%.*]], align 4 +; CHECK-NEXT: store float [[X]], ptr [[MINIMUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXIMUM:%.*]] = call ninf float @llvm.maximum.f32(float [[X]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: store float [[MAXIMUM]], ptr [[MAXIMUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call ninf float @llvm.minimumnum.f32(float [[X]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: store float [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call ninf float @llvm.maximumnum.f32(float [[X]], float 0x47EFFFFFE0000000) +; CHECK-NEXT: store float [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: ret void +; + %minnum = call ninf float @llvm.minnum.f32(float %x, float 0x47EFFFFFE0000000) + store float %minnum, ptr %minnum_res + %maxnum = call ninf float @llvm.maxnum.f32(float %x, float 0x47EFFFFFE0000000) + store float %maxnum, ptr %maxnum_res + + %minimum = call ninf float @llvm.minimum.f32(float %x, float 0x47EFFFFFE0000000) + store float %minimum, ptr %minimum_res + %maximum = call ninf float @llvm.maximum.f32(float %x, float 0x47EFFFFFE0000000) + store float %maximum, ptr %maximum_res + + %minimumnum = call ninf float @llvm.minimumnum.f32(float %x, float 0x47EFFFFFE0000000) + store float %minimumnum, ptr %minimumnum_res + %maximumnum = call ninf float @llvm.maximumnum.f32(float %x, float 0x47EFFFFFE0000000) + store float %maximumnum, ptr %maximumnum_res + ret void +} + +; All can be optimized if both the ninf and nnan flags are set (ignoring SNaN propagation in minnum/maxnum) +; TODO maximumnum/minimumnum +define void @minmax_largest_v2f32_ninf_nnan(<2 x float> %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr 
%maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_largest_v2f32_ninf_nnan( +; CHECK-NEXT: store <2 x float> [[X:%.*]], ptr [[MINNUM_RES:%.*]], align 8 +; CHECK-NEXT: store <2 x float> splat (float 0x47EFFFFFE0000000), ptr [[MAXNUM_RES:%.*]], align 8 +; CHECK-NEXT: store <2 x float> [[X]], ptr [[MINIMUM_RES:%.*]], align 8 +; CHECK-NEXT: store <2 x float> splat (float 0x47EFFFFFE0000000), ptr [[MAXIMUM_RES:%.*]], align 8 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call nnan ninf <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[X]], <2 x float> splat (float 0x47EFFFFFE0000000)) +; CHECK-NEXT: store <2 x float> [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 8 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call nnan ninf <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[X]], <2 x float> splat (float 0x47EFFFFFE0000000)) +; CHECK-NEXT: store <2 x float> [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 8 +; CHECK-NEXT: ret void +; + %minnum = call ninf nnan <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> splat (float 0x47EFFFFFE0000000)) + store <2 x float> %minnum, ptr %minnum_res + %maxnum = call ninf nnan <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> splat (float 0x47EFFFFFE0000000)) + store <2 x float> %maxnum, ptr %maxnum_res + + %minimum = call ninf nnan <2 x float> @llvm.minimum.v2f32(<2 x float> %x, <2 x float> splat (float 0x47EFFFFFE0000000)) + store <2 x float> %minimum, ptr %minimum_res + %maximum = call ninf nnan <2 x float> @llvm.maximum.v2f32(<2 x float> %x, <2 x float> splat (float 0x47EFFFFFE0000000)) + store <2 x float> %maximum, ptr %maximum_res + + %minimumnum = call ninf nnan <2 x float> @llvm.minimumnum.v2f32(<2 x float> %x, <2 x float> splat (float 0x47EFFFFFE0000000)) + store <2 x float> %minimumnum, ptr %minimumnum_res + %maximumnum = call ninf nnan <2 x float> @llvm.maximumnum.v2f32(<2 x float> %x, <2 x float> splat (float 0x47EFFFFFE0000000)) + store <2 x float> %maximumnum, ptr %maximumnum_res + ret void +} + +;############################################################### +;# Largest Negative Float Constant Tests # +;############################################################### +; maxnum(X, -largest) -> X if ninf && nnan +; minnum(X, -largest) -> -largest if ninf (ignoring SNaN -> QNaN propagation) +; maximum(X, -largest) -> X if ninf (ignoring quieting of sNaNs) +; minimum(X, -largest) -> -largest if ninf && nnan +; TODO: maximumnum(X, -largest) -> X if ninf && nnan +; TODO: minimumnum(X, -largest) -> -largest if ninf + +; None of these should be optimized away without the nnan/ninf flags +define void @minmax_neg_largest_f32(float %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_neg_largest_f32( +; CHECK-NEXT: [[MINNUM:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: store float [[MINNUM]], ptr [[MINNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXNUM:%.*]] = call float @llvm.maxnum.f32(float [[X]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: store float [[MAXNUM]], ptr [[MAXNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MINIMUM:%.*]] = call float @llvm.minimum.f32(float [[X]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: store float [[MINIMUM]], ptr [[MINIMUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXIMUM:%.*]] = call float @llvm.maximum.f32(float [[X]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: store float [[MAXIMUM]], ptr [[MAXIMUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call float 
@llvm.minimumnum.f32(float [[X]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: store float [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call float @llvm.maximumnum.f32(float [[X]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: store float [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: ret void +; + %minnum = call float @llvm.minnum.f32(float %x, float 0xC7EFFFFFE0000000) + store float %minnum, ptr %minnum_res + %maxnum = call float @llvm.maxnum.f32(float %x, float 0xC7EFFFFFE0000000) + store float %maxnum, ptr %maxnum_res + + %minimum = call float @llvm.minimum.f32(float %x, float 0xC7EFFFFFE0000000) + store float %minimum, ptr %minimum_res + %maximum = call float @llvm.maximum.f32(float %x, float 0xC7EFFFFFE0000000) + store float %maximum, ptr %maximum_res + + %minimumnum = call float @llvm.minimumnum.f32(float %x, float 0xC7EFFFFFE0000000) + store float %minimumnum, ptr %minimumnum_res + %maximumnum = call float @llvm.maximumnum.f32(float %x, float 0xC7EFFFFFE0000000) + store float %maximumnum, ptr %maximumnum_res + ret void +} + +; We can optimize minnum and maximum if we know ninf is set +define void @minmax_neg_largest_f32_ninf(float %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_neg_largest_f32_ninf( +; CHECK-NEXT: store float 0xC7EFFFFFE0000000, ptr [[MINNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXNUM:%.*]] = call ninf float @llvm.maxnum.f32(float [[X:%.*]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: store float [[MAXNUM]], ptr [[MAXNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MINIMUM:%.*]] = call ninf float @llvm.minimum.f32(float [[X]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: store float [[MINIMUM]], ptr [[MINIMUM_RES:%.*]], align 4 +; CHECK-NEXT: store float [[X]], ptr [[MAXIMUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call ninf float @llvm.minimumnum.f32(float [[X]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: store float [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call ninf float @llvm.maximumnum.f32(float [[X]], float 0xC7EFFFFFE0000000) +; CHECK-NEXT: store float [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: ret void +; + %minnum = call ninf float @llvm.minnum.f32(float %x, float 0xC7EFFFFFE0000000) + store float %minnum, ptr %minnum_res + %maxnum = call ninf float @llvm.maxnum.f32(float %x, float 0xC7EFFFFFE0000000) + store float %maxnum, ptr %maxnum_res + + %minimum = call ninf float @llvm.minimum.f32(float %x, float 0xC7EFFFFFE0000000) + store float %minimum, ptr %minimum_res + %maximum = call ninf float @llvm.maximum.f32(float %x, float 0xC7EFFFFFE0000000) + store float %maximum, ptr %maximum_res + + %minimumnum = call ninf float @llvm.minimumnum.f32(float %x, float 0xC7EFFFFFE0000000) + store float %minimumnum, ptr %minimumnum_res + %maximumnum = call ninf float @llvm.maximumnum.f32(float %x, float 0xC7EFFFFFE0000000) + store float %maximumnum, ptr %maximumnum_res + ret void +} + +; All can be optimized if both the ninf and nnan flags are set (ignoring SNaN propagation in minnum/maxnum) +; TODO maximumnum/minimumnum +define void @minmax_neg_largest_nxv2f32_nnan_ninf( %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_neg_largest_nxv2f32_nnan_ninf( +; CHECK-NEXT: store splat (float 0xC7EFFFFFE0000000), ptr [[MINNUM_RES:%.*]], align 8 +; CHECK-NEXT: store [[X:%.*]], ptr 
[[MAXNUM_RES:%.*]], align 8 +; CHECK-NEXT: store splat (float 0xC7EFFFFFE0000000), ptr [[MINIMUM_RES:%.*]], align 8 +; CHECK-NEXT: store [[X]], ptr [[MAXIMUM_RES:%.*]], align 8 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call nnan ninf @llvm.minimumnum.nxv2f32( [[X]], splat (float 0xC7EFFFFFE0000000)) +; CHECK-NEXT: store [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 8 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call nnan ninf @llvm.maximumnum.nxv2f32( [[X]], splat (float 0xC7EFFFFFE0000000)) +; CHECK-NEXT: store [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 8 +; CHECK-NEXT: ret void +; + %minnum = call nnan ninf @llvm.minnum.nxv2f32( %x, splat (float 0xC7EFFFFFE0000000)) + store %minnum, ptr %minnum_res + %maxnum = call nnan ninf @llvm.maxnum.nxv2f32( %x, splat (float 0xC7EFFFFFE0000000)) + store %maxnum, ptr %maxnum_res + + %minimum = call nnan ninf @llvm.minimum.nxv2f32( %x, splat (float 0xC7EFFFFFE0000000)) + store %minimum, ptr %minimum_res + %maximum = call nnan ninf @llvm.maximum.nxv2f32( %x, splat (float 0xC7EFFFFFE0000000)) + store %maximum, ptr %maximum_res + + %minimumnum = call nnan ninf @llvm.minimumnum.nxv2f32( %x, splat (float 0xC7EFFFFFE0000000)) + store %minimumnum, ptr %minimumnum_res + %maximumnum = call nnan ninf @llvm.maximumnum.nxv2f32( %x, splat (float 0xC7EFFFFFE0000000)) + store %maximumnum, ptr %maximumnum_res + ret void +} + +;############################################################### +;# Min(x, x) / Max(x, x) # +;############################################################### +; min(x, x) -> x and max(x, x) -> x for all variants (ignoring SNaN quieting) +define void @minmax_same_args(float %x, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_same_args( +; CHECK-NEXT: store float [[X:%.*]], ptr [[MINNUM_RES:%.*]], align 4 +; CHECK-NEXT: store float [[X]], ptr [[MAXNUM_RES:%.*]], align 4 +; CHECK-NEXT: store float [[X]], ptr [[MINIMUM_RES:%.*]], align 4 +; CHECK-NEXT: store float [[X]], ptr [[MAXIMUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MINIMUMNUM:%.*]] = call float @llvm.minimumnum.f32(float [[X]], float [[X]]) +; CHECK-NEXT: store float [[MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXIMUMNUM:%.*]] = call float @llvm.maximumnum.f32(float [[X]], float [[X]]) +; CHECK-NEXT: store float [[MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: ret void +; + %minnum = call float @llvm.minnum.f32(float %x, float %x) + store float %minnum, ptr %minnum_res + %maxnum = call float @llvm.maxnum.f32(float %x, float %x) + store float %maxnum, ptr %maxnum_res + + %minimum = call float @llvm.minimum.f32(float %x, float %x) + store float %minimum, ptr %minimum_res + %maximum = call float @llvm.maximum.f32(float %x, float %x) + store float %maximum, ptr %maximum_res + + %minimumnum = call float @llvm.minimumnum.f32(float %x, float %x) + store float %minimumnum, ptr %minimumnum_res + %maximumnum = call float @llvm.maximumnum.f32(float %x, float %x) + store float %maximumnum, ptr %maximumnum_res + ret void +} + +;############################################################### +;# Nested calls: M(x, M(x, y)) -> M(x, y) # +;############################################################### +define void @minmax_x_minmax_xy(<2 x float> %x, <2 x float> %y, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_x_minmax_xy( +; CHECK-NEXT: [[MINNUM_XY:%.*]] = call <2 x float> 
@llvm.minnum.v2f32(<2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) +; CHECK-NEXT: store <2 x float> [[MINNUM_XY]], ptr [[MINNUM_RES:%.*]], align 8 +; CHECK-NEXT: [[MAXNUM_XY:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> [[X]], <2 x float> [[Y]]) +; CHECK-NEXT: store <2 x float> [[MAXNUM_XY]], ptr [[MAXNUM_RES:%.*]], align 8 +; CHECK-NEXT: [[MINIMUM_XY:%.*]] = call <2 x float> @llvm.minimum.v2f32(<2 x float> [[X]], <2 x float> [[Y]]) +; CHECK-NEXT: store <2 x float> [[MINIMUM_XY]], ptr [[MINIMUM_RES:%.*]], align 8 +; CHECK-NEXT: [[MAXIMUM_XY:%.*]] = call <2 x float> @llvm.maximum.v2f32(<2 x float> [[X]], <2 x float> [[Y]]) +; CHECK-NEXT: store <2 x float> [[MAXIMUM_XY]], ptr [[MAXIMUM_RES:%.*]], align 8 +; CHECK-NEXT: [[MINIMUMNUM_XY:%.*]] = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[X]], <2 x float> [[Y]]) +; CHECK-NEXT: [[MINIMUMNUM_NESTED:%.*]] = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> [[X]], <2 x float> [[MINIMUMNUM_XY]]) +; CHECK-NEXT: store <2 x float> [[MINIMUMNUM_NESTED]], ptr [[MINIMUMNUM_RES:%.*]], align 8 +; CHECK-NEXT: [[MAXIMUMNUM_XY:%.*]] = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[X]], <2 x float> [[Y]]) +; CHECK-NEXT: [[MAXIMUMNUM_NESTED:%.*]] = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> [[X]], <2 x float> [[MAXIMUMNUM_XY]]) +; CHECK-NEXT: store <2 x float> [[MAXIMUMNUM_NESTED]], ptr [[MAXIMUMNUM_RES:%.*]], align 8 +; CHECK-NEXT: ret void +; + %minnum_xy = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> %y) + %minnum_nested = call <2 x float> @llvm.minnum.v2f32(<2 x float> %x, <2 x float> %minnum_xy) + store <2 x float> %minnum_nested, ptr %minnum_res + + %maxnum_xy = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> %y) + %maxnum_nested = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %x, <2 x float> %maxnum_xy) + store <2 x float> %maxnum_nested, ptr %maxnum_res + + %minimum_xy = call <2 x float> @llvm.minimum.v2f32(<2 x float> %x, <2 x float> %y) + %minimum_nested = call <2 x float> @llvm.minimum.v2f32(<2 x float> %x, <2 x float> %minimum_xy) + store <2 x float> %minimum_nested, ptr %minimum_res + + %maximum_xy = call <2 x float> @llvm.maximum.v2f32(<2 x float> %x, <2 x float> %y) + %maximum_nested = call <2 x float> @llvm.maximum.v2f32(<2 x float> %x, <2 x float> %maximum_xy) + store <2 x float> %maximum_nested, ptr %maximum_res + + %minimumnum_xy = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> %x, <2 x float> %y) + %minimumnum_nested = call <2 x float> @llvm.minimumnum.v2f32(<2 x float> %x, <2 x float> %minimumnum_xy) + store <2 x float> %minimumnum_nested, ptr %minimumnum_res + + %maximumnum_xy = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> %x, <2 x float> %y) + %maximumnum_nested = call <2 x float> @llvm.maximumnum.v2f32(<2 x float> %x, <2 x float> %maximumnum_xy) + store <2 x float> %maximumnum_nested, ptr %maximumnum_res + ret void +} + +; Negative test: m(Z, m(X,Y)) cannot be optimized to m(x, y) +define void @minmax_z_minmax_xy(float %x, float %y, float %z, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_z_minmax_xy( +; CHECK-NEXT: [[MINNUM_XY:%.*]] = call float @llvm.minnum.f32(float [[X:%.*]], float [[Y:%.*]]) +; CHECK-NEXT: [[MINNUM_NESTED:%.*]] = call float @llvm.minnum.f32(float [[Z:%.*]], float [[MINNUM_XY]]) +; CHECK-NEXT: store float [[MINNUM_NESTED]], ptr [[MINNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXNUM_XY:%.*]] = call float @llvm.maxnum.f32(float [[X]], float [[Y]]) +; 
CHECK-NEXT: [[MAXNUM_NESTED:%.*]] = call float @llvm.maxnum.f32(float [[Z]], float [[MAXNUM_XY]]) +; CHECK-NEXT: store float [[MAXNUM_NESTED]], ptr [[MAXNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MINIMUM_XY:%.*]] = call float @llvm.minimum.f32(float [[X]], float [[Y]]) +; CHECK-NEXT: [[MINIMUM_NESTED:%.*]] = call float @llvm.minimum.f32(float [[Z]], float [[MINIMUM_XY]]) +; CHECK-NEXT: store float [[MINIMUM_NESTED]], ptr [[MINIMUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXIMUM_XY:%.*]] = call float @llvm.maximum.f32(float [[X]], float [[Y]]) +; CHECK-NEXT: [[MAXIMUM_NESTED:%.*]] = call float @llvm.maximum.f32(float [[Z]], float [[MAXIMUM_XY]]) +; CHECK-NEXT: store float [[MAXIMUM_NESTED]], ptr [[MAXIMUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MINIMUMNUM_XY:%.*]] = call float @llvm.minimumnum.f32(float [[X]], float [[Y]]) +; CHECK-NEXT: [[MINIMUMNUM_NESTED:%.*]] = call float @llvm.minimumnum.f32(float [[Z]], float [[MINIMUMNUM_XY]]) +; CHECK-NEXT: store float [[MINIMUMNUM_NESTED]], ptr [[MINIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: [[MAXIMUMNUM_XY:%.*]] = call float @llvm.maximumnum.f32(float [[X]], float [[Y]]) +; CHECK-NEXT: [[MAXIMUMNUM_NESTED:%.*]] = call float @llvm.maximumnum.f32(float [[Z]], float [[MAXIMUMNUM_XY]]) +; CHECK-NEXT: store float [[MAXIMUMNUM_NESTED]], ptr [[MAXIMUMNUM_RES:%.*]], align 4 +; CHECK-NEXT: ret void +; + %minnum_xy = call float @llvm.minnum.f32(float %x, float %y) + %minnum_nested = call float @llvm.minnum.f32(float %z, float %minnum_xy) + store float %minnum_nested, ptr %minnum_res + + %maxnum_xy = call float @llvm.maxnum.f32(float %x, float %y) + %maxnum_nested = call float @llvm.maxnum.f32(float %z, float %maxnum_xy) + store float %maxnum_nested, ptr %maxnum_res + + %minimum_xy = call float @llvm.minimum.f32(float %x, float %y) + %minimum_nested = call float @llvm.minimum.f32(float %z, float %minimum_xy) + store float %minimum_nested, ptr %minimum_res + + %maximum_xy = call float @llvm.maximum.f32(float %x, float %y) + %maximum_nested = call float @llvm.maximum.f32(float %z, float %maximum_xy) + store float %maximum_nested, ptr %maximum_res + + %minimumnum_xy = call float @llvm.minimumnum.f32(float %x, float %y) + %minimumnum_nested = call float @llvm.minimumnum.f32(float %z, float %minimumnum_xy) + store float %minimumnum_nested, ptr %minimumnum_res + + %maximumnum_xy = call float @llvm.maximumnum.f32(float %x, float %y) + %maximumnum_nested = call float @llvm.maximumnum.f32(float %z, float %maximumnum_xy) + store float %maximumnum_nested, ptr %maximumnum_res + ret void +} + +;############################################################### +;# Nested calls: M(M(x, y), M'(x, y)) -> M(x, y) # +;############################################################### +; m(m(X,Y), m'(Y,X)) -> m(X, Y) +; Test where m' is the same op as m +define void @minmax_minmax_xy_minmax_yx(half %x, half %y, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_minmax_xy_minmax_yx( +; CHECK-NEXT: [[MINNUM_XY:%.*]] = call half @llvm.minnum.f16(half [[X:%.*]], half [[Y:%.*]]) +; CHECK-NEXT: store half [[MINNUM_XY]], ptr [[MINNUM_RES:%.*]], align 2 +; CHECK-NEXT: [[MAXNUM_XY:%.*]] = call half @llvm.maxnum.f16(half [[X]], half [[Y]]) +; CHECK-NEXT: store half [[MAXNUM_XY]], ptr [[MAXNUM_RES:%.*]], align 2 +; CHECK-NEXT: [[MINIMUM_XY:%.*]] = call half @llvm.minimum.f16(half [[X]], half [[Y]]) +; CHECK-NEXT: store half [[MINIMUM_XY]], ptr [[MINIMUM_RES:%.*]], align 2 +; CHECK-NEXT: [[MAXIMUM_XY:%.*]] = call half 
@llvm.maximum.f16(half [[X]], half [[Y]]) +; CHECK-NEXT: store half [[MAXIMUM_XY]], ptr [[MAXIMUM_RES:%.*]], align 2 +; CHECK-NEXT: [[MINIMUMNUM_XY:%.*]] = call half @llvm.minimumnum.f16(half [[X]], half [[Y]]) +; CHECK-NEXT: [[MINIMUMNUM_YX:%.*]] = call half @llvm.minimumnum.f16(half [[Y]], half [[X]]) +; CHECK-NEXT: [[FINAL_MINIMUMNUM:%.*]] = call half @llvm.minimumnum.f16(half [[MINIMUMNUM_XY]], half [[MINIMUMNUM_YX]]) +; CHECK-NEXT: store half [[FINAL_MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 2 +; CHECK-NEXT: [[MAXIMUMNUM_XY:%.*]] = call half @llvm.maximumnum.f16(half [[X]], half [[Y]]) +; CHECK-NEXT: [[MAXIMUMNUM_YX:%.*]] = call half @llvm.maximumnum.f16(half [[Y]], half [[X]]) +; CHECK-NEXT: [[FINAL_MAXIMUMNUM:%.*]] = call half @llvm.maximumnum.f16(half [[MAXIMUMNUM_XY]], half [[MAXIMUMNUM_YX]]) +; CHECK-NEXT: store half [[FINAL_MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 2 +; CHECK-NEXT: ret void +; + %minnum_xy = call half @llvm.minnum.f16(half %x, half %y) + %minnum_yx = call half @llvm.minnum.f16(half %y, half %x) + %final_minnum = call half @llvm.minnum.f16(half %minnum_xy, half %minnum_yx) + store half %final_minnum, ptr %minnum_res + + %maxnum_xy = call half @llvm.maxnum.f16(half %x, half %y) + %maxnum_yx = call half @llvm.maxnum.f16(half %y, half %x) + %final_maxnum = call half @llvm.maxnum.f16(half %maxnum_xy, half %maxnum_yx) + store half %final_maxnum, ptr %maxnum_res + + %minimum_xy = call half @llvm.minimum.f16(half %x, half %y) + %minimum_yx = call half @llvm.minimum.f16(half %y, half %x) + %final_minimum = call half @llvm.minimum.f16(half %minimum_xy, half %minimum_yx) + store half %final_minimum, ptr %minimum_res + + %maximum_xy = call half @llvm.maximum.f16(half %x, half %y) + %maximum_yx = call half @llvm.maximum.f16(half %y, half %x) + %final_maximum = call half @llvm.maximum.f16(half %maximum_xy, half %maximum_yx) + store half %final_maximum, ptr %maximum_res + + %minimumnum_xy = call half @llvm.minimumnum.f16(half %x, half %y) + %minimumnum_yx = call half @llvm.minimumnum.f16(half %y, half %x) + %final_minimumnum = call half @llvm.minimumnum.f16(half %minimumnum_xy, half %minimumnum_yx) + store half %final_minimumnum, ptr %minimumnum_res + + %maximumnum_xy = call half @llvm.maximumnum.f16(half %x, half %y) + %maximumnum_yx = call half @llvm.maximumnum.f16(half %y, half %x) + %final_maximumnum = call half @llvm.maximumnum.f16(half %maximumnum_xy, half %maximumnum_yx) + store half %final_maximumnum, ptr %maximumnum_res + ret void +} + +; m(m(X,Y), m'(Y,X)) -> m(X, Y) +; Test where m' is the opposite op from m +define void @minmax_minmax_xy_maxmin_yx(double %x, double %y, ptr %minnum_res, ptr %maxnum_res, ptr %minimum_res, ptr %maximum_res, ptr %minimumnum_res, ptr %maximumnum_res) { +; CHECK-LABEL: @minmax_minmax_xy_maxmin_yx( +; CHECK-NEXT: [[MINNUM_XY:%.*]] = call double @llvm.minnum.f64(double [[Y:%.*]], double [[X:%.*]]) +; CHECK-NEXT: store double [[MINNUM_XY]], ptr [[MINNUM_RES:%.*]], align 8 +; CHECK-NEXT: [[MAXNUM_XY:%.*]] = call double @llvm.maxnum.f64(double [[Y]], double [[X]]) +; CHECK-NEXT: store double [[MAXNUM_XY]], ptr [[MAXNUM_RES:%.*]], align 8 +; CHECK-NEXT: [[MINIMUM_XY:%.*]] = call double @llvm.minimum.f64(double [[Y]], double [[X]]) +; CHECK-NEXT: store double [[MINIMUM_XY]], ptr [[MINIMUM_RES:%.*]], align 8 +; CHECK-NEXT: [[MAXIMUM_XY:%.*]] = call double @llvm.maximum.f64(double [[Y]], double [[X]]) +; CHECK-NEXT: store double [[MAXIMUM_XY]], ptr [[MAXIMUM_RES:%.*]], align 8 +; CHECK-NEXT: [[MINIMUMNUM_XY:%.*]] = call double 
@llvm.minimumnum.f64(double [[Y]], double [[X]]) +; CHECK-NEXT: [[MAXIMUMNUM_XY:%.*]] = call double @llvm.maximumnum.f64(double [[X]], double [[Y]]) +; CHECK-NEXT: [[FINAL_MINIMUMNUM:%.*]] = call double @llvm.minimumnum.f64(double [[MINIMUMNUM_XY]], double [[MAXIMUMNUM_XY]]) +; CHECK-NEXT: store double [[FINAL_MINIMUMNUM]], ptr [[MINIMUMNUM_RES:%.*]], align 8 +; CHECK-NEXT: [[MAXIMUMNUM_XY1:%.*]] = call double @llvm.maximumnum.f64(double [[Y]], double [[X]]) +; CHECK-NEXT: [[MINIMUMNUM_YX:%.*]] = call double @llvm.minimumnum.f64(double [[X]], double [[Y]]) +; CHECK-NEXT: [[FINAL_MAXIMUMNUM:%.*]] = call double @llvm.maximumnum.f64(double [[MAXIMUMNUM_XY1]], double [[MINIMUMNUM_YX]]) +; CHECK-NEXT: store double [[FINAL_MAXIMUMNUM]], ptr [[MAXIMUMNUM_RES:%.*]], align 8 +; CHECK-NEXT: ret void +; + %minnum_xy = call double @llvm.minnum.f64(double %x, double %y) + %maxnum_yx = call double @llvm.maxnum.f64(double %y, double %x) + %final_minnum = call double @llvm.minnum.f64(double %minnum_xy, double %maxnum_yx) + store double %final_minnum, ptr %minnum_res + + %maxnum_xy = call double @llvm.maxnum.f64(double %x, double %y) + %minnum_yx = call double @llvm.minnum.f64(double %y, double %x) + %final_maxnum = call double @llvm.maxnum.f64(double %maxnum_xy, double %minnum_yx) + store double %final_maxnum, ptr %maxnum_res + + %minimum_xy = call double @llvm.minimum.f64(double %x, double %y) + %maximum_yx = call double @llvm.maximum.f64(double %y, double %x) + %final_minimum = call double @llvm.minimum.f64(double %minimum_xy, double %maximum_yx) + store double %final_minimum, ptr %minimum_res + + %maximum_xy = call double @llvm.maximum.f64(double %x, double %y) + %minimum_yx = call double @llvm.minimum.f64(double %y, double %x) + %final_maximum = call double @llvm.maximum.f64(double %maximum_xy, double %minimum_yx) + store double %final_maximum, ptr %maximum_res + + %minimumnum_xy = call double @llvm.minimumnum.f64(double %x, double %y) + %maximumnum_yx = call double @llvm.maximumnum.f64(double %y, double %x) + %final_minimumnum = call double @llvm.minimumnum.f64(double %minimumnum_xy, double %maximumnum_yx) + store double %final_minimumnum, ptr %minimumnum_res + + %maximumnum_xy = call double @llvm.maximumnum.f64(double %x, double %y) + %minimumnum_yx = call double @llvm.minimumnum.f64(double %y, double %x) + %final_maximumnum = call double @llvm.maximumnum.f64(double %maximumnum_xy, double %minimumnum_yx) + store double %final_maximumnum, ptr %maximumnum_res + ret void +} diff --git a/llvm/test/Transforms/InstSimplify/get_active_lane_mask.ll b/llvm/test/Transforms/InstSimplify/get_active_lane_mask.ll index a3b8e4efbe939..180012a4e8211 100644 --- a/llvm/test/Transforms/InstSimplify/get_active_lane_mask.ll +++ b/llvm/test/Transforms/InstSimplify/get_active_lane_mask.ll @@ -18,3 +18,51 @@ define <vscale x 8 x i1> @foo_nxv8i1(i32 %a) { %mask = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1(i32 %a, i32 0) ret <vscale x 8 x i1> %mask } + +define <vscale x 16 x i1> @foo_vscale_max_255() vscale_range(1,16) { +; CHECK-LABEL: define <vscale x 16 x i1> @foo_vscale_max_255( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 255) +; CHECK-NEXT: ret <vscale x 16 x i1> [[MASK]] +; + %mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1(i32 0, i32 255) + ret <vscale x 16 x i1> %mask +} + +define <vscale x 16 x i1> @foo_vscale_max_256() vscale_range(1,16) { +; CHECK-LABEL: define <vscale x 16 x i1> @foo_vscale_max_256( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: ret <vscale x 16 x i1> splat (i1 true) +; + %mask = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1(i32 0, i32 256) + ret <vscale x 16 x i1> %mask +} + +define <vscale x 2 x i1> @foo_vscale_max_nxv2i1_1_1_2()
vscale_range(1,1) { +; CHECK-LABEL: define <vscale x 2 x i1> @foo_vscale_max_nxv2i1_1_1_2( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: ret <vscale x 2 x i1> splat (i1 true) +; + %mask = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1(i32 0, i32 2) + ret <vscale x 2 x i1> %mask +} + +define <vscale x 4 x i1> @foo_vscale_max_nxv4i1_2_4_16() vscale_range(2,4) { +; CHECK-LABEL: define <vscale x 4 x i1> @foo_vscale_max_nxv4i1_2_4_16( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: ret <vscale x 4 x i1> splat (i1 true) +; + %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1(i128 0, i128 16) + ret <vscale x 4 x i1> %mask +} + +define <vscale x 4 x i1> @foo_vscale_max_nxv4i1_2_4_1_16() vscale_range(2,4) { +; CHECK-LABEL: define <vscale x 4 x i1> @foo_vscale_max_nxv4i1_2_4_1_16( +; CHECK-SAME: ) #[[ATTR2]] { +; CHECK-NEXT: [[MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i128(i128 1, i128 16) +; CHECK-NEXT: ret <vscale x 4 x i1> [[MASK]] +; + %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1(i128 1, i128 16) + ret <vscale x 4 x i1> %mask +} + diff --git a/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll b/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll index e69da434c0caf..1d3a13bede799 100644 --- a/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll +++ b/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll @@ -75,13 +75,13 @@ define void @hoistable_alias_scope(ptr addrspace(8) %p, ptr addrspace(8) %q, i32 ; CHECK-LABEL: define void @hoistable_alias_scope ; CHECK-SAME: (ptr addrspace(8) [[P:%.*]], ptr addrspace(8) [[Q:%.*]], i32 [[BOUND:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[HOISTABLE:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8) [[Q]], i32 0, i32 0, i32 0, i32 0), !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[HOISTABLE:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8) [[Q]], i32 0, i32 0, i32 0, i32 0), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ORIG:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0), !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[ORIG:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0), !alias.scope [[META3]], !noalias [[META0]] ; CHECK-NEXT: [[INC:%.*]] = add i32 [[HOISTABLE]], [[ORIG]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[INC]], ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0), !alias.scope !3, !noalias !0 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[INC]], ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0), !alias.scope [[META3]], !noalias [[META0]] ; CHECK-NEXT: [[NEXT]] = add i32 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[NEXT]], [[BOUND]] ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[TAIL:%.*]] @@ -165,8 +165,8 @@ define void @hoistable_buffer_construction_intrinsic(ptr addrspace(1) noalias %p ; CHECK-LABEL: define void @hoistable_buffer_construction_intrinsic ; CHECK-SAME: (ptr addrspace(1) noalias [[P_GLOBAL:%.*]], ptr addrspace(1) noalias [[Q_GLOBAL:%.*]], i32 [[BOUND:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[P:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P_GLOBAL]], i16 0, i32 0, i32 0) -; CHECK-NEXT: [[Q:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[Q_GLOBAL]], i16 0, i32 0, i32 0) +; CHECK-NEXT: [[P:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P_GLOBAL]], i16 0, i64 0, i32 0) +; CHECK-NEXT: [[Q:%.*]] = call ptr addrspace(8)
@llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[Q_GLOBAL]], i16 0, i64 0, i32 0) ; CHECK-NEXT: [[HOISTABLE:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8) [[Q]], i32 0, i32 0, i32 0, i32 0) ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: @@ -181,8 +181,8 @@ define void @hoistable_buffer_construction_intrinsic(ptr addrspace(1) noalias %p ; CHECK-NEXT: ret void ; entry: - %p = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %p.global, i16 0, i32 0, i32 0) - %q = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %q.global, i16 0, i32 0, i32 0) + %p = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %p.global, i16 0, i64 0, i32 0) + %q = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %q.global, i16 0, i64 0, i32 0) br label %loop loop: %i = phi i32 [0, %entry], [%next, %loop] @@ -212,13 +212,13 @@ define void @hoistable_buffer_construction_alias_scope(ptr addrspace(1) %p.globa ; CHECK-NEXT: [[Q_EXT:%.*]] = zext i48 [[Q_TRUNC]] to i128 ; CHECK-NEXT: [[P:%.*]] = inttoptr i128 [[P_EXT]] to ptr addrspace(8) ; CHECK-NEXT: [[Q:%.*]] = inttoptr i128 [[Q_EXT]] to ptr addrspace(8) -; CHECK-NEXT: [[HOISTABLE:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8) [[Q]], i32 0, i32 0, i32 0, i32 0), !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[HOISTABLE:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8) [[Q]], i32 0, i32 0, i32 0, i32 0), !alias.scope [[META0]], !noalias [[META3]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ORIG:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0), !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[ORIG:%.*]] = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0), !alias.scope [[META3]], !noalias [[META0]] ; CHECK-NEXT: [[INC:%.*]] = add i32 [[HOISTABLE]], [[ORIG]] -; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[INC]], ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0), !alias.scope !3, !noalias !0 +; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 [[INC]], ptr addrspace(8) [[P]], i32 [[I]], i32 0, i32 0), !alias.scope [[META3]], !noalias [[META0]] ; CHECK-NEXT: [[NEXT]] = add i32 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[NEXT]], [[BOUND]] ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[TAIL:%.*]] @@ -257,7 +257,7 @@ declare i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8) nocapture r ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) declare void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg) #1 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) #2 -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) readnone nocapture, i16, i32, i32) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) readnone nocapture, i16, i64, i32) attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } attributes #2 = { nocallback nofree nosync nounwind 
speculatable willreturn memory(none) } diff --git a/llvm/test/Transforms/LICM/sink-with-coroutine.ll b/llvm/test/Transforms/LICM/sink-with-coroutine.ll index 2013df11d9c44..33ec28e40c0f3 100644 --- a/llvm/test/Transforms/LICM/sink-with-coroutine.ll +++ b/llvm/test/Transforms/LICM/sink-with-coroutine.ll @@ -22,7 +22,7 @@ define i64 @licm(i64 %n) #0 { ; CHECK-NEXT: [[T6:%.*]] = icmp ult i64 [[T5]], [[N]] ; CHECK-NEXT: br i1 [[T6]], label [[LOOP]], label [[BB2]] ; CHECK: bb2: -; CHECK-NEXT: [[RES:%.*]] = call i1 @llvm.coro.end(ptr null, i1 false, token none) +; CHECK-NEXT: call void @llvm.coro.end(ptr null, i1 false, token none) ; CHECK-NEXT: ret i64 0 ; entry: @@ -46,7 +46,7 @@ await.ready: br i1 %t6, label %loop, label %bb2 bb2: - %res = call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret i64 0 } @@ -82,7 +82,7 @@ define i64 @hoist_threadlocal() presplitcoroutine { ; CHECK: loop.end: ; CHECK-NEXT: br i1 [[CMP]], label [[EXIT]], label [[FOR_BODY]] ; CHECK: exit: -; CHECK-NEXT: [[RES:%.*]] = call i1 @llvm.coro.end(ptr null, i1 false, token none) +; CHECK-NEXT: call void @llvm.coro.end(ptr null, i1 false, token none) ; CHECK-NEXT: ret i64 0 ; entry: @@ -119,12 +119,11 @@ loop.end: br i1 %cmp, label %exit, label %for.body exit: - %res = call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret i64 0 } declare i8 @llvm.coro.suspend(token, i1) -declare i1 @llvm.coro.end(ptr, i1, token) declare nonnull ptr @readonly_funcs() readonly declare nonnull ptr @llvm.threadlocal.address(ptr nonnull) nounwind readnone willreturn declare void @not.reachable() diff --git a/llvm/test/Transforms/LoopFusion/da_separate_loops.ll b/llvm/test/Transforms/LoopFusion/da_separate_loops.ll new file mode 100644 index 0000000000000..6359f48199290 --- /dev/null +++ b/llvm/test/Transforms/LoopFusion/da_separate_loops.ll @@ -0,0 +1,182 @@ +; REQUIRES: asserts + +; RUN: opt -passes=loop-fusion -da-disable-delinearization-checks -disable-output -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s +; STAT: 2 loop-fusion - DA checks passed + +; The two inner loops have no dependency and are allowed to be fused, as different +; levels are accessed in the outer loops.
+ +; C Code +; +;; for (long int i = 0; i < n; i++) { +;; for (long int j = 0; j < n; j++) { +;; for (long int k = 0; k < n; k++) +;; A[i][j][k] = i; +;; for (long int k = 0; k < n; k++) +;; temp = A[i + 3][j + 2][k + 1]; +;; } +;; } + +define void @nonequal_outer_access(i64 %n, ptr %A) nounwind uwtable ssp { +entry: + %cmp10 = icmp sgt i64 %n, 0 + br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end26 + +for.cond1.preheader.preheader: ; preds = %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc24 + %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %for.cond1.preheader.preheader ] + %cmp26 = icmp sgt i64 %n, 0 + br i1 %cmp26, label %for.cond4.preheader.preheader, label %for.inc24 + +for.cond4.preheader.preheader: ; preds = %for.cond1.preheader + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.cond4.preheader.preheader, %for.inc21 + %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond4.preheader.preheader ] + %cmp51 = icmp sgt i64 %n, 0 + br i1 %cmp51, label %for.body6.preheader, label %for.cond10.loopexit + +for.body6.preheader: ; preds = %for.cond4.preheader + br label %for.body6 + +for.body6: ; preds = %for.body6.preheader, %for.body6 + %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ] + %arrayidx8 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %k.02 + store i64 %i.011, ptr %arrayidx8, align 8 + %inc = add nsw i64 %k.02, 1 + %exitcond13 = icmp ne i64 %inc, %n + br i1 %exitcond13, label %for.body6, label %for.cond10.loopexit.loopexit + +for.cond10.loopexit.loopexit: ; preds = %for.body6 + br label %for.cond10.loopexit + +for.cond10.loopexit: ; preds = %for.cond10.loopexit.loopexit, %for.cond4.preheader + %cmp113 = icmp sgt i64 %n, 0 + br i1 %cmp113, label %for.body12.preheader, label %for.inc21 + +for.body12.preheader: ; preds = %for.cond10.loopexit + br label %for.body12 + +for.body12: ; preds = %for.body12.preheader, %for.body12 + %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.body12.preheader ] + %add = add nsw i64 %k9.05, 1 + %add13 = add nsw i64 %j.07, 2 + %add14 = add nsw i64 %i.011, 3 + %arrayidx17 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %add14, i64 %add13, i64 %add + %0 = load i64, ptr %arrayidx17, align 8 + %inc19 = add nsw i64 %k9.05, 1 + %exitcond = icmp ne i64 %inc19, %n + br i1 %exitcond, label %for.body12, label %for.inc21.loopexit + +for.inc21.loopexit: ; preds = %for.body12 + br label %for.inc21 + +for.inc21: ; preds = %for.inc21.loopexit, %for.cond10.loopexit + %inc22 = add nsw i64 %j.07, 1 + %exitcond14 = icmp ne i64 %inc22, %n + br i1 %exitcond14, label %for.cond4.preheader, label %for.inc24.loopexit + +for.inc24.loopexit: ; preds = %for.inc21 + br label %for.inc24 + +for.inc24: ; preds = %for.inc24.loopexit, %for.cond1.preheader + %inc25 = add nsw i64 %i.011, 1 + %exitcond15 = icmp ne i64 %inc25, %n + br i1 %exitcond15, label %for.cond1.preheader, label %for.end26.loopexit + +for.end26.loopexit: ; preds = %for.inc24 + br label %for.end26 + +for.end26: ; preds = %for.end26.loopexit, %entry + ret void +} + +; The two inner loops have a forward loop-carried dependency, allowing them +; to be fused. 
+ +; C Code +; +;; for (long int i = 0; i < n; i++) { +;; for (long int j = 0; j < n; j++) { +;; for (long int k = 0; k < n; k++) +;; A[i][j][k] = i; +;; for (long int k = 0; k < n; k++) +;; temp = A[i][j][k - 1]; +;; } +;; } + +define void @forward_dep(i64 %n, ptr %A) nounwind uwtable ssp { +entry: + %cmp10 = icmp sgt i64 %n, 0 + br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end26 + +for.cond1.preheader.preheader: ; preds = %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc24 + %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %for.cond1.preheader.preheader ] + %cmp26 = icmp sgt i64 %n, 0 + br i1 %cmp26, label %for.cond4.preheader.preheader, label %for.inc24 + +for.cond4.preheader.preheader: ; preds = %for.cond1.preheader + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.cond4.preheader.preheader, %for.inc21 + %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond4.preheader.preheader ] + %cmp51 = icmp sgt i64 %n, 0 + br i1 %cmp51, label %for.body6.preheader, label %for.cond10.loopexit + +for.body6.preheader: ; preds = %for.cond4.preheader + br label %for.body6 + +for.body6: ; preds = %for.body6.preheader, %for.body6 + %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ] + %arrayidx8 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %k.02 + store i64 %i.011, ptr %arrayidx8, align 8 + %inc = add nsw i64 %k.02, 1 + %exitcond13 = icmp ne i64 %inc, %n + br i1 %exitcond13, label %for.body6, label %for.cond10.loopexit.loopexit + +for.cond10.loopexit.loopexit: ; preds = %for.body6 + br label %for.cond10.loopexit + +for.cond10.loopexit: ; preds = %for.cond10.loopexit.loopexit, %for.cond4.preheader + %cmp113 = icmp sgt i64 %n, 0 + br i1 %cmp113, label %for.body12.preheader, label %for.inc21 + +for.body12.preheader: ; preds = %for.cond10.loopexit + br label %for.body12 + +for.body12: ; preds = %for.body12.preheader, %for.body12 + %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.body12.preheader ] + %add = add nsw i64 %k9.05, -1 + %arrayidx17 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %add + %0 = load i64, ptr %arrayidx17, align 8 + %inc19 = add nsw i64 %k9.05, 1 + %exitcond = icmp ne i64 %inc19, %n + br i1 %exitcond, label %for.body12, label %for.inc21.loopexit + +for.inc21.loopexit: ; preds = %for.body12 + br label %for.inc21 + +for.inc21: ; preds = %for.inc21.loopexit, %for.cond10.loopexit + %inc22 = add nsw i64 %j.07, 1 + %exitcond14 = icmp ne i64 %inc22, %n + br i1 %exitcond14, label %for.cond4.preheader, label %for.inc24.loopexit + +for.inc24.loopexit: ; preds = %for.inc21 + br label %for.inc24 + +for.inc24: ; preds = %for.inc24.loopexit, %for.cond1.preheader + %inc25 = add nsw i64 %i.011, 1 + %exitcond15 = icmp ne i64 %inc25, %n + br i1 %exitcond15, label %for.cond1.preheader, label %for.end26.loopexit + +for.end26.loopexit: ; preds = %for.inc24 + br label %for.end26 + +for.end26: ; preds = %for.end26.loopexit, %entry + ret void +} \ No newline at end of file diff --git a/llvm/test/Transforms/LoopFusion/simple.ll b/llvm/test/Transforms/LoopFusion/simple.ll index d63890df14461..f3cd5877bd4aa 100644 --- a/llvm/test/Transforms/LoopFusion/simple.ll +++ b/llvm/test/Transforms/LoopFusion/simple.ll @@ -298,42 +298,55 @@ bb23: ; preds = %bb17, %bb ret void } +; The following IR is a representation of the provided code below. 
With PR +; #146383, loop fusion is able to utilize the information from dependence +; analysis, enabling the loops in the function to be fused. +; +; void forward_dep(int *arg) { +; for (int i = 0; i < 100; i++) { +; int tmp = i - 3; +; int val = tmp * (i + 3) % i; +; arg[i] = val; +; } +; +; for (int j = 0; j < 100; j++) { +; int val = arg[j - 3]; +; arg[j] = val * 3; +; } +; } +; define void @forward_dep(ptr noalias %arg) { ; CHECK-LABEL: @forward_dep( -; CHECK-NEXT: bb: -; CHECK-NEXT: br label [[BB7:%.*]] +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: br label %[[BB7:.*]] ; CHECK: bb7: -; CHECK-NEXT: [[DOT013:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP15:%.*]], [[BB14:%.*]] ] -; CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ 0, [[BB]] ], [ [[INDVARS_IV_NEXT3:%.*]], [[BB14]] ] +; CHECK-NEXT: [[DOT013:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP15:%.*]], %[[BB25:.*]] ] +; CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[INDVARS_IV_NEXT3:%.*]], %[[BB25]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[BB25]] ], [ 0, %[[BB]] ] ; CHECK-NEXT: [[TMP:%.*]] = add nsw i32 [[DOT013]], -3 ; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[INDVARS_IV22]], 3 ; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 ; CHECK-NEXT: [[TMP10:%.*]] = mul nsw i32 [[TMP]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV22]] to i32 ; CHECK-NEXT: [[TMP12:%.*]] = srem i32 [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG:%.*]], i64 [[INDVARS_IV22]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[INDVARS_IV22]] ; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP13]], align 4 -; CHECK-NEXT: br label [[BB14]] +; CHECK-NEXT: br label %[[BB14:.*]] ; CHECK: bb14: -; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV22]], 1 -; CHECK-NEXT: [[TMP15]] = add nuw nsw i32 [[DOT013]], 1 -; CHECK-NEXT: [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 100 -; CHECK-NEXT: br i1 [[EXITCOND4]], label [[BB7]], label [[BB19_PREHEADER:%.*]] -; CHECK: bb19.preheader: -; CHECK-NEXT: br label [[BB19:%.*]] -; CHECK: bb19: -; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[BB25:%.*]] ], [ 0, [[BB19_PREHEADER]] ] ; CHECK-NEXT: [[TMP20:%.*]] = add nsw i64 [[INDVARS_IV1]], -3 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[TMP20]] ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = mul nsw i32 [[TMP22]], 3 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[INDVARS_IV1]] ; CHECK-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 -; CHECK-NEXT: br label [[BB25]] +; CHECK-NEXT: br label %[[BB25]] ; CHECK: bb25: +; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV22]], 1 +; CHECK-NEXT: [[TMP15]] = add nuw nsw i32 [[DOT013]], 1 +; CHECK-NEXT: [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 100 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[BB19]], label [[BB26:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[BB7]], label %[[BB26:.*]] ; CHECK: bb26: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopInterchange/bail-out-all-deps.ll b/llvm/test/Transforms/LoopInterchange/bail-out-all-deps.ll new file mode 100644 index 0000000000000..83cfd91c4da4c --- /dev/null +++ b/llvm/test/Transforms/LoopInterchange/bail-out-all-deps.ll @@ -0,0 +1,44 @@ +; RUN: opt 
< %s -passes=loop-interchange -pass-remarks-output=%t \ +; RUN: -disable-output +; RUN: FileCheck -input-file %t %s + +; Check that loop interchange bails out early when finding a direction vector +; with all '*' elements. +; +; for (int i = 0; i < 4; i++) +; for (int j = 0; j < 4; j++) +; A[i & val][j & val] = 0; + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Dependence +; CHECK-NEXT: Function: f +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: All loops have dependencies in all directions. +; CHECK-NEXT: ... +define void @f(ptr %A, i64 %val) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i64 [ 0, %entry ], [ %i.next, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i64 [ 0, %for.i.header ], [ %j.next, %for.j ] + %subscript.0 = and i64 %i, %val + %subscript.1 = and i64 %j, %val + %idx = getelementptr inbounds [4 x i8], ptr %A, i64 %subscript.0, i64 %subscript.1 + store i8 0, ptr %idx + %j.next = add nuw nsw i64 %j, 1 + %exit.j = icmp eq i64 %j.next, 4 + br i1 %exit.j, label %for.i.latch, label %for.j + +for.i.latch: + %i.next = add nuw nsw i64 %i, 1 + %exit.i = icmp eq i64 %i.next, 4 + br i1 %exit.i, label %exit, label %for.i.header + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopInterchange/confused-dependence.ll b/llvm/test/Transforms/LoopInterchange/confused-dependence.ll index 49b7b0e4797b8..94080949f0af8 100644 --- a/llvm/test/Transforms/LoopInterchange/confused-dependence.ll +++ b/llvm/test/Transforms/LoopInterchange/confused-dependence.ll @@ -1,6 +1,6 @@ -; REQUIRES: asserts -; RUN: opt < %s -passes=loop-interchange -verify-dom-info -verify-loop-info \ -; RUN: -disable-output -debug 2>&1 | FileCheck %s +; RUN: opt < %s -passes=loop-interchange -pass-remarks-output=%t \ +; RUN: -disable-output +; RUN: FileCheck -input-file %t %s ;; In the following case, p0 and p1 may alias, so the direction vector must be [* *]. ;; @@ -10,9 +10,13 @@ ;; p0[4 * i + j] = p1[4 * j + i]; ;; } -; CHECK: Dependency matrix before interchange: -; CHECK-NEXT: * * -; CHECK-NEXT: Processing InnerLoopId = 1 and OuterLoopId = 0 +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Dependence +; CHECK-NEXT: Function: may_alias +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: All loops have dependencies in all directions. +; CHECK-NEXT: ... define void @may_alias(ptr %p0, ptr %p1) { entry: br label %for.i.header diff --git a/llvm/test/Transforms/LoopInterchange/legality-for-scalar-deps.ll b/llvm/test/Transforms/LoopInterchange/legality-for-scalar-deps.ll index c30f9a399fed8..5f4a8486d9ad7 100644 --- a/llvm/test/Transforms/LoopInterchange/legality-for-scalar-deps.ll +++ b/llvm/test/Transforms/LoopInterchange/legality-for-scalar-deps.ll @@ -21,13 +21,13 @@ ; CHECK-NEXT: Name: Dependence ; CHECK-NEXT: Function: issue46867 ; CHECK-NEXT: Args: -; CHECK-NEXT: - String: Cannot interchange loops due to dependences. +; CHECK-NEXT: - String: All loops have dependencies in all directions. ; CHECK: --- !Missed ; CHECK-NEXT: Pass: loop-interchange ; CHECK-NEXT: Name: Dependence ; CHECK-NEXT: Function: issue46867 ; CHECK-NEXT: Args: -; CHECK-NEXT: - String: Cannot interchange loops due to dependences. +; CHECK-NEXT: - String: All loops have dependencies in all directions. 
define void @issue46867(ptr noundef captures(none) %s, i32 noundef %c, ptr noundef readonly captures(none) %ff) { entry: %tobool7.not = icmp eq i32 %c, 0 @@ -121,7 +121,7 @@ land.end: ; CHECK-NEXT: Name: Dependence ; CHECK-NEXT: Function: issue47401 ; CHECK-NEXT: Args: -; CHECK-NEXT: - String: Cannot interchange loops due to dependences. +; CHECK-NEXT: - String: All loops have dependencies in all directions. define void @issue47401(ptr noundef writeonly captures(none) %e, ptr noundef readonly captures(none) %bb) { entry: br label %for.cond1.preheader @@ -175,7 +175,7 @@ land.end: ; CHECK-NEXT: Name: Dependence ; CHECK-NEXT: Function: issue47295 ; CHECK-NEXT: Args: -; CHECK-NEXT: - String: Cannot interchange loops due to dependences. +; CHECK-NEXT: - String: All loops have dependencies in all directions. define void @issue47295(ptr noundef captures(none) %f, ptr noundef writeonly captures(none) %cc) { entry: br label %for.cond1.preheader @@ -221,7 +221,7 @@ for.body4: ; CHECK-NEXT: Name: Dependence ; CHECK-NEXT: Function: issue54176 ; CHECK-NEXT: Args: -; CHECK-NEXT: - String: Cannot interchange loops due to dependences. +; CHECK-NEXT: - String: All loops have dependencies in all directions. define void @issue54176(i32 noundef %n, i32 noundef %m, ptr noundef captures(none) %aa, ptr noundef readonly captures(none) %bb, ptr noundef writeonly captures(none) %cc) { entry: diff --git a/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll b/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll index 73a566a310157..14836ba73433d 100644 --- a/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll +++ b/llvm/test/Transforms/LoopInterchange/loop-interchange-optimization-remarks.ll @@ -71,7 +71,7 @@ for.end19: ; CHECK-NEXT: Name: Dependence ; CHECK-NEXT: Function: test01 ; CHECK-NEXT: Args: -; CHECK-NEXT: - String: Cannot interchange loops due to dependences. +; CHECK-NEXT: - String: All loops have dependencies in all directions. ; CHECK-NEXT: ... ; DELIN: --- !Analysis @@ -147,7 +147,7 @@ define void @test02(i32 %k, i32 %N) { ; CHECK-NEXT: Name: Dependence ; CHECK-NEXT: Function: test02 ; CHECK-NEXT: Args: -; CHECK-NEXT: - String: Cannot interchange loops due to dependences. +; CHECK-NEXT: - String: All loops have dependencies in all directions. ; CHECK-NEXT: ... ; DELIN: --- !Analysis @@ -290,7 +290,7 @@ for.end17: ; CHECK-NEXT: Name: Dependence ; CHECK-NEXT: Function: test04 ; CHECK-NEXT: Args: -; CHECK-NEXT: - String: Cannot interchange loops due to dependences. +; CHECK-NEXT: - String: All loops have dependencies in all directions. ; CHECK-NEXT: ... 
; DELIN: --- !Missed diff --git a/llvm/test/Transforms/LoopInterchange/unique-dep-matrix.ll b/llvm/test/Transforms/LoopInterchange/unique-dep-matrix.ll index 68089b43121c5..3af9e7304e3be 100644 --- a/llvm/test/Transforms/LoopInterchange/unique-dep-matrix.ll +++ b/llvm/test/Transforms/LoopInterchange/unique-dep-matrix.ll @@ -2,14 +2,13 @@ ; RUN: opt < %s -passes=loop-interchange -S -debug 2>&1 | FileCheck %s ; CHECK: Dependency matrix before interchange: -; CHECK-NEXT: * * ; CHECK-NEXT: = * ; CHECK-NEXT: < * ; CHECK-NEXT: Processing InnerLoopId ; This example is taken from github issue #54176 ; -define void @foo(i32 noundef %n, i32 noundef %m, ptr nocapture noundef %aa, ptr nocapture noundef readonly %bb, ptr nocapture noundef writeonly %cc) { +define void @foo(i32 noundef %n, i32 noundef %m, ptr nocapture noundef noalias %aa, ptr nocapture noundef readonly noalias %bb, ptr nocapture noundef writeonly noalias %cc) { entry: %arrayidx7 = getelementptr inbounds i8, ptr %aa, i64 512 br label %for.cond1.preheader diff --git a/llvm/test/Transforms/LoopUnroll/scevunroll.ll b/llvm/test/Transforms/LoopUnroll/scevunroll.ll index b6b14e365cc1d..fa55eab062198 100644 --- a/llvm/test/Transforms/LoopUnroll/scevunroll.ll +++ b/llvm/test/Transforms/LoopUnroll/scevunroll.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -S -passes='loop(indvars),loop-unroll' -verify-loop-info | FileCheck %s ; +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32" + ; Unit tests for loop unrolling using ScalarEvolution to compute trip counts. ; ; Indvars is run first to generate an "old" SCEV result. Some unit @@ -66,14 +68,14 @@ define i64 @earlyLoopTest(ptr %base) nounwind { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[VAL:%.*]] = load i64, ptr [[BASE:%.*]], align 4 +; CHECK-NEXT: [[VAL:%.*]] = load i64, ptr [[BASE:%.*]], align 8 ; CHECK-NEXT: br label [[TAIL:%.*]] ; CHECK: tail: ; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i64 [[VAL]], 0 ; CHECK-NEXT: br i1 [[CMP2]], label [[LOOP_1:%.*]], label [[EXIT2:%.*]] ; CHECK: loop.1: ; CHECK-NEXT: [[ADR_1:%.*]] = getelementptr i64, ptr [[BASE]], i64 1 -; CHECK-NEXT: [[VAL_1:%.*]] = load i64, ptr [[ADR_1]], align 4 +; CHECK-NEXT: [[VAL_1:%.*]] = load i64, ptr [[ADR_1]], align 8 ; CHECK-NEXT: [[S_NEXT_1:%.*]] = add i64 [[VAL]], [[VAL_1]] ; CHECK-NEXT: br label [[TAIL_1:%.*]] ; CHECK: tail.1: @@ -81,7 +83,7 @@ define i64 @earlyLoopTest(ptr %base) nounwind { ; CHECK-NEXT: br i1 [[CMP2_1]], label [[LOOP_2:%.*]], label [[EXIT2]] ; CHECK: loop.2: ; CHECK-NEXT: [[ADR_2:%.*]] = getelementptr i64, ptr [[BASE]], i64 2 -; CHECK-NEXT: [[VAL_2:%.*]] = load i64, ptr [[ADR_2]], align 4 +; CHECK-NEXT: [[VAL_2:%.*]] = load i64, ptr [[ADR_2]], align 8 ; CHECK-NEXT: [[S_NEXT_2:%.*]] = add i64 [[S_NEXT_1]], [[VAL_2]] ; CHECK-NEXT: br label [[TAIL_2:%.*]] ; CHECK: tail.2: @@ -89,7 +91,7 @@ define i64 @earlyLoopTest(ptr %base) nounwind { ; CHECK-NEXT: br i1 [[CMP2_2]], label [[LOOP_3:%.*]], label [[EXIT2]] ; CHECK: loop.3: ; CHECK-NEXT: [[ADR_3:%.*]] = getelementptr i64, ptr [[BASE]], i64 3 -; CHECK-NEXT: [[VAL_3:%.*]] = load i64, ptr [[ADR_3]], align 4 +; CHECK-NEXT: [[VAL_3:%.*]] = load i64, ptr [[ADR_3]], align 8 ; CHECK-NEXT: [[S_NEXT_3:%.*]] = add i64 [[S_NEXT_2]], [[VAL_3]] ; CHECK-NEXT: br i1 false, label [[TAIL_3:%.*]], label [[EXIT1:%.*]] ; CHECK: tail.3: @@ -381,7 +383,7 @@ define i32 @test_pr56044(ptr %src, i32 %a) { ; CHECK: loop.2.peel: ; CHECK-NEXT: 
[[IV_2_NEXT_PEEL:%.*]] = add i32 0, [[ADD_2]] ; CHECK-NEXT: [[IV_1_NEXT_PEEL:%.*]] = add nuw nsw i32 0, 1 -; CHECK-NEXT: [[EC_2_PEEL:%.*]] = icmp ult i32 [[IV_1_NEXT_PEEL]], 12345 +; CHECK-NEXT: [[EC_2_PEEL:%.*]] = icmp ne i32 [[IV_1_NEXT_PEEL]], 12345 ; CHECK-NEXT: br i1 [[EC_2_PEEL]], label [[LOOP_2_PEEL_NEXT:%.*]], label [[EXIT:%.*]] ; CHECK: loop.2.peel.next: ; CHECK-NEXT: br label [[LOOP_2_PEEL_NEXT2:%.*]] @@ -394,8 +396,8 @@ define i32 @test_pr56044(ptr %src, i32 %a) { ; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[IV_2_NEXT_PEEL]], [[MID_PEEL_NEWPH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP_2]] ] ; CHECK-NEXT: [[IV_2_NEXT]] = add i32 2, [[IV_2]] ; CHECK-NEXT: [[IV_1_NEXT]] = add nuw nsw i32 [[IV_1]], 1 -; CHECK-NEXT: [[EC_2:%.*]] = icmp ult i32 [[IV_1_NEXT]], 12345 -; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP_2]], label [[EXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[IV_1_NEXT]], 12345 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_2]], label [[EXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: exit.loopexit: ; CHECK-NEXT: [[LCSSA_2_PH:%.*]] = phi i32 [ [[IV_2_NEXT]], [[LOOP_2]] ] ; CHECK-NEXT: br label [[EXIT]] @@ -435,3 +437,65 @@ exit: } declare void @fn(i32) + + +define void @peel_int_eq_condition(i32 %start) { +; CHECK-LABEL: @peel_int_eq_condition( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[START:%.*]], i32 100) +; CHECK-NEXT: [[TMP0:%.*]] = add nuw i32 [[SMAX]], 1 +; CHECK-NEXT: br label [[LOOP_PEEL_BEGIN:%.*]] +; CHECK: loop.peel.begin: +; CHECK-NEXT: br label [[LOOP_PEEL:%.*]] +; CHECK: loop.peel: +; CHECK-NEXT: [[C_0_PEEL:%.*]] = icmp eq i32 [[START]], [[START]] +; CHECK-NEXT: br i1 [[C_0_PEEL]], label [[IF_THEN_PEEL:%.*]], label [[LOOP_LATCH_PEEL:%.*]] +; CHECK: if.then.peel: +; CHECK-NEXT: call void @fn(i32 [[START]]) +; CHECK-NEXT: br label [[LOOP_LATCH_PEEL]] +; CHECK: loop.latch.peel: +; CHECK-NEXT: [[IV_NEXT_PEEL:%.*]] = add i32 [[START]], 1 +; CHECK-NEXT: [[EXITCOND_PEEL:%.*]] = icmp ne i32 [[IV_NEXT_PEEL]], [[TMP0]] +; CHECK-NEXT: br i1 [[EXITCOND_PEEL]], label [[LOOP_PEEL_NEXT:%.*]], label [[EXIT:%.*]] +; CHECK: loop.peel.next: +; CHECK-NEXT: br label [[LOOP_PEEL_NEXT1:%.*]] +; CHECK: loop.peel.next1: +; CHECK-NEXT: br label [[ENTRY_PEEL_NEWPH:%.*]] +; CHECK: entry.peel.newph: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT_PEEL]], [[ENTRY_PEEL_NEWPH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[C_0:%.*]] = icmp eq i32 [[IV]], [[START]] +; CHECK-NEXT: br i1 [[C_0]], label [[IF_THEN:%.*]], label [[LOOP_LATCH]] +; CHECK: if.then: +; CHECK-NEXT: call void @fn(i32 [[IV]]) +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[IV_NEXT]], [[TMP0]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %start, %entry ], [ %iv.next, %loop.latch ] + %c.0 = icmp eq i32 %iv, %start + br i1 %c.0, label %if.then, label %loop.latch + +if.then: + call void @fn(i32 %iv) + br label %loop.latch + +loop.latch: + %iv.next = add i32 %iv, 1 + %ec = icmp slt i32 %iv, 100 + br i1 %ec, label %loop, label %exit + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll index b3780013559a5..0a433ec76acf4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll @@ -50,8 +50,7 @@ define void @test_pr25490(i32 %n, ptr noalias nocapture %a, ptr noalias nocaptur ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll index 3a46944712567..dc52e644742e2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll @@ -47,8 +47,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-NEXT: [[IND_END:%.*]] = trunc i32 [[N_VEC]] to i8 -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i32 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_MOD_VF]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -169,8 +168,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll index b83d3af3a0d65..a3b7392dd280f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll @@ -53,8 +53,7 @@ define void @add_i8(ptr noalias nocapture noundef writeonly %A, ptr nocapture no ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label 
[[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[ITERATIONS]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -163,8 +162,7 @@ define void @add_i16(ptr noalias nocapture noundef writeonly %A, ptr nocapture n ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[ITERATIONS]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -273,8 +271,7 @@ define void @add_i32(ptr noalias nocapture noundef writeonly %A, ptr nocapture n ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[ITERATIONS]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[ITERATIONS]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -466,8 +463,7 @@ define void @trip_count_based_on_ptrtoint(i64 %x) "target-cpu"="apple-m1" { ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[N_VEC]], 4 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[TMP12]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll index 87b8c4af1e0c7..307d4c43198af 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll @@ -135,8 +135,7 @@ define void @test_widen_induction(ptr %A, i64 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; 
CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -221,8 +220,7 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) { ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END5:%.*]] = add i64 [[START]], [[N_VEC]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -305,8 +303,7 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[IND_END4]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[IND_END4]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilogue-vectorization-fix-scalar-resume-values.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilogue-vectorization-fix-scalar-resume-values.ll new file mode 100644 index 0000000000000..cb4e99332c04b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilogue-vectorization-fix-scalar-resume-values.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s + +; This test exercises the out-of-band fixing-up of scalar resume values.
+ +target triple = "aarch64" + +define void @epilogue_vectorization_fix_scalar_resume_values(ptr %dst, i64 %n) { +; CHECK-LABEL: define void @epilogue_vectorization_fix_scalar_resume_values( +; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[REM:%.*]] = urem i64 [[N]], 3 +; CHECK-NEXT: br i1 true, label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: br i1 true, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[REM]], 32 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[REM]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP0]], align 1 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[REM]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX1]] +; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N3:%.*]] = icmp eq i64 [[REM]], 0 +; CHECK-NEXT: br i1 [[CMP_N3]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_DST_IV:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i8 0, ptr [[GEP_DST_IV]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[REM]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + 
%rem = urem i64 %n, 3 + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.dst.iv = getelementptr i8, ptr %dst, i64 %iv + store i8 0, ptr %gep.dst.iv, align 1 + %iv.next = add i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.next, %rem + br i1 %exit.cond, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll index 91ec9da11928c..35d7e2cc8c586 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll @@ -58,8 +58,7 @@ define double @fp128_fmuladd_reduction(ptr %start0, ptr %start1, ptr %end0, ptr ; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 8 ; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP7]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll index 939eaaa34c514..db088f88e2d8a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll @@ -84,8 +84,7 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) { ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[START]], [[N_VEC]] ; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[START]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF4:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll index cc7b4aecc3642..fd6e275d098ca 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll @@ -68,10 +68,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 { ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; DEFAULT: [[VEC_EPILOG_ITER_CHECK]]: -; DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; DEFAULT-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP32:%.*]] = shl nuw i64 [[TMP31]], 2 -; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP32]] 
+; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP2]] ; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; DEFAULT: [[VEC_EPILOG_PH]]: ; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -812,7 +809,7 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) { ; PRED-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 1 ; PRED-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[TMP0]], 4294967295 ; PRED-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] -; PRED-NEXT: br i1 [[TMP5]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; PRED-NEXT: br i1 [[TMP5]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; PRED: [[VECTOR_PH]]: ; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1 ; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll index be8c8cd6480e4..93e71af74f4ac 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll @@ -41,9 +41,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3 ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]]) ; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw i64 [[TMP14]], 1 -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP15]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -174,9 +172,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]]) ; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw i64 [[TMP14]], 1 -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP15]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll index 9bd3d309c0ad9..9b4151f30d640 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll @@ -69,8 +69,7 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 ; INTERLEAVE-4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; INTERLEAVE-4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; INTERLEAVE-4: vec.epilog.iter.check: -; INTERLEAVE-4-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 
[[N]], [[N_VEC]] -; INTERLEAVE-4-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; INTERLEAVE-4-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; INTERLEAVE-4-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; INTERLEAVE-4: vec.epilog.ph: ; INTERLEAVE-4-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -158,8 +157,7 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 ; INTERLEAVE-2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; INTERLEAVE-2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; INTERLEAVE-2: vec.epilog.iter.check: -; INTERLEAVE-2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; INTERLEAVE-2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; INTERLEAVE-2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; INTERLEAVE-2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; INTERLEAVE-2: vec.epilog.ph: ; INTERLEAVE-2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll index a869cf647b5ce..aa94763b44a30 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll @@ -51,8 +51,7 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; INTERLEAVE-4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; INTERLEAVE-4: vec.epilog.iter.check: -; INTERLEAVE-4-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; INTERLEAVE-4-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; INTERLEAVE-4-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; INTERLEAVE-4-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; INTERLEAVE-4: vec.epilog.ph: ; INTERLEAVE-4-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -184,8 +183,7 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-4-VLA-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; INTERLEAVE-4-VLA-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; INTERLEAVE-4-VLA: vec.epilog.iter.check: -; INTERLEAVE-4-VLA-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; INTERLEAVE-4-VLA-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; INTERLEAVE-4-VLA-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; INTERLEAVE-4-VLA-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; INTERLEAVE-4-VLA: vec.epilog.ph: ; INTERLEAVE-4-VLA-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll index 58ebc7ce1f8f4..ee3a4a04566c9 100644 --- 
a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll @@ -57,8 +57,7 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca ; CHECK-NEXT: [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 2 ; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP7]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -180,8 +179,7 @@ define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK-NEXT: [[IND_END7:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST6]] ; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] ; CHECK-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll index 5066a9b8337bd..bd33af286b05d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -35,8 +35,7 @@ define void @add_a(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -132,8 +131,7 @@ define void @add_a1(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label 
[[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -294,8 +292,7 @@ define void @add_c(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -472,8 +469,7 @@ define void @add_e(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -612,8 +608,7 @@ define void @add_f(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll index 9b62525370210..dd8bd273050c7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll @@ -88,8 +88,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef ; CHECK-VS1-NEXT: br i1 [[CMP_N]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK-VS1: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-VS1-NEXT: [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]] -; CHECK-VS1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]] -; CHECK-VS1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-VS1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-VS1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], 
label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK-VS1: [[VEC_EPILOG_PH]]: ; CHECK-VS1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -182,8 +181,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef ; CHECK-VS2-NEXT: br i1 [[CMP_N]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK-VS2: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-VS2-NEXT: [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]] -; CHECK-VS2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]] -; CHECK-VS2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-VS2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-VS2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK-VS2: [[VEC_EPILOG_PH]]: ; CHECK-VS2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll index b157a2818e676..157b78704234a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call-scalarize.ll @@ -109,36 +109,35 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i32 0 ; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[TMP9]], i32 1 ; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = fcmp ogt <2 x double> [[TMP12]], zeroinitializer -; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP14]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00) ; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP14]], <2 x double> zeroinitializer, <2 x double> splat (double 1.000000e+00) ; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0 ; TFA_INTERLEAVE-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; TFA_INTERLEAVE: pred.store.if: -; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 0 +; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 0 ; TFA_INTERLEAVE-NEXT: store double [[TMP20]], ptr [[P:%.*]], align 8 ; TFA_INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE]] ; TFA_INTERLEAVE: pred.store.continue: ; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1 ; TFA_INTERLEAVE-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] -; TFA_INTERLEAVE: pred.store.if4: -; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[PREDPHI]], i32 1 +; TFA_INTERLEAVE: pred.store.if3: +; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 1 ; TFA_INTERLEAVE-NEXT: store double [[TMP22]], ptr [[P]], align 8 ; TFA_INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE5]] -; TFA_INTERLEAVE: pred.store.continue5: +; TFA_INTERLEAVE: pred.store.continue4: ; TFA_INTERLEAVE-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK2]], i32 0 ; TFA_INTERLEAVE-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] -; TFA_INTERLEAVE: 
pred.store.if6: +; TFA_INTERLEAVE: pred.store.if5: ; TFA_INTERLEAVE-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 0 ; TFA_INTERLEAVE-NEXT: store double [[TMP32]], ptr [[P]], align 8 ; TFA_INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE7]] -; TFA_INTERLEAVE: pred.store.continue7: +; TFA_INTERLEAVE: pred.store.continue6: ; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK2]], i32 1 ; TFA_INTERLEAVE-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]] -; TFA_INTERLEAVE: pred.store.if8: +; TFA_INTERLEAVE: pred.store.if7: ; TFA_INTERLEAVE-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[PREDPHI3]], i32 1 ; TFA_INTERLEAVE-NEXT: store double [[TMP34]], ptr [[P]], align 8 ; TFA_INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE9]] -; TFA_INTERLEAVE: pred.store.continue9: +; TFA_INTERLEAVE: pred.store.continue8: ; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = add i64 [[INDEX]], 2 ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX]], i64 [[TMP3]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index fd02300232a84..d8a81f9316e4b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -51,11 +51,10 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[ENTRY:.*]]: ; TFCOMMON-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFCOMMON-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2 -; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFCOMMON-NEXT: br label %[[VECTOR_BODY:.*]] ; TFCOMMON: [[VECTOR_BODY]]: ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFCOMMON-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]] ; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) ; TFCOMMON-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) @@ -76,12 +75,11 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 1 -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 1025) ; TFA_INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] ; TFA_INTERLEAVE: [[VECTOR_BODY]]: ; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; 
TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT4:%.*]], %[[VECTOR_BODY]] ] ; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]] ; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() @@ -179,11 +177,10 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[ENTRY:.*]]: ; TFCOMMON-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFCOMMON-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2 -; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFCOMMON-NEXT: br label %[[VECTOR_BODY:.*]] ; TFCOMMON: [[VECTOR_BODY]]: ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFCOMMON-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) ; TFCOMMON-NEXT: [[TMP6:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], splat (i64 50) @@ -207,12 +204,11 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 1 -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 1025) ; TFA_INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] ; TFA_INTERLEAVE: [[VECTOR_BODY]]: ; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], %[[VECTOR_BODY]] ] ; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() @@ -332,11 +328,10 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[ENTRY:.*]]: ; TFCOMMON-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFCOMMON-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2 -; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFCOMMON-NEXT: br label %[[VECTOR_BODY:.*]] ; TFCOMMON: [[VECTOR_BODY]]: ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFCOMMON-NEXT: [[TMP5:%.*]] = 
getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) ; TFCOMMON-NEXT: [[TMP6:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], splat (i64 50) @@ -363,12 +358,11 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 1 -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 1025) ; TFA_INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] ; TFA_INTERLEAVE: [[VECTOR_BODY]]: ; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], %[[VECTOR_BODY]] ] ; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() @@ -608,11 +602,10 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 { ; TFALWAYS-NEXT: [[ENTRY:.*]]: ; TFALWAYS-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFALWAYS-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2 -; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFALWAYS-NEXT: br label %[[VECTOR_BODY:.*]] ; TFALWAYS: [[VECTOR_BODY]]: ; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFALWAYS-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]] ; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) ; TFALWAYS-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) @@ -631,11 +624,10 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 { ; TFFALLBACK-NEXT: [[ENTRY:.*]]: ; TFFALLBACK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFFALLBACK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2 -; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFFALLBACK-NEXT: br label %[[VECTOR_BODY:.*]] ; TFFALLBACK: [[VECTOR_BODY]]: ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[TMP5:%.*]] = 
getelementptr i64, ptr [[B]], i64 [[INDEX]] ; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) ; TFFALLBACK-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) @@ -656,12 +648,11 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 { ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 1 -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 1025) ; TFA_INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] ; TFA_INTERLEAVE: [[VECTOR_BODY]]: ; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT4:%.*]], %[[VECTOR_BODY]] ] ; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDEX]] ; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() @@ -763,13 +754,12 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFALWAYS-NEXT: [[ENTRY:.*]]: ; TFALWAYS-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFALWAYS-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2 -; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFALWAYS-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double [[M]], i64 0 ; TFALWAYS-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TFALWAYS-NEXT: br label %[[VECTOR_BODY:.*]] ; TFALWAYS: [[VECTOR_BODY]]: ; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFALWAYS-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] ; TFALWAYS-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] ; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) @@ -793,13 +783,12 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFFALLBACK-NEXT: [[ENTRY:.*]]: ; TFFALLBACK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; TFFALLBACK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 2 -; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFFALLBACK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double [[M]], i64 0 ; TFFALLBACK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TFFALLBACK-NEXT: br label %[[VECTOR_BODY:.*]] ; TFFALLBACK: 
[[VECTOR_BODY]]: ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] ; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) @@ -825,14 +814,13 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 ; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 1 -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP6]], i64 1025) ; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double [[M]], i64 0 ; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TFA_INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] ; TFA_INTERLEAVE: [[VECTOR_BODY]]: ; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], %[[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT4:%.*]], %[[VECTOR_BODY]] ] ; TFA_INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP24:%.*]], %[[VECTOR_BODY]] ] ; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] @@ -972,13 +960,11 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 { ; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[P2]], align 8 ; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7:[0-9]+]] ; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = fcmp ogt double [[TMP6]], 0.000000e+00 -; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP8]], double 0.000000e+00, double 1.000000e+00 ; TFA_INTERLEAVE-NEXT: [[PREDPHI3:%.*]] = select i1 [[TMP8]], double 0.000000e+00, double 1.000000e+00 -; TFA_INTERLEAVE-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[ACTIVE_LANE_MASK2]], double [[PREDPHI3]], double [[PREDPHI]] ; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = or i1 [[ACTIVE_LANE_MASK]], [[ACTIVE_LANE_MASK2]] ; TFA_INTERLEAVE-NEXT: br i1 [[TMP14]], label %[[BB8:.*]], label %[[TMP9]] ; TFA_INTERLEAVE: [[BB8]]: -; TFA_INTERLEAVE-NEXT: store double [[SPEC_SELECT]], ptr [[P]], align 8 +; TFA_INTERLEAVE-NEXT: store double [[PREDPHI3]], ptr [[P]], align 8 ; TFA_INTERLEAVE-NEXT: br label %[[TMP9]] ; TFA_INTERLEAVE: [[TMP9]]: ; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll index f4784b6259ce1..229209e98af78 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll @@ -1381,8 +1381,8 @@ for.body: ; preds = %for.body.preheader, br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1 } -define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) { -; CHECK-NEON-LABEL: define i32 @red_extended_add_chain( +define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) { +; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain( ; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEON-NEXT: entry: ; CHECK-NEON-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 @@ -1404,7 +1404,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) { ; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] ; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 ; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v16i32(<16 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) +; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]] ; CHECK-NEON-NEXT: [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]] ; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEON-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1415,7 +1415,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) { ; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-NEON: scalar.ph: ; -; CHECK-SVE-LABEL: define i32 @red_extended_add_chain( +; CHECK-SVE-LABEL: define i32 @red_extended_add_incomplete_chain( ; CHECK-SVE-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-SVE-NEXT: entry: ; CHECK-SVE-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 @@ -1452,7 +1452,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) { ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: ; -; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_chain( +; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_incomplete_chain( ; CHECK-SVE-MAXBW-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-SVE-MAXBW-NEXT: entry: ; CHECK-SVE-MAXBW-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 @@ -1478,7 +1478,7 @@ define i32 @red_extended_add_chain(ptr %start, ptr %end, i32 %offset) { ; CHECK-SVE-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] ; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[NEXT_GEP]], align 1 ; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv8i32.nxv8i32( [[VEC_PHI]], [[TMP7]]) +; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = add [[VEC_PHI]], [[TMP7]] ; CHECK-SVE-MAXBW-NEXT: [[TMP8]] = add [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]] ; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll new file mode 100644 index 0000000000000..0086f6e61cd36 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll @@ -0,0 +1,469 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S %s | FileCheck %s + +target triple = "arm64-apple-macosx" + +define i32 @red_zext_mul_by_63(ptr %start, ptr %end) { +; CHECK-LABEL: define i32 @red_zext_mul_by_63( +; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], splat (i32 63) +; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[GEP_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1 +; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[L_EXT]], 63 +; CHECK-NEXT: [[RED_NEXT]] = add i32 [[RED]], [[MUL]] +; CHECK-NEXT: [[GEP_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], 
[ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = zext i8 %l to i32 + %mul = mul i32 %l.ext, 63 + %red.next = add i32 %red, %mul + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} + +define i32 @red_zext_mul_by_255(ptr %start, ptr %end) { +; CHECK-LABEL: define i32 @red_zext_mul_by_255( +; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], splat (i32 255) +; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[GEP_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1 +; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[L_EXT]], 255 +; CHECK-NEXT: [[RED_NEXT]] = add i32 [[RED]], [[MUL]] +; CHECK-NEXT: [[GEP_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; 
CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = zext i8 %l to i32 + %mul = mul i32 %l.ext, 255 + %red.next = add i32 %red, %mul + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} + +define i32 @red_zext_mul_by_256(ptr %start, ptr %end) { +; CHECK-LABEL: define i32 @red_zext_mul_by_256( +; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], splat (i32 256) +; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[GEP_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1 +; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[L_EXT]], 256 +; CHECK-NEXT: [[RED_NEXT]] = add i32 [[RED]], [[MUL]] +; CHECK-NEXT: [[GEP_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 
[[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = zext i8 %l to i32 + %mul = mul i32 %l.ext, 256 + %red.next = add i32 %red, %mul + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} + +define i32 @red_sext_mul_by_63(ptr %start, ptr %end) { +; CHECK-LABEL: define i32 @red_sext_mul_by_63( +; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], splat (i32 63) +; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[GEP_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1 +; CHECK-NEXT: [[L_EXT:%.*]] = sext i8 [[L]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[L_EXT]], 63 +; CHECK-NEXT: [[RED_NEXT]] = add i32 [[RED]], [[MUL]] +; CHECK-NEXT: [[GEP_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]] +; +entry: + br 
label %loop + +loop: + %ptr_iv = phi ptr [ %start, %entry ], [ %gep_iv_next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red_next, %loop ] + %l = load i8, ptr %ptr_iv, align 1 + %l_ext = sext i8 %l to i32 + %mul = mul i32 %l_ext, 63 + %red_next = add i32 %red, %mul + %gep_iv_next = getelementptr i8, ptr %ptr_iv, i64 1 + %ec = icmp eq ptr %ptr_iv, %end + br i1 %ec, label %exit, label %loop + +exit: + %red_next_lcssa = phi i32 [ %red_next, %loop ] + ret i32 %red_next_lcssa +} + +; Constants >= 128 cannot be treated as sign-extended. +define i32 @red_sext_mul_by_128(ptr %start, ptr %end) { +; CHECK-LABEL: define i32 @red_sext_mul_by_128( +; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], splat (i32 128) +; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[GEP_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1 +; CHECK-NEXT: [[L_EXT:%.*]] = sext i8 [[L]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[L_EXT]], 128 +; CHECK-NEXT: [[RED_NEXT]] = add i32 [[RED]], [[MUL]] +; CHECK-NEXT: [[GEP_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP7]], 
%[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %ptr_iv = phi ptr [ %start, %entry ], [ %gep_iv_next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red_next, %loop ] + %l = load i8, ptr %ptr_iv, align 1 + %l_ext = sext i8 %l to i32 + %mul = mul i32 %l_ext, 128 + %red_next = add i32 %red, %mul + %gep_iv_next = getelementptr i8, ptr %ptr_iv, i64 1 + %ec = icmp eq ptr %ptr_iv, %end + br i1 %ec, label %exit, label %loop + +exit: + %red_next_lcssa = phi i32 [ %red_next, %loop ] + ret i32 %red_next_lcssa +} + +define i32 @red_sext_mul_by_255(ptr %start, ptr %end) { +; CHECK-LABEL: define i32 @red_sext_mul_by_255( +; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], splat (i32 255) +; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[GEP_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1 +; CHECK-NEXT: [[L_EXT:%.*]] = sext i8 [[L]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[L_EXT]], 255 +; CHECK-NEXT: [[RED_NEXT]] = add i32 [[RED]], [[MUL]] +; CHECK-NEXT: [[GEP_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], 
%[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = sext i8 %l to i32 + %mul = mul i32 %l.ext, 255 + %red.next = add i32 %red, %mul + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} + +define i32 @red_sext_mul_by_256(ptr %start, ptr %end) { +; CHECK-LABEL: define i32 @red_sext_mul_by_256( +; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP3]], splat (i32 256) +; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[GEP_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1 +; CHECK-NEXT: [[L_EXT:%.*]] = sext i8 [[L]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[L_EXT]], 256 +; CHECK-NEXT: [[RED_NEXT]] = add i32 [[RED]], [[MUL]] +; CHECK-NEXT: [[GEP_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP7]], 
%[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = sext i8 %l to i32 + %mul = mul i32 %l.ext, 256 + %red.next = add i32 %red, %mul + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index 766b60a79520b..24375dd864fae 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -87,8 +87,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[IV_NEXT]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT]], [[WHILE_BODY]] ], [ 0, [[ENTRY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll index efce4bdf712a0..1dcd665817196 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll @@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux" ; Test case from https://github.com/llvm/llvm-project/issues/148431. 
define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8 %n, i64 %off) #0 { ; CHECK-LABEL: define void @test_predicated_load_cast_hint( -; CHECK-SAME: ptr [[DST_1:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]], i8 [[N:%.*]], i64 [[OFF:%.*]]) { +; CHECK-SAME: ptr [[DST_1:%.*]], ptr [[DST_2:%.*]], ptr [[SRC:%.*]], i8 [[N:%.*]], i64 [[OFF:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[N_EXT:%.*]] = sext i8 [[N]] to i32 ; CHECK-NEXT: [[N_SUB:%.*]] = add i32 [[N_EXT]], -15 @@ -66,205 +66,64 @@ define void @test_predicated_load_cast_hint(ptr %dst.1, ptr %dst.2, ptr %src, i8 ; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]] ; CHECK-NEXT: br i1 [[CONFLICT_RDX15]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP2]], 15 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 16 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP2]], 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 0, i32 [[TMP2]]) ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE50:.*]] ] -; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[INDEX]] to i8 -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i8 [[DOTCAST]], 4 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <16 x i32> poison, i32 [[INDEX]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT17]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[VEC_IV:%.*]] = add <16 x i32> [[BROADCAST_SPLAT18]], -; CHECK-NEXT: [[TMP25:%.*]] = icmp ule <16 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <16 x i8> poison, i8 [[TMP26]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT19]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT20]] to <16 x i64> -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <16 x i1> [[TMP25]], i32 0 -; CHECK-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE22:.*]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE22]] ] +; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP28]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = zext <4 x i8> 
[[BROADCAST_SPLAT]] to <4 x i64> +; CHECK-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[VEC_IND]] to <4 x i64> +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 0 +; CHECK-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: -; CHECK-NEXT: [[TMP29:%.*]] = add i8 [[OFFSET_IDX]], 0 -; CHECK-NEXT: [[TMP30:%.*]] = zext i8 [[TMP29]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP30]], i64 [[OFF]] -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <16 x i64> [[TMP27]], i32 0 -; CHECK-NEXT: [[TMP33:%.*]] = or i64 [[TMP32]], 1 -; CHECK-NEXT: store i64 [[TMP33]], ptr [[TMP31]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] -; CHECK: [[PRED_STORE_CONTINUE]]: -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP25]], i32 1 -; CHECK-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] -; CHECK: [[PRED_STORE_IF21]]: -; CHECK-NEXT: [[TMP35:%.*]] = add i8 [[OFFSET_IDX]], 4 -; CHECK-NEXT: [[TMP36:%.*]] = zext i8 [[TMP35]] to i64 -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP36]], i64 [[OFF]] -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <16 x i64> [[TMP27]], i32 1 -; CHECK-NEXT: [[TMP39:%.*]] = or i64 [[TMP38]], 1 -; CHECK-NEXT: store i64 [[TMP39]], ptr [[TMP37]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] -; CHECK: [[PRED_STORE_CONTINUE22]]: -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i1> [[TMP25]], i32 2 -; CHECK-NEXT: br i1 [[TMP40]], label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] -; CHECK: [[PRED_STORE_IF23]]: -; CHECK-NEXT: [[TMP41:%.*]] = add i8 [[OFFSET_IDX]], 8 -; CHECK-NEXT: [[TMP42:%.*]] = zext i8 [[TMP41]] to i64 -; CHECK-NEXT: [[TMP43:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP42]], i64 [[OFF]] -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <16 x i64> [[TMP27]], i32 2 -; CHECK-NEXT: [[TMP45:%.*]] = or i64 [[TMP44]], 1 -; CHECK-NEXT: store i64 [[TMP45]], ptr [[TMP43]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE24]] -; CHECK: [[PRED_STORE_CONTINUE24]]: -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <16 x i1> [[TMP25]], i32 3 -; CHECK-NEXT: br i1 [[TMP46]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] -; CHECK: [[PRED_STORE_IF25]]: -; CHECK-NEXT: [[TMP47:%.*]] = add i8 [[OFFSET_IDX]], 12 -; CHECK-NEXT: [[TMP48:%.*]] = zext i8 [[TMP47]] to i64 -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP48]], i64 [[OFF]] -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i64> [[TMP27]], i32 3 -; CHECK-NEXT: [[TMP51:%.*]] = or i64 [[TMP50]], 1 -; CHECK-NEXT: store i64 [[TMP51]], ptr [[TMP49]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE26]] -; CHECK: [[PRED_STORE_CONTINUE26]]: -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <16 x i1> [[TMP25]], i32 4 -; CHECK-NEXT: br i1 [[TMP52]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] -; CHECK: [[PRED_STORE_IF27]]: -; CHECK-NEXT: [[TMP53:%.*]] = add i8 [[OFFSET_IDX]], 16 -; CHECK-NEXT: [[TMP54:%.*]] = zext i8 [[TMP53]] to i64 -; CHECK-NEXT: [[TMP55:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP54]], i64 [[OFF]] -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <16 x i64> [[TMP27]], i32 4 -; CHECK-NEXT: [[TMP57:%.*]] = or i64 [[TMP56]], 1 -; CHECK-NEXT: store i64 [[TMP57]], ptr [[TMP55]], align 8, 
!alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE28]] -; CHECK: [[PRED_STORE_CONTINUE28]]: -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <16 x i1> [[TMP25]], i32 5 -; CHECK-NEXT: br i1 [[TMP58]], label %[[PRED_STORE_IF29:.*]], label %[[PRED_STORE_CONTINUE30:.*]] -; CHECK: [[PRED_STORE_IF29]]: -; CHECK-NEXT: [[TMP59:%.*]] = add i8 [[OFFSET_IDX]], 20 -; CHECK-NEXT: [[TMP60:%.*]] = zext i8 [[TMP59]] to i64 -; CHECK-NEXT: [[TMP61:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP60]], i64 [[OFF]] -; CHECK-NEXT: [[TMP62:%.*]] = extractelement <16 x i64> [[TMP27]], i32 5 -; CHECK-NEXT: [[TMP63:%.*]] = or i64 [[TMP62]], 1 -; CHECK-NEXT: store i64 [[TMP63]], ptr [[TMP61]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE30]] -; CHECK: [[PRED_STORE_CONTINUE30]]: -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP25]], i32 6 -; CHECK-NEXT: br i1 [[TMP64]], label %[[PRED_STORE_IF31:.*]], label %[[PRED_STORE_CONTINUE32:.*]] -; CHECK: [[PRED_STORE_IF31]]: -; CHECK-NEXT: [[TMP65:%.*]] = add i8 [[OFFSET_IDX]], 24 -; CHECK-NEXT: [[TMP66:%.*]] = zext i8 [[TMP65]] to i64 -; CHECK-NEXT: [[TMP67:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP66]], i64 [[OFF]] -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <16 x i64> [[TMP27]], i32 6 -; CHECK-NEXT: [[TMP69:%.*]] = or i64 [[TMP68]], 1 -; CHECK-NEXT: store i64 [[TMP69]], ptr [[TMP67]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE32]] -; CHECK: [[PRED_STORE_CONTINUE32]]: -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <16 x i1> [[TMP25]], i32 7 -; CHECK-NEXT: br i1 [[TMP70]], label %[[PRED_STORE_IF33:.*]], label %[[PRED_STORE_CONTINUE34:.*]] -; CHECK: [[PRED_STORE_IF33]]: -; CHECK-NEXT: [[TMP71:%.*]] = add i8 [[OFFSET_IDX]], 28 -; CHECK-NEXT: [[TMP72:%.*]] = zext i8 [[TMP71]] to i64 -; CHECK-NEXT: [[TMP73:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP72]], i64 [[OFF]] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <16 x i64> [[TMP27]], i32 7 -; CHECK-NEXT: [[TMP75:%.*]] = or i64 [[TMP74]], 1 -; CHECK-NEXT: store i64 [[TMP75]], ptr [[TMP73]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE34]] -; CHECK: [[PRED_STORE_CONTINUE34]]: -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <16 x i1> [[TMP25]], i32 8 -; CHECK-NEXT: br i1 [[TMP76]], label %[[PRED_STORE_IF35:.*]], label %[[PRED_STORE_CONTINUE36:.*]] -; CHECK: [[PRED_STORE_IF35]]: -; CHECK-NEXT: [[TMP77:%.*]] = add i8 [[OFFSET_IDX]], 32 -; CHECK-NEXT: [[TMP78:%.*]] = zext i8 [[TMP77]] to i64 -; CHECK-NEXT: [[TMP79:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP78]], i64 [[OFF]] -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <16 x i64> [[TMP27]], i32 8 -; CHECK-NEXT: [[TMP81:%.*]] = or i64 [[TMP80]], 1 -; CHECK-NEXT: store i64 [[TMP81]], ptr [[TMP79]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE36]] -; CHECK: [[PRED_STORE_CONTINUE36]]: -; CHECK-NEXT: [[TMP82:%.*]] = extractelement <16 x i1> [[TMP25]], i32 9 -; CHECK-NEXT: br i1 [[TMP82]], label %[[PRED_STORE_IF37:.*]], label %[[PRED_STORE_CONTINUE38:.*]] -; CHECK: [[PRED_STORE_IF37]]: -; CHECK-NEXT: [[TMP83:%.*]] = add i8 [[OFFSET_IDX]], 36 -; CHECK-NEXT: [[TMP84:%.*]] = zext i8 [[TMP83]] to i64 -; CHECK-NEXT: [[TMP85:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP84]], i64 [[OFF]] -; CHECK-NEXT: [[TMP86:%.*]] = extractelement <16 x i64> [[TMP27]], i32 9 -; CHECK-NEXT: [[TMP87:%.*]] = or i64 [[TMP86]], 1 -; CHECK-NEXT: store i64 [[TMP87]], ptr [[TMP85]], align 8, 
!alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE38]] -; CHECK: [[PRED_STORE_CONTINUE38]]: -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <16 x i1> [[TMP25]], i32 10 -; CHECK-NEXT: br i1 [[TMP88]], label %[[PRED_STORE_IF39:.*]], label %[[PRED_STORE_CONTINUE40:.*]] -; CHECK: [[PRED_STORE_IF39]]: -; CHECK-NEXT: [[TMP89:%.*]] = add i8 [[OFFSET_IDX]], 40 -; CHECK-NEXT: [[TMP90:%.*]] = zext i8 [[TMP89]] to i64 -; CHECK-NEXT: [[TMP91:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP90]], i64 [[OFF]] -; CHECK-NEXT: [[TMP92:%.*]] = extractelement <16 x i64> [[TMP27]], i32 10 -; CHECK-NEXT: [[TMP93:%.*]] = or i64 [[TMP92]], 1 -; CHECK-NEXT: store i64 [[TMP93]], ptr [[TMP91]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE40]] -; CHECK: [[PRED_STORE_CONTINUE40]]: -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <16 x i1> [[TMP25]], i32 11 -; CHECK-NEXT: br i1 [[TMP94]], label %[[PRED_STORE_IF41:.*]], label %[[PRED_STORE_CONTINUE42:.*]] -; CHECK: [[PRED_STORE_IF41]]: -; CHECK-NEXT: [[TMP95:%.*]] = add i8 [[OFFSET_IDX]], 44 -; CHECK-NEXT: [[TMP96:%.*]] = zext i8 [[TMP95]] to i64 -; CHECK-NEXT: [[TMP97:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP96]], i64 [[OFF]] -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <16 x i64> [[TMP27]], i32 11 -; CHECK-NEXT: [[TMP99:%.*]] = or i64 [[TMP98]], 1 -; CHECK-NEXT: store i64 [[TMP99]], ptr [[TMP97]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE42]] -; CHECK: [[PRED_STORE_CONTINUE42]]: -; CHECK-NEXT: [[TMP100:%.*]] = extractelement <16 x i1> [[TMP25]], i32 12 -; CHECK-NEXT: br i1 [[TMP100]], label %[[PRED_STORE_IF43:.*]], label %[[PRED_STORE_CONTINUE44:.*]] -; CHECK: [[PRED_STORE_IF43]]: -; CHECK-NEXT: [[TMP101:%.*]] = add i8 [[OFFSET_IDX]], 48 -; CHECK-NEXT: [[TMP102:%.*]] = zext i8 [[TMP101]] to i64 +; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4 x i64> [[TMP26]], i32 0 ; CHECK-NEXT: [[TMP103:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP102]], i64 [[OFF]] -; CHECK-NEXT: [[TMP104:%.*]] = extractelement <16 x i64> [[TMP27]], i32 12 +; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0 ; CHECK-NEXT: [[TMP105:%.*]] = or i64 [[TMP104]], 1 ; CHECK-NEXT: store i64 [[TMP105]], ptr [[TMP103]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE44]] -; CHECK: [[PRED_STORE_CONTINUE44]]: -; CHECK-NEXT: [[TMP106:%.*]] = extractelement <16 x i1> [[TMP25]], i32 13 -; CHECK-NEXT: br i1 [[TMP106]], label %[[PRED_STORE_IF45:.*]], label %[[PRED_STORE_CONTINUE46:.*]] -; CHECK: [[PRED_STORE_IF45]]: -; CHECK-NEXT: [[TMP107:%.*]] = add i8 [[OFFSET_IDX]], 52 -; CHECK-NEXT: [[TMP108:%.*]] = zext i8 [[TMP107]] to i64 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 +; CHECK-NEXT: br i1 [[TMP32]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; CHECK: [[PRED_STORE_IF17]]: +; CHECK-NEXT: [[TMP108:%.*]] = extractelement <4 x i64> [[TMP26]], i32 1 ; CHECK-NEXT: [[TMP109:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP108]], i64 [[OFF]] -; CHECK-NEXT: [[TMP110:%.*]] = extractelement <16 x i64> [[TMP27]], i32 13 +; CHECK-NEXT: [[TMP110:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1 ; CHECK-NEXT: [[TMP111:%.*]] = or i64 [[TMP110]], 1 ; CHECK-NEXT: store i64 [[TMP111]], ptr [[TMP109]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE46]] -; CHECK: 
[[PRED_STORE_CONTINUE46]]: -; CHECK-NEXT: [[TMP112:%.*]] = extractelement <16 x i1> [[TMP25]], i32 14 -; CHECK-NEXT: br i1 [[TMP112]], label %[[PRED_STORE_IF47:.*]], label %[[PRED_STORE_CONTINUE48:.*]] -; CHECK: [[PRED_STORE_IF47]]: -; CHECK-NEXT: [[TMP113:%.*]] = add i8 [[OFFSET_IDX]], 56 -; CHECK-NEXT: [[TMP114:%.*]] = zext i8 [[TMP113]] to i64 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]] +; CHECK: [[PRED_STORE_CONTINUE18]]: +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2 +; CHECK-NEXT: br i1 [[TMP37]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] +; CHECK: [[PRED_STORE_IF19]]: +; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i64> [[TMP26]], i32 2 ; CHECK-NEXT: [[TMP115:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP114]], i64 [[OFF]] -; CHECK-NEXT: [[TMP116:%.*]] = extractelement <16 x i64> [[TMP27]], i32 14 +; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2 ; CHECK-NEXT: [[TMP117:%.*]] = or i64 [[TMP116]], 1 ; CHECK-NEXT: store i64 [[TMP117]], ptr [[TMP115]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE48]] -; CHECK: [[PRED_STORE_CONTINUE48]]: -; CHECK-NEXT: [[TMP118:%.*]] = extractelement <16 x i1> [[TMP25]], i32 15 -; CHECK-NEXT: br i1 [[TMP118]], label %[[PRED_STORE_IF49:.*]], label %[[PRED_STORE_CONTINUE50]] -; CHECK: [[PRED_STORE_IF49]]: -; CHECK-NEXT: [[TMP119:%.*]] = add i8 [[OFFSET_IDX]], 60 -; CHECK-NEXT: [[TMP120:%.*]] = zext i8 [[TMP119]] to i64 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE20]] +; CHECK: [[PRED_STORE_CONTINUE20]]: +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3 +; CHECK-NEXT: br i1 [[TMP42]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_IF21]]: +; CHECK-NEXT: [[TMP120:%.*]] = extractelement <4 x i64> [[TMP26]], i32 3 ; CHECK-NEXT: [[TMP121:%.*]] = getelementptr [16 x i64], ptr [[DST_1]], i64 [[TMP120]], i64 [[OFF]] -; CHECK-NEXT: [[TMP122:%.*]] = extractelement <16 x i64> [[TMP27]], i32 15 +; CHECK-NEXT: [[TMP122:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3 ; CHECK-NEXT: [[TMP123:%.*]] = or i64 [[TMP122]], 1 ; CHECK-NEXT: store i64 [[TMP123]], ptr [[TMP121]], align 8, !alias.scope [[META3]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE50]] -; CHECK: [[PRED_STORE_CONTINUE50]]: +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_CONTINUE22]]: ; CHECK-NEXT: store i8 0, ptr [[DST_2]], align 1, !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 -; CHECK-NEXT: [[TMP124:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP124]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX_NEXT]], i32 [[TMP2]]) +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[TMP48:%.*]] = xor i1 [[TMP47]], true +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 16) +; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: @@ -309,6 +168,219 @@ exit: ret void } +; Check computing costs for sdiv/udiv with invariant divisor and tail folding. 
+; From https://github.com/llvm/llvm-project/issues/160354. +define void @srem_sdiv_with_tail_folding(i32 %d.0, i32 %d.1, ptr %dst, i32 %end) #0 { +; CHECK-LABEL: define void @srem_sdiv_with_tail_folding( +; CHECK-SAME: i32 [[D_0:%.*]], i32 [[D_1:%.*]], ptr [[DST:%.*]], i32 [[END:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV_SUB:%.*]] = add nsw i32 [[IV]], -1 +; CHECK-NEXT: [[REM:%.*]] = srem i32 [[IV_SUB]], [[D_0]] +; CHECK-NEXT: [[REM_1:%.*]] = add nsw i32 [[REM]], 1 +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[REM_1]], [[D_0]] +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[IV_SUB]], [[D_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[DIV]], 1 +; CHECK-NEXT: [[ADD_1_EXT:%.*]] = sext i32 [[ADD_1]] to i64 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[ADD_1_EXT]] +; CHECK-NEXT: store i32 [[IV]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %iv.sub = add nsw i32 %iv, -1 + %rem = srem i32 %iv.sub, %d.0 + %rem.1 = add nsw i32 %rem, 1 + %c = icmp eq i32 %rem.1, %d.0 + br i1 %c, label %then, label %loop.latch + +then: + %div = sdiv i32 %iv.sub, %d.1 + %add.1 = add i32 %div, 1 + %add.1.ext = sext i32 %add.1 to i64 + %gep.dst = getelementptr i32, ptr %dst, i64 %add.1.ext + store i32 %iv, ptr %gep.dst, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp ne i32 %iv.next, %end + br i1 %ec, label %loop.header, label %exit + +exit: + ret void +} + +; Check computing costs for predicated sdiv/udiv with invariant divisor without tail folding. +; From https://github.com/llvm/llvm-project/issues/160356. 
+define void @srem_sdiv_without_tail_folding(i32 %d.0, i32 %d.1, ptr %dst, i32 %end) #1 { +; CHECK-LABEL: define void @srem_sdiv_without_tail_folding( +; CHECK-SAME: i32 [[D_0:%.*]], i32 [[D_1:%.*]], ptr [[DST:%.*]], i32 [[END:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[END]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[END]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[END]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[D_0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[VEC_IND]], splat (i32 -1) +; CHECK-NEXT: [[TMP1:%.*]] = srem <4 x i32> [[TMP0]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], splat (i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 +; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]] +; CHECK: [[PRED_SDIV_IF]]: +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = sdiv i32 [[TMP5]], [[D_1]] +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0 +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE]] +; CHECK: [[PRED_SDIV_CONTINUE]]: +; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_SDIV_IF]] ] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 +; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_SDIV_IF1:.*]], label %[[PRED_SDIV_CONTINUE2:.*]] +; CHECK: [[PRED_SDIV_IF1]]: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = sdiv i32 [[TMP10]], [[D_1]] +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1 +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE2]] +; CHECK: [[PRED_SDIV_CONTINUE2]]: +; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], %[[PRED_SDIV_CONTINUE]] ], [ [[TMP12]], %[[PRED_SDIV_IF1]] ] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 +; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_SDIV_IF3:.*]], label %[[PRED_SDIV_CONTINUE4:.*]] +; CHECK: [[PRED_SDIV_IF3]]: +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP16:%.*]] = sdiv i32 [[TMP15]], [[D_1]] +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2 +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE4]] +; CHECK: [[PRED_SDIV_CONTINUE4]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], %[[PRED_SDIV_CONTINUE2]] ], [ [[TMP17]], %[[PRED_SDIV_IF3]] ] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 +; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_SDIV_IF5:.*]], label %[[PRED_SDIV_CONTINUE6:.*]] +; CHECK: [[PRED_SDIV_IF5]]: +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP21:%.*]] = sdiv i32 
[[TMP20]], [[D_1]] +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3 +; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE6]] +; CHECK: [[PRED_SDIV_CONTINUE6]]: +; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], %[[PRED_SDIV_CONTINUE4]] ], [ [[TMP22]], %[[PRED_SDIV_IF5]] ] +; CHECK-NEXT: [[TMP24:%.*]] = add <4 x i32> [[TMP23]], splat (i32 1) +; CHECK-NEXT: [[TMP25:%.*]] = sext <4 x i32> [[TMP24]] to <4 x i64> +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0 +; CHECK-NEXT: br i1 [[TMP26]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP25]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1 +; CHECK-NEXT: br i1 [[TMP30]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; CHECK: [[PRED_STORE_IF7]]: +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP25]], i32 1 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; CHECK: [[PRED_STORE_CONTINUE8]]: +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2 +; CHECK-NEXT: br i1 [[TMP34]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; CHECK: [[PRED_STORE_IF9]]: +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i64> [[TMP25]], i32 2 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: store i32 [[TMP37]], ptr [[TMP36]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; CHECK: [[PRED_STORE_CONTINUE10]]: +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3 +; CHECK-NEXT: br i1 [[TMP38]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] +; CHECK: [[PRED_STORE_IF11]]: +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP25]], i32 3 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP39]] +; CHECK-NEXT: [[TMP41:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: store i32 [[TMP41]], ptr [[TMP40]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; CHECK: [[PRED_STORE_CONTINUE12]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP42]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[END]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV_SUB:%.*]] = add nsw i32 [[IV]], -1 +; CHECK-NEXT: [[REM:%.*]] = srem i32 [[IV_SUB]], [[D_0]] +; CHECK-NEXT: 
[[REM_1:%.*]] = add nsw i32 [[REM]], 1 +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[REM_1]], [[D_0]] +; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[IV_SUB]], [[D_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[DIV]], 1 +; CHECK-NEXT: [[ADD_1_EXT:%.*]] = sext i32 [[ADD_1]] to i64 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[ADD_1_EXT]] +; CHECK-NEXT: store i32 [[IV]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %iv.sub = add nsw i32 %iv, -1 + %rem = srem i32 %iv.sub, %d.0 + %rem.1 = add nsw i32 %rem, 1 + %c = icmp eq i32 %rem.1, %d.0 + br i1 %c, label %then, label %loop.latch + +then: + %div = sdiv i32 %iv.sub, %d.1 + %add.1 = add i32 %div, 1 + %add.1.ext = sext i32 %add.1 to i64 + %gep.dst = getelementptr i32, ptr %dst, i64 %add.1.ext + store i32 %iv, ptr %gep.dst, align 4 + br label %loop.latch + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp ne i32 %iv.next, %end + br i1 %ec, label %loop.header, label %exit + +exit: + ret void +} + +attributes #0 = { "target-cpu"="neoverse-v1" } +attributes #1 = { "target-cpu"="neoverse-v2" } + !0 = distinct !{!0, !1, !2, !3} !1 = !{!"llvm.loop.mustprogress"} !2 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} @@ -327,4 +399,6 @@ exit: ; CHECK: [[META10]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META11]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META9]], [[META10]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META10]], [[META11]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META11]], [[META10]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll index 496bfbb18a106..0c7dc29cb46d8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-recurrence-costs-sve.ll @@ -418,11 +418,8 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 { ; VSCALEFORTUNING2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; VSCALEFORTUNING2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; VSCALEFORTUNING2: [[VEC_EPILOG_ITER_CHECK]]: -; VSCALEFORTUNING2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; VSCALEFORTUNING2-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() -; VSCALEFORTUNING2-NEXT: [[TMP27:%.*]] = shl nuw i64 [[TMP22]], 1 -; VSCALEFORTUNING2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP27]] -; VSCALEFORTUNING2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; VSCALEFORTUNING2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP5]] +; VSCALEFORTUNING2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF5:![0-9]+]] ; VSCALEFORTUNING2: [[VEC_EPILOG_PH]]: ; VSCALEFORTUNING2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; VSCALEFORTUNING2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -443,7 +440,7 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 { ; VSCALEFORTUNING2-NEXT: [[TMP24]] = or [[TMP23]], [[VEC_PHI9]] ; VSCALEFORTUNING2-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[IV]], [[TMP20]] ; VSCALEFORTUNING2-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC5]] -; VSCALEFORTUNING2-NEXT: br i1 [[TMP25]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VSCALEFORTUNING2-NEXT: br i1 [[TMP25]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; VSCALEFORTUNING2: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; VSCALEFORTUNING2-NEXT: [[TMP26:%.*]] = call i16 @llvm.vector.reduce.or.nxv2i16( [[TMP24]]) ; VSCALEFORTUNING2-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] @@ -461,7 +458,7 @@ define i16 @reduce_udiv(ptr %src, i16 %x, i64 %N) #0 { ; VSCALEFORTUNING2-NEXT: [[RED_NEXT]] = or i16 [[DIV]], [[RED]] ; VSCALEFORTUNING2-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1 ; VSCALEFORTUNING2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV1]], [[N]] -; VSCALEFORTUNING2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] +; VSCALEFORTUNING2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; VSCALEFORTUNING2: [[EXIT]]: ; VSCALEFORTUNING2-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i16 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP18]], %[[MIDDLE_BLOCK]] ], [ [[TMP26]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; VSCALEFORTUNING2-NEXT: ret i16 [[RED_NEXT_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll index 2941b3677af81..8830ce33aecff 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-struct-return.ll @@ -1,3 +1,4 @@ 
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 6 ; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -5,14 +6,41 @@ target triple = "aarch64-unknown-linux-gnu" ; Tests basic vectorization of scalable homogeneous struct literal returns. define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @struct_return_f32_widen -; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) -; CHECK: vector.body: -; CHECK: [[WIDE_CALL:%.*]] = call { , } @scalable_vec_masked_foo( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) -; CHECK: [[WIDE_A:%.*]] = extractvalue { , } [[WIDE_CALL]], 0 -; CHECK: [[WIDE_B:%.*]] = extractvalue { , } [[WIDE_CALL]], 1 -; CHECK: call void @llvm.masked.store.nxv4f32.p0( [[WIDE_A]], ptr {{%.*}}, i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK: call void @llvm.masked.store.nxv4f32.p0( [[WIDE_B]], ptr {{%.*}}, i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-LABEL: define void @struct_return_f32_widen( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 1024, [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i64 1024, [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1024) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP7]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = call { , } @scalable_vec_masked_foo( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[TMP8]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP9]], ptr [[TMP11]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP10]], ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true +; CHECK-NEXT: br i1 [[TMP14]], 
label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] +; entry: br label %for.body @@ -36,14 +64,41 @@ exit: } define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @struct_return_f64_widen -; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) -; CHECK: vector.body: -; CHECK: [[WIDE_CALL:%.*]] = call { , } @scalable_vec_masked_bar( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) -; CHECK: [[WIDE_A:%.*]] = extractvalue { , } [[WIDE_CALL]], 0 -; CHECK: [[WIDE_B:%.*]] = extractvalue { , } [[WIDE_CALL]], 1 -; CHECK: call void @llvm.masked.store.nxv2f64.p0( [[WIDE_A]], ptr {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK: call void @llvm.masked.store.nxv2f64.p0( [[WIDE_B]], ptr {{%.*}}, i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-LABEL: define void @struct_return_f64_widen( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 1024, [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i64 1024, [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP4]], i64 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1024) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP8:%.*]] = call { , } @scalable_vec_masked_bar( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[TMP8]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[OUT_A]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP9]], ptr [[TMP11]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[OUT_B]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP10]], ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP6]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] +; entry: br label %for.body @@ -67,15 +122,59 @@ exit: } define void 
@struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) { -; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks -; CHECK-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]]) -; CHECK: entry: -; CHECK: br label %vector.memcheck -; CHECK: vector.memcheck: -; CHECK: vector.body: -; CHECK: call { , } @scalable_vec_masked_foo( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) -; CHECK: for.body: -; CHECK: call { float, float } @foo(float [[LOAD:%.*]]) +; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks( +; CHECK-SAME: ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[IN3:%.*]] = ptrtoint ptr [[IN]] to i64 +; CHECK-NEXT: [[OUT_A2:%.*]] = ptrtoint ptr [[OUT_A]] to i64 +; CHECK-NEXT: [[OUT_B1:%.*]] = ptrtoint ptr [[OUT_B]] to i64 +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[OUT_B1]], [[OUT_A2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP1]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[OUT_A2]], [[IN3]] +; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP1]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[OUT_B1]], [[IN3]] +; CHECK-NEXT: [[DIFF_CHECK5:%.*]] = icmp ult i64 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[CONFLICT_RDX6:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX6]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = sub i64 1024, [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 1024, [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1024) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4f32.p0(ptr [[TMP15]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP16:%.*]] = call { , } @scalable_vec_masked_foo( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[TMP16]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[TMP16]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP17]], ptr [[TMP19]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0( [[TMP18]], ptr 
[[TMP20]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]]) +; CHECK-NEXT: [[TMP21:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = xor i1 [[TMP21]], true +; CHECK-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index f9eb9eb2a5a96..79fb3fd181cc5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -36,8 +36,7 @@ define void @cost_store_i8(ptr %dst) #0 { ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 101, [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; DEFAULT: vec.epilog.iter.check: -; DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 101, [[N_VEC]] -; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; DEFAULT: vec.epilog.ph: ; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll index 793813e55409e..b78ada07db1b3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll @@ -42,8 +42,7 @@ define i64 @int_reduction_and(ptr noalias nocapture %a, i64 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll index 44a8eba84a1d0..27779d5ceb0ac 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll @@ -41,8 +41,7 @@ define i64 @int_reduction_add(ptr %a, i64 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 
[[N_VEC_REMAINING]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll index 4c4c9e57b4ffb..ebc1c1ef1e773 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll @@ -38,8 +38,7 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll index 612aad5c665cd..8d33ccbf38861 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll @@ -48,8 +48,7 @@ define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -97,8 +96,7 @@ define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK-VF8: vec.epilog.iter.check: -; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]] -; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK-VF8: vec.epilog.ph: ; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -189,8 +187,7 @@ define void @main_vf_vscale_x_2_no_epi_iteration(ptr %A) #0 vscale_range(8, 8) { ; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label 
[[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK-VF8: vec.epilog.iter.check: -; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]] -; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK-VF8: vec.epilog.ph: ; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -269,8 +266,7 @@ define void @main_vf_vscale_x_2(ptr %A, i64 %n) #0 vscale_range(8, 8) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -322,8 +318,7 @@ define void @main_vf_vscale_x_2(ptr %A, i64 %n) #0 vscale_range(8, 8) { ; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK-VF8: vec.epilog.iter.check: -; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK-VF8: vec.epilog.ph: ; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -393,8 +388,7 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 10000, [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -445,8 +439,7 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK-VF8: vec.epilog.iter.check: ; CHECK-VF8-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] -; CHECK-VF8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 10000, [[N_VEC]] -; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK-VF8: 
vec.epilog.ph: ; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -526,8 +519,7 @@ define void @trip_count_vscale(ptr noalias %a, ptr noalias %b) vscale_range(1, 1 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -657,8 +649,7 @@ define void @trip_count_vscale_no_epilogue_iterations(ptr noalias %a, ptr noalia ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vscale-fixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vscale-fixed.ll index 5742b3ad45749..4706798c525bd 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vscale-fixed.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vscale-fixed.ll @@ -55,8 +55,7 @@ define void @main_vf_vscale_x_16(ptr %A, i64 %n) #0 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -110,10 +109,7 @@ define void @main_vf_vscale_x_16(ptr %A, i64 %n) #0 { ; CHECK-EPILOG-PREFER-SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-EPILOG-PREFER-SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK-EPILOG-PREFER-SCALABLE: vec.epilog.iter.check: -; CHECK-EPILOG-PREFER-SCALABLE-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-EPILOG-PREFER-SCALABLE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-EPILOG-PREFER-SCALABLE-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 3 -; CHECK-EPILOG-PREFER-SCALABLE-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP12]] +; CHECK-EPILOG-PREFER-SCALABLE-NEXT: 
[[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP1]] ; CHECK-EPILOG-PREFER-SCALABLE-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK-EPILOG-PREFER-SCALABLE: vec.epilog.ph: ; CHECK-EPILOG-PREFER-SCALABLE-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll index c3ace983fd911..7628b39cf4eb7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll @@ -7,21 +7,20 @@ define void @test_big_little_params(ptr readonly %a, ptr readonly %b, ptr noalia ; CHECK-LABEL: define void @test_big_little_params ; CHECK-SAME: (ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1025) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP0]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP1]], i32 1, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP2:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]], [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP2]], ptr [[TMP3]], i32 4, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP2]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr [[TMP3]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP4:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]], [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP4]], ptr [[TMP5]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_BODY]], 
label [[EXIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] @@ -52,21 +51,20 @@ define void @test_little_big_params(ptr readonly %a, ptr readonly %b, ptr noalia ; CHECK-LABEL: define void @test_little_big_params ; CHECK-SAME: (ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 1 -; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f32.p0(ptr [[TMP0]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP1]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP2:%.*]] = call @bar_vector( [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]], [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP2]], ptr [[TMP3]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f32.p0(ptr [[TMP2]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP3]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP4:%.*]] = call @bar_vector( [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]], [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP4]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll index c54511e957ef8..209fa60b260aa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll @@ -37,9 +37,7 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 % ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: 
vec.epilog.iter.check: -; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP23:%.*]] = shl nuw i64 [[TMP22]], 1 -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP23]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll index a5f47e7275f65..2a19402347e40 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll @@ -212,8 +212,7 @@ define void @test_interleave_store_one_constant(ptr noalias %src, ptr noalias %d ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -366,8 +365,7 @@ define void @single_fmul_used_by_each_member(ptr noalias %A, ptr noalias %B, ptr ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF7]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll index 46ba7f645a03e..6c36dfb81311b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll @@ -16,7 +16,6 @@ define void @load_store_interleave_group(ptr noalias %data) { ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 100, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 100, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -24,7 
+23,7 @@ define void @load_store_interleave_group(ptr noalias %data) { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP0]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP1]], align 8 ; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP1]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: @@ -66,7 +65,6 @@ define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr no ; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1111, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1111, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -75,7 +73,7 @@ define void @test_2xi64_unary_op_load_interleave_group(ptr noalias %data, ptr no ; CHECK-NEXT: [[TMP7:%.*]] = load , ptr [[TMP1]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = fneg [[TMP7]] ; CHECK-NEXT: store [[TMP9]], ptr [[TMP1]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll index 21928ce715007..44b4e5a8c2bc7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll @@ -29,8 +29,7 @@ define void @test_remove_vector_loop_region_epilogue(ptr %dst, i1 %c) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TC]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TC]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll index b7a697831e117..26b80967c336a 100644 --- a/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll +++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/buffer-fat-pointer.ll @@ -7,21 +7,21 @@ define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32(ptr addrspace(1) %.ptr, i64 %v) { ; CHECK-LABEL: define amdgpu_kernel void @_dynamic_pack_simple_dispatch_0_pack_i32( ; CHECK-SAME: ptr addrspace(1) [[DOTPTR:%.*]], i64 [[V:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[_LR_PH5:.*]]: -; CHECK-NEXT: 
[[DOTRSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[DOTPTR]], i16 0, i32 -2147483648, i32 159744) +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[DOTRSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[DOTPTR]], i16 0, i64 2147483648, i32 159744) ; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(8) [[DOTRSRC]] to ptr addrspace(7) ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[_LR_PH5]] ], [ [[TMP5:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP5]] = add i64 [[TMP3]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP3]], [[V]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[__CRIT_EDGE_LOOPEXIT:.*]], label %[[LOOP]] -; CHECK: [[__CRIT_EDGE_LOOPEXIT]]: +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; entry: - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %.ptr, i16 0, i32 2147483648, i32 159744) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %.ptr, i16 0, i64 2147483648, i32 159744) %fat = addrspacecast ptr addrspace(8) %rsrc to ptr addrspace(7) br label %loop @@ -36,4 +36,4 @@ exit: ; preds = %exit ret void } -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i64, i32) diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll index 658b9a4569191..1540baab53719 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll @@ -1679,8 +1679,7 @@ define i64 @test_std_q31(ptr %x, i32 %n) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP2]]) ; CHECK-NEXT: [[TMP4]] = add i64 [[VEC_PHI]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i32> [[TMP1]] to <4 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i64> [[TMP5]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <4 x i64> [[TMP2]], [[TMP2]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]]) ; CHECK-NEXT: [[TMP8]] = add i64 [[VEC_PHI1]], [[TMP7]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll index 3938b7a4c7ff6..abbd176a1df6e 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll @@ -143,8 +143,7 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) { ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label 
%[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll index 43cce8005bbf6..231175f362888 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll @@ -238,9 +238,8 @@ define void @QLA_F3_r_veq_norm2_V(ptr noalias %r, ptr noalias %a, i32 %n) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_FOR_END13_CRIT_EDGE:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP158]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -276,7 +275,7 @@ define void @QLA_F3_r_veq_norm2_V(ptr noalias %r, ptr noalias %a, i32 %n) { ; CHECK-NEXT: [[TMP176]] = fadd fast <2 x double> [[TMP184]], [[TMP182]] ; CHECK-NEXT: [[INDEX_NEXT80]] = add nuw i64 [[INDVARS_IV]], 2 ; CHECK-NEXT: [[TMP185:%.*]] = icmp eq i64 [[INDEX_NEXT80]], [[N_VEC70]] -; CHECK-NEXT: br i1 [[TMP185]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP185]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP186:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[TMP176]]) ; CHECK-NEXT: [[CMP_N81:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC70]] @@ -318,7 +317,7 @@ define void @QLA_F3_r_veq_norm2_V(ptr noalias %r, ptr noalias %a, i32 %n) { ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_COND_FOR_END13_CRIT_EDGE]], label %[[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[FOR_COND_FOR_END13_CRIT_EDGE]]: ; CHECK-NEXT: [[ADD10_2_LCSSA:%.*]] = phi double [ [[ADD10_2]], %[[FOR_COND1_PREHEADER]] ], [ [[TMP158]], %[[MIDDLE_BLOCK]] ], [ [[TMP186]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[PHITMP:%.*]] = fptrunc double [[ADD10_2_LCSSA]] to float @@ -385,6 +384,7 @@ for.end13: ; preds = %for.cond.for.end13_ ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK: [[META2]] = 
!{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]]} +; CHECK: [[PROF3]] = !{!"branch_weights", i32 2, i32 14} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll index 1b0a38689603d..7677c9666455a 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll @@ -84,8 +84,7 @@ define void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 %N) { ; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; VF-TWO-CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; VF-TWO-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; VF-TWO-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; VF-TWO-CHECK: [[VEC_EPILOG_PH]]: ; VF-TWO-CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -187,8 +186,7 @@ define void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 %N) { ; VF-FOUR-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; VF-FOUR-CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; VF-FOUR-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; VF-FOUR-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; VF-FOUR-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; VF-FOUR-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; VF-FOUR-CHECK: [[VEC_EPILOG_PH]]: ; VF-FOUR-CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -327,8 +325,7 @@ define void @f2(ptr noalias %A, ptr noalias %B, i32 %n) { ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; VF-TWO-CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; VF-TWO-CHECK-NEXT: [[IND_END18:%.*]] = trunc i64 [[N_VEC]] to i32 -; VF-TWO-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; VF-TWO-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] ; VF-TWO-CHECK: [[VEC_EPILOG_PH]]: ; VF-TWO-CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -451,8 +448,7 @@ define void @f2(ptr noalias %A, ptr noalias %B, i32 %n) { ; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; 
VF-FOUR-CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; VF-FOUR-CHECK-NEXT: [[IND_END18:%.*]] = trunc i64 [[N_VEC]] to i32 -; VF-FOUR-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; VF-FOUR-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; VF-FOUR-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; VF-FOUR-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] ; VF-FOUR-CHECK: [[VEC_EPILOG_PH]]: ; VF-FOUR-CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll index ca39b35aeae1c..d82a3cde4639a 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/small-loop-rdx.ll @@ -72,8 +72,7 @@ define void @test(ptr %arr, i32 %len) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll index dd9cdb43ef92a..612e7c083bda1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll @@ -27,8 +27,7 @@ define void @block_with_dead_inst_1(ptr %src, i64 %N) #0 { ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( zeroinitializer, align 2 [[TMP20]], splat (i1 true), i32 [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -101,11 +100,10 @@ define void @block_with_dead_inst_2(ptr %src) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( zeroinitializer, align 2 [[TMP10]], splat (i1 true), i32 [[TMP9]]) -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64 -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; 
CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH:.*]]: @@ -175,11 +173,10 @@ define void @multiple_blocks_with_dead_insts_3(ptr %src) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( zeroinitializer, align 2 [[TMP10]], splat (i1 true), i32 [[TMP9]]) -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64 -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH:.*]]: @@ -259,11 +256,10 @@ define void @multiple_blocks_with_dead_insts_4(ptr %src, i64 %N) #0 { ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i16, ptr [[SRC]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( zeroinitializer, align 2 [[TMP20]], splat (i1 true), i32 [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH:.*]]: @@ -345,11 +341,10 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_5(ptr %src) #0 { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( zeroinitializer, align 2 [[TMP10]], splat (i1 true), i32 [[TMP9]]) -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64 -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH:.*]]: @@ -449,11 +444,10 @@ define void @multiple_blocks_with_dead_inst_multiple_successors_6(ptr %src, i1 % ; CHECK-NEXT: [[TMP23:%.*]] = select 
[[TMP17]], [[BROADCAST_SPLAT]], zeroinitializer ; CHECK-NEXT: [[TMP24:%.*]] = or [[TMP22]], [[TMP23]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( zeroinitializer, align 2 [[TMP20]], [[TMP24]], i32 [[TMP27]]) -; CHECK-NEXT: [[TMP25:%.*]] = zext i32 [[TMP27]] to i64 -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH:.*]]: @@ -540,7 +534,7 @@ define void @empty_block_with_phi_1(ptr %src, i64 %N) #0 { ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[TMP9]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH:.*]]: @@ -610,7 +604,7 @@ define void @empty_block_with_phi_2(ptr %src, i64 %N) #0 { ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP11]], [[TMP9]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH:.*]]: @@ -707,12 +701,11 @@ define void @dead_load_in_block(ptr %dst, ptr %src, i8 %N, i64 %x) #0 { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP23]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[DST]], [[VEC_IND]] -; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( zeroinitializer, align 4 [[TMP21]], splat (i1 true), i32 [[TMP18]]), !alias.scope [[META11:![0-9]+]], !noalias [[META14:![0-9]+]] -; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP18]] to i64 -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]] +; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( zeroinitializer, align 4 [[TMP21]], splat (i1 true), i32 [[TMP18]]), !alias.scope [[META10:![0-9]+]], !noalias [[META13:![0-9]+]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: @@ -731,7 +724,7 @@ define void @dead_load_in_block(ptr %dst, ptr %src, i8 %N, i64 %x) #0 { ; CHECK-NEXT: store i32 0, ptr [[GEP_DST]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 3 ; CHECK-NEXT: 
[[CMP:%.*]] = icmp ult i64 [[IV]], [[N_EXT]] -; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP_HEADER]], label %[[EXIT]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -763,23 +756,22 @@ exit: attributes #0 = { "target-features"="+64bit,+v" } ;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]], [[META3]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]], [[META3]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]], [[META3]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]], [[META3]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]], [[META3]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]], [[META3]]} -; CHECK: [[META11]] = !{[[META12:![0-9]+]]} -; CHECK: [[META12]] = distinct !{[[META12]], [[META13:![0-9]+]]} -; CHECK: [[META13]] = distinct !{[[META13]], !"LVerDomain"} -; CHECK: [[META14]] = !{[[META15:![0-9]+]], [[META16:![0-9]+]]} -; CHECK: [[META15]] = distinct !{[[META15]], [[META13]]} -; CHECK: [[META16]] = distinct !{[[META16]], [[META13]]} -; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]], [[META3]]} -; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]]} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} +; CHECK: [[META10]] = !{[[META11:![0-9]+]]} +; CHECK: [[META11]] = distinct !{[[META11]], [[META12:![0-9]+]]} +; CHECK: [[META12]] = distinct !{[[META12]], !"LVerDomain"} +; CHECK: [[META13]] = !{[[META14:![0-9]+]], [[META15:![0-9]+]]} +; CHECK: [[META14]] = distinct !{[[META14]], [[META12]]} +; CHECK: [[META15]] = distinct !{[[META15]], [[META12]]} +; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} +; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll index 65373e4128b7f..96c3a0d1a2f01 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll @@ -36,8 +36,7 @@ define void @dead_load(ptr %p, i16 %start) { ; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector [[DOTSPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i16, ptr [[P]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( zeroinitializer, align 2 [[TMP21]], splat (i1 true), i32 [[TMP16]]) -; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP16]] to i64 -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT2]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -113,11 +112,11 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP15:%.*]] = sext [[VEC_IND]] to ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[DST]], [[TMP15]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i8.nxv4p0( zeroinitializer, [[TMP16]], i32 1, splat (i1 true)), !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i8.nxv4p0( zeroinitializer, [[TMP16]], i32 1, splat (i1 true)), !alias.scope [[META3:![0-9]+]], !noalias [[META6:![0-9]+]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -132,7 +131,7 @@ define i8 @dead_live_out_due_to_scalar_epilogue_required(ptr %src, ptr %dst) { ; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[IV]], 1001 -; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[R:%.*]] = phi i8 [ [[L]], %[[LOOP]] ] ; CHECK-NEXT: ret i8 [[R]] @@ -197,48 +196,48 @@ define i32 @cost_of_exit_branch_and_cond_insts(ptr %a, ptr %b, i1 %c, i16 %x) #0 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[B]], i32 [[INDEX]] ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: -; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11:![0-9]+]], !noalias [[META14:![0-9]+]] +; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META10:![0-9]+]], !noalias [[META13:![0-9]+]] ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] ; CHECK: [[PRED_STORE_CONTINUE]]: ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]] ; CHECK: [[PRED_STORE_IF5]]: -; CHECK-NEXT: store i1 false, 
ptr [[A]], align 1, !alias.scope [[META11]], !noalias [[META14]] +; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META10]], !noalias [[META13]] ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; CHECK: [[PRED_STORE_CONTINUE6]]: ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] ; CHECK: [[PRED_STORE_IF7]]: -; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11]], !noalias [[META14]] +; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META10]], !noalias [[META13]] ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]] ; CHECK: [[PRED_STORE_CONTINUE8]]: ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] ; CHECK: [[PRED_STORE_IF9]]: -; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11]], !noalias [[META14]] +; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META10]], !noalias [[META13]] ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]] ; CHECK: [[PRED_STORE_CONTINUE10]]: ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] ; CHECK: [[PRED_STORE_IF11]]: -; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11]], !noalias [[META14]] +; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META10]], !noalias [[META13]] ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]] ; CHECK: [[PRED_STORE_CONTINUE12]]: ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] ; CHECK: [[PRED_STORE_IF13]]: -; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11]], !noalias [[META14]] +; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META10]], !noalias [[META13]] ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]] ; CHECK: [[PRED_STORE_CONTINUE14]]: ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] ; CHECK: [[PRED_STORE_IF15]]: -; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11]], !noalias [[META14]] +; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META10]], !noalias [[META13]] ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE16]] ; CHECK: [[PRED_STORE_CONTINUE16]]: ; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18]] ; CHECK: [[PRED_STORE_IF17]]: -; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META11]], !noalias [[META14]] +; CHECK-NEXT: store i1 false, ptr [[A]], align 1, !alias.scope [[META10]], !noalias [[META13]] ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]] ; CHECK: [[PRED_STORE_CONTINUE18]]: -; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP11]], i32 4, <8 x i1> [[BROADCAST_SPLAT]]), !alias.scope [[META14]] +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> zeroinitializer, ptr [[TMP11]], i32 4, <8 x i1> [[BROADCAST_SPLAT]]), !alias.scope [[META13]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: @@ -260,7 +259,7 @@ define i32 @cost_of_exit_branch_and_cond_insts(ptr %a, ptr %b, i1 %c, i16 %x) #0 ; CHECK-NEXT: [[EC:%.*]] = icmp slt i32 [[IV]], 
[[SUB]] ; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_LATCH]], label %[[EXIT:.*]] ; CHECK: [[LOOP_LATCH]]: -; CHECK-NEXT: br label %[[LOOP_HEADER]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br label %[[LOOP_HEADER]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: br label %[[RETURN:.*]] ; CHECK: [[RETURN]]: @@ -321,11 +320,10 @@ define void @test_phi_in_latch_redundant(ptr %dst, i32 %a) { ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[DST]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP19]], align 4 [[TMP16]], splat (i1 true), i32 [[TMP8]]) -; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP8]] to i64 -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH:.*]]: @@ -403,12 +401,11 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s ; CHECK-NEXT: [[TMP18:%.*]] = zext [[TMP17]] to ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[DST]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP18]], align 4 [[TMP19]], splat (i1 true), i32 [[TMP10]]) -; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 -; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]] +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH:.*]]: @@ -453,24 +450,23 @@ exit: attributes #0 = { "target-features"="+64bit,+v" } ;. 
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[META4]] = !{[[META5:![0-9]+]]} -; CHECK: [[META5]] = distinct !{[[META5]], [[META6:![0-9]+]]} -; CHECK: [[META6]] = distinct !{[[META6]], !"LVerDomain"} -; CHECK: [[META7]] = !{[[META8:![0-9]+]]} -; CHECK: [[META8]] = distinct !{[[META8]], [[META6]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META3]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]} -; CHECK: [[META11]] = !{[[META12:![0-9]+]]} -; CHECK: [[META12]] = distinct !{[[META12]], [[META13:![0-9]+]]} -; CHECK: [[META13]] = distinct !{[[META13]], !"LVerDomain"} -; CHECK: [[META14]] = !{[[META15:![0-9]+]]} -; CHECK: [[META15]] = distinct !{[[META15]], [[META13]]} -; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META3]]} -; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]} -; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]], [[META3]]} -; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]], [[META3]]} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META3]] = !{[[META4:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]], [[META5:![0-9]+]]} +; CHECK: [[META5]] = distinct !{[[META5]], !"LVerDomain"} +; CHECK: [[META6]] = !{[[META7:![0-9]+]]} +; CHECK: [[META7]] = distinct !{[[META7]], [[META5]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]} +; CHECK: [[META10]] = !{[[META11:![0-9]+]]} +; CHECK: [[META11]] = distinct !{[[META11]], [[META12:![0-9]+]]} +; CHECK: [[META12]] = distinct !{[[META12]], !"LVerDomain"} +; CHECK: [[META13]] = !{[[META14:![0-9]+]]} +; CHECK: [[META14]] = distinct !{[[META14]], [[META12]]} +; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]} +; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]]} +; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} +; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll index 4322989e1a46f..0a605563e45a9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll @@ -24,9 +24,8 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]] ; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VEC_IND]], ptr align 8 [[TMP14]], splat (i1 true), i32 [[TMP11]]) -; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP12]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -80,11 +79,10 @@ define void @test_wide_ptr_induction(ptr noalias %a, ptr noalias %b, i64 %N) { ; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP7]] to i64 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP7]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = mul i64 8, [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = mul i64 8, [[TMP9]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: scalar.ph: @@ -118,9 +116,8 @@ for.cond.cleanup: ret void } ;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]], [[META3]]} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll deleted file mode 100644 index 4de0e666149f3..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll +++ /dev/null @@ -1,333 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify)' < %s | FileCheck %s -; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify),function(simplifycfg,dce)' < %s | FileCheck %s --check-prefix=LOOP-DEL - -define void @simple(ptr noalias %a, ptr noalias %b, %c, i64 %N) vscale_range(2, 1024) { -; CHECK-LABEL: define void @simple( -; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 -; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true) -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 -; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP12]]) -; CHECK-NEXT: [[TMP18:%.*]] = add nsw [[C]], [[VP_OP_LOAD1]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0 -; CHECK-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP18]], ptr align 4 [[TMP20]], splat (i1 true), i32 [[TMP12]]) -; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] -; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[ADD:%.*]] = load i32, ptr [[ARRAYIDX]], 
align 4 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void -; -; LOOP-DEL-LABEL: define void @simple( -; LOOP-DEL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; LOOP-DEL-NEXT: entry: -; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] -; LOOP-DEL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; LOOP-DEL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 -; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] -; LOOP-DEL: vector.ph: -; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]] -; LOOP-DEL: vector.body: -; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; LOOP-DEL-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; LOOP-DEL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP4]], i32 4, i1 true) -; LOOP-DEL-NEXT: [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP6]] -; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 -; LOOP-DEL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], splat (i1 true), i32 [[TMP5]]) -; LOOP-DEL-NEXT: [[TMP11:%.*]] = add nsw [[C]], [[VP_OP_LOAD1]] -; LOOP-DEL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]] -; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 -; LOOP-DEL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP11]], ptr align 4 [[TMP13]], splat (i1 true), i32 [[TMP5]]) -; LOOP-DEL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP5]] to i64 -; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]] -; LOOP-DEL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] -; LOOP-DEL-NEXT: br i1 [[TMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; LOOP-DEL: for.body: -; LOOP-DEL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; LOOP-DEL-NEXT: [[ADD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; LOOP-DEL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; LOOP-DEL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 -; LOOP-DEL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; LOOP-DEL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; LOOP-DEL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; LOOP-DEL: for.cond.cleanup: -; LOOP-DEL-NEXT: ret void -; -entry: - %0 = sub i64 -1, %N - %1 = call i64 @llvm.vscale.i64() - %2 = mul i64 %1, 4 - %3 = icmp ult i64 %0, %2 - br i1 %3, label %scalar.ph, label %vector.ph - -vector.ph: ; preds = %entry - %4 = call i64 @llvm.vscale.i64() - %5 = mul i64 %4, 4 - %6 = call i64 @llvm.vscale.i64() - %7 = mul i64 %6, 4 - %8 = sub i64 %7, 1 - %n.rnd.up = add i64 %N, %8 - %n.mod.vf = 
urem i64 %n.rnd.up, %5 - %n.vec = sub i64 %n.rnd.up, %n.mod.vf - %9 = call i64 @llvm.vscale.i64() - %10 = mul i64 %9, 4 - br label %vector.body - -vector.body: ; preds = %vector.body, %vector.ph - %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ] - %11 = sub i64 %N, %evl.based.iv - %12 = call i32 @llvm.experimental.get.vector.length.i64(i64 %11, i32 4, i1 true) - %13 = add i64 %evl.based.iv, 0 - %14 = getelementptr inbounds i32, ptr %b, i64 %13 - %15 = getelementptr inbounds i32, ptr %14, i32 0 - %vp.op.load = call @llvm.vp.load.nxv4i32.p0(ptr align 4 %15, splat (i1 true), i32 %12) - %18 = add nsw %c, %vp.op.load - %19 = getelementptr inbounds i32, ptr %a, i64 %13 - %20 = getelementptr inbounds i32, ptr %19, i32 0 - call void @llvm.vp.store.nxv4i32.p0( %18, ptr align 4 %20, splat (i1 true), i32 %12) - %21 = zext i32 %12 to i64 - %index.evl.next = add i64 %21, %evl.based.iv - %index.next = add nuw i64 %index, %10 - %22 = icmp eq i64 %index.next, %n.vec - br i1 %22, label %middle.block, label %vector.body, !llvm.loop !0 - -middle.block: ; preds = %vector.body - br i1 true, label %for.cond.cleanup, label %scalar.ph - -scalar.ph: ; preds = %entry, %middle.block - %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %entry ] - br label %for.body - -for.body: ; preds = %for.body, %scalar.ph - %iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv - %23 = load i32, ptr %arrayidx, align 4 - %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %23, ptr %arrayidx4, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !3 - -for.cond.cleanup: ; preds = %middle.block, %for.body - ret void -} - -; Fixed IV steps resulting from vscale_range with a single element - -define void @fixed_iv_step(ptr %arg0, ptr %arg1, i64 %N) #0 { -; CHECK-LABEL: define void @fixed_iv_step( -; CHECK-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[ARG0]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true) -; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]] -; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0( [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], splat (i1 true), i32 [[TMP1]]) -; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4]] -; CHECK: for.end.loopexit5: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -; LOOP-DEL-LABEL: define void @fixed_iv_step( -; LOOP-DEL-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], i64 [[N:%.*]]) 
#[[ATTR1:[0-9]+]] { -; LOOP-DEL-NEXT: entry: -; LOOP-DEL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[ARG0]], i64 0 -; LOOP-DEL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]] -; LOOP-DEL: vector.body: -; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; LOOP-DEL-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true) -; LOOP-DEL-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]] -; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0( [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], splat (i1 true), i32 [[TMP1]]) -; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]] -; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] -; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4]] -; LOOP-DEL: for.end: -; LOOP-DEL-NEXT: ret void -; -entry: - br label %vector.ph - -vector.ph: - %n.rnd.up = add nsw i64 %N, 15 - %n.vec = and i64 %n.rnd.up, -16 - %broadcast.splatinsert = insertelement poison, ptr %arg0, i64 0 - %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer - br label %vector.body - -vector.body: - %lsr.iv32 = phi i64 [ %lsr.iv.next33, %vector.body ], [ %n.vec, %vector.ph ] - %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ] - %41 = sub i64 %N, %evl.based.iv - %42 = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %41, i32 2, i1 true) - %gep = getelementptr ptr, ptr %arg1, i64 %evl.based.iv - tail call void @llvm.vp.store.nxv2p0.p0( %broadcast.splat, ptr align 8 %gep, splat (i1 true), i32 %42) - %43 = zext i32 %42 to i64 - %index.evl.next = add i64 %evl.based.iv, %43 - %lsr.iv.next33 = add i64 %lsr.iv32, -16 - %44 = icmp eq i64 %lsr.iv.next33, 0 - br i1 %44, label %for.end.loopexit5, label %vector.body, !llvm.loop !3 - -for.end.loopexit5: - br label %for.end - -for.end: - ret void -} - -; Fixed IV step and trip count -define void @fixed_iv_step_tc(ptr %arg0, ptr %arg1) #0 { -; CHECK-LABEL: define void @fixed_iv_step_tc( -; CHECK-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[ARG0]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = sub i64 87, [[EVL_BASED_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true) -; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]] -; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0( [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], splat (i1 true), i32 [[TMP1]]) -; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 87 -; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP4]] -; CHECK: for.end.loopexit5: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -; LOOP-DEL-LABEL: define void @fixed_iv_step_tc( -; LOOP-DEL-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]]) #[[ATTR1]] { -; LOOP-DEL-NEXT: entry: -; LOOP-DEL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[ARG0]], i64 0 -; LOOP-DEL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]] -; LOOP-DEL: vector.body: -; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 87, [[EVL_BASED_IV]] -; LOOP-DEL-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true) -; LOOP-DEL-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]] -; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0( [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], splat (i1 true), i32 [[TMP1]]) -; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]] -; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 87 -; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4]] -; LOOP-DEL: for.end: -; LOOP-DEL-NEXT: ret void -; -entry: - br label %vector.ph - -vector.ph: - %n.rnd.up = add nsw i64 87, 15 - %n.vec = and i64 %n.rnd.up, -16 - %broadcast.splatinsert = insertelement poison, ptr %arg0, i64 0 - %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer - br label %vector.body - -vector.body: - %lsr.iv32 = phi i64 [ %lsr.iv.next33, %vector.body ], [ %n.vec, %vector.ph ] - %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ] - %41 = sub i64 87, %evl.based.iv - %42 = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %41, i32 2, i1 true) - %gep = getelementptr ptr, ptr %arg1, i64 %evl.based.iv - tail call void @llvm.vp.store.nxv2p0.p0( %broadcast.splat, ptr align 8 %gep, splat (i1 true), i32 %42) - %43 = zext i32 %42 to i64 - %index.evl.next = add i64 %evl.based.iv, %43 - %lsr.iv.next33 = add i64 %lsr.iv32, -16 - %44 = icmp eq i64 %lsr.iv.next33, 0 - br i1 %44, label %for.end.loopexit5, label %vector.body, !llvm.loop !3 - -for.end.loopexit5: - br label %for.end - -for.end: - ret void -} - -declare i64 @llvm.vscale.i64() - -declare i32 @llvm.experimental.get.vector.length.i64(i64, i32 immarg, i1 immarg) - -declare @llvm.vp.load.nxv4i32.p0(ptr nocapture, , i32) - -declare void @llvm.vp.store.nxv4i32.p0(, ptr nocapture, , i32) - -attributes #0 = { vscale_range(8,8) } - -!0 = distinct !{!0, !1, !2, !4} -!1 = !{!"llvm.loop.isvectorized", i32 1} -!2 = !{!"llvm.loop.unroll.runtime.disable"} -!3 = distinct !{!3, !2, !1, !4} -!4 = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[META3]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]], [[META3]]} -;. 
-; LOOP-DEL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} -; LOOP-DEL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; LOOP-DEL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; LOOP-DEL: [[META3]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; LOOP-DEL: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]], [[META3]]} -;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll index 4e7145e5422bd..9f747dd6222a2 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/fminimumnum.ll @@ -635,7 +635,7 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -690,7 +690,7 @@ define void @fmin16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2 ; ZVFHMIN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; ZVFHMIN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 4096 -; ZVFHMIN-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; ZVFHMIN-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; ZVFHMIN: [[EXIT]]: ; ZVFHMIN-NEXT: ret void ; @@ -752,7 +752,7 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[INDEX]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]] ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: @@ -768,7 +768,7 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; CHECK-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -807,7 +807,7 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP16]], [[INDEX]] ; ZVFHMIN-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] ; ZVFHMIN-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; ZVFHMIN-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; ZVFHMIN-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; ZVFHMIN: [[MIDDLE_BLOCK]]: ; ZVFHMIN-NEXT: br label 
%[[EXIT:.*]] ; ZVFHMIN: [[SCALAR_PH]]: @@ -823,7 +823,7 @@ define void @fmax16(ptr noundef readonly captures(none) %input1, ptr noundef rea ; ZVFHMIN-NEXT: store half [[OUT]], ptr [[ARRAYIDX4]], align 2 ; ZVFHMIN-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; ZVFHMIN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 4096 -; ZVFHMIN-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; ZVFHMIN-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; ZVFHMIN: [[EXIT]]: ; ZVFHMIN-NEXT: ret void ; @@ -859,11 +859,10 @@ declare half @llvm.maximumnum.f16(half, half) ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} ; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} ; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META11:![0-9]+]], [[META2]]} -; CHECK: [[META11]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]} -; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META11]], [[META2]]} -; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]} ;. ; ZVFHMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; ZVFHMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -875,9 +874,8 @@ declare half @llvm.maximumnum.f16(half, half) ; ZVFHMIN: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} ; ZVFHMIN: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} ; ZVFHMIN: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]} -; ZVFHMIN: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META11:![0-9]+]], [[META2]]} -; ZVFHMIN: [[META11]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; ZVFHMIN: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]} -; ZVFHMIN: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META11]], [[META2]]} -; ZVFHMIN: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]} +; ZVFHMIN: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; ZVFHMIN: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]} +; ZVFHMIN: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; ZVFHMIN: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll index ea2ccb07b388b..5df4f703c1b1b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/gather-scatter-cost.ll @@ -111,8 +111,7 @@ define void @predicated_strided_store(ptr %start) { ; RVA23-NEXT: [[TMP4:%.*]] = mul [[VEC_IND]], splat (i64 7) ; RVA23-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[START:%.*]], [[TMP4]] ; RVA23-NEXT: call void @llvm.vp.scatter.nxv8i8.nxv8p0( zeroinitializer, align 1 [[TMP5]], splat (i1 true), i32 [[TMP2]]) -; RVA23-NEXT: [[TMP6:%.*]] = zext i32 [[TMP2]] to i64 -; RVA23-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP6]] +; RVA23-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] ; RVA23-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; RVA23-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; RVA23-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -149,8 +148,7 @@ define void @predicated_strided_store(ptr %start) { ; RVA23ZVL1024B-NEXT: [[TMP4:%.*]] = mul [[VEC_IND]], splat (i64 7) ; RVA23ZVL1024B-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[START:%.*]], [[TMP4]] ; RVA23ZVL1024B-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0( zeroinitializer, align 1 [[TMP5]], splat (i1 true), i32 [[TMP2]]) -; RVA23ZVL1024B-NEXT: [[TMP6:%.*]] = zext i32 [[TMP2]] to i64 -; RVA23ZVL1024B-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP6]] +; RVA23ZVL1024B-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] ; RVA23ZVL1024B-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; RVA23ZVL1024B-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; RVA23ZVL1024B-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -213,8 +211,7 @@ define void @store_to_addr_generated_from_invariant_addr(ptr noalias %p0, ptr no ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0( zeroinitializer, align 4 [[TMP7]], splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i32.nxv2p0( zeroinitializer, align 4 [[TMP7]], splat (i1 true), i32 [[TMP3]]) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv2i8.nxv2p0( zeroinitializer, align 1 [[TMP7]], splat (i1 true), i32 [[TMP3]]) -; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP3]] to i64 -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP4]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll index e0831be75d96f..4d97a659e94e9 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/induction-costs.ll @@ -69,8 +69,7 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 { ; CHECK-NEXT: [[DOTSPLAT25:%.*]] = shufflevector [[DOTSPLATINSERT24]], poison, zeroinitializer ; CHECK-NEXT: [[TMP59:%.*]] = getelementptr i16, ptr [[A]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv8i16.nxv8p0( zeroinitializer, align 2 [[TMP59]], splat (i1 true), i32 [[TMP27]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] -; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP27]] to i64 
-; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP47]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP28]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT25]] ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -91,7 +90,7 @@ define void @skip_free_iv_truncate(i16 %x, ptr %A) #0 { ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 3 ; CHECK-NEXT: [[TMP64]] = trunc i64 [[IV_NEXT]] to i32 ; CHECK-NEXT: [[C:%.*]] = icmp slt i64 [[IV]], 99 -; CHECK-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -120,7 +119,172 @@ exit: ret void } +define void @test_3_inductions(ptr noalias %dst, ptr noalias %src, i64 %n) #1 { +; CHECK-LABEL: define void @test_3_inductions( +; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[DST]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.stepvector.nxv2i32() +; CHECK-NEXT: [[TMP2:%.*]] = mul [[TMP1]], splat (i32 2) +; CHECK-NEXT: [[INDUCTION:%.*]] = add splat (i32 1), [[TMP2]] +; CHECK-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP2]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND2:%.*]] = phi [ [[INDUCTION1]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP0]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) +; CHECK-NEXT: [[TMP4:%.*]] = mul i32 2, [[TMP3]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = or [[VEC_IND2]], [[VEC_IND]] +; CHECK-NEXT: [[TMP6:%.*]] = sext [[TMP5]] to +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], [[TMP6]] +; CHECK-NEXT: call void @llvm.vp.scatter.nxv2p0.nxv2p0( [[TMP7]], align 8 [[BROADCAST_SPLAT]], splat (i1 true), i32 [[TMP3]]) +; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[VEC_IND_NEXT5]] = add [[VEC_IND2]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SCALAR_PH:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV_0:%.*]] = phi i32 [ 1, %[[SCALAR_PH]] ], [ [[IV_0_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ 0, 
%[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_OR:%.*]] = or i32 [[IV_2]], [[IV_0]] +; CHECK-NEXT: [[IV_OR_EXT:%.*]] = sext i32 [[IV_OR]] to i64 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_OR_EXT]] +; CHECK-NEXT: store ptr [[GEP_SRC]], ptr [[DST]], align 8 +; CHECK-NEXT: [[IV_0_NEXT]] = add i32 [[IV_0]], 2 +; CHECK-NEXT: [[IV_1_NEXT]] = add i64 [[IV_1]], 1 +; CHECK-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 2 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_1]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv.0 = phi i32 [ 1, %entry ], [ %iv.0.next, %loop ] + %iv.1 = phi i64 [ 0, %entry ], [ %iv.1.next, %loop ] + %iv.2 = phi i32 [ 0, %entry ], [ %iv.2.next, %loop ] + %iv.or = or i32 %iv.2, %iv.0 + %iv.or.ext = sext i32 %iv.or to i64 + %gep.src = getelementptr i8, ptr %src, i64 %iv.or.ext + store ptr %gep.src, ptr %dst, align 8 + %iv.0.next = add i32 %iv.0, 2 + %iv.1.next = add i64 %iv.1, 1 + %iv.2.next = add i32 %iv.2, 2 + %ec = icmp eq i64 %iv.1, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @redundant_iv_trunc_for_cse(ptr noalias %src, ptr noalias %dst, i64 %n) #0 { +; CHECK-LABEL: define void @redundant_iv_trunc_for_cse( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[TMP2:%.*]] = mul [[TMP1]], splat (i32 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP2]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND1:%.*]] = phi [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[TMP0]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP3]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[EVL_BASED_IV]] +; CHECK-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP4]], splat (i1 true), i32 [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq [[VP_OP_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = shl [[VEC_IND1]], splat (i32 16) +; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP5]], [[TMP6]], [[VEC_IND]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc [[PREDPHI]] to +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[EVL_BASED_IV]] +; CHECK-NEXT: call void @llvm.vp.store.nxv4i8.p0( [[TMP7]], ptr align 1 [[TMP8]], splat (i1 true), i32 [[TMP3]]) +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP9]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[VEC_IND_NEXT2]] = add 
[[VEC_IND1]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SCALAR_PH:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[C_0:%.*]] = icmp eq i32 [[L]], 0 +; CHECK-NEXT: [[TRUNC_IV:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-NEXT: br i1 [[C_0]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[TRUNC_IV_2:%.*]] = trunc i64 [[IV]] to i32 +; CHECK-NEXT: [[SHL_IV:%.*]] = shl i32 [[TRUNC_IV_2]], 16 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[SHL_IV]], %[[THEN]] ], [ [[TRUNC_IV]], %[[LOOP_HEADER]] ] +; CHECK-NEXT: [[TRUNC_P:%.*]] = trunc i32 [[P]] to i8 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i8 [[TRUNC_P]], ptr [[GEP_DST]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv + %l = load i32, ptr %gep.src + %c.0 = icmp eq i32 %l, 0 + %trunc.iv = trunc i64 %iv to i32 + br i1 %c.0, label %then, label %loop.latch + +then: + %trunc.iv.2 = trunc i64 %iv to i32 + %shl.iv = shl i32 %trunc.iv.2, 16 + br label %loop.latch + +loop.latch: + %p = phi i32 [ %shl.iv, %then ], [ %trunc.iv, %loop.header ] + %trunc.p = trunc i32 %p to i8 + %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv + store i8 %trunc.p, ptr %gep.dst, align 1 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, %n + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + + + attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } +attributes #1 = { "target-cpu"="sifive-p670" } ;. ; CHECK: [[META0]] = !{[[META1:![0-9]+]]} ; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} @@ -128,9 +292,10 @@ attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } ; CHECK: [[META3]] = !{[[META4:![0-9]+]], [[META5:![0-9]+]]} ; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]} ; CHECK: [[META5]] = distinct !{[[META5]], [[META2]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META7:![0-9]+]], [[META8:![0-9]+]], [[META9:![0-9]+]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META7:![0-9]+]], [[META8:![0-9]+]]} ; CHECK: [[META7]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META8]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[META9]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META7]]} +; CHECK: [[META8]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META7]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META7]], [[META8]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META7]], [[META8]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll index 86b3a7e32c852..ae6c90c5ce188 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll @@ -34,9 +34,8 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) { ; VLENUNK-NEXT: [[TMP17:%.*]] = add [[PREDPHI]], [[BROADCAST_SPLAT]] ; VLENUNK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] ; VLENUNK-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP17]], ptr align 4 [[TMP18]], splat (i1 true), i32 [[TMP7]]) -; VLENUNK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP7]] to i64 -; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP19]], [[INDEX]] -; VLENUNK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] +; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]] +; VLENUNK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]] ; VLENUNK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; VLENUNK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; VLENUNK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll index 7d32302abfe24..89819f2be4967 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -52,8 +52,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP17]] ; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] ; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0( [[TMP18]], align 8 [[TMP19]], [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] -; RV32-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 -; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]] +; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; RV32-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; RV32-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -121,8 +120,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64-NEXT: [[TMP18:%.*]] = fadd [[WIDE_MASKED_GATHER6]], [[TMP17]] ; RV64-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], [[VEC_IND]] ; RV64-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0( [[TMP18]], align 8 [[TMP19]], [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] -; RV64-NEXT: [[TMP20:%.*]] = zext i32 [[TMP10]] to i64 -; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]] +; RV64-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; RV64-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; RV64-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; RV64-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll index edacefaa1a1e3..782c2f6c24fa4 100644 --- 
a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll @@ -43,9 +43,8 @@ define void @pr87378_vpinstruction_or_drop_poison_generating_flags(ptr %arg, i64 ; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP23]], i64 poison, i64 [[INDEX]] ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i16, ptr [[ARG]], i64 [[PREDPHI]] ; CHECK-NEXT: call void @llvm.vp.store.nxv8i16.p0( zeroinitializer, ptr align 2 [[TMP24]], [[TMP22]], i32 [[TMP25]]) -; CHECK-NEXT: [[TMP26:%.*]] = zext i32 [[TMP25]] to i64 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP26]], [[INDEX]] -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP26]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -111,8 +110,7 @@ exit: ret void } ;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll index 06fce26415e47..3739f85afe740 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll @@ -89,8 +89,7 @@ exit: ; preds = %for.body ret void } ;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-prune-vf.ll b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-prune-vf.ll index 5876a6bf32848..93c0a7455165b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-prune-vf.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/reg-usage-prune-vf.ll @@ -12,11 +12,9 @@ define void @f(ptr noalias %p0, ptr noalias %p1, ptr noalias %p2) { ; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul [[TMP0]], splat (i64 2) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul [[TMP2]], splat (i64 3) +; CHECK-NEXT: [[TMP3:%.*]] = mul [[TMP0]], splat (i64 3) ; CHECK-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul [[TMP4]], splat (i64 4) +; CHECK-NEXT: [[TMP5:%.*]] = mul [[TMP0]], splat (i64 4) ; CHECK-NEXT: [[INDUCTION2:%.*]] = add zeroinitializer, [[TMP5]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: @@ -30,12 +28,10 @@ define void @f(ptr noalias %p0, ptr noalias %p1, ptr noalias %p2) { ; CHECK-NEXT: [[TMP8:%.*]] = mul i64 4, [[TMP7]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 -; CHECK-NEXT: [[TMP10:%.*]] = mul i64 3, [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = mul i64 3, [[TMP7]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP6]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 2, [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 2, [[TMP7]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = sub [[VEC_IND]], splat (i64 1) @@ -53,9 +49,8 @@ define void @f(ptr noalias %p0, ptr noalias %p1, ptr noalias %p2) { ; CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP6]], 3 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave3.nxv12i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER9]], [[WIDE_MASKED_GATHER10]]) ; CHECK-NEXT: call void @llvm.vp.store.nxv12i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP21]], splat (i1 true), i32 [[INTERLEAVE_EVL]]) -; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64 -; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[EVL_BASED_IV]] -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]] +; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[EVL_BASED_IV]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT8]] ; CHECK-NEXT: [[VEC_IND_NEXT11]] = add [[VEC_IND3]], [[BROADCAST_SPLAT6]] ; CHECK-NEXT: [[VEC_IND_NEXT12]] = add [[VEC_IND4]], [[BROADCAST_SPLAT]] @@ -104,11 +99,9 @@ define void @f(ptr noalias %p0, ptr noalias %p1, ptr noalias %p2) { ; NO-REG-PRESSURE-CHECK-NEXT: [[TMP0:%.*]] = call @llvm.stepvector.nxv8i64() ; NO-REG-PRESSURE-CHECK-NEXT: [[TMP1:%.*]] = mul [[TMP0]], splat (i64 2) ; NO-REG-PRESSURE-CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP1]] 
-; NO-REG-PRESSURE-CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv8i64() -; NO-REG-PRESSURE-CHECK-NEXT: [[TMP3:%.*]] = mul [[TMP2]], splat (i64 3) +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP3:%.*]] = mul [[TMP0]], splat (i64 3) ; NO-REG-PRESSURE-CHECK-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP3]] -; NO-REG-PRESSURE-CHECK-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv8i64() -; NO-REG-PRESSURE-CHECK-NEXT: [[TMP5:%.*]] = mul [[TMP4]], splat (i64 4) +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP5:%.*]] = mul [[TMP0]], splat (i64 4) ; NO-REG-PRESSURE-CHECK-NEXT: [[INDUCTION2:%.*]] = add zeroinitializer, [[TMP5]] ; NO-REG-PRESSURE-CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; NO-REG-PRESSURE-CHECK: [[VECTOR_BODY]]: @@ -122,12 +115,10 @@ define void @f(ptr noalias %p0, ptr noalias %p1, ptr noalias %p2) { ; NO-REG-PRESSURE-CHECK-NEXT: [[TMP8:%.*]] = mul i64 4, [[TMP7]] ; NO-REG-PRESSURE-CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 ; NO-REG-PRESSURE-CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; NO-REG-PRESSURE-CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 -; NO-REG-PRESSURE-CHECK-NEXT: [[TMP10:%.*]] = mul i64 3, [[TMP9]] +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP10:%.*]] = mul i64 3, [[TMP7]] ; NO-REG-PRESSURE-CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 ; NO-REG-PRESSURE-CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer -; NO-REG-PRESSURE-CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP6]] to i64 -; NO-REG-PRESSURE-CHECK-NEXT: [[TMP12:%.*]] = mul i64 2, [[TMP11]] +; NO-REG-PRESSURE-CHECK-NEXT: [[TMP12:%.*]] = mul i64 2, [[TMP7]] ; NO-REG-PRESSURE-CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; NO-REG-PRESSURE-CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer ; NO-REG-PRESSURE-CHECK-NEXT: [[TMP13:%.*]] = sub [[VEC_IND]], splat (i64 1) @@ -145,9 +136,8 @@ define void @f(ptr noalias %p0, ptr noalias %p1, ptr noalias %p2) { ; NO-REG-PRESSURE-CHECK-NEXT: [[INTERLEAVE_EVL:%.*]] = mul nuw nsw i32 [[TMP6]], 3 ; NO-REG-PRESSURE-CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave3.nxv24i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER9]], [[WIDE_MASKED_GATHER10]]) ; NO-REG-PRESSURE-CHECK-NEXT: call void @llvm.vp.store.nxv24i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP21]], splat (i1 true), i32 [[INTERLEAVE_EVL]]) -; NO-REG-PRESSURE-CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64 -; NO-REG-PRESSURE-CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP22]], [[EVL_BASED_IV]] -; NO-REG-PRESSURE-CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP22]] +; NO-REG-PRESSURE-CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP7]], [[EVL_BASED_IV]] +; NO-REG-PRESSURE-CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; NO-REG-PRESSURE-CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT8]] ; NO-REG-PRESSURE-CHECK-NEXT: [[VEC_IND_NEXT11]] = add [[VEC_IND3]], [[BROADCAST_SPLAT6]] ; NO-REG-PRESSURE-CHECK-NEXT: [[VEC_IND_NEXT12]] = add [[VEC_IND4]], [[BROADCAST_SPLAT]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll index 799efbd5f26ea..7b8404abdc54b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/remark-reductions.ll @@ -50,8 +50,7 @@ exit: ret 
float %red.lcssa } ;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index 4c84913eea23d..ca1c710e614f3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -26,8 +26,7 @@ define void @single_constant_stride_int_scaled(ptr %p) { ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP15]], splat (i1 true), i32 [[TMP11]]) ; CHECK-NEXT: [[TMP16:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP16]], align 4 [[TMP15]], splat (i1 true), i32 [[TMP11]]) -; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -143,8 +142,7 @@ define void @single_constant_stride_int_iv(ptr %p) { ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP12]], splat (i1 true), i32 [[TMP7]]) ; CHECK-NEXT: [[TMP13:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP13]], align 4 [[TMP12]], splat (i1 true), i32 [[TMP7]]) -; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64 -; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] +; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -261,8 +259,7 @@ define void @single_constant_stride_ptr_iv(ptr %p) { ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP20]], align 4 [[VECTOR_GEP]], splat (i1 true), i32 [[TMP11]]) ; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP11]] to i64 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 8, [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = mul i64 8, [[TMP9]] ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] @@ -834,8 +831,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[TMP20:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) ; STRIDED-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[P2]], [[TMP18]] ; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP20]], align 4 [[TMP21]], splat (i1 true), i32 [[TMP43]]), !alias.scope [[META9:![0-9]+]], !noalias [[META6]] -; STRIDED-NEXT: [[TMP46:%.*]] = 
zext i32 [[TMP43]] to i64 -; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP46]] +; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP44]] ; STRIDED-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; STRIDED-NEXT: [[TMP41:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; STRIDED-NEXT: br i1 [[TMP41]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] @@ -1184,21 +1180,16 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[DOTSPLAT10:%.*]] = shufflevector [[DOTSPLATINSERT9]], poison, zeroinitializer ; STRIDED-NEXT: [[TMP18:%.*]] = mul [[TMP19]], [[DOTSPLAT10]] ; STRIDED-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI11]], [[TMP18]] -; STRIDED-NEXT: [[TMP27:%.*]] = call @llvm.stepvector.nxv4i64() -; STRIDED-NEXT: [[TMP21:%.*]] = mul [[TMP27]], [[DOTSPLAT10]] -; STRIDED-NEXT: [[VECTOR_GEP7:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP21]] +; STRIDED-NEXT: [[VECTOR_GEP7:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP18]] ; STRIDED-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[VECTOR_GEP7]], splat (i1 true), i32 [[TMP14]]), !alias.scope [[META13:![0-9]+]] ; STRIDED-NEXT: [[TMP30:%.*]] = add [[WIDE_MASKED_GATHER]], splat (i32 1) ; STRIDED-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[TMP30]], align 4 [[VECTOR_GEP]], splat (i1 true), i32 [[TMP14]]), !alias.scope [[META16:![0-9]+]], !noalias [[META13]] ; STRIDED-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 ; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] -; STRIDED-NEXT: [[TMP20:%.*]] = zext i32 [[TMP14]] to i64 -; STRIDED-NEXT: [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP20]] +; STRIDED-NEXT: [[TMP25:%.*]] = mul i64 [[STRIDE]], [[TMP16]] ; STRIDED-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP25]] -; STRIDED-NEXT: [[TMP22:%.*]] = zext i32 [[TMP14]] to i64 -; STRIDED-NEXT: [[TMP17:%.*]] = mul i64 [[STRIDE]], [[TMP22]] -; STRIDED-NEXT: [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP17]] +; STRIDED-NEXT: [[PTR_IND12]] = getelementptr i8, ptr [[POINTER_PHI11]], i64 [[TMP25]] ; STRIDED-NEXT: [[TMP23:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; STRIDED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; STRIDED: middle.block: @@ -1270,9 +1261,7 @@ define void @double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-UF2-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector [[BROADCAST_SPLATINSERT10]], poison, zeroinitializer ; STRIDED-UF2-NEXT: [[TMP16:%.*]] = mul [[TMP15]], [[BROADCAST_SPLAT11]] ; STRIDED-UF2-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI9]], [[TMP16]] -; STRIDED-UF2-NEXT: [[TMP17:%.*]] = call @llvm.stepvector.nxv4i64() -; STRIDED-UF2-NEXT: [[TMP18:%.*]] = mul [[TMP17]], [[BROADCAST_SPLAT11]] -; STRIDED-UF2-NEXT: [[VECTOR_GEP12:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP18]] +; STRIDED-UF2-NEXT: [[VECTOR_GEP12:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP16]] ; STRIDED-UF2-NEXT: [[STEP_ADD:%.*]] = getelementptr i8, [[VECTOR_GEP12]], [[TMP14]] ; STRIDED-UF2-NEXT: [[STEP_ADD13:%.*]] = getelementptr i8, [[VECTOR_GEP]], [[TMP14]] ; STRIDED-UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[VECTOR_GEP12]], i32 4, splat (i1 true), poison), !alias.scope [[META15:![0-9]+]] @@ -1284,8 +1273,7 @@ define void 
@double_stride_ptr_iv(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; STRIDED-UF2-NEXT: [[TMP21:%.*]] = mul i64 [[STRIDE]], [[TMP9]] ; STRIDED-UF2-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP21]] -; STRIDED-UF2-NEXT: [[TMP22:%.*]] = mul i64 [[STRIDE]], [[TMP9]] -; STRIDED-UF2-NEXT: [[PTR_IND15]] = getelementptr i8, ptr [[POINTER_PHI9]], i64 [[TMP22]] +; STRIDED-UF2-NEXT: [[PTR_IND15]] = getelementptr i8, ptr [[POINTER_PHI9]], i64 [[TMP21]] ; STRIDED-UF2-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; STRIDED-UF2-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; STRIDED-UF2: middle.block: @@ -1363,9 +1351,8 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; NOSTRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i64.nxv2p0( align 8 [[TMP4]], splat (i1 true), i32 [[TMP2]]) ; NOSTRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[EVL_BASED_IV]] ; NOSTRIDED-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[WIDE_MASKED_GATHER]], ptr align 8 [[TMP5]], splat (i1 true), i32 [[TMP2]]) -; NOSTRIDED-NEXT: [[TMP6:%.*]] = zext i32 [[TMP2]] to i64 -; NOSTRIDED-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP6]], [[EVL_BASED_IV]] -; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP6]] +; NOSTRIDED-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP3]], [[EVL_BASED_IV]] +; NOSTRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] ; NOSTRIDED-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; NOSTRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; NOSTRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] @@ -1459,9 +1446,8 @@ define void @constant_stride_reinterpret(ptr noalias %in, ptr noalias %out) { ; STRIDED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i64.nxv2p0( align 8 [[TMP4]], splat (i1 true), i32 [[TMP2]]) ; STRIDED-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i64, ptr [[OUT:%.*]], i64 [[EVL_BASED_IV]] ; STRIDED-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[WIDE_MASKED_GATHER]], ptr align 8 [[TMP5]], splat (i1 true), i32 [[TMP2]]) -; STRIDED-NEXT: [[TMP6:%.*]] = zext i32 [[TMP2]] to i64 -; STRIDED-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP6]], [[EVL_BASED_IV]] -; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP6]] +; STRIDED-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP3]], [[EVL_BASED_IV]] +; STRIDED-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP3]] ; STRIDED-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; STRIDED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; STRIDED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-bin-unary-ops-args.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-bin-unary-ops-args.ll index 6d13d81fe2e2e..0bcd027588f7d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-bin-unary-ops-args.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-bin-unary-ops-args.ll @@ -50,7 +50,7 @@ define void @test_and(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label 
%[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -151,7 +151,7 @@ define void @test_or(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP10]], [[TMP18]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -165,7 +165,7 @@ define void @test_or(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -266,7 +266,7 @@ define void @test_xor(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP10]], [[TMP18]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -280,7 +280,7 @@ define void @test_xor(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -381,7 +381,7 @@ define void @test_shl(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP10]], [[TMP18]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -395,7 +395,7 @@ define void @test_shl(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label 
%[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -496,7 +496,7 @@ define void @test_lshr(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP10]], [[TMP18]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -510,7 +510,7 @@ define void @test_lshr(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -611,7 +611,7 @@ define void @test_ashr(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP10]], [[TMP18]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -625,7 +625,7 @@ define void @test_ashr(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -726,7 +726,7 @@ define void @test_add(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP10]], [[TMP18]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -740,7 +740,7 @@ define void @test_add(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label 
%[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -841,7 +841,7 @@ define void @test_sub(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP10]], [[TMP18]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -855,7 +855,7 @@ define void @test_sub(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -956,7 +956,7 @@ define void @test_mul(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP10]], [[TMP18]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -970,7 +970,7 @@ define void @test_mul(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -1071,7 +1071,7 @@ define void @test_sdiv(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP10]], [[TMP18]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -1085,7 +1085,7 @@ define void @test_sdiv(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], 
label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP22:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -1186,7 +1186,7 @@ define void @test_udiv(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP10]], [[TMP18]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -1200,7 +1200,7 @@ define void @test_udiv(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP24:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -1301,7 +1301,7 @@ define void @test_srem(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP10]], [[TMP18]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -1315,7 +1315,7 @@ define void @test_srem(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP26:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP25:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -1416,7 +1416,7 @@ define void @test_urem(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP10]], [[TMP18]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -1430,7 +1430,7 @@ define void @test_urem(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store i8 [[TMP]], ptr [[ARRAYIDX1]], align 1 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br 
i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP28:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -1534,7 +1534,7 @@ define void @test_fadd(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP19]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP11]], [[TMP19]] ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -1548,7 +1548,7 @@ define void @test_fadd(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP30:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -1651,7 +1651,7 @@ define void @test_fsub(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP19]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP11]], [[TMP19]] ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -1665,7 +1665,7 @@ define void @test_fsub(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP32:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP31:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -1768,7 +1768,7 @@ define void @test_fmul(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP19]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP11]], [[TMP19]] ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -1782,7 +1782,7 @@ define void @test_fmul(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq 
i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP34:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -1885,7 +1885,7 @@ define void @test_fdiv(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP19]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP11]], [[TMP19]] ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -1899,7 +1899,7 @@ define void @test_fdiv(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP36:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP35:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -2055,7 +2055,7 @@ define void @test_fneg(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP19]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP11]], [[TMP19]] ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FINISH_LOOPEXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -2069,7 +2069,7 @@ define void @test_fneg(ptr nocapture %a, ptr nocapture readonly %b) { ; IF-EVL-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[LEN]] ; IF-EVL-NEXT: store float [[TMP]], ptr [[ARRAYIDX1]], align 4 ; IF-EVL-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[DEC]], 100 -; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP38:![0-9]+]] +; IF-EVL-NEXT: br i1 [[DOTNOT]], label %[[FINISH_LOOPEXIT]], label %[[LOOP]], !llvm.loop [[LOOP37:![0-9]+]] ; IF-EVL: [[FINISH_LOOPEXIT]]: ; IF-EVL-NEXT: ret void ; @@ -2143,45 +2143,44 @@ finish.loopexit: ret void } ;. 
-; IF-EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; IF-EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; IF-EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; IF-EVL: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; IF-EVL: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} -; IF-EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]} -; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]} -; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]]} -; IF-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]} -; IF-EVL: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]} -; IF-EVL: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]} -; IF-EVL: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]]} -; IF-EVL: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]]} -; IF-EVL: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]]} -; IF-EVL: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]]} -; IF-EVL: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]]} -; IF-EVL: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]]} -; IF-EVL: [[LOOP27]] = distinct !{[[LOOP27]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]]} -; IF-EVL: [[LOOP29]] = distinct !{[[LOOP29]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]]} -; IF-EVL: [[LOOP31]] = distinct !{[[LOOP31]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]]} -; IF-EVL: [[LOOP33]] = distinct !{[[LOOP33]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]]} -; IF-EVL: [[LOOP35]] = distinct !{[[LOOP35]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP36]] = distinct !{[[LOOP36]], [[META1]]} -; IF-EVL: [[LOOP37]] = distinct !{[[LOOP37]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP38]] = distinct !{[[LOOP38]], [[META1]]} +; IF-EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; IF-EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} +; IF-EVL: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]} +; IF-EVL: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]} +; IF-EVL: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]} +; IF-EVL: [[LOOP14]] = distinct 
!{[[LOOP14]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]} +; IF-EVL: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]} +; IF-EVL: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]]} +; IF-EVL: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP21]] = distinct !{[[LOOP21]], [[META1]]} +; IF-EVL: [[LOOP22]] = distinct !{[[LOOP22]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP23]] = distinct !{[[LOOP23]], [[META1]]} +; IF-EVL: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP25]] = distinct !{[[LOOP25]], [[META1]]} +; IF-EVL: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP27]] = distinct !{[[LOOP27]], [[META1]]} +; IF-EVL: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP29]] = distinct !{[[LOOP29]], [[META1]]} +; IF-EVL: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP31]] = distinct !{[[LOOP31]], [[META1]]} +; IF-EVL: [[LOOP32]] = distinct !{[[LOOP32]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP33]] = distinct !{[[LOOP33]], [[META1]]} +; IF-EVL: [[LOOP34]] = distinct !{[[LOOP34]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP35]] = distinct !{[[LOOP35]], [[META1]]} +; IF-EVL: [[LOOP36]] = distinct !{[[LOOP36]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP37]] = distinct !{[[LOOP37]], [[META1]]} ;. ; NO-VP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; NO-VP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-call-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-call-intrinsics.ll index 85a9228486aa7..f5ff512b94123 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-call-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-call-intrinsics.ll @@ -59,7 +59,7 @@ define void @vp_smax(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL-NEXT: store i32 [[DOT]], ptr [[GEP11]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -181,7 +181,7 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]] ; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -197,7 +197,7 @@ define void @vp_smin(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL-NEXT: store i32 [[DOT]], ptr [[GEP11]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; 
IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -319,7 +319,7 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]] ; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -335,7 +335,7 @@ define void @vp_umax(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL-NEXT: store i32 [[DOT]], ptr [[GEP11]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -457,7 +457,7 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]] ; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -473,7 +473,7 @@ define void @vp_umin(ptr %a, ptr %b, ptr %c, i64 %N) { ; IF-EVL-NEXT: store i32 [[DOT]], ptr [[GEP11]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -589,7 +589,7 @@ define void @vp_ctlz(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -603,7 +603,7 @@ define void @vp_ctlz(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: store i32 [[TMP19]], ptr [[GEP3]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP12:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -707,7 +707,7 @@ define void @vp_cttz(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP20]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq 
i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -721,7 +721,7 @@ define void @vp_cttz(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: store i32 [[TMP23]], ptr [[GEP3]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP13:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -827,7 +827,7 @@ define void @vp_lrint(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]] ; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -843,7 +843,7 @@ define void @vp_lrint(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: store i32 [[CONV3]], ptr [[GEP5]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -955,7 +955,7 @@ define void @vp_llrint(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]] ; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -971,7 +971,7 @@ define void @vp_llrint(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: store i32 [[CONV3]], ptr [[GEP5]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -1081,7 +1081,7 @@ define void @vp_abs(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP16]] ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label 
%[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -1095,7 +1095,7 @@ define void @vp_abs(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: store i32 [[COND]], ptr [[GEP9]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -1263,27 +1263,26 @@ declare i64 @llvm.lrint.i64.f64(double) declare i64 @llvm.llrint.i64.f64(double) declare i32 @llvm.abs.i32(i32, i1 immarg) ;. -; IF-EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; IF-EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; IF-EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; IF-EVL: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; IF-EVL: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} -; IF-EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]} -; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]} -; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]]} -; IF-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]} -; IF-EVL: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]} -; IF-EVL: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]} -; IF-EVL: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]]} -; IF-EVL: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]]} -; IF-EVL: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]]} +; IF-EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; IF-EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]]} +; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]]} +; IF-EVL: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]} +; IF-EVL: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]]} +; IF-EVL: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]]} +; IF-EVL: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]]} +; IF-EVL: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]} +; IF-EVL: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]]} ;. 
; NO-VP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; NO-VP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll index 498ff8beb9d4c..6652fefb35d60 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll @@ -50,7 +50,7 @@ define void @vp_sext(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: store i64 [[CONV2]], ptr [[GEP4]], align 8 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -145,15 +145,15 @@ define void @vp_zext(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i32.p0(ptr align 4 [[TMP14]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META10:![0-9]+]] +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i32.p0(ptr align 4 [[TMP14]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META9:![0-9]+]] ; IF-EVL-NEXT: [[TMP16:%.*]] = zext [[VP_OP_LOAD]] to ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]] -; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP16]], ptr align 8 [[TMP17]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META13:![0-9]+]], !noalias [[META10]] +; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP16]], ptr align 8 [[TMP17]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META12:![0-9]+]], !noalias [[META9]] ; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] ; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -167,7 +167,7 @@ define void @vp_zext(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: store i64 [[CONV]], ptr [[GEP2]], align 8 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP15:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -262,15 +262,15 @@ define void @vp_trunc(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; IF-EVL-NEXT: [[TMP14:%.*]] = 
getelementptr inbounds i64, ptr [[B]], i64 [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP14]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META17:![0-9]+]] +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP14]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META16:![0-9]+]] ; IF-EVL-NEXT: [[TMP16:%.*]] = trunc [[VP_OP_LOAD]] to ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV]] -; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i32.p0( [[TMP16]], ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META20:![0-9]+]], !noalias [[META17]] +; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i32.p0( [[TMP16]], ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META19:![0-9]+]], !noalias [[META16]] ; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] ; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -284,7 +284,7 @@ define void @vp_trunc(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: store i32 [[CONV]], ptr [[GEP2]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP22:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -379,15 +379,15 @@ define void @vp_fpext(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2f32.p0(ptr align 4 [[TMP14]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META24:![0-9]+]] +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2f32.p0(ptr align 4 [[TMP14]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META23:![0-9]+]] ; IF-EVL-NEXT: [[TMP16:%.*]] = fpext [[VP_OP_LOAD]] to ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[EVL_BASED_IV]] -; IF-EVL-NEXT: call void @llvm.vp.store.nxv2f64.p0( [[TMP16]], ptr align 8 [[TMP17]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META27:![0-9]+]], !noalias [[META24]] +; IF-EVL-NEXT: call void @llvm.vp.store.nxv2f64.p0( [[TMP16]], ptr align 8 [[TMP17]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META26:![0-9]+]], !noalias [[META23]] ; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] ; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label 
%[[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -401,7 +401,7 @@ define void @vp_fpext(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: store double [[CONV]], ptr [[GEP2]], align 8 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP30:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -496,15 +496,15 @@ define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr align 8 [[TMP14]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META31:![0-9]+]] +; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2f64.p0(ptr align 8 [[TMP14]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META30:![0-9]+]] ; IF-EVL-NEXT: [[TMP16:%.*]] = fptrunc [[VP_OP_LOAD]] to ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[EVL_BASED_IV]] -; IF-EVL-NEXT: call void @llvm.vp.store.nxv2f32.p0( [[TMP16]], ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META34:![0-9]+]], !noalias [[META31]] +; IF-EVL-NEXT: call void @llvm.vp.store.nxv2f32.p0( [[TMP16]], ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP12]]), !alias.scope [[META33:![0-9]+]], !noalias [[META30]] ; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP12]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP19]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] ; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -518,7 +518,7 @@ define void @vp_fptrunc(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: store float [[CONV]], ptr [[GEP2]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP37:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP36:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -621,7 +621,7 @@ define void @vp_sitofp(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]] ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -635,7 +635,7 @@ define void @vp_sitofp(ptr %a, ptr %b, i64 %N) { 
; IF-EVL-NEXT: store float [[CONV]], ptr [[GEP2]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP39:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP38:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -738,7 +738,7 @@ define void @vp_uitofp(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]] ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -752,7 +752,7 @@ define void @vp_uitofp(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: store float [[CONV]], ptr [[GEP2]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP41:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP40:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -855,7 +855,7 @@ define void @vp_fptosi(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]] ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -869,7 +869,7 @@ define void @vp_fptosi(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: store i32 [[CONV]], ptr [[GEP2]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP43:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP42:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -972,7 +972,7 @@ define void @vp_fptoui(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]] ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -986,7 +986,7 @@ define void @vp_fptoui(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: store i32 [[CONV]], ptr [[GEP2]], align 4 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP45:![0-9]+]] +; 
IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP44:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -1089,7 +1089,7 @@ define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]] ; IF-EVL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH]]: @@ -1103,7 +1103,7 @@ define void @vp_inttoptr(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: store ptr [[TMP24]], ptr [[GEP2]], align 8 ; IF-EVL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP47:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP46:![0-9]+]] ; IF-EVL: [[EXIT]]: ; IF-EVL-NEXT: ret void ; @@ -1199,12 +1199,11 @@ define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[TMP15:%.*]] = ptrtoint [[TMP14]] to ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP15]], ptr align 8 [[TMP16]], splat (i1 true), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP18]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]] +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP12]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP12]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH:.*]]: @@ -1290,50 +1289,49 @@ exit: ; IF-EVL: [[META2]] = distinct !{[[META2]], !"LVerDomain"} ; IF-EVL: [[META3]] = !{[[META4:![0-9]+]]} ; IF-EVL: [[META4]] = distinct !{[[META4]], [[META2]]} -; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]], [[META8:![0-9]+]]} +; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]} ; IF-EVL: [[META6]] = !{!"llvm.loop.isvectorized", i32 1} -; IF-EVL: [[META7]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; IF-EVL: [[META8]] = !{!"llvm.loop.unroll.runtime.disable"} -; IF-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META6]]} -; IF-EVL: [[META10]] = !{[[META11:![0-9]+]]} -; IF-EVL: [[META11]] = distinct !{[[META11]], [[META12:![0-9]+]]} -; IF-EVL: [[META12]] = distinct !{[[META12]], !"LVerDomain"} -; IF-EVL: [[META13]] = !{[[META14:![0-9]+]]} -; IF-EVL: [[META14]] = distinct !{[[META14]], [[META12]]} -; IF-EVL: [[LOOP15]] = distinct !{[[LOOP15]], [[META6]], [[META7]], [[META8]]} -; IF-EVL: [[LOOP16]] = distinct !{[[LOOP16]], [[META6]]} -; IF-EVL: [[META17]] = !{[[META18:![0-9]+]]} -; IF-EVL: [[META18]] = distinct !{[[META18]], [[META19:![0-9]+]]} -; IF-EVL: [[META19]] = distinct 
!{[[META19]], !"LVerDomain"} -; IF-EVL: [[META20]] = !{[[META21:![0-9]+]]} -; IF-EVL: [[META21]] = distinct !{[[META21]], [[META19]]} -; IF-EVL: [[LOOP22]] = distinct !{[[LOOP22]], [[META6]], [[META7]], [[META8]]} -; IF-EVL: [[LOOP23]] = distinct !{[[LOOP23]], [[META6]]} -; IF-EVL: [[META24]] = !{[[META25:![0-9]+]]} -; IF-EVL: [[META25]] = distinct !{[[META25]], [[META26:![0-9]+]]} -; IF-EVL: [[META26]] = distinct !{[[META26]], !"LVerDomain"} -; IF-EVL: [[META27]] = !{[[META28:![0-9]+]]} -; IF-EVL: [[META28]] = distinct !{[[META28]], [[META26]]} -; IF-EVL: [[LOOP29]] = distinct !{[[LOOP29]], [[META6]], [[META7]], [[META8]]} -; IF-EVL: [[LOOP30]] = distinct !{[[LOOP30]], [[META6]]} -; IF-EVL: [[META31]] = !{[[META32:![0-9]+]]} -; IF-EVL: [[META32]] = distinct !{[[META32]], [[META33:![0-9]+]]} -; IF-EVL: [[META33]] = distinct !{[[META33]], !"LVerDomain"} -; IF-EVL: [[META34]] = !{[[META35:![0-9]+]]} -; IF-EVL: [[META35]] = distinct !{[[META35]], [[META33]]} -; IF-EVL: [[LOOP36]] = distinct !{[[LOOP36]], [[META6]], [[META7]], [[META8]]} -; IF-EVL: [[LOOP37]] = distinct !{[[LOOP37]], [[META6]]} -; IF-EVL: [[LOOP38]] = distinct !{[[LOOP38]], [[META6]], [[META7]], [[META8]]} -; IF-EVL: [[LOOP39]] = distinct !{[[LOOP39]], [[META6]]} -; IF-EVL: [[LOOP40]] = distinct !{[[LOOP40]], [[META6]], [[META7]], [[META8]]} -; IF-EVL: [[LOOP41]] = distinct !{[[LOOP41]], [[META6]]} -; IF-EVL: [[LOOP42]] = distinct !{[[LOOP42]], [[META6]], [[META7]], [[META8]]} -; IF-EVL: [[LOOP43]] = distinct !{[[LOOP43]], [[META6]]} -; IF-EVL: [[LOOP44]] = distinct !{[[LOOP44]], [[META6]], [[META7]], [[META8]]} -; IF-EVL: [[LOOP45]] = distinct !{[[LOOP45]], [[META6]]} -; IF-EVL: [[LOOP46]] = distinct !{[[LOOP46]], [[META6]], [[META7]], [[META8]]} -; IF-EVL: [[LOOP47]] = distinct !{[[LOOP47]], [[META6]]} -; IF-EVL: [[LOOP48]] = distinct !{[[LOOP48]], [[META6]], [[META7]], [[META8]]} +; IF-EVL: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]]} +; IF-EVL: [[META9]] = !{[[META10:![0-9]+]]} +; IF-EVL: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]]} +; IF-EVL: [[META11]] = distinct !{[[META11]], !"LVerDomain"} +; IF-EVL: [[META12]] = !{[[META13:![0-9]+]]} +; IF-EVL: [[META13]] = distinct !{[[META13]], [[META11]]} +; IF-EVL: [[LOOP14]] = distinct !{[[LOOP14]], [[META6]], [[META7]]} +; IF-EVL: [[LOOP15]] = distinct !{[[LOOP15]], [[META6]]} +; IF-EVL: [[META16]] = !{[[META17:![0-9]+]]} +; IF-EVL: [[META17]] = distinct !{[[META17]], [[META18:![0-9]+]]} +; IF-EVL: [[META18]] = distinct !{[[META18]], !"LVerDomain"} +; IF-EVL: [[META19]] = !{[[META20:![0-9]+]]} +; IF-EVL: [[META20]] = distinct !{[[META20]], [[META18]]} +; IF-EVL: [[LOOP21]] = distinct !{[[LOOP21]], [[META6]], [[META7]]} +; IF-EVL: [[LOOP22]] = distinct !{[[LOOP22]], [[META6]]} +; IF-EVL: [[META23]] = !{[[META24:![0-9]+]]} +; IF-EVL: [[META24]] = distinct !{[[META24]], [[META25:![0-9]+]]} +; IF-EVL: [[META25]] = distinct !{[[META25]], !"LVerDomain"} +; IF-EVL: [[META26]] = !{[[META27:![0-9]+]]} +; IF-EVL: [[META27]] = distinct !{[[META27]], [[META25]]} +; IF-EVL: [[LOOP28]] = distinct !{[[LOOP28]], [[META6]], [[META7]]} +; IF-EVL: [[LOOP29]] = distinct !{[[LOOP29]], [[META6]]} +; IF-EVL: [[META30]] = !{[[META31:![0-9]+]]} +; IF-EVL: [[META31]] = distinct !{[[META31]], [[META32:![0-9]+]]} +; IF-EVL: [[META32]] = distinct !{[[META32]], !"LVerDomain"} +; IF-EVL: [[META33]] = !{[[META34:![0-9]+]]} +; IF-EVL: [[META34]] = distinct !{[[META34]], [[META32]]} +; IF-EVL: [[LOOP35]] = distinct !{[[LOOP35]], [[META6]], 
[[META7]]} +; IF-EVL: [[LOOP36]] = distinct !{[[LOOP36]], [[META6]]} +; IF-EVL: [[LOOP37]] = distinct !{[[LOOP37]], [[META6]], [[META7]]} +; IF-EVL: [[LOOP38]] = distinct !{[[LOOP38]], [[META6]]} +; IF-EVL: [[LOOP39]] = distinct !{[[LOOP39]], [[META6]], [[META7]]} +; IF-EVL: [[LOOP40]] = distinct !{[[LOOP40]], [[META6]]} +; IF-EVL: [[LOOP41]] = distinct !{[[LOOP41]], [[META6]], [[META7]]} +; IF-EVL: [[LOOP42]] = distinct !{[[LOOP42]], [[META6]]} +; IF-EVL: [[LOOP43]] = distinct !{[[LOOP43]], [[META6]], [[META7]]} +; IF-EVL: [[LOOP44]] = distinct !{[[LOOP44]], [[META6]]} +; IF-EVL: [[LOOP45]] = distinct !{[[LOOP45]], [[META6]], [[META7]]} +; IF-EVL: [[LOOP46]] = distinct !{[[LOOP46]], [[META6]]} +; IF-EVL: [[LOOP47]] = distinct !{[[LOOP47]], [[META6]], [[META7]]} ;. ; NO-VP: [[META0]] = !{[[META1:![0-9]+]]} ; NO-VP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll index 470e9941d5af7..61f97aa0a47ed 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll @@ -56,7 +56,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]] ; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-OUTLOOP: for.end: ; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ] ; IF-EVL-OUTLOOP-NEXT: ret i32 [[ADD_LCSSA]] @@ -97,7 +97,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]] ; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-INLOOP: for.end: ; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; IF-EVL-INLOOP-NEXT: ret i32 [[ADD_LCSSA]] @@ -239,7 +239,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] ; IF-EVL-OUTLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP10]], [[TMP23]] ; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL-OUTLOOP: middle.block: ; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PREDPHI]]) ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_END:%.*]] @@ -259,7 +259,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 
1 ; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3]] ; IF-EVL-OUTLOOP: for.end: ; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[FOR_INC]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] ; IF-EVL-OUTLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] @@ -284,7 +284,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] ; IF-EVL-INLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[TMP11]], [[TMP23]] ; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL-INLOOP: middle.block: ; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]] ; IF-EVL-INLOOP: scalar.ph: @@ -303,7 +303,7 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ] ; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3]] ; IF-EVL-INLOOP: for.end: ; IF-EVL-INLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[FOR_INC]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; IF-EVL-INLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] @@ -466,7 +466,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP20]] ; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL-OUTLOOP: middle.block: ; IF-EVL-OUTLOOP-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_END:%.*]] @@ -483,7 +483,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[SELECT]], [[RDX]] ; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3]] ; IF-EVL-OUTLOOP: for.end: ; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ] ; IF-EVL-OUTLOOP-NEXT: ret i32 [[ADD_LCSSA]] @@ -516,7 +516,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP19]] ; IF-EVL-INLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-INLOOP-NEXT: br i1 
[[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL-INLOOP: middle.block: ; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]] ; IF-EVL-INLOOP: scalar.ph: @@ -532,7 +532,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[ADD1]] = add nsw i32 [[SELECT]], [[RDX1]] ; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3]] ; IF-EVL-INLOOP: for.end: ; IF-EVL-INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD1]], [[FOR_BODY]] ], [ [[ADD]], [[MIDDLE_BLOCK]] ] ; IF-EVL-INLOOP-NEXT: ret i32 [[ADD_LCSSA]] @@ -700,7 +700,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP25]] ; IF-EVL-OUTLOOP-NEXT: [[VEC_IND_NEXT7]] = add [[VEC_IND2]], [[BROADCAST_SPLAT2]] ; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL-OUTLOOP: middle.block: ; IF-EVL-OUTLOOP-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP24]]) ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_END:%.*]] @@ -721,7 +721,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[FOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3]] ; IF-EVL-OUTLOOP: for.end: ; IF-EVL-OUTLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[MIDDLE_BLOCK]] ], [ [[TMP27]], [[MIDDLE_BLOCK1]] ] ; IF-EVL-OUTLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] @@ -753,7 +753,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP18]] ; IF-EVL-INLOOP-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-INLOOP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL-INLOOP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK1:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL-INLOOP: middle.block: ; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]] ; IF-EVL-INLOOP: scalar.ph: @@ -773,7 +773,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[RDX_ADD]] = phi i32 [ [[ADD_PRED]], [[IF_THEN]] ], [ [[BC_MERGE_RDX]], [[FOR_BODY]] ] ; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL-INLOOP-NEXT: 
br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3]] ; IF-EVL-INLOOP: for.end: ; IF-EVL-INLOOP-NEXT: [[RDX_ADD_LCSSA:%.*]] = phi i32 [ [[RDX_ADD]], [[MIDDLE_BLOCK]] ], [ [[TMP17]], [[MIDDLE_BLOCK1]] ] ; IF-EVL-INLOOP-NEXT: ret i32 [[RDX_ADD_LCSSA]] @@ -928,25 +928,23 @@ for.end: !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.vectorize.enable", i1 true} ;. -; IF-EVL-OUTLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; IF-EVL-OUTLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; IF-EVL-OUTLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; IF-EVL-OUTLOOP: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; IF-EVL-OUTLOOP: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} -; IF-EVL-OUTLOOP: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]} -; IF-EVL-OUTLOOP: [[META5]] = !{!"llvm.loop.vectorize.enable", i1 true} -; IF-EVL-OUTLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]], [[META3]]} -; IF-EVL-OUTLOOP: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]], [[META3]]} -; IF-EVL-OUTLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]], [[META3]]} +; IF-EVL-OUTLOOP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL-OUTLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} +; IF-EVL-OUTLOOP: [[META4]] = !{!"llvm.loop.vectorize.enable", i1 true} +; IF-EVL-OUTLOOP: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; IF-EVL-OUTLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; IF-EVL-OUTLOOP: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} ;. -; IF-EVL-INLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; IF-EVL-INLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; IF-EVL-INLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; IF-EVL-INLOOP: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; IF-EVL-INLOOP: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} -; IF-EVL-INLOOP: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]} -; IF-EVL-INLOOP: [[META5]] = !{!"llvm.loop.vectorize.enable", i1 true} -; IF-EVL-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]], [[META3]]} -; IF-EVL-INLOOP: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]], [[META3]]} -; IF-EVL-INLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]], [[META3]]} +; IF-EVL-INLOOP: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} +; IF-EVL-INLOOP: [[META4]] = !{!"llvm.loop.vectorize.enable", i1 true} +; IF-EVL-INLOOP: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; IF-EVL-INLOOP: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; IF-EVL-INLOOP: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} ;. 
; NO-VP-OUTLOOP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; NO-VP-OUTLOOP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll index cafb53ad01cf4..22d216e059af3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll @@ -140,7 +140,7 @@ define void @test_udiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] ; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH:.*]]: @@ -249,7 +249,7 @@ define void @test_srem(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] ; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH:.*]]: @@ -358,7 +358,7 @@ define void @test_urem(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] ; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[EXIT:.*]] ; IF-EVL: [[SCALAR_PH:.*]]: @@ -444,13 +444,12 @@ exit: ret void } ;. -; IF-EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; IF-EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; IF-EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; IF-EVL: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; IF-EVL: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} -; IF-EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]], [[META3]]} +; IF-EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} ;. 
; NO-VP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; NO-VP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll index 830583614e9a4..b153328663184 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll @@ -54,7 +54,7 @@ define void @first_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL: [[FOR_END]]: ; IF-EVL-NEXT: ret void ; @@ -167,7 +167,7 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP23]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP23]] ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FOR_END:.*]] ; IF-EVL: [[SCALAR_PH:.*]]: @@ -183,7 +183,7 @@ define void @second_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3]] ; IF-EVL: [[FOR_END]]: ; IF-EVL-NEXT: ret void ; @@ -316,7 +316,7 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP27]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP27]] ; IF-EVL-NEXT: [[TMP26:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FOR_END:.*]] ; IF-EVL: [[SCALAR_PH:.*]]: @@ -334,7 +334,7 @@ define void @third_order_recurrence(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: store i32 [[ADD1]], ptr [[ARRAYIDX2]], align 4 ; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[INDVARS]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3]] ; IF-EVL: [[FOR_END]]: ; IF-EVL-NEXT: ret void ; @@ -469,7 +469,7 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: store [[TMP11]], 
ptr [[TMP12]], align 4 ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS]], [[TMP3]] ; IF-EVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() ; IF-EVL-NEXT: [[TMP15:%.*]] = mul nuw i32 [[TMP14]], 4 @@ -495,7 +495,7 @@ define i32 @FOR_reduction(ptr noalias %A, ptr noalias %B, i64 %TC) { ; IF-EVL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX2]], align 4 ; IF-EVL-NEXT: [[INDVARS_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_NEXT]], [[TC]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL: [[FOR_END]]: ; IF-EVL-NEXT: [[FOR1_LCSSA:%.*]] = phi i32 [ [[FOR1]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], %[[MIDDLE_BLOCK]] ] ; IF-EVL-NEXT: ret i32 [[FOR1_LCSSA]] @@ -609,12 +609,11 @@ define void @first_order_recurrence_indvar(ptr noalias %A, i64 %TC) { ; IF-EVL-NEXT: [[TMP15:%.*]] = call @llvm.experimental.vp.splice.nxv2i64( [[VECTOR_RECUR]], [[TMP20]], i32 -1, splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP11]]) ; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i64, ptr [[A]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP15]], ptr align 8 [[TMP9]], splat (i1 true), i32 [[TMP11]]) -; IF-EVL-NEXT: [[TMP21:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP21]] +; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP7]], [[EVL_BASED_IV]] +; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; IF-EVL-NEXT: br i1 [[TMP22]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; IF-EVL: [[MIDDLE_BLOCK]]: ; IF-EVL-NEXT: br label %[[FOR_END:.*]] ; IF-EVL: [[SCALAR_PH:.*]]: @@ -627,7 +626,7 @@ define void @first_order_recurrence_indvar(ptr noalias %A, i64 %TC) { ; IF-EVL-NEXT: store i64 [[FOR1]], ptr [[ARRAYIDX]], align 8 ; IF-EVL-NEXT: [[IV1_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; IF-EVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV1_NEXT]], [[TC]] -; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4]] +; IF-EVL-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP3]] ; IF-EVL: [[FOR_END]]: ; IF-EVL-NEXT: ret void ; @@ -711,17 +710,16 @@ for.end: !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.vectorize.enable", i1 true} ;. 
-; IF-EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; IF-EVL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; IF-EVL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; IF-EVL: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; IF-EVL: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} -; IF-EVL: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]} -; IF-EVL: [[META5]] = !{!"llvm.loop.vectorize.enable", i1 true} -; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]], [[META3]]} -; IF-EVL: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META3]]} -; IF-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META3]], [[META1]]} -; IF-EVL: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]], [[META3]]} +; IF-EVL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} +; IF-EVL: [[META4]] = !{!"llvm.loop.vectorize.enable", i1 true} +; IF-EVL: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]} +; IF-EVL: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]} +; IF-EVL: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} ;. ; NO-VP: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; NO-VP: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll index a23933c7fb005..7c05f4613b575 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll @@ -153,8 +153,7 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP10]], splat (i1 true), i32 [[TMP4]]) ; IF-EVL-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[WIDE_MASKED_GATHER2]] ; IF-EVL-NEXT: [[TMP12]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[TMP11]], [[VEC_PHI]], i32 [[TMP4]]) -; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP4]] to i64 -; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] +; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -425,8 +424,7 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP10]], splat (i1 true), i32 [[TMP4]]) ; IF-EVL-NEXT: [[TMP11:%.*]] = add [[TMP9]], [[WIDE_MASKED_GATHER2]] ; IF-EVL-NEXT: [[TMP12]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[TMP11]], [[VEC_PHI]], i32 [[TMP4]]) -; IF-EVL-NEXT: [[TMP13:%.*]] = zext i32 [[TMP4]] to i64 -; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] +; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP5]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; IF-EVL-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -696,8 +694,7 @@ define i32 
@load_factor_4_reverse(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP14]], splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP15:%.*]] = add [[TMP13]], [[WIDE_MASKED_GATHER5]] ; IF-EVL-NEXT: [[TMP16]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[TMP15]], [[VEC_PHI]], i32 [[TMP6]]) -; IF-EVL-NEXT: [[TMP17:%.*]] = zext i32 [[TMP6]] to i64 -; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP17]] +; IF-EVL-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP7]] ; IF-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; IF-EVL-NEXT: [[TMP18:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; IF-EVL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-intermediate-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-intermediate-store.ll index 2330f13e43ae2..9e89cde3bc24a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-intermediate-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-intermediate-store.ll @@ -49,7 +49,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-OUTLOOP: middle.block: ; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) -; IF-EVL-OUTLOOP-NEXT: store i32 [[TMP23]], ptr [[ADDR]], align 4, !alias.scope [[META7:![0-9]+]], !noalias [[META0]] +; IF-EVL-OUTLOOP-NEXT: store i32 [[TMP23]], ptr [[ADDR]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_END:%.*]] ; IF-EVL-OUTLOOP: scalar.ph: ; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY1:%.*]] @@ -62,7 +62,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-OUTLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4 ; IF-EVL-OUTLOOP-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; IF-EVL-OUTLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] -; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]] +; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY1]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL-OUTLOOP: for.end: ; IF-EVL-OUTLOOP-NEXT: ret void ; @@ -95,7 +95,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; IF-EVL-INLOOP-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; IF-EVL-INLOOP: middle.block: -; IF-EVL-INLOOP-NEXT: store i32 [[TMP22]], ptr [[ADDR]], align 4, !alias.scope [[META7:![0-9]+]], !noalias [[META0]] +; IF-EVL-INLOOP-NEXT: store i32 [[TMP22]], ptr [[ADDR]], align 4, !alias.scope [[META6:![0-9]+]], !noalias [[META0]] ; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]] ; IF-EVL-INLOOP: scalar.ph: ; IF-EVL-INLOOP-NEXT: br label [[FOR_BODY:%.*]] @@ -108,7 +108,7 @@ define void @reduction_intermediate_store(ptr %a, i64 %n, i32 %start, ptr %addr) ; IF-EVL-INLOOP-NEXT: store i32 [[ADD]], ptr [[ADDR]], align 4 ; IF-EVL-INLOOP-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; IF-EVL-INLOOP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop 
[[LOOP9:![0-9]+]] +; IF-EVL-INLOOP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; IF-EVL-INLOOP: for.end: ; IF-EVL-INLOOP-NEXT: ret void ; @@ -241,24 +241,22 @@ for.end: ; IF-EVL-OUTLOOP: [[META0]] = !{[[META1:![0-9]+]]} ; IF-EVL-OUTLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} ; IF-EVL-OUTLOOP: [[META2]] = distinct !{[[META2]], !"LVerDomain"} -; IF-EVL-OUTLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]], [[META6:![0-9]+]]} +; IF-EVL-OUTLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]} ; IF-EVL-OUTLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1} -; IF-EVL-OUTLOOP: [[META5]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; IF-EVL-OUTLOOP: [[META6]] = !{!"llvm.loop.unroll.runtime.disable"} -; IF-EVL-OUTLOOP: [[META7]] = !{[[META8:![0-9]+]]} -; IF-EVL-OUTLOOP: [[META8]] = distinct !{[[META8]], [[META2]]} -; IF-EVL-OUTLOOP: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]} +; IF-EVL-OUTLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL-OUTLOOP: [[META6]] = !{[[META7:![0-9]+]]} +; IF-EVL-OUTLOOP: [[META7]] = distinct !{[[META7]], [[META2]]} +; IF-EVL-OUTLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} ;. ; IF-EVL-INLOOP: [[META0]] = !{[[META1:![0-9]+]]} ; IF-EVL-INLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} ; IF-EVL-INLOOP: [[META2]] = distinct !{[[META2]], !"LVerDomain"} -; IF-EVL-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]], [[META6:![0-9]+]]} +; IF-EVL-INLOOP: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]], [[META5:![0-9]+]]} ; IF-EVL-INLOOP: [[META4]] = !{!"llvm.loop.isvectorized", i32 1} -; IF-EVL-INLOOP: [[META5]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; IF-EVL-INLOOP: [[META6]] = !{!"llvm.loop.unroll.runtime.disable"} -; IF-EVL-INLOOP: [[META7]] = !{[[META8:![0-9]+]]} -; IF-EVL-INLOOP: [[META8]] = distinct !{[[META8]], [[META2]]} -; IF-EVL-INLOOP: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]} +; IF-EVL-INLOOP: [[META5]] = !{!"llvm.loop.unroll.runtime.disable"} +; IF-EVL-INLOOP: [[META6]] = !{[[META7:![0-9]+]]} +; IF-EVL-INLOOP: [[META7]] = distinct !{[[META7]], [[META2]]} +; IF-EVL-INLOOP: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} ;. 
; NO-VP-OUTLOOP: [[META0]] = !{[[META1:![0-9]+]]} ; NO-VP-OUTLOOP: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-known-no-overflow.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-known-no-overflow.ll index d911c7e513930..a03b4306bad66 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-known-no-overflow.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-known-no-overflow.ll @@ -89,7 +89,7 @@ define void @overflow_at_0(ptr %p, i64 %tc) vscale_range(2, 1024) { ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP13]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]] ; CHECK: [[SCALAR_PH:.*]]: @@ -149,7 +149,7 @@ define void @no_overflow_at_0(ptr %p, i64 %tc) vscale_range(2, 1024) { ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP9]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP9]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]] ; CHECK: [[SCALAR_PH:.*]]: @@ -185,10 +185,9 @@ exit: ret void } ;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]], [[META3]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll index 300696e41829a..5c89f218fdf7d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-cost.ll @@ -92,7 +92,7 @@ define void @test_pr98413_sext_removed(ptr %src, ptr noalias %dst, i64 %x) { ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[TMP7]] ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH:.*]]: @@ -148,7 +148,7 @@ define void @truncate_to_i1_used_by_branch(i8 %x, ptr %dst) #0 { ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i8.nxv4p0( zeroinitializer, align 1 [[BROADCAST_SPLAT2]], splat (i1 true), i32 [[TMP6]]) ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP6]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH:.*]]: @@ -257,7 +257,7 @@ define void @icmp_only_first_op_truncated(ptr noalias %dst, i32 %x, i64 %N, i64 ; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP14]] to i64 ; CHECK-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH:.*]]: @@ -310,11 +310,10 @@ attributes #0 = { "target-features"="+64bit,+v,+zvl256b" } attributes #1 = { "target-features"="+64bit,+v" } ;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]], [[META3]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]], [[META3]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]], [[META3]]} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll index 6de3b505cba23..6efb0358242c7 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/truncate-to-minimal-bitwidth-evl-crash.ll @@ -59,8 +59,7 @@ exit: ; preds = %loop ret void } ;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll index b8cae1d609e34..df848f2db917f 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/type-info-cache-evl-crash.ll @@ -61,7 +61,7 @@ define void @type_info_cache_clobber(ptr %dstv, ptr %src, i64 %wide.trip.count) ; CHECK-NEXT: store i16 [[CONV36]], ptr null, align 2 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -95,9 +95,8 @@ exit: ; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"} ; CHECK: [[META3]] = !{[[META4:![0-9]+]]} ; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]], [[META8:![0-9]+]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]} ; CHECK: [[META6]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META7]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[META8]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META6]]} +; CHECK: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index adfecdff8fcc8..9095d6e87ad4f 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -285,9 +285,8 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], [[WIDE_MASKED_GATHER]], zeroinitializer ; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; SCALABLE-NEXT: call void @llvm.vp.store.nxv4i64.p0( [[PREDPHI]], ptr align 8 [[TMP12]], splat (i1 true), i32 [[TMP17]]) -; SCALABLE-NEXT: [[TMP15:%.*]] = zext i32 [[TMP17]] to i64 -; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]] -; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]] +; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; SCALABLE-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -383,9 +382,8 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-SCALABLE-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], [[WIDE_MASKED_GATHER]], zeroinitializer ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv4i64.p0( [[PREDPHI]], ptr align 8 [[TMP12]], splat (i1 true), i32 [[TMP7]]) -; TF-SCALABLE-NEXT: [[TMP15:%.*]] = zext i32 [[TMP7]] to i64 -; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]] -; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]] +; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] +; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; TF-SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; TF-SCALABLE-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -698,9 +696,8 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0( [[VEC_IND]], align 8 [[BROADCAST_SPLAT1]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] ; SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[BROADCAST_SPLAT]], ptr align 8 [[TMP16]], splat (i1 true), i32 [[TMP7]]) -; SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64 -; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[TMP10]] -; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP8]], [[TMP10]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; SCALABLE-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -780,9 +777,8 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0( [[VEC_IND]], align 8 [[BROADCAST_SPLAT]], splat (i1 true), i32 
[[TMP9]]) ; TF-SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[BROADCAST_SPLAT3]], ptr align 8 [[TMP10]], splat (i1 true), i32 [[TMP9]]) -; TF-SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64 -; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] -; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] +; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP13]], [[INDEX]] +; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP13]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; TF-SCALABLE-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] @@ -843,9 +839,8 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT1]], align 8 [[BROADCAST_SPLAT2]], [[TMP10]], i32 [[TMP7]]) ; SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[BROADCAST_SPLAT1]], ptr align 8 [[TMP12]], splat (i1 true), i32 [[TMP7]]) -; SCALABLE-NEXT: [[TMP15:%.*]] = zext i32 [[TMP7]] to i64 -; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP15]], [[INDEX]] -; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP15]] +; SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[INDEX]] +; SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] ; SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; SCALABLE-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] @@ -940,9 +935,8 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT1]], align 8 [[BROADCAST_SPLAT2]], [[TMP10]], i32 [[TMP9]]) ; TF-SCALABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; TF-SCALABLE-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[BROADCAST_SPLAT1]], ptr align 8 [[TMP12]], splat (i1 true), i32 [[TMP9]]) -; TF-SCALABLE-NEXT: [[TMP14:%.*]] = zext i32 [[TMP9]] to i64 -; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP14]], [[INDEX]] -; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP14]] +; TF-SCALABLE-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP11]], [[INDEX]] +; TF-SCALABLE-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP11]] ; TF-SCALABLE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] ; TF-SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 ; TF-SCALABLE-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll index dbedce99f8d7e..8c67b4cb7996e 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vector-loop-backedge-elimination-with-evl.ll @@ -92,8 +92,7 @@ exit: attributes #0 = { vscale_range(2,2) } ;. 
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll index 601c917192bf5..ed797fcd6c026 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vf-will-not-generate-any-vector-insts.ll @@ -41,7 +41,7 @@ define void @vf_will_not_generate_any_vector_insts(ptr %src, ptr %dst) { ; CHECK-NEXT: store i32 [[DOTPRE]], ptr [[DST]], align 4 ; CHECK-NEXT: [[TMP3]] = add nuw i64 [[TMP2]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[TMP3]], 100 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -65,9 +65,8 @@ exit: ; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"} ; CHECK: [[META3]] = !{[[META4:![0-9]+]]} ; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]], [[META8:![0-9]+]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]} ; CHECK: [[META6]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META7]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[META8]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META6]]} +; CHECK: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]]} ;. 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll index 9506ad30c788b..6d2cda48f90ca 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -36,8 +36,7 @@ define i32 @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END5:%.*]] = add i64 3, [[N_VEC]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll index 21fa6ceb2cc12..590b2691c3238 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll @@ -580,8 +580,7 @@ define void @cost_duplicate_recipe_for_sinking(ptr %A, i64 %N) #2 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 0078d00de28f8..0ba885d7811b2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -222,9 +222,8 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END9:%.*]] = mul i64 [[N_VEC]], 32 -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP124]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -269,7 +268,7 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, 
i32 ; CHECK-NEXT: [[TMP155]] = fadd fast <4 x float> [[TMP154]], [[TMP153]] ; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX10]], 4 ; CHECK-NEXT: [[TMP156:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC8]] -; CHECK-NEXT: br i1 [[TMP156]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP156]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP157:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP155]]) ; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC8]] @@ -289,7 +288,7 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 ; CHECK-NEXT: [[ADD4]] = fadd fast float [[ADD]], [[T2]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 32 ; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[T0]] -; CHECK-NEXT: br i1 [[CMP1]], label [[FOR]], label [[LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR]], label [[LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: loopexit: ; CHECK-NEXT: [[ADD4_LCSSA:%.*]] = phi float [ [[ADD4]], [[FOR]] ], [ [[TMP124]], [[MIDDLE_BLOCK]] ], [ [[TMP157]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_END]] @@ -369,10 +368,10 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[SRC_1]], align 8, !alias.scope [[META5:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = load i64, ptr [[SRC_1]], align 8, !alias.scope [[META6:![0-9]+]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP13]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META8:![0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = load i64, ptr [[SRC_2]], align 8, !alias.scope [[META9:![0-9]+]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <2 x i64> poison, i64 [[TMP14]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT9]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer @@ -380,10 +379,10 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK-NEXT: [[TMP17:%.*]] = and <2 x i1> [[TMP16]], [[TMP15]] ; CHECK-NEXT: [[TMP18:%.*]] = zext <2 x i1> [[TMP17]] to <2 x i8> ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i8> [[TMP18]], i32 1 -; CHECK-NEXT: store i8 [[TMP19]], ptr [[DST]], align 1, !alias.scope [[META10:![0-9]+]], !noalias [[META12:![0-9]+]] +; CHECK-NEXT: store i8 [[TMP19]], ptr [[DST]], align 1, !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: 
scalar.ph: @@ -406,7 +405,7 @@ define void @multi_exit(ptr %dst, ptr %src.1, ptr %src.2, i64 %A, i64 %B) #0 { ; CHECK-NEXT: [[IV_1_NEXT]] = add i32 [[IV_1]], 1 ; CHECK-NEXT: [[IV_1_NEXT_WIDE]] = zext i32 [[IV_1_NEXT]] to i64 ; CHECK-NEXT: [[EC_2:%.*]] = icmp ult i64 [[IV_1_NEXT_WIDE]], [[B]] -; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[EC_2]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -485,7 +484,7 @@ define i1 @any_of_cost(ptr %start, ptr %end) #0 { ; CHECK-NEXT: [[TMP27]] = or <2 x i1> [[VEC_PHI3]], [[TMP25]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i1> [[TMP27]], [[TMP26]] ; CHECK-NEXT: [[TMP29:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[BIN_RDX]]) @@ -505,7 +504,7 @@ define i1 @any_of_cost(ptr %start, ptr %end) #0 { ; CHECK-NEXT: [[ANY_OF_NEXT]] = select i1 [[CMP13_NOT_NOT]], i1 [[ANY_OF]], i1 false ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 40 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[ANY_OF_NEXT_LCSSA:%.*]] = phi i1 [ [[ANY_OF_NEXT]], [[LOOP]] ] ; CHECK-NEXT: ret i1 [[ANY_OF_NEXT_LCSSA]] @@ -562,7 +561,7 @@ define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <2 x i64> [[TMP9]], [[BIN_RDX]] @@ -583,7 +582,7 @@ define i64 @cost_assume(ptr %end, i64 %N) { ; CHECK-NEXT: tail call void @llvm.assume(i1 [[C]]) ; CHECK-NEXT: [[GEP:%.*]] = getelementptr nusw [9 x i8], ptr null, i64 [[IV_NEXT]] ; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[GEP]], [[END]] -; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i64 [ [[TMP12]], [[LOOP]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[DOTLCSSA]] @@ -627,7 +626,7 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 { ; CHECK-NEXT: [[TMP12]] = and <4 x i32> [[VEC_PHI1]], [[TMP2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 24 -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; 
CHECK-NEXT: [[BIN_RDX:%.*]] = and <4 x i32> [[TMP12]], [[TMP11]] ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[BIN_RDX]]) @@ -649,7 +648,7 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 { ; CHECK-NEXT: store i32 [[RED_NEXT]], ptr [[DST]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], 29 -; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -694,7 +693,7 @@ define i64 @live_in_known_1_via_scev() { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_PHI]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8 -; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> [[VEC_PHI]]) ; CHECK-NEXT: br label [[EXIT:%.*]] @@ -751,7 +750,7 @@ define i64 @cost_loop_invariant_recipes(i1 %x, i64 %y) { ; CHECK: vector.body: ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ splat (i64 1), [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3]] = mul <2 x i64> [[TMP2]], [[VEC_PHI]] -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[TMP3]]) ; CHECK-NEXT: br label [[EXIT:%.*]] @@ -805,7 +804,7 @@ define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 { ; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP1]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP5:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP7]] = zext <16 x i1> [[TMP5]] to <16 x i32> -; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]]) ; CHECK-NEXT: [[TMP21:%.*]] = zext i1 [[TMP20]] to i32 @@ -891,7 +890,7 @@ define i32 @g(i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD_3]], splat (i32 4) ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i32> [[TMP16]], [[TMP15]] ; CHECK-NEXT: [[BIN_RDX5:%.*]] = or <4 x i32> [[TMP17]], [[BIN_RDX]] @@ -900,9 +899,8 @@ define i32 @g(i64 %n) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i32 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp 
ult i32 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_MOD_VF]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP20]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -926,7 +924,7 @@ define i32 @g(i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i32 [[INDEX9]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT11]] = add <4 x i32> [[VEC_IND10]], splat (i32 4) ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT15]], [[N_VEC8]] -; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP25]]) ; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC8]] @@ -943,7 +941,7 @@ define i32 @g(i64 %n) { ; CHECK-NEXT: [[SELECT_I:%.*]] = select i1 [[EXITCOND]], i32 0, i32 2 ; CHECK-NEXT: [[SELECT_NEXT]] = or i32 [[SELECT_I]], [[SELECT]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[SELECT_NEXT_LCSSA:%.*]] = phi i32 [ [[SELECT_NEXT]], [[LOOP]] ], [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[SELECT_NEXT_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll index 90d261b78c27c..ed288d2f99a0b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll @@ -54,8 +54,7 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) { ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IV_START]], [[N_VEC]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -146,11 +145,9 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l, ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 ; CHECK-NEXT: [[IND_END:%.*]] = mul i16 [[DOTCAST]], [[TMP0]] ; CHECK-NEXT: [[TMP1:%.*]] = mul <16 x i16> splat (i16 16), [[TMP2]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i16> poison, i16 [[OFF]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i16> 
[[BROADCAST_SPLATINSERT2]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[OFF]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i16> , [[DOTSPLAT]] +; CHECK-NEXT: [[TMP11:%.*]] = mul <16 x i16> , [[TMP2]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <16 x i16> zeroinitializer, [[TMP11]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -159,10 +156,10 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l, ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <16 x i16> [[VEC_IND]], [[TMP1]] ; CHECK-NEXT: [[STEP_ADD_2:%.*]] = add <16 x i16> [[STEP_ADD]], [[TMP1]] ; CHECK-NEXT: [[STEP_ADD_3:%.*]] = add <16 x i16> [[STEP_ADD_2]], [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <16 x i16> [[VEC_IND]], [[BROADCAST_SPLAT3]] -; CHECK-NEXT: [[TMP5:%.*]] = sub <16 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT3]] -; CHECK-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[STEP_ADD_2]], [[BROADCAST_SPLAT3]] -; CHECK-NEXT: [[TMP7:%.*]] = sub <16 x i16> [[STEP_ADD_3]], [[BROADCAST_SPLAT3]] +; CHECK-NEXT: [[TMP4:%.*]] = sub <16 x i16> [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <16 x i16> [[STEP_ADD]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[STEP_ADD_2]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <16 x i16> [[STEP_ADD_3]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[K:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 16 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, ptr [[TMP8]], i32 32 @@ -174,15 +171,14 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l, ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i16> [[STEP_ADD_3]], [[TMP1]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[L]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[DOTCAST9:%.*]] = trunc i64 [[N_VEC]] to i16 ; CHECK-NEXT: [[IND_END10:%.*]] = mul i16 [[DOTCAST9]], [[TMP0]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[L]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll index c0ff8816c2543..3b0ad73d91338 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -59,8 +59,7 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 { 
; AUTO_VEC-NEXT: [[DOTCAST12:%.*]] = sitofp i64 [[N_VEC]] to float ; AUTO_VEC-NEXT: [[TMP11:%.*]] = fmul fast float 5.000000e-01, [[DOTCAST12]] ; AUTO_VEC-NEXT: [[IND_END1:%.*]] = fadd fast float 1.000000e+00, [[TMP11]] -; AUTO_VEC-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; AUTO_VEC-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; AUTO_VEC: [[VEC_EPILOG_PH]]: ; AUTO_VEC-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -353,8 +352,7 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) { ; AUTO_VEC-NEXT: [[DOTCAST16:%.*]] = sitofp i64 [[N_VEC]] to float ; AUTO_VEC-NEXT: [[TMP12:%.*]] = fmul reassoc float 4.200000e+01, [[DOTCAST16]] ; AUTO_VEC-NEXT: [[IND_END1:%.*]] = fadd reassoc float 1.000000e+00, [[TMP12]] -; AUTO_VEC-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; AUTO_VEC-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] ; AUTO_VEC: [[VEC_EPILOG_PH]]: ; AUTO_VEC-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll index be546a1e79f0a..9528510f568fa 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -689,8 +689,7 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt ; AVX512-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP23]] ; AVX512-NEXT: [[TMP38:%.*]] = mul i64 [[N_VEC]], 64 ; AVX512-NEXT: [[IND_END15:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP38]] -; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]] -; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF17:![0-9]+]] ; AVX512: vec.epilog.ph: ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index e94e0789c42cb..27eef017727dc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -44,8 +44,7 @@ define i32 @iv_used_widened_and_truncated(ptr %dst, i64 %N) #0 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[MIDDLE_BLOCK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], 
label [[VEC_EPILOG_SCALAR_PH]], label [[SCALAR_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll index 4b4103e9806b9..61f07eff768c1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-step.ll @@ -17,9 +17,7 @@ define i16 @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 ; CHECK-NEXT: [[TMP0:%.*]] = mul i16 [[DOTCAST]], [[O_1]] ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i16> splat (i16 4), [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> , [[DOTSPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> , [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP2]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -89,19 +87,17 @@ define i16 @wide_sub_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 ; CHECK-NEXT: [[TMP1:%.*]] = mul i16 [[DOTCAST]], [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> splat (i16 4), [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[TMP0]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[O_1]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i16> , [[DOTSPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i16> , [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i16> zeroinitializer, [[TMP3]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i16> [[VEC_IND]], [[BROADCAST_SPLAT2]] -; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i16> [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP10:%.*]] = sub <4 x i16> [[STEP_ADD]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[DST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP6]], i32 4 ; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[TMP6]], align 2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll index b480eaf7502a8..d75fd0e0023f7 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll @@ -63,8 +63,7 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext 
%offset, ptr noca ; CHECK-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 2 ; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP13]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -190,8 +189,7 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK-NEXT: [[IND_END9:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST8]] ; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]] ; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll index 42d3019cc0ba2..9a3616a4340ff 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -46,8 +46,7 @@ define i32 @inv_load_conditional(ptr %a, i64 %n, ptr %b, i32 %k) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[SMAX2]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF8:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll index 199f1c15fbc3d..5853e914ce112 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -61,8 +61,7 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[SMAX2]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = 
icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF8:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -169,8 +168,7 @@ define void @inv_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b, i3 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[SMAX2]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF17:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -317,8 +315,7 @@ define void @variant_val_store_to_inv_address_conditional(ptr %a, i64 %n, ptr %b ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX10]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[SMAX10]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF29:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -398,16 +395,30 @@ for.end: ; preds = %for.body define void @test_store_of_final_reduction_value(i64 %x, ptr %dst) { ; CHECK-LABEL: define void @test_store_of_final_reduction_value( ; CHECK-SAME: i64 [[X:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[TMP0:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0]] = mul <2 x i64> [[VEC_PHI]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[TMP0]]) +; CHECK-NEXT: store i64 [[TMP1]], ptr [[DST]], align 8 +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[SCALAR_PH:.*]]: ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV4:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV4:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ 
[[RED_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[RED_NEXT]] = mul i64 [[RED]], [[X]] ; CHECK-NEXT: store i64 [[RED_NEXT]], ptr [[DST]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV4]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV4]], 1 -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index cf04cd21c16be..b907e7e2fbfbf 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -1378,8 +1378,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; AVX1: [[VEC_EPILOG_ITER_CHECK]]: -; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF19:![0-9]+]] ; AVX1: [[VEC_EPILOG_PH]]: ; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -1471,8 +1470,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; AVX2: [[VEC_EPILOG_ITER_CHECK]]: -; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF33:![0-9]+]] ; AVX2: [[VEC_EPILOG_PH]]: ; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -1564,8 +1562,7 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; AVX512: [[VEC_EPILOG_ITER_CHECK]]: -; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF21]] ; AVX512: [[VEC_EPILOG_PH]]: ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -1702,8 +1699,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; AVX1: 
[[VEC_EPILOG_ITER_CHECK]]: -; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF19]] ; AVX1: [[VEC_EPILOG_PH]]: ; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -1795,8 +1791,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX2-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; AVX2: [[VEC_EPILOG_ITER_CHECK]]: -; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF33]] ; AVX2: [[VEC_EPILOG_PH]]: ; AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -1888,8 +1883,7 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX512-NEXT: br i1 [[CMP_N]], [[FOR_END_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; AVX512: [[VEC_EPILOG_ITER_CHECK]]: -; AVX512-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF21]] ; AVX512: [[VEC_EPILOG_PH]]: ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll index c1adffde07510..31269b1b8c221 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll @@ -57,8 +57,7 @@ define void @foo(ptr addrspace(1) align 8 dereferenceable_or_null(16), ptr addrs ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[UMAX2]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF8:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll index 5d16ce5346bbf..737bcf35fbd2c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll @@ -264,9 +264,8 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; AVX1: vec.epilog.iter.check: -; AVX1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; AVX1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 +; AVX1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; AVX1: vec.epilog.ph: ; AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] ; AVX1-NEXT: [[N_MOD_VF24:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 @@ -294,7 +293,7 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: store <4 x i32> [[TMP57]], ptr [[TMP58]], align 4 ; AVX1-NEXT: [[INDEX_NEXT33]] = add nuw i64 [[INDEX26]], 4 ; AVX1-NEXT: [[TMP59:%.*]] = icmp eq i64 [[INDEX_NEXT33]], [[N_VEC25]] -; AVX1-NEXT: br i1 [[TMP59]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP3:![0-9]+]] +; AVX1-NEXT: br i1 [[TMP59]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX1: vec.epilog.middle.block: ; AVX1-NEXT: [[CMP_N34:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC25]] ; AVX1-NEXT: br i1 [[CMP_N34]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_PH]] @@ -324,7 +323,7 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: store i32 [[ADD18]], ptr [[ARRAYIDX20]], align 4 ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; AVX1: for.end.loopexit: ; AVX1-NEXT: br label [[FOR_END]] ; AVX1: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll index 8a48f997052f0..286da4d31c799 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr54634.ll @@ -22,10 +22,10 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo ; CHECK-NEXT: [[DOTUNPACK2:%.*]] = load i64, ptr addrspace(10) [[DOTELT1]], align 8, !tbaa [[JTBAA_IMMUT_TBAA8]] ; CHECK-NEXT: [[TMP8:%.*]] = add nsw i64 [[TMP2]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP8]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[TOP:.*]] -; CHECK: [[TOP]]: +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: ; CHECK-NEXT: [[TMP17:%.*]] = icmp ult i64 [[TMP8]], 16 -; CHECK-NEXT: br i1 [[TMP17]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-NEXT: br i1 [[TMP17]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP8]], 16 ; CHECK-NEXT: 
[[N_VEC:%.*]] = sub i64 [[TMP8]], [[N_MOD_VF]] @@ -59,16 +59,15 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD5]], splat (i64 4) ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK1:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK1]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[L44:.*]], label %[[MIDDLE_BLOCK:.*]] +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP8]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[SCALAR_PH]], !prof [[PROF15:![0-9]+]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[TOP]] ] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[L44:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF15:![0-9]+]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP8]], 4 ; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP8]], [[N_MOD_VF4]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x ptr addrspace(10)> poison, ptr addrspace(10) [[DOTUNPACK]], i64 0 @@ -78,10 +77,10 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[DOTSPLAT]], -; CHECK-NEXT: br label %[[L26:.*]] -; CHECK: [[L26]]: -; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDEX_NEXT14:%.*]], %[[L26]] ] -; CHECK-NEXT: [[VEC_IND8:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[SCALAR_PH]] ], [ [[VEC_IND_NEXT9:%.*]], %[[L26]] ] +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND8:%.*]] = phi <4 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND8]], i32 0 ; CHECK-NEXT: call void @llvm.masked.scatter.v4p10.v4p13(<4 x ptr addrspace(10)> [[BROADCAST_SPLAT11]], <4 x ptr addrspace(13)> [[TMP28]], i32 8, <4 x i1> splat (i1 true)), !tbaa [[JTBAA_ARRAYBUF_TBAA10]] ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], <4 x i64> [[VEC_IND8]], i32 1 
@@ -89,22 +88,22 @@ define ptr addrspace(10) @japi1_vect_42283(ptr nocapture readonly %0, i32 %1) lo ; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX7]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i64> [[VEC_IND8]], splat (i64 4) ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP30]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[L26]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP30]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC5]] ; CHECK-NEXT: br i1 [[CMP_N15]], label %[[L44]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[N_VEC5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ITER_CHECK]] ] -; CHECK-NEXT: br label %[[L27:.*]] -; CHECK: [[L27]]: -; CHECK-NEXT: [[VALUE_PHI5:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[TMP27:%.*]], %[[L27]] ] +; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i64 [ [[N_VEC5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[L26:.*]] +; CHECK: [[L26]]: +; CHECK-NEXT: [[VALUE_PHI5:%.*]] = phi i64 [ [[BC_RESUME_VAL17]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[TMP27:%.*]], %[[L26]] ] ; CHECK-NEXT: [[DOTREPACK:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], i64 [[VALUE_PHI5]], i32 0 ; CHECK-NEXT: store ptr addrspace(10) [[DOTUNPACK]], ptr addrspace(13) [[DOTREPACK]], align 8, !tbaa [[JTBAA_ARRAYBUF_TBAA10]] ; CHECK-NEXT: [[DOTREPACK4:%.*]] = getelementptr inbounds { ptr addrspace(10), i64 }, ptr addrspace(13) [[TMP7]], i64 [[VALUE_PHI5]], i32 1 ; CHECK-NEXT: store i64 [[DOTUNPACK2]], ptr addrspace(13) [[DOTREPACK4]], align 8, !tbaa [[JTBAA_ARRAYBUF_TBAA10]] ; CHECK-NEXT: [[TMP27]] = add i64 [[VALUE_PHI5]], 1 ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[VALUE_PHI5]], [[TMP2]] -; CHECK-NEXT: br i1 [[DOTNOT]], label %[[L44]], label %[[L27]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[DOTNOT]], label %[[L44]], label %[[L26]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: [[L44]]: ; CHECK-NEXT: ret ptr addrspace(10) null ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll index bdef894794850..90f3df50153a2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll @@ -40,13 +40,10 @@ define void @smax_call_uniform(ptr %dst, i64 %x) { ; CHECK-NEXT: br label %[[PRED_UREM_CONTINUE6]] ; CHECK: [[PRED_UREM_CONTINUE6]]: ; CHECK-NEXT: [[TMP13:%.*]] = tail call i64 @llvm.smax.i64(i64 0, i64 0) -; CHECK-NEXT: [[P:%.*]] = select i1 [[C]], i64 1, i64 [[TMP13]] ; CHECK-NEXT: [[PREDPHI7:%.*]] = select i1 [[C]], i64 1, i64 [[TMP13]] -; CHECK-NEXT: [[ADD:%.*]] = add i64 [[P]], 1 ; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[PREDPHI7]], 1 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[DST]], i64 [[ADD]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP17]] -; CHECK-NEXT: store i64 0, ptr [[GEP]], align 8 +; CHECK-NEXT: store i64 0, ptr [[TMP19]], align 8 ; CHECK-NEXT: store i64 0, ptr [[TMP19]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 diff 
--git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll index c2dfce0aa70b8..bdbac7c1a9931 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll @@ -70,8 +70,7 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[TMP64:%.*]] = mul i64 [[N_VEC]], 2 ; CHECK-NEXT: [[IND_END9:%.*]] = add i64 8, [[TMP64]] ; CHECK-NEXT: [[IND_END12:%.*]] = mul i64 [[N_VEC]], 2 -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP6]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -160,8 +159,7 @@ define void @_Z3fn1v() #0 { ; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[N_VEC32]], 2 ; CHECK-NEXT: [[IND_END55:%.*]] = add i64 8, [[TMP42]] ; CHECK-NEXT: [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2 -; CHECK-NEXT: [[N_VEC_REMAINING49:%.*]] = sub i64 [[TMP28]], [[N_VEC32]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK50:%.*]] = icmp ult i64 [[N_VEC_REMAINING49]], 8 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK50:%.*]] = icmp ult i64 [[N_MOD_VF31]], 8 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK50]], label %[[VEC_EPILOG_SCALAR_PH40]], label %[[VEC_EPILOG_PH42]], !prof [[PROF3]] ; CHECK: [[VEC_EPILOG_PH42]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL51:%.*]] = phi i64 [ [[N_VEC32]], %[[VEC_EPILOG_ITER_CHECK43]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK24]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll index 3618affdf1880..282e9a503e6ed 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vectorize-force-tail-with-evl.ll @@ -104,8 +104,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; NO-VP-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; NO-VP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; NO-VP: vec.epilog.iter.check: -; NO-VP-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; NO-VP-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8 +; NO-VP-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 ; NO-VP-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; NO-VP: vec.epilog.ph: ; NO-VP-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll index 85f72d283a0e4..6f262109f95be 100644 --- a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll +++ b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll @@ -111,8 +111,6 @@ define i64 @invar_cond(i1 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> 
zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -164,8 +162,6 @@ define i64 @invar_cond_incoming_ops_reordered(i1 %c) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/cse-casts.ll b/llvm/test/Transforms/LoopVectorize/cse-casts.ll new file mode 100644 index 0000000000000..e923560bb77e8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/cse-casts.ll @@ -0,0 +1,351 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 +; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s + +define i8 @preserve_flags_when_cloning_trunc(i8 %start, ptr noalias %src, ptr noalias %dst) { +; CHECK-LABEL: define i8 @preserve_flags_when_cloning_trunc( +; CHECK-SAME: i8 [[START:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> splat (i8 1), i8 [[START]], i32 0 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i8> [ [[TMP0]], %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i8> [ splat (i8 1), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i1> [[TMP2]] to <4 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 4 +; CHECK-NEXT: store <4 x i16> [[TMP3]], ptr [[TMP4]], align 2 +; CHECK-NEXT: store <4 x i16> [[TMP3]], ptr [[TMP5]], align 2 +; CHECK-NEXT: [[TMP6]] = mul <4 x i8> [[VEC_PHI]], splat (i8 3) +; CHECK-NEXT: [[TMP7]] = mul <4 x i8> [[VEC_PHI1]], splat (i8 3) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 416 +; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <4 x i8> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> [[BIN_RDX]]) +; CHECK-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK: [[SCALAR_PH]]: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + %red = phi i8 [ %red.next, %loop ], [ %start, %entry ] + %l = load i32, 
ptr %src, align 4 + %cmp = icmp ne i32 %l, 0 + %cmp.ext = zext i1 %cmp to i64 + %cmp.trunc = trunc i64 %cmp.ext to i16 + %gep.dst = getelementptr i16, ptr %dst, i64 %iv + store i16 %cmp.trunc, ptr %gep.dst, align 2 + %red.next = mul i8 %red, 3 + %iv.next = add i64 %iv, 1 + %ec = icmp ult i64 %iv, 416 + br i1 %ec, label %loop, label %exit + +exit: + ret i8 %red.next +} + + +define void @preserve_flags_narrowing_extends_and_truncs(ptr noalias %A, ptr noalias %B, ptr noalias %C) { +; CHECK-LABEL: define void @preserve_flags_narrowing_extends_and_truncs( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: br i1 true, label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[TMP0]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i8> [ poison, %[[VECTOR_BODY]] ], [ [[TMP2]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: br i1 true, label %[[PRED_LOAD_IF1:.*]], label %[[PRED_LOAD_CONTINUE2:.*]] +; CHECK: [[PRED_LOAD_IF1]]: +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[TMP5]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]] +; CHECK: [[PRED_LOAD_CONTINUE2]]: +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i8> [ [[TMP3]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP6]], %[[PRED_LOAD_IF1]] ] +; CHECK-NEXT: br i1 false, label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; CHECK: [[PRED_LOAD_IF3]]: +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP8]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i8> [[TMP7]], i8 [[TMP9]], i32 2 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i8> [ [[TMP7]], %[[PRED_LOAD_CONTINUE2]] ], [ [[TMP10]], %[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: br i1 false, label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i8> [[TMP11]], i8 [[TMP13]], i32 3 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i8> [ [[TMP11]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP14]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: br i1 false, label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8:.*]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4 +; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i8> poison, i8 [[TMP17]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i8> [ poison, %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP18]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: br i1 false, label %[[PRED_LOAD_IF9:.*]], label 
%[[PRED_LOAD_CONTINUE10:.*]] +; CHECK: [[PRED_LOAD_IF9]]: +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 5 +; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP21]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE10]] +; CHECK: [[PRED_LOAD_CONTINUE10]]: +; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i8> [ [[TMP19]], %[[PRED_LOAD_CONTINUE8]] ], [ [[TMP22]], %[[PRED_LOAD_IF9]] ] +; CHECK-NEXT: br i1 false, label %[[PRED_LOAD_IF11:.*]], label %[[PRED_LOAD_CONTINUE12:.*]] +; CHECK: [[PRED_LOAD_IF11]]: +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 6 +; CHECK-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP24]], align 1 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i8> [[TMP23]], i8 [[TMP25]], i32 2 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE12]] +; CHECK: [[PRED_LOAD_CONTINUE12]]: +; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i8> [ [[TMP23]], %[[PRED_LOAD_CONTINUE10]] ], [ [[TMP26]], %[[PRED_LOAD_IF11]] ] +; CHECK-NEXT: br i1 false, label %[[PRED_LOAD_IF13:.*]], label %[[PRED_LOAD_CONTINUE14:.*]] +; CHECK: [[PRED_LOAD_IF13]]: +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 7 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[TMP28]], align 1 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i8> [[TMP27]], i8 [[TMP29]], i32 3 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE14]] +; CHECK: [[PRED_LOAD_CONTINUE14]]: +; CHECK-NEXT: [[TMP31:%.*]] = phi <4 x i8> [ [[TMP27]], %[[PRED_LOAD_CONTINUE12]] ], [ [[TMP30]], %[[PRED_LOAD_IF13]] ] +; CHECK-NEXT: [[TMP32:%.*]] = zext <4 x i8> [[TMP15]] to <4 x i64> +; CHECK-NEXT: [[TMP33:%.*]] = zext <4 x i8> [[TMP31]] to <4 x i64> +; CHECK-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i64> [[TMP32]], i32 0 +; CHECK-NEXT: store i64 [[TMP35]], ptr [[TMP34]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: br i1 true, label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] +; CHECK: [[PRED_STORE_IF15]]: +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 1 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x i64> [[TMP32]], i32 1 +; CHECK-NEXT: store i64 [[TMP37]], ptr [[TMP36]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE16]] +; CHECK: [[PRED_STORE_CONTINUE16]]: +; CHECK-NEXT: br i1 false, label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; CHECK: [[PRED_STORE_IF17]]: +; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 2 +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP32]], i32 2 +; CHECK-NEXT: store i64 [[TMP39]], ptr [[TMP38]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE18]] +; CHECK: [[PRED_STORE_CONTINUE18]]: +; CHECK-NEXT: br i1 false, label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] +; CHECK: [[PRED_STORE_IF19]]: +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 3 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i64> [[TMP32]], i32 3 +; CHECK-NEXT: store i64 [[TMP41]], ptr [[TMP40]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE20]] +; CHECK: [[PRED_STORE_CONTINUE20]]: +; CHECK-NEXT: br i1 false, label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; CHECK: [[PRED_STORE_IF21]]: +; CHECK-NEXT: [[TMP42:%.*]] 
= getelementptr inbounds i8, ptr [[C]], i64 4 +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i64> [[TMP33]], i32 0 +; CHECK-NEXT: store i64 [[TMP43]], ptr [[TMP42]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; CHECK: [[PRED_STORE_CONTINUE22]]: +; CHECK-NEXT: br i1 false, label %[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; CHECK: [[PRED_STORE_IF23]]: +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 5 +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i64> [[TMP33]], i32 1 +; CHECK-NEXT: store i64 [[TMP45]], ptr [[TMP44]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE24]] +; CHECK: [[PRED_STORE_CONTINUE24]]: +; CHECK-NEXT: br i1 false, label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] +; CHECK: [[PRED_STORE_IF25]]: +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 6 +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <4 x i64> [[TMP33]], i32 2 +; CHECK-NEXT: store i64 [[TMP47]], ptr [[TMP46]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE26]] +; CHECK: [[PRED_STORE_CONTINUE26]]: +; CHECK-NEXT: br i1 false, label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28:.*]] +; CHECK: [[PRED_STORE_IF27]]: +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 7 +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i64> [[TMP33]], i32 3 +; CHECK-NEXT: store i64 [[TMP49]], ptr [[TMP48]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE28]] +; CHECK: [[PRED_STORE_CONTINUE28]]: +; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 0 +; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 1 +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 2 +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 3 +; CHECK-NEXT: [[TMP54:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP50]], i32 0 +; CHECK-NEXT: [[TMP55:%.*]] = insertelement <4 x ptr> [[TMP54]], ptr [[TMP51]], i32 1 +; CHECK-NEXT: [[TMP56:%.*]] = insertelement <4 x ptr> [[TMP55]], ptr [[TMP52]], i32 2 +; CHECK-NEXT: [[TMP57:%.*]] = insertelement <4 x ptr> [[TMP56]], ptr [[TMP53]], i32 3 +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 4 +; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 5 +; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 6 +; CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 7 +; CHECK-NEXT: [[TMP62:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP58]], i32 0 +; CHECK-NEXT: [[TMP63:%.*]] = insertelement <4 x ptr> [[TMP62]], ptr [[TMP59]], i32 1 +; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x ptr> [[TMP63]], ptr [[TMP60]], i32 2 +; CHECK-NEXT: [[TMP65:%.*]] = insertelement <4 x ptr> [[TMP64]], ptr [[TMP61]], i32 3 +; CHECK-NEXT: br i1 true, label %[[PRED_LOAD_IF29:.*]], label %[[PRED_LOAD_CONTINUE30:.*]] +; CHECK: [[PRED_LOAD_IF29]]: +; CHECK-NEXT: [[TMP66:%.*]] = load i8, ptr [[TMP50]], align 1 +; CHECK-NEXT: [[TMP67:%.*]] = insertelement <4 x i8> poison, i8 [[TMP66]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE30]] +; CHECK: [[PRED_LOAD_CONTINUE30]]: +; CHECK-NEXT: [[TMP68:%.*]] = phi <4 x i8> [ poison, %[[PRED_STORE_CONTINUE28]] ], [ [[TMP67]], %[[PRED_LOAD_IF29]] ] +; CHECK-NEXT: br i1 true, label %[[PRED_LOAD_IF31:.*]], label %[[PRED_LOAD_CONTINUE32:.*]] +; CHECK: [[PRED_LOAD_IF31]]: +; CHECK-NEXT: [[TMP69:%.*]] = load i8, ptr [[TMP51]], align 1 +; CHECK-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP68]], i8 [[TMP69]], 
i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE32]] +; CHECK: [[PRED_LOAD_CONTINUE32]]: +; CHECK-NEXT: [[TMP71:%.*]] = phi <4 x i8> [ [[TMP68]], %[[PRED_LOAD_CONTINUE30]] ], [ [[TMP70]], %[[PRED_LOAD_IF31]] ] +; CHECK-NEXT: br i1 false, label %[[PRED_LOAD_IF33:.*]], label %[[PRED_LOAD_CONTINUE34:.*]] +; CHECK: [[PRED_LOAD_IF33]]: +; CHECK-NEXT: [[TMP72:%.*]] = load i8, ptr [[TMP52]], align 1 +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <4 x i8> [[TMP71]], i8 [[TMP72]], i32 2 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE34]] +; CHECK: [[PRED_LOAD_CONTINUE34]]: +; CHECK-NEXT: [[TMP74:%.*]] = phi <4 x i8> [ [[TMP71]], %[[PRED_LOAD_CONTINUE32]] ], [ [[TMP73]], %[[PRED_LOAD_IF33]] ] +; CHECK-NEXT: br i1 false, label %[[PRED_LOAD_IF35:.*]], label %[[PRED_LOAD_CONTINUE36:.*]] +; CHECK: [[PRED_LOAD_IF35]]: +; CHECK-NEXT: [[TMP75:%.*]] = load i8, ptr [[TMP53]], align 1 +; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i8> [[TMP74]], i8 [[TMP75]], i32 3 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE36]] +; CHECK: [[PRED_LOAD_CONTINUE36]]: +; CHECK-NEXT: [[TMP77:%.*]] = phi <4 x i8> [ [[TMP74]], %[[PRED_LOAD_CONTINUE34]] ], [ [[TMP76]], %[[PRED_LOAD_IF35]] ] +; CHECK-NEXT: br i1 false, label %[[PRED_LOAD_IF37:.*]], label %[[PRED_LOAD_CONTINUE38:.*]] +; CHECK: [[PRED_LOAD_IF37]]: +; CHECK-NEXT: [[TMP78:%.*]] = load i8, ptr [[TMP58]], align 1 +; CHECK-NEXT: [[TMP79:%.*]] = insertelement <4 x i8> poison, i8 [[TMP78]], i32 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE38]] +; CHECK: [[PRED_LOAD_CONTINUE38]]: +; CHECK-NEXT: [[TMP80:%.*]] = phi <4 x i8> [ poison, %[[PRED_LOAD_CONTINUE36]] ], [ [[TMP79]], %[[PRED_LOAD_IF37]] ] +; CHECK-NEXT: br i1 false, label %[[PRED_LOAD_IF39:.*]], label %[[PRED_LOAD_CONTINUE40:.*]] +; CHECK: [[PRED_LOAD_IF39]]: +; CHECK-NEXT: [[TMP81:%.*]] = load i8, ptr [[TMP59]], align 1 +; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i8> [[TMP80]], i8 [[TMP81]], i32 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE40]] +; CHECK: [[PRED_LOAD_CONTINUE40]]: +; CHECK-NEXT: [[TMP83:%.*]] = phi <4 x i8> [ [[TMP80]], %[[PRED_LOAD_CONTINUE38]] ], [ [[TMP82]], %[[PRED_LOAD_IF39]] ] +; CHECK-NEXT: br i1 false, label %[[PRED_LOAD_IF41:.*]], label %[[PRED_LOAD_CONTINUE42:.*]] +; CHECK: [[PRED_LOAD_IF41]]: +; CHECK-NEXT: [[TMP84:%.*]] = load i8, ptr [[TMP60]], align 1 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i8> [[TMP83]], i8 [[TMP84]], i32 2 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE42]] +; CHECK: [[PRED_LOAD_CONTINUE42]]: +; CHECK-NEXT: [[TMP86:%.*]] = phi <4 x i8> [ [[TMP83]], %[[PRED_LOAD_CONTINUE40]] ], [ [[TMP85]], %[[PRED_LOAD_IF41]] ] +; CHECK-NEXT: br i1 false, label %[[PRED_LOAD_IF43:.*]], label %[[PRED_LOAD_CONTINUE44:.*]] +; CHECK: [[PRED_LOAD_IF43]]: +; CHECK-NEXT: [[TMP87:%.*]] = load i8, ptr [[TMP61]], align 1 +; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i8> [[TMP86]], i8 [[TMP87]], i32 3 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE44]] +; CHECK: [[PRED_LOAD_CONTINUE44]]: +; CHECK-NEXT: [[TMP89:%.*]] = phi <4 x i8> [ [[TMP86]], %[[PRED_LOAD_CONTINUE42]] ], [ [[TMP88]], %[[PRED_LOAD_IF43]] ] +; CHECK-NEXT: [[TMP90:%.*]] = trunc <4 x i8> [[TMP77]] to <4 x i1> +; CHECK-NEXT: [[TMP91:%.*]] = trunc <4 x i8> [[TMP89]] to <4 x i1> +; CHECK-NEXT: [[TMP92:%.*]] = and <4 x i1> [[TMP90]], splat (i1 true) +; CHECK-NEXT: [[TMP93:%.*]] = and <4 x i1> [[TMP91]], splat (i1 true) +; CHECK-NEXT: [[TMP94:%.*]] = select <4 x i1> [[TMP90]], <4 x float> splat (float 1.000000e+00), <4 x float> zeroinitializer +; CHECK-NEXT: [[TMP95:%.*]] = select <4 x i1> [[TMP91]], <4 x float> splat 
(float 1.000000e+00), <4 x float> zeroinitializer +; CHECK-NEXT: [[TMP96:%.*]] = select <4 x i1> [[TMP92]], <4 x float> splat (float 3.000000e+00), <4 x float> [[TMP94]] +; CHECK-NEXT: [[TMP97:%.*]] = select <4 x i1> [[TMP93]], <4 x float> splat (float 3.000000e+00), <4 x float> [[TMP95]] +; CHECK-NEXT: [[TMP98:%.*]] = bitcast <4 x float> [[TMP96]] to <4 x i32> +; CHECK-NEXT: [[TMP99:%.*]] = bitcast <4 x float> [[TMP97]] to <4 x i32> +; CHECK-NEXT: [[TMP100:%.*]] = trunc <4 x i32> [[TMP98]] to <4 x i8> +; CHECK-NEXT: [[TMP101:%.*]] = trunc <4 x i32> [[TMP99]] to <4 x i8> +; CHECK-NEXT: br i1 true, label %[[PRED_STORE_IF45:.*]], label %[[PRED_STORE_CONTINUE46:.*]] +; CHECK: [[PRED_STORE_IF45]]: +; CHECK-NEXT: [[TMP102:%.*]] = extractelement <4 x i8> [[TMP100]], i32 0 +; CHECK-NEXT: store i8 [[TMP102]], ptr [[TMP50]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE46]] +; CHECK: [[PRED_STORE_CONTINUE46]]: +; CHECK-NEXT: br i1 true, label %[[PRED_STORE_IF47:.*]], label %[[PRED_STORE_CONTINUE48:.*]] +; CHECK: [[PRED_STORE_IF47]]: +; CHECK-NEXT: [[TMP103:%.*]] = extractelement <4 x i8> [[TMP100]], i32 1 +; CHECK-NEXT: store i8 [[TMP103]], ptr [[TMP51]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE48]] +; CHECK: [[PRED_STORE_CONTINUE48]]: +; CHECK-NEXT: br i1 false, label %[[PRED_STORE_IF49:.*]], label %[[PRED_STORE_CONTINUE50:.*]] +; CHECK: [[PRED_STORE_IF49]]: +; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i8> [[TMP100]], i32 2 +; CHECK-NEXT: store i8 [[TMP104]], ptr [[TMP52]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE50]] +; CHECK: [[PRED_STORE_CONTINUE50]]: +; CHECK-NEXT: br i1 false, label %[[PRED_STORE_IF51:.*]], label %[[PRED_STORE_CONTINUE52:.*]] +; CHECK: [[PRED_STORE_IF51]]: +; CHECK-NEXT: [[TMP105:%.*]] = extractelement <4 x i8> [[TMP100]], i32 3 +; CHECK-NEXT: store i8 [[TMP105]], ptr [[TMP53]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE52]] +; CHECK: [[PRED_STORE_CONTINUE52]]: +; CHECK-NEXT: br i1 false, label %[[PRED_STORE_IF53:.*]], label %[[PRED_STORE_CONTINUE54:.*]] +; CHECK: [[PRED_STORE_IF53]]: +; CHECK-NEXT: [[TMP106:%.*]] = extractelement <4 x i8> [[TMP101]], i32 0 +; CHECK-NEXT: store i8 [[TMP106]], ptr [[TMP58]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE54]] +; CHECK: [[PRED_STORE_CONTINUE54]]: +; CHECK-NEXT: br i1 false, label %[[PRED_STORE_IF55:.*]], label %[[PRED_STORE_CONTINUE56:.*]] +; CHECK: [[PRED_STORE_IF55]]: +; CHECK-NEXT: [[TMP107:%.*]] = extractelement <4 x i8> [[TMP101]], i32 1 +; CHECK-NEXT: store i8 [[TMP107]], ptr [[TMP59]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE56]] +; CHECK: [[PRED_STORE_CONTINUE56]]: +; CHECK-NEXT: br i1 false, label %[[PRED_STORE_IF57:.*]], label %[[PRED_STORE_CONTINUE58:.*]] +; CHECK: [[PRED_STORE_IF57]]: +; CHECK-NEXT: [[TMP108:%.*]] = extractelement <4 x i8> [[TMP101]], i32 2 +; CHECK-NEXT: store i8 [[TMP108]], ptr [[TMP60]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE58]] +; CHECK: [[PRED_STORE_CONTINUE58]]: +; CHECK-NEXT: br i1 false, label %[[PRED_STORE_IF59:.*]], label %[[PRED_STORE_CONTINUE60:.*]] +; CHECK: [[PRED_STORE_IF59]]: +; CHECK-NEXT: [[TMP109:%.*]] = extractelement <4 x i8> [[TMP101]], i32 3 +; CHECK-NEXT: store i8 [[TMP109]], ptr [[TMP61]], align 1 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE60]] +; CHECK: [[PRED_STORE_CONTINUE60]]: +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry 
], [ %iv.next, %loop ] + %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv + %l = load i8, ptr %gep.A + %l.ext = zext i8 %l to i64 + %gep.C = getelementptr inbounds i8, ptr %C, i64 %iv + store i64 %l.ext, ptr %gep.C + %gep.B = getelementptr inbounds i8, ptr %B, i64 %iv + %l.1 = load i8, ptr %gep.B, align 1 + %masked = and i8 %l.1, 1 + %l.1.trunc = trunc i8 %l.1 to i1 + %sel.0 = select i1 %l.1.trunc, float 1.000000e+00, float 0.000000e+00 + %masked.trunc = trunc i8 %masked to i1 + %sel.1 = select i1 %masked.trunc, float 3.000000e+00, float %sel.0 + %bc = bitcast float %sel.1 to i32 + %bc.trunc = trunc i32 %bc to i8 + store i8 %bc.trunc, ptr %gep.B, align 1 + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, 1 + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll index 2d75576bc36ee..5e3a70222d7bb 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll @@ -33,9 +33,8 @@ define i64 @select_icmp_const(ptr %a, i64 %n) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX1:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 3, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -60,7 +59,7 @@ define i64 @select_icmp_const(ptr %a, i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[TMP7]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i64> [[VEC_IND5]], splat (i64 4) ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP11]]) ; CHECK-NEXT: [[RDX_SELECT_CMP10:%.*]] = icmp ne i64 [[TMP13]], -9223372036854775808 @@ -80,7 +79,7 @@ define i64 @select_icmp_const(ptr %a, i64 %n) { ; CHECK-NEXT: [[SEL]] = select i1 [[C]], i64 [[IV]], i64 [[RDX]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i64 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[SEL_LCSSA]] @@ -127,7 +126,7 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) { 
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP0]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP4]]) ; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP6]], -9223372036854775808 @@ -135,9 +134,8 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX1:%.*]] = phi i64 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 2, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -162,7 +160,7 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[TMP7]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i64> [[VEC_IND5]], splat (i64 4) ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP12]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> [[TMP11]]) ; CHECK-NEXT: [[RDX_SELECT_CMP10:%.*]] = icmp ne i64 [[TMP13]], -9223372036854775808 @@ -182,7 +180,7 @@ define i64 @select_fcmp_const_fast(ptr %a, i64 %n) { ; CHECK-NEXT: [[SEL]] = select i1 [[C]], i64 [[IV]], i64 [[RDX]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i64 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_SELECT11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[SEL_LCSSA]] @@ -235,7 +233,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; 
CHECK-NEXT: [[TMP10:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP8]]) ; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i8 [[TMP10]], -128 @@ -244,9 +242,8 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-NEXT: [[IND_END:%.*]] = trunc i32 [[N_VEC]] to i8 -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i32 [[TMP2]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_MOD_VF]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -274,7 +271,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i32 [[INDEX4]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <4 x i8> [[VEC_IND5]], splat (i8 4) ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT11]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP18]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP18]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP19:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP17]]) ; CHECK-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp ne i8 [[TMP19]], -128 @@ -294,7 +291,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) { ; CHECK-NEXT: [[SEL]] = select i1 [[C]], i8 [[IV]], i8 [[RDX]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i8 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i8 [ [[SEL]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_SELECT13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i8 [[SEL_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll index 1b822011990ba..1a99c47aa351d 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll @@ -34,8 +34,7 @@ define i32 @any_of_reduction_epilog(ptr %src, i64 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof 
[[PROF3:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -129,8 +128,7 @@ define i32 @any_of_reduction_epilog_arg_as_start_value(ptr %src, i64 %N, i32 %st ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -228,8 +226,7 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) { ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[IND_END6:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -362,8 +359,7 @@ define i1 @any_of_reduction_i1_epilog2(ptr %start, ptr %end, i64 %x) { ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[N_VEC]], 16 ; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP24]] -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll index 5e97cedb452b4..15daf90ad770c 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll @@ -30,9 +30,8 @@ define i64 @int_reduction_add(ptr %a, i64 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof 
[[PROF3:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 5, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -48,7 +47,7 @@ define i64 @int_reduction_add(ptr %a, i64 %N) { ; CHECK-NEXT: [[TMP10]] = add <4 x i64> [[WIDE_LOAD6]], [[VEC_PHI5]] ; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX4]], 4 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP10]]) ; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] @@ -65,7 +64,7 @@ define i64 @int_reduction_add(ptr %a, i64 %N) { ; CHECK-NEXT: [[ADD]] = add i64 [[TMP13]], [[SUM]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ [[TMP12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[ADD_LCSSA]] @@ -111,15 +110,14 @@ define float @fp_reduction_max(ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[TMP4]] = select fast <4 x i1> [[TMP3]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP4]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0.000000e+00, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -137,7 +135,7 @@ define float @fp_reduction_max(ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[TMP11]] = select fast <4 x i1> [[TMP10]], <4 x float> [[VEC_PHI5]], <4 x float> [[WIDE_LOAD6]] ; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX4]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 
[[INDEX_NEXT7]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP11]]) ; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] @@ -155,7 +153,7 @@ define float @fp_reduction_max(ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[V0]] = select fast i1 [[C0]], float [[RESULT_08]], float [[L0]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: [[V0_LCSSA:%.*]] = phi float [ [[V0]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[V0_LCSSA]] @@ -201,13 +199,13 @@ define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) { ; CHECK-NEXT: [[TMP7]] = zext <4 x i16> [[TMP6]] to <4 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP10:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP6]]) ; CHECK-NEXT: [[TMP11:%.*]] = zext i16 [[TMP10]] to i32 ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 256, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -225,7 +223,7 @@ define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) { ; CHECK-NEXT: [[TMP20]] = zext <4 x i16> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i32 [[INDEX1]], 4 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT4]], 256 -; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP23:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP19]]) ; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP23]] to i32 @@ -244,7 +242,7 @@ define i16 @reduction_or_trunc(ptr noalias nocapture %ptr) { ; CHECK-NEXT: [[XOR]] = or i32 [[SUM_02]], [[EXT]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop 
[[LOOP11:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[TMP24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[RET:%.*]] = trunc i32 [[XOR_LCSSA]] to i16 @@ -295,16 +293,15 @@ define float @multiple_fp_rdx(ptr %A, i64 %N) { ; CHECK-NEXT: [[TMP4]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP4]]) ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ], [ 1.500000e+01, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -324,7 +321,7 @@ define float @multiple_fp_rdx(ptr %A, i64 %N) { ; CHECK-NEXT: [[TMP14]] = fmul fast <4 x float> [[VEC_PHI7]], [[WIDE_LOAD9]] ; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX6]], 4 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP14]]) ; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP13]]) @@ -345,7 +342,7 @@ define float @multiple_fp_rdx(ptr %A, i64 %N) { ; CHECK-NEXT: [[MUL]] = fmul fast float [[PROD]], [[TMP18]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] @@ -400,15 +397,14 @@ define i32 
@reduction_phi_start_val(ptr %A, i64 %N) { ; CHECK-NEXT: [[TMP4]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START_SUM]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -424,7 +420,7 @@ define i32 @reduction_phi_start_val(ptr %A, i64 %N) { ; CHECK-NEXT: [[TMP11]] = sub <4 x i32> [[VEC_PHI5]], [[WIDE_LOAD6]] ; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX4]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP11]]) ; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] @@ -441,7 +437,7 @@ define i32 @reduction_phi_start_val(ptr %A, i64 %N) { ; CHECK-NEXT: [[SUB]] = sub nsw i32 [[SUM]], [[LOAD]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: for.cond: ; CHECK-NEXT: [[SUB_LCSSA]] = phi i32 [ [[SUB]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1 @@ -501,15 +497,14 @@ define i64 @test_reduction_with_widen_induction_order_1(ptr %A, i64 %N) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]]) ; 
CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP3]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -531,7 +526,7 @@ define i64 @test_reduction_with_widen_induction_order_1(ptr %A, i64 %N) { ; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX4]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i64> [[VEC_IND5]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]]) ; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] @@ -549,7 +544,7 @@ define i64 @test_reduction_with_widen_induction_order_1(ptr %A, i64 %N) { ; CHECK-NEXT: store i64 [[IV_1]], ptr [[GEP_A]], align 4 ; CHECK-NEXT: [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ [[TMP8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] @@ -596,15 +591,14 @@ define i64 @test_reduction_with_widen_induction_order_2(ptr %A, i64 %N) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP1]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label 
[[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP3]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -626,7 +620,7 @@ define i64 @test_reduction_with_widen_induction_order_2(ptr %A, i64 %N) { ; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX4]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i64> [[VEC_IND6]], splat (i64 4) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]]) ; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] @@ -644,7 +638,7 @@ define i64 @test_reduction_with_widen_induction_order_2(ptr %A, i64 %N) { ; CHECK-NEXT: store i64 [[IV_1]], ptr [[GEP_A]], align 4 ; CHECK-NEXT: [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ], [ [[TMP8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-trunc-induction-steps.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-trunc-induction-steps.ll index 09bfad56923ab..f79deac2a45b0 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-trunc-induction-steps.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-trunc-induction-steps.ll @@ -37,8 +37,7 @@ define void @trunc_iv_steps_with_epilogue(ptr %A, i64 %N) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/expand-scev-after-invoke.ll b/llvm/test/Transforms/LoopVectorize/expand-scev-after-invoke.ll index ff550da1ae0e1..4af9f4a13b62b 100644 --- a/llvm/test/Transforms/LoopVectorize/expand-scev-after-invoke.ll +++ b/llvm/test/Transforms/LoopVectorize/expand-scev-after-invoke.ll @@ -18,9 +18,7 @@ define void @test(ptr %dst) personality ptr null { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = mul i32 160, [[STEP]] ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> splat (i32 4), 
[[BROADCAST_SPLAT]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i32> , [[DOTSPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i32> , [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP2]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index 2b15aae628274..901f67ee676ee 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -85,17 +85,15 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 ; VEC4_INTERL2-NEXT: [[FPINC_INS:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 +; VEC4_INTERL2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[FPINC_INS]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] -; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[FPINC_INS]], -; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 -; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], splat (float 4.000000e+00) +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], -; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP2]] +; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], +; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub fast <4 x float> [[DOTSPLAT3]], [[TMP7]] ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -332,17 +330,15 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 +; VEC4_INTERL2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] -; VEC4_INTERL2-NEXT: [[MUL:%.*]] = 
fmul reassoc <4 x float> [[DOTSPLATINSERT2]], -; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[MUL]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 -; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[FPINC]], i64 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = fmul reassoc <4 x float> [[BROADCAST_SPLAT]], splat (float 4.000000e+00) +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = fmul reassoc <4 x float> [[DOTSPLAT3]], -; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub reassoc <4 x float> [[DOTSPLAT]], [[TMP2]] +; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = fmul reassoc <4 x float> [[BROADCAST_SPLAT]], +; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fsub reassoc <4 x float> [[DOTSPLAT3]], [[TMP7]] ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -834,22 +830,20 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], 2147483640 ; VEC4_INTERL2-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; VEC4_INTERL2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer ; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01 ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 ; VEC4_INTERL2-NEXT: [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] ; VEC4_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[INIT:%.*]], [[TMP3]] -; VEC4_INTERL2-NEXT: [[TMP19:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLATINSERT2]], -; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 -; VEC4_INTERL2-NEXT: [[BROADCAST:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 +; VEC4_INTERL2-NEXT: [[DOTSPLAT5:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], splat (float 4.000000e+00) +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer -; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 ; VEC4_INTERL2-NEXT: [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> poison, <4 x i32> zeroinitializer 
-; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[DOTSPLAT7]], -; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP4]] +; VEC4_INTERL2-NEXT: [[TMP19:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], +; VEC4_INTERL2-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT7]], [[TMP19]] ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -860,8 +854,8 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP6]], i64 16 ; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND10]], ptr [[TMP6]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD11]], ptr [[TMP7]], align 4 -; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST]] -; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[BROADCAST]] +; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT]] +; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[DOTSPLAT]] ; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[VEC_IND]], splat (float -5.000000e-01) ; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = fadd fast <4 x float> [[VEC_IND]], splat (float -2.500000e+00) ; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[TMP10]], [[TMP8]] diff --git a/llvm/test/Transforms/LoopVectorize/induction-step.ll b/llvm/test/Transforms/LoopVectorize/induction-step.ll index b3cb3a77467ee..362de0e0bba7a 100644 --- a/llvm/test/Transforms/LoopVectorize/induction-step.ll +++ b/llvm/test/Transforms/LoopVectorize/induction-step.ll @@ -337,8 +337,6 @@ define void @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i16 ; CHECK-NEXT: [[TMP0:%.*]] = mul i16 [[DOTCAST]], [[O_1]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[O_1]], i64 0 ; CHECK-NEXT: [[DOTSPLAT1:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <8 x i16> , [[DOTSPLAT1]] @@ -350,7 +348,7 @@ define void @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i16> [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i16> [[VEC_IND]], [[DOTSPLAT1]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[TMP3]] ; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[TMP5]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP3]], 8 @@ -362,11 +360,11 @@ define void @wide_add_induction_step_live_in(ptr %dst, i64 %N, i16 %off) { ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i16 [ 
[[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i16 [ [[TMP0]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL5]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i16 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[ADD]] = add i16 [[IV_2]], [[O_1]] ; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: store i16 [[ADD]], ptr [[GEP_DST]], align 2 diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 261c336b329fa..60c844c3f6415 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -6211,12 +6211,10 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL: vector.ph: ; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4 ; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 +; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer ; UNROLL-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 ; UNROLL-NEXT: [[IND_END:%.*]] = mul i32 [[STEP]], [[DOTCAST]] -; UNROLL-NEXT: [[TMP15:%.*]] = shl <2 x i32> [[BROADCAST_SPLATINSERT]], -; UNROLL-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 -; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer +; UNROLL-NEXT: [[TMP16:%.*]] = shl <2 x i32> [[DOTSPLAT]], splat (i32 1) ; UNROLL-NEXT: [[TMP17:%.*]] = mul nuw <2 x i32> [[DOTSPLAT]], ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: @@ -6293,9 +6291,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NO-IC-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 ; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = mul i32 [[DOTCAST]], [[STEP]] ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = mul <2 x i32> splat (i32 2), [[BROADCAST_SPLAT]] -; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[STEP]], i64 0 -; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer -; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = mul <2 x i32> , [[DOTSPLAT]] +; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = mul <2 x i32> , [[BROADCAST_SPLAT]] ; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i32> zeroinitializer, [[TMP18]] ; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-IC: vector.body: @@ -6365,12 +6361,10 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -8 ; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0 +; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; INTERLEAVE-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 ; INTERLEAVE-NEXT: [[IND_END:%.*]] = mul i32 [[STEP]], [[DOTCAST]] -; INTERLEAVE-NEXT: [[TMP15:%.*]] = shl <4 x 
i32> [[BROADCAST_SPLATINSERT]], -; INTERLEAVE-NEXT: [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP15]], <4 x i32> poison, <4 x i32> zeroinitializer -; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i64 0 -; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; INTERLEAVE-NEXT: [[TMP16:%.*]] = shl <4 x i32> [[DOTSPLAT]], splat (i32 2) ; INTERLEAVE-NEXT: [[TMP17:%.*]] = mul <4 x i32> [[DOTSPLAT]], ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: diff --git a/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll index 6cdd154f0e00e..8525b3aa5d349 100644 --- a/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/nested-loops-scev-expansion.ll @@ -240,11 +240,9 @@ define void @pr52024(ptr %dst, i16 %N) { ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = mul i16 24, [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i16> splat (i16 2), [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i16> poison, i16 [[REM_TRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT3]], <2 x i16> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i16> poison, i16 [[TMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i16> poison, i16 [[REM_TRUNC]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT5]], <2 x i16> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = mul <2 x i16> , [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP7:%.*]] = mul <2 x i16> , [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i16> zeroinitializer, [[TMP7]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: @@ -252,8 +250,8 @@ define void @pr52024(ptr %dst, i16 %N) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i16> [[VEC_IND]], [[TMP6]] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 8, [[INDEX]] -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i16> [[VEC_IND]], [[BROADCAST_SPLAT4]] -; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i16> [[VEC_IND]], [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i16> [[STEP_ADD]], [[BROADCAST_SPLAT6]] ; CHECK-NEXT: [[TMP10:%.*]] = zext <2 x i16> [[TMP8]] to <2 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i16> [[TMP9]] to <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST]], i32 [[OFFSET_IDX]] diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll index ee74f2225a425..18803e71f1041 100644 --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll @@ -41,9 +41,8 @@ define signext i32 @f1(ptr noalias %A, ptr noalias %B, i32 signext %n) { ; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], 
label [[VEC_EPILOG_ITER_CHECK:%.*]] ; VF-TWO-CHECK: vec.epilog.iter.check: -; VF-TWO-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 -; VF-TWO-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 +; VF-TWO-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; VF-TWO-CHECK: vec.epilog.ph: ; VF-TWO-CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; VF-TWO-CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 2 @@ -58,7 +57,7 @@ define signext i32 @f1(ptr noalias %A, ptr noalias %B, i32 signext %n) { ; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = add nsw <2 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD8]] ; VF-TWO-CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2 ; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]] -; VF-TWO-CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF-TWO-CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF-TWO-CHECK: vec.epilog.middle.block: ; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 ; VF-TWO-CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC4]] @@ -75,7 +74,7 @@ define signext i32 @f1(ptr noalias %A, ptr noalias %B, i32 signext %n) { ; VF-TWO-CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP17]] ; VF-TWO-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VF-TWO-CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]] +; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]] ; VF-TWO-CHECK: for.end.loopexit: ; VF-TWO-CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; VF-TWO-CHECK-NEXT: br label [[FOR_END]] diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll index ce77811e81562..1319d068145a8 100644 --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll @@ -45,8 +45,7 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], 
%[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -189,8 +188,7 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-NEXT: [[IND_END4:%.*]] = trunc i64 [[N_VEC]] to i32 -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -220,11 +218,11 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) ; CHECK-NEXT: br i1 [[CMP_N6]], label %[[FOR_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i32 [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi i32 [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ITER_CHECK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL5]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL9]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[TMP32:%.*]] = xor i32 [[I_014]], -1 ; CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP32]], [[N]] ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64 @@ -327,8 +325,7 @@ define void @f3(ptr noalias %A, i64 %n) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -383,8 +380,7 @@ define void @f3(ptr noalias %A, i64 %n) { ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK-PROFITABLE-BY-DEFAULT: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] 
= icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK-PROFITABLE-BY-DEFAULT: [[VEC_EPILOG_PH]]: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -505,12 +501,12 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: br i1 true, label %[[OUTER_LATCH]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 85, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 85, %[[VEC_EPILOG_ITER_CHECK]] ], [ 1, %[[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i8 [ [[IND_END4]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i64 [ 85, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 85, %[[VEC_EPILOG_ITER_CHECK]] ], [ 1, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i8 [ [[IND_END4]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] ; CHECK-NEXT: br label %[[INNER:.*]] ; CHECK: [[INNER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ] -; CHECK-NEXT: [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL6]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[INNER]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[INNER]] ] ; CHECK-NEXT: [[IV_2_NEXT]] = sub i8 [[IV_2]], [[TRUNC_ADD]] ; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: store i8 [[IV_2]], ptr [[GEP_DST]], align 1 @@ -592,12 +588,12 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst ; CHECK-PROFITABLE-BY-DEFAULT: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 true, label %[[OUTER_LATCH]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK-PROFITABLE-BY-DEFAULT: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 85, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 85, %[[VEC_EPILOG_ITER_CHECK]] ], [ 1, %[[ITER_CHECK]] ] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i8 [ [[IND_END4]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i64 [ 85, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 85, %[[VEC_EPILOG_ITER_CHECK]] ], [ 1, %[[ITER_CHECK]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i8 [ [[IND_END4]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label %[[INNER:.*]] ; CHECK-PROFITABLE-BY-DEFAULT: [[INNER]]: -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL3]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL6]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[INNER]] ] +; 
CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL14]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL15]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[INNER]] ] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV_2_NEXT]] = sub i8 [[IV_2]], [[TRUNC_ADD]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store i8 [[IV_2]], ptr [[GEP_DST]], align 1 @@ -665,8 +661,7 @@ define void @f4(ptr noalias %A, i32 signext %n) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -691,10 +686,10 @@ define void @f4(ptr noalias %A, i32 signext %n) { ; CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N5]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL9]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[IV]] to i32 ; CHECK-NEXT: [[CONV:%.*]] = trunc i32 [[TMP11]] to i8 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] @@ -732,8 +727,7 @@ define void @f4(ptr noalias %A, i32 signext %n) { ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK-PROFITABLE-BY-DEFAULT: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK-PROFITABLE-BY-DEFAULT: [[VEC_EPILOG_PH]]: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] @@ -758,10 +752,10 @@ define void @f4(ptr noalias %A, i32 signext %n) { ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CMP_N5:%.*]] = icmp eq i64 
[[WIDE_TRIP_COUNT]], [[N_VEC3]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[CMP_N5]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK-PROFITABLE-BY-DEFAULT: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL9:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label %[[LOOP:.*]] ; CHECK-PROFITABLE-BY-DEFAULT: [[LOOP]]: -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL9]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP11:%.*]] = trunc i64 [[IV]] to i32 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CONV:%.*]] = trunc i32 [[TMP11]] to i8 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll index f6e8de608645a..f4d4cca0d4220 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -2675,8 +2675,7 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP13]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[VEC_PHI]], [[TMP7]] -; CHECK-NEXT: [[TMP18:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP18]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP13]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) ; CHECK-NEXT: [[TMP11]] = add i32 [[TMP8]], [[TMP10]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 @@ -2746,12 +2745,10 @@ define i32 @predicated_not_dominates_reduction_twoadd(ptr nocapture noundef read ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP28]], <4 x i32> [[TMP30]], <4 x i32> zeroinitializer ; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add i32 [[VEC_PHI1]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = zext <4 x i8> [[TMP7]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP27]], <4 x i32> [[TMP31]], <4 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP27]], <4 x i32> [[TMP29]], <4 x i32> zeroinitializer ; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP15]]) ; CHECK-INTERLEAVED-NEXT: [[TMP17]] = add i32 [[TMP11]], [[TMP16]] -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = select <4 x i1> [[TMP28]], <4 x i32> [[TMP22]], <4 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = select <4 x i1> [[TMP28]], <4 x i32> [[TMP30]], <4 x i32> zeroinitializer ; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP18]]) ; CHECK-INTERLEAVED-NEXT: [[TMP20]] = add i32 [[TMP14]], [[TMP19]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll index cb0c778b95026..73d5e26ef82a2 100644 --- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll @@ -220,14 +220,18 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog ; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[STEP]], i32 1) ; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[UMAX]] ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP9]], 2 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[INDVAR_LCSSA1]], 2 +; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0) +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP3]], -1 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP15]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] ; CHECK: [[VECTOR_SCEVCHECK]]: ; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], 1 ; CHECK-NEXT: br i1 [[IDENT_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP9]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP9]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP15]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP15]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[IV_1_LCSSA]], [[N_VEC]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: @@ -239,7 +243,7 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP9]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP15]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ [[IV_1_LCSSA]], %[[LOOP_2_PREHEADER]] ], [ [[IV_1_LCSSA]], %[[VECTOR_SCEVCHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll index f04d034a59f31..04f04a8a08fc2 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll @@ -567,6 +567,73 @@ exit: ret ptr %res } +define i64 @loop_guards_needed_to_prove_deref_multiple(i32 %x, i1 %c, ptr dereferenceable(1024) %src) { +; CHECK-LABEL: define i64 @loop_guards_needed_to_prove_deref_multiple( +; CHECK-SAME: i32 [[X:%.*]], i1 [[C:%.*]], ptr dereferenceable(1024) [[SRC:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X_AND:%.*]] = and i32 [[X]], -2 +; CHECK-NEXT: [[PRE_0:%.*]] = icmp eq i32 [[X]], 0 +; CHECK-NEXT: br i1 [[PRE_0]], label [[THEN:%.*]], label [[EXIT:%.*]] +; CHECK: then: +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C]], i32 [[X_AND]], i32 0 +; CHECK-NEXT: [[PRE_1:%.*]] = icmp ugt i32 [[SEL]], 1024 +; CHECK-NEXT: br i1 [[PRE_1]], label [[EXIT]], label 
[[PH:%.*]] +; CHECK: ph: +; CHECK-NEXT: [[PRE_2:%.*]] = icmp ne i32 [[SEL]], 0 +; CHECK-NEXT: call void @llvm.assume(i1 [[PRE_2]]) +; CHECK-NEXT: [[N:%.*]] = add i32 [[SEL]], -1 +; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[PH]] ] +; CHECK-NEXT: [[GEP_SRC_I:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I]], align 1 +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8 [[L]], 0 +; CHECK-NEXT: br i1 [[C_1]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[N_EXT]] +; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP_HEADER]] +; CHECK: exit.loopexit: +; CHECK-NEXT: [[RES_PH:%.*]] = phi i64 [ [[IV]], [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[RES:%.*]] = phi i64 [ -1, [[ENTRY:%.*]] ], [ -2, [[THEN]] ], [ [[RES_PH]], [[EXIT_LOOPEXIT]] ] +; CHECK-NEXT: ret i64 [[RES]] +; +entry: + %x.and = and i32 %x, -2 + %pre.0 = icmp eq i32 %x, 0 + br i1 %pre.0, label %then, label %exit + +then: + %sel = select i1 %c, i32 %x.and, i32 0 + %pre.1 = icmp ugt i32 %sel, 1024 + br i1 %pre.1, label %exit, label %ph + +ph: + %pre.2 = icmp ne i32 %sel, 0 + call void @llvm.assume(i1 %pre.2) + %n = add i32 %sel, -1 + %n.ext = zext i32 %n to i64 + br label %loop.header + +loop.header: + %iv = phi i64 [ %iv.next, %loop.latch ], [ 0, %ph ] + %gep.src.i = getelementptr i8, ptr %src, i64 %iv + %l = load i8, ptr %gep.src.i, align 1 + %c.1 = icmp eq i8 %l, 0 + br i1 %c.1, label %exit, label %loop.latch + +loop.latch: + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv, %n.ext + br i1 %ec, label %exit, label %loop.header + +exit: + %res = phi i64 [ -1, %entry ], [ -2, %then ], [ 0, %loop.latch ], [ %iv, %loop.header ] + ret i64 %res +} ;. 
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll index 5c622f825beaf..99916a503750a 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return-replicate.ll @@ -453,6 +453,221 @@ exit: ret void } +define void @struct_return_2xf32_replicate_predicated(ptr %a) { +; VF4-LABEL: define void @struct_return_2xf32_replicate_predicated( +; VF4-SAME: ptr [[A:%.*]]) { +; VF4-NEXT: [[ENTRY:.*:]] +; VF4-NEXT: br label %[[VECTOR_PH:.*]] +; VF4: [[VECTOR_PH]]: +; VF4-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4: [[VECTOR_BODY]]: +; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ] +; VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] +; VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 8 +; VF4-NEXT: [[TMP1:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], zeroinitializer +; VF4-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; VF4-NEXT: br i1 [[TMP2]], label %[[PRED_CALL_IF:.*]], label %[[PRED_CALL_CONTINUE:.*]] +; VF4: [[PRED_CALL_IF]]: +; VF4-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 0 +; VF4-NEXT: [[TMP4:%.*]] = tail call { float, float } @fn2(float [[TMP3]]) #[[ATTR3:[0-9]+]] +; VF4-NEXT: [[TMP5:%.*]] = extractvalue { float, float } [[TMP4]], 0 +; VF4-NEXT: [[TMP6:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i32 0 +; VF4-NEXT: [[TMP7:%.*]] = insertvalue { <4 x float>, <4 x float> } poison, <4 x float> [[TMP6]], 0 +; VF4-NEXT: [[TMP8:%.*]] = extractvalue { float, float } [[TMP4]], 1 +; VF4-NEXT: [[TMP9:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP7]], 1 +; VF4-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP8]], i32 0 +; VF4-NEXT: [[TMP11:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP7]], <4 x float> [[TMP10]], 1 +; VF4-NEXT: br label %[[PRED_CALL_CONTINUE]] +; VF4: [[PRED_CALL_CONTINUE]]: +; VF4-NEXT: [[TMP12:%.*]] = phi { <4 x float>, <4 x float> } [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_CALL_IF]] ] +; VF4-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; VF4-NEXT: br i1 [[TMP13]], label %[[PRED_CALL_IF1:.*]], label %[[PRED_CALL_CONTINUE2:.*]] +; VF4: [[PRED_CALL_IF1]]: +; VF4-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 1 +; VF4-NEXT: [[TMP15:%.*]] = tail call { float, float } @fn2(float [[TMP14]]) #[[ATTR3]] +; VF4-NEXT: [[TMP16:%.*]] = extractvalue { float, float } [[TMP15]], 0 +; VF4-NEXT: [[TMP17:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP12]], 0 +; VF4-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP16]], i32 1 +; VF4-NEXT: [[TMP19:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP12]], <4 x float> [[TMP18]], 0 +; VF4-NEXT: [[TMP20:%.*]] = extractvalue { float, float } [[TMP15]], 1 +; VF4-NEXT: [[TMP21:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP19]], 1 +; VF4-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP20]], i32 1 +; VF4-NEXT: [[TMP23:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP19]], <4 x float> [[TMP22]], 1 +; VF4-NEXT: br label %[[PRED_CALL_CONTINUE2]] +; VF4: [[PRED_CALL_CONTINUE2]]: +; VF4-NEXT: [[TMP24:%.*]] = phi { <4 x float>, <4 x float> } [ [[TMP12]], 
%[[PRED_CALL_CONTINUE]] ], [ [[TMP19]], %[[PRED_CALL_IF1]] ] +; VF4-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; VF4-NEXT: br i1 [[TMP25]], label %[[PRED_CALL_IF3:.*]], label %[[PRED_CALL_CONTINUE4:.*]] +; VF4: [[PRED_CALL_IF3]]: +; VF4-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 2 +; VF4-NEXT: [[TMP27:%.*]] = tail call { float, float } @fn2(float [[TMP26]]) #[[ATTR3]] +; VF4-NEXT: [[TMP28:%.*]] = extractvalue { float, float } [[TMP27]], 0 +; VF4-NEXT: [[TMP29:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP24]], 0 +; VF4-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP28]], i32 2 +; VF4-NEXT: [[TMP31:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP24]], <4 x float> [[TMP30]], 0 +; VF4-NEXT: [[TMP32:%.*]] = extractvalue { float, float } [[TMP27]], 1 +; VF4-NEXT: [[TMP33:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP31]], 1 +; VF4-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[TMP33]], float [[TMP32]], i32 2 +; VF4-NEXT: [[TMP35:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP31]], <4 x float> [[TMP34]], 1 +; VF4-NEXT: br label %[[PRED_CALL_CONTINUE4]] +; VF4: [[PRED_CALL_CONTINUE4]]: +; VF4-NEXT: [[TMP36:%.*]] = phi { <4 x float>, <4 x float> } [ [[TMP24]], %[[PRED_CALL_CONTINUE2]] ], [ [[TMP31]], %[[PRED_CALL_IF3]] ] +; VF4-NEXT: [[TMP37:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 +; VF4-NEXT: br i1 [[TMP37]], label %[[PRED_CALL_IF5:.*]], label %[[PRED_CALL_CONTINUE6:.*]] +; VF4: [[PRED_CALL_IF5]]: +; VF4-NEXT: [[TMP38:%.*]] = extractelement <4 x float> [[WIDE_LOAD]], i32 3 +; VF4-NEXT: [[TMP39:%.*]] = tail call { float, float } @fn2(float [[TMP38]]) #[[ATTR3]] +; VF4-NEXT: [[TMP40:%.*]] = extractvalue { float, float } [[TMP39]], 0 +; VF4-NEXT: [[TMP41:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP36]], 0 +; VF4-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP40]], i32 3 +; VF4-NEXT: [[TMP43:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP36]], <4 x float> [[TMP42]], 0 +; VF4-NEXT: [[TMP44:%.*]] = extractvalue { float, float } [[TMP39]], 1 +; VF4-NEXT: [[TMP45:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP43]], 1 +; VF4-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP44]], i32 3 +; VF4-NEXT: [[TMP47:%.*]] = insertvalue { <4 x float>, <4 x float> } [[TMP43]], <4 x float> [[TMP46]], 1 +; VF4-NEXT: br label %[[PRED_CALL_CONTINUE6]] +; VF4: [[PRED_CALL_CONTINUE6]]: +; VF4-NEXT: [[TMP48:%.*]] = phi { <4 x float>, <4 x float> } [ [[TMP36]], %[[PRED_CALL_CONTINUE4]] ], [ [[TMP43]], %[[PRED_CALL_IF5]] ] +; VF4-NEXT: [[TMP49:%.*]] = extractvalue { <4 x float>, <4 x float> } [[TMP48]], 0 +; VF4-NEXT: [[TMP50:%.*]] = fdiv <4 x float> [[TMP49]], [[WIDE_LOAD]] +; VF4-NEXT: [[TMP51:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; VF4-NEXT: br i1 [[TMP51]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF4: [[PRED_STORE_IF]]: +; VF4-NEXT: [[TMP52:%.*]] = add i64 [[INDEX]], 0 +; VF4-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP52]] +; VF4-NEXT: [[TMP54:%.*]] = extractelement <4 x float> [[TMP50]], i32 0 +; VF4-NEXT: store float [[TMP54]], ptr [[TMP53]], align 8 +; VF4-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF4: [[PRED_STORE_CONTINUE]]: +; VF4-NEXT: [[TMP55:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; VF4-NEXT: br i1 [[TMP55]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; VF4: [[PRED_STORE_IF7]]: +; VF4-NEXT: [[TMP56:%.*]] = add i64 [[INDEX]], 1 +; VF4-NEXT: 
[[TMP57:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP56]] +; VF4-NEXT: [[TMP58:%.*]] = extractelement <4 x float> [[TMP50]], i32 1 +; VF4-NEXT: store float [[TMP58]], ptr [[TMP57]], align 8 +; VF4-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; VF4: [[PRED_STORE_CONTINUE8]]: +; VF4-NEXT: [[TMP59:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; VF4-NEXT: br i1 [[TMP59]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; VF4: [[PRED_STORE_IF9]]: +; VF4-NEXT: [[TMP60:%.*]] = add i64 [[INDEX]], 2 +; VF4-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP60]] +; VF4-NEXT: [[TMP62:%.*]] = extractelement <4 x float> [[TMP50]], i32 2 +; VF4-NEXT: store float [[TMP62]], ptr [[TMP61]], align 8 +; VF4-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; VF4: [[PRED_STORE_CONTINUE10]]: +; VF4-NEXT: [[TMP63:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3 +; VF4-NEXT: br i1 [[TMP63]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] +; VF4: [[PRED_STORE_IF11]]: +; VF4-NEXT: [[TMP64:%.*]] = add i64 [[INDEX]], 3 +; VF4-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP64]] +; VF4-NEXT: [[TMP66:%.*]] = extractelement <4 x float> [[TMP50]], i32 3 +; VF4-NEXT: store float [[TMP66]], ptr [[TMP65]], align 8 +; VF4-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; VF4: [[PRED_STORE_CONTINUE12]]: +; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF4-NEXT: [[TMP67:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; VF4-NEXT: br i1 [[TMP67]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VF4: [[MIDDLE_BLOCK]]: +; +; VF2IC2-LABEL: define void @struct_return_2xf32_replicate_predicated( +; VF2IC2-SAME: ptr [[A:%.*]]) { +; VF2IC2-NEXT: [[ENTRY:.*:]] +; VF2IC2-NEXT: br label %[[VECTOR_PH:.*]] +; VF2IC2: [[VECTOR_PH]]: +; VF2IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF2IC2: [[VECTOR_BODY]]: +; VF2IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ] +; VF2IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] +; VF2IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i32 2 +; VF2IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 8 +; VF2IC2-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, ptr [[TMP1]], align 8 +; VF2IC2-NEXT: [[TMP2:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD]], zeroinitializer +; VF2IC2-NEXT: [[TMP3:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD1]], zeroinitializer +; VF2IC2-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; VF2IC2: [[PRED_STORE_IF]]: +; VF2IC2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 +; VF2IC2-NEXT: [[TMP6:%.*]] = tail call { float, float } @fn2(float [[TMP5]]) #[[ATTR3:[0-9]+]] +; VF2IC2-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; VF2IC2-NEXT: [[TMP8:%.*]] = extractvalue { float, float } [[TMP6]], 0 +; VF2IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] +; VF2IC2-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 0 +; VF2IC2-NEXT: [[TMP11:%.*]] = fdiv float [[TMP8]], [[TMP10]] +; VF2IC2-NEXT: store float [[TMP11]], ptr [[TMP9]], align 8 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; VF2IC2: [[PRED_STORE_CONTINUE]]: +; VF2IC2-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3:.*]] +; 
VF2IC2: [[PRED_STORE_IF2]]: +; VF2IC2-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 +; VF2IC2-NEXT: [[TMP14:%.*]] = tail call { float, float } @fn2(float [[TMP13]]) #[[ATTR3]] +; VF2IC2-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 1 +; VF2IC2-NEXT: [[TMP16:%.*]] = extractvalue { float, float } [[TMP14]], 0 +; VF2IC2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP15]] +; VF2IC2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[WIDE_LOAD]], i32 1 +; VF2IC2-NEXT: [[TMP19:%.*]] = fdiv float [[TMP16]], [[TMP18]] +; VF2IC2-NEXT: store float [[TMP19]], ptr [[TMP17]], align 8 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE3]] +; VF2IC2: [[PRED_STORE_CONTINUE3]]: +; VF2IC2-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 +; VF2IC2-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]] +; VF2IC2: [[PRED_STORE_IF4]]: +; VF2IC2-NEXT: [[TMP21:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0 +; VF2IC2-NEXT: [[TMP22:%.*]] = tail call { float, float } @fn2(float [[TMP21]]) #[[ATTR3]] +; VF2IC2-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 2 +; VF2IC2-NEXT: [[TMP24:%.*]] = extractvalue { float, float } [[TMP22]], 0 +; VF2IC2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] +; VF2IC2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 0 +; VF2IC2-NEXT: [[TMP27:%.*]] = fdiv float [[TMP24]], [[TMP26]] +; VF2IC2-NEXT: store float [[TMP27]], ptr [[TMP25]], align 8 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE5]] +; VF2IC2: [[PRED_STORE_CONTINUE5]]: +; VF2IC2-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 +; VF2IC2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]] +; VF2IC2: [[PRED_STORE_IF6]]: +; VF2IC2-NEXT: [[TMP29:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1 +; VF2IC2-NEXT: [[TMP30:%.*]] = tail call { float, float } @fn2(float [[TMP29]]) #[[ATTR3]] +; VF2IC2-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 3 +; VF2IC2-NEXT: [[TMP32:%.*]] = extractvalue { float, float } [[TMP30]], 0 +; VF2IC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP31]] +; VF2IC2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[WIDE_LOAD1]], i32 1 +; VF2IC2-NEXT: [[TMP35:%.*]] = fdiv float [[TMP32]], [[TMP34]] +; VF2IC2-NEXT: store float [[TMP35]], ptr [[TMP33]], align 8 +; VF2IC2-NEXT: br label %[[PRED_STORE_CONTINUE7]] +; VF2IC2: [[PRED_STORE_CONTINUE7]]: +; VF2IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF2IC2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; VF2IC2-NEXT: br i1 [[TMP36]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; VF2IC2: [[MIDDLE_BLOCK]]: +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.inc ] + %arrayidx = getelementptr inbounds float, ptr %a, i64 %iv + %in_val = load float, ptr %arrayidx, align 8 + %sgt_zero = fcmp ogt float %in_val, 0.0 + br i1 %sgt_zero, label %if.then, label %for.inc + +if.then: + %call = tail call { float, float } @fn2(float %in_val) #3 + %extract_a = extractvalue { float, float } %call, 0 + %div = fdiv float %extract_a, %in_val + store float %div, ptr %arrayidx, align 8 + br label %for.inc + +for.inc: + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret void +} + declare { i64 } @fn1(float) declare { float, float } @fn2(float) declare { i32, i32, i32 } @fn3(i32) @@ -464,3 +679,4 
@@ declare { <8 x i32>, <8 x i32>, <8 x i32> } @fixed_vec_fn3(<8 x i32>) attributes #0 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn1(fixed_vec_fn1)" } attributes #1 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn2(fixed_vec_fn2)" } attributes #2 = { nounwind "vector-function-abi-variant"="_ZGVnN8v_fn3(fixed_vec_fn3)" } +attributes #3 = { nounwind "vector-function-abi-variant"="_ZGVnM8v_fn2(fixed_vec_fn2)" } diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll index b721e9e489804..f2e2e2846614b 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 6 ; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize 2>%t | FileCheck %s ; RUN: cat %t | FileCheck --check-prefix=CHECK-REMARKS %s @@ -7,14 +8,30 @@ target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" ; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_f32_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @struct_return_f32_widen -; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) -; CHECK: vector.body: -; CHECK: [[WIDE_CALL:%.*]] = call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD:%.*]]) -; CHECK: [[WIDE_A:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_CALL]], 0 -; CHECK: [[WIDE_B:%.*]] = extractvalue { <2 x float>, <2 x float> } [[WIDE_CALL]], 1 -; CHECK: store <2 x float> [[WIDE_A]], ptr {{%.*}}, align 4 -; CHECK: store <2 x float> [[WIDE_B]], ptr {{%.*}}, align 4 +; CHECK-LABEL: define void @struct_return_f32_widen( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[TMP5]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] +; entry: br label %for.body @@ -39,14 +56,30 @@ exit: ; CHECK-REMARKS: remark: {{.*}} 
vectorized loop define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @struct_return_f64_widen -; CHECK-SAME: (ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) -; CHECK: vector.body: -; CHECK: [[WIDE_CALL:%.*]] = call { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double> [[WIDE_LOAD:%.*]]) -; CHECK: [[WIDE_A:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 0 -; CHECK: [[WIDE_B:%.*]] = extractvalue { <2 x double>, <2 x double> } [[WIDE_CALL]], 1 -; CHECK: store <2 x double> [[WIDE_A]], ptr {{%.*}}, align 8 -; CHECK: store <2 x double> [[WIDE_B]], ptr {{%.*}}, align 8 +; CHECK-LABEL: define void @struct_return_f64_widen( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = call { <2 x double>, <2 x double> } @fixed_vec_bar(<2 x double> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <2 x double>, <2 x double> } [[TMP1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[OUT_A]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[OUT_B]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP5]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] +; entry: br label %for.body @@ -71,14 +104,43 @@ exit: ; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, ptr writeonly %out_b) { -; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks -; CHECK-SAME: (ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]]) -; CHECK: entry: -; CHECK: br label %vector.memcheck -; CHECK: vector.memcheck: -; CHECK: vector.body: -; CHECK: call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD:%.*]]) -; CHECK: for.body: +; CHECK-LABEL: define void @struct_return_f32_widen_rt_checks( +; CHECK-SAME: ptr [[IN:%.*]], ptr writeonly [[OUT_A:%.*]], ptr writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[IN3:%.*]] = ptrtoint ptr [[IN]] to i32 +; CHECK-NEXT: [[OUT_A2:%.*]] = ptrtoint ptr [[OUT_A]] to i32 +; CHECK-NEXT: [[OUT_B1:%.*]] = ptrtoint ptr [[OUT_B]] to i32 +; CHECK-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = sub i32 [[OUT_B1]], [[OUT_A2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[OUT_A2]], [[IN3]] +; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i32 [[TMP1]], 8 +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or 
i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[OUT_B1]], [[IN3]] +; CHECK-NEXT: [[DIFF_CHECK5:%.*]] = icmp ult i32 [[TMP2]], 8 +; CHECK-NEXT: [[CONFLICT_RDX6:%.*]] = or i1 [[CONFLICT_RDX]], [[DIFF_CHECK5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX6]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = call { <2 x float>, <2 x float> } @fixed_vec_foo(<2 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP4]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH]]: +; ; CHECK call { float, float } @foo(float [[LOAD:%.*]]) entry: br label %for.body @@ -105,9 +167,28 @@ exit: ; TODO: Allow mixed-struct type vectorization and mark overflow intrinsics as trivially vectorizable. 
; CHECK-REMARKS: remark: {{.*}} loop not vectorized: call instruction cannot be vectorized define void @test_overflow_intrinsic(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @test_overflow_intrinsic -; CHECK-NOT: vector.body: -; CHECK-NOT: @llvm.sadd.with.overflow.v{{.+}}i32 +; CHECK-LABEL: define void @test_overflow_intrinsic( +; CHECK-SAME: ptr noalias readonly [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[IN_VAL]], i32 [[IN_VAL]]) +; CHECK-NEXT: [[EXTRACT_RET:%.*]] = extractvalue { i32, i1 } [[CALL]], 0 +; CHECK-NEXT: [[EXTRACT_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[CALL]], 1 +; CHECK-NEXT: [[ZEXT_OVERFLOW:%.*]] = zext i1 [[EXTRACT_OVERFLOW]] to i8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store i32 [[EXTRACT_RET]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[OUT_B]], i64 [[IV]] +; CHECK-NEXT: store i8 [[ZEXT_OVERFLOW]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -133,9 +214,27 @@ exit: ; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @struct_return_i32_three_results_widen(ptr noalias %in, ptr noalias writeonly %out_a) { -; CHECK-LABEL: define void @struct_return_i32_three_results_widen -; CHECK: vector.body: -; CHECK: call { <2 x i32>, <2 x i32>, <2 x i32> } @fixed_vec_qux(<2 x i32> [[WIDE_LOAD:%.*]]) +; CHECK-LABEL: define void @struct_return_i32_three_results_widen( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @fixed_vec_qux(<2 x i32> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[OUT_A]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[TMP3]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] +; entry: br label %for.body @@ -159,10 +258,50 @@ exit: ; (mainly it does not crash). 
; CHECK-REMARKS: remark: {{.*}} vectorized loop define void @scalarized_predicated_struct_return(ptr %a) { -; CHECK-LABEL: define void @scalarized_predicated_struct_return -; CHECK: vector.body: -; CHECK: pred.store.if: -; CHECK: tail call { i64, i64 } @bar_i64(i64 {{.+}}) +; CHECK-LABEL: define void @scalarized_predicated_struct_return( +; CHECK-SAME: ptr [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 +; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP3]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i64, i64 } [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP8]] +; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP9]], align 8 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 +; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] +; CHECK: [[PRED_STORE_IF1]]: +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP11]]) #[[ATTR4]] +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { i64, i64 } [[TMP12]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = udiv i64 [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP16]] +; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] +; CHECK: [[PRED_STORE_CONTINUE2]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br [[EXIT:label %.*]] +; CHECK: [[SCALAR_PH:.*:]] +; entry: br label %for.body @@ -192,8 +331,27 @@ exit: ; Negative test. Widening structs of vectors is not supported. 
; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_struct_of_vectors(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @negative_struct_of_vectors -; CHECK-NOT: vector.body: +; CHECK-LABEL: define void @negative_struct_of_vectors( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load <1 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call { <1 x float>, <1 x float> } @foo(<1 x float> [[IN_VAL]]) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { <1 x float>, <1 x float> } [[CALL]], 0 +; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue { <1 x float>, <1 x float> } [[CALL]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store <1 x float> [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[IV]] +; CHECK-NEXT: store <1 x float> [[EXTRACT_B]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -219,9 +377,27 @@ exit: ; Negative test. Widening structs with mixed element types is not supported. 
; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_mixed_element_type_struct_return(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @negative_mixed_element_type_struct_return -; CHECK-NOT: vector.body: -; CHECK-NOT: call {{.*}} @fixed_vec_baz +; CHECK-LABEL: define void @negative_mixed_element_type_struct_return( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call { float, i32 } @baz(float [[IN_VAL]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { float, i32 } [[CALL]], 0 +; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue { float, i32 } [[CALL]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[OUT_B]], i64 [[IV]] +; CHECK-NEXT: store i32 [[EXTRACT_B]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -249,9 +425,27 @@ exit: ; Negative test. Widening non-literal structs is not supported. 
; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_named_struct_return(ptr noalias readonly %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @negative_named_struct_return -; CHECK-NOT: vector.body: -; CHECK-NOT: call {{.*}} @fixed_vec_bar +; CHECK-LABEL: define void @negative_named_struct_return( +; CHECK-SAME: ptr noalias readonly [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load double, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = tail call [[NAMED_STRUCT:%.*]] @[[BAR_NAMED:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](double [[IN_VAL]]) #[[ATTR6:[0-9]+]] +; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue [[NAMED_STRUCT]] [[CALL]], 0 +; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue [[NAMED_STRUCT]] [[CALL]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store double [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[OUT_B]], i64 [[IV]] +; CHECK-NEXT: store double [[EXTRACT_B]], ptr [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -277,8 +471,28 @@ exit: ; Negative test. Nested homogeneous structs are not supported. 
; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_nested_struct(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @negative_nested_struct -; CHECK-NOT: vector.body: +; CHECK-LABEL: define void @negative_nested_struct( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call { { float, float } } @foo_nested_struct(float [[IN_VAL]]) #[[ATTR1]] +; CHECK-NEXT: [[EXTRACT_INNER:%.*]] = extractvalue { { float, float } } [[CALL]], 0 +; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[EXTRACT_INNER]], 0 +; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[EXTRACT_INNER]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_B]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -305,8 +519,24 @@ exit: ; Negative test. The second element of the struct cannot be widened. ; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_non_widenable_element(ptr noalias %in, ptr noalias writeonly %out_a) { -; CHECK-LABEL: define void @negative_non_widenable_element -; CHECK-NOT: vector.body: +; CHECK-LABEL: define void @negative_non_widenable_element( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call { float, [1 x float] } @foo_one_non_widenable_element(float [[IN_VAL]]) #[[ATTR1]] +; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { float, [1 x float] } [[CALL]], 0 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -329,8 +559,28 @@ exit: ; Negative test. Homogeneous structs of arrays are not supported. 
; CHECK-REMARKS-COUNT: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_struct_array_elements(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @negative_struct_array_elements -; CHECK-NOT: vector.body: +; CHECK-LABEL: define void @negative_struct_array_elements( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call { [2 x float] } @foo_arrays(float [[IN_VAL]]) #[[ATTR1]] +; CHECK-NEXT: [[EXTRACT_INNER:%.*]] = extractvalue { [2 x float] } [[CALL]], 0 +; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue [2 x float] [[EXTRACT_INNER]], 0 +; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue [2 x float] [[EXTRACT_INNER]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_B]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -357,8 +607,26 @@ exit: ; Negative test. Widening struct loads is not supported. ; CHECK-REMARKS: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_struct_load(ptr noalias %in, ptr noalias writeonly %out_a, ptr noalias writeonly %out_b) { -; CHECK-LABEL: define void @negative_struct_load -; CHECK-NOT: vector.body: +; CHECK-LABEL: define void @negative_struct_load( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT_A:%.*]], ptr noalias writeonly [[OUT_B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds { float, float }, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[CALL:%.*]] = load { float, float }, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[CALL]], 0 +; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[CALL]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[OUT_B]], i64 [[IV]] +; CHECK-NEXT: store float [[EXTRACT_B]], ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -383,8 +651,23 @@ exit: ; Negative test. Widening struct stores is not supported. 
; CHECK-REMARKS: remark: {{.*}} loop not vectorized: instruction return type cannot be vectorized define void @negative_struct_return_store_struct(ptr noalias %in, ptr noalias writeonly %out) { -; CHECK-LABEL: define void @negative_struct_return_store_struct -; CHECK-NOT: vector.body: +; CHECK-LABEL: define void @negative_struct_return_store_struct( +; CHECK-SAME: ptr noalias [[IN:%.*]], ptr noalias writeonly [[OUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds { float, float }, ptr [[IN]], i64 [[IV]] +; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CALL:%.*]] = tail call { float, float } @foo(float [[IN_VAL]]) #[[ATTR1]] +; CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr inbounds { float, float }, ptr [[OUT]], i64 [[IV]] +; CHECK-NEXT: store { float, float } [[CALL]], ptr [[OUT_PTR]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll index 0b86a2280b529..027dcaf771072 100644 --- a/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll +++ b/llvm/test/Transforms/LoopVectorize/version-stride-with-integer-casts.ll @@ -22,13 +22,11 @@ define void @test_versioned_with_sext_use(i32 %offset, ptr %dst) { ; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1 ; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]] -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], [[TMP0]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], 200 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[INDEX]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]] ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -94,13 +92,11 @@ define void @test_versioned_with_zext_use(i32 %offset, ptr %dst) { ; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1 ; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]] -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], [[TMP0]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], 200 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[INDEX]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP3]] ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP4]], 
align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -233,13 +229,11 @@ define void @test_versioned_with_different_uses(i32 %offset, ptr noalias %dst.1, ; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[OFFSET]], 1 ; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP0:%.*]] = mul i64 200, [[OFFSET_EXT]] -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], [[TMP0]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IV_1]], 200 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[INDEX]], [[OFFSET_EXT]] -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IV_1]], [[INDEX]] ; CHECK-NEXT: [[OFFSET_IDX2:%.*]] = trunc i64 [[INDEX]] to i32 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX2]], 0 ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX2]], 1 @@ -414,26 +408,20 @@ define void @zext_of_i1_stride(i1 %g, ptr %dst) mustprogress { ; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i1 [[G]], true ; CHECK-NEXT: br i1 [[IDENT_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], [[G_64]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[G_64]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]] ; CHECK-NEXT: store <4 x i16> splat (i16 1), ptr [[TMP4]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: store i16 [[G_16]], ptr [[GEP]], align 2 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], [[G_64]] diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning2.ll b/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning2.ll new file mode 100644 index 0000000000000..18def1d41c30c --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/funcassigncloning2.ll @@ -0,0 +1,122 @@ +;; Similar to funcassigncloning.ll but hand modified to add another allocation +;; whose pruned cold context only includes an 
immediate caller node that itself +;; doesn't need cloning, but calls a cloned allocating function, and is in a +;; function that gets cloned multiple times for a different callsite. This test +;; makes sure the non-cloned callsite is correctly updated in all function +;; clones. This case was missed because, due to context pruning, we don't have +;; any caller edges for the first callsite, so the handling that kicks in to +;; "reclone" other callsites in cloned functions was being missed. + +; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -pass-remarks=memprof-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=IR --check-prefix=REMARKS + + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +;; Eventually this function will be cloned several times (for the calls to new +;; for the various callers). However, function blah() includes an allocation +;; whose cold context was trimmed above here. We therefore should assume that +;; every caller of this function should call the same version of blah (which +;; will be the cloned ".memprof.1" version). +define internal void @_Z1EPPcS0_(ptr %buf1, ptr %buf2) #0 { +entry: + call void @blah(), !callsite !19 + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !0, !callsite !7 + %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !8, !callsite !15 + ret void +} + +; REMARKS: created clone blah.memprof.1 +; REMARKS: call in clone _Z1EPPcS0_ assigned to call function clone blah.memprof.1 +; REMARKS: call in clone _Z1EPPcS0_.memprof.1 assigned to call function clone blah.memprof.1 +; REMARKS: call in clone _Z1EPPcS0_.memprof.2 assigned to call function clone blah.memprof.1 +; REMARKS: call in clone _Z1EPPcS0_.memprof.3 assigned to call function clone blah.memprof.1 + +; IR: define {{.*}} @_Z1EPPcS0_ +; IR: call {{.*}} @blah.memprof.1() +; IR: define {{.*}} @_Z1EPPcS0_.memprof.1 +; IR: call {{.*}} @blah.memprof.1() +; IR: define {{.*}} @_Z1EPPcS0_.memprof.2 +; IR: call {{.*}} @blah.memprof.1() +; IR: define {{.*}} @_Z1EPPcS0_.memprof.3 +; IR: call {{.*}} @blah.memprof.1() + +declare ptr @_Znam(i64) #1 + +define internal void @_Z1BPPcS0_(ptr %0, ptr %1) { +entry: + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !16 + ret void +} + +; Function Attrs: noinline +define internal void @_Z1CPPcS0_(ptr %0, ptr %1) #2 { +entry: + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !17 + ret void +} + +define internal void @_Z1DPPcS0_(ptr %0, ptr %1) #3 { +entry: + call void @_Z1EPPcS0_(ptr noundef %0, ptr noundef %1), !callsite !18 + ret void +} + +define internal void @blah() #0 { +entry: + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !memprof !22, !callsite !21 + ret void +} + +define internal void @foo() #0 { +entry: + call void @blah(), !callsite !20 + ret void +} + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 + +declare i32 @sleep() #5 + +; uselistorder directives +uselistorder ptr @_Znam, { 1, 0, 2 } + +attributes #0 = { "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" } +attributes #1 = { "no-trapping-math"="true" } +attributes #2 = { noinline } +attributes #3 = { "frame-pointer"="all" } +attributes #4 = { nocallback nofree nounwind
willreturn memory(argmem: write) } +attributes #5 = { "disable-tail-calls"="true" } +attributes #6 = { builtin } + +!0 = !{!1, !3, !5} +!1 = !{!2, !"cold"} +!2 = !{i64 -3461278137325233666, i64 -7799663586031895603} +!3 = !{!4, !"notcold"} +!4 = !{i64 -3461278137325233666, i64 -3483158674395044949} +!5 = !{!6, !"notcold"} +!6 = !{i64 -3461278137325233666, i64 -2441057035866683071} +!7 = !{i64 -3461278137325233666} +!8 = !{!9, !11, !13} +!9 = !{!10, !"notcold"} +!10 = !{i64 -1415475215210681400, i64 -2441057035866683071} +!11 = !{!12, !"cold"} +!12 = !{i64 -1415475215210681400, i64 -3483158674395044949} +!13 = !{!14, !"notcold"} +!14 = !{i64 -1415475215210681400, i64 -7799663586031895603} +!15 = !{i64 -1415475215210681400} +!16 = !{i64 -2441057035866683071} +!17 = !{i64 -3483158674395044949} +!18 = !{i64 -7799663586031895603} +!19 = !{i64 123} +!20 = !{i64 234} +!21 = !{i64 345} +!22 = !{!23, !25} +!23 = !{!24, !"cold"} +!24 = !{i64 345, i64 123} +!25 = !{!26, !"notcold"} +!26 = !{i64 345, i64 234} diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll index e301fa03ea099..0bf622276b328 100644 --- a/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll +++ b/llvm/test/Transforms/MemProfContextDisambiguation/recursive.ll @@ -47,7 +47,7 @@ ; RUN: -memprof-allow-recursive-callsites=true \ ; RUN: -memprof-clone-recursive-contexts=false \ ; RUN: %s -S 2>&1 | FileCheck %s \ -; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \ +; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Bi.memprof" \ ; RUN: --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=ALLOW-RECUR-CONTEXTS ;; Skipping recursive callsites should result in no cloning. 
@@ -56,7 +56,7 @@ ; RUN: -pass-remarks=memprof-context-disambiguation \ ; RUN: -memprof-allow-recursive-callsites=false \ ; RUN: %s -S 2>&1 | FileCheck %s \ -; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \ +; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Bi.memprof" \ ; RUN: --implicit-check-not="created clone" \ ; RUN: --implicit-check-not="marked with memprof allocation attribute cold" \ ; RUN: --check-prefix=ALL @@ -87,7 +87,7 @@ ; RUN: -memprof-allow-recursive-contexts=false \ ; RUN: -memprof-clone-recursive-contexts=false \ ; RUN: %s -S 2>&1 | FileCheck %s \ -; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned" \ +; RUN: --implicit-check-not "memprof_recursive.cc:12:10: call in clone _Z1Ci.memprof.1 assigned to call function clone _Z1Bi.memprof" \ ; RUN: --check-prefix=ALL --check-prefix=ALLOW-RECUR-CALLSITES --check-prefix=SKIP-RECUR-CONTEXTS ; ALLOW-RECUR-CALLSITES: memprof_recursive.cc:4:0: created clone _Z1Dv.memprof.1 diff --git a/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll b/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll new file mode 100644 index 0000000000000..b29834f9fe960 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/chr-lifetimes.ll @@ -0,0 +1,245 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes='require,chr' -S | FileCheck %s + +declare void @foo() +declare void @bar() +declare void @baz(i64) + +; Test that when we have a static alloca in an entry block that will get split, +; the alloca remains static and we preserve its lifetime annotations. +define void @test_chr_with_lifetimes(ptr %i) !prof !14 { +; CHECK-LABEL: @test_chr_with_lifetimes( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TEST:%.*]] = alloca i32, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = freeze i1 [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = select i1 true, i1 [[TMP9]], i1 false +; CHECK-NEXT: [[TMP11:%.*]] = freeze i1 [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP10]], i1 [[TMP11]], i1 false +; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF15:![0-9]+]] +; CHECK: entry.split: +; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF16:![0-9]+]] +; CHECK-NEXT: call void @baz(i64 [[TMP6]]) +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF17:![0-9]+]] +; CHECK: bb0: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label [[BB1]] +; CHECK: entry.split.nonchr: +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: call void @baz(i64 [[TMP7]]) +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]] +; CHECK: bb0.nonchr: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label [[BB1]] +; CHECK: bb1: +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[TEST]]) +; CHECK-NEXT: store ptr [[TEST]], ptr [[I]], align 8 +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP2:%.*]] = phi ptr [ [[TMP3:%.*]], [[BB2]] ], [ null, [[BB1]] ] +; CHECK-NEXT: [[TMP3]] = getelementptr i8, ptr [[TMP2]], i64 24 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq ptr [[TMP2]], [[I]] +; CHECK-NEXT: br i1 [[TMP4]], label [[BB3:%.*]], label [[BB2]] +; CHECK: bb3: +; CHECK-NEXT: ret void +; +entry: + %1 = load i32, ptr %i + %2 = 
icmp eq i32 %1, 0 + %3 = select i1 %2, i64 4, i64 0, !prof !15 + %test = alloca i32, align 8 + call void @baz(i64 %3) + br i1 %2, label %bb1, label %bb0, !prof !15 + +bb0: + call void @foo() + br label %bb1 + +bb1: + call void @llvm.lifetime.start.p0(ptr %test) + store ptr %test, ptr %i, align 8 + br label %bb2 + +bb2: + %4 = phi ptr [ %5, %bb2 ], [ null, %bb1 ] + %5 = getelementptr i8, ptr %4, i64 24 + %6 = icmp eq ptr %4, %i + br i1 %6, label %bb3, label %bb2 + +bb3: + ret void +} + +; Test that we remove lifetime markers that would otherwise refer to phi +; nodes given the dynamic allocas they referred to have been duplicated. +define void @test_chr_dynamic_alloca(ptr %i) !prof !14 { +; CHECK-LABEL: @test_chr_dynamic_alloca( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TEST1:%.*]] = load i32, ptr [[I:%.*]], align 4 +; CHECK-NEXT: [[TEST2:%.*]] = icmp eq i32 [[TEST1]], 5 +; CHECK-NEXT: br i1 [[TEST2]], label [[BB4:%.*]], label [[BB3:%.*]] +; CHECK: bb4: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = freeze i1 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false +; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false +; CHECK-NEXT: br i1 [[TMP5]], label [[BB4_SPLIT:%.*]], label [[BB4_SPLIT_NONCHR:%.*]], !prof [[PROF15]] +; CHECK: bb4.split: +; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TEST:%.*]] = alloca i32, align 8 +; CHECK-NEXT: call void @baz(i64 [[TMP6]]) +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF17]] +; CHECK: bb0: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: store ptr [[TEST]], ptr [[I]], align 8 +; CHECK-NEXT: br label [[BB1]] +; CHECK: bb4.split.nonchr: +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: [[TEST_NONCHR:%.*]] = alloca i32, align 8 +; CHECK-NEXT: call void @baz(i64 [[TMP7]]) +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]] +; CHECK: bb0.nonchr: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: store ptr [[TEST_NONCHR]], ptr [[I]], align 8 +; CHECK-NEXT: br label [[BB1]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP8:%.*]] = phi ptr [ [[TEST]], [[BB0]] ], [ [[TEST]], [[BB4_SPLIT]] ], [ [[TEST_NONCHR]], [[BB0_NONCHR]] ], [ [[TEST_NONCHR]], [[BB4_SPLIT_NONCHR]] ] +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: store ptr [[TMP8]], ptr [[I]], align 8 +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP9:%.*]] = phi ptr [ [[TMP10:%.*]], [[BB2]] ], [ null, [[BB1]] ] +; CHECK-NEXT: [[TMP10]] = getelementptr i8, ptr [[TMP9]], i64 24 +; CHECK-NEXT: [[TEST5:%.*]] = load ptr, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq ptr [[TMP9]], [[TEST5]] +; CHECK-NEXT: br i1 [[TMP11]], label [[BB3]], label [[BB2]] +; CHECK: bb3: +; CHECK-NEXT: ret void +; +entry: + %test1 = load i32, ptr %i + %test2 = icmp eq i32 %test1, 5 + br i1 %test2, label %bb4, label %bb3 + +bb4: + %1 = load i32, ptr %i + %2 = icmp eq i32 %1, 0 + %3 = select i1 %2, i64 4, i64 0, !prof !15 + %test = alloca i32, align 8 + call void @baz(i64 %3) + br i1 %2, label %bb1, label %bb0, !prof !15 + +bb0: + call void @foo() + call void @llvm.lifetime.start.p0(ptr %test) + store ptr %test, ptr %i, align 8 + br label %bb1 + +bb1: + call void @bar() + call void @llvm.lifetime.start.p0(ptr %test) + store ptr %test, ptr %i, align 8 + br label %bb2 + +bb2: + %4 = phi 
ptr [ %5, %bb2 ], [ null, %bb1 ] + %5 = getelementptr i8, ptr %4, i64 24 + %test5 = load ptr, ptr %test + call void @llvm.lifetime.end.p0(ptr %test) + %6 = icmp eq ptr %4, %test5 + br i1 %6, label %bb3, label %bb2 + +bb3: + ret void +} + +; Test that we do not move around allocas that occur in the entry block +; before splitting. If we accidentally sink them, we can move them after +; their users. +define void @test_no_move_allocas(ptr %i) !prof !14 { +; CHECK-LABEL: @test_no_move_allocas( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TEST:%.*]] = alloca i32, align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[TEST]]) +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i32 [[TMP0]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = freeze i1 [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = select i1 true, i1 [[TMP2]], i1 false +; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], i1 [[TMP4]], i1 false +; CHECK-NEXT: br i1 [[TMP5]], label [[ENTRY_SPLIT:%.*]], label [[ENTRY_SPLIT_NONCHR:%.*]], !prof [[PROF15]] +; CHECK: entry.split: +; CHECK-NEXT: [[TMP6:%.*]] = select i1 true, i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: call void @baz(i64 [[TMP6]]) +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB0:%.*]], !prof [[PROF17]] +; CHECK: bb0: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label [[BB1]] +; CHECK: entry.split.nonchr: +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP1]], i64 0, i64 4, !prof [[PROF16]] +; CHECK-NEXT: call void @baz(i64 [[TMP7]]) +; CHECK-NEXT: br i1 [[TMP1]], label [[BB0_NONCHR:%.*]], label [[BB1]], !prof [[PROF16]] +; CHECK: bb0.nonchr: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: br label [[BB1]] +; CHECK: bb1: +; CHECK-NEXT: call void @bar() +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP8:%.*]] = phi ptr [ [[TMP9:%.*]], [[BB2]] ], [ null, [[BB1]] ] +; CHECK-NEXT: [[TMP9]] = getelementptr i8, ptr [[TMP8]], i64 24 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq ptr [[TMP8]], [[I]] +; CHECK-NEXT: br i1 [[TMP10]], label [[BB3:%.*]], label [[BB2]] +; CHECK: bb3: +; CHECK-NEXT: ret void +; +entry: + %test = alloca i32, align 8 + call void @llvm.lifetime.start.p0(ptr %test) + %1 = load i32, ptr %i + %2 = icmp eq i32 %1, 0 + %3 = select i1 %2, i64 4, i64 0, !prof !15 + call void @baz(i64 %3) + br i1 %2, label %bb1, label %bb0, !prof !15 + +bb0: + call void @foo() + br label %bb1 + +bb1: + call void @bar() + br label %bb2 + +bb2: + %4 = phi ptr [ %5, %bb2 ], [ null, %bb1 ] + %5 = getelementptr i8, ptr %4, i64 24 + %6 = icmp eq ptr %4, %i + br i1 %6, label %bb3, label %bb2 + +bb3: + ret void +} + + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} + +!14 = !{!"function_entry_count", i64 100} +!15 = !{!"branch_weights", i32 0, i32 1} +; CHECK: !15 = !{!"branch_weights", i32 1000, i32 0} diff --git a/llvm/test/Transforms/PGOProfile/profcheck-synthetic.ll b/llvm/test/Transforms/PGOProfile/profcheck-synthetic.ll new file mode 100644 index 0000000000000..a3fd6b1f512a9 --- /dev/null +++ 
b/llvm/test/Transforms/PGOProfile/profcheck-synthetic.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 6 +; RUN: opt -passes=prof-inject -profcheck-weights-for-test %s -S -o - | FileCheck %s --check-prefixes=TEST,CHECK +; RUN: opt -passes=prof-inject %s -S -o - | FileCheck %s --check-prefixes=NORMAL,CHECK + +define void @foo(i32 %cond) { +; TEST-LABEL: define void @foo( +; TEST-SAME: i32 [[COND:%.*]]) !prof [[PROF0:![0-9]+]] { +; TEST-NEXT: [[I:%.*]] = icmp eq i32 [[COND]], 0 +; TEST-NEXT: br i1 [[I]], label %[[A:.*]], label %[[B:.*]], !prof [[PROF1:![0-9]+]] +; TEST: [[A]]: +; TEST-NEXT: switch i32 [[COND]], label %[[DEFAULT:.*]] [ +; TEST-NEXT: i32 10, label %[[C:.*]] +; TEST-NEXT: i32 20, label %[[D:.*]] +; TEST-NEXT: ], !prof [[PROF2:![0-9]+]] +; TEST: [[BB1:.*:]] +; TEST-NEXT: br label %[[B]] +; TEST: [[B]]: +; TEST-NEXT: ret void +; TEST: [[DEFAULT]]: +; TEST-NEXT: ret void +; TEST: [[C]]: +; TEST-NEXT: ret void +; TEST: [[D]]: +; TEST-NEXT: ret void +; +; NORMAL-LABEL: define void @foo( +; NORMAL-SAME: i32 [[COND:%.*]]) !prof [[PROF0:![0-9]+]] { +; NORMAL-NEXT: [[I:%.*]] = icmp eq i32 [[COND]], 0 +; NORMAL-NEXT: br i1 [[I]], label %[[A:.*]], label %[[B:.*]], !prof [[PROF1:![0-9]+]] +; NORMAL: [[A]]: +; NORMAL-NEXT: switch i32 [[COND]], label %[[DEFAULT:.*]] [ +; NORMAL-NEXT: i32 10, label %[[C:.*]] +; NORMAL-NEXT: i32 20, label %[[D:.*]] +; NORMAL-NEXT: ], !prof [[PROF2:![0-9]+]] +; NORMAL: [[BB1:.*:]] +; NORMAL-NEXT: br label %[[B]] +; NORMAL: [[B]]: +; NORMAL-NEXT: ret void +; NORMAL: [[DEFAULT]]: +; NORMAL-NEXT: ret void +; NORMAL: [[C]]: +; NORMAL-NEXT: ret void +; NORMAL: [[D]]: +; NORMAL-NEXT: ret void +; + %i = icmp eq i32 %cond, 0 + br i1 %i, label %a, label %b +a: + switch i32 %cond, label %default [ + i32 10, label %c + i32 20, label %d + ] + br label %b +b: + ret void +default: + ret void +c: + ret void +d: + ret void +} +;. +; TEST: [[PROF0]] = !{!"function_entry_count", i64 1000} +; TEST: [[PROF1]] = !{!"branch_weights", i32 3, i32 5} +; TEST: [[PROF2]] = !{!"branch_weights", i32 5, i32 7, i32 11} +;. +; NORMAL: [[PROF0]] = !{!"function_entry_count", i64 1000} +; NORMAL: [[PROF1]] = !{!"branch_weights", i32 3, i32 5} +; NORMAL: [[PROF2]] = !{!"branch_weights", i32 1, i32 1, i32 1} +;. +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll index b056f44a6c469..8d20a3ba8ed08 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll @@ -14,16 +14,9 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef ; CHECK-NEXT: [[SUB:%.*]] = add i32 [[XA]], -1 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[SUB]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[XB]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[SMAX7:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP2]], i64 32000) -; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i64 [[TMP2]], 32000 -; CHECK-NEXT: [[UMIN8:%.*]] = zext i1 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP2]], [[UMIN8]] -; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[SMAX7]], [[TMP4]] -; CHECK-NEXT: [[UMAX9:%.*]] = tail call i64 @llvm.umax.i64(i64 [[TMP1]], i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = udiv i64 [[TMP5]], [[UMAX9]] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], [[UMIN8]] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP0]], i64 31999) +; CHECK-NEXT: [[SMAX10:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[SMAX10]], [[TMP0]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP8]], 23 ; CHECK-NEXT: [[IDENT_CHECK_NOT:%.*]] = icmp eq i32 [[XB]], 1 ; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[MIN_ITERS_CHECK]], [[IDENT_CHECK_NOT]] @@ -50,13 +43,11 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY_PREHEADER13]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP8]], -8 -; CHECK-NEXT: [[TMP18:%.*]] = mul nuw i64 [[N_VEC]], [[TMP1]] -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP18]], [[TMP0]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC]], [[TMP0]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[INDEX]], [[TMP1]] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP19]], [[TMP0]] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[INDEX]], [[TMP0]] ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP20]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4, !alias.scope [[META0:![0-9]+]] @@ -75,7 +66,7 @@ define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP8]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER13]] -; CHECK: for.body.preheader13: +; CHECK: for.body.preheader14: ; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ [[TMP0]], [[VECTOR_MEMCHECK]] ], [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll index 2fe420183c683..92891286d11d1 100644 --- 
a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll @@ -46,6 +46,7 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 124 ; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[BB12_PREHEADER11:%.*]], label [[VECTOR_PH:%.*]] ; AVX2: vector.ph: +; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP3]], 24 ; AVX2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775776 ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: @@ -84,7 +85,6 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; AVX2: vec.epilog.iter.check: ; AVX2-NEXT: [[TMP26:%.*]] = shl i64 [[N_VEC]], 2 ; AVX2-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP26]] -; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP3]], 24 ; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[BB12_PREHEADER1]], label [[BB12_PREHEADER11]] ; AVX2: vec.epilog.ph: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll index f7bc01e0e8af1..bcdf90c6c5c89 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll @@ -30,6 +30,7 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: [[MIN_ITERS_CHECK6:%.*]] = icmp ult i32 [[N]], 16 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK6]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH1:.*]] ; CHECK: [[VECTOR_PH1]]: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 12 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483632 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[A]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer @@ -67,7 +68,6 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 12 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[FOR_BODY_PREHEADER9]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insertelement-with-copyable-args.ll b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-with-copyable-args.ll new file mode 100644 index 0000000000000..67fb9ddf983c9 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/insertelement-with-copyable-args.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-99999 < %s | FileCheck %s + +define i64 @test(i32 %arg) { +; CHECK-LABEL: define i64 @test( +; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: [[FREEZE:%.*]] = freeze i32 0 +; CHECK-NEXT: br i1 false, label %[[BB1:.*]], label %[[BB1]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) null, align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[ARG]], i32 3 +; CHECK-NEXT: [[ADD:%.*]] = add i32 
[[FREEZE]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[FREEZE]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[ARG]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[LOAD]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult <2 x i32> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: [[AND:%.*]] = and i1 [[TMP8]], false +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[TMP10]], [[TMP0]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ult <4 x i32> [[TMP10]], [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i1> [[TMP11]], <4 x i1> [[TMP12]], <4 x i32> +; CHECK-NEXT: br i1 false, label %[[BB11:.*]], label %[[BB12:.*]] +; CHECK: [[BB11]]: +; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[ADD]] to i64 +; CHECK-NEXT: ret i64 0 +; CHECK: [[BB12]]: +; CHECK-NEXT: [[ZEXT13:%.*]] = zext i32 [[ADD]] to i64 +; CHECK-NEXT: ret i64 0 +; +bb: + %freeze = freeze i32 0 + br i1 false, label %bb1, label %bb1 + +bb1: + %load = load i32, ptr addrspace(1) null, align 4 + %0 = insertelement <4 x i32> , i32 %freeze, i32 2 + %1 = insertelement <4 x i32> , i32 %arg, i32 3 + %add = add i32 %freeze, 0 + %2 = insertelement <2 x i32> poison, i32 %freeze, i32 0 + %3 = shufflevector <2 x i32> %2, <2 x i32> poison, <2 x i32> zeroinitializer + %4 = add <2 x i32> %3, zeroinitializer + %5 = insertelement <2 x i32> poison, i32 %arg, i32 0 + %6 = insertelement <2 x i32> %5, i32 %load, i32 1 + %7 = icmp ult <2 x i32> %4, %6 + %8 = extractelement <2 x i1> %7, i32 0 + %and = and i1 %8, false + %9 = insertelement <4 x i32> %0, i32 %add, i32 1 + %10 = icmp eq <4 x i32> %9, %1 + %11 = icmp ult <4 x i32> %9, %1 + %12 = shufflevector <4 x i1> %10, <4 x i1> %11, <4 x i32> + br i1 false, label %bb11, label %bb12 + +bb11: + %zext = zext i32 %add to i64 + ret i64 0 + +bb12: + %zext13 = zext i32 %add to i64 + ret i64 0 +} diff --git a/llvm/test/Transforms/SimplifyCFG/nonintegral.ll b/llvm/test/Transforms/SimplifyCFG/nonintegral.ll index 423ac4d1e69c1..1bdd436f01d02 100644 --- a/llvm/test/Transforms/SimplifyCFG/nonintegral.ll +++ b/llvm/test/Transforms/SimplifyCFG/nonintegral.ll @@ -1,12 +1,143 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes=simplifycfg -S < %s | FileCheck %s -target datalayout = "ni:1" +target datalayout = "pu1:64:64-pe2:64:64:64:32" -define void @test_01(ptr addrspace(1) align 8 %ptr) { -; CHECK-LABEL: @test_01( -; CHECK-NOT: ptrtoint -; CHECK-NEXT: icmp eq ptr addrspace(1) %ptr, null -; CHECK-NOT: ptrtoint +;; TODO: it would probably be better to just emit a pointer compare against null. 
+define void @test_default_null_base(ptr addrspace(0) align 8 %ptr) { +; CHECK-LABEL: define void @test_default_null_base( +; CHECK-SAME: ptr align 8 [[PTR:%.*]]) { +; CHECK-NEXT: [[MAGICPTR:%.*]] = ptrtoint ptr [[PTR]] to i64 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[MAGICPTR]], 0 +; CHECK-NEXT: br i1 [[COND]], label %[[TRUE2:.*]], label %[[FALSE1:.*]] +; CHECK: [[FALSE1]]: +; CHECK-NEXT: store i64 1, ptr [[PTR]], align 8 +; CHECK-NEXT: store i64 3, ptr [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET:.*]] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[TRUE2]]: +; CHECK-NEXT: store i64 2, ptr [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %cond1 = icmp eq ptr addrspace(0) %ptr, null + %cond2 = icmp eq ptr addrspace(0) %ptr, null + br i1 %cond1, label %true1, label %false1 + +true1: + br i1 %cond2, label %true2, label %false2 + +false1: + store i64 1, ptr addrspace(0) %ptr, align 8 + br label %true1 + +true2: + store i64 2, ptr addrspace(0) %ptr, align 8 + ret void + +false2: + store i64 3, ptr addrspace(0) %ptr, align 8 + ret void +} + +;; We should not introduce ptrtoint instructions with unstable pointers +define void @test_default_inttoptr_base(ptr addrspace(0) align 8 %ptr) { +; CHECK-LABEL: define void @test_default_inttoptr_base( +; CHECK-SAME: ptr align 8 [[PTR:%.*]]) { +; CHECK-NEXT: [[MAGICPTR:%.*]] = ptrtoint ptr [[PTR]] to i64 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[MAGICPTR]], 4 +; CHECK-NEXT: br i1 [[COND]], label %[[TRUE2:.*]], label %[[FALSE1:.*]] +; CHECK: [[FALSE1]]: +; CHECK-NEXT: store i64 1, ptr [[PTR]], align 8 +; CHECK-NEXT: store i64 3, ptr [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET:.*]] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[TRUE2]]: +; CHECK-NEXT: store i64 2, ptr [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %cond1 = icmp eq ptr addrspace(0) %ptr, inttoptr (i32 4 to ptr addrspace(0)) + %cond2 = icmp eq ptr addrspace(0) %ptr, inttoptr (i32 4 to ptr addrspace(0)) + br i1 %cond1, label %true1, label %false1 + +true1: + br i1 %cond2, label %true2, label %false2 + +false1: + store i64 1, ptr addrspace(0) %ptr, align 8 + br label %true1 + +true2: + store i64 2, ptr addrspace(0) %ptr, align 8 + ret void + +false2: + store i64 3, ptr addrspace(0) %ptr, align 8 + ret void +} + +;; We should not introduce ptrtoint instructions with unstable pointers +define void @test_default_mixed_base(ptr addrspace(0) align 8 %ptr) { +; CHECK-LABEL: define void @test_default_mixed_base( +; CHECK-SAME: ptr align 8 [[PTR:%.*]]) { +; CHECK-NEXT: [[COND2:%.*]] = icmp eq ptr [[PTR]], null +; CHECK-NEXT: [[MAGICPTR:%.*]] = ptrtoint ptr [[PTR]] to i64 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[MAGICPTR]], 4 +; CHECK-NEXT: br i1 [[COND]], label %[[FALSE2:.*]], label %[[FALSE1:.*]] +; CHECK: [[FALSE1]]: +; CHECK-NEXT: store i64 1, ptr [[PTR]], align 8 +; CHECK-NEXT: br i1 [[COND2]], label %[[TRUE2:.*]], label %[[FALSE2]] +; CHECK: [[COMMON_RET:.*]]: +; CHECK-NEXT: ret void +; CHECK: [[TRUE2]]: +; CHECK-NEXT: store i64 2, ptr [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[FALSE2]]: +; CHECK-NEXT: store i64 3, ptr [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %cond1 = icmp eq ptr addrspace(0) %ptr, inttoptr (i32 4 to ptr addrspace(0)) + %cond2 = icmp eq ptr addrspace(0) %ptr, null + br i1 %cond1, label %true1, label %false1 + +true1: + br i1 %cond2, label %true2, label %false2 + +false1: + store i64 1, ptr addrspace(0) %ptr, align 8 + br label %true1 + +true2: + store 
i64 2, ptr addrspace(0) %ptr, align 8 + ret void + +false2: + store i64 3, ptr addrspace(0) %ptr, align 8 + ret void +} + +;; We should not introduce ptrtoint instructions with unstable pointers +define void @test_unstable_null_base(ptr addrspace(1) align 8 %ptr) { +; CHECK-LABEL: define void @test_unstable_null_base( +; CHECK-SAME: ptr addrspace(1) align 8 [[PTR:%.*]]) { +; CHECK-NEXT: [[COND1:%.*]] = icmp eq ptr addrspace(1) [[PTR]], null +; CHECK-NEXT: [[COND2:%.*]] = icmp eq ptr addrspace(1) [[PTR]], null +; CHECK-NEXT: br i1 [[COND1]], label %[[TRUE1:.*]], label %[[FALSE1:.*]] +; CHECK: [[TRUE1]]: +; CHECK-NEXT: br i1 [[COND2]], label %[[TRUE2:.*]], label %[[FALSE2:.*]] +; CHECK: [[FALSE1]]: +; CHECK-NEXT: store i64 1, ptr addrspace(1) [[PTR]], align 8 +; CHECK-NEXT: br label %[[TRUE1]] +; CHECK: [[COMMON_RET:.*]]: +; CHECK-NEXT: ret void +; CHECK: [[TRUE2]]: +; CHECK-NEXT: store i64 2, ptr addrspace(1) [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[FALSE2]]: +; CHECK-NEXT: store i64 3, ptr addrspace(1) [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET]] +; %cond1 = icmp eq ptr addrspace(1) %ptr, null %cond2 = icmp eq ptr addrspace(1) %ptr, null br i1 %cond1, label %true1, label %false1 @@ -26,3 +157,200 @@ false2: store i64 3, ptr addrspace(1) %ptr, align 8 ret void } + +;; We should not introduce ptrtoint instructions with unstable pointers +define void @test_unstable_inttoptr_base(ptr addrspace(1) align 8 %ptr) { +; CHECK-LABEL: define void @test_unstable_inttoptr_base( +; CHECK-SAME: ptr addrspace(1) align 8 [[PTR:%.*]]) { +; CHECK-NEXT: [[COND1:%.*]] = icmp eq ptr addrspace(1) [[PTR]], inttoptr (i32 4 to ptr addrspace(1)) +; CHECK-NEXT: [[COND2:%.*]] = icmp eq ptr addrspace(1) [[PTR]], inttoptr (i32 4 to ptr addrspace(1)) +; CHECK-NEXT: br i1 [[COND1]], label %[[TRUE1:.*]], label %[[FALSE1:.*]] +; CHECK: [[TRUE1]]: +; CHECK-NEXT: br i1 [[COND2]], label %[[TRUE2:.*]], label %[[FALSE2:.*]] +; CHECK: [[FALSE1]]: +; CHECK-NEXT: store i64 1, ptr addrspace(1) [[PTR]], align 8 +; CHECK-NEXT: br label %[[TRUE1]] +; CHECK: [[COMMON_RET:.*]]: +; CHECK-NEXT: ret void +; CHECK: [[TRUE2]]: +; CHECK-NEXT: store i64 2, ptr addrspace(1) [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[FALSE2]]: +; CHECK-NEXT: store i64 3, ptr addrspace(1) [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %cond1 = icmp eq ptr addrspace(1) %ptr, inttoptr (i32 4 to ptr addrspace(1)) + %cond2 = icmp eq ptr addrspace(1) %ptr, inttoptr (i32 4 to ptr addrspace(1)) + br i1 %cond1, label %true1, label %false1 + +true1: + br i1 %cond2, label %true2, label %false2 + +false1: + store i64 1, ptr addrspace(1) %ptr, align 8 + br label %true1 + +true2: + store i64 2, ptr addrspace(1) %ptr, align 8 + ret void + +false2: + store i64 3, ptr addrspace(1) %ptr, align 8 + ret void +} + +;; We should not introduce ptrtoint instructions with unstable pointers +define void @test_unstable_mixed_base(ptr addrspace(1) align 8 %ptr) { +; CHECK-LABEL: define void @test_unstable_mixed_base( +; CHECK-SAME: ptr addrspace(1) align 8 [[PTR:%.*]]) { +; CHECK-NEXT: [[COND1:%.*]] = icmp eq ptr addrspace(1) [[PTR]], inttoptr (i32 4 to ptr addrspace(1)) +; CHECK-NEXT: [[COND2:%.*]] = icmp eq ptr addrspace(1) [[PTR]], null +; CHECK-NEXT: br i1 [[COND1]], label %[[TRUE1:.*]], label %[[FALSE1:.*]] +; CHECK: [[TRUE1]]: +; CHECK-NEXT: br i1 [[COND2]], label %[[TRUE2:.*]], label %[[FALSE2:.*]] +; CHECK: [[FALSE1]]: +; CHECK-NEXT: store i64 1, ptr addrspace(1) [[PTR]], align 8 +; CHECK-NEXT: br label 
%[[TRUE1]] +; CHECK: [[COMMON_RET:.*]]: +; CHECK-NEXT: ret void +; CHECK: [[TRUE2]]: +; CHECK-NEXT: store i64 2, ptr addrspace(1) [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[FALSE2]]: +; CHECK-NEXT: store i64 3, ptr addrspace(1) [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %cond1 = icmp eq ptr addrspace(1) %ptr, inttoptr (i32 4 to ptr addrspace(1)) + %cond2 = icmp eq ptr addrspace(1) %ptr, null + br i1 %cond1, label %true1, label %false1 + +true1: + br i1 %cond2, label %true2, label %false2 + +false1: + store i64 1, ptr addrspace(1) %ptr, align 8 + br label %true1 + +true2: + store i64 2, ptr addrspace(1) %ptr, align 8 + ret void + +false2: + store i64 3, ptr addrspace(1) %ptr, align 8 + ret void +} + +;; This transformation is fine for pointers with external state. +;; TODO: it would probably be better to just emit a pointer compare against null. +define void @test_external_null_base(ptr addrspace(2) align 8 %ptr) { +; CHECK-LABEL: define void @test_external_null_base( +; CHECK-SAME: ptr addrspace(2) align 8 [[PTR:%.*]]) { +; CHECK-NEXT: [[MAGICPTR:%.*]] = ptrtoint ptr addrspace(2) [[PTR]] to i64 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[MAGICPTR]], 0 +; CHECK-NEXT: br i1 [[COND]], label %[[TRUE2:.*]], label %[[FALSE1:.*]] +; CHECK: [[FALSE1]]: +; CHECK-NEXT: store i64 1, ptr addrspace(2) [[PTR]], align 8 +; CHECK-NEXT: store i64 3, ptr addrspace(2) [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET:.*]] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[TRUE2]]: +; CHECK-NEXT: store i64 2, ptr addrspace(2) [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %cond1 = icmp eq ptr addrspace(2) %ptr, null + %cond2 = icmp eq ptr addrspace(2) %ptr, null + br i1 %cond1, label %true1, label %false1 + +true1: + br i1 %cond2, label %true2, label %false2 + +false1: + store i64 1, ptr addrspace(2) %ptr, align 8 + br label %true1 + +true2: + store i64 2, ptr addrspace(2) %ptr, align 8 + ret void + +false2: + store i64 3, ptr addrspace(2) %ptr, align 8 + ret void +} + +;; This transformation is fine for pointers with external state (even with inttoptr). +define void @test_external_inttoptr_base(ptr addrspace(2) align 8 %ptr) { +; CHECK-LABEL: define void @test_external_inttoptr_base( +; CHECK-SAME: ptr addrspace(2) align 8 [[PTR:%.*]]) { +; CHECK-NEXT: [[MAGICPTR:%.*]] = ptrtoint ptr addrspace(2) [[PTR]] to i64 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[MAGICPTR]], 4 +; CHECK-NEXT: br i1 [[COND]], label %[[TRUE2:.*]], label %[[FALSE1:.*]] +; CHECK: [[FALSE1]]: +; CHECK-NEXT: store i64 1, ptr addrspace(2) [[PTR]], align 8 +; CHECK-NEXT: store i64 3, ptr addrspace(2) [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET:.*]] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[TRUE2]]: +; CHECK-NEXT: store i64 2, ptr addrspace(2) [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %cond1 = icmp eq ptr addrspace(2) %ptr, inttoptr (i32 4 to ptr addrspace(2)) + %cond2 = icmp eq ptr addrspace(2) %ptr, inttoptr (i32 4 to ptr addrspace(2)) + br i1 %cond1, label %true1, label %false1 + +true1: + br i1 %cond2, label %true2, label %false2 + +false1: + store i64 1, ptr addrspace(2) %ptr, align 8 + br label %true1 + +true2: + store i64 2, ptr addrspace(2) %ptr, align 8 + ret void + +false2: + store i64 3, ptr addrspace(2) %ptr, align 8 + ret void +} + +;; This transformation is fine for pointers with external state (even with inttoptr). 
+define void @test_external_mixed_base(ptr addrspace(2) align 8 %ptr) { +; CHECK-LABEL: define void @test_external_mixed_base( +; CHECK-SAME: ptr addrspace(2) align 8 [[PTR:%.*]]) { +; CHECK-NEXT: [[COND2:%.*]] = icmp eq ptr addrspace(2) [[PTR]], null +; CHECK-NEXT: [[MAGICPTR:%.*]] = ptrtoint ptr addrspace(2) [[PTR]] to i64 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[MAGICPTR]], 4 +; CHECK-NEXT: br i1 [[COND]], label %[[FALSE2:.*]], label %[[FALSE1:.*]] +; CHECK: [[FALSE1]]: +; CHECK-NEXT: store i64 1, ptr addrspace(2) [[PTR]], align 8 +; CHECK-NEXT: br i1 [[COND2]], label %[[TRUE2:.*]], label %[[FALSE2]] +; CHECK: [[COMMON_RET:.*]]: +; CHECK-NEXT: ret void +; CHECK: [[TRUE2]]: +; CHECK-NEXT: store i64 2, ptr addrspace(2) [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[FALSE2]]: +; CHECK-NEXT: store i64 3, ptr addrspace(2) [[PTR]], align 8 +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %cond1 = icmp eq ptr addrspace(2) %ptr, inttoptr (i32 4 to ptr addrspace(2)) + %cond2 = icmp eq ptr addrspace(2) %ptr, null + br i1 %cond1, label %true1, label %false1 + +true1: + br i1 %cond2, label %true2, label %false2 + +false1: + store i64 1, ptr addrspace(2) %ptr, align 8 + br label %true1 + +true2: + store i64 2, ptr addrspace(2) %ptr, align 8 + ret void + +false2: + store i64 3, ptr addrspace(2) %ptr, align 8 + ret void +} diff --git a/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll b/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll index fe2e897125eb8..9d78b97c204a8 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-to-select-two-case.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt < %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s ; int foo1_with_default(int a) { @@ -11,20 +11,20 @@ ; return 4; ; } -define i32 @foo1_with_default(i32 %a) { +define i32 @foo1_with_default(i32 %a) !prof !0 { ; CHECK-LABEL: @foo1_with_default( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SWITCH_SELECTCMP:%.*]] = icmp eq i32 [[A:%.*]], 20 -; CHECK-NEXT: [[SWITCH_SELECT:%.*]] = select i1 [[SWITCH_SELECTCMP]], i32 2, i32 4 +; CHECK-NEXT: [[SWITCH_SELECT:%.*]] = select i1 [[SWITCH_SELECTCMP]], i32 2, i32 4, !prof [[PROF1:![0-9]+]] ; CHECK-NEXT: [[SWITCH_SELECTCMP1:%.*]] = icmp eq i32 [[A]], 10 -; CHECK-NEXT: [[SWITCH_SELECT2:%.*]] = select i1 [[SWITCH_SELECTCMP1]], i32 10, i32 [[SWITCH_SELECT]] +; CHECK-NEXT: [[SWITCH_SELECT2:%.*]] = select i1 [[SWITCH_SELECTCMP1]], i32 10, i32 [[SWITCH_SELECT]], !prof [[PROF2:![0-9]+]] ; CHECK-NEXT: ret i32 [[SWITCH_SELECT2]] ; entry: switch i32 %a, label %sw.epilog [ i32 10, label %sw.bb i32 20, label %sw.bb1 - ] + ], !prof !1 sw.bb: br label %return @@ -41,20 +41,20 @@ return: } ; Same as above, but both cases have the same value. 
-define i32 @same_value(i32 %a) { +define i32 @same_value(i32 %a) !prof !0 { ; CHECK-LABEL: @same_value( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SWITCH_SELECTCMP_CASE1:%.*]] = icmp eq i32 [[A:%.*]], 10 ; CHECK-NEXT: [[SWITCH_SELECTCMP_CASE2:%.*]] = icmp eq i32 [[A]], 20 ; CHECK-NEXT: [[SWITCH_SELECTCMP:%.*]] = or i1 [[SWITCH_SELECTCMP_CASE1]], [[SWITCH_SELECTCMP_CASE2]] -; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[SWITCH_SELECTCMP]], i32 10, i32 4 +; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[SWITCH_SELECTCMP]], i32 10, i32 4, !prof [[PROF3:![0-9]+]] ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: switch i32 %a, label %sw.epilog [ i32 10, label %sw.bb i32 20, label %sw.bb - ] + ], !prof !1 sw.bb: br label %return @@ -67,17 +67,17 @@ return: ret i32 %retval.0 } -define i1 @switch_to_select_same2_case_results_different_default(i8 %0) { +define i1 @switch_to_select_same2_case_results_different_default(i8 %0) !prof !0 { ; CHECK-LABEL: @switch_to_select_same2_case_results_different_default( ; CHECK-NEXT: [[SWITCH_AND:%.*]] = and i8 [[TMP0:%.*]], -5 ; CHECK-NEXT: [[SWITCH_SELECTCMP:%.*]] = icmp eq i8 [[SWITCH_AND]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false, !prof [[PROF3]] ; CHECK-NEXT: ret i1 [[TMP2]] ; switch i8 %0, label %2 [ i8 4, label %3 i8 0, label %3 - ] + ], !prof !1 2: br label %3 @@ -87,18 +87,18 @@ define i1 @switch_to_select_same2_case_results_different_default(i8 %0) { ret i1 %4 } -define i1 @switch_to_select_same2_case_results_different_default_and_positive_offset_for_case(i8 %0) { +define i1 @switch_to_select_same2_case_results_different_default_and_positive_offset_for_case(i8 %0) !prof !0 { ; CHECK-LABEL: @switch_to_select_same2_case_results_different_default_and_positive_offset_for_case( ; CHECK-NEXT: [[TMP2:%.*]] = sub i8 [[TMP0:%.*]], 43 ; CHECK-NEXT: [[SWITCH_AND:%.*]] = and i8 [[TMP2]], -3 ; CHECK-NEXT: [[SWITCH_SELECTCMP:%.*]] = icmp eq i8 [[SWITCH_AND]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false, !prof [[PROF3]] ; CHECK-NEXT: ret i1 [[TMP3]] ; switch i8 %0, label %2 [ i8 43, label %3 i8 45, label %3 - ] + ], !prof !1 2: br label %3 @@ -108,20 +108,20 @@ define i1 @switch_to_select_same2_case_results_different_default_and_positive_of ret i1 %4 } -define i8 @switch_to_select_same2_case_results_different_default_and_negative_offset_for_case(i32 %i) { +define i8 @switch_to_select_same2_case_results_different_default_and_negative_offset_for_case(i32 %i) !prof !0 { ; CHECK-LABEL: @switch_to_select_same2_case_results_different_default_and_negative_offset_for_case( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = sub i32 [[I:%.*]], -5 ; CHECK-NEXT: [[SWITCH_AND:%.*]] = and i32 [[TMP0]], -3 ; CHECK-NEXT: [[SWITCH_SELECTCMP:%.*]] = icmp eq i32 [[SWITCH_AND]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[SWITCH_SELECTCMP]], i8 3, i8 42 +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[SWITCH_SELECTCMP]], i8 3, i8 42, !prof [[PROF3]] ; CHECK-NEXT: ret i8 [[TMP1]] ; entry: switch i32 %i, label %default [ i32 -3, label %end i32 -5, label %end - ] + ], !prof !1 default: br label %end @@ -131,12 +131,12 @@ end: ret i8 %t0 } -define i1 @switch_to_select_same4_case_results_different_default(i32 %i) { +define i1 @switch_to_select_same4_case_results_different_default(i32 %i) !prof !0 { ; CHECK-LABEL: @switch_to_select_same4_case_results_different_default( ; CHECK-NEXT: entry: ; 
CHECK-NEXT: [[SWITCH_AND:%.*]] = and i32 [[I:%.*]], -7 ; CHECK-NEXT: [[SWITCH_SELECTCMP:%.*]] = icmp eq i32 [[SWITCH_AND]], 0 -; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false +; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false, !prof [[PROF4:![0-9]+]] ; CHECK-NEXT: ret i1 [[TMP0]] ; entry: @@ -145,7 +145,7 @@ entry: i32 2, label %lor.end i32 4, label %lor.end i32 6, label %lor.end - ] + ], !prof !2 lor.rhs: br label %lor.end @@ -155,12 +155,12 @@ lor.end: ret i1 %0 } -define i1 @switch_to_select_same4_case_results_different_default_alt_bitmask(i32 %i) { +define i1 @switch_to_select_same4_case_results_different_default_alt_bitmask(i32 %i) !prof !0 { ; CHECK-LABEL: @switch_to_select_same4_case_results_different_default_alt_bitmask( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SWITCH_AND:%.*]] = and i32 [[I:%.*]], -11 ; CHECK-NEXT: [[SWITCH_SELECTCMP:%.*]] = icmp eq i32 [[SWITCH_AND]], 0 -; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false +; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false, !prof [[PROF4]] ; CHECK-NEXT: ret i1 [[TMP0]] ; entry: @@ -169,7 +169,7 @@ entry: i32 2, label %lor.end i32 8, label %lor.end i32 10, label %lor.end - ] + ], !prof !2 lor.rhs: br label %lor.end @@ -179,13 +179,13 @@ lor.end: ret i1 %0 } -define i1 @switch_to_select_same4_case_results_different_default_positive_offset(i32 %i) { +define i1 @switch_to_select_same4_case_results_different_default_positive_offset(i32 %i) !prof !0 { ; CHECK-LABEL: @switch_to_select_same4_case_results_different_default_positive_offset( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = sub i32 [[I:%.*]], 2 ; CHECK-NEXT: [[SWITCH_AND:%.*]] = and i32 [[TMP0]], -11 ; CHECK-NEXT: [[SWITCH_SELECTCMP:%.*]] = icmp eq i32 [[SWITCH_AND]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false, !prof [[PROF4]] ; CHECK-NEXT: ret i1 [[TMP1]] ; entry: @@ -194,7 +194,7 @@ entry: i32 4, label %lor.end i32 10, label %lor.end i32 12, label %lor.end - ] + ], !prof !2 lor.rhs: br label %lor.end @@ -204,7 +204,7 @@ lor.end: ret i1 %0 } -define i1 @switch_to_select_invalid_mask(i32 %i) { +define i1 @switch_to_select_invalid_mask(i32 %i) !prof !0 { ; CHECK-LABEL: @switch_to_select_invalid_mask( ; CHECK-NEXT: entry: ; CHECK-NEXT: switch i32 [[I:%.*]], label [[LOR_RHS:%.*]] [ @@ -212,7 +212,7 @@ define i1 @switch_to_select_invalid_mask(i32 %i) { ; CHECK-NEXT: i32 4, label [[LOR_END]] ; CHECK-NEXT: i32 10, label [[LOR_END]] ; CHECK-NEXT: i32 12, label [[LOR_END]] -; CHECK-NEXT: ] +; CHECK-NEXT: ], !prof [[PROF5:![0-9]+]] ; CHECK: lor.rhs: ; CHECK-NEXT: br label [[LOR_END]] ; CHECK: lor.end: @@ -225,7 +225,7 @@ entry: i32 4, label %lor.end i32 10, label %lor.end i32 12, label %lor.end - ] + ], !prof !2 lor.rhs: br label %lor.end @@ -235,14 +235,14 @@ lor.end: ret i1 %0 } -define i1 @switch_to_select_nonpow2_cases(i32 %i) { +define i1 @switch_to_select_nonpow2_cases(i32 %i) !prof !0 { ; CHECK-LABEL: @switch_to_select_nonpow2_cases( ; CHECK-NEXT: entry: ; CHECK-NEXT: switch i32 [[I:%.*]], label [[LOR_RHS:%.*]] [ ; CHECK-NEXT: i32 0, label [[LOR_END:%.*]] ; CHECK-NEXT: i32 2, label [[LOR_END]] ; CHECK-NEXT: i32 4, label [[LOR_END]] -; CHECK-NEXT: ] +; CHECK-NEXT: ], !prof [[PROF6:![0-9]+]] ; CHECK: lor.rhs: ; CHECK-NEXT: br label [[LOR_END]] ; CHECK: lor.end: @@ -254,7 +254,7 @@ entry: i32 0, label %lor.end i32 2, label %lor.end i32 4, label %lor.end 
- ] + ], !prof !3 lor.rhs: br label %lor.end @@ -265,7 +265,7 @@ lor.end: } ; TODO: we can produce the optimal code when there is no default also -define i8 @switch_to_select_two_case_results_no_default(i32 %i) { +define i8 @switch_to_select_two_case_results_no_default(i32 %i) !prof !0 { ; CHECK-LABEL: @switch_to_select_two_case_results_no_default( ; CHECK-NEXT: entry: ; CHECK-NEXT: switch i32 [[I:%.*]], label [[DEFAULT:%.*]] [ @@ -273,7 +273,7 @@ define i8 @switch_to_select_two_case_results_no_default(i32 %i) { ; CHECK-NEXT: i32 2, label [[END]] ; CHECK-NEXT: i32 4, label [[CASE3:%.*]] ; CHECK-NEXT: i32 6, label [[CASE3]] -; CHECK-NEXT: ] +; CHECK-NEXT: ], !prof [[PROF5]] ; CHECK: case3: ; CHECK-NEXT: br label [[END]] ; CHECK: default: @@ -288,7 +288,7 @@ entry: i32 2, label %case2 i32 4, label %case3 i32 6, label %case4 - ] + ], !prof !2 case1: br label %end @@ -310,12 +310,12 @@ end: ret i8 %t0 } -define i1 @no_range(i8 %f) { +define i1 @no_range(i8 %f) !prof !0 { ; CHECK-LABEL: @no_range( ; CHECK-NEXT: bb3: ; CHECK-NEXT: [[TMP0:%.*]] = and i8 [[F:%.*]], 60 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 60 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false, !prof [[PROF7:![0-9]+]] ; CHECK-NEXT: ret i1 [[TMP2]] ; switch i8 %f, label %bb1 [ @@ -335,7 +335,7 @@ define i1 @no_range(i8 %f) { i8 253, label %bb2 i8 254, label %bb2 i8 255, label %bb2 - ] + ], !prof !4 bb1: br label %bb3 bb2: @@ -345,7 +345,7 @@ bb3: ret i1 %phi } -define i1 @negative_no_range(i8 %f) { +define i1 @negative_no_range(i8 %f) !prof !0 { ; CHECK-LABEL: @negative_no_range( ; CHECK-NEXT: switch i8 [[F:%.*]], label [[BB3:%.*]] [ ; CHECK-NEXT: i8 52, label [[BB2:%.*]] @@ -364,12 +364,12 @@ define i1 @negative_no_range(i8 %f) { ; CHECK-NEXT: i8 -3, label [[BB2]] ; CHECK-NEXT: i8 -2, label [[BB2]] ; CHECK-NEXT: i8 -1, label [[BB2]] -; CHECK-NEXT: ] +; CHECK-NEXT: ], !prof [[PROF8:![0-9]+]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[_0_SROA_0_0:%.*]] = phi i1 [ true, [[BB2]] ], [ false, [[TMP0:%.*]] ] -; CHECK-NEXT: ret i1 [[_0_SROA_0_0]] +; CHECK-NEXT: [[PHI:%.*]] = phi i1 [ true, [[BB2]] ], [ false, [[TMP0:%.*]] ] +; CHECK-NEXT: ret i1 [[PHI]] ; switch i8 %f, label %bb1 [ i8 52, label %bb2 @@ -388,7 +388,7 @@ define i1 @negative_no_range(i8 %f) { i8 253, label %bb2 i8 254, label %bb2 i8 255, label %bb2 - ] + ], !prof !4 bb1: br label %bb3 bb2: @@ -400,18 +400,19 @@ bb3: ; Using ranges. 
-define i1 @range0to4odd(i8 range(i8 0, 4) %f) { +define i1 @range0to4odd(i8 range(i8 0, 4) %f) !prof !0 { ; CHECK-LABEL: @range0to4odd( ; CHECK-NEXT: bb3: ; CHECK-NEXT: [[TMP0:%.*]] = and i8 [[F:%.*]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false, !prof [[PROF3]] ; CHECK-NEXT: ret i1 [[TMP2]] ; switch i8 %f, label %bb1 [ i8 1, label %bb2 i8 3, label %bb2 - ] + ], !prof !1 + bb1: br label %bb3 bb2: @@ -421,18 +422,18 @@ bb3: ret i1 %phi } -define i1 @range1to4odd(i8 range(i8 1, 4) %f) { +define i1 @range1to4odd(i8 range(i8 1, 4) %f) !prof !0 { ; CHECK-LABEL: @range1to4odd( ; CHECK-NEXT: bb3: ; CHECK-NEXT: [[TMP0:%.*]] = and i8 [[F:%.*]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false, !prof [[PROF3]] ; CHECK-NEXT: ret i1 [[TMP2]] ; switch i8 %f, label %bb1 [ i8 1, label %bb2 i8 3, label %bb2 - ] + ], !prof !1 bb1: br label %bb3 bb2: @@ -442,12 +443,12 @@ bb3: ret i1 %phi } -define i1 @range0to8odd(i8 range(i8 0, 8) %f) { +define i1 @range0to8odd(i8 range(i8 0, 8) %f) !prof !0 { ; CHECK-LABEL: @range0to8odd( ; CHECK-NEXT: bb3: ; CHECK-NEXT: [[TMP0:%.*]] = and i8 [[F:%.*]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false, !prof [[PROF4]] ; CHECK-NEXT: ret i1 [[TMP2]] ; switch i8 %f, label %bb1 [ @@ -455,7 +456,7 @@ define i1 @range0to8odd(i8 range(i8 0, 8) %f) { i8 3, label %bb2 i8 5, label %bb2 i8 7, label %bb2 - ] + ], !prof !2 bb1: br label %bb3 bb2: @@ -465,12 +466,12 @@ bb3: ret i1 %phi } -define i1 @range0to8most_significant_bit(i8 range(i8 0, 8) %f) { +define i1 @range0to8most_significant_bit(i8 range(i8 0, 8) %f) !prof !0 { ; CHECK-LABEL: @range0to8most_significant_bit( ; CHECK-NEXT: bb3: ; CHECK-NEXT: [[TMP0:%.*]] = and i8 [[F:%.*]], 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false, !prof [[PROF4]] ; CHECK-NEXT: ret i1 [[TMP2]] ; switch i8 %f, label %bb1 [ @@ -478,7 +479,7 @@ define i1 @range0to8most_significant_bit(i8 range(i8 0, 8) %f) { i8 5, label %bb2 i8 6, label %bb2 i8 7, label %bb2 - ] + ], !prof !2 bb1: br label %bb3 bb2: @@ -488,12 +489,12 @@ bb3: ret i1 %phi } -define i1 @range0to15_middle_two_bits(i8 range(i8 0, 16) %f) { +define i1 @range0to15_middle_two_bits(i8 range(i8 0, 16) %f) !prof !0 { ; CHECK-LABEL: @range0to15_middle_two_bits( ; CHECK-NEXT: bb3: ; CHECK-NEXT: [[TMP0:%.*]] = and i8 [[F:%.*]], 6 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 6 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false, !prof [[PROF4]] ; CHECK-NEXT: ret i1 [[TMP2]] ; switch i8 %f, label %bb1 [ @@ -501,7 +502,8 @@ define i1 @range0to15_middle_two_bits(i8 range(i8 0, 16) %f) { i8 7, label %bb2 i8 14, label %bb2 i8 15, label %bb2 - ] + ], !prof !2 + bb1: br label %bb3 bb2: @@ -511,24 +513,25 @@ bb3: ret i1 %phi } -define i1 @negative_range0to15(i8 range(i8 0, 16) %f) { +define i1 @negative_range0to15(i8 range(i8 0, 16) %f) !prof !0 { ; CHECK-LABEL: @negative_range0to15( ; CHECK-NEXT: switch i8 [[F:%.*]], label [[BB3:%.*]] [ ; CHECK-NEXT: i8 6, 
label [[BB2:%.*]] ; CHECK-NEXT: i8 7, label [[BB2]] ; CHECK-NEXT: i8 14, label [[BB2]] -; CHECK-NEXT: ] +; CHECK-NEXT: ], !prof [[PROF6]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[_0_SROA_0_0:%.*]] = phi i1 [ true, [[BB2]] ], [ false, [[TMP0:%.*]] ] -; CHECK-NEXT: ret i1 [[_0_SROA_0_0]] +; CHECK-NEXT: [[PHI:%.*]] = phi i1 [ true, [[BB2]] ], [ false, [[TMP0:%.*]] ] +; CHECK-NEXT: ret i1 [[PHI]] ; switch i8 %f, label %bb1 [ i8 6, label %bb2 i8 7, label %bb2 i8 14, label %bb2 - ] + ], !prof !3 + bb1: br label %bb3 bb2: @@ -538,19 +541,19 @@ bb3: ret i1 %phi } -define i1 @negative_range0to15_pow_2(i8 range(i8 0, 16) %f) { +define i1 @negative_range0to15_pow_2(i8 range(i8 0, 16) %f) !prof !0 { ; CHECK-LABEL: @negative_range0to15_pow_2( ; CHECK-NEXT: bb3: ; CHECK-NEXT: [[TMP0:%.*]] = sub i8 [[F:%.*]], 6 ; CHECK-NEXT: [[SWITCH_AND:%.*]] = and i8 [[TMP0]], -2 ; CHECK-NEXT: [[SWITCH_SELECTCMP:%.*]] = icmp eq i8 [[SWITCH_AND]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false, !prof [[PROF3]] ; CHECK-NEXT: ret i1 [[TMP1]] ; switch i8 %f, label %bb1 [ i8 6, label %bb2 i8 7, label %bb2 - ] + ], !prof !1 bb1: br label %bb3 bb2: @@ -560,19 +563,19 @@ bb3: ret i1 %phi } -define i1 @negative_range0to5even(i8 range(i8 0, 5) %f) { +define i1 @negative_range0to5even(i8 range(i8 0, 5) %f) !prof !0 { ; CHECK-LABEL: @negative_range0to5even( ; CHECK-NEXT: bb3: ; CHECK-NEXT: [[TMP0:%.*]] = sub i8 [[F:%.*]], 2 ; CHECK-NEXT: [[SWITCH_AND:%.*]] = and i8 [[TMP0]], -3 ; CHECK-NEXT: [[SWITCH_SELECTCMP:%.*]] = icmp eq i8 [[SWITCH_AND]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false, !prof [[PROF3]] ; CHECK-NEXT: ret i1 [[TMP1]] ; switch i8 %f, label %bb1 [ i8 2, label %bb2 i8 4, label %bb2 - ] + ], !prof !1 bb1: br label %bb3 bb2: @@ -582,16 +585,17 @@ bb3: ret i1 %phi } -define i1 @range0to15_corner_case(i8 range(i8 0, 16) %f) { +define i1 @range0to15_corner_case(i8 range(i8 0, 16) %f) !prof !0 { ; CHECK-LABEL: @range0to15_corner_case( ; CHECK-NEXT: bb3: ; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[F:%.*]], 15 -; CHECK-NEXT: [[DOT:%.*]] = select i1 [[COND]], i1 true, i1 false +; CHECK-NEXT: [[DOT:%.*]] = select i1 [[COND]], i1 true, i1 false, !prof [[PROF9:![0-9]+]] ; CHECK-NEXT: ret i1 [[DOT]] ; switch i8 %f, label %bb1 [ i8 15, label %bb2 - ] + ], !prof !5 + bb1: br label %bb3 bb2: @@ -601,19 +605,19 @@ bb3: ret i1 %phi } -define i1 @negative_range0to15_corner_case(i8 range(i8 0, 16) %f) { +define i1 @negative_range0to15_corner_case(i8 range(i8 0, 16) %f) !prof !0 { ; CHECK-LABEL: @negative_range0to15_corner_case( ; CHECK-NEXT: bb3: ; CHECK-NEXT: [[SWITCH_SELECTCMP_CASE1:%.*]] = icmp eq i8 [[F:%.*]], 15 ; CHECK-NEXT: [[SWITCH_SELECTCMP_CASE2:%.*]] = icmp eq i8 [[F]], 8 ; CHECK-NEXT: [[SWITCH_SELECTCMP:%.*]] = or i1 [[SWITCH_SELECTCMP_CASE1]], [[SWITCH_SELECTCMP_CASE2]] -; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false +; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[SWITCH_SELECTCMP]], i1 true, i1 false, !prof [[PROF3]] ; CHECK-NEXT: ret i1 [[TMP0]] ; switch i8 %f, label %bb1 [ i8 15, label %bb2 - i8 8, label %bb2 - ] + i8 8, label %bb2 + ], !prof !1 bb1: br label %bb3 bb2: @@ -626,12 +630,12 @@ bb3: ; Out of range scenarios. Check if the cases, that have a value out of range ; are eliminated and the optimization is performed. 
-define i1 @range0to15_out_of_range_non_prime(i8 range(i8 0, 16) %f) { +define i1 @range0to15_out_of_range_non_prime(i8 range(i8 0, 16) %f) !prof !0 { ; CHECK-LABEL: @range0to15_out_of_range_non_prime( ; CHECK-NEXT: bb3: ; CHECK-NEXT: [[TMP0:%.*]] = and i8 [[F:%.*]], 6 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 6 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false, !prof [[PROF4]] ; CHECK-NEXT: ret i1 [[TMP2]] ; switch i8 %f, label %bb1 [ @@ -640,7 +644,7 @@ define i1 @range0to15_out_of_range_non_prime(i8 range(i8 0, 16) %f) { i8 14, label %bb2 i8 15, label %bb2 i8 22, label %bb2 - ] + ], !prof !6 bb1: br label %bb3 bb2: @@ -650,12 +654,12 @@ bb3: ret i1 %phi } -define i1 @range0to15_out_of_range_non_prime_more(i8 range(i8 0, 16) %f) { +define i1 @range0to15_out_of_range_non_prime_more(i8 range(i8 0, 16) %f) !prof !0 { ; CHECK-LABEL: @range0to15_out_of_range_non_prime_more( ; CHECK-NEXT: bb3: ; CHECK-NEXT: [[TMP0:%.*]] = and i8 [[F:%.*]], 6 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 6 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i1 true, i1 false, !prof [[PROF4]] ; CHECK-NEXT: ret i1 [[TMP2]] ; switch i8 %f, label %bb1 [ @@ -665,7 +669,7 @@ define i1 @range0to15_out_of_range_non_prime_more(i8 range(i8 0, 16) %f) { i8 15, label %bb2 i8 22, label %bb2 i8 23, label %bb2 - ] + ], !prof !7 bb1: br label %bb3 bb2: @@ -675,25 +679,25 @@ bb3: ret i1 %phi } -define i1 @negative_range0to15_out_of_range_non_prime(i8 range(i8 0, 16) %f) { +define i1 @negative_range0to15_out_of_range_non_prime(i8 range(i8 0, 16) %f) !prof !0 { ; CHECK-LABEL: @negative_range0to15_out_of_range_non_prime( ; CHECK-NEXT: switch i8 [[F:%.*]], label [[BB3:%.*]] [ ; CHECK-NEXT: i8 6, label [[BB2:%.*]] ; CHECK-NEXT: i8 14, label [[BB2]] ; CHECK-NEXT: i8 15, label [[BB2]] -; CHECK-NEXT: ] +; CHECK-NEXT: ], !prof [[PROF6]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ true, [[BB2]] ], [ false, [[TMP0:%.*]] ] -; CHECK-NEXT: ret i1 [[TMP2]] +; CHECK-NEXT: [[PHI:%.*]] = phi i1 [ true, [[BB2]] ], [ false, [[TMP0:%.*]] ] +; CHECK-NEXT: ret i1 [[PHI]] ; switch i8 %f, label %bb1 [ i8 6, label %bb2 i8 14, label %bb2 i8 15, label %bb2 i8 23, label %bb2 - ] + ], !prof !2 bb1: br label %bb3 bb2: @@ -703,25 +707,25 @@ bb3: ret i1 %phi } -define i1 @negative_range0to15_out_of_range(i8 range(i8 0, 16) %f) { +define i1 @negative_range0to15_out_of_range(i8 range(i8 0, 16) %f) !prof !0 { ; CHECK-LABEL: @negative_range0to15_out_of_range( ; CHECK-NEXT: switch i8 [[F:%.*]], label [[BB3:%.*]] [ ; CHECK-NEXT: i8 6, label [[BB2:%.*]] ; CHECK-NEXT: i8 7, label [[BB2]] ; CHECK-NEXT: i8 14, label [[BB2]] -; CHECK-NEXT: ] +; CHECK-NEXT: ], !prof [[PROF6]] ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[_0_SROA_0_0:%.*]] = phi i1 [ true, [[BB2]] ], [ false, [[TMP0:%.*]] ] -; CHECK-NEXT: ret i1 [[_0_SROA_0_0]] +; CHECK-NEXT: [[PHI:%.*]] = phi i1 [ true, [[BB2]] ], [ false, [[TMP0:%.*]] ] +; CHECK-NEXT: ret i1 [[PHI]] ; switch i8 %f, label %bb1 [ i8 6, label %bb2 i8 7, label %bb2 i8 14, label %bb2 - i8 150, label %bb2 - ] + i8 -106, label %bb2 + ], !prof !2 bb1: br label %bb3 bb2: @@ -731,7 +735,7 @@ bb3: ret i1 %phi } -define i1 @negative_range0to15_all_out_of_range(i8 range(i8 0, 16) %f) { +define i1 @negative_range0to15_all_out_of_range(i8 range(i8 0, 16) %f) !prof !0 { ; CHECK-LABEL: 
@negative_range0to15_all_out_of_range( ; CHECK-NEXT: bb1: ; CHECK-NEXT: ret i1 false @@ -741,7 +745,7 @@ define i1 @negative_range0to15_all_out_of_range(i8 range(i8 0, 16) %f) { i8 23, label %bb2 i8 30, label %bb2 i8 31, label %bb2 - ] + ], !prof !2 bb1: br label %bb3 bb2: @@ -750,3 +754,43 @@ bb3: %phi = phi i1 [ false, %bb1 ], [ true, %bb2 ] ret i1 %phi } + +define i32 @negative_constfold_select() { +; CHECK-LABEL: @negative_constfold_select( +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i32 poison +; +entry: + switch i32 poison, label %default [ + i32 0, label %bb + i32 2, label %bb + ] + +bb: + br label %default + +default: + %ret = phi i32 [ poison, %entry ], [ poison, %bb ] + ret i32 %ret +} + +!0 = !{!"function_entry_count", i64 1000} +!1 = !{!"branch_weights", i32 3, i32 5, i32 7} +!2 = !{!"branch_weights", i32 3, i32 5, i32 7, i32 11, i32 13} +!3 = !{!"branch_weights", i32 3, i32 5, i32 7, i32 11} +!4 = !{!"branch_weights", i32 3, i32 5, i32 7, i32 11, i32 13, i32 17, i32 19, i32 23, i32 29, i32 31, i32 37, i32 41, i32 43, i32 47, i32 53, i32 59, i32 61} +!5 = !{!"branch_weights", i32 3, i32 5} +!6 = !{!"branch_weights", i32 3, i32 5, i32 7, i32 11, i32 13, i32 17} +!7 = !{!"branch_weights", i32 3, i32 5, i32 7, i32 11, i32 13, i32 17, i32 19} +;. +; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 7, i32 8} +; CHECK: [[PROF2]] = !{!"branch_weights", i32 5, i32 10} +; CHECK: [[PROF3]] = !{!"branch_weights", i32 12, i32 3} +; CHECK: [[PROF4]] = !{!"branch_weights", i32 36, i32 3} +; CHECK: [[PROF5]] = !{!"branch_weights", i32 3, i32 5, i32 7, i32 11, i32 13} +; CHECK: [[PROF6]] = !{!"branch_weights", i32 3, i32 5, i32 7, i32 11} +; CHECK: [[PROF7]] = !{!"branch_weights", i32 496, i32 3} +; CHECK: [[PROF8]] = !{!"branch_weights", i32 3, i32 5, i32 7, i32 11, i32 13, i32 17, i32 19, i32 23, i32 29, i32 31, i32 37, i32 41, i32 43, i32 47, i32 53, i32 59, i32 61} +; CHECK: [[PROF9]] = !{!"branch_weights", i32 5, i32 3} +;. 
diff --git a/llvm/test/Transforms/SimplifyCFG/switch_create-custom-dl.ll b/llvm/test/Transforms/SimplifyCFG/switch_create-custom-dl.ll index 336fc5e14d758..8103124e3e5a6 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_create-custom-dl.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_create-custom-dl.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -switch-range-to-icmp < %s | FileCheck %s -target datalayout="p:40:64:64:32" +target datalayout="p:40:64:64:32-pe200:64:64:64:32-pu201:64:64:64:32" declare void @foo1() @@ -89,6 +89,63 @@ F: ; preds = %0 ret void } +; We also allow the transformation for pointers with external state +define void @test1_ptr_external_state(ptr addrspace(200) %V) { +; CHECK-LABEL: @test1_ptr_external_state( +; CHECK-NEXT: [[MAGICPTR:%.*]] = ptrtoint ptr addrspace(200) [[V:%.*]] to i64 +; CHECK-NEXT: switch i64 [[MAGICPTR]], label [[F:%.*]] [ +; CHECK-NEXT: i64 17, label [[T:%.*]] +; CHECK-NEXT: i64 4, label [[T]] +; CHECK-NEXT: ] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: T: +; CHECK-NEXT: call void @foo1() +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: F: +; CHECK-NEXT: call void @foo2() +; CHECK-NEXT: br label [[COMMON_RET]] +; + %C1 = icmp eq ptr addrspace(200) %V, inttoptr (i32 4 to ptr addrspace(200)) + %C2 = icmp eq ptr addrspace(200) %V, inttoptr (i32 17 to ptr addrspace(200)) + %CN = or i1 %C1, %C2 ; [#uses=1] + br i1 %CN, label %T, label %F +T: ; preds = %0 + call void @foo1( ) + ret void +F: ; preds = %0 + call void @foo2( ) + ret void +} + +; But it is not permitted for unstable pointer representations +define void @test1_ptr_unstable(ptr addrspace(201) %V) { +; CHECK-LABEL: @test1_ptr_unstable( +; CHECK-NEXT: [[C1:%.*]] = icmp eq ptr addrspace(201) [[V:%.*]], inttoptr (i32 4 to ptr addrspace(201)) +; CHECK-NEXT: [[C2:%.*]] = icmp eq ptr addrspace(201) [[V]], inttoptr (i32 17 to ptr addrspace(201)) +; CHECK-NEXT: [[CN:%.*]] = or i1 [[C1]], [[C2]] +; CHECK-NEXT: br i1 [[CN]], label [[T:%.*]], label [[F:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: T: +; CHECK-NEXT: call void @foo1() +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: F: +; CHECK-NEXT: call void @foo2() +; CHECK-NEXT: br label [[COMMON_RET]] +; + %C1 = icmp eq ptr addrspace(201) %V, inttoptr (i32 4 to ptr addrspace(201)) + %C2 = icmp eq ptr addrspace(201) %V, inttoptr (i32 17 to ptr addrspace(201)) + %CN = or i1 %C1, %C2 ; [#uses=1] + br i1 %CN, label %T, label %F +T: ; preds = %0 + call void @foo1( ) + ret void +F: ; preds = %0 + call void @foo2( ) + ret void +} + define void @test2(i32 %V) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: switch i32 [[V:%.*]], label [[T:%.*]] [ diff --git a/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll b/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll index 6341c8945247d..1503a1b51d256 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll @@ -14,9 +14,9 @@ define <4 x i32> @load_i32_zext_to_v4i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; 
CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -36,9 +36,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_both_nneg(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext nneg <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -58,9 +58,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext nneg <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -80,9 +80,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_outer_nneg(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -102,9 +102,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_inner_nneg_outer_sext(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext nneg <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> 
+; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -125,9 +125,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_clobber_after_load(ptr %di) { ; CHECK-NEXT: call void @use.i32(i32 0) ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -148,9 +148,9 @@ define <4 x i32> @load_i32_sext_zext_to_v4i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -170,9 +170,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_load_other_users(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: call void @use.i32(i32 [[L]]) ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; @@ -194,9 +194,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_ins_other_users(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: call void @use.v2i32(<2 x i32> [[VEC_INS]]) ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; @@ -218,9 +218,9 @@ define <4 x i32> 
@load_i32_zext_to_v4i32_bc_other_users(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: call void @use.v8i8(<8 x i8> [[VEC_BC]]) ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; @@ -266,10 +266,10 @@ define <4 x i32> @load_i32_zext_to_v4i32_shuffle_other_users(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> -; CHECK-NEXT: call void @use.v8i16(<4 x i16> [[VEC_SHUFFLE]]) +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[E_1]] to <4 x i32> +; CHECK-NEXT: call void @use.v8i16(<4 x i16> [[E_1]]) ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -290,9 +290,9 @@ define <8 x i32> @load_i64_zext_to_v8i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[DI]], align 8 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i64> , i64 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = zext <16 x i8> [[VEC_BC]] to <16 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i16> [[EXT_1]], <16 x i16> poison, <8 x i32> -; CHECK-NEXT: [[OUTER_EXT:%.*]] = zext nneg <8 x i16> [[VEC_SHUFFLE]] to <8 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i8> [[VEC_BC]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = zext <8 x i8> [[VEC_SHUFFLE]] to <8 x i16> +; CHECK-NEXT: [[OUTER_EXT:%.*]] = zext nneg <8 x i16> [[EXT_1]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[OUTER_EXT]] ; entry: @@ -312,9 +312,9 @@ define <3 x i32> @load_i24_zext_to_v3i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i24, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i24> , i24 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = zext <6 x i8> [[VEC_BC]] to <6 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i16> [[EXT_1]], <6 x i16> poison, <3 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <3 x i16> [[VEC_SHUFFLE]] to <3 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i8> [[VEC_BC]], <6 x i8> poison, <3 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = zext <3 x i8> [[VEC_SHUFFLE]] to <3 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <3 x i16> [[EXT_1]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[EXT_2]] ; entry: @@ -334,9 +334,9 @@ define <4 x i32> @load_i32_insert_idx_1_sext(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load 
i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 1 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[EXT_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -356,9 +356,9 @@ define <4 x i32> @mask_extracts_not_all_elements_1_sext(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[EXT_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -378,9 +378,9 @@ define <4 x i32> @mask_extracts_not_all_elements_2_sext(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[EXT_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -422,9 +422,9 @@ define <4 x i32> @load_i32_sext_to_v4i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[E_1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[E_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -444,9 +444,9 @@ define <8 x i32> @load_i64_sext_to_v8i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[DI]], align 8 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i64> , i64 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = sext <16 x i8> [[VEC_BC]] to <16 x i16> -; CHECK-NEXT: 
[[VEC_SHUFFLE:%.*]] = shufflevector <16 x i16> [[EXT_1]], <16 x i16> poison, <8 x i32> -; CHECK-NEXT: [[OUTER_EXT:%.*]] = sext <8 x i16> [[VEC_SHUFFLE]] to <8 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i8> [[VEC_BC]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = sext <8 x i8> [[VEC_SHUFFLE]] to <8 x i16> +; CHECK-NEXT: [[OUTER_EXT:%.*]] = sext <8 x i16> [[EXT_1]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[OUTER_EXT]] ; entry: @@ -466,9 +466,9 @@ define <3 x i32> @load_i24_sext_to_v3i32(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i24, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i24> , i24 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = sext <6 x i8> [[VEC_BC]] to <6 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i16> [[EXT_1]], <6 x i16> poison, <3 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = sext <3 x i16> [[VEC_SHUFFLE]] to <3 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i8> [[VEC_BC]], <6 x i8> poison, <3 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = sext <3 x i8> [[VEC_SHUFFLE]] to <3 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = sext <3 x i16> [[EXT_1]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[EXT_2]] ; entry: @@ -488,9 +488,9 @@ define <4 x i32> @load_i32_insert_idx_1(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 1 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[EXT_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -510,9 +510,9 @@ define <4 x i32> @mask_extracts_not_all_elements_1(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[EXT_1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[EXT_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: @@ -532,9 +532,9 @@ define <4 x i32> @mask_extracts_not_all_elements_2(ptr %di) { ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 ; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 ; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[EXT_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[EXT_1]], <8 x i16> poison, <4 x i32> -; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[VEC_BC]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: 
[[EXT_1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[EXT_1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll index acbc836ffcab0..ed29719d49493 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll @@ -205,8 +205,8 @@ define <8 x i8> @abs_different(<8 x i8> %a) { define <4 x i32> @poison_intrinsic(<2 x i16> %l256) { ; CHECK-LABEL: @poison_intrinsic( ; CHECK-NEXT: [[L266:%.*]] = call <2 x i16> @llvm.abs.v2i16(<2 x i16> [[L256:%.*]], i1 false) -; CHECK-NEXT: [[L267:%.*]] = zext <2 x i16> [[L266]] to <2 x i32> -; CHECK-NEXT: [[L271:%.*]] = shufflevector <2 x i32> [[L267]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[L267:%.*]] = shufflevector <2 x i16> [[L266]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[L271:%.*]] = zext <4 x i16> [[L267]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[L271]] ; %l266 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %l256, i1 false) @@ -534,9 +534,9 @@ define <4 x i64> @single_zext(<4 x i32> %x) { define <4 x i64> @not_zext(<4 x i32> %x) { ; CHECK-LABEL: @not_zext( -; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i32> [[X:%.*]] to <4 x i64> -; CHECK-NEXT: [[REVSHUF:%.*]] = shufflevector <4 x i64> [[ZEXT]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i64> [[REVSHUF]] +; CHECK-NEXT: [[REVSHUF:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i32> [[REVSHUF:%.*]] to <4 x i64> +; CHECK-NEXT: ret <4 x i64> [[ZEXT]] ; %zext = zext <4 x i32> %x to <4 x i64> %revshuf = shufflevector <4 x i64> %zext, <4 x i64> poison, <4 x i32> @@ -922,10 +922,9 @@ define <4 x i8> @singleop(<4 x i8> %a, <4 x i8> %b) { define <4 x i64> @cast_mismatched_types(<4 x i32> %x) { ; CHECK-LABEL: @cast_mismatched_types( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i32> [[SHUF]] to <2 x i64> -; CHECK-NEXT: [[EXTSHUF:%.*]] = shufflevector <2 x i64> [[ZEXT]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: ret <4 x i64> [[EXTSHUF]] +; CHECK-SAME: <4 x i32> [[X:%.*]]) { +; CHECK-NEXT: [[ZEXT:%.*]] = zext <4 x i32> [[X]] to <4 x i64> +; CHECK-NEXT: ret <4 x i64> [[ZEXT]] ; %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <2 x i32> %zext = zext <2 x i32> %shuf to <2 x i64> diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/narrow-phi-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/narrow-phi-of-shuffles.ll index 8c504843d87d8..b293976974bf5 100644 --- a/llvm/test/Transforms/VectorCombine/AMDGPU/narrow-phi-of-shuffles.ll +++ b/llvm/test/Transforms/VectorCombine/AMDGPU/narrow-phi-of-shuffles.ll @@ -392,7 +392,7 @@ define <4 x i32> @shuffle_v4i32(<3 x i32> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -427,7 +427,7 @@ define <8 x i32> @shuffle_v8i32(<3 x i32> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: 
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -462,7 +462,7 @@ define <16 x i32> @shuffle_v16i32(<3 x i32> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -497,7 +497,7 @@ define <32 x i32> @shuffle_v32i32(<3 x i32> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x i32> [[ARG0]], <3 x i32> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -1092,7 +1092,7 @@ define <4 x float> @shuffle_v4f32(<3 x float> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -1127,7 +1127,7 @@ define <6 x float> @shuffle_v6f32(<3 x float> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -1162,7 +1162,7 @@ define <8 x float> @shuffle_v8f32(<3 x float> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -1197,7 +1197,7 @@ define <16 x float> @shuffle_v16f32(<3 x float> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -1232,7 +1232,7 @@ define <32 x float> @shuffle_v32f32(<3 x float> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[ARG0]], <3 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x 
float> [[ARG0]], <3 x float> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: diff --git a/llvm/test/Transforms/VectorCombine/X86/narrow-phi-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/narrow-phi-of-shuffles.ll index 59422e98cbcc6..594017ecf84c3 100644 --- a/llvm/test/Transforms/VectorCombine/X86/narrow-phi-of-shuffles.ll +++ b/llvm/test/Transforms/VectorCombine/X86/narrow-phi-of-shuffles.ll @@ -605,7 +605,7 @@ define <4 x bfloat> @shuffle_v4bf16(<3 x bfloat> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -640,7 +640,7 @@ define <6 x bfloat> @shuffle_v6bf16(<3 x bfloat> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -675,7 +675,7 @@ define <8 x bfloat> @shuffle_v8bf16(<3 x bfloat> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -710,7 +710,7 @@ define <16 x bfloat> @shuffle_v16bf16(<3 x bfloat> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -745,7 +745,7 @@ define <32 x bfloat> @shuffle_v32bf16(<3 x bfloat> %arg0, i1 %cond) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK: [[THEN]]: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x bfloat> [[ARG0]], <3 x bfloat> poison, <3 x i32> ; CHECK-NEXT: tail call void @func0() ; CHECK-NEXT: br label %[[FINALLY:.*]] ; CHECK: [[ELSE]]: @@ -850,7 +850,7 @@ define <4 x half> @shuffle_v4f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V1-NEXT: [[ENTRY:.*:]] ; CHECK-V1-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V1: [[THEN]]: -; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V1-NEXT: tail call void @func0() ; CHECK-V1-NEXT: br label %[[FINALLY:.*]] ; CHECK-V1: [[ELSE]]: @@ -866,7 +866,7 @@ define <4 x half> @shuffle_v4f16(<3 x half> %arg0, i1 %cond) { ; 
CHECK-V2-NEXT: [[ENTRY:.*:]] ; CHECK-V2-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V2: [[THEN]]: -; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V2-NEXT: tail call void @func0() ; CHECK-V2-NEXT: br label %[[FINALLY:.*]] ; CHECK-V2: [[ELSE]]: @@ -933,7 +933,7 @@ define <6 x half> @shuffle_v6f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V1-NEXT: [[ENTRY:.*:]] ; CHECK-V1-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V1: [[THEN]]: -; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V1-NEXT: tail call void @func0() ; CHECK-V1-NEXT: br label %[[FINALLY:.*]] ; CHECK-V1: [[ELSE]]: @@ -949,7 +949,7 @@ define <6 x half> @shuffle_v6f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V2-NEXT: [[ENTRY:.*:]] ; CHECK-V2-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V2: [[THEN]]: -; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V2-NEXT: tail call void @func0() ; CHECK-V2-NEXT: br label %[[FINALLY:.*]] ; CHECK-V2: [[ELSE]]: @@ -1016,7 +1016,7 @@ define <8 x half> @shuffle_v8f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V1-NEXT: [[ENTRY:.*:]] ; CHECK-V1-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V1: [[THEN]]: -; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V1-NEXT: tail call void @func0() ; CHECK-V1-NEXT: br label %[[FINALLY:.*]] ; CHECK-V1: [[ELSE]]: @@ -1032,7 +1032,7 @@ define <8 x half> @shuffle_v8f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V2-NEXT: [[ENTRY:.*:]] ; CHECK-V2-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V2: [[THEN]]: -; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V2-NEXT: tail call void @func0() ; CHECK-V2-NEXT: br label %[[FINALLY:.*]] ; CHECK-V2: [[ELSE]]: @@ -1099,7 +1099,7 @@ define <16 x half> @shuffle_v16f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V1-NEXT: [[ENTRY:.*:]] ; CHECK-V1-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V1: [[THEN]]: -; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V1-NEXT: tail call void @func0() ; CHECK-V1-NEXT: br label %[[FINALLY:.*]] ; CHECK-V1: [[ELSE]]: @@ -1115,7 +1115,7 @@ define <16 x half> @shuffle_v16f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V2-NEXT: [[ENTRY:.*:]] ; CHECK-V2-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V2: [[THEN]]: -; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V2-NEXT: tail call void @func0() ; CHECK-V2-NEXT: br label %[[FINALLY:.*]] ; CHECK-V2: [[ELSE]]: @@ -1182,7 +1182,7 @@ define <32 x half> @shuffle_v32f16(<3 x half> %arg0, i1 
%cond) { ; CHECK-V1-NEXT: [[ENTRY:.*:]] ; CHECK-V1-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V1: [[THEN]]: -; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V1-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V1-NEXT: tail call void @func0() ; CHECK-V1-NEXT: br label %[[FINALLY:.*]] ; CHECK-V1: [[ELSE]]: @@ -1198,7 +1198,7 @@ define <32 x half> @shuffle_v32f16(<3 x half> %arg0, i1 %cond) { ; CHECK-V2-NEXT: [[ENTRY:.*:]] ; CHECK-V2-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] ; CHECK-V2: [[THEN]]: -; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> +; CHECK-V2-NEXT: [[TMP0:%.*]] = shufflevector <3 x half> [[ARG0]], <3 x half> poison, <3 x i32> ; CHECK-V2-NEXT: tail call void @func0() ; CHECK-V2-NEXT: br label %[[FINALLY:.*]] ; CHECK-V2: [[ELSE]]: diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll index fba4b60ef417b..82a739964c9d0 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll @@ -342,3 +342,59 @@ define <16 x i32> @concat_sext_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <16 x i32> ret <16 x i32> %r } + +; Unary shuffles + +define <4 x i16> @unary_shuffle_zext_v8i8_v4i16(<8 x i8> %a0) { +; CHECK-LABEL: define <4 x i16> @unary_shuffle_zext_v8i8_v4i16( +; CHECK-SAME: <8 x i8> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[A0]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[X1:%.*]] = zext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: ret <4 x i16> [[X1]] +; + %x1 = zext <8 x i8> %a0 to <8 x i16> + %vec.shuffle = shufflevector <8 x i16> %x1, <8 x i16> poison, <4 x i32> + ret <4 x i16> %vec.shuffle +} + +define <4 x i16> @unary_shuffle_sext_v8i8_v4i16(<8 x i8> %a0) { +; CHECK-LABEL: define <4 x i16> @unary_shuffle_sext_v8i8_v4i16( +; CHECK-SAME: <8 x i8> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i8> [[A0]], <8 x i8> poison, <4 x i32> +; CHECK-NEXT: [[X1:%.*]] = sext <4 x i8> [[VEC_SHUFFLE]] to <4 x i16> +; CHECK-NEXT: ret <4 x i16> [[X1]] +; + %x1 = sext <8 x i8> %a0 to <8 x i16> + %vec.shuffle = shufflevector <8 x i16> %x1, <8 x i16> poison, <4 x i32> + ret <4 x i16> %vec.shuffle +} + +; negative - avoid loop with foldBitcastOfShuffle + +define <2 x i32> @unary_shuffle_bitcast_v8i8_v2i32(<8 x i8> %a0) { +; CHECK-LABEL: define <2 x i32> @unary_shuffle_bitcast_v8i8_v2i32( +; CHECK-SAME: <8 x i8> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X1:%.*]] = bitcast <8 x i8> [[A0]] to <2 x i32> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <2 x i32> [[X1]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[VEC_SHUFFLE]] +; + %x1 = bitcast <8 x i8> %a0 to <2 x i32> + %vec.shuffle = shufflevector <2 x i32> %x1, <2 x i32> poison, <2 x i32> + ret <2 x i32> %vec.shuffle +} + +; negative - multiuse + +define <4 x i16> @unary_shuffle_sext_v8i8_v4i16_multiuse(<8 x i8> %a0, ptr %a1) { +; CHECK-LABEL: define <4 x i16> @unary_shuffle_sext_v8i8_v4i16_multiuse( +; CHECK-SAME: <8 x i8> [[A0:%.*]], ptr [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X1:%.*]] = sext <8 x i8> [[A0]] to <8 x i16> +; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[X1]], <8 x i16> poison, <4 x i32> +; CHECK-NEXT: store <8 x i16> 
[[X1]], ptr [[A1]], align 16 +; CHECK-NEXT: ret <4 x i16> [[VEC_SHUFFLE]] +; + %x1 = sext <8 x i8> %a0 to <8 x i16> + %vec.shuffle = shufflevector <8 x i16> %x1, <8 x i16> poison, <4 x i32> + store <8 x i16> %x1, ptr %a1, align 16 + ret <4 x i16> %vec.shuffle +} diff --git a/llvm/test/Verifier/assume-bundles.ll b/llvm/test/Verifier/assume-bundles.ll index d8037b965edb5..728b118c99fb6 100644 --- a/llvm/test/Verifier/assume-bundles.ll +++ b/llvm/test/Verifier/assume-bundles.ll @@ -3,7 +3,7 @@ declare void @llvm.assume(i1) -define void @func(ptr %P, i32 %P1, ptr %P2, ptr %P3) { +define void @func(ptr %P, i32 %P1, ptr %P2, ptr %P3, i1 %cond) { ; CHECK: tags must be valid attribute names ; CHECK: "adazdazd" call void @llvm.assume(i1 true) ["adazdazd"()] @@ -32,5 +32,7 @@ define void @func(ptr %P, i32 %P1, ptr %P2, ptr %P3) { call void @llvm.assume(i1 true) ["separate_storage"(ptr %P, i32 123)] ; CHECK: dereferenceable assumptions should have 2 arguments call void @llvm.assume(i1 true) ["align"(ptr %P, i32 4), "dereferenceable"(ptr %P)] +; CHECK: assume with operand bundles must have i1 true condition + call void @llvm.assume(i1 %cond) ["nonnull"(ptr %P)] ret void } diff --git a/llvm/test/Verifier/errno-tbaa-metadata-1.ll b/llvm/test/Verifier/errno-tbaa-metadata-1.ll new file mode 100644 index 0000000000000..0530653309966 --- /dev/null +++ b/llvm/test/Verifier/errno-tbaa-metadata-1.ll @@ -0,0 +1,5 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +; CHECK: assembly parsed, but does not verify as correct! +; CHECK-NEXT: llvm.errno.tbaa must have at least one operand +!llvm.errno.tbaa = !{} diff --git a/llvm/test/Verifier/errno-tbaa-metadata-2.ll b/llvm/test/Verifier/errno-tbaa-metadata-2.ll new file mode 100644 index 0000000000000..6b2a4c6e8bda7 --- /dev/null +++ b/llvm/test/Verifier/errno-tbaa-metadata-2.ll @@ -0,0 +1,9 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +; CHECK: assembly parsed, but does not verify as correct! 
+; CHECK-NEXT: Malformed struct tag metadata: base and access-type should be non-null and point to Metadata nodes +!llvm.errno.tbaa = !{!0} +!0 = !{!1, i64 0, !1} +!1 = !{!"int", !2, i64 0} +!2 = !{!"omnipotent char", !3, i64 0} +!3 = !{!"Simple C/C++ TBAA"} diff --git a/llvm/test/Verifier/preallocated-invalid.ll b/llvm/test/Verifier/preallocated-invalid.ll index 38ed1067c497d..2c5aff231e1bd 100644 --- a/llvm/test/Verifier/preallocated-invalid.ll +++ b/llvm/test/Verifier/preallocated-invalid.ll @@ -65,13 +65,21 @@ define void @preallocated_one_call() { ret void } -; CHECK: must be a constant +; CHECK: immarg operand has non-immediate parameter define void @preallocated_setup_constant() { %ac = call i32 @blackbox() %cs = call token @llvm.call.preallocated.setup(i32 %ac) ret void } +; CHECK: llvm.call.preallocated.alloc arg index must be a constant +define void @preallocated_arg_constant() { + %ac = call i32 @blackbox() + %cs = call token @llvm.call.preallocated.setup(i32 3) + call token @llvm.call.preallocated.arg(token %cs, i32 %ac) + ret void +} + ; CHECK: must be between 0 and corresponding define void @preallocated_setup_arg_index_in_bounds() { %cs = call token @llvm.call.preallocated.setup(i32 2) diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/conflicting-prefixes.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/conflicting-prefixes.ll new file mode 100644 index 0000000000000..fdc53951d6bb0 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/conflicting-prefixes.ll @@ -0,0 +1,16 @@ +; RUN: sed 's/RETVAL/1/g' %s | llc -mtriple=riscv32 \ +; RUN: | FileCheck -check-prefixes=CHECK,CHECKA %s +; RUN: sed 's/RETVAL/2/g' %s | llc -mtriple=riscv32 \ +; RUN: | FileCheck -check-prefixes=CHECK,CHECKA %s +; RUN: sed 's/RETVAL/3/g' %s | llc -mtriple=riscv32 \ +; RUN: | FileCheck -check-prefixes=CHECK,CHECKB %s +; RUN: sed 's/RETVAL/4/g' %s | llc -mtriple=riscv32 \ +; RUN: | FileCheck -check-prefixes=CHECK,CHECKB %s + +define i32 @foo() { + ret i32 RETVAL +} + +define i32 @bar() { + ret i32 100 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/conflicting-prefixes.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/conflicting-prefixes.ll.expected new file mode 100644 index 0000000000000..b3cad11e2ec1d --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/conflicting-prefixes.ll.expected @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no-generate-body-for-unused-prefixes +; RUN: sed 's/RETVAL/1/g' %s | llc -mtriple=riscv32 \ +; RUN: | FileCheck -check-prefixes=CHECK,CHECKA %s +; RUN: sed 's/RETVAL/2/g' %s | llc -mtriple=riscv32 \ +; RUN: | FileCheck -check-prefixes=CHECK,CHECKA %s +; RUN: sed 's/RETVAL/3/g' %s | llc -mtriple=riscv32 \ +; RUN: | FileCheck -check-prefixes=CHECK,CHECKB %s +; RUN: sed 's/RETVAL/4/g' %s | llc -mtriple=riscv32 \ +; RUN: | FileCheck -check-prefixes=CHECK,CHECKB %s + +define i32 @foo() { + ret i32 RETVAL +} + +define i32 @bar() { +; CHECK-LABEL: bar: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 100 +; CHECK-NEXT: ret + ret i32 100 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/differing-set-of-functions.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/differing-set-of-functions.ll new file mode 100644 index 0000000000000..6c3c66e1a7229 --- /dev/null +++ 
b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/differing-set-of-functions.ll @@ -0,0 +1,14 @@ +; RUN: sed 's/FN/foo/g' %s | llc -mtriple=riscv32 \ +; RUN: | FileCheck -check-prefixes=CHECK,CHECKA %s +; RUN: sed 's/FN/foo/g' %s | llc -mtriple=riscv32 \ +; RUN: | FileCheck -check-prefixes=CHECK,CHECKB %s +; RUN: sed 's/FN/bar/g' %s | llc -mtriple=riscv32 \ +; RUN: | FileCheck -check-prefixes=CHECK,CHECKC %s + +define i32 @FN() { + ret i32 1 +} + +define i32 @common() { + ret i32 100 +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/differing-set-of-functions.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/differing-set-of-functions.ll.expected new file mode 100644 index 0000000000000..b851f3a3ae249 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/differing-set-of-functions.ll.expected @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --include-generated-funcs +; RUN: sed 's/FN/foo/g' %s | llc -mtriple=riscv32 \ +; RUN: | FileCheck -check-prefixes=CHECK,CHECKA %s +; RUN: sed 's/FN/foo/g' %s | llc -mtriple=riscv32 \ +; RUN: | FileCheck -check-prefixes=CHECK,CHECKB %s +; RUN: sed 's/FN/bar/g' %s | llc -mtriple=riscv32 \ +; RUN: | FileCheck -check-prefixes=CHECK,CHECKC %s + +define i32 @FN() { + ret i32 1 +} + +define i32 @common() { + ret i32 100 +} +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: ret +; +; CHECK-LABEL: common: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 100 +; CHECK-NEXT: ret +; +; CHECKA-LABEL: foo: +; CHECKA: # %bb.0: +; CHECKA-NEXT: li a0, 1 +; CHECKA-NEXT: ret +; +; CHECKA-LABEL: common: +; CHECKA: # %bb.0: +; CHECKA-NEXT: li a0, 100 +; CHECKA-NEXT: ret +; +; CHECKB-LABEL: foo: +; CHECKB: # %bb.0: +; CHECKB-NEXT: li a0, 1 +; CHECKB-NEXT: ret +; +; CHECKB-LABEL: common: +; CHECKB: # %bb.0: +; CHECKB-NEXT: li a0, 100 +; CHECKB-NEXT: ret +; +; CHECKC-LABEL: bar: +; CHECKC: # %bb.0: +; CHECKC-NEXT: li a0, 1 +; CHECKC-NEXT: ret +; +; CHECKC-LABEL: common: +; CHECKC: # %bb.0: +; CHECKC-NEXT: li a0, 100 +; CHECKC-NEXT: ret diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/conflicting-prefixes.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/conflicting-prefixes.test new file mode 100644 index 0000000000000..e835b5f83f9a0 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/conflicting-prefixes.test @@ -0,0 +1,7 @@ +# REQUIRES: riscv-registered-target + +# RUN: cp -f %S/Inputs/conflicting-prefixes.ll %t.ll +# RUN: %update_llc_test_checks --no-generate-body-for-unused-prefixes %t.ll 2>&1 | FileCheck %s +# RUN: diff -u %S/Inputs/conflicting-prefixes.ll.expected %t.ll + +# CHECK: WARNING: For function 'foo', the following RUN lines will not generate checks due to conflicting output: RUN #1 (prefixes: CHECK, CHECKA), RUN #2 (prefixes: CHECK, CHECKA), RUN #3 (prefixes: CHECK, CHECKB), RUN #4 (prefixes: CHECK, CHECKB): diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/differing-set-of-functions.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/differing-set-of-functions.test new file mode 100644 index 0000000000000..749f3f2a528c1 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/differing-set-of-functions.test @@ -0,0 +1,11 @@ +# REQUIRES: riscv-registered-target + +# RUN: cp -f %S/Inputs/differing-set-of-functions.ll %t.ll +# RUN: %update_llc_test_checks --include-generated-funcs 
%t.ll 2>&1 | FileCheck --allow-empty %s +# RUN: diff -u %S/Inputs/differing-set-of-functions.ll.expected %t.ll + +# We shouldn't print the warning for clashing CHECK prefixes in the case that +# we're trying to handle a function that is only present for some RUN lines. +# Better warning behaviour than this might be possible. + +# CHECK-NOT: WARNING diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/prefix-never-matches.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/prefix-never-matches.test index 2e75148addd84..90ae70bda64d9 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/prefix-never-matches.test +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/prefix-never-matches.test @@ -4,5 +4,5 @@ # RUN: %update_llc_test_checks --no-generate-body-for-unused-prefixes %t.ll 2>&1 | FileCheck %s # RUN: FileCheck --input-file=%t.ll %s --check-prefix=OUTPUT -# CHECK: WARNING: Prefix A had conflicting output +# CHECK: WARNING: For function 'fold_v2i64', the following RUN lines will not generate checks due to conflicting output # OUTPUT-NOT: A: diff --git a/llvm/test/tools/dsymutil/ARM/remarks-linking-bundle-empty.test b/llvm/test/tools/dsymutil/ARM/remarks-linking-bundle-empty.test new file mode 100644 index 0000000000000..0a89fa1ddee3c --- /dev/null +++ b/llvm/test/tools/dsymutil/ARM/remarks-linking-bundle-empty.test @@ -0,0 +1,13 @@ +RUN: rm -rf %t +RUN: mkdir -p %t +RUN: cat %p/../Inputs/remarks/basic.macho.remarks.empty.arm64 > %t/basic.macho.remarks.empty.arm64 + +RUN: dsymutil -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%p/../Inputs %t/basic.macho.remarks.empty.arm64 + +Check that the remark file in the bundle does not exist: +RUN: not cat %t/basic.macho.remarks.empty.arm64.dSYM/Contents/Resources/Remarks/basic.macho.remarks.arm64 2>&1 + +RUN: dsymutil --linker parallel -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%p/../Inputs %t/basic.macho.remarks.empty.arm64 + +Check that the remark file in the bundle does not exist: +RUN: not cat %t/basic.macho.remarks.empty.arm64.dSYM/Contents/Resources/Remarks/basic.macho.remarks.empty.arm64 2>&1 diff --git a/llvm/test/tools/dsymutil/ARM/remarks-linking-bundle.test b/llvm/test/tools/dsymutil/ARM/remarks-linking-bundle.test new file mode 100644 index 0000000000000..e1b04455b0d9d --- /dev/null +++ b/llvm/test/tools/dsymutil/ARM/remarks-linking-bundle.test @@ -0,0 +1,81 @@ +RUN: rm -rf %t +RUN: mkdir -p %t/private/tmp/remarks +RUN: cat %p/../Inputs/remarks/basic.macho.remarks.arm64> %t/basic.macho.remarks.arm64 +RUN: llvm-remarkutil yaml2bitstream %p/../Inputs/private/tmp/remarks/basic1.macho.remarks.arm64.opt.yaml -o %t/private/tmp/remarks/basic1.macho.remarks.arm64.opt.bitstream +RUN: llvm-remarkutil yaml2bitstream %p/../Inputs/private/tmp/remarks/basic2.macho.remarks.arm64.opt.yaml -o %t/private/tmp/remarks/basic2.macho.remarks.arm64.opt.bitstream +RUN: llvm-remarkutil yaml2bitstream %p/../Inputs/private/tmp/remarks/basic3.macho.remarks.arm64.opt.yaml -o %t/private/tmp/remarks/basic3.macho.remarks.arm64.opt.bitstream + +RUN: dsymutil -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%t %t/basic.macho.remarks.arm64 + +Check that the remark file in the bundle exists and is sane: +RUN: llvm-bcanalyzer -dump %t/basic.macho.remarks.arm64.dSYM/Contents/Resources/Remarks/basic.macho.remarks.arm64 | FileCheck %s + +RUN: dsymutil --linker parallel -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%t %t/basic.macho.remarks.arm64 + +Check that the remark file in the bundle exists and 
is sane: +RUN: llvm-bcanalyzer -dump %t/basic.macho.remarks.arm64.dSYM/Contents/Resources/Remarks/basic.macho.remarks.arm64 | FileCheck %s + +Now emit it in a different format: YAML. +RUN: dsymutil -remarks-output-format=yaml -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%t %t/basic.macho.remarks.arm64 +RUN: cat %t/basic.macho.remarks.arm64.dSYM/Contents/Resources/Remarks/basic.macho.remarks.arm64 | FileCheck %s --check-prefix=CHECK-YAML + +RUN: dsymutil --linker parallel -remarks-output-format=yaml -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%t %t/basic.macho.remarks.arm64 +RUN: cat %t/basic.macho.remarks.arm64.dSYM/Contents/Resources/Remarks/basic.macho.remarks.arm64 | FileCheck %s --check-prefix=CHECK-YAML + +CHECK: %t/basic.macho.remarks.empty.x86_64 - -RUN: dsymutil -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%p/../Inputs %t/basic.macho.remarks.empty.x86_64 - -Check that the remark file in the bundle does not exist: -RUN: not cat %t/basic.macho.remarks.empty.x86_64.dSYM/Contents/Resources/Remarks/basic.macho.remarks.empty.x86_64 2>&1 - -RUN: dsymutil --linker parallel -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%p/../Inputs %t/basic.macho.remarks.empty.x86_64 - -Check that the remark file in the bundle does not exist: -RUN: not cat %t/basic.macho.remarks.empty.x86_64.dSYM/Contents/Resources/Remarks/basic.macho.remarks.empty.x86_64 2>&1 diff --git a/llvm/test/tools/dsymutil/X86/remarks-linking-bundle.test b/llvm/test/tools/dsymutil/X86/remarks-linking-bundle.test deleted file mode 100644 index d85cd54c8f640..0000000000000 --- a/llvm/test/tools/dsymutil/X86/remarks-linking-bundle.test +++ /dev/null @@ -1,67 +0,0 @@ -RUN: rm -rf %t -RUN: mkdir -p %t -RUN: cat %p/../Inputs/remarks/basic.macho.remarks.x86_64 > %t/basic.macho.remarks.x86_64 - -RUN: dsymutil -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%p/../Inputs %t/basic.macho.remarks.x86_64 - -Check that the remark file in the bundle exists and is sane: -RUN: llvm-bcanalyzer -dump %t/basic.macho.remarks.x86_64.dSYM/Contents/Resources/Remarks/basic.macho.remarks.x86_64 | FileCheck %s - -RUN: dsymutil --linker parallel -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%p/../Inputs %t/basic.macho.remarks.x86_64 - -Check that the remark file in the bundle exists and is sane: -RUN: llvm-bcanalyzer -dump %t/basic.macho.remarks.x86_64.dSYM/Contents/Resources/Remarks/basic.macho.remarks.x86_64 | FileCheck %s - -Now emit it in a different format: YAML. 
-RUN: dsymutil -remarks-output-format=yaml -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%p/../Inputs %t/basic.macho.remarks.x86_64 -RUN: cat %t/basic.macho.remarks.x86_64.dSYM/Contents/Resources/Remarks/basic.macho.remarks.x86_64 | FileCheck %s --check-prefix=CHECK-YAML - -RUN: dsymutil --linker parallel -remarks-output-format=yaml -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%p/../Inputs %t/basic.macho.remarks.x86_64 -RUN: cat %t/basic.macho.remarks.x86_64.dSYM/Contents/Resources/Remarks/basic.macho.remarks.x86_64 | FileCheck %s --check-prefix=CHECK-YAML - -CHECK: %t/fat.macho.remarks.x86 +RUN: llvm-remarkutil yaml2bitstream %p/../Inputs/private/tmp/remarks/fat.macho.remarks.x86_64.opt.yaml -o %t/private/tmp/remarks/fat.macho.remarks.x86_64.opt.bitstream +RUN: llvm-remarkutil yaml2bitstream %p/../Inputs/private/tmp/remarks/fat.macho.remarks.x86_64h.opt.yaml -o %t/private/tmp/remarks/fat.macho.remarks.x86_64h.opt.bitstream -RUN: dsymutil -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%p/../Inputs %t/fat.macho.remarks.x86 +RUN: dsymutil -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%t %t/fat.macho.remarks.x86 Check that the remark files in the bundle exist and are all sane: RUN: llvm-bcanalyzer -dump %t/fat.macho.remarks.x86.dSYM/Contents/Resources/Remarks/fat.macho.remarks.x86-x86_64h | FileCheck %s RUN: llvm-bcanalyzer -dump %t/fat.macho.remarks.x86.dSYM/Contents/Resources/Remarks/fat.macho.remarks.x86-x86_64 | FileCheck %s -RUN: llvm-bcanalyzer -dump %t/fat.macho.remarks.x86.dSYM/Contents/Resources/Remarks/fat.macho.remarks.x86-i386 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-i386 -RUN: dsymutil --linker parallel -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%p/../Inputs %t/fat.macho.remarks.x86 +RUN: dsymutil --linker parallel -oso-prepend-path=%p/../Inputs -remarks-prepend-path=%t %t/fat.macho.remarks.x86 Check that the remark files in the bundle exist and are all sane: RUN: llvm-bcanalyzer -dump %t/fat.macho.remarks.x86.dSYM/Contents/Resources/Remarks/fat.macho.remarks.x86-x86_64h | FileCheck %s RUN: llvm-bcanalyzer -dump %t/fat.macho.remarks.x86.dSYM/Contents/Resources/Remarks/fat.macho.remarks.x86-x86_64 | FileCheck %s -RUN: llvm-bcanalyzer -dump %t/fat.macho.remarks.x86.dSYM/Contents/Resources/Remarks/fat.macho.remarks.x86-i386 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-i386 CHECK: &1 | \ +# RUN: env -u OBJECT_MODE llvm-ar -q -c archive-default.a xcoff32.o elf32.o xcoff64.o elf64.o 2>&1 | \ # RUN: FileCheck %s --check-prefixes=WARN-XCOFF64,WARN-ELF64 # RUN: llvm-ar -t -Xany archive-default.a | \ # RUN: FileCheck %s --check-prefixes=OBJ32 @@ -74,7 +73,7 @@ # RUN: FileCheck %s --check-prefixes=OBJ32_64 ## Test -X option for print operation. -# RUN: llvm-ar -t archive-any.a | \ +# RUN: env -u OBJECT_MODE llvm-ar -t archive-any.a | \ # RUN: FileCheck %s --check-prefixes=OBJ32 # RUN: llvm-ar -t -X32 archive-any.a | \ @@ -115,7 +114,7 @@ # RUN: cmp elf64.o any/elf64.o ## Extract a 64-bit object file with option -X32 (or default object mode). 
-# RUN: not llvm-ar --output=err64 -x archive-any.a xcoff64.o 2>&1 | \ +# RUN: env -u OBJECT_MODE not llvm-ar --output=err64 -x archive-any.a xcoff64.o 2>&1 | \ # RUN: FileCheck %s -DFILE=xcoff64.o --check-prefixes=ERR64 # RUN: not llvm-ar --output=err64 -x -X32 archive-any.a xcoff64.o 2>&1 | \ # RUN: FileCheck %s -DFILE=xcoff64.o --check-prefixes=ERR64 @@ -156,7 +155,7 @@ ## Without -X64, -X32_64 or -Xany, nothing changed here, ## since xcoff.o is a 64-bit object file in command line, but ## the xcoff.o member in archive-rep.a is a 32-bit object file. -# RUN: llvm-ar -r archive-rep.a xcoff.o +# RUN: env -u OBJECT_MODE llvm-ar -r archive-rep.a xcoff.o # RUN: llvm-ar -t -Xany archive-rep.a | \ # RUN: FileCheck %s --check-prefixes=REP # RUN: llvm-nm -Xany --print-armap archive-rep.a | \ @@ -178,7 +177,7 @@ ## Test move member. # RUN: cp archive-any.a archive.a ## Do not move 64-bit object without options -X64, -X32_64, Xany. -# RUN: llvm-ar -ma elf32.o archive.a xcoff64.o 2>&1 | \ +# RUN: env -u OBJECT_MODE llvm-ar -ma elf32.o archive.a xcoff64.o 2>&1 | \ # RUN: FileCheck %s --check-prefix=WARN-XCOFF64 # RUN: llvm-ar -t -Xany archive.a | \ @@ -240,7 +239,7 @@ # MOVE32-EMPTY: ## Move after a file with a bitness that doesn't match the object mode. -# RUN: not llvm-ar -ma xcoff64.o archive-any.a xcoff32.o 2>&1 | \ +# RUN: env -u OBJECT_MODE not llvm-ar -ma xcoff64.o archive-any.a xcoff32.o 2>&1 | \ # RUN: FileCheck %s --check-prefixes=ERR-INSERT-POINT # RUN: not llvm-ar -X32 -ma xcoff64.o archive-any.a xcoff32.o 2>&1 | \ @@ -308,7 +307,7 @@ # RUN: yaml2obj --docnum=5 %s -o wasm.o # RUN: yaml2obj --docnum=6 %s -o coff.o -# RUN: llvm-ar -q -c archive-other32.a coff.o 32.bc 64.bc wasm.o macho32.o macho64.o 2>&1 | \ +# RUN: env -u OBJECT_MODE llvm-ar -q -c archive-other32.a coff.o 32.bc 64.bc wasm.o macho32.o macho64.o 2>&1 | \ # RUN: FileCheck %s --check-prefixes=WARN-64 # RUN: llvm-ar -t -Xany archive-other32.a | \ # RUN: FileCheck %s --check-prefixes=OTHER32 diff --git a/llvm/test/tools/llvm-cov/Inputs/binary-formats.canonical.json b/llvm/test/tools/llvm-cov/Inputs/binary-formats.canonical.json index 5f9122d01da9a..f219ca6c7e179 100644 --- a/llvm/test/tools/llvm-cov/Inputs/binary-formats.canonical.json +++ b/llvm/test/tools/llvm-cov/Inputs/binary-formats.canonical.json @@ -33,4 +33,4 @@ CHECK-SAME: "mcdc":{"count":0,"covered":0,"notcovered":0,"percent":0}, CHECK-SAME: "regions":{"count":1,"covered":1,"notcovered":0,"percent":100}}} CHECK-SAME: ], CHECK-SAME: "type":"llvm.coverage.json.export" -CHECK-SAME: "version":"3.0.1" +CHECK-SAME: "version":"3.1.0" diff --git a/llvm/test/tools/llvm-cov/mcdc-export-json.test b/llvm/test/tools/llvm-cov/mcdc-export-json.test index e6dbd17bee5b2..4b6f3b011451a 100644 --- a/llvm/test/tools/llvm-cov/mcdc-export-json.test +++ b/llvm/test/tools/llvm-cov/mcdc-export-json.test @@ -1,10 +1,10 @@ // RUN: llvm-profdata merge %S/Inputs/mcdc-general.proftext -o %t.profdata // RUN: llvm-cov export --format=text %S/Inputs/mcdc-general.o -instr-profile %t.profdata | FileCheck %s -// CHECK: 12,7,12,27,2,4,0,0,5,[true,true,true,true] -// CHECK: 15,7,15,13,1,2,0,0,5,[true,true] -// CHECK: 15,19,15,25,1,1,0,0,5,[true,false] -// CHECK: 18,7,19,15,1,3,0,0,5,[true,true,false,true] +// CHECK: 
12,7,12,27,2,4,0,0,5,[true,true,true,true],[{"conditions":[false,null,false,null],"executed":true,"result":false},{"conditions":[false,null,true,false],"executed":true,"result":false},{"conditions":[true,false,false,null],"executed":true,"result":false},{"conditions":[true,false,true,false],"executed":true,"result":false},{"conditions":[true,false,true,true],"executed":true,"result":true},{"conditions":[true,true,null,null],"executed":true,"result":true}] +// CHECK: 15,7,15,13,1,2,0,0,5,[true,true],[{"conditions":[false,null],"executed":true,"result":false},{"conditions":[true,false],"executed":true,"result":false},{"conditions":[true,true],"executed":true,"result":true}] +// CHECK: 15,19,15,25,1,1,0,0,5,[true,false],[{"conditions":[false,null],"executed":true,"result":false},{"conditions":[true,true],"executed":true,"result":true}] +// CHECK: 18,7,19,15,1,3,0,0,5,[true,true,false,true],[{"conditions":[false,null,null,null],"executed":true,"result":false},{"conditions":[true,false,null,null],"executed":true,"result":false},{"conditions":[true,true,true,false],"executed":true,"result":false},{"conditions":[true,true,true,true],"executed":true,"result":true}] // CHECK: "mcdc":{"count":12,"covered":10,"notcovered":2,"percent":83.333333333333343} Instructions for regenerating the test: diff --git a/llvm/test/tools/llvm-cov/multiple-path_equivalence.test b/llvm/test/tools/llvm-cov/multiple-path_equivalence.test index 1c8ec9a82a81c..14451d2128124 100644 --- a/llvm/test/tools/llvm-cov/multiple-path_equivalence.test +++ b/llvm/test/tools/llvm-cov/multiple-path_equivalence.test @@ -6,11 +6,12 @@ # /tmp/coverage/b/c/f4.c # Setup -// RUN: touch %/T/main.c; touch %/T/f1.c; touch %/T/f2.c; touch %/T/f3.c; touch %/T/f4.c +// RUN: mkdir -p %t +// RUN: touch %/t/main.c; touch %/t/f1.c; touch %/t/f2.c; touch %/t/f3.c; touch %/t/f4.c // RUN: llvm-profdata merge %S/Inputs/multiple-path_equivalence.proftext -o %t.profdata # Make sure that remapping follows the specified order with the first matching entry being used first (f4 comes before f3) -// RUN: llvm-cov show %S/Inputs/multiple-path_equivalence.covmapping -instr-profile=%t.profdata -path-equivalence=/tmp/coverage/a,%/T -path-equivalence=/tmp/coverage/b/c,%/T -path-equivalence=/tmp/coverage/b,%/T -path-equivalence=/tmp/coverage,%/T 2>&1 | FileCheck %s +// RUN: llvm-cov show %S/Inputs/multiple-path_equivalence.covmapping -instr-profile=%t.profdata -path-equivalence=/tmp/coverage/a,%/t -path-equivalence=/tmp/coverage/b/c,%/t -path-equivalence=/tmp/coverage/b,%/t -path-equivalence=/tmp/coverage,%/t 2>&1 | FileCheck %s // CHECK-DAG: {{/|\\}}tmp{{/|\\}}coverage{{/|\\}}main.c: // CHECK-DAG: {{/|\\}}tmp{{/|\\}}coverage{{/|\\}}f1.c: @@ -20,7 +21,7 @@ // CHECK-NOT: isn't covered. 
# Make sure remapping follows the specified order by proving paths in an overriding order (f4 comes after f3) -// RUN: llvm-cov show %S/Inputs/multiple-path_equivalence.covmapping -instr-profile=%t.profdata -path-equivalence=/tmp/coverage/a,%/T -path-equivalence=/tmp/coverage/b,%/T -path-equivalence=/tmp/coverage/b/c,%/T -path-equivalence=/tmp/coverage,%/T 2>&1 | FileCheck %s -check-prefix=OVERRIDING_ORDER +// RUN: llvm-cov show %S/Inputs/multiple-path_equivalence.covmapping -instr-profile=%t.profdata -path-equivalence=/tmp/coverage/a,%/t -path-equivalence=/tmp/coverage/b,%/t -path-equivalence=/tmp/coverage/b/c,%/t -path-equivalence=/tmp/coverage,%/t 2>&1 | FileCheck %s -check-prefix=OVERRIDING_ORDER // OVERRIDING_ORDER-DAG: {{/|\\}}tmp{{/|\\}}coverage{{/|\\}}main.c: // OVERRIDING_ORDER-DAG: {{/|\\}}tmp{{/|\\}}coverage{{/|\\}}f1.c: @@ -28,5 +29,5 @@ // OVERRIDING_ORDER-DAG: {{/|\\}}tmp{{/|\\}}coverage{{/|\\}}b{{/|\\}}f3.c: // OVERRIDING_ORDER-DAG: warning: The file '{{/|\\}}tmp{{/|\\}}coverage{{/|\\}}b{{/|\\}}c{{/|\\}}f4.c' isn't covered. -// RUN: not llvm-cov show %S/Inputs/multiple-path_equivalence.covmapping -instr-profile=%t.profdata -path-equivalence=/tmp/coverage/a,%/T -path-equivalence=/tmp/coverage/b, -path-equivalence=/tmp/coverage/b/c,%/T -path-equivalence=/tmp/coverage,%/T 2>&1 | FileCheck %s -check-prefix=EMPTY_PATH +// RUN: not llvm-cov show %S/Inputs/multiple-path_equivalence.covmapping -instr-profile=%t.profdata -path-equivalence=/tmp/coverage/a,%/t -path-equivalence=/tmp/coverage/b, -path-equivalence=/tmp/coverage/b/c,%/t -path-equivalence=/tmp/coverage,%/t 2>&1 | FileCheck %s -check-prefix=EMPTY_PATH // EMPTY_PATH: must be in format 'from,to' \ No newline at end of file diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/COFF/01-coff-print-basic-details.test b/llvm/test/tools/llvm-debuginfo-analyzer/COFF/01-coff-print-basic-details.test index 035382897d17e..0696c57d33b1c 100644 --- a/llvm/test/tools/llvm-debuginfo-analyzer/COFF/01-coff-print-basic-details.test +++ b/llvm/test/tools/llvm-debuginfo-analyzer/COFF/01-coff-print-basic-details.test @@ -18,31 +18,14 @@ ; sorted by the debug information internal offset; it includes its lexical ; level and debug info format. -; RUN: llvm-debuginfo-analyzer --attribute=level,format \ -; RUN: --output-sort=offset \ -; RUN: --print=scopes,symbols,types,lines,instructions \ -; RUN: %p/Inputs/test-codeview-clang.o 2>&1 | \ -; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s - -; If `--output-sort=id`, elements are iterated in the order in which they were -; added (which matches the increasing offset of the reference output). ; RUN: llvm-debuginfo-analyzer --attribute=level,format \ ; RUN: --output-sort=id \ ; RUN: --print=scopes,symbols,types,lines,instructions \ ; RUN: %p/Inputs/test-codeview-clang.o 2>&1 | \ ; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s -; If `--output-sort=none`, `LVScope::Children` is not sorted; it, however, -; reflects the order in which elements were added (same as `--output-sort=id`). -; This is expected to change once #69160 is resolved though. 
-; RUN: llvm-debuginfo-analyzer --attribute=level,format \ -; RUN: --output-sort=none \ -; RUN: --print=scopes,symbols,types,lines,instructions \ -; RUN: %p/Inputs/test-codeview-clang.o 2>&1 | \ -; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s - ; RUN: llvm-debuginfo-analyzer --attribute=level,format \ -; RUN: --output-sort=offset \ +; RUN: --output-sort=id \ ; RUN: --print=elements \ ; RUN: %p/Inputs/test-codeview-clang.o 2>&1 | \ ; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s @@ -80,3 +63,43 @@ ; ONE-NEXT: [003] {Code} 'addq $0x20, %rsp' ; ONE-NEXT: [003] {Code} 'retq' ; ONE-NEXT: [002] {TypeAlias} 'INTPTR' -> '* const int' + +; RUN: llvm-debuginfo-analyzer --attribute=level,format \ +; RUN: --output-sort=none \ +; RUN: --print=scopes,symbols,types,lines,instructions \ +; RUN: %p/Inputs/test-codeview-clang.o 2>&1 | \ +; RUN: FileCheck --strict-whitespace -check-prefix=ONE-NOSORT %s + +; ONE-NOSORT: Logical View: +; ONE-NOSORT-NEXT: [000] {File} 'test-codeview-clang.o' -> COFF-x86-64 +; ONE-NOSORT-EMPTY: +; ONE-NOSORT-NEXT: [001] {CompileUnit} 'test.cpp' +; ONE-NOSORT-NEXT: [002] {Function} extern not_inlined 'foo' -> 'int' +; ONE-NOSORT-NEXT: [003] {Block} +; ONE-NOSORT-NEXT: [004] {Variable} 'CONSTANT' -> 'const int' +; ONE-NOSORT-NEXT: [004] 5 {Line} +; ONE-NOSORT-NEXT: [004] {Code} 'movl $0x7, 0x4(%rsp)' +; ONE-NOSORT-NEXT: [004] 6 {Line} +; ONE-NOSORT-NEXT: [004] {Code} 'movl $0x7, 0x1c(%rsp)' +; ONE-NOSORT-NEXT: [004] {Code} 'jmp 0x8' +; ONE-NOSORT-NEXT: [003] {TypeAlias} 'INTEGER' -> 'int' +; ONE-NOSORT-NEXT: [003] {Parameter} 'ParamPtr' -> '* const int' +; ONE-NOSORT-NEXT: [003] {Parameter} 'ParamUnsigned' -> 'unsigned' +; ONE-NOSORT-NEXT: [003] {Parameter} 'ParamBool' -> 'bool' +; ONE-NOSORT-NEXT: [003] 2 {Line} +; ONE-NOSORT-NEXT: [003] {Code} 'subq $0x20, %rsp' +; ONE-NOSORT-NEXT: [003] {Code} 'andb $0x1, %r8b' +; ONE-NOSORT-NEXT: [003] {Code} 'movb %r8b, 0x1b(%rsp)' +; ONE-NOSORT-NEXT: [003] {Code} 'movl %edx, 0x14(%rsp)' +; ONE-NOSORT-NEXT: [003] {Code} 'movq %rcx, 0x8(%rsp)' +; ONE-NOSORT-NEXT: [003] 3 {Line} +; ONE-NOSORT-NEXT: [003] {Code} 'testb $0x1, 0x1b(%rsp)' +; ONE-NOSORT-NEXT: [003] {Code} 'je 0x15' +; ONE-NOSORT-NEXT: [003] 8 {Line} +; ONE-NOSORT-NEXT: [003] {Code} 'movl 0x14(%rsp), %eax' +; ONE-NOSORT-NEXT: [003] {Code} 'movl %eax, 0x1c(%rsp)' +; ONE-NOSORT-NEXT: [003] 9 {Line} +; ONE-NOSORT-NEXT: [003] {Code} 'movl 0x1c(%rsp), %eax' +; ONE-NOSORT-NEXT: [003] {Code} 'addq $0x20, %rsp' +; ONE-NOSORT-NEXT: [003] {Code} 'retq' +; ONE-NOSORT-NEXT: [002] {TypeAlias} 'INTPTR' -> '* const int' diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/01-dwarf-compare-logical-elements.test b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/01-dwarf-compare-logical-elements.test index a076887140c28..1b790eeb3b691 100644 --- a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/01-dwarf-compare-logical-elements.test +++ b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/01-dwarf-compare-logical-elements.test @@ -35,8 +35,8 @@ ; ONE-NEXT: [002] 1 {TypeAlias} 'INTPTR' -> '* const int' ; ONE-NEXT: [002] 2 {Function} extern not_inlined 'foo' -> 'int' ; ONE-NEXT: [003] {Block} -; ONE-NEXT: [004] 5 {Variable} 'CONSTANT' -> 'const INTEGER' ; ONE-NEXT: +[004] 4 {TypeAlias} 'INTEGER' -> 'int' +; ONE-NEXT: [004] 5 {Variable} 'CONSTANT' -> 'const INTEGER' ; ONE-NEXT: [003] 2 {Parameter} 'ParamBool' -> 'bool' ; ONE-NEXT: [003] 2 {Parameter} 'ParamPtr' -> 'INTPTR' ; ONE-NEXT: [003] 2 {Parameter} 'ParamUnsigned' -> 'unsigned int' diff --git 
a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/01-dwarf-print-basic-details.test b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/01-dwarf-print-basic-details.test index 35662554d5593..1ce9c1ef682a2 100644 --- a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/01-dwarf-print-basic-details.test +++ b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/01-dwarf-print-basic-details.test @@ -18,32 +18,23 @@ ; sorted by the debug information internal offset; it includes its lexical ; level and debug info format. -; RUN: llvm-debuginfo-analyzer --attribute=level,format \ -; RUN: --output-sort=offset \ -; RUN: --print=scopes,symbols,types,lines,instructions \ -; RUN: %p/Inputs/test-dwarf-clang.o 2>&1 | \ -; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s - -; If `--output-sort=id`, elements are iterated in the order in which they -; were added (which matches the increasing offset of the reference output). ; RUN: llvm-debuginfo-analyzer --attribute=level,format \ ; RUN: --output-sort=id \ ; RUN: --print=scopes,symbols,types,lines,instructions \ ; RUN: %p/Inputs/test-dwarf-clang.o 2>&1 | \ ; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s -; If `--output-sort=none`, `LVScope::Children` is not sorted; it, however, -; reflects the order in which elements were added (same as `--output-sort=id`). -; This is expected to change once #69160 is resolved though. ; RUN: llvm-debuginfo-analyzer --attribute=level,format \ -; RUN: --output-sort=none \ -; RUN: --print=scopes,symbols,types,lines,instructions \ +; RUN: --output-sort=id \ +; RUN: --print=elements \ ; RUN: %p/Inputs/test-dwarf-clang.o 2>&1 | \ ; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s +; For DWARF, `--output-sort=offset` matches `--output-sort=id`, i.e., +; `LVElement`s are always iterated in the order in which they were added. 
; RUN: llvm-debuginfo-analyzer --attribute=level,format \ ; RUN: --output-sort=offset \ -; RUN: --print=elements \ +; RUN: --print=scopes,symbols,types,lines,instructions \ ; RUN: %p/Inputs/test-dwarf-clang.o 2>&1 | \ ; RUN: FileCheck --strict-whitespace -check-prefix=ONE %s @@ -84,3 +75,47 @@ ; ONE-NEXT: [003] {Code} 'retq' ; ONE-NEXT: [002] 1 {TypeAlias} 'INTPTR' -> '* const int' ; ONE-NEXT: [002] 9 {Line} + +; RUN: llvm-debuginfo-analyzer --attribute=level,format \ +; RUN: --output-sort=none \ +; RUN: --print=scopes,symbols,types,lines,instructions \ +; RUN: %p/Inputs/test-dwarf-clang.o 2>&1 | \ +; RUN: FileCheck --strict-whitespace -check-prefix=ONE-NOSORT %s + +; ONE-NOSORT: Logical View: +; ONE-NOSORT-NEXT: [000] {File} 'test-dwarf-clang.o' -> elf64-x86-64 +; ONE-NOSORT-EMPTY: +; ONE-NOSORT-NEXT: [001] {CompileUnit} 'test.cpp' +; ONE-NOSORT-NEXT: [002] 2 {Function} extern not_inlined 'foo' -> 'int' +; ONE-NOSORT-NEXT: [003] {Block} +; ONE-NOSORT-NEXT: [004] 5 {Variable} 'CONSTANT' -> 'const INTEGER' +; ONE-NOSORT-NEXT: [004] 5 {Line} +; ONE-NOSORT-NEXT: [004] {Code} 'movl $0x7, -0x1c(%rbp)' +; ONE-NOSORT-NEXT: [004] 6 {Line} +; ONE-NOSORT-NEXT: [004] {Code} 'movl $0x7, -0x4(%rbp)' +; ONE-NOSORT-NEXT: [004] {Code} 'jmp 0x6' +; ONE-NOSORT-NEXT: [003] 4 {TypeAlias} 'INTEGER' -> 'int' +; ONE-NOSORT-NEXT: [003] 2 {Parameter} 'ParamPtr' -> 'INTPTR' +; ONE-NOSORT-NEXT: [003] 2 {Parameter} 'ParamUnsigned' -> 'unsigned int' +; ONE-NOSORT-NEXT: [003] 2 {Parameter} 'ParamBool' -> 'bool' +; ONE-NOSORT-NEXT: [003] 2 {Line} +; ONE-NOSORT-NEXT: [003] {Code} 'pushq %rbp' +; ONE-NOSORT-NEXT: [003] {Code} 'movq %rsp, %rbp' +; ONE-NOSORT-NEXT: [003] {Code} 'movb %dl, %al' +; ONE-NOSORT-NEXT: [003] {Code} 'movq %rdi, -0x10(%rbp)' +; ONE-NOSORT-NEXT: [003] {Code} 'movl %esi, -0x14(%rbp)' +; ONE-NOSORT-NEXT: [003] {Code} 'andb $0x1, %al' +; ONE-NOSORT-NEXT: [003] {Code} 'movb %al, -0x15(%rbp)' +; ONE-NOSORT-NEXT: [003] 3 {Line} +; ONE-NOSORT-NEXT: [003] {Code} 'testb $0x1, -0x15(%rbp)' +; ONE-NOSORT-NEXT: [003] {Code} 'je 0x13' +; ONE-NOSORT-NEXT: [003] 8 {Line} +; ONE-NOSORT-NEXT: [003] {Code} 'movl -0x14(%rbp), %eax' +; ONE-NOSORT-NEXT: [003] 8 {Line} +; ONE-NOSORT-NEXT: [003] {Code} 'movl %eax, -0x4(%rbp)' +; ONE-NOSORT-NEXT: [003] 9 {Line} +; ONE-NOSORT-NEXT: [003] {Code} 'movl -0x4(%rbp), %eax' +; ONE-NOSORT-NEXT: [003] {Code} 'popq %rbp' +; ONE-NOSORT-NEXT: [003] {Code} 'retq' +; ONE-NOSORT-NEXT: [002] 1 {TypeAlias} 'INTPTR' -> '* const int' +; ONE-NOSORT-NEXT: [002] 9 {Line} diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/pr-57040-incorrect-function-compare.test b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/pr-57040-incorrect-function-compare.test index 278d4f4850f5f..78604d9164c0f 100644 --- a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/pr-57040-incorrect-function-compare.test +++ b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/pr-57040-incorrect-function-compare.test @@ -55,8 +55,8 @@ ; ONE-NEXT: [002] 1 {TypeAlias} 'INTPTR' -> '* const int' ; ONE-NEXT: [002] 2 {Function} extern not_inlined 'foo' -> 'int' ; ONE-NEXT: [003] {Block} -; ONE-NEXT: [004] 5 {Variable} 'CONSTANT' -> 'const INTEGER' ; ONE-NEXT: +[004] 4 {TypeAlias} 'INTEGER' -> 'int' +; ONE-NEXT: [004] 5 {Variable} 'CONSTANT' -> 'const INTEGER' ; ONE-NEXT: [003] 2 {Parameter} 'ParamBool' -> 'bool' ; ONE-NEXT: [003] 2 {Parameter} 'ParamPtr' -> 'INTPTR' ; ONE-NEXT: [003] 2 {Parameter} 'ParamUnsigned' -> 'unsigned int' diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/01-wasm-compare-logical-elements.test 
b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/01-wasm-compare-logical-elements.test index f52c9c7cc7164..98fc47e3d3c80 100644 --- a/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/01-wasm-compare-logical-elements.test +++ b/llvm/test/tools/llvm-debuginfo-analyzer/WebAssembly/01-wasm-compare-logical-elements.test @@ -38,8 +38,8 @@ ; ONE-NEXT: [002] 1 {TypeAlias} 'INTPTR' -> '* const int' ; ONE-NEXT: [002] 2 {Function} extern not_inlined 'foo' -> 'int' ; ONE-NEXT: [003] {Block} -; ONE-NEXT: [004] 5 {Variable} 'CONSTANT' -> 'const INTEGER' ; ONE-NEXT: +[004] 4 {TypeAlias} 'INTEGER' -> 'int' +; ONE-NEXT: [004] 5 {Variable} 'CONSTANT' -> 'const INTEGER' ; ONE-NEXT: [003] 2 {Parameter} 'ParamBool' -> 'bool' ; ONE-NEXT: [003] 2 {Parameter} 'ParamPtr' -> 'INTPTR' ; ONE-NEXT: [003] 2 {Parameter} 'ParamUnsigned' -> 'unsigned int' diff --git a/llvm/test/tools/llvm-dwarfdump/verify_stmt_seq.yaml b/llvm/test/tools/llvm-dwarfdump/verify_stmt_seq.yaml new file mode 100644 index 0000000000000..5312c2573902d --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/verify_stmt_seq.yaml @@ -0,0 +1,1656 @@ +# Object file copied from llvm/test/tools/dsymutil/ARM/stmt-seq-macho.test +# Then manually tampered with some of the attribute values +# I hope there are easier ways to construct tests like this. + +# RUN: yaml2obj %s -o verify_stmt_seq.o +# RUN: not llvm-dwarfdump -verify -debug-info verify_stmt_seq.o | FileCheck %s --check-prefix=CHECK_INVALID --implicit-check-not=error: +# RUN: llvm-dwarfdump -debug-line -verbose -debug-info verify_stmt_seq.o | FileCheck %s --check-prefix=CHECK_DEBUG_LINE + +# CHECK_INVALID: error: DW_AT_LLVM_stmt_sequence offset 0x00000000 is not within the line table bounds [0x00000034, 0x000000fd) +# CHECK_INVALID: DW_AT_LLVM_stmt_sequence [DW_FORM_sec_offset] (0x00000000) + +# CHECK_DEBUG_LINE: Address Line Column File ISA Discriminator OpIndex Flags +# CHECK_DEBUG_LINE-NEXT: ------------------ ------ ------ ------ --- ------------- ------- ------------- +# CHECK_DEBUG_LINE-NEXT: 0x00000034: 05 DW_LNS_set_column (10) +# CHECK_DEBUG_LINE-NEXT: 0x00000036: 0a DW_LNS_set_prologue_end +# CHECK_DEBUG_LINE-NEXT: 0x00000037: 00 DW_LNE_set_address (0x0000000000000000) +# CHECK_DEBUG_LINE-NEXT: 0x00000042: 14 address += 0, line += 2, op-index += 0 +# CHECK_DEBUG_LINE-NEXT: 0x0000000000000000 3 10 1 0 0 0 is_stmt prologue_end +# CHECK_DEBUG_LINE-NEXT: 0x00000043: 05 DW_LNS_set_column (3) +# CHECK_DEBUG_LINE-NEXT: 0x00000045: 06 DW_LNS_negate_stmt +# CHECK_DEBUG_LINE-NEXT: 0x00000046: 4a address += 4, line += 0, op-index += 0 +# CHECK_DEBUG_LINE-NEXT: 0x0000000000000004 3 3 1 0 0 0 +# CHECK_DEBUG_LINE-NEXT: 0x00000047: 00 DW_LNE_end_sequence +# CHECK_DEBUG_LINE-NEXT: 0x0000000000000004 3 3 1 0 0 0 end_sequence + +# 0xd3 would be a valid offset if the line table wasn't ill-formed with two rows having the same PC (0x8c).
+# CHECK_INVALID: error: DW_AT_LLVM_stmt_sequence offset 0x000000d3 does not point to a valid sequence offset in the line table +# CHECK_INVALID: DW_AT_LLVM_stmt_sequence [DW_FORM_sec_offset] (0x000000d3) + +# CHECK_DEBUG_LINE: 0x000000d3: 05 DW_LNS_set_column (85) +# CHECK_DEBUG_LINE-NEXT: 0x000000d5: 0a DW_LNS_set_prologue_end +# CHECK_DEBUG_LINE-NEXT: 0x000000d6: 00 DW_LNE_set_address (0x000000000000008c) +# CHECK_DEBUG_LINE-NEXT: 0x000000e1: 03 DW_LNS_advance_line (30) +# CHECK_DEBUG_LINE-NEXT: 0x000000e3: 01 DW_LNS_copy +# CHECK_DEBUG_LINE-NEXT: 0x000000000000008c 30 85 1 0 0 0 is_stmt prologue_end +# CHECK_DEBUG_LINE-NEXT: 0x000000e4: 00 DW_LNE_end_sequence +# CHECK_DEBUG_LINE-NEXT: 0x000000000000008c 30 85 1 0 0 0 is_stmt end_sequence + +# CHECK_INVALID: error: DIE has invalid DW_AT_LLVM_stmt_sequence encoding +# CHECK_INVALID: DW_AT_LLVM_stmt_sequence [DW_FORM_data4] (0x000000a7) +# CHECK_INVALID: error: DW_AT_LLVM_stmt_sequence offset 0x000000ab does not point to a valid sequence offset in the line table +# CHECK_INVALID: DW_AT_LLVM_stmt_sequence [DW_FORM_sec_offset] (0x000000ab) + +# CHECK_INVALID: error: DW_AT_LLVM_stmt_sequence offset is beyond .debug_line bounds: 0x00eeeee7 +# CHECK_INVALID: DW_AT_LLVM_stmt_sequence [DW_FORM_sec_offset] (0x00eeeee7) + +# CHECK_DEBUG_LINE: 0x000000f8: 02 DW_LNS_advance_pc (addr += 4, op-index += 0) +# CHECK_DEBUG_LINE-NEXT: 0x000000fa: 00 DW_LNE_end_sequence +# CHECK_DEBUG_LINE-NEXT: 0x0000000000000094 30 86 1 0 0 0 is_stmt end_sequence + +# CHECK_INVALID: error: Aggregated error counts: +# CHECK_INVALID-NEXT: error: DW_AT_LLVM_stmt_sequence offset out of bounds occurred 1 time(s). +# CHECK_INVALID-NEXT: error: DW_AT_LLVM_stmt_sequence offset out of line table bounds occurred 1 time(s). +# CHECK_INVALID-NEXT: error: Invalid DW_AT_LLVM_stmt_sequence encoding occurred 1 time(s). +# CHECK_INVALID-NEXT: error: Invalid DW_AT_LLVM_stmt_sequence offset occurred 2 time(s). 
+ +--- !mach-o +IsLittleEndian: true +FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x1 + ncmds: 5 + sizeofcmds: 1176 + flags: 0x2000 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 1032 + segname: '' + vmaddr: 0 + vmsize: 3125 + fileoff: 1208 + filesize: 3125 + maxprot: 7 + initprot: 7 + nsects: 12 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x0 + size: 148 + offset: 0x4B8 + align: 2 + reloff: 0x10F0 + nreloc: 8 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 00040011C0035FD600100011C0035FD600580051C0035FD600100011C0035FD600580051C0035FD6FFC300D1F44F01A9FD7B02A9FD8300916000805200000094F30300AA20058052000000941400130B6001805200000094F30300AA40058052000000947302000B0100009021000091E03F0091000000948002130BFD7B42A9F44F41A9FFC30091C0035FD600000014C0035FD6 + relocations: + - address: 0x8C + symbolnum: 4 + pcrel: true + length: 2 + extern: true + type: 2 + scattered: false + value: 0 + - address: 0x74 + symbolnum: 3 + pcrel: true + length: 2 + extern: true + type: 2 + scattered: false + value: 0 + - address: 0x6C + symbolnum: 1 + pcrel: false + length: 2 + extern: true + type: 4 + scattered: false + value: 0 + - address: 0x68 + symbolnum: 1 + pcrel: true + length: 2 + extern: true + type: 3 + scattered: false + value: 0 + - address: 0x60 + symbolnum: 5 + pcrel: true + length: 2 + extern: true + type: 2 + scattered: false + value: 0 + - address: 0x54 + symbolnum: 6 + pcrel: true + length: 2 + extern: true + type: 2 + scattered: false + value: 0 + - address: 0x48 + symbolnum: 9 + pcrel: true + length: 2 + extern: true + type: 2 + scattered: false + value: 0 + - address: 0x3C + symbolnum: 7 + pcrel: true + length: 2 + extern: true + type: 2 + scattered: false + value: 0 + - sectname: __cstring + segname: __TEXT + addr: 0x94 + size: 5 + offset: 0x54C + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: '7465737400' + - sectname: __debug_loc + segname: __DWARF + addr: 0x99 + size: 412 + offset: 0x551 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 08000000000000000C000000000000000100500C0000000000000010000000000000000400A301509F0000000000000000000000000000000008000000000000000C00000000000000030070039F0000000000000000000000000000000010000000000000001400000000000000010050140000000000000018000000000000000400A301509F0000000000000000000000000000000018000000000000001C000000000000000100501C0000000000000020000000000000000400A301509F0000000000000000000000000000000018000000000000001C00000000000000030070039F0000000000000000000000000000000020000000000000002400000000000000010050240000000000000028000000000000000400A301509F00000000000000000000000000000000240000000000000028000000000000000100500000000000000000000000000000000038000000000000004400000000000000030011009F4400000000000000500000000000000001006350000000000000005C0000000000000001006400000000000000000000000000000000 + - sectname: __debug_abbrev + segname: __DWARF + addr: 0x235 + size: 372 + offset: 0x6ED + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + - sectname: __debug_info + segname: __DWARF + addr: 0x3A9 + size: 747 + offset: 0x861 + align: 0 + reloff: 0x1130 + nreloc: 16 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + relocations: + - address: 0x2A7 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: 
false + value: 0 + - address: 0x28E + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x253 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x1F5 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x1E1 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x1CE + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x1BA + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x1A7 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x169 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x12D + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0xF1 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0xC4 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x88 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x5F + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x37 + symbolnum: 2 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x22 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - sectname: __debug_str + segname: __DWARF + addr: 0x694 + size: 400 + offset: 0xB4C + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + - sectname: __apple_names + segname: __DWARF + addr: 0x824 + size: 288 + offset: 0xCDC + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 485341480100000009000000090000000C00000000000000010000000100060000000000FFFFFFFFFFFFFFFF0100000003000000040000000600000007000000080000004A08311CC78E3C8288CB36CF89CB36CFD1125E53522B705390D9F86F6A7F9A7C4908311C8C0000009C000000AC000000BC000000CC000000DC000000EC00000000010000100100000601000001000000F000000000000000D6000000010000005E00000000000000F600000001000000C30000000000000016010000010000002C01000000000000440100000100000052020000000000005C01000001000000A6020000000000002B0100000200000052020000A60200000000000026010000010000006801000000000000E6000000010000008700000000000000 + - sectname: __apple_objc + segname: __DWARF + addr: 0x944 + size: 36 + offset: 0xDFC + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 485341480100000001000000000000000C000000000000000100000001000600FFFFFFFF + - sectname: __apple_namespac + segname: __DWARF + addr: 0x968 + size: 36 + offset: 0xE20 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 485341480100000001000000000000000C000000000000000100000001000600FFFFFFFF + - sectname: __apple_types + segname: __DWARF + addr: 0x98C + size: 195 + offset: 0xE44 + align: 0 + reloff: 0x0 + nreloc: 0 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 
48534148010000000500000005000000140000000000000003000000010006000300050004000B000000000002000000FFFFFFFF03000000040000007CA8F05D90D9F86F5B738CDC3080880B6320957C64000000770000008A0000009D000000B0000000380100000100000027020000130000000000002B010000010000000502000013000000000000C20000000100000057000000240000000000007401000001000000DE02000024000000000000BD000000010000005000000024000000000000 + - sectname: __debug_frame + segname: __DWARF + addr: 0xA50 + size: 232 + offset: 0xF08 + align: 3 + reloff: 0x11B0 + nreloc: 8 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 14000000FFFFFFFF0400080001781E0C1F00000000000000140000000000000000000000000000000800000000000000140000000000000008000000000000000800000000000000140000000000000010000000000000000800000000000000140000000000000018000000000000000800000000000000140000000000000020000000000000000800000000000000240000000000000028000000000000006400000000000000500C1D109E019D02930394040000000014000000000000008C000000000000000400000000000000140000000000000090000000000000000400000000000000 + relocations: + - address: 0xD8 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0xC0 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x98 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x80 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x68 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x50 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x38 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x20 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - sectname: __debug_line + segname: __DWARF + addr: 0xB38 + size: 253 + offset: 0xFF0 + align: 0 + reloff: 0x11F0 + nreloc: 8 + flags: 0x2000000 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + relocations: + - address: 0xED + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0xD9 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0xAA + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x96 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x7E + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x66 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x50 + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - address: 0x3A + symbolnum: 1 + pcrel: false + length: 3 + extern: false + type: 0 + scattered: false + value: 0 + - cmd: LC_BUILD_VERSION + cmdsize: 24 + platform: 1 + minos: 720896 + sdk: 0 + ntools: 0 + - cmd: LC_LINKER_OPTIMIZATION_HINT + cmdsize: 16 + dataoff: 4656 + datasize: 8 + - cmd: LC_SYMTAB + cmdsize: 24 + symoff: 4664 + nsyms: 11 + stroff: 4840 + strsize: 168 + - cmd: LC_DYSYMTAB + cmdsize: 80 + ilocalsym: 0 + nlocalsym: 3 + iextdefsym: 3 + nextdefsym: 8 + iundefsym: 11 + nundefsym: 0 + tocoff: 0 + 
ntoc: 0 + modtaboff: 0 + nmodtab: 0 + extrefsymoff: 0 + nextrefsyms: 0 + indirectsymoff: 0 + nindirectsyms: 0 + extreloff: 0 + nextrel: 0 + locreloff: 0 + nlocrel: 0 +LinkEditData: + NameList: + - n_strx: 155 + n_type: 0xE + n_sect: 1 + n_desc: 0 + n_value: 0 + - n_strx: 1 + n_type: 0xE + n_sect: 2 + n_desc: 0 + n_value: 148 + - n_strx: 149 + n_type: 0xE + n_sect: 2 + n_desc: 0 + n_value: 148 + - n_strx: 39 + n_type: 0xF + n_sect: 1 + n_desc: 192 + n_value: 140 + - n_strx: 14 + n_type: 0xF + n_sect: 1 + n_desc: 192 + n_value: 144 + - n_strx: 132 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 0 + - n_strx: 115 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 16 + - n_strx: 81 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 32 + - n_strx: 98 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 8 + - n_strx: 64 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 24 + - n_strx: 8 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 40 + StringTable: + - '' + - l_.str + - _main + - __ZN12length_errorC2EPKc + - __ZN12length_errorC1EPKc + - _function3_copy2 + - _function2_copy2 + - _function3_copy1 + - _function2_copy1 + - _function1_copy1 + - ltmp1 + - ltmp0 + - '' + - '' + - '' + - '' + - '' + - '' + - '' +DWARF: + debug_str: + - 'Facebook clang version 19.1.5 (https://git.internal.tfbnw.net/repos/git/rw/osmeta/external/llvm-project b36c9ae1f8f2b39e4aafb9ca4700c608c3036365)' + - stmt_seq_macho.cpp + - '/' + - '/private/tmp/stmt_seq' + - char + - __ARRAY_SIZE_TYPE__ + - function1_copy1 + - function3_copy1 + - function2_copy1 + - function3_copy2 + - function2_copy2 + - main + - length_error + - logic_error + - _ZN12length_errorC1EPKc + - _ZN12length_errorC2EPKc + - int + - a + - b + - result + - e + - sum + - this + - s + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_producer + Form: DW_FORM_strp + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_LLVM_sysroot + Form: DW_FORM_strp + - Attribute: DW_AT_stmt_list + Form: DW_FORM_sec_offset + - Attribute: DW_AT_comp_dir + Form: DW_FORM_strp + - Attribute: DW_AT_APPLE_optimized + Form: DW_FORM_flag_present + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Code: 0x2 + Tag: DW_TAG_variable + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_location + Form: DW_FORM_exprloc + - Code: 0x3 + Tag: DW_TAG_array_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x4 + Tag: DW_TAG_subrange_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_count + Form: DW_FORM_data1 + - Code: 0x5 + Tag: DW_TAG_const_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x6 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Code: 0x7 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Code: 0x8 + Tag: DW_TAG_subprogram + 
Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_APPLE_omit_frame_ptr + Form: DW_FORM_flag_present + - Attribute: DW_AT_LLVM_stmt_sequence + Form: DW_FORM_sec_offset + - Attribute: DW_AT_frame_base + Form: DW_FORM_exprloc + - Attribute: DW_AT_call_all_calls + Form: DW_FORM_flag_present + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_APPLE_optimized + Form: DW_FORM_flag_present + - Code: 0x9 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0xA + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_sec_offset + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0xB + Tag: DW_TAG_variable + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_sec_offset + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0xC + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_LLVM_stmt_sequence + Form: DW_FORM_data4 + - Attribute: DW_AT_frame_base + Form: DW_FORM_exprloc + - Attribute: DW_AT_call_all_calls + Form: DW_FORM_flag_present + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_APPLE_optimized + Form: DW_FORM_flag_present + - Code: 0xD + Tag: DW_TAG_variable + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_exprloc + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0xE + Tag: DW_TAG_call_site + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_call_origin + Form: DW_FORM_ref4 + - Attribute: DW_AT_call_return_pc + Form: DW_FORM_addr + - Code: 0xF + Tag: DW_TAG_call_site_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_exprloc + - Attribute: DW_AT_call_value + Form: DW_FORM_exprloc + - Code: 0x10 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_calling_convention + Form: DW_FORM_data1 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Code: 0x11 + Tag: DW_TAG_inheritance + Children: DW_CHILDREN_no + Attributes: + - Attribute: 
DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_data_member_location + Form: DW_FORM_data1 + - Code: 0x12 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_APPLE_optimized + Form: DW_FORM_flag_present + - Attribute: DW_AT_explicit + Form: DW_FORM_flag_present + - Code: 0x13 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_artificial + Form: DW_FORM_flag_present + - Code: 0x14 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x15 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Attribute: DW_AT_APPLE_optimized + Form: DW_FORM_flag_present + - Code: 0x16 + Tag: DW_TAG_pointer_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x17 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_low_pc + Form: DW_FORM_addr + - Attribute: DW_AT_high_pc + Form: DW_FORM_data4 + - Attribute: DW_AT_APPLE_omit_frame_ptr + Form: DW_FORM_flag_present + - Attribute: DW_AT_LLVM_stmt_sequence + Form: DW_FORM_sec_offset + - Attribute: DW_AT_frame_base + Form: DW_FORM_exprloc + - Attribute: DW_AT_object_pointer + Form: DW_FORM_ref4 + - Attribute: DW_AT_call_all_calls + Form: DW_FORM_flag_present + - Attribute: DW_AT_linkage_name + Form: DW_FORM_strp + - Attribute: DW_AT_specification + Form: DW_FORM_ref4 + - Code: 0x18 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_exprloc + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_artificial + Form: DW_FORM_flag_present + - Code: 0x19 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_location + Form: DW_FORM_exprloc + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_decl_file + Form: DW_FORM_data1 + - Attribute: DW_AT_decl_line + Form: DW_FORM_data1 + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x1A + Tag: DW_TAG_call_site + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_call_origin + Form: DW_FORM_ref4 + - Attribute: DW_AT_call_tail_call + Form: DW_FORM_flag_present + - Attribute: DW_AT_call_pc + Form: DW_FORM_addr + debug_info: + - Length: 0x2E7 + Version: 4 + AbbrevTableID: 0 + AbbrOffset: 0x0 + AddrSize: 8 + Entries: + - AbbrCode: 0x1 + Values: + - Value: 0x0 + - Value: 0x21 + - Value: 0x92 + - Value: 0xA5 + - Value: 0x0 + - Value: 0xA7 + - Value: 0x1 + - Value: 0x0 + - Value: 0x94 + - AbbrCode: 0x2 + Values: + - Value: 0x3F + - Value: 0x1 + - Value: 0x27 + - Value: 0x9 + BlockData: [ 0x3, 0x94, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0 ] + - AbbrCode: 0x3 + Values: + - Value: 0x4B + - AbbrCode: 0x4 + Values: + - Value: 0x57 + - Value: 0x5 + - AbbrCode: 0x0 + - AbbrCode: 0x5 + Values: + - Value: 0x50 + - 
AbbrCode: 0x6 + Values: + - Value: 0xBD + - Value: 0x6 + - Value: 0x1 + - AbbrCode: 0x7 + Values: + - Value: 0xC2 + - Value: 0x8 + - Value: 0x7 + - AbbrCode: 0x8 + Values: + - Value: 0x0 + - Value: 0x8 + - Value: 0x1 + - BlockData: [ 0x6F ] + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0x1 + - Value: 0xD6 + - Value: 0x1 + - Value: 0x2 + - Value: 0x2DE + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0x9 + Values: + - Value: 0x178 + - Value: 0x1 + - Value: 0x2 + - Value: 0x2DE + - AbbrCode: 0x0 + - AbbrCode: 0x8 + Values: + - Value: 0x8 + - Value: 0x8 + - Value: 0x1 + - Value: 0x4A + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0x1 + - Value: 0xE6 + - Value: 0x1 + - Value: 0x6 + - Value: 0x2DE + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0xA + Values: + - Value: 0x0 + - Value: 0x178 + - Value: 0x1 + - Value: 0x6 + - Value: 0x2DE + - AbbrCode: 0xB + Values: + - Value: 0x39 + - Value: 0x17A + - Value: 0x1 + - Value: 0x7 + - Value: 0x2DE + - AbbrCode: 0x0 + - AbbrCode: 0x8 + Values: + - Value: 0x10 + - Value: 0x8 + - Value: 0x1 + - Value: 0x60 + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0x1 + - Value: 0xF6 + - Value: 0x1 + - Value: 0xB + - Value: 0x2DE + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0xA + Values: + - Value: 0x5E + - Value: 0x178 + - Value: 0x1 + - Value: 0xB + - Value: 0x2DE + - AbbrCode: 0x0 + - AbbrCode: 0x8 + Values: + - Value: 0x18 + - Value: 0x8 + - Value: 0x1 + - Value: 0xD3 + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0x1 + - Value: 0x106 + - Value: 0x1 + - Value: 0xF + - Value: 0x2DE + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0xA + Values: + - Value: 0x97 + - Value: 0x178 + - Value: 0x1 + - Value: 0xF + - Value: 0x2DE + - AbbrCode: 0xB + Values: + - Value: 0xD0 + - Value: 0x17A + - Value: 0x1 + - Value: 0x10 + - Value: 0x2DE + - AbbrCode: 0x0 + - AbbrCode: 0x8 + Values: + - Value: 0x20 + - Value: 0x8 + - Value: 0x1 + - Value: 0xE7 + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0x1 + - Value: 0x116 + - Value: 0x1 + - Value: 0x14 + - Value: 0x2DE + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0xA + Values: + - Value: 0xF5 + - Value: 0x178 + - Value: 0x1 + - Value: 0x14 + - Value: 0x2DE + - AbbrCode: 0xB + Values: + - Value: 0x12E + - Value: 0x17C + - Value: 0x1 + - Value: 0x15 + - Value: 0x2DE + - AbbrCode: 0x0 + - AbbrCode: 0xC + Values: + - Value: 0x28 + - Value: 0x64 + - Value: 0xA7 + - Value: 0x1 + BlockData: [ 0x6D ] + - Value: 0x1 + - Value: 0x126 + - Value: 0x1 + - Value: 0x21 + - Value: 0x2DE + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0xD + Values: + - Value: 0x2 + BlockData: [ 0x8F, 0xF ] + - Value: 0x183 + - Value: 0x1 + - Value: 0x27 + - Value: 0x205 + - AbbrCode: 0xB + Values: + - Value: 0x151 + - Value: 0x185 + - Value: 0x1 + - Value: 0x22 + - Value: 0x2DE + - AbbrCode: 0xE + Values: + - Value: 0x12C + - Value: 0x40 + - AbbrCode: 0xF + Values: + - Value: 0x1 + BlockData: [ 0x50 ] + - Value: 0x1 + BlockData: [ 0x33 ] + - AbbrCode: 0x0 + - AbbrCode: 0xE + Values: + - Value: 0xF0 + - Value: 0x4C + - AbbrCode: 0xF + Values: + - Value: 0x1 + BlockData: [ 0x50 ] + - Value: 0x2 + BlockData: [ 0x10, 0x29 ] + - AbbrCode: 0x0 + - AbbrCode: 0xE + Values: + - Value: 0xC3 + - Value: 0x58 + - AbbrCode: 0xF + Values: + - Value: 0x1 + BlockData: [ 0x50 ] + - Value: 0x1 + BlockData: [ 0x3B ] + - AbbrCode: 0x0 + - AbbrCode: 0xE + Values: + - Value: 0x5E + - Value: 0x64 + - AbbrCode: 0xF + Values: + - Value: 0x1 + BlockData: [ 0x50 ] + - Value: 0x2 + BlockData: [ 0x10, 0x2A ] + - AbbrCode: 0x0 + - AbbrCode: 0xE + Values: + - Value: 0x252 + - Value: 0x78 + - AbbrCode: 0xF + 
Values: + - Value: 0x1 + BlockData: [ 0x50 ] + - Value: 0x2 + BlockData: [ 0x8F, 0xF ] + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x10 + Values: + - Value: 0x5 + - Value: 0x12B + - Value: 0x1 + - Value: 0x1 + - Value: 0x1D + - AbbrCode: 0x11 + Values: + - Value: 0x227 + - Value: 0x0 + - AbbrCode: 0x12 + Values: + - Value: 0x12B + - Value: 0x1 + - Value: 0x1E + - Value: 0x1 + - Value: 0x1 + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0x13 + Values: + - Value: 0x24D + - Value: 0x1 + - AbbrCode: 0x14 + Values: + - Value: 0x248 + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x10 + Values: + - Value: 0x5 + - Value: 0x138 + - Value: 0x1 + - Value: 0x1 + - Value: 0x19 + - AbbrCode: 0x15 + Values: + - Value: 0x138 + - Value: 0x1 + - Value: 0x1A + - Value: 0x1 + - Value: 0x1 + - Value: 0x1 + - AbbrCode: 0x13 + Values: + - Value: 0x243 + - Value: 0x1 + - AbbrCode: 0x14 + Values: + - Value: 0x248 + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x16 + Values: + - Value: 0x227 + - AbbrCode: 0x16 + Values: + - Value: 0x4B + - AbbrCode: 0x16 + Values: + - Value: 0x205 + - AbbrCode: 0x17 + Values: + - Value: 0x8C + - Value: 0x4 + - Value: 0x1 + - Value: 0xAB + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0x271 + - Value: 0x1 + - Value: 0x144 + - Value: 0x214 + - AbbrCode: 0x18 + Values: + - Value: 0x1 + BlockData: [ 0x50 ] + - Value: 0x189 + - Value: 0x2E5 + - Value: 0x1 + - AbbrCode: 0x19 + Values: + - Value: 0x1 + BlockData: [ 0x51 ] + - Value: 0x18E + - Value: 0x1 + - Value: 0x1E + - Value: 0x248 + - AbbrCode: 0x1A + Values: + - Value: 0x2A6 + - Value: 0x1 + - Value: 0x8C + - AbbrCode: 0xF + Values: + - Value: 0x1 + BlockData: [ 0x50 ] + - Value: 0x3 + BlockData: [ 0xA3, 0x1, 0x50 ] + - AbbrCode: 0xF + Values: + - Value: 0x1 + BlockData: [ 0x51 ] + - Value: 0x3 + BlockData: [ 0xA3, 0x1, 0x51 ] + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x17 + Values: + - Value: 0x90 + - Value: 0x4 + - Value: 0x1 + - Value: 0xEEEEE7 + - Value: 0x1 + BlockData: [ 0x6F ] + - Value: 0x2C5 + - Value: 0x1 + - Value: 0x15C + - Value: 0x214 + - AbbrCode: 0x18 + Values: + - Value: 0x1 + BlockData: [ 0x50 ] + - Value: 0x189 + - Value: 0x2E5 + - Value: 0x1 + - AbbrCode: 0x19 + Values: + - Value: 0x1 + BlockData: [ 0x51 ] + - Value: 0x18E + - Value: 0x1 + - Value: 0x1E + - Value: 0x248 + - AbbrCode: 0x0 + - AbbrCode: 0x6 + Values: + - Value: 0x174 + - Value: 0x5 + - Value: 0x4 + - AbbrCode: 0x16 + Values: + - Value: 0x205 + - AbbrCode: 0x0 + debug_line: + - Length: 249 + Version: 4 + PrologueLength: 42 + MinInstLength: 1 + MaxOpsPerInst: 1 + DefaultIsStmt: 1 + LineBase: 251 + LineRange: 14 + OpcodeBase: 13 + StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ] + Files: + - Name: stmt_seq_macho.cpp + DirIdx: 0 + ModTime: 0 + Length: 0 + Opcodes: + - Opcode: DW_LNS_set_column + Data: 10 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 0 + - Opcode: 0x14 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 3 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0x4A + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 + - Opcode: DW_LNS_set_column + Data: 14 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 8 + - Opcode: 0x19 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 5 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0x4A + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: 
DW_LNE_end_sequence + Data: 0 + - Opcode: DW_LNS_set_column + Data: 14 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 16 + - Opcode: DW_LNS_advance_line + SData: 11 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_set_column + Data: 5 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0x4A + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 + - Opcode: DW_LNS_set_column + Data: 14 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 24 + - Opcode: DW_LNS_advance_line + SData: 16 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_set_column + Data: 5 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0x4A + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 + - Opcode: DW_LNS_set_column + Data: 20 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 32 + - Opcode: DW_LNS_advance_line + SData: 20 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_set_column + Data: 5 + - Opcode: 0x4B + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 40 + - Opcode: DW_LNS_advance_line + SData: 32 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_set_column + Data: 12 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: 0xF4 + Data: 0 + - Opcode: 0xBB + Data: 0 + - Opcode: DW_LNS_set_column + Data: 9 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0x82 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 12 + - Opcode: DW_LNS_negate_stmt + Data: 0 + - Opcode: 0x4B + Data: 0 + - Opcode: 0xBB + Data: 0 + - Opcode: DW_LNS_set_column + Data: 9 + - Opcode: 0x81 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 18 + - Opcode: 0x4C + Data: 0 + - Opcode: DW_LNS_set_column + Data: 9 + - Opcode: 0xF1 + Data: 0 + - Opcode: DW_LNS_set_column + Data: 5 + - Opcode: DW_LNS_set_epilogue_begin + Data: 0 + - Opcode: 0x4C + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 + - Opcode: DW_LNS_set_column + Data: 85 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 140 + - Opcode: DW_LNS_advance_line + SData: 29 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 + - Opcode: DW_LNS_set_column + Data: 86 + - Opcode: DW_LNS_set_prologue_end + Data: 0 + - Opcode: DW_LNS_extended_op + ExtLen: 9 + SubOpcode: DW_LNE_set_address + Data: 144 + - Opcode: DW_LNS_advance_line + SData: 29 + Data: 0 + - Opcode: DW_LNS_copy + Data: 0 + - Opcode: DW_LNS_advance_pc + Data: 4 + - Opcode: DW_LNS_extended_op + ExtLen: 1 + SubOpcode: DW_LNE_end_sequence + Data: 0 +... 
diff --git a/llvm/test/tools/llvm-lib/sym64-threshold.test b/llvm/test/tools/llvm-lib/sym64-threshold.test new file mode 100644 index 0000000000000..76f0a030274ef --- /dev/null +++ b/llvm/test/tools/llvm-lib/sym64-threshold.test @@ -0,0 +1,71 @@ +# RUN: yaml2obj --docnum=1 %s -o %t01234567890234567789.obj +# RUN: yaml2obj --docnum=2 %s -o %t-ec.obj +# RUN: env SYM64_THRESHOLD=100 llvm-lib -machine:amd64 -out:%t.lib %t01234567890234567789.obj +# RUN: llvm-nm --print-armap %t.lib | FileCheck --check-prefix=ARMAP %s +# ARMAP: Archive map +# ARMAP-NEXT: sym + +# RUN: env SYM64_THRESHOLD=100 not llvm-lib -machine:arm64x -out:%t-ec.lib %t-ec.obj %t01234567890234567789.obj 2>&1 | FileCheck %s +# CHECK: Archive is too large: ARM64X does not support archives larger than 4GB + +--- !COFF +header: + Machine: IMAGE_FILE_MACHINE_AMD64 + Characteristics: [ ] +sections: + - Name: .text + Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ] + Alignment: 4 + SectionData: '' +symbols: + - Name: .text + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + SectionDefinition: + Length: 0 + NumberOfRelocations: 0 + NumberOfLinenumbers: 0 + CheckSum: 0 + Number: 1 + - !Symbol + Name: sym + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL # (0) + ComplexType: IMAGE_SYM_DTYPE_FUNCTION # (2) + StorageClass: IMAGE_SYM_CLASS_EXTERNAL # (2) +... + +--- !COFF +header: + Machine: IMAGE_FILE_MACHINE_ARM64 + Characteristics: [ ] +sections: + - Name: .text + Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ] + Alignment: 4 + SectionData: '' +symbols: + - Name: .text + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + SectionDefinition: + Length: 0 + NumberOfRelocations: 0 + NumberOfLinenumbers: 0 + CheckSum: 0 + Number: 1 + - !Symbol + Name: sym + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL # (0) + ComplexType: IMAGE_SYM_DTYPE_FUNCTION # (2) + StorageClass: IMAGE_SYM_CLASS_EXTERNAL # (2) +... 
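The RUN lines above rely on the SYM64_THRESHOLD environment variable to force the 64-bit symbol-table path with tiny inputs, and expect the ARM64X (EC) archive to be rejected once that path is taken. A rough sketch of how such an override is typically consumed follows; the names and the exact decision point are assumptions for illustration, not llvm-lib internals.

// Minimal sketch of an environment-variable override for the 64-bit
// symbol-table threshold; names are illustrative, not LLVM's code.
#include <cstdint>
#include <cstdlib>
#include <string>

constexpr uint64_t DefaultSym64Threshold = 1ULL << 32; // 4 GiB

uint64_t getSym64Threshold() {
  if (const char *Env = std::getenv("SYM64_THRESHOLD"))
    return std::stoull(Env); // the test passes SYM64_THRESHOLD=100
  return DefaultSym64Threshold;
}

// An archive writer would switch to the 64-bit symbol table once the
// projected archive size crosses the threshold; a format that cannot
// represent that (the ARM64X case above) has to error out instead.
bool needsSym64(uint64_t ProjectedArchiveSize) {
  return ProjectedArchiveSize >= getSym64Threshold();
}

Setting the threshold to 100 bytes lets the test reach the 64-bit code path without actually building a multi-gigabyte archive.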
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-sve-instructions.s index 911ad1900195c..fe3742c9e4d3b 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-sve-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-sve-instructions.s @@ -2649,7 +2649,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 5 0.50 2 V1UnitV,V1UnitV01 BFMLALT_ZZZI bfmlalt z0.s, z1.h, z2.h[7] # CHECK-NEXT: 1 5 0.50 2 V1UnitV,V1UnitV01 BFMLALT_ZZZI bfmlalt z0.s, z1.h, z7.h[7] # CHECK-NEXT: 1 5 0.50 2 V1UnitV,V1UnitV01 BFMLALT_ZZZ bfmlalt z14.s, z10.h, z21.h -# CHECK-NEXT: 1 5 0.50 3 V1UnitV,V1UnitV01 BFMMLA_ZZZ bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: 1 5 0.50 3 V1UnitV,V1UnitV01 BFMMLA_ZZZ_HtoS bfmmla z0.s, z1.h, z2.h # CHECK-NEXT: 1 1 1.00 1 V1UnitI,V1UnitM,V1UnitM0 BIC_PPzPP bic p0.b, p0/z, p0.b, p0.b # CHECK-NEXT: 1 1 1.00 1 V1UnitI,V1UnitM,V1UnitM0 BIC_PPzPP bic p15.b, p15/z, p15.b, p15.b # CHECK-NEXT: 1 2 0.50 2 V1UnitV,V1UnitV01 BIC_ZZZ bic z0.d, z0.d, z0.d @@ -4228,10 +4228,10 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 12 7.00 12 V1UnitV[7],V1UnitV0[7],V1UnitV01[7],V1UnitV02[7] SDIV_ZPmZ_S sdiv z0.s, p7/m, z0.s, z31.s # CHECK-NEXT: 1 20 7.00 20 V1UnitV[7],V1UnitV0[7],V1UnitV01[7],V1UnitV02[7] SDIVR_ZPmZ_D sdivr z0.d, p7/m, z0.d, z31.d # CHECK-NEXT: 1 12 7.00 12 V1UnitV[7],V1UnitV0[7],V1UnitV01[7],V1UnitV02[7] SDIVR_ZPmZ_S sdivr z0.s, p7/m, z0.s, z31.s -# CHECK-NEXT: 1 4 1.00 1 V1UnitV,V1UnitV0,V1UnitV01,V1UnitV02 SDOT_ZZZI_D sdot z0.d, z1.h, z15.h[1] -# CHECK-NEXT: 1 4 1.00 1 V1UnitV,V1UnitV0,V1UnitV01,V1UnitV02 SDOT_ZZZ_D sdot z0.d, z1.h, z31.h -# CHECK-NEXT: 1 3 0.50 1 V1UnitV,V1UnitV01 SDOT_ZZZ_S sdot z0.s, z1.b, z31.b -# CHECK-NEXT: 1 3 0.50 1 V1UnitV,V1UnitV01 SDOT_ZZZI_S sdot z0.s, z1.b, z7.b[3] +# CHECK-NEXT: 1 4 1.00 1 V1UnitV,V1UnitV0,V1UnitV01,V1UnitV02 SDOT_ZZZI_HtoD sdot z0.d, z1.h, z15.h[1] +# CHECK-NEXT: 1 4 1.00 1 V1UnitV,V1UnitV0,V1UnitV01,V1UnitV02 SDOT_ZZZ_HtoD sdot z0.d, z1.h, z31.h +# CHECK-NEXT: 1 3 0.50 1 V1UnitV,V1UnitV01 SDOT_ZZZ_BtoS sdot z0.s, z1.b, z31.b +# CHECK-NEXT: 1 3 0.50 1 V1UnitV,V1UnitV01 SDOT_ZZZI_BtoS sdot z0.s, z1.b, z7.b[3] # CHECK-NEXT: 1 2 0.50 2 V1UnitV,V1UnitV01 SEL_ZPZZ_B sel z23.b, p11, z13.b, z8.b # CHECK-NEXT: 1 2 0.50 2 V1UnitV,V1UnitV01 SEL_ZPZZ_D sel z23.d, p11, z13.d, z8.d # CHECK-NEXT: 1 2 0.50 2 V1UnitV,V1UnitV01 SEL_ZPZZ_H sel z23.h, p11, z13.h, z8.h @@ -4708,11 +4708,11 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 12 7.00 12 V1UnitV[7],V1UnitV0[7],V1UnitV01[7],V1UnitV02[7] UDIV_ZPmZ_S udiv z0.s, p7/m, z0.s, z31.s # CHECK-NEXT: 1 20 7.00 20 V1UnitV[7],V1UnitV0[7],V1UnitV01[7],V1UnitV02[7] UDIVR_ZPmZ_D udivr z0.d, p7/m, z0.d, z31.d # CHECK-NEXT: 1 12 7.00 12 V1UnitV[7],V1UnitV0[7],V1UnitV01[7],V1UnitV02[7] UDIVR_ZPmZ_S udivr z0.s, p7/m, z0.s, z31.s -# CHECK-NEXT: 1 4 1.00 1 V1UnitV,V1UnitV0,V1UnitV01,V1UnitV02 UDOT_ZZZI_D udot z0.d, z1.h, z15.h[1] -# CHECK-NEXT: 1 4 1.00 1 V1UnitV,V1UnitV0,V1UnitV01,V1UnitV02 UDOT_ZZZ_D udot z0.d, z1.h, z31.h +# CHECK-NEXT: 1 4 1.00 1 V1UnitV,V1UnitV0,V1UnitV01,V1UnitV02 UDOT_ZZZI_HtoD udot z0.d, z1.h, z15.h[1] +# CHECK-NEXT: 1 4 1.00 1 V1UnitV,V1UnitV0,V1UnitV01,V1UnitV02 UDOT_ZZZ_HtoD udot z0.d, z1.h, z31.h # CHECK-NEXT: 1 3 1.00 3 V1UnitV,V1UnitV0,V1UnitV01,V1UnitV02 UCVTF_ZPmZ_StoD ucvtf z24.d, p5/m, z9.s -# CHECK-NEXT: 1 3 0.50 1 V1UnitV,V1UnitV01 UDOT_ZZZ_S udot z0.s, z1.b, z31.b -# CHECK-NEXT: 1 3 0.50 1 V1UnitV,V1UnitV01 UDOT_ZZZI_S udot z0.s, z1.b, z7.b[3] +# CHECK-NEXT: 1 3 0.50 1 V1UnitV,V1UnitV01 UDOT_ZZZ_BtoS udot z0.s, z1.b, 
z31.b +# CHECK-NEXT: 1 3 0.50 1 V1UnitV,V1UnitV01 UDOT_ZZZI_BtoS udot z0.s, z1.b, z7.b[3] # CHECK-NEXT: 1 2 0.50 2 V1UnitV,V1UnitV01 UMAX_ZI_B umax z0.b, z0.b, #0 # CHECK-NEXT: 1 2 0.50 2 V1UnitV,V1UnitV01 UMAX_ZPmZ_B umax z31.b, p7/m, z31.b, z31.b # CHECK-NEXT: 1 2 0.50 2 V1UnitV,V1UnitV01 UMAX_ZI_B umax z31.b, z31.b, #255 diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/mask.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/mask.s new file mode 100644 index 0000000000000..486b535382f87 --- /dev/null +++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/mask.s @@ -0,0 +1,125 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x280 -iterations=1 -instruction-tables=full < %s | FileCheck %s + +vsetvli zero, zero, e32, m1, ta, ma + +vmslt.vv v0, v4, v20 +vmsle.vv v8, v4, v20 +vmsgt.vv v8, v20, v4 +vmsge.vv v8, v20, v4 +vmseq.vv v8, v4, v20 +vmsne.vv v8, v4, v20 +vmsltu.vv v8, v4, v20 +vmsleu.vv v8, v4, v20 +vmsgtu.vv v8, v20, v4 +vmsgeu.vv v8, v20, v4 + +vmflt.vv v0, v4, v20 +vmfle.vv v8, v4, v20 +vmfgt.vv v8, v20, v4 +vmfge.vv v8, v20, v4 +vmfeq.vv v8, v4, v20 +vmfne.vv v8, v4, v20 + +vmadc.vv v8, v4, v20 +vmsbc.vv v8, v4, v20 + +vfirst.m a2, v4 +vpopc.m a2, v4 + +viota.m v8, v4 + +vmsbf.m v8, v4 +vmsif.m v8, v4 +vmsof.m v8, v4 + +# CHECK: Resources: +# CHECK-NEXT: [0] - VLEN512SiFive7FDiv:1 +# CHECK-NEXT: [1] - VLEN512SiFive7IDiv:1 +# CHECK-NEXT: [2] - VLEN512SiFive7PipeA:1 +# CHECK-NEXT: [3] - VLEN512SiFive7PipeAB:2 VLEN512SiFive7PipeA, VLEN512SiFive7PipeB +# CHECK-NEXT: [4] - VLEN512SiFive7PipeB:1 +# CHECK-NEXT: [5] - VLEN512SiFive7VA:1 +# CHECK-NEXT: [6] - VLEN512SiFive7VCQ:1 +# CHECK-NEXT: [7] - VLEN512SiFive7VL:1 +# CHECK-NEXT: [8] - VLEN512SiFive7VS:1 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) +# CHECK-NEXT: [7]: Bypass Latency +# CHECK-NEXT: [8]: Resources ( | [] | [, | [] | [, | [] | [,&1 | FileCheck --check-prefixes=GLOB32 --implicit-check-not="in t64" %s # RUN: cp t_all.a t_X32.a # RUN: env OBJECT_MODE=32 llvm-ranlib t_X32.a diff --git a/llvm/test/tools/llvm-readobj/ELF/file-header-machine-types.test b/llvm/test/tools/llvm-readobj/ELF/file-header-machine-types.test index 9ba1dff6ec84e..642532b39dfa5 100644 --- a/llvm/test/tools/llvm-readobj/ELF/file-header-machine-types.test +++ b/llvm/test/tools/llvm-readobj/ELF/file-header-machine-types.test @@ -480,6 +480,9 @@ # RUN: yaml2obj %s -o %t.ve.o -D MACHINE=EM_VE # RUN: llvm-readelf --file-headers %t.ve.o | FileCheck %s -DMACHINE="NEC SX-Aurora Vector Engine" +# RUN: yaml2obj %s -o %t.igt.o -D MACHINE=EM_INTELGT +# RUN: llvm-readelf --file-headers %t.igt.o | FileCheck %s -DMACHINE="Intel Graphics Technology" + # CHECK: Machine: [[MACHINE]] --- !ELF diff --git a/llvm/test/tools/llvm-remarkutil/Inputs/filter.yaml b/llvm/test/tools/llvm-remarkutil/Inputs/filter.yaml new file mode 100644 index 0000000000000..89def7fc4c0e5 --- /dev/null +++ b/llvm/test/tools/llvm-remarkutil/Inputs/filter.yaml @@ -0,0 +1,28 @@ +--- !Passed +Pass: pass1 +Name: Remark1 +DebugLoc: { File: 'path/to/func1.c', Line: 1, Column: 2 } +Function: func1 +Args: + - String: ' text' + - arg1: argval1 +... +--- !Missed +Pass: pass2 +Name: Remark2 +DebugLoc: { File: 'path/to/func2.c', Line: 1, Column: 2 } +Function: func2 +Args: + - String: ' text' + - arg2: argval2 +... 
+--- !Analysis +Pass: pass3 +Name: Remark3 +DebugLoc: { File: 'path/to/func3.c', Line: 1, Column: 2 } +Function: func3 +Args: + - String: ' text' + - arg3: argval3 + DebugLoc: { File: 'path/to/func3.c', Line: 2, Column: 2 } +... diff --git a/llvm/test/tools/llvm-remarkutil/Inputs/two-remarks.bitstream b/llvm/test/tools/llvm-remarkutil/Inputs/two-remarks.bitstream index 2a528436791ae..49ba47e2504ed 100644 Binary files a/llvm/test/tools/llvm-remarkutil/Inputs/two-remarks.bitstream and b/llvm/test/tools/llvm-remarkutil/Inputs/two-remarks.bitstream differ diff --git a/llvm/test/tools/llvm-remarkutil/Inputs/two-remarks.v0.bitstream b/llvm/test/tools/llvm-remarkutil/Inputs/two-remarks.v0.bitstream new file mode 100644 index 0000000000000..2a528436791ae Binary files /dev/null and b/llvm/test/tools/llvm-remarkutil/Inputs/two-remarks.v0.bitstream differ diff --git a/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark-magic.test b/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark-magic.test index f469eadc07f99..c21dbd72a2a18 100644 --- a/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark-magic.test +++ b/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark-magic.test @@ -2,5 +2,6 @@ RUN: not llvm-remarkutil instruction-count %p/Inputs/broken-remark-magic.bitstre RUN: not llvm-remarkutil instruction-mix %p/Inputs/broken-remark-magic.bitstream -o - 2>&1 | FileCheck %s RUN: not llvm-remarkutil annotation-count --annotation-type=remark %p/Inputs/broken-remark-magic.bitstream -o - 2>&1 | FileCheck %s RUN: not llvm-remarkutil count %p/Inputs/broken-remark-magic.bitstream -o - 2>&1 | FileCheck %s +RUN: not llvm-remarkutil filter %p/Inputs/broken-remark-magic.bitstream -o - 2>&1 | FileCheck %s CHECK: error: Automatic detection of remark format failed. Unknown magic number: '1234' diff --git a/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark.test b/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark.test index 78011aece08f7..339f082d4825b 100644 --- a/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark.test +++ b/llvm/test/tools/llvm-remarkutil/broken-bitstream-remark.test @@ -2,5 +2,6 @@ RUN: not llvm-remarkutil bitstream2yaml %p/Inputs/broken-remark -o - 2>&1 | File RUN: not llvm-remarkutil instruction-count --parser=bitstream %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s RUN: not llvm-remarkutil annotation-count --parser=bitstream --annotation-type=remark %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s RUN: not llvm-remarkutil count --parser=bitstream %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s +RUN: not llvm-remarkutil filter --parser=bitstream %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s CHECK: error: Unknown magic number: expecting RMRK, got --- . 
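These error-path tests exercise the format auto-detection that every llvm-remarkutil subcommand, including the new filter command, goes through before parsing: the leading magic bytes decide between the bitstream and YAML parsers, and anything else produces the "Unknown magic number" diagnostic seen above. A conceptual sketch of that dispatch follows; the helper name and the exact YAML heuristic are assumptions for illustration, not the llvm/Remarks API.

// Conceptual sketch of remark-format auto-detection by magic bytes;
// hypothetical helper, not the real remarks parser.
#include <string_view>

enum class RemarkFormat { YAML, Bitstream, Unknown };

RemarkFormat detectRemarkFormat(std::string_view Buffer) {
  // Bitstream remark containers start with the "RMRK" magic; empty files
  // are treated as (empty) bitstream input, as the empty-file test notes.
  if (Buffer.empty() || Buffer.substr(0, 4) == "RMRK")
    return RemarkFormat::Bitstream;
  // Assumed heuristic: YAML remark streams begin with a document marker.
  if (Buffer.substr(0, 4) == "--- ")
    return RemarkFormat::YAML;
  return RemarkFormat::Unknown; // -> "Unknown magic number" diagnostic
}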
diff --git a/llvm/test/tools/llvm-remarkutil/broken-yaml-remark.test b/llvm/test/tools/llvm-remarkutil/broken-yaml-remark.test index 464d0b80c4ad0..9da3de4034b0f 100644 --- a/llvm/test/tools/llvm-remarkutil/broken-yaml-remark.test +++ b/llvm/test/tools/llvm-remarkutil/broken-yaml-remark.test @@ -3,5 +3,6 @@ RUN: not llvm-remarkutil instruction-count --parser=yaml %p/Inputs/broken-remark RUN: not llvm-remarkutil instruction-mix --parser=yaml %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s RUN: not llvm-remarkutil annotation-count --parser=yaml --annotation-type=remark %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s RUN: not llvm-remarkutil count --parser=yaml %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s +RUN: not llvm-remarkutil filter --parser=yaml %p/Inputs/broken-remark -o - 2>&1 | FileCheck %s CHECK: error: Type, Pass, Name or Function missing diff --git a/llvm/test/tools/llvm-remarkutil/convert.test b/llvm/test/tools/llvm-remarkutil/convert.test index 0d7ab8e4682a5..2b1bdcb0a5b3a 100644 --- a/llvm/test/tools/llvm-remarkutil/convert.test +++ b/llvm/test/tools/llvm-remarkutil/convert.test @@ -1,8 +1,11 @@ RUN: llvm-remarkutil bitstream2yaml %p/Inputs/two-remarks.bitstream -o %t.yaml RUN: FileCheck %s -strict-whitespace < %t.yaml +RUN: not llvm-remarkutil bitstream2yaml %p/Inputs/two-remarks.v0.bitstream 2>&1 -o - | FileCheck %s --check-prefix=ERR RUN: llvm-remarkutil yaml2bitstream %p/Inputs/two-remarks.yaml -o %t.bitstream RUN: llvm-remarkutil bitstream2yaml %t.bitstream -o - | FileCheck %s -strict-whitespace +; ERR: error: Unsupported remark container version (expected: 1, read: 0). Please upgrade/downgrade your toolchain to read this container. + ; CHECK: --- !Analysis ; CHECK-NEXT: Pass: prologepilog ; CHECK-NEXT: Name: StackSize diff --git a/llvm/test/tools/llvm-remarkutil/empty-file.test b/llvm/test/tools/llvm-remarkutil/empty-file.test index d9820a088ea8f..9b2b000e9c24b 100644 --- a/llvm/test/tools/llvm-remarkutil/empty-file.test +++ b/llvm/test/tools/llvm-remarkutil/empty-file.test @@ -3,16 +3,19 @@ RUN: not llvm-remarkutil instruction-count --parser=yaml %p/Inputs/empty-file -o RUN: not llvm-remarkutil instruction-mix --parser=yaml %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --check-prefix=YAMLPARSER RUN: not llvm-remarkutil annotation-count --parser=yaml --annotation-type=remark %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --check-prefix=YAMLPARSER RUN: not llvm-remarkutil count --parser=yaml %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --check-prefix=YAMLPARSER +RUN: not llvm-remarkutil filter --parser=yaml %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --check-prefix=YAMLPARSER RUN: llvm-remarkutil bitstream2yaml %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=BITSTREAM2YAML RUN: llvm-remarkutil instruction-count --parser=bitstream %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=SIZEBITSTREAM RUN: llvm-remarkutil instruction-mix --parser=bitstream %p/Inputs/empty-file --report_style=csv -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=MIXBITSTREAM RUN: llvm-remarkutil annotation-count --parser=bitstream --annotation-type=remark %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=ANNOTATIONBITSTREAM RUN: llvm-remarkutil count --parser=bitstream %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=COUNTBITSTREAM +RUN: llvm-remarkutil filter --parser=bitstream %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=FILTERBITSTREAM ; Parser format 
auto-detection should treat empty files as bitstream files RUN: llvm-remarkutil instruction-count %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=SIZEBITSTREAM RUN: llvm-remarkutil instruction-mix %p/Inputs/empty-file --report_style=csv -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=MIXBITSTREAM RUN: llvm-remarkutil annotation-count --annotation-type=remark %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=ANNOTATIONBITSTREAM RUN: llvm-remarkutil count %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=COUNTBITSTREAM +RUN: llvm-remarkutil filter %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow-empty --check-prefix=FILTERBITSTREAM ; YAMLPARSER: error: document root is not of mapping type. @@ -30,3 +33,5 @@ RUN: llvm-remarkutil count %p/Inputs/empty-file -o - 2>&1 | FileCheck %s --allow ; MIXBITSTREAM-LABEL: Instruction,Count ; MIXBITSTREAM-EMPTY: + +; FILTERBITSTREAM-NOT: {{.}} diff --git a/llvm/test/tools/llvm-remarkutil/filter.test b/llvm/test/tools/llvm-remarkutil/filter.test new file mode 100644 index 0000000000000..8304b9f0129a8 --- /dev/null +++ b/llvm/test/tools/llvm-remarkutil/filter.test @@ -0,0 +1,59 @@ +RUN: llvm-remarkutil filter %p/Inputs/filter.yaml | diff %p/Inputs/filter.yaml - +RUN: llvm-remarkutil filter --rfunction=func %p/Inputs/filter.yaml | diff %p/Inputs/filter.yaml - +RUN: llvm-remarkutil filter --rremark-name=Remark %p/Inputs/filter.yaml | diff %p/Inputs/filter.yaml - +RUN: llvm-remarkutil filter --rpass-name=pass %p/Inputs/filter.yaml | diff %p/Inputs/filter.yaml - +RUN: llvm-remarkutil filter --rfilter-arg-by=argval %p/Inputs/filter.yaml | diff %p/Inputs/filter.yaml - + +RUN: llvm-remarkutil filter --rfunction=unc1 %p/Inputs/filter.yaml | FileCheck %s --strict-whitespace --check-prefix=REMARK1 +RUN: llvm-remarkutil filter --rremark-name=ark3 %p/Inputs/filter.yaml | FileCheck %s --strict-whitespace --check-prefix=REMARK3 +RUN: llvm-remarkutil filter --rpass-name=s1 %p/Inputs/filter.yaml | FileCheck %s --strict-whitespace --check-prefix=REMARK1 +RUN: llvm-remarkutil filter --filter-arg-by=argval2 %p/Inputs/filter.yaml | FileCheck %s --strict-whitespace --check-prefix=REMARK2 +RUN: llvm-remarkutil filter --function=func1 %p/Inputs/filter.yaml | FileCheck %s --strict-whitespace --check-prefix=REMARK1 +RUN: llvm-remarkutil filter --pass-name=pass2 %p/Inputs/filter.yaml | FileCheck %s --strict-whitespace --check-prefix=REMARK2 +RUN: llvm-remarkutil filter --remark-name=Remark3 %p/Inputs/filter.yaml | FileCheck %s --strict-whitespace --check-prefix=REMARK3 +RUN: llvm-remarkutil filter --function=func1 --pass-name=pass1 --remark-name=Remark1 %p/Inputs/filter.yaml | FileCheck %s --strict-whitespace --check-prefix=REMARK1 +RUN: llvm-remarkutil filter --remark-type=passed %p/Inputs/filter.yaml | FileCheck %s --strict-whitespace --check-prefix=REMARK1 +RUN: llvm-remarkutil filter --remark-type=missed %p/Inputs/filter.yaml | FileCheck %s --strict-whitespace --check-prefix=REMARK2 +RUN: llvm-remarkutil filter --remark-type=analysis %p/Inputs/filter.yaml | FileCheck %s --strict-whitespace --check-prefix=REMARK3 + +RUN: llvm-remarkutil yaml2bitstream -o %t.opt.bitstream %p/Inputs/filter.yaml +RUN: llvm-remarkutil filter --function=func1 %t.opt.bitstream | FileCheck %s --strict-whitespace --check-prefix=REMARK1 + +RUN: llvm-remarkutil filter --function=func1 %t.opt.bitstream -o %t.r1.opt.bitstream +RUN: llvm-remarkutil bitstream2yaml %t.r1.opt.bitstream | FileCheck %s --strict-whitespace 
--check-prefix=REMARK1 + +RUN: llvm-remarkutil filter --function=func %p/Inputs/filter.yaml | FileCheck %s --allow-empty --strict-whitespace --check-prefix=EMPTY + +; REMARK1: --- !Passed +; REMARK1-NEXT: Pass: pass1 +; REMARK1-NEXT: Name: Remark1 +; REMARK1-NEXT: DebugLoc: { File: 'path/to/func1.c', Line: 1, Column: 2 } +; REMARK1-NEXT: Function: func1 +; REMARK1-NEXT: Args: +; REMARK1-NEXT: - String: ' text' +; REMARK1-NEXT: - arg1: argval1 +; REMARK1-NEXT: ... +; REMARK1-NOT: {{.}} +; REMARK2: --- !Missed +; REMARK2-NEXT: Pass: pass2 +; REMARK2-NEXT: Name: Remark2 +; REMARK2-NEXT: DebugLoc: { File: 'path/to/func2.c', Line: 1, Column: 2 } +; REMARK2-NEXT: Function: func2 +; REMARK2-NEXT: Args: +; REMARK2-NEXT: - String: ' text' +; REMARK2-NEXT: - arg2: argval2 +; REMARK2-NEXT: ... +; REMARK2-NOT: {{.}} +; REMARK3: --- !Analysis +; REMARK3-NEXT: Pass: pass3 +; REMARK3-NEXT: Name: Remark3 +; REMARK3-NEXT: DebugLoc: { File: 'path/to/func3.c', Line: 1, Column: 2 } +; REMARK3-NEXT: Function: func3 +; REMARK3-NEXT: Args: +; REMARK3-NEXT: - String: ' text' +; REMARK3-NEXT: - arg3: argval3 +; REMARK3-NEXT: DebugLoc: { File: 'path/to/func3.c', Line: 2, Column: 2 } +; REMARK3-NEXT: ... +; REMARK3-NOT: {{.}} + +; EMPTY-NOT: {{.}} diff --git a/llvm/test/tools/llvm-strings/eof.test b/llvm/test/tools/llvm-strings/eof.test index 19b5adc85ef0e..a2a3fc77db9a1 100644 --- a/llvm/test/tools/llvm-strings/eof.test +++ b/llvm/test/tools/llvm-strings/eof.test @@ -1,11 +1,13 @@ ## Show that llvm-strings prints the last string in the input even if no ## unprintable character follows it. -RUN: echo -n abcdefg | llvm-strings - | FileCheck %s --check-prefix=PRINT +RUN: echo -n abcdefg > %t +RUN: llvm-strings %t - | FileCheck %s --check-prefix=PRINT PRINT: abcdefg ## Show that llvm-strings does not print the last string in the input if it is ## too short and no unprintable character follows it. -RUN: echo -n abc | llvm-strings - | FileCheck --allow-empty %s --check-prefix=NOPRINT +RUN: echo -n abc > %t +RUN: llvm-strings %t - | FileCheck --allow-empty %s --check-prefix=NOPRINT NOPRINT-NOT: {{.}} diff --git a/llvm/test/tools/llvm-strings/stdin.test b/llvm/test/tools/llvm-strings/stdin.test index 06dcd194a3016..63f7194ab973d 100644 --- a/llvm/test/tools/llvm-strings/stdin.test +++ b/llvm/test/tools/llvm-strings/stdin.test @@ -1,3 +1,5 @@ +# XFAIL: system-aix + ## Show that llvm-strings can handle stdin input properly. ## Case 1: output with single string. diff --git a/llvm/test/tools/llvm-strings/whitespace.test b/llvm/test/tools/llvm-strings/whitespace.test index 7963ff73fb837..c51e5e62724cc 100644 --- a/llvm/test/tools/llvm-strings/whitespace.test +++ b/llvm/test/tools/llvm-strings/whitespace.test @@ -1,3 +1,4 @@ ## Show that the default output format matches GNU strings. -RUN: echo -n abcd | llvm-strings - | FileCheck %s --strict-whitespace --implicit-check-not={{.}} +RUN: echo -n abcd > %t +RUN: llvm-strings %t - | FileCheck %s --strict-whitespace --implicit-check-not={{.}} CHECK: {{^}}abcd{{$}} diff --git a/llvm/test/tools/llvm-tli-checker/ifuncs.yaml b/llvm/test/tools/llvm-tli-checker/ifuncs.yaml new file mode 100644 index 0000000000000..4eae66c3051a7 --- /dev/null +++ b/llvm/test/tools/llvm-tli-checker/ifuncs.yaml @@ -0,0 +1,39 @@ +# REQUIRES: x86-registered-target +# +# stpncpy is declared as available in TargetLibraryInfo for FreeBSD, but +# llvm-tli-checker won't be able to find it unless it knows how to check ifuncs. +# This test makes sure that llvm-tli-checker supports processing ifuncs. 
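As the comment explains, the checker only sees stpncpy if it follows the R_X86_64_JUMP_SLOT relocation in the .rela.plt section (defined in the YAML below) back to the STT_GNU_IFUNC dynamic symbol. A conceptual sketch of that mapping follows; the types and the helper are invented for illustration and are not llvm-tli-checker's implementation.

// Conceptual sketch of ifunc handling: map PLT relocations back to
// dynamic-symbol names so exported ifuncs are counted as provided.
#include <cstdint>
#include <string>
#include <vector>

struct DynSym { std::string Name; uint8_t Type; }; // ELF STT_* value
struct PltReloc { uint32_t SymIndex; };            // e.g. R_X86_64_JUMP_SLOT

constexpr uint8_t STT_GNU_IFUNC = 10;

std::vector<std::string>
collectIfuncNames(const std::vector<DynSym> &DynSyms,
                  const std::vector<PltReloc> &PltRelocs) {
  std::vector<std::string> Names;
  for (const PltReloc &R : PltRelocs) {
    const DynSym &S = DynSyms[R.SymIndex];
    if (S.Type == STT_GNU_IFUNC)
      Names.push_back(S.Name); // e.g. "stpncpy" in the object below
  }
  return Names;
}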
+# +# RUN: yaml2obj %s -o=%t1 +# RUN: llvm-tli-checker --triple=x86_64-unknown-freebsd %t1 | FileCheck %s +# +# CHECK: == Total TLI yes SDK yes: 1 +# + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + OSABI: ELFOSABI_FREEBSD + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + - Name: .rela.plt + Type: SHT_RELA + Flags: [ SHF_ALLOC, SHF_INFO_LINK ] + Address: 0x3CA20 + Link: .dynsym + AddressAlign: 0x8 + Relocations: + - Offset: 0x1E2C68 + Symbol: stpncpy + Type: R_X86_64_JUMP_SLOT +DynamicSymbols: + - Name: stpncpy + Type: STT_GNU_IFUNC + Section: .text + Binding: STB_WEAK + Value: 0x15D5E0 + Size: 0xC diff --git a/llvm/test/tools/yaml2obj/empty-or-invalid-doc.yaml b/llvm/test/tools/yaml2obj/empty-or-invalid-doc.yaml index 31a0973209f36..6da53297696ad 100644 --- a/llvm/test/tools/yaml2obj/empty-or-invalid-doc.yaml +++ b/llvm/test/tools/yaml2obj/empty-or-invalid-doc.yaml @@ -1,5 +1,6 @@ # RUN: echo "" | not yaml2obj 2>&1 | FileCheck %s -# RUN: echo -n "" | not yaml2obj 2>&1 | FileCheck %s +# RUN: echo -n "" > %t +# RUN: not yaml2obj %t 2>&1 | FileCheck %s # RUN: echo " " | not yaml2obj 2>&1 | FileCheck %s # RUN: echo " " | not yaml2obj 2>&1 | FileCheck %s # CHECK: yaml2obj: error: unknown document type diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp index 138c5d0a513ed..b91c27e6a0f86 100644 --- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp +++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp @@ -291,6 +291,7 @@ ErrorOr> DwarfLinkerForBinary::loadObject( [&](StringRef FileName) { BinHolder.eraseObjectEntry(FileName); }); Error E = RL.link(*ErrorOrObj); + // FIXME: Remark parsing errors are not propagated to the user. if (Error NewE = handleErrors( std::move(E), [&](std::unique_ptr EC) -> Error { return remarksErrorHandler(Obj, *this, std::move(EC)); diff --git a/llvm/tools/dsymutil/Reproducer.cpp b/llvm/tools/dsymutil/Reproducer.cpp index 31e49cdd0518c..0c1d3f90af299 100644 --- a/llvm/tools/dsymutil/Reproducer.cpp +++ b/llvm/tools/dsymutil/Reproducer.cpp @@ -37,9 +37,10 @@ ReproducerGenerate::ReproducerGenerate(std::error_code &EC, int Argc, char **Argv, bool GenerateOnExit) : Root(createReproducerDir(EC)), GenerateOnExit(GenerateOnExit) { llvm::append_range(Args, ArrayRef(Argv, Argc)); + auto RealFS = vfs::getRealFileSystem(); if (!Root.empty()) - FC = std::make_shared(Root, Root); - VFS = FileCollector::createCollectorVFS(vfs::getRealFileSystem(), FC); + FC = std::make_shared(Root, Root, RealFS); + VFS = FileCollector::createCollectorVFS(std::move(RealFS), FC); } ReproducerGenerate::~ReproducerGenerate() { diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp index b3d7185e7f144..a2327fbc3b66a 100644 --- a/llvm/tools/llc/llc.cpp +++ b/llvm/tools/llc/llc.cpp @@ -387,13 +387,13 @@ int main(int argc, char **argv) { // Set a diagnostic handler that doesn't exit on the first error Context.setDiagnosticHandler(std::make_unique()); - Expected> RemarksFileOrErr = + Expected RemarksFileOrErr = setupLLVMOptimizationRemarks(Context, RemarksFilename, RemarksPasses, RemarksFormat, RemarksWithHotness, RemarksHotnessThreshold); if (Error E = RemarksFileOrErr.takeError()) reportError(std::move(E), RemarksFilename); - std::unique_ptr RemarksFile = std::move(*RemarksFileOrErr); + LLVMRemarkFileHandle RemarksFile = std::move(*RemarksFileOrErr); if (InputLanguage != "" && InputLanguage != "ir" && InputLanguage != "mir") reportError("input language must be '', 'IR' or 'MIR'"); diff --git 
a/llvm/tools/llvm-cov/CoverageExporterJson.cpp b/llvm/tools/llvm-cov/CoverageExporterJson.cpp index 06de33dc070e0..4c07c05396732 100644 --- a/llvm/tools/llvm-cov/CoverageExporterJson.cpp +++ b/llvm/tools/llvm-cov/CoverageExporterJson.cpp @@ -21,7 +21,8 @@ // -- Branches: array => List of Branches in the file // -- Branch: dict => Describes a branch of the file with counters // -- MCDC Records: array => List of MCDC records in the file -// -- MCDC Values: array => List of T/F covered condition values +// -- MCDC Values: array => List of T/F covered condition values and +// list of executed test vectors // -- Segments: array => List of Segments contained in the file // -- Segment: dict => Describes a segment of the file with a counter // -- Expansions: array => List of expansion records @@ -62,7 +63,7 @@ #include /// The semantic version combined as a string. -#define LLVM_COVERAGE_EXPORT_JSON_STR "3.0.1" +#define LLVM_COVERAGE_EXPORT_JSON_STR "3.1.0" /// Unique type identifier for JSON coverage export. #define LLVM_COVERAGE_EXPORT_JSON_TYPE_STR "llvm.coverage.json.export" @@ -108,13 +109,43 @@ json::Array gatherConditions(const coverage::MCDCRecord &Record) { return Conditions; } +json::Value renderCondState(const coverage::MCDCRecord::CondState CondState) { + switch (CondState) { + case coverage::MCDCRecord::MCDC_DontCare: + return json::Value(nullptr); + case coverage::MCDCRecord::MCDC_True: + return json::Value(true); + case coverage::MCDCRecord::MCDC_False: + return json::Value(false); + } + llvm_unreachable("Unknown llvm::coverage::MCDCRecord::CondState enum"); +} + +json::Array gatherTestVectors(coverage::MCDCRecord &Record) { + json::Array TestVectors; + unsigned NumConditions = Record.getNumConditions(); + for (unsigned tv = 0; tv < Record.getNumTestVectors(); tv++) { + + json::Array TVConditions; + for (unsigned c = 0; c < NumConditions; c++) + TVConditions.push_back(renderCondState(Record.getTVCondition(tv, c))); + + TestVectors.push_back( + json::Object({{"executed", json::Value(true)}, + {"result", renderCondState(Record.getTVResult(tv))}, + {"conditions", std::move(TVConditions)}})); + } + return TestVectors; +} + json::Array renderMCDCRecord(const coverage::MCDCRecord &Record) { const llvm::coverage::CounterMappingRegion &CMR = Record.getDecisionRegion(); const auto [TrueDecisions, FalseDecisions] = Record.getDecisions(); - return json::Array({CMR.LineStart, CMR.ColumnStart, CMR.LineEnd, - CMR.ColumnEnd, TrueDecisions, FalseDecisions, - CMR.FileID, CMR.ExpandedFileID, int64_t(CMR.Kind), - gatherConditions(Record)}); + return json::Array( + {CMR.LineStart, CMR.ColumnStart, CMR.LineEnd, CMR.ColumnEnd, + TrueDecisions, FalseDecisions, CMR.FileID, CMR.ExpandedFileID, + int64_t(CMR.Kind), gatherConditions(Record), + gatherTestVectors(const_cast(Record))}); } json::Array renderRegions(ArrayRef Regions) { @@ -216,32 +247,28 @@ json::Object renderSummary(const FileCoverageSummary &Summary) { } json::Array renderFileExpansions(const coverage::CoverageMapping &Coverage, - const coverage::CoverageData &FileCoverage, - const FileCoverageSummary &FileReport) { + const coverage::CoverageData &FileCoverage) { json::Array ExpansionArray; for (const auto &Expansion : FileCoverage.getExpansions()) ExpansionArray.push_back(renderExpansion(Coverage, Expansion)); return ExpansionArray; } -json::Array renderFileSegments(const coverage::CoverageData &FileCoverage, - const FileCoverageSummary &FileReport) { +json::Array renderFileSegments(const coverage::CoverageData &FileCoverage) { json::Array 
SegmentArray; for (const auto &Segment : FileCoverage) SegmentArray.push_back(renderSegment(Segment)); return SegmentArray; } -json::Array renderFileBranches(const coverage::CoverageData &FileCoverage, - const FileCoverageSummary &FileReport) { +json::Array renderFileBranches(const coverage::CoverageData &FileCoverage) { json::Array BranchArray; for (const auto &Branch : FileCoverage.getBranches()) BranchArray.push_back(renderBranch(Branch)); return BranchArray; } -json::Array renderFileMCDC(const coverage::CoverageData &FileCoverage, - const FileCoverageSummary &FileReport) { +json::Array renderFileMCDC(const coverage::CoverageData &FileCoverage) { json::Array MCDCRecordArray; for (const auto &Record : FileCoverage.getMCDCRecords()) MCDCRecordArray.push_back(renderMCDCRecord(Record)); @@ -256,12 +283,11 @@ json::Object renderFile(const coverage::CoverageMapping &Coverage, if (!Options.ExportSummaryOnly) { // Calculate and render detailed coverage information for given file. auto FileCoverage = Coverage.getCoverageForFile(Filename); - File["segments"] = renderFileSegments(FileCoverage, FileReport); - File["branches"] = renderFileBranches(FileCoverage, FileReport); - File["mcdc_records"] = renderFileMCDC(FileCoverage, FileReport); + File["segments"] = renderFileSegments(FileCoverage); + File["branches"] = renderFileBranches(FileCoverage); + File["mcdc_records"] = renderFileMCDC(FileCoverage); if (!Options.SkipExpansions) { - File["expansions"] = - renderFileExpansions(Coverage, FileCoverage, FileReport); + File["expansions"] = renderFileExpansions(Coverage, FileCoverage); } } File["summary"] = renderSummary(FileReport); diff --git a/llvm/tools/llvm-gpu-loader/server.h b/llvm/tools/llvm-gpu-loader/server.h index bc54b4b74915a..da73cc007f5d5 100644 --- a/llvm/tools/llvm-gpu-loader/server.h +++ b/llvm/tools/llvm-gpu-loader/server.h @@ -12,8 +12,6 @@ #include #include -#include "include/llvm-libc-types/test_rpc_opcodes_t.h" - #include "shared/rpc.h" #include "shared/rpc_opcodes.h" #include "shared/rpc_server.h" @@ -28,59 +26,6 @@ inline uint32_t handle_server(rpc::Server &server, uint32_t index, int status = rpc::RPC_SUCCESS; switch (port->get_opcode()) { - case RPC_TEST_INCREMENT: { - port->recv_and_send([](rpc::Buffer *buffer, uint32_t) { - reinterpret_cast(buffer->data)[0] += 1; - }); - break; - } - case RPC_TEST_INTERFACE: { - bool end_with_recv; - uint64_t cnt; - port->recv([&](rpc::Buffer *buffer, uint32_t) { - end_with_recv = buffer->data[0]; - }); - port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; }); - port->send([&](rpc::Buffer *buffer, uint32_t) { - buffer->data[0] = cnt = cnt + 1; - }); - port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; }); - port->send([&](rpc::Buffer *buffer, uint32_t) { - buffer->data[0] = cnt = cnt + 1; - }); - port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; }); - port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; }); - port->send([&](rpc::Buffer *buffer, uint32_t) { - buffer->data[0] = cnt = cnt + 1; - }); - port->send([&](rpc::Buffer *buffer, uint32_t) { - buffer->data[0] = cnt = cnt + 1; - }); - if (end_with_recv) - port->recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; }); - else - port->send([&](rpc::Buffer *buffer, uint32_t) { - buffer->data[0] = cnt = cnt + 1; - }); - - break; - } - case RPC_TEST_STREAM: { - uint64_t sizes[num_lanes] = {0}; - void *dst[num_lanes] = {nullptr}; - port->recv_n(dst, sizes, - [](uint64_t size) -> void * { return new 
char[size]; }); - port->send_n(dst, sizes); - for (uint64_t i = 0; i < num_lanes; ++i) { - if (dst[i]) - delete[] reinterpret_cast(dst[i]); - } - break; - } - case RPC_TEST_NOOP: { - port->recv([&](rpc::Buffer *, uint32_t) {}); - break; - } case LIBC_MALLOC: { port->recv_and_send([&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = reinterpret_cast(alloc(buffer->data[0])); diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 31bf6a9d2d9c8..e09ddb45da6e9 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -1519,10 +1519,10 @@ class MemoryMatcher { static StringRef detectStubKind(const Session::MemoryRegionInfo &Stub) { using namespace support::endian; - auto Armv7MovWTle = byte_swap(0xe300c000); - auto Armv7BxR12le = byte_swap(0xe12fff1c); - auto Thumbv7MovWTle = byte_swap(0x0c00f240); - auto Thumbv7BxR12le = byte_swap(0x4760); + auto Armv7MovWTle = byte_swap(0xe300c000, endianness::little); + auto Armv7BxR12le = byte_swap(0xe12fff1c, endianness::little); + auto Thumbv7MovWTle = byte_swap(0x0c00f240, endianness::little); + auto Thumbv7BxR12le = byte_swap(0x4760, endianness::little); MemoryMatcher M(Stub.getContent()); if (M.matchMask(Thumbv7MovWTle)) { diff --git a/llvm/tools/llvm-mca/llvm-mca.cpp b/llvm/tools/llvm-mca/llvm-mca.cpp index a4194da4a7b63..a64539c09b81e 100644 --- a/llvm/tools/llvm-mca/llvm-mca.cpp +++ b/llvm/tools/llvm-mca/llvm-mca.cpp @@ -668,7 +668,7 @@ int main(int argc, char **argv) { return 1; } - IPP->postProcessInstruction(Inst.get(), MCI); + IPP->postProcessInstruction(*Inst.get(), MCI); InstToInstruments.insert({&MCI, Instruments}); LoweredSequence.emplace_back(std::move(Inst.get())); } diff --git a/llvm/tools/llvm-offload-wrapper/llvm-offload-wrapper.cpp b/llvm/tools/llvm-offload-wrapper/llvm-offload-wrapper.cpp index 9dac1646b1e26..d65b402571ae8 100644 --- a/llvm/tools/llvm-offload-wrapper/llvm-offload-wrapper.cpp +++ b/llvm/tools/llvm-offload-wrapper/llvm-offload-wrapper.cpp @@ -84,6 +84,10 @@ static Error wrapImages(ArrayRef> BuffersToWrap) { M, BuffersToWrap.front(), offloading::getOffloadEntryArray(M))) return Err; break; + case llvm::object::OFK_SYCL: + if (Error Err = offloading::wrapSYCLBinaries(M, BuffersToWrap.front())) + return Err; + break; default: return createStringError(getOffloadKindName(Kind) + " wrapping is not supported"); diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index b7e48cfec5885..3092bfd42e25e 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1305,6 +1305,7 @@ const EnumEntry ElfMachineType[] = { ENUM_ENT(EM_BPF, "EM_BPF"), ENUM_ENT(EM_VE, "NEC SX-Aurora Vector Engine"), ENUM_ENT(EM_LOONGARCH, "LoongArch"), + ENUM_ENT(EM_INTELGT, "Intel Graphics Technology"), }; const EnumEntry ElfSymbolBindings[] = { diff --git a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp index b1138ef9d5289..c479233a712e7 100644 --- a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp +++ b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp @@ -62,12 +62,12 @@ static cl::opt TmpFilesAsBitcode( cl::desc("Always write temporary files as bitcode instead of textual IR"), cl::init(false), cl::cat(LLVMReduceOptions)); -static SmallVector constructSaveRestorePoints( - ArrayRef SRPoints, +static SaveRestorePoints constructSaveRestorePoints( + const SaveRestorePoints &SRPoints, const DenseMap &BBMap) { - SmallVector Pts; + SaveRestorePoints Pts{}; for 
(auto &Src : SRPoints) - Pts.push_back(BBMap.find(Src)->second); + Pts.insert({BBMap.find(Src.first)->second, Src.second}); return Pts; } diff --git a/llvm/tools/llvm-remarkutil/CMakeLists.txt b/llvm/tools/llvm-remarkutil/CMakeLists.txt index ed398ad272024..c6e9334d87c04 100644 --- a/llvm/tools/llvm-remarkutil/CMakeLists.txt +++ b/llvm/tools/llvm-remarkutil/CMakeLists.txt @@ -8,6 +8,7 @@ add_llvm_tool(llvm-remarkutil RemarkConvert.cpp RemarkCount.cpp RemarkCounter.cpp + RemarkFilter.cpp RemarkInstructionMix.cpp RemarkSizeDiff.cpp RemarkUtil.cpp diff --git a/llvm/tools/llvm-remarkutil/RemarkConvert.cpp b/llvm/tools/llvm-remarkutil/RemarkConvert.cpp index 207c5e0a8048b..203c8266b077d 100644 --- a/llvm/tools/llvm-remarkutil/RemarkConvert.cpp +++ b/llvm/tools/llvm-remarkutil/RemarkConvert.cpp @@ -80,8 +80,8 @@ static Error tryReserializeYAML2Bitstream( if (!MaybeOF) return MaybeOF.takeError(); auto OF = std::move(*MaybeOF); - auto MaybeSerializer = createRemarkSerializer( - OutputFormat, SerializerMode::Standalone, OF->os(), std::move(StrTab)); + auto MaybeSerializer = + createRemarkSerializer(OutputFormat, OF->os(), std::move(StrTab)); if (!MaybeSerializer) return MaybeSerializer.takeError(); auto Serializer = std::move(*MaybeSerializer); @@ -110,8 +110,7 @@ static Error tryBitstream2YAML() { if (!MaybeOF) return MaybeOF.takeError(); auto OF = std::move(*MaybeOF); - auto MaybeSerializer = createRemarkSerializer( - OutputFormat, SerializerMode::Standalone, OF->os()); + auto MaybeSerializer = createRemarkSerializer(OutputFormat, OF->os()); if (!MaybeSerializer) return MaybeSerializer.takeError(); diff --git a/llvm/tools/llvm-remarkutil/RemarkCounter.cpp b/llvm/tools/llvm-remarkutil/RemarkCounter.cpp index 7d5c84815b3bb..2e842c8c2d72e 100644 --- a/llvm/tools/llvm-remarkutil/RemarkCounter.cpp +++ b/llvm/tools/llvm-remarkutil/RemarkCounter.cpp @@ -25,6 +25,9 @@ static cl::SubCommand CountSub("count", INPUT_FORMAT_COMMAND_LINE_OPTIONS(CountSub) INPUT_OUTPUT_COMMAND_LINE_OPTIONS(CountSub) +REMARK_FILTER_COMMAND_LINE_OPTIONS(CountSub) + +REMARK_FILTER_SETUP_FUNC() static cl::list Keys("args", cl::desc("Specify remark argument/s to count by."), @@ -34,45 +37,7 @@ static cl::list RKeys( cl::desc( "Specify remark argument/s to count (accepts regular expressions)."), cl::value_desc("arguments"), cl::sub(CountSub), cl::ValueOptional); -static cl::opt - RemarkNameOpt("remark-name", - cl::desc("Optional remark name to filter collection by."), - cl::ValueOptional, cl::sub(CountSub)); -static cl::opt - PassNameOpt("pass-name", cl::ValueOptional, - cl::desc("Optional remark pass name to filter collection by."), - cl::sub(CountSub)); -static cl::opt RemarkFilterArgByOpt( - "filter-arg-by", cl::desc("Optional remark arg to filter collection by."), - cl::ValueOptional, cl::sub(CountSub)); -static cl::opt - RemarkNameOptRE("rremark-name", - cl::desc("Optional remark name to filter collection by " - "(accepts regular expressions)."), - cl::ValueOptional, cl::sub(CountSub)); -static cl::opt - RemarkArgFilterOptRE("rfilter-arg-by", - cl::desc("Optional remark arg to filter collection by " - "(accepts regular expressions)."), - cl::sub(CountSub), cl::ValueOptional); -static cl::opt - PassNameOptRE("rpass-name", cl::ValueOptional, - cl::desc("Optional remark pass name to filter collection " - "by (accepts regular expressions)."), - cl::sub(CountSub)); -static cl::opt RemarkTypeOpt( - "remark-type", cl::desc("Optional remark type to filter collection by."), - cl::values(clEnumValN(Type::Unknown, "unknown", "UNKOWN"), - 
clEnumValN(Type::Passed, "passed", "PASSED"), - clEnumValN(Type::Missed, "missed", "MISSED"), - clEnumValN(Type::Analysis, "analysis", "ANALYSIS"), - clEnumValN(Type::AnalysisFPCommute, "analysis-fp-commute", - "ANALYSIS_FP_COMMUTE"), - clEnumValN(Type::AnalysisAliasing, "analysis-aliasing", - "ANALYSIS_ALIASING"), - clEnumValN(Type::Failure, "failure", "FAILURE")), - cl::init(Type::Failure), cl::sub(CountSub)); static cl::opt CountByOpt( "count-by", cl::desc("Specify the property to collect remarks by."), cl::values( @@ -112,21 +77,6 @@ static unsigned getValForKey(StringRef Key, const Remark &Remark) { return *RemarkArg->getValAsInt(); } -bool Filters::filterRemark(const Remark &Remark) { - if (RemarkNameFilter && !RemarkNameFilter->match(Remark.RemarkName)) - return false; - if (PassNameFilter && !PassNameFilter->match(Remark.PassName)) - return false; - if (RemarkTypeFilter) - return *RemarkTypeFilter == Remark.RemarkType; - if (ArgFilter) { - if (!any_of(Remark.Args, - [this](Argument Arg) { return ArgFilter->match(Arg.Val); })) - return false; - } - return true; -} - Error ArgumentCounter::getAllMatchingArgumentsInRemark( StringRef Buffer, ArrayRef Arguments, Filters &Filter) { auto MaybeParser = createRemarkParser(InputFormat, Buffer); @@ -223,33 +173,6 @@ Error RemarkCounter::print(StringRef OutputFileName) { return Error::success(); } -Expected getRemarkFilter() { - // Create Filter properties. - auto MaybeRemarkNameFilter = - FilterMatcher::createExactOrRE(RemarkNameOpt, RemarkNameOptRE); - if (!MaybeRemarkNameFilter) - return MaybeRemarkNameFilter.takeError(); - - auto MaybePassNameFilter = - FilterMatcher::createExactOrRE(PassNameOpt, PassNameOptRE); - if (!MaybePassNameFilter) - return MaybePassNameFilter.takeError(); - - auto MaybeRemarkArgFilter = FilterMatcher::createExactOrRE( - RemarkFilterArgByOpt, RemarkArgFilterOptRE); - if (!MaybeRemarkArgFilter) - return MaybeRemarkArgFilter.takeError(); - - std::optional RemarkType; - if (RemarkTypeOpt != Type::Failure) - RemarkType = RemarkTypeOpt; - - // Create RemarkFilter. - return Filters{std::move(*MaybeRemarkNameFilter), - std::move(*MaybePassNameFilter), - std::move(*MaybeRemarkArgFilter), RemarkType}; -} - Error useCollectRemark(StringRef Buffer, Counter &Counter, Filters &Filter) { // Create Parser. auto MaybeParser = createRemarkParser(InputFormat, Buffer); @@ -278,7 +201,7 @@ static Error collectRemarks() { if (!MaybeBuf) return MaybeBuf.takeError(); StringRef Buffer = (*MaybeBuf)->getBuffer(); - auto MaybeFilter = getRemarkFilter(); + auto MaybeFilter = getRemarkFilters(); if (!MaybeFilter) return MaybeFilter.takeError(); auto &Filter = *MaybeFilter; diff --git a/llvm/tools/llvm-remarkutil/RemarkCounter.h b/llvm/tools/llvm-remarkutil/RemarkCounter.h index 3b977791d87c2..69e552e3742ec 100644 --- a/llvm/tools/llvm-remarkutil/RemarkCounter.h +++ b/llvm/tools/llvm-remarkutil/RemarkCounter.h @@ -14,6 +14,7 @@ #include "RemarkUtilHelpers.h" #include "llvm/ADT/MapVector.h" #include "llvm/Support/Regex.h" +#include namespace llvm { namespace remarks { @@ -45,18 +46,6 @@ inline std::string groupByToStr(GroupBy GroupBy) { } } -/// Filter out remarks based on remark properties based on name, pass name, -/// argument and type. -struct Filters { - std::optional RemarkNameFilter; - std::optional PassNameFilter; - std::optional ArgFilter; - std::optional RemarkTypeFilter; - - /// Returns true if \p Remark satisfies all the provided filters. 
- bool filterRemark(const Remark &Remark); -}; - /// Abstract counter class used to define the general required methods for /// counting a remark. struct Counter { diff --git a/llvm/tools/llvm-remarkutil/RemarkFilter.cpp b/llvm/tools/llvm-remarkutil/RemarkFilter.cpp new file mode 100644 index 0000000000000..acfef6608677c --- /dev/null +++ b/llvm/tools/llvm-remarkutil/RemarkFilter.cpp @@ -0,0 +1,84 @@ +//===- RemarkFilter.cpp ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Generic tool to filter remarks +// +//===----------------------------------------------------------------------===// + +#include "RemarkUtilHelpers.h" +#include "RemarkUtilRegistry.h" + +#include "llvm/Support/Error.h" +#include "llvm/Support/Regex.h" + +using namespace llvm; +using namespace remarks; +using namespace llvm::remarkutil; + +namespace filter { + +static cl::SubCommand FilterSub("filter", + "Filter remarks based on specified criteria."); + +INPUT_FORMAT_COMMAND_LINE_OPTIONS(FilterSub) +OUTPUT_FORMAT_COMMAND_LINE_OPTIONS(FilterSub) +INPUT_OUTPUT_COMMAND_LINE_OPTIONS(FilterSub) +REMARK_FILTER_COMMAND_LINE_OPTIONS(FilterSub) + +REMARK_FILTER_SETUP_FUNC() + +static Error tryFilter() { + auto MaybeFilter = getRemarkFilters(); + if (!MaybeFilter) + return MaybeFilter.takeError(); + Filters &Filter = *MaybeFilter; + + auto MaybeBuf = getInputMemoryBuffer(InputFileName); + if (!MaybeBuf) + return MaybeBuf.takeError(); + auto MaybeParser = createRemarkParser(InputFormat, (*MaybeBuf)->getBuffer()); + if (!MaybeParser) + return MaybeParser.takeError(); + auto &Parser = **MaybeParser; + + Format SerializerFormat = OutputFormat; + if (SerializerFormat == Format::Auto) { + SerializerFormat = Parser.ParserFormat; + if (OutputFileName.empty() || OutputFileName == "-") + SerializerFormat = Format::YAML; + } + + auto MaybeOF = getOutputFileForRemarks(OutputFileName, SerializerFormat); + if (!MaybeOF) + return MaybeOF.takeError(); + auto OF = std::move(*MaybeOF); + + auto MaybeSerializer = createRemarkSerializer(SerializerFormat, OF->os()); + if (!MaybeSerializer) + return MaybeSerializer.takeError(); + auto &Serializer = **MaybeSerializer; + + auto MaybeRemark = Parser.next(); + for (; MaybeRemark; MaybeRemark = Parser.next()) { + Remark &Remark = **MaybeRemark; + if (!Filter.filterRemark(Remark)) + continue; + Serializer.emit(Remark); + } + + auto E = MaybeRemark.takeError(); + if (!E.isA()) + return E; + consumeError(std::move(E)); + OF->keep(); + return Error::success(); +} + +static CommandRegistration FilterReg(&FilterSub, tryFilter); + +} // namespace filter diff --git a/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.cpp b/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.cpp index ad6c46eceb8f2..be529480e7d24 100644 --- a/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.cpp +++ b/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.cpp @@ -92,5 +92,22 @@ FilterMatcher::createExactOrRE(const llvm::cl::opt &ExactArg, return std::nullopt; } +bool Filters::filterRemark(const Remark &Remark) { + if (FunctionFilter && !FunctionFilter->match(Remark.FunctionName)) + return false; + if (RemarkNameFilter && !RemarkNameFilter->match(Remark.RemarkName)) + return false; + if (PassNameFilter && !PassNameFilter->match(Remark.PassName)) + return 
false; + if (RemarkTypeFilter) + return *RemarkTypeFilter == Remark.RemarkType; + if (ArgFilter) { + if (!any_of(Remark.Args, + [this](Argument Arg) { return ArgFilter->match(Arg.Val); })) + return false; + } + return true; +} + } // namespace remarks } // namespace llvm diff --git a/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h b/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h index 894ac8354e18b..0dd550765c1c6 100644 --- a/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h +++ b/llvm/tools/llvm-remarkutil/RemarkUtilHelpers.h @@ -9,12 +9,11 @@ // Helpers for remark utilites // //===----------------------------------------------------------------------===// -#include "llvm-c/Remarks.h" #include "llvm/ADT/StringRef.h" #include "llvm/Remarks/Remark.h" #include "llvm/Remarks/RemarkFormat.h" #include "llvm/Remarks/RemarkParser.h" -#include "llvm/Remarks/YAMLRemarkSerializer.h" +#include "llvm/Remarks/RemarkSerializer.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" @@ -43,6 +42,16 @@ clEnumValN(Format::Bitstream, "bitstream", "Bitstream")), \ cl::sub(SUBOPT)); +#define OUTPUT_FORMAT_COMMAND_LINE_OPTIONS(SUBOPT) \ + static cl::opt OutputFormat( \ + "serializer", cl::init(Format::Auto), \ + cl::desc("Output remark format to serialize"), \ + cl::values(clEnumValN(Format::Auto, "auto", \ + "Follow the parser format (default)"), \ + clEnumValN(Format::YAML, "yaml", "YAML"), \ + clEnumValN(Format::Bitstream, "bitstream", "Bitstream")), \ + cl::sub(SUBOPT)); + #define DEBUG_LOC_INFO_COMMAND_LINE_OPTIONS(SUBOPT) \ static cl::opt UseDebugLoc( \ "use-debug-loc", \ @@ -52,6 +61,87 @@ "number)"), \ cl::init(false), cl::sub(SUBOPT)); +#define REMARK_FILTER_COMMAND_LINE_OPTIONS(SUBOPT) \ + static cl::opt FunctionOpt( \ + "function", cl::sub(SUBOPT), cl::ValueOptional, \ + cl::desc("Optional function name to filter collection by.")); \ + static cl::opt FunctionOptRE( \ + "rfunction", cl::sub(SUBOPT), cl::ValueOptional, \ + cl::desc("Optional function name to filter collection by " \ + "(accepts regular expressions).")); \ + static cl::opt RemarkNameOpt( \ + "remark-name", \ + cl::desc("Optional remark name to filter collection by."), \ + cl::ValueOptional, cl::sub(SUBOPT)); \ + static cl::opt RemarkNameOptRE( \ + "rremark-name", \ + cl::desc("Optional remark name to filter collection by " \ + "(accepts regular expressions)."), \ + cl::ValueOptional, cl::sub(SUBOPT)); \ + static cl::opt PassNameOpt( \ + "pass-name", cl::ValueOptional, \ + cl::desc("Optional remark pass name to filter collection by."), \ + cl::sub(SUBOPT)); \ + static cl::opt PassNameOptRE( \ + "rpass-name", cl::ValueOptional, \ + cl::desc("Optional remark pass name to filter collection " \ + "by (accepts regular expressions)."), \ + cl::sub(SUBOPT)); \ + static cl::opt RemarkTypeOpt( \ + "remark-type", \ + cl::desc("Optional remark type to filter collection by."), \ + cl::values(clEnumValN(Type::Unknown, "unknown", "UNKOWN"), \ + clEnumValN(Type::Passed, "passed", "PASSED"), \ + clEnumValN(Type::Missed, "missed", "MISSED"), \ + clEnumValN(Type::Analysis, "analysis", "ANALYSIS"), \ + clEnumValN(Type::AnalysisFPCommute, "analysis-fp-commute", \ + "ANALYSIS_FP_COMMUTE"), \ + clEnumValN(Type::AnalysisAliasing, "analysis-aliasing", \ + "ANALYSIS_ALIASING"), \ + clEnumValN(Type::Failure, "failure", "FAILURE")), \ + cl::sub(SUBOPT)); \ + static cl::opt RemarkFilterArgByOpt( \ + "filter-arg-by", \ + cl::desc("Optional remark arg to filter collection by."), \ + cl::ValueOptional, 
cl::sub(SUBOPT)); \ + static cl::opt RemarkArgFilterOptRE( \ + "rfilter-arg-by", \ + cl::desc("Optional remark arg to filter collection by " \ + "(accepts regular expressions)."), \ + cl::sub(SUBOPT), cl::ValueOptional); + +#define REMARK_FILTER_SETUP_FUNC() \ + static Expected getRemarkFilters() { \ + auto MaybeFunctionFilter = \ + FilterMatcher::createExactOrRE(FunctionOpt, FunctionOptRE); \ + if (!MaybeFunctionFilter) \ + return MaybeFunctionFilter.takeError(); \ + \ + auto MaybeRemarkNameFilter = \ + FilterMatcher::createExactOrRE(RemarkNameOpt, RemarkNameOptRE); \ + if (!MaybeRemarkNameFilter) \ + return MaybeRemarkNameFilter.takeError(); \ + \ + auto MaybePassNameFilter = \ + FilterMatcher::createExactOrRE(PassNameOpt, PassNameOptRE); \ + if (!MaybePassNameFilter) \ + return MaybePassNameFilter.takeError(); \ + \ + auto MaybeRemarkArgFilter = FilterMatcher::createExactOrRE( \ + RemarkFilterArgByOpt, RemarkArgFilterOptRE); \ + if (!MaybeRemarkArgFilter) \ + return MaybeRemarkArgFilter.takeError(); \ + \ + std::optional TypeFilter; \ + if (RemarkTypeOpt.getNumOccurrences()) \ + TypeFilter = RemarkTypeOpt.getValue(); \ + \ + return Filters{std::move(*MaybeFunctionFilter), \ + std::move(*MaybeRemarkNameFilter), \ + std::move(*MaybePassNameFilter), \ + std::move(*MaybeRemarkArgFilter), TypeFilter}; \ + } + namespace llvm { namespace remarks { Expected> @@ -95,5 +185,18 @@ class FilterMatcher { } }; +/// Filter out remarks based on remark properties (function, remark name, pass +/// name, argument values and type). +struct Filters { + std::optional FunctionFilter; + std::optional RemarkNameFilter; + std::optional PassNameFilter; + std::optional ArgFilter; + std::optional RemarkTypeFilter; + + /// Returns true if \p Remark satisfies all the provided filters. 
+ bool filterRemark(const Remark &Remark); +}; + } // namespace remarks } // namespace llvm diff --git a/llvm/tools/llvm-tli-checker/llvm-tli-checker.cpp b/llvm/tools/llvm-tli-checker/llvm-tli-checker.cpp index 3cd5d597ee133..0cf8c5c63bef2 100644 --- a/llvm/tools/llvm-tli-checker/llvm-tli-checker.cpp +++ b/llvm/tools/llvm-tli-checker/llvm-tli-checker.cpp @@ -153,8 +153,12 @@ void SDKNameMap::maybeInsertSymbol(const SymbolRef &S, const ObjectFile &O) { uint32_t Flags = unwrapIgnoreError(S.getFlags()); section_iterator Section = unwrapIgnoreError(S.getSection(), /*Default=*/O.section_end()); - if (Type == SymbolRef::ST_Function && (Flags & SymbolRef::SF_Global) && - Section != O.section_end()) { + bool IsRegularFunction = Type == SymbolRef::ST_Function && + (Flags & SymbolRef::SF_Global) && + Section != O.section_end(); + bool IsIFunc = + Type == SymbolRef::ST_Other && (Flags & SymbolRef::SF_Indirect); + if (IsRegularFunction || IsIFunc) { StringRef Name = unwrapIgnoreError(S.getName()); insert({ Name, true }); } diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp index 0c991b71a6b26..c19fc19f90afe 100644 --- a/llvm/tools/opt/NewPMDriver.cpp +++ b/llvm/tools/opt/NewPMDriver.cpp @@ -361,27 +361,25 @@ bool llvm::runPassPipeline( bool ShouldPreserveBitcodeUseListOrder, bool EmitSummaryIndex, bool EmitModuleHash, bool EnableDebugify, bool VerifyDIPreserve, bool EnableProfcheck, bool UnifiedLTO) { - auto FS = vfs::getRealFileSystem(); std::optional P; switch (PGOKindFlag) { case InstrGen: - P = PGOOptions(ProfileFile, "", "", MemoryProfileFile, FS, - PGOOptions::IRInstr, PGOOptions::NoCSAction, - PGOColdFuncAttr); + P = PGOOptions(ProfileFile, "", "", MemoryProfileFile, PGOOptions::IRInstr, + PGOOptions::NoCSAction, PGOColdFuncAttr); break; case InstrUse: - P = PGOOptions(ProfileFile, "", ProfileRemappingFile, MemoryProfileFile, FS, + P = PGOOptions(ProfileFile, "", ProfileRemappingFile, MemoryProfileFile, PGOOptions::IRUse, PGOOptions::NoCSAction, PGOColdFuncAttr); break; case SampleUse: - P = PGOOptions(ProfileFile, "", ProfileRemappingFile, MemoryProfileFile, FS, + P = PGOOptions(ProfileFile, "", ProfileRemappingFile, MemoryProfileFile, PGOOptions::SampleUse, PGOOptions::NoCSAction, PGOColdFuncAttr); break; case NoPGO: if (DebugInfoForProfiling || PseudoProbeForProfiling || !MemoryProfileFile.empty()) - P = PGOOptions("", "", "", MemoryProfileFile, FS, PGOOptions::NoAction, + P = PGOOptions("", "", "", MemoryProfileFile, PGOOptions::NoAction, PGOOptions::NoCSAction, PGOColdFuncAttr, DebugInfoForProfiling, PseudoProbeForProfiling); else @@ -403,7 +401,7 @@ bool llvm::runPassPipeline( P->CSProfileGenFile = CSProfileGenFile; } else P = PGOOptions("", CSProfileGenFile, ProfileRemappingFile, - /*MemoryProfile=*/"", FS, PGOOptions::NoAction, + /*MemoryProfile=*/"", PGOOptions::NoAction, PGOOptions::CSIRInstr); } else /* CSPGOKindFlag == CSInstrUse */ { if (!P) { diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp index 26902b213571f..d4fa6eb50cda7 100644 --- a/llvm/tools/opt/optdriver.cpp +++ b/llvm/tools/opt/optdriver.cpp @@ -510,7 +510,7 @@ extern "C" int optMain( if (!DisableDITypeMap) Context.enableDebugTypeODRUniquing(); - Expected> RemarksFileOrErr = + Expected RemarksFileOrErr = setupLLVMOptimizationRemarks(Context, RemarksFilename, RemarksPasses, RemarksFormat, RemarksWithHotness, RemarksHotnessThreshold); @@ -518,7 +518,7 @@ extern "C" int optMain( errs() << toString(std::move(E)) << '\n'; return 1; } - std::unique_ptr RemarksFile = 
std::move(*RemarksFileOrErr); + LLVMRemarkFileHandle RemarksFile = std::move(*RemarksFileOrErr); // Load the input module... auto SetDataLayout = [&](StringRef IRTriple, diff --git a/llvm/unittests/ADT/APIntTest.cpp b/llvm/unittests/ADT/APIntTest.cpp index 116693c873f30..ca9f9f17ee112 100644 --- a/llvm/unittests/ADT/APIntTest.cpp +++ b/llvm/unittests/ADT/APIntTest.cpp @@ -3718,8 +3718,9 @@ TEST(APIntTest, ScaleBitMask) { TEST(APIntTest, DenseMap) { DenseMap Map; APInt ZeroWidthInt(0, 0, false); - Map.insert({ZeroWidthInt, 0}); - Map.find(ZeroWidthInt); + Map.insert({ZeroWidthInt, 123}); + auto It = Map.find(ZeroWidthInt); + EXPECT_EQ(It->second, 123); } TEST(APIntTest, TryExt) { diff --git a/llvm/unittests/ADT/BitVectorTest.cpp b/llvm/unittests/ADT/BitVectorTest.cpp index 6a4780c143e54..12ba0041af551 100644 --- a/llvm/unittests/ADT/BitVectorTest.cpp +++ b/llvm/unittests/ADT/BitVectorTest.cpp @@ -8,6 +8,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" #include "gtest/gtest.h" @@ -1177,6 +1178,98 @@ TYPED_TEST(BitVectorTest, Iterators) { EXPECT_EQ(List[i++], Bit); } +TYPED_TEST(BitVectorTest, BidirectionalIterator) { + // Test decrement operators. + TypeParam Vec(100, false); + Vec.set(10); + Vec.set(20); + Vec.set(30); + Vec.set(40); + + // Test that we can decrement from end(). + auto EndIt = Vec.set_bits_end(); + auto LastIt = EndIt; + --LastIt; + EXPECT_EQ(*LastIt, 40U); + + // Test post-decrement. + auto It = Vec.set_bits_end(); + auto PrevIt = It--; + EXPECT_EQ(PrevIt, Vec.set_bits_end()); + EXPECT_EQ(*It, 40U); + + // Test pre-decrement. + --It; + EXPECT_EQ(*It, 30U); + + // Test full backward iteration. + std::vector BackwardBits; + for (auto RIt = Vec.set_bits_end(); RIt != Vec.set_bits_begin();) { + --RIt; + BackwardBits.push_back(*RIt); + } + EXPECT_EQ(BackwardBits.size(), 4U); + EXPECT_EQ(BackwardBits[0], 40U); + EXPECT_EQ(BackwardBits[1], 30U); + EXPECT_EQ(BackwardBits[2], 20U); + EXPECT_EQ(BackwardBits[3], 10U); +} + +TYPED_TEST(BitVectorTest, ReverseIteration) { + // Test using llvm::reverse. + TypeParam Vec(100, false); + Vec.set(5); + Vec.set(15); + Vec.set(25); + Vec.set(35); + Vec.set(45); + + std::vector ReversedBits; + for (unsigned Bit : llvm::reverse(Vec.set_bits())) { + ReversedBits.push_back(Bit); + } + + EXPECT_EQ(ReversedBits.size(), 5U); + EXPECT_EQ(ReversedBits[0], 45U); + EXPECT_EQ(ReversedBits[1], 35U); + EXPECT_EQ(ReversedBits[2], 25U); + EXPECT_EQ(ReversedBits[3], 15U); + EXPECT_EQ(ReversedBits[4], 5U); +} + +TYPED_TEST(BitVectorTest, BidirectionalIteratorEdgeCases) { + // Test empty BitVector. + TypeParam Empty; + EXPECT_EQ(Empty.set_bits_begin(), Empty.set_bits_end()); + + // Decrementing end() on empty should give -1 (no bits set). + auto EmptyEndIt = Empty.set_bits_end(); + --EmptyEndIt; + // After decrement on empty, iterator should still be at "no bit" position. + EXPECT_EQ(*EmptyEndIt, static_cast(-1)); + + // Test single bit. + TypeParam Single(10, false); + Single.set(5); + + auto SingleIt = Single.set_bits_end(); + --SingleIt; + EXPECT_EQ(*SingleIt, 5U); + // After decrementing past the first element, the iterator is in an + // undefined state (before begin), so we don't test this case. + + // Test all bits set. 
+ TypeParam AllSet(10, true); + std::vector AllBitsReverse; + for (unsigned Bit : llvm::reverse(AllSet.set_bits())) { + AllBitsReverse.push_back(Bit); + } + EXPECT_EQ(AllBitsReverse.size(), 10U); + for (unsigned i = 0; i < 10; ++i) { + EXPECT_EQ(AllBitsReverse[i], 9 - i); + } +} + TYPED_TEST(BitVectorTest, PushBack) { TypeParam Vec(10, false); EXPECT_EQ(-1, Vec.find_first()); diff --git a/llvm/unittests/ADT/ImmutableSetTest.cpp b/llvm/unittests/ADT/ImmutableSetTest.cpp index c0bde4c4d680b..87bc2a8da4bad 100644 --- a/llvm/unittests/ADT/ImmutableSetTest.cpp +++ b/llvm/unittests/ADT/ImmutableSetTest.cpp @@ -164,4 +164,35 @@ TEST_F(ImmutableSetTest, IterLongSetTest) { ASSERT_EQ(6, i); } +TEST_F(ImmutableSetTest, AddIfNotFoundTest) { + ImmutableSet::Factory f(/*canonicalize=*/false); + ImmutableSet S = f.getEmptySet(); + S = f.add(S, 1); + S = f.add(S, 2); + S = f.add(S, 3); + + ImmutableSet T1 = f.add(S, 1); + ImmutableSet T2 = f.add(S, 2); + ImmutableSet T3 = f.add(S, 3); + EXPECT_EQ(S.getRoot(), T1.getRoot()); + EXPECT_EQ(S.getRoot(), T2.getRoot()); + EXPECT_EQ(S.getRoot(), T3.getRoot()); + + ImmutableSet U = f.add(S, 4); + EXPECT_NE(S.getRoot(), U.getRoot()); +} + +TEST_F(ImmutableSetTest, RemoveIfNotFoundTest) { + ImmutableSet::Factory f(/*canonicalize=*/false); + ImmutableSet S = f.getEmptySet(); + S = f.add(S, 1); + S = f.add(S, 2); + S = f.add(S, 3); + + ImmutableSet T = f.remove(S, 4); + EXPECT_EQ(S.getRoot(), T.getRoot()); + + ImmutableSet U = f.remove(S, 3); + EXPECT_NE(S.getRoot(), U.getRoot()); } +} // namespace diff --git a/llvm/unittests/ADT/PackedVectorTest.cpp b/llvm/unittests/ADT/PackedVectorTest.cpp index 30fc7c0b6d07f..df2cbf0e7f0f8 100644 --- a/llvm/unittests/ADT/PackedVectorTest.cpp +++ b/llvm/unittests/ADT/PackedVectorTest.cpp @@ -71,6 +71,14 @@ TEST(PackedVectorTest, RawBitsSize) { EXPECT_EQ(12u, Vec.raw_bits().size()); } +TEST(PackedVectorTest, SignedValueOverwrite) { + PackedVector Vec(1); + Vec[0] = -1; + EXPECT_EQ(-1, Vec[0]); + Vec[0] = 1; + EXPECT_EQ(1, Vec[0]); +} + #ifdef EXPECT_DEBUG_DEATH TEST(PackedVectorTest, UnsignedValues) { diff --git a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp index 03009d53d63f4..fa451fab67549 100644 --- a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp +++ b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp @@ -49,6 +49,9 @@ void getSimilarities( SimilarityCandidates = Identifier.findSimilarity(M); } +// TODO: All these tests could probably become IR LIT tests like +// IROutliner/outlining-special-state.ll + // Checks that different opcodes are mapped to different values TEST(IRInstructionMapper, OpcodeDifferentiation) { StringRef ModuleString = R"( diff --git a/llvm/unittests/CodeGen/TypeTraitsTest.cpp b/llvm/unittests/CodeGen/TypeTraitsTest.cpp index dde86280cff6a..f0ed0e870cbb3 100644 --- a/llvm/unittests/CodeGen/TypeTraitsTest.cpp +++ b/llvm/unittests/CodeGen/TypeTraitsTest.cpp @@ -6,13 +6,16 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/RDFRegisters.h" #include "llvm/CodeGen/RegisterPressure.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "gtest/gtest.h" +#include #include +#include using namespace llvm; @@ -23,3 +26,35 @@ static_assert(std::is_trivially_copyable_v, "trivially copyable"); static_assert(std::is_trivially_copyable_v, "trivially copyable"); 
static_assert(std::is_trivially_copyable_v, "trivially copyable"); + +// https://llvm.org/PR105169 +// Verify that we won't accidently specialize std::less and std::equal_to in a +// wrong way. +// C++17 [namespace.std]/2, C++20/23 [namespace.std]/5: +// A program may explicitly instantiate a template defined in the standard +// library only if the declaration +// - depends on the name of a user-defined type and +// - the instantiation meets the standard library requirements for the +// original template. +template constexpr bool CheckStdCmpRequirements() { + // std::less and std::equal_to are literal, default constructible, and + // copyable classes. + Fn f1{}; + auto f2 = f1; + auto f3 = std::move(f2); + f2 = f3; + f2 = std::move(f3); + + // Properties held on all known implementations, although not guaranteed by + // the standard. + static_assert(std::is_empty_v); + static_assert(std::is_trivially_default_constructible_v); + static_assert(std::is_trivially_copyable_v); + + return true; +} + +static_assert(CheckStdCmpRequirements>(), + "same as the original template"); +static_assert(CheckStdCmpRequirements>(), + "same as the original template"); diff --git a/llvm/unittests/DebugInfo/LogicalView/DWARFReaderTest.cpp b/llvm/unittests/DebugInfo/LogicalView/DWARFReaderTest.cpp index 78dc8502e9676..fb728c8c22e77 100644 --- a/llvm/unittests/DebugInfo/LogicalView/DWARFReaderTest.cpp +++ b/llvm/unittests/DebugInfo/LogicalView/DWARFReaderTest.cpp @@ -163,13 +163,12 @@ void checkUnspecifiedParameters(LVReader *Reader) { LVPublicNames::const_iterator IterNames = PublicNames.cbegin(); LVScope *Function = (*IterNames).first; EXPECT_EQ(Function->getName(), "foo_printf"); - const LVElements *Elements = Function->getChildren(); - ASSERT_NE(Elements, nullptr); + const LVElementsView Elements = Function->getChildren(); // foo_printf is a variadic function whose prototype is // `int foo_printf(const char *, ...)`, where the '...' is represented by a // DW_TAG_unspecified_parameters, i.e. we expect to find at least one child // for which getIsUnspecified() returns true. 
- EXPECT_TRUE(llvm::any_of(*Elements, [](const LVElement *elt) { + EXPECT_TRUE(llvm::any_of(Elements, [](const LVElement *elt) { return elt->getIsSymbol() && static_cast(elt)->getIsUnspecified(); })); @@ -183,8 +182,8 @@ void checkScopeModule(LVReader *Reader) { EXPECT_EQ(Root->getFileFormatName(), "Mach-O 64-bit x86-64"); EXPECT_EQ(Root->getName(), DwarfClangModule); - ASSERT_NE(CompileUnit->getChildren(), nullptr); - LVElement *FirstChild = *(CompileUnit->getChildren()->begin()); + LVElement *FirstChild = *(CompileUnit->getChildren().begin()); + ASSERT_NE(FirstChild, nullptr); EXPECT_EQ(FirstChild->getIsScope(), 1); LVScopeModule *Module = static_cast(FirstChild); EXPECT_EQ(Module->getIsModule(), 1); diff --git a/llvm/unittests/IR/DataLayoutTest.cpp b/llvm/unittests/IR/DataLayoutTest.cpp index e0c0f35847f07..9ca88141ca0eb 100644 --- a/llvm/unittests/IR/DataLayoutTest.cpp +++ b/llvm/unittests/IR/DataLayoutTest.cpp @@ -320,7 +320,8 @@ TEST(DataLayout, ParsePointerSpec) { "\"p[]::[:[:]]\"")); // address space - for (StringRef Str : {"p0x0:32:32", "px:32:32:32", "p16777216:32:32:32:32"}) + for (StringRef Str : + {"p0x0:32:32", "p10_000:32:32:32", "p16777216:32:32:32:32"}) EXPECT_THAT_EXPECTED( DataLayout::parse(Str), FailedWithMessage("address space must be a 24-bit integer")); @@ -401,6 +402,26 @@ TEST(DataLayout, ParsePointerSpec) { EXPECT_THAT_EXPECTED( DataLayout::parse(Str), FailedWithMessage("index size cannot be larger than the pointer size")); + + // Only 'e', 'u', and 'n' flags are valid. + EXPECT_THAT_EXPECTED( + DataLayout::parse("pa:32:32"), + FailedWithMessage("'a' is not a valid pointer specification flag")); + EXPECT_THAT_EXPECTED( + DataLayout::parse("puX:32:32"), + FailedWithMessage("'X' is not a valid pointer specification flag")); + // Flags must be before the address space number. + EXPECT_THAT_EXPECTED( + DataLayout::parse("p2n:32:32"), + FailedWithMessage("address space must be a 24-bit integer")); + + // AS0 cannot be non-integral. 
+ for (StringRef Str : {"pe:64:64", "pu:64:64", "pue:64:64", "pe0:64:64", + "pu0:64:64", "peu0:64:64"}) + EXPECT_THAT_EXPECTED( + DataLayout::parse(Str), + FailedWithMessage( + "address space 0 cannot be unstable or have external state")); } TEST(DataLayoutTest, ParseNativeIntegersSpec) { @@ -556,18 +577,127 @@ TEST(DataLayout, GetPointerPrefAlignment) { } TEST(DataLayout, IsNonIntegralAddressSpace) { - DataLayout Default; - EXPECT_THAT(Default.getNonIntegralAddressSpaces(), ::testing::SizeIs(0)); + const DataLayout Default; + EXPECT_THAT(Default.getNonStandardAddressSpaces(), ::testing::SizeIs(0)); EXPECT_FALSE(Default.isNonIntegralAddressSpace(0)); EXPECT_FALSE(Default.isNonIntegralAddressSpace(1)); - DataLayout Custom = cantFail(DataLayout::parse("ni:2:16777215")); - EXPECT_THAT(Custom.getNonIntegralAddressSpaces(), + const DataLayout Custom = cantFail(DataLayout::parse("ni:2:16777215")); + EXPECT_THAT(Custom.getNonStandardAddressSpaces(), ::testing::ElementsAreArray({2U, 16777215U})); EXPECT_FALSE(Custom.isNonIntegralAddressSpace(0)); EXPECT_FALSE(Custom.isNonIntegralAddressSpace(1)); EXPECT_TRUE(Custom.isNonIntegralAddressSpace(2)); + EXPECT_TRUE(Custom.mustNotIntroduceIntToPtr(2)); + EXPECT_TRUE(Custom.mustNotIntroducePtrToInt(2)); EXPECT_TRUE(Custom.isNonIntegralAddressSpace(16777215)); + EXPECT_TRUE(Custom.mustNotIntroduceIntToPtr(16777215)); + EXPECT_TRUE(Custom.mustNotIntroducePtrToInt(16777215)); + + // Pointers are marked as non-integral if the address size != total size + for (const auto *Layout : {"p2:64:64:64:32", "p2:128:64:64:64"}) { + const DataLayout DL = cantFail(DataLayout::parse(Layout)); + EXPECT_TRUE(DL.isNonIntegralAddressSpace(2)); + EXPECT_FALSE(DL.hasUnstableRepresentation(2)); + EXPECT_FALSE(DL.hasExternalState(2)); + EXPECT_FALSE(DL.mustNotIntroduceIntToPtr(2)); + EXPECT_FALSE(DL.mustNotIntroducePtrToInt(2)); + EXPECT_THAT(DL.getNonStandardAddressSpaces(), + ::testing::ElementsAreArray({2U})); + } + // Pointers can be marked as unstable using 'pu' + for (const auto *Layout : {"pu2:64:64:64:64", "pu2:64:64:64:32"}) { + const DataLayout DL = cantFail(DataLayout::parse(Layout)); + // Note: isNonIntegralAddressSpace returns true for even with index == + EXPECT_TRUE(DL.isNonIntegralAddressSpace(2)); + EXPECT_TRUE(DL.hasUnstableRepresentation(2)); + EXPECT_FALSE(DL.hasExternalState(2)); + EXPECT_TRUE(DL.mustNotIntroducePtrToInt(2)); + EXPECT_TRUE(DL.mustNotIntroduceIntToPtr(2)); + EXPECT_THAT(DL.getNonStandardAddressSpaces(), + ::testing::ElementsAreArray({2U})); + } + + // Non-integral pointers with external state ('e' flag). 
+ for (const auto *Layout : {"pe2:64:64:64:32", "pe2:64:64:64:64"}) { + const DataLayout DL = cantFail(DataLayout::parse(Layout)); + EXPECT_TRUE(DL.isNonIntegralAddressSpace(2)); + EXPECT_TRUE(DL.hasExternalState(2)); + EXPECT_TRUE(DL.mustNotIntroduceIntToPtr(2)); + EXPECT_FALSE(DL.mustNotIntroducePtrToInt(2)); + EXPECT_FALSE(DL.hasUnstableRepresentation(2)); + EXPECT_THAT(DL.getNonStandardAddressSpaces(), + ::testing::ElementsAreArray({2U})); + } + + // It is also possible to have both unstable representation and external state + for (const auto *Layout : {"peu2:64:64:64:32", "pue2:128:64:64:64"}) { + const DataLayout DL = cantFail(DataLayout::parse(Layout)); + EXPECT_TRUE(DL.isNonIntegralAddressSpace(2)); + EXPECT_TRUE(DL.hasExternalState(2)); + EXPECT_TRUE(Custom.mustNotIntroduceIntToPtr(2)); + EXPECT_TRUE(Custom.mustNotIntroducePtrToInt(2)); + EXPECT_TRUE(DL.hasUnstableRepresentation(2)); + EXPECT_THAT(DL.getNonStandardAddressSpaces(), + ::testing::ElementsAreArray({2U})); + } + + // For backwards compatibility, the ni DataLayout part overrides any + // p[e][u]. + for (const auto *Layout : + {"ni:2-p2:64:64:64:32", "ni:2-pu2:64:64:64:32", "ni:2-pu2:64:64:64:32", + "p2:64:64:64:32-ni:2", "pu2:64:64:64:32-ni:2", "pe2:64:64:64:32-ni:2", + "peeee2:64:64:64:32-pu2:64:64:64:32-ni:2"}) { + DataLayout DL = cantFail(DataLayout::parse(Layout)); + EXPECT_TRUE(DL.isNonIntegralAddressSpace(2)); + EXPECT_TRUE(DL.hasUnstableRepresentation(2)); + // The external state property is new and not expected for existing uses of + // non-integral pointers, so existing :ni data layouts should not set it. + EXPECT_FALSE(DL.hasExternalState(2)); + EXPECT_THAT(DL.getNonStandardAddressSpaces(), + ::testing::ElementsAreArray({2U})); + } +} + +TEST(DataLayout, NonIntegralHelpers) { + DataLayout DL = cantFail(DataLayout::parse( + "p1:128:128:128:64-pu2:32:32:32:32-pu3:64:64:64:32-pe4:64:64:64:32")); + EXPECT_THAT(DL.getNonStandardAddressSpaces(), + ::testing::ElementsAreArray({1u, 2u, 3u, 4u})); + struct Result { + unsigned Addrspace; + bool NonIntegral; + bool Unstable; + bool ExternalState; + unsigned Size; + } ExpectedResults[] = { + {0, false, false, false, 64}, {1, true, false, false, 128}, + {2, true, true, false, 32}, {3, true, true, false, 64}, + {4, true, false, true, 64}, + }; + LLVMContext Ctx; + for (const auto &Exp : ExpectedResults) { + EXPECT_EQ(Exp.NonIntegral, DL.isNonIntegralAddressSpace(Exp.Addrspace)); + EXPECT_EQ(Exp.Unstable, DL.hasUnstableRepresentation(Exp.Addrspace)); + EXPECT_EQ(Exp.ExternalState, DL.hasExternalState(Exp.Addrspace)); + bool AvoidIntToPtr = Exp.Unstable || Exp.ExternalState; + EXPECT_EQ(AvoidIntToPtr, DL.mustNotIntroduceIntToPtr(Exp.Addrspace)); + bool AvoidPtrToInt = Exp.Unstable; + EXPECT_EQ(AvoidPtrToInt, DL.mustNotIntroducePtrToInt(Exp.Addrspace)); + Type *PtrTy = PointerType::get(Ctx, Exp.Addrspace); + Type *PtrVecTy = VectorType::get(PtrTy, 2, /*Scalable=*/false); + Type *ScalablePtrVecTy = VectorType::get(PtrTy, 1, /*Scalable=*/true); + for (Type *Ty : {PtrTy, PtrVecTy, ScalablePtrVecTy}) { + EXPECT_EQ(AvoidPtrToInt, DL.mustNotIntroducePtrToInt(Ty)); + EXPECT_EQ(AvoidIntToPtr, DL.mustNotIntroduceIntToPtr(Ty)); + // The old API should return true for both unstable and non-integral. 
+ EXPECT_EQ(Exp.Unstable || Exp.NonIntegral, + DL.isNonIntegralPointerType(Ty)); + } + // Both helpers gracefully handle non-pointer, non-vector-of-pointers: + EXPECT_FALSE(DL.mustNotIntroducePtrToInt(IntegerType::getInt1Ty(Ctx))); + EXPECT_FALSE(DL.mustNotIntroduceIntToPtr(IntegerType::getInt1Ty(Ctx))); + } } TEST(DataLayoutTest, CopyAssignmentInvalidatesStructLayout) { diff --git a/llvm/unittests/MC/StringTableBuilderTest.cpp b/llvm/unittests/MC/StringTableBuilderTest.cpp index 05f469a229bf9..44a985be6cfcb 100644 --- a/llvm/unittests/MC/StringTableBuilderTest.cpp +++ b/llvm/unittests/MC/StringTableBuilderTest.cpp @@ -58,8 +58,8 @@ TEST(StringTableBuilderTest, BasicWinCOFF) { std::string Expected; - ExpectedSize = support::endian::byte_swap( - ExpectedSize); + ExpectedSize = support::endian::byte_swap(ExpectedSize, + llvm::endianness::little); Expected.append((const char*)&ExpectedSize, 4); Expected += "pygmy hippopotamus"; Expected += '\x00'; diff --git a/llvm/unittests/Object/DXContainerTest.cpp b/llvm/unittests/Object/DXContainerTest.cpp index 396d060a75bfd..d6f7b26b99cd7 100644 --- a/llvm/unittests/Object/DXContainerTest.cpp +++ b/llvm/unittests/Object/DXContainerTest.cpp @@ -1200,4 +1200,52 @@ TEST(RootSignature, ParseStaticSamplers) { ASSERT_EQ(Sampler.RegisterSpace, 32u); ASSERT_EQ(Sampler.ShaderVisibility, 7u); } + { + // this is testing static sampler parsing for root signature version 1.2, + // it changes: the version number, the size of root signature being emitted + // and the values for flag fields. + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x90, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x4c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, + 0xa4, 0x70, 0x9d, 0x3f, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x85, 0xeb, 0x91, 0x40, 0x66, 0x66, 0x0e, 0x41, + 0x1f, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00}; + DXContainer C = + llvm::cantFail(DXContainer::create(getMemoryBuffer<148>(Buffer))); + + auto MaybeRS = C.getRootSignature(); + ASSERT_TRUE(MaybeRS.has_value()); + const auto &RS = MaybeRS.value(); + ASSERT_EQ(RS.getVersion(), 3U); + ASSERT_EQ(RS.getNumParameters(), 0U); + ASSERT_EQ(RS.getRootParametersOffset(), 0U); + ASSERT_EQ(RS.getNumStaticSamplers(), 1U); + ASSERT_EQ(RS.getStaticSamplersOffset(), 24U); + ASSERT_EQ(RS.getFlags(), 17U); + + auto Sampler = *RS.samplers().begin(); + + ASSERT_EQ(Sampler.Filter, 10U); + ASSERT_EQ(Sampler.AddressU, 1U); + ASSERT_EQ(Sampler.AddressV, 2U); + ASSERT_EQ(Sampler.AddressW, 5U); + ASSERT_FLOAT_EQ(Sampler.MipLODBias, 1.23F); + ASSERT_EQ(Sampler.MaxAnisotropy, 20U); + ASSERT_EQ(Sampler.ComparisonFunc, 4U); + ASSERT_EQ(Sampler.BorderColor, 0U); + ASSERT_FLOAT_EQ(Sampler.MinLOD, 4.56F); + ASSERT_FLOAT_EQ(Sampler.MaxLOD, 8.9F); + ASSERT_EQ(Sampler.ShaderRegister, 31U); + ASSERT_EQ(Sampler.RegisterSpace, 32U); + ASSERT_EQ(Sampler.ShaderVisibility, 7U); + ASSERT_EQ(Sampler.Flags, 1U); + } } diff --git 
a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp index b0ad208625436..1b21fe01dfca9 100644 --- a/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp +++ b/llvm/unittests/ObjectYAML/DXContainerYAMLTest.cpp @@ -526,3 +526,54 @@ TEST(RootSignature, ParseStaticSamplers) { EXPECT_EQ(Storage.size(), 144u); EXPECT_TRUE(memcmp(Buffer, Storage.data(), 144u) == 0); } + +TEST(RootSignature, ParseStaticSamplersV13) { + SmallString<128> Storage; + + // First read a fully explicit yaml with all sizes and offsets provided + ASSERT_TRUE(convert(Storage, R"(--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + PartCount: 1 + PartOffsets: [ 60 ] +Parts: + - Name: RTS0 + Size: 76 + RootSignature: + Version: 3 + NumRootParameters: 0 + RootParametersOffset: 24 + NumStaticSamplers: 1 + StaticSamplersOffset: 24 + Parameters: [] + Samplers: + - ShaderRegister: 31 + RegisterSpace: 32 + ShaderVisibility: All + SAMPLER_FLAG_UINT_BORDER_COLOR: true + AllowInputAssemblerInputLayout: true + DenyGeometryShaderRootAccess: true + )")); + + uint8_t Buffer[] = { + 0x44, 0x58, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x90, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x52, 0x54, 0x53, 0x30, 0x4c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x18, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x7f, 0x7f, + 0x1f, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00}; + + EXPECT_EQ(Storage.size(), 148U); + EXPECT_TRUE(memcmp(Buffer, Storage.data(), 148U) == 0); +} diff --git a/llvm/unittests/Remarks/BitstreamRemarksFormatTest.cpp b/llvm/unittests/Remarks/BitstreamRemarksFormatTest.cpp index ea61691f4c835..ddf744521ef13 100644 --- a/llvm/unittests/Remarks/BitstreamRemarksFormatTest.cpp +++ b/llvm/unittests/Remarks/BitstreamRemarksFormatTest.cpp @@ -21,7 +21,7 @@ TEST(BitstreamRemarksFormat, Magic) { // This should be updated whenever any of the tests below are modified. TEST(BitstreamRemarksFormat, ContainerVersion) { - EXPECT_EQ(remarks::CurrentContainerVersion, 0UL); + EXPECT_EQ(remarks::CurrentContainerVersion, 1UL); } // The values of the current blocks should not change over time. diff --git a/llvm/unittests/Remarks/BitstreamRemarksParsingTest.cpp b/llvm/unittests/Remarks/BitstreamRemarksParsingTest.cpp index 6234931b3bece..f5973f5431c9c 100644 --- a/llvm/unittests/Remarks/BitstreamRemarksParsingTest.cpp +++ b/llvm/unittests/Remarks/BitstreamRemarksParsingTest.cpp @@ -14,7 +14,7 @@ using namespace llvm; -template void parseGood(const char (&Buf)[N]) { +template static void parseGood(const char (&Buf)[N]) { // 1. Parse the YAML remark -> FromYAMLRemark // 2. Serialize it to bitstream -> BSStream // 3. 
Parse it back -> FromBSRemark @@ -48,11 +48,11 @@ template void parseGood(const char (&Buf)[N]) { std::string BSBuf; raw_string_ostream BSStream(BSBuf); Expected> BSSerializer = - remarks::createRemarkSerializer(remarks::Format::Bitstream, - remarks::SerializerMode::Standalone, - BSStream, std::move(BSStrTab)); + remarks::createRemarkSerializer(remarks::Format::Bitstream, BSStream, + std::move(BSStrTab)); EXPECT_FALSE(errorToBool(BSSerializer.takeError())); (*BSSerializer)->emit(*FromYAMLRemark); + (*BSSerializer)->finalize(); // 3. Expected> MaybeBSParser = @@ -256,11 +256,11 @@ TEST(BitstreamRemarks, ContentsCAPI) { std::string BSBuf; raw_string_ostream BSStream(BSBuf); Expected> BSSerializer = - remarks::createRemarkSerializer(remarks::Format::Bitstream, - remarks::SerializerMode::Standalone, - BSStream, std::move(BSStrTab)); + remarks::createRemarkSerializer(remarks::Format::Bitstream, BSStream, + std::move(BSStrTab)); EXPECT_FALSE(errorToBool(BSSerializer.takeError())); (*BSSerializer)->emit(ToSerializeRemark); + (*BSSerializer)->finalize(); StringRef Buf = BSStream.str(); LLVMRemarkParserRef Parser = diff --git a/llvm/unittests/Remarks/BitstreamRemarksSerializerTest.cpp b/llvm/unittests/Remarks/BitstreamRemarksSerializerTest.cpp index 8113d35b3aff8..3b460965fdb23 100644 --- a/llvm/unittests/Remarks/BitstreamRemarksSerializerTest.cpp +++ b/llvm/unittests/Remarks/BitstreamRemarksSerializerTest.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Bitcode/BitcodeAnalyzer.h" -#include "llvm/Remarks/BitstreamRemarkSerializer.h" #include "llvm/Remarks/Remark.h" +#include "llvm/Remarks/RemarkSerializer.h" #include "llvm/Support/raw_ostream.h" #include "gtest/gtest.h" #include @@ -34,23 +34,24 @@ static void checkAnalyze(StringRef Input, StringRef Expected) { EXPECT_EQ(OutputOS.str(), Expected); } -static void check(remarks::SerializerMode Mode, const remarks::Remark &R, - StringRef ExpectedR, std::optional ExpectedMeta, - std::optional StrTab) { +static void check(const remarks::Remark &R, StringRef ExpectedR, + std::optional ExpectedMeta = std::nullopt, + std::optional StrTab = std::nullopt) { // Emit the remark. std::string InputBuf; raw_string_ostream InputOS(InputBuf); Expected> MaybeSerializer = [&] { if (StrTab) - return createRemarkSerializer(remarks::Format::Bitstream, Mode, InputOS, + return createRemarkSerializer(remarks::Format::Bitstream, InputOS, std::move(*StrTab)); else - return createRemarkSerializer(remarks::Format::Bitstream, Mode, InputOS); + return createRemarkSerializer(remarks::Format::Bitstream, InputOS); }(); EXPECT_FALSE(errorToBool(MaybeSerializer.takeError())); std::unique_ptr Serializer = std::move(*MaybeSerializer); Serializer->emit(R); + Serializer->finalize(); // Analyze the serialized remark. 
checkAnalyze(InputOS.str(), ExpectedR); @@ -66,20 +67,6 @@ static void check(remarks::SerializerMode Mode, const remarks::Remark &R, } } -static void check(const remarks::Remark &R, StringRef ExpectedR, - StringRef ExpectedMeta, - std::optional StrTab = std::nullopt) { - return check(remarks::SerializerMode::Separate, R, ExpectedR, ExpectedMeta, - std::move(StrTab)); -} - -static void -checkStandalone(const remarks::Remark &R, StringRef ExpectedR, - std::optional StrTab = std::nullopt) { - return check(remarks::SerializerMode::Standalone, R, ExpectedR, - /*ExpectedMeta=*/std::nullopt, std::move(StrTab)); -} - TEST(BitstreamRemarkSerializer, SeparateRemarkFileNoOptionals) { remarks::Remark R; R.RemarkType = remarks::Type::Missed; @@ -89,19 +76,21 @@ TEST(BitstreamRemarkSerializer, SeparateRemarkFileNoOptionals) { check(R, "\n" "\n" - " \n" + " \n" " \n" "\n" "\n" " \n" - "\n", - "\n" - "\n" - " \n" - " blob data = " + "\n" + "\n" + " blob data = " "'remark\\x00pass\\x00function\\x00'\n" - " blob data = " - "'" EXTERNALFILETESTPATH"'\n" + "\n", + "\n" + "\n" + " \n" + " blob data = " + "'" EXTERNALFILETESTPATH "'\n" "\n"); } @@ -118,19 +107,21 @@ TEST(BitstreamRemarkSerializer, SeparateRemarkFileNoOptionalsSeparateStrTab) { check(R, "\n" "\n" - " \n" + " \n" " \n" "\n" "\n" " \n" - "\n", - "\n" - "\n" - " \n" - " blob data = " + "\n" + "\n" + " blob data = " "'function\\x00pass\\x00remark\\x00'\n" - " blob data = " - "'" EXTERNALFILETESTPATH"'\n" + "\n", + "\n" + "\n" + " \n" + " blob data = " + "'" EXTERNALFILETESTPATH "'\n" "\n", std::move(StrTab)); } @@ -148,20 +139,22 @@ TEST(BitstreamRemarkSerializer, SeparateRemarkFileDebugLoc) { check(R, "\n" "\n" - " \n" + " \n" " \n" "\n" "\n" " \n" " \n" - "\n", - "\n" - "\n" - " \n" - " blob data = " + "\n" + "\n" + " blob data = " "'remark\\x00pass\\x00function\\x00path\\x00'\n" - " blob data = " - "'" EXTERNALFILETESTPATH"'\n" + "\n", + "\n" + "\n" + " \n" + " blob data = " + "'" EXTERNALFILETESTPATH "'\n" "\n"); } @@ -175,20 +168,22 @@ TEST(BitstreamRemarkSerializer, SeparateRemarkFileHotness) { check(R, "\n" "\n" - " \n" + " \n" " \n" "\n" "\n" " \n" " \n" - "\n", - "\n" - "\n" - " \n" - " blob data = " + "\n" + "\n" + " blob data = " "'remark\\x00pass\\x00function\\x00'\n" - " blob data = " - "'" EXTERNALFILETESTPATH"'\n" + "\n", + "\n" + "\n" + " \n" + " blob data = " + "'" EXTERNALFILETESTPATH "'\n" "\n"); } @@ -204,20 +199,22 @@ TEST(BitstreamRemarkSerializer, SeparateRemarkFileArgNoDebugLoc) { check(R, "\n" "\n" - " \n" + " \n" " \n" "\n" "\n" " \n" " \n" - "\n", - "\n" - "\n" - " \n" - " blob data = " + "\n" + "\n" + " blob data = " "'remark\\x00pass\\x00function\\x00key\\x00value\\x00'\n" - " blob data = " - "'" EXTERNALFILETESTPATH"'\n" + "\n", + "\n" + "\n" + " \n" + " blob data = " + "'" EXTERNALFILETESTPATH "'\n" "\n"); } @@ -237,21 +234,23 @@ TEST(BitstreamRemarkSerializer, SeparateRemarkFileArgDebugLoc) { check(R, "\n" "\n" - " \n" + " \n" " \n" "\n" "\n" " \n" " \n" - "\n", - "\n" - "\n" - " \n" - " blob data = " + "\n" + "\n" + " blob data = " "'remark\\x00pass\\x00function\\x00key\\x00value\\x00path\\x00'\n" - " blob data = " - "'" EXTERNALFILETESTPATH"'\n" + "\n", + "\n" + "\n" + " \n" + " blob data = " + "'" EXTERNALFILETESTPATH "'\n" "\n"); } @@ -276,7 +275,7 @@ TEST(BitstreamRemarkSerializer, SeparateRemarkFileAll) { check(R, "\n" "\n" - " \n" + " \n" " \n" "\n" "\n" @@ -285,14 +284,17 @@ TEST(BitstreamRemarkSerializer, SeparateRemarkFileAll) { " \n" " \n" - "\n", - "\n" - "\n" - " \n" - " blob data = " + "\n" + "\n" + " 
blob data = " "'remark\\x00pass\\x00function\\x00path\\x00key\\x00value\\x00argpa" - "th\\x00'\n blob data = " - "'" EXTERNALFILETESTPATH"'\n" + "th\\x00'\n" + "\n", + "\n" + "\n" + " \n" + " blob data = " + "'" EXTERNALFILETESTPATH "'\n" "\n"); } @@ -323,15 +325,12 @@ TEST(BitstreamRemarkSerializer, Standalone) { R.Args.back().Loc->SourceFilePath = "argpath"; R.Args.back().Loc->SourceLine = 11; R.Args.back().Loc->SourceColumn = 66; - checkStandalone( + check( R, "\n" - "\n" - " \n" + "\n" + " \n" " \n" - " blob data = " - "'pass\\x00remark\\x00function\\x00path\\x00key\\x00value\\x00argpath\\x0" - "0'\n" "\n" "\n" " \n" @@ -339,6 +338,11 @@ TEST(BitstreamRemarkSerializer, Standalone) { " \n" " \n" - "\n", - std::move(StrTab)); + "\n" + "\n" + " blob data = " + "'pass\\x00remark\\x00function\\x00path\\x00key\\x00value\\x00argpath\\x0" + "0'\n" + "\n", + std::nullopt, std::move(StrTab)); } diff --git a/llvm/unittests/Remarks/RemarksLinkingTest.cpp b/llvm/unittests/Remarks/RemarksLinkingTest.cpp index 89de9e8f4f95d..54942ff681b47 100644 --- a/llvm/unittests/Remarks/RemarksLinkingTest.cpp +++ b/llvm/unittests/Remarks/RemarksLinkingTest.cpp @@ -133,16 +133,18 @@ TEST(Remarks, LinkingGoodBitstream) { "...\n", remarks::Format::Bitstream, "\n" - "\n" - " \n" + "\n" + " \n" " \n" - " blob data = " - "'inline\\x00NoDefinition\\x00foo\\x00file.c\\x00'\n" "\n" "\n" " \n" " \n" - "\n"); + "\n" + "\n" + " blob data = " + "'inline\\x00NoDefinition\\x00foo\\x00file.c\\x00'\n" + "\n"); // Check that we keep remarks without debug info. check(remarks::Format::YAML, @@ -153,15 +155,17 @@ TEST(Remarks, LinkingGoodBitstream) { "...\n", remarks::Format::Bitstream, "\n" - "\n" - " \n" + "\n" + " \n" " \n" - " blob data = " - "'inline\\x00NoDefinition\\x00foo\\x00'\n" "\n" "\n" " \n" - "\n"); + "\n" + "\n" + " blob data = " + "'inline\\x00NoDefinition\\x00foo\\x00'\n" + "\n"); // Check that we deduplicate remarks. check(remarks::Format::YAML, @@ -179,16 +183,18 @@ TEST(Remarks, LinkingGoodBitstream) { "...\n", remarks::Format::Bitstream, "\n" - "\n" - " \n" + "\n" + " \n" " \n" - " blob data = " - "'inline\\x00NoDefinition\\x00foo\\x00file.c\\x00'\n" "\n" "\n" " \n" " \n" - "\n"); + "\n" + "\n" + " blob data = " + "'inline\\x00NoDefinition\\x00foo\\x00file.c\\x00'\n" + "\n"); } TEST(Remarks, LinkingGoodStrTab) { @@ -209,11 +215,9 @@ TEST(Remarks, LinkingGoodStrTab) { "...\n", remarks::Format::Bitstream, "\n" - "\n" - " \n" + "\n" + " \n" " \n" - " blob data = " - "'inline\\x00NoDefinition\\x00foo\\x00file.c\\x00Ok\\x00'\n" "\n" "\n" " \n" @@ -222,7 +226,11 @@ TEST(Remarks, LinkingGoodStrTab) { "\n" " \n" " \n" - "\n"); + "\n" + "\n" + " blob data = " + "'inline\\x00NoDefinition\\x00foo\\x00file.c\\x00Ok\\x00'\n" + "\n"); } // Check that we propagate parsing errors. 
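The remark serializer call sites updated above all follow the same pattern: createRemarkSerializer no longer takes a SerializerMode argument, and the updated unit tests now call finalize() once the last remark has been emitted. A minimal sketch of that flow, assuming YAML output; emitAllRemarks is a made-up helper name for illustration, and the Expected<> template arguments are spelled out as declared in RemarkSerializer.h and RemarkParser.h rather than copied from this patch:

// Illustrative sketch only -- distilled from the call sites changed in this patch.
#include "llvm/Remarks/RemarkParser.h"
#include "llvm/Remarks/RemarkSerializer.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static Error emitAllRemarks(remarks::RemarkParser &Parser, raw_ostream &OS) {
  // Post-patch signature: format and output stream only, no SerializerMode.
  Expected<std::unique_ptr<remarks::RemarkSerializer>> MaybeSerializer =
      remarks::createRemarkSerializer(remarks::Format::YAML, OS);
  if (!MaybeSerializer)
    return MaybeSerializer.takeError();
  remarks::RemarkSerializer &Serializer = **MaybeSerializer;

  // Emit every remark the parser produces.
  Expected<std::unique_ptr<remarks::Remark>> MaybeRemark = Parser.next();
  for (; MaybeRemark; MaybeRemark = Parser.next())
    Serializer.emit(**MaybeRemark);

  // The updated tests add an explicit finalize() after the last emit().
  Serializer.finalize();

  // The parser signals normal termination with EndOfFileError.
  Error E = MaybeRemark.takeError();
  if (!E.isA<remarks::EndOfFileError>())
    return E;
  consumeError(std::move(E));
  return Error::success();
}

The three-argument overload used in the bitstream tests (with a pre-filled string table) follows the same emit()/finalize() sequence.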
diff --git a/llvm/unittests/Remarks/YAMLRemarksSerializerTest.cpp b/llvm/unittests/Remarks/YAMLRemarksSerializerTest.cpp index 7e994ac4d58bc..974356d9cf30a 100644 --- a/llvm/unittests/Remarks/YAMLRemarksSerializerTest.cpp +++ b/llvm/unittests/Remarks/YAMLRemarksSerializerTest.cpp @@ -23,23 +23,23 @@ using namespace llvm; static void check(remarks::Format SerializerFormat, - remarks::SerializerMode Mode, ArrayRef Rs, - StringRef ExpectedR, std::optional ExpectedMeta, + ArrayRef Rs, StringRef ExpectedR, + std::optional ExpectedMeta, std::optional StrTab = std::nullopt) { std::string Buf; raw_string_ostream OS(Buf); Expected> MaybeS = [&] { if (StrTab) - return createRemarkSerializer(SerializerFormat, Mode, OS, - std::move(*StrTab)); + return createRemarkSerializer(SerializerFormat, OS, std::move(*StrTab)); else - return createRemarkSerializer(SerializerFormat, Mode, OS); + return createRemarkSerializer(SerializerFormat, OS); }(); EXPECT_FALSE(errorToBool(MaybeS.takeError())); std::unique_ptr S = std::move(*MaybeS); for (const remarks::Remark &R : Rs) S->emit(R); + S->finalize(); EXPECT_EQ(OS.str(), ExpectedR); if (ExpectedMeta) { @@ -54,8 +54,7 @@ static void check(remarks::Format SerializerFormat, static void check(remarks::Format SerializerFormat, const remarks::Remark &R, StringRef ExpectedR, StringRef ExpectedMeta, std::optional StrTab = std::nullopt) { - return check(SerializerFormat, remarks::SerializerMode::Separate, - ArrayRef(&R, &R + 1), ExpectedR, ExpectedMeta, + return check(SerializerFormat, ArrayRef(&R, &R + 1), ExpectedR, ExpectedMeta, std::move(StrTab)); } @@ -63,8 +62,7 @@ static void checkStandalone(remarks::Format SerializerFormat, const remarks::Remark &R, StringRef ExpectedR, std::optional StrTab = std::nullopt) { - return check(SerializerFormat, remarks::SerializerMode::Standalone, - ArrayRef(&R, &R + 1), ExpectedR, + return check(SerializerFormat, ArrayRef(&R, &R + 1), ExpectedR, /*ExpectedMeta=*/std::nullopt, std::move(StrTab)); } @@ -165,3 +163,33 @@ TEST(YAMLRemarks, SerializerRemarkParsedStrTabStandaloneNoStrTab) { "...\n"), std::move(PreFilledStrTab)); } + +TEST(YAMLRemarks, SerializerRemarkStringRefOOBRead) { + remarks::Remark R; + R.RemarkType = remarks::Type::Missed; + R.PassName = StringRef("passAAAA", 4); + R.RemarkName = StringRef("nameAAAA", 4); + R.FunctionName = StringRef("funcAAAA", 4); + R.Loc = remarks::RemarkLocation{StringRef("pathAAAA", 4), 3, 4}; + R.Hotness = 5; + R.Args.emplace_back(); + R.Args.back().Key = StringRef("keyAAAA", 3); + R.Args.back().Val = StringRef("valueAAAA", 5); + R.Args.emplace_back(); + R.Args.back().Key = StringRef("keydebugAAAA", 8); + R.Args.back().Val = StringRef("valuedebugAAAA", 10); + R.Args.back().Loc = + remarks::RemarkLocation{StringRef("argpathAAAA", 7), 6, 7}; + checkStandalone(remarks::Format::YAML, R, + "--- !Missed\n" + "Pass: pass\n" + "Name: name\n" + "DebugLoc: { File: path, Line: 3, Column: 4 }\n" + "Function: func\n" + "Hotness: 5\n" + "Args:\n" + " - key: value\n" + " - keydebug: valuedebug\n" + " DebugLoc: { File: argpath, Line: 6, Column: 7 }\n" + "...\n"); +} diff --git a/llvm/unittests/Support/AlignmentTest.cpp b/llvm/unittests/Support/AlignmentTest.cpp index 7b771977027b4..3a4416128d0e5 100644 --- a/llvm/unittests/Support/AlignmentTest.cpp +++ b/llvm/unittests/Support/AlignmentTest.cpp @@ -44,6 +44,16 @@ TEST(AlignmentTest, AlignConstexprConstant) { EXPECT_EQ(Align(alignof(uint64_t)), kConstantAlign); } +TEST(AlignmentTest, ConstexprAssign) { + constexpr auto assignAndGet = []() constexpr { + Align A 
= Align::Constant<8>(); + Align B = Align::Constant<16>(); + A = B; + return A.value(); + }; + static_assert(assignAndGet() == 16); +} + std::vector getValidAlignments() { std::vector Out; for (size_t Shift = 0; Shift < 64; ++Shift) diff --git a/llvm/unittests/Support/CommandLineTest.cpp b/llvm/unittests/Support/CommandLineTest.cpp index 88e6445190b59..7f538f155be15 100644 --- a/llvm/unittests/Support/CommandLineTest.cpp +++ b/llvm/unittests/Support/CommandLineTest.cpp @@ -2117,6 +2117,22 @@ TEST(CommandLineTest, ConsumeAfterTwoPositionals) { EXPECT_TRUE(Errs.empty()); } +TEST(CommandLineTest, ConsumeOptionalString) { + cl::ResetCommandLineParser(); + + StackOption, cl::opt>> + Input("input"); + + const char *Args[] = {"prog", "--input=\"value\""}; + + std::string Errs; + raw_string_ostream OS(Errs); + ASSERT_TRUE(cl::ParseCommandLineOptions(2, Args, StringRef(), &OS)); + ASSERT_TRUE(Input.has_value()); + EXPECT_EQ("\"value\"", *Input); + EXPECT_TRUE(Errs.empty()); +} + TEST(CommandLineTest, ResetAllOptionOccurrences) { cl::ResetCommandLineParser(); diff --git a/llvm/unittests/Support/EndianTest.cpp b/llvm/unittests/Support/EndianTest.cpp index c48b7707b7751..0ee631db74ac1 100644 --- a/llvm/unittests/Support/EndianTest.cpp +++ b/llvm/unittests/Support/EndianTest.cpp @@ -24,16 +24,15 @@ TEST(Endian, Read) { unsigned char littleval[] = {0x00, 0x04, 0x03, 0x02, 0x01}; int32_t BigAsHost = 0x00010203; EXPECT_EQ(BigAsHost, - (endian::read(bigval))); + (endian::read(bigval, llvm::endianness::big))); int32_t LittleAsHost = 0x02030400; - EXPECT_EQ( - LittleAsHost, - (endian::read(littleval))); + EXPECT_EQ(LittleAsHost, (endian::read( + littleval, llvm::endianness::little))); EXPECT_EQ( - (endian::read(bigval + 1)), - (endian::read(littleval + - 1))); + (endian::read(bigval + 1, llvm::endianness::big)), + (endian::read(littleval + 1, + llvm::endianness::little))); } TEST(Endian, WriteNext) { diff --git a/llvm/unittests/Support/FileCollectorTest.cpp b/llvm/unittests/Support/FileCollectorTest.cpp index 184d0e3fdfd17..0ece86947b4f2 100644 --- a/llvm/unittests/Support/FileCollectorTest.cpp +++ b/llvm/unittests/Support/FileCollectorTest.cpp @@ -43,7 +43,8 @@ class TestingFileCollector : public FileCollector { TEST(FileCollectorTest, addFile) { TempDir root("add_file_root", /*Unique*/ true); std::string root_fs(root.path()); - TestingFileCollector FileCollector(root_fs, root_fs); + TestingFileCollector FileCollector(root_fs, root_fs, + vfs::getRealFileSystem()); FileCollector.addFile("/path/to/a"); FileCollector.addFile("/path/to/b"); @@ -77,7 +78,8 @@ TEST(FileCollectorTest, addDirectory) { TempFile c(ccc.str()); std::string root_fs(file_root.path()); - TestingFileCollector FileCollector(root_fs, root_fs); + TestingFileCollector FileCollector(root_fs, root_fs, + vfs::getRealFileSystem()); FileCollector.addDirectory(file_root.path()); @@ -105,7 +107,8 @@ TEST(FileCollectorTest, copyFiles) { // Create file collector and add files. TempDir root("copy_files_root", /*Unique*/ true); std::string root_fs(root.path()); - TestingFileCollector FileCollector(root_fs, root_fs); + TestingFileCollector FileCollector(root_fs, root_fs, + vfs::getRealFileSystem()); FileCollector.addFile(a.path()); FileCollector.addFile(b.path()); FileCollector.addFile(c.path()); @@ -133,7 +136,8 @@ TEST(FileCollectorTest, recordAndConstructDirectory) { // Create file collector and add files. 
TempDir root("copy_files_root", /*Unique*/ true); std::string root_fs(root.path()); - TestingFileCollector FileCollector(root_fs, root_fs); + TestingFileCollector FileCollector(root_fs, root_fs, + vfs::getRealFileSystem()); FileCollector.addFile(a.path()); // The empty directory isn't seen until we add it. @@ -169,7 +173,8 @@ TEST(FileCollectorTest, recordVFSAccesses) { // Create file collector and add files. TempDir root("copy_files_root", /*Unique*/ true); std::string root_fs(root.path()); - auto Collector = std::make_shared(root_fs, root_fs); + auto Collector = std::make_shared( + root_fs, root_fs, vfs::getRealFileSystem()); auto VFS = FileCollector::createCollectorVFS(vfs::getRealFileSystem(), Collector); VFS->status(a.path()); @@ -216,7 +221,8 @@ TEST(FileCollectorTest, Symlinks) { // Root where files are copied to. TempDir reproducer_root("reproducer_root", /*Unique*/ true); std::string root_fs(reproducer_root.path()); - TestingFileCollector FileCollector(root_fs, root_fs); + TestingFileCollector FileCollector(root_fs, root_fs, + vfs::getRealFileSystem()); // Add all the files to the collector. FileCollector.addFile(a.path()); @@ -264,7 +270,8 @@ TEST(FileCollectorTest, recordVFSSymlinkAccesses) { // Create file collector and add files. TempDir root("copy_files_root", true); std::string root_fs(root.path()); - auto Collector = std::make_shared(root_fs, root_fs); + auto Collector = std::make_shared( + root_fs, root_fs, vfs::getRealFileSystem()); auto VFS = FileCollector::createCollectorVFS(vfs::getRealFileSystem(), Collector); SmallString<256> Output; diff --git a/llvm/unittests/Support/MustacheTest.cpp b/llvm/unittests/Support/MustacheTest.cpp index 6ab3d4b01bc1b..02eaed4244cc7 100644 --- a/llvm/unittests/Support/MustacheTest.cpp +++ b/llvm/unittests/Support/MustacheTest.cpp @@ -991,6 +991,16 @@ TEST(MustachePartials, PaddingWhitespace) { EXPECT_EQ("|[]|", Out); } +TEST(MustachePartials, StandaloneIndentation) { + Value D = Object{{"content", "<\n->"}}; + auto T = Template("\\\n {{>partial}}\n/\n"); + T.registerPartial("partial", "|\n{{{content}}}\n|\n"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_NE("\\\n |\n <\n ->\n |\n/\n", Out); +} + TEST(MustacheLambdas, BasicInterpolation) { Value D = Object{}; auto T = Template("Hello, {{lambda}}!"); @@ -1224,3 +1234,97 @@ TEST(MustacheComments, VariableNameCollision) { T.render(D, OS); EXPECT_EQ("comments never show: ><", Out); } + +// XFAIL: The following tests for the Triple Mustache feature are expected to +// fail. The assertions have been inverted from EXPECT_EQ to EXPECT_NE to allow +// them to pass against the current implementation. Once Triple Mustache is +// implemented, these assertions should be changed back to EXPECT_EQ. 
+TEST(MustacheTripleMustache, Basic) { + Value D = Object{{"subject", "World"}}; + auto T = Template("Hello, {{{subject}}}!"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_EQ("Hello, World!", Out); +} + +TEST(MustacheTripleMustache, IntegerInterpolation) { + Value D = Object{{"mph", 85}}; + auto T = Template("{{{mph}}} miles an hour!"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_EQ("85 miles an hour!", Out); +} + +TEST(MustacheTripleMustache, DecimalInterpolation) { + Value D = Object{{"power", 1.21}}; + auto T = Template("{{{power}}} jiggawatts!"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_EQ("1.21 jiggawatts!", Out); +} + +TEST(MustacheTripleMustache, NullInterpolation) { + Value D = Object{{"cannot", nullptr}}; + auto T = Template("I ({{{cannot}}}) be seen!"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_EQ("I () be seen!", Out); +} + +TEST(MustacheTripleMustache, ContextMissInterpolation) { + Value D = Object{}; + auto T = Template("I ({{{cannot}}}) be seen!"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_EQ("I () be seen!", Out); +} + +TEST(MustacheTripleMustache, DottedNames) { + Value D = Object{{"person", Object{{"name", "Joe"}}}}; + auto T = Template("{{{person.name}}}"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_EQ("Joe", Out); +} + +TEST(MustacheTripleMustache, ImplicitIterator) { + Value D = Object{{"list", Array{"", ""}}}; + auto T = Template("{{#list}}({{{.}}}){{/list}}"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_EQ("()()", Out); +} + +TEST(MustacheTripleMustache, SurroundingWhitespace) { + Value D = Object{{"string", "---"}}; + auto T = Template("| {{{string}}} |"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_EQ("| --- |", Out); +} + +TEST(MustacheTripleMustache, Standalone) { + Value D = Object{{"string", "---"}}; + auto T = Template(" {{{string}}}\n"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_EQ(" ---\n", Out); +} + +TEST(MustacheTripleMustache, WithPadding) { + Value D = Object{{"string", "---"}}; + auto T = Template("|{{{ string }}}|"); + std::string Out; + raw_string_ostream OS(Out); + T.render(D, OS); + EXPECT_EQ("|---|", Out); +} diff --git a/llvm/unittests/Support/ProgramTest.cpp b/llvm/unittests/Support/ProgramTest.cpp index eac0246d8c59e..13a142fcb0624 100644 --- a/llvm/unittests/Support/ProgramTest.cpp +++ b/llvm/unittests/Support/ProgramTest.cpp @@ -695,7 +695,14 @@ TEST_F(ProgramEnvTest, TestExecuteEmptyEnvironment) { int RetCode = ExecuteAndWait(Executable, argv, ArrayRef{}, {}, 0, 0, &Error, &ExecutionFailed); EXPECT_FALSE(ExecutionFailed) << Error; +#ifndef __MINGW32__ + // When running with an empty environment, the child process doesn't in herit + // the PATH variable. On MinGW, it is common for executables to require a + // shared libstdc++ or libc++ DLL, which may be in PATH but not in the + // directory of SupportTests.exe - leading to STATUS_DLL_NOT_FOUND errors. + // Therefore, waive this failure in MinGW environments. 
ASSERT_EQ(0, RetCode); +#endif } } // end anonymous namespace diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index b8efab6399779..5c6c824dadd7d 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -1137,6 +1137,7 @@ R"(All available -march extensions for RISC-V xandesvbfhcvt 5.0 xandesvdot 5.0 xandesvpackfph 5.0 + xandesvsinth 5.0 xandesvsintload 5.0 xcvalu 1.0 xcvbi 1.0 diff --git a/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp b/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp index 68bf640334b5f..f477a118b4c8b 100644 --- a/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp +++ b/llvm/unittests/Transforms/Coroutines/ExtraRematTest.cpp @@ -97,7 +97,7 @@ StringRef Text = R"( call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0) + call void @llvm.coro.end(ptr %hdl, i1 0) ret ptr %hdl } @@ -110,7 +110,7 @@ StringRef Text = R"( declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin(token, ptr) - declare i1 @llvm.coro.end(ptr, i1) + declare void @llvm.coro.end(ptr, i1) declare i32 @should.remat(i32) @@ -212,7 +212,7 @@ StringRef TextCoroBeginCustomABI = R"( call void @free(ptr %mem) br label %suspend suspend: - call i1 @llvm.coro.end(ptr %hdl, i1 0) + call void @llvm.coro.end(ptr %hdl, i1 0) ret ptr %hdl } @@ -225,7 +225,7 @@ StringRef TextCoroBeginCustomABI = R"( declare token @llvm.coro.id(i32, ptr, ptr, ptr) declare i1 @llvm.coro.alloc(token) declare ptr @llvm.coro.begin.custom.abi(token, ptr, i32) - declare i1 @llvm.coro.end(ptr, i1) + declare void @llvm.coro.end(ptr, i1) declare i32 @should.remat(i32) diff --git a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp index 40a8c1d8d3da1..3c9374b526b09 100644 --- a/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp +++ b/llvm/unittests/Transforms/Utils/BasicBlockUtilsTest.cpp @@ -672,7 +672,7 @@ define void @positive_case(i32 %0) #0 { destroy: ret void exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } @@ -689,13 +689,13 @@ define void @notpresplit(i32 %0) { destroy: ret void exit: - call i1 @llvm.coro.end(ptr null, i1 false, token none) + call void @llvm.coro.end(ptr null, i1 false, token none) ret void } declare token @llvm.coro.save(ptr) declare i8 @llvm.coro.suspend(token, i1) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) attributes #0 = { presplitcoroutine } )IR"); diff --git a/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp b/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp index 17809e7beda95..1b0073e026bae 100644 --- a/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp +++ b/llvm/unittests/tools/llvm-mca/X86/TestIncrementalMCA.cpp @@ -130,6 +130,10 @@ TEST_F(X86TestBase, TestInstructionRecycling) { mca::InstrBuilder IB(*STI, *MCII, *MRI, MCIA.get(), *IM, /*CallLatency=*/100); IB.setInstRecycleCallback(GetRecycledInst); + // Setup a generic IPP that does not do anything (as it is not target + // specific) for testing purposes. 
+ auto IPP = std::make_unique(*STI, *MCII); + const SmallVector Instruments; // Tile size = 7 for (unsigned i = 0U, E = MCIs.size(); i < E;) { @@ -147,8 +151,10 @@ TEST_F(X86TestBase, TestInstructionRecycling) { }); ASSERT_FALSE(bool(RemainingE)); ASSERT_TRUE(RecycledInst); + IPP->postProcessInstruction(*RecycledInst, MCIs[i]); ISM.addRecycledInst(RecycledInst); } else { + IPP->postProcessInstruction(*InstOrErr.get(), MCIs[i]); ISM.addInst(std::move(InstOrErr.get())); } } diff --git a/llvm/utils/FileCheck/FileCheck.cpp b/llvm/utils/FileCheck/FileCheck.cpp index 185b6b30994fc..305c28b4c7257 100644 --- a/llvm/utils/FileCheck/FileCheck.cpp +++ b/llvm/utils/FileCheck/FileCheck.cpp @@ -384,7 +384,7 @@ BuildInputAnnotations(const SourceMgr &SM, unsigned CheckFileBufferID, std::vector &Annotations, unsigned &LabelWidth) { struct CompareSMLoc { - bool operator()(const SMLoc &LHS, const SMLoc &RHS) const { + bool operator()(SMLoc LHS, SMLoc RHS) const { return LHS.getPointer() < RHS.getPointer(); } }; diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index bc42efa3b2e9c..be7537c83da3a 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -21,6 +21,13 @@ #include using namespace llvm; +// As the type of more than one return values is represented as an anonymous +// struct, which is encoded with `IIT_STRUCT` followed by a byte specifying +// the number of return values, starting from 2 (encoded as 0) to 257 +// (encoded as 255). So, the maximum number of values that an intrinsic can +// return is 257. +static constexpr unsigned MaxNumReturn = 257; + //===----------------------------------------------------------------------===// // CodeGenIntrinsic Implementation //===----------------------------------------------------------------------===// @@ -29,15 +36,6 @@ CodeGenIntrinsicContext::CodeGenIntrinsicContext(const RecordKeeper &RC) { for (const Record *Rec : RC.getAllDerivedDefinitions("IntrinsicProperty")) if (Rec->getValueAsBit("IsDefault")) DefaultProperties.push_back(Rec); - - // The maximum number of values that an intrinsic can return is the size of - // of `IIT_RetNumbers` list - 1 (since we index into this list using the - // number of return values as the index). 
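[Editor's note] The replacement comment above pins MaxNumReturn at 257 because a multi-value return type is encoded as IIT_STRUCT followed by a single byte holding the return count biased by 2 (2 encodes as 0, 257 as 255), rather than being derived from the size of the IIT_RetNumbers list as the removed code did. A small standalone illustration of that bias arithmetic (hypothetical helper names, not the emitter's actual code):

```cpp
#include <cassert>
#include <cstdint>

// Illustration only: the struct-return encoding stores NumRets - 2 in one
// byte, so the byte range 0..255 covers 2..257 returned values.
constexpr unsigned MaxNumReturn = 257;

uint8_t encodeNumReturns(unsigned NumRets) {
  assert(NumRets >= 2 && NumRets <= MaxNumReturn);
  return static_cast<uint8_t>(NumRets - 2);
}

unsigned decodeNumReturns(uint8_t Byte) { return unsigned(Byte) + 2; }

static_assert(MaxNumReturn - 2 == 255, "largest value that fits in the byte");
```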
- const auto *IIT_RetNumbers = - dyn_cast_or_null(RC.getGlobal("IIT_RetNumbers")); - if (!IIT_RetNumbers) - PrintFatalError("unable to find 'IIT_RetNumbers' list"); - MaxNumReturn = IIT_RetNumbers->size() - 1; } CodeGenIntrinsicTable::CodeGenIntrinsicTable(const RecordKeeper &RC) { @@ -302,11 +300,10 @@ CodeGenIntrinsic::CodeGenIntrinsic(const Record *R, } unsigned NumRet = R->getValueAsListInit("RetTypes")->size(); - if (NumRet > Ctx.MaxNumReturn) + if (NumRet > MaxNumReturn) PrintFatalError(DefLoc, "intrinsics can only return upto " + - Twine(Ctx.MaxNumReturn) + " values, '" + - DefName + "' returns " + Twine(NumRet) + - " values"); + Twine(MaxNumReturn) + " values, '" + DefName + + "' returns " + Twine(NumRet) + " values"); const Record *TypeInfo = R->getValueAsDef("TypeInfo"); if (!TypeInfo->isSubClassOf("TypeInfoGen")) diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h index 676f575b2749d..2e86149514f46 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h @@ -30,9 +30,6 @@ class RecordKeeper; struct CodeGenIntrinsicContext { explicit CodeGenIntrinsicContext(const RecordKeeper &RC); std::vector DefaultProperties; - - // Maximum number of values an intrinsic can return. - unsigned MaxNumReturn; }; struct CodeGenIntrinsic { diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp index a61ba54d3ffd2..f2fd889746bac 100644 --- a/llvm/utils/TableGen/CodeEmitterGen.cpp +++ b/llvm/utils/TableGen/CodeEmitterGen.cpp @@ -471,12 +471,8 @@ void CodeEmitterGen::run(raw_ostream &O) { << ";\n"; O << R"( const unsigned opcode = MI.getOpcode(); - if (opcode < FirstSupportedOpcode) { - std::string msg; - raw_string_ostream Msg(msg); - Msg << "Unsupported instruction: " << MI; - report_fatal_error(Msg.str().c_str()); - } + if (opcode < FirstSupportedOpcode) + reportUnsupportedInst(MI); unsigned TableIndex = opcode - FirstSupportedOpcode; )"; @@ -502,10 +498,7 @@ void CodeEmitterGen::run(raw_ostream &O) { // Default case: unhandled opcode. 
O << " default:\n" - << " std::string msg;\n" - << " raw_string_ostream Msg(msg);\n" - << " Msg << \"Not supported instr: \" << MI;\n" - << " report_fatal_error(Msg.str().c_str());\n" + << " reportUnsupportedInst(MI);\n" << " }\n"; if (UseAPInt) O << " Inst = Value;\n"; @@ -521,12 +514,10 @@ void CodeEmitterGen::run(raw_ostream &O) { << " const MCSubtargetInfo &STI) const {\n" << " switch (MI.getOpcode()) {\n"; emitCaseMap(O, BitOffsetCaseMap); - O << " }\n" - << " std::string msg;\n" - << " raw_string_ostream Msg(msg);\n" - << " Msg << \"Not supported instr[opcode]: \" << MI << \"[\" << OpNum " - "<< \"]\";\n" - << " report_fatal_error(Msg.str().c_str());\n" + O << " default:\n" + << " reportUnsupportedInst(MI);\n" + << " }\n" + << " reportUnsupportedOperand(MI, OpNum);\n" << "}\n\n" << "#endif // GET_OPERAND_BIT_OFFSET\n\n"; } diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp index af75e44f63e48..75bea77faba42 100644 --- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp @@ -335,6 +335,8 @@ bool TypeSetByHwMode::intersect(SetType &Out, const SetType &In) { using WildPartT = std::pair>; static const WildPartT WildParts[] = { {MVT::iPTR, [](MVT T) { return T.isScalarInteger() || T == MVT::iPTR; }}, + {MVT::cPTR, + [](MVT T) { return T.isCheriCapability() || T == MVT::cPTR; }}, }; bool Changed = false; @@ -816,6 +818,10 @@ void TypeInfer::expandOverloads(TypeSetByHwMode::SetType &Out, if (Out.count(MVT::pAny)) { Out.erase(MVT::pAny); Out.insert(MVT::iPTR); + for (MVT T : MVT::cheri_capability_valuetypes()) { + if (Legal.count(T)) + Out.insert(MVT::cPTR); + } } else if (Out.count(MVT::iAny)) { Out.erase(MVT::iAny); for (MVT T : MVT::integer_valuetypes()) @@ -1647,9 +1653,11 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode &N, case SDTCisVT: // Operand must be a particular type. return NodeToApply.UpdateNodeType(ResNo, VVT, TP); - case SDTCisPtrTy: - // Operand must be same as target pointer type. - return NodeToApply.UpdateNodeType(ResNo, MVT::iPTR, TP); + case SDTCisPtrTy: { + // Operand must be a legal pointer (iPTR, or possibly cPTR) type. + const TypeSetByHwMode &PtrTys = TP.getDAGPatterns().getLegalPtrTypes(); + return NodeToApply.UpdateNodeType(ResNo, PtrTys, TP); + } case SDTCisInt: // Require it to be one of the legal integer VTs. return TI.EnforceInteger(NodeToApply.getExtType(ResNo)); @@ -3293,6 +3301,7 @@ CodeGenDAGPatterns::CodeGenDAGPatterns(const RecordKeeper &R, PatternRewriterFn PatternRewriter) : Records(R), Target(R), Intrinsics(R), LegalVTS(Target.getLegalValueTypes()), + LegalPtrVTS(ComputeLegalPtrTypes()), PatternRewriter(std::move(PatternRewriter)) { ParseNodeInfo(); ParseNodeTransforms(); @@ -3328,6 +3337,36 @@ const Record *CodeGenDAGPatterns::getSDNodeNamed(StringRef Name) const { return N; } +// Compute the subset of iPTR and cPTR legal for each mode, coalescing into the +// default mode where possible to avoid predicate explosion. 
+TypeSetByHwMode CodeGenDAGPatterns::ComputeLegalPtrTypes() const { + auto LegalPtrsForSet = [](const MachineValueTypeSet &In) { + MachineValueTypeSet Out; + Out.insert(MVT::iPTR); + for (MVT T : MVT::cheri_capability_valuetypes()) { + if (In.count(T)) { + Out.insert(MVT::cPTR); + break; + } + } + return Out; + }; + + const TypeSetByHwMode &LegalTypes = getLegalTypes(); + MachineValueTypeSet LegalPtrsDefault = + LegalPtrsForSet(LegalTypes.get(DefaultMode)); + + TypeSetByHwMode LegalPtrTypes; + for (const auto &I : LegalTypes) { + MachineValueTypeSet S = LegalPtrsForSet(I.second); + if (I.first != DefaultMode && S == LegalPtrsDefault) + continue; + LegalPtrTypes.getOrCreate(I.first).insert(S); + } + + return LegalPtrTypes; +} + // Parse all of the SDNode definitions for the target, populating SDNodes. void CodeGenDAGPatterns::ParseNodeInfo() { const CodeGenHwModes &CGH = getTargetInfo().getHwModes(); diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h index 64fec275faa68..2ed8d1376b045 100644 --- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h +++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.h @@ -1135,6 +1135,7 @@ class CodeGenDAGPatterns { std::vector PatternsToMatch; TypeSetByHwMode LegalVTS; + TypeSetByHwMode LegalPtrVTS; using PatternRewriterFn = std::function; PatternRewriterFn PatternRewriter; @@ -1148,6 +1149,7 @@ class CodeGenDAGPatterns { CodeGenTarget &getTargetInfo() { return Target; } const CodeGenTarget &getTargetInfo() const { return Target; } const TypeSetByHwMode &getLegalTypes() const { return LegalVTS; } + const TypeSetByHwMode &getLegalPtrTypes() const { return LegalPtrVTS; } const Record *getSDNodeNamed(StringRef Name) const; @@ -1249,6 +1251,7 @@ class CodeGenDAGPatterns { } private: + TypeSetByHwMode ComputeLegalPtrTypes() const; void ParseNodeInfo(); void ParseNodeTransforms(); void ParseComplexPatterns(); diff --git a/llvm/utils/TableGen/Common/DAGISelMatcher.cpp b/llvm/utils/TableGen/Common/DAGISelMatcher.cpp index 255974624e8f0..4fdb386bf45e7 100644 --- a/llvm/utils/TableGen/Common/DAGISelMatcher.cpp +++ b/llvm/utils/TableGen/Common/DAGISelMatcher.cpp @@ -328,6 +328,14 @@ static bool TypesAreContradictory(MVT::SimpleValueType T1, if (T1 == T2) return false; + if (T1 == MVT::pAny) + return TypesAreContradictory(MVT::iPTR, T2) && + TypesAreContradictory(MVT::cPTR, T2); + + if (T2 == MVT::pAny) + return TypesAreContradictory(T1, MVT::iPTR) && + TypesAreContradictory(T1, MVT::cPTR); + // If either type is about iPtr, then they don't conflict unless the other // one is not a scalar integer type. if (T1 == MVT::iPTR) @@ -336,7 +344,13 @@ static bool TypesAreContradictory(MVT::SimpleValueType T1, if (T2 == MVT::iPTR) return !MVT(T1).isInteger() || MVT(T1).isVector(); - // Otherwise, they are two different non-iPTR types, they conflict. + if (T1 == MVT::cPTR) + return !MVT(T2).isCheriCapability() || MVT(T2).isVector(); + + if (T2 == MVT::cPTR) + return !MVT(T1).isCheriCapability() || MVT(T1).isVector(); + + // Otherwise, they are two different non-iPTR/cPTR types, they conflict. 
return true; } diff --git a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp index 3f615160f683e..5d49715879280 100644 --- a/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp +++ b/llvm/utils/TableGen/Common/GlobalISel/GlobalISelMatchTable.cpp @@ -1467,7 +1467,9 @@ Error OperandMatcher::addTypeCheckPredicate(const TypeSetByHwMode &VTy, if (!VTy.isMachineValueType()) return failUnsupported("unsupported typeset"); - if (VTy.getMachineValueType() == MVT::iPTR && OperandIsAPointer) { + if ((VTy.getMachineValueType() == MVT::iPTR || + VTy.getMachineValueType() == MVT::cPTR) && + OperandIsAPointer) { addPredicate(0); return Error::success(); } diff --git a/llvm/utils/TableGen/Common/InstructionEncoding.cpp b/llvm/utils/TableGen/Common/InstructionEncoding.cpp index 7260ee3d9b534..30bbac463c0f4 100644 --- a/llvm/utils/TableGen/Common/InstructionEncoding.cpp +++ b/llvm/utils/TableGen/Common/InstructionEncoding.cpp @@ -316,6 +316,14 @@ static void addOneOperandFields(const Record *EncodingDef, else OpInfo.addField(I, J - I, Offset); } + + if (!OpInfo.InitValue && OpInfo.fields().empty()) { + // We found a field in InstructionEncoding record that corresponds to the + // named operand, but that field has no constant bits and doesn't contribute + // to the Inst field. For now, treat that field as if it didn't exist. + // TODO: Remove along with IgnoreNonDecodableOperands. + OpInfo.HasNoEncoding = true; + } } void InstructionEncoding::parseFixedLenOperands(const BitsInit &Bits) { diff --git a/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp b/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp index b617a3dbca586..3a2ef55656067 100644 --- a/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp +++ b/llvm/utils/TableGen/Common/VarLenCodeEmitterGen.cpp @@ -320,10 +320,7 @@ void VarLenCodeEmitterGen::run(raw_ostream &OS) { } // Default case: unhandled opcode OS << " default:\n" - << " std::string msg;\n" - << " raw_string_ostream Msg(msg);\n" - << " Msg << \"Not supported instr: \" << MI;\n" - << " report_fatal_error(Msg.str().c_str());\n" + << " reportUnsupportedInst(MI);\n" << " }\n"; OS << "}\n\n"; } diff --git a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp index 8d8189983270e..268e6bbc4eee3 100644 --- a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp @@ -519,9 +519,9 @@ static void FactorScope(std::unique_ptr &MatcherPtr) { CheckTypeMatcher *CTM = cast_or_null( FindNodeWithKind(Optn, Matcher::CheckType)); if (!CTM || - // iPTR checks could alias any other case without us knowing, don't - // bother with them. - CTM->getType() == MVT::iPTR || + // iPTR/cPTR checks could alias any other case without us knowing, + // don't bother with them. + CTM->getType() == MVT::iPTR || CTM->getType() == MVT::cPTR || // SwitchType only works for result #0. CTM->getResNo() != 0 || // If the CheckType isn't at the start of the list, see if we can move diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index e83df47d541c6..961dc2815f6b9 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -696,8 +696,6 @@ static void emitBinaryParser(raw_ostream &OS, indent Indent, // Special case for 'bits<0>'. 
if (OpInfo.Fields.empty() && !OpInfo.InitValue) { - if (IgnoreNonDecodableOperands) - return; assert(!OpInfo.Decoder.empty()); // The operand has no encoding, so the corresponding argument is omitted. // This avoids confusion and allows the function to be overloaded if the diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index 1c795afa9e700..a5e3c39bfdecd 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -882,6 +882,7 @@ def __str__(self): class FunctionTestBuilder: def __init__(self, run_list, flags, scrubber_args, path, ginfo): + self._run_list = run_list self._verbose = flags.verbose self._record_args = flags.function_signature self._check_attributes = flags.check_attributes @@ -917,15 +918,53 @@ def __init__(self, run_list, flags, scrubber_args, path, ginfo): self._func_order.update({prefix: []}) self._global_var_dict.update({prefix: dict()}) + # Return true if there is conflicting output for different runs for the + # given prefix and function name. + def has_conflicting_output(self, prefix, func): + # There was conflicting output if the func_dict is None for this + # prefix and function. + return self._func_dict[prefix].get(func) is None + def finish_and_get_func_dict(self): - for prefix in self.get_failed_prefixes(): - warn( - "Prefix %s had conflicting output from different RUN lines for all functions in test %s" - % ( - prefix, - self._path, + all_funcs = set() + for prefix in self._func_dict: + all_funcs.update(self._func_dict[prefix].keys()) + + warnings_to_print = collections.defaultdict(list) + for func in sorted(list(all_funcs)): + for i, run_info in enumerate(self._run_list): + prefixes = run_info[0] + if not prefixes: + continue + + # Check if this RUN line produces this function at all. If + # not, we can skip analysing this function for this RUN. + run_contains_func = all( + func in self._func_dict.get(p, {}) for p in prefixes ) + if not run_contains_func: + continue + + # Check if this RUN line can print any checks for this + # function. It can't if all of its prefixes have conflicting + # (None) output. + cannot_print_for_this_run = all( + self.has_conflicting_output(p, func) for p in prefixes + ) + if cannot_print_for_this_run: + warnings_to_print[func].append((i, prefixes)) + + for func, warning_info in warnings_to_print.items(): + conflict_strs = [] + for run_index, prefixes in warning_info: + conflict_strs.append( + f"RUN #{run_index + 1} (prefixes: {', '.join(prefixes)})" + ) + warn( + f"For function '{func}', the following RUN lines will not generate checks due to conflicting output: {', '.join(conflict_strs)}", + test_file=self._path, ) + return self._func_dict def func_order(self): @@ -1078,20 +1117,6 @@ def processed_prefixes(self, prefixes): """ self._processed_prefixes.update(prefixes) - def get_failed_prefixes(self): - # This returns the list of those prefixes that failed to match any function, - # because there were conflicting bodies produced by different RUN lines, in - # all instances of the prefix. 
- for prefix in self._func_dict: - if self._func_dict[prefix] and ( - not [ - fct - for fct in self._func_dict[prefix] - if self._func_dict[prefix][fct] is not None - ] - ): - yield prefix - ##### Generator of LLVM IR CHECK lines diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py index 7a5311d668f79..6f809c5977c75 100755 --- a/llvm/utils/git/code-format-helper.py +++ b/llvm/utils/git/code-format-helper.py @@ -173,7 +173,8 @@ def run(self, changed_files: List[str], args: FormatArgs) -> bool: f":warning: The {self.friendly_name} failed without printing " "a diff. Check the logs for stderr output. :warning:" ) - self.update_pr(comment_text, args, create_new=False) + if should_update_gh: + self.update_pr(comment_text, args, create_new=False) return False diff --git a/llvm/utils/git/github-automation.py b/llvm/utils/git/github-automation.py index eb1abb2f37f20..948788d501d1d 100755 --- a/llvm/utils/git/github-automation.py +++ b/llvm/utils/git/github-automation.py @@ -297,9 +297,12 @@ def run(self) -> bool: print(e) continue + total_prs_url = f"https://github.com/llvm/llvm-project/pulls?q=author%3A{self.issue.user.login}+is%3Apr" + merged_prs_url = total_prs_url + "+is%3Amerged" comment = f""" ### Activity Summary: - * [{total_prs} Pull Requests](https://github.com/llvm/llvm-project/pulls/{self.issue.user.login}) ({merged_prs} merged) + * [{total_prs} Pull Requests]({total_prs_url}) + * [{merged_prs} Merged Pull Requests]({merged_prs_url}) * Top 3 Committers: {get_user_values_str(get_top_values(merged_by))} * Top 3 Reviewers: {get_user_values_str(get_top_values(reviewed_by))} """ diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn index d18cefff335c5..036123371d24c 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn @@ -27,6 +27,7 @@ static_library("bugprone") { "CapturingThisInMemberVariableCheck.cpp", "CastingThroughVoidCheck.cpp", "ChainedComparisonCheck.cpp", + "CommandProcessorCheck.cpp", "ComparePointerToMemberVirtualFunctionCheck.cpp", "CopyConstructorInitCheck.cpp", "CrtpConstructorAccessibilityCheck.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn index ac2cc2fd8236f..b097e139b9c7f 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn @@ -16,7 +16,6 @@ static_library("cert") { ] sources = [ "CERTTidyModule.cpp", - "CommandProcessorCheck.cpp", "DefaultOperatorNewAlignmentCheck.cpp", "DontModifyStdNamespaceCheck.cpp", "FloatLoopCounter.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn index 4de101d600040..327b80b449e78 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn @@ -52,6 +52,7 @@ static_library("readability") { "RedundantFunctionPtrDereferenceCheck.cpp", "RedundantInlineSpecifierCheck.cpp", "RedundantMemberInitCheck.cpp", + "RedundantParenthesesCheck.cpp", "RedundantPreprocessorCheck.cpp", "RedundantSmartptrGetCheck.cpp", "RedundantStringCStrCheck.cpp", diff --git 
a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn index b8c8585a33a9b..5f9eb9adce04a 100644 --- a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn @@ -27,6 +27,7 @@ static_library("Analysis") { "FixitUtil.cpp", "IntervalPartition.cpp", "IssueHash.cpp", + "LifetimeAnnotations.cpp", "LifetimeSafety.cpp", "LiveVariables.cpp", "MacroExpansionContext.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/Tooling/DependencyScanning/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Tooling/DependencyScanning/BUILD.gn index 739e2fbb35982..6733cf4ef3772 100644 --- a/llvm/utils/gn/secondary/clang/lib/Tooling/DependencyScanning/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Tooling/DependencyScanning/BUILD.gn @@ -18,6 +18,7 @@ static_library("DependencyScanning") { "//llvm/lib/TargetParser", ] sources = [ + "DependencyScannerImpl.cpp", "DependencyScanningFilesystem.cpp", "DependencyScanningService.cpp", "DependencyScanningTool.cpp", diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index aa29b80fe8747..42a7940ccd44e 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -290,6 +290,7 @@ if (current_toolchain == default_toolchain) { "__atomic/check_memory_order.h", "__atomic/contention_t.h", "__atomic/fence.h", + "__atomic/floating_point_helper.h", "__atomic/is_always_lock_free.h", "__atomic/kill_dependency.h", "__atomic/memory_order.h", diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn index 822e3cec4823d..646f61d15f4a3 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn @@ -8,7 +8,6 @@ static_library("Vectorize") { "//llvm/lib/Transforms/Utils", ] sources = [ - "EVLIndVarSimplify.cpp", "LoadStoreVectorizer.cpp", "LoopIdiomVectorize.cpp", "LoopVectorizationLegality.cpp", diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-remarkutil/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-remarkutil/BUILD.gn index 4e4ffb54dbe3e..2d02c151058e9 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-remarkutil/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-remarkutil/BUILD.gn @@ -10,6 +10,7 @@ executable("llvm-remarkutil") { "RemarkConvert.cpp", "RemarkCount.cpp", "RemarkCounter.cpp", + "RemarkFilter.cpp", "RemarkInstructionMix.cpp", "RemarkSizeDiff.cpp", "RemarkUtil.cpp", diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 9ae8ac75bee08..a7e2705f609af 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -1541,8 +1541,10 @@ def regex_escape(s): return s path_substitutions = [ - ("s", sourcepath), ("S", sourcedir), ("p", sourcedir), - ("t", tmpName), ("T", tmpDir) + ("s", sourcepath), + ("S", sourcedir), + ("p", sourcedir), + ("t", tmpName), ] for path_substitution in path_substitutions: letter = path_substitution[0] @@ -1919,6 +1921,14 @@ def processLine(ln): # seems reasonable. ln = _caching_re_compile(a).sub(str(b), escapePercents(ln)) + # TODO(boomanaiden154): Remove when we branch LLVM 22 so people on the + # release branch will have sufficient time to migrate. + if bool(_caching_re_compile("%T").search(ln)): + raise ValueError( + "%T is no longer supported. Please create directories with names " + "based on %t." 
+ ) + # Strip the trailing newline and any extra whitespace. return ln.strip() diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/capital-t-error-message.txt b/llvm/utils/lit/tests/Inputs/shtest-shell/capital-t-error-message.txt new file mode 100644 index 0000000000000..e69dfee8fced8 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-shell/capital-t-error-message.txt @@ -0,0 +1,2 @@ +# Check that we return a decent error message when someone uses %T +# RUN: echo %T > %t diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/valid-shell.txt b/llvm/utils/lit/tests/Inputs/shtest-shell/valid-shell.txt index 75ce8b7733ad7..cfa829f0bf2f7 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-shell/valid-shell.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-shell/valid-shell.txt @@ -18,15 +18,15 @@ # Check force remove commands success whether the directory does or doesn't exist. # # Check the mkdir command with -p option. -# RUN: rm -f -r %T/test -# RUN: %{python} %S/../check_path.py dir %T/test > %t.out +# RUN: rm -f -r %t.test +# RUN: %{python} %S/../check_path.py dir %t.test > %t.out # RUN: FileCheck --check-prefix=REMOVE-PARENT-DIR < %t.out %s -# RUN: mkdir -p %T/test -# RUN: %{python} %S/../check_path.py dir %T/test > %t.out +# RUN: mkdir -p %t.test +# RUN: %{python} %S/../check_path.py dir %t.test > %t.out # RUN: FileCheck --check-prefix=MAKE-PARENT-DIR < %t.out %s -# RUN: rm -f %T/test || true -# RUN: rm -f -r %T/test -# RUN: %{python} %S/../check_path.py dir %T/test > %t.out +# RUN: rm -f %t.test || true +# RUN: rm -f -r %t.test +# RUN: %{python} %S/../check_path.py dir %t.test > %t.out # RUN: FileCheck --check-prefix=REMOVE-PARENT-DIR < %t.out %s # # MAKE-PARENT-DIR: True @@ -34,15 +34,15 @@ # # Check the mkdir command without -p option. # -# RUN: rm -rf %T/test1 -# RUN: mkdir %T/test1 -# RUN: %{python} %S/../check_path.py dir %T/test1 > %t.out +# RUN: rm -rf %t.test1 +# RUN: mkdir %t.test1 +# RUN: %{python} %S/../check_path.py dir %t.test1 > %t.out # RUN: FileCheck --check-prefix=MAKE-DIR < %t.out %s -# RUN: cd %T/test1 && mkdir foo -# RUN: %{python} %S/../check_path.py dir %T/test1 > %t.out +# RUN: cd %t.test1 && mkdir foo +# RUN: %{python} %S/../check_path.py dir %t.test1 > %t.out # RUN: FileCheck --check-prefix=MAKE-DIR < %t.out %s -# RUN: cd %T && rm -rf %T/test1 -# RUN: %{python} %S/../check_path.py dir %T/test1 > %t.out +# RUN: cd .. && rm -rf %t.test1 +# RUN: %{python} %S/../check_path.py dir %t.test1 > %t.out # RUN: FileCheck --check-prefix=REMOVE-DIR < %t.out %s # # MAKE-DIR: True @@ -50,18 +50,18 @@ # # Check creating and removing multiple folders and rm * operation. 
# -# RUN: rm -rf %T/test -# RUN: mkdir -p %T/test/test1 %T/test/test2 -# RUN: %{python} %S/../check_path.py dir %T/test %T/test/test1 %T/test/test2 > %t.out +# RUN: rm -rf %t.test +# RUN: mkdir -p %t.test/test1 %t.test/test2 +# RUN: %{python} %S/../check_path.py dir %t.test %t.test/test1 %t.test/test2 > %t.out # RUN: FileCheck --check-prefix=DIRS-EXIST < %t.out %s -# RUN: mkdir %T/test || true -# RUN: echo "create a temp file" > %T/test/temp.write -# RUN: echo "create a temp1 file" > %T/test/test1/temp1.write -# RUN: echo "create a temp2 file" > %T/test/test2/temp2.write -# RUN: %{python} %S/../check_path.py file %T/test/temp.write %T/test/test1/temp1.write %T/test/test2/temp2.write> %t.out +# RUN: mkdir %t.test || true +# RUN: echo "create a temp file" > %t.test/temp.write +# RUN: echo "create a temp1 file" > %t.test/test1/temp1.write +# RUN: echo "create a temp2 file" > %t.test/test2/temp2.write +# RUN: %{python} %S/../check_path.py file %t.test/temp.write %t.test/test1/temp1.write %t.test/test2/temp2.write> %t.out # RUN: FileCheck --check-prefix=FILES-EXIST < %t.out %s -# RUN: rm -r -f %T/* -# RUN: %{python} %S/../check_path.py dir %T/test > %t.out +# RUN: rm -r -f %t* +# RUN: %{python} %S/../check_path.py dir %t.test > %t.out # RUN: FileCheck --check-prefix=REMOVE-ALL < %t.out %s # # DIRS-EXIST: True @@ -81,7 +81,7 @@ # RUN: echo "hello-2" > %t1.stdout # RUN: diff %t.stdout %t1.stdout || true # -# RUN: mkdir -p %T/dir1 %T/dir2 -# RUN: cd %T/dir1 && echo "hello" > temp1.txt -# RUN: cd %T/dir2 && echo "hello" > temp2.txt -# RUN: diff temp2.txt ../dir1/temp1.txt +# RUN: mkdir -p %t.dir1 %t.dir2 +# RUN: cd %t.dir1 && echo "hello" > temp1.txt +# RUN: cd %t.dir2 && echo "hello" > temp2.txt +# RUN: diff temp2.txt ../%{t:stem}.tmp.dir1/temp1.txt diff --git a/llvm/utils/lit/tests/shtest-readfile-external.py b/llvm/utils/lit/tests/shtest-readfile-external.py index 99b0160d933fe..c00bff45c8703 100644 --- a/llvm/utils/lit/tests/shtest-readfile-external.py +++ b/llvm/utils/lit/tests/shtest-readfile-external.py @@ -1,5 +1,8 @@ ## Tests the readfile substitution. +# TODO(boomanaiden154): This sometimes fails, possibly due to buffers not being flushed. +# ALLOW_RETRIES: 2 + # UNSUPPORTED: system-windows # RUN: env LIT_USE_INTERNAL_SHELL=0 not %{lit} -a -v %{inputs}/shtest-readfile | FileCheck -match-full-lines -DTEMP_PATH=%S/Inputs/shtest-readfile/Output %s @@ -11,7 +14,7 @@ # CHECK-LABEL: FAIL: shtest-readfile :: file-does-not-exist.txt ({{[^)]*}}) # CHECK: echo $(cat /file/does/not/exist) && test -e /file/does/not/exist {{.*}} -# CHECK: cat: /file/does/not/exist: No such file or directory +# CHECK: {{.*}}cat{{.*}}/file/does/not/exist{{.*}} # CHECK-LABEL: FAIL: shtest-readfile :: relative-paths.txt ({{[^)]*}}) # CHECK: echo $(cat rel_path_test_folder/test_file) && test -e rel_path_test_folder/test_file {{.*}} diff --git a/llvm/utils/lit/tests/shtest-readfile.py b/llvm/utils/lit/tests/shtest-readfile.py index a122dd7664272..66e3a042bf787 100644 --- a/llvm/utils/lit/tests/shtest-readfile.py +++ b/llvm/utils/lit/tests/shtest-readfile.py @@ -1,5 +1,8 @@ ## Tests the readfile substitution. +# TODO(boomanaiden154): This sometimes fails, possibly due to buffers not being flushed. 
+# ALLOW_RETRIES: 2 + # RUN: env LIT_USE_INTERNAL_SHELL=1 not %{lit} -a -v %{inputs}/shtest-readfile | FileCheck -match-full-lines -DTEMP_PATH=%S%{fs-sep}Inputs%{fs-sep}shtest-readfile%{fs-sep}Output %s # CHECK: -- Testing: 4 tests{{.*}} @@ -10,7 +13,7 @@ # CHECK-LABEL: FAIL: shtest-readfile :: file-does-not-exist.txt ({{[^)]*}}) # CHECK: # executed command: @echo 'echo %{readfile:/file/does/not/exist}' -# CHECK: # | File specified in readfile substitution does not exist: /file/does/not/exist +# CHECK: # | File specified in readfile substitution does not exist: {{.*}}/file/does/not/exist # CHECK-LABEL: FAIL: shtest-readfile :: relative-paths.txt ({{[^)]*}}) # CHECK: echo hello diff --git a/llvm/utils/lit/tests/shtest-shell.py b/llvm/utils/lit/tests/shtest-shell.py index 498f6bb0adc11..38db1b75486cf 100644 --- a/llvm/utils/lit/tests/shtest-shell.py +++ b/llvm/utils/lit/tests/shtest-shell.py @@ -12,6 +12,10 @@ # CHECK: -- Testing: +# CHECK: UNRESOLVED: shtest-shell :: capital-t-error-message.txt +# CHECK: *** TEST 'shtest-shell :: capital-t-error-message.txt' FAILED *** +# CHECK: ValueError: %T is no longer supported. Please create directories with names based on %t. + # CHECK: FAIL: shtest-shell :: colon-error.txt # CHECK: *** TEST 'shtest-shell :: colon-error.txt' FAILED *** # CHECK: : @@ -633,5 +637,5 @@ # CHECK: *** # CHECK: PASS: shtest-shell :: valid-shell.txt -# CHECK: Unresolved Tests (1) +# CHECK: Unresolved Tests (2) # CHECK: Failed Tests (37) diff --git a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py index 2661a2c8d6448..2d96feae5b58e 100644 --- a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py +++ b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py @@ -2,7 +2,7 @@ # ulimit does not work on non-POSIX platforms. # These tests are specific to options that Darwin does not support. -# UNSUPPORTED: system-windows, system-darwin +# UNSUPPORTED: system-windows, system-darwin, system-aix # RUN: not %{lit} -a -v %{inputs}/shtest-ulimit-nondarwin | FileCheck %s diff --git a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp index 1f566e13f070a..ea1395b2646f6 100644 --- a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp +++ b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp @@ -128,21 +128,7 @@ static const StringMap> XFailTestNames = {{ "Section - Multiple Calls", }}, - {"interpolation.json", - { - "Triple Mustache", - "Triple Mustache Integer Interpolation", - "Triple Mustache Decimal Interpolation", - "Triple Mustache Null Interpolation", - "Triple Mustache Context Miss Interpolation", - "Dotted Names - Triple Mustache Interpolation", - "Implicit Iterators - Triple Mustache", - "Triple Mustache - Surrounding Whitespace", - "Triple Mustache - Standalone", - "Triple Mustache With Padding", - }}, {"partials.json", {"Standalone Indentation"}}, - {"sections.json", {"Implicit Iterator - Triple mustache"}}, }}; struct TestData { diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake index f25595116edca..208cbdd1dd535 100644 --- a/mlir/cmake/modules/AddMLIRPython.cmake +++ b/mlir/cmake/modules/AddMLIRPython.cmake @@ -113,11 +113,12 @@ endfunction() # DEPENDS_TARGET_SRC_DEPS: List of cpp sources for extension library (for generating a DEPFILE). # IMPORT_PATHS: List of paths to add to PYTHONPATH for stubgen. 
# PATTERN_FILE: (Optional) Pattern file (see https://nanobind.readthedocs.io/en/latest/typing.html#pattern-files). +# VERBOSE: Emit logging/status messages during stub generation (default: OFF). # Outputs: # NB_STUBGEN_CUSTOM_TARGET: The target corresponding to generation which other targets can depend on. function(mlir_generate_type_stubs) cmake_parse_arguments(ARG - "" + "VERBOSE" "MODULE_NAME;OUTPUT_DIR;PATTERN_FILE" "IMPORT_PATHS;DEPENDS_TARGETS;OUTPUTS;DEPENDS_TARGET_SRC_DEPS" ${ARGN}) @@ -152,6 +153,9 @@ function(mlir_generate_type_stubs) --include-private --output-dir "${ARG_OUTPUT_DIR}") + if(NOT ARG_VERBOSE) + list(APPEND _nb_stubgen_cmd "--quiet") + endif() if(ARG_PATTERN_FILE) list(APPEND _nb_stubgen_cmd "-p;${ARG_PATTERN_FILE}") list(APPEND ARG_DEPENDS_TARGETS "${ARG_PATTERN_FILE}") @@ -166,7 +170,9 @@ function(mlir_generate_type_stubs) file(GENERATE OUTPUT "${_depfile}" CONTENT "${_depfiles}") endif() - message(DEBUG "Generating type-stubs outputs ${_generated_type_stubs}") + if(ARG_VERBOSE) + message(STATUS "Generating type-stubs outputs ${_generated_type_stubs}") + endif() add_custom_command( OUTPUT ${_generated_type_stubs} COMMAND ${_nb_stubgen_cmd} diff --git a/mlir/docs/DefiningDialects/Operations.md b/mlir/docs/DefiningDialects/Operations.md index f988bebea1223..7c1be84727476 100644 --- a/mlir/docs/DefiningDialects/Operations.md +++ b/mlir/docs/DefiningDialects/Operations.md @@ -1649,6 +1649,15 @@ inline constexpr MyBitEnum operator&(MyBitEnum a, MyBitEnum b) { inline constexpr MyBitEnum operator^(MyBitEnum a, MyBitEnum b) { return static_cast(static_cast(a) ^ static_cast(b)); } +inline constexpr MyBitEnum &operator|=(MyBitEnum &a, MyBitEnum b) { + return a = a | b; +} +inline constexpr MyBitEnum &operator&=(MyBitEnum &a, MyBitEnum b) { + return a = a & b; +} +inline constexpr MyBitEnum &operator^=(MyBitEnum &a, MyBitEnum b) { + return a = a ^ b; +} inline constexpr MyBitEnum operator~(MyBitEnum bits) { // Ensure only bits that can be present in the enum are set return static_cast(~static_cast(bits) & static_cast(15u)); diff --git a/mlir/examples/standalone/CMakeLists.txt b/mlir/examples/standalone/CMakeLists.txt index e2bcda7fa6f0b..c6c49fde12d2e 100644 --- a/mlir/examples/standalone/CMakeLists.txt +++ b/mlir/examples/standalone/CMakeLists.txt @@ -63,8 +63,12 @@ if(MLIR_ENABLE_BINDINGS_PYTHON) include(MLIRDetectPythonEnv) mlir_configure_python_dev_packages() # Note: for EXTERNAL_PROJECT_BUILD this must be set from the command line. 
- set(MLIR_PYTHON_PACKAGE_PREFIX "mlir_standalone" CACHE STRING "" FORCE) - set(MLIR_BINDINGS_PYTHON_INSTALL_PREFIX "python_packages/standalone/${MLIR_PYTHON_PACKAGE_PREFIX}" CACHE STRING "" FORCE) + if(NOT MLIR_PYTHON_PACKAGE_PREFIX) + set(MLIR_PYTHON_PACKAGE_PREFIX "mlir_standalone" CACHE STRING "" FORCE) + endif() + if(NOT MLIR_BINDINGS_PYTHON_INSTALL_PREFIX) + set(MLIR_BINDINGS_PYTHON_INSTALL_PREFIX "python_packages/standalone/${MLIR_PYTHON_PACKAGE_PREFIX}" CACHE STRING "" FORCE) + endif() add_subdirectory(python) endif() add_subdirectory(test) diff --git a/mlir/examples/standalone/include/Standalone/StandalonePasses.td b/mlir/examples/standalone/include/Standalone/StandalonePasses.td index 4cb2be02e4a20..d5aad34f2f457 100644 --- a/mlir/examples/standalone/include/Standalone/StandalonePasses.td +++ b/mlir/examples/standalone/include/Standalone/StandalonePasses.td @@ -1,4 +1,4 @@ -//===- StandalonePsss.td - Standalone dialect passes -------*- tablegen -*-===// +//===- StandalonePasses.td - Standalone dialect passes -------*- tablegen -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/mlir/examples/standalone/pyproject.toml b/mlir/examples/standalone/pyproject.toml new file mode 100644 index 0000000000000..5a1e6e86513c3 --- /dev/null +++ b/mlir/examples/standalone/pyproject.toml @@ -0,0 +1,65 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# Copyright (c) 2025. + +[project] +name = "standalone-python-bindings" +dynamic = ["version"] +requires-python = ">=3.8,<=3.14" +dependencies = [ + "numpy>=1.19.5, <=2.1.2", + "PyYAML>=5.4.0, <=6.0.1", + "ml_dtypes>=0.1.0, <=0.6.0; python_version<'3.13'", + "ml_dtypes>=0.5.0, <=0.6.0; python_version>='3.13'", +] + +[project.urls] +Homepage = "https://github.com/llvm/llvm-project" +Discussions = "https://discourse.llvm.org/" +"Issue Tracker" = "https://github.com/llvm/llvm-project/issues?q=is%3Aissue%20state%3Aopen%20label%3Amlir%3Apython%20" +"Source Code" = "https://github.com/llvm/llvm-project/tree/main/mlir/python" + +[build-system] +requires = [ + "scikit-build-core>=0.10.7", + "typing_extensions>=4.12.2", + "nanobind>=2.9, <3.0", + "pybind11>=2.10.0, <=2.13.6", +] +build-backend = "scikit_build_core.build" + +[tool.scikit-build] +# This is the minimum version of scikit-build-core. +minimum-version = "0.10.7" +# This pyproject.toml must be adjacent to the root CMakeLists.txt (wherever project(...) is specified). +cmake.source-dir = "." +# This is for installing/distributing the python bindings target and only the python bindings target. +build.targets = ["StandalonePythonModules"] +install.components = ["StandalonePythonModules"] + +[tool.scikit-build.cmake.define] +# Optional +CMAKE_C_COMPILER = { env = "CMAKE_C_COMPILER", default = "" } +CMAKE_CXX_COMPILER = { env = "CMAKE_CXX_COMPILER", default = "" } +CMAKE_C_COMPILER_LAUNCHER = { env = "CMAKE_C_COMPILER_LAUNCHER", default = "" } +CMAKE_CXX_COMPILER_LAUNCHER = { env = "CMAKE_CXX_COMPILER_LAUNCHER", default = "" } +CMAKE_GENERATOR = { env = "CMAKE_GENERATOR", default = "Ninja" } +LLVM_USE_LINKER = { env = "LLVM_USE_LINKER", default = "" } +# Optional but highly recommended (this makes the bindings compatible with other bindings packages +# by preventing symbol collisions). 
+CMAKE_VISIBILITY_INLINES_HIDDEN = "ON" +CMAKE_C_VISIBILITY_PRESET = "hidden" +CMAKE_CXX_VISIBILITY_PRESET = "hidden" + +# Non-optional (alternatively you could use CMAKE_PREFIX_PATH here). +MLIR_DIR = { env = "MLIR_DIR", default = "" } +# Non-optional +CMAKE_BUILD_TYPE = { env = "CMAKE_BUILD_TYPE", default = "Release" } +MLIR_ENABLE_BINDINGS_PYTHON = "ON" +# Effectively non-optional (any downstream project should specify this). +MLIR_PYTHON_PACKAGE_PREFIX = "mlir_standalone" +# This specifies the directory in the install directory (i.e., /tmp/pip-wheel/platlib) where _mlir_libs, dialects, etc. +# are installed. Thus, this will be the package location (and the name of the package) that pip assumes is +# the root package. +MLIR_BINDINGS_PYTHON_INSTALL_PREFIX = "mlir_standalone" diff --git a/mlir/examples/standalone/python/CMakeLists.txt b/mlir/examples/standalone/python/CMakeLists.txt index d48c5bcdde137..905c944939756 100644 --- a/mlir/examples/standalone/python/CMakeLists.txt +++ b/mlir/examples/standalone/python/CMakeLists.txt @@ -30,6 +30,9 @@ declare_mlir_python_extension(StandalonePythonSources.Pybind11Extension PRIVATE_LINK_LIBS LLVMSupport EMBED_CAPI_LINK_LIBS + MLIRCAPIIR + MLIRCAPIArith + MLIRCAPITransforms StandaloneCAPI PYTHON_BINDINGS_LIBRARY pybind11 ) @@ -42,6 +45,9 @@ declare_mlir_python_extension(StandalonePythonSources.NanobindExtension PRIVATE_LINK_LIBS LLVMSupport EMBED_CAPI_LINK_LIBS + MLIRCAPIIR + MLIRCAPIArith + MLIRCAPITransforms StandaloneCAPI PYTHON_BINDINGS_LIBRARY nanobind ) @@ -58,9 +64,6 @@ add_mlir_python_common_capi_library(StandalonePythonCAPI RELATIVE_INSTALL_ROOT "../../../.." DECLARED_SOURCES StandalonePythonSources - # TODO: Remove this in favor of showing fine grained registration once - # available. - MLIRPythonExtension.RegisterEverything MLIRPythonSources.Core MLIRPythonSources.Dialects.builtin ) @@ -71,75 +74,77 @@ add_mlir_python_common_capi_library(StandalonePythonCAPI set(StandalonePythonModules_ROOT_PREFIX "${MLIR_BINARY_DIR}/${MLIR_BINDINGS_PYTHON_INSTALL_PREFIX}") -# Everything here is very tightly coupled. See the ample descriptions at the bottom of -# mlir/python/CMakeLists.txt. - -# For a non-external projects build (e.g., installed distro) the type gen targets for the core _mlir module -# need to be re-declared. On the contrary, for an external projects build, the MLIRPythonExtension.Core.type_stub_gen -# target already exists and can just be added to DECLARED_SOURCES (see below). -if(NOT EXTERNAL_PROJECT_BUILD) - set(_core_type_stub_sources - _mlir/__init__.pyi - _mlir/ir.pyi - _mlir/passmanager.pyi - _mlir/rewrite.pyi - ) - get_target_property(_core_extension_srcs MLIRPythonExtension.Core INTERFACE_SOURCES) +if(NOT CMAKE_CROSSCOMPILING) + # Everything here is very tightly coupled. See the ample descriptions at the bottom of + # mlir/python/CMakeLists.txt. + + # For a non-external projects build (e.g., installed distro) the type gen targets for the core _mlir module + # need to be re-declared. On the contrary, for an external projects build, the MLIRPythonExtension.Core.type_stub_gen + # target already exists and can just be added to DECLARED_SOURCES (see below). 
+ if(NOT EXTERNAL_PROJECT_BUILD) + set(_core_type_stub_sources + _mlir/__init__.pyi + _mlir/ir.pyi + _mlir/passmanager.pyi + _mlir/rewrite.pyi + ) + get_target_property(_core_extension_srcs MLIRPythonExtension.Core INTERFACE_SOURCES) + mlir_generate_type_stubs( + MODULE_NAME _mlir + DEPENDS_TARGETS StandalonePythonModules.extension._mlir.dso + OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/type_stubs/_mlir_libs" + OUTPUTS "${_core_type_stub_sources}" + DEPENDS_TARGET_SRC_DEPS "${_core_extension_srcs}" + IMPORT_PATHS "${StandalonePythonModules_ROOT_PREFIX}/_mlir_libs" + VERBOSE + ) + set(_mlir_typestub_gen_target "${NB_STUBGEN_CUSTOM_TARGET}") + + list(TRANSFORM _core_type_stub_sources PREPEND "_mlir_libs/") + declare_mlir_python_sources( + StandalonePythonExtension.Core.type_stub_gen + ROOT_DIR "${CMAKE_CURRENT_BINARY_DIR}/type_stubs" + ADD_TO_PARENT StandalonePythonSources + SOURCES "${_core_type_stub_sources}" + ) + endif() + + get_target_property(_standalone_extension_srcs StandalonePythonSources.NanobindExtension INTERFACE_SOURCES) mlir_generate_type_stubs( - MODULE_NAME _mlir - DEPENDS_TARGETS StandalonePythonModules.extension._mlir.dso + MODULE_NAME mlir_standalone._mlir_libs._standaloneDialectsNanobind + DEPENDS_TARGETS + StandalonePythonModules.extension._mlir.dso + StandalonePythonModules.extension._standaloneDialectsNanobind.dso OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/type_stubs/_mlir_libs" - OUTPUTS "${_core_type_stub_sources}" - DEPENDS_TARGET_SRC_DEPS "${_core_extension_srcs}" - IMPORT_PATHS "${StandalonePythonModules_ROOT_PREFIX}/_mlir_libs" + OUTPUTS + _standaloneDialectsNanobind/__init__.pyi + _standaloneDialectsNanobind/standalone.pyi + DEPENDS_TARGET_SRC_DEPS "${_standalone_extension_srcs}" + IMPORT_PATHS "${StandalonePythonModules_ROOT_PREFIX}/.." ) - set(_mlir_typestub_gen_target "${NB_STUBGEN_CUSTOM_TARGET}") + set(_standaloneDialectsNanobind_typestub_gen_target "${NB_STUBGEN_CUSTOM_TARGET}") - list(TRANSFORM _core_type_stub_sources PREPEND "_mlir_libs/") declare_mlir_python_sources( - StandalonePythonExtension.Core.type_stub_gen + StandalonePythonSources.type_stub_gen ROOT_DIR "${CMAKE_CURRENT_BINARY_DIR}/type_stubs" ADD_TO_PARENT StandalonePythonSources - SOURCES "${_core_type_stub_sources}" + SOURCES + _mlir_libs/_standaloneDialectsNanobind/__init__.pyi + _mlir_libs/_standaloneDialectsNanobind/standalone.pyi ) endif() -get_target_property(_standalone_extension_srcs StandalonePythonSources.NanobindExtension INTERFACE_SOURCES) -mlir_generate_type_stubs( - MODULE_NAME mlir_standalone._mlir_libs._standaloneDialectsNanobind - DEPENDS_TARGETS - StandalonePythonModules.extension._mlir.dso - StandalonePythonModules.extension._standaloneDialectsNanobind.dso - OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/type_stubs/_mlir_libs" - OUTPUTS - _standaloneDialectsNanobind/__init__.pyi - _standaloneDialectsNanobind/standalone.pyi - DEPENDS_TARGET_SRC_DEPS "${_standalone_extension_srcs}" - IMPORT_PATHS "${StandalonePythonModules_ROOT_PREFIX}/.." -) -set(_standaloneDialectsNanobind_typestub_gen_target "${NB_STUBGEN_CUSTOM_TARGET}") - -declare_mlir_python_sources( - StandalonePythonSources.type_stub_gen - ROOT_DIR "${CMAKE_CURRENT_BINARY_DIR}/type_stubs" - ADD_TO_PARENT StandalonePythonSources - SOURCES - _mlir_libs/_standaloneDialectsNanobind/__init__.pyi - _mlir_libs/_standaloneDialectsNanobind/standalone.pyi -) set(_declared_sources StandalonePythonSources - # TODO: Remove this in favor of showing fine grained registration once - # available. 
-  MLIRPythonExtension.RegisterEverything
   MLIRPythonSources.Core
   MLIRPythonSources.Dialects.builtin
 )
 
 # For an external projects build, the MLIRPythonExtension.Core.type_stub_gen
 # target already exists and can just be added to DECLARED_SOURCES.
-if(EXTERNAL_PROJECT_BUILD)
+if(EXTERNAL_PROJECT_BUILD AND (NOT CMAKE_CROSSCOMPILING))
   list(APPEND _declared_sources MLIRPythonExtension.Core.type_stub_gen)
 endif()
+
 add_mlir_python_modules(StandalonePythonModules
   ROOT_PREFIX "${StandalonePythonModules_ROOT_PREFIX}"
   INSTALL_PREFIX "${MLIR_BINDINGS_PYTHON_INSTALL_PREFIX}"
@@ -147,7 +152,10 @@ add_mlir_python_modules(StandalonePythonModules
   COMMON_CAPI_LINK_LIBS
     StandalonePythonCAPI
   )
-if(NOT EXTERNAL_PROJECT_BUILD)
-  add_dependencies(StandalonePythonModules "${_mlir_typestub_gen_target}")
+
+if(NOT CMAKE_CROSSCOMPILING)
+  if(NOT EXTERNAL_PROJECT_BUILD)
+    add_dependencies(StandalonePythonModules "${_mlir_typestub_gen_target}")
+  endif()
+  add_dependencies(StandalonePythonModules "${_standaloneDialectsNanobind_typestub_gen_target}")
 endif()
-add_dependencies(StandalonePythonModules "${_standaloneDialectsNanobind_typestub_gen_target}")
diff --git a/mlir/examples/standalone/python/StandaloneExtensionNanobind.cpp b/mlir/examples/standalone/python/StandaloneExtensionNanobind.cpp
index e06ec3b6472b8..0ec6cdfa7994b 100644
--- a/mlir/examples/standalone/python/StandaloneExtensionNanobind.cpp
+++ b/mlir/examples/standalone/python/StandaloneExtensionNanobind.cpp
@@ -10,6 +10,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Standalone-c/Dialects.h"
+#include "mlir-c/Dialect/Arith.h"
 #include "mlir/Bindings/Python/Nanobind.h"
 #include "mlir/Bindings/Python/NanobindAdaptors.h"
 
@@ -22,17 +23,21 @@ NB_MODULE(_standaloneDialectsNanobind, m) {
   auto standaloneM = m.def_submodule("standalone");
 
   standaloneM.def(
-      "register_dialect",
+      "register_dialects",
       [](MlirContext context, bool load) {
-        MlirDialectHandle handle = mlirGetDialectHandle__standalone__();
-        mlirDialectHandleRegisterDialect(handle, context);
+        MlirDialectHandle arithHandle = mlirGetDialectHandle__arith__();
+        MlirDialectHandle standaloneHandle =
+            mlirGetDialectHandle__standalone__();
+        mlirDialectHandleRegisterDialect(arithHandle, context);
+        mlirDialectHandleRegisterDialect(standaloneHandle, context);
         if (load) {
-          mlirDialectHandleLoadDialect(handle, context);
+          mlirDialectHandleLoadDialect(arithHandle, context);
+          mlirDialectHandleLoadDialect(standaloneHandle, context);
         }
       },
       nb::arg("context").none() = nb::none(), nb::arg("load") = true,
       // clang-format off
-      nb::sig("def register_dialect(context: " MAKE_MLIR_PYTHON_QUALNAME("ir.Context") ", load: bool = True) -> None")
+      nb::sig("def register_dialects(context: " MAKE_MLIR_PYTHON_QUALNAME("ir.Context") ", load: bool = True) -> None")
       // clang-format on
   );
 }
diff --git a/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp b/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp
index 397db4c20e743..da8c2167dc36b 100644
--- a/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp
+++ b/mlir/examples/standalone/python/StandaloneExtensionPybind11.cpp
@@ -10,6 +10,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Standalone-c/Dialects.h"
+#include "mlir-c/Dialect/Arith.h"
 #include "mlir/Bindings/Python/PybindAdaptors.h"
 
 using namespace mlir::python::adaptors;
 
@@ -21,12 +22,16 @@ PYBIND11_MODULE(_standaloneDialectsPybind11, m) {
   auto standaloneM = m.def_submodule("standalone");
   standaloneM.def(
-      "register_dialect",
+      "register_dialects",
       [](MlirContext context, bool load) {
-        MlirDialectHandle handle = mlirGetDialectHandle__standalone__();
-        mlirDialectHandleRegisterDialect(handle, context);
+        MlirDialectHandle arithHandle = mlirGetDialectHandle__arith__();
+        MlirDialectHandle standaloneHandle =
+            mlirGetDialectHandle__standalone__();
+        mlirDialectHandleRegisterDialect(arithHandle, context);
+        mlirDialectHandleRegisterDialect(standaloneHandle, context);
         if (load) {
-          mlirDialectHandleLoadDialect(handle, context);
+          mlirDialectHandleLoadDialect(arithHandle, context);
+          mlirDialectHandleLoadDialect(standaloneHandle, context);
         }
       },
       py::arg("context") = py::none(), py::arg("load") = true);
diff --git a/mlir/examples/standalone/test/CAPI/CMakeLists.txt b/mlir/examples/standalone/test/CAPI/CMakeLists.txt
index eaa6cfc102c73..9d5cda5eca5fc 100644
--- a/mlir/examples/standalone/test/CAPI/CMakeLists.txt
+++ b/mlir/examples/standalone/test/CAPI/CMakeLists.txt
@@ -6,9 +6,7 @@ add_mlir_aggregate(StandaloneCAPITestLib SHARED
   EMBED_LIBS
     MLIRCAPIIR
-    # TODO: Remove this in favor of showing fine grained dialect registration
-    # (once available).
-    MLIRCAPIRegisterEverything
+    MLIRCAPIArith
     StandaloneCAPI
 )
diff --git a/mlir/examples/standalone/test/CAPI/standalone-capi-test.c b/mlir/examples/standalone/test/CAPI/standalone-capi-test.c
index 54f3ca7f7ff14..62add133fd4c5 100644
--- a/mlir/examples/standalone/test/CAPI/standalone-capi-test.c
+++ b/mlir/examples/standalone/test/CAPI/standalone-capi-test.c
@@ -12,21 +12,12 @@
 #include 
 
 #include "Standalone-c/Dialects.h"
+#include "mlir-c/Dialect/Arith.h"
 #include "mlir-c/IR.h"
-#include "mlir-c/RegisterEverything.h"
-
-static void registerAllUpstreamDialects(MlirContext ctx) {
-  MlirDialectRegistry registry = mlirDialectRegistryCreate();
-  mlirRegisterAllDialects(registry);
-  mlirContextAppendDialectRegistry(ctx, registry);
-  mlirDialectRegistryDestroy(registry);
-}
 
 int main(int argc, char **argv) {
   MlirContext ctx = mlirContextCreate();
-  // TODO: Create the dialect handles for the builtin dialects and avoid this.
-  // This adds dozens of MB of binary size over just the standalone dialect.
- registerAllUpstreamDialects(ctx); + mlirDialectHandleRegisterDialect(mlirGetDialectHandle__arith__(), ctx); mlirDialectHandleRegisterDialect(mlirGetDialectHandle__standalone__(), ctx); MlirModule module = mlirModuleCreateParse( diff --git a/mlir/examples/standalone/test/python/smoketest.py b/mlir/examples/standalone/test/python/smoketest.py index bd40c65d16164..26d84fd63e947 100644 --- a/mlir/examples/standalone/test/python/smoketest.py +++ b/mlir/examples/standalone/test/python/smoketest.py @@ -3,7 +3,6 @@ import sys from mlir_standalone.ir import * -from mlir_standalone.dialects import builtin as builtin_d if sys.argv[1] == "pybind11": from mlir_standalone.dialects import standalone_pybind11 as standalone_d @@ -14,7 +13,7 @@ with Context(): - standalone_d.register_dialect() + standalone_d.register_dialects() module = Module.parse( """ %0 = arith.constant 2 : i32 diff --git a/mlir/include/mlir-c/Dialect/LLVM.h b/mlir/include/mlir-c/Dialect/LLVM.h index 65b14254e4492..c1ade9ed8617c 100644 --- a/mlir/include/mlir-c/Dialect/LLVM.h +++ b/mlir/include/mlir-c/Dialect/LLVM.h @@ -306,7 +306,8 @@ typedef enum MlirLLVMDINameTableKind MlirLLVMDINameTableKind; MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDICompileUnitAttrGet( MlirContext ctx, MlirAttribute id, unsigned int sourceLanguage, MlirAttribute file, MlirAttribute producer, bool isOptimized, - MlirLLVMDIEmissionKind emissionKind, MlirLLVMDINameTableKind nameTableKind); + MlirLLVMDIEmissionKind emissionKind, MlirLLVMDINameTableKind nameTableKind, + MlirAttribute splitDebugFilename); /// Creates a LLVM DIFlags attribute. MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMDIFlagsAttrGet(MlirContext ctx, diff --git a/mlir/include/mlir-c/Rewrite.h b/mlir/include/mlir-c/Rewrite.h index 374d2fb78de88..77be1f480eacf 100644 --- a/mlir/include/mlir-c/Rewrite.h +++ b/mlir/include/mlir-c/Rewrite.h @@ -37,6 +37,7 @@ DEFINE_C_API_STRUCT(MlirRewriterBase, void); DEFINE_C_API_STRUCT(MlirFrozenRewritePatternSet, void); DEFINE_C_API_STRUCT(MlirGreedyRewriteDriverConfig, void); DEFINE_C_API_STRUCT(MlirRewritePatternSet, void); +DEFINE_C_API_STRUCT(MlirPatternRewriter, void); //===----------------------------------------------------------------------===// /// RewriterBase API inherited from OpBuilder @@ -315,6 +316,8 @@ MLIR_CAPI_EXPORTED MlirLogicalResult mlirApplyPatternsAndFoldGreedily( #if MLIR_ENABLE_PDL_IN_PATTERNMATCH DEFINE_C_API_STRUCT(MlirPDLPatternModule, void); +DEFINE_C_API_STRUCT(MlirPDLValue, const void); +DEFINE_C_API_STRUCT(MlirPDLResultList, void); MLIR_CAPI_EXPORTED MlirPDLPatternModule mlirPDLPatternModuleFromModule(MlirModule op); @@ -323,6 +326,69 @@ MLIR_CAPI_EXPORTED void mlirPDLPatternModuleDestroy(MlirPDLPatternModule op); MLIR_CAPI_EXPORTED MlirRewritePatternSet mlirRewritePatternSetFromPDLPatternModule(MlirPDLPatternModule op); + +/// Cast the MlirPDLValue to an MlirValue. +/// Return a null value if the cast fails, just like llvm::dyn_cast. +MLIR_CAPI_EXPORTED MlirValue mlirPDLValueAsValue(MlirPDLValue value); + +/// Cast the MlirPDLValue to an MlirType. +/// Return a null value if the cast fails, just like llvm::dyn_cast. +MLIR_CAPI_EXPORTED MlirType mlirPDLValueAsType(MlirPDLValue value); + +/// Cast the MlirPDLValue to an MlirOperation. +/// Return a null value if the cast fails, just like llvm::dyn_cast. +MLIR_CAPI_EXPORTED MlirOperation mlirPDLValueAsOperation(MlirPDLValue value); + +/// Cast the MlirPDLValue to an MlirAttribute. +/// Return a null value if the cast fails, just like llvm::dyn_cast. 
+MLIR_CAPI_EXPORTED MlirAttribute mlirPDLValueAsAttribute(MlirPDLValue value); + +/// Push the MlirValue into the given MlirPDLResultList. +MLIR_CAPI_EXPORTED void +mlirPDLResultListPushBackValue(MlirPDLResultList results, MlirValue value); + +/// Push the MlirType into the given MlirPDLResultList. +MLIR_CAPI_EXPORTED void mlirPDLResultListPushBackType(MlirPDLResultList results, + MlirType value); + +/// Push the MlirOperation into the given MlirPDLResultList. +MLIR_CAPI_EXPORTED void +mlirPDLResultListPushBackOperation(MlirPDLResultList results, + MlirOperation value); + +/// Push the MlirAttribute into the given MlirPDLResultList. +MLIR_CAPI_EXPORTED void +mlirPDLResultListPushBackAttribute(MlirPDLResultList results, + MlirAttribute value); + +/// This function type is used as callbacks for PDL native rewrite functions. +/// Input values can be accessed by `values` with its size `nValues`; +/// output values can be added into `results` by `mlirPDLResultListPushBack*` +/// APIs. And the return value indicates whether the rewrite succeeds. +typedef MlirLogicalResult (*MlirPDLRewriteFunction)( + MlirPatternRewriter rewriter, MlirPDLResultList results, size_t nValues, + MlirPDLValue *values, void *userData); + +/// Register a rewrite function into the given PDL pattern module. +/// `userData` will be provided as an argument to the rewrite function. +MLIR_CAPI_EXPORTED void mlirPDLPatternModuleRegisterRewriteFunction( + MlirPDLPatternModule pdlModule, MlirStringRef name, + MlirPDLRewriteFunction rewriteFn, void *userData); + +/// This function type is used as callbacks for PDL native constraint functions. +/// Input values can be accessed by `values` with its size `nValues`; +/// output values can be added into `results` by `mlirPDLResultListPushBack*` +/// APIs. And the return value indicates whether the constraint holds. +typedef MlirLogicalResult (*MlirPDLConstraintFunction)( + MlirPatternRewriter rewriter, MlirPDLResultList results, size_t nValues, + MlirPDLValue *values, void *userData); + +/// Register a constraint function into the given PDL pattern module. +/// `userData` will be provided as an argument to the constraint function. +MLIR_CAPI_EXPORTED void mlirPDLPatternModuleRegisterConstraintFunction( + MlirPDLPatternModule pdlModule, MlirStringRef name, + MlirPDLConstraintFunction constraintFn, void *userData); + #endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH #undef DEFINE_C_API_STRUCT diff --git a/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h b/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h index 8744d8d0e4bca..b5f985f803de6 100644 --- a/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h +++ b/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h @@ -19,7 +19,9 @@ #ifndef MLIR_BINDINGS_PYTHON_NANOBINDADAPTORS_H #define MLIR_BINDINGS_PYTHON_NANOBINDADAPTORS_H +#include #include +#include #include #include "mlir-c/Diagnostics.h" @@ -30,6 +32,57 @@ // clang-format on #include "llvm/ADT/Twine.h" +namespace mlir { +namespace python { +namespace { + +// Safely calls Python initialization code on first use, avoiding deadlocks. +template +class SafeInit { +public: + typedef std::unique_ptr (*F)(); + + explicit SafeInit(F init_fn) : initFn(init_fn) {} + + T &get() { + if (T *result = output.load()) { + return *result; + } + + // Note: init_fn() may be called multiple times if, for example, the GIL is + // released during its execution. The intended use case is for module + // imports which are safe to perform multiple times. 
We are careful not to + // hold a lock across init_fn() to avoid lock ordering problems. + std::unique_ptr m = initFn(); + { + nanobind::ft_lock_guard lock(mu); + if (T *result = output.load()) { + return *result; + } + T *p = m.release(); + output.store(p); + return *p; + } + } + +private: + nanobind::ft_mutex mu; + std::atomic output{nullptr}; + F initFn; +}; + +nanobind::module_ &irModule() { + static SafeInit init([]() { + return std::make_unique( + nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir"))); + }); + return init.get(); +} + +} // namespace +} // namespace python +} // namespace mlir + // Raw CAPI type casters need to be declared before use, so always include them // first. namespace nanobind { @@ -75,7 +128,7 @@ struct type_caster { cleanup_list *cleanup) noexcept { nanobind::object capsule = nanobind::steal(mlirPythonAffineMapToCapsule(v)); - return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + return mlir::python::irModule() .attr("AffineMap") .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) .release(); @@ -97,7 +150,7 @@ struct type_caster { cleanup_list *cleanup) noexcept { nanobind::object capsule = nanobind::steal(mlirPythonAttributeToCapsule(v)); - return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + return mlir::python::irModule() .attr("Attribute") .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) .attr(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR)() @@ -128,9 +181,7 @@ struct type_caster { // TODO: This raises an error of "No current context" currently. // Update the implementation to pretty-print the helpful error that the // core implementations print in this case. - src = nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) - .attr("Context") - .attr("current"); + src = mlir::python::irModule().attr("Context").attr("current"); } std::optional capsule = mlirApiObjectToCapsule(src); value = mlirPythonCapsuleToContext(capsule->ptr()); @@ -153,7 +204,7 @@ struct type_caster { cleanup_list *cleanup) noexcept { nanobind::object capsule = nanobind::steal( mlirPythonDialectRegistryToCapsule(v)); - return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + return mlir::python::irModule() .attr("DialectRegistry") .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) .release(); @@ -167,9 +218,7 @@ struct type_caster { bool from_python(handle src, uint8_t flags, cleanup_list *cleanup) noexcept { if (src.is_none()) { // Gets the current thread-bound context. 
- src = nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) - .attr("Location") - .attr("current"); + src = mlir::python::irModule().attr("Location").attr("current"); } if (auto capsule = mlirApiObjectToCapsule(src)) { value = mlirPythonCapsuleToLocation(capsule->ptr()); @@ -181,7 +230,7 @@ struct type_caster { cleanup_list *cleanup) noexcept { nanobind::object capsule = nanobind::steal(mlirPythonLocationToCapsule(v)); - return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + return mlir::python::irModule() .attr("Location") .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) .release(); @@ -203,7 +252,7 @@ struct type_caster { cleanup_list *cleanup) noexcept { nanobind::object capsule = nanobind::steal(mlirPythonModuleToCapsule(v)); - return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + return mlir::python::irModule() .attr("Module") .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) .release(); @@ -250,7 +299,7 @@ struct type_caster { return nanobind::none(); nanobind::object capsule = nanobind::steal(mlirPythonOperationToCapsule(v)); - return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + return mlir::python::irModule() .attr("Operation") .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) .release(); @@ -274,7 +323,7 @@ struct type_caster { return nanobind::none(); nanobind::object capsule = nanobind::steal(mlirPythonValueToCapsule(v)); - return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + return mlir::python::irModule() .attr("Value") .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) .attr(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR)() @@ -312,7 +361,7 @@ struct type_caster { return nanobind::none(); nanobind::object capsule = nanobind::steal(mlirPythonTypeIDToCapsule(v)); - return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + return mlir::python::irModule() .attr("TypeID") .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) .release(); @@ -334,7 +383,7 @@ struct type_caster { cleanup_list *cleanup) noexcept { nanobind::object capsule = nanobind::steal(mlirPythonTypeToCapsule(t)); - return nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) + return mlir::python::irModule() .attr("Type") .attr(MLIR_PYTHON_CAPI_FACTORY_ATTR)(capsule) .attr(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR)() @@ -453,11 +502,9 @@ class mlir_attribute_subclass : public pure_subclass { mlir_attribute_subclass(nanobind::handle scope, const char *attrClassName, IsAFunctionTy isaFunction, GetTypeIDFunctionTy getTypeIDFunction = nullptr) - : mlir_attribute_subclass( - scope, attrClassName, isaFunction, - nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) - .attr("Attribute"), - getTypeIDFunction) {} + : mlir_attribute_subclass(scope, attrClassName, isaFunction, + irModule().attr("Attribute"), + getTypeIDFunction) {} /// Subclasses with a provided mlir.ir.Attribute super-class. This must /// be used if the subclass is being defined in the same extension module @@ -540,11 +587,8 @@ class mlir_type_subclass : public pure_subclass { mlir_type_subclass(nanobind::handle scope, const char *typeClassName, IsAFunctionTy isaFunction, GetTypeIDFunctionTy getTypeIDFunction = nullptr) - : mlir_type_subclass( - scope, typeClassName, isaFunction, - nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) - .attr("Type"), - getTypeIDFunction) {} + : mlir_type_subclass(scope, typeClassName, isaFunction, + irModule().attr("Type"), getTypeIDFunction) {} /// Subclasses with a provided mlir.ir.Type super-class. 
This must /// be used if the subclass is being defined in the same extension module @@ -631,10 +675,8 @@ class mlir_value_subclass : public pure_subclass { /// Subclasses by looking up the super-class dynamically. mlir_value_subclass(nanobind::handle scope, const char *valueClassName, IsAFunctionTy isaFunction) - : mlir_value_subclass( - scope, valueClassName, isaFunction, - nanobind::module_::import_(MAKE_MLIR_PYTHON_QUALNAME("ir")) - .attr("Value")) {} + : mlir_value_subclass(scope, valueClassName, isaFunction, + irModule().attr("Value")) {} /// Subclasses with a provided mlir.ir.Value super-class. This must /// be used if the subclass is being defined in the same extension module diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index 1a37d057776e2..3c18ecc753d0f 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -1489,8 +1489,8 @@ def ConvertVectorToLLVMPass : Pass<"convert-vector-to-llvm"> { VectorContractLoweringAttr.summary, [{::llvm::cl::values( clEnumValN(::mlir::vector::VectorContractLowering::Dot, "dot", "Progressively lower to finer grained `vector.contract` and dot-products. (default)"), - clEnumValN(::mlir::vector::VectorContractLowering::Matmul, "matmul", - "Lower to `vector.matrix_multiply`, maps 1-1 to LLVM matrix intrinsics."), + clEnumValN(::mlir::vector::VectorContractLowering::LLVMIntr, "llvmintr", + "Lower directly to `llvm.intr.matrix.multiply`."), clEnumValN(::mlir::vector::VectorContractLowering::OuterProduct, "outerproduct", "Lower to `vector.outerproduct`."), clEnumValN(::mlir::vector::VectorContractLowering::ParallelArith, "parallelarith", @@ -1502,8 +1502,8 @@ def ConvertVectorToLLVMPass : Pass<"convert-vector-to-llvm"> { VectorTransposeLoweringAttr.summary, [{::llvm::cl::values( clEnumValN(::mlir::vector::VectorTransposeLowering::EltWise, "eltwise", "Lower transpose into element-wise extract and inserts (default)"), - clEnumValN(::mlir::vector::VectorTransposeLowering::Flat, "flat", - "Lower 2-D transpose to `vector.flat_transpose`, maps 1-1 to LLVM matrix intrinsics"), + clEnumValN(::mlir::vector::VectorTransposeLowering::LLVMIntr, "llvmintr", + "Lower 2-D transpose directly to `llvm.intr.matrix.transpose`"), clEnumValN(::mlir::vector::VectorTransposeLowering::Shuffle1D, "shuffle1d", "Lower 2-D transpose to `vector.shuffle` on 1-D vector."), clEnumValN(::mlir::vector::VectorTransposeLowering::Shuffle16x16, "shuffle16x16", diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index a24a918357f2d..8370d350afd1e 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -235,7 +235,7 @@ def AMDGPU_FatRawBufferCastOp : DeclareOpInterfaceMethods, ViewLikeOpInterface, AttrSizedOperandSegments]>, Arguments<(ins AnyMemRef:$source, - Optional:$validBytes, + Optional:$validBytes, Optional>:$cacheSwizzleStride, DefaultValuedAttr:$boundsCheck, UnitAttr:$resetOffset)>, @@ -680,8 +680,8 @@ def AMDGPU_PermlaneSwapOp : AMDGPU_Op<"permlane_swap", [Pure, AllTypesMatch<["re * `$fetch_inactive`: Optional. Used to dertermine behavior of a fetch from a disabled lane. `fetch_inactive = false`: If the source lane is disabled, use `bound_ctrl` to determine the source value. `fetch_inactive = true`: If the source lane is disabled, fetch the source value anyway (ignoring `bound_ctrl`). - * `$bound_ctrl`: Optional. 
Used to determine what a thread should do if its source operand is from - a disabled lane: use the value zero, or disable the write. + * `$bound_ctrl`: Optional. Used to determine what a thread should do if its source operand is from + a disabled lane: use the value zero, or disable the write. `bound_ctrl = false`: Do not write when source is from a disabled lane `bound_ctrl = true`: Use zero as input if source is from a disabled lane diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h index f3b34f9fded7f..dd693a25fd54f 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h @@ -260,12 +260,12 @@ struct BufferizationOptions { std::function; /// Initializer function for analysis state. using AnalysisStateInitFn = std::function; - /// Tensor -> MemRef type converter. - /// Parameters: tensor type, memory space, func op, bufferization options + /// Tensor-like -> Buffer-like type conversion. + /// Parameters: tensor-like type, memory space, func op, bufferization options using FunctionArgTypeConverterFn = - std::function; - /// Tensor -> MemRef type converter. + /// Tensor -> MemRef type conversion. /// Parameters: tensor type, memory space, bufferization options using UnknownTypeConverterFn = std::function; @@ -335,10 +335,12 @@ struct BufferizationOptions { /// predictable. void setFunctionBoundaryTypeConversion(LayoutMapOption layoutMapOption); - /// Type converter from tensors to memrefs. This type converter is used to - /// determine bufferized function argument and result types. By default, a - /// type converter that returns a memref type with a fully dynamic layout map - /// is used. + /// Type conversion from tensors to buffers. This type conversion is used to + /// determine bufferized function argument and result types. + /// + /// By default, if tensor is a (builtin) tensor type, it is converted to a + /// memref type with a fully dynamic layout map; if tensor is a (generic) + /// tensor-like type, it is converted using TensorLikeType::getBufferType(). /// /// If `bufferizeFunctionBoundaries` is not set, this function isn't used. FunctionArgTypeConverterFn functionArgTypeConverterFn = nullptr; @@ -350,10 +352,9 @@ struct BufferizationOptions { /// If `bufferizeFunctionBoundaries` is not set, this flag has no effect. bool inferFunctionResultLayout = true; - /// Type converter from tensors to memrefs. This type converter is used if no - /// memref type could be inferred during bufferization. By default, a type - /// converter that returns a memref type with a fully dynamic layout map is - /// used. + /// Type conversion from tensors to memrefs. This type conversion is used if + /// no memref type could be inferred during bufferization. By default, returns + /// a memref type with a fully dynamic layout map. 
   UnknownTypeConverterFn unknownTypeConverterFn = nullptr;
 
   // Use during type conversion to determine the memory space for memref based
diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
index 721f9f6b320ad..f52eb7b91dc4c 100644
--- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
+++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td
@@ -1645,7 +1645,7 @@ def EmitC_ClassOp
        return
      }
    }
-    // Class with a final specifer
+    // Class with a final specifier
    emitc.class final @modelClass {
      emitc.field @fieldName0 : !emitc.array<1xf32> = {emitc.opaque = "input_tensor"}
      emitc.func @execute() {
@@ -1667,8 +1667,6 @@ def EmitC_ClassOp
     Block &getBlock();
   }];
 
-  let hasCustomAssemblyFormat = 1;
-
   let assemblyFormat = [{ (`final` $final_specifier^)? $sym_name attr-dict-with-keyword $body }];
 }
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index 187ac9aa18aac..0c8a0c7a677ab 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -39,8 +39,19 @@ def GpuMapParallelLoopsPass
     encountered to the local workgroup. Within each mapping, the first three
     dimensions are mapped to x/y/z hardware ids and all following dimensions are
     mapped to sequential loops.
+
+    Ordering of the loop mapping against the different dimensions is controlled
+    by the `mapping-policy` option.
+    Two policies are supported:
+    1. `outermost-first` (default): the outermost loop maps to X, then Y
+       and finally Z.
+    2. `innermost-first`: the innermost loop maps to X, then Y and finally Z.
   }];
   let dependentDialects = ["mlir::gpu::GPUDialect"];
+  let options = [Option<"mappingPolicyStr", "mapping-policy", "std::string",
+    /*default=*/"\"outermost-first\"",
+    "Policy outlining how to assign loops to GPU dimensions."
+ "Supported values are `outermost-first` and `innermost-first`.">]; } def GpuEliminateBarriers diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td index 75bce6b0a0e54..147f8c2040049 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td @@ -425,17 +425,19 @@ def LLVM_DICompileUnitAttr : LLVM_Attr<"DICompileUnit", "di_compile_unit", OptionalParameter<"StringAttr">:$producer, "bool":$isOptimized, "DIEmissionKind":$emissionKind, - OptionalParameter<"DINameTableKind">:$nameTableKind + OptionalParameter<"DINameTableKind">:$nameTableKind, + OptionalParameter<"StringAttr">:$splitDebugFilename ); let builders = [ AttrBuilderWithInferredContext<(ins "DistinctAttr":$id, "unsigned":$sourceLanguage, "DIFileAttr":$file, "StringAttr":$producer, "bool":$isOptimized, "DIEmissionKind":$emissionKind, - CArg<"DINameTableKind", "DINameTableKind::Default">:$nameTableKind + CArg<"DINameTableKind", "DINameTableKind::Default">:$nameTableKind, + CArg<"StringAttr", "{}">:$splitDebugFilename ), [{ return $_get(id.getContext(), id, sourceLanguage, file, producer, - isOptimized, emissionKind, nameTableKind); + isOptimized, emissionKind, nameTableKind, splitDebugFilename); }]> ]; let assemblyFormat = "`<` struct(params) `>`"; diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index e12b8ac84ba23..398388bd720be 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -184,6 +184,15 @@ def LLVM_UMinOp : LLVM_BinarySameArgsIntrOpI<"umin">; def LLVM_SinOp : LLVM_UnaryIntrOpF<"sin">; def LLVM_CosOp : LLVM_UnaryIntrOpF<"cos">; def LLVM_TanOp : LLVM_UnaryIntrOpF<"tan">; +def LLVM_SincosOp : LLVM_TwoResultIntrOp<"sincos", [], [0], + [Pure], /*requiresFastmath=*/1> { + let arguments = + (ins LLVM_ScalarOrVectorOf:$val, + DefaultValuedAttr:$fastmathFlags); + let assemblyFormat = "`(` operands `)` attr-dict `:` " + "functional-type(operands, results)"; + let hasVerifier = 1; +} def LLVM_ASinOp : LLVM_UnaryIntrOpF<"asin">; def LLVM_ACosOp : LLVM_UnaryIntrOpF<"acos">; diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 797f8ada9f238..f56c1e5b936e6 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -2827,26 +2827,21 @@ def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp : NVVM_Op<"cp.async.bulk.tensor.shared.cluster.global", [DeclareOpInterfaceMethods, AttrSizedOperandSegments, NVVMRequiresSM<90>]>, - Arguments<(ins LLVM_PointerShared:$dstMem, - LLVM_AnyPointer:$tmaDescriptor, + Arguments<(ins AnyTypeOf<[LLVM_PointerShared, LLVM_PointerSharedCluster]>:$dstMem, + LLVM_PointerGeneric:$tmaDescriptor, Variadic:$coordinates, LLVM_PointerShared:$mbar, Variadic:$im2colOffsets, Optional:$multicastMask, Optional:$l2CacheHint, + DefaultValuedAttr:$mode, + DefaultValuedAttr:$isCTAOnly, + OptionalAttr:$group, PtxPredicate:$predicate)> { let description = [{ Initiates an asynchronous copy operation on the tensor data from global - memory to shared memory. - - The Op operates has two load modes: - 1) Tiled Mode: It's the default mode. The source multi-dimensional tensor - layout is preserved at the destination. - - 2) Im2col Mode: This mode is used when `im2colOffsets` operands are present. 
- the elements in the Bounding Box of the source tensor are rearranged into - columns at the destination. In this mode, the tensor has to be at least - 3-dimensional. + memory to shared::cluster (or) shared::cta memory. This Op supports all + the load modes specified in `TMALoadMode`. The `multicastMask` operand is optional. When it is present, the Op copies data from global memory to shared memory of multiple CTAs in the cluster. @@ -2857,6 +2852,10 @@ def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp : The `l2CacheHint` operand is optional, and it is used to specify cache eviction policy that may be used during the memory access. + When the `isCTAOnly` attribute is set to true, the destination is + shared::cta only. Hence, `multicastMask` and `CTAGroup` are not applicable + when `isCTAOnly` is true. + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor) }]; @@ -2904,6 +2903,23 @@ def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp : } }]; let hasVerifier = 1; + + let extraClassDeclaration = [{ + bool hasIntrinsic() { return !getPredicate(); } + + bool getAsmValues(RewriterBase &rewriter, + llvm::SmallVectorImpl> &asmValues); + + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); + }]; + + string llvmBuilder = [{ + auto [id, args] = NVVM::CpAsyncBulkTensorGlobalToSharedClusterOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); + }]; } def NVVM_CpAsyncBulkTensorSharedCTAToGlobalOp : @@ -3191,35 +3207,17 @@ def NVVM_CpAsyncBulkTensorReduceOp : }]; let extraClassDeclaration = [{ - static llvm::Intrinsic::ID getIntrinsicID(int tensorDims, - NVVM::TMAReduxKind kind, - bool isIm2Col); + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, + LLVM::ModuleTranslation &mt, llvm::IRBuilderBase& builder); }]; let hasVerifier = 1; string llvmBuilder = [{ - // Arguments to the intrinsic: - // shared_mem_ptr, tmaDesc, tensorDims - // cache_hint(if applicable) and flag(boolean) - llvm::SmallVector translatedOperands; - translatedOperands.push_back($srcMem); - translatedOperands.push_back($tmaDescriptor); - - for (auto v : op.getCoordinates()) - translatedOperands.push_back(moduleTranslation.lookupValue(v)); - - llvm::LLVMContext &ctx = moduleTranslation.getLLVMContext(); - auto *i64Undef = llvm::UndefValue::get(llvm::IntegerType::get(ctx, 64)); - - bool isCacheHint = op.getL2CacheHint() ? true : false; - translatedOperands.push_back(isCacheHint ? 
$l2CacheHint : i64Undef); - translatedOperands.push_back(builder.getInt1(isCacheHint)); - - auto intId = NVVM::CpAsyncBulkTensorReduceOp::getIntrinsicID( - op.getCoordinates().size(), $redKind, - (op.getMode() == NVVM::TMAStoreMode::IM2COL)); - createIntrinsicCall(builder, intId, translatedOperands); + auto [id, args] = NVVM::CpAsyncBulkTensorReduceOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); }]; } diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 1f3974846a5ef..8b687a7f29bef 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -569,7 +569,7 @@ def ROCDL_MakeBufferRsrcOp : ROCDL_IntrOp<"make.buffer.rsrc", [0], [0], [Pure], 1>, Arguments<(ins LLVM_AnyPointer:$base, I16:$stride, - I32:$numRecords, + I64:$numRecords, I32:$flags)> { let results = (outs LLVM_AnyPointer:$res); let assemblyFormat = "operands attr-dict `:` type($base) `to` type($res)"; @@ -1360,6 +1360,37 @@ def ROCDL_CvtScaleF32PkFp4F32Op : }]; } +//===----------------------------------------------------------------------===// +// FMED3 operations +//===----------------------------------------------------------------------===// + +def ROCDL_FMed3Op : ROCDL_IntrOp<"fmed3", [0], [], [Pure, AllTypesMatch<["res", "src0", "src1", "src2"]>], 1>, + Arguments<(ins LLVM_ScalarOrVectorOf:$src0, + LLVM_ScalarOrVectorOf:$src1, + LLVM_ScalarOrVectorOf:$src2)> { + let results = (outs LLVM_ScalarOrVectorOf:$res); + let summary = "Median of three float/half values"; + let description = [{ + Computes the median of three floating-point values using the AMDGPU fmed3 intrinsic. + This operation is equivalent to `max(min(a, b), min(max(a, b), c))` but uses the + hardware-accelerated V_MED3_F16/V_MED3_F32 instruction for better performance. + + The operation supports both scalar and vector floating-point types (f16, f32). + + Example: + ```mlir + // Scalar f32 median + %result = rocdl.fmed3 %a, %b, %c : f32 + + // Vector f16 median + %result = rocdl.fmed3 %va, %vb, %vc : vector<4xf16> + ``` + }]; + let assemblyFormat = [{ + $src0 `,` $src1 `,` $src2 attr-dict `:` type($res) + }]; +} + //===----------------------------------------------------------------------===// // ROCDL target attribute. //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td index f36b41ccf6745..3390f380c7eb8 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td @@ -239,6 +239,14 @@ def Linalg_PackOp : Linalg_RelayoutOp<"pack", [ ArrayRef outerDimsPerm, ArrayRef innerTiles); + // Same as above function but here dynamic dimensions are assumed + // to require padding. 
+ static bool requirePaddingValueStrict(ArrayRef inputShape, + ArrayRef innerDimsPos, + ArrayRef outputShape, + ArrayRef outerDimsPerm, + ArrayRef innerTiles); + static Value createDestinationTensor(OpBuilder &b, Location loc, Value source, ArrayRef innerTileSizes, ArrayRef innerDimsPos, ArrayRef outerDimsPerm); diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index 64d3a2448b409..7266687584b38 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -1858,6 +1858,7 @@ void populateDecomposePadPatterns(RewritePatternSet &patterns); /// Populates patterns to transform linalg.conv_2d_xxx operations into /// linalg.generic (for img2col packing) and linalg.matmul. +/// Note: currently limited to Tensor semantics only. /// \see rewriteInIm2Col for more details. void populateConvertConv2DToImg2ColPatterns(RewritePatternSet &patterns); @@ -1914,9 +1915,12 @@ void populateElementwiseOpsFusionPatterns( using ControlPropagationFn = std::function; /// Patterns to bubble up or down data layout ops across other operations. +/// The function also has an option to allow the patterns to propagate with +/// poison padding if requested by the caller. void populateDataLayoutPropagationPatterns( RewritePatternSet &patterns, - const ControlPropagationFn &controlPackUnPackPropagation); + const ControlPropagationFn &controlPackUnPackPropagation, + bool PoisonPaddingOk = false); /// Patterns to sink extract slice across other operations. void populateExtractSliceSinkingPatterns( diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h b/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h index bdec699eb4ce4..30f33ed2fd1d6 100644 --- a/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h +++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRef.h @@ -18,6 +18,7 @@ #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/InferIntRangeInterface.h" #include "mlir/Interfaces/InferTypeOpInterface.h" +#include "mlir/Interfaces/MemOpInterfaces.h" #include "mlir/Interfaces/MemorySlotInterfaces.h" #include "mlir/Interfaces/ShapedOpInterfaces.h" #include "mlir/Interfaces/SideEffectInterfaces.h" diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td index 671cc05e963b4..2bf953e32ccce 100644 --- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td +++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td @@ -15,6 +15,7 @@ include "mlir/Interfaces/CastInterfaces.td" include "mlir/Interfaces/ControlFlowInterfaces.td" include "mlir/Interfaces/InferIntRangeInterface.td" include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Interfaces/MemOpInterfaces.td" include "mlir/Interfaces/MemorySlotInterfaces.td" include "mlir/Interfaces/ShapedOpInterfaces.td" include "mlir/Interfaces/SideEffectInterfaces.td" @@ -145,7 +146,8 @@ def AssumeAlignmentOp : MemRef_Op<"assume_alignment", [ DeclareOpInterfaceMethods, Pure, ViewLikeOpInterface, - SameOperandsAndResultType + SameOperandsAndResultType, + DeclareOpInterfaceMethods ]> { let summary = "assumption that gives alignment information to the input memref"; @@ -456,6 +458,7 @@ def MemRef_AllocaScopeReturnOp : MemRef_Op<"alloca_scope.return", def MemRef_CastOp : MemRef_Op<"cast", [ DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, MemRefsNormalizable, Pure, SameOperandsAndResultShape, @@ -1194,6 +1197,7 @@ def LoadOp : MemRef_Op<"load", "memref", 
"result", "::llvm::cast($_self).getElementType()">, MemRefsNormalizable, + DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { let summary = "load operation"; @@ -1237,28 +1241,28 @@ def LoadOp : MemRef_Op<"load", OpBuilder<(ins "Value":$memref, "ValueRange":$indices, CArg<"bool", "false">:$nontemporal, - CArg<"uint64_t", "0">:$alignment), [{ + CArg<"llvm::MaybeAlign", "llvm::MaybeAlign()">:$alignment), [{ return build($_builder, $_state, memref, indices, nontemporal, - alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : - nullptr); + alignment ? $_builder.getI64IntegerAttr(alignment->value()) : + nullptr); }]>, OpBuilder<(ins "Type":$resultType, "Value":$memref, "ValueRange":$indices, CArg<"bool", "false">:$nontemporal, - CArg<"uint64_t", "0">:$alignment), [{ + CArg<"llvm::MaybeAlign", "llvm::MaybeAlign()">:$alignment), [{ return build($_builder, $_state, resultType, memref, indices, nontemporal, - alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : - nullptr); + alignment ? $_builder.getI64IntegerAttr(alignment->value()) : + nullptr); }]>, OpBuilder<(ins "TypeRange":$resultTypes, "Value":$memref, "ValueRange":$indices, CArg<"bool", "false">:$nontemporal, - CArg<"uint64_t", "0">:$alignment), [{ + CArg<"llvm::MaybeAlign", "llvm::MaybeAlign()">:$alignment), [{ return build($_builder, $_state, resultTypes, memref, indices, nontemporal, - alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : - nullptr); + alignment ? $_builder.getI64IntegerAttr(alignment->value()) : + nullptr); }]> ]; @@ -1284,6 +1288,7 @@ def LoadOp : MemRef_Op<"load", def MemRef_MemorySpaceCastOp : MemRef_Op<"memory_space_cast", [ DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, + MemorySpaceCastOpInterface, MemRefsNormalizable, Pure, SameOperandsAndResultElementType, @@ -1302,6 +1307,10 @@ def MemRef_MemorySpaceCastOp : MemRef_Op<"memory_space_cast", [ If the source and target address spaces are the same, this operation is a noop. + Finally, if the target memory-space is the generic/default memory-space, + then it is assumed this cast can be bubbled down safely. See the docs of + `MemorySpaceCastOpInterface` interface for more details. + Example: ```mlir @@ -1321,6 +1330,27 @@ def MemRef_MemorySpaceCastOp : MemRef_Op<"memory_space_cast", [ let extraClassDeclaration = [{ Value getViewSource() { return getSource(); } + + //===------------------------------------------------------------------===// + // MemorySpaceCastConsumerOpInterface + //===------------------------------------------------------------------===// + /// Returns the `source` memref. + TypedValue getSourcePtr(); + /// Returns the `dest` memref. + TypedValue getTargetPtr(); + /// Returns whether the memory-space cast is valid. Only casts between + /// memrefs are considered valid. Further, the `tgt` and `src` should only + /// differ on the memory-space parameter of the memref type. + bool isValidMemorySpaceCast(PtrLikeTypeInterface tgt, + PtrLikeTypeInterface src); + /// Clones the operation using a new target type and source value. + MemorySpaceCastOpInterface cloneMemorySpaceCastOp( + OpBuilder &b, PtrLikeTypeInterface tgt, + TypedValue src); + /// Returns whether the `source` value can be promoted by the + /// `MemorySpaceCastConsumerOpInterface::bubbleDownCasts` method. The only + /// casts the op recognizes as promotable are to the generic memory-space. 
+ bool isSourcePromotable(); }]; let hasFolder = 1; @@ -1376,6 +1406,7 @@ def MemRef_PrefetchOp : MemRef_Op<"prefetch"> { def MemRef_ReinterpretCastOp : MemRef_OpWithOffsetSizesAndStrides<"reinterpret_cast", [ DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, AttrSizedOperandSegments, MemRefsNormalizable, Pure, @@ -1603,6 +1634,7 @@ def MemRef_RankOp : MemRef_Op<"rank", [Pure]> { def MemRef_ReshapeOp: MemRef_Op<"reshape", [ DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, Pure, ViewLikeOpInterface]> { let summary = "memref reshape operation"; @@ -1701,6 +1733,7 @@ class MemRef_ReassociativeReshapeOp traits = []> : def MemRef_ExpandShapeOp : MemRef_ReassociativeReshapeOp<"expand_shape", [ DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { let summary = "operation to produce a memref with a higher rank."; let description = [{ @@ -1822,7 +1855,9 @@ def MemRef_ExpandShapeOp : MemRef_ReassociativeReshapeOp<"expand_shape", [ } def MemRef_CollapseShapeOp : MemRef_ReassociativeReshapeOp<"collapse_shape", [ - DeclareOpInterfaceMethods]> { + DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods + ]> { let summary = "operation to produce a memref with a smaller rank."; let description = [{ The `memref.collapse_shape` op produces a new view with a smaller rank @@ -1929,6 +1964,7 @@ def MemRef_StoreOp : MemRef_Op<"store", "memref", "value", "::llvm::cast($_self).getElementType()">, MemRefsNormalizable, + DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { let summary = "store operation"; @@ -1971,10 +2007,10 @@ def MemRef_StoreOp : MemRef_Op<"store", "Value":$memref, "ValueRange":$indices, CArg<"bool", "false">:$nontemporal, - CArg<"uint64_t", "0">:$alignment), [{ + CArg<"llvm::MaybeAlign", "llvm::MaybeAlign()">:$alignment), [{ return build($_builder, $_state, valueToStore, memref, indices, nontemporal, - alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : - nullptr); + alignment ? 
$_builder.getI64IntegerAttr(alignment->value()) : + nullptr); }]>, OpBuilder<(ins "Value":$valueToStore, "Value":$memref), [{ $_state.addOperands(valueToStore); @@ -2006,6 +2042,7 @@ def MemRef_StoreOp : MemRef_Op<"store", def SubViewOp : MemRef_OpWithOffsetSizesAndStrides<"subview", [ DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface, @@ -2281,6 +2318,7 @@ def SubViewOp : MemRef_OpWithOffsetSizesAndStrides<"subview", [ def MemRef_TransposeOp : MemRef_Op<"transpose", [ DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, Pure]>, Arguments<(ins AnyStridedMemRef:$in, AffineMapAttr:$permutation)>, Results<(outs AnyStridedMemRef)> { @@ -2316,6 +2354,7 @@ def MemRef_TransposeOp : MemRef_Op<"transpose", [ def MemRef_ViewOp : MemRef_Op<"view", [ DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, Pure]> { let summary = "memref view operation"; @@ -2392,6 +2431,7 @@ def MemRef_ViewOp : MemRef_Op<"view", [ //===----------------------------------------------------------------------===// def AtomicRMWOp : MemRef_Op<"atomic_rmw", [ + DeclareOpInterfaceMethods, AllTypesMatch<["value", "result"]>, TypesMatchWith<"value type matches element type of memref", "memref", "value", diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td index 9dbe6897a3304..f693a0737e0fc 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPEnums.td @@ -230,14 +230,24 @@ def TargetRegionFlagsNone : I32BitEnumAttrCaseNone<"none">; def TargetRegionFlagsGeneric : I32BitEnumAttrCaseBit<"generic", 0>; def TargetRegionFlagsSpmd : I32BitEnumAttrCaseBit<"spmd", 1>; def TargetRegionFlagsTripCount : I32BitEnumAttrCaseBit<"trip_count", 2>; +def TargetRegionFlagsNoLoop : I32BitEnumAttrCaseBit<"no_loop", 3>; def TargetRegionFlags : OpenMP_BitEnumAttr< "TargetRegionFlags", - "target region property flags", [ + "These flags describe properties of the target kernel. " + "TargetRegionFlagsGeneric - denotes generic kernel. " + "TargetRegionFlagsSpmd - denotes SPMD kernel. " + "TargetRegionFlagsNoLoop - denotes kernel where " + "num_teams * num_threads >= loop_trip_count. It allows the conversion " + "of loops into sequential code by ensuring that each team/thread " + "executes at most one iteration. " + "TargetRegionFlagsTripCount - checks if the loop trip count should be " + "calculated.", [ TargetRegionFlagsNone, TargetRegionFlagsGeneric, TargetRegionFlagsSpmd, - TargetRegionFlagsTripCount + TargetRegionFlagsTripCount, + TargetRegionFlagsNoLoop ]>; //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Ptr/IR/PtrEnums.td b/mlir/include/mlir/Dialect/Ptr/IR/PtrEnums.td index c169f48e573d0..c97bd04d32896 100644 --- a/mlir/include/mlir/Dialect/Ptr/IR/PtrEnums.td +++ b/mlir/include/mlir/Dialect/Ptr/IR/PtrEnums.td @@ -79,4 +79,14 @@ def Ptr_PtrAddFlags : I32Enum<"PtrAddFlags", "Pointer add flags", [ let cppNamespace = "::mlir::ptr"; } +//===----------------------------------------------------------------------===// +// Ptr diff flags enum properties. 
+//===----------------------------------------------------------------------===// + +def Ptr_PtrDiffFlags : I8BitEnum<"PtrDiffFlags", "Pointer difference flags", [ + I8BitEnumCase<"none", 0>, I8BitEnumCase<"nuw", 1>, I8BitEnumCase<"nsw", 2> + ]> { + let cppNamespace = "::mlir::ptr"; +} + #endif // PTR_ENUMS diff --git a/mlir/include/mlir/Dialect/Ptr/IR/PtrOps.td b/mlir/include/mlir/Dialect/Ptr/IR/PtrOps.td index 468a3004d5c62..e14f64330c294 100644 --- a/mlir/include/mlir/Dialect/Ptr/IR/PtrOps.td +++ b/mlir/include/mlir/Dialect/Ptr/IR/PtrOps.td @@ -415,6 +415,63 @@ def Ptr_PtrAddOp : Pointer_Op<"ptr_add", [ }]; } +//===----------------------------------------------------------------------===// +// PtrDiffOp +//===----------------------------------------------------------------------===// + +def Ptr_PtrDiffOp : Pointer_Op<"ptr_diff", [ + Pure, AllTypesMatch<["lhs", "rhs"]>, SameOperandsAndResultShape + ]> { + let summary = "Pointer difference operation"; + let description = [{ + The `ptr_diff` operation computes the difference between two pointers, + returning an integer or index value representing the number of bytes + between them. + + The operation supports both scalar and shaped types with value semantics: + - When both operands are scalar: produces a single difference value + - When both are shaped: performs element-wise subtraction, + shapes must be the same + + The operation also supports the following flags: + - `none`: No flags are set. + - `nuw`: No Unsigned Wrap, if the subtraction causes an unsigned overflow + (that is: the result would be negative), the result is a poison value. + - `nsw`: No Signed Wrap, if the subtraction causes a signed overflow, the + result is a poison value. + + NOTE: The pointer difference is calculated using an integer type specified + by the data layout. The final result will be sign-extended or truncated to + fit the result type as necessary. + + Example: + + ```mlir + // Scalar pointers + %diff = ptr.ptr_diff %p1, %p2 : !ptr.ptr<#ptr.generic_space> -> i64 + + // Shaped pointers + %diffs = ptr.ptr_diff nsw %ptrs1, %ptrs2 : + vector<4x!ptr.ptr<#ptr.generic_space>> -> vector<4xi64> + ``` + }]; + let arguments = (ins + Ptr_PtrLikeType:$lhs, Ptr_PtrLikeType:$rhs, + DefaultValuedProp, "PtrDiffFlags::none">:$flags + ); + let results = (outs Ptr_IntLikeType:$result); + let assemblyFormat = [{ + ($flags^)? $lhs `,` $rhs attr-dict `:` type($lhs) `->` type($result) + }]; + let extraClassDeclaration = [{ + /// Returns the operand's ptr type. + ptr::PtrType getPtrType(); + /// Returns the result's underlying int type. + Type getIntType(); + }]; + let hasVerifier = 1; +} + //===----------------------------------------------------------------------===// // ScatterOp //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h index 3205da6e448fc..668ee6386f71f 100644 --- a/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h +++ b/mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h @@ -33,6 +33,14 @@ using SCFTileSizeComputationFunction = /// Options to use to control tiling. struct SCFTilingOptions { + /// Specify which loop construct to use for tile and fuse. 
+ enum class LoopType { ForOp, ForallOp, CustomOp }; + LoopType loopType = LoopType::ForOp; + SCFTilingOptions &setLoopType(LoopType type) { + loopType = type; + return *this; + } + /// Computation function that returns the tile sizes to use for each loop. /// Returning a tile size of zero implies no tiling for that loop. If the /// size of the returned vector is smaller than the number of loops, the inner @@ -50,6 +58,17 @@ struct SCFTilingOptions { /// proper interaction with folding. SCFTilingOptions &setTileSizes(ArrayRef tileSizes); + /// The interchange vector to reorder the tiled loops. + SmallVector interchangeVector = {}; + SCFTilingOptions &setInterchange(ArrayRef interchange) { + interchangeVector = llvm::to_vector(interchange); + return *this; + } + + //-------------------------------------------------------------------------// + // Options related to tiling using `scf.forall`. + //-------------------------------------------------------------------------// + /// Computation function that returns the number of threads to use for /// each loop. Returning a num threads of zero implies no tiling for that /// loop. If the size of the returned vector is smaller than the number of @@ -70,21 +89,6 @@ struct SCFTilingOptions { /// function that computes num threads at the point they are needed. SCFTilingOptions &setNumThreads(ArrayRef numThreads); - /// The interchange vector to reorder the tiled loops. - SmallVector interchangeVector = {}; - SCFTilingOptions &setInterchange(ArrayRef interchange) { - interchangeVector = llvm::to_vector(interchange); - return *this; - } - - /// Specify which loop construct to use for tile and fuse. - enum class LoopType { ForOp, ForallOp }; - LoopType loopType = LoopType::ForOp; - SCFTilingOptions &setLoopType(LoopType type) { - loopType = type; - return *this; - } - /// Specify mapping of loops to devices. This is only respected when the loop /// constructs support such a mapping (like `scf.forall`). Will be ignored /// when using loop constructs that dont support such a mapping (like @@ -117,6 +121,96 @@ struct SCFTilingOptions { reductionDims.insert(dims.begin(), dims.end()); return *this; } + + //-------------------------------------------------------------------------// + // Options related to tiling using custom loop. + //-------------------------------------------------------------------------// + + // For generating the inter-tile loops using a custom loop, two callback + // functions are needed + // 1. That generates the "loop header", i.e. the loop that iterates over the + // different tiles. + // 2. That generates the loop terminator + // + // For `scf.forall` case the call back to generate loop header would generate + // + // ```mlir + // scf.forall (...) = ... { + // .. + // } + // ``` + // + // and the call back to generate the loop terminator would generate the + // `scf.in_parallel` region + // + // ```mlir + // scf.forall (...) = ... { + // scf.in_parallel { + // tensor.parallel_insert_slice ... + // } + // } + // ``` + // + + // Information that is to be returned by loop header callback needed for the + // rest of the tiled codegeneration. + // - `loops`: The generated loops + // - `tileOffset`: The values that represent the offset of the iteration space + // tile. + // - `tileSizes` : The values that represent the size of the iteration space + // tile. + // - `destinationTensors` : The tensors to use as destinations during tiling. 
+ struct CustomLoopHeaderInfo { + SmallVector loops; + SmallVector tileOffset; + SmallVector tileSizes; + SmallVector destinationTensors; + }; + + // Type of the callback function that generates the loop headers. + // - `loopRanges` : Values that represent the full size of the iteration space + // being tiled. + // - `givenTileSizes` : The tile sizes that are to be used to tile the + // iteration space. + // - `destinationTensors` : The tensors to use as destinations for the results + // of the tiled loop for loops that implement + // `DestinationStyleOpInterface`. + // Returns the `CustomLoopHeaderInfo` object (described above). it is expected + // that this function sets the insertion point of `rewriter` to the program + // point where the intra-tile loop computation is to be generated. + using GenerateLoopHeaderFn = std::function( + RewriterBase &rewriter, Location loc, ArrayRef loopRanges, + ArrayRef givenTileSizes, ValueRange destinationTensors)>; + + // Type of the callback function that generates the loop terminator. + // - `tiledResults` : Tiles of the result computed for the iteration space + // tile. + // - `resultOffsets` : For each of the `tiledResults`, the offset at which + // the result tile is to be "inserted" back into the + // destination tensor. + // - `resultSizes` : For each of the `tiledResults`, the size of the result + // tile that is to be "inserted" back into the destination + // tensor. + // Returns the `CustomLoopHeaderInfo` object (described above) + using GenerateLoopTerminatorFn = std::function> resultOffsets, + ArrayRef> resultSizes, + ValueRange destinationTensors)>; + + // Callback function to generate the inter-tile loop header. + GenerateLoopHeaderFn generateLoopHeaderFn = nullptr; + // Callback function to generate the inter-tile loop terminator. + GenerateLoopTerminatorFn generateLoopTerminatorFn = nullptr; + // Helper function to set the callbacks for inter-tile loop header and + // terminator functions when using a custom operation for the loop. + SCFTilingOptions & + setCustomLoopGenerationFns(GenerateLoopHeaderFn headerFn, + GenerateLoopTerminatorFn terminatorFn) { + generateLoopHeaderFn = std::move(headerFn); + generateLoopTerminatorFn = std::move(terminatorFn); + return *this; + } }; /// Transformation information returned after tiling. diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h index 6beffc17d6d58..e46b576810316 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVTypes.h @@ -88,11 +88,6 @@ class ScalarType : public SPIRVType { static bool isValid(FloatType); /// Returns true if the given float type is valid for the SPIR-V dialect. static bool isValid(IntegerType); - - void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage = std::nullopt); - - std::optional getSizeInBytes(); }; // SPIR-V composite type: VectorType, SPIR-V ArrayType, SPIR-V @@ -115,11 +110,6 @@ class CompositeType : public SPIRVType { /// Return true if the number of elements is known at compile time and is not /// implementation dependent. bool hasCompileTimeKnownNumElements() const; - - void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage = std::nullopt); - - std::optional getSizeInBytes(); }; // SPIR-V array type @@ -143,13 +133,6 @@ class ArrayType : public Type::TypeBase storage = std::nullopt); - - /// Returns the array size in bytes. 
Since array type may have an explicit - /// stride declaration (in bytes), we also include it in the calculation. - std::optional getSizeInBytes(); }; // SPIR-V image type @@ -186,9 +169,6 @@ class ImageType ImageSamplerUseInfo getSamplerUseInfo() const; ImageFormat getImageFormat() const; // TODO: Add support for Access qualifier - - void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage = std::nullopt); }; // SPIR-V pointer type @@ -204,9 +184,6 @@ class PointerType : public Type::TypeBase storage = std::nullopt); }; // SPIR-V run-time array type @@ -228,9 +205,6 @@ class RuntimeArrayType /// Returns the array stride in bytes. 0 means no stride decorated on this /// type. unsigned getArrayStride() const; - - void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage = std::nullopt); }; // SPIR-V sampled image type @@ -252,10 +226,6 @@ class SampledImageType Type imageType); Type getImageType() const; - - void - getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage = std::nullopt); }; /// SPIR-V struct type. Two kinds of struct types are supported: @@ -405,9 +375,6 @@ class StructType trySetBody(ArrayRef memberTypes, ArrayRef offsetInfo = {}, ArrayRef memberDecorations = {}, ArrayRef structDecorations = {}); - - void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage = std::nullopt); }; llvm::hash_code @@ -440,9 +407,6 @@ class CooperativeMatrixType /// Returns the use parameter of the cooperative matrix. CooperativeMatrixUseKHR getUse() const; - void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage = std::nullopt); - operator ShapedType() const { return llvm::cast(*this); } ArrayRef getShape() const; @@ -493,9 +457,6 @@ class MatrixType : public Type::TypeBase storage = std::nullopt); }; /// SPIR-V TensorARM Type @@ -531,9 +492,6 @@ class TensorArmType ArrayRef getShape() const; bool hasRank() const { return !getShape().empty(); } operator ShapedType() const { return llvm::cast(*this); } - - void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage = std::nullopt); }; } // namespace spirv diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 953e7c304da85..48759f2a3c9e8 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -1331,8 +1331,6 @@ def Tosa_ExpOp : Tosa_ElementwiseUnaryOp<"exp"> { Extension<[Tosa_EXT_BF16]>, ]; - let hasFolder = 1; - let assemblyFormat = "operands attr-dict `:` functional-type(operands, results)"; } @@ -1385,8 +1383,6 @@ def Tosa_LogOp : Tosa_ElementwiseUnaryOp<"log"> { Extension<[Tosa_EXT_BF16]>, ]; - let hasFolder = 1; - let assemblyFormat = "operands attr-dict `:` functional-type(operands, results)"; } diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td index 553d69cc21d17..93ab120339d55 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td @@ -282,8 +282,7 @@ def Tosa_Shape : Tosa_Type<"shape", "shape"> { !tosa.shape<0> ``` }]; - let parameters = (ins "int" : $rank); - let builders = [TypeBuilder<(ins "int" : $rank)>]; + let parameters = (ins "int":$rank); let assemblyFormat = "`<` $rank `>`"; let genVerifyDecl = 1; diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h 
b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h index 63410b8bea747..bbf55f5d507e3 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.h @@ -27,6 +27,7 @@ #include "mlir/Interfaces/DestinationStyleOpInterface.h" #include "mlir/Interfaces/IndexingMapOpInterface.h" #include "mlir/Interfaces/InferTypeOpInterface.h" +#include "mlir/Interfaces/MemOpInterfaces.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Interfaces/VectorInterfaces.h" #include "mlir/Interfaces/ViewLikeInterface.h" diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index 26d06624cb976..252c0b72456df 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -24,6 +24,7 @@ include "mlir/Interfaces/DestinationStyleOpInterface.td" include "mlir/Interfaces/IndexingMapOpInterface.td" include "mlir/Interfaces/InferIntRangeInterface.td" include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Interfaces/MemOpInterfaces.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/VectorInterfaces.td" include "mlir/Interfaces/ViewLikeInterface.td" @@ -1246,6 +1247,7 @@ def Vector_TransferReadOp : DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, AttrSizedOperandSegments, DestinationStyleOpInterface ]>, @@ -1493,6 +1495,7 @@ def Vector_TransferWriteOp : DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, AttrSizedOperandSegments, DestinationStyleOpInterface ]>, @@ -1649,6 +1652,7 @@ def Vector_TransferWriteOp : def Vector_LoadOp : Vector_Op<"load", [ DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods ]> { let summary = "reads an n-D slice of memory into an n-D vector"; let description = [{ @@ -1765,6 +1769,7 @@ def Vector_LoadOp : Vector_Op<"load", [ def Vector_StoreOp : Vector_Op<"store", [ DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods ]> { let summary = "writes an n-D vector to an n-D slice of memory"; let description = [{ @@ -1869,7 +1874,7 @@ def Vector_StoreOp : Vector_Op<"store", [ } def Vector_MaskedLoadOp : - Vector_Op<"maskedload">, + Vector_Op<"maskedload", [DeclareOpInterfaceMethods]>, Arguments<(ins Arg:$base, Variadic:$indices, VectorOfNonZeroRankOf<[I1]>:$mask, @@ -1961,7 +1966,7 @@ def Vector_MaskedLoadOp : } def Vector_MaskedStoreOp : - Vector_Op<"maskedstore">, + Vector_Op<"maskedstore", [DeclareOpInterfaceMethods]>, Arguments<(ins Arg:$base, Variadic:$indices, VectorOfNonZeroRankOf<[I1]>:$mask, @@ -2041,6 +2046,7 @@ def Vector_MaskedStoreOp : def Vector_GatherOp : Vector_Op<"gather", [ DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, DeclareOpInterfaceMethods ]>, Arguments<(ins Arg, "", [MemRead]>:$base, @@ -2144,7 +2150,7 @@ def Vector_GatherOp : } def Vector_ScatterOp : - Vector_Op<"scatter">, + Vector_Op<"scatter", [DeclareOpInterfaceMethods]>, Arguments<(ins Arg:$base, Variadic:$offsets, VectorOfNonZeroRankOf<[AnyInteger, Index]>:$indices, @@ -2229,7 +2235,7 @@ def Vector_ScatterOp : } def Vector_ExpandLoadOp : - Vector_Op<"expandload">, + Vector_Op<"expandload", [DeclareOpInterfaceMethods]>, Arguments<(ins Arg:$base, Variadic:$indices, FixedVectorOfNonZeroRankOf<[I1]>:$mask, @@ -2317,7 +2323,7 @@ def Vector_ExpandLoadOp : } def Vector_CompressStoreOp : - Vector_Op<"compressstore">, + Vector_Op<"compressstore", [DeclareOpInterfaceMethods]>, 
Arguments<(ins Arg:$base, Variadic:$indices, FixedVectorOfNonZeroRankOf<[I1]>:$mask, diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransformsBase.td b/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransformsBase.td index ef0951ab1d166..34febf2c4ff4b 100644 --- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransformsBase.td +++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorTransformsBase.td @@ -14,10 +14,9 @@ include "mlir/IR/EnumAttr.td" // Lower transpose into element-wise extract and inserts. def VectorTransposeLowering_Elementwise: I32EnumAttrCase<"EltWise", 0, "eltwise">; -// Lower 2-D transpose to `vector.flat_transpose`, maps 1-1 to LLVM matrix -// intrinsics. -def VectorTransposeLowering_FlatTranspose: - I32EnumAttrCase<"Flat", 1, "flat_transpose">; +// Lower directly to LLVM matrix intrinsics. +def VectorTransposeLowering_LLVMIntr: + I32EnumAttrCase<"LLVMIntr", 1, "llvmintr">; // Lower 2-D transpose to `vector.shuffle` on 1-D vector. def VectorTransposeLowering_Shuffle1D: I32EnumAttrCase<"Shuffle1D", 2, "shuffle_1d">; @@ -27,7 +26,7 @@ def VectorTransposeLowering_Shuffle16x16: def VectorTransposeLoweringAttr : I32EnumAttr< "VectorTransposeLowering", "control the lowering of `vector.transpose` operations.", - [VectorTransposeLowering_Elementwise, VectorTransposeLowering_FlatTranspose, + [VectorTransposeLowering_Elementwise, VectorTransposeLowering_LLVMIntr, VectorTransposeLowering_Shuffle1D, VectorTransposeLowering_Shuffle16x16]> { let cppNamespace = "::mlir::vector"; } @@ -48,9 +47,9 @@ def VectorMultiReductionLoweringAttr: I32EnumAttr< // Progressively lower to finer grained `vector.contract` and dot-products. def VectorContractLowering_Dot: I32EnumAttrCase<"Dot", 0, "dot">; -// Lower to `vector.matrix_multiply`, maps 1-1 to LLVM matrix intrinsics. -def VectorContractLowering_Matmul: - I32EnumAttrCase<"Matmul", 1, "matmulintrinsics">; +// Lower directly to LLVM intrinsics. +def VectorContractLowering_LLVMIntr: + I32EnumAttrCase<"LLVMIntr", 1, "llvmintr">; // Lower to `vector.outerproduct`. def VectorContractLowering_OuterProduct: I32EnumAttrCase<"OuterProduct", 2, "outerproduct">; @@ -61,7 +60,7 @@ def VectorContractLowering_ParallelArith: def VectorContractLoweringAttr: I32EnumAttr< "VectorContractLowering", "control the lowering of `vector.contract` operations.", - [VectorContractLowering_Dot, VectorContractLowering_Matmul, + [VectorContractLowering_Dot, VectorContractLowering_LLVMIntr, VectorContractLowering_OuterProduct, VectorContractLowering_ParallelArith]> { let cppNamespace = "::mlir::vector"; } diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h index 97163c4532378..a57aadcdcc5b0 100644 --- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h +++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h @@ -227,7 +227,8 @@ bool isLinearizableVector(VectorType type); /// /// Note: all read offsets are set to 0. 
Value createReadOrMaskedRead(OpBuilder &builder, Location loc, Value source, - ArrayRef inputVectorSizes, Value padValue, + ArrayRef inputVectorSizes, + std::optional padValue = std::nullopt, bool useInBoundsInsteadOfMasking = false, ArrayRef inputScalableVecDims = {}); diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h index 44b81796b1313..b74c15e5b7ac1 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h @@ -9,9 +9,9 @@ #ifndef MLIR_DIALECT_XEGPU_TRANSFORMS_TRANSFORMS_H #define MLIR_DIALECT_XEGPU_TRANSFORMS_TRANSFORMS_H +#include "mlir/IR/Operation.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/LogicalResult.h" -#include "mlir/IR/Operation.h" #include #include @@ -47,9 +47,11 @@ struct UnrollOptions { /// Function that converts a ShapedType (TensorDescType or VectorType) /// into the unrolled type based on the tileShape. It returns a vector of - /// types representing the unrolled types for simplicity. + /// types representing the unrolled types for simplicity. When + /// `returnSingleType` is true, it returns a vector containing a single + /// unrolled type. using UnrolledTypeFnType = std::function( - ShapedType type, ArrayRef tileShape)>; + ShapedType type, ArrayRef tileShape, bool returnSingleType)>; UnrolledTypeFnType getUnrolledTypes = nullptr; UnrollOptions &setUnrolledTypesFn(UnrolledTypeFnType fn) { getUnrolledTypes = std::move(fn); diff --git a/mlir/include/mlir/IR/Builders.h b/mlir/include/mlir/IR/Builders.h index 9d8d81a839fcb..9205f16f97bbb 100644 --- a/mlir/include/mlir/IR/Builders.h +++ b/mlir/include/mlir/IR/Builders.h @@ -515,6 +515,12 @@ class OpBuilder : public Builder { /// Create an operation of specific op type at the current insertion point, /// and immediately try to fold it. This function populates 'results' with /// the results of the operation. + /// + /// Note: This performs opportunistic eager folding during IR construction. + /// The folders are designed to operate efficiently on canonical IR, which + /// this API does not enforce. Complete folding is only expected in the + /// context of canonicalization, which intertwines folders with pattern + /// rewrites until a fixed point is reached. template void createOrFold(SmallVectorImpl &results, Location location, Args &&...args) { diff --git a/mlir/include/mlir/IR/PDLPatternMatch.h.inc b/mlir/include/mlir/IR/PDLPatternMatch.h.inc index 96ba98a850de0..d5fb57d7c360d 100644 --- a/mlir/include/mlir/IR/PDLPatternMatch.h.inc +++ b/mlir/include/mlir/IR/PDLPatternMatch.h.inc @@ -53,7 +53,7 @@ public: /// value is not an instance of `T`. template ::value, T, std::optional>> + std::is_constructible_v, T, std::optional>> ResultT dyn_cast() const { return isa() ?
castImpl() : ResultT(); } diff --git a/mlir/include/mlir/Interfaces/CMakeLists.txt b/mlir/include/mlir/Interfaces/CMakeLists.txt index 2add220fdfb7c..a5feb592045c0 100644 --- a/mlir/include/mlir/Interfaces/CMakeLists.txt +++ b/mlir/include/mlir/Interfaces/CMakeLists.txt @@ -8,6 +8,7 @@ add_mlir_interface(IndexingMapOpInterface) add_mlir_interface(InferIntRangeInterface) add_mlir_interface(InferTypeOpInterface) add_mlir_interface(LoopLikeInterface) +add_mlir_interface(MemOpInterfaces) add_mlir_interface(ParallelCombiningOpInterface) add_mlir_interface(RuntimeVerifiableOpInterface) add_mlir_interface(ShapedOpInterfaces) diff --git a/mlir/include/mlir/Interfaces/MemOpInterfaces.h b/mlir/include/mlir/Interfaces/MemOpInterfaces.h new file mode 100644 index 0000000000000..cdc423f5da1a5 --- /dev/null +++ b/mlir/include/mlir/Interfaces/MemOpInterfaces.h @@ -0,0 +1,36 @@ +//===- MemOpInterfaces.h - Memory operation interfaces ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains declarations of interfaces for operations that interact +// with memory. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_INTERFACES_MEMOPINTERFACES_H +#define MLIR_INTERFACES_MEMOPINTERFACES_H + +#include "mlir/IR/OpDefinition.h" + +namespace mlir { +namespace detail { +/// Attempt to verify the given memory space cast operation. +LogicalResult verifyMemorySpaceCastOpInterface(Operation *op); + +/// Tries to bubble-down inplace a `MemorySpaceCastOpInterface` operation +/// referenced by `operand`. On success, it returns `std::nullopt`. It +/// returns failure if `operand` doesn't reference a +/// `MemorySpaceCastOpInterface` op. +FailureOr>> +bubbleDownInPlaceMemorySpaceCastImpl(OpOperand &operand, ValueRange results); +} // namespace detail +} // namespace mlir + +/// Include the generated interface declarations. +#include "mlir/Interfaces/MemOpInterfaces.h.inc" + +#endif // MLIR_INTERFACES_MEMOPINTERFACES_H diff --git a/mlir/include/mlir/Interfaces/MemOpInterfaces.td b/mlir/include/mlir/Interfaces/MemOpInterfaces.td new file mode 100644 index 0000000000000..1a64e97c3412d --- /dev/null +++ b/mlir/include/mlir/Interfaces/MemOpInterfaces.td @@ -0,0 +1,125 @@ +//===- MemOpInterfaces.td - Memory operation interfaces -----*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains interfaces for operations that interact with memory. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_INTERFACES_MEMOPINTERFACES_TD +#define MLIR_INTERFACES_MEMOPINTERFACES_TD + +include "mlir/IR/OpBase.td" + +def MemorySpaceCastConsumerOpInterface : + OpInterface<"MemorySpaceCastConsumerOpInterface"> { + let description = [{ + An interface for operations that can consume memory-space cast-like + operations. + + This interface can be used to bubble-down memory-space cast operations, + see the `bubble-down-memory-space-casts` pass for an example. 
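To make the bubbling contract concrete, a rough sketch of the kind of driver pattern such a pass could use is shown below. `BubbleDownCastsPattern` is a hypothetical name, and a production driver would additionally notify the rewriter about in-place updates; this sketch only exercises the interface as described here:

```cpp
#include <optional>

#include "mlir/IR/PatternMatch.h"
#include "mlir/Interfaces/MemOpInterfaces.h"

using namespace mlir;

// Hypothetical driver pattern: ask each consumer op to swallow incoming casts.
struct BubbleDownCastsPattern
    : public OpInterfaceRewritePattern<MemorySpaceCastConsumerOpInterface> {
  using OpInterfaceRewritePattern::OpInterfaceRewritePattern;

  LogicalResult matchAndRewrite(MemorySpaceCastConsumerOpInterface op,
                                PatternRewriter &rewriter) const override {
    FailureOr<std::optional<SmallVector<Value>>> replacements =
        op.bubbleDownCasts(rewriter);
    if (failed(replacements))
      return failure();
    // std::nullopt: the op consumed the cast in place; nothing to replace.
    if (!replacements->has_value())
      return success();
    // Otherwise the interface produced compatible replacement values and
    // guarantees the original op may be erased.
    rewriter.replaceOp(op, **replacements);
    return success();
  }
};
```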
+ }]; + let cppNamespace = "::mlir"; + let methods = [ + InterfaceMethod<[{ + Attempt to bubble down the incoming cast-like operands. On success + it returns a `std::optional>`; otherwise it returns + failure. If the optional is `std::nullopt`, the cast was performed + in place; otherwise the method returns a list of replacement values. + If new results are produced, these must be compatible with the original + operation results. + + If the operation was not modified in place, then the interface + guarantees it is valid to erase the original operation. + If the operation was modified in place, then the interface must + guarantee no operations were created by the method, and that no further + IR modification is necessary. + + Any implementation of this method must not erase/replace the original + operation; instead, it is the caller's responsibility to erase or replace + the op with the results provided by the method. + + Finally, any implementation of this method has to guarantee that the + IR remains valid at all times. + }], + "::llvm::FailureOr>>", + "bubbleDownCasts", + (ins "::mlir::OpBuilder &":$builder) + >, + ]; +} + +def MemorySpaceCastOpInterface : OpInterface<"MemorySpaceCastOpInterface"> { + let description = [{ + An interface for operations that perform memory-space casts. This + interface assumes that the cast operation is `pure`. + + These operations are expected to have a well-defined source ptr-like operand, and + a well-defined target ptr-like result. + + This interface also makes it possible to determine whether a cast can be bubbled down + by the `MemorySpaceCastConsumerOpInterface`, allowing control over which + casts can be bubbled down. + }]; + let cppNamespace = "::mlir"; + let methods = [ + InterfaceMethod<[{ + Returns the source ptr-like value. + }], + "::mlir::TypedValue<::mlir::PtrLikeTypeInterface>", "getSourcePtr" + >, + InterfaceMethod<[{ + Returns the target ptr-like value. + }], + "::mlir::TypedValue<::mlir::PtrLikeTypeInterface>", "getTargetPtr" + >, + InterfaceMethod<[{ + Returns whether the memory space cast specified by `tgt` and `src` + is supported. + }], + "bool", "isValidMemorySpaceCast", + (ins "::mlir::PtrLikeTypeInterface":$tgt, + "::mlir::PtrLikeTypeInterface":$src) + >, + InterfaceMethod<[{ + Clones the memory space cast op with the given source and target type. + }], + "::mlir::MemorySpaceCastOpInterface", "cloneMemorySpaceCastOp", + (ins "::mlir::OpBuilder &":$builder, "::mlir::PtrLikeTypeInterface":$tgt, + "::mlir::TypedValue<::mlir::PtrLikeTypeInterface>":$src) + >, + InterfaceMethod<[{ + Returns whether the source pointer of the memory-space cast can be used + by the `MemorySpaceCastConsumerOpInterface::bubbleDownCasts` method to + promote the source pointer and bubble down the cast. + + For example, a cast operation might decide that all casts to the generic + memory-space can be promoted. + }], + "bool", "isSourcePromotable" + > + ]; + let verify = [{ + return ::mlir::detail::verifyMemorySpaceCastOpInterface($_op); + }]; + let extraClassDeclaration = [{ + /// Returns the underlying `MemorySpaceCastOpInterface` op if `value` + /// is produced by a `MemorySpaceCastOpInterface` op, and + /// `isSourcePromotable` returns true, otherwise it returns null.
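Conversely, here is a sketch of what an implementation of `bubbleDownCasts` might look like for a simple load-like op, written as a free helper over one of its operands. `bubbleDownThroughOperand` and the single-pointer-operand shape are assumptions; `getIfPromotableCast` is the helper declared just below:

```cpp
#include <optional>

#include "mlir/IR/Builders.h"
#include "mlir/Interfaces/MemOpInterfaces.h"

using namespace mlir;

// Hypothetical helper: fold a promotable memory-space cast into the op that
// owns `ptrOperand`, updating it in place.
static FailureOr<std::optional<SmallVector<Value>>>
bubbleDownThroughOperand(OpOperand &ptrOperand, OpBuilder &builder) {
  // Only act when the operand is fed by a promotable memory-space cast.
  MemorySpaceCastOpInterface cast =
      MemorySpaceCastOpInterface::getIfPromotableCast(ptrOperand.get());
  if (!cast)
    return failure();
  // Read through the cast by using its source pointer directly. The consuming
  // op's result types do not change, so no new ops are created and `builder`
  // is not needed for this in-place update.
  (void)builder;
  ptrOperand.set(cast.getSourcePtr());
  return std::optional<SmallVector<Value>>(std::nullopt);
}
```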
+ static ::mlir::MemorySpaceCastOpInterface + getIfPromotableCast(::mlir::Value value) { + auto op = ::llvm::dyn_cast_or_null<::mlir::MemorySpaceCastOpInterface>( + value.getDefiningOp()); + if (!op || !op.isSourcePromotable()) + return nullptr; + return op; + } + }]; +} + +#endif // MLIR_INTERFACES_MEMOPINTERFACES_TD diff --git a/mlir/include/mlir/Remark/RemarkStreamer.h b/mlir/include/mlir/Remark/RemarkStreamer.h index 8bfd176d9bade..170d6b439a442 100644 --- a/mlir/include/mlir/Remark/RemarkStreamer.h +++ b/mlir/include/mlir/Remark/RemarkStreamer.h @@ -26,14 +26,15 @@ class LLVMRemarkStreamer final : public MLIRRemarkStreamerBase { createToFile(llvm::StringRef path, llvm::remarks::Format fmt); void streamOptimizationRemark(const Remark &remark) override; - void finalize() override {} + void finalize() override; ~LLVMRemarkStreamer() override; private: LLVMRemarkStreamer() = default; - std::unique_ptr remarkStreamer; std::unique_ptr file; + // RemarkStreamer must be destructed before file is destroyed! + std::unique_ptr remarkStreamer; }; } // namespace mlir::remark::detail diff --git a/mlir/include/mlir/TableGen/Class.h b/mlir/include/mlir/TableGen/Class.h index 10349676625d1..e6bedc7cc896d 100644 --- a/mlir/include/mlir/TableGen/Class.h +++ b/mlir/include/mlir/TableGen/Class.h @@ -789,6 +789,10 @@ class Class { std::forward(args)...); } + const std::vector> &getMethods() const { + return methods; + } + /// Add a new field to the class. Class fields added this way are always /// private. template diff --git a/mlir/include/mlir/Transforms/BubbleDownMemorySpaceCasts.h b/mlir/include/mlir/Transforms/BubbleDownMemorySpaceCasts.h new file mode 100644 index 0000000000000..99db092879a90 --- /dev/null +++ b/mlir/include/mlir/Transforms/BubbleDownMemorySpaceCasts.h @@ -0,0 +1,20 @@ +//===-- BubbleDownMemorySpaceCasts.h - Bubble down cast patterns ---C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TRANSFORMS_BUBBLEDOWNMEMORYSPACECASTS_H +#define MLIR_TRANSFORMS_BUBBLEDOWNMEMORYSPACECASTS_H + +namespace mlir { +class PatternBenefit; +class RewritePatternSet; +/// Collect a set of patterns to bubble-down memory-space cast operations. +void populateBubbleDownMemorySpaceCastPatterns(RewritePatternSet &patterns, + PatternBenefit benefit); +} // namespace mlir + +#endif // MLIR_TRANSFORMS_BUBBLEDOWNMEMORYSPACECASTS_H diff --git a/mlir/include/mlir/Transforms/FoldUtils.h b/mlir/include/mlir/Transforms/FoldUtils.h index 2e7a6fe3e362c..ee89e8d0e7c3f 100644 --- a/mlir/include/mlir/Transforms/FoldUtils.h +++ b/mlir/include/mlir/Transforms/FoldUtils.h @@ -40,7 +40,10 @@ class OperationFolder { /// deduplicated constants. If successful, replaces `op`'s uses with /// folded results, and returns success. If the op was completely folded it is /// erased. If it is just updated in place, `inPlaceUpdate` is set to true. - LogicalResult tryToFold(Operation *op, bool *inPlaceUpdate = nullptr); + /// On success() and when in-place, the folder is invoked until + /// `maxIterations` is reached (default INT_MAX). + LogicalResult tryToFold(Operation *op, bool *inPlaceUpdate = nullptr, + int maxIterations = INT_MAX); /// Tries to fold a pre-existing constant operation. 
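The LLVMRemarkStreamer hunk above reorders the two members so that `remarkStreamer` is destroyed before `file`, which relies on C++ destroying non-static data members in reverse declaration order. A standalone illustration with made-up names:

```cpp
#include <cstdio>
#include <memory>

struct Noisy {
  const char *name;
  explicit Noisy(const char *n) : name(n) {}
  ~Noisy() { std::printf("destroying %s\n", name); }
};

struct Holder {
  std::unique_ptr<Noisy> file = std::make_unique<Noisy>("file");
  // Declared last, so destroyed first: the same reason remarkStreamer is now
  // declared after `file` in LLVMRemarkStreamer.
  std::unique_ptr<Noisy> streamer = std::make_unique<Noisy>("streamer");
};

int main() {
  Holder h;
  // Prints "destroying streamer" before "destroying file" when `h` goes away.
}
```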
`constValue` represents /// the value of the constant, and can be optionally passed if the value is @@ -82,7 +85,10 @@ class OperationFolder { /// Tries to perform folding on the given `op`. If successful, populates /// `results` with the results of the folding. - LogicalResult tryToFold(Operation *op, SmallVectorImpl &results); + /// On success() and when in-place, the folder is invoked until + /// `maxIterations` is reached (default INT_MAX). + LogicalResult tryToFold(Operation *op, SmallVectorImpl &results, + int maxIterations = INT_MAX); /// Try to process a set of fold results. Populates `results` on success, /// otherwise leaves it unchanged. diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h index 9cd2ef34e15ea..1c035f2a843ff 100644 --- a/mlir/include/mlir/Transforms/Passes.h +++ b/mlir/include/mlir/Transforms/Passes.h @@ -46,6 +46,7 @@ class GreedyRewriteConfig; #define GEN_PASS_DECL_SYMBOLPRIVATIZE #define GEN_PASS_DECL_TOPOLOGICALSORT #define GEN_PASS_DECL_COMPOSITEFIXEDPOINTPASS +#define GEN_PASS_DECL_BUBBLEDOWNMEMORYSPACECASTS #include "mlir/Transforms/Passes.h.inc" /// Creates an instance of the Canonicalizer pass, configured with default diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td index beb59784947c5..b2b7f20a497e3 100644 --- a/mlir/include/mlir/Transforms/Passes.td +++ b/mlir/include/mlir/Transforms/Passes.td @@ -585,4 +585,48 @@ def CompositeFixedPointPass : Pass<"composite-fixed-point-pass"> { ]; } +def BubbleDownMemorySpaceCasts : + Pass<"bubble-down-memory-space-casts"> { + let summary = "Bubbles down memory-space cast operations."; + let description = [{ + This pass tries to iteratively bubble down all possible memory-space cast + operations. Note that which casts + are bubbled down is determined by the interfaces + `MemorySpaceCastConsumerOpInterface` and `MemorySpaceCastOpInterface`, not + by the pass itself. The pass only looks for operations implementing the + `MemorySpaceCastConsumerOpInterface` interface and invokes the interface + methods to perform the bubbling down.
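A sketch of how such a pass could be wired up from the `populateBubbleDownMemorySpaceCastPatterns` entry point declared earlier, using the standard greedy rewrite driver; the helper name and the benefit value are illustrative:

```cpp
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/BubbleDownMemorySpaceCasts.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

using namespace mlir;

// Illustrative helper: run the bubble-down patterns to a fixed point on root.
static LogicalResult runBubbleDownCasts(Operation *root) {
  RewritePatternSet patterns(root->getContext());
  populateBubbleDownMemorySpaceCastPatterns(patterns, /*benefit=*/1);
  return applyPatternsGreedily(root, std::move(patterns));
}
```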
+ + Example: + + ```mlir + func.func @op_with_cast_sequence(%arg0: memref<4x4xf32, 1>, %arg1: index, %arg2: f32) -> memref<16xf32> { + %memspacecast = memref.memory_space_cast %arg0 : memref<4x4xf32, 1> to memref<4x4xf32> + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %expanded = memref.expand_shape %memspacecast [[0], [1, 2]] output_shape [4, 2, 2] : memref<4x4xf32> into memref<4x2x2xf32> + %collapsed = memref.collapse_shape %expanded [[0, 1, 2]] : memref<4x2x2xf32> into memref<16xf32> + %loaded = memref.load %collapsed[%c0] : memref<16xf32> + %added = arith.addf %loaded, %arg2 : f32 + memref.store %added, %collapsed[%c0] : memref<16xf32> + %atomic_result = memref.atomic_rmw addf %arg2, %collapsed[%c4] : (f32, memref<16xf32>) -> f32 + return %collapsed : memref<16xf32> + } + // mlir-opt --bubble-down-memory-space-casts + func.func @op_with_cast_sequence(%arg0: memref<4x4xf32, 1>, %arg1: index, %arg2: f32) -> memref<16xf32> { + %c4 = arith.constant 4 : index + %c0 = arith.constant 0 : index + %expand_shape = memref.expand_shape %arg0 [[0], [1, 2]] output_shape [4, 2, 2] : memref<4x4xf32, 1> into memref<4x2x2xf32, 1> + %collapse_shape = memref.collapse_shape %expand_shape [[0, 1, 2]] : memref<4x2x2xf32, 1> into memref<16xf32, 1> + %memspacecast = memref.memory_space_cast %collapse_shape : memref<16xf32, 1> to memref<16xf32> + %0 = memref.load %collapse_shape[%c0] : memref<16xf32, 1> + %1 = arith.addf %0, %arg2 : f32 + memref.store %1, %collapse_shape[%c0] : memref<16xf32, 1> + %2 = memref.atomic_rmw addf %arg2, %collapse_shape[%c4] : (f32, memref<16xf32, 1>) -> f32 + return %memspacecast : memref<16xf32> + } + ``` + }]; +} + #endif // MLIR_TRANSFORMS_PASSES diff --git a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp index 65df355216f74..d705d8d4c7819 100644 --- a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp @@ -109,19 +109,19 @@ LivenessAnalysis::visitOperation(Operation *op, ArrayRef operands, foundLiveResult = true; } LDBG() << "[visitOperation] Adding dependency for result: " << r - << " after op: " << *op; + << " after op: " << OpWithFlags(op, OpPrintingFlags().skipRegions()); addDependency(const_cast(r), getProgramPointAfter(op)); } return success(); } void LivenessAnalysis::visitBranchOperand(OpOperand &operand) { + Operation *op = operand.getOwner(); LDBG() << "Visiting branch operand: " << operand.get() - << " in op: " << *operand.getOwner(); + << " in op: " << OpWithFlags(op, OpPrintingFlags().skipRegions()); // We know (at the moment) and assume (for the future) that `operand` is a // non-forwarded branch operand of a `RegionBranchOpInterface`, // `BranchOpInterface`, `RegionBranchTerminatorOpInterface` or return-like op. - Operation *op = operand.getOwner(); assert((isa(op) || isa(op) || isa(op)) && "expected the op to be `RegionBranchOpInterface`, " @@ -146,12 +146,13 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) { // Therefore, if the result value is live, we conservatively consider the // non-forwarded operand of the region branch operation with result may // live and record all result. 
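For context, the analysis whose logging changes here is queried roughly as follows; the header path is an assumption, and the lattice's `isLive` flag is the one referenced in the code above. A real client would build the analysis once and reuse it across queries:

```cpp
#include "mlir/Analysis/DataFlow/LivenessAnalysis.h"

using namespace mlir;
using namespace mlir::dataflow;

// Assumed usage: run the analysis over `root`, then query individual values.
static bool isDefinitelyDead(Operation *root, Value value) {
  RunLivenessAnalysis analysis(root);
  const auto *liveness = analysis.getLiveness(value);
  return liveness && !liveness->isLive;
}
```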
- for (Value result : op->getResults()) { + for (auto [resultIndex, result] : llvm::enumerate(op->getResults())) { if (getLatticeElement(result)->isLive) { mayLive = true; - LDBG() << "[visitBranchOperand] Non-forwarded branch " - "operand may be live due to live result: " - << result; + LDBG() << "[visitBranchOperand] Non-forwarded branch operand may be " + "live due to live result #" + << resultIndex << ": " + << OpWithFlags(op, OpPrintingFlags().skipRegions()); break; } } @@ -233,7 +234,8 @@ void LivenessAnalysis::visitBranchOperand(OpOperand &operand) { SmallVector resultsLiveness; for (const Value result : op->getResults()) resultsLiveness.push_back(getLatticeElement(result)); - LDBG() << "Visiting operation for non-forwarded branch operand: " << *op; + LDBG() << "Visiting operation for non-forwarded branch operand: " + << OpWithFlags(op, OpPrintingFlags().skipRegions()); (void)visitOperation(op, operandLiveness, resultsLiveness); // We also visit the parent op with the parent's results and this operand if @@ -299,8 +301,6 @@ RunLivenessAnalysis::RunLivenessAnalysis(Operation *op) { // The framework doesn't visit operations in dead blocks, so we need to // explicitly mark them as dead. op->walk([&](Operation *op) { - if (op->getNumResults() == 0) - return; for (auto result : llvm::enumerate(op->getResults())) { if (getLiveness(result.value())) continue; diff --git a/mlir/lib/Analysis/SliceAnalysis.cpp b/mlir/lib/Analysis/SliceAnalysis.cpp index 7037fa644c7be..12dff19ed31d3 100644 --- a/mlir/lib/Analysis/SliceAnalysis.cpp +++ b/mlir/lib/Analysis/SliceAnalysis.cpp @@ -109,7 +109,7 @@ static LogicalResult getBackwardSliceImpl(Operation *op, DenseSet &visited, SetVector *backwardSlice, const BackwardSliceOptions &options) { - if (!op || op->hasTrait()) + if (!op) return success(); // Evaluate whether we should keep this def. @@ -136,7 +136,8 @@ static LogicalResult getBackwardSliceImpl(Operation *op, // blocks of parentOp, which are not technically backward unless they flow // into us. For now, just bail. if (parentOp && backwardSlice->count(parentOp) == 0) { - if (parentOp->getNumRegions() == 1 && + if (!parentOp->hasTrait() && + parentOp->getNumRegions() == 1 && parentOp->getRegion(0).hasOneBlock()) { return getBackwardSliceImpl(parentOp, visited, backwardSlice, options); @@ -150,7 +151,8 @@ static LogicalResult getBackwardSliceImpl(Operation *op, bool succeeded = true; - if (!options.omitUsesFromAbove) { + if (!options.omitUsesFromAbove && + !op->hasTrait()) { llvm::for_each(op->getRegions(), [&](Region ®ion) { // Walk this region recursively to collect the regions that descend from // this op's nested regions (inclusive). 
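The SliceAnalysis change above adjusts how `IsolatedFromAbove` ops are treated when walking parent ops and nested regions. The public entry point it feeds is used roughly like this; the option values shown are assumptions chosen to exercise the region walk:

```cpp
#include "mlir/Analysis/SliceAnalysis.h"

using namespace mlir;

// Collect the backward slice of `op`, also walking nested regions for values
// used from above (the behavior the omitUsesFromAbove flag controls).
static LogicalResult computeBackwardSlice(Operation *op,
                                          SetVector<Operation *> &slice) {
  BackwardSliceOptions options;
  options.omitUsesFromAbove = false;
  options.inclusive = true; // include `op` itself in the returned slice
  return getBackwardSlice(op, &slice, options);
}
```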
diff --git a/mlir/lib/Bindings/Python/DialectLLVM.cpp b/mlir/lib/Bindings/Python/DialectLLVM.cpp index 55b9331270cdc..38de4a0e329a0 100644 --- a/mlir/lib/Bindings/Python/DialectLLVM.cpp +++ b/mlir/lib/Bindings/Python/DialectLLVM.cpp @@ -33,21 +33,37 @@ static void populateDialectLLVMSubmodule(const nanobind::module_ &m) { auto llvmStructType = mlir_type_subclass(m, "StructType", mlirTypeIsALLVMStructType); - llvmStructType.def_classmethod( - "get_literal", - [](const nb::object &cls, const std::vector &elements, - bool packed, MlirLocation loc) { - CollectDiagnosticsToStringScope scope(mlirLocationGetContext(loc)); - - MlirType type = mlirLLVMStructTypeLiteralGetChecked( - loc, elements.size(), elements.data(), packed); - if (mlirTypeIsNull(type)) { - throw nb::value_error(scope.takeMessage().c_str()); - } - return cls(type); - }, - "cls"_a, "elements"_a, nb::kw_only(), "packed"_a = false, - "loc"_a = nb::none()); + llvmStructType + .def_classmethod( + "get_literal", + [](const nb::object &cls, const std::vector &elements, + bool packed, MlirLocation loc) { + CollectDiagnosticsToStringScope scope(mlirLocationGetContext(loc)); + + MlirType type = mlirLLVMStructTypeLiteralGetChecked( + loc, elements.size(), elements.data(), packed); + if (mlirTypeIsNull(type)) { + throw nb::value_error(scope.takeMessage().c_str()); + } + return cls(type); + }, + "cls"_a, "elements"_a, nb::kw_only(), "packed"_a = false, + "loc"_a = nb::none()) + .def_classmethod( + "get_literal_unchecked", + [](const nb::object &cls, const std::vector &elements, + bool packed, MlirContext context) { + CollectDiagnosticsToStringScope scope(context); + + MlirType type = mlirLLVMStructTypeLiteralGet( + context, elements.size(), elements.data(), packed); + if (mlirTypeIsNull(type)) { + throw nb::value_error(scope.takeMessage().c_str()); + } + return cls(type); + }, + "cls"_a, "elements"_a, nb::kw_only(), "packed"_a = false, + "context"_a = nb::none()); llvmStructType.def_classmethod( "get_identified", diff --git a/mlir/lib/Bindings/Python/IRAffine.cpp b/mlir/lib/Bindings/Python/IRAffine.cpp index bc6aa0dac6221..7147f2cbad149 100644 --- a/mlir/lib/Bindings/Python/IRAffine.cpp +++ b/mlir/lib/Bindings/Python/IRAffine.cpp @@ -574,7 +574,9 @@ void mlir::python::populateIRAffine(nb::module_ &m) { }) .def_prop_ro( "context", - [](PyAffineExpr &self) { return self.getContext().getObject(); }) + [](PyAffineExpr &self) -> nb::typed { + return self.getContext().getObject(); + }) .def("compose", [](PyAffineExpr &self, PyAffineMap &other) { return PyAffineExpr(self.getContext(), @@ -706,28 +708,29 @@ void mlir::python::populateIRAffine(nb::module_ &m) { [](PyAffineMap &self) { return static_cast(llvm::hash_value(self.get().ptr)); }) - .def_static("compress_unused_symbols", - [](const nb::list &affineMaps, - DefaultingPyMlirContext context) { - SmallVector maps; - pyListToVector( - affineMaps, maps, "attempting to create an AffineMap"); - std::vector compressed(affineMaps.size()); - auto populate = [](void *result, intptr_t idx, - MlirAffineMap m) { - static_cast(result)[idx] = (m); - }; - mlirAffineMapCompressUnusedSymbols( - maps.data(), maps.size(), compressed.data(), populate); - std::vector res; - res.reserve(compressed.size()); - for (auto m : compressed) - res.emplace_back(context->getRef(), m); - return res; - }) + .def_static( + "compress_unused_symbols", + [](const nb::list &affineMaps, DefaultingPyMlirContext context) { + SmallVector maps; + pyListToVector( + affineMaps, maps, "attempting to create an AffineMap"); + std::vector 
compressed(affineMaps.size()); + auto populate = [](void *result, intptr_t idx, MlirAffineMap m) { + static_cast(result)[idx] = (m); + }; + mlirAffineMapCompressUnusedSymbols(maps.data(), maps.size(), + compressed.data(), populate); + std::vector res; + res.reserve(compressed.size()); + for (auto m : compressed) + res.emplace_back(context->getRef(), m); + return res; + }) .def_prop_ro( "context", - [](PyAffineMap &self) { return self.getContext().getObject(); }, + [](PyAffineMap &self) -> nb::typed { + return self.getContext().getObject(); + }, "Context that owns the Affine Map") .def( "dump", [](PyAffineMap &self) { mlirAffineMapDump(self); }, @@ -893,7 +896,9 @@ void mlir::python::populateIRAffine(nb::module_ &m) { }) .def_prop_ro( "context", - [](PyIntegerSet &self) { return self.getContext().getObject(); }) + [](PyIntegerSet &self) -> nb::typed { + return self.getContext().getObject(); + }) .def( "dump", [](PyIntegerSet &self) { mlirIntegerSetDump(self); }, kDumpDocstring) diff --git a/mlir/lib/Bindings/Python/IRAttributes.cpp b/mlir/lib/Bindings/Python/IRAttributes.cpp index 7818caf2e8a55..045c0fbf4630f 100644 --- a/mlir/lib/Bindings/Python/IRAttributes.cpp +++ b/mlir/lib/Bindings/Python/IRAttributes.cpp @@ -526,7 +526,8 @@ class PyArrayAttribute : public PyConcreteAttribute { "Gets a uniqued Array attribute"); c.def( "__getitem__", - [](PyArrayAttribute &arr, intptr_t i) { + [](PyArrayAttribute &arr, + intptr_t i) -> nb::typed { if (i >= mlirArrayAttrGetNumElements(arr)) throw nb::index_error("ArrayAttribute index out of range"); return PyAttribute(arr.getContext(), arr.getItem(i)).maybeDownCast(); @@ -574,6 +575,18 @@ class PyFloatAttribute : public PyConcreteAttribute { }, nb::arg("type"), nb::arg("value"), nb::arg("loc") = nb::none(), "Gets an uniqued float point attribute associated to a type"); + c.def_static( + "get_unchecked", + [](PyType &type, double value, DefaultingPyMlirContext context) { + PyMlirContext::ErrorCapture errors(context->getRef()); + MlirAttribute attr = + mlirFloatAttrDoubleGet(context.get()->get(), type, value); + if (mlirAttributeIsNull(attr)) + throw MLIRError("Invalid attribute", errors.take()); + return PyFloatAttribute(type.getContext(), attr); + }, + nb::arg("type"), nb::arg("value"), nb::arg("context") = nb::none(), + "Gets an uniqued float point attribute associated to a type"); c.def_static( "get_f32", [](double value, DefaultingPyMlirContext context) { @@ -1010,14 +1023,16 @@ class PyDenseElementsAttribute [](PyDenseElementsAttribute &self) -> bool { return mlirDenseElementsAttrIsSplat(self); }) - .def("get_splat_value", [](PyDenseElementsAttribute &self) { - if (!mlirDenseElementsAttrIsSplat(self)) - throw nb::value_error( - "get_splat_value called on a non-splat attribute"); - return PyAttribute(self.getContext(), - mlirDenseElementsAttrGetSplatValue(self)) - .maybeDownCast(); - }); + .def("get_splat_value", + [](PyDenseElementsAttribute &self) + -> nb::typed { + if (!mlirDenseElementsAttrIsSplat(self)) + throw nb::value_error( + "get_splat_value called on a non-splat attribute"); + return PyAttribute(self.getContext(), + mlirDenseElementsAttrGetSplatValue(self)) + .maybeDownCast(); + }); } static PyType_Slot slots[]; @@ -1332,7 +1347,7 @@ class PyDenseIntElementsAttribute /// Returns the element at the given linear position. Asserts if the index /// is out of range. 
- nb::object dunderGetItem(intptr_t pos) { + nb::int_ dunderGetItem(intptr_t pos) { if (pos < 0 || pos >= dunderLen()) { throw nb::index_error("attempt to access out of bounds element"); } @@ -1522,13 +1537,15 @@ class PyDictAttribute : public PyConcreteAttribute { }, nb::arg("value") = nb::dict(), nb::arg("context") = nb::none(), "Gets an uniqued dict attribute"); - c.def("__getitem__", [](PyDictAttribute &self, const std::string &name) { - MlirAttribute attr = - mlirDictionaryAttrGetElementByName(self, toMlirStringRef(name)); - if (mlirAttributeIsNull(attr)) - throw nb::key_error("attempt to access a non-existent attribute"); - return PyAttribute(self.getContext(), attr).maybeDownCast(); - }); + c.def("__getitem__", + [](PyDictAttribute &self, + const std::string &name) -> nb::typed { + MlirAttribute attr = + mlirDictionaryAttrGetElementByName(self, toMlirStringRef(name)); + if (mlirAttributeIsNull(attr)) + throw nb::key_error("attempt to access a non-existent attribute"); + return PyAttribute(self.getContext(), attr).maybeDownCast(); + }); c.def("__getitem__", [](PyDictAttribute &self, intptr_t index) { if (index < 0 || index >= self.dunderLen()) { throw nb::index_error("attempt to access out of bounds attribute"); @@ -1594,10 +1611,11 @@ class PyTypeAttribute : public PyConcreteAttribute { }, nb::arg("value"), nb::arg("context") = nb::none(), "Gets a uniqued Type attribute"); - c.def_prop_ro("value", [](PyTypeAttribute &self) { - return PyType(self.getContext(), mlirTypeAttrGetValue(self.get())) - .maybeDownCast(); - }); + c.def_prop_ro( + "value", [](PyTypeAttribute &self) -> nb::typed { + return PyType(self.getContext(), mlirTypeAttrGetValue(self.get())) + .maybeDownCast(); + }); } }; diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 609502041f4ae..83a8757bb72c7 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -725,7 +725,7 @@ nb::object PyMlirContext::attachDiagnosticHandler(nb::object callback) { new PyDiagnosticHandler(get(), std::move(callback)); nb::object pyHandlerObject = nb::cast(pyHandler, nb::rv_policy::take_ownership); - pyHandlerObject.inc_ref(); + (void)pyHandlerObject.inc_ref(); // In these C callbacks, the userData is a PyDiagnosticHandler* that is // guaranteed to be known to pybind. 
@@ -1395,7 +1395,7 @@ nb::object PyOperation::getCapsule() { return nb::steal(mlirPythonOperationToCapsule(get())); } -nb::object PyOperation::createFromCapsule(nb::object capsule) { +nb::object PyOperation::createFromCapsule(const nb::object &capsule) { MlirOperation rawOperation = mlirPythonCapsuleToOperation(capsule.ptr()); if (mlirOperationIsNull(rawOperation)) throw nb::python_error(); @@ -1534,7 +1534,7 @@ nb::object PyOperation::create(std::string_view name, return created.getObject(); } -nb::typed PyOperation::clone(const nb::object &maybeIp) { +nb::object PyOperation::clone(const nb::object &maybeIp) { MlirOperation clonedOperation = mlirOperationClone(operation); PyOperationRef cloned = PyOperation::createDetached(getContext(), clonedOperation); @@ -1543,7 +1543,7 @@ nb::typed PyOperation::clone(const nb::object &maybeIp) { return cloned->createOpView(); } -nb::typed PyOperation::createOpView() { +nb::object PyOperation::createOpView() { checkValid(); MlirIdentifier ident = mlirOperationGetName(get()); MlirStringRef identStr = mlirIdentifierStr(ident); @@ -1605,7 +1605,9 @@ class PyConcreteValue : public PyValue { }, nb::arg("other_value")); cls.def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, - [](DerivedTy &self) { return self.maybeDownCast(); }); + [](DerivedTy &self) -> nb::typed { + return self.maybeDownCast(); + }); DerivedTy::bindDerived(cls); } @@ -1623,13 +1625,14 @@ class PyOpResult : public PyConcreteValue { using PyConcreteValue::PyConcreteValue; static void bindDerived(ClassTy &c) { - c.def_prop_ro("owner", [](PyOpResult &self) { - assert( - mlirOperationEqual(self.getParentOperation()->get(), - mlirOpResultGetOwner(self.get())) && - "expected the owner of the value in Python to match that in the IR"); - return self.getParentOperation().getObject(); - }); + c.def_prop_ro( + "owner", [](PyOpResult &self) -> nb::typed { + assert(mlirOperationEqual(self.getParentOperation()->get(), + mlirOpResultGetOwner(self.get())) && + "expected the owner of the value in Python to match that in " + "the IR"); + return self.getParentOperation().getObject(); + }); c.def_prop_ro("result_number", [](PyOpResult &self) { return mlirOpResultGetResultNumber(self.get()); }); @@ -1671,9 +1674,10 @@ class PyOpResultList : public Sliceable { c.def_prop_ro("types", [](PyOpResultList &self) { return getValueTypes(self, self.operation->getContext()); }); - c.def_prop_ro("owner", [](PyOpResultList &self) { - return self.operation->createOpView(); - }); + c.def_prop_ro("owner", + [](PyOpResultList &self) -> nb::typed { + return self.operation->createOpView(); + }); } PyOperationRef &getOperation() { return operation; } @@ -2104,7 +2108,7 @@ PyInsertionPoint PyInsertionPoint::after(PyOperationBase &op) { size_t PyMlirContext::getLiveModuleCount() { return liveModules.size(); } nb::object PyInsertionPoint::contextEnter(nb::object insertPoint) { - return PyThreadContextEntry::pushInsertionPoint(insertPoint); + return PyThreadContextEntry::pushInsertionPoint(std::move(insertPoint)); } void PyInsertionPoint::contextExit(const nb::object &excType, @@ -2125,7 +2129,7 @@ nb::object PyAttribute::getCapsule() { return nb::steal(mlirPythonAttributeToCapsule(*this)); } -PyAttribute PyAttribute::createFromCapsule(nb::object capsule) { +PyAttribute PyAttribute::createFromCapsule(const nb::object &capsule) { MlirAttribute rawAttr = mlirPythonCapsuleToAttribute(capsule.ptr()); if (mlirAttributeIsNull(rawAttr)) throw nb::python_error(); @@ -2133,7 +2137,7 @@ PyAttribute PyAttribute::createFromCapsule(nb::object capsule) { 
PyMlirContext::forContext(mlirAttributeGetContext(rawAttr)), rawAttr); } -nb::typed PyAttribute::maybeDownCast() { +nb::object PyAttribute::maybeDownCast() { MlirTypeID mlirTypeID = mlirAttributeGetTypeID(this->get()); assert(!mlirTypeIDIsNull(mlirTypeID) && "mlirTypeID was expected to be non-null."); @@ -2179,7 +2183,7 @@ PyType PyType::createFromCapsule(nb::object capsule) { rawType); } -nb::typed PyType::maybeDownCast() { +nb::object PyType::maybeDownCast() { MlirTypeID mlirTypeID = mlirTypeGetTypeID(this->get()); assert(!mlirTypeIDIsNull(mlirTypeID) && "mlirTypeID was expected to be non-null."); @@ -2219,7 +2223,7 @@ nb::object PyValue::getCapsule() { return nb::steal(mlirPythonValueToCapsule(get())); } -nanobind::typed PyValue::maybeDownCast() { +nb::object PyValue::maybeDownCast() { MlirType type = mlirValueGetType(get()); MlirTypeID mlirTypeID = mlirTypeGetTypeID(type); assert(!mlirTypeIDIsNull(mlirTypeID) && @@ -2263,8 +2267,7 @@ PySymbolTable::PySymbolTable(PyOperationBase &operation) } } -nb::typed -PySymbolTable::dunderGetItem(const std::string &name) { +nb::object PySymbolTable::dunderGetItem(const std::string &name) { operation->checkValid(); MlirOperation symbol = mlirSymbolTableLookup( symbolTable, mlirStringRefCreate(name.data(), name.length())); @@ -2964,24 +2967,27 @@ void mlir::python::populateIRCore(nb::module_ &m) { }) .def_static("_get_live_count", &PyMlirContext::getLiveCount) .def("_get_context_again", - [](PyMlirContext &self) { + [](PyMlirContext &self) -> nb::typed { PyMlirContextRef ref = PyMlirContext::forContext(self.get()); return ref.releaseObject(); }) .def("_get_live_module_count", &PyMlirContext::getLiveModuleCount) .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyMlirContext::getCapsule) - .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyMlirContext::createFromCapsule) + .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, + &PyMlirContext::createFromCapsule) .def("__enter__", &PyMlirContext::contextEnter) .def("__exit__", &PyMlirContext::contextExit, nb::arg("exc_type").none(), nb::arg("exc_value").none(), nb::arg("traceback").none()) .def_prop_ro_static( "current", - [](nb::object & /*class*/) { + [](nb::object & /*class*/) + -> std::optional> { auto *context = PyThreadContextEntry::getDefaultContext(); if (!context) - return nb::none(); + return {}; return nb::cast(context); }, + nb::sig("def current(/) -> Context | None"), "Gets the Context bound to the current thread or raises ValueError") .def_prop_ro( "dialects", @@ -3125,7 +3131,8 @@ void mlir::python::populateIRCore(nb::module_ &m) { //---------------------------------------------------------------------------- nb::class_(m, "DialectRegistry") .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyDialectRegistry::getCapsule) - .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyDialectRegistry::createFromCapsule) + .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, + &PyDialectRegistry::createFromCapsule) .def(nb::init<>()); //---------------------------------------------------------------------------- @@ -3133,7 +3140,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { //---------------------------------------------------------------------------- nb::class_(m, "Location") .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyLocation::getCapsule) - .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyLocation::createFromCapsule) + .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyLocation::createFromCapsule) .def("__enter__", &PyLocation::contextEnter) .def("__exit__", &PyLocation::contextExit, nb::arg("exc_type").none(), nb::arg("exc_value").none(), 
nb::arg("traceback").none()) @@ -3288,7 +3295,9 @@ void mlir::python::populateIRCore(nb::module_ &m) { "Gets a Location from a LocationAttr") .def_prop_ro( "context", - [](PyLocation &self) { return self.getContext().getObject(); }, + [](PyLocation &self) -> nb::typed { + return self.getContext().getObject(); + }, "Context that owns the Location") .def_prop_ro( "attr", @@ -3315,12 +3324,13 @@ void mlir::python::populateIRCore(nb::module_ &m) { //---------------------------------------------------------------------------- nb::class_(m, "Module", nb::is_weak_referenceable()) .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyModule::getCapsule) - .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyModule::createFromCapsule, - kModuleCAPICreate) + .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyModule::createFromCapsule, + kModuleCAPICreate) .def("_clear_mlir_module", &PyModule::clearMlirModule) .def_static( "parse", - [](const std::string &moduleAsm, DefaultingPyMlirContext context) { + [](const std::string &moduleAsm, DefaultingPyMlirContext context) + -> nb::typed { PyMlirContext::ErrorCapture errors(context->getRef()); MlirModule module = mlirModuleCreateParse( context->get(), toMlirStringRef(moduleAsm)); @@ -3332,7 +3342,8 @@ void mlir::python::populateIRCore(nb::module_ &m) { kModuleParseDocstring) .def_static( "parse", - [](nb::bytes moduleAsm, DefaultingPyMlirContext context) { + [](nb::bytes moduleAsm, DefaultingPyMlirContext context) + -> nb::typed { PyMlirContext::ErrorCapture errors(context->getRef()); MlirModule module = mlirModuleCreateParse( context->get(), toMlirStringRef(moduleAsm)); @@ -3344,7 +3355,8 @@ void mlir::python::populateIRCore(nb::module_ &m) { kModuleParseDocstring) .def_static( "parseFile", - [](const std::string &path, DefaultingPyMlirContext context) { + [](const std::string &path, DefaultingPyMlirContext context) + -> nb::typed { PyMlirContext::ErrorCapture errors(context->getRef()); MlirModule module = mlirModuleCreateParseFromFile( context->get(), toMlirStringRef(path)); @@ -3356,7 +3368,8 @@ void mlir::python::populateIRCore(nb::module_ &m) { kModuleParseDocstring) .def_static( "create", - [](const std::optional &loc) { + [](const std::optional &loc) + -> nb::typed { PyLocation pyLoc = maybeGetTracebackLocation(loc); MlirModule module = mlirModuleCreateEmpty(pyLoc.get()); return PyModule::forModule(module).releaseObject(); @@ -3364,11 +3377,13 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::arg("loc") = nb::none(), "Creates an empty module") .def_prop_ro( "context", - [](PyModule &self) { return self.getContext().getObject(); }, + [](PyModule &self) -> nb::typed { + return self.getContext().getObject(); + }, "Context that created the Module") .def_prop_ro( "operation", - [](PyModule &self) { + [](PyModule &self) -> nb::typed { return PyOperation::forOperation(self.getContext(), mlirModuleGetOperation(self.get()), self.getRef().releaseObject()) @@ -3432,7 +3447,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { }) .def_prop_ro( "context", - [](PyOperationBase &self) { + [](PyOperationBase &self) -> nb::typed { PyOperation &concreteOperation = self.getOperation(); concreteOperation.checkValid(); return concreteOperation.getContext().getObject(); @@ -3463,7 +3478,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { "Returns the list of Operation results.") .def_prop_ro( "result", - [](PyOperationBase &self) { + [](PyOperationBase &self) -> nb::typed { auto &operation = self.getOperation(); return PyOpResult(operation.getRef(), getUniqueResult(operation)) 
.maybeDownCast(); @@ -3480,11 +3495,12 @@ void mlir::python::populateIRCore(nb::module_ &m) { "Returns the source location the operation was defined or derived " "from.") .def_prop_ro("parent", - [](PyOperationBase &self) -> nb::object { + [](PyOperationBase &self) + -> std::optional> { auto parent = self.getOperation().getParentOperation(); if (parent) return parent->getObject(); - return nb::none(); + return {}; }) .def( "__str__", @@ -3555,13 +3571,14 @@ void mlir::python::populateIRCore(nb::module_ &m) { "of the parent block.") .def( "clone", - [](PyOperationBase &self, nb::object ip) { + [](PyOperationBase &self, + const nb::object &ip) -> nb::typed { return self.getOperation().clone(ip); }, nb::arg("ip") = nb::none()) .def( "detach_from_parent", - [](PyOperationBase &self) { + [](PyOperationBase &self) -> nb::typed { PyOperation &operation = self.getOperation(); operation.checkValid(); if (!operation.isAttached()) @@ -3597,7 +3614,8 @@ void mlir::python::populateIRCore(nb::module_ &m) { std::optional attributes, std::optional> successors, int regions, const std::optional &location, - const nb::object &maybeIp, bool inferType) { + const nb::object &maybeIp, + bool inferType) -> nb::typed { // Unpack/validate operands. llvm::SmallVector mlirOperands; if (operands) { @@ -3622,7 +3640,8 @@ void mlir::python::populateIRCore(nb::module_ &m) { .def_static( "parse", [](const std::string &sourceStr, const std::string &sourceName, - DefaultingPyMlirContext context) { + DefaultingPyMlirContext context) + -> nb::typed { return PyOperation::parse(context->getRef(), sourceStr, sourceName) ->createOpView(); }, @@ -3631,9 +3650,16 @@ void mlir::python::populateIRCore(nb::module_ &m) { "Parses an operation. Supports both text assembly format and binary " "bytecode format.") .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyOperation::getCapsule) - .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyOperation::createFromCapsule) - .def_prop_ro("operation", [](nb::object self) { return self; }) - .def_prop_ro("opview", &PyOperation::createOpView) + .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, + &PyOperation::createFromCapsule) + .def_prop_ro("operation", + [](nb::object self) -> nb::typed { + return self; + }) + .def_prop_ro("opview", + [](PyOperation &self) -> nb::typed { + return self.createOpView(); + }) .def_prop_ro("block", &PyOperation::getBlock) .def_prop_ro( "successors", @@ -3646,7 +3672,8 @@ void mlir::python::populateIRCore(nb::module_ &m) { auto opViewClass = nb::class_(m, "OpView") - .def(nb::init(), nb::arg("operation")) + .def(nb::init>(), + nb::arg("operation")) .def( "__init__", [](PyOpView *self, std::string_view name, @@ -3673,9 +3700,15 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::arg("successors") = nb::none(), nb::arg("regions") = nb::none(), nb::arg("loc") = nb::none(), nb::arg("ip") = nb::none()) - - .def_prop_ro("operation", &PyOpView::getOperationObject) - .def_prop_ro("opview", [](nb::object self) { return self; }) + .def_prop_ro( + "operation", + [](PyOpView &self) -> nb::typed { + return self.getOperationObject(); + }) + .def_prop_ro("opview", + [](nb::object self) -> nb::typed { + return self; + }) .def( "__str__", [](PyOpView &self) { return nb::str(self.getOperationObject()); }) @@ -3719,7 +3752,8 @@ void mlir::python::populateIRCore(nb::module_ &m) { "Builds a specific, generated OpView based on class level attributes."); opViewClass.attr("parse") = classmethod( [](const nb::object &cls, const std::string &sourceStr, - const std::string &sourceName, DefaultingPyMlirContext 
context) { + const std::string &sourceName, + DefaultingPyMlirContext context) -> nb::typed { PyOperationRef parsed = PyOperation::parse(context->getRef(), sourceStr, sourceName); @@ -3754,7 +3788,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { "Returns a forward-optimized sequence of blocks.") .def_prop_ro( "owner", - [](PyRegion &self) { + [](PyRegion &self) -> nb::typed { return self.getParentOperation()->createOpView(); }, "Returns the operation owning this region.") @@ -3779,7 +3813,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyBlock::getCapsule) .def_prop_ro( "owner", - [](PyBlock &self) { + [](PyBlock &self) -> nb::typed { return self.getParentOperation()->createOpView(); }, "Returns the owning operation of this block.") @@ -3962,11 +3996,12 @@ void mlir::python::populateIRCore(nb::module_ &m) { "Returns the block that this InsertionPoint points to.") .def_prop_ro( "ref_operation", - [](PyInsertionPoint &self) -> nb::object { + [](PyInsertionPoint &self) + -> std::optional> { auto refOperation = self.getRefOperation(); if (refOperation) return refOperation->getObject(); - return nb::none(); + return {}; }, "The reference operation before which new operations are " "inserted, or None if the insertion point is at the end of " @@ -3981,10 +4016,12 @@ void mlir::python::populateIRCore(nb::module_ &m) { .def(nb::init(), nb::arg("cast_from_type"), "Casts the passed attribute to the generic Attribute") .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyAttribute::getCapsule) - .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyAttribute::createFromCapsule) + .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, + &PyAttribute::createFromCapsule) .def_static( "parse", - [](const std::string &attrSpec, DefaultingPyMlirContext context) { + [](const std::string &attrSpec, DefaultingPyMlirContext context) + -> nb::typed { PyMlirContext::ErrorCapture errors(context->getRef()); MlirAttribute attr = mlirAttributeParseGet( context->get(), toMlirStringRef(attrSpec)); @@ -3997,10 +4034,12 @@ void mlir::python::populateIRCore(nb::module_ &m) { "failure.") .def_prop_ro( "context", - [](PyAttribute &self) { return self.getContext().getObject(); }, + [](PyAttribute &self) -> nb::typed { + return self.getContext().getObject(); + }, "Context that owns the Attribute") .def_prop_ro("type", - [](PyAttribute &self) { + [](PyAttribute &self) -> nb::typed { return PyType(self.getContext(), mlirAttributeGetType(self)) .maybeDownCast(); @@ -4051,7 +4090,10 @@ void mlir::python::populateIRCore(nb::module_ &m) { "mlirTypeID was expected to be non-null."); return PyTypeID(mlirTypeID); }) - .def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, &PyAttribute::maybeDownCast); + .def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, + [](PyAttribute &self) -> nb::typed { + return self.maybeDownCast(); + }); //---------------------------------------------------------------------------- // Mapping of PyNamedAttribute @@ -4093,10 +4135,11 @@ void mlir::python::populateIRCore(nb::module_ &m) { .def(nb::init(), nb::arg("cast_from_type"), "Casts the passed type to the generic Type") .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyType::getCapsule) - .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyType::createFromCapsule) + .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyType::createFromCapsule) .def_static( "parse", - [](std::string typeSpec, DefaultingPyMlirContext context) { + [](std::string typeSpec, + DefaultingPyMlirContext context) -> nb::typed { PyMlirContext::ErrorCapture errors(context->getRef()); MlirType type = 
mlirTypeParseGet(context->get(), toMlirStringRef(typeSpec)); @@ -4107,7 +4150,10 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::arg("asm"), nb::arg("context") = nb::none(), kContextParseTypeDocstring) .def_prop_ro( - "context", [](PyType &self) { return self.getContext().getObject(); }, + "context", + [](PyType &self) -> nb::typed { + return self.getContext().getObject(); + }, "Context that owns the Type") .def("__eq__", [](PyType &self, PyType &other) { return self == other; }) .def( @@ -4141,7 +4187,10 @@ void mlir::python::populateIRCore(nb::module_ &m) { printAccum.parts.append(")"); return printAccum.join(); }) - .def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, &PyType::maybeDownCast) + .def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, + [](PyType &self) -> nb::typed { + return self.maybeDownCast(); + }) .def_prop_ro("typeid", [](PyType &self) { MlirTypeID mlirTypeID = mlirTypeGetTypeID(self); if (!mlirTypeIDIsNull(mlirTypeID)) @@ -4156,7 +4205,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { //---------------------------------------------------------------------------- nb::class_(m, "TypeID") .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyTypeID::getCapsule) - .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyTypeID::createFromCapsule) + .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyTypeID::createFromCapsule) // Note, this tests whether the underlying TypeIDs are the same, // not whether the wrapper MlirTypeIDs are the same, nor whether // the Python objects are the same (i.e., PyTypeID is a value type). @@ -4177,10 +4226,10 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::class_(m, "Value") .def(nb::init(), nb::keep_alive<0, 1>(), nb::arg("value")) .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, &PyValue::getCapsule) - .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyValue::createFromCapsule) + .def_static(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyValue::createFromCapsule) .def_prop_ro( "context", - [](PyValue &self) { + [](PyValue &self) -> nb::typed { return self.getParentOperation()->getContext().getObject(); }, "Context in which the value lives.") @@ -4268,7 +4317,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { }, nb::arg("state"), kGetNameAsOperand) .def_prop_ro("type", - [](PyValue &self) { + [](PyValue &self) -> nb::typed { return PyType(self.getParentOperation()->getContext(), mlirValueGetType(self.get())) .maybeDownCast(); @@ -4285,6 +4334,33 @@ void mlir::python::populateIRCore(nb::module_ &m) { mlirValueReplaceAllUsesOfWith(self.get(), with.get()); }, kValueReplaceAllUsesWithDocstring) + .def( + "replace_all_uses_except", + [](MlirValue self, MlirValue with, PyOperation &exception) { + MlirOperation exceptedUser = exception.get(); + mlirValueReplaceAllUsesExcept(self, with, 1, &exceptedUser); + }, + nb::arg("with_"), nb::arg("exceptions"), + nb::sig("def replace_all_uses_except(self, with_: Value, exceptions: " + "Operation) -> None"), + kValueReplaceAllUsesExceptDocstring) + .def( + "replace_all_uses_except", + [](MlirValue self, MlirValue with, nb::list exceptions) { + // Convert Python list to a SmallVector of MlirOperations + llvm::SmallVector exceptionOps; + for (nb::handle exception : exceptions) { + exceptionOps.push_back(nb::cast(exception).get()); + } + + mlirValueReplaceAllUsesExcept( + self, with, static_cast(exceptionOps.size()), + exceptionOps.data()); + }, + nb::arg("with_"), nb::arg("exceptions"), + nb::sig("def replace_all_uses_except(self, with_: Value, exceptions: " + "Sequence[Operation]) -> None"), + kValueReplaceAllUsesExceptDocstring) .def( "replace_all_uses_except", 
[](PyValue &self, PyValue &with, PyOperation &exception) { @@ -4307,7 +4383,10 @@ void mlir::python::populateIRCore(nb::module_ &m) { }, nb::arg("with_"), nb::arg("exceptions"), kValueReplaceAllUsesExceptDocstring) - .def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, &PyValue::maybeDownCast) + .def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, + [](PyValue &self) -> nb::typed { + return self.maybeDownCast(); + }) .def_prop_ro( "location", [](MlirValue self) { @@ -4332,7 +4411,11 @@ void mlir::python::populateIRCore(nb::module_ &m) { //---------------------------------------------------------------------------- nb::class_(m, "SymbolTable") .def(nb::init()) - .def("__getitem__", &PySymbolTable::dunderGetItem) + .def("__getitem__", + [](PySymbolTable &self, + const std::string &name) -> nb::typed { + return self.dunderGetItem(name); + }) .def("insert", &PySymbolTable::insert, nb::arg("operation")) .def("erase", &PySymbolTable::erase, nb::arg("operation")) .def("__delitem__", &PySymbolTable::dunderDel) diff --git a/mlir/lib/Bindings/Python/IRInterfaces.cpp b/mlir/lib/Bindings/Python/IRInterfaces.cpp index 6c53289c5011e..31d4798ffb906 100644 --- a/mlir/lib/Bindings/Python/IRInterfaces.cpp +++ b/mlir/lib/Bindings/Python/IRInterfaces.cpp @@ -212,11 +212,9 @@ class PyConcreteOpInterface { /// Returns the operation instance from which this object was constructed. /// Throws a type error if this object was constructed from a subclass of /// OpView. - nb::object getOperationObject() { - if (operation == nullptr) { + nb::typed getOperationObject() { + if (operation == nullptr) throw nb::type_error("Cannot get an operation from a static interface"); - } - return operation->getRef().releaseObject(); } @@ -224,10 +222,8 @@ class PyConcreteOpInterface { /// constructed. Throws a type error if this object was constructed form a /// subclass of OpView. nb::typed getOpView() { - if (operation == nullptr) { + if (operation == nullptr) throw nb::type_error("Cannot get an opview from a static interface"); - } - return operation->createOpView(); } @@ -362,10 +358,9 @@ class PyShapedTypeComponents { "Returns whether the given shaped type component is ranked.") .def_prop_ro( "rank", - [](PyShapedTypeComponents &self) -> nb::object { - if (!self.ranked) { - return nb::none(); - } + [](PyShapedTypeComponents &self) -> std::optional { + if (!self.ranked) + return {}; return nb::int_(self.shape.size()); }, "Returns the rank of the given ranked shaped type components. If " @@ -373,10 +368,9 @@ class PyShapedTypeComponents { "returned.") .def_prop_ro( "shape", - [](PyShapedTypeComponents &self) -> nb::object { - if (!self.ranked) { - return nb::none(); - } + [](PyShapedTypeComponents &self) -> std::optional { + if (!self.ranked) + return {}; return nb::list(self.shape); }, "Returns the shape of the ranked shaped type components as a list " diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index 414f37cc97f2a..598ae0188464a 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -76,7 +76,7 @@ class PyObjectRef { /// Releases the object held by this instance, returning it. /// This is the proper thing to return from a function that wants to return /// the reference. Note that this does not work from initializers. 
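The `replace_all_uses_except` overloads bound above accept either a single excepted Operation or a sequence of Operations. A minimal usage sketch from Python (illustrative only, not part of the patch; the IR and names are hypothetical and assume the bindings are built with these changes):

    from mlir.ir import Context, Location, Module

    with Context(), Location.unknown():
        module = Module.parse(r"""
          func.func @f(%arg0: i32) -> i32 {
            %0 = arith.addi %arg0, %arg0 : i32
            %1 = arith.muli %arg0, %0 : i32
            return %1 : i32
          }""")
        block = module.body.operations[0].regions[0].blocks[0]
        add = block.operations[0]
        arg0 = block.arguments[0]
        # Redirect every use of %arg0 to the addi result, except the uses
        # inside the addi itself (a single op or a sequence of ops may be
        # passed as the exceptions).
        arg0.replace_all_uses_except(add.result, add.operation)
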
- nanobind::typed releaseObject() { + nanobind::object releaseObject() { assert(referrent && object); referrent = nullptr; auto stolen = std::move(object); @@ -88,12 +88,14 @@ class PyObjectRef { assert(referrent && object); return referrent; } - nanobind::typed getObject() { + nanobind::object getObject() { assert(referrent && object); return object; } operator bool() const { return referrent && object; } + using NBTypedT = nanobind::typed; + private: T *referrent; nanobind::object object; @@ -669,7 +671,7 @@ class PyOperation : public PyOperationBase, public BaseContextObject { /// Creates a PyOperation from the MlirOperation wrapped by a capsule. /// Ownership of the underlying MlirOperation is taken by calling this /// function. - static nanobind::object createFromCapsule(nanobind::object capsule); + static nanobind::object createFromCapsule(const nanobind::object &capsule); /// Creates an operation. See corresponding python docstring. static nanobind::object @@ -680,7 +682,7 @@ class PyOperation : public PyOperationBase, public BaseContextObject { PyLocation &location, const nanobind::object &ip, bool inferType); /// Creates an OpView suitable for this operation. - nanobind::typed createOpView(); + nanobind::object createOpView(); /// Erases the underlying MlirOperation, removes its pointer from the /// parent context's live operations map, and sets the valid bit false. @@ -690,7 +692,7 @@ class PyOperation : public PyOperationBase, public BaseContextObject { void setInvalid() { valid = false; } /// Clones this operation. - nanobind::typed clone(const nanobind::object &ip); + nanobind::object clone(const nanobind::object &ip); PyOperation(PyMlirContextRef contextRef, MlirOperation operation); @@ -890,7 +892,7 @@ class PyType : public BaseContextObject { /// is taken by calling this function. static PyType createFromCapsule(nanobind::object capsule); - nanobind::typed maybeDownCast(); + nanobind::object maybeDownCast(); private: MlirType type; @@ -1018,9 +1020,9 @@ class PyAttribute : public BaseContextObject { /// Note that PyAttribute instances are uniqued, so the returned object /// may be a pre-existing object. Ownership of the underlying MlirAttribute /// is taken by calling this function. - static PyAttribute createFromCapsule(nanobind::object capsule); + static PyAttribute createFromCapsule(const nanobind::object &capsule); - nanobind::typed maybeDownCast(); + nanobind::object maybeDownCast(); private: MlirAttribute attr; @@ -1099,10 +1101,12 @@ class PyConcreteAttribute : public BaseTy { return DerivedTy::isaFunction(otherAttr); }, nanobind::arg("other")); - cls.def_prop_ro("type", [](PyAttribute &attr) { - return PyType(attr.getContext(), mlirAttributeGetType(attr)) - .maybeDownCast(); - }); + cls.def_prop_ro( + "type", + [](PyAttribute &attr) -> nanobind::typed { + return PyType(attr.getContext(), mlirAttributeGetType(attr)) + .maybeDownCast(); + }); cls.def_prop_ro_static( "static_typeid", [](nanobind::object & /*class*/) -> PyTypeID { @@ -1178,7 +1182,7 @@ class PyValue { /// Gets a capsule wrapping the void* within the MlirValue. nanobind::object getCapsule(); - nanobind::typed maybeDownCast(); + nanobind::object maybeDownCast(); /// Creates a PyValue from the MlirValue wrapped by a capsule. Ownership of /// the underlying MlirValue is still tied to the owning operation. @@ -1269,8 +1273,7 @@ class PySymbolTable { /// Returns the symbol (opview) with the given name, throws if there is no /// such symbol in the table. 
- nanobind::typed - dunderGetItem(const std::string &name); + nanobind::object dunderGetItem(const std::string &name); /// Removes the given operation from the symbol table and erases it. void erase(PyOperationBase &symbol); diff --git a/mlir/lib/Bindings/Python/IRTypes.cpp b/mlir/lib/Bindings/Python/IRTypes.cpp index 09ef64d4e0baf..3488d92250b45 100644 --- a/mlir/lib/Bindings/Python/IRTypes.cpp +++ b/mlir/lib/Bindings/Python/IRTypes.cpp @@ -501,7 +501,7 @@ class PyComplexType : public PyConcreteType { "Create a complex type"); c.def_prop_ro( "element_type", - [](PyComplexType &self) { + [](PyComplexType &self) -> nb::typed { return PyType(self.getContext(), mlirComplexTypeGetElementType(self)) .maybeDownCast(); }, @@ -515,7 +515,7 @@ class PyComplexType : public PyConcreteType { void mlir::PyShapedType::bindDerived(ClassTy &c) { c.def_prop_ro( "element_type", - [](PyShapedType &self) { + [](PyShapedType &self) -> nb::typed { return PyType(self.getContext(), mlirShapedTypeGetElementType(self)) .maybeDownCast(); }, @@ -639,11 +639,16 @@ class PyVectorType : public PyConcreteType { using PyConcreteType::PyConcreteType; static void bindDerived(ClassTy &c) { - c.def_static("get", &PyVectorType::get, nb::arg("shape"), + c.def_static("get", &PyVectorType::getChecked, nb::arg("shape"), nb::arg("element_type"), nb::kw_only(), nb::arg("scalable") = nb::none(), nb::arg("scalable_dims") = nb::none(), nb::arg("loc") = nb::none(), "Create a vector type") + .def_static("get_unchecked", &PyVectorType::get, nb::arg("shape"), + nb::arg("element_type"), nb::kw_only(), + nb::arg("scalable") = nb::none(), + nb::arg("scalable_dims") = nb::none(), + nb::arg("context") = nb::none(), "Create a vector type") .def_prop_ro( "scalable", [](MlirType self) { return mlirVectorTypeIsScalable(self); }) @@ -658,10 +663,11 @@ class PyVectorType : public PyConcreteType { } private: - static PyVectorType get(std::vector shape, PyType &elementType, - std::optional scalable, - std::optional> scalableDims, - DefaultingPyLocation loc) { + static PyVectorType + getChecked(std::vector shape, PyType &elementType, + std::optional scalable, + std::optional> scalableDims, + DefaultingPyLocation loc) { if (scalable && scalableDims) { throw nb::value_error("'scalable' and 'scalable_dims' kwargs " "are mutually exclusive."); @@ -696,6 +702,42 @@ class PyVectorType : public PyConcreteType { throw MLIRError("Invalid type", errors.take()); return PyVectorType(elementType.getContext(), type); } + + static PyVectorType get(std::vector shape, PyType &elementType, + std::optional scalable, + std::optional> scalableDims, + DefaultingPyMlirContext context) { + if (scalable && scalableDims) { + throw nb::value_error("'scalable' and 'scalable_dims' kwargs " + "are mutually exclusive."); + } + + PyMlirContext::ErrorCapture errors(context->getRef()); + MlirType type; + if (scalable) { + if (scalable->size() != shape.size()) + throw nb::value_error("Expected len(scalable) == len(shape)."); + + SmallVector scalableDimFlags = llvm::to_vector(llvm::map_range( + *scalable, [](const nb::handle &h) { return nb::cast(h); })); + type = mlirVectorTypeGetScalable(shape.size(), shape.data(), + scalableDimFlags.data(), elementType); + } else if (scalableDims) { + SmallVector scalableDimFlags(shape.size(), false); + for (int64_t dim : *scalableDims) { + if (static_cast(dim) >= scalableDimFlags.size() || dim < 0) + throw nb::value_error("Scalable dimension index out of bounds."); + scalableDimFlags[dim] = true; + } + type = mlirVectorTypeGetScalable(shape.size(), 
shape.data(), + scalableDimFlags.data(), elementType); + } else { + type = mlirVectorTypeGet(shape.size(), shape.data(), elementType); + } + if (mlirTypeIsNull(type)) + throw MLIRError("Invalid type", errors.take()); + return PyVectorType(elementType.getContext(), type); + } }; /// Ranked Tensor Type subclass - RankedTensorType. @@ -724,6 +766,22 @@ class PyRankedTensorType nb::arg("shape"), nb::arg("element_type"), nb::arg("encoding") = nb::none(), nb::arg("loc") = nb::none(), "Create a ranked tensor type"); + c.def_static( + "get_unchecked", + [](std::vector shape, PyType &elementType, + std::optional &encodingAttr, + DefaultingPyMlirContext context) { + PyMlirContext::ErrorCapture errors(context->getRef()); + MlirType t = mlirRankedTensorTypeGet( + shape.size(), shape.data(), elementType, + encodingAttr ? encodingAttr->get() : mlirAttributeGetNull()); + if (mlirTypeIsNull(t)) + throw MLIRError("Invalid type", errors.take()); + return PyRankedTensorType(elementType.getContext(), t); + }, + nb::arg("shape"), nb::arg("element_type"), + nb::arg("encoding") = nb::none(), nb::arg("context") = nb::none(), + "Create a ranked tensor type"); c.def_prop_ro( "encoding", [](PyRankedTensorType &self) @@ -758,6 +816,17 @@ class PyUnrankedTensorType }, nb::arg("element_type"), nb::arg("loc") = nb::none(), "Create a unranked tensor type"); + c.def_static( + "get_unchecked", + [](PyType &elementType, DefaultingPyMlirContext context) { + PyMlirContext::ErrorCapture errors(context->getRef()); + MlirType t = mlirUnrankedTensorTypeGet(elementType); + if (mlirTypeIsNull(t)) + throw MLIRError("Invalid type", errors.take()); + return PyUnrankedTensorType(elementType.getContext(), t); + }, + nb::arg("element_type"), nb::arg("context") = nb::none(), + "Create a unranked tensor type"); } }; @@ -790,6 +859,27 @@ class PyMemRefType : public PyConcreteType { nb::arg("shape"), nb::arg("element_type"), nb::arg("layout") = nb::none(), nb::arg("memory_space") = nb::none(), nb::arg("loc") = nb::none(), "Create a memref type") + .def_static( + "get_unchecked", + [](std::vector shape, PyType &elementType, + PyAttribute *layout, PyAttribute *memorySpace, + DefaultingPyMlirContext context) { + PyMlirContext::ErrorCapture errors(context->getRef()); + MlirAttribute layoutAttr = + layout ? *layout : mlirAttributeGetNull(); + MlirAttribute memSpaceAttr = + memorySpace ? 
*memorySpace : mlirAttributeGetNull(); + MlirType t = + mlirMemRefTypeGet(elementType, shape.size(), shape.data(), + layoutAttr, memSpaceAttr); + if (mlirTypeIsNull(t)) + throw MLIRError("Invalid type", errors.take()); + return PyMemRefType(elementType.getContext(), t); + }, + nb::arg("shape"), nb::arg("element_type"), + nb::arg("layout") = nb::none(), + nb::arg("memory_space") = nb::none(), + nb::arg("context") = nb::none(), "Create a memref type") .def_prop_ro( "layout", [](PyMemRefType &self) -> nb::typed { @@ -858,6 +948,22 @@ class PyUnrankedMemRefType }, nb::arg("element_type"), nb::arg("memory_space").none(), nb::arg("loc") = nb::none(), "Create a unranked memref type") + .def_static( + "get_unchecked", + [](PyType &elementType, PyAttribute *memorySpace, + DefaultingPyMlirContext context) { + PyMlirContext::ErrorCapture errors(context->getRef()); + MlirAttribute memSpaceAttr = {}; + if (memorySpace) + memSpaceAttr = *memorySpace; + + MlirType t = mlirUnrankedMemRefTypeGet(elementType, memSpaceAttr); + if (mlirTypeIsNull(t)) + throw MLIRError("Invalid type", errors.take()); + return PyUnrankedMemRefType(elementType.getContext(), t); + }, + nb::arg("element_type"), nb::arg("memory_space").none(), + nb::arg("context") = nb::none(), "Create a unranked memref type") .def_prop_ro( "memory_space", [](PyUnrankedMemRefType &self) @@ -895,9 +1001,21 @@ class PyTupleType : public PyConcreteType { }, nb::arg("elements"), nb::arg("context") = nb::none(), "Create a tuple type"); + c.def_static( + "get_tuple", + [](std::vector elements, DefaultingPyMlirContext context) { + MlirType t = mlirTupleTypeGet(context->get(), elements.size(), + elements.data()); + return PyTupleType(context->getRef(), t); + }, + nb::arg("elements"), nb::arg("context") = nb::none(), + // clang-format off + nb::sig("def get_tuple(elements: Sequence[Type], context: mlir.ir.Context | None = None) -> TupleType"), + // clang-format on + "Create a tuple type"); c.def( "get_type", - [](PyTupleType &self, intptr_t pos) { + [](PyTupleType &self, intptr_t pos) -> nb::typed { return PyType(self.getContext(), mlirTupleTypeGetType(self, pos)) .maybeDownCast(); }, @@ -941,6 +1059,20 @@ class PyFunctionType : public PyConcreteType { }, nb::arg("inputs"), nb::arg("results"), nb::arg("context") = nb::none(), "Gets a FunctionType from a list of input and result types"); + c.def_static( + "get", + [](std::vector inputs, std::vector results, + DefaultingPyMlirContext context) { + MlirType t = + mlirFunctionTypeGet(context->get(), inputs.size(), inputs.data(), + results.size(), results.data()); + return PyFunctionType(context->getRef(), t); + }, + nb::arg("inputs"), nb::arg("results"), nb::arg("context") = nb::none(), + // clang-format off + nb::sig("def get(inputs: Sequence[Type], results: Sequence[Type], context: mlir.ir.Context | None = None) -> FunctionType"), + // clang-format on + "Gets a FunctionType from a list of input and result types"); c.def_prop_ro( "inputs", [](PyFunctionType &self) { diff --git a/mlir/lib/Bindings/Python/NanobindUtils.h b/mlir/lib/Bindings/Python/NanobindUtils.h index 40b3215f6f5fe..64ea4329f65f1 100644 --- a/mlir/lib/Bindings/Python/NanobindUtils.h +++ b/mlir/lib/Bindings/Python/NanobindUtils.h @@ -276,7 +276,7 @@ class Sliceable { /// Returns the element at the given slice index. Supports negative indices /// by taking elements in inverse order. Returns a nullptr object if out /// of bounds. 
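The hunks above add `get_unchecked` builders next to the existing checked `get` factories (plus `TupleType.get_tuple` and a `FunctionType.get` overload that take a context rather than a location). A small usage sketch (illustrative only, not part of the patch; assumes a Python bindings build that includes these changes):

    from mlir.ir import Context, F32Type, Location, RankedTensorType, VectorType

    with Context(), Location.unknown():
        f32 = F32Type.get()
        # Checked path: verification errors are reported against the current location.
        v = VectorType.get([4, 8], f32)
        # Unchecked path: constructs directly from the context, mirroring the C API.
        t = RankedTensorType.get_unchecked([2, 2], f32)
        print(v, t)
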
- nanobind::typed getItem(intptr_t index) { + nanobind::object getItem(intptr_t index) { // Negative indices mean we count from the end. index = wrapIndex(index); if (index < 0) { diff --git a/mlir/lib/Bindings/Python/Rewrite.cpp b/mlir/lib/Bindings/Python/Rewrite.cpp index 96f00cface64f..20392b9002706 100644 --- a/mlir/lib/Bindings/Python/Rewrite.cpp +++ b/mlir/lib/Bindings/Python/Rewrite.cpp @@ -9,12 +9,15 @@ #include "Rewrite.h" #include "IRModule.h" +#include "mlir-c/IR.h" #include "mlir-c/Rewrite.h" +#include "mlir-c/Support.h" // clang-format off #include "mlir/Bindings/Python/Nanobind.h" #include "mlir-c/Bindings/Python/Interop.h" // This is expected after nanobind. // clang-format on #include "mlir/Config/mlir-config.h" +#include "nanobind/nanobind.h" namespace nb = nanobind; using namespace mlir; @@ -24,6 +27,40 @@ using namespace mlir::python; namespace { #if MLIR_ENABLE_PDL_IN_PATTERNMATCH +static nb::object objectFromPDLValue(MlirPDLValue value) { + if (MlirValue v = mlirPDLValueAsValue(value); !mlirValueIsNull(v)) + return nb::cast(v); + if (MlirOperation v = mlirPDLValueAsOperation(value); !mlirOperationIsNull(v)) + return nb::cast(v); + if (MlirAttribute v = mlirPDLValueAsAttribute(value); !mlirAttributeIsNull(v)) + return nb::cast(v); + if (MlirType v = mlirPDLValueAsType(value); !mlirTypeIsNull(v)) + return nb::cast(v); + + throw std::runtime_error("unsupported PDL value type"); +} + +static std::vector objectsFromPDLValues(size_t nValues, + MlirPDLValue *values) { + std::vector args; + args.reserve(nValues); + for (size_t i = 0; i < nValues; ++i) + args.push_back(objectFromPDLValue(values[i])); + return args; +} + +// Convert the Python object to a boolean. +// If it evaluates to False, treat it as success; +// otherwise, treat it as failure. +// Note that None is considered success. +static MlirLogicalResult logicalResultFromObject(const nb::object &obj) { + if (obj.is_none()) + return mlirLogicalResultSuccess(); + + return nb::cast(obj) ? mlirLogicalResultFailure() + : mlirLogicalResultSuccess(); +} + /// Owning Wrapper around a PDLPatternModule. class PyPDLPatternModule { public: @@ -38,6 +75,34 @@ class PyPDLPatternModule { } MlirPDLPatternModule get() { return module; } + void registerRewriteFunction(const std::string &name, + const nb::callable &fn) { + mlirPDLPatternModuleRegisterRewriteFunction( + get(), mlirStringRefCreate(name.data(), name.size()), + [](MlirPatternRewriter rewriter, MlirPDLResultList results, + size_t nValues, MlirPDLValue *values, + void *userData) -> MlirLogicalResult { + nb::handle f = nb::handle(static_cast(userData)); + return logicalResultFromObject( + f(rewriter, results, objectsFromPDLValues(nValues, values))); + }, + fn.ptr()); + } + + void registerConstraintFunction(const std::string &name, + const nb::callable &fn) { + mlirPDLPatternModuleRegisterConstraintFunction( + get(), mlirStringRefCreate(name.data(), name.size()), + [](MlirPatternRewriter rewriter, MlirPDLResultList results, + size_t nValues, MlirPDLValue *values, + void *userData) -> MlirLogicalResult { + nb::handle f = nb::handle(static_cast(userData)); + return logicalResultFromObject( + f(rewriter, results, objectsFromPDLValues(nValues, values))); + }, + fn.ptr()); + } + private: MlirPDLPatternModule module; }; @@ -78,11 +143,59 @@ class PyFrozenRewritePatternSet { /// Create the `mlir.rewrite` here. 
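The `mlir.rewrite` bindings populated below add `register_rewrite_function` / `register_constraint_function` on `PDLModule`, along with `freeze()` and greedy-driver entry points. A usage sketch (illustrative only, not part of the patch; the PDL pattern text is elided, and the exact import path and module-level placement of the driver function are assumptions based on the nb::sig strings in these hunks):

    from mlir.ir import Context, Module
    from mlir.rewrite import PDLModule, apply_patterns_and_fold_greedily

    def my_constraint(rewriter, results, args):
        # Per the callback plumbing above: returning None or False means
        # success; any other truthy value means failure.
        return None

    pdl_text = "..."  # pdl.pattern definitions go here (elided)

    with Context():
        pdl_patterns = Module.parse(pdl_text)
        payload = Module.parse("module {}")
        pdl_module = PDLModule(pdl_patterns)
        pdl_module.register_constraint_function("my_constraint", my_constraint)
        frozen = pdl_module.freeze()
        apply_patterns_and_fold_greedily(payload, frozen)
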
void mlir::python::populateRewriteSubmodule(nb::module_ &m) { + nb::class_(m, "PatternRewriter"); //---------------------------------------------------------------------------- - // Mapping of the top-level PassManager + // Mapping of the PDLResultList and PDLModule //---------------------------------------------------------------------------- #if MLIR_ENABLE_PDL_IN_PATTERNMATCH + nb::class_(m, "PDLResultList") + .def( + "append", + [](MlirPDLResultList results, const PyValue &value) { + mlirPDLResultListPushBackValue(results, value); + }, + // clang-format off + nb::sig("def append(self, " MAKE_MLIR_PYTHON_QUALNAME("ir.Value") ")") + // clang-format on + ) + .def( + "append", + [](MlirPDLResultList results, const PyOperation &op) { + mlirPDLResultListPushBackOperation(results, op); + }, + // clang-format off + nb::sig("def append(self, " MAKE_MLIR_PYTHON_QUALNAME("ir.Operation") ")") + // clang-format on + ) + .def( + "append", + [](MlirPDLResultList results, const PyType &type) { + mlirPDLResultListPushBackType(results, type); + }, + // clang-format off + nb::sig("def append(self, " MAKE_MLIR_PYTHON_QUALNAME("ir.Type") ")") + // clang-format on + ) + .def( + "append", + [](MlirPDLResultList results, const PyAttribute &attr) { + mlirPDLResultListPushBackAttribute(results, attr); + }, + // clang-format off + nb::sig("def append(self, " MAKE_MLIR_PYTHON_QUALNAME("ir.Attribute") ")") + // clang-format on + ); nb::class_(m, "PDLModule") + .def( + "__init__", + [](PyPDLPatternModule &self, MlirModule module) { + new (&self) + PyPDLPatternModule(mlirPDLPatternModuleFromModule(module)); + }, + // clang-format off + nb::sig("def __init__(self, module: " MAKE_MLIR_PYTHON_QUALNAME("ir.Module") ") -> None"), + // clang-format on + "module"_a, "Create a PDL module from the given module.") .def( "__init__", [](PyPDLPatternModule &self, PyModule &module) { @@ -93,10 +206,27 @@ void mlir::python::populateRewriteSubmodule(nb::module_ &m) { nb::sig("def __init__(self, module: " MAKE_MLIR_PYTHON_QUALNAME("ir.Module") ") -> None"), // clang-format on "module"_a, "Create a PDL module from the given module.") - .def("freeze", [](PyPDLPatternModule &self) { - return new PyFrozenRewritePatternSet(mlirFreezeRewritePattern( - mlirRewritePatternSetFromPDLPatternModule(self.get()))); - }); + .def( + "freeze", + [](PyPDLPatternModule &self) { + return new PyFrozenRewritePatternSet(mlirFreezeRewritePattern( + mlirRewritePatternSetFromPDLPatternModule(self.get()))); + }, + nb::keep_alive<0, 1>()) + .def( + "register_rewrite_function", + [](PyPDLPatternModule &self, const std::string &name, + const nb::callable &fn) { + self.registerRewriteFunction(name, fn); + }, + nb::keep_alive<1, 3>()) + .def( + "register_constraint_function", + [](PyPDLPatternModule &self, const std::string &name, + const nb::callable &fn) { + self.registerConstraintFunction(name, fn); + }, + nb::keep_alive<1, 3>()); #endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH nb::class_(m, "FrozenRewritePatternSet") .def_prop_ro(MLIR_PYTHON_CAPI_PTR_ATTR, @@ -117,6 +247,22 @@ void mlir::python::populateRewriteSubmodule(nb::module_ &m) { // clang-format on "Applys the given patterns to the given module greedily while folding " "results.") + .def( + "apply_patterns_and_fold_greedily", + [](PyModule &module, MlirFrozenRewritePatternSet set) { + auto status = + mlirApplyPatternsAndFoldGreedily(module.get(), set, {}); + if (mlirLogicalResultIsFailure(status)) + throw std::runtime_error( + "pattern application failed to converge"); + }, + "module"_a, "set"_a, + // 
clang-format off + nb::sig("def apply_patterns_and_fold_greedily(module: " MAKE_MLIR_PYTHON_QUALNAME("ir.Module") ", set: FrozenRewritePatternSet) -> None"), + // clang-format on + "Applys the given patterns to the given module greedily while " + "folding " + "results.") .def( "apply_patterns_and_fold_greedily", [](PyOperationBase &op, PyFrozenRewritePatternSet &set) { @@ -131,5 +277,20 @@ void mlir::python::populateRewriteSubmodule(nb::module_ &m) { nb::sig("def apply_patterns_and_fold_greedily(op: " MAKE_MLIR_PYTHON_QUALNAME("ir._OperationBase") ", set: FrozenRewritePatternSet) -> None"), // clang-format on "Applys the given patterns to the given op greedily while folding " + "results.") + .def( + "apply_patterns_and_fold_greedily", + [](PyOperationBase &op, MlirFrozenRewritePatternSet set) { + auto status = mlirApplyPatternsAndFoldGreedilyWithOp( + op.getOperation(), set, {}); + if (mlirLogicalResultIsFailure(status)) + throw std::runtime_error( + "pattern application failed to converge"); + }, + "op"_a, "set"_a, + // clang-format off + nb::sig("def apply_patterns_and_fold_greedily(op: " MAKE_MLIR_PYTHON_QUALNAME("ir._OperationBase") ", set: FrozenRewritePatternSet) -> None"), + // clang-format on + "Applys the given patterns to the given op greedily while folding " "results."); } diff --git a/mlir/lib/CAPI/Dialect/LLVM.cpp b/mlir/lib/CAPI/Dialect/LLVM.cpp index 7a33046c6c872..eaad8a87aab9b 100644 --- a/mlir/lib/CAPI/Dialect/LLVM.cpp +++ b/mlir/lib/CAPI/Dialect/LLVM.cpp @@ -253,17 +253,16 @@ MlirAttribute mlirLLVMDIFileAttrGet(MlirContext ctx, MlirAttribute name, cast(unwrap(directory)))); } -MlirAttribute -mlirLLVMDICompileUnitAttrGet(MlirContext ctx, MlirAttribute id, - unsigned int sourceLanguage, MlirAttribute file, - MlirAttribute producer, bool isOptimized, - MlirLLVMDIEmissionKind emissionKind, - MlirLLVMDINameTableKind nameTableKind) { +MlirAttribute mlirLLVMDICompileUnitAttrGet( + MlirContext ctx, MlirAttribute id, unsigned int sourceLanguage, + MlirAttribute file, MlirAttribute producer, bool isOptimized, + MlirLLVMDIEmissionKind emissionKind, MlirLLVMDINameTableKind nameTableKind, + MlirAttribute splitDebugFilename) { return wrap(DICompileUnitAttr::get( unwrap(ctx), cast(unwrap(id)), sourceLanguage, cast(unwrap(file)), cast(unwrap(producer)), - isOptimized, DIEmissionKind(emissionKind), - DINameTableKind(nameTableKind))); + isOptimized, DIEmissionKind(emissionKind), DINameTableKind(nameTableKind), + cast(unwrap(splitDebugFilename)))); } MlirAttribute mlirLLVMDIFlagsAttrGet(MlirContext ctx, uint64_t value) { diff --git a/mlir/lib/CAPI/Transforms/Rewrite.cpp b/mlir/lib/CAPI/Transforms/Rewrite.cpp index 6f85357a14a18..8ee6308cadf83 100644 --- a/mlir/lib/CAPI/Transforms/Rewrite.cpp +++ b/mlir/lib/CAPI/Transforms/Rewrite.cpp @@ -13,6 +13,8 @@ #include "mlir/CAPI/Rewrite.h" #include "mlir/CAPI/Support.h" #include "mlir/CAPI/Wrap.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/PDLPatternMatch.h.inc" #include "mlir/IR/PatternMatch.h" #include "mlir/Rewrite/FrozenRewritePatternSet.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -301,6 +303,19 @@ mlirApplyPatternsAndFoldGreedilyWithOp(MlirOperation op, return wrap(mlir::applyPatternsGreedily(unwrap(op), *unwrap(patterns))); } +//===----------------------------------------------------------------------===// +/// PatternRewriter API +//===----------------------------------------------------------------------===// + +inline mlir::PatternRewriter *unwrap(MlirPatternRewriter rewriter) { + assert(rewriter.ptr && 
"unexpected null rewriter"); + return static_cast(rewriter.ptr); +} + +inline MlirPatternRewriter wrap(mlir::PatternRewriter *rewriter) { + return {rewriter}; +} + //===----------------------------------------------------------------------===// /// PDLPatternModule API //===----------------------------------------------------------------------===// @@ -331,4 +346,93 @@ mlirRewritePatternSetFromPDLPatternModule(MlirPDLPatternModule op) { op.ptr = nullptr; return wrap(m); } + +inline const mlir::PDLValue *unwrap(MlirPDLValue value) { + assert(value.ptr && "unexpected null PDL value"); + return static_cast(value.ptr); +} + +inline MlirPDLValue wrap(const mlir::PDLValue *value) { return {value}; } + +inline mlir::PDLResultList *unwrap(MlirPDLResultList results) { + assert(results.ptr && "unexpected null PDL results"); + return static_cast(results.ptr); +} + +inline MlirPDLResultList wrap(mlir::PDLResultList *results) { + return {results}; +} + +MlirValue mlirPDLValueAsValue(MlirPDLValue value) { + return wrap(unwrap(value)->dyn_cast()); +} + +MlirType mlirPDLValueAsType(MlirPDLValue value) { + return wrap(unwrap(value)->dyn_cast()); +} + +MlirOperation mlirPDLValueAsOperation(MlirPDLValue value) { + return wrap(unwrap(value)->dyn_cast()); +} + +MlirAttribute mlirPDLValueAsAttribute(MlirPDLValue value) { + return wrap(unwrap(value)->dyn_cast()); +} + +void mlirPDLResultListPushBackValue(MlirPDLResultList results, + MlirValue value) { + unwrap(results)->push_back(unwrap(value)); +} + +void mlirPDLResultListPushBackType(MlirPDLResultList results, MlirType value) { + unwrap(results)->push_back(unwrap(value)); +} + +void mlirPDLResultListPushBackOperation(MlirPDLResultList results, + MlirOperation value) { + unwrap(results)->push_back(unwrap(value)); +} + +void mlirPDLResultListPushBackAttribute(MlirPDLResultList results, + MlirAttribute value) { + unwrap(results)->push_back(unwrap(value)); +} + +inline std::vector wrap(ArrayRef values) { + std::vector mlirValues; + mlirValues.reserve(values.size()); + for (auto &value : values) { + mlirValues.push_back(wrap(&value)); + } + return mlirValues; +} + +void mlirPDLPatternModuleRegisterRewriteFunction( + MlirPDLPatternModule pdlModule, MlirStringRef name, + MlirPDLRewriteFunction rewriteFn, void *userData) { + unwrap(pdlModule)->registerRewriteFunction( + unwrap(name), + [userData, rewriteFn](PatternRewriter &rewriter, PDLResultList &results, + ArrayRef values) -> LogicalResult { + std::vector mlirValues = wrap(values); + return unwrap(rewriteFn(wrap(&rewriter), wrap(&results), + mlirValues.size(), mlirValues.data(), + userData)); + }); +} + +void mlirPDLPatternModuleRegisterConstraintFunction( + MlirPDLPatternModule pdlModule, MlirStringRef name, + MlirPDLConstraintFunction constraintFn, void *userData) { + unwrap(pdlModule)->registerConstraintFunction( + unwrap(name), + [userData, constraintFn](PatternRewriter &rewriter, + PDLResultList &results, + ArrayRef values) -> LogicalResult { + std::vector mlirValues = wrap(values); + return unwrap(constraintFn(wrap(&rewriter), wrap(&results), + mlirValues.size(), mlirValues.data(), + userData)); + }); +} #endif // MLIR_ENABLE_PDL_IN_PATTERNMATCH diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 0078eed8b7a67..85f0fd1dd1048 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -57,8 +57,25 @@ static Value convertUnsignedToI32(ConversionPatternRewriter &rewriter, 
static Value createI32Constant(ConversionPatternRewriter &rewriter, Location loc, int32_t value) { - Type i32 = rewriter.getI32Type(); - return LLVM::ConstantOp::create(rewriter, loc, i32, value); + return LLVM::ConstantOp::create(rewriter, loc, rewriter.getI32Type(), value); +} + +/// Convert an unsigned number `val` to i64. +static Value convertUnsignedToI64(ConversionPatternRewriter &rewriter, + Location loc, Value val) { + IntegerType i64 = rewriter.getI64Type(); + // Force check that `val` is of int type. + auto valTy = cast(val.getType()); + if (i64 == valTy) + return val; + return valTy.getWidth() > 64 + ? Value(LLVM::TruncOp::create(rewriter, loc, i64, val)) + : Value(LLVM::ZExtOp::create(rewriter, loc, i64, val)); +} + +static Value createI64Constant(ConversionPatternRewriter &rewriter, + Location loc, int64_t value) { + return LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(), value); } static Value createI1Constant(ConversionPatternRewriter &rewriter, Location loc, @@ -95,7 +112,7 @@ static Value getNumRecords(ConversionPatternRewriter &rewriter, Location loc, MemRefType memrefType, MemRefDescriptor &memrefDescriptor, ArrayRef strides, - uint32_t elementByteWidth) { + int64_t elementByteWidth) { if (memrefType.hasStaticShape() && !llvm::any_of(strides, ShapedType::isDynamic)) { int64_t size = memrefType.getRank() == 0 ? 1 : 0; @@ -103,9 +120,7 @@ static Value getNumRecords(ConversionPatternRewriter &rewriter, Location loc, for (uint32_t i = 0, e = memrefType.getRank(); i < e; ++i) size = std::max(shape[i] * strides[i], size); size = size * elementByteWidth; - assert(size < std::numeric_limits::max() && - "the memref buffer is too large"); - return createI32Constant(rewriter, loc, static_cast(size)); + return createI64Constant(rewriter, loc, size); } Value maxIndex; for (uint32_t i = 0, e = memrefType.getRank(); i < e; ++i) { @@ -116,9 +131,9 @@ static Value getNumRecords(ConversionPatternRewriter &rewriter, Location loc, ? LLVM::UMaxOp::create(rewriter, loc, maxIndex, maxThisDim) : maxThisDim; } - Value maxIndexI32 = convertUnsignedToI32(rewriter, loc, maxIndex); - Value byteWidthConst = createI32Constant(rewriter, loc, elementByteWidth); - return LLVM::MulOp::create(rewriter, loc, maxIndexI32, byteWidthConst); + Value maxIndexI64 = convertUnsignedToI64(rewriter, loc, maxIndex); + Value byteWidthConst = createI64Constant(rewriter, loc, elementByteWidth); + return LLVM::MulOp::create(rewriter, loc, maxIndexI64, byteWidthConst); } static Value makeBufferRsrc(ConversionPatternRewriter &rewriter, Location loc, @@ -536,52 +551,49 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern { LogicalResult matchAndRewrite(LDSBarrierOp op, LDSBarrierOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { - bool requiresInlineAsm = chipset < kGfx90a || chipset.majorVersion == 11; - + Location loc = op.getLoc(); + // This ensures that waits on global memory aren't introduced on + // chips that don't have the BackOffBarrier feature enabled in LLVM. + bool requiresInlineAsm = chipset < kGfx90a; + + Attribute mmra = + rewriter.getAttr("amdgpu-synchronize-as", "local"); + // Note: while there *is* a workgroup-one-as scope, this, when combined with + // the MMRA, will lead to the fence having no effect. 
This is because the + // codepaths for an atomic load or store will observe that a + // one-address-space atomic to LDS requires no synchronization because + // operations on LDS are totally ordered with respect to each other, and so + // will not emit the correct waitcnt operations that these fences are + // intended to produce. Therefore, we use a broader type of fence and rely + // on the MMRA to relax it to the semantics we want. + StringRef scope = "workgroup"; + + auto relFence = LLVM::FenceOp::create(rewriter, loc, + LLVM::AtomicOrdering::release, scope); + relFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra); if (requiresInlineAsm) { auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(), LLVM::AsmDialect::AD_ATT); - const char *asmStr = - ";;;WARNING: BREAKS DEBUG WATCHES\ns_waitcnt lgkmcnt(0)\ns_barrier"; + const char *asmStr = ";;;WARNING: BREAKS DEBUG WATCHES\ns_barrier"; const char *constraints = ""; - rewriter.replaceOpWithNewOp( - op, + LLVM::InlineAsmOp::create( + rewriter, loc, /*resultTypes=*/TypeRange(), /*operands=*/ValueRange(), /*asm_string=*/asmStr, constraints, /*has_side_effects=*/true, /*is_align_stack=*/false, LLVM::TailCallKind::None, /*asm_dialect=*/asmDialectAttr, /*operand_attrs=*/ArrayAttr()); - return success(); - } - if (chipset.majorVersion < 12) { - constexpr int32_t ldsOnlyBitsGfx6789 = ~(0x1f << 8); - constexpr int32_t ldsOnlyBitsGfx10 = ~(0x3f << 8); - // Left in place in case someone disables the inline ASM path or future - // chipsets use the same bit pattern. - constexpr int32_t ldsOnlyBitsGfx11 = ~(0x3f << 4); - - int32_t ldsOnlyBits; - if (chipset.majorVersion == 11) - ldsOnlyBits = ldsOnlyBitsGfx11; - else if (chipset.majorVersion == 10) - ldsOnlyBits = ldsOnlyBitsGfx10; - else if (chipset.majorVersion <= 9) - ldsOnlyBits = ldsOnlyBitsGfx6789; - else - return op.emitOpError( - "don't know how to lower this for chipset major version") - << chipset.majorVersion; - - Location loc = op->getLoc(); - ROCDL::SWaitcntOp::create(rewriter, loc, ldsOnlyBits); - rewriter.replaceOpWithNewOp(op); + } else if (chipset.majorVersion < 12) { + ROCDL::SBarrierOp::create(rewriter, loc); } else { - Location loc = op->getLoc(); - ROCDL::WaitDscntOp::create(rewriter, loc, 0); ROCDL::BarrierSignalOp::create(rewriter, loc, -1); - rewriter.replaceOpWithNewOp(op, -1); + ROCDL::BarrierWaitOp::create(rewriter, loc, -1); } + auto acqFence = LLVM::FenceOp::create(rewriter, loc, + LLVM::AtomicOrdering::acquire, scope); + acqFence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra); + rewriter.replaceOp(op, acqFence); return success(); } }; diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp index 1037e296c8128..a73afbcb6474b 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp @@ -663,7 +663,7 @@ static IntegerAttr wrapNumericMemorySpace(MLIRContext *ctx, unsigned space) { /// Generates a symbol with 0-sized array type for dynamic shared memory usage, /// or uses existing symbol. 
-LLVM::GlobalOp getDynamicSharedMemorySymbol( +static LLVM::GlobalOp getDynamicSharedMemorySymbol( ConversionPatternRewriter &rewriter, gpu::GPUModuleOp moduleOp, gpu::DynamicSharedMemoryOp op, const LLVMTypeConverter *typeConverter, MemRefType memrefType, unsigned alignmentBit) { diff --git a/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp b/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp index d95aeba8a4488..da4443dc86053 100644 --- a/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp +++ b/mlir/lib/Conversion/LLVMCommon/PrintCallHelper.cpp @@ -67,7 +67,7 @@ LogicalResult mlir::LLVM::createPrintStrCall( auto arrayTy = LLVM::LLVMArrayType::get(IntegerType::get(ctx, 8), elementVals.size()); auto globalOp = LLVM::GlobalOp::create( - builder, loc, arrayTy, /*constant=*/true, LLVM::Linkage::Private, + builder, loc, arrayTy, /*isConstant=*/true, LLVM::Linkage::Private, ensureSymbolNameIsUnique(moduleOp, symbolName, symbolTables), dataAttr); auto ptrTy = LLVM::LLVMPointerType::get(builder.getContext()); diff --git a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp index 49d06497dbeea..f44552c4556c2 100644 --- a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp +++ b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp @@ -512,7 +512,7 @@ calculateMemoryRequirements(Value accessedPtr, bool isNontemporal, if (!sizeInBytes.has_value()) return failure(); - memoryAccess = memoryAccess | spirv::MemoryAccess::Aligned; + memoryAccess |= spirv::MemoryAccess::Aligned; auto memAccessAttr = spirv::MemoryAccessAttr::get(ctx, memoryAccess); auto alignmentValue = preferredAlignment ? preferredAlignment : *sizeInBytes; auto alignment = IntegerAttr::get(IntegerType::get(ctx, 32), alignmentValue); diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index b7e3491117e9b..a9efada28a320 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -993,6 +993,14 @@ struct NVGPUTmaAsyncLoadOpLowering auto srcMemrefType = cast(op.getDst().getType()); Value dest = getStridedElementPtr(rewriter, op->getLoc(), srcMemrefType, adaptor.getDst(), {}); + // Intrinsics takes a shared-cluster pointer so we need an + // address space cast from 3 to 7. + // TODO: Introduce AS(7) in NVGPU. 
+ auto ptrSharedClusterType = LLVM::LLVMPointerType::get( + op->getContext(), + static_cast(NVVM::NVVMMemorySpace::SharedCluster)); + dest = LLVM::AddrSpaceCastOp::create(b, ptrSharedClusterType, dest); + Value barrier = getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(), adaptor.getMbarId(), rewriter); @@ -1001,9 +1009,14 @@ struct NVGPUTmaAsyncLoadOpLowering for (auto [index, value] : llvm::enumerate(coords)) { coords[index] = truncToI32(b, value); } + + // TODO: Enhance the NVGPU Op for other modes too rewriter.replaceOpWithNewOp( op, dest, adaptor.getTensorMapDescriptor(), coords, barrier, ValueRange{}, adaptor.getMulticastMask(), Value{}, + NVVM::TMALoadMode::TILE, // default is TILE mode + false, // default is cluster-scope + nullptr, // default is no cta-group adaptor.getPredicate()); return success(); } diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index 1955eec9964eb..a5336ed6bf2cd 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -186,56 +186,63 @@ static Value createLinalgBodyCalculationForElementwiseOp( if (isa(op)) { auto negate = cast(op); + int64_t inZp = 0, outZp = 0; FailureOr maybeInZp = negate.getInput1ZeroPoint(); - if (failed(maybeInZp)) { - (void)rewriter.notifyMatchFailure( - op, "input1 zero point cannot be statically determined"); - return nullptr; - } - FailureOr maybeOutZp = negate.getOutputZeroPoint(); - if (failed(maybeOutZp)) { - (void)rewriter.notifyMatchFailure( - op, "output zero point cannot be statically determined"); - return nullptr; - } - - int64_t inZp = *maybeInZp; - int64_t outZp = *maybeOutZp; + bool hasInZp = !failed(maybeInZp); + bool hasOutZp = !failed(maybeOutZp); + if (hasInZp) + inZp = *maybeInZp; + if (hasOutZp) + outZp = *maybeOutZp; if (isa(elementTy)) return arith::NegFOp::create(rewriter, loc, resultTypes, args[0]); if (isa(elementTy)) { - if (!inZp && !outZp) { + if (hasInZp && hasOutZp && !inZp && !outZp) { auto constant = arith::ConstantOp::create( rewriter, loc, IntegerAttr::get(elementTy, 0)); return arith::SubIOp::create(rewriter, loc, resultTypes, constant, args[0]); } + Value zpAddValue; + Type intermediateType; // Compute the maximum value that can occur in the intermediate buffer. const int32_t inputBitWidth = elementTy.getIntOrFloatBitWidth(); - const int64_t zpAdd = inZp + outZp; - const int64_t maxValue = - APInt::getSignedMaxValue(inputBitWidth).getSExtValue() + - std::abs(zpAdd) + 1; - - // Convert that maximum value into the maximum bitwidth needed to - // represent it. We assume 48-bit numbers may be supported further in - // the pipeline. int intermediateBitWidth = 64; - if (maxValue <= APInt::getSignedMaxValue(16).getSExtValue()) { - intermediateBitWidth = 16; - } else if (maxValue <= APInt::getSignedMaxValue(32).getSExtValue()) { - intermediateBitWidth = 32; - } else if (maxValue <= APInt::getSignedMaxValue(48).getSExtValue()) { - intermediateBitWidth = 48; - } - Type intermediateType = rewriter.getIntegerType(intermediateBitWidth); - Value zpAddValue = arith::ConstantOp::create( - rewriter, loc, rewriter.getIntegerAttr(intermediateType, zpAdd)); + if (hasInZp && hasOutZp) { + // Compute the maximum value that can occur in the intermediate buffer. 
+ const int64_t zpAdd = inZp + outZp; + const int64_t maxValue = + APInt::getSignedMaxValue(inputBitWidth).getSExtValue() + + std::abs(zpAdd) + 1; + + // Convert that maximum value into the maximum bitwidth needed to + // represent it. We assume 48-bit numbers may be supported further in + // the pipeline. + if (maxValue <= APInt::getSignedMaxValue(16).getSExtValue()) { + intermediateBitWidth = 16; + } else if (maxValue <= APInt::getSignedMaxValue(32).getSExtValue()) { + intermediateBitWidth = 32; + } else if (maxValue <= APInt::getSignedMaxValue(48).getSExtValue()) { + intermediateBitWidth = 48; + } + + intermediateType = rewriter.getIntegerType(intermediateBitWidth); + zpAddValue = rewriter.create( + loc, rewriter.getIntegerAttr(intermediateType, zpAdd)); + } else { + intermediateType = rewriter.getIntegerType(intermediateBitWidth); + auto arg1 = + rewriter.create(loc, intermediateType, args[1]); + auto arg2 = + rewriter.create(loc, intermediateType, args[2]); + zpAddValue = + rewriter.create(loc, intermediateType, arg1, arg2); + } // The negation can be applied by doing: // outputValue = inZp + outZp - inputValue @@ -298,6 +305,8 @@ static Value createLinalgBodyCalculationForElementwiseOp( IntegerAttr::get(elementTy, 1)); auto zero = arith::ConstantOp::create(rewriter, loc, IntegerAttr::get(elementTy, 0)); + auto i1zero = + arith::ConstantOp::create(rewriter, loc, IntegerAttr::get(i1Ty, 0)); auto i1one = arith::ConstantOp::create(rewriter, loc, IntegerAttr::get(i1Ty, 1)); @@ -315,9 +324,9 @@ static Value createLinalgBodyCalculationForElementwiseOp( ArrayRef()); auto isInputOdd = arith::AndIOp::create(rewriter, loc, i1Ty, truncated, i1one); - - auto shouldRound = arith::AndIOp::create( - rewriter, loc, i1Ty, shiftValueGreaterThanZero, isInputOdd); + // shifted, truncated, isInputOdd can be poison when input2 is 0. + auto shouldRound = arith::SelectOp::create( + rewriter, loc, i1Ty, shiftValueGreaterThanZero, isInputOdd, i1zero); auto extended = arith::ExtUIOp::create(rewriter, loc, resultTypes, shouldRound); return arith::AddIOp::create(rewriter, loc, resultTypes, result, extended); @@ -1013,9 +1022,14 @@ static ValueRange getBroadcastableOperands(Operation *operation, else return operands.take_front(3); } - // Input1_zp and output_zp cannot broadcast - if (isa(operation)) + if (auto negate = dyn_cast(operation)) { + FailureOr maybeInZp = negate.getInput1ZeroPoint(); + FailureOr maybeOutZp = negate.getOutputZeroPoint(); + if (failed(maybeOutZp) && failed(maybeInZp)) + return operands; + // Input1_zp and output_zp cannot broadcast when they are constants. return operands.take_front(1); + } return operands; } diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp index 79cb49a4f7dbc..d6a262275be3d 100644 --- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp +++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp @@ -741,7 +741,7 @@ creatLdMatrixCompatibleLoads(RewriterBase &rewriter, vector::TransferReadOp op, } // Adjust the load offset. 
- auto laneId = gpu::LaneIdOp::create(rewriter, loc, /*upperBound=*/nullptr); + auto laneId = gpu::LaneIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); FailureOr offsets = nvgpu::getLaneIdToLdMatrixMatrixCoord(rewriter, loc, *params); if (failed(offsets)) { @@ -781,7 +781,7 @@ createNonLdMatrixLoads(RewriterBase &rewriter, vector::TransferReadOp op, "conversion to distributed non-ldmatrix compatible load"); } - Value laneId = gpu::LaneIdOp::create(rewriter, loc, /*upperBound=*/nullptr); + Value laneId = gpu::LaneIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); // This is the individual element type. Type loadedElType = regInfo->registerLLVMType; @@ -915,7 +915,7 @@ convertTransferWriteToStores(RewriterBase &rewriter, vector::TransferWriteOp op, return rewriter.notifyMatchFailure(op, "not mma sync reg info"); VectorType vectorType = getMmaSyncVectorOperandType(*regInfo); - Value laneId = gpu::LaneIdOp::create(rewriter, loc, /*upperBound=*/nullptr); + Value laneId = gpu::LaneIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); for (unsigned i = 0; i < vectorType.getShape()[0]; i++) { Value logicalValueId = arith::ConstantOp::create( diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index e7266740894b1..e0b1a88d01cdc 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -1987,17 +1987,13 @@ struct VectorScalableStepOpLowering /// %e = add %c, %d /// ``` /// `vector.matrix_multiply` later lowers to `llvm.matrix.multiply`. -// -/// This only kicks in when vectorContractLowering is set to Matmul and -/// the vector.contract op is a row-major matrix multiply. class ContractionOpToMatmulOpLowering : public vector::MaskableOpRewritePattern { public: using MaskableOpRewritePattern::MaskableOpRewritePattern; - ContractionOpToMatmulOpLowering( - vector::VectorContractLowering vectorContractLowering, - MLIRContext *context, PatternBenefit benefit = 100) + ContractionOpToMatmulOpLowering(MLIRContext *context, + PatternBenefit benefit = 100) : MaskableOpRewritePattern(context, benefit) {} FailureOr @@ -2005,23 +2001,22 @@ class ContractionOpToMatmulOpLowering PatternRewriter &rewriter) const override; }; -/// Progressively lower a `vector.contract %a, %b, %c` with row-major matmul -/// semantics to: +/// Lower a qualifying `vector.contract %a, %b, %c` (with row-major matmul +/// semantics directly into `llvm.intr.matrix.multiply`: +/// BEFORE: +/// ```mlir +/// %res = vector.contract #matmat_trait %lhs, %rhs, %acc +/// : vector<2x4xf32>, vector<4x3xf32> into vector<2x3xf32> /// ``` -/// %mta = maybe_transpose -/// %mtb = maybe_transpose -/// %flattened_a = vector.shape_cast %mta -/// %flattened_b = vector.shape_cast %mtb -/// %flattened_d = llvm.intr.matrix.multiply %flattened_a, %flattened_b -/// %mtd = vector.shape_cast %flattened_d -/// %d = maybe_untranspose %mtd -/// %e = add %c, %d +/// +/// AFTER: +/// ```mlir +/// %lhs = vector.shape_cast %arg0 : vector<2x4xf32> to vector<8xf32> +/// %rhs = vector.shape_cast %arg1 : vector<4x3xf32> to vector<12xf32> +/// %matmul = llvm.intr.matrix.multiply %lhs, %rhs +/// %res = arith.addf %acc, %matmul : vector<2x3xf32> /// ``` // -/// This only kicks in when vectorContractLowering is set to `Matmul`. -/// vector.transpose operations are inserted if the vector.contract op is not a -/// row-major matrix multiply. -/// /// Scalable vectors are not supported. 
FailureOr ContractionOpToMatmulOpLowering::matchAndRewriteMaskableOp( vector::ContractionOp op, MaskingOpInterface maskOp, @@ -2116,7 +2111,19 @@ FailureOr ContractionOpToMatmulOpLowering::matchAndRewriteMaskableOp( return res; } -/// Lowers vector.transpose to llvm.intr.matrix.transpose +/// Lowers vector.transpose directly to llvm.intr.matrix.transpose +/// +/// BEFORE: +/// ```mlir +/// %tr = vector.transpose %vec, [1, 0] : vector<2x4xf32> to vector<4x2xf32> +/// ``` +/// AFTER: +/// ```mlir +/// %vec_cs = vector.shape_cast %vec : vector<2x4xf32> to vector<8xf32> +/// %tr = llvm.intr.matrix.transpose %vec_sc +/// {columns = 2 : i32, rows = 4 : i32} : vector<8xf32> into vector<8xf32> +/// %res = vector.shape_cast %tr : vector<8xf32> to vector<4x2xf32> +/// ``` class TransposeOpToMatrixTransposeOpLowering : public OpRewritePattern { public: diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp index cae490e5f03e7..f958edf2746e9 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.cpp @@ -70,7 +70,7 @@ void ConvertVectorToLLVMPass::runOnOperation() { populateVectorBitCastLoweringPatterns(patterns); populateVectorBroadcastLoweringPatterns(patterns); populateVectorContractLoweringPatterns(patterns, vectorContractLowering); - if (vectorContractLowering == vector::VectorContractLowering::Matmul) { + if (vectorContractLowering == vector::VectorContractLowering::LLVMIntr) { // This pattern creates a dependency on the LLVM dialect, hence we don't // include it in `populateVectorContractLoweringPatterns` that is part of // the Vector dialect (and should not depend on LLVM). @@ -80,7 +80,7 @@ void ConvertVectorToLLVMPass::runOnOperation() { populateVectorShapeCastLoweringPatterns(patterns); populateVectorInterleaveLoweringPatterns(patterns); populateVectorTransposeLoweringPatterns(patterns, vectorTransposeLowering); - if (vectorTransposeLowering == vector::VectorTransposeLowering::Flat) { + if (vectorTransposeLowering == vector::VectorTransposeLowering::LLVMIntr) { // This pattern creates a dependency on the LLVM dialect, hence we don't // include it in `populateVectorTransposeLoweringPatterns` that is part of // the Vector dialect (and should not depend on LLVM). 
diff --git a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp index 04f56b9691fd1..5061a4454a7fd 100644 --- a/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp +++ b/mlir/lib/Conversion/VectorToSPIRV/VectorToSPIRV.cpp @@ -753,7 +753,7 @@ struct VectorLoadOpConverter final spirv::MemoryAccessAttr memoryAccessAttr; IntegerAttr alignmentAttr; if (alignment.has_value()) { - memoryAccess = memoryAccess | spirv::MemoryAccess::Aligned; + memoryAccess |= spirv::MemoryAccess::Aligned; memoryAccessAttr = spirv::MemoryAccessAttr::get(rewriter.getContext(), memoryAccess); alignmentAttr = rewriter.getI32IntegerAttr(alignment.value()); @@ -822,7 +822,7 @@ struct VectorStoreOpConverter final spirv::MemoryAccessAttr memoryAccessAttr; IntegerAttr alignmentAttr; if (alignment.has_value()) { - memoryAccess = memoryAccess | spirv::MemoryAccess::Aligned; + memoryAccess |= spirv::MemoryAccess::Aligned; memoryAccessAttr = spirv::MemoryAccessAttr::get(rewriter.getContext(), memoryAccess); alignmentAttr = rewriter.getI32IntegerAttr(alignment.value()); diff --git a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp index 6e7421daeb223..adeb50b6da628 100644 --- a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp @@ -387,12 +387,15 @@ struct F4E2M1ExtFOpConverter : public OpRewritePattern { Value c0x1 = createConst(loc, i4Ty, 0x1, rewriter); Value c0x2 = createConst(loc, i4Ty, 0x2, rewriter); Value c0x4 = createConst(loc, i4Ty, 0x4, rewriter); + Value c0x7 = createConst(loc, i4Ty, 0x7, rewriter); + + Value i4BitsNoSign = arith::AndIOp::create(b, i4Bits, c0x7); // Set last Exponent bit and Mantissa. Value c0x00000014 = createConst(loc, i32Ty, 0x14, rewriter); - Value bits1To24 = arith::ShLIOp::create(b, i4Bits, c0x2); + Value bits1To24 = arith::ShLIOp::create(b, i4BitsNoSign, c0x2); Value isHalf = - arith::CmpIOp::create(b, arith::CmpIPredicate::eq, i4Bits, c0x1); + arith::CmpIOp::create(b, arith::CmpIPredicate::eq, i4BitsNoSign, c0x1); bits1To24 = arith::SelectOp::create(b, isHalf, c0x0, bits1To24); bits1To24 = arith::ExtUIOp::create(b, i32Ty, bits1To24); bits1To24 = arith::ShLIOp::create(b, bits1To24, c0x00000014); @@ -402,11 +405,11 @@ struct F4E2M1ExtFOpConverter : public OpRewritePattern { Value highExpBits = createConst(loc, i32Ty, 0x40000000, rewriter); Value lowExpBits = createConst(loc, i32Ty, 0x3f000000, rewriter); Value useLargerExp = - arith::CmpIOp::create(b, arith::CmpIPredicate::uge, i4Bits, c0x4); + arith::CmpIOp::create(b, arith::CmpIPredicate::uge, i4BitsNoSign, c0x4); Value bits25To31 = arith::SelectOp::create(b, useLargerExp, highExpBits, lowExpBits); Value zeroExp = - arith::CmpIOp::create(b, arith::CmpIPredicate::eq, i4Bits, c0x0); + arith::CmpIOp::create(b, arith::CmpIPredicate::eq, i4BitsNoSign, c0x0); bits25To31 = arith::SelectOp::create(b, zeroExp, zeroExpBits, bits25To31); // Set sign. diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp index f7b0b87085f3d..e0cf353da207f 100644 --- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp @@ -338,11 +338,21 @@ bool OpFilter::isOpAllowed(Operation *op) const { namespace { /// Default function arg type converter: Use a fully dynamic layout map. 
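Returning to the F4E2M1ExtFOpConverter fix in the ExpandOps hunk above: an E2M1 value is sign/exp/exp/mantissa, so the comparisons against 0x0, 0x1, and 0x4 must be made on the sign-stripped magnitude (`bits & 0x7`), otherwise negative inputs take the wrong path. A scalar reference conversion as an illustration of the intended semantics (not the generated IR), assuming the usual E2M1 value set {0, 0.5, 1, 1.5, 2, 3, 4, 6}:

```c++
#include <cstdint>

// Reference semantics for extending an FP4 E2M1 value to float.
// Bit layout, low to high: mantissa, exp0, exp1, sign.
float extendF4E2M1(uint8_t bits) {
  static constexpr float magnitudes[8] = {0.0f, 0.5f, 1.0f, 1.5f,
                                          2.0f, 3.0f, 4.0f, 6.0f};
  bits &= 0xF;                    // only the low 4 bits are meaningful
  uint8_t magnitude = bits & 0x7; // strip the sign before any comparison/lookup
  float value = magnitudes[magnitude];
  return (bits & 0x8) ? -value : value;
}
```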
-BaseMemRefType -defaultFunctionArgTypeConverter(TensorType type, Attribute memorySpace, +BufferLikeType +defaultFunctionArgTypeConverter(TensorLikeType type, Attribute memorySpace, func::FuncOp funcOp, const BufferizationOptions &options) { - return getMemRefTypeWithFullyDynamicLayout(type, memorySpace); + if (auto tensorType = mlir::dyn_cast(type)) { + return cast( + getMemRefTypeWithFullyDynamicLayout(tensorType, memorySpace)); + } + + // If not builtin, fallback to TensorLikeType::getBufferType() + auto bufferType = + type.getBufferType(options, [&]() { return funcOp->emitError(); }); + assert(succeeded(bufferType) && + "a valid buffer is always expected at function boundary"); + return *bufferType; } /// Default unknown type converter: Use a fully dynamic layout map. BaseMemRefType @@ -385,14 +395,25 @@ BufferizationOptions::dynCastBufferizableOp(Value value) const { void BufferizationOptions::setFunctionBoundaryTypeConversion( LayoutMapOption layoutMapOption) { - functionArgTypeConverterFn = [=](TensorType tensorType, Attribute memorySpace, + functionArgTypeConverterFn = [=](TensorLikeType type, Attribute memorySpace, func::FuncOp funcOp, const BufferizationOptions &options) { - if (layoutMapOption == LayoutMapOption::IdentityLayoutMap) - return bufferization::getMemRefTypeWithStaticIdentityLayout(tensorType, - memorySpace); - return bufferization::getMemRefTypeWithFullyDynamicLayout(tensorType, - memorySpace); + if (auto tensorType = mlir::dyn_cast(type)) { + if (layoutMapOption == LayoutMapOption::IdentityLayoutMap) + return cast( + bufferization::getMemRefTypeWithStaticIdentityLayout(tensorType, + memorySpace)); + return cast( + bufferization::getMemRefTypeWithFullyDynamicLayout(tensorType, + memorySpace)); + } + + // If not builtin, fallback to TensorLikeType::getBufferType() + auto bufferType = + type.getBufferType(options, [&]() { return funcOp->emitError(); }); + assert(succeeded(bufferType) && + "a valid buffer is always expected at function boundary"); + return *bufferType; }; inferFunctionResultLayout = layoutMapOption == LayoutMapOption::InferLayoutMap; diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp index 68ef51992efee..701ab52a491a8 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp @@ -401,7 +401,7 @@ bufferization::bufferizeBlockSignature(Block *block, RewriterBase &rewriter, // Compute the new signature. SmallVector newTypes; for (BlockArgument &bbArg : block->getArguments()) { - auto tensorType = dyn_cast(bbArg.getType()); + auto tensorType = dyn_cast(bbArg.getType()); if (!tensorType) { newTypes.push_back(bbArg.getType()); continue; diff --git a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp index f69efd1b3fa8c..d9d69342e42a8 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp @@ -49,29 +49,47 @@ void FuncAnalysisState::startFunctionAnalysis(FuncOp funcOp) { #endif // NDEBUG } +// Note: this is a local adaptor to unify TensorType and TensorLikeType code +// paths that both work with BufferizationOptions. 
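Both converter lambdas changed above follow the same dispatch: builtin `TensorType`s keep the memref-based conversion, while any other tensor-like type is asked for its own buffer type. A condensed sketch of that shape, with names taken from the surrounding code and MLIR headers assumed (illustration only, not a drop-in replacement):

```c++
// Sketch of the TensorLikeType-aware conversion used above.
BufferLikeType convertFuncArgType(TensorLikeType type, Attribute memorySpace,
                                  func::FuncOp funcOp,
                                  const BufferizationOptions &options) {
  // Builtin tensors: keep the existing fully dynamic layout conversion.
  if (auto tensorType = dyn_cast<TensorType>(type))
    return cast<BufferLikeType>(
        getMemRefTypeWithFullyDynamicLayout(tensorType, memorySpace));

  // Non-builtin tensor-like types decide their own buffer type.
  auto bufferType =
      type.getBufferType(options, [&]() { return funcOp->emitError(); });
  assert(succeeded(bufferType) &&
         "a valid buffer is always expected at function boundary");
  return *bufferType;
}
```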
+static mlir::Attribute +getDefaultMemorySpace(const BufferizationOptions &options, + TensorLikeType type) { + if (auto tensorType = dyn_cast(type)) { + return *options.defaultMemorySpaceFn(tensorType); + } + return nullptr; +} + /// Return the index-th bufferized function argument type. This assumes that the /// specified argument is a tensor. If the tensor is ranked, a layout map may be /// specified by the user (as per `options.functionArgTypeConverterFn`). -static BaseMemRefType +static BufferLikeType getBufferizedFunctionArgType(FuncOp funcOp, int64_t index, const BufferizationOptions &options) { - auto tensorType = - dyn_cast(funcOp.getFunctionType().getInput(index)); - assert(tensorType && "expected TensorType"); - - BaseMemRefType memrefType = options.functionArgTypeConverterFn( - tensorType, *options.defaultMemorySpaceFn(tensorType), funcOp, options); - - auto layoutAttr = funcOp.getArgAttrOfType( - index, BufferizationDialect::kBufferLayoutAttrName); - if (!layoutAttr) - return memrefType; - - auto rankedMemrefType = dyn_cast(memrefType); - assert(rankedMemrefType && "buffer layout not supported on unranked tensors"); - return MemRefType::get(rankedMemrefType.getShape(), - rankedMemrefType.getElementType(), layoutAttr, - rankedMemrefType.getMemorySpace()); + auto type = + dyn_cast(funcOp.getFunctionType().getInput(index)); + assert(type && "expected TensorLikeType"); + + // Note: For builtin tensors there is additional logic related to layout. + if (auto tensorType = dyn_cast(type)) { + BufferLikeType memrefType = options.functionArgTypeConverterFn( + type, *options.defaultMemorySpaceFn(tensorType), funcOp, options); + + auto layoutAttr = funcOp.getArgAttrOfType( + index, BufferizationDialect::kBufferLayoutAttrName); + if (!layoutAttr) + return memrefType; + + auto rankedMemrefType = dyn_cast(memrefType); + assert(rankedMemrefType && + "buffer layout not supported on unranked tensors"); + return cast(MemRefType::get( + rankedMemrefType.getShape(), rankedMemrefType.getElementType(), + layoutAttr, rankedMemrefType.getMemorySpace())); + } + + return options.functionArgTypeConverterFn(type, /*memSpace=*/nullptr, funcOp, + options); } /// Return the FuncOp called by `callOp`. @@ -227,13 +245,13 @@ struct CallOpInterface FunctionType funcType = funcOp.getFunctionType(); Type resultType = funcType.getResult(cast(value).getResultNumber()); - if (auto bufferizedType = dyn_cast(resultType)) - return cast(bufferizedType); + if (auto bufferizedType = dyn_cast(resultType)) + return bufferizedType; // Otherwise, call the type converter to compute the bufferized type. - auto tensorType = cast(resultType); + auto tensorType = cast(resultType); return cast(options.functionArgTypeConverterFn( - tensorType, *options.defaultMemorySpaceFn(tensorType), funcOp, + tensorType, getDefaultMemorySpace(options, tensorType), funcOp, options)); } @@ -248,7 +266,7 @@ struct CallOpInterface SmallVector resultTypes; for (Value result : callOp.getResults()) { Type returnType = result.getType(); - if (!isa(returnType)) { + if (!isa(returnType)) { // Non-tensor values are returned. resultTypes.push_back(returnType); continue; @@ -272,7 +290,7 @@ struct CallOpInterface for (OpOperand &opOperand : callOp->getOpOperands()) { // Non-tensor operands are just copied. 
- if (!isa(opOperand.get().getType())) { + if (!isa(opOperand.get().getType())) { newOperands.push_back(opOperand.get()); continue; } @@ -285,8 +303,8 @@ struct CallOpInterface Value buffer = *maybeBuffer; // Caller / callee type mismatch is handled with castOrReallocMemRefValue. - auto memRefType = funcType.getInput(opOperand.getOperandNumber()); - if (!isa(memRefType)) { + auto bufferType = funcType.getInput(opOperand.getOperandNumber()); + if (!isa(bufferType)) { // The called function was not bufferized yet. This can happen when // there cycles in the function call graph. Compute the bufferized // result type. @@ -296,7 +314,7 @@ struct CallOpInterface state); if (failed(maybeBufferType)) return failure(); - memRefType = *maybeBufferType; + bufferType = *maybeBufferType; } // Since we don't yet have a clear layout story, to_buffer may @@ -305,8 +323,8 @@ struct CallOpInterface // that will either canonicalize away or fail compilation until we can do // something better. Insert a reallocation + copy if it cannot be // statically guaranteed that a direct cast would be valid. - if (buffer.getType() != memRefType) { - auto memrefDstType = dyn_cast(memRefType); + if (buffer.getType() != bufferType) { + auto memrefDstType = dyn_cast(bufferType); assert(memrefDstType && "buffer layout not supported on unranked tensors"); FailureOr replacement = bufferization::castOrReallocMemRefValue( @@ -370,7 +388,7 @@ struct FuncOpInterface static bool supportsUnstructuredControlFlow() { return true; } bool hasTensorSemantics(Operation *op) const { - auto isaTensor = llvm::IsaPred; + auto isaTensor = llvm::IsaPred; // A function has tensor semantics if it has tensor arguments/results. auto funcOp = cast(op); @@ -406,8 +424,8 @@ struct FuncOpInterface // Function arguments are special. if (bbArg.getOwner() == &funcOp.getBody().front()) - return cast( - getBufferizedFunctionArgType(funcOp, bbArg.getArgNumber(), options)); + return getBufferizedFunctionArgType(funcOp, bbArg.getArgNumber(), + options); return OpWithUnstructuredControlFlowBufferizableOpInterfaceExternalModel:: getBufferType(op, value, options, state, invocationStack); @@ -430,7 +448,7 @@ struct FuncOpInterface SmallVector argTypes; for (const auto &it : llvm::enumerate(funcType.getInputs())) { Type argType = it.value(); - if (isa(argType)) { + if (isa(argType)) { argTypes.push_back( getBufferizedFunctionArgType(funcOp, it.index(), options)); continue; @@ -441,9 +459,9 @@ struct FuncOpInterface // Compute the result types. SmallVector retTypes; for (Type resultType : funcType.getResults()) { - if (auto tensorType = dyn_cast(resultType)) { - BaseMemRefType resultType = options.functionArgTypeConverterFn( - tensorType, *options.defaultMemorySpaceFn(tensorType), funcOp, + if (auto tensorType = dyn_cast(resultType)) { + BufferLikeType resultType = options.functionArgTypeConverterFn( + tensorType, getDefaultMemorySpace(options, tensorType), funcOp, options); retTypes.push_back(resultType); continue; @@ -473,7 +491,7 @@ struct FuncOpInterface SmallVector returnValues; for (auto [returnVal, bufferizedType] : llvm::zip_equal(returnOp->getOperands(), retTypes)) { - auto tensorType = dyn_cast(returnVal.getType()); + auto tensorType = dyn_cast(returnVal.getType()); rewriter.setInsertionPoint(returnOp); // If not a tensor type just forward it. 
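`llvm::IsaPred` used by `hasTensorSemantics` above is the callable form of `isa<...>`, which composes with range algorithms. A minimal usage sketch (the helper name and the MLIR includes are assumptions, not from the patch):

```c++
#include "llvm/ADT/STLExtras.h"   // llvm::any_of
#include "llvm/Support/Casting.h" // llvm::IsaPred
// (MLIR headers for TypeRange and bufferization::TensorLikeType assumed.)

// True if any type in the range is tensor-like; IsaPred<T> behaves like
// [](auto v) { return isa<T>(v); }.
static bool anyTensorLike(mlir::TypeRange types) {
  return llvm::any_of(types, llvm::IsaPred<mlir::bufferization::TensorLikeType>);
}
```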
diff --git a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp index 582593adfa5c0..f1da1a125e9ef 100644 --- a/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp +++ b/mlir/lib/Dialect/ControlFlow/IR/ControlFlowOps.cpp @@ -122,6 +122,16 @@ static LogicalResult collapseBranch(Block *&successor, Block *successorDest = successorBranch.getDest(); if (successorDest == successor) return failure(); + // Don't try to collapse branches which participate in a cycle. + BranchOp nextBranch = dyn_cast(successorDest->getTerminator()); + llvm::DenseSet visited{successor, successorDest}; + while (nextBranch) { + Block *nextBranchDest = nextBranch.getDest(); + if (visited.contains(nextBranchDest)) + return failure(); + visited.insert(nextBranchDest); + nextBranch = dyn_cast(nextBranchDest->getTerminator()); + } // Update the operands to the successor. If the branch parent has no // arguments, we can use the branch operands directly. diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp index a098e721303a8..594c7a265667e 100644 --- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp @@ -52,6 +52,7 @@ gpu::setMappingAttr(ParallelOp ploopOp, namespace gpu { namespace { enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 }; +enum class MappingPolicy { OutermostFirst, InnermostFirst }; } // namespace static constexpr int kNumHardwareIds = 3; @@ -65,16 +66,30 @@ static MappingLevel &operator++(MappingLevel &mappingLevel) { return mappingLevel; } +// Map the policy string to a typed mapping policy. +// TODO: Revisit this and possibly use a loop interchange pass instead. +static FailureOr getMappingPolicyFromStr(StringRef policy) { + std::string policyCanonical = policy.trim().lower(); + + std::optional option = + llvm::StringSwitch>(policyCanonical) + .Case("innermost-first", MappingPolicy::InnermostFirst) + .Case("outermost-first", MappingPolicy::OutermostFirst) + .Default(std::nullopt); + + if (!option) + return failure(); + return *option; +} + /// Computed the hardware id to use for a given mapping level. Will /// assign x,y and z hardware ids for the first 3 dimensions and use /// sequential after. -/// TODO: Make this use x for the inner-most loop that is -/// distributed to map to x, the next innermost to y and the next innermost to -/// z. static Processor getHardwareIdForMapping(MappingLevel level, int dimension) { if (dimension >= kNumHardwareIds || level == Sequential) return Processor::Sequential; + switch (level) { case MapGrid: switch (dimension) { @@ -107,20 +122,35 @@ static Processor getHardwareIdForMapping(MappingLevel level, int dimension) { /// Add mapping information to the given parallel loop. Do not add /// mapping information if the loop already has it. Also, don't /// start a mapping at a nested loop. -static void mapParallelOp(ParallelOp parallelOp, - MappingLevel mappingLevel = MapGrid) { +static void +mapParallelOp(ParallelOp parallelOp, MappingLevel mappingLevel = MapGrid, + MappingPolicy mappingPolicy = MappingPolicy::OutermostFirst) { // Do not try to add a mapping to already mapped loops or nested loops. 
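With the new policy, `mapParallelOp` below picks a hardware dimension for each of the first `kNumHardwareIds` loops either outermost-first (the previous behaviour) or innermost-first, and leaves the rest sequential. A standalone sketch of that per-loop index computation with a worked example (illustration only; dimensions 0/1/2 correspond to x/y/z in this pass):

```c++
#include <algorithm>
#include <cstdio>

enum class MappingPolicy { OutermostFirst, InnermostFirst };
constexpr int kNumHardwareIds = 3;

// Hardware dimension for loop `i` of `numLoops`, or kNumHardwareIds meaning
// "map sequentially".
int hardwareDimForLoop(int i, int numLoops, MappingPolicy policy) {
  int loopsToMap = std::min(numLoops, kNumHardwareIds);
  if (i >= loopsToMap)
    return kNumHardwareIds;
  return policy == MappingPolicy::OutermostFirst ? i : (loopsToMap - 1 - i);
}

int main() {
  // Four loops, innermost-first: loops 0..2 get dimensions 2, 1, 0 (z, y, x)
  // and loop 3 is mapped sequentially.
  for (int i = 0; i < 4; ++i)
    std::printf("loop %d -> %d\n", i,
                hardwareDimForLoop(i, 4, MappingPolicy::InnermostFirst));
}
```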
if (parallelOp->getAttr(getMappingAttrName()) || ((mappingLevel == MapGrid) && parallelOp->getParentOfType())) return; + const int numLoops = static_cast(parallelOp.getNumLoops()); + const int loopsToMap = std::min(numLoops, kNumHardwareIds); + MLIRContext *ctx = parallelOp.getContext(); Builder b(ctx); SmallVector attrs; - attrs.reserve(parallelOp.getNumLoops()); - for (int i = 0, e = parallelOp.getNumLoops(); i < e; ++i) { + attrs.reserve(numLoops); + + for (int i = 0; i < numLoops; ++i) { + + // Determine the mapping to use for this loop. + // If the are more loops to map than HW IDs map to sequential. + int hwMapping = kNumHardwareIds; + if (i < loopsToMap) { + hwMapping = (mappingPolicy == MappingPolicy::OutermostFirst) + ? i + : (loopsToMap - 1 - i); + } + attrs.push_back(b.getAttr( - getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(), + getHardwareIdForMapping(mappingLevel, hwMapping), b.getDimIdentityMap(), b.getDimIdentityMap())); } (void)setMappingAttr(parallelOp, attrs); @@ -129,16 +159,31 @@ static void mapParallelOp(ParallelOp parallelOp, // walk but just iterate over the operations. for (Operation &op : *parallelOp.getBody()) { if (ParallelOp nested = dyn_cast(op)) - mapParallelOp(nested, mappingLevel); + mapParallelOp(nested, mappingLevel, mappingPolicy); } } namespace { struct GpuMapParallelLoopsPass : public impl::GpuMapParallelLoopsPassBase { + using Base::Base; + void runOnOperation() override { + // Parse the mapping policy. + FailureOr policyOrFailure = + getMappingPolicyFromStr(mappingPolicyStr); + if (failed(policyOrFailure)) { + getOperation()->emitError() << "Invalid mapping policy specified."; + return signalPassFailure(); + } + + MappingPolicy policy = *policyOrFailure; + MappingLevel topLevel = MappingLevel::MapGrid; + for (Region ®ion : getOperation()->getRegions()) { - region.walk([](ParallelOp parallelOp) { mapParallelOp(parallelOp); }); + region.walk([&](ParallelOp parallelOp) { + mapParallelOp(parallelOp, topLevel, policy); + }); } } }; diff --git a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp index b45fdf34e78e1..81c3069cec16e 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp @@ -430,7 +430,7 @@ createSubgroupDPPReduction(PatternRewriter &rewriter, gpu::SubgroupReduceOp op, dpp = ROCDL::PermlaneX16Op::create(rewriter, loc, res.getType(), res, res, uint32Max, uint32Max, /*fi=*/true, - /*bound_ctrl=*/false); + /*boundControl=*/false); res = vector::makeArithReduction( rewriter, loc, gpu::convertReductionKind(mode), res, dpp); } else { diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index a3d5d25b96ec2..5d08cccb4faab 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -4085,6 +4085,25 @@ printIndirectBrOpSucessors(OpAsmPrinter &p, IndirectBrOp op, Type flagType, p << "]"; } +//===----------------------------------------------------------------------===// +// SincosOp (intrinsic) +//===----------------------------------------------------------------------===// + +LogicalResult LLVM::SincosOp::verify() { + auto operandType = getOperand().getType(); + auto resultType = getResult().getType(); + auto resultStructType = + mlir::dyn_cast(resultType); + if (!resultStructType || resultStructType.getBody().size() != 2 || + resultStructType.getBody()[0] != operandType || + 
resultStructType.getBody()[1] != operandType) { + return emitOpError("expected result type to be an homogeneous struct with " + "two elements matching the operand type, but got ") + << resultType; + } + return success(); +} + //===----------------------------------------------------------------------===// // AssumeOp (intrinsic) //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index 13f1dd9a664e5..682bf8cc102cb 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -45,12 +45,14 @@ using namespace NVVM; #include "mlir/Dialect/LLVMIR/NVVMOpsDialect.cpp.inc" #include "mlir/Dialect/LLVMIR/NVVMOpsEnums.cpp.inc" +static constexpr unsigned notIntrinsic = llvm::Intrinsic::not_intrinsic; + //===----------------------------------------------------------------------===// // Verifier methods //===----------------------------------------------------------------------===// // This verifier is shared among the following Ops: -// CpAsyncBulkTensorGlobalToSharedClusterOp (TMA Load) +// CpAsyncBulkTensorSharedCTAToGlobalOp (TMA Store) // CpAsyncBulkTensorReduceOp (TMA Store-Reduce) static LogicalResult cpAsyncBulkTensorCommonVerifier(size_t tensorDims, bool isIm2Col, @@ -74,13 +76,6 @@ static LogicalResult cpAsyncBulkTensorCommonVerifier(size_t tensorDims, return success(); } -LogicalResult CpAsyncBulkTensorGlobalToSharedClusterOp::verify() { - size_t numIm2ColOffsets = getIm2colOffsets().size(); - bool isIm2Col = numIm2ColOffsets > 0; - return cpAsyncBulkTensorCommonVerifier(getCoordinates().size(), isIm2Col, - numIm2ColOffsets, getLoc()); -} - LogicalResult CpAsyncBulkTensorSharedCTAToGlobalOp::verify() { TMAStoreMode mode = getMode(); // We lower through inline-ptx when getPredicate() is true. @@ -158,6 +153,38 @@ LogicalResult CpAsyncBulkTensorPrefetchOp::verify() { getMode(), getLoc()); } +LogicalResult CpAsyncBulkTensorGlobalToSharedClusterOp::verify() { + TMALoadMode mode = getMode(); + bool isCTAOnly = getIsCTAOnly(); + if (getPredicate()) { // Inline-asm based lowering + if (isCTAOnly) + return emitError("Predicate is supported only for shared::cluster mode."); + if (mode != TMALoadMode::TILE && mode != TMALoadMode::IM2COL) + return emitError( + "Predicate is supported only for Tile and Im2col modes."); + } else { // Intrinsics-based lowering + NVVMMemorySpace expectedAS = + isCTAOnly ? NVVMMemorySpace::Shared : NVVMMemorySpace::SharedCluster; + unsigned AS = llvm::cast(getDstMem().getType()) + .getAddressSpace(); + if (AS != expectedAS) + return emitError() + << (isCTAOnly + ? "Shared::cta destination requires address-space 3." 
+ : "Shared::cluster destination requires address-space 7."); + // Checks specific to shared::cta mode + if (isCTAOnly) { + if (getMulticastMask()) + return emitError("Multicast is not supported with shared::cta mode."); + if (getGroup()) + return emitError("CTAGroup is not supported with shared::cta mode."); + } + } + + return verifyTMALoadParams(getCoordinates().size(), getIm2colOffsets().size(), + getMode(), getLoc()); +} + LogicalResult CpAsyncBulkTensorReduceOp::verify() { TMAStoreMode mode = getMode(); size_t dims = getCoordinates().size(); @@ -1553,6 +1580,130 @@ mlir::NVVM::IDArgPair CpAsyncBulkSharedCTAToGlobalOp::getIntrinsicIDAndArgs( return {id, std::move(args)}; } +bool CpAsyncBulkTensorGlobalToSharedClusterOp::getAsmValues( + RewriterBase &rewriter, + llvm::SmallVectorImpl> + &asmValues) { + // Add all the operands but not the attrs to the asmValues list. + // The attrs here are used to generate the right variants for + // intrinsics-lowering. So, we ignore them while generating inline-PTX. + for (auto val : getOperands()) + asmValues.push_back({val, mlir::NVVM::PTXRegisterMod::Read}); + + return false; +} + +mlir::NVVM::IDArgPair +CpAsyncBulkTensorGlobalToSharedClusterOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast(op); + const bool isCTAOnly = thisOp.getIsCTAOnly(); + llvm::SmallVector args; + + // Fill the Intrinsic Args + args.push_back(mt.lookupValue(thisOp.getDstMem())); + args.push_back(mt.lookupValue(thisOp.getMbar())); + args.push_back(mt.lookupValue(thisOp.getTmaDescriptor())); + + // Coordinates and im2col-offsets + for (mlir::Value v : thisOp.getCoordinates()) + args.push_back(mt.lookupValue(v)); + for (mlir::Value v : thisOp.getIm2colOffsets()) + args.push_back(mt.lookupValue(v)); + + // MulticastMask, if available + mlir::Value mcMask = thisOp.getMulticastMask(); + const bool hasMC = static_cast(mcMask); + llvm::Value *i16Zero = + llvm::ConstantInt::get(llvm::Type::getInt16Ty(mt.getLLVMContext()), 0); + + // CacheHint, if available + mlir::Value cacheHint = thisOp.getL2CacheHint(); + const bool hasCacheHint = static_cast(cacheHint); + llvm::Value *i64Zero = + llvm::ConstantInt::get(llvm::Type::getInt64Ty(mt.getLLVMContext()), 0); + + // Flag argument CTAGroup + // CTA_1/2 is mapped to values 1 and 2 for the intrinsics. + // Hence, the +1 to getGroup(). + const int32_t val = + thisOp.getGroup() ? (static_cast(*thisOp.getGroup()) + 1) : 0; + llvm::Value *cg = + llvm::ConstantInt::get(llvm::Type::getInt32Ty(mt.getLLVMContext()), val); + + if (!isCTAOnly) { + // For shared::cluster, all the arguments that we build are applicable. + args.push_back(hasMC ? mt.lookupValue(mcMask) : i16Zero); + args.push_back(hasCacheHint ? mt.lookupValue(cacheHint) : i64Zero); + args.push_back(builder.getInt1(hasMC)); + args.push_back(builder.getInt1(hasCacheHint)); + args.push_back(cg); + } else { + // For shared::cta, only cache-hint is applicable. + args.push_back(hasCacheHint ? 
mt.lookupValue(cacheHint) : i64Zero); + args.push_back(builder.getInt1(hasCacheHint)); + } + + constexpr size_t numDims = 5; // 1D to 5D + constexpr size_t numModes = 5; // Tile, Im2col, w, w_128, gather4 + using rowTy = std::array; + using TableTy = std::array; + static constexpr TableTy IDTable{ + {{notIntrinsic, llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d}, + {notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d}, + {notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_w_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_w_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_w_5d}, + {notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_w_128_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_w_128_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_w_128_5d}, + {notIntrinsic, notIntrinsic, notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_gather4_2d}}}; + + static constexpr TableTy IDTableCTA{ + {{notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_cta_tile_1d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_cta_tile_2d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_cta_tile_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_cta_tile_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_cta_tile_5d}, + {notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_cta_im2col_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_cta_im2col_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_cta_im2col_5d}, + {notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_cta_im2col_w_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_cta_im2col_w_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_cta_im2col_w_5d}, + {notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_cta_im2col_w_128_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_cta_im2col_w_128_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_cta_im2col_w_128_5d}, + {notIntrinsic, notIntrinsic, notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d}}}; + + static_assert( + (getMaxEnumValForTMALoadMode() == std::size(IDTable) - 1) && + (getMaxEnumValForTMALoadMode() == std::size(IDTableCTA) - 1), + "TMALoadModes must match number of rows in IDTable and IDTableCTA"); + size_t mode = static_cast(thisOp.getMode()); + size_t dim = thisOp.getCoordinates().size(); + auto id = isCTAOnly ? 
IDTableCTA[mode][dim] : IDTable[mode][dim]; + assert(id != notIntrinsic && + "Invalid intrinsic for CpAsyncBulkTensorGlobalToSharedClusterOp."); + + return {id, std::move(args)}; +} + mlir::NVVM::IDArgPair CpAsyncBulkTensorPrefetchOp::getIntrinsicIDAndArgs( Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { auto thisOp = cast(op); @@ -1651,53 +1802,148 @@ CpAsyncBulkTensorSharedCTAToGlobalOp::getIntrinsicIDAndArgs( return {id, std::move(args)}; } -#define CP_ASYNC_BULK_TENSOR_REDUCE_MODE(op, dim, mode) \ - llvm::Intrinsic::nvvm_cp_async_bulk_tensor_##op##_##mode##_##dim##d +NVVM::IDArgPair CpAsyncBulkTensorReduceOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast(op); + llvm::LLVMContext &ctx = mt.getLLVMContext(); -#define CP_ASYNC_BULK_TENSOR_REDUCE(op, dim, is_im2col) \ - is_im2col ? CP_ASYNC_BULK_TENSOR_REDUCE_MODE(op, dim, im2col) \ - : CP_ASYNC_BULK_TENSOR_REDUCE_MODE(op, dim, tile) + llvm::SmallVector args; -#define GET_CP_ASYNC_BULK_TENSOR_ID(op, dims, is_im2col) \ - [&]() -> auto { \ - switch (dims) { \ - case 1: \ - return CP_ASYNC_BULK_TENSOR_REDUCE_MODE(op, 1, tile); \ - case 2: \ - return CP_ASYNC_BULK_TENSOR_REDUCE_MODE(op, 2, tile); \ - case 3: \ - return CP_ASYNC_BULK_TENSOR_REDUCE(op, 3, is_im2col); \ - case 4: \ - return CP_ASYNC_BULK_TENSOR_REDUCE(op, 4, is_im2col); \ - case 5: \ - return CP_ASYNC_BULK_TENSOR_REDUCE(op, 5, is_im2col); \ - default: \ - llvm_unreachable("Invalid TensorDim in CpAsyncBulkTensorReduceOp."); \ - } \ - }() + // Arguments to the intrinsic: + // shared_mem_ptr, tmaDesc, tensorDims + // cache_hint(if applicable) and flag(boolean) + args.push_back(mt.lookupValue(thisOp.getSrcMem())); + args.push_back(mt.lookupValue(thisOp.getTmaDescriptor())); + + for (Value v : thisOp.getCoordinates()) + args.push_back(mt.lookupValue(v)); + + mlir::Value cacheHint = thisOp.getL2CacheHint(); + const bool hasCacheHint = static_cast(cacheHint); + llvm::Value *i64ZeroValue = + llvm::ConstantInt::get(llvm::Type::getInt64Ty(ctx), 0); + args.push_back(hasCacheHint ? 
mt.lookupValue(cacheHint) : i64ZeroValue); + args.push_back(builder.getInt1(hasCacheHint)); + + const llvm::Intrinsic::ID notIntrinsic = llvm::Intrinsic::not_intrinsic; + + constexpr unsigned numRedKinds = 8; // ADD, MIN, MAX, INC, DEC, AND, OR, XOR + constexpr unsigned numLayouts = 2; // TILE, IM2COL + constexpr unsigned maxDim = 5; // 1D to 5D + using row = std::array; + using layoutTable = std::array; + using fullTable = std::array; + static constexpr fullTable IDTable{ + {// RedTy::ADD + {{{{notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_5d}}, + {{notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_im2col_5d}}}}, + // RedTy::MIN + {{{{notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_1d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_2d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_tile_5d}}, + {{notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_min_im2col_5d}}}}, + // RedTy::MAX + {{{{notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_1d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_2d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_tile_5d}}, + {{notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_max_im2col_5d}}}}, + // RedTy::INC + {{{{notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_1d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_2d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_tile_5d}}, + {{notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_inc_im2col_5d}}}}, + // RedTy::DEC + {{{{notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_1d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_2d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_tile_5d}}, + {{notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_dec_im2col_5d}}}}, + // 
RedTy::AND + {{{{notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_1d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_2d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_tile_5d}}, + {{notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_and_im2col_5d}}}}, + // RedTy::OR + {{{{notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_1d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_2d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_tile_5d}}, + {{notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_or_im2col_5d}}}}, + // RedTy::XOR + {{{{notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_1d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_2d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_4d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_tile_5d}}, + {{notIntrinsic, notIntrinsic, notIntrinsic, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_3d, + llvm::Intrinsic::nvvm_cp_async_bulk_tensor_reduce_xor_im2col_4d, + llvm::Intrinsic:: + nvvm_cp_async_bulk_tensor_reduce_xor_im2col_5d}}}}}}; + + static_assert(getMaxEnumValForTMAReduxKind() == std::size(IDTable) - 1, + "TMAReduxKinds must match number of rows in IDTable"); + + size_t redKind = static_cast(thisOp.getRedKind()); + size_t mode = static_cast(thisOp.getMode()); + size_t dim = thisOp.getCoordinates().size(); + + assert(redKind < IDTable.size() && + "Invalid redKind for CpAsyncBulkTensorReduceOp"); + assert(mode < IDTable[redKind].size() && + "Invalid mode for CpAsyncBulkTensorReduceOp"); + assert(dim < IDTable[redKind][mode].size() && + "Invalid dim for CpAsyncBulkTensorReduceOp"); + + llvm::Intrinsic::ID intrinsicID = IDTable[redKind][mode][dim]; + + assert(intrinsicID != notIntrinsic && + "Invalid intrinsic for CpAsyncBulkTensorReduceOp."); -llvm::Intrinsic::ID CpAsyncBulkTensorReduceOp::getIntrinsicID( - int tensorDims, NVVM::TMAReduxKind kind, bool isIm2Col) { - using RedTy = NVVM::TMAReduxKind; - switch (kind) { - case RedTy::ADD: - return GET_CP_ASYNC_BULK_TENSOR_ID(reduce_add, tensorDims, isIm2Col); - case RedTy::MIN: - return GET_CP_ASYNC_BULK_TENSOR_ID(reduce_min, tensorDims, isIm2Col); - case RedTy::MAX: - return GET_CP_ASYNC_BULK_TENSOR_ID(reduce_max, tensorDims, isIm2Col); - case RedTy::INC: - return GET_CP_ASYNC_BULK_TENSOR_ID(reduce_inc, tensorDims, isIm2Col); - case RedTy::DEC: - return GET_CP_ASYNC_BULK_TENSOR_ID(reduce_dec, tensorDims, isIm2Col); - case RedTy::AND: - return GET_CP_ASYNC_BULK_TENSOR_ID(reduce_and, tensorDims, isIm2Col); - case RedTy::OR: - return GET_CP_ASYNC_BULK_TENSOR_ID(reduce_or, tensorDims, isIm2Col); - case RedTy::XOR: - return GET_CP_ASYNC_BULK_TENSOR_ID(reduce_xor, tensorDims, isIm2Col); - } - llvm_unreachable("Invalid Reduction Op for CpAsyncBulkTensorReduceOp"); + return 
{intrinsicID, std::move(args)}; } #define _none diff --git a/mlir/lib/Dialect/LLVMIR/Transforms/DIScopeForLLVMFuncOp.cpp b/mlir/lib/Dialect/LLVMIR/Transforms/DIScopeForLLVMFuncOp.cpp index 5ceae9b16af20..67573c4ee6061 100644 --- a/mlir/lib/Dialect/LLVMIR/Transforms/DIScopeForLLVMFuncOp.cpp +++ b/mlir/lib/Dialect/LLVMIR/Transforms/DIScopeForLLVMFuncOp.cpp @@ -77,7 +77,7 @@ static void addScopeToFunction(LLVM::LLVMFuncOp llvmFunc, auto subprogramFlags = LLVM::DISubprogramFlags::Optimized; if (!llvmFunc.isExternal()) { id = DistinctAttr::create(UnitAttr::get(context)); - subprogramFlags = subprogramFlags | LLVM::DISubprogramFlags::Definition; + subprogramFlags |= LLVM::DISubprogramFlags::Definition; } else { compileUnitAttr = {}; } diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 578931e1351c6..59013a23b3e3b 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -5310,6 +5310,32 @@ bool PackOp::requirePaddingValue(ArrayRef inputShape, return false; } +bool PackOp::requirePaddingValueStrict(ArrayRef inputShape, + ArrayRef innerDimsPos, + ArrayRef outputShape, + ArrayRef outerDimsPerm, + ArrayRef innerTiles) { + SmallVector outputTileSizes( + outputShape.take_front(inputShape.size())); + if (!outerDimsPerm.empty()) { + assert(outerDimsPerm.size() == outputTileSizes.size() && + "expected output and outer_dims_perm to have same size"); + applyPermutationToVector(outputTileSizes, + invertPermutationVector(outerDimsPerm)); + } + for (auto [pos, tileSize] : llvm::zip_equal(innerDimsPos, innerTiles)) { + if (ShapedType::isDynamic(inputShape[pos]) || + ShapedType::isDynamic(outputTileSizes[pos])) + return true; + std::optional constantTile = getConstantIntValue(tileSize); + if (!constantTile) + return true; + if (inputShape[pos] % (*constantTile) != 0) + return true; + } + return false; +} + LogicalResult PackOp::verify() { if (failed(commonVerifierPackAndUnPackOp(*this))) return failure(); @@ -5583,14 +5609,13 @@ static bool inferStaticShape(PackOp packOp, SmallVectorImpl &srcShape, LogicalResult PackOp::canonicalize(PackOp packOp, PatternRewriter &rewriter) { // Fold an pack(unpack(x)) to x. if (auto unPackOp = packOp.getSource().getDefiningOp()) { - if (unPackOp.getSourceType() != packOp.getDestType()) - return failure(); - if (packOp.getPaddingValue() || - !hasSameInnerOuterAttribute(packOp, unPackOp) || - !haveSameTiles(packOp, unPackOp)) - return failure(); - rewriter.replaceOp(packOp, unPackOp.getSource()); - return success(); + if (unPackOp.getSourceType() == packOp.getDestType() && + !packOp.getPaddingValue() && + hasSameInnerOuterAttribute(packOp, unPackOp) && + haveSameTiles(packOp, unPackOp)) { + rewriter.replaceOp(packOp, unPackOp.getSource()); + return success(); + } } // Fold optional PaddingValue operand away if padding is not needed. 
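`PackOp::requirePaddingValueStrict` above is the conservative variant: it reports that a padding value is needed whenever the packed input dimension or the corresponding destination extent is dynamic, the tile size is not a constant, or the static size does not divide evenly by the tile. A simplified single-dimension sketch with worked examples (plain integers instead of `OpFoldResult`; illustration only):

```c++
#include <cstdint>
#include <optional>

// Stand-in for ShapedType::kDynamic.
constexpr int64_t kDynamic = INT64_MIN;

// Conservative "might need a padding value" check for one packed dimension.
bool dimNeedsPadding(int64_t dimSize, std::optional<int64_t> tileSize) {
  if (dimSize == kDynamic || !tileSize)
    return true;                   // unknown extent or non-constant tile
  return dimSize % *tileSize != 0; // e.g. 15 % 8 != 0 -> padding required
}

// dimNeedsPadding(16, 8)            == false  (divides evenly)
// dimNeedsPadding(15, 8)            == true
// dimNeedsPadding(kDynamic, 8)      == true
// dimNeedsPadding(16, std::nullopt) == true
```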
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp index 108abe800b13e..ebc4dcf6bbcb5 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp @@ -20,6 +20,7 @@ #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" +#include #include namespace mlir { @@ -124,6 +125,10 @@ rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcHwcfOp convOp) { auto filterType = cast(convOp.getInputs()[1].getType()); auto outputType = cast(convOp.getOutputs()[0].getType()); + if (!convOp.hasPureTensorSemantics()) + return rewriter.notifyMatchFailure( + convOp, "expected op to have pure tensor semantics"); + if (!filterType.hasStaticShape()) return rewriter.notifyMatchFailure( convOp, "expected a static shape for the filter"); @@ -155,10 +160,15 @@ rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcHwcfOp convOp) { Location loc = convOp.getLoc(); + assert(isa(filterType) && + "expected filter type to be a ranked tensor"); + auto tensorFilterType = cast(filterType); + // Reshape output and filter to the LHS and result of a (B)MNK matmul. SmallVector filterReassocIndices = {{0, 1, 2}, {3}}; auto reshapedFilterType = - RankedTensorType::get({fh * fw * ic, oc}, filterType.getElementType()); + RankedTensorType::get({fh * fw * ic, oc}, filterType.getElementType(), + tensorFilterType.getEncoding()); Value reshapedFilter = tensor::CollapseShapeOp::create( rewriter, loc, reshapedFilterType, filter, filterReassocIndices); @@ -253,6 +263,10 @@ rewriteInIm2Col(RewriterBase &rewriter, auto filterType = cast(convOp.getInputs()[1].getType()); auto outputType = cast(convOp.getOutputs()[0].getType()); + if (!convOp.hasPureTensorSemantics()) + return rewriter.notifyMatchFailure( + convOp, "expected op to have pure tensor semantics"); + if (!filterType.hasStaticShape()) return rewriter.notifyMatchFailure( convOp, "expected a static shape for the filter"); @@ -404,6 +418,10 @@ rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNchwFchwOp convOp) { auto filterType = cast(convOp.getInputs()[1].getType()); auto outputType = cast(convOp.getOutputs()[0].getType()); + if (!convOp.hasPureTensorSemantics()) + return rewriter.notifyMatchFailure( + convOp, "expected op to have pure tensor semantics"); + if (!filterType.hasStaticShape()) return rewriter.notifyMatchFailure( convOp, "expected a static shape for the filter"); @@ -435,9 +453,14 @@ rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNchwFchwOp convOp) { auto loc = convOp.getLoc(); MLIRContext *context = rewriter.getContext(); + assert(isa(filterType) && + "expected filter type to be a ranked tensor"); + auto tensorFilterType = cast(filterType); + SmallVector filterReassocIndices = {{0}, {1, 2, 3}}; auto reshapedFilterType = - RankedTensorType::get({oc, ic * fh * fw}, inputType.getElementType()); + RankedTensorType::get({oc, ic * fh * fw}, inputType.getElementType(), + tensorFilterType.getEncoding()); Value reshapedFilter = tensor::CollapseShapeOp::create( rewriter, loc, reshapedFilterType, filter, filterReassocIndices); @@ -529,6 +552,10 @@ rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp) { auto filterType = cast(convOp.getInputs()[1].getType()); auto outputType = cast(convOp.getOutputs()[0].getType()); + if (!convOp.hasPureTensorSemantics()) + return rewriter.notifyMatchFailure( + convOp, "expected op to have pure tensor 
semantics"); + if (!filterType.hasStaticShape()) return rewriter.notifyMatchFailure( convOp, "expected a static shape for the filter"); @@ -560,11 +587,16 @@ rewriteInIm2Col(RewriterBase &rewriter, linalg::Conv2DNhwcFhwcOp convOp) { Location loc = convOp.getLoc(); + assert(isa(filterType) && + "expected filter type to be a ranked tensor"); + auto tensorFilterType = cast(filterType); + // Reshape output and filter to the LHS and result of a "row-wise" matrix // multiplication. SmallVector filterReassocIndices = {{0}, {1, 2, 3}}; auto reshapedFilterType = - RankedTensorType::get({oc, fh * fw * ic}, filterType.getElementType()); + RankedTensorType::get({oc, fh * fw * ic}, filterType.getElementType(), + tensorFilterType.getEncoding()); Value reshapedFilter = tensor::CollapseShapeOp::create( rewriter, loc, reshapedFilterType, filter, filterReassocIndices); diff --git a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp index 6c17c3c2d0cab..3bb5f8af821c0 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/UB/IR/UBOps.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/IR/Dominance.h" +#include "mlir/IR/TypeUtilities.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/TypeSwitch.h" @@ -189,40 +190,20 @@ static SmallVector computeOuterDims(ArrayRef perm, return outerDimsPerm; } -/// Returns a tuple for packed operand and indexing_map with the assumptions: -/// 1) The generic op is the producer of the pack op. -/// 2) The generic op has only one result. -/// If the operand is a scalar or packing dimensions are all irrelevant to the -/// operand, the operand and the updated indexing map will be returned. -/// Otherwise, it returns the packed operand and the updated indexing map. E.g., -/// -/// #map0 = affine_map<(d0, d1) -> (d0, d1)> -/// #map1 = affine_map<(d0, d1) -> (d0)> -/// #map2 = affine_map<(d0, d1) -> (d1)> -/// %0 = linalg.generic {indexing_maps = [#map1, #map2, #map0], -/// iterator_types = ["parallel", "parallel"]} -/// ins(%arg0, %arg1 : tensor, tensor) -/// outs(%init : tensor) { -/// ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): -/// %4 = arith.addf %arg3, %arg4 : f32 -/// linalg.yield %4 : f32 -/// } -> tensor -/// %1 = linalg.pack %0 -/// inner_dims_pos = [0, 1] -/// inner_tiles = [8, 2] -/// into %dest : tensor -> tensor -/// -/// Taking the first input operand as an example, the inner tile size of d1 is -/// 8. Thus, the below operation and `affine_map<(d0, d1, d2, d3)> -> -/// affine_map<(d1, d3)>` will be returned. -/// -/// %pack = linalg.pack %arg0 -/// inner_dims_pos = [0] -/// inner_tiles = [8] -/// into %init : tensor -> tensor -static std::tuple -getOrCreatePackedViewOfOperand(OpBuilder &b, Location loc, PackInfo packInfo, - GenericOp genericOp, OpOperand *opOperand) { +struct PackedOperandDetails { + SmallVector innerTileSizes; + SmallVector innerDimsPos; + SmallVector outerDimsPerm; + AffineMap indexingMap; +}; + +/// Helper function for getOrCreatePackedViewOfOperand that populates +/// the details of the packedOperand that needs to be formed and also +/// returns if the packing would require padding. 
+static bool getPackedOperandDetails( + OpBuilder &b, PackInfo packInfo, GenericOp genericOp, OpOperand *opOperand, + DenseMap &packedOperandMap) { + PackedOperandDetails currOperandDetails; int64_t numOrigLoops = genericOp.getNumLoops(); int64_t numInnerLoops = packInfo.getNumTiledLoops(); int64_t numLoops = numOrigLoops + numInnerLoops; @@ -231,9 +212,12 @@ getOrCreatePackedViewOfOperand(OpBuilder &b, Location loc, PackInfo packInfo, SmallVector exprs(origIndexingMap.getResults()); // If the OpOperand is a scalar or a zero-rank tensor, no need to pack. - if (genericOp.isScalar(opOperand) || exprs.empty()) - return std::make_tuple(opOperand->get(), - AffineMap::get(numLoops, 0, exprs, b.getContext())); + if (genericOp.isScalar(opOperand) || exprs.empty()) { + currOperandDetails.indexingMap = + AffineMap::get(numLoops, 0, exprs, b.getContext()); + packedOperandMap[opOperand] = currOperandDetails; + return false; + } // Step 1. Construct the information of packing data dimensions; append inner // dimensions to the indexing maps for the operand. @@ -281,18 +265,86 @@ getOrCreatePackedViewOfOperand(OpBuilder &b, Location loc, PackInfo packInfo, exprs = auxVec; } } - auto indexingMap = AffineMap::get(numLoops, 0, exprs, b.getContext()); + currOperandDetails.indexingMap = + AffineMap::get(numLoops, 0, exprs, b.getContext()); // The operand does not have dimensions that relates to pack op. + if (innerDimsPos.empty() && outerDimsPerm.empty()) { + packedOperandMap[opOperand] = currOperandDetails; + return false; + } + auto inputType = cast(opOperand->get().getType()); + + auto maybeIntInnerTileSizes = + llvm::map_to_vector(innerTileSizes, [](OpFoldResult ofr) -> int64_t { + std::optional maybeCst = getConstantIntValue(ofr); + return maybeCst.value_or(ShapedType::kDynamic); + }); + bool requirePadding = linalg::PackOp::requirePaddingValueStrict( + inputType.getShape(), innerDimsPos, + linalg::PackOp::inferPackedType(inputType, maybeIntInnerTileSizes, + innerDimsPos, outerDimsPerm) + .getShape(), + outerDimsPerm, innerTileSizes); + currOperandDetails.innerDimsPos = innerDimsPos; + currOperandDetails.innerTileSizes = innerTileSizes; + currOperandDetails.outerDimsPerm = outerDimsPerm; + packedOperandMap[opOperand] = currOperandDetails; + + return requirePadding; +} + +/// Returns a tuple for packed operand and indexing_map with the assumptions: +/// 1) The generic op is the producer of the pack op. +/// 2) The generic op has only one result. +/// If the operand is a scalar or packing dimensions are all irrelevant to the +/// operand, the operand and the updated indexing map will be returned. +/// Otherwise, it returns the packed operand and the updated indexing map. E.g., +/// +/// #map0 = affine_map<(d0, d1) -> (d0, d1)> +/// #map1 = affine_map<(d0, d1) -> (d0)> +/// #map2 = affine_map<(d0, d1) -> (d1)> +/// %0 = linalg.generic {indexing_maps = [#map1, #map2, #map0], +/// iterator_types = ["parallel", "parallel"]} +/// ins(%arg0, %arg1 : tensor, tensor) +/// outs(%init : tensor) { +/// ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): +/// %4 = arith.addf %arg3, %arg4 : f32 +/// linalg.yield %4 : f32 +/// } -> tensor +/// %1 = linalg.pack %0 +/// inner_dims_pos = [0, 1] +/// inner_tiles = [8, 2] +/// into %dest : tensor -> tensor +/// +/// Taking the first input operand as an example, the inner tile size of d1 is +/// 8. Thus, the below operation and `affine_map<(d0, d1, d2, d3)> -> +/// affine_map<(d1, d3)>` will be returned. 
+/// +/// %pack = linalg.pack %arg0 +/// inner_dims_pos = [0] +/// inner_tiles = [8] +/// into %init : tensor -> tensor +static std::tuple getOrCreatePackedViewOfOperand( + OpBuilder &b, Location loc, OpOperand *opOperand, + const DenseMap &packedOperandMap) { + assert(packedOperandMap.contains(opOperand) && + "packed operand details expected to be populated"); + auto currOperandDetails = packedOperandMap.at(opOperand); + auto innerDimsPos = currOperandDetails.innerDimsPos; + auto outerDimsPerm = currOperandDetails.outerDimsPerm; + auto innerTileSizes = currOperandDetails.innerTileSizes; if (innerDimsPos.empty() && outerDimsPerm.empty()) - return std::make_tuple(opOperand->get(), indexingMap); + return std::make_tuple(opOperand->get(), currOperandDetails.indexingMap); auto empty = linalg::PackOp::createDestinationTensor( b, loc, opOperand->get(), innerTileSizes, innerDimsPos, outerDimsPerm); - auto packedOperand = linalg::PackOp::create( - b, loc, opOperand->get(), empty, innerDimsPos, innerTileSizes, - /*padding=*/std::nullopt, outerDimsPerm); - return std::make_tuple(packedOperand, indexingMap); + auto poison = ub::PoisonOp::create( + b, loc, getElementTypeOrSelf(opOperand->get().getType())); + Value packedOperand = + linalg::PackOp::create(b, loc, opOperand->get(), empty, innerDimsPos, + innerTileSizes, poison, outerDimsPerm); + return std::make_tuple(packedOperand, currOperandDetails.indexingMap); } /// This function is a helper subroutine to pack a genericOp and return it. It @@ -301,10 +353,10 @@ getOrCreatePackedViewOfOperand(OpBuilder &b, Location loc, PackInfo packInfo, /// around it. Implicitly this will only work when a packInfo can be obtained. /// This make sure that we are only using this function on parallel permuted /// dimensions. -static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp, - Value dest, AffineMap packedOutIndexingMap, - const PackInfo &packInfo, - bool isFoldableUnpackPack) { +static FailureOr +packGenericOp(RewriterBase &rewriter, GenericOp genericOp, Value dest, + AffineMap packedOutIndexingMap, const PackInfo &packInfo, + bool isFoldableUnpackPack, bool poisonPaddingOk) { Location loc = genericOp.getLoc(); SmallVector inputOperands; SmallVector inputOperandsFromUnpackedSource; @@ -314,9 +366,18 @@ static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp, packOp.getInnerDimsPos() == unPackOp.getInnerDimsPos() && llvm::equal(packOp.getMixedTiles(), unPackOp.getMixedTiles()); }; + DenseMap packedOperandMap; + bool requiresPadding = false; + for (OpOperand *inputOperand : genericOp.getDpsInputOperands()) { + requiresPadding |= getPackedOperandDetails(rewriter, packInfo, genericOp, + inputOperand, packedOperandMap); + } + if (requiresPadding && !poisonPaddingOk) + return failure(); + for (OpOperand *inputOperand : genericOp.getDpsInputOperands()) { auto [packedOperand, packedIndexingMap] = getOrCreatePackedViewOfOperand( - rewriter, loc, packInfo, genericOp, inputOperand); + rewriter, loc, inputOperand, packedOperandMap); auto unpackOp = inputOperand->get().getDefiningOp(); auto packOp = packedOperand.getDefiningOp(); if (packOp && unpackOp && hasEquivalentTiles(packOp, unpackOp)) { @@ -407,7 +468,8 @@ static bool isGenericOutsNotUsed(linalg::GenericOp genericOp) { /// } -> tensor static FailureOr bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, linalg::PackOp packOp, - const ControlPropagationFn &controlFn) { + const ControlPropagationFn &controlFn, + bool poisonPaddingOk) { auto genericOp = 
packOp.getSource().getDefiningOp(); if (!genericOp) return failure(); @@ -470,10 +532,15 @@ bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, linalg::PackOp packOp, } // Rebuild the indexing map for the corresponding init operand. - auto [packedOutOperand, packedOutIndexingMap] = - getOrCreatePackedViewOfOperand(rewriter, genericOp.getLoc(), *packInfo, - genericOp, opOperand); + DenseMap packedOperandMap; + bool requiresPadding = getPackedOperandDetails(rewriter, *packInfo, genericOp, + opOperand, packedOperandMap); + if (requiresPadding && !poisonPaddingOk) + return failure(); + auto [packedOutOperand, packedOutIndexingMap] = + getOrCreatePackedViewOfOperand(rewriter, genericOp.getLoc(), opOperand, + packedOperandMap); // Forward the new tensor.empty as a destination if it is one of the following // situations: // 1) The dps init operand is a tensor.empty. @@ -488,7 +555,8 @@ bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, linalg::PackOp packOp, // pack(unpack) isn't naively foldable because the unpack op can be from // an arbitrary domain so we need to keep both. return packGenericOp(rewriter, genericOp, dest, packedOutIndexingMap, - *packInfo, /*isFoldableUnpackPack=*/false); + *packInfo, /*isFoldableUnpackPack=*/false, + poisonPaddingOk); } /// Wrapper pattern that applies bubbleUpPackOpThroughGenericOp method. @@ -496,13 +564,15 @@ struct BubbleUpPackOpThroughGenericOpPattern : public OpRewritePattern { public: BubbleUpPackOpThroughGenericOpPattern(MLIRContext *context, - ControlPropagationFn fun) - : OpRewritePattern(context), controlFn(std::move(fun)) {} + ControlPropagationFn fun, + bool poisonPaddingOk) + : OpRewritePattern(context), controlFn(std::move(fun)), + poisonPaddingOk(std::move(poisonPaddingOk)) {} LogicalResult matchAndRewrite(linalg::PackOp packOp, PatternRewriter &rewriter) const override { - auto genericOp = - bubbleUpPackOpThroughGenericOp(rewriter, packOp, controlFn); + auto genericOp = bubbleUpPackOpThroughGenericOp(rewriter, packOp, controlFn, + poisonPaddingOk); if (failed(genericOp)) return failure(); rewriter.replaceOp(packOp, genericOp->getResults()); @@ -511,6 +581,7 @@ struct BubbleUpPackOpThroughGenericOpPattern private: ControlPropagationFn controlFn; + bool poisonPaddingOk; }; /// Propagate a linalg.pack operation up through a tensor.pad. The idea is to @@ -1080,7 +1151,8 @@ static FailureOr getUnPackedOperand(GenericOp genericOp) { /// static FailureOr> pushDownUnPackOpThroughGenericOp(RewriterBase &rewriter, GenericOp genericOp, - ControlPropagationFn controlFn) { + ControlPropagationFn controlFn, + bool poisonPaddingOk) { if (genericOp.getNumResults() != 1) return failure(); @@ -1107,9 +1179,17 @@ pushDownUnPackOpThroughGenericOp(RewriterBase &rewriter, GenericOp genericOp, return failure(); // Rebuild the indexing map for the corresponding init operand. 
+ DenseMap packedOperandMap; + bool requiresPadding = + getPackedOperandDetails(rewriter, *packInfo, genericOp, + genericOp.getDpsInitOperand(0), packedOperandMap); + if (requiresPadding && !poisonPaddingOk) + return failure(); + auto [packedOutOperand, packedOutIndexingMap] = - getOrCreatePackedViewOfOperand(rewriter, genericOp.getLoc(), *packInfo, - genericOp, genericOp.getDpsInitOperand(0)); + getOrCreatePackedViewOfOperand(rewriter, genericOp.getLoc(), + genericOp.getDpsInitOperand(0), + packedOperandMap); auto destPack = packedOutOperand.getDefiningOp(); // Forward the new tensor.empty as a destination if it is one of the following @@ -1129,9 +1209,12 @@ pushDownUnPackOpThroughGenericOp(RewriterBase &rewriter, GenericOp genericOp, // pack(unpack) is foldable in this case. This is because in pushing down the // unpack, by default we will populate an additional pack op after the unpack. // This guarantees them to be foldable. - GenericOp newGenericOp = + auto maybeGenericOp = packGenericOp(rewriter, genericOp, dest, packedOutIndexingMap, *packInfo, - /*isFoldableUnpackPack=*/true); + /*isFoldableUnpackPack=*/true, poisonPaddingOk); + if (failed(maybeGenericOp)) + return failure(); + GenericOp newGenericOp = *maybeGenericOp; Value newResult = newGenericOp.getTiedOpResult(newGenericOp.getDpsInitOperand(0)); @@ -1157,13 +1240,15 @@ pushDownUnPackOpThroughGenericOp(RewriterBase &rewriter, GenericOp genericOp, struct PushDownUnPackOpThroughGenericOp : public OpRewritePattern { public: PushDownUnPackOpThroughGenericOp(MLIRContext *context, - ControlPropagationFn fun) - : OpRewritePattern(context), controlFn(std::move(fun)) {} + ControlPropagationFn fun, + bool poisonPaddingOk) + : OpRewritePattern(context), controlFn(std::move(fun)), + poisonPaddingOk(std::move(poisonPaddingOk)) {} LogicalResult matchAndRewrite(GenericOp genericOp, PatternRewriter &rewriter) const override { - auto genericAndRepl = - pushDownUnPackOpThroughGenericOp(rewriter, genericOp, controlFn); + auto genericAndRepl = pushDownUnPackOpThroughGenericOp( + rewriter, genericOp, controlFn, poisonPaddingOk); if (failed(genericAndRepl)) return failure(); rewriter.replaceOp(genericOp, std::get<1>(*genericAndRepl)); @@ -1172,6 +1257,7 @@ struct PushDownUnPackOpThroughGenericOp : public OpRewritePattern { private: ControlPropagationFn controlFn; + bool poisonPaddingOk; }; /// Propagate a linalg.unpack operation through a tensor.pad. 
The idea is to @@ -1522,12 +1608,14 @@ class PushDownExtractSliceOpThroughGenericOp final void mlir::linalg::populateDataLayoutPropagationPatterns( RewritePatternSet &patterns, - const ControlPropagationFn &controlPackUnPackPropagation) { - patterns - .insert( - patterns.getContext(), controlPackUnPackPropagation); + const ControlPropagationFn &controlPackUnPackPropagation, + bool PoisonPaddingOk) { + patterns.insert( + patterns.getContext(), controlPackUnPackPropagation); + patterns.insert( + patterns.getContext(), controlPackUnPackPropagation, PoisonPaddingOk); } void mlir::linalg::populateExtractSliceSinkingPatterns( diff --git a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp index 8942670767231..0956c5d771394 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/PadTilingInterface.cpp @@ -141,7 +141,7 @@ SmallVector linalg::computePaddedShape( projectedDims.flip(paddingDim); AffineMap projectedMap = mlir::projectDims(partialIndexingMap, projectedDims, - /*compressDims=*/true); + /*compressDimsFlag=*/true); // If we are padding to the next multiple of, compose with ceil(sz) * sz. OpFoldResult paddingDimOfr; diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 3ee6ae1029f72..15c467b21c81e 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -1770,12 +1770,9 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, linalg::PackOp packOp, rewriter.setInsertionPoint(packOp); Location loc = packOp.getLoc(); - auto padValue = packOp.getPaddingValue(); - if (!padValue) { - padValue = arith::ConstantOp::create( - rewriter, loc, - rewriter.getZeroAttr(packOp.getSourceType().getElementType())); - } + std::optional padValue = packOp.getPaddingValue() + ? std::optional(packOp.getPaddingValue()) + : std::nullopt; // If the input vector sizes are not provided, then the vector sizes are // determined by the result tensor shape. 
In case the vector sizes aren't @@ -1936,11 +1933,8 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp, } // -- Generate the read operation -- - auto padValue = arith::ConstantOp::create( - rewriter, loc, - rewriter.getZeroAttr(unpackOp.getSourceType().getElementType())); Value readResult = vector::createReadOrMaskedRead( - rewriter, loc, unpackOp.getSource(), readVectorSizes, padValue, + rewriter, loc, unpackOp.getSource(), readVectorSizes, std::nullopt, useInBoundsInsteadOfMasking, readScalableVectorFlags); // -- Generate the transpose operation -- diff --git a/mlir/lib/Dialect/MemRef/IR/CMakeLists.txt b/mlir/lib/Dialect/MemRef/IR/CMakeLists.txt index 734294bd014c6..e25a0121a3359 100644 --- a/mlir/lib/Dialect/MemRef/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/MemRef/IR/CMakeLists.txt @@ -20,6 +20,7 @@ add_mlir_dialect_library(MLIRMemRefDialect MLIRInferIntRangeInterface MLIRInferTypeOpInterface MLIRIR + MLIRMemOpInterfaces MLIRMemorySlotInterfaces MLIRShapedOpInterfaces MLIRSideEffectInterfaces diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index 5d15d5f6e3de4..349b4deb29023 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -111,6 +111,65 @@ static void constifyIndexValues(SmallVectorImpl &values, } } +/// Helper function to retrieve a lossless memory-space cast, and the +/// corresponding new result memref type. +static std::tuple +getMemorySpaceCastInfo(BaseMemRefType resultTy, Value src) { + MemorySpaceCastOpInterface castOp = + MemorySpaceCastOpInterface::getIfPromotableCast(src); + + // Bail if the cast is not lossless. + if (!castOp) + return {}; + + // Transform the source and target type of `castOp` to have the same metadata + // as `resultTy`. Bail if not possible. + FailureOr srcTy = resultTy.clonePtrWith( + castOp.getSourcePtr().getType().getMemorySpace(), std::nullopt); + if (failed(srcTy)) + return {}; + + FailureOr tgtTy = resultTy.clonePtrWith( + castOp.getTargetPtr().getType().getMemorySpace(), std::nullopt); + if (failed(tgtTy)) + return {}; + + // Check if this is a valid memory-space cast. + if (!castOp.isValidMemorySpaceCast(*tgtTy, *srcTy)) + return {}; + + return std::make_tuple(castOp, *tgtTy, *srcTy); +} + +/// Implementation of `bubbleDownCasts` method for memref operations that +/// return a single memref result. +template +static FailureOr>> +bubbleDownCastsPassthroughOpImpl(ConcreteOpTy op, OpBuilder &builder, + OpOperand &src) { + auto [castOp, tgtTy, resTy] = getMemorySpaceCastInfo(op.getType(), src.get()); + // Bail if we cannot cast. + if (!castOp) + return failure(); + + // Create the new operands. + SmallVector operands; + llvm::append_range(operands, op->getOperands()); + operands[src.getOperandNumber()] = castOp.getSourcePtr(); + + // Create the new op and results. + auto newOp = ConcreteOpTy::create( + builder, op.getLoc(), TypeRange(resTy), operands, op.getProperties(), + llvm::to_vector_of(op->getDiscardableAttrs())); + + // Insert a memory-space cast to the original memory space of the op. 
+ MemorySpaceCastOpInterface result = castOp.cloneMemorySpaceCastOp( + builder, tgtTy, + cast>(newOp.getResult())); + return std::optional>( + SmallVector({result.getTargetPtr()})); +} + //===----------------------------------------------------------------------===// // AllocOp / AllocaOp //===----------------------------------------------------------------------===// @@ -542,6 +601,11 @@ OpFoldResult AssumeAlignmentOp::fold(FoldAdaptor adaptor) { return getMemref(); } +FailureOr>> +AssumeAlignmentOp::bubbleDownCasts(OpBuilder &builder) { + return bubbleDownCastsPassthroughOpImpl(*this, builder, getMemrefMutable()); +} + //===----------------------------------------------------------------------===// // CastOp //===----------------------------------------------------------------------===// @@ -710,6 +774,11 @@ OpFoldResult CastOp::fold(FoldAdaptor adaptor) { return succeeded(foldMemRefCast(*this)) ? getResult() : Value(); } +FailureOr>> +CastOp::bubbleDownCasts(OpBuilder &builder) { + return bubbleDownCastsPassthroughOpImpl(*this, builder, getSourceMutable()); +} + //===----------------------------------------------------------------------===// // CopyOp //===----------------------------------------------------------------------===// @@ -1601,6 +1670,12 @@ OpFoldResult LoadOp::fold(FoldAdaptor adaptor) { return OpFoldResult(); } +FailureOr>> +LoadOp::bubbleDownCasts(OpBuilder &builder) { + return mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(getMemrefMutable(), + getResult()); +} + //===----------------------------------------------------------------------===// // MemorySpaceCastOp //===----------------------------------------------------------------------===// @@ -1645,6 +1720,32 @@ OpFoldResult MemorySpaceCastOp::fold(FoldAdaptor adaptor) { return Value{}; } +TypedValue MemorySpaceCastOp::getSourcePtr() { + return cast>(getSource()); +} + +TypedValue MemorySpaceCastOp::getTargetPtr() { + return cast>(getDest()); +} + +bool MemorySpaceCastOp::isValidMemorySpaceCast(PtrLikeTypeInterface tgt, + PtrLikeTypeInterface src) { + return isa(tgt) && + tgt.clonePtrWith(src.getMemorySpace(), std::nullopt) == src; +} + +MemorySpaceCastOpInterface MemorySpaceCastOp::cloneMemorySpaceCastOp( + OpBuilder &b, PtrLikeTypeInterface tgt, + TypedValue src) { + assert(isValidMemorySpaceCast(tgt, src.getType()) && "invalid arguments"); + return MemorySpaceCastOp::create(b, getLoc(), tgt, src); +} + +/// The only cast we recognize as promotable is to the generic space. 
+bool MemorySpaceCastOp::isSourcePromotable() { + return getDest().getType().getMemorySpace() == nullptr; +} + //===----------------------------------------------------------------------===// // PrefetchOp //===----------------------------------------------------------------------===// @@ -2041,6 +2142,11 @@ void ReinterpretCastOp::getCanonicalizationPatterns(RewritePatternSet &results, results.add(context); } +FailureOr>> +ReinterpretCastOp::bubbleDownCasts(OpBuilder &builder) { + return bubbleDownCastsPassthroughOpImpl(*this, builder, getSourceMutable()); +} + //===----------------------------------------------------------------------===// // Reassociative reshape ops //===----------------------------------------------------------------------===// @@ -2348,6 +2454,11 @@ void ExpandShapeOp::getCanonicalizationPatterns(RewritePatternSet &results, ComposeExpandOfCollapseOp>(context); } +FailureOr>> +ExpandShapeOp::bubbleDownCasts(OpBuilder &builder) { + return bubbleDownCastsPassthroughOpImpl(*this, builder, getSrcMutable()); +} + /// Compute the layout map after collapsing a given source MemRef type with the /// specified reassociation indices. /// @@ -2569,6 +2680,11 @@ OpFoldResult CollapseShapeOp::fold(FoldAdaptor adaptor) { adaptor.getOperands()); } +FailureOr>> +CollapseShapeOp::bubbleDownCasts(OpBuilder &builder) { + return bubbleDownCastsPassthroughOpImpl(*this, builder, getSrcMutable()); +} + //===----------------------------------------------------------------------===// // ReshapeOp //===----------------------------------------------------------------------===// @@ -2609,6 +2725,11 @@ LogicalResult ReshapeOp::verify() { return success(); } +FailureOr>> +ReshapeOp::bubbleDownCasts(OpBuilder &builder) { + return bubbleDownCastsPassthroughOpImpl(*this, builder, getSourceMutable()); +} + //===----------------------------------------------------------------------===// // StoreOp //===----------------------------------------------------------------------===// @@ -2626,6 +2747,12 @@ LogicalResult StoreOp::fold(FoldAdaptor adaptor, return foldMemRefCast(*this, getValueToStore()); } +FailureOr>> +StoreOp::bubbleDownCasts(OpBuilder &builder) { + return mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(getMemrefMutable(), + ValueRange()); +} + //===----------------------------------------------------------------------===// // SubViewOp //===----------------------------------------------------------------------===// @@ -3282,6 +3409,11 @@ OpFoldResult SubViewOp::fold(FoldAdaptor adaptor) { return {}; } +FailureOr>> +SubViewOp::bubbleDownCasts(OpBuilder &builder) { + return bubbleDownCastsPassthroughOpImpl(*this, builder, getSourceMutable()); +} + //===----------------------------------------------------------------------===// // TransposeOp //===----------------------------------------------------------------------===// @@ -3382,6 +3514,11 @@ OpFoldResult TransposeOp::fold(FoldAdaptor) { return {}; } +FailureOr>> +TransposeOp::bubbleDownCasts(OpBuilder &builder) { + return bubbleDownCastsPassthroughOpImpl(*this, builder, getInMutable()); +} + //===----------------------------------------------------------------------===// // ViewOp //===----------------------------------------------------------------------===// @@ -3525,6 +3662,11 @@ void ViewOp::getCanonicalizationPatterns(RewritePatternSet &results, results.add(context); } +FailureOr>> +ViewOp::bubbleDownCasts(OpBuilder &builder) { + return bubbleDownCastsPassthroughOpImpl(*this, builder, getSourceMutable()); +} + 
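// The `bubbleDownCasts` implementations added above all follow one pattern:
// when a view-like memref op consumes the result of a promotable
// memref.memory_space_cast (a cast into the generic space, see
// `isSourcePromotable`), the op is recreated on the pre-cast value and the
// cast is re-emitted on the op's result. A minimal sketch of the intended
// rewrite, using hypothetical types and offsets that are not taken from this
// change:
//
//   %c = memref.memory_space_cast %src : memref<8x16xf32, 1> to memref<8x16xf32>
//   %v = memref.subview %c[0, 0] [4, 8] [1, 1]
//        : memref<8x16xf32> to memref<4x8xf32, strided<[16, 1]>>
//
// becomes, conceptually,
//
//   %v0 = memref.subview %src[0, 0] [4, 8] [1, 1]
//        : memref<8x16xf32, 1> to memref<4x8xf32, strided<[16, 1]>, 1>
//   %v  = memref.memory_space_cast %v0
//        : memref<4x8xf32, strided<[16, 1]>, 1> to memref<4x8xf32, strided<[16, 1]>>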
//===----------------------------------------------------------------------===// // AtomicRMWOp //===----------------------------------------------------------------------===// @@ -3570,6 +3712,12 @@ OpFoldResult AtomicRMWOp::fold(FoldAdaptor adaptor) { return OpFoldResult(); } +FailureOr>> +AtomicRMWOp::bubbleDownCasts(OpBuilder &builder) { + return mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(getMemrefMutable(), + getResult()); +} + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp index 3de9c3898c713..6200366cded29 100644 --- a/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp +++ b/mlir/lib/Dialect/MemRef/Utils/MemRefUtils.cpp @@ -191,7 +191,7 @@ computeSuffixProductIRBlock(Location loc, OpBuilder &builder, } MemrefValue skipFullyAliasingOperations(MemrefValue source) { - while (auto op = source.getDefiningOp()) { + while (auto *op = source.getDefiningOp()) { if (auto subViewOp = dyn_cast(op); subViewOp && subViewOp.hasZeroOffset() && subViewOp.hasUnitStride()) { // A `memref.subview` with an all zero offset, and all unit strides, still @@ -208,7 +208,7 @@ MemrefValue skipFullyAliasingOperations(MemrefValue source) { } MemrefValue skipViewLikeOps(MemrefValue source) { - while (auto op = source.getDefiningOp()) { + while (auto *op = source.getDefiningOp()) { if (auto viewLike = dyn_cast(op)) { if (source == viewLike.getViewDest()) { source = cast(viewLike.getViewSource()); diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 3d70e28ed23ab..f01ad05a778ec 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -2111,6 +2111,31 @@ Operation *TargetOp::getInnermostCapturedOmpOp() { }); } +/// Check if we can promote SPMD kernel to No-Loop kernel. +static bool canPromoteToNoLoop(Operation *capturedOp, TeamsOp teamsOp, + WsloopOp *wsLoopOp) { + // num_teams clause can break no-loop teams/threads assumption. + if (teamsOp.getNumTeamsUpper()) + return false; + + // Reduction kernels are slower in no-loop mode. + if (teamsOp.getNumReductionVars()) + return false; + if (wsLoopOp->getNumReductionVars()) + return false; + + // Check if the user allows the promotion of kernels to no-loop mode. + OffloadModuleInterface offloadMod = + capturedOp->getParentOfType(); + if (!offloadMod) + return false; + auto ompFlags = offloadMod.getFlags(); + if (!ompFlags) + return false; + return ompFlags.getAssumeTeamsOversubscription() && + ompFlags.getAssumeThreadsOversubscription(); +} + TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) { // A non-null captured op is only valid if it resides inside of a TargetOp // and is the result of calling getInnermostCapturedOmpOp() on it. @@ -2139,7 +2164,8 @@ TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) { // Detect target-teams-distribute-parallel-wsloop[-simd]. 
if (numWrappers == 2) { - if (!isa(innermostWrapper)) + WsloopOp *wsloopOp = dyn_cast(innermostWrapper); + if (!wsloopOp) return TargetRegionFlags::generic; innermostWrapper = std::next(innermostWrapper); @@ -2150,12 +2176,17 @@ TargetRegionFlags TargetOp::getKernelExecFlags(Operation *capturedOp) { if (!isa_and_present(parallelOp)) return TargetRegionFlags::generic; - Operation *teamsOp = parallelOp->getParentOp(); - if (!isa_and_present(teamsOp)) + TeamsOp teamsOp = dyn_cast(parallelOp->getParentOp()); + if (!teamsOp) return TargetRegionFlags::generic; - if (teamsOp->getParentOp() == targetOp.getOperation()) - return TargetRegionFlags::spmd | TargetRegionFlags::trip_count; + if (teamsOp->getParentOp() == targetOp.getOperation()) { + TargetRegionFlags result = + TargetRegionFlags::spmd | TargetRegionFlags::trip_count; + if (canPromoteToNoLoop(capturedOp, teamsOp, wsloopOp)) + result = result | TargetRegionFlags::no_loop; + return result; + } } // Detect target-teams-distribute[-simd] and target-teams-loop. else if (isa(innermostWrapper)) { diff --git a/mlir/lib/Dialect/Ptr/IR/PtrDialect.cpp b/mlir/lib/Dialect/Ptr/IR/PtrDialect.cpp index f0209af8a1ca3..51f25f755a8a6 100644 --- a/mlir/lib/Dialect/Ptr/IR/PtrDialect.cpp +++ b/mlir/lib/Dialect/Ptr/IR/PtrDialect.cpp @@ -15,6 +15,7 @@ #include "mlir/IR/Matchers.h" #include "mlir/Interfaces/DataLayoutInterfaces.h" #include "mlir/Transforms/InliningUtils.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/TypeSwitch.h" using namespace mlir; @@ -391,6 +392,39 @@ LogicalResult PtrAddOp::inferReturnTypes( return success(); } +//===----------------------------------------------------------------------===// +// PtrDiffOp +//===----------------------------------------------------------------------===// + +LogicalResult PtrDiffOp::verify() { + // If the operands are not shaped early exit. + if (!isa(getLhs().getType())) + return success(); + + // Just check the container type matches, `SameOperandsAndResultShape` handles + // the actual shape. + if (getResult().getType().getTypeID() != getLhs().getType().getTypeID()) { + return emitError() << "expected the result to have the same container " + "type as the operands when operands are shaped"; + } + + return success(); +} + +ptr::PtrType PtrDiffOp::getPtrType() { + Type lhsType = getLhs().getType(); + if (auto shapedType = dyn_cast(lhsType)) + return cast(shapedType.getElementType()); + return cast(lhsType); +} + +Type PtrDiffOp::getIntType() { + Type resultType = getResult().getType(); + if (auto shapedType = dyn_cast(resultType)) + return shapedType.getElementType(); + return resultType; +} + //===----------------------------------------------------------------------===// // ToPtrOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp index fb179e64d8e7b..47c99642b9c37 100644 --- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp @@ -188,8 +188,8 @@ struct ExecuteRegionOpInterface TypeRange newResultTypes(yieldOp.getResults()); // Create new op and move over region. 
- auto newOp = - scf::ExecuteRegionOp::create(rewriter, op->getLoc(), newResultTypes); + auto newOp = scf::ExecuteRegionOp::create( + rewriter, op->getLoc(), newResultTypes, executeRegionOp.getNoInline()); newOp.getRegion().takeBody(executeRegionOp.getRegion()); // Bufferize every block. diff --git a/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp b/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp index b0c781c7aff11..9468927021495 100644 --- a/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/StructuralTypeConversions.cpp @@ -185,6 +185,30 @@ class ConvertWhileOpTypes }; } // namespace +namespace { +class ConvertIndexSwitchOpTypes + : public Structural1ToNConversionPattern { +public: + using Structural1ToNConversionPattern::Structural1ToNConversionPattern; + + std::optional + convertSourceOp(IndexSwitchOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter, + TypeRange dstTypes) const { + auto newOp = + IndexSwitchOp::create(rewriter, op.getLoc(), dstTypes, op.getArg(), + op.getCases(), op.getNumCases()); + + for (unsigned i = 0u; i < op.getNumRegions(); i++) { + auto &dstRegion = newOp.getRegion(i); + rewriter.inlineRegionBefore(op.getRegion(i), dstRegion, dstRegion.end()); + } + return newOp; + } +}; +} // namespace + namespace { // When the result types of a ForOp/IfOp get changed, the operand types of the // corresponding yield op need to be changed. In order to trigger the @@ -220,18 +244,19 @@ void mlir::scf::populateSCFStructuralTypeConversions( const TypeConverter &typeConverter, RewritePatternSet &patterns, PatternBenefit benefit) { patterns.add( - typeConverter, patterns.getContext(), benefit); + ConvertWhileOpTypes, ConvertConditionOpTypes, + ConvertIndexSwitchOpTypes>(typeConverter, patterns.getContext(), + benefit); } void mlir::scf::populateSCFStructuralTypeConversionTarget( const TypeConverter &typeConverter, ConversionTarget &target) { - target.addDynamicallyLegalOp( + target.addDynamicallyLegalOp( [&](Operation *op) { return typeConverter.isLegal(op->getResults()); }); target.addDynamicallyLegalOp([&](scf::YieldOp op) { // We only have conversions for a subset of ops that use scf.yield // terminators. 
- if (!isa(op->getParentOp())) + if (!isa(op->getParentOp())) return true; return typeConverter.isLegal(op.getOperands()); }); diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 834c02126fa53..89e2c57d709dd 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -155,18 +155,18 @@ getUserTileSizesAndNumThreads(RewriterBase &rewriter, TilingInterface op, static LogicalResult checkTileSizes(TilingInterface op, scf::SCFTilingOptions::LoopType loopType, ReductionTilingStrategy reductionStrategy, - ArrayRef tileSizes, + ArrayRef givenTileSizes, ArrayRef numThreads) { auto iterators = op.getLoopIteratorTypes(); - assert(iterators.size() == tileSizes.size() && + assert(iterators.size() == givenTileSizes.size() && "expected as many tile size values as number of loops"); assert((numThreads.empty() || (numThreads.size() == iterators.size())) && "when specified, expected number of threads to use for each loop"); bool isParallelTiling = false; - for (auto [index, iterator, tileSize] : - llvm::enumerate(iterators, tileSizes)) { - if (!isConstantIntValue(tileSize, 0)) { + for (auto [index, iterator, givenTileSize] : + llvm::enumerate(iterators, givenTileSizes)) { + if (!isConstantIntValue(givenTileSize, 0)) { isParallelTiling |= iterator == utils::IteratorType::parallel; } @@ -186,7 +186,7 @@ static LogicalResult checkTileSizes(TilingInterface op, } if (std::optional constTileSize = - getConstantIntValue(tileSize)) { + getConstantIntValue(givenTileSize)) { if (constTileSize.value() > 0 && iterator != utils::IteratorType::parallel) { op.emitWarning() << "tiling is not thread safe at axis #" << index; @@ -207,11 +207,11 @@ static LogicalResult checkTileSizes(TilingInterface op, /// Get the reduction dims that are tiled. This accounts for reduction dims /// that are specified as tiled, but the tile size is 0. static SetVector -getSanitizedReductionDims(ArrayRef tileSizes, +getSanitizedReductionDims(ArrayRef givenTileSizes, const scf::SCFTilingOptions &options) { SetVector reductionDims; for (auto dim : options.reductionDims) { - if (isConstantIntValue(tileSizes[dim], 0)) + if (isConstantIntValue(givenTileSizes[dim], 0)) continue; reductionDims.insert(dim); } @@ -236,14 +236,14 @@ static bool tileDividesIterationDomain(Range loopRange) { /// `tileSize`, i.e., `min(tileSize, range.end() - offset)`. 
static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc, Range loopRange, OpFoldResult offset, - OpFoldResult tileSize) { - std::optional ts = getConstantIntValue(tileSize); + OpFoldResult givenTileSize) { + std::optional ts = getConstantIntValue(givenTileSize); if (ts && ts.value() == 1) - return tileSize; + return givenTileSize; if (tileDividesIterationDomain( - Range{loopRange.offset, loopRange.size, tileSize})) - return tileSize; + Range{loopRange.offset, loopRange.size, givenTileSize})) + return givenTileSize; // The tile size to use (to avoid out of bounds access) is minimum of // `tileSize` and `ub - iv`, where `iv` is the induction variable of the tiled @@ -254,15 +254,15 @@ static OpFoldResult getBoundedTileSize(OpBuilder &b, Location loc, AffineMap minMap = AffineMap::get(1, 2, {s0 - d0, s1}, b.getContext()); Value size = getValueOrCreateConstantIndexOp(b, loc, loopRange.size); return affine::makeComposedFoldedAffineMin( - b, loc, minMap, SmallVector{offset, size, tileSize}); + b, loc, minMap, SmallVector{offset, size, givenTileSize}); } /// Returns true if the maximum tile offset `tileSize * numThreads-1` is less /// than `iterationSize`. -static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize, +static bool canOmitTileOffsetInBoundsCheck(OpFoldResult givenTileSize, OpFoldResult numThreads, OpFoldResult iterationSize) { - std::optional tileSizeConst = getConstantIntValue(tileSize); + std::optional tileSizeConst = getConstantIntValue(givenTileSize); std::optional numThreadsConst = getConstantIntValue(numThreads); std::optional iterSizeConst = getConstantIntValue(iterationSize); if (!tileSizeConst || !numThreadsConst || !iterSizeConst) @@ -274,114 +274,51 @@ static bool canOmitTileOffsetInBoundsCheck(OpFoldResult tileSize, /// `offset`s and `size`s of the tile of the iteration space that the /// innermost loop body of the generated tiled loops corresponds to. static std::tuple, SmallVector> -getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, - ReductionTilingStrategy strategy, ValueRange ivs, +getTileOffsetAndSizes(RewriterBase &rewriter, Location loc, ValueRange ivs, ArrayRef iterationDomain, - ArrayRef tileSizes, - ArrayRef numThreads, - const llvm::SetVector &reductionDims) { + ArrayRef givenTileSizes) { SmallVector offsets, sizes; int materializedLoopNum = 0; - - if (!numThreads.empty()) { - AffineExpr d0, d1, s0, s1; - AffineExpr offsetExpr, residualTileSizeExpr; - bindDims(rewriter.getContext(), d0, d1); - bindSymbols(rewriter.getContext(), s0, s1); - offsetExpr = d0 + d1 * s0; - residualTileSizeExpr = s1 - (d0 + d1 * s0); - - for (auto [index, nt, tileSize, loopRange] : - llvm::enumerate(numThreads, tileSizes, iterationDomain)) { - - // Non-tiled cases, set the offset and size to the - // `loopRange.offset/size`. 
- if (isZeroInteger(nt)) { - offsets.push_back(loopRange.offset); - sizes.push_back(loopRange.size); - continue; - } - - Value iv = ivs[materializedLoopNum++]; - OpFoldResult offset = affine::makeComposedFoldedAffineApply( - rewriter, loc, offsetExpr, - ArrayRef{loopRange.offset, iv, tileSize}); - OpFoldResult residualTileSize = affine::makeComposedFoldedAffineApply( - rewriter, loc, residualTileSizeExpr, - {loopRange.offset, nt, tileSize, loopRange.size}); - - OpFoldResult size = tileSize; - if (!isZeroInteger(residualTileSize)) { - OpFoldResult sizeMinusOffsetPerThread = - affine::makeComposedFoldedAffineApply(rewriter, loc, s0 - d0, - {offset, loopRange.size}); - size = affine::makeComposedFoldedAffineMin( - rewriter, loc, - AffineMap::getMultiDimIdentityMap(2, rewriter.getContext()), - {sizeMinusOffsetPerThread, tileSize}); - } - - // Consider the case where the original loop was `[0, 100)`. - // If number of threads are `7`, the tile size would be computed as - // `ceilDiv(100, 7) = 15`. For the last thread (thread_id = 6) - // - `offset = 0 + 6 * 15 = 105` - // - `tileSize = min(15, 100 - 105) = -5` - // To avoid negative tile sizes, we need to do a further - // `nonNegativeTileSize = affine.max(0, tileSize)`. - // This `max` can be avoided if - // `offset + tileSize * (numThreads - 1) < (ub - lb)` - if (!canOmitTileOffsetInBoundsCheck(tileSize, nt, loopRange.size)) { - AffineMap maxMap = - AffineMap::getMultiDimIdentityMap(2, rewriter.getContext()); - size = affine::makeComposedFoldedAffineMax( - rewriter, loc, maxMap, {rewriter.getIndexAttr(0), size}); - } - - offsets.push_back(offset); - sizes.push_back(size); + for (auto [givenTileSize, loopRange] : + llvm::zip_equal(givenTileSizes, iterationDomain)) { + + // Non-tiled cases, set the offset and size to the + // `loopRange.offset/size`. + if (isZeroInteger(givenTileSize)) { + offsets.push_back(loopRange.offset); + sizes.push_back(loopRange.size); + continue; } - return {offsets, sizes}; - } else { - for (auto [tileSize, loopRange] : - llvm::zip_equal(tileSizes, iterationDomain)) { - - // Non-tiled cases, set the offset and size to the - // `loopRange.offset/size`. - if (isZeroInteger(tileSize)) { - offsets.push_back(loopRange.offset); - sizes.push_back(loopRange.size); - continue; - } - Value iv = ivs[materializedLoopNum++]; - OpFoldResult offset = getAsOpFoldResult(iv); - offsets.push_back(offset); - OpFoldResult size = - getBoundedTileSize(rewriter, loc, loopRange, offset, tileSize); - sizes.push_back(size); - } - return {offsets, sizes}; + Value iv = ivs[materializedLoopNum++]; + OpFoldResult offset = getAsOpFoldResult(iv); + offsets.push_back(offset); + OpFoldResult size = + getBoundedTileSize(rewriter, loc, loopRange, offset, givenTileSize); + sizes.push_back(size); } + return {offsets, sizes}; } /// Function to return the bounds of the loops to be generated. static std::tuple, SmallVector, SmallVector> getLoopBounds(RewriterBase &rewriter, Location loc, ArrayRef loopRanges, - ArrayRef tileSizes) { + ArrayRef givenTileSizes) { SmallVector lbs, ubs, steps; - for (auto [loopRange, tileSize] : llvm::zip_equal(loopRanges, tileSizes)) { + for (auto [loopRange, givenTileSize] : + llvm::zip_equal(loopRanges, givenTileSizes)) { // No loop if the tile size is 0. 
- if (isZeroInteger(tileSize)) + if (isZeroInteger(givenTileSize)) continue; lbs.push_back(loopRange.offset); ubs.push_back(loopRange.size); - steps.push_back(tileSize); + steps.push_back(givenTileSize); } return {lbs, ubs, steps}; } -/// A function that allows returning additional yielded values during +/// Typedef for function that allows returning additional yielded values during /// `yieldTiledValuesAndReplace`. /// - `ivs` induction variable for the loop. /// - `newBbArgs` basic block arguments corresponding to newly added iter_args. @@ -402,6 +339,30 @@ using YieldTiledValuesFn = std::function> &resultOffsets, SmallVector> &resultSizes)>; +/// Typedef for function that implements the body of a tiled loop. +/// - `ivs` induction variable for the loop. +/// - `tileOffsets` represents offsets for the tiled iteration space. +/// - `tileSizes` represents the sizes for the tiled iteration space. +/// - `outerDestinationTensors` tensor that holds the result. Is same size +/// as the destination operands of the original operations. +/// - `tiledResults` results of the tiled computation, corresponds to +/// tiles of the original operation computed by the loop body. +/// Should be same size as the `destinationTensors` +/// - `resultOffsets` is of the same size as `tiledResults` and represents +/// the offset to use when writing the corresponding element from +/// `tiledResults` into `destinationTensors`. +/// - `resultSizes` is of the same size as `tiledResults` and represents +/// the size to use when writing the corresponding element from +/// `tiledResults` into `destinationTensors`. +/// In case the method needs to return `failure()` the method is expected +/// to clean up any inserted operations. +using GenerateTiledBodyFn = std::function tileOffsets, ArrayRef tileSizes, + ValueRange outerDestinationTensors, SmallVector &tiledResults, + SmallVector> &resultOffsets, + SmallVector> &resultSizes)>; + /// Clones the operation and updates the destination if the operation /// implements the `DestinationStyleOpInterface`. static Operation *cloneOpAndUpdateDestinationArgs(RewriterBase &rewriter, @@ -417,26 +378,25 @@ static Operation *cloneOpAndUpdateDestinationArgs(RewriterBase &rewriter, /// Generate the tile-loop nest using `scf.for` operation. /// - `loopRanges` specifies the lb, ub and step of the untiled iteration space. -/// - `tileSizes` is the tile sizes to use. Zero represent untiled loops. -/// - `destinationTensors` are the init values to use for the outer most loop. -/// - `yieldTiledValuesFn` is called to generated the loop body of the inner +/// - `givenTileSizes` is the tile sizes to use. Zero represent untiled loops. +/// - `outerDestinationTensors` are the init values to use for the outer most +/// loop. +/// - `tiledBodyFn` is called to generate the loop body of the inner /// most /// loop. -/// - `loops` is an in-out parameter into which the generated loops are -/// populated. -static LogicalResult generateLoopNestUsingForOp( +/// Returns the generated `scf.for` loops on success.
+static FailureOr> generateLoopNestUsingForOp( RewriterBase &rewriter, Location loc, ArrayRef loopRanges, - ArrayRef tileSizes, ValueRange destinationTensors, - YieldTiledValuesFn yieldTiledValuesFn, - SmallVector &loops) { + ArrayRef givenTileSizes, ValueRange outerDestinationTensors, + GenerateTiledBodyFn tiledBodyFn) { assert(!loopRanges.empty() && "unexpected empty loop ranges"); - assert(loopRanges.size() == tileSizes.size() && + assert(loopRanges.size() == givenTileSizes.size() && "expected as many tile sizes as loop ranges"); OpBuilder::InsertionGuard guard(rewriter); SmallVector lbs, ubs, steps; std::tie(lbs, ubs, steps) = - getLoopBounds(rewriter, loc, loopRanges, tileSizes); + getLoopBounds(rewriter, loc, loopRanges, givenTileSizes); SmallVector lbVals = getValueOrCreateConstantIndexOp(rewriter, loc, lbs); SmallVector ubVals = @@ -445,34 +405,44 @@ static LogicalResult generateLoopNestUsingForOp( getValueOrCreateConstantIndexOp(rewriter, loc, steps); SmallVector ivs; + SmallVector loops; + ValueRange innerDestinationTensors(outerDestinationTensors); for (auto [lb, ub, step] : llvm::zip_equal(lbVals, ubVals, stepVals)) { auto loop = - scf::ForOp::create(rewriter, loc, lb, ub, step, destinationTensors, + scf::ForOp::create(rewriter, loc, lb, ub, step, innerDestinationTensors, [](OpBuilder &bodyBuilder, Location bodyLoc, Value iv, ValueRange /*iterArgs*/) {}); loops.push_back(loop); ivs.push_back(loop.getInductionVar()); rewriter.setInsertionPointToEnd(loop.getBody()); - destinationTensors = loop.getRegionIterArgs(); + innerDestinationTensors = loop.getRegionIterArgs(); } + if (loops.empty()) + return success(); + + // Compute the `offsets` and `sizes` to use for tiling. + SmallVector offsets, sizes; + std::tie(offsets, sizes) = + getTileOffsetAndSizes(rewriter, loc, ivs, loopRanges, givenTileSizes); SmallVector tiledResults; SmallVector> resultOffsets, resultSizes; - if (failed(yieldTiledValuesFn(rewriter, loc, ivs, destinationTensors, - tiledResults, resultOffsets, resultSizes))) { + if (failed(tiledBodyFn(rewriter, loc, ivs, offsets, sizes, + innerDestinationTensors, tiledResults, resultOffsets, + resultSizes))) { return rewriter.notifyMatchFailure( loc, "failed to generate inner tile loop body"); } if (loops.empty()) - return success(); + return loops; - assert(tiledResults.size() == destinationTensors.size() && + assert(tiledResults.size() == innerDestinationTensors.size() && "Number of results of body should be equal to number of iter args"); // 6. Yield all the results of the tiled operation. SmallVector yieldedValues; for (auto [tiledValue, destinationTensor, resultOffset, resultSize] : - llvm::zip_equal(tiledResults, destinationTensors, resultOffsets, + llvm::zip_equal(tiledResults, innerDestinationTensors, resultOffsets, resultSizes)) { SmallVector resultStride(resultOffset.size(), rewriter.getIndexAttr(1)); @@ -491,27 +461,108 @@ static LogicalResult generateLoopNestUsingForOp( cast(outerLoop.getOperation()).getBody()); scf::YieldOp::create(rewriter, outerLoop.getLoc(), innerLoop->getResults()); } - return success(); + return loops; +} + +/// Compute the `OpFoldResult`s that represents the multi-dimensional +/// `offset`s and `size`s of the tile of the iteration space that the +/// innermost loop body of the generated tiled loops corresponds to +/// when tiling using `forall` op. This is handle separately due to +/// the special case handling needed for when the tiling is done by +/// specifying number of threads. 
+static std::tuple, SmallVector> +getTileOffsetAndSizesWithForAllOp(RewriterBase &rewriter, Location loc, + ValueRange ivs, + ArrayRef iterationDomain, + ArrayRef givenTileSizes, + ArrayRef numThreads) { + if (numThreads.empty()) { + return getTileOffsetAndSizes(rewriter, loc, ivs, iterationDomain, + givenTileSizes); + } + + SmallVector offsets, sizes; + int materializedLoopNum = 0; + + AffineExpr d0, d1, s0, s1; + AffineExpr offsetExpr, residualTileSizeExpr; + bindDims(rewriter.getContext(), d0, d1); + bindSymbols(rewriter.getContext(), s0, s1); + offsetExpr = d0 + d1 * s0; + residualTileSizeExpr = s1 - (d0 + d1 * s0); + + for (auto [index, nt, givenTileSize, loopRange] : + llvm::enumerate(numThreads, givenTileSizes, iterationDomain)) { + + // Non-tiled cases, set the offset and size to the + // `loopRange.offset/size`. + if (isZeroInteger(nt)) { + offsets.push_back(loopRange.offset); + sizes.push_back(loopRange.size); + continue; + } + + Value iv = ivs[materializedLoopNum++]; + OpFoldResult offset = affine::makeComposedFoldedAffineApply( + rewriter, loc, offsetExpr, + ArrayRef{loopRange.offset, iv, givenTileSize}); + OpFoldResult residualTileSize = affine::makeComposedFoldedAffineApply( + rewriter, loc, residualTileSizeExpr, + {loopRange.offset, nt, givenTileSize, loopRange.size}); + + OpFoldResult size = givenTileSize; + if (!isZeroInteger(residualTileSize)) { + OpFoldResult sizeMinusOffsetPerThread = + affine::makeComposedFoldedAffineApply(rewriter, loc, s0 - d0, + {offset, loopRange.size}); + size = affine::makeComposedFoldedAffineMin( + rewriter, loc, + AffineMap::getMultiDimIdentityMap(2, rewriter.getContext()), + {sizeMinusOffsetPerThread, givenTileSize}); + } + + // Consider the case where the original loop was `[0, 100)`. + // If number of threads are `7`, the tile size would be computed as + // `ceilDiv(100, 7) = 15`. For the last thread (thread_id = 6) + // - `offset = 0 + 6 * 15 = 105` + // - `tileSize = min(15, 100 - 105) = -5` + // To avoid negative tile sizes, we need to do a further + // `nonNegativeTileSize = affine.max(0, tileSize)`. + // This `max` can be avoided if + // `offset + tileSize * (numThreads - 1) < (ub - lb)` + if (!canOmitTileOffsetInBoundsCheck(givenTileSize, nt, loopRange.size)) { + AffineMap maxMap = + AffineMap::getMultiDimIdentityMap(2, rewriter.getContext()); + size = affine::makeComposedFoldedAffineMax( + rewriter, loc, maxMap, {rewriter.getIndexAttr(0), size}); + } + + offsets.push_back(offset); + sizes.push_back(size); + } + return {offsets, sizes}; } /// Generate the tile-loop nest using `scf.forall` operation. /// - `loopRanges` specifies the lb, ub and step of the untiled iteration space. -/// - `tileSizes` is the tile sizes to use. Zero represent untiled loops. -/// - `destinationTensors` are the init values to use for the outer most loop. +/// - `givenTileSizes` is the tile sizes to use. Zero represent untiled loops. +/// - `outerDestinationTensors` are the init values to use for the loop. /// - `mappingVector` is the mapping attributes to use for loop construction. /// Can be empty. -/// - `yieldTiledValuesFn` is called to generated the loop body of the inner +/// - `tiledBodyFn` is called to generate the loop body of the inner /// most /// loop. -/// - `loops` is an in-out parameter into which the generated loops are -/// populated.
-static LogicalResult generateLoopNestUsingForallOp( - RewriterBase &rewriter, Location loc, ArrayRef loopRanges, - ArrayRef tileSizes, ArrayRef numThreads, - ArrayRef mappingVector, ValueRange destinationTensors, - YieldTiledValuesFn tiledBodyFn, SmallVector &loops) { +/// Returns the generated `scf.forall` loop on success. +static FailureOr> +generateLoopNestUsingForallOp(RewriterBase &rewriter, Location loc, + ArrayRef loopRanges, + ArrayRef givenTileSizes, + ArrayRef numThreads, + ArrayRef mappingVector, + ValueRange outerDestinationTensors, + GenerateTiledBodyFn tiledBodyFn) { assert(!loopRanges.empty() && "unexpected empty loop ranges"); - assert(loopRanges.size() == tileSizes.size() && + assert(loopRanges.size() == givenTileSizes.size() && "expected as many tile sizes as loop ranges"); OpBuilder::InsertionGuard guard(rewriter); @@ -522,6 +573,7 @@ static LogicalResult generateLoopNestUsingForallOp( scf::ForallOp forallOp; bool useNumThreads = !numThreads.empty(); + SmallVector loops; if (useNumThreads) { // Prune the zero numthreads. SmallVector nonZeroNumThreads; @@ -531,29 +583,35 @@ static LogicalResult generateLoopNestUsingForallOp( nonZeroNumThreads.push_back(nt); } forallOp = scf::ForallOp::create(rewriter, loc, nonZeroNumThreads, - destinationTensors, mappingAttr); + outerDestinationTensors, mappingAttr); } else { SmallVector lbs, ubs, steps; std::tie(lbs, ubs, steps) = - getLoopBounds(rewriter, loc, loopRanges, tileSizes); + getLoopBounds(rewriter, loc, loopRanges, givenTileSizes); forallOp = scf::ForallOp::create(rewriter, loc, lbs, ubs, steps, - destinationTensors, mappingAttr); + outerDestinationTensors, mappingAttr); } loops.push_back(forallOp); rewriter.setInsertionPoint(forallOp.getTerminator()); - destinationTensors = forallOp.getRegionOutArgs(); + ValueRange innerDestinationTensors = forallOp.getRegionOutArgs(); + SmallVector ivs = forallOp.getInductionVars(); + + // Compute the `offsets` and `sizes` to use for tiling. + SmallVector offsets, sizes; + std::tie(offsets, sizes) = getTileOffsetAndSizesWithForAllOp( + rewriter, loc, ivs, loopRanges, givenTileSizes, numThreads); SmallVector tiledResults; SmallVector> resultOffsets, resultSizes; - if (failed(tiledBodyFn(rewriter, loc, forallOp.getInductionVars(), - destinationTensors, tiledResults, resultOffsets, + if (failed(tiledBodyFn(rewriter, loc, ivs, offsets, sizes, + innerDestinationTensors, tiledResults, resultOffsets, resultSizes))) return rewriter.notifyMatchFailure(loc, "failed to generate loop body"); rewriter.setInsertionPointToEnd(forallOp.getTerminator().getBody()); for (auto [tiledValue, destinationTensor, resultOffset, resultSize] : - llvm::zip_equal(tiledResults, destinationTensors, resultOffsets, + llvm::zip_equal(tiledResults, innerDestinationTensors, resultOffsets, resultSizes)) { SmallVector resultStride(resultOffset.size(), rewriter.getIndexAttr(1)); @@ -562,41 +620,105 @@ static LogicalResult generateLoopNestUsingForallOp( destinationTensor, resultOffset, resultSize, resultStride); } - return success(); + return loops; +} + +/// Generate the tile-loop nest using custom loop operation. +/// - `loopRanges` specifies the lb, ub and step of the untiled iteration space. +/// - `tileSizes` is the tile sizes to use. Zero represent untiled loops. +/// - `destinationTensors` are the init values to use for the outer most loop. +/// - `mappingVector` is the mapping attributes to use for loop construction. +/// Can be empty. 
+/// - `tiledBodyFn` is called to generated the loop body of the inner +/// most +/// loop. +/// Returns the generated `scf.forall` loop on success. +static FailureOr> +generateLoopNestUsingCustomOp( + RewriterBase &rewriter, Location loc, ArrayRef loopRanges, + ArrayRef givenTileSizes, ValueRange outerDestinationTensors, + const scf::SCFTilingOptions::GenerateLoopHeaderFn &generateLoopHeaderFn, + const scf::SCFTilingOptions::GenerateLoopTerminatorFn + &generateLoopTerminatorFn, + GenerateTiledBodyFn tiledBodyFn) { + assert(!loopRanges.empty() && "unexpected empty loop ranges"); + assert(loopRanges.size() == givenTileSizes.size() && + "expected as many tile sizes as loop ranges"); + assert(generateLoopHeaderFn && generateLoopTerminatorFn && + "expected loop header/terminator generation function"); + OpBuilder::InsertionGuard guard(rewriter); + + FailureOr loopHeaderInfo = + generateLoopHeaderFn(rewriter, loc, loopRanges, givenTileSizes, + outerDestinationTensors); + if (failed(loopHeaderInfo)) { + return failure(); + } + + SmallVector ivs; + SmallVector tiledResults; + SmallVector> resultOffsets, resultSizes; + if (failed(tiledBodyFn(rewriter, loc, ivs, loopHeaderInfo->tileOffset, + loopHeaderInfo->tileSizes, + loopHeaderInfo->destinationTensors, tiledResults, + resultOffsets, resultSizes))) { + return failure(); + } + + if (failed(generateLoopTerminatorFn(rewriter, loc, tiledResults, + resultOffsets, resultSizes, + loopHeaderInfo->destinationTensors))) { + return failure(); + } + + return loopHeaderInfo->loops; } /// Generate the tile-loop nest using the loop construct specifed in `options`. /// - `options`: Tiling options specified. /// - `loopRanges` specifies the lb, ub and step of the untiled iteration space. /// - `tileSizes` is the tile sizes to use. Zero represent untiled loops. -/// - `destinationTensors` are the init values to use for the outer most loop. +/// - `outerDestinationTensors` are the init values to use for the outer most +/// loop. /// - `yieldTiledValuesFn` is called to generated the loop body of the inner /// most /// loop. -/// - `loops` is an in-out parameter into which the generated loops are -/// populated. -static LogicalResult generateLoopNest( - RewriterBase &rewriter, Location loc, - scf::SCFTilingOptions::LoopType loopType, ArrayRef loopRanges, - ArrayRef tileSizes, ArrayRef numThreads, - ValueRange destinationTensors, ArrayRef mappingVector, - YieldTiledValuesFn tiledBodyFn, SmallVector &loops) { +/// Returns the generated loops on success. +static FailureOr> generateLoopNest( + RewriterBase &rewriter, Location loc, const scf::SCFTilingOptions &options, + ArrayRef loopRanges, ArrayRef givenTileSizes, + ArrayRef numThreads, ValueRange destinationTensors, + GenerateTiledBodyFn tiledBodyFn) { // If the tile sizes are all zero, no loops are generated. Just call the // callback function to handle untiled case. 
- if (llvm::all_of(tileSizes, isZeroInteger)) { + if (llvm::all_of(givenTileSizes, isZeroInteger)) { SmallVector tiledResults; SmallVector> resultOffsets, resultSizes; - return tiledBodyFn(rewriter, loc, ValueRange{}, destinationTensors, - tiledResults, resultOffsets, resultSizes); + auto tileOffsets = + llvm::map_to_vector(loopRanges, [](Range r) { return r.offset; }); + auto tileSizes = + llvm::map_to_vector(loopRanges, [](Range r) { return r.size; }); + if (failed(tiledBodyFn(rewriter, loc, ValueRange{}, tileOffsets, tileSizes, + destinationTensors, tiledResults, resultOffsets, + resultSizes))) { + return failure(); + } + return SmallVector{}; } - if (loopType == scf::SCFTilingOptions::LoopType::ForOp) { - return generateLoopNestUsingForOp(rewriter, loc, loopRanges, tileSizes, - destinationTensors, tiledBodyFn, loops); + if (options.loopType == scf::SCFTilingOptions::LoopType::ForOp) { + return generateLoopNestUsingForOp(rewriter, loc, loopRanges, givenTileSizes, + destinationTensors, tiledBodyFn); } - if (loopType == scf::SCFTilingOptions::LoopType::ForallOp) { + if (options.loopType == scf::SCFTilingOptions::LoopType::ForallOp) { return generateLoopNestUsingForallOp( - rewriter, loc, loopRanges, tileSizes, numThreads, mappingVector, - destinationTensors, tiledBodyFn, loops); + rewriter, loc, loopRanges, givenTileSizes, numThreads, + options.mappingVector, destinationTensors, tiledBodyFn); + } + if (options.loopType == scf::SCFTilingOptions::LoopType::CustomOp) { + return generateLoopNestUsingCustomOp( + rewriter, loc, loopRanges, givenTileSizes, destinationTensors, + options.generateLoopHeaderFn, options.generateLoopTerminatorFn, + tiledBodyFn); } return rewriter.notifyMatchFailure(loc, "unhandled loop type"); } @@ -604,7 +726,7 @@ static LogicalResult generateLoopNest( static FailureOr> createInitialTensorsForTiling( RewriterBase &rewriter, TilingInterface op, ReductionTilingStrategy reductionStrategy, ArrayRef iterationDomain, - ArrayRef numThreads, ArrayRef tileSizes, + ArrayRef numThreads, ArrayRef givenTileSizes, const SetVector &reductionDims) { SmallVector initTensors; Location loc = op->getLoc(); @@ -626,7 +748,7 @@ static FailureOr> createInitialTensorsForTiling( AffineExpr sizeExpr = ((s0 - s1).ceilDiv(s2)); AffineExpr divExpr = s0.ceilDiv(s1); for (auto [index, domain, tileSize] : - llvm::enumerate(iterationDomain, tileSizes)) { + llvm::enumerate(iterationDomain, givenTileSizes)) { if (!numThreads.empty()) { // Untiled case. 
if (isConstantIntValue(numThreads[index], 0)) { @@ -672,7 +794,7 @@ static SmallVector getSplitReductionIvs(RewriterBase &rewriter, Location loc, ReductionTilingStrategy reductionStrategy, ValueRange ivs, ArrayRef numThreads, - ArrayRef tileSizes, + ArrayRef givenTileSizes, const SetVector &reductionDims) { SmallVector splitReductionIvs; splitReductionIvs.resize(reductionDims.size(), rewriter.getIndexAttr(0)); @@ -689,7 +811,7 @@ getSplitReductionIvs(RewriterBase &rewriter, Location loc, } splitReductionIvs[index] = affine::makeComposedFoldedAffineApply( rewriter, loc, divExpr, - ArrayRef{ivs[ivIndex++], tileSizes[reductionDim]}); + ArrayRef{ivs[ivIndex++], givenTileSizes[reductionDim]}); } } return splitReductionIvs; @@ -701,7 +823,7 @@ getTiledImplementation(RewriterBase &rewriter, TilingInterface op, ValueRange regionIterArg, ArrayRef offsets, ArrayRef sizes, ValueRange ivs, ArrayRef numThreads, - ArrayRef tileSizes, + ArrayRef givenTileSizes, const SetVector &reductionDims) { if (reductionStrategy == ReductionTilingStrategy::FullReduction) { return op.getTiledImplementation(rewriter, offsets, sizes); @@ -717,7 +839,7 @@ getTiledImplementation(RewriterBase &rewriter, TilingInterface op, SmallVector splitReductionIvs = getSplitReductionIvs(rewriter, op.getLoc(), reductionStrategy, ivs, - numThreads, tileSizes, reductionDims); + numThreads, givenTileSizes, reductionDims); return redOp.tileToPartialReduction(rewriter, op.getLoc(), reductionStrategy, regionIterArg, offsets, sizes, reductionDims, splitReductionIvs); @@ -728,7 +850,8 @@ static LogicalResult getResultTilePosition( int64_t index, Value tiledResult, TilingInterface op, ArrayRef offsets, ArrayRef sizes, ValueRange ivs, ArrayRef numThreads, - ArrayRef tileSizes, const SetVector &reductionDims, + ArrayRef givenTileSizes, + const SetVector &reductionDims, SmallVector &resultOffset, SmallVector &resultSize) { @@ -744,7 +867,7 @@ static LogicalResult getResultTilePosition( } SmallVector splitReductionIvs = getSplitReductionIvs(rewriter, op.getLoc(), reductionStrategy, ivs, - numThreads, tileSizes, reductionDims); + numThreads, givenTileSizes, reductionDims); return redOp.getPartialResultTilePosition( rewriter, index, reductionStrategy, offsets, sizes, reductionDims, splitReductionIvs, resultOffset, resultSize); @@ -999,20 +1122,20 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, SmallVector iterationDomain = op.getIterationDomain(rewriter); // 2. Materialize the tile sizes and/or number of threads; - SmallVector tileSizes, numThreads; - std::tie(tileSizes, numThreads) = + SmallVector givenTileSizes, numThreads; + std::tie(givenTileSizes, numThreads) = getUserTileSizesAndNumThreads(rewriter, op, iterationDomain, options); // Check if it is safe to tile. This is hold over from previous iterations // of tile to for-all. Consider dropping it. if (failed(checkTileSizes(op, options.loopType, options.reductionStrategy, - tileSizes, numThreads))) { + givenTileSizes, numThreads))) { return failure(); } // Get the reduction dims SetVector reductionDims = - getSanitizedReductionDims(tileSizes, options); + getSanitizedReductionDims(givenTileSizes, options); // 3. If there is an interchange specified, permute the iteration domain and // the tile sizes. 
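// The `CustomOp` loop type and the `generateLoopHeaderFn` /
// `generateLoopTerminatorFn` hooks introduced earlier in this file are meant
// to be supplied through `SCFTilingOptions`. A minimal sketch of a caller,
// assuming the callback parameter types mirror the call sites above (only the
// option and field names visible in this change are taken from it; everything
// else is illustrative):
//
//   scf::SCFTilingOptions options;
//   options.setLoopType(scf::SCFTilingOptions::LoopType::CustomOp);
//   // Builds the custom loop skeleton and reports the tile offsets/sizes,
//   // destination tensors and generated loops for the body to use.
//   options.generateLoopHeaderFn = [&](RewriterBase &rewriter, Location loc,
//                                      ArrayRef<Range> loopRanges,
//                                      ArrayRef<OpFoldResult> tileSizes,
//                                      ValueRange destinationTensors) { /*...*/ };
//   // Writes the tiled results back into the destinations and finishes the
//   // custom loop nest.
//   options.generateLoopTerminatorFn =
//       [&](RewriterBase &rewriter, Location loc, ValueRange tiledResults,
//           ArrayRef<SmallVector<OpFoldResult>> resultOffsets,
//           ArrayRef<SmallVector<OpFoldResult>> resultSizes,
//           ValueRange destinationTensors) { /*...*/ };
//   FailureOr<scf::SCFTilingResult> tiled =
//       scf::tileUsingSCF(rewriter, op, options);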
@@ -1024,7 +1147,7 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, "expected interchange vector to be a permutation"); applyPermutationToVector(iterationDomain, interchangeVector); - applyPermutationToVector(tileSizes, interchangeVector); + applyPermutationToVector(givenTileSizes, interchangeVector); if (!numThreads.empty()) applyPermutationToVector(numThreads, interchangeVector); } @@ -1032,24 +1155,21 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, FailureOr tilingResult; // 4. Define the lambda function used later to generate the body of the // innermost tiled loop. - YieldTiledValuesFn innerYieldTiledValuesFn = + GenerateTiledBodyFn innerYieldTiledValuesFn = [&](RewriterBase &rewriter, Location loc, ValueRange ivs, + ArrayRef tileOffsets, ArrayRef tileSizes, ValueRange regionIterArgs, SmallVector &tiledResults, SmallVector> &resultOffsets, SmallVector> &resultSizes) -> LogicalResult { - // 4a. Compute the `offsets` and `sizes` to use for tiling. - SmallVector offsets, sizes; - std::tie(offsets, sizes) = getTileOffsetAndSizes( - rewriter, loc, options.reductionStrategy, ivs, iterationDomain, - tileSizes, numThreads, reductionDims); - // 4b. If interchange was provided, apply inverse of the interchange // to get back the offsets/sizes in the order to be specified. + SmallVector tileOffsetsVec = llvm::to_vector(tileOffsets); + SmallVector tileSizesVec = llvm::to_vector(tileSizes); if (!interchangeVector.empty()) { auto inversePermutation = invertPermutationVector(interchangeVector); - applyPermutationToVector(offsets, inversePermutation); - applyPermutationToVector(sizes, inversePermutation); + applyPermutationToVector(tileOffsetsVec, inversePermutation); + applyPermutationToVector(tileSizesVec, inversePermutation); } // 5. Generate the tiled implementation within the inner most loop. @@ -1061,7 +1181,7 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, // 5b. Early return cloned op if tiling is not happening. We can not // return the original op because it could lead to `rewriter.replaceOp(op, // op->getResults())` and users would get crash. - if (llvm::all_of(tileSizes, isZeroInteger)) { + if (llvm::all_of(givenTileSizes, isZeroInteger)) { tiledResults.append(clonedOp->result_begin(), clonedOp->result_end()); tilingResult = TilingResult{/*tiledOps=*/{clonedOp}, clonedOp->getResults(), @@ -1070,9 +1190,10 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, } // 5c. Tile the cloned operation. 
- tilingResult = getTiledImplementation( - rewriter, clonedOp, options.reductionStrategy, regionIterArgs, offsets, - sizes, ivs, numThreads, tileSizes, reductionDims); + tilingResult = + getTiledImplementation(rewriter, clonedOp, options.reductionStrategy, + regionIterArgs, tileOffsetsVec, tileSizesVec, + ivs, numThreads, givenTileSizes, reductionDims); if (failed(tilingResult)) { rewriter.eraseOp(clonedOp); return op.emitOpError("faild to tile operation"); @@ -1089,8 +1210,8 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, SmallVector resultOffset, resultSize; if (failed(getResultTilePosition( rewriter, options.reductionStrategy, index, tiledValue, op, - offsets, sizes, ivs, numThreads, tileSizes, reductionDims, - resultOffset, resultSize))) { + tileOffsetsVec, tileSizesVec, ivs, numThreads, givenTileSizes, + reductionDims, resultOffset, resultSize))) { for (auto op : tilingResult->tiledOps) { rewriter.eraseOp(op); } @@ -1107,7 +1228,7 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, // 6. Find the destination tensors to use for the operation. FailureOr> maybeInits = createInitialTensorsForTiling( rewriter, op, options.reductionStrategy, iterationDomain, numThreads, - tileSizes, reductionDims); + givenTileSizes, reductionDims); if (failed(maybeInits)) { return rewriter.notifyMatchFailure( op, "unable to create initial tensors for tiling"); @@ -1116,13 +1237,16 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op, // 7. Generate the tiled loops nest using the callback defined above. SmallVector loops; - if (failed(generateLoopNest(rewriter, op.getLoc(), options.loopType, - iterationDomain, tileSizes, numThreads, - initTensors, options.mappingVector, - innerYieldTiledValuesFn, loops))) - return op.emitOpError("failed to generate tiling loops"); - assert(succeeded(tilingResult) && - "expected tiling result to be computed after loop generation"); + { + FailureOr> loopsOr = generateLoopNest( + rewriter, op.getLoc(), options, iterationDomain, givenTileSizes, + numThreads, initTensors, innerYieldTiledValuesFn); + if (failed(loopsOr)) + return op.emitOpError("failed to generate tiling loops"); + assert(succeeded(tilingResult) && + "expected tiling result to be computed after loop generation"); + std::swap(loops, loopsOr.value()); + } if (loops.empty()) { // If loops are empty, the tiled op is used as the replacement for the diff --git a/mlir/lib/Dialect/SCF/Utils/Utils.cpp b/mlir/lib/Dialect/SCF/Utils/Utils.cpp index e7bce98c607df..10eae8906ce31 100644 --- a/mlir/lib/Dialect/SCF/Utils/Utils.cpp +++ b/mlir/lib/Dialect/SCF/Utils/Utils.cpp @@ -671,9 +671,10 @@ LogicalResult mlir::loopUnrollJamByFactor(scf::ForOp forOp, return success(); } -Range emitNormalizedLoopBoundsForIndexType(RewriterBase &rewriter, Location loc, - OpFoldResult lb, OpFoldResult ub, - OpFoldResult step) { +static Range emitNormalizedLoopBoundsForIndexType(RewriterBase &rewriter, + Location loc, OpFoldResult lb, + OpFoldResult ub, + OpFoldResult step) { Range normalizedLoopBounds; normalizedLoopBounds.offset = rewriter.getIndexAttr(0); normalizedLoopBounds.stride = rewriter.getIndexAttr(1); diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp index 8244e64abba12..7e9a80e7d73a1 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp @@ -20,6 +20,7 @@ #include "llvm/Support/ErrorHandling.h" #include +#include using namespace mlir; using namespace mlir::spirv; @@ -45,17 +46,67 @@ 
class TypeExtensionVisitor { return; TypeSwitch(type) - .Case( + .Case( [this](auto concreteType) { addConcrete(concreteType); }) - .Case( + .Case( [this](auto concreteType) { add(concreteType.getElementType()); }) + .Case([this](SampledImageType concreteType) { + add(concreteType.getImageType()); + }) .Case([this](StructType concreteType) { for (Type elementType : concreteType.getElementTypes()) add(elementType); }) + .Default([](SPIRVType) { llvm_unreachable("Unhandled type"); }); + } + + void add(Type type) { add(cast(type)); } + +private: + // Types that add unique extensions. + void addConcrete(CooperativeMatrixType type); + void addConcrete(PointerType type); + void addConcrete(ScalarType type); + void addConcrete(TensorArmType type); + + SPIRVType::ExtensionArrayRefVector &extensions; + std::optional storage; + llvm::SmallDenseSet>> seen; +}; + +// Helper function to collect capabilities implied by a type by visiting all its +// subtypes. Maintains a set of `seen` types to avoid recursion in structs. +// +// Serves as the source-of-truth for type capability information. All capability +// logic should be added to this class, while the +// `SPIRVType::getCapabilities` function should not handle capability-related +// logic directly and only invoke `TypeCapabilityVisitor::add(Type *)`. +class TypeCapabilityVisitor { +public: + TypeCapabilityVisitor(SPIRVType::CapabilityArrayRefVector &capabilities, + std::optional storage) + : capabilities(capabilities), storage(storage) {} + + // Main visitor entry point. Adds all extensions to the vector. Saves `type` + // as seen and dispatches to the right concrete `.add` function. + void add(SPIRVType type) { + if (auto [_it, inserted] = seen.insert({type, storage}); !inserted) + return; + + TypeSwitch(type) + .Case( + [this](auto concreteType) { addConcrete(concreteType); }) + .Case([this](ArrayType concreteType) { + add(concreteType.getElementType()); + }) .Case([this](SampledImageType concreteType) { add(concreteType.getImageType()); }) + .Case([this](StructType concreteType) { + for (Type elementType : concreteType.getElementTypes()) + add(elementType); + }) .Default([](SPIRVType) { llvm_unreachable("Unhandled type"); }); } @@ -63,12 +114,16 @@ class TypeExtensionVisitor { private: // Types that add unique extensions. 
- void addConcrete(ScalarType type); - void addConcrete(PointerType type); void addConcrete(CooperativeMatrixType type); + void addConcrete(ImageType type); + void addConcrete(MatrixType type); + void addConcrete(PointerType type); + void addConcrete(RuntimeArrayType type); + void addConcrete(ScalarType type); void addConcrete(TensorArmType type); + void addConcrete(VectorType type); - SPIRVType::ExtensionArrayRefVector &extensions; + SPIRVType::CapabilityArrayRefVector &capabilities; std::optional storage; llvm::SmallDenseSet>> seen; }; @@ -118,21 +173,6 @@ Type ArrayType::getElementType() const { return getImpl()->elementType; } unsigned ArrayType::getArrayStride() const { return getImpl()->stride; } -void ArrayType::getCapabilities( - SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage) { - llvm::cast(getElementType()) - .getCapabilities(capabilities, storage); -} - -std::optional ArrayType::getSizeInBytes() { - auto elementType = llvm::cast(getElementType()); - std::optional size = elementType.getSizeInBytes(); - if (!size) - return std::nullopt; - return (*size + getArrayStride()) * getNumElements(); -} - //===----------------------------------------------------------------------===// // CompositeType //===----------------------------------------------------------------------===// @@ -163,77 +203,27 @@ Type CompositeType::getElementType(unsigned index) const { } unsigned CompositeType::getNumElements() const { - if (auto arrayType = llvm::dyn_cast(*this)) - return arrayType.getNumElements(); - if (auto matrixType = llvm::dyn_cast(*this)) - return matrixType.getNumColumns(); - if (auto structType = llvm::dyn_cast(*this)) - return structType.getNumElements(); - if (auto vectorType = llvm::dyn_cast(*this)) - return vectorType.getNumElements(); - if (auto tensorArmType = dyn_cast(*this)) - return tensorArmType.getNumElements(); - if (llvm::isa(*this)) { - llvm_unreachable( - "invalid to query number of elements of spirv Cooperative Matrix type"); - } - if (llvm::isa(*this)) { - llvm_unreachable( - "invalid to query number of elements of spirv::RuntimeArray type"); - } - llvm_unreachable("invalid composite type"); + return TypeSwitch(*this) + .Case( + [](auto type) { return type.getNumElements(); }) + .Case([](MatrixType type) { return type.getNumColumns(); }) + .Default([](SPIRVType) -> unsigned { + llvm_unreachable("Invalid type for number of elements query"); + }); } bool CompositeType::hasCompileTimeKnownNumElements() const { return !llvm::isa(*this); } -void CompositeType::getCapabilities( - SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage) { - TypeSwitch(*this) - .Case( - [&](auto type) { type.getCapabilities(capabilities, storage); }) - .Case([&](VectorType type) { - auto vecSize = getNumElements(); - if (vecSize == 8 || vecSize == 16) { - static const Capability caps[] = {Capability::Vector16}; - ArrayRef ref(caps, std::size(caps)); - capabilities.push_back(ref); - } - return llvm::cast(type.getElementType()) - .getCapabilities(capabilities, storage); - }) - .Case([&](TensorArmType type) { - static constexpr Capability cap{Capability::TensorsARM}; - capabilities.push_back(cap); - return llvm::cast(type.getElementType()) - .getCapabilities(capabilities, storage); - }) - .Default([](Type) { llvm_unreachable("invalid composite type"); }); -} - -std::optional CompositeType::getSizeInBytes() { - if (auto arrayType = llvm::dyn_cast(*this)) - return arrayType.getSizeInBytes(); - if (auto structType = llvm::dyn_cast(*this)) - return 
structType.getSizeInBytes(); - if (auto vectorType = llvm::dyn_cast(*this)) { - std::optional elementSize = - llvm::cast(vectorType.getElementType()).getSizeInBytes(); - if (!elementSize) - return std::nullopt; - return *elementSize * vectorType.getNumElements(); - } - if (auto tensorArmType = llvm::dyn_cast(*this)) { - std::optional elementSize = - llvm::cast(tensorArmType.getElementType()).getSizeInBytes(); - if (!elementSize) - return std::nullopt; - return *elementSize * tensorArmType.getNumElements(); +void TypeCapabilityVisitor::addConcrete(VectorType type) { + add(type.getElementType()); + + int64_t vecSize = type.getNumElements(); + if (vecSize == 8 || vecSize == 16) { + static constexpr auto cap = Capability::Vector16; + capabilities.push_back(cap); } - return std::nullopt; } //===----------------------------------------------------------------------===// @@ -317,12 +307,9 @@ void TypeExtensionVisitor::addConcrete(CooperativeMatrixType type) { extensions.push_back(ext); } -void CooperativeMatrixType::getCapabilities( - SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage) { - llvm::cast(getElementType()) - .getCapabilities(capabilities, storage); - static constexpr Capability caps[] = {Capability::CooperativeMatrixKHR}; +void TypeCapabilityVisitor::addConcrete(CooperativeMatrixType type) { + add(type.getElementType()); + static constexpr auto caps = Capability::CooperativeMatrixKHR; capabilities.push_back(caps); } @@ -428,14 +415,14 @@ ImageSamplerUseInfo ImageType::getSamplerUseInfo() const { ImageFormat ImageType::getImageFormat() const { return getImpl()->format; } -void ImageType::getCapabilities( - SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional) { - if (auto dimCaps = spirv::getCapabilities(getDim())) +void TypeCapabilityVisitor::addConcrete(ImageType type) { + if (auto dimCaps = spirv::getCapabilities(type.getDim())) capabilities.push_back(*dimCaps); - if (auto fmtCaps = spirv::getCapabilities(getImageFormat())) + if (auto fmtCaps = spirv::getCapabilities(type.getImageFormat())) capabilities.push_back(*fmtCaps); + + add(type.getElementType()); } //===----------------------------------------------------------------------===// @@ -486,15 +473,15 @@ void TypeExtensionVisitor::addConcrete(PointerType type) { extensions.push_back(*scExts); } -void PointerType::getCapabilities( - SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage) { +void TypeCapabilityVisitor::addConcrete(PointerType type) { // Use this pointer type's storage class because this pointer indicates we are // using the pointee type in that specific storage class. 
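// Illustrative aside (editor's sketch, not part of the patch): several hunks
// in this file replace a static array plus an explicit ArrayRef with
//   static constexpr auto cap = Capability::X; capabilities.push_back(cap);
// Assuming the capability vector holds ArrayRef-like views (as the surrounding
// code suggests), the static storage duration is what keeps a one-element view
// valid after the helper returns. A standalone analogue using std::span:
#include <cstdio>
#include <span>
#include <vector>

enum class Capability { Shader, Matrix, Vector16 };

void addMatrixCaps(std::vector<std::span<const Capability>> &caps) {
  // OK: `cap` has static storage duration, so the one-element view stays valid.
  static constexpr Capability cap = Capability::Matrix;
  caps.push_back({&cap, 1});
  // A non-static local here would dangle once this function returns.
}

int main() {
  std::vector<std::span<const Capability>> caps;
  addMatrixCaps(caps);
  std::printf("collected %zu capability group(s)\n", caps.size());
  return 0;
}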
- llvm::cast(getPointeeType()) - .getCapabilities(capabilities, getStorageClass()); + std::optional oldStorageClass = storage; + storage = type.getStorageClass(); + add(type.getPointeeType()); + storage = oldStorageClass; - if (auto scCaps = spirv::getCapabilities(getStorageClass())) + if (auto scCaps = spirv::getCapabilities(type.getStorageClass())) capabilities.push_back(*scCaps); } @@ -534,16 +521,10 @@ Type RuntimeArrayType::getElementType() const { return getImpl()->elementType; } unsigned RuntimeArrayType::getArrayStride() const { return getImpl()->stride; } -void RuntimeArrayType::getCapabilities( - SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage) { - { - static const Capability caps[] = {Capability::Shader}; - ArrayRef ref(caps, std::size(caps)); - capabilities.push_back(ref); - } - llvm::cast(getElementType()) - .getCapabilities(capabilities, storage); +void TypeCapabilityVisitor::addConcrete(RuntimeArrayType type) { + add(type.getElementType()); + static constexpr auto cap = Capability::Shader; + capabilities.push_back(cap); } //===----------------------------------------------------------------------===// @@ -601,10 +582,8 @@ void TypeExtensionVisitor::addConcrete(ScalarType type) { } } -void ScalarType::getCapabilities( - SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage) { - unsigned bitwidth = getIntOrFloatBitWidth(); +void TypeCapabilityVisitor::addConcrete(ScalarType type) { + unsigned bitwidth = type.getIntOrFloatBitWidth(); // 8- or 16-bit integer/floating-point numbers will require extra capabilities // to appear in interface storage classes. See SPV_KHR_16bit_storage and @@ -613,15 +592,13 @@ void ScalarType::getCapabilities( #define STORAGE_CASE(storage, cap8, cap16) \ case StorageClass::storage: { \ if (bitwidth == 8) { \ - static const Capability caps[] = {Capability::cap8}; \ - ArrayRef ref(caps, std::size(caps)); \ - capabilities.push_back(ref); \ + static constexpr auto cap = Capability::cap8; \ + capabilities.push_back(cap); \ return; \ } \ if (bitwidth == 16) { \ - static const Capability caps[] = {Capability::cap16}; \ - ArrayRef ref(caps, std::size(caps)); \ - capabilities.push_back(ref); \ + static constexpr auto cap = Capability::cap16; \ + capabilities.push_back(cap); \ return; \ } \ /* For 64-bit integers/floats, Int64/Float64 enables support for all */ \ @@ -640,9 +617,8 @@ void ScalarType::getCapabilities( case StorageClass::Input: case StorageClass::Output: { if (bitwidth == 16) { - static const Capability caps[] = {Capability::StorageInputOutput16}; - ArrayRef ref(caps, std::size(caps)); - capabilities.push_back(ref); + static constexpr auto cap = Capability::StorageInputOutput16; + capabilities.push_back(cap); return; } break; @@ -658,12 +634,11 @@ void ScalarType::getCapabilities( #define WIDTH_CASE(type, width) \ case width: { \ - static const Capability caps[] = {Capability::type##width}; \ - ArrayRef ref(caps, std::size(caps)); \ - capabilities.push_back(ref); \ + static constexpr auto cap = Capability::type##width; \ + capabilities.push_back(cap); \ } break - if (auto intType = llvm::dyn_cast(*this)) { + if (auto intType = dyn_cast(type)) { switch (bitwidth) { WIDTH_CASE(Int, 8); WIDTH_CASE(Int, 16); @@ -675,14 +650,14 @@ void ScalarType::getCapabilities( llvm_unreachable("invalid bitwidth to getCapabilities"); } } else { - assert(llvm::isa(*this)); + assert(isa(type)); switch (bitwidth) { case 16: { - if (isa(*this)) { - static const Capability cap = Capability::BFloat16TypeKHR; + if 
(isa(type)) { + static constexpr auto cap = Capability::BFloat16TypeKHR; capabilities.push_back(cap); } else { - static const Capability cap = Capability::Float16; + static constexpr auto cap = Capability::Float16; capabilities.push_back(cap); } break; @@ -698,19 +673,6 @@ void ScalarType::getCapabilities( #undef WIDTH_CASE } -std::optional ScalarType::getSizeInBytes() { - auto bitWidth = getIntOrFloatBitWidth(); - // According to the SPIR-V spec: - // "There is no physical size or bit pattern defined for values with boolean - // type. If they are stored (in conjunction with OpVariable), they can only - // be used with logical addressing operations, not physical, and only with - // non-externally visible shader Storage Classes: Workgroup, CrossWorkgroup, - // Private, Function, Input, and Output." - if (bitWidth == 1) - return std::nullopt; - return bitWidth / 8; -} - //===----------------------------------------------------------------------===// // SPIRVType //===----------------------------------------------------------------------===// @@ -740,31 +702,39 @@ void SPIRVType::getExtensions(SPIRVType::ExtensionArrayRefVector &extensions, void SPIRVType::getCapabilities( SPIRVType::CapabilityArrayRefVector &capabilities, std::optional storage) { - if (auto scalarType = llvm::dyn_cast(*this)) { - scalarType.getCapabilities(capabilities, storage); - } else if (auto compositeType = llvm::dyn_cast(*this)) { - compositeType.getCapabilities(capabilities, storage); - } else if (auto imageType = llvm::dyn_cast(*this)) { - imageType.getCapabilities(capabilities, storage); - } else if (auto sampledImageType = llvm::dyn_cast(*this)) { - sampledImageType.getCapabilities(capabilities, storage); - } else if (auto matrixType = llvm::dyn_cast(*this)) { - matrixType.getCapabilities(capabilities, storage); - } else if (auto ptrType = llvm::dyn_cast(*this)) { - ptrType.getCapabilities(capabilities, storage); - } else if (auto tensorArmType = llvm::dyn_cast(*this)) { - tensorArmType.getCapabilities(capabilities, storage); - } else { - llvm_unreachable("invalid SPIR-V Type to getCapabilities"); - } + TypeCapabilityVisitor{capabilities, storage}.add(*this); } std::optional SPIRVType::getSizeInBytes() { - if (auto scalarType = llvm::dyn_cast(*this)) - return scalarType.getSizeInBytes(); - if (auto compositeType = llvm::dyn_cast(*this)) - return compositeType.getSizeInBytes(); - return std::nullopt; + return TypeSwitch>(*this) + .Case([](ScalarType type) -> std::optional { + // According to the SPIR-V spec: + // "There is no physical size or bit pattern defined for values with + // boolean type. If they are stored (in conjunction with OpVariable), + // they can only be used with logical addressing operations, not + // physical, and only with non-externally visible shader Storage + // Classes: Workgroup, CrossWorkgroup, Private, Function, Input, and + // Output." + int64_t bitWidth = type.getIntOrFloatBitWidth(); + if (bitWidth == 1) + return std::nullopt; + return bitWidth / 8; + }) + .Case([](ArrayType type) -> std::optional { + // Since array type may have an explicit stride declaration (in bytes), + // we also include it in the calculation. 
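// Illustrative aside (editor's sketch, not part of the patch): the hunks
// around this point fold the per-class getSizeInBytes overrides into a single
// TypeSwitch inside SPIRVType::getSizeInBytes. A standalone analogue of that
// "one dispatch point" structure, with std::variant standing in for the
// SPIR-V type hierarchy (illustrative only, not the MLIR API):
#include <cstdint>
#include <iostream>
#include <optional>
#include <type_traits>
#include <variant>

struct Scalar { unsigned bitWidth; };
struct Array  { Scalar element; unsigned stride; unsigned numElements; };

using Type = std::variant<Scalar, Array>;

std::optional<int64_t> getSizeInBytes(const Type &t) {
  return std::visit(
      [](const auto &concrete) -> std::optional<int64_t> {
        using T = std::decay_t<decltype(concrete)>;
        if constexpr (std::is_same_v<T, Scalar>) {
          if (concrete.bitWidth == 1) // booleans have no physical size
            return std::nullopt;
          return concrete.bitWidth / 8;
        } else {
          std::optional<int64_t> elem = getSizeInBytes(Type{concrete.element});
          if (!elem)
            return std::nullopt;
          // Mirrors the patch: an explicit stride is added per element.
          return (*elem + concrete.stride) * concrete.numElements;
        }
      },
      t);
}

int main() {
  std::cout << getSizeInBytes(Array{{32}, 0, 4}).value_or(-1) << "\n"; // 16
  return 0;
}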
+ auto elementType = cast(type.getElementType()); + if (std::optional size = elementType.getSizeInBytes()) + return (*size + type.getArrayStride()) * type.getNumElements(); + return std::nullopt; + }) + .Case([](auto type) -> std::optional { + if (std::optional elementSize = + cast(type.getElementType()).getSizeInBytes()) + return *elementSize * type.getNumElements(); + return std::nullopt; + }) + .Default(std::optional()); } //===----------------------------------------------------------------------===// @@ -814,12 +784,6 @@ SampledImageType::verifyInvariants(function_ref emitError, return success(); } -void SampledImageType::getCapabilities( - SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage) { - llvm::cast(getImageType()).getCapabilities(capabilities, storage); -} - //===----------------------------------------------------------------------===// // StructType //===----------------------------------------------------------------------===// @@ -1172,13 +1136,6 @@ StructType::trySetBody(ArrayRef memberTypes, structDecorations); } -void StructType::getCapabilities( - SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage) { - for (Type elementType : getElementTypes()) - llvm::cast(elementType).getCapabilities(capabilities, storage); -} - llvm::hash_code spirv::hash_value( const StructType::MemberDecorationInfo &memberDecorationInfo) { return llvm::hash_combine(memberDecorationInfo.memberIndex, @@ -1271,16 +1228,10 @@ unsigned MatrixType::getNumElements() const { return (getImpl()->columnCount) * getNumRows(); } -void MatrixType::getCapabilities( - SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage) { - { - static const Capability caps[] = {Capability::Matrix}; - ArrayRef ref(caps, std::size(caps)); - capabilities.push_back(ref); - } - // Add any capabilities associated with the underlying vectors (i.e., columns) - llvm::cast(getColumnType()).getCapabilities(capabilities, storage); +void TypeCapabilityVisitor::addConcrete(MatrixType type) { + add(type.getColumnType()); + static constexpr auto cap = Capability::Matrix; + capabilities.push_back(cap); } //===----------------------------------------------------------------------===// @@ -1332,12 +1283,9 @@ void TypeExtensionVisitor::addConcrete(TensorArmType type) { extensions.push_back(ext); } -void TensorArmType::getCapabilities( - SPIRVType::CapabilityArrayRefVector &capabilities, - std::optional storage) { - llvm::cast(getElementType()) - .getCapabilities(capabilities, storage); - static constexpr Capability cap{Capability::TensorsARM}; +void TypeCapabilityVisitor::addConcrete(TensorArmType type) { + add(type.getElementType()); + static constexpr auto cap = Capability::TensorsARM; capabilities.push_back(cap); } diff --git a/mlir/lib/Dialect/Shard/Transforms/ShardingPropagation.cpp b/mlir/lib/Dialect/Shard/Transforms/ShardingPropagation.cpp index a647128cf0500..3bfbf373209e3 100644 --- a/mlir/lib/Dialect/Shard/Transforms/ShardingPropagation.cpp +++ b/mlir/lib/Dialect/Shard/Transforms/ShardingPropagation.cpp @@ -128,13 +128,13 @@ getOrderedPossibleShardingAttrs(ArrayRef mustShardings, curShardingAttrs.push_back(optionalShardings[i]); dfsCreateShardingAttrs(i + 1); curShardingAttrs.pop_back(); - curShardingAttrs.push_back({}); + curShardingAttrs.emplace_back(); dfsCreateShardingAttrs(i + 1); curShardingAttrs.pop_back(); return; } - curShardingAttrs.push_back({}); + curShardingAttrs.emplace_back(); dfsCreateShardingAttrs(i + 1); curShardingAttrs.pop_back(); }; diff --git 
a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp index 659282a995123..f53950242e10c 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp @@ -344,7 +344,7 @@ void LoopEmitter::initSubSectIterator(OpBuilder &builder, Location loc) { // Reverse queue into a stack. std::reverse(remDepStack[t][lvl].begin(), remDepStack[t][lvl].end()); for (auto [loop, coeff] : dependentLvlMap[t][lvl]) - depRedOrder.emplace_back(std::make_tuple(loop, t, lvl)); + depRedOrder.emplace_back(loop, t, lvl); } if (depRedOrder.empty()) diff --git a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp index 8d636460c667e..caf80165fc640 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaCanonicalizations.cpp @@ -1562,26 +1562,6 @@ OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) { return getInput1(); } -OpFoldResult tosa::LogOp::fold(FoldAdaptor adaptor) { - auto input = getInput1(); - // Element-wise log(exp(x)) = x - if (auto op = input.getDefiningOp()) { - return op.getInput1(); - } - - return {}; -} - -OpFoldResult tosa::ExpOp::fold(FoldAdaptor adaptor) { - auto input = getInput1(); - // Element-wise exp(log(x)) = x - if (auto op = input.getDefiningOp()) { - return op.getInput1(); - } - - return {}; -} - OpFoldResult tosa::NegateOp::fold(FoldAdaptor adaptor) { // Element-wise negate(negate(x)) = x // iff all zero points are constant 0 diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp index aa58fc21fe26f..332f1a0e5506f 100644 --- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp +++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp @@ -1843,12 +1843,6 @@ LogicalResult MatMulOp::verify() { return emitOpError("expect quantized operands to have same widths, got ") << aQuantWidth << " and " << bQuantWidth; } - } else { - // non-quantized element types - if (aElementType != bElementType) { - return emitOpError("expect same element type for inputs a and b, got ") - << aElementType << " and " << bElementType; - } } // check a_zp and b_zp @@ -4073,16 +4067,26 @@ LogicalResult WhileOp::verify() { .failed()) return failure(); - auto bodyYield = cast(getBodyGraph().front().getTerminator()); - if (errorIfTypeOrShapeMismatch(*this, bodyYield.getInputs(), - "'body_graph' results", getInputList(), - "'input_list'") - .failed()) - return failure(); + if (getBodyGraph().front().mightHaveTerminator()) { + auto bodyYield = + dyn_cast(getBodyGraph().front().getTerminator()); + if (bodyYield && errorIfTypeOrShapeMismatch(*this, bodyYield.getInputs(), + "'body_graph' results", + getInputList(), "'input_list'") + .failed()) + return failure(); + } // Condition block output must be a single element tensor with a single bool // value. 
- auto condYield = cast(getCondGraph().front().getTerminator()); + if (!getCondGraph().front().mightHaveTerminator()) + return success(); + + auto condYield = + dyn_cast(getCondGraph().front().getTerminator()); + if (!condYield) + return success(); + if (condYield.getInputs().size() != 1) return emitOpError() << "require 'cond_graph' only have one result"; diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp index 91fea676ac44a..4fc7ce81d9821 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp @@ -205,148 +205,142 @@ struct TosaValidation : public tosa::impl::TosaValidationBase { constCheckers.emplace_back(checkConstantOperandNegate); } - bool levelCheckKernel(Operation *op, int32_t v, const StringRef checkDesc) { - if (v > tosaLevel.MAX_KERNEL) { - op->emitOpError() << "failed level check: " << checkDesc; - return false; - } - return true; + LogicalResult levelCheckKernel(Operation *op, int32_t v, + const StringRef checkDesc) { + if (v > tosaLevel.MAX_KERNEL) + return op->emitOpError() << "failed level check: " << checkDesc; + return success(); } - bool levelCheckStride(Operation *op, int32_t v, const StringRef checkDesc) { - if (v > tosaLevel.MAX_STRIDE) { - op->emitOpError() << "failed level check: " << checkDesc; - return false; - } - return true; + LogicalResult levelCheckStride(Operation *op, int32_t v, + const StringRef checkDesc) { + if (v > tosaLevel.MAX_STRIDE) + return op->emitOpError() << "failed level check: " << checkDesc; + return success(); } - bool levelCheckScale(Operation *op, int32_t v, const StringRef checkDesc) { - if (v > tosaLevel.MAX_SCALE) { - op->emitOpError() << "failed level check: " << checkDesc; - return false; - } - return true; + LogicalResult levelCheckScale(Operation *op, int32_t v, + const StringRef checkDesc) { + if (v > tosaLevel.MAX_SCALE) + return op->emitOpError() << "failed level check: " << checkDesc; + return success(); } - bool levelCheckListSize(Operation *op, int32_t v, const StringRef checkDesc) { - if (v > tosaLevel.MAX_TENSOR_LIST_SIZE) { - op->emitOpError() << "failed level check for MAX_TENSOR_LIST_SIZE: " - << checkDesc; - return false; - } - return true; + LogicalResult levelCheckListSize(Operation *op, int32_t v, + const StringRef checkDesc) { + if (v > tosaLevel.MAX_TENSOR_LIST_SIZE) + return op->emitOpError() + << "failed level check for MAX_TENSOR_LIST_SIZE: " << checkDesc; + return success(); } // Perform the Level Rank check on the tensor type. - bool levelCheckRank(Operation *op, const Type typeToCheck, - const StringRef operandOrResult, int32_t highest_rank) { + LogicalResult levelCheckRank(Operation *op, const Type typeToCheck, + const StringRef operandOrResult, + int32_t highest_rank) { if (ShapedType type = dyn_cast(typeToCheck)) { - if (!type.hasRank()) { - op->emitOpError() << "failed level check: unranked tensor"; - return false; - } - if (type.getRank() > highest_rank) { - op->emitOpError() << "failed level check: " << operandOrResult - << " rank(shape) <= MAX_RANK"; - return false; - } + if (!type.hasRank()) + return op->emitOpError() << "failed level check: unranked tensor"; + if (type.getRank() > highest_rank) + return op->emitOpError() << "failed level check: " << operandOrResult + << " rank(shape) <= MAX_RANK"; } - return true; + return success(); } // Perform the Level Rank check on the tensor value. 
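// Illustrative aside (editor's sketch, not part of the patch): the validation
// helpers in this file move from `bool` to `LogicalResult`, so an error path
// can simply `return op->emitOpError() << ...`; in MLIR the in-flight
// diagnostic converts to a failure result and callers compose with failed().
// A minimal standalone analogue of that convertibility, using toy stand-ins
// rather than the MLIR classes:
#include <iostream>
#include <string>

struct LogicalResult {
  bool isFailure;
  static LogicalResult success() { return {false}; }
  static LogicalResult failure() { return {true}; }
};
bool failed(LogicalResult r) { return r.isFailure; }

// Stands in for InFlightDiagnostic: streams a message, converts to failure.
struct Diag {
  explicit Diag(const std::string &prefix) { std::cerr << prefix; }
  ~Diag() { std::cerr << "\n"; }
  Diag &operator<<(const std::string &s) {
    std::cerr << s;
    return *this;
  }
  operator LogicalResult() const { return LogicalResult::failure(); }
};

Diag emitOpError(const std::string &opName) { return Diag(opName + ": error: "); }

LogicalResult levelCheckKernel(int v, int maxKernel) {
  if (v > maxKernel)
    return emitOpError("tosa.conv2d")
           << "failed level check: kernel <= MAX_KERNEL";
  return LogicalResult::success();
}

int main() {
  if (failed(levelCheckKernel(/*v=*/9000, /*maxKernel=*/8192)))
    std::cout << "check failed as expected\n";
  return 0;
}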
- bool levelCheckRank(Operation *op, const Value &v, - const StringRef operandOrResult, int32_t highest_rank) { + LogicalResult levelCheckRank(Operation *op, const Value &v, + const StringRef operandOrResult, + int32_t highest_rank) { return levelCheckRank(op, v.getType(), operandOrResult, highest_rank); } // Perform the Level tensor size check on the tensor type. - bool levelCheckSize(Operation *op, const Type &typeToCheck, - const StringRef operandOrResult); + LogicalResult levelCheckSize(Operation *op, const Type &typeToCheck, + const StringRef operandOrResult); // Perform the Level tensor size check on the tensor value. - bool levelCheckSize(Operation *op, const Value &v, - const StringRef operandOrResult) { + LogicalResult levelCheckSize(Operation *op, const Value &v, + const StringRef operandOrResult) { return levelCheckSize(op, v.getType(), operandOrResult); } // Level check sizes of all operands and results of the operation. template - bool levelCheckSizes(T tosaOp) { + LogicalResult levelCheckSizes(T tosaOp) { auto op = tosaOp.getOperation(); for (auto v : op->getOperands()) { - if (!levelCheckSize(op, v, "operand")) - return false; + if (failed(levelCheckSize(op, v, "operand"))) + return failure(); } for (auto v : op->getResults()) { - if (!levelCheckSize(op, v, "result")) - return false; + if (failed(levelCheckSize(op, v, "result"))) + return failure(); } - return true; + return success(); } // Level check ranks of all operands, attribute and results of the operation. template - bool levelCheckRanks(T tosaOp) { + LogicalResult levelCheckRanks(T tosaOp) { auto op = tosaOp.getOperation(); for (auto v : op->getOperands()) { - if (!levelCheckRank(op, v, "operand", tosaLevel.MAX_RANK)) - return false; + if (failed(levelCheckRank(op, v, "operand", tosaLevel.MAX_RANK))) + return failure(); } for (auto v : op->getResults()) { - if (!levelCheckRank(op, v, "result", tosaLevel.MAX_RANK)) - return false; + if (failed(levelCheckRank(op, v, "result", tosaLevel.MAX_RANK))) + return failure(); } - return true; + return success(); } // Level check ranks and sizes. 
- bool levelCheckRanksAndSizes(Operation *op); + LogicalResult levelCheckRanksAndSizes(Operation *op); // Pool Op: level check kernel/stride/pad values template - bool levelCheckPool(Operation *op) { + LogicalResult levelCheckPool(Operation *op) { if (auto poolOp = dyn_cast(op)) { for (auto k : poolOp.getKernel()) { - if (!levelCheckKernel(op, k, "kernel <= MAX_KERNEL")) { - return false; + if (failed(levelCheckKernel(op, k, "kernel <= MAX_KERNEL"))) { + return failure(); } } for (auto s : poolOp.getStride()) { - if (!levelCheckStride(op, s, "stride <= MAX_STRIDE")) { - return false; + if (failed(levelCheckStride(op, s, "stride <= MAX_STRIDE"))) { + return failure(); } } for (auto p : poolOp.getPad()) { - if (!levelCheckKernel(op, p, "pad <= MAX_KERNEL")) { - return false; + if (failed(levelCheckKernel(op, p, "pad <= MAX_KERNEL"))) { + return failure(); } } } - return true; + return success(); } // Conv Op: level check dilation/stride/pad values template - bool levelCheckConv(Operation *op) { + LogicalResult levelCheckConv(Operation *op) { if (auto convOp = dyn_cast(op)) { for (auto k : convOp.getDilation()) { - if (!levelCheckKernel(op, k, "dilation <= MAX_KERNEL")) { - return false; + if (failed(levelCheckKernel(op, k, "dilation <= MAX_KERNEL"))) { + return failure(); } } for (auto p : convOp.getPad()) { - if (!levelCheckKernel(op, p, "pad <= MAX_KERNEL")) { - return false; + if (failed(levelCheckKernel(op, p, "pad <= MAX_KERNEL"))) { + return failure(); } } for (auto s : convOp.getStride()) { - if (!levelCheckStride(op, s, "stride <= MAX_STRIDE")) { - return false; + if (failed(levelCheckStride(op, s, "stride <= MAX_STRIDE"))) { + return failure(); } } auto dilation = convOp.getDilation(); @@ -356,100 +350,100 @@ struct TosaValidation : public tosa::impl::TosaValidationBase { if (isa(op)) { assert(shape.size() == 4); assert(dilation.size() == 2); - if (!levelCheckKernel(op, dilation[0] * shape[1], - "dilation_y * KH <= MAX_KERNEL)") || - !levelCheckKernel(op, dilation[1] * shape[2], - "dilation_x * KW <= MAX_KERNEL)")) - return false; + if (failed(levelCheckKernel(op, dilation[0] * shape[1], + "dilation_y * KH <= MAX_KERNEL)")) || + failed(levelCheckKernel(op, dilation[1] * shape[2], + "dilation_x * KW <= MAX_KERNEL)"))) + return failure(); } else if (isa(op)) { assert(shape.size() == 5); assert(dilation.size() == 3); - if (!levelCheckKernel(op, dilation[0] * shape[1], - "dilation_d * KD <= MAX_KERNEL)") || - !levelCheckKernel(op, dilation[1] * shape[2], - "dilation_y * KH <= MAX_KERNEL)") || - !levelCheckKernel(op, dilation[2] * shape[3], - "dilation_x * KW <= MAX_KERNEL)")) - return false; + if (failed(levelCheckKernel(op, dilation[0] * shape[1], + "dilation_d * KD <= MAX_KERNEL)")) || + failed(levelCheckKernel(op, dilation[1] * shape[2], + "dilation_y * KH <= MAX_KERNEL)")) || + failed(levelCheckKernel(op, dilation[2] * shape[3], + "dilation_x * KW <= MAX_KERNEL)"))) + return failure(); } else if (isa(op)) { assert(shape.size() == 4); assert(dilation.size() == 2); - if (!levelCheckKernel(op, dilation[0] * shape[0], - "dilation_y * KH <= MAX_KERNEL)") || - !levelCheckKernel(op, dilation[1] * shape[1], - "dilation_x * KW <= MAX_KERNEL)")) - return false; + if (failed(levelCheckKernel(op, dilation[0] * shape[0], + "dilation_y * KH <= MAX_KERNEL)")) || + failed(levelCheckKernel(op, dilation[1] * shape[1], + "dilation_x * KW <= MAX_KERNEL)"))) + return failure(); } } } - return true; + return success(); } // FFT op: level check H, W in input shape [N,H,W] template - bool 
levelCheckFFT(Operation *op) { + LogicalResult levelCheckFFT(Operation *op) { if (isa(op)) { for (auto v : op->getOperands()) { if (ShapedType type = dyn_cast(v.getType())) { auto shape = type.getShape(); assert(shape.size() == 3); - if (!levelCheckKernel(op, shape[1], "H <= MAX_KERNEL") || - !levelCheckKernel(op, shape[2], "W <= MAX_KERNEL")) { - return false; + if (failed(levelCheckKernel(op, shape[1], "H <= MAX_KERNEL")) || + failed(levelCheckKernel(op, shape[2], "W <= MAX_KERNEL"))) { + return failure(); } } } } - return true; + return success(); } // TransposeConv2d op: level check kH/kW, outpad, and stride - bool levelCheckTransposeConv2d(Operation *op) { + LogicalResult levelCheckTransposeConv2d(Operation *op) { if (auto transpose = dyn_cast(op)) { if (ShapedType filterType = dyn_cast(transpose.getWeight().getType())) { auto shape = filterType.getShape(); assert(shape.size() == 4); // level check kernel sizes for kH and KW - if (!levelCheckKernel(op, shape[1], "KH <= MAX_KERNEL") || - !levelCheckKernel(op, shape[2], "KW <= MAX_KERNEL")) { - return false; + if (failed(levelCheckKernel(op, shape[1], "KH <= MAX_KERNEL")) || + failed(levelCheckKernel(op, shape[2], "KW <= MAX_KERNEL"))) { + return failure(); } } for (auto p : transpose.getOutPad()) { - if (!levelCheckKernel(op, p, "pad <= MAX_KERNEL")) { - return false; + if (failed(levelCheckKernel(op, p, "pad <= MAX_KERNEL"))) { + return failure(); } } for (auto s : transpose.getStride()) { - if (!levelCheckStride(op, s, "stride <= MAX_STRIDE")) { - return false; + if (failed(levelCheckStride(op, s, "stride <= MAX_STRIDE"))) { + return failure(); } } } - return true; + return success(); } // Resize op: level check max scales - bool levelCheckResize(Operation *op) { + LogicalResult levelCheckResize(Operation *op) { if (auto resize = dyn_cast(op)) { SmallVector scale; if (!tosa::getConstShapeValues(resize.getScale().getDefiningOp(), scale)) { - return false; + return failure(); } const int64_t scaleYN = scale[0]; const int64_t scaleYD = scale[1]; const int64_t scaleXN = scale[2]; const int64_t scaleXD = scale[3]; - if (!levelCheckScale(op, scaleYN / scaleYD, - "scale_y_n/scale_y_d <= MAX_SCALE") || - !levelCheckScale(op, scaleXN / scaleXD, - "scale_x_n/scale_x_d <= MAX_SCALE")) { - return false; + if (failed(levelCheckScale(op, scaleYN / scaleYD, + "scale_y_n/scale_y_d <= MAX_SCALE")) || + failed(levelCheckScale(op, scaleXN / scaleXD, + "scale_x_n/scale_x_d <= MAX_SCALE"))) { + return failure(); } } - return true; + return success(); } // Recursively perform a bottom-up search to determine the maximum nesting @@ -468,62 +462,65 @@ struct TosaValidation : public tosa::impl::TosaValidationBase { getMaxNestedDepth(op, depth); } - bool levelCheckMaxNesting(Operation *op) { + LogicalResult levelCheckMaxNesting(Operation *op) { int32_t maxNestedDepth = 0; getMaxNestedDepth(op, maxNestedDepth); if (maxNestedDepth >= tosaLevel.MAX_NESTING) { op->emitOpError() << "failed level check: " << maxNestedDepth << " >= MAX_NESTING"; - return false; + return failure(); } - return true; + return success(); } - bool levelCheckListSize(Operation *op) { + LogicalResult levelCheckListSize(Operation *op) { if (auto concat = dyn_cast(op)) { return levelCheckListSize(op, concat.getInput1().size(), "input1"); } if (auto custom = dyn_cast(op)) { - if (!levelCheckListSize(op, custom.getInputList().size(), "input_list") || - !levelCheckListSize(op, custom.getOutputList().size(), - "output_list")) { - return false; + if (failed(levelCheckListSize(op, 
custom.getInputList().size(), + "input_list")) || + failed(levelCheckListSize(op, custom.getOutputList().size(), + "output_list"))) { + return failure(); } } if (auto condIf = dyn_cast(op)) { - if (!levelCheckListSize(op, condIf.getInputList().size(), "inputs") || - !levelCheckListSize(op, condIf.getOutputList().size(), "outputs")) { - return false; + if (failed( + levelCheckListSize(op, condIf.getInputList().size(), "inputs")) || + failed(levelCheckListSize(op, condIf.getOutputList().size(), + "outputs"))) { + return failure(); } } if (auto w = dyn_cast(op)) { - if (!levelCheckListSize(op, w.getInputList().size(), "inputs") || - !levelCheckListSize(op, w.getOutputList().size(), "outputs")) { - return false; + if (failed(levelCheckListSize(op, w.getInputList().size(), "inputs")) || + failed(levelCheckListSize(op, w.getOutputList().size(), "outputs"))) { + return failure(); } } - return true; + return success(); } - bool attributeCheckRescale(Operation *op) { + LogicalResult attributeCheckRescale(Operation *op) { if (auto rescale = dyn_cast(op)) { if (rescale.getRoundingMode() == RoundingMode::DOUBLE_ROUND && !targetEnv.allows(Extension::doubleround)) { op->emitOpError() << "failed attribute check: rounding_mode = DOUBLE_ROUND " << "requires extension [doubleround]"; - return false; + return failure(); } if (rescale.getRoundingMode() == RoundingMode::INEXACT_ROUND && !targetEnv.allows(Extension::inexactround)) { op->emitOpError() << "failed attribute check: rounding_mode = INEXACT_ROUND " << "requires extension [inexactround]"; - return false; + return failure(); } } - return true; + return success(); } // configure profile and level values from pass options profileName and @@ -563,8 +560,8 @@ struct TosaValidation : public tosa::impl::TosaValidationBase { } } - bool CheckVariable(Operation *op); - bool CheckVariableReadOrWrite(Operation *op); + LogicalResult CheckVariable(Operation *op); + LogicalResult CheckVariableReadOrWrite(Operation *op); bool isValidElementType(Type type, const bool allowUnsigned = false); SmallVector< @@ -577,62 +574,66 @@ struct TosaValidation : public tosa::impl::TosaValidationBase { }; template <> -bool TosaValidation::levelCheckRanks(tosa::ArgMaxOp tosaOp) { +LogicalResult TosaValidation::levelCheckRanks(tosa::ArgMaxOp tosaOp) { auto *op = tosaOp.getOperation(); - if (!levelCheckRank(op, tosaOp.getInput(), "operand", tosaLevel.MAX_RANK)) - return false; + if (failed( + levelCheckRank(op, tosaOp.getInput(), "operand", tosaLevel.MAX_RANK))) + return failure(); // rank(output) = rank(input) - 1 - if (!levelCheckRank(op, tosaOp.getOutput(), "result", tosaLevel.MAX_RANK - 1)) - return false; + if (failed(levelCheckRank(op, tosaOp.getOutput(), "result", + tosaLevel.MAX_RANK - 1))) + return failure(); - return true; + return success(); } template <> -bool TosaValidation::levelCheckRanks(tosa::IfOp tosaOp) { +LogicalResult TosaValidation::levelCheckRanks(tosa::IfOp tosaOp) { auto *op = tosaOp.getOperation(); // Only the condition input has rank limitation. 
- if (!levelCheckRank(op, tosaOp.getCondition(), "operand", tosaLevel.MAX_RANK)) - return false; + if (failed(levelCheckRank(op, tosaOp.getCondition(), "operand", + tosaLevel.MAX_RANK))) + return failure(); - return true; + return success(); } template <> -bool TosaValidation::levelCheckRanks(tosa::VariableOp tosaOp) { +LogicalResult TosaValidation::levelCheckRanks(tosa::VariableOp tosaOp) { auto *op = tosaOp.getOperation(); auto variableType = getVariableType(tosaOp); - if (!levelCheckRank(op, variableType, "variable type", tosaLevel.MAX_RANK)) - return false; + if (failed(levelCheckRank(op, variableType, "variable type", + tosaLevel.MAX_RANK))) + return failure(); - return true; + return success(); } template <> -bool TosaValidation::levelCheckSizes(tosa::VariableOp tosaOp) { +LogicalResult TosaValidation::levelCheckSizes(tosa::VariableOp tosaOp) { auto *op = tosaOp.getOperation(); auto variableType = getVariableType(tosaOp); - if (!levelCheckSize(op, variableType, "variable type")) - return false; + if (failed(levelCheckSize(op, variableType, "variable type"))) + return failure(); - return true; + return success(); } -bool TosaValidation::levelCheckRanksAndSizes(Operation *op) { +LogicalResult TosaValidation::levelCheckRanksAndSizes(Operation *op) { #define CHECK_RANKS_AND_SIZES(tosaOp) \ if (isa(op)) { \ - if (!levelCheckRanks(cast(op))) \ - return false; \ - if (!levelCheckSizes(cast(op))) \ - return false; \ + if (failed(levelCheckRanks(cast(op)))) \ + return failure(); \ + if (failed(levelCheckSizes(cast(op)))) \ + return failure(); \ } #define CHECK_SIZES(tosaOp) \ if (isa(op)) { \ - if (!levelCheckSizes(cast(op))) \ - return false; \ + if (failed(levelCheckSizes(cast(op)))) \ + return failure(); \ } // Tensor Operators @@ -735,24 +736,21 @@ bool TosaValidation::levelCheckRanksAndSizes(Operation *op) { #undef CHECK_RANKS_AND_SIZES #undef CHECK_SIZES - return true; + return success(); } // Perform the Level tensor size check on the tensor type. -bool TosaValidation::levelCheckSize(Operation *op, const Type &typeToCheck, - const StringRef operandOrResult) { +LogicalResult TosaValidation::levelCheckSize(Operation *op, + const Type &typeToCheck, + const StringRef operandOrResult) { if (ShapedType type = dyn_cast(typeToCheck)) { - if (!type.hasRank()) { - op->emitOpError() << "failed level check: unranked tensor"; - return false; - } + if (!type.hasRank()) + return op->emitOpError() << "failed level check: unranked tensor"; auto shape = type.getShape(); for (auto dim : shape) { - if (mlir::ShapedType::isDynamic(dim)) { - op->emitOpError() << "failed level check: " << operandOrResult - << " shape dimension cannot be dynamic"; - return false; - } + if (mlir::ShapedType::isDynamic(dim)) + return op->emitOpError() << "failed level check: " << operandOrResult + << " shape dimension cannot be dynamic"; } int64_t element_bits = type.getElementTypeBitWidth(); @@ -765,14 +763,12 @@ bool TosaValidation::levelCheckSize(Operation *op, const Type &typeToCheck, // For each tensor, the number of tensor elements multiplied by the // element size in bytes must be representable as a tensor_size_t. 
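// Illustrative aside (editor's sketch, not part of the patch): the
// levelCheckSize helper around this point rejects tensors whose byte size
// cannot be represented as a tensor_size_t, i.e. it requires
// size_in_bytes <= (1 << MAX_LOG2_SIZE) - 1. A tiny standalone illustration of
// that arithmetic; the MAX_LOG2_SIZE value here is invented for the example
// and is not a TOSA level constant:
#include <cstdint>
#include <iostream>

int main() {
  const int64_t maxLog2Size = 31; // hypothetical level parameter
  const int64_t maxSize = (INT64_C(1) << maxLog2Size) - 1;

  const int64_t numElements = int64_t(1024) * 1024 * 1024; // 2^30 elements
  const int64_t elementBits = 32;
  const int64_t sizeInBytes = numElements * (elementBits / 8); // 4 GiB

  std::cout << "max " << maxSize << " bytes, tensor " << sizeInBytes
            << " bytes -> " << (sizeInBytes > maxSize ? "fails" : "passes")
            << " the level check\n";
  return 0;
}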
const int64_t max_size = (INT64_C(1) << tosaLevel.MAX_LOG2_SIZE) - 1; - if (size > max_size) { - op->emitOpError() - << "failed level check: " << operandOrResult - << " tensor size (in bytes) <= (1 << MAX_LOG2_SIZE - 1)"; - return false; - } + if (size > max_size) + return op->emitOpError() + << "failed level check: " << operandOrResult + << " tensor size (in bytes) <= (1 << MAX_LOG2_SIZE - 1)"; } - return true; + return success(); } LogicalResult TosaValidation::applyLevelCheck(Operation *op) { @@ -782,28 +778,28 @@ LogicalResult TosaValidation::applyLevelCheck(Operation *op) { } // check rank and sizes early so later checks can assume shaped operands - if (!levelCheckRanksAndSizes(op)) + if (failed(levelCheckRanksAndSizes(op))) return failure(); // additional level checks from spec 0.70 - if (!levelCheckPool(op) || - !levelCheckConv(op) || - !levelCheckConv(op) || - !levelCheckConv(op) || - !levelCheckFFT(op) || - !levelCheckPool(op) || - !levelCheckFFT(op) || !levelCheckTransposeConv2d(op) || - !levelCheckResize(op)) { + if (failed(levelCheckPool(op)) || + failed(levelCheckConv(op)) || + failed(levelCheckConv(op)) || + failed(levelCheckConv(op)) || + failed(levelCheckFFT(op)) || + failed(levelCheckPool(op)) || + failed(levelCheckFFT(op)) || + failed(levelCheckTransposeConv2d(op)) || failed(levelCheckResize(op))) { return failure(); } // level check MAX_TENSOR_LIST_SIZE - if (!levelCheckListSize(op)) { + if (failed(levelCheckListSize(op))) { return failure(); } if (isa(op) || isa(op)) { - if (!levelCheckMaxNesting(op)) { + if (failed(levelCheckMaxNesting(op))) { return failure(); } } @@ -812,7 +808,7 @@ LogicalResult TosaValidation::applyLevelCheck(Operation *op) { } LogicalResult TosaValidation::applyAttributeCheck(Operation *op) { - if (!attributeCheckRescale(op)) + if (failed(attributeCheckRescale(op))) return failure(); return success(); } @@ -823,14 +819,12 @@ inline bool CompatibleTypes(const mlir::Type &type, return type == declaredType; } -bool TosaValidation::CheckVariable(Operation *op) { +LogicalResult TosaValidation::CheckVariable(Operation *op) { if (auto variableOp = dyn_cast(op)) { mlir::StringAttr nameAttr = variableOp.getNameAttr(); - if (variablesMap.count(nameAttr)) { - op->emitOpError() << "name has already been declared"; - return false; - } + if (variablesMap.count(nameAttr)) + return op->emitOpError() << "name has already been declared"; auto elementType = variableOp.getType(); DenseIntElementsAttr varShapeAttr = variableOp.getVarShape(); @@ -841,51 +835,44 @@ bool TosaValidation::CheckVariable(Operation *op) { variablesMap[nameAttr] = variableType; } - return true; + return success(); } -bool TosaValidation::CheckVariableReadOrWrite(Operation *op) { +LogicalResult TosaValidation::CheckVariableReadOrWrite(Operation *op) { if (isa(op) || isa(op)) { mlir::StringAttr nameAttr = cast(op->getAttr("name")); - if (!variablesMap.count(nameAttr)) { - op->emitOpError() << "name has not been declared"; - return false; - } + if (!variablesMap.count(nameAttr)) + return op->emitOpError() << "name has not been declared"; auto varType = variablesMap[nameAttr]; for (auto v : op->getOperands()) { auto type = v.getType(); - if (!CompatibleTypes(type, varType)) { - op->emitOpError() << "operand type does not equal variable type"; - return false; - } + if (!CompatibleTypes(type, varType)) + return op->emitOpError() << "operand type does not equal variable type"; } for (auto v : op->getResults()) { auto type = v.getType(); - if (!CompatibleTypes(type, varType)) { - op->emitOpError() << 
"result type does not equal variable type"; - return false; - } + if (!CompatibleTypes(type, varType)) + return op->emitOpError() << "result type does not equal variable type"; } } - return true; + return success(); } LogicalResult TosaValidation::applyVariableCheck(Operation *op) { - if (!CheckVariable(op) || !CheckVariableReadOrWrite(op)) { + if (failed(CheckVariable(op)) || failed(CheckVariableReadOrWrite(op))) return failure(); - } return success(); } -bool checkErrorIfResize(Operation *op) { +LogicalResult checkErrorIfResize(Operation *op) { auto resize = dyn_cast(op); if (!resize) - return true; + return success(); const Value input = resize.getInput(); const Value output = resize.getOutput(); @@ -894,10 +881,8 @@ bool checkErrorIfResize(Operation *op) { const RankedTensorType outputType = llvm::dyn_cast(output.getType()); - if (!inputType || !outputType) { - op->emitOpError("expect ranked input/output tensor"); - return false; - } + if (!inputType || !outputType) + return op->emitOpError("expect ranked input/output tensor"); // Ensure the image size is supported by GPU APIs and that for integer // implementations, position * stride does not overflow int32_t. @@ -906,17 +891,15 @@ bool checkErrorIfResize(Operation *op) { outputType.getDimSize(1), outputType.getDimSize(2), inputType.getDimSize(1), inputType.getDimSize(2)}; const int64_t *maxDim = llvm::max_element(sizes); - if (maxDim != sizes.end() && *maxDim >= 16384) { - op->emitOpError("expect input/output height/width dims to be < 16384, ") - << "got [OH, OW, IH, IW] = " << sizes; - return false; - } + if (maxDim != sizes.end() && *maxDim >= 16384) + return op->emitOpError( + "expect input/output height/width dims to be < 16384, ") + << "got [OH, OW, IH, IW] = " << sizes; } SmallVector scale; - if (!tosa::getConstShapeValues(resize.getScale().getDefiningOp(), scale)) { - return false; - } + if (!tosa::getConstShapeValues(resize.getScale().getDefiningOp(), scale)) + return failure(); const int64_t scaleYN = scale[0]; const int64_t scaleYD = scale[1]; @@ -924,57 +907,45 @@ bool checkErrorIfResize(Operation *op) { const int64_t scaleXD = scale[3]; // Ensure scale values don't overflow int32 accumulator - if (scaleYN > (1 << 11) || scaleXN > (1 << 11)) { - op->emitOpError("expect all scale numerator values to be <= (1 << 11), " - "got scale_y_n=") - << scaleYN << ", scale_x_n=" << scaleXN; - return false; - } + if (scaleYN > (1 << 11) || scaleXN > (1 << 11)) + return op->emitOpError( + "expect all scale numerator values to be <= (1 << 11), " + "got scale_y_n=") + << scaleYN << ", scale_x_n=" << scaleXN; - if (scaleYD >= 16 * scaleYN || scaleXD >= 16 * scaleXN) { - op->emitOpError("expect a downscale ratio larger than 1/16, got y=") - << scaleYN << "/" << scaleYD << ", x=" << scaleXN << "/" << scaleXD; - return false; - } + if (scaleYD >= 16 * scaleYN || scaleXD >= 16 * scaleXN) + return op->emitOpError("expect a downscale ratio larger than 1/16, got y=") + << scaleYN << "/" << scaleYD << ", x=" << scaleXN << "/" << scaleXD; SmallVector offset; SmallVector border; if (!tosa::getConstShapeValues(resize.getOffset().getDefiningOp(), offset) || - !tosa::getConstShapeValues(resize.getBorder().getDefiningOp(), border)) { - return false; - } + !tosa::getConstShapeValues(resize.getBorder().getDefiningOp(), border)) + return failure(); const int64_t offsetY = offset[0]; const int64_t offsetX = offset[1]; // Set a consistent lower limit of 1/16 downscale to simplify // implementations - if (offsetY < -scaleYN || offsetY >= 16 * scaleYN) { - 
op->emitOpError( - "expect offsetY / scaleYNumerator to be in range [-1, 16), got ") - << offsetY << "/" << scaleYN; - return false; - } - if (offsetX < -scaleXN || offsetX >= 16 * scaleXN) { - op->emitOpError( - "expect offsetX / scaleXNumerator to be in range [-1, 16), got ") - << offsetX << "/" << scaleXN; - return false; - } + if (offsetY < -scaleYN || offsetY >= 16 * scaleYN) + return op->emitOpError( + "expect offsetY / scaleYNumerator to be in range [-1, 16), got ") + << offsetY << "/" << scaleYN; + if (offsetX < -scaleXN || offsetX >= 16 * scaleXN) + return op->emitOpError( + "expect offsetX / scaleXNumerator to be in range [-1, 16), got ") + << offsetX << "/" << scaleXN; const int64_t borderY = border[0]; const int64_t borderX = border[1]; - if (borderY < -16 * scaleYN || borderY >= scaleYN) { - op->emitOpError( - "expect borderY / scaleYNumerator to be in range [-16, 1), got ") - << borderY << "/" << scaleYN; - return false; - } - if (borderX < -16 * scaleXN || borderX >= scaleXN) { - op->emitOpError( - "expect borderX / scaleXNumerator to be in range [-16, 1), got ") - << borderX << "/" << scaleXN; - return false; - } + if (borderY < -16 * scaleYN || borderY >= scaleYN) + return op->emitOpError( + "expect borderY / scaleYNumerator to be in range [-16, 1), got ") + << borderY << "/" << scaleYN; + if (borderX < -16 * scaleXN || borderX >= scaleXN) + return op->emitOpError( + "expect borderX / scaleXNumerator to be in range [-16, 1), got ") + << borderX << "/" << scaleXN; // The following section of code is mostly duplicated with ResizeOp::verify(). // @@ -1001,81 +972,72 @@ bool checkErrorIfResize(Operation *op) { if (ih != ShapedType::kDynamic) { const std::optional calculatedOutHeightMinusOne = idivCheck((ih - 1) * scaleYN - offsetY + borderY, scaleYD); - if (!calculatedOutHeightMinusOne.has_value()) { - op->emitOpError("expected (input_height - 1) * scale_y_n - offset_y + " - "border_y ") - << "to be wholly divisible by scale_y_d, got ((" << ih << " - 1) * " - << scaleYN << " - " << offsetY << " + " << borderY << ") / " - << scaleYD; - return false; - } + if (!calculatedOutHeightMinusOne.has_value()) + return op->emitOpError( + "expected (input_height - 1) * scale_y_n - offset_y + " + "border_y ") + << "to be wholly divisible by scale_y_d, got ((" << ih + << " - 1) * " << scaleYN << " - " << offsetY << " + " << borderY + << ") / " << scaleYD; const int64_t calculatedOutHeight = calculatedOutHeightMinusOne.value() + 1; - if (oh != ShapedType::kDynamic && calculatedOutHeight != oh) { - op->emitOpError("calculated output height did not match expected: ") - << "calculated=" << calculatedOutHeight << ", expected=" << oh; - return false; - } + if (oh != ShapedType::kDynamic && calculatedOutHeight != oh) + return op->emitOpError( + "calculated output height did not match expected: ") + << "calculated=" << calculatedOutHeight << ", expected=" << oh; } if (iw != ShapedType::kDynamic) { const std::optional calculatedOutWidthMinusOne = idivCheck((iw - 1) * scaleXN - offsetX + borderX, scaleXD); - if (!calculatedOutWidthMinusOne.has_value()) { - op->emitOpError("expected (input_width - 1) * scale_x_n - offset_x + " - "border_x ") - << "to be wholly divisible by scale_x_d, got ((" << iw << " - 1) * " - << scaleXN << " - " << offsetX << " + " << borderX << ") / " - << scaleXD; - return false; - } + if (!calculatedOutWidthMinusOne.has_value()) + return op->emitOpError( + "expected (input_width - 1) * scale_x_n - offset_x + " + "border_x ") + << "to be wholly divisible by scale_x_d, got ((" << 
iw + << " - 1) * " << scaleXN << " - " << offsetX << " + " << borderX + << ") / " << scaleXD; const int64_t calculatedOutWidth = calculatedOutWidthMinusOne.value() + 1; - if (ow != ShapedType::kDynamic && calculatedOutWidth != ow) { - op->emitOpError("calculated output width did not match expected: ") - << "calculated=" << calculatedOutWidth << ", expected=" << ow; - return false; - } + if (ow != ShapedType::kDynamic && calculatedOutWidth != ow) + return op->emitOpError("calculated output width did not match expected: ") + << "calculated=" << calculatedOutWidth << ", expected=" << ow; } - return true; + return success(); } -bool checkErrorIfMul(Operation *op) { +LogicalResult checkErrorIfMul(Operation *op) { auto mul = dyn_cast(op); if (!mul) - return true; + return success(); // REQUIRE(0 <= shift && shift <= 63); // REQUIRE(is_same() || shift == 0); ElementsAttr shift_elem; - if (!matchPattern(mul.getShift(), m_Constant(&shift_elem))) { - return true; - } + if (!matchPattern(mul.getShift(), m_Constant(&shift_elem))) + return success(); int32_t shift = shift_elem.getValues()[0].getInt(); auto inputElemType = getElementTypeOrSelf(mul.getInput1()); if (inputElemType.isInteger(32)) { // 0 <= shift <= 63 for int32_t type - if (shift < 0 || shift > 63) { - op->emitOpError() << "requires 0 <= shift && shift <= 63, but got: " - << shift; - return false; - } + if (shift < 0 || shift > 63) + return op->emitOpError() + << "requires 0 <= shift && shift <= 63, but got: " << shift; } else { // shift must be 0 for all other types - if (shift != 0) { - op->emitOpError() << "requires shift = 0 for all input data types that " - "are not int32_t, but got: " - << shift; - return false; - } + if (shift != 0) + return op->emitOpError() + << "requires shift = 0 for all input data types that " + "are not int32_t, but got: " + << shift; } - return true; + return success(); } -bool checkErrorIfTable(Operation *op) { +LogicalResult checkErrorIfTable(Operation *op) { auto table = dyn_cast(op); if (!table) - return true; + return success(); // REQUIRE(length(table) == TABLE_SIZE) where TABLE_SIZE is 256 or 513 const auto inputElemType = getElementTypeOrSelf(table.getInput1().getType()); @@ -1084,26 +1046,24 @@ bool checkErrorIfTable(Operation *op) { const ShapeAdaptor tableShape(table.getTable().getType()); if (tableShape.hasStaticShape()) { const auto numElements = tableShape.getNumElements(); - if (numElements != tableSize) { - op->emitOpError() << "requires table size of " << tableSize << ", got " - << numElements; - return false; - } + if (numElements != tableSize) + return op->emitOpError() << "requires table size of " << tableSize + << ", got " << numElements; } - return true; + return success(); } -bool checkErrorIfRescale(Operation *op) { +LogicalResult checkErrorIfRescale(Operation *op) { auto rescale = dyn_cast(op); if (!rescale) - return true; + return success(); auto inputType = llvm::dyn_cast(rescale.getInput().getType()); auto outputType = llvm::dyn_cast(rescale.getOutput().getType()); if (!inputType || !outputType || !inputType.getElementType().isInteger() || !outputType.getElementType().isInteger()) - return true; + return success(); auto inElemType = inputType.getElementType(); auto outElemType = outputType.getElementType(); @@ -1117,81 +1077,65 @@ bool checkErrorIfRescale(Operation *op) { auto roundingMode = rescale.getRoundingMode(); // ERROR_IF(scale32 && is_same()) - if (scale32 && inWidth == 48) { - op->emitOpError() << "scale32 is not allowed with 48-bit input."; - return false; - } + if 
(scale32 && inWidth == 48) + return op->emitOpError() << "scale32 is not allowed with 48-bit input."; // ERROR_IF(!scale32 && (rounding_mode == DOUBLE_ROUND)) - if (!scale32 && roundingMode == RoundingMode::DOUBLE_ROUND) { - op->emitOpError() << "DOUBLE_ROUND is only allowed with scale32=true."; - return false; - } + if (!scale32 && roundingMode == RoundingMode::DOUBLE_ROUND) + return op->emitOpError() + << "DOUBLE_ROUND is only allowed with scale32=true."; // ERROR_IF(input_unsigned && output_unsigned) - if (inputUnsigned && outputUnsigned) { - op->emitOpError() << "input and output cannot be both unsigned."; - return false; - } + if (inputUnsigned && outputUnsigned) + return op->emitOpError() << "input and output cannot be both unsigned."; // ERROR_IF(is_same() && input_unsigned) - if (outWidth == 32 && inputUnsigned) { - op->emitOpError() << "i32 output type is not allowed with unsigned input."; - return false; - } + if (outWidth == 32 && inputUnsigned) + return op->emitOpError() + << "i32 output type is not allowed with unsigned input."; // ERROR_IF(is_same() && output_unsigned) - if (inWidth == 32 && outputUnsigned) { - op->emitOpError() << "i32 input type is not allowed with unsigned output."; - return false; - } + if (inWidth == 32 && outputUnsigned) + return op->emitOpError() + << "i32 input type is not allowed with unsigned output."; // ERROR_IF(is_same() && output_unsigned) - if (inWidth == 48 && outputUnsigned) { - op->emitOpError() << "i48 input type is not allowed with unsigned output."; - return false; - } + if (inWidth == 48 && outputUnsigned) + return op->emitOpError() + << "i48 input type is not allowed with unsigned output."; // ERROR_IF(is_same && input_unsigned) - if (inWidth == 48 && inputUnsigned) { - op->emitOpError() << "i48 input type cannot be unsigned."; - return false; - } + if (inWidth == 48 && inputUnsigned) + return op->emitOpError() << "i48 input type cannot be unsigned."; // ERROR_IF(is_same && input_unsigned) - if (inWidth == 32 && inputUnsigned) { - op->emitOpError() << "i32 input type cannot be unsigned."; - return false; - } + if (inWidth == 32 && inputUnsigned) + return op->emitOpError() << "i32 input type cannot be unsigned."; // ERROR_IF(is_same && output_unsigned) - if (outWidth == 32 && outputUnsigned) { - op->emitOpError() << "i32 output type cannot be unsigned."; - return false; - } + if (outWidth == 32 && outputUnsigned) + return op->emitOpError() << "i32 output type cannot be unsigned."; - return true; + return success(); } -bool checkErrorIfPad(Operation *op) { +LogicalResult checkErrorIfPad(Operation *op) { auto pad = dyn_cast(op); if (!pad) - return true; + return success(); DenseIntElementsAttr paddingAttr; if (!matchPattern(pad.getPadding(), m_Constant(&paddingAttr))) // Pad verifier will catch this - return true; + return success(); for (const APInt &val : paddingAttr.getValues()) { - if (val.getSExtValue() < 0) { - op->emitOpError() << "padding value must all be non-negative, got " - << val.getSExtValue(); - return false; - } + if (val.getSExtValue() < 0) + return op->emitOpError() << "padding value must all be non-negative, got " + << val.getSExtValue(); } - return true; + return success(); } static bool isOpIsolatedWithinRegion(Operation *op, Region *region) { @@ -1201,7 +1145,7 @@ static bool isOpIsolatedWithinRegion(Operation *op, Region *region) { }); } -static bool isRegionIsolatedFromAbove(Region ®ionToCheck) { +static LogicalResult isRegionIsolatedFromAbove(Region ®ionToCheck) { bool noLiveInValue = true; 
regionToCheck.walk([&noLiveInValue, ®ionToCheck](Operation *op) { if (!isOpIsolatedWithinRegion(op, ®ionToCheck)) { @@ -1210,23 +1154,22 @@ static bool isRegionIsolatedFromAbove(Region ®ionToCheck) { } return WalkResult::advance(); }); - return noLiveInValue; + return noLiveInValue ? success() : failure(); } LogicalResult checkIsolatedRegion(Operation *op, Region ®ionToCheck, StringRef regionName) { - if (isRegionIsolatedFromAbove(regionToCheck)) + if (succeeded(isRegionIsolatedFromAbove(regionToCheck))) return success(); - op->emitOpError() - << "is not conformant to the TOSA specification. It requires the '" - << regionName << "' region is isolated from above.\n"; - return failure(); + return op->emitOpError() + << "is not conformant to the TOSA specification. It requires the '" + << regionName << "' region is isolated from above.\n"; } -bool checkErrorIfCondIf(Operation *op) { +LogicalResult checkErrorIfCondIf(Operation *op) { auto ifOp = dyn_cast(op); if (!ifOp) - return true; + return success(); // Currently the dialect supports declaring cond_if operations that // have then/else regions that reference values from outside these @@ -1257,49 +1200,53 @@ bool checkErrorIfCondIf(Operation *op) { // tosa.yield %arg4 // } - return failed(checkIsolatedRegion(op, ifOp.getThenGraph(), "then")) || - failed(checkIsolatedRegion(op, ifOp.getElseGraph(), "else")); + if (failed(checkIsolatedRegion(op, ifOp.getThenGraph(), "then")) || + failed(checkIsolatedRegion(op, ifOp.getElseGraph(), "else"))) + return failure(); + return success(); } -bool checkErrorIfWhileLoop(Operation *op) { +LogicalResult checkErrorIfWhileLoop(Operation *op) { auto whileOp = dyn_cast(op); if (!whileOp) - return true; + return success(); - return failed(checkIsolatedRegion(op, whileOp.getCondGraph(), "cond")) || - failed(checkIsolatedRegion(op, whileOp.getBodyGraph(), "body")); + if (failed(checkIsolatedRegion(op, whileOp.getCondGraph(), "cond")) || + failed(checkIsolatedRegion(op, whileOp.getBodyGraph(), "body"))) + return failure(); + return success(); } -bool checkErrorIfScatter(Operation *op) { +LogicalResult checkErrorIfScatter(Operation *op) { auto scatterOp = dyn_cast(op); if (!scatterOp) - return true; + return success(); // for constant indices, check that there are no duplicate values DenseIntElementsAttr indicesAttr; if (!matchPattern(scatterOp.getIndices(), m_Constant(&indicesAttr))) - return true; + return success(); auto const indicesType = dyn_cast(scatterOp.getIndices().getType()); if (!indicesType || !indicesType.hasRank()) { op->emitOpError("expect ranked indices tensor"); - return false; + return failure(); } if (!hasUniqueConstantScatterIndices(indicesType, indicesAttr)) { op->emitOpError("indices values contain duplicates"); - return false; + return failure(); } - return true; + return success(); } LogicalResult TosaValidation::applyErrorIfCheck(Operation *op) { - if (!checkErrorIfResize(op) || !checkErrorIfMul(op) || - !checkErrorIfTable(op) || !checkErrorIfRescale(op) || - !checkErrorIfPad(op) || !checkErrorIfCondIf(op) || - !checkErrorIfWhileLoop(op) || !checkErrorIfScatter(op)) + if (failed(checkErrorIfResize(op)) || failed(checkErrorIfMul(op)) || + failed(checkErrorIfTable(op)) || failed(checkErrorIfRescale(op)) || + failed(checkErrorIfPad(op)) || failed(checkErrorIfCondIf(op)) || + failed(checkErrorIfWhileLoop(op)) || failed(checkErrorIfScatter(op))) return failure(); return success(); } diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 
347141e2773b8..eb4686997c1b9 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -396,14 +396,31 @@ std::optional vector::getConstantVscaleMultiplier(Value value) { return {}; } -/// Converts an IntegerAttr to have the specified type if needed. -/// This handles cases where integer constant attributes have a different type -/// than the target element type. -static IntegerAttr convertIntegerAttr(IntegerAttr intAttr, Type expectedType) { - if (intAttr.getType() == expectedType) - return intAttr; // Already correct type +/// Converts numeric attributes to the expected type. Supports +/// integer-to-integer and float-to-integer conversions. Returns the original +/// attribute if no conversion is needed or supported. +static Attribute convertNumericAttr(Attribute attr, Type expectedType) { + // Integer-to-integer conversion + if (auto intAttr = dyn_cast(attr)) { + if (auto intType = dyn_cast(expectedType)) { + if (intAttr.getType() != expectedType) + return IntegerAttr::get(expectedType, intAttr.getInt()); + } + return attr; + } + + // Float-to-integer bitcast (preserves bit representation) + if (auto floatAttr = dyn_cast(attr)) { + auto intType = dyn_cast(expectedType); + if (!intType) + return attr; + + APFloat floatVal = floatAttr.getValue(); + APInt intVal = floatVal.bitcastToAPInt(); + return IntegerAttr::get(expectedType, intVal); + } - return IntegerAttr::get(expectedType, intAttr.getInt()); + return attr; } //===----------------------------------------------------------------------===// @@ -2473,16 +2490,11 @@ static OpFoldResult foldFromElementsToConstant(FromElementsOp fromElementsOp, if (!destEltType.isIntOrIndexOrFloat() && !isa(destEltType)) return {}; - // Convert integer attributes to the target type if needed, leave others - // unchanged. - auto convertedElements = - llvm::map_to_vector(elements, [&](Attribute attr) -> Attribute { - if (auto intAttr = dyn_cast(attr)) { - return convertIntegerAttr(intAttr, destEltType); - } - return attr; // Non-integer attributes (FloatAttr, etc.) returned - // unchanged - }); + // Constant attributes might have a different type than the return type. + // Convert them before creating the dense elements attribute. + auto convertedElements = llvm::map_to_vector(elements, [&](Attribute attr) { + return convertNumericAttr(attr, destEltType); + }); return DenseElementsAttr::get(destVecType, convertedElements); } @@ -3503,19 +3515,13 @@ foldDenseElementsAttrDestInsertOp(InsertOp insertOp, Attribute srcAttr, SmallVector insertedValues; Type destEltType = destTy.getElementType(); - /// Converts integer attributes to the expected type if there's a mismatch. - /// Non-integer attributes are left unchanged. + /// Converts attribute to the expected type if there's + /// a mismatch. 
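// Worked example of the float-to-integer bitcast handled by
// convertNumericAttr above (assuming IEEE-754 f32): inserting a FloatAttr of
// 1.0 : f32 into an i32 destination is expected to produce
// IntegerAttr(i32, 0x3F800000), i.e. 1065353216; the bit pattern, not the
// numeric value, is preserved.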
if (auto denseSource = llvm::dyn_cast(srcAttr)) { for (auto value : denseSource.getValues()) - if (auto intAttr = dyn_cast(value)) - insertedValues.push_back(convertIntegerAttr(intAttr, destEltType)); - else - insertedValues.push_back(value); // Non-integer attributes unchanged + insertedValues.push_back(convertNumericAttr(value, destEltType)); } else { - if (auto intAttr = dyn_cast(srcAttr)) - insertedValues.push_back(convertIntegerAttr(intAttr, destEltType)); - else - insertedValues.push_back(srcAttr); // Non-integer attributes unchanged + insertedValues.push_back(convertNumericAttr(srcAttr, destEltType)); } auto allValues = llvm::to_vector(denseDst.getValues()); @@ -5099,6 +5105,14 @@ void TransferReadOp::getCanonicalizationPatterns(RewritePatternSet &results, results.add(context); } +FailureOr>> +TransferReadOp::bubbleDownCasts(OpBuilder &builder) { + if (!hasPureBufferSemantics()) + return failure(); + return mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(getBaseMutable(), + getResult()); +} + //===----------------------------------------------------------------------===// // TransferWriteOp //===----------------------------------------------------------------------===// @@ -5586,6 +5600,14 @@ void TransferWriteOp::getCanonicalizationPatterns(RewritePatternSet &results, results.add(context); } +FailureOr>> +TransferWriteOp::bubbleDownCasts(OpBuilder &builder) { + if (!hasPureBufferSemantics()) + return failure(); + return mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(getBaseMutable(), + ValueRange()); +} + //===----------------------------------------------------------------------===// // LoadOp //===----------------------------------------------------------------------===// @@ -5640,6 +5662,12 @@ std::optional> LoadOp::getShapeForUnroll() { return llvm::to_vector<4>(getVectorType().getShape()); } +FailureOr>> +LoadOp::bubbleDownCasts(OpBuilder &builder) { + return mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(getBaseMutable(), + getResult()); +} + //===----------------------------------------------------------------------===// // StoreOp //===----------------------------------------------------------------------===// @@ -5679,6 +5707,12 @@ std::optional> StoreOp::getShapeForUnroll() { return llvm::to_vector<4>(getVectorType().getShape()); } +FailureOr>> +StoreOp::bubbleDownCasts(OpBuilder &builder) { + return mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(getBaseMutable(), + ValueRange()); +} + //===----------------------------------------------------------------------===// // MaskedLoadOp //===----------------------------------------------------------------------===// @@ -5733,6 +5767,12 @@ OpFoldResult MaskedLoadOp::fold(FoldAdaptor) { return OpFoldResult(); } +FailureOr>> +MaskedLoadOp::bubbleDownCasts(OpBuilder &builder) { + return mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(getBaseMutable(), + getResult()); +} + //===----------------------------------------------------------------------===// // MaskedStoreOp //===----------------------------------------------------------------------===// @@ -5783,6 +5823,12 @@ LogicalResult MaskedStoreOp::fold(FoldAdaptor adaptor, return memref::foldMemRefCast(*this); } +FailureOr>> +MaskedStoreOp::bubbleDownCasts(OpBuilder &builder) { + return mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(getBaseMutable(), + ValueRange()); +} + //===----------------------------------------------------------------------===// // GatherOp //===----------------------------------------------------------------------===// @@ -5886,6 
+5932,12 @@ void GatherOp::getCanonicalizationPatterns(RewritePatternSet &results, results.add(context); } +FailureOr>> +GatherOp::bubbleDownCasts(OpBuilder &builder) { + return mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(getBaseMutable(), + getResult()); +} + //===----------------------------------------------------------------------===// // ScatterOp //===----------------------------------------------------------------------===// @@ -5948,6 +6000,12 @@ void ScatterOp::getCanonicalizationPatterns(RewritePatternSet &results, results.add(context); } +FailureOr>> +ScatterOp::bubbleDownCasts(OpBuilder &builder) { + return mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(getBaseMutable(), + ValueRange()); +} + //===----------------------------------------------------------------------===// // ExpandLoadOp //===----------------------------------------------------------------------===// @@ -5996,6 +6054,12 @@ void ExpandLoadOp::getCanonicalizationPatterns(RewritePatternSet &results, results.add(context); } +FailureOr>> +ExpandLoadOp::bubbleDownCasts(OpBuilder &builder) { + return mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(getBaseMutable(), + getResult()); +} + //===----------------------------------------------------------------------===// // CompressStoreOp //===----------------------------------------------------------------------===// @@ -6042,6 +6106,12 @@ void CompressStoreOp::getCanonicalizationPatterns(RewritePatternSet &results, results.add(context); } +FailureOr>> +CompressStoreOp::bubbleDownCasts(OpBuilder &builder) { + return mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(getBaseMutable(), + ValueRange()); +} + //===----------------------------------------------------------------------===// // ShapeCastOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateMaskedLoadStore.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateMaskedLoadStore.cpp index 78f74eef7bee3..bdbb792041e3d 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateMaskedLoadStore.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateMaskedLoadStore.cpp @@ -64,7 +64,6 @@ struct VectorMaskedLoadOpConverter final Value mask = maskedLoadOp.getMask(); Value base = maskedLoadOp.getBase(); Value iValue = maskedLoadOp.getPassThru(); - std::optional alignment = maskedLoadOp.getAlignment(); auto indices = llvm::to_vector_of(maskedLoadOp.getIndices()); Value one = arith::ConstantOp::create(rewriter, loc, indexType, IntegerAttr::get(indexType, 1)); @@ -76,7 +75,7 @@ struct VectorMaskedLoadOpConverter final [&](OpBuilder &builder, Location loc) { auto loadedValue = memref::LoadOp::create( builder, loc, base, indices, /*nontemporal=*/false, - alignment.value_or(0)); + llvm::MaybeAlign(maskedLoadOp.getAlignment().value_or(0))); auto combinedValue = vector::InsertOp::create(builder, loc, loadedValue, iValue, i); scf::YieldOp::create(builder, loc, combinedValue.getResult()); @@ -135,7 +134,6 @@ struct VectorMaskedStoreOpConverter final Value base = maskedStoreOp.getBase(); Value value = maskedStoreOp.getValueToStore(); bool nontemporal = false; - std::optional alignment = maskedStoreOp.getAlignment(); auto indices = llvm::to_vector_of(maskedStoreOp.getIndices()); Value one = arith::ConstantOp::create(rewriter, loc, indexType, IntegerAttr::get(indexType, 1)); @@ -145,8 +143,9 @@ struct VectorMaskedStoreOpConverter final auto ifOp = scf::IfOp::create(rewriter, loc, maskBit, /*else=*/false); 
rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front()); auto extractedValue = vector::ExtractOp::create(rewriter, loc, value, i); - memref::StoreOp::create(rewriter, loc, extractedValue, base, indices, - nontemporal, alignment.value_or(0)); + memref::StoreOp::create( + rewriter, loc, extractedValue, base, indices, nontemporal, + llvm::MaybeAlign(maskedStoreOp.getAlignment().value_or(0))); rewriter.setInsertionPointAfter(ifOp); indices.back() = diff --git a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp index 6551a60b5812e..025ee9a04a1de 100644 --- a/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp +++ b/mlir/lib/Dialect/Vector/Utils/VectorUtils.cpp @@ -319,7 +319,7 @@ bool vector::isLinearizableVector(VectorType type) { Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc, Value source, ArrayRef inputVectorSizes, - Value padValue, + std::optional padValue, bool useInBoundsInsteadOfMasking, ArrayRef inputScalableVecDims) { assert(!llvm::is_contained(inputVectorSizes, ShapedType::kDynamic) && @@ -328,9 +328,11 @@ Value vector::createReadOrMaskedRead(OpBuilder &builder, Location loc, auto sourceShape = sourceShapedType.getShape(); assert(sourceShape.size() == inputVectorSizes.size() && "expected same ranks."); - auto vectorType = VectorType::get(inputVectorSizes, padValue.getType(), - inputScalableVecDims); - assert(padValue.getType() == sourceShapedType.getElementType() && + auto vectorType = + VectorType::get(inputVectorSizes, sourceShapedType.getElementType(), + inputScalableVecDims); + assert((!padValue.has_value() || + padValue.value().getType() == sourceShapedType.getElementType()) && "expected same pad element type to match source element type"); int64_t readRank = inputVectorSizes.size(); auto zero = arith::ConstantIndexOp::create(builder, loc, 0); diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 20608f97611bb..81b5788d0b9b4 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -23,7 +23,7 @@ namespace mlir { namespace xegpu { -bool isSharedMemory(const MemRefType &memrefTy) { +static bool isSharedMemory(const MemRefType &memrefTy) { Attribute attr = memrefTy.getMemorySpace(); if (auto intAttr = llvm::dyn_cast(attr)) return intAttr.getInt() == 3; @@ -340,7 +340,7 @@ LogicalResult CreateNdDescOp::verify() { return success(); } -ParseResult parseOptionalDynamicIndexList( +static ParseResult parseOptionalDynamicIndexList( OpAsmParser &parser, SmallVectorImpl &values, DenseI64ArrayAttr &integers, SmallVectorImpl *valueTypes = nullptr, @@ -378,9 +378,9 @@ ParseResult parseOptionalDynamicIndexList( return success(); } -void printOptionalDynamicIndexList(OpAsmPrinter &printer, Operation *op, - OperandRange values, - DenseI64ArrayAttr integers) { +static void printOptionalDynamicIndexList(OpAsmPrinter &printer, Operation *op, + OperandRange values, + DenseI64ArrayAttr integers) { if (!integers || integers.empty()) return; printDynamicIndexList(printer, op, values, integers, diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 7efa4b9fbd934..36c498e8b849d 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -319,7 +319,8 @@ void XeGPUBlockingPass::runOnOperation() { options.setNativeShapeFn([&](Operation *op) { return getTileShape(op); }); - options.setUnrolledTypesFn([&](ShapedType 
type, ArrayRef tileShape) { + options.setUnrolledTypesFn([&](ShapedType type, ArrayRef tileShape, + bool returnSingleType = false) { Type elemTy = type.getElementType(); Type newTy; @@ -352,6 +353,8 @@ void XeGPUBlockingPass::runOnOperation() { newTy = type.clone(tileShape, elemTy); } + if (returnSingleType) + return SmallVector{newTy}; std::optional> ratio = computeShapeRatio(type.getShape(), tileShape); assert(ratio && "The shape of the type must be a multiple of tileShape."); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 449b8eb030b07..882691fd19f58 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -268,7 +268,7 @@ struct MoveFuncBodyToWarpExecuteOnLane0 /// %r = gpu.warp_execute_on_lane_0(%laneid) -> /// (!xegpu.tensor_desc<4x8xf32, #layout0>) { /// ... -/// %td = xegpu.create_nd_tdesc %arg0[0, 0] +/// %td = xegpu.create_nd_tdesc %arg0 /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0> /// vector.yield %td /// } @@ -277,11 +277,11 @@ struct MoveFuncBodyToWarpExecuteOnLane0 /// ``` /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) { /// ... -/// %dead = xegpu.create_nd_tdesc %arg0[0, 0] +/// %dead = xegpu.create_nd_tdesc %arg0 /// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0> /// vector.yield %arg0, %dead /// } -/// %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32> +/// %td = xegpu.create_nd_tdesc %r#0: memref<4x8xf32> /// -> !xegpu.tensor_desc<4x8xf32> /// /// ``` @@ -301,6 +301,10 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern { if (!layout) return rewriter.notifyMatchFailure( descOp, "the tensor descriptor lacks layout attribute"); + // CreateNdOp must not have offsets. + if (descOp.getMixedOffsets().size()) + return rewriter.notifyMatchFailure( + descOp, "xegpu::CreateNdDescOp must not have offsets"); SmallVector newRetIndices; rewriter.setInsertionPoint(warpOp); @@ -339,22 +343,23 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern { /// #layout0 = #xegpu.layout /// gpu.warp_execute_on_lane_0(%laneid) -> () { /// ... -/// xegpu.store_nd %arg0, %arg1: vector<4x8xf32>, +/// xegpu.store_nd %arg0, %arg1 [%x, %y]: vector<4x8xf32>, /// !xegpu.tensor_desc<4x8xf32, #layout0> /// } /// ``` /// To /// ``` /// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>, -/// !xegpu.tensor_desc<4x8xf32, #layout0>) { -/// gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32, -/// #layout0> +/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) { +/// ... +/// gpu.yield %arg0, %arg1, %x, %y: vector<4x8xf32>, +/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index /// } /// %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32> /// %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32, /// #layout0> /// -> !xegpu.tensor_desc<4x8xf32> -/// xegpu.store_nd %0, %1: vector<4xf32>, +/// xegpu.store_nd %0, %1 [%r#2, %r#3]: vector<4xf32>, /// !xegpu.tensor_desc<4x8xf32> /// /// ``` @@ -368,10 +373,15 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern { if (!storeOp) return failure(); - int64_t offsetSize = static_cast(storeOp.getOffsets().size()); - if ((offsetSize != 0) || storeOp.getConstOffsetsAttr()) - return failure(); - + SmallVector offsets = storeOp.getMixedOffsets(); + // Expecting offsets to be present. 
+ if (offsets.empty()) + return rewriter.notifyMatchFailure(storeOp, + "the store op must have offsets"); + SmallVector offsetsAsValues = + vector::getAsValues(rewriter, storeOp.getLoc(), offsets); + SmallVector offsetTypes = llvm::to_vector( + llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); })); xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType(); xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr(); if (!layout) @@ -387,13 +397,13 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern { distributedTypeByWarpOpOrFailure.value(); SmallVector newRetIndices; + SmallVector newYieldedValues = {storeOp.getValue(), + storeOp.getTensorDesc()}; + SmallVector newYieldedTypes = {distributedTypeByWarpOp, tensorDescTy}; + newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end()); + newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end()); gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, warpOp, - /* new yielded values = */ - ValueRange{storeOp.getValue(), storeOp.getTensorDesc()}, - /* new yielded types = */ - TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()}, - newRetIndices); + rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices); // Create a new store op outside the warp op with the distributed vector // type. Tensor descriptor is not distributed. rewriter.setInsertionPointAfter(newWarpOp); @@ -418,6 +428,9 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern { newStoreOperands.push_back( resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]), distributedTensorDescTy, rewriter)); + // Collect offsets. + for (size_t i = 2; i < newRetIndices.size(); ++i) + newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[i])); auto newStoreOp = xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{}, @@ -491,9 +504,15 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { loadOp, "xegpu::LoadNdOp require chip information to determine transpose " "requirement"); - int64_t offsetSize = static_cast(loadOp.getOffsets().size()); - if ((offsetSize != 0) || loadOp.getConstOffsetsAttr()) - return failure(); + // Expecting offsets to be present. + SmallVector offsets = loadOp.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(loadOp, + "the load op must have offsets"); + SmallVector offsetsAsValues = + vector::getAsValues(rewriter, loadOp.getLoc(), offsets); + SmallVector offsetTypes = llvm::to_vector( + llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); })); xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType(); xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr(); @@ -506,10 +525,12 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { cast(warpOp.getResult(operandIdx).getType()); SmallVector newRetIndices; + SmallVector newYieldedValues = {loadOp.getTensorDesc()}; + SmallVector newYieldedTypes = {tensorDescTy}; + newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end()); + newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end()); gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, warpOp, - /* new yielded values = */ loadOp.getTensorDesc(), - /* new yielded types = */ tensorDescTy, newRetIndices); + rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices); // Create a new load op outside the warp op with the distributed vector // type. 
@@ -523,11 +544,15 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { loadOp.getTensorDescType().dropLayouts(); // Distributed tensor // descriptor type does not // contain layout info. + SmallVector newLoadOperands{ + resolveDistributedTy(newWarpOp.getResult(newRetIndices[0]), + distributedTensorDescTy, rewriter)}; + // Collect offsets. + for (size_t i = 1; i < newRetIndices.size(); ++i) + newLoadOperands.push_back(newWarpOp.getResult(newRetIndices[i])); auto newLoadOp = xegpu::LoadNdOp::create( rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(), - resolveDistributedTy(newWarpOp->getResult(newRetIndices[0]), - distributedTensorDescTy, rewriter), - loadOp->getAttrs()); + newLoadOperands, loadOp->getAttrs()); xegpu::removeLayoutAttrs(newLoadOp); // Set the packed attribute if the layout requires it. newLoadOp.setPacked(requirePacked(layout)); @@ -677,85 +702,6 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { } }; -/// Sink an update_nd_offset op feeding into yield op of an enclosing -/// `gpu.warp_execute_on_lane_0` region. The warp op will still contain the -/// original op that will not be used by the yield op (and should be cleaned -/// up later). The yield op will bypass the updateOp's arguments. The tensor -/// descriptor type is not distributed. Appropriate cast ops are inserted if -/// the distributed types does not match expected xegpu SIMT types. -/// Example: -/// ``` -/// #layout0 = #xegpu.layout -/// %r = gpu.warp_execute_on_lane_0(%laneid) -> -/// (!xegpu.tensor_desc<4x8xf32, #layout0>) { -/// ... -/// %update = xegpu.update_nd_offset %arg0, [%c32, %c16]: -/// !xegpu.tensor_desc<4x8xf32, #layout0> -/// gpu.yield %update -/// } -/// ... -/// ``` -/// To -/// ``` -/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> ( -/// !xegpu.tensor_desc<4x8xf32, #layout0>, -/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) { -/// ... -/// %dead = xegpu.update_nd_offset %arg0, [%c32, %c16]: -/// !xegpu.tensor_desc<4x8xf32, #layout0> gpu.yield %dead, %arg0 -/// gpu.yield %dead, %arg0, %c32, %c16 -/// } -/// %0 = xegpu.unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32, -/// #layout0> -> !xegpu.tensor_desc<4x8xf32> -/// %1 = xegpu.update_nd_offset %0, [%r#2, %r#3]: -/// !xegpu.tensor_desc<4x8xf32> -/// ... -/// ``` -struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern { - using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, - PatternRewriter &rewriter) const override { - OpOperand *operand = - getWarpResult(warpOp, llvm::IsaPred); - if (!operand) - return rewriter.notifyMatchFailure( - warpOp, "warp result is not a xegpu::UpdateNdOffset op"); - auto updateOp = operand->get().getDefiningOp(); - unsigned operandIdx = operand->getOperandNumber(); - - SmallVector newRetIndices; - gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, warpOp, updateOp->getOperands(), updateOp.getOperandTypes(), - newRetIndices); - rewriter.setInsertionPointAfter(newWarpOp); - // new update op does not have layout attribute. - xegpu::TensorDescType distributedTensorDescTy = - updateOp.getTensorDescType().dropLayouts(); - SmallVector newUpdateOperands = - llvm::map_to_vector(newRetIndices, [&](size_t i) { - // For the tensor descriptor operand, the layout attribute is - // dropped after distribution. Types needs to be resolved in this - // case. 
- if (isa(newWarpOp.getResult(i).getType())) { - return resolveDistributedTy(newWarpOp.getResult(i), - distributedTensorDescTy, rewriter); - } - return newWarpOp.getResult(i); - }); - // Create a new update op outside the warp op. - auto newUpdateOp = xegpu::UpdateNdOffsetOp::create( - rewriter, newWarpOp.getLoc(), distributedTensorDescTy, - newUpdateOperands, updateOp->getAttrs()); - xegpu::removeLayoutAttrs(newUpdateOp); - Value distributedVal = newWarpOp.getResult(operandIdx); - // Resolve the distributed type with the original type. - Value typeResolved = resolveDistributedTy( - newUpdateOp.getResult(), distributedVal.getType(), rewriter); - rewriter.replaceAllUsesWith(distributedVal, typeResolved); - return success(); - } -}; - /// Distribute a prefetch_nd op at the end of enclosing /// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are passed /// through the warp op interface they would be propagated as returned values. @@ -769,18 +715,19 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern { /// #layout0 = #xegpu.layout /// gpu.warp_execute_on_lane_0(%laneid) -> () { /// ... -/// xegpu.prefetch_nd %arg0 : !xegpu.tensor_desc<4x8xf32, #layout0> +/// xegpu.prefetch_nd %arg0 [%x, %y] : !xegpu.tensor_desc<4x8xf32, #layout0> /// } /// ``` /// To /// ``` /// %r:1 = gpu.warp_execute_on_lane_0(%laneid) -> ( -/// !xegpu.tensor_desc<4x8xf32, #layout0>) { -/// gpu.yield %arg0: !xegpu.tensor_desc<4x8xf32, #layout0> +/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) { +/// gpu.yield %arg0, %x, %y: !xegpu.tensor_desc<4x8xf32, #layout0>, index, +/// index /// } /// %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32, /// #layout0> -> !xegpu.tensor_desc<4x8xf32> -/// xegpu.prefetch_nd %1 : !xegpu.tensor_desc<4x8xf32> +/// xegpu.prefetch_nd %1 [%r#1, %r#2] : !xegpu.tensor_desc<4x8xf32> /// /// ``` struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern { @@ -793,17 +740,25 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern { if (!prefetchOp) return failure(); - int64_t offsetSize = static_cast(prefetchOp.getOffsets().size()); - if ((offsetSize != 0) || prefetchOp.getConstOffsetsAttr()) - return failure(); + SmallVector offsets = prefetchOp.getMixedOffsets(); + // PrefetchNdOp must have offsets. 
+ if (offsets.empty()) + return rewriter.notifyMatchFailure(prefetchOp, + "the prefetch op must have offsets"); + SmallVector offsetsAsValues = + vector::getAsValues(rewriter, prefetchOp.getLoc(), offsets); + SmallVector offsetTypes = llvm::to_vector( + llvm::map_range(offsetsAsValues, [](Value v) { return v.getType(); })); xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr(); if (!layout) return rewriter.notifyMatchFailure( prefetchOp, "the source tensor descriptor lacks layout attribute"); - SmallVector newYieldValues = {prefetchOp.getTensorDesc()}; - SmallVector newYieldTypes = {prefetchOp.getTensorDescType()}; + SmallVector newYieldValues = {prefetchOp.getTensorDesc()}; + SmallVector newYieldTypes = {prefetchOp.getTensorDescType()}; + newYieldValues.append(offsetsAsValues.begin(), offsetsAsValues.end()); + newYieldTypes.append(offsetTypes.begin(), offsetTypes.end()); SmallVector newRetIndices; gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices); @@ -814,6 +769,9 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern { rewriter.setInsertionPointAfter(newWarpOp); SmallVector newPrefetchOperands = {resolveDistributedTy( newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)}; + // Collect offsets. + for (size_t i = 1; i < newRetIndices.size(); ++i) + newPrefetchOperands.push_back(newWarpOp.getResult(newRetIndices[i])); xegpu::PrefetchNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{}, newPrefetchOperands, prefetchOp->getAttrs()); xegpu::removeLayoutAttrs(prefetchOp); @@ -1456,15 +1414,14 @@ struct XeGPUSubgroupDistributePass final void xegpu::populateXeGPUSubgroupDistributePatterns( RewritePatternSet &patterns) { - patterns - .add( - patterns.getContext(), - /*pattern benefit=*/regularPatternBenefit); + patterns.add( + patterns.getContext(), + /*pattern benefit=*/regularPatternBenefit); patterns.add( patterns.getContext(), /*pattern benefit=*/highPatternBenefit); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index 29c9fcdfebcdb..a178d0fe4b0b0 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -56,8 +56,9 @@ struct UnrollPattern : public OpRewritePattern { } SmallVector getUnrolledTypes(ShapedType type, - ArrayRef tileShape) const { - return options.getUnrolledTypes(type, tileShape); + ArrayRef tileShape, + bool returnSingleType = false) const { + return options.getUnrolledTypes(type, tileShape, returnSingleType); } /// Emulate the the unpack behavior using insert_strided_slice for VectorType @@ -121,53 +122,79 @@ struct UnrollPattern : public OpRewritePattern { xegpu::UnrollOptions options; }; +// Generic helper function for unrolling operations with offsets. +// +// Iterates over tile offsets within the tensor descriptor shape and calls +// the provided createOp function for each computed offset. This is used by +// operations like LoadNd, StoreNd, CreateNdDesc, and PrefetchNd when they +// have explicit offsets that need to be adjusted for each unrolled tile. 
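// Minimal usage sketch (hypothetical shapes): for a tensor descriptor of shape
// 32x32, a target tile of 16x16 and incoming offsets [%x, %y], the createOp
// callback is expected to be invoked four times, with offsets
// (%x+0, %y+0), (%x+0, %y+16), (%x+16, %y+0) and (%x+16, %y+16), and its
// results are collected in that order.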
+SmallVector computeUnrolledOffsets( + SmallVector mixedOffsets, xegpu::TensorDescType tdescTy, + ArrayRef targetShape, + const std::function)> &createOp, + Location loc, PatternRewriter &rewriter) { + int64_t rank = tdescTy.getRank(); + ArrayRef shape = tdescTy.getShape(); + + auto addi = [&](OpFoldResult a, int64_t b) -> Value { + std::optional maybeInt = getConstantIntValue(a); + if (maybeInt) { + return arith::ConstantIndexOp::create(rewriter, loc, *maybeInt + b); + } else { + auto aV = llvm::cast(a); + auto bV = arith::ConstantIndexOp::create(rewriter, loc, b); + return rewriter.createOrFold(loc, aV, bV); + } + }; + + SmallVector oldOffsets = llvm::to_vector( + llvm::drop_begin(mixedOffsets, mixedOffsets.size() - rank)); + auto validIdxes = + llvm::seq(mixedOffsets.size() - rank, mixedOffsets.size()); + + SmallVector newOps; + for (SmallVector offsets : + StaticTileOffsetRange(shape, targetShape)) { + + for (auto [idx, oldOff, offset] : + llvm::zip(validIdxes, oldOffsets, offsets)) + mixedOffsets[idx] = addi(oldOff, offset); + + auto newOp = createOp(mixedOffsets); + newOps.push_back(newOp); + } + return newOps; +} + struct UnrollCreateNdOp : public UnrollPattern { using UnrollPattern::UnrollPattern; LogicalResult matchAndRewrite(xegpu::CreateNdDescOp op, PatternRewriter &rewriter) const override { Location loc = op.getLoc(); xegpu::TensorDescType tdescTy = op.getType(); - int64_t rank = tdescTy.getRank(); - ArrayRef shape = tdescTy.getShape(); std::optional> targetShape = getTargetShape(op); if (!targetShape) return failure(); - auto newTdescTy = getUnrolledTypes(tdescTy, *targetShape)[0]; - - auto addi = [&](OpFoldResult a, int64_t b) -> Value { - std::optional maybeInt = getConstantIntValue(a); - if (maybeInt) { - return arith::ConstantIndexOp::create(rewriter, loc, *maybeInt + b); - } else { - auto aV = llvm::cast(a); - auto bV = arith::ConstantIndexOp::create(rewriter, loc, b); - return rewriter.createOrFold(loc, aV, bV); - } - }; - - SmallVector mixedOffsets = op.getMixedOffsets(); - - // For n-D memrefs where n > rank, we need to handle the last `rank` - // dimensions only, and keep the first `n-rank` dimensions as is. 
- SmallVector oldOffsets = llvm::to_vector( - llvm::drop_begin(mixedOffsets, mixedOffsets.size() - rank)); - auto validIdxes = - llvm::seq(mixedOffsets.size() - rank, mixedOffsets.size()); - SmallVector newOps; - for (SmallVector offsets : - StaticTileOffsetRange(shape, *targetShape)) { - - for (auto [idx, oldOff, offset] : - llvm::zip(validIdxes, oldOffsets, offsets)) - mixedOffsets[idx] = addi(oldOff, offset); + auto newTdescTy = getUnrolledTypes(tdescTy, *targetShape)[0]; + bool hasOffsets = op.getMixedOffsets().size() != 0; + if (!hasOffsets) { auto newOp = xegpu::CreateNdDescOp::create( - rewriter, loc, newTdescTy, op.getSource(), mixedOffsets, - op.getMixedSizes(), op.getMixedStrides()); + rewriter, loc, newTdescTy, op.getSource(), op.getMixedSizes(), + op.getMixedStrides()); newOps.push_back(newOp); + } else { + auto createOp = [&](SmallVector offsets) -> Value { + return xegpu::CreateNdDescOp::create( + rewriter, loc, newTdescTy, op.getSource(), offsets, + op.getMixedSizes(), op.getMixedStrides()); + }; + + newOps = computeUnrolledOffsets(op.getMixedOffsets(), tdescTy, + *targetShape, createOp, loc, rewriter); } Value castOp = unpack(newOps, tdescTy, *targetShape, loc, rewriter); rewriter.replaceOp(op, castOp); @@ -216,17 +243,30 @@ struct UnrollPrefetchNdOp : public UnrollPattern { return failure(); int64_t offsetSize = static_cast(op.getOffsets().size()); - if ((offsetSize != 0) || op.getConstOffsetsAttr()) - return failure(); + bool hasOffsets = (offsetSize != 0) || op.getConstOffsetsAttr(); + + SmallVector convertedTdescTypes = getUnrolledTypes( + tdescTy, *targetShape, /*returnSingleType*/ hasOffsets); - SmallVector convertedTdescTypes = - getUnrolledTypes(tdescTy, *targetShape); SmallVector convertedTdesc = pack( op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter); - for (auto t : convertedTdesc) - xegpu::PrefetchNdOp::create(rewriter, loc, TypeRange(), t, - op->getAttrs()); + if (!hasOffsets) { + for (auto t : convertedTdesc) + xegpu::PrefetchNdOp::create(rewriter, loc, TypeRange(), t, + op->getAttrs()); + } else { + auto createPrefetch = [&](SmallVector offsets) -> Value { + xegpu::PrefetchNdOp::create(rewriter, loc, convertedTdesc[0], offsets, + op.getL1HintAttr(), op.getL2HintAttr(), + op.getL3HintAttr()); + // return dummy Value to satisfy function's signature + return nullptr; + }; + + computeUnrolledOffsets(op.getMixedOffsets(), tdescTy, *targetShape, + createPrefetch, loc, rewriter); + } rewriter.eraseOp(op); return success(); @@ -247,22 +287,33 @@ struct UnrollLoadNdOp : public UnrollPattern { return failure(); int64_t offsetSize = static_cast(op.getOffsets().size()); - if ((offsetSize != 0) || op.getConstOffsetsAttr()) - return failure(); + bool hasOffsets = (offsetSize != 0) || op.getConstOffsetsAttr(); Type elemTy = tdescTy.getElementType(); VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy); - SmallVector convertedTdescTypes = - getUnrolledTypes(tdescTy, *targetShape); + SmallVector convertedTdescTypes = getUnrolledTypes( + tdescTy, *targetShape, /*returnSingleType*/ hasOffsets); + SmallVector convertedTdescs = pack( op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter); - SmallVector newOps; - for (auto t : convertedTdescs) { - auto newOp = - xegpu::LoadNdOp::create(rewriter, loc, newValueTy, t, op->getAttrs()); - newOps.push_back(newOp); + + if (!hasOffsets) { + for (auto t : convertedTdescs) { + auto newOp = xegpu::LoadNdOp::create(rewriter, loc, newValueTy, t, + op->getAttrs()); + newOps.push_back(newOp); + } + } 
else { + auto createLoad = [&](SmallVector offsets) { + return xegpu::LoadNdOp::create( + rewriter, loc, newValueTy, convertedTdescs[0], offsets, + op.getPackedAttr(), op.getTransposeAttr(), op.getL1HintAttr(), + op.getL2HintAttr(), op.getL3HintAttr()); + }; + newOps = computeUnrolledOffsets(op.getMixedOffsets(), tdescTy, + *targetShape, createLoad, loc, rewriter); } Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter); @@ -285,22 +336,36 @@ struct UnrollStoreNdOp : public UnrollPattern { return failure(); int64_t offsetSize = static_cast(op.getOffsets().size()); - if ((offsetSize != 0) || op.getConstOffsetsAttr()) - return failure(); + bool hasOffsets = (offsetSize != 0) || op.getConstOffsetsAttr(); SmallVector convertedValTypes = getUnrolledTypes(valueTy, *targetShape); - SmallVector convertedTdescTypes = - getUnrolledTypes(tdescTy, *targetShape); + SmallVector convertedTdescTypes = getUnrolledTypes( + tdescTy, *targetShape, /*returnSingleType*/ hasOffsets); - SmallVector convertedValues = - pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter); SmallVector convertedTdescs = pack( op.getTensorDesc(), convertedTdescTypes, *targetShape, loc, rewriter); - for (auto [v, t] : llvm::zip(convertedValues, convertedTdescs)) - xegpu::StoreNdOp::create(rewriter, loc, v, t, op.getL1HintAttr(), - op.getL2HintAttr(), op.getL3HintAttr()); + SmallVector convertedValues = + pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter); + if (!hasOffsets) { + for (auto [v, t] : llvm::zip(convertedValues, convertedTdescs)) + xegpu::StoreNdOp::create(rewriter, loc, v, t, op.getL1HintAttr(), + op.getL2HintAttr(), op.getL3HintAttr()); + } else { + size_t valueIndex = 0; + auto createStore = [&](SmallVector offsets) { + xegpu::StoreNdOp::create(rewriter, loc, convertedValues[valueIndex++], + convertedTdescs[0], offsets, + op.getL1HintAttr(), op.getL2HintAttr(), + op.getL3HintAttr()); + // return dummy Value to satisfy function's signature + return nullptr; + }; + + computeUnrolledOffsets(op.getMixedOffsets(), tdescTy, *targetShape, + createStore, loc, rewriter); + } rewriter.eraseOp(op); return success(); @@ -537,6 +602,195 @@ struct UnrollLoadGatherOp : public UnrollPattern { } }; +/// This pattern handles the unrolling of LoadGatherOp with offsets (gathered +/// load). +/// It unrolls the offsets and mask operands accordingly, and creates multiple +/// LoadGatherOp with the unrolled operands. 
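// Rough illustration (hypothetical types): a gathered load producing
// vector<32xf16> from offsets vector<32xindex> and mask vector<32xi1>, with a
// target shape of 16, is expected to become two LoadGatherOps producing
// vector<16xf16>, each consuming the matching 16-element slice of the offsets
// and mask; with chunk_size > 1 the offsets are additionally incremented per
// chunk as done below.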
+struct UnrollLoadGatherOpWithOffset + : public UnrollPattern { + using UnrollPattern::UnrollPattern; + LogicalResult matchAndRewrite(xegpu::LoadGatherOp op, + PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + VectorType valueTy = llvm::dyn_cast(op.getType()); + Value offsets = op.getOffsets(); + Value mask = op.getMask(); + + // Only handle the case where offsets are present (scattered load) + if (!offsets) + return failure(); + + std::optional> targetShape = getTargetShape(op); + if (!targetShape) + return failure(); + + SmallVector targetMaskShape(*targetShape); + int64_t chunkSize = 1; + if (auto chunkSizeAttr = op->getAttr("chunk_size")) { + if (auto intAttr = llvm::dyn_cast(chunkSizeAttr)) + chunkSize = intAttr.getInt(); + } + + // Unroll mask and offsets with correct shape + VectorType maskTy = llvm::dyn_cast(mask.getType()); + VectorType offsetsTy = llvm::dyn_cast(offsets.getType()); + Type elemTy = valueTy.getElementType(); + VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy); + + SmallVector convertedMaskTypes; + SmallVector convertedMasks; + SmallVector convertedOffsetTypes; + SmallVector convertedOffsets; + + if (chunkSize > 1) { + // For chunked loads, mask and offsets have one less dimension + targetMaskShape.pop_back(); + int64_t blockedChunkSize = targetShape->back(); + int64_t numNewChunks = chunkSize / blockedChunkSize; + chunkSize = blockedChunkSize; + + convertedMaskTypes = getUnrolledTypes(maskTy, targetMaskShape); + convertedOffsetTypes = getUnrolledTypes(offsetsTy, targetMaskShape); + + SmallVector convertedMasksBase = + pack(mask, convertedMaskTypes, targetMaskShape, loc, rewriter); + SmallVector convertedOffsetsBase = + pack(offsets, convertedOffsetTypes, targetMaskShape, loc, rewriter); + + for (auto maskVal : convertedMasksBase) + convertedMasks.append(numNewChunks, maskVal); + + for (auto [baseOffset, offsetType] : + llvm::zip(convertedOffsetsBase, convertedOffsetTypes)) { + for (int64_t i = 0; i < numNewChunks; ++i) { + Value inc = arith::ConstantIndexOp::create(rewriter, loc, + i * blockedChunkSize); + Value incVec = + vector::BroadcastOp::create(rewriter, loc, offsetType, inc); + Value offsetVal = + arith::AddIOp::create(rewriter, loc, baseOffset, incVec); + convertedOffsets.push_back(offsetVal); + } + } + } else { + convertedMaskTypes = getUnrolledTypes(maskTy, targetMaskShape); + convertedMasks = + pack(mask, convertedMaskTypes, targetMaskShape, loc, rewriter); + + convertedOffsetTypes = getUnrolledTypes(offsetsTy, *targetShape); + convertedOffsets = + pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter); + } + + SmallVector newOps; + for (auto [o, m] : llvm::zip(convertedOffsets, convertedMasks)) { + auto newOp = xegpu::LoadGatherOp::create( + rewriter, loc, newValueTy, op.getSource(), o, m, + rewriter.getI64IntegerAttr(chunkSize), op.getL1HintAttr(), + op.getL2HintAttr(), op.getL3HintAttr()); + newOps.push_back(newOp); + } + + Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter); + rewriter.replaceOp(op, castOp); + return success(); + } +}; + +/// This pattern handles the unrolling of StoreScatterOp with offsets (scattered +/// store). +/// It unrolls the offsets and mask operands accordingly, and creates multiple +/// StoreScatterOp with the unrolled operands. 
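// The rewrite mirrors the gathered-load pattern above: values, offsets and
// masks are split per target tile (with per-chunk offset increments when
// chunk_size > 1) and one StoreScatterOp is emitted per tile.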
+struct UnrollStoreScatterOpWithOffsets + : public UnrollPattern { + using UnrollPattern::UnrollPattern; + LogicalResult matchAndRewrite(xegpu::StoreScatterOp op, + PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + VectorType valueTy = llvm::dyn_cast(op.getValue().getType()); + Value offsets = op.getOffsets(); + Value mask = op.getMask(); + + // Only handle the case where offsets are present (scattered store) + if (!offsets) + return failure(); + + std::optional> targetShape = getTargetShape(op); + if (!targetShape) + return failure(); + + int64_t chunkSize = 1; + if (auto chunkSizeAttr = op->getAttr("chunk_size")) { + if (auto intAttr = llvm::dyn_cast(chunkSizeAttr)) + chunkSize = intAttr.getInt(); + } + + SmallVector targetMaskShape(*targetShape); + VectorType maskTy = llvm::dyn_cast(mask.getType()); + VectorType offsetsTy = llvm::dyn_cast(offsets.getType()); + + SmallVector convertedMaskTypes; + SmallVector convertedMasks; + SmallVector convertedOffsetTypes; + SmallVector convertedOffsets; + + if (chunkSize > 1) { + targetMaskShape.pop_back(); + int64_t blockedChunkSize = targetShape->back(); + int64_t numNewChunks = chunkSize / blockedChunkSize; + chunkSize = blockedChunkSize; + + convertedMaskTypes = getUnrolledTypes(maskTy, targetMaskShape); + convertedOffsetTypes = getUnrolledTypes(offsetsTy, targetMaskShape); + + SmallVector convertedMasksBase = + pack(mask, convertedMaskTypes, targetMaskShape, loc, rewriter); + SmallVector convertedOffsetsBase = + pack(offsets, convertedOffsetTypes, targetMaskShape, loc, rewriter); + + for (auto maskVal : convertedMasksBase) + convertedMasks.append(numNewChunks, maskVal); + + for (auto [baseOffset, offsetType] : + llvm::zip(convertedOffsetsBase, convertedOffsetTypes)) { + for (int64_t i = 0; i < numNewChunks; ++i) { + Value inc = arith::ConstantIndexOp::create(rewriter, loc, + i * blockedChunkSize); + Value incVec = + vector::BroadcastOp::create(rewriter, loc, offsetType, inc); + Value offsetVal = + arith::AddIOp::create(rewriter, loc, baseOffset, incVec); + convertedOffsets.push_back(offsetVal); + } + } + } else { + convertedMaskTypes = getUnrolledTypes(maskTy, targetMaskShape); + convertedMasks = + pack(mask, convertedMaskTypes, targetMaskShape, loc, rewriter); + + convertedOffsetTypes = getUnrolledTypes(offsetsTy, *targetShape); + convertedOffsets = + pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter); + } + + SmallVector convertedValTypes = + getUnrolledTypes(valueTy, *targetShape); + SmallVector convertedValues = + pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter); + + for (auto [v, o, m] : + llvm::zip(convertedValues, convertedOffsets, convertedMasks)) { + xegpu::StoreScatterOp::create(rewriter, loc, v, op.getDest(), o, m, + rewriter.getI64IntegerAttr(chunkSize), + op.getL1HintAttr(), op.getL2HintAttr(), + op.getL3HintAttr()); + } + + rewriter.eraseOp(op); + return success(); + } +}; + struct UnrollPrefetchOp : public UnrollPattern { using UnrollPattern::UnrollPattern; LogicalResult matchAndRewrite(xegpu::PrefetchOp op, @@ -766,6 +1020,7 @@ void mlir::xegpu::populateXeGPUUnrollPatterns( .add( + UnrollUpdateOffsetOp, UnrollLoadMatrixOp, UnrollStoreMatrixOp, + UnrollLoadGatherOpWithOffset, UnrollStoreScatterOpWithOffsets>( patterns.getContext(), options); } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index d7592fed6d186..9413a9296b184 100644 --- 
a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -1027,6 +1027,70 @@ struct WgToSgVectorShapeCastOp } }; +/// Pattern for lowering vector.multi_reduction op to subgroup level. +/// Current limitation: the sg_layout in the reduced dimension being 1 +/// so that reduction is local to subgroup & no cross-subgroup communication is +/// needed. +/// TODO: Add cases to handle more general situations which require SLM access. +struct WgToSgMultiDimReductionOp + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(vector::MultiDimReductionOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + VectorType srcType = op.getSourceVectorType(); + VectorType dstType = dyn_cast(op.getResult().getType()); + if (!dstType) + return failure(); + + auto srcShape = srcType.getShape(); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(op.getResult()); + if (!layout || !layout.isForWorkgroup()) + return failure(); + + auto reductionDims = llvm::to_vector(op.getReductionDims()); + + SmallVector sgLayout = llvm::cast(layout) + .getParent() + .getEffectiveSgLayoutAsInt(); + SmallVector sgData = llvm::cast(layout) + .getParent() + .getEffectiveSgDataAsInt(); + + // Check that the sgLayout in the reduced dimension is 1 and + // each sg gets the entire slice to reduce. + for (int64_t dim : reductionDims) { + if (sgLayout[dim] != 1 || sgData[dim] != srcShape[dim]) + return rewriter.notifyMatchFailure( + op, + "sgLayout in each reduced dimension must be 1 and sgData in the " + "reduced dim must match srcShape in that dim"); + } + + SmallVector sgShape = getSgShapeAndCount(srcShape, layout).first; + + VectorType newDstType = + VectorType::get({sgShape}, dstType.getElementType()); + + SmallVector newReductions; + for (auto sgSrc : adaptor.getSource()) { + auto newOp = rewriter.create( + op.getLoc(), newDstType, op.getKind(), sgSrc, adaptor.getAcc()[0], + op.getReductionDims()); + if (!layout.getEffectiveLaneLayoutAsInt().empty() || + !layout.getEffectiveInstDataAsInt().empty()) + xegpu::setDistributeLayoutAttr(newOp->getResult(0), + layout.dropSgLayoutAndData()); + newReductions.push_back(newOp.getResult()); + } + + rewriter.replaceOpWithMultiple(op, {newReductions}); + return success(); + } +}; + } // namespace namespace mlir { @@ -1040,8 +1104,8 @@ void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) { WgToSgElementwiseOp, WgToSgVectorBroadcastOp, WgToSgConvertLayoutOp, WgToSgArithConstantOp, WgToSgLoadGatherOpWithOffset, WgToSgStoreScatterOpWithOffset, WgToSgLoadMatrixOp, - WgToSgStoreMatrixOp, WgToSgVectorStepOp, WgToSgVectorShapeCastOp>( - patterns.getContext()); + WgToSgStoreMatrixOp, WgToSgVectorStepOp, WgToSgVectorShapeCastOp, + WgToSgMultiDimReductionOp>(patterns.getContext()); } } // namespace xegpu } // namespace mlir @@ -1195,6 +1259,11 @@ void XeGPUWgToSgDistributePass::runOnOperation() { return isLegal(xegpu::getDistributeLayoutAttr(op.getResult())); }); + target.addDynamicallyLegalOp( + [=](vector::MultiDimReductionOp op) -> bool { + return isLegal(xegpu::getDistributeLayoutAttr(op.getResult())); + }); + target.addDynamicallyLegalOp( [=](xegpu::ConvertLayoutOp op) -> bool { return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout()); diff --git a/mlir/lib/ExecutionEngine/JitRunner.cpp b/mlir/lib/ExecutionEngine/JitRunner.cpp index 0ada4cc96570a..db0516533afcb 100644 --- 
a/mlir/lib/ExecutionEngine/JitRunner.cpp +++ b/mlir/lib/ExecutionEngine/JitRunner.cpp @@ -271,7 +271,7 @@ Error checkCompatibleReturnType(LLVM::LLVMFuncOp mainFunction) { return Error::success(); } template -Error compileAndExecuteSingleReturnFunction( +static Error compileAndExecuteSingleReturnFunction( Options &options, Operation *module, StringRef entryPoint, CompileAndExecuteConfig config, std::unique_ptr tm) { auto mainFunction = dyn_cast_or_null( diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp index 3d366276b4375..c84e760a3f363 100644 --- a/mlir/lib/IR/Builders.cpp +++ b/mlir/lib/IR/Builders.cpp @@ -14,6 +14,7 @@ #include "mlir/IR/IRMapping.h" #include "mlir/IR/Matchers.h" #include "llvm/ADT/SmallVectorExtras.h" +#include "llvm/Support/DebugLog.h" using namespace mlir; @@ -486,9 +487,25 @@ OpBuilder::tryFold(Operation *op, SmallVectorImpl &results, // Try to fold the operation. SmallVector foldResults; + LDBG() << "Trying to fold: " + << OpWithFlags(op, OpPrintingFlags().skipRegions()); + if (op->getName().getStringRef() == "vector.extract") { + Operation *parent = op->getParentOp(); + while (parent && parent->getName().getStringRef() != "spirv.func") + parent = parent->getParentOp(); + if (parent) + parent->dump(); + } if (failed(op->fold(foldResults))) return cleanupFailure(); + int count = 0; + do { + LDBG() << "Folded in place #" << count + << " times: " << OpWithFlags(op, OpPrintingFlags().skipRegions()); + count++; + } while (foldResults.empty() && succeeded(op->fold(foldResults))); + // An in-place fold does not require generation of any constants. if (foldResults.empty()) return success(); diff --git a/mlir/lib/Interfaces/CMakeLists.txt b/mlir/lib/Interfaces/CMakeLists.txt index fdc19844702bc..388de1c3e5abf 100644 --- a/mlir/lib/Interfaces/CMakeLists.txt +++ b/mlir/lib/Interfaces/CMakeLists.txt @@ -11,6 +11,7 @@ set(LLVM_OPTIONAL_SOURCES InferIntRangeInterface.cpp InferTypeOpInterface.cpp LoopLikeInterface.cpp + MemOpInterfaces.cpp MemorySlotInterfaces.cpp ParallelCombiningOpInterface.cpp RuntimeVerifiableOpInterface.cpp @@ -79,6 +80,7 @@ add_mlir_library(MLIRLoopLikeInterface MLIRFunctionInterfaces ) +add_mlir_interface_library(MemOpInterfaces) add_mlir_interface_library(MemorySlotInterfaces) add_mlir_interface_library(ParallelCombiningOpInterface) add_mlir_interface_library(RuntimeVerifiableOpInterface) diff --git a/mlir/lib/Interfaces/MemOpInterfaces.cpp b/mlir/lib/Interfaces/MemOpInterfaces.cpp new file mode 100644 index 0000000000000..fe5c717f67bc4 --- /dev/null +++ b/mlir/lib/Interfaces/MemOpInterfaces.cpp @@ -0,0 +1,73 @@ +//===- MemOpInterfaces.cpp - Memory operation interfaces ---------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Interfaces/MemOpInterfaces.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Value.h" + +using namespace mlir; + +LogicalResult mlir::detail::verifyMemorySpaceCastOpInterface(Operation *op) { + auto memCastOp = cast(op); + + // Verify that the source and target pointers are valid + Value sourcePtr = memCastOp.getSourcePtr(); + Value targetPtr = memCastOp.getTargetPtr(); + + if (!sourcePtr || !targetPtr) { + return op->emitError() + << "memory space cast op must have valid source and target pointers"; + } + + if (sourcePtr.getType().getTypeID() != targetPtr.getType().getTypeID()) { + return op->emitError() + << "expected source and target types of the same kind"; + } + + // Verify the Types are of `PtrLikeTypeInterface` type. + auto sourceType = dyn_cast(sourcePtr.getType()); + if (!sourceType) { + return op->emitError() + << "source type must implement `PtrLikeTypeInterface`, but got: " + << sourcePtr.getType(); + } + + auto targetType = dyn_cast(targetPtr.getType()); + if (!targetType) { + return op->emitError() + << "target type must implement `PtrLikeTypeInterface`, but got: " + << targetPtr.getType(); + } + + // Verify that the operation has exactly one result + if (op->getNumResults() != 1) { + return op->emitError() + << "memory space cast op must have exactly one result"; + } + + return success(); +} + +FailureOr>> +mlir::detail::bubbleDownInPlaceMemorySpaceCastImpl(OpOperand &operand, + ValueRange results) { + MemorySpaceCastOpInterface castOp = + MemorySpaceCastOpInterface::getIfPromotableCast(operand.get()); + + // Bail if the src is not valid. + if (!castOp) + return failure(); + + // Modify the op. 
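  // Rewire the operand to the cast's source pointer so the access is performed
  // directly on the pre-cast memory space value.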
+ operand.set(castOp.getSourcePtr()); + return std::optional>(); +} + +#include "mlir/Interfaces/MemOpInterfaces.cpp.inc" diff --git a/mlir/lib/Remark/RemarkStreamer.cpp b/mlir/lib/Remark/RemarkStreamer.cpp index 8e3544ff2c34c..d213a1a2068d6 100644 --- a/mlir/lib/Remark/RemarkStreamer.cpp +++ b/mlir/lib/Remark/RemarkStreamer.cpp @@ -20,8 +20,7 @@ LLVMRemarkStreamer::createToFile(llvm::StringRef path, if (ec) return failure(); - auto serOr = llvm::remarks::createRemarkSerializer( - fmt, llvm::remarks::SerializerMode::Separate, f->os()); + auto serOr = llvm::remarks::createRemarkSerializer(fmt, f->os()); if (!serOr) { llvm::consumeError(serOr.takeError()); return failure(); @@ -50,6 +49,12 @@ LLVMRemarkStreamer::~LLVMRemarkStreamer() { if (file && remarkStreamer) file->keep(); } + +void LLVMRemarkStreamer::finalize() { + if (!remarkStreamer) + return; + remarkStreamer->releaseSerializer(); +} } // namespace mlir::remark::detail namespace mlir::remark { diff --git a/mlir/lib/Target/LLVMIR/DebugImporter.cpp b/mlir/lib/Target/LLVMIR/DebugImporter.cpp index 510ec6fe6456f..8b0326518770d 100644 --- a/mlir/lib/Target/LLVMIR/DebugImporter.cpp +++ b/mlir/lib/Target/LLVMIR/DebugImporter.cpp @@ -61,7 +61,8 @@ DICompileUnitAttr DebugImporter::translateImpl(llvm::DICompileUnit *node) { return DICompileUnitAttr::get( context, getOrCreateDistinctID(node), node->getSourceLanguage(), translate(node->getFile()), getStringAttrOrNull(node->getRawProducer()), - node->isOptimized(), emissionKind.value(), nameTableKind.value()); + node->isOptimized(), emissionKind.value(), nameTableKind.value(), + getStringAttrOrNull(node->getRawSplitDebugFilename())); } DICompositeTypeAttr DebugImporter::translateImpl(llvm::DICompositeType *node) { diff --git a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp index a55445deddc2d..eeb87253e5eb8 100644 --- a/mlir/lib/Target/LLVMIR/DebugTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/DebugTranslation.cpp @@ -124,7 +124,9 @@ llvm::DICompileUnit *DebugTranslation::translateImpl(DICompileUnitAttr attr) { attr.getSourceLanguage(), translate(attr.getFile()), attr.getProducer() ? attr.getProducer().getValue() : "", attr.getIsOptimized(), - /*Flags=*/"", /*RV=*/0, /*SplitName=*/{}, + /*Flags=*/"", /*RV=*/0, + attr.getSplitDebugFilename() ? attr.getSplitDebugFilename().getValue() + : "", static_cast( attr.getEmissionKind()), 0, true, false, diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 4921a1990b6e8..53209a40665ae 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2591,13 +2591,34 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, } builder.SetInsertPoint(*regionBlock, (*regionBlock)->begin()); + + // Check if we can generate no-loop kernel + bool noLoopMode = false; + omp::TargetOp targetOp = wsloopOp->getParentOfType(); + if (targetOp) { + Operation *targetCapturedOp = targetOp.getInnermostCapturedOmpOp(); + // We need this check because, without it, noLoopMode would be set to true + // for every omp.wsloop nested inside a no-loop SPMD target region, even if + // that loop is not the top-level SPMD one. 
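  // For example, a wsloop sitting deeper inside the same target region (say,
  // under a nested construct) is not the captured op, so the comparison below
  // leaves it on the regular code path.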
+ if (loopOp == targetCapturedOp) { + omp::TargetRegionFlags kernelFlags = + targetOp.getKernelExecFlags(targetCapturedOp); + if (omp::bitEnumContainsAll(kernelFlags, + omp::TargetRegionFlags::spmd | + omp::TargetRegionFlags::no_loop) && + !omp::bitEnumContainsAny(kernelFlags, + omp::TargetRegionFlags::generic)) + noLoopMode = true; + } + } + llvm::OpenMPIRBuilder::InsertPointOrErrorTy wsloopIP = ompBuilder->applyWorkshareLoop( ompLoc.DL, loopInfo, allocaIP, loopNeedsBarrier, convertToScheduleKind(schedule), chunk, isSimd, scheduleMod == omp::ScheduleModifier::monotonic, scheduleMod == omp::ScheduleModifier::nonmonotonic, isOrdered, - workshareLoopType); + workshareLoopType, noLoopMode); if (failed(handleError(wsloopIP, opInst))) return failure(); @@ -3595,8 +3616,10 @@ getDeclareTargetRefPtrSuffix(LLVM::GlobalOp globalOp, llvm::StringRef(loc.getFilename()), loc.getLine()); }; + auto vfs = llvm::vfs::getRealFileSystem(); os << llvm::format( - "_%x", ompBuilder.getTargetEntryUniqueInfo(fileInfoCallBack).FileID); + "_%x", + ompBuilder.getTargetEntryUniqueInfo(fileInfoCallBack, *vfs).FileID); } os << "_decl_tgt_ref_ptr"; @@ -5425,6 +5448,12 @@ initTargetDefaultAttrs(omp::TargetOp targetOp, Operation *capturedOp, ? llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD : llvm::omp::OMP_TGT_EXEC_MODE_GENERIC : llvm::omp::OMP_TGT_EXEC_MODE_SPMD; + if (omp::bitEnumContainsAll(kernelFlags, + omp::TargetRegionFlags::spmd | + omp::TargetRegionFlags::no_loop) && + !omp::bitEnumContainsAny(kernelFlags, omp::TargetRegionFlags::generic)) + attrs.ExecFlags = llvm::omp::OMP_TGT_EXEC_MODE_SPMD_NO_LOOP; + attrs.MinTeams = minTeamsVal; attrs.MaxTeams.front() = maxTeamsVal; attrs.MinThreads = 1; @@ -5888,10 +5917,12 @@ convertDeclareTargetAttr(Operation *op, mlir::omp::DeclareTargetAttr attribute, lineNo); }; + auto vfs = llvm::vfs::getRealFileSystem(); + ompBuilder->registerTargetGlobalVariable( captureClause, deviceClause, isDeclaration, isExternallyVisible, - ompBuilder->getTargetEntryUniqueInfo(fileInfoCallBack), mangledName, - generatedRefs, /*OpenMPSimd*/ false, targetTriple, + ompBuilder->getTargetEntryUniqueInfo(fileInfoCallBack, *vfs), + mangledName, generatedRefs, /*OpenMPSimd*/ false, targetTriple, /*GlobalInitializer*/ nullptr, /*VariableLinkage*/ nullptr, gVal->getType(), gVal); @@ -5901,9 +5932,9 @@ convertDeclareTargetAttr(Operation *op, mlir::omp::DeclareTargetAttr attribute, ompBuilder->Config.hasRequiresUnifiedSharedMemory())) { ompBuilder->getAddrOfDeclareTargetVar( captureClause, deviceClause, isDeclaration, isExternallyVisible, - ompBuilder->getTargetEntryUniqueInfo(fileInfoCallBack), mangledName, - generatedRefs, /*OpenMPSimd*/ false, targetTriple, gVal->getType(), - /*GlobalInitializer*/ nullptr, + ompBuilder->getTargetEntryUniqueInfo(fileInfoCallBack, *vfs), + mangledName, generatedRefs, /*OpenMPSimd*/ false, targetTriple, + gVal->getType(), /*GlobalInitializer*/ nullptr, /*VariableLinkage*/ nullptr); } } diff --git a/mlir/lib/Target/LLVMIR/Dialect/Ptr/PtrToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/Ptr/PtrToLLVMIRTranslation.cpp index 7e610cd42e931..8d6fffcca45f2 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/Ptr/PtrToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/Ptr/PtrToLLVMIRTranslation.cpp @@ -351,6 +351,42 @@ translateConstantOp(ConstantOp constantOp, llvm::IRBuilderBase &builder, return success(); } +/// Translate ptr.ptr_diff operation operation to LLVM IR. 
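The translation function implemented in the hunk below reduces to three steps: take the raw address of each pointer, subtract them with the requested overflow flags, and cast the difference to the result width. A plain-C++ sketch of the same computation (an analogy, not the MLIR builder code):

#include <cstdint>

// Mirrors the lowering steps for a 32-bit result type.
std::int32_t ptrDiffAsI32(const void *lhs, const void *rhs) {
  std::uintptr_t lhsAddr = reinterpret_cast<std::uintptr_t>(lhs); // ~ CreatePtrToAddr
  std::uintptr_t rhsAddr = reinterpret_cast<std::uintptr_t>(rhs); // ~ CreatePtrToAddr
  std::uintptr_t diff = lhsAddr - rhsAddr;                        // ~ CreateSub (nuw/nsw optional)
  return static_cast<std::int32_t>(diff);                         // ~ CreateIntCast, signed
}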
+static LogicalResult +translatePtrDiffOp(PtrDiffOp ptrDiffOp, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + llvm::Value *lhs = moduleTranslation.lookupValue(ptrDiffOp.getLhs()); + llvm::Value *rhs = moduleTranslation.lookupValue(ptrDiffOp.getRhs()); + + if (!lhs || !rhs) + return ptrDiffOp.emitError("Failed to lookup operands"); + + // Translate result type to LLVM type + llvm::Type *resultType = + moduleTranslation.convertType(ptrDiffOp.getResult().getType()); + if (!resultType) + return ptrDiffOp.emitError("Failed to translate result type"); + + PtrDiffFlags flags = ptrDiffOp.getFlags(); + + // Convert both pointers to integers using ptrtoaddr, and compute the + // difference: lhs - rhs + llvm::Value *llLhs = builder.CreatePtrToAddr(lhs); + llvm::Value *llRhs = builder.CreatePtrToAddr(rhs); + llvm::Value *result = builder.CreateSub( + llLhs, llRhs, /*Name=*/"", + /*HasNUW=*/(flags & PtrDiffFlags::nuw) == PtrDiffFlags::nuw, + /*HasNSW=*/(flags & PtrDiffFlags::nsw) == PtrDiffFlags::nsw); + + // Convert the difference to the expected result type by truncating or + // extending. + if (result->getType() != resultType) + result = builder.CreateIntCast(result, resultType, /*isSigned=*/true); + + moduleTranslation.mapValue(ptrDiffOp.getResult(), result); + return success(); +} + /// Implementation of the dialect interface that translates operations belonging /// to the `ptr` dialect to LLVM IR. class PtrDialectLLVMIRTranslationInterface @@ -371,6 +407,9 @@ class PtrDialectLLVMIRTranslationInterface .Case([&](PtrAddOp ptrAddOp) { return translatePtrAddOp(ptrAddOp, builder, moduleTranslation); }) + .Case([&](PtrDiffOp ptrDiffOp) { + return translatePtrDiffOp(ptrDiffOp, builder, moduleTranslation); + }) .Case([&](LoadOp loadOp) { return translateLoadOp(loadOp, builder, moduleTranslation); }) diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index 7a888bb3778a9..9603813e059d3 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -447,7 +447,7 @@ ModuleImport::processAliasScopeMetadata(const llvm::MDNode *node) { if (verifySelfRef(node)) return DistinctAttr::create(builder.getUnitAttr()); - auto name = cast(node->getOperand(0)); + auto *name = cast(node->getOperand(0)); return builder.getStringAttr(name->getString()); }; @@ -1123,7 +1123,7 @@ void ModuleImport::setExactFlag(llvm::Instruction *inst, Operation *op) const { void ModuleImport::setDisjointFlag(llvm::Instruction *inst, Operation *op) const { auto iface = cast(op); - auto instDisjoint = cast(inst); + auto *instDisjoint = cast(inst); iface.setIsDisjoint(instDisjoint->isDisjoint()); } @@ -1374,7 +1374,7 @@ LogicalResult ModuleImport::convertAlias(llvm::GlobalAlias *alias) { AliasOp aliasOp = AliasOp::create(builder, mlirModule.getLoc(), type, convertLinkageFromLLVM(alias->getLinkage()), alias->getName(), - /*dso_local=*/alias->isDSOLocal(), + /*dsoLocal=*/alias->isDSOLocal(), /*thread_local=*/alias->isThreadLocal(), /*attrs=*/ArrayRef()); globalInsertionOp = aliasOp; @@ -1507,8 +1507,8 @@ LogicalResult ModuleImport::convertGlobal(llvm::GlobalVariable *globalVar) { GlobalOp globalOp = GlobalOp::create( builder, mlirModule.getLoc(), type, globalVar->isConstant(), convertLinkageFromLLVM(globalVar->getLinkage()), StringRef(globalName), - valueAttr, alignment, /*addr_space=*/globalVar->getAddressSpace(), - /*dso_local=*/globalVar->isDSOLocal(), + valueAttr, alignment, /*addrSpace=*/globalVar->getAddressSpace(), + 
/*dsoLocal=*/globalVar->isDSOLocal(), /*thread_local=*/globalVar->isThreadLocal(), /*comdat=*/SymbolRefAttr(), /*attrs=*/ArrayRef(), /*dbgExprs=*/globalExpressionAttrs); globalInsertionOp = globalOp; diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index adc5a74e2031f..5a3eb209f0a92 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -647,7 +647,7 @@ llvm::Constant *mlir::LLVM::detail::getLLVMConstant( llvm::ElementCount::get(numElements, /*Scalable=*/isScalable), child); if (llvmType->isArrayTy()) { auto *arrayType = llvm::ArrayType::get(elementType, numElements); - if (child->isZeroValue()) { + if (child->isZeroValue() && !elementType->isFPOrFPVectorTy()) { return llvm::ConstantAggregateZero::get(arrayType); } else { if (llvm::ConstantDataSequential::isElementTypeCompatible( diff --git a/mlir/lib/Transforms/BubbleDownMemorySpaceCasts.cpp b/mlir/lib/Transforms/BubbleDownMemorySpaceCasts.cpp new file mode 100644 index 0000000000000..00dac19e37171 --- /dev/null +++ b/mlir/lib/Transforms/BubbleDownMemorySpaceCasts.cpp @@ -0,0 +1,69 @@ +//===- BubbleDownMemorySpaceCasts.cpp - Bubble down casts transform -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Transforms/BubbleDownMemorySpaceCasts.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Interfaces/MemOpInterfaces.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/Passes.h" +#include "llvm/Support/Debug.h" + +using namespace mlir; + +namespace mlir { +#define GEN_PASS_DEF_BUBBLEDOWNMEMORYSPACECASTS +#include "mlir/Transforms/Passes.h.inc" +} // namespace mlir + +namespace { +//===----------------------------------------------------------------------===// +// BubbleDownCastsPattern pattern +//===----------------------------------------------------------------------===// +/// Pattern to bubble down casts into consumer operations. 
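The pattern added in the next hunk has to distinguish three outcomes of bubbleDownCasts: failure (nothing to do), an engaged-but-empty result (the op was updated in place), and a list of replacement values. A small stand-alone sketch of that dispatch, with placeholder types standing in for the MLIR classes:

#include <optional>
#include <vector>

// Hypothetical three-state result mirroring bubbleDownCasts():
//   failed            -> nothing matched, leave the op alone.
//   empty optional    -> the op was updated in place.
//   vector of values  -> the op should be replaced with these values.
struct BubbleResult {
  bool failed = true;
  std::optional<std::vector<int>> replacements; // int stands in for mlir::Value
};

enum class Action { Skip, NotifyInPlaceUpdate, ReplaceOp };

Action interpret(const BubbleResult &r) {
  if (r.failed)
    return Action::Skip;                // pattern returns failure()
  if (!r.replacements.has_value())
    return Action::NotifyInPlaceUpdate; // modifyOpInPlace + success()
  return Action::ReplaceOp;             // replaceOp(op, *replacements)
}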
+struct BubbleDownCastsPattern
+    : public OpInterfaceRewritePattern<MemorySpaceCastConsumerOpInterface> {
+  using OpInterfaceRewritePattern::OpInterfaceRewritePattern;
+
+  LogicalResult matchAndRewrite(MemorySpaceCastConsumerOpInterface op,
+                                PatternRewriter &rewriter) const override {
+    FailureOr<std::optional<SmallVector<Value>>> results =
+        op.bubbleDownCasts(rewriter);
+    if (failed(results))
+      return failure();
+    if (!results->has_value()) {
+      rewriter.modifyOpInPlace(op, []() {});
+      return success();
+    }
+    rewriter.replaceOp(op, **results);
+    return success();
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// BubbleDownMemorySpaceCasts pass
+//===----------------------------------------------------------------------===//
+
+struct BubbleDownMemorySpaceCasts
+    : public impl::BubbleDownMemorySpaceCastsBase<BubbleDownMemorySpaceCasts> {
+  using impl::BubbleDownMemorySpaceCastsBase<
+      BubbleDownMemorySpaceCasts>::BubbleDownMemorySpaceCastsBase;
+
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    populateBubbleDownMemorySpaceCastPatterns(patterns, PatternBenefit(1));
+    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
+      signalPassFailure();
+  }
+};
+} // namespace
+
+void mlir::populateBubbleDownMemorySpaceCastPatterns(
+    RewritePatternSet &patterns, PatternBenefit benefit) {
+  patterns.add<BubbleDownCastsPattern>(patterns.getContext(), benefit);
+}
diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt
index 058039e47313e..54b67f5c7a91e 100644
--- a/mlir/lib/Transforms/CMakeLists.txt
+++ b/mlir/lib/Transforms/CMakeLists.txt
@@ -6,6 +6,7 @@ add_mlir_library(MLIRTransforms
   ControlFlowSink.cpp
   CSE.cpp
   GenerateRuntimeVerification.cpp
+  BubbleDownMemorySpaceCasts.cpp
   InlinerPass.cpp
   LocationSnapshot.cpp
   LoopInvariantCodeMotion.cpp
@@ -31,6 +32,7 @@ add_mlir_library(MLIRTransforms
   MLIRAnalysis
   MLIRFunctionInterfaces
   MLIRLoopLikeInterface
+  MLIRMemOpInterfaces
   MLIRMemorySlotInterfaces
   MLIRPass
   MLIRRuntimeVerifiableOpInterface
diff --git a/mlir/lib/Transforms/Mem2Reg.cpp b/mlir/lib/Transforms/Mem2Reg.cpp
index d36a3c1362c19..b3057129fb9fd 100644
--- a/mlir/lib/Transforms/Mem2Reg.cpp
+++ b/mlir/lib/Transforms/Mem2Reg.cpp
@@ -286,7 +286,7 @@ LogicalResult MemorySlotPromotionAnalyzer::computeBlockingUses(
   mlir::getForwardSlice(slot.ptr, &forwardSlice);
   for (Operation *user : forwardSlice) {
     // If the next operation has no blocking uses, everything is fine.
- auto it = userToBlockingUses.find(user); + auto *it = userToBlockingUses.find(user); if (it == userToBlockingUses.end()) continue; diff --git a/mlir/lib/Transforms/RemoveDeadValues.cpp b/mlir/lib/Transforms/RemoveDeadValues.cpp index 0e84b6dd17f29..e0c65b0e09774 100644 --- a/mlir/lib/Transforms/RemoveDeadValues.cpp +++ b/mlir/lib/Transforms/RemoveDeadValues.cpp @@ -88,6 +88,8 @@ struct FunctionToCleanUp { struct OperationToCleanup { Operation *op; BitVector nonLive; + Operation *callee = + nullptr; // Optional: For CallOpInterface ops, stores the callee function }; struct BlockArgsToCleanup { @@ -287,7 +289,8 @@ static void processSimpleOp(Operation *op, RunLivenessAnalysis &la, static void processFuncOp(FunctionOpInterface funcOp, Operation *module, RunLivenessAnalysis &la, DenseSet &nonLiveSet, RDVFinalCleanupList &cl) { - LDBG() << "Processing function op: " << funcOp.getOperation()->getName(); + LDBG() << "Processing function op: " + << OpWithFlags(funcOp, OpPrintingFlags().skipRegions()); if (funcOp.isPublic() || funcOp.isExternal()) { LDBG() << "Function is public or external, skipping: " << funcOp.getOperation()->getName(); @@ -306,19 +309,19 @@ static void processFuncOp(FunctionOpInterface funcOp, Operation *module, nonLiveSet.insert(arg); } - // Do (2). + // Do (2). (Skip creating generic operand cleanup entries for call ops. + // Call arguments will be removed in the call-site specific segment-aware + // cleanup, avoiding generic eraseOperands bitvector mechanics.) SymbolTable::UseRange uses = *funcOp.getSymbolUses(module); for (SymbolTable::SymbolUse use : uses) { Operation *callOp = use.getUser(); assert(isa(callOp) && "expected a call-like user"); - // The number of operands in the call op may not match the number of - // arguments in the func op. - BitVector nonLiveCallOperands(callOp->getNumOperands(), false); - SmallVector callOpOperands = - operandsToOpOperands(cast(callOp).getArgOperands()); - for (int index : nonLiveArgs.set_bits()) - nonLiveCallOperands.set(callOpOperands[index]->getOperandNumber()); - cl.operands.push_back({callOp, nonLiveCallOperands}); + // Push an empty operand cleanup entry so that call-site specific logic in + // cleanUpDeadVals runs (it keys off CallOpInterface). The BitVector is + // intentionally all false to avoid generic erasure. + // Store the funcOp as the callee to avoid expensive symbol lookup later. + cl.operands.push_back({callOp, BitVector(callOp->getNumOperands(), false), + funcOp.getOperation()}); } // Do (3). @@ -746,6 +749,10 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) { // 3. Functions LDBG() << "Cleaning up " << list.functions.size() << " functions"; + // Record which function arguments were erased so we can shrink call-site + // argument segments for CallOpInterface operations (e.g. ops using + // AttrSizedOperandSegments) in the next phase. + DenseMap erasedFuncArgs; for (auto &f : list.functions) { LDBG() << "Cleaning up function: " << f.funcOp.getOperation()->getName(); LDBG() << " Erasing " << f.nonLiveArgs.count() << " non-live arguments"; @@ -754,17 +761,52 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) { // Some functions may not allow erasing arguments or results. These calls // return failure in such cases without modifying the function, so it's okay // to proceed. - (void)f.funcOp.eraseArguments(f.nonLiveArgs); + if (succeeded(f.funcOp.eraseArguments(f.nonLiveArgs))) { + // Record only if we actually erased something. 
+ if (f.nonLiveArgs.any()) + erasedFuncArgs.try_emplace(f.funcOp.getOperation(), f.nonLiveArgs); + } (void)f.funcOp.eraseResults(f.nonLiveRets); } // 4. Operands LDBG() << "Cleaning up " << list.operands.size() << " operand lists"; for (OperationToCleanup &o : list.operands) { - if (o.op->getNumOperands() > 0) { - LDBG() << "Erasing " << o.nonLive.count() - << " non-live operands from operation: " - << OpWithFlags(o.op, OpPrintingFlags().skipRegions()); + // Handle call-specific cleanup only when we have a cached callee reference. + // This avoids expensive symbol lookup and is defensive against future + // changes. + bool handledAsCall = false; + if (o.callee && isa(o.op)) { + auto call = cast(o.op); + auto it = erasedFuncArgs.find(o.callee); + if (it != erasedFuncArgs.end()) { + const BitVector &deadArgIdxs = it->second; + MutableOperandRange args = call.getArgOperandsMutable(); + // First, erase the call arguments corresponding to erased callee + // args. We iterate backwards to preserve indices. + for (unsigned argIdx : llvm::reverse(deadArgIdxs.set_bits())) + args.erase(argIdx); + // If this operand cleanup entry also has a generic nonLive bitvector, + // clear bits for call arguments we already erased above to avoid + // double-erasing (which could impact other segments of ops with + // AttrSizedOperandSegments). + if (o.nonLive.any()) { + // Map the argument logical index to the operand number(s) recorded. + int operandOffset = call.getArgOperands().getBeginOperandIndex(); + for (int argIdx : deadArgIdxs.set_bits()) { + int operandNumber = operandOffset + argIdx; + if (operandNumber < static_cast(o.nonLive.size())) + o.nonLive.reset(operandNumber); + } + } + handledAsCall = true; + } + } + // Perform generic operand erasure for: + // - Non-call operations + // - Call operations without cached callee (where handledAsCall is false) + // But skip call operations that were already handled via segment-aware path + if (!handledAsCall && o.nonLive.any()) { o.op->eraseOperands(o.nonLive); } } diff --git a/mlir/lib/Transforms/Utils/FoldUtils.cpp b/mlir/lib/Transforms/Utils/FoldUtils.cpp index 5e07509871ea2..68ad3acf295c8 100644 --- a/mlir/lib/Transforms/Utils/FoldUtils.cpp +++ b/mlir/lib/Transforms/Utils/FoldUtils.cpp @@ -16,6 +16,7 @@ #include "mlir/IR/Builders.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/Operation.h" +#include "llvm/Support/DebugLog.h" using namespace mlir; @@ -67,7 +68,8 @@ static Operation *materializeConstant(Dialect *dialect, OpBuilder &builder, // OperationFolder //===----------------------------------------------------------------------===// -LogicalResult OperationFolder::tryToFold(Operation *op, bool *inPlaceUpdate) { +LogicalResult OperationFolder::tryToFold(Operation *op, bool *inPlaceUpdate, + int maxIterations) { if (inPlaceUpdate) *inPlaceUpdate = false; @@ -86,7 +88,7 @@ LogicalResult OperationFolder::tryToFold(Operation *op, bool *inPlaceUpdate) { // Try to fold the operation. SmallVector results; - if (failed(tryToFold(op, results))) + if (failed(tryToFold(op, results, maxIterations))) return failure(); // Check to see if the operation was just updated in place. @@ -224,10 +226,19 @@ bool OperationFolder::isFolderOwnedConstant(Operation *op) const { /// Tries to perform folding on the given `op`. If successful, populates /// `results` with the results of the folding. 
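The new maxIterations parameter introduced below bounds how many times the folder re-runs an in-place fold before giving up. The underlying idiom is a bounded fixpoint loop; a minimal sketch, with foldOnce as a hypothetical stand-in for op->fold():

#include <functional>

// Keep retrying an in-place simplification until it stops changing anything
// (foldOnce returns false) or the iteration budget is exhausted. Returns the
// number of successful in-place folds.
int foldToFixpoint(const std::function<bool()> &foldOnce, int maxIterations) {
  int count = 0;
  while (count < maxIterations && foldOnce())
    ++count; // each successful call is one in-place fold
  return count;
}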
LogicalResult OperationFolder::tryToFold(Operation *op, - SmallVectorImpl &results) { + SmallVectorImpl &results, + int maxIterations) { SmallVector foldResults; - if (failed(op->fold(foldResults)) || - failed(processFoldResults(op, results, foldResults))) + if (failed(op->fold(foldResults))) + return failure(); + int count = 1; + do { + LDBG() << "Folded in place #" << count + << " times: " << OpWithFlags(op, OpPrintingFlags().skipRegions()); + } while (count++ < maxIterations && foldResults.empty() && + succeeded(op->fold(foldResults))); + + if (failed(processFoldResults(op, results, foldResults))) return failure(); return success(); } diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt index d6686bb89ce4e..9f5246de6bda0 100644 --- a/mlir/python/CMakeLists.txt +++ b/mlir/python/CMakeLists.txt @@ -873,85 +873,89 @@ if(NOT LLVM_ENABLE_IDE) ) endif() -# _mlir stubgen -# Note: All this needs to come before add_mlir_python_modules(MLIRPythonModules so that the install targets for the -# generated type stubs get created. - -set(_core_type_stub_sources - _mlir/__init__.pyi - _mlir/ir.pyi - _mlir/passmanager.pyi - _mlir/rewrite.pyi -) - -# Note 1: INTERFACE_SOURCES is a genex ($ $) -# which will be evaluated by file(GENERATE ...) inside mlir_generate_type_stubs. This will evaluate to the correct -# thing in the build dir (i.e., actual source dir paths) and in the install dir -# (where it's a conventional path; see install/lib/cmake/mlir/MLIRTargets.cmake). -# -# Note 2: MLIRPythonExtension.Core is the target that is defined using target_sources(INTERFACE) -# **NOT** MLIRPythonModules.extension._mlir.dso. So be sure to use the correct target! -get_target_property(_core_extension_srcs MLIRPythonExtension.Core INTERFACE_SOURCES) - -# Why is MODULE_NAME _mlir here but mlir._mlir_libs._mlirPythonTestNanobind below??? -# The _mlir extension can be imported independently of any other python code and/or extension modules. -# I.e., you could do `cd $MLIRPythonModules_ROOT_PREFIX/_mlir_libs && python -c "import _mlir"` (try it!). -# _mlir is also (currently) the only extension for which this is possible because dialect extensions modules, -# which generally make use of `mlir_value_subclass/mlir_type_subclass/mlir_attribute_subclass`, perform an -# `import mlir` right when they're loaded (see the mlir_*_subclass ctors in NanobindAdaptors.h). -# Note, this also why IMPORT_PATHS "${MLIRPythonModules_ROOT_PREFIX}/_mlir_libs" here while below -# "${MLIRPythonModules_ROOT_PREFIX}/.." (because MLIR_BINDINGS_PYTHON_INSTALL_PREFIX, by default, ends at mlir). -# -# Further note: this function creates file targets like -# "${CMAKE_CURRENT_BINARY_DIR}/type_stubs/_mlir_libs/_mlir/__init__.pyi". These must match the file targets -# that declare_mlir_python_sources expects, which are like "${ROOT_DIR}/${WHATEVER_SOURCE}". -# This is why _mlir_libs is prepended below. -mlir_generate_type_stubs( - MODULE_NAME _mlir - DEPENDS_TARGETS MLIRPythonModules.extension._mlir.dso - OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/type_stubs/_mlir_libs" - OUTPUTS "${_core_type_stub_sources}" - DEPENDS_TARGET_SRC_DEPS "${_core_extension_srcs}" - IMPORT_PATHS "${MLIRPythonModules_ROOT_PREFIX}/_mlir_libs" -) -set(_mlir_typestub_gen_target "${NB_STUBGEN_CUSTOM_TARGET}") - -list(TRANSFORM _core_type_stub_sources PREPEND "_mlir_libs/") -# Note, we do not do ADD_TO_PARENT here so that the type stubs are not associated (as mlir_DEPENDS) with -# MLIRPythonSources.Core (or something) when a distro is installed/created. 
Otherwise they would not be regenerated -# by users of the distro (the stubs are still installed in the distro - they are just not added to mlir_DEPENDS). -declare_mlir_python_sources( - MLIRPythonExtension.Core.type_stub_gen - ROOT_DIR "${CMAKE_CURRENT_BINARY_DIR}/type_stubs" - SOURCES "${_core_type_stub_sources}" -) - -# _mlirPythonTestNanobind stubgen +# Stubgen doesn't work when cross-compiling (stubgen will run in the host interpreter and then fail +# to find the extension module for the host arch). +if(NOT CMAKE_CROSSCOMPILING) + # _mlir stubgen + # Note: All this needs to come before add_mlir_python_modules(MLIRPythonModules so that the install targets for the + # generated type stubs get created. + + set(_core_type_stub_sources + _mlir/__init__.pyi + _mlir/ir.pyi + _mlir/passmanager.pyi + _mlir/rewrite.pyi + ) -if(MLIR_INCLUDE_TESTS) - get_target_property(_test_extension_srcs MLIRPythonTestSources.PythonTestExtensionNanobind INTERFACE_SOURCES) + # Note 1: INTERFACE_SOURCES is a genex ($ $) + # which will be evaluated by file(GENERATE ...) inside mlir_generate_type_stubs. This will evaluate to the correct + # thing in the build dir (i.e., actual source dir paths) and in the install dir + # (where it's a conventional path; see install/lib/cmake/mlir/MLIRTargets.cmake). + # + # Note 2: MLIRPythonExtension.Core is the target that is defined using target_sources(INTERFACE) + # **NOT** MLIRPythonModules.extension._mlir.dso. So be sure to use the correct target! + get_target_property(_core_extension_srcs MLIRPythonExtension.Core INTERFACE_SOURCES) + + # Why is MODULE_NAME _mlir here but mlir._mlir_libs._mlirPythonTestNanobind below??? + # The _mlir extension can be imported independently of any other python code and/or extension modules. + # I.e., you could do `cd $MLIRPythonModules_ROOT_PREFIX/_mlir_libs && python -c "import _mlir"` (try it!). + # _mlir is also (currently) the only extension for which this is possible because dialect extensions modules, + # which generally make use of `mlir_value_subclass/mlir_type_subclass/mlir_attribute_subclass`, perform an + # `import mlir` right when they're loaded (see the mlir_*_subclass ctors in NanobindAdaptors.h). + # Note, this also why IMPORT_PATHS "${MLIRPythonModules_ROOT_PREFIX}/_mlir_libs" here while below + # "${MLIRPythonModules_ROOT_PREFIX}/.." (because MLIR_BINDINGS_PYTHON_INSTALL_PREFIX, by default, ends at mlir). + # + # Further note: this function creates file targets like + # "${CMAKE_CURRENT_BINARY_DIR}/type_stubs/_mlir_libs/_mlir/__init__.pyi". These must match the file targets + # that declare_mlir_python_sources expects, which are like "${ROOT_DIR}/${WHATEVER_SOURCE}". + # This is why _mlir_libs is prepended below. mlir_generate_type_stubs( - # This is the FQN path because dialect modules import _mlir when loaded. See above. - MODULE_NAME mlir._mlir_libs._mlirPythonTestNanobind - DEPENDS_TARGETS - # You need both _mlir and _mlirPythonTestNanobind because dialect modules import _mlir when loaded - # (so _mlir needs to be built before calling stubgen). - MLIRPythonModules.extension._mlir.dso - MLIRPythonModules.extension._mlirPythonTestNanobind.dso - # You need this one so that ir.py "built" because mlir._mlir_libs.__init__.py import mlir.ir in _site_initialize. 
- MLIRPythonModules.sources.MLIRPythonSources.Core.Python + MODULE_NAME _mlir + DEPENDS_TARGETS MLIRPythonModules.extension._mlir.dso OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/type_stubs/_mlir_libs" - OUTPUTS _mlirPythonTestNanobind.pyi - DEPENDS_TARGET_SRC_DEPS "${_test_extension_srcs}" - IMPORT_PATHS "${MLIRPythonModules_ROOT_PREFIX}/.." + OUTPUTS "${_core_type_stub_sources}" + DEPENDS_TARGET_SRC_DEPS "${_core_extension_srcs}" + IMPORT_PATHS "${MLIRPythonModules_ROOT_PREFIX}/_mlir_libs" ) - set(_mlirPythonTestNanobind_typestub_gen_target "${NB_STUBGEN_CUSTOM_TARGET}") + set(_mlir_typestub_gen_target "${NB_STUBGEN_CUSTOM_TARGET}") + + list(TRANSFORM _core_type_stub_sources PREPEND "_mlir_libs/") + # Note, we do not do ADD_TO_PARENT here so that the type stubs are not associated (as mlir_DEPENDS) with + # MLIRPythonSources.Core (or something) when a distro is installed/created. Otherwise they would not be regenerated + # by users of the distro (the stubs are still installed in the distro - they are just not added to mlir_DEPENDS). declare_mlir_python_sources( - MLIRPythonTestSources.PythonTestExtensionNanobind.type_stub_gen + MLIRPythonExtension.Core.type_stub_gen ROOT_DIR "${CMAKE_CURRENT_BINARY_DIR}/type_stubs" - ADD_TO_PARENT MLIRPythonTestSources.Dialects - SOURCES _mlir_libs/_mlirPythonTestNanobind.pyi + SOURCES "${_core_type_stub_sources}" ) + + # _mlirPythonTestNanobind stubgen + + if(MLIR_INCLUDE_TESTS) + get_target_property(_test_extension_srcs MLIRPythonTestSources.PythonTestExtensionNanobind INTERFACE_SOURCES) + mlir_generate_type_stubs( + # This is the FQN path because dialect modules import _mlir when loaded. See above. + MODULE_NAME mlir._mlir_libs._mlirPythonTestNanobind + DEPENDS_TARGETS + # You need both _mlir and _mlirPythonTestNanobind because dialect modules import _mlir when loaded + # (so _mlir needs to be built before calling stubgen). + MLIRPythonModules.extension._mlir.dso + MLIRPythonModules.extension._mlirPythonTestNanobind.dso + # You need this one so that ir.py "built" because mlir._mlir_libs.__init__.py import mlir.ir in _site_initialize. + MLIRPythonModules.sources.MLIRPythonSources.Core.Python + OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/type_stubs/_mlir_libs" + OUTPUTS _mlirPythonTestNanobind.pyi + DEPENDS_TARGET_SRC_DEPS "${_test_extension_srcs}" + IMPORT_PATHS "${MLIRPythonModules_ROOT_PREFIX}/.." + ) + set(_mlirPythonTestNanobind_typestub_gen_target "${NB_STUBGEN_CUSTOM_TARGET}") + declare_mlir_python_sources( + MLIRPythonTestSources.PythonTestExtensionNanobind.type_stub_gen + ROOT_DIR "${CMAKE_CURRENT_BINARY_DIR}/type_stubs" + ADD_TO_PARENT MLIRPythonTestSources.Dialects + SOURCES _mlir_libs/_mlirPythonTestNanobind.pyi + ) + endif() endif() ################################################################################ @@ -959,18 +963,23 @@ endif() # This must come last. 
################################################################################ +set(_declared_sources MLIRPythonSources MLIRPythonExtension.RegisterEverything) +if(NOT CMAKE_CROSSCOMPILING) + list(APPEND _declared_sources MLIRPythonExtension.Core.type_stub_gen) +endif() + add_mlir_python_modules(MLIRPythonModules ROOT_PREFIX ${MLIRPythonModules_ROOT_PREFIX} INSTALL_PREFIX "${MLIR_BINDINGS_PYTHON_INSTALL_PREFIX}" DECLARED_SOURCES - MLIRPythonSources - MLIRPythonExtension.RegisterEverything - MLIRPythonExtension.Core.type_stub_gen + ${_declared_sources} ${_ADDL_TEST_SOURCES} COMMON_CAPI_LINK_LIBS MLIRPythonCAPI ) -add_dependencies(MLIRPythonModules "${_mlir_typestub_gen_target}") -if(MLIR_INCLUDE_TESTS) - add_dependencies(MLIRPythonModules "${_mlirPythonTestNanobind_typestub_gen_target}") +if(NOT CMAKE_CROSSCOMPILING) + add_dependencies(MLIRPythonModules "${_mlir_typestub_gen_target}") + if(MLIR_INCLUDE_TESTS) + add_dependencies(MLIRPythonModules "${_mlirPythonTestNanobind_typestub_gen_target}") + endif() endif() diff --git a/mlir/test/CAPI/llvm.c b/mlir/test/CAPI/llvm.c index 12a436ad12fc4..f5fbb4645cd5d 100644 --- a/mlir/test/CAPI/llvm.c +++ b/mlir/test/CAPI/llvm.c @@ -270,7 +270,7 @@ static void testDebugInfoAttributes(MlirContext ctx) { MlirAttribute compile_unit = mlirLLVMDICompileUnitAttrGet( ctx, id, LLVMDWARFSourceLanguageC99, file, foo, false, - MlirLLVMDIEmissionKindFull, MlirLLVMDINameTableKindDefault); + MlirLLVMDIEmissionKindFull, MlirLLVMDINameTableKindDefault, bar); // CHECK: #llvm.di_compile_unit<{{.*}}> mlirAttributeDump(compile_unit); diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 628adcfb6e285..e64935364997c 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -84,6 +84,7 @@ llvm_canonicalize_cmake_booleans( MLIR_RUN_CUDA_SM80_TESTS MLIR_RUN_CUDA_SM80_LT_TESTS MLIR_RUN_CUDA_SM90_TESTS + BUILD_SHARED_LIBS ) configure_lit_site_cfg( @@ -125,6 +126,10 @@ set(MLIR_TEST_DEPENDS if(NOT MLIR_STANDALONE_BUILD) list(APPEND MLIR_TEST_DEPENDS FileCheck count not split-file yaml2obj) endif() +# Examples/standalone/test.toy (vis-a-vis the standalone example) depends on these. +if(LLVM_INCLUDE_EXAMPLES) + list(APPEND MLIR_TEST_DEPENDS MLIRCAPIArith) +endif() set(MLIR_TEST_DEPENDS ${MLIR_TEST_DEPENDS} mlir-capi-pdl-test diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir index cc1162d8b0de8..2fd3df6dcfa71 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir @@ -8,6 +8,8 @@ // Note: #gpu.address_space is hardcoded to `1` here because the // test pass doesn't set up the GPU address space conversions. 
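Several CHECK lines below now expect i64 numRecords constants instead of i32. One way to see why a 64-bit byte count is the safer representation: a buffer of 2^30 i32 elements occupies 2^32 bytes, which wraps to zero in 32-bit arithmetic. A small stand-alone illustration (hypothetical helper, not the conversion code):

#include <cassert>
#include <cstdint>

// numRecords is the buffer extent in bytes; computing it in 64 bits keeps the
// exact value for buffers of 4 GiB and larger.
std::uint64_t numRecordsBytes(std::uint64_t numElements, std::uint64_t elemBytes) {
  return numElements * elemBytes;
}

int main() {
  std::uint64_t elems = 1ull << 30;                    // 2^30 i32 elements
  assert(static_cast<std::uint32_t>(elems * 4) == 0);  // 32-bit count wraps
  assert(numRecordsBytes(elems, 4) == (1ull << 32));   // 64-bit count is exact
  return 0;
}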
+// CHECK: #[[$MMRA_TAG:.+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local"> + #gpu_global_addrspace = 1 // CHECK-LABEL: func @fat_raw_buffer_cast @@ -17,7 +19,7 @@ func.func @fat_raw_buffer_cast(%buf: memref<8xi32, #gpu_global_addrspace>) -> me // CHECK-DAG: %[[offset:.*]] = llvm.extractvalue %[[desc]][2] // CHECK-DAG: %[[sizes:.*]] = llvm.extractvalue %[[desc]][3] // CHECK-DAG: %[[strides:.*]] = llvm.extractvalue %[[desc]][4] - // CHECK-DAG: %[[numRecords:.*]] = llvm.mlir.constant(32 : i32) : i32 + // CHECK-DAG: %[[numRecords:.*]] = llvm.mlir.constant(32 : i64) : i64 // CHECK-DAG: %[[strideArg:.*]] = llvm.mlir.constant(0 : i16) : i16 // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) @@ -38,7 +40,7 @@ func.func @fat_raw_buffer_cast_0d(%buf: memref) -> m // CHECK: %[[desc:.*]] = builtin.unrealized_conversion_cast %{{.*}} : memref to !llvm.struct<(ptr<1>, ptr<1>, i64)> // CHECK-DAG: %[[base:.*]] = llvm.extractvalue %[[desc]][1] // CHECK-DAG: %[[offset:.*]] = llvm.extractvalue %[[desc]][2] - // CHECK-DAG: %[[numRecords:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK-DAG: %[[numRecords:.*]] = llvm.mlir.constant(4 : i64) : i64 // CHECK-DAG: %[[strideArg:.*]] = llvm.mlir.constant(0 : i16) : i16 // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) @@ -57,9 +59,8 @@ func.func @fat_raw_buffer_cast_dyn_size_offset(%buf: memref) -> memref<8xi32, #amdgpu.address_space> { - // CHECK: %[[numRecords:.*]] = arith.constant -1 : i32 + // CHECK: %[[numRecords:.*]] = arith.constant -1 : i64 // CHECK: rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}} - %cu32_max = arith.constant 0xffffffff : i32 - %ret = amdgpu.fat_raw_buffer_cast %buf validBytes(%cu32_max) : memref<8xi32, #gpu_global_addrspace> to memref<8xi32, #amdgpu.address_space> + %cu64_max = arith.constant -1 : i64 + %ret = amdgpu.fat_raw_buffer_cast %buf validBytes(%cu64_max) : memref<8xi32, #gpu_global_addrspace> to memref<8xi32, #amdgpu.address_space> return %ret : memref<8xi32, #amdgpu.address_space> } @@ -115,9 +116,7 @@ func.func @fat_raw_buffer_cast_cache_swizzle(%buf: memref<64x64xi32, #gpu_global // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_scalar_i32 func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref) -> i32 { - // Extra constant for byte width - // CHECK: llvm.mlir.constant(4 : i32) - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(4 : i32) + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(4 : i64) // CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16) // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) @@ -130,7 +129,7 @@ func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref) -> i32 { // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32 func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 { - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i64) // CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16) // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) @@ -155,11 +154,10 @@ func.func @gpu_gcn_raw_buffer_load_i32_strided(%buf: memref<16x16xi32, strided<[ // CHECK: %[[stride_j:.*]] = llvm.extractvalue %[[descriptor]][4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[ext_j:.*]] = llvm.mul %[[sz_j]], %[[stride_j]] : i64 
// CHECK: %[[num_records:.*]] = llvm.intr.umax(%[[ext_i]], %[[ext_j]]) : (i64, i64) -> i64 - // CHECK: %[[num_rec_i32:.*]] = llvm.trunc %[[num_records]] : i64 to i32 - // CHECK: %[[elem_size_2:.*]] = llvm.mlir.constant(4 : i32) : i32 - // CHECK: %[[num_rec_bytes_i32:.*]] = llvm.mul %[[num_rec_i32]], %[[elem_size_2]] : i32 + // CHECK: %[[elem_size_2:.*]] = llvm.mlir.constant(4 : i64) : i64 + // CHECK: %[[num_rec_bytes:.*]] = llvm.mul %[[num_records]], %[[elem_size_2]] : i64 // CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16) : i16 - // CHECK: %[[rsrc:.*]] = rocdl.make.buffer.rsrc %[[ptr]], %[[stride]], %[[num_rec_bytes_i32]], %{{.*}} : !llvm.ptr to <8> + // CHECK: %[[rsrc:.*]] = rocdl.make.buffer.rsrc %[[ptr]], %[[stride]], %[[num_rec_bytes]], %{{.*}} : !llvm.ptr to <8> // CHECK: %[[stride_i_1:.*]] = llvm.extractvalue %[[descriptor]][4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[stride_i_i32:.*]] = llvm.trunc %[[stride_i_1]] : i64 to i32 // CHECK: %[[t_0:.*]] = llvm.mul %{{.*}}, %[[stride_i_i32]] : i32 @@ -207,7 +205,7 @@ func.func @gpu_gcn_raw_buffer_load_2xi32(%buf: memref<64xi32>, %idx: i32) -> vec // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i8 func.func @gpu_gcn_raw_buffer_load_i8(%buf: memref<64xi8>, %idx: i32) -> i8 { - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32) + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i64) // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}} // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i8 // CHECK: return %[[ret]] @@ -217,7 +215,7 @@ func.func @gpu_gcn_raw_buffer_load_i8(%buf: memref<64xi8>, %idx: i32) -> i8 { // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_2xi8 func.func @gpu_gcn_raw_buffer_load_2xi8(%buf: memref<64xi8>, %idx: i32) -> vector<2xi8> { - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32) + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i64) // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}} // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i16 // CHECK: %[[ret:.*]] = llvm.bitcast %[[loaded]] : i16 to vector<2xi8> @@ -237,7 +235,7 @@ func.func @gpu_gcn_raw_buffer_load_16xi8(%buf: memref<64xi8>, %idx: i32) -> vect // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_f8E5M2FNUZ func.func @gpu_gcn_raw_buffer_load_f8E5M2FNUZ(%buf: memref<64xf8E5M2FNUZ>, %idx: i32) -> f8E5M2FNUZ { - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32) + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i64) // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}} // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i8 // CHECK: %[[ret:.*]] = builtin.unrealized_conversion_cast %[[loaded]] : i8 to f8E5M2FNUZ @@ -248,7 +246,7 @@ func.func @gpu_gcn_raw_buffer_load_f8E5M2FNUZ(%buf: memref<64xf8E5M2FNUZ>, %idx: // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_4xf8E4M3FNUZ func.func @gpu_gcn_raw_buffer_load_4xf8E4M3FNUZ(%buf: memref<64xf8E4M3FNUZ>, %idx: i32) -> vector<4xf8E4M3FNUZ> { - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32) + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i64) // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %{{.*}} // CHECK: %[[loaded:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 // CHECK: %[[cast:.*]] = llvm.bitcast 
%[[loaded]] : i32 to vector<4xi8> @@ -271,7 +269,7 @@ func.func @gpu_gcn_raw_buffer_store_scalar_i32(%value: i32, %buf: memref) { // CHECK-LABEL: func @gpu_gcn_raw_buffer_store_i32 func.func @gpu_gcn_raw_buffer_store_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) { - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i64) // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] @@ -307,7 +305,7 @@ func.func @gpu_gcn_raw_buffer_store_16xi8(%value: vector<16xi8>, %buf: memref<64 // And more so for atomic add // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_f32 func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) { - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i64) // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] @@ -318,7 +316,7 @@ func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_v2f16 func.func @gpu_gcn_raw_buffer_atomic_fadd_v2f16(%value: vector<2xf16>, %buf: memref<64xf16>, %idx: i32) { - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(128 : i32) + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(128 : i64) // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] @@ -329,7 +327,7 @@ func.func @gpu_gcn_raw_buffer_atomic_fadd_v2f16(%value: vector<2xf16>, %buf: mem // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_v2bf16 func.func @gpu_gcn_raw_buffer_atomic_fadd_v2bf16(%value: vector<2xbf16>, %buf: memref<64xbf16>, %idx: i32) { - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(128 : i32) + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(128 : i64) // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] @@ -340,7 +338,7 @@ func.func @gpu_gcn_raw_buffer_atomic_fadd_v2bf16(%value: vector<2xbf16>, %buf: m // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fmax_f32 func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) { - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i64) // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] @@ -351,7 +349,7 @@ func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_smax_i32 func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) { - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i64) // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) // 
CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] @@ -362,7 +360,7 @@ func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>, // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_umin_i32 func.func @gpu_gcn_raw_buffer_atomic_umin_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) { - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i64) // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] @@ -376,7 +374,7 @@ func.func @gpu_gcn_raw_buffer_atomic_umin_i32(%value: i32, %buf: memref<64xi32>, func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : memref<64xf32>, %idx: i32) -> f32 { // CHECK: %[[srcCast:.*]] = llvm.bitcast %[[src]] : f32 to i32 // CHECK: %[[cmpCast:.*]] = llvm.bitcast %[[cmp]] : f32 to i32 - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i64) // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] @@ -390,7 +388,7 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : m // CHECK-LABEL: func @amdgpu_raw_buffer_atomic_cmpswap_i64 // CHECK-SAME: (%[[src:.*]]: i64, %[[cmp:.*]]: i64, {{.*}}) func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : memref<64xi64>, %idx: i32) -> i64 { - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(512 : i32) + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(512 : i64) // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] @@ -414,19 +412,16 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_v2f16(%src : vector<2xf16>, %cmp : v // CHECK-LABEL: func @lds_barrier func.func @lds_barrier() { + // CHECK: llvm.fence syncscope("workgroup") release {llvm.mmra = #[[$MMRA_TAG]]} // GFX908: llvm.inline_asm has_side_effects asm_dialect = att - // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier" - // GFX90A: rocdl.s.waitcnt -7937 + // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_barrier" // GFX90A-NEXT: rocdl.s.barrier - // GFX942: rocdl.s.waitcnt -7937 // GFX942-NEXT: rocdl.s.barrier - // GFX10: rocdl.s.waitcnt -16129 // GFX10-NEXT: rocdl.s.barrier - // GFX11: llvm.inline_asm has_side_effects asm_dialect = att - // GFX11-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier" - // GFX12: rocdl.s.wait.dscnt 0 + // GFX11-NEXT: rocdl.s.barrier // GFX12-NEXT: rocdl.s.barrier.signal -1 // GFX12-NEXT: rocdl.s.barrier.wait -1 + // CHECK-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = #[[$MMRA_TAG]]} amdgpu.lds_barrier func.return } diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 0c500e10bc810..5755ca9258283 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -854,7 +854,8 @@ module @mymodule { // CHECK: %[[desc:.+]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, 
array<2 x i64>)> // CHECK: %[[c8192:.+]] = llvm.mlir.constant(8192 : index) : i64 // CHECK: %[[shmemOfset:.+]] = llvm.getelementptr %[[desc]][%[[c8192]]] : (!llvm.ptr<3>, i64) - // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %[[shmemOfset]], %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}] + // CHECK: %[[dest:.+]] = llvm.addrspacecast %[[shmemOfset]] : !llvm.ptr<3> to !llvm.ptr<7> + // CHECK: nvvm.cp.async.bulk.tensor.shared.cluster.global %[[dest]], %{{.*}}, %{{.*}}, box[%{{.*}}, %{{.*}}] nvgpu.tma.async.load %rhsTensorMap[%c0, %c0], %mbarrier[%c0] to %rhsShmem : !rhsTensorMap, !barrierType -> memref<64x64xf16, strided<[64, 1], offset: 8192>, 3> return } diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index bf80d9a1668a1..6960e83be3573 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -96,119 +96,93 @@ func.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.p } // CHECK-LABEL: @tma_load_3d_all -func.func @tma_load_3d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) { - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4} ], [$5],{$6}, $7, $8;", "r,l,r,r,r,r,h,h,l" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$9 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4} ], [$5],{$6}, $7, $8;", "r,l,r,r,r,r,h,h,l,b" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p : !llvm.ptr<3>, !llvm.ptr +func.func @tma_load_3d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<7>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) { + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$9 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4} ], [$5],{$6}, $7, $8;", "l,l,r,r,r,r,h,h,l,b" + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr return } // CHECK-LABEL: @tma_load_4d_all -func.func @tma_load_4d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) { - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5} ], [$6],{$7,$8}, $9, $10;", "r,l,r,r,r,r,r,h,h,h,l" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] 
im2col[%off0,%off1] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$11 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5} ], [$6],{$7,$8}, $9, $10;", "r,l,r,r,r,r,r,h,h,h,l,b" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0,%off1] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p : !llvm.ptr<3>, !llvm.ptr +func.func @tma_load_4d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<7>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) { + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$11 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5} ], [$6],{$7,$8}, $9, $10;", "l,l,r,r,r,r,r,h,h,h,l,b" + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0,%off1] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr return } // CHECK-LABEL: @tma_load_5d_all -func.func @tma_load_5d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %off0: i16, %off1: i16, %off2: i16, %ctamask : i16, %cacheHint : i64, %p : i1) { - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5,$6} ], [$7],{$8,$9,$10}, $11, $12;", "r,l,r,r,r,r,r,r,h,h,h,h,l" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] im2col[%off0,%off1,%off2] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr - // CHECK: lvm.inline_asm has_side_effects asm_dialect = att "@$13 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5,$6} ], [$7],{$8,$9,$10}, $11, $12;", "r,l,r,r,r,r,r,r,h,h,h,h,l,b" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] im2col[%off0,%off1,%off2] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p : !llvm.ptr<3>, !llvm.ptr +func.func @tma_load_5d_all(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<7>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %off0: i16, %off1: i16, %off2: i16, %ctamask : i16, %cacheHint : i64, %p : i1) { + // CHECK: lvm.inline_asm has_side_effects asm_dialect = att "@$13 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.im2col.multicast::cluster.L2::cache_hint [$0], [$1, {$2,$3,$4,$5,$6} ], [$7],{$8,$9,$10}, $11, $12;", "l,l,r,r,r,r,r,r,h,h,h,h,l,b" + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] im2col[%off0,%off1,%off2] multicast_mask = %ctamask l2_cache_hint = %cacheHint predicate = %p {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr return } // CHECK-LABEL: @tma_load_1d -func.func @tma_load_1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %p 
: i1) { - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "r,l,r,r" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0] : !llvm.ptr<3>, !llvm.ptr - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$4 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "r,l,r,r,b" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0] predicate=%p : !llvm.ptr<3>, !llvm.ptr +func.func @tma_load_1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<7>, %barrier: !llvm.ptr<3>, %crd0: i32, %p : i1) { + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$4 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2} ], [$3];", "l,l,r,r,b" + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0] predicate=%p : !llvm.ptr<7>, !llvm.ptr return } // CHECK-LABEL: @tma_load_2d -func.func @tma_load_2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %p : i1) { - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "r,l,r,r,r" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] : !llvm.ptr<3>, !llvm.ptr - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "r,l,r,r,r,b" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] predicate=%p : !llvm.ptr<3>, !llvm.ptr +func.func @tma_load_2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<7>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %p : i1) { + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3} ], [$4];", "l,l,r,r,r,b" + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] predicate=%p : !llvm.ptr<7>, !llvm.ptr return } // CHECK-LABEL: @tma_load_3d -func.func @tma_load_3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) { - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4} ], [$5];", "r,l,r,r,r,r" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2] : !llvm.ptr<3>, !llvm.ptr - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4} ], [$5];", "r,l,r,r,r,r,b" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2] predicate=%p : !llvm.ptr<3>, !llvm.ptr +func.func @tma_load_3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<7>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) { + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4} ], [$5];", "l,l,r,r,r,r,b" + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, 
%tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2] predicate=%p : !llvm.ptr<7>, !llvm.ptr return } // CHECK-LABEL: @tma_load_4d -func.func @tma_load_4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) { - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5} ], [$6];", "r,l,r,r,r,r,r" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] : !llvm.ptr<3>, !llvm.ptr - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5} ], [$6];", "r,l,r,r,r,r,r,b" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] predicate=%p : !llvm.ptr<3>, !llvm.ptr +func.func @tma_load_4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<7>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) { + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5} ], [$6];", "l,l,r,r,r,r,r,b" + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] predicate=%p : !llvm.ptr<7>, !llvm.ptr return } // CHECK-LABEL: @tma_load_5d -func.func @tma_load_5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) { - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6} ], [$7];", "r,l,r,r,r,r,r,r" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] : !llvm.ptr<3>, !llvm.ptr - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6} ], [$7];", "r,l,r,r,r,r,r,r,b" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] predicate=%p : !llvm.ptr<3>, !llvm.ptr +func.func @tma_load_5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<7>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) { + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes [$0], [$1, {$2,$3,$4,$5,$6} ], [$7];", "l,l,r,r,r,r,r,r,b" + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd4] predicate=%p : !llvm.ptr<7>, !llvm.ptr return } // CHECK-LABEL: @tma_load_multicast1d -func.func @tma_load_multicast1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %p : i1) { - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2} ], [$3], $4;", "r,l,r,r,h" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 
cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2} ], [$3], $4;", "r,l,r,r,h,b" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<3>, !llvm.ptr +func.func @tma_load_multicast1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<7>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %p : i1) { + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2} ], [$3], $4;", "l,l,r,r,h,b" + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<7>, !llvm.ptr return } // CHECK-LABEL: @tma_load_multicast2d -func.func @tma_load_multicast2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %p : i1) { - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3} ], [$4], $5;", "r,l,r,r,r,h" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3} ], [$4], $5;", "r,l,r,r,r,h,b" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<3>, !llvm.ptr +func.func @tma_load_multicast2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<7>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %p : i1) { + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3} ], [$4], $5;", "l,l,r,r,r,h,b" + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<7>, !llvm.ptr return } // CHECK-LABEL: @tma_load_multicast3d -func.func @tma_load_multicast3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) { - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4} ], [$5], $6;", "r,l,r,r,r,r,h" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1,%crd2] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4} ], [$5], $6;", "r,l,r,r,r,r,h,b" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1,%crd2] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<3>, !llvm.ptr +func.func @tma_load_multicast3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<7>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) { + // CHECK: llvm.inline_asm has_side_effects 
asm_dialect = att "@$7 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4} ], [$5], $6;", "l,l,r,r,r,r,h,b" + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1,%crd2] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<7>, !llvm.ptr return } // CHECK-LABEL: @tma_load_multicast4d -func.func @tma_load_multicast4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) { - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5} ], [$6], $7;", "r,l,r,r,r,r,r,h" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1,%crd2,%crd3] multicast_mask = %multicastMask: !llvm.ptr<3>, !llvm.ptr - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5} ], [$6], $7;", "r,l,r,r,r,r,r,h,b" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1,%crd2,%crd3] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<3>, !llvm.ptr +func.func @tma_load_multicast4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<7>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) { + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5} ], [$6], $7;", "l,l,r,r,r,r,r,h,b" + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1,%crd2,%crd3] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<7>, !llvm.ptr return } // CHECK-LABEL: @tma_load_multicast5d -func.func @tma_load_multicast5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) { - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5,$6} ], [$7], $8;", "r,l,r,r,r,r,r,r,h" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1,%crd2,%crd3,%crd4] multicast_mask = %multicastMask : !llvm.ptr<3>, !llvm.ptr - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$9 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5,$6} ], [$7], $8;", "r,l,r,r,r,r,r,r,h,b" - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box [%crd0,%crd1,%crd2,%crd3,%crd4] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<3>, !llvm.ptr +func.func @tma_load_multicast5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<7>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) { + // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$9 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$2,$3,$4,$5,$6} ], [$7], $8;", "l,l,r,r,r,r,r,r,h,b" + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box 
[%crd0,%crd1,%crd2,%crd3,%crd4] multicast_mask = %multicastMask predicate=%p : !llvm.ptr<7>, !llvm.ptr return } diff --git a/mlir/test/Conversion/SCFToSPIRV/unsupported.mlir b/mlir/test/Conversion/SCFToSPIRV/unsupported.mlir index d24f37b553bb5..1a1c24a09aa8c 100644 --- a/mlir/test/Conversion/SCFToSPIRV/unsupported.mlir +++ b/mlir/test/Conversion/SCFToSPIRV/unsupported.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt --convert-scf-to-spirv %s --verify-diagnostics --split-input-file | FileCheck %s +// RUN: mlir-opt --convert-scf-to-spirv %s | FileCheck %s // `scf.parallel` conversion is not supported yet. // Make sure that we do not accidentally invalidate this function by removing @@ -19,14 +19,3 @@ func.func @func(%arg0: i64) { } return } - -// ----- - -// Make sure we don't crash on recursive structs. -// TODO(https://github.com/llvm/llvm-project/issues/159963): Promote this to a `vce-deduction.mlir` testcase. - -// expected-error@below {{failed to legalize operation 'spirv.module' that was explicitly marked illegal}} -spirv.module Physical64 GLSL450 { - spirv.GlobalVariable @recursive: - !spirv.ptr, StorageBuffer>)>, StorageBuffer> -} diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir index 37af8b8859852..a7a73ae904042 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -698,13 +698,14 @@ func.func @test_simple_i32(%arg0: tensor<1xi32>, %unsigned: tensor<1xui32>, %uns // CHECK: linalg.generic // CHECK: arith.constant 1 // CHECK: arith.constant 0 + // CHECK: arith.constant false // CHECK: arith.constant true // CHECK: arith.cmpi // CHECK: arith.subi // CHECK: arith.shrsi // CHECK: arith.trunci // CHECK: and - // CHECK: and + // CHECK: arith.select // CHECK: arith.extui // CHECK: arith.addi %12 = tosa.arithmetic_right_shift %arg0, %arg0 {round = 1 : i1} : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> @@ -899,6 +900,39 @@ func.func @test_negate_quantized(%arg0: tensor<1xi8>) -> () { // ----- +// CHECK-LABEL: @test_negate_no_const_1 +func.func @test_negate_no_const_1(%arg0: tensor<50x42xf16> ,%arg1: tensor<1xf16> , %arg2: tensor<1xf16> ) -> tensor<*xf16> { + // CHECK: %[[GENERIC:.+]] = linalg.generic + // CHECK: ^bb0([[ARG0:%.*]]: f16, [[ARG1:%.*]]: f16, [[ARG2:%.*]]: f16, [[OUT:%.*]]: f16) + // CHECK: [[ELEMENT:%.*]] = arith.negf [[ARG0]] : f16 + %0 = tosa.negate %arg0, %arg1, %arg2 : (tensor<50x42xf16>, tensor<1xf16>, tensor<1xf16>) -> tensor<50x42xf16> + %cast = tensor.cast %0 : tensor<50x42xf16> to tensor<*xf16> + return %cast : tensor<*xf16> +} + +// ----- + +// CHECK-LABEL: @test_negate_no_const_2 +func.func @test_negate_no_const_2(%arg0: tensor<50x42xi16> ,%arg1: tensor<1xi16> , %arg2: tensor<1xi16> ) -> tensor<*xi16> { + // CHECK: %[[GENERIC:.+]] = linalg.generic + // CHECK: ^bb0([[ARG0:%.*]]: i16, [[ARG1:%.*]]: i16, [[ARG2:%.*]]: i16, [[OUT:%.*]]: i16) + // CHECK: [[EXTSI1:%.*]] = arith.extsi [[ARG1]] : i16 to i64 + // CHECK: [[EXTSI2:%.*]] = arith.extsi [[ARG2]] : i16 to i64 + // CHECK: [[SUM:%.*]] = arith.addi [[EXTSI1]], [[EXTSI2]] : i64 + // CHECK: [[EXTSI0:%.*]] = arith.extsi [[ARG0]] : i16 to i64 + // CHECK: [[SUB:%.*]] = arith.subi [[SUM]], [[EXTSI0]] : i64 + // CHECK: [[C_32768:%.*]] = arith.constant -32768 : i64 + // CHECK: [[C32767:%.*]] = arith.constant 32767 : i64 + // CHECK: [[MAX:%.*]] = arith.maxsi [[C_32768]], [[SUB]] : i64 + // CHECK: [[MIN:%.*]] = arith.minsi [[C32767]], [[MAX]] : i64 + // CHECK: [[TRUNC:%.*]] = arith.trunci 
[[MIN]] : i64 to i16 + %0 = tosa.negate %arg0, %arg1, %arg2 : (tensor<50x42xi16>, tensor<1xi16>, tensor<1xi16>) -> tensor<50x42xi16> + %cast = tensor.cast %0 : tensor<50x42xi16> to tensor<*xi16> + return %cast : tensor<*xi16> +} + +// ----- + // CHECK-LABEL: @test_identity // CHECK-SAME: %[[ARG0:[0-9a-zA-Z_]*]]: tensor<1xf32>, // CHECK-SAME: %[[ARG1:[0-9a-zA-Z_]*]]: tensor<1xi32> diff --git a/mlir/test/Conversion/VectorToLLVM/pass-option-serialization.mlir b/mlir/test/Conversion/VectorToLLVM/pass-option-serialization.mlir index 323d86ac40988..7e7925a473d53 100644 --- a/mlir/test/Conversion/VectorToLLVM/pass-option-serialization.mlir +++ b/mlir/test/Conversion/VectorToLLVM/pass-option-serialization.mlir @@ -13,7 +13,7 @@ // RUN: mlir-opt --convert-vector-to-llvm --dump-pass-pipeline %s 2>&1 | FileCheck %s --check-prefix=DEFAULT -// RUN: mlir-opt --convert-vector-to-llvm='vector-contract-lowering=matmul vector-transpose-lowering=flat' \ +// RUN: mlir-opt --convert-vector-to-llvm='vector-contract-lowering=llvmintr vector-transpose-lowering=llvmintr' \ // RUN: --dump-pass-pipeline %s 2>&1 | FileCheck %s --check-prefix=NON-DEFAULT // CHECK: builtin.module( @@ -26,5 +26,5 @@ // CHECK-SAME: reassociate-fp-reductions={{[aA-zZ0-9]+}} // DEFAULT: vector-contract-lowering=dot // DEFAULT: vector-transpose-lowering=eltwise -// NON-DEFAULT: vector-contract-lowering=matmul -// NON-DEFAULT: vector-transpose-lowering=flat +// NON-DEFAULT: vector-contract-lowering=llvm +// NON-DEFAULT: vector-transpose-lowering=llvm diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 369e0fff538e1..8f427e9d56f45 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -360,7 +360,7 @@ func.func @fat_raw_buffer_cast_easy(%m: memref<8xi32>) -> memref<8xi32, #amdgpu. // CHECK-SAME: cacheSwizzleStride(%{{[^)]*}}) // CHECK-SAME: boundsCheck(false) // CHECK-SAME: resetOffset -func.func @fat_raw_buffer_cast(%m: memref<8xi32, strided<[1], offset: ?>>, %validBytes: i32, %cacheSwizzle: i14) -> memref<8xi32, #amdgpu.address_space> { +func.func @fat_raw_buffer_cast(%m: memref<8xi32, strided<[1], offset: ?>>, %validBytes: i64, %cacheSwizzle: i14) -> memref<8xi32, #amdgpu.address_space> { %ret = amdgpu.fat_raw_buffer_cast %m validBytes(%validBytes) cacheSwizzleStride(%cacheSwizzle) boundsCheck(false) resetOffset : memref<8xi32, strided<[1], offset: ?>> to memref<8xi32, #amdgpu.address_space> func.return %ret : memref<8xi32, #amdgpu.address_space> diff --git a/mlir/test/Dialect/Arith/constant-fold.mlir b/mlir/test/Dialect/Arith/constant-fold.mlir new file mode 100644 index 0000000000000..172945fafdaf3 --- /dev/null +++ b/mlir/test/Dialect/Arith/constant-fold.mlir @@ -0,0 +1,18 @@ +// Test with the default (one application of the folder) and then with 2 iterations. +// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(test-single-fold))" | FileCheck %s --check-prefixes=CHECK,CHECK-ONE +// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(test-single-fold{max-iterations=2}))" | FileCheck %s --check-prefixes=CHECK,CHECK-TWO + + +// Folding entirely this requires to move the constant to the right +// before invoking the op-specific folder. +// With one iteration, we just push the constant to the right. 
+// With a second iteration, we actually fold the "add" (x+0->x) +// CHECK: func @recurse_fold_traits(%[[ARG0:.*]]: i32) +func.func @recurse_fold_traits(%arg0 : i32) -> i32 { + %cst0 = arith.constant 0 : i32 +// CHECK-ONE: %[[ADD:.*]] = arith.addi %[[ARG0]], + %res = arith.addi %cst0, %arg0 : i32 +// CHECK-ONE: return %[[ADD]] : i32 +// CHECK-TWO: return %[[ARG0]] : i32 + return %res : i32 +} diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir index 2efb5893c8511..6054a61912532 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir @@ -380,6 +380,20 @@ func.func @execute_region_test(%t1 : tensor) // ----- +// CHECK-LABEL: func @no_inline_execute_region_not_canonicalized +func.func @no_inline_execute_region_not_canonicalized() { + %c = arith.constant 42 : i32 + // CHECK: scf.execute_region + // CHECK-SAME: no_inline + %v = scf.execute_region -> i32 no_inline { + scf.yield %c : i32 + } + // CHECK: return + return +} + +// ----- + // CHECK: func private @some_external_func(memref>) func.func private @some_external_func(tensor) @@ -810,3 +824,59 @@ module @inner_module { return %t : tensor<5xf32> } } + +// ----- + +// CHECK: func.func @custom_types( +// CHECK-SAME: %[[arg:.*]]: !test.test_memref<[4, 4], f64> +// CHECK-SAME: ) -> (!test.test_memref<[4, 8], f64>, +// CHECK-SAME: !test.test_memref<[4, 8], f64>) +func.func @custom_types(%arg: !test.test_tensor<[4, 4], f64>) + -> (!test.test_tensor<[4, 8], f64>, !test.test_tensor<[4, 8], f64>) { + // CHECK: %[[out1:.*]] = "test.dummy_memref_op"(%[[arg]]) : + // CHECK-SAME: (!test.test_memref<[4, 4], f64>) -> !test.test_memref<[4, 8], f64> + %out1 = "test.dummy_tensor_op"(%arg) : (!test.test_tensor<[4, 4], f64>) + -> !test.test_tensor<[4, 8], f64> + + // CHECK: %[[alloc:.*]] = "test.create_memref_op" + // CHECK: %[[out2:.*]] = "test.dummy_memref_op"(%[[alloc]]) + // CHECK-SAME: (!test.test_memref<[4, 4], f64>) -> !test.test_memref<[4, 8], f64> + %alloc = "test.create_tensor_op"() : () -> !test.test_tensor<[4, 4], f64> + %out2 = "test.dummy_tensor_op"(%alloc) : (!test.test_tensor<[4, 4], f64>) + -> !test.test_tensor<[4, 8], f64> + + // CHECK: return %[[out1]], %[[out2]] + return %out1, %out2 : + !test.test_tensor<[4, 8], f64>, !test.test_tensor<[4, 8], f64> +} + +// ----- + +// CHECK: func.func @custom_types_foo( +// CHECK-SAME: %[[arg:.*]]: !test.test_memref<[4, 4], f64> +// CHECK-SAME: ) -> !test.test_memref<[4, 4], f64> +func.func @custom_types_foo(%arg: !test.test_tensor<[4, 4], f64>) + -> !test.test_tensor<[4, 4], f64> { + // CHECK: %[[out:.*]] = "test.dummy_memref_op"(%[[arg]]) + %out = "test.dummy_tensor_op"(%arg) : (!test.test_tensor<[4, 4], f64>) + -> !test.test_tensor<[4, 4], f64> + // CHECK: return %[[out]] + return %out : !test.test_tensor<[4, 4], f64> +} + +// CHECK: func.func @custom_types_bar( +// CHECK-SAME: %[[arg:.*]]: !test.test_memref<[4, 4], f64> +// CHECK-SAME: ) -> !test.test_memref<[4, 8], f64> +func.func @custom_types_bar(%arg: !test.test_tensor<[4, 4], f64>) + -> !test.test_tensor<[4, 8], f64> { + // CHECK: %[[call:.*]] = call @custom_types_foo(%[[arg]]) + %call = func.call @custom_types_foo(%arg) : (!test.test_tensor<[4, 4], f64>) + -> !test.test_tensor<[4, 4], f64> + + // CHECK: %[[out:.*]] = "test.dummy_memref_op"(%[[call]]) + %out = "test.dummy_tensor_op"(%call) : (!test.test_tensor<[4, 4], f64>) + -> 
!test.test_tensor<[4, 8], f64> + + // CHECK: return %[[out]] + return %out : !test.test_tensor<[4, 8], f64> +} diff --git a/mlir/test/Dialect/ControlFlow/canonicalize.mlir b/mlir/test/Dialect/ControlFlow/canonicalize.mlir index bf69935a00bf0..17f7d28ba59fb 100644 --- a/mlir/test/Dialect/ControlFlow/canonicalize.mlir +++ b/mlir/test/Dialect/ControlFlow/canonicalize.mlir @@ -490,3 +490,147 @@ func.func @branchCondProp(%arg0: i1) { ^exit: return } + +// ----- + +/// Test that control-flow cycles are not simplified infinitely. + +// CHECK-LABEL: @cycle_2_blocks +// CHECK: cf.br ^bb1 +// CHECK: ^bb1: +// CHECK: cf.br ^bb1 +func.func @cycle_2_blocks() { + cf.br ^bb1 +^bb1: + cf.br ^bb2 +^bb2: + cf.br ^bb1 +} + +// CHECK-LABEL: @no_cycle_2_blocks +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32 +// CHECK: return %[[VAL_0]] : i32 +func.func @no_cycle_2_blocks() -> i32 { + cf.br ^bb1 +^bb1: + cf.br ^bb2 +^bb2: + cf.br ^bb3 +^bb3: + %ret = arith.constant 1 : i32 + return %ret : i32 +} + +// CHECK-LABEL: @cycle_4_blocks +// CHECK: cf.br ^bb1 +// CHECK: ^bb1: +// CHECK: cf.br ^bb1 +func.func @cycle_4_blocks() { + cf.br ^bb1 +^bb1: + cf.br ^bb2 +^bb2: + cf.br ^bb3 +^bb3: + cf.br ^bb4 +^bb4: + cf.br ^bb1 +} + +// CHECK-LABEL: @no_cycle_4_blocks +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32 +// CHECK: return %[[VAL_0]] : i32 +func.func @no_cycle_4_blocks() -> i32 { + cf.br ^bb1 +^bb1: + cf.br ^bb2 +^bb2: + cf.br ^bb3 +^bb3: + cf.br ^bb4 +^bb4: + cf.br ^bb5 +^bb5: + %ret = arith.constant 1 : i32 + return %ret : i32 +} + +// CHECK-LABEL: @delayed_3_cycle +// CHECK: cf.br ^bb1 +// CHECK: ^bb1: +// CHECK: cf.br ^bb1 +func.func @delayed_3_cycle() { + cf.br ^bb1 +^bb1: + cf.br ^bb2 +^bb2: + cf.br ^bb3 +^bb3: + cf.br ^bb4 +^bb4: + cf.br ^bb5 +^bb5: + cf.br ^bb3 +} + +// CHECK-LABEL: @cycle_1_block +// CHECK: cf.br ^bb1 +// CHECK: ^bb1: +// CHECK: cf.br ^bb1 +func.func @cycle_1_block() { + cf.br ^bb1 +^bb1: + cf.br ^bb2 +^bb2: + cf.br ^bb2 +} + +// CHECK-LABEL: @unsimplified_cycle_1 +// CHECK-SAME: %[[ARG0:.*]]: i1) { +// CHECK: cf.cond_br %[[ARG0]], ^bb1, ^bb2 +// CHECK: ^bb1: +// CHECK: cf.br ^bb2 +// CHECK: ^bb2: +// CHECK: cf.br ^bb3 +// CHECK: ^bb3: +// CHECK: cf.br ^bb3 +func.func @unsimplified_cycle_1(%c : i1) { + cf.cond_br %c, ^bb1, ^bb2 +^bb1: + cf.br ^bb2 +^bb2: + cf.br ^bb3 +^bb3: + cf.br ^bb4 +^bb4: + cf.br ^bb3 +} + +// Make sure we terminate when other cf passes can't help us. 
+ +// CHECK-LABEL: @unsimplified_cycle_2 +// CHECK-SAME: %[[ARG0:.*]]: i1) { +// CHECK: cf.cond_br %[[ARG0]], ^bb1, ^bb3 +// CHECK: ^bb1: +// CHECK: cf.br ^bb2 {A} +// CHECK: ^bb2: +// CHECK: cf.br ^bb2 {E} +// CHECK: ^bb3: +// CHECK: cf.br ^bb1 +func.func @unsimplified_cycle_2(%c : i1) { + cf.cond_br %c, ^bb6, ^bb7 +^bb6: + cf.br ^bb5 {F} +^bb5: + cf.br ^bb1 {A} +^bb1: + cf.br ^bb2 {B} +^bb2: + cf.br ^bb3 {C} +^bb3: + cf.br ^bb4 {D} +^bb4: + cf.br ^bb1 {E} +^bb7: + cf.br ^bb6 +} diff --git a/mlir/test/Dialect/GPU/mapping.mlir b/mlir/test/Dialect/GPU/mapping.mlir index 395987317a1e6..b313ab69cc001 100644 --- a/mlir/test/Dialect/GPU/mapping.mlir +++ b/mlir/test/Dialect/GPU/mapping.mlir @@ -1,4 +1,5 @@ -// RUN: mlir-opt -gpu-map-parallel-loops -split-input-file %s | FileCheck %s +// RUN: mlir-opt -gpu-map-parallel-loops -split-input-file %s | FileCheck %s --check-prefix=OUTER +// RUN: mlir-opt -gpu-map-parallel-loops="mapping-policy=innermost-first" -split-input-file %s | FileCheck %s --check-prefix=INNER func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index) { @@ -14,14 +15,23 @@ func.func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index, return } -// CHECK-LABEL: func @parallel_loop( -// CHECK: scf.parallel -// CHECK: scf.parallel -// CHECK: {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, -// CHECK-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} -// CHECK: {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, -// CHECK-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} -// CHECK-NOT: mapping +// OUTER-LABEL: func @parallel_loop( +// OUTER: scf.parallel +// OUTER: scf.parallel +// OUTER: {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// OUTER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} +// OUTER: {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// OUTER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} +// OUTER-NOT: mapping + +// INNER-LABEL: func @parallel_loop( +// INNER: scf.parallel +// INNER: scf.parallel +// INNER: {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// INNER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} +// INNER: {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// INNER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} +// INNER-NOT: mapping // ----- @@ -42,20 +52,38 @@ func.func @parallel_loop_4d(%arg0 : index, %arg1 : index, %arg2 : index, return } -// CHECK-LABEL: func @parallel_loop_4d( -// CHECK: scf.parallel -// CHECK: scf.parallel -// CHECK: scf.parallel -// CHECK: {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, -// CHECK-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, -// CHECK-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, -// CHECK-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} -// CHECK: {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, -// CHECK-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, -// CHECK-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, -// CHECK-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} -// CHECK: {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, -// CHECK-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, -// CHECK-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, -// CHECK-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} -// CHECK-NOT: mapping +// OUTER-LABEL: func @parallel_loop_4d( +// OUTER: scf.parallel +// OUTER: scf.parallel +// OUTER: scf.parallel +// OUTER: {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// 
OUTER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// OUTER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// OUTER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} +// OUTER: {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// OUTER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// OUTER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// OUTER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} +// OUTER: {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// OUTER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// OUTER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// OUTER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} +// OUTER-NOT: mapping + +// INNER-LABEL: func @parallel_loop_4d( +// INNER: scf.parallel +// INNER: scf.parallel +// INNER: scf.parallel +// INNER: {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// INNER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// INNER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// INNER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} +// INNER: {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// INNER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// INNER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// INNER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} +// INNER: {mapping = [#gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// INNER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// INNER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>, +// INNER-SAME: #gpu.loop_dim_map (d0), bound = (d0) -> (d0)>]} +// INNER-NOT: mapping diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir index 749fb634dba76..627abd0665d8c 100644 --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -1720,37 +1720,6 @@ llvm.func @foo(%arg: !llvm.ptr) { // ----- -func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) { - // expected-error@+1 {{to use im2col mode, the tensor has to be at least 3-dimensional}} - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr - return -} -// ----- - -func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) { - // expected-error@+1 {{im2col offsets must be 2 less than number of coordinates}} - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint : !llvm.ptr<3>, !llvm.ptr - return -} - -// ----- - -func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) { - // expected-error@+1 {{expects coordinates between 1 to 5 dimension}} - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[]: !llvm.ptr<3>, !llvm.ptr - return -} - -// ----- - -func.func @tma_load(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: 
i16, %off1: i16, %ctamask : i16, %cacheHint : i64, %p : i1) { - // expected-error@+1 {{expects coordinates between 1 to 5 dimension}} - nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3,%crd0,%crd1,%crd2,%crd3]: !llvm.ptr<3>, !llvm.ptr - return -} - -// ----- - // expected-error @below {{no_inline and always_inline attributes are incompatible}} llvm.func @alwaysinline_noinline() attributes { always_inline, no_inline } { llvm.return @@ -2045,3 +2014,24 @@ llvm.mlir.alias external @alias_resolver : !llvm.ptr { } // expected-error@+1 {{'llvm.mlir.ifunc' op must have a function resolver}} llvm.mlir.ifunc external @foo : !llvm.func, !llvm.ptr @alias_resolver {dso_local} + +// ----- + +llvm.func @invalid_sincos_nonhomogeneous_return_type(%f: f32) -> () { + // expected-error@+1 {{op expected result type to be an homogeneous struct with two elements matching the operand type}} + llvm.intr.sincos(%f) : (f32) -> !llvm.struct<(f32, f64)> +} + +// ----- + +llvm.func @invalid_sincos_non_struct_return_type(%f: f32) -> () { + // expected-error@+1 {{op expected result type to be an homogeneous struct with two elements matching the operand type}} + llvm.intr.sincos(%f) : (f32) -> f32 +} + +// ----- + +llvm.func @invalid_sincos_gt_2_element_struct_return_type(%f: f32) -> () { + // expected-error@+1 {{op expected result type to be an homogeneous struct with two elements matching the operand type}} + llvm.intr.sincos(%f) : (f32) -> !llvm.struct<(f32, f32, f32)> +} diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index 959bb35302b20..0bad151570029 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -29,6 +29,20 @@ func.func @rocdl_special_regs() -> i32 { llvm.return %0 : i32 } +func.func @rocdl.fmed3.scalar(%a: f32, %b: f32, %c: f32) -> f32 { + // CHECK-LABEL: rocdl.fmed3.scalar + // CHECK: %0 = rocdl.fmed3 %arg0, %arg1, %arg2 : f32 + %0 = rocdl.fmed3 %a, %b, %c : f32 + llvm.return %0 : f32 +} + +func.func @rocdl.fmed3.vector(%a: vector<4xf16>, %b: vector<4xf16>, %c: vector<4xf16>) -> vector<4xf16> { + // CHECK-LABEL: rocdl.fmed3.vector + // CHECK: %0 = rocdl.fmed3 %arg0, %arg1, %arg2 : vector<4xf16> + %0 = rocdl.fmed3 %a, %b, %c : vector<4xf16> + llvm.return %0 : vector<4xf16> +} + func.func @rocdl.barrier() { // CHECK: rocdl.barrier rocdl.barrier @@ -652,7 +666,7 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr, %stride : i16, - %numRecords : i32, + %numRecords : i64, %flags : i32) -> !llvm.ptr<8> { // CHECK-LABEL: rocdl.make.buffer.rsrc // CHECK: %{{.*}} = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr to <8> diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir index 5c5f7e861d37d..26d2d98572f47 100644 --- a/mlir/test/Dialect/Linalg/canonicalize.mlir +++ b/mlir/test/Dialect/Linalg/canonicalize.mlir @@ -1756,10 +1756,11 @@ func.func @pack_unpack(%t: tensor<16x16x?x?xf32>, %tile1: index, %tile2: index) // CHECK-SAME: %[[T:.+]]: tensor<16x16x8x8xf32> // CHECK: return %[[T]] : tensor<16x16x8x8xf32> func.func @pack_unpack(%t: tensor<16x16x8x8xf32>) -> tensor<16x16x8x8xf32> { + %cst = arith.constant 0.000000e+00 : f32 %tensor_empty = tensor.empty() : tensor<128x128xf32> %unpacked = linalg.unpack %t inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty : tensor<16x16x8x8xf32> -> tensor<128x128xf32> %tensor_empty1 = 
tensor.empty() : tensor<16x16x8x8xf32> - %packed = linalg.pack %unpacked inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x8x8xf32> + %packed = linalg.pack %unpacked padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [8, 8] into %tensor_empty1 : tensor<128x128xf32> -> tensor<16x16x8x8xf32> return %packed : tensor<16x16x8x8xf32> } diff --git a/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir b/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir index 8627fcd2576b9..152a392afe247 100644 --- a/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir +++ b/mlir/test/Dialect/Linalg/convert-conv2d-to-img2col.mlir @@ -26,6 +26,26 @@ module attributes {transform.with_named_sequence} { // ----- +// Memref semantics is not supported. +// Check that we emit an error. +func.func @negative_conv_memref(%arg0: memref<1x16x16x4xf32>, %arg1: memref<16x3x3x4xf32>, %arg2: memref<1x14x14x16xf32>) { + // expected-note@below {{when applied to this op}} + linalg.conv_2d_nhwc_fhwc {dilations = dense<1> : memref<2xi64>, strides = dense<1> : memref<2xi64> } + ins(%arg0, %arg1: memref<1x16x16x4xf32>, memref<16x3x3x4xf32>) outs(%arg2: memref<1x14x14x16xf32>) + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.conv_2d_nhwc_fhwc"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error@below {{failed to apply}} + %img2col_tensor_producer, %transformed = transform.structured.convert_conv2d_to_img2col %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + // Check that we get the proper handles for the img2col tensor producer // and the final instruction. @@ -267,6 +287,31 @@ module attributes {transform.with_named_sequence} { // ----- +// Check that the encoding on the filter (weights) tensor is propagated when applying the transform. 
+ +// CHECK: func.func @batch_nchw_conv_with_filter_encoding(%[[INPUT:.+]]: tensor<8x4x16x16xf32>, %[[FILTER:.*]]: tensor<16x4x3x3xf32, 42 : i32>, %[[OUTPUT:.*]]: tensor<8x16x14x14xf32>) +// CHECK-DAG: %[[COLLAPSED_FILTER:.+]] = tensor.collapse_shape %[[FILTER]] + // CHECK-SAME{LITERAL}: [[0], [1, 2, 3]] : tensor<16x4x3x3xf32, 42 : i32> into tensor<16x36xf32, 42 : i32> +// CHECK: %[[COL_TENSOR:.+]] = linalg.generic {{.*}} ins(%[[INPUT]] : tensor<8x4x16x16xf32>) +// CHECK: %[[MATMUL_RESULT:.+]] = linalg.generic {{.*}} ins(%[[COLLAPSED_FILTER]], %[[COL_TENSOR]] : tensor<16x36xf32, 42 : i32>, tensor<8x36x196xf32>) +func.func @batch_nchw_conv_with_filter_encoding(%arg0: tensor<8x4x16x16xf32>, %arg1: tensor<16x4x3x3xf32, 42 : i32>, %arg2: tensor<8x16x14x14xf32>) -> tensor<8x16x14x14xf32> { + %0 = linalg.conv_2d_nchw_fchw + {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> } + ins(%arg0, %arg1: tensor<8x4x16x16xf32>, tensor<16x4x3x3xf32, 42 : i32>) + outs(%arg2: tensor<8x16x14x14xf32>) -> tensor<8x16x14x14xf32> + return %0 : tensor<8x16x14x14xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.conv_2d_nchw_fchw"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1:2 = transform.structured.convert_conv2d_to_img2col %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + // CHECK: IR printer: tensor_producer // CHECK-NEXT: %[[COL_TENSOR:.+]] = linalg.generic // CHECK-SAME: affine_map<(d0, d1, d2) -> (d0, d1 floordiv 14 + d2 floordiv 12, d1 mod 14 + (d2 mod 12) floordiv 4, d2 mod 4)> @@ -290,7 +335,7 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[COLLAPSED_OUT:.+]] = tensor.collapse_shape %[[OUTPUT]] {{\[}}[0], [1, 2], [3]] : tensor<1x14x14x16xf32> into tensor<1x196x16xf32> // CHECK: %[[INIT_COL_TENSOR:.+]] = tensor.empty() : tensor<1x196x36xf32> // CHECK: %[[COL_TENSOR:.+]] = linalg.generic -// CHECK-SAME: [#[[MAP0]], #[[MAP1]]] +// CHECK-SAME: [#[[MAP0]], #[[MAP1]]], {{.*}} ins(%[[INPUT]] : tensor<1x16x16x4xf32>) outs(%[[INIT_COL_TENSOR]] : tensor<1x196x36xf32>) // CHECK: ^bb0(%[[OUT_DATA:.+]]: f32) // CHECK: linalg.yield %{{.+}} : f32 // CHECK: %[[MATMUL_RESULT:.+]] = linalg.generic @@ -327,6 +372,31 @@ module attributes {transform.with_named_sequence} { // ----- +// Check that the encoding on the filter (weights) tensor is propagated when applying the transform. 
+ +// CHECK: func.func @conv_2d_nhwc_fhwc_with_filter_encoding(%[[INPUT:.+]]: tensor<1x16x16x4xf32>, %[[FILTER:.*]]: tensor<16x3x3x4xf32, 42 : i32>, %[[OUTPUT:.*]]: tensor<1x14x14x16xf32>) +// CHECK-DAG: %[[COLLAPSED_FILTER:.+]] = tensor.collapse_shape %[[FILTER]] + // CHECK-SAME{LITERAL}: [[0], [1, 2, 3]] : tensor<16x3x3x4xf32, 42 : i32> into tensor<16x36xf32, 42 : i32> +// CHECK: %[[COL_TENSOR:.+]] = linalg.generic {{.*}} ins(%[[INPUT]] : tensor<1x16x16x4xf32>) +// CHECK: %[[MATMUL_RESULT:.+]] = linalg.generic {{.*}} ins(%[[COL_TENSOR]], %[[COLLAPSED_FILTER]] : tensor<1x196x36xf32>, tensor<16x36xf32, 42 : i32>) +func.func @conv_2d_nhwc_fhwc_with_filter_encoding(%input: tensor<1x16x16x4xf32>, %filter: tensor<16x3x3x4xf32, 42 : i32>, %out: tensor<1x14x14x16xf32>) -> tensor<1x14x14x16xf32> { + %0 = linalg.conv_2d_nhwc_fhwc + { dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> } + ins(%input, %filter: tensor<1x16x16x4xf32>, tensor<16x3x3x4xf32, 42 : i32>) + outs(%out: tensor<1x14x14x16xf32>) -> tensor<1x14x14x16xf32> + return %0 : tensor<1x14x14x16xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.conv_2d_nhwc_fhwc"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1:2 = transform.structured.convert_conv2d_to_img2col %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + // Check for signed extend when the input type is smaller than the accumulator type. // CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> diff --git a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir index a5f8d63a3e912..7a16bc0a4faee 100644 --- a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir +++ b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir @@ -1450,6 +1450,33 @@ func.func @push_unpack_in_padded_domain_out_used(%arg0: tensor<8x8x4x8xf32>, %ar // ----- +#map = affine_map<(d0, d1) -> (d0, d1)> +func.func @push_unpack_in_padded_domain_multiple_inputs(%arg0: tensor<1x4x16x16xf32>, %arg1: tensor<8x64xf32>, %arg2: tensor<8x64xf32>) -> tensor<8x64xf32> { + %0 = tensor.empty() : tensor<8x64xf32> + %unpack = linalg.unpack %arg0 inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %0 : tensor<1x4x16x16xf32> -> tensor<8x64xf32> + %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %unpack : tensor<8x64xf32>, tensor<8x64xf32>) outs(%arg2 : tensor<8x64xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %2 = arith.addf %in, %in_0 : f32 + linalg.yield %2 : f32 + } -> tensor<8x64xf32> + return %1 : tensor<8x64xf32> +} +// CHECK-LABEL: func.func @push_unpack_in_padded_domain_multiple_inputs +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]] +// CHECK-DAG: %[[POISON:.+]] = ub.poison : f32 +// CHECK: %[[PACK:.+]] = linalg.pack %[[ARG1]] padding_value(%[[POISON]] : f32) +// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [16, 16] +// CHECK: %[[ELEM:.+]] = linalg.generic +// CHECK: ins(%[[PACK]], %[[ARG0]] +// CHECK: %[[UNPACK:.+]] = linalg.unpack %[[ELEM]] +// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [16, 16] +// CHECK-SAME: into %[[ARG2]] +// CHECK: return %[[UNPACK]] + +// ----- + module { func.func @push_extract_through_generic(%arg0: tensor<128x7x128xf32>, %arg1: tensor, %arg2: 
tensor, %arg3: index) -> tensor { %extracted_slice = tensor.extract_slice %arg0[0, 0, %arg3] [128, 7, %arg3] [1, 1, 1] : tensor<128x7x128xf32> to tensor<128x7x?xf32> @@ -1473,7 +1500,7 @@ module { // CHECK: } : tensor to tensor // CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<128x5x128xbf16> // CHECK: %[[GENERIC:.+]] = linalg.generic -// CHECK-SAME: ins(%[[ARG0]], %[[PADDED]] +// CHECK-SAME: ins(%[[ARG0]], %[[PADDED]] // CHECK-SAME: outs(%[[EMPTY]] // CHECK: %[[EXTRACT:.+]] = tensor.extract_slice %3[%[[ARG3]], 0, 0] [%[[ARG3]], 5, 128] [1, 1, 1] : tensor<128x5x128xbf16> to tensor // CHECK: return %[[EXTRACT]] @@ -1492,7 +1519,7 @@ func.func @nopush_extract_through_generic_nodimexpr1(%arg0: tensor<128x7x128xf32 // CHECK-LABEL: func.func @nopush_extract_through_generic_nodimexpr1 // CHECK: %[[GENERIC:.+]] = linalg.generic -// CHECK: return %[[GENERIC]] +// CHECK: return %[[GENERIC]] // ----- @@ -1508,7 +1535,7 @@ func.func @nopush_extract_through_generic_nodimexpr2(%arg0: tensor<128x?x128xf32 // CHECK-LABEL: func.func @nopush_extract_through_generic_nodimexpr2 // CHECK: %[[GENERIC:.+]] = linalg.generic -// CHECK: return %[[GENERIC]] +// CHECK: return %[[GENERIC]] // ----- @@ -1575,7 +1602,7 @@ func.func @push_extract_through_generic_rank0_operand(%arg0: tensor<128x128xf32> // CHECK-LABEL: func.func @push_extract_through_generic_rank0_operand // CHECK: %[[GENERIC:.+]] = linalg.generic -// CHECK: %[[EXTRACT:.+]] = tensor.extract_slice %[[GENERIC]] +// CHECK: %[[EXTRACT:.+]] = tensor.extract_slice %[[GENERIC]] // CHECK: return %[[EXTRACT]] // ----- diff --git a/mlir/test/Dialect/Linalg/transform-op-tile.mlir b/mlir/test/Dialect/Linalg/transform-op-tile.mlir index 7bac850d0b7fe..0466a7ba3e2ea 100644 --- a/mlir/test/Dialect/Linalg/transform-op-tile.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-tile.mlir @@ -266,3 +266,23 @@ func.func @tile_linalg_matmul( -> tensor<128x128xf32> return %0 : tensor<128x128xf32> } + +// ----- + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{op expected number of loops to tile (0) to match number of `loops` results (1)}} + %1, %loops = transform.structured.tile_using_for %0 tile_sizes [0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} + +func.func @tile_linalg_matmul( + %arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>, %arg2: tensor<128x128xf32>) + -> tensor<128x128xf32> { + %0 = linalg.matmul ins(%arg0, %arg1: tensor<128x128xf32>, tensor<128x128xf32>) + outs(%arg2: tensor<128x128xf32>) + -> tensor<128x128xf32> + return %0 : tensor<128x128xf32> +} diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir index c09046b08e898..35f520a9f22a8 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops-with-patterns.mlir @@ -339,8 +339,8 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @test_vectorize_pack( // CHECK-SAME: %[[VAL_0:.*]]: tensor<32x8x16xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<4x1x32x16x2xf32>) -> tensor<4x1x32x16x2xf32> { -// CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[VAL_2:.*]] = 
ub.poison : f32 +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index // CHECK: %[[VAL_4:.*]] = vector.transfer_read %[[VAL_0]]{{\[}}%[[VAL_3]], %[[VAL_3]], %[[VAL_3]]], %[[VAL_2]] {in_bounds = [true, true, true]} : tensor<32x8x16xf32>, vector<32x8x16xf32> // CHECK: %[[VAL_5:.*]] = vector.shape_cast %[[VAL_4]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32> // CHECK: %[[VAL_6:.*]] = vector.transpose %[[VAL_5]], [1, 3, 0, 4, 2] : vector<32x4x2x1x16xf32> to vector<4x1x32x16x2xf32> diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir index aa86678ba405f..62bf1f55c9af2 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir @@ -1068,16 +1068,16 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[DEST:.*]]: tensor, // CHECK-SAME: %[[SRC:.*]]: tensor func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec(%dest: tensor, %src: tensor) -> tensor { - // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 - // CHECK: %[[C01:.*]] = arith.constant 0 - // CHECK: %[[C02:.*]] = arith.constant 0 + // CHECK-DAG: %[[PAD:.*]] = ub.poison : f32 + // CHECK-DAG: %[[C01:.*]] = arith.constant 0 + // CHECK-DAG: %[[C02:.*]] = arith.constant 0 // CHECK: %[[DIM4:.*]] = tensor.dim %[[SRC]], %[[C02]] : tensor // CHECK: %[[CNST14:.*]] = arith.constant 1 // CHECK: %[[DIM6:.*]] = tensor.dim %[[SRC]], %[[CNST14]] : tensor // CHECK: %[[CNST16:.*]] = arith.constant 16 : index // CHECK: %[[CNST2:.*]] = arith.constant 2 : index // CHECK: %[[MASK_READ:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[CNST16]], %[[CNST2]] : vector<2x1x[16]x2xi1> - // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} : tensor, vector<2x1x[16]x2xf32> } : vector<2x1x[16]x2xi1> -> vector<2x1x[16]x2xf32> + // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} %[[PAD]] {{.*}}: tensor, vector<2x1x[16]x2xf32> } : vector<2x1x[16]x2xi1> -> vector<2x1x[16]x2xf32> // CHECK: %[[TR:.*]] = vector.transpose %[[READ]], [0, 3, 1, 2] : vector<2x1x[16]x2xf32> to vector<2x2x1x[16]xf32> // CHECK: %[[SC:.*]] = vector.shape_cast %[[TR]] : vector<2x2x1x[16]xf32> to vector<4x[16]xf32> // CHECK: %[[MASK_WRITE:.*]] = vector.create_mask {{.*}} : vector<4x[16]xi1> @@ -1100,9 +1100,9 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[DEST:.*]]: tensor, // CHECK-SAME: %[[SRC:.*]]: tensor func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec_and_tile_size(%dest: tensor, %src: tensor) -> tensor { - // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 - // CHECK: %[[C01:.*]] = arith.constant 0 - // CHECK: %[[C02:.*]] = arith.constant 0 + // CHECK-DAG: %[[PAD:.*]] = ub.poison : f32 + // CHECK-DAG: %[[C01:.*]] = arith.constant 0 + // CHECK-DAG: %[[C02:.*]] = arith.constant 0 // CHECK: %[[DIM4:.*]] = tensor.dim %[[SRC]], %[[C02]] : tensor // CHECK: %[[C1_2:.*]] = arith.constant 1 // CHECK: %[[DIM6:.*]] = tensor.dim %[[SRC]], %[[C1_2]] : tensor @@ -1110,7 +1110,7 @@ func.func @test_vectorize_dynamic_shapes_unpack_scalable_vec_and_tile_size(%dest // CHECK: %[[DIM_2:.*]] = tensor.dim %[[SRC]], %[[C2]] : tensor // CHECK: %[[C2_1:.*]] = arith.constant 2 : index // CHECK: %[[MASK_READ:.*]] = vector.create_mask %[[DIM4]], %[[DIM6]], %[[DIM_2]], %[[C2_1]] : vector<2x1x[16]x2xi1> - // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} : tensor, vector<2x1x[16]x2xf32> } : vector<2x1x[16]x2xi1> 
-> vector<2x1x[16]x2xf32> + // CHECK: %[[READ:.*]] = vector.mask %[[MASK_READ]] {{.*}} vector.transfer_read %{{.*}} %[[PAD]] {{.*}}: tensor, vector<2x1x[16]x2xf32> } : vector<2x1x[16]x2xi1> -> vector<2x1x[16]x2xf32> // CHECK: %[[TR:.*]] = vector.transpose %[[READ]], [0, 3, 1, 2] : vector<2x1x[16]x2xf32> to vector<2x2x1x[16]xf32> // CHECK: %[[SC:.*]] = vector.shape_cast %[[TR]] : vector<2x2x1x[16]xf32> to vector<4x[16]xf32> // CHECK: %[[MASK_WRITE:.*]] = vector.create_mask {{.*}} : vector<4x[16]xi1> @@ -1138,14 +1138,14 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: %[[SRC:.*]]: tensor<8x8x32x16xf32> // CHECK-SAME: %[[DEST:.*]]: tensor<256x128xf32> func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { - // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 - // CHECK: %[[C0:.*]]= arith.constant 0 : index - // CHECK: %[[C8:.*]] = arith.constant 8 : index - // CHECK: %[[C80:.*]] = arith.constant 8 : index - // CHECK: %[[C32:.*]] = arith.constant 32 : index - // CHECK: %[[C16:.*]] = arith.constant 16 : index + // CHECK-DAG: %[[PAD:.*]] = ub.poison : f32 + // CHECK-DAG: %[[C0:.*]]= arith.constant 0 : index + // CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index + // CHECK-DAG: %[[C80:.*]] = arith.constant 8 : index + // CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index + // CHECK-DAG: %[[C16:.*]] = arith.constant 16 : index // CHECK: %[[MSK0:.*]] = vector.create_mask %[[C8]], %[[C80]], %[[C32]], %[[C16]] : vector<16x8x32x16xi1> - // CHECK: %[[READ0:.*]] = vector.mask %[[MSK0]] { vector.transfer_read %[[SRC]]{{.*}}} : vector<16x8x32x16xi1> -> vector<16x8x32x16xf32> + // CHECK: %[[READ0:.*]] = vector.mask %[[MSK0]] { vector.transfer_read %[[SRC]]{{.*}} %[[PAD]] {{.*}} : vector<16x8x32x16xi1> -> vector<16x8x32x16xf32> // CHECK: %[[TRANSP0:.*]] = vector.transpose %[[READ0]], [0, 2, 1, 3] : vector<16x8x32x16xf32> to vector<16x32x8x16xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP0]] : vector<16x32x8x16xf32> to vector<512x128xf32> // CHECK: %[[C01:.*]] = arith.constant 0 : index @@ -1171,9 +1171,9 @@ func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<2 // CHECK-SAME: %[[SRC:.*]]: tensor<8x8x32x16xf32> // CHECK-SAME: %[[DEST:.*]]: tensor<256x128xf32> func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { - // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 - // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> + // CHECK-DAG: %[[PAD:.*]] = ub.poison : f32 + // CHECK-AD: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}} %[[PAD]] {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [0, 2, 1, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32> // CHECK: %[[C00:.*]] = arith.constant 0 : index @@ -1196,9 +1196,9 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: // CHECK-SAME: %[[SRC:.*]]: tensor<8x8x32x16xf32> // CHECK-SAME: %[[DEST:.*]]: tensor<256x128xf32> func.func @test_vectorize_unpack_with_outer_perm(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { - // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 - // 
CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> + // CHECK-DAG: %[[PAD:.*]] = ub.poison : f32 + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}} %[[PAD]] {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 2, 0, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32> // CHECK: %[[C00:.*]] = arith.constant 0 : index @@ -1221,9 +1221,9 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: // CHECK-SAME: %[[SRC:.*]]: tensor<8x8x32x16xf32> // CHECK-SAME: %[[DEST:.*]]: tensor<256x128xf32> func.func @test_vectorize_unpack_no_vector_sizes(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { - // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 - // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> + // CHECK-DAG: %[[PAD:.*]] = ub.poison : f32 + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}} %[[PAD]] {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [0, 2, 1, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32> // CHECK: %[[C00:.*]] = arith.constant 0 : index @@ -1246,9 +1246,9 @@ func.func @test_vectorize_unpack_no_vector_sizes(%source: tensor<8x8x32x16xf32>, // CHECK-SAME: %[[SRC:.*]]: tensor<8x4x16x16xf32> // CHECK-SAME: %[[DEST:.*]]: tensor<64x127xf32> func.func @test_vectorize_unpack_no_vector_sizes_slice_output(%source: tensor<8x4x16x16xf32>, %dest: tensor<64x127xf32>) -> tensor<64x127xf32> { - // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 - // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}}} : tensor<8x4x16x16xf32>, vector<8x4x16x16xf32> + // CHECK-DAG: %[[PAD:.*]] = ub.poison : f32 + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}} %[[PAD]] {{.*}} : tensor<8x4x16x16xf32>, vector<8x4x16x16xf32> // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 2, 0, 3] : vector<8x4x16x16xf32> to vector<4x16x8x16xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<4x16x8x16xf32> to vector<64x128xf32> // CHECK: %[[C00:.*]] = arith.constant 0 : index @@ -1275,9 +1275,9 @@ func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf %0 = linalg.unpack %source outer_dims_perm=[1, 0] inner_dims_pos = [1] inner_tiles = [4] into %dest : tensor<4x7x4xf32> -> tensor<7x16xf32> return %0 : tensor<7x16xf32> } - // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 - // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}}} : tensor<4x7x4xf32>, vector<4x7x4xf32> + // CHECK-DAG: %[[PAD:.*]] = ub.poison : f32 + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}} %[[PAD]] {{.*}} : tensor<4x7x4xf32>, vector<4x7x4xf32> // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 0, 2] : vector<4x7x4xf32> 
to vector<7x4x4xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<7x4x4xf32> to vector<7x16xf32> // CHECK: %[[C00:.*]] = arith.constant 0 : index @@ -1308,7 +1308,7 @@ func.func @test_vectorize_pack(%src: tensor<32x8x16xf32>, %dest: tensor<4x1x32x1 %pack = linalg.pack %src outer_dims_perm = [1, 2, 0] inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %dest : tensor<32x8x16xf32> -> tensor<4x1x32x16x2xf32> return %pack : tensor<4x1x32x16x2xf32> } -// CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[CST:.*]] = ub.poison : f32 // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %[[CST]] // CHECK-SAME: {in_bounds = [true, true, true]} : tensor<32x8x16xf32>, vector<32x8x16xf32> @@ -1376,7 +1376,7 @@ func.func @test_vectorize_dynamic_pack(%src: tensor, %dest: tensor } -// CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[CST:.*]] = ub.poison : f32 // CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C1_0:.*]] = arith.constant 1 : index @@ -1417,7 +1417,7 @@ func.func @test_vectorize_pack_no_vector_sizes(%src: tensor<64x4xf32>, %dest: te %pack = linalg.pack %src outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %dest : tensor<64x4xf32> -> tensor<2x4x16x2xf32> return %pack : tensor<2x4x16x2xf32> } -// CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[CST:.*]] = ub.poison : f32 // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[READ:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %[[CST]] // CHECK-SAME: {in_bounds = [true, true]} : tensor<64x4xf32>, vector<64x4xf32> diff --git a/mlir/test/Dialect/Ptr/invalid.mlir b/mlir/test/Dialect/Ptr/invalid.mlir index cc1eeb3cb5744..83e1c880650c5 100644 --- a/mlir/test/Dialect/Ptr/invalid.mlir +++ b/mlir/test/Dialect/Ptr/invalid.mlir @@ -70,3 +70,11 @@ func.func @ptr_add_shape_mismatch(%ptrs: tensor<8x!ptr.ptr<#ptr.generic_space>>, %res = ptr.ptr_add %ptrs, %offsets : tensor<8x!ptr.ptr<#ptr.generic_space>>, tensor<4xi64> return %res : tensor<8x!ptr.ptr<#ptr.generic_space>> } + +// ----- + +func.func @ptr_diff_mismatch(%lhs: tensor<8x!ptr.ptr<#ptr.generic_space>>, %rhs: tensor<8x!ptr.ptr<#ptr.generic_space>>) -> vector<8xi64> { + // expected-error@+1 {{the result to have the same container type as the operands when operands are shaped}} + %res = ptr.ptr_diff %lhs, %rhs : tensor<8x!ptr.ptr<#ptr.generic_space>> -> vector<8xi64> + return %res : vector<8xi64> +} diff --git a/mlir/test/Dialect/Ptr/ops.mlir b/mlir/test/Dialect/Ptr/ops.mlir index 7b2254185f57c..0a906ad559e21 100644 --- a/mlir/test/Dialect/Ptr/ops.mlir +++ b/mlir/test/Dialect/Ptr/ops.mlir @@ -211,3 +211,31 @@ func.func @constant_large_address_ops() -> (!ptr.ptr<#ptr.generic_space>, !ptr.p %addr_large = ptr.constant #ptr.address<0x123456789ABCDEF0> : !ptr.ptr<#llvm.address_space<0>> return %addr_max32, %addr_large : !ptr.ptr<#ptr.generic_space>, !ptr.ptr<#llvm.address_space<0>> } + +/// Test ptr_diff operations with scalar pointers +func.func @ptr_diff_scalar_ops(%ptr1: !ptr.ptr<#ptr.generic_space>, %ptr2: !ptr.ptr<#ptr.generic_space>) -> (i64, index, i32) { + %diff_i64 = ptr.ptr_diff %ptr1, %ptr2 : !ptr.ptr<#ptr.generic_space> -> i64 + %diff_index = ptr.ptr_diff %ptr1, %ptr2 : !ptr.ptr<#ptr.generic_space> -> index + %diff_i32 = ptr.ptr_diff nuw %ptr1, %ptr2 : !ptr.ptr<#ptr.generic_space> -> i32 + return 
%diff_i64, %diff_index, %diff_i32 : i64, index, i32 +} + +/// Test ptr_diff operations with vector pointers +func.func @ptr_diff_vector_ops(%ptrs1: vector<4x!ptr.ptr<#ptr.generic_space>>, %ptrs2: vector<4x!ptr.ptr<#ptr.generic_space>>) -> (vector<4xi64>, vector<4xindex>) { + %diff_i64 = ptr.ptr_diff none %ptrs1, %ptrs2 : vector<4x!ptr.ptr<#ptr.generic_space>> -> vector<4xi64> + %diff_index = ptr.ptr_diff %ptrs1, %ptrs2 : vector<4x!ptr.ptr<#ptr.generic_space>> -> vector<4xindex> + return %diff_i64, %diff_index : vector<4xi64>, vector<4xindex> +} + +/// Test ptr_diff operations with tensor pointers +func.func @ptr_diff_tensor_ops(%ptrs1: tensor<8x!ptr.ptr<#ptr.generic_space>>, %ptrs2: tensor<8x!ptr.ptr<#ptr.generic_space>>) -> (tensor<8xi64>, tensor<8xi32>) { + %diff_i64 = ptr.ptr_diff nsw %ptrs1, %ptrs2 : tensor<8x!ptr.ptr<#ptr.generic_space>> -> tensor<8xi64> + %diff_i32 = ptr.ptr_diff nsw | nuw %ptrs1, %ptrs2 : tensor<8x!ptr.ptr<#ptr.generic_space>> -> tensor<8xi32> + return %diff_i64, %diff_i32 : tensor<8xi64>, tensor<8xi32> +} + +/// Test ptr_diff operations with 2D tensor pointers +func.func @ptr_diff_tensor_2d_ops(%ptrs1: tensor<4x8x!ptr.ptr<#ptr.generic_space>>, %ptrs2: tensor<4x8x!ptr.ptr<#ptr.generic_space>>) -> tensor<4x8xi64> { + %diff = ptr.ptr_diff %ptrs1, %ptrs2 : tensor<4x8x!ptr.ptr<#ptr.generic_space>> -> tensor<4x8xi64> + return %diff : tensor<4x8xi64> +} diff --git a/mlir/test/Dialect/SPIRV/Transforms/vce-deduction.mlir b/mlir/test/Dialect/SPIRV/Transforms/vce-deduction.mlir index 2d20ae0a13105..7dab87f8081ed 100644 --- a/mlir/test/Dialect/SPIRV/Transforms/vce-deduction.mlir +++ b/mlir/test/Dialect/SPIRV/Transforms/vce-deduction.mlir @@ -232,7 +232,7 @@ spirv.module Logical GLSL450 attributes { } } -// CHECK: requires #spirv.vce +// CHECK: requires #spirv.vce spirv.module Logical Vulkan attributes { spirv.target_env = #spirv.target_env< #spirv.vce, @@ -242,3 +242,14 @@ spirv.module Logical Vulkan attributes { spirv.ARM.GraphOutputs %arg0 : !spirv.arm.tensor<14x19xi8> } } + +// Check that extension and capability queries handle recursive types. 
+// CHECK: requires #spirv.vce +spirv.module Physical64 GLSL450 attributes { + spirv.target_env = #spirv.target_env< + #spirv.vce, + #spirv.resource_limits<>> +} { + spirv.GlobalVariable @recursive: + !spirv.ptr, StorageBuffer>)>, StorageBuffer> +} diff --git a/mlir/test/Dialect/SparseTensor/scf_1_N_conversion.mlir b/mlir/test/Dialect/SparseTensor/scf_1_N_conversion.mlir index f5d6a08b7de31..515de5502f322 100644 --- a/mlir/test/Dialect/SparseTensor/scf_1_N_conversion.mlir +++ b/mlir/test/Dialect/SparseTensor/scf_1_N_conversion.mlir @@ -86,3 +86,47 @@ func.func @while(%arg0: tensor<1024xf32, #SparseVector>, %c: i1) -> tensor<1024x } return %0: tensor<1024xf32, #SparseVector> } + +// CHECK-LABEL: func.func @index_switch( +// CHECK-SAME: %[[PRED:.*0]]: index, +// CHECK-SAME: %[[VAL_A_1:.*1]]: memref, +// CHECK-SAME: %[[VAL_A_2:.*2]]: memref, +// CHECK-SAME: %[[VAL_A_3:.*3]]: memref, +// CHECK-SAME: %[[VAL_A_4:.*4]]: !sparse_tensor.storage_specifier +// CHECK-SAME: %[[VAL_B_1:.*5]]: memref, +// CHECK-SAME: %[[VAL_B_2:.*6]]: memref, +// CHECK-SAME: %[[VAL_B_3:.*7]]: memref, +// CHECK-SAME: %[[VAL_B_4:.*8]]: !sparse_tensor.storage_specifier +// CHECK-SAME: %[[VAL_C_1:.*9]]: memref, +// CHECK-SAME: %[[VAL_C_2:.*10]]: memref, +// CHECK-SAME: %[[VAL_C_3:.*11]]: memref, +// CHECK-SAME: %[[VAL_C_4:.*12]]: !sparse_tensor.storage_specifier + +// CHECK: %[[RES:.*]]:4 = scf.index_switch %[[PRED]] +// CHECK-SAME: -> memref, memref, memref, !sparse_tensor.storage_specifier +// CHECK: case 1 { +// CHECK: scf.yield %[[VAL_A_1]], %[[VAL_A_2]], %[[VAL_A_3]], %[[VAL_A_4]] +// CHECK: case 2 { +// CHECK: scf.yield %[[VAL_B_1]], %[[VAL_B_2]], %[[VAL_B_3]], %[[VAL_B_4]] +// CHECK: default { +// CHECK: scf.yield %[[VAL_C_1]], %[[VAL_C_2]], %[[VAL_C_3]], %[[VAL_C_4]] + +// CHECK: return %[[RES]]#0, %[[RES]]#1, %[[RES]]#2, %[[RES]]#3 : +// CHECK-SAME: memref, memref, memref, !sparse_tensor.storage_specifier + +func.func @index_switch(%pred: index, %a: tensor<5xf32, #SparseVector>, + %b: tensor<5xf32, #SparseVector>, + %c: tensor<5xf32, #SparseVector>) -> tensor<5xf32, #SparseVector> { + %0 = scf.index_switch %pred -> tensor<5xf32, #SparseVector> + case 1 { + scf.yield %a : tensor<5xf32, #SparseVector> + } + case 2 { + scf.yield %b : tensor<5xf32, #SparseVector> + } + default { + scf.yield %c : tensor<5xf32, #SparseVector> + } + + return %0 : tensor<5xf32, #SparseVector> +} diff --git a/mlir/test/Dialect/Tosa/canonicalize.mlir b/mlir/test/Dialect/Tosa/canonicalize.mlir index fd2a3f1d361eb..e8525a5d2ed62 100644 --- a/mlir/test/Dialect/Tosa/canonicalize.mlir +++ b/mlir/test/Dialect/Tosa/canonicalize.mlir @@ -1104,26 +1104,6 @@ func.func @canonicalize_pad_slice_dynamic_noupdate(%arg0: tensor<1x16x?x3xf32>) // ----- -// CHECK-LABEL: @fold_log_exp -func.func @fold_log_exp(%arg0: tensor) -> tensor { - // CHECK: return %arg{{.*}} : tensor - %0 = tosa.exp %arg0 : (tensor) -> tensor - %1 = tosa.log %0 : (tensor) -> tensor - return %1 : tensor -} - -// ----- - -// CHECK-LABEL: @fold_exp_log -func.func @fold_exp_log(%arg0: tensor) -> tensor { - // CHECK: return %arg{{.*}} : tensor - %0 = tosa.log %arg0 : (tensor) -> tensor - %1 = tosa.exp %0 : (tensor) -> tensor - return %1 : tensor -} - -// ----- - // CHECK-LABEL: @fold_negate_negate func.func @fold_negate_negate(%arg0: tensor) -> tensor { // CHECK: return %arg{{.*}} : tensor diff --git a/mlir/test/Dialect/Tosa/error_if_check.mlir b/mlir/test/Dialect/Tosa/error_if_check.mlir index 290773b23193f..2f9421c43d2fb 100644 --- a/mlir/test/Dialect/Tosa/error_if_check.mlir +++ 
b/mlir/test/Dialect/Tosa/error_if_check.mlir @@ -269,20 +269,6 @@ func.func @test_cond_if_simplified_form_not_isolated_from_above(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { - %0 = "tosa.cond_if"(%arg2, %arg0, %arg1) ({ - ^bb0(%arg3: tensor, %arg4: tensor): - tosa.yield %arg3 : tensor - }, { - ^bb0(%arg3: tensor, %arg4: tensor): - tosa.yield %arg4 : tensor - }) : (tensor, tensor, tensor) -> tensor - return %0 : tensor -} - -// ----- - func.func @test_while_loop_cond_not_isolated_from_above(%arg0: tensor, %arg1: tensor, %arg2: tensor) { %0 = "tosa.const"() {values = dense<0> : tensor} : () -> tensor // expected-error@+1 {{'tosa.while_loop' op is not conformant to the TOSA specification. It requires the 'cond' region is isolated from above.}} @@ -318,22 +304,3 @@ func.func @test_while_loop_body_not_isolated_from_above(%arg0: tensor, %arg }) : (tensor) -> (tensor) return } - -// ----- - -// Check isolated while_loops are valid -func.func @test_while_loop_isolated_from_above(%arg0: tensor, %arg1: tensor) { - %0 = "tosa.const"() {values = dense<0> : tensor} : () -> tensor - %1:3 = "tosa.while_loop"(%0, %arg0, %arg1) ({ - ^bb0(%arg3: tensor, %arg4: tensor, %arg5: tensor): - %2 = "tosa.greater_equal"(%arg3, %arg5) : (tensor, tensor) -> tensor - %3 = "tosa.logical_not"(%2) : (tensor) -> tensor - "tosa.yield"(%3) : (tensor) -> () - }, { - ^bb0(%arg3: tensor, %arg4: tensor, %arg5: tensor): - %2 = "tosa.const"() {values = dense<1> : tensor} : () -> tensor - %3 = "tosa.add"(%arg3, %2) : (tensor, tensor) -> tensor - "tosa.yield"(%3, %arg4, %arg5) : (tensor, tensor, tensor) -> () - }) : (tensor, tensor, tensor) -> (tensor, tensor, tensor) - return -} diff --git a/mlir/test/Dialect/Tosa/ops.mlir b/mlir/test/Dialect/Tosa/ops.mlir index bee0eb1309572..868b7b7a93335 100644 --- a/mlir/test/Dialect/Tosa/ops.mlir +++ b/mlir/test/Dialect/Tosa/ops.mlir @@ -976,6 +976,15 @@ func.func @test_matmul_f8E5M2(%arg0: tensor<1x14x19xf8E5M2>, %arg1: tensor<1x19x return %0 : tensor<1x14x28xf16> } +// ----- +// CHECK-LABEL: test_matmul_f8E5M2_f8E4M3 +func.func @test_matmul_f8E5M2_f8E4M3(%arg0: tensor<1x14x19xf8E5M2>, %arg1: tensor<1x19x28xf8E4M3FN>) -> tensor<1x14x28xf16> { + %azp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E5M2>}> : () -> tensor<1xf8E5M2> + %bzp0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf8E4M3FN>}> : () -> tensor<1xf8E4M3FN> + %0 = tosa.matmul %arg0, %arg1, %azp0, %bzp0 : (tensor<1x14x19xf8E5M2>, tensor<1x19x28xf8E4M3FN>, tensor<1xf8E5M2>, tensor<1xf8E4M3FN>) -> tensor<1x14x28xf16> + return %0 : tensor<1x14x28xf16> +} + // ----- // CHECK-LABEL: max_pool2d_f8E5M2 func.func @test_max_pool2d_f8E5M2(%arg0: tensor<1x32x32x8xf8E5M2>) -> tensor<1x32x32x8xf8E5M2> { diff --git a/mlir/test/Dialect/Tosa/tosa-validation-valid-strict.mlir b/mlir/test/Dialect/Tosa/tosa-validation-valid-strict.mlir new file mode 100644 index 0000000000000..f05ae7f58261d --- /dev/null +++ b/mlir/test/Dialect/Tosa/tosa-validation-valid-strict.mlir @@ -0,0 +1,34 @@ +// RUN: mlir-opt %s -split-input-file -verify-diagnostics --tosa-validate="profile=pro_int,pro_fp extension=int16,int4,bf16,fp8e4m3,fp8e5m2,fft,variable,controlflow,doubleround,inexactround strict-op-spec-alignment" | FileCheck %s + +// ----- + +// CHECK-LABEL: test_cond_if_isolated_from_above +func.func @test_cond_if_isolated_from_above(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %0 = "tosa.cond_if"(%arg2, %arg0, %arg1) ({ + ^bb0(%arg3: tensor, %arg4: tensor): + tosa.yield %arg3 : tensor + }, { + ^bb0(%arg3: tensor, %arg4: tensor): 
+ tosa.yield %arg4 : tensor + }) : (tensor, tensor, tensor) -> tensor + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: test_while_loop_isolated_from_above +func.func @test_while_loop_isolated_from_above(%arg0: tensor, %arg1: tensor) { + %0 = "tosa.const"() {values = dense<0> : tensor} : () -> tensor + %1:3 = "tosa.while_loop"(%0, %arg0, %arg1) ({ + ^bb0(%arg3: tensor, %arg4: tensor, %arg5: tensor): + %2 = "tosa.greater_equal"(%arg3, %arg5) : (tensor, tensor) -> tensor + %3 = "tosa.logical_not"(%2) : (tensor) -> tensor + "tosa.yield"(%3) : (tensor) -> () + }, { + ^bb0(%arg3: tensor, %arg4: tensor, %arg5: tensor): + %2 = "tosa.const"() {values = dense<1> : tensor} : () -> tensor + %3 = "tosa.add"(%arg3, %2) : (tensor, tensor) -> tensor + "tosa.yield"(%3, %arg4, %arg5) : (tensor, tensor, tensor) -> () + }) : (tensor, tensor, tensor) -> (tensor, tensor, tensor) + return +} diff --git a/mlir/test/Dialect/Tosa/verifier.mlir b/mlir/test/Dialect/Tosa/verifier.mlir index e5571b6b4412c..0128da729136e 100644 --- a/mlir/test/Dialect/Tosa/verifier.mlir +++ b/mlir/test/Dialect/Tosa/verifier.mlir @@ -686,6 +686,48 @@ func.func @test_cond_if_incorrect_type_simple(%arg0: tensor, %arg1: tensor< return %0 : tensor } +// ----- +func.func @test_while_loop_wrong_terminator(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = tosa.while_loop (%arg2 = %arg0) : (tensor) -> tensor { + // expected-error@+2 {{'func.return' op expects parent op 'func.func'}} + %1 = tosa.greater_equal %arg1, %arg2 : (tensor, tensor) -> tensor + "func.return"(%arg2) : (tensor) -> () + } do { + ^bb0(%arg2: tensor): + %1 = "tosa.const"() <{values = dense<1> : tensor}> : () -> tensor + %2 = tosa.add %arg2, %1 : (tensor, tensor) -> tensor + tosa.yield %2 : tensor + } + return %0 : tensor +} + +// ----- +func.func @test_while_loop_missing_cond_terminator(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = tosa.while_loop (%arg2 = %arg0) : (tensor) -> tensor { + // expected-error@+1 {{block with no terminator}} + %1 = tosa.greater_equal %arg1, %arg2 : (tensor, tensor) -> tensor + } do { + ^bb0(%arg2: tensor): + %1 = "tosa.const"() <{values = dense<1> : tensor}> : () -> tensor + %2 = tosa.add %arg2, %1 : (tensor, tensor) -> tensor + tosa.yield %2 : tensor + } + return %0 : tensor +} + +// ----- +func.func @test_while_loop_missing_body_terminator(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = tosa.while_loop (%arg2 = %arg0) : (tensor) -> tensor { + %1 = tosa.greater_equal %arg1, %arg2 : (tensor, tensor) -> tensor + tosa.yield %1 : tensor + } do { + ^bb0(%arg2: tensor): + // expected-error@+1 {{block with no terminator}} + %1 = "tosa.const"() <{values = dense<1> : tensor}> : () -> tensor + } + return %0 : tensor +} + // ----- func.func @test_while_loop_input_list_mismatch_body_block_in(%arg0: tensor<10xi32>, %arg1: tensor) { diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index 08d28be3f8f73..5448976f84760 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -3411,16 +3411,86 @@ func.func @negative_from_elements_poison_constant_mix() -> vector<2xf32> { return %1 : vector<2xf32> } +// ----- + +// CHECK-LABEL: func @from_elements_float8_to_i8_conversion( +// CHECK-NEXT: %[[CST:.*]] = arith.constant dense<[0, 56, -72, 69, 127, -1]> : vector<6xi8> +// CHECK-NEXT: return %[[CST]] : vector<6xi8> +func.func @from_elements_float8_to_i8_conversion() -> vector<6xi8> { + %cst0 = llvm.mlir.constant(0.0 : f8E4M3FN) : i8 + %cst1 = 
llvm.mlir.constant(1.0 : f8E4M3FN) : i8 + %cst_neg1 = llvm.mlir.constant(-1.0 : f8E4M3FN) : i8 + %cst_pi = llvm.mlir.constant(3.14 : f8E4M3FN) : i8 + %cst_inf = llvm.mlir.constant(0x7F : f8E4M3FN) : i8 + %cst_neg_inf = llvm.mlir.constant(0xFF : f8E4M3FN) : i8 + %v = vector.from_elements %cst0, %cst1, %cst_neg1, %cst_pi, %cst_inf, %cst_neg_inf : vector<6xi8> + return %v : vector<6xi8> +} + +// CHECK-LABEL: func @from_elements_float16_to_i16_conversion( +// CHECK-NEXT: %[[CST:.*]] = arith.constant dense<[0, 15360, -17408, 16968, 31743, -1025]> : vector<6xi16> +// CHECK-NEXT: return %[[CST]] : vector<6xi16> +func.func @from_elements_float16_to_i16_conversion() -> vector<6xi16> { + %cst0 = llvm.mlir.constant(0.0 : f16) : i16 + %cst1 = llvm.mlir.constant(1.0 : f16) : i16 + %cst_neg1 = llvm.mlir.constant(-1.0 : f16) : i16 + %cst_pi = llvm.mlir.constant(3.14 : f16) : i16 + %cst_max = llvm.mlir.constant(65504.0 : f16) : i16 + %cst_min = llvm.mlir.constant(-65504.0 : f16) : i16 + %v = vector.from_elements %cst0, %cst1, %cst_neg1, %cst_pi, %cst_max, %cst_min : vector<6xi16> + return %v : vector<6xi16> +} + +// CHECK-LABEL: func @from_elements_f64_to_i64_conversion( +// CHECK-NEXT: %[[CST:.*]] = arith.constant dense<[0, 4607182418800017408, -4616189618054758400, 4614253070214989087, 9218868437227405311, -4503599627370497]> : vector<6xi64> +// CHECK-NEXT: return %[[CST]] : vector<6xi64> +func.func @from_elements_f64_to_i64_conversion() -> vector<6xi64> { + %cst0 = llvm.mlir.constant(0.0 : f64) : i64 + %cst1 = llvm.mlir.constant(1.0 : f64) : i64 + %cst_neg1 = llvm.mlir.constant(-1.0 : f64) : i64 + %cst_pi = llvm.mlir.constant(3.14 : f64) : i64 + %cst_max = llvm.mlir.constant(1.7976931348623157e+308 : f64) : i64 + %cst_min = llvm.mlir.constant(-1.7976931348623157e+308 : f64) : i64 + %v = vector.from_elements %cst0, %cst1, %cst_neg1, %cst_pi, %cst_max, %cst_min : vector<6xi64> + return %v : vector<6xi64> +} + +// ----- + +// CHECK-LABEL: func @from_elements_i1_to_i8_conversion( +// CHECK-NEXT: %[[CST:.*]] = arith.constant dense<0> : vector<1xi8> +// CHECK-NEXT: return %[[CST]] : vector<1xi8> +func.func @from_elements_i1_to_i8_conversion() -> vector<1xi8> { + %cst = llvm.mlir.constant(0: i1) : i8 + %v = vector.from_elements %cst : vector<1xi8> + return %v : vector<1xi8> +} + +// ----- + +// CHECK-LABEL: func @from_elements_index_to_i64_conversion( +// CHECK-NEXT: %[[CST:.*]] = arith.constant dense<[0, 1, 42]> : vector<3xi64> +// CHECK-NEXT: return %[[CST]] : vector<3xi64> +func.func @from_elements_index_to_i64_conversion() -> vector<3xi64> { + %cst0 = llvm.mlir.constant(0 : index) : i64 + %cst1 = llvm.mlir.constant(1 : index) : i64 + %cst42 = llvm.mlir.constant(42 : index) : i64 + %v = vector.from_elements %cst0, %cst1, %cst42 : vector<3xi64> + return %v : vector<3xi64> +} + // +--------------------------------------------------------------------------- // End of Tests for foldFromElementsToConstant // +--------------------------------------------------------------------------- // ----- -// CHECK-LABEL: func @vector_insert_const_regression( +// Not a DenseElementsAttr, don't fold. 
+ +// CHECK-LABEL: func @negative_insert_llvm_undef( // CHECK: llvm.mlir.undef // CHECK: vector.insert -func.func @vector_insert_const_regression(%arg0: i8) -> vector<4xi8> { +func.func @negative_insert_llvm_undef(%arg0: i8) -> vector<4xi8> { %0 = llvm.mlir.undef : vector<4xi8> %1 = vector.insert %arg0, %0 [0] : i8 into vector<4xi8> return %1 : vector<4xi8> diff --git a/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir index 3950e54006eec..bf4f094263545 100644 --- a/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-contract-to-matrix-intrinsics-transforms.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --convert-vector-to-llvm='vector-contract-lowering=matmul' | FileCheck %s +// RUN: mlir-opt %s --convert-vector-to-llvm='vector-contract-lowering=llvmintr' | FileCheck %s #matmat_accesses = [ affine_map<(i, j, k) -> (i, k)>, diff --git a/mlir/test/Dialect/Vector/vector-transpose-to-matrix-intrinsics-transform.mlir b/mlir/test/Dialect/Vector/vector-transpose-to-matrix-intrinsics-transform.mlir index 94689fa0dfb88..f68badaa122cd 100644 --- a/mlir/test/Dialect/Vector/vector-transpose-to-matrix-intrinsics-transform.mlir +++ b/mlir/test/Dialect/Vector/vector-transpose-to-matrix-intrinsics-transform.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --convert-vector-to-llvm='vector-transpose-lowering=flat' --split-input-file | FileCheck %s +// RUN: mlir-opt %s --convert-vector-to-llvm='vector-transpose-lowering=llvmintr' --split-input-file | FileCheck %s // CHECK-LABEL: func @transpose( func.func @transpose(%arg0: vector<2x4xf32>) -> vector<4x2xf32> { diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 13b0ed176eb0c..59fac26d18cf4 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -8,15 +8,15 @@ // CHECK-LABEL: gpu.func @store_nd_1d // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>) { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32> -// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[T0]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> +// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> +// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> // CHECK: gpu.return gpu.module @xevm_module{ gpu.func @store_nd_1d(%arg0: memref<16xf32>) { %c0 = arith.constant 0 : index %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> - xegpu.store_nd %cst, %0 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> + xegpu.store_nd %cst, %0 [%c0] : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> gpu.return } } @@ -25,14 +25,14 @@ gpu.module @xevm_module{ // CHECK-LABEL: gpu.func @store_nd_2d // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>) { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf16> -// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: 
xegpu.store_nd %[[CST]], %[[T0]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK-DAG: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[CST]], %[[T0]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ gpu.func @store_nd_2d(%arg0: memref<16x16xf16>) { %c0 = arith.constant 0 : index %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x16xf16> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %cst, %0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %cst, %0 [%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } } @@ -42,17 +42,17 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_nd_1d // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16xf32>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16xf32>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> +// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16xf32> -> vector<1xf32> +// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16xf32> -> !xegpu.tensor_desc<16xf32> +// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<1xf32>, !xegpu.tensor_desc<16xf32> gpu.module @xevm_module{ gpu.func @load_nd_1d(%arg0: memref<16xf32>, %arg1: memref<16xf32>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf32, #xegpu.layout> -> vector<16xf32> - %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> - xegpu.store_nd %1, %2 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> + %1 = xegpu.load_nd %0 [%c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf32, #xegpu.layout> -> vector<16xf32> + %2 = xegpu.create_nd_tdesc %arg1 : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> + xegpu.store_nd %1, %2 [%c0] : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> gpu.return } } @@ -60,17 +60,17 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_nd_2d // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc 
%[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK-DAG: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK-DAG: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[T1]], %[[T2]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ gpu.func @load_nd_2d(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %1, %2[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } } @@ -78,21 +78,21 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_nd_array_length // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<32xf16> // CHECK: %[[T2:.*]] = vector.shape_cast %[[T1]] : vector<32xf16> to vector<2x16x1xf16> // CHECK: %[[T3:.*]] = vector.extract %[[T2]][0] : vector<16x1xf16> from vector<2x16x1xf16> -// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK-DAG: %[[T4:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> // CHECK-DAG: %[[T5:.*]] = vector.shape_cast %[[T3]] : vector<16x1xf16> to vector<16xf16> -// CHECK: xegpu.store_nd %[[T5]], %[[T4]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[T5]], %[[T4]][%{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ gpu.func @load_nd_array_length(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> -> vector<2x16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.layout> -> 
vector<2x16x16xf16> %2 = vector.extract %1[%c0] {layout_result_0 = #xegpu.layout} : vector<16x16xf16> from vector<2x16x16xf16> - %3 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %2, %3 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %3 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %2, %3[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } } @@ -100,23 +100,23 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_dpas_store // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> -// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[T4]], %[[T5]][%{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @xevm_module{ gpu.func @load_dpas_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> + %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %5 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %4, %5 : vector<8x16xf32>, 
!xegpu.tensor_desc<8x16xf32, #xegpu.layout> + %5 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + xegpu.store_nd %4, %5[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.return } } @@ -125,27 +125,27 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @load_dpas_postop_store // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][%{{.*}}] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> // CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> // CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32> // CHECK-DAG: %[[T8:.*]] = vector.shape_cast %[[T6]] : vector<8x1xf32> to vector<8xf32> -// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]][{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[T8]], %[[T7]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK-DAG: %[[T7:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[T8]], %[[T7]][{{.*}}] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @xevm_module{ gpu.func @load_dpas_postop_store(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> + %2 = xegpu.create_nd_tdesc %arg1: memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> %4 = xegpu.dpas %1, %3 {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> %5 = math.exp %4 {layout_result_0 = #xegpu.layout} : vector<8x16xf32> - %6 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - 
xegpu.store_nd %5, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + %6 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + xegpu.store_nd %5, %6[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.return } } @@ -155,17 +155,17 @@ gpu.module @xevm_module{ // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index, // CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index, // CHECK-SAME: %[[ARG5:[0-9a-zA-Z]+]]: index, %[[ARG6:[0-9a-zA-Z]+]]: index, %[[ARG7:[0-9a-zA-Z]+]]: index) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0], shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0], shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0, shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + %2 = xegpu.create_nd_tdesc %arg1, shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %1, %2[%c0, %c0] : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } } @@ -178,21 +178,21 @@ gpu.module @xevm_module{ // CHECK-DAG: %[[BLOCK_ID_Y:.*]] = gpu.block_id y // CHECK-DAG: %[[Y_COORD:.*]] = arith.muli %[[BLOCK_ID_Y]], %c16 : index // CHECK-DAG: %[[X_COORD:.*]] = arith.muli %[[BLOCK_ID_X]], %c8 : index -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%[[X_COORD]], %[[Y_COORD]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK-NEXT: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG2]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32> +// CHECK-NEXT: %[[T3:.*]] = 
xegpu.load_nd %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> // CHECK-NEXT: %[[T4:.*]] = vector.shape_cast %[[T3]] : vector<8xf32> to vector<8x1xf32> // CHECK: %[[T5:.*]] = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %[[T4]]) -> (vector<8x1xf32>) { -// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[K]], %[[Y_COORD]]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> -// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> -// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[X_COORD]], %[[K]]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> -// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> +// CHECK-DAG: %[[T10:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16> +// CHECK-DAG: %[[T11:.*]] = xegpu.load_nd %[[T10]][%[[K]], %[[Y_COORD]]] <{packed}> : !xegpu.tensor_desc<16x16xbf16> -> vector<16xbf16> +// CHECK-DAG: %[[T12:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16> +// CHECK-DAG: %[[T13:.*]] = xegpu.load_nd %[[T12]][%[[X_COORD]], %[[K]]] : !xegpu.tensor_desc<8x16xbf16> -> vector<8xbf16> // CHECK-DAG: %[[T14:.*]] = vector.shape_cast %[[ARG4]] : vector<8x1xf32> to vector<8xf32> // CHECK-NEXT: %[[T15:.*]] = xegpu.dpas %[[T13]], %[[T11]], %[[T14]] : vector<8xbf16>, vector<16xbf16>, vector<8xf32> -> vector<8xf32> // CHECK-NEXT: %[[T16:.*]] = vector.shape_cast %[[T15]] : vector<8xf32> to vector<8x1xf32> // CHECK-NEXT: scf.yield %[[T16]] : vector<8x1xf32> // CHECK-NEXT: } // CHECK-NEXT: %[[T9:.*]] = vector.shape_cast %[[T5]] : vector<8x1xf32> to vector<8xf32> -// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> +// CHECK-NEXT: xegpu.store_nd %[[T9]], %[[T2]][%[[X_COORD]], %[[Y_COORD]]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> gpu.module @xevm_module{ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %arg2: memref<1024x1024xf32>){ %c0 = arith.constant 0 : index @@ -203,91 +203,31 @@ gpu.func @gemm(%arg0: memref<1024x1024xbf16>, %arg1: memref<1024x1024xbf16>, %ar %block_id_y = gpu.block_id y %0 = arith.muli %block_id_x, %c8 : index %1 = arith.muli %block_id_y, %c16 : index - %2 = xegpu.create_nd_tdesc %arg2[%0, %1] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x16xf32> + %2 = xegpu.create_nd_tdesc %arg2 : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + %3 = xegpu.load_nd %2[%0, %1] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf32, #xegpu.layout> -> vector<8x16xf32> %4 = scf.for %arg3 = %c0 to %c1024 step %c16 iter_args(%arg4 = %3) -> (vector<8x16xf32>) { - %5 = xegpu.create_nd_tdesc %arg0[%0, %arg3] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> - %6 = xegpu.create_nd_tdesc %arg1[%arg3, %1] : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> - %7 = xegpu.load_nd %5 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> -> vector<8x16xbf16> - %8 = xegpu.load_nd %6 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> -> vector<16x16xbf16> + %5 = xegpu.create_nd_tdesc %arg0: memref<1024x1024xbf16> -> !xegpu.tensor_desc<8x16xbf16, 
#xegpu.layout> + %6 = xegpu.create_nd_tdesc %arg1 : memref<1024x1024xbf16> -> !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> + %7 = xegpu.load_nd %5[%0, %arg3] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xbf16, #xegpu.layout> -> vector<8x16xbf16> + %8 = xegpu.load_nd %6[%arg3, %1] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xbf16, #xegpu.layout> -> vector<16x16xbf16> %9 = xegpu.dpas %7, %8, %arg4 {layout_result_0 = #xegpu.layout} : vector<8x16xbf16>, vector<16x16xbf16>, vector<8x16xf32> -> vector<8x16xf32> scf.yield %9 : vector<8x16xf32> } {layout_result_0 = #xegpu.layout} - xegpu.store_nd %4, %2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> + xegpu.store_nd %4, %2[%0, %1] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.return } } -// ----- -// CHECK-LABEL: gpu.func @update_nd_offset_1d( -// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf32>) { -// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<1xf32> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf32> -> !xegpu.tensor_desc<16xf32> -// CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32] : !xegpu.tensor_desc<16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[T1]] : vector<1xf32>, !xegpu.tensor_desc<16xf32> -gpu.module @xevm_module{ - gpu.func @update_nd_offset_1d(%arg0: memref<256xf32>) { - %c0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.layout> - %1 = xegpu.update_nd_offset %0, [%c32] : !xegpu.tensor_desc<16xf32, #xegpu.layout> - xegpu.store_nd %cst, %1 : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.layout> - gpu.return - } -} - -// ----- -// CHECK-LABEL: gpu.func @update_nd_offset_2d -// CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf32>) { -// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32> -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32> -// CHECK: %[[T1:.*]] = xegpu.update_nd_offset %[[T0]], [%c32, %c32] : !xegpu.tensor_desc<16x16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[T1]] : vector<16xf32>, !xegpu.tensor_desc<16x16xf32> -gpu.module @xevm_module{ - gpu.func @update_nd_offset_2d(%arg0: memref<256x256xf32>) { - %c0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - %1 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - xegpu.store_nd %cst, %1 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - gpu.return - } -} - // ----- // CHECK-LABEL: gpu.func @prefetch_2d // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256x256xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16> gpu.module @xevm_module{ gpu.func 
@prefetch_2d(%arg0: memref<256x256xf16>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> - gpu.return - } -} - -// ----- -// Explicitly check that update_nd_offset op's source retain layout when yielded from the warp op (PR150545) -// CHECK-LABEL: gpu.func @check_update_nd_offset_distributed_tensor_desc -// CHECK: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[16] -> -// CHECK-SAME: (!xegpu.tensor_desc<16x16xf32, #xegpu.layout>) { -// CHECK: %[[T0:.*]] = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -// CHECK: gpu.yield %[[T0]] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -// CHECK: } -// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[W]] : -// CHECK-SAME: !xegpu.tensor_desc<16x16xf32, #xegpu.layout> to !xegpu.tensor_desc<16x16xf32> {resolve_simt_type_mismatch} -// CHECK: xegpu.update_nd_offset %[[T1]], [%{{.*}}] : !xegpu.tensor_desc<16x16xf32> -gpu.module @xevm_module{ - gpu.func @check_update_nd_offset_distributed_tensor_desc() { - %c32 = arith.constant 32 : index - %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16x16xf32> - %0 = "some_op"() : () -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - %1 = xegpu.update_nd_offset %0, [%c32, %c32] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> - xegpu.store_nd %cst, %1 : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.prefetch_nd %0[%c0, %c0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } } @@ -295,13 +235,13 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @prefetch_1d // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK: xegpu.prefetch_nd %[[T0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<256xf16> -> !xegpu.tensor_desc<16xf16> +// CHECK: xegpu.prefetch_nd %[[T0]][%{{.*}}] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16> gpu.module @xevm_module{ gpu.func @prefetch_1d(%arg0: memref<256xf16>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> - xegpu.prefetch_nd %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0: memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> + xegpu.prefetch_nd %0[%c0] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf16, #xegpu.layout> gpu.return } } @@ -309,18 +249,18 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @gpu_barrier({{.*}}) { // CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK-NEXT: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16xf16> -> vector<1xf16> +// CHECK-NEXT: %[[T1:.*]] = xegpu.load_nd %[[T0]][{{.*}}] : !xegpu.tensor_desc<16xf16> -> vector<1xf16> // CHECK-NEXT: gpu.barrier // CHECK-NEXT: %[[T2:.*]] = xegpu.create_nd_tdesc 
%{{.*}} : memref<256xf16> -> !xegpu.tensor_desc<16xf16> -// CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]] : vector<1xf16>, !xegpu.tensor_desc<16xf16> +// CHECK-NEXT: xegpu.store_nd %[[T1]], %[[T2]][{{.*}}] : vector<1xf16>, !xegpu.tensor_desc<16xf16> gpu.module @xevm_module{ gpu.func @gpu_barrier(%arg0: memref<256xf16>, %arg1: memref<256xf16>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf16, #xegpu.layout> -> vector<16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> + %1 = xegpu.load_nd %0[%c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16xf16, #xegpu.layout> -> vector<16xf16> gpu.barrier - %2 = xegpu.create_nd_tdesc %arg1[%c0] : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> - xegpu.store_nd %1, %2 : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> + %2 = xegpu.create_nd_tdesc %arg1 : memref<256xf16> -> !xegpu.tensor_desc<16xf16, #xegpu.layout> + xegpu.store_nd %1, %2[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> gpu.return } } @@ -341,6 +281,7 @@ gpu.module @xevm_module{ // CHECK-NEXT: vector.from_elements %[[RED0]], %[[RED1]] : vector<2xf32> gpu.module @xevm_module{ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() { + %c0 = arith.constant 0 : index %0 = "some_def"() : () -> !xegpu.tensor_desc<1x32xf32, #xegpu.layout> %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<16x32xf32>) %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.0> : vector<32xf32> @@ -348,7 +289,7 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() { : vector<16x32xf32> to vector<32xf32> %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} : vector<32xf32> to vector<1x32xf32> - xegpu.store_nd %3, %0 : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #xegpu.layout> + xegpu.store_nd %3, %0[%c0, %c0] : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #xegpu.layout> gpu.return } } @@ -367,6 +308,7 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim0_reduction() { // CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> gpu.module @xevm_module{ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction() { + %c0 = arith.constant 0 : index %0 = "some_def"() : () -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout> %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<2x16xf32>) %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<0.0> : vector<2xf32> @@ -375,7 +317,7 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction() { %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} : vector<2xf32> to vector<2x1xf32> %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<2x1xf32> to vector<2x16xf32> - xegpu.store_nd %4, %0 : vector<2x16xf32>, !xegpu.tensor_desc<2x16xf32, #xegpu.layout> + xegpu.store_nd %4, %0[%c0, %c0] : vector<2x16xf32>, !xegpu.tensor_desc<2x16xf32, #xegpu.layout> gpu.return } } @@ -394,6 +336,7 @@ gpu.func @vector_multi_reduction_dim1_distributed_dim1_reduction() { // CHECK-NEXT: vector.from_elements %[[R0]], %[[R1]] : vector<2xf32> gpu.module @xevm_module{ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() { + %c0 = arith.constant 0 : index %0 = "some_def"() : () -> !xegpu.tensor_desc<32x1xf32, #xegpu.layout> %src 
= "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<32x16xf32>) %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<0.0> : vector<32xf32> @@ -401,7 +344,7 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() { : vector<32x16xf32> to vector<32xf32> %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} : vector<32xf32> to vector<32x1xf32> - xegpu.store_nd %3, %0 : vector<32x1xf32>, !xegpu.tensor_desc<32x1xf32, #xegpu.layout> + xegpu.store_nd %3, %0[%c0, %c0] : vector<32x1xf32>, !xegpu.tensor_desc<32x1xf32, #xegpu.layout> gpu.return } } @@ -422,6 +365,7 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim1_reduction() { // CHECK-REDUCTION-NEXT: vector.from_elements %[[W]]#2, %[[W]]#1 : vector<2xf32> gpu.module @xevm_module{ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction() { + %c0 = arith.constant 0 : index %0 = "some_def"() : () -> !xegpu.tensor_desc<16x2xf32, #xegpu.layout> %src = "some_def"() {layout_result_0 = #xegpu.layout} : () -> (vector<16x2xf32>) %acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.0> : vector<2xf32> @@ -430,7 +374,7 @@ gpu.func @vector_multi_reduction_dim0_distributed_dim0_reduction() { %3 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} : vector<2xf32> to vector<1x2xf32> %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<1x2xf32> to vector<16x2xf32> - xegpu.store_nd %4, %0 : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.layout> + xegpu.store_nd %4, %0[%c0, %c0] : vector<16x2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.layout> gpu.return } } @@ -537,9 +481,9 @@ gpu.module @xevm_module{ %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> : vector<16xf16> %ptr = memref.extract_aligned_pointer_as_index %arg0 : memref<256x256xf16> -> index %ptr_i64 = arith.index_cast %ptr : index to i64 - %tdesc = xegpu.create_nd_tdesc %ptr_i64[%c0], shape: [16], strides: [16] : i64 + %tdesc = xegpu.create_nd_tdesc %ptr_i64, shape: [16], strides: [16] : i64 -> !xegpu.tensor_desc<16xf16, #xegpu.layout> - xegpu.store_nd %cst, %tdesc : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> + xegpu.store_nd %cst, %tdesc[%c0] : vector<16xf16>, !xegpu.tensor_desc<16xf16, #xegpu.layout> gpu.return } } @@ -549,7 +493,7 @@ gpu.module @xevm_module{ // CHECK-LABEL: gpu.func @vector_transpose( // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<2xf32> // CHECK: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32> -// CHECK: xegpu.store_nd %[[CST]], %[[DEST]] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32> +// CHECK: xegpu.store_nd %[[CST]], %[[DEST]][{{.*}}] : vector<2xf32>, !xegpu.tensor_desc<2x16xf32> gpu.module @xevm_module{ gpu.func @vector_transpose(%arg0: memref<2x16xf32>) { %cst = arith.constant {layout_result_0 = #xegpu.layout} dense<1.000000e+00> @@ -557,9 +501,9 @@ gpu.module @xevm_module{ %c0 = arith.constant 0 : index %transpose = vector.transpose %cst, [1, 0] {layout_result_0 = #xegpu.layout} : vector<16x2xf32> to vector<2x16xf32> - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<2x16xf32> + %0 = xegpu.create_nd_tdesc %arg0 : memref<2x16xf32> -> !xegpu.tensor_desc<2x16xf32, #xegpu.layout> - xegpu.store_nd %transpose, %0 : vector<2x16xf32>, + xegpu.store_nd %transpose, %0[%c0, %c0] : vector<2x16xf32>, !xegpu.tensor_desc<2x16xf32, #xegpu.layout> gpu.return } @@ -570,7 +514,7 @@ gpu.module @xevm_module{ // CHECK: %[[CAST:.*]] = vector.bitcast 
%{{.*}} : vector<4x2xi8> to vector<4x1xi16> // CHECK-NEXT: %[[DEST:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16> // CHECK-NEXT: %[[T0:.*]] = vector.shape_cast %[[CAST]] : vector<4x1xi16> to vector<4xi16> -// CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16> +// CHECK-NEXT: xegpu.store_nd %[[T0]], %[[DEST]][{{.*}}] : vector<4xi16>, !xegpu.tensor_desc<4x16xi16> gpu.module @xevm_module{ gpu.func @vector_bitcast(%arg0: memref<4x16xi16>) { %cst = "some_op"() {layout_result_0 = #xegpu.layout} @@ -578,9 +522,9 @@ gpu.module @xevm_module{ %bitcast = vector.bitcast %cst {layout_result_0 = #xegpu.layout} : vector<4x32xi8> to vector<4x16xi16> %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<4x16xi16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<4x16xi16> -> !xegpu.tensor_desc<4x16xi16, #xegpu.layout> - xegpu.store_nd %bitcast, %0 : vector<4x16xi16>, + xegpu.store_nd %bitcast, %0[%c0, %c0] : vector<4x16xi16>, !xegpu.tensor_desc<4x16xi16, #xegpu.layout> gpu.return } @@ -589,10 +533,10 @@ gpu.module @xevm_module{ // ----- // CHECK-LABEL: gpu.func @mma_transpose_b( // CHECK: %[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x8xi32>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> -// CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> -// CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> -// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]] <{transpose = array}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> +// CHECK-DAG: %[[ADESC:.*]] = xegpu.create_nd_tdesc %[[ARG0]] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> +// CHECK-DAG: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[ARG1]] : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32> +// CHECK-DAG: %[[A:.*]] = xegpu.load_nd %[[ADESC]][%{{.*}}] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK-DAG: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}] <{transpose = array}> : !xegpu.tensor_desc<16x8xi32> -> vector<8xi32> // CHECK-NEXT: %[[BCAST0:.*]] = vector.shape_cast %[[B]] : vector<8xi32> to vector<1x8xi32> // CHECK-NEXT: %[[BCAST1:.*]] = vector.bitcast %[[BCAST0]] : vector<1x8xi32> to vector<1x16xf16> // CHECK-NEXT: %[[BCAST2:.*]] = vector.shape_cast %[[BCAST1]] : vector<1x16xf16> to vector<16xf16> @@ -600,13 +544,13 @@ gpu.module @xevm_module{ gpu.module @xevm_module{ gpu.func @mma_transpose_b(%arg0: memref<8x16xf16>, %arg1: memref<16x8xi32>, %arg2: memref<8x16xf32>) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0] : memref<8x16xf16> + %0 = xegpu.create_nd_tdesc %arg0 : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.layout> - %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} + %1 = xegpu.load_nd %0[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<8x16xf16, #xegpu.layout> -> vector<8x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<16x8xi32> + %2 = xegpu.create_nd_tdesc %arg1 : memref<16x8xi32> -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout> - %3 = xegpu.load_nd %2 {layout_result_0 = #xegpu.layout} + %3 = xegpu.load_nd %2[%c0, %c0] {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x8xi32, #xegpu.layout> -> vector<16x8xi32> %4 = vector.bitcast %3 {layout_result_0 = #xegpu.layout} : vector<16x8xi32> to 
vector<16x16xf16> @@ -614,9 +558,9 @@ gpu.module @xevm_module{ : vector<16x16xf16> to vector<16x16xf16> %6 = xegpu.dpas %1, %5 {layout_result_0 = #xegpu.layout} : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> - %7 = xegpu.create_nd_tdesc %arg2[%c0, %c0] : memref<8x16xf32> + %7 = xegpu.create_nd_tdesc %arg2 : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.layout> - xegpu.store_nd %6, %7 : vector<8x16xf32>, + xegpu.store_nd %6, %7[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.layout> gpu.return diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir index 547c7355e00c6..b73bc69393dab 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir @@ -7,10 +7,8 @@ gpu.module @test { //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]] //CHECK: [[c32:%.+]] = arith.constant 32 : index //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]] - //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]] + //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> @@ -23,10 +21,8 @@ gpu.module @test { //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]] //CHECK: [[c32:%.+]] = arith.constant 32 : index //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]] - //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[MODY:%.+]] = index.remu [[Y]], [[c128]] + //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns-no-desc-offsets.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns-no-desc-offsets.mlir new file mode 100644 index 0000000000000..6eee5a544e3f8 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns-no-desc-offsets.mlir @@ -0,0 +1,61 @@ +// RUN: mlir-opt --test-xegpu-unrolling-patterns -split-input-file %s | FileCheck %s + +gpu.module @xevm_test { + + // CHECK-LABEL: create_nd_tdesc + // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> + // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK: [[cast:%.+]] = builtin.unrealized_conversion_cast + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32> + // CHECK-SAME: to !xegpu.tensor_desc<24x32xf32, #xegpu.layout> {__xegpu_blocking_tile_shape__ = array, __xegpu_blocking_unpack__} + gpu.func @create_nd_tdesc(%src: memref<24x32xf32>) -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> { + %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + gpu.return %tdesc : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + } + +//----- + // CHECK-LABEL: load_nd + // CHECK-SAME: [[arg0:%.+]]: memref<256x318xf32> + // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<256x318xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-COUNT-6: [[ld:%.+]] = xegpu.load_nd {{.*}}[{{.*}}] : 
!xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + // CHECK-COUNT-6: [[insert:%.+]] = vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<24x32xf32> + gpu.func @load_nd(%src: memref<256x318xf32>) -> vector<24x32xf32> { + %tdesc = xegpu.create_nd_tdesc %src : memref<256x318xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %ld = xegpu.load_nd %tdesc[8, 16]: !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> + gpu.return %ld : vector<24x32xf32> + } + +//----- + // CHECK-LABEL: load_nd_store_nd + // CHECK-SAME: [[arg0:%.+]]: memref<256x318xf32> + // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<256x318xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-COUNT-6: [[data:%.+]] = xegpu.load_nd {{.*}}[{{.*}}] : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + // CHECK-COUNT-6: xegpu.store_nd {{.*}}[{{.*}}] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + gpu.func @load_nd_store_nd(%src: memref<256x318xf32>) { + %tdesc = xegpu.create_nd_tdesc %src : memref<256x318xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %ld = xegpu.load_nd %tdesc[8, 16]: !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> + xegpu.store_nd %ld, %tdesc[0, 0] : vector<24x32xf32>, !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + gpu.return + } + +//----- + // CHECK-LABEL: prefetch_nd_tdesc + // CHECK-SAME: [[arg0:%.+]]: memref<24x32xf32> + // CHECK: [[tdesc:%.+]] = xegpu.create_nd_tdesc [[arg0]] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK-COUNT-6: xegpu.prefetch_nd {{.*}}[{{.*}}] : !xegpu.tensor_desc<8x16xf32> + gpu.func @prefetch_nd_tdesc(%src: memref<24x32xf32>) { + %tdesc = xegpu.create_nd_tdesc %src : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + xegpu.prefetch_nd %tdesc[8, 16] : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + gpu.return + } + +//----- + + // CHECK-LABEL: load_nd_offsets_at_both_places + // CHECK-COUNT-2: builtin.unrealized_conversion_cast + gpu.func @load_nd_offsets_at_both_places(%src: memref<256x318xf32>) -> vector<24x32xf32> { + %tdesc = xegpu.create_nd_tdesc %src[16, 8] : memref<256x318xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %ld = xegpu.load_nd %tdesc[8, 16]: !xegpu.tensor_desc<24x32xf32, #xegpu.layout> -> vector<24x32xf32> + gpu.return %ld : vector<24x32xf32> + } +} diff --git a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir index 6999da5d222fe..dbc52b8a98894 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-unroll-patterns.mlir @@ -210,6 +210,27 @@ gpu.module @test { gpu.return %ld : vector<32xf32> } +//----- + + + // CHECK-LABEL: load_with_offsets + // CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK-COUNT-2: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16xf32> + gpu.func @load_with_offsets(%src: ui64) -> vector<32xf32> { + %cst = arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + + %c17 = arith.constant 17: index + %mask = vector.create_mask %c17: vector<32xi1> + %ld = xegpu.load %src[%cst], %mask {chunk_size = 1, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32xf32> + + gpu.return %ld : vector<32xf32> + } + //----- // CHECK-LABEL: prefetch @@ -254,6 +275,28 @@ gpu.module 
@test { gpu.return } + + //----- + + // CHECK-LABEL: store_with_offsets + // CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK-COUNT-2: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint}> : vector<16xf32>, ui64, vector<16xindex>, vector<16xi1> + gpu.func @store_with_offsets(%src: ui64) { + %cst = arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + + %c17 = arith.constant 17: index + %mask = vector.create_mask %c17: vector<32xi1> + + %st_vec = arith.constant dense<1023.0>: vector<32xf32> + xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 1, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : vector<32xf32>, ui64, vector<32xindex>, vector<32xi1> + + gpu.return + } //----- // CHECK-LABEL: create_tdesc_step_chunk @@ -319,6 +362,29 @@ gpu.module @test { gpu.return %ld : vector<32x4xf32> } +//----- + // CHECK-LABEL: load_with_offsets_chunk + // CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK: [[cst:%.+]] = arith.constant dense<0.000000e+00> : vector<32x4xf32> + // CHECK: [[cst0:%.+]] = arith.constant dense<[130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250]> : vector<16xindex> + // CHECK: [[cst1:%.+]] = arith.constant dense<[2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122]> : vector<16xindex> + // CHECK: [[cst2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex> + // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex> + // CHECK-COUNT-4: xegpu.load {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint}> : ui64, vector<16xindex>, vector<16xi1> -> vector<16x2xf32> + gpu.func @load_with_offsets_chunk(%src: ui64) -> vector<32x4xf32> { + %cst = arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + + %c17 = arith.constant 17: index + %mask = vector.create_mask %c17: vector<32xi1> + %ld = xegpu.load %src[%cst], %mask {chunk_size = 4, layout_result_0 = #xegpu.layout, l1_hint = #xegpu.cache_hint} : ui64, vector<32xindex>, vector<32xi1> -> vector<32x4xf32> + gpu.return %ld : vector<32x4xf32> + } + //----- // CHECK-LABEL: store_chunk // CHECK-SAME: [[arg0:%.+]]: ui64 @@ -342,6 +408,31 @@ gpu.module @test { gpu.return } +//----- + // CHECK-LABEL: store_with_offsets_chunk + // CHECK-SAME: [[arg0:%.+]]: ui64 + // CHECK: [[cst:%.+]] = arith.constant dense<1.023000e+03> : vector<16x2xf32 + // CHECK: [[cst0:%.+]] = arith.constant dense<[130, 138, 146, 154, 162, 170, 178, 186, 194, 202, 210, 218, 226, 234, 242, 250]> : vector<16xindex> + // CHECK: [[cst1:%.+]] = arith.constant dense<[2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122]> : vector<16xindex> + // CHECK: [[cst2:%.+]] = arith.constant dense<[128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248]> : vector<16xindex> + // CHECK: [[cst3:%.+]] = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex> + // CHECK-COUNT-4: xegpu.store {{.*}}[{{.*}}], {{.*}} <{chunk_size = 2 : i64, l1_hint = #xegpu.cache_hint}> : vector<16x2xf32>, ui64, vector<16xindex>, vector<16xi1> + gpu.func @store_with_offsets_chunk(%src: ui64) { + %cst = 
arith.constant dense<[ + 0, 8, 16, 24, 32, 40, 48, 56, + 64, 72, 80, 88, 96, 104, 112, 120, + 128, 136, 144, 152, 160, 168, 176, 184, + 192, 200, 208, 216, 224, 232, 240, 248 + ]> : vector<32xindex> + + %c17 = arith.constant 17: index + %mask = vector.create_mask %c17: vector<32xi1> + + %st_vec = arith.constant dense<1023.>: vector<32x4xf32> + xegpu.store %st_vec, %src[%cst], %mask {chunk_size = 4, layout = #xegpu.layout, l1_hint = #xegpu.cache_hint} : vector<32x4xf32>, ui64, vector<32xindex>, vector<32xi1> + gpu.return + } + //----- // CHECK-LABEL: prefetch_chunk // CHECK-SAME: [[arg0:%.+]]: ui64 diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir index e5cc65e6bd3d7..d2d250cbe0f66 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir @@ -27,12 +27,10 @@ gpu.module @test_round_robin_assignment { //CHECK: [[LX:%.+]] = index.mul [[IdX]], [[C64]] //CHECK: [[C0:%.+]] = arith.constant 0 : index //CHECK: [[C0_1:%.+]] = arith.constant 0 : index - //CHECK: [[ADDY:%.+]] = arith.addi [[LY]], [[C0]] : index - //CHECK: [[ADDX:%.+]] = arith.addi [[LX]], [[C0_1]] : index //CHECK: [[C128:%.+]] = arith.constant 128 : index - //CHECK: [[offY:%.+]] = index.remu [[ADDY]], [[C128]] + //CHECK: [[offY:%.+]] = index.remu [[LY]], [[C128]] //CHECK: [[C64_2:%.+]] = arith.constant 64 : index - //CHECK: [[offX:%.+]] = index.remu [[ADDX]], [[C64_2]] + //CHECK: [[offX:%.+]] = index.remu [[LX]], [[C64_2]] //CHECK: xegpu.create_nd_tdesc [[ARG_0]][[[offY]], [[offX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32> %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir index 6ff7a94d678a3..dce73dee507e1 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir @@ -82,4 +82,20 @@ gpu.module @test_distribution { : vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32> gpu.return } + + // CHECK-LABEL: vector_reduce_dim_1 + gpu.func @vector_reduce_dim_1(%src: memref<256x64xf32>) { + // CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32> + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<1.0> : vector<256xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32> + -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x64xf32, #xegpu.layout> + -> vector<256x64xf32> + // CHECK-COUNT-2: vector.multi_reduction , {{.*}}, %[[CST]] [1] : vector<16x64xf32> to vector<16xf32> + // CHECK-NOT: vector.multi_reduction + %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] + : vector<256x64xf32> to vector<256xf32> + gpu.return + } } diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 3478a9b91da5f..03c63861705d9 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -325,12 +325,10 @@ gpu.module @test_distribution { //CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]] //CHECK: [[c0:%.+]] = arith.constant 0 : index //CHECK: [[c0_1:%.+]] = arith.constant 0 : index - //CHECK: [[l_off_y_0:%.+]] = arith.addi [[l_off_y]], [[c0]] 
: index - //CHECK: [[l_off_x_0:%.+]] = arith.addi [[l_off_x]], [[c0_1]] : index //CHECK: [[c64:%.+]] = arith.constant 64 : index - //CHECK: [[off_y:%.+]] = index.remu [[l_off_y_0]], [[c64]] + //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]] //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[off_x:%.+]] = index.remu [[l_off_x_0]], [[c128]] + //CHECK: [[off_x:%.+]] = index.remu [[l_off_x]], [[c128]] //CHECK: xegpu.load_matrix [[mdesc]][[[off_y]], [[off_x]]] <{layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32>, index, index -> vector<32x32xf32> %0 = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout}>: !xegpu.mem_desc<64x128xf32> -> vector<64x128xf32> @@ -349,13 +347,11 @@ gpu.module @test_distribution { //CHECK: [[id_y:%.+]] = affine.apply #map()[[[sgid]]] //CHECK: [[id_x:%.+]] = affine.apply #map1()[[[sgid]]] //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_y_0:%.+]] = index.mul [[id_y]], [[c32]] + //CHECK: [[l_off_y:%.+]] = index.mul [[id_y]], [[c32]] //CHECK: [[c32_1:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_x_0:%.+]] = index.mul [[id_x]], [[c32_1]] + //CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]] //CHECK: [[c0:%.+]] = arith.constant 0 : index //CHECK: [[c0_2:%.+]] = arith.constant 0 : index - //CHECK: [[l_off_y:%.+]] = arith.addi [[l_off_y_0]], [[c0]] : index - //CHECK: [[l_off_x:%.+]] = arith.addi [[l_off_x_0]], [[c0_2]] : index //CHECK: [[c64:%.+]] = arith.constant 64 : index //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]] //CHECK: [[c128:%.+]] = arith.constant 128 : index @@ -367,16 +363,55 @@ gpu.module @test_distribution { gpu.return } + // CHECK-LABEL: @vector_reduce_dim_0 + gpu.func @vector_reduce_dim_0(%src: memref<4x128xf32>) { + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<1.0> : vector<128xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<4x128xf32> + -> !xegpu.tensor_desc<4x128xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<4x128xf32, #xegpu.layout> + -> vector<4x128xf32> + // CHECK: vector.multi_reduction , {{.*}}, {{.*}} [0] : vector<4x4xf32> to vector<4xf32> + %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] + : vector<4x128xf32> to vector<128xf32> + gpu.return + } + + // CHECK-LABEL: @vector_reduce_dim_1 + gpu.func @vector_reduce_dim_1(%src: memref<256x64xf32>) { + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} dense<1.0> : vector<256xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32> + -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x64xf32, #xegpu.layout> + -> vector<256x64xf32> + // CHECK: vector.multi_reduction , {{.*}}, {{.*}} [1] : vector<16x64xf32> to vector<16xf32> + %reduce = vector.multi_reduction , %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] + : vector<256x64xf32> to vector<256xf32> + gpu.return + } + + // CHECK-LABEL: @vector_reduce_4D + gpu.func @vector_reduce_4D(%src: ui64) { + %cst_acc = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [3]>} dense<0.0> : vector<4x2x6xf16> + %offset = arith.constant {layout_result_0 = #xegpu.layout} dense<0> : vector<4x2x6x32xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<4x2x6x32xi1> + %load = xegpu.load %src[%offset], %mask 
{layout_result_0 = #xegpu.layout} : ui64, vector<4x2x6x32xindex>, vector<4x2x6x32xi1> -> vector<4x2x6x32xf16> + // CHECK: vector.multi_reduction , {{.*}}, {{.*}} [3] : vector<1x1x1x32xf16> to vector<1x1x1xf16> + %reduce = vector.multi_reduction , %load, %cst_acc {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [3]>} [3] + : vector<4x2x6x32xf16> to vector<4x2x6xf16> + gpu.return + } + // CHECK-LABEL: vector_step_op gpu.func @vector_step_op_slice_attr() { //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index //CHECK-DAG: [[IDY:%.+]] = affine.apply #map2()[[[sgId]]] //CHECK-DAG: [[c32:%.+]] = arith.constant 32 : index - //CHECK-DAG: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]] + //CHECK-DAG: [[LY:%.+]] = index.mul [[IDY]], [[c32]] //CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index - //CHECK-DAG: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index //CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index - //CHECK-DAG: [[MODY:%.+]] = index.remu [[Y]], [[c128]] + //CHECK-DAG: [[MODY:%.+]] = index.remu [[LY]], [[c128]] //CHECK-DAG: [[BASE:%.+]] = vector.step : vector<32xindex> //CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> @@ -390,9 +425,8 @@ gpu.module @test_distribution { //CHECK-DAG: [[c8:%.+]] = arith.constant 8 : index //CHECK-DAG: [[LOCALY:%.+]] = index.mul [[sgId]], [[c8]] //CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index - //CHECK-DAG: [[Y:%.+]] = arith.addi [[LOCALY]], [[c0]] : index //CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index - //CHECK-DAG: [[MODY:%.+]] = index.remu [[Y]], [[c128]] + //CHECK-DAG: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] //CHECK-DAG: [[BASE:%.+]] = vector.step : vector<8xindex> //CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex> //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir index c0fb373835e3d..e83229e3a3995 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir @@ -14,12 +14,10 @@ gpu.module @test_1_1_assignment { //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]] //CHECK: [[C0:%.+]] = arith.constant 0 : index //CHECK: [[C0_1:%.+]] = arith.constant 0 : index - //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index - //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_1]] : index //CHECK: [[C256:%.+]] = arith.constant 256 : index - //CHECK: [[Y:%.+]] = index.remu [[UY]], [[C256]] + //CHECK: [[Y:%.+]] = index.remu [[LY]], [[C256]] //CHECK: [[C128:%.+]] = arith.constant 128 : index - //CHECK: [[X:%.+]] = index.remu [[UX]], [[C128]] + //CHECK: [[X:%.+]] = index.remu [[LX]], [[C128]] //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> @@ -37,17 +35,13 @@ gpu.module @test_1_1_assignment { //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]] //CHECK: [[C0:%.+]] = arith.constant 0 : index //CHECK: [[C0_2:%.+]] = arith.constant 0 : index - //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index - //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_2]] : index //CHECK: [[C256:%.+]] = arith.constant 256 : index - //CHECK: [[MODY:%.+]] = index.remu [[UY]], [[C256]] + //CHECK: [[MODY:%.+]] = index.remu [[LY]], [[C256]] //CHECK: [[C128:%.+]] = arith.constant 128 : index - 
//CHECK: [[MODX:%.+]] = index.remu [[UX]], [[C128]] + //CHECK: [[MODX:%.+]] = index.remu [[LX]], [[C128]] //CHECK: [[C0_3:%.+]] = arith.constant 0 : index - //CHECK: [[Y:%.+]] = index.add [[MODY]], [[C0_3]] //CHECK: [[C0_4:%.+]] = arith.constant 0 : index - //CHECK: [[X:%.+]] = index.add [[MODX]], [[C0_4]] - //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[Y]], [[X]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> + //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[MODY]], [[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> %tdesc = xegpu.create_nd_tdesc %src[1, 0, 0] : memref<3x256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> gpu.return diff --git a/mlir/test/Examples/standalone/lit.local.cfg b/mlir/test/Examples/standalone/lit.local.cfg index 3b12dcbd99e83..ac03503e46ea3 100644 --- a/mlir/test/Examples/standalone/lit.local.cfg +++ b/mlir/test/Examples/standalone/lit.local.cfg @@ -7,7 +7,13 @@ config.substitutions.append(("%cmake_exe", config.host_cmake)) config.substitutions.append(("%cmake_generator", config.host_cmake_generator)) config.substitutions.append(("%host_cxx", config.host_cxx)) config.substitutions.append(("%host_cc", config.host_cc)) +config.substitutions.append(("%hostc_compiler_launcher", config.host_c_compiler_launcher)) +config.substitutions.append(("%hostcxx_compiler_launcher", config.host_cxx_compiler_launcher)) config.substitutions.append(("%enable_libcxx", config.enable_libcxx)) config.substitutions.append(("%mlir_cmake_dir", config.mlir_cmake_dir)) +config.substitutions.append(("%mlir_obj_root", config.mlir_obj_root)) config.substitutions.append(("%llvm_use_linker", config.llvm_use_linker)) config.substitutions.append(("%cmake_build_type", config.cmake_build_type)) + +if not config.llvm_shared_libs_build: + config.available_features.add("non-shared-libs-build") diff --git a/mlir/test/Examples/standalone/test.wheel.toy b/mlir/test/Examples/standalone/test.wheel.toy new file mode 100644 index 0000000000000..17d8cb5b246c9 --- /dev/null +++ b/mlir/test/Examples/standalone/test.wheel.toy @@ -0,0 +1,32 @@ +# There's no real issue with windows here, it's just that some CMake generated paths for targets end up being longer +# than 255 chars when combined with the fact that pip wants to install into a tmp directory buried under +# C/Users/ContainerAdministrator/AppData/Local/Temp. 
+# UNSUPPORTED: target={{.*(windows).*}} +# REQUIRES: non-shared-libs-build + +# RUN: export CMAKE_BUILD_TYPE=%cmake_build_type +# RUN: export CMAKE_CXX_COMPILER=%host_cxx +# RUN: export CMAKE_CXX_COMPILER_LAUNCHER=%hostcxx_compiler_launcher +# RUN: export CMAKE_C_COMPILER=%host_cc +# RUN: export CMAKE_C_COMPILER_LAUNCHER=%hostc_compiler_launcher +# RUN: export CMAKE_GENERATOR=%cmake_generator +# RUN: export LLVM_USE_LINKER=%llvm_use_linker +# RUN: export MLIR_DIR="%mlir_cmake_dir" + +# RUN: %python -m pip wheel "%mlir_src_root/examples/standalone" -w "%mlir_obj_root/wheelhouse" -v | tee %t + +# RUN: rm -rf "%mlir_obj_root/standalone-python-bindings-install" +# RUN: %python -m pip install standalone_python_bindings -f "%mlir_obj_root/wheelhouse" --target "%mlir_obj_root/standalone-python-bindings-install" -v | tee -a %t + +# RUN: export PYTHONPATH="%mlir_obj_root/standalone-python-bindings-install" +# RUN: %python "%mlir_src_root/examples/standalone/test/python/smoketest.py" nanobind | tee -a %t + +# RUN: FileCheck --input-file=%t %s + +# CHECK: Successfully built standalone-python-bindings + +# CHECK: module { +# CHECK: %[[C2:.*]] = arith.constant 2 : i32 +# CHECK: %[[V0:.*]] = standalone.foo %[[C2]] : i32 +# CHECK: } + diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-arith-expand-truncf-extf.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-arith-expand-truncf-extf.mlir index 9c310d80d4c2d..f2970618d5b6e 100644 --- a/mlir/test/Integration/Dialect/Arith/CPU/test-arith-expand-truncf-extf.mlir +++ b/mlir/test/Integration/Dialect/Arith/CPU/test-arith-expand-truncf-extf.mlir @@ -28,7 +28,18 @@ func.func @entry() { %zero = arith.constant 0.0 : f32 %half = arith.constant 0.5 : f32 %one = arith.constant 1.0 : f32 + %oneAndAHalf = arith.constant 1.5 : f32 + %two = arith.constant 2.0 : f32 + %three = arith.constant 3.0 : f32 + %four = arith.constant 4.0 : f32 %max = arith.constant 6.0 : f32 + %minZero = arith.constant -0.0 : f32 + %minHalf = arith.constant -0.5 : f32 + %minOne = arith.constant -1.0 : f32 + %minOneAndAHalf = arith.constant -1.5 : f32 + %minTwo = arith.constant -2.0 : f32 + %minThree = arith.constant -3.0 : f32 + %minFour = arith.constant -4.0 : f32 %min = arith.constant -6.0 : f32 %lowerThanMin = arith.constant -1000000.0 : f32 %higherThanMax = arith.constant 1000000.0 : f32 @@ -41,8 +52,28 @@ func.func @entry() { func.call @check_truncf(%half) : (f32) -> () // CHECK: 2 func.call @check_truncf(%one) : (f32) -> () + // CHECK: 3 + func.call @check_truncf(%oneAndAHalf) : (f32) -> () + // CHECK: 4 + func.call @check_truncf(%two) : (f32) -> () + // CHECK: 5 + func.call @check_truncf(%three) : (f32) -> () + // CHECK: 6 + func.call @check_truncf(%four) : (f32) -> () // CHECK: 7 func.call @check_truncf(%max) : (f32) -> () + // CHECK: 9 + func.call @check_truncf(%minHalf) : (f32) -> () + // CHECK: 10 + func.call @check_truncf(%minOne) : (f32) -> () + // CHECK: 11 + func.call @check_truncf(%minOneAndAHalf) : (f32) -> () + // CHECK: 12 + func.call @check_truncf(%minTwo) : (f32) -> () + // CHECK: 13 + func.call @check_truncf(%minThree) : (f32) -> () + // CHECK: 14 + func.call @check_truncf(%minFour) : (f32) -> () // CHECK: 15 func.call @check_truncf(%min) : (f32) -> () // CHECK: 7 @@ -60,9 +91,45 @@ func.func @entry() { // CHECK: 0.5 %halfF4 = arith.truncf %half : f32 to f4E2M1FN func.call @check_extf(%halfF4) : (f4E2M1FN) -> () + // CHECK: 1 + %oneF4 = arith.truncf %one : f32 to f4E2M1FN + func.call @check_extf(%oneF4) : (f4E2M1FN) -> () + // CHECK: 1.5 + %oneAndAHalfF4 = 
arith.truncf %oneAndAHalf : f32 to f4E2M1FN + func.call @check_extf(%oneAndAHalfF4) : (f4E2M1FN) -> () + // CHECK: 2 + %twoF4 = arith.truncf %two : f32 to f4E2M1FN + func.call @check_extf(%twoF4) : (f4E2M1FN) -> () + // CHECK: 3 + %threeF4 = arith.truncf %three : f32 to f4E2M1FN + func.call @check_extf(%threeF4) : (f4E2M1FN) -> () + // CHECK: 4 + %fourF4 = arith.truncf %four : f32 to f4E2M1FN + func.call @check_extf(%fourF4) : (f4E2M1FN) -> () // CHECK: 6 %higherThanMaxF4 = arith.truncf %higherThanMax : f32 to f4E2M1FN func.call @check_extf(%higherThanMaxF4) : (f4E2M1FN) -> () + // CHECK: -0 + %minZeroF4 = arith.truncf %minZero : f32 to f4E2M1FN + func.call @check_extf(%minZeroF4) : (f4E2M1FN) -> () + // CHECK: -0.5 + %minHalfF4 = arith.truncf %minHalf : f32 to f4E2M1FN + func.call @check_extf(%minHalfF4) : (f4E2M1FN) -> () + // CHECK: -1 + %minOneF4 = arith.truncf %minOne : f32 to f4E2M1FN + func.call @check_extf(%minOneF4) : (f4E2M1FN) -> () + // CHECK: -1.5 + %minOneAndAHalfF4 = arith.truncf %minOneAndAHalf : f32 to f4E2M1FN + func.call @check_extf(%minOneAndAHalfF4) : (f4E2M1FN) -> () + // CHECK: -2 + %minTwoF4 = arith.truncf %minTwo : f32 to f4E2M1FN + func.call @check_extf(%minTwoF4) : (f4E2M1FN) -> () + // CHECK: -3 + %minThreeF4 = arith.truncf %minThree : f32 to f4E2M1FN + func.call @check_extf(%minThreeF4) : (f4E2M1FN) -> () + // CHECK: -4 + %minFourF4 = arith.truncf %minFour : f32 to f4E2M1FN + func.call @check_extf(%minFourF4) : (f4E2M1FN) -> () // CHECK: -6 %lowerThanMinF4 = arith.truncf %lowerThanMin : f32 to f4E2M1FN func.call @check_extf(%lowerThanMinF4) : (f4E2M1FN) -> () diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-unpack-mmt4d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-unpack-mmt4d.mlir index f7d79a304acb0..6192ed345debf 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-unpack-mmt4d.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/pack-unpack-mmt4d.mlir @@ -2,7 +2,7 @@ // DEFINE: -transform-interpreter -test-transform-dialect-erase-schedule \ // DEFINE: -cse -canonicalize -test-lower-to-llvm // DEFINE: %{entry_point} = main -// DEFINE: %{run} = mlir-runner -e %{entry_point} -entry-point-result=void \ +// DEFINE: %{run} = %mcr_aarch64_cmd -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\ // DEFINE: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils // RUN: %{compile} | %{run} | FileCheck %s diff --git a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir index a42344cb800db..a1e2729146c64 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir @@ -20,8 +20,8 @@ // Basic PTX check to make sure we are generating the right instructions. 
// CHECK-PTX: mbarrier.init.shared.b64 // CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64 -// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes -// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes +// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes +// CHECK-PTX: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes // CHECK-PTX: mbarrier.arrive.expect_tx.shared.b64 // CHECK-PTX: mbarrier.try_wait.parity.shared.b64 diff --git a/mlir/test/Interfaces/TilingInterface/tile-using-custom-op.mlir b/mlir/test/Interfaces/TilingInterface/tile-using-custom-op.mlir new file mode 100644 index 0000000000000..d335e9c3fb5d0 --- /dev/null +++ b/mlir/test/Interfaces/TilingInterface/tile-using-custom-op.mlir @@ -0,0 +1,60 @@ +// RUN: mlir-opt --transform-interpreter --cse --split-input-file --mlir-print-local-scope %s | FileCheck %s + +module { + func.func @generic_parallel(%arg0 : tensor, %arg1 : tensor) -> tensor { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %d0 = tensor.dim %arg0, %c0 : tensor + %d1 = tensor.dim %arg0, %c1 : tensor + %empty = tensor.empty(%d0, %d1) : tensor + %generic = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d1)>, + affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"]} + ins(%arg0, %arg1 : tensor, tensor) outs(%empty : tensor) { + ^bb(%b0 : f32, %b1 : f32, %b2 : f32): + %add = arith.addf %b0, %b1 : f32 + linalg.yield %add : f32 + } -> tensor + return %generic : tensor + } +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { + %op = transform.structured.match ops {["linalg.generic"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %tiled_op, %loop = transform.test.tile_using_custom_loop %op tile_sizes = [10, 20] + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} +// CHECK-LABEL: func @generic_parallel +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor +// CHECK-SAME: %[[ARG1:.+]]: tensor +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C0]] +// CHECK-DAG: %[[D1:.+]] = tensor.dim %[[ARG0]], %[[C1]] +// CHECK-DAG: %[[EMPTY:.+]] = tensor.empty(%[[D0]], %[[D1]]) : tensor +// CHECK-DAG: %[[NITERS0:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 10)>()[%[[D0]]] +// CHECK-DAG: %[[NITERS1:.+]] = affine.apply affine_map<()[s0] -> (s0 ceildiv 20)>()[%[[D1]]] +// CHECK-DAG: %[[NITERS:.+]] = affine.apply affine_map<()[s0, s1] -> ((s0 ceildiv 10) * (s1 ceildiv 20))>()[%[[D0]], %[[D1]]] +// CHECK: %[[FOR:.+]] = scf.for %[[IV:[a-zA-Z0-9]+]] = %[[C0]] to %[[NITERS]] step %[[C1]] +// CHECK-SAME: iter_args(%[[INIT:.+]] = %[[EMPTY]]) +// CHECK: %[[DELINEARIZE:.+]]:2 = affine.delinearize_index %[[IV]] into (%[[NITERS0]], %[[NITERS1]]) +// CHECK-DAG: %[[SIZE0:.+]] = affine.min affine_map<(d0)[s0] -> (d0 * -10 + s0, 10)>(%[[DELINEARIZE]]#0)[%[[D0]]] +// CHECK-DAG: %[[SIZE1:.+]] = affine.min affine_map<(d0)[s0] -> (d0 * -20 + s0, 20)>(%[[DELINEARIZE]]#1)[%[[D1]]] +// CHECK-DAG: %[[OFFSET0:.+]] = affine.apply affine_map<(d0) -> (d0 * 10)>(%[[DELINEARIZE]]#0) +// CHECK-DAG: %[[OFFSET1:.+]] = affine.apply affine_map<(d0) -> (d0 * 20)>(%[[DELINEARIZE]]#1) +// CHECK-DAG: %[[ARG0_SLICE:.+]] = tensor.extract_slice 
%[[ARG0]][%[[OFFSET0]], %[[OFFSET1]]] [%[[SIZE0]], %[[SIZE1]]] [1, 1] +// CHECK-DAG: %[[ARG1_SLICE:.+]] = tensor.extract_slice %[[ARG1]][%[[OFFSET1]]] [%[[SIZE1]]] [1] +// CHECK-DAG: %[[INIT_SLICE:.+]] = tensor.extract_slice %[[INIT]][%[[OFFSET0]], %[[OFFSET1]]] [%[[SIZE0]], %[[SIZE1]]] [1, 1] +// CHECK: %[[GENERIC:.+]] = linalg.generic +// CHECK-SAME: ins(%[[ARG0_SLICE]], %[[ARG1_SLICE]] : +// CHECK-SAME: outs(%[[INIT_SLICE]] : +// CHECK: %[[INSERT_SLICE:.+]] = tensor.insert_slice %[[GENERIC]] into %[[INIT]] +// CHECK-SAME: [%[[OFFSET0]], %[[OFFSET1]]] [%[[SIZE0]], %[[SIZE1]]] [1, 1] +// CHECK: scf.yield %[[INSERT_SLICE]] +// CHECK: return %[[FOR]] diff --git a/mlir/test/Target/LLVMIR/Import/debug-info.ll b/mlir/test/Target/LLVMIR/Import/debug-info.ll index 9e2a17fb436af..e056e43a0982c 100644 --- a/mlir/test/Target/LLVMIR/Import/debug-info.ll +++ b/mlir/test/Target/LLVMIR/Import/debug-info.ll @@ -215,7 +215,7 @@ define void @composite_type() !dbg !3 { ; // ----- ; CHECK-DAG: #[[FILE:.+]] = #llvm.di_file<"debug-info.ll" in "/"> -; CHECK-DAG: #[[CU:.+]] = #llvm.di_compile_unit, sourceLanguage = DW_LANG_C, file = #[[FILE]], isOptimized = false, emissionKind = None, nameTableKind = None> +; CHECK-DAG: #[[CU:.+]] = #llvm.di_compile_unit, sourceLanguage = DW_LANG_C, file = #[[FILE]], isOptimized = false, emissionKind = None, nameTableKind = None, splitDebugFilename = "test.dwo"> ; Verify an empty subroutine types list is supported. ; CHECK-DAG: #[[SP_TYPE:.+]] = #llvm.di_subroutine_type ; CHECK-DAG: #[[SP:.+]] = #llvm.di_subprogram, compileUnit = #[[CU]], scope = #[[FILE]], name = "subprogram", linkageName = "subprogram", file = #[[FILE]], line = 42, scopeLine = 42, subprogramFlags = Definition, type = #[[SP_TYPE]]> @@ -227,7 +227,7 @@ define void @subprogram() !dbg !3 { !llvm.dbg.cu = !{!1} !llvm.module.flags = !{!0} !0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, nameTableKind: None) +!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2, nameTableKind: None, splitDebugFilename: "test.dwo") !2 = !DIFile(filename: "debug-info.ll", directory: "/") !3 = distinct !DISubprogram(name: "subprogram", linkageName: "subprogram", scope: !2, file: !2, line: 42, scopeLine: 42, spFlags: DISPFlagDefinition, unit: !1, type: !4) !4 = !DISubroutineType(cc: DW_CC_normal, types: !5) diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll index 5e913691a59b0..d2bb80982bb3d 100644 --- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll +++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll @@ -733,12 +733,12 @@ define void @assume(i1 %true) { } ; CHECK-LABEL: @assume_with_opbundles -; CHECK-SAME: %[[TRUE:[a-zA-Z0-9]+]] ; CHECK-SAME: %[[PTR:[a-zA-Z0-9]+]] -define void @assume_with_opbundles(i1 %true, ptr %p) { +define void @assume_with_opbundles(ptr %p) { + ; CHECK: %[[TRUE:.+]] = llvm.mlir.constant(true) : i1 ; CHECK: %[[ALIGN:.+]] = llvm.mlir.constant(8 : i32) : i32 ; CHECK: llvm.intr.assume %[[TRUE]] ["align"(%[[PTR]], %[[ALIGN]] : !llvm.ptr, i32)] : i1 - call void @llvm.assume(i1 %true) ["align"(ptr %p, i32 8)] + call void @llvm.assume(i1 true) ["align"(ptr %p, i32 8)] ret void } @@ -829,7 +829,7 @@ define void @coro_suspend(i32 %0, i1 %1, ptr %2) { ; CHECK-LABEL: llvm.func @coro_end define void @coro_end(ptr %0, i1 %1) { ; CHECK: llvm.intr.coro.end - call i1 @llvm.coro.end(ptr %0, i1 %1, token none) + call void @llvm.coro.end(ptr %0, i1 %1, token none) ret void } @@ -1296,7 +1296,7 @@ declare i64 
@llvm.coro.align.i64() declare i32 @llvm.coro.align.i32() declare token @llvm.coro.save(ptr) declare i8 @llvm.coro.suspend(token, i1) -declare i1 @llvm.coro.end(ptr, i1, token) +declare void @llvm.coro.end(ptr, i1, token) declare ptr @llvm.coro.free(token, ptr nocapture readonly) declare void @llvm.coro.resume(ptr) declare ptr @llvm.coro.promise(ptr nocapture, i32, i1) diff --git a/mlir/test/Target/LLVMIR/global_float_array.mlir b/mlir/test/Target/LLVMIR/global_float_array.mlir new file mode 100644 index 0000000000000..eba7948d2c55e --- /dev/null +++ b/mlir/test/Target/LLVMIR/global_float_array.mlir @@ -0,0 +1,4 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// CHECK: @test = internal global [1 x float] [float -0.000000e+00] +llvm.mlir.global internal @test(dense<-0.000000e+00> : tensor<1xf32>) {addr_space = 0 : i32} : !llvm.array<1 x f32> diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir index 274d64af78283..38ae63d1908e9 100644 --- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir @@ -43,7 +43,7 @@ llvm.func @func_no_debug() { #cu = #llvm.di_compile_unit< id = distinct[0]<>, sourceLanguage = DW_LANG_C, file = #file, producer = "MLIR", isOptimized = true, emissionKind = Full, - nameTableKind = None + nameTableKind = None, splitDebugFilename = "test.dwo" > #composite = #llvm.di_composite_type< tag = DW_TAG_structure_type, name = "composite", file = #file, @@ -140,7 +140,7 @@ llvm.func @empty_types() { llvm.return } loc(fused<#sp1>["foo.mlir":2:1]) -// CHECK: ![[CU_LOC:.*]] = distinct !DICompileUnit(language: DW_LANG_C, file: ![[CU_FILE_LOC:.*]], producer: "MLIR", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, nameTableKind: None) +// CHECK: ![[CU_LOC:.*]] = distinct !DICompileUnit(language: DW_LANG_C, file: ![[CU_FILE_LOC:.*]], producer: "MLIR", isOptimized: true, runtimeVersion: 0, splitDebugFilename: "test.dwo", emissionKind: FullDebug, nameTableKind: None) // CHECK: ![[CU_FILE_LOC]] = !DIFile(filename: "foo.mlir", directory: "/test/") // CHECK: ![[FUNC_LOC]] = distinct !DISubprogram(name: "func_with_debug", linkageName: "func_with_debug", scope: ![[NESTED_NAMESPACE:.*]], file: ![[CU_FILE_LOC]], line: 3, type: ![[FUNC_TYPE:.*]], scopeLine: 3, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: ![[CU_LOC]]) diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir index e4f18f3e524e7..d63584e5e03ab 100644 --- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir @@ -146,6 +146,11 @@ llvm.func @trig_test(%arg0: f32, %arg1: vector<8xf32>) { llvm.intr.tan(%arg0) : (f32) -> f32 // CHECK: call <8 x float> @llvm.tan.v8f32 llvm.intr.tan(%arg1) : (vector<8xf32>) -> vector<8xf32> + + // CHECK: call { float, float } @llvm.sincos.f32 + llvm.intr.sincos(%arg0) : (f32) -> !llvm.struct<(f32, f32)> + // CHECK: call { <8 x float>, <8 x float> } @llvm.sincos.v8f32 + llvm.intr.sincos(%arg1) : (vector<8xf32>) -> !llvm.struct<(vector<8xf32>, vector<8xf32>)> llvm.return } @@ -460,10 +465,11 @@ llvm.func @assume_without_opbundles(%cond: i1) { } // CHECK-LABEL: @assume_with_opbundles -llvm.func @assume_with_opbundles(%cond: i1, %p: !llvm.ptr) { +llvm.func @assume_with_opbundles(%p: !llvm.ptr) { + %true = llvm.mlir.constant(true) : i1 %0 = llvm.mlir.constant(8 : i32) : i32 - // CHECK: call void @llvm.assume(i1 %{{.+}}) [ "align"(ptr %{{.+}}, i32 8) ] - llvm.intr.assume %cond ["align"(%p, %0 
: !llvm.ptr, i32)] : i1 + // CHECK: call void @llvm.assume(i1 true) [ "align"(ptr %{{.+}}, i32 8) ] + llvm.intr.assume %true ["align"(%p, %0 : !llvm.ptr, i32)] : i1 llvm.return } @@ -846,8 +852,8 @@ llvm.func @coro_suspend(%arg0: i32, %arg1 : i1, %arg2 : !llvm.ptr) { // CHECK-LABEL: @coro_end llvm.func @coro_end(%arg0: !llvm.ptr, %arg1 : i1) { %none = llvm.mlir.none : !llvm.token - // CHECK: call i1 @llvm.coro.end - %0 = llvm.intr.coro.end %arg0, %arg1, %none : (!llvm.ptr, i1, !llvm.token) -> i1 + // CHECK: call void @llvm.coro.end + llvm.intr.coro.end %arg0, %arg1, %none : (!llvm.ptr, i1, !llvm.token) -> !llvm.void llvm.return } @@ -1301,6 +1307,8 @@ llvm.func @experimental_constrained_fpext(%s: f32, %v: vector<4xf32>) { // CHECK-DAG: declare <8 x float> @llvm.ceil.v8f32(<8 x float>) #0 // CHECK-DAG: declare float @llvm.cos.f32(float) // CHECK-DAG: declare <8 x float> @llvm.cos.v8f32(<8 x float>) #0 +// CHECK-DAG: declare { float, float } @llvm.sincos.f32(float) +// CHECK-DAG: declare { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float>) #0 // CHECK-DAG: declare float @llvm.copysign.f32(float, float) // CHECK-DAG: declare float @llvm.rint.f32(float) // CHECK-DAG: declare double @llvm.rint.f64(double) @@ -1374,7 +1382,7 @@ llvm.func @experimental_constrained_fpext(%s: f32, %v: vector<4xf32>) { // CHECK-DAG: declare i32 @llvm.coro.size.i32() // CHECK-DAG: declare token @llvm.coro.save(ptr) // CHECK-DAG: declare i8 @llvm.coro.suspend(token, i1) -// CHECK-DAG: declare i1 @llvm.coro.end(ptr, i1, token) +// CHECK-DAG: declare void @llvm.coro.end(ptr, i1, token) // CHECK-DAG: declare ptr @llvm.coro.free(token, ptr readonly captures(none)) // CHECK-DAG: declare void @llvm.coro.resume(ptr) // CHECK-DAG: declare ptr @llvm.coro.promise(ptr captures(none), i32, i1) diff --git a/mlir/test/Target/LLVMIR/nvvm/tma_load_cluster_im2col.mlir b/mlir/test/Target/LLVMIR/nvvm/tma_load_cluster_im2col.mlir new file mode 100644 index 0000000000000..2fb98d3c1215e --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tma_load_cluster_im2col.mlir @@ -0,0 +1,298 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +llvm.func @tma_load_3d_im2col(%tma: !llvm.ptr, %dest: !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %off0: i16, %ctamask: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_3d_im2col(ptr %0, ptr addrspace(7) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i64 %8) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 0, i64 0, i1 false, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i64 0, i1 true, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 0, i64 %8, i1 false, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i64 %8, i1 true, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 0, i64 0, i1 false, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr 
%0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i64 0, i1 true, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 0, i64 %8, i1 false, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i64 %8, i1 true, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 0, i64 0, i1 false, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i64 0, i1 true, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 0, i64 %8, i1 false, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i64 %8, i1 true, i1 true, i32 2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%off0] {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%off0] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%off0] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%off0] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%off0] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%off0] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%off0] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%off0] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%off0] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + 
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + llvm.return +} + +llvm.func @tma_load_4d_im2col(%tma: !llvm.ptr, %dest: !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %mask: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_4d_im2col(ptr %0, ptr addrspace(7) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 %10) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 %10, i1 true, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 %10, i1 false, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 0, i1 true, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 0, i1 false, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 %10, i1 true, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 %10, i1 false, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 0, i1 true, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 0, i1 false, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 %10, i1 true, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 %10, i1 false, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 0, i1 true, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 0, i1 false, i1 false, i32 2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%off0, %off1] multicast_mask = %mask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%off0, 
%off1] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%off0, %off1] multicast_mask = %mask {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%off0, %off1] {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%off0, %off1] multicast_mask = %mask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%off0, %off1] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%off0, %off1] multicast_mask = %mask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%off0, %off1] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%off0, %off1] multicast_mask = %mask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%off0, %off1] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%off0, %off1] multicast_mask = %mask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%off0, %off1] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + llvm.return +} + +llvm.func @tma_load_5d_im2col(%tma: !llvm.ptr, %dest: !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %off0: i16, %off1: i16, %off2: i16, %mask: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_5d_im2col(ptr %0, ptr addrspace(7) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i16 %11, i64 %12) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i16 %11, i64 %12, i1 true, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i16 0, i64 %12, i1 false, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i16 %11, i64 0, i1 true, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, 
i32 %7, i16 %8, i16 %9, i16 %10, i16 0, i64 0, i1 false, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i16 %11, i64 %12, i1 true, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i16 0, i64 %12, i1 false, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i16 %11, i64 0, i1 true, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i16 0, i64 0, i1 false, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i16 %11, i64 %12, i1 true, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i16 0, i64 %12, i1 false, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i16 %11, i64 0, i1 true, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i16 0, i64 0, i1 false, i1 false, i32 2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%off0, %off1, %off2] multicast_mask = %mask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%off0, %off1, %off2] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%off0, %off1, %off2] multicast_mask = %mask {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%off0, %off1, %off2] {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%off0, %off1, %off2] multicast_mask = %mask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%off0, %off1, %off2] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%off0, %off1, %off2] multicast_mask = %mask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : 
!llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%off0, %off1, %off2] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%off0, %off1, %off2] multicast_mask = %mask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%off0, %off1, %off2] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%off0, %off1, %off2] multicast_mask = %mask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%off0, %off1, %off2] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + llvm.return +} + +llvm.func @tma_load_3d_im2col_w(%tma: !llvm.ptr, %dest: !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %wHalo: i16, %wOffset: i16, %ctamask: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_3d_im2col_w(ptr %0, ptr addrspace(7) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 %8, i64 %9) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 0, i64 0, i1 false, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 %8, i64 0, i1 true, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 0, i64 %9, i1 false, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 %8, i64 %9, i1 true, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 0, i64 0, i1 false, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 %8, i64 0, i1 true, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 0, i64 %9, i1 false, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 %8, i64 %9, i1 true, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 0, i64 0, i1 false, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, 
i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 %8, i64 0, i1 true, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 0, i64 %9, i1 false, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 %8, i64 %9, i1 true, i1 true, i32 2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + llvm.return +} + +llvm.func @tma_load_4d_im2col_w(%tma: !llvm.ptr, %dest: !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %wHalo: i16, %wOffset: i16, %ctamask: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_4d_im2col_w(ptr %0, ptr addrspace(7) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 %10) { + // CHECK-NEXT: 
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 0, i1 false, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 0, i1 true, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 %10, i1 false, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 %10, i1 true, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 0, i1 false, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 0, i1 true, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 %10, i1 false, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 %10, i1 true, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 0, i1 false, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 0, i1 true, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 %10, i1 false, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 %10, i1 true, i1 true, i32 2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, 
%wOffset] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + llvm.return +} + +llvm.func @tma_load_5d_im2col_w(%tma: !llvm.ptr, %dest: !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %wHalo: i16, %wOffset: i16, %ctamask: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_5d_im2col_w(ptr %0, ptr addrspace(7) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 %11) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 0, i64 0, i1 false, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 0, i1 true, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 0, i64 %11, i1 false, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 %11, i1 true, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 0, i64 0, i1 false, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 0, i1 true, i1 false, i32 1) + // CHECK-NEXT: call void 
@llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 0, i64 %11, i1 false, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 %11, i1 true, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 0, i64 0, i1 false, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 0, i1 true, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 0, i64 %11, i1 false, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 %11, i1 true, i1 true, i32 2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode 
= #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + llvm.return +} + +llvm.func @tma_load_3d_im2col_w_128(%tma: !llvm.ptr, %dest: !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %wHalo: i16, %wOffset: i16, %ctamask: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_3d_im2col_w_128(ptr %0, ptr addrspace(7) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 %8, i64 %9) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 0, i64 0, i1 false, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 %8, i64 0, i1 true, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 0, i64 %9, i1 false, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 %8, i64 %9, i1 true, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 0, i64 0, i1 false, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 %8, i64 0, i1 true, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 0, i64 %9, i1 false, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 %8, i64 %9, i1 true, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 0, i64 0, i1 false, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 %8, i64 0, i1 true, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 0, i64 %9, i1 false, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i16 %8, i64 %9, i1 true, i1 true, i32 2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + 
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + llvm.return +} + +llvm.func @tma_load_4d_im2col_w_128(%tma: !llvm.ptr, %dest: !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %wHalo: i16, %wOffset: i16, %ctamask: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_4d_im2col_w_128(ptr %0, ptr addrspace(7) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 %10) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 0, i1 false, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 0, i1 true, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %1, ptr 
addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 %10, i1 false, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 %10, i1 true, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 0, i1 false, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 0, i1 true, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 %10, i1 false, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 %10, i1 true, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 0, i1 false, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 0, i1 true, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 0, i64 %10, i1 false, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i16 %9, i64 %10, i1 true, i1 true, i32 2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint 
{mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + llvm.return +} + +llvm.func @tma_load_5d_im2col_w_128(%tma: !llvm.ptr, %dest: !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %wHalo: i16, %wOffset: i16, %ctamask: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_5d_im2col_w_128(ptr %0, ptr addrspace(7) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 %11) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 0, i64 0, i1 false, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 0, i1 true, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 0, i64 %11, i1 false, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 %11, i1 true, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 0, i64 0, i1 false, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 0, i1 true, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 0, i64 %11, i1 false, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 %11, i1 true, i1 true, i32 1) + // CHECK-NEXT: call void 
@llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 0, i64 0, i1 false, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 0, i1 true, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 0, i64 %11, i1 false, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 %11, i1 true, i1 true, i32 2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] 
im2col[%wHalo, %wOffset] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tma_load_cluster_tile.mlir b/mlir/test/Target/LLVMIR/nvvm/tma_load_cluster_tile.mlir new file mode 100644 index 0000000000000..de0b929e6db72 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tma_load_cluster_tile.mlir @@ -0,0 +1,204 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +llvm.func @tma_load_1d_all_tile(%tma: !llvm.ptr, %dest: !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %ctamask: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_1d_all_tile(ptr %0, ptr addrspace(7) %1, ptr addrspace(3) %2, i32 %3, i16 %4, i64 %5) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i16 0, i64 0, i1 false, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i16 0, i64 %5, i1 false, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i16 %4, i64 0, i1 true, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i16 %4, i64 %5, i1 true, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i16 0, i64 0, i1 false, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i16 0, i64 %5, i1 false, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i16 %4, i64 0, i1 true, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i16 %4, i64 %5, i1 true, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i16 0, i64 0, i1 false, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i16 0, i64 %5, i1 false, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i16 %4, i64 0, i1 true, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i16 %4, i64 %5, i1 true, i1 true, i32 2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0] {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + 
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + llvm.return +} + +llvm.func @tma_load_2d_all_tile(%tma: !llvm.ptr, %dest: !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %ctamask: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_2d_all_tile(ptr %0, ptr addrspace(7) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i16 %5, i64 %6) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i16 0, i64 0, i1 false, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i16 0, i64 %6, i1 false, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i16 %5, i64 0, i1 true, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i16 %5, i64 %6, i1 true, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i16 0, i64 0, i1 false, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i16 0, i64 %6, i1 false, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i16 %5, i64 0, i1 true, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i16 %5, i64 %6, i1 true, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i16 0, i64 0, i1 false, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i16 0, i64 %6, i1 
false, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i16 %5, i64 0, i1 true, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i16 %5, i64 %6, i1 true, i1 true, i32 2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1] {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + llvm.return +} + +llvm.func @tma_load_3d_all_tile(%tma: !llvm.ptr, %dest: !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %ctamask: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_3d_all_tile(ptr %0, ptr addrspace(7) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i16 %6, i64 %7) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 0, i64 0, i1 false, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 0, i64 %7, i1 false, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i64 0, i1 true, i1 false, i32 0) + // CHECK-NEXT: 
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i64 %7, i1 true, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 0, i64 0, i1 false, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 0, i64 %7, i1 false, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i64 0, i1 true, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i64 %7, i1 true, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 0, i64 0, i1 false, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 0, i64 %7, i1 false, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i64 0, i1 true, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i64 %7, i1 true, i1 true, i32 2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = 
#nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + llvm.return +} + +llvm.func @tma_load_4d_all_tile(%tma: !llvm.ptr, %dest: !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %ctamask: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_4d_all_tile(ptr %0, ptr addrspace(7) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i64 %8) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 0, i64 0, i1 false, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 0, i64 %8, i1 false, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i64 0, i1 true, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i64 %8, i1 true, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 0, i64 0, i1 false, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 0, i64 %8, i1 false, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i64 0, i1 true, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i64 %8, i1 true, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 0, i64 0, i1 false, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 0, i64 %8, i1 false, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i64 0, i1 true, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i64 %8, i1 true, i1 true, i32 2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + 
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + llvm.return +} + +llvm.func @tma_load_5d_all(%tma: !llvm.ptr, %dest: !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %ctamask: i16, %cache: i64) { + // CHECK-LABEL: define void @tma_load_5d_all(ptr %0, ptr addrspace(7) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i64 %9) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 0, i64 0, i1 false, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i64 0, i1 true, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 0, i64 %9, i1 false, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i64 %9, i1 true, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 0, i64 0, i1 false, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, 
i32 %5, i32 %6, i32 %7, i16 %8, i64 0, i1 true, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 0, i64 %9, i1 false, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i64 %9, i1 true, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 0, i64 0, i1 false, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i64 0, i1 true, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 0, i64 %9, i1 false, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i64 %9, i1 true, i1 true, i32 2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] multicast_mask = %ctamask : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] l2_cache_hint = %cache : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] multicast_mask = %ctamask l2_cache_hint = %cache : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] l2_cache_hint = %cache {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] multicast_mask = %ctamask l2_cache_hint = %cache {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] l2_cache_hint = %cache {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] multicast_mask = 
%ctamask l2_cache_hint = %cache {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + llvm.return +} + +llvm.func @tma_load_2d_tile_gather4(%tma: !llvm.ptr, %dest: !llvm.ptr<7>, %bar: !llvm.ptr<3>, %row0: i32, %col0: i32, %col1: i32, %col2: i32, %col3: i32, %ctamask: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_2d_tile_gather4(ptr %0, ptr addrspace(7) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i64 %9) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 0, i64 0, i1 false, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i64 0, i1 true, i1 false, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 0, i64 %9, i1 false, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i64 %9, i1 true, i1 true, i32 0) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 0, i64 0, i1 false, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i64 0, i1 true, i1 false, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 0, i64 %9, i1 false, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i64 %9, i1 true, i1 true, i32 1) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 0, i64 0, i1 false, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i64 0, i1 true, i1 false, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 0, i64 %9, i1 false, i1 true, i32 2) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i64 %9, i1 true, i1 true, i32 2) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%row0, %col0, %col1, %col2, %col3] {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%row0, %col0, %col1, %col2, %col3] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%row0, %col0, %col1, %col2, %col3] l2_cache_hint = %cacheHint {mode = 
#nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%row0, %col0, %col1, %col2, %col3] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%row0, %col0, %col1, %col2, %col3] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%row0, %col0, %col1, %col2, %col3] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%row0, %col0, %col1, %col2, %col3] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%row0, %col0, %col1, %col2, %col3] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%row0, %col0, %col1, %col2, %col3] {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%row0, %col0, %col1, %col2, %col3] multicast_mask = %ctamask {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%row0, %col0, %col1, %col2, %col3] l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%row0, %col0, %col1, %col2, %col3] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tma_load_cta_im2col.mlir b/mlir/test/Target/LLVMIR/nvvm/tma_load_cta_im2col.mlir new file mode 100644 index 0000000000000..0ebae19a682be --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tma_load_cta_im2col.mlir @@ -0,0 +1,109 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +llvm.func @tma_load_3d_im2col(%tma: !llvm.ptr, %dest: !llvm.ptr<3>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %off0: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_3d_im2col(ptr %0, ptr addrspace(3) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i16 %6, i64 %7) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i64 0, i1 false) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i64 %7, i1 true) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%off0] {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%off0] l2_cache_hint = %cacheHint {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} + +llvm.func @tma_load_4d_im2col(%tma: !llvm.ptr, %dest: !llvm.ptr<3>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, 
%crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_4d_im2col(ptr %0, ptr addrspace(3) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i64 %9) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i64 %9, i1 true) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i64 0, i1 false) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%off0, %off1] l2_cache_hint = %cacheHint {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%off0, %off1] {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} + +llvm.func @tma_load_5d_im2col(%tma: !llvm.ptr, %dest: !llvm.ptr<3>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %off0: i16, %off1: i16, %off2: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_5d_im2col(ptr %0, ptr addrspace(3) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 %11) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 %11, i1 true) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i16 %10, i64 0, i1 false) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%off0, %off1, %off2] l2_cache_hint = %cacheHint {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%off0, %off1, %off2] {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} + +llvm.func @tma_load_3d_im2col_w(%tma: !llvm.ptr, %dest: !llvm.ptr<3>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %wHalo: i16, %wOffset: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_3d_im2col_w(ptr %0, ptr addrspace(3) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i64 %8) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i64 0, i1 false) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i64 %8, i1 true) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, 
!llvm.ptr + + llvm.return +} + +llvm.func @tma_load_4d_im2col_w(%tma: !llvm.ptr, %dest: !llvm.ptr<3>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %wHalo: i16, %wOffset: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_4d_im2col_w(ptr %0, ptr addrspace(3) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i64 %9) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i64 0, i1 false) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i64 %9, i1 true) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} + +llvm.func @tma_load_5d_im2col_w(%tma: !llvm.ptr, %dest: !llvm.ptr<3>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %wHalo: i16, %wOffset: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_5d_im2col_w(ptr %0, ptr addrspace(3) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i64 %10) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i64 0, i1 false) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i64 %10, i1 true) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} + +llvm.func @tma_load_3d_im2col_w_128(%tma: !llvm.ptr, %dest: !llvm.ptr<3>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %wHalo: i16, %wOffset: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_3d_im2col_w_128(ptr %0, ptr addrspace(3) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i64 %8) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i64 0, i1 false) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i16 %6, i16 %7, i64 %8, i1 true) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, 
box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} + +llvm.func @tma_load_4d_im2col_w_128(%tma: !llvm.ptr, %dest: !llvm.ptr<3>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %wHalo: i16, %wOffset: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_4d_im2col_w_128(ptr %0, ptr addrspace(3) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i64 %9) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i64 0, i1 false) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i16 %7, i16 %8, i64 %9, i1 true) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} + +llvm.func @tma_load_5d_im2col_w_128(%tma: !llvm.ptr, %dest: !llvm.ptr<3>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %wHalo: i16, %wOffset: i16, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_5d_im2col_w_128(ptr %0, ptr addrspace(3) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i64 %10) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i64 0, i1 false) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i16 %8, i16 %9, i64 %10, i1 true) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] im2col[%wHalo, %wOffset] l2_cache_hint = %cacheHint {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} \ No newline at end of file diff --git a/mlir/test/Target/LLVMIR/nvvm/tma_load_cta_tile.mlir b/mlir/test/Target/LLVMIR/nvvm/tma_load_cta_tile.mlir new file mode 100644 index 0000000000000..f11de711ca50a --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tma_load_cta_tile.mlir @@ -0,0 +1,73 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +llvm.func @tma_load_1d_all_tile(%tma: !llvm.ptr, %dest: !llvm.ptr<3>, %bar: !llvm.ptr<3>, %crd0: i32, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_1d_all_tile(ptr %0, ptr addrspace(3) %1, ptr addrspace(3) %2, i32 %3, i64 %4) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i64 0, i1 false) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %1, ptr 
addrspace(3) %2, ptr %0, i32 %3, i64 %4, i1 true) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0] {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0] l2_cache_hint = %cacheHint {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} + +llvm.func @tma_load_2d_all_tile(%tma: !llvm.ptr, %dest: !llvm.ptr<3>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_2d_all_tile(ptr %0, ptr addrspace(3) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i64 %5) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i64 0, i1 false) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i64 %5, i1 true) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1] {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1] l2_cache_hint = %cacheHint {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} + +llvm.func @tma_load_3d_all_tile(%tma: !llvm.ptr, %dest: !llvm.ptr<3>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_3d_all_tile(ptr %0, ptr addrspace(3) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i64 %6) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i64 0, i1 false) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i64 %6, i1 true) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2] l2_cache_hint = %cacheHint {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} + +llvm.func @tma_load_4d_all_tile(%tma: !llvm.ptr, %dest: !llvm.ptr<3>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_4d_all_tile(ptr %0, ptr addrspace(3) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i64 %7) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i64 0, i1 false) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i64 %7, i1 true) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3] l2_cache_hint = %cacheHint {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + + 
llvm.return +} + +llvm.func @tma_load_5d_all(%tma: !llvm.ptr, %dest: !llvm.ptr<3>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_5d_all(ptr %0, ptr addrspace(3) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i64 %8) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i64 0, i1 false) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i64 %8, i1 true) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] {isCTAOnly = true} : !llvm.ptr<3>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%crd0, %crd1, %crd2, %crd3, %crd4] l2_cache_hint = %cacheHint {isCTAOnly = true} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} + +llvm.func @tma_load_2d_tile_gather4(%tma: !llvm.ptr, %dest: !llvm.ptr<3>, %bar: !llvm.ptr<3>, %row0: i32, %col0: i32, %col1: i32, %col2: i32, %col3: i32, %cacheHint: i64) { + // CHECK-LABEL: define void @tma_load_2d_tile_gather4(ptr %0, ptr addrspace(3) %1, ptr addrspace(3) %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i64 %8) { + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i64 0, i1 false) + // CHECK-NEXT: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %1, ptr addrspace(3) %2, ptr %0, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i64 %8, i1 true) + // CHECK-NEXT: ret void + // CHECK-NEXT: } + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%row0, %col0, %col1, %col2, %col3] {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma, %bar, box[%row0, %col0, %col1, %col2, %col3] l2_cache_hint = %cacheHint {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} \ No newline at end of file diff --git a/mlir/test/Target/LLVMIR/nvvm/tma_load_invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/tma_load_invalid.mlir new file mode 100644 index 0000000000000..d94ea41f6bb38 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tma_load_invalid.mlir @@ -0,0 +1,98 @@ +// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s + +// ----- + +llvm.func @tma_load_1d_im2col(%tma_desc: !llvm.ptr, %dest : !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %ch : i64) { + // expected-error @below {{to use im2col mode, the tensor has to be at least 3-dimensional}} + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma_desc, %bar, box[%crd0] {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + llvm.return +} + +// ----- + +llvm.func @tma_load_0d(%tma_desc: !llvm.ptr, %dest : !llvm.ptr<7>, %bar: !llvm.ptr<3>) { + // expected-error @below {{expects coordinates between 1 to 5 dimension}} + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma_desc, %bar, box[] : !llvm.ptr<7>, !llvm.ptr + + llvm.return +} + +// ----- + +llvm.func @tma_load_gather(%tma_desc: !llvm.ptr, %dest : !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %ch : i64) { + // expected-error @below {{Gather4 mode expects 5 coordinates}} + 
nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma_desc, %bar, box[%crd0,%crd1,%crd2,%crd3] l2_cache_hint=%ch {mode = #nvvm.tma_load_mode}: !llvm.ptr<7>, !llvm.ptr + + llvm.return +} + +// ----- + +llvm.func @tma_load_asm_im2col(%tma_desc: !llvm.ptr, %dest : !llvm.ptr<7>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %wHalo: i16, %wOffset: i16, %p : i1) { + // expected-error @below {{Predicate is supported only for Tile and Im2col modes.}} + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma_desc, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] predicate=%p {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + llvm.return +} +// ----- + +llvm.func @tma_load_cta_asm_im2col(%tma_desc: !llvm.ptr, %dest : !llvm.ptr<3>, %bar: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %wHalo: i16, %wOffset: i16, %p : i1) { + // expected-error @below {{Predicate is supported only for shared::cluster mode.}} + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma_desc, %bar, box[%crd0, %crd1, %crd2] im2col[%wHalo, %wOffset] predicate=%p {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} + +// ----- + +llvm.func @tma_load_cta_0d(%tma_desc: !llvm.ptr, %dest : !llvm.ptr<3>, %bar : !llvm.ptr<3>) { + // expected-error @below {{expects coordinates between 1 to 5 dimension}} + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma_desc, %bar, box[] {isCTAOnly = true} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} + +// ----- + +llvm.func @tma_load_cta_mc(%tma_desc: !llvm.ptr, %dest : !llvm.ptr<3>, %bar : !llvm.ptr<3>, %crd0: i32, %ctamask : i16) { + // expected-error @below {{Multicast is not supported with shared::cta mode.}} + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma_desc, %bar, box[%crd0] multicast_mask = %ctamask {isCTAOnly = true, mode = #nvvm.tma_load_mode} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} +// ----- + +llvm.func @tma_load_cta_cg(%tma_desc: !llvm.ptr, %dest : !llvm.ptr<3>, %bar : !llvm.ptr<3>, %crd0: i32, %crd1: i32) { + // expected-error @below {{CTAGroup is not supported with shared::cta mode.}} + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma_desc, %bar, box[%crd0, %crd1] {isCTAOnly = true, mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} + +// ----- + +llvm.func @tma_load_cta_with_7(%tma_desc: !llvm.ptr, %dest : !llvm.ptr<7>, %bar : !llvm.ptr<3>, %crd0: i32, %crd1: i32) { + // expected-error @below {{Shared::cta destination requires address-space 3.}} + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma_desc, %bar, box[%crd0, %crd1] {isCTAOnly = true, mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<7>, !llvm.ptr + + llvm.return +} + +// ----- + +llvm.func @tma_load_cluster_with_3(%tma_desc: !llvm.ptr, %dest : !llvm.ptr<3>, %bar : !llvm.ptr<3>, %crd0: i32, %crd1: i32) { + // expected-error @below {{Shared::cluster destination requires address-space 7.}} + nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tma_desc, %bar, box[%crd0, %crd1] {isCTAOnly = false, mode = #nvvm.tma_load_mode, group = #nvvm.cta_group} : !llvm.ptr<3>, !llvm.ptr + + llvm.return +} + +// ----- + +llvm.func @tma_load_im2col_off(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<7>, %barrier: !llvm.ptr<3>, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %off0: i16, %off1: i16, %ctamask : i16, %cacheHint : i64) { + // expected-error @below {{im2col offsets expected 2 (provided 1)}} + nvvm.cp.async.bulk.tensor.shared.cluster.global 
%dest, %tmaDescriptor, %barrier, box[%crd0,%crd1,%crd2,%crd3] im2col[%off0] multicast_mask = %ctamask l2_cache_hint = %cacheHint {mode = #nvvm.tma_load_mode} : !llvm.ptr<7>, !llvm.ptr + + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tma_store_reduce.mlir b/mlir/test/Target/LLVMIR/nvvm/tma_store_reduce.mlir index 6e0b48489e8b0..2231f1dabd504 100644 --- a/mlir/test/Target/LLVMIR/nvvm/tma_store_reduce.mlir +++ b/mlir/test/Target/LLVMIR/nvvm/tma_store_reduce.mlir @@ -19,14 +19,14 @@ llvm.func @tma_store_reduce_1d(%src : !llvm.ptr<3>, %tma_desc : !llvm.ptr, %d0 : nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0] l2_cache_hint = %ch {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0] l2_cache_hint = %ch {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 undef, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.1d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i64 0, i1 false) nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0] {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0] {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0] {redKind = 
#nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> @@ -59,14 +59,14 @@ llvm.func @tma_store_reduce_2d(%src : !llvm.ptr<3>, %tma_desc : !llvm.ptr, %d0 : nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1] l2_cache_hint = %ch {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1] l2_cache_hint = %ch {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 undef, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.2d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i64 0, i1 false) nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1] {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1] {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1] {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> @@ -99,14 +99,14 @@ llvm.func @tma_store_reduce_3d_tile(%src : !llvm.ptr<3>, %tma_desc : !llvm.ptr, nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2] 
l2_cache_hint = %ch {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2] l2_cache_hint = %ch {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2] {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2] {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2] {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> @@ -137,14 +137,14 @@ llvm.func @tma_store_reduce_3d_im2col(%src : !llvm.ptr<3>, %tma_desc : !llvm.ptr nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2] l2_cache_hint = %ch 
{redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2] l2_cache_hint = %ch {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 undef, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.3d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i64 0, i1 false) nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2] {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2] {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2] {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> @@ -177,14 +177,14 @@ llvm.func 
@tma_store_reduce_4d_tile(%src : !llvm.ptr<3>, %tma_desc : !llvm.ptr, nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3] l2_cache_hint = %ch {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3] l2_cache_hint = %ch {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3] {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3] {redKind = 
#nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3] {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> @@ -215,14 +215,14 @@ llvm.func @tma_store_reduce_4d_im2col(%src : !llvm.ptr<3>, %tma_desc : !llvm.ptr nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3] l2_cache_hint = %ch {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3] l2_cache_hint = %ch {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 undef, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.4d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.4d(ptr addrspace(3) %[[SRC]], 
ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i64 0, i1 false) nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3] {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3] {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3] {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> @@ -255,14 +255,14 @@ llvm.func @tma_store_reduce_5d_tile(%src : !llvm.ptr<3>, %tma_desc : !llvm.ptr, nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3, %d4] l2_cache_hint = %ch {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3, %d4] l2_cache_hint = %ch {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 
0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.tile.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 0, i1 false) nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3, %d4] {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3, %d4] {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3, %d4] {redKind = #nvvm.tma_redux_kind} : !llvm.ptr, !llvm.ptr<3> @@ -293,14 +293,14 @@ llvm.func @tma_store_reduce_5d_im2col(%src : !llvm.ptr<3>, %tma_desc : !llvm.ptr nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3, %d4] l2_cache_hint = %ch {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3, %d4] l2_cache_hint = %ch {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) - // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 undef, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.min.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 0, i1 false) + // CHECK: call void 
@llvm.nvvm.cp.async.bulk.tensor.reduce.max.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.inc.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.dec.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.and.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.or.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 0, i1 false) + // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.reduce.xor.im2col.5d(ptr addrspace(3) %[[SRC]], ptr %[[DST]], i32 %[[D0]], i32 %[[D1]], i32 %[[D2]], i32 %[[D3]], i32 %[[D4]], i64 0, i1 false) nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3, %d4] {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3, %d4] {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> nvvm.cp.async.bulk.tensor.reduce %tma_desc, %src, box[%d0, %d1, %d2, %d3, %d4] {redKind = #nvvm.tma_redux_kind, mode = #nvvm.tma_store_mode} : !llvm.ptr, !llvm.ptr<3> diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir new file mode 100644 index 0000000000000..87ff0ba786648 --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir @@ -0,0 +1,87 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// Verifies that the IR builder can handle reductions with multi-block combiner +// regions on the GPU. 
+ +module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} { + llvm.func @bar() {} + llvm.func @baz() {} + + omp.declare_reduction @add_reduction_byref_box_5xf32 : !llvm.ptr alloc { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + omp.yield(%2 : !llvm.ptr) + } init { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + omp.yield(%arg1 : !llvm.ptr) + } combiner { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + llvm.call @bar() : () -> () + llvm.br ^bb3 + + ^bb3: // pred: ^bb1 + llvm.call @baz() : () -> () + omp.yield(%arg0 : !llvm.ptr) + } + llvm.func @foo_() { + %c1 = llvm.mlir.constant(1 : i64) : i64 + %10 = llvm.alloca %c1 x !llvm.array<5 x f32> {bindc_name = "x"} : (i64) -> !llvm.ptr<5> + %11 = llvm.addrspacecast %10 : !llvm.ptr<5> to !llvm.ptr + %74 = omp.map.info var_ptr(%11 : !llvm.ptr, !llvm.array<5 x f32>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "x"} + omp.target map_entries(%74 -> %arg0 : !llvm.ptr) { + %c1_2 = llvm.mlir.constant(1 : i32) : i32 + %c10 = llvm.mlir.constant(10 : i32) : i32 + omp.teams reduction(byref @add_reduction_byref_box_5xf32 %arg0 -> %arg2 : !llvm.ptr) { + omp.parallel { + omp.distribute { + omp.wsloop { + omp.loop_nest (%arg5) : i32 = (%c1_2) to (%c10) inclusive step (%c1_2) { + omp.yield + } + } {omp.composite} + } {omp.composite} + omp.terminator + } {omp.composite} + omp.terminator + } + omp.terminator + } + llvm.return + } +} + +// CHECK: call void @__kmpc_parallel_51({{.*}}, i32 1, i32 -1, i32 -1, +// CHECK-SAME: ptr @[[PAR_OUTLINED:.*]], ptr null, ptr %2, i64 1) + +// CHECK: define internal void @[[PAR_OUTLINED]]{{.*}} { +// CHECK: .omp.reduction.then: +// CHECK: br label %omp.reduction.nonatomic.body + +// CHECK: omp.reduction.nonatomic.body: +// CHECK: call void @bar() +// CHECK: br label %[[BODY_2ND_BB:.*]] + +// CHECK: [[BODY_2ND_BB]]: +// CHECK: call void @baz() +// CHECK: br label %[[CONT_BB:.*]] + +// CHECK: [[CONT_BB]]: +// CHECK-NEXT: %[[RED_RHS:.*]] = phi ptr [ %final.rhs, %{{.*}} ] +// CHECK-NEXT: store ptr %[[RED_RHS]], ptr %{{.*}}, align 8 +// CHECK-NEXT: br label %.omp.reduction.done +// CHECK: } + +// CHECK: define internal void @"{{.*}}$reduction$reduction_func"(ptr noundef %0, ptr noundef %1) #0 { +// CHECK: br label %omp.reduction.nonatomic.body + +// CHECK: [[BODY_2ND_BB:.*]]: +// CHECK: call void @baz() +// CHECK: br label %omp.region.cont + + +// CHECK: omp.reduction.nonatomic.body: +// CHECK: call void @bar() +// CHECK: br label %[[BODY_2ND_BB]] + +// CHECK: } diff --git a/mlir/test/Target/LLVMIR/openmp-cancel-distribute-parallel-loop.mlir b/mlir/test/Target/LLVMIR/openmp-cancel-distribute-parallel-loop.mlir index 2339022be8979..b91c97738f87f 100644 --- a/mlir/test/Target/LLVMIR/openmp-cancel-distribute-parallel-loop.mlir +++ b/mlir/test/Target/LLVMIR/openmp-cancel-distribute-parallel-loop.mlir @@ -32,7 +32,7 @@ llvm.func @cancel_distribute_parallel_do(%lb : i32, %ub : i32, %step : i32) { // CHECK: omp.region.cont6: // CHECK: br label %omp.region.cont4 // CHECK: omp.region.cont4: -// CHECK: br label %distribute.exit.exitStub +// CHECK: br label %omp.par.exit.exitStub // CHECK: omp_loop.body: // CHECK: %[[VAL_111:.*]] = add i32 %{{.*}}, %{{.*}} // CHECK: %[[VAL_112:.*]] = mul i32 %[[VAL_111]], 
%{{.*}} @@ -52,6 +52,6 @@ llvm.func @cancel_distribute_parallel_do(%lb : i32, %ub : i32, %step : i32) { // CHECK: omp_loop.inc: // CHECK: %[[VAL_100:.*]] = add nuw i32 %{{.*}}, 1 // CHECK: br label %omp_loop.header -// CHECK: distribute.exit.exitStub: +// CHECK: omp.par.exit.exitStub: // CHECK: ret void diff --git a/mlir/test/Target/LLVMIR/openmp-distribute-private.mlir b/mlir/test/Target/LLVMIR/openmp-distribute-private.mlir index 188c12ebfd3c7..ef118e0ad1df2 100644 --- a/mlir/test/Target/LLVMIR/openmp-distribute-private.mlir +++ b/mlir/test/Target/LLVMIR/openmp-distribute-private.mlir @@ -34,11 +34,6 @@ llvm.func @_QQmain() { // CHECK: } // CHECK: define internal void @[[TEAMS_FUNC]]({{.*}}) { -// CHECK: call void @[[DIST_FUNC:.*]]() -// CHECK-NEXT: br label %distribute.exit -// CHECK: } - -// CHECK: define internal void @[[DIST_FUNC]]() { // CHECK: %[[PRIV_VAR_ALLOC:.*]] = alloca float, align 4 // CHECK: %[[IV_ALLOC:.*]] = alloca i32, align 4 @@ -78,29 +73,22 @@ llvm.func @_QQmain() { // CHECK-LABEL: define void @_QQmain() { // CHECK: %[[SHARED_VAR_ALLOC:.*]] = alloca float, i64 1, align 4 -// CHECK: %[[SHARED_VAR_PTR:.*]] = getelementptr { ptr }, ptr %[[DIST_PARAM:.*]], i32 0, i32 0 -// CHECK: store ptr %[[SHARED_VAR_ALLOC]], ptr %[[SHARED_VAR_PTR]], align 8 -// CHECK: call void @[[DIST_FUNC:.*]](ptr %[[DIST_PARAM]]) -// CHECK-NEXT: br label %distribute.exit -// CHECK: } -// CHECK: define internal void @[[DIST_FUNC]](ptr %[[DIST_ARG:.*]]) { -// CHECK: %[[SHARED_VAR_GEP:.*]] = getelementptr { ptr }, ptr %[[DIST_ARG]], i32 0, i32 0 -// CHECK: %[[SHARED_VAR_PTR2:.*]] = load ptr, ptr %[[SHARED_VAR_GEP]], align 8 +// CHECK: distribute.alloca: // CHECK: %[[PRIV_VAR_ALLOC:.*]] = alloca float, align 4 // CHECK: omp.private.copy: -// CHECK-NEXT: %[[SHARED_VAR_VAL:.*]] = load float, ptr %[[SHARED_VAR_PTR2]], align 4 +// CHECK-NEXT: %[[SHARED_VAR_VAL:.*]] = load float, ptr %[[SHARED_VAR_ALLOC]], align 4 // CHECK-NEXT: store float %[[SHARED_VAR_VAL]], ptr %[[PRIV_VAR_ALLOC]], align 4 +// CHECK: omp.loop_nest.region: +// CHECK-NEXT: store float 0x40091EB860000000, ptr %[[PRIV_VAR_ALLOC]], align 4 + // CHECK: omp_loop.after: // CHECK-NEXT: br label %omp.region.cont // CHECK: omp.region.cont: // CHECK-NEXT: call void @foo_free(ptr %[[PRIV_VAR_ALLOC]]) - -// CHECK: omp.loop_nest.region: -// CHECK-NEXT: store float 0x40091EB860000000, ptr %[[PRIV_VAR_ALLOC]], align 4 // CHECK: } diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 27210bc0890ce..8bd33a382197e 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -3339,12 +3339,6 @@ llvm.func @distribute() { } // CHECK-LABEL: define void @distribute -// CHECK: call void @[[OUTLINED:.*]]({{.*}}) -// CHECK-NEXT: br label %[[EXIT:.*]] -// CHECK: [[EXIT]]: -// CHECK: ret void - -// CHECK: define internal void @[[OUTLINED]]({{.*}}) // CHECK: %[[LASTITER:.*]] = alloca i32 // CHECK: %[[LB:.*]] = alloca i64 // CHECK: %[[UB:.*]] = alloca i64 @@ -3381,9 +3375,7 @@ llvm.func @distribute_wsloop(%lb : i32, %ub : i32, %step : i32) { // CHECK: call void{{.*}}@__kmpc_fork_call({{.*}}, ptr @[[OUTLINED_PARALLEL:.*]], // CHECK: define internal void @[[OUTLINED_PARALLEL]] -// CHECK: call void @[[OUTLINED_DISTRIBUTE:.*]]({{.*}}) - -// CHECK: define internal void @[[OUTLINED_DISTRIBUTE]] +// CHECK: distribute.alloca: // CHECK: %[[LASTITER:.*]] = alloca i32 // CHECK: %[[LB:.*]] = alloca i32 // CHECK: %[[UB:.*]] = alloca i32 diff --git 
a/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir b/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir index 9bb2b40a43def..504d91b1f6198 100644 --- a/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir +++ b/mlir/test/Target/LLVMIR/openmp-target-generic-spmd.mlir @@ -49,9 +49,6 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a // HOST: call void{{.*}}@__kmpc_fork_teams({{.*}}, ptr @[[TEAMS_OUTLINE:.*]], {{.*}}) // HOST: define internal void @[[TEAMS_OUTLINE]] -// HOST: call void @[[DISTRIBUTE_OUTLINE:.*]]({{.*}}) - -// HOST: define internal void @[[DISTRIBUTE_OUTLINE]] // HOST: call void @__kmpc_for_static_init{{.*}}(ptr {{.*}}, i32 {{.*}}, i32 92, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}, i32 {{.*}}) // HOST: call void (ptr, i32, ptr, ...) @__kmpc_fork_call({{.*}}, ptr @[[PARALLEL_OUTLINE:.*]], {{.*}}) diff --git a/mlir/test/Target/LLVMIR/openmp-target-spmd.mlir b/mlir/test/Target/LLVMIR/openmp-target-spmd.mlir index 86dff678bf639..20202fc7fc16c 100644 --- a/mlir/test/Target/LLVMIR/openmp-target-spmd.mlir +++ b/mlir/test/Target/LLVMIR/openmp-target-spmd.mlir @@ -46,9 +46,6 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a // HOST: call void{{.*}}@__kmpc_fork_call({{.*}}, ptr @[[PARALLEL_OUTLINE:.*]], {{.*}}) // HOST: define internal void @[[PARALLEL_OUTLINE]] -// HOST: call void @[[DISTRIBUTE_OUTLINE:.*]]({{.*}}) - -// HOST: define internal void @[[DISTRIBUTE_OUTLINE]] // HOST: call void @__kmpc_dist_for_static_init{{.*}}(ptr {{.*}}, i32 {{.*}}, i32 34, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, ptr {{.*}}, i32 {{.*}}, i32 {{.*}}) //--- device.mlir diff --git a/mlir/test/Target/LLVMIR/openmp-teams-distribute-parallel-do-simd.mlir b/mlir/test/Target/LLVMIR/openmp-teams-distribute-parallel-do-simd.mlir index 4d766cc1ac4f4..69d5d225d0515 100644 --- a/mlir/test/Target/LLVMIR/openmp-teams-distribute-parallel-do-simd.mlir +++ b/mlir/test/Target/LLVMIR/openmp-teams-distribute-parallel-do-simd.mlir @@ -3,22 +3,20 @@ // Check that omp.simd as a leaf of a composite construct still generates // the appropriate loop vectorization attribute. 
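For orientation, the composite construct being checked corresponds roughly to the C++/OpenMP source below; the `simd` leaf is what is expected to surface as the loop's vectorization hints (the !llvm.loop metadata the CHECK lines look for). The function and array names are illustrative, not taken from the test.

```cpp
// Rough C++/OpenMP analogue of a teams-distribute-parallel-for-simd composite;
// only the simd leaf should drive the generated loop vectorization metadata.
void saxpy(float *y, const float *x, float a, int n) {
#pragma omp target teams distribute parallel for simd map(tofrom : y[0 : n])  \
    map(to : x[0 : n])
  for (int i = 0; i < n; ++i)
    y[i] += a * x[i];
}
```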
-// CHECK-LABEL: define internal void @test_teams_distribute_parallel_do_simd..omp_par.2 +// CHECK-LABEL: define internal void @test_teams_distribute_parallel_do_simd..omp_par.1 // CHECK: teams.body: // CHECK: omp.teams.region: -// CHECK-LABEL: define internal void @test_teams_distribute_parallel_do_simd..omp_par.1 +// CHECK-LABEL: define internal void @test_teams_distribute_parallel_do_simd..omp_par // CHECK: omp.par.entry: // CHECK: omp.par.region: -// CHECK: distribute.exit: - -// CHECK-LABEL: define internal void @test_teams_distribute_parallel_do_simd..omp_par // CHECK: distribute.body: // CHECK: omp.distribute.region: // CHECK: omp_loop.header: // CHECK: omp_loop.inc: // CHECK-NEXT: %omp_loop.next = add nuw i32 %omp_loop.iv, 1 // CHECK-NEXT: br label %omp_loop.header, !llvm.loop ![[LOOP_ATTR:.*]] +// CHECK: omp.par.exit.exitStub: // CHECK: ![[LOOP_ATTR]] = distinct !{![[LOOP_ATTR]], ![[LPAR:.*]], ![[LVEC:.*]]} // CHECK: ![[LPAR]] = !{!"llvm.loop.parallel_accesses", ![[PAR_ACC:.*]]} diff --git a/mlir/test/Target/LLVMIR/ptr.mlir b/mlir/test/Target/LLVMIR/ptr.mlir index 2fa794130ec52..e2687e52ece57 100644 --- a/mlir/test/Target/LLVMIR/ptr.mlir +++ b/mlir/test/Target/LLVMIR/ptr.mlir @@ -281,3 +281,99 @@ llvm.func @ptr_add_cst() -> !ptr.ptr<#llvm.address_space<0>> { %res = ptr.ptr_add %ptr, %off : !ptr.ptr<#llvm.address_space<0>>, i32 llvm.return %res : !ptr.ptr<#llvm.address_space<0>> } + +// CHECK-LABEL: define i64 @ptr_diff_scalar +// CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) { +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64 +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64 +// CHECK-NEXT: %[[DIFF:.*]] = sub i64 %[[P1INT]], %[[P2INT]] +// CHECK-NEXT: ret i64 %[[DIFF]] +// CHECK-NEXT: } +llvm.func @ptr_diff_scalar(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr2: !ptr.ptr<#llvm.address_space<0>>) -> i64 { + %diff = ptr.ptr_diff %ptr1, %ptr2 : !ptr.ptr<#llvm.address_space<0>> -> i64 + llvm.return %diff : i64 +} + +// CHECK-LABEL: define i32 @ptr_diff_scalar_i32 +// CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) { +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64 +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64 +// CHECK-NEXT: %[[DIFF:.*]] = sub i64 %[[P1INT]], %[[P2INT]] +// CHECK-NEXT: %[[TRUNC:.*]] = trunc i64 %[[DIFF]] to i32 +// CHECK-NEXT: ret i32 %[[TRUNC]] +// CHECK-NEXT: } +llvm.func @ptr_diff_scalar_i32(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr2: !ptr.ptr<#llvm.address_space<0>>) -> i32 { + %diff = ptr.ptr_diff %ptr1, %ptr2 : !ptr.ptr<#llvm.address_space<0>> -> i32 + llvm.return %diff : i32 +} + +// CHECK-LABEL: define <4 x i64> @ptr_diff_vector +// CHECK-SAME: (<4 x ptr> %[[PTRS1:.*]], <4 x ptr> %[[PTRS2:.*]]) { +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint <4 x ptr> %[[PTRS1]] to <4 x i64> +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint <4 x ptr> %[[PTRS2]] to <4 x i64> +// CHECK-NEXT: %[[DIFF:.*]] = sub <4 x i64> %[[P1INT]], %[[P2INT]] +// CHECK-NEXT: ret <4 x i64> %[[DIFF]] +// CHECK-NEXT: } +llvm.func @ptr_diff_vector(%ptrs1: vector<4x!ptr.ptr<#llvm.address_space<0>>>, %ptrs2: vector<4x!ptr.ptr<#llvm.address_space<0>>>) -> vector<4xi64> { + %diffs = ptr.ptr_diff %ptrs1, %ptrs2 : vector<4x!ptr.ptr<#llvm.address_space<0>>> -> vector<4xi64> + llvm.return %diffs : vector<4xi64> +} + +// CHECK-LABEL: define <8 x i32> @ptr_diff_vector_i32 +// CHECK-SAME: (<8 x ptr> %[[PTRS1:.*]], <8 x ptr> %[[PTRS2:.*]]) { +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint <8 x ptr> %[[PTRS1]] to <8 x i64> +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint <8 x ptr> 
%[[PTRS2]] to <8 x i64> +// CHECK-NEXT: %[[DIFF:.*]] = sub <8 x i64> %[[P1INT]], %[[P2INT]] +// CHECK-NEXT: %[[TRUNC:.*]] = trunc <8 x i64> %[[DIFF]] to <8 x i32> +// CHECK-NEXT: ret <8 x i32> %[[TRUNC]] +// CHECK-NEXT: } +llvm.func @ptr_diff_vector_i32(%ptrs1: vector<8x!ptr.ptr<#llvm.address_space<0>>>, %ptrs2: vector<8x!ptr.ptr<#llvm.address_space<0>>>) -> vector<8xi32> { + %diffs = ptr.ptr_diff %ptrs1, %ptrs2 : vector<8x!ptr.ptr<#llvm.address_space<0>>> -> vector<8xi32> + llvm.return %diffs : vector<8xi32> +} + +// CHECK-LABEL: define i64 @ptr_diff_with_constants() { +// CHECK-NEXT: ret i64 4096 +// CHECK-NEXT: } +llvm.func @ptr_diff_with_constants() -> i64 { + %ptr1 = ptr.constant #ptr.address<0x2000> : !ptr.ptr<#llvm.address_space<0>> + %ptr2 = ptr.constant #ptr.address<0x1000> : !ptr.ptr<#llvm.address_space<0>> + %diff = ptr.ptr_diff %ptr1, %ptr2 : !ptr.ptr<#llvm.address_space<0>> -> i64 + llvm.return %diff : i64 +} + +// CHECK-LABEL: define i64 @ptr_diff_with_flags_nsw +// CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) { +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64 +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64 +// CHECK-NEXT: %[[DIFF:.*]] = sub nsw i64 %[[P1INT]], %[[P2INT]] +// CHECK-NEXT: ret i64 %[[DIFF]] +// CHECK-NEXT: } +llvm.func @ptr_diff_with_flags_nsw(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr2: !ptr.ptr<#llvm.address_space<0>>) -> i64 { + %diff = ptr.ptr_diff nsw %ptr1, %ptr2 : !ptr.ptr<#llvm.address_space<0>> -> i64 + llvm.return %diff : i64 +} + +// CHECK-LABEL: define i64 @ptr_diff_with_flags_nuw +// CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) { +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64 +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64 +// CHECK-NEXT: %[[DIFF:.*]] = sub nuw i64 %[[P1INT]], %[[P2INT]] +// CHECK-NEXT: ret i64 %[[DIFF]] +// CHECK-NEXT: } +llvm.func @ptr_diff_with_flags_nuw(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr2: !ptr.ptr<#llvm.address_space<0>>) -> i64 { + %diff = ptr.ptr_diff nuw %ptr1, %ptr2 : !ptr.ptr<#llvm.address_space<0>> -> i64 + llvm.return %diff : i64 +} + +// CHECK-LABEL: define i64 @ptr_diff_with_flags_nsw_nuw +// CHECK-SAME: (ptr %[[PTR1:.*]], ptr %[[PTR2:.*]]) { +// CHECK-NEXT: %[[P1INT:.*]] = ptrtoint ptr %[[PTR1]] to i64 +// CHECK-NEXT: %[[P2INT:.*]] = ptrtoint ptr %[[PTR2]] to i64 +// CHECK-NEXT: %[[DIFF:.*]] = sub nuw nsw i64 %[[P1INT]], %[[P2INT]] +// CHECK-NEXT: ret i64 %[[DIFF]] +// CHECK-NEXT: } +llvm.func @ptr_diff_with_flags_nsw_nuw(%ptr1: !ptr.ptr<#llvm.address_space<0>>, %ptr2: !ptr.ptr<#llvm.address_space<0>>) -> i64 { + %diff = ptr.ptr_diff nsw | nuw %ptr1, %ptr2 : !ptr.ptr<#llvm.address_space<0>> -> i64 + llvm.return %diff : i64 +} diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index bebd1b4317b2f..e043a8c533d05 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -907,10 +907,10 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr, %stride : i16, - %numRecords : i32, + %numRecords : i64, %flags : i32) -> !llvm.ptr<8> { // CHECK-LABEL: rocdl.make.buffer.rsrc - // CHECK: %[[rsrc:.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %{{.*}}, i16 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) + // CHECK: %[[rsrc:.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %{{.*}}, i16 %{{.*}}, i64 %{{.*}}, i32 %{{.*}}) // CHECK: ret ptr addrspace(8) %[[rsrc]] %rsrc = 
rocdl.make.buffer.rsrc %ptr, %stride, %numRecords, %flags : !llvm.ptr to !llvm.ptr<8> llvm.return %rsrc : !llvm.ptr<8> @@ -918,10 +918,10 @@ llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr, llvm.func @rocdl.make.buffer.rsrc.p7.p1(%ptr : !llvm.ptr<1>, %stride : i16, - %numRecords : i32, + %numRecords : i64, %flags : i32) -> !llvm.ptr<7> { // CHECK-LABEL: rocdl.make.buffer.rsrc.p7.p1 - // CHECK: %[[rsrc:.*]] = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %{{.*}}, i16 %{{.*}}, i32 %{{.*}}, i32 %{{.*}}) + // CHECK: %[[rsrc:.*]] = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %{{.*}}, i16 %{{.*}}, i64 %{{.*}}, i32 %{{.*}}) // CHECK: ret ptr addrspace(7) %[[rsrc]] %rsrc = rocdl.make.buffer.rsrc %ptr, %stride, %numRecords, %flags : <1> to <7> llvm.return %rsrc : !llvm.ptr<7> @@ -1298,6 +1298,20 @@ llvm.func @rocdl_last_use(%ptr: !llvm.ptr<1>) -> i32 { llvm.return %ret : i32 } +llvm.func @test_fmed3_f16(%arg0: f16, %arg1: f16, %arg2: f16) -> f16 { + // CHECK-LABEL: define half @test_fmed3_f16(half %0, half %1, half %2) + %0 = rocdl.fmed3 %arg0, %arg1, %arg2 : f16 + llvm.return %0 : f16 + // CHECK: call half @llvm.amdgcn.fmed3.f16(half %0, half %1, half %2) +} + +llvm.func @test_fmed3_f32(%arg0: f32, %arg1: f32, %arg2: f32) -> f32 { + // CHECK-LABEL: define float @test_fmed3_f32(float %0, float %1, float %2) + %0 = rocdl.fmed3 %arg0, %arg1, %arg2 : f32 + llvm.return %0 : f32 + // CHECK: call float @llvm.amdgcn.fmed3.f32(float %0, float %1, float %2) +} + // CHECK-LABEL: rocdl.cvt.scale.pk8 // CHECK-SAME:(i32 %[[I32:.+]], <2 x i32> %[[V2I32:.+]], i32 %[[SCALE:.+]]) llvm.func @rocdl.cvt.scale.pk8(%i32: i32, %v2xi32: vector<2xi32>, %scale: i32) { diff --git a/mlir/test/Target/SPIRV/execution-mode.mlir b/mlir/test/Target/SPIRV/execution-mode.mlir index e51ba7c0269a4..2178a8a77a225 100644 --- a/mlir/test/Target/SPIRV/execution-mode.mlir +++ b/mlir/test/Target/SPIRV/execution-mode.mlir @@ -1,10 +1,15 @@ // RUN: mlir-translate -no-implicit-module -test-spirv-roundtrip %s | FileCheck %s -spirv.module Logical GLSL450 requires #spirv.vce { +// RUN: %if spirv-tools %{ rm -rf %t %} +// RUN: %if spirv-tools %{ mkdir %t %} +// RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv --split-input-file --spirv-save-validation-files-with-prefix=%t/module %s %} +// RUN: %if spirv-tools %{ spirv-val %t %} + +spirv.module Logical OpenCL requires #spirv.vce { spirv.func @foo() -> () "None" { spirv.Return } - spirv.EntryPoint "GLCompute" @foo + spirv.EntryPoint "Kernel" @foo // CHECK: spirv.ExecutionMode @foo "LocalSizeHint", 3, 4, 5 spirv.ExecutionMode @foo "LocalSizeHint", 3, 4, 5 } diff --git a/mlir/test/Target/SPIRV/function-call.mlir b/mlir/test/Target/SPIRV/function-call.mlir index a7473a8ccd7ba..2e94ded3401ce 100644 --- a/mlir/test/Target/SPIRV/function-call.mlir +++ b/mlir/test/Target/SPIRV/function-call.mlir @@ -1,26 +1,31 @@ // RUN: mlir-translate -no-implicit-module -test-spirv-roundtrip %s | FileCheck %s -spirv.module Logical GLSL450 requires #spirv.vce { - spirv.GlobalVariable @var1 : !spirv.ptr, Input> +// RUN: %if spirv-tools %{ rm -rf %t %} +// RUN: %if spirv-tools %{ mkdir %t %} +// RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv --split-input-file --spirv-save-validation-files-with-prefix=%t/module %s %} +// RUN: %if spirv-tools %{ spirv-val %t %} + +spirv.module Logical GLSL450 requires #spirv.vce { + spirv.GlobalVariable @var1 : !spirv.ptr, StorageBuffer> spirv.func @fmain() -> i32 "None" 
{ %0 = spirv.Constant 16 : i32 - %1 = spirv.mlir.addressof @var1 : !spirv.ptr, Input> + %1 = spirv.mlir.addressof @var1 : !spirv.ptr, StorageBuffer> // CHECK: {{%.*}} = spirv.FunctionCall @f_0({{%.*}}) : (i32) -> i32 %3 = spirv.FunctionCall @f_0(%0) : (i32) -> i32 - // CHECK: spirv.FunctionCall @f_1({{%.*}}, {{%.*}}) : (i32, !spirv.ptr, Input>) -> () - spirv.FunctionCall @f_1(%3, %1) : (i32, !spirv.ptr, Input>) -> () - // CHECK: {{%.*}} = spirv.FunctionCall @f_2({{%.*}}) : (!spirv.ptr, Input>) -> !spirv.ptr, Input> - %4 = spirv.FunctionCall @f_2(%1) : (!spirv.ptr, Input>) -> !spirv.ptr, Input> + // CHECK: spirv.FunctionCall @f_1({{%.*}}, {{%.*}}) : (i32, !spirv.ptr, StorageBuffer>) -> () + spirv.FunctionCall @f_1(%3, %1) : (i32, !spirv.ptr, StorageBuffer>) -> () + // CHECK: {{%.*}} = spirv.FunctionCall @f_2({{%.*}}) : (!spirv.ptr, StorageBuffer>) -> !spirv.ptr, StorageBuffer> + %4 = spirv.FunctionCall @f_2(%1) : (!spirv.ptr, StorageBuffer>) -> !spirv.ptr, StorageBuffer> spirv.ReturnValue %3 : i32 } spirv.func @f_0(%arg0 : i32) -> i32 "None" { spirv.ReturnValue %arg0 : i32 } - spirv.func @f_1(%arg0 : i32, %arg1 : !spirv.ptr, Input>) -> () "None" { + spirv.func @f_1(%arg0 : i32, %arg1 : !spirv.ptr, StorageBuffer>) -> () "None" { spirv.Return } - spirv.func @f_2(%arg0 : !spirv.ptr, Input>) -> !spirv.ptr, Input> "None" { - spirv.ReturnValue %arg0 : !spirv.ptr, Input> + spirv.func @f_2(%arg0 : !spirv.ptr, StorageBuffer>) -> !spirv.ptr, StorageBuffer> "None" { + spirv.ReturnValue %arg0 : !spirv.ptr, StorageBuffer> } spirv.func @f_loop_with_function_call(%count : i32) -> () "None" { diff --git a/mlir/test/Transforms/move-operation-deps.mlir b/mlir/test/Transforms/move-operation-deps.mlir index aa7b5dc2a240a..75d8386d520ee 100644 --- a/mlir/test/Transforms/move-operation-deps.mlir +++ b/mlir/test/Transforms/move-operation-deps.mlir @@ -460,3 +460,31 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +func.func @move_isolated_from_above() -> () { + %1 = "before"() : () -> (f32) + %2 = "moved0"() : () -> (f32) + %3 = test.isolated_one_region_op %2 {} : f32 -> f32 + %4 = "moved1"(%3) : (f32) -> (f32) + return +} +// CHECK-LABEL: func @move_isolated_from_above() +// CHECK: %[[MOVED0:.+]] = "moved0" +// CHECK: %[[ISOLATED:.+]] = test.isolated_one_region_op %[[MOVED0]] +// CHECK: %[[MOVED1:.+]] = "moved1"(%[[ISOLATED]]) +// CHECK: %[[BEFORE:.+]] = "before" + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0 : !transform.any_op {transform.readonly}) { + %op1 = transform.structured.match ops{["before"]} in %arg0 + : (!transform.any_op) -> !transform.any_op + %op2 = transform.structured.match ops{["moved1"]} in %arg0 + : (!transform.any_op) -> !transform.any_op + %v1 = transform.get_result %op2[0] : (!transform.any_op) -> !transform.any_value + transform.test.move_value_defns %v1 before %op1 + : (!transform.any_value), !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Transforms/remove-dead-values-call-segments.mlir b/mlir/test/Transforms/remove-dead-values-call-segments.mlir new file mode 100644 index 0000000000000..fed9cabbd2ee8 --- /dev/null +++ b/mlir/test/Transforms/remove-dead-values-call-segments.mlir @@ -0,0 +1,23 @@ +// RUN: mlir-opt --split-input-file --remove-dead-values --mlir-print-op-generic %s | FileCheck %s --check-prefix=GEN + +// ----- +// Private callee: both args become dead after internal DCE; RDV drops callee +// args and shrinks the *args* segment on the call-site to zero; 
sizes kept in +// sync. + +module { + func.func private @callee(%x: i32, %y: i32) { + %u = arith.addi %x, %x : i32 // %y is dead + return + } + + func.func @caller(%a: i32, %b: i32) { + // args segment initially has 2 operands. + "test.call_with_segments"(%a, %b) { callee = @callee, + operandSegmentSizes = array } : (i32, i32) -> () + return + } +} + +// GEN: "test.call_with_segments"() <{callee = @callee, operandSegmentSizes = array}> : () -> () +// ^ args shrank from 2 -> 0 diff --git a/mlir/test/Transforms/remove-dead-values.mlir b/mlir/test/Transforms/remove-dead-values.mlir index fa2c145bd3701..56449469dc29f 100644 --- a/mlir/test/Transforms/remove-dead-values.mlir +++ b/mlir/test/Transforms/remove-dead-values.mlir @@ -615,3 +615,37 @@ module @last_block_not_exit { // CHECK-LABEL: @call_private_but_not_use // CHECK: call @terminated_with_condbr(%false, %true) : (i1, i1) } + +// ----- + +// Test the elimination of function arguments. + +// CHECK-LABEL: func private @single_parameter +// CHECK-SAME: () { +func.func private @single_parameter(%arg0: index) { + return +} + +// CHECK-LABEL: func.func private @mutl_parameter( +// CHECK-SAME: %[[ARG0:.*]]: index) +// CHECK: return %[[ARG0]] +func.func private @mutl_parameter(%arg0: index, %arg1: index, %arg2: index) -> index { + return %arg1 : index +} + +// CHECK-LABEL: func private @eliminate_parameter +// CHECK-SAME: () { +func.func private @eliminate_parameter(%arg0: index, %arg1: index) { + call @single_parameter(%arg0) : (index) -> () + return +} + +// CHECK-LABEL: func @callee +// CHECK-SAME: (%[[ARG0:.*]]: index, %[[ARG1:.*]]: index, %[[ARG2:.*]]: index) +func.func @callee(%arg0: index, %arg1: index, %arg2: index) -> index { +// CHECK: call @eliminate_parameter() : () -> () + call @eliminate_parameter(%arg0, %arg1) : (index, index) -> () +// CHECK: call @mutl_parameter(%[[ARG1]]) : (index) -> index + %res = call @mutl_parameter(%arg0, %arg1, %arg2) : (index, index, index) -> (index) + return %res : index +} diff --git a/mlir/test/Transforms/test-bubble-down-memory-space-casts.mlir b/mlir/test/Transforms/test-bubble-down-memory-space-casts.mlir new file mode 100644 index 0000000000000..e4fce89cffb45 --- /dev/null +++ b/mlir/test/Transforms/test-bubble-down-memory-space-casts.mlir @@ -0,0 +1,298 @@ +// RUN: mlir-opt %s --bubble-down-memory-space-casts | FileCheck %s + +#map = affine_map<(d0, d1)[s0] -> (d1 * s0 + d0)> + +// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1)[s0] -> (d1 * s0 + d0)> +// CHECK-LABEL: func.func @load_store( +// CHECK-SAME: %[[ARG0:.*]]: memref, +// CHECK-SAME: %[[ARG1:.*]]: index) { +// CHECK: %[[VAL_0:.*]] = memref.load %[[ARG0]]{{\[}}%[[ARG1]]] : memref +// CHECK: memref.store %[[VAL_0]], %[[ARG0]]{{\[}}%[[ARG1]]] : memref +// CHECK: return +// CHECK: } +func.func @load_store(%arg0: memref, %arg1: index) { + %memspacecast = memref.memory_space_cast %arg0 : memref to memref + %0 = memref.load %memspacecast[%arg1] : memref + memref.store %0, %memspacecast[%arg1] : memref + return +} + +// CHECK-LABEL: func.func @load_store_unfoldable( +// CHECK-SAME: %[[ARG0:.*]]: memref, +// CHECK-SAME: %[[ARG1:.*]]: index) { +// CHECK: %[[VAL_0:.*]] = memref.memory_space_cast %[[ARG0]] : memref to memref +// CHECK: %[[VAL_1:.*]] = memref.load %[[VAL_0]]{{\[}}%[[ARG1]]] : memref +// CHECK: memref.store %[[VAL_1]], %[[VAL_0]]{{\[}}%[[ARG1]]] : memref +// CHECK: return +// CHECK: } +func.func @load_store_unfoldable(%arg0: memref, %arg1: index) { + %memspacecast = memref.memory_space_cast %arg0 : memref to memref + %0 = memref.load 
%memspacecast[%arg1] : memref + memref.store %0, %memspacecast[%arg1] : memref + return +} + +// CHECK-LABEL: func.func @cast( +// CHECK-SAME: %[[ARG0:.*]]: memref<2xf32, 1>, +// CHECK-SAME: %[[ARG1:.*]]: memref<*xf32, 1>) -> (memref<*xf32>, memref<3x2xf32>) { +// CHECK: %[[VAL_0:.*]] = memref.cast %[[ARG0]] : memref<2xf32, 1> to memref<*xf32, 1> +// CHECK: %[[VAL_1:.*]] = memref.memory_space_cast %[[VAL_0]] : memref<*xf32, 1> to memref<*xf32> +// CHECK: %[[VAL_2:.*]] = memref.cast %[[ARG1]] : memref<*xf32, 1> to memref<3x2xf32, 1> +// CHECK: %[[VAL_3:.*]] = memref.memory_space_cast %[[VAL_2]] : memref<3x2xf32, 1> to memref<3x2xf32> +// CHECK: return %[[VAL_1]], %[[VAL_3]] : memref<*xf32>, memref<3x2xf32> +// CHECK: } +func.func @cast(%arg0: memref<2xf32, 1>, %arg1: memref<*xf32, 1>) -> (memref<*xf32>, memref<3x2xf32>) { + %memspacecast = memref.memory_space_cast %arg0 : memref<2xf32, 1> to memref<2xf32> + %1 = memref.cast %memspacecast : memref<2xf32> to memref<*xf32> + %memspacecast_1 = memref.memory_space_cast %arg1 : memref<*xf32, 1> to memref<*xf32> + %2 = memref.cast %memspacecast_1 : memref<*xf32> to memref<3x2xf32> + return %1, %2 : memref<*xf32>, memref<3x2xf32> +} + +// CHECK-LABEL: func.func @view( +// CHECK-SAME: %[[ARG0:.*]]: memref, +// CHECK-SAME: %[[ARG1:.*]]: index, %[[ARG2:.*]]: index) -> memref { +// CHECK: %[[VAL_0:.*]] = arith.constant 100 : index +// CHECK: %[[VAL_1:.*]] = memref.view %[[ARG0]]{{\[}}%[[ARG1]]]{{\[}}%[[ARG2]], %[[VAL_0]]] : memref to memref +// CHECK: %[[VAL_2:.*]] = memref.memory_space_cast %[[VAL_1]] : memref to memref +// CHECK: return %[[VAL_2]] : memref +// CHECK: } +func.func @view(%arg0: memref, %arg1: index, %arg2: index) -> memref { + %memspacecast = memref.memory_space_cast %arg0 : memref to memref + %c100 = arith.constant 100 : index + %view = memref.view %memspacecast[%arg1][%arg2, %c100] : memref to memref + return %view : memref +} + +// CHECK-LABEL: func.func @subview( +// CHECK-SAME: %[[ARG0:.*]]: memref, +// CHECK-SAME: %[[ARG1:.*]]: index) -> memref<8x2xf32, strided<[?, 2], offset: ?>> { +// CHECK: %[[VAL_0:.*]] = memref.subview %[[ARG0]][4, 2] [8, 2] [3, 2] : memref to memref<8x2xf32, strided<[?, 2], offset: ?>, 1> +// CHECK: %[[VAL_1:.*]] = memref.memory_space_cast %[[VAL_0]] : memref<8x2xf32, strided<[?, 2], offset: ?>, 1> to memref<8x2xf32, strided<[?, 2], offset: ?>> +// CHECK: return %[[VAL_1]] : memref<8x2xf32, strided<[?, 2], offset: ?>> +// CHECK: } +func.func @subview(%arg0: memref, %arg1: index) -> memref<8x2xf32, strided<[?, 2], offset: ?>> { + %memspacecast = memref.memory_space_cast %arg0 : memref to memref + %subview = memref.subview %memspacecast[4, 2] [8, 2] [3, 2] : memref to memref<8x2xf32, strided<[?, 2], offset: ?>> + return %subview : memref<8x2xf32, strided<[?, 2], offset: ?>> +} + +// CHECK-LABEL: func.func @reinterpret_cast( +// CHECK-SAME: %[[ARG0:.*]]: memref, +// CHECK-SAME: %[[ARG1:.*]]: index) -> memref<10x?xf32, strided<[?, 1], offset: ?>> { +// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 10 : index +// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_2:.*]] = memref.reinterpret_cast %[[ARG0]] to offset: {{\[}}%[[VAL_1]]], sizes: [10, %[[VAL_0]]], strides: {{\[}}%[[VAL_0]], 1] : memref to memref<10x?xf32, strided<[?, 1], offset: ?>, 1> +// CHECK: %[[VAL_3:.*]] = memref.memory_space_cast %[[VAL_2]] : memref<10x?xf32, strided<[?, 1], offset: ?>, 1> to memref<10x?xf32, strided<[?, 1], offset: ?>> +// CHECK: return %[[VAL_3]] : memref<10x?xf32, strided<[?, 1], offset: ?>> +// CHECK: } 
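This test file exercises the --bubble-down-memory-space-casts pass. The general shape of such a rewrite is: when a memref consumer's operand comes from memref.memory_space_cast, recreate the consumer directly on the cast's source and, for view-like ops that still produce a memref, re-insert the cast on the result (as the CHECK lines show). A minimal sketch of the load case is below; it assumes nothing about the actual pass implementation and only covers the case where the cast changes the memory space and nothing else.

```cpp
// Minimal sketch (not the in-tree pass): fold memref.memory_space_cast into a
// following memref.load by loading straight from the cast's source.
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/PatternMatch.h"

using namespace mlir;

struct BubbleCastThroughLoad : OpRewritePattern<memref::LoadOp> {
  using OpRewritePattern<memref::LoadOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(memref::LoadOp load,
                                PatternRewriter &rewriter) const override {
    auto castOp = load.getMemRef().getDefiningOp<memref::MemorySpaceCastOp>();
    if (!castOp)
      return failure();
    // load(memory_space_cast(src))[i] == load(src)[i]: the element type and
    // layout are unchanged, only the memory space of the memref differs.
    rewriter.replaceOpWithNewOp<memref::LoadOp>(load, castOp.getSource(),
                                                load.getIndices());
    return success();
  }
};
```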
+func.func @reinterpret_cast(%arg0: memref, %arg1: index) -> memref<10x?xf32, strided<[?, 1], offset: ?>> { + %memspacecast = memref.memory_space_cast %arg0 : memref to memref + %c0 = arith.constant 0 : index + %c10 = arith.constant 10 : index + %reinterpret_cast = memref.reinterpret_cast %memspacecast to offset: [%c0], sizes: [10, %c10], strides: [%c10, 1] : memref to memref<10x?xf32, strided<[?, 1], offset: ?>> + return %reinterpret_cast : memref<10x?xf32, strided<[?, 1], offset: ?>> +} + +// CHECK-LABEL: func.func @reshape( +// CHECK-SAME: %[[ARG0:.*]]: memref, +// CHECK-SAME: %[[ARG1:.*]]: memref<1xindex>) -> memref { +// CHECK: %[[VAL_0:.*]] = memref.reshape %[[ARG0]](%[[ARG1]]) : (memref, memref<1xindex>) -> memref +// CHECK: %[[VAL_1:.*]] = memref.memory_space_cast %[[VAL_0]] : memref to memref +// CHECK: return %[[VAL_1]] : memref +// CHECK: } +func.func @reshape(%arg0: memref, %arg1: memref<1xindex>) -> memref { + %memspacecast = memref.memory_space_cast %arg0 : memref to memref + %reshape = memref.reshape %memspacecast(%arg1) : (memref, memref<1xindex>) -> memref + return %reshape : memref +} + +// CHECK-LABEL: func.func @expand_shape( +// CHECK-SAME: %[[ARG0:.*]]: memref<12xf32, 1>) -> memref<3x4xf32> { +// CHECK: %[[VAL_0:.*]] = memref.expand_shape %[[ARG0]] {{\[\[}}0, 1]] output_shape [3, 4] : memref<12xf32, 1> into memref<3x4xf32, 1> +// CHECK: %[[VAL_1:.*]] = memref.memory_space_cast %[[VAL_0]] : memref<3x4xf32, 1> to memref<3x4xf32> +// CHECK: return %[[VAL_1]] : memref<3x4xf32> +// CHECK: } +func.func @expand_shape(%arg0: memref<12xf32, 1>) -> memref<3x4xf32> { + %memspacecast = memref.memory_space_cast %arg0 : memref<12xf32, 1> to memref<12xf32> + %expand_shape = memref.expand_shape %memspacecast [[0, 1]] output_shape [3, 4] : memref<12xf32> into memref<3x4xf32> + return %expand_shape : memref<3x4xf32> +} + +// CHECK-LABEL: func.func @collapse_shape( +// CHECK-SAME: %[[ARG0:.*]]: memref<3x4xf32, 1>) -> memref<12xf32> { +// CHECK: %[[VAL_0:.*]] = memref.collapse_shape %[[ARG0]] {{\[\[}}0, 1]] : memref<3x4xf32, 1> into memref<12xf32, 1> +// CHECK: %[[VAL_1:.*]] = memref.memory_space_cast %[[VAL_0]] : memref<12xf32, 1> to memref<12xf32> +// CHECK: return %[[VAL_1]] : memref<12xf32> +// CHECK: } +func.func @collapse_shape(%arg0: memref<3x4xf32, 1>) -> memref<12xf32> { + %memspacecast = memref.memory_space_cast %arg0 : memref<3x4xf32, 1> to memref<3x4xf32> + %collapse_shape = memref.collapse_shape %memspacecast [[0, 1]] : memref<3x4xf32> into memref<12xf32> + return %collapse_shape : memref<12xf32> +} + +// CHECK-LABEL: func.func @transpose( +// CHECK-SAME: %[[ARG0:.*]]: memref) -> memref { +// CHECK: %[[VAL_0:.*]] = memref.transpose %[[ARG0]] (d0, d1) -> (d1, d0) : memref to memref +// CHECK: %[[VAL_1:.*]] = memref.memory_space_cast %[[VAL_0]] : memref to memref +// CHECK: return %[[VAL_1]] : memref +// CHECK: } +func.func @transpose(%arg0: memref) -> memref { + %memspacecast = memref.memory_space_cast %arg0 : memref to memref + %transpose = memref.transpose %memspacecast (d0, d1) -> (d1, d0) : memref to memref + return %transpose : memref +} + +// CHECK-LABEL: func.func @atomic_rmw( +// CHECK-SAME: %[[ARG0:.*]]: memref, +// CHECK-SAME: %[[ARG1:.*]]: index, +// CHECK-SAME: %[[ARG2:.*]]: f32) -> f32 { +// CHECK: %[[VAL_0:.*]] = memref.atomic_rmw addf %[[ARG2]], %[[ARG0]]{{\[}}%[[ARG1]]] : (f32, memref) -> f32 +// CHECK: return %[[VAL_0]] : f32 +// CHECK: } +func.func @atomic_rmw(%arg0: memref, %arg1: index, %arg2: f32) -> f32 { + %memspacecast = memref.memory_space_cast %arg0 
: memref to memref + %0 = memref.atomic_rmw addf %arg2, %memspacecast[%arg1] : (f32, memref) -> f32 + return %0 : f32 +} + +// CHECK-LABEL: func.func @assume_alignment( +// CHECK-SAME: %[[ARG0:.*]]: memref) -> memref { +// CHECK: %[[VAL_0:.*]] = memref.assume_alignment %[[ARG0]], 16 : memref +// CHECK: %[[VAL_1:.*]] = memref.memory_space_cast %[[VAL_0]] : memref to memref +// CHECK: return %[[VAL_1]] : memref +// CHECK: } +func.func @assume_alignment(%arg0: memref) -> memref { + %memspacecast = memref.memory_space_cast %arg0 : memref to memref + %1 = memref.assume_alignment %memspacecast, 16 : memref + return %1 : memref +} + +// CHECK-LABEL: func.func @op_with_cast_sequence( +// CHECK-SAME: %[[ARG0:.*]]: memref<4x4xf32, 1>, +// CHECK-SAME: %[[ARG1:.*]]: index, +// CHECK-SAME: %[[ARG2:.*]]: f32) -> memref<16xf32> { +// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 4 : index +// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_2:.*]] = memref.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2]] output_shape [4, 2, 2] : memref<4x4xf32, 1> into memref<4x2x2xf32, 1> +// CHECK: %[[VAL_3:.*]] = memref.collapse_shape %[[VAL_2]] {{\[\[}}0, 1, 2]] : memref<4x2x2xf32, 1> into memref<16xf32, 1> +// CHECK: %[[VAL_4:.*]] = memref.memory_space_cast %[[VAL_3]] : memref<16xf32, 1> to memref<16xf32> +// CHECK: %[[VAL_5:.*]] = memref.load %[[VAL_3]]{{\[}}%[[VAL_1]]] : memref<16xf32, 1> +// CHECK: %[[VAL_6:.*]] = arith.addf %[[VAL_5]], %[[ARG2]] : f32 +// CHECK: memref.store %[[VAL_6]], %[[VAL_3]]{{\[}}%[[VAL_1]]] : memref<16xf32, 1> +// CHECK: %[[VAL_7:.*]] = memref.atomic_rmw addf %[[ARG2]], %[[VAL_3]]{{\[}}%[[VAL_0]]] : (f32, memref<16xf32, 1>) -> f32 +// CHECK: return %[[VAL_4]] : memref<16xf32> +// CHECK: } +func.func @op_with_cast_sequence(%arg0: memref<4x4xf32, 1>, %arg1: index, %arg2: f32) -> memref<16xf32> { + %memspacecast = memref.memory_space_cast %arg0 : memref<4x4xf32, 1> to memref<4x4xf32> + %c0 = arith.constant 0 : index + %c4 = arith.constant 4 : index + %expanded = memref.expand_shape %memspacecast [[0], [1, 2]] output_shape [4, 2, 2] : memref<4x4xf32> into memref<4x2x2xf32> + %collapsed = memref.collapse_shape %expanded [[0, 1, 2]] : memref<4x2x2xf32> into memref<16xf32> + %loaded = memref.load %collapsed[%c0] : memref<16xf32> + %added = arith.addf %loaded, %arg2 : f32 + memref.store %added, %collapsed[%c0] : memref<16xf32> + %atomic_result = memref.atomic_rmw addf %arg2, %collapsed[%c4] : (f32, memref<16xf32>) -> f32 + return %collapsed : memref<16xf32> +} + +// CHECK-LABEL: func.func @transfer_read_write( +// CHECK-SAME: %[[ARG0:.*]]: memref, +// CHECK-SAME: %[[ARG1:.*]]: index) { +// CHECK: %[[VAL_0:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_1:.*]] = vector.transfer_read %[[ARG0]]{{\[}}%[[ARG1]]], %[[VAL_0]] : memref, vector<4xf32> +// CHECK: vector.transfer_write %[[VAL_1]], %[[ARG0]]{{\[}}%[[ARG1]]] : vector<4xf32>, memref +// CHECK: return +// CHECK: } +func.func @transfer_read_write(%arg0: memref, %arg1: index) { + %memspacecast = memref.memory_space_cast %arg0 : memref to memref + %c0 = arith.constant 0.0 : f32 + %0 = vector.transfer_read %memspacecast[%arg1], %c0 : memref, vector<4xf32> + vector.transfer_write %0, %memspacecast[%arg1] : vector<4xf32>, memref + return +} + +// NOTE: The operations disappear because they can get folded. 
+// CHECK-LABEL: func.func @transfer_read_write_tensor( +// CHECK-SAME: %[[ARG0:.*]]: tensor, +// CHECK-SAME: %[[ARG1:.*]]: index) -> tensor { +// CHECK: return %[[ARG0]] : tensor +// CHECK: } +func.func @transfer_read_write_tensor(%arg0: tensor, %arg1: index) -> tensor { + %c0 = arith.constant 0.0 : f32 + %0 = vector.transfer_read %arg0[%arg1], %c0 : tensor, vector<4xf32> + %1 = vector.transfer_write %0, %arg0[%arg1] : vector<4xf32>, tensor + return %1 : tensor +} + +// CHECK-LABEL: func.func @vector_load_store( +// CHECK-SAME: %[[ARG0:.*]]: memref, +// CHECK-SAME: %[[ARG1:.*]]: index) { +// CHECK: %[[VAL_0:.*]] = vector.load %[[ARG0]]{{\[}}%[[ARG1]]] : memref, vector<4xf32> +// CHECK: vector.store %[[VAL_0]], %[[ARG0]]{{\[}}%[[ARG1]]] : memref, vector<4xf32> +// CHECK: return +// CHECK: } +func.func @vector_load_store(%arg0: memref, %arg1: index) { + %memspacecast = memref.memory_space_cast %arg0 : memref to memref + %0 = vector.load %memspacecast[%arg1] : memref, vector<4xf32> + vector.store %0, %memspacecast[%arg1] : memref, vector<4xf32> + return +} + +// CHECK-LABEL: func.func @masked_load_store( +// CHECK-SAME: %[[ARG0:.*]]: memref, +// CHECK-SAME: %[[ARG1:.*]]: index) { +// CHECK-DAG: %[[VAL_0:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32> +// CHECK-DAG: %[[VAL_1:.*]] = arith.constant dense<[true, true, false, false]> : vector<4xi1> +// CHECK: %[[VAL_2:.*]] = vector.maskedload %[[ARG0]]{{\[}}%[[ARG1]]], %[[VAL_1]], %[[VAL_0]] : memref, vector<4xi1>, vector<4xf32> into vector<4xf32> +// CHECK: vector.maskedstore %[[ARG0]]{{\[}}%[[ARG1]]], %[[VAL_1]], %[[VAL_2]] : memref, vector<4xi1>, vector<4xf32> +// CHECK: return +// CHECK: } +func.func @masked_load_store(%arg0: memref, %arg1: index) { + %memspacecast = memref.memory_space_cast %arg0 : memref to memref + %mask = arith.constant dense<[true, true, false, false]> : vector<4xi1> + %passthrough = arith.constant dense<0.0> : vector<4xf32> + %0 = vector.maskedload %memspacecast[%arg1], %mask, %passthrough : memref, vector<4xi1>, vector<4xf32> into vector<4xf32> + vector.maskedstore %memspacecast[%arg1], %mask, %0 : memref, vector<4xi1>, vector<4xf32> + return +} + +// CHECK-LABEL: func.func @gather_scatter( +// CHECK-SAME: %[[ARG0:.*]]: memref, +// CHECK-SAME: %[[ARG1:.*]]: index) { +// CHECK-DAG: %[[VAL_0:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32> +// CHECK-DAG: %[[VAL_1:.*]] = arith.constant dense : vector<4xi1> +// CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex> +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = vector.gather %[[ARG0]]{{\[}}%[[VAL_3]]] {{\[}}%[[VAL_2]]], %[[VAL_1]], %[[VAL_0]] : memref, vector<4xindex>, vector<4xi1>, vector<4xf32> into vector<4xf32> +// CHECK: vector.scatter %[[ARG0]]{{\[}}%[[VAL_3]]] {{\[}}%[[VAL_2]]], %[[VAL_1]], %[[VAL_4]] : memref, vector<4xindex>, vector<4xi1>, vector<4xf32> +// CHECK: return +// CHECK: } +func.func @gather_scatter(%arg0: memref, %arg1: index) { + %memspacecast = memref.memory_space_cast %arg0 : memref to memref + %c0 = arith.constant 0 : index + %indices = arith.constant dense<[0, 1, 2, 3]> : vector<4xindex> + %mask = arith.constant dense : vector<4xi1> + %passthrough = arith.constant dense<0.0> : vector<4xf32> + %0 = vector.gather %memspacecast[%c0] [%indices], %mask, %passthrough : memref, vector<4xindex>, vector<4xi1>, vector<4xf32> into vector<4xf32> + vector.scatter %memspacecast[%c0] [%indices], %mask, %0 : memref, vector<4xindex>, vector<4xi1>, vector<4xf32> + return +} + +// 
CHECK-LABEL: func.func @expandload_compressstore( +// CHECK-SAME: %[[ARG0:.*]]: memref, +// CHECK-SAME: %[[ARG1:.*]]: index) { +// CHECK-DAG: %[[VAL_0:.*]] = arith.constant dense<0.000000e+00> : vector<4xf32> +// CHECK-DAG: %[[VAL_1:.*]] = arith.constant dense<[true, true, false, false]> : vector<4xi1> +// CHECK: %[[VAL_2:.*]] = vector.expandload %[[ARG0]]{{\[}}%[[ARG1]]], %[[VAL_1]], %[[VAL_0]] : memref, vector<4xi1>, vector<4xf32> into vector<4xf32> +// CHECK: vector.compressstore %[[ARG0]]{{\[}}%[[ARG1]]], %[[VAL_1]], %[[VAL_2]] : memref, vector<4xi1>, vector<4xf32> +// CHECK: return +// CHECK: } +func.func @expandload_compressstore(%arg0: memref, %arg1: index) { + %memspacecast = memref.memory_space_cast %arg0 : memref to memref + %mask = arith.constant dense<[true, true, false, false]> : vector<4xi1> + %passthrough = arith.constant dense<0.0> : vector<4xf32> + %0 = vector.expandload %memspacecast[%arg1], %mask, %passthrough : memref, vector<4xi1>, vector<4xf32> into vector<4xf32> + vector.compressstore %memspacecast[%arg1], %mask, %0 : memref, vector<4xi1>, vector<4xf32> + return +} diff --git a/mlir/test/lib/Dialect/Linalg/TestDataLayoutPropagation.cpp b/mlir/test/lib/Dialect/Linalg/TestDataLayoutPropagation.cpp index d332270468ea8..d45aaf788f9c2 100644 --- a/mlir/test/lib/Dialect/Linalg/TestDataLayoutPropagation.cpp +++ b/mlir/test/lib/Dialect/Linalg/TestDataLayoutPropagation.cpp @@ -33,7 +33,8 @@ struct TestDataLayoutPropagationPass MLIRContext *context = &getContext(); RewritePatternSet patterns(context); linalg::populateDataLayoutPropagationPatterns( - patterns, [](OpOperand *opOperand) { return true; }); + patterns, [](OpOperand *opOperand) { return true; }, + /*poisonPaddingOk=*/true); linalg::ControlPropagationFn controlExtract = [](OpOperand *opOperand) -> bool { Operation *producer = opOperand->get().getDefiningOp(); diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp index 987e8f3654ce8..21d75f58b0a3a 100644 --- a/mlir/test/lib/Dialect/Test/TestDialect.cpp +++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp @@ -431,3 +431,47 @@ void TestDialect::getCanonicalizationPatterns( RewritePatternSet &results) const { results.add(&dialectCanonicalizationPattern); } + +//===----------------------------------------------------------------------===// +// TestCallWithSegmentsOp +//===----------------------------------------------------------------------===// +// The op `test.call_with_segments` models a call-like operation whose operands +// are divided into 3 variadic segments: `prefix`, `args`, and `suffix`. +// Only the middle segment represents the actual call arguments. The op uses +// the AttrSizedOperandSegments trait, so we can derive segment boundaries from +// the generated `operandSegmentSizes` attribute. We provide custom helpers to +// expose the logical call arguments as both a read-only range and a mutable +// range bound to the proper segment so that insertion/erasure updates the +// attribute automatically. + +// Segment layout indices in the DenseI32ArrayAttr: [prefix, args, suffix]. +static constexpr unsigned kTestCallWithSegmentsArgsSegIndex = 1; + +Operation::operand_range CallWithSegmentsOp::getArgOperands() { + // Leverage generated getters for segment sizes: slice between prefix and + // suffix using current operand list. 
+ return getOperation()->getOperands().slice(getPrefix().size(), + getArgs().size()); +} + +MutableOperandRange CallWithSegmentsOp::getArgOperandsMutable() { + Operation *op = getOperation(); + + // Obtain the canonical segment size attribute name for this op. + auto segName = + CallWithSegmentsOp::getOperandSegmentSizesAttrName(op->getName()); + auto sizesAttr = op->getAttrOfType(segName); + assert(sizesAttr && "missing operandSegmentSizes attribute on op"); + + // Compute the start and length of the args segment from the prefix size and + // args size stored in the attribute. + auto sizes = sizesAttr.asArrayRef(); + unsigned start = static_cast(sizes[0]); // prefix size + unsigned len = static_cast(sizes[1]); // args size + + NamedAttribute segNamed(segName, sizesAttr); + MutableOperandRange::OperandSegment binding{kTestCallWithSegmentsArgsSegIndex, + segNamed}; + + return MutableOperandRange(op, start, len, {binding}); +} diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 5564264ed8b0b..6ea27187655ee 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -552,9 +552,10 @@ def OneRegionWithOperandsOp : TEST_Op<"one_region_with_operands_op", []> { def IsolatedOneRegionOp : TEST_Op<"isolated_one_region_op", [IsolatedFromAbove]> { let arguments = (ins Variadic:$operands); + let results = (outs Variadic:$results); let regions = (region AnyRegion:$my_region); let assemblyFormat = [{ - attr-dict-with-keyword $operands $my_region `:` type($operands) + attr-dict-with-keyword $operands $my_region `:` type($operands) `->` type($results) }]; } @@ -3745,4 +3746,47 @@ def TestOpWithSuccessorRef : TEST_Op<"dummy_op_with_successor_ref"> { }]; } +def CallWithSegmentsOp : TEST_Op<"call_with_segments", + [AttrSizedOperandSegments, + DeclareOpInterfaceMethods]> { + let summary = "test call op with segmented args"; + let arguments = (ins + FlatSymbolRefAttr:$callee, + Variadic:$prefix, // non-arg segment (e.g., 'in') + Variadic:$args, // <-- the call *arguments* segment + Variadic:$suffix // non-arg segment (e.g., 'out') + ); + let results = (outs); + let assemblyFormat = [{ + $callee `(` $prefix `:` type($prefix) `)` + `(` $args `:` type($args) `)` + `(` $suffix `:` type($suffix) `)` attr-dict + }]; + + // Provide stub implementations for the ArgAndResultAttrsOpInterface. 
+ let extraClassDeclaration = [{ + ::mlir::ArrayAttr getArgAttrsAttr() { return {}; } + ::mlir::ArrayAttr getResAttrsAttr() { return {}; } + void setArgAttrsAttr(::mlir::ArrayAttr) {} + void setResAttrsAttr(::mlir::ArrayAttr) {} + ::mlir::Attribute removeArgAttrsAttr() { return {}; } + ::mlir::Attribute removeResAttrsAttr() { return {}; } + }]; + + let extraClassDefinition = [{ + ::mlir::CallInterfaceCallable $cppClass::getCallableForCallee() { + if (auto sym = (*this)->getAttrOfType<::mlir::SymbolRefAttr>("callee")) + return ::mlir::CallInterfaceCallable(sym); + return ::mlir::CallInterfaceCallable(); + } + void $cppClass::setCalleeFromCallable(::mlir::CallInterfaceCallable callee) { + if (auto sym = callee.dyn_cast<::mlir::SymbolRefAttr>()) + (*this)->setAttr("callee", sym); + else + (*this)->removeAttr("callee"); + } + }]; +} + + #endif // TEST_OPS diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index e1ba45c60ac36..094ef0a45b8d2 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -50,52 +50,71 @@ struct TestXeGPUUnrollingPatterns void runOnOperation() override { MLIRContext *ctx = &getContext(); xegpu::UnrollOptions options; - options.setNativeShapeFn( - [&](Operation *op) -> std::optional> { - if (isa(op)) { - xegpu::TensorDescType tdescTy; - if (auto createNdOp = dyn_cast(op)) { - tdescTy = createNdOp.getType(); - } else if (auto updateNdOp = - dyn_cast(op)) { - tdescTy = updateNdOp.getTensorDescType(); - } else if (auto prefetchNdOp = dyn_cast(op)) { - tdescTy = prefetchNdOp.getTensorDescType(); - } else if (auto loadNdOp = dyn_cast(op)) { - tdescTy = loadNdOp.getTensorDescType(); - } else if (auto storeNdOp = dyn_cast(op)) { - tdescTy = storeNdOp.getTensorDescType(); - } else if (auto createOp = dyn_cast(op)) { - tdescTy = createOp.getType(); - } else if (auto updateOp = dyn_cast(op)) { - tdescTy = updateOp.getTensorDescType(); - } else if (auto prefetchOp = dyn_cast(op)) { - tdescTy = prefetchOp.getTensorDescType(); - } else if (auto loadOp = dyn_cast(op)) { - tdescTy = loadOp.getTensorDescType(); - } else if (auto storeOp = dyn_cast(op)) { - tdescTy = storeOp.getTensorDescType(); + options.setNativeShapeFn([&](Operation *op) + -> std::optional> { + if (isa(op)) { + xegpu::TensorDescType tdescTy; + if (auto createNdOp = dyn_cast(op)) { + tdescTy = createNdOp.getType(); + } else if (auto updateNdOp = dyn_cast(op)) { + tdescTy = updateNdOp.getTensorDescType(); + } else if (auto prefetchNdOp = dyn_cast(op)) { + tdescTy = prefetchNdOp.getTensorDescType(); + } else if (auto loadNdOp = dyn_cast(op)) { + tdescTy = loadNdOp.getTensorDescType(); + } else if (auto storeNdOp = dyn_cast(op)) { + tdescTy = storeNdOp.getTensorDescType(); + } else if (auto createOp = dyn_cast(op)) { + tdescTy = createOp.getType(); + } else if (auto updateOp = dyn_cast(op)) { + tdescTy = updateOp.getTensorDescType(); + } else if (auto prefetchOp = dyn_cast(op)) { + tdescTy = prefetchOp.getTensorDescType(); + } else if (auto loadOp = dyn_cast(op)) { + if (loadOp.getOffsets()) { + auto layout = xegpu::getDistributeLayoutAttr(loadOp.getResult()); + if (layout && layout.isForSubgroup()) { + auto inst_data = layout.getEffectiveInstDataAsInt(); + if (!inst_data.empty()) + return SmallVector(inst_data.begin(), inst_data.end()); } - - if (auto layout = tdescTy.getLayoutAttr()) { - auto inst_data = layout.getInstData(); - if (inst_data && layout.isForSubgroup()) - return 
SmallVector(inst_data.asArrayRef().begin(), - inst_data.asArrayRef().end()); + return std::nullopt; + } + tdescTy = loadOp.getTensorDescType(); + } else if (auto storeOp = dyn_cast(op)) { + if (storeOp.getOffsets()) { + auto layout = llvm::dyn_cast_or_null( + op->getAttr("layout")); + if (layout && layout.isForSubgroup()) { + auto inst_data = layout.getEffectiveInstDataAsInt(); + if (!inst_data.empty()) + return SmallVector(inst_data.begin(), inst_data.end()); } + return std::nullopt; } + tdescTy = storeOp.getTensorDescType(); + } - if (isa(op)) - return SmallVector{8, 16, 16}; + if (auto layout = tdescTy.getLayoutAttr()) { + auto inst_data = layout.getInstData(); + if (inst_data && layout.isForSubgroup()) + return SmallVector(inst_data.asArrayRef().begin(), + inst_data.asArrayRef().end()); + } + } - return std::nullopt; - }); + if (isa(op)) + return SmallVector{8, 16, 16}; + + return std::nullopt; + }); options.setUnrolledTypesFn( - [&](ShapedType type, ArrayRef tileShape) -> SmallVector { + [&](ShapedType type, ArrayRef tileShape, + bool returnSingleType = false) -> SmallVector { Type elemTy = type.getElementType(); Type newTy; @@ -137,6 +156,8 @@ struct TestXeGPUUnrollingPatterns newTy = type.clone(tileShape, elemTy); } + if (returnSingleType) + return SmallVector{newTy}; std::optional> ratio = computeShapeRatio(type.getShape(), tileShape); assert(ratio && "Expecting the ratio to be valid."); diff --git a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp index 3d24d4ecc4d0d..7981c72c2f2c8 100644 --- a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp +++ b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp @@ -13,6 +13,7 @@ #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Index/IR/IndexDialect.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h" #include "mlir/Dialect/Transform/IR/TransformAttrs.h" #include "mlir/Dialect/Transform/IR/TransformDialect.h" @@ -468,6 +469,158 @@ transform::TestTileAndFuseOuterParallelPartialReductionOp::apply( : DiagnosedSilenceableFailure::success(); } +//===----------------------------------------------------------------------===// +// TestTileAndFuseOuterParallelPartialReduction +//===----------------------------------------------------------------------===// + +DiagnosedSilenceableFailure transform::TestTileUsingCustomLoopOp::apply( + TransformRewriter &transformRewriter, TransformResults &transformResults, + TransformState &state) { + auto target = + dyn_cast(*state.getPayloadOps(getRootOp()).begin()); + if (!target) { + emitOpError("expected root operation to implement `TilingInterface`"); + return DiagnosedSilenceableFailure::definiteFailure(); + } + + OpFoldResult oneOfr = transformRewriter.getIndexAttr(1); + + scf::SCFTilingOptions::GenerateLoopHeaderFn loopHeaderFn = + [&](RewriterBase &rewriter, Location loc, ArrayRef loopRanges, + ArrayRef givenTileSizes, + ValueRange outerDestinationTensors) + -> FailureOr { + // Check that the strides are all 1 (to make it easier in the test). + if (llvm::any_of(loopRanges, [](Range r) { + return !isConstantIntValue(r.stride, 1); + })) { + return emitOpError("unable to handle loop ranges with strides != 1"); + } + // Check number of tile sizes is equal to loop dimensions. 
+ if (loopRanges.size() != givenTileSizes.size()) { + return emitOpError("expected number of tile sizes to be same as the " + "number of loops in the operation"); + } + // For testing disallow any of the tile sizes being 0. + if (llvm::any_of(givenTileSizes, isZeroInteger)) { + return emitOpError("unhandled case of zero tile size"); + } + // For testing, only handle tensor tiling. + if (outerDestinationTensors.empty()) { + return emitOpError("expected destination tensors"); + } + + // Compute the number of iterations for each of the loops. + AffineExpr s0, s1, s2; + bindSymbols(rewriter.getContext(), s0, s1, s2); + AffineExpr numItersExpr = (s1 - s0).ceilDiv(s2); // (ub - lb) / tileSize + + SmallVector allNumIters; + allNumIters.reserve(loopRanges.size()); + for (auto [loopRange, tileSize] : + llvm::zip_equal(loopRanges, givenTileSizes)) { + OpFoldResult numIters = affine::makeComposedFoldedAffineApply( + rewriter, loc, numItersExpr, + {loopRange.offset, loopRange.size, tileSize}); + allNumIters.push_back(numIters); + } + if (allNumIters.empty()) { + return emitOpError("invalid empty tile sizes and loop ranges"); + } + + AffineExpr mulExpr = s0 * s1; + OpFoldResult cumulative = oneOfr; + for (auto numIters : allNumIters) { + cumulative = affine::makeComposedFoldedAffineApply( + rewriter, loc, mulExpr, {cumulative, numIters}); + } + + Value zeroVal = arith::ConstantIndexOp::create(rewriter, loc, 0); + Value oneVal = arith::ConstantIndexOp::create(rewriter, loc, 1); + Value ub = getValueOrCreateConstantIndexOp(rewriter, loc, cumulative); + + SmallVector offsets; + SmallVector sizes; + SmallVector innerDestinationTensors; + offsets.reserve(loopRanges.size()); + sizes.reserve(loopRanges.size()); + + AffineExpr d0; + bindDims(rewriter.getContext(), d0); + AffineExpr offsetExpr = s0 + d0 * s1; // lb + iv * tileSize + AffineMap minMap = + AffineMap::get(1, 2, {s0 - d0, s1}, + rewriter.getContext()); // min(ub - offset, tileSize) + auto forOp = scf::ForOp::create( + rewriter, loc, zeroVal, ub, oneVal, outerDestinationTensors, + [&](OpBuilder &b, Location bodyLoc, Value linearizedIv, + ValueRange destinations) { + auto delinearizeOp = affine::AffineDelinearizeIndexOp::create( + b, bodyLoc, linearizedIv, allNumIters); + for (auto [normalizedIv, range, tileSize] : llvm::zip_equal( + delinearizeOp.getResults(), loopRanges, givenTileSizes)) { + + OpFoldResult normalizedIvOfr = getAsOpFoldResult(normalizedIv); + OpFoldResult offset = affine::makeComposedFoldedAffineApply( + b, bodyLoc, offsetExpr, + {normalizedIvOfr, range.offset, tileSize}); + offsets.push_back(offset); + + OpFoldResult size = affine::makeComposedFoldedAffineMin( + b, bodyLoc, minMap, {offset, range.size, tileSize}); + sizes.push_back(size); + } + innerDestinationTensors = llvm::to_vector(destinations); + }); + rewriter.setInsertionPointToEnd(forOp.getBody()); + return scf::SCFTilingOptions::CustomLoopHeaderInfo{ + {cast(forOp.getOperation())}, + offsets, + sizes, + innerDestinationTensors}; + }; + + scf::SCFTilingOptions::GenerateLoopTerminatorFn terminatorFn = + [&](RewriterBase &rewriter, Location loc, ValueRange tiledResults, + ArrayRef> resultOffsets, + ArrayRef> resultSizes, + ValueRange destinationTensors) -> LogicalResult { + SmallVector yieldValues; + yieldValues.reserve(destinationTensors.size()); + for (auto [tiledResult, offsets, sizes, destination] : llvm::zip_equal( + tiledResults, resultOffsets, resultSizes, destinationTensors)) { + SmallVector strides(offsets.size(), oneOfr); + Value insertedVal = 
tensor::InsertSliceOp::create( + rewriter, loc, tiledResult, destination, offsets, sizes, strides); + yieldValues.push_back(insertedVal); + } + scf::YieldOp::create(rewriter, loc, yieldValues); + return success(); + }; + + scf::SCFTilingOptions tilingOptions; + SmallVector staticTileSizes = + extractFromIntegerArrayAttr(getTileSizes()); + SmallVector tileSizes = + getAsIndexOpFoldResult(transformRewriter.getContext(), staticTileSizes); + tilingOptions.setTileSizes(tileSizes) + .setLoopType(scf::SCFTilingOptions::LoopType::CustomOp) + .setCustomLoopGenerationFns(loopHeaderFn, terminatorFn); + + OpBuilder::InsertionGuard g(transformRewriter); + transformRewriter.setInsertionPoint(target); + FailureOr tiledResults = + scf::tileUsingSCF(transformRewriter, target, tilingOptions); + if (failed(tiledResults)) { + return DiagnosedSilenceableFailure::definiteFailure(); + } + transformRewriter.replaceOp(target, tiledResults->replacements); + transformResults.set(getOperation()->getResult(0), tiledResults->tiledOps); + transformResults.set(getOperation()->getResult(1), tiledResults->loops); + + return DiagnosedSilenceableFailure::success(); +} + #define GET_OP_CLASSES #include "TestTilingInterfaceTransformOps.cpp.inc" diff --git a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td index 58ccd30bb99a2..694c4229eef62 100644 --- a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td +++ b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td @@ -150,4 +150,27 @@ def TestTileAndFuseOuterParallelPartialReductionOp : Op< }]; } +def TestTileUsingCustomLoopOp : Op< + Transform_Dialect, "test.tile_using_custom_loop", + [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface, + DeclareOpInterfaceMethods, + ReportTrackingListenerFailuresOpTrait]> { + let description = [{ + Test Transform op to tile an operation using custom loops. + + The test just folds all the loops and into a single loop and then + delinearizes the indices. 
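The custom loop header above collapses all tiled loops into a single scf.for whose trip count is the product of the per-loop tile counts, then recovers per-loop tile indices with an index delinearization. A small standalone C++ illustration of that arithmetic, using assumed loop bounds (10 and 7) and tile sizes (4 and 3) rather than anything from the test, is:

```cpp
// Worked example of the linearize/delinearize tiling scheme described above,
// with assumed bounds and tile sizes (not taken from the transform op test).
#include <algorithm>
#include <cstdio>

int main() {
  const int numIters0 = (10 + 4 - 1) / 4;  // ceil(10 / 4) = 3 tiles
  const int numIters1 = (7 + 3 - 1) / 3;   // ceil(7 / 3)  = 3 tiles
  const int total = numIters0 * numIters1; // single collapsed loop: 9 iterations

  for (int linear = 0; linear < total; ++linear) {
    // Delinearize the single induction variable into per-loop tile indices.
    const int iv0 = linear / numIters1;
    const int iv1 = linear % numIters1;
    // Mirror of `lb + iv * tileSize` and `min(ub - offset, tileSize)` above.
    const int off0 = iv0 * 4, size0 = std::min(10 - off0, 4);
    const int off1 = iv1 * 3, size1 = std::min(7 - off1, 3);
    std::printf("tile (%d,%d): offsets (%d,%d) sizes (%d,%d)\n", iv0, iv1,
                off0, off1, size0, size1);
  }
  return 0;
}
```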
+ }]; + + let arguments = (ins TransformHandleTypeInterface:$root_op, + DefaultValuedAttr:$tile_sizes); + let results = (outs TransformHandleTypeInterface:$tiled_ops, + Variadic:$loops); + + let assemblyFormat = [{ + $root_op `tile_sizes` `=` $tile_sizes + attr-dict `:` functional-type(operands, results) + }]; +} + #endif // TEST_TILINGINTERFACE_TRANSFORM_OPS diff --git a/mlir/test/lib/Transforms/TestMakeIsolatedFromAbove.cpp b/mlir/test/lib/Transforms/TestMakeIsolatedFromAbove.cpp index c1fb70605ab46..f7bde79274e91 100644 --- a/mlir/test/lib/Transforms/TestMakeIsolatedFromAbove.cpp +++ b/mlir/test/lib/Transforms/TestMakeIsolatedFromAbove.cpp @@ -27,8 +27,8 @@ makeIsolatedFromAboveImpl(RewriterBase &rewriter, makeRegionIsolatedFromAbove(rewriter, region, callBack); SmallVector operands = regionOp.getOperands(); operands.append(capturedValues); - auto isolatedRegionOp = - test::IsolatedOneRegionOp::create(rewriter, regionOp.getLoc(), operands); + auto isolatedRegionOp = test::IsolatedOneRegionOp::create( + rewriter, regionOp.getLoc(), TypeRange(), operands); rewriter.inlineRegionBefore(region, isolatedRegionOp.getRegion(), isolatedRegionOp.getRegion().begin()); rewriter.eraseOp(regionOp); diff --git a/mlir/test/lib/Transforms/TestSingleFold.cpp b/mlir/test/lib/Transforms/TestSingleFold.cpp index 5bd9dd2a1f075..e55f36aea0a7c 100644 --- a/mlir/test/lib/Transforms/TestSingleFold.cpp +++ b/mlir/test/lib/Transforms/TestSingleFold.cpp @@ -26,6 +26,9 @@ struct TestSingleFold : public PassWrapper>, public RewriterBase::Listener { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestSingleFold) + TestSingleFold() = default; + TestSingleFold(const TestSingleFold &pass) : PassWrapper(pass) {} + StringRef getArgument() const final { return "test-single-fold"; } StringRef getDescription() const final { return "Test single-pass operation folding and dead constant elimination"; @@ -45,13 +48,18 @@ struct TestSingleFold : public PassWrapper>, if (it != existingConstants.end()) existingConstants.erase(it); } + + Option maxIterations{*this, "max-iterations", + llvm::cl::desc("Max iterations in the tryToFold"), + llvm::cl::init(1)}; }; } // namespace void TestSingleFold::foldOperation(Operation *op, OperationFolder &helper) { // Attempt to fold the specified operation, including handling unused or // duplicated constants. 
- (void)helper.tryToFold(op); + bool inPlaceUpdate = false; + (void)helper.tryToFold(op, &inPlaceUpdate, maxIterations); } void TestSingleFold::runOnOperation() { diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in index 2fc595dfabbf5..1aaf7989e3ce5 100644 --- a/mlir/test/lit.site.cfg.py.in +++ b/mlir/test/lit.site.cfg.py.in @@ -15,6 +15,8 @@ config.native_target = "@LLVM_NATIVE_ARCH@" config.host_os = "@HOST_OS@" config.host_cc = "@HOST_CC@" config.host_cxx = "@HOST_CXX@" +config.host_c_compiler_launcher = "@CMAKE_C_COMPILER_LAUNCHER@" +config.host_cxx_compiler_launcher = "@CMAKE_CXX_COMPILER_LAUNCHER@" config.enable_libcxx = "@LLVM_ENABLE_LIBCXX@" config.host_cmake = "@CMAKE_COMMAND@" config.host_cmake_generator = "@CMAKE_GENERATOR@" @@ -58,6 +60,7 @@ config.mlir_run_cuda_sm80_tests = @MLIR_RUN_CUDA_SM80_TESTS@ config.mlir_run_cuda_sm80_lt_tests = @MLIR_RUN_CUDA_SM80_LT_TESTS@ config.mlir_run_cuda_sm90_tests = @MLIR_RUN_CUDA_SM90_TESTS@ config.mlir_include_integration_tests = @MLIR_INCLUDE_INTEGRATION_TESTS@ +config.llvm_shared_libs_build = @BUILD_SHARED_LIBS@ config.arm_emulator_executable = "@ARM_EMULATOR_EXECUTABLE@" # Some tests marked with 'UNSUPPORTED: target=aarch64{{.*}}' are still run when # configured with ARM_EMULATOR_EXECUTABLE and the default target is not aarch64. diff --git a/mlir/test/mlir-tblgen/attr-duplicated-builder-error.td b/mlir/test/mlir-tblgen/attr-duplicated-builder-error.td new file mode 100644 index 0000000000000..5f1c61a3a505d --- /dev/null +++ b/mlir/test/mlir-tblgen/attr-duplicated-builder-error.td @@ -0,0 +1,48 @@ +// RUN: not mlir-tblgen -gen-attrdef-decls -I %S/../../include %s 2>&1 | FileCheck %s + +include "mlir/IR/OpBase.td" + +def Test_Dialect : Dialect { + let name = "test"; + let cppNamespace = "::test"; +} + +class TestAttr traits = []> + : AttrDef { + let mnemonic = attrMnemonic; +} + +def TestAttr : TestAttr<"Test", "test"> { + let summary = "Test attribute"; + let description = "Test attribute"; + + let parameters = (ins AttrParameter<"std::int64_t", "arg">:$arg); + let builders = [AttrBuilder<(ins "std::int64_t":$arg), [{ + return $_get($_ctxt, arg); + }]>]; + + let assemblyFormat = "`<` $arg `>`"; + + let skipDefaultBuilders = 0; + let genVerifyDecl = 1; + let genMnemonicAlias = 1; +} + +def Test_TestAttrOp : Op { + let summary = "test operation with attribute"; + let description = "test operation with attribute"; + + let arguments = (ins TestAttr:$testAttr); + let assemblyFormat = "$testAttr attr-dict"; +} + +// CHECK: attr-duplicated-builder-error.td:20:7: error: builder `get` conflicts with an existing builder. +// CHECK-NEXT: let builders = [AttrBuilder<(ins "std::int64_t":$arg), [{ +// CHECK-NEXT: ^ +// CHECK-NEXT: note: A new builder with signature: +// CHECK-NEXT: static TestAttr get(::mlir::MLIRContext *context, std::int64_t arg); +// CHECK-EMPTY: +// CHECK-NEXT: is shadowed by an existing builder with signature: +// CHECK-NEXT: static TestAttr get(::mlir::MLIRContext *context, std::int64_t arg); +// CHECK-EMPTY: +// CHECK-NEXT: Please remove one of the conflicting definitions. 
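The conflict exercised by the test above comes from `skipDefaultBuilders = 0`: the default builder generated from the parameter list already has the signature `get(::mlir::MLIRContext *, std::int64_t)`, so an identical custom `AttrBuilder` is pruned and the new diagnostic fires. A minimal sketch of a non-conflicting definition (hypothetical, not part of this patch; `FixedAttr` is an invented name) keeps a single source for that signature, either by dropping the custom builder or by skipping the default ones:

// Hypothetical sketch: skip the default builders so the one custom builder
// is the only `get(ctx, arg)` overload that gets generated.
def FixedAttr : TestAttr<"Fixed", "fixed"> {
  let parameters = (ins AttrParameter<"std::int64_t", "arg">:$arg);
  let skipDefaultBuilders = 1;
  let builders = [AttrBuilder<(ins "std::int64_t":$arg), [{
    return $_get($_ctxt, arg);
  }]>];
  let assemblyFormat = "`<` $arg `>`";
}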
diff --git a/mlir/test/mlir-tblgen/attr-duplicated-custom-builders-error.td b/mlir/test/mlir-tblgen/attr-duplicated-custom-builders-error.td new file mode 100644 index 0000000000000..0e09f667c1ccd --- /dev/null +++ b/mlir/test/mlir-tblgen/attr-duplicated-custom-builders-error.td @@ -0,0 +1,52 @@ +// RUN: not mlir-tblgen -gen-attrdef-decls -I %S/../../include %s 2>&1 | FileCheck %s + +include "mlir/IR/OpBase.td" + +def Test_Dialect : Dialect { + let name = "test"; + let cppNamespace = "::test"; +} + +class TestAttr traits = []> + : AttrDef { + let mnemonic = attrMnemonic; +} + +def TestAttr : TestAttr<"Test", "test"> { + let summary = "Test attribute"; + let description = "Test attribute"; + + let parameters = (ins AttrParameter<"std::int64_t", "arg">:$arg); + let builders = [AttrBuilder<(ins "std::int64_t":$arg), [{ + return $_get($_ctxt, arg); + }]>, + AttrBuilder<(ins "std::int64_t":$arg), [{ + // Duplicated builder + return $_get($_ctxt, arg); + }]>]; + + let assemblyFormat = "`<` $arg `>`"; + + let skipDefaultBuilders = 1; + let genVerifyDecl = 1; + let genMnemonicAlias = 1; +} + +def Test_TestAttrOp : Op { + let summary = "test operation with attribute"; + let description = "test operation with attribute"; + + let arguments = (ins TestAttr:$testAttr); + let assemblyFormat = "$testAttr attr-dict"; +} + +// CHECK: attr-duplicated-custom-builders-error.td:20:7: error: builder `get` conflicts with an existing builder. +// CHECK-NEXT: let builders = [AttrBuilder<(ins "std::int64_t":$arg), [{ +// CHECK-NEXT: ^ +// CHECK-NEXT: note: A new builder with signature: +// CHECK-NEXT: static TestAttr get(::mlir::MLIRContext *context, std::int64_t arg); +// CHECK-EMPTY: +// CHECK-NEXT: is shadowed by an existing builder with signature: +// CHECK-NEXT: static TestAttr get(::mlir::MLIRContext *context, std::int64_t arg); +// CHECK-EMPTY: +// CHECK-NEXT: Please remove one of the conflicting definitions. 
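In this second test `skipDefaultBuilders` is already set, so the collision is between the two identical custom builders themselves. A hypothetical non-conflicting pair (not part of this patch; the `negate` flag is invented for illustration) only needs the second builder to have a distinguishable parameter list:

// Hypothetical sketch: the extra "negate" parameter makes the second `get`
// overload distinct, so neither builder is pruned and no diagnostic is emitted.
let builders = [AttrBuilder<(ins "std::int64_t":$arg), [{
    return $_get($_ctxt, arg);
  }]>,
  AttrBuilder<(ins "std::int64_t":$arg, "bool":$negate), [{
    return $_get($_ctxt, negate ? -arg : arg);
  }]>];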
diff --git a/mlir/test/python/dialects/python_test.py b/mlir/test/python/dialects/python_test.py index 6ac25e129dacc..1194e32c960c8 100644 --- a/mlir/test/python/dialects/python_test.py +++ b/mlir/test/python/dialects/python_test.py @@ -904,7 +904,7 @@ def types(lst): assert ( typing.get_type_hints(test.same_variadic_result_vfv)["return"] - is Union[OpResult, OpResultList, test.SameVariadicResultSizeOpVFV] + == Union[OpResult, OpResultList, test.SameVariadicResultSizeOpVFV] ) assert ( type(test.same_variadic_result_vfv([i[0], i[1]], i[2], [i[3], i[4]])) @@ -992,7 +992,7 @@ def types(lst): assert ( typing.get_type_hints(test.results_variadic)["return"] - is Union[OpResult, OpResultList, test.ResultsVariadicOp] + == Union[OpResult, OpResultList, test.ResultsVariadicOp] ) assert type(test.results_variadic([i[0]])) is OpResult op_res_variadic = test.ResultsVariadicOp([i[0]]) @@ -1003,7 +1003,7 @@ def types(lst): assert type(op_res_variadic.res) is OpResultList -# CHECK-LABEL: TEST: testVariadicAndNormalRegion +# CHECK-LABEL: TEST: testVariadicAndNormalRegionOp @run def testVariadicAndNormalRegionOp(): with Context() as ctx, Location.unknown(ctx): @@ -1024,3 +1024,6 @@ def testVariadicAndNormalRegionOp(): is RegionSequence ) assert type(region_op.variadic) is RegionSequence + + assert isinstance(region_op.opview, OpView) + assert isinstance(region_op.operation.opview, OpView) diff --git a/mlir/test/python/dialects/transform_vector_ext.py b/mlir/test/python/dialects/transform_vector_ext.py index 28902b012f7cb..0cd9333dc1218 100644 --- a/mlir/test/python/dialects/transform_vector_ext.py +++ b/mlir/test/python/dialects/transform_vector_ext.py @@ -74,9 +74,9 @@ def enum_configurable_patterns(): # CHECK: transform.apply_patterns.vector.lower_contraction vector.ApplyLowerContractionPatternsOp() # CHECK: transform.apply_patterns.vector.lower_contraction - # CHECK-SAME: lowering_strategy = matmulintrinsics + # CHECK-SAME: lowering_strategy = llvmintr vector.ApplyLowerContractionPatternsOp( - lowering_strategy=vector.VectorContractLowering.Matmul + lowering_strategy=vector.VectorContractLowering.LLVMIntr ) # CHECK: transform.apply_patterns.vector.lower_contraction # CHECK-SAME: lowering_strategy = parallelarith @@ -105,9 +105,9 @@ def enum_configurable_patterns(): lowering_strategy=vector.VectorTransposeLowering.EltWise ) # CHECK: transform.apply_patterns.vector.lower_transpose - # CHECK-SAME: lowering_strategy = flat_transpose + # CHECK-SAME: lowering_strategy = llvmintr vector.ApplyLowerTransposePatternsOp( - lowering_strategy=vector.VectorTransposeLowering.Flat + lowering_strategy=vector.VectorTransposeLowering.LLVMIntr ) # CHECK: transform.apply_patterns.vector.lower_transpose # CHECK-SAME: lowering_strategy = shuffle_1d @@ -120,10 +120,10 @@ def enum_configurable_patterns(): lowering_strategy=vector.VectorTransposeLowering.Shuffle16x16 ) # CHECK: transform.apply_patterns.vector.lower_transpose - # CHECK-SAME: lowering_strategy = flat_transpose + # CHECK-SAME: lowering_strategy = llvmintr # CHECK-SAME: avx2_lowering_strategy = true vector.ApplyLowerTransposePatternsOp( - lowering_strategy=vector.VectorTransposeLowering.Flat, + lowering_strategy=vector.VectorTransposeLowering.LLVMIntr, avx2_lowering_strategy=True, ) diff --git a/mlir/test/python/integration/dialects/pdl.py b/mlir/test/python/integration/dialects/pdl.py index dd6c74ce622c8..c8e6197e03842 100644 --- a/mlir/test/python/integration/dialects/pdl.py +++ b/mlir/test/python/integration/dialects/pdl.py @@ -86,3 +86,154 @@ def add_func(a, b): 
frozen = get_pdl_patterns() apply_patterns_and_fold_greedily(module_.operation, frozen) return module_ + + +# If we use arith.constant and arith.addi here, +# these C++-defined folding/canonicalization will be applied +# implicitly in the greedy pattern rewrite driver to +# make our Python-defined folding useless, +# so here we define a new dialect to workaround this. +def load_myint_dialect(): + from mlir.dialects import irdl + + m = Module.create() + with InsertionPoint(m.body): + myint = irdl.dialect("myint") + with InsertionPoint(myint.body): + constant = irdl.operation_("constant") + with InsertionPoint(constant.body): + iattr = irdl.base(base_name="#builtin.integer") + i32 = irdl.is_(TypeAttr.get(IntegerType.get_signless(32))) + irdl.attributes_([iattr], ["value"]) + irdl.results_([i32], ["cst"], [irdl.Variadicity.single]) + add = irdl.operation_("add") + with InsertionPoint(add.body): + i32 = irdl.is_(TypeAttr.get(IntegerType.get_signless(32))) + irdl.operands_( + [i32, i32], + ["lhs", "rhs"], + [irdl.Variadicity.single, irdl.Variadicity.single], + ) + irdl.results_([i32], ["res"], [irdl.Variadicity.single]) + + m.operation.verify() + irdl.load_dialects(m) + + +# This PDL pattern is to fold constant additions, +# i.e. add(constant0, constant1) -> constant2 +# where constant2 = constant0 + constant1. +def get_pdl_pattern_fold(): + m = Module.create() + i32 = IntegerType.get_signless(32) + with InsertionPoint(m.body): + + @pdl.pattern(benefit=1, sym_name="myint_add_fold") + def pat(): + t = pdl.TypeOp(i32) + a0 = pdl.AttributeOp() + a1 = pdl.AttributeOp() + c0 = pdl.OperationOp( + name="myint.constant", attributes={"value": a0}, types=[t] + ) + c1 = pdl.OperationOp( + name="myint.constant", attributes={"value": a1}, types=[t] + ) + v0 = pdl.ResultOp(c0, 0) + v1 = pdl.ResultOp(c1, 0) + op0 = pdl.OperationOp(name="myint.add", args=[v0, v1], types=[t]) + + @pdl.rewrite() + def rew(): + sum = pdl.apply_native_rewrite( + [pdl.AttributeType.get()], "add_fold", [a0, a1] + ) + newOp = pdl.OperationOp( + name="myint.constant", attributes={"value": sum}, types=[t] + ) + pdl.ReplaceOp(op0, with_op=newOp) + + @pdl.pattern(benefit=1, sym_name="myint_add_zero_fold") + def pat(): + t = pdl.TypeOp(i32) + v0 = pdl.OperandOp() + v1 = pdl.OperandOp() + v = pdl.apply_native_constraint([pdl.ValueType.get()], "has_zero", [v0, v1]) + op0 = pdl.OperationOp(name="myint.add", args=[v0, v1], types=[t]) + + @pdl.rewrite() + def rew(): + pdl.ReplaceOp(op0, with_values=[v]) + + def add_fold(rewriter, results, values): + a0, a1 = values + results.append(IntegerAttr.get(i32, a0.value + a1.value)) + + def is_zero(value): + op = value.owner + if isinstance(op, Operation): + return op.name == "myint.constant" and op.attributes["value"].value == 0 + return False + + # Check if either operand is a constant zero, + # and append the other operand to the results if so. 
+ def has_zero(rewriter, results, values): + v0, v1 = values + if is_zero(v0): + results.append(v1) + return False + if is_zero(v1): + results.append(v0) + return False + return True + + pdl_module = PDLModule(m) + pdl_module.register_rewrite_function("add_fold", add_fold) + pdl_module.register_constraint_function("has_zero", has_zero) + return pdl_module.freeze() + + +# CHECK-LABEL: TEST: test_pdl_register_function +# CHECK: "myint.constant"() {value = 8 : i32} : () -> i32 +@construct_and_print_in_module +def test_pdl_register_function(module_): + load_myint_dialect() + + module_ = Module.parse( + """ + %c0 = "myint.constant"() { value = 2 }: () -> (i32) + %c1 = "myint.constant"() { value = 3 }: () -> (i32) + %x = "myint.add"(%c0, %c1): (i32, i32) -> (i32) + "myint.add"(%x, %c1): (i32, i32) -> (i32) + """ + ) + + frozen = get_pdl_pattern_fold() + apply_patterns_and_fold_greedily(module_, frozen) + + return module_ + + +# CHECK-LABEL: TEST: test_pdl_register_function_constraint +# CHECK: return %arg0 : i32 +@construct_and_print_in_module +def test_pdl_register_function_constraint(module_): + load_myint_dialect() + + module_ = Module.parse( + """ + func.func @f(%x : i32) -> i32 { + %c0 = "myint.constant"() { value = 1 }: () -> (i32) + %c1 = "myint.constant"() { value = -1 }: () -> (i32) + %a = "myint.add"(%c0, %c1): (i32, i32) -> (i32) + %b = "myint.add"(%a, %x): (i32, i32) -> (i32) + %c = "myint.add"(%b, %a): (i32, i32) -> (i32) + func.return %c : i32 + } + """ + ) + + frozen = get_pdl_pattern_fold() + apply_patterns_and_fold_greedily(module_, frozen) + + return module_ diff --git a/mlir/test/python/ir/builtin_types.py b/mlir/test/python/ir/builtin_types.py index b42bfd9bc6587..54863253fc770 100644 --- a/mlir/test/python/ir/builtin_types.py +++ b/mlir/test/python/ir/builtin_types.py @@ -371,11 +371,16 @@ def testAbstractShapedType(): # CHECK-LABEL: TEST: testVectorType @run def testVectorType(): + shape = [2, 3] + with Context(): + f32 = F32Type.get() + # CHECK: unchecked vector type: vector<2x3xf32> + print("unchecked vector type:", VectorType.get_unchecked(shape, f32)) + with Context(), Location.unknown(): f32 = F32Type.get() - shape = [2, 3] - # CHECK: vector type: vector<2x3xf32> - print("vector type:", VectorType.get(shape, f32)) + # CHECK: checked vector type: vector<2x3xf32> + print("checked vector type:", VectorType.get(shape, f32)) none = NoneType.get() try: diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp index 3140f12c0b7e8..b9115657d6bf3 100644 --- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp +++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp @@ -513,14 +513,57 @@ getCustomBuilderParams(std::initializer_list prefix, return builderParams; } +static std::string getSignature(const Method &m) { + std::string signature; + llvm::raw_string_ostream os(signature); + raw_indented_ostream indentedOs(os); + m.writeDeclTo(indentedOs); + return signature; +} + +static void emitDuplicatedBuilderError(const Method ¤tMethod, + StringRef methodName, + const Class &defCls, + const AttrOrTypeDef &def) { + + // Try to search for method that makes `get` redundant. + auto loc = def.getDef()->getFieldLoc("builders"); + for (auto &method : defCls.getMethods()) { + if (method->getName() == methodName && + method->makesRedundant(currentMethod)) { + PrintError(loc, llvm::Twine("builder `") + methodName + + "` conflicts with an existing builder. 
"); + PrintFatalNote(llvm::Twine("A new builder with signature:\n") + + getSignature(currentMethod) + + "\nis shadowed by an existing builder with signature:\n" + + getSignature(*method) + + "\nPlease remove one of the conflicting " + "definitions."); + } + } + + // This code shouldn't be reached, but leaving this here for potential future + // use. + PrintFatalError(loc, "Failed to generate builder " + methodName); +} + void DefGen::emitCustomBuilder(const AttrOrTypeBuilder &builder) { // Don't emit a body if there isn't one. auto props = builder.getBody() ? Method::Static : Method::StaticDeclaration; StringRef returnType = def.getCppClassName(); if (std::optional builderReturnType = builder.getReturnType()) returnType = *builderReturnType; - Method *m = defCls.addMethod(returnType, "get", props, - getCustomBuilderParams({}, builder)); + + llvm::StringRef methodName = "get"; + const auto parameters = getCustomBuilderParams({}, builder); + Method *m = defCls.addMethod(returnType, methodName, props, parameters); + + // If method is pruned, report error and terminate. + if (!m) { + auto curMethod = Method(returnType, methodName, props, parameters); + emitDuplicatedBuilderError(curMethod, methodName, defCls, def); + } + if (!builder.getBody()) return; @@ -547,11 +590,19 @@ void DefGen::emitCheckedCustomBuilder(const AttrOrTypeBuilder &builder) { StringRef returnType = def.getCppClassName(); if (std::optional builderReturnType = builder.getReturnType()) returnType = *builderReturnType; - Method *m = defCls.addMethod( - returnType, "getChecked", props, - getCustomBuilderParams( - {{"::llvm::function_ref<::mlir::InFlightDiagnostic()>", "emitError"}}, - builder)); + + llvm::StringRef methodName = "getChecked"; + auto parameters = getCustomBuilderParams( + {{"::llvm::function_ref<::mlir::InFlightDiagnostic()>", "emitError"}}, + builder); + Method *m = defCls.addMethod(returnType, methodName, props, parameters); + + // If method is pruned, report error and terminate. 
+ if (!m) { + auto curMethod = Method(returnType, methodName, props, parameters); + emitDuplicatedBuilderError(curMethod, methodName, defCls, def); + } + if (!builder.getBody()) return; diff --git a/mlir/tools/mlir-tblgen/EnumsGen.cpp b/mlir/tools/mlir-tblgen/EnumsGen.cpp index d152763f7382e..d4d32f5885971 100644 --- a/mlir/tools/mlir-tblgen/EnumsGen.cpp +++ b/mlir/tools/mlir-tblgen/EnumsGen.cpp @@ -364,6 +364,9 @@ getAllBitsUnsetCase(llvm::ArrayRef cases) { // inline constexpr operator|( a, b); // inline constexpr operator&( a, b); // inline constexpr operator^( a, b); +// inline constexpr &operator|=( &a, b); +// inline constexpr &operator&=( &a, b); +// inline constexpr &operator^=( &a, b); // inline constexpr operator~( bits); // inline constexpr bool bitEnumContainsAll( bits, bit); // inline constexpr bool bitEnumContainsAny( bits, bit); @@ -385,6 +388,15 @@ inline constexpr {0} operator&({0} a, {0} b) {{ inline constexpr {0} operator^({0} a, {0} b) {{ return static_cast<{0}>(static_cast<{1}>(a) ^ static_cast<{1}>(b)); } +inline constexpr {0} &operator|=({0} &a, {0} b) {{ + return a = a | b; +} +inline constexpr {0} &operator&=({0} &a, {0} b) {{ + return a = a & b; +} +inline constexpr {0} &operator^=({0} &a, {0} b) {{ + return a = a ^ b; +} inline constexpr {0} operator~({0} bits) {{ // Ensure only bits that can be present in the enum are set return static_cast<{0}>(~static_cast<{1}>(bits) & static_cast<{1}>({2}u)); diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp index 4fdde76a613bb..7e8e559baf878 100644 --- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp @@ -3104,8 +3104,8 @@ void OpEmitter::genBuilder() { std::optional body = builder.getBody(); auto properties = body ? Method::Static : Method::StaticDeclaration; auto *method = opClass.addMethod("void", "build", properties, arguments); - if (body) - ERROR_IF_PRUNED(method, "build", op); + + ERROR_IF_PRUNED(method, "build", op); if (method) method->setDeprecated(builder.getDeprecatedMessage()); diff --git a/mlir/unittests/Analysis/Presburger/BarvinokTest.cpp b/mlir/unittests/Analysis/Presburger/BarvinokTest.cpp index 5e279b542fdf9..eaf04379cb529 100644 --- a/mlir/unittests/Analysis/Presburger/BarvinokTest.cpp +++ b/mlir/unittests/Analysis/Presburger/BarvinokTest.cpp @@ -231,10 +231,10 @@ TEST(BarvinokTest, computeNumTermsCone) { // We expect the answer to be // (⌊M⌋ + 1)(⌊N⌋ + 1)(⌊P⌋ + 1) = // ⌊M⌋⌊N⌋⌊P⌋ + ⌊M⌋⌊N⌋ + ⌊N⌋⌊P⌋ + ⌊M⌋⌊P⌋ + ⌊M⌋ + ⌊N⌋ + ⌊P⌋ + 1. - for (unsigned i = 0; i < 2; i++) + for (auto &i : count) for (unsigned j = 0; j < 2; j++) for (unsigned k = 0; k < 2; k++) - EXPECT_EQ(count[i][j][k], 1); + EXPECT_EQ(i[j][k], 1); } /// We define some simple polyhedra with unimodular tangent cones and verify diff --git a/mlir/unittests/Analysis/Presburger/MatrixTest.cpp b/mlir/unittests/Analysis/Presburger/MatrixTest.cpp index cb8df8b346011..e2c2a9bcb7d26 100644 --- a/mlir/unittests/Analysis/Presburger/MatrixTest.cpp +++ b/mlir/unittests/Analysis/Presburger/MatrixTest.cpp @@ -390,7 +390,7 @@ TEST(MatrixTest, gramSchmidt) { EXPECT_EQ_FRAC_MATRIX(gs, FracMatrix::identity(10)); } -void checkReducedBasis(FracMatrix mat, Fraction delta) { +static void checkReducedBasis(FracMatrix mat, Fraction delta) { FracMatrix gsOrth = mat.gramSchmidt(); // Size-reduced check. 
diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td index cc98b672a26a9..79e8038330048 100644 --- a/offload/liboffload/API/Memory.td +++ b/offload/liboffload/API/Memory.td @@ -21,6 +21,9 @@ def ol_alloc_type_t : Enum { def olMemAlloc : Function { let desc = "Creates a memory allocation on the specified device."; + let details = [ + "All allocations through olMemAlloc regardless of source share a single virtual address range. There is no risk of multiple devices returning equal pointers to different memory." + ]; let params = [ Param<"ol_device_handle_t", "Device", "handle of the device to allocate on", PARAM_IN>, Param<"ol_alloc_type_t", "Type", "type of the allocation", PARAM_IN>, @@ -42,6 +45,56 @@ def olMemFree : Function { let returns = []; } +def ol_mem_info_t : Enum { + let desc = "Supported memory info."; + let is_typed = 1; + let etors = [ + TaggedEtor<"DEVICE", "ol_device_handle_t", "The handle of the device associated with the allocation.">, + TaggedEtor<"BASE", "void *", "Base address of this allocation.">, + TaggedEtor<"SIZE", "size_t", "Size of this allocation in bytes.">, + TaggedEtor<"TYPE", "ol_alloc_type_t", "Type of this allocation.">, + ]; +} + +def olGetMemInfo : Function { + let desc = "Queries the given property of a memory allocation allocated with olMemAlloc."; + let details = [ + "`olGetMemInfoSize` can be used to query the storage size required for the given query.", + "The provided pointer can point to any location inside the allocation.", + ]; + let params = [ + Param<"const void *", "Ptr", "pointer to the allocated memory", PARAM_IN>, + Param<"ol_mem_info_t", "PropName", "type of the info to retrieve", PARAM_IN>, + Param<"size_t", "PropSize", "the number of bytes pointed to by PropValue.", PARAM_IN>, + TypeTaggedParam<"void*", "PropValue", "array of bytes holding the info. " + "If PropSize is not equal to or greater than the real number of bytes needed to return the info " + "then the OL_ERRC_INVALID_SIZE error is returned and PropValue is not used.", PARAM_OUT, + TypeInfo<"PropName" , "PropSize">> + ]; + let returns = [ + Return<"OL_ERRC_INVALID_SIZE", [ + "`PropSize == 0`", + "If `PropSize` is less than the real number of bytes needed to return the info." 
+ ]>, + Return<"OL_ERRC_NOT_FOUND", ["memory was not allocated by liboffload"]> + ]; +} + +def olGetMemInfoSize : Function { + let desc = "Returns the storage size of the given memory info query."; + let details = [ + "The provided pointer can point to any location inside the allocation.", + ]; + let params = [ + Param<"const void *", "Ptr", "pointer to the allocated memory", PARAM_IN>, + Param<"ol_mem_info_t", "PropName", "type of the info to query", PARAM_IN>, + Param<"size_t*", "PropSizeRet", "pointer to the number of bytes required to store the query", PARAM_OUT> + ]; + let returns = [ + Return<"OL_ERRC_NOT_FOUND", ["memory was not allocated by liboffload"]> + ]; +} + def olMemcpy : Function { let desc = "Enqueue a memcpy operation."; let details = [ diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index c5d083db7522e..051882da7c6c7 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -39,12 +39,28 @@ using namespace llvm::omp::target; using namespace llvm::omp::target::plugin; using namespace error; +struct ol_platform_impl_t { + ol_platform_impl_t(std::unique_ptr Plugin, + ol_platform_backend_t BackendType) + : Plugin(std::move(Plugin)), BackendType(BackendType) {} + std::unique_ptr Plugin; + llvm::SmallVector> Devices; + ol_platform_backend_t BackendType; + + /// Complete all pending work for this platform and perform any needed + /// cleanup. + /// + /// After calling this function, no liboffload functions should be called with + /// this platform handle. + llvm::Error destroy(); +}; + // Handle type definitions. Ideally these would be 1:1 with the plugins, but // we add some additional data here for now to avoid churn in the plugin // interface. struct ol_device_impl_t { ol_device_impl_t(int DeviceNum, GenericDeviceTy *Device, - ol_platform_handle_t Platform, InfoTreeNode &&DevInfo) + ol_platform_impl_t &Platform, InfoTreeNode &&DevInfo) : DeviceNum(DeviceNum), Device(Device), Platform(Platform), Info(std::forward(DevInfo)) {} @@ -55,7 +71,7 @@ struct ol_device_impl_t { int DeviceNum; GenericDeviceTy *Device; - ol_platform_handle_t Platform; + ol_platform_impl_t &Platform; InfoTreeNode Info; llvm::SmallVector<__tgt_async_info *> OutstandingQueues; @@ -102,31 +118,17 @@ struct ol_device_impl_t { } }; -struct ol_platform_impl_t { - ol_platform_impl_t(std::unique_ptr Plugin, - ol_platform_backend_t BackendType) - : Plugin(std::move(Plugin)), BackendType(BackendType) {} - std::unique_ptr Plugin; - llvm::SmallVector> Devices; - ol_platform_backend_t BackendType; +llvm::Error ol_platform_impl_t::destroy() { + llvm::Error Result = Plugin::success(); + for (auto &D : Devices) + if (auto Err = D->destroy()) + Result = llvm::joinErrors(std::move(Result), std::move(Err)); - /// Complete all pending work for this platform and perform any needed - /// cleanup. - /// - /// After calling this function, no liboffload functions should be called with - /// this platform handle. 
- llvm::Error destroy() { - llvm::Error Result = Plugin::success(); - for (auto &D : Devices) - if (auto Err = D->destroy()) - Result = llvm::joinErrors(std::move(Result), std::move(Err)); + if (auto Res = Plugin->deinit()) + Result = llvm::joinErrors(std::move(Result), std::move(Res)); - if (auto Res = Plugin->deinit()) - Result = llvm::joinErrors(std::move(Result), std::move(Res)); - - return Result; - } -}; + return Result; +} struct ol_queue_impl_t { ol_queue_impl_t(__tgt_async_info *AsyncInfo, ol_device_handle_t Device) @@ -182,6 +184,9 @@ namespace offload { struct AllocInfo { ol_device_handle_t Device; ol_alloc_type_t Type; + void *Start; + // One byte past the end + void *End; }; // Global shared state for liboffload @@ -200,12 +205,15 @@ struct OffloadContext { bool ValidationEnabled = true; DenseMap AllocInfoMap{}; std::mutex AllocInfoMapMutex{}; - SmallVector Platforms{}; + // Partitioned list of memory base addresses. Each element in this list is a + // key in AllocInfoMap + llvm::SmallVector AllocBases{}; + SmallVector, 4> Platforms{}; size_t RefCount; ol_device_handle_t HostDevice() { // The host platform is always inserted last - return Platforms.back().Devices[0].get(); + return Platforms.back()->Devices[0].get(); } static OffloadContext &get() { @@ -244,38 +252,35 @@ Error initPlugins(OffloadContext &Context) { // Attempt to create an instance of each supported plugin. #define PLUGIN_TARGET(Name) \ do { \ - Context.Platforms.emplace_back(ol_platform_impl_t{ \ - std::unique_ptr(createPlugin_##Name()), \ - pluginNameToBackend(#Name)}); \ + if (StringRef(#Name) != "host") \ + Context.Platforms.emplace_back(std::make_unique( \ + std::unique_ptr(createPlugin_##Name()), \ + pluginNameToBackend(#Name))); \ } while (false); #include "Shared/Targets.def" // Preemptively initialize all devices in the plugin for (auto &Platform : Context.Platforms) { - // Do not use the host plugin - it isn't supported. 
- if (Platform.BackendType == OL_PLATFORM_BACKEND_UNKNOWN) - continue; - auto Err = Platform.Plugin->init(); + auto Err = Platform->Plugin->init(); [[maybe_unused]] std::string InfoMsg = toString(std::move(Err)); - for (auto DevNum = 0; DevNum < Platform.Plugin->number_of_devices(); + for (auto DevNum = 0; DevNum < Platform->Plugin->number_of_devices(); DevNum++) { - if (Platform.Plugin->init_device(DevNum) == OFFLOAD_SUCCESS) { - auto Device = &Platform.Plugin->getDevice(DevNum); + if (Platform->Plugin->init_device(DevNum) == OFFLOAD_SUCCESS) { + auto Device = &Platform->Plugin->getDevice(DevNum); auto Info = Device->obtainInfoImpl(); if (auto Err = Info.takeError()) return Err; - Platform.Devices.emplace_back(std::make_unique( - DevNum, Device, &Platform, std::move(*Info))); + Platform->Devices.emplace_back(std::make_unique( + DevNum, Device, *Platform, std::move(*Info))); } } } // Add the special host device auto &HostPlatform = Context.Platforms.emplace_back( - ol_platform_impl_t{nullptr, OL_PLATFORM_BACKEND_HOST}); - HostPlatform.Devices.emplace_back( - std::make_unique(-1, nullptr, nullptr, InfoTreeNode{})); - Context.HostDevice()->Platform = &HostPlatform; + std::make_unique(nullptr, OL_PLATFORM_BACKEND_HOST)); + HostPlatform->Devices.emplace_back(std::make_unique( + -1, nullptr, *HostPlatform, InfoTreeNode{})); Context.TracingEnabled = std::getenv("OFFLOAD_TRACE"); Context.ValidationEnabled = !std::getenv("OFFLOAD_DISABLE_VALIDATION"); @@ -312,10 +317,10 @@ Error olShutDown_impl() { for (auto &P : OldContext->Platforms) { // Host plugin is nullptr and has no deinit - if (!P.Plugin || !P.Plugin->is_initialized()) + if (!P->Plugin || !P->Plugin->is_initialized()) continue; - if (auto Res = P.destroy()) + if (auto Res = P->destroy()) Result = llvm::joinErrors(std::move(Result), std::move(Res)); } @@ -380,7 +385,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, // These are not implemented by the plugin interface switch (PropName) { case OL_DEVICE_INFO_PLATFORM: - return Info.write(Device->Platform); + return Info.write(&Device->Platform); case OL_DEVICE_INFO_TYPE: return Info.write(OL_DEVICE_TYPE_GPU); @@ -513,7 +518,7 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device, switch (PropName) { case OL_DEVICE_INFO_PLATFORM: - return Info.write(Device->Platform); + return Info.write(&Device->Platform); case OL_DEVICE_INFO_TYPE: return Info.write(OL_DEVICE_TYPE_HOST); case OL_DEVICE_INFO_NAME: @@ -591,7 +596,7 @@ Error olGetDeviceInfoSize_impl(ol_device_handle_t Device, Error olIterateDevices_impl(ol_device_iterate_cb_t Callback, void *UserData) { for (auto &Platform : OffloadContext::get().Platforms) { - for (auto &Device : Platform.Devices) { + for (auto &Device : Platform->Devices) { if (!Callback(Device.get(), UserData)) { break; } @@ -613,20 +618,61 @@ TargetAllocTy convertOlToPluginAllocTy(ol_alloc_type_t Type) { } } +constexpr size_t MAX_ALLOC_TRIES = 50; Error olMemAlloc_impl(ol_device_handle_t Device, ol_alloc_type_t Type, size_t Size, void **AllocationOut) { - auto Alloc = - Device->Device->dataAlloc(Size, nullptr, convertOlToPluginAllocTy(Type)); - if (!Alloc) - return Alloc.takeError(); + SmallVector Rejects; + + // Repeat the allocation up to a certain amount of times. If it happens to + // already be allocated (e.g. by a device from another vendor) throw it away + // and try again. 
+ for (size_t Count = 0; Count < MAX_ALLOC_TRIES; Count++) { + auto NewAlloc = Device->Device->dataAlloc(Size, nullptr, + convertOlToPluginAllocTy(Type)); + if (!NewAlloc) + return NewAlloc.takeError(); + + void *NewEnd = &static_cast(*NewAlloc)[Size]; + auto &AllocBases = OffloadContext::get().AllocBases; + auto &AllocInfoMap = OffloadContext::get().AllocInfoMap; + { + std::lock_guard Lock(OffloadContext::get().AllocInfoMapMutex); + + // Check that this memory region doesn't overlap another one + // That is, the start of this allocation needs to be after another + // allocation's end point, and the end of this allocation needs to be + // before the next one's start. + // `Gap` is the first alloc who ends after the new alloc's start point. + auto Gap = + std::lower_bound(AllocBases.begin(), AllocBases.end(), *NewAlloc, + [&](const void *Iter, const void *Val) { + return AllocInfoMap.at(Iter).End <= Val; + }); + if (Gap == AllocBases.end() || NewEnd <= AllocInfoMap.at(*Gap).Start) { + // Success, no conflict + AllocInfoMap.insert_or_assign( + *NewAlloc, AllocInfo{Device, Type, *NewAlloc, NewEnd}); + AllocBases.insert( + std::lower_bound(AllocBases.begin(), AllocBases.end(), *NewAlloc), + *NewAlloc); + *AllocationOut = *NewAlloc; + + for (void *R : Rejects) + if (auto Err = + Device->Device->dataDelete(R, convertOlToPluginAllocTy(Type))) + return Err; + return Error::success(); + } - *AllocationOut = *Alloc; - { - std::lock_guard Lock(OffloadContext::get().AllocInfoMapMutex); - OffloadContext::get().AllocInfoMap.insert_or_assign( - *Alloc, AllocInfo{Device, Type}); + // To avoid the next attempt allocating the same memory we just freed, we + // hold onto it until we complete the allocation + Rejects.push_back(*NewAlloc); + } } - return Error::success(); + + // We've tried multiple times, and can't allocate a non-overlapping region. 
+ return createOffloadError(ErrorCode::BACKEND_FAILURE, + "failed to allocate non-overlapping memory"); } Error olMemFree_impl(void *Address) { @@ -642,6 +688,9 @@ Error olMemFree_impl(void *Address) { Device = AllocInfo.Device; Type = AllocInfo.Type; OffloadContext::get().AllocInfoMap.erase(Address); + + auto &Bases = OffloadContext::get().AllocBases; + Bases.erase(std::lower_bound(Bases.begin(), Bases.end(), Address)); } if (auto Res = @@ -651,6 +700,60 @@ Error olMemFree_impl(void *Address) { return Error::success(); } +Error olGetMemInfoImplDetail(const void *Ptr, ol_mem_info_t PropName, + size_t PropSize, void *PropValue, + size_t *PropSizeRet) { + InfoWriter Info(PropSize, PropValue, PropSizeRet); + std::lock_guard Lock(OffloadContext::get().AllocInfoMapMutex); + + auto &AllocBases = OffloadContext::get().AllocBases; + auto &AllocInfoMap = OffloadContext::get().AllocInfoMap; + const AllocInfo *Alloc = nullptr; + if (AllocInfoMap.contains(Ptr)) { + // Fast case, we have been given the base pointer directly + Alloc = &AllocInfoMap.at(Ptr); + } else { + // Slower case, we need to look up the base pointer first + // Find the first memory allocation whose end is after the target pointer, + // and then check to see if it is in range + auto Loc = std::lower_bound(AllocBases.begin(), AllocBases.end(), Ptr, + [&](const void *Iter, const void *Val) { + return AllocInfoMap.at(Iter).End <= Val; + }); + if (Loc == AllocBases.end() || Ptr < AllocInfoMap.at(*Loc).Start) + return Plugin::error(ErrorCode::NOT_FOUND, + "allocated memory information not found"); + Alloc = &AllocInfoMap.at(*Loc); + } + + switch (PropName) { + case OL_MEM_INFO_DEVICE: + return Info.write(Alloc->Device); + case OL_MEM_INFO_BASE: + return Info.write(Alloc->Start); + case OL_MEM_INFO_SIZE: + return Info.write(static_cast(Alloc->End) - + static_cast(Alloc->Start)); + case OL_MEM_INFO_TYPE: + return Info.write(Alloc->Type); + default: + return createOffloadError(ErrorCode::INVALID_ENUMERATION, + "olGetMemInfo enum '%i' is invalid", PropName); + } + + return Error::success(); +} + +Error olGetMemInfo_impl(const void *Ptr, ol_mem_info_t PropName, + size_t PropSize, void *PropValue) { + return olGetMemInfoImplDetail(Ptr, PropName, PropSize, PropValue, nullptr); +} + +Error olGetMemInfoSize_impl(const void *Ptr, ol_mem_info_t PropName, + size_t *PropSizeRet) { + return olGetMemInfoImplDetail(Ptr, PropName, 0, nullptr, PropSizeRet); +} + Error olCreateQueue_impl(ol_device_handle_t Device, ol_queue_handle_t *Queue) { auto CreatedQueue = std::make_unique(nullptr, Device); diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 64470e9fabf46..7b834ee346e5d 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -423,7 +423,11 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy { assert(MemoryManager && "Invalid memory manager"); assert(PtrStorage && "Invalid pointer storage"); - *PtrStorage = MemoryManager->allocate(Size, nullptr); + auto PtrStorageOrErr = MemoryManager->allocate(Size, nullptr); + if (!PtrStorageOrErr) + return PtrStorageOrErr.takeError(); + + *PtrStorage = *PtrStorageOrErr; if (Size && *PtrStorage == nullptr) return Plugin::error(ErrorCode::OUT_OF_RESOURCES, "failure to allocate from AMDGPU memory manager"); @@ -443,15 +447,12 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy { private: /// Allocation callback that will be called once the memory manager does not /// have more previously allocated 
buffers. - void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override; + Expected allocate(size_t Size, void *HstPtr, + TargetAllocTy Kind) override; /// Deallocation callback that will be called by the memory manager. - int free(void *TgtPtr, TargetAllocTy Kind) override { - if (auto Err = MemoryPool->deallocate(TgtPtr)) { - consumeError(std::move(Err)); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; + Error free(void *TgtPtr, TargetAllocTy Kind) override { + return MemoryPool->deallocate(TgtPtr); } /// The underlying plugin that owns this memory manager. @@ -2339,12 +2340,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { } /// Allocate memory on the device or related to the device. - void *allocate(size_t Size, void *, TargetAllocTy Kind) override; + Expected allocate(size_t Size, void *, TargetAllocTy Kind) override; /// Deallocate memory on the device or related to the device. - int free(void *TgtPtr, TargetAllocTy Kind) override { + Error free(void *TgtPtr, TargetAllocTy Kind) override { if (TgtPtr == nullptr) - return OFFLOAD_SUCCESS; + return Plugin::success(); AMDGPUMemoryPoolTy *MemoryPool = nullptr; switch (Kind) { @@ -2360,17 +2361,14 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { break; } - if (!MemoryPool) { - REPORT("No memory pool for the specified allocation kind\n"); - return OFFLOAD_FAIL; - } + if (!MemoryPool) + return Plugin::error(ErrorCode::OUT_OF_RESOURCES, + "no memory pool for the specified allocation kind"); - if (Error Err = MemoryPool->deallocate(TgtPtr)) { - REPORT("%s\n", toString(std::move(Err)).data()); - return OFFLOAD_FAIL; - } + if (auto Err = MemoryPool->deallocate(TgtPtr)) + return Err; - return OFFLOAD_SUCCESS; + return Plugin::success(); } /// Synchronize current thread with the pending operations on the async info. @@ -3813,14 +3811,13 @@ static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) { return Plugin::error(OffloadErrCode, ErrFmt, Args..., Desc); } -void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr, - TargetAllocTy Kind) { +Expected AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr, + TargetAllocTy Kind) { // Allocate memory from the pool. void *Ptr = nullptr; - if (auto Err = MemoryPool->allocate(Size, &Ptr)) { - consumeError(std::move(Err)); - return nullptr; - } + if (auto Err = MemoryPool->allocate(Size, &Ptr)) + return std::move(Err); + assert(Ptr && "Invalid pointer"); // Get a list of agents that can access this memory pool. @@ -3830,14 +3827,13 @@ void *AMDGPUMemoryManagerTy::allocate(size_t Size, void *HstPtr, [&](hsa_agent_t Agent) { return MemoryPool->canAccess(Agent); }); // Allow all valid kernel agents to access the allocation. - if (auto Err = MemoryPool->enableAccess(Ptr, Size, Agents)) { - REPORT("%s\n", toString(std::move(Err)).data()); - return nullptr; - } + if (auto Err = MemoryPool->enableAccess(Ptr, Size, Agents)) + return std::move(Err); return Ptr; } -void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { +Expected AMDGPUDeviceTy::allocate(size_t Size, void *, + TargetAllocTy Kind) { if (Size == 0) return nullptr; @@ -3856,17 +3852,14 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { break; } - if (!MemoryPool) { - REPORT("No memory pool for the specified allocation kind\n"); - return nullptr; - } + if (!MemoryPool) + return Plugin::error(ErrorCode::UNSUPPORTED, + "no memory pool for the specified allocation kind"); // Allocate from the corresponding memory pool. 
void *Alloc = nullptr; - if (Error Err = MemoryPool->allocate(Size, &Alloc)) { - REPORT("%s\n", toString(std::move(Err)).data()); - return nullptr; - } + if (auto Err = MemoryPool->allocate(Size, &Alloc)) + return std::move(Err); if (Alloc) { // Get a list of agents that can access this memory pool. Inherently @@ -3879,10 +3872,8 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) { }); // Enable all valid kernel agents to access the buffer. - if (auto Err = MemoryPool->enableAccess(Alloc, Size, Agents)) { - REPORT("%s\n", toString(std::move(Err)).data()); - return nullptr; - } + if (auto Err = MemoryPool->enableAccess(Alloc, Size, Agents)) + return std::move(Err); } return Alloc; diff --git a/offload/plugins-nextgen/common/include/MemoryManager.h b/offload/plugins-nextgen/common/include/MemoryManager.h index a4f6e628c403a..8f6c1adcdaa58 100644 --- a/offload/plugins-nextgen/common/include/MemoryManager.h +++ b/offload/plugins-nextgen/common/include/MemoryManager.h @@ -25,6 +25,10 @@ #include "Shared/Utils.h" #include "omptarget.h" +#include "llvm/Support/Error.h" + +namespace llvm { + /// Base class of per-device allocator. class DeviceAllocatorTy { public: @@ -32,11 +36,13 @@ class DeviceAllocatorTy { /// Allocate a memory of size \p Size . \p HstPtr is used to assist the /// allocation. - virtual void *allocate(size_t Size, void *HstPtr, - TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0; + virtual Expected + allocate(size_t Size, void *HstPtr, + TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0; /// Delete the pointer \p TgtPtr on the device - virtual int free(void *TgtPtr, TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0; + virtual Error free(void *TgtPtr, + TargetAllocTy Kind = TARGET_ALLOC_DEFAULT) = 0; }; /// Class of memory manager. The memory manager is per-device by using @@ -134,17 +140,17 @@ class MemoryManagerTy { size_t SizeThreshold = 1U << 13; /// Request memory from target device - void *allocateOnDevice(size_t Size, void *HstPtr) const { + Expected allocateOnDevice(size_t Size, void *HstPtr) const { return DeviceAllocator.allocate(Size, HstPtr, TARGET_ALLOC_DEVICE); } /// Deallocate data on device - int deleteOnDevice(void *Ptr) const { return DeviceAllocator.free(Ptr); } + Error deleteOnDevice(void *Ptr) const { return DeviceAllocator.free(Ptr); } /// This function is called when it tries to allocate memory on device but the /// device returns out of memory. It will first free all memory in the /// FreeList and try to allocate again. - void *freeAndAllocate(size_t Size, void *HstPtr) { + Expected freeAndAllocate(size_t Size, void *HstPtr) { std::vector RemoveList; // Deallocate all memory in FreeList @@ -154,7 +160,8 @@ class MemoryManagerTy { if (List.empty()) continue; for (const NodeTy &N : List) { - deleteOnDevice(N.Ptr); + if (auto Err = deleteOnDevice(N.Ptr)) + return Err; RemoveList.push_back(N.Ptr); } FreeLists[I].clear(); @@ -175,14 +182,22 @@ class MemoryManagerTy { /// allocate directly on the device. If a \p nullptr is returned, it might /// be because the device is OOM. In that case, it will free all unused /// memory and then try again. - void *allocateOrFreeAndAllocateOnDevice(size_t Size, void *HstPtr) { - void *TgtPtr = allocateOnDevice(Size, HstPtr); + Expected allocateOrFreeAndAllocateOnDevice(size_t Size, + void *HstPtr) { + auto TgtPtrOrErr = allocateOnDevice(Size, HstPtr); + if (!TgtPtrOrErr) + return TgtPtrOrErr.takeError(); + + void *TgtPtr = *TgtPtrOrErr; // We cannot get memory from the device. It might be due to OOM. 
Let's // free all memory in FreeLists and try again. if (TgtPtr == nullptr) { DP("Failed to get memory on device. Free all memory in FreeLists and " "try again.\n"); - TgtPtr = freeAndAllocate(Size, HstPtr); + TgtPtrOrErr = freeAndAllocate(Size, HstPtr); + if (!TgtPtrOrErr) + return TgtPtrOrErr.takeError(); + TgtPtr = *TgtPtrOrErr; } if (TgtPtr == nullptr) @@ -204,16 +219,17 @@ class MemoryManagerTy { /// Destructor ~MemoryManagerTy() { - for (auto Itr = PtrToNodeTable.begin(); Itr != PtrToNodeTable.end(); - ++Itr) { - assert(Itr->second.Ptr && "nullptr in map table"); - deleteOnDevice(Itr->second.Ptr); + for (auto &PtrToNode : PtrToNodeTable) { + assert(PtrToNode.second.Ptr && "nullptr in map table"); + if (auto Err = deleteOnDevice(PtrToNode.second.Ptr)) + REPORT("Failure to delete memory: %s\n", + toString(std::move(Err)).data()); } } /// Allocate memory of size \p Size from target device. \p HstPtr is used to /// assist the allocation. - void *allocate(size_t Size, void *HstPtr) { + Expected allocate(size_t Size, void *HstPtr) { // If the size is zero, we will not bother the target device. Just return // nullptr directly. if (Size == 0) @@ -228,11 +244,14 @@ class MemoryManagerTy { DP("%zu is greater than the threshold %zu. Allocate it directly from " "device\n", Size, SizeThreshold); - void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + auto TgtPtrOrErr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + if (!TgtPtrOrErr) + return TgtPtrOrErr.takeError(); - DP("Got target pointer " DPxMOD ". Return directly.\n", DPxPTR(TgtPtr)); + DP("Got target pointer " DPxMOD ". Return directly.\n", + DPxPTR(*TgtPtrOrErr)); - return TgtPtr; + return *TgtPtrOrErr; } NodeTy *NodePtr = nullptr; @@ -260,8 +279,11 @@ class MemoryManagerTy { if (NodePtr == nullptr) { DP("Cannot find a node in the FreeLists. Allocate on device.\n"); // Allocate one on device - void *TgtPtr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + auto TgtPtrOrErr = allocateOrFreeAndAllocateOnDevice(Size, HstPtr); + if (!TgtPtrOrErr) + return TgtPtrOrErr.takeError(); + void *TgtPtr = *TgtPtrOrErr; if (TgtPtr == nullptr) return nullptr; @@ -282,7 +304,7 @@ class MemoryManagerTy { } /// Deallocate memory pointed by \p TgtPtr - int free(void *TgtPtr) { + Error free(void *TgtPtr) { DP("MemoryManagerTy::free: target memory " DPxMOD ".\n", DPxPTR(TgtPtr)); NodeTy *P = nullptr; @@ -314,7 +336,7 @@ class MemoryManagerTy { FreeLists[B].insert(*P); } - return OFFLOAD_SUCCESS; + return Error::success(); } /// Get the size threshold from the environment variable @@ -344,4 +366,6 @@ class MemoryManagerTy { constexpr const size_t MemoryManagerTy::BucketSize[]; constexpr const int MemoryManagerTy::NumBuckets; +} // namespace llvm + #endif // LLVM_OPENMP_LIBOMPTARGET_PLUGINS_COMMON_MEMORYMANAGER_H diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 9d5651a3d7b4e..5620437716b31 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -193,7 +193,7 @@ struct InfoTreeNode { InfoTreeNode() : InfoTreeNode("", std::monostate{}, "") {} InfoTreeNode(std::string Key, VariantType Value, std::string Units) - : Key(Key), Value(Value), Units(Units) {} + : Key(std::move(Key)), Value(Value), Units(std::move(Units)) {} /// Add a new info entry as a child of this node. The entry requires at least /// a key string in \p Key. 
The value in \p Value is optional and can be any @@ -202,7 +202,7 @@ struct InfoTreeNode { /// use that value for an appropriate olGetDeviceInfo query template InfoTreeNode *add(std::string Key, T Value = T(), - const std::string &Units = std::string(), + std::string Units = std::string(), std::optional DeviceInfoKey = std::nullopt) { assert(!Key.empty() && "Invalid info key"); @@ -217,7 +217,8 @@ struct InfoTreeNode { else ValueVariant = std::string{Value}; - auto Ptr = &Children->emplace_back(Key, ValueVariant, Units); + auto Ptr = + &Children->emplace_back(std::move(Key), ValueVariant, std::move(Units)); if (DeviceInfoKey) DeviceInfoMap[*DeviceInfoKey] = Children->size() - 1; diff --git a/offload/plugins-nextgen/common/src/JIT.cpp b/offload/plugins-nextgen/common/src/JIT.cpp index 07ef05e7e9d38..881e27dad384c 100644 --- a/offload/plugins-nextgen/common/src/JIT.cpp +++ b/offload/plugins-nextgen/common/src/JIT.cpp @@ -180,9 +180,10 @@ Expected> JITEngine::backend(Module &M, const std::string &ComputeUnitKind, unsigned OptLevel) { - auto RemarksFileOrErr = setupLLVMOptimizationRemarks( - M.getContext(), /*RemarksFilename=*/"", /*RemarksPasses=*/"", - /*RemarksFormat=*/"", /*RemarksWithHotness=*/false); + Expected RemarksFileOrErr = + setupLLVMOptimizationRemarks( + M.getContext(), /*RemarksFilename=*/"", /*RemarksPasses=*/"", + /*RemarksFormat=*/"", /*RemarksWithHotness=*/false); if (Error E = RemarksFileOrErr.takeError()) return std::move(E); if (*RemarksFileOrErr) diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 30b5db782370d..15b6b9866e5a2 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -73,11 +73,17 @@ struct RecordReplayTy { }; llvm::SmallVector GlobalEntries{}; - void *suggestAddress(uint64_t MaxMemoryAllocation) { + Expected suggestAddress(uint64_t MaxMemoryAllocation) { // Get a valid pointer address for this system - void *Addr = + auto AddrOrErr = Device->allocate(1024, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT); - Device->free(Addr); + if (!AddrOrErr) + return AddrOrErr.takeError(); + + void *Addr = *AddrOrErr; + if (auto Err = Device->free(Addr)) + return std::move(Err); + // Align Address to MaxMemoryAllocation Addr = (void *)utils::alignPtr((Addr), MaxMemoryAllocation); return Addr; @@ -86,8 +92,12 @@ struct RecordReplayTy { Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) { size_t ASize = MaxMemoryAllocation; - if (!VAddr && isRecording()) - VAddr = suggestAddress(MaxMemoryAllocation); + if (!VAddr && isRecording()) { + auto VAddrOrErr = suggestAddress(MaxMemoryAllocation); + if (!VAddrOrErr) + return VAddrOrErr.takeError(); + VAddr = *VAddrOrErr; + } DP("Request %ld bytes allocated at %p\n", MaxMemoryAllocation, VAddr); @@ -117,8 +127,11 @@ struct RecordReplayTy { constexpr size_t STEP = 1024 * 1024 * 1024ULL; MemoryStart = nullptr; for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) { - MemoryStart = + auto MemoryStartOrErr = Device->allocate(TotalSize, /*HstPtr=*/nullptr, TARGET_ALLOC_DEFAULT); + if (!MemoryStartOrErr) + return MemoryStartOrErr.takeError(); + MemoryStart = *MemoryStartOrErr; if (MemoryStart) break; } @@ -352,13 +365,15 @@ struct RecordReplayTy { return Plugin::success(); } - void deinit() { + Error deinit() { if (UsedVAMap) { if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize)) - report_fatal_error("Error on releasing virtual memory space"); + 
return Err; } else { - Device->free(MemoryStart); + if (auto Err = Device->free(MemoryStart)) + return Err; } + return Plugin::success(); } }; } // namespace llvm::omp::target::plugin @@ -838,7 +853,8 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { RecordReplayTy &RecordReplay = Plugin.getRecordReplay(); if (RecordReplay.isRecordingOrReplaying()) - RecordReplay.deinit(); + if (auto Err = RecordReplay.deinit()) + return Err; if (RPCServer) if (auto Err = RPCServer->deinitDevice(*this)) @@ -1297,7 +1313,10 @@ Expected GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr, case TARGET_ALLOC_DEFAULT: case TARGET_ALLOC_DEVICE: if (MemoryManager) { - Alloc = MemoryManager->allocate(Size, HostPtr); + auto AllocOrErr = MemoryManager->allocate(Size, HostPtr); + if (!AllocOrErr) + return AllocOrErr.takeError(); + Alloc = *AllocOrErr; if (!Alloc) return Plugin::error(ErrorCode::OUT_OF_RESOURCES, "failed to allocate from memory manager"); @@ -1305,12 +1324,16 @@ Expected GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr, } [[fallthrough]]; case TARGET_ALLOC_HOST: - case TARGET_ALLOC_SHARED: - Alloc = allocate(Size, HostPtr, Kind); + case TARGET_ALLOC_SHARED: { + auto AllocOrErr = allocate(Size, HostPtr, Kind); + if (!AllocOrErr) + return AllocOrErr.takeError(); + Alloc = *AllocOrErr; if (!Alloc) return Plugin::error(ErrorCode::OUT_OF_RESOURCES, "failed to allocate from device allocator"); } + } // Report error if the memory manager or the device allocator did not return // any memory buffer. @@ -1382,28 +1405,19 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) { #undef DEALLOCATION_ERROR } - int Res; switch (Kind) { case TARGET_ALLOC_DEFAULT: case TARGET_ALLOC_DEVICE: if (MemoryManager) { - Res = MemoryManager->free(TgtPtr); - if (Res) - return Plugin::error( - ErrorCode::OUT_OF_RESOURCES, - "failure to deallocate device pointer %p via memory manager", - TgtPtr); + if (auto Err = MemoryManager->free(TgtPtr)) + return Err; break; } [[fallthrough]]; case TARGET_ALLOC_HOST: case TARGET_ALLOC_SHARED: - Res = free(TgtPtr, Kind); - if (Res) - return Plugin::error( - ErrorCode::UNKNOWN, - "failure to deallocate device pointer %p via device deallocator", - TgtPtr); + if (auto Err = free(TgtPtr, Kind)) + return Err; } // Unregister deallocated pinned memory buffer if the type is host memory. 
@@ -1714,7 +1728,8 @@ int32_t GenericPluginTy::is_initialized() const { return Initialized; } int32_t GenericPluginTy::isPluginCompatible(StringRef Image) { auto HandleError = [&](Error Err) -> bool { [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); - DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str()); + DP("Failure to check validity of image %p: %s", Image.data(), + ErrStr.c_str()); return false; }; switch (identify_magic(Image)) { @@ -1742,7 +1757,8 @@ int32_t GenericPluginTy::isPluginCompatible(StringRef Image) { int32_t GenericPluginTy::isDeviceCompatible(int32_t DeviceId, StringRef Image) { auto HandleError = [&](Error Err) -> bool { [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); - DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str()); + DP("Failure to check validity of image %p: %s", Image.data(), + ErrStr.c_str()); return false; }; switch (identify_magic(Image)) { diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp index 17d69b49b3b7e..e19f2ef94de6e 100644 --- a/offload/plugins-nextgen/common/src/RPC.cpp +++ b/offload/plugins-nextgen/common/src/RPC.cpp @@ -28,15 +28,22 @@ rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device, switch (Port.get_opcode()) { case LIBC_MALLOC: { Port.recv_and_send([&](rpc::Buffer *Buffer, uint32_t) { - Buffer->data[0] = reinterpret_cast( - Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE)); + auto PtrOrErr = + Device.allocate(Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE); + void *Ptr = nullptr; + if (!PtrOrErr) + llvm::consumeError(PtrOrErr.takeError()); + else + Ptr = *PtrOrErr; + Buffer->data[0] = reinterpret_cast(Ptr); }); break; } case LIBC_FREE: { Port.recv([&](rpc::Buffer *Buffer, uint32_t) { - Device.free(reinterpret_cast(Buffer->data[0]), - TARGET_ALLOC_DEVICE); + if (auto Err = Device.free(reinterpret_cast(Buffer->data[0]), + TARGET_ALLOC_DEVICE)) + llvm::consumeError(std::move(Err)); }); break; } @@ -171,9 +178,13 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, plugin::DeviceImageTy &Image) { uint64_t NumPorts = std::min(Device.requestedRPCPortCount(), rpc::MAX_PORT_COUNT); - void *RPCBuffer = Device.allocate( + auto RPCBufferOrErr = Device.allocate( rpc::Server::allocation_size(Device.getWarpSize(), NumPorts), nullptr, TARGET_ALLOC_HOST); + if (!RPCBufferOrErr) + return RPCBufferOrErr.takeError(); + + void *RPCBuffer = *RPCBufferOrErr; if (!RPCBuffer) return plugin::Plugin::error( error::ErrorCode::UNKNOWN, @@ -198,7 +209,8 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device, Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) { std::lock_guard Lock(BufferMutex); - Device.free(Buffers[Device.getDeviceId()], TARGET_ALLOC_HOST); + if (auto Err = Device.free(Buffers[Device.getDeviceId()], TARGET_ALLOC_HOST)) + return Err; Buffers[Device.getDeviceId()] = nullptr; Devices[Device.getDeviceId()] = nullptr; return Error::success(); diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index b2f840113cff3..b30c651223cad 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -561,14 +561,12 @@ struct CUDADeviceTy : public GenericDeviceTy { } /// Allocate memory on the device or related to the device. 
- void *allocate(size_t Size, void *, TargetAllocTy Kind) override { + Expected allocate(size_t Size, void *, TargetAllocTy Kind) override { if (Size == 0) return nullptr; - if (auto Err = setContext()) { - REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data()); - return nullptr; - } + if (auto Err = setContext()) + return std::move(Err); void *MemAlloc = nullptr; CUdeviceptr DevicePtr; @@ -589,23 +587,18 @@ struct CUDADeviceTy : public GenericDeviceTy { break; } - if (auto Err = - Plugin::check(Res, "error in cuMemAlloc[Host|Managed]: %s")) { - REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data()); - return nullptr; - } + if (auto Err = Plugin::check(Res, "error in cuMemAlloc[Host|Managed]: %s")) + return std::move(Err); return MemAlloc; } /// Deallocate memory on the device or related to the device. - int free(void *TgtPtr, TargetAllocTy Kind) override { + Error free(void *TgtPtr, TargetAllocTy Kind) override { if (TgtPtr == nullptr) - return OFFLOAD_SUCCESS; + return Plugin::success(); - if (auto Err = setContext()) { - REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data()); - return OFFLOAD_FAIL; - } + if (auto Err = setContext()) + return Err; CUresult Res; switch (Kind) { @@ -619,11 +612,7 @@ struct CUDADeviceTy : public GenericDeviceTy { break; } - if (auto Err = Plugin::check(Res, "error in cuMemFree[Host]: %s")) { - REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data()); - return OFFLOAD_FAIL; - } - return OFFLOAD_SUCCESS; + return Plugin::check(Res, "error in cuMemFree[Host]: %s"); } /// Synchronize current thread with the pending operations on the async info. @@ -1310,8 +1299,12 @@ struct CUDADeviceTy : public GenericDeviceTy { // Allocate a buffer to store all of the known constructor / destructor // functions in so we can iterate them on the device. - void *Buffer = + auto BufferOrErr = allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_DEVICE); + if (!BufferOrErr) + return BufferOrErr.takeError(); + + void *Buffer = *BufferOrErr; if (!Buffer) return Plugin::error(ErrorCode::OUT_OF_RESOURCES, "failed to allocate memory for global buffer"); @@ -1360,12 +1353,10 @@ struct CUDADeviceTy : public GenericDeviceTy { Error Err = Plugin::success(); AsyncInfoWrapper.finalize(Err); + if (Err) + return Err; - if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS) - return Plugin::error(ErrorCode::UNKNOWN, - "failed to free memory for global buffer"); - - return Err; + return free(Buffer, TARGET_ALLOC_DEVICE); } /// Stream manager for CUDA streams. diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index 44e2584fe53cc..0845032d0aae2 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -240,7 +240,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy { } /// Allocate memory. Use std::malloc in all cases. - void *allocate(size_t Size, void *, TargetAllocTy Kind) override { + Expected allocate(size_t Size, void *, TargetAllocTy Kind) override { if (Size == 0) return nullptr; @@ -257,9 +257,9 @@ struct GenELF64DeviceTy : public GenericDeviceTy { } /// Free the memory. Use std::free in all cases. - int free(void *TgtPtr, TargetAllocTy Kind) override { + Error free(void *TgtPtr, TargetAllocTy Kind) override { std::free(TgtPtr); - return OFFLOAD_SUCCESS; + return Plugin::success(); } /// This plugin does nothing to lock buffers. 
Do not return an error, just diff --git a/offload/test/offloading/fortran/target-no-loop.f90 b/offload/test/offloading/fortran/target-no-loop.f90 new file mode 100644 index 0000000000000..8e40e20e73e70 --- /dev/null +++ b/offload/test/offloading/fortran/target-no-loop.f90 @@ -0,0 +1,96 @@ +! REQUIRES: flang + +! RUN: %libomptarget-compile-fortran-generic -O3 -fopenmp-assume-threads-oversubscription -fopenmp-assume-teams-oversubscription +! RUN: env LIBOMPTARGET_INFO=16 OMP_NUM_TEAMS=16 OMP_TEAMS_THREAD_LIMIT=16 %libomptarget-run-generic 2>&1 | %fcheck-generic +function check_errors(array) result (errors) + integer, intent(in) :: array(1024) + integer :: errors + integer :: i + errors = 0 + do i = 1, 1024 + if ( array( i) .ne. (i) ) then + errors = errors + 1 + end if + end do +end function + +program main + use omp_lib + implicit none + integer :: i,j,red + integer :: array(1024), errors = 0 + array = 1 + + ! No-loop kernel + !$omp target teams distribute parallel do + do i = 1, 1024 + array(i) = i + end do + errors = errors + check_errors(array) + + ! SPMD kernel (num_teams clause blocks promotion to no-loop) + array = 1 + !$omp target teams distribute parallel do num_teams(3) + do i = 1, 1024 + array(i) = i + end do + + errors = errors + check_errors(array) + + ! No-loop kernel + array = 1 + !$omp target teams distribute parallel do num_threads(64) + do i = 1, 1024 + array(i) = i + end do + + errors = errors + check_errors(array) + + ! SPMD kernel + array = 1 + !$omp target parallel do + do i = 1, 1024 + array(i) = i + end do + + errors = errors + check_errors(array) + + ! Generic kernel + array = 1 + !$omp target teams distribute + do i = 1, 1024 + array(i) = i + end do + + errors = errors + check_errors(array) + + ! SPMD kernel (reduction clause blocks promotion to no-loop) + array = 1 + red =0 + !$omp target teams distribute parallel do reduction(+:red) + do i = 1, 1024 + red = red + array(i) + end do + + if (red .ne. 1024) then + errors = errors + 1 + end if + + print *,"number of errors: ", errors + +end program main + +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD-No-Loop mode +! CHECK: info: #Args: 3 Teams x Thrds: 64x 16 +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode +! CHECK: info: #Args: 3 Teams x Thrds: 3x 16 {{.*}} +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD-No-Loop mode +! CHECK: info: #Args: 3 Teams x Thrds: 64x 16 {{.*}} +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode +! CHECK: info: #Args: 3 Teams x Thrds: 1x 16 +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} Generic mode +! CHECK: info: #Args: 3 Teams x Thrds: 16x 16 {{.*}} +! CHECK: "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}} SPMD mode +! CHECK: info: #Args: 4 Teams x Thrds: 16x 16 {{.*}} +! 
CHECK: number of errors: 0
+
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index ba35c1ee87aac..50c99a5d5b639 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -27,7 +27,9 @@ add_offload_unittest("memory"
   memory/olMemAlloc.cpp
   memory/olMemFill.cpp
   memory/olMemFree.cpp
-  memory/olMemcpy.cpp)
+  memory/olMemcpy.cpp
+  memory/olGetMemInfo.cpp
+  memory/olGetMemInfoSize.cpp)
 
 add_offload_unittest("platform"
   platform/olGetPlatformInfo.cpp
diff --git a/offload/unittests/OffloadAPI/memory/olGetMemInfo.cpp b/offload/unittests/OffloadAPI/memory/olGetMemInfo.cpp
new file mode 100644
index 0000000000000..a4b382ff298ad
--- /dev/null
+++ b/offload/unittests/OffloadAPI/memory/olGetMemInfo.cpp
@@ -0,0 +1,130 @@
+//===------- Offload API tests - olGetMemInfo -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include
+#include
+
+constexpr size_t SIZE = 1024;
+
+struct olGetMemInfoBaseTest : OffloadDeviceTest {
+  void *OffsetPtr() { return &reinterpret_cast<char *>(Ptr)[123]; }
+
+  void *Ptr;
+};
+
+template <ol_alloc_type_t AllocType>
+struct olGetMemInfoTest : olGetMemInfoBaseTest {
+  void SetUp() override {
+    RETURN_ON_FATAL_FAILURE(OffloadDeviceTest::SetUp());
+    ASSERT_SUCCESS(olMemAlloc(Device, AllocType, SIZE, &Ptr));
+  }
+
+  void TearDown() override {
+    ASSERT_SUCCESS(olMemFree(Ptr));
+    RETURN_ON_FATAL_FAILURE(OffloadDeviceTest::TearDown());
+  }
+};
+using olGetMemInfoDeviceTest = olGetMemInfoTest<OL_ALLOC_TYPE_DEVICE>;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetMemInfoDeviceTest);
+using olGetMemInfoManagedTest = olGetMemInfoTest<OL_ALLOC_TYPE_MANAGED>;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetMemInfoManagedTest);
+using olGetMemInfoHostTest = olGetMemInfoTest<OL_ALLOC_TYPE_HOST>;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetMemInfoHostTest);
+
+#define PER_ALLOC_TEST(FUNCTION) \
+  TEST_P(olGetMemInfoDeviceTest, FUNCTION) { \
+    FUNCTION(this, Ptr, OL_ALLOC_TYPE_DEVICE); \
+  } \
+  TEST_P(olGetMemInfoManagedTest, FUNCTION) { \
+    FUNCTION(this, Ptr, OL_ALLOC_TYPE_MANAGED); \
+  } \
+  TEST_P(olGetMemInfoHostTest, FUNCTION) { \
+    FUNCTION(this, OffsetPtr(), OL_ALLOC_TYPE_HOST); \
+  } \
+  TEST_P(olGetMemInfoDeviceTest, FUNCTION##Offset) { \
+    FUNCTION(this, Ptr, OL_ALLOC_TYPE_DEVICE); \
+  } \
+  TEST_P(olGetMemInfoManagedTest, FUNCTION##Offset) { \
+    FUNCTION(this, OffsetPtr(), OL_ALLOC_TYPE_MANAGED); \
+  } \
+  TEST_P(olGetMemInfoHostTest, FUNCTION##Offset) { \
+    FUNCTION(this, OffsetPtr(), OL_ALLOC_TYPE_HOST); \
+  }
+
+void SuccessDevice(olGetMemInfoBaseTest *Fixture, void *Ptr,
+                   ol_alloc_type_t Type) {
+  ol_device_handle_t RetrievedDevice;
+  ASSERT_SUCCESS(olGetMemInfo(Fixture->Ptr, OL_MEM_INFO_DEVICE,
+                              sizeof(RetrievedDevice), &RetrievedDevice));
+  ASSERT_EQ(RetrievedDevice, Fixture->Device);
+}
+PER_ALLOC_TEST(SuccessDevice);
+
+void SuccessBase(olGetMemInfoBaseTest *Fixture, void *Ptr,
+                 ol_alloc_type_t Type) {
+  void *RetrievedBase;
+  ASSERT_SUCCESS(olGetMemInfo(Fixture->Ptr, OL_MEM_INFO_BASE,
+                              sizeof(RetrievedBase), &RetrievedBase));
+  ASSERT_EQ(RetrievedBase, Fixture->Ptr);
+}
+PER_ALLOC_TEST(SuccessBase);
+
+void SuccessSize(olGetMemInfoBaseTest *Fixture, void *Ptr,
+                 ol_alloc_type_t Type) {
+  size_t RetrievedSize;
+  ASSERT_SUCCESS(olGetMemInfo(Fixture->Ptr, OL_MEM_INFO_SIZE,
+                              sizeof(RetrievedSize), &RetrievedSize));
+  ASSERT_EQ(RetrievedSize, SIZE);
+}
+PER_ALLOC_TEST(SuccessSize);
+
+void SuccessType(olGetMemInfoBaseTest *Fixture, void *Ptr,
+                 ol_alloc_type_t Type) {
+  ol_alloc_type_t RetrievedType;
+  ASSERT_SUCCESS(olGetMemInfo(Fixture->Ptr, OL_MEM_INFO_TYPE,
+                              sizeof(RetrievedType), &RetrievedType));
+  ASSERT_EQ(RetrievedType, Type);
+}
+PER_ALLOC_TEST(SuccessType);
+
+TEST_P(olGetMemInfoDeviceTest, InvalidNotFound) {
+  // Assuming that we aren't unlucky and happen to get 0x1234 as a random
+  // pointer
+  void *RetrievedBase;
+  ASSERT_ERROR(OL_ERRC_NOT_FOUND,
+               olGetMemInfo(reinterpret_cast<void *>(0x1234), OL_MEM_INFO_BASE,
+                            sizeof(RetrievedBase), &RetrievedBase));
+}
+
+TEST_P(olGetMemInfoDeviceTest, InvalidNullPtr) {
+  ol_device_handle_t RetrievedDevice;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olGetMemInfo(nullptr, OL_MEM_INFO_DEVICE,
+                            sizeof(RetrievedDevice), &RetrievedDevice));
+}
+
+TEST_P(olGetMemInfoDeviceTest, InvalidSizeZero) {
+  ol_device_handle_t RetrievedDevice;
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olGetMemInfo(Ptr, OL_MEM_INFO_DEVICE, 0, &RetrievedDevice));
+}
+
+TEST_P(olGetMemInfoDeviceTest, InvalidSizeSmall) {
+  ol_device_handle_t RetrievedDevice;
+  ASSERT_ERROR(OL_ERRC_INVALID_SIZE,
+               olGetMemInfo(Ptr, OL_MEM_INFO_DEVICE,
+                            sizeof(RetrievedDevice) - 1, &RetrievedDevice));
+}
+
+TEST_P(olGetMemInfoDeviceTest, InvalidNullPointerPropValue) {
+  ol_device_handle_t RetrievedDevice;
+  ASSERT_ERROR(
+      OL_ERRC_INVALID_NULL_POINTER,
+      olGetMemInfo(Ptr, OL_MEM_INFO_DEVICE, sizeof(RetrievedDevice), nullptr));
+}
diff --git a/offload/unittests/OffloadAPI/memory/olGetMemInfoSize.cpp b/offload/unittests/OffloadAPI/memory/olGetMemInfoSize.cpp
new file mode 100644
index 0000000000000..f1a1e790fb22f
--- /dev/null
+++ b/offload/unittests/OffloadAPI/memory/olGetMemInfoSize.cpp
@@ -0,0 +1,63 @@
+//===------- Offload API tests - olGetMemInfoSize -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+
+#include "../common/Fixtures.hpp"
+
+struct olGetMemInfoSizeTest : OffloadDeviceTest {
+  void *OffsetPtr() { return &reinterpret_cast<char *>(Ptr)[123]; }
+
+  void SetUp() override {
+    RETURN_ON_FATAL_FAILURE(OffloadDeviceTest::SetUp());
+    ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 0x1024, &Ptr));
+  }
+
+  void TearDown() override {
+    ASSERT_SUCCESS(olMemFree(Ptr));
+    RETURN_ON_FATAL_FAILURE(OffloadDeviceTest::TearDown());
+  }
+
+  void *Ptr;
+};
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olGetMemInfoSizeTest);
+
+TEST_P(olGetMemInfoSizeTest, SuccessDevice) {
+  size_t Size = 0;
+  ASSERT_SUCCESS(olGetMemInfoSize(Ptr, OL_MEM_INFO_DEVICE, &Size));
+  ASSERT_EQ(Size, sizeof(ol_device_handle_t));
+}
+
+TEST_P(olGetMemInfoSizeTest, SuccessBase) {
+  size_t Size = 0;
+  ASSERT_SUCCESS(olGetMemInfoSize(Ptr, OL_MEM_INFO_BASE, &Size));
+  ASSERT_EQ(Size, sizeof(void *));
+}
+
+TEST_P(olGetMemInfoSizeTest, SuccessSize) {
+  size_t Size = 0;
+  ASSERT_SUCCESS(olGetMemInfoSize(Ptr, OL_MEM_INFO_SIZE, &Size));
+  ASSERT_EQ(Size, sizeof(size_t));
+}
+
+TEST_P(olGetMemInfoSizeTest, SuccessType) {
+  size_t Size = 0;
+  ASSERT_SUCCESS(olGetMemInfoSize(Ptr, OL_MEM_INFO_TYPE, &Size));
+  ASSERT_EQ(Size, sizeof(ol_alloc_type_t));
+}
+
+TEST_P(olGetMemInfoSizeTest, InvalidSymbolInfoEnumeration) {
+  size_t Size = 0;
+  ASSERT_ERROR(OL_ERRC_INVALID_ENUMERATION,
+               olGetMemInfoSize(Ptr, OL_MEM_INFO_FORCE_UINT32, &Size));
+}
+
+TEST_P(olGetMemInfoSizeTest, InvalidNullPointer) {
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olGetMemInfoSize(Ptr, OL_MEM_INFO_DEVICE, nullptr));
+}
diff --git a/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp b/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp
index 00e428ec2abc7..445262aa0c583 100644
--- a/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp
+++ b/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp
@@ -34,6 +34,26 @@ TEST_P(olMemAllocTest, SuccessAllocDevice) {
   olMemFree(Alloc);
 }
 
+TEST_P(olMemAllocTest, SuccessAllocMany) {
+  std::vector<void *> Allocs;
+  Allocs.reserve(1000);
+
+  constexpr ol_alloc_type_t TYPES[3] = {
+      OL_ALLOC_TYPE_DEVICE, OL_ALLOC_TYPE_MANAGED, OL_ALLOC_TYPE_HOST};
+
+  for (size_t I = 1; I < 1000; I++) {
+    void *Alloc = nullptr;
+    ASSERT_SUCCESS(olMemAlloc(Device, TYPES[I % 3], 1024 * I, &Alloc));
+    ASSERT_NE(Alloc, nullptr);
+
+    Allocs.push_back(Alloc);
+  }
+
+  for (auto *A : Allocs) {
+    olMemFree(A);
+  }
+}
+
 TEST_P(olMemAllocTest, InvalidNullDevice) {
   void *Alloc = nullptr;
   ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
diff --git a/openmp/device/src/Workshare.cpp b/openmp/device/src/Workshare.cpp
index 59a2cc3f27aca..653104ce883d1 100644
--- a/openmp/device/src/Workshare.cpp
+++ b/openmp/device/src/Workshare.cpp
@@ -800,10 +800,6 @@ template <typename Ty> class StaticLoopChunker {
     // If we know we have more threads than iterations we can indicate that to
     // avoid an outer loop.
-    if (config::getAssumeThreadsOversubscription()) {
-      OneIterationPerThread = true;
-    }
-
     if (OneIterationPerThread)
       ASSERT(NumThreads >= NumIters, "Broken assumption");
 
@@ -851,10 +847,6 @@ template <typename Ty> class StaticLoopChunker {
     // If we know we have more blocks than iterations we can indicate that to
     // avoid an outer loop.
- if (config::getAssumeTeamsOversubscription()) { - OneIterationPerThread = true; - } - if (OneIterationPerThread) ASSERT(NumBlocks >= NumIters, "Broken assumption"); @@ -914,11 +906,6 @@ template class StaticLoopChunker { // If we know we have more threads (across all blocks) than iterations we // can indicate that to avoid an outer loop. - if (config::getAssumeTeamsOversubscription() & - config::getAssumeThreadsOversubscription()) { - OneIterationPerThread = true; - } - if (OneIterationPerThread) ASSERT(NumBlocks * NumThreads >= NumIters, "Broken assumption"); diff --git a/openmp/runtime/test/transform/fuse/foreach.cpp b/openmp/runtime/test/transform/fuse/foreach.cpp new file mode 100644 index 0000000000000..176465b201faa --- /dev/null +++ b/openmp/runtime/test/transform/fuse/foreach.cpp @@ -0,0 +1,191 @@ +// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines + +#ifndef HEADER +#define HEADER + +#include +#include +#include +#include + +struct Reporter { + const char *name; + + Reporter(const char *name) : name(name) { print("ctor"); } + + Reporter() : name("") { print("ctor"); } + + Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); } + + Reporter(Reporter &&that) : name(that.name) { print("move ctor"); } + + ~Reporter() { print("dtor"); } + + const Reporter &operator=(const Reporter &that) { + print("copy assign"); + this->name = that.name; + return *this; + } + + const Reporter &operator=(Reporter &&that) { + print("move assign"); + this->name = that.name; + return *this; + } + + struct Iterator { + const Reporter *owner; + int pos; + + Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {} + + Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) { + owner->print("iterator copy ctor"); + } + + Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) { + owner->print("iterator move ctor"); + } + + ~Iterator() { owner->print("iterator dtor"); } + + const Iterator &operator=(const Iterator &that) { + owner->print("iterator copy assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + const Iterator &operator=(Iterator &&that) { + owner->print("iterator move assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + bool operator==(const Iterator &that) const { + owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos); + return this->pos == that.pos; + } + + Iterator &operator++() { + owner->print("iterator prefix ++"); + pos -= 1; + return *this; + } + + Iterator operator++(int) { + owner->print("iterator postfix ++"); + auto result = *this; + pos -= 1; + return result; + } + + int operator*() const { + int result = 2 - pos; + owner->print("iterator deref: %i", result); + return result; + } + + size_t operator-(const Iterator &that) const { + int result = (2 - this->pos) - (2 - that.pos); + owner->print("iterator distance: %d", result); + return result; + } + + Iterator operator+(int steps) const { + owner->print("iterator advance: %i += %i", 2 - this->pos, steps); + return Iterator(owner, pos - steps); + } + + void print(const char *msg) const { owner->print(msg); } + }; + + Iterator begin() const { + print("begin()"); + return Iterator(this, 2); + } + + Iterator end() const { + print("end()"); + return Iterator(this, -1); + } + + void print(const char *msg, ...) 
const { + va_list args; + va_start(args, msg); + printf("[%s] ", name); + vprintf(msg, args); + printf("\n"); + va_end(args); + } +}; + +int main() { + printf("do\n"); +#pragma omp fuse + { + for (Reporter a{"C"}; auto &&v : Reporter("A")) + printf("v=%d\n", v); + for (Reporter aa{"D"}; auto &&vv : Reporter("B")) + printf("vv=%d\n", vv); + } + printf("done\n"); + return EXIT_SUCCESS; +} + +// CHECK: [C] ctor +// CHECK-NEXT: [A] ctor +// CHECK-NEXT: [A] end() +// CHECK-NEXT: [A] begin() +// CHECK-NEXT: [A] begin() +// CHECK-NEXT: [A] iterator distance: 3 +// CHECK-NEXT: [D] ctor +// CHECK-NEXT: [B] ctor +// CHECK-NEXT: [B] end() +// CHECK-NEXT: [B] begin() +// CHECK-NEXT: [B] begin() +// CHECK-NEXT: [B] iterator distance: 3 +// CHECK-NEXT: [A] iterator advance: 0 += 0 +// CHECK-NEXT: [A] iterator move assign +// CHECK-NEXT: [A] iterator deref: 0 +// CHECK-NEXT: v=0 +// CHECK-NEXT: [A] iterator dtor +// CHECK-NEXT: [B] iterator advance: 0 += 0 +// CHECK-NEXT: [B] iterator move assign +// CHECK-NEXT: [B] iterator deref: 0 +// CHECK-NEXT: vv=0 +// CHECK-NEXT: [B] iterator dtor +// CHECK-NEXT: [A] iterator advance: 0 += 1 +// CHECK-NEXT: [A] iterator move assign +// CHECK-NEXT: [A] iterator deref: 1 +// CHECK-NEXT: v=1 +// CHECK-NEXT: [A] iterator dtor +// CHECK-NEXT: [B] iterator advance: 0 += 1 +// CHECK-NEXT: [B] iterator move assign +// CHECK-NEXT: [B] iterator deref: 1 +// CHECK-NEXT: vv=1 +// CHECK-NEXT: [B] iterator dtor +// CHECK-NEXT: [A] iterator advance: 0 += 2 +// CHECK-NEXT: [A] iterator move assign +// CHECK-NEXT: [A] iterator deref: 2 +// CHECK-NEXT: v=2 +// CHECK-NEXT: [A] iterator dtor +// CHECK-NEXT: [B] iterator advance: 0 += 2 +// CHECK-NEXT: [B] iterator move assign +// CHECK-NEXT: [B] iterator deref: 2 +// CHECK-NEXT: vv=2 +// CHECK-NEXT: [B] iterator dtor +// CHECK-NEXT: [B] iterator dtor +// CHECK-NEXT: [B] iterator dtor +// CHECK-NEXT: [B] iterator dtor +// CHECK-NEXT: [B] dtor +// CHECK-NEXT: [D] dtor +// CHECK-NEXT: [A] iterator dtor +// CHECK-NEXT: [A] iterator dtor +// CHECK-NEXT: [A] iterator dtor +// CHECK-NEXT: [A] dtor +// CHECK-NEXT: [C] dtor +// CHECK-NEXT: done + +#endif diff --git a/openmp/runtime/test/transform/fuse/intfor.c b/openmp/runtime/test/transform/fuse/intfor.c new file mode 100644 index 0000000000000..b8171b4df7042 --- /dev/null +++ b/openmp/runtime/test/transform/fuse/intfor.c @@ -0,0 +1,50 @@ +// RUN: %libomp-compile-and-run | FileCheck %s --match-full-lines + +#ifndef HEADER +#define HEADER + +#include +#include + +int main() { + printf("do\n"); +#pragma omp fuse + { + for (int i = 5; i <= 25; i += 5) + printf("i=%d\n", i); + for (int j = 10; j < 100; j += 10) + printf("j=%d\n", j); + for (int k = 10; k > 0; --k) + printf("k=%d\n", k); + } + printf("done\n"); + return EXIT_SUCCESS; +} +#endif /* HEADER */ + +// CHECK: do +// CHECK-NEXT: i=5 +// CHECK-NEXT: j=10 +// CHECK-NEXT: k=10 +// CHECK-NEXT: i=10 +// CHECK-NEXT: j=20 +// CHECK-NEXT: k=9 +// CHECK-NEXT: i=15 +// CHECK-NEXT: j=30 +// CHECK-NEXT: k=8 +// CHECK-NEXT: i=20 +// CHECK-NEXT: j=40 +// CHECK-NEXT: k=7 +// CHECK-NEXT: i=25 +// CHECK-NEXT: j=50 +// CHECK-NEXT: k=6 +// CHECK-NEXT: j=60 +// CHECK-NEXT: k=5 +// CHECK-NEXT: j=70 +// CHECK-NEXT: k=4 +// CHECK-NEXT: j=80 +// CHECK-NEXT: k=3 +// CHECK-NEXT: j=90 +// CHECK-NEXT: k=2 +// CHECK-NEXT: k=1 +// CHECK-NEXT: done diff --git a/openmp/runtime/test/transform/fuse/iterfor.cpp b/openmp/runtime/test/transform/fuse/iterfor.cpp new file mode 100644 index 0000000000000..552484b2981c4 --- /dev/null +++ 
b/openmp/runtime/test/transform/fuse/iterfor.cpp @@ -0,0 +1,194 @@ +// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines + +#ifndef HEADER +#define HEADER + +#include +#include +#include +#include + +struct Reporter { + const char *name; + + Reporter(const char *name) : name(name) { print("ctor"); } + + Reporter() : name("") { print("ctor"); } + + Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); } + + Reporter(Reporter &&that) : name(that.name) { print("move ctor"); } + + ~Reporter() { print("dtor"); } + + const Reporter &operator=(const Reporter &that) { + print("copy assign"); + this->name = that.name; + return *this; + } + + const Reporter &operator=(Reporter &&that) { + print("move assign"); + this->name = that.name; + return *this; + } + + struct Iterator { + const Reporter *owner; + int pos; + + Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {} + + Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) { + owner->print("iterator copy ctor"); + } + + Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) { + owner->print("iterator move ctor"); + } + + ~Iterator() { owner->print("iterator dtor"); } + + const Iterator &operator=(const Iterator &that) { + owner->print("iterator copy assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + const Iterator &operator=(Iterator &&that) { + owner->print("iterator move assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + bool operator==(const Iterator &that) const { + owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos); + return this->pos == that.pos; + } + + bool operator!=(const Iterator &that) const { + owner->print("iterator %d != %d", 2 - this->pos, 2 - that.pos); + return this->pos != that.pos; + } + + Iterator &operator++() { + owner->print("iterator prefix ++"); + pos -= 1; + return *this; + } + + Iterator operator++(int) { + owner->print("iterator postfix ++"); + auto result = *this; + pos -= 1; + return result; + } + + int operator*() const { + int result = 2 - pos; + owner->print("iterator deref: %i", result); + return result; + } + + size_t operator-(const Iterator &that) const { + int result = (2 - this->pos) - (2 - that.pos); + owner->print("iterator distance: %d", result); + return result; + } + + Iterator operator+(int steps) const { + owner->print("iterator advance: %i += %i", 2 - this->pos, steps); + return Iterator(owner, pos - steps); + } + }; + + Iterator begin() const { + print("begin()"); + return Iterator(this, 2); + } + + Iterator end() const { + print("end()"); + return Iterator(this, -1); + } + + void print(const char *msg, ...) 
const { + va_list args; + va_start(args, msg); + printf("[%s] ", name); + vprintf(msg, args); + printf("\n"); + va_end(args); + } +}; + +int main() { + printf("do\n"); + Reporter C("C"); + Reporter D("D"); +#pragma omp fuse + { + for (auto it = C.begin(); it != C.end(); ++it) + printf("v=%d\n", *it); + + for (auto it = D.begin(); it != D.end(); ++it) + printf("vv=%d\n", *it); + } + printf("done\n"); + return EXIT_SUCCESS; +} + +#endif /* HEADER */ + +// CHECK: do +// CHECK: [C] ctor +// CHECK-NEXT: [D] ctor +// CHECK-NEXT: [C] begin() +// CHECK-NEXT: [C] begin() +// CHECK-NEXT: [C] end() +// CHECK-NEXT: [C] iterator distance: 3 +// CHECK-NEXT: [D] begin() +// CHECK-NEXT: [D] begin() +// CHECK-NEXT: [D] end() +// CHECK-NEXT: [D] iterator distance: 3 +// CHECK-NEXT: [C] iterator advance: 0 += 0 +// CHECK-NEXT: [C] iterator move assign +// CHECK-NEXT: [C] iterator deref: 0 +// CHECK-NEXT: v=0 +// CHECK-NEXT: [C] iterator dtor +// CHECK-NEXT: [D] iterator advance: 0 += 0 +// CHECK-NEXT: [D] iterator move assign +// CHECK-NEXT: [D] iterator deref: 0 +// CHECK-NEXT: vv=0 +// CHECK-NEXT: [D] iterator dtor +// CHECK-NEXT: [C] iterator advance: 0 += 1 +// CHECK-NEXT: [C] iterator move assign +// CHECK-NEXT: [C] iterator deref: 1 +// CHECK-NEXT: v=1 +// CHECK-NEXT: [C] iterator dtor +// CHECK-NEXT: [D] iterator advance: 0 += 1 +// CHECK-NEXT: [D] iterator move assign +// CHECK-NEXT: [D] iterator deref: 1 +// CHECK-NEXT: vv=1 +// CHECK-NEXT: [D] iterator dtor +// CHECK-NEXT: [C] iterator advance: 0 += 2 +// CHECK-NEXT: [C] iterator move assign +// CHECK-NEXT: [C] iterator deref: 2 +// CHECK-NEXT: v=2 +// CHECK-NEXT: [C] iterator dtor +// CHECK-NEXT: [D] iterator advance: 0 += 2 +// CHECK-NEXT: [D] iterator move assign +// CHECK-NEXT: [D] iterator deref: 2 +// CHECK-NEXT: vv=2 +// CHECK-NEXT: [D] iterator dtor +// CHECK-NEXT: [D] iterator dtor +// CHECK-NEXT: [D] iterator dtor +// CHECK-NEXT: [C] iterator dtor +// CHECK-NEXT: [C] iterator dtor +// CHECK-NEXT: done +// CHECK-NEXT: [D] iterator dtor +// CHECK-NEXT: [C] iterator dtor +// CHECK-NEXT: [D] dtor +// CHECK-NEXT: [C] dtor diff --git a/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp new file mode 100644 index 0000000000000..dcbbdf1b6734e --- /dev/null +++ b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-foreach.cpp @@ -0,0 +1,207 @@ +// RUN: %libomp-cxx20-compile-and-run | FileCheck %s --match-full-lines + +#ifndef HEADER +#define HEADER + +#include +#include +#include +#include + +struct Reporter { + const char *name; + + Reporter(const char *name) : name(name) { print("ctor"); } + + Reporter() : name("") { print("ctor"); } + + Reporter(const Reporter &that) : name(that.name) { print("copy ctor"); } + + Reporter(Reporter &&that) : name(that.name) { print("move ctor"); } + + ~Reporter() { print("dtor"); } + + const Reporter &operator=(const Reporter &that) { + print("copy assign"); + this->name = that.name; + return *this; + } + + const Reporter &operator=(Reporter &&that) { + print("move assign"); + this->name = that.name; + return *this; + } + + struct Iterator { + const Reporter *owner; + int pos; + + Iterator(const Reporter *owner, int pos) : owner(owner), pos(pos) {} + + Iterator(const Iterator &that) : owner(that.owner), pos(that.pos) { + owner->print("iterator copy ctor"); + } + + Iterator(Iterator &&that) : owner(that.owner), pos(that.pos) { + owner->print("iterator move ctor"); + } + + ~Iterator() { owner->print("iterator 
dtor"); } + + const Iterator &operator=(const Iterator &that) { + owner->print("iterator copy assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + const Iterator &operator=(Iterator &&that) { + owner->print("iterator move assign"); + this->owner = that.owner; + this->pos = that.pos; + return *this; + } + + bool operator==(const Iterator &that) const { + owner->print("iterator %d == %d", 2 - this->pos, 2 - that.pos); + return this->pos == that.pos; + } + + Iterator &operator++() { + owner->print("iterator prefix ++"); + pos -= 1; + return *this; + } + + Iterator operator++(int) { + owner->print("iterator postfix ++"); + auto result = *this; + pos -= 1; + return result; + } + + int operator*() const { + int result = 2 - pos; + owner->print("iterator deref: %i", result); + return result; + } + + size_t operator-(const Iterator &that) const { + int result = (2 - this->pos) - (2 - that.pos); + owner->print("iterator distance: %d", result); + return result; + } + + Iterator operator+(int steps) const { + owner->print("iterator advance: %i += %i", 2 - this->pos, steps); + return Iterator(owner, pos - steps); + } + + void print(const char *msg) const { owner->print(msg); } + }; + + Iterator begin() const { + print("begin()"); + return Iterator(this, 2); + } + + Iterator end() const { + print("end()"); + return Iterator(this, -1); + } + + void print(const char *msg, ...) const { + va_list args; + va_start(args, msg); + printf("[%s] ", name); + vprintf(msg, args); + printf("\n"); + va_end(args); + } +}; + +int main() { + printf("do\n"); +#pragma omp parallel for collapse(2) num_threads(1) + for (int i = 0; i < 3; ++i) +#pragma omp fuse + { + for (Reporter c{"init-stmt"}; auto &&v : Reporter("range")) + printf("i=%d v=%d\n", i, v); + for (int vv = 0; vv < 3; ++vv) + printf("i=%d vv=%d\n", i, vv); + } + printf("done\n"); + return EXIT_SUCCESS; +} + +#endif /* HEADER */ + +// CHECK: do +// CHECK-NEXT: [init-stmt] ctor +// CHECK-NEXT: [range] ctor +// CHECK-NEXT: [range] end() +// CHECK-NEXT: [range] begin() +// CHECK-NEXT: [range] begin() +// CHECK-NEXT: [range] iterator distance: 3 +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: i=0 v=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: i=0 vv=0 +// CHECK-NEXT: [range] iterator advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: i=0 v=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: i=0 vv=1 +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 2 +// CHECK-NEXT: i=0 v=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: i=0 vv=2 +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: i=1 v=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: i=1 vv=0 +// CHECK-NEXT: [range] iterator advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: i=1 v=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: i=1 vv=1 +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 2 +// CHECK-NEXT: i=1 v=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: i=1 vv=2 +// CHECK-NEXT: [range] iterator advance: 0 += 0 +// CHECK-NEXT: [range] 
iterator move assign +// CHECK-NEXT: [range] iterator deref: 0 +// CHECK-NEXT: i=2 v=0 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: i=2 vv=0 +// CHECK-NEXT: [range] iterator advance: 0 += 1 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 1 +// CHECK-NEXT: i=2 v=1 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: i=2 vv=1 +// CHECK-NEXT: [range] iterator advance: 0 += 2 +// CHECK-NEXT: [range] iterator move assign +// CHECK-NEXT: [range] iterator deref: 2 +// CHECK-NEXT: i=2 v=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: i=2 vv=2 +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] iterator dtor +// CHECK-NEXT: [range] dtor +// CHECK-NEXT: [init-stmt] dtor +// CHECK-NEXT: done diff --git a/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c new file mode 100644 index 0000000000000..9630fec50bc20 --- /dev/null +++ b/openmp/runtime/test/transform/fuse/parallel-wsloop-collapse-intfor.c @@ -0,0 +1,45 @@ +// RUN: %libomp-cxx-compile-and-run | FileCheck %s --match-full-lines + +#ifndef HEADER +#define HEADER + +#include +#include + +int main() { + printf("do\n"); +#pragma omp parallel for collapse(2) num_threads(1) + for (int i = 0; i < 3; ++i) +#pragma omp fuse + { + for (int j = 0; j < 3; ++j) + printf("i=%d j=%d\n", i, j); + for (int k = 0; k < 3; ++k) + printf("i=%d k=%d\n", i, k); + } + printf("done\n"); + return EXIT_SUCCESS; +} + +#endif /* HEADER */ + +// CHECK: do +// CHECK-NEXT: i=0 j=0 +// CHECK-NEXT: i=0 k=0 +// CHECK-NEXT: i=0 j=1 +// CHECK-NEXT: i=0 k=1 +// CHECK-NEXT: i=0 j=2 +// CHECK-NEXT: i=0 k=2 +// CHECK-NEXT: i=1 j=0 +// CHECK-NEXT: i=1 k=0 +// CHECK-NEXT: i=1 j=1 +// CHECK-NEXT: i=1 k=1 +// CHECK-NEXT: i=1 j=2 +// CHECK-NEXT: i=1 k=2 +// CHECK-NEXT: i=2 j=0 +// CHECK-NEXT: i=2 k=0 +// CHECK-NEXT: i=2 j=1 +// CHECK-NEXT: i=2 k=1 +// CHECK-NEXT: i=2 j=2 +// CHECK-NEXT: i=2 k=2 +// CHECK-NEXT: done diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt index fac621a076aa4..d3280a5867dec 100644 --- a/runtimes/CMakeLists.txt +++ b/runtimes/CMakeLists.txt @@ -74,7 +74,7 @@ endif() if (NOT LLVM_FOUND) set(LLVM_TOOLS_BINARY_DIR ${LLVM_BINARY_DIR}/bin) set(LLVM_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/include) - set(LLVM_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/lib) + set(LLVM_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}) endif() # Setting these variables will allow the sub-build to put their outputs into diff --git a/utils/bazel/WORKSPACE b/utils/bazel/WORKSPACE index 00cfea572096a..cb6dfa3a01cfa 100644 --- a/utils/bazel/WORKSPACE +++ b/utils/bazel/WORKSPACE @@ -186,9 +186,9 @@ maybe( http_archive, name = "nanobind", build_file = "@llvm-raw//utils/bazel/third_party_build:nanobind.BUILD", - sha256 = "bb35deaed7efac5029ed1e33880a415638352f757d49207a8e6013fefb6c49a7", - strip_prefix = "nanobind-2.4.0", - url = "https://github.com/wjakob/nanobind/archive/refs/tags/v2.4.0.tar.gz", + sha256 = "8ce3667dce3e64fc06bfb9b778b6f48731482362fb89a43da156632266cd5a90", + strip_prefix = "nanobind-2.9.2", + url = "https://github.com/wjakob/nanobind/archive/refs/tags/v2.9.2.tar.gz", ) load("@rules_python//python:repositories.bzl", "py_repositories", "python_register_toolchains") diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 13fca5f592c7a..5af035d840946 100644 --- 
a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -1313,7 +1313,10 @@ cc_library( cc_library( name = "tooling_dependency_scanning", - srcs = glob(["lib/Tooling/DependencyScanning/**/*.cpp"]), + srcs = glob([ + "lib/Tooling/DependencyScanning/**/*.h", + "lib/Tooling/DependencyScanning/**/*.cpp", + ]), hdrs = glob(["include/clang/Tooling/DependencyScanning/**/*.h"]), deps = [ ":basic", diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 9d02ff9f459ae..8d9e80393bf20 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -2657,6 +2657,22 @@ libc_support_library( ], ) +libc_support_library( + name = "__support_math_exp10m1f", + hdrs = ["src/__support/math/exp10m1f.h"], + deps = [ + ":__support_fputil_except_value_utils", + ":__support_fputil_fenv_impl", + ":__support_fputil_fp_bits", + ":__support_fputil_multiply_add", + ":__support_fputil_polyeval", + ":__support_fputil_rounding_mode", + ":__support_macros_optimization", + ":__support_math_exp10f_utils", + ":errno", + ], +) + libc_support_library( name = "__support_math_erff", hdrs = ["src/__support/math/erff.h"], @@ -3613,7 +3629,7 @@ libc_math_function( libc_math_function( name = "exp10m1f", additional_deps = [ - ":__support_math_exp10f_utils", + ":__support_math_exp10m1f", ], ) diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel index a38278af983f1..7d62afc982be8 100644 --- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel @@ -465,6 +465,7 @@ cc_library( ":TargetHeaders", ":Utility", ":UtilityPrivateHeaders", + "//llvm:BinaryFormat", "//llvm:DebugInfo", "//llvm:DebugInfoDWARF", "//llvm:DebugInfoDWARFLowLevel", diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 6b2a475f8c0ad..8f607c7ce087e 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -4969,6 +4969,22 @@ cc_binary( ], ) +cc_binary( + name = "llvm-remarkutil", + srcs = glob([ + "tools/llvm-remarkutil/**/*.cpp", + "tools/llvm-remarkutil/**/*.h", + ]), + copts = llvm_copts, + includes = ["tools/llvm-remarkutil"], + stamp = 0, + deps = [ + ":Demangle", + ":Remarks", + ":Support", + ], +) + cc_binary( name = "llvm-rtdyld", srcs = glob([ diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 46ff0acb547d5..422c29fc9c4d5 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -367,7 +367,6 @@ cc_library( "include/mlir/Bytecode/*.h", ]) + [ "include/mlir/IR/OpAsmOpInterface.h.inc", - "include/mlir/IR/PDLPatternMatch.h.inc", "include/mlir/Interfaces/DataLayoutInterfaces.h", "include/mlir/Interfaces/InferIntRangeInterface.h", "include/mlir/Interfaces/SideEffectInterfaces.h", @@ -375,6 +374,7 @@ cc_library( hdrs = glob([ "include/mlir/IR/*.h", ]) + [ + "include/mlir/IR/PDLPatternMatch.h.inc", "include/mlir/Interfaces/CallInterfaces.h", "include/mlir/Interfaces/FoldInterfaces.h", ], @@ -4807,6 +4807,7 @@ cc_library( ":InliningUtils", ":MaskableOpInterface", ":MaskingOpInterface", + ":MemOpInterfaces", ":MemRefDialect", ":SideEffectInterfaces", ":SubsetOpInterface", @@ -6090,6 +6091,7 @@ cc_library( ":DialectUtils", 
":GPUDialect", ":IR", + ":InferIntRangeInterface", ":LLVMDialect", ":NVVMOpsIncGen", ":NVVMRequiresSMTraitsIncGen", @@ -6294,6 +6296,7 @@ cc_library( ":BytecodeOpInterface", ":GPUDialect", ":IR", + ":InferIntRangeInterface", ":LLVMDialect", ":ROCDLOpsIncGen", ":SideEffectInterfaces", @@ -6684,6 +6687,7 @@ cc_library( ":IR", ":InferTypeOpInterface", ":SMTIncGen", + ":SideEffectInterfaces", ":Support", "//llvm:Support", ], @@ -7712,6 +7716,7 @@ cc_library( "lib/Transforms/*.cpp", ]), hdrs = [ + "include/mlir/Transforms/BubbleDownMemorySpaceCasts.h", "include/mlir/Transforms/CSE.h", "include/mlir/Transforms/EndomorphismSimplification.h", "include/mlir/Transforms/HomomorphismSimplification.h", @@ -7729,6 +7734,7 @@ cc_library( ":FunctionInterfaces", ":IR", ":LoopLikeInterface", + ":MemOpInterfaces", ":MemorySlotInterfaces", ":Pass", ":RuntimeVerifiableOpInterface", @@ -11075,6 +11081,7 @@ td_library( ":InferTypeOpInterfaceTdFiles", ":MaskableOpInterfaceTdFiles", ":MaskingOpInterfaceTdFiles", + ":MemOpInterfacesTdFiles", ":OpBaseTdFiles", ":SideEffectInterfacesTdFiles", ":VectorInterfacesTdFiles", @@ -11827,6 +11834,7 @@ cc_library( srcs = glob(["lib/Dialect/Transform/PDLExtension/*.cpp"]), hdrs = glob(["include/mlir/Dialect/Transform/PDLExtension/*.h"]), deps = [ + ":BytecodeOpInterface", ":IR", ":PDLDialect", ":PDLInterpDialect", @@ -11941,6 +11949,7 @@ cc_library( srcs = glob(["lib/Dialect/Transform/IRDLExtension/*.cpp"]), hdrs = glob(["include/mlir/Dialect/Transform/IRDLExtension/*.h"]), deps = [ + ":BytecodeOpInterface", ":IR", ":IRDLDialect", ":IRDLInterfacesIncGen", @@ -11982,7 +11991,9 @@ cc_library( srcs = glob(["lib/Dialect/Transform/DebugExtension/*.cpp"]), hdrs = glob(["include/mlir/Dialect/Transform/DebugExtension/*.h"]), deps = [ + ":BytecodeOpInterface", ":IR", + ":SideEffectInterfaces", ":Support", ":TransformDebugExtensionOpsIncGen", ":TransformDialect", @@ -12019,6 +12030,7 @@ cc_library( srcs = glob(["lib/Dialect/Transform/LoopExtension/*.cpp"]), hdrs = glob(["include/mlir/Dialect/Transform/LoopExtension/*.h"]), deps = [ + ":BytecodeOpInterface", ":IR", ":LoopLikeInterface", ":Rewrite", @@ -12622,6 +12634,15 @@ cc_library( ], ) +td_library( + name = "MemOpInterfacesTdFiles", + srcs = ["include/mlir/Interfaces/MemOpInterfaces.td"], + includes = ["include"], + deps = [ + ":OpBaseTdFiles", + ], +) + td_library( name = "MemRefOpsTdFiles", srcs = [ @@ -12633,6 +12654,7 @@ td_library( ":ArithOpsTdFiles", ":CastInterfacesTdFiles", ":ControlFlowInterfacesTdFiles", + ":MemOpInterfacesTdFiles", ":MemorySlotInterfacesTdFiles", ":OpBaseTdFiles", ":ShapedOpInterfacesTdFiles", @@ -12641,6 +12663,17 @@ td_library( ], ) +gentbl_cc_library( + name = "MemOpInterfacesIncGen", + tbl_outs = { + "include/mlir/Interfaces/MemOpInterfaces.h.inc": ["-gen-op-interface-decls"], + "include/mlir/Interfaces/MemOpInterfaces.cpp.inc": ["-gen-op-interface-defs"], + }, + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Interfaces/MemOpInterfaces.td", + deps = [":MemOpInterfacesTdFiles"], +) + gentbl_cc_library( name = "MemRefBaseIncGen", tbl_outs = { @@ -12671,6 +12704,19 @@ gentbl_cc_library( ], ) +cc_library( + name = "MemOpInterfaces", + srcs = ["lib/Interfaces/MemOpInterfaces.cpp"], + hdrs = ["include/mlir/Interfaces/MemOpInterfaces.h"], + includes = ["include"], + deps = [ + ":DialectUtils", + ":IR", + ":MemOpInterfacesIncGen", + ":Support", + ], +) + cc_library( name = "MemRefDialect", srcs = glob( @@ -12700,6 +12746,7 @@ cc_library( ":InferIntRangeInterface", ":InferTypeOpInterface", ":InliningUtils", 
+ ":MemOpInterfaces", ":MemRefBaseIncGen", ":MemRefOpsIncGen", ":MemorySlotInterfaces", @@ -13032,6 +13079,7 @@ cc_library( ":MPIOpsIncGen", ":MPITypesIncGen", ":MemRefDialect", + ":SideEffectInterfaces", "//llvm:Support", ], ) diff --git a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel index a6e8c515f0874..102c4161eb74c 100644 --- a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel @@ -53,14 +53,6 @@ filegroup( ]), ) -filegroup( - name = "IRPyIFiles", - srcs = [ - "mlir/_mlir_libs/_mlir/__init__.pyi", - "mlir/_mlir_libs/_mlir/ir.pyi", - ], -) - filegroup( name = "MlirLibsPyFiles", srcs = [ @@ -75,13 +67,6 @@ filegroup( ], ) -filegroup( - name = "PassManagerPyIFiles", - srcs = [ - "mlir/_mlir_libs/_mlir/passmanager.pyi", - ], -) - filegroup( name = "RewritePyFiles", srcs = [ @@ -1461,6 +1446,22 @@ gentbl_filegroup( ], ) +gentbl_filegroup( + name = "TransformSMTExtensionOpsPyGen", + tbl_outs = {"mlir/dialects/_transform_smt_extension_ops_gen.py": [ + "-gen-python-op-bindings", + "-bind-dialect=transform", + "-dialect-extension=smt_transform", + ]}, + tblgen = "//mlir:mlir-tblgen", + td_file = "mlir/dialects/TransformSMTExtensionOps.td", + deps = [ + "//mlir:OpBaseTdFiles", + "//mlir:TransformDialectTdFiles", + "//mlir:TransformSMTExtensionOpsTdFiles", + ], +) + filegroup( name = "TransformOpsPyFiles", srcs = [ @@ -1476,6 +1477,7 @@ filegroup( ":TensorTransformOpsPyGen", ":TransformEnumPyGen", ":TransformOpsPyGen", + ":TransformSMTExtensionOpsPyGen", ":VectorTransformEnumPyGen", ":VectorTransformOpsPyGen", ], diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index 469fcee8d9748..7e36d0ba787b6 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -31,6 +31,7 @@ expand_template( "@LIT_SITE_CFG_IN_HEADER@": LIT_SITE_CFG_IN_HEADER, "@LLVM_TOOLS_DIR@": package_path("//llvm:BUILD"), "@ENABLE_SHARED@": "1", + "@BUILD_SHARED_LIBS@": "1", "@ENABLE_ASSERTIONS@": "1", "@MLIR_SOURCE_DIR@": package_path("//mlir:BUILD"), "@MLIR_TOOLS_DIR@": package_path("//mlir:BUILD"),